Commit e6bef5c6 by Eric Coissac

Switch to cython 0.13 and add code to remove dependences on SSE2 availability

parent 124297a5
<?xml version="1.0" encoding="UTF-8"?>
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?eclipse-pydev version="1.0"?>
<pydev_project>
<pydev_pathproperty name="org.python.pydev.PROJECT_SOURCE_PATH">
<path>/OBITools/src</path>
<path>/OBITools/textwrangler</path>
</pydev_pathproperty>
<pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 2.6</pydev_property>
<pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">/Library/Frameworks/Python.framework/Versions/2.6/Resources/Python.app/Contents/MacOS/Python</pydev_property>
<pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Python-2.6</pydev_property>
</pydev_project>
......@@ -3,7 +3,8 @@
# The list of objects to document. Objects can be named using
# dotted names, module filenames, or package directory names.
# Alases for this option include "objects" and "values".
modules: src/obitools
#modules: src/obitools
modules: obitools
# The type of output that should be generated. Should be one
# of: html, text, latex, dvi, ps, pdf.
......@@ -124,7 +125,7 @@ separate-classes: no
# in the output. Graphs are generated using the Graphviz "dot"
# executable. Graph types include: "classtree", "callgraph",
# "umlclass". Use "all" to include all graph types
graph: classtree
graph: umlclasstree
# The path to the Graphviz "dot" executable, used to generate
# graphs.
......
......@@ -163,6 +163,12 @@ def annotate(sequence,options):
ip = iter(options.finder)
#
# Iterate over the direct primers
# until one match on direct or reverse strand
#
while direct is None:
try:
alid,alir=ip.next()
......@@ -174,6 +180,10 @@ def annotate(sequence,options):
except StopIteration:
break
# If a direct primer match
# look for corresponding reverse primer
# on the ooposite strand
if direct is not None:
pdirect=str(alid.seqB)
......@@ -190,12 +200,17 @@ def annotate(sequence,options):
except StopIteration:
break
# if direct and reverse primers are found then
# compute the length of the applicon
if reverse is not None:
preverse=str(alid.seqB)
if strand :
lampli = reverse[1]-direct[2]
else:
lampli = direct[1]-reverse[2]
# Check for tags if present
ltag = options.primers[pdirect][0]
......@@ -225,6 +240,9 @@ def annotate(sequence,options):
if treverse is not None and len(treverse)!=ltag:
treverse=None
# both primers are found
if direct is not None and reverse is not None:
if strand:
fragment = sequence[direct[2]:reverse[1]]
......@@ -239,10 +257,15 @@ def annotate(sequence,options):
quality.reverse()
else:
quality=None
# Only direct primer is found
elif direct is not None and reverse is None:
if strand:
print "direct partial"
fragment = sequence[direct[2]:]
else:
print "reverse partial"
fragment = sequence[0:direct[1]].complement()
if hasattr(sequence, 'quality'):
if strand:
......
from _nws import NWS
from _upperbond import indexSequences
from _lcs import LCS
from _assemble import DirectAssemble, ReverseAssemble
from _qsassemble import QSolexaDirectAssemble,QSolexaReverseAssemble
......
This source diff could not be displayed because it is too large. You can view the blob instead.
#include "_lcs.h"
#include <string.h>
#include <xmmintrin.h>
#include <stdint.h>
#include <stdlib.h>
#include <limits.h>
#include <stdio.h>
// Expands the 8 low weight 8 bit unsigned integer
// to 8 16bits signed integer
static inline vInt16 expand_8_to_16(vUInt8 data)
{
vUInt8 mask_00= _mm_setzero_si128();
return _mm_unpacklo_epi8(data,mask_00);
}
// Load an SSE register with the next 8 first symbols
// Allocate a band allowing to align sequences of length : 'length'
static inline vInt16 load8Symboles(const char* seq)
column_t* allocateColumn(int length,column_t *column, bool mode8bits)
{
vUInt8 s;
s = _mm_loadu_si128((vUInt8*)seq);
return expand_8_to_16(s);
}
int size;
bool newc = false;
// The band length should be equal to the length
// of the sequence + 7 for taking into account its
// shape
// Allocate a band allowing to align sequences of length : 'length'
size = length * ((mode8bits) ? sizeof(int8_t):sizeof(int16_t));
static inline band_t* allocateBand(int length,band_t *band)
{
int size;
size = (length + 7) * sizeof(vInt16);
// If the pointer to the old column is NULL we allocate
// a new column
if (band==NULL)
if (column==NULL)
{
band = malloc(sizeof(band_t));
if (!band)
return NULL;
if ((posix_memalign(&(band->band),16,size)) ||
(!(band->band)))
{
free(band);
column = malloc(sizeof(column_t));
if (!column)
return NULL;
}
band->size = length;
column->size = 0;
column->data.shrt=NULL;
newc = true;
}
else if (length > band->size)
// Otherwise we check if its size is sufficient
// or if it should be extend
if (size > column->size)
{
vInt16 *old = band->band;
if ((posix_memalign(&(band->band),16,size)) ||
(!(band->band)))
int16_t *old = column->data.shrt;
column->data.shrt = malloc(size);
if (!column->data.shrt)
{
band->band=old;
column->data.shrt = old;
if (newc)
{
free(column);
column=NULL;
return NULL;
}
return NULL;
}
band->size=length;
free(old);
else
column->size = size;
}
// SHRT_MIN
return band;
return column;
}
int fastLCSScore(const char* seq1, const char* seq2,band_t **band)
void freeColumn(column_p column)
{
int lseq1,lseq2; // length of the both sequences
int itmp; // tmp variables for swap
const char* stmp; //
int nbands; // Number of bands of width eight in the score matrix
int lastband; // width of the last band
vInt16 *mainband;
int16_t *scores;
// Made seq1 the shortest sequences
lseq1=strlen(seq1);
lseq2=strlen(seq2);
if (lseq1 > lseq2)
if (column)
{
itmp=lseq1;
lseq1=lseq2;
lseq2=itmp;
stmp=seq1;
seq1=seq2;
seq2=stmp;
if (column->data.shrt)
free(column->data.shrt);
free(column);
}
}
nbands = lseq1 / 8; // You have 8 short int in a SSE register
lastband = lseq1 - (nbands * 8);
int fastLCSScore(const char* seq1, const char* seq2)
{
return fastLCSScore16(seq1,seq2,NULL);
}
#include <xmmintrin.h>
#include <stdint.h>
#include "_sse.h"
#define ALIGN __attribute__((aligned(16)))
typedef __m128i vUInt8;
typedef __m128i vInt16;
#define bool char
#define false (1==0)
#define true (1==1)
typedef struct {
int16_t size;
vInt16 *band;
} band_t;
int fastLCSScore(const char* seq1, const char* seq2,band_t **band);
union { int16_t *shrt;
int8_t *byte;
} data;
} column_t, **column_pp, *column_p;
column_p allocateColumn(int length,column_t *column, bool mode8bits);
void freeColumn(column_p column);
int fastLCSScore16(const char* seq1, const char* seq2,column_pp ppcolumn);
int fastLCSScore8(const char* seq1, const char* seq2,column_pp ppcolumn);
int fastLCSScore(const char* seq1, const char* seq2);
......@@ -5,8 +5,12 @@ Created on 6 Nov. 2009
'''
#@PydevCodeAnalysisIgnore
from obitools import BioSequence
from _lcs cimport *
from _upperbond cimport *
from _dynamic cimport *
from _upperbond import *
cdef class LCS(DynamicProgramming):
......@@ -123,6 +127,65 @@ cdef class LCS(DynamicProgramming):
self.path.vStart=0
#return 0,0,path
def lenlcs(seq1,seq2,double minimum=0.,bint normalized=False, bint large=True):
cdef double lcs
cdef bytes se1=bytes(str(seq1))
cdef bytes se2=bytes(str(seq2))
cdef int l1 = len(seq1)
cdef int l2 = len(seq2)
cdef array.array[unsigned char] w1
cdef array.array[unsigned char] w2
cdef int o1
cdef int o2
cdef int wordcount
cdef bint possible
cdef char *s1
cdef char *s2
s1=se1
s2=se2
if min(l1,l2) < 7:
lcsali = LCS()
lcsali.seqA = seq1
lcsali.seqB = seq2
lcs = lcsali.doAlignment()
else:
if minimum > 0.:
if isinstance(seq1, BioSequence) and hasattr(seq1, "word4table"):
w1 = seq1.word4table
o1 = seq1.word4over
else:
w1 = newtable()
o1 = buildTable(s1,w1._B,&wordcount)
if isinstance(seq1, BioSequence):
seq1.word4table=w1
seq1.word4over=o1
if isinstance(seq2, BioSequence) and hasattr(seq2, "word4table"):
w2 = seq2.word4table
o2 = seq2.word4over
else:
w2 = newtable()
o2 = buildTable(s2,w2._B,&wordcount)
if isinstance(seq2, BioSequence) :
seq2.word4table=w2
seq2.word4over=o2
possible = ispossible(l1, w1._B, o1,
l2, w2._B, o2,
minimum,normalized,large)
if possible:
lcs = fastLCSScore(s1,s2)
else:
lcs = -1.0
else:
lcs = fastLCSScore(s1,s2)
if lcs >= 0 and normalized:
if large:
lcs /=max(l1,l2)
else:
lcs /=min(l1,l2)
return lcs
#include <xmmintrin.h>
#include <string.h>
#include "_sse.h"
#include <stdio.h>
#include <inttypes.h>
#include <math.h>
#define ALIGN __attribute__((aligned(16)))
typedef __m128i vUInt8;
typedef __m128i vUInt16;
typedef __m128i vUInt64;
typedef union
{
vUInt8 m;
uint8_t c[16];
} uchar_v;
typedef union
{
vUInt16 m;
uint16_t c[8];
} ushort_v;
typedef union
{
vUInt64 m;
uint64_t c[2];
} uint64_v;
inline static vUInt8 loadm128(const char* data)
{
vUInt8 tmp;
memcpy(&tmp,data,16);
return tmp;
}
inline static uchar_v hash4m128(uchar_v frag)
{
uchar_v words;
vUInt8 mask_03= _mm_set1_epi8(0x03); // charge le registre avec 16x le meme octet
vUInt8 mask_7F= _mm_set1_epi8(0x7F);
vUInt8 mask_FC= _mm_set1_epi8(0xFC);
vUInt8 mask_03= _MM_SET1_EPI8(0x03); // charge le registre avec 16x le meme octet
vUInt8 mask_FC= _MM_SET1_EPI8(0xFC);
frag.m = _mm_srli_epi64(frag.m,1); // shift logic a droite sur 2 x 64 bits
frag.m = _mm_and_si128(frag.m,mask_03); // and sur les 128 bits
frag.m = _MM_SRLI_EPI64(frag.m,1); // shift logic a droite sur 2 x 64 bits
frag.m = _MM_AND_SI128(frag.m,mask_03); // and sur les 128 bits
words.m= _mm_slli_epi64(frag.m,2);
words.m= _mm_and_si128(words.m,mask_FC);
frag.m = _mm_srli_si128(frag.m,1);
words.m= _mm_or_si128(words.m,frag.m);
words.m= _MM_SLLI_EPI64(frag.m,2);
words.m= _MM_AND_SI128(words.m,mask_FC);
frag.m = _MM_SRLI_SI128(frag.m,1);
words.m= _MM_OR_SI128(words.m,frag.m);
words.m= _mm_slli_epi64(words.m,2);
words.m= _mm_and_si128(words.m,mask_FC);
frag.m = _mm_srli_si128(frag.m,1);
words.m= _mm_or_si128(words.m,frag.m);
words.m= _MM_SLLI_EPI64(words.m,2);
words.m= _MM_AND_SI128(words.m,mask_FC);
frag.m = _MM_SRLI_SI128(frag.m,1);
words.m= _MM_OR_SI128(words.m,frag.m);
words.m= _mm_slli_epi64(words.m,2);
words.m= _mm_and_si128(words.m,mask_FC);
frag.m = _mm_srli_si128(frag.m,1);
words.m= _mm_or_si128(words.m,frag.m);
words.m= _MM_SLLI_EPI64(words.m,2);
words.m= _MM_AND_SI128(words.m,mask_FC);
frag.m = _MM_SRLI_SI128(frag.m,1);
words.m= _MM_OR_SI128(words.m,frag.m);
return words;
}
inline static int anyzerom128(vUInt8 data)
{
vUInt8 mask_00= _mm_setzero_si128();
vUInt8 mask_00= _MM_SETZERO_SI128();
uint64_v tmp;
tmp.m = _mm_cmpeq_epi8(data,mask_00);
tmp.m = _MM_CMPEQ_EPI8(data,mask_00);
return (int)(tmp.c[0]!=0 || tmp.c[1]!=0);
}
......@@ -83,7 +50,7 @@ int buildTable(const char* sequence, unsigned char *table, int *count)
int overflow = 0;
int wc=0;
int i;
vUInt8 mask_00= _mm_setzero_si128();
vUInt8 mask_00= _MM_SETZERO_SI128();
uchar_v frag;
uchar_v words;
......@@ -97,9 +64,9 @@ int buildTable(const char* sequence, unsigned char *table, int *count)
// encode ascii sequence with A : 00 C : 01 T: 10 G : 11
for(frag.m=_mm_loadu_si128((vUInt8*)s);
for(frag.m=_MM_LOADU_SI128((vUInt8*)s);
! anyzerom128(frag.m);
s+=12,frag.m=_mm_loadu_si128((vUInt8*)s))
s+=12,frag.m=_MM_LOADU_SI128((vUInt8*)s))
{
words= hash4m128(frag);
......@@ -121,7 +88,7 @@ int buildTable(const char* sequence, unsigned char *table, int *count)
wc+=12;
}
zero.m=_mm_cmpeq_epi8(frag.m,mask_00);
zero.m=_MM_CMPEQ_EPI8(frag.m,mask_00);
//printf("frag=%d %d %d %d\n",frag.c[0],frag.c[1],frag.c[2],frag.c[3]);
//printf("zero=%d %d %d %d\n",zero.c[0],zero.c[1],zero.c[2],zero.c[3]);
words = hash4m128(frag);
......@@ -139,13 +106,13 @@ static inline vUInt16 partialminsum(vUInt8 ft1,vUInt8 ft2)
vUInt8 mini;
vUInt16 minilo;
vUInt16 minihi;
vUInt8 mask_00= _mm_setzero_si128();
vUInt8 mask_00= _MM_SETZERO_SI128();
mini = _mm_min_epu8(ft1,ft2);
minilo = _mm_unpacklo_epi8(mini,mask_00);
minihi = _mm_unpackhi_epi8(mini,mask_00);
mini = _MM_MIN_EPU8(ft1,ft2);
minilo = _MM_UNPACKLO_EPI8(mini,mask_00);
minihi = _MM_UNPACKHI_EPI8(mini,mask_00);
return _mm_adds_epu16(minilo,minihi);
return _MM_ADDS_EPU16(minilo,minihi);
}
int compareTable(unsigned char *t1, int over1, unsigned char* t2, int over2)
......@@ -158,8 +125,8 @@ int compareTable(unsigned char *t1, int over1, unsigned char* t2, int over2)
int i;
int total;
ft1 = _mm_loadu_si128(table1);
ft2 = _mm_loadu_si128(table2);
ft1 = _MM_LOADU_SI128(table1);
ft2 = _MM_LOADU_SI128(table2);
summini.m = partialminsum(ft1,ft2);
table1++;
table2++;
......@@ -167,16 +134,16 @@ int compareTable(unsigned char *t1, int over1, unsigned char* t2, int over2)
for (i=1;i<16;i++,table1++,table2++)
{
ft1 = _mm_loadu_si128(table1);
ft2 = _mm_loadu_si128(table2);
summini.m = _mm_adds_epu16(summini.m,partialminsum(ft1,ft2));
ft1 = _MM_LOADU_SI128(table1);
ft2 = _MM_LOADU_SI128(table2);
summini.m = _MM_ADDS_EPU16(summini.m,partialminsum(ft1,ft2));
}
// Finishing the sum process
summini.m = _mm_adds_epu16(summini.m,_mm_srli_si128(summini.m,8)); // sum the 4 firsts with the 4 lasts
summini.m = _mm_adds_epu16(summini.m,_mm_srli_si128(summini.m,4));
summini.m = _MM_ADDS_EPU16(summini.m,_MM_SRLI_SI128(summini.m,8)); // sum the 4 firsts with the 4 lasts
summini.m = _MM_ADDS_EPU16(summini.m,_MM_SRLI_SI128(summini.m,4));
total = summini.c[0]+summini.c[1];
total+= (over1 < over2) ? over1:over2;
......@@ -192,10 +159,62 @@ int threshold4(int wordcount,double identity)
wordcount+=3;
error = (int)floor((double)wordcount * ((double)1.0-identity));
lmax = (wordcount - error) / (error + 1);
// printf("length = %d error= %d lmax= %d\n",wordcount,error,lmax);
if (lmax < 4)
return 0;
return (lmax - 3) \
* (error + 1) \
+ ((wordcount - error) % (error + 1));
}
int thresholdLCS4(int32_t reflen,int32_t lcs)
{
int error;
int lmax;
error = reflen - lcs;
lmax = lcs / (error + 1);
if (lmax < 4)
return 0;
return (lmax - 3) \
* (error + 1) \
+ (lcs % (error + 1));
}
#ifndef MAX
#define MAX(x,y) (((x)>(y)) ? (x):(y))
#define MIN(x,y) (((x)<(y)) ? (x):(y))
#endif
int ispossible(int len1, unsigned char *t1, int over1,
int len2, unsigned char* t2, int over2,
double minimum, int normalized, int large)
{
int32_t reflen;
int32_t lcs;
int32_t mincount;
if (normalized)
{
if (large)
reflen = MAX(len1,len2);
else
reflen = MIN(len1,len2);
lcs = (int32_t)ceil((double)reflen * minimum);
}
else
{
reflen = MIN(len1,len2);
lcs = (int32_t) minimum;
}
if (lcs > MIN(len1,len2))
return 0;
mincount = thresholdLCS4(reflen,lcs);
// fprintf(stderr,"MaxLCS %d : %d\n",compareTable(seq1->table,seq1->over,seq2->table,seq2->over),mincount);
return compareTable(t1,over1,t2,over2) >=mincount;
}
int buildTable(char *sequence, unsigned short *table, int *count);
int buildTable(const char *sequence, unsigned char *table, int *count);
int compareTable(unsigned char *t1, int over1, unsigned char* t2, int over2);
int threshold4(int wordcount,double identity);
int thresholdLCS4(int32_t reflen,int32_t lcs);
int ispossible(int len1, unsigned char *t1, int over1,
int len2, unsigned char* t2, int over2,
double minimum, int normalized, int large);
from array import array
cimport array
cdef extern from *:
ctypedef char* const_char_ptr "const char*"
......@@ -7,4 +9,9 @@ cdef import from "_upperbond.h":
int buildTable(const_char_ptr sequence, unsigned char *table, int *count)
int compareTable(unsigned char *t1, int over1, unsigned char* t2, int over2)
int threshold4(int wordcount,double identity)
int thresholdLCS4(int reflen,int lcs)
bint ispossible(int len1, unsigned char *t1, int over1,
int len2, unsigned char* t2, int over2,
double minimum, bint normalized, bint large)
cdef array.array[unsigned char] newtable()
......@@ -37,4 +37,10 @@ cpdef int countCommonWords(array.array table1,
int overflow2):
return compareTable(table1._B,overflow1,
table2._B,overflow2)
\ No newline at end of file
cpdef bint isLCSReachable(int len1, array.array table1, int overflow1,
int len2, array.array table2, int overflow2,
double minimum, bint normalized, bint large):
pass
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -6,12 +6,16 @@ Created on 16 sept. 2009
@author: coissac
'''
from obitools.fasta._fasta cimport *
from array import array
from obitools import BioSequence
from obitools import bioSeqGenerator,AASequence,NucSequence
from obitools.fasta import _parseFastaTag,parseFastaDescription
from obitools.fasta import _parseFastaTag
#from obitools.fasta._fasta import parseFastaDescription
from obitools.fasta._fasta cimport *
cimport array
......@@ -262,4 +266,3 @@ cdef class fastqParserGenetator:
seq.quality = quality
return seq
......@@ -11,6 +11,8 @@ from obitools.align import QSolexaReverseAssemble
from obitools.align import QSolexaRightReverseAssemble
from obitools.tools._solexapairend import buildConsensus
from itertools import chain
def addSolexaPairEndOptions(optionManager):
optionManager.add_option('-r','--reverse-reads',
action="store", dest="reverse",
......@@ -21,6 +23,34 @@ def addSolexaPairEndOptions(optionManager):
)
def cutDirectReverse(entries):
first = []
for i in xrange(10):
first.append(entries.next())
lens = [len(x) for x in first]
clen = {}
for i in lens:
clen[i]=clen.get(i,0)+1
freq = max(clen.values())
freq = [k for k in clen if clen[k]==freq]
assert len(freq)==1,"To many sequence length"
freq = freq[0]
assert freq % 2 == 0, ""
lread = freq/2