Commit 94905d41 by Eric Coissac

Add fast computation of LCS upperbond using SSE2 SIMD intel instructions

parent b17698a1
This source diff could not be displayed because it is too large. You can view the blob instead.
#include <xmmintrin.h>
#include <string.h>
#include <stdio.h>
#include <inttypes.h>
#include <math.h>
#define ALIGN __attribute__((aligned(16)))
typedef __m128i vUInt8;
typedef __m128i vUInt16;
typedef __m128i vUInt64;
typedef union
{
vUInt8 m;
uint8_t c[16];
} uchar_v;
typedef union
{
vUInt16 m;
uint16_t c[8];
} ushort_v;
typedef union
{
vUInt64 m;
uint64_t c[2];
} uint64_v;
inline static vUInt8 loadm128(const char* data)
{
vUInt8 tmp;
memcpy(&tmp,data,16);
return tmp;
}
inline static uchar_v hash4m128(uchar_v frag)
{
uchar_v words;
vUInt8 mask_06= _mm_set1_epi8(0x06); // charge le registre avec 16x le meme octet
vUInt8 mask_7F= _mm_set1_epi8(0x7F);
vUInt8 mask_FC= _mm_set1_epi8(0xFC);
frag.m = _mm_and_si128(frag.m,mask_06); // and sur les 128 bits
frag.m = _mm_srli_epi64(frag.m,1); // shift logic a droite sur 2 x 64 bits
frag.m = _mm_and_si128(frag.m,mask_7F);
words.m= _mm_slli_epi64(frag.m,2);
words.m= _mm_and_si128(words.m,mask_FC);
frag.m = _mm_srli_si128(frag.m,1);
words.m= _mm_or_si128(words.m,frag.m);
words.m= _mm_slli_epi64(words.m,2);
words.m= _mm_and_si128(words.m,mask_FC);
frag.m = _mm_srli_si128(frag.m,1);
words.m= _mm_or_si128(words.m,frag.m);
words.m= _mm_slli_epi64(words.m,2);
words.m= _mm_and_si128(words.m,mask_FC);
frag.m = _mm_srli_si128(frag.m,1);
words.m= _mm_or_si128(words.m,frag.m);
return words;
}
inline static int anyzerom128(vUInt8 data)
{
vUInt8 mask_00= _mm_setzero_si128();
uint64_v tmp;
tmp.m = _mm_cmpeq_epi8(data,mask_00);
return (int)(tmp.c[0]!=0 || tmp.c[1]!=0);
}
inline static void dumpm128(unsigned short *table,vUInt8 data)
{
memcpy(table,&data,16);
}
int buildTable(const char* sequence, unsigned char *table, int *count)
{
int overflow = 0;
int wc=0;
int i;
vUInt8 mask_00= _mm_setzero_si128();
uchar_v frag;
uchar_v words;
uchar_v zero;
char* s;
s=(char*)sequence;
memset(table,0,256*sizeof(unsigned char));
// encode ascii sequence with A : 00 C : 01 T: 10 G : 11
// for(frag.m=loadm128(s);! anyzerom128(frag.m);s+=12,frag.m=loadm128(s))
for(frag.m=_mm_loadu_si128((vUInt8*)s);
! anyzerom128(frag.m);
s+=12,frag.m=_mm_loadu_si128((vUInt8*)s))
{
words= hash4m128(frag);
// printf("%d %d %d %d\n",words.c[0],words.c[1],words.c[2],words.c[3]);
if (table[words.c[0]]<255) table[words.c[0]]++; else overflow++;
if (table[words.c[1]]<255) table[words.c[1]]++; else overflow++;
if (table[words.c[2]]<255) table[words.c[2]]++; else overflow++;
if (table[words.c[3]]<255) table[words.c[3]]++; else overflow++;
if (table[words.c[4]]<255) table[words.c[4]]++; else overflow++;
if (table[words.c[5]]<255) table[words.c[5]]++; else overflow++;
if (table[words.c[6]]<255) table[words.c[6]]++; else overflow++;
if (table[words.c[7]]<255) table[words.c[7]]++; else overflow++;
if (table[words.c[8]]<255) table[words.c[8]]++; else overflow++;
if (table[words.c[9]]<255) table[words.c[9]]++; else overflow++;
if (table[words.c[10]]<255) table[words.c[10]]++; else overflow++;
if (table[words.c[11]]<255) table[words.c[11]]++; else overflow++;
wc+=12;
}
zero.m=_mm_cmpeq_epi8(frag.m,mask_00);
//printf("frag=%d %d %d %d\n",frag.c[0],frag.c[1],frag.c[2],frag.c[3]);
//printf("zero=%d %d %d %d\n",zero.c[0],zero.c[1],zero.c[2],zero.c[3]);
words = hash4m128(frag);
if (zero.c[0]+zero.c[1]+zero.c[2]+zero.c[3]==0)
for(i=0;zero.c[i+3]==0;i++,wc++)
if (table[words.c[i]]<255) table[words.c[i]]++; else overflow++;
if (count) *count=wc;
return overflow;
}
static inline vUInt16 partialminsum(vUInt8 ft1,vUInt8 ft2)
{
vUInt8 mini;
vUInt16 minilo;
vUInt16 minihi;
vUInt8 mask_00= _mm_setzero_si128();
mini = _mm_min_epu8(ft1,ft2);
minilo = _mm_unpacklo_epi8(mini,mask_00);
minihi = _mm_unpackhi_epi8(mini,mask_00);
return _mm_adds_epu16(minilo,minihi);
}
int compareTable(unsigned char *t1, int over1, unsigned char* t2, int over2)
{
vUInt8 ft1;
vUInt8 ft2;
vUInt8 *table1=(vUInt8*)t1;
vUInt8 *table2=(vUInt8*)t2;
ushort_v summini;
int i;
int total;
ft1 = _mm_loadu_si128(table1);
ft2 = _mm_loadu_si128(table2);
summini.m = partialminsum(ft1,ft2);
table1++;
table2++;
for (i=1;i<16;i++,table1++,table2++)
{
ft1 = _mm_loadu_si128(table1);
ft2 = _mm_loadu_si128(table2);
summini.m = _mm_adds_epu16(summini.m,partialminsum(ft1,ft2));
}
// Finishing the sum process
summini.m = _mm_adds_epu16(summini.m,_mm_srli_si128(summini.m,8)); // sum the 4 firsts with the 4 lasts
summini.m = _mm_adds_epu16(summini.m,_mm_srli_si128(summini.m,4));
total = summini.c[0]+summini.c[1];
total+= (over1 < over2) ? over1:over2;
return total;
}
int threshold4(int wordcount,double identity)
{
int error;
int lmax;
wordcount+=3;
error = (int)floor((double)wordcount * ((double)1.0-identity));
lmax = (wordcount - error) / (error + 1);
// printf("length = %d error= %d lmax= %d\n",wordcount,error,lmax);
if (lmax < 4)
return 0;
return (lmax - 3) \
* (error + 1) \
+ ((wordcount - error) % (error + 1));
}
int buildTable(char *sequence, unsigned short *table, int *count);
int compareTable(unsigned char *t1, int over1, unsigned char* t2, int over2);
int threshold4(int wordcount,double identity);
cdef extern from *:
ctypedef char* const_char_ptr "const char*"
cdef import from "_upperbond.h":
int buildTable(const_char_ptr sequence, unsigned char *table, int *count)
int compareTable(unsigned char *t1, int over1, unsigned char* t2, int over2)
int threshold4(int wordcount,double identity)
'''
Created on 6 Nov. 2009
@author: coissac
'''
#@PydevCodeAnalysisIgnore
from _dynamic cimport *
from array import array
cimport array
from _upperbond cimport *
#from libupperbond import buildTable
cdef array.array[unsigned char] newtable():
table = array.array('B',[0])
array.resize(table,256)
return table
def indexSequences(seq,double threshold=0.95):
cdef bytes sequence
cdef array.array[unsigned char] table
cdef int overflow
cdef int wordcount
cdef int wordmin
table = newtable()
sequence=bytes(str(seq))
overflow = buildTable(sequence,table._B,&wordcount)
wordmin = threshold4(wordcount,threshold)
return (table,overflow,wordmin)
cpdef int countCommonWords(array.array table1,
int overflow1,
array.array table2,
int overflow2):
return compareTable(table1._B,overflow1,
table2._B,overflow2)
\ No newline at end of file
"""
array.pxd
Cython interface to Python's array.array module.
* 1D contiguous data view
* tools for fast array creation, maximum C-speed and handiness
* suitable as allround light weight auto-array within Cython code too
Usage:
>>> cimport array
Usage through Cython buffer interface (Py2.3+):
>>> def f(arg1, unsigned i, double dx)
array.array[double] a = arg1
a[i] += dx
Fast C-level new_array(_zeros), resize_array, copy_array, .length,
zero_array
cdef array.array[double] k = array.copy(d)
cdef array.array[double] n = array.array(d, d.length * 2 )
cdef array.array[double] m = array.zeros_like(FLOAT_TEMPLATE)
array.resize(f, 200000)
Zero overhead with naked data pointer views by union:
_f, _d, _i, _c, _u, ...
=> Original C array speed + Python dynamic memory management
cdef array.array a = inarray
if
a._d[2] += 0.66 # use as double array without extra casting
float *subview = vector._f + 10 # starting from 10th element
unsigned char *subview_buffer = vector._B + 4
Suitable as lightweight arrays intra Cython without speed penalty.
Replacement for C stack/malloc arrays; no trouble with refcounting,
mem.leaks; seamless Python compatibility, buffer() option
IMPORTANT: arrayarray.h (arrayobject, arraydescr) is not part of
the official Python C-API so far; arrayarray.h is located
next to this file copy it to PythonXX/include or local or
somewhere on your -I path
last changes: 2009-05-15 rk
: 2009-12-06 bp
"""
cimport stdlib
cdef extern from "stdlib.h" nogil:
void *memset(void *str, int c, size_t n)
char *strcat(char *str1, char *str2)
char *strncat(char *str1, char *str2, size_t n)
void *memchr(void *str, int c, size_t n)
int memcmp(void *str1, void *str2, size_t n)
void *memcpy(void *str1, void *str2, size_t n)
void *memmove(void *str1, void *str2, size_t n)
cdef extern from "arrayarray.h":
ctypedef void PyTypeObject
ctypedef short Py_UNICODE
int PyErr_BadArgument()
ctypedef class array.array [object arrayobject]
ctypedef object GETF(array a, Py_ssize_t ix)
ctypedef object SETF(array a, Py_ssize_t ix, object o)
ctypedef struct arraydescr: # [object arraydescr]:
int typecode
int itemsize
GETF getitem # PyObject * (*getitem)(struct arrayobject *, Py_ssize_t);
SETF setitem # int (*setitem)(struct arrayobject *, Py_ssize_t, PyObject *);
ctypedef class array.array [object arrayobject]:
cdef __cythonbufferdefaults__ = {'ndim' : 1, 'mode':'c'}
cdef:
PyTypeObject* ob_type
int ob_size # number of valid items;
unsigned length # == ob_size (by union)
char* ob_item # to first item
Py_ssize_t allocated # bytes
arraydescr* ob_descr # struct arraydescr *ob_descr;
object weakreflist # /* List of weak references */
# view's of ob_item:
float* _f # direct float pointer access to buffer
double* _d # double ...
int* _i
unsigned *_I
unsigned char *_B
signed char *_b
char *_c
unsigned long *_L
long *_l
short *_h
unsigned short *_H
Py_UNICODE *_u
void* _v
def __getbuffer__(array self, Py_buffer* info, int flags):
# This implementation of getbuffer is geared towards Cython
# requirements, and does not yet fullfill the PEP.
# In particular strided access is always provided regardless
# of flags
cdef unsigned rows, columns, itemsize
info.suboffsets = NULL
info.buf = self.ob_item
info.readonly = 0
info.ndim = 1
info.itemsize = itemsize = self.ob_descr.itemsize # e.g. sizeof(float)
info.strides = <Py_ssize_t*> \
stdlib.malloc(sizeof(Py_ssize_t) * info.ndim * 2 + 2)
info.shape = info.strides + 1
info.shape[0] = self.ob_size # number of items
info.strides[0] = info.itemsize
info.format = <char*>(info.strides + 2 * info.ndim)
info.format[0] = self.ob_descr.typecode
info.format[1] = 0
info.obj = self
def __releasebuffer__(array self, Py_buffer* info):
#if PyArray_HASFIELDS(self):
# stdlib.free(info.format)
#if sizeof(npy_intp) != sizeof(Py_ssize_t):
stdlib.free(info.strides)
array newarrayobject(PyTypeObject* type, Py_ssize_t size,
arraydescr *descr)
# fast resize/realloc
# not suitable for small increments; reallocation 'to the point'
int resize(array self, Py_ssize_t n)
# efficient for small increments (not in Py2.3-)
int resize_smart(array self, Py_ssize_t n)
# fast creation of a new array - init with zeros
# yet you need a (any) template array of the same item type (but not same size)
cdef inline array zeros_like(array sametype):
cdef array op = newarrayobject(<PyTypeObject*>sametype.ob_type, sametype.ob_size, sametype.ob_descr)
if op:
memset(op.ob_item, 0, op.ob_size * op.ob_descr.itemsize)
return op
# fast creation of a new array - no init with zeros
cdef inline array empty_like(array sametype):
return newarrayobject(<PyTypeObject*>sametype.ob_type, sametype.op.ob_size,
sametype.ob_descr)
cdef inline array copy(array self):
cdef array op = newarrayobject(<PyTypeObject*>self.ob_type, self.ob_size,
self.ob_descr)
memcpy(op.ob_item, self.ob_item, op.ob_size * op.ob_descr.itemsize)
return op
cdef inline int extend_buffer(array self, char* stuff, Py_ssize_t n):
""" efficent appending of new stuff of same type (e.g. of same array type)
n: number of elements (not number of bytes!)
"""
cdef Py_ssize_t itemsize = self.ob_descr.itemsize, orgsize = self.ob_size
if -1 == resize_smart(self, orgsize + n):
return -1
memcpy( self.ob_item + orgsize * itemsize, stuff, n * itemsize )
cdef inline int extend(array self, array other):
if self.ob_descr.typecode != self.ob_descr.typecode:
PyErr_BadArgument()
return -1
return extend_buffer(self, other.ob_item, other.ob_size)
cdef inline void zero(array op):
memset(op.ob_item, 0, op.ob_size * op.ob_descr.itemsize)
/* arrayarray.h
artificial C-API for Python's
<array.array> type.
copy this file to your -I path, e.g. .../pythonXX/include
See array.pxd next to this file
last changes: 2009-05-15 rk
*/
#ifndef _ARRAYARRAY_H
#define _ARRAYARRAY_H
#include <Python.h>
struct arrayobject; /* Forward */
/* All possible arraydescr values are defined in the vector "descriptors"
* below. That's defined later because the appropriate get and set
* functions aren't visible yet.
*/
typedef struct arraydescr {
int typecode;
int itemsize;
PyObject * (*getitem)(struct arrayobject *, Py_ssize_t);
int (*setitem)(struct arrayobject *, Py_ssize_t, PyObject *);
#if PY_VERSION_HEX >= 0x03000000
char *formats;
#endif
} arraydescr;
typedef struct arrayobject {
PyObject_HEAD
union {
int ob_size;
unsigned length;
};
union {
char *ob_item;
float *_f;
double *_d;
int *_i;
unsigned *_I;
unsigned char *_B;
signed char *_b;
char *_c;
unsigned long *_L;
long *_l;
short *_h;
unsigned short *_H;
Py_UNICODE *_u;
void *_v;
};
#if PY_VERSION_HEX >= 0x02040000
Py_ssize_t allocated;
#endif
struct arraydescr *ob_descr;
#if PY_VERSION_HEX >= 0x02040000
PyObject *weakreflist; /* List of weak references */
#if PY_VERSION_HEX >= 0x03000000
int ob_exports; /* Number of exported buffers */
#endif
#endif
} arrayobject;
#ifndef NO_NEWARRAY_INLINE
/*
*
* fast creation of a new array - init with zeros
*/
static inline PyObject *
newarrayobject(PyTypeObject *type, Py_ssize_t size, struct arraydescr *descr)
{
arrayobject *op;
size_t nbytes;
if (size < 0) {
PyErr_BadInternalCall();
return NULL;
}
nbytes = size * descr->itemsize;
/* Check for overflow */
if (nbytes / descr->itemsize != (size_t)size) {
return PyErr_NoMemory();
}
op = (arrayobject *) type->tp_alloc(type, 0);
if (op == NULL) {
return NULL;
}
op->ob_descr = descr;
#if !( PY_VERSION_HEX < 0x02040000 )
op->allocated = size;
op->weakreflist = NULL;
#endif
Py_SIZE(op) = size;
if (size <= 0) {
op->ob_item = NULL;
}
else {
op->ob_item = PyMem_NEW(char, nbytes);
if (op->ob_item == NULL) {
Py_DECREF(op);
return PyErr_NoMemory();
}
}
return (PyObject *) op;
}
#else
PyObject *
newarrayobject(PyTypeObject *type, Py_ssize_t size, struct arraydescr *descr);
#endif
/* fast resize (reallocation to the point)
not designed for filing small increments (but for fast opaque array apps) */
static inline int resize(arrayobject *self, Py_ssize_t n)
{
char *item=self->ob_item;
PyMem_RESIZE(item, char, (unsigned)(n * self->ob_descr->itemsize));
if (item == NULL) {
PyErr_NoMemory();
return -1;
}
self->ob_item = item;
self->ob_size = n;
#if PY_VERSION_HEX >= 0x02040000
self->allocated = n;
#endif
return 0;
}
/* suitable for small increments; over allocation 50% ;
Remains non-smart in Python 2.3- ; but exists for compatibility */
static inline int resize_smart(arrayobject *self, Py_ssize_t n)
{
char *item=self->ob_item;
#if PY_VERSION_HEX >= 0x02040000
if (n < self->allocated) {
if (n*4 > self->allocated) {
self->ob_size = n;
return 0;
}
}
Py_ssize_t newsize = n * 3 / 2 + 1;
PyMem_RESIZE(item, char, (unsigned)(newsize * self->ob_descr->itemsize));
if (item == NULL) {
PyErr_NoMemory();
return -1;
}
self->ob_item = item;
self->ob_size = n;
self->allocated = newsize;
return 0;
#else
return resize(self, n) /* Python 2.3 has no 'allocated' */
#endif
}
#endif
/* _ARRAYARRAY_H */
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment