Commit 70c49e21 by Celine Mercier

Added the kmer filter to LCS alignments, and now obiblobs containing

encoded sequences are directly put in int16_t arrays for the alignment
parent 08e67a09
......@@ -46,7 +46,7 @@ def addOptions(parser):
metavar='<ALIGNMENT TYPE>',
default='lcs',
type=str,
help="Compute alignment using the LCS method.")
help="Compute alignment using the LCS method (default).")
group.add_argument('--threshold','-t',
action="store", dest="align:threshold",
......@@ -64,15 +64,15 @@ def addOptions(parser):
group.add_argument('--longest_length','-L',
action="store_const", dest="align:reflength",
default="ali",
const="longest",
default=0,
const=1,
help="The reference length is the length of the longest sequence."
" Default: the reference length is the length of the alignment.")
group.add_argument('--shortest_length','-l',
action="store_const", dest="align:reflength",
default="ali",
const="shortest",
default=0,
const=2,
help="The reference length is the length of the shortest sequence."
" Default: the reference length is the length of the alignment.")
......@@ -89,9 +89,7 @@ def addOptions(parser):
def run(config):
#pb = ProgressBar(1, config, seconde=5) # TODO
# Open DMS
d = OBIDMS(config['obi']['defaultdms'])
......@@ -106,9 +104,9 @@ def run(config):
# TODO Take other alignment types into account when they'll be implemented
# Call cython alignment function
iview.align(oview)
repr(oview)
iview.align(oview, threshold=config['align']['threshold'], normalize=config['align']['normalize'], reference=config['align']['reflength'], similarity_mode=config['align']['similarity'])
print(repr(oview))
iview.close()
oview.close()
......
......@@ -59,5 +59,7 @@
../../../src/sse_banded_LCS_alignment.c
../../../src/uint8_indexer.h
../../../src/uint8_indexer.c
../../../src/upperband.h
../../../src/upperband.c
../../../src/utils.h
../../../src/utils.c
......@@ -59,5 +59,7 @@
../../../src/sse_banded_LCS_alignment.c
../../../src/uint8_indexer.h
../../../src/uint8_indexer.c
../../../src/upperband.h
../../../src/upperband.c
../../../src/utils.h
../../../src/utils.c
......@@ -59,5 +59,7 @@
../../../src/sse_banded_LCS_alignment.c
../../../src/uint8_indexer.h
../../../src/uint8_indexer.c
../../../src/upperband.h
../../../src/upperband.c
../../../src/utils.h
../../../src/utils.c
......@@ -59,5 +59,7 @@
../../../src/sse_banded_LCS_alignment.c
../../../src/uint8_indexer.h
../../../src/uint8_indexer.c
../../../src/upperband.h
../../../src/upperband.c
../../../src/utils.h
../../../src/utils.c
......@@ -59,5 +59,7 @@
../../../src/sse_banded_LCS_alignment.c
../../../src/uint8_indexer.h
../../../src/uint8_indexer.c
../../../src/upperband.h
../../../src/upperband.c
../../../src/utils.h
../../../src/utils.c
......@@ -59,5 +59,7 @@
../../../src/sse_banded_LCS_alignment.c
../../../src/uint8_indexer.h
../../../src/uint8_indexer.c
../../../src/upperband.h
../../../src/upperband.c
../../../src/utils.h
../../../src/utils.c
......@@ -59,5 +59,7 @@
../../../src/sse_banded_LCS_alignment.c
../../../src/uint8_indexer.h
../../../src/uint8_indexer.c
../../../src/upperband.h
../../../src/upperband.c
../../../src/utils.h
../../../src/utils.c
......@@ -59,5 +59,7 @@
../../../src/sse_banded_LCS_alignment.c
../../../src/uint8_indexer.h
../../../src/uint8_indexer.c
../../../src/upperband.h
../../../src/upperband.c
../../../src/utils.h
../../../src/utils.c
......@@ -59,5 +59,7 @@
../../../src/sse_banded_LCS_alignment.c
../../../src/uint8_indexer.h
../../../src/uint8_indexer.c
../../../src/upperband.h
../../../src/upperband.c
../../../src/utils.h
../../../src/utils.c
......@@ -59,5 +59,7 @@
../../../src/sse_banded_LCS_alignment.c
../../../src/uint8_indexer.h
../../../src/uint8_indexer.c
../../../src/upperband.h
../../../src/upperband.c
../../../src/utils.h
../../../src/utils.c
......@@ -19,6 +19,8 @@
#include "obitypes.h"
#include "obiview.h"
#include "sse_banded_LCS_alignment.h"
#include "upperband.h"
#include "obiblob.h"
#define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?)
......@@ -29,21 +31,22 @@
// option pour ecrire en stdint?
// check NUC_SEQS view type? and score type (int or float if normalize)
// what's with multiple sequences/line columns?
// make function that put blobs in int16
int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column,
Obiview_p score_view, OBIDMS_column_p id1_column, OBIDMS_column_p id2_column, OBIDMS_column_p score_column,
double threshold, bool normalize, int reference, bool similarity_mode)
{
index_t i, j, k;
index_t seq_count;
char* seq1;
char* seq2;
const char* id1;
const char* id2;
double score;
index_t i, j, k;
index_t seq_count;
const char* id1;
const char* id2;
double score;
OBIDMS_column_p id_column;
Kmer_table_p ktable;
Obi_blob_p blob1;
Obi_blob_p blob2;
int lcs_min;
k = 0;
......@@ -62,6 +65,15 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column,
return -1;
}
// Build kmer tables
ktable = hash_seq_column(seq_view, seq_column);
if (ktable == NULL)
{
obi_set_errno(OBI_ALIGN_ERROR);
obidebug(1, "\nError building kmer tables before aligning");
return -1;
}
// Get the ID column pointer
id_column = obi_view_get_column(seq_view, ID_COLUMN);
......@@ -69,66 +81,72 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column,
for (i=0; i < (seq_count - 1); i++)
{
if (i%100 == 0)
fprintf(stderr,"\rDone : %f %% ", (i / (float) seq_count)*100);
for (j=i+1; j < seq_count; j++)
{
//fprintf(stderr, "\ni=%lld, j=%lld, k=%lld", i, j, k);
seq1 = obi_get_seq_with_elt_idx_and_col_p_in_view(seq_view, seq_column, i, 0);
seq2 = obi_get_seq_with_elt_idx_and_col_p_in_view(seq_view, seq_column, j, 0);
blob1 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, seq_column, i, 0);
blob2 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, seq_column, j, 0);
if ((seq1 == NULL) || (seq2 == NULL))
if ((blob1 == NULL) || (blob2 == NULL))
{
obidebug(1, "\nError retrieving sequences to align");
return -1;
}
// TODO kmer filter
// kmer filter
align_filters(ktable, blob1, blob2, i, j, threshold, normalize, reference, similarity_mode, &score, &lcs_min);
// Compute alignment score
score = generic_sse_banded_lcs_align(seq1, seq2, threshold, normalize, reference, similarity_mode);
if ((threshold == 0) || (score == -1.0)) // no threshold or filter passed, and sequences not identical: align
score = obiblob_sse_banded_lcs_align(blob1, blob2, threshold, normalize, reference, similarity_mode);
// Get sequence ids
id1 = obi_get_str_with_elt_idx_and_col_p_in_view(seq_view, id_column, i, 0);
id2 = obi_get_str_with_elt_idx_and_col_p_in_view(seq_view, id_column, j, 0);
if ((score >= 0) && (((normalize || similarity_mode) && (score >= threshold)) || ((!similarity_mode && !normalize) && (score <= threshold))))
{ // Print result
// Write sequence ids in output view
if (obi_set_str_with_elt_idx_and_col_p_in_view(score_view, id1_column, k, 0, id1) < 0)
{
obidebug(1, "\nError writing id1 in a column");
return -1;
}
// Get sequence ids
id1 = obi_get_str_with_elt_idx_and_col_p_in_view(seq_view, id_column, i, 0);
id2 = obi_get_str_with_elt_idx_and_col_p_in_view(seq_view, id_column, j, 0);
if (obi_set_str_with_elt_idx_and_col_p_in_view(score_view, id2_column, k, 0, id2) < 0)
{
obidebug(1, "\nError writing id2 in a column");
return -1;
}
// Write score in output view
if (normalize)
{
if (obi_set_float_with_elt_idx_and_col_p_in_view(score_view, score_column, k, 0, (obifloat_t) score) < 0)
// Write sequence ids in output view
if (obi_set_str_with_elt_idx_and_col_p_in_view(score_view, id1_column, k, 0, id1) < 0)
{
obidebug(1, "\nError writing alignment score in a column");
obidebug(1, "\nError writing id1 in a column");
return -1;
}
}
else
{
if (obi_set_int_with_elt_idx_and_col_p_in_view(score_view, score_column, k, 0, (obiint_t) score) < 0)
if (obi_set_str_with_elt_idx_and_col_p_in_view(score_view, id2_column, k, 0, id2) < 0)
{
obidebug(1, "\nError writing alignment score in a column");
obidebug(1, "\nError writing id2 in a column");
return -1;
}
}
free(seq1);
free(seq2);
// Write score in output view
if (normalize)
{
if (obi_set_float_with_elt_idx_and_col_p_in_view(score_view, score_column, k, 0, (obifloat_t) score) < 0)
{
obidebug(1, "\nError writing alignment score in a column");
return -1;
}
}
else
{
if (obi_set_int_with_elt_idx_and_col_p_in_view(score_view, score_column, k, 0, (obiint_t) score) < 0)
{
obidebug(1, "\nError writing alignment score in a column");
return -1;
}
}
k++;
k++;
}
}
}
free_kmer_tables(ktable, seq_count);
return 0;
}
......
......@@ -19,6 +19,7 @@
#include <stdbool.h>
#include "obidms.h"
#include "obiview.h"
#include "obidmscolumn.h"
#include "obitypes.h"
......
......@@ -12,6 +12,9 @@
#include <stdint.h>
#include <stdbool.h>
#include "obiblob.h"
#define ALILEN (0) // TODO enum
#define MAXLEN (1)
#define MINLEN (2)
......@@ -19,5 +22,6 @@
// TODO doc
int calculateLCSmin(int l1, int l2, double threshold, bool normalize, int reference, bool lcsmode);
double generic_sse_banded_lcs_align(char* seq1, char* seq2, double threshold, bool normalize, int reference, bool similarity_mode);
double obiblob_sse_banded_lcs_align(Obi_blob_p seq1, Obi_blob_p seq2, double threshold, bool normalize, int reference, bool similarity_mode);
#endif
#include <stdio.h>
#include <math.h>
#include <stdbool.h>
#include "upperband.h"
#include "_sse.h"
#include "sse_banded_LCS_alignment.h"
#include "obidmscolumn.h"
#include "obiview.h"
//#include "../libutils/utilities.h"
//#include "../libfasta/sequence.h"
inline static uchar_v hash4m128(uchar_v frag)
{
uchar_v words;
vUInt8 mask_03= _MM_SET1_EPI8(0x03); // charge le registre avec 16x le meme octet
vUInt8 mask_FC= _MM_SET1_EPI8(0xFC);
frag.m = _MM_SRLI_EPI64(frag.m,1); // shift logic a droite sur 2 x 64 bits
frag.m = _MM_AND_SI128(frag.m,mask_03); // and sur les 128 bits
words.m= _MM_SLLI_EPI64(frag.m,2);
words.m= _MM_AND_SI128(words.m,mask_FC);
frag.m = _MM_SRLI_SI128(frag.m,1);
words.m= _MM_OR_SI128(words.m,frag.m);
words.m= _MM_SLLI_EPI64(words.m,2);
words.m= _MM_AND_SI128(words.m,mask_FC);
frag.m = _MM_SRLI_SI128(frag.m,1);
words.m= _MM_OR_SI128(words.m,frag.m);
words.m= _MM_SLLI_EPI64(words.m,2);
words.m= _MM_AND_SI128(words.m,mask_FC);
frag.m = _MM_SRLI_SI128(frag.m,1);
words.m= _MM_OR_SI128(words.m,frag.m);
return words;
}
#ifdef __SSE2__
inline static int anyzerom128(vUInt8 data)
{
vUInt8 mask_00= _MM_SETZERO_SI128();
uint64_v tmp;
tmp.m = _MM_CMPEQ_EPI8(data,mask_00);
return (int)(tmp.c[0]!=0 || tmp.c[1]!=0);
}
#else
inline static int anyzerom128(vUInt8 data)
{
int i;
um128 tmp;
tmp.i = data;
for (i=0;i<8;i++)
if (tmp.s8[i]==0)
return 1;
return 0;
}
#endif
inline static void dumpm128(unsigned short *table,vUInt8 data)
{
memcpy(table,&data,16);
}
/**
* Compute 4mer occurrence table from a DNA sequence
*
* sequence : a pointer to the null terminated nuc sequence
* table : a pointer to a 256 cells unisgned char table for
* storing the occurrence table
* count : pointer to an int value used as a return value
* containing the global word counted
*
* returns the number of words observed in the sequence with a
* count greater than 255.
*/
int build_table(const char* sequence, unsigned char *table, int *count)
{
int overflow = 0;
int wc=0;
int i;
vUInt8 mask_00= _MM_SETZERO_SI128();
uchar_v frag;
uchar_v words;
uchar_v zero;
char* s;
s=(char*)sequence;
memset(table,0,256*sizeof(unsigned char));
// encode ascii sequence with A : 00 C : 01 T: 10 G : 11
for(frag.m=_MM_LOADU_SI128((vUInt8*)s);
! anyzerom128(frag.m);
s+=12,frag.m=_MM_LOADU_SI128((vUInt8*)s))
{
words= hash4m128(frag);
// printf("%d %d %d %d\n",words.c[0],words.c[1],words.c[2],words.c[3]);
if (table[words.c[0]]<255) table[words.c[0]]++; else overflow++;
if (table[words.c[1]]<255) table[words.c[1]]++; else overflow++;
if (table[words.c[2]]<255) table[words.c[2]]++; else overflow++;
if (table[words.c[3]]<255) table[words.c[3]]++; else overflow++;
if (table[words.c[4]]<255) table[words.c[4]]++; else overflow++;
if (table[words.c[5]]<255) table[words.c[5]]++; else overflow++;
if (table[words.c[6]]<255) table[words.c[6]]++; else overflow++;
if (table[words.c[7]]<255) table[words.c[7]]++; else overflow++;
if (table[words.c[8]]<255) table[words.c[8]]++; else overflow++;
if (table[words.c[9]]<255) table[words.c[9]]++; else overflow++;
if (table[words.c[10]]<255) table[words.c[10]]++; else overflow++;
if (table[words.c[11]]<255) table[words.c[11]]++; else overflow++;
wc+=12;
}
zero.m=_MM_CMPEQ_EPI8(frag.m,mask_00);
//printf("frag=%d %d %d %d\n",frag.c[0],frag.c[1],frag.c[2],frag.c[3]);
//printf("zero=%d %d %d %d\n",zero.c[0],zero.c[1],zero.c[2],zero.c[3]);
words = hash4m128(frag);
if (zero.c[0]+zero.c[1]+zero.c[2]+zero.c[3]==0)
{
for(i=0;zero.c[i+3]==0;i++,wc++)
{
if (table[words.c[i]]<255)
(table[words.c[i]])++;
else
(overflow)++;
}
}
if (count)
*count=wc;
return overflow;
}
static inline vUInt16 partial_min_sum(vUInt8 ft1,vUInt8 ft2)
{
vUInt8 mini;
vUInt16 minilo;
vUInt16 minihi;
vUInt8 mask_00= _MM_SETZERO_SI128();
mini = _MM_MIN_EPU8(ft1,ft2);
minilo = _MM_UNPACKLO_EPI8(mini,mask_00);
minihi = _MM_UNPACKHI_EPI8(mini,mask_00);
return _MM_ADDS_EPU16(minilo,minihi);
}
int compare_tables(unsigned char *t1, int over1, unsigned char* t2, int over2)
{
vUInt8 ft1;
vUInt8 ft2;
vUInt8 *table1=(vUInt8*)t1;
vUInt8 *table2=(vUInt8*)t2;
ushort_v summini;
int i;
int total;
ft1 = _MM_LOADU_SI128(table1);
ft2 = _MM_LOADU_SI128(table2);
summini.m = partial_min_sum(ft1,ft2);
table1++;
table2++;
for (i=1;i<16;i++,table1++,table2++)
{
ft1 = _MM_LOADU_SI128(table1);
ft2 = _MM_LOADU_SI128(table2);
summini.m = _MM_ADDS_EPU16(summini.m, partial_min_sum(ft1,ft2));
}
// Finishing the sum process
summini.m = _MM_ADDS_EPU16(summini.m,_MM_SRLI_SI128(summini.m,8)); // sum the 4 firsts with the 4 lasts
summini.m = _MM_ADDS_EPU16(summini.m,_MM_SRLI_SI128(summini.m,4));
total = summini.c[0]+summini.c[1];
total+= (over1 < over2) ? over1:over2;
return total;
}
int threshold4(int wordcount, double identity)
{
int error;
int lmax;
wordcount+=3;
error = (int)floor((double)wordcount * ((double)1.0-identity));
lmax = (wordcount - error) / (error + 1);
if (lmax < 4)
return 0;
return (lmax - 3) \
* (error + 1) \
+ ((wordcount - error) % (error + 1));
}
int thresholdLCS4(int32_t reflen, int32_t lcs)
{
int nbfrag;
int smin;
int R;
int common;
nbfrag = (reflen - lcs)*2 + 1;
smin = lcs/nbfrag;
R = lcs - smin * nbfrag;
common = MAX(smin - 2,0) * R + MAX(smin - 3,0) * (nbfrag - R);
return common;
}
Kmer_table_p hash_seq_column(Obiview_p view, OBIDMS_column_p seq_col)
{
size_t i;
size_t seq_count;
int32_t count;
Kmer_table_p ktable;
char* seq;
fprintf(stderr,"Building kmer tables...");
seq_count = (seq_col->header)->lines_used;
// Allocate memory for the table structure
ktable = (Kmer_table_p) malloc(sizeof(Kmer_table_t) * seq_count);
if (ktable == NULL)
return NULL;
for (i=0; i < seq_count; i++)
{
seq = obi_get_seq_with_elt_idx_and_col_p_in_view(view, seq_col, i, 0); // TODO discuss 1 element per line mandatory
if (seq == NULL)
return NULL; // TODO or not
ktable[i].table = malloc(256 * sizeof(unsigned char));
if (ktable[i].table == NULL)
return NULL;
ktable[i].over = build_table(seq, ktable[i].table, &count);
free(seq);
}
fprintf(stderr," : Done\n");
return ktable;
}
void free_kmer_tables(Kmer_table_p ktable, size_t count)
{
size_t i;
for (i=0; i < count; i++)
free(ktable[i].table);
free(ktable);
}
bool is_possible(Kmer_table_p ktable, index_t idx1, index_t idx2, int l1, int l2, double threshold, bool normalize, int reference, bool similarity_mode)
{
int32_t reflen;
int32_t lcs;
int32_t mincount;
if (l1 < 12 || l2 < 12)
return true;
if (reference==ALILEN || reference==MAXLEN)
reflen = l1;
else
reflen = l2;
if (normalize)
lcs = (int32_t)ceil((double)reflen * threshold);
else
{
if (! similarity_mode)
threshold = reflen - threshold;
lcs = (int32_t) threshold;
}
mincount = thresholdLCS4(l1, lcs);
return compare_tables(ktable[idx1].table, ktable[idx1].over, ktable[idx2].table, ktable[idx2].over) >= mincount;
}
void align_filters(Kmer_table_p ktable, Obi_blob_p seq1, Obi_blob_p seq2, index_t idx1, index_t idx2, double threshold, bool normalize, int reference, bool similarity_mode, double* score, int* LCSmin)
{ // score takes value -2 if filters are not passed, -1 if filters are passed and >= 0 with max score if the 2 sequences are identical.
int l1;
int l2;
l1 = seq1->length_decoded_value;
*score = -2.0;
if (obi_blob_compare(seq1, seq2) == 0) // the 2 sequences are identical. TODO add bool arg indicating whether that's a possibility?
{
if (similarity_mode && normalize)
*score = 1.0;
else if (!similarity_mode)
*score = 0.0;
else
*score = l1;
}
else if (threshold != 0)
{
l2 = seq2->length_decoded_value;
if (l1 >= l2)
{
*LCSmin = calculateLCSmin(l1, l2, threshold, normalize, reference, similarity_mode);
if (l2 >= *LCSmin)
{
if (is_possible(ktable, idx1, idx2, l1, l2, threshold, normalize, reference, similarity_mode)) // 4-mers filter
*score = -1.0;
}
}
else
{
*LCSmin = calculateLCSmin(l2, l1, threshold, normalize, reference, similarity_mode);
if (l1 >= *LCSmin)
{
if (is_possible(ktable, idx2, idx1, l2, l1, threshold, normalize, reference, similarity_mode)) // 4-mers filter
*score = -1.0;
}
}
}
else
*LCSmin = 0;
}
#ifndef UPPERBAND_H_
#define UPPERBAND_H_
// TODO doc
#include <stdbool.h>
#include <stdio.h>
#include "obiblob.h"
#include "obiview.h"
#include "obidmscolumn.h"
typedef struct {
unsigned char* table; // 4mer occurrence table built using the build_table function
int32_t over; // count of 4mers with an occurrence greater than 255 (overflow)
} Kmer_table_t, *Kmer_table_p;
Kmer_table_p hash_seq_column(Obiview_p view, OBIDMS_column_p seq_col);
void align_filters(Kmer_table_p ktable, Obi_blob_p seq1, Obi_blob_p seq2, index_t idx1, index_t idx2, double threshold, bool normalize, int reference, bool similarity_mode, double* score, int* LCSmin);
void free_kmer_tables(Kmer_table_p ktable, size_t count);
#endif
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment