Commit 70c49e21 authored by Celine Mercier's avatar Celine Mercier

Added the kmer filter to LCS alignments; obiblobs containing encoded sequences are now put directly in int16_t arrays for the alignment.
parent 08e67a09
......@@ -46,7 +46,7 @@ def addOptions(parser):
metavar='<ALIGNMENT TYPE>',
default='lcs',
type=str,
help="Compute alignment using the LCS method.")
help="Compute alignment using the LCS method (default).")
group.add_argument('--threshold','-t',
action="store", dest="align:threshold",
......@@ -64,15 +64,15 @@ def addOptions(parser):
group.add_argument('--longest_length','-L',
action="store_const", dest="align:reflength",
default="ali",
const="longest",
default=0,
const=1,
help="The reference length is the length of the longest sequence."
" Default: the reference length is the length of the alignment.")
group.add_argument('--shortest_length','-l',
action="store_const", dest="align:reflength",
default="ali",
const="shortest",
default=0,
const=2,
help="The reference length is the length of the shortest sequence."
" Default: the reference length is the length of the alignment.")
......@@ -89,9 +89,7 @@ def addOptions(parser):
def run(config):
#pb = ProgressBar(1, config, seconde=5) # TODO
# Open DMS
d = OBIDMS(config['obi']['defaultdms'])
......@@ -106,9 +104,9 @@ def run(config):
# TODO Take other alignment types into account when they'll be implemented
# Call cython alignment function
iview.align(oview)
repr(oview)
iview.align(oview, threshold=config['align']['threshold'], normalize=config['align']['normalize'], reference=config['align']['reflength'], similarity_mode=config['align']['similarity'])
print(repr(oview))
iview.close()
oview.close()
......
......@@ -59,5 +59,7 @@
../../../src/sse_banded_LCS_alignment.c
../../../src/uint8_indexer.h
../../../src/uint8_indexer.c
../../../src/upperband.h
../../../src/upperband.c
../../../src/utils.h
../../../src/utils.c
......@@ -59,5 +59,7 @@
../../../src/sse_banded_LCS_alignment.c
../../../src/uint8_indexer.h
../../../src/uint8_indexer.c
../../../src/upperband.h
../../../src/upperband.c
../../../src/utils.h
../../../src/utils.c
......@@ -59,5 +59,7 @@
../../../src/sse_banded_LCS_alignment.c
../../../src/uint8_indexer.h
../../../src/uint8_indexer.c
../../../src/upperband.h
../../../src/upperband.c
../../../src/utils.h
../../../src/utils.c
......@@ -59,5 +59,7 @@
../../../src/sse_banded_LCS_alignment.c
../../../src/uint8_indexer.h
../../../src/uint8_indexer.c
../../../src/upperband.h
../../../src/upperband.c
../../../src/utils.h
../../../src/utils.c
......@@ -59,5 +59,7 @@
../../../src/sse_banded_LCS_alignment.c
../../../src/uint8_indexer.h
../../../src/uint8_indexer.c
../../../src/upperband.h
../../../src/upperband.c
../../../src/utils.h
../../../src/utils.c
......@@ -59,5 +59,7 @@
../../../src/sse_banded_LCS_alignment.c
../../../src/uint8_indexer.h
../../../src/uint8_indexer.c
../../../src/upperband.h
../../../src/upperband.c
../../../src/utils.h
../../../src/utils.c
......@@ -59,5 +59,7 @@
../../../src/sse_banded_LCS_alignment.c
../../../src/uint8_indexer.h
../../../src/uint8_indexer.c
../../../src/upperband.h
../../../src/upperband.c
../../../src/utils.h
../../../src/utils.c
......@@ -59,5 +59,7 @@
../../../src/sse_banded_LCS_alignment.c
../../../src/uint8_indexer.h
../../../src/uint8_indexer.c
../../../src/upperband.h
../../../src/upperband.c
../../../src/utils.h
../../../src/utils.c
......@@ -59,5 +59,7 @@
../../../src/sse_banded_LCS_alignment.c
../../../src/uint8_indexer.h
../../../src/uint8_indexer.c
../../../src/upperband.h
../../../src/upperband.c
../../../src/utils.h
../../../src/utils.c
......@@ -59,5 +59,7 @@
../../../src/sse_banded_LCS_alignment.c
../../../src/uint8_indexer.h
../../../src/uint8_indexer.c
../../../src/upperband.h
../../../src/upperband.c
../../../src/utils.h
../../../src/utils.c
......@@ -19,6 +19,8 @@
#include "obitypes.h"
#include "obiview.h"
#include "sse_banded_LCS_alignment.h"
#include "upperband.h"
#include "obiblob.h"
#define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?)
......@@ -29,21 +31,22 @@
// option to write in stdint?
// check NUC_SEQS view type? and score type (int or float if normalize)
// what's with multiple sequences/line columns?
// make function that put blobs in int16
int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column,
Obiview_p score_view, OBIDMS_column_p id1_column, OBIDMS_column_p id2_column, OBIDMS_column_p score_column,
double threshold, bool normalize, int reference, bool similarity_mode)
{
index_t i, j, k;
index_t seq_count;
char* seq1;
char* seq2;
const char* id1;
const char* id2;
double score;
index_t i, j, k;
index_t seq_count;
const char* id1;
const char* id2;
double score;
OBIDMS_column_p id_column;
Kmer_table_p ktable;
Obi_blob_p blob1;
Obi_blob_p blob2;
int lcs_min;
k = 0;
......@@ -62,6 +65,15 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column,
return -1;
}
// Build kmer tables
ktable = hash_seq_column(seq_view, seq_column);
if (ktable == NULL)
{
obi_set_errno(OBI_ALIGN_ERROR);
obidebug(1, "\nError building kmer tables before aligning");
return -1;
}
// Get the ID column pointer
id_column = obi_view_get_column(seq_view, ID_COLUMN);
......@@ -69,66 +81,72 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column,
for (i=0; i < (seq_count - 1); i++)
{
if (i%100 == 0)
fprintf(stderr,"\rDone : %f %% ", (i / (float) seq_count)*100);
for (j=i+1; j < seq_count; j++)
{
//fprintf(stderr, "\ni=%lld, j=%lld, k=%lld", i, j, k);
seq1 = obi_get_seq_with_elt_idx_and_col_p_in_view(seq_view, seq_column, i, 0);
seq2 = obi_get_seq_with_elt_idx_and_col_p_in_view(seq_view, seq_column, j, 0);
blob1 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, seq_column, i, 0);
blob2 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, seq_column, j, 0);
if ((seq1 == NULL) || (seq2 == NULL))
if ((blob1 == NULL) || (blob2 == NULL))
{
obidebug(1, "\nError retrieving sequences to align");
return -1;
}
// TODO kmer filter
// kmer filter
align_filters(ktable, blob1, blob2, i, j, threshold, normalize, reference, similarity_mode, &score, &lcs_min);
// Compute alignment score
score = generic_sse_banded_lcs_align(seq1, seq2, threshold, normalize, reference, similarity_mode);
if ((threshold == 0) || (score == -1.0)) // no threshold or filter passed, and sequences not identical: align
score = obiblob_sse_banded_lcs_align(blob1, blob2, threshold, normalize, reference, similarity_mode);
// Get sequence ids
id1 = obi_get_str_with_elt_idx_and_col_p_in_view(seq_view, id_column, i, 0);
id2 = obi_get_str_with_elt_idx_and_col_p_in_view(seq_view, id_column, j, 0);
if ((score >= 0) && (((normalize || similarity_mode) && (score >= threshold)) || ((!similarity_mode && !normalize) && (score <= threshold))))
{ // Print result
// Write sequence ids in output view
if (obi_set_str_with_elt_idx_and_col_p_in_view(score_view, id1_column, k, 0, id1) < 0)
{
obidebug(1, "\nError writing id1 in a column");
return -1;
}
// Get sequence ids
id1 = obi_get_str_with_elt_idx_and_col_p_in_view(seq_view, id_column, i, 0);
id2 = obi_get_str_with_elt_idx_and_col_p_in_view(seq_view, id_column, j, 0);
if (obi_set_str_with_elt_idx_and_col_p_in_view(score_view, id2_column, k, 0, id2) < 0)
{
obidebug(1, "\nError writing id2 in a column");
return -1;
}
// Write score in output view
if (normalize)
{
if (obi_set_float_with_elt_idx_and_col_p_in_view(score_view, score_column, k, 0, (obifloat_t) score) < 0)
// Write sequence ids in output view
if (obi_set_str_with_elt_idx_and_col_p_in_view(score_view, id1_column, k, 0, id1) < 0)
{
obidebug(1, "\nError writing alignment score in a column");
obidebug(1, "\nError writing id1 in a column");
return -1;
}
}
else
{
if (obi_set_int_with_elt_idx_and_col_p_in_view(score_view, score_column, k, 0, (obiint_t) score) < 0)
if (obi_set_str_with_elt_idx_and_col_p_in_view(score_view, id2_column, k, 0, id2) < 0)
{
obidebug(1, "\nError writing alignment score in a column");
obidebug(1, "\nError writing id2 in a column");
return -1;
}
}
free(seq1);
free(seq2);
// Write score in output view
if (normalize)
{
if (obi_set_float_with_elt_idx_and_col_p_in_view(score_view, score_column, k, 0, (obifloat_t) score) < 0)
{
obidebug(1, "\nError writing alignment score in a column");
return -1;
}
}
else
{
if (obi_set_int_with_elt_idx_and_col_p_in_view(score_view, score_column, k, 0, (obiint_t) score) < 0)
{
obidebug(1, "\nError writing alignment score in a column");
return -1;
}
}
k++;
k++;
}
}
}
free_kmer_tables(ktable, seq_count);
return 0;
}
......
......@@ -19,6 +19,7 @@
#include <stdbool.h>
#include "obidms.h"
#include "obiview.h"
#include "obidmscolumn.h"
#include "obitypes.h"
......
......@@ -17,6 +17,8 @@
#include "utils.h"
#include "_sse.h"
#include "sse_banded_LCS_alignment.h"
#include "obiblob.h"
#include "encode.h" // TODO move putBlobInSeq function to encode.c ?
#define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?)
......@@ -107,7 +109,7 @@ void sse_banded_align_lcs_and_ali_len(int16_t* seq1, int16_t* seq2, int l1, int
max = INT16_MAX - l1;
numberOfRegistersPerLine = bandLengthTotal / 8;
numberOfRegistersFor3Lines = 3 * numberOfRegistersPerLine;
numberOfRegistersFor3Lines = 3 * numberOfRegistersPerLine;
SSEregisters = (um128*) calloc(numberOfRegistersFor3Lines * 2, sizeof(um128));
l_ali_SSEregisters = SSEregisters + numberOfRegistersFor3Lines;
......@@ -115,7 +117,7 @@ void sse_banded_align_lcs_and_ali_len(int16_t* seq1, int16_t* seq2, int l1, int
// preparer registres SSE
for (j=0; j<numberOfRegistersFor3Lines; j++)
l_ali_SSEregisters[j].i = _MM_LOAD_SI128(address+j*8);
l_ali_SSEregisters[j].i = _MM_LOAD_SI128((const __m128i*)(address+j*8));
p_diag = SSEregisters;
p_gap1 = SSEregisters+numberOfRegistersPerLine;
......@@ -151,13 +153,15 @@ void sse_banded_align_lcs_and_ali_len(int16_t* seq1, int16_t* seq2, int l1, int
k2 = line - k1 - 1;
nucs1.i = _MM_LOADU_SI128(seq1+l1-k1);
nucs2.i = _MM_LOADU_SI128(seq2+k2);
nucs1.i = _MM_LOADU_SI128((const __m128i*)(seq1+l1-k1));
nucs2.i = _MM_LOADU_SI128((const __m128i*)(seq2+k2));
/* fprintf(stderr, "\nnucs, r %d, k1 = %d, k2 = %d\n", j, k1, k2);
printreg(nucs1.i);
printreg(nucs2.i);
*/
// if (print)
// {
// fprintf(stderr, "\nnucs, r %d, k1 = %d, k2 = %d\n", j, k1, k2);
// printreg(nucs1.i);
// printreg(nucs2.i);
// }
// computing diagonal score :
scores.i = _MM_AND_SI128(_MM_CMPEQ_EPI16(nucs1.i, nucs2.i), _MM_SET1_EPI16(1));
......@@ -199,25 +203,26 @@ void sse_banded_align_lcs_and_ali_len(int16_t* seq1, int16_t* seq2, int l1, int
_MM_AND_SI128(p_gap2_j->i, boolean_reg.i),
_MM_ANDNOT_SI128(boolean_reg.i, current.i));
/*
fprintf(stderr, "\nline = %d", line);
fprintf(stderr, "\nDiag, r %d : ", j);
printreg((*(p_diag_j)).i);
fprintf(stderr, "Gap1 : ");
printreg((*(p_gap1_j)).i);
fprintf(stderr, "Gap2 : ");
printreg((*(p_gap2_j)).i);
fprintf(stderr, "current : ");
printreg(current.i);
fprintf(stderr, "L ALI\nDiag r %d : ", j);
printreg((*(p_l_ali_diag_j)).i);
fprintf(stderr, "Gap1 : ");
printreg((*(p_l_ali_gap1_j)).i);
fprintf(stderr, "Gap2 : ");
printreg((*(p_l_ali_gap2_j)).i);
fprintf(stderr, "current : ");
printreg(l_ali_current.i);
*/
// if (print)
// {
// fprintf(stderr, "\nline = %d", line);
// fprintf(stderr, "\nDiag, r %d : ", j);
// printreg((*(p_diag_j)).i);
// fprintf(stderr, "Gap1 : ");
// printreg((*(p_gap1_j)).i);
// fprintf(stderr, "Gap2 : ");
// printreg((*(p_gap2_j)).i);
// fprintf(stderr, "current : ");
// printreg(current.i);
// fprintf(stderr, "L ALI\nDiag r %d : ", j);
// printreg((*(p_l_ali_diag_j)).i);
// fprintf(stderr, "Gap1 : ");
// printreg((*(p_l_ali_gap1_j)).i);
// fprintf(stderr, "Gap2 : ");
// printreg((*(p_l_ali_gap2_j)).i);
// fprintf(stderr, "current : ");
// printreg(l_ali_current.i);
// }
// diag = gap1 and gap1 = current
p_diag_j->i = p_gap1_j->i;
......@@ -234,8 +239,8 @@ void sse_banded_align_lcs_and_ali_len(int16_t* seq1, int16_t* seq2, int l1, int
{
if ((odd_line && even_BLL) || (even_line && odd_BLL))
{
p_gap2[j].i = _MM_LOADU_SI128((p_gap1[j].s16)-1);
p_l_ali_gap2[j].i = _MM_LOADU_SI128((p_l_ali_gap1[j].s16)-1);
p_gap2[j].i = _MM_LOADU_SI128((const __m128i*)((p_gap1[j].s16)-1));
p_l_ali_gap2[j].i = _MM_LOADU_SI128((const __m128i*)((p_l_ali_gap1[j].s16)-1));
if (j == 0)
{
p_gap2[j].i = _MM_INSERT_EPI16(p_gap2[j].i, 0, 0);
......@@ -244,8 +249,8 @@ void sse_banded_align_lcs_and_ali_len(int16_t* seq1, int16_t* seq2, int l1, int
}
else
{
p_gap2[j].i = _MM_LOADU_SI128(p_gap1[j].s16+1);
p_l_ali_gap2[j].i = _MM_LOADU_SI128(p_l_ali_gap1[j].s16+1);
p_gap2[j].i = _MM_LOADU_SI128((const __m128i*)(p_gap1[j].s16+1));
p_l_ali_gap2[j].i = _MM_LOADU_SI128((const __m128i*)(p_l_ali_gap1[j].s16+1));
if (j == numberOfRegistersPerLine - 1)
{
p_gap2[j].i = _MM_INSERT_EPI16(p_gap2[j].i, 0, 7);
......@@ -267,7 +272,10 @@ void sse_banded_align_lcs_and_ali_len(int16_t* seq1, int16_t* seq2, int l1, int
l_loc = (int) floor((double)(bandLengthLeft) / (double)2) - ceil((double)(diff) / (double)2);
l_reg = (int)floor((double)l_loc/(double)8.0);
//fprintf(stderr, "\nl_reg = %d, l_loc = %d\n", l_reg, l_loc);
// if (print)
// fprintf(stderr, "\nl_reg = %d, l_loc = %d\n", l_reg, l_loc);
l_loc = l_loc - l_reg*8;
// extracting the results from the registers :
......@@ -357,8 +365,8 @@ double sse_banded_align_just_lcs(int16_t* seq1, int16_t* seq2, int l1, int l2, i
k2 = line - k1 - 1;
nucs1.i = _MM_LOADU_SI128(seq1+l1-k1);
nucs2.i = _MM_LOADU_SI128(seq2+k2);
nucs1.i = _MM_LOADU_SI128((const __m128i*)(seq1+l1-k1));
nucs2.i = _MM_LOADU_SI128((const __m128i*)(seq2+k2));
// computing diagonal score :
scores.i = _MM_AND_SI128(_MM_CMPEQ_EPI16(nucs1.i, nucs2.i), _MM_SET1_EPI16(1));
......@@ -381,7 +389,7 @@ double sse_banded_align_just_lcs(int16_t* seq1, int16_t* seq2, int l1, int l2, i
{
if ((odd_line && even_BLL) || (even_line && odd_BLL))
{
(*(p_gap2+j)).i = _MM_LOADU_SI128(((*(p_gap1+j)).s16)-1);
(*(p_gap2+j)).i = _MM_LOADU_SI128((const __m128i*)(((*(p_gap1+j)).s16)-1));
if (j == 0)
{
(*(p_gap2+j)).i = _MM_INSERT_EPI16((*(p_gap2+j)).i, 0, 0);
......@@ -389,7 +397,7 @@ double sse_banded_align_just_lcs(int16_t* seq1, int16_t* seq2, int l1, int l2, i
}
else
{
(*(p_gap2+j)).i = _MM_LOADU_SI128(((*(p_gap1+j)).s16)+1);
(*(p_gap2+j)).i = _MM_LOADU_SI128((const __m128i*)(((*(p_gap1+j)).s16)+1));
if (j == numberOfRegistersPerLine - 1)
{
(*(p_gap2+j)).i = _MM_INSERT_EPI16((*(p_gap2+j)).i, 0, 7);
......@@ -483,6 +491,41 @@ void putSeqInSeq(int16_t* seq, char* s, int l, bool reverse)
}
/*
 * Unpacks the first l 2-bit nucleotide codes stored in b->value into the
 * int16_t array seq.
 *
 * Each byte of b->value holds four codes; the code at sequence position p is
 * read from byte p/4, with position p%4 == 0 occupying the two highest bits.
 *
 * seq     : destination array, must have room for at least l elements.
 * b       : blob whose value field contains the 2-bit encoded sequence.
 * l       : number of nucleotides to unpack.
 * reverse : if true, seq[0] receives the last nucleotide and seq[l-1] the
 *           first; otherwise the order of the blob is kept.
 *
 * Every stored value is the 2-bit code plus 1, because a nucleotide can't be
 * == 0 (0 is a default value used to initialize some registers).
 */
void putBlobInSeq(int16_t* seq, Obi_blob_p b, int l, bool reverse)
{
    size_t count = (size_t) l;
    size_t out;

    for (out = 0; out < count; out++)
    {
        // Position in the encoded sequence feeding this output slot
        size_t  src        = reverse ? (count - 1 - out) : out;
        // Bit offset of the code inside its byte: 6, 4, 2 or 0
        uint8_t bit_offset = 6 - 2*(src % 4);
        uint8_t selector   = NUC_MASK_2B << bit_offset;
        uint8_t code       = (b->value[src/4] & selector) >> bit_offset;

        seq[out] = (int16_t)code+1;
    }
}
void initializeAddressWithGaps(int16_t* address, int bandLengthTotal, int bandLengthLeft, int l1)
{
int i;
......@@ -491,7 +534,7 @@ void initializeAddressWithGaps(int16_t* address, int bandLengthTotal, int bandLe
int bm;
int value=INT16_MAX-l1;
numberOfRegistersPerLine = bandLengthTotal / 8;
numberOfRegistersPerLine = bandLengthTotal / 8;
bm = bandLengthLeft%2;
for (i=0; i < (3*numberOfRegistersPerLine*8); i++)
......@@ -543,7 +586,7 @@ double sse_banded_lcs_align(int16_t* seq1, int16_t* seq2, int l1, int l2, bool n
// fprintf(stderr, "\nid before normalizations = %f", id);
// fprintf(stderr, "\nlcs = %f, ali = %d\n", id, ali_length);
//fprintf(stderr, "\nlcs = %f, ali length = %d\n", id, ali_length);
if (!similarity_mode && !normalize)
switch(reference) {
......@@ -694,3 +737,98 @@ double generic_sse_banded_lcs_align(char* seq1, char* seq2, double threshold, bo
return(id);
}
double obiblob_sse_banded_lcs_align(Obi_blob_p seq1, Obi_blob_p seq2, double threshold, bool normalize, int reference, bool similarity_mode)
{
double id;
int l1, l2;
int lmax, lmin;
int sizeToAllocateForBand, sizeToAllocateForSeqs;
int maxBLL;
int LCSmin;
int shift;
int16_t* address;
int16_t* iseq1;
int16_t* iseq2;
address = NULL;
l1 = seq1->length_decoded_value;
l2 = seq2->length_decoded_value;
if (l1 > l2)
{
lmax = l1;
lmin = l2;
}
else
{
lmax = l2;
lmin = l1;
}
// If the score is expressed as a normalized distance, get the corresponding identity
if (!similarity_mode && normalize)
threshold = 1.0 - threshold;
// Calculate the minimum LCS length corresponding to the threshold
LCSmin = calculateLCSmin(lmax, lmin, threshold, normalize, reference, similarity_mode);
// Allocate space for matrix band if the alignment length must be computed
if ((reference == ALILEN) && (normalize || !similarity_mode)) // cases in which alignment length must be computed
{
sizeToAllocateForBand = calculateSizeToAllocate(lmax, lmin, LCSmin);
address = obi_get_memory_aligned_on_16(sizeToAllocateForBand, &shift);
if (address == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR);
obidebug(1, "\nError getting a memory address aligned on 16 bytes boundary");
return 0; // TODO DOUBLE_MIN
}
}
// Allocate space for the int16_t arrays representing the sequences
maxBLL = calculateLeftBandLength(lmax, LCSmin);
sizeToAllocateForSeqs = 2*maxBLL+lmax;
iseq1 = (int16_t*) malloc(sizeToAllocateForSeqs*sizeof(int16_t));
iseq2 = (int16_t*) malloc(sizeToAllocateForSeqs*sizeof(int16_t));
if ((iseq1 == NULL) || (iseq2 == NULL))
{
obi_set_errno(OBI_MALLOC_ERROR);
obidebug(1, "\nError allocating memory for integer arrays to use in LCS alignment");
return 0; // TODO DOUBLE_MIN
}
// Initialize the int arrays
iniSeq(iseq1, (2*maxBLL)+lmax, 0);
iniSeq(iseq2, (2*maxBLL)+lmax, 255);
// Shift addresses to where the sequences have to be put
iseq1 = iseq1+maxBLL;
iseq2 = iseq2+maxBLL;
// Put the DNA sequences in the int arrays. Longest sequence must be first argument of sse_align function
if (l2 > l1)
{
putBlobInSeq(iseq1, seq2, l2, TRUE);
putBlobInSeq(iseq2, seq1, l1, FALSE);
// Compute alignment
id = sse_banded_lcs_align(iseq1, iseq2, l2, l1, normalize, reference, similarity_mode, address, LCSmin);
}
else
{
putBlobInSeq(iseq1, seq1, l1, TRUE);
putBlobInSeq(iseq2, seq2, l2, FALSE);
// Compute alignment
id = sse_banded_lcs_align(iseq1, iseq2, l1, l2, normalize, reference, similarity_mode, address, LCSmin);
}
// Free allocated elements