Commit 238e9f70 authored by MercierC's avatar MercierC

alignpairedend: fixed bug that would cut out sequence ends when it

should not have
parent e099a166
......@@ -77,7 +77,6 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
int32_t* shift_count_array;
Obi_ali_p ali = NULL;
int i, j;
bool switched_seqs;
bool reversed;
int score = 0;
Obi_blob_p blob1 = NULL;
......@@ -124,6 +123,8 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
bool keep_seq2_start;
bool keep_seq1_end;
bool keep_seq2_end;
bool left_ali;
bool rev_quals = false;
// Check kmer size
if ((kmer_size < 1) || (kmer_size > 4))
......@@ -148,19 +149,8 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
}
// Choose the shortest sequence to save kmer positions in array
switched_seqs = false;
len1 = blob1->length_decoded_value;
len2 = blob2->length_decoded_value;
if (len2 < len1)
{
switched_seqs = true;
temp_len = len1;
len1 = len2;
len2 = temp_len;
temp_blob = blob1;
blob1 = blob2;
blob2 = temp_blob;
}
// Force encoding on 2 bits by replacing ambiguous nucleotides by 'a's
if (blob1->element_size == 4)
......@@ -196,7 +186,47 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
else
reversed = false;
if (reversed)
switched_seqs = !switched_seqs;
// unreverse to make cases simpler. Costly but rare (direct match is reverse primer match)
{
if (seq2 == NULL)
seq2 = obi_blob_to_seq(blob2);
seq2 = reverse_complement_sequence(seq2);
blob2 = obi_seq_to_blob(seq2);
if (seq1 == NULL)
seq1 = obi_blob_to_seq(blob1);
seq1 = reverse_complement_sequence(seq1);
blob1 = obi_seq_to_blob(seq1);
free_blob1 = true;
// Get the quality arrays
qual1 = obi_get_qual_int_with_elt_idx_and_col_p_in_view(view1, qual_col1, idx1, 0, &qual1_len);
if (qual1 == NULL)
{
obidebug(1, "\nError getting the quality of the 1st sequence when computing the kmer similarity between two sequences");
return NULL;
}
qual2 = obi_get_qual_int_with_elt_idx_and_col_p_in_view(view2, qual_col2, idx2, 0, &qual2_len);
if (qual2 == NULL)
{
obidebug(1, "\nError getting the quality of the 2nd sequence when computing the kmer similarity between two sequences");
return NULL;
}
uint8_t* newqual1 = malloc(qual1_len*sizeof(uint8_t));
uint8_t* newqual2 = malloc(qual2_len*sizeof(uint8_t));
for (i=0;i<qual1_len;i++)
newqual1[i] = qual1[qual1_len-1-i];
for (i=0;i<qual2_len;i++)
newqual2[i] = qual2[qual2_len-1-i];
qual1 = newqual1;
qual2 = newqual2;
rev_quals = true;
}
// Save total length for the shift counts array
total_len = len1 + len2 + 1; // +1 for shift 0
......@@ -237,7 +267,7 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
return NULL;
}
}
else if (len1 >= shift_array_height)
else if (total_len >= shift_array_height)
{
shift_array_height = total_len;
*shift_array_p = (int32_t*) realloc(*shift_array_p, ARRAY_LENGTH * shift_array_height * sizeof(int32_t));
......@@ -291,7 +321,7 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
*shift_array_height_p = shift_array_height;
*shift_count_array_length_p = shift_count_array_length;
// Fill array with positions of kmers in the shortest sequence
// Fill array with positions of kmers in the first sequence
encoding = blob1->element_size;
kmer_count = len1 - kmer_size + 1;
for (kmer_idx=0; kmer_idx < kmer_count; kmer_idx++)
......@@ -310,7 +340,7 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
kmer_pos_array[(kmer*kmer_pos_array_height)+i] = kmer_idx;
}
// Compare positions of kmers between both sequences and store shifts
// Compare positions of kmers between both sequences and store shifts (a shift corresponds to pos2 - pos1)
kmer_count = blob2->length_decoded_value - kmer_size + 1;
for (kmer_idx=0; kmer_idx < kmer_count; kmer_idx++)
{
......@@ -374,35 +404,42 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
// The 873863 cases of hell
if (best_shift > 0)
{
left_ali = false;
overlap_len = len2 - best_shift;
if (len1 <= overlap_len)
{
overlap_len = len1;
if (! switched_seqs)
keep_seq2_end = true;
else
keep_seq2_start = true;
}
else if (switched_seqs)
{
keep_seq2_start = true;
keep_seq1_end = true;
keep_seq2_end = true;
}
}
else if (best_shift < 0)
{
left_ali = true;
overlap_len = len1 + best_shift;
if (!switched_seqs)
{
keep_seq1_start = true;
keep_seq2_end = true;
}
if (len2 <= overlap_len)
{
overlap_len = len2;
keep_seq1_start = true;
}
else
{
keep_seq1_start = true;
keep_seq2_end = true;
}
}
else
else // if (best_shift == 0)
{
overlap_len = len1;
if ((!switched_seqs) && (len2 > len1))
if (len2 >= len1)
{
overlap_len = len1;
keep_seq2_end = true;
left_ali = true;
}
else
{
overlap_len = len2;
left_ali = false; // discussable
}
}
ali = (Obi_ali_p) malloc(sizeof(Obi_ali_t));
......@@ -433,7 +470,7 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
ali->direction[0] = '\0';
else
{
if (((best_shift <= 0) && (!switched_seqs)) || ((best_shift > 0) && switched_seqs))
if (left_ali)
strcpy(ali->direction, "left");
else
strcpy(ali->direction, "right");
......@@ -442,28 +479,28 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
// Build the consensus sequence if asked
if (build_consensus)
{
// Get the quality arrays
qual1 = obi_get_qual_int_with_elt_idx_and_col_p_in_view(view1, qual_col1, idx1, 0, &qual1_len);
if (qual1 == NULL)
if (! rev_quals)
{
obidebug(1, "\nError getting the quality of the 1st sequence when computing the kmer similarity between two sequences");
return NULL;
}
qual2 = obi_get_qual_int_with_elt_idx_and_col_p_in_view(view2, qual_col2, idx2, 0, &qual2_len);
if (qual2 == NULL)
{
obidebug(1, "\nError getting the quality of the 2nd sequence when computing the kmer similarity between two sequences");
return NULL;
// Get the quality arrays
qual1 = obi_get_qual_int_with_elt_idx_and_col_p_in_view(view1, qual_col1, idx1, 0, &qual1_len);
if (qual1 == NULL)
{
obidebug(1, "\nError getting the quality of the 1st sequence when computing the kmer similarity between two sequences");
return NULL;
}
qual2 = obi_get_qual_int_with_elt_idx_and_col_p_in_view(view2, qual_col2, idx2, 0, &qual2_len);
if (qual2 == NULL)
{
obidebug(1, "\nError getting the quality of the 2nd sequence when computing the kmer similarity between two sequences");
return NULL;
}
}
// Decode the first sequence if not already done
if (seq1 == NULL)
seq1 = obi_blob_to_seq(blob1);
if (! switched_seqs)
consensus_len = len2 - best_shift;
else
consensus_len = len1 + best_shift;
consensus_len = len2 - best_shift;
// Allocate memory for consensus sequence
consensus_seq = (char*) malloc(consensus_len + 1 * sizeof(char)); // TODO keep malloced too maybe
......@@ -557,6 +594,12 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
free(seq2);
free(blob2);
if (rev_quals)
{
free(qual1);
free(qual2);
}
return ali;
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment