/****************************************************************************
 * LCS sequence alignment functions				                            *
 ****************************************************************************/

/**
 * @file obi_lcs.c
 * @author Celine Mercier (celine.mercier@metabarcoding.org)
 * @date May 4th 2016
 * @brief Functions handling LCS sequence alignments.
 */

12 13 14 15
//#define OMP_SUPPORT // TODO
#ifdef OMP_SUPPORT
#include <omp.h>
#endif
16 17 18 19 20

#include <stdlib.h>
#include <stdio.h>
#include <stdbool.h>

21
#include "obi_lcs.h"
22 23 24 25 26
#include "obidebug.h"
#include "obierrno.h"
#include "obitypes.h"
#include "obiview.h"
#include "sse_banded_LCS_alignment.h"
27 28
#include "upperband.h"
#include "obiblob.h"
29 30 31 32 33


#define DEBUG_LEVEL 0	// TODO has to be defined somewhere else (cython compil flag?)


34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
/**************************************************************************
 *
 * D E C L A R A T I O N   O F   T H E   P R I V A T E   F U N C T I O N S
 *
 **************************************************************************/


/**
 * @brief Internal function adding to the output view all the columns in which alignment results are stored.
 *
 * The columns for the ids, the input line indices, the LCS length and the score are always created.
 * The alignment length column is created only when the reference is the alignment length and the
 * score is either normalized or expressed as a distance. The sequence and count columns are created
 * only when print_seq and print_count are respectively true.
 *
 * @param output_view The writable view in which the columns are created.
 * @param id1_indexer_name Name of the indexer in which the id of the 1st aligned sequence is indexed.
 * @param id2_indexer_name Name of the indexer in which the id of the 2nd aligned sequence is indexed.
 * @param seq1_indexer_name Name of the indexer in which the 1st aligned sequence is indexed (only used if print_seq is true).
 * @param seq2_indexer_name Name of the indexer in which the 2nd aligned sequence is indexed (only used if print_seq is true).
 * @param print_seq Whether the aligned sequences are to be copied in the output view.
 * @param print_count Whether the counts of the aligned sequences are to be copied in the output view.
 * @param normalize Whether the score is normalized with the reference length.
 * @param reference The reference length. 0: the alignment length; 1: the longest sequence's length; 2: the shortest sequence's length.
 * @param similarity_mode Whether the score is expressed as a similarity (true) or as a distance (false).
 *
 * @retval 0 if all the columns were successfully created.
 * @retval -1 if an error occurred.
 *
 * @since December 2016
 * @author Celine Mercier (celine.mercier@metabarcoding.org)
 */
static int create_alignment_output_columns(
		Obiview_p   output_view,
		const char* id1_indexer_name,
		const char* id2_indexer_name,
		const char* seq1_indexer_name,
		const char* seq2_indexer_name,
		bool        print_seq,
		bool        print_count,
		bool        normalize,
		int         reference,
		bool        similarity_mode);


/**
 * @brief Internal function printing the result of one alignment to the output view.
 *
 * @param output_view A pointer on the writable view where the result should be written.
 * @param line The line in the output view where the result should be written.
 * @param idx1_column A pointer on the column where the index referring to the line of the first sequence aligned in the input view should be written.
 * @param idx2_column A pointer on the column where the index referring to the line of the second sequence aligned in the input view should be written.
 * @param idx1 The index referring to the line of the first sequence aligned in the input view.
 * @param idx2 The index referring to the line of the second sequence aligned in the input view.
 * @param id1_column A pointer on the column where the identifier of the first sequence aligned should be written.
 * @param id2_column A pointer on the column where the identifier of the second sequence aligned should be written.
 * @param id1_idx The index of the identifier of the first sequence aligned.
 * @param id2_idx The index of the identifier of the second sequence aligned.
 * @param print_seq A boolean indicating whether the aligned sequences should be copied in the output view.
 * @param seq1_column A pointer on the column where the first sequence aligned should be written (only used if print_seq is true).
 * @param seq2_column A pointer on the column where the second sequence aligned should be written (only used if print_seq is true).
 * @param seq1_idx The index of the sequence of the first sequence aligned.
 * @param seq2_idx The index of the sequence of the second sequence aligned.
 * @param print_count A boolean indicating whether the aligned sequence counts should be copied in the output view.
 * @param count1_column A pointer on the column where the count of the first sequence aligned should be written (only used if print_count is true).
 * @param count2_column A pointer on the column where the count of the second sequence aligned should be written (only used if print_count is true).
 * @param count1 The count of the first sequence aligned.
 * @param count2 The count of the second sequence aligned.
 * @param ali_length_column A pointer on the column where the alignment length should be written (only used if the alignment length was computed).
 * @param ali_length The alignment length.
 * @param lcs_length_column A pointer on the column where the LCS length should be written.
 * @param lcs_length The LCS length.
 * @param score_column A pointer on the column where the score should be written.
 * @param score The alignment score.
 * @param reference The reference length. 0: The alignment length; 1: The longest sequence's length; 2: The shortest sequence's length.
 * @param normalize Whether the score should be normalized with the reference sequence length.
 * @param similarity_mode Whether the score should be expressed in similarity (true) or distance (false).
 *
 * @retval 0 if the operation was successfully completed.
 * @retval -1 if an error occurred.
 *
 * @since December 2016
 * @author Celine Mercier (celine.mercier@metabarcoding.org)
 */
static int print_alignment_result(Obiview_p output_view,
								   index_t line,
								   OBIDMS_column_p idx1_column,
								   OBIDMS_column_p idx2_column,
								   index_t idx1,
								   index_t idx2,
								   OBIDMS_column_p id1_column,
								   OBIDMS_column_p id2_column,
								   index_t id1_idx,
								   index_t id2_idx,
								   bool print_seq,
								   OBIDMS_column_p seq1_column,
								   OBIDMS_column_p seq2_column,
								   index_t seq1_idx,
								   index_t seq2_idx,
								   bool print_count,
								   OBIDMS_column_p count1_column,
								   OBIDMS_column_p count2_column,
								   int count1,
								   int count2,
								   OBIDMS_column_p ali_length_column,
								   int ali_length,
								   OBIDMS_column_p lcs_length_column,
								   int lcs_length,
								   OBIDMS_column_p score_column,
								   double score,
								   int reference,
								   bool normalize,
								   bool similarity_mode);



/************************************************************************
 *
 * D E F I N I T I O N   O F   T H E   P R I V A T E   F U N C T I O N S
 *
 ************************************************************************/


/*
 * Creates in output_view the columns receiving the alignment results.
 *
 * Always created: the two id columns, the two input line index columns, the LCS length column
 * and the score column (OBI_FLOAT if normalize is true, OBI_INT otherwise).
 * Created on demand: the alignment length column (reference == ALILEN and the score is normalized
 * or a distance), the two sequence columns (print_seq) and the two count columns (print_count).
 *
 * Returns 0 on success, -1 as soon as the creation of one column fails.
 */
static int create_alignment_output_columns(Obiview_p output_view,
										   const char* id1_indexer_name,
										   const char* id2_indexer_name,
										   const char* seq1_indexer_name,
										   const char* seq2_indexer_name,
		                                   bool print_seq, bool print_count,
										   bool normalize, int reference, bool similarity_mode)
{
	// Create the column for the ids of the 1st sequence aligned
	if (obi_view_add_column(output_view, ID1_COLUMN_NAME, -1, NULL, OBI_STR, 0, 1, NULL, false, false, false, id1_indexer_name, NULL, -1, ID1_COLUMN_COMMENTS, true) < 0)
	{
		obidebug(1, "\nError creating the first column for the sequence ids when aligning");
		return -1;
	}

	// Create the column for the ids of the 2nd sequence aligned
	if (obi_view_add_column(output_view, ID2_COLUMN_NAME, -1, NULL, OBI_STR, 0, 1, NULL, false, false, false, id2_indexer_name, NULL, -1, ID2_COLUMN_COMMENTS, true) < 0)
	{
		obidebug(1, "\nError creating the second column for the sequence ids when aligning");
		return -1;
	}

	// Create the column for the index (in the input view) of the first sequences aligned
	if (obi_view_add_column(output_view, IDX1_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, false, false, NULL, NULL, -1, IDX1_COLUMN_COMMENTS, true) < 0)
	{
		obidebug(1, "\nError creating the first column for the sequence indices when aligning");
		return -1;
	}

	// Create the column for the index (in the input view) of the second sequences aligned
	if (obi_view_add_column(output_view, IDX2_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, false, false, NULL, NULL, -1, IDX2_COLUMN_COMMENTS, true) < 0)
	{
		obidebug(1, "\nError creating the second column for the sequence indices when aligning");
		return -1;
	}

	// Create the column for the LCS length
	if (obi_view_add_column(output_view, LCS_LENGTH_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, false, false, NULL, NULL, -1, LCS_LENGTH_COLUMN_COMMENTS, true) < 0)
	{
		obidebug(1, "\nError creating the column for the LCS length when aligning");
		return -1;
	}

	// Create the column for the alignment length if it is computed
	if ((reference == ALILEN) && (normalize || !similarity_mode))
	{
		if (obi_view_add_column(output_view, ALI_LENGTH_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, false, false, NULL, NULL, -1, ALI_LENGTH_COLUMN_COMMENTS, true) < 0)
		{
			obidebug(1, "\nError creating the column for the alignment length when aligning");
			return -1;
		}
	}

	// Create the column for the alignment score: a normalized score is stored as a float, a raw score as an integer.
	// NOTE(review): the comments argument is SCORE_COLUMN_NAME whereas every other column passes its *_COLUMN_COMMENTS macro;
	// kept as is because no score comments macro is visible from here -- confirm whether one exists.
	if (obi_view_add_column(output_view, SCORE_COLUMN_NAME, -1, NULL, (normalize ? OBI_FLOAT : OBI_INT), 0, 1, NULL, false, false, false, NULL, NULL, -1, SCORE_COLUMN_NAME, true) < 0)
	{
		obidebug(1, "\nError creating the column for the score when aligning");
		return -1;
	}

	if (print_seq)
	{
		// Create the column for the first sequences aligned
		if (obi_view_add_column(output_view, SEQ1_COLUMN_NAME, -1, NULL, OBI_SEQ, 0, 1, NULL, false, false, false, seq1_indexer_name, NULL, -1, SEQ1_COLUMN_COMMENTS, true) < 0)
		{
			obidebug(1, "\nError creating the first column for the sequences when aligning");
			return -1;
		}

		// Create the column for the second sequences aligned
		if (obi_view_add_column(output_view, SEQ2_COLUMN_NAME, -1, NULL, OBI_SEQ, 0, 1, NULL, false, false, false, seq2_indexer_name, NULL, -1, SEQ2_COLUMN_COMMENTS, true) < 0)
		{
			obidebug(1, "\nError creating the second column for the sequences when aligning");
			return -1;
		}
	}

	if (print_count)
	{
		// Create the column for the count of the first sequences aligned
		if (obi_view_add_column(output_view, COUNT1_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, false, false, NULL, NULL, -1, COUNT1_COLUMN_COMMENTS, true) < 0)
		{
			obidebug(1, "\nError creating the first column for the sequence counts when aligning");
			return -1;
		}

		// Create the column for the count of the second sequences aligned
		if (obi_view_add_column(output_view, COUNT2_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, false, false, NULL, NULL, -1, COUNT2_COLUMN_COMMENTS, true) < 0)
		{
			obidebug(1, "\nError creating the second column for the sequence counts when aligning");
			return -1;
		}
	}

	return 0;
}


/*
 * Writes the result of one alignment on the given line of the output view.
 *
 * Always written: the two input line indices, the two id indices, the LCS length and the score
 * (as a float if normalize is true, truncated to an integer otherwise).
 * Written on demand: the two sequence indices (print_seq), the two counts (print_count) and the
 * alignment length (reference == ALILEN and the score is normalized or a distance).
 * The column pointers of the optional values are only used when the matching condition holds.
 *
 * Returns 0 on success, -1 as soon as one write fails.
 */
static int print_alignment_result(Obiview_p output_view,
								   index_t line,
								   OBIDMS_column_p idx1_column,
								   OBIDMS_column_p idx2_column,
								   index_t idx1,
								   index_t idx2,
								   OBIDMS_column_p id1_column,
								   OBIDMS_column_p id2_column,
								   index_t id1_idx,
								   index_t id2_idx,
								   bool print_seq,
								   OBIDMS_column_p seq1_column,
								   OBIDMS_column_p seq2_column,
								   index_t seq1_idx,
								   index_t seq2_idx,
								   bool print_count,
								   OBIDMS_column_p count1_column,
								   OBIDMS_column_p count2_column,
								   int count1,
								   int count2,
								   OBIDMS_column_p ali_length_column,
								   int ali_length,
								   OBIDMS_column_p lcs_length_column,
								   int lcs_length,
								   OBIDMS_column_p score_column,
								   double score,
								   int reference,
								   bool normalize,
								   bool similarity_mode)
{
	// Write line indices of the input view in the output view (to easily refer to the input sequences from the output view)
	if (obi_set_int_with_elt_idx_and_col_p_in_view(output_view, idx1_column, line, 0, idx1) < 0)
	{
		obidebug(1, "\nError writing idx1 in a column");
		return -1;
	}
	if (obi_set_int_with_elt_idx_and_col_p_in_view(output_view, idx2_column, line, 0, idx2) < 0)
	{
		obidebug(1, "\nError writing idx2 in a column");
		return -1;
	}

	// Write ids in output view (the indices are written as is, the values stay in the indexer)
	if (obi_set_index_with_elt_idx_and_col_p_in_view(output_view, id1_column, line, 0, id1_idx) < 0)
	{
		obidebug(1, "\nError writing id1 in a column");
		return -1;
	}
	if (obi_set_index_with_elt_idx_and_col_p_in_view(output_view, id2_column, line, 0, id2_idx) < 0)
	{
		obidebug(1, "\nError writing id2 in a column");
		return -1;
	}

	// Write the sequences if needed
	if (print_seq)
	{
		if (obi_set_index_with_elt_idx_and_col_p_in_view(output_view, seq1_column, line, 0, seq1_idx) < 0)
		{
			obidebug(1, "\nError writing seq1 in a column");
			return -1;
		}

		if (obi_set_index_with_elt_idx_and_col_p_in_view(output_view, seq2_column, line, 0, seq2_idx) < 0)
		{
			obidebug(1, "\nError writing seq2 in a column");
			return -1;
		}
	}

	// Write the counts if needed
	if (print_count)
	{
		if (obi_set_int_with_elt_idx_and_col_p_in_view(output_view, count1_column, line, 0, count1) < 0)
		{
			obidebug(1, "\nError writing count1 in a column");
			return -1;
		}

		if (obi_set_int_with_elt_idx_and_col_p_in_view(output_view, count2_column, line, 0, count2) < 0)
		{
			obidebug(1, "\nError writing count2 in a column");
			return -1;
		}
	}

	// Write the alignment length if it was computed (same condition as the one used to create the column)
	if ((reference == ALILEN) && (normalize || !similarity_mode))
	{
		if (obi_set_int_with_elt_idx_and_col_p_in_view(output_view, ali_length_column, line, 0, ali_length) < 0)
		{
			obidebug(1, "\nError writing alignment length in a column");
			return -1;
		}
	}

	// Write the LCS length
	if (obi_set_int_with_elt_idx_and_col_p_in_view(output_view, lcs_length_column, line, 0, lcs_length) < 0)
	{
		obidebug(1, "\nError writing LCS length in a column");
		return -1;
	}

	// Write score: the score column is a float column when normalizing, an integer column otherwise
	if (normalize)
	{
		if (obi_set_float_with_elt_idx_and_col_p_in_view(output_view, score_column, line, 0, (obifloat_t) score) < 0)
		{
			obidebug(1, "\nError writing alignment score in a column");
			return -1;
		}
	}
	else
	{
		if (obi_set_int_with_elt_idx_and_col_p_in_view(output_view, score_column, line, 0, (obiint_t) score) < 0)
		{
			obidebug(1, "\nError writing alignment score in a column");
			return -1;
		}
	}

	return 0;
}



/**********************************************************************
 *
 * D E F I N I T I O N   O F   T H E   P U B L I C   F U N C T I O N S
 *
 **********************************************************************/


388 389 390 391
int obi_lcs_align_one_column(const char* dms_name,
							 const char* seq_view_name,
							 const char* seq_column_name,
							 const char* seq_elt_name,
392
							 const char* id_column_name,
393 394
					         const char* output_view_name,
							 const char* output_view_comments,
395
							 bool print_seq, bool print_count,
396 397
						     double threshold, bool normalize, int reference, bool similarity_mode,
							 int thread_count)
398
{
399 400
	index_t         i, j, k;
	index_t         seq_count;
401 402
	index_t         id1_idx, id2_idx;
	index_t         seq1_idx, seq2_idx;
403
	int             count1, count2;
404
	double          score;
405 406
	int             lcs_length;
	int             ali_length;
407 408 409 410
	Kmer_table_p    ktable;
	Obi_blob_p      blob1;
	Obi_blob_p   	blob2;
	int				lcs_min;
411 412
	index_t         seq_elt_idx;

413
	OBIDMS_p        dms = NULL;
414 415 416
	Obiview_p       seq_view = NULL;
	Obiview_p       output_view = NULL;
	OBIDMS_column_p iseq_column = NULL;
417
	OBIDMS_column_p i_count_column = NULL;
418
	OBIDMS_column_p id_column = NULL;
419 420 421 422
	OBIDMS_column_p id1_column = NULL;
	OBIDMS_column_p id2_column = NULL;
	OBIDMS_column_p seq1_column = NULL;
	OBIDMS_column_p seq2_column = NULL;
423 424
	OBIDMS_column_p count1_column = NULL;
	OBIDMS_column_p count2_column = NULL;
425 426 427 428 429
	OBIDMS_column_p idx1_column = NULL;
	OBIDMS_column_p idx2_column = NULL;
	OBIDMS_column_p lcs_length_column = NULL;
	OBIDMS_column_p ali_length_column = NULL;
	OBIDMS_column_p score_column = NULL;
430 431 432

	k = 0;

433 434 435 436 437 438 439 440
	// Open DMS
	dms = obi_open_dms(dms_name);
	if (dms == NULL)
	{
		obidebug(1, "\nError opening the DMS");
		return -1;
	}

441 442 443 444 445 446 447 448 449 450
	// Open input view
	seq_view = obi_open_view(dms, seq_view_name);
	if (seq_view == NULL)
	{
		obidebug(1, "\nError opening the input view to align");
		return -1;
	}

	// Open the sequence column to align
	// If a column name wasn't given, open default sequence column
451
	if (strcmp(seq_column_name, "") == 0)  // TODO check for NULL
452
	{
453 454 455 456 457 458
		if (strcmp((seq_view->infos)->view_type, VIEW_TYPE_NUC_SEQS) == 0)
			iseq_column = obi_view_get_column(seq_view, NUC_SEQUENCE_COLUMN);
		else
		{
			obi_set_errno(OBI_ALIGN_ERROR);
			obidebug(1, "\nError: no column given to align");
459
			return -1;
460
		}
461
	}
462 463 464
	else
		iseq_column = obi_view_get_column(seq_view, seq_column_name);
	if (iseq_column == NULL)
465
	{
466
		obidebug(1, "\nError getting the column to align");
467 468 469
		return -1;
	}

470 471 472 473 474 475 476 477
	// Check column type
	if ((iseq_column->header)->returned_data_type != OBI_SEQ)
	{
		obi_set_errno(OBI_ALIGN_ERROR);
		obidebug(1, "\nError: column given to align is not an OBI_SEQ column");
		return -1;
	}

478 479
	// Get element index of the sequence to align in each line to compute it only once
	if ((strcmp(seq_elt_name, "") != 0) && (seq_elt_name != NULL))
480
	{
481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508
		seq_elt_idx = obi_column_get_element_index_from_name(iseq_column, seq_elt_name);
		if (seq_elt_idx == OBIIdx_NA)
		{
			obidebug(1, "\nError getting the sequence index in a column line when aligning");
			return -1;
		}
	}
	else
		seq_elt_idx = 0;

	// Open the ID column, containing the identifiers of the sequences to align
	// If a column name wasn't given, open default ID column
	if (strcmp(id_column_name, "") == 0)
	{
		if (strcmp((seq_view->infos)->view_type, VIEW_TYPE_NUC_SEQS) == 0)
			id_column = obi_view_get_column(seq_view, ID_COLUMN);
		else
		{
			obi_set_errno(OBI_ALIGN_ERROR);
			obidebug(1, "\nError: no ID column given");
			return -1;
		}
	}
	else
		id_column = obi_view_get_column(seq_view, id_column_name);
	if (id_column == NULL)
	{
		obidebug(1, "\nError getting the ID column");
509 510 511
		return -1;
	}

512 513 514 515 516 517 518 519 520 521 522
	// Open the input count column
	if (print_count)
	{
		i_count_column = obi_view_get_column(seq_view, COUNT_COLUMN);
		if (i_count_column == NULL)
		{
			obidebug(1, "\nError getting the input COUNT column");
			return -1;
		}
	}

523 524 525
	// Create the output view
	output_view = obi_new_view(dms, output_view_name, NULL, NULL, output_view_comments);
	if (output_view == NULL)
526
	{
527 528 529 530 531
		obidebug(1, "\nError creating the output view when aligning");
		return -1;
	}

	// Create the output columns
532 533 534 535
	if (create_alignment_output_columns(output_view,
			(id_column->header)->indexer_name, (id_column->header)->indexer_name,
			(iseq_column->header)->indexer_name, (iseq_column->header)->indexer_name,
			print_seq, print_count, normalize, reference, similarity_mode) < 0)
536 537 538 539 540
		return -1;
	id1_column = obi_view_get_column(output_view, ID1_COLUMN_NAME);
	id2_column = obi_view_get_column(output_view, ID2_COLUMN_NAME);
	idx1_column = obi_view_get_column(output_view, IDX1_COLUMN_NAME);
	idx2_column = obi_view_get_column(output_view, IDX2_COLUMN_NAME);
541
    lcs_length_column = obi_view_get_column(output_view, LCS_LENGTH_COLUMN_NAME);
542 543 544 545 546 547 548 549
	if ((reference == ALILEN) && (normalize || !similarity_mode))
		ali_length_column = obi_view_get_column(output_view, ALI_LENGTH_COLUMN_NAME);
	score_column = obi_view_get_column(output_view, SCORE_COLUMN_NAME);
	if (print_seq)
	{
		seq1_column = obi_view_get_column(output_view, SEQ1_COLUMN_NAME);
		seq2_column = obi_view_get_column(output_view, SEQ2_COLUMN_NAME);
	}
550 551 552 553 554
	if (print_count)
	{
		count1_column = obi_view_get_column(output_view, COUNT1_COLUMN_NAME);
		count2_column = obi_view_get_column(output_view, COUNT2_COLUMN_NAME);
	}
555

556
	// Build kmer tables
557
	ktable = hash_seq_column(seq_view, iseq_column, seq_elt_idx);
558 559 560 561 562 563 564
	if (ktable == NULL)
	{
		obi_set_errno(OBI_ALIGN_ERROR);
		obidebug(1, "\nError building kmer tables before aligning");
		return -1;
	}

565
	seq_count = (seq_view->infos)->line_count;
566

567 568 569 570 571
	#ifdef OMP_SUPPORT
	omp_set_num_threads(thread_count);
	#pragma omp parallel for
	#endif

572 573
	for (i=0; i < (seq_count - 1); i++)
	{
574 575 576
		if (i%100 == 0)
			fprintf(stderr,"\rDone : %f %%       ", (i / (float) seq_count)*100);

577 578 579 580 581 582 583 584 585 586 587
		// Get first id idx
		id1_idx = obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, id_column, i, 0);	// TODO Could there be multiple IDs per line?
		// Get first sequence and its index
		seq1_idx = obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, iseq_column, i, seq_elt_idx);
		blob1 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, iseq_column, i, seq_elt_idx);
		if (blob1 == NULL)
		{
			obidebug(1, "\nError retrieving sequences to align");
			return -1;
		}

588 589
		for (j=i+1; j < seq_count; j++)
		{
590
			// Get second sequence and its index
591
			seq2_idx = obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, iseq_column, j, seq_elt_idx);
592 593
			blob2 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, iseq_column, j, seq_elt_idx);
			if (blob2 == NULL)
594 595 596 597 598
			{
				obidebug(1, "\nError retrieving sequences to align");
				return -1;
			}

599
			// Check if the sequences are identical in a quick way (same index in the same indexer)
600
			if (obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, iseq_column, i, seq_elt_idx) == obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, iseq_column, j, seq_elt_idx))
601 602 603 604 605 606 607 608
			{
				if (similarity_mode && normalize)
					score = 1.0;
				else if (!similarity_mode)
					score = 0.0;
				else
					score = blob1->length_decoded_value;
			}
609

610 611 612 613 614 615 616
			else // the sequences aren't identical
			{
				// kmer filter
				align_filters(ktable, blob1, blob2, i, j, threshold, normalize, reference, similarity_mode, &score, &lcs_min, false);

				// Compute alignment score
				if ((threshold == 0) || (score == -1.0))	// no threshold, or filter passed: align
617
					score = obiblob_sse_banded_lcs_align(blob1, blob2, threshold, normalize, reference, similarity_mode, &lcs_length, &ali_length);
618
			}
619

620
			if ((score >= 0) && (((normalize || similarity_mode) && (score >= threshold)) || ((!similarity_mode && !normalize) && (score <= threshold))))
621
			{	// Print result
622

623
				// Get second id idx
624
				id2_idx = obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, id_column, j, 0);
625

626 627 628 629 630 631 632
				// Get counts  // TODO use array for efficiency?
				if (print_count)
				{
					count1 = obi_get_int_with_elt_idx_and_col_p_in_view(seq_view, i_count_column, i, 0);
					count2 = obi_get_int_with_elt_idx_and_col_p_in_view(seq_view, i_count_column, j, 0);
				}

633 634 635 636
				if (print_alignment_result(output_view, k,
										   idx1_column, idx2_column, i, j,
										   id1_column, id2_column, id1_idx, id2_idx,
						                   print_seq, seq1_column, seq2_column, seq1_idx, seq2_idx,
637
										   print_count, count1_column, count2_column, count1, count2,
638 639 640 641
										   ali_length_column, ali_length,
										   lcs_length_column, lcs_length,
										   score_column, score,
										   reference, normalize, similarity_mode) < 0)
642
					return -1;
643

644 645
				k++;
			}
646 647 648
		}
	}

649 650
	fprintf(stderr,"\rDone : 100 %%         \n");

651
	// Close views
652
	if (obi_save_and_close_view(seq_view) < 0)
653 654 655 656
	{
		obidebug(1, "\nError closing the input view after aligning");
		return -1;
	}
657
	if (obi_save_and_close_view(output_view) < 0)
658 659 660 661 662
	{
		obidebug(1, "\nError closing the output view after aligning");
		return -1;
	}

663 664 665 666 667 668
	if (obi_close_dms(dms, false) < 0)
	{
		obidebug(1, "\nError closing the DMS after aligning");
		return -1;
	}

669 670
	free_kmer_tables(ktable, seq_count);

671 672 673 674
	return 0;
}


675
int obi_lcs_align_two_columns(const char* dms_name,
676 677 678 679 680 681 682 683
							  const char* seq1_view_name,
							  const char* seq2_view_name,
							  const char* seq1_column_name,
							  const char* seq2_column_name,
							  const char* seq1_elt_name,
							  const char* seq2_elt_name,
							  const char* id1_column_name,
							  const char* id2_column_name,
684 685
					          const char* output_view_name,
							  const char* output_view_comments,
686 687 688 689 690 691 692 693
							  bool print_seq, bool print_count,
						      double threshold, bool normalize, int reference, bool similarity_mode)
{
	index_t         i, j, k;
	index_t         seq1_count;
	index_t         seq2_count;
	index_t         id1_idx, id2_idx;
	index_t         seq1_idx, seq2_idx;
694
	int             count1, count2;
695 696 697 698 699 700 701 702 703 704 705
	double          score;
	int             lcs_length;
	int             ali_length;
	Kmer_table_p    ktable;
	Obi_blob_p      blob1;
	Obi_blob_p   	blob2;
	int				lcs_min;
	index_t         seq1_elt_idx;
	index_t         seq2_elt_idx;
	bool 			same_indexer;

706
	OBIDMS_p        dms = NULL;
707 708 709 710 711 712 713
	Obiview_p       seq1_view = NULL;
	Obiview_p       seq2_view = NULL;
	Obiview_p       output_view = NULL;
	OBIDMS_column_p i_seq1_column = NULL;
	OBIDMS_column_p i_seq2_column = NULL;
	OBIDMS_column_p i_id1_column = NULL;
	OBIDMS_column_p i_id2_column = NULL;
714 715
	OBIDMS_column_p i_count1_column = NULL;
	OBIDMS_column_p i_count2_column = NULL;
716 717 718 719
	OBIDMS_column_p id1_column = NULL;
	OBIDMS_column_p id2_column = NULL;
	OBIDMS_column_p seq1_column = NULL;
	OBIDMS_column_p seq2_column = NULL;
720 721
	OBIDMS_column_p count1_column = NULL;
	OBIDMS_column_p count2_column = NULL;
722 723 724 725 726 727 728 729
	OBIDMS_column_p idx1_column = NULL;
	OBIDMS_column_p idx2_column = NULL;
	OBIDMS_column_p lcs_length_column = NULL;
	OBIDMS_column_p ali_length_column = NULL;
	OBIDMS_column_p score_column = NULL;

	k = 0;

730 731 732 733 734 735 736 737
	// Open DMS
	dms = obi_open_dms(dms_name);
	if (dms == NULL)
	{
		obidebug(1, "\nError opening the DMS to align");
		return -1;
	}


	// Open the first input view
	seq1_view = obi_open_view(dms, seq1_view_name);
	if (seq1_view == NULL)
	{
		obidebug(1, "\nError opening the first input view to align");
		return -1;
	}

	// Open the second input view. Same as 1st if ""
	if (strcmp(seq2_view_name, "") == 0)
		seq2_view = seq1_view;
	else
	{
		seq2_view = obi_open_view(dms, seq2_view_name);
		if (seq2_view == NULL)
		{
			obidebug(1, "\nError opening the second input view to align");
			return -1;
		}
	}

	// Open the first sequence column to align
	// If a column name wasn't given, open default sequence column
	if (strcmp(seq1_column_name, "") == 0)
	{
		if (strcmp((seq1_view->infos)->view_type, VIEW_TYPE_NUC_SEQS) == 0)
			i_seq1_column = obi_view_get_column(seq1_view, NUC_SEQUENCE_COLUMN);
		else
		{
			obi_set_errno(OBI_ALIGN_ERROR);
			obidebug(1, "\nError: no first column given to align");
			return -1;
		}
	}
	else
		i_seq1_column = obi_view_get_column(seq1_view, seq1_column_name);
	if (i_seq1_column == NULL)
	{
		obidebug(1, "\nError getting the first column to align");
		return -1;
	}

	// Check column type
	if ((i_seq1_column->header)->returned_data_type != OBI_SEQ)
	{
		obi_set_errno(OBI_ALIGN_ERROR);
		obidebug(1, "\nError: first column given to align is not an OBI_SEQ column");
		return -1;
	}

	// Open the second sequence column to align
	// If a column name wasn't given, open default sequence column
	if (strcmp(seq2_column_name, "") == 0)
	{
		if (strcmp((seq2_view->infos)->view_type, VIEW_TYPE_NUC_SEQS) == 0)
			i_seq2_column = obi_view_get_column(seq2_view, NUC_SEQUENCE_COLUMN);
		else
		{
			obi_set_errno(OBI_ALIGN_ERROR);
			obidebug(1, "\nError: no second column given to align");
			return -1;
		}
	}
	else
		i_seq2_column = obi_view_get_column(seq2_view, seq2_column_name);
	if (i_seq2_column == NULL)
	{
		obidebug(1, "\nError getting the second column to align");
		return -1;
	}
	// Check that the sequence columns are not both the default NUC_SEQ column of the same view
	if (i_seq1_column == i_seq2_column)
	{
		obidebug(1, "\nError: trying to align a column with itself (default NUC_SEQ column of the same view)");
		return -1;
	}

	// Check column type
	if ((i_seq2_column->header)->returned_data_type != OBI_SEQ)
	{
		obi_set_errno(OBI_ALIGN_ERROR);
		obidebug(1, "\nError: second column given to align is not an OBI_SEQ column");
		return -1;
	}

	// Get element index of the sequence to align in each line of the first column to compute it only once
	if ((strcmp(seq1_elt_name, "") != 0) && (seq1_elt_name != NULL))
	{
		seq1_elt_idx = obi_column_get_element_index_from_name(i_seq1_column, seq1_elt_name);
		if (seq1_elt_idx == OBIIdx_NA)
		{
			obidebug(1, "\nError getting the sequence index in a column line when aligning");
			return -1;
		}
	}
	else
		seq1_elt_idx = 0;

	// Get element index of the sequence to align in each line of the second column to compute it only once
	if ((strcmp(seq2_elt_name, "") != 0) && (seq2_elt_name != NULL))
	{
		seq2_elt_idx = obi_column_get_element_index_from_name(i_seq2_column, seq2_elt_name);
		if (seq2_elt_idx == OBIIdx_NA)
		{
			obidebug(1, "\nError getting the sequence index in a column line when aligning");
			return -1;
		}
	}
	else
		seq2_elt_idx = 0;


	// Open the first ID column, containing the identifiers of the first sequence to align
	// If a column name wasn't given, open default ID column
	if (strcmp(id1_column_name, "") == 0)
	{
		if (strcmp((seq1_view->infos)->view_type, VIEW_TYPE_NUC_SEQS) == 0)
			i_id1_column = obi_view_get_column(seq1_view, ID_COLUMN);
		else
		{
			obi_set_errno(OBI_ALIGN_ERROR);
			obidebug(1, "\nError: no first ID column given");
			return -1;
		}
	}
	else
		i_id1_column = obi_view_get_column(seq1_view, id1_column_name);
	if (i_id1_column == NULL)
	{
		obidebug(1, "\nError getting the first ID column");
		return -1;
	}

	// Open the second ID column, containing the identifiers of the second sequence to align
	// If a column name wasn't given, open default ID column
	if (strcmp(id2_column_name, "") == 0)
	{
		if (strcmp((seq2_view->infos)->view_type, VIEW_TYPE_NUC_SEQS) == 0)
			i_id2_column = obi_view_get_column(seq2_view, ID_COLUMN);
		else
		{
			obi_set_errno(OBI_ALIGN_ERROR);
			obidebug(1, "\nError: no second ID column given");
			return -1;
		}
	}
	else
		i_id2_column = obi_view_get_column(seq2_view, id2_column_name);
	if (i_id2_column == NULL)
	{
		obidebug(1, "\nError getting the second ID column");
		return -1;
	}

892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908
	// Open the input count columns
	if (print_count)
	{
		i_count1_column = obi_view_get_column(seq1_view, COUNT_COLUMN);
		if (i_count1_column == NULL)
		{
			obidebug(1, "\nError getting the first input COUNT column");
			return -1;
		}
		i_count2_column = obi_view_get_column(seq2_view, COUNT_COLUMN);
		if (i_count2_column == NULL)
		{
			obidebug(1, "\nError getting the second input COUNT column");
			return -1;
		}
	}

909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935
	// Create the output view
	output_view = obi_new_view(dms, output_view_name, NULL, NULL, output_view_comments);
	if (output_view == NULL)
	{
		obidebug(1, "\nError creating the output view when aligning");
		return -1;
	}

	// Create the output columns
	if (create_alignment_output_columns(output_view,
			(i_id1_column->header)->indexer_name, (i_id2_column->header)->indexer_name,
			(i_seq1_column->header)->indexer_name, (i_seq2_column->header)->indexer_name,
			print_seq, print_count, normalize, reference, similarity_mode) < 0)
		return -1;
	id1_column = obi_view_get_column(output_view, ID1_COLUMN_NAME);
	id2_column = obi_view_get_column(output_view, ID2_COLUMN_NAME);
	idx1_column = obi_view_get_column(output_view, IDX1_COLUMN_NAME);
	idx2_column = obi_view_get_column(output_view, IDX2_COLUMN_NAME);
    lcs_length_column = obi_view_get_column(output_view, LCS_LENGTH_COLUMN_NAME);
	if ((reference == ALILEN) && (normalize || !similarity_mode))
		ali_length_column = obi_view_get_column(output_view, ALI_LENGTH_COLUMN_NAME);
	score_column = obi_view_get_column(output_view, SCORE_COLUMN_NAME);
	if (print_seq)
	{
		seq1_column = obi_view_get_column(output_view, SEQ1_COLUMN_NAME);
		seq2_column = obi_view_get_column(output_view, SEQ2_COLUMN_NAME);
	}
936 937 938 939 940
	if (print_count)
	{
		count1_column = obi_view_get_column(output_view, COUNT1_COLUMN_NAME);
		count2_column = obi_view_get_column(output_view, COUNT2_COLUMN_NAME);
	}
941

942 943 944 945 946 947 948 949 950 951 952 953 954 955 956
	// Check if the sequence columns share the same indexer (allows for quick checking of sequence equality)
	if (strcmp((i_seq1_column->header)->indexer_name, (i_seq2_column->header)->indexer_name) == 0)
		same_indexer = true;
	else
		same_indexer = false;

	// Build kmer tables
	ktable = hash_two_seq_columns(seq1_view, i_seq1_column, seq1_elt_idx, seq2_view, i_seq2_column, seq2_elt_idx);
	if (ktable == NULL)
	{
		obi_set_errno(OBI_ALIGN_ERROR);
		obidebug(1, "\nError building kmer tables before aligning");
		return -1;
	}

957 958 959 960
	// TODO check this
	if (!similarity_mode && normalize && (threshold > 0))
		threshold = 1.0 - threshold;

961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001
	seq1_count = (seq1_view->infos)->line_count;
	seq2_count = (seq2_view->infos)->line_count;

	for (i=0; i < seq1_count; i++)
	{
		if (i%100 == 0)
			fprintf(stderr,"\rDone : %f %%       ", (i / (float) seq1_count)*100);

		// Get id index of first sequence
		id1_idx = obi_get_index_with_elt_idx_and_col_p_in_view(seq1_view, i_id1_column, i, 0); // TODO Could there be multiple IDs per line?
		// Get first sequence and its index
		seq1_idx = obi_get_index_with_elt_idx_and_col_p_in_view(seq1_view, i_seq1_column, i, seq1_elt_idx);
		blob1 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq1_view, i_seq1_column, i, seq1_elt_idx);
		if (blob1 == NULL)
		{
			obidebug(1, "\nError retrieving sequences to align");
			return -1;
		}

		for (j=0; j < seq2_count; j++)
		{
			// Get second sequence and its index
			seq2_idx = obi_get_index_with_elt_idx_and_col_p_in_view(seq2_view, i_seq2_column, j, seq2_elt_idx);
			blob2 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq2_view, i_seq2_column, j, seq2_elt_idx);
			if (blob2 == NULL)
			{
				obidebug(1, "\nError retrieving sequences to align");
				return -1;
			}

			// Check if the sequences are identical in a quick way (same index in the same indexer)
			if (same_indexer && (seq1_idx == seq2_idx))
			{
				if (similarity_mode && normalize)
					score = 1.0;
				else if (!similarity_mode)
					score = 0.0;
				else
					score = blob1->length_decoded_value;
			}

1002
			else // the sequences aren't identical or we don't know
1003 1004
			{
				// kmer filter (offset for the index of the kmer table of the 2nd sequence because the kmer tables of the 2 sequence columns are concatenated in one)
1005
				align_filters(ktable, blob1, blob2, i, seq1_count+j, threshold, normalize, reference, similarity_mode, &score, &lcs_min, !same_indexer);
1006 1007

				// Compute alignment score
1008
				if ((score < 0) && ((threshold == 0) || (score == -1.0)))	// (sequences are not identical), and (no threshold, or filter passed): align
1009
					score = obiblob_sse_banded_lcs_align(blob1, blob2, threshold, normalize, reference, similarity_mode, &lcs_length, &ali_length);
1010 1011 1012 1013 1014

				// TODO check this
//				if (print && !lcsmode && normalize)
//					score = 1.0 - score;

1015 1016 1017 1018 1019 1020 1021 1022
			}

			if ((score >= 0) && (((normalize || similarity_mode) && (score >= threshold)) || ((!similarity_mode && !normalize) && (score <= threshold))))
			{	// Print result

				// Get second id idx
				id2_idx = obi_get_index_with_elt_idx_and_col_p_in_view(seq2_view, i_id2_column, j, 0);

1023 1024 1025 1026 1027 1028 1029
				// Get counts  // TODO use array for efficiency?
				if (print_count)
				{
					count1 = obi_get_int_with_elt_idx_and_col_p_in_view(seq1_view, i_count1_column, i, 0);
					count2 = obi_get_int_with_elt_idx_and_col_p_in_view(seq2_view, i_count2_column, j, 0);
				}

1030 1031 1032 1033
				if (print_alignment_result(output_view, k,
										   idx1_column, idx2_column, i, j,
										   id1_column, id2_column, id1_idx, id2_idx,
						                   print_seq, seq1_column, seq2_column, seq1_idx, seq2_idx,
1034
										   print_count, count1_column, count2_column, count1, count2,
1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048
										   ali_length_column, ali_length,
										   lcs_length_column, lcs_length,
										   score_column, score,
										   reference, normalize, similarity_mode) < 0)
					return -1;

				k++;
			}
		}
	}

	// Close views
	if (seq2_view != seq1_view)
	{
1049
		if (obi_save_and_close_view(seq2_view) < 0)
1050 1051 1052 1053 1054
		{
			obidebug(1, "\nError closing the second input view after aligning");
			return -1;
		}
	}
1055
	if (obi_save_and_close_view(seq1_view) < 0)
1056 1057 1058 1059 1060
	{
		obidebug(1, "\nError closing the first input view after aligning");
		return -1;
	}

1061
	if (obi_save_and_close_view(output_view) < 0)
1062 1063 1064 1065 1066
	{
		obidebug(1, "\nError closing the output view after aligning");
		return -1;
	}

1067 1068 1069 1070 1071 1072
	if (obi_close_dms(dms, false) < 0)
	{
		obidebug(1, "\nError closing the DMS after aligning");
		return -1;
	}

1073 1074 1075 1076
	free_kmer_tables(ktable, seq1_count + seq2_count);

	return 0;
}
1077