/****************************************************************************
 * LCS alignment of two sequences header file                               *
 ****************************************************************************/

/**
 * @file sse_banded_LCS_alignment.h
 * @author Celine Mercier (celine.mercier@metabarcoding.org)
 * @date November 7th 2012
 * @brief Header file for the functions handling the alignment of two sequences to compute their Longest Common Subsequence.
 */


#ifndef SSE_BANDED_LCS_ALIGNMENT_H_
#define SSE_BANDED_LCS_ALIGNMENT_H_


#include <stdint.h>
#include <stdbool.h>

/* Project-local headers. */
#include "obiblob.h"


/**
 * @brief Macros for reference lengths to use when aligning.
 *
 * These are the values taken by the `reference` parameter of the functions
 * declared in this header:
 *  - ALILEN (0): the alignment length.
 *  - MAXLEN (1): the length of the longest sequence.
 *  - MINLEN (2): the length of the shortest sequence.
 *
 * @since 2012
 * @author Eric Coissac (eric.coissac@metabarcoding.org)
 */
#define ALILEN (0)
#define MAXLEN (1)
#define MINLEN (2)

/**
 * @brief Function calculating the minimum length that the Longest Common Subsequence between two sequences
 *        must have for their score to be above a chosen threshold.
 *
 * @warning The first argument (lmax) must correspond to the length of the longest sequence.
 *
 * @param lmax The length of the longest sequence to align.
 * @param lmin The length of the shortest sequence to align.
 * @param threshold Score threshold. Its meaning depends on `normalize` and `similarity_mode`:
 *                   - normalized, similarity: an identity, e.g. 0.95 for an identity of 95%.
 *                   - normalized, distance: (1.0 - identity), e.g. 0.05 for an identity of 95%.
 *                   - not normalized, similarity: the length of the Longest Common Subsequence.
 *                   - not normalized, distance: (reference length - LCS length).
 *                  Only sequence pairs with a similarity above the threshold are printed.
 * @param normalize Whether the score should be normalized with the reference sequence length.
 * @param reference The reference length. 0 (ALILEN): the alignment length; 1 (MAXLEN): the longest sequence's length;
 *                  2 (MINLEN): the shortest sequence's length. // TODO
 * @param similarity_mode Whether the score should be expressed in similarity (true) or distance (false).
 *
 * @returns The minimum length of the Longest Common Subsequence between two sequences to be above the chosen score threshold.
 *
 * @since 2012
 * @author Celine Mercier (celine.mercier@metabarcoding.org)
 */
int calculateLCSmin(int lmax, int lmin, double threshold, bool normalize, int reference, bool similarity_mode);


/**
 * @brief Function aligning two sequences.
 *
 * The alignment algorithm is a banded global alignment algorithm, a modified version of the classical Needleman and Wunsch algorithm,
 * and uses indices based on the length of the Longest Common Subsequence between the two sequences.
 *
 * Note: the sequences do not need to be ordered (e.g. with the longest sequence as first argument).
 *
 * @param seq1 A pointer to the character string corresponding to the first sequence.
 * @param seq2 A pointer to the character string corresponding to the second sequence.
 * @param threshold Score threshold. Its meaning depends on `normalize` and `similarity_mode`:
 *                   - normalized, similarity: an identity, e.g. 0.95 for an identity of 95%.
 *                   - normalized, distance: (1.0 - identity), e.g. 0.05 for an identity of 95%.
 *                   - not normalized, similarity: the length of the Longest Common Subsequence.
 *                   - not normalized, distance: (reference length - LCS length).
 *                  Only sequence pairs with a similarity above the threshold are printed.
 * @param normalize Whether the score should be normalized with the reference sequence length.
 * @param reference The reference length. 0 (ALILEN): the alignment length; 1 (MAXLEN): the longest sequence's length;
 *                  2 (MINLEN): the shortest sequence's length. // TODO
 * @param similarity_mode Whether the score should be expressed in similarity (true) or distance (false).
 * @param lcs_length A pointer to the int where the LCS length will be stored.
 * @param ali_length A pointer to the int where the alignment length will be stored.
 *
 * @returns The alignment score (normalized according to the parameters).
 *
 * @since 2012
 * @author Celine Mercier (celine.mercier@metabarcoding.org)
 */
double generic_sse_banded_lcs_align(char* seq1, char* seq2, double threshold, bool normalize, int reference, bool similarity_mode, int* lcs_length, int* ali_length);


/**
 * @brief Function aligning two sequences encoded in obiblobs.
 *
 * The alignment algorithm is a banded global alignment algorithm, a modified version of the classical Needleman and Wunsch algorithm,
 * and uses indices based on the length of the Longest Common Subsequence between the two sequences.
 *
 * Note: the obiblobs do not need to be ordered (e.g. with the obiblob containing the longest sequence as first argument).
 *
 * @param seq1 A pointer on the blob containing the first sequence.
 * @param seq2 A pointer on the blob containing the second sequence.
 * @param threshold Score threshold. If the score is normalized and expressed in similarity, it is an identity, e.g. 0.95
 * 					for an identity of 95%. If the score is normalized and expressed in distance, it is (1.0 - identity),
 * 					e.g. 0.05 for an identity of 95%. If the score is not normalized and expressed in similarity, it is
 * 					the length of the Longest Common Subsequence. If the score is not normalized and expressed in distance,
 *                  it is (reference length - LCS length). Only sequence pairs with a similarity above the threshold are printed.
 * @param normalize Whether the score should be normalized with the reference sequence length.
 * @param reference The reference length. 0: The alignment length; 1: The longest sequence's length; 2: The shortest sequence's length.	// TODO
 * @param similarity_mode Whether the score should be expressed in similarity (true) or distance (false).
 * @param lcs_length A pointer on the int where the LCS length will be stored.
 * @param ali_length A pointer on the int where the alignment length will be stored.
 *
 * @returns The alignment score (normalized according to the parameters).
 *
 * @since December 2016
 * @author Celine Mercier (celine.mercier@metabarcoding.org)
 */
113
double obiblob_sse_banded_lcs_align(Obi_blob_p seq1, Obi_blob_p seq2, double threshold, bool normalize, int reference, bool similarity_mode, int* lcs_length, int* ali_length);
114

115

116
#endif