kmer_similarity.h 5.45 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
/****************************************************************************
 * Header file for Kmer similarity computation functions                    *
 ****************************************************************************/

/**
 * @file kmer_similarity.h
 * @author Celine Mercier (celine.mercier@metabarcoding.org)
 * @date January 7th 2019
 * @brief Header file for Kmer similarity computation functions.
 */


#ifndef KMER_SIMILARITY_H_
#define KMER_SIMILARITY_H_

#include <stdio.h>

#include "obitypes.h"
#include "obidmscolumn.h"
#include "obiview.h"


#define ARRAY_LENGTH (256)


/**
 * @brief Alignment structure, with informations about the similarity and to rebuild the alignment.
 */
typedef struct Obi_ali {
30
	int      score;	    		/**< Alignment score, corresponding to the number of matches (identical nucleotides aligned).
31 32 33 34 35
	 	 	 	 	 	 	 	 */
	int      consensus_length; 	/**< Length of the final consensus sequence.
	 	 	 	 	 	 	 	 */
	int      overlap_length;	/**< Length of the overlap between the aligned sequences.
	 	 	 	 	 	 	 	 */
36
	char*    consensus_seq; 	/**< Consensus sequence built as to reconstruct a pairedend read.
37
	 	 	 	 	 	 	 	 */
38
	uint8_t* consensus_qual;	/**< Consensus quality built as to reconstruct a pairedend read.
39
	 	 	 	        	 	 */
40
	int		 shift;   		    /**< Shift chosen to align the sequences.
41
	 	 	 	 	 	 	 	 */
42 43
	char     direction[6]; 	    /**< Alignment direction (positive/right or negative/left shift).
								 */
44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
} Obi_ali_t, *Obi_ali_p;


/**
 * @brief Frees an Obi_ali_p structure and all its elements.
 *
 * @warning The pointer sent becomes unusable.
 *
 * @param ali The pointer on the Obi_ali_p structure to free.
 *
 * @since January 2019
 * @author Celine Mercier (celine.mercier@metabarcoding.org)
 */
void obi_free_shifted_ali(Obi_ali_p ali);


/**
 * Function computing the kmer similarity of two sequences stored in views.
 *
 * The similarity is computing this way: the positions of identical kmers in both sequences are
 * compared, and the most represented shift is chosen. The similarity is then calculated as:
 * kmer_similarity = number_of_common_kmers_with_the_chosen_shift + kmer_size - 1
 *
 * @warning Several pointers and structures passed to or returned by the function have to be freed by the caller:
 * 			  - kmer_pos_array
 * 			  - shift_array
 * 			  - shift_count_array
 * 			  - the Obi_ali_p structure returned
 *
 * @param view1 A pointer on the view containing the first sequence.
 * @param column1 A pointer on the column containing the first sequence.
 * @param idx1 The index of the first sequence in view1.
 * @param elt_idx1 The element index of the first sequence in column1.
 * @param view2 A pointer on the view containing the second sequence.
 * @param column2 A pointer on the column containing the second sequence.
 * @param idx2 The index of the second sequence in view2.
 * @param elt_idx2 The element index of the second sequence in column2.
81 82 83 84 85
 * @param qual_col1 A pointer on the column containing the quality associated with the first sequence (in view1).
 * @param qual_col2 A pointer on the column containing the quality associated with the second sequence (in view2).
 * @param reversed_column A pointer on the column containing a boolean indicating whether the sequences correspond
 * 						  to paired-end reads that might have been reversed before aligning them
 * 						  (e.g. when the ngsfilter tool is used before alignpairedend).
86 87 88 89 90 91 92 93 94 95 96 97 98
 * @param kmer_size The kmer length to use. Must be >= 1 <= 4.
 * @param kmer_pos_array The array used to store kmer positions. If NULL, allocated by the function.
 *        If needed, reallocated to a bigger size.
 * @param kmer_pos_array_height_p A pointer on an integer corresponding to the size (number of elements)
 *        allocated for kmer_pos_array. Updated by the function as needed.
 * @param shift_array The array used to store kmer shifts. If NULL, allocated by the function.
 *        If needed, reallocated to a bigger size.
 * @param shift_array_height_p A pointer on an integer corresponding to the size (number of elements)
 *        allocated for shift_array. Updated by the function as needed.
 * @param shift_count_array The array used to store shift counts. If NULL, allocated by the function.
 *        If needed, reallocated to a bigger size.
 * @param shift_count_array_height_p A pointer on an integer corresponding to the size (number of elements)
 *        allocated for shift_count_array. Updated by the function as needed.
99
 * @param build_consensus A boolean indicating whether the function should build the consensus sequence and quality as to reconstruct a pairedend read. // TODO option to build consensus without quality?
100 101 102 103 104 105 106 107 108 109 110 111 112 113 114
 *
 * @returns A pointer on an Obi_ali_p structure containing the results.
 * @retval NULL if an error occurred.
 *
 * @since January 2019
 * @author Celine Mercier (celine.mercier@metabarcoding.org)
 */
Obi_ali_p kmer_similarity(Obiview_p view1,
						  OBIDMS_column_p column1,
						  index_t idx1,
						  index_t elt_idx1,
						  Obiview_p view2,
						  OBIDMS_column_p column2,
						  index_t idx2,
						  index_t elt_idx2,
115 116
						  OBIDMS_column_p qual_col1,
						  OBIDMS_column_p qual_col2,
117
						  OBIDMS_column_p reversed_column,
118
						  uint8_t kmer_size,
119
						  int32_t** kmer_pos_array_p,
120
						  int32_t* kmer_pos_array_height_p,
121
						  int32_t** shift_array_p,
122
						  int32_t* shift_array_height_p,
123
						  int32_t** shift_count_array_p,
124 125 126 127 128
						  int32_t* shift_count_array_height_p,
						  bool build_consensus);


#endif /* KMER_SIMILARITY_H_ */