/* kmer_similarity.h */
/****************************************************************************
 * Header file for Kmer similarity computation functions                    *
 ****************************************************************************/

/**
 * @file kmer_similarity.h
 * @author Celine Mercier (celine.mercier@metabarcoding.org)
 * @date January 7th 2019
 * @brief Header file for Kmer similarity computation functions.
 */


#ifndef KMER_SIMILARITY_H_
#define KMER_SIMILARITY_H_

#include <stdio.h>

#include "obitypes.h"
#include "obidmscolumn.h"
#include "obiview.h"


/* NOTE(review): 256 == 4^4, presumably the number of distinct kmers for the maximum
 * documented kmer_size of 4 over a 4-letter alphabet -- confirm against the implementation. */
#define ARRAY_LENGTH (256)


/**
 * @brief Alignment structure, holding information about the similarity and what is needed to rebuild the alignment.
 */
typedef struct Obi_ali {
	double   score;             /**< Alignment score. */
	int      consensus_length;  /**< Length of the final consensus sequence. */
	int      overlap_length;    /**< Length of the overlap between the aligned sequences. */
	char*    consensus_seq;     /**< Consensus sequence built to reconstruct a paired-end read. */
	uint8_t* consensus_qual;    /**< Consensus quality built to reconstruct a paired-end read. */
	int      shift;             /**< Shift chosen to align the sequences (for shifted alignment). */
	char     direction[6];      /**< Alignment direction (positive/right or negative/left shift) (for shifted alignment). */
	                            // TODO but sequences switched around depending on size..... discuss
} Obi_ali_t, *Obi_ali_p;


/**
 * @brief Frees an Obi_ali_t structure and all its elements.
 *
 * @warning The pointer sent becomes unusable after the call.
 *
 * @param ali The Obi_ali_p pointer on the structure to free.
 *            NOTE(review): whether NULL is accepted cannot be told from this header -- confirm in the implementation.
 *
 * @since January 2019
 * @author Celine Mercier (celine.mercier@metabarcoding.org)
 */
void obi_free_shifted_ali(Obi_ali_p ali);


/**
 * @brief Function computing the kmer similarity of two sequences stored in views.
 *
 * The similarity is computed this way: the positions of identical kmers in both sequences are
 * compared, and the most represented shift is chosen. The similarity is then calculated as:
 * kmer_similarity = number_of_common_kmers_with_the_chosen_shift + kmer_size - 1
 *
 * @warning Several pointers and structures passed to or returned by the function have to be freed by the caller:
 * 			  - kmer_pos_array
 * 			  - shift_array
 * 			  - shift_count_array
 * 			  - the Obi_ali_p structure returned
 *
 * NOTE(review): kmer_pos_array, shift_array and shift_count_array are passed by value, so a
 * (re)allocation done inside the function is not visible to the caller through these parameters.
 * Confirm how the caller is expected to retrieve the (re)allocated arrays before relying on the
 * "If NULL, allocated by the function" behaviour.
 *
 * @param view1 A pointer on the view containing the first sequence.
 * @param column1 A pointer on the column containing the first sequence.
 * @param idx1 The index of the first sequence in view1.
 * @param elt_idx1 The element index of the first sequence in column1.
 * @param view2 A pointer on the view containing the second sequence.
 * @param column2 A pointer on the column containing the second sequence.
 * @param idx2 The index of the second sequence in view2.
 * @param elt_idx2 The element index of the second sequence in column2.
 * @param qual_col1 A pointer on a column, presumably the one containing the quality of the first sequence
 *        (previously undocumented -- TODO confirm, including whether NULL is allowed).
 * @param qual_col2 A pointer on a column, presumably the one containing the quality of the second sequence
 *        (previously undocumented -- TODO confirm, including whether NULL is allowed).
 * @param kmer_size The kmer length to use. Must be between 1 and 4 (inclusive).
 * @param kmer_pos_array The array used to store kmer positions. If NULL, allocated by the function.
 *        If needed, reallocated to a bigger size.
 * @param kmer_pos_array_height_p A pointer on an integer corresponding to the size (number of elements)
 *        allocated for kmer_pos_array. Updated by the function as needed.
 * @param shift_array The array used to store kmer shifts. If NULL, allocated by the function.
 *        If needed, reallocated to a bigger size.
 * @param shift_array_height_p A pointer on an integer corresponding to the size (number of elements)
 *        allocated for shift_array. Updated by the function as needed.
 * @param shift_count_array The array used to store shift counts. If NULL, allocated by the function.
 *        If needed, reallocated to a bigger size.
 * @param shift_count_array_height_p A pointer on an integer corresponding to the size (number of elements)
 *        allocated for shift_count_array. Updated by the function as needed.
 * @param build_consensus A boolean indicating whether the function should build the consensus sequence
 *        and quality to reconstruct a paired-end read. // TODO option to build consensus without quality?
 *
 * @returns A pointer on an Obi_ali_p structure containing the results.
 * @retval NULL if an error occurred.
 *
 * @since January 2019
 * @author Celine Mercier (celine.mercier@metabarcoding.org)
 */
Obi_ali_p kmer_similarity(Obiview_p view1,
						  OBIDMS_column_p column1,
						  index_t idx1,
						  index_t elt_idx1,
						  Obiview_p view2,
						  OBIDMS_column_p column2,
						  index_t idx2,
						  index_t elt_idx2,
						  OBIDMS_column_p qual_col1,
						  OBIDMS_column_p qual_col2,
						  uint8_t kmer_size,
						  int32_t* kmer_pos_array,
						  int32_t* kmer_pos_array_height_p,
						  int32_t* shift_array,
						  int32_t* shift_array_height_p,
						  int32_t* shift_count_array,
						  int32_t* shift_count_array_height_p,
						  bool build_consensus);


#endif /* KMER_SIMILARITY_H_ */