/****************************************************************************
 * Encoding header file                                                     *
 ****************************************************************************/

/**
 * @file encode.h
 * @author Celine Mercier
 * @date November 18th 2015
 * @brief Header file for encoding DNA sequences.
 */


#ifndef ENCODE_H_
#define ENCODE_H_


#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#include "obitypes.h"


/** Mask selecting one 2-bit nucleotide code (binary: 11); to use when decoding 2 bits sequences. */
#define NUC_MASK_2B 0x3

/** Mask selecting one 4-bit nucleotide code (binary: 1111); to use when decoding 4 bits sequences. */
#define NUC_MASK_4B 0xF


/**
 * @brief enum for the 2-bits codes for each of the 4 nucleotides
 *        (A, C, G and T).
 */
enum
{
    NUC_A_2b = 0x0,   /* binary: 00 */
    NUC_C_2b = 0x1,   /* binary: 01 */
    NUC_G_2b = 0x2,   /* binary: 10 */
    NUC_T_2b = 0x3,   /* binary: 11 */
};


/**
 * @brief enum for the 4-bits codes for each of the 15 IUPAC nucleotides.
 *
 * The codes are consecutive values from 0x1 to 0xF; the value 0x0 is not
 * assigned to any nucleotide in this enum.
 */
enum
{
    NUC_A_4b = 0x1,   /* binary: 0001 */
    NUC_C_4b = 0x2,   /* binary: 0010 */
    NUC_G_4b = 0x3,   /* binary: 0011 */
    NUC_T_4b = 0x4,   /* binary: 0100 */
    NUC_R_4b = 0x5,   /* binary: 0101 */
    NUC_Y_4b = 0x6,   /* binary: 0110 */
    NUC_S_4b = 0x7,   /* binary: 0111 */
    NUC_W_4b = 0x8,   /* binary: 1000 */
    NUC_K_4b = 0x9,   /* binary: 1001 */
    NUC_M_4b = 0xA,   /* binary: 1010 */
    NUC_B_4b = 0xB,   /* binary: 1011 */
    NUC_D_4b = 0xC,   /* binary: 1100 */
    NUC_H_4b = 0xD,   /* binary: 1101 */
    NUC_V_4b = 0xE,   /* binary: 1110 */
    NUC_N_4b = 0xF,   /* binary: 1111 */
};


/**
 * @brief Checks if there are only 'atgcATGC' characters in a
 *        character string.
 *
 * @param seq The sequence to check.
 *
 * @returns A boolean value indicating if there are only
 *          'atgcATGC' characters in the character string.
 *
 * @since November 2015
 * @author Celine Mercier (celine.mercier@metabarcoding.org)
 */
bool only_ATGC(const char* seq);


/**
 * @brief Checks if there are only IUPAC DNA characters in a
 *        character string.
 *
 * @param seq The sequence to check.
 *
 * @returns A boolean value indicating if there are only
 *          IUPAC DNA characters in the character string.
 *
 * @since May 2017
 * @author Celine Mercier (celine.mercier@metabarcoding.org)
 */
bool only_IUPAC_DNA(const char* seq);


/**
 * @brief Tells whether a character string is readable as a DNA sequence,
 *        i.e. written with ACGT or IUPAC characters, in upper or lower case.
 *
 * @param seq The character string to examine.
 *
 * @returns true if the string can be read as a DNA sequence,
 *          false otherwise.
 *
 * @since May 2017
 * @author Celine Mercier (celine.mercier@metabarcoding.org)
 */
bool is_a_DNA_seq(const char *seq);


111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128
/**
 * @brief Returns a nucleotide from a DNA sequence encoded
 *        with each nucleotide on 2 or 4 bits.
 *
 * @param seq The encoded sequence.
 * @param idx The index (in the decoded sequence) of the nucleotide to get.
 * @param encoding An integer indicating whether the sequence is encoded with each nucleotide on 2 or 4 bits.
 *
 * @returns The (still encoded) nucleotide at the given index.
 * @retval 255 if an error occurred.
 *
 * @see decode_seq_on_2_bits() and decode_seq_on_4_bits()
 * @since January 2019
 * @author Celine Mercier (celine.mercier@metabarcoding.org)
 */
byte_t get_nucleotide_from_encoded_seq(byte_t* seq, int32_t idx, uint8_t encoding);


129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146
/**
 * @brief Encodes a DNA sequence with each nucleotide coded on 2 bits.
 *
 *    A or a : 00
 *    C or c : 01
 *    T or t : 10
 *    G or g : 11
 *
 * @warning The DNA sequence must contain only 'atgcATGC' characters.
 *
 * @param seq The sequence to encode.
 * @param length The length of the sequence to encode.
 *
 * @returns The encoded sequence.
 *
 * @since November 2015
 * @author Celine Mercier (celine.mercier@metabarcoding.org)
 */
Celine Mercier committed
147
byte_t* encode_seq_on_2_bits(const char* seq, int32_t length);
148 149 150 151 152


/**
 * @brief Decodes a DNA sequence that is coded with each nucleotide on 2 bits.
 *
 *    00 -> a
 *    01 -> c
 *    10 -> g
 *    11 -> t
 *
 * @note The table follows the NUC_A_2b, NUC_C_2b, NUC_G_2b and NUC_T_2b
 *       constants declared in this header. An earlier version of this
 *       comment listed 10 as t and 11 as g, which contradicted those
 *       constants. NOTE(review): confirm against the implementation.
 *
 * @param seq_b The sequence to decode.
 * @param length_seq The initial length of the sequence before it was encoded.
 *
 * @returns The decoded sequence ended with '\0'.
 *
 * @since November 2015
 * @author Celine Mercier (celine.mercier@metabarcoding.org)
 */
char* decode_seq_on_2_bits(byte_t* seq_b, int32_t length_seq);


169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
/**
 * @brief Encodes a DNA sequence with each nucleotide coded on 4 bits.
 *
 *		A or a : 0001
 *      C or c : 0010
 *      G or g : 0011
 *      T or t : 0100
 *      R or r : 0101
 *      Y or y : 0110
 *      S or s : 0111
 *      W or w : 1000
 *      K or k : 1001
 *      M or m : 1010
 *      B or b : 1011
 *      D or d : 1100
 *      H or h : 1101
 *      V or v : 1110
 *      N or n : 1111
 *
 * @warning The DNA sequence must contain only IUPAC characters.
 *
 * @param seq The sequence to encode.
 * @param length The length of the sequence to encode.
 *
 * @returns The encoded sequence.
 *
 * @since November 2015
 * @author Celine Mercier (celine.mercier@metabarcoding.org)
 */
Celine Mercier committed
198
byte_t* encode_seq_on_4_bits(const char* seq, int32_t length);
199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219


/**
 * @brief Decodes a DNA sequence that is coded with each nucleotide on 4 bits.
 *
 *      A or a : 0001
 *      C or c : 0010
 *      G or g : 0011
 *      T or t : 0100
 *      R or r : 0101
 *      Y or y : 0110
 *      S or s : 0111
 *      W or w : 1000
 *      K or k : 1001
 *      M or m : 1010
 *      B or b : 1011
 *      D or d : 1100
 *      H or h : 1101
 *      V or v : 1110
 *      N or n : 1111
 *
 * @param seq_b The sequence to decode.
 * @param length_seq The initial length of the sequence before it was encoded.
 *
 * @returns The decoded sequence ended with '\0'.
 *
 * @since November 2015
 * @author Celine Mercier (celine.mercier@metabarcoding.org)
 */
char* decode_seq_on_4_bits(byte_t* seq_b, int32_t length_seq);


////////// FOR DEBUGGING ///////////

/**
 * @brief Debugging helper; presumably prints the bits of the memory
 *        pointed to by ptr (original annotation: "little endian").
 *
 * NOTE(review): the unit of length (bytes or bits) and the output
 * destination are not visible from this header; confirm against the
 * implementation.
 *
 * @param ptr A pointer on the memory to print.
 * @param length The length of the memory area to print.
 */
void print_bits(void* ptr, int32_t length);

#endif /* ENCODE_H_ */