Commit 6ab1c833 by Celine Mercier

New column type for DNA sequences. Only for those coded on 2 bits (only

'ATGCatgc') for now.
parent e3712485
......@@ -14,3 +14,5 @@
../../../src/private_at_functions.c
../../../src/obiarray.h
../../../src/obiarray.c
../../../src/encode.h
../../../src/encode.c
\ No newline at end of file
......@@ -47,6 +47,11 @@ from ._obidmscolumn_str cimport OBIDMS_column_str, \
OBIDMS_column_str_multi_elts, \
OBIDMS_column_str_multi_elts_writable
from ._obidmscolumn_seq cimport OBIDMS_column_seq, \
OBIDMS_column_seq_writable, \
OBIDMS_column_seq_multi_elts, \
OBIDMS_column_seq_multi_elts_writable
cdef class OBIDMS :
......@@ -215,6 +220,17 @@ cdef class OBIDMS :
subclass = OBIDMS_column_str
else :
subclass = OBIDMS_column_str_multi_elts
elif data_type == 6 :
if (create or clone) :
if nb_elements_per_line == 1 :
subclass = OBIDMS_column_seq_writable
else :
subclass = OBIDMS_column_seq_multi_elts_writable
else :
if nb_elements_per_line == 1 :
subclass = OBIDMS_column_seq
else :
subclass = OBIDMS_column_seq_multi_elts
else :
raise Exception("Problem with the data type")
......@@ -238,7 +254,7 @@ cdef class OBIDMS_column :
bint create,
bint clone, bint clone_data,
obiversion_t version_number,
OBIType_t type,
OBIType_t type, # There's a problem with this with the OBI_IDX columns as there are 2 subtypes
index_t nb_lines,
index_t nb_elements_per_line,
list elements_names,
......
../../../src/obidmscolumn_seq.c
../../../src/obidmscolumn_seq.h
../../../src/obidmscolumn.h
../../../src/obidmscolumn.c
../../../src/obidmscolumndir.h
../../../src/obidmscolumndir.c
../../../src/obidms.h
../../../src/obidms.c
../../../src/obierrno.h
../../../src/obierrno.c
../../../src/obilittlebigman.h
../../../src/obilittlebigman.c
../../../src/obitypes.h
../../../src/obitypes.c
../../../src/private_at_functions.h
../../../src/private_at_functions.c
../../../src/obiarray.h
../../../src/obiarray.c
#cython: language_level=3
from .capi.obitypes cimport index_t
from ._obidms cimport OBIDMS_column
# Column of DNA sequences with a single element per line, in its read-only form.
cdef class OBIDMS_column_seq(OBIDMS_column):
# Returns the sequence stored at line line_nb, or None for an NA value.
cpdef object get_line(self, index_t line_nb)
# Always raises in this class: the column is read-only.
cpdef set_line(self, index_t line_nb, object value)
# Closes the column; raises if the underlying C call fails.
cpdef close(self)
# Writable variant of OBIDMS_column_seq (single element per line).
cdef class OBIDMS_column_seq_writable(OBIDMS_column_seq):
# Stores the sequence `value` at line line_nb; raises if the C setter fails.
cpdef set_line(self, index_t line_nb, object value)
# Truncates and closes the column; raises if the underlying C call fails.
cpdef close(self)
# Column of DNA sequences with several named elements per line, in its read-only form.
cdef class OBIDMS_column_seq_multi_elts(OBIDMS_column_seq):
# Returns the sequence of the element element_name at line line_nb, or None for an NA value.
cpdef object get_item(self, index_t line_nb, str element_name)
# Returns a dict mapping each element name to its sequence, or None when every element is NA.
cpdef object get_line(self, index_t line_nb)
# Always raises in this class: the column is read-only.
cpdef set_item(self, index_t line_nb, str element_name, str value)
# Always raises in this class: the column is read-only.
cpdef set_line(self, index_t line_nb, object values)
# Writable variant of OBIDMS_column_seq_multi_elts (several named elements per line).
cdef class OBIDMS_column_seq_multi_elts_writable(OBIDMS_column_seq_multi_elts):
# Stores the sequence `value` in the element element_name of line line_nb; raises if the C setter fails.
cpdef set_item(self, index_t line_nb, str element_name, str value)
# Stores every (element name, sequence) pair of `values` in line line_nb, one set_item() call per key.
cpdef set_line(self, index_t line_nb, object values)
# Truncates and closes the column; raises if the underlying C call fails.
cpdef close(self)
#cython: language_level=3
from .capi.obidmscolumn cimport obi_close_column,\
obi_truncate_and_close_column, \
obi_column_get_obiseq_with_elt_name, \
obi_column_get_obiseq_with_elt_idx, \
obi_column_set_obiseq_with_elt_name, \
obi_column_set_obiseq_with_elt_idx
from .capi.obierrno cimport obi_errno
from .capi.obitypes cimport OBIIdx_NA, const_char_p
from obitools3.utils cimport str2bytes, bytes2str
cdef class OBIDMS_column_seq(OBIDMS_column):
    # Read-only access to a column of DNA sequences holding one element per line.

    cpdef object get_line(self, index_t line_nb):
        # Fetch element 0 of the line: it is the only element of such a column.
        cdef bytes raw
        raw = <bytes> obi_column_get_obiseq_with_elt_idx(self.pointer, line_nb, 0)
        if obi_errno > 0 :
            raise IndexError(line_nb)
        # NOTE(review): the NA test compares the bytes value with OBIIdx_NA,
        # the NA constant of the index type -- confirm this is the intended
        # NA marker for sequences.
        if raw == OBIIdx_NA :
            return None
        return bytes2str(raw)

    cpdef set_line(self, index_t line_nb, object value):
        # Writing is only possible through the writable subclass.
        raise Exception("Column is read-only")

    cpdef close(self):
        cdef int status
        status = obi_close_column(self.pointer)
        if status < 0 :
            raise Exception("Problem closing a column")
cdef class OBIDMS_column_seq_writable(OBIDMS_column_seq):
    # Writable access to a column of DNA sequences holding one element per line.

    cpdef set_line(self, index_t line_nb, object value):
        # The sequence is written in element 0, the only element of the line.
        cdef int status
        status = obi_column_set_obiseq_with_elt_idx(self.pointer, line_nb, 0, str2bytes(value))
        if status < 0 :
            raise Exception("Problem setting a value in a column")

    cpdef close(self):
        # A writable column is truncated when it is closed.
        cdef int status
        status = obi_truncate_and_close_column(self.pointer)
        if status < 0 :
            raise Exception("Problem closing a column")
cdef class OBIDMS_column_seq_multi_elts(OBIDMS_column_seq):
    # Read-only access to a column of DNA sequences holding several named
    # elements per line.

    cpdef object get_item(self, index_t line_nb, str element_name):
        # Returns the sequence of one named element, or None for an NA value.
        cdef bytes raw
        raw = <bytes> obi_column_get_obiseq_with_elt_name(self.pointer, line_nb, str2bytes(element_name))
        if obi_errno > 0 :
            raise IndexError(line_nb, element_name)
        # NOTE(review): NA test against OBIIdx_NA (index NA constant) -- confirm.
        if raw == OBIIdx_NA :
            return None
        return bytes2str(raw)

    cpdef object get_line(self, index_t line_nb) :
        # Returns {element name: sequence} for the whole line, or None when
        # no element of the line holds a non-NA value.
        cdef bytes raw
        cdef index_t elt_idx
        cdef bint found_value
        cdef object line
        line = {}
        found_value = False
        for elt_idx in range(self.nb_elements_per_line) :
            raw = <bytes> obi_column_get_obiseq_with_elt_idx(self.pointer, line_nb, elt_idx)
            if obi_errno > 0 :
                raise IndexError(line_nb)
            line[self.elements_names[elt_idx]] = bytes2str(raw)
            if raw != OBIIdx_NA :
                found_value = True
        if not found_value :
            return None
        return line

    cpdef set_item(self, index_t line_nb, str element_name, str value):
        # Writing is only possible through the writable subclass.
        raise Exception("Column is read-only")

    cpdef set_line(self, index_t line_nb, object values):
        # Writing is only possible through the writable subclass.
        raise Exception("Column is read-only")
cdef class OBIDMS_column_seq_multi_elts_writable(OBIDMS_column_seq_multi_elts):
    # Writable access to a column of DNA sequences holding several named
    # elements per line.

    cpdef set_item(self, index_t line_nb, str element_name, str value):
        # Writes one sequence in the element called element_name.
        cdef int status
        status = obi_column_set_obiseq_with_elt_name(self.pointer, line_nb, str2bytes(element_name), str2bytes(value))
        if status < 0 :
            raise Exception("Problem setting a value in a column")

    cpdef set_line(self, index_t line_nb, object values):
        # Writes every element listed in `values`, one set_item() call per key.
        cdef str sequence
        for name in values :
            sequence = values[name]
            self.set_item(line_nb, name, sequence)

    cpdef close(self):
        # A writable column is truncated when it is closed.
        cdef int status
        status = obi_truncate_and_close_column(self.pointer)
        if status < 0 :
            raise Exception("Problem closing a column")
\ No newline at end of file
......@@ -163,10 +163,30 @@ cdef extern from "obidmscolumn_str.h" nogil:
char* value)
const_char_p obi_column_get_obistr_with_elt_name(OBIDMS_column_p column,
index_t line_nb,
const_char_p element_name)
index_t line_nb,
const_char_p element_name)
const_char_p obi_column_get_obistr_with_elt_idx(OBIDMS_column_p column,
index_t line_nb,
index_t element_idx)
index_t line_nb,
index_t element_idx)
# C functions reading and writing DNA sequences in a column, as declared in
# obidmscolumn_seq.h. An element of a line is designated either by its name
# or by its index.
cdef extern from "obidmscolumn_seq.h" nogil:
# Sets the sequence of the element called element_name at line line_nb.
int obi_column_set_obiseq_with_elt_name(OBIDMS_column_p column,
index_t line_nb,
const_char_p element_name,
char* value)
# Sets the sequence of the element of index element_idx at line line_nb.
int obi_column_set_obiseq_with_elt_idx(OBIDMS_column_p column,
index_t line_nb,
index_t element_idx,
char* value)
# Gets the sequence of the element called element_name at line line_nb.
const_char_p obi_column_get_obiseq_with_elt_name(OBIDMS_column_p column,
index_t line_nb,
const_char_p element_name)
# Gets the sequence of the element of index element_idx at line line_nb.
const_char_p obi_column_get_obiseq_with_elt_idx(OBIDMS_column_p column,
index_t line_nb,
index_t element_idx)
......@@ -10,9 +10,9 @@ from obitools3.obidms._obidms import OBIDMS
LINE_COUNT_FOR_TEST_COLUMN = 10000 # TODO randomize?
SMALLER_LINE_COUNT_FOR_TEST_COLUMN = 1000 # TODO randomize?
NB_ELEMENTS_PER_LINE = 20 # TODO randomize?
NB_ELEMENTS_PER_LINE = 10 # TODO randomize?
DMS_NAME = "unit_test_dms"
DATA_TYPES = ['OBI_INT', 'OBI_FLOAT', 'OBI_BOOL', 'OBI_CHAR', 'OBI_IDX']
DATA_TYPES = ['OBI_INT', 'OBI_FLOAT', 'OBI_BOOL', 'OBI_CHAR', 'OBI_STR', 'OBI_SEQ']
def create_test_obidms():
......@@ -58,12 +58,15 @@ def random_obivalue(data_type):
elif data_type == "OBI_BOOL" :
return randint(0,1)
elif data_type == "OBI_CHAR" :
nucs = 'atgc'
return nucs[randint(0,3)]
elif data_type == "OBI_IDX" :
length = randint(1,500)
return choice(string.ascii_lowercase)
elif data_type == "OBI_STR" :
length = randint(1,200)
randoms = ''.join(choice(string.ascii_lowercase) for i in range(length))
return randoms
elif data_type == "OBI_SEQ" :
length = randint(1,200)
randoms = ''.join(choice("atgc") for i in range(length))
return randoms
class OBIDMS_Column_TestCase(unittest.TestCase):
def tearDown(self):
......@@ -255,6 +258,30 @@ class OBIDMS_Column_OBI_STR_multiple_elements_TestCase(OBIDMS_Column_multiple_el
self.data_type_code,
multiple_elements_per_line=True)
class OBIDMS_Column_OBI_SEQ_TestCase(OBIDMS_Column_TestCase):
    # Runs the inherited column tests on a column of data type code 6
    # (DNA sequences) with a single element per line.

    def setUp(self):
        self.data_type_code = 6
        # A fresh DMS, then a fresh column of the tested type inside it.
        dms_info = create_test_obidms()
        self.dms, self.dms_name, self.dms_dir_name = dms_info
        column_info = create_test_column(self.dms, self.data_type_code)
        self.col, self.col_name, self.data_type_str = column_info
class OBIDMS_Column_OBI_SEQ_multiple_elements_TestCase(OBIDMS_Column_multiple_elements_TestCase):
    # Runs the inherited multiple-elements column tests on a column of data
    # type code 6 (DNA sequences).

    def setUp(self):
        self.data_type_code = 6
        # A fresh DMS, then a fresh multi-element column of the tested type.
        dms_info = create_test_obidms()
        self.dms, self.dms_name, self.dms_dir_name = dms_info
        column_info = create_test_column(self.dms,
                                         self.data_type_code,
                                         multiple_elements_per_line=True)
        self.col, self.col_name, self.elts_names, self.data_type_str = column_info
if __name__ == '__main__':
unittest.main(verbosity=2, defaultTest=["OBIDMS_Column_OBI_INT_TestCase",
......@@ -266,6 +293,8 @@ if __name__ == '__main__':
"OBIDMS_Column_OBI_CHAR_TestCase",
"OBIDMS_Column_OBI_CHAR_multiple_elements_TestCase",
"OBIDMS_Column_OBI_STR_TestCase",
"OBIDMS_Column_OBI_STR_multiple_elements_TestCase"])
"OBIDMS_Column_OBI_STR_multiple_elements_TestCase",
"OBIDMS_Column_OBI_SEQ_TestCase",
"OBIDMS_Column_OBI_SEQ_multiple_elements_TestCase"])
/****************************************************************************
* Encoding functions *
****************************************************************************/
/**
* @file encode.c
* @author Celine Mercier
* @date November 18th 2015
* @brief Functions encoding DNA sequences.
*/
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <math.h>
#include "encode.h"
#include "obiarray.h"
#include "obidebug.h"
#define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?)
// TODO: endianness problem?
/*
 * Tells whether a '\0'-terminated string contains nothing but the characters
 * 'A', 'T', 'G' and 'C', in upper or lower case.
 * Returns 1 if so (including for an empty string), 0 at the first other
 * character met.
 */
bool only_ATGC(char* seq)
{
    char* cursor;

    for (cursor = seq; *cursor != '\0'; cursor++)
    {
        switch (*cursor)
        {
            case 'A': case 'T': case 'G': case 'C':
            case 'a': case 't': case 'g': case 'c':
                break;          // allowed character, keep scanning
            default:
                return 0;
        }
    }

    return 1;
}
/*
 * Encodes a DNA sequence with each nucleotide coded on 2 bits.
 *
 * Nucleotides are packed 4 per byte; within a byte, the first nucleotide
 * occupies the 2 most significant bits and the fourth one the 2 least
 * significant bits. Unused bits of the last byte are left to 0.
 *
 * seq     The sequence to encode (only 'atgcATGC' characters).
 * length  The number of nucleotides to encode.
 *
 * Returns a newly allocated buffer of ceil(length / 4) bytes (at least 1 byte
 * is allocated so that an empty sequence still yields a valid pointer) that
 * the caller must free(), or NULL if the arguments are invalid, if the memory
 * could not be allocated or if an invalid nucleotide is met.
 */
byte_t* encode_seq_on_2_bits(char* seq, int32_t length) // TODO shift = 2
{
    byte_t* seq_b;
    uint8_t shift;
    uint8_t code;
    int32_t length_b;
    int32_t i;

    if ((seq == NULL) || (length < 0))
    {
        obidebug(1, "\nInvalid sequence or length when encoding");
        return NULL;
    }

    // Number of bytes needed: integer ceiling of length / 4
    length_b = length / 4 + ((length % 4) != 0);

    // calloc zero-fills the buffer, so bases can simply be OR-ed in
    seq_b = (byte_t*) calloc((length_b > 0) ? (size_t) length_b : 1, sizeof(byte_t));
    if (seq_b == NULL)
    {
        obidebug(1, "\nError allocating memory for an encoded sequence");
        return NULL;
    }

    for (i=0; i<length; i++)
    {
        switch (seq[i])
        {
            case 'a':
            case 'A':
                code = NUC_A;
                break;
            case 'c':
            case 'C':
                code = NUC_C;
                break;
            case 'g':
            case 'G':
                code = NUC_G;
                break;
            case 't':
            case 'T':
                code = NUC_T;
                break;
            default:
                obidebug(1, "\nInvalid nucleotide base when encoding (not [atgcATGC])");
                free(seq_b);    // do not leak the partially filled buffer
                return NULL;
        }

        // Position of the nucleotide in its byte: 6, 4, 2 then 0
        shift = 6 - 2*(i%4);
        seq_b[i/4] |= code << shift;
    }

    return seq_b;
}
/*
 * Decodes a DNA sequence that is coded with each nucleotide on 2 bits.
 *
 * The layout expected is the one of 4 nucleotides per byte, the first
 * nucleotide of a byte being read from its 2 most significant bits.
 *
 * seq_b       The encoded sequence.
 * length_seq  The number of nucleotides to decode.
 *
 * Returns a newly allocated, '\0'-terminated string of lower case
 * nucleotides ('a', 'c', 'g', 't') that the caller must free(), or NULL if
 * the arguments are invalid or if the memory could not be allocated.
 */
char* decode_seq_on_2_bits(byte_t* seq_b, int32_t length_seq)
{
    char*   seq;
    int32_t i;
    uint8_t shift;
    uint8_t mask;
    uint8_t nuc;

    if ((seq_b == NULL) || (length_seq < 0))
    {
        obidebug(1, "\nInvalid encoded sequence or length when decoding");
        return NULL;
    }

    seq = (char*) malloc(((size_t) length_seq + 1) * sizeof(char));
    if (seq == NULL)
    {
        obidebug(1, "\nError allocating memory for a decoded sequence");
        return NULL;
    }

    for (i=0; i<length_seq; i++)
    {
        // Isolate the 2 bits of the i-th nucleotide (shift: 6, 4, 2 then 0)
        shift = 6 - 2*(i % 4);
        mask = NUC_MASK << shift;
        nuc = (seq_b[i/4] & mask) >> shift;

        switch (nuc)
        {
            case NUC_A:
                seq[i] = 'a';
                break;
            case NUC_C:
                seq[i] = 'c';
                break;
            case NUC_G:
                seq[i] = 'g';
                break;
            case NUC_T:
                seq[i] = 't';
                break;
            default:
                obidebug(1, "\nInvalid nucleotide base when decoding");
                free(seq);      // do not leak the partially decoded sequence
                return NULL;
        }
    }

    seq[length_seq] = '\0';

    return seq;
}
////////// FOR DEBUGGING ///////////
// little endian
/*
 * Debugging helper: writes on stderr the bits of the `size` bytes found at
 * `ptr`. Bytes are written in memory order and separated by a space; the
 * bits of a byte are written from the most to the least significant one.
 * The output is surrounded by line feeds.
 */
void print_bits(void* ptr, int32_t size)
{
    const uint8_t* bytes = (const uint8_t*) ptr;
    int32_t byte_idx;
    int32_t bit_idx;

    fputc('\n', stderr);

    for (byte_idx = 0; byte_idx < size; byte_idx++)
    {
        for (bit_idx = 7; bit_idx >= 0; bit_idx--)
        {
            fputc(((bytes[byte_idx] >> bit_idx) & 1) ? '1' : '0', stderr);
        }
        fputc(' ', stderr);
    }

    fputc('\n', stderr);
}
/****************************************************************************
* Encoding header file *
****************************************************************************/
/**
* @file encode.h
* @author Celine Mercier
* @date November 18th 2015
* @brief Header file for encoding DNA sequences.
*/
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include "obiarray.h"
#define NUC_MASK 0x3 /**< Binary: 11. Mask covering one 2-bits nucleotide code; when decoding, it is shifted to the position of the nucleotide to read. */
/**
* @brief enum for the 2-bits codes for each of the 4 nucleotides.
*
* These are the codes written by encode_seq_on_2_bits() and read back by
* decode_seq_on_2_bits(). Upper and lower case letters share the same code.
*/
enum
{
NUC_A = 0x0, /* binary: 00 ('A' or 'a') */
NUC_C = 0x1, /* binary: 01 ('C' or 'c') */
NUC_G = 0x2, /* binary: 10 ('G' or 'g') */
NUC_T = 0x3, /* binary: 11 ('T' or 't') */
};
/**
* @brief Checks if there are only 'atgcATGC' characters in a
* character string.
*
* The string is read up to its terminating '\0' and the check stops at the
* first character that is not one of 'atgcATGC'. An empty string contains no
* other character and is therefore accepted.
*
* @param seq The '\0'-terminated sequence to check.
*
* @returns A boolean value indicating if there are only
* 'atgcATGC' characters in the character string.
* @retval true if all the characters are 'atgcATGC' characters.
* @retval false if at least one other character was found.
*
* @since November 2015
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
bool only_ATGC(char* seq);
/**
* @brief Encodes a DNA sequence with each nucleotide coded on 2 bits.
*
* A or a : 00
* C or c : 01
* G or g : 10
* T or t : 11
*
* Nucleotides are packed 4 per byte, the first nucleotide of a byte being
* stored in its 2 most significant bits.
*
* @warning The DNA sequence must contain only 'atgcATGC' characters.
* @warning The encoded sequence is allocated with malloc and must be freed by the caller.
*
* @param seq The sequence to encode.
* @param length The length of the sequence to encode, in nucleotides.
*
* @returns The encoded sequence, stored on ceil(length / 4) bytes.
* @retval NULL if a character that is not one of 'atgcATGC' was found.
*
* @since November 2015
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
byte_t* encode_seq_on_2_bits(char* seq, int32_t length);
/**
* @brief Decodes a DNA sequence that is coded with each nucleotide on 2 bits.
*
* 00 : a
* 01 : c
* 10 : g
* 11 : t
*
* The decoded sequence is always in lower case, whatever the case of the
* sequence that was encoded.
*
* @warning The decoded sequence is allocated with malloc.
*
* @param seq_b The encoded sequence to decode.
* @param length_seq The initial length of the sequence before it was encoded, in nucleotides.
*
* @returns The decoded sequence ended with '\0'.
*
* @since November 2015
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
char* decode_seq_on_2_bits(byte_t* seq_b, int32_t length_seq);
////////// FOR DEBUGGING ///////////
/**
* @brief Prints on stderr the bits of a memory area, byte after byte.
*
* Bytes are printed in memory order and separated by a space; the bits of
* each byte are printed from the most significant to the least significant.
* NOTE(review): the original comment here only said "little endian" --
* confirm what it was meant to warn about.
*
* @param ptr A pointer on the first byte to print.
* @param length The number of bytes to print.
*/
// little endian
void print_bits(void* ptr, int32_t length);
......@@ -24,6 +24,7 @@
#include "obitypes.h"
#include "obidebug.h"
#include "private_at_functions.h"
#include "encode.h"
#define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?)
......@@ -446,6 +447,8 @@ int array_compare(byte_t* value_1, byte_t* value_2)
uint8_t size_2;
int32_t len_1;
int32_t len_2;
int32_t ini_len_1;
int32_t ini_len_2;
int32_t b;
//obidebug(1, "\nCOMPARING 1=%d,%.*s; 2=%d,%.*s", *((int32_t*)(value_1+1)), *((int32_t*)(value_1+1)), value_1+BYTE_ARRAY_HEADER_SIZE, *((int32_t*)(value_2+1)), *((int32_t*)(value_2+1)), value_2+BYTE_ARRAY_HEADER_SIZE);
......@@ -462,6 +465,15 @@ int array_compare(byte_t* value_1, byte_t* value_2)
if (len_1 != len_2)
return (len_1 - len_2);
if (size_1 != 8)
{
ini_len_1 = *((int32_t*)(value_1+5));
ini_len_2 = *((int32_t*)(value_2+5));
if (ini_len_1 != ini_len_2)
return (ini_len_1 - ini_len_2);
}
b = BYTE_ARRAY_HEADER_SIZE;
comp = 0;
while (!comp && (b < len_1+BYTE_ARRAY_HEADER_SIZE))
......@@ -475,7 +487,7 @@ int array_compare(byte_t* value_1, byte_t* value_2)
size_t array_sizeof(byte_t* value)
{
return (BYTE_ARRAY_HEADER_SIZE + *((int32_t*)(value+1)) + 1);
return (BYTE_ARRAY_HEADER_SIZE + *((int32_t*)(value+1)));
}
......@@ -995,6 +1007,8 @@ index_t obi_array_add(OBIDMS_array_p array, byte_t* value)
(array->first)[idx] = data_size_used;
// Store the value itself at the end of the data
// fprintf(stderr, "\nMEMCOPYING TO STORE, with size %ld :", value_size);
// printBits(value_size, value);
memcpy((((array->data)->data)+data_size_used), value, value_size);
// Update the data size
......@@ -1079,8 +1093,8 @@ byte_t* obi_str_to_obibytes(char* value)
uint8_t size;
size = 8;
length = strlen(value);
value_b = (byte_t*) malloc(length + BYTE_ARRAY_HEADER_SIZE + 1);
length = strlen(value) + 1; // +1 to store \0 at the end (makes retrieving faster)
value_b = (byte_t*) malloc(BYTE_ARRAY_HEADER_SIZE + length);
if (value_b == NULL)
{
obi_set_errno(OBI_ARRAY_ERROR);
......@@ -1090,7 +1104,8 @@ byte_t* obi_str_to_obibytes(char* value)
*(value_b) = size;
*((int32_t*)(value_b+1)) = length;
*((int32_t*)(value_b+1)) = length; // TODO comment
*((int32_t*)(value_b+5)) = length;
strcpy(value_b+BYTE_ARRAY_HEADER_SIZE, value);
......@@ -1107,3 +1122,73 @@ const char* obi_obibytes_to_str(byte_t* value_b)
return value;
}
byte_t* obi_seq_to_obibytes(char* seq)
{
byte_t* value_b;
int32_t length; // length of the value (without the header) in bytes
uint8_t size; // size of one element in bits
int32_t seq_length;
byte_t* encoded_seq;
// Check if just ATGC and set size of a nucleotide accordingly (2 bits or 4 bits)
//fprintf(stderr, "\nonly ATGC = %d", only_ATGC(seq));
if (only_ATGC(seq))
size = 2;
else
size = 4;
// Set length
seq_length = strlen(seq);
if (size == 2)
length = ceil((double) seq_length / (double) 4.0);
else // size == 4
length = ceil((double) seq_length / (double) 2.0);
// Encode
if (size == 2)
encoded_seq = encode_seq_on_2_bits(seq, seq_length);
else // size == 4
return NULL;
// encoded_seq = encode_seq_on_4_bits(seq, seq_length);
// Set the values in the byte array
value_b = (byte_t*) malloc(BYTE_ARRAY_HEADER_SIZE + length);
*(value_b) = size;
*((int32_t*)(value_b+1)) = length;
*((int32_t*)(value_b+5)) = seq_length;
//fprintf(stderr, "\nstored seq length : %d\n", *((int32_t*)(value_b+5)));
memcpy(value_b+BYTE_ARRAY_HEADER_SIZE, encoded_seq, length);
//obidebug(1, "\n\nENCODED VALUE_B = ");
//printBits(((*((int32_t*)(value_b+1)))+BYTE_ARRAY_HEADER_SIZE), value_b);
free(encoded_seq);
return value_b;
}
/*
 * Converts a byte array with a header to a DNA sequence.
 *
 * The first byte of the header gives the size in bits of one encoded
 * nucleotide, and the int32_t stored at offset 5 gives the length of the
 * sequence in nucleotides. Only byte arrays coded on 2 bits are handled for
 * now.
 *
 * value_b  The byte array to convert.
 *
 * Returns the '\0'-terminated sequence as allocated by
 * decode_seq_on_2_bits(), or NULL if value_b is NULL, if the byte array is
 * not coded on 2 bits or if the decoding failed.
 */
const char* obi_obibytes_to_seq(byte_t* value_b)
{
    uint8_t size;           // size of one element in bits
    int32_t seq_length;     // length of the sequence in nucleotides

    if (value_b == NULL)
        return NULL;

    size = *(value_b);

    // Only the 2 bits encoding is implemented for now
    if (size != 2)
        return NULL;
    //  value = decode_seq_on_4_bits(value_b+BYTE_ARRAY_HEADER_SIZE, *((int32_t*)(value_b+5)));

    // The field is not aligned (offset 5): it is read with memcpy rather
    // than through a casted pointer
    memcpy(&seq_length, value_b+5, sizeof(int32_t));

    // Decode
    return decode_seq_on_2_bits(value_b+BYTE_ARRAY_HEADER_SIZE, seq_length);
}
......@@ -29,7 +29,7 @@
*/
#define ARRAY_GROWTH_FACTOR (2) /**< The growth factor when an array is enlarged.
*/
#define BYTE_ARRAY_HEADER_SIZE (5) /**< The size of the header of a byte array.
#define BYTE_ARRAY_HEADER_SIZE (9) /**< The size of the header of a byte array.
*/
......@@ -284,5 +284,34 @@ byte_t* obi_str_to_obibytes(char* value);
const char* obi_obibytes_to_str(byte_t* value_b);
/**
* @brief Converts a DNA sequence to a byte array with a header.
*
* The header stores, in this order: the size in bits of one encoded
* nucleotide (1 byte), the length in bytes of the encoded sequence (int32_t)
* and the length in nucleotides of the sequence (int32_t). The encoded
* sequence follows the header.
*
* @warning The byte array must be freed by the caller.
* @warning Only sequences made of 'atgcATGC' characters (coded on 2 bits)
* are handled for now: NULL is returned for any other sequence.
*
* @param seq The DNA sequence to convert.
*
* @returns A pointer to the byte array created.
* @retval NULL if an error occurred.
*
* @since November 2015
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
byte_t* obi_seq_to_obibytes(char* seq);
/**
* @brief Converts a byte array to a DNA sequence.