Commit 019dfc01 by Celine Mercier

Branch to refactor and debug (AVLs bugged)

parent edc4fd7b
......@@ -12,8 +12,6 @@
../../../src/obilittlebigman.c
../../../src/obitypes.h
../../../src/obitypes.c
../../../src/private_at_functions.h
../../../src/private_at_functions.c
../../../src/obiavl.h
../../../src/obiavl.c
../../../src/encode.h
......@@ -28,3 +26,5 @@
../../../src/murmurhash2.h
../../../src/crc64.c
../../../src/crc64.h
../../../src/utils.c
../../../src/utils.h
......@@ -14,7 +14,7 @@
../../../src/obilittlebigman.c
../../../src/obitypes.h
../../../src/obitypes.c
../../../src/private_at_functions.h
../../../src/private_at_functions.c
../../../src/utils.c
../../../src/utils.h
../../../src/obiavl.h
../../../src/obiavl.c
......@@ -14,7 +14,7 @@
../../../src/obilittlebigman.c
../../../src/obitypes.h
../../../src/obitypes.c
../../../src/private_at_functions.h
../../../src/private_at_functions.c
../../../src/utils.c
../../../src/utils.h
../../../src/obiavl.h
../../../src/obiavl.c
......@@ -14,7 +14,7 @@
../../../src/obilittlebigman.c
../../../src/obitypes.h
../../../src/obitypes.c
../../../src/private_at_functions.h
../../../src/private_at_functions.c
../../../src/utils.c
../../../src/utils.h
../../../src/obiavl.h
../../../src/obiavl.c
......@@ -14,7 +14,7 @@
../../../src/obilittlebigman.c
../../../src/obitypes.h
../../../src/obitypes.c
../../../src/private_at_functions.h
../../../src/private_at_functions.c
../../../src/utils.c
../../../src/utils.h
../../../src/obiavl.h
../../../src/obiavl.c
......@@ -14,7 +14,7 @@
../../../src/obilittlebigman.c
../../../src/obitypes.h
../../../src/obitypes.c
../../../src/private_at_functions.h
../../../src/private_at_functions.c
../../../src/utils.c
../../../src/utils.h
../../../src/obiavl.h
../../../src/obiavl.c
......@@ -10,19 +10,23 @@ from .capi.obitypes cimport OBISeq_NA, const_char_p
from obitools3.utils cimport str2bytes, bytes2str
from libc.stdlib cimport free
from libc.string cimport strcmp
cdef class OBIDMS_column_seq(OBIDMS_column):
cpdef object get_line(self, index_t line_nb):
cdef bytes value
cdef char* value
cdef object result
value = <bytes> obi_column_get_obiseq_with_elt_idx_in_view(self.view, (self.pointer)[0], line_nb, 0)
value = obi_column_get_obiseq_with_elt_idx_in_view(self.view, (self.pointer)[0], line_nb, 0)
if obi_errno > 0 :
raise IndexError(line_nb)
if value == OBISeq_NA : # TODO
if strcmp(value, OBISeq_NA) == 0 :
result = None
else :
result = bytes2str(value)
free(value)
return result
cpdef set_line(self, index_t line_nb, object value):
......@@ -38,33 +42,35 @@ cdef class OBIDMS_column_seq(OBIDMS_column):
cdef class OBIDMS_column_multi_elts_seq(OBIDMS_column_multi_elts):
cpdef object get_item(self, index_t line_nb, str element_name):
cdef bytes value
cdef char* value
cdef object result
value = <bytes> obi_column_get_obiseq_with_elt_name_in_view(self.view, (self.pointer)[0], line_nb, str2bytes(element_name))
value = obi_column_get_obiseq_with_elt_name_in_view(self.view, (self.pointer)[0], line_nb, str2bytes(element_name))
if obi_errno > 0 :
raise IndexError(line_nb, element_name)
if value == OBISeq_NA :
if strcmp(value, OBISeq_NA) == 0 :
result = None
else :
result = bytes2str(value)
free(value)
return result
cpdef object get_line(self, index_t line_nb) :
cdef bytes value
cdef object value_in_result
cdef dict result
cdef char* value
cdef object value_in_result
cdef dict result
cdef index_t i
cdef bint all_NA
cdef bint all_NA
result = {}
all_NA = True
for i in range(self.nb_elements_per_line) :
value = <bytes> obi_column_get_obiseq_with_elt_idx_in_view(self.view, (self.pointer)[0], line_nb, i)
value = obi_column_get_obiseq_with_elt_idx_in_view(self.view, (self.pointer)[0], line_nb, i)
if obi_errno > 0 :
raise IndexError(line_nb)
if value == OBISeq_NA :
if strcmp(value, OBISeq_NA) == 0 :
value_in_result = None
else :
value_in_result = bytes2str(value)
value_in_result = bytes2str(value)
free(value)
result[self.elements_names[i]] = value_in_result
if all_NA and (value_in_result is not None) :
all_NA = False
......
......@@ -14,7 +14,7 @@
../../../src/obilittlebigman.c
../../../src/obitypes.h
../../../src/obitypes.c
../../../src/private_at_functions.h
../../../src/private_at_functions.c
../../../src/utils.c
../../../src/utils.h
../../../src/obiavl.h
../../../src/obiavl.c
......@@ -10,19 +10,22 @@ from .capi.obitypes cimport OBIStr_NA, const_char_p
from obitools3.utils cimport str2bytes, bytes2str
from libc.string cimport strcmp
cdef class OBIDMS_column_str(OBIDMS_column):
cpdef object get_line(self, index_t line_nb):
cdef bytes value
cdef char* value
cdef object result
value = <bytes> obi_column_get_obistr_with_elt_idx_in_view(self.view, (self.pointer)[0], line_nb, 0)
value = obi_column_get_obistr_with_elt_idx_in_view(self.view, (self.pointer)[0], line_nb, 0)
if obi_errno > 0 :
raise IndexError(line_nb)
if value == OBIStr_NA : # TODO
if strcmp(value, OBIStr_NA) == 0 :
result = None
else :
result = bytes2str(value)
# NOTE: value is not freed because the pointer points to a mmapped region in an AVL data file. (TODO discuss)
return result
cpdef set_line(self, index_t line_nb, object value):
......@@ -38,33 +41,35 @@ cdef class OBIDMS_column_str(OBIDMS_column):
cdef class OBIDMS_column_multi_elts_str(OBIDMS_column_multi_elts):
cpdef object get_item(self, index_t line_nb, str element_name):
cdef bytes value
cdef char* value
cdef object result
value = <bytes> obi_column_get_obistr_with_elt_name_in_view(self.view, (self.pointer)[0], line_nb, str2bytes(element_name))
value = obi_column_get_obistr_with_elt_name_in_view(self.view, (self.pointer)[0], line_nb, str2bytes(element_name))
if obi_errno > 0 :
raise IndexError(line_nb, element_name)
if value == OBIStr_NA :
if strcmp(value, OBIStr_NA) == 0 :
result = None
else :
result = bytes2str(value)
# NOTE: value is not freed because the pointer points to a mmapped region in an AVL data file. (TODO discuss)
return result
cpdef object get_line(self, index_t line_nb) :
cdef bytes value
cdef object value_in_result
cdef dict result
cdef char* value
cdef object value_in_result
cdef dict result
cdef index_t i
cdef bint all_NA
cdef bint all_NA
result = {}
all_NA = True
for i in range(self.nb_elements_per_line) :
value = <bytes> obi_column_get_obistr_with_elt_idx_in_view(self.view, (self.pointer)[0], line_nb, i)
value = obi_column_get_obistr_with_elt_idx_in_view(self.view, (self.pointer)[0], line_nb, i)
if obi_errno > 0 :
raise IndexError(line_nb)
if value == OBIStr_NA :
if strcmp(value, OBIStr_NA) == 0 :
value_in_result = None
else :
value_in_result = bytes2str(value)
value_in_result = bytes2str(value)
# NOTE: value is not freed because the pointer points to a mmapped region in an AVL data file. (TODO discuss)
result[self.elements_names[i]] = value_in_result
if all_NA and (value_in_result is not None) :
all_NA = False
......
......@@ -12,8 +12,8 @@
../../../src/obilittlebigman.c
../../../src/obitypes.h
../../../src/obitypes.c
../../../src/private_at_functions.h
../../../src/private_at_functions.c
../../../src/utils.c
../../../src/utils.h
../../../src/obiavl.h
../../../src/obiavl.c
../../../src/encode.h
......
......@@ -26,7 +26,7 @@ cdef class OBI_Seq(dict) :
self[bytes2str(DESCRIPTION_COLUMN)] = description
cpdef get_description(self) :
return self.description
return self.description # TODO no
cpdef get_sequence(self) :
return self.sequence
......@@ -48,28 +48,25 @@ cdef class OBI_Nuc_Seq(OBI_Seq) :
cdef class OBI_Nuc_Seq_Stored(OBIView_line) :
cpdef set_id(self, str id) :
self.id = id
self[bytes2str(ID_COLUMN)] = id
cpdef get_id(self) :
return self.id
return self[bytes2str(ID_COLUMN)]
cpdef set_description(self, str description) :
self.description = description
self[bytes2str(DESCRIPTION_COLUMN)] = description
cpdef get_description(self) :
return self.description
return self[bytes2str(DESCRIPTION_COLUMN)]
cpdef set_sequence(self, str sequence) :
self.sequence = sequence
self[bytes2str(NUC_SEQUENCE_COLUMN)] = sequence
cpdef get_sequence(self) :
return self.sequence
return self[bytes2str(NUC_SEQUENCE_COLUMN)]
def __str__(self) :
return self.sequence # or not
return self[bytes2str(NUC_SEQUENCE_COLUMN)] # or not
# cpdef str reverse_complement(self) : TODO in C ?
# pass
......
......@@ -12,8 +12,8 @@
../../../src/obilittlebigman.c
../../../src/obitypes.h
../../../src/obitypes.c
../../../src/private_at_functions.h
../../../src/private_at_functions.c
../../../src/utils.c
../../../src/utils.h
../../../src/obiavl.h
../../../src/obiavl.c
../../../src/encode.h
......
......@@ -5,7 +5,7 @@ import time
from obitools3.obidms._obidms import OBIDMS
def bufferedRead(fileobj,size=100000000):
def bufferedRead(fileobj,size=209715200): ## 200 MB
buffer = fileobj.readlines(size)
while buffer:
for l in buffer:
......@@ -26,14 +26,16 @@ if __name__ == '__main__':
view = d.new_view('uniq view', view_type="NUC_SEQS_VIEW")
# for i in range(35000000) :
# if (not (i%500000)) :
# print(str(time.time())+'\t'+str(i))
# id = "@HWI-D00405:142:C71BAANXX:4:1101:1234:2234_CONS_SUB_SUB_"+str(i)
# view[i].set_id(id)
for i in range(35000000) :
if (not (i%500000)) :
print(str(time.time())+'\t'+str(i))
id = "@HWI-D00405:142:C71BAANXX:4:1101:1234:2234_CONS_SUB_SUB_"+str(i)
view[i].set_id(id)
if id != view[i]["ID"] :
print("nope", id, view[i]["ID"])
input_file = open(args.input_file, 'r')
input_file_buffered = bufferedRead(input_file)
# input_file = open(args.input_file, 'r')
# input_file_buffered = bufferedRead(input_file)
#
# if args.input_file[-1:] == "a" :
......@@ -111,37 +113,37 @@ if __name__ == '__main__':
# l = 0
# next = False
#
l=0
i=0
# l=0
# i=0
# while (True):
# l+=1
# line = input_file.readline()
# if line=="":
# break
for line in input_file_buffered :
# for line in input_file_buffered :
#
# #if i > 1E7 :
# # print('hmm?')
#
# #if i == 10000000 :
# # break
# if i == 6000000 :
# break
#
if l%4 == 0 :
# if l%4 == 0 :
#
if (not (i%500000)) :
print(str(time.time())+'\t'+str(i))
# if (not (i%500000)) :
# print(str(time.time())+'\t'+str(i))
# #
# # #print("header", line)
# #
id = line.split(" ", 1)[0][1:]
print(id)
# id = line.split(" ", 1)[0][1:]
# print(id)
# # #rest = (line[:-1].split(" ", 1)[1]).split(";")
view[i].set_id(id)
#print(view[i]["ID"])
# view[i].set_id(id)
# print(view[i]["ID"])
#
i+=1
# i+=1
l+=1
# l+=1
#
# # description = ""
# # for j in range(len(rest)) :
......@@ -186,7 +188,7 @@ if __name__ == '__main__':
# l+=1
#
#
input_file.close()
# input_file.close()
#print(view)
print(view.__repr__())
......
--extra-index-url https://pypi.python.org/simple/
Cython>=0.21
Cython==0.23.5
Sphinx>=1.2.0
ipython>=3.0.0
breathe>=4.0.0
......@@ -122,6 +122,40 @@ static void setup_buckets(struct bloom * bloom, unsigned int cache_size)
}
// TODO
int bloom_filter_size(int entries, double error)
{
int bytes;
double num;
double denom;
double bpe;
int bits;
unsigned bucket_bytes;
int not_even_by;
num = log(error);
denom = 0.480453013918201; // ln(2)^2
bpe = -(num / denom);
bits = (int)(((double)entries) * bpe);
if (bits % 8) {
bytes = (bits / 8) + 1;
}
else {
bytes = bits / 8;
}
bucket_bytes = BLOOM_BUCKET_SIZE_FALLBACK;
not_even_by = bytes % bucket_bytes;
if (not_even_by) {
// adjust bytes
bytes += (bucket_bytes - not_even_by);
}
return bytes;
}
int bloom_init_size(struct bloom * bloom, int entries, double error,
unsigned int cache_size)
{
......@@ -151,19 +185,21 @@ int bloom_init_size(struct bloom * bloom, int entries, double error,
setup_buckets(bloom, cache_size);
bloom->bf = (unsigned char *)calloc(bloom->bytes, sizeof(unsigned char));
if (bloom->bf == NULL) {
return 1;
}
// TODO comment
memset(bloom->bf, 0, bloom->bytes);
//bloom->bf = (unsigned char *)calloc(bloom->bytes, sizeof(unsigned char));
//if (bloom->bf == NULL) {
// return 1;
//}
bloom->ready = 1;
return 0;
}
int bloom_init(struct bloom * bloom, int entries, double error)
int bloom_init(struct bloom * bloom, int entries) //, double error)
{
return bloom_init_size(bloom, entries, error, 0);
return bloom_init_size(bloom, entries, BLOOM_FILTER_ERROR_RATE, 0);
}
......
......@@ -9,6 +9,10 @@
#define _BLOOM_H
// TODO
#define BLOOM_FILTER_ERROR_RATE (0.001)
/** ***************************************************************************
* On Linux, the code attempts to compute a bucket size based on CPU cache
* size info, if available. If that fails for any reason, this fallback size
......@@ -60,10 +64,17 @@ struct bloom
unsigned bucket_bits_fast_mod_operand;
double bpe;
unsigned char * bf;
int ready;
unsigned char bf[];
};
typedef struct bloom bloom_t;
// TODO
int bloom_filter_size(int entries, double error);
/** ***************************************************************************
* Initialize the bloom filter for use.
......@@ -91,7 +102,7 @@ struct bloom
* 1 - on failure
*
*/
int bloom_init(struct bloom * bloom, int entries, double error);
int bloom_init(struct bloom * bloom, int entries); //, double error);
/** ***************************************************************************
......
......@@ -64,6 +64,12 @@ byte_t* encode_seq_on_2_bits(char* seq, int32_t length)
length_b = ceil((double) length / (double) 4.0);
seq_b = (byte_t*) malloc(length_b * sizeof(byte_t));
if (seq_b == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR); // TODO
obidebug(1, "\nError allocating memory for an encoded DNA sequence");
return NULL;
}
// Initialize all the bits to 0
memset(seq_b, 0, length_b);
......@@ -93,6 +99,7 @@ byte_t* encode_seq_on_2_bits(char* seq, int32_t length)
seq_b[i/4] |= NUC_T_2b;
break;
default:
obi_set_errno(OBI_ENCODE_ERROR); // TODO
obidebug(1, "\nInvalid nucleotide base when encoding (not [atgcATGC])");
return NULL;
}
......@@ -116,6 +123,12 @@ char* decode_seq_on_2_bits(byte_t* seq_b, int32_t length_seq)
uint8_t nuc;
seq = (char*) malloc((length_seq+1) * sizeof(char));
if (seq == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR); // TODO
obidebug(1, "\nError allocating memory for a decoded DNA sequence");
return NULL;
}
for (i=0; i<length_seq; i++)
{
......@@ -138,6 +151,7 @@ char* decode_seq_on_2_bits(byte_t* seq_b, int32_t length_seq)
seq[i] = 't';
break;
default:
obi_set_errno(OBI_DECODE_ERROR); // TODO
obidebug(1, "\nInvalid nucleotide base when decoding");
return NULL;
}
......@@ -159,6 +173,12 @@ byte_t* encode_seq_on_4_bits(char* seq, int32_t length)
length_b = ceil((double) length / (double) 2.0);
seq_b = (byte_t*) malloc(length_b * sizeof(byte_t));
if (seq_b == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR); // TODO
obidebug(1, "\nError allocating memory for an encoded DNA sequence");
return NULL;
}
// Initialize all the bits to 0
memset(seq_b, 0, length_b);
......@@ -232,6 +252,7 @@ byte_t* encode_seq_on_4_bits(char* seq, int32_t length)
seq_b[i/2] |= NUC_N_4b;
break;
default:
obi_set_errno(OBI_ENCODE_ERROR); // TODO
obidebug(1, "\nInvalid nucleotide base when encoding (not IUPAC)");
return NULL;
}
......@@ -255,6 +276,12 @@ char* decode_seq_on_4_bits(byte_t* seq_b, int32_t length_seq)
uint8_t nuc;
seq = (char*) malloc((length_seq+1) * sizeof(char));
if (seq == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR); // TODO
obidebug(1, "\nError allocating memory for a decoded DNA sequence");
return NULL;
}
for (i=0; i<length_seq; i++)
{
......@@ -310,6 +337,7 @@ char* decode_seq_on_4_bits(byte_t* seq_b, int32_t length_seq)
seq[i] = 'n';
break;
default:
obi_set_errno(OBI_DECODE_ERROR); // TODO
obidebug(1, "\nInvalid nucleotide base when decoding");
return NULL;
}
......@@ -321,6 +349,111 @@ char* decode_seq_on_4_bits(byte_t* seq_b, int32_t length_seq)
}
Obi_byte_array_p obi_byte_array(byte_t* encoded_value, uint8_t element_size, int32_t length_encoded_value, int32_t length_decoded_value)
{
Obi_byte_array_p byte_array;
// Allocate the memory for the byte array structure
byte_array = (Obi_byte_array_p) malloc(sizeof(Obi_byte_array_t) + length_encoded_value);
if (byte_array == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR); // TODO
obidebug(1, "\nError allocating memory for a byte array");
return NULL;
}
// Store the number of bits on which each element is encoded
byte_array->element_size = element_size;
// Store the length (in bytes) of the encoded value
byte_array->length_encoded_value = length_encoded_value;
// Store the initial length (in bytes) of the decoded value
byte_array->length_decoded_value = length_decoded_value;
// Store the encoded value
memcpy(byte_array->value, encoded_value, length_encoded_value);
return byte_array;
}
Obi_byte_array_p obi_str_to_obibytes(char* value)
{
Obi_byte_array_p value_b;
int32_t length;
// Compute the number of bytes on which the value will be encoded
length = strlen(value) + 1; // +1 to store \0 at the end (makes retrieving faster)
value_b = obi_byte_array(value, ELEMENT_SIZE_STR, length, length);
if (value_b == NULL)
{
obidebug(1, "\nError encoding a character string in a byte array");
return NULL;
}
return value_b;
}
const char* obi_obibytes_to_str(Obi_byte_array_p value_b)
{
fprintf(stderr, "\n%s", value_b->value);
return value_b->value;
}
Obi_byte_array_p obi_seq_to_obibytes(char* seq)
{
Obi_byte_array_p value_b;
int32_t length_encoded_seq; // length of the encoded sequence in bytes
int32_t seq_length;
byte_t* encoded_seq;
seq_length = strlen(seq);
// Check if just ATGC and encode accordingly
if (only_ATGC(seq))
{
// Compute the length (in bytes) of the encoded sequence
length_encoded_seq = ceil((double) seq_length / (double) 4.0);
// Encode
encoded_seq = encode_seq_on_2_bits(seq, seq_length);
if (encoded_seq == NULL)
return NULL;
value_b = obi_byte_array(encoded_seq, ELEMENT_SIZE_SEQ_2, length_encoded_seq, seq_length);
}
else
{
// Compute the length (in bytes) of the encoded sequence
length_encoded_seq = ceil((double) seq_length / (double) 2.0);
// Encode
encoded_seq = encode_seq_on_4_bits(seq, seq_length);
if (encoded_seq == NULL)
return NULL;
value_b = obi_byte_array(encoded_seq, ELEMENT_SIZE_SEQ_4, length_encoded_seq, seq_length);
}
free(encoded_seq);
return value_b;
}
const char* obi_obibytes_to_seq(Obi_byte_array_p value_b)
{
// Decode
if (value_b->element_size == 2)
return decode_seq_on_2_bits(value_b->value, value_b->length_decoded_value);
else
return decode_seq_on_4_bits(value_b->value, value_b->length_decoded_value);
}
// TODO same for int
///////////////////// FOR DEBUGGING ///////////////////////////
//NOTE: The first byte is printed the first (at the left-most).
......
......@@ -10,6 +10,10 @@
*/
#ifndef ENCODE_H_
#define ENCODE_H_
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
......@@ -18,8 +22,31 @@
#include "obitypes.h"
#define NUC_MASK_2B 0x3 /**< Binary: 11 to use when decoding 2 bits sequences */
#define NUC_MASK_4B 0xF /**< Binary: 1111 to use when decoding 4 bits sequences */
#define NUC_MASK_2B 0x3 /**< Binary: 11 to use when decoding 2 bits sequences
*/
#define NUC_MASK_4B 0xF /**< Binary: 1111 to use when decoding 4 bits sequences
*/
#define ELEMENT_SIZE_STR (8) /**< The size of an element from a value of type character string.
*/
#define ELEMENT_SIZE_SEQ_2 (2) /**< The size of an element from a value of type DNA sequence encoded on 2 bits.
*/
#define ELEMENT_SIZE_SEQ_4 (4) /**< The size of an element from a value of type DNA sequence encoded on 4 bits.
*/
/**
* @brief Byte array structure.
*/
typedef struct Obi_byte_array {
uint8_t element_size; /**< Size in bits of one element from the value.
*/
int32_t length_encoded_value; /**< Length in bytes of the encoded value.
*/
int32_t length_decoded_value; /**< Length in bytes of the decoded value.
*/
byte_t value[]; /**< Encoded value.
*/
} Obi_byte_array_t, *Obi_byte_array_p;
/**
......@@ -174,8 +201,70 @@ byte_t* encode_seq_on_4_bits(char* seq, int32_t length);
char* decode_seq_on_4_bits(byte_t* seq_b, int32_t length_seq);
/**
* @brief Converts a character string to a byte array with a header.
*
* @warning The byte array must be freed by the caller.
*
* @param value The character string to convert.
*
* @returns A pointer to the byte array created.
* @retval NULL if an error occurred.</