Commit c1393675 authored by Celine Mercier

DNA sequences and character strings are now handled using AVL trees.

parent 1586956d
......@@ -12,7 +12,7 @@
../../../src/obitypes.c
../../../src/private_at_functions.h
../../../src/private_at_functions.c
../../../src/obiarray.h
../../../src/obiarray.c
../../../src/obiavl.h
../../../src/obiavl.c
../../../src/encode.h
../../../src/encode.c
\ No newline at end of file
......@@ -25,7 +25,7 @@ cdef class OBIDMS:
index_t nb_lines=*,
index_t nb_elements_per_line=*,
list elements_names=*,
str array_name=*,
str avl_name=*,
str comments=*)
......
......@@ -133,7 +133,7 @@ cdef class OBIDMS :
index_t nb_lines=0,
index_t nb_elements_per_line=0,
list elements_names=None,
str array_name="default_obiarray",
str avl_name="default_AVL_tree",
str comments=""):
# Declarations
......@@ -263,7 +263,7 @@ cdef class OBIDMS :
referring,
version_number, data_type,
nb_lines, nb_elements_per_line,
elements_names, array_name,
elements_names, avl_name,
comments)
return column
......@@ -284,13 +284,13 @@ cdef class OBIDMS_column :
index_t nb_lines,
index_t nb_elements_per_line,
list elements_names,
str array_name,
str avl_name,
str comments):
# Declarations
cdef bytes column_name_b
cdef bytes dms_name_b
cdef bytes array_name_b
cdef bytes avl_name_b
cdef bytes elements_names_b
cdef bytes comments_b
......@@ -304,7 +304,7 @@ cdef class OBIDMS_column :
# Format the character strings to send them to C functions
column_name_b = str2bytes(column_name)
dms_name_b = str2bytes(self.dms.dms_name)
array_name_b = str2bytes(array_name)
avl_name_b = str2bytes(avl_name)
comments_b = str2bytes(comments)
# Create, clone or open column
......@@ -315,7 +315,7 @@ cdef class OBIDMS_column :
elements_names_b = str2bytes(";".join(elements_names))
self.pointer = obi_create_column(self.dms.pointer, column_name_b, type,
nb_lines, nb_elements_per_line,
elements_names_b, array_name_b, comments_b,
elements_names_b, avl_name_b, comments_b,
referring)
else :
if clone :
......
......@@ -14,5 +14,5 @@
../../../src/obitypes.c
../../../src/private_at_functions.h
../../../src/private_at_functions.c
../../../src/obiarray.h
../../../src/obiarray.c
../../../src/obiavl.h
../../../src/obiavl.c
......@@ -14,5 +14,5 @@
../../../src/obitypes.c
../../../src/private_at_functions.h
../../../src/private_at_functions.c
../../../src/obiarray.h
../../../src/obiarray.c
../../../src/obiavl.h
../../../src/obiavl.c
......@@ -14,5 +14,5 @@
../../../src/obitypes.c
../../../src/private_at_functions.h
../../../src/private_at_functions.c
../../../src/obiarray.h
../../../src/obiarray.c
../../../src/obiavl.h
../../../src/obiavl.c
......@@ -14,5 +14,5 @@
../../../src/obitypes.c
../../../src/private_at_functions.h
../../../src/private_at_functions.c
../../../src/obiarray.h
../../../src/obiarray.c
../../../src/obiavl.h
../../../src/obiavl.c
......@@ -14,5 +14,5 @@
../../../src/obitypes.c
../../../src/private_at_functions.h
../../../src/private_at_functions.c
../../../src/obiarray.h
../../../src/obiarray.c
../../../src/obiavl.h
../../../src/obiavl.c
......@@ -14,5 +14,5 @@
../../../src/obitypes.c
../../../src/private_at_functions.h
../../../src/private_at_functions.c
../../../src/obiarray.h
../../../src/obiarray.c
../../../src/obiavl.h
../../../src/obiavl.c
......@@ -30,7 +30,7 @@ cdef extern from "obidmscolumn.h" nogil:
bint referring
obiversion_t referred_column_version
const_char_p name
const_char_p array_name
const_char_p avl_name
const_char_p comments
ctypedef OBIDMS_column_header_t* OBIDMS_column_header_p
......@@ -47,7 +47,7 @@ cdef extern from "obidmscolumn.h" nogil:
index_t nb_lines,
index_t nb_elements_per_line,
const_char_p elements_names,
const_char_p array_name,
const_char_p avl_name,
const_char_p comments,
bint referring)
......
......@@ -16,7 +16,8 @@
#include <math.h>
#include "encode.h"
#include "obiarray.h"
#include "obierrno.h"
#include "obitypes.h" // For byte_t type
#include "obidebug.h"
......
......@@ -15,7 +15,7 @@
#include <stdint.h>
#include <stdbool.h>
#include "obiarray.h"
#include "obitypes.h"
#define NUC_MASK_2B 0x3 /**< Binary: 11 to use when decoding 2 bits sequences */
......
This diff is collapsed.
This diff is collapsed.
/****************************************************************************
* OBIDMS array header file *
* OBIDMS AVL tree header file *
****************************************************************************/
/**
* @file obiarray.h
* @file obiavl.h
* @author Celine Mercier
* @date October 19th 2015
* @brief Header file for handling arrays for storing and retrieving byte arrays (i.e. coding for character strings).
* @date December 3rd 2015
* @brief Header file for handling AVL trees for storing and retrieving byte arrays (i.e. coding for character strings).
*/
#ifndef OBIARRAY_H_
#define OBIARRAY_H_
#ifndef OBIAVL_H_
#define OBIAVL_H_
#include <stdlib.h>
......@@ -20,28 +20,45 @@
#include <time.h>
#include <sys/types.h>
#include <dirent.h>
#include <stdbool.h>
#include "obidms.h"
#include "obitypes.h"
#define ARRAY_MAX_NAME (1024) /**< The maximum length of an array name.
*/
#define ARRAY_GROWTH_FACTOR (2) /**< The growth factor when an array is enlarged.
*/
#define BYTE_ARRAY_HEADER_SIZE (9) /**< The size of the header of a byte array.
*/
#define AVL_MAX_NAME (1024) /**< The maximum length of an AVL tree name.
*/
#define AVL_GROWTH_FACTOR (2) /**< The growth factor when an AVL tree is enlarged.
*/
#define AVL_MAX_DEPTH (50) /**< The maximum depth of an AVL tree.
*/
#define LEFT_CHILD(node) (((avl)->tree)+((node)->left_child)) /**< Pointer to the left child of a node in an AVL tree.
* Expects a variable named `avl` to be in scope where the macro is used;
* the child index stored in the node is resolved relative to `avl->tree`.
* The argument and the whole expansion are parenthesized so that the macro
* can be used safely inside larger expressions (e.g. `LEFT_CHILD(n)->value`).
*/
#define RIGHT_CHILD(node) (((avl)->tree)+((node)->right_child)) /**< Pointer to the right child of a node in an AVL tree.
* Expects a variable named `avl` to be in scope where the macro is used;
* the child index stored in the node is resolved relative to `avl->tree`.
* The argument and the whole expansion are parenthesized so that the macro
* can be used safely inside larger expressions (e.g. `RIGHT_CHILD(n)->value`).
*/
#define BYTE_ARRAY_HEADER_SIZE (9) /**< The size of the header of a byte array.
*/
typedef char byte_t; /**< Defining byte type since data is stored in bits
* and char (stored on one byte) is the smallest addressable unit.
*/
/**
* @brief AVL tree node structure.
*
* Children are referenced by index rather than by pointer: the LEFT_CHILD and
* RIGHT_CHILD macros resolve left_child and right_child relative to `avl->tree`.
* NOTE(review): the field layout is presumably part of the on-disk format of the
* AVL tree file, so fields should not be reordered or resized — confirm.
*/
typedef struct AVL_node {
index_t left_child; /**< Index of the left child node (the "less" side).
*/
index_t right_child; /**< Index of the right child node (the "greater" side).
*/
int8_t balance_factor; /**< Balance factor of the node.
*/
index_t value; /**< Index of the value associated with the node in the data array.
*/
} AVL_node_t, *AVL_node_p;
/**
* @brief OBIDMS array data header structure.
* @brief OBIDMS AVL tree data header structure.
*/
typedef struct OBIDMS_array_data_header {
typedef struct OBIDMS_avl_data_header {
int header_size; /**< Size of the header in bytes.
*/
index_t data_size_used; /**< Size of the data used in bytes.
......@@ -50,213 +67,218 @@ typedef struct OBIDMS_array_data_header {
*/
index_t nb_items; /**< Number of items.
*/
char array_name[ARRAY_MAX_NAME+1]; /**< The array name as a NULL terminated string.
char avl_name[AVL_MAX_NAME+1]; /**< The AVL tree name as a NULL terminated string.
*/
time_t creation_date; /**< Date of creation of the file.
*/
} OBIDMS_array_data_header_t, *OBIDMS_array_data_header_p;
} OBIDMS_avl_data_header_t, *OBIDMS_avl_data_header_p;
/**
* @brief OBIDMS array data structure.
* @brief OBIDMS AVL tree data structure.
*/
typedef struct OBIDMS_array_data {
OBIDMS_array_data_header_p header; /**< A pointer to the header of the array data.
typedef struct OBIDMS_avl_data {
OBIDMS_avl_data_header_p header; /**< A pointer to the header of the AVL tree data.
*/
byte_t* data; /**< A pointer to the beginning of the data.
*/
} OBIDMS_array_data_t, *OBIDMS_array_data_p;
} OBIDMS_avl_data_t, *OBIDMS_avl_data_p;
/**
* @brief OBIDMS array header structure.
* @brief OBIDMS AVL tree header structure.
*/
typedef struct OBIDMS_array_header {
typedef struct OBIDMS_avl_header {
int header_size; /**< Size of the header in bytes.
*/
size_t array_size; /**< Size of the array in bytes.
size_t avl_size; /**< Size of the AVL tree in bytes.
*/
index_t nb_items; /**< Number of items in the AVL tree.
*/
index_t nb_items; /**< Number of items in the array.
index_t nb_items_max; /**< Maximum number of items in the AVL tree before it has to be enlarged.
*/
index_t nb_items_max; /**< Maximum number of items in the array before it has to be enlarged.
index_t root_idx; /**< Index of the root of the AVL tree.
*/
char array_name[ARRAY_MAX_NAME+1]; /**< The array name as a NULL terminated string.
char avl_name[AVL_MAX_NAME+1]; /**< The AVL tree name as a NULL terminated string.
*/
time_t creation_date; /**< Date of creation of the file.
*/
} OBIDMS_array_header_t, *OBIDMS_array_header_p;
} OBIDMS_avl_header_t, *OBIDMS_avl_header_p;
/**
* @brief OBIDMS array structure.
* @brief OBIDMS AVL tree structure.
*/
typedef struct OBIDMS_array {
OBIDMS_p dms; /**< A pointer to the OBIDMS structure to which the array belongs.
typedef struct OBIDMS_avl {
OBIDMS_p dms; /**< A pointer to the OBIDMS structure to which the AVL tree belongs.
*/
OBIDMS_array_header_p header; /**< A pointer to the header of the array.
OBIDMS_avl_header_p header; /**< A pointer to the header of the AVL tree.
*/
index_t* first; /**< A pointer to the beginning of the array itself.
struct AVL_node* tree; /**< A pointer to the beginning of the AVL tree node storage; nodes are addressed by index relative to it (the root is presumably the node at index header->root_idx).
*/
OBIDMS_array_data_p data; /**< A pointer to the structure containing the data
* that the array references.
index_t path_idx[AVL_MAX_DEPTH]; /**< The path taken to a node from the root as an array of node indices.
*/
int8_t path_dir[AVL_MAX_DEPTH]; /**< The path taken to a node from the root as an array of directions
* (0 for left, -1 for right).
*/
OBIDMS_avl_data_p data; /**< A pointer to the structure containing the data
* that the AVL tree references.
*/
DIR* directory; /**< A directory entry usable to
* refer and scan the array directory.
* refer and scan the AVL tree directory.
*/
int dir_fd; /**< The file descriptor of the directory entry
* usable to refer and scan the array directory.
* usable to refer and scan the AVL tree directory.
*/
size_t counter; /**< Indicates by how many threads/programs (TODO) the array is used.
size_t counter; /**< Indicates by how many threads/programs (TODO) the AVL tree is used.
*/
} OBIDMS_array_t, *OBIDMS_array_p;
} OBIDMS_avl_t, *OBIDMS_avl_p;
/**
* @brief Checks if an obiarray already exists or not.
* @brief Checks if an AVL tree already exists or not.
*
* @param dms The OBIDMS to which the obiarray belongs.
* @param array_name The name of the obiarray.
* @param dms The OBIDMS to which the AVL tree belongs.
* @param avl_name The name of the AVL tree.
*
* @returns A value indicating whether the obiarray exists or not.
* @retval 1 if the obiarray exists.
* @retval 0 if the obiarray does not exist.
* @returns A value indicating whether the AVL tree exists or not.
* @retval 1 if the AVL tree exists.
* @retval 0 if the AVL tree does not exist.
* @retval -1 if an error occurred.
*
* @since October 2015
* @since December 2015
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
int obi_array_exists(OBIDMS_p dms, const char* array_name);
int obi_avl_exists(OBIDMS_p dms, const char* avl_name);
/**
* @brief Opens an obiarray and creates it if it does not already exist.
* @brief Opens an AVL tree and creates it if it does not already exist.
*
* Note: An obiarray is made of two files (referred to by two structures).
* Note: An AVL tree is made of two files (referred to by two structures).
* One file contains the indices referring to the data, and the other
* file contains the data itself. The obiarray as a whole is referred
* to via the OBIDMS_array structure.
* file contains the data itself. The AVL tree as a whole is referred
* to via the OBIDMS_avl structure.
*
* @param dms The OBIDMS to which the obiarray belongs.
* @param array_name The name of the obiarray.
* @param dms The OBIDMS to which the AVL tree belongs.
* @param avl_name The name of the AVL tree.
*
* @returns A pointer to the obiarray structure.
* @returns A pointer to the AVL tree structure.
* @retval NULL if an error occurred.
*
* @since October 2015
* @since December 2015
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
OBIDMS_array_p obi_array(OBIDMS_p dms, const char* array_name);
OBIDMS_avl_p obi_avl(OBIDMS_p dms, const char* avl_name);
/**
* @brief Creates an obiarray. Fails if it already exists.
* @brief Creates an AVL tree. Fails if it already exists.
*
* Note: An obiarray is made of two files (referred to by two structures).
* Note: An AVL tree is made of two files (referred to by two structures).
* One file contains the indices referring to the data, and the other
* file contains the data itself. The obiarray as a whole is referred
* to via the OBIDMS_array structure.
* file contains the data itself. The AVL tree as a whole is referred
* to via the OBIDMS_avl structure.
*
* @param dms The OBIDMS to which the obiarray belongs.
* @param array_name The name of the obiarray.
* @param dms The OBIDMS to which the AVL tree belongs.
* @param avl_name The name of the AVL tree.
*
* @returns A pointer to the newly created obiarray structure.
* @returns A pointer to the newly created AVL tree structure.
* @retval NULL if an error occurred.
*
* @since October 2015
* @since December 2015
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
OBIDMS_array_p obi_create_array(OBIDMS_p dms, const char* array_name);
OBIDMS_avl_p obi_create_avl(OBIDMS_p dms, const char* avl_name);
/**
* @brief Opens an obiarray. Fails if it does not already exist.
* @brief Opens an AVL tree. Fails if it does not already exist.
*
* Note: An obiarray is made of two files (referred to by two structures).
* Note: An AVL tree is made of two files (referred to by two structures).
* One file contains the indices referring to the data, and the other
* file contains the data itself. The obiarray as a whole is referred
* to via the OBIDMS_array structure.
* file contains the data itself. The AVL tree as a whole is referred
* to via the OBIDMS_avl structure.
*
* @param dms The OBIDMS to which the obiarray belongs.
* @param array_name The name of the obiarray.
* @param dms The OBIDMS to which the AVL tree belongs.
* @param avl_name The name of the AVL tree.
*
* @returns A pointer to the obiarray structure.
* @returns A pointer to the AVL tree structure.
* @retval NULL if an error occurred.
*
* @since October 2015
* @since December 2015
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
OBIDMS_array_p obi_open_array(OBIDMS_p dms, const char* array_name);
OBIDMS_avl_p obi_open_avl(OBIDMS_p dms, const char* avl_name);
/**
* @brief Closes an obiarray.
* @brief Closes an AVL tree.
*
* Note: An obiarray is made of two files (referred to by two structures).
* Note: An AVL tree is made of two files (referred to by two structures).
* One file contains the indices referring to the data, and the other
* file contains the data itself. The obiarray as a whole is referred
* to via the OBIDMS_array structure.
* file contains the data itself. The AVL tree as a whole is referred
* to via the OBIDMS_avl structure.
*
* @param array A pointer to the obiarray structure to close and free.
* @param avl A pointer to the AVL tree structure to close and free.
*
* @retval 0 if the operation was successfully completed.
* @retval -1 if an error occurred.
*
* @since October 2015
* @since December 2015
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
int obi_close_array(OBIDMS_array_p array);
int obi_close_avl(OBIDMS_avl_p avl);
/**
* @brief Adds a value (byte array) in an obiarray, checking first if it is already in it.
* @brief Adds a value (byte array) in an AVL tree, checking if it is already in it.
*
* @warning The byte array to add must already be encoded and contain its header.
*
* @param array A pointer to the obiarray.
* @param value The byte array to add in the obiarray.
* @param avl A pointer to the AVL tree.
* @param value The byte array to add in the AVL tree.
*
* @returns The index of the value, whether it was added or already in the obiarray.
* @returns The index of the value, whether it was added or already in the AVL tree.
* @retval -1 if an error occurred.
*
* @since October 2015
* @since December 2015
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
index_t obi_array_add(OBIDMS_array_p array, byte_t* value);
index_t obi_avl_add(OBIDMS_avl_p avl, byte_t* value);
/**
* @brief Recovers a value (byte array) in an obiarray.
* @brief Finds a value (byte array) in an AVL tree.
*
* @warning The byte array recovered is encoded and contains its header.
* @warning The byte array to search for must already be encoded and contain its header.
*
* @param array A pointer to the obiarray.
* @param index The index of the value in the data array.
* @param avl A pointer to the AVL tree.
* @param value The byte array to look for in the AVL tree.
*
* @returns A pointer to the byte array recovered.
* @returns The data index of the value.
* @retval -1 if the value is not in the tree.
*
* @since October 2015
* @since December 2015
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
byte_t* obi_array_get(OBIDMS_array_p array, index_t index);
index_t obi_avl_find(OBIDMS_avl_p avl, byte_t* value);
/**
* @brief Searches a value (byte array) in an obiarray performing a binary search.
* @brief Recovers a value (byte array) in an AVL tree.
*
* @warning The byte array to search must already be encoded and contain its header.
* @warning The byte array recovered is encoded and contains its header.
*
* @param array A pointer to the obiarray.
* @param value The byte array to add in the obiarray.
* @param avl A pointer to the AVL tree.
* @param index The index of the value in the data array.
*
* @returns If the value is found, its data index is returned.
* If the value is not found, the array index indicating where the value's data index
* should be in the array is returned in the form (- (index + 1)), as data indices in an
* obiarray are sorted according to the ascending order of the values (byte arrays) themselves.
* @returns A pointer to the byte array recovered.
*
* @since October 2015
* @since December 2015
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
index_t obi_array_search(OBIDMS_array_p array, byte_t* value);
byte_t* obi_avl_get(OBIDMS_avl_p avl, index_t index);
/**
......@@ -318,5 +340,5 @@ byte_t* obi_seq_to_obibytes(char* seq);
const char* obi_obibytes_to_seq(byte_t* value_b);
#endif /* OBIARRAY_H_ */
#endif /* OBIAVL_H_ */
......@@ -247,7 +247,7 @@ OBIDMS_p obi_create_dms(const char* dms_name)
return NULL;
}
// Get file descriptor of DMS directory to create the arrays directory
// Get file descriptor of DMS directory to create the AVL trees directory
dms_dir = opendir(directory_name);
if (dms_dir == NULL)
{
......@@ -267,11 +267,11 @@ OBIDMS_p obi_create_dms(const char* dms_name)
return NULL;
}
// Create the arrays directory
if (mkdirat(dms_file_descriptor, ARRAYS_DIR_NAME, 00777) < 0)
// Create the AVL trees directory
if (mkdirat(dms_file_descriptor, AVL_TREES_DIR_NAME, 00777) < 0)
{
obi_set_errno(OBI_ARRAY_ERROR);
obidebug(1, "\nProblem creating an arrays directory");
obi_set_errno(OBI_AVL_ERROR);
obidebug(1, "\nProblem creating an AVL trees directory");
return NULL;
}
......@@ -390,24 +390,24 @@ OBIDMS_p obi_open_dms(const char* dms_name)
dms->little_endian = little_endian_dms;
// Open the arrays directory
dms->array_directory = private_opendirat(dms->dir_fd, ARRAYS_DIR_NAME);
if (dms->array_directory == NULL)
// Open the AVL trees directory
dms->avl_directory = private_opendirat(dms->dir_fd, AVL_TREES_DIR_NAME);
if (dms->avl_directory == NULL)
{
obi_set_errno(OBIDMS_UNKNOWN_ERROR);
obidebug(1, "\nError opening the arrays directory");
obidebug(1, "\nError opening the AVL trees directory");
closedir(dms->directory);
free(dms);
return NULL;
}
// Store the array directory's file descriptor
dms->array_dir_fd = dirfd(dms->array_directory);
if (dms->array_dir_fd < 0)
// Store the AVL trees directory's file descriptor
dms->avl_dir_fd = dirfd(dms->avl_directory);
if (dms->avl_dir_fd < 0)
{
obi_set_errno(OBIDMS_UNKNOWN_ERROR);
obidebug(1, "\nError getting the file descriptor of the arrays directory");
closedir(dms->array_directory);
obidebug(1, "\nError getting the file descriptor of the AVL trees directory");
closedir(dms->avl_directory);
closedir(dms->directory);
free(dms);
return NULL;
......@@ -418,10 +418,10 @@ OBIDMS_p obi_open_dms(const char* dms_name)
(dms->opened_columns)->columns = (OBIDMS_column_p*) malloc(MAX_NB_OPENED_COLUMNS*sizeof(OBIDMS_column_p));
(dms->opened_columns)->nb_opened_columns = 0;
// Initialize the list of opened arrays
dms->opened_arrays = (Opened_arrays_list_p) malloc(sizeof(Opened_arrays_list_t));
(dms->opened_arrays)->arrays = (OBIDMS_array_p*)<