Commit b63d0fb9 authored by Celine Mercier

Added C functions to write .rdx, .tdx, .ndx binary taxonomy files from a taxonomy C structure
parent 0dfd67ec
#cython: language_level=3
from .capi.obitaxonomy cimport ecotx_t, OBIDMS_taxonomy_p
from ._obidms cimport OBIDMS
# Declaration of the OBI_Taxonomy extension type: C-level attributes and cpdef methods.
cdef class OBI_Taxonomy :
# Name of the taxonomy, as given to the constructor
cdef str _name
# C handle on the taxonomy structure (set from obi_read_taxonomy() in the constructor)
cdef OBIDMS_taxonomy_p _pointer
# The DMS object given to the constructor
cdef OBIDMS _dms
# Closes the underlying C taxonomy; raises if the C call reports an error
cpdef close(self)
# Writes the taxonomy to binary files through the C write_*idx functions, using the given prefix
cpdef _write(self, str prefix)
cdef class OBI_Taxon :
......
......@@ -4,7 +4,10 @@ from obitools3.utils cimport bytes2str, str2bytes
from .capi.obitaxonomy cimport obi_read_taxonomy, \
obi_close_taxonomy, \
obi_taxo_get_taxon_with_taxid
obi_taxo_get_taxon_with_taxid, \
write_rankidx, \
write_taxonomyidx, \
write_nameidx
from ._obidms cimport OBIDMS
......@@ -18,6 +21,7 @@ cdef class OBI_Taxonomy :
def __init__(self, OBIDMS dms, str name) :
self._dms = dms
self._name = name
self._pointer = obi_read_taxonomy(dms._pointer, str2bytes(name), True) # TODO discuss
# TODO if not found in DMS, try to import?
......@@ -39,7 +43,16 @@ cdef class OBI_Taxonomy :
cpdef close(self) :
    # Hand the C taxonomy handle back to the C layer; a negative status is reported as an exception.
    status = obi_close_taxonomy(self._pointer)
    if status < 0 :
        raise Exception("Error closing the taxonomy")
cpdef _write(self, str prefix) :
    # Writes the taxonomy held by this object to the binary rank (.rdx), taxa (.tdx)
    # and names (.ndx) files, in that order, through the C write_*idx functions.
    # 'prefix' is passed to the C functions as the taxonomy name.
    # Raises an Exception naming the file whose writing failed.

    # Encode once and keep a reference so the char* handed to C stays valid for all three calls
    cdef bytes prefix_b = str2bytes(prefix)

    if (write_rankidx(self._dms._pointer, self._pointer, prefix_b) < 0) :
        raise Exception("Error writing the taxonomy rank file")
    if (write_taxonomyidx(self._dms._pointer, self._pointer, prefix_b) < 0) :
        raise Exception("Error writing the taxonomy taxa file")
    if (write_nameidx(self._dms._pointer, self._pointer, prefix_b) < 0) :
        raise Exception("Error writing the taxonomy names file")
cdef class OBI_Taxon : # TODO dict subclass?
......@@ -82,6 +95,6 @@ cdef class OBI_Taxon : # TODO dict subclass?
d['parent'] = self.parent.taxid
d['farest'] = self.farest
return str(d)
......@@ -40,3 +40,7 @@ cdef extern from "obidms_taxonomy.h" nogil:
ecotx_t* obi_taxo_get_superkingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy)
int write_rankidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const_char_p taxonomy_name)
int write_taxonomyidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const_char_p taxonomy_name)
int write_nameidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const_char_p taxonomy_name)
......@@ -497,6 +497,33 @@ OBIDMS_p obi_open_dms(const char* dms_path)
return NULL;
}
// Open the taxonomy directory
dms->tax_directory = opendir_in_dms(dms, TAXONOMY_DIR_NAME);
if (dms->tax_directory == NULL)
{
obi_set_errno(OBIDMS_UNKNOWN_ERROR);
obidebug(1, "\nError opening the taxonomy directory");
closedir(dms->indexer_directory);
closedir(dms->view_directory);
closedir(dms->directory);
free(dms);
return NULL;
}
// Store the taxonomy directory's file descriptor
dms->tax_dir_fd = dirfd(dms->tax_directory);
if (dms->tax_dir_fd < 0)
{
obi_set_errno(OBIDMS_UNKNOWN_ERROR);
obidebug(1, "\nError getting the file descriptor of the taxonomy directory");
closedir(dms->indexer_directory);
closedir(dms->tax_directory);
closedir(dms->view_directory);
closedir(dms->directory);
free(dms);
return NULL;
}
// Initialize the list of opened columns
dms->opened_columns = (Opened_columns_list_p) malloc(sizeof(Opened_columns_list_t));
(dms->opened_columns)->nb_opened_columns = 0;
......@@ -536,7 +563,7 @@ int obi_close_dms(OBIDMS_p dms)
while ((dms->opened_columns)->nb_opened_columns > 0)
obi_close_column(*((dms->opened_columns)->columns));
// Close dms, and view and indexer directories
// Close dms, and view, indexer and taxonomy directories
if (closedir(dms->indexer_directory) < 0)
{
obi_set_errno(OBI_INDEXER_ERROR);
......@@ -551,6 +578,13 @@ int obi_close_dms(OBIDMS_p dms)
free(dms);
return -1;
}
if (closedir(dms->tax_directory) < 0)
{
obi_set_errno(OBIVIEW_ERROR);
obidebug(1, "\nError closing a taxonomy directory");
free(dms);
return -1;
}
if (closedir(dms->directory) < 0)
{
obi_set_errno(OBIDMS_MEMORY_ERROR);
......
......@@ -106,6 +106,12 @@ typedef struct OBIDMS {
int view_dir_fd; /**< The file descriptor of the directory entry
* usable to refer and scan the view directory.
*/
DIR* tax_directory; /**< A directory entry usable to
* refer and scan the taxonomy directory.
*/
int tax_dir_fd; /**< The file descriptor of the directory entry
* usable to refer and scan the taxonomy directory.
*/
bool little_endian; /**< Endianness of the database.
*/
Opened_columns_list_p opened_columns; /**< List of opened columns.
......
......@@ -51,6 +51,27 @@ int compareRankLabel(const void *label1, const void *label2)
}
/**
 * Builds the path of the directory of a taxonomy inside a DMS, i.e.
 * "<full path of TAXONOMY_DIR_NAME in the DMS>/<tax_name>".
 *
 * @param dms       The DMS the taxonomy belongs to.
 * @param tax_name  The name of the taxonomy.
 *
 * @returns A newly allocated, NUL-terminated path that the caller must free(),
 *          or NULL if an error occurred.
 */
char* get_taxonomy_path(OBIDMS_p dms, const char* tax_name)
{
    char*  all_tax_dir_path;
    char*  tax_path;
    size_t tax_path_size;

    all_tax_dir_path = obi_dms_get_full_path(dms, TAXONOMY_DIR_NAME);
    if (all_tax_dir_path == NULL)   // Defensive: strlen() on NULL would be undefined behavior
    {
        obi_set_errno(OBI_TAXONOMY_ERROR);
        obidebug(1, "\nError getting the path of the taxonomy directory of a DMS");
        return NULL;
    }

    // Room for both components, the '/' separator and the terminating NUL
    tax_path_size = strlen(all_tax_dir_path) + strlen(tax_name) + 2;

    tax_path = (char*) malloc(tax_path_size*sizeof(char));
    if (tax_path == NULL)
    {
        obi_set_errno(OBI_TAXONOMY_ERROR);
        obidebug(1, "\nError allocating the memory for a taxonomy path");
        free(all_tax_dir_path);
        return NULL;
    }

    if (snprintf(tax_path, tax_path_size, "%s/%s", all_tax_dir_path, tax_name) < 0)
    {
        obi_set_errno(OBI_TAXONOMY_ERROR);
        obidebug(1, "\nError building taxonomy path");
        free(tax_path);             // Was leaked on this path before
        free(all_tax_dir_path);
        return NULL;
    }

    free(all_tax_dir_path);

    return tax_path;
}
int32_t rank_index(const char* label, ecorankidx_t* ranks)
{
char **rep;
......@@ -58,7 +79,7 @@ int32_t rank_index(const char* label, ecorankidx_t* ranks)
rep = bsearch(label, ranks->label, ranks->count, sizeof(char*), compareRankLabel);
if (rep)
return rep-ranks->label; // TODO what???
return rep-ranks->label;
return -1;
}
......@@ -93,8 +114,8 @@ void* read_ecorecord(FILE* f, int32_t* record_size)
}
// if (!(obi_is_little_endian())) // TODO
if (is_big_endian())
*record_size=swap_int32_t(*record_size);
// if (is_big_endian())
// *record_size=swap_int32_t(*record_size);
if (buffer_size < *record_size)
{
......@@ -137,13 +158,13 @@ ecotx_t* readnext_ecotaxon(FILE* f, ecotx_t* taxon)
return NULL;
// if (!(obi_is_little_endian())) // TODO
if (is_big_endian())
{
raw->name_length = swap_int32_t(raw->name_length);
raw->parent = swap_int32_t(raw->parent);
raw->rank = swap_int32_t(raw->rank);
raw->taxid = swap_int32_t(raw->taxid);
}
// if (is_big_endian())
// {
// raw->name_length = swap_int32_t(raw->name_length);
// raw->parent = swap_int32_t(raw->parent);
// raw->rank = swap_int32_t(raw->rank);
// raw->taxid = swap_int32_t(raw->taxid);
// }
taxon->parent = (ecotx_t*) ((size_t) raw->parent);
taxon->taxid = raw->taxid;
......@@ -195,8 +216,8 @@ FILE* open_ecorecorddb(const char* file_name,
}
// if (!(obi_is_little_endian())) // TODO
if (is_big_endian())
*count = swap_int32_t(*count);
// if (is_big_endian())
// *count = swap_int32_t(*count);
return f;
}
......@@ -225,6 +246,7 @@ ecorankidx_t* read_rankidx(const char* ranks_file_name)
buffer = read_ecorecord(ranks_file, &rank_length);
ranks_index->label[i] = (char*) malloc(rank_length+1);
strncpy(ranks_index->label[i], buffer, rank_length);
(ranks_index->label[i])[rank_length] = 0;
}
return ranks_index;
......@@ -277,6 +299,7 @@ ecotxidx_t* read_taxonomyidx(const char* taxa_file_name, const char* local_taxa_
for (; i < count_taxa; i++){
readnext_ecotaxon(f_local_taxa, &(taxa_index->taxon[i]));
taxa_index->taxon[i].idx = i;
taxa_index->taxon[i].parent = taxa_index->taxon + (size_t) taxa_index->taxon[i].parent;
taxa_index->taxon[i].parent->farest=0;
if (taxa_index->taxon[i].taxid > taxa_index->max_taxid)
......@@ -321,13 +344,13 @@ econame_t* readnext_econame(FILE* f, econame_t* name, OBIDMS_taxonomy_p taxonomy
return NULL;
// if (!(obi_is_little_endian())) // TODO
if (is_big_endian())
{
raw->is_scientific_name = swap_int32_t(raw->is_scientific_name);
raw->name_length = swap_int32_t(raw->name_length);
raw->class_length = swap_int32_t(raw->class_length);
raw->taxid = swap_int32_t(raw->taxid);
}
// if (is_big_endian())
// {
// raw->is_scientific_name = swap_int32_t(raw->is_scientific_name);
// raw->name_length = swap_int32_t(raw->name_length);
// raw->class_length = swap_int32_t(raw->class_length);
// raw->taxid = swap_int32_t(raw->taxid);
// }
name->is_scientific_name = raw->is_scientific_name;
......@@ -382,7 +405,6 @@ static int bcomptaxon (const void* ptaxid, const void* ptaxon)
OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, bool read_alternative_names)
{
OBIDMS_taxonomy_p tax;
char* main_taxonomy_dir_path;
char* taxonomy_path;
char* ranks_file_name;
char* taxa_file_name;
......@@ -398,16 +420,7 @@ OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, boo
buffer_size = 2048; // TODO
main_taxonomy_dir_path = obi_dms_get_full_path(dms, TAXONOMY_DIR_NAME);
taxonomy_path = (char*) malloc((strlen(main_taxonomy_dir_path) + strlen(taxonomy_name) + strlen(taxonomy_name) + 3)*sizeof(char));
if (sprintf(taxonomy_path, "%s/%s/%s", main_taxonomy_dir_path, taxonomy_name, taxonomy_name) < 0)
{
free(main_taxonomy_dir_path);
obi_close_taxonomy(tax);
return NULL;
}
free(main_taxonomy_dir_path);
taxonomy_path = get_taxonomy_path(dms, taxonomy_name);
// Read ranks
ranks_file_name = (char*) malloc(buffer_size*sizeof(char));
......@@ -417,7 +430,7 @@ OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, boo
obi_close_taxonomy(tax);
return NULL;
}
if (snprintf(ranks_file_name, buffer_size, "%s.rdx", taxonomy_path) < 0)
if (snprintf(ranks_file_name, buffer_size, "%s/%s.rdx", taxonomy_path, taxonomy_name) < 0)
{
free(taxonomy_path);
free(ranks_file_name);
......@@ -441,7 +454,7 @@ OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, boo
obi_close_taxonomy(tax);
return NULL;
}
if (snprintf(taxa_file_name, buffer_size,"%s.tdx", taxonomy_path) < 0)
if (snprintf(taxa_file_name, buffer_size, "%s/%s.tdx", taxonomy_path, taxonomy_name) < 0)
{
free(taxonomy_path);
free(taxa_file_name);
......@@ -456,7 +469,7 @@ OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, boo
obi_close_taxonomy(tax);
return NULL;
}
if (snprintf(local_taxa_file_name, buffer_size,"%s.ldx", taxonomy_path) < 0)
if (snprintf(local_taxa_file_name, buffer_size, "%s/%s.ldx", taxonomy_path, taxonomy_name) < 0)
{
free(taxonomy_path);
free(taxa_file_name);
......@@ -486,7 +499,7 @@ OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, boo
obi_close_taxonomy(tax);
return NULL;
}
if (snprintf(alter_names_file_name, buffer_size,"%s.ndx", taxonomy_path) < 0)
if (snprintf(alter_names_file_name, buffer_size, "%s/%s.ndx", taxonomy_path, taxonomy_name) < 0)
{
free(taxonomy_path);
free(alter_names_file_name);
......@@ -514,7 +527,7 @@ int obi_close_taxonomy(OBIDMS_taxonomy_p taxonomy)
if (taxonomy)
{
if (taxonomy->ranks)
free(taxonomy->ranks); // TODO those don't free everything but mapping will replace anyway
free(taxonomy->ranks); // TODO those don't free everything
if (taxonomy->names)
free(taxonomy->names);
......@@ -527,7 +540,7 @@ int obi_close_taxonomy(OBIDMS_taxonomy_p taxonomy)
return 0;
}
// TODO no closing files?
// close files
return 1;
}
......@@ -699,3 +712,395 @@ ecotx_t* obi_taxo_get_superkingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy)
return obi_taxo_get_parent_at_rank(taxon, rankindex);
}
// Functions to write taxonomy structure to binary files
/**
 * Writes the ranks of a taxonomy to the binary file
 * "<taxonomy directory>/<taxonomy_name>.rdx".
 *
 * File layout: the rank count (int32_t), then for each rank the length of its
 * label (int32_t) followed by the label bytes without terminating NUL.
 * Integers are written as stored in memory (no byte swapping is done here).
 *
 * The file is created with O_EXCL, so the function fails if it already exists.
 *
 * @param dms            The DMS the taxonomy belongs to.
 * @param tax            The taxonomy structure whose ranks are written.
 * @param taxonomy_name  The name of the taxonomy, used both to locate its
 *                       directory and as the file name prefix.
 *
 * @retval 0 if the operation was successfully completed.
 * @retval -1 if an error occurred.
 */
int write_rankidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name) // TODO prefix in taxonomy struct?
{
    int     i;
    char*   file_name;
    size_t  file_name_size;
    int     file_descriptor;
    off_t   file_size;
    char*   taxonomy_path;
    int32_t length;

    // Compute file size
    file_size = sizeof(int32_t);    // To store rank count
    for (i=0; i < (tax->ranks)->count; i++)
    {
        file_size = file_size + sizeof(int32_t);                    // To store label size
        file_size = file_size + strlen(((tax->ranks)->label)[i]);   // To store label
    }

    // Build the taxonomy directory path
    taxonomy_path = get_taxonomy_path(dms, taxonomy_name);
    if (taxonomy_path == NULL)
    {
        obi_set_errno(OBI_TAXONOMY_ERROR);
        obidebug(1, "\nError building the path of a taxonomy directory");
        return -1;
    }

    // Directory + '/' + prefix + ".rdx" + terminating NUL
    file_name_size = strlen(taxonomy_path) + strlen(taxonomy_name) + 6;
    file_name = (char*) malloc(file_name_size*sizeof(char));
    if (file_name == NULL)
    {
        obi_set_errno(OBI_MALLOC_ERROR);
        obidebug(1, "\nError allocating the memory for a binary taxonomy file name");
        free(taxonomy_path);
        return -1;
    }

    // Build the file path
    if (snprintf(file_name, file_name_size, "%s/%s.rdx", taxonomy_path, taxonomy_name) < 0)
    {
        obi_set_errno(OBI_TAXONOMY_ERROR);
        obidebug(1, "\nError building a binary taxonomy file name");
        free(taxonomy_path);
        free(file_name);
        return -1;
    }

    free(taxonomy_path);

    // Create file
    file_descriptor = open(file_name, O_RDWR | O_CREAT | O_EXCL, 0777);
    if (file_descriptor < 0)
    {
        obi_set_errno(OBI_TAXONOMY_ERROR);
        obidebug(1, "\nError creating a binary taxonomy file");
        free(file_name);
        return -1;
    }

    free(file_name);

    // Truncate the file to the right size
    if (ftruncate(file_descriptor, file_size) < 0)
    {
        obi_set_errno(OBI_TAXONOMY_ERROR);
        obidebug(1, "\nError truncating a binary taxonomy file");
        close(file_descriptor);
        return -1;
    }

    // Write rank count
    if (write(file_descriptor, &((tax->ranks)->count), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
    {
        obi_set_errno(OBI_TAXONOMY_ERROR);
        obidebug(1, "\nError writing in a binary taxonomy file");
        close(file_descriptor);
        return -1;
    }

    // Write ranks
    for (i=0; i < (tax->ranks)->count; i++)
    {
        length = (int32_t) strlen(((tax->ranks)->label)[i]);

        // Write rank size
        if (write(file_descriptor, &length, sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
        {
            obi_set_errno(OBI_TAXONOMY_ERROR);
            obidebug(1, "\nError writing in a binary taxonomy file");
            close(file_descriptor);
            return -1;
        }

        // Write rank label
        if (write(file_descriptor, ((tax->ranks)->label)[i], length) < ((ssize_t) length))
        {
            obi_set_errno(OBI_TAXONOMY_ERROR);
            obidebug(1, "\nError writing in a binary taxonomy file");
            close(file_descriptor);
            return -1;
        }
    }

    // Close file
    if (close(file_descriptor) < 0)
    {
        obi_set_errno(OBIDMS_UNKNOWN_ERROR);
        obidebug(1, "\nError closing a binary taxonomy file");
        return -1;
    }

    return 0;
}
/**
 * Writes the taxa of a taxonomy to the binary file
 * "<taxonomy directory>/<taxonomy_name>.tdx".
 *
 * File layout: the record count (int32_t), then for each taxon:
 * the record size (int32_t, counting the 4 following integers and the name),
 * the taxid, the rank index, the index of the parent taxon and the name length
 * (int32_t each), followed by the name bytes without terminating NUL.
 * Integers are written as stored in memory (no byte swapping is done here).
 *
 * The file is created with O_EXCL, so the function fails if it already exists.
 *
 * @param dms            The DMS the taxonomy belongs to.
 * @param tax            The taxonomy structure whose taxa are written.
 * @param taxonomy_name  The name of the taxonomy, used both to locate its
 *                       directory and as the file name prefix.
 *
 * @retval 0 if the operation was successfully completed.
 * @retval -1 if an error occurred.
 */
int write_taxonomyidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name) // TODO prefix in taxonomy struct? keep argument but if NULL, use the one in struct?
{
    int     i;
    char*   file_name;
    size_t  file_name_size;
    int     file_descriptor;
    off_t   file_size;
    char*   taxonomy_path;
    int32_t name_length;
    int32_t record_header[5];

    // Compute file size
    file_size = sizeof(int32_t);    // To store record count
    for (i=0; i < (tax->taxa)->count; i++)
    {
        file_size = file_size + sizeof(int32_t) * 5;                // To store record size, taxid, rank index, parent index, and name length
        file_size = file_size + strlen(tax->taxa->taxon[i].name);   // To store name
    }

    // Build the taxonomy directory path
    taxonomy_path = get_taxonomy_path(dms, taxonomy_name);
    if (taxonomy_path == NULL)
    {
        obi_set_errno(OBI_TAXONOMY_ERROR);
        obidebug(1, "\nError building the path of a taxonomy directory");
        return -1;
    }

    // Directory + '/' + prefix + ".tdx" + terminating NUL
    file_name_size = strlen(taxonomy_path) + strlen(taxonomy_name) + 6;
    file_name = (char*) malloc(file_name_size*sizeof(char));
    if (file_name == NULL)
    {
        obi_set_errno(OBI_MALLOC_ERROR);
        obidebug(1, "\nError allocating the memory for a binary taxonomy file name");
        free(taxonomy_path);
        return -1;
    }

    // Build the file path
    if (snprintf(file_name, file_name_size, "%s/%s.tdx", taxonomy_path, taxonomy_name) < 0)
    {
        obi_set_errno(OBI_TAXONOMY_ERROR);
        obidebug(1, "\nError building a binary taxonomy file name");
        free(taxonomy_path);
        free(file_name);
        return -1;
    }

    free(taxonomy_path);

    // Create file
    file_descriptor = open(file_name, O_RDWR | O_CREAT | O_EXCL, 0777);
    if (file_descriptor < 0)
    {
        obi_set_errno(OBI_TAXONOMY_ERROR);
        obidebug(1, "\nError creating a binary taxonomy file");
        free(file_name);
        return -1;
    }

    free(file_name);

    // Truncate the file to the right size
    if (ftruncate(file_descriptor, file_size) < 0)
    {
        obi_set_errno(OBI_TAXONOMY_ERROR);
        obidebug(1, "\nError truncating a binary taxonomy file");
        close(file_descriptor);
        return -1;
    }

    // Write record count
    if (write(file_descriptor, &(tax->taxa->count), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
    {
        obi_set_errno(OBI_TAXONOMY_ERROR);
        obidebug(1, "\nError writing in a binary taxonomy file");
        close(file_descriptor);
        return -1;
    }

    // Write records
    for (i=0; i < tax->taxa->count; i++)
    {
        name_length = (int32_t) strlen(tax->taxa->taxon[i].name);

        // Fixed-size part of the record, in file order
        record_header[0] = (int32_t) (4*sizeof(int32_t)) + name_length;    // Record size (this field itself is not counted)
        record_header[1] = tax->taxa->taxon[i].taxid;                      // Taxid
        record_header[2] = tax->taxa->taxon[i].rank;                       // Rank index
        record_header[3] = (tax->taxa->taxon[i].parent)->idx;              // Parent index
        record_header[4] = name_length;                                    // Name length

        // Write record size, taxid, rank index, parent index and name length
        if (write(file_descriptor, record_header, sizeof(record_header)) < ((ssize_t) sizeof(record_header)))
        {
            obi_set_errno(OBI_TAXONOMY_ERROR);
            obidebug(1, "\nError writing in a binary taxonomy file");
            close(file_descriptor);
            return -1;
        }

        // Write name
        if (write(file_descriptor, tax->taxa->taxon[i].name, name_length) < ((ssize_t) name_length))
        {
            obi_set_errno(OBI_TAXONOMY_ERROR);
            obidebug(1, "\nError writing in a binary taxonomy file");
            close(file_descriptor);
            return -1;
        }
    }

    // Close file
    if (close(file_descriptor) < 0)
    {
        obi_set_errno(OBIDMS_UNKNOWN_ERROR);
        obidebug(1, "\nError closing a binary taxonomy file");
        return -1;
    }

    return 0;
}
int write_nameidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name) // TODO prefix in taxonomy struct? keep argument but if NULL, use the one in struct?
{
int i;
char* file_name;
int file_descriptor;
off_t file_size;
char* taxonomy_path;
int32_t name_length;
int32_t class_length;
int32_t record_size;
// Compute file size
file_size = sizeof(int32_t); // To store record count
for (i=0; i < (tax->names)->count; i++)
{
file_size = file_size + sizeof(int32_t) * 5; // To store record size, taxid, rank index, parent index, and name length
file_size = file_size + strlen(tax->names->names[i].name); // To store name
file_size = file_size + strlen(tax->names->names[i].class_name); // To store name
}
// Build the taxonomy directory path
taxonomy_path = get_taxonomy_path(dms, taxonomy_name);
file_name = (char*) malloc((strlen(taxonomy_path) + strlen(taxonomy_name) + 5)*sizeof(char));
if (file_name == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR);
obidebug(1, "\nError allocating the memory for a binary taxonomy file name");
return -1;
}
// Build the file path
if (sprintf(file_name, "%s/%s.ndx", taxonomy_path, taxonomy_name) < 0)
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nError building a binary taxonomy file name");
return -1;
}
free(taxonomy_path);