Commit 21d1b2ed authored by Celine Mercier's avatar Celine Mercier

First implementation of taxonomy reading

parent a08def47
......@@ -20,3 +20,5 @@
../../../src/encode.c
../../../src/obidmscolumn_idx.h
../../../src/obidmscolumn_idx.c
../../../src/obidms_taxonomy.c
../../../src/obidms_taxonomy.h
......@@ -4,7 +4,7 @@ from .capi.obidms cimport OBIDMS_p
from .capi.obidmscolumn cimport OBIDMS_column_p
from .capi.obiview cimport Obiview_p
from .capi.obitypes cimport obiversion_t, OBIType_t, index_t
from ._obitaxo cimport OBI_Taxonomy
cdef class OBIDMS_column:
......@@ -86,6 +86,7 @@ cdef class OBIDMS:
cdef str dms_name
cpdef close(self)
cpdef OBI_Taxonomy open_taxonomy(self, str taxo_name)
cpdef OBIView open_view(self, str view_name)
cpdef OBIView new_view(self, str view_name, object view_to_clone=*, list line_selection=*, str view_type=*, str comments=*)
cpdef dict read_view_infos(self, str view_name)
......
......@@ -26,6 +26,8 @@ from ._obidms cimport OBIDMS, \
OBIView, \
OBIView_line
from ._obitaxo cimport OBI_Taxonomy
from ._obiseq cimport OBI_Nuc_Seq, OBI_Nuc_Seq_Stored
from ._obidmscolumn_int cimport OBIDMS_column_int, \
......@@ -243,7 +245,8 @@ cdef class OBIView :
cdef OBIDMS_column_p* column_pp
cdef OBIDMS_column_header_p header
cdef index_t* line_selection_p
cdef object col_capsule
self.dms = dms
if line_selection is not None :
......@@ -649,6 +652,10 @@ cdef class OBIDMS :
if (obi_close_dms(self.pointer)) < 0 :
raise Exception("Problem closing an OBIDMS")
cpdef OBI_Taxonomy open_taxonomy(self, str taxo_name) :
return OBI_Taxonomy(self, taxo_name)
cpdef OBIView open_view(self, str view_name) :
......
../../../src/obidms.h
../../../src/obidms.c
../../../src/obiview.h
../../../src/obiview.c
../../../src/obidmscolumn.h
../../../src/obidmscolumn.c
../../../src/obidmscolumndir.h
../../../src/obidmscolumndir.c
../../../src/obierrno.h
../../../src/obierrno.c
../../../src/obilittlebigman.h
../../../src/obilittlebigman.c
../../../src/obitypes.h
../../../src/obitypes.c
../../../src/private_at_functions.h
../../../src/private_at_functions.c
../../../src/obiavl.h
../../../src/obiavl.c
../../../src/encode.h
../../../src/encode.c
../../../src/obidmscolumn_idx.h
../../../src/obidmscolumn_idx.c
../../../src/obidms_taxonomy.c
../../../src/obidms_taxonomy.h
#cython: language_level=3
from .capi.obitaxonomy cimport ecotx_t, OBIDMS_taxonomy_p
from libc.stdint cimport int32_t
cdef class OBI_Taxonomy :
cdef str name
cdef OBIDMS_taxonomy_p pointer
cpdef close(self)
cdef class OBI_Taxon :
cdef ecotx_t* pointer
cdef int32_t taxid
cdef int32_t rank
cdef int32_t farest
cdef ecotx_t* parent
cdef str name
cpdef int32_t taxid(self)
cpdef int32_t rank(self)
cpdef int32_t farest(self)
cpdef OBI_Taxon parent(self)
#cython: language_level=3
from obitools3.utils cimport bytes2str, str2bytes
from .capi.obitaxonomy cimport obi_read_taxonomy, \
obi_close_taxonomy, \
obi_taxo_get_taxon_with_taxid
from ._obidms cimport OBIDMS
from cpython.pycapsule cimport PyCapsule_New, PyCapsule_GetPointer
cdef class OBI_Taxonomy :
def __init__(self, OBIDMS dms, str name) :
self.name = name
self.pointer = obi_read_taxonomy(dms.pointer, str2bytes(name), True) # TODO discuss
def __getitem__(self, object ref):
cdef ecotx_t* taxon_p
cdef object taxon_capsule
if type(ref) == int :
taxon_p = obi_taxo_get_taxon_with_taxid(self.pointer, ref)
taxon_capsule = PyCapsule_New(taxon_p, NULL, NULL)
return OBI_Taxon(taxon_capsule)
cpdef close(self) :
if (obi_close_taxonomy(self.pointer) < 0) :
raise Exception("Error closing the taxonomy")
cdef class OBI_Taxon : # dict subclass?
def __init__(self, object taxon_capsule) :
cdef ecotx_t* taxon
taxon = <ecotx_t*> PyCapsule_GetPointer(taxon_capsule, NULL)
self.pointer = taxon
self.taxid = taxon.taxid
self.rank = taxon.rank
self.farest = taxon.farest
self.parent = taxon.parent
self.name = bytes2str(taxon.name)
cpdef int32_t taxid(self):
return self.taxid
cpdef int32_t rank(self):
return self.rank
cpdef int32_t farest(self):
return self.farest
cpdef OBI_Taxon parent(self):
cdef object parent_capsule
parent_capsule = PyCapsule_New(self.parent, NULL, NULL)
return OBI_Taxon(parent_capsule)
#cython: language_level=3
from .obitypes cimport const_char_p
from .obidms cimport OBIDMS_p
from libc.stdint cimport int32_t
cdef extern from "obidms_taxonomy.h" nogil:
struct OBIDMS_taxonomy_t
ctypedef OBIDMS_taxonomy_t* OBIDMS_taxonomy_p
struct ecotxnode :
int32_t taxid
int32_t rank
int32_t farest
ecotxnode* parent
char* name
ctypedef ecotxnode ecotx_t
OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const_char_p taxonomy_name, bint read_alternative_names)
int obi_close_taxonomy(OBIDMS_taxonomy_p taxonomy)
ecotx_t* obi_taxo_get_parent_at_rank(ecotx_t* taxon, int32_t rankidx)
ecotx_t* obi_taxo_get_taxon_with_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid)
bint obi_taxo_is_taxon_under_taxid(ecotx_t* taxon, int32_t other_taxid)
ecotx_t* obi_taxo_get_species(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy)
ecotx_t* obi_taxo_get_genus(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy)
ecotx_t* obi_taxo_get_family(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy)
ecotx_t* obi_taxo_get_kingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy)
ecotx_t* obi_taxo_get_superkingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy)
......@@ -29,6 +29,8 @@
*/
#define AVL_TREES_DIR_NAME "AVL_trees" /**< The name of the AVL trees directory.
*/
#define TAXONOMY_DIR_NAME "TAXONOMY" /**< The name of the taxonomy directory.
*/
#define MAX_NB_OPENED_COLUMNS (100) /**< The maximum number of columns open at the same time.
*/
#define MAX_NB_OPENED_AVL_TREES (100) /**< The maximum number of AVL trees open at the same time.
......
/********************************************************************
* OBIDMS taxonomy functions *
********************************************************************/
/**
* @file obidms_taxonomy.c
* @author Celine Mercier (celine.mercier@metabarcoding.org)
* @date March 2nd 2016
* @brief Functions for reading binary taxonomy files.
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <stdbool.h>
#include <fcntl.h>
#include "obidms_taxonomy.h"
#include "obidms.h"
#include "obidebug.h"
#include "obierrno.h"
#include "private_at_functions.h"
#define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?)
// TODO : the malloc aren't checked but won't exist for long because mapping instead
int compareRankLabel(const void *label1, const void *label2)
{
return strcmp((const char*)label1,*(const char**)label2);
}
int32_t rank_index(const char* label, ecorankidx_t* ranks)
{
char **rep;
rep = bsearch(label, ranks->label, ranks->count, sizeof(char*), compareRankLabel);
if (rep)
return rep-ranks->label; // TODO what???
return -1;
}
void* read_ecorecord(FILE* f, int32_t* record_size)
{
static void* buffer = NULL;
int32_t buffer_size = 0;
int32_t read;
if (!record_size)
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nError reading a taxonomy file: record_size can not be NULL");
return NULL;
}
read = fread(record_size,
1,
sizeof(int32_t),
f);
if (feof(f))
return NULL;
if (read != sizeof(int32_t))
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nError reading a taxonomy file: error reading record size");
return NULL;
}
// if (is_big_endian()) // TODO
// *recordSize=swap_int32_t(*recordSize);
if (buffer_size < *record_size)
{
if (buffer)
buffer = realloc(buffer, *record_size);
else
buffer = malloc(*record_size);
if (buffer == NULL)
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nError reading a taxonomy file: error allocating memory");
return NULL;
}
}
read = fread(buffer,
1,
*record_size,
f);
if (read != *record_size)
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nError reading a taxonomy file: error reading a record %d, %d", read, *record_size);
return NULL;
}
return buffer;
};
ecotx_t* readnext_ecotaxon(FILE* f, ecotx_t* taxon)
{
ecotxformat_t* raw;
int32_t record_length;
raw = read_ecorecord(f, &record_length);
if (!raw)
return NULL;
// if (is_big_endian()) // TODO
// {
// raw->namelength = swap_int32_t(raw->namelength);
// raw->parent = swap_int32_t(raw->parent);
// raw->rank = swap_int32_t(raw->rank);
// raw->taxid = swap_int32_t(raw->taxid);
// }
taxon->parent = (ecotx_t*) ((size_t) raw->parent);
taxon->taxid = raw->taxid;
taxon->rank = raw->rank;
taxon->farest = -1;
taxon->name = malloc((raw->name_length+1) * sizeof(char));
strncpy(taxon->name, raw->name, raw->name_length);
return taxon;
}
FILE* open_ecorecorddb(const char* file_name,
int32_t* count,
int32_t abort_on_open_error)
{
FILE* f;
int32_t read;
fprintf(stderr, "\n%s\n", file_name);
f = fopen(file_name, "rb");
if (!f)
{
if (abort_on_open_error)
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nCouldn't open a taxonomy file");
return NULL;
}
else
{
*count = 0;
return NULL;
}
}
read = fread(count,
1,
sizeof(int32_t),
f);
if (read != sizeof(int32_t))
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nError reading taxonomy record size");
return NULL;
}
// if (!obi_is_little_endian()) // TODO
// *count = swap_int32_t(*count);
return f;
}
ecorankidx_t* read_rankidx(const char* ranks_file_name)
{
int32_t count;
FILE* ranks_file;
ecorankidx_t* ranks_index;
int32_t i;
int32_t rank_length;
char* buffer;
ranks_file = open_ecorecorddb(ranks_file_name, &count, 0);
if (ranks_file==NULL)
return NULL;
ranks_index = (ecorankidx_t*) malloc(sizeof(ecorankidx_t) + sizeof(char*) * (count-1));
ranks_index->count = count;
for (i=0; i < count; i++)
{
buffer = read_ecorecord(ranks_file, &rank_length);
ranks_index->label[i] = (char*) malloc(rank_length+1);
strncpy(ranks_index->label[i], buffer, rank_length);
}
return ranks_index;
}
ecotxidx_t* read_taxonomyidx(const char* taxa_file_name, const char* local_taxa_file_name)
{
int32_t count_taxa;
int32_t count_local_taxa;
FILE* f_taxa;
FILE* f_local_taxa;
ecotxidx_t* taxa_index;
struct ecotxnode* t;
int32_t i;
int32_t j;
f_taxa = open_ecorecorddb(taxa_file_name, &count_taxa,0);
if (f_taxa == NULL)
{
obidebug(1, "\nError reading taxonomy taxa file");
return NULL;
}
f_local_taxa = open_ecorecorddb(local_taxa_file_name, &count_local_taxa, 0);
taxa_index = (ecotxidx_t*) malloc(sizeof(ecotxidx_t) + sizeof(ecotx_t) * (count_taxa + count_local_taxa - 1));
taxa_index->count = count_taxa + count_local_taxa;
taxa_index->buffer_size = taxa_index->count;
taxa_index->max_taxid = 0;
printf("Reading %d taxa...\n", count_taxa);
for (i=0; i<count_taxa; i++)
{
readnext_ecotaxon(f_taxa, &(taxa_index->taxon[i]));
taxa_index->taxon[i].parent = taxa_index->taxon + (size_t) taxa_index->taxon[i].parent;
taxa_index->taxon[i].parent->farest = 0;
if (taxa_index->taxon[i].taxid > taxa_index->max_taxid)
taxa_index->max_taxid = taxa_index->taxon[i].taxid;
}
if (count_local_taxa > 0)
printf("Reading %d local taxa...\n", count_local_taxa);
else
printf("No local taxa\n");
count_taxa = taxa_index->count;
for (; i < count_taxa; i++){
readnext_ecotaxon(f_local_taxa, &(taxa_index->taxon[i]));
taxa_index->taxon[i].parent = taxa_index->taxon + (size_t) taxa_index->taxon[i].parent;
taxa_index->taxon[i].parent->farest=0;
if (taxa_index->taxon[i].taxid > taxa_index->max_taxid)
taxa_index->max_taxid = taxa_index->taxon[i].taxid;
}
printf("Computing longest branches...\n");
for (i=0; i < count_taxa; i++)
{
t = taxa_index->taxon+i;
if (t->farest == -1)
{
t->farest=0;
while (t->parent != t)
{
j = t->farest + 1;
if (j > t->parent->farest)
{
t->parent->farest = j;
t=t->parent;
}
else
t = taxa_index->taxon;
}
}
}
return taxa_index;
}
econame_t* readnext_econame(FILE* f, econame_t* name, OBIDMS_taxonomy_p taxonomy)
{
econameformat_t* raw;
int32_t record_length;
raw = read_ecorecord(f, &record_length);
if (!raw)
return NULL;
// if (is_big_endian()) // TODO
// {
// raw->is_scientificname = swap_int32_t(raw->is_scientificname);
// raw->namelength = swap_int32_t(raw->namelength);
// raw->classlength = swap_int32_t(raw->classlength);
// raw->taxid = swap_int32_t(raw->taxid);
// }
name->is_scientific_name = raw->is_scientific_name;
name->name = malloc((raw->name_length + 1) * sizeof(char));
strncpy(name->name, raw->names, raw->name_length);
name->name[raw->name_length] = 0;
name->class_name = malloc((raw->class_length+1) * sizeof(char));
strncpy(name->class_name,(raw->names + raw->name_length), raw->class_length);
name->class_name[raw->class_length] = 0;
name->taxon = taxonomy->taxa->taxon + raw->taxid;
return name;
}
econameidx_t* read_nameidx(const char *file_name, OBIDMS_taxonomy_p taxonomy)
{
int32_t count;
FILE* f;
econameidx_t* index_names;
int32_t i;
f = open_ecorecorddb(file_name, &count, 0);
if (f == NULL)
return NULL;
index_names = (econameidx_t*) malloc(sizeof(econameidx_t) + sizeof(econame_t) * (count-1));
index_names->count = count;
for (i=0; i < count; i++)
readnext_econame(f, (index_names->names)+i, taxonomy);
return index_names;
}
static int bcomptaxon (const void* ptaxid, const void* ptaxon)
{
ecotx_t* current_taxon = (ecotx_t*) ptaxon;
int32_t taxid = (int32_t) ((size_t) ptaxid);
return taxid - current_taxon->taxid;
}
/////// PUBLIC /////////
OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, bool read_alternative_names)
{
OBIDMS_taxonomy_p tax;
char* main_taxonomy_dir_path;
char* taxonomy_path;
char* ranks_file_name;
char* taxa_file_name;
char* local_taxa_file_name;
char* alter_names_file_name;
int buffer_size;
tax = (OBIDMS_taxonomy_p) malloc(sizeof(OBIDMS_taxonomy_t));
tax->ranks = NULL;
tax->taxa = NULL;
tax->names = NULL;
buffer_size = 2048; // TODO
main_taxonomy_dir_path = get_full_path(dms->dir_fd, TAXONOMY_DIR_NAME);
if (asprintf(&taxonomy_path, "%s/%s/%s", main_taxonomy_dir_path, taxonomy_name, taxonomy_name) < 0)
{
free(main_taxonomy_dir_path);
obi_close_taxonomy(tax);
return NULL;
}
free(main_taxonomy_dir_path);
// Read ranks
ranks_file_name = (char*) malloc(