Commit eb586b2f by Celine Mercier

New command and C functions: obi ecotag

parent 9556130b
#cython: language_level=3
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
from obitools3.dms.dms cimport DMS
from obitools3.dms.view import RollbackException
from obitools3.dms.capi.obiecotag cimport obi_ecotag
from obitools3.apps.optiongroups import addMinimalInputOption, addTaxonomyOption, addMinimalOutputOption
from obitools3.uri.decode import open_uri
from obitools3.apps.config import logger
from obitools3.utils cimport tobytes, str2bytes
from obitools3.dms.view.view cimport View
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
import sys
__title__="Taxonomic assignment of sequences"
def addOptions(parser):
    """Register the command-line options of the ``obi ecotag`` command.

    The shared input, taxonomy and output options are added first, followed
    by an argument group holding the options that are specific to ecotag.

    :param parser: argparse-style parser (or sub-parser) receiving the options.
    """
    # Options shared with the other obi commands, registered in this order.
    for add_shared_options in (addMinimalInputOption,
                               addTaxonomyOption,
                               addMinimalOutputOption):
        add_shared_options(parser)

    ref_database_help = ("URI of the view containing the reference database "
                         "as built by the build_ref_db command.")
    # '%%' is an escaped percent sign: argparse %-formats help strings.
    minimum_identity_help = ("Minimum identity to consider for assignment, as a normalized identity, "
                             "e.g. 0.95 for an identity of 95%%. "
                             "Default: 0.00 (no threshold).")

    ecotag_group = parser.add_argument_group('obi ecotag specific options')

    ecotag_group.add_argument('--ref-database', '-R',
                              dest="ecotag:ref_view",
                              action="store",
                              type=str,
                              metavar='<REF_VIEW>',
                              help=ref_database_help)

    ecotag_group.add_argument('--minimum-identity', '-m',
                              dest="ecotag:threshold",
                              action="store",
                              type=float,
                              default=0.0,
                              metavar='<THRESHOLD>',
                              help=minimum_identity_help)
def run(config):
    """Run the ``obi ecotag`` command: taxonomic assignment of sequences.

    The query view, the reference view, the taxonomy and the output are
    resolved from the URIs found in ``config``, then the assignment itself is
    delegated to the C function ``obi_ecotag``. When the input and output DMS
    differ, the result is first written to a temporary view in the input DMS,
    then imported into the output DMS and the temporary view is deleted.

    :param config: nested configuration mapping; the keys read here are
        ``config['obi']['inputURI']``, ``config['obi']['outputURI']``,
        ``config['obi']['taxoURI']``, ``config['ecotag']['ref_view']`` and
        ``config['ecotag']['threshold']``.
    :raises Exception: if the input, reference, output or taxonomy URI cannot
        be opened, or if ``obi_ecotag`` returns a negative value.
    """
    DMS.obi_atexit()

    logger("info", "obi ecotag")

    # Open the query view: only the DMS
    # (local named 'query' rather than 'input' to avoid shadowing the builtin)
    query = open_uri(config['obi']['inputURI'],
                     dms_only=True)
    if query is None:
        raise Exception("Could not read input")
    i_dms = query[0]
    i_dms_name = i_dms.name
    i_view_name = query[1]

    # Open the reference view: only the DMS
    ref = open_uri(config['ecotag']['ref_view'],
                   dms_only=True)
    if ref is None:
        raise Exception("Could not read reference view URI")
    ref_dms_name = ref[0].name
    ref_view_name = ref[1]

    # Open the output: only the DMS
    output = open_uri(config['obi']['outputURI'],
                      input=False,
                      dms_only=True)
    if output is None:
        raise Exception("Could not create output")
    o_dms = output[0]
    final_o_view_name = output[1]

    # If the input and output DMS are not the same, run ecotag creating a
    # temporary view that will be exported to the right DMS and deleted in the
    # other afterwards.
    export_needed = i_dms != o_dms
    if export_needed:
        temporary_view_name = final_o_view_name
        i = 0
        while temporary_view_name in i_dms:  # Making sure view name is unique in input DMS
            temporary_view_name = final_o_view_name + b"_" + str2bytes(str(i))
            i += 1
        o_view_name = temporary_view_name
    else:
        o_view_name = final_o_view_name

    # Read taxonomy DMS and name
    taxo = open_uri(config['obi']['taxoURI'],
                    dms_only=True)
    # Same guard as for the other URIs: fail with an explicit message instead
    # of an opaque TypeError when subscripting None below.
    if taxo is None:
        raise Exception("Could not open taxonomy")
    taxo_dms_name = taxo[0].name
    # The taxonomy name is taken as the last component of the URI.
    taxonomy_name = config['obi']['taxoURI'].split("/")[-1]  # Robust in theory

    # Save command config in View comments
    command_line = " ".join(sys.argv[1:])
    # TODO (kept from the original code): "no. fix"
    comments = View.print_config(config, "ecotag", command_line,
                                 input_dms_name=[i_dms_name],
                                 input_view_name=[i_view_name])

    # The C function signals failure with a negative return value.
    if obi_ecotag(tobytes(i_dms_name), tobytes(i_view_name),
                  tobytes(ref_dms_name), tobytes(ref_view_name),
                  tobytes(taxo_dms_name), tobytes(taxonomy_name),
                  tobytes(o_view_name), comments,
                  config['ecotag']['threshold']) < 0:
        raise Exception("Error running ecotag")

    # If the input and output DMS are not the same, export result view to output DMS
    if export_needed:
        # NOTE(review): [:-7] presumably strips a 7-character DMS directory
        # extension from the full path - confirm against DMS.full_path.
        View.import_view(i_dms.full_path[:-7], o_dms.full_path[:-7], o_view_name, final_o_view_name)

    # Save command config in DMS comments
    o_dms.record_command_line(command_line)

    print("\n")
    print(repr(o_dms[final_o_view_name]))

    # If the input and the output DMS are different, delete the temporary
    # result view in the input DMS
    if export_needed:
        View.delete_view(i_dms, o_view_name)

    o_dms.close()
    i_dms.close()
#cython: language_level=3
# Cython declaration of the C function obi_ecotag() declared in obi_ecotag.h
# (taxonomic assignment of sequences).
# The eight string arguments are C strings (const char*): DMS and view names
# for the query, the reference database and the taxonomy, then the output
# view name and its comments; the last argument is the assignment threshold.
# The 'nogil' qualifier on the extern block declares the function as callable
# without holding the Python GIL.
# The Python caller treats a negative return value as an error.
cdef extern from "obi_ecotag.h" nogil:
int obi_ecotag(const char* dms_name,
const char* query_view_name,
const char* ref_dms_name,
const char* ref_view_name,
const char* taxo_dms_name,
const char* taxonomy_name,
const char* output_view_name,
const char* output_view_comments,
double ecotag_threshold)
......@@ -38,6 +38,7 @@
../../../src/obidmscolumn_str.c
../../../src/obidmscolumn.c
../../../src/obidmscolumndir.c
../../../src/obi_ecotag.c
../../../src/obierrno.c
../../../src/obilittlebigman.c
../../../src/obitypes.c
......
......@@ -38,6 +38,7 @@
../../src/obidmscolumn_array.c
../../src/obidmscolumn.c
../../src/obidmscolumndir.c
../../src/obi_ecotag.c
../../src/obierrno.c
../../src/obilittlebigman.c
../../src/obitypes.c
......
/*************************************************************************************************
* Header file for functions for the taxonomic assignment of sequences *
*************************************************************************************************/
/**
* @file obi_ecotag.h
* @author Celine Mercier (celine.mercier@metabarcoding.org)
* @date November 15th 2018
* @brief Header file for the functions for the taxonomic assignment of sequences.
*/
#ifndef OBI_ECOTAG_H_
#define OBI_ECOTAG_H_
#include <stdlib.h>
#include <stdio.h>
#include <stdbool.h>
/*
 * Names of the ecotag result columns.
 * NOTE(review): presumably these are the automatically named columns that
 * obi_ecotag() creates to store the assigned taxid, its scientific name, the
 * identification status, the ids of the best matches and the best identity
 * score - confirm against obi_ecotag.c.
 */
#define ECOTAG_TAXID_COLUMN_NAME "TAXID"
#define ECOTAG_NAME_COLUMN_NAME "SCIENTIFIC_NAME"
#define ECOTAG_STATUS_COLUMN_NAME "ID_STATUS"
#define ECOTAG_BEST_MATCH_IDS_COLUMN_NAME "BEST_MATCH"
#define ECOTAG_SCORE_COLUMN_NAME "BEST_IDENTITY"
/**
* @brief Taxonomic assignment of sequences.
*
* Note: The columns where the results are written are automatically named and created.
*
* @param dms_name The path to the DMS containing the query view (the reference view and the taxonomy are located through their own DMS arguments).
* @param query_view_name The name of the view containing the query sequences.
* @param ref_dms_name The name of the DMS containing the reference database.
* @param ref_view_name The name of the view corresponding to the reference database as built by build_reference_db().
* @param taxo_dms_name The name of the DMS containing the taxonomy associated with the reference database.
* @param taxonomy_name The name of the taxonomy associated with the reference database.
* @param output_view_name The name to give to the output view.
* @param output_view_comments The comments to associate to the output view.
* @param ecotag_threshold The threshold at which to assign: the minimum identity to consider, given as a normalized identity (e.g. 0.95 for an identity of 95%; 0.0 means no threshold).
*
* The algorithm works like this:
* For each query sequence:
* Align with reference database
* Keep the indices of all the best matches
* For each kept index, get the LCA (lowest common ancestor) at that threshold as stored in the reference database, then the LCA of those LCAs
* Write result (max score, threshold, taxid and scientific name of the LCA assigned, list of the ids of the best matches)
*
* @returns A value indicating the success of the operation.
* @retval 0 if the operation was successfully completed.
* @retval -1 if an error occurred.
*
* @since November 2018
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
int obi_ecotag(const char* dms_name,
const char* query_view_name,
const char* ref_dms_name,
const char* ref_view_name,
const char* taxo_dms_name,
const char* taxonomy_name,
const char* output_view_name,
const char* output_view_comments,
double ecotag_threshold);
#endif /* OBI_ECOTAG_H_ */
......@@ -130,6 +130,8 @@ extern int obi_errno;
*/
#define OBIVIEW_ALREADY_EXISTS_ERROR (35) /** Tried to create a new view with a name already existing in the DMS.
*/
#define OBI_ECOTAG_ERROR (36) /** Error while running ecotag (taxonomic assignment of sequences).
*/
/**@}*/
#endif /* OBIERRNO_H_ */
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment