Commit 6911bf4d by Celine Mercier

obi clean: first version

parent f0c147c2
#cython: language_level=3
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
from obitools3.dms.dms cimport DMS
from obitools3.dms.capi.obidms cimport OBIDMS_p
from obitools3.dms.view import RollbackException
from obitools3.dms.capi.obiclean cimport obi_clean
from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption
from obitools3.uri.decode import open_uri
from obitools3.apps.config import logger
from obitools3.utils cimport tobytes
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
# One-line description of the "obi clean" command.
# NOTE(review): presumably picked up by the command-line front end as the subcommand's title - confirm.
__title__="Tag a set of sequences for PCR and sequencing errors identification"
def addOptions(parser):
    """Register the command-line options of ``obi clean`` on *parser*.

    The shared sequence-input and minimal-output option groups are added
    first, followed by an 'obi clean specific options' group holding:

    * ``--distance`` / ``-d``     (float, default 1.0), stored as ``clean:distance``
    * ``--sample-tag`` / ``-s``   (str, default "merged_sample"), stored as ``clean:sample-tag-name``
    * ``--ratio`` / ``-r``        (float, default 0.5), stored as ``clean:ratio``
    * ``--heads-only`` / ``-H``   (flag, default False), stored as ``clean:heads-only``
    * ``--cluster-tags`` / ``-C`` (flag, default False), stored as ``clean:cluster-tags``

    :param parser: the argument parser the options are registered on.
    """
    addSequenceInputOption(parser)
    addMinimalOutputOption(parser)

    clean_options = parser.add_argument_group('obi clean specific options')

    # One entry per option: (flags, keyword arguments for add_argument).
    # The options are registered in the order listed here.
    option_table = (
        (('--distance', '-d'),
         dict(action="store",
              dest="clean:distance",
              metavar='<DISTANCE>',
              default=1.0,
              type=float,
              help="Maximum numbers of errors between two variant sequences. Default: 1.")),

        (('--sample-tag', '-s'),
         dict(action="store",
              dest="clean:sample-tag-name",
              metavar="<SAMPLE TAG NAME>",
              type=str,
              default="merged_sample",
              help="Name of the tag where sample counts are kept.")),

        (('--ratio', '-r'),
         dict(action="store",
              dest="clean:ratio",
              metavar='<RATIO>',
              default=0.5,
              type=float,
              help="Maximum ratio between the counts of two sequences so that the less abundant one can be considered"
                   " a variant of the more abundant one. Default: 0.5.")),

        (('--heads-only', '-H'),
         dict(action="store_true",
              dest="clean:heads-only",
              default=False,
              help="Only sequences labeled as heads are kept in the output. Default: False")),

        (('--cluster-tags', '-C'),
         dict(action="store_true",
              dest="clean:cluster-tags",
              default=False,
              help="Adds tags for each sequence giving its cluster's head and weight for each sample.")),
    )

    for flags, settings in option_table:
        clean_options.add_argument(*flags, **settings)
def run(config):
    """Run the ``obi clean`` command.

    Reads the input and output URIs from ``config['obi']`` and the cleaning
    parameters from ``config['clean']``, calls the C function ``obi_clean``
    and prints the representation of the resulting output view.

    :param config: configuration mapping; the keys read here are
        ``config['obi']['inputURI']`` (expected as ``<DMS name>/<view name>``),
        ``config['obi']['outputURI']`` (``<DMS name>/<view name>`` or just
        ``<view name>``) and, under ``config['clean']``, ``'sample-tag-name'``,
        ``'distance'``, ``'ratio'`` and ``'heads-only'``.
    :raises Exception: if the input URI has no view part, if the input and
        output DMS differ, or if ``obi_clean`` returns a negative value.

    NOTE(review): ``config['clean']['cluster-tags']`` is not read here, so the
    ``--cluster-tags`` option currently has no effect in this function.
    """
    DMS.obi_atexit()

    logger("info", "obi clean")

    # Split the input URI once: first part is the DMS, second is the view
    uri_i = config['obi']['inputURI'].split('/')
    if len(uri_i) < 2:
        raise Exception("Input URI must be of the form <DMS name>/<view name>")
    dms_name = uri_i[0]
    i_view_name = uri_i[1]

    # Open DMS
    dms = open_uri(dms_name)[0]

    # From here on the DMS is open: make sure it is closed whatever happens
    try:
        # Read the name of the output view
        uri_o = config['obi']['outputURI'].split('/')
        if len(uri_o) == 2:
            # Check that input and output DMS are the same (predicate, to discuss)
            if dms_name != uri_o[0]:
                raise Exception("Input and output DMS must be the same")
            o_view_name = uri_o[1]
        else:
            o_view_name = uri_o[0]

        clean_options = config['clean']

        # The last argument is the thread count, fixed to 1 here
        if obi_clean(tobytes(dms_name),
                     tobytes(i_view_name),
                     tobytes(clean_options['sample-tag-name']),
                     tobytes(o_view_name),
                     b"obiclean",
                     clean_options['distance'],
                     clean_options['ratio'],
                     clean_options['heads-only'],
                     1) < 0:
            raise Exception("Error running obiclean")

        print("\n")
        print(repr(dms[o_view_name]))
    finally:
        dms.close()
#cython: language_level=3
from obitools3.dms.capi.obidms cimport OBIDMS_p
# Cython declaration of the C function obi_clean() declared in "obi_clean.h".
# The whole extern block is marked nogil.
# Arguments, in order: DMS name, input view name, sample column name,
# output view name, output view comments, threshold, maximum count ratio,
# heads-only flag, thread count.
# Returns an int (the C header documents 0 on success and -1 on error).
cdef extern from "obi_clean.h" nogil:
    int obi_clean(const char* dms_name,
                  const char* i_view_name,
                  const char* sample_column_name,
                  const char* o_view_name,
                  const char* o_view_comments,
                  double threshold,
                  double max_ratio,
                  bint heads_only,
                  int thread_count)
......@@ -8,6 +8,7 @@
../../../src/linked_list.c
../../../src/murmurhash2.c
../../../src/obi_align.c
../../../src/obi_clean.c
../../../src/obiavl.c
../../../src/obiblob_indexer.c
../../../src/obiblob.c
......
......@@ -8,6 +8,7 @@
../../src/linked_list.c
../../src/murmurhash2.c
../../src/obi_align.c
../../src/obi_clean.c
../../src/obiavl.c
../../src/obiblob_indexer.c
../../src/obiblob.c
......
This diff is collapsed. Click to expand it.
/*************************************************************************************************
* Header file for functions tagging a set of sequences for PCR/sequencing errors identification *
*************************************************************************************************/
/**
* @file obi_clean.h
* @author Celine Mercier (celine.mercier@metabarcoding.org)
* @date April 9th 2018
* @brief Header file for the functions tagging a set of sequences for PCR/sequencing errors identification.
*/
#ifndef OBI_CLEAN_H_
#define OBI_CLEAN_H_
#include <stdlib.h>
#include <stdio.h>
#include <stdbool.h>
#include "obidms.h"
#include "obiview.h"
#include "obidmscolumn.h"
#include "obitypes.h"
/**
 * @brief Names and comments of columns automatically created in the output view when cleaning.
 *
 * Each *_COLUMN_NAME macro has a matching *_COLUMN_COMMENTS macro.
 * All the column comments are currently empty strings.
 *
 * @since April 2018
 * @author Celine Mercier (celine.mercier@metabarcoding.org)
 */
#define CLEAN_STATUS_COLUMN_NAME "obiclean_status"
#define CLEAN_HEAD_COLUMN_NAME "obiclean_head"
#define CLEAN_SAMPLECOUNT_COLUMN_NAME "obiclean_samplecount"
#define CLEAN_HEADCOUNT_COLUMN_NAME "obiclean_headcount"
#define CLEAN_INTERNALCOUNT_COLUMN_NAME "obiclean_internalcount"
#define CLEAN_SINGLETONCOUNT_COLUMN_NAME "obiclean_singletoncount"
#define CLEAN_STATUS_COLUMN_COMMENTS ""
#define CLEAN_HEAD_COLUMN_COMMENTS ""
#define CLEAN_SAMPLECOUNT_COLUMN_COMMENTS ""
#define CLEAN_HEADCOUNT_COLUMN_COMMENTS ""
#define CLEAN_INTERNALCOUNT_COLUMN_COMMENTS ""
#define CLEAN_SINGLETONCOUNT_COLUMN_COMMENTS ""
/**
 * @brief Tags a set of sequences for PCR/sequencing errors identification.
 *
 * Note: The columns where the results are written are automatically named and created.
 *
 * @param dms_name The name of the OBIDMS.
 * @param i_view_name The name of the input view.
 * @param sample_column_name The name of the OBI_STR column in the input view where the sample information is kept.
 *                           NULL or "" (empty string) if there is no sample information.
 * @param o_view_name The name of the output view where the results should be written (should not already exist).
 * @param o_view_comments The comments that should be associated with the output view.
 * @param threshold Threshold expressed as a number of differences between two sequences.
 *                  NOTE(review): the original wording said that only pairs "with a similarity above the
 *                  threshold" are clustered; for a count of differences this presumably means pairs with
 *                  at most `threshold` differences - confirm against obi_clean.c.
 * @param max_ratio Maximum ratio between the counts of two sequences so that the less abundant one can be considered
 *                  as a variant of the more abundant one.
 * @param heads_only If true, only cluster heads are printed to the output view.
 * @param thread_count Number of threads to use (not available yet). TODO
 *
 * @returns A value indicating the success of the operation.
 * @retval 0 if the operation was successfully completed.
 * @retval -1 if an error occurred.
 *
 * @since April 2018
 * @author Celine Mercier (celine.mercier@metabarcoding.org)
 */
int obi_clean(const char* dms_name,
const char* i_view_name,
const char* sample_column_name,
const char* o_view_name,
const char* o_view_comments,
double threshold,
double max_ratio,
bool heads_only,
int thread_count);
#endif /* OBI_CLEAN_H_ */
......@@ -122,6 +122,8 @@ extern int obi_errno;
*/
#define OBI_ELT_IDX_ERROR (31) /** Error setting or getting a value at a non-existent element index or with a non-existent element name
*/
#define OBI_CLEAN_ERROR (32) /** Error while cleaning sequences
*/
/**@}*/
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment