Commit 2ba6d161 by Celine Mercier

New command: obi ecopcr

parent 275d85dc
#cython: language_level=3
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
from obitools3.dms.dms cimport DMS
from obitools3.dms.capi.obidms cimport OBIDMS_p
from obitools3.dms.view import RollbackException
from obitools3.dms.capi.obiecopcr cimport obi_ecopcr
from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption, addTaxonomyInputOption
from obitools3.uri.decode import open_uri
from obitools3.apps.config import logger
from obitools3.utils cimport tobytes
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
from libc.stdlib cimport malloc, free
from libc.stdint cimport int32_t
__title__="in silico PCR"
# TODO: add option to output unique ids
def addOptions(parser):
addSequenceInputOption(parser)
addMinimalOutputOption(parser)
addTaxonomyInputOption(parser)
group = parser.add_argument_group('obi ecopcr specific options')
group.add_argument('--primer1', '-F',
action="store", dest="ecopcr:primer1",
metavar='<PRIMER>',
type=str,
help="Forward primer.")
group.add_argument('--primer2', '-R',
action="store", dest="ecopcr:primer2",
metavar='<PRIMER>',
type=str,
help="Reverse primer.")
group.add_argument('--error', '-e',
action="store", dest="ecopcr:error",
metavar='<ERROR>',
default=0,
type=int,
help="Maximum number of errors (mismatches) allowed per primer. Default: 0.")
group.add_argument('--min-length', '-l',
action="store",
dest="ecopcr:min-length",
metavar="<MINIMUM LENGTH>",
type=int,
default=0,
help="Minimum length of the in silico amplified DNA fragment, excluding primers.")
group.add_argument('--max-length', '-L',
action="store",
dest="ecopcr:max-length",
metavar="<MAXIMUM LENGTH>",
type=int,
default=0,
help="Maximum length of the in silico amplified DNA fragment, excluding primers.")
group.add_argument('--restrict-to-taxid', '-r',
action="append",
dest="ecopcr:restrict-to-taxid",
metavar="<TAXID>",
type=int,
default=[],
help="Only the sequence records corresponding to the taxonomic group identified "
"by TAXID are considered for the in silico PCR. The TAXID is an integer "
"that can be found in the NCBI taxonomic database.")
group.add_argument('--ignore-taxid', '-i',
action="append",
dest="ecopcr:ignore-taxid",
metavar="<TAXID>",
type=int,
default=[],
help="The sequences of the taxonomic group identified by TAXID are not considered for the in silico PCR.")
group.add_argument('--circular', '-c',
action="store_true",
dest="ecopcr:circular",
default=False,
help="Considers that the input sequences are circular (e.g. mitochondrial or chloroplastic DNA).")
group.add_argument('--salt-concentration', '-a',
action="store",
dest="ecopcr:salt-concentration",
metavar="<FLOAT>",
type=float,
default=0.05,
help="Salt concentration used for estimating the Tm. Default: 0.05.")
group.add_argument('--salt-correction-method', '-m',
action="store",
dest="ecopcr:salt-correction-method",
metavar="<1|2>",
type=int,
default=1,
help="Defines the method used for estimating the Tm (melting temperature) between the primers and their corresponding "
"target sequences. SANTALUCIA: 1, or OWCZARZY: 2. Default: 1.")
group.add_argument('--keep-nucs', '-D',
action="store",
dest="ecopcr:keep-nucs",
metavar="<INTEGER>",
type=int,
default=0,
help="Keeps the specified number of nucleotides on each side of the in silico amplified sequences, "
"(already including the amplified DNA fragment plus the two target sequences of the primers).")
group.add_argument('--kingdom-mode', '-k',
action="store_true",
dest="ecopcr:kingdom-mode",
default=False,
help="Print in the output the kingdom of the in silico amplified sequences (default: print the superkingdom).")
def run(config):
cdef int32_t* restrict_to_taxids_p = NULL
cdef int32_t* ignore_taxids_p = NULL
restrict_to_taxids_len = len(config['ecopcr']['restrict-to-taxid'])
restrict_to_taxids_p = <int32_t*> malloc((restrict_to_taxids_len + 1) * sizeof(int32_t)) # +1 for the -1 flagging the end of the array
for i in range(restrict_to_taxids_len) :
restrict_to_taxids_p[i] = config['ecopcr']['restrict-to-taxid'][i]
restrict_to_taxids_p[restrict_to_taxids_len] = -1
ignore_taxids_len = len(config['ecopcr']['ignore-taxid'])
ignore_taxids_p = <int32_t*> malloc((ignore_taxids_len + 1) * sizeof(int32_t)) # +1 for the -1 flagging the end of the array
for i in range(ignore_taxids_len) :
ignore_taxids_p[i] = config['ecopcr']['ignore-taxid'][i]
ignore_taxids_p[ignore_taxids_len] = -1
DMS.obi_atexit()
logger("info", "obi ecopcr")
# TODO Bad URI reading because current one is not adapted
# Get input DMS path
i_dms_name = config['obi']['inputURI'].split('/')[0]
# Read the name of the input view
i_uri = config['obi']['inputURI'].split('/')
i_view_name = i_uri[1]
# Read the name of the output view
o_uri = config['obi']['outputURI'].split('/')
if len(o_uri)==2:
# Get output DMS path
o_dms_name = o_uri[0]
o_view_name = o_uri[1]
else:
o_dms_name = i_dms_name
o_view_name = o_uri[0]
o_dms = open_uri(o_dms_name, input=False)[0]
# Read taxonomy name
taxonomy_name = config['obi']['taxoURI'].split('/')[2]
# TODO: input DMS, taxonomy and primers in comments
if obi_ecopcr(tobytes(i_dms_name), tobytes(i_view_name), tobytes(taxonomy_name), \
tobytes(o_dms_name), tobytes(o_view_name), b"ecopcr", \
tobytes(config['ecopcr']['primer1']), tobytes(config['ecopcr']['primer2']), \
config['ecopcr']['error'], \
config['ecopcr']['min-length'], config['ecopcr']['max-length'], \
restrict_to_taxids_p, ignore_taxids_p, \
config['ecopcr']['circular'], config['ecopcr']['salt-concentration'], config['ecopcr']['salt-correction-method'], \
config['ecopcr']['keep-nucs'], config['ecopcr']['kingdom-mode']) < 0:
raise Exception("Error running ecopcr")
free(restrict_to_taxids_p)
free(ignore_taxids_p)
print("\n")
print(repr(o_dms[o_view_name]))
o_dms.close()
#cython: language_level=3
from obitools3.dms.capi.obidms cimport OBIDMS_p
from libc.stdint cimport int32_t
cdef extern from "obi_ecopcr.h" nogil:
int obi_ecopcr(const char* input_dms_name,
const char* i_view_name,
const char* taxonomy_name,
const char* output_dms_name,
const char* o_view_name,
const char* o_view_comments,
const char* primer1,
const char* primer2,
int error_max,
int min_len,
int max_len,
int32_t* restrict_to_taxids,
int32_t* ignore_taxids,
int circular,
double salt_concentration,
int salt_correction_method,
int keep_nucleotides,
bint kingdom_mode)
......@@ -9,6 +9,15 @@
../../../src/murmurhash2.c
../../../src/obi_align.c
../../../src/obi_clean.c
../../../src/obi_ecopcr.c
../../../src/libecoPCR/libthermo/nnparams.c
../../../src/libecoPCR/libapat/apat_parse.c
../../../src/libecoPCR/libapat/apat_search.c
../../../src/libecoPCR/libapat/libstki.c
../../../src/libecoPCR/ecoapat.c
../../../src/libecoPCR/ecodna.c
../../../src/libecoPCR/ecoError.c
../../../src/libecoPCR/ecoMalloc.c
../../../src/obiavl.c
../../../src/obiblob_indexer.c
../../../src/obiblob.c
......
......@@ -9,6 +9,15 @@
../../src/murmurhash2.c
../../src/obi_align.c
../../src/obi_clean.c
../../src/obi_ecopcr.c
../../src/libecoPCR/libthermo/nnparams.c
../../src/libecoPCR/libapat/apat_parse.c
../../src/libecoPCR/libapat/apat_search.c
../../src/libecoPCR/libapat/libstki.c
../../src/libecoPCR/ecoapat.c
../../src/libecoPCR/ecodna.c
../../src/libecoPCR/ecoError.c
../../src/libecoPCR/ecoMalloc.c
../../src/obiavl.c
../../src/obiblob_indexer.c
../../src/obiblob.c
......
#include "ecoPCR.h"
#include <stdio.h>
#include <stdlib.h>
/*
* print the message given as argument and exit the program
* @param error error number
* @param message the text explaining what's going on
* @param filename the file source where the program failed
* @param linenumber the line where it has failed
* filename and linenumber are written at pre-processing
* time by a macro
*/
void ecoError(int32_t error,
const char* message,
const char * filename,
int linenumber)
{
fprintf(stderr,"Error %d in file %s line %d : %s\n",
error,
filename,
linenumber,
message);
abort();
}
#include "ecoPCR.h"
#include <stdlib.h>
static int eco_log_malloc = 0;
void eco_trace_memory_allocation()
{
eco_log_malloc=1;
}
void eco_untrace_memory_allocation()
{
eco_log_malloc=0;
}
void *eco_malloc(int32_t chunksize,
const char *error_message,
const char *filename,
int32_t line)
{
void * chunk;
chunk = calloc(1,chunksize);
if (!chunk)
ecoError(ECO_MEM_ERROR,error_message,filename,line);
if (eco_log_malloc)
fprintf(stderr,
"Memory segment located at %p of size %d is allocated (file : %s [%d])",
chunk,
chunksize,
filename,
line);
return chunk;
}
void *eco_realloc(void *chunk,
int32_t newsize,
const char *error_message,
const char *filename,
int32_t line)
{
void *newchunk;
newchunk = realloc(chunk,newsize);
if (!newchunk)
ecoError(ECO_MEM_ERROR,error_message,filename,line);
if (eco_log_malloc)
fprintf(stderr,
"Old memory segment %p is reallocated at %p with a size of %d (file : %s [%d])",
chunk,
newchunk,
newsize,
filename,
line);
return newchunk;
}
void eco_free(void *chunk,
const char *error_message,
const char *filename,
int32_t line)
{
free(chunk);
if (eco_log_malloc)
fprintf(stderr,
"Memory segment %p is released => %s (file : %s [%d])",
chunk,
error_message,
filename,
line);
}
#ifndef ECOPCR_H_
#define ECOPCR_H_
#include <stdio.h>
#include <inttypes.h>
#include "../obidmscolumn.h"
#include "../obiview.h"
#include "../obitypes.h"
#ifndef H_apat
#include "./libapat/apat.h"
#endif
/*****************************************************
*
* Data type declarations
*
*****************************************************/
/*
*
* Sequence types
*
*/
typedef struct {
int32_t taxid;
char AC[20];
int32_t DE_length;
int32_t SQ_length;
int32_t CSQ_length;
char data[1];
} ecoseqformat_t;
typedef struct {
int32_t taxid;
int32_t SQ_length;
char *AC;
char *DE;
char *SQ;
} ecoseq_t;
/*****************************************************
*
* Function declarations
*
*****************************************************/
/*
*
* Low level system functions
*
*/
int32_t is_big_endian();
int32_t swap_int32_t(int32_t);
void *eco_malloc(int32_t chunksize,
const char *error_message,
const char *filename,
int32_t line);
void *eco_realloc(void *chunk,
int32_t chunksize,
const char *error_message,
const char *filename,
int32_t line);
void eco_free(void *chunk,
const char *error_message,
const char *filename,
int32_t line);
void eco_trace_memory_allocation();
void eco_untrace_memory_allocation();
#define ECOMALLOC(size,error_message) \
eco_malloc((size),(error_message),__FILE__,__LINE__)
#define ECOREALLOC(chunk,size,error_message) \
eco_realloc((chunk),(size),(error_message),__FILE__,__LINE__)
#define ECOFREE(chunk,error_message) \
eco_free((chunk),(error_message),__FILE__,__LINE__)
/*
*
* Error management
*
*/
void ecoError(int32_t,const char*,const char *,int);
#define ECOERROR(code,message) ecoError((code),(message),__FILE__,__LINE__)
#define ECO_IO_ERROR (1)
#define ECO_MEM_ERROR (2)
#define ECO_ASSERT_ERROR (3)
#define ECO_NOTFOUND_ERROR (4)
/*
*
* Low level Disk access functions
*
*/
int32_t delete_apatseq(SeqPtr pseq);
PatternPtr buildPattern(const char *pat, int32_t error_max);
PatternPtr complementPattern(PatternPtr pat);
SeqPtr ecoseq2apatseq(char* sequence, SeqPtr out, int32_t circular);
char *ecoComplementPattern(char *nucAcSeq);
char *ecoComplementSequence(char *nucAcSeq);
char *getSubSequence(char* nucAcSeq,int32_t begin,int32_t end);
#endif /*ECOPCR_H_*/
#include "./libapat/libstki.h"
#include "./libapat/apat.h"
#include "ecoPCR.h"
#include <string.h>
#include "../obidmscolumn.h"
#include "../obiview.h"
#include "../obitypes.h"
static void EncodeSequence(SeqPtr seq);
static void UpperSequence(char *seq);
/* -------------------------------------------- */
/* uppercase sequence */
/* -------------------------------------------- */
#define IS_LOWER(c) (((c) >= 'a') && ((c) <= 'z'))
#define TO_UPPER(c) ((c) - 'a' + 'A')
void UpperSequence(char *seq)
{
char *cseq;
for (cseq = seq ; *cseq ; cseq++)
if (IS_LOWER(*cseq))
*cseq = TO_UPPER(*cseq);
}
#undef IS_LOWER
#undef TO_UPPER
/* -------------------------------------------- */
/* encode sequence */
/* IS_UPPER is slightly faster than isupper */
/* -------------------------------------------- */
#define IS_UPPER(c) (((c) >= 'A') && ((c) <= 'Z'))
void EncodeSequence(SeqPtr seq)
{
int i;
UInt8 *data;
char *cseq;
data = seq->data;
cseq = seq->cseq;
while (*cseq) {
*data = (IS_UPPER(*cseq) ? *cseq - 'A' : 0x0);
data++;
cseq++;
}
for (i=0,cseq=seq->cseq;i < seq->circular; i++,cseq++,data++)
*data = (IS_UPPER(*cseq) ? *cseq - 'A' : 0x0);
for (i = 0 ; i < MAX_PATTERN ; i++)
seq->hitpos[i]->top = seq->hiterr[i]->top = 0;
}
#undef IS_UPPER
SeqPtr ecoseq2apatseq(char* sequence, SeqPtr out, int32_t circular)
{
int i;
int32_t seq_len;
if (!out)
{
out = ECOMALLOC(sizeof(Seq),
"Error in Allocation of a new Seq structure");
for (i = 0 ; i < MAX_PATTERN ; i++)
{
if (! (out->hitpos[i] = NewStacki(kMinStackiSize)))
ECOERROR(ECO_MEM_ERROR,"Error in hit stack Allocation");
if (! (out->hiterr[i] = NewStacki(kMinStackiSize)))
ECOERROR(ECO_MEM_ERROR,"Error in error stack Allocation");
}
}
seq_len = strlen(sequence);
out->seqsiz = out->seqlen = seq_len;
out->circular = circular;
if (!out->data)
{
out->data = ECOMALLOC((out->seqlen+circular) *sizeof(UInt8),
"Error in Allocation of a new Seq data member");
out->datsiz= out->seqlen+circular;
}
else if ((out->seqlen +circular) >= out->datsiz)
{
out->data = ECOREALLOC(out->data,(out->seqlen+circular),
"Error during Seq data buffer realloc");
out->datsiz= out->seqlen+circular;
}
UpperSequence(sequence); // ecoPCR only works on uppercase
out->cseq = sequence;
EncodeSequence(out);
return out;
}
int32_t delete_apatseq(SeqPtr pseq)
{
int i;
if (pseq) {
if (pseq->data)
ECOFREE(pseq->data,"Freeing sequence data buffer");
for (i = 0 ; i < MAX_PATTERN ; i++) {
if (pseq->hitpos[i]) FreeStacki(pseq->hitpos[i]);
if (pseq->hiterr[i]) FreeStacki(pseq->hiterr[i]);
}
ECOFREE(pseq,"Freeing apat sequence structure");
return 0;
}
return 1;
}
PatternPtr buildPattern(const char *pat, int32_t error_max)
{
PatternPtr pattern;
int32_t patlen;
pattern = ECOMALLOC(sizeof(Pattern),
"Error in pattern allocation");
pattern->ok = Vrai;
pattern->hasIndel= Faux;
pattern->maxerr = error_max;
patlen = strlen(pat);
pattern->cpat = ECOMALLOC(sizeof(char)*patlen+1,
"Error in sequence pattern allocation");
strncpy(pattern->cpat,pat,patlen);
pattern->cpat[patlen]=0;
UpperSequence(pattern->cpat);
if (!CheckPattern(pattern))
ECOERROR(ECO_ASSERT_ERROR,"Error in pattern checking");
if (! EncodePattern(pattern, dna))
ECOERROR(ECO_ASSERT_ERROR,"Error in pattern encoding");
if (! CreateS(pattern, ALPHA_LEN))
ECOERROR(ECO_ASSERT_ERROR,"Error in pattern compiling");
return pattern;
}
PatternPtr complementPattern(PatternPtr pat)
{
PatternPtr pattern;
pattern = ECOMALLOC(sizeof(Pattern),
"Error in pattern allocation");
pattern->ok = Vrai;
pattern->hasIndel= pat->hasIndel;
pattern->maxerr = pat->maxerr;
pattern->patlen = pat->patlen;
pattern->cpat = ECOMALLOC(sizeof(char)*(strlen(pat->cpat)+1),
"Error in sequence pattern allocation");
strcpy(pattern->cpat,pat->cpat);
ecoComplementPattern(pattern->cpat);
if (!CheckPattern(pattern))
ECOERROR(ECO_ASSERT_ERROR,"Error in pattern checking");
if (! EncodePattern(pattern, dna))
ECOERROR(ECO_ASSERT_ERROR,"Error in pattern encoding");
if (! CreateS(pattern, ALPHA_LEN))
ECOERROR(ECO_ASSERT_ERROR,"Error in pattern compiling");
return pattern;
}
#include <string.h>
#include "ecoPCR.h"
/*
* @doc: DNA alphabet (IUPAC)
*/
#define LX_BIO_DNA_ALPHA "ABCDEFGHIJKLMNOPQRSTUVWXYZ#![]"
/*
* @doc: complementary DNA alphabet (IUPAC)
*/
#define LX_BIO_CDNA_ALPHA "TVGHEFCDIJMLKNOPQYSAABWXRZ#!]["
static char sNuc[] = LX_BIO_DNA_ALPHA;
static char sAnuc[] = LX_BIO_CDNA_ALPHA;
static char LXBioBaseComplement(char nucAc);
static char *LXBioSeqComplement(char *nucAcSeq);
static char *reverseSequence(char *str,char isPattern);
/* ---------------------------- */
char LXBioBaseComplement(char nucAc)
{
char *c;
if ((c = strchr(sNuc, nucAc)))
return sAnuc[(c - sNuc)];
else
return nucAc;
}
/* ---------------------------- */
char *LXBioSeqComplement(char *nucAcSeq)
{
char *s;
for (s = nucAcSeq ; *s ; s++)
*s = LXBioBaseComplement(*s);
return nucAcSeq;
}
char *reverseSequence(char *str,char isPattern)
{
char *sb, *se, c;
if (! str)
return str;
sb = str