Commit 84a736bb authored by Eric Coissac's avatar Eric Coissac

Push new files

parent b119dbb5
......@@ -68,6 +68,7 @@ Collate:
'read_metabar.R'
'read_ngsfilter.R'
'read_obitab.R'
'robi4mer.R'
'robimetabar_aggregate.R'
'robimetabar_filter.R'
'robimetabar_stat.R'
......
......@@ -373,6 +373,7 @@ export(filter_tag_homopolymere)
export(filter_tag_homopolymere_min)
export(forward_tags)
export(forward_tags_colname)
export(fourMerTable)
export(full_taxonomy)
export(genus)
export(get_default_taxonomy)
......@@ -560,6 +561,7 @@ importFrom(rlang,inform)
importFrom(rlang,is_atomic)
importFrom(rlang,warn)
importFrom(stringr,str_sub)
importFrom(tibble,as_tibble)
importFrom(vctrs,new_factor)
importFrom(vctrs,new_vctr)
importFrom(vctrs,vec_c)
......
......@@ -152,7 +152,7 @@ is_robi4mer = function(object) {
#'
#' @examples
#'
#' x <- c("AGT","GGT","CGT","GTA","CCC")
#' x <- c("AGTT","GGCT","CCGT","GATA","CCCC")
#' rt <- as_robi4mer(x)
#'
#' @rdname robi4mer
......@@ -175,7 +175,9 @@ lcs_score <- function(x,y,lx=nchar(x),ly=nchar(y),
theshold <- as.double(threshold[1])
normalize <- as.logical(normalize[1])
reference <- as.integer(match(as.character(reference[1]),c("alignment","longest","shortest"),nomatch = 0) - 1)
reference <- as.integer(match(as.character(reference[1]),
c("alignment","longest","shortest"),
nomatch = 0) - 1)
robiassert_arg(reference>=0,
"reference",
......
This diff is collapsed.
......@@ -7,8 +7,8 @@ if (.has_doParallel) require(doParallel)
#' @author Eric Coissac <eric.coissac@metabarcoding.org>
#' @export
every_oligo <- function(n) {
do.call(paste0,lapply(rev(do.call(expand.grid,rep(list(c("A","C","G","T")),n))),as.character))
every_oligo <- function(n,dna = c("A","C","G","T")) {
do.call(paste0,lapply(rev(do.call(expand.grid,rep(list(dna),n))),as.character))
}
......
#' @include oligotag.R
#' @importFrom tibble as_tibble
NULL
.fourmers_label <- every_oligo(4,dna = c("A","C","T","G"))
#' Title
#'
#' @param sequences
#'
#' @return
#' @export
#'
#' @examples
fourMerTable <- function(sequences) {
results <- .Call("ROBI_4mer_table",sequences)
names(results) <- c("kmer","overflow","nkmer")
colnames(results$kmer) <- .fourmers_label
as_tibble(results)
}
\ No newline at end of file
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/robi4mer.R
\name{fourMerTable}
\alias{fourMerTable}
\title{Title}
\usage{
fourMerTable(sequences)
}
\arguments{
\item{sequences}{}
}
\value{
}
\description{
Title
}
......@@ -80,7 +80,7 @@ is_robi4mer(rt)
is_robi4mer("toto")
x <- c("AGT","GGT","CGT","GTA","CCC")
x <- c("AGTT","GGCT","CCGT","GATA","CCCC")
rt <- as_robi4mer(x)
}
......
......@@ -2,70 +2,13 @@
#include <Rinternals.h>
#include <Rdefines.h>
#include <stdint.h>
#include "robidnacode.h"
#define ABS(X) (((X) >= 0) ? (X):-(X))
#define MIN(X,Y) (((X) < (Y)) ? (X):(Y))
#define MAX(X,Y) (((X) > (Y)) ? (X):(Y))
/**
* Degenerated nucleotides are coded on four bits
*
* 8421
* ACGT
*
* Table for converting IUPAC symbol in DNA code
*
*/
static const char codeDNA[256] = { 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0,
/* A B C D E F G H */
8, 7, 4, 11, 0, 0, 2, 13,
/* I J K L M N O P */
0, 0, 3, 0, 12, 15, 0, 0,
/* Q R S T U V W X */
0, 10, 6, 1, 1, 14, 9, 0,
/* Y Z */
5, 0, 0, 0, 0, 0, 0, 0,
/* a b c d e f g h */
8, 7, 4, 11, 0, 0, 2, 13,
/* i j k l m n o p */
0, 0, 3, 0, 12, 15, 0, 0,
/* q r s t u v w x */
0, 10, 6, 1, 1, 14, 9, 0,
/* y z */
5, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0 };
/**
*
* Constant for decoding dna code into IUPAC symbol
*
* 0000000000011111
* 0123456789012345
*/
static const char DNAcode[17] = "-TGKCYSBAWRDMHVN";
static int32_t is_d0_or_d1(const char *seq1, const char* seq2, size_t length1, size_t length2) {
const char *b1, *e1;
......@@ -110,11 +53,11 @@ static int32_t is_d0_or_d1(const char *seq1, const char* seq2, size_t length1, s
pos = MIN((b1>= e1) ? MAX(pos1,pos2):0,INT_LEAST16_MAX);
if (pos2 > pos1)
code = (codeDNA[*e2]);
code = (robi_codeDNA[*e2]);
if (pos2 < pos1)
code = (codeDNA[*e1]*16);
code = (robi_codeDNA[*e1]*16);
if (pos2 == pos1)
code = (codeDNA[*e1]*16) + (codeDNA[*e2]);
code = (robi_codeDNA[*e1]*16) + (robi_codeDNA[*e2]);
code+= 512 + (pos * 65536);
......@@ -170,165 +113,4 @@ SEXP ROBI_is_d0_or_d1(SEXP sequences1,SEXP sequences2,SEXP length1,SEXP length2)
return results;
}
SEXP ROBI_as_chararacter_mutation(SEXP mutations) {
size_t n;
size_t i;
int32_t m;
const char* rep;
char buffer[256];
SEXP results;
if (! isInteger(mutations))
error("the mutations parameter must be an integer vector");
n = LENGTH(mutations);
results = PROTECT(NEW_STRING(n));
for (i = 0; i < n; i++) {
m = INTEGER(mutations)[i];
if (INTEGER(mutations)[i]==NA_INTEGER)
SET_STRING_ELT(results,i,NA_STRING);
else {
if (m==0) rep = "[SEVERAL]";
else {
if (m & 256) rep = "[NO MUTATION]";
else {
snprintf(buffer,256,"[%5d : %c > %c]",
(m >> 16) + 1,
DNAcode[(m & 240) >> 4],
DNAcode[m&15]);
rep = buffer;
}
}
SET_STRING_ELT(results,i,mkChar((const char*)rep));
}
}
UNPROTECT(1);
return results;
}
SEXP ROBI_substitution_mutation(SEXP mutations) {
size_t n;
size_t i;
int32_t m;
char buffer[256];
SEXP results;
if (! isInteger(mutations))
error("the mutations parameter must be an integer vector");
n = LENGTH(mutations);
results = PROTECT(NEW_STRING(n));
for (i = 0; i < n; i++) {
m = INTEGER(mutations)[i];
if (INTEGER(mutations)[i]==NA_INTEGER || m==0 || (m & 256))
SET_STRING_ELT(results,i,NA_STRING);
else {
snprintf(buffer,256,"[%c > %c]", DNAcode[(m & 240) >> 4], DNAcode[m&15]);
SET_STRING_ELT(results,i,mkChar((const char*)buffer));
}
}
UNPROTECT(1);
return results;
}
SEXP ROBI_origin_mutation(SEXP mutations) {
size_t n;
size_t i;
int32_t m;
char buffer[256];
SEXP results;
if (! isInteger(mutations))
error("the mutations parameter must be an integer vector");
n = LENGTH(mutations);
results = PROTECT(NEW_STRING(n));
for (i = 0; i < n; i++) {
m = INTEGER(mutations)[i];
if (INTEGER(mutations)[i]==NA_INTEGER || m==0 || (m & 256))
SET_STRING_ELT(results,i,NA_STRING);
else {
snprintf(buffer,256,"%c", DNAcode[(m & 240) >> 4]);
SET_STRING_ELT(results,i,mkChar((const char*)buffer));
}
}
UNPROTECT(1);
return results;
}
SEXP ROBI_to_mutation(SEXP mutations) {
size_t n;
size_t i;
int32_t m;
char buffer[256];
SEXP results;
if (! isInteger(mutations))
error("the mutations parameter must be an integer vector");
n = LENGTH(mutations);
results = PROTECT(NEW_STRING(n));
for (i = 0; i < n; i++) {
m = INTEGER(mutations)[i];
if (INTEGER(mutations)[i]==NA_INTEGER || m==0 || (m & 256))
SET_STRING_ELT(results,i,NA_STRING);
else {
snprintf(buffer,256,"%c", DNAcode[m&15]);
SET_STRING_ELT(results,i,mkChar((const char*)buffer));
}
}
UNPROTECT(1);
return results;
}
SEXP ROBI_position_mutation(SEXP mutations) {
size_t n;
size_t i;
size_t pos;
int32_t m;
const char* rep;
char buffer[256];
SEXP results;
if (! isInteger(mutations))
error("the mutations parameter must be an integer vector");
n = LENGTH(mutations);
results = PROTECT(NEW_INTEGER(n));
for (i = 0; i < n; i++) {
m = INTEGER(mutations)[i];
if (INTEGER(mutations)[i]==NA_INTEGER || m==0 || (m & 256))
INTEGER(results)[i]=NA_INTEGER;
else
INTEGER(results)[i]= (m >> 16) + 1;
}
UNPROTECT(1);
return results;
}
......@@ -79,11 +79,12 @@ inline static void dumpm128(unsigned short *table,vUInt8 data)
* count greater than 255.
*/
static int32_t buildTable(const char* sequence, unsigned char *table, int32_t *count)
static int buildTable(const char* sequence, unsigned char *table, size_t nseq, int *count)
{
int overflow = 0;
int wc=0;
int i;
size_t j;
vUInt8 mask_00= _MM_SETZERO_SI128();
uchar_v frag;
......@@ -94,8 +95,6 @@ static int32_t buildTable(const char* sequence, unsigned char *table, int32_t *c
s=(char*)sequence;
memset(table,1,256*sizeof(unsigned char));
// encode ascii sequence with A : 00 C : 01 T: 10 G : 11
for(frag.m=_MM_LOADU_SI128((vUInt8*)s);
......@@ -105,19 +104,30 @@ static int32_t buildTable(const char* sequence, unsigned char *table, int32_t *c
words= hash4m128(frag);
// printf("%d %d %d %d\n",words.c[0],words.c[1],words.c[2],words.c[3]);
if (table[words.c[0]]<255) table[words.c[0]]++; else overflow++;
if (table[words.c[1]]<255) table[words.c[1]]++; else overflow++;
if (table[words.c[2]]<255) table[words.c[2]]++; else overflow++;
if (table[words.c[3]]<255) table[words.c[3]]++; else overflow++;
if (table[words.c[4]]<255) table[words.c[4]]++; else overflow++;
if (table[words.c[5]]<255) table[words.c[5]]++; else overflow++;
if (table[words.c[6]]<255) table[words.c[6]]++; else overflow++;
if (table[words.c[7]]<255) table[words.c[7]]++; else overflow++;
if (table[words.c[8]]<255) table[words.c[8]]++; else overflow++;
if (table[words.c[9]]<255) table[words.c[9]]++; else overflow++;
if (table[words.c[10]]<255) table[words.c[10]]++; else overflow++;
if (table[words.c[11]]<255) table[words.c[11]]++; else overflow++;
j = words.c[0] * nseq;
if (table[j]<255) table[j]++; else overflow++;
j = words.c[1] * nseq;
if (table[j]<255) table[j]++; else overflow++;
j = words.c[2] * nseq;
if (table[j]<255) table[j]++; else overflow++;
j = words.c[3] * nseq;
if (table[j]<255) table[j]++; else overflow++;
j = words.c[4] * nseq;
if (table[j]<255) table[j]++; else overflow++;
j = words.c[5] * nseq;
if (table[j]<255) table[j]++; else overflow++;
j = words.c[6] * nseq;
if (table[j]<255) table[j]++; else overflow++;
j = words.c[7] * nseq;
if (table[j]<255) table[j]++; else overflow++;
j = words.c[8] * nseq;
if (table[j]<255) table[j]++; else overflow++;
j = words.c[9] * nseq;
if (table[j]<255) table[j]++; else overflow++;
j = words.c[10] * nseq;
if (table[j]<255) table[j]++; else overflow++;
j = words.c[11] * nseq;
if (table[j]<255) table[j]++; else overflow++;
wc+=12;
}
......@@ -128,8 +138,10 @@ static int32_t buildTable(const char* sequence, unsigned char *table, int32_t *c
words = hash4m128(frag);
if (zero.c[0]+zero.c[1]+zero.c[2]+zero.c[3]==0)
for(i=0;zero.c[i+3]==0;i++,wc++)
if (table[words.c[i]]<255) table[words.c[i]]++; else overflow++;
for(i=0;zero.c[i+3]==0;i++,wc++) {
j = words.c[i] * nseq;
if (table[j]<255) table[j]++; else overflow++;
}
if (count) *count=wc;
return overflow;
......@@ -209,163 +221,74 @@ static int32_t compareTable(unsigned char *t1, int32_t over1, unsigned char* t2,
total = summini.c[0]+summini.c[1];
total+= (over1 < over2) ? over1:over2;
return total - 256;
}
static inline int32_t overflow(const char* c) {
return decode_non_zero_int(*((int32_t*)(c + 256)));
}
static inline int32_t n4mer(const char* c) {
return decode_non_zero_int(*((int32_t*)(c + 256 + sizeof(int32_t))));
return total;
}
SEXP ROBI_4mer_table(SEXP sequences) {
unsigned char buffer[257 + 2 * sizeof(int32_t)];
unsigned char* buffer;
unsigned char swap1;
unsigned char swap2;
const char* csource;
size_t nseq;
size_t i;
size_t j;
SEXP srep;
SEXP hashes;
SEXP svalue;
SEXP soverflow;
SEXP sn4mer;
SEXP sdim;
int* overflow;
int* n4mer;
if (! isString(sequences))
error("the sequences parameter must be a character vector");
srep = PROTECT(NEW_LIST(3));
nseq = LENGTH(sequences);
hashes = PROTECT(NEW_STRING(nseq));
hashes = PROTECT(NEW_RAW(nseq * 256));
memset(RAW(hashes),0,nseq * 256 * sizeof(unsigned char));
soverflow = PROTECT(NEW_INTEGER(nseq));
sn4mer = PROTECT(NEW_INTEGER(nseq));
sdim = PROTECT(NEW_INTEGER(2));
INTEGER(sdim)[0] = nseq;
INTEGER(sdim)[1] = 256;
SET_DIM(hashes,sdim);
overflow = INTEGER(soverflow);
n4mer = INTEGER(sn4mer);
for (i = 0; i < nseq; i++) {
buffer = RAW(hashes) + i;
svalue = STRING_ELT(sequences,i);
if (svalue != NA_STRING) {
csource = CHAR(svalue);
*((int32_t*)(buffer + 256)) = buildTable(csource,
buffer,
(int32_t*)(buffer + 256 + sizeof(int32_t)));
buffer[256 + 2 * sizeof(int)]=0;
*((int32_t*)(buffer + 256)) = encode_non_zero_int(*((int32_t*)(buffer + 256)));
*((int32_t*)(buffer + 256 + sizeof(int32_t))) = encode_non_zero_int(*((int32_t*)(buffer + 256 + sizeof(int32_t))));
SET_STRING_ELT(hashes,i,mkChar((const char*)buffer));
overflow[i] = buildTable(csource,buffer,nseq,n4mer + i);
}
else
SET_STRING_ELT(hashes,i,NA_STRING);
}
UNPROTECT(1);
return hashes;
}
SEXP ROBI_4mer_overflow(SEXP tables) {
size_t nseq;
size_t i;
SEXP overflows;
SEXP svalue;
if (! isString(tables))
error("the sequences parameter must be a character vector");
nseq = LENGTH(tables);
overflows = PROTECT(NEW_INTEGER(nseq));
for (i = 0; i < nseq; i++) {
svalue = STRING_ELT(tables,i);
if (svalue != NA_STRING)
INTEGER(overflows)[i] = overflow(CHAR(svalue));
else
INTEGER(overflows)[i] = NA_INTEGER;
}
UNPROTECT(1);
return overflows;
}
else {
overflow[i] = 0;
n4mer[i] = 0;
memset(buffer,0,256*sizeof(unsigned char));
SEXP ROBI_n4mer(SEXP tables) {
size_t nseq;
size_t i;
SEXP n4mers;
SEXP svalue;
if (! isString(tables))
error("the sequences parameter must be a character vector");
nseq = LENGTH(tables);
n4mers = PROTECT(NEW_INTEGER(nseq));
for (i = 0; i < nseq; i++) {
svalue = STRING_ELT(tables,i);
if (svalue != NA_STRING)
INTEGER(n4mers)[i] = n4mer(CHAR(svalue));
else
INTEGER(n4mers)[i] = NA_INTEGER;
}
}
UNPROTECT(1);
return n4mers;
}
SEXP ROBI_encode_non_zero_int(SEXP value) {
size_t i;
size_t n;
int32_t v;
uint32_t uv;
SEXP result;
if (! isInteger(value))
error("the sequences parameter must be an integer vector");
n = LENGTH(value);
result = PROTECT(NEW_INTEGER(n));
for (i = 0; i < n; i++) {
v = INTEGER(value)[i];
if (v < 0)
error("Only positive values can be encoded");
SET_VECTOR_ELT(srep,0,hashes);
SET_VECTOR_ELT(srep,1,soverflow);
SET_VECTOR_ELT(srep,2,sn4mer);
uv = encode_non_zero_int((uint32_t)v);
if (uv == 0xFFFFFFFF)
error("Only values smaller than 16,777,216 can be encoded");
INTEGER(result)[i] = (int32_t)uv;
}
UNPROTECT(1);
return result;
UNPROTECT(5);
return srep;
}
SEXP ROBI_decode_non_zero_int(SEXP value) {
size_t i;
size_t n;
int32_t v;
uint32_t uv;
SEXP result;
if (! isInteger(value))
error("the sequences parameter must be an integer vector");
n = LENGTH(value);
result = PROTECT(NEW_INTEGER(n));
for (i = 0; i < n; i++) {
v = INTEGER(value)[i];
uv = decode_non_zero_int(*((uint32_t*)&v));
INTEGER(result)[i] = (int32_t)uv;
}
UNPROTECT(1);
return result;
}
SEXP ROBI_max_common_4mer(SEXP x, SEXP y) {
size_t nx, ny, si;
......@@ -412,8 +335,8 @@ SEXP ROBI_max_common_4mer(SEXP x, SEXP y) {
cx = CHAR(vx);
cy = CHAR(vy);
overx = overflow(cx);
overy = overflow(cy);
//overx = overflow(cx);
//overy = overflow(cy);
INTEGER(result)[i] = compareTable((unsigned char*)cx, overx,
(unsigned char*)cy, overy);
......
/**
* Degenerated nucleotides are coded on four bits
*
* 8421
* ACGT
*
* Table for converting IUPAC symbol in DNA code
*
*/
const char robi_codeDNA[256] = { 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0,
/* A B C D E F G H */
8, 7, 4, 11, 0, 0, 2, 13,
/* I J K L M N O P */
0, 0, 3, 0, 12, 15, 0, 0,
/* Q R S T U V W X */
0, 10, 6, 1, 1, 14, 9, 0,
/* Y Z */
5, 0, 0, 0, 0, 0, 0, 0,
/* a b c d e f g h */
8, 7, 4, 11, 0, 0, 2, 13,
/* i j k l m n o p */
0, 0, 3, 0, 12, 15, 0, 0,
/* q r s t u v w x */
0, 10, 6, 1, 1, 14, 9, 0,
/* y z */
5, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0 };
/**
*
* Constant for decoding dna code into IUPAC symbol
*
* 0000000000011111
* 0123456789012345
*/