Commit b20f0be4 authored by Eric Coissac's avatar Eric Coissac

Plein de choses

parent 4ce163ae
......@@ -31,8 +31,6 @@ Collate:
'robiobject.R'
'robiatomic.R'
'alignment.R'
'basic_plots.R'
'color.R'
'robidata.R'
'robitaxid.R'
'robitag.R'
......@@ -41,10 +39,15 @@ Collate:
'robicategory.R'
'robisample.R'
'robimetabar.R'
'as_motu_matrix.R'
'basic_plots.R'
'clean_ref_db.R'
'color.R'
'contaminent.R'
'dplyr.R'
'dplyr_robimetabar.R'
'ecotag.R'
'ecotransform.R'
'entropie.R'
'lowest_common_ancestor.R'
'metabar_data_class.R'
......@@ -57,6 +60,7 @@ Collate:
'read_metabar.R'
'read_ngsfilter.R'
'read_obitab.R'
'robimetabar_filter.R'
'robimetabar_stat.R'
'robimetabar_xlsx.R'
'robimutation.R'
......
......@@ -315,6 +315,7 @@ export(H_q)
export(alternative_names)
export(alternative_taxids)
export(as.robimetabar.robiseq_db)
export(as_motus_matrix)
export(as_robi4mer)
export(as_robiatomic)
export(as_robicategory)
......@@ -343,6 +344,8 @@ export(clique_tag)
export(clone)
export(colors_tol)
export(combine_LETTERS)
export(cum_hill)
export(cum_relative_frequencies)
export(d_exp_q)
export(d_log_q)
export(dd_exp_q)
......@@ -356,6 +359,7 @@ export(expand_names)
export(family)
export(feature_as_matrix)
export(filter_motus)
export(filter_sample_min_read)
export(filter_tag_gcmax)
export(filter_tag_homopolymere)
export(filter_tag_homopolymere_min)
......@@ -366,9 +370,11 @@ export(genus)
export(get_default_taxonomy)
export(gradient_tol1)
export(gradient_tol2)
export(hellinger)
export(hist_tag_jumps)
export(ids)
export(ids_colname)
export(import_motus_matrix)
export(int2nuc)
export(is_dist_1_or_0)
export(is_full_taxonomy)
......@@ -399,6 +405,7 @@ export(master_taxids_colname)
export(motus)
export(motus_ids)
export(motus_ids_colname)
export(motus_ranks)
export(motus_read_count)
export(motus_sample_count)
export(motus_sample_max)
......@@ -443,6 +450,8 @@ export(read_obitab)
export(read_obitools_taxonomy)
export(read_robitaxonomy)
export(reference_taxonomy)
export(relative_cum_hill)
export(relative_frequencies)
export(resets_category_tags)
export(reverse_tags)
export(reverse_tags_colname)
......@@ -456,6 +465,7 @@ export(robiatomic)
export(robicategory)
export(robicategory_mapping)
export(robidata)
export(robidnadist)
export(robilca)
export(robimetabar)
export(robimotu)
......@@ -532,6 +542,7 @@ import(readr)
import(rlang)
import(stringr)
import(tibble)
import(tidyfast)
import(tidyr)
import(utils)
import(vctrs)
......
......@@ -171,7 +171,7 @@ lcs_score <- function(x,y,lx=nchar(x),ly=nchar(y),
similarity_mode = c("similarity","distance"),
lcs=FALSE,alilength=FALSE) {
data <- tibble(x=x,y=y,lx=lx,ly=ly)
data <- tibble(x=x,y=y,lx=as.integer(lx),ly=as.integer(ly))
theshold <- as.double(threshold[1])
normalize <- as.logical(normalize[1])
......@@ -207,4 +207,11 @@ lcs_score <- function(x,y,lx=nchar(x),ly=nchar(y),
do.call(tibble,results);
}
#' @export
robidnadist <- function(x,y,lx=nchar(x),ly=nchar(y)) {
data <- tibble(x=x,y=y,lx=as.integer(lx),ly=as.integer(ly))
results = .Call("ROBI_dnadist",data$x,data$y,data$lx,data$ly)
results
}
#' @include robimetabar.R
#' @import dplyr
#' @import tidyfast
#'
NULL
#' Converts a robimetabar instance into a sample x motus matrix
#'
#' @param metabar
#' @param value
#'
#' @return
#' @export
#'
#' @examples
as_motus_matrix <- function(metabar,value = "count", values_fill = 0) {
metabar$data %>%
select(sample,motu,data = !! value) %>%
dt_pivot_wider(names_from = motu,values_from = data) -> mat
row_names <- mat$sample
mat %>% select(-sample) %>% as.matrix() -> mat
rownames(mat) <- row_names
mat[is.na(mat)] <- values_fill
mat
}
#' Title
#'
#' @param data
#' @param metabar
#' @param results_in
#'
#' @return
#' @export
#'
#' @examples
import_motus_matrix <- function(data,metabar,results_in) {
metabar$data %>%
left_join(data %>%
as_tibble(rownames = "sample") %>%
dt_pivot_longer(cols = -sample,
names_to = "motu",
values_to = '.__data'),
by=c("sample","motu")) %>%
mutate(!!results_in := .__data) %>%
select(-.__data) -> metabar$data
invisible(metabar)
}
\ No newline at end of file
clean_ref_db <- function(refdb, taxonomy, rank_threshold,
n = 1,
error_rate = 0.1,
quorum = 20, dist_max = 5) {
votes <- tibble(id = refdb$id,
in_pools = 0,
bad = 0)
i=n
for (i in n) {
distances <- lcs_score(refdb$sequence[i], refdb$sequence,
normalize = FALSE,
similarity_mode = "distance",
threshold = dist_max
)
dist_score <- cumsum(table(rep(
distances$score,
refdb$features$count
)))
dist_score_ok <- dist_score[dist_score >= quorum]
if (length(dist_score_ok) >= 1) {
score_lim <- as.integer(names(dist_score_ok[1]))
} else {
score_lim <- dist_max
}
in_cluster = which(!is.na(distances$score) & distances$score <= score_lim)
cluster <- refdb[in_cluster, ]
cluster$taxid = cluster$features$taxid
cluster$idx = in_cluster
votes$in_pools[in_cluster] = votes$in_pools[in_cluster] + 1
if (length(in_cluster) == 1) next
cluster <- cluster %>%
left_join(., taxonomy$nodes[taxonomy$nodes$taxid %in% .$taxid, ],
by = "taxid"
) %>%
mutate(
path = mapply(function(x, y) c(rev(x), y), path, taxid,SIMPLIFY = FALSE),
length = mapply(length, path)
)
max_length <- max(cluster$length)
paths <- t(mapply(function(x, l) c(x, rep(NA, max_length - l)),
cluster$path,
cluster$length))
times <- cluster$features$count
counts = integer(ncol(paths))
taxids = integer(ncol(paths))
for (i in seq_len(ncol(paths))) {
tab <- table(rep(paths[,i], times))
x <- tab[which.max(tab)]
counts[i] <- x
taxids[i] <- as.integer(names(x))
}
tibble(
count = counts,
taxid = as_robitaxid(taxids)
) %>%
left_join(., taxonomy$nodes[taxonomy$nodes$taxid %in% .$taxid, ], by = "taxid") %>%
mutate(max_count = max(count), error = max_count - count, ok = error < max_count * error_rate) %>%
filter(ok) %>% tail(1) -> limit
is_subcladeof(cluster,taxonomy,limit$taxid,results_in = "subclad") %>% filter(!subclad) -> bads
votes$bad[bads$idx] = votes$bad[bads$idx] + 1
}
votes$ratio = 0
votes$ratio[votes$in_pools>0] = votes$bad[votes$in_pools>0] / votes$in_pools[votes$in_pools>0]
votes
}
# s = 0:7 * 10282
# e = c(s[-1] - 1,82260)
# p = mapply(seq.int, e,s)
# library(parallel)
# check_ref <- mclapply(p, function(i) clean_ref_db(ref,taxonomy = taxo,
# rank_threshold = "",n = i,quorum = 5,error_rate = 0.05))
#
clean_ref_db <- function(refdb, taxonomy, quorum = 10,error_rate = 0.05, rank_limit="class") {
rdata <- tibble(taxid = refdb$features$taxid,
count = refdb$features$count)
urdata <- rdata %>%
group_by(taxid) %>%
summarise(count = sum(count),.groups = "drop") %>%
left_join(nodes(taxonomy) %>% select(taxid,path), by="taxid") %>%
mutate(path = mapply(c,taxid,path), lp = mapply(length, path))
with(urdata,
tibble(count = rep(count,lp),
taxid = unlist(path))) %>%
group_by(taxid) %>%
summarise(count = sum(count),.groups = "drop") -> grdata
tibble(id = refdb$id,
taxid = refdb$features$taxid,
count = refdb$features$count,
sequence = refdb$sequence) %>%
left_join(urdata %>% select(-count),by = "taxid") -> rdata
with(rdata,
tibble(id = rep(id,lp),
taxid = rep(taxid,lp),
count = rep(count,lp),
path = unlist(path),
rank = unlist(mapply(seq,lp)),
sequence= rep(sequence,lp))) %>%
group_by(path) %>%
mutate(total = sum(count)) -> grdata
print(grdata)
grdata %>%
filter(total >= quorum) %>%
group_by(id) %>%
filter(rank == min(rank)) -> rdata
print(rdata)
grdata[grdata$path %in% unique(rdata$path),c("id","path","count","sequence")] %>%
group_by(path) %>%
summarise(ids = list(id),
counts = list(count),
sequences = list(sequence),.groups = "drop") %>%
mutate(rank_theshold = !is.na(taxon_at_rank(path,
taxo,
!!rank_limit,
results_in = "rank")$rank)) %>%
filter(rank_theshold) -> grdata
grdata
}
outgroup <- function(ids,sequences,counts) {
if (length(ids) > 2) {
t(combn(ids,2)) -> results
colnames(results) <- c("X","Y")
results <- as_tibble(results)
t(combn(sequences,2)) -> seqs
results$score = lcs_score(seqs[,1],seqs[,2],similarity_mode = "distance")$score
results %>%
left_join(tibble(X=ids,count_x=counts),by ="X") %>%
left_join(tibble(Y=ids,count_y=counts),by ="Y") %>%
mutate(count = count_x * count_y) -> results
pval = numeric(length(ids))
for (id in seq_along(ids)) {
ingroup <- results %>% filter(X == ids[id] | Y == ids[id])
outgroup <- results %>% filter(X != ids[id] & Y != ids[id])
X <- sample(rep(ingroup$score,ingroup$count),
floor(0.5 * sum(ingroup$count)))
Y <- sample(rep(outgroup$score,outgroup$count),
floor(0.5 * sum(outgroup$count)))
if (length(Y) >= 5)
pval[id] <- wilcox.test(X,Y,
alternative = "greater")$p.value
else pval[id] <- 1.0
}
}
else pval=1.0
tibble(id = ids,count = counts,pvalue = pval,sequence = sequences)
}
# Si une sequence apparait plus de k fois dans l'EMBL avec le mme taxid on les garde
# On calcule le score -> filtre les erreurs potentielles.
# Sur les erreurs potentielles nouveau score bas sur le nombre de clusters de differences
# as_tibble(t(with(x[1:100,],mapply(outgroup,ids,sequences,counts))))
#' @include robimetabar.R
#' @import dplyr
#'
NULL
#' Computes the Hellinger transformation of the read counts
#'
#' On a per sample base, the square root of the relative frequency of each
#' *MOTU*.
#'
#' This function is modifying the `robimetabar` instance provided
#' as input.
#'
#' @param metabar a `robimetabar` instance.
#' @param results_in by default the result of the computation
#' is stored in the **relative_frequency** column of
#' the data table.. Name of the destination column
#' can be changed by providing the desired name through
#' that argument.
#'
#' @seealso `relative_frequencies()`, `cum_relative_frequencies()`,
#' `motus_ranks()`
#'
#' @return the modified `robimetabar` instance.
#' @export
#'
#' @examples
#' @md
hellinger <- function(metabar, results_in) {
if (missing(results_in)) {
results_in <- "hellinger"
}
metabar$data %>%
ungroup() %>%
group_by(sample) %>%
mutate(!!results_in := sqrt(count/sum(count))) %>%
ungroup() -> metabar$data
invisible(metabar)
}
#' Computes the relative frequencies of the read counts
#'
#' On a per sample base, the relative frequency of each
#' *MOTU* is computed by dividing the read count of the motu for that sample
#' by the total read count of every *MOTUs* occurring in the given sample.
#'
#' This function is modifying the `robimetabar` instance provided
#' as input.
#'
#' @param metabar a `robimetabar` instance.
#' @param results_in by default the result of the computation
#' is stored in the **relative_frequency** column of
#' the data table.. Name of the destination column
#' can be changed by providing the desired name through
#' that argument.
#'
#' @seealso `hellinger()`, `cum_relative_frequencies()`,
#' `motus_ranks()`
#'
#' @return the modified `robimetabar` instance.
#' @export
#'
#' @examples
#' @md
relative_frequencies <- function(metabar, results_in) {
if (missing(results_in)) {
results_in <- "relative_frequency"
}
metabar$data %>%
ungroup() %>%
group_by(sample) %>%
mutate(!!results_in := count/sum(count)) %>%
ungroup() -> metabar$data
invisible(metabar)
}
#' Computes the cumulative relative frequencies in a sample.
#'
#' MOTUs are firstly ordered by decreasing read abundances,
#' sample per sample and and then the cumulative relative
#' frequency of each *MOTU* summed to the more abundant is
#' estimated.
#'
#' This function is modifying the `robimetabar` instance provided
#' as input.
#'
#' @param metabar a `robimetabar` instance.
#' @param results_in by default the result of the computation
#' is stored in the **cum_rel_freq** column of
#' the data table.. Name of the destination column
#' can be changed by providing the desired name through
#' that argument.
#'
#' @return the modified `robimetabar` instance.
#' @export
#'
#' @seealso `relative_frequencies()`, `hellinger()`,
#' `motus_ranks()`
#'
#' @examples
#' @md
cum_relative_frequencies <- function(metabar, results_in) {
if (missing(results_in)) {
results_in <- "cum_rel_freq"
}
metabar$data %>%
ungroup() %>%
arrange(sample,desc(count)) %>%
group_by(sample) %>%
mutate(!! results_in := cumsum(count/sum(count))) %>%
ungroup() -> metabar$data
invisible(metabar)
}
#' Computes the cumulative relative frequencies in a sample.
#'
#' MOTUs are firstly ordered by decreasing read abundances,
#' sample per sample and and then the cumulative relative
#' frequency of each *MOTU* summed to the more abundant is
#' estimated.
#'
#' This function is modifying the `robimetabar` instance provided
#' as input.
#'
#' @param metabar a `robimetabar` instance.
#' @param results_in by default the result of the computation
#' is stored in the **cum_hill** column of
#' the data table.. Name of the destination column
#' can be changed by providing the desired name through
#' that argument.
#'
#' @return the modified `robimetabar` instance.
#' @export
#'
#' @seealso `relative_frequencies()`, `hellinger()`,
#' `motus_ranks()`
#'
#' @examples
#' @md
cum_hill <- function(metabar, q=1, results_in) {
.cumhill <- function(x) sapply(1:length(x),
function(i) D_q(x[1:i]))
if (missing(results_in)) {
results_in <- "cum_hill"
}
metabar$data %>%
ungroup() %>%
arrange(sample,desc(count)) %>%
group_by(sample) %>%
mutate(!! results_in := .cumhill(count)) %>%
ungroup() -> metabar$data
invisible(metabar)
}
#' Computes the cumulative relative frequencies in a sample.
#'
#' MOTUs are firstly ordered by decreasing read abundances,
#' sample per sample and and then the cumulative relative
#' frequency of each *MOTU* summed to the more abundant is
#' estimated.
#'
#' This function is modifying the `robimetabar` instance provided
#' as input.
#'
#' @param metabar a `robimetabar` instance.
#' @param results_in by default the result of the computation
#' is stored in the **cum_hill** column of
#' the data table.. Name of the destination column
#' can be changed by providing the desired name through
#' that argument.
#'
#' @return the modified `robimetabar` instance.
#' @export
#'
#' @seealso `relative_frequencies()`, `hellinger()`,
#' `motus_ranks()`
#'
#' @examples
#' @md
cum_hill <- function(metabar, q=1, results_in) {
.cumhill <- function(x) sapply(1:length(x),
function(i) D_q(x[1:i]))
if (missing(results_in)) {
results_in <- "relative_cum_hill"
}
metabar$data %>%
ungroup() %>%
arrange(sample,desc(count)) %>%
group_by(sample) %>%
mutate(!! results_in := .cumhill(count)) %>%
ungroup() -> metabar$data
invisible(metabar)
}
#' Computes the cumulative relative frequencies in a sample.
#'
#' MOTUs are firstly ordered by decreasing read abundances,
#' sample per sample and and then the cumulative relative
#' frequency of each *MOTU* summed to the more abundant is
#' estimated.
#'
#' This function is modifying the `robimetabar` instance provided
#' as input.
#'
#' @param metabar a `robimetabar` instance.
#' @param results_in by default the result of the computation
#' is stored in the **cum_hill** column of
#' the data table.. Name of the destination column
#' can be changed by providing the desired name through
#' that argument.
#'
#' @return the modified `robimetabar` instance.
#' @export
#'
#' @seealso `relative_frequencies()`, `hellinger()`,
#' `motus_ranks()`
#'
#' @examples
#' @md
relative_cum_hill <- function(metabar, q=1, results_in) {
.relcumhill <- function(x) {
total = D_q(x,q=q)
sapply(1:length(x),
function(i) D_q(x[1:i],q=q)/total)
}
if (missing(results_in)) {
results_in <- "relative_cum_hill"
}
metabar$data %>%
ungroup() %>%
arrange(sample,desc(count)) %>%
group_by(sample) %>%
mutate(!! results_in := .relcumhill(count)) %>%
ungroup() -> metabar$data
invisible(metabar)
}
#' Computes the rank of each MOTU in a sample.
#'
#'
#' This function is modifying the `robimetabar` instance provided
#' as input.
#'
#' @param metabar a `robimetabar` instance.
#' @param results_in by default the result of the computation
#' is stored in the **motus_rank** column of
#' the data table.. Name of the destination column
#' can be changed by providing the desired name through
#' that argument.
#' @export
#'
#' @seealso `relative_frequencies()`, `hellinger()`,
#' `cum_relative_frequencies()`
#'
#' @examples
motus_ranks <- function(metabar, results_in) {
if (missing(results_in)) {
results_in <- "motus_rank"
}
metabar$data %>%
ungroup() %>%
group_by(sample) %>%
mutate(!!results_in := order(count,decreasing = TRUE)) %>%
ungroup() -> metabar$data
invisible(metabar)
}
......@@ -79,5 +79,5 @@ H_q = function(x, q = 1, normalize = TRUE) {
#' @author Eric Coissac
#' @export
D_q = function(x, q = 1, normalize = TRUE) {
exp_q(H_q(x, q, normalize=TRUE), q)
exp_q(H_q(x, q, normalize=normalize), q)
}
......@@ -176,6 +176,7 @@ as_robicategory <- function(object) {
#' @param extraction_blk the string used to qualifying extraction blank.
#' @param pcr_blk the string used to qualifying PCR blank.
#' @param sequencing_blk the string used to sequencing blank.
#' @param default the default category when the name is matching no pattern
#' @param verbose if `TRUE` warnings and messages are emitted when the
#' constructor takes some decision because of missing values and estimates
#' default values these missing data.
......@@ -201,12 +202,27 @@ robicategory_mapping <- function(sample="s",positive_ctrl="PCRPos",
extraction_blk="coExt",
pcr_blk="PCRNeg",
sequencing_blk="blank",
default = NA,
verbose = is_robi_verbose()) {
matcher <- function(data) {
m <- rbind(str_detect(data,sample),
str_detect(data,positive_ctrl),
str_detect(data,extraction_blk),
str_detect(data,pcr_blk),
str_detect(data,sequencing_blk))
ms = colSums(m)
robiassert(all(ms <= 1),message = "A sample name match several categories")
cat <- colSums(m * 1:5)
cat
}
m <- as.character(c(sample, positive_ctrl, extraction_blk, pcr_blk, sequencing_blk))
mapping <- function(data) (
new_robicategory(.robi_all_categories[match(as.character(data),m,nomatch = NA)],
new_robicategory(c(default,.robi_all_categories)[matcher(as.character(data))+1],
verbose = verbose)
)
......
......@@ -603,7 +603,14 @@ samples_ids.robimetabar <- function(object) {
samples(object) %>% ids()
}
#' Clones a robimetabar instance
#'
#' @param object the robimetabar instance to be cloned
#'
#' @return a copy of object
#' @export
#'
#' @examples
clone.robimetabar <- function(object) {
object$clone()
}
......
#' @include robimetabar.R
#' @import dplyr
#'
NULL
#' Filters out samples based on reacd count.
#'
#' Filters out samples having less than the specified minimum
#' number of reads in total.
#'
#' @param metabar
#' @param min_read
#' @param remove_empty_samples
#' @p