Commit ffa96383 by Eric Coissac

Initial commit

parents
Version: 1.0
RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default
EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: ISO-8859-1
RnwWeave: knitr
LaTeX: pdfLaTeX
BuildType: Package
PackageUseDevtools: Yes
PackagePath: ROBITools
PackageInstallArgs: --no-multiarch --with-keep.source
PackageRoxygenize: rd,collate,namespace
/man/
/loopbenchmark.R
/Read-and-delete-me
Package: ROBITools
Type: Package
Title: Metabarcoding data biodiversity analysis
Version: 0.1
Date: 2012-08-23
Author: LECA - Laboratoire d'ecologie alpine
Maintainer: LECA OBITools team <obitools@metabarcoding.org>
Description: Tools for importing, manipulating and analyzing the biodiversity
    of DNA metabarcoding data produced with the OBITools.
License: CeCILL v2.0
LazyLoad: yes
Roxygen: list(wrap = FALSE)
Collate:
's3objects.R'
'ROBITools.R'
'02_class_metabarcoding.data.R'
'aggregate.R'
'choose.taxonomy.R'
'contaslayer.R'
'distrib.extrapol.R'
'experimental.section.R'
'export-metabarcoding.R'
'read.obitab.R'
'import.metabarcoding.R'
'import.ngsfilter.R'
'layers.metabarcoding.R'
'metabarcoding_threshold.R'
'mstat.R'
'obiclean.R'
'pcrslayer.R'
'plot.PCRplate.R'
'plot.seqinsample.R'
'rarefy.R'
'read.ngsfilter.R'
'read.sumatra.R'
'taxoDBtree.R'
'taxonomic.resolution.R'
'taxonomy_classic_table.R'
RoxygenNote: 5.0.1
Copyright (c) 2004-2013 Sergey Lyubka <valenok@gmail.com>
Copyright (c) 2013 Cesanta Software Limited
All rights reserved
This code is dual-licensed: you can redistribute it and/or modify
it under the terms of the GNU General Public License version 2 as
published by the Free Software Foundation. For the terms of this
license, see <http://www.gnu.org/licenses/>.
You are free to use this code under the terms of the GNU General
Public License, but WITHOUT ANY WARRANTY; without even the implied
warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details.
Alternatively, you can license this code under a commercial
license, as set out in <http://cesanta.com/>.
# Generated by roxygen2: do not edit by hand
S3method(aggregate,metabarcoding.data)
S3method(plot,PCRplate)
S3method(plot,seqinsample)
S3method(summary,taxores)
export(addS3Class)
export(colnames)
export(const.threshold.mask)
export(contaslayer)
export(createS3Class)
export(dbtree)
export(dist.center.group)
export(dist.clique.group)
export(dist.grid)
export(dm.univariate)
export(extracts.obiclean)
export(extracts.obiclean_cluster)
export(extrapol.freq)
export(get.classic.taxonomy)
export(import.metabarcoding.data)
export(import.ngsfilter.data)
export(layer.names)
export(m.bivariate)
export(m.univariate)
export(m.univariate.test)
export(m.weight)
export(map.extrapol.freq)
export(marginalsum)
export(metabarcoding.data)
export(motus)
export(normalize)
export(rarefy)
export(read.ngsfilter)
export(read.obitab)
export(reads)
export(rmS3Class)
export(rownames)
export(samples)
export(taxo.decider)
export(threshold)
export(threshold.mask)
export(threshold.set)
exportClasses(metabarcoding.data)
exportMethods("$")
exportMethods("$<-")
exportMethods("[[")
exportMethods("[[<-")
exportMethods(colnames)
exportMethods(rownames)
import(ROBITaxonomy)
import(igraph)
#' A package to manipulate DNA metabarcoding data.
#'
#' This package was written as a companion to the OBITools.
#'
#' \tabular{ll}{
#' Package: \tab ROBITools\cr
#' Type: \tab Package\cr
#' Version: \tab 0.1\cr
#' Date: \tab 2013-06-27\cr
#' License: \tab CeCILL 2.0\cr
#' LazyLoad: \tab yes\cr
#'}
#'
#' @name ROBITools-package
#' @aliases ROBITools
#' @docType package
#' @title A package to manipulate DNA metabarcoding data.
#' @author Frederic Boyer
#' @author Aurelie Bonin
#' @author Lucie Zinger
#' @author Eric Coissac
#'
#' @references http://metabarcoding.org/obitools
#'
NA
.onLoad <- function(libname, pkgname) {
  # Namespace load hook: announce the package to the user.
  # (A commented-out debugging print of the working directory was removed.)
  packageStartupMessage("ROBITools package")
}
#' @include 02_class_metabarcoding.data.R
NULL
# Author: coissac
###############################################################################

#' Aggregate a metabarcoding.data object by samples or by MOTUs
#'
#' Applies \code{FUN} to groups of samples (\code{MARGIN='sample'}) or of
#' MOTUs (\code{MARGIN='motu'}) defined by the \code{by} modalities, in the
#' spirit of \code{\link{aggregate}}. The read table, every layer and the
#' corresponding annotation data.frame are aggregated consistently.
#'
#' @param x a \code{\link{metabarcoding.data}} object
#' @param by a list of grouping elements, as for \code{\link{aggregate}}
#' @param FUN the function applied to the read counts of each group
#' @param ... extra arguments forwarded to \code{FUN}
#' @param MARGIN \code{'sample'} (or 1) to aggregate samples,
#'        \code{'motu'} (or 2) to aggregate MOTUs
#' @param default.layer aggregation function used for every layer without an
#'        entry in \code{layers}; by default an internal function keeping a
#'        value only when it is shared by the whole group, and \code{NA}
#'        otherwise
#' @param layers a named list mapping a layer name to the aggregation
#'        function to use for that layer
#'
#' @return a new \code{\link{metabarcoding.data}} object with one row
#'         (or column) per aggregation modality
#'
#' @export
aggregate.metabarcoding.data=function(x, by, FUN,...,
                                      MARGIN='sample',
                                      default.layer=NULL,
                                      layers=NULL) {

  # Returns the value shared by all the elements of z, or NA when z is
  # empty, contains NA, or holds several distinct values.
  # Factor levels are preserved.
  uniq.value = function(z) {
    if (is.null(z) ||
        length(z)==0 ||
        any(is.na(z)))
      ans = NA
    else {
      if (all(z==z[1]))
        ans = z[1]
      else
        ans = NA
    }

    if (is.factor(z))
      ans = factor(ans,levels=levels(z))

    return(ans)
  }

  #
  # Deals with the supplementary aggregate arguments
  #

  if (is.null(default.layer))
    default.layer=uniq.value

  if (is.null(layers)) {
    layers = as.list(rep(c(default.layer),length(x@layers)))
    names(layers)=layer.names(x)
  }
  else {
    # Complete the user-supplied list with the default function.
    # BUG FIX: the original referenced the undefined name 'default.layers'
    for (n in layer.names(x))
      if (is.null(layers[[n]]))
        layers[[n]]=default.layer
  }

  if (MARGIN == 'sample')
    MARGIN=1
  if (MARGIN == 'motu')
    MARGIN=2

  reads = x@reads

  if (MARGIN==1) {
    # prepare the aggregation arguments for the read table
    # from the function arguments.
    # BUG FIX: extra arguments are spliced into the argument list instead
    # of being passed as a single list argument literally named "..."
    dotted = list(...)
    if (length(dotted) > 0)
      aggr.args = c(list(reads,by=by,FUN=FUN),dotted,list(simplify=FALSE))
    else
      aggr.args = list(reads,by=by,FUN=FUN,simplify=FALSE)

    # Aggregate the read table
    ragr = do.call(aggregate,aggr.args)

    # extract new ids from the aggregated table
    ncat = length(by)
    ids  = as.character(interaction(ragr[,1:ncat,drop=FALSE]))

    # remove the aggregation modalities to rebuild a correct
    # numeric reads matrix
    ragr = as.matrix(ragr[,-(1:ncat),drop=FALSE])
    dragr= dim(ragr)
    cragr= colnames(ragr)
    ragr = as.numeric(ragr)
    dim(ragr)=dragr
    colnames(ragr)=cragr
    rownames(ragr)=ids

    #
    # Apply the same aggregation to each layer
    #

    ln = layer.names(x)
    la = vector(mode="list",length(ln))
    names(la)=ln

    for (n in ln) {
      f = layers[[n]]

      # BUG FIX: 'isfact' was left undefined for non-factor layers, making
      # the result depend on the previous loop iteration (or erroring on
      # the first one)
      isfact = is.factor(x[[n]])
      if (isfact){
        # factors are aggregated through their character representation,
        # then rebuilt with their original levels
        lf = levels(x[[n]])
        df = dim(x[[n]])
        m  = matrix(as.character(x[[n]]))
        dim(m)=df
      }
      else
        m = x[[n]]

      aggr.args = list(m,by=by,FUN=f,simplify=FALSE)
      lagr = do.call(aggregate,aggr.args)
      lagr = as.factor.or.matrix(lagr[,-(1:ncat),drop=FALSE])

      if (isfact){
        df = dim(lagr)
        lagr = factor(lagr,levels=lf)
        dim(lagr)=df
      }

      rownames(lagr)=ids
      la[[n]]=lagr
    }

    # aggregate the sample table according to the same criteria
    #
    # TODO: We have to take special care of factors in the samples
    #       data.frame

    sagr = aggregate(samples(x),by,uniq.value,simplify=FALSE)

    # move the first columns of the resulting data frame (the aggregation
    # modalities) to the last columns of the data.frame
    sagr = sagr[,c((ncat+1):(dim(sagr)[2]),1:ncat),drop=FALSE]
    larg = c(lapply(sagr,unlist),list(stringsAsFactors=FALSE))
    sagr = do.call(data.frame,larg)

    # set samples ids to the ids computed from modalities
    sagr$id=ids
    rownames(sagr)=ids

    # build the new metabarcoding data instance
    # BUG FIX: the aggregated layers 'la' were computed but never passed
    # to the new object in this branch (unlike the MARGIN==2 branch)
    newdata = copy.metabarcoding.data(x,reads=ragr,samples=sagr,layers=la)
  }
  else {
    # prepare the aggregation arguments for the read table
    # from the function arguments.
    # BE CAREFUL: the reads table is transposed because standard
    # aggregate runs by row and we want aggregation by column.
    # BUG FIX: extra arguments are spliced in, as in the MARGIN==1 branch
    dotted = list(...)
    if (length(dotted) > 0)
      aggr.args = c(list(t(reads),by=by,FUN=FUN),dotted,list(simplify=FALSE))
    else
      aggr.args = list(t(reads),by=by,FUN=FUN,simplify=FALSE)

    # Aggregate the read table
    ragr = do.call(aggregate.data.frame,aggr.args)

    # extract new ids from the aggregated table
    ncat = length(by)
    ids  = as.character(interaction(ragr[,1:ncat,drop=FALSE]))

    # remove the aggregation modalities and transpose back to rebuild a
    # correct numeric reads matrix
    ragr = t(ragr[,-(1:ncat),drop=FALSE])
    dragr= dim(ragr)
    rragr= rownames(ragr)
    ragr = as.numeric(ragr)
    dim(ragr)=dragr
    colnames(ragr)=ids
    rownames(ragr)=rragr

    #
    # Apply the same aggregation to each layer
    #

    ln = layer.names(x)
    la = vector(mode="list",length(ln))
    names(la)=ln

    for (n in ln) {
      f = layers[[n]]

      # BUG FIX: 'isfact' initialized on every iteration (see MARGIN==1)
      isfact = is.factor(x[[n]])
      if (isfact){
        lf = levels(x[[n]])
        df = dim(x[[n]])
        m  = matrix(as.character(x[[n]]))
        dim(m)=df
      }
      else
        m = x[[n]]

      aggr.args = list(t(m),by=by,FUN=f,simplify=FALSE)
      lagr = do.call(aggregate,aggr.args)
      lagr = t(as.factor.or.matrix(lagr[,-(1:ncat),drop=FALSE]))

      if (isfact){
        df = dim(lagr)
        lagr = factor(lagr,levels=lf)
        dim(lagr)=df
      }

      colnames(lagr)=ids
      la[[n]]=lagr
    }

    # aggregate the motus table according to the same criteria
    magr = aggregate(motus(x),by,uniq.value,simplify=FALSE)

    # move the first columns of the resulting data frame (the aggregation
    # modalities) to the last columns of the data.frame
    magr = magr[,c((ncat+1):(dim(magr)[2]),1:ncat),drop=FALSE]
    larg = c(lapply(magr,unlist),list(stringsAsFactors=FALSE))
    magr = do.call(data.frame,larg)

    # set motus ids to the ids computed from modalities
    magr$id=ids
    rownames(magr)=ids

    # build the new metabarcoding data instance
    newdata = copy.metabarcoding.data(x,reads=ragr,motus=magr,layers=la)
  }

  return(newdata)
}
#' @import ROBITaxonomy
#' @include 02_class_metabarcoding.data.R
NULL
#' Choose between databases for taxonomic classifications
#'
#' Chooses a sequence taxonomic assignment in order of preference for the different
#' reference databases that have been used when the assignment is above a certain threshold
#'
#'
#' @param x a \code{\link{metabarcoding.data}} object
#' @param taxonomy a \code{\linkS4class{taxonomy.obitools}} instance
#' @param dbrank string or vector indicating reference database names ranked by order of preference
#' @param thresh a best_identity threshold for applying priority. Default is \code{0.95}
#'
#' @return returns a data.frame with the refined taxonomic assignment and classic taxonomy description.
#'
#' @examples
#'
#' data(termes)
#'
#' taxo=default.taxonomy()
#'
#' #create artificial taxonomic assignments
#' attr(termes, "motus")["best_identity:DB1"] = sample(seq(0.5,1,0.001),size=nrow(termes$motus), replace=T)
#' attr(termes, "motus")["best_identity:DB2"] = sample(seq(0.5,1,0.001),size=nrow(termes$motus), replace=T)
#' attr(termes, "motus")["best_identity:DB3"] = sample(seq(0.5,1,0.001),size=nrow(termes$motus), replace=T)
#' attr(termes, "motus")["taxid_by_db:DB1"] = termes$motus$taxid
#' attr(termes, "motus")["taxid_by_db:DB2"] = sample(termes$motus$taxid,size=nrow(termes$motus), replace=F)
#' attr(termes, "motus")["taxid_by_db:DB3"] = sample(termes$motus$taxid,size=nrow(termes$motus), replace=F)
#'
#' #Run taxo.decider
#' termes.ok = taxo.decider(termes, taxo, "DB2", 0.95)
#' head(termes.ok$motus[union(grep("DB", colnames(termes.ok$motus)), grep("_ok", colnames(termes.ok$motus)))])
#'
#' termes.ok = taxo.decider(termes, taxo, c("DB3", "DB1"), 0.95)
#' head(termes.ok$motus[union(grep("DB", colnames(termes.ok$motus)), grep("_ok", colnames(termes.ok$motus)))])
#'
#' #Quick look at the enhancement in taxonomic assignements
#' par(mfrow=c(1,4))
#' for(i in grep("best_identity.", colnames(termes.ok$motus))){
#' hist(termes.ok$motus[,i], breaks=20, ylim=c(1,21), main=colnames(termes.ok$motus)[i], xlab="assignment score")
#' }
#'
#' @seealso \code{\linkS4class{taxonomy.obitools}}, and methods \code{\link{species}},\code{\link{genus}}, \code{\link{family}},\code{\link{kingdom}},
#' \code{\link{superkingdom}},\code{\link{taxonatrank}}, \code{\link{taxonmicank}}
#'
#' @author Lucie Zinger
#' @keywords taxonomy
#'
#' @export
#'
taxo.decider = function(x, taxonomy, dbrank, thresh=0.95) {
  # Assignment scores are expected in MOTU columns named
  # "best_identity:<DB>" and the matching taxids in "taxid_by_db:<DB>".
  noms = colnames(x$motus)
  best_ids_names = noms[grep("best_identity.", noms)]
  best_ids = x$motus[,best_ids_names]
  taxids = x$motus[, gsub("best_identity", "taxid_by_db", best_ids_names)]
  # Database names: the part after the ':' of each best_identity column name
  dbs = unlist(lapply(strsplit(best_ids_names, "\\:"), "[[", 2))

  # Set max indices:
  # for each sequence (row), the column index of the database with the
  # highest best_identity score; ties are apparently broken by the order
  # of preference given in 'dbrank' -- TODO confirm
  ind = as.vector(t(apply(best_ids,1,function(y) order(rank(-y, ties.method="max"), match(dbrank, dbs))))[,1])

  # Set default vectors: db name, best_identity and taxid of the
  # highest-scoring database for each sequence
  db_ok = dbs[ind]
  best_identity_ok = best_ids[cbind(1:length(ind), ind)]
  taxids_by_db_ok = taxids[cbind(1:length(ind), ind)]

  # Get, per sequence, the index in 'dbrank' of the first preferred
  # database whose score exceeds 'thresh' (NA when none does)
  db_choice = taxo.decider.routine(dbrank, best_ids, dbs, thresh)

  # Wherever a preferred database passes the threshold, override the
  # max-score default with that database's values
  for(i in 1:length(dbrank)){
    db_ok[which(db_choice==i)] = dbrank[i]
    best_identity_ok[which(db_choice==i)] = best_ids[which(db_choice==i),grep(dbrank[i], colnames(best_ids))]
    taxids_by_db_ok[which(db_choice==i)] = taxids[which(db_choice==i),grep(dbrank[i], colnames(taxids))]
  }

  # Append the decision columns, then the classic taxonomy description
  # derived from the chosen taxids, to the MOTU table
  decision = data.frame(db_ok, best_identity_ok, taxids_by_db_ok)
  coltaxid = colnames(decision)[grep("taxid", colnames(decision))]
  attr(x, "motus") = data.frame(x$motus, decision)
  new.tax = get.classic.taxonomy(x, taxonomy, coltaxid)
  attr(x, "motus") = data.frame(x$motus, new.tax)
  return(x)
}
# Helper for taxo.decider: for each sequence (row of best_ids), returns
# the index in 'dbrank' of the first database, in order of preference,
# whose best_identity score exceeds 'thresh', or NA when none does.
#
# dbrank:   character vector of database names, ranked by preference
# best_ids: matrix/data.frame of best_identity scores (one column per db)
# dbs:      character vector giving the database of each best_ids column
# thresh:   score threshold
taxo.decider.routine = function(dbrank, best_ids, dbs, thresh) {
  # Logical mask: one column per candidate database (in preference
  # order), TRUE where that database's score passes the threshold
  mask = matrix(NA, nrow(best_ids), length(dbrank))
  colnames(mask) = dbrank

  for (db in dbrank) {
    mask[, db] = best_ids[, which(dbs == db)] > thresh
  }

  # First TRUE per row; which() drops NAs, and [1] on an empty match
  # yields NA_integer_ (fixed the 'T' literal of the original code)
  out = apply(mask, 1, function(row) which(row)[1])
  return(out)
}
#' @include 02_class_metabarcoding.data.R
NULL
#' Detects contaminants in metabarcoding data
#'
#' Detects sequences/motus in a \code{\link{metabarcoding.data}} object
#' for which frequencies over the entire dataset are maximum in negative controls and
#' hence, most likely to be contaminants.
#'
#'
#' @param x a \code{\link{metabarcoding.data}} object
#' @param controls a vector of samples names where conta are suspected to be detected
#' (typically negative control names).
#' @param clust a vector for grouping sequences. Default set to \code{NULL}.
#'
#' @return a vector containing the names of sequences identified as contaminants
#'
#' @examples
#'
#' data(termes)
#' termes.ok = termes[,colSums(termes$reads)>0]
#' neg = rownames(termes.ok)[grep("r",rownames(termes.ok))]
#'
#' #finds contaminants based on neg samples
#' contaslayer(termes.ok, neg)
#'
#' # extending contaminant detection with a grouping factor,
#' # typically obiclean/sumatra cluster or taxonomy membership
#' contaslayer(termes.ok, neg, termes.ok$motus$scientific_name)
#'
#' @seealso \code{\link{threshold}} for further trimming
#' @author Lucie Zinger
#' @export
contaslayer = function(x,controls,clust=NULL){
  # Relative frequency of each MOTU across the data set: reads are
  # normalized so that each MOTU (column) sums to 1 over all samples
  x.fcol = normalize(x, MARGIN=2)$reads

  # Name of the sample holding the maximum frequency of each MOTU.
  # BUG FIX: index the rownames vector directly; the original took
  # rownames() of a row-subset, which mangles duplicated names
  # (make.unique) when the same sample is the maximum of several MOTUs
  # and x.fcol is a data.frame, breaking the match against 'controls'.
  x.max = rownames(x.fcol)[apply(x.fcol, 2, which.max)]

  # MOTUs whose maximum frequency falls in a control sample are flagged
  # as putative contaminants
  conta = colnames(x)[!is.na(match(x.max,controls))]

  if (length(clust) > 0) {
    # Extend the contaminant list to every MOTU belonging to a cluster
    # (e.g. obiclean/sumatra cluster, taxon) that already contains a
    # detected contaminant
    agg = data.frame(conta.id=colnames(x.fcol), clust)
    conta.ext = agg$conta.id[which(!is.na(match(agg$clust, agg$clust[match(conta,agg$conta.id)])))]
    return(as.vector(conta.ext))
  }
  else {
    return(conta)
  }
}
#' @include 02_class_metabarcoding.data.R
NULL
#' Read frequencies krigging
#'
#' Extrapolates read frequencies from a \code{\link{metabarcoding.data}} object in space for a finer resolution
#'
#' @param x a vector or matrix from a row-normalized read table
#' \code{\link{metabarcoding.data}} object
#' @param min.coord a vector of length = 2 indicating the minimum values of x and y
#' coordinates to be used for the predicted grid
#' @param max.coord a vector of length = 2 indicating the maximum values of x and y
#' coordinates to be used for the predicted grid
#' @param grid.grain an integer indicating the resolution (i.e. nb of subpoints) in x and y
#' coordinates required for the predicted grid
#' @param coords a dataframe containing the x and y coordinates of the abundances
#' from x to be extrapolated.
#' @param otus.table a motus data.frame containing motus informations of x
#' @param cutoff a cutoff below which abundances are set to 0.
#' This threshold also determines the value to be added to 0 values for log10
#' transformation
#' @param return.metabarcoding.data if \code{TRUE}, returns a \code{\link{metabarcoding.data}} object. Default is \code{FALSE}
#'
#' @return either a dataframe or a S3 object with a structure similar to \code{\link{metabarcoding.data}} object.
#' The number of samples corresponds to the predicted points.
#' The two last columns (if \code{return.metabarcoding.data==F}) or sample data.frame contains x y coordinates of the predicted grid
#' The all but last two columns (if \code{return.metabarcoding.data==F}) or read matrix contains the predicted log10 transformed relative abundances
#' instead of reads counts
#' If \code{return.metabarcoding.data==F} the motus data.frame contains the motus informations from x
#'
#' @examples
#'
#' data(termes)
#' #Create dummy spatial coordinates
#' attr(termes, "samples")[c("x", "y")] = expand.grid(1:7,1:3)
#'
#' #compute frequencies
#' attr(termes, "layers")[["reads.freq"]] = normalize(termes, MARGIN=1)$reads
#'
#' # Getting extrapolations
#' termes.pred = extrapol.freq(attr(termes, "layers")[["reads.freq"]], min.coord=c(1,1), max.coord=c(7,3),
#' grid.grain=100,termes$samples[,c("x", "y")], termes$motus, cutoff=1e-3)
#'
#' head(termes.pred$reads)
#' @seealso \code{\link{map.extrapol.freq}} as well as \code{sp} and \code{gstat} packages
#' @author Lucie Zinger
#' @export
extrapol.freq = function(x, min.coord, max.coord, grid.grain=100, coords, otus.table, cutoff=1e-3, return.metabarcoding.data = FALSE) {
  # BUG FIX: require() merely returns FALSE when a package is missing;
  # fail early and loudly instead. All later calls use the gstat:: and
  # sp:: prefixes, so loading the namespaces is sufficient.
  if (!requireNamespace("gstat", quietly = TRUE))
    stop("the 'gstat' package is required by extrapol.freq", call. = FALSE)
  if (!requireNamespace("sp", quietly = TRUE))
    stop("the 'sp' package is required by extrapol.freq", call. = FALSE)

  # Predicted grid setting: grid.grain points along each axis between
  # the requested minimum and maximum coordinates
  new.x = seq(min.coord[1], max.coord[1], length.out = grid.grain)
  new.y = seq(min.coord[2], max.coord[2], length.out = grid.grain)
  grid.p = expand.grid(new.x, new.y)
  colnames(grid.p) = c("x", "y")

  # NOTE(review): 'S' is never used afterwards (prediction is done on
  # the plain grid.p data.frame) -- kept for strict backward behavior,
  # but it looks removable; confirm before deleting
  S = sp::SpatialPoints(grid.p); sp::gridded(S) <- TRUE

  # Exponential variogram model shared by all MOTUs
  m = gstat::vgm(50, "Exp", 100)

  # Kriging: one interpolation per MOTU (column of x)
  preds = apply(x, 2, function(otu) {
    # clamp low abundances to the cutoff so log10 is defined everywhere
    otu[otu < cutoff] = cutoff
    spj = cbind(coords, otu)
    colnames(spj) = c("x", "y", "otu")
    spj.g = gstat::gstat(id="Log10.freq", formula=log10(otu)~1, locations=~x+y, data=spj, model=m)
    gstat::predict.gstat(spj.g, grid.p, quiet=TRUE)$Log10.freq.pred
  })

  # Formatting the output: predicted points become the "samples"
  colnames(preds) = rownames(otus.table)
  rownames(preds) = paste("s", seq_len(nrow(grid.p)), sep=".")
  row.names(grid.p) = rownames(preds)

  if (!return.metabarcoding.data) {
    out = data.frame(preds, grid.p)
  } else {
    out = metabarcoding.data(preds, grid.p, otus.table)
  }
  return(out)
}
#' Maps of krigged log10-transformed frequencies
#'
#' Maps the output of extrapol.freq
#'
#'
#' @param x an extrapol.freq output
#' @param path the path of the folder to export the map. Default is \code{NULL} and map is printed in Rplot/quartz
#' @param col.names a vector containing the names of the columns to be used for defining the file name. Typically
#' the column names containing the taxonomic information and/or sequence/motus id.
#' @param index an integer indicating column number of the motu/sequence to be plotted.
#' @param cutoff lower motu frequency accepted to consider motu abundance as different
#' from 0. Should be the same than the one used in extrapol.freq
#' @param add.points a 3-column data.frame containing factor levels and associated x and y coordinates
#' to be added to the map. Typically taxa observed in the field.
#' @param adj a value used for adjusting text position in the map. Default is \code{4}
#'
#' @return a map/png file displaying motus distribution.
#'
#' @examples
#'
#' data(termes)
#' attr(termes, "samples")[c("x", "y")] = expand.grid(1:7,1:3)
#'
#' #compute frequencies
#' attr(termes, "layers")[["reads.freq"]] = normalize(termes, MARGIN=1)$reads
#'
#' # Getting extrapolations
#' termes.pred = extrapol.freq(attr(termes, "layers")[["reads.freq"]],
#' grid.grain=100,termes$samples[,c("x", "y")], termes$motus, cutoff=1e-3)
#'
#' #mapping the distribution of the 3 most abundant sequences (caution, mfrow does not work for lattice's levelplot)
#' map.extrapol.freq(termes.pred, path=NULL, col.name=NULL, 1, cutoff=1e-3)
#' map.extrapol.freq(termes.pred, path=NULL, col.name=NULL, 2, cutoff=1e-3)
#' map.extrapol.freq(termes.pred, path=NULL, col.name=NULL, 3, cutoff=1e-3)
#'
#' #dummy observationnal data
#' termes.obs = data.frame(x=c(2,3,5), y=c(2.7,2,2.6), taxa = rep("Isoptera Apicotermitinae", 3))
#' map.extrapol.freq(termes.pred, path=NULL, col.name=NULL, 3, cutoff=1e-3, add.points=termes.obs)
#'
#' @seealso \code{\link{extrapol.freq}}, and \code{levelplot} from \code{lattice} package
#' @author Lucie Zinger
#' @export
map.extrapol.freq = function(x, path=NULL, col.name=NULL, index, cutoff=1e-3, add.points=NULL, adj=4) {
require(lattice)
if(!is.null(path)) {
x.motus = apply(x$motus,2,as.character)
name = gsub("\\.", "_", paste(gsub(", ", "_", toString(x.motus[index,col.name])), x.motus[index,"id"], sep="_"))
file.out = paste(path, "/", name, ".png", sep="")
}
z=x$reads[,index]
z[abs(z)>abs(log10(cutoff))]=log10(cutoff)
z[z>0] = 0
spj=as.data.frame(cbind(x$samples,z))
colnames(spj)=c("x", "y", "z")
map.out=levelplot(z~x+y, spj, col.regions=topo.colors(100),
at=seq(log10(cutoff),log10(1), by=0.2),
colorkey=list(at=seq(log10(cutoff),log10(1), by=0.2),
labels=list(at=seq(log10(cutoff),log10(