multivariate.R 12 KB
Newer Older
1
#' @include procmod_frame.R
2 3 4 5
#' @import MASS
#'
NULL

6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
#' Converts a \code{dist} object to a \code{data.frame} object.
#'
#' The created \code{data.frame} has a attribute \code{is.dist} set to
#' the logical value \code{TRUE}.
#'
#' @param x the \code{dist} object to be converted
#' @param row.names NULL or a \code{character} vector giving the row
#'                  names for the data frame. Missing values
#'                  are not allowed.
#' @param optional  logical. If \code{TRUE}, setting row names and converting
#'                  column names (to syntactic names: see make.names)
#'                  is optional. Note that all of R's base package
#'                  as.data.frame() methods use optional only for column
#'                  names treatment, basically with the meaning of
#'                  data.frame(*, check.names = !optional).
#'                  See also the make.names argument of the
#'                  \code{\link[base]{matrix}} method.
#' @param ...       additional arguments to be passed to or from methods.
#'
25 26 27 28 29 30 31 32 33 34
#' @examples
#' data(bacteria)
#' bacteria_rel_freq <- sweep(bacteria,
#'                            1,
#'                            rowSums(bacteria),
#'                            "/")
#' bacteria_hellinger <- sqrt(bacteria_rel_freq)
#' bacteria_dist <- dist(bacteria_hellinger)
#' bdf <- as.data.frame(bacteria_dist)
#'
35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
#' @author Eric Coissac
#' @author Christelle Gonindard-Melodelima
#' @export
as.data.frame.dist <- function(x, row.names = NULL, optional = FALSE, ...) {

  x <- as.data.frame(
    as.matrix(x),
    row.names = row.names,
    optional = optional,
    ...)

  attr(x, "is.dist") <- TRUE

  x
}

51 52 53

#' Project a distance matrix in a euclidean space (NMDS).
#'
Eric Coissac committed
54 55 56 57
#' Project a set of points defined by a distance matrix in
#' an eucleadean space using the Kruskal's Non-metric
#' Multidimensional Scaling. This function is mainly a simplified
#' interface on the \code{\link[MASS]{isoMDS}} function using as
58
#' much as possible dimensions to limit the stress. The aims of this
Eric Coissac committed
59 60 61 62
#' NDMS being only to project point in an orthogonal space therefore
#' without any correlation between axis. Because a non-metric method
#' is used no condition is required on the used distance.
#'
63 64 65 66 67 68 69 70
#' @param distances   a \code{\link[stats]{dist}} object or a
#'                    \code{\link[base]{matrix}}
#'                    object representing a distance matrix.
#' @param maxit       The maximum number of iterations.
#' @param trace       Logical for tracing optimization. Default \code{TRUE}.
#' @param tol         convergence tolerance.
#' @param p           Power for Minkowski distance in the configuration space.
#'
71 72 73 74
#' @return a numeric matrix with at most \code{n-1} dimensions, with
#'         \code{n} the number pf observations. This matrix defines the
#'         coordinates of each point in the orthogonal space.
#'
75 76 77 78 79 80 81 82 83 84 85
#' @examples
#' data(bacteria)
#' bacteria_rel_freq <- sweep(bacteria,
#'                            1,
#'                            rowSums(bacteria),
#'                            "/")
#' bacteria_hellinger <- sqrt(bacteria_rel_freq)
#' bacteria_dist <- dist(bacteria_hellinger)
#'
#' project <- nmds(bacteria_dist)
#'
86 87 88
#' @author Eric Coissac
#' @author Christelle Gonindard-Melodelima
#' @export
89
nmds <- function(distances,
90
                 maxit = 100, trace = FALSE,
91 92 93 94 95 96
                 tol = 0.001, p = 2) {


  if (inherits(distances, "matrix")) {
    distances <- as.dist(distances)
  }
97

98
  stopifnot(inherits(distances, "dist"))
99

100
  k <- attributes(distances)$Size - 1
101

102 103
  y <- suppressWarnings(cmdscale(distances, k))
  k <- ncol(y)
104

105 106 107 108 109 110 111 112
  n <- isoMDS(distances,
    y = y,
    k = k,
    maxit = maxit,
    trace = trace,
    tol = tol,
    p = p
  )
113

114 115 116
  p <- n$points
  attributes(p)$stress <- n$stress
  attributes(p)$projected <- "nmds"
117

118
  p
119 120 121 122
}

#' Project a distance matrix in a euclidean space (PCOA).
#'
Eric Coissac committed
123 124 125 126 127 128 129 130
#' Project a set of points defined by a distance matrix in
#' an eucleadean space using the Principal Coordinates Analysis
#' method. This function is mainly a simplified
#' interface on the \code{\link[stats]{cmdscale}} function using as
#' much as possible dimensions for the projection. The aims of this
#' PCoA being only to project point in an orthogonal space therefore
#' without any correlation between axis. Because a metric method
#' is used the used distance must be euclidean
131
#' (cf \code{\link[ProcMod]{is_euclid}}).
Eric Coissac committed
132
#'
133 134 135 136 137 138 139 140
#' @param distances   a \code{\link[stats]{dist}} object or a
#'                    \code{\link[base]{matrix}}
#'                    object representing a distance matrix.
#'
#' @return a numeric matrix with at most \code{n-1} dimensions, with
#'         \code{n} the number pf observations. This matrix defines the
#'         coordinates of each point in the orthogonal space.
#'
141 142 143 144 145 146 147 148 149 150 151
#' @examples
#' data(bacteria)
#' bacteria_rel_freq <- sweep(bacteria,
#'                            1,
#'                            rowSums(bacteria),
#'                            "/")
#' bacteria_hellinger <- sqrt(bacteria_rel_freq)
#' bacteria_dist <- dist(bacteria_hellinger)
#'
#' project <- pcoa(bacteria_dist)
#'
152 153 154
#' @author Eric Coissac
#' @author Christelle Gonindard-Melodelima
#' @export
155 156 157 158 159
pcoa <- function(distances) {

  if (inherits(distances, "matrix")) {
    distances <- as.dist(distances)
  }
160

161
  stopifnot(inherits(distances, "dist"))
162

163 164 165
  k <- attributes(distances)$Size - 1
  x <- suppressWarnings(cmdscale(distances, k = k))
  attributes(x)$projected <- "pcoa"
166

167
  x
168 169 170 171
}

#' Project a set of points in a euclidean space (PCA).
#'
172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188
#' Project a set of points defined by a set of numeric variables in
#' an eucleadean space using the pricipal componant analysis.
#' This function is mainly a simplified
#' interface on the \code{\link[stats]{prcomp}} function using as
#' much as possible dimensions to keep all the variation. The aims of this
#' PCA being only to project point in an orthogonal space therefore
#' without any correlation between axis. Data are centered by not scaled by
#' default.
#'
#' @param data a numeric matrix describing the points
#' @param scale a \code{logical} value indicating if the dimensions must be scaled
#'        to force for every column that \code{sd=1}. \code{FALSE} by default.
#'
#' @return a numeric matrix with at most \code{n-1} dimensions, with
#'         \code{n} the number pf observations. This matrix defines the
#'         coordinates of each point in the orthogonal space.
#'
189 190 191 192 193 194 195 196 197 198
#' @examples
#' data(bacteria)
#' bacteria_rel_freq <- sweep(bacteria,
#'                            1,
#'                            rowSums(bacteria),
#'                            "/")
#' bacteria_hellinger <- sqrt(bacteria_rel_freq)
#'
#' project <- pca(bacteria_hellinger)
#'
199 200 201
#' @author Eric Coissac
#' @author Christelle Gonindard-Melodelima
#' @export
202 203 204 205 206 207 208 209 210 211 212 213 214
pca <- function(data, scale = FALSE) {

  p <- prcomp(data,
    retx = TRUE,
    center = TRUE,
    scale. = scale,
    tol = 0
  )

  x <- p$x
  attributes(x)$projected <- "pca"

  x
215 216 217 218 219 220 221 222 223
}

#' Double centering of a matrix.
#'
#' colSums and rowSums of the returned matrix are all equal to zero.
#'
#' Inspired from the algorithm described in stackoverflow
#' \url{https://stackoverflow.com/questions/43639063/double-centering-in-r}
#'
224 225 226 227 228
#' @param m a \code{numeric} matrix
#'
#' @return a \code{numeric} matrix centred by rows
#'           and columns
#'
229 230 231 232 233 234
#' @examples
#' data(bacteria)
#' bact_bc <- bicenter(bacteria)
#' sum(rowSums(bact_bc))
#' sum(colSums(bact_bc))
#'
235 236
#' @author Eric Coissac
#' @author Christelle Gonindard-Melodelima
237
#' @export
238
bicenter <- function(m) {
239 240

  # compute the row-wise and column-wise mean matrices
241 242 243
  m0 <- m * 0
  R <- m0 + rowMeans(m)
  C <- t(m0 + colMeans(m))
244 245

  # substract them and add the grand mean
246
  m - R - C + mean(m[])
247 248 249 250 251 252 253
}

#' Test if the distance matrix is euclidean.
#'
#' Actually a simplified version of the ADE4 implementation
#' (\code{\link[ade4]{is.euclid}}).
#'
254 255 256 257 258
#' @param distances	an object of class 'dist'
#' @param tol	a tolerance threshold : an eigenvalue is
#'        considered positive if it is larger than
#'        -tol*lambda1 where lambda1 is the largest eigenvalue.
#'
259 260 261 262 263 264 265 266 267 268 269 270 271 272 273
#' @examples
#' library(vegan)
#' data(bacteria)
#'
#' bacteria_rel_freq <- sweep(bacteria,
#'                            1,
#'                            rowSums(bacteria),
#'                            "/")
#'
#' bacteria_bray <- vegdist(bacteria_rel_freq,method = "bray")
#' is_euclid(bacteria_bray)
#'
#' bacteria_chao <- vegdist(floor(bacteria*10000),method = "chao")
#' is_euclid(bacteria_chao)
#'
274 275 276
#' @author Eric Coissac
#' @author Christelle Gonindard-Melodelima
#' @export
277
is_euclid <- function(distances, tol = 1e-07) {
278
  if (!inherits(distances, "dist")) {
279
    stop("Object of class 'dist' expected")
280
  }
281

282
  if (any(distances < tol)) {
283
    warning("Zero distance(s)")
284
  }
285 286 287 288 289

  distances <- as.matrix(distances)
  n <- ncol(distances)
  delta <- -0.5 * ProcMod::bicenter(distances * distances)
  lambda <- eigen(delta, symmetric = TRUE, only.values = TRUE)$values
290 291 292
  w0 <- lambda[n] / lambda[1]

  w0 > -tol
293 294
}

295
is_orthogonal <- function(x) {
296
  stopifnot(is_procmod_frame(x))
297 298

  !is.null(attr(x, "projected"))
Eric Coissac committed
299 300 301
}

#' Project a dataset in a euclidean space.
302
#'
Eric Coissac committed
303 304 305 306 307 308 309
#' Project a set of points defined by a distance matrix
#' or a set of variables in an eucleadean space.
#' If the distance matrix is a metric, this is done using
#' the \code{\link[ProcMod]{pcoa}} function,
#' for other distance the \code{\link[ProcMod]{nmds}} is used.
#' When points are described by a set of variable the
#' \code{\link[ProcMod]{nmds}} is used.
310
#'
311 312 313 314 315 316 317
#' @param data a numeric matrix describing the points
#' @param tol	a tolerance threshold : an eigenvalue is
#'        considered positive if it is larger than
#'        -tol*lambda1 where lambda1 is the largest eigenvalue.
#' @param scale a \code{logical} value indicating if the dimensions must be scaled
#'        to force for every column that \code{sd=1}. \code{FALSE} by default.
#'
318 319
#' @param ... other parameters specific to some implementation
#'
320 321 322 323
#' @return a numeric matrix with at most \code{n-1} dimensions, with
#'         \code{n} the number pf observations. This matrix defines the
#'         coordinates of each point in the orthogonal space.
#'
324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350
#' @examples
#' library(vegan)
#' data(bacteria)
#' data(eukaryotes)
#' data(soil)
#'
#' dataset <- procmod_frame(euk = vegdist(decostand(eukaryotes,
#'                                                  method = "hellinger"),
#'                                        method = "euclidean"),
#'                          bac = vegdist(decostand(bacteria,
#'                                                  method = "hellinger"),
#'                                        method = "euclidean"),
#'                          soil = scale(soil,
#'                                       center = TRUE,
#'                                       scale  = TRUE))
#'
#' dp <- ortho(dataset)
#'
#' bacteria_rel_freq <- sweep(bacteria,
#'                            1,
#'                            rowSums(bacteria),
#'                            "/")
#' bacteria_hellinger <- sqrt(bacteria_rel_freq)
#' bacteria_dist <- dist(bacteria_hellinger)
#'
#' project <- ortho(bacteria_dist)
#'
351 352 353
#' @author Eric Coissac
#' @author Christelle Gonindard-Melodelima
#' @export
354 355
ortho <- function(data, ...) {
  UseMethod("ortho", data)
Eric Coissac committed
356
}
357

358
#' @rdname ortho
Eric Coissac committed
359
#' @export
360
ortho.dist <- function(data, tol = 1e-7, ...) {
361
  if (ProcMod::is_euclid(data, tol = tol)) {
362
    return(ProcMod::pcoa(data))
363 364
  }

365
  ProcMod::nmds(data)
Eric Coissac committed
366 367
}

368
#' @rdname ortho
Eric Coissac committed
369
#' @export
370 371
ortho.matrix <- function(data, scale = FALSE, ...) {
  pca(data, scale = scale)
Eric Coissac committed
372 373
}

374
#' @rdname ortho
Eric Coissac committed
375
#' @export
376 377 378 379
ortho.data.frame <- function(data, scale = FALSE, ...) {
  if (!is.null(attributes(data)$is.dist) &&
    attributes(data)$is.dist == TRUE) {
    return(ortho(as.dist(as.matrix(data))))
380
  }
Eric Coissac committed
381

382
  pca(as.matrix(data), scale = scale)
Eric Coissac committed
383 384 385
}


386
#' @rdname ortho
Eric Coissac committed
387
#' @export
388
ortho.procmod_frame <- function(data, ...) {
389
  if (is_orthogonal(data)) {
390
    return(data)
391
  }
Eric Coissac committed
392

393
  n <- ncol(data)
Eric Coissac committed
394

395
  p <- vector(mode = "character", length = n)
Eric Coissac committed
396
  for (i in seq_len(n)) {
397
    xt <- ortho(data[[i]])
398
    p[i] <- attributes(xt)$projected
399
    data[[i]] <- xt
Eric Coissac committed
400 401
  }

402 403
  names(p) <- names(data)
  attributes(data)$projected <- p
Eric Coissac committed
404

405
  data
406
}