Commit 050956c0 by Eric Coissac

Some corrections on the lecture

parent cf80ff9f
......@@ -7,5 +7,6 @@ export(H_spectrum)
export(exp_q)
export(log_q)
export(mode)
export(norm)
export(tag_bad_pcr)
importFrom(Rdpack,reprompt)
#' @export
norm <- function(data,l=2) {
no <- function(x,y) sum(abs(data[x,]-data[y,])^l)^(1/l)
n = nrow(data)
d = matrix(0,nrow = n,ncol = n)
for (i in 1:n)
for (j in i:n) {
d[i,j] <- no(i,j)
d[j,i] <- d[i,j]
}
rownames(d) = rownames(data)
colnames(d) = rownames(data)
as.dist(d)
}
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -28,6 +28,7 @@ opts_chunk$set(echo = FALSE,
# Summary
- The MetabarSchool Package
- What do the reading numbers per PCR mean?
- Rarefaction vs. relative frequencies
- alpha diversity metrics
......@@ -35,6 +36,28 @@ opts_chunk$set(echo = FALSE,
- multidimentionnal analysis
- comparison between datasets
# The MetabarSchool Package
## Instaling the package
You need the *devtools* package
```{r eval=FALSE, echo=TRUE}
install.packages("devtools",dependencies = TRUE)
```
Then you can install *MetabarSchool*
```{r eval=FALSE, echo=TRUE}
devtools::install_git("https://git.metabarcoding.org/MetabarcodingSchool/biodiversity-metrics.git")
```
You will also need the *vegan* package
```{r eval=FALSE, echo=TRUE}
install.packages("vegan",dependencies = TRUE)
```
# The dataset
## The mock community {.flexbox .vcenter .smaller}
......@@ -68,6 +91,8 @@ data("positive.samples")
## Loading data
```{r echo=TRUE}
library(MetabarSchool)
data("positive.count")
data("positive.samples")
data("positive.motus")
......@@ -94,6 +119,8 @@ positive.count[1:5,1:5]
## Loading data
```{r echo=TRUE}
library(MetabarSchool)
data("positive.count")
data("positive.samples")
data("positive.motus")
......@@ -119,6 +146,8 @@ head(positive.samples,n=3)
## Loading data
```{r echo=TRUE}
library(MetabarSchool)
data("positive.count")
data("positive.samples")
data("positive.motus")
......@@ -169,7 +198,7 @@ positive.motus = positive.motus[are.not.singleton,]
$`r nrow(positive.count)` \; PCRs \; \times \; `r ncol(positive.count)` \; MOTUs$
matrix
## Not all the PCR have the number of reads {.flexbox .vcenter}
## Not all the PCR have the same number of reads {.flexbox .vcenter}
Despite all standardization efforts
......@@ -240,13 +269,19 @@ positive.count.rarefied = rrarefy(positive.count,2000)
## Rarefying read count (2) {.flexbox .vcenter}
```{r fig.height=3}
```{r fig.height=4}
par(mfrow=c(1,2),bg=NA)
hist(log10(colSums(positive.count)+1),
main = "Not rarefied",
xlim = c(0,6),
ylim = c(0,2300),
breaks = 30,
xlab = TeX("$\\log_{10}(reads per MOTUs)$"))
hist(log10(colSums(positive.count.rarefied)+1),
main = "Rarefied data",
xlim = c(0,6),
ylim = c(0,2300),
breaks = 30,
xlab = TeX("$\\log_{10}(reads per MOTUs)$"))
```
......@@ -325,11 +360,11 @@ knitr::include_graphics("figures/diversity.svg")
@Whittaker:10:00
<br><br><br><br>
- $\alpha-diversity$ : Mean diversity per site ($species/site$)
- $\alpha\text{-diversity}$ : Mean diversity per site ($species/site$)
- $\gamma-diversity$ : Regional biodiversity ($species/region$)
- $\gamma\text{-diversity}$ : Regional biodiversity ($species/region$)
- $\beta-diversity$ : $\beta = \frac{\gamma}{\alpha}$ ($site$)
- $\beta\text{-diversity}$ : $\beta = \frac{\gamma}{\alpha}$ ($sites/region$)
</div>
......@@ -410,25 +445,19 @@ kable(data.frame(`Gini-Simpson`=GS),
kable_styling(position = "center")
```
## Shanon entropy {.smaller}
## Shannon entropy {.smaller}
<div style="float: left; width: 65%;">
Shanon entropy is based on information theory.
Shannon entropy is based on information theory:
Let $X$ be a uniformly distributed random variable with values in $A$
<center>
$H^{\prime }=-\sum _{i=1}^{S}p_{i}\log p_{i}$
</center>
$$
H(X) = \log|A|
$$
<br>
</div>
<div style="float: right; width: 35%;">
if $A$ is a community where every species are equally represented then
$$
H^{\prime }=-\sum _{i=1}^{S}p_{i}\log p_{i}
H(A) = \log|A|
$$
<br>
</div>
<center>
```{r out.width = "400px"}
......@@ -441,7 +470,7 @@ H = - rowSums(environments * log(environments),na.rm = TRUE)
```
```{r}
kable(data.frame(`Shanon index`=H),
kable(data.frame(`Shannon index`=H),
format="html",
align = 'rr') %>%
kable_styling(position = "center")
......@@ -452,12 +481,12 @@ kable(data.frame(`Shanon index`=H),
<div style="float: left; width: 50%;">
As :
$$
H(X) = \log|A| \;\Rightarrow\; ^1D = e^{H(X)}
H(A) = \log|A| \;\Rightarrow\; ^1D = e^{H(A)}
$$
<br>
</div>
<div style="float: right; width: 50%;">
where $^1D$ is the theoretical number of species in a evenly distributed community that would have the same Shanon's entropy than ours.
where $^1D$ is the theoretical number of species in a evenly distributed community that would have the same Shannon's entropy than ours.
</div>
<center>
......@@ -548,10 +577,10 @@ exp_q = function(x,q=1) {
}
```
## Generalised Shanon entropy
## Generalised Shannon entropy
$$
^qH = - \sum_{i=1}^S pi \times ^q\log pi
^qH = - \sum_{i=1}^S p_i \; ^q\log p_i
$$
```{r echo=TRUE, eval=FALSE}
......@@ -616,7 +645,7 @@ abline(v=c(0,1,2),lty=2,col=4:6)
- $^0H(X) = S - 1$ : the richness minus one.
- $^1H(X) = H^{\prime}$ : the Shanon's entropy.
- $^1H(X) = H^{\prime}$ : the Shannon's entropy.
- $^2H(X) = 1 - \lambda$ : Gini-Simpson's index.
......@@ -1074,15 +1103,15 @@ BC_{jk}=\frac{\sum _{i=1}^{p}(N_{ij} - min(N_{ij},N_{ik}) + (N_{ik} - min(N_{ij}
$$
$$
BC_{jk}=\frac{\sum _{i=1}^{p}]N_{ij} - N_{ik}|}{\sum _{i=1}^{p}N_{ij}+\sum _{i=1}^{p}N_{ik}}
BC_{jk}=\frac{\sum _{i=1}^{p}|N_{ij} - N_{ik}|}{\sum _{i=1}^{p}N_{ij}+\sum _{i=1}^{p}N_{ik}}
$$
$$
BC_{jk}=\frac{\sum _{i=1}^{p}]N_{ij} - N_{ik}|}{1+1}
BC_{jk}=\frac{\sum _{i=1}^{p}|N_{ij} - N_{ik}|}{1+1}
$$
$$
BC_{jk}=\frac{1}{2}\sum _{i=1}^{p}]N_{ij} - N_{ik}|
BC_{jk}=\frac{1}{2}\sum _{i=1}^{p}|N_{ij} - N_{ik}|
$$
## Principale coordinate analysis (1) {.flexbox .vcenter}
......@@ -1109,7 +1138,7 @@ plot(guiana.bc.pcoa$points[,1:2],
xlab="Axis 1",
ylab="Axis 2",
main = "Bray Curtis on Rel. Freqs")
plot(guiana.euc.pcoa$points[,1:2],
plot(guiana.euc.pcoa$points[,1],-guiana.euc.pcoa$points[,2],
col = samples.type,
asp = 1,
xlab="Axis 1",
......@@ -1123,7 +1152,7 @@ plot(guiana.jac.1.pcoa$points[,1:2],
xlab="Axis 1",
ylab="Axis 2",
main = "Jaccard on presence (0.1%)")
plot(guiana.jac.10.pcoa$points[,1:2],
plot(-guiana.jac.10.pcoa$points[,1],guiana.jac.10.pcoa$points[,2],
col = samples.type,
asp = 1,
xlab="Axis 1",
......@@ -1162,6 +1191,58 @@ plot(0,type='n',axes=FALSE,ann=FALSE)
legend("topleft",legend = levels(samples.type),fill = 1:4,cex=1.2)
```
<!---
## Computation of norms
```{r guiana_norm, echo=TRUE}
guiana.n1.dist = norm(guiana.relfreq.final,l=1)
guiana.n2.dist = norm(guiana.relfreq.final^(1/2),l=2)
guiana.n3.dist = norm(guiana.relfreq.final^(1/3),l=3)
guiana.n4.dist = norm(guiana.relfreq.final^(1/100),l=100)
```
## pCoA on norms
```{r dependson="guiana_norm"}
guiana.n1.pcoa = cmdscale(guiana.n1.dist,k=3,eig = TRUE)
guiana.n2.pcoa = cmdscale(guiana.n2.dist,k=3,eig = TRUE)
guiana.n3.pcoa = cmdscale(guiana.n3.dist,k=3,eig = TRUE)
guiana.n4.pcoa = cmdscale(guiana.n4.dist,k=3,eig = TRUE)
```
```{r}
par(mfrow=c(2,3),bg=NA)
plot(guiana.n1.pcoa$points[,1],guiana.n1.pcoa$points[,2],
col = samples.type,
asp = 1,
xlab="Axis 1",
ylab="Axis 2",
main = "Norm 1 on Hellinger")
plot(guiana.n2.pcoa$points[,1],-guiana.n2.pcoa$points[,2],
col = samples.type,
asp = 1,
xlab="Axis 1",
ylab="Axis 2",
main = "Norm 2 on Hellinger")
plot(0,type='n',axes=FALSE,ann=FALSE)
legend("topleft",legend = levels(samples.type),fill = 1:4,cex=1.2)
plot(-guiana.n3.pcoa$points[,1],-guiana.n3.pcoa$points[,2],
col = samples.type,
asp = 1,
xlab="Axis 1",
ylab="Axis 2",
main = "Norm 3 on Hellinger")
plot(-guiana.n4.pcoa$points[,1],-guiana.n4.pcoa$points[,2],
col = samples.type,
asp = 1,
xlab="Axis 1",
ylab="Axis 2",
main = "Norm 4 on Hellinger")
```
--->
## Comparing diversity of the environments
```{r}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment