Commit d955695d by Eric Coissac

Delete debroussaillage.Rmd

parent a2c28e33
---
title: "Untitled"
author: "Christelle & Eric"
date: "27/03/2018"
output:
  html_document: default
---
```{r setup, include=FALSE}
# Global knitr options: echo code chunks in the rendered document.
knitr::opts_chunk$set(echo = TRUE)
```
```{r}
library(ade4)
library(vegan)
```
```{r}
# Load the two community tables (reads per plot) and the environmental
# table; the first CSV column holds the plot identifiers used as row
# names.  `header = TRUE` spelled out: the previous `head=TRUE` only
# worked through partial argument matching.
# NOTE(review): "biogegraphy" looks like a typo in the file name, but it
# is kept as-is since it must match the actual file on disk — confirm.
euk <- read.csv("australia.euk.reads.plot.csv", header = TRUE, row.names = 1)
bac <- read.csv("australia.bac.reads.plot.csv", header = TRUE, row.names = 1)
env <- read.csv("biogegraphy_and_environment.csv", header = TRUE, row.names = 1)
```
```{r}
# Keep only the numeric environmental descriptors, then align the two
# community tables on the plots kept in `env` (same rows, same order).
# `is.numeric()` replaces the fragile `class(x) == "numeric"` test,
# which silently discarded integer columns produced by read.csv().
env <- env[, vapply(env, is.numeric, logical(1))]
euk <- euk[rownames(env), ]
bac <- bac[rownames(env), ]
```
```{r}
# Standardise every environmental variable to a z-score:
# centre each column on its mean, then divide by its standard deviation.
env <- sweep(env, MARGIN = 2, colMeans(env), "-")
env <- sweep(env, MARGIN = 2, apply(env, 2, sd), "/")
```
```{r}
# Square-root transform of the eukaryote reads to damp the influence of
# very abundant taxa.  (Column standardisation was tried here and
# deliberately left disabled.)
euk <- sqrt(euk)
```
```{r}
# Same square-root transform for the bacterial reads.  (Column
# standardisation was tried here and deliberately left disabled.)
bac <- sqrt(bac)
```
```{r}
# Jaccard distances between plots on the eukaryote table, ordinated by
# principal coordinates analysis (all axes kept with full = TRUE).
euk.dist <- vegdist(euk, method = "jaccard")
euk.pco <- dudi.pco(euk.dist, full = TRUE)
euk.pco.li <- euk.pco$li
dim(euk.pco.li)
# Display plot names (not points) on the first two PCoA axes.
plot(euk.pco.li[, 1:2], cex = 0)
text(euk.pco.li[, 1:2], labels = rownames(euk.pco.li), cex = 0.6)
```
```{r}
# Same PCoA on Jaccard distances for the bacterial table.
bac.dist <- vegdist(bac, method = "jaccard")
bac.pco <- dudi.pco(bac.dist, full = TRUE)
bac.pco.li <- bac.pco$li
dim(bac.pco.li)
# Display plot names (not points) on the first two PCoA axes.
plot(bac.pco.li[, 1:2], cex = 0)
text(bac.pco.li[, 1:2], labels = rownames(bac.pco.li), cex = 0.6)
```
```{r}
# PCA of the standardised environmental variables (scannf = FALSE
# suppresses the interactive eigenvalue screen; nf asks for all axes).
env.pca <- dudi.pca(env, scannf = FALSE, nf = nrow(env) - 1)
env.pca.li <- env.pca$li
dim(env.pca.li)
# Display plot names (not points) on the first two PCA axes.
plot(env.pca.li[, 1:2], cex = 0)
text(env.pca.li[, 1:2], labels = rownames(env.pca.li), cex = 0.6)
```
```{r}
# Total sums of squares of the two PCoA coordinate tables.
euk.sct <- sum(euk.pco.li^2)
bac.sct <- sum(bac.pco.li^2)
```
```{r}
# Procrustes rotation of the bacterial ordination onto the eukaryote one.
x <- procrustes(euk.pco.li, bac.pco.li, scale = TRUE)
```
```{r}
# Procrustes rotation of the eukaryote ordination onto the bacterial one.
y <- procrustes(bac.pco.li, euk.pco.li, scale = TRUE)
```
```{r}
# Procrustean R^2: share of the eukaryote ordination variance explained
# by the rotated bacterial configuration.
# FIX: the previous version computed `euk.pco.li - colMeans(euk.pco.li)`,
# which recycles the mean vector column-major down the rows instead of
# centring each column; sweep() does the per-column centring correctly.
# (PCoA axes are already centred, so the value is unchanged in practice.)
SCR.E.B = sum((euk.pco.li - x$Yrot)^2)
SCT.E = sum(sweep(euk.pco.li, 2, colMeans(euk.pco.li), "-")^2)
1 - SCR.E.B / SCT.E
```
```{r}
# Procrustean R^2: share of the bacterial ordination variance explained
# by the rotated eukaryote configuration.
# FIX: per-column centring via sweep() instead of the column-major
# recycling done by `bac.pco.li - colMeans(bac.pco.li)`.
SCR.B.E = sum((bac.pco.li - y$Yrot)^2)
SCT.B = sum(sweep(bac.pco.li, 2, colMeans(bac.pco.li), "-")^2)
1 - SCR.B.E / SCT.B
```
```{r}
# Procrustes rotation of configuration Y onto target X, adapted from
# vegan::procrustes.  Returns a "procrustes" object with the rotated
# Y (`Yrot`), rotation matrix, scaling factor, translation and the
# residual sum of squares (`ss`).
# Fixes over the copied draft:
#   * the scaling factor is named `scl` instead of `c`, which masked
#     base::c() inside the function;
#   * the stray debug `print(A)` is removed;
#   * "comform" typo fixed in the warning message.
myproc = function (X, Y, scale = TRUE, symmetric = FALSE, scores = "sites",
                   ...)
{
    # Extract site scores if X/Y are ordination objects (vegan::scores).
    X <- scores(X, display = scores, ...)
    Y <- scores(Y, display = scores, ...)
    if (nrow(X) != nrow(Y))
        stop("Matrices have different number of rows: ", nrow(X),
             " and ", nrow(Y))
    # Pad X with zero columns so both configurations have the same
    # number of axes.
    if (ncol(X) < ncol(Y)) {
        warning("X has fewer axes than Y: X adjusted to conform Y\n")
        addcols <- ncol(Y) - ncol(X)
        for (i in seq_len(addcols)) X <- cbind(X, 0)
    }
    ctrace <- function(MAT) sum(MAT^2)
    scl <- 1
    if (symmetric) {
        # Symmetric analysis: centre both matrices and rescale them to
        # unit total sum of squares before the rotation.
        X <- scale(X, scale = FALSE)
        Y <- scale(Y, scale = FALSE)
        X <- X / sqrt(ctrace(X))
        Y <- Y / sqrt(ctrace(Y))
    }
    xmean <- apply(X, 2, mean)
    ymean <- apply(Y, 2, mean)
    if (!symmetric) {
        X <- scale(X, scale = FALSE)
        Y <- scale(Y, scale = FALSE)
    }
    # Optimal orthogonal rotation from the SVD of the cross-product.
    XY <- crossprod(X, Y)
    sol <- svd(XY)
    A <- sol$v %*% t(sol$u)
    if (scale) {
        # Least-squares scaling factor.
        scl <- sum(sol$d) / ctrace(Y)
    }
    Yrot <- scl * Y %*% A
    b <- xmean - scl * ymean %*% A  # translation term
    # Residual sum of squares after scaling and rotation.
    R2 <- ctrace(X) + scl * scl * ctrace(Y) - 2 * scl * sum(sol$d)
    reslt <- list(Yrot = Yrot, X = X, ss = R2, rotation = A,
                  translation = b, scale = scl, xmean = xmean,
                  symmetric = symmetric, call = match.call())
    reslt$svd <- sol
    class(reslt) <- "procrustes"
    reslt
}
```
```{r}
# Procrustean regression of a configuration Y on two explanatory
# configurations X1 and X2 (passed through ...):
#     Y ~ a1 * (X1 %*% A1) + a2 * (X2 %*% A2) + b
# Each X is first rotated onto Y by an orthogonal Procrustes rotation,
# then the two scaling coefficients a1, a2 are estimated by least
# squares, with F tests on the variance decomposition and t tests on
# the coefficients.
#
# Fixes over the previous draft (which could not run):
#   * `...` collected with list() instead of the undefined
#     get_list_from_ellipsis();
#   * row counts taken with nrow() (was `sapply(Xs, nY)` where nY is
#     not a function);
#   * X1/X2, their means, SVDs and rotations are now defined before use;
#   * matlib::inv() replaced by base solve(); matlib dependency and the
#     debug print() removed;
#   * duplicated `Fvalue` result entry renamed `Fpvalue`; `SunSq` typo
#     fixed to `SumSq`; class typo "procustes3" fixed.
myproc3 = function (Y, ...)
{
    ctrace <- function(MAT) sum(MAT^2)
    # Collect the explanatory configurations; the algebra below is
    # written for exactly two of them.
    Xs <- lapply(list(...), as.matrix)
    if (length(Xs) != 2)
        stop("exactly two explanatory matrices are required, got ",
             length(Xs))
    Y <- as.matrix(Y)
    nY <- nrow(Y)
    nXs <- vapply(Xs, nrow, integer(1))
    if (any(nXs != nY)) {
        stop("Matrices have different number of rows: ", nY,
             " and ", paste(nXs, collapse = ", "))
    }
    # Centre every configuration, keeping the means for the intercept.
    Ymean <- colMeans(Y)
    X1mean <- colMeans(Xs[[1]])
    X2mean <- colMeans(Xs[[2]])
    Y <- scale(Y, scale = FALSE)
    X1 <- scale(Xs[[1]], scale = FALSE)
    X2 <- scale(Xs[[2]], scale = FALSE)
    # Orthogonal Procrustes rotations of X1 and X2 onto Y.
    sol_yx1 <- svd(crossprod(Y, X1))
    sol_yx2 <- svd(crossprod(Y, X2))
    A_yx1 <- sol_yx1$v %*% t(sol_yx1$u)
    A_yx2 <- sol_yx2$v %*% t(sol_yx2$u)
    X1rot <- X1 %*% A_yx1
    X2rot <- X2 %*% A_yx2
    # Procrustean variances and covariances (sums of singular values).
    sol_x1x2 <- svd(crossprod(X1, X2))
    CovYX1 <- sum(sol_yx1$d)
    CovYX2 <- sum(sol_yx2$d)
    CovX1X2 <- sum(sol_x1x2$d)
    VarY <- ctrace(Y)
    VarX1 <- ctrace(X1)
    VarX2 <- ctrace(X2)
    # Closed-form solution of the 2x2 normal equations (equivalent to
    # solve(CovEx, Cov2) in the draft, without the matlib dependency).
    a1 <- (CovYX1 * VarX2 - CovYX2 * CovX1X2) / (VarX1 * VarX2 - CovX1X2^2)
    a2 <- (CovYX2 - a1 * CovX1X2) / VarX2
    b <- Ymean - a1 * X1mean %*% A_yx1 - a2 * X2mean %*% A_yx2
    # Decomposition of the total sum of squares.
    SCX1 <- sum((X1rot * a1)^2)
    SCX2 <- sum((X2rot * a2)^2)
    Yhat <- X1rot * a1 + X2rot * a2
    SCR <- sum((Y - Yhat)^2)
    SCT <- VarY
    SCI <- SCT - SCX1 - SCX2 - SCR  # shared / interaction part
    # Degrees of freedom, kept as in the original draft.
    # NOTE(review): (n-1)^2 as total df is an assumption of the draft —
    # confirm against the intended Procrustean ANOVA theory.
    ddl.t <- (nY - 1)^2
    ddl.r <- ddl.t - 3
    vt <- SCT / ddl.t
    vx1 <- SCX1
    vx2 <- SCX2
    vi <- SCI
    vr <- SCR / ddl.r
    fx1 <- vx1 / vr
    fx2 <- vx2 / vr
    fi <- vi / vr
    pf.x1 <- 1 - pf(fx1, 1, ddl.r)
    pf.x2 <- 1 - pf(fx2, 1, ddl.r)
    pf.i <- 1 - pf(fi, 1, ddl.r)
    # t tests on the two coefficients.
    ddl <- (nY - 1)^2 - 1
    sd.a1 <- sqrt((VarY / VarX1 - a1^2) / ddl)
    sd.a2 <- sqrt((VarY / VarX2 - a2^2) / ddl)
    t.a1 <- a1 / sd.a1
    t.a2 <- a2 / sd.a2
    p.a1 <- 1 - pt(t.a1, ddl)
    p.a2 <- 1 - pt(t.a2, ddl)
    res <- list(coefficients = list(a1 = a1, a2 = a2, b = b),
                sd = list(a1 = sd.a1, a2 = sd.a2),
                t = list(a1 = t.a1, a2 = t.a2),
                p.value = list(a1 = p.a1, a2 = p.a2),
                anova = list(X1 = SCX1 / SCT, X2 = SCX2 / SCT,
                             X1X2 = SCI / SCT, Res = SCR / SCT),
                SumSq = list(X1 = SCX1, X2 = SCX2, X1X2 = SCI, Res = SCR),
                MeanSq = list(X1 = vx1, X2 = vx2, X1X2 = vi, Res = vr),
                Fvalue = list(X1 = fx1, X2 = fx2, X1X2 = fi),
                Fpvalue = list(X1 = pf.x1, X2 = pf.x2, X1X2 = pf.i))
    class(res) <- "procrustes3"
    return(res)
}
```
```{r}
# Explain the eukaryote ordination from the bacterial ordination and
# the environmental PCA.
xxx = myproc3(euk.pco.li, bac.pco.li, env.pca.li)
print(xxx)
```
```{r}
# Explain the bacterial ordination from the eukaryote ordination and
# the environmental PCA.
xxx = myproc3(bac.pco.li, euk.pco.li, env.pca.li)
print(xxx)
```
```{r}
# Explain the environmental PCA from the two community ordinations.
xxx = myproc3(env.pca.li, bac.pco.li, euk.pco.li)
print(xxx)
```
## Estimation des coefficients lors d'une régression linéaire multiple (2 variables)
$$ \hat{y}=a_1 x_1 + a_2 x_2 + b$$
On cherche à minimiser l'erreur commise entre la prédiction et la vraie valeur (méthode des moindres carrés)
$$
\min_{a_1,\,a_2,\,b}
\sum_i{(y_i- \hat{y_i})^2}
=
\min_{a_1,\,a_2,\,b}
\sum_i{(y_i- (a_1 x_{1,i} + a_2 x_{2,i} + b))^2}
$$
Pour trouver les paramètres qui minimisent cette fonction, on calcule les trois dérivées partielles et on les annule :
$$
\left\{
\begin{array}{rcr}
-2 \sum_i{x_{1,i}(y_i- a_1 x_{1,i} - a_2 x_{2,i} - b)}=0 \\
-2 \sum_i{x_{2,i}(y_i- a_1 x_{1,i} - a_2 x_{2,i} - b)}=0 \\
-2 \sum_i{(y_i- a_1 x_{1,i} - a_2 x_{2,i} - b)}=0
\end{array}
\right.
$$
A partir de la dernière équation, on obtient :
$$
\overline{y}-a_1\overline{x_1}-a_2\overline{x_2}-b=0
\\
\Leftrightarrow \hat{b}=\overline{y}-a_1\overline{x_1}-a_2\overline{x_2}
$$
En remplaçant dans la première équation on obtient :
$$
\frac{\sum_i{x_{1,i}y_i}}{n}-a_1\frac{\sum_i{x_{1,i}^2}}{n}-a_2\frac{\sum_i{x_{1,i}x_{2,i}}}{n}-b\frac{\sum_i{x_{1,i}}}{n}=0
\\
\Leftrightarrow
\frac{\sum_i{x_{1,i}y_i}}{n}-a_1\frac{\sum_i{x_{1,i}^2}}{n}-a_2\frac{\sum_i{x_{1,i}x_{2,i}}}{n}-(\overline{y}-a_1\overline{x_1}-a_2\overline{x_2}) \overline{x_1}=0
\\
\Leftrightarrow
\frac{\sum_i{x_{1,i}y_i}}{n}-a_1\frac{\sum_i{x_{1,i}^2}}{n}-a_2\frac{\sum_i{x_{1,i}x_{2,i}}}{n}-(\overline{y}-a_1\overline{x_1}-a_2\overline{x_2} ) \overline{x_1}=0
\\
\Leftrightarrow
S_{x_1y}-a_1S^{2}_{x_{1}}-a_2S_{x_1x_2}=0
\\
\Leftrightarrow
\hat{a}_1=\frac{S_{x_1y}-a_2S_{x_1x_2}}{S^{2}_{x_{1}}}
$$
De la même manière, on obtient :
$$
\hat{a}_2=\frac{S_{x_2y}-a_1S_{x_1x_2}}{S^{2}_{x_{2}}}
$$
$$
\hat{a}_1=\frac{S_{x_1y}-\frac{S_{x_2y}-a_1S_{x_1x_2}}{S^{2}_{x_{2}}}S_{x_1x_2}}{S^{2}_{x_{1}}}
$$
$$
\hat{a}_1-\hat{a}_1\frac{S^2_{x_1x_2}}
{S^2_{x_2}S^2_{x_1}}=\frac{S_{x_1y}}
{S^2_{x_1}}
-\frac{S_{x_1x_2}S_{x_2y}}
{S^2_{x_2}S^2_{x_1}}
$$
$$
\hat{a}_1(1-\frac{S^2_{x_1x_2}}
{S^2_{x_2}S^2_{x_1}})=\frac{S_{x_1y}}
{S^2_{x_1}}
-\frac{S_{x_1x_2}S_{x_2y}}
{S^2_{x_2}S^2_{x_1}}
$$
$$
\hat{a}_1\frac{S^2_{x_2}S^2_{x_1}-S^2_{x_1x_2}}
{S^2_{x_2}S^2_{x_1}}=
\frac{S_{x_1y}}
{S^2_{x_1}}
-\frac{S_{x_1x_2}S_{x_2y}}
{S^2_{x_2}S^2_{x_1}}
$$
$$
\hat{a}_1\frac{S^2_{x_2}S^2_{x_1}-S^2_{x_1x_2}}
{S^2_{x_2}S^2_{x_1}}=
\frac{S_{x_1y}S^2_{x_2}}
{S^2_{x_1}S^2_{x_2}}
-\frac{S_{x_1x_2}S_{x_2y}}
{S^2_{x_2}S^2_{x_1}}
$$
$$
\hat{a}_1
=
\frac{S_{x_1y}S^2_{x_2}
-S_{x_1x_2}S_{x_2y}}
{S^2_{x_2}S^2_{x_1}-S^2_{x_1x_2}}
$$
## Calcul matriciel :
Soit X la matrice à expliquer et Y la matrice explicative.
Par procuste on a trois opérations :
* translation (revient à aligner les barycentres : centrer)
* échelle : homothétie
* rotation : trouver l'angle qui minimise l'écart (méthode des moindres carrés)
Seule contrainte de la méthode, la rotation doit être effectuée en dernier.
On cherche : $$ Min|X-(1c'+\rho Y A)| $$
On se retrouve à minimiser, comme en régression linéaire, l'écart entre X, notre tableau de données à expliquer, et la transformation de la matrice Y (qui représente l'approximation en régression linéaire) :
$$1c'+\rho Y A $$
avec $c'$ translation (centrage), $\rho$ échelle (normalisation), $A$ rotation
D'un point de vue matriciel : il faut que $A'A=I$ (A orthogonale), $diag(A'A)=1$
Les matrices centrées de X et Y sont notées :
$$\tilde{Y}, \tilde{X}$$
Décomposition en valeurs singulières : avec $U'U=V'V=I$
$$\tilde{X}' \tilde{Y}=U \Lambda V'$$
La matrice de rotation est donc estimée par : $$\hat{A}=VU'$$
Paramètre d'échelle qui correspond à c dans procruste :
$$\hat{\rho}=\frac{trace(\hat{A}\tilde{X}'\tilde{Y})}{trace(\tilde{Y}'\tilde{Y})}$$
Si on fait l'analogie avec la fonction programmée dans procrustes (équivalence : translation <=> b, échelle <=> c et rotation <=> A) :
$$Yrot=\hat{\rho} Y A
\\
b=\overline{X}-\hat{\rho}\overline{Y} A
$$
? J'aurais plutôt écrit (la translation a déjà dû être faite dans procruste avant cette étape ?) :
$$Yrot=1c'+ \hat{\rho} Y A
\\
b=\overline{X}-\hat{\rho}\overline{Y} A
$$
? Yrot est la nouvelle estimation de Y, transformée pour « être représentée dans la même base » que X. ???
Et du coup la différence entre X et Yrot représente la part non expliquée par Y du tableau de données X
$$
\sum(X-Y_{rot})^2=SCR_{residuel}
$$
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment