prepare_data.R 5.46 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
positive = read.delim("RawData/positifs.uniq.annotated.txt",
                      sep="\t",
                      header = TRUE)


columns = names(positive)
columns.info   = c("id", "dilution", "species_name", "taxid", "true", "sequence")
columns.counts= columns[grep("^sample\\.",columns)]

positive.count = t(positive[,columns.counts])

motus = as.data.frame(positive[,columns.info])
positive.motus = data.frame(dilution = as.numeric(motus$dilution)/2,
                            species  = as.character(motus$species_name),
                            taxid    = as.integer(motus$taxid),
                            true     = motus$true == "True"
                            )


samples.names = rownames(positive.count)

samples = t(simplify2array(strsplit(samples.names,split="_")))

#      [,1]        [,2]  [,3]  [,4] [,5] [,6]
# [1,] "sample.TM" "POS" "d16" "1"  "a"  "A1"
# [2,] "sample.TM" "POS" "d16" "1"  "a"  "B1"
# [3,] "sample.TM" "POS" "d16" "1"  "b"  "A2"
# [4,] "sample.TM" "POS" "d16" "1"  "b"  "B2"
# [5,] "sample.TM" "POS" "d16" "2"  "a"  "A1"
# [6,] "sample.TM" "POS" "d16" "2"  "a"  "B1"

samples = as.data.frame(samples[,3:6])
names(samples) = c("dilution","repeats","PCR","Plate")

positive.samples = data.frame(dilution = 32%/%as.integer(substr(as.character(samples$dilution),2,10)),
                              repeats  = interaction(samples[,2:4],drop = TRUE)
                             )

rownames(positive.samples)= samples.names
rownames(positive.count)  = samples.names

rownames(positive.motus)  = positive$id
colnames(positive.count)  = positive$id

plants.16 = positive.motus[positive.motus$true,][,c(2,3,1)]

plants.16 = plants.16[order(1/plants.16$dilution),]
plants.16$log10.dilution = - seq_len(nrow(plants.16)) / log(10)*log(2)
plants.16$dilution = 1/(2^seq_len(nrow(plants.16)))
usethis::use_data(positive.samples,overwrite = TRUE)
usethis::use_data(positive.motus,overwrite = TRUE)
usethis::use_data(positive.count,overwrite = TRUE)
usethis::use_data(plants.16,overwrite = TRUE)

positive.clean = read.delim("RawData/positifs.uniq.annotated.clean.txt",
                      sep="\t",
                      header = TRUE)

columns = names(positive.clean)
columns.info   = c("id", "dilution", "species_name", "taxid", "true", "sequence")
columns.counts= columns[grep("^sample\\.",columns)]

positive.clean.count = t(positive.clean[,columns.counts])

motus.clean = as.data.frame(positive.clean[,columns.info])
positive.clean.motus = data.frame(dilution = as.numeric(motus.clean$dilution)/2,
                            species  = as.character(motus.clean$species_name),
                            taxid    = as.integer(motus.clean$taxid),
                            true     = motus.clean$true == "True"
                            )

samples.names = rownames(positive.clean.count)

samples = t(simplify2array(strsplit(samples.names,split="_")))

samples = as.data.frame(samples[,3:6])
names(samples) = c("dilution","repeats","PCR","Plate")

positive.clean.samples = data.frame(dilution = 32%/%as.integer(substr(as.character(samples$dilution),2,10)),
                              repeats  = interaction(samples[,2:4],drop = TRUE)
                             )

rownames(positive.clean.samples)= samples.names
rownames(positive.clean.count)  = samples.names

rownames(positive.clean.motus)  = positive.clean$id
colnames(positive.clean.count)  = positive.clean$id

usethis::use_data(positive.clean.samples,overwrite = TRUE)
usethis::use_data(positive.clean.motus,overwrite = TRUE)
usethis::use_data(positive.clean.count,overwrite = TRUE)


#
# Litter/Soil dataset
#


guiana         = read.delim("RawData/litiere_ins_cl97_agg_filt_tax.tab", 
                            header = TRUE,
                            sep="\t")

columns = names(guiana)

columns.info   = c("id","best_identity.order_filtered_embl_r136_noenv_INS",
                   "taxid", 
                   "phylum_name","order_name","class_name","family_name","genus_name","species_name",
                   "sequence")

columns.counts= columns[grep("^sample\\.",columns)]

samples.names = gsub(pattern = "sample.",
                     replacement = "",
                     columns.counts)

guiana.count = t(guiana[,columns.counts])

motus = as.data.frame(guiana[,columns.info])
guiana.motus = data.frame(id = paste("EUK",sprintf("%06d",1:nrow(motus)),sep=""),
                          best_id  = motus$best_identity.order_filtered_embl_r136_noenv_INS,
                          taxid    = as.integer(motus$taxid),
                          species  = factor(as.character(motus$species_name)),
                          genus    = factor(as.character(motus$genus_name)),
                          family   = factor(as.character(motus$family_name)),
                          class    = factor(as.character(motus$class_name)),
                          order    = factor(as.character(motus$order_name)),
                          phylum   = factor(as.character(motus$phylum_name)),
                          sequence = as.character(motus$sequence),
                          stringsAsFactors = FALSE
                         )

samples = read.delim("RawData/Litiere_sample_list.txt",header=TRUE)

guiana.samples = samples[samples.names,]
guiana.samples$sample = as.factor(sub("_r.$","",samples.names))


rownames(guiana.count) = samples.names
colnames(guiana.count) = guiana.motus$id
rownames(guiana.motus) = guiana.motus$id

usethis::use_data(guiana.samples,overwrite = TRUE)
usethis::use_data(guiana.motus,overwrite = TRUE)
usethis::use_data(guiana.count,overwrite = TRUE)