buildRRNAModels.sh 5.62 KB
Newer Older
1 2 3 4 5 6 7 8
#!/bin/bash
#
#                           BUILD RRNA models
#
#========================================================================================

# -- CAUTION -- Works as long than the script 
#               is not called through a symlink
alain viari's avatar
alain viari committed
9 10
THIS_DIR="$(dirname ${BASH_SOURCE[0]})"
source "${THIS_DIR}/../../../scripts/bash_init.sh"
11 12 13

function fasta1li {

14
    $AwkCmd '/^>/ {if (sequence) \
15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
                  {print sequence}; \
               print $0; \
               sequence=""} \
         !/^>/ {sequence = sequence $0} \
         END {print sequence}' $1
}

function dereplicate {
	DATA=$1
	sumaclust -t 1 $DATA | \
		fasta1li | \
		grep -A 1 '^>' | \
		grep -A1 'cluster_center=True;' | \
		grep -v -- -- | \
		sed -E "s/count=[0-9]+; //" | \
		sed 's/cluster_weight/count/' | \
31
		$AwkCmd ' /^>/ {SEQ++;\
32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
		            match($0,"count=[0-9][0-9]*;");\
		            count=substr($0,RSTART,RLENGTH);\
		            $1=$1"_"SEQ;\
		            print $1,count} \
			 !/^>/ {print $0}'
}


function clustering {
	DATA=$1
	rm -rf $DATA
	mkdir $DATA
	sumaclust -t 0.9 $DATA.fasta | \
		fasta1li > $DATA.clust.fasta
	cluster=$(grep '^>' $DATA.clust.fasta | \
	            sed -E 's/.*cluster=([^;]+);.*$/\1/' | \
	            sort -u)
	for c in $cluster; do
		w=$(grep "$c" "${DATA}.clust.fasta" | \
			head -1 | \
			sed -E 's/.*cluster_weight=([^;]+);.*$/\1/')
	    out=$(printf "${DATA}/%05d_%s" $w $c)
        grep -A1  "$c" "${DATA}.clust.fasta" | \
           grep -v -- -- > "$out.fasta"
        muscle -in "$out.fasta" -out "$out.align.fasta"
	done
}

function revcomp {
61
    $AwkCmd 'function printfasta(seq) {                                  \
62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165
            seqlen=length(seq);                                       \
            for (i=1; i <= seqlen; i+=60)                              \
              print substr(seq,i,60);                                 \
         }                                                          \
        function comp(seq) {                                           \
            "echo "seq" | tr acgtACGT tgcaTGCA " | getline res;     \
            close("echo "seq" | tr acgtACGT tgcaTGCA "); \
            return res;                                                \
        }                                                              \
        function rev(seq) {                                            \
            "echo "seq" | rev " | getline res;                         \
            close("echo "seq" | rev ");                                \
            return res;                                                \
        }                                                              \
        function revcomp(seq) {                                        \
            res=rev(comp(seq));                                        \
            return res;                                                \
        }                                                              \
                                                                       \
        (seq) && /^>/ {print head;                                     \
                       printfasta(revcomp(seq));                       \
                       seq=""}                                         \
        /^>/   {head=$0}                                               \
        ! /^>/ {seq=seq$0}                                             \
        END { print head;                                     \
              printfasta(revcomp(seq));                       \
            }' $1
}


pushTmpDir ORG.buildRRNA

	openLogFile "${RRNA_DATA_DIR}/rRNA_models.log"
	
	loginfo "Selecting Viridiplantae genebank entries..."
		VIRIDIPLANTAE=$(${PROG_DIR}/../../normalize/tools/selectViridiplantae.sh $*) 
		loginfo " --> $(echo ${VIRIDIPLANTAE} | wc -w) entries selected"
	loginfo "Done"
	

	loginfo "Extracting 4.5S rRNA sequences..."
		${PROG_DIR}/extract_ref4.5S.sh ${VIRIDIPLANTAE} > raw_4_5S.fasta
		loginfo " --> $(fastaCount raw_4_5S.fasta) retreived sequences"
		dereplicate raw_4_5S.fasta > 4_5S.fasta
		loginfo " --> $(fastaCount ${CALL_DIR}/4_5S.fasta) distinct sequences"
	loginfo "Done"

	loginfo "Clustering 4.5S rRNA sequences..."
		clustering 4_5S
	loginfo "Done"
	
	loginfo "Installing 4.5S rRNA sequences..."
		cp -r 4_5S 	"${RRNA_DATA_DIR}/RRNA_4_5S"
	loginfo "Done"
	
	loginfo "Extracting 5S rRNA sequences..."
		${PROG_DIR}/extract_ref5S.sh ${VIRIDIPLANTAE} > raw_5S.fasta
		loginfo " --> $(fastaCount raw_5S.fasta) retreived sequences"
		dereplicate raw_5S.fasta > 5S.fasta
		loginfo " --> $(fastaCount 5S.fasta) distinct sequences"
	loginfo "Done"

	loginfo "Clustering 5S rRNA sequences..."
		clustering 5S
	loginfo "Done"
	
	loginfo "Installing 5S rRNA sequences..."
		cp -r 5S 	"${RRNA_DATA_DIR}/RRNA_5S"
	loginfo "Done"
	
	
	loginfo "Extracting 16S rRNA sequences..."
		${PROG_DIR}/extract_ref16S.sh ${VIRIDIPLANTAE} > raw_16S.fasta
		loginfo " --> $(fastaCount raw_16S.fasta) retreived sequences"
		dereplicate raw_16S.fasta > 16S.fasta
		loginfo " --> $(fastaCount 16S.fasta) distinct sequences"
	loginfo "Done"

	loginfo "Clustering 16S rRNA sequences..."
		clustering 16S
	loginfo "Done"
	
	loginfo "Installing 16S rRNA sequences..."
		cp -r 16S 	"${RRNA_DATA_DIR}/RRNA_16S"
	loginfo "Done"


	loginfo "Extracting 23S rRNA sequences..."
		${PROG_DIR}/extract_ref23S.sh ${VIRIDIPLANTAE} > raw_23S.fasta
		loginfo " --> $(fastaCount raw_23S.fasta) retreived sequences"
		dereplicate raw_23S.fasta > 23S.fasta
		loginfo " --> $(fastaCount 23S.fasta) distinct sequences"
	loginfo "Done"

	loginfo "Clustering 23S rRNA sequences..."
		clustering 23S
	loginfo "Done"
	
	loginfo "Installing 23S rRNA sequences..."
		cp -r 23S 	"${RRNA_DATA_DIR}/RRNA_23S"
	loginfo "Done"


popTmpDir