org-annotate.sh 4.86 KB
Newer Older
1 2 3 4
#!/bin/bash
#
#
#
alain viari's avatar
alain viari committed
5
#                           Annotate Organelle 
6 7 8 9 10 11
#
#========================================================================================
#
#
#========================================================================================

alain viari's avatar
alain viari committed
12
# -- CAUTION -- Works as long as the script 
13
#               is not called through a symlink
alain viari's avatar
alain viari committed
14 15
THIS_DIR="$(dirname ${BASH_SOURCE[0]})"
source "${THIS_DIR}/scripts/bash_init.sh"
16

17 18 19 20 21 22 23
#
# Management of options
#

taxid="no"
normalization="yes"
irdetection="yes"
Eric Coissac's avatar
Eric Coissac committed
24
organism="no"
25 26

# options may be followed by one colon to indicate they have a required argument
Eric Coissac's avatar
Eric Coissac committed
27
if ! options=$(getopt -o t:o:ih -l ncbi-taxid:,organism,no-ir-detection,help -- "$@")
28 29 30 31 32
then
    # something went wrong, getopt will put out an error message for us
    exit 1
fi

33
eval set -- "$options"
34 35 36 37 38 39

while [ $# -gt 0 ]
do
    case $1 in
    -t|--ncbi-taxid) taxid="$2" ; shift;;
    -i|--no-ir-detection)  irdetection="no" ;;
Eric Coissac's avatar
Eric Coissac committed
40
    -o|--organism) organism="$2" ; shift;;
41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
    -h|--help)  echo "Usage:" ;  
    			echo "    $0 "'[-t|--ncbi-taxid ###] [-n|--no-normalization] \' 
    			echo "       [-i|--no-ir-detection] [-h|--help] <FASTAFILE>"
    			echo
    			echo "Options:"
    			echo '  -t ### | --ncbi-taxid ###'
    			echo '      ### represents the ncbi taxid associated to the sequence'
    			echo
    			echo '  -i     | --no-ir-detection'
    			echo '      Does not look for inverted repeats in the plastid genome'
               	exit 0;;
    (--) shift; break;;
    (-*) echo "$0: error - unrecognized option $1" 1>&2; exit 1;;
    (*) break;;
    esac
    shift
done

#############################

61 62 63 64 65 66 67
pushTmpDir ORG.organnot

	if [[ ! "$1" =~ ^/ ]]; then
		QUERY="${CALL_DIR}/$1"
	else
		QUERY="$1"
	fi
68
	
69 70 71 72 73 74

	RESULTS=$(basename ${QUERY/.*/})
	LOG="${CALL_DIR}/${RESULTS}.log"
	
	rm -f ${LOG}
	openLogFile ${LOG}
75
		
76 77 78 79 80 81 82 83 84 85 86 87
	if [ "$irdetection"=="yes" ]; then

		loginfo "Normalizing the structure of the Chloroplast sequence..."
			loginfo "   LSC + IRB + SSC + IRA"
			${PROG_DIR}/detectors/normalize/bin/go_normalize.sh ${QUERY} > "${RESULTS}.norm.fasta"
		loginfo "Done."
		
		loginfo "Annotating the Inverted repeats and Single copies (LSC and SSC)..."
			${PROG_DIR}/detectors/ir/bin/go_ir.sh "${RESULTS}.norm.fasta" > "${RESULTS}.annot"		
		loginfo "Done."
		
	fi
88
	
89
	loginfo "Annotating the tRNA..."
90
		${PROG_DIR}/detectors/trna/bin/go_trna.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot"
91
	loginfo "Done."
92
	
93
	loginfo "Annotating the rRNA genes..."
94
		${PROG_DIR}/detectors/rrna/bin/go_rrna.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot"
95
	loginfo "Done."
alain viari's avatar
alain viari committed
96 97

	loginfo "Annotating the CDS..."
98
		${PROG_DIR}/detectors/cds/bin/go_cds.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot"
alain viari's avatar
alain viari committed
99
	loginfo "Done."
100
	
Eric Coissac's avatar
Eric Coissac committed
101 102 103 104
	loginfo "Printing minimal header..."
		echo "ID   XXX; XXX; circular; genomic DNA; XXX; XXX; $(seqlength ${RESULTS}.norm.fasta) BP."
		echo "XX"
		echo "AC   XXX;"
105
		echo "XX"
Eric Coissac's avatar
Eric Coissac committed
106 107 108 109 110 111 112 113 114
		if [[ "${organism}" == "no" ]]; then
			echo "DE   {organism} plastid, complete genome."
		else
			echo "DE   $(echo ${organism} | tr '_' ' ') plastid, complete genome."
		fi
		echo "XX"	
	loginfo "Done."

	loginfo "Printing annotations header..."
115 116 117 118
    	echo "FH   Key             Location/Qualifiers"
	loginfo "Done."
	
	loginfo "Ordering annotations..."
119
		awk '/^.....(misc|repeat|rRNA|tRNA|gene)/ { \
120 121
		        match($3,"[0-9][0-9]*"); \
		        pos=substr($3,RSTART,RLENGTH)*1000 + 1; \
122 123 124
		        print pos,$0;    \
		        next} \
		      { pos++; \
125
		        print pos,$0}' "${RESULTS}.annot" | \
126
		sort -nk1 |\
127 128
		awk '{ \
		        match($0,"^[0-9]* ");\
129
		        line=substr($0,RLENGTH+1);\
130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159
		        print line}' 
	loginfo "Done."
	
	loginfo "Closing annotations table..."
		echo "XX"
	loginfo "Done."
	
	loginfo "Computing statistics on nucleotide usage..."
		awk '! /^>/ { \
			    seq=toupper($0); \
				gsub(" ","",seq); \
			    lseq=length(seq); \
				for (i=0; i < lseq; i++) { \
					freq[substr(seq,i,1)]++}\
					} \
			 END { \
			 	other=0; \
			 	for (i in freq) { \
			 		if (i!="A" && i!="C" && i!="G" && i!="T") {\
			 			other+=freq[i] \
			 			} \
			 		}; \
			 		print "SQ   Sequence "\
			 		      (freq["A"]+freq["C"]+freq["G"]+freq["T"]+other) \
			 		      " BP; "\
			 		      freq["A"]" A; "\
			 		      freq["C"]" C; "\
			 		      freq["G"]" G; "\
			 		      freq["T"]" T; "\
			 		      other" other;" \
160
			 }' "${RESULTS}.norm.fasta"
161 162 163
	loginfo "Done."
	
	loginfo "Reformating sequences..."
164
		lines=$(wc -l "${RESULTS}.norm.fasta" | awk '{print $1}')
165 166 167 168 169 170 171 172 173 174 175 176 177 178
		awk -v lines=$lines ' \
			! /^>/ { \
					seq=tolower($0); \
					gsub(" ","",seq); \
					printf("     ") ;\
					for (i=0; i < 6; i++) { \
						f=substr(seq,i * 10, 10); \
						pos+=length(f); \
						f = f  substr("          ",1,10-length(f)); \
						printf("%s ",f) \
					}; \
					if (NR==lines) \
					  {pos-=1}; \
					printf("   %6d\n",pos) \
179
			   }' "${RESULTS}.norm.fasta"
180 181 182 183 184 185
	loginfo "Done."
	
	loginfo "Closing sequence part..."
		echo "//"
	loginfo "Done."

186 187
popTmpDir