org-annotate.sh 9.86 KB
Newer Older
1 2 3 4
#!/bin/bash
#
#
#
alain viari committed
5
#                           Annotate Organelle 
6 7 8 9 10 11
#
#========================================================================================
#
#
#========================================================================================

alain viari committed
12
# -- CAUTION -- Works as long as the script 
13
#               is not called through a symlink
alain viari committed
14 15
THIS_DIR="$(dirname ${BASH_SOURCE[0]})"
source "${THIS_DIR}/scripts/bash_init.sh"
16

17

18 19 20 21 22 23 24
#
# Management of options
#

taxid="no"
normalization="yes"
irdetection="yes"
Eric Coissac committed
25
organism="no"
26
types="chloro"
27
partial=0
28
minlength=0
29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56

function usage {
	echo "Usage:" ;  
	echo "    $1 "'[-t|--ncbi-taxid ###] [-n|--no-normalization] \' 
	echo '       [-i|--no-ir-detection] [-h|--help] \ '
	echo '       [-o|--organism <organism_name>]  \ '
	echo '       [-c|--chloroplast|-r|--nuclear-rdna|-m|--mitochondrion] <FASTAFILE>' 
	echo
	echo "Options:"
	echo '  -t ### | --ncbi-taxid ###'
	echo '      ### represents the ncbi taxid associated to the sequence'
	echo
	echo '  -i     | --no-ir-detection'
	echo '      Does not look for inverted repeats in the plastid genome'
	echo
	echo '  -o     | --organism <organism_name>'
	echo '      Allows for specifiying the organism name in the embl generated file'
	echo '      Spaces have to be substituted by underscore ex : Abies_alba'
	echo
	echo '  -c     | --chloroplast'
	echo '      Selects for the annotation of a chloroplast genome'
	echo '      This is the default mode'
	echo
	echo '  -r     | --nuclear-rdna'
	echo '      Selects for the annotation of the rDNA nuclear cistron'
	echo
	echo '  -m     | --mitochondrion'
	echo '      Selects for the annotation of an animal mitochondrion genome'
57 58 59
	echo
	echo '  -p     | --partial'
	echo '      Indicates that the genome sequence is partial and therefore in several contigs'
60 61 62
	echo
	echo '  -l     | --min-length'
	echo '      Indicates for partial mode the minimum length of contig to annotate'
63 64 65
   	exit $2
}

66

67
function fastaIterator() {
68
	$AwkCmd '/^>/ {if (seq) printf("%s\f",seq); seq=""} \
69 70 71 72
	          {if (seq) seq=seq"\n"; seq=seq $1} \
	          END {print seq}' "$1"
}

73
# options may be followed by one colon to indicate they have a required argument
74
if ! options=$(getopt -o t:o:icrmhpl: -l ncbi-taxid:,organism,no-ir-detection,chloroplast,nuclear-rdna,mitochondrion,partial,min-length:,help -- "$@")
75 76
then
    # something went wrong, getopt will put out an error message for us
77
    usage $0 1
78 79
fi

80
eval set -- "$options"
81 82 83 84

while [ $# -gt 0 ]
do
    case $1 in
85
    -t|--ncbi-taxid) taxid="$2" ; shift ;;
86
    -i|--no-ir-detection)  irdetection="no" ;;
87 88 89 90
    -o|--organism) organism="$2" ; shift ;;
    -c|--chloroplast) types="chloro" ;;
    -r|--nuclear-rdna) types="nucrdna" ;;
    -m|--mitochondrion) types="mito" ;;
91
    -p|--partial) partial="1" ;;
92
    -l|--min-length) minlength="$2" ; shift ;;
93
    -h|--help)  usage $0 0;;
94 95 96 97 98 99 100
    (--) shift; break;;
    (-*) echo "$0: error - unrecognized option $1" 1>&2; exit 1;;
    (*) break;;
    esac
    shift
done

101 102 103 104
loginfo "Annotating mode.....: $types"
loginfo "IR detection mode...: $irdetection"
loginfo "Organism............: $organism"
loginfo "Partial mode........: $partial"
105
loginfo "Minimum length......: $minlength"
106 107


108 109
#############################

110 111
pushTmpDir ORG.organnot

112

113 114 115 116 117
	if [[ ! "$1" =~ ^/ ]]; then
		QUERY="${CALL_DIR}/$1"
	else
		QUERY="$1"
	fi
118
	
119 120 121

	RESULTS=$(basename ${QUERY/.*/})
	LOG="${CALL_DIR}/${RESULTS}.log"
122

123 124 125
	
	rm -f ${LOG}
	openLogFile ${LOG}
126

127
	IFS=$'\f'
128
	
129 130 131 132
	for sequence in $(fastaIterator "${QUERY}") ; do
		unset IFS
		if [[ ! -z "${sequence}" ]] ; then
			echo "${sequence}" > toannotate.fasta
133
			sl=$(seqlength "toannotate.fasta")
134
			
135
			if (( sl >= minlength )) ; then
136
			
137
				seqid=$($AwkCmd '(NR==1) {print substr($1,2,1000)}' toannotate.fasta)
138
				
139 140 141
				case "$types" in 
					chloro) 
						loginfo "Annotating a plant chloroplast genome..."
142
						
143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
						if [[ "$irdetection" == "yes" ]] && (( partial == 0 )) ; then
					
							loginfo "Normalizing the structure of the Chloroplast sequence..."
								loginfo "   LSC + IRB + SSC + IRA"
								${PROG_DIR}/detectors/normalize/bin/go_normalize.sh toannotate.fasta > "${RESULTS}.norm.fasta"
							loginfo "Done."
							
							loginfo "Annotating the Inverted repeats and Single copies (LSC and SSC)..."
								${PROG_DIR}/detectors/ir/bin/go_ir.sh "${RESULTS}.norm.fasta" > "${RESULTS}.annot"		
							loginfo "Done."
							
						else
							loginfo "No normalization of the structure of the Chloroplast sequence..."
							cat toannotate.fasta > "${RESULTS}.norm.fasta"
							rm -f "${RESULTS}.annot"
							touch "${RESULTS}.annot"
						fi
						
						loginfo "Annotating the tRNA..."
							${PROG_DIR}/detectors/trna/bin/go_trna.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot"
163 164
						loginfo "Done."
						
165 166 167
						loginfo "Annotating the rRNA genes..."
							${PROG_DIR}/detectors/rrna/bin/go_rrna.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot"
						loginfo "Done."
168
					
169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192
						loginfo "Annotating the CDS..."
							tcsh -f ${PROG_DIR}/detectors/cds/bin/go_cds.csh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot"
						loginfo "Done."
						
						if (( partial == 0 )) ; then 
							topology="circular"
							defline="plastid, complete genome"
						else
							topology="linear"
							defline="plastid, partial sequence"
						fi
						;;
						
					nucrdna) 
						loginfo "Annotating a plant rDNA cistron..."
						
						loginfo "Normalizing the structure of the cistron sequence..."
							${PROG_DIR}/detectors/normalizerdna/bin/go_normalizerdna.sh toannotate.fasta > "${RESULTS}.norm.fasta"								
						loginfo "Done."
						
						loginfo "Annotating the rRNA genes..."
							${PROG_DIR}/detectors/nucrrna/bin/go_nucrrna.sh "${RESULTS}.norm.fasta" > "${RESULTS}.annot"
						loginfo "Done."
			
193
						topology="linear"
194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241
						defline="18S rRNA gene, ITS1, 5.8S rRNA gene, ITS2 and 28S rRNA gene"
						;;
						
					mito) 
						loginfo "Annotating an animal mitochondrial genome..."
						logerror "Not yet implemented"
			
						if (( partial == 0 )) ; then 
							topology="circular"
							defline="mitochondrion, complete genome"
						else
							topology="linear"
							defline="mitochondrion, partial sequence"
						fi
						
						exit 1
						;;
						
					*) 
						usage $0 1;;
				esac
									
				if [[ "${organism}" == "no" ]]; then
					organism="{organism}"
				else
					organism="$(echo ${organism} | tr '_' ' ')"
				fi
				
				sl=$(seqlength "${RESULTS}.norm.fasta")
				
				loginfo "Printing minimal header..."		
					echo "ID   ${seqid}; ${seqid}; ${topology}; genomic DNA; XXX; XXX; ${sl} BP."
					echo "XX"
					echo "AC   ${seqid};"
					echo "DE   ${organism} ${defline}."
					echo "XX"
				loginfo "Done."
			
				loginfo "Printing annotations header..."
			    	echo "FH   Key             Location/Qualifiers"
				loginfo "Done."
	
				loginfo "Printing the source feature"
						echo "FT   source          1..${sl}"                               
	
					if [[ "${organism}" != "{organism}" ]] ; then 
						echo "FT                   /organism=\"${organism}\""              
					fi	
242
					
243 244 245 246 247 248 249 250 251 252 253
					case "${types}" in 
						chloro)  
							echo "FT                   /organelle=\"plastid:chloroplast\"" 
						;;
						mito)    
							echo "FT                   /organelle=\"mitochondrion\""       
						;;
						*) 
							loginfo "Nuclear sequence"  
						;;
					esac
254
					
255
						echo "FT                   /mol_type=\"genomic DNA\""              
256
					
257 258
					if [[ "${taxid}" != "no" ]] ; then 
						echo "FT                   /db_xref=\"taxon:${taxid}\""            
259 260
					fi
					
261 262
					#	echo "FT                   /country=\"Poland: Bialowieza Forest\"" 
				loginfo "Done."
263
				
264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280
				loginfo "Ordering annotations..."
					$AwkCmd '(entry && /^.....(misc|repeat|rRNA|tRNA|gene|source)/) { \
	                           print pos,entry } \
						 /^.....(misc|repeat|rRNA|tRNA|gene|source)/ { \
					        match($3,"[0-9][0-9]*"); \
					        pos=substr($3,RSTART,RLENGTH)*1000 + 1; \
					        entry=$0;    \
					        next} \
					      { entry=entry "@" $0} \
	 					END {print pos,entry}' "${RESULTS}.annot" | \
					sort -nk1 |\
					$AwkCmd '{ \
					        match($0,"^[0-9]* ");\
					        line=substr($0,RLENGTH+1);\
							gsub("@","\n",line); \
					        print line}' 
				loginfo "Done."
281 282 283
				
				
				
284 285 286
				loginfo "Closing annotations table..."
					echo "XX"
				loginfo "Done."
287
				
288 289 290
				loginfo "Computing statistics on nucleotide usage..."
					$AwkCmd '! /^>/ { \
						    seq=toupper($0); \
291
							gsub(" ","",seq); \
292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339
						    lseq=length(seq); \
							for (i=0; i < lseq; i++) { \
								freq[substr(seq,i,1)]++}\
								} \
						 END { \
						 	other=0; \
						 	for (i in freq) { \
						 		if (i!="A" && i!="C" && i!="G" && i!="T") {\
						 			other+=freq[i] \
						 			} \
						 		}; \
						 		print "SQ   Sequence "\
						 		      (freq["A"]+freq["C"]+freq["G"]+freq["T"]+other) \
						 		      " BP; "\
						 		      freq["A"]" A; "\
						 		      freq["C"]" C; "\
						 		      freq["G"]" G; "\
						 		      freq["T"]" T; "\
						 		      other" other;" \
						 }' "${RESULTS}.norm.fasta"
				loginfo "Done."
				
				loginfo "Reformating sequences..."
					lines=$(wc -l "${RESULTS}.norm.fasta" | $AwkCmd '{print $1}')
					
					loginfo "Sequence length $(seqlength ${RESULTS}.norm.fasta)"
					loginfo "lines $lines"
					formatfasta "${RESULTS}.norm.fasta" | \
					$AwkCmd -v lines=$lines ' \
						! /^>/ { \
								seq=tolower($0); \
								gsub(" ","",seq); \
								printf("     ") ;\
								for (i=0; i < 6; i++) { \
									f=substr(seq,i * 10 + 1, 10); \
									pos+=length(f); \
									f = f  substr("          ",1,10-length(f)); \
									printf("%s ",f) \
								}; \
								printf("   %6d\n",pos) \
						   }'
				loginfo "Done."
				
				loginfo "Closing sequence part..."
					echo "//"
				loginfo "Done."
			fi # End of the minimum length condition
		fi  # End of not empty sequence condition
340 341 342
		
		IFS=$'\f'
	done # End of the loop over the sequences
343 344
popTmpDir