org-annotate.sh 9.27 KB
Newer Older
1 2 3 4
#!/bin/bash
#
#
#
alain viari committed
5
#                           Annotate Organelle 
6 7 8 9 10 11
#
#========================================================================================
#
#
#========================================================================================

alain viari committed
12
# -- CAUTION -- Works as long as the script 
13
#               is not called through a symlink
alain viari committed
14 15
THIS_DIR="$(dirname ${BASH_SOURCE[0]})"
source "${THIS_DIR}/scripts/bash_init.sh"
16

17

18 19 20 21 22 23 24
#
# Management of options
#

taxid="no"
normalization="yes"
irdetection="yes"
Eric Coissac committed
25
organism="no"
26
types="chloro"
27
partial=0
28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55

function usage {
	echo "Usage:" ;  
	echo "    $1 "'[-t|--ncbi-taxid ###] [-n|--no-normalization] \' 
	echo '       [-i|--no-ir-detection] [-h|--help] \ '
	echo '       [-o|--organism <organism_name>]  \ '
	echo '       [-c|--chloroplast|-r|--nuclear-rdna|-m|--mitochondrion] <FASTAFILE>' 
	echo
	echo "Options:"
	echo '  -t ### | --ncbi-taxid ###'
	echo '      ### represents the ncbi taxid associated to the sequence'
	echo
	echo '  -i     | --no-ir-detection'
	echo '      Does not look for inverted repeats in the plastid genome'
	echo
	echo '  -o     | --organism <organism_name>'
	echo '      Allows for specifiying the organism name in the embl generated file'
	echo '      Spaces have to be substituted by underscore ex : Abies_alba'
	echo
	echo '  -c     | --chloroplast'
	echo '      Selects for the annotation of a chloroplast genome'
	echo '      This is the default mode'
	echo
	echo '  -r     | --nuclear-rdna'
	echo '      Selects for the annotation of the rDNA nuclear cistron'
	echo
	echo '  -m     | --mitochondrion'
	echo '      Selects for the annotation of an animal mitochondrion genome'
56 57 58
	echo
	echo '  -p     | --partial'
	echo '      Indicates that the genome sequence is partial and therefore in several contigs'
59 60 61
   	exit $2
}

62

63
function fastaIterator() {
64
	$AwkCmd '/^>/ {if (seq) printf("%s\f",seq); seq=""} \
65 66 67 68
	          {if (seq) seq=seq"\n"; seq=seq $1} \
	          END {print seq}' "$1"
}

69
# options may be followed by one colon to indicate they have a required argument
70
if ! options=$(getopt -o t:o:icrmhp -l ncbi-taxid:,organism,no-ir-detection,chloroplast,nuclear-rdna,mitochondrion,partial,help -- "$@")
71 72
then
    # something went wrong, getopt will put out an error message for us
73
    usage $0 1
74 75
fi

76
eval set -- "$options"
77 78 79 80

while [ $# -gt 0 ]
do
    case $1 in
81
    -t|--ncbi-taxid) taxid="$2" ; shift ;;
82
    -i|--no-ir-detection)  irdetection="no" ;;
83 84 85 86
    -o|--organism) organism="$2" ; shift ;;
    -c|--chloroplast) types="chloro" ;;
    -r|--nuclear-rdna) types="nucrdna" ;;
    -m|--mitochondrion) types="mito" ;;
87
    -p|--partial) partial="1" ;;
88
    -h|--help)  usage $0 0;;
89 90 91 92 93 94 95
    (--) shift; break;;
    (-*) echo "$0: error - unrecognized option $1" 1>&2; exit 1;;
    (*) break;;
    esac
    shift
done

96 97 98 99 100 101
loginfo "Annotating mode.....: $types"
loginfo "IR detection mode...: $irdetection"
loginfo "Organism............: $organism"
loginfo "Partial mode........: $partial"


102 103
#############################

104 105
pushTmpDir ORG.organnot

106

107 108 109 110 111
	if [[ ! "$1" =~ ^/ ]]; then
		QUERY="${CALL_DIR}/$1"
	else
		QUERY="$1"
	fi
112
	
113 114 115

	RESULTS=$(basename ${QUERY/.*/})
	LOG="${CALL_DIR}/${RESULTS}.log"
116

117 118 119
	
	rm -f ${LOG}
	openLogFile ${LOG}
120

121
	IFS=$'\f'
122
	
123 124 125 126 127
	for sequence in $(fastaIterator "${QUERY}") ; do
		unset IFS
		if [[ ! -z "${sequence}" ]] ; then
			echo "${sequence}" > toannotate.fasta
			
128
			seqid=$($AwkCmd '(NR==1) {print substr($1,2,1000)}' toannotate.fasta)
129 130 131 132 133
			
			case "$types" in 
				chloro) 
					loginfo "Annotating a plant chloroplast genome..."
					
134
					if [[ "$irdetection" == "yes" ]] && (( partial == 0 )) ; then
135
				
136 137 138 139 140 141 142 143 144 145
						loginfo "Normalizing the structure of the Chloroplast sequence..."
							loginfo "   LSC + IRB + SSC + IRA"
							${PROG_DIR}/detectors/normalize/bin/go_normalize.sh toannotate.fasta > "${RESULTS}.norm.fasta"
						loginfo "Done."
						
						loginfo "Annotating the Inverted repeats and Single copies (LSC and SSC)..."
							${PROG_DIR}/detectors/ir/bin/go_ir.sh "${RESULTS}.norm.fasta" > "${RESULTS}.annot"		
						loginfo "Done."
						
					else
146
						loginfo "No normalization of the structure of the Chloroplast sequence..."
147 148 149 150 151 152 153 154 155 156 157 158
						cat toannotate.fasta > "${RESULTS}.norm.fasta"
						rm -f "${RESULTS}.annot"
						touch "${RESULTS}.annot"
					fi
					
					loginfo "Annotating the tRNA..."
						${PROG_DIR}/detectors/trna/bin/go_trna.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot"
					loginfo "Done."
					
					loginfo "Annotating the rRNA genes..."
						${PROG_DIR}/detectors/rrna/bin/go_rrna.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot"
					loginfo "Done."
159
				
160
					loginfo "Annotating the CDS..."
161
						tcsh -f ${PROG_DIR}/detectors/cds/bin/go_cds.csh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot"
162 163 164 165 166 167 168 169 170 171 172 173 174 175 176
					loginfo "Done."
					
					if (( partial == 0 )) ; then 
						topology="circular"
						defline="plastid, complete genome"
					else
						topology="linear"
						defline="plastid, partial sequence"
					fi
					;;
					
				nucrdna) 
					loginfo "Annotating a plant rDNA cistron..."
					
					loginfo "Normalizing the structure of the cistron sequence..."
177
						${PROG_DIR}/detectors/normalizerdna/bin/go_normalizerdna.sh toannotate.fasta > "${RESULTS}.norm.fasta"								
178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210
					loginfo "Done."
					
					loginfo "Annotating the rRNA genes..."
						${PROG_DIR}/detectors/nucrrna/bin/go_nucrrna.sh "${RESULTS}.norm.fasta" > "${RESULTS}.annot"
					loginfo "Done."
		
					topology="linear"
					defline="18S rRNA gene, ITS1, 5.8S rRNA gene, ITS2 and 28S rRNA gene"
					;;
					
				mito) 
					loginfo "Annotating an animal mitochondrial genome..."
					logerror "Not yet implemented"
		
					if (( partial == 0 )) ; then 
						topology="circular"
						defline="mitochondrion, complete genome"
					else
						topology="linear"
						defline="mitochondrion, partial sequence"
					fi
					
					exit 1
					;;
					
				*) 
					usage $0 1;;
			esac
								
			if [[ "${organism}" == "no" ]]; then
				organism="{organism}"
			else
				organism="$(echo ${organism} | tr '_' ' ')"
211 212
			fi
			
213
			sl=$(seqlength "${RESULTS}.norm.fasta")
214
			
215 216 217 218 219 220
			loginfo "Printing minimal header..."		
				echo "ID   ${seqid}; ${seqid}; ${topology}; genomic DNA; XXX; XXX; ${sl} BP."
				echo "XX"
				echo "AC   ${seqid};"
				echo "DE   ${organism} ${defline}."
				echo "XX"
221
			loginfo "Done."
222
		
223 224 225 226 227
			loginfo "Printing annotations header..."
		    	echo "FH   Key             Location/Qualifiers"
			loginfo "Done."

			loginfo "Printing the source feature"
228
					echo "FT   source          1..${sl}"                               
229 230

				if [[ "${organism}" != "{organism}" ]] ; then 
231
					echo "FT                   /organism=\"${organism}\""              
232 233 234 235
				fi	
				
				case "${types}" in 
					chloro)  
236
						echo "FT                   /organelle=\"plastid:chloroplast\"" 
237 238
					;;
					mito)    
239
						echo "FT                   /organelle=\"mitochondrion\""       
240 241
					;;
					*) 
242
						loginfo "Nuclear sequence"  
243 244 245
					;;
				esac
				
246
					echo "FT                   /mol_type=\"genomic DNA\""              
247 248
				
				if [[ "${taxid}" != "no" ]] ; then 
249
					echo "FT                   /db_xref=\"taxon:${taxid}\""            
250 251
				fi
				
252
				#	echo "FT                   /country=\"Poland: Bialowieza Forest\"" 
253 254
			loginfo "Done."
			
255
			loginfo "Ordering annotations..."
256 257 258
				$AwkCmd '(entry && /^.....(misc|repeat|rRNA|tRNA|gene|source)/) { \
                           print pos,entry } \
					 /^.....(misc|repeat|rRNA|tRNA|gene|source)/ { \
259 260
				        match($3,"[0-9][0-9]*"); \
				        pos=substr($3,RSTART,RLENGTH)*1000 + 1; \
261
				        entry=$0;    \
262
				        next} \
263 264
				      { entry=entry "@" $0} \
 					END {print pos,entry}' "${RESULTS}.annot" | \
265
				sort -nk1 |\
266
				$AwkCmd '{ \
267 268
				        match($0,"^[0-9]* ");\
				        line=substr($0,RLENGTH+1);\
269
						gsub("@","\n",line); \
270 271 272 273
				        print line}' 
			loginfo "Done."
			
			
274
			
275 276
			loginfo "Closing annotations table..."
				echo "XX"
277 278
			loginfo "Done."
			
279
			loginfo "Computing statistics on nucleotide usage..."
280
				$AwkCmd '! /^>/ { \
281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302
					    seq=toupper($0); \
						gsub(" ","",seq); \
					    lseq=length(seq); \
						for (i=0; i < lseq; i++) { \
							freq[substr(seq,i,1)]++}\
							} \
					 END { \
					 	other=0; \
					 	for (i in freq) { \
					 		if (i!="A" && i!="C" && i!="G" && i!="T") {\
					 			other+=freq[i] \
					 			} \
					 		}; \
					 		print "SQ   Sequence "\
					 		      (freq["A"]+freq["C"]+freq["G"]+freq["T"]+other) \
					 		      " BP; "\
					 		      freq["A"]" A; "\
					 		      freq["C"]" C; "\
					 		      freq["G"]" G; "\
					 		      freq["T"]" T; "\
					 		      other" other;" \
					 }' "${RESULTS}.norm.fasta"
303
			loginfo "Done."
304 305
			
			loginfo "Reformating sequences..."
306
				lines=$(wc -l "${RESULTS}.norm.fasta" | $AwkCmd '{print $1}')
307 308 309 310
				
				loginfo "Sequence length $(seqlength ${RESULTS}.norm.fasta)"
				loginfo "lines $lines"
				formatfasta "${RESULTS}.norm.fasta" | \
311
				$AwkCmd -v lines=$lines ' \
312 313 314 315 316
					! /^>/ { \
							seq=tolower($0); \
							gsub(" ","",seq); \
							printf("     ") ;\
							for (i=0; i < 6; i++) { \
317
								f=substr(seq,i * 10 + 1, 10); \
318 319 320 321 322
								pos+=length(f); \
								f = f  substr("          ",1,10-length(f)); \
								printf("%s ",f) \
							}; \
							printf("   %6d\n",pos) \
323
					   }'
324 325 326 327 328 329 330 331 332
			loginfo "Done."
			
			loginfo "Closing sequence part..."
				echo "//"
			loginfo "Done."
		fi
		
		IFS=$'\f'
	done # End of the loop over the sequences
333 334
popTmpDir