go_normalize.sh 4.16 KB
Newer Older
Eric Coissac's avatar
Eric Coissac committed
1 2 3 4 5 6
#!/bin/bash
#
#                           NORMALISATION D'UN PLASTIDE
#
#========================================================================================
#
7 8
# Normalize the way the chloroplaste genome sequence is linearized in the fasta file
# The normalized sequence is: 
Eric Coissac's avatar
Eric Coissac committed
9
#
10
#               LSC + IRB + SSC + IRA
Eric Coissac's avatar
Eric Coissac committed
11
#
12 13 14 15
# The SSC and LSC are approximatively mapped by similarity with a reference database
# Inverted repeats (IRs) are identified for maximizing the segregation between 
# LSC and SSC match 
# 
Eric Coissac's avatar
Eric Coissac committed
16
#
17
#  go_normalize.sh <FASTAFILE>
Eric Coissac's avatar
Eric Coissac committed
18
#
19
#		- <FASTAFILE> : The fasta file containing the genome to normalize
Eric Coissac's avatar
Eric Coissac committed
20
#
21
#  Results are printed to the standart output
Eric Coissac's avatar
Eric Coissac committed
22 23 24
#
#========================================================================================

25 26
# -- CAUTION -- Works as long than the script 
#               is not called through a symlink
alain viari's avatar
alain viari committed
27 28
THIS_DIR="$(dirname ${BASH_SOURCE[0]})"
source ${THIS_DIR}/../lib/lookforIR.lib.sh
Eric Coissac's avatar
Eric Coissac committed
29

30
ORG_DEBUG=1
Eric Coissac's avatar
Eric Coissac committed
31

32
pushTmpDir ORG.normalize
Eric Coissac's avatar
Eric Coissac committed
33 34 35 36

	tmpfasta1="tmp_$$_1.fasta"
	tmpfasta2="tmp_$$_2.fasta"

37
	logdebug "Running on : $QUERY"
Eric Coissac's avatar
Eric Coissac committed
38

39 40 41 42 43
	loginfo "Computing the genome size..."
		genome_length=$(seqlength $QUERY)
		loginfo " --> $genome_length bp"
	loginfo "Done"
	
44 45
	IRDetected=1
	IR=( $(lookForIR ${QUERY}) ) || IRDetected=0
46
	
47
	if (( IRDetected == 1 )) ; then
48
	
49 50 51 52
		posIR1=${IR[4]}
		posIR2=${IR[6]}
		
		let "lenIR= ( ${IR[5]} +  ${IR[7]} ) / 2 " 
53
	
54 55
		let "endIR2=$posIR2 + $lenIR - 1"
		let "endIR1=$posIR1 + $lenIR - 1"
56
		
57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84
		if (( "$endIR2" >= "$genome_length" )) ; then	
			loginfo "IRB is at the end of the original sequence"
			
			#
			# We just move the IRB at the begining of the sequence
			#
			
			# Extract the IRB sequence
			let "posCut=($endIR1+$posIR2)/2"
			cutseq ${QUERY} ${posCut} ${genome_length} > ${tmpfasta1}
	
			# Append the remaining part of the genome		
			let "posCut=$posCut-1"
			cutseq ${QUERY} 1 ${posCut} >> ${tmpfasta1}
			
			# merges both the parts
			joinfasta ${tmpfasta1} > ${tmpfasta2}
			rm -f ${tmpfasta1}
			QUERY=${tmpfasta2}
	
			loginfo "Recomputing location of the IR..."
				declare -a IR=( $(lookForIR ${QUERY}) )
			loginfo "Done"
			
			posIR1="${IR[4]}"
			posIR2="${IR[6]}"
			
			let "lenIR=(${IR[5]} +  ${IR[7]}) / 2 " 
85
		
86 87 88 89
			let "endIR2=$posIR2 + $lenIR - 1"
			let "endIR1=$posIR1 + $lenIR - 1"
			
		fi		
90
		
91 92
		tmpIR1="tmp_$$_IR1.fasta"		
		tmpIR2="tmp_$$_IR2.fasta"		
93
		
94 95 96
		#enregistre les deux fragments IRa et IRb complet
		cutseq ${QUERY} ${posIR1} ${endIR1} > ${tmpIR1}
		cutseq ${QUERY} ${posIR2} ${endIR2} > ${tmpIR2}
97
		
98 99 100 101 102 103 104
		let "lenSC1=$posIR1 -1 + ($genome_length - endIR2)"
		let "lenSC2=$posIR2 - $endIR1"
		
		center="${IR[0]}"
			
		tmpLSC="tmp_$$_LSC.fasta"		
		tmpSSC="tmp_$$_SSC.fasta"		
105
		
106 107
		# Extract the central SC present in between the two IRs
		# considering it as LSC
108
	
109 110 111
		let "beginLSC=$endIR1+1"
		let "endLSC=$posIR2-1"
		cutseq ${QUERY} ${beginLSC} ${endLSC} > ${tmpLSC}
112
	
113
		strandLSC="${IR[1]}"
114 115
	
	
116 117 118 119 120
		# Extract the external SC present in two parts
		# Considering it as SSC
		
		let "beginSSC=$endIR2+1"
		cutseq ${QUERY} ${beginSSC} ${genome_length} > ${tmpSSC}
121
	
122 123
		let "endSSC=$posIR1-1"
		cutseq ${QUERY} 1 ${endSSC} >> ${tmpSSC}
124
	
125 126
		joinfasta ${tmpSSC} > ${tmpfasta1}
		mv ${tmpfasta1} ${tmpSSC}
127
		
128
		strandSSC="${IR[3]}"
129 130 131
		
		
	
132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166
		if [[ "$center" == "SSC" ]]; then
		
			# Actually this is the oposite LSC is SSC and SSC is LSC
	
			# Exchanges the SSC and LSC sequences
			mv ${tmpSSC}    ${tmpfasta1}
			mv ${tmpLSC}    ${tmpSSC}
			mv ${tmpfasta1} ${tmpLSC}
			
			# Exchanges the IRa and IRb sequences
			mv ${tmpIR1}    ${tmpfasta1}
			mv ${tmpIR2}    ${tmpIR1}
			mv ${tmpfasta1} ${tmpIR2}
			
			# Exchanges the strand of both the Single copies
			tmp=${strandSSC}
			strandSSC=${strandLSC}
			strandLSC=${tmp}
			
		fi
		
		# Reverse complement the SSC if needed
		if [[ "${strandSSC}" == "-" ]]; then
			fastarevcomp -f ${tmpSSC} > ${tmpfasta1}
			mv ${tmpfasta1} ${tmpSSC}
		fi
		
		# Reverse complement the LSC if needed
		if [[ "${strandLSC}" == "-" ]]; then
			fastarevcomp -f ${tmpLSC} > ${tmpfasta1}
			mv ${tmpfasta1} ${tmpLSC}
		fi
		
		# Merges the four parts of the genome.
		cat ${tmpLSC} ${tmpIR2} ${tmpSSC} ${tmpIR1} | joinfasta
Eric Coissac's avatar
Eric Coissac committed
167

168 169 170 171
	else
		# No IR detected --> normalization has no effect
		cat ${QUERY}
	fi
172
popTmpDir
Eric Coissac's avatar
Eric Coissac committed
173 174

exit 0
175