Commit ee54019d by Eric Coissac

Add annotation of nuclear rDNA cistron

parent 7f75da85
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
#!/bin/bash
#
# Annotate the Intergenic Spacer (ITS) of nuclear rDNA cluster
#
#========================================================================================
#
# This script is based on ITSx
#
#
# go_its.sh <FASTAFILE>
#
# - <FASTAFILE> : The fasta file containing the cluster to annotate
#
# Results are printed to the standart output
#
#========================================================================================
# -- CAUTION -- Works as long than the script
# is not called through a symlink
THIS_DIR="$(dirname ${BASH_SOURCE[0]})"
source "${THIS_DIR}/../../../scripts/bash_init.sh"
pushTmpDir ORG.its
loginfo "Normalizing nuclear rDNA cistron..."
RRNADB="${NUCRRNA_DATA_DIR}/plants/nuc_RRNA.hmm"
if [[ ! "$1" =~ ^/ ]]; then
QUERY="${CALL_DIR}/$1"
else
QUERY="$1"
fi
strand=( $(hmmsearch --max ${RRNADB} ${QUERY} | \
$AwkCmd '/Query: / { \
profil=$2; \
match($3,"[0-9][0-9]*");\
lprof=substr($3,RSTART,RLENGTH)} \
/ [0-9][0-9]* ! / { \
print profil,lprof,$7,$8,$10,$11}' | \
$AwkCmd '($3 <=5) && (($2-$4) <=5) { \
full=1;$5=$5-$3+1;$6=$6+($2-$4)} \
{loc="Forward"} \
($1 ~ /_RC$/) { \
loc="Reverse"} \
(full==1) {match($1,"_..*S");\
rrna=substr($1,RSTART+1,RLENGTH-1);\
print loc;\
full=0
}' | sort | uniq) )
if [[ "${#strand[@]}" == 1 ]] ; then
if [[ "${strand[0]}" == "Forward" ]] ; then
cat ${QUERY}
else
fastarevcomp -f ${QUERY}
fi
else
logerror "Cannot determine the Cistron orientation"
exit 1
fi
loginfo "Done."
popTmpDir
exit 0
#!/bin/bash
#
# Annotate the Intergenic Spacer (ITS) of nuclear rDNA cluster
#
#========================================================================================
#
# This script is based on ITSx
#
#
# go_its.sh <FASTAFILE>
#
# - <FASTAFILE> : The fasta file containing the cluster to annotate
#
# Results are printed to the standart output
#
#========================================================================================
# -- CAUTION -- Works as long than the script
# is not called through a symlink
THIS_DIR="$(dirname ${BASH_SOURCE[0]})"
source "${THIS_DIR}/../../../scripts/bash_init.sh"
pushTmpDir ORG.its
loginfo "Annotating ITS and TSU..."
RRNADB="${NUCRRNA_DATA_DIR}/plants/nuc_RRNA.hmm"
if [[ ! "$1" =~ ^/ ]]; then
QUERY="${CALL_DIR}/$1"
else
QUERY="$1"
fi
ITSx -p "${ITS_DATA_DIR}/ITSx_db/HMMs" -i "${QUERY}" -o "output.itsx"
ITS1=( $(sed -E 's/.*ITS1: *([0-9]+)-([0-9]+).*/\1 \2/' "output.itsx.positions.txt" ) )
ITS2=( $(sed -E 's/.*ITS2: *([0-9]+)-([0-9]+).*/\1 \2/' "output.itsx.positions.txt" ) )
TSU=( $(sed -E 's/.*5\.8S: *([0-9]+)-([0-9]+).*/\1 \2/' "output.itsx.positions.txt" ) )
if [[ ${#ITS1[@]}=="2" ]] ; then
echo "FT misc_RNA ${ITS1[0]}..${ITS1[1]}"
echo 'FT /gene="ITS1"'
echo 'FT /note="internal transcribed spacer 1, ITS1"'
fi
if [[ ${#TSU[@]}=="2" ]] ; then
echo "FT rRNA ${TSU[0]}..${TSU[1]}"
echo 'FT /gene="5.8S rRNA"'
echo 'FT /product="5.8S ribosomal nuclear RNA"'
fi
if [[ ${#ITS2[@]}=="2" ]] ; then
echo "FT misc_RNA ${ITS2[0]}..${ITS2[1]}"
echo 'FT /gene="ITS2"'
echo 'FT /note="internal transcribed spacer 2, ITS2"'
fi
hmmsearch --max ${RRNADB} ${QUERY} | \
$AwkCmd '/Query: / { \
profil=$2; \
match($3,"[0-9][0-9]*");\
lprof=substr($3,RSTART,RLENGTH)} \
/ [0-9][0-9]* ! / { \
print profil,lprof,$7,$8,$10,$11}' | \
$AwkCmd '($3 <=5) && (($2-$4) <=5) { \
full=1;$5=$5-$3+1;$6=$6+($2-$4)} \
{loc=$5".."$6} \
($1 ~ /_RC$/) { \
loc="complement("loc")"} \
(full==1) {match($1,"_..*S");\
rrna=substr($1,RSTART+1,RLENGTH-1);\
print "FT rRNA " loc; \
print "FT /gene=\"rrn"rrna"\""
print "FT /product=\""rrna" ribosomal RNA\"";\
full=0
}'
loginfo "Done."
popTmpDir
exit 0
......@@ -23,9 +23,41 @@ taxid="no"
normalization="yes"
irdetection="yes"
organism="no"
types="chloro"
function usage {
echo "Usage:" ;
echo " $1 "'[-t|--ncbi-taxid ###] [-n|--no-normalization] \'
echo ' [-i|--no-ir-detection] [-h|--help] \ '
echo ' [-o|--organism <organism_name>] \ '
echo ' [-c|--chloroplast|-r|--nuclear-rdna|-m|--mitochondrion] <FASTAFILE>'
echo
echo "Options:"
echo ' -t ### | --ncbi-taxid ###'
echo ' ### represents the ncbi taxid associated to the sequence'
echo
echo ' -i | --no-ir-detection'
echo ' Does not look for inverted repeats in the plastid genome'
echo
echo ' -o | --organism <organism_name>'
echo ' Allows for specifiying the organism name in the embl generated file'
echo ' Spaces have to be substituted by underscore ex : Abies_alba'
echo
echo ' -c | --chloroplast'
echo ' Selects for the annotation of a chloroplast genome'
echo ' This is the default mode'
echo
echo ' -r | --nuclear-rdna'
echo ' Selects for the annotation of the rDNA nuclear cistron'
echo
echo ' -m | --mitochondrion'
echo ' Selects for the annotation of an animal mitochondrion genome'
exit $2
}
# options may be followed by one colon to indicate they have a required argument
if ! options=$(getopt -o t:o:ih -l ncbi-taxid:,organism,no-ir-detection,help -- "$@")
if ! options=$(getopt -o t:o:icrmh -l ncbi-taxid:,organism,no-ir-detection,chloroplast,nuclear-rdna,mitochondrion,help -- "$@")
then
# something went wrong, getopt will put out an error message for us
exit 1
......@@ -36,20 +68,13 @@ eval set -- "$options"
while [ $# -gt 0 ]
do
case $1 in
-t|--ncbi-taxid) taxid="$2" ; shift;;
-t|--ncbi-taxid) taxid="$2" ; shift ;;
-i|--no-ir-detection) irdetection="no" ;;
-o|--organism) organism="$2" ; shift;;
-h|--help) echo "Usage:" ;
echo " $0 "'[-t|--ncbi-taxid ###] [-n|--no-normalization] \'
echo " [-i|--no-ir-detection] [-h|--help] <FASTAFILE>"
echo
echo "Options:"
echo ' -t ### | --ncbi-taxid ###'
echo ' ### represents the ncbi taxid associated to the sequence'
echo
echo ' -i | --no-ir-detection'
echo ' Does not look for inverted repeats in the plastid genome'
exit 0;;
-o|--organism) organism="$2" ; shift ;;
-c|--chloroplast) types="chloro" ;;
-r|--nuclear-rdna) types="nucrdna" ;;
-m|--mitochondrion) types="mito" ;;
-h|--help) usage $0 0;;
(--) shift; break;;
(-*) echo "$0: error - unrecognized option $1" 1>&2; exit 1;;
(*) break;;
......@@ -57,6 +82,7 @@ do
shift
done
echo $type
#############################
pushTmpDir ORG.organnot
......@@ -73,43 +99,78 @@ pushTmpDir ORG.organnot
rm -f ${LOG}
openLogFile ${LOG}
if [ "$irdetection"=="yes" ]; then
loginfo "Normalizing the structure of the Chloroplast sequence..."
loginfo " LSC + IRB + SSC + IRA"
${PROG_DIR}/detectors/normalize/bin/go_normalize.sh ${QUERY} > "${RESULTS}.norm.fasta"
loginfo "Done."
case "$types" in
chloro)
loginfo "Annotating a plant chloroplast genome..."
if [ "$irdetection"=="yes" ]; then
loginfo "Annotating the Inverted repeats and Single copies (LSC and SSC)..."
${PROG_DIR}/detectors/ir/bin/go_ir.sh "${RESULTS}.norm.fasta" > "${RESULTS}.annot"
loginfo "Done."
loginfo "Normalizing the structure of the Chloroplast sequence..."
loginfo " LSC + IRB + SSC + IRA"
${PROG_DIR}/detectors/normalize/bin/go_normalize.sh ${QUERY} > "${RESULTS}.norm.fasta"
loginfo "Done."
loginfo "Annotating the Inverted repeats and Single copies (LSC and SSC)..."
${PROG_DIR}/detectors/ir/bin/go_ir.sh "${RESULTS}.norm.fasta" > "${RESULTS}.annot"
loginfo "Done."
fi
loginfo "Annotating the tRNA..."
${PROG_DIR}/detectors/trna/bin/go_trna.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot"
loginfo "Done."
loginfo "Annotating the rRNA genes..."
${PROG_DIR}/detectors/rrna/bin/go_rrna.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot"
loginfo "Done."
loginfo "Annotating the CDS..."
tcsh -f ${PROG_DIR}/detectors/cds/bin/go_cds.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot"
loginfo "Done."
topology="circular"
defline="plastid, complete genome"
;;
nucrdna)
loginfo "Annotating a plant rDNA cistron..."
loginfo "Normalizing the structure of the cistron sequence..."
${PROG_DIR}/detectors/normalizerdna/bin/go_normalizerdna.sh ${QUERY} > "${RESULTS}.norm.fasta"
loginfo "Done."
loginfo "Annotating the rRNA genes..."
${PROG_DIR}/detectors/nucrrna/bin/go_nucrrna.sh "${RESULTS}.norm.fasta" > "${RESULTS}.annot"
loginfo "Done."
topology="linear"
defline="18S rRNA gene, ITS1, 5.8S rRNA gene, ITS2 and 28S rRNA gene"
;;
mito)
loginfo "Annotating an animal mitochondrial genome..."
logerror "Not yet implemented"
topology="circular"
defline="mitochondrion, complete genome"
exit 1
;;
*)
echo usage $0 1;;
esac
if [[ "${organism}" == "no" ]]; then
organism="{organism}"
else
organism="$(echo ${organism} | tr '_' ' ')"
fi
loginfo "Annotating the tRNA..."
${PROG_DIR}/detectors/trna/bin/go_trna.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot"
loginfo "Done."
loginfo "Annotating the rRNA genes..."
${PROG_DIR}/detectors/rrna/bin/go_rrna.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot"
loginfo "Done."
loginfo "Annotating the CDS..."
tcsh -f ${PROG_DIR}/detectors/cds/bin/go_cds.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot"
loginfo "Done."
loginfo "Printing minimal header..."
echo "ID XXX; XXX; circular; genomic DNA; XXX; XXX; $(seqlength ${RESULTS}.norm.fasta) BP."
echo "ID XXX; XXX; ${topology}; genomic DNA; XXX; XXX; $(seqlength ${RESULTS}.norm.fasta) BP."
echo "XX"
echo "AC XXX;"
echo "DE ${organism} ${defline}."
echo "XX"
if [[ "${organism}" == "no" ]]; then
echo "DE {organism} plastid, complete genome."
else
echo "DE $(echo ${organism} | tr '_' ' ') plastid, complete genome."
fi
echo "XX"
loginfo "Done."
loginfo "Printing annotations header..."
......
......@@ -182,6 +182,12 @@ CDS_DATA_DIR="${DATA_DIR}/cds" # Directory containing data related to
RRNA_DATA_DIR="${DATA_DIR}/rrna" # Directory containing data related to
# rRNAs detection
NUCRRNA_DATA_DIR="${DATA_DIR}/nucrrna" # Directory containing data related to
# rRNAs detection
ITS_DATA_DIR="${DATA_DIR}/its" # Directory containing data related to
# rRNAs detection
#
#
......
......@@ -25,31 +25,44 @@ PRTPATH = $(abspath $(PRTDIR))
DATADIR = $(CFGDIR)../data
DATAITS = $(DATADIR)/its
HMMPRESS= $(BINDIR)/hmmpress
HMMDIR = $(PKGDIR)/ITSx_db/HMMs
HMMS = $(wildcard $(HMMDIR)/*.hmm)
HMMP = $(patsubst %.hmm,%.hmm.h3p,$(HMMS))
HMMI = $(patsubst %.hmm,%.hmm.h3i,$(HMMS))
HMMM = $(patsubst %.hmm,%.hmm.h3m,$(HMMS))
#
# Rules
#
.PHONY: all clean test portclean pkg pkg.expand pkg.install
all:: pkg
%.hmm.h3i: %.hmm
echo $(HMMPRESS)
(! test -s $< ) || $(HMMPRESS) -f $<
all:: pkg.install
pkg.expand::
test -d $(PKGDIR) || mkdir $(PKGDIR)
$(TAR) zxf $(PKGTAR) -C $(PKGDIR) --strip-components 1
pkg.install:: pkg.expand
pkg.install:: pkg
@mkdir -p $(BINDIR)
@cp $(PKGDIR)/ITSx $(BINDIR)
@mkdir -p $(DATAITS)
@cp -r $(PKGDIR)/ITSx_db $(DATAITS)
@echo "+++++++++++ package $(PKG) done"
pkg:: pkg.install
pkg:: pkg.expand clean $(HMMI)
test::
echo No test available
clean::
\rm -f $(HMMP) $(HMMI) $(HMMM)
echo Done
portclean::
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment