Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
O
ORG.Annotate
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
11
Issues
11
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
Operations
Operations
Incidents
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
ORG.Asm
ORG.Annotate
Commits
ee54019d
Commit
ee54019d
authored
May 02, 2016
by
Eric Coissac
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Add annotation of nuclear rDNA cistron
parent
7f75da85
Changes
8
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
72157 additions
and
45 deletions
+72157
-45
data/nucrrna/plants/nuc_RRNA.hmm
data/nucrrna/plants/nuc_RRNA.hmm
+29834
-0
data/nucrrna/plants/nuc_RRNA_18S.fst
data/nucrrna/plants/nuc_RRNA_18S.fst
+1440
-0
data/nucrrna/plants/nuc_RRNA_28S.fst
data/nucrrna/plants/nuc_RRNA_28S.fst
+40600
-0
detectors/normalizerdna/bin/go_normalizerdna.sh
detectors/normalizerdna/bin/go_normalizerdna.sh
+70
-0
detectors/nucrrna/bin/go_nucrrna.sh
detectors/nucrrna/bin/go_nucrrna.sh
+88
-0
org-annotate.sh
org-annotate.sh
+103
-42
scripts/bash_init.sh
scripts/bash_init.sh
+6
-0
src/ITSx/Makefile
src/ITSx/Makefile
+16
-3
No files found.
data/nucrrna/plants/nuc_RRNA.hmm
0 → 100644
View file @
ee54019d
This diff is collapsed.
Click to expand it.
data/nucrrna/plants/nuc_RRNA_18S.fst
0 → 100644
View file @
ee54019d
This diff is collapsed.
Click to expand it.
data/nucrrna/plants/nuc_RRNA_28S.fst
0 → 100644
View file @
ee54019d
This diff is collapsed.
Click to expand it.
detectors/normalizerdna/bin/go_normalizerdna.sh
0 → 100755
View file @
ee54019d
#!/bin/bash
#
#          Normalize the orientation of a nuclear rDNA cistron
#
#========================================================================================
#
# The rRNA profiles stored in ${NUCRRNA_DATA_DIR}/plants/nuc_RRNA.hmm are
# searched in the query sequence with hmmsearch. Profiles whose name ends
# with "_RC" are counted as "Reverse" hits, every other profile as a
# "Forward" hit. Only hits covering the profile up to 5 positions from each
# of its ends are taken into account.
#
#  go_normalizerdna.sh <FASTAFILE>
#
#		- <FASTAFILE> : The fasta file containing the cistron to normalize
#
#  The sequence is printed to the standard output:
#     - unchanged when all the retained hits are "Forward",
#     - processed by `fastarevcomp -f` when they are all "Reverse".
#
#  The script exits with status 1 when the orientation is ambiguous
#  (no retained hit, or both orientations detected).
#
#========================================================================================

# -- CAUTION -- Works as long as the script
#               is not called through a symlink
THIS_DIR="$(dirname "${BASH_SOURCE[0]}")"
source "${THIS_DIR}/../../../scripts/bash_init.sh"

if [[ -z "$1" ]]; then
	logerror "Usage: $(basename "${BASH_SOURCE[0]}") <FASTAFILE>"
	exit 1
fi

pushTmpDir ORG.its

	loginfo "Normalizing nuclear rDNA cistron..."

	RRNADB="${NUCRRNA_DATA_DIR}/plants/nuc_RRNA.hmm"

	# A relative path is interpreted from the directory the pipeline was
	# called from (CALL_DIR comes from bash_init.sh), not from the
	# temporary directory entered by pushTmpDir.
	if [[ ! "$1" =~ ^/ ]]; then
		QUERY="${CALL_DIR}/$1"
	else
		QUERY="$1"
	fi

	# Stage 1: for each hmmsearch "Query:" section remember the profile name
	#          and the number embedded in the third field, then emit one row
	#          per line flagged with "!":
	#             profile  profile_length  $7  $8  $10  $11
	#          NOTE(review): fields 7/8 and 10/11 are presumably the
	#          "hmm from/to" and "ali from/to" columns of the hmmsearch
	#          domain table - confirm against the HMMER output format.
	# Stage 2: keep the rows where the hit starts within the 5 first and
	#          ends within the 5 last positions of the profile, and print
	#          the orientation deduced from the profile name.
	#
	# $AwkCmd is deliberately left unquoted: it is defined outside this
	# script and may carry options. The word splitting of $(...) is also
	# intended, each distinct orientation becomes one array element.
	strand=( $(hmmsearch --max "${RRNADB}" "${QUERY}" |
	           $AwkCmd '/Query: / {
	                        profil = $2
	                        match($3, "[0-9][0-9]*")
	                        lprof = substr($3, RSTART, RLENGTH)
	                    }
	                    / [0-9][0-9]* ! / {
	                        print profil, lprof, $7, $8, $10, $11
	                    }' |
	           $AwkCmd '($3 <= 5) && (($2 - $4) <= 5) {
	                        if ($1 ~ /_RC$/)
	                            print "Reverse"
	                        else
	                            print "Forward"
	                    }' |
	           sort -u) )

	# Exactly one distinct orientation must have been observed.
	if [[ "${#strand[@]}" == 1 ]]; then
		if [[ "${strand[0]}" == "Forward" ]]; then
			cat "${QUERY}"
		else
			fastarevcomp -f "${QUERY}"
		fi
	else
		logerror "Cannot determine the Cistron orientation"
		# Leave the temporary directory the same way the success path does.
		popTmpDir
		exit 1
	fi

	loginfo "Done."

popTmpDir

exit 0
detectors/nucrrna/bin/go_nucrrna.sh
0 → 100755
View file @
ee54019d
#!/bin/bash
#
#     Annotate the rRNA genes and the internal transcribed spacers (ITS)
#     of a nuclear rDNA cistron
#
#========================================================================================
#
# ITS1, 5.8S rRNA and ITS2 are located with ITSx; the other rRNA genes are
# located with hmmsearch against ${NUCRRNA_DATA_DIR}/plants/nuc_RRNA.hmm.
#
#  go_nucrrna.sh <FASTAFILE>
#
#		- <FASTAFILE> : The fasta file containing the cistron to annotate
#
#  Results (feature table "FT" lines) are printed to the standard output
#
#  NOTE(review): the padding inside the "FT ..." strings below is reproduced
#  as it appears in the reviewed view of this file; confirm that it matches
#  the column layout expected by the consumers of the feature table.
#
#========================================================================================

# -- CAUTION -- Works as long as the script
#               is not called through a symlink
THIS_DIR="$(dirname "${BASH_SOURCE[0]}")"
source "${THIS_DIR}/../../../scripts/bash_init.sh"

# Name of the position summary written by ITSx for the "-o output.itsx" prefix.
ITSX_POSITIONS="output.itsx.positions.txt"

# Prints "<from> <to>" for the region labelled $1 (an extended regular
# expression) in the ITSx position file. A line without a
# "<label>: <from>-<to>" entry is printed untouched by sed, so the caller
# has to check that exactly two words were returned.
function itsx_region {
	sed -E "s/.*$1: *([0-9]+)-([0-9]+).*/\1 \2/" "${ITSX_POSITIONS}"
}

# Prints one feature: key $1, qualifier lines $2 and $3, followed by the
# words of the location ($4 = from, $5 = to). Nothing is printed unless the
# location is made of exactly two words.
function print_feature {
	local key="$1"
	local qualifier1="$2"
	local qualifier2="$3"
	shift 3

	if [[ $# == "2" ]]; then
		echo "FT ${key} $1..$2"
		echo "${qualifier1}"
		echo "${qualifier2}"
	fi
}

if [[ -z "$1" ]]; then
	logerror "Usage: $(basename "${BASH_SOURCE[0]}") <FASTAFILE>"
	exit 1
fi

pushTmpDir ORG.its

	loginfo "Annotating ITS and TSU..."

	RRNADB="${NUCRRNA_DATA_DIR}/plants/nuc_RRNA.hmm"

	# A relative path is interpreted from the directory the pipeline was
	# called from (CALL_DIR comes from bash_init.sh), not from the
	# temporary directory entered by pushTmpDir.
	if [[ ! "$1" =~ ^/ ]]; then
		QUERY="${CALL_DIR}/$1"
	else
		QUERY="$1"
	fi

	ITSx -p "${ITS_DATA_DIR}/ITSx_db/HMMs" -i "${QUERY}" -o "output.itsx"

	# Word splitting of $(...) is intended: from and to become two elements.
	ITS1=( $(itsx_region 'ITS1') )
	ITS2=( $(itsx_region 'ITS2') )
	TSU=( $(itsx_region '5\.8S') )

	print_feature "misc_RNA" \
	              'FT /gene="ITS1"' \
	              'FT /note="internal transcribed spacer 1, ITS1"' \
	              "${ITS1[@]}"

	print_feature "rRNA" \
	              'FT /gene="5.8S rRNA"' \
	              'FT /product="5.8S ribosomal nuclear RNA"' \
	              "${TSU[@]}"

	print_feature "misc_RNA" \
	              'FT /gene="ITS2"' \
	              'FT /note="internal transcribed spacer 2, ITS2"' \
	              "${ITS2[@]}"

	# Stage 1: for each hmmsearch "Query:" section remember the profile name
	#          and the number embedded in the third field, then emit one row
	#          per line flagged with "!":
	#             profile  profile_length  $7  $8  $10  $11
	#          NOTE(review): fields 7/8 and 10/11 are presumably the
	#          "hmm from/to" and "ali from/to" columns of the hmmsearch
	#          domain table - confirm against the HMMER output format.
	# Stage 2: keep the rows where the hit starts within the 5 first and
	#          ends within the 5 last positions of the profile, extend the
	#          location by the uncovered profile ends, and print the feature.
	#          Profiles whose name ends with "_RC" give a complement()
	#          location.
	#
	# $AwkCmd is deliberately left unquoted: it is defined outside this
	# script and may carry options.
	hmmsearch --max "${RRNADB}" "${QUERY}" |
	  $AwkCmd '/Query: / {
	               profil = $2
	               match($3, "[0-9][0-9]*")
	               lprof = substr($3, RSTART, RLENGTH)
	           }
	           / [0-9][0-9]* ! / {
	               print profil, lprof, $7, $8, $10, $11
	           }' |
	  $AwkCmd '($3 <= 5) && (($2 - $4) <= 5) {
	               from = $5 - $3 + 1
	               to   = $6 + ($2 - $4)
	               loc  = from ".." to
	               if ($1 ~ /_RC$/)
	                   loc = "complement(" loc ")"
	               match($1, "_..*S")
	               rrna = substr($1, RSTART + 1, RLENGTH - 1)
	               print "FT rRNA " loc
	               print "FT /gene=\"rrn" rrna "\""
	               print "FT /product=\"" rrna " ribosomal RNA\""
	           }'

	loginfo "Done."

popTmpDir

exit 0
org-annotate.sh
View file @
ee54019d
...
...
@@ -23,9 +23,41 @@ taxid="no"
normalization
=
"yes"
irdetection
=
"yes"
organism
=
"no"
types
=
"chloro"
#
# usage <program_name> <exit_status>
#
#   Prints the command line help and terminates the script.
#
#     - <program_name> : name displayed at the beginning of the synopsis
#     - <exit_status>  : status the script exits with (0 when omitted)
#
#   The help goes to the standard output when <exit_status> is 0 (explicit
#   help request) and to the standard error otherwise, so that a usage
#   error never ends up in the data written on the standard output.
#
function usage {
	local program="$1"
	local status="${2:-0}"
	local fd=1

	if [[ "${status}" != "0" ]]; then
		fd=2
	fi

	{
		echo "Usage:"
		echo "${program}" '[-t|--ncbi-taxid ###] [-n|--no-normalization] \'
		echo ' [-i|--no-ir-detection] [-h|--help] \ '
		echo ' [-o|--organism <organism_name>] \ '
		echo ' [-c|--chloroplast|-r|--nuclear-rdna|-m|--mitochondrion] <FASTAFILE>'
		echo
		echo "Options:"
		echo ' -t ### | --ncbi-taxid ###'
		echo ' ### represents the ncbi taxid associated to the sequence'
		echo
		echo ' -i | --no-ir-detection'
		echo ' Does not look for inverted repeats in the plastid genome'
		echo
		echo ' -o | --organism <organism_name>'
		echo ' Allows for specifiying the organism name in the embl generated file'
		echo ' Spaces have to be substituted by underscore ex : Abies_alba'
		echo
		echo ' -c | --chloroplast'
		echo ' Selects for the annotation of a chloroplast genome'
		echo ' This is the default mode'
		echo
		echo ' -r | --nuclear-rdna'
		echo ' Selects for the annotation of the rDNA nuclear cistron'
		echo
		echo ' -m | --mitochondrion'
		echo ' Selects for the annotation of an animal mitochondrion genome'
	} >&"${fd}"

	exit "${status}"
}
# options may be followed by one colon to indicate they have a required argument
if
!
options
=
$(
getopt
-o
t:o:i
h
-l
ncbi-taxid:,organism,no-ir-detect
ion,help
--
"
$@
"
)
if
!
options
=
$(
getopt
-o
t:o:i
crmh
-l
ncbi-taxid:,organism,no-ir-detection,chloroplast,nuclear-rdna,mitochondr
ion,help
--
"
$@
"
)
then
# something went wrong, getopt will put out an error message for us
exit
1
...
...
@@ -36,20 +68,13 @@ eval set -- "$options"
while
[
$#
-gt
0
]
do
case
$1
in
-t
|
--ncbi-taxid
)
taxid
=
"
$2
"
;
shift
;;
-t
|
--ncbi-taxid
)
taxid
=
"
$2
"
;
shift
;;
-i
|
--no-ir-detection
)
irdetection
=
"no"
;;
-o
|
--organism
)
organism
=
"
$2
"
;
shift
;;
-h
|
--help
)
echo
"Usage:"
;
echo
"
$0
"
'[-t|--ncbi-taxid ###] [-n|--no-normalization] \'
echo
" [-i|--no-ir-detection] [-h|--help] <FASTAFILE>"
echo
echo
"Options:"
echo
' -t ### | --ncbi-taxid ###'
echo
' ### represents the ncbi taxid associated to the sequence'
echo
echo
' -i | --no-ir-detection'
echo
' Does not look for inverted repeats in the plastid genome'
exit
0
;;
-o
|
--organism
)
organism
=
"
$2
"
;
shift
;;
-c
|
--chloroplast
)
types
=
"chloro"
;;
-r
|
--nuclear-rdna
)
types
=
"nucrdna"
;;
-m
|
--mitochondrion
)
types
=
"mito"
;;
-h
|
--help
)
usage
$0
0
;;
(
--
)
shift
;
break
;;
(
-
*
)
echo
"
$0
: error - unrecognized option
$1
"
1>&2
;
exit
1
;;
(
*
)
break
;;
...
...
@@ -57,6 +82,7 @@ do
shift
done
echo
$type
#############################
pushTmpDir ORG.organnot
...
...
@@ -73,43 +99,78 @@ pushTmpDir ORG.organnot
rm
-f
${
LOG
}
openLogFile
${
LOG
}
if
[
"
$irdetection
"
==
"yes"
]
;
then
loginfo
"Normalizing the structure of the Chloroplast sequence..."
loginfo
" LSC + IRB + SSC + IRA"
${
PROG_DIR
}
/detectors/normalize/bin/go_normalize.sh
${
QUERY
}
>
"
${
RESULTS
}
.norm.fasta"
loginfo
"Done."
case
"
$types
"
in
chloro
)
loginfo
"Annotating a plant chloroplast genome..."
if
[
"
$irdetection
"
==
"yes"
]
;
then
loginfo
"Annotating the Inverted repeats and Single copies (LSC and SSC)..."
${
PROG_DIR
}
/detectors/ir/bin/go_ir.sh
"
${
RESULTS
}
.norm.fasta"
>
"
${
RESULTS
}
.annot"
loginfo
"Done."
loginfo
"Normalizing the structure of the Chloroplast sequence..."
loginfo
" LSC + IRB + SSC + IRA"
${
PROG_DIR
}
/detectors/normalize/bin/go_normalize.sh
${
QUERY
}
>
"
${
RESULTS
}
.norm.fasta"
loginfo
"Done."
loginfo
"Annotating the Inverted repeats and Single copies (LSC and SSC)..."
${
PROG_DIR
}
/detectors/ir/bin/go_ir.sh
"
${
RESULTS
}
.norm.fasta"
>
"
${
RESULTS
}
.annot"
loginfo
"Done."
fi
loginfo
"Annotating the tRNA..."
${
PROG_DIR
}
/detectors/trna/bin/go_trna.sh
"
${
RESULTS
}
.norm.fasta"
>>
"
${
RESULTS
}
.annot"
loginfo
"Done."
loginfo
"Annotating the rRNA genes..."
${
PROG_DIR
}
/detectors/rrna/bin/go_rrna.sh
"
${
RESULTS
}
.norm.fasta"
>>
"
${
RESULTS
}
.annot"
loginfo
"Done."
loginfo
"Annotating the CDS..."
tcsh
-f
${
PROG_DIR
}
/detectors/cds/bin/go_cds.sh
"
${
RESULTS
}
.norm.fasta"
>>
"
${
RESULTS
}
.annot"
loginfo
"Done."
topology
=
"circular"
defline
=
"plastid, complete genome"
;;
nucrdna
)
loginfo
"Annotating a plant rDNA cistron..."
loginfo
"Normalizing the structure of the cistron sequence..."
${
PROG_DIR
}
/detectors/normalizerdna/bin/go_normalizerdna.sh
${
QUERY
}
>
"
${
RESULTS
}
.norm.fasta"
loginfo
"Done."
loginfo
"Annotating the rRNA genes..."
${
PROG_DIR
}
/detectors/nucrrna/bin/go_nucrrna.sh
"
${
RESULTS
}
.norm.fasta"
>
"
${
RESULTS
}
.annot"
loginfo
"Done."
topology
=
"linear"
defline
=
"18S rRNA gene, ITS1, 5.8S rRNA gene, ITS2 and 28S rRNA gene"
;;
mito
)
loginfo
"Annotating an animal mitochondrial genome..."
logerror
"Not yet implemented"
topology
=
"circular"
defline
=
"mitochondrion, complete genome"
exit
1
;;
*
)
echo
usage
$0
1
;;
esac
if
[[
"
${
organism
}
"
==
"no"
]]
;
then
organism
=
"{organism}"
else
organism
=
"
$(
echo
${
organism
}
|
tr
'_'
' '
)
"
fi
loginfo
"Annotating the tRNA..."
${
PROG_DIR
}
/detectors/trna/bin/go_trna.sh
"
${
RESULTS
}
.norm.fasta"
>>
"
${
RESULTS
}
.annot"
loginfo
"Done."
loginfo
"Annotating the rRNA genes..."
${
PROG_DIR
}
/detectors/rrna/bin/go_rrna.sh
"
${
RESULTS
}
.norm.fasta"
>>
"
${
RESULTS
}
.annot"
loginfo
"Done."
loginfo
"Annotating the CDS..."
tcsh
-f
${
PROG_DIR
}
/detectors/cds/bin/go_cds.sh
"
${
RESULTS
}
.norm.fasta"
>>
"
${
RESULTS
}
.annot"
loginfo
"Done."
loginfo
"Printing minimal header..."
echo
"ID XXX; XXX;
circular
; genomic DNA; XXX; XXX;
$(
seqlength
${
RESULTS
}
.norm.fasta
)
BP."
echo
"ID XXX; XXX;
${
topology
}
; genomic DNA; XXX; XXX;
$(
seqlength
${
RESULTS
}
.norm.fasta
)
BP."
echo
"XX"
echo
"AC XXX;"
echo
"DE
${
organism
}
${
defline
}
."
echo
"XX"
if
[[
"
${
organism
}
"
==
"no"
]]
;
then
echo
"DE {organism} plastid, complete genome."
else
echo
"DE
$(
echo
${
organism
}
|
tr
'_'
' '
)
plastid, complete genome."
fi
echo
"XX"
loginfo
"Done."
loginfo
"Printing annotations header..."
...
...
scripts/bash_init.sh
View file @
ee54019d
...
...
@@ -182,6 +182,12 @@ CDS_DATA_DIR="${DATA_DIR}/cds" # Directory containing data related to
RRNA_DATA_DIR
=
"
${
DATA_DIR
}
/rrna"
# Directory containing data related to
# rRNAs detection
NUCRRNA_DATA_DIR
=
"
${
DATA_DIR
}
/nucrrna"
# Directory containing data related to
# nuclear rRNAs detection
ITS_DATA_DIR
=
"
${
DATA_DIR
}
/its"
# Directory containing data related to
# ITS detection (ITSx database)
#
#
...
...
src/ITSx/Makefile
View file @
ee54019d
...
...
@@ -25,31 +25,44 @@ PRTPATH = $(abspath $(PRTDIR))
DATADIR
=
$(CFGDIR)
../data
DATAITS
=
$(DATADIR)
/its
HMMPRESS
=
$(BINDIR)
/hmmpress
HMMDIR
=
$(PKGDIR)
/ITSx_db/HMMs
HMMS
=
$(
wildcard
$(HMMDIR)
/
*
.hmm
)
HMMP
=
$(
patsubst
%.hmm,%.hmm.h3p,
$(HMMS)
)
HMMI
=
$(
patsubst
%.hmm,%.hmm.h3i,
$(HMMS)
)
HMMM
=
$(
patsubst
%.hmm,%.hmm.h3m,
$(HMMS)
)
#
# Rules
#
.PHONY
:
all clean test portclean pkg pkg.expand pkg.install
all
::
pkg
%.hmm.h3i
:
%.hmm
echo
$(HMMPRESS)
(!
test
-s
$<
)
||
$(HMMPRESS)
-f
$<
all
::
pkg.install
pkg.expand
::
test
-d
$(PKGDIR)
||
mkdir
$(PKGDIR)
$(TAR)
zxf
$(PKGTAR)
-C
$(PKGDIR)
--strip-components
1
pkg.install
::
pkg
.expand
pkg.install
::
pkg
@
mkdir
-p
$(BINDIR)
@
cp
$(PKGDIR)
/ITSx
$(BINDIR)
@
mkdir
-p
$(DATAITS)
@
cp
-r
$(PKGDIR)
/ITSx_db
$(DATAITS)
@
echo
"+++++++++++ package
$(PKG)
done"
pkg
::
pkg.
install
pkg
::
pkg.
expand clean $(HMMI)
test
::
echo
No
test
available
clean
::
\r
m
-f
$(HMMP)
$(HMMI)
$(HMMM)
echo
Done
portclean
::
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment