Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
O
ORG.Annotate
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
11
Issues
11
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
Operations
Operations
Incidents
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
ORG.Asm
ORG.Annotate
Commits
abd6112a
Commit
abd6112a
authored
Oct 05, 2016
by
Eric Coissac
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Add management of partial sequences
parent
7e3ddd41
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
204 additions
and
133 deletions
+204
-133
org-annotate.sh
org-annotate.sh
+204
-133
No files found.
org-annotate.sh
View file @
abd6112a
...
...
@@ -24,6 +24,7 @@ normalization="yes"
irdetection
=
"yes"
organism
=
"no"
types
=
"chloro"
partial
=
0
function
usage
{
echo
"Usage:"
;
...
...
@@ -52,12 +53,21 @@ function usage {
echo
echo
' -m | --mitochondrion'
echo
' Selects for the annotation of an animal mitochondrion genome'
echo
echo
' -p | --partial'
echo
' Indicates that the genome sequence is partial and therefore in several contigs'
exit
$2
}
function
fastaIterator
()
{
awk
'/^>/ {if (seq) printf("%s\f",seq); seq=""} \
{if (seq) seq=seq"\n"; seq=seq $1} \
END {print seq}'
"
$1
"
}
# options may be followed by one colon to indicate they have a required argument
if
!
options
=
$(
getopt
-o
t:o:icrmh
-l
ncbi-taxid:,organism,no-ir-detection,chloroplast,nuclear-rdna,mitochondrion
,help
--
"
$@
"
)
if
!
options
=
$(
getopt
-o
t:o:icrmh
p
-l
ncbi-taxid:,organism,no-ir-detection,chloroplast,nuclear-rdna,mitochondrion,partial
,help
--
"
$@
"
)
then
# something went wrong, getopt will put out an error message for us
usage
$0
1
...
...
@@ -74,6 +84,7 @@ do
-c
|
--chloroplast
)
types
=
"chloro"
;;
-r
|
--nuclear-rdna
)
types
=
"nucrdna"
;;
-m
|
--mitochondrion
)
types
=
"mito"
;;
-p
|
--partial
)
partial
=
"1"
;;
-h
|
--help
)
usage
$0
0
;;
(
--
)
shift
;
break
;;
(
-
*
)
echo
"
$0
: error - unrecognized option
$1
"
1>&2
;
exit
1
;;
...
...
@@ -101,151 +112,211 @@ pushTmpDir ORG.organnot
rm
-f
${
LOG
}
openLogFile
${
LOG
}
case
"
$types
"
in
chloro
)
loginfo
"Annotating a plant chloroplast genome..."
IFS
=
$'
\f
'
if
[
"
$irdetection
"
==
"yes"
]
;
then
loginfo
"Normalizing the structure of the Chloroplast sequence..."
loginfo
" LSC + IRB + SSC + IRA"
${
PROG_DIR
}
/detectors/normalize/bin/go_normalize.sh
${
QUERY
}
>
"
${
RESULTS
}
.norm.fasta"
loginfo
"Done."
for
sequence
in
$(
fastaIterator
"
${
QUERY
}
"
)
;
do
unset
IFS
if
[[
!
-z
"
${
sequence
}
"
]]
;
then
echo
"
${
sequence
}
"
>
toannotate.fasta
seqid
=
$(
awk
'(NR==1) {print substr($1,2,1000)}'
toannotate.fasta
)
case
"
$types
"
in
chloro
)
loginfo
"Annotating a plant chloroplast genome..."
if
[[
"
$irdetection
"
==
"yes"
]]
&&
((
partial
==
0
))
;
then
loginfo
"Annotating the Inverted repeats and Single copies (LSC and SSC)..."
${
PROG_DIR
}
/detectors/ir/bin/go_ir.sh
"
${
RESULTS
}
.norm.fasta"
>
"
${
RESULTS
}
.annot"
loginfo
"Done."
loginfo
"Normalizing the structure of the Chloroplast sequence..."
loginfo
" LSC + IRB + SSC + IRA"
${
PROG_DIR
}
/detectors/normalize/bin/go_normalize.sh toannotate.fasta
>
"
${
RESULTS
}
.norm.fasta"
loginfo
"Done."
loginfo
"Annotating the Inverted repeats and Single copies (LSC and SSC)..."
${
PROG_DIR
}
/detectors/ir/bin/go_ir.sh
"
${
RESULTS
}
.norm.fasta"
>
"
${
RESULTS
}
.annot"
loginfo
"Done."
else
cat
toannotate.fasta
>
"
${
RESULTS
}
.norm.fasta"
rm
-f
"
${
RESULTS
}
.annot"
touch
"
${
RESULTS
}
.annot"
fi
loginfo
"Annotating the tRNA..."
${
PROG_DIR
}
/detectors/trna/bin/go_trna.sh
"
${
RESULTS
}
.norm.fasta"
>>
"
${
RESULTS
}
.annot"
loginfo
"Done."
loginfo
"Annotating the rRNA genes..."
${
PROG_DIR
}
/detectors/rrna/bin/go_rrna.sh
"
${
RESULTS
}
.norm.fasta"
>>
"
${
RESULTS
}
.annot"
loginfo
"Done."
loginfo
"Annotating the CDS..."
tcsh
-f
${
PROG_DIR
}
/detectors/cds/bin/go_cds.sh
"
${
RESULTS
}
.norm.fasta"
>>
"
${
RESULTS
}
.annot"
loginfo
"Done."
if
((
partial
==
0
))
;
then
topology
=
"circular"
defline
=
"plastid, complete genome"
else
topology
=
"linear"
defline
=
"plastid, partial sequence"
fi
;;
nucrdna
)
loginfo
"Annotating a plant rDNA cistron..."
loginfo
"Normalizing the structure of the cistron sequence..."
${
PROG_DIR
}
/detectors/normalizerdna/bin/go_normalizerdna.sh toannotate.fasta
>
"
${
RESULTS
}
.norm.fasta"
loginfo
"Done."
loginfo
"Annotating the rRNA genes..."
${
PROG_DIR
}
/detectors/nucrrna/bin/go_nucrrna.sh
"
${
RESULTS
}
.norm.fasta"
>
"
${
RESULTS
}
.annot"
loginfo
"Done."
topology
=
"linear"
defline
=
"18S rRNA gene, ITS1, 5.8S rRNA gene, ITS2 and 28S rRNA gene"
;;
mito
)
loginfo
"Annotating an animal mitochondrial genome..."
logerror
"Not yet implemented"
if
((
partial
==
0
))
;
then
topology
=
"circular"
defline
=
"mitochondrion, complete genome"
else
topology
=
"linear"
defline
=
"mitochondrion, partial sequence"
fi
exit
1
;;
*
)
usage
$0
1
;;
esac
if
[[
"
${
organism
}
"
==
"no"
]]
;
then
organism
=
"{organism}"
else
organism
=
"
$(
echo
${
organism
}
|
tr
'_'
' '
)
"
fi
loginfo
"Annotating the tRNA..."
${
PROG_DIR
}
/detectors/trna/bin/go_trna.sh
"
${
RESULTS
}
.norm.fasta"
>>
"
${
RESULTS
}
.annot"
loginfo
"Done."
sl
=
$(
seqlength
"
${
RESULTS
}
.norm.fasta"
)
loginfo
"Annotating the rRNA genes..."
${
PROG_DIR
}
/detectors/rrna/bin/go_rrna.sh
"
${
RESULTS
}
.norm.fasta"
>>
"
${
RESULTS
}
.annot"
loginfo
"Printing minimal header..."
echo
"ID
${
seqid
}
;
${
seqid
}
;
${
topology
}
; genomic DNA; XXX; XXX;
${
sl
}
BP."
echo
"XX"
echo
"AC
${
seqid
}
;"
echo
"DE
${
organism
}
${
defline
}
."
echo
"XX"
loginfo
"Done."
loginfo
"Annotating the CDS..."
tcsh
-f
${
PROG_DIR
}
/detectors/cds/bin/go_cds.sh
"
${
RESULTS
}
.norm.fasta"
>>
"
${
RESULTS
}
.annot"
loginfo
"Printing annotations header..."
echo
"FH Key Location/Qualifiers"
loginfo
"Done."
loginfo
"Printing the source feature"
echo
"FT source 1..
${
sl
}
"
>>
"
${
RESULTS
}
.annot"
if
[[
"
${
organism
}
"
!=
"{organism}"
]]
;
then
echo
"FT /organism=
\"
${
organism
}
\"
"
>>
"
${
RESULTS
}
.annot"
fi
case
"
${
types
}
"
in
chloro
)
echo
"FT /organelle=
\"
plastid:chloroplast
\"
"
>>
"
${
RESULTS
}
.annot"
;;
mito
)
echo
"FT /organelle=
\"
mitochondrion
\"
"
>>
"
${
RESULTS
}
.annot"
;;
*
)
loginfo
"Nuclear sequence"
;;
esac
echo
"FT /mol_type=
\"
genomic DNA
\"
"
>>
"
${
RESULTS
}
.annot"
if
[[
"
${
taxid
}
"
!=
"no"
]]
;
then
echo
"FT /db_xref=
\"
taxon:
${
taxid
}
\"
"
>>
"
${
RESULTS
}
.annot"
fi
# echo "FT /country=\"Poland: Bialowieza Forest\"" >> "${RESULTS}.annot"
loginfo
"Done."
topology
=
"circular"
defline
=
"plastid, complete genome"
;;
nucrdna
)
loginfo
"Annotating a plant rDNA cistron..."
loginfo
"Ordering annotations..."
awk
'/^.....(misc|repeat|rRNA|tRNA|gene|source)/ { \
match($3,"[0-9][0-9]*"); \
pos=substr($3,RSTART,RLENGTH)*1000 + 1; \
print pos,$0; \
next} \
{ pos++; \
print pos,$0}'
"
${
RESULTS
}
.annot"
|
\
sort
-nk1
|
\
awk
'{ \
match($0,"^[0-9]* ");\
line=substr($0,RLENGTH+1);\
print line}'
loginfo
"Done."
loginfo
"
Normalizing the structure of the cistron sequenc
e..."
${
PROG_DIR
}
/detectors/normalizerdna/bin/go_normalizerdna.sh
${
QUERY
}
>
"
${
RESULTS
}
.norm.fasta
"
loginfo
"
Closing annotations tabl
e..."
echo
"XX
"
loginfo
"Done."
loginfo
"Annotating the rRNA genes..."
${
PROG_DIR
}
/detectors/nucrrna/bin/go_nucrrna.sh
"
${
RESULTS
}
.norm.fasta"
>
"
${
RESULTS
}
.annot"
loginfo
"Computing statistics on nucleotide usage..."
awk
'! /^>/ { \
seq=toupper($0); \
gsub(" ","",seq); \
lseq=length(seq); \
for (i=0; i < lseq; i++) { \
freq[substr(seq,i,1)]++}\
} \
END { \
other=0; \
for (i in freq) { \
if (i!="A" && i!="C" && i!="G" && i!="T") {\
other+=freq[i] \
} \
}; \
print "SQ Sequence "\
(freq["A"]+freq["C"]+freq["G"]+freq["T"]+other) \
" BP; "\
freq["A"]" A; "\
freq["C"]" C; "\
freq["G"]" G; "\
freq["T"]" T; "\
other" other;" \
}'
"
${
RESULTS
}
.norm.fasta"
loginfo
"Done."
topology
=
"linear"
defline
=
"18S rRNA gene, ITS1, 5.8S rRNA gene, ITS2 and 28S rRNA gene"
;;
mito
)
loginfo
"Annotating an animal mitochondrial genome..."
logerror
"Not yet implemented"
topology
=
"circular"
defline
=
"mitochondrion, complete genome"
exit
1
;;
*
)
usage
$0
1
;;
esac
if
[[
"
${
organism
}
"
==
"no"
]]
;
then
organism
=
"{organism}"
else
organism
=
"
$(
echo
${
organism
}
|
tr
'_'
' '
)
"
fi
loginfo
"Printing minimal header..."
echo
"ID XXX; XXX;
${
topology
}
; genomic DNA; XXX; XXX;
$(
seqlength
${
RESULTS
}
.norm.fasta
)
BP."
echo
"XX"
echo
"AC XXX;"
echo
"DE
${
organism
}
${
defline
}
."
echo
"XX"
loginfo
"Done."
loginfo
"Printing annotations header..."
echo
"FH Key Location/Qualifiers"
loginfo
"Done."
loginfo
"Ordering annotations..."
awk
'/^.....(misc|repeat|rRNA|tRNA|gene)/ { \
match($3,"[0-9][0-9]*"); \
pos=substr($3,RSTART,RLENGTH)*1000 + 1; \
print pos,$0; \
next} \
{ pos++; \
print pos,$0}'
"
${
RESULTS
}
.annot"
|
\
sort
-nk1
|
\
awk
'{ \
match($0,"^[0-9]* ");\
line=substr($0,RLENGTH+1);\
print line}'
loginfo
"Done."
loginfo
"Closing annotations table..."
echo
"XX"
loginfo
"Done."
loginfo
"Computing statistics on nucleotide usage..."
awk
'! /^>/ { \
seq=toupper($0); \
gsub(" ","",seq); \
lseq=length(seq); \
for (i=0; i < lseq; i++) { \
freq[substr(seq,i,1)]++}\
} \
END { \
other=0; \
for (i in freq) { \
if (i!="A" && i!="C" && i!="G" && i!="T") {\
other+=freq[i] \
} \
}; \
print "SQ Sequence "\
(freq["A"]+freq["C"]+freq["G"]+freq["T"]+other) \
" BP; "\
freq["A"]" A; "\
freq["C"]" C; "\
freq["G"]" G; "\
freq["T"]" T; "\
other" other;" \
}'
"
${
RESULTS
}
.norm.fasta"
loginfo
"Done."
loginfo
"Reformating sequences..."
lines
=
$(
wc
-l
"
${
RESULTS
}
.norm.fasta"
|
awk
'{print $1}'
)
awk
-v
lines
=
$lines
' \
! /^>/ { \
seq=tolower($0); \
gsub(" ","",seq); \
printf(" ") ;\
for (i=0; i < 6; i++) { \
f=substr(seq,i * 10, 10); \
pos+=length(f); \
f = f substr(" ",1,10-length(f)); \
printf("%s ",f) \
}; \
if (NR==lines) \
{pos-=1}; \
printf(" %6d\n",pos) \
}'
"
${
RESULTS
}
.norm.fasta"
loginfo
"Done."
loginfo
"Closing sequence part..."
echo
"//"
loginfo
"Done."
loginfo
"Reformating sequences..."
lines
=
$(
wc
-l
"
${
RESULTS
}
.norm.fasta"
|
awk
'{print $1}'
)
awk
-v
lines
=
$lines
' \
! /^>/ { \
seq=tolower($0); \
gsub(" ","",seq); \
printf(" ") ;\
for (i=0; i < 6; i++) { \
f=substr(seq,i * 10, 10); \
pos+=length(f); \
f = f substr(" ",1,10-length(f)); \
printf("%s ",f) \
}; \
if (NR==lines) \
{pos-=1}; \
printf(" %6d\n",pos) \
}'
"
${
RESULTS
}
.norm.fasta"
loginfo
"Done."
loginfo
"Closing sequence part..."
echo
"//"
loginfo
"Done."
fi
IFS
=
$'
\f
'
done
# End of the loop over the sequences
popTmpDir
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment