Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
O
ORG.Annotate
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
11
Issues
11
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
Operations
Operations
Incidents
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
ORG.Asm
ORG.Annotate
Commits
07fb256b
Commit
07fb256b
authored
Nov 09, 2015
by
alain viari
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
added cds/tools/compare
parent
6be837df
Changes
16
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
16 changed files
with
12503 additions
and
16 deletions
+12503
-16
detectors/cds/tools/compare/go_compare.sh
detectors/cds/tools/compare/go_compare.sh
+89
-0
detectors/cds/tools/compare/lib/compareCds.awk
detectors/cds/tools/compare/lib/compareCds.awk
+98
-0
detectors/cds/tools/compare/lib/embl.cds.awk
detectors/cds/tools/compare/lib/embl.cds.awk
+80
-0
detectors/cds/tools/compare/lib/embl.oneliner.awk
detectors/cds/tools/compare/lib/embl.oneliner.awk
+28
-0
detectors/cds/tools/compare/lib/gbk.cds.awk
detectors/cds/tools/compare/lib/gbk.cds.awk
+84
-0
detectors/cds/tools/compare/lib/gbk.oneliner.awk
detectors/cds/tools/compare/lib/gbk.oneliner.awk
+29
-0
detectors/cds/tools/compare/lib/libnws.awk
detectors/cds/tools/compare/lib/libnws.awk
+146
-0
detectors/cds/tools/compare/lib/libutil.awk
detectors/cds/tools/compare/lib/libutil.awk
+123
-0
detectors/cds/tools/compare/test/go_test.sh
detectors/cds/tools/compare/test/go_test.sh
+21
-0
detectors/cds/tools/compare/test/pred.embl
detectors/cds/tools/compare/test/pred.embl
+4228
-0
detectors/cds/tools/compare/test/ref.gbk
detectors/cds/tools/compare/test/ref.gbk
+7182
-0
detectors/cds/tools/compare/test/test.ref
detectors/cds/tools/compare/test/test.ref
+367
-0
detectors/ir/test/go_test.sh
detectors/ir/test/go_test.sh
+7
-4
detectors/normalize/test/go_test.sh
detectors/normalize/test/go_test.sh
+7
-4
detectors/rrna/test/go_test.sh
detectors/rrna/test/go_test.sh
+7
-4
detectors/trna/test/go_test.sh
detectors/trna/test/go_test.sh
+7
-4
No files found.
detectors/cds/tools/compare/go_compare.sh
0 → 100755
View file @
07fb256b
#!/bin/csh -f
#
# compare CDS annotation in reference file to predicted file
# annotation file are in Genbank/Embl format
#
# usage: go_compare reference predicted
#
# output on stdout
#
unsetenv ORG_SOURCED
setenv ORG_HOME
`
dirname
$0
`
/../../../..
source
$ORG_HOME
/scripts/csh_init.sh
NeedArg 2
set
RefFile
=
$Argv
[
1]
set
PrdFile
=
$Argv
[
2]
NeedFile
$RefFile
NeedFile
$PrdFile
set
RefType
=
$RefFile
:e
set
PrdType
=
$PrdFile
:e
#
# parse ref and prediction
#
Notify
"get genome info from
$RefFile
"
$AwkCmd
-f
$LIB_DIR
/
$RefType
.oneliner.awk
$RefFile
|
\
$AwkCmd
-f
$LIB_DIR
/libutil.awk
-f
$LIB_DIR
/
$RefType
.cds.awk
>
R_
$$
Notify
"get prediction info from
$PrdFile
"
$AwkCmd
-f
$LIB_DIR
/
$PrdType
.oneliner.awk
$PrdFile
|
\
$AwkCmd
-f
$LIB_DIR
/libutil.awk
-f
$LIB_DIR
/
$PrdType
.cds.awk
>
P_
$$
#
# compare
#
Notify
"compare bank to predictions"
$AwkCmd
-f
$LIB_DIR
/libnws.awk
\
-f
$LIB_DIR
/compareCds.awk
\
R_
$$
P_
$$
>
S_
$$
# base statistics
egrep
"^MATCH"
S_
$$
|
tr
'.'
' '
|
awk
'{print $5}'
|
\
sort
|
uniq
-c
|
sort
-nr
|
awk
'{print "#",$0}'
>
U_
$$
# add chlorodb/core statistics
if
(
-d
$DATA_DIR
/cds/chlorodb/core
)
then
ls
$DATA_DIR
/cds/chlorodb/core/
*
.fst |
\
sed
-e
's@^.*core/@@1'
|
sed
-e
's/.fst$//g'
|
\
sort
>
C_
$$
egrep
"^MATCH"
S_
$$
|
grep
"MISSED"
|
awk
'{print $2}'
|
sort
|
uniq
>
D_
$$
join
D_
$$
C_
$$
>
E_
$$
@ nc
=
`
cat
C_
$$
|
wc
-l
`
@ mt
=
`
cat
D_
$$
|
wc
-l
`
@ mc
=
`
cat
E_
$$
|
wc
-l
`
@ mn
=
$mt
-
$mc
set
LC
=
`
cat
E_
$$
`
echo
"#"
>>
U_
$$
echo
"#
$mc
MISSED in ChloroDB-Core (
$LC
)"
>>
U_
$$
echo
"#
$mn
MISSED not in ChloroDB-Core"
>>
U_
$$
echo
"#"
>>
U_
$$
echo
""
>>
U_
$$
endif
cat
S_
$$
>>
U_
$$
cat
U_
$$
#
# end
#
(
\r
m
-f
?_
$$
)
>>
/dev/null
Exit 0
detectors/cds/tools/compare/lib/compareCds.awk
0 → 100644
View file @
07fb256b
#
#
#
function
Min
(
a
,
b
)
{
return
(
a
<
b
?
a
:
b
)
}
function
Max
(
a
,
b
)
{
return
(
a
>
b
?
a
:
b
)
}
function
Align
(
s1
,
s2
,
_local_
,
d
,
l
)
{
if
(
s1
==
s2
)
return
100
d
=
AlignNWS
(
s1
,
s2
,
Identity
)
l
=
Max
(
length
(
s1
),
length
(
s2
))
return
int
((
l
-
d
)
*
100
/
l
)
}
BEGIN
{
PROCINFO
[
"sorted_in"
]
=
"@ind_num_asc"
IdentityMatrix
(
"ABCDEFGHIJKLMNOPQRSTUVWXYZ*"
,
Identity
)
}
BEGINFILE
{
NFile
++
File
[
NFile
]
=
FILENAME
}
/^#/
{
next
}
{
strand
=
$5
stop
=
(
strand
==
"D"
?
$4
:
$3
)
Stop
[
stop
]
++
i
=
++
NRec
[
NFile
]
Rec
[
NFile
][
i
][
"record"
]
=
$0
Rec
[
NFile
][
i
][
"genefam"
]
=
$1
Rec
[
NFile
][
i
][
"gene"
]
=
$2
Rec
[
NFile
][
i
][
"from"
]
=
$3
Rec
[
NFile
][
i
][
"to"
]
=
$4
Rec
[
NFile
][
i
][
"strand"
]
=
$5
Rec
[
NFile
][
i
][
"nexon"
]
=
$6
Rec
[
NFile
][
i
][
"length"
]
=
$7
Rec
[
NFile
][
i
][
"protseq"
]
=
$9
if
(
NFile
==
1
)
Indx1
[
stop
]
=
i
else
Indx2
[
stop
]
=
i
}
END
{
for
(
st
in
Stop
)
{
if
(
Indx1
[
st
])
print
"FILE1 "
Rec
[
1
][
Indx1
[
st
]][
"record"
]
else
print
"FILE1 NONE"
if
(
Indx2
[
st
])
print
"FILE2 "
Rec
[
2
][
Indx2
[
st
]][
"record"
]
else
print
"FILE2 NONE"
if
(
Indx1
[
st
]
&&
Indx2
[
st
])
{
fm
=
Rec
[
1
][
Indx1
[
st
]][
"genefam"
]
id
=
Align
(
Rec
[
1
][
Indx1
[
st
]][
"protseq"
],
Rec
[
2
][
Indx2
[
st
]][
"protseq"
])
printf
(
"MATCH %s ID %d "
,
fm
,
id
)
if
(
id
==
100
)
status
=
"CORRECT"
else
if
(
id
>=
90
)
status
=
"ALMOST_CORRECT"
else
if
(
id
>=
80
)
status
=
"ACCEPTABLE"
else
status
=
"WRONG"
if
(
status
!=
"CORRECT"
)
{
if
(
Rec
[
1
][
Indx1
[
st
]][
"nexon"
]
!=
Rec
[
2
][
Indx2
[
st
]][
"nexon"
])
status
=
status
".BAD_NBEXON"
start1
=
Rec
[
1
][
Indx1
[
st
]][
"strand"
]
==
"D"
?
Rec
[
1
][
Indx1
[
st
]][
"from"
]
:
Rec
[
1
][
Indx1
[
st
]][
"to"
]
start2
=
Rec
[
2
][
Indx2
[
st
]][
"strand"
]
==
"D"
?
Rec
[
2
][
Indx2
[
st
]][
"from"
]
:
Rec
[
2
][
Indx2
[
st
]][
"to"
]
if
(
start1
!=
start2
)
status
=
status
".BAD_START"
else
status
=
status
".BAD_JUNCTION"
}
print
status
}
else
if
(
Indx1
[
st
])
{
fm
=
Rec
[
1
][
Indx1
[
st
]][
"genefam"
]
print
"MATCH "
fm
" ID 0 MISSED.WRONG_STOP"
}
else
if
(
Indx2
[
st
])
{
fm
=
Rec
[
2
][
Indx2
[
st
]][
"genefam"
]
print
"MATCH "
fm
" ID 0 OVERPRED.WRONG_STOP"
}
print
""
}
}
detectors/cds/tools/compare/lib/embl.cds.awk
0 → 100644
View file @
07fb256b
#
# get cds features from embl (short version)
#
# @include lib.embl.awk
BEGIN
{
if
(
MAXSPAN
==
""
)
MAXSPAN
=
10000
print
"#genefam gene from to strand nexon length status protseq product"
}
/^FT CDS/
{
revstrand
=
match
(
$3
,
"^complement"
)
s
=
substr
(
$0
,
22
)
gsub
(
"^complement"
,
""
,
s
)
ok
=
!
match
(
s
,
"complement|order"
)
nexon
=
Nexons
(
s
)
SpanLocation
(
s
,
sloc
)
spanlen
=
sloc
[
2
]
-
sloc
[
1
]
+
1
len
=
LenLocation
(
s
)
ok
=
ok
&&
(
len
<
MAXSPAN
)
cdsseq
=
ok
?
SeqLocation
(
seq
,
s
,
revstrand
)
:
"XXX"
cstart
=
substr
(
cdsseq
,
1
,
3
)
cstop
=
substr
(
cdsseq
,
length
(
cdsseq
)
-
2
)
gene
=
"none"
locustag
=
"none"
product
=
"none"
translation
=
"X"
incds
=
1
next
}
(
incds
&&
/^FT
[^
]
/
)
{
print
GeneFamily
(
gene
),
gene
,
sloc
[
1
],
sloc
[
2
],
(
revstrand
?
"R"
:
"D"
),
nexon
,
len
,
(
ok
?
"Ok"
:
"Error"
),
translation
,
product
incds
=
0
next
}
/^FT
\/
gene=/
{
split
(
$0
,
a
,
"="
)
gene
=
a
[
2
]
gsub
(
"^[^a-z,A-Z]+"
,
""
,
gene
)
gsub
(
"\""
,
""
,
gene
)
gsub
(
" "
,
"_"
,
gene
)
next
}
/^FT
\/
locus_tag=/
{
split
(
$0
,
a
,
"="
)
locustag
=
a
[
2
]
gsub
(
"\""
,
""
,
locustag
)
gsub
(
" "
,
"_"
,
locustag
)
next
}
/^FT
\/
product=/
{
split
(
$0
,
a
,
"="
)
product
=
a
[
2
]
gsub
(
"\""
,
""
,
product
)
gsub
(
" "
,
"_"
,
product
)
next
}
/^FT
\/
translation=/
{
split
(
$0
,
a
,
"="
)
translation
=
a
[
2
]
gsub
(
"\""
,
""
,
translation
)
gsub
(
" "
,
""
,
translation
)
next
}
END
{
if
(
incds
)
{
print
GeneFamily
(
gene
),
gene
,
sloc
[
1
],
sloc
[
2
],
(
revstrand
?
"R"
:
"D"
),
nexon
,
len
,
(
ok
?
"Ok"
:
"Error"
),
translation
,
product
}
}
detectors/cds/tools/compare/lib/embl.oneliner.awk
0 → 100644
View file @
07fb256b
#
# embl oneLiner
#
/^
FT
/
{
InFeat
=
1
}
(
InFeat
==
0
)
&&
(
$1
!=
"CC"
)
&&
(
$1
==
pkey
)
&&
/^..
[^
]
/
{
line
=
line
" "
substr
(
$0
,
6
)
next
}
(
InFeat
==
1
)
&&
/^FT
[^\/]
/
{
line
=
line
""
substr
(
$0
,
22
)
next
}
{
if
(
line
!=
""
)
print
line
line
=
$0
pkey
=
$1
next
}
END
{
if
(
line
!=
""
)
print
line
}
detectors/cds/tools/compare/lib/gbk.cds.awk
0 → 100644
View file @
07fb256b
#
# get cds features from genbank (short version)
#
# @include lib.gbk.awk
BEGIN
{
if
(
MAXSPAN
==
""
)
MAXSPAN
=
10000
print
"#genefam gene from to strand nexon length status protseq product"
}
/^ CDS/
{
revstrand
=
match
(
$2
,
"^complement"
)
s
=
substr
(
$0
,
22
)
gsub
(
"^complement"
,
""
,
s
)
ok
=
!
match
(
s
,
"complement|order"
)
nexon
=
Nexons
(
s
)
SpanLocation
(
s
,
sloc
)
spanlen
=
sloc
[
2
]
-
sloc
[
1
]
+
1
len
=
LenLocation
(
s
)
ok
=
ok
&&
(
len
<
MAXSPAN
)
cdsseq
=
ok
?
SeqLocation
(
seq
,
s
,
revstrand
)
:
"XXX"
cstart
=
substr
(
cdsseq
,
1
,
3
)
cstop
=
substr
(
cdsseq
,
length
(
cdsseq
)
-
2
)
gene
=
"none"
locustag
=
"none"
product
=
"none"
translation
=
"X"
incds
=
1
next
}
(
incds
&&
/^
[^
]
/
)
{
print
GeneFamily
(
gene
),
gene
,
sloc
[
1
],
sloc
[
2
],
(
revstrand
?
"R"
:
"D"
),
nexon
,
len
,
(
ok
?
"Ok"
:
"Error"
),
translation
,
product
incds
=
0
next
}
/^
\/
gene=/
{
split
(
$0
,
a
,
"="
)
gene
=
a
[
2
]
gsub
(
"^[^a-z,A-Z]+"
,
""
,
gene
)
gsub
(
"\""
,
""
,
gene
)
gsub
(
" "
,
"_"
,
gene
)
next
}
/^
\/
locus_tag=/
{
split
(
$0
,
a
,
"="
)
locustag
=
a
[
2
]
gsub
(
"\""
,
""
,
locustag
)
gsub
(
" "
,
"_"
,
locustag
)
next
}
/^
\/
product=/
{
split
(
$0
,
a
,
"="
)
product
=
a
[
2
]
gsub
(
"\""
,
""
,
product
)
gsub
(
" "
,
"_"
,
product
)
next
}
/^
\/
translation=/
{
split
(
$0
,
a
,
"="
)
translation
=
a
[
2
]
gsub
(
"\""
,
""
,
translation
)
gsub
(
" "
,
""
,
translation
)
next
}
/^
\/\/
/
{
locus
=
"?"
}
END
{
if
(
incds
)
{
print
GeneFamily
(
gene
),
gene
,
sloc
[
1
],
sloc
[
2
],
(
revstrand
?
"R"
:
"D"
),
nexon
,
len
,
(
ok
?
"Ok"
:
"Error"
),
translation
,
product
}
}
detectors/cds/tools/compare/lib/gbk.oneliner.awk
0 → 100644
View file @
07fb256b
#
# genbank oneLiner
#
/^
[
^
]
/
{
line
=
line
""
substr
(
$0
,
12
)
next
}
/^
[^\/]
/
{
line
=
line
""
substr
(
$0
,
21
)
next
}
/^ ORGANISM/
{
if
(
line
!=
""
)
print
line
line
=
$0
";"
next
}
{
if
(
line
!=
""
)
print
line
line
=
$0
next
}
END
{
print
line
}
detectors/cds/tools/compare/lib/libnws.awk
0 → 100644
View file @
07fb256b
#
# NWS alignment
#
# -----------------------
# make identity substitution matrix for given alphabet
#
function
IdentityMatrix
(
alpha
,
mat
,
diag
,
ndiag
,
_local_
,
n
,
i
,
j
,
ci
,
cj
)
{
if
(
diag
==
""
)
diag
=
0
if
(
ndiag
==
""
)
ndiag
=
1
alpha
=
toupper
(
alpha
)
delete
mat
n
=
length
(
alpha
)
for
(
i
=
1
;
i
<=
n
;
i
++
)
{
ci
=
substr
(
alpha
,
i
,
1
)
for
(
j
=
1
;
j
<=
n
;
j
++
)
{
cj
=
substr
(
alpha
,
j
,
1
)
mat
[
ci
][
cj
]
=
(
i
==
j
)
?
diag
:
ndiag
}
}
}
# -----------------------
# internal utility : reverse string
#
function
_Reverse
(
s
,
_local_
,
i
,
n
,
rs
)
{
rs
=
""
;
n
=
length
(
s
);
for
(
i
=
n
;
i
>=
1
;
i
--
)
rs
=
rs
""
substr
(
s
,
i
,
1
)
return
rs
;
}
# -----------------------
# internal utility : alignment traceback for NWS
#
function
_Traceback
(
dir
,
s1
,
s2
,
n1
,
n2
,
align
,
_local_
,
i
,
c1
,
c2
,
c3
)
{
delete
align
while
(
dir
[
n1
][
n2
]
!=
0
)
{
if
(
dir
[
n1
][
n2
]
==
"s"
)
{
c1
=
substr
(
s1
,
n1
--
,
1
)
c2
=
substr
(
s2
,
n2
--
,
1
)
c3
=
(
c1
==
c2
)
?
tolower
(
c1
)
:
"*"
}
else
if
(
dir
[
n1
][
n2
]
==
"i"
)
{
c1
=
"-"
c2
=
substr
(
s2
,
n2
--
,
1
)
c3
=
"-"
}
else
{
c1
=
substr
(
s1
,
n1
--
,
1
)
c2
=
"-"
c3
=
"-"
}
align
[
1
]
=
align
[
1
]
""
c1
align
[
2
]
=
align
[
2
]
""
c2
align
[
3
]
=
align
[
3
]
""
c3
}
for
(
i
=
1
;
i
<=
3
;
i
++
)
align
[
i
]
=
_Reverse
(
align
[
i
])
}
# -----------------------
# internal utility : min
#
function
_Min
(
a
,
b
)
{
return
(
a
<
b
?
a
:
b
)
}
# -----------------------
# sequence alignment NWS
#
# todo : check alphabet
#
# --> i
# |
# v d
#
function
AlignNWS
(
s1
,
s2
,
subst
,
indel
,
local
,
align
,
_local_
,
rev
,
n1
,
n2
,
i
,
j
,
c1
,
c2
,
m2
,
ws
,
wi
,
wd
,
w
,
mat
,
dir
)
{
s1
=
toupper
(
s1
)
;
s2
=
toupper
(
s2
)
n1
=
length
(
s1
)
;
n2
=
length
(
s2
)
if
(
local
&&
(
n2
<
n1
))
{
rev
=
s1
;
s1
=
s2
;
s2
=
rev
rev
=
n1
;
n1
=
n2
;
n2
=
rev
}
if
(
indel
==
""
)
indel
=
1
#
# nws alignment
#
for
(
i
=
0
;
i
<=
n1
;
i
++
)
{
c1
=
substr
(
s1
,
i
,
1
)
for
(
j
=
0
;
j
<=
n2
;
j
++
)
{
c2
=
substr
(
s2
,
j
,
1
)
if
(
i
&&
j
)
{
ws
=
mat
[
i
-
1
][
j
-
1
]
+
subst
[
c1
][
c2
]
wd
=
mat
[
i
-
1
][
j
]
+
indel
wi
=
mat
[
i
][
j
-
1
]
+
indel
w
=
_Min
(
ws
,
_Min
(
wi
,
wd
))
mat
[
i
][
j
]
=
w
dir
[
i
][
j
]
=
(
w
==
ws
)
?
"s"
:
(
w
==
wi
)
?
"i"
:
"d"
}
else
if
(
i
)
{
mat
[
i
][
j
]
=
mat
[
i
-
1
][
j
]
+
indel
dir
[
i
][
j
]
=
"d"
}
else
if
(
j
)
{
mat
[
i
][
j
]
=
mat
[
i
][
j
-
1
]
+
(
local
?
0
:
indel
)
dir
[
i
][
j
]
=
"i"
}
else
{
mat
[
i
][
j
]
=
0
dir
[
i
][
j
]
=
0
}
}
# delete mat[i-1]
}
#
# adjust last line in local mode
#
if
(
local
)
{
m2
=
n2
for
(
j
=
m2
;
j
>=
0
;
j
--
)
{
if
(
mat
[
n1
][
j
]
<
mat
[
n1
][
m2
])
m2
=
j
}
mat
[
n1
][
n2
]
=
mat
[
n1
][
m2
]
for
(
j
=
m2
+
1
;
j
<=
n2
;
j
++
)
{
dir
[
n1
][
j
]
=
"i"
}
}
#
# traceback
#
_Traceback
(
dir
,
s1
,
s2
,
n1
,
n2
,
align
)
if
(
rev
)
{
rev
=
align
[
1
]
;
align
[
1
]
=
align
[
2
];
align
[
2
]
=
rev
}
return
mat
[
n1
][
n2
]
}
detectors/cds/tools/compare/lib/libutil.awk
0 → 100644
View file @
07fb256b
#
# utilities library
#
function
Min
(
a
,
b
)
{
return
(
a
<
b
?
a
:
b
)
}
function
Max
(
a
,
b
)
{
return
(
a
>
b
?
a
:
b
)
}
function
Strip
(
s
)
{
gsub
(
"complement|join|order|\\)|\\(|<|>| "
,
""
,
s
)
return
s
}
function
GetLoc
(
s
,
loc
,
_local_
,
a
,
tmp
)
{
delete
loc
loc
[
1
]
=
loc
[
2
]
=
0
split
(
s
,
loc
,
"\\.\\."
)
if
(
loc
[
1
]
>
loc
[
2
])
{
tmp
=
loc
[
1
]
loc
[
1
]
=
loc
[
2
]
loc
[
2
]
=
tmp
}
}
function
ParseLocation
(
s
,
locs
,
_local_
,
i
,
na
,
a
,
loc
)
{
delete
locs
s
=
Strip
(
s
)
na
=
split
(
s
,
a
,
","
)
for
(
i
=
1
;
i
<=
na
;
i
++
)
{
GetLoc
(
a
[
i
],
loc
)
locs
[
i
][
1
]
=
loc
[
1
]