Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
O
ORG.Annotate
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
11
Issues
11
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
Operations
Operations
Incidents
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
ORG.Asm
ORG.Annotate
Commits
73e4e016
Commit
73e4e016
authored
Nov 09, 2015
by
Eric Coissac
1
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Patch bug in the inverted repeats annotation
Realize the annotation on the normalized chromosome
parent
48e30ca5
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
53 additions
and
20 deletions
+53
-20
detectors/normalize/bin/go_normalize.sh
detectors/normalize/bin/go_normalize.sh
+6
-5
detectors/normalize/lib/lookforIR.lib.sh
detectors/normalize/lib/lookforIR.lib.sh
+10
-1
detectors/normalize/lib/selectIR.py
detectors/normalize/lib/selectIR.py
+36
-13
org-annotate.sh
org-annotate.sh
+1
-1
No files found.
detectors/normalize/bin/go_normalize.sh
View file @
73e4e016
...
...
@@ -100,7 +100,7 @@ pushTmpDir ORG.normalize
tmpLSC
=
"tmp_
$$
_LSC.fasta"
tmpSSC
=
"tmp_
$$
_SSC.fasta"
# Extract the
first
SC present in between the two IRs
# Extract the
central
SC present in between the two IRs
# considering it as LSC
let
"beginLSC=
$endIR1
+1"
...
...
@@ -110,7 +110,7 @@ pushTmpDir ORG.normalize
strandLSC
=
"
${
IR
[1]
}
"
# Extract the
second
SC present in two parts
# Extract the
external
SC present in two parts
# Considering it as SSC
let
"beginSSC=
$endIR2
+1"
...
...
@@ -130,16 +130,17 @@ pushTmpDir ORG.normalize
# Actually this is the opposite: LSC is SSC and SSC is LSC
# Exchange the SSC and LSC sequences
# Exchanges the SSC and LSC sequences
mv
${
tmpSSC
}
${
tmpfasta1
}
mv
${
tmpLSC
}
${
tmpSSC
}
mv
${
tmpfasta1
}
${
tmpLSC
}
# Exchange the IRa and IRb sequences
# Exchanges the IRa and IRb sequences
mv
${
tmpIR1
}
${
tmpfasta1
}
mv
${
tmpIR2
}
${
tmpIR1
}
mv
${
tmpfasta1
}
${
tmpIR2
}
# Exchanges the strand of both the Single copies
tmp
=
${
strandSSC
}
strandSSC
=
${
strandLSC
}
strandLSC
=
${
tmp
}
...
...
@@ -161,7 +162,7 @@ pushTmpDir ORG.normalize
# Merges the four parts of the genome.
cat
${
tmpLSC
}
${
tmpIR2
}
${
tmpSSC
}
${
tmpIR1
}
| joinfasta
exit
1
popTmpDir
...
...
detectors/normalize/lib/lookforIR.lib.sh
View file @
73e4e016
...
...
@@ -12,6 +12,15 @@ function lookForIR {
local
REPEATS
=
"
${
MATCHES
/.*/
}
.repseek"
# Blast columns:
# query id, subject id, % identity, alignment length, mismatches, gap opens, q. start, q. end, s. start, s. end, evalue, bit score
# We keep blast matches if :
# The match is longer than 1000
# The identity is higher than 80%
#
# The match file has the following format:
# LSC/SSC begin end same_strand=1/diff_strand=0
loginfo
"Locating SSC and LSC by similarity..."
blastn
-db
${
SCDB
}
\
-query
${
QUERY
}
\
...
...
@@ -31,7 +40,7 @@ function lookForIR {
repseek
-c
-p
0.001
-i
${
QUERY
}
2>> /dev/null
>
${
REPEATS
}
loginfo
" -->
$(
wc
-l
${
REPEATS
}
|
awk
'{print $1}'
)
repeats identified"
loginfo
"Done"
loginfo
"Marking and selecting the best inverted repeat..."
local
IR
=(
$(
${
SELECTIR
}
${
MATCHES
}
${
REPEATS
}
)
)
loginfo
"Done"
...
...
detectors/normalize/lib/selectIR.py
View file @
73e4e016
...
...
@@ -8,6 +8,12 @@ repeats = open(sys.argv[2])
chloro
=
{
'LSC'
:
[],
'SSC'
:
[]
}
chlorosize
=
0
# We scan the blast matches:
# We build a vector with one position per base pair counting the matches
# The match file has the following format:
# LSC/SSC begin end same_strand=1/diff_strand=0
for
line
in
data
:
parts
=
line
.
strip
().
split
()
if
len
(
parts
)
>=
4
:
...
...
@@ -16,7 +22,8 @@ for line in data:
end
=
int
(
parts
[
2
])
direction
=
int
(
parts
[
3
])
# Change the code of the direction:
# reverse complement = -1
if
direction
==
0
:
direction
=-
1
...
...
@@ -39,36 +46,53 @@ maxLSC = float(max(abs(n) for n in chloro['LSC']))
chloro
[
'SSC'
]
=
[
n
/
maxSSC
for
n
in
chloro
[
'SSC'
]]
chloro
[
'LSC'
]
=
[
n
/
maxLSC
for
n
in
chloro
[
'LSC'
]]
scoreMax
=
0
len1Max
=
0
len2Max
=
0
imax
=
len
(
chloro
[
'LSC'
])
for
line
in
repeats
:
parts
=
line
.
strip
().
split
()
# First repeat position and length
# (position start at 0)
pos1
=
int
(
parts
[
1
])
-
1
len1
=
int
(
parts
[
3
])
# Second repeat position and length
# (position start at 0)
pos2
=
int
(
parts
[
2
])
-
1
len2
=
int
(
parts
[
4
])
# Location of the central single copy
# - in between the two IR -
c_begin
=
min
(
pos1
+
len1
,
imax
)
c_end
=
min
(
pos2
,
imax
)
# Location of the external single copy
# - outside the two IR: before the first and after the second -
o_max
=
min
(
pos1
,
imax
)
o_min
=
min
(
pos2
+
len2
,
imax
)
c_lsc
=
sum
(
abs
(
chloro
[
'LSC'
][
n
])
for
n
in
range
(
c_begin
,
c_end
))
c_ssc
=
sum
(
abs
(
chloro
[
'SSC'
][
n
])
for
n
in
range
(
c_begin
,
c_end
))
# count of coherent matches for LSC and SSC on the central single copy
c_lsc
=
abs
(
sum
(
chloro
[
'LSC'
][
n
]
for
n
in
range
(
c_begin
,
c_end
)))
c_ssc
=
abs
(
sum
(
chloro
[
'SSC'
][
n
]
for
n
in
range
(
c_begin
,
c_end
)))
o_lsc
=
sum
(
abs
(
chloro
[
'LSC'
][
n
])
for
n
in
range
(
0
,
o_max
))
o_ssc
=
sum
(
abs
(
chloro
[
'SSC'
][
n
])
for
n
in
range
(
0
,
o_max
))
# count of coherent matches for LSC and SSC on the external single copy
# this score is in two parts before the first copy and after the second
o_lsc
=
sum
(
chloro
[
'LSC'
][
n
]
for
n
in
range
(
0
,
o_max
))
o_ssc
=
sum
(
chloro
[
'SSC'
][
n
]
for
n
in
range
(
0
,
o_max
))
o_lsc
+=
sum
(
abs
(
chloro
[
'LSC'
][
n
])
for
n
in
range
(
o_min
,
len
(
chloro
[
'LSC'
])))
o_ssc
+=
sum
(
abs
(
chloro
[
'SSC'
][
n
])
for
n
in
range
(
o_min
,
len
(
chloro
[
'SSC'
])))
o_lsc
+=
sum
(
chloro
[
'LSC'
][
n
]
for
n
in
range
(
o_min
,
imax
))
o_ssc
+=
sum
(
chloro
[
'SSC'
][
n
]
for
n
in
range
(
o_min
,
imax
))
o_lsc
=
abs
(
o_lsc
)
o_ssc
=
abs
(
o_ssc
)
c
=
float
(
c_lsc
+
c_ssc
)
o
=
float
(
o_lsc
+
o_ssc
)
if
c
>
0
:
c_lsc
/=
c
c_ssc
/=
c
...
...
@@ -78,10 +102,9 @@ for line in repeats:
o_ssc
/=
o
score
=
((
c_lsc
-
c_ssc
)
**
2
+
(
o_lsc
-
o_ssc
)
**
2
)
/
2.0
# print >>sys.stderr,"c.lsc = %f c.ssc = %f o.lsc = %f o.ssc = %f score = %6.4f (len=%d)" % (c_lsc,c_ssc,o_lsc,o_ssc,score,len1)
if
(
score
>
scoreMax
):
if
(
score
>
=
scoreMax
)
and
((
len1
>
len1Max
)
or
(
len2
>
len2Max
)
):
scoreMax
=
score
pos1Max
=
pos1
pos2Max
=
pos2
...
...
@@ -99,8 +122,8 @@ c_ssc = sum(chloro['SSC'][n] for n in range(c_begin,c_end))
o_lsc
=
sum
(
chloro
[
'LSC'
][
n
]
for
n
in
range
(
0
,
o_max
))
o_ssc
=
sum
(
chloro
[
'SSC'
][
n
]
for
n
in
range
(
0
,
o_max
))
o_lsc
+=
sum
(
chloro
[
'LSC'
][
n
]
for
n
in
range
(
o_min
,
len
(
chloro
[
'LSC'
])
))
o_ssc
+=
sum
(
chloro
[
'SSC'
][
n
]
for
n
in
range
(
o_min
,
len
(
chloro
[
'SSC'
])
))
o_lsc
+=
sum
(
chloro
[
'LSC'
][
n
]
for
n
in
range
(
o_min
,
imax
))
o_ssc
+=
sum
(
chloro
[
'SSC'
][
n
]
for
n
in
range
(
o_min
,
imax
))
if
abs
(
c_lsc
)
>
abs
(
c_ssc
):
center
=
"LSC"
...
...
org-annotate.sh
View file @
73e4e016
...
...
@@ -34,7 +34,7 @@ pushTmpDir ORG.organnot
loginfo
"Done."
loginfo
"Annotating the Inverted repeats and Single copies (LSC and SSC)..."
${
PROG_DIR
}
/detectors/ir/bin/go_ir.sh
${
QUERY
}
>
"
${
RESULTS
}
.annot"
${
PROG_DIR
}
/detectors/ir/bin/go_ir.sh
"
${
RESULTS
}
.norm.fasta"
>
"
${
RESULTS
}
.annot"
loginfo
"Done."
loginfo
"Annotating the tRNA..."
...
...
Eric Coissac
@coissac
mentioned in issue
#10 (closed)
·
Nov 09, 2015
mentioned in issue
#10 (closed)
mentioned in issue #10
Toggle commit list
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment