Commit e30a53b5 by Eric Coissac

A new set of protein cleaned for the CDS detector prepared using the

clusterizecore.sh script from the detectors/cds/lib folder.

The CDS detector is now modified to use the clean.fst files.
parent b4415ad2
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -31,7 +31,7 @@ set GenoName = `basename $GenoFile:r`
set ProtFile = $Argv[1]; Shift
set ProtDir = `dirname $ProtFile`
set ProtName = `basename $ProtFile:r`
set ProtName = `basename $ProtFile | $AwkCmd -F'.' '{print $1}'`
set ProtType = `basename $ProtDir`
NeedFile $GenoFile
......@@ -122,7 +122,7 @@ endif
if ($PASS1_SPEEDUP != 0) then
tcsh -f $PROG_DIR/do_filterbx.sh $GenoFile $ProtFile \
tcsh -f $PROG_DIR/do_filterbx.csh $GenoFile $ProtFile \
$PASS1_BLASTX_FILTER_IDMIN \
$PASS1_BLASTX_FILTER_NBMIN \
$PASS1_BLASTX_FILTER_NBMAX > D_$$
......
......@@ -55,10 +55,10 @@ endif
foreach dir ("core" "shell" "dust")
if (-d $DbRoot/$dir) then
set fams = `ls $DbRoot/$dir/*.fst`
set fams = `ls $DbRoot/$dir/*.clean.fst`
Notify "running pass1:$dir exonerate of $Genome on $DbRoot"
foreach f ($fams)
tcsh -f $PROG_DIR/do_exonerate.sh $Fasta $f $DbRoot/models $temp
tcsh -f $PROG_DIR/do_exonerate.csh $Fasta $f $DbRoot/models $temp
end
endif
end
......
......@@ -3,6 +3,8 @@
#
# -v MAX_SPAN ALLOW_STOP EXCLUDE
#
#
#
BEGIN {
PROCINFO["sorted_in"] = "@ind_num_asc"
......
......@@ -6,6 +6,9 @@ source "${THIS_DIR}/../../../scripts/bash_init.sh"
CORELIB="${CDS_DATA_DIR}/chlorodb/core"
CDHIT_ID=0.7
CDHIT_DELTA=0.8
function clusterize() {
......@@ -17,12 +20,15 @@ function clusterize() {
rm -rf "${prot}"
mkdir -p "${prot}"
cd-hit -i "${fastain}" \
-o "${prot}/${cdhitout}" \
-c 0.6 -G 1 \
-g 1 -s 0.8 \
-b 150 -p 1 \
-d 0 -n 3
cd-hit -i "${fastain}" \
-o "${prot}/${cdhitout}" \
-c ${CDHIT_DELTA} \
-G 1 \
-g 1 \
-aL 0.95 \
-s ${CDHIT_ID} \
-b 350 -p 1 \
-d 0 -n 3
fasta1line "${fastain}" > "${prot}/${fasta1}"
......@@ -35,6 +41,8 @@ function clusterize() {
filename=prot".cluster."cluster".ids"; \
print $3 >> filename ; \
close(filename) }' "${cdhitout}.clstr"
rm -f "../$prot.clean.fst"
for ids in *.cluster.*.ids ; do
cluster=$(printf "%03d" $(echo "${ids}" | $AwkCmd -F'.' '{print $3}'))
......@@ -52,10 +60,10 @@ function clusterize() {
egrep -v -- '^--$' > "$alignment"
fi
if (( size >= 10 )) ; then
if (( size >= 5 )) ; then
egrep -f "$ids" -A 1 "${fasta1}" | \
egrep -v -- '^--$' | \
formatfasta >> "$prot.clean.fst"
formatfasta >> "../$prot.clean.fst"
fi
rm -f "$ids"
......@@ -73,6 +81,8 @@ function clusterize() {
pushd $CORELIB
rm -rf *.clean.fst
for prot in *.fst ; do
prot="${prot/.fst/}"
clusterize "$prot"
......
#!/usr/bin/env python
import sys
from math import lgamma
from math import log
data = open(sys.argv[1])
repeats = open(sys.argv[2])
......@@ -8,6 +10,10 @@ repeats = open(sys.argv[2])
chloro = {'LSC' : [], 'SSC' : [] }
chlorosize =0
def lpbinom(x,n,p):
lprob = log(p) * x + log(1-p) * (n-x) + lgamma(n+1) - lgamma(x+1) - lgamma(n-x+1)
return lprob
# We scan the blast matches:
# We build a vector with one position per base pair counting the matches
......@@ -102,6 +108,7 @@ for line in repeats:
o_ssc /= o
score = ((c_lsc - c_ssc) ** 2 + (o_lsc - o_ssc) ** 2) / 2.0
# pvalue=
# print >>sys.stderr,"c.lsc = %f c.ssc = %f o.lsc = %f o.ssc = %f score = %6.4f (len=%d)" % (c_lsc,c_ssc,o_lsc,o_ssc,score,len1)
if (score >= scoreMax) and ((len1 > len1Max) or (len2 > len2Max)):
......
......@@ -151,7 +151,7 @@ pushTmpDir ORG.organnot
loginfo "Done."
loginfo "Annotating the CDS..."
tcsh -f ${PROG_DIR}/detectors/cds/bin/go_cds.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot"
tcsh -f ${PROG_DIR}/detectors/cds/bin/go_cds.csh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot"
loginfo "Done."
if (( partial == 0 )) ; then
......
......@@ -129,11 +129,13 @@ function fasta1line {
}
function formatfasta {
$AwkCmd 'function printfasta(seq) { \
$AwkCmd 'function printfasta(seq) { \
seqlen=length(seq); \
for (i=1; i <= seqlen; i+=60) \
print substr(seq,i,60); \
} \
(seq && /^>/) { printfasta(seq); \
seq=""} \
/^>/ { print $0 } \
! /^>/ { seq=seq $0 } \
END { printfasta(seq)}' "${1}"
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment