Commit 1be9bd69 authored by Eric Coissac's avatar Eric Coissac

Last version for maurizio

parent 4fe58281
#!/usr/local/bin/python
################################################
# setenv PYTHONPATH ./orgasm/build/lib.macosx-10.6-intel-2.7/
# echo $PYTHONPATH
# ./orgasm/build/lib.macosx-10.6-intel-2.7/
# python orgasm-1.py ./samples/GWM-261 ./samples/COX1_Machaon.fasta machaon
from orgasm.tango import tango, \
matchtoseed, \
cutLowCoverage, \
scaffold, \
coverageEstimate, \
fillGaps, \
path2fasta, \
unfoldAssembling, \
cutSNPs, \
selectGoodComponent, \
estimateDeadBrancheLength, pairEndedConstraints
from orgasm.indexer import Index
from orgasm.backtranslate.fasta import fasta
from orgasm.assembler import Assembler
#from orgasm.multialign import multiAlignReads, \
# consensus
import sys
import pickle
def estimateMinRead(index,minoverlap,coverage):
return (index.getReadSize() - minoverlap) * coverage / index.getReadSize() / 2
if __name__ == '__main__':
import argparse
# Declare script options
parser = argparse.ArgumentParser(description='Assembly program dedicated to organel genome reconstruction.')
parser.add_argument('indexFilename', metavar='index', help='index root filename (produced by orgasmi)')
parser.add_argument('seedsFilenameAA', metavar='seeds', help='fasta file containing seeds proteic sequences')
parser.add_argument('outputFilename', metavar='output', help='output prefix')
parser.add_argument('--minread', dest='minread', type=int, action='store', default=None, help='the minimum count of read to consider [default: %(default)d]')
parser.add_argument('--minratio', dest='minratio', type=float, action='store', default=0.1, help='minimum ratio between occurrences of an extension and the occurrences of the most frequent extension to keep it. [default: %(default)f]')
parser.add_argument('--mincov', dest='mincov', type=int, action='store', default=1, help='minimum occurrences of an extension to keep it. [default: %(default)d]')
parser.add_argument('--minoverlap', dest='minoverlap', type=int, action='store', default=40, help='minimum length of the overlap between the sequence and reads to participate in the extension. [default: %(default)d]')
parser.add_argument('--lowcomplexity', dest='lowcomplexity', action='store_true', default=False, help='Use also low complexity probes')
parser.add_argument('--back', dest='back', type=int, action='store', default=500, help='the number of bases taken at the end of contigs to jump with pared-ends [default: %(default)d]')
parser.add_argument('--snp', dest='snp', action='store_false', default=True, help='desactivate the SNP clearing mode')
parser.add_argument('--logfile', dest='logfile', action='store_true', default=False, help='Create a logfile for the assembling')
#parser.add_argument('--maxjump', dest='maxjump', type=int, action='store', default=1, help='COunt of pair end jump by extension branch')
args = parser.parse_args()
if args.logfile:
logfile=open(args.outputFilename+'.log',"w")
# Load the read index
r = Index(args.indexFilename)
# looks if seed protein are a file name or an internal list
try:
p = fasta(args.seedsFilenameAA)
if args.logfile:
print >>logfile,"Load protein sequences from file : %s" % args.seedsFilenameAA
except IOError:
import orgasm.samples
p = getattr(orgasm.samples,args.seedsFilenameAA)
if args.logfile:
print >>logfile,"Load protein internal dataset : %s" % args.seedsFilenameAA
if args.logfile:
logfile.flush()
# looks if the internal blast was already run
try:
# yes, load the previous results
x = pickle.load(open(args.outputFilename+'.match.omx'))
if args.logfile:
print >>logfile,"Load protein match from previous run : %d matches" % sum(len(x[i]) for i in x)
except:
# no, run it and save the results
if args.logfile:
print >>logfile,"Running protein matching against reads...",
logfile.flush()
x = r.lookForSeeds(p,covmin=1)
if args.logfile:
print >>logfile," : %d matches" % sum(len(x[i]) for i in x)
logfile.flush()
pickle.dump(x,open(args.outputFilename+'.match.omx',"w"))
# First esti;ation of the coverage based on protein matches
common = set(p.keys()) & set(x.keys())
pcoverage = [len(x[i]) * r.getReadSize() / len(p[i]) / 3 for i in common]
pcoverage.sort()
print >>sys.stderr,pcoverage
#coverage=mode(pcoverage)
coverage=pcoverage[len(pcoverage)/2]
# according to the minread option estimate it from coverage or use the specified value
if args.minread is None:
minread = estimateMinRead(r,args.minoverlap,coverage)
else:
minread = args.minread
print >>sys.stderr,"\nestimated coverage : %d based on protein match (minread: %d)" % (coverage,minread)
if args.logfile:
print >>logfile,"\nestimated coverage : %d based on protein match (minread: %d)" % (coverage,minread)
logfile.flush()
# Convert matches in sedd list
s = matchtoseed(x,r)
# Create the assembler object
asm = Assembler(r)
# Run the first assembling pass
a = tango(asm,s,mincov=args.mincov,
minread=minread,
minoverlap=args.minoverlap,
lowfilter=not args.lowcomplexity,
maxjump=0, cycle=1)
# Clean small unsuccessful extensions
asm.cleanDeadBranches(maxlength=10)
# and too low covered assembling
cutLowCoverage(asm,int(coverage/4),terminal=True)
# cleanup snp bubble in the graph
if args.snp:
cutSNPs(asm)
# reestimate coverage
score,length,coverage = coverageEstimate(asm)
if args.minread is None:
minread = estimateMinRead(r,args.minoverlap,coverage)
else:
minread = args.minread
print >>sys.stderr,"coverage estimated : %d based on %d bp (minread: %d)" %(coverage,length,minread)
if args.logfile:
print >>logfile,"coverage estimated : %d based on %d bp (minread: %d)" %(coverage,length,minread)
logfile.flush()
# Run the fill gap procedure
while fillGaps(asm,back=args.back,
minread=minread,
maxjump=0,
minoverlap=args.minoverlap,
cmincov=0,
gmincov=int(coverage/4),
snp=args.snp) > 0:
print >>sys.stderr
print >>sys.stderr
print >>sys.stderr
print >>sys.stderr,"=================================================================="
print >>sys.stderr # Clean small unsuccessful extensions
asm.cleanDeadBranches(maxlength=10)
# and too low covered assembling
cutLowCoverage(asm,int(coverage/3),terminal=True)
# reestimate coverage
score,length,coverage = coverageEstimate(asm)
if args.minread is None:
minread = estimateMinRead(r,args.minoverlap,coverage)
else:
minread = args.minread
print >>sys.stderr,"coverage estimated : %d based on %d bp (minread: %d)" %(coverage,length,minread)
cg = asm.compactAssembling(verbose=False)
scaffold(asm,cg,minlink=100,back=int(args.back/2),addConnectedLink=False)
print >>open(args.outputFilename+'.intermediate.gml','w'),cg.gml()
print >>sys.stderr
print >>sys.stderr,"=================================================================="
print >>sys.stderr
cutSNPs(asm)
smallbranches = estimateDeadBrancheLength(asm)
asm.cleanDeadBranches(maxlength=smallbranches)
cg = asm.compactAssembling(verbose=False)
score,length,coverage = coverageEstimate(asm)
if args.snp:
cutSNPs(asm)
cutLowCoverage(asm,int(coverage/2),terminal=True)
cutLowCoverage(asm,int(coverage/10),terminal=False)
cg = asm.compactAssembling(verbose=False)
constraints = pairEndedConstraints(asm,cg,back=int(args.back))
scaffold(asm,cg,minlink=5,back=int(args.back),addConnectedLink=False)
print >>open(args.outputFilename+'.gml','w'),cg.gml()
fastaout = open(args.outputFilename+'.fasta','w')
gcc = selectGoodComponent(cg)
for seeds in gcc:
path = unfoldAssembling(asm,cg,seeds=seeds)
fa = path2fasta(asm,cg,path[-1][0],
identifier=args.outputFilename,
back=args.back,
minlink=5)
print >>fastaout,fa
import copy
from orgasm.backtranslate._ahocorasick import AhoCarasick
gbfile = "/Users/coissac/travail/ecodb/chloroplast/chloroplast.gb"
......@@ -23,17 +24,6 @@ UniversalGeneticCode = {"F" :("TTT","TTC"),
"E" :("GAA","GAG"),
"G" :("GGA","GGC","GGG","GGT")}
cox1_ble = "MTNMVRWLFSTNHKDIGTLYFIFGAIAGVMGTCFSVLIRMELARPGDQILGGNHQLYNVL" \
"ITAHAFLMIFFMVMPAMIGGFGNWFVPILIGAPDMAFPRLNNISFWLLPPSLLLLLSSAL" \
"VEVGSGTGWTVYPPLSGITSHSGGAVDLAIFSLHLSGISSILGSINFITTIFNMRGPGMT" \
"MHRLPLFVWSVLVTAFLLLLSLPVLAGAITMLLTDRNFNTTFFDPAGGGDPILYQHLFWF" \
"FGHPEVYILILPGFGIISHIVSTFSRKPVFGYLGMVYAMISIGVLGFLVWAHHMFTVGLD" \
"VDTRAYFTAATMIIAVPTGIKIFSWIATMWGGSIQYKTPMLFAVGFIFLFTIGGLTGIVL" \
"ANSGLDIALHDTYYVVAHFHYVLSMGAVFALFAGFYYWVGKIFGRTYPETLGQIHFWITF" \
"FGVNLTFFPMHFLGLSGMPRRIPDYPDAYAGWNALSSFGSYISVVGIRRFFVVVAITSSS" \
"GKNQKCAESPWAVEQNPTTLEWLVQSPPAFHTFGELPAVKETKS"
def listCodons2Dict(codons,offset=0,protein='prot'):
d = [[(protein,offset)],None,{}]
......
This diff is collapsed.
#!/usr/local/bin/python
from orgasm.indexer import Index
if __name__ == '__main__':
import argparse
parser = argparse.ArgumentParser(description='Return read count present in an orgasm index.')
parser.add_argument('indexFilename', metavar='index', help='index root filename (produced by orgasmi)')
args = parser.parse_args()
# Load the read index
r = Index(args.indexFilename)
print len(r),r.getReadSize()
......@@ -198,7 +198,50 @@ cg = asm.compactAssembling()
# LA DESSOUS C'EST MON BORDEL
#
##################################################################################
prefix='E4.test'
import orgasm.samples
import argparse
parser = argparse.ArgumentParser(description='Assembly program dedicated to organel genome reconstruction.')
args = parser.parse_args()
prefix='E4'
r = Index(prefix)
p=orgasm.samples.protChloroArabidopsis
x = pickle.load(open(prefix+'.match.omx'))
common = set(p.keys()) & set(x.keys())
pcoverage = [len(x[i]) * r.getReadSize() / len(p[i]) / 3 for i in common]
pcoverage.sort()
coverage=pcoverage[len(pcoverage)/2]
args.minoverlap=40
args.lowcomplexity=False
args.mincov=1
args.snp=True
args.minread=None
args.outputFilename="E4.test"
args.back=500
minread = estimateMinRead(r,args.minoverlap,coverage)
%paste
asm = Assembler(r)
s = matchtoseed(x,r)
constraints
def selectGoodComponent(cg):
cc = list(cg.connectedComponentIterator())
cclength = [[cg.getEdgeAttr(*i)['length']
for i in cg.edgeIterator(nodePredicate=lambda n: n in cc1)
if 'stemid' in cg.getEdgeAttr(*i)
] for cc1 in cc]
ccmlength=[sum(i)/float(len(i)) for i in cclength]
m = mode(ccmlength)
gcc = [c for c,mcc in map(None,cc,ccmlength) if mcc > m * 2]
return gcc
while fillGaps(asm,back=300,minread=30,maxjump=0,minoverlap=80,cmincov=0,gmincov=100,minlink=5):
score,length,coverage = coverageEstimate(asm)
print >>sys.stderr,"coverage estimated : %d based on %d bp" %(coverage,length)
......@@ -416,4 +459,6 @@ def cutParallel(self,mincov):
del stemparallel[spi]
return stemparallel
E4 : 11m18.971s -> Full Circle
E3 : 10m36.784s -> Not sequence (Mito)
......@@ -42,24 +42,26 @@ CTOOLS.extend([('littlebigman',{"sources":["src/littlebigman.c"]}),
('buildcode',{"sources":["src/buildcode.c"]})])
CEXES.extend([('orgasmi',{"sources":["src/orgasmi.c",
"src/buffer.c",
"src/buildindex.c",
"src/code16bits.c",
"src/codecomp.c",
"src/compsort.c",
"src/debug.c",
"src/decode.c",
"src/encode.c",
"src/fastq.c",
"src/indexinput.c",
"src/indexoutput.c",
"src/load.c",
"src/lookfor.c",
"src/malloc.c",
"src/sort.c"
]})])
#CEXES.extend([('orgasmi',{"sources":["src/orgasmi.c",
# "src/buffer.c",
# "src/buildindex.c",
# "src/code16bits.c",
# "src/codecomp.c",
# "src/compsort.c",
# "src/debug.c",
# "src/decode.c",
# "src/encode.c",
# "src/fastq.c",
# "src/indexinput.c",
# "src/indexoutput.c",
# "src/load.c",
# "src/lookfor.c",
# "src/malloc.c",
# "src/fgetln.c",
# "src/sort.c"
# ]})])
CEXES.extend([])
setup(name="ORG.asm",
description="Scripts and library for organelle assembling",
version=VERSION,
......
......@@ -17,6 +17,7 @@ ORGASMI_SRC= fastq.c \
compsort.c \
load.c \
lookfor.c \
fgetln.c \
orgasmi.c
ORGASMD_SRC= fastq.c \
......@@ -43,7 +44,7 @@ ORGASMD_OBJ= $(patsubst %.c,%.o,$(ORGASMD_SRC))
SRCS= $(ORGASMI_SRC)
LIB= -lz -lm
LIB= -lz -lm -lpthread
LIBFILE=
......@@ -97,5 +98,8 @@ expand8bits.c: buildexpand8bits
clean:
rm -f *.o
rm -f $(EXEC)
rm -f buildcode buildcomplement buildexpand8bits
rm -f code16bits.c codecomp.c expand8bits.c
......@@ -3,8 +3,8 @@ LIBPATH=
MAKEDEPEND = gcc -D$(MACHINE) -M $(CPPFLAGS) -o $*.d $<
CC=gcc
CFLAGS= -W -Wall $(shell ./littlebigman) -D__SSE2__
#CFLAGS= -W -Wall -O0 -g $(shell ./littlebigman) -D__SSE2__
#CFLAGS= -W -Wall $(shell ./littlebigman) -D__SSE2__
CFLAGS= -W -Wall -O0 -g $(shell ./littlebigman) -D__SSE2__
LDFLAGS= -g
default: all
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment