Commit 155a43b1 by Eric Coissac

A big jump into cython for increasing speed

==> checked with cython 1.6
parent 1f5fcd80
......@@ -15,7 +15,7 @@ ali2consensus.py -t 75 myFastaAlignedSequences.fasta
'''
from obitools.fasta import fastaIterator
from obitools.fasta import fastFastaIterator
from obitools.options import getOptionManager
from obitools.alignment import Alignment, columnIterator
from obitools import NucSequence
......@@ -34,7 +34,7 @@ def addAliOptions(optionManager):
if __name__=='__main__':
optionParser = getOptionManager([addInOutputOption, addAliOptions],
entryIterator=fastaIterator
entryIterator=fastFastaIterator
)
(options, entries) = optionParser()
......
......@@ -20,7 +20,7 @@ def addSpecificityOptions(optionManager):
action="store", dest="dist",
metavar="###",
type="int",
default=0,
default=1,
help="Maximum errors between two sequences")
optionManager.add_option('-q','--quorum',
action="store", dest="quorum",
......
......@@ -5,6 +5,7 @@ from obitools.ecopcr import sequence
from obitools.ecopcr import EcoPCRFile
from obitools.options import getOptionManager
from obitools.ecopcr.options import loadTaxonomyDatabase
......@@ -17,6 +18,14 @@ def addTaxonomyOptions(optionManager):
help="ecoPCR Database "
"name")
optionManager.add_option('-r','--required',
action="append",
dest='required',
metavar="<TAXID>",
type="int",
default=[],
help="required taxid")
if __name__=='__main__':
optionParser = getOptionManager([addTaxonomyOptions],
......@@ -33,14 +42,20 @@ if __name__=='__main__':
listtaxonbyrank = {}
for seq in seqd:
for rank,rankid in ranks:
if rank != 'no rank':
t = tax.getTaxonAtRank(seq['taxid'],rankid)
if t is not None:
if rank in listtaxonbyrank:
listtaxonbyrank[rank].add(t)
else:
listtaxonbyrank[rank]=set([t])
taxid = seq['taxid']
if (options.required and
reduce(lambda x,y: x or y,
(tax.isAncestor(r,taxid) for r in options.required),
False)):
for rank,rankid in ranks:
if rank != 'no rank':
t = tax.getTaxonAtRank(seq['taxid'],rankid)
if t is not None:
if rank in listtaxonbyrank:
listtaxonbyrank[rank].add(t)
else:
listtaxonbyrank[rank]=set([t])
stats = dict((x,len(listtaxonbyrank[x])) for x in listtaxonbyrank)
......@@ -64,7 +79,7 @@ if __name__=='__main__':
print '%-20s\t%10s\t%10s\t%7s' % ('rank','ecopcr','db','percent')
for r in ranknames:
if r in dbstats and dbstats[r]:
if r in dbstats and r in stats and dbstats[r]:
print '%-20s\t%10d\t%10d\t%8.2f' % (r,dbstats[r],stats[r],float(dbstats[r])/stats[r]*100)
......
#!/usr/local/bin/python
'''
Created on 24 juin 2009
@author: coissac
'''
import re
import sys
from logging import root,DEBUG
from obitools.fasta import fastaIterator
from obitools.options import getOptionManager
from obitools.utils import deprecatedScript
def addTableOptions(optionManager):
optionManager.add_option('-n','--na-string',
action="append", dest="NA",
metavar="<NOT AVAILABLE STRING>",
type="string",
default="NA",
help="String write in the table for not available value"
)
optionManager.add_option('-o','--output-seq',
action="store_true", dest="sequence",
metavar="<NOT AVAILABLE STRING>",
default=False,
help="Add an extra column for sequence"
)
optionManager.add_option('-d','--no-definition',
action="store_false", dest="definition",
metavar="<NOT AVAILABLE STRING>",
default=True,
help="Add an extra column for sequence definition"
)
optionManager.add_option('-k','--omit-key',
action="append", dest="omit",
metavar="<KEY NAME>",
default=[],
help="Add key name to omit in the output tab"
)
def headerCmp(h1,h2):
if type(h1) is str and type(h2) is str:
return cmp(h1, h2)
if type(h1) is str and type(h2) is tuple:
return cmp(h1, h2[0])
if type(h1) is tuple and type(h2) is str:
return cmp(h1[0], h2)
if type(h1) is tuple and type(h2) is tuple:
c = cmp(h1[0],h2[0])
if c==0:
c = cmp(h1[1],h2[1])
return c
raise AssertionError
if __name__=='__main__':
deprecatedScript('obitab')
try:
import psyco
psyco.full()
except ImportError:
pass
# root.setLevel(DEBUG)
optionParser = getOptionManager([addTableOptions],
entryIterator=fastaIterator
)
(options, entries) = optionParser()
column = {}
subcol = {}
db = []
for seq in entries:
db.append(seq)
keys = seq.keys()
for k in keys:
t=type(seq[k])
if k in column:
column[k].add(t)
else:
column[k]=set([t])
if t is dict:
if k not in subcol:
subcol[k]=set()
subcol[k]|=set(seq[k].keys())
headers = set()
for c in column:
if len(column[c])==1:
column[c]=column[c].pop()
else:
column[c]=str
if column[c] not in (str,int,float,dict,bool):
column[c]=str
if column[c] is not dict:
headers.add(c)
else:
for sc in subcol[c]:
headers.add((c,sc))
omit = set(options.omit)
headers=list(headers)
headers.sort(headerCmp)
print "id",
if options.definition:
print '\tdefinition',
for k in headers:
if type(k) is str:
if k not in omit:
print '\t',k,
else:
if k[0][0:7] not in omit:
if k[0][0:7]=='merged_':
print '\t%s:%s' % (k[0][7:],str(k[1])),
else:
print '\t%s:%s' % (k[0],str(k[1])),
if options.sequence:
print "\tsequence",
print
for seq in db:
print seq.id,
if options.definition:
print '\t',seq.definition,
for k in headers:
if type(k) is str:
if k not in omit:
if k in seq:
print '\t',seq[k],
else:
print '\t',options.NA,
else:
if k[0] not in omit:
if k[0] in seq:
sk = seq[k[0]]
else:
sk={}
if k[1] in sk:
print '\t',sk[k[1]],
else:
if k[0][0:7]=='merged_':
print '\t',0,
else:
print '\t',options.NA,
if options.sequence:
print '\t',str(seq)
else:
print
#!/usr/local/bin/python
from obitools.fasta import fastaIterator,formatFasta
from obitools.options import getOptionManager
from obitools.options.bioseqfilter import addSequenceFilteringOptions
from obitools.options.bioseqfilter import filterGenerator
from obitools.options.bioseqedittag import addSequenceEditTagOptions
from obitools.options.bioseqedittag import sequenceTaggerGenerator
from obitools.utils import deprecatedScript
if __name__=='__main__':
deprecatedScript('obiannotate')
optionParser = getOptionManager([addSequenceFilteringOptions,
addSequenceEditTagOptions],
entryIterator=fastaIterator)
(options, entries) = optionParser()
if not options.noPsyco:
try:
import psyco
psyco.full()
except ImportError:
pass
sequenceTagger = sequenceTaggerGenerator(options)
goodFasta = filterGenerator(options)
for seq in entries:
if goodFasta(seq):
sequenceTagger(seq)
print formatFasta(seq)
#!/usr/local/bin/python
'''
Created on 1 nov. 2009
@author: coissac
'''
from obitools.options import getOptionManager
from obitools.fasta import fastaIterator
from obitools.utils import deprecatedScript
def addCountOptions(optionManager):
optionManager.add_option('-s','--sequence',
action="store_true", dest="sequence",
default=False,
help="Print the count of differentes sequences"
)
optionManager.add_option('-a','--all',
action="store_true", dest="all",
default=False,
help="Print the count of all sequences"
)
if __name__ == '__main__':
deprecatedScript('obicount')
optionParser = getOptionManager([addCountOptions],
entryIterator=fastaIterator
)
(options, entries) = optionParser()
count1=0
count2=0
for s in entries:
count1+=1
if 'count' in s:
count2+=s['count']
else:
count2+=1
if options.all==options.sequence:
print count1,count2
elif options.all:
print count2
else:
print count1
\ No newline at end of file
#!/usr/local/bin/python
from obitools.fasta import fastaIterator,formatFasta
from obitools.options import getOptionManager
from obitools.options.bioseqfilter import addSequenceFilteringOptions
from obitools.options.bioseqfilter import sequenceFilterIteratorGenerator
from obitools.utils import deprecatedScript
if __name__=='__main__':
deprecatedScript('obigrep')
optionParser = getOptionManager([addSequenceFilteringOptions],
entryIterator=fastaIterator)
(options, entries) = optionParser()
goodSeq = sequenceFilterIteratorGenerator(options)
for seq in goodSeq(entries):
print formatFasta(seq)
\ No newline at end of file
#!/usr/local/bin/python
'''
Created on 15 dec. 2009
@author: coissac
'''
from obitools.fasta import fastaIterator,formatFasta
from obitools.options import getOptionManager
import sys
from obitools.utils import deprecatedScript
def addHeadOptions(optionManager):
optionManager.add_option('-n','--sequence-count',
action="store", dest="count",
metavar="###",
type="int",
default=10,
help="Count of first sequences to print")
if __name__ == '__main__':
deprecatedScript('obihead')
optionParser = getOptionManager([addHeadOptions],
entryIterator=fastaIterator
)
(options, entries) = optionParser()
i=0
for s in entries:
if i < options.count:
print formatFasta(s)
i+=1
else:
sys.exit(0)
#!/usr/local/bin/python
"""\
-------------------------------------------------------
fastalength.py
-------------------------------------------------------
fastaLength.py [-h|--help] <fastafile>"
add length data on all sequences in the fasta file.
-------------------------------------------------------
-h --help : print this help
-------------------------------------------------------
"""
from obitools.fasta import fastaIterator,formatFasta
from obitools.options import getOptionManager
from obitools.utils import deprecatedScript
if __name__=='__main__':
deprecatedScript('obiannotate')
optionParser = getOptionManager([],
entryIterator=fastaIterator
)
(options, entries) = optionParser()
for seq in entries:
seq['seqLength']=len(seq)
print formatFasta(seq)
\ No newline at end of file
#!/usr/local/bin/python
"""
fastaLocate.py
"""
import fileinput
import getopt
import sys
from obitools.fasta import fastaIterator,formatFasta
from obitools.fast import Fast
from obitools.options import getOptionManager
def _probefile(options,opt,value,parser):
seq = str(fastaIterator(value).next())
parser.values.probe=seq
def addLocateOptions(optionManager):
optionManager.add_option('-P','--probe-file',
action="callback", callback=_probefile,
metavar="<FILENAME>",
type="string",
default=None,
help="file name containing the oligo sequence")
optionManager.add_option('-p','--probe',
action="store", dest="probe",
metavar="<DNA_SEQ>",
type="string",
default=None,
help="nucleic sequence of the probe to locate")
optionManager.add_option('-k','--kup',
action="store", dest="kup",
metavar="##",
type="int",
default=2,
help="word size uses for aligment")
def locateRefGenerator(options):
kup = options.kup
seqref= options.probe
hashd = Fast(seqref,kup)
def locate(seq):
score,p =hashd(seq)
seq['locate']=min(p)+1
seq['locate_score']=score
return seq
return locate
if __name__=='__main__':
try:
import psyco
psyco.full()
except ImportError:
pass
optionParser = getOptionManager([addLocateOptions],
entryIterator=fastaIterator
)
(options, entries) = optionParser()
locator = locateRefGenerator(options)
for seq in entries:
info=locator(seq)
print formatFasta(seq)
#!/usr/local/bin/python
'''
Created on 1 nov. 2009
@author: coissac
'''
from obitools.options import getOptionManager
from obitools.fasta import fastaIterator,formatFasta
from obitools.sample import weigthedSample
from obitools.utils import deprecatedScript
def addSampleOptions(optionManager):
optionManager.add_option('-s','--sample-size',
action="store", dest="size",
metavar="###",
type="int",
default=None,
help="Size of the generated sample"
)
if __name__ == '__main__':
deprecatedScript('obisample')
optionParser = getOptionManager([addSampleOptions],
entryIterator=fastaIterator
)
(options, entries) = optionParser()
db = [s for s in entries]
if options.size is None:
options.size=len(db)
distribution = {}
idx=0
for s in db:
if 'count' in s:
count = s['count']
else:
count = 1
distribution[idx]=count
idx+=1
sample =weigthedSample(distribution, options.size)
for idx in sample:
seq = db[idx]
seq['count']=sample[idx]
print formatFasta(seq)
#!/usr/local/bin/python
import re
import sys
from obitools.fasta import formatFasta,fastaIterator
from obitools.options import getOptionManager
from obitools.utils import ColumnFile, deprecatedScript
from obitools.fast import Fast
from obitools import NucSequence
def addSplitOptions(optionManager):
optionManager.add_option('-p','--prefix',
action="store", dest="prefix",
metavar="<PREFIX FILENAME>",
type="string",
default=None,
help="sequence of the direct primer")
optionManager.add_option('-t','--tag-name',
action="store", dest="tagname",
metavar="<tagname>",
type="string",
default=None,
help="file containing tag used")
optionManager.add_option('-u','--undefined',
action="store", dest="undefined",
metavar="<FILENAME>",
type="string",
default=None,
help="file used to store unidentified sequences")
def outputFile(seq,options):
if options.tagname is None:
name = seq.id
else:
if options.tagname in seq:
name = seq[options.tagname]
else:
name=None
if name is None:
if options.undefined is not None:
file = open(options.undefined,'a')
else:
file = None
else:
if options.prefix is not None:
name = '%s%s' % (options.prefix,name)
file = open('%s.fasta' % name,'a')
return file
if __name__=='__main__':
deprecatedScript('obisplit')
try:
import psyco
psyco.full()
except ImportError:
pass
optionParser = getOptionManager([addSplitOptions],
entryIterator=fastaIterator
)
(options, entries) = optionParser()
for seq in entries:
f = outputFile(seq, options)
if f is not None:
print >>f,formatFasta(seq)
del f
#!/usr/local/bin/python
import fileinput
import re
import getopt
import sys
from obitools.fasta import fastaIterator,formatFasta
from obitools.fast import Fast
def strandOnRefGenerator(seqref,kup=2):
hashd = Fast(seqref,kup)
hashr = Fast(seqref.complement(),kup)
def isDirect(seq):
sdirect,p =hashd(seq)
sreverse,p=hashr(seq)
return sdirect > sreverse,sdirect,sreverse
return isDirect
def printHelp():
print "-----------------------------------"
print " fastaGrep.py"
print "-----------------------------------"
print "fastaGrep.py [option] <argument>"
print "-----------------------------------"
print "-h --help : print this help"
print "-r --reference=<fasta> : fasta file containing reference sequence"
print "-k --kup=## : word size used in fastn algorithm"
print "-----------------------------------"
if __name__=='__main__':