Commit ef888974 by Eric Coissac

--no commit message

parent 5f4c894b
#!/usr/local/bin/python
from obitools.fasta import fastaIterator
from obitools.fasta import fastaNucIterator
from obitools.align.ssearch import ssearchIterator
from obitools.utils.bioseq import uniqSequence,sortSequence
from obitools.ecopcr.taxonomy import Taxonomy
from obitools.ecopcr.taxonomy import EcoTaxonomyDB
import sys
import re
......@@ -128,7 +128,7 @@ def count(data):
if __name__=='__main__':
optionParser = getOptionManager([addSearchOptions],
entryIterator=fastaIterator
entryIterator=fastaNucIterator
)
(options, entries) = optionParser()
......@@ -136,8 +136,8 @@ if __name__=='__main__':
assert options.program in ('fasta35','ssearch35')
taxonomy = Taxonomy(options.taxonomy)
db = fastaIterator(options.database)
taxonomy = EcoTaxonomyDB(options.taxonomy)
db = fastaNucIterator(options.database)
taxonlink = {}
rankid = taxonomy.findRankByName(options.explain)
......
......@@ -6,7 +6,7 @@ from obitools.options import getOptionManager
from obitools.ecotag.parser import EcoTagFileIterator
from obitools.ecotag.parser import ecoTagIdentifiedFilter
from obitools.ecopcr.taxonomy import Taxonomy
from obitools.ecopcr.taxonomy import EcoTaxonomyDB
def addAbstractOptions(optionManager):
......@@ -43,7 +43,7 @@ if __name__=='__main__':
(options, entries) = optionParser()
print >>sys.stderr,"Reading taxonomy...",
taxonomy = Taxonomy(options.taxonomy)
taxonomy = EcoTaxonomyDB(options.taxonomy)
rankid = taxonomy.findRankByName(options.rank)
print >>sys.stderr,"Ok"
......
......@@ -25,7 +25,7 @@ if __name__=='__main__':
)
(options, entries) = optionParser()
tax = taxonomy.Taxonomy(options.db)
tax = taxonomy.EcoTaxonomyDB(options.db)
ranks = set(x for x in tax.rankIterator())
results = [seq for seq in entries]
......
......@@ -25,7 +25,7 @@ if __name__=='__main__':
(options, entries) = optionParser()
tax = taxonomy.Taxonomy(options.db)
tax = taxonomy.EcoTaxonomyDB(options.db)
seqd= sequence.EcoPCRDBSequenceIterator(options.db,taxonomy=tax)
ranks = set(x for x in tax.rankIterator())
......
......@@ -72,6 +72,9 @@ def experiment(tag,tags):
def checkSequence(seq,direct,reverse=None,taglength=5,tags=None):
# look for direct primer match
# take into account the first match
dscore,dpos = direct(seq)
ok = dscore > len(direct) - 5 and dpos[0] > 0 and (len(seq) - dpos[0] > len(direct))
good = None
......@@ -79,6 +82,7 @@ def checkSequence(seq,direct,reverse=None,taglength=5,tags=None):
if ok:
dpos = dpos[0]
# Check if there is enough place for the tag
ok = dpos >= taglength
seq['scoreDirect']=dscore
......@@ -95,7 +99,9 @@ def checkSequence(seq,direct,reverse=None,taglength=5,tags=None):
rscore,rpos = reverse(seq)
ok = rscore > len(reverse) - 5
rpos = rpos[-1]
# take the first occurence of the reverse primer
rpos = rpos[0]
seq['scoreReverse']=rscore
if rpos > 0 and len(seq)-rpos >len(reverse):
......
......@@ -37,7 +37,7 @@ if __name__=='__main__':
)
(options, entries) = optionParser()
tax = taxonomy.Taxonomy(options.db)
tax = taxonomy.EcoTaxonomyDB(options.db)
if options.undefined is not None:
options.undefined=open(options.undefined,'w')
......
#!/usr/local/bin/python
from obitools.fasta import fastaIterator,formatFasta
from obitools.ecopcr.taxonomy import Taxonomy
from obitools.ecopcr.taxonomy import EcoTaxonomyDB
from obitools.options import getOptionManager
import sys
......@@ -26,7 +26,7 @@ if __name__=='__main__':
(options, entries) = optionParser()
taxonomy = Taxonomy(options.taxonomy)
taxonomy = EcoTaxonomyDB(options.taxonomy)
for seq in entries:
taxid = int(seq['taxid'])
......
......@@ -3,7 +3,7 @@
import sys
from obitools.fasta import fastaIterator,formatFasta
from obitools.ecopcr.taxonomy import Taxonomy
from obitools.ecopcr.taxonomy import EcoTaxonomyDB
from obitools.options import getOptionManager
def addTaxonomyOptions(optionManager):
......@@ -32,7 +32,7 @@ if __name__=='__main__':
assert options.taxonomy is not None,"You must specify a -t option"
assert options.rank is not None,"You must specify a -r option"
taxonomy = Taxonomy(options.taxonomy)
taxonomy = EcoTaxonomyDB(options.taxonomy)
found = set()
......
......@@ -7,7 +7,7 @@ from logging import root,DEBUG
from obitools.fasta import fastaIterator,formatFasta
from obitools.utils.bioseq import uniqSequence
from obitools.options import getOptionManager
from obitools.ecopcr.taxonomy import Taxonomy
from obitools.ecopcr.taxonomy import EcoTaxonomyDB
def addTaxonomyFilterOptions(optionManager):
......@@ -35,7 +35,7 @@ if __name__=='__main__':
(options, entries) = optionParser()
if options.taxonomy is not None:
taxonomy = Taxonomy(options.taxonomy)
taxonomy = EcoTaxonomyDB(options.taxonomy)
else:
taxonomy=None
......
#!/usr/local/bin/python
'''
Created on 6 mai 2009
@author: coissac
'''
from obitools.options import getOptionManager
from obitools.options.taxonomyfilter import addTaxonomyFilterOptions, \
taxonomyFilterIteratorGenerator
from obitools.utils import ColumnFile,universalOpen
from obitools.ecopcr.taxonomy import EcoTaxonomyDB,ecoTaxonomyWriter
from obitools.ecopcr.sequence import EcoPCRDBSequenceWriter,EcoPCRDBSequenceIterator
from obitools.format.sequence.embl import emblIterator
from obitools import NucSequence
import sys
from urllib2 import urlopen
def addGROptions(optionManager):
optionManager.add_option('-g','--genome-reviews',
action="store", dest="ftp",
metavar="<Genome reviews FTP URL>",
type="string",
default="ftp://ftp.ebi.ac.uk/pub/databases/genome_reviews",
help="Genome reviews FTP URL default value to ftp://ftp.ebi.ac.uk/pub/databases/genome_reviews")
optionManager.add_option('-n','--name',
action="store", dest="name",
metavar="PREFIX",
type="string",
default="ftp://ftp.ebi.ac.uk/pub/databases/genome_reviews",
help="New EcoPCR database name")
optionManager.add_option('-R','--recover',
action="store_true", dest="recover",
default=False,
help="Set into recover mode")
if __name__=='__main__':
try:
import psyco
psyco.full()
except ImportError:
pass
optionParser = getOptionManager([addGROptions,addTaxonomyFilterOptions])
(options, entries) = optionParser()
assert options.name is not None, "You must use -d options"
writeTaxtonomy= options.taxonomy is None or options.name != options.taxonomy
filter = taxonomyFilterIteratorGenerator(options)
if writeTaxtonomy:
ecoTaxonomyWriter(options.name, options.taxonomy)
print >>sys.stderr,"download : ",options.ftp+"/gr2species.txt"
gr2species = urlopen(options.ftp+"/gr2species.txt")
gr2species = ColumnFile(gr2species, '\t', True, (str,int,str,str,int,str),
head=('AC','taxid','name','replicon','GRID','uniprotID')
)
# a) sequence AC
# b) NCBI Tax ID
# a) Species name
# c) Chromosome/plasmid name
# d) Genome Reviews Accession number
# f) uniprot ornanism code
genome={}
for seq in filter(gr2species):
if seq['replicon'][0:10]=='Chromosome':
grid = seq['GRID']
if grid not in genome:
genome[grid]={'name':seq['name'],'AC':[],'taxid':seq['taxid'],'uniprotID':seq['uniprotID']}
genome[grid]['AC'].append(seq['AC'])
if options.recover:
print >>sys.stderr,"Readding old library...",
seqdb = EcoPCRDBSequenceIterator(options.name,options.taxonomy)
seqac = set(x.id for x in seqdb)
print >>sys.stderr," %d genomes already downloaded" % len(seqac)
db = EcoPCRDBSequenceWriter(options.name, taxonomy=options.taxonomy,append=True)
else:
db = EcoPCRDBSequenceWriter(options.name, taxonomy=options.taxonomy)
seqac=set()
for grid in genome:
sequence = []
ok=True
gac = '%s_%d' % (genome[grid]['uniprotID'],grid)
if gac in seqac:
print >> sys.stderr,'Skip genome [%d] : %s' % (grid,genome[grid]['name'])
else:
print >> sys.stderr,'Download genome [%d] : %s' % (grid,genome[grid]['name'])
for ac in genome[grid]['AC']:
print >> sys.stderr,' Sequence : %s...' % ac,
try:
data = emblIterator(universalOpen("%s/dat/cellular/%s.dat.gz" % (options.ftp,ac)))
sequence.append(data.next())
del data
print >> sys.stderr,'ok'
except:
print >> sys.stderr,'bad'
ok=False
if ok:
sequence=('X'*40).join([str(x) for x in sequence])
sequence=NucSequence(gac,
sequence,
genome[grid]['name'],taxid=genome[grid]['taxid'])
db.put(sequence)
......@@ -2,6 +2,7 @@ from obitools import utils
from obitools import NucSequence
from obitools.utils import universalOpen, universalTell, fileSize, progressBar
import struct
import sys
class EcoPCRFile(utils.ColumnFile):
......@@ -44,8 +45,22 @@ class EcoPCRDBFile(object):
def _ecoRecordIterator(self,file):
file = universalOpen(file)
(recordCount,) = struct.unpack('> I',file.read(4))
self._recover=False
for i in xrange(recordCount):
(recordSize,)=struct.unpack('>I',file.read(4))
record = file.read(recordSize)
yield record
if recordCount:
for i in xrange(recordCount):
(recordSize,)=struct.unpack('>I',file.read(4))
record = file.read(recordSize)
yield record
else:
print >> sys.stderr,"\n\n WARNING : EcoPCRDB readding set into recover data mode\n"
self._recover=True
ok=True
while(ok):
try:
(recordSize,)=struct.unpack('>I',file.read(4))
record = file.read(recordSize)
yield record
except:
ok=False
\ No newline at end of file
from obitools import NucSequence
from obitools.ecopcr import EcoPCRDBFile
from obitools.ecopcr.taxonomy import Taxonomy
from obitools.ecopcr.taxonomy import EcoTaxonomyDB
from obitools.ecopcr.annotation import EcoPCRDBAnnotationWriter
from obitools.utils import universalOpen
from glob import glob
import struct
import gzip
......@@ -29,7 +30,7 @@ class EcoPCRDBSequenceIterator(EcoPCRDBFile):
if taxonomy is not None:
self._taxonomy=taxonomy
else:
self._taxonomy=Taxonomy(path)
self._taxonomy=EcoTaxonomyDB(path)
self._seqfilesFiles = glob('%s_???.sdx' % self._path)
self._seqfilesFiles.sort()
......@@ -39,6 +40,7 @@ class EcoPCRDBSequenceIterator(EcoPCRDBFile):
lrecord = len(record)
lnames = lrecord - (4*4+20)
(taxid,seqid,deflength,seqlength,cptseqlength,string)=struct.unpack('> I 20s I I I %ds' % lnames, record)
seqid=seqid.strip('\x00')
de = string[:deflength]
seq = gzip.zlib.decompress(string[deflength:])
bioseq = NucSequence(seqid,seq,de,taxidx=taxid,taxid=self._taxonomy._taxonomy[taxid][0])
......@@ -51,13 +53,24 @@ class EcoPCRDBSequenceIterator(EcoPCRDBFile):
class EcoPCRDBSequenceWriter(object):
def __init__(self,dbname,fileidx=1,taxonomy=None,ftid=None,type=None,definition=None):
def __init__(self,dbname,fileidx=1,taxonomy=None,ftid=None,type=None,definition=None,append=False):
self._taxonomy=taxonomy
self._filename="%s_%03d.sdx" % (dbname,fileidx)
self._file = open(self._filename,'wb')
self._sequenceCount=0
self._file.write(struct.pack('> I',self._sequenceCount))
if append:
mode ='r+b'
f = universalOpen(self._filename)
(recordCount,) = struct.unpack('> I',f.read(4))
self._sequenceCount=recordCount
del f
self._file = open(self._filename,mode)
self._file.seek(0,0)
self._file.write(struct.pack('> I',0))
self._file.seek(0,2)
else:
self._sequenceCount=0
mode = 'wb'
self._file = open(self._filename,mode)
self._file.write(struct.pack('> I',self._sequenceCount))
......@@ -69,13 +82,13 @@ class EcoPCRDBSequenceWriter(object):
def _ecoSeqPacker(self,seq):
compactseq = gzip.zlib.compress(str(seq),9)
compactseq = gzip.zlib.compress(str(seq).upper(),9)
cptseqlength = len(compactseq)
delength = len(seq.definition)
totalSize = 4 + 20 + 4 + 4 + 4 + cptseqlength + delength
if self._taxonomy is None:
if self._taxonomy is None or 'taxid' not in seq:
taxon=-1
else:
taxon=self._taxonomy.findIndex(seq['taxid'])
......@@ -97,7 +110,8 @@ class EcoPCRDBSequenceWriter(object):
def put(self,sequence):
if self._taxonomy is not None:
self.extractTaxon()
if 'taxid' not in sequence and hasattr(sequence, 'extractTaxon'):
sequence.extractTaxon()
self._file.write(self._ecoSeqPacker(sequence))
if self._annotation is not None:
self._annotation.put(sequence, self._sequenceCount)
......
......@@ -34,7 +34,7 @@ def addSequenceFilteringOptions(optionManager):
help="regular expression pattern used to select "
"the sequence. The pattern is case insensitive")
optionManager.add_option('-d','--definition',
optionManager.add_option('-D','--definition',
action="callback", callback=_defintionOptionCallback,
type="string",
metavar="<REGULAR_PATTERN>",
......@@ -42,7 +42,7 @@ def addSequenceFilteringOptions(optionManager):
"the definition of the sequence. "
"The pattern is case sensitive")
optionManager.add_option('-i','--identifier',
optionManager.add_option('-I','--identifier',
action="callback", callback=_identifierOptionCallback,
type="string",
metavar="<REGULAR_PATTERN>",
......
from obitools.ecopcr.taxonomy import Taxonomy
from obitools.ecopcr.taxonomy import Taxonomy, EcoTaxonomyDB, TaxonomyDump
def addTaxonomyFilterOptions(optionManager):
......@@ -12,25 +12,66 @@ def addTaxonomyFilterOptions(optionManager):
help="select sequence with taxid tag containing "
"a parent of rank <RANK_NAME>")
optionManager.add_option('-t','--taxonomy',
optionManager.add_option('-r','--required',
action="append",
dest='required',
metavar="<TAXID>",
type="int",
default=[],
help="required taxid")
optionManager.add_option('-i','--ignore',
action="append",
dest='ignored',
metavar="<TAXID>",
type="int",
default=[],
help="ignored taxid")
optionManager.add_option('-d','--database',
action="store", dest="taxonomy",
metavar="<FILENAME>",
type="string",
help="ecoPCR taxonomy Database "
"name")
optionManager.add_option('-t','--taxonomy-dump',
action="store", dest="taxdump",
metavar="<FILENAME>",
type="string",
help="NCBI Taxonomy dump repository "
"name")
def taxonomyFilterGenerator(options):
if options.taxonomy is not None:
taxonomy = Taxonomy(options.taxonomy)
if options.taxonomy is not None or options.taxdump is not None:
if options.taxdump is not None:
taxonomy = TaxonomyDump(options.taxdump)
elif isinstance(options.taxonomy, Taxonomy):
taxonomy = options.taxonomy
elif isinstance(options.taxonomy, str):
taxonomy = EcoTaxonomyDB(options.taxonomy)
options.taxonomy=taxonomy
def taxonomyFilter(seq):
good = True
if 'taxid' in seq:
taxid = seq['taxid']
# print taxid,
if options.requiredRank is not None:
taxonatrank = taxonomy.getTaxonAtRank(taxid,options.requiredRank)
good = taxonatrank is not None
good = good and taxonatrank is not None
# print " Has rank : ",good,
if options.required:
good = good and reduce(lambda x,y: x or y,
(taxonomy.isAncestor(r,taxid) for r in options.required),
False)
# print " Required : ",good,
if options.ignored:
good = good and not reduce(lambda x,y: x or y,
(taxonomy.isAncestor(r,taxid) for r in options.required),
False)
# print " Ignored : ",good,
# print " Global : ",good
return good
......
......@@ -35,8 +35,8 @@ def universalOpen(file,*options):
if urllib2.urlparse.urlparse(file)[0]=='':
rep = open(file,*options)
else:
rep = urllib2.urlopen(file)
rep = urllib2.urlopen(file,timeout=15)
if file[-3:] == '.gz':
rep = GzipFile(fileobj=rep)
if file[-4:] == '.zip':
......@@ -155,7 +155,7 @@ def skipWhiteLineIterator(lineiterator):
class ColumnFile(object):
def __init__(self,stream,sep=None,strip=True,types=None,skip=None):
def __init__(self,stream,sep=None,strip=True,types=None,skip=None,head=None):
self._stream = universalOpen(stream)
self._delimiter=sep
self._strip=strip
......@@ -172,6 +172,7 @@ class ColumnFile(object):
self._lskip= len(skip)
else:
self._lskip= 0
self._head=head
def str2bool(x):
return bool(eval(x.strip()[0].upper(),{'T':True,'V':True,'F':False}))
......@@ -193,6 +194,8 @@ class ColumnFile(object):
if self._types:
it = endLessIterator(self._types)
data = [x[1](x[0]) for x in ((y,it.next()) for y in data)]
if self._head is not None:
data=dict(map(None, self._head,data))
return data
def tell(self):
......
......@@ -41,7 +41,7 @@ if __name__=='__main__':
(options, entries) = optionParser()
tax = taxonomy.Taxonomy(options.db)
tax = taxonomy.EcoTaxonomyDB(options.db)
if options.iterator==EcoPCRDBSequenceIterator:
seqd= options.iterator(options.db)
else:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment