Commit 173dde28 by Eric Coissac

New realease of obitools including a first version of obipr2 and obisilva

parent 49e3eb3c
...@@ -5,5 +5,7 @@ File format conversions ...@@ -5,5 +5,7 @@ File format conversions
:maxdepth: 2 :maxdepth: 2
scripts/obiconvert scripts/obiconvert
scripts/obipr2
scripts/obisilva
scripts/obitaxonomy scripts/obitaxonomy
scripts/obitab scripts/obitab
\ No newline at end of file
.. automodule:: obipr2
:py:mod:`obipr2` specific options
-------------------------------------
.. cmdoption:: --local=<DIRNAME>
Specify you have already downloaded a copy of the PR2 database located at the following URL
`<http://5.196.17.195/pr2/download/entire_database>`_
*Example:*
.. code-block:: bash
> obipr2 --local=PR2Dir
This format **PR2** database pre-downloaded in the `PR2Dir` directory.
.. include:: ../optionsSet/defaultoptions.txt
\ No newline at end of file
.. automodule:: obisilva
:py:mod:`obisilva` specific options
-------------------------------------
.. cmdoption:: -s , --ssu
Specify that you are interested in the **SSU** database.
*Example:*
.. code-block:: bash
> obisilva --ssu --parc
This download and format into an ecoPCR database the latest version of the **SSUParc** database of **Silva**.
.. cmdoption:: -l, --lsu
Specify that you are interested in the **LSU** database.
*Example:*
.. code-block:: bash
> obisilva --ssu --parc
This download and format into an ecoPCR database the latest version of the **LSUParc** database of **Silva**.
.. cmdoption:: -p , --parc
Specify that you are interested in the **Parc** (complete) version of the **Silva** database.
.. cmdoption:: -r , --ref
Specify that you are interested in the **Reference** (cleaned to keep only high quality sequences)
version of the **Silva** database.
.. cmdoption:: -n , --nr
Specify that you are interested in the **Non redundant** version of the **Silva** database.
just a version of the to closely related sequence is kept in this version of the database
.. warning::
Non redundant version of **Silva** exists only for the SSU sequences
in its **Reference** and **Truncated** version
.. cmdoption:: -t , --trunc
Specify that you are interested in the **Truncated** (limited to the rDNA element without flanked regions)
version of the **Silva** database.
.. cmdoption:: --local=<DIRNAME>
Specify you have already downloaded a copy of the **Silva** database located at the following URL
`<http://www.arb-**Silva**.de/no_cache/download/archive/current/Exports/>`_
*Example:*
.. code-block:: bash
> obisilva --ssu --parc --local=**Silva**Dir
This format the **SSUParc** version of the **Silva** database pre-downloaded in the `**Silva**Dir` directory.
.. include:: ../optionsSet/defaultoptions.txt
\ No newline at end of file
...@@ -19,7 +19,7 @@ from os import path ...@@ -19,7 +19,7 @@ from os import path
PACKAGE = "OBITools" PACKAGE = "OBITools"
VERSION = "1.1.000" VERSION = "1.1.001"
AUTHOR = 'Eric Coissac' AUTHOR = 'Eric Coissac'
EMAIL = 'eric@coissac.eu' EMAIL = 'eric@coissac.eu'
URL = 'metabarcoding.org/obitools' URL = 'metabarcoding.org/obitools'
......
...@@ -34,6 +34,7 @@ if __name__=='__main__': ...@@ -34,6 +34,7 @@ if __name__=='__main__':
optionParser = getOptionManager([addSequenceFilteringOptions,addInOutputOption],progdoc=__doc__) optionParser = getOptionManager([addSequenceFilteringOptions,addInOutputOption],progdoc=__doc__)
(options, entries) = optionParser() (options, entries) = optionParser()
goodSeq = sequenceFilterIteratorGenerator(options) goodSeq = sequenceFilterIteratorGenerator(options)
writer = sequenceWriterGenerator(options) writer = sequenceWriterGenerator(options)
......
#!/usr/local/bin/python
'''
:py:mod:`obipr2`: manages taxonomic databases
==================================================
:py:mod:`obipr2`: converts and optionally download the `PR2 database <http://ssu-rrna.org/pr2/>`_
into an ecoPCR database. The formated database include the taxonomy as defined by the PR2 authors.
.. warning::
Take care that the numeric taxids associated to the sequences are internal
to this PR2 database and not compatible with the NCBI taxids.
The taxids present in a version of the Silva database are are just valid for
this version of the database and not compatible with the taxids used in another version
downloaded at an other time.
*Example:*
.. code-block:: bash
> obipr2
This command downloads and formats the latest version of the PR2 database from
the official `PR2 web site<http://ssu-rrna.org/pr2/>`_.
.. codeauthor:: Eric Coissac <eric.coissac@metabarcoding.org>
'''
from obitools.options import getOptionManager
from obitools.ecopcr.taxonomy import Taxonomy
from obitools.fasta import fastaIterator
import sys
from obitools.utils import universalOpen, ColumnFile
import re
import urllib2
from obitools.ecopcr.sequence import EcoPCRDBSequenceWriter
from obitools.utils import progressBar
from os.path import isfile, join
from os import listdir
def numberInStr(s) :
containsNumber = False
for c in s :
if c.isdigit() :
containsNumber = True
return containsNumber
def silvaOptions(optionManager):
optionManager.add_option('--localdb',
action="store", dest="local",
type='str',
default=None,
help="Local copy of the files located in the specified directory "
"will be used instead of those present on the PR2 web site")
optionManager.add_option('-m','--min-taxid',
action="store", dest="taxashift",
type="int",
metavar="####",
default=10000000,
help="minimal taxid for the species taxid")
# http://5.196.17.195/pr2/download/entire_database/gb203_pr2.fasta.gz
siteurl="http://5.196.17.195/"
baseurl="%s/pr2/download/entire_database" % siteurl
def getHyperlink(url):
furl = urllib2.urlopen(url)
data = "".join([l.strip() for l in furl])
href = re.compile('<a .*?</a>',re.IGNORECASE)
target=re.compile('href="(.*?)"',re.IGNORECASE)
filename=re.compile(">(.*?)</a>",re.IGNORECASE)
hrefs = href.findall(data)
links = {}
for h in hrefs:
t = target.search(h).group(1)
f = filename.search(h).group(1)
links[f]=t
return links
def pr2URLS(options):
global baseurl
if options.local is not None:
archive = dict((f,f) for f in listdir(options.local) if isfile(join(options.local,f)))
baseurl=options.local
else:
archive=getHyperlink(baseurl)
pr2file = [x.strip() for x in archive.keys()
if x.strip().endswith('pr2.fasta.gz') or x.strip().endswith('pr2.fasta')
]
version_pattern = re.compile("^gb([0-9]*)", re.IGNORECASE)
versions = [int(version_pattern.search(x.strip()).group(1)) for x in pr2file]
latest = max(versions)
seqfile=pr2file[versions.index(latest)]
pr2txfile = [x for x in archive.keys()
if x.endswith('pr2.tlf.gz') or x.endswith('pr2.tlf')
]
versions = [int(version_pattern.search(x).group(1)) for x in pr2txfile]
print versions
taxfile = pr2txfile[versions.index(latest)]
try:
sequrl = archive[seqfile]
except KeyError:
if seqfile[-3:]=='.gz':
seqfile=seqfile[0:-3]
else:
seqfile=seqfile+'.gz'
sequrl = archive[seqfile]
try:
taxurl = archive[taxfile]
except KeyError:
if taxfile[-3:]=='.gz':
taxfile=taxfile[0:-3]
else:
taxfile=taxfile+'.gz'
taxurl = archive[taxfile]
output = "pr2_gb%d" % latest
return "%s/%s" %(baseurl,sequrl),"%s/%s" %(baseurl,taxurl),output
pathElementPattern = re.compile("^ *(.*?) {(.*?)} *$", re.IGNORECASE)
def pr2PathParser(path):
x = pathElementPattern.match(path)
rank = x.group(1)
if rank=='classe':
rank='class'
elif rank=='ordre':
rank='order'
elif rank=='famille':
rank='family'
elif rank=='genre':
rank='genus'
elif rank=='espece':
rank='species'
elif rank.strip()=="":
rank="no rank"
return rank,x.group(2)
class Pr2Dump(Taxonomy):
def __init__(self,taxdump=None):
self._path=taxdump
self._readNodeTable(taxdump)
Taxonomy.__init__(self)
def _taxonCmp(t1,t2):
if t1[0] < t2[0]:
return -1
elif t1[0] > t2[0]:
return +1
return 0
_taxonCmp=staticmethod(_taxonCmp)
def _readNodeTable(self,dumpfile):
nodes = ColumnFile(dumpfile,
sep='\t',
types=(str,pr2PathParser))
print >>sys.stderr,"Reading taxonomy dump file..."
# (taxid,rank,parent)
nexttaxid = 2
taxidx={'root':1}
actaxid={}
taxonomy=[[1,'root',1,'root','pr2']]
for node in nodes:
ac = node[0]
path = [('root','root')] + node[2:]
allpath = [[]]
for s in path:
allpath.append(allpath[-1]+[s[1]])
allpath.pop(0)
allpath=[";".join(x) for x in allpath]
i=0
for p in allpath:
try:
taxid = taxidx[p]
except KeyError:
taxid=nexttaxid
taxidx[p]=taxid
nexttaxid+=1
parent=p.rsplit(";",1)[0]
ptaxid=taxidx[parent]
rank = path[i][0]
name = path[i][1]
taxonomy.append([taxid,rank,ptaxid,name,'pr2'])
i+=1
actaxid[ac]=taxid
print >>sys.stderr,"List all taxonomy rank..."
ranks =list(set(x[1] for x in taxonomy))
ranks.sort()
print >>sys.stderr,ranks
rankidx = dict(map(None,ranks,xrange(len(ranks))))
self._taxonomy=taxonomy
self._localtaxon=len(taxonomy)
print >>sys.stderr,"Indexing taxonomy..."
index = {}
for i in xrange(self._localtaxon):
index[self._taxonomy[i][0]]=i
print >>sys.stderr,"Indexing parent and rank..."
for t in self._taxonomy:
t[1]=rankidx[t[1]]
t[2]=index[t[2]]
self._ranks=ranks
self._index=index
self._preferedName = []
self._name=[(n[3],'scientific name',self._index[n[0]]) for n in taxonomy]
self.pr2ac=actaxid
def pr22obi(seq,taxonomy):
try:
# parent = taxonomy.findTaxonByTaxid(taxonomy.silvaname[ancestor])
oriid=seq.id
seq.id,seq.definition=oriid.split("|",1)
taxid=taxonomy.pr2ac[seq.id]
seq['taxid']=taxid
except KeyError:
pass
return seq
if __name__ == '__main__':
optionParser = getOptionManager([silvaOptions])
(options, entries) = optionParser()
sequrl,taxurl,options.ecopcroutput = pr2URLS(options)
taxonomydata = universalOpen(taxurl)
options.taxonomy = Pr2Dump(taxonomydata)
# if options.write != '' :
# options.write = open(options.write, 'w')
entries = fastaIterator(universalOpen(sequrl))
writer = EcoPCRDBSequenceWriter(options)
nseq = len(options.taxonomy.pr2ac)
progressBar(1,nseq,
head=options.ecopcroutput)
done=0
for e in entries:
e = pr22obi(e, options.taxonomy)
done+=1
progressBar(done,nseq,
head=options.ecopcroutput)
if 'taxid' in e:
writer.put(e)
else:
print >>sys.stderr,"\nCannot find taxon for entry : %s : %s" % (e.id,e.definition)
print >>sys.stderr
\ No newline at end of file
...@@ -10,14 +10,14 @@ file by describing sequence record groups and defining how many and which sequen ...@@ -10,14 +10,14 @@ file by describing sequence record groups and defining how many and which sequen
from each group must be retrieved. from each group must be retrieved.
""" """
from obitools.format.options import addInOutputOption, sequenceWriterGenerator,\ from obitools.format.options import addInOutputOption, sequenceWriterGenerator
addInputFormatOption
from obitools.options import getOptionManager from obitools.options import getOptionManager
from obitools.ecopcr.options import addTaxonomyDBOptions, loadTaxonomyDatabase from obitools.ecopcr.options import addTaxonomyDBOptions, loadTaxonomyDatabase
from random import random from random import random
from obitools.utils import progressBar from obitools.utils import progressBar
import math import math
import sys import sys
from obitools.utils.bioseq import mergeTaxonomyClassification
def minimum(seqs): def minimum(seqs):
return min(s['select'] for s in seqs) return min(s['select'] for s in seqs)
......
This diff is collapsed. Click to expand it.
...@@ -90,7 +90,7 @@ cdef class DNAComplementSequence(WrappedBioSequence): ...@@ -90,7 +90,7 @@ cdef class DNAComplementSequence(WrappedBioSequence):
cpdef bytes getId(self) cpdef bytes getId(self)
cpdef setId(self, bytes value) cpdef setId(self, bytes value)
cpdef bytes getStr(self) cpdef bytes getStr(self)
cpdef int _posInWrapped(self, int position) cpdef int _posInWrapped(self, int position) except *
cpdef getSymbolAt(self, int position) cpdef getSymbolAt(self, int position)
cpdef object complement(self) cpdef object complement(self)
......
...@@ -21,13 +21,13 @@ _default_raw_parser=__default_raw_parser ...@@ -21,13 +21,13 @@ _default_raw_parser=__default_raw_parser
cdef class WrapperSetIterator(object): cdef class WrapperSetIterator(object):
def __init__(self,s): def __init__(self,s):
self._i = set.__iter__(s) self._i = set.__iter__(s)
def next(self): def next(self): # @ReservedAssignment
return self._i.next()() return self._i.next()()
def __iter__(self): def __iter__(self):
return self return self
cdef class WrapperSet(set): cdef class WrapperSet(set):
def __iter__(self): def __iter__(self): # @DuplicatedSignature
return WrapperSetIterator(self) return WrapperSetIterator(self)
...@@ -49,7 +49,7 @@ cdef class BioSequence(object): ...@@ -49,7 +49,7 @@ cdef class BioSequence(object):
can only be called by a subclass constructor. can only be called by a subclass constructor.
''' '''
def __init__(self,bytes id, bytes seq, def __init__(self,bytes id, bytes seq, # @DuplicatedSignature
bytes definition=None, bytes definition=None,
bytes rawinfo=None, bytes rawinfo=None,
bytes rawparser=__default_raw_parser,**info): bytes rawparser=__default_raw_parser,**info):
...@@ -282,7 +282,7 @@ cdef class BioSequence(object): ...@@ -282,7 +282,7 @@ cdef class BioSequence(object):
else: else:
raise TypeError,key raise TypeError,key
def __iter__(self): def __iter__(self): # @DuplicatedSignature
''' '''
Iterate through the sequence symbols Iterate through the sequence symbols
''' '''
...@@ -455,7 +455,7 @@ cdef class WrappedBioSequence(BioSequence): ...@@ -455,7 +455,7 @@ cdef class WrappedBioSequence(BioSequence):
""" """
def __init__(self, object reference, def __init__(self, object reference, # @DuplicatedSignature
bytes id=None, bytes id=None,
bytes definition=None, bytes definition=None,
**info): **info):
...@@ -512,16 +512,16 @@ cdef class WrappedBioSequence(BioSequence): ...@@ -512,16 +512,16 @@ cdef class WrappedBioSequence(BioSequence):
return self.wrapped.isNucleotide() return self.wrapped.isNucleotide()
def iterkeys(self): def iterkeys(self): # @DuplicatedSignature
return uniqueChain(self._info.iterkeys(), return uniqueChain(self._info.iterkeys(),
self.wrapped.iterkeys()) self.wrapped.iterkeys())
def rawiteritems(self): def rawiteritems(self): # @DuplicatedSignature
return chain(self._info.iteritems(), return chain(self._info.iteritems(),
(x for x in self.wrapped.rawiteritems() (x for x in self.wrapped.rawiteritems()
if x[0] not in self._info)) if x[0] not in self._info))
def iteritems(self): def iteritems(self): # @DuplicatedSignature
for x in self.iterkeys(): for x in self.iterkeys():
yield (x,self[x]) yield (x,self[x])
...@@ -537,7 +537,7 @@ cdef class WrappedBioSequence(BioSequence): ...@@ -537,7 +537,7 @@ cdef class WrappedBioSequence(BioSequence):
cpdef getSymbolAt(self, int position): cpdef getSymbolAt(self, int position):
return self.wrapped.getSymbolAt(self.posInWrapped(position)) return self.wrapped.getSymbolAt(self.posInWrapped(position))
cpdef int posInWrapped(self, int position, object reference=None): cpdef int posInWrapped(self, int position, object reference=None) except *:
if reference is None or reference is self.wrapped: if reference is None or reference is self.wrapped:
return self._posInWrapped(position) return self._posInWrapped(position)
else: else:
...@@ -562,7 +562,7 @@ cdef class WrappedBioSequence(BioSequence): ...@@ -562,7 +562,7 @@ cdef class WrappedBioSequence(BioSequence):
raise AttributeError raise AttributeError
cpdef int _posInWrapped(self, int position): cpdef int _posInWrapped(self, int position) except *:
return position return position
...@@ -588,7 +588,7 @@ cdef class SubSequence(WrappedBioSequence): ...@@ -588,7 +588,7 @@ cdef class SubSequence(WrappedBioSequence):
""" """
""" """
def __init__(self, object reference, def __init__(self, object reference, # @DuplicatedSignature
object location=None, object location=None,
int start=0, object stop=None, int start=0, object stop=None,
object id=None, object id=None,
...@@ -636,16 +636,16 @@ cdef class SubSequence(WrappedBioSequence): ...@@ -636,16 +636,16 @@ cdef class SubSequence(WrappedBioSequence):
return seq return seq
def __len__(self): def __len__(self): # @DuplicatedSignature
return len(self._xrange) return len(self._xrange)
cpdef bytes getStr(self): cpdef bytes getStr(self):
return b''.join([x for x in self]) return b''.join([x for x in self])
def __iter__(self): def __iter__(self): # @DuplicatedSignature
return (self.wrapped.getSymbolAt(x) for x in self._xrange) return (self.wrapped.getSymbolAt(x) for x in self._xrange)
cpdef int _posInWrapped(self, int position): cpdef int _posInWrapped(self, int position) except *:
return self._xrange[position] return self._xrange[position]
...@@ -664,7 +664,7 @@ cdef class DNAComplementSequence(WrappedBioSequence): ...@@ -664,7 +664,7 @@ cdef class DNAComplementSequence(WrappedBioSequence):
of this class are produced by using the :py:meth:`NucSequence.complement` method. of this class are produced by using the :py:meth:`NucSequence.complement` method.
""" """
def __init__(self, object reference, def __init__(self, object reference, # @DuplicatedSignature
bytes id=None, bytes id=None,
bytes definition=None, bytes definition=None,
**info): **info):
...@@ -689,13 +689,13 @@ cdef class DNAComplementSequence(WrappedBioSequence): ...@@ -689,13 +689,13 @@ cdef class DNAComplementSequence(WrappedBioSequence):
''' '''
WrappedBioSequence.setId(self,value) WrappedBioSequence.setId(self,value)
def __len__(self): def __len__(self): # @DuplicatedSignature
return len(self._wrapped) return len(self._wrapped)
cpdef bytes getStr(self): cpdef bytes getStr(self):
return b''.join([x for x in self]) return b''.join([x for x in self])
def __iter__(self): def __iter__(self): # @DuplicatedSignature
return (self.getSymbolAt(x) for x in xrange(len(self))) return (self.getSymbolAt(x) for x in xrange(len(self)))
cpdef int _posInWrapped(self, int position) except *: cpdef int _posInWrapped(self, int position) except *:
......
...@@ -54,6 +54,13 @@ class EcoPCRDBSequenceIterator(EcoPCRDBFile): ...@@ -54,6 +54,13 @@ class EcoPCRDBSequenceIterator(EcoPCRDBFile):
for seq in self.__ecoSequenceIterator(seqfile): for seq in self.__ecoSequenceIterator(seqfile):
yield seq yield seq
@property
def taxonomy(self):
"""Return the taxonomy associated to the ecoPCRDB reader"""
return self._taxonomy
class EcoPCRDBSequenceWriter(object): class EcoPCRDBSequenceWriter(object):
def __init__(self,options,fileidx=None,ftid=None,type=None,definition=None,append=False): def __init__(self,options,fileidx=None,ftid=None,type=None,definition=None,append=False):
...@@ -61,14 +68,14 @@ class EcoPCRDBSequenceWriter(object): ...@@ -61,14 +68,14 @@ class EcoPCRDBSequenceWriter(object):
# Take care of the taxonomy associated to the database # Take care of the taxonomy associated to the database
self._taxonomy= loadTaxonomyDatabase(options) self._taxonomy= loadTaxonomyDatabase(options)
dbname=options.ecopcroutput dbname = options.ecopcroutput
if (self._taxonomy is not None if (self._taxonomy is not None
and (not hasattr(options,'ecodb') or options.ecodb!=dbname)): and (not hasattr(options,'ecodb') or options.ecodb!=dbname)):
print >> sys.stderr,"Writing the taxonomy file...", print >> sys.stderr,"Writing the taxonomy file...",
ecoTaxonomyWriter(dbname,self._taxonomy) ecoTaxonomyWriter(dbname,self._taxonomy)
print >> sys.stderr,"Ok" print >> sys.stderr,"Ok"
# Identifiy the next sequence file number # Identifiy the next sequence file number
if fileidx is None: if fileidx is None:
p = re.compile(r'([0-9]{3})\.sdx') p = re.compile(r'([0-9]{3})\.sdx')
...@@ -147,5 +154,6 @@ class EcoPCRDBSequenceWriter(object): ...@@ -147,5 +154,6 @@ class EcoPCRDBSequenceWriter(object):
self._file.seek(0,0) self._file.seek(0,0)
self._file.write(struct.pack('> I',self._sequenceCount)) self._file.write(struct.pack('> I',self._sequenceCount))
self._file.close() self._file.close()
...@@ -378,10 +378,10 @@ class EcoTaxonomyDB(Taxonomy,EcoPCRDBFile): ...@@ -378,10 +378,10 @@ class EcoTaxonomyDB(Taxonomy,EcoPCRDBFile):
i=0 i=0
for x in self.__ecoTaxonomicIterator(): for x in self.__ecoTaxonomicIterator():
taxonomy.append(x) taxonomy.append(x)
if x[4]=='ncbi': if x[4]!='local':
localtaxon+=1 localtaxon+=1
if buildIndex or x[4]!='ncbi': if buildIndex or x[4]=='local':
index[x[0]] = i index[x[0]] = i
i+=1 i+=1
...@@ -433,13 +433,15 @@ class TaxonomyDump(Taxonomy): ...@@ -433,13 +433,15 @@ class TaxonomyDump(Taxonomy):
print >>sys.stderr,"Adding deleted taxid..."