Commit 173dde28 by Eric Coissac

New release of obitools, including a first version of obipr2 and obisilva

parent 49e3eb3c
......@@ -5,5 +5,7 @@ File format conversions
:maxdepth: 2
scripts/obiconvert
scripts/obipr2
scripts/obisilva
scripts/obitaxonomy
scripts/obitab
\ No newline at end of file
.. automodule:: obipr2
:py:mod:`obipr2` specific options
-------------------------------------
.. cmdoption:: --local=<DIRNAME>
Specify that you have already downloaded a copy of the PR2 database, located at the following URL
`<http://5.196.17.195/pr2/download/entire_database>`_
*Example:*
.. code-block:: bash
> obipr2 --local=PR2Dir
This formats the **PR2** database pre-downloaded in the `PR2Dir` directory.
.. include:: ../optionsSet/defaultoptions.txt
\ No newline at end of file
.. automodule:: obisilva
:py:mod:`obisilva` specific options
-------------------------------------
.. cmdoption:: -s , --ssu
Specify that you are interested in the **SSU** database.
*Example:*
.. code-block:: bash
> obisilva --ssu --parc
This downloads and formats into an ecoPCR database the latest version of the **SSUParc** database of **Silva**.
.. cmdoption:: -l, --lsu
Specify that you are interested in the **LSU** database.
*Example:*
.. code-block:: bash
> obisilva --lsu --parc
This downloads and formats into an ecoPCR database the latest version of the **LSUParc** database of **Silva**.
.. cmdoption:: -p , --parc
Specify that you are interested in the **Parc** (complete) version of the **Silva** database.
.. cmdoption:: -r , --ref
Specify that you are interested in the **Reference** (cleaned to keep only high quality sequences)
version of the **Silva** database.
.. cmdoption:: -n , --nr
Specify that you are interested in the **Non redundant** version of the **Silva** database.
only one representative of too closely related sequences is kept in this version of the database
.. warning::
Non redundant version of **Silva** exists only for the SSU sequences
in its **Reference** and **Truncated** version
.. cmdoption:: -t , --trunc
Specify that you are interested in the **Truncated** (limited to the rDNA element without flanked regions)
version of the **Silva** database.
.. cmdoption:: --local=<DIRNAME>
Specify that you have already downloaded a copy of the **Silva** database, located at the following URL
`<http://www.arb-silva.de/no_cache/download/archive/current/Exports/>`_
*Example:*
.. code-block:: bash
> obisilva --ssu --parc --local=SilvaDir
This formats the **SSUParc** version of the **Silva** database pre-downloaded in the `SilvaDir` directory.
.. include:: ../optionsSet/defaultoptions.txt
\ No newline at end of file
......@@ -19,7 +19,7 @@ from os import path
PACKAGE = "OBITools"
VERSION = "1.1.000"
VERSION = "1.1.001"
AUTHOR = 'Eric Coissac'
EMAIL = 'eric@coissac.eu'
URL = 'metabarcoding.org/obitools'
......
......@@ -34,6 +34,7 @@ if __name__=='__main__':
optionParser = getOptionManager([addSequenceFilteringOptions,addInOutputOption],progdoc=__doc__)
(options, entries) = optionParser()
goodSeq = sequenceFilterIteratorGenerator(options)
writer = sequenceWriterGenerator(options)
......
#!/usr/local/bin/python
'''
:py:mod:`obipr2`: manages taxonomic databases
==================================================
:py:mod:`obipr2`: converts and optionally downloads the `PR2 database <http://ssu-rrna.org/pr2/>`_
into an ecoPCR database. The formatted database includes the taxonomy as defined by the PR2 authors.
.. warning::
Take care that the numeric taxids associated to the sequences are internal
to this PR2 database and not compatible with the NCBI taxids.
The taxids present in a version of the PR2 database are just valid for
this version of the database and are not compatible with the taxids used in another version
downloaded at another time.
*Example:*
.. code-block:: bash
> obipr2
This command downloads and formats the latest version of the PR2 database from
the official `PR2 web site<http://ssu-rrna.org/pr2/>`_.
.. codeauthor:: Eric Coissac <eric.coissac@metabarcoding.org>
'''
from obitools.options import getOptionManager
from obitools.ecopcr.taxonomy import Taxonomy
from obitools.fasta import fastaIterator
import sys
from obitools.utils import universalOpen, ColumnFile
import re
import urllib2
from obitools.ecopcr.sequence import EcoPCRDBSequenceWriter
from obitools.utils import progressBar
from os.path import isfile, join
from os import listdir
def numberInStr(s):
    """Return True when the string *s* contains at least one decimal digit.

    :param s: the string to scan
    :returns: True if any character of *s* is a digit, False otherwise
    """
    # any() short-circuits on the first digit found, whereas the original
    # loop always scanned the whole string
    return any(c.isdigit() for c in s)
def silvaOptions(optionManager):
    """Register the obipr2-specific command line options on *optionManager*.

    Adds ``--localdb`` (use a pre-downloaded copy of the PR2 files) and
    ``-m/--min-taxid`` (first taxid usable for species-level taxa).
    """
    # Option specifications as (positional args, keyword args) pairs,
    # registered in order so the help output stays stable.
    specs = (
        (('--localdb',),
         dict(action="store", dest="local", type='str', default=None,
              help="Local copy of the files located in the specified directory "
                   "will be used instead of those present on the PR2 web site")),
        (('-m', '--min-taxid'),
         dict(action="store", dest="taxashift", type="int", metavar="####",
              default=10000000,
              help="minimal taxid for the species taxid")),
    )
    for args, kwargs in specs:
        optionManager.add_option(*args, **kwargs)
# Example of a sequence file served by the PR2 download site:
# http://5.196.17.195/pr2/download/entire_database/gb203_pr2.fasta.gz
# NOTE(review): siteurl already ends with '/' and the format string inserts
# another one, so baseurl contains a double slash -- harmless for HTTP,
# but worth confirming for the local-directory code path in pr2URLS.
siteurl="http://5.196.17.195/"
baseurl="%s/pr2/download/entire_database" % siteurl
def getHyperlink(url):
    """Scrape every ``<a>`` anchor of the page at *url*.

    :param url: URL of the HTML page to fetch
    :returns: a dict mapping the anchor's visible text to its href target

    Parses the page with regexes; assumes each anchor carries an href
    attribute (anchors without one would raise AttributeError).
    """
    handle = urllib2.urlopen(url)
    # Collapse the page onto a single line so anchors split across
    # lines are still matched by the regexes below.
    page = "".join(line.strip() for line in handle)

    anchor_re = re.compile('<a .*?</a>', re.IGNORECASE)
    target_re = re.compile('href="(.*?)"', re.IGNORECASE)
    label_re = re.compile(">(.*?)</a>", re.IGNORECASE)

    links = {}
    for anchor in anchor_re.findall(page):
        label = label_re.search(anchor).group(1)
        links[label] = target_re.search(anchor).group(1)
    return links
def pr2URLS(options):
    """Resolve the PR2 sequence URL, taxonomy URL and output prefix.

    Scans either a local directory (``--localdb``) or the PR2 download
    page for ``*pr2.fasta[.gz]`` and ``*pr2.tlf[.gz]`` files, selects the
    latest release (highest ``gbNNN`` prefix) and returns the triple
    ``(sequence_url, taxonomy_url, output_prefix)``.

    Fix: removed a leftover debug statement (``print versions``) that
    polluted stdout on every run.
    """
    global baseurl

    # Build a {filename: link target} mapping, either from the local
    # directory content or by scraping the PR2 download page.
    if options.local is not None:
        archive = dict((f,f) for f in listdir(options.local)
                       if isfile(join(options.local,f)))
        baseurl = options.local
    else:
        archive = getHyperlink(baseurl)

    # Sequence files, compressed or not.
    pr2file = [x.strip() for x in archive.keys()
               if x.strip().endswith('pr2.fasta.gz')
               or x.strip().endswith('pr2.fasta')]

    version_pattern = re.compile("^gb([0-9]*)", re.IGNORECASE)

    versions = [int(version_pattern.search(x.strip()).group(1)) for x in pr2file]
    latest = max(versions)
    seqfile = pr2file[versions.index(latest)]

    # Taxonomy dump (.tlf) of the same release; raises ValueError if the
    # dump matching `latest` is missing from the archive.
    pr2txfile = [x for x in archive.keys()
                 if x.endswith('pr2.tlf.gz') or x.endswith('pr2.tlf')]
    versions = [int(version_pattern.search(x).group(1)) for x in pr2txfile]
    taxfile = pr2txfile[versions.index(latest)]

    # The link target may exist only in the compressed or only in the
    # uncompressed variant: retry with the other suffix before giving up.
    try:
        sequrl = archive[seqfile]
    except KeyError:
        if seqfile[-3:]=='.gz':
            seqfile = seqfile[0:-3]
        else:
            seqfile = seqfile+'.gz'
        sequrl = archive[seqfile]

    try:
        taxurl = archive[taxfile]
    except KeyError:
        if taxfile[-3:]=='.gz':
            taxfile = taxfile[0:-3]
        else:
            taxfile = taxfile+'.gz'
        taxurl = archive[taxfile]

    output = "pr2_gb%d" % latest

    return "%s/%s" %(baseurl,sequrl),"%s/%s" %(baseurl,taxurl),output
# One element of a PR2 taxonomic path: "rank {name}"
pathElementPattern = re.compile("^ *(.*?) {(.*?)} *$", re.IGNORECASE)

# The PR2 dump uses French rank names; translate them to the standard
# English ones. Unknown ranks pass through unchanged.
_RANK_TRANSLATION = {'classe' : 'class',
                     'ordre'  : 'order',
                     'famille': 'family',
                     'genre'  : 'genus',
                     'espece' : 'species',
                    }

def pr2PathParser(path):
    """Parse one ``rank {name}`` element of a PR2 taxonomic path.

    :param path: a string such as ``"genre {Homo}"``
    :returns: a ``(rank, name)`` tuple with the rank translated to
              English; an empty rank yields ``"no rank"``
    :raises AttributeError: if *path* does not match the expected format
    """
    match = pathElementPattern.match(path)
    rank = match.group(1)
    if rank.strip() == "":
        rank = "no rank"
    else:
        # Table lookup replaces the original if/elif chain
        rank = _RANK_TRANSLATION.get(rank, rank)
    return rank, match.group(2)
class Pr2Dump(Taxonomy):
    """In-memory taxonomy built from a PR2 taxonomy dump (".tlf" file).

    Taxids are generated locally while reading the dump: they are internal
    to this PR2 database and NOT compatible with NCBI taxids.  After
    construction, ``self.pr2ac`` maps each sequence accession number to
    its generated taxid.
    """

    def __init__(self,taxdump=None):
        # taxdump: open file handle (or path) on the PR2 ".tlf" dump
        self._path=taxdump
        self._readNodeTable(taxdump)
        Taxonomy.__init__(self)

    def _taxonCmp(t1,t2):
        # Ordering on the taxid (first field) of two taxonomy records.
        if t1[0] < t2[0]:
            return -1
        elif t1[0] > t2[0]:
            return +1
        return 0
    _taxonCmp=staticmethod(_taxonCmp)

    def _readNodeTable(self,dumpfile):
        """Parse the dump and build the internal taxonomy tables.

        Each dump line associates an accession number (column 0) with a
        taxonomic path whose elements are parsed by ``pr2PathParser``
        into (rank, name) pairs.
        """
        # NOTE(review): only two column types are given although lines
        # carry more columns -- presumably ColumnFile reuses the last
        # type for the remaining columns; confirm against its code.
        nodes = ColumnFile(dumpfile,
                           sep='\t',
                           types=(str,pr2PathParser))
        print >>sys.stderr,"Reading taxonomy dump file..."
        # (taxid,rank,parent)
        # Taxid 1 is reserved for the artificial root; generated ids start at 2.
        nexttaxid = 2
        taxidx={'root':1}   # full ";"-joined path string -> generated taxid
        actaxid={}          # accession number -> taxid of its deepest taxon
        taxonomy=[[1,'root',1,'root','pr2']]
        for node in nodes:
            ac = node[0]
            # NOTE(review): node[1] is skipped here (path starts at node[2:])
            # -- presumably a non-path column; confirm against the dump format.
            path = [('root','root')] + node[2:]
            # Build every prefix of the path so each ancestor gets its
            # own taxid: ["root", "root;A", "root;A;B", ...]
            allpath = [[]]
            for s in path:
                allpath.append(allpath[-1]+[s[1]])
            allpath.pop(0)
            allpath=[";".join(x) for x in allpath]
            i=0
            for p in allpath:
                try:
                    taxid = taxidx[p]
                except KeyError:
                    # First time this path prefix is seen: allocate a new
                    # taxid and register the taxon under its parent.
                    taxid=nexttaxid
                    taxidx[p]=taxid
                    nexttaxid+=1
                    parent=p.rsplit(";",1)[0]
                    ptaxid=taxidx[parent]
                    rank = path[i][0]
                    name = path[i][1]
                    taxonomy.append([taxid,rank,ptaxid,name,'pr2'])
                i+=1
            # After the loop, taxid is the id of the deepest (last) taxon.
            actaxid[ac]=taxid

        print >>sys.stderr,"List all taxonomy rank..."
        ranks =list(set(x[1] for x in taxonomy))
        ranks.sort()
        print >>sys.stderr,ranks
        # Python 2 idiom: map(None, xs, ys) zips the two sequences, so
        # rankidx maps each rank name to its index in the sorted list.
        rankidx = dict(map(None,ranks,xrange(len(ranks))))

        self._taxonomy=taxonomy
        self._localtaxon=len(taxonomy)

        print >>sys.stderr,"Indexing taxonomy..."
        index = {}
        for i in xrange(self._localtaxon):
            index[self._taxonomy[i][0]]=i

        print >>sys.stderr,"Indexing parent and rank..."
        # Replace symbolic rank names and parent taxids in place with
        # their respective index positions, as Taxonomy expects.
        for t in self._taxonomy:
            t[1]=rankidx[t[1]]
            t[2]=index[t[2]]

        self._ranks=ranks
        self._index=index
        self._preferedName = []
        self._name=[(n[3],'scientific name',self._index[n[0]]) for n in taxonomy]
        self.pr2ac=actaxid
def pr22obi(seq, taxonomy):
    """Annotate a PR2 fasta record with its locally generated taxid.

    The PR2 fasta id has the form ``accession|definition``: it is split
    into ``seq.id`` and ``seq.definition``, and the accession is looked
    up in ``taxonomy.pr2ac`` to set ``seq['taxid']``.

    :param seq: a sequence record (dict-like, with ``id`` and
                ``definition`` attributes), modified in place
    :param taxonomy: a :py:class:`Pr2Dump` instance
    :returns: *seq*, possibly annotated with a ``taxid`` key

    Fix: an id without a ``"|"`` separator used to raise an uncaught
    ValueError; such records are now left untouched (and later reported
    by the main loop as taxon-less entries).
    """
    try:
        oriid = seq.id
        seq.id, seq.definition = oriid.split("|", 1)
        taxid = taxonomy.pr2ac[seq.id]
        seq['taxid'] = taxid
    except KeyError:
        # Accession absent from the taxonomy dump: leave the record untagged.
        pass
    except ValueError:
        # No "|" in the id: keep id/definition as-is, no taxid.
        pass
    return seq
if __name__ == '__main__':
    # Parse the command line; only the obipr2-specific options are registered.
    optionParser = getOptionManager([silvaOptions])
    (options, entries) = optionParser()

    # Resolve the sequence/taxonomy locations (remote URLs or local files)
    # and the ecoPCR database output prefix.
    sequrl,taxurl,options.ecopcroutput = pr2URLS(options)

    # Build the local PR2 taxonomy from the ".tlf" dump.
    taxonomydata = universalOpen(taxurl)
    options.taxonomy = Pr2Dump(taxonomydata)

#    if options.write != '' :
#        options.write = open(options.write, 'w')

    entries = fastaIterator(universalOpen(sequrl))
    writer = EcoPCRDBSequenceWriter(options)

    # One fasta entry is expected per accession present in the taxonomy,
    # so the accession count drives the progress bar.
    nseq = len(options.taxonomy.pr2ac)

    progressBar(1,nseq,
                head=options.ecopcroutput)
    done=0
    for e in entries:
        e = pr22obi(e, options.taxonomy)
        done+=1
        progressBar(done,nseq,
                    head=options.ecopcroutput)
        # Only records whose accession resolved to a taxid are written.
        if 'taxid' in e:
            writer.put(e)
        else:
            print >>sys.stderr,"\nCannot find taxon for entry : %s : %s" % (e.id,e.definition)
    print >>sys.stderr
......@@ -10,14 +10,14 @@ file by describing sequence record groups and defining how many and which sequen
from each group must be retrieved.
"""
from obitools.format.options import addInOutputOption, sequenceWriterGenerator,\
addInputFormatOption
from obitools.format.options import addInOutputOption, sequenceWriterGenerator
from obitools.options import getOptionManager
from obitools.ecopcr.options import addTaxonomyDBOptions, loadTaxonomyDatabase
from random import random
from obitools.utils import progressBar
import math
import sys
from obitools.utils.bioseq import mergeTaxonomyClassification
def minimum(seqs):
return min(s['select'] for s in seqs)
......
This diff is collapsed. Click to expand it.
......@@ -90,7 +90,7 @@ cdef class DNAComplementSequence(WrappedBioSequence):
cpdef bytes getId(self)
cpdef setId(self, bytes value)
cpdef bytes getStr(self)
cpdef int _posInWrapped(self, int position)
cpdef int _posInWrapped(self, int position) except *
cpdef getSymbolAt(self, int position)
cpdef object complement(self)
......
......@@ -21,13 +21,13 @@ _default_raw_parser=__default_raw_parser
cdef class WrapperSetIterator(object):
def __init__(self,s):
self._i = set.__iter__(s)
def next(self):
def next(self): # @ReservedAssignment
return self._i.next()()
def __iter__(self):
return self
cdef class WrapperSet(set):
def __iter__(self):
def __iter__(self): # @DuplicatedSignature
return WrapperSetIterator(self)
......@@ -49,7 +49,7 @@ cdef class BioSequence(object):
can only be called by a subclass constructor.
'''
def __init__(self,bytes id, bytes seq,
def __init__(self,bytes id, bytes seq, # @DuplicatedSignature
bytes definition=None,
bytes rawinfo=None,
bytes rawparser=__default_raw_parser,**info):
......@@ -282,7 +282,7 @@ cdef class BioSequence(object):
else:
raise TypeError,key
def __iter__(self):
def __iter__(self): # @DuplicatedSignature
'''
Iterate through the sequence symbols
'''
......@@ -455,7 +455,7 @@ cdef class WrappedBioSequence(BioSequence):
"""
def __init__(self, object reference,
def __init__(self, object reference, # @DuplicatedSignature
bytes id=None,
bytes definition=None,
**info):
......@@ -512,16 +512,16 @@ cdef class WrappedBioSequence(BioSequence):
return self.wrapped.isNucleotide()
def iterkeys(self):
def iterkeys(self): # @DuplicatedSignature
return uniqueChain(self._info.iterkeys(),
self.wrapped.iterkeys())
def rawiteritems(self):
def rawiteritems(self): # @DuplicatedSignature
return chain(self._info.iteritems(),
(x for x in self.wrapped.rawiteritems()
if x[0] not in self._info))
def iteritems(self):
def iteritems(self): # @DuplicatedSignature
for x in self.iterkeys():
yield (x,self[x])
......@@ -537,7 +537,7 @@ cdef class WrappedBioSequence(BioSequence):
cpdef getSymbolAt(self, int position):
return self.wrapped.getSymbolAt(self.posInWrapped(position))
cpdef int posInWrapped(self, int position, object reference=None):
cpdef int posInWrapped(self, int position, object reference=None) except *:
if reference is None or reference is self.wrapped:
return self._posInWrapped(position)
else:
......@@ -562,7 +562,7 @@ cdef class WrappedBioSequence(BioSequence):
raise AttributeError
cpdef int _posInWrapped(self, int position):
cpdef int _posInWrapped(self, int position) except *:
return position
......@@ -588,7 +588,7 @@ cdef class SubSequence(WrappedBioSequence):
"""
"""
def __init__(self, object reference,
def __init__(self, object reference, # @DuplicatedSignature
object location=None,
int start=0, object stop=None,
object id=None,
......@@ -636,16 +636,16 @@ cdef class SubSequence(WrappedBioSequence):
return seq
def __len__(self):
def __len__(self): # @DuplicatedSignature
return len(self._xrange)
cpdef bytes getStr(self):
return b''.join([x for x in self])
def __iter__(self):
def __iter__(self): # @DuplicatedSignature
return (self.wrapped.getSymbolAt(x) for x in self._xrange)
cpdef int _posInWrapped(self, int position):
cpdef int _posInWrapped(self, int position) except *:
return self._xrange[position]
......@@ -664,7 +664,7 @@ cdef class DNAComplementSequence(WrappedBioSequence):
of this class are produced by using the :py:meth:`NucSequence.complement` method.
"""
def __init__(self, object reference,
def __init__(self, object reference, # @DuplicatedSignature
bytes id=None,
bytes definition=None,
**info):
......@@ -689,13 +689,13 @@ cdef class DNAComplementSequence(WrappedBioSequence):
'''
WrappedBioSequence.setId(self,value)
def __len__(self):
def __len__(self): # @DuplicatedSignature
return len(self._wrapped)
cpdef bytes getStr(self):
return b''.join([x for x in self])
def __iter__(self):
def __iter__(self): # @DuplicatedSignature
return (self.getSymbolAt(x) for x in xrange(len(self)))
cpdef int _posInWrapped(self, int position) except *:
......
......@@ -54,6 +54,13 @@ class EcoPCRDBSequenceIterator(EcoPCRDBFile):
for seq in self.__ecoSequenceIterator(seqfile):
yield seq
@property
def taxonomy(self):
"""Return the taxonomy associated to the ecoPCRDB reader"""
return self._taxonomy
class EcoPCRDBSequenceWriter(object):
def __init__(self,options,fileidx=None,ftid=None,type=None,definition=None,append=False):
......@@ -61,14 +68,14 @@ class EcoPCRDBSequenceWriter(object):
# Take care of the taxonomy associated to the database
self._taxonomy= loadTaxonomyDatabase(options)
dbname=options.ecopcroutput
dbname = options.ecopcroutput
if (self._taxonomy is not None
and (not hasattr(options,'ecodb') or options.ecodb!=dbname)):
print >> sys.stderr,"Writing the taxonomy file...",
ecoTaxonomyWriter(dbname,self._taxonomy)
print >> sys.stderr,"Ok"
# Identifiy the next sequence file number
if fileidx is None:
p = re.compile(r'([0-9]{3})\.sdx')
......@@ -147,5 +154,6 @@ class EcoPCRDBSequenceWriter(object):
self._file.seek(0,0)
self._file.write(struct.pack('> I',self._sequenceCount))
self._file.close()
......@@ -378,10 +378,10 @@ class EcoTaxonomyDB(Taxonomy,EcoPCRDBFile):
i=0
for x in self.__ecoTaxonomicIterator():
taxonomy.append(x)
if x[4]=='ncbi':
if x[4]!='local':
localtaxon+=1
if buildIndex or x[4]!='ncbi':
if buildIndex or x[4]=='local':
index[x[0]] = i
i+=1
......@@ -433,13 +433,15 @@ class TaxonomyDump(Taxonomy):
print >>sys.stderr,"Adding deleted taxid..."
for taxid in self._deletedNodeIterator('%s/delnodes.dmp' % taxdir):
self._index[taxid]=None
Taxonomy.__init__(self)
self._nameidx = {}
for x in self._name :
if x[0] not in self._nameidx :
self._nameidx[x[0]] = [x[2]]
else :
self._nameidx[x[0]].append(x[2])
# self._nameidx = {}
# for x in self._name :
# if x[0] not in self._nameidx :
# self._nameidx[x[0]] = [x[2]]
# else :
# self._nameidx[x[0]].append(x[2])
def _taxonCmp(t1,t2):
......@@ -556,13 +558,16 @@ def ecoTaxonomyWriter(prefix, taxonomy,onlyLocal=False):
totalSize = 4 + 4 + 4 + 4 + namelength
packed = struct.pack('> I I I I I %ds' % namelength,
totalSize,
tx[0],
tx[1],
tx[2],
namelength,
tx[3])
try:
packed = struct.pack('> I I I I I %ds' % namelength,
totalSize,
tx[0],
tx[1],
tx[2],
namelength,
tx[3])
except :
raise TypeError,"Cannot convert %s" % tx[3]
return packed
......@@ -611,19 +616,19 @@ def ecoTaxonomyWriter(prefix, taxonomy,onlyLocal=False):
def ecoTaxWriter(file,taxonomy):
output = open(file,'wb')
nbtaxon = reduce(lambda x,y:x+y,(1 for t in taxonomy if t[4]=='ncbi'),0)
nbtaxon = reduce(lambda x,y:x+y,(1 for t in taxonomy if t[4]!='local'),0)
output.write(struct.pack('> I',nbtaxon))
for tx in taxonomy:
if tx[4]=='ncbi':
if tx[4]!='local':
output.write(ecoTaxPacker(tx))
output.close()
return nbtaxon < len(taxonomy)
def ecoLocalTaxWriter(file,taxonomy):
nbtaxon = reduce(lambda x,y:x+y,(1 for t in taxonomy if t[4]!='ncbi'),0)
nbtaxon = reduce(lambda x,y:x+y,(1 for t in taxonomy if t[4]=='local'),0)
if nbtaxon:
output = open(file,'wb')
......@@ -631,7 +636,7 @@ def ecoTaxonomyWriter(prefix, taxonomy,onlyLocal=False):
output.write(struct.pack('> I',nbtaxon))
for tx in taxonomy:
if tx[4]!='ncbi':
if tx[4]=='local':
output.write(ecoTaxPacker(tx))
output.close()
......
......@@ -64,7 +64,9 @@ def genericEntryIteratorGenerator(bytes startEntry=None,
cdef bytes line
cdef bint started
file = universalOpen(file)
if not hasattr(file, 'next'):
file = universalOpen(file)
line = file.next()
started = head or isBeginning(line,c_startEntry)
......
......@@ -23,6 +23,7 @@ from _options import fileWithProgressBar, \
currentFileSize, \
currentFileTell, \
allEntryIterator
from obitools.ecopcr.sequence import EcoPCRDBSequenceIterator
class ObiHelpFormatter (IndentedHelpFormatter):
def __init__(self,
......@@ -87,7 +88,10 @@ def getOptionManager(optionDefinitions,entryIterator=None,progdoc=None,checkForm
options.readerIterator=ei
i = allEntryIterator(files,ei,with_progress=options.progressbar)
if ei==EcoPCRDBSequenceIterator:
options.taxonomy=files[0]
i = allEntryIterator(files,ei,with_progress=options.progressbar,options=options)
return options,i