Commit 59f5ab4d by Eric Coissac

Patch several bugs or inconsistencies following the tutorial at Anthony's lab

parent e0d8e2fe
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?eclipse-pydev version="1.0"?>
<pydev_project>
<?eclipse-pydev version="1.0"?><pydev_project>
<pydev_pathproperty name="org.python.pydev.PROJECT_SOURCE_PATH">
<path>/OBITools-1.0/src</path>
<path>/OBITools-1.0/textwrangler</path>
</pydev_pathproperty>
<pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 2.7</pydev_property>
<pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Python2.7</pydev_property>
<pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Python-2.7</pydev_property>
</pydev_project>
......@@ -17,7 +17,7 @@ Standard output format
Generating an ecoPCR database
.............................
.. cmdoption:: --ecopcrDB-output=<PREFIX_FILENAME>
.. cmdoption:: --ecopcrdb-output=<PREFIX_FILENAME>
Creates an ecoPCR database from sequence records results
......
......@@ -6,9 +6,16 @@
try:
from setuptools.core import setup
from setuptools import setup
except ImportError:
from distutils.core import setup
import ez_setup
ez_setup.use_setuptools()
from setuptools import setup
# try:
# from setuptools.core import setup
# except ImportError:
# from distutils.core import setup
from distutils.extension import Extension
from distutils.util import convert_path
from distutils import log
......@@ -39,6 +46,10 @@ import glob
from os import path
# requires = ['Cython>=0.20', 'Sphinx>=1.2']
requires = ['Cython>=0.20']
class install_scripts(ori_install_scripts):
def remove_deprecated_script(self):
......@@ -209,7 +220,7 @@ def findC(root,base=None,pyrexs=None):
#from obitools.version import version as obiversion
#sys.path.pop(0)
VERSION = "1.0.beta"
VERSION = "1.0.beta.2"
AUTHOR = 'Eric Coissac'
EMAIL = 'eric@coissac.eu'
URL = 'www.grenoble.prabi.fr/trac/OBITools'
......@@ -242,6 +253,18 @@ else:
setup(name="OBITools",
description="Scripts and library for sequence analysis",
classifiers=[
'Development Status :: 5 - Production/Stable',
'Environment :: Console',
'Intended Audience :: Research',
'License :: CeCILL-V2',
'Operating System :: Unix like',
'Programming Language :: Python',
'Programming Language :: Python :: 2',
'Topic :: NGS Data processing',
'Topic :: DNA metabarcoding',
'Topic :: Utilities',
],
version=VERSION,
author=AUTHOR,
author_email=EMAIL,
......@@ -251,7 +274,7 @@ setup(name="OBITools",
package_dir = {'': SRC},
packages=findPackage(SRC),
cmdclass = {'build_ext': build_ext,'build_scripts':build_scripts, 'install_scripts':install_scripts},
requires=['Cython (>=0.16)'],
install_requires=requires,
zip_safe = False,
ext_modules=EXTENTION)
......@@ -202,8 +202,8 @@ if __name__ == '__main__':
localdata=False
if options.write != '' :
options.write = open(options.write, 'w')
# if options.write != '' :
# options.write = open(options.write, 'w')
for t in options.newtaxon:
tx = t.split(':')
......
......@@ -70,10 +70,9 @@ def loadTaxonomyDatabase(options):
if isinstance(options.taxonomy, Taxonomy):
return options.taxonomy
#taxonomy = ecobarcodeDatabaseConnection(options)
taxonomy = None
if (taxonomy is not None or
options.taxonomy is not None or
if (options.taxonomy is not None or
options.taxdump is not None):
if options.taxdump is not None:
taxonomy = TaxonomyDump(options.taxdump)
......
from obitools import NucSequence
from obitools.ecopcr import EcoPCRDBFile
from obitools.ecopcr.taxonomy import EcoTaxonomyDB, ecoTaxonomyWriter
from obitools.ecopcr.options import loadTaxonomyDatabase
from obitools.ecopcr.annotation import EcoPCRDBAnnotationWriter
from obitools.utils import universalOpen
from glob import glob
import struct
import gzip
import sys
import re
class EcoPCRDBSequenceIterator(EcoPCRDBFile):
......@@ -40,11 +42,11 @@ class EcoPCRDBSequenceIterator(EcoPCRDBFile):
for record in self._ecoRecordIterator(file):
lrecord = len(record)
lnames = lrecord - (4*4+20)
(taxid,seqid,deflength,seqlength,cptseqlength,string)=struct.unpack('> I 20s I I I %ds' % lnames, record)
(taxid,seqid,deflength,seqlength,cptseqlength,string)=struct.unpack('> I 20s I I I %ds' % lnames, record) # @UnusedVariable
seqid=seqid.strip('\x00')
de = string[:deflength]
seq = gzip.zlib.decompress(string[deflength:])
bioseq = NucSequence(seqid,seq,de,taxidx=taxid,taxid=self._taxonomy._taxonomy[taxid][0])
bioseq = NucSequence(seqid,seq,de,taxid=self._taxonomy._taxonomy[taxid][0])
yield bioseq
def __iter__(self):
......@@ -54,8 +56,26 @@ class EcoPCRDBSequenceIterator(EcoPCRDBFile):
class EcoPCRDBSequenceWriter(object):
def __init__(self,dbname,fileidx=1,taxonomy=None,ftid=None,type=None,definition=None,append=False):
self._taxonomy=taxonomy
def __init__(self,options,fileidx=None,ftid=None,type=None,definition=None,append=False):
# Take care of the taxonomy associated to the database
self._taxonomy= loadTaxonomyDatabase(options)
dbname=options.ecopcroutput
if (self._taxonomy is not None
and (not hasattr(options,'ecodb') or options.ecodb!=dbname)):
print >> sys.stderr,"Writing the taxonomy file...",
ecoTaxonomyWriter(dbname,self._taxonomy)
print >> sys.stderr,"Ok"
# Identify the next sequence file number
if fileidx is None:
p = re.compile(r'([0-9]{3})\.sdx')
fileidx = max(list(int(p.search(i).group(1))
for i in glob('%s_[0-9][0-9][0-9].sdx' % dbname))+[0]
) +1
self._filename="%s_%03d.sdx" % (dbname,fileidx)
if append:
mode ='r+b'
......@@ -72,12 +92,7 @@ class EcoPCRDBSequenceWriter(object):
mode = 'wb'
self._file = open(self._filename,mode)
self._file.write(struct.pack('> I',self._sequenceCount))
if self._taxonomy is not None:
print >> sys.stderr,"Writing the taxonomy file...",
ecoTaxonomyWriter(dbname,self._taxonomy)
print >> sys.stderr,"Ok"
if type is not None:
assert ftid is not None,"You must specify an id attribute for features"
self._annotation = EcoPCRDBAnnotationWriter(dbname, ftid, fileidx, type, definition)
......
......@@ -329,10 +329,10 @@ class EcoTaxonomyDB(Taxonomy,EcoPCRDBFile):
try :
lt=0
for record in self._ecoRecordIterator(self._localTaxonFile):
for record in self._ecoRecordIterator(self._localTaxonFile,noError=True):
lrecord = len(record)
lnames = lrecord - 16
(taxid,rankid,parentidx,nameLength,name)=struct.unpack('> I I I I %ds' % lnames, record)
(taxid,rankid,parentidx,nameLength,name)=struct.unpack('> I I I I %ds' % lnames, record) # @UnusedVariable
lt+=1
yield (taxid,rankid,parentidx,name,'local')
print >> sys.stderr, " [INFO : Local taxon file found] : %d added taxa" % lt
......@@ -344,7 +344,7 @@ class EcoTaxonomyDB(Taxonomy,EcoPCRDBFile):
yield record
def __ecoAliasIterator(self):
for record in self._ecoRecordIterator(self._aliasFile):
for record in self._ecoRecordIterator(self._aliasFile,noError=True):
(taxid,index) = struct.unpack('> I i',record)
yield taxid,index
......@@ -402,7 +402,7 @@ class EcoTaxonomyDB(Taxonomy,EcoPCRDBFile):
try :
self._preferedName = [(x[0],'obi',x[2])
for x in self.__ecoNameIterator(self._preferedNamesFile)]
for x in self.__ecoNameIterator(self._preferedNamesFile,noError=True)]
print >> sys.stderr, " [INFO : Preferred taxon name file found] : %d added taxa" % len(self._preferedName)
except:
print >> sys.stderr, " [INFO : Preferred taxon name file not found]"
......
......@@ -2,7 +2,7 @@
import sys
from obitools.fasta import formatFasta
from obitools.ecopcr.sequence import EcoPCRDBSequenceWriter
#from obitools.ecopcr.sequence import EcoPCRDBSequenceWriter
cpdef printOutput(options,seq,output=sys.stdout):
if options.output is not None:
......
......@@ -18,7 +18,6 @@ from obitools.fasta import formatFasta, rawFastaIterator,\
from obitools.fastq import formatFastq
from obitools.ecopcr.sequence import EcoPCRDBSequenceWriter
from obitools.ecopcr.options import loadTaxonomyDatabase
from cPickle import dump,load,UnpicklingError
......@@ -34,7 +33,7 @@ from obitools.format.sequence import skipOnErrorIterator
from obitools import BioSequence
from obitools.utils import FakeFile
from glob import glob
def binarySequenceIterator(lineiterator):
......@@ -168,7 +167,7 @@ def addOutputFormatOption(optionManager):
# help="Output sequences in sap fasta format "
# "(Sequence must have a taxid and a taxonomy has to be loaded)")
group.add_option('--ecopcrDB-output',
group.add_option('--ecopcrdb-output',
action="store", dest="ecopcroutput",
default=None,
help="Output sequences in ecopcr database format "
......@@ -313,6 +312,10 @@ def sequenceWriterGenerator(options,output=sys.stdout):
self._format=formatSAPFastaGenerator(options)
elif options.outputFormater is not None:
self._format=options.outputFormater
if hasattr(seq,'_hasTaxid') and seq._hasTaxid:
seq.extractTaxon()
s = self._format(seq,upper=self._upper)
try:
self._file.write(s)
......@@ -336,8 +339,7 @@ def sequenceWriterGenerator(options,output=sys.stdout):
if options.ecopcroutput is not None:
taxo = loadTaxonomyDatabase(options)
writer=EcoPCRDBSequenceWriter(options.ecopcroutput,taxonomy=taxo)
writer=EcoPCRDBSequenceWriter(options)
elif options.output==dump:
writer=BinaryWriter(options,output)
else:
......
......@@ -6,7 +6,7 @@ from obitools.utils import universalOpen
from obitools.utils import universalTell
from obitools.utils import fileSize
from obitools.ecopcr.sequence import EcoPCRDBSequenceIterator
from glob import glob
from logging import debug
import sys
......@@ -68,24 +68,31 @@ def allEntryIterator(files,entryIterator,with_progress=False,histo_step=102):
if files :
for f in files:
if (entryIterator != EcoPCRDBSequenceIterator) :
if (entryIterator != EcoPCRDBSequenceIterator) :
cfs.currentInputFileName=f
f = universalOpen(f)
cfs.currentFile=f
cfs.currentFileSize=fileSize(cfs.currentFile)
debug(f)
if with_progress:
f=fileWithProgressBar(f,step=histo_step)
if entryIterator is None:
for line in f:
yield line
try:
f = universalOpen(f,noError=True)
except Exception as e:
if glob('%s_[0-9][0-9][0-9].sdx' % f):
entryIterator=EcoPCRDBSequenceIterator
else:
print >>sys.stderr, e
sys.exit();
else:
for entry in entryIterator(f):
yield entry
else :
yield EcoPCRDBSequenceIterator(f)
cfs.currentFile=f
cfs.currentFileSize=fileSize(cfs.currentFile)
debug(f)
if with_progress:
f=fileWithProgressBar(f,step=histo_step)
if entryIterator is None:
for line in f:
yield line
else:
for entry in entryIterator(f):
yield entry
else:
if entryIterator is None:
......
......@@ -26,7 +26,7 @@ class FileFormatError(Exception):
def universalOpen(file,*options):
def universalOpen(file,noError=False):
'''
Open a file, gzipped or not.
......@@ -47,7 +47,7 @@ def universalOpen(file,*options):
if isinstance(file,str):
try:
if urllib2.urlparse.urlparse(file)[0]=='':
rep = open(file,*options)
rep = open(file)
else:
rep = urllib2.urlopen(file,timeout=15)
......@@ -60,8 +60,11 @@ def universalOpen(file,*options):
name = data[0].filename
rep = zip.open(name)
except Exception as e:
print>>sys.stderr, e
sys.exit();
if not noError:
print >>sys.stderr, e
sys.exit();
else:
raise e
else:
rep = file
return rep
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment