Commit fdeaf595 by Eric Coissac

Change the way ecoPCRDB databases are written by obitools.

If an obitools command is called with several sequence files as input and
ecoPCRDB is requested as the output format, the sequences are split across
several sdx files (a new sdx file is started each time the input file changes).
parent 343d9ec6
...@@ -64,12 +64,14 @@ class EcoPCRDBSequenceIterator(EcoPCRDBFile): ...@@ -64,12 +64,14 @@ class EcoPCRDBSequenceIterator(EcoPCRDBFile):
class EcoPCRDBSequenceWriter(object): class EcoPCRDBSequenceWriter(object):
def __init__(self,options,fileidx=None,ftid=None,type=None,definition=None,append=False): def __init__(self,options,fileidx=None,ftid=None,type=None,definition=None,append=False):
from obitools.options import currentInputFileName
self.currentInputFileName=currentInputFileName
# Take care of the taxonomy associated to the database # Take care of the taxonomy associated to the database
self._currentfile=None
self._taxonomy= loadTaxonomyDatabase(options) self._taxonomy= loadTaxonomyDatabase(options)
dbname = options.ecopcroutput dbname = options.ecopcroutput
if (self._taxonomy is not None if (self._taxonomy is not None
and (not hasattr(options,'ecodb') or options.ecodb!=dbname)): and (not hasattr(options,'ecodb') or options.ecodb!=dbname)):
print >> sys.stderr,"Writing the taxonomy file...", print >> sys.stderr,"Writing the taxonomy file...",
...@@ -82,23 +84,25 @@ class EcoPCRDBSequenceWriter(object): ...@@ -82,23 +84,25 @@ class EcoPCRDBSequenceWriter(object):
fileidx = max(list(int(p.search(i).group(1)) fileidx = max(list(int(p.search(i).group(1))
for i in glob('%s_[0-9][0-9][0-9].sdx' % dbname))+[0] for i in glob('%s_[0-9][0-9][0-9].sdx' % dbname))+[0]
) +1 ) +1
self._fileidx=fileidx
self._dbname=dbname
self._filename="%s_%03d.sdx" % (dbname,fileidx) self._filename="%s_%03d.sdx" % (dbname,fileidx)
if append: if append:
mode ='r+b'
f = universalOpen(self._filename) f = universalOpen(self._filename)
(recordCount,) = struct.unpack('> I',f.read(4)) (recordCount,) = struct.unpack('> I',f.read(4))
self._sequenceCount=recordCount self._sequenceCount=recordCount
self._sequenceFileCount=recordCount
del f del f
self._file = open(self._filename,mode) self.open('r+b')
self._file.seek(0,0)
self._file.write(struct.pack('> I',0))
self._file.seek(0,2) self._file.seek(0,2)
else: else:
self._sequenceCount=0 self._sequenceCount=0
mode = 'wb' self._sequenceFileCount=0
self._file = open(self._filename,mode) self.open("wb")
self._file.write(struct.pack('> I',self._sequenceCount))
if type is not None: if type is not None:
assert ftid is not None,"You must specify an id attribute for features" assert ftid is not None,"You must specify an id attribute for features"
...@@ -141,7 +145,28 @@ class EcoPCRDBSequenceWriter(object): ...@@ -141,7 +145,28 @@ class EcoPCRDBSequenceWriter(object):
return packed return packed
def close(self):
    """Finalize the current .sdx file and close it.

    Seeks back to the start of the file, overwrites the 4-byte
    big-endian header with the number of sequences written to this
    file (``self._sequenceFileCount``), then closes the file.

    The call is a no-op when no file was ever opened or when the file
    is already closed, so that ``__del__`` can call it safely after an
    explicit ``close()`` or after a constructor failure.
    """
    current = getattr(self, '_file', None)
    if current is None or current.closed:
        return
    # The record count lives in the first 4 bytes of the file.
    current.seek(0, 0)
    current.write(struct.pack('> I', self._sequenceFileCount))
    current.close()
def open(self,mode):
    """Open the .sdx file for the current file index and write its header.

    The file name is built from ``self._dbname`` and ``self._fileidx``
    as ``<dbname>_<NNN>.sdx``. A 4-byte big-endian zero is written at
    the current position (the start of the file) as a placeholder for
    the record count; ``close()`` overwrites it with the real count.

    mode -- mode string passed to the builtin ``open``. With a
            truncating mode (one containing ``'w'``) the per-file
            sequence counter is reset to 0. With an update mode such as
            ``'r+b'`` (append to an existing file) the counter already
            set by the caller is kept, so the records present in the
            file are still counted when the header is rewritten.
    """
    self._filename="%s_%03d.sdx" % (self._dbname,self._fileidx)
    self._file=open(self._filename,mode)
    # Only a freshly created/truncated file starts with zero records.
    # Resetting unconditionally would discard the existing record count
    # that the caller loaded before reopening the file for appending.
    if 'w' in mode:
        self._sequenceFileCount=0
    self._file.write(struct.pack('> I',0))
def put(self,sequence): def put(self,sequence):
if self._currentfile is None:
self._currentfile=self.currentInputFileName()
if self.currentInputFileName() != self._currentfile:
self._currentfile=self.currentInputFileName()
self.close()
self._fileidx+=1
self.open('wb')
if self._taxonomy is not None: if self._taxonomy is not None:
if 'taxid' not in sequence and hasattr(sequence, 'extractTaxon'): if 'taxid' not in sequence and hasattr(sequence, 'extractTaxon'):
sequence.extractTaxon() sequence.extractTaxon()
...@@ -149,11 +174,10 @@ class EcoPCRDBSequenceWriter(object): ...@@ -149,11 +174,10 @@ class EcoPCRDBSequenceWriter(object):
if self._annotation is not None: if self._annotation is not None:
self._annotation.put(sequence, self._sequenceCount) self._annotation.put(sequence, self._sequenceCount)
self._sequenceCount+=1 self._sequenceCount+=1
self._sequenceFileCount+=1
def __del__(self): def __del__(self):
self._file.seek(0,0) self.close()
self._file.write(struct.pack('> I',self._sequenceCount))
self._file.close()
...@@ -93,7 +93,7 @@ def allEntryIterator(files,entryIterator,with_progress=False,histo_step=102,opti ...@@ -93,7 +93,7 @@ def allEntryIterator(files,entryIterator,with_progress=False,histo_step=102,opti
else: else:
if entryIterator == EcoPCRDBSequenceIterator and options is not None: if entryIterator == EcoPCRDBSequenceIterator and options is not None:
if options.ecodb==f: if hasattr(options,'ecodb') and options.ecodb==f:
iterator = entryIterator(f,options.taxonomy) iterator = entryIterator(f,options.taxonomy)
else: else:
iterator = entryIterator(f) iterator = entryIterator(f)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment