Commit d5fbfa6e by Eric Coissac

--no commit message

parent 0bf1761d
......@@ -26,7 +26,7 @@ if __name__=='__main__':
(options, entries) = optionParser()
tax = taxonomy.Taxonomy(options.db)
seqd= sequence.ecoPCRDBSequenceIterator(options.db,taxonomy=tax)
seqd= sequence.EcoPCRDBSequenceIterator(options.db,taxonomy=tax)
ranks = set(x for x in tax.rankIterator())
......
#!/usr/local/bin/python
import fileinput
import re
import sys
......
......@@ -21,11 +21,14 @@ class SsearchParser(object):
for line in props:
subject,tab = line.split('\t')
tab=tab.split()
ac = subject.split()[0]
ssp = subject.split()
ac = ssp[0]
dbl= int(ssp[-5][:-1])
ident = float(tab[0])
matchlen = int(tab[5]) - int(tab[4]) +1
self.props.append({"ac" :ac,
"identity" :ident,
"subjectlength":dbl,
'matchlength' : matchlen})
def run(seq,database,program='fasta35',opts=''):
......
from obitools.fasta import fastaNucIterator
from obitools.cns import cnsTag
def cnsFastaIterator(file):
x = fastaNucIterator(file, cnsTag)
return x
\ No newline at end of file
from obitools import NucSequence
from obitools.ecopcr import EcoPCRDBFile
from obitools.ecopcr.taxonomy import Taxonomy
from obitools.ecopcr.annotation import EcoPCRDBAnnotationWriter
from glob import glob
import struct
import gzip
class ecoPCRDBSequenceIterator(EcoPCRDBFile):
class EcoPCRDBSequenceIterator(EcoPCRDBFile):
def __init__(self,path,taxonomy=None):
self._path = path
......@@ -33,3 +34,64 @@ class ecoPCRDBSequenceIterator(EcoPCRDBFile):
for seqfile in self._seqfilesFiles:
for seq in self.__ecoSequenceIterator(seqfile):
yield seq
class EcoPCRDBSequenceWriter(object):
def __init__(self,dbname,fileidx=1,taxonomy=None,ftid=None,type=None,definition=None):
self._taxonomy=taxonomy
self._filename="%s_%03d.sdx" % (dbname,fileidx)
self._file = open(self._filename,'wb')
self._sequenceCount=0
self._file.write(struct.pack('> I',self._sequenceCount))
if type is not None:
assert ftid is not None,"You must specify an id attribute for features"
self._annotation = EcoPCRDBAnnotationWriter(dbname, ftid, fileidx, type, definition)
else:
self._annotation = None
def _ecoSeqPacker(self,seq):
compactseq = gzip.zlib.compress(str(seq),9)
cptseqlength = len(compactseq)
delength = len(seq.definition)
totalSize = 4 + 20 + 4 + 4 + 4 + cptseqlength + delength
if self._taxonomy is None:
taxon=-1
else:
taxon=self._taxonomy.findIndex(seq['taxid'])
packed = struct.pack('> I i 20s I I I %ds %ds' % (delength,cptseqlength),
totalSize,
taxon,
seq.id,
delength,
len(seq),
cptseqlength,
seq.definition,
compactseq)
assert len(packed) == totalSize+4, "error in sequence packing"
return packed
def put(self,sequence):
if self._taxonomy is not None:
self.extractTaxon()
self._file.write(self._ecoSeqPacker(sequence))
if self._annotation is not None:
self._annotation.put(sequence, self._sequenceCount)
self._sequenceCount+=1
def __del__(self):
self._file.seek(0,0)
self._file.write(struct.pack('> I',self._sequenceCount))
self._file.close()
......@@ -97,6 +97,9 @@ class Taxonomy(EcoPCRDBFile):
return self._ranks.index(rank)
except ValueError:
return None
def findIndex(self,taxid):
return self._index[taxid]
#####
......
......@@ -21,6 +21,7 @@ class Fast(object):
self._kup = kup
self._hash= hash
self._seq = seq
def __call__(self,seq):
'''
......@@ -35,7 +36,7 @@ class Fast(object):
@rtype: a int tuple (smax,pmax)
'''
histo={}
seq = str(seq)
seq = str(seq).upper()
hash= self._hash
kup = self._kup
......@@ -47,5 +48,9 @@ class Fast(object):
smax = max(histo.values())
pmax = [x for x in histo if histo[x]==smax]
return smax,pmax
def __len__(self):
return len(self._seq)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment