Package obitools :: Package seqdb :: Package embl :: Module parser
[hide private]
[frames | no frames]

Source Code for Module obitools.seqdb.embl.parser

 1  import re 
 2  import sys 
 3   
 4  from obitools.seqdb import embl 
 5  from obitools.seqdb import nucEntryIterator 
 6   
 7  _featureMatcher = re.compile('(^FT  .*\n)+', re.M) 
 8  _cleanFT       = re.compile('^FT',re.M) 
 9   
10  _headerMatcher = re.compile('^ID.+(?=\nFH  )', re.DOTALL) 
11  _seqMatcher    = re.compile('(^    ).+(?=//\n)', re.DOTALL + re.M) 
12  _cleanSeq      = re.compile('[ \n0-9]+') 
13  _acMatcher     = re.compile('(?<=^AC   ).+',re.M) 
14  _deMatcher     = re.compile('(^DE   .+\n)+',re.M) 
15  _cleanDe       = re.compile('(^|\n)DE +') 
16   
def __emblparser(text):
    """Split the text of one EMBL entry into its main components.

    The entry is cut with the module-level regular expressions.

    :param text: the text of a single EMBL entry.
    :return: the tuple ``(ac, seq, de, header, ft, acs)`` where

        - ``ac`` is the first whitespace-separated token of the AC line,
        - ``seq`` is the sequence block with blanks, newlines and digits
          removed, upper-cased,
        - ``de`` is the DE lines joined with single spaces, stripped of
          surrounding whitespace and dots,
        - ``header`` is the raw text matched by ``_headerMatcher``,
        - ``ft`` is the feature table with each leading ``FT`` tag
          replaced by a space,
        - ``acs`` is the list of the remaining tokens of the AC line.
    :raise AttributeError: when one of the patterns does not match
        (``search`` returned ``None``); the offending entry is dumped on
        standard error before the exception is re-raised.
    """
    try:
        header = _headerMatcher.search(text).group()

        ft = _featureMatcher.search(text).group()
        ft = _cleanFT.sub(' ', ft)

        seq = _seqMatcher.search(text).group()
        seq = _cleanSeq.sub('', seq).upper()

        acs = _acMatcher.search(text).group()
        acs = acs.split()
        ac = acs[0]
        acs = acs[1:]

        # The description is looked up in the header only, not in the
        # whole entry.
        de = _deMatcher.search(header).group()
        de = _cleanDe.sub(' ', de).strip().strip('.')
    except AttributeError:
        # A failed search() returns None, so .group() raises
        # AttributeError.  Dump the entry to help locate the malformed
        # record, then re-raise with a bare `raise` so that the original
        # traceback is preserved.
        separator = '======================================================='
        sys.stderr.write(separator + '\n')
        sys.stderr.write(text + '\n')
        sys.stderr.write(separator + '\n')
        raise

    return (ac, seq, de, header, ft, acs)
41
def emblParser(text):
    """Build an ``embl.EmblSequence`` from the text of one EMBL entry.

    :param text: the text of a single EMBL entry.
    :return: the object created by calling ``embl.EmblSequence`` with the
        six fields returned by ``__emblparser``, passed positionally in
        the order ``(ac, seq, de, header, ft, acs)``.
    """
    fields = __emblparser(text)
    return embl.EmblSequence(*fields)
44 45
def emblIterator(file):
    """Lazily yield one parsed sequence per entry found in *file*.

    :param file: passed unchanged to ``nucEntryIterator``
        (presumably an open EMBL flat file or an iterable of lines --
        confirm against ``obitools.seqdb.nucEntryIterator``).
    :return: a generator producing ``emblParser(entry)`` for each item
        yielded by ``nucEntryIterator(file)``.
    """
    for entry in nucEntryIterator(file):
        yield emblParser(entry)
49