1 import re
2 import sys
3
4 from obitools.seqdb import embl
5 from obitools.seqdb import nucEntryIterator
6
# Pre-compiled patterns used to cut an entry's text into its sections.
# NOTE(review): this listing may have collapsed runs of spaces inside the
# literals below -- confirm the exact spacing against the upstream file.

# One or more consecutive lines that start with "FT ".
_featureMatcher = re.compile('(^FT .*\n)+', re.MULTILINE)
# The "FT" tag at the beginning of any line.
_cleanFT = re.compile('^FT', re.MULTILINE)

# From "ID" at the very start of the text up to (excluding) the last newline
# that is immediately followed by "FH ".
_headerMatcher = re.compile('^ID.+(?=\nFH )', re.DOTALL)
# From the first line that begins with a space up to (excluding) the last
# "//" + newline; DOTALL lets the match run across lines.
_seqMatcher = re.compile('(^ ).+(?=//\n)', re.DOTALL | re.MULTILINE)
# Runs of spaces, newlines and digits.
_cleanSeq = re.compile('[ \n0-9]+')
# Everything following "AC " on a line that starts with it.
_acMatcher = re.compile('(?<=^AC ).+', re.MULTILINE)
# One or more consecutive non-empty, newline-terminated "DE " lines.
_deMatcher = re.compile('(^DE .+\n)+', re.MULTILINE)
# A "DE" tag plus the spaces after it, at the start of the string or right
# after a newline (the newline is part of the match).
_cleanDe = re.compile('(^|\n)DE +')
16
# NOTE(review): the ``def`` line was missing from the listing this block was
# recovered from. The name is taken from the ``emblParser(e)`` call later in
# the file and the parameter from the body's use of ``text`` -- confirm.
def emblParser(text):
    """Cut the text of one sequence entry into its main parts.

    Parameters:
        text: the complete text of a single entry (presumably one EMBL
            flat-file record -- confirm against the caller).

    Returns:
        A tuple ``(ac, seq, de, header, ft, acs)`` where

        * ``ac``     -- first whitespace-separated token of the first
                        ``AC`` line (not cleaned any further);
        * ``seq``    -- the sequence section with spaces, newlines and
                        digits removed, upper-cased;
        * ``de``     -- the ``DE`` lines found in ``header`` merged into one
                        string, stripped of surrounding whitespace and then
                        of leading/trailing periods;
        * ``header`` -- the part of ``text`` matched by ``_headerMatcher``;
        * ``ft``     -- the first run of ``FT`` lines, each leading ``FT``
                        replaced by a single space;
        * ``acs``    -- the remaining tokens of the ``AC`` line, as a list.

    Raises:
        AttributeError: when one of the sections cannot be found (the
            corresponding ``search`` returned ``None``).  The offending
            entry is written to ``sys.stderr`` between two separator lines
            before the exception is re-raised.
    """
    try:
        header = _headerMatcher.search(text).group()

        raw_features = _featureMatcher.search(text).group()
        ft = _cleanFT.sub(' ', raw_features)

        raw_sequence = _seqMatcher.search(text).group()
        seq = _cleanSeq.sub('', raw_sequence).upper()

        accessions = _acMatcher.search(text).group().split()
        ac = accessions[0]
        acs = accessions[1:]

        # The description is looked up in the header only, not in the
        # whole entry.
        raw_description = _deMatcher.search(header).group()
        de = _cleanDe.sub(' ', raw_description).strip().strip('.')
    except AttributeError:
        # Dump the entry that could not be parsed so it can be inspected.
        rule = '======================================================='
        sys.stderr.write('%s\n%s\n%s\n' % (rule, text, rule))
        # A bare ``raise`` re-raises the active exception and keeps its
        # original traceback.
        raise

    return (ac, seq, de, header, ft, acs)
41
44
45
# NOTE(review): the ``def`` line was missing from the listing this block was
# recovered from. The parameter name comes from the body's use of ``file``;
# the function name ``emblIterator`` is reconstructed -- confirm.
def emblIterator(file):
    """Yield the parsed form of every entry read from *file*.

    Each item produced by ``nucEntryIterator(file)`` is handed to
    ``emblParser`` and the result is yielded, lazily and in order.
    """
    for entry in nucEntryIterator(file):
        yield emblParser(entry)
49