"""
G{packagetree format}
"""
import re

from obitools.utils import universalOpen
7
def genericEntryIteratorGenerator(startEntry=None, endEntry=None, head=False, tail=False, strip=False):
    '''
    Transform a text line iterator into an entry oriented iterator.

    The function built here is useful to implement the first stage
    of flat file parsing: it groups consecutive lines into entries.

    @param startEntry: a regular pattern matching the beginning of
                       an entry
    @type startEntry: C{str} or None
    @param endEntry: a regular pattern matching the end of
                     an entry. When it is None, an entry ends just
                     before the next line matching C{startEntry}
    @type endEntry: C{str} or None
    @param head: indicate if an header is present before
                 the first entry (as in many original genbank
                 files). When true, the first line always opens an
                 entry, even if it does not match C{startEntry}
    @type head: C{bool}
    @param tail: indicate if some extra informations are present
                 after the last entry. When true (or when C{endEntry}
                 is None) the lines left over at end of input are
                 yielded as a last entry
    @type tail: C{bool}
    @param strip: when true, leading and trailing whitespace is removed
                  from each entry before it is yielded
    @type strip: C{bool}

    @return: a function taking one argument (passed to C{universalOpen})
             and returning an iterator on entries in text format. If
             both patterns are None the function returns the result of
             C{universalOpen} unchanged, so C{head}, C{tail} and
             C{strip} have no effect
    @rtype: a callable returning an iterator on C{str}
    '''

    # Compile once, up front; the helpers below read these names
    # through the closure every time they are called.
    if startEntry is not None:
        startEntry = re.compile(startEntry)
    if endEntry is not None:
        endEntry = re.compile(endEntry)

    def isBeginning(line):
        # Without a start pattern every line may open an entry.
        return startEntry is None or startEntry.match(line) is not None

    def isEnding(line):
        # With an explicit end pattern only that pattern closes an entry;
        # otherwise the next start line closes the current one.
        if endEntry is not None:
            return endEntry.match(line) is not None
        return startEntry is not None and startEntry.match(line) is not None

    def transparentIteratorEntry(file):
        file = universalOpen(file)
        return file

    def genericEntryIterator(file):
        file = universalOpen(file)
        entry = []

        try:
            # The first read is inside the try block so that an empty
            # input simply ends the iteration: a StopIteration escaping
            # a generator body is turned into RuntimeError by PEP 479.
            line = next(file)
            started = head or isBeginning(line)

            while True:
                # Skip lines until one opens an entry.
                while not started:
                    line = next(file)
                    started = isBeginning(line)

                if endEntry is None:
                    # The opening line would also satisfy isEnding(), so
                    # store it and move on before looking for the end.
                    entry.append(line)
                    line = next(file)

                while started:
                    if isEnding(line):
                        if endEntry is not None:
                            # An explicit end line belongs to the entry.
                            entry.append(line)
                        e = ''.join(entry)
                        entry = []
                        if strip:
                            e = e.strip()
                        yield e
                        started = False
                        if endEntry is not None:
                            line = next(file)
                    else:
                        entry.append(line)
                        line = next(file)

                started = isBeginning(line)

        except StopIteration:
            # End of input: flush the pending lines when they form a
            # legitimate last entry.
            if entry and (endEntry is None or tail):
                e = ''.join(entry)
                if strip:
                    e = e.strip()
                yield e

    if startEntry is None and endEntry is None:
        return transparentIteratorEntry

    return genericEntryIterator
97
98
100
# NOTE(review): this is a method (it takes self); the enclosing class
# statement is not present in this excerpt.
def __init__(self,
             startEntry=None,
             endEntry=None,
             head=False,
             tail=False,
             strip=False,
             **parseAction):
    """
    Build the entry iterator and register the initial parse actions.

    @param startEntry: a regular pattern matching the beginning of
                       an entry
    @type startEntry: C{str} or None
    @param endEntry: a regular pattern matching the end of
                     an entry
    @type endEntry: C{str} or None
    @param head: indicate if an header is present before
                 the first entry (as in many original genbank
                 files)
    @type head: C{bool}
    @param tail: indicate if some extra informations are present
                 after the last entry.
    @type tail: C{bool}
    @param strip: forwarded unchanged to
                  C{genericEntryIteratorGenerator}
    @type strip: C{bool}

    @param parseAction: parse actions to register, one per keyword.
                        The keyword is the action name; its value is
                        unpacked as the positional arguments following
                        the name in C{addParseAction}, so it must be a
                        sequence such as a tuple
    """
    self.flatiterator = genericEntryIteratorGenerator(startEntry,
                                                      endEntry,
                                                      head,
                                                      tail,
                                                      strip)

    # Maps an action name to the callable extracting its data.
    self.action = {}

    for actionName, actionArgs in parseAction.items():
        self.addParseAction(actionName, *actionArgs)
136
# NOTE(review): this is a method (it takes self); the enclosing class
# statement is not present in this excerpt.
def addParseAction(self, name, dataMatcher, dataCleaner=None, cleanSub=''):
    '''
    Add a parse action to the generic parser. A parse action
    allows to extract one information from an entry. A parse
    action is defined by a name and a method to extract this
    information from the full text entry.

    A parse action can be defined following two ways.

        - via regular expression patterns

        - via dedicated function.

    In the first case, you have to indicate at least the
    dataMatcher regular pattern. This pattern should match exactly
    the data part you want to retrieve. If cleaning of extra
    characters is needed, the second pattern dataCleaner can be
    used to specify these characters.

    In the second case you must provide a callable object (function)
    that extracts and cleans data from the text entry. This function
    should return an array containing all data retrieved even if
    no data or only one data is retrieved. In that case dataCleaner
    and cleanSub are ignored.

    @summary: Add a parse action to the generic parser.

    @param name: name of the data extracted
    @type name: C{str}
    @param dataMatcher: a regular pattern matching the data
                        or a callable object parsing the
                        entry and returning a list of matched data
    @type dataMatcher: C{str} or C{SRE_Pattern} instance or a callable
                       object
    @param dataCleaner: a regular pattern matching part of the data
                        to suppress.
    @type dataCleaner: C{str} or C{SRE_Pattern} instance or C{None}
    @param cleanSub: string used to replace dataCleaner matches.
                     Default is an empty string
    @type cleanSub: C{str}

    '''
    if callable(dataMatcher):
        # A user supplied extractor is stored as is.
        self.action[name] = dataMatcher
        return

    # Patterns given as text are compiled; already compiled patterns
    # (and a None cleaner) are passed through untouched.
    if isinstance(dataMatcher, str):
        dataMatcher = re.compile(dataMatcher)
    if isinstance(dataCleaner, str):
        dataCleaner = re.compile(dataCleaner)

    self.action[name] = self._buildREParser(dataMatcher,
                                            dataCleaner,
                                            cleanSub)
188
190 def parser(data):
191 x = dataMatcher.findall(data)
192 if dataCleaner is not None:
193 x = [dataCleaner.sub(cleanSub,y) for y in x]
194 return x
195 return parser
196
# NOTE(review): the `def` line of this method is missing from this excerpt;
# the body reads `self` and `file` and uses `yield`, so it is presumably a
# generator method taking (self, file) - its name must be restored from the
# upstream source.
#
# For every entry produced by self.flatiterator(file), yields a dict holding
# the entry under the 'fullentry' key plus, for each registered action name,
# the value returned by that action when called on the entry.
198 for e in self.flatiterator(file):
199 pe = {'fullentry':e}
200 for k in self.action:
201 pe[k]=self.action[k](e)
202 yield pe
203