Package obitools :: Package format :: Package genericparser
[hide private]
[frames | no frames]

Source Code for Package obitools.format.genericparser

  1  """ 
  2  G{packagetree format} 
  3  """ 
  4  import re 
  5   
  6  from obitools.utils import universalOpen 
  7   
def genericEntryIteratorGenerator(startEntry=None, endEntry=None,
                                  head=False, tail=False, strip=False):
    '''
    Transform a text line iterator into an entry oriented iterator.

    The returned iterator factory is useful to implement the first
    stage of flat file parsing: it groups consecutive lines into
    entries delimited by regular patterns.

    If both C{startEntry} and C{endEntry} are C{None}, the returned
    function only opens its argument with C{universalOpen} and returns
    the result unchanged; C{head}, C{tail} and C{strip} are then ignored.

    @param startEntry: a regular pattern matching the beginning of
                       an entry
    @type startEntry: C{str} or None
    @param endEntry: a regular pattern matching the end of
                     an entry. When given, the matching line is kept
                     as the last line of the entry. When C{None}, the
                     next line matching C{startEntry} closes the
                     current entry and opens the following one.
    @type endEntry: C{str} or None
    @param head: indicate if a header is present before
                 the first entry (as in many original genbank
                 files). When true, the first line read is considered
                 as being inside an entry even if it does not match
                 C{startEntry}, so the leading lines are accumulated
                 into the first entry that is yielded.
    @type head: C{bool}
    @param tail: indicate if some extra information is present
                 after the last entry. When true, lines left pending
                 at the end of the input are yielded as a last entry
                 even if no C{endEntry} line closed them.
    @type tail: C{bool}
    @param strip: if true, leading and trailing whitespace is removed
                  from each entry before it is yielded.
    @type strip: C{bool}

    @return: a function taking a file (anything accepted by
             C{universalOpen}) and returning an iterator on entries
             in text format
    @rtype: a callable returning an iterator on C{str}
    '''

    def isBeginning(line):
        # Without a start pattern every line may open an entry.
        return startEntry is None or startEntry.match(line) is not None

    def isEnding(line):
        # Either an explicit end marker, or (when no end pattern is
        # given) the first line of the following entry.
        if endEntry is not None:
            return endEntry.match(line) is not None
        return startEntry is not None and startEntry.match(line) is not None

    def renderEntry(lines):
        # Assemble the accumulated lines into the text of one entry.
        text = ''.join(lines)
        if strip:
            text = text.strip()
        return text

    def transparentIteratorEntry(file):
        # No delimiter at all: hand back the opened line iterator as is.
        return universalOpen(file)

    def genericEntryIterator(file):
        lines = universalOpen(file)
        entry = []

        try:
            # The first read is done inside the ``try`` so that an empty
            # input ends the generator cleanly instead of leaking a
            # StopIteration (a RuntimeError under PEP 479).
            # The built-in next() works on Python 2.6+ and Python 3,
            # unlike the Python 2 only ``.next()`` method.
            line = next(lines)
            started = head or isBeginning(line)

            while True:
                # Skip everything located before the next entry.
                while not started:
                    line = next(lines)
                    started = isBeginning(line)

                if endEntry is None:
                    # The current line opened the entry; it must be stored
                    # before looking for the next start line, otherwise it
                    # would immediately be seen as an ending.
                    entry.append(line)
                    line = next(lines)

                while started:
                    if isEnding(line):
                        if endEntry is not None:
                            # An explicit end marker belongs to the entry.
                            entry.append(line)
                        text = renderEntry(entry)
                        entry = []
                        yield text
                        started = False
                        if endEntry is not None:
                            line = next(lines)
                        # Otherwise ``line`` is the start of the next
                        # entry and must not be consumed here.
                    else:
                        entry.append(line)
                        line = next(lines)

                started = isBeginning(line)

        except StopIteration:
            # End of input: emit the pending lines when they form a
            # legitimate last entry (no end marker expected) or when the
            # caller asked for trailing data.
            if entry and (endEntry is None or tail):
                yield renderEntry(entry)

    if startEntry is not None:
        startEntry = re.compile(startEntry)
    if endEntry is not None:
        endEntry = re.compile(endEntry)

    if startEntry is None and endEntry is None:
        return transparentIteratorEntry

    return genericEntryIterator

class GenericParser(object):
    """
    Flat file parser splitting a file into text entries and applying
    a set of named parse actions to each of them.

    Calling an instance on a file yields one C{dict} per entry. Each
    dictionary holds the entry text under the key C{'fullentry'} and
    the result of every parse action under the name of that action.
    """

    # Types considered as textual regular patterns to be compiled.
    # ``type(u'')`` is ``unicode`` on Python 2 and ``str`` on Python 3.
    _patternTextTypes = (str, bytes, type(u''))

    def __init__(self,
                 startEntry=None,
                 endEntry=None,
                 head=False,
                 tail=False,
                 strip=False,
                 **parseAction):
        """
        @param startEntry: a regular pattern matching the beginning of
                           an entry
        @type startEntry: C{str} or None
        @param endEntry: a regular pattern matching the end of
                         an entry
        @type endEntry: C{str} or None
        @param head: indicate if a header is present before
                     the first entry (as in many original genbank
                     files)
        @type head: C{bool}
        @param tail: indicate if some extra information is present
                     after the last entry.
        @type tail: C{bool}
        @param strip: forwarded to L{genericEntryIteratorGenerator}
                      together with the four parameters above.
        @type strip: C{bool}

        @param parseAction: parse actions to register, one per keyword.
                            The keyword is the action name; the value is
                            either a sequence of the positional arguments
                            of L{addParseAction} following C{name}
                            (C{(dataMatcher,)},
                            C{(dataMatcher, dataCleaner)} or
                            C{(dataMatcher, dataCleaner, cleanSub)}), or
                            directly a single C{dataMatcher} (pattern
                            string, compiled pattern or callable).
        """
        self.flatiterator = genericEntryIteratorGenerator(startEntry,
                                                          endEntry,
                                                          head,
                                                          tail,
                                                          strip)

        self.action = {}

        for name in parseAction:
            spec = parseAction[name]
            # A bare matcher must not be star-unpacked: a string would be
            # split into characters, a callable or a compiled pattern is
            # not iterable at all.
            if self._isBareMatcher(spec):
                spec = (spec,)
            self.addParseAction(name, *spec)

    def _isBareMatcher(self, spec):
        # True when ``spec`` is a matcher by itself, not an argument sequence.
        return (callable(spec)
                or isinstance(spec, self._patternTextTypes)
                or hasattr(spec, 'findall'))

    def _asPattern(self, pattern):
        # Compile textual patterns; leave None and compiled patterns untouched.
        if isinstance(pattern, self._patternTextTypes):
            return re.compile(pattern)
        return pattern

    def addParseAction(self, name, dataMatcher, dataCleaner=None, cleanSub=''):
        '''
        Add a parse action to the generic parser. A parse action
        allows to extract one piece of information from an entry. A
        parse action is defined by a name and a method to extract this
        information from the full text entry.

        A parse action can be defined in two ways:

            - via regular expression patterns

            - via a dedicated function.

        In the first case, you have to indicate at least the
        dataMatcher regular pattern. This pattern should match exactly
        the data part you want to retrieve. If cleaning of extra
        characters is needed, the second pattern dataCleaner can be
        used to specify these characters.

        In the second case you must provide a callable object (function)
        that extracts and cleans data from the text entry. This function
        should return an array containing all data retrieved even if
        no data or only one data is retrieved. C{dataCleaner} and
        C{cleanSub} are ignored in this case.

        Registering an action under an already used name replaces the
        previous one.

        @summary: Add a parse action to the generic parser.

        @param name: name of the data extracted
        @type name: C{str}
        @param dataMatcher: a regular pattern matching the data
                            or a callable object parsing the
                            entry and returning a list of matched data
        @type dataMatcher: C{str} or C{SRE_Pattern} instance or a callable
                           object
        @param dataCleaner: a regular pattern matching part of the data
                            to suppress.
        @type dataCleaner: C{str} or C{SRE_Pattern} instance or C{None}
        @param cleanSub: string used to replace dataCleaner matches.
                         Default is an empty string
        @type cleanSub: C{str}

        '''
        if callable(dataMatcher):
            self.action[name] = dataMatcher
            return

        # Any textual pattern is compiled, not only ``str`` instances, so
        # that unicode (Python 2) and bytes (Python 3) patterns work too.
        matcher = self._asPattern(dataMatcher)
        cleaner = self._asPattern(dataCleaner)
        self.action[name] = self._buildREParser(matcher, cleaner, cleanSub)

    def _buildREParser(self, dataMatcher, dataCleaner, cleanSub):
        '''
        Build the extraction function of a pattern based parse action.

        @param dataMatcher: compiled pattern whose C{findall} results
                            are the extracted data
        @param dataCleaner: compiled pattern whose matches are replaced
                            by C{cleanSub} in each extracted item, or
                            C{None} to keep items untouched
        @param cleanSub: replacement used with C{dataCleaner}

        @return: a function taking the entry text and returning the
                 list of extracted items
        '''
        def parser(data):
            found = dataMatcher.findall(data)
            if dataCleaner is None:
                return found
            return [dataCleaner.sub(cleanSub, item) for item in found]

        return parser

    def __call__(self, file):
        '''
        Parse a file entry by entry.

        @param file: the data source, passed as is to the entry iterator
                     built by the constructor

        @return: an iterator on C{dict}, one per entry, mapping
                 C{'fullentry'} to the entry text and each action name
                 to the value returned by that action. An action named
                 C{'fullentry'} overrides the entry text.
        '''
        for entry in self.flatiterator(file):
            parsed = {'fullentry': entry}
            for name, action in self.action.items():
                parsed[name] = action(entry)
            yield parsed
