Package obitools :: Package obo :: Module parser
[hide private]
[frames] | no frames]

Source Code for Module obitools.obo.parser

  1  from obitools.utils import skipWhiteLineIterator,multiLineWrapper 
  2  from obitools.utils import universalOpen 
  3  from obitools.format.genericparser import genericEntryIteratorGenerator 
  4  from logging import debug,warning 
  5   
  6  import re 
  7   
  8   
  9  ################################################################################# 
 10  ##                           Stanza preparation area                           ## 
 11  ################################################################################# 
 12   
 13   
class FileFormatError(Exception):
    '''
    An error derived from the class Exception.

    The class adds no attribute and no behaviour of its own : it only
    provides a distinct exception type to raise and to catch.
    '''

## Entry iterator used by stanzaIterator. It is built by
## genericEntryIteratorGenerator with an end-of-entry pattern matching empty
## or space-only lines.
## NOTE(review): strip=True presumably strips each returned entry -- confirm
## in obitools.format.genericparser.
_oboEntryIterator = genericEntryIteratorGenerator(
    endEntry='^ *$',
    strip=True
)

def stanzaIterator(inputfile):
    '''
    Iterator of stanza. The stanza are the basic units of OBO files.

    @param inputfile: the OBO input, handed to C{universalOpen}.
    @type inputfile: a stream of strings

    @return: a stream of stanza
    @rtype: a stream of aggregated strings

    @note: The input goes through three stages : C{universalOpen}, then
           C{multiLineWrapper}, then the module level C{_oboEntryIterator}
           which aggregates the lines into stanza.
    '''
    return _oboEntryIterator(multiLineWrapper(universalOpen(inputfile)))



#################################################################################
##                      Trailing Modifiers treatment area                      ##
#################################################################################


class TrailingModifier(dict):
    '''
    A class object which inherits from the class dict. Trailing modifiers can be found
    at the end of TaggedValue objects when they exist.

    The dictionary maps each modifier name to its value, both as strings.
    It stays empty when the parsed string holds no '{...}' block.
    '''

    ## A '{...}' block located at the beginning of the string or after a space,
    ## and followed (after optional spaces) by a ' !' comment or by the end of
    ## the string. Group 1 is the content between the braces, braces excluded.
    _match_brace = re.compile(r'(?:^| )\{([^}]*)\} *(?: !|$)')

    def __init__(self, string):
        '''
        Constructor of the class TrailingModifier.

        @param string: a string which may contain a trailing modifiers block,
                       e.g. C{'{cardinality=1, namespace=x}'}.
        @type string: string

        @raise ValueError: when a non empty item of the block has no '='.
        '''
        dict.__init__(self)

        ## search for trailing modifiers signals
        found = TrailingModifier._match_brace.search(string)
        if found is None:
            return

        ## only the content of the braces is used, so that the closing brace
        ## never ends up inside the last value.
        for item in found.group(1).split(','):
            item = item.strip()
            ## an empty block ('{}') or a doubled comma yields empty items
            if not item:
                continue
            name, separator, value = item.partition('=')
            if not separator:
                raise ValueError("malformed trailing modifier : %r" % item)
            self[name] = value

def trailingModifierFactory(string):
    '''
    Dispatcher of trailing modifiers.

    @param string: a string from a TaggedValue object with a trailing modifiers signal.
    @type string: string

    @return: a C{TrailingModifier} instance when it is not empty, None otherwise.

    @note: The dispatcher is currently very simple : only one class is
           instantiated, and an empty result is turned into None.
    '''
    return TrailingModifier(string) or None


#################################################################################
##                         TaggedValue treatment area                          ##
#################################################################################


class TaggedValue(object):
    '''
    A couple 'tag:value' of an OBOEntry.

    Every instance carries four attributes : C{tag}, C{value}, C{__doc__}
    (the comment found after a '!', or None) and C{trailing_modifiers}
    (whatever C{trailingModifierFactory} returns, or None).
    '''

    ## the value : everything up to the first ' !' or ' {' or up to the end of
    ## the string. Double quoted strings are consumed as a whole.
    _match_value = re.compile(r'(("(\\"|[^"])*")|(\\"|[^"]))*?( !| {|$)')
    ## a comment starts with a '!' located at the beginning or after a space.
    _split_comment = re.compile('^!| !')
    ## content of a double quoted string, quotes excluded.
    _match_quotedString = re.compile(r'(?<=")(\\"|[^"])*(?=")')
    ## first list between brackets, brackets included.
    _match_bracket = re.compile(r'\[[^\]]*\]')

    def __init__(self, line):
        '''
        Constructor of the class TaggedValue.

        @param line: a line of an OBOEntry composed of a tag and a value.
        @type line: string

        @raise ValueError: when the line contains no ':'.

        @note: The constructor separates the tag from the right term. 'value' is
               extracted from the right term using a regular expression. When the
               value is followed by a ' !' or a ' {', the rest of the string is
               split into a comment (after the '!') and the trailing modifiers.
               The deprecated tag 'use_term' is renamed 'consider' and a
               C{DeprecationWarning} is issued through the C{warnings} module, so
               that the parsing goes on.
        '''
        import warnings

        debug("tagValueParser : %s" % line)

        ## by default :
        trailing_modifiers = None
        comment = None

        ## the tag is saved. 'right' is composed of the value, the comment and the trailing modifiers
        tag, right = line.split(':', 1)

        ## the value is saved
        matched = TaggedValue._match_value.search(right)
        value = matched.group(0)
        debug("Extracted value : %s" % value)

        ## if there is a value AND a sign of a comment or trailing modifiers
        if value and value[-1] in '!{':
            ## 'extra' starts on the '!' or '{' sign itself. The end of the
            ## match is used instead of the length of the value because the
            ## match does not always start at the beginning of 'right'.
            extra = right[matched.end() - 1:].strip()
            ## a comment is extracted and saved if it exists
            pieces = TaggedValue._split_comment.split(extra, 1)
            if len(pieces) == 2:
                comment = pieces[1].strip()
            ## trailing modifiers are extracted from what precedes the comment
            trailing_modifiers = trailingModifierFactory(pieces[0])
            ## the value is cleaned of the comment or trailing modifiers sign
            value = value[0:-1]

        if tag == 'use_term':
            ## warn instead of raising : an exception would abort the parsing
            ## and make the renaming below useless.
            warnings.warn("use_term is a deprecated tag, you should instead use consider",
                          DeprecationWarning)
            tag = 'consider'

        ## recording zone
        self.value = value.strip()
        self.tag = tag
        self.__doc__ = comment
        self.trailing_modifiers = trailing_modifiers

    def __str__(self):
        return str(self.value)

    def __repr__(self):
        return '''"""%s"""''' % str(self)


class NameValue(TaggedValue):
    '''
    A couple 'name:value' inherited from the class TaggedValue. Used to manage name tags.
    '''

    def __init__(self, line):
        '''
        Constructor of the class NameValue.

        @param line: a line of an OBOEntry composed of a tag and a value.
        @type line: string

        @note: The TaggedValue constructor is not called. Everything found
               after the first ':' is stripped and kept as the value, the tag
               is always 'name', there is neither comment nor trailing modifiers.
        '''
        _tag, payload = line.split(':', 1)

        ## recording zone
        self.tag = 'name'
        self.value = payload.strip()
        self.trailing_modifiers = None
        self.__doc__ = None



class DefValue(TaggedValue):
    '''
    A couple 'def:value' inherited from the class TaggedValue. Used to manage def tags.
    '''

    def __init__(self, line):
        '''
        Constructor of the class DefValue.

        @param line: a line of an OBOEntry composed of a tag named 'def' and a value.
        @type line: string

        @raise FileFormatError: when the value holds no quoted definition.

        @note: The constructor calls the TaggedValue constructor. A regular expression
               is used to extract the 'definition' (the content of the first quoted
               string) from TaggedValue.value. Another one is used to extract 'dbxrefs'
               (the list between brackets) from the value without the definition, since
               the definition itself may contain brackets. The definition is saved as
               the new value of the DefValue, dbxrefs are saved as a list of Xref.
               dbxrefs is an empty list when the value holds no bracket.
        '''

        ## use of the TaggedValue constructor
        TaggedValue.__init__(self, line)

        ## definition, which is quoted, is extracted from the standard value of a TaggedValue.
        quoted = TaggedValue._match_quotedString.search(self.value)
        if quoted is None:
            raise FileFormatError("def tag without a quoted definition : %s" % line)
        definition = quoted.group(0)

        ## the standard value is cleaned of the definition. Only the matched
        ## span is removed : replacing the text everywhere would also damage
        ## the dbxrefs which contain the same characters.
        ## NOTE(review): the listing also shows cleanvalue.replace(' ',' '),
        ## a no-op as displayed, which is therefore not reproduced here.
        cleanvalue = self.value[:quoted.start()] + self.value[quoted.end():]

        ## dbxrefs are searched into the rest of the standard value.
        bracket = TaggedValue._match_bracket.search(cleanvalue)
        if bracket is None:
            dbxrefs = []
        else:
            dbxrefs = xrefFactory(bracket.group(0))

        ## recording zone
        self.tag = 'def'
        ## the value of a DefValue is not the standard value but the definition.
        self.value = definition
        self.dbxrefs = dbxrefs


class SynonymValue(TaggedValue):
    '''
    A couple 'synonym:value' inherited from the class TaggedValue. Used to manage
    synonym tags, exact_synonym tags, broad_synonym tags and narrow_synonym tags.
    '''

    ## the attributes : what stands between the emptied quotes and the bracket
    _match_scope = re.compile(r'(?<="")[^\[]*(?=\[|$)')

    ## the scopes recognised among the attributes
    _scopes = frozenset(('RELATED', 'EXACT', 'BROAD', 'NARROW'))

    ## deprecated tags and the scope each of them stands for
    _deprecated_tags = {'exact_synonym': 'EXACT',
                        'broad_synonym': 'BROAD',
                        'narrow_synonym': 'NARROW'}

    def __init__(self, line):
        '''
        Constructor of the class SynonymValue.

        @param line: a line of an OBOEntry composed of a tag named 'synonym' or
                     'exact_synonym' or 'broad_synonym' or 'narrow_synonym' and a value.
        @type line: string

        @raise FileFormatError: when the value holds no quoted definition.
        @raise AssertionError: when several scopes are given or when the tag
                               is not a managed synonym tag.

        @note: SynonymValue is composed of a tag, a value, a scope, a list of types and
               dbxrefs.
               The constructor calls the TaggedValue constructor. The 'definition' (content
               of the first quoted string) becomes the value of the SynonymValue.
               The 'attributes' found between the definition and the bracket may contain an
               optional scope and optional types. The scope is extracted from the attributes
               or set by default to 'RELATED'. The types are the rest of the attributes,
               saved as a sorted list.
               For the deprecated tags 'exact_synonym', 'broad_synonym' and 'narrow_synonym',
               a C{DeprecationWarning} is issued through the C{warnings} module, the tag is
               set to 'synonym' and the scope respectively to 'EXACT', 'BROAD' and 'NARROW'.
               The tag 'systematic_synonym' is silently turned into a 'synonym' of scope
               'SYSTEMATIC'.
               dbxrefs (the list between brackets) are saved as a list of Xref, empty when
               the value holds no bracket.
        '''
        import warnings

        ## use of the TaggedValue constructor
        TaggedValue.__init__(self, line)

        ## definition, which is quoted, is extracted from the standard value of a TaggedValue.
        quoted = TaggedValue._match_quotedString.search(self.value)
        if quoted is None:
            raise FileFormatError("synonym tag without a quoted text : %s" % line)
        definition = quoted.group(0)

        ## the standard value is cleaned of the definition. Only the matched
        ## span is removed : replacing the text everywhere would also damage
        ## the attributes and the dbxrefs which contain the same characters.
        ## NOTE(review): the listing also shows cleanvalue.replace(' ',' '),
        ## a no-op as displayed, which is therefore not reproduced here.
        cleanvalue = self.value[:quoted.start()] + self.value[quoted.end():]

        ## attributes are searched into the rest of the standard value, then
        ## stripped, split on white spaces and gathered into a set. The search
        ## cannot fail : the emptied quotes are always present in cleanvalue.
        attributes = set(SynonymValue._match_scope.search(cleanvalue).group(0).strip().split())

        ## the scopes are the junction between the attributes and a set of specific terms.
        scopes = attributes & SynonymValue._scopes

        ## the types are the rest of the attributes.
        types = attributes - scopes

        ## this is a constraint of the OBO format
        assert len(scopes) < 2, "Only one synonym scope allowed"

        ## the scope of the SynonymValue is into scopes or set by default to RELATED
        if scopes:
            scope = scopes.pop()
        else:
            scope = 'RELATED'

        ## Specific rules are defined for the deprecated tags. A warning is
        ## issued instead of an exception, otherwise the remapping of the tag
        ## and of the scope could never be reached.
        if self.tag in SynonymValue._deprecated_tags:
            warnings.warn('%s is a deprecated tag use instead synonym tag' % self.tag,
                          DeprecationWarning)
            scope = SynonymValue._deprecated_tags[self.tag]
            self.tag = 'synonym'
        elif self.tag == 'systematic_synonym':
            self.tag = 'synonym'
            scope = 'SYSTEMATIC'

        ## this is our own constraint. Only synonym tags are saved by this parser.
        assert self.tag == 'synonym', "%s synonym type is not managed" % self.tag

        ## dbxrefs are searched into the rest of the standard value.
        bracket = TaggedValue._match_bracket.search(cleanvalue)
        if bracket is None:
            dbxrefs = []
        else:
            dbxrefs = xrefFactory(bracket.group(0))

        ## recording zone
        ## the value of a SynonymValue is not the standard value but the definition.
        self.value = definition
        self.dbxrefs = dbxrefs
        self.scope = scope
        ## sorted, so that two synonyms with the same types compare equal
        ## whatever the iteration order of the set.
        self.types = sorted(types)

    def __eq__(self, b):
        ## an object without the expected attributes is simply not comparable
        try:
            return ((self.value == b.value) and (self.dbxrefs == b.dbxrefs)
                    and (self.scope == b.scope) and (self.types == b.types)
                    and (self.__doc__ == b.__doc__) and (self.tag == b.tag)
                    and (self.trailing_modifiers == b.trailing_modifiers))
        except AttributeError:
            return NotImplemented

    def __ne__(self, b):
        ## Python 2 does not derive '!=' from '=='
        equal = self.__eq__(b)
        if equal is NotImplemented:
            return equal
        return not equal

    def __hash__(self):
        ## the trailing modifiers are a dictionary, which is not hashable :
        ## their items are frozen first.
        modifiers = self.trailing_modifiers
        if modifiers is not None:
            modifiers = frozenset(modifiers.items())
        parts = (self.__doc__,
                 self.value,
                 frozenset(self.dbxrefs),
                 self.scope,
                 frozenset(self.types),
                 self.tag,
                 modifiers)
        return sum(hash(part) for part in parts) % (2**31)


class XrefValue(TaggedValue):
    '''
    A couple 'xref:value' inherited from the class TaggedValue. Used to manage
    xref tags.
    '''

    def __init__(self, line):
        '''
        Constructor of the class XrefValue.

        @param line: a line of an OBOEntry composed of a xref tag and a value.
        @type line: string

        @raise AssertionError: when the tag is not a managed xref tag.

        @note: The value is turned into an Xref by C{xrefFactory}. For the
               deprecated tags 'xref_analog' and 'xref_unk', a
               C{DeprecationWarning} is issued through the C{warnings} module
               and the tag is set to 'xref'.
        '''
        import warnings

        ## use of the TaggedValue constructor
        TaggedValue.__init__(self, line)

        ## use the same function as the dbxrefs
        self.value = xrefFactory(self.value)

        if self.tag in ('xref_analog', 'xref_unk'):
            ## warn instead of raising, otherwise the tag is never renamed
            warnings.warn('%s is a deprecated tag use instead xref tag' % self.tag,
                          DeprecationWarning)
            self.tag = 'xref'

        ## this is our own constraint. Only xref tags are saved by this parser.
        assert self.tag == 'xref'


class RelationshipValue(TaggedValue):
    '''
    A couple 'tag:value' inherited from the class TaggedValue whose value is
    made of an optional relationship followed by a term.
    '''

    def __init__(self, line):
        '''
        Constructor of the class RelationshipValue.

        @param line: a line of an OBOEntry composed of a tag and a value.
        @type line: string

        @note: The value given by the TaggedValue constructor is split on its
               first run of white spaces. With two parts, the first one is the
               relationship and the second one the term. With a single part,
               this part is the term and the relationship is 'is_a'.
        '''

        ## use of the TaggedValue constructor
        TaggedValue.__init__(self, line)

        pieces = self.value.split(None, 1)

        if len(pieces) == 2:
            relationship, term = pieces
        else:
            ## no relationship given : 'is_a' by default
            relationship, term = 'is_a', pieces[0]

        ## recording zone
        self.value = term
        self.relationship = relationship


388 -class NamespaceValue(TaggedValue):
389 - def __init__(self,line):
391
class RemarkValue(TaggedValue):
    '''
    A couple 'remark:value' inherited from the class TaggedValue. The value
    may itself be a couple 'label:text'.
    '''

    def __init__(self, line):
        '''
        Constructor of the class RemarkValue.

        @param line: a line of an OBOEntry composed of a remark tag and a value.
        @type line: string

        @note: When the value given by the TaggedValue constructor contains a
               ':', what precedes the first ':' is saved as the label and what
               follows as the value, both stripped. Otherwise the value is
               kept untouched and the label is None, so that a free text
               remark does not stop the parsing.
        '''
        TaggedValue.__init__(self, line)
        label, separator, text = self.value.partition(':')
        if separator:
            self.value = text.strip()
            self.label = label.strip()
        else:
            self.label = None


def taggedValueFactory(line):
    '''
    A function used to dispatch lines of an OBOEntry between the class TaggedValue
    and its inherited classes.

    @param line: a line of an OBOEntry composed of a tag and a value.
    @type line: string

    @return: a class object

    @note: The dispatch is done on the whole tag (what precedes the first ':')
           and not on the first characters of the line : a tag which only
           starts like a managed one (for instance a tag starting with 'def'
           which is not 'def') is treated as a plain TaggedValue.
    '''

    tag = line.split(':', 1)[0].strip()

    if tag in ('namespace', 'default-namespace'):
        return NamespaceValue(line)
    ## DefValue is an inherited class of TaggedValue
    elif tag == 'def':
        return DefValue(line)
    ## SynonymValue is an inherited class of TaggedValue. It manages the
    ## 'systematic_synonym' tag as well, but not 'synonymtypedef'.
    elif tag in ('synonym', 'exact_synonym', 'broad_synonym',
                 'narrow_synonym', 'systematic_synonym'):
        return SynonymValue(line)
    ## XrefValue is an inherited class of TaggedValue
    elif tag in ('xref', 'xref_analog', 'xref_unk'):
        return XrefValue(line)
    ## NameValue is an inherited class of TaggedValue
    elif tag == 'name':
        return NameValue(line)
    ## RelationshipValue is an inherited class of TaggedValue
    elif tag in ('intersection_of', 'union_of', 'relationship'):
        return RelationshipValue(line)
    elif tag == 'remark':
        return RemarkValue(line)
    ## each line is a couple : tag / value (and some more features)
    else:
        return TaggedValue(line)


#################################################################################
##                             Xref treatment area                             ##
#################################################################################



class Xref(object):
    '''
    A xref object of an OBOentry. It may be the 'dbxrefs' of SynonymValue and
    DefValue objects or the 'value' of XrefValue objects.

    Every instance carries three attributes : C{reference}, C{description}
    and C{trailing_modifiers}. Each of them may be None.
    '''

    ## the reference is separated from the rest by spaces followed by a
    ## double quote or by an opening brace.
    __splitdata__ = re.compile(' +(?=["{])')

    def __init__(self, ref):
        '''
        Constructor of the class Xref.

        @param ref: a single xref, e.g. C{'GOC:mah "a description"'}. An
                    empty string gives an Xref whose attributes are all None.
        @type ref: string
        '''
        description = None
        trailing_modifiers = None

        if ref == '':
            ## an empty list of dbxrefs gives an empty reference
            ref = None
        else:
            data = Xref.__splitdata__.split(ref, 1)
            ref = data[0]
            if len(data) > 1:
                extra = data[1]
                quoted = TaggedValue._match_quotedString.search(extra)
                if quoted is not None:
                    description = quoted.group(0)
                    ## the description is really removed (strings are not
                    ## modified in place), so that a brace written inside it
                    ## is not taken for trailing modifiers.
                    extra = extra[:quoted.start()] + extra[quoted.end():]
                trailing_modifiers = trailingModifierFactory(extra)

        self.reference = ref
        self.description = description
        self.trailing_modifiers = trailing_modifiers

    def __eq__(self, b):
        ## an object without the expected attributes is simply not comparable
        try:
            return ((self.reference == b.reference) and (self.description == b.description)
                    and (self.trailing_modifiers == b.trailing_modifiers))
        except AttributeError:
            return NotImplemented

    def __ne__(self, b):
        ## Python 2 does not derive '!=' from '=='
        equal = self.__eq__(b)
        if equal is NotImplemented:
            return equal
        return not equal

    def __hash__(self):
        ## the trailing modifiers are a dictionary, which is not hashable :
        ## their items are frozen first.
        modifiers = self.trailing_modifiers
        if modifiers is not None:
            modifiers = frozenset(modifiers.items())
        parts = (self.reference, self.description, modifiers)
        return sum(hash(part) for part in parts) % (2**31)


def _splitXrefList(content):
    '''
    Split the content of a list of dbxrefs on its commas, ignoring the commas
    which are escaped by a backslash or written inside a double quoted string.

    @param content: the text found between the brackets.
    @type content: string

    @return: the items, not stripped. An empty content gives C{['']}.
    @rtype: list of strings
    '''
    items = []
    current = []
    quoted = False
    escaped = False
    for char in content:
        if escaped:
            ## the character following a backslash is always kept as it is
            escaped = False
        elif char == '\\':
            escaped = True
        elif char == '"':
            quoted = not quoted
        elif char == ',' and not quoted:
            items.append(''.join(current))
            current = []
            continue
        current.append(char)
    items.append(''.join(current))
    return items


def xrefFactory(string):
    '''
    Dispatcher of xrefs.

    @param string: a string (between brackets) from an inherited TaggedValue object with a dbxrefs
                   signal (actually, the signal can only be found into SynonymValue and DefValue
                   objects) or a string (without brackets) from a XrefValue object.
    @type string: string

    @return: a list of Xref for a string between brackets, a single Xref otherwise.

    @note: The dispatcher treats differently the strings between brackets (from SynonymValue and
           DefValue objects) and without brackets (from XrefValue objects). An empty string gives
           a single Xref built from the empty string.
    '''

    string = string.strip()
    ## startswith is safe on an empty string, string[0] is not
    if string.startswith('['):
        return [Xref(item.strip()) for item in _splitXrefList(string[1:-1])]
    return Xref(string)


#################################################################################
##                            Stanza treatment area                            ##
#################################################################################


class OBOEntry(dict):
    '''
    An entry of an OBOFile. It can be a header (without a stanza name) or
    a stanza (with a stanza name between brackets). It inherits from the class dict.

    The dictionary maps each tag to the list of the tagged values found for
    this tag, in the order of the lines. Two attributes are set : C{isHeader}
    and C{stanzaName} (None for a header).
    '''

    ## the name written between brackets at the very beginning of a stanza
    _match_stanza_name = re.compile(r'(?<=^\[)[^\]]*(?=\])')

    def __init__(self, stanza):
        '''
        Constructor of the class OBOEntry.

        @param stanza: the lines of an entry, separated by newlines.
        @type stanza: string

        @note: Empty lines and comment lines (starting with a '!') are
               skipped : they hold no couple tag / value.
        '''
        dict.__init__(self)

        ## tests if it is the header of the OBO file (returns TRUE) or not (returns FALSE).
        ## The slice, unlike stanza[0], accepts an empty stanza.
        self.isHeader = stanza[:1] != '['
        self.stanzaName = None
        lines = stanza.split('\n')

        ## not the header : there is a [stanzaName]
        if not self.isHeader:
            self.stanzaName = lines[0].strip()[1:-1]
            lines = lines[1:]

        ## whatever the stanza is.
        for line in lines:
            stripped = line.strip()
            if not stripped or stripped.startswith('!'):
                continue
            ## each line is a couple : tag / value
            taggedvalue = taggedValueFactory(line)
            self.setdefault(taggedvalue.tag, []).append(taggedvalue)

    @staticmethod
    def parseStanzaName(stanza):
        '''
        Extract the name of a stanza.

        @param stanza: the text of an entry.
        @type stanza: string

        @return: the name found between brackets at the very beginning of
                 the text, None when the text does not start with '[name]'.
        '''
        found = OBOEntry._match_stanza_name.search(stanza)
        if found is None:
            return None
        return found.group(0)



class OBOTerm(OBOEntry):
    '''
    A stanza named 'Term'. It inherits from the class OBOEntry.

    The tags of the term are available through read only properties. A
    property returns None when its tag is absent from the term, except
    C{id} and C{name} which are always present.
    '''

    def __init__(self, stanza):
        '''
        Constructor of the class OBOTerm.

        @param stanza: the lines of a 'Term' stanza, separated by newlines.
        @type stanza: string

        @raise AssertionError: when the stanza breaks one of the constraints
                               checked below.
        '''

        ## use of the OBOEntry constructor.
        OBOEntry.__init__(self, stanza)

        assert self.stanzaName == 'Term'
        assert 'id' in self and len(self['id']) == 1, "An OBOTerm must have an id"
        assert 'name' in self and len(self['name']) == 1, "An OBOTerm must have a name"
        assert 'namespace' not in self or len(self['namespace']) == 1, "Only one namespace is allowed for an OBO term"

        assert 'def' not in self or len(self['def']) == 1, "Only one definition is allowed for an OBO term"
        assert 'comment' not in self or len(self['comment']) == 1, "Only one comment is allowed for an OBO term"

        ## the messages state what is really checked : at least two tags
        assert 'union_of' not in self or len(self['union_of']) >= 2, "At least two union_of tags are required for an OBO term with a union"
        assert 'intersection_of' not in self or len(self['intersection_of']) >= 2, "At least two intersection_of tags are required for an OBO term with an intersection"

        if self._isObsolete():
            for tag in ('is_a', 'relationship', 'inverse_of', 'disjoint_from',
                        'union_of', 'intersection_of'):
                assert tag not in self, "An obsolete OBO term must not have a %s tag" % tag

        assert 'replaced_by' not in self or self._isObsolete()
        assert 'consider' not in self or self._isObsolete()

    ## shared helpers of the make-up functions.
    def _firstValue(self, tag):
        ## first tagged value recorded for the tag, None when the tag is absent
        values = self.get(tag)
        if values:
            return values[0]
        return None

    def _uniqueValues(self, tag):
        ## tagged values recorded for the tag without duplicates (in no
        ## particular order), None when the tag is absent
        if tag in self:
            return list(set(self[tag]))
        return None

    ## make-up functions.
    def _getDefinition(self):
        return self._firstValue('def')

    def _getId(self):
        return self['id'][0]

    def _getNamespace(self):
        ## the namespace is optional : None instead of a KeyError
        return self._firstValue('namespace')

    def _getName(self):
        return self['name'][0]

    def _getComment(self):
        return self._firstValue('comment')

    def _getAltIds(self):
        return self._uniqueValues('alt_id')

    def _getIsA(self):
        return self._uniqueValues('is_a')

    def _getSynonym(self):
        return self._uniqueValues('synonym')

    def _getSubset(self):
        return self._uniqueValues('subset')

    def _getXref(self):
        return self._uniqueValues('xref')

    def _getRelationShip(self):
        return self._uniqueValues('relationship')

    def _getUnion(self):
        ## None when the tag is absent, instead of a TypeError on set(None)
        return self._uniqueValues('union_of')

    def _getIntersection(self):
        ## None when the tag is absent, instead of a TypeError on set(None)
        return self._uniqueValues('intersection_of')

    def _getDisjonction(self):
        ## None when the tag is absent, instead of a TypeError on set(None)
        return self._uniqueValues('disjoint_from')

    def _isObsolete(self):
        return 'is_obsolete' in self and str(self['is_obsolete'][0]) == 'true'

    def _getReplaceBy(self):
        return self._uniqueValues('replaced_by')

    def _getConsider(self):
        return self._uniqueValues('consider')

    ## automatically make-up !
    definition = property(_getDefinition, None, None)
    id = property(_getId, None, None)
    namespace = property(_getNamespace, None, None)
    name = property(_getName, None, None)
    comment = property(_getComment, None, None)
    alt_ids = property(_getAltIds, None, None)
    is_a = property(_getIsA, None, None)
    synonyms = property(_getSynonym, None, None)
    subsets = property(_getSubset, None, None)
    xrefs = property(_getXref, None, None)
    relationship = property(_getRelationShip, None, None)
    union_of = property(_getUnion, None, None)
    intersection_of = property(_getIntersection, None, None)
    disjoint_from = property(_getDisjonction, None, None)
    is_obsolete = property(_isObsolete, None, None)
    replaced_by = property(_getReplaceBy, None, None)
    consider = property(_getConsider, None, None)


def OBOEntryFactory(stanza):
    '''
    Dispatcher of stanza.

    @param stanza: a stanza composed of several lines.
    @type stanza: text

    @return: an C{OBOTerm} | C{OBOEntry} instance

    @note: A stanza whose name is "Term" gives an C{OBOTerm}, every other
           stanza (the header included) gives an C{OBOEntry}.
    '''

    if OBOEntry.parseStanzaName(stanza) == "Term":
        return OBOTerm(stanza)
    return OBOEntry(stanza)

def OBOEntryIterator(file):
    '''
    Iterator of the entries of an OBO file.

    @param file: the OBO input, handed to C{stanzaIterator}.

    @return: a generator yielding, for each stanza, the object built by
             C{OBOEntryFactory}. Each stanza is logged at the debug level
             before being parsed.
    '''
    for stanza in stanzaIterator(file):
        debug(stanza)
        yield OBOEntryFactory(stanza)