Package obitools :: Package location
[hide private]
[frames | no frames]

Source Code for Package obitools.location

  1  import obitools 
  2  import re 
  3   
class Location(object):
    """
    Define a location on a sequence.

    This base class holds the behaviour shared by every kind of
    location. Its query methods (C{isDirect}, C{isSimple}, C{getBegin},
    ...) return C{None}; concrete subclasses override them and provide
    the C{_extractSequence}, C{simplify} and C{_getglocpos} methods
    used here.
    """

    def extractSequence(self,sequence):
        '''
        Extract subsequence corresponding to a Location.

        A C{str} input gives a C{str} result. For a C{BioSequence} a new
        sequence of the same kind (C{AASequence} or C{NucSequence}) is
        built from the id, definition and tags of the original one, and
        receives a C{location} tag holding the textual form of the
        location.

        @param sequence: the sequence the location refers to
        @type sequence: C{BioSequence} or C{str}

        @return: the extracted subsequence
        @rtype: C{str}, C{AASequence} or C{NucSequence}
        '''
        assert isinstance(sequence, (obitools.BioSequence,str)), \
            "sequence must be an instance of str or BioSequence"

        if isinstance(sequence, str):
            return self._extractSequence(sequence)

        isProtein = isinstance(sequence, obitools.AASequence)

        if isProtein:
            assert not self.needNucleic(), \
                "This location can be used only with Nucleic sequences"

        subseq = self._extractSequence(str(sequence))

        if isProtein:
            st = obitools.AASequence
        else:
            st = obitools.NucSequence

        seq = st(sequence.id,
                 subseq,
                 sequence.definition,
                 **sequence.getTags())
        seq['location']=str(self)

        # The tags were copied from the full sequence: refresh the
        # length tag only when the original carried one.
        if 'length' in sequence.getTags():
            seq['length']=len(seq)

        return seq

    def isDirect(self):
        '''
        Orientation of the location. The base implementation returns
        C{None}; subclasses return C{True}, C{False} or C{None}.
        '''
        return None

    def isSimple(self):
        '''
        Indicate if a location is composed of a single continuous
        region or is composed by the junction of several locations
        by the C{join} operator.

        @return: C{True} if the location is composed of a single
                 continuous region.
        @rtype: bool
        '''
        return None

    def isFullLength(self):
        '''
        The base implementation returns C{None}; subclasses override it.
        '''
        return None

    def needNucleic(self):
        '''
        If a location contains a complement operator, it can be used
        only on nucleic sequence.

        @return: C{True} if location contains a complement operator
        @rtype: bool
        '''
        return None

    def getGloc(self):
        '''
        Build the gloc representation of the simplified location:
        C{(T,...)} for a direct location, C{(F,...)} for a reverse
        one, followed by the comma separated positions.

        @rtype: str
        '''
        loc = self.simplify()
        direct = loc.isDirect()
        assert direct is not None,"Gloc cannot be created for multi oriented location : %s" % str(loc)
        positions = ','.join([str(x) for x in loc._getglocpos()])
        return "(%s,%s)" % ({True:'T',False:'F'}[direct],
                            positions)

    def shift(self,s):
        '''
        The base implementation returns C{None}; subclasses return a
        location translated by C{s} positions.
        '''
        return None

    def getBegin(self):
        return None

    def getEnd(self):
        return None

    def getFivePrime(self):
        return self.getBegin()

    def getThreePrime(self):
        return self.getEnd()

    begin = property(getBegin,None,None,"beginning position of the location")
    end = property(getEnd,None,None,"ending position of the location")
    fivePrime=property(getFivePrime,None,None,"5' position of the location")
    threePrime=property(getThreePrime,None,None,"3' position of the location")

    def __abs__(self):
        '''
        Return the direct-strand version of the location.
        '''
        assert self.isDirect() is not None,"Abs operator cannot be applied on non oriented location"
        if self.isDirect():
            return self
        else:
            return ComplementLocation(self).simplify()

    def __cmp__(self,y):
        '''
        Three-way comparison: locations are ordered by C{begin}; for
        equal C{begin} a direct location sorts before a reverse one.
        '''
        if self.begin < y.begin:
            return -1
        if self.begin > y.begin:
            return 1
        if self.isDirect() == y.isDirect():
            return 0
        if self.isDirect() and not y.isDirect():
            return -1
        return 1

    # Python 3 ignores __cmp__: expose the same ordering through the
    # rich comparison protocol so that locations remain sortable.
    # __eq__ is deliberately not defined, which leaves equality and
    # hashing exactly as they were.
    def __lt__(self,y):
        if not isinstance(y, Location):
            return NotImplemented
        return self.__cmp__(y) < 0

    def __le__(self,y):
        if not isinstance(y, Location):
            return NotImplemented
        return self.__cmp__(y) <= 0

    def __gt__(self,y):
        if not isinstance(y, Location):
            return NotImplemented
        return self.__cmp__(y) > 0

    def __ge__(self,y):
        if not isinstance(y, Location):
            return NotImplemented
        return self.__cmp__(y) >= 0
116
class SimpleLocation(Location):
    """
    A simple location describes a continuous region of
    a sequence defined by a C{begin} and a C{end} position.

    The C{before} and C{after} attributes are the flags rendered as
    C{<} and C{>} in the textual form of the location; both are
    C{False} for a location that is not marked as partial.
    """

    def __init__(self,begin,end):
        '''
        Build a new C{SimpleLocation} instance. Valid
        positions are defined on M{[1,N]} with N the length
        of the sequence.

        @param begin: start position of the location
        @type begin:  int
        @param end:   end position of the location
        @type end:    int
        '''
        assert begin > 0 and end > 0

        self._begin = begin
        self._end   = end

        # Always define the partial flags: isFullLength() and __str__()
        # read them, and not every instance is built by the parser.
        self.before = False
        self.after  = False

    def _extractSequence(self,sequence):
        '''
        Return the slice of C{sequence} covered by the location
        (1-based, both bounds included).
        '''
        # A location may start on the last position of the sequence,
        # hence <= for both bounds.
        assert (    self._begin <= len(sequence)
                and self._end   <= len(sequence)), \
                "Sequence length %d is too short" % len(sequence)

        return sequence[self._begin-1:self._end]

    def isDirect(self):
        return True

    def isSimple(self):
        return True

    def isFullLength(self):
        '''
        @return: C{True} if neither the C{before} nor the C{after}
                 flag is set.
        @rtype: bool
        '''
        return not (self.before or self.after)

    def simplify(self):
        '''
        A region reduced to one position is turned into a
        C{PointLocation}; otherwise the location itself is returned.
        '''
        if self._begin == self._end:
            return PointLocation(self._begin)
        else:
            return self

    def needNucleic(self):
        return False

    def __str__(self):
        before = self.before and '<' or ''
        after  = self.after and '>' or ''
        return "%s%d..%s%d" % (before,self._begin,after,self._end)

    def shift(self,s):
        '''
        Return the location translated by C{s} positions. The partial
        flags are carried over to the new location.

        @param s: offset to add to both bounds
        @type s: int
        '''
        assert (self._begin + s) > 0,"shift to large (%d)" % s
        if s == 0:
            return self
        shifted = SimpleLocation(self._begin + s, self._end + s)
        shifted.before = self.before
        shifted.after  = self.after
        return shifted

    def _getglocpos(self):
        return (self.begin,self.end)

    def getGloc(self):
        '''
        Gloc representation built directly from the bounds, without
        simplifying the location first.
        '''
        positions = ','.join([str(x) for x in self._getglocpos()])
        return "(%s,%s)" % ({True:'T',False:'F'}[self.isDirect()],
                            positions)

    def getBegin(self):
        return self._begin

    def getEnd(self):
        return self._end

    begin = property(getBegin,None,None,"beginning position of the location")
    end = property(getEnd,None,None,"ending position of the location")
193
class PointLocation(Location):
    """
    A point location describes a location on a sequence
    limited to a single position
    """

    def __init__(self,position):
        '''
        @param position: the position of the point, counted from 1
        @type position: int
        '''
        assert position > 0
        self._pos=position

    def _extractSequence(self,sequence):
        '''
        Return the single symbol of C{sequence} located at the
        position of the point.
        '''
        # A point only stores _pos; there is no _end attribute here.
        assert self._pos <= len(sequence), \
              "Sequence length %d is too short" % len(sequence)

        return sequence[self._pos-1]

    def isDirect(self):
        return True

    def isSimple(self):
        return True

    def isFullLength(self):
        return True

    def simplify(self):
        return self

    def needNucleic(self):
        return False

    def shift(self,s):
        '''
        Return the point translated by C{s} positions.

        @param s: offset to add to the position
        @type s: int
        '''
        assert (self._pos + s) > 0,"shift to large (%d)" % s
        if s == 0:
            return self
        return PointLocation(self._pos + s)

    def _getglocpos(self):
        # Same (begin, end) shape as the other locations.
        return (self._pos,self._pos)

    def getBegin(self):
        return self._pos

    def getEnd(self):
        return self._pos

    begin = property(getBegin,None,None,"beginning position of the location")
    end = property(getEnd,None,None,"ending position of the location")

    def __str__(self):
        return str(self._pos)
246
class CompositeLocation(Location):
    """
    A composite location is the junction of several locations; its
    textual form is C{join(loc1,loc2,...)}.
    """

    def __init__(self,locations):
        '''
        @param locations: the joined locations, in order
        @type locations: iterable of C{Location}
        '''
        self._locs = tuple(locations)

    def _extractSequence(self,sequence):
        '''
        Concatenate the subsequences extracted by each member.
        '''
        seq = ''.join([x._extractSequence(sequence)
                       for x in self._locs])
        return seq

    def isDirect(self):
        '''
        @return: C{True} if every member is direct, C{False} if every
                 member is reverse, C{None} otherwise (mixed
                 orientations or no member at all).
        '''
        orientations = [z.isDirect() for z in self._locs]

        # A member without a single orientation makes the whole
        # junction multi oriented.
        for o in orientations:
            if o is None:
                return None

        hasDirect  = any(orientations)
        hasReverse = any(not o for o in orientations)

        if hasDirect and not hasReverse:
            return True
        if hasReverse and not hasDirect:
            return False

        return None

    def isSimple(self):
        return False

    def simplify(self):
        '''
        Return an equivalent, simpler location: a junction of one
        member is replaced by that member, and a junction made only of
        complemented members becomes the complement of the junction of
        their contents taken in reverse order.
        '''
        if len(self._locs)==1:
            return self._locs[0]

        rep = CompositeLocation(x.simplify() for x in self._locs)

        # Test the simplified members: they are the ones whose _loc
        # attribute is read below.
        if rep._locs and all(isinstance(z, ComplementLocation)
                             for z in rep._locs):
            rep = ComplementLocation(CompositeLocation(x._loc.simplify()
                                                       for x in rep._locs[::-1]))

        return rep

    def isFullLength(self):
        return all(z.isFullLength() for z in self._locs)

    def needNucleic(self):
        '''
        @return: C{True} if at least one member needs a nucleic sequence
        @rtype: bool
        '''
        return any(z.needNucleic() for z in self._locs)

    def _getglocpos(self):
        '''
        Positions of all the members, concatenated in order.
        '''
        positions = ()
        for z in self._locs:
            positions += tuple(z._getglocpos())
        return positions

    def getBegin(self):
        return min(x.getBegin() for x in self._locs)

    def getEnd(self):
        return max(x.getEnd() for x in self._locs)

    def shift(self,s):
        '''
        Return the junction of the members translated by C{s} positions.

        @param s: offset to add to every position
        @type s: int
        '''
        assert (self.getBegin() + s) > 0,"shift to large (%d)" % s
        if s == 0:
            return self
        return CompositeLocation(x.shift(s) for x in self._locs)

    begin = property(getBegin,None,None,"beginning position of the location")
    end = property(getEnd,None,None,"ending position of the location")

    def __str__(self):
        return "join(%s)" % ','.join([str(x)
                                      for x in self._locs])
322
class ComplementLocation(Location):
    """
    A complement location wraps another location and designates it on
    the complementary strand; its textual form is C{complement(loc)}.
    """

    # Complement of each nucleotide code (lower case).
    _comp={'a': 't', 'c': 'g', 'g': 'c', 't': 'a',
           'r': 'y', 'y': 'r', 'k': 'm', 'm': 'k',
           's': 's', 'w': 'w', 'b': 'v', 'd': 'h',
           'h': 'd', 'v': 'b', 'n': 'n', 'u': 'a',
           '-': '-'}

    def __init__(self,location):
        '''
        @param location: the location to complement
        @type location: C{Location}
        '''
        self._loc = location

    def _extractSequence(self,sequence):
        '''
        Extract the wrapped location and return its reverse
        complement, in lower case. Symbols missing from the
        complement table are rendered as C{n}.
        '''
        seq = self._loc._extractSequence(sequence)
        # Read the region backwards: the complementary strand runs in
        # the opposite direction.
        seq = ''.join([self._comp.get(x.lower(),'n') for x in seq[::-1]])
        return seq

    def isDirect(self):
        return False

    def isSimple(self):
        return self._loc.isSimple()

    def isFullLength(self):
        return self._loc.isFullLength()

    def simplify(self):
        '''
        A complement of a complement is replaced by the simplified
        inner location; otherwise the location itself is returned.
        '''
        if isinstance(self._loc, ComplementLocation):
            return self._loc._loc.simplify()
        else:
            return self

    def needNucleic(self):
        return True

    def __str__(self):
        return "complement(%s)" % self._loc

    def shift(self,s):
        '''
        Return the complement of the wrapped location translated by
        C{s} positions.

        @param s: offset to add to every position
        @type s: int
        '''
        assert (self.getBegin() + s) > 0,"shift to large (%d)" % s
        if s == 0:
            return self
        return ComplementLocation(self._loc.shift(s))

    def _getglocpos(self):
        return self._loc._getglocpos()

    def getBegin(self):
        return self._loc.getBegin()

    def getEnd(self):
        return self._loc.getEnd()

    # On the complementary strand the 5' end is the highest position.
    def getFivePrime(self):
        return self.getEnd()

    def getThreePrime(self):
        return self.getBegin()

    begin = property(getBegin,None,None,"beginning position of the location")
    end = property(getEnd,None,None,"ending position of the location")
    fivePrime=property(getFivePrime,None,None,"5' position of the location")
    threePrime=property(getThreePrime,None,None,"3' position of the location")
388 389 390 # 391 # Internal functions used for location parsing 392 # 393
def __sublocationIterator(text):
    '''
    Split C{text} on the commas that are not enclosed in parentheses
    and yield each piece in turn.

    An C{AssertionError} is raised for an empty piece or for
    unbalanced parentheses.

    @param text: comma separated list of locations
    @type text: str
    '''
    depth = 0
    pending = []
    for char in text:
        assert depth>=0,"Misformated location : %s" % text
        if char == ',' and depth == 0:
            # Top level separator: emit the piece collected so far.
            assert pending,"Misformated location : %s" % text
            yield ''.join(pending)
            pending = []
            continue
        if char == '(':
            depth += 1
        elif char == ')':
            depth -= 1
        pending.append(char)
    assert pending and depth==0,"Misformated location : %s" % text
    yield ''.join(pending)
#
# Internal functions used for location parsing
#

# Matches a point ("12") or a range ("12..34"). The optional '<' and
# '>' markers are captured in the 'before' and 'after' groups, the
# positions in 'from' and 'to'. Raw string: '\.' is not a valid
# escape sequence in a plain string literal.
__simplelocparser = re.compile(r'(?P<before><?)(?P<from>[0-9]+)(\.\.(?P<after>>?)(?P<to>[0-9]+))?')
def __locationParser(text):
    '''
    Recursively turn the textual description of a location into
    C{Location} instances.

    C{join(...)} gives a C{CompositeLocation}, C{complement(...)} a
    C{ComplementLocation}, anything else is parsed as a point or a
    range. An C{AssertionError} is raised for a text that cannot be
    parsed.

    @param text: textual description of the location
    @type text: str
    '''
    text = text.strip()

    if text.startswith('join('):
        assert text.endswith(')'),"Misformated location : %s" % text
        members = [__locationParser(part)
                   for part in __sublocationIterator(text[5:-1])]
        return CompositeLocation(members)

    if text.startswith('complement('):
        assert text.endswith(')'),"Misformated location : %s" % text
        inner = [__locationParser(part)
                 for part in __sublocationIterator(text[11:-1])]
        # Several comma separated locations are handled as a junction.
        if len(inner) > 1:
            wrapped = CompositeLocation(inner)
        else:
            wrapped = inner[0]
        return ComplementLocation(wrapped)

    found = __simplelocparser.match(text)
    assert found is not None,"Misformated location : %s" % text
    fields = found.groupdict()

    if fields['to']:
        parsed = SimpleLocation(int(fields['from']),int(fields['to']))
    else:
        parsed = PointLocation(int(fields['from']))

    # Record the '<' and '>' markers found in the text.
    parsed.before = fields['before'] == '<'
    parsed.after = fields['after'] == '>'
    return parsed
447
def locationGenerator(locstring):
    '''
    Build a location object from the textual description of a
    location, as found in genbank or embl files.

    @param locstring: string description of the location in embl/gb format
    @type locstring: str

    @return: a Location instance
    @rtype: C{Location} subclass instance
    '''
    location = __locationParser(locstring)
    return location
# Matches an external sequence reference: an identifier, optionally
# followed by a '.version' number, located just before a ':' (the ':'
# itself is not part of the match). Raw string: '\.' is not a valid
# escape sequence in a plain string literal.
_matchExternalRef = re.compile(r'[A-Za-z0-9_|]+(\.[0-9]+)?(?=:)')
def extractExternalRefs(locstring):
    '''
    Split a location which refers to external sequences
    (ex: D28156.1:1..>1292) into the references on one side and the
    location itself on the other side.

    @param locstring: text representation of the location.
    @type locstring: str

    @return: a tuple with a set of strings describing the accession
             numbers of the referred sequences and a C{Location}
             instance.

    @rtype: tuple(set,Location)
    '''
    refs = set(hit.group() for hit in _matchExternalRef.finditer(locstring))

    # One alternative per reference, each followed by its ':' separator.
    alternatives = [re.escape(ref) for ref in refs]
    prefixes = re.compile(':|'.join(alternatives)+':')

    stripped = prefixes.sub('',locstring)
    location = locationGenerator(stripped)

    return refs,location
482