1 import obitools
2 import re
3
5 """
6 Define a location on a sequence.
7 """
8
10 '''
11 Extract subsequence corresponding to a Location.
12
13 @param sequence: the sequence from which the subsequence is extracted
14 @type sequence: C{BioSequence} or C{str}
15 '''
16 assert isinstance(sequence, (obitools.BioSequence,str)), \
17 "sequence must be an instance of str or BioSequence"
18
19 if isinstance(sequence, str):
20 seq = self._extractSequence(sequence)
21 else:
22 if isinstance(sequence, obitools.AASequence):
23 assert not self.needNucleic(), \
24 "This location can be used only with Nucleic sequences"
25 seq = self._extractSequence(str(sequence))
26
27 if isinstance(sequence, obitools.AASequence):
28 st = obitools.AASequence
29 else:
30 st = obitools.NucSequence
31
32 seq = st(sequence.id,
33 seq,
34 sequence.definition,
35 **sequence.getTags())
36 seq['location']=str(self)
37
38 if 'length' in sequence.getTags():
39 seq['length']=len(seq)
40
41 return seq
42
45
47 '''
48 Indicate whether a location is composed of a single continuous
49 region or is built by joining several locations with the
50 C{join} operator.
51
52 @return: C{True} if the location is composed of a single
53 continuous region.
54 @rtype: bool
55 '''
56
57 return None
58
61
63 '''
64 If a location contains a complement operator, it can be used
65 only on nucleic sequences.
66
67 @return: C{True} if location contains a complement operator
68 @rtype: bool
69 '''
70 return None
71
73 loc = self.simplify()
74 assert loc.isDirect() is not None,"Gloc cannot be created for multi oriented location : %s" % str(loc)
75 positions = ','.join([str(x) for x in loc._getglocpos()])
76 return "(%s,%s)" % ({True:'T',False:'F'}[loc.isDirect()],
77 positions)
78
81
84
87
90
93
94 begin = property(getBegin,None,None,"beginning position of the location")
95 end = property(getEnd,None,None,"ending position of the location")
96 fivePrime=property(getFivePrime,None,None,"5' potisition of the location")
97 threePrime=property(getThreePrime,None,None,"3' potisition of the location")
98
105
116
118 """
119 A simple location describes a continuous region of
120 a sequence defined by a C{begin} and a C{end} position.
121 """
122
124 '''
125 Build a new C{SimpleLocation} instance. Valid
126 positions are defined on M{[1,N]}, where N is the length
127 of the sequence.
128
129 @param begin: start position of the location
130 @type begin: int
131 @param end: end position of the location
132 @type end: int
133 '''
134 assert begin > 0 and end > 0
135
136 self._begin = begin
137 self._end = end
138
140
141 assert ( self._begin < len(sequence)
142 and self._end <= len(sequence)), \
143 "Sequence length %d is too short" % len(sequence)
144
145 return sequence[self._begin-1:self._end]
146
149
152
154 return not (self.before or self.after)
155
157 if self._begin == self._end:
158 return PointLocation(self._begin)
159 else:
160 return self
161
164
166 before = {True:'<',False:''}[self.before]
167 after = {True:'>',False:''}[self.after]
168 return "%s%d..%s%d" % (before,self._begin,after,self._end)
169
171 assert (self._begin + s) > 0,"shift to large (%d)" % s
172 if s == 0:
173 return self
174 return SimpleLocation(self._begin + s, self._end + s)
175
178
180 positions = ','.join([str(x) for x in self._getglocpos()])
181 return "(%s,%s)" % ({True:'T',False:'F'}[self.isDirect()],
182 positions)
183
186
189
190
191 begin = property(getBegin,None,None,"beginning position of the location")
192 end = property(getEnd,None,None,"ending position of the location")
193
195 """
196 A point location describes a location on a sequence
197 limited to a single position
198 """
199
201 assert position > 0
202 self._pos=position
203
205
206 assert self._end <= len(sequence), \
207 "Sequence length %d is too short" % len(sequence)
208
209 return sequence[self._pos-1]
210
213
216
219
222
225
227 assert (self._pos + s) > 0,"shift to large (%d)" % s
228 if s == 0:
229 return self
230 return PointLocation(self._pos + s)
231
233 return (self._pos,self._pos)
234
237
240
241 begin = property(getBegin,None,None,"beginning position of the location")
242 end = property(getEnd,None,None,"ending position of the location")
243
245 return str(self._pos)
246
248 """
249 """
251 self._locs = tuple(locations)
252
253
258
260 hasDirect,hasReverse = reduce(lambda x,y: (x[0] or y,x[1] or not y),
261 (z.isDirect() for z in self._locs),(False,False))
262
263 if hasDirect and not hasReverse:
264 return True
265 if hasReverse and not hasDirect:
266 return False
267
268 return None
269
270
273
274
288
290 return reduce(lambda x,y : x and y, (z.isFullLength() for z in self._locs),1)
291
293 return reduce(lambda x,y : x or y,
294 (z.needNucleic for z in self._locs),
295 False)
296
298 return reduce(lambda x,y : x + y,
299 (z._getglocpos() for z in self._locs))
300
301
303 return min(x.getBegin() for x in self._locs)
304
306 return max(x.getEnd() for x in self._locs)
307
309 assert (self.getBegin() + s) > 0,"shift to large (%d)" % s
310 if s == 0:
311 return self
312 return CompositeLocation(x.shift(s) for x in self._locs)
313
314
315 begin = property(getBegin,None,None,"beginning position of the location")
316 end = property(getEnd,None,None,"ending position of the location")
317
318
320 return "join(%s)" % ','.join([str(x)
321 for x in self._locs])
322
324 """
325 """
326
327 _comp={'a': 't', 'c': 'g', 'g': 'c', 't': 'a',
328 'r': 'y', 'y': 'r', 'k': 'm', 'm': 'k',
329 's': 's', 'w': 'w', 'b': 'v', 'd': 'h',
330 'h': 'd', 'v': 'b', 'n': 'n', 'u': 'a',
331 '-': '-'}
332
335
340
343
346
349
355
358
360 return "complement(%s)" % self._loc
361
367
370
373
376
379
382
383
384 begin = property(getBegin,None,None,"beginning position of the location")
385 end = property(getEnd,None,None,"ending position of the location")
386 fivePrime=property(getFivePrime,None,None,"5' potisition of the location")
387 threePrime=property(getThreePrime,None,None,"3' potisition of the location")
388
389
390
391
392
393
395 sl = []
396 plevel=0
397 for c in text:
398 assert plevel>=0,"Misformated location : %s" % text
399 if c == '(':
400 plevel+=1
401 sl.append(c)
402 elif c==')':
403 plevel-=1
404 sl.append(c)
405 elif c==',' and plevel == 0:
406 assert sl,"Misformated location : %s" % text
407 yield ''.join(sl)
408 sl=[]
409 else:
410 sl.append(c)
411 assert sl and plevel==0,"Misformated location : %s" % text
412 yield ''.join(sl)
413
414
415
416
417
418
419
420 __simplelocparser = re.compile('(?P<before><?)(?P<from>[0-9]+)(\.\.(?P<after>>?)(?P<to>[0-9]+))?')
421
422
447
449 '''
450 Parse a location string as found in GenBank or EMBL files.
451
452 @param locstring: string description of the location in embl/gb format
453 @type locstring: str
454
455 @return: a Location instance
456 @rtype: C{Location} subclass instance
457 '''
458 return __locationParser(locstring)
459
460
461 _matchExternalRef = re.compile('[A-Za-z0-9_|]+(\.[0-9]+)?(?=:)')
462
464 '''
465 When a location describes external references (e.g. D28156.1:1..>1292),
466 separate the external reference part of the location from the location
467 itself.
468
469 @param locstring: text representation of the location.
470 @type locstring: str
471
472 @return: a tuple with a set of strings giving the accession numbers
473 of the referred sequences and a C{Location} instance.
474
475 @rtype: tuple(set,Location)
476 '''
477 m = set(x.group() for x in _matchExternalRef.finditer(locstring))
478 clean = re.compile(':|'.join([re.escape(x) for x in m])+':')
479 cloc = locationGenerator(clean.sub('',locstring))
480
481 return m,cloc
482