Package obitools :: Module fasta
[hide private]
[frames | no frames]

Source Code for Module obitools.fasta

  1  """ 
  2  fasta module provides functions to read and write sequences in fasta format. 
  3   
  4   
  5  """ 
  6   
  7  from obitools.format.genericparser import genericEntryIteratorGenerator 
  8  from obitools import bioSeqGenerator,BioSequence,AASequence,NucSequence 
  9  from obitools.align import alignmentReader 
 10  from obitools.utils import universalOpen 
 11  import re 
 12   
 13  _parseFastaTag=re.compile('([a-zA-Z]\w*) *= *([^;]+);') 
 14   
 15  fastaEntryIterator=genericEntryIteratorGenerator(startEntry='^>') 
 16   
 17   
def parseFastaDescription(ds,tagparser=_parseFastaTag):
    '''
    Split the description part of a fasta title line into its free
    text and its C{key=value;} annotations.

    @param ds: the description string (title line without the identifier)
    @type ds: C{str}

    @param tagparser: a compiled regular expression usable
                      to identify key, value couples from
                      title line. Its first two groups are used
                      as key and value.
    @type tagparser: regex instance

    @return: a C{(definition, info)} couple. C{definition} is C{ds}
             with every match of C{tagparser} removed and surrounding
             blanks stripped. C{info} maps each key to its value; a
             value is converted to a Python object when it can be
             evaluated as a Python expression, otherwise it is kept
             as the stripped string.
    @rtype: C{tuple} (C{str}, C{dict})
    '''
    info = dict((match[0], match[1].strip())
                for match in tagparser.findall(ds))
    definition = tagparser.sub('', ds).strip()

    for key in info:
        try:
            # SECURITY: eval() executes arbitrary expressions taken from the
            # input data; only use this parser on trusted fasta files.
            # The expression is evaluated in an empty namespace (builtins
            # only) so that a value which happens to be spelled like one of
            # this function's variables or a module-level name stays a
            # plain string instead of resolving to that object.
            info[key] = eval(info[key], {})
        except Exception:
            # Best effort: a value that is not a valid expression is kept
            # as text. KeyboardInterrupt and SystemExit are deliberately
            # not intercepted.
            pass

    return definition,info
29
def _fastaJoinSeq(seqarray):
    '''
    Concatenate the lines of a sequence into a single string.

    @param seqarray: the lines to join
    @type seqarray: an iterable object on C{str}

    @return: all lines, each stripped of its leading and trailing
             blanks, glued together without separator
    @rtype: C{str}
    '''
    return ''.join(line.strip() for line in seqarray)
32
def fastaParser(seq,bioseqfactory,tagparser=_parseFastaTag,joinseq=_fastaJoinSeq):
    '''
    Build a sequence object from the text of one fasta record.

    @attention: internal purpose function

    @param seq: the whole record, title line first, lines
                separated by newline characters
    @type seq: C{str}

    @param bioseqfactory: a callable object invoked as
                          C{bioseqfactory(id, sequence, definition, **info)}
    @type bioseqfactory: a callable object

    @param tagparser: a compiled regular expression usable
                      to identify key, value couples from
                      title line.
    @type tagparser: regex instance

    @param joinseq: a callable receiving the list of lines following
                    the title line and returning the sequence string
    @type joinseq: a callable object

    @return: the value returned by C{bioseqfactory}
    '''
    lines = seq.split('\n')

    # Title line: drop its first character (the '>' marker), then separate
    # the identifier from the optional description at the first blank.
    header = lines[0].strip()[1:].split(None,1)
    identifier = header[0]

    if len(header) == 2:
        definition,info = parseFastaDescription(header[1], tagparser)
    else:
        # No description after the identifier.
        info = {}
        definition = None

    return bioseqfactory(identifier, joinseq(lines[1:]), definition, **info)
65
def fastaNucParser(seq,tagparser=_parseFastaTag,joinseq=_fastaJoinSeq):
    '''
    Parse one fasta record with L{fastaParser}, using C{NucSequence}
    as sequence factory.

    @param seq: the text of one fasta record
    @type seq: C{str}

    @param tagparser: a compiled regular expression usable
                      to identify key, value couples from
                      title line.
    @type tagparser: regex instance

    @param joinseq: a callable merging the sequence lines of the
                    record into a single string
    @type joinseq: a callable object

    @return: the value returned by L{fastaParser}
    '''
    # Forward the caller's tagparser/joinseq; the module defaults used to be
    # hard-wired here, which silently ignored both arguments.
    return fastaParser(seq, NucSequence, tagparser=tagparser, joinseq=joinseq)
68
def fastaAAParser(seq,tagparser=_parseFastaTag,joinseq=_fastaJoinSeq):
    '''
    Parse one fasta record with L{fastaParser}, using C{AASequence}
    as sequence factory.

    @param seq: the text of one fasta record
    @type seq: C{str}

    @param tagparser: a compiled regular expression usable
                      to identify key, value couples from
                      title line.
    @type tagparser: regex instance

    @param joinseq: a callable merging the sequence lines of the
                    record into a single string
    @type joinseq: a callable object

    @return: the value returned by L{fastaParser}
    '''
    # Forward the caller's tagparser/joinseq; the module defaults used to be
    # hard-wired here, which silently ignored both arguments.
    return fastaParser(seq, AASequence, tagparser=tagparser, joinseq=joinseq)
71
def fastaIterator(file,bioseqfactory=bioSeqGenerator,tagparser=_parseFastaTag,joinseq=_fastaJoinSeq):
    '''
    Generator yielding, one after the other, the sequences
    stored in a fasta source.

    @param file: a line iterator containing fasta data or a filename
    @type file: an iterable object or str

    @param bioseqfactory: a callable object handed to L{fastaParser}
                          to build each sequence object
    @type bioseqfactory: a callable object

    @param tagparser: a compiled regular expression usable
                      to identify key, value couples from
                      title line.
    @type tagparser: regex instance

    @param joinseq: a callable merging the sequence lines of a
                    record into a single string
    @type joinseq: a callable object

    @return: an iterator on the objects built by C{bioseqfactory}

    @see: L{fastaNucIterator}
    @see: L{fastaAAIterator}
    '''
    # fastaEntryIterator delivers the raw text of one record at a time;
    # each one is converted lazily, as the caller consumes the generator.
    for record in fastaEntryIterator(file):
        yield fastaParser(record,
                          bioseqfactory,
                          tagparser=tagparser,
                          joinseq=joinseq)
98
def fastaNucIterator(file,tagparser=_parseFastaTag,joinseq=_fastaJoinSeq):
    '''
    iterate through a fasta file sequence by sequence.
    Returned sequences by this iterator will be NucSequence
    instances

    @param file: a line iterator containing fasta data
    @type file: an iterable object

    @param tagparser: a compiled regular expression usable
                      to identify key, value couples from
                      title line.
    @type tagparser: regex instance

    @param joinseq: a callable merging the sequence lines of a
                    record into a single string; forwarded
                    unchanged to L{fastaIterator}
    @type joinseq: a callable object

    @return: an iterator on C{NucSequence} instance

    @see: L{fastaIterator}
    @see: L{fastaAAIterator}
    '''
    return fastaIterator(file, NucSequence, tagparser, joinseq)
119
def fastaAAIterator(file,tagparser=_parseFastaTag,joinseq=_fastaJoinSeq):
    '''
    iterate through a fasta file sequence by sequence.
    Returned sequences by this iterator will be AASequence
    instances

    @param file: a line iterator containing fasta data
    @type file: an iterable object

    @param tagparser: a compiled regular expression usable
                      to identify key, value couples from
                      title line.
    @type tagparser: regex instance

    @param joinseq: a callable merging the sequence lines of a
                    record into a single string; forwarded
                    unchanged to L{fastaIterator}
    @type joinseq: a callable object

    @return: an iterator on C{AASequence} instance

    @see: L{fastaIterator}
    @see: L{fastaNucIterator}
    '''
    return fastaIterator(file, AASequence, tagparser, joinseq)
140
def formatFasta(data,gbmode=False):
    '''
    Render one sequence, or several, as fasta formatted text.

    @param data: sequence or a set of sequences
    @type data: BioSequence instance or an iterable object
                on BioSequence instances

    @param gbmode: if set to C{True} the identifier written on the
                   title line becomes C{gi|<gi>|<id>} when the
                   sequence has a C{gi} entry and C{lcl|<id>|}
                   otherwise (NCBI style identifiers, intended for
                   indexing with the blast formatdb command).
    @type gbmode: bool

    @return: a fasta formatted string; records are separated by a
             newline and sequence lines hold at most 60 characters
    @rtype: str
    '''
    # A lone sequence is handled as a one element collection.
    if isinstance(data, BioSequence):
        data = [data]

    records = []
    for sequence in data:
        residues = str(sequence)

        definition = sequence.definition
        if definition is None:
            definition = ''

        # Fold the sequence into lines of 60 characters.
        chunks = [residues[start:start+60]
                  for start in xrange(0,len(residues),60)]
        body = '\n'.join(chunks)

        # Annotations are written as "key=value; key=value;".
        tags = '; '.join(['%s=%s' % item for item in sequence.iteritems()])
        if tags:
            tags += ';'

        identifier = sequence.id
        if gbmode:
            if 'gi' in sequence:
                identifier = "gi|%s|%s" % (sequence['gi'],identifier)
            else:
                identifier = "lcl|%s|" % (identifier)

        header = '>%s %s %s' %(identifier,tags,definition)
        records.append("%s\n%s" % (header,body))

    return '\n'.join(records)
180