1 """
2 The fasta module provides functions to read and write sequences in fasta format.
3
4
5 """
6
7 from obitools.format.genericparser import genericEntryIteratorGenerator
8 from obitools import bioSeqGenerator,BioSequence,AASequence,NucSequence
9 from obitools.align import alignmentReader
10 from obitools.utils import universalOpen
11 import re
12
# Matches one ``key=value;`` annotation of a fasta title line: group 1 is the
# key (a letter followed by word characters), group 2 is the raw value (every
# character up to the terminating ';'). Optional spaces are allowed around '='.
# Written as a raw string so that ``\w`` reaches the regex engine untouched
# instead of being an invalid string escape.
_parseFastaTag=re.compile(r'([a-zA-Z]\w*) *= *([^;]+);')
14
# Entry iterator used to cut fasta input into individual records; '^>' is the
# pattern passed to the generic generator as the start-of-entry marker.
# NOTE(review): the exact semantics of startEntry are defined in
# genericEntryIteratorGenerator -- confirm there.
15 fastaEntryIterator=genericEntryIteratorGenerator(startEntry='^>')
16
17
# Body of the title-line description parser (presumably parseFastaDescription,
# which the record parser calls with (description, tagparser); its def line is
# not part of this listing).
# Splits the description ds into a free-text definition and a dict of tags.
19 info = dict((x[0],x[1].strip())  # tag key -> raw value with surrounding whitespace removed
20 for x in tagparser.findall(ds))
21 definition = tagparser.sub('',ds).strip()  # what remains of ds once every tag match is removed
22 for k in info:
23 try:
24 info[k]=eval(info[k])  # NOTE(review): eval() executes tag text as Python -- unsafe on untrusted files
25 except:  # NOTE(review): bare except also swallows KeyboardInterrupt/SystemExit
26 pass  # value is not a valid Python expression: keep the raw string
27
28 return definition,info
29
# Body of the sequence-joining helper (its def line is not part of this listing):
# strips leading/trailing whitespace from every element of seqarray and
# concatenates the results into a single string.
31 return ''.join([x.strip() for x in seqarray])
32
34 '''
35 Parse a fasta record.
36
37 @attention: internal purpose function
38
39 @param seq: a string containing all lines corresponding
40 to one fasta sequence, separated by newlines
41 @type seq: C{str}
42
43 @param bioseqfactory: a callable object returning a BioSequence
44 instance.
45 @type bioseqfactory: a callable object
46
47 @param tagparser: a compiled regular expression usable
48 to identify key, value couples from
49 the title line.
50 @type tagparser: regex instance
51
52 @return: the C{BioSequence} instance built by C{bioseqfactory}
53 '''
54 seq = seq.split('\n')  # one list element per line of the record
55 title = seq[0].strip()[1:].split(None,1)  # drop the leading character of the title line (expected to be '>'), then split the identifier from the rest
56 id=title[0]  # NOTE(review): raises IndexError when the title line holds no identifier
57 if len(title) == 2:
58 definition,info=parseFastaDescription(title[1], tagparser)
59 else:
60 info= {}
61 definition=None
62
63 seq=joinseq(seq[1:])  # every line after the title line is sequence data
64 return bioseqfactory(id, seq, definition,**info)  # parsed tags are passed as keyword arguments
65
68
71
73 '''
74 Iterate through a fasta file sequence by sequence.
75 Sequences returned by this iterator will be BioSequence
76 instances.
77
78 @param file: a line iterator containing fasta data or a filename
79 @type file: an iterable object or str
80 @param bioseqfactory: a callable object returning a BioSequence
81 instance.
82 @type bioseqfactory: a callable object
83
84 @param tagparser: a compiled regular expression usable
85 to identify key, value couples from
86 the title line.
87 @type tagparser: regex instance
88
89 @return: an iterator on C{BioSequence} instances
90
91 @see: L{fastaNucIterator}
92 @see: L{fastaAAIterator}
93
94 '''
95
96 for entry in fastaEntryIterator(file):  # one entry per fasta record
97 yield fastaParser(entry,bioseqfactory,tagparser,joinseq)  # records are parsed lazily, one per iteration
98
100 '''
101 Iterate through a fasta file sequence by sequence.
102 Sequences returned by this iterator will be NucSequence
103 instances.
104
105 @param file: a line iterator containing fasta data
106 @type file: an iterable object
107
108 @param tagparser: a compiled regular expression usable
109 to identify key, value couples from
110 the title line.
111 @type tagparser: regex instance
112
113 @return: an iterator on C{NucSequence} instances
114
115 @see: L{fastaIterator}
116 @see: L{fastaAAIterator}
117 '''
118 return fastaIterator(file, NucSequence,tagparser)  # same iteration, with NucSequence as the sequence factory
119
121 '''
122 Iterate through a fasta file sequence by sequence.
123 Sequences returned by this iterator will be AASequence
124 instances.
125
126 @param file: a line iterator containing fasta data
127 @type file: an iterable object
128
129 @param tagparser: a compiled regular expression usable
130 to identify key, value couples from
131 the title line.
132 @type tagparser: regex instance
133
134 @return: an iterator on C{AASequence} instances
135
136 @see: L{fastaIterator}
137 @see: L{fastaNucIterator}
138 '''
139 return fastaIterator(file, AASequence,tagparser)  # same iteration, with AASequence as the sequence factory
140
180