1
2 """\
3 --------------------------------------------------------------------
4 fastaGrep.py
5 --------------------------------------------------------------------
6 fastaGrep.py [option] <argument>
7 --------------------------------------------------------------------
8 -h --help : print this help
9
10 -s --sequence=<pattern> : match the sequence with
11 a regular pattern
12
13 -i --identifier=<pattern> : match the sequence
14 identifier with a regular
15 pattern
16
17 -d --definition=<pattern> : match the sequence definition
18 with a regular pattern
19
20 -a --attribute=<name>:<pattern> : match the sequence attribute
21 <name> with a regular pattern
22
23 -l --lmin=## : keep sequences longer than
24 lmin
25
26 -L --lmax=## : keep sequences shorter than
27 lmax
28
29 -v : revert the sequence selection
30 --------------------------------------------------------------------
31 """
32
33 import fileinput
34 import re
35 import getopt
36 import sys
37
38 from obitools.fasta import fastaIterator,writeFasta
39 from obitools.utils import checkHelpOption
40
41
def goodFastaGenerator():
    """Build a sequence filter from the command-line options.

    Parses ``sys.argv[1:]`` with getopt, then overwrites ``sys.argv[1:]``
    with the remaining positional arguments, so that a later
    ``fileinput.input()`` call only sees the input file names.

    Options handled:
      -s / --sequence=<pattern>         regex searched in ``str(seq)``,
                                        case-insensitively
      -i / --identifier=<pattern>       regex searched in ``seq.id``
      -d / --definition=<pattern>       regex searched in ``seq.definition``
      -a / --attribute=<name>:<pattern> regex searched in ``seq[name]``; the
                                        option may be repeated, and every
                                        named attribute must be present in
                                        the sequence and match
      -l / --lmin=<int>                 keep when ``len(seq) >= lmin``
      -L / --lmax=<int>                 keep when ``len(seq) <= lmax``
      -v                                invert the final decision

    Returns:
        A function ``sequenceSelector(seq)`` returning True when ``seq``
        must be kept and False otherwise.

    Raises:
        getopt.GetoptError: for an option getopt does not know.
        ValueError: for a non-integer length, a ``-a`` value without a
            ``:`` separator, or an option that has no handler below.
        re.error: for an invalid regular expression.
    """
    options, filenames = getopt.getopt(
        sys.argv[1:],
        'hs:i:d:a:l:L:v',
        ['help',
         'sequence=',
         'identifier=',
         'definition=',
         'attribute=',
         'lmin=',
         'lmax='])

    # Leave only the file names on the command line for later readers
    # of sys.argv.
    sys.argv[1:] = filenames

    # None (or an empty dict) means "this filter was not requested".
    sequencePattern = None
    identifierPattern = None
    definitionPattern = None
    attributePatterns = {}
    lmin = None
    lmax = None
    isInverted = False

    for name, value in options:
        if name in ('-s', '--sequence'):
            sequencePattern = re.compile(value, re.IGNORECASE)

        elif name in ('-d', '--definition'):
            definitionPattern = re.compile(value)

        elif name in ('-i', '--identifier'):
            identifierPattern = re.compile(value)

        elif name in ('-a', '--attribute'):
            if ':' not in value:
                raise ValueError(
                    'Attribute option must be <name>:<pattern>, got %r'
                    % value)
            # Split on the first ':' only, the pattern may contain colons.
            attribute, pattern = value.split(':', 1)
            attributePatterns[attribute] = re.compile(pattern)

        elif name in ('-l', '--lmin'):
            lmin = int(value)

        elif name in ('-L', '--lmax'):
            lmax = int(value)

        elif name == '-v':
            isInverted = True

        else:
            # -h / --help is accepted by getopt but has no handler here,
            # so it also ends up in this branch if it reaches this loop.
            raise ValueError('Unknown option %s' % name)

    def sequenceSelector(seq):
        """Return True when ``seq`` passes the configured filters.

        Filters are combined with a logical AND and evaluation stops at
        the first failing one; the result is negated when -v was given.
        """
        good = True

        if sequencePattern is not None:
            good = bool(sequencePattern.search(str(seq)))

        if good and identifierPattern is not None:
            good = bool(identifierPattern.search(seq.id))

        if good and definitionPattern is not None:
            good = bool(definitionPattern.search(seq.definition))

        if good and attributePatterns:
            # Every requested attribute must exist on the sequence and
            # match its pattern.
            good = all(p in seq
                       and bool(attributePatterns[p].search(seq[p]))
                       for p in attributePatterns)

        if good and lmin is not None:
            good = len(seq) >= lmin

        if good and lmax is not None:
            good = len(seq) <= lmax

        if isInverted:
            good = not good

        return good

    return sequenceSelector
135
136
137
if __name__ == '__main__':

    # The usage text is handed to the help helper before any option is
    # parsed (presumably it prints it and exits on -h/--help; confirm in
    # obitools.utils).
    checkHelpOption(__doc__)

    # Build the filter first: option parsing rewrites sys.argv so that
    # fileinput only receives the remaining file names.
    selector = goodFastaGenerator()
    records = fastaIterator(fileinput.input())

    for record in records:
        if not selector(record):
            continue
        print(writeFasta(record))
149