1 """
2 Implement the fastn/fastp similarity search algorithm for BioSequence.
3 """
4
6
8 '''
9 @param seq: sequence to hash
10 @type seq: BioSequence
11 @param kup: word size used for hashing process
12 @type kup: int
13 '''
14 hash={}
15 seq = str(seq)
16 for word,pos in ((seq[i:i+kup].upper(),i) for i in xrange(len(seq)-kup)):
17 if word in hash:
18 hash[word].append(pos)
19 else:
20 hash[word]=[pos]
21
22 self._kup = kup
23 self._hash= hash
24 self._seq = seq
25
27 '''
28 Align one sequence with the fast hash table.
29
30 @param seq: the sequence to align
31 @type seq: BioSequence
32
33 @return: (smax,pmax) where smax is the
34 score of the largest diagonal and pmax the
35 list of all shifts reaching that score
36 @rtype: a tuple (int, list of int)
37 '''
38 histo={}
39 seq = str(seq).upper()
40 hash= self._hash
41 kup = self._kup
42
43 for word,pos in ((seq[i:i+kup],i) for i in xrange(len(seq)-kup)):
44 matchedpos = hash.get(word,[])
45 for p in matchedpos:
46 delta = pos - p
47 histo[delta]=histo.get(delta,0) + 1
48 smax = max(histo.values())
49 pmax = [x for x in histo if histo[x]==smax]
50 return smax,pmax
51
54