1 """
2 implement fastn/fastp sililarity search algorithm for BioSequence.
3 """
4
6
8 '''
9 @param seq: sequence to hash
10 @type seq: BioSequence
11 @param kup: word size used for hashing process
12 @type kup: int
13 '''
14 hash={}
15 seq = str(seq)
16 for word,pos in ((seq[i:i+kup].upper(),i) for i in xrange(len(seq)-kup)):
17 if word in hash:
18 hash[word].append(pos)
19 else:
20 hash[word]=[pos]
21
22 self._kup = kup
23 self._hash= hast
24
26 '''
27 Align one sequence with the fast hash table.
28
29 @param seq: the sequence to align
30 @type seq: BioSequence
31
32 @return: where smax is the
33 score of the largest diagonal and pmax the
34 associated shift
35 @rtype: a int tuple (smax,pmax)
36 '''
37 histo={}
38 seq = str(seq)
39 hash= self._hash
40 kup = self._kup
41
42 for word,pos in ((seq[i:i+kup],i) for i in xrange(len(seq)-kup)):
43 matchedpos = hash.get(word,[])
44 for p in matchedpos:
45 delta = pos - p
46 histo[delta]=histo.get(delta,0) + 1
47 smax = max(histo.values())
48 pmax = [x for x in histo if histo[x]==smax]
49 return smax,pmax
50