Commit c53b6a0c by Eric Coissac

--no commit message

parent 340b78c0
......@@ -13,24 +13,46 @@ fastaObservedDist.py [-h|--help] <fastafile>"
-------------------------------------------------------
"""
import fileinput
import getopt
from obitools.fasta import fastaAligmentReader
from obitools.distances.observed import PairewiseGapRemoval
from obitools.align import alignmentReader
from obitools.fasta import fastaIterator
from obitools.distances.observed import PairewiseGapRemoval,Pairewise
from obitools.distances.r import writeRMatrix
from obitools.utils import checkHelpOption
from obitools.distances.phylip import writePhylipMatrix
from obitools.options import getOptionManager
def addOligoOptions(optionManager):
optionManager.add_option('-g','--with_gap',
action="store_true", dest="distmode",
default=False,
help="Take gap in account in distance evaluation")
optionManager.add_option('-r','--r_format',
action="store_true", dest="rformat",
default=False,
help="Output is formated to be read by R")
if __name__=='__main__':
checkHelpOption(__doc__)
optionParser = getOptionManager([addOligoOptions],
entryIterator=None
)
(options, entries) = optionParser()
ali = alignmentReader(entries, fastaIterator)
ali = fastaAligmentReader(fileinput.input())
dist= PairewiseGapRemoval(ali)
if options.distmode:
dist= Pairewise(ali)
else:
dist= PairewiseGapRemoval(ali)
print writeRMatrix(dist)
if options.rformat:
print writeRMatrix(dist)
else:
print writePhylipMatrix(dist)
\ No newline at end of file
......@@ -39,4 +39,39 @@ class PairewiseGapRemoval(DistanceMatrix):
diff,tot = reduce(lambda x,y: (x[0]+y,x[1]+1),
(z[0]!=z[1] for z in imap(None,seq1,seq2)
if '-' not in z),(0,0))
return float(diff)/tot
\ No newline at end of file
return float(diff)/tot
class Pairewise(DistanceMatrix):
'''
Observed divergeance matrix from an alignment.
Gap are kept from the alignemt
'''
def evaluateDist(self,x,y):
'''
Compute the observed divergeance from two sequences
of an aligment.
@attention For performance purpose this method should
be directly used. use instead the __getitem__
method from DistanceMatrix.
@see __getitem__
@param x number of the fisrt sequence in the aligment
@type x int
@param y umber of the second sequence in the aligment
@type y int
'''
seq1 = self.aligment[x]
seq2 = self.aligment[y]
diff,tot = reduce(lambda x,y: (x[0]+y,x[1]+1),
(z[0]!=z[1] for z in imap(None,seq1,seq2)),
(0,0))
return float(diff)/tot
\ No newline at end of file
......@@ -10,7 +10,7 @@ def writePhylipMatrix(matrix):
for n in pnames:
unicity[n]=unicity.get(n,0)+1
redundent.append(unicity[n])
print redundent
for i,n,r in imap(None,count(),pnames,redundent):
alternate = n
if r > 1:
......
......@@ -9,16 +9,17 @@ def _parseFasta(seq,bioseqfactory):
title = seq[0].strip()[1:].split(None,1)
id=title[0]
if len(title) == 2:
field = _fastaSplit.split(title[1])
info = dict(_fastaTag.findall(title[1]))
definition = _fastaTag.sub('',title[1]).strip()
else:
field=[]
info = dict(_fastaTag.findall(title[1]))
info= {}
definition=None
for k in info:
try:
info[k]=eval(info[k])
except:
pass
definition = _fastaTag.sub('',title[1]).strip()
seq=(''.join([x.strip() for x in seq[1:]]))
return bioseqfactory(id, seq, definition,**info)
......
from logging import debug
import time
import sys
_maxsize=0
_solution=0
def cliqueIterator(graph,minsize=1,node=None):
global _maxsize,_solution
_maxsize=0
_solution=0
starttime = time.time()
if node:
node = graph.getNode(node)
index = node.index
clique= set([index])
candidates= graph.neighbourIndexSet(index=index)
else:
clique=set()
candidates = set(x.index for x in graph)
# candidates = set(x for x in candidates
# if len(graph.neighbourIndexSet(index=x) & candidates) >= (minsize - 1))
for c in _cliqueIterator(graph,clique,candidates,set(),minsize,start=starttime):
yield c
def _cliqueIterator(graph,clique,candidates,notset,minsize=0,start=None):
global _maxsize,_solution
if len(clique)>_maxsize or not _solution % 1000 :
lcandidates = len(candidates)
if start is not None:
top = time.time()
delta = top - start
speed = _solution / delta
start = top
else:
speed = 0
print >>sys.stderr,"\rCandidates : %-5d Maximum clique size : %-5d Solutions explored : %10d speed = %5.2f solutions/sec " % (lcandidates,_maxsize,_solution,speed),
sys.stderr.flush()
if len(clique)>_maxsize:
_maxsize=len(clique)
if (len(clique)+len(candidates)) >= minsize:
if not candidates and not notset:
yield set(clique)
else:
while candidates:
# count explored solution
_solution+=1
nextcandidate = candidates.pop()
clique.add(nextcandidate)
# Speed indicator
nextcandidates= candidates & graph.neighbourIndexSet(index=nextcandidate)
nextnot = notset & graph.neighbourIndexSet(index=nextcandidate)
# potentialClique = nextcandidates | clique
# nextcandidates = set(x for x in nextcandidates
# if len(graph.neighbourIndexSet(index=x) & potentialClique) >= (minsize - 1))
#
#
#
# if (len(nextcandidates) + len(clique)) >= minsize:
#
# if nextcandidates:
for c in _cliqueIterator(graph,
set(clique),
nextcandidates,
nextnot,
minsize,
start):
yield c
clique.remove(nextcandidate)
notset.add(nextcandidate)
......@@ -78,14 +78,18 @@ def dnaWordIterator(options):
assert options.oligoSize is not None,"option -s or --oligo-size must be specified"
assert options.familySize is not None,"option -f or --family-size must be specified"
assert options.oligoDist is not None,"option -d or --distance must be specified"
words = wordIterator(options.oligoSize)
seed = 'a' * options.oligoSize
if not hasattr(options, "acceptedOligo"):
options.acceptedOligo=[]
if not hasattr(options, "rejectedOligo"):
options.rejectedOligo=[]
options.acceptedOligo.append(predicat.distMinGenerator(seed, options.oligoDist))
if options.homopolymere:
options.rejectedOligo.append(predicat.homoPolymerGenerator(options.homopolymere))
......
import re
from obitools.oligo import wordDist
def rePredicatGenerator(regex):
regex = re.compile(regex,re.I)
......@@ -15,3 +16,8 @@ def gcUpperBondGenerator(count):
def homoPolymerGenerator(count):
pattern = '(.)' + '\\1' * (count -1)
return rePredicatGenerator(pattern)
def distMinGenerator(word,dmin):
def predicat(w):
return w==word or wordDist(w, word) >= dmin
return predicat
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment