Commit fc8356f5 by Eric Coissac

--no commit message

parent 171bf79e
......@@ -7,13 +7,19 @@ from orgasm.indexer import _orgasm as indexer
from orgasm import samples
from orgasm.assembler import Assembler
from orgasm.graph import pathIterator
from orgasm.backtranslate.fasta import fasta
import sys
import time
import math
#if __name__ == '__main__':
r = indexer.Index('../../samples/locontella')
asm = Assembler(r,samples.lecontella,overlap=90)
p = fasta('../../samples/arabidopsis.chloro.prot.fasta')
asm = Assembler(r)
x = asm.lookForSeeds(p)
asm.run()
asm.simplifyAssembly()
......@@ -483,47 +489,200 @@ def cpe(self,probe):
return extension
In [1]: from orgasm.backtranslate import _ahocorasick as aho
In [2]: a = aho.AhoCarasick()
In [3]: a.addWord('acgtacgggtacac',1,14)
In [4]: a.addWord('acgtacggtcgaaa',1,14)
In [5]: a.addWord('acgtacggtcgtgc',1,14)
In [5]: a.addWord('gtcgaacgtcgaac',1,14)
In [7]: a.finalize()
In [8]: print >>open('ahocorasick.gml','w'),a.asGraph().gml()
from orgasm.backtranslate import _ahocorasick as aho
a = aho.AhoCarasick()
cox1_ble = "MTNMVRWLFSTNHKDIGTLYFIFGAIAGVMGTCFSVLIRMELARPGDQILGGNHQLYNVL" \
"ITAHAFLMIFFMVMPAMIGGFGNWFVPILIGAPDMAFPRLNNISFWLLPPSLLLLLSSAL" \
"VEVGSGTGWTVYPPLSGITSHSGGAVDLAIFSLHLSGISSILGSINFITTIFNMRGPGMT" \
"MHRLPLFVWSVLVTAFLLLLSLPVLAGAITMLLTDRNFNTTFFDPAGGGDPILYQHLFWF" \
"FGHPEVYILILPGFGIISHIVSTFSRKPVFGYLGMVYAMISIGVLGFLVWAHHMFTVGLD" \
"VDTRAYFTAATMIIAVPTGIKIFSWIATMWGGSIQYKTPMLFAVGFIFLFTIGGLTGIVL" \
"ANSGLDIALHDTYYVVAHFHYVLSMGAVFALFAGFYYWVGKIFGRTYPETLGQIHFWITF" \
"FGVNLTFFPMHFLGLSGMPRRIPDYPDAYAGWNALSSFGSYISVVGIRRFFVVVAITSSS" \
"GKNQKCAESPWAVEQNPTTLEWLVQSPPAFHTFGELPAVKETKS"
cox1_ble = "MTNM"
a.addProtein(cox1_ble)
print >>open('ahocorasick.gml','w'),a.asGraph().gml()
In [1]: from orgasm.backtranslate import _ahocorasick as aho
In [2]: a = aho.AhoCarasick()
In [3]: cox1_ble = "MTNMVRWLFSTNHKDIGTLYFIFGAIAGVMGTCFSVLIRMELARPGDQILGGNHQLYNVL" \
"ITAHAFLMIFFMVMPAMIGGFGNWFVPILIGAPDMAFPRLNNISFWLLPPSLLLLLSSAL" \
"VEVGSGTGWTVYPPLSGITSHSGGAVDLAIFSLHLSGISSILGSINFITTIFNMRGPGMT" \
"MHRLPLFVWSVLVTAFLLLLSLPVLAGAITMLLTDRNFNTTFFDPAGGGDPILYQHLFWF" \
"FGHPEVYILILPGFGIISHIVSTFSRKPVFGYLGMVYAMISIGVLGFLVWAHHMFTVGLD" \
"VDTRAYFTAATMIIAVPTGIKIFSWIATMWGGSIQYKTPMLFAVGFIFLFTIGGLTGIVL" \
"ANSGLDIALHDTYYVVAHFHYVLSMGAVFALFAGFYYWVGKIFGRTYPETLGQIHFWITF" \
"FGVNLTFFPMHFLGLSGMPRRIPDYPDAYAGWNALSSFGSYISVVGIRRFFVVVAITSSS" \
"GKNQKCAESPWAVEQNPTTLEWLVQSPPAFHTFGELPAVKETKS"
In [4]: a.addProtein(cox1_ble,kup=4)
In [5]: a.finalize()
In [6]: from orgasm import samples
In [7]: m = a.match(samples.sartidia_cox1)
def coverageEstimate(prot,match,lread=100,minscore=15):
cov={}
for k in prot:
lprot = len(prot[k]) * 3
basecount = 0
if k in match:
for m in match[k]:
if m[1] >=minscore:
basecount+=lread * m[2]
cov[k]=float(basecount) / lprot
return cov
def cmpPoints(p1,p2):
'''
Compare two extensions point.
INTERNAL functio uses for ordering extension point:
the smallest point is point closest to the initial extension point
with the highest coverage.
:param p1:
:type p1:
:param p2:
:type p2:
'''
rep = cmp(p2[0],p1[0])
if not rep:
rep= cmp(p1[4],p2[4])
return -rep
def cmpRead(r1,r2):
return cmp(r1[1],r2[1])
def buildGraph(self,
seeds,
minread=20, minratio=0.1,
mincov=2, minoverlap=40,
extlength=1, cleanBranch=False):
'''
:param index: the read index
:type index:
:param minread:
:type minread:
:param minratio:
:type minratio:
:param mincov:
:type mincov:
:param minoverlap:
:type minoverlap:
:param extlength:
:type extlength:
'''
readlength = self._index.getReadSize()
cycle = 0
extraid = int(10**math.ceil(math.log10(len(self._index)))) + 1
shiftid = extraid
lmax = 0
totreads = 0
points
# un point d'extension is :
# - the length of the path to the start read
# - insertion since last read
# - the sequence to use as probe
# - the readid on the ending node
# - the weight of the node
points = [(readlength,
0,
self._index.getRead(e[1],1,readlength-1),
e[1],
self.graph[e[1]]['weight']) for e in self.extensions]
while(points):
cycle+=1
points.sort(cmpPoints)
running = points.pop()
extensions={}
extlength=1
length,noreads,probe,nodeindex,w = running
lmax = max(lmax,length)
print >>sys.stderr,b"Cycle : %8d (%d nodes / %d reads) Waiting points : %8d (%d bp / %d bp) \r" % (cycle,len(self.graph),totreads,len(points),length,lmax),
attr = self.graph.getNodeAttr(nodeindex)
#read = index.getRead(nodeindex,0,readlength)
#
# Try to deal with homopolymer by stopping elongation
#
if probe=="A" * len(probe) :
extensions={}
# extensions = {'C':(1,[]),'G':(1,[]),'T':(1,[])}
elif probe=="C" * len(probe) :
extensions={}
# extensions = {'A':(1,[]),'G':(1,[]),'T':(1,[])}
elif probe=="G" * len(probe) :
extensions={}
# extensions = {'C':(1,[]),'A':(1,[]),'T':(1,[])}
elif probe=="T" * len(probe) :
extensions={}
# extensions = {'C':(1,[]),'G':(1,[]),'A':(1,[])}
else:
extensions = self._index.checkedExtensions(probe,
minread, minratio,
mincov, minoverlap,
extlength)
# print "\n",nodeindex," ++> ",extensions
if not extensions and cleanBranch:
f = self.isEnd(nodeindex)
while(f):
del self.graph[nodeindex]
nodeindex=f
f = self.isEnd(nodeindex)
for ex in extensions:
data = extensions[ex]
if data[1]:
reads = self.readType(data[1])
nodeindexE = reads[0]
#
# A read cannot link to itself
# --> homopolymer longer than the read length
#
if nodeindexE==nodeindex:
continue
else:
nodeindexE = extraid
extraid+=1
if nodeindexE==self.startread:
print >>sys.stderr,b"\nLooping over start read %d " % (self.startread)
alreadyin = nodeindexE in self.graph
self.graph.addEdge(nodeindex, nodeindexE)
#self._finalread=nodeindexE
attr = self.graph.getNodeAttr(nodeindexE)
if nodeindexE >= shiftid:
attr[b"label"]=''
attr[b'graphics']={b'fill':'#008000',
b'type':'rectangle'}
noreads+=1
else:
attr[b"label"]='%d (%d)' %(nodeindexE,reads[1])
attr[b"weight"]=reads[1]
attr[b"reads"]=reads[2]
totreads+=1
noreads=0
attr[b"position"]=length+len(ex)
attr = self.graph.getEdgeAttr(nodeindex, nodeindexE)
attr[b"label"]='%s [%d] (%d)' %(ex,length+len(ex),data[0])
attr[b'sequence']=ex
attr[b"type"]=b"next read"
attr[b"coverage"]=data[0]
attr['graphics']={'width':math.ceil(math.log(data[0])/math.log(2)),
'arrow':'last'}
if not alreadyin and noreads < (readlength - minoverlap):
points.append((length + len(ex),
noreads,
probe[1:]+ex,
nodeindexE,
data[0]))
def shanon(seq,kup=2):
w = {}
wc = 0.0
lbase=math.log(4**kup)
for x in range(len(seq) - kup + 1):
word = seq[x:x+kup]
wc+=1.0
w[word]=w.get(word,0)+1
s = 0.0
for word in w:
p = w[word]/wc
s+= p * math.log(p)
return - s / lbase
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment