Commit 8d076d3e by Eric Coissac

Switch to python 3

parent 683691d9
......@@ -8,7 +8,9 @@ from distutils import log
import os
from Cython.Distutils import build_ext as ori_build_ext # @UnresolvedImport
from Cython.Compiler import Options as cython_options # @UnresolvedImport
from distutils.errors import DistutilsSetupError
class build_ext(ori_build_ext):
......@@ -84,6 +86,7 @@ class build_ext(ori_build_ext):
def run(self):
self.modifyDocScripts()
cython_options.annotate = True
ori_build_ext.run(self) # @UndefinedVariable
......
......@@ -62,7 +62,10 @@ def findCython(root,base=None,pyrexs=None):
cfiles = [x for x in cfiles if x[-2:]==".c"]
pyrexs[-1].sources.extend(cfiles)
pyrexs[-1].include_dirs.extend(incdir)
pyrexs[-1].extra_compile_args.extend(['-msse2','-Wno-unused-function','-Wmissing-braces'])
pyrexs[-1].extra_compile_args.extend(['-msse2',
'-Wno-unused-function',
'-Wmissing-braces',
'-Wchar-subscripts'])
except IOError:
pass
......
......@@ -206,7 +206,7 @@ class ApiDocWriter(object):
# get the names of all classes and functions
functions, classes = self._parse_module(uri)
if not len(functions) and not len(classes):
print 'WARNING: Empty -',uri # dbg
print('WARNING: Empty -',uri) # dbg
return ''
# Make a shorter version of the uri that omits the package name for
......
......@@ -6,9 +6,9 @@ import inspect
import textwrap
import re
import pydoc
from StringIO import StringIO
from io import StringIO
from warnings import warn
4
class Reader(object):
"""A line-based string reader.
......@@ -113,7 +113,7 @@ class NumpyDocString(object):
return self._parsed_data[key]
def __setitem__(self,key,val):
if not self._parsed_data.has_key(key):
if key not in self._parsed_data:
warn("Unknown section %s" % key)
else:
self._parsed_data[key] = val
......@@ -369,7 +369,7 @@ class NumpyDocString(object):
idx = self['index']
out = []
out += ['.. index:: %s' % idx.get('default','')]
for section, references in idx.iteritems():
for section, references in idx.items():
if section == 'default':
continue
out += [' :%s: %s' % (section, ', '.join(references))]
......@@ -413,10 +413,10 @@ class FunctionDoc(NumpyDocString):
doc = inspect.getdoc(func) or ''
try:
NumpyDocString.__init__(self, doc)
except ValueError, e:
print '*'*78
print "ERROR: '%s' while parsing `%s`" % (e, self._f)
print '*'*78
except ValueError as e:
print('*'*78)
print("ERROR: '%s' while parsing `%s`" % (e, self._f))
print('*'*78)
#print "Docstring follows:"
#print doclines
#print '='*78
......@@ -429,7 +429,7 @@ class FunctionDoc(NumpyDocString):
argspec = inspect.formatargspec(*argspec)
argspec = argspec.replace('*','\*')
signature = '%s%s' % (func_name, argspec)
except TypeError, e:
except TypeError as e:
signature = '%s()' % func_name
self['Signature'] = signature
......@@ -452,7 +452,7 @@ class FunctionDoc(NumpyDocString):
if self._role:
if not roles.has_key(self._role):
print "Warning: invalid role %s" % self._role
print("Warning: invalid role %s" % self._role)
out += '.. %s:: %s\n \n\n' % (roles.get(self._role,''),
func_name)
......@@ -494,4 +494,3 @@ class ClassDoc(NumpyDocString):
return out
......@@ -73,7 +73,7 @@ class SphinxDocString(NumpyDocString):
return out
out += ['.. index:: %s' % idx.get('default','')]
for section, references in idx.iteritems():
for section, references in idx.items():
if section == 'default':
continue
elif section == 'refguide':
......@@ -133,4 +133,3 @@ def get_doc_object(obj, what=None, doc=None):
if doc is None:
doc = pydoc.getdoc(obj)
return SphinxDocString(doc)
......@@ -37,7 +37,7 @@ class IPythonConsoleLexer(Lexer):
In [2]: a
Out[2]: 'foo'
In [3]: print a
In [3]: print(a)
foo
In [4]: 1 / 0
......
......@@ -49,7 +49,7 @@ def mangle_docstrings(app, what, name, obj, options, lines,
try:
references.append(int(l[len('.. ['):l.index(']')]))
except ValueError:
print "WARNING: invalid reference in %s docstring" % name
print("WARNING: invalid reference in %s docstring" % name)
# Start renaming from the biggest number, otherwise we may
# overwrite references.
......@@ -104,7 +104,7 @@ def monkeypatch_sphinx_ext_autodoc():
if sphinx.ext.autodoc.format_signature is our_format_signature:
return
print "[numpydoc] Monkeypatching sphinx.ext.autodoc ..."
print("[numpydoc] Monkeypatching sphinx.ext.autodoc ...")
_original_format_signature = sphinx.ext.autodoc.format_signature
sphinx.ext.autodoc.format_signature = our_format_signature
......
#!/usr/bin/env python
################################################
# setenv PYTHONPATH ./orgasm/build/lib.macosx-10.6-intel-2.7/
# echo $PYTHONPATH
# ./orgasm/build/lib.macosx-10.6-intel-2.7/
# python orgasm-1.py ./samples/GWM-261 ./samples/COX1_Machaon.fasta machaon
from orgasm.tango import tango, \
matchtoseed, \
cutLowCoverage, \
scaffold, \
coverageEstimate, \
fillGaps, \
path2fasta, \
unfoldAssembling, \
cutSNPs, \
selectGoodComponent, \
estimateDeadBrancheLength, pairEndedConstraints
from orgasm.indexer import Index
from orgasm.backtranslate.fasta import fasta
from orgasm.assembler import Assembler
#from orgasm.multialign import multiAlignReads, \
# consensus
import sys
import pickle
def estimateMinRead(index, minoverlap, coverage):
    """Estimate the minimum read count required for an extension.

    Parameters
    ----------
    index : Index
        Read index; only its ``getReadSize()`` method is used here.
    minoverlap : int
        Minimum overlap length (bases) between a read and the sequence.
    coverage : int
        Estimated sequencing coverage.

    Returns
    -------
    int
        Minimum number of reads to consider.

    Notes
    -----
    Uses floor division (``//``): under Python 3 ``/`` is true division
    and would return a float where callers expect an integer count
    (they format it with ``%d``).
    """
    return (index.getReadSize() - minoverlap) * coverage // index.getReadSize() // 2
if __name__ == '__main__':
    # NOTE(review): this block was pasted with all indentation stripped;
    # the structure below is reconstructed from the control-flow keywords.
    # The extent of the `while fillGaps` body (through the intermediate
    # .gml dump) is inferred — confirm against the original file.
    # Ported to Python 3: print() function with file=, binary pickle
    # modes, and integer (//) indexing, consistent with the rest of the
    # "Switch to python 3" commit.
    import argparse

    # Declare script options
    parser = argparse.ArgumentParser(description='Assembly program dedicated to organel genome reconstruction.')

    parser.add_argument('indexFilename', metavar='index', help='index root filename (produced by orgasmi)')
    parser.add_argument('seedsFilenameAA', metavar='seeds', help='fasta file containing seeds proteic sequences')
    parser.add_argument('outputFilename', metavar='output', help='output prefix')
    parser.add_argument('--minread', dest='minread', type=int, action='store', default=None, help='the minimum count of read to consider [default: %(default)d]')
    parser.add_argument('--minratio', dest='minratio', type=float, action='store', default=0.1, help='minimum ratio between occurrences of an extension and the occurrences of the most frequent extension to keep it. [default: %(default)f]')
    parser.add_argument('--mincov', dest='mincov', type=int, action='store', default=1, help='minimum occurrences of an extension to keep it. [default: %(default)d]')
    parser.add_argument('--minoverlap', dest='minoverlap', type=int, action='store', default=40, help='minimum length of the overlap between the sequence and reads to participate in the extension. [default: %(default)d]')
    parser.add_argument('--lowcomplexity', dest='lowcomplexity', action='store_true', default=False, help='Use also low complexity probes')
    parser.add_argument('--back', dest='back', type=int, action='store', default=500, help='the number of bases taken at the end of contigs to jump with pared-ends [default: %(default)d]')
    parser.add_argument('--snp', dest='snp', action='store_false', default=True, help='desactivate the SNP clearing mode')
    parser.add_argument('--logfile', dest='logfile', action='store_true', default=False, help='Create a logfile for the assembling')
    #parser.add_argument('--maxjump', dest='maxjump', type=int, action='store', default=1, help='COunt of pair end jump by extension branch')

    args = parser.parse_args()

    if args.logfile:
        logfile = open(args.outputFilename + '.log', "w")

    # Load the read index
    r = Index(args.indexFilename)

    # looks if seed protein are a file name or an internal list
    try:
        p = fasta(args.seedsFilenameAA)
        if args.logfile:
            print("Load protein sequences from file : %s" % args.seedsFilenameAA, file=logfile)
    except IOError:
        import orgasm.samples
        p = getattr(orgasm.samples, args.seedsFilenameAA)
        if args.logfile:
            print("Load protein internal dataset : %s" % args.seedsFilenameAA, file=logfile)

    if args.logfile:
        logfile.flush()

    # looks if the internal blast was already run
    try:
        # yes, load the previous results
        # 'rb': py3 pickle files must be opened in binary mode
        x = pickle.load(open(args.outputFilename + '.match.omx', 'rb'))
        if args.logfile:
            print("Load protein match from previous run : %d matches" % sum(len(x[i]) for i in x), file=logfile)
    except:
        # no, run it and save the results
        if args.logfile:
            # end='' mirrors the py2 trailing-comma print (no newline)
            print("Running protein matching against reads...", end='', file=logfile)
            logfile.flush()
        x = r.lookForSeeds(p, covmin=1)
        if args.logfile:
            print(" : %d matches" % sum(len(x[i]) for i in x), file=logfile)
            logfile.flush()
        pickle.dump(x, open(args.outputFilename + '.match.omx', "wb"))

    # First estimation of the coverage based on protein matches
    common = set(p.keys()) & set(x.keys())
    pcoverage = [len(x[i]) * r.getReadSize() / len(p[i]) / 3 for i in common]
    pcoverage.sort()
    print(pcoverage, file=sys.stderr)

    #coverage=mode(pcoverage)
    # // : a list index must be an int under Python 3
    coverage = pcoverage[len(pcoverage) // 2]

    # according to the minread option estimate it from coverage or use the specified value
    if args.minread is None:
        minread = estimateMinRead(r, args.minoverlap, coverage)
    else:
        minread = args.minread

    print("\nestimated coverage : %d based on protein match (minread: %d)" % (coverage, minread), file=sys.stderr)
    if args.logfile:
        print("\nestimated coverage : %d based on protein match (minread: %d)" % (coverage, minread), file=logfile)
        logfile.flush()

    # Convert matches in seed list
    s = matchtoseed(x, r)

    # Create the assembler object
    asm = Assembler(r)

    # Run the first assembling pass
    a = tango(asm, s, mincov=args.mincov,
              minread=minread,
              minoverlap=args.minoverlap,
              lowfilter=not args.lowcomplexity,
              maxjump=0, cycle=1)

    # Clean small unsuccessful extensions
    asm.cleanDeadBranches(maxlength=10)

    # and too low covered assembling
    cutLowCoverage(asm, int(coverage / 4), terminal=True)

    # cleanup snp bubble in the graph
    if args.snp:
        cutSNPs(asm)

    # reestimate coverage
    score, length, coverage = coverageEstimate(asm)

    if args.minread is None:
        minread = estimateMinRead(r, args.minoverlap, coverage)
    else:
        minread = args.minread

    print("coverage estimated : %d based on %d bp (minread: %d)" % (coverage, length, minread), file=sys.stderr)
    if args.logfile:
        print("coverage estimated : %d based on %d bp (minread: %d)" % (coverage, length, minread), file=logfile)
        logfile.flush()

    # Run the fill gap procedure
    while fillGaps(asm, back=args.back,
                   minread=minread,
                   maxjump=0,
                   minoverlap=args.minoverlap,
                   cmincov=0,
                   gmincov=int(coverage / 4),
                   snp=args.snp) > 0:
        print(file=sys.stderr)
        print(file=sys.stderr)
        print(file=sys.stderr)
        print("==================================================================", file=sys.stderr)
        print(file=sys.stderr)
        # Clean small unsuccessful extensions
        asm.cleanDeadBranches(maxlength=10)
        # and too low covered assembling
        cutLowCoverage(asm, int(coverage / 3), terminal=True)
        # reestimate coverage
        score, length, coverage = coverageEstimate(asm)
        if args.minread is None:
            minread = estimateMinRead(r, args.minoverlap, coverage)
        else:
            minread = args.minread
        print("coverage estimated : %d based on %d bp (minread: %d)" % (coverage, length, minread), file=sys.stderr)
        cg = asm.compactAssembling(verbose=False)
        scaffold(asm, cg, minlink=100, back=int(args.back / 2), addConnectedLink=False)
        print(cg.gml(), file=open(args.outputFilename + '.intermediate.gml', 'w'))

    print(file=sys.stderr)
    print("==================================================================", file=sys.stderr)
    print(file=sys.stderr)

    cutSNPs(asm)
    smallbranches = estimateDeadBrancheLength(asm)
    asm.cleanDeadBranches(maxlength=smallbranches)
    cg = asm.compactAssembling(verbose=False)
    score, length, coverage = coverageEstimate(asm)
    if args.snp:
        cutSNPs(asm)
    cutLowCoverage(asm, int(coverage / 2), terminal=True)
    cutLowCoverage(asm, int(coverage / 10), terminal=False)
    cg = asm.compactAssembling(verbose=False)
    constraints = pairEndedConstraints(asm, cg, back=int(args.back))
    scaffold(asm, cg, minlink=5, back=int(args.back), addConnectedLink=False)
    print(cg.gml(), file=open(args.outputFilename + '.gml', 'w'))

    fastaout = open(args.outputFilename + '.fasta', 'w')
    gcc = selectGoodComponent(cg)
    for seeds in gcc:
        path = unfoldAssembling(asm, cg, seeds=seeds)
        fa = path2fasta(asm, cg, path[-1][0],
                       identifier=args.outputFilename,
                       back=args.back,
                       minlink=5)
        print(fa, file=fastaout)
import pickle
import sys
from orgasm.indexer import Index
from .indexer import Index
from backtranslate.fasta import fasta
import samples
from .backtranslate.fasta import fasta
from . import samples
from orgasm.utils.dna import isDNA
......@@ -76,19 +77,42 @@ def getSeeds(index,probes,config,extension=".omx"):
# looks if the internal blast was already run
output = config['orgasm']['outputfilename']
logger=config['orgasm']['logger']
filename=output+extension
try:
# yes, load the previous results
seeds = pickle.load(open(output+extension))
with open(filename,'rb') as fseeds:
seeds = pickle.load(fseeds)
logger.info("Load matches from previous run : %d matches" % sum(len(seeds[i]) for i in seeds))
except:
except FileNotFoundError:
# no, run it and save the results
logger.info("Running probes matching against reads...")
seeds = index.lookForSeeds(probes,mincov=config['buildgraph']['mincov'])
seeds = index.lookForSeeds(probes,mincov=config['buildgraph']['mincov'],logger=logger)
logger.info("==> %d matches" % sum(len(seeds[i]) for i in seeds))
with open(filename,"wb") as fseeds:
pickle.dump(seeds,fseeds)
pickle.dump(seeds,open(output+extension,"w"))
if probes is not None:
nuc = all([isDNA(probes[k]) for k in probes])
nbmatch = [(k,
len(seeds[k]),
len(seeds[k])*index.getReadSize() / len(probes[k]) / (1 if nuc else 3)) for k in seeds]
nbmatch.sort(key=lambda x:-x[2])
logger.info("Match list :")
for gene, nb, cov in nbmatch:
logger.info(" %-10s : %5d (%5.1fx)" % (gene,nb,cov))
coverage=nbmatch[0][2]
else:
nbmatch = [(k,len(seeds[k])) for k in seeds]
nbmatch.sort(key=lambda x:-x[1])
logger.info("Match list :")
for gene, nb in nbmatch:
logger.info(" %-10s : %5d" % (gene,nb))
coverage=None
return seeds
return coverage,seeds
#def reloadAssembling
\ No newline at end of file
......@@ -13,4 +13,5 @@ The :py:mod:`orgasm.assembler` python package provide the :class:`Assembler` cla
which manage the assembling process.
"""
from _assembler import Assembler
from ._assembler import Assembler
from ._assembler import buildstem
# cython: language_level=3
from orgasm.graph._graph cimport *
from orgasm.indexer._orgasm cimport *
......
from _asmbgraph cimport *
# cython: language_level=3
from ._asmbgraph cimport *
cdef int sign(int x):
......@@ -16,7 +18,7 @@ cdef class AsmbGraph(DiGraph):
identical reads in the graph
"""
def __init__(self, bytes name, Index readIndex):
def __init__(self, str name, Index readIndex):
'''
'''
......@@ -45,25 +47,25 @@ cdef class AsmbGraph(DiGraph):
id,coverage,equiv = self._readIndex.getIds(node)
if id==0:
print "#####> ",node,id,equiv
print("#####> %s %s %s" % (node,id,equiv))
snprintf(pbuffer,49,"%d",id)
nbuffer[0]='-'
snprintf(pbuffer,49,b"%d",id)
nbuffer[0]=b'-'
pkey = pbuffer
nkey = nbuffer
if pkey not in self._nodes:
e5pc = self._readIndex.getRead(id,0,1)
e5pc = {'A':'T','C':'G','G':'C','T':'A'}[e5pc]
e5pc = {b'A':b'T',b'C':b'G',b'G':b'C',b'T':b'A'}[e5pc]
e3p = self._readIndex.getRead(id,self._readIndex.getReadSize()-1,1)
self._nodes[pkey]=(set(),set())
self._nodes[nkey]=(set(),set())
node_attrs={b"label" : pkey,
b'coverage':coverage,
b'reads':equiv,
b'e5pc':e5pc,
b'e3p':e3p}
node_attrs={"label" : pkey,
'coverage':coverage,
'reads':equiv,
'e5pc':e5pc,
'e3p':e3p}
self._nodes_attrs[pkey]=node_attrs
self._nodes_attrs[nkey]=node_attrs
else:
......@@ -99,11 +101,11 @@ cdef class AsmbGraph(DiGraph):
d2 = 1 if node2 in equiv else -1
snprintf(pbuffer1,49,"%d",id1)
nbuffer1[0]='-'
snprintf(pbuffer1,49,b"%d",id1)
nbuffer1[0]=b'-'
snprintf(pbuffer2,49,"%d",id2)
nbuffer2[0]='-'
snprintf(pbuffer2,49,b"%d",id2)
nbuffer2[0]=b'-'
if d1==1:
okey1=pbuffer1
......@@ -157,7 +159,13 @@ cdef class AsmbGraph(DiGraph):
DiGraph.deleteNode(self, -node)
cpdef dict getNodeAttr(self, int node):
return DiGraph.getNodeAttr(self, self._readIndex.getIds(node)[0])
cdef int inode
try:
inode = self._readIndex.getIds(node)[0]
except IndexError as e:
raise IndexError("%s --> index requested %d" % (e.args[0],node))
return DiGraph.getNodeAttr(self, inode)
from ._asmbgraph cimport AsmbGraph
from orgasm.graph._graphmultiedge cimport DiGraphMultiEdge
from orgasm.indexer._orgasm cimport Index
cdef class Assembler:
cdef AsmbGraph _graph
cdef Index _index
cdef int _overlap
cdef set extensionPoints
cdef int _startread
cdef int _finalread
cdef list _seeds
cdef dict _annotations
cpdef tuple readType(self,ids)
cpdef dict buildstem(Assembler assembler,
int first,
int last,
bytes sequence,
list path,
bint circle)
# cdef class Stem(dict):
# cdef int _first
# cdef int _last
# cdef int _length
# cdef bytes _sequence
# cdef list _path
# cdef bint _palindrome
# cdef str _label
# cdef int _weight
# cdef bint _circle
# cdef int _stemid
#
# cdef bint isPalindrome(self)
# cdef str buildLabel(self)
# cdef int getWeight(self, Assembler graph) except 0
cdef class CompactAssembling(DiGraphMultiEdge):
cdef dict _paths
cdef dict _stemid
cdef bint _stemidOk
cdef Assembler _assembler
cdef int getStemid(self, dict stem) except 0
cdef void setStemid(self)
cpdef dict addStem(self, dict stem)
cdef class StemIterator:
cdef dict edgeName
cdef Assembler _assembler
cdef AsmbGraph _graph
import copy
from orgasm.backtranslate._ahocorasick import AhoCorasick
from ._ahocorasick import AhoCorasick
UniversalGeneticCode = {"F" :("TTT","TTC"),
"L" :("TTA","TTG","TAG","CTA",'CTG',"CTC","CTT"),
......@@ -78,14 +78,14 @@ def buildAutomata(sequence,prot="prot",kup=4):
for pos in xrange(len(sequence)-kup+1):
peptide = sequence[pos:(pos+kup)]
print peptide
print(peptide)
bt= [listCodons2Dict(UniversalGeneticCode[peptide[x]],
(pos+x)*3,
prot)
for x in xrange(kup)
]
for shift in xrange(kup-1,0,-1):
print shift
print(shift)
connect2codons(bt[shift-1],bt[shift],prot)
autopep = bt[0]
......
#!python
#cython: linetrace=True
#
# C API to python object
#
# cython: language_level=3
from cpython.ref cimport PyObject,Py_XDECREF
......
from _ahocorasick cimport *
# cython: language_level=3
from ._ahocorasick cimport *
from orgasm.utils._progress cimport progressBar
from orgasm.utils.dna import reverseComplement
from _bitvector cimport BitVector
from ._bitvector cimport BitVector
import time
import math
......@@ -61,7 +63,6 @@ cdef dict listCodons2Dict(tuple codons):
cdef dict s
cdef dict new
cdef char* cc
cdef PyObject* pvalue
cdef dict value
cdef int l
cdef char letter[2]
......@@ -74,14 +75,12 @@ cdef dict listCodons2Dict(tuple codons):
cc= c
for l in range(3):
letter[0]=cc[l]
pvalue = PyDict_GetItemString(s, letter)
if pvalue==NULL:
value = PyDict_New()
PyDict_SetItemString(s,letter,value)
else:
value = <object>pvalue
value = s.get(letter,None)
if value is None:
value = {}
s[letter]=value
s=value
return d
cpdef dict bindCodons(dict codon1, dict codon2):
......@@ -114,10 +113,14 @@ cpdef dict codon(char aa, bint direct=True):
letter[0]=aa
letter[1]=0
if direct:
value = <object>PyDict_GetItemString(UniversalGeneticCode,letter)
# value = <object>PyDict_GetItemString(UniversalGeneticCode,letter)
value = UniversalGeneticCode[letter]
else:
value = <object>PyDict_GetItemString(CompUniversalGeneticCode,letter)
# value = <object>PyDict_GetItemString(CompUniversalGeneticCode,letter)
value = CompUniversalGeneticCode[letter]
return listCodons2Dict(value)
cpdef bint homopolymer(bytes s):
......@@ -164,7 +167,7 @@ cpdef dict word2automata(wordlist):
return automata
cdef double* buildShanonTable(size_t readsize, size_t* offset):
cdef size_t codonmax = readsize / 3 + (1 if (readsize % 3) else 0)
cdef size_t codonmax = readsize // 3 + (1 if (readsize % 3) else 0)
cdef size_t i=codonmax+1
cdef size_t j=0
cdef size_t k=0
......@@ -847,7 +850,7 @@ cdef class ProtAhoCorasick(AhoCorasick):
return 0
if self._seqid.has_key(seqid):
if seqid in self._seqid:
id = self._seqid[seqid]