Commit 7e50ffc1 by Eric Coissac

First version of the new index command

parent f8eac3bb
#cython: language_level=3
#from ..utils cimport str2bytes
cdef extern from "stdio.h":
struct FILE
int fprintf(FILE *stream, char *format, ...)
int fputs(char *string, FILE *stream)
FILE* stderr
ctypedef unsigned int off_t "unsigned long long"
cdef extern from "unistd.h":
int fsync(int fd);
cdef extern from "time.h":
struct tm :
int tm_yday
int tm_hour
int tm_min
int tm_sec
enum: CLOCKS_PER_SEC
ctypedef int time_t
ctypedef int clock_t
ctypedef int suseconds_t
struct timeval:
time_t tv_sec # seconds */
suseconds_t tv_usec # microseconds */
struct timezone :
int tz_minuteswest; # minutes west of Greenwich
int tz_dsttime; # type of DST correction
int gettimeofday(timeval *tv, timezone *tz)
tm *gmtime_r(time_t *clock, tm *result)
time_t time(time_t *tloc)
clock_t clock()
cdef class ProgressCounter:
cdef off_t digits
cdef off_t logeach
cdef clock_t starttime
cdef clock_t lasttime
cdef clock_t tickcount
cdef int freq
cdef int cycle
cdef int lastlog
cdef bint ontty
cdef int fd
cdef bytes _head
cdef char *chead
cdef bytes _unit
cdef char *cunit
cdef object logger
cdef clock_t clock(self)
#cython: language_level=3
'''
Created on 27 mars 2016
@author: coissac
'''
import sys
from ..utils import bytes2str,str2bytes
from .config cimport getConfiguration
from builtins import None
from orgasm.utils import str2bytes
cdef class ProgressCounter:
cdef clock_t clock(self):
cdef clock_t t
cdef timeval tp
cdef clock_t s
<void> gettimeofday(&tp,NULL)
s = <clock_t> (<double> tp.tv_usec * 1.e-6 * <double> CLOCKS_PER_SEC)
t = tp.tv_sec * CLOCKS_PER_SEC + s
return t
def __init__(self,
off_t digits,
off_t logeach=10000,
dict config={},
str head="",
str unit="",
double seconde=0.1):
self.starttime = self.clock()
self.lasttime = self.starttime
self.tickcount = <clock_t> (seconde * CLOCKS_PER_SEC)
self.freq = 1
self.cycle = 0
self.lastlog = 0
self.logeach = logeach
self.ontty = sys.stderr.isatty()
self.digits = digits
self._head = str2bytes(head)
self.chead= self._head
self._unit = str2bytes(unit)
self.cunit = self._unit
try:
if not config:
config=getConfiguration()
self.logger=config[config["__root_config__"]]["logger"]
except RuntimeError:
self.logger=None
def __call__(self,object pos):
cdef off_t ipos
cdef clock_t elapsed
cdef clock_t newtime
cdef clock_t delta
cdef clock_t more
cdef off_t fraction
cdef int twentyth
cdef double speed
self.cycle+=1
if self.cycle % self.freq == 0:
self.cycle=1
newtime = self.clock()
delta = newtime - self.lasttime
self.lasttime = newtime
elapsed = newtime - self.starttime
# print(" ",delta,elapsed,elapsed/CLOCKS_PER_SEC,self.tickcount)
if delta < self.tickcount / 5 :
self.freq*=2
elif delta > self.tickcount * 5 and self.freq>1:
self.freq/=2
if callable(pos):
ipos=pos()
else:
ipos=pos
if ipos==0:
ipos=1
speed = ipos / <double>elapsed * CLOCKS_PER_SEC
if self.ontty:
<void>fprintf(stderr,b'\r%s [%*d] speed : %*.1f %s/s',
self.chead,
self.digits,
ipos,
self.digits,
speed,
self.cunit)
twentyth = int(ipos / self.logeach)
if twentyth != self.lastlog and not self.ontty and self.logger is not None:
self.logger.info('%s [%*d] speed : %*.1f %s/s' % (
bytes2str(self._head),
self.digits,
ipos,
self.digits,
speed,
bytes2str(self._unit)))
self.lastlog=twentyth
else:
self.cycle+=1
property head:
def __get__(self):
return self._head
def __set__(self,str value):
self._head=str2bytes(value)
self.chead=self._head
......@@ -36,9 +36,6 @@ cdef class ProgressBar:
self.arrow = 0
self.lastlog = 0
if not config:
config=getConfiguration()
self.ontty = sys.stderr.isatty()
......@@ -50,7 +47,14 @@ cdef class ProgressBar:
self.chead= self._head
self.logger=config[config["__root_config__"]]["logger"]
try:
if not config:
config=getConfiguration()
self.logger=config[config["__root_config__"]]["logger"]
except RuntimeError:
self.logger=None
self.wheel = '|/-\\'
self.spaces=' ' \
' ' \
......@@ -73,7 +77,7 @@ cdef class ProgressBar:
cdef tm remain
cdef int days,hour,minu,sec
cdef off_t fraction
cdef int twentyth
cdef int tenth
self.cycle+=1
......@@ -129,7 +133,7 @@ cdef class ProgressBar:
hour,minu,sec)
tenth = int(percent * 10)
if tenth != self.lastlog and not self.ontty:
if tenth != self.lastlog and not self.ontty and self.logger is not None:
# if self.ontty:
# <void>fputs(b'\n',stderr)
......
......@@ -109,7 +109,7 @@ cdef class AhoCorasick:
cpdef finalize(self)
cdef int addCWord(self,char* cword, size_t lword, int protid, int position)
cdef int addAutomata(self,dict automata,int protid, int position)
cpdef dict match(self,bytes sequence)
cpdef list match(self,char* sequence)
cpdef DiGraphMultiEdge asGraph(self)
cdef class ProtAhoCorasick(AhoCorasick):
......
#cython: language_level=3, boundscheck=False, wraparound=False
from cpython.list cimport PyList_GET_ITEM,PyList_SET_ITEM,PyList_New
from ._ahocorasick cimport *
from orgasm.apps.progress cimport ProgressBar
......@@ -710,8 +712,8 @@ cdef class AhoCorasick:
#cython: initializedcheck=False
cpdef dict match(self,bytes sequence):
cdef char* csequence = sequence
cpdef list match(self,char* sequence):
# cdef char* csequence = sequence
cdef size_t lseq = len(sequence)
cdef size_t pos
cdef dnastate* state
......@@ -719,11 +721,14 @@ cdef class AhoCorasick:
cdef dnastate** table
cdef protmatch* pmatch
cdef int letter
cdef dict results = {}
cdef PyObject* plpos
cdef int rsize = self._maxseqid * 2 + 1
# cdef list results = PyList_New(rsize)
cdef list results = [[] for i in range(rsize)]
cdef list lpos
cdef int protid
# for i in range(rsize):
# PyList_SET_ITEM(results,i,[])
if not self._finalized:
self.finalize()
......@@ -732,19 +737,21 @@ cdef class AhoCorasick:
for pos in range(lseq):
table = &(state.a)
letter = csequence[pos]
letter = sequence[pos]
letter >>=1
letter &=3
nstate = table[letter]
pmatch = nstate.match
while (pmatch!=NULL):
plpos = PyDict_GetItem(results,PyInt_FromLong(pmatch.protid))
if plpos==NULL:
lpos = []
PyDict_SetItem(results,PyInt_FromLong(pmatch.protid),lpos)
if pmatch.protid < 0:
lpos = <object>PyList_GET_ITEM(results,rsize + pmatch.protid)
#lpos=results[rsize + pmatch.protid]
else:
lpos = <object>plpos
lpos.append((pos,pmatch.position))
lpos = <object>PyList_GET_ITEM(results,pmatch.protid)
#lpos=results[pmatch.protid]
PyList_Append(lpos,(pos,pmatch.position))
#lpos.append((pos,pmatch.position))
pmatch = pmatch.next
state = nstate
......
......@@ -154,6 +154,16 @@ def addOptions(parser):
'among %s' % str(list(filter(lambda s: s.startswith('prot') or
s.startswith('nuc'),dir(orgasm.samples)))))
parser.add_argument('--phiX', dest='orgasm:phix',
action='store_true',
default=None,
help='activate the filtering of Phi-X174 sequences [default]')
parser.add_argument('--phiX-off', dest='orgasm:phix',
action='store_false',
default=None,
help='desactivate the filtering of Phi-X174 sequences')
parser.add_argument('--no-seeds', dest ='orgasm:noseeds',
metavar='no-seeds',
action='append',
......
......@@ -26,6 +26,16 @@ def addOptions(parser):
help='output prefix' )
parser.add_argument('--phiX', dest='orgasm:phix',
action='store_true',
default=None,
help='activate the filtering of Phi-X174 sequences [default]')
parser.add_argument('--phiX-off', dest='orgasm:phix',
action='store_false',
default=None,
help='desactivate the filtering of Phi-X174 sequences')
parser.add_argument('--seeds', dest ='orgasm:seeds',
metavar='seeds',
action='append',
......
#cython: language_level=3, boundscheck=False, wraparound=False
#cython: language_level=3, boundscheck=False, wraparound=False, cdivision=True
from cpython.array cimport array
from orgasm.apps.counter cimport ProgressCounter
......@@ -15,11 +15,12 @@ from pickle import dumps,loads
import sys
from orgasm.samples.phix import phix
from jsonschema._validators import minLength
__logger__ = None
cdef dict __stats__ = {}
cdef int __interval__=1000
cdef int __interval__=10000
def logOrPrint(message,level='info'):
global __logger__
......@@ -284,7 +285,7 @@ def PEQualityTrimmer(PEiterator, int quality, int shift=33):
#cython: initializedcheck=False
def PELengthEstimate(PEiterator, double threshold=0.9):
def PELengthEstimate(PEiterator, double threshold=0.9,int minlength=81):
cdef tuple pair
cdef tuple forward
cdef tuple reverse
......@@ -322,6 +323,11 @@ def PELengthEstimate(PEiterator, double threshold=0.9):
print("",file=sys.stderr)
logOrPrint("Indexing length estimated to : %dbp" % lcut)
if lcut < minlength:
lcut=minLength
logOrPrint("Estimated length smaller than : %dbp" % minlength)
logOrPrint("Indexing length reset to : %dbp" % lcut)
for lpair,cpair in store:
if lpair >= lcut:
......@@ -338,6 +344,9 @@ def PELengthEstimate(PEiterator, double threshold=0.9):
reverse[2][rb:(rb+lcut)]))
def PELengthCut(PEiterator, int length):
def PESkipFirstReads(PEI,int skip=0):
cdef int n = 0
......@@ -352,11 +361,12 @@ def PEMaxReadCount(PEI, int readCount):
for n in range(readCount):
yield next(PEI)
#cython: initializedcheck=False
def PEPhiXCleaner(PEiterator):
cdef tuple pair
cdef tuple forward
cdef tuple reverse
cdef dict fmatch,rmatch
cdef char* forward
cdef char* reverse
cdef list fmatch,rmatch
cdef int nf,nr
cdef int reject=0
cdef AhoCorasick aho=NucAhoCorasick()
......@@ -364,14 +374,16 @@ def PEPhiXCleaner(PEiterator):
aho.finalize()
for pair in PEiterator:
forward,reverse = pair
nf=(len(forward[1])-11) // 2
nr=(len(reverse[1])-11) // 2
fmatch=aho.match(forward[1])
rmatch=aho.match(reverse[1])
if (len(fmatch.get(1,())) >= nf or len(fmatch.get(-1,())) >= nf or
len(rmatch.get(1,())) >= nr or len(rmatch.get(-1,())) >= nr):
forward = pair[0][1]
reverse = pair[1][1]
nf=(len(forward)-11) // 2
nr=(len(reverse)-11) // 2
fmatch=aho.match(forward)
rmatch=aho.match(reverse)
if (len(fmatch[1]) >= nf or len(fmatch[2]) >= nf or
len(rmatch[1]) >= nr or len(rmatch[2]) >= nr):
reject+=1
if reject % __interval__ ==0:
logOrPrint('%d phiX174 sequence pairs rejected' % reject)
......@@ -398,7 +410,9 @@ def readPairedEnd(filenames,
int cut=4,
int badQualityLimit=28,
int qualityCut=0,
int length=0,
double lengthestimate=0.9,
int minlength=81,
int skip=0,
int maxread=0,
bint phix=True,
......@@ -448,8 +462,10 @@ def readPairedEnd(filenames,
if phix:
PEI = PEPhiXCleaner(PEI)
if lengthestimate > 0:
PEI = PELengthEstimate(PEI,lengthestimate)
if length>0 :
PEI =
elif lengthestimate > 0:
PEI = PELengthEstimate(PEI,lengthestimate,minlength)
if maxread > 0:
PEI = PEMaxReadCount(PEI,maxread)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment