Commit 09bd36be by Eric Coissac

Add function for matching protein sequence on reads based on an Aho-Corasick automata

parent 9881decf
......@@ -483,3 +483,47 @@ def cpe(self,probe):
return extension
In [1]: from orgasm.backtranslate import _ahocorasick as aho
In [2]: a = aho.AhoCarasick()
In [3]: a.addWord('acgtacgggtacac',1,14)
In [4]: a.addWord('acgtacggtcgaaa',1,14)
In [5]: a.addWord('acgtacggtcgtgc',1,14)
In [5]: a.addWord('gtcgaacgtcgaac',1,14)
In [7]: a.finalize()
In [8]: print >>open('ahocorasick.gml','w'),a.asGraph().gml()
from orgasm.backtranslate import _ahocorasick as aho
a = aho.AhoCarasick()
cox1_ble = "MTNMVRWLFSTNHKDIGTLYFIFGAIAGVMGTCFSVLIRMELARPGDQILGGNHQLYNVL" \
"ITAHAFLMIFFMVMPAMIGGFGNWFVPILIGAPDMAFPRLNNISFWLLPPSLLLLLSSAL" \
"VEVGSGTGWTVYPPLSGITSHSGGAVDLAIFSLHLSGISSILGSINFITTIFNMRGPGMT" \
"MHRLPLFVWSVLVTAFLLLLSLPVLAGAITMLLTDRNFNTTFFDPAGGGDPILYQHLFWF" \
"FGHPEVYILILPGFGIISHIVSTFSRKPVFGYLGMVYAMISIGVLGFLVWAHHMFTVGLD" \
"VDTRAYFTAATMIIAVPTGIKIFSWIATMWGGSIQYKTPMLFAVGFIFLFTIGGLTGIVL" \
"ANSGLDIALHDTYYVVAHFHYVLSMGAVFALFAGFYYWVGKIFGRTYPETLGQIHFWITF" \
"FGVNLTFFPMHFLGLSGMPRRIPDYPDAYAGWNALSSFGSYISVVGIRRFFVVVAITSSS" \
"GKNQKCAESPWAVEQNPTTLEWLVQSPPAFHTFGELPAVKETKS"
cox1_ble = "MTNM"
a.addProtein(cox1_ble)
print >>open('ahocorasick.gml','w'),a.asGraph().gml()
In [1]: from orgasm.backtranslate import _ahocorasick as aho
In [2]: a = aho.AhoCarasick()
In [3]: cox1_ble = "MTNMVRWLFSTNHKDIGTLYFIFGAIAGVMGTCFSVLIRMELARPGDQILGGNHQLYNVL" \
"ITAHAFLMIFFMVMPAMIGGFGNWFVPILIGAPDMAFPRLNNISFWLLPPSLLLLLSSAL" \
"VEVGSGTGWTVYPPLSGITSHSGGAVDLAIFSLHLSGISSILGSINFITTIFNMRGPGMT" \
"MHRLPLFVWSVLVTAFLLLLSLPVLAGAITMLLTDRNFNTTFFDPAGGGDPILYQHLFWF" \
"FGHPEVYILILPGFGIISHIVSTFSRKPVFGYLGMVYAMISIGVLGFLVWAHHMFTVGLD" \
"VDTRAYFTAATMIIAVPTGIKIFSWIATMWGGSIQYKTPMLFAVGFIFLFTIGGLTGIVL" \
"ANSGLDIALHDTYYVVAHFHYVLSMGAVFALFAGFYYWVGKIFGRTYPETLGQIHFWITF" \
"FGVNLTFFPMHFLGLSGMPRRIPDYPDAYAGWNALSSFGSYISVVGIRRFFVVVAITSSS" \
"GKNQKCAESPWAVEQNPTTLEWLVQSPPAFHTFGELPAVKETKS"
In [4]: a.addProtein(cox1_ble,kup=4)
In [5]: a.finalize()
In [6]: from orgasm import samples
In [7]: m = a.match(samples.sartidia_cox1)
import copy
gbfile = "/Users/coissac/travail/ecodb/chloroplast/chloroplast.gb"
UniversalGeneticCode = {"F" :("TTT","TTC"),
"L" :("TTA","TTG","TAG","CTA",'CTG',"CTC","CTT"),
"S" :("TCA","TCC","TCG","TCT","AGA","AGC","AGG","AGT"),
"Y" :("TAA","TAC","TAT"),
"C" :("TGT","TGC"),
"W" :("TGA","TGG"),
"P" :("CCA","CCG","CCT","CCC"),
"H" :("CAT","CAC"),
"Q" :("CAA","CAG"),
"R" :("CGA","CGC","CGG","CGT","AGA","AGG"),
"I" :("ATT","ATC","ATA"),
"M" :("ATA","ATG"),
"T" :("ACA","ACT","ACG","ACC"),
"N" :("AAT","AAC","AAA"),
"K" :("AAA","AAG"),
"V" :("GTA","GTC","GTG","GTT"),
"A" :("GCA","GCC","GCG","GCT"),
"D" :("GAT","GAC"),
"E" :("GAA","GAG"),
"G" :("GGA","GGC","GGG","GGT")}
cox1_ble = "MTNMVRWLFSTNHKDIGTLYFIFGAIAGVMGTCFSVLIRMELARPGDQILGGNHQLYNVL" \
"ITAHAFLMIFFMVMPAMIGGFGNWFVPILIGAPDMAFPRLNNISFWLLPPSLLLLLSSAL" \
"VEVGSGTGWTVYPPLSGITSHSGGAVDLAIFSLHLSGISSILGSINFITTIFNMRGPGMT" \
"MHRLPLFVWSVLVTAFLLLLSLPVLAGAITMLLTDRNFNTTFFDPAGGGDPILYQHLFWF" \
"FGHPEVYILILPGFGIISHIVSTFSRKPVFGYLGMVYAMISIGVLGFLVWAHHMFTVGLD" \
"VDTRAYFTAATMIIAVPTGIKIFSWIATMWGGSIQYKTPMLFAVGFIFLFTIGGLTGIVL" \
"ANSGLDIALHDTYYVVAHFHYVLSMGAVFALFAGFYYWVGKIFGRTYPETLGQIHFWITF" \
"FGVNLTFFPMHFLGLSGMPRRIPDYPDAYAGWNALSSFGSYISVVGIRRFFVVVAITSSS" \
"GKNQKCAESPWAVEQNPTTLEWLVQSPPAFHTFGELPAVKETKS"
def listCodons2Dict(codons,offset=0,protein='prot'):
d = [[(protein,offset)],None,{}]
for c in codons:
s = d[2]
p = 0
for l in c:
s[l] = s.get(l,[[(protein,p+offset)],None,{}])
s=s[l][2]
p+=1
return d[2]
def connect2codons(codon1,codon2,prot='prot'):
for l1 in codon1:
for l2 in codon1[l1][2]:
for l3 in codon1[l1][2][l2][2]:
codon1[l1][2][l2][2][l3][2]=copy.deepcopy(codon2)
def mergeAutomata(automata1,automata2):
for k in automata2:
if k in automata1:
automata1[k][0].extend(automata2[k][0])
mergeAutomata(automata1[k][2], automata2[k][2])
else:
automata1[k]=copy.deepcopy(automata2[k])
def cleanAutomata(automata,length=100000,keepOnlyTerminal=True):
lword=0
d = automata
stack=[(lword,d)]
while stack:
lword,d = stack.pop(0)
lword+=1
for val in d.itervalues():
if lword==length:
val[2]={}
else:
stack.append((lword,val[2]))
# Keep only leaves as terminal state
if keepOnlyTerminal and val[2]:
val[0]=[]
def buildAutomata(sequence,prot="prot",kup=4):
automata = {}
kup+=1
for pos in xrange(len(sequence)-kup+1):
peptide = sequence[pos:(pos+kup)]
print peptide
bt= [listCodons2Dict(UniversalGeneticCode[peptide[x]],
(pos+x)*3,
prot)
for x in xrange(kup)
]
for shift in xrange(kup-1,0,-1):
print shift
connect2codons(bt[shift-1],bt[shift],prot)
autopep = bt[0]
mergeAutomata(automata, autopep)
for k in autopep:
autopep2 = autopep[k][2]
mergeAutomata(automata, autopep2)
for k2 in autopep2:
mergeAutomata(automata, autopep2[k2][2])
cleanAutomata(automata, (kup-1)*3)
return automata
def automataIterator(automata):
word=""
d = automata[1]
stack=[(word,d)]
while stack:
word,d = stack.pop(0)
for k,val in d.iteritems():
if k!='*':
seq = word+k
else:
seq = word
if isinstance(val, dict):
stack.append((seq,val))
if isinstance(val, list):
yield seq,val
def setLastCodon(codon1,prot='prot'):
for l1 in codon1[1]:
if l1!='*':
for l2 in codon1[1][l1]:
if l2!='*':
for l3 in codon1[1][l1][l2]:
if l3!='*':
codon1[1][l1][l2][l3]=[(prot,codon1[0]*3+2),None]
else:
codon1[1][l1][l2][l3]=[(prot,codon1[0]*3+1),None]
else:
codon1[1][l1][l2]=[(prot,codon1[0]*3),None]
else:
codon1[1][l1]=[(prot,codon1[0]*3-1),None]
#def buildAutomata(sequence,prot='prot'):
# bt = [listCodons2Dict(UniversalGeneticCode[sequence[x]],x*3,prot) for x in xrange(len(sequence))]
#
# setLastCodon(bt[-1], prot)
#
# for x in xrange(len(sequence)-1):
# connect2codons(bt[-x-2], bt[-x-1], prot)
#
# return bt[0]
def matchSuffix(automata,suffixe):
d = automata[1]
for i in xrange(len(suffixe)):
l = suffixe[i]
if l in d:
d=d[l]
else:
return None
return d
def longestSuffix(automata,seq):
if seq:
seq=seq[1:]
d = matchSuffix(automata,seq)
while not isinstance(d, dict):
seq=seq[1:]
d = matchSuffix(automata,seq)
return seq,d
def buildErrorLink(automata):
word=""
d = automata[1]
stack=[(word,d)]
while stack:
word,d = stack.pop(0)
print >>sys.stderr,"Seq length : %d \r" % len(word),
for k,val in d.iteritems():
if k!='*':
seq = word+k
else:
seq = word
if isinstance(val, dict):
stack.append((seq,val))
if isinstance(val, list):
val[1]=longestSuffix(automata, word)[1]
../../../fiboheap/fibo.h
../../../src/orgasm.h
../../../src/expand8bits.c
#
# C API to python object
#
from cpython.ref cimport PyObject,Py_XDECREF
#
# C API to Bytes object
#
from cpython.bytes cimport PyBytes_Size, PyBytes_FromStringAndSize
# We explicitly want access to the raw version of the PyBytes_AS_STRING macro
# that can be called with a PyObject*
cdef extern from "Python.h":
char* PyBytes_AS_STRING(PyObject* obj)
#
# C API to Dict object
#
from cpython.dict cimport PyDict_New,PyDict_GetItemString,PyDict_SetItemString
from cpython.dict cimport PyDict_Next,PyDict_SetItem,PyDict_GetItem,PyDict_Size
#
# C API to List object
#
from cpython.list cimport PyList_GET_SIZE, PyList_Append
#
# C API to Tuple object
#
from cpython.tuple cimport PyTuple_GET_ITEM
#
# C API to int object
#
from cpython.int cimport PyInt_AsSsize_t,PyInt_FromLong
#
# C API to the libc
#
from stdlib cimport malloc,realloc,free
from libc.string cimport strlen
cdef extern from "strings.h":
void bzero(void *s, size_t n)
from orgasm.graph._graph cimport DiGraph
from orgasm.indexer._orgasm cimport Index
cdef dict listCodons2Dict(tuple codons)
cdef dict bindCodons(dict codon1, dict codon2)
cdef dict codon(char aa, bint direct=?)
cdef struct protmatch:
int protid
int position
protmatch* next
cdef struct dnastate:
# dnastate* error
protmatch* match
dnastate* a
dnastate* c
dnastate* t
dnastate* g
cdef extern from "inttypes.h":
ctypedef int int32_t
ctypedef int int64_t
ctypedef unsigned int uint32_t
ctypedef unsigned int uint64_t
cdef extern from "orgasm.h":
uint32_t expanded8bitsnuc[]
cdef class AhoCarasick:
cdef dnastate *_states
cdef size_t _statesize
cdef size_t _statecount
cdef protmatch *_matches
cdef size_t _matchsize
cdef size_t _matchcount
# the automata has been finalized
cdef bint _finalized
cdef size_t _step
cdef size_t _depth
cdef int _maxseqid
cdef char* _wordBuffer
cdef dnastate** _statestack
cdef dict _seqid
cdef dnastate *getNextState(self,dnastate *base, int letter)
cdef protmatch* setAsTerminal(self, dnastate* base, int protid, int position)
cdef dnastate** simpleMatch(self,char* cword, size_t lword)
cdef dnastate** longestSuffix(self,char* cword, size_t *plword)
cdef setErrorLink(self)
cpdef finalize(self)
cdef int addCWord(self,char* cword, size_t lword, int protid, int position)
cdef int addAutomata(self,dict automata,int protid, int position)
cpdef int addProtein(self,bytes sequence, object protid=?, size_t kup=?)
cpdef dict match(self,bytes sequence)
cpdef scanIndex(self,Index seqindex,int minmatch=?, int covmin=?)
cpdef DiGraph asGraph(self)
\ No newline at end of file
'''
Created on 13 mars 2013
@author: coissac
'''
def fasta(filename):
data = open(filename).readlines()
prots = {}
protid= None
seq = None
for l in data:
if l[0]=='>':
protid = l.split()[0][1:]
seq = prots.get(protid,[])
prots[protid]=seq
else:
seq.append(l.strip())
for p in prots:
prots[p]="".join(prots[p]).upper()
return prots
\ No newline at end of file
......@@ -5,6 +5,8 @@ cdef extern from "inttypes.h":
cdef extern from "orgasm.h":
ctypedef struct buffer_t:
size_t readSize # Size of one read in base pair
size_t recordSize # Size in bytes of one compressed read
char* records # the start of the first record [0].
size_t readCount # count of reads. One pair of reads counts for 2
uint32_t* order1 # a pointer used to point on a int array indicating an order
# over the records.
......
cdef extern from "stdio.h":
struct FILE
int fprintf(FILE *stream, char *format, ...)
FILE* stderr
ctypedef unsigned int off_t "unsigned long long"
cdef extern from "time.h":
struct tm :
int tm_yday
int tm_hour
int tm_min
int tm_sec
enum: CLOCKS_PER_SEC
ctypedef int time_t
ctypedef int clock_t
tm *gmtime_r(time_t *clock, tm *result)
time_t time(time_t *tloc)
clock_t clock()
cpdef object progressBar(object pos,
off_t maxi,
bint reset=?,
bytes head=?,
list delta=?,
list step=?)
\ No newline at end of file
cpdef object progressBar(object pos,
off_t maxi,
bint reset=False,
bytes head=b'',
list delta=[],
list step=[1,0,0]):
cdef off_t ipos
cdef double percent
cdef int days,hour,minu,sec
cdef bytes bar
cdef off_t fraction
cdef int freq,cycle,arrow
cdef tm remain
cdef clock_t d
cdef clock_t elapsed
cdef clock_t newtime
cdef clock_t more
# 0123456789
cdef char* wheel= '|/-\\'
cdef char* spaces=' ' \
' ' \
' ' \
' ' \
' '
cdef char* diese ='##########' \
'##########' \
'##########' \
'##########' \
'##########'
if reset:
del delta[:]
if not delta:
delta.append(clock())
delta.append(clock())
assert maxi>0
freq,cycle,arrow = step
cycle+=1
if cycle % freq == 0:
cycle=1
newtime = clock()
d = newtime-delta[1]
if d < 0.2 * CLOCKS_PER_SEC :
freq*=2
elif d > 0.4 * CLOCKS_PER_SEC and freq>1:
freq/=2
delta[1]=newtime
elapsed = newtime-delta[0]
if callable(pos):
ipos=pos()
else:
ipos=pos
percent = <double>ipos/<double>maxi
more = <time_t>((<double>elapsed / percent * (1. - percent))/CLOCKS_PER_SEC)
<void>gmtime_r(&more, &remain)
days = remain.tm_yday
hour = remain.tm_hour
minu = remain.tm_min
sec = remain.tm_sec
fraction=<int>(percent * 50.)
arrow=(arrow+1) % 4
diese[fraction]=0
spaces[50 - fraction]=0
if days:
<void>fprintf(stderr,b'\r%s %5.1f %% |%s%c%s] remain : %d days %02d:%02d:%02d',
<char*>head,
percent*100,
diese,wheel[arrow],spaces,
days,hour,minu,sec)
else:
<void>fprintf(stderr,b'\r%s %5.1f %% |%s%c%s] remain : %02d:%02d:%02d',
<char*>head,
percent*100.,
diese,wheel[arrow],spaces,
hour,minu,sec)
diese[fraction]='#'
spaces[50 - fraction]=' '
else:
cycle+=1
step[0:3] = freq,cycle,arrow
......@@ -19,10 +19,6 @@ ORGASMI_SRC= fastq.c \
lookfor.c \
orgasmi.c
ORGASMI_OBJ= $(patsubst %.c,%.o,$(ORGASMI_SRC))
EXEC=orgasmi
ORGASMD_SRC= fastq.c \
encode.c \
decode.c \
......@@ -53,7 +49,7 @@ LIBFILE=
include global.mk
all: $(EXEC)
all: $(EXEC) expand8bits.c
########
......@@ -70,6 +66,9 @@ littlebigman: littlebigman.c
buildcode: buildcode.c
$(CC) -o $@ $(shell ./littlebigman) $<
buildexpand8bits: buildexpand8bits.c
$(CC) -o $@ $(shell ./littlebigman) $<
buildcomplement: buildcomplement.c
$(CC) -o $@ $(shell ./littlebigman) $<
......@@ -86,6 +85,9 @@ code16bits.c: buildcode
codecomp.c: buildcomplement
./$< > $@
expand8bits.c: buildexpand8bits
./$< > $@
########
#
# project management
......
/*
* buildcode.c
*
* Created on: 25 sept. 2012
* Author: coissac
*/
#include <stdio.h>
#include <inttypes.h>
int main(int argc, char *argv[])
{
uint32_t code;
uint32_t icode;
char nuc[5];
int i;
nuc[4]=0;
printf("/**\n");
printf(" * \n");
printf(" * source file automatically generated by\n");
printf(" * the program buildexpand8bits. Do not modify manually\n");
printf(" *\n");
printf(" */ \n");
printf("\n");
printf("#include <inttypes.h>\n");
printf("\n");
printf("\n");
printf("uint32_t expanded8bitsnuc[] = {\n");
for (code=0; code <= 0xFF; code++)
{
icode = code;
for (i=0; i < 4; i++)
nuc[3-i] = (icode >> (i*2)) & 3;
#ifdef LITTLE_END
*((uint32_t*)nuc) = *((uint32_t*)nuc) >> 24 | \
*((uint32_t*)nuc) << 24 | \
((*((uint32_t*)nuc) >> 8) & 0x0000FF00) | \
((*((uint32_t*)nuc) << 8) & 0x00FF0000);
#endif
if (code < 0xFF)
fprintf(stdout," 0x%08XUL, // %04X\n", *((uint32_t*)nuc),code);
else
fprintf(stdout," 0x%08XUL // %04X\n", *((uint32_t*)nuc),code);
}
printf("};\n");
return 0;
}
......@@ -149,7 +149,7 @@ int32_t nextRead(buffer_t *buffer, int32_t current, size_t length, int32_t* endo
extern uint64_t decode16bitsnuc[];
extern uint8_t complement4nuc[];
extern uint32_t expanded8bitsnuc[];
pnuc encodeSequence(pnuc dest, char* src, uint32_t length);
pnuc shiftKey(pnuc dest, pnuc key,uint32_t shift, uint32_t keyLength);
......
/*
* orgasmi.c
*
* Created on: 11 juil. 2012
* Author: coissac
*/
#include <stdio.h>
#include <string.h>
#include "orgasm.h"
#include "debug.h"
//#define LARGE_TEST
#define LOAD_ONLY
int main(int argc, char *argv[])
{
FILE* forward;
FILE* reverse;
FILE* index;
buffer_t* reads;
uint32_t i;
const char* indexfiles;
char query[500];
char key[500];
char *akey;
char *aquery;
int32_t eol;
int32_t pos;
int32_t pnext;
char *toto;
#define CMPLENGTH 100
#ifdef LARGE_TEST
// char* forwardFileName="/Users/coissac/travail/GuillaumeBesnard/lecontella/120824_SN365_A_L005_GWM-9_R1.fastq";
// char* reverseFileName="/Users/coissac/travail/GuillaumeBesnard/lecontella/120824_SN365_A_L005_GWM-9_R2.fastq";
char* forwardFileName="/Users/coissac/travail/GuillaumeBesnard/Sartidia/Spe_CAGATC_L001_R1.fastq";
char* reverseFileName="/Users/coissac/travail/GuillaumeBesnard/Sartidia/Spe_CAGATC_L001_R2.fastq";
indexfiles = "/Users/coissac/encours/orgasm/samples/sartidia";
indexfiles = "/Users/coissac/encours/orgasm/samples/locontella";
#define MAXREAD 2000000000
#else
char* forwardFileName="/Users/coissac/encours/orgasm/samples/5_1.fastq";
char* reverseFileName="/Users/coissac/encours/orgasm/samples/5_2.fastq";
indexfiles = "/Users/coissac/encours/orgasm/samples/5";
#define MAXREAD 1000000000
#endif
#ifndef LOAD_ONLY
reads = buildIndex(forwardFileName,reverseFileName,indexfiles,95,MAXREAD);
freeBuffer(reads);
#endif
// reads = loadIndexedReads(indexfiles);
// akey = "CCTGAGTGAAAAAGATACCGCGAGAACGATCTTTTTCAATAAAATCATCGCGCAATAAATCAACAAAACCTAAAGTAATTTCGCGTTCCCCTTCTAACTT";
// // TTTGTAAAgt
// // acTTTACAAA
//
// printf("%9d, %s\n",0,akey);
//
// pnext = lookForString(reads, akey, CMPLENGTH, &eol);
// i=0;
// printf ("pos = %d -> endoflist = %d\n",pnext,eol);
// printf ("--> %s\n 0 :%s\n",akey,decode(reads, pnext-1, 0, CMPLENGTH, 0));
// printf ("-1 :%s\n",decode(reads, pnext-2, 0, CMPLENGTH, 0));
// printf ("+1 :%s\n",decode(reads, pnext, 0, CMPLENGTH, 0));
//
// while (pnext > 0 && eol==0)