Commit 25431343 by Eric Coissac

Patch index search on reverse strand for read with a length not multiple of 4

parent b49c55ef
......@@ -21,6 +21,29 @@ ORGASMI_SRC= fastq.c \
ORGASMI_OBJ= $(patsubst %.c,%.o,$(ORGASMI_SRC))
EXEC=orgasmi
ORGASMD_SRC= fastq.c \
encode.c \
decode.c \
buffer.c \
debug.c \
malloc.c \
sort.c \
indexoutput.c \
indexinput.c \
code16bits.c \
codecomp.c \
buildindex.c \
compsort.c \
load.c \
lookfor.c \
orgasmd.c
ORGASMI_OBJ= $(patsubst %.c,%.o,$(ORGASMI_SRC))
ORGASMD_OBJ= $(patsubst %.c,%.o,$(ORGASMD_SRC))
SRCS= $(ORGASMI_SRC)
......@@ -53,6 +76,9 @@ buildcomplement: buildcomplement.c
orgasmi: littlebigman $(ORGASMI_OBJ) $(LIBFILE)
$(CC) $(LDFLAGS) -o $@ $(ORGASMI_OBJ) $(LIBPATH) $(LIB)
orgasmd: littlebigman $(ORGASMD_OBJ) $(LIBFILE)
$(CC) $(LDFLAGS) -o $@ $(ORGASMD_OBJ) $(LIBPATH) $(LIB)
code16bits.c: buildcode
./$< > $@
......
......@@ -138,3 +138,34 @@ char *getRead(buffer_t *buffer, int32_t recordid, uint32_t begin, int32_t length
return NULL;
}
char * decodeSequence(pnuc buffer, uint32_t begin, int32_t length, char *dest)
{
static char internal[MAXREADLEN+20];
uint64_t* idest;
uint32_t from;
uint32_t shift;
uint32_t i;
uint32_t lcompact;
uint32_t mask;
uint16_t *compactseq;
if (dest==NULL) dest = internal;
dest = (char*)PTR8(dest);
idest = (uint16_t*)dest;
from = ((begin >> 2) & 0xFFFFFFFE);
shift = begin & 7;
lcompact = (length+shift) / 8 + (((length+shift) & 7) ? 1:0);
for (compactseq =(uint16_t*)(buffer + from), i=0;
i < lcompact;
compactseq++,i++, idest++)
*idest=decode16bitsnuc[*compactseq];
dest[length + shift]=0;
return dest + shift;
}
......@@ -24,8 +24,7 @@
* reduced to this interval
*
* keyLength: a pointer to the length of the compressed key expressed
* in bytes. This length is potentially changed for reflecting
* the length of the shifted key
* in bytes.
*
* returns a pointer to the shifted key.
*
......@@ -61,12 +60,28 @@ pnuc shiftKey(pnuc dest, pnuc key,uint32_t shift, uint32_t keyLength)
rep = dest;
shift = shift & 0x3;
// Shift == 0 is no shift so no computation
if (shift==0) return key;
// a shift of x bases is a shift of 2x bits
rshift = shift << 1;
lshift = 8 - rshift;
//
// Example :
//
// Shift = 3
// rshift= 3 * 2 = 6
// lshift= 8 - 6 = 2
//
// mask = (1 << 6) - 1 = 0b01000000 - 1 = 0b00111111
// cmask= 0b11000000
// rmask= 0b11111111 >> 6 = 0b00000011
mask = (1 << rshift) - 1;
cmask= ~mask;
rmask= (~0) >> rshift;
......@@ -180,7 +195,7 @@ int8_t cmpPrefix(buffer_t *buffer,uint32_t r1,pnuc key,uint32_t length)
*
*/
int8_t cmpCompPrefix(buffer_t *buffer,uint32_t r1,pnuc key,uint32_t lkey, uint32_t length)
int8_t cmpCompPrefix(buffer_t *buffer,uint32_t r1,pnuc key, uint32_t length)
{
int32_t i;
uint32_t querylength;
......@@ -191,14 +206,17 @@ int8_t cmpCompPrefix(buffer_t *buffer,uint32_t r1,pnuc key,uint32_t lkey, uint32
uint8_t vr1=0;
uint8_t vr2=0;
// Compute the length of compacted sequences
querylength = CODELENGTH(length);
readlength = CODELENGTH(buffer->readSize);
mask= ((uint8_t)(~0)) >> ((((4-(length & 3))) & 3) << 1);
// DEBUG("pos : %d mask : %d,%x",pos,shift,mask);
pr1 = ((uint8_t*)(buffer->records)) + buffer->recordSize * r1 + readlength - 1;
pr2 = ((uint8_t*)(key)) + lkey - 1;
pr2 = ((uint8_t*)(key)) + querylength - 1;
i=querylength-1;
......@@ -306,13 +324,14 @@ int32_t nextForward(buffer_t *buffer, int32_t current, size_t length, int32_t* e
return 0;
}
int32_t lookForReverse(buffer_t *buffer, pnuc key, uint32_t lkey, size_t length, int32_t* endoflist)
int32_t lookForReverse(buffer_t *buffer, pnuc key, size_t length, int32_t* endoflist)
{
char internalbuffer[512];
char internalbuffer2[512];
uint8_t bmask[]={255,192,240,252};
pnuc dest;
pnuc dest2;
int32_t shift;
//int32_t shift;
int32_t rshift;
uint32_t i;
int32_t initcode;
......@@ -320,13 +339,16 @@ int32_t lookForReverse(buffer_t *buffer, pnuc key, uint32_t lkey, size_t length,
int32_t end;
int32_t middle;
int32_t comp;
uint32_t lkey;
// I set the end of list flag to 0
*endoflist=0;
dest = (pnuc) PTR16(internalbuffer);
dest2 = (pnuc) PTR16(internalbuffer2);
shift= 4 - (length & 3);
//shift= 4 - (length & 3);
lkey=CODELENGTH(length);
// then we reverse complement the sequence
......@@ -340,12 +362,17 @@ int32_t lookForReverse(buffer_t *buffer, pnuc key, uint32_t lkey, size_t length,
if (rshift > 0)
{
dest = shiftKey(dest, dest2, rshift, lkey);
lkey++;
dest[lkey]&=bmask[rshift];
// we have to increase match length to take into account the
// unfully filled last byte of the reads and key
length+= 4 - rshift;
if (length & 3)
lkey++;
else
dest++;
}
else
dest = dest2;
......@@ -364,12 +391,12 @@ int32_t lookForReverse(buffer_t *buffer, pnuc key, uint32_t lkey, size_t length,
start=buffer->index2[initcode];
if (initcode < 0xFFFF)
end = buffer->index2[initcode+1];
else
end = buffer->readCount;
// in this case no reads begin with the 8mer
if (end-start==0)
{
......@@ -384,14 +411,14 @@ int32_t lookForReverse(buffer_t *buffer, pnuc key, uint32_t lkey, size_t length,
while (start!=end)
{
middle = (start + end) / 2;
comp = cmpCompPrefix(buffer,buffer->order1[middle],dest,lkey,length);
comp = cmpCompPrefix(buffer,buffer->order1[middle],dest,length);
if (comp < 0) start=middle+1;
if (comp >= 0) end=middle;
}
comp = cmpCompPrefix(buffer,buffer->order1[start],dest,lkey,length);
comp = cmpCompPrefix(buffer,buffer->order1[start],dest,length);
// No match found
......@@ -412,7 +439,7 @@ int32_t nextReverse(buffer_t *buffer, int32_t current, size_t length, int32_t* e
int32_t lkey = CODELENGTH(buffer->readSize);
pnuc key= (pnuc)((uint8_t*)(buffer->records) + buffer->recordSize * buffer->order1[current]);
int32_t rshift= buffer->readSize & 3;
int comp=cmpCompPrefix(buffer,buffer->order1[pos],key,lkey, length + ((rshift==0) ? 0:(4-rshift)));
int comp=cmpCompPrefix(buffer,buffer->order1[pos],key, length + ((rshift==0) ? 0:(4-rshift)));
if (comp==0)
return pos;
......@@ -422,7 +449,7 @@ int32_t nextReverse(buffer_t *buffer, int32_t current, size_t length, int32_t* e
}
int32_t lookForReads(buffer_t *buffer, pnuc key, int32_t lkey,size_t length, int32_t* endoflist)
int32_t lookForReads(buffer_t *buffer, pnuc key, size_t length, int32_t* endoflist)
{
int32_t pos = lookForForward(buffer, key, length, endoflist);
......@@ -430,7 +457,7 @@ int32_t lookForReads(buffer_t *buffer, pnuc key, int32_t lkey,size_t length, int
pos++;
else
{
pos = lookForReverse(buffer, key, lkey, length, endoflist);
pos = lookForReverse(buffer, key, length, endoflist);
if (*endoflist==0)
pos = -pos - 1;
}
......@@ -454,7 +481,7 @@ int32_t nextRead(buffer_t *buffer, int32_t current, size_t length, int32_t* endo
{
from = (pnuc)((uint8_t*)(buffer->records) + buffer->recordSize * current);
next = lookForReverse(buffer, from, CODELENGTH(buffer->readSize),length, endoflist);
next = lookForReverse(buffer, from, length, endoflist);
if (*endoflist==0)
next = -next -1;
......@@ -489,6 +516,6 @@ int32_t lookForString(buffer_t *buffer, char *key, size_t length, int32_t* endof
dest = (pnuc) PTR16(internalbuffer);
dest = encodeSequence(dest, key, length);
return lookForReads(buffer, dest, CODELENGTH(length), length, endoflist);
return lookForReads(buffer, dest, length, endoflist);
}
......@@ -100,6 +100,7 @@ uint8_t is16ACGT(const char* seq);
char *decode(buffer_t *buffer, uint32_t recordid, uint32_t begin, int32_t length, char *dest);
char *decodeComp(buffer_t *buffer, uint32_t recordid, uint32_t begin, int32_t length, char *dest);
char *getRead(buffer_t *buffer, int32_t recordid, uint32_t begin, int32_t length, char *dest);
char *decodeSequence(pnuc buffer, uint32_t begin, int32_t length, char *dest);
buffer_t *loadPairedFastq(FILE* forward,
FILE* reverse,
......@@ -140,7 +141,7 @@ void indexReverse(buffer_t *buffer);
buffer_t *loadIndexedReads(const char *indexname);
int32_t lookForReads(buffer_t *buffer, pnuc key, int32_t lkey,size_t length, int32_t* endoflist);
int32_t lookForReads(buffer_t *buffer, pnuc key, size_t length, int32_t* endoflist);
int32_t lookForString(buffer_t *buffer, char *key, size_t length, int32_t* endoflist);
int32_t nextRead(buffer_t *buffer, int32_t current, size_t length, int32_t* endoflist);
......
......@@ -10,10 +10,21 @@
#include "debug.h"
// We used the heavyiest bit of the order1 index to
// store the sorted status of a read
// these macro are setup to manipulate this bit
#define SETSORTED(x) ((x) | 0x80000000)
#define CLEARSORTED(x) ((x) & 0x7FFFFFFF)
#define GETSORTED(x) ((x) & 0x80000000)
// fillOrdored function fill the two int arrays
// pointed by the members order1 and order2 with
// increasing int values from 1 to n (order1)
// and fill the order2 array in a way that link
// paired reads which are interleaved by the
// loadpair function
void fillOrdored(buffer_t *buffer)
{
um128 counter;
......@@ -60,6 +71,12 @@ void fillOrdored(buffer_t *buffer)
}
// partialSortBuffer, sort lexicographically the reads suffix
// stored in buffer from the position [start;stop[.
// for this function reads are indexed from 0 to n-1 with
// n the read count.
//
// This function is recursive
void partialSortBuffer(buffer_t *buffer,
uint32_t start, uint32_t stop,
......@@ -82,12 +99,18 @@ void partialSortBuffer(buffer_t *buffer,
uint32_t cumsum=start;
uint32_t backtolastswap=FALSE;
// Just print some stats on the sorting progress
if (shift==1 && start > nextprint)
{
fprintf(stderr,"%9d sequences sorted\r",start);
nextprint=start;
}
// If we sort just one sequence or a suffix of length=0
// this is finnished
if ((stop - start)<=1 || shift==buffer->recordSize)
return;
......@@ -100,6 +123,8 @@ void partialSortBuffer(buffer_t *buffer,
for (; index < endex; index++)
wordStat[((uint8_t*)(buffer->records))[*index * buffer->recordSize + shift]]++;
// And we compute the cumulative sum of ocurences
for (i=0; i < 256; i++)
{
swap=wordStat[i];
......@@ -111,6 +136,7 @@ void partialSortBuffer(buffer_t *buffer,
for (i=start; i < stop; i++)
{
// I skip already ordored sequences
while (i < stop && GETSORTED(buffer->order1[i]))
i++;
......@@ -187,9 +213,11 @@ void partialSortBuffer(buffer_t *buffer,
}
// Clear all the sorted status
for (i=start; i < stop; i++)
buffer->order1[i]&=0x7FFFFFFF;
// Run recurcively the algorithm on each sub set
for (i=0; i < 256; i++)
{
if (i>0 && wordStat[i] < wordStat[i-1])
......@@ -385,6 +413,10 @@ void sortSuffix(buffer_t *buffer,uint32_t pos)
// DEBUG("Minimum read @ pos = %4d is %9d",pos,nextmini);
}
// indexForward function fill the int array pointed by the member index1
// of the structure buffer with a hash index of the reads based on the
// eight first bases of the sequences
void indexForward(buffer_t *buffer)
{
int32_t i;
......@@ -412,6 +444,11 @@ void indexForward(buffer_t *buffer)
}
}
// indexReverse function fill the int array pointed by the member index2
// of the structure buffer with a hash index of the reads based on the
// eight first bases of the reverse complement of the sequences
void indexReverse(buffer_t *buffer)
{
int32_t i;
......@@ -429,7 +466,7 @@ void indexReverse(buffer_t *buffer)
#ifdef LITTLE_END
hash = complement4nuc[hash & 0x00FF] | complement4nuc[hash >> 8] << 8;
#else
hash = complement4nuc[hash & 255] << 8 | complement4nuc[hash >> 8];
hash = complement4nuc[hash & 0x00FF] << 8 | complement4nuc[hash >> 8];
#endif
buffer->index2[hash]++;
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment