lookfor.c 12.2 KB
Newer Older
Eric Coissac's avatar
Eric Coissac committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
/*
 * lookfor.c
 *
 *  Created on: 3 oct. 2012
 *      Author: coissac
 */

#include <string.h>

#include "orgasm.h"
#include "_sse.h"
/**
 * Rigth shift a nucleic sequence encoded with the encode
 * encode.
 *
 * dest : a 16bytes aligned pointer to the place where to store
 *        the shiftedkey. If dest is NULL a static internal buffer
 *        is used.
 *
 * key  : a 16bytes aligned pointer to the key to shift.
 *
 * shift: a value between 0 and 3 indicating the count of base pair
 *        to shift. 0 leads to no action. Value greater than 3 are
 *        reduced to this interval
 *
 * keyLength: a pointer to the length of the compressed key expressed
27
 *            in bytes.
Eric Coissac's avatar
Eric Coissac committed
28 29 30 31 32 33 34 35 36
 *
 * returns a pointer to the shifted key.
 *
 * WARNING even if dest is not NULL, the returned pointer can be
 * different than dest. So alwaus use the returned pointer. The
 * returned pointer is always 16bytes aligned
 */
pnuc shiftKey(pnuc dest, pnuc key,uint32_t shift, uint32_t keyLength)
{
Eric Coissac's avatar
Eric Coissac committed
37
	static char buffer[512] __attribute__ ((aligned (16))); // Force the alignment of the buffer
Eric Coissac's avatar
Eric Coissac committed
38 39 40
	uint32_t  n16bytes;
	uint32_t  remains;
	uint32_t  i;
Eric Coissac's avatar
Eric Coissac committed
41 42 43

	register um128 out;
	register um128 tmp;
Eric Coissac's avatar
Eric Coissac committed
44 45 46 47 48 49 50 51 52 53 54 55 56 57

	uint8_t mask;
	uint8_t cmask;
	uint8_t rmask;
	uint8_t save;
	uint8_t saveLast;
	uint8_t lshift;
	uint8_t rshift;


	pnuc rep;


	if (dest == NULL)
Eric Coissac's avatar
Eric Coissac committed
58
		dest = (pnuc) buffer;
Eric Coissac's avatar
Eric Coissac committed
59 60 61 62 63

	ASSERT(((size_t)dest & 0xF)==0,"Pointer %p is not 16bytes aligned",dest)

	rep = dest;

Eric Coissac's avatar
Eric Coissac committed
64
	// Limit shift in the [0;3] interval
65 66
	shift = shift & 0x3;

Eric Coissac's avatar
Eric Coissac committed
67 68 69
	// Shift == 0 is no shift so no computation
	if (shift==0) return key;

70
	// a shift of x bases is a shift of 2x bits
Eric Coissac's avatar
Eric Coissac committed
71
	rshift = shift << 1;
72 73


Eric Coissac's avatar
Eric Coissac committed
74 75
	lshift = 8 - rshift;

76 77 78 79 80 81 82 83 84 85 86
	//
	// Example :
	//
	// Shift = 3
	// rshift= 3 * 2 = 6
	// lshift= 8 - 6 = 2
	//
	// mask = (1 << 6) - 1 = 0b01000000 - 1 = 0b00111111
	// cmask= 0b11000000
	// rmask= 0b11111111 >> 6 = 0b00000011

Eric Coissac's avatar
Eric Coissac committed
87 88 89 90 91 92 93
	mask = (1 << rshift) - 1;
	cmask= ~mask;
	rmask= (~0) >> rshift;

	// The shifted key occupied one extra byte
	keyLength++;

Eric Coissac's avatar
Eric Coissac committed
94
	// Computes the count of complete 16 bytes packs in the key
Eric Coissac's avatar
Eric Coissac committed
95
	n16bytes = keyLength >>4;
Eric Coissac's avatar
Eric Coissac committed
96 97

	// The count of bytes after the last 16 bytes packs
Eric Coissac's avatar
Eric Coissac committed
98
	remains  = keyLength & 0xF;
Eric Coissac's avatar
Eric Coissac committed
99 100


Eric Coissac's avatar
Eric Coissac committed
101 102
	save = 0;

Eric Coissac's avatar
Eric Coissac committed
103

Eric Coissac's avatar
Eric Coissac committed
104 105 106 107 108 109 110 111 112 113 114 115
	for (i=0; i < n16bytes; i++, key+=16,dest+=16)
	{
		out.i = _MM_LOAD_SI128((const __m128i*)key);
		saveLast = out.u8[15];
		tmp.i = _MM_SRLI_EPI64(_MM_AND_SI128(out.i,
				 _MM_SET1_EPI8(cmask)),rshift);
		out.i = _MM_SLLI_EPI64(_MM_AND_SI128(out.i,_MM_SET1_EPI8(mask)),
														   lshift);
		out.i = _MM_SLLI_SI128(out.i,1);
		out.i = _MM_OR_SI128(out.i,tmp.i);

		if (i>0)
Eric Coissac's avatar
Eric Coissac committed
116 117 118
			out.u8[0]|= save << lshift; // BUG ??? change rshift to lshift
//		else                            // Not useful
//			out.u8[0]&= rmask;
Eric Coissac's avatar
Eric Coissac committed
119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153

		save = saveLast;

		_MM_STORE_SI128((__m128i*)dest,out.i);
	}

	for (i=0; i < remains; i++, key++,dest++)
	{
		saveLast = *key;
		*dest = (saveLast >> rshift) | (save << lshift);
		save = saveLast;
	}

	return rep;
}

/**
 * Compare the prefix of a key encoded by the encode function over
 * the length 'length' expressed in base pair with a prefix of same
 * length corresponding to the read r1 from buffer.
 *
 * buffer : a pointer to a read buffer
 * r1     : the identifer of the read to compare
 * key    : a pointer to the encoded second sequence involved in the comparison
 * length : the length of the prefix to compare
 *
 * return 0 if both the prefix are equal, -1 if the r1 sequence is lexicographically
 * before key and +1 otherwise.
 *
 */
int8_t cmpPrefix(buffer_t *buffer,uint32_t r1,pnuc key,uint32_t length)
{
	uint32_t shift;
	uint32_t i;

154
	uint8_t  bmask[]={255,192,240,252};
Eric Coissac's avatar
Eric Coissac committed
155 156 157 158 159 160 161
	uint8_t  mask;
	uint8_t  *pr1;
	uint8_t  *pr2;
	uint8_t  vr1=0;
	uint8_t  vr2=0;

	shift = CODELENGTH(length);
162
	mask  = bmask[length % 4];
Eric Coissac's avatar
Eric Coissac committed
163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205

	// DEBUG("pos : %d mask : %d,%x",pos,shift,mask);

	pr1 = ((uint8_t*)(buffer->records)) + buffer->recordSize * r1;
	pr2 = (uint8_t*)key;
	i=0;

	do {
		vr1 = *pr1;
		vr2 = *pr2;

		if (i==(shift-1))
		{
			vr1&=mask;
			vr2&=mask;
		}

		i++; pr1++; pr2++;

	} while( i < shift && vr2==vr1);

	return (vr1==vr2) ? 0 : ((vr1 < vr2) ? -1:1);

}


/**
 * Compare the prefix of a key encoded by the encode function over
 * the length 'length' expressed in base pair with a prefix of same
 * length corresponding to the read r1 from buffer.
 *
 * buffer : a pointer to a read buffer
 * r1     : the identifer of the read to compare
 * key    : a pointer to the encoded second sequence involved in the comparison
 * length : the length of the prefix to compare
 *
 * return 0 if both the prefix are equal, -1 if the r1 sequence is lexicographically
 * before key and +1 otherwise.
 *
 * INTERNAL FUNCTION DO NOT USE DIRECTLY
 *
 */

Eric Coissac's avatar
Eric Coissac committed
206

207
int8_t cmpCompPrefix(buffer_t *buffer,uint32_t r1,pnuc key, uint32_t length)
Eric Coissac's avatar
Eric Coissac committed
208 209 210 211 212 213 214 215 216 217
{
	int32_t i;
	uint32_t querylength;
	uint32_t readlength;
	uint8_t  mask;
	uint8_t  *pr1;
	uint8_t  *pr2;
	uint8_t  vr1=0;
	uint8_t  vr2=0;

218
	// Compute the length of compacted sequences
Eric Coissac's avatar
Eric Coissac committed
219 220
	querylength = CODELENGTH(length);
	readlength  = CODELENGTH(buffer->readSize);
221

Eric Coissac's avatar
Eric Coissac committed
222 223
	// Mask used to clean the left side of the query and subject
	mask= ((uint8_t)(~0)) >> (((4-(length & 3)) & 3) << 1);
Eric Coissac's avatar
Eric Coissac committed
224 225 226 227

	// DEBUG("pos : %d mask : %d,%x",pos,shift,mask);

	pr1 = ((uint8_t*)(buffer->records)) + buffer->recordSize * r1 + readlength - 1;
228
	pr2 = ((uint8_t*)(key)) + querylength - 1;
Eric Coissac's avatar
Eric Coissac committed
229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260

	i=querylength-1;

	do {
		vr1 = *pr1;
		vr2 = *pr2;

		if (i==0)
		{
			vr1&=mask;
			vr2&=mask;
		}

		i--; pr1--; pr2--;

	} while( i >=0 && vr2==vr1);

	vr1 = complement4nuc[vr1];
	vr2 = complement4nuc[vr2];

	return (vr1==vr2) ? 0 : ((vr1 < vr2) ? -1:1);

}


int32_t lookForForward(buffer_t *buffer, pnuc key, size_t length, int32_t* endoflist)
{
	int32_t initcode;
	int32_t start;
	int32_t end;
	int32_t middle;
	int32_t comp;
Eric Coissac's avatar
Eric Coissac committed
261
//	int32_t comp1;
Eric Coissac's avatar
Eric Coissac committed
262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335

	// I set the end of list flag to 0
	*endoflist=0;

	// and extract the 8 first nucleotides
	initcode = *((uint16_t*)key);

#ifdef LITTLE_END
	initcode = ((initcode & 255) << 8) | (initcode >> 8);
#endif

	// I use the hash table for identifying positions of reads
	// starting with these 8 nucleotides

	start=buffer->index1[initcode];

	if (initcode < 0xFFFF)
		end = buffer->index1[initcode+1];
	else
		end = buffer->readCount;


	// in this case no reads begin with the 8mer
	if (end-start==0)
	{
		*endoflist=1;
		return 0;
	}

	// I start a bsearch between start and end

//	middle = (start + end) / 2;
//	comp = cmpPrefix(buffer,middle,key,length);

	end--;

	while (start!=end)
	{
		middle = (start + end) / 2;
		comp = cmpPrefix(buffer,middle,key,length);

		if (comp < 0) start=middle+1;
		if (comp >= 0) end=middle;

	}

	comp = cmpPrefix(buffer,start,key,length);

	// No match found

	if (comp!=0)
	{
		*endoflist=1;
		return 0;
	}

	return start;

}

int32_t nextForward(buffer_t *buffer, int32_t current, size_t length, int32_t* endoflist)
{

	int32_t pos= current+1;
	pnuc    key= (pnuc)((uint8_t*)(buffer->records) + buffer->recordSize * current);
	int     comp=cmpPrefix(buffer,pos,key,length);

	if (comp==0)
		return pos;

	*endoflist=1;
	return 0;
}

Eric Coissac's avatar
Eric Coissac committed
336
static char* bin8(int val){
Eric Coissac's avatar
Eric Coissac committed
337

Eric Coissac's avatar
Eric Coissac committed
338
	static char buf[9] = {0};
Eric Coissac's avatar
Eric Coissac committed
339 340 341

	int i = 30;

Eric Coissac's avatar
Eric Coissac committed
342 343
	for(i=7; i>=0 ; --i, val >>= 1)
		buf[i] = "01"[val & 1];
Eric Coissac's avatar
Eric Coissac committed
344

Eric Coissac's avatar
Eric Coissac committed
345
	return buf;
Eric Coissac's avatar
Eric Coissac committed
346 347 348

}

Eric Coissac's avatar
Eric Coissac committed
349 350 351 352 353 354 355 356 357
/**
 *
 * buffer    : a pointer to a read index structure
 * pnuc      : a pointer
 * key       : a pointer to the encoded query
 * length    : the length of the query
 * endoflist : set to 0 by the function if it finds a solution to 1 otherwise
 *
 */
358
int32_t lookForReverse(buffer_t *buffer, pnuc key, size_t length, int32_t* endoflist)
Eric Coissac's avatar
Eric Coissac committed
359
{
Eric Coissac's avatar
Eric Coissac committed
360 361 362 363 364 365
	static uint8_t  bmask[]={255,192,240,252};
	char internalbuffer[512]  __attribute__ ((aligned (16))); // Force the alignment of the buffer;
	char internalbuffer2[512] __attribute__ ((aligned (16))); // Force the alignment of the buffer;
	pnuc 	 dest;
	pnuc 	 dest2;
	int32_t  rshift;
Eric Coissac's avatar
Eric Coissac committed
366
	uint32_t i;
Eric Coissac's avatar
Eric Coissac committed
367 368 369 370 371
	int32_t  initcode;
	int32_t  start;
	int32_t  end;
	int32_t  middle;
	int32_t  comp;
372
	uint32_t lkey;
Eric Coissac's avatar
Eric Coissac committed
373 374 375 376

	// I set the end of list flag to 0
	*endoflist=0;

Eric Coissac's avatar
Eric Coissac committed
377 378
	dest =  (pnuc) internalbuffer;
	dest2 = (pnuc) internalbuffer2;
379 380
	//shift= 4 - (length & 3);

Eric Coissac's avatar
Eric Coissac committed
381
	lkey=CODELENGTH(length);               //-> we compute the compressed query length
Eric Coissac's avatar
Eric Coissac committed
382 383 384 385 386 387 388 389 390 391 392 393

	// then we reverse complement the sequence

	for (i=0; i<lkey;i++)
		dest2[i]=complement4nuc[(uint8_t)key[lkey-1-i]];

	// We have now to "bit align" the end of the key with the end of the reads

	rshift= buffer->readSize & 3;

	if (rshift > 0)
	{
Eric Coissac's avatar
Eric Coissac committed
394
		printf("Rshift = %d\n",rshift);
Eric Coissac's avatar
Eric Coissac committed
395
		int  ib;
Eric Coissac's avatar
Eric Coissac committed
396
		for (ib=0;ib<lkey;ib++)
Eric Coissac's avatar
Eric Coissac committed
397
		    printf ("%8s ",bin8(dest2[ib]));
Eric Coissac's avatar
Eric Coissac committed
398
		printf("\n");
Eric Coissac's avatar
Eric Coissac committed
399

Eric Coissac's avatar
Eric Coissac committed
400
		dest = shiftKey(dest, dest2, rshift, lkey);
Eric Coissac's avatar
Eric Coissac committed
401 402
		dest[lkey]&=bmask[rshift];

Eric Coissac's avatar
Eric Coissac committed
403
		for (ib=0;ib<=lkey;ib++)
Eric Coissac's avatar
Eric Coissac committed
404
		    printf ("%8s ",bin8(dest[ib]));
Eric Coissac's avatar
Eric Coissac committed
405
		printf("\n");
Eric Coissac's avatar
Eric Coissac committed
406 407

		// Set the last bits of the query to 0
Eric Coissac's avatar
Eric Coissac committed
408 409 410 411 412

		// we have to increase match length to take into account the
		// unfully filled last byte of the reads and key

		length+= 4 - rshift;
413 414

		if (length & 3)
Eric Coissac's avatar
Eric Coissac committed
415
			printf("with extension %d\n", length & 3),
416 417
			lkey++;
		else
Eric Coissac's avatar
Eric Coissac committed
418
			printf("without extension\n"),
419
			dest++;
Eric Coissac's avatar
Eric Coissac committed
420 421 422 423

		for (ib=0;ib<=lkey;ib++)
		    printf ("%8s ",bin8(dest[ib]));
		printf("\n");
Eric Coissac's avatar
Eric Coissac committed
424 425 426 427 428 429
	}
	else
		dest = dest2;

	// Now dest point to the key to use

430
	initcode = *(((uint16_t*)(dest + lkey))-1);
Eric Coissac's avatar
Eric Coissac committed
431
#ifdef LITTLE_END
Eric Coissac's avatar
Eric Coissac committed
432
	initcode = complement4nuc[initcode & 0x00FF] | (complement4nuc[(initcode >> 8) & 0x00FF] << 8);
Eric Coissac's avatar
Eric Coissac committed
433
#else
Eric Coissac's avatar
Eric Coissac committed
434
	initcode = (complement4nuc[initcode & 0x00FF] << 8) | complement4nuc[(initcode >> 8) & 0x00FF ];
Eric Coissac's avatar
Eric Coissac committed
435 436 437 438 439 440 441
#endif

	// I use the hash table for identifying positions of reads
	// starting with these 8 nucleotides

	start=buffer->index2[initcode];

442

Eric Coissac's avatar
Eric Coissac committed
443 444 445 446 447 448 449 450 451
	if (initcode < 0xFFFF)
		end = buffer->index2[initcode+1];
	else
		end = buffer->readCount;

	// in this case no reads begin with the 8mer
	if (end-start==0)
	{
		*endoflist=1;
Eric Coissac's avatar
Eric Coissac committed
452
		printf('No short kmer\n');
Eric Coissac's avatar
Eric Coissac committed
453 454 455 456 457 458 459 460 461 462
		return 0;
	}

	// I start a bsearch between start and end

	end--;

	while (start!=end)
	{
		middle = (start + end) / 2;
463
		comp = cmpCompPrefix(buffer,buffer->order1[middle],dest,length);
Eric Coissac's avatar
Eric Coissac committed
464

Eric Coissac's avatar
Eric Coissac committed
465
		if (comp < 0)  start=middle+1;
Eric Coissac's avatar
Eric Coissac committed
466 467 468 469
		if (comp >= 0) end=middle;

	}

470
	comp = cmpCompPrefix(buffer,buffer->order1[start],dest,length);
Eric Coissac's avatar
Eric Coissac committed
471 472 473 474 475 476

	// No match found

	if (comp!=0)
	{
		*endoflist=1;
Eric Coissac's avatar
Eric Coissac committed
477
		printf('No reads\n');
Eric Coissac's avatar
Eric Coissac committed
478 479 480 481 482 483 484 485 486 487
		return 0;
	}


	return start;

}

int32_t nextReverse(buffer_t *buffer, int32_t current, size_t length, int32_t* endoflist)
{
Eric Coissac's avatar
Eric Coissac committed
488

Eric Coissac's avatar
Eric Coissac committed
489 490
	int32_t pos= current+1;
	int32_t rshift= buffer->readSize & 3;
Eric Coissac's avatar
Eric Coissac committed
491
	size_t  lkey = length + ((rshift==0) ? 0:(4-rshift));
Eric Coissac's avatar
Eric Coissac committed
492 493
	pnuc    key= (pnuc)((uint8_t*)(buffer->records) + buffer->recordSize * buffer->order1[current] + CODELENGTH(buffer->readSize) - CODELENGTH(lkey));
	int     comp=cmpCompPrefix(buffer,buffer->order1[pos],key, lkey);
Eric Coissac's avatar
Eric Coissac committed
494 495

	if (comp==0)
Eric Coissac's avatar
Eric Coissac committed
496
	{
Eric Coissac's avatar
Eric Coissac committed
497
		*endoflist=0;
Eric Coissac's avatar
Eric Coissac committed
498
		return pos;
Eric Coissac's avatar
Eric Coissac committed
499
	}
Eric Coissac's avatar
Eric Coissac committed
500 501 502 503 504 505

	*endoflist=1;
	return 0;

}

506
int32_t lookForReads(buffer_t *buffer, pnuc key, size_t length, int32_t* endoflist)
Eric Coissac's avatar
Eric Coissac committed
507 508 509
{
	int32_t pos = lookForForward(buffer, key, length, endoflist);

Eric Coissac's avatar
Eric Coissac committed
510 511


Eric Coissac's avatar
Eric Coissac committed
512 513 514 515
	if (*endoflist==0)
		pos++;
	else
	{
516
		pos = lookForReverse(buffer, key, length, endoflist);
Eric Coissac's avatar
Eric Coissac committed
517
		if (*endoflist==0)
Eric Coissac's avatar
Eric Coissac committed
518
			pos = - buffer->order1[pos] - 1;
Eric Coissac's avatar
Eric Coissac committed
519 520 521 522 523 524 525
	}

	return pos;
}

int32_t nextRead(buffer_t *buffer, int32_t current, size_t length, int32_t* endoflist)
{
Eric Coissac's avatar
Eric Coissac committed
526
	int32_t next=0;
Eric Coissac's avatar
Eric Coissac committed
527 528
	pnuc from;

Eric Coissac's avatar
Eric Coissac committed
529 530 531
	if (current == 0)
		*endoflist=1;

Eric Coissac's avatar
Eric Coissac committed
532 533 534 535 536 537 538 539 540 541 542
	if (current > 0)
	{
		current--;
		next = nextForward(buffer, current, length, endoflist);

		if (*endoflist==0)
			next++;
		else
		{
			from = (pnuc)((uint8_t*)(buffer->records) + buffer->recordSize * current);

543
			next = lookForReverse(buffer, from, length, endoflist);
Eric Coissac's avatar
Eric Coissac committed
544 545

			if (*endoflist==0)
Eric Coissac's avatar
Eric Coissac committed
546
				next = - buffer->order1[next] -1;
Eric Coissac's avatar
Eric Coissac committed
547 548 549 550 551 552
		}

	}

	if (current < 0)
	{
Eric Coissac's avatar
Eric Coissac committed
553
		current = buffer->complement[-current -1];
Eric Coissac's avatar
Eric Coissac committed
554 555 556
		next = nextReverse(buffer, current, length, endoflist);

		if (*endoflist==0)
Eric Coissac's avatar
Eric Coissac committed
557
			next = - buffer->order1[next] -1;
Eric Coissac's avatar
Eric Coissac committed
558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574
	}

	return next;
}

int32_t lookForString(buffer_t *buffer, char *key, size_t length, int32_t* endoflist)
{
	char internalbuffer[512];
	pnuc dest;

	ASSERT(length>=8,"length have to be greater or equal to %d",8)


	// I encode the key string
	dest = (pnuc) PTR16(internalbuffer);
	dest = encodeSequence(dest, key, length);

575
	return lookForReads(buffer, dest, length, endoflist);
Eric Coissac's avatar
Eric Coissac committed
576
}
577

Eric Coissac's avatar
Eric Coissac committed
578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593
int32_t lookForReadsIds(buffer_t *buffer, int32_t key, int32_t* endoflist)
{
	pnuc dest;

	if (key < 0)
		key=-key;

	dest = (pnuc)((uint8_t*)(buffer->records) + buffer->recordSize * (key-1));

	return lookForReads(buffer, dest, buffer->readSize, endoflist);
}

int32_t nextReadIds(buffer_t *buffer, int32_t current, int32_t* endoflist)
{
	return nextRead(buffer,current, buffer->readSize, endoflist);
}