fastq.c 5.81 KB
Newer Older
Eric Coissac's avatar
Eric Coissac committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88
/*
 * fastq.c
 *
 *  Created on: 11 juil. 2012
 *      Author: coissac
 */

#include <stdio.h>
#include <inttypes.h>
#include <string.h>

#include "orgasm.h"

#include "debug.h"

/**
 *
 *  This simple fastq reader assume that sequence and quality are stored
 *  on a single line and only read the sequence from the file not the quality
 *
 *	dest    : a pointer to a memory zone where the read sequence
 *	          will be stored
 *
 *	in      : the input file from which sequences are read
 *
 *	maxsize : the maximal size of the read sequence
 *
 *  The function return the length of the sequence or 0 on error
 */

uint32_t readFastq(char* dest, FILE* in, int32_t maxsize)
{

	char* buffer;
	size_t lseq;

	// Look for the first fastq sequence header starting
	// with a @

	buffer = fgetln(in,&lseq);

	while(buffer!=NULL && buffer[0]!='@')
		buffer = fgetln(in,&lseq);

	// We check if we are on error

	if (buffer==NULL)
		return 0;

	// Then we read the sequence on one line

	buffer = fgets(dest,maxsize,in);
	lseq=strlen(buffer);

	// Then two lines are skipped for the quality

	buffer = fgetln(in,&lseq);
	buffer = fgetln(in,&lseq);

	if (lseq > 0)
		{
			lseq--;
			dest[lseq]=0;
		}

	return lseq;

}

/**
 *
 *  This function read paired fastq file using the readFastq function
 *  encoded above
 *
 *	dest    : a pointer to a memory zone where the read sequence
 *	          will be stored
 *
 *	forward : the input file from which forward sequences are read
 *
 *	reverse : the input file from which reverse sequences are read
 *
 *	maxsize : the maximal size of the read sequence
 *
 *  The function return the length of the forward sequence or 0 on error
 *  Different read length for forward and reverse reads is considered as
 *  an error.
 */

Eric Coissac's avatar
Eric Coissac committed
89 90 91 92 93
uint32_t readPairedFastq(char* dest,
		 	 	 	     FILE* forward,
		 	 	 	     FILE* reverse,
		 	 	 	     int32_t maxsize,
		 	 	 	     uint32_t readLength)
Eric Coissac's avatar
Eric Coissac committed
94 95 96
{
	int32_t forwardLength;
	int32_t forwardLength16;
Eric Coissac's avatar
Eric Coissac committed
97 98 99 100
	int32_t reverseLength=0;
	char*   rdest;
	char*   c;
	int32_t rsize;
Eric Coissac's avatar
Eric Coissac committed
101 102 103

	bzero(dest,maxsize);

Eric Coissac's avatar
Eric Coissac committed
104 105
	if (readLength > 0)
	{
Eric Coissac's avatar
Eric Coissac committed
106 107 108 109 110

		/* If the read length is even then shorten it by one base */
	    if ((readLength & 1) ==0)
	    {
	    	readLength--;
Eric Coissac's avatar
Eric Coissac committed
111
	    	fprintf(stderr,"Read length adjusted to %d\n",readLength);
Eric Coissac's avatar
Eric Coissac committed
112 113
	    }

Eric Coissac's avatar
Eric Coissac committed
114 115 116 117 118 119 120 121 122 123 124 125
		forwardLength16 = round16(readLength);

		if (forwardLength16*2 > maxsize)
			return 0;

		rdest = dest + forwardLength16;
		rsize = maxsize - forwardLength16;
	}
	else
		forwardLength16=0;


Eric Coissac's avatar
Eric Coissac committed
126 127
	// we read the forward sequence

Eric Coissac's avatar
Eric Coissac committed
128
	do {
Eric Coissac's avatar
Eric Coissac committed
129 130

        // Skip the too short reads
Eric Coissac's avatar
Eric Coissac committed
131 132 133 134 135
		for (forwardLength = readFastq(dest,forward,maxsize);
			 forwardLength>0 && forwardLength<readLength;
			 forwardLength = readFastq(dest,forward,maxsize))
			// skip the reverse read and take the next forward
			reverseLength = readFastq(dest,reverse,maxsize);
Eric Coissac's avatar
Eric Coissac committed
136

Eric Coissac's avatar
Eric Coissac committed
137
		// If readLength is not specify set up it from the length of the first sequence
Eric Coissac's avatar
Eric Coissac committed
138 139
		if (forwardLength16==0)
		{
Eric Coissac's avatar
Eric Coissac committed
140
			readLength = forwardLength;
Eric Coissac's avatar
Eric Coissac committed
141
			/* If the read length is even then shorten it by one base */
Eric Coissac's avatar
Eric Coissac committed
142
		    if ((readLength & 1) ==0)
Eric Coissac's avatar
Eric Coissac committed
143
		    {
Eric Coissac's avatar
Eric Coissac committed
144 145
		    	readLength--;
		    	fprintf(stderr,"Read length adjusted to %d\n",readLength);
Eric Coissac's avatar
Eric Coissac committed
146 147
		    }

Eric Coissac's avatar
Eric Coissac committed
148
			forwardLength16 = round16(readLength);
Eric Coissac's avatar
Eric Coissac committed
149 150
			if (forwardLength16*2 > maxsize)
				return 0;
Eric Coissac's avatar
Eric Coissac committed
151

Eric Coissac's avatar
Eric Coissac committed
152 153 154
			rdest = dest + forwardLength16;
			rsize = maxsize - forwardLength16;
		}
Eric Coissac's avatar
Eric Coissac committed
155 156 157



Eric Coissac's avatar
Eric Coissac committed
158 159
		// founded a good forward get the corresponding reverse
		reverseLength = readFastq(rdest,reverse,rsize);
Eric Coissac's avatar
Eric Coissac committed
160

Eric Coissac's avatar
Eric Coissac committed
161 162 163
		} while ( forwardLength > 0 &&
		 	      reverseLength > 0 &&
		 	      reverseLength < readLength );
Eric Coissac's avatar
Eric Coissac committed
164

Eric Coissac's avatar
Eric Coissac committed
165
	// No more sequences
Eric Coissac's avatar
Eric Coissac committed
166 167 168
	if (forwardLength==0 || reverseLength==0)
		return 0;

Eric Coissac's avatar
Eric Coissac committed
169
	if ((forwardLength >= readLength) && (reverseLength >= readLength)) {
Eric Coissac's avatar
Eric Coissac committed
170 171 172
		for (c=dest+readLength;
			  c < rdest;
			  c++) *c=0;
Eric Coissac's avatar
Eric Coissac committed
173

Eric Coissac's avatar
Eric Coissac committed
174 175 176
		for (c=rdest+readLength;
			  c < (dest+maxsize);
			  c++) *c=0;
Eric Coissac's avatar
Eric Coissac committed
177
	}
Eric Coissac's avatar
Eric Coissac committed
178

Eric Coissac's avatar
Eric Coissac committed
179
	// we return the length of one read
Eric Coissac's avatar
Eric Coissac committed
180
	return readLength;
Eric Coissac's avatar
Eric Coissac committed
181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198

}

uint8_t checkACGT(const char* src, size_t size)
{
	size_t  i;
	uint8_t rep=TRUE;

	size/=16;

	for (i=0;rep && i < size; i++, src+=16)
		rep &= is16ACGT(src);

	return rep;
}

buffer_t * loadPairedFastq(FILE* forward,
 			 		       FILE* reverse,
Eric Coissac's avatar
Eric Coissac committed
199
 			 	 	 	   size_t maxread,
Eric Coissac's avatar
Eric Coissac committed
200 201
					       size_t maxbuffersize,
					       uint32_t readLength
Eric Coissac's avatar
Eric Coissac committed
202 203 204 205 206 207 208 209 210 211 212 213 214 215 216
		                  )
{

		// We allocate on the stack an over sized buffer for the
	    // ascii sequence

#define MASK  ((size_t)0xF)
#define CLEAN ((size_t)~MASK)

	char     readbuffer[2048];

	uint32_t buffersize;
	char*    reads;
	char*    index;
	char*    bufferend;
Eric Coissac's avatar
Eric Coissac committed
217
//	uint32_t readLength;
Eric Coissac's avatar
Eric Coissac committed
218
	uint32_t recordLength;
Eric Coissac's avatar
Eric Coissac committed
219
	uint32_t nextprint=0;
Eric Coissac's avatar
Eric Coissac committed
220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236
	uint32_t* encodedseqs;
	buffer_t* seqbuffer;


	fprintf(stderr,"\nReading sequence reads...\n\n");

	// we compute the first aligned address on 16 bytes word
	// include in the buffer

	reads = (char*) &readbuffer;
	if ((size_t)reads & MASK)
		reads = (char*)(((size_t)reads & CLEAN) + 0x10);

			// we adjust the buffer size according to the alignment

	buffersize= 2048 - (reads - (char*)(&readbuffer));

Eric Coissac's avatar
Eric Coissac committed
237
	readLength = readPairedFastq(reads,forward,reverse,buffersize,readLength);
Eric Coissac's avatar
Eric Coissac committed
238

Eric Coissac's avatar
Eric Coissac committed
239
	seqbuffer  = newBuffer(maxbuffersize,readLength,maxread);
Eric Coissac's avatar
Eric Coissac committed
240 241
	encodedseqs= (uint32_t*)(seqbuffer->records);

Eric Coissac's avatar
Eric Coissac committed
242
	printf("maximum reads : %zu\n",seqbuffer->maxrecord);
Eric Coissac's avatar
Eric Coissac committed
243 244


Eric Coissac's avatar
Eric Coissac committed
245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262
	while (readLength && (seqbuffer->readCount+2) <= seqbuffer->maxrecord)
	{
		if (seqbuffer->readCount > nextprint)
		{
			fprintf(stderr,"%9d sequences read\r",nextprint);
			nextprint+=DISPLAYSTEP;
		}

		recordLength = round16(readLength)*2;
		bufferend = reads + recordLength;
		if (checkACGT(reads,recordLength))
		{
			for (index=reads; index < bufferend; index+=16, encodedseqs++)
				*encodedseqs=encode16nuc(index);

			seqbuffer->readCount+=2;
		}

Eric Coissac's avatar
Eric Coissac committed
263
		readLength = readPairedFastq(reads,forward,reverse,buffersize,readLength);
Eric Coissac's avatar
Eric Coissac committed
264 265 266 267 268 269 270 271 272 273 274
		//DEBUG("Filled %d/%d",seqbuffer->readCount,seqbuffer->maxrecord);
	}


	fprintf(stderr,"%9zd sequences read\n",seqbuffer->readCount);

	return seqbuffer;

#undef MASK
#undef CLEAN
}