pairs.c 10.3 KB
Newer Older
1 2 3 4 5 6 7 8 9
/*
 * pairs.c
 *
 *  Created on: 15 dc. 2008
 *      Author: coissac
 */

#include  "ecoprimer.h"
#include  <string.h>
10 11 12 13 14 15 16 17 18 19
#include  <stdlib.h>

static void buildPrimerPairsForOneSeq(uint32_t seqid,
									  pecodnadb_t seqdb,
		 							  pprimercount_t primers,
								 	  ppairtree_t pairs,
									  poptions_t options);



20 21 22



23 24 25 26 27 28 29 30 31
/*************************************
 *
 *       pair collection management
 *
 *************************************/

#ifdef MASKEDCODE

char *addamplifiasetelem (ppair_t pair, char* amplifia, int32_t taxid)
32 33 34 35
{
	uint32_t i;
	uint32_t j;
	char *ampused = NULL;
36

37 38 39 40 41 42
	if(pair->ampsetcount == 0)
	{
		pair->ampsetcount = 500;
		pair->ampsetindex = 0;
		pair->ampset = ECOMALLOC(pair->ampsetcount * sizeof(ampseqset_t),"Cannot allocate amplifia set");
	}
43

44 45 46 47 48 49 50 51
	for (i = 0; i < pair->ampsetindex; i++)
	{
		if (strcmp (pair->ampset[i].amplifia, amplifia) == 0)
		{
			ampused = pair->ampset[i].amplifia;
			break;
		}
	}
52

53 54 55 56 57 58
	if (i == 0)
	{
		pair->ampset[i].seqidcount = 100;
		pair->ampset[i].seqidindex = 0;
		pair->ampset[i].taxonids = ECOMALLOC(pair->ampset[i].seqidcount * sizeof(uint32_t),"Cannot allocate amplifia sequence table");
	}
59

60 61 62 63 64
	if (pair->ampsetindex == pair->ampsetcount)
	{
		pair->ampsetcount += 500;
		pair->ampset = ECOREALLOC(pair->ampset, pair->ampsetcount * sizeof(ampseqset_t), "Cannot allocate amplifia set");
	}
65

66 67 68 69 70
	if (pair->ampset[i].seqidindex == pair->ampset[i].seqidcount)
	{
		pair->ampset[i].seqidcount += 100;
		pair->ampset[i].taxonids = ECOREALLOC(pair->ampset[i].taxonids, pair->ampset[i].seqidcount * sizeof(int32_t), "Cannot allocate amplifia sequence table");
	}
71

72 73 74 75 76
	if (pair->ampset[i].amplifia == NULL)
	{
		pair->ampset[i].amplifia = amplifia;
		pair->ampsetindex++;
	}
77

78 79 80 81
	for (j = 0; j < pair->ampset[i].seqidindex; j++)
	{
		if (pair->ampset[i].taxonids[j] == taxid) break;
	}
82

83 84 85 86 87
	if (j == pair->ampset[i].seqidindex)
		pair->ampset[i].taxonids[pair->ampset[i].seqidindex++] = taxid;
	return ampused;
}

88
void addtaxampsetelem (ppair_t pair, int32_t taxid, char *amplifia)
89 90 91 92 93 94 95 96 97 98
{
	uint32_t i;
	uint32_t j;

	if(pair->taxsetcount == 0)
	{
		pair->taxsetcount = 500;
		pair->taxsetindex = 0;
		pair->taxset = ECOMALLOC(pair->taxsetcount * sizeof(taxampset_t),"Cannot allocate taxon set");
	}
99

100 101 102 103
	for (i = 0; i < pair->taxsetindex; i++)
	{
		if (pair->taxset[i].taxonid == taxid) break;
	}
104

105 106 107 108 109 110
	if (i == 0)
	{
		pair->taxset[i].amplifiacount = 100;
		pair->taxset[i].amplifiaindex = 0;
		pair->taxset[i].amplifia = ECOMALLOC(pair->taxset[i].amplifiacount * sizeof(char *),"Cannot allocate amplifia table");
	}
111

112 113 114 115 116
	if (pair->taxsetindex == pair->taxsetcount)
	{
		pair->taxsetcount += 500;
		pair->taxset = ECOREALLOC(pair->taxset, pair->taxsetcount * sizeof(taxampset_t), "Cannot allocate taxon set");
	}
117

118 119 120 121 122
	if (pair->taxset[i].amplifiaindex == pair->taxset[i].amplifiacount)
	{
		pair->taxset[i].amplifiacount += 100;
		pair->taxset[i].amplifia = ECOREALLOC(pair->taxset[i].amplifia, pair->taxset[i].amplifiacount * sizeof(char *), "Cannot allocate amplifia table");
	}
123

124 125 126 127 128
	if (pair->taxset[i].taxonid == 0)
	{
		pair->taxset[i].taxonid = taxid;
		pair->taxsetindex++;
	}
129

130 131 132 133
	for (j = 0; j < pair->taxset[i].amplifiaindex; j++)
	{
		if (strcmp(pair->taxset[i].amplifia[j], amplifia) == 0) break;
	}
134

135 136 137 138 139 140 141 142 143
	if (j == pair->taxset[i].amplifiaindex)
	{
		pair->taxset[i].amplifia[j] = amplifia;
		pair->taxset[i].amplifiaindex++;
	}
}

char *getamplifia (pecoseq_t seq, uint32_t start, uint32_t len)
{
144
	fprintf(stderr,"start : %d length : %d\n",start,len);
145 146
	char *amplifia = ECOMALLOC((len + 1) * sizeof(char),"Cannot allocate amplifia");
	char *seqc = &seq->SQ[start];
147

148 149 150 151
	strncpy(amplifia, seqc, len);
	return amplifia;
}

152
#endif
153 154

/*TR: Added*/
155
ppairtree_t buildPrimerPairs(pecodnadb_t seqdb,uint32_t seqdbsize,pprimercount_t primers,poptions_t options)
156 157
{
	uint32_t i;
158
	ppairtree_t primerpairs;
159

160
	primerpairs = initpairtree(NULL);
161 162 163

	for (i=0; i < seqdbsize; i++)
	{
164
		buildPrimerPairsForOneSeq(i, seqdb, primers, primerpairs, options);
165
	}
166

167
	return primerpairs;
168

169 170
}

171 172 173 174 175 176 177
#define DMAX (2000000000)

static void buildPrimerPairsForOneSeq(uint32_t seqid,
									  pecodnadb_t seqdb,
		 							  pprimercount_t primers,
								 	  ppairtree_t pairs,
									  poptions_t options)
178
{
179 180 181 182
	static uint32_t    paircount=0;
	uint32_t           i,j,k;
	uint32_t           matchcount=0;
	pprimermatch_t     matches = NULL;
183
	primermatchcount_t seqmatchcount;
184 185 186 187 188 189
	ppair_t            pcurrent;
	pair_t			   current;
	pprimer_t		   wswp;
	bool_t			   bswp;
	size_t			   distance;
	bool_t             strand;
190 191 192 193 194 195

	for (i=0;i < primers->size; i++)
	{
		matchcount+=primers->primers[i].directCount[seqid];
		matchcount+=primers->primers[i].reverseCount[seqid];
	}
196 197 198 199

	if (matchcount <= 0)
		return;

200 201 202 203 204 205 206 207
	matches = ECOMALLOC(matchcount * sizeof(primermatch_t),"Cannot allocate primers match table");

	for (i=0,j=0;i < primers->size; i++)
	{
		if (primers->primers[i].directCount[seqid])
		{
			if (primers->primers[i].directCount[seqid]==1)
			{
208 209
			matches[j].primer = primers->primers+i;
			matches[j].strand=TRUE;
210 211 212 213 214
				matches[j].position=primers->primers[i].directPos[seqid].value;
				j++;
			}
			else for (k=0; k < primers->primers[i].directCount[seqid]; k++,j++)
			{
215
				matches[j].primer = primers->primers+i;
216 217 218 219 220 221 222 223 224
				matches[j].strand=TRUE;
				matches[j].position=primers->primers[i].directPos[seqid].pointer[k];
			}
		}

		if (primers->primers[i].reverseCount[seqid])
		{
			if (primers->primers[i].reverseCount[seqid]==1)
			{
225 226
			matches[j].primer = primers->primers+i;
			matches[j].strand=FALSE;
227 228 229 230 231
				matches[j].position=primers->primers[i].reversePos[seqid].value;
				j++;
			}
			else for (k=0; k < primers->primers[i].reverseCount[seqid]; k++,j++)
			{
232
				matches[j].primer = primers->primers+i;
233 234 235 236 237 238
				matches[j].strand=FALSE;
				matches[j].position=primers->primers[i].reversePos[seqid].pointer[k];
			}
		}
	}

239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272
	if (matchcount>1)
	{
//		fprintf(stderr,"\n====================================\n");

		sortmatch(matches,matchcount); // sort in ascending order by position

		for (i=0; i < matchcount;i++)
		{
			// For all primers matching the sequence

			for(j=i+1;
			       (j<matchcount)
			    && ((distance=matches[j].position - matches[i].position - options->primer_length) < options->lmax);
			   j++
			   )

			   // For all not too far primers

		       if ( (matches[i].primer->good || matches[j].primer->good)
		    		&& (distance > options->lmin)
		    		)
		       {

		    	   // If possible primer pair

		    	   current.p1 = matches[i].primer;
		    	   current.asdirect1=matches[i].strand;
		    	   current.p2 = matches[j].primer;
		    	   current.asdirect2= !matches[j].strand;
		    	   current.maxd=DMAX;
		    	   current.mind=DMAX;
		    	   current.sumd=0;
	    		   current.inexample=0;
	    		   current.outexample=0;
273 274
				   current.curseqid = 0;
				   current.refsequence=-1;
275 276 277 278 279 280 281 282 283 284 285 286 287 288 289


		    	   // Standardize the pair

	    		   strand = current.p2->word > current.p1->word;
		    	   if (!strand)
		    	   {
		    		   wswp = current.p1;
		    		   current.p1=current.p2;
		    		   current.p2=wswp;

		    		   bswp = current.asdirect1;
		    		   current.asdirect1=current.asdirect2;
		    		   current.asdirect2=bswp;
		    	   }
290

291 292 293 294 295 296 297 298 299

		    	   // Look for the new pair in already seen pairs

		    	   pcurrent = insertpair(current,pairs);


		    	   if (seqdb[seqid]->isexample)

				   {
300
		    		   //pcurrent->inexample++;
301 302 303 304 305 306 307 308
		    		   pcurrent->sumd+=distance;

		    		   if ((pcurrent->maxd==DMAX) || (distance > pcurrent->maxd))
			    		   pcurrent->maxd = distance;

			    	   if (distance < pcurrent->mind)
			    		   pcurrent->mind = distance;
		    	   }
309 310
		    	   //else
		    		 //  pcurrent->outexample++;
311

312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329
				  if (pcurrent->curseqid != (seqid+1))
				   {
						if (seqdb[seqid]->isexample)
							pcurrent->inexample++;
		    	        else
		    		        pcurrent->outexample++;

						if (pcurrent->curseqid != 0)
							pcurrent->curseqid = seqid+1;
					}

					/*if ((pcurrent->outexample+pcurrent->inexample)==0)
					{
						fprintf(stderr,"pcurrent->outexample+pcurrent->inexample=0!\n");
						exit(0);
					}*/

		    	   if (pcurrent->curseqid == 0)//((pcurrent->outexample+pcurrent->inexample)==1)
330
		    	   {
331
					   pcurrent->curseqid = seqid+1;
332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348
					   paircount++;
					   pcurrent->pcr.ampslot=200;
					   pcurrent->pcr.ampcount=0;
					   pcurrent->pcr.amplifias = ECOMALLOC(sizeof(amplifia_t)*pcurrent->pcr.ampslot,
							                               "Cannot allocate amplifia table");
		    	   }
		    	   else
		    	   {
		    		   if (pcurrent->pcr.ampslot==pcurrent->pcr.ampcount)
		    		   {
		    			   pcurrent->pcr.ampslot+=200;
		    			   pcurrent->pcr.amplifias = ECOREALLOC(pcurrent->pcr.amplifias,
															    sizeof(amplifia_t)*pcurrent->pcr.ampslot,
		    			   							            "Cannot allocate amplifia table");
		    		   }
		    	   }

349 350
		    	   if (seqid==options->refseqid)
		    		   pcurrent->refsequence=seqid;
351 352 353
		    	   pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].length=distance;
		    	   pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].sequence=seqdb[seqid];
		    	   pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].strand=strand;
354 355
		    	   pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].begin=matches[i].position + options->primer_length;
		    	   pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].end=  matches[j].position - 1;
356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388

		    	   if (strand)
		    	   	   pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].amplifia=  seqdb[seqid]->SQ + matches[i].position + options->primer_length;
		    	   else
		    	   	   pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].amplifia=  seqdb[seqid]->SQ + matches[j].position - 1 ;

		    	   pcurrent->pcr.ampcount++;
//		    	   fprintf(stderr,"%c%c W1 : %s   direct : %c",
//		    			   "bG"[(int)pcurrent->p1->good],
//		    			   "bG"[(int)pcurrent->p2->good],
//		    			   ecoUnhashWord(pcurrent->p1->word, options->primer_length),
//		    			   "><"[(int)pcurrent->asdirect1]
//		    			   );
//
//		    	   fprintf(stderr,"   W2 : %s   direct : %c distance : %d (min/max/avg : %d/%d/%f) in/out: %d/%d %c (%d pairs)\n",
//		    			   ecoUnhashWord(pcurrent->p2->word, options->primer_length),
//		    			   "><"[(int)pcurrent->asdirect2],
//		    			   distance,
//		    			   pcurrent->mind,pcurrent->maxd,
//		    			   (pcurrent->inexample) ? (float)pcurrent->sumd/pcurrent->inexample:0.0,
//		    			   pcurrent->inexample,pcurrent->outexample,
//		    			   " N"[(pcurrent->outexample+pcurrent->inexample)==1],
//		    			   paircount
//
//		    			   );
//

		       }
		 }
	}

   pairs->count=paircount;

389
}