Commit 1f5a30b0 by Eric Coissac

My complete changes on my laptop, with specificity bug fix + ahocorasick + sets

git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@393 60f365c0-8329-0410-b2a4-ec073aeeaa1d
parent 19887e9a
......@@ -3,7 +3,8 @@ LIBPATH= -Llibapat -LlibecoPCR -Llibecoprimer -Llibthermo
MAKEDEPEND = gcc -D$(MACHINE) -M $(CPPFLAGS) -o $*.d $<
CC=gcc
CFLAGS= -W -Wall -O5 -m64
CFLAGS= -W -Wall -m64 -g
#CFLAGS= -W -Wall -O5 -m64 -g
#CFLAGS= -W -Wall -O0 -m64 -g
#CFLAGS= -W -Wall -O5 -fast -g
......
......@@ -15,7 +15,8 @@ SOURCES = goodtaxon.c \
taxstats.c \
apat_search.c \
filtering.c \
PrimerSets.c
PrimerSets.c \
ahocorasick.c
SRCS=$(SOURCES)
......
......@@ -13,6 +13,8 @@ typedef struct {
float set_lmean;
float set_lcov;
float set_score;
int32_t set_intaxa;
int32_t set_wi_cnt;
}pairset;
typedef struct{
......@@ -33,9 +35,24 @@ typedef struct{
/*
 * Primer-set construction interface.
 * NOTE(review): only the signatures are visible here; the remarks below are
 * inferred from identifier names and must be confirmed against the
 * implementations.
 */
/* Helpers that operate on an existing pairset using a SetParams context. */
void add_pair_in_set (pairset *pair_set, int32_t pset_idx, int32_t prb_idx, SetParams *pparams);
void get_next_pair_options (int *pair_wi_count_sorted_ids, pairset *pair_set, SetParams *pparams);
float get_links_distribution (int prb_idx, pairset *prob_set, SetParams *pparams);
/* Set builders; build_primers_set returns the pairset by value. */
pairset build_primers_set (ppair_t* sortedpairs, int32_t sorted_count, pecodnadb_t seqdb,
poptions_t options);
pairset build_primers_set_greedy_spc (SetParams *pparams);
void get_set_mean_cov_stats (pairset *prob_set, SetParams *pparams);
/*
 * Alternative search strategies. Each takes the sorted pair array, its
 * element count, the sequence database and the run options.
 * presumably simulated annealing / tabu search / brute force, as the names
 * suggest -- TODO confirm.
 */
void some_other_set_possibilities (pairset *pair_set,
ppair_t * sortedpairs, int32_t sorted_count, pecodnadb_t seqdb, poptions_t options);
void sets_by_SimulatedAnealing (pairset *pair_set,
ppair_t * sortedpairs, int32_t sorted_count, pecodnadb_t seqdb, poptions_t options);
void sets_by_TabuSearch (pairset *pair_set,
ppair_t * sortedpairs, int32_t sorted_count, pecodnadb_t seqdb, poptions_t options);
/* Returns a pointer to a pairset; NOTE(review): ownership of the result is not visible here. */
pairset * sets_by_BruteForce (ppair_t * sortedpairs,
int32_t sorted_count, pecodnadb_t seqdb, poptions_t options);
/* Returns a pointer to a pairset; NOTE(review): may or may not alias pair_set -- confirm. */
pairset * extend_set_randomly (pairset *pair_set, SetParams *params, int extend_to_cnt);
void build_and_print_sets (ppair_t * sortedpairs, int32_t sorted_count, pecodnadb_t seqdb, poptions_t options);
int32_t get_next_option_increasing_cov (pairset *pair_set, SetParams *pparams);
void reset_set_props (pairset *pair_set, SetParams *pparams);
/* Takes the sorted pairs and options only (no sequence database); presumably emits Graphviz output -- confirm. */
void primers_graph_graphviz (ppair_t * sortedpairs,
int32_t sorted_count, poptions_t options);
/*
 * The two functions below take the pair array by address (ppair_t **) and
 * return a size_t; presumably the new element count -- TODO confirm.
 */
size_t primers_changeSortedArray (ppair_t ** pairs,
size_t sorted_count, poptions_t options);
size_t primers_filterWithGivenLinks (ppair_t ** pairs,
size_t sorted_count, poptions_t options);
#endif
/*
* ahocorasick.h
*
* Created on: 26 march 2011
* Author: tiayyba
*/
#ifndef H_ahocorasick
#define H_ahocorasick
#include "ecoprimer.h"
/* One output entry: identifies a word by index and records its orientation. */
typedef struct aho_output_t{
uint32_t wordidx; // index of the strict word (the word itself, 64B, is not saved)
bool_t isdirect; // both direct and reverse words are searched, so record which one this entry is
}aho_output;
/* A counted collection of aho_output entries. */
typedef struct aho_output_count_t{
uint32_t count; // presumably the number of entries in out_set -- confirm in ahocorasick.c
aho_output *out_set; // pointer to the aho_output entries
}aho_output_count;
/* A state of the automaton, with one transition slot per nucleotide label. */
typedef struct aho_state_t{
int32_t id; // identifier of this state
struct aho_state_t *next[4]; // transitions indexed by label: A=0, C=1, G=2 and T=3
struct aho_state_t *fail; // presumably the Aho-Corasick failure link -- confirm in ahocorasick.c
aho_output_count output; // outputs attached to this state
}aho_state;
/* Singly linked queue element that carries a pointer to an automaton state. */
typedef struct queue_node_t {
aho_state *state_node; // the state held by this queue element
struct queue_node_t *next; // following element in the queue
}queue_node;
/*
 * Queue of automaton states, tracked through its two end nodes.
 * NOTE(review): presumably used for the breadth-first pass that sets the
 * fail pointers -- confirm in ahocorasick.c.
 */
typedef struct{
queue_node *first; // one end of the queue (named "first")
queue_node *last; // the other end of the queue (named "last")
}aho_queue;
/*
 * Entry point of the Aho-Corasick based strict-primer lookup.
 * Takes the sequence database with its size, an example count, a word-count
 * table and the run options, and returns a pprimercount_t.
 * NOTE(review): exact semantics of exampleCount and ownership of the returned
 * value are not visible from this header -- confirm in ahocorasick.c.
 */
pprimercount_t ahoc_lookforStrictPrimers (pecodnadb_t database, uint32_t seqdbsize,uint32_t exampleCount,
pwordcount_t words,poptions_t options);
#endif /* H_ahocorasick */
......@@ -176,6 +176,7 @@ typedef struct {
int *wellIdentifiedSeqs; //< an array having elements equal to total seqs
// values are either 0 or 1, if seq is well identified
// its 1 else 0
int *coveredSeqs; //< an array having elements equal to total seqs, 1 if seq is covered else 0
// these statistics are relative to inexample sequences
......@@ -291,6 +292,9 @@ typedef struct {
PNNParams pnparm;
bool_t print_sets_of_primers;
float specificity_threshold;
int links_cnt;
float max_links_percent;
bool_t filter_on_links;
} options_t, *poptions_t;
typedef ecoseq_t **pecodnadb_t;
......@@ -350,7 +354,7 @@ int32_t getrankdbstats(pecodnadb_t seqdb,
uint32_t seqdbsize,
ecotaxonomy_t *taxonomy,
poptions_t options);
float taxonomycoverage(ppair_t pair, poptions_t options);
float taxonomycoverage(ppair_t pair, poptions_t options, pecodnadb_t seqdb,uint32_t seqdbsize);
char ecoComplementChar(char base);
void taxonomyspecificity (ppair_t pair, pecodnadb_t seqdb,uint32_t seqdbsize);
......
......@@ -114,6 +114,8 @@ static int32_t *ecoFilteringHashSequence(int32_t *dest,
error<<= 1;
error&=ERRORMASK(FWORDSIZE);
//code = -1;
//if((*base) >= 'A' && (*base) <= 'Z')
code = encoder[(*base) - 'A'];
if (code <0)
{
......@@ -154,7 +156,7 @@ int32_t *filteringSeq(pecodnadb_t database, uint32_t seqdbsize,
for (i=0;i<seqdbsize;i++)
{
if (database[i]->isexample)
if (database[i]->isexample && database[i]->SQ_length > options->primer_length)
{
j++;
wordscount=ecoFilteringHashSequence(wordscount,
......
......@@ -179,7 +179,7 @@ static void buildPrimerPairsForOneSeq(uint32_t seqid,
uint32_t i,j,k;
uint32_t matchcount=0;
pprimermatch_t matches = NULL;
primermatchcount_t seqmatchcount;
//primermatchcount_t seqmatchcount;
ppair_t pcurrent;
pair_t current;
pprimer_t wswp;
......@@ -189,9 +189,9 @@ static void buildPrimerPairsForOneSeq(uint32_t seqid,
//char prmr[50];
//float mtemp;
word_t w1, w1a, omask = (0x1L << (options->strict_three_prime*2)) -1;
word_t w2, w2a, wtmp;
word_t w2, w2a;//, wtmp;
uint32_t bp1,bp2;
//prmr[options->primer_length] = '\0';
for (i=0;i < primers->size; i++)
......@@ -252,16 +252,17 @@ static void buildPrimerPairsForOneSeq(uint32_t seqid,
{
// For all primers matching the sequence
//for(j=i+1;
// (j<matchcount)
// && ((distance=matches[j].position - matches[i].position - options->primer_length) < options->lmax);
// j++
// )
/*for(j=i+1;
(j<matchcount)
&& ((distance=matches[j].position - matches[i].position - options->primer_length) < options->lmax);
j++
)//*/
for (j=i+1; j<matchcount; j++)
{
if (matches[j].position - matches[i].position <= options->primer_length) continue;
distance = matches[j].position - matches[i].position - options->primer_length;
if (distance >= options->lmax) break;
// For all not too far primers
......@@ -269,9 +270,7 @@ static void buildPrimerPairsForOneSeq(uint32_t seqid,
&& (distance > options->lmin)
)
{
// If possible primer pair
current.p1 = matches[i].primer;
current.asdirect1=matches[i].strand;
current.p2 = matches[j].primer;
......@@ -456,7 +455,6 @@ static void buildPrimerPairsForOneSeq(uint32_t seqid,
}
}
}
pairs->count=paircount;
}
......@@ -108,10 +108,11 @@ void addSeqToWordCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circ
table->inseqcount++;
table->strictcount = ECOREALLOC(table->strictcount,buffersize*sizeof(uint32_t),
//fprintf (stderr, "\nOldAddress: %x", table->strictcount);
table->strictcount = ECOREALLOC(table->strictcount,(buffersize+5000)*sizeof(uint32_t),
"Cannot allocate memory to extend example word count table");
//fprintf (stderr, " NewAddress: %x\n", table->strictcount);
for (i=table->size; i < buffersize; i++)
table->strictcount[i]=1;
......@@ -172,7 +173,7 @@ pwordcount_t lookforStrictPrimer(pecodnadb_t database, uint32_t seqdbsize,
for (i=0;i<seqdbsize;i++)
{
if (database[i]->isexample)
if (database[i]->isexample && database[i]->SQ_length > options->primer_length)
{
if (first)
......
......@@ -6,10 +6,46 @@
*/
#include <search.h>
//void tdestroy (void *root, void (*free_node)(void *nodep));
#include "ecoprimer.h"
static int cmptaxon(const void *t1, const void* t2);

/*
 * Address of the root pointer of the tree that free_tree_nodes() should
 * empty. When it is set and designates the tree passed to free_tree_nodes(),
 * that root pointer is updated in place and ends up NULL.
 */
void **tree_root = NULL;

/*
 * Shared counter: incremented once per callback invocation by
 * delete_twalkaction(); set by free_tree_nodes() to the number of nodes it
 * removed.
 */
int delete_passes = 0;

/*
 * twalk() callback that counts node visits in delete_passes.
 *
 * It deliberately does not touch the tree. The previous version called
 * tdelete() from inside the walk, which is undefined behaviour (the tree must
 * not be modified by a twalk action), and it passed the node pointer where
 * tdelete() expects the stored key, so no node was ever removed. Node removal
 * is done by free_tree_nodes() instead.
 *
 * node, order and level are the standard twalk() callback arguments; none of
 * them is needed for counting.
 */
void delete_twalkaction (const void *node, VISIT order, int level)
{
    (void) node;
    (void) order;
    (void) level;

    delete_passes++;
}

/*
 * Removes every node of a tsearch() tree whose keys are ordered by cmptaxon.
 *
 * tree is the root node pointer of the tree (may be NULL: nothing to do).
 * If the global tree_root points at a root pointer equal to tree, deletion is
 * performed through it so the caller's pointer is NULL on return; otherwise a
 * local copy is used and the caller must reset its own pointer.
 *
 * Only the tree nodes are released; the keys themselves are not freed.
 * NOTE(review): cmptaxon treats keys as integer ids stored in the pointer
 * value, so there should be nothing else to free -- confirm.
 *
 * On return delete_passes holds the number of nodes removed.
 */
void free_tree_nodes (void *tree)
{
    void  *local_root = tree;
    void **rootp = &local_root;

    if (tree_root != NULL && *tree_root == tree)
        rootp = tree_root;

    delete_passes = 0;

    while (*rootp != NULL)
    {
        /* A node pointer can be read as a pointer to the stored key pointer. */
        const void *key = *(void **) *rootp;

        /* tdelete() returns NULL only when the key is not found; stop rather
           than spin forever if the comparator is inconsistent. */
        if (tdelete (key, rootp, cmptaxon) == NULL)
            break;

        delete_passes++;
    }
}
static int cmptaxon(const void *t1, const void* t2)
{
const size_t taxid1=(size_t)t1;
......@@ -35,7 +71,12 @@ int32_t counttaxon(int32_t taxid)
if (taxid==-1)
{
if (taxontree)
{
tree_root = (void **)&taxontree;
//free_tree_nodes (taxontree);
ECOFREE(taxontree,"Free taxon tree");
tree_root = NULL;
}
taxontree=NULL;
taxoncount=0;
return 0;
......@@ -97,22 +138,30 @@ int32_t getrankdbstats(pecodnadb_t seqdb, uint32_t seqdbsize, ecotaxonomy_t *tax
}
float taxonomycoverage(ppair_t pair, poptions_t options)
float taxonomycoverage(ppair_t pair, poptions_t options, pecodnadb_t seqdb,uint32_t seqdbsize)
{
int32_t seqcount;
int32_t i;
int32_t incount=0;
int32_t outcount=0;
uint32_t j;
memset (pair->coveredSeqs, 0, seqdbsize*sizeof (int));
seqcount=pair->pcr.ampcount;
counttaxon(-1);
for (i=0; i < seqcount; i++)
if (pair->pcr.amplifias[i].sequence->isexample
&& pair->pcr.amplifias[i].sequence->ranktaxonid > 0 )
{
incount = counttaxon(pair->pcr.amplifias[i].sequence->ranktaxonid);
for (j=0; j<seqdbsize; j++)
if (pair->pcr.amplifias[i].sequence == seqdb[j])
{pair->coveredSeqs[j] = 1; break;}
}
counttaxon(-1);
for (i=0; i < seqcount; i++)
if (!pair->pcr.amplifias[i].sequence->isexample
......@@ -145,12 +194,14 @@ static int cmpamp(const void *ampf1, const void* ampf2)
{
incr = -1;
j = pampf1->length - 1;
if (pampf2->strand)
{
pampf1 = (pamptotaxon_t) ampf2;
pampf2 = (pamptotaxon_t) ampf1;
chd = 1;
}
//j = pampf2->length - 1; should have been here and pampf2 instead of pampf1?
}
len = (pampf1->length <= pampf2->length)? pampf1->length: pampf2->length;
......@@ -173,6 +224,7 @@ static int cmpamp(const void *ampf1, const void* ampf2)
return 0;
}*/
static int cmpamp(const void *ampf1, const void* ampf2)
{
int i;
......@@ -183,10 +235,10 @@ static int cmpamp(const void *ampf1, const void* ampf2)
char *ch2;
int incr1;
int incr2;
pamptotaxon_t pampf1 = (pamptotaxon_t) ampf1;
pamptotaxon_t pampf2 = (pamptotaxon_t) ampf2;
ch1 = pampf1->amplifia;
ch2 = pampf2->amplifia;
......@@ -218,7 +270,7 @@ static int cmpamp(const void *ampf1, const void* ampf2)
if (pampf1->length > pampf2->length) return 1;
if (pampf2->length > pampf1->length) return -1;
return 0;
}
......@@ -242,6 +294,8 @@ void taxonomyspecificity (ppair_t pair, pecodnadb_t seqdb,uint32_t seqdbsize)
uint32_t i, j;
uint32_t ampfindex = 0;
int32_t taxid;
uint32_t wellidentifiedcount;
void *ampftree = NULL;
pamptotaxon_t pcurrentampf;
pamptotaxon_t *ptmp;
......@@ -278,11 +332,14 @@ void taxonomyspecificity (ppair_t pair, pecodnadb_t seqdb,uint32_t seqdbsize)
}
memset (pair->wellIdentifiedSeqs, 0, seqdbsize*sizeof (int));
counttaxon(-1);
//counttaxon(-1);
for (i = 0; i < ampfindex; i++)
{
if (ampfwithtaxtree[i].taxoncount > 1)
twalk(ampfwithtaxtree[i].taxontree, twalkaction);
{
//printf ("\nampfwithtaxtree[i].taxoncount: %d\n", ampfwithtaxtree[i].taxoncount);
//twalk(ampfwithtaxtree[i].taxontree, twalkaction);
}
//TR 5/9/10 - added code for well identified seqs
else if(ampfwithtaxtree[i].taxoncount == 1) /*well identified*/
{
......@@ -293,6 +350,7 @@ void taxonomyspecificity (ppair_t pair, pecodnadb_t seqdb,uint32_t seqdbsize)
{
for (j = 0; j < seqdbsize; j++)
if (seqdb[j]->ranktaxonid == gtxid
&& seqdb[j]->isexample
&&(pair->p1->directCount[j] > 0
|| pair->p1->reverseCount[j] > 0)
&& (pair->p2->directCount[j] > 0
......@@ -303,10 +361,18 @@ void taxonomyspecificity (ppair_t pair, pecodnadb_t seqdb,uint32_t seqdbsize)
}
}
}
pair->notwellidentifiedtaxa = counttaxon(-2);
pair->bs = ((float)pair->intaxa - (float)pair->notwellidentifiedtaxa) / pair->intaxa;
//printf ("\n");
counttaxon(-1);
wellidentifiedcount = 0;
for (j = 0; j < seqdbsize; j++)
if (pair->wellIdentifiedSeqs[j] == 1)
counttaxon(seqdb[j]->ranktaxonid);
wellidentifiedcount = counttaxon(-2);
//pair->notwellidentifiedtaxa = counttaxon(-2);
pair->notwellidentifiedtaxa = (pair->intaxa-wellidentifiedcount); //counttaxon(-2);
//pair->bs = ((float)pair->intaxa - (float)pair->notwellidentifiedtaxa) / pair->intaxa;
pair->bs = ((float)wellidentifiedcount) / (float)pair->intaxa;
ECOFREE (ampfwithtaxtree, "Free amplifia table");
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment