Commit 627c4e77 authored by Celine Mercier

sumatra version 1.0.02

parents
This diff is collapsed.
This diff is collapsed.
# Makefile for the sumatra executable.
# Shared variables, compiler flags, the %.o pattern rule and the rules that
# build the sumalibs static libraries come from ./global.mk.

EXEC=sumatra

SUMATRA_SRC= sumatra.c \
             mtcompare_sumatra.c

SUMATRA_OBJ= $(patsubst %.c,%.o,$(SUMATRA_SRC))

SRCS= $(SUMATRA_SRC)

# Libraries passed to the linker (resolved through the -L paths of global.mk).
LIB= -lfasta -llcs -lfile -lutils -lz -lm

include ./global.mk

# Neither target produces a file of that name; declaring them phony keeps
# them working even if a file called 'all' or 'clean' exists.
.PHONY: all clean

all: $(EXEC)

########
#
# sumatra compilation
#
########

# executable compilation and link
sumatra: $(SUMATRA_OBJ) $(LIBFASTA) $(LIBLCS) $(LIBFILE) $(LIBUTILS)
	$(CC) $(LDFLAGS) -o $@ -pthread $(SUMATRA_OBJ) $(LIBFASTAPATH) $(LIBLCSPATH) $(LIBFILEPATH) $(LIBUTILSPATH) $(LIB)

########
#
# project management
#
########

# Removes local build products, then cleans each sumalibs library directory.
clean:
	rm -f *.o
	rm -f *.P
	rm -f $(EXEC)
	$(MAKE) -C ../sumalibs/libfasta clean
	$(MAKE) -C ../sumalibs/liblcs clean
	$(MAKE) -C ../sumalibs/libfile clean
	$(MAKE) -C ../sumalibs/libutils clean
# Shared build settings: library locations, compiler flags, the generic
# object rule and the rules that build the sumalibs static libraries.

# Linker search paths for the sumalibs libraries.
LIBFASTAPATH = -L../sumalibs/libfasta
LIBLCSPATH = -L../sumalibs/liblcs
LIBFILEPATH = -L../sumalibs/libfile
LIBUTILSPATH = -L../sumalibs/libutils

# Static library files, used as prerequisites of the executable.
LIBFASTA = ../sumalibs/libfasta/libfasta.a
LIBLCS = ../sumalibs/liblcs/liblcs.a
LIBFILE = ../sumalibs/libfile/libfile.a
LIBUTILS = ../sumalibs/libutils/libutils.a

CC=gcc
LDFLAGS=

# OpenMP flags are only enabled when compiling with gcc.
ifeq ($(CC),gcc)
        CFLAGS = -O3 -s -DOMP_SUPPORT -fopenmp -w
else
        CFLAGS = -O3 -w
endif

# 'default' is the first target seen by make when this file is included
# before any other rule; it is not a file, so declare it phony.
.PHONY: default

default: all

# Compile only: the -l link flags in $(LIB) have no effect with -c, so they
# are not passed here (they are still used by the link rule).
%.o: %.c
	$(CC) $(CFLAGS) -c -o $@ $<

########
#
# libraries compilation
#
########

../sumalibs/libfasta/libfasta.a:
	$(MAKE) -C ../sumalibs/libfasta

../sumalibs/liblcs/liblcs.a:
	$(MAKE) -C ../sumalibs/liblcs

../sumalibs/libfile/libfile.a:
	$(MAKE) -C ../sumalibs/libfile

../sumalibs/libutils/libutils.a:
	$(MAKE) -C ../sumalibs/libutils
\ No newline at end of file
/*
* mtcompare.c
*
* Created on: 17 August 2010
* Authors: Eric Coissac, Celine Mercier
*/
#include <pthread.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "sumatra.h"
#include "../sumalibs/libfasta/sequence.h"
#include "../sumalibs/libutils/utilities.h"
#include "../sumalibs/liblcs/upperband.h"
#include "../sumalibs/liblcs/sse_banded_LCS_alignment.h"
/*
 * State shared between the dispatching thread (mt_compare_sumatra) and the
 * worker threads (computeline). A single instance is passed by address to
 * every worker.
 */
typedef struct {
    /* --- work distribution --- */
    int32_t          tocompute;       /* next line to compute */
    int64_t          computed;        /* count of alignment computed */
    int32_t          thread_count;    /* count of active thread; each worker takes its id from it */
    int32_t          tokill;          /* id written by a worker when it ends, read by the join loop */
    pthread_t       *threads;         /* one handle per created worker */

    /* --- mutexes used to hand control between dispatcher and workers --- */
    pthread_mutex_t  m_tocompute;     /* taken by a worker before it reads tocompute */
    pthread_mutex_t  m_thread_count;  /* released by a worker once it has taken its id */
    pthread_mutex_t  m_incharge;      /* released by a worker once it has read its line */
    pthread_mutex_t  m_print;         /* held around the output of one line */
    pthread_mutex_t  m_tokill;        /* released by a worker once tokill is written */
    pthread_mutex_t  m_finished;      /* taken by a worker before it writes tokill */

    /* --- job description --- */
    int32_t          lineset;         /* number of db1 lines covered by one dispatch step */
    int64_t          pairs;           /* total number of pairs, forwarded to printLine */
    fastaSeqCount   *db1;             /* first dataset */
    fastaSeqCount   *db2;             /* second dataset, or NULL to compare db1 with itself */
    BOOL             normalize;
    BOOL             lcsmode;
    int              reference;
    BOOL             extradata;
    BOOL             always;          /* TRUE: align every pair regardless of the filter result */
    double           threshold;

    /* --- per-thread work tables, indexed by thread id --- */
    int16_t        **addresses;
    int16_t        **iseqs1;
    int16_t        **iseqs2;
    int              sizeForSeqs;     /* value returned by prepareTablesForSumathings */
} thread_control_t;
/*
 * Outputs one computed line of results.
 *
 * Calls printResults() once for each of the seqcount entries, pairing seq1
 * with the idx-th sequence after seq2 and the idx-th value of score. The
 * last argument given to printResults() is true only when that score is
 * not negative.
 */
void printLine(fastaSeqPtr seq1, fastaSeqPtr seq2, int32_t seqcount, double *score, BOOL extradata, int64_t pairs)
{
    int idx = 0;

    while (idx < seqcount)
    {
        double current = score[idx];

        printResults(seq1, seq2 + idx, current, extradata, pairs, (current >= 0));
        idx++;
    }
}
/*
 * Worker thread entry point.
 *
 * c points to the shared thread_control_t. The worker takes its id from
 * control->thread_count, then repeatedly picks up the line published in
 * control->tocompute and, for up to control->lineset sequences of db1
 * starting at that line, scores each of them against the sequences of db2
 * (or against the following sequences of db1 when db2 is NULL) and prints
 * the resulting line.
 *
 * Returns the thread id converted to a pointer. The process exits with
 * status 2 if the score buffer cannot be allocated.
 *
 * NOTE(review): the mutexes are used as hand-off signals: several of them
 * are unlocked here although they were locked by the dispatching thread
 * (and the other way round), and control->tocompute is read in the loop
 * condition without holding a lock. The order of the lock/unlock calls
 * below is intentionally left exactly as it was.
 */
void *computeline(void* c)
{
    thread_control_t *control=(thread_control_t*)c;
    int32_t line;
    int32_t start;
    int32_t end;
    int32_t threadid;
    int32_t i,j;
    double *score;
    double *scores;
    fastaSeqPtr db2;
    fastaSeqPtr db1;
    fastaSeqPtr sq2;
    fastaSeqPtr sq1;
    char* s1;
    int l1;
    int LCSmin;

    /* Take an id, then let the dispatcher (blocked on m_thread_count) go on. */
    threadid = control->thread_count++;
    pthread_mutex_unlock(&(control->m_thread_count));

    db1 = control->db1->fastaSeqs;

    if (control->db2)
    {
        end = control->db2->count;
        db2 = control->db2->fastaSeqs;
    }
    else
    {
        end = control->db1->count;
        db2 = control->db1->fastaSeqs;
    }

    /* One score per column of a line; reused for every line. */
    scores = (double*)malloc((size_t)end * sizeof(double));
    if (scores == NULL && end > 0)
    {
        fprintf(stderr,"Cannot allocate memory for scores\n");
        exit(2);
    }

    while (control->tocompute < control->db1->count)
    {
        /* Wait for the dispatcher to publish a line, read it, acknowledge. */
        pthread_mutex_lock(&(control->m_tocompute));
        line = control->tocompute;
        pthread_mutex_unlock(&(control->m_incharge));

        if (line < control->db1->count)
        {
            for (i=0,sq1=db1+line; i < control->lineset && (i+line) < control->db1->count; i++,sq1++)
            {
                s1 = sq1->sequence;
                l1 = sq1->length;

                /* Against itself, only the pairs after the diagonal are computed. */
                start = (control->db2) ? 0:line+i+1;

                for (score=scores,j=start; j < end ; j++,score++)
                {
                    sq2=db2+j;
                    filtersSumatra(sq1, sq2, control->threshold, control->normalize, control->reference, control->lcsmode, score, &LCSmin);

                    /* The alignment is run when forced, or when the filter left -1.0 in *score. */
                    if (control->always || (*score == -1.0))
                    {
                        *score = alignForSumathings(s1, *((control->iseqs1)+threadid), sq2->sequence, *((control->iseqs2)+threadid), l1, sq2->length,
                                                    control->normalize, control->reference, control->lcsmode, *((control->addresses)+threadid), control->sizeForSeqs, LCSmin);

                        /* A score on the wrong side of the threshold becomes -1.0 (not printed). */
                        if (!control->always && (((*score < control->threshold) && (control->lcsmode || control->normalize)) || ((!control->lcsmode && !control->normalize) && (*score > control->threshold))))
                            *score = -1.0;
                        else if (!control->lcsmode && control->normalize)
                            *score = 1.0 - *score;
                    }
                    else if (*score == -2.0)
                        *score = -1.0;
                }

                /* Only one thread at a time writes a line of results. */
                pthread_mutex_lock(&(control->m_print));
                printLine(sq1,db2+start,end-start,scores,control->extradata,control->pairs);
                pthread_mutex_unlock(&(control->m_print));
            }
        }
    }

    pthread_mutex_unlock(&(control->m_incharge));
    free(scores);

    /* Announce which thread can be joined, then wake the joining loop. */
    pthread_mutex_lock(&(control->m_finished));
    control->tokill=threadid;
    pthread_mutex_unlock(&(control->m_tokill));

    /* Go through intptr_t so the 32-bit id is widened before becoming a pointer. */
    return (void*)(intptr_t)threadid;
}
/*
 * Computes and prints all pairwise scores of db1 against itself (db2 == NULL)
 * or of db1 against db2, using worker threads running computeline().
 *
 * db1, db2   : datasets; db2 may be NULL.
 * threshold  : score threshold; when it is > 0 the exact alignment is only
 *              computed for pairs not rejected by the filter.
 * normalize, reference, lcsmode : scoring options forwarded to the filter,
 *              table preparation and alignment routines.
 * extradata  : forwarded to the printing routine.
 * n          : requested number of threads. Values below 1 are treated as 1,
 *              and the number of threads actually started is capped at half
 *              the number of sequences of db1 (but never below 1).
 *
 * Returns 0. The process exits with status 1 on a mutex initialisation
 * error, 2 on an allocation error and 3 when a thread cannot be created.
 *
 * NOTE(review): the mutexes are used as hand-off signals between this
 * thread and the workers (locked in one thread, unlocked in another), and
 * two of them are unlocked below right after initialisation. That protocol
 * is left exactly as it was.
 */
int mt_compare_sumatra(fastaSeqCount *db1, fastaSeqCount *db2, double threshold, BOOL normalize, int reference, BOOL lcsmode, BOOL extradata, int n)
{
    int64_t pairs;
    thread_control_t control;
    int32_t i;
    int lmax, lmax1;
    int lmin, lmin1;
    int ntables;    /* number of per-thread table sets actually allocated */

    /* At least one set of tables and one worker are always needed. */
    if (n < 1)
        n = 1;

    if (db2==NULL)
    {
        fprintf(stderr,"Pairwise alignment of one database against itself\n");
        pairs = (int64_t)(db1->count - 1) * (int64_t)db1->count /2;
    }
    else
    {
        fprintf(stderr,"Pairwise alignment of two databases\n");
        pairs = (int64_t)db1->count * (int64_t)db2->count;
    }

    fprintf(stderr,"Count of alignment to do : %lld\n",(long long)pairs);

    control.addresses = (int16_t**) malloc(n*sizeof(int16_t*));
    control.iseqs1 = (int16_t**) malloc(n*sizeof(int16_t*));
    control.iseqs2 = (int16_t**) malloc(n*sizeof(int16_t*));
    if (!control.addresses || !control.iseqs1 || !control.iseqs2)
    {
        fprintf(stderr,"Cannot allocate memory for alignment tables\n");
        exit(2);
    }

    calculateMaxAndMinLenDB(*db1, &lmax, &lmin);
    if (!(db2==NULL))
    {
        calculateMaxAndMinLenDB(*db2, &lmax1, &lmin1);
        if (lmax1 > lmax)
            lmax = lmax1;
        if (lmin1 < lmin)
            lmin = lmin1;
    }

    /* One set of work tables per requested thread. */
    ntables = n;
    for (i=0; i < ntables; i++)
        control.sizeForSeqs = prepareTablesForSumathings(lmax, lmin, threshold, normalize, reference, lcsmode, (control.addresses)+i, (control.iseqs1)+i, (control.iseqs2)+i);

    control.db1 = db1;
    control.db2 = db2;
    control.tocompute = 0;
    control.normalize = normalize;
    control.reference = reference;
    control.extradata = extradata;
    control.threshold = threshold;
    control.lcsmode = lcsmode;
    control.computed = 0;
    control.thread_count = 0;
    control.tokill = 0;
    control.pairs = pairs;

    /*
     * Cap the number of workers at half the number of db1 sequences. With
     * 0 or 1 sequence that cap is 0, which would make the division below
     * divide by zero, so keep at least one worker; likewise keep lineset
     * at 1 or more, otherwise the dispatch loop would never advance.
     */
    if (n > control.db1->count/2)
        n = control.db1->count/2;
    if (n < 1)
        n = 1;

    control.lineset = control.db1->count / n / 2;
    if (control.lineset < 1)
        control.lineset = 1;

    if (threshold > 0)
    {
        fprintf(stderr,"Compute exact LCS only for score > %lf\n", threshold);
        control.always = FALSE;
    }
    else
        control.always=TRUE;

    if (pthread_mutex_init(&(control.m_thread_count),NULL))
    {
        fprintf(stderr,"m_thread_count mutex init error\n");
        exit(1);
    }

    if (pthread_mutex_init(&(control.m_tocompute),NULL))
    {
        fprintf(stderr,"m_tocompute mutex init error\n");
        exit(1);
    }

    if (pthread_mutex_init(&(control.m_incharge),NULL))
    {
        fprintf(stderr,"m_incharge mutex init error\n");
        exit(1);
    }

    if (pthread_mutex_init(&(control.m_print),NULL))
    {
        fprintf(stderr,"m_print mutex init error\n");
        exit(1);
    }

    if (pthread_mutex_init(&(control.m_tokill),NULL))
    {
        fprintf(stderr,"m_tokill mutex init error\n");
        exit(1);
    }

    if (pthread_mutex_init(&(control.m_finished),NULL))
    {
        fprintf(stderr,"m_finished mutex init error\n");
        exit(1);
    }

    control.threads = (pthread_t*)malloc(n * sizeof(pthread_t));
    if (!control.threads)
    {
        fprintf(stderr,"Cannot allocate memory for threads\n");
        exit(2);
    }

    /* Initial state of the hand-off mutexes. */
    pthread_mutex_lock(&(control.m_thread_count));
    pthread_mutex_lock(&(control.m_tocompute));
    pthread_mutex_lock(&(control.m_incharge));
    pthread_mutex_unlock(&(control.m_print));
    pthread_mutex_lock(&(control.m_tokill));
    pthread_mutex_unlock(&(control.m_finished));

    fprintf(stderr,"\n");

    for (i=0; i < n; i++)
    {
        fprintf(stderr,"Initializing thread...");
        if (pthread_create(control.threads+i,NULL,computeline,&control))
        {
            fprintf(stderr," : thread %d Error\n",i);
            exit(3);
        }
        /* Blocks until the new worker has taken its id and released the mutex. */
        pthread_mutex_lock(&(control.m_thread_count));
        fprintf(stderr," : thread %d Ok\n",control.thread_count);
    }

    pthread_mutex_unlock(&(control.m_thread_count));

    /* Publish one line set at a time and wait until a worker has read it. */
    for (control.tocompute=0;
         control.tocompute < db1->count+1;
         control.tocompute+=control.lineset)
    {
        pthread_mutex_unlock(&(control.m_tocompute));
        pthread_mutex_lock(&(control.m_incharge));
    }

    pthread_mutex_unlock(&(control.m_tocompute));

    fprintf(stderr,"\n");

    /* Join the workers in the order in which they report being finished. */
    for (i=0; i < n; i++)
    {
        pthread_mutex_lock(&(control.m_tokill));
        fprintf(stderr,"Joining thread %d...",control.tokill);
        pthread_mutex_unlock(&(control.m_tocompute));
        pthread_join(control.threads[control.tokill],NULL);
        fprintf(stderr," : Ok\n");
        pthread_mutex_unlock(&(control.m_finished));
    }

    // Freeing

    /* Every prepared table set is released, not only those of started threads. */
    for (i=0; i < ntables; i++)
    {
        free((*((control.iseqs1)+i))-(control.sizeForSeqs)+lmax);
        free((*((control.iseqs2)+i))-(control.sizeForSeqs)+lmax);
    }

    free(control.iseqs1);
    free(control.iseqs2);

    /* The per-thread address tables are only released in these modes, as before. */
    if ((reference == ALILEN) && ((lcsmode && normalize) || (!lcsmode)))
    {
        for (i=0; i < ntables; i++)
            free(*((control.addresses)+i));
    }
    /* The array of table pointers itself is always allocated above. */
    free(control.addresses);

    free(control.threads);

    return 0;
}
/*
* mtcompare_sumatra.h
*
* Created on: 12 mars 2013
* Author: celinemercier
*/
#ifndef MTCOMPARE_SUMATRA_H_
#define MTCOMPARE_SUMATRA_H_
/*
 * Multi-threaded pairwise comparison.
 *
 * Compares the sequences of db1 with each other when db2 is NULL, or the
 * sequences of db1 with those of db2 otherwise, and prints the results.
 *
 * threshold, normalize, reference, lcsmode : scoring options.
 * extradata : forwarded to the result printing routine.
 * n         : requested number of threads; the implementation caps it at
 *             half the number of sequences in db1.
 *
 * Returns 0. The implementation calls exit() on mutex initialisation,
 * thread allocation or thread creation errors.
 */
int mt_compare_sumatra(fastaSeqCount *db1, fastaSeqCount *db2, double threshold, BOOL normalize, int reference, BOOL lcsmode, BOOL extradata, int n);
#endif /* MTCOMPARE_SUMATRA_H_ */
/**
* FileName: sumatra.c
* Authors: Eric Coissac, Celine Mercier
* Description: computation of pairwise similarities of DNA sequences
* **/
#include "sumatra.h"
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <math.h>
#include <string.h>
#include <sys/time.h>
#include "../sumalibs/libfasta/sequence.h"
#include "../sumalibs/liblcs/upperband.h"
#include "../sumalibs/liblcs/sse_banded_LCS_alignment.h"
#include "../sumalibs/libutils/utilities.h"
#include "mtcompare_sumatra.h"
#define VERSION "1.0.02"
/* ----------------------------------------------- */
/* printout help */
/* ----------------------------------------------- */
/*
 * Writes the full help text (version banner, synopsis, options, arguments
 * and description of the result columns) to standard output.
 */
static void PrintHelp()
{
    /* banner */
    printf("-----------------------------------------------------------------------------------------------------------------------------\n");
    printf(" SUMATRA Version %s\n", VERSION);
    printf("-----------------------------------------------------------------------------------------------------------------------------\n");

    /* synopsis */
    printf(" Synopsis : sumatra computes all the pairwise LCS (Longest Common Subsequence) scores\n");
    printf(" of one nucleotide dataset or between two nucleotide datasets.\n");
    printf(" Usage: sumatra [options] <dataset1> [dataset2]\n");
    printf("-----------------------------------------------------------------------------------------------------------------------------\n");

    /* options */
    printf(" Options:\n\n");
    printf(" -h : [H]elp - print <this> help\n\n");
    printf(" -l : Reference sequence length is the shortest. \n\n");
    printf(" -L : Reference sequence length is the largest. \n\n");
    printf(" -a : Reference sequence length is the alignment length (default). \n\n");
    printf(" -n : Score is normalized by reference sequence length (default).\n\n");
    printf(" -r : Raw score, not normalized. \n\n");
    printf(" -d : Score is expressed in distance (default: score is expressed in similarity). \n\n");
    printf(" -t ##.## : Score threshold. If the score is normalized and expressed in similarity (default),\n");
    printf(" it is an identity, e.g. 0.95 for an identity of 95%%. If the score is normalized\n");
    printf(" and expressed in distance, it is (1.0 - identity), e.g. 0.05 for an identity of 95%%.\n");
    printf(" If the score is not normalized and expressed in similarity, it is the length of the\n");
    printf(" Longest Common Subsequence. If the score is not normalized and expressed in distance,\n");
    printf(" it is (reference length - LCS length).\n");
    printf(" Only sequence pairs with a similarity above ##.## are printed. Default: 0.00 \n");
    printf(" (no threshold).\n\n");
    printf(" -p ## : Number of threads used for computation (default=1).\n\n");
    printf(" -g : n's are replaced with a's (default: sequences with n's are discarded).\n");
    printf(" -x : Adds four extra columns with the count and length of both sequences.\n");
    printf("-----------------------------------------------------------------------------------------------------------------------------\n");

    /* positional arguments */
    printf(" First argument : the nucleotide dataset to analyze\n\n");
    printf(" Second argument : optionally the second nucleotide dataset\n");
    printf("-----------------------------------------------------------------------------------------------------------------------------\n");

    /* output columns */
    printf(" Results table description : \n");
    printf(" column 1 : Identifier sequence 1\n");
    printf(" column 2 : Identifier sequence 2\n");
    printf(" column 3 : Score\n");
    printf(" column 4 : Count of sequence 1 (only with option -x)\n");
    printf(" column 5 : Count of sequence 2 (only with option -x)\n");
    printf(" column 6 : Length of sequence 1 (only with option -x)\n");
    printf(" column 7 : Length of sequence 2 (only with option -x)\n");
    printf("-----------------------------------------------------------------------------------------------------------------------------\n");
    printf(" http://metabarcoding.org/sumatra\n");
    printf("-----------------------------------------------------------------------------------------------------------------------------\n\n");
}
/* ----------------------------------------------- */
/* printout usage and exit */
/* ----------------------------------------------- */
/*
 * Writes the one-line usage summary and the pointer to "sumatra -h" on
 * standard error.
 *
 * stat : exit status. When it is non-zero the process exits with that
 *        status; when it is zero the function simply returns.
 */
static void ExitUsage(int stat)
{
    fprintf(stderr, "usage: sumatra [-l|L|a|n|r|d|g|x] [-t threshold_value] [-p number of threads] dataset1 [dataset2]\n");
    fprintf(stderr, "type \"sumatra -h\" for help\n");

    if (stat)
        exit(stat);
}
void printResults(fastaSeqPtr seq1,fastaSeqPtr seq2,
double score,
BOOL extradata,
int64_t pairs,
BOOL print)
{
static struct timeval start;
static struct timeval lastprint;
static BOOL first=TRUE;
static uint64_t aligned=0;
struct timeval current;
double fraction;
time_t fulltime;
time_t remaintime;
double elapsedtime;
int32_t day;
int32_t hour;
int32_t minute;
int32_t seconde;
aligned++;
if (first)
{
first=FALSE;
gettimeofday(&start,NULL);
lastprint=start;
}
gettimeofday(&current,NULL);
if (current.tv_sec!=lastprint.tv_sec)
{
lastprint=current;
fraction = (double)aligned/(double)pairs;
elapsedtime = difftime(current.tv_sec,start.tv_sec);
fulltime = elapsedtime / fraction;
remaintime = (time_t)difftime(fulltime,(time_t)elapsedtime);
fprintf(stderr,
"Computed %lld / %lld -> %5.2lf%%",
aligned, pairs, fraction*100.
);
seconde = fulltime % 60;
minute = fulltime / 60;
hour = minute / 60;
minute = minute % 60;
day = hour / 24;
hour = hour % 24;
if (day)
fprintf(stderr,
", estimated computation time = %3d days %02d:%02d:%02d",
day,
hour,
minute,
seconde
);
else
fprintf(stderr,
", estimated computation time = %02d:%02d:%02d",
hour,
minute,
seconde
);
seconde = remaintime % 60;
minute = remaintime / 60;
hour = minute / 60;
minute = minute % 60;
day = hour / 24;
hour = hour % 24;
if (day)
fprintf(stderr,
", about %3d days %02d:%02d:%02d remaining \r",
day,
hour,
minute,
seconde
);
else
fprintf(stderr,
", about %02d:%02d:%02d remaining \r",
hour,
minute,
seconde
);
}
if (print)
{
if (extradata)
printf("%s\t%s\t%lf\t%d\t%d\t%d\t%d\n", seq1->accession_id,
seq2->accession_id,
score,
seq1->count,
seq2->count,
seq1->length,
seq2->length
);
else