Commit 04912948 by Celine Mercier

sumalibs first commit

parents
# Shared build settings. Presumably this is the ../global.mk that the
# per-library Makefiles include -- confirm against the repository layout.
CC=gcc
LDFLAGS=
# -w silences every compiler warning; remove it when hunting for bugs.
CFLAGS = -O3 -w

# `default` and `all` name actions, not files: a stray file with one of these
# names must never make the build look up to date.
.PHONY: default all

# `all` itself is defined by the including Makefile.
default: all

# Compile one C source into its object file.
# $(LIB) is not assigned here; it expands to nothing unless the including
# Makefile or the command line provides it.
%.o: %.c
	$(CC) $(CFLAGS) -c -o $@ $< $(LIB)
# Build rules for the static library libfasta.a.
SOURCES = fasta_header_parser.c \
          fasta_seq_writer.c \
          fasta_header_handler.c \
          header_mem_handler.c \
          sequence.c
SRCS=$(SOURCES)
OBJECTS= $(patsubst %.c,%.o,$(SOURCES))
LIBFILE = libfasta.a
RANLIB = ranlib

include ../global.mk

# Command-style targets: never confuse them with files of the same name.
.PHONY: all clean

# The scanner rules below write through a shell redirection, so a failing
# generator would otherwise leave a truncated .c file that looks up to date.
.DELETE_ON_ERROR:

all: $(LIBFILE)

# Generate the header scanner; -P renames the yy* symbols to header_yy*.
fasta_header_parser.c: fasta_header_parser.l
	flex -Pheader_yy -t $< > $@

# -P is a flex option, so call flex explicitly (as the rule above does)
# instead of relying on `lex` being an alias for it.
dic_parser.c: dic_parser.l
	flex -Phashtable_yy -t $< > $@

clean:
	rm -rf $(OBJECTS) $(LIBFILE)
	rm -f *.a

# $? holds only the objects newer than the archive, so ar replaces just those.
$(LIBFILE): $(OBJECTS)
	$(AR) -cr $@ $?
	$(RANLIB) $@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "sequence.h"
#include "fasta_header_parser.h"
#include "fasta_header_handler.h"
/*
 * Appends the field "; name=value" to a header string.
 *
 * If the header ends with '.', that final dot is replaced by the ';'
 * separator instead of being kept.
 *
 * header : NUL-terminated string obtained from malloc/calloc/realloc; it is
 *          resized with realloc, so the caller must use the returned pointer.
 * name   : NUL-terminated field name.
 * value  : NUL-terminated field value.
 *
 * Returns the (possibly moved) header, or NULL if memory could not be
 * obtained, in which case the original header is left untouched.
 */
char* char_header_add_field(char* header, char* name, char* value)
{
    size_t lheader = strlen(header);
    size_t lname = strlen(name);
    size_t lvalue = strlen(value);
    char* grown;
    char* out;

    /* Extra room: "; " (2) + "=" (1) + terminating NUL (1). */
    grown = (char*) realloc(header, (lheader + lname + lvalue + 4) * sizeof(char));
    if (grown == NULL)
        return NULL;

    /* Only look at the last character when there is one: an empty header
     * must not read the byte in front of the buffer. */
    if (lheader > 0 && grown[lheader - 1] == '.')
        lheader--;

    out = grown + lheader;
    memcpy(out, "; ", 2);
    out += 2;
    memcpy(out, name, lname);
    out += lname;
    *out++ = '=';
    memcpy(out, value, lvalue + 1);   /* +1 copies the terminating NUL */

    return grown;
}
/*
 * Builds a new raw header in which the field "name=value;" is inserted right
 * after the first space-delimited token of seq->rawheader.
 *
 *   "id rest of header"  ->  "id name=value; rest of header"
 *   "id"                 ->  "id name=value;"
 *
 * seq->rawheader itself is not modified.
 *
 * Returns a newly malloc'ed string owned by the caller, or NULL if memory
 * could not be obtained.
 */
char* fastaSeqPtr_header_add_field(fastaSeqPtr seq, char* name, char* value)
{
    const char* original = seq->rawheader;
    size_t lheader = strlen(original);
    size_t lname = strlen(name);
    size_t lvalue = strlen(value);
    size_t idlen = 0;
    char* rawheader;
    char* out;

    rawheader = (char*) malloc((lheader + lname + lvalue + 5) * sizeof(char));
    if (rawheader == NULL)
        return NULL;

    /* Length of the leading token (everything before the first space). */
    while ((original[idlen] != ' ') && (original[idlen] != 0))
        idlen++;

    /* Assemble the result directly from the original header. This avoids the
     * intermediate copy of the tail, which was too small by one byte when the
     * header started with a space and zero-sized for an empty header. */
    out = rawheader;
    memcpy(out, original, idlen);
    out += idlen;
    *out++ = ' ';
    memcpy(out, name, lname);
    out += lname;
    *out++ = '=';
    memcpy(out, value, lvalue);
    out += lvalue;
    *out++ = ';';
    /* The tail is either empty or starts with the space that ended the token. */
    strcpy(out, original + idlen);

    return rawheader;
}
/*
 * Appends a copy of the pair (name, value) to a parsed header table and
 * updates the field count stored as a decimal string in header[0].value.
 *
 * header : table whose entries and header[0].value are heap-allocated; the
 *          table is resized with realloc, so the caller must use the
 *          returned pointer.
 *
 * Returns the (possibly moved) table, or NULL if memory could not be
 * obtained. On failure the original table is still valid and its field count
 * is unchanged.
 */
element_from_header* table_header_add_field(element_from_header* header, char* name, char* value)
{
    /* Enough characters for any int printed with "%d", sign and NUL included. */
    const size_t count_size = 3 * sizeof(int) + 2;
    int nbf = atoi(header[0].value) + 1;
    char* name_copy;
    char* value_copy;
    char* count_buf;
    element_from_header* grown;

    /* Allocate everything that can fail before touching the table itself. */
    name_copy = (char*) malloc((1 + strlen(name)) * sizeof(char));
    value_copy = (char*) malloc((1 + strlen(value)) * sizeof(char));
    if (name_copy == NULL || value_copy == NULL)
    {
        free(name_copy);
        free(value_copy);
        return NULL;
    }
    strcpy(name_copy, name);
    strcpy(value_copy, value);

    /* The count buffer may have been created with room for only a few
     * digits; make sure the sprintf below cannot overflow it. realloc keeps
     * the current count text. */
    count_buf = (char*) realloc(header[0].value, count_size);
    if (count_buf == NULL)
    {
        free(name_copy);
        free(value_copy);
        return NULL;
    }
    header[0].value = count_buf;

    grown = (element_from_header*) realloc(header, (nbf + 1) * sizeof(element_from_header));
    if (grown == NULL)
    {
        free(name_copy);
        free(value_copy);
        return NULL;
    }

    grown[nbf].name = name_copy;
    grown[nbf].value = value_copy;
    sprintf(grown[0].value, "%d", nbf);

    return grown;
}
/*
 * Releases a parsed header table: the name and value of every entry from
 * index 0 up to the field count stored in header[0].value, then the table.
 */
void free_header_table(element_from_header* header)
{
    /* Read the count first: it lives in memory that the loop frees. */
    const int last = atoi(header[0].value);
    int idx = 0;

    while (idx <= last)
    {
        element_from_header* entry = &header[idx];

        free(entry->name);
        free(entry->value);
        idx++;
    }

    free(header);
}
/*
 * Looks up a field by name in a parsed header table.
 *
 * Entries 1..count are scanned, where count is the number stored in
 * header[0].value. When several entries carry the same name, the value of
 * the last one is returned.
 *
 * Returns the stored value (not a copy), or 0 when no entry matches.
 */
char* getItemFromHeader(char* name, element_from_header* header)
{
    const int count = atoi(header[0].value);
    char* found = 0;
    int idx = 1;

    while (idx <= count)
    {
        element_from_header* entry = &header[idx];

        if (!strcmp(entry->name, name))
            found = entry->value;
        idx++;
    }

    return found;
}
/*
 * Replaces the value of every entry named `name` in a parsed header table
 * with a copy of newValue.
 *
 * Entries 1..count are examined, where count is the number stored in
 * header[0].value. If memory for a new value cannot be obtained, that entry
 * keeps its previous value.
 */
void changeValue(element_from_header* header, char* name, char* newValue)
{
    int i;
    int nbf = atoi(header[0].value);
    size_t size = (1 + strlen(newValue)) * sizeof(char);

    for (i = 1; i <= nbf; i++)
    {
        char* resized;

        if (strcmp(header[i].name, name) != 0)
            continue;

        /* The entry already holds exactly this string (for instance a pointer
         * obtained from the table itself): reallocating it could move the
         * source of the copy, and there is nothing to change anyway. */
        if ((char*) header[i].value == newValue)
            continue;

        /* Keep the old pointer until realloc has succeeded so that a failure
         * neither leaks it nor leads to a write through NULL. */
        resized = (char*) realloc(header[i].value, size);
        if (resized == NULL)
            continue;

        memcpy(resized, newValue, size);
        header[i].value = resized;
    }
}
#ifndef FASTA_HEADER_HANDLER_H_
#define FASTA_HEADER_HANDLER_H_

#include "sequence.h"

/* Appends "; name=value" to a heap-allocated header string (a trailing '.'
 * is replaced by the separator) and returns the reallocated string. */
char* char_header_add_field(char* header, char* name, char* value);

/* Returns a newly allocated copy of seq->rawheader with "name=value;"
 * inserted after its first space-delimited token. */
char* fastaSeqPtr_header_add_field(fastaSeqPtr seq, char* name, char* value);

/* NOTE(review): no definition of this function is visible next to the other
 * table helpers -- confirm where it is implemented. */
element_from_header* table_header_add_dic(element_from_header* header, char* name, struct hashtable *hashtab);

/* Appends a copy of (name, value) to a parsed header table, updates the
 * count kept in header[0].value and returns the reallocated table. */
element_from_header* table_header_add_field(element_from_header* header, char* name, char* value);

/* Frees every name and value of a parsed header table, then the table. */
void free_header_table(element_from_header* header);

/* Returns the stored value of the last entry called `name`, or 0 if none. */
char* getItemFromHeader(char* name, element_from_header* header);

/* Replaces the value of every entry called `name` with a copy of newValue. */
void changeValue(element_from_header* header, char* name, char* newValue);

#endif /* FASTA_HEADER_HANDLER_H_ */
#ifndef FASTA_HEADER_PARSER_H_
#define FASTA_HEADER_PARSER_H_
/*
 * One name/value pair of a parsed FASTA header.
 *
 * A parsed header is an array of these entries. The parser fills entry 0
 * with the name "nbfields" and, as value, a decimal string holding the
 * number of fields; the sequence identifier is stored under the name "id".
 * NOTE(review): value is declared void* but the code visible in this project
 * always stores NUL-terminated strings in it -- confirm no other type is used.
 */
typedef struct {
char *name;
void *value;
}element_from_header;
/* Parses a raw header string into a heap-allocated table of entries. */
element_from_header* header_parser_main(char*);
#endif
/*
* If you modify this file and regenerate the .c file from it, add -ll to the Makefile.
*/
/* Exclusive start conditions: REGID while the sequence identifier is expected,
   REGNAME while a field name (or free text) is read, REGVAL while a field
   value is read. */
%x REGID
%x REGNAME
%x REGVAL
%{
#include <stdlib.h>
#include <string.h>
#include "header_mem_handler.h"
#include "fasta_header_handler.h"
/* Initial number of entries allocated for the header table. */
#define MEMALLOCATED 10
/* NOTE(review): BUFFER is not referenced by any rule or function in this file. */
#define BUFFER 5
/* The generated scanner is named header_parser and receives the table state
   (next free slot, allocated capacity, table pointer) from its caller. */
#define YY_DECL int header_parser(int *nbf, int *memory_allocated, element_from_header **p_header)
%}
/* WORD is a run of characters allowed inside names and values; WORDID also
   accepts '=' and ';' and is only used for the identifier. */
WORD [[:alnum:]:\-.{},'_()\#\[\]\|\&\"\'\/\%\+]+
WORDID [[:alnum:]:\-.{},'_()\#\[\]\|\&\"\'\/\%\+=;]+
SUP >
/* NOTE(review): EOL is defined but not used by any rule below. */
EOL \n
SEP ;
SPACE [[:blank:]]+
EQUAL =
%%
int i;
int size_needed;
int free_size;
char* field;
<INITIAL>{SUP} {
/*printf("\n<INITIAL>{SUP},%s",yytext);*/
/* A leading '>' is skipped; the identifier is expected next. */
BEGIN(REGID);
}
<INITIAL,REGID>{WORDID} {
/* Sequence identifier: stored in slot *nbf under the name "id", and a fresh
   accumulation buffer is created for the fields that follow.
   NOTE(review): there is no EOF rule for INITIAL or REGID in this file, so
   for a header made of an identifier only, no rule here frees this buffer --
   confirm. */
i=0;
field = malloc_field(&free_size);
(*p_header)[*nbf].name = (char*) malloc(3*sizeof(char));
strcpy(((*p_header)[*nbf]).name,"id");
size_needed = strlen(yytext)+1;
(*p_header)[*nbf].value = (char*) malloc(sizeof(char)*size_needed);
strcpy(((*p_header)[*nbf]).value,yytext);
(*nbf)++;
}
<INITIAL,REGID>{SPACE} {
BEGIN(REGNAME);
}
<REGNAME>{WORD} {
/*fprintf(stderr,"\n<REGNAME>{WORD} **%s**",yytext);*/
/* Accumulate the text of the current name (or of a free-text definition). */
field = store_in_field(field,yytext,&free_size,&i);
}
<REGNAME>{SPACE} {
/*fprintf(stderr,"\n<REGNAME>{SPACE} **%s**",yytext);*/
/* Blanks are kept only once some text has been accumulated, which drops
   leading blanks. */
if (i != 0)
field = store_in_field(field,yytext,&free_size,&i);
}
<REGNAME>{EQUAL} {
/*fprintf(stderr,"\n<REGNAME>{EQUAL},%s",yytext);*/
/* '=' ends the name: the accumulated text becomes the name of slot *nbf. */
field = store_in_header_table(field, &((*p_header)[*nbf].name), &free_size, &i);
BEGIN(REGVAL);
}
<REGNAME>{SEP} {
/*fprintf(stderr,"\n<REGNAME>{SEP},%s",yytext);*/
/* Text ended by ';' without any '=' is stored as the value of an entry named
   "definition". The 19 bytes requested are more than that name needs. */
(*p_header)[*nbf].name = (char*) malloc(19*sizeof(char));
strcpy((*p_header)[*nbf].name,"definition");
field = store_in_header_table(field, &((*p_header)[*nbf].value), &free_size, &i);
p_header = check_and_realloc_mem_in_header_table(p_header, nbf, memory_allocated);
BEGIN(REGNAME);
}
<REGVAL>{WORD} {
/*fprintf(stderr,"\n<REGVAL>{WORD} **%s**\n",yytext);*/
field = store_in_field(field,yytext,&free_size,&i);
}
<REGVAL>{SPACE} {
/*fprintf(stderr,"\n<REGVAL>{SPACE} **%s**\n",yytext);*/
/* Unlike in REGNAME, blanks inside a value are always kept. */
field = store_in_field(field,yytext,&free_size,&i);
}
<REGVAL>{SEP} {
/*fprintf(stderr,"\n<REGVAL>{SEP},%s\n",yytext);*/
/* ';' ends the value: store it, move to the next slot (growing the table
   when needed) and expect a new name. */
field = store_in_header_table(field, &((*p_header)[*nbf].value), &free_size, &i);
p_header = check_and_realloc_mem_in_header_table(p_header, nbf, memory_allocated);
BEGIN(REGNAME);
}
<REGVAL>{EQUAL} {
/*fprintf(stderr, "\nWarning : separator ';' probably missing in header after %s",(*p_header)[*nbf].name);*/
/* A second '=' inside a value is silently dropped. */
}
<REGVAL><<EOF>> {
/* End of input inside a value: the pending value is stored as if it had been
   closed by ';', then the field count is written and the scanner is reset. */
field = store_in_header_table(field, &((*p_header)[*nbf].value), &free_size, &i);
p_header = check_and_realloc_mem_in_header_table(p_header, nbf, memory_allocated);
end_header_table(p_header, *nbf);
free(field);
BEGIN(INITIAL);
return 0;
}
<REGNAME><<EOF>> {
/*(*p_header)[*nbf].name = (char*) malloc(sizeof(char)*19);
strcpy((*p_header)[*nbf].name,"other_informations");
field = store_in_header_table(field, &((*p_header)[*nbf].value), &free_size, &i);
p_header = check_and_realloc_mem_in_header_table(p_header, nbf, memory_allocated);
*/
/* End of input while reading a name: text accumulated since the last
   separator is discarded (the code that stored it is disabled above). */
end_header_table(p_header, *nbf);
free(field);
BEGIN(INITIAL);
return 0;
}
%%
/*
 * End-of-input hook of the generated scanner.
 * Always reports 1, i.e. there is no further input to switch to.
 */
int header_yywrap()
{
    const int no_more_input = 1;

    return no_more_input;
}
/*
 * Parses the raw header string h into a table of name/value entries.
 *
 * Entry 0 is created here with the name "nbfields"; its value is a decimal
 * string that the scanner overwrites with the final field count. The
 * remaining entries are filled by header_parser().
 *
 * Returns a heap-allocated table owned by the caller, or NULL if the initial
 * allocations fail.
 */
element_from_header* header_parser_main(char *h)
{
    int nbfields = 1;
    int memory_allocated = MEMALLOCATED;
    element_from_header* header;
    char* nbfields_n;
    char* nbfields_v;
    YY_BUFFER_STATE state;

    nbfields_n = (char*) malloc(sizeof("nbfields"));
    /* Room for any int printed with "%d" (sign and NUL included), so that
     * rewriting the count later cannot overflow this buffer. */
    nbfields_v = (char*) malloc((3 * sizeof(int) + 2) * sizeof(char));
    header = (element_from_header*) malloc(memory_allocated * sizeof(element_from_header));
    if (nbfields_n == NULL || nbfields_v == NULL || header == NULL)
    {
        free(nbfields_n);
        free(nbfields_v);
        free(header);
        return NULL;
    }

    strcpy(nbfields_n, "nbfields");
    strcpy(nbfields_v, "1");
    header[0].name = nbfields_n;
    header[0].value = nbfields_v;

    /* Scan the string in memory; the scanner may move the table, which is
     * why it receives the address of the pointer. */
    state = yy_scan_string(h);
    header_parser(&nbfields, &memory_allocated, &header);
    yy_delete_buffer(state);

    return header;
}
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "sequence.h"
#include "fasta_header_parser.h"
/*
 * Writes seq->sequence to output in lines of at most 60 characters, each
 * terminated by a newline. An empty sequence produces a single empty line.
 *
 * The sequence is only read: the line breaks come from a precision-limited
 * format instead of temporarily overwriting characters of the sequence.
 */
void printOnlySeqFromFastaSeqPtr(fastaSeqPtr seq, FILE* output)
{
    const char* bases = seq->sequence;
    size_t l = strlen(bases);
    size_t n = 0;

    /* do/while so that an empty sequence still prints its newline. */
    do
    {
        fprintf(output, "%.60s\n", bases + n);
        n += 60;
    } while (n < l);
}
/*
 * Writes the string seq to output in lines of at most 60 characters, each
 * terminated by a newline. An empty string produces a single empty line.
 *
 * seq is only read, so it may live in read-only memory: the line breaks come
 * from a precision-limited format instead of temporarily overwriting
 * characters of the string.
 */
void printOnlySeqFromChar(char* seq, FILE* output)
{
    size_t l = strlen(seq);
    size_t n = 0;

    /* do/while so that an empty string still prints its newline. */
    do
    {
        fprintf(output, "%.60s\n", seq + n);
        n += 60;
    } while (n < l);
}
/*
 * Writes the FASTA header line of seq to output: a '>' followed by the
 * unparsed header text and a newline.
 */
void printOnlyHeaderFromFastaSeqPtr(fastaSeqPtr seq, FILE* output)
{
    char* raw = seq->rawheader;

    fprintf(output,">%s\n",raw);
}
/*
 * Writes a FASTA header line rebuilt from a parsed header table.
 *
 * The line starts with '>' and the value of entry 1, followed by every entry
 * from index 2 on as "name=value; ", except entries named "definition". A
 * definition is written (as "value; ") only when it is the last entry of the
 * table. The line ends with a newline.
 */
void printOnlyHeaderFromTable(element_from_header* header, FILE* output)
{
    const int count = atoi(header[0].value);
    int idx;

    fprintf(output,">%s ",header[1].value);

    for (idx = 2; idx <= count; idx++)
    {
        element_from_header* entry = &header[idx];

        /* Definitions are handled after the loop. */
        if (strcmp(entry->name, "definition") == 0)
            continue;

        fprintf(output,"%s",entry->name);
        fprintf(output,"=");
        fprintf(output,"%s; ",entry->value);
    }

    if (strcmp(header[count].name, "definition") == 0)
        fprintf(output,"%s; ",header[count].value);

    fprintf(output,"\n");
}
/*
 * Writes a complete FASTA record for seq to output: the raw header line
 * first, then the sequence as printed by printOnlySeqFromFastaSeqPtr.
 */
void printHeaderAndSeqFromFastaSeqPtr(fastaSeqPtr seq, FILE* output)
{
    /* Header line. */
    printOnlyHeaderFromFastaSeqPtr(seq, output);

    /* Sequence lines. */
    printOnlySeqFromFastaSeqPtr(seq, output);
}
#ifndef FASTA_SEQ_WRITER_H_
#define FASTA_SEQ_WRITER_H_

#include "sequence.h"

/* Writes seq->sequence in lines of at most 60 characters. */
void printOnlySeqFromFastaSeqPtr(fastaSeqPtr seq, FILE* output);

/* Writes the string seq in lines of at most 60 characters. */
void printOnlySeqFromChar(char* seq, FILE* output);

/* Writes '>' followed by seq->rawheader and a newline. */
void printOnlyHeaderFromFastaSeqPtr(fastaSeqPtr seq, FILE* output);

/* Writes a header line rebuilt from a parsed header table. */
void printOnlyHeaderFromTable(element_from_header* header, FILE* output);

/* Writes the raw header line of seq followed by its sequence lines. */
void printHeaderAndSeqFromFastaSeqPtr(fastaSeqPtr seq, FILE* output);

#endif /* FASTA_SEQ_WRITER_H_ */
#include <stdio.h>
#include <stdlib.h>
#include "header_mem_handler.h"
#include <string.h>
/* Size in bytes of one allocation chunk of the field accumulation buffer. */
#define FIELD_BUFFER 1024

/*
 * Allocates an empty field accumulation buffer of FIELD_BUFFER bytes.
 *
 * free_size : receives the number of bytes available in the new buffer
 *             (FIELD_BUFFER), or 0 when the allocation fails.
 *
 * Returns the buffer, holding an empty string, or NULL if memory could not
 * be obtained.
 */
char* malloc_field(int *free_size)
{
    char* field = (char*) malloc(sizeof(char) * FIELD_BUFFER);

    /* Never write the terminator through a failed allocation. */
    if (field == NULL)
    {
        (*free_size) = 0;
        return NULL;
    }

    field[0] = 0;
    (*free_size) = FIELD_BUFFER;
    return field;
}
/*
 * Returns how many FIELD_BUFFER-sized chunks are requested for size_needed
 * bytes: the integer quotient plus one.
 */
int check_mem_field(int size_needed)
{
    return size_needed / FIELD_BUFFER + 1;
}
/*
 * Resizes field to number_of_chunks_to_alloc chunks of FIELD_BUFFER bytes
 * and returns the result of realloc.
 */
char* realloc_field(int number_of_chunks_to_alloc, char* field)
{
    const int total_bytes = number_of_chunks_to_alloc * FIELD_BUFFER;

    return realloc(field, (total_bytes)*sizeof(char));
}
/*
 * Makes the field buffer large enough for its current text plus size_needed
 * more bytes, in whole chunks of FIELD_BUFFER bytes.
 *
 * A buffer holding an empty string is freed and allocated afresh; otherwise
 * the buffer is resized in place with realloc, which keeps its text.
 *
 * free_size : receives the number of bytes left once the pending text has
 *             been appended.
 *
 * Returns the new buffer.
 */
char* check_and_realloc_field(char* field, int size_needed, int* free_size)
{
    const size_t used = strlen(field);
    const int total_needed = size_needed + used;
    const int chunk_count = total_needed / FIELD_BUFFER + 1;
    const int capacity = chunk_count * FIELD_BUFFER;

    if (used == 0)
    {
        /* Nothing worth preserving: start over with a new block. */
        free(field);
        field = malloc(capacity);
    }
    else
        field = realloc(field, (capacity)*sizeof(char));

    (*free_size) = capacity - total_needed + 1;
    return field;
}
/*
 * Appends the token yytext to the field buffer at offset *i.
 *
 * When the token and its terminator do not fit in *free_size, the buffer is
 * grown first. *free_size and *i are updated to reflect the appended text.
 *
 * Returns the buffer, which may have moved.
 */
char* store_in_field(char* field, char* yytext, int* free_size, int* i)
{
    const int text_len = strlen(yytext);
    const int size_needed = text_len + 1;

    if (size_needed <= (*free_size))
        (*free_size) -= text_len;
    else
        field = check_and_realloc_field(field, size_needed, free_size);

    strcpy(field + (*i), yytext);
    (*i) += text_len;

    return field;
}
/*
 * Moves the accumulated field text into the header table.
 *
 * A copy of field, sized exactly for its text, is stored in *storing_place;
 * the accumulation buffer is then released, *i is reset to 0 and a fresh
 * buffer is obtained from malloc_field.
 *
 * Returns the fresh accumulation buffer.
 */
char* store_in_header_table(char* field, char** storing_place, int* free_size, int* i)
{
    char* copy = (char*) malloc((strlen(field)+1)*sizeof(char));

    strcpy(copy, field);
    *storing_place = copy;

    (*i) = 0;
    free(field);

    return malloc_field(free_size);
}
/*
 * Advances *nbf to the next table slot and, when that slot index equals the
 * allocated capacity, enlarges the table by one entry.
 *
 * Returns p_header; the table it points to may have moved.
 */
element_from_header** check_and_realloc_mem_in_header_table(element_from_header** p_header, int* nbf, int* memory_allocated)
{
    *nbf += 1;

    /* Still room for the next entry: nothing to do. */
    if (*nbf != *memory_allocated)
        return p_header;

    *memory_allocated += 1;
    *p_header = (element_from_header*) realloc(*p_header, (*memory_allocated) * sizeof(element_from_header));

    return p_header;
}
/*
 * Finalises the header table by writing nbf - 1 as a decimal string into the
 * value of its first entry.
 */
void end_header_table(element_from_header** p_header, int nbf)
{
    element_from_header* count_entry = *p_header;

    sprintf(count_entry->value, "%d", nbf - 1);
}
#ifndef HEADER_MEM_HANDLER_H_
#define HEADER_MEM_HANDLER_H_

#include "fasta_header_parser.h"

/* Allocates an empty accumulation buffer and reports its size. */
char* malloc_field(int* free_size);

/* Number of buffer chunks requested for size_needed bytes. */
int check_mem_field(int size_needed);

/* Resizes field to the given number of chunks. */
char* realloc_field(int number_of_chunks_to_alloc, char* field);

/* Grows field so that size_needed more bytes fit; updates *free_size. */
char* check_and_realloc_field(char* field, int size_needed, int* free_size);

/* Appends yytext to field at offset *i, growing the buffer when needed. */
char* store_in_field(char* field, char* yytext, int* free_size, int* i);

/* Copies field into *storing_place and returns a fresh accumulation buffer. */
char* store_in_header_table(char* field, char** storing_place, int* free_size, int* i);

/* Moves to the next table slot, enlarging the table when it is full. */
element_from_header** check_and_realloc_mem_in_header_table(element_from_header** p_header, int* nbf, int* memory_allocated);

/* Writes the final field count into the first entry of the table. */
void end_header_table(element_from_header** p_header, int nbf);

#endif /* HEADER_MEM_HANDLER_H_ */
/**
* FileName: sequence.h
* Authors: Tiayyba Riaz, Celine Mercier
* Description: Prototypes and other declarations for sequences
* **/
#ifndef SEQUENCE_H_
#define SEQUENCE_H_
#include <stdint.h>
#include <stdio.h>
#include "../libutils/utilities.h"
#include "fasta_header_parser.h"
/*
 * One sequence record: its identifier, raw and parsed headers, the sequence
 * text, and bookkeeping fields used for counting, clustering and output.
 *
 * NOTE(review): the members `next` and `center` are declared as pointers to
 * `struct fastaSeqPtr`, a struct tag that is not defined in this header (the
 * typedef name fastaSeqPtr declared below is a different identifier) --
 * confirm whether the typedef was intended.
 */
typedef struct {
char* accession_id; // identifier
char *rawheader; // unparsed header
element_from_header* header; // parsed header
char *sequence; // DNA sequence itself
int32_t length; // DNA sequence's length
int32_t count; // abundance of the sequence
unsigned char *table; // 4mer occurrence table built using function buildTable
int32_t over; // count of 4mers with occurrences greater than 255 (overflow)
struct fastaSeqPtr* next; // next unique sequence, for example
BOOL cluster_center; // whether the sequence is a cluster center or not
int32_t cluster_weight; // cluster weight when sequence is cluster center
int32_t cluster_weight_unique_ids; // cluster weight when sequence is cluster center, counting the number of sequence records
double score; // score with cluster center, for example
struct fastaSeqPtr* center; // pointer to the sequence's cluster center
int32_t center_index; // index of the sequence's cluster center
BOOL uniqHead; // whether the sequence is a unique head or not
char* columns_BIOM; // to print in BIOM format
int columns_BIOM_size; // size allocated for columns_BIOM
char* line_OTU_table; // to print in OTU table format
int line_OTU_table_size; // size allocated for line_OTU_table
struct hashtable *sample_counts; // sample counts for sumaclean
}fastaSeq,*fastaSeqPtr;
/*
 * A collection of sequence records.
 * NOTE(review): fastaSeqs presumably points to an array of `count` records --
 * confirm against the functions that fill it.
 */
typedef struct {
int32_t count; // presumably the number of records in fastaSeqs -- confirm
fastaSeqPtr fastaSeqs; // the sequence records
}fastaSeqCount, *fastaSeqCountPtr;
/*
 * Sequence reading, filling and database helpers.
 * The implementations are not part of this header; the notes below are taken
 * from the declarations only.
 */
/* Reading records from a file. */
fastaSeqPtr seq_getNext(FILE *fp, char *fieldDelim, BOOL isStandardSeq, BOOL onlyATGC);
char *seq_readNextFromFilebyLine(FILE* fp);
/* Filling a record from raw text. */
void seq_fillSeq(char *seq, fastaSeqPtr seqElem, int seqLen);
void seq_fillSeqOnlyATGC(char *seq, fastaSeqPtr seqElem, int seqLen);
void seq_fillDigitSeq(char *seq, fastaSeqPtr seqElem, int seqLen);
void seq_fillHeader(char* header, char *fieldDelim, fastaSeqPtr seqElem);
/* Whole-database operations. */
fastaSeqCount seq_readAllSeq2(char *fileName, BOOL isStandardSeq, BOOL onlyATGC);
int32_t seq_findSeqByAccId (char *accid, fastaSeqCountPtr allseqs);
void seq_printSeqs (fastaSeqCountPtr allseq);
int cleanDB(fastaSeqCount);
void addCounts(fastaSeqCount* db);
int uniqSeqsVector(fastaSeqCount* db, fastaSeqPtr** uniqSeqs);
void calculateMaxAndMinLen(fastaSeqPtr* db, int n, int* lmax, int* lmin);
void calculateMaxAndMinLenDB(fastaSeqCount db, int* lmax, int* lmin);
/* NOTE(review): presumably qsort-style comparators -- confirm at the call sites. */
int sortSeqsWithCounts(const void **s1, const void **s2);
int reverseSortSeqsWithCounts(const void **s1, const void **s2);
void readSampleCounts(fastaSeqCount* db, char* key_name);
#endif /*SEQUENCE_H_*/
# Build rules for the static library libfile.a.
SOURCES = fileHandling.c
SRCS=$(SOURCES)
OBJECTS= $(patsubst %.c,%.o,$(SOURCES))
LIBFILE= libfile.a
RANLIB=ranlib

include ../global.mk

# Command-style targets: never confuse them with files of the same name.
.PHONY: all clean

all: $(LIBFILE)

clean:
	rm -rf $(OBJECTS) $(LIBFILE)
	rm -f *.P
	rm -f *.a

# $? holds only the objects newer than the archive, so ar replaces just those.
$(LIBFILE): $(OBJECTS)
	$(AR) -cr $@ $?
	$(RANLIB) $@
/**
* FileName: fileHandling.c
* Authors: Tiayyba Riaz, Celine Mercier
* Description: C file for file handling functions
* **/
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include "../libutils/utilities.h"
/*
* Function Name: file_open(char* fileName, BOOL abortOnError)
* Description: Opens the file and returns the pointer to file object
*/
FILE *file_open(char* fileName, BOOL abortOnError)
{
FILE* fp;
if (fileName == NULL && abortOnError)
ERRORABORT(FILE_OPENING_ERROR, "File name not given.");