Commit 8afb1644 authored by Celine Mercier

Alignment: API rework. 'obi align' is now 'obi lcs', and the results are

now written to columns automatically created in the output view, all
optimally handled at the C level.
parent fa4e4ffa
#cython: language_level=3
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
from obitools3.obidms._obidms import OBIDMS, OBIView # TODO cimport doesn't work
from obitools3.obidms._obidms cimport OBIDMS # TODO cimport doesn't work
from obitools3.utils cimport str2bytes
from obitools3.obidms.capi.obialign cimport obi_lcs_align_one_column
import time
......@@ -12,26 +17,78 @@ default_config = { 'inputview' : None,
def addOptions(parser):
# TODO put this common group somewhere else but I don't know where
# TODO put this common group somewhere else but I don't know where.
# Also some options should probably be in another group
group=parser.add_argument_group('DMS and view options')
group.add_argument('--default-dms','-d',
group.add_argument('--default-dms', '-d',
action="store", dest="obi:defaultdms",
metavar='<DMS NAME>',
default=None,
type=str,
help="Name of the default DMS for reading and writing data.")
group.add_argument('--input-view','-i',
action="store", dest="obi:inputview",
group.add_argument('--input-view-1', '-i',
action="store", dest="obi:inputview1",
metavar='<INPUT VIEW NAME>',
default=None,
type=str,
help="Name of the input view.")
help="Name of the (first) input view.")
group.add_argument('--input-view-2', '-I',
action="store", dest="obi:inputview2",
metavar='<INPUT VIEW NAME>',
default="",
type=str,
help="Eventually, the name of the second input view.")
group.add_argument('--input-column-1', '-c',
action="store", dest="obi:inputcolumn1",
metavar='<INPUT COLUMN NAME>',
default="",
type=str,
help="Name of the (first) input column. "
" Default: the default nucleotide sequence column of the view if there is one.")
group.add_argument('--input-column-2', '-C',
action="store", dest="obi:inputcolumn2",
metavar='<INPUT COLUMN NAME>',
default="",
type=str,
help="Eventually, the name of the second input column.")
group.add_argument('--input-elt-1', '-e',
action="store", dest="obi:inputelement1",
metavar='<INPUT ELEMENT NAME>',
default="",
type=str,
help="If the first input column has multiple elements per line, name of the element referring to the sequence to align. "
" Default: the first element of the line.")
group.add_argument('--input-elt-2', '-E',
action="store", dest="obi:inputelement2",
metavar='<INPUT ELEMENT NAME>',
default="",
type=str,
help="If the second input column has multiple elements per line, name of the element referring to the sequence to align. "
" Default: the first element of the line.")
# TODO eventually 2nd view, or 2nd column?
group.add_argument('--id-column-1', '-f',
action="store", dest="obi:idcolumn1",
metavar='<ID COLUMN NAME>',
default="",
type=str,
help="Name of the (first) column containing the identifiers of the sequences to align. "
" Default: the default ID column of the view if there is one.")
group.add_argument('--id-column-2', '-F',
action="store", dest="obi:idcolumn2",
metavar='<ID COLUMN NAME>',
default="",
type=str,
help="Eventually, the name of the second ID column.")
group.add_argument('--output-view','-o',
group.add_argument('--output-view', '-o',
action="store", dest="obi:outputview",
metavar='<OUTPUT VIEW NAME>',
default=None,
......@@ -39,14 +96,7 @@ def addOptions(parser):
help="Name of the output view.")
group=parser.add_argument_group('obi align specific options')
group.add_argument('--lcs','-C',
action="store", dest="align:alitype",
metavar='<ALIGNMENT TYPE>',
default='lcs',
type=str,
help="Compute alignment using the LCS method (default).")
group=parser.add_argument_group('obi lcs specific options')
group.add_argument('--threshold','-t',
action="store", dest="align:threshold",
......@@ -62,14 +112,14 @@ def addOptions(parser):
" Only sequence pairs with a similarity above <THRESHOLD> are printed. Default: 0.00"
" (no threshold).")
group.add_argument('--longest_length','-L',
group.add_argument('--longest-length','-L',
action="store_const", dest="align:reflength",
default=0,
const=1,
help="The reference length is the length of the longest sequence."
" Default: the reference length is the length of the alignment.")
group.add_argument('--shortest_length','-l',
group.add_argument('--shortest-length','-l',
action="store_const", dest="align:reflength",
default=0,
const=2,
......@@ -86,35 +136,74 @@ def addOptions(parser):
default=True,
help="Score is expressed in distance. Default: score is expressed in similarity.")
group.add_argument('--print-seq','-s',
action="store_true", dest="align:printseq",
default=False,
help="The nucleotide sequences are written in the output view. Default: they are not written.")
group.add_argument('--print-count','-n',
action="store_true", dest="align:printcount",
default=False,
help="Sequence counts are written in the output view. Default: they are not written.")
cpdef align(str dms_n,
            str input_view_1_n, str output_view_n,
            str input_view_2_n="",
            str input_column_1_n="", str input_column_2_n="",
            str input_elt_1_n="", str input_elt_2_n="",
            str id_column_1_n="", str id_column_2_n="",
            double threshold=0.0, bint normalize=True,
            int reference=0, bint similarity_mode=True,
            bint print_seq=False, bint print_count=False,
            comments="") :
    """
    Run the C-level LCS alignment on one column of a view.

    Opens the DMS named `dms_n` and hands the view, column, element and ID
    column names (converted to bytes) to `obi_lcs_align_one_column`, together
    with the output view name, its comments and the alignment options.

    Only the "first" input names are forwarded: `input_view_2_n`,
    `input_column_2_n`, `input_elt_2_n` and `id_column_2_n` are accepted but
    not used here (aligning 2 columns is not implemented yet).

    Raises `Exception` if the C function returns a negative value.
    """

    cdef OBIDMS d
    cdef int status

    d = OBIDMS(dms_n)

    # Align 1 column (2 columns not implemented yet)
    status = obi_lcs_align_one_column(d._pointer,
                                      str2bytes(input_view_1_n),
                                      str2bytes(input_column_1_n),
                                      str2bytes(input_elt_1_n),
                                      str2bytes(id_column_1_n),
                                      str2bytes(output_view_n),
                                      str2bytes(comments),
                                      print_seq,
                                      print_count,
                                      threshold, normalize, reference, similarity_mode)

    # A negative return code is how the C layer signals failure
    if status < 0 :
        raise Exception("Error aligning sequences")
# Command entry point: reads the DMS and view names from config['obi'].
# NOTE(review): this definition reads config['obi']['inputview'], whereas the
# options visible in this file store 'obi:inputview1' -- presumably these lines
# belong to the pre-rework revision shown interleaved with the new one; confirm.
def run(config):
# Open DMS
d = OBIDMS(config['obi']['defaultdms'])
# Open input view 1
iview = d.open_view(config['obi']['inputview'])
# NOTE(review): the DMS is closed here but d.new_view() is still called below;
# looks like a line from another revision -- confirm before relying on the order.
d.close()
# TODO Open input view 2 if there is one
# Create output view
oview = d.new_view(config['obi']['outputview'])
# Command entry point: forwards the parsed options in config['obi'] and
# config['align'] to the module-level align() function, then prints "Done.".
# NOTE(review): the iview / oview / d names used below are not assigned inside
# this definition as shown -- presumably leftover lines of the previous
# revision (method-based iview.align) interleaved with the new code; confirm.
def run(config):
# TODO Take other alignment types into account when they'll be implemented
# TODO: Build formatted comments with all parameters etc
# Comments string passed to align() as the `comments` argument
comments = "Obi align"
# Call cython alignment function
iview.align(oview, threshold=config['align']['threshold'], normalize=config['align']['normalize'], reference=config['align']['reflength'], similarity_mode=config['align']['similarity'])
print(repr(oview))
iview.close()
oview.close()
d.close()
# Name-based call: align() receives the DMS/view/column/element/ID names and
# the alignment options; the *_2 names are passed through as well.
align(config['obi']['defaultdms'], \
config['obi']['inputview1'], \
config['obi']['outputview'], \
input_view_2_n = config['obi']['inputview2'], \
input_column_1_n = config['obi']['inputcolumn1'], \
input_column_2_n = config['obi']['inputcolumn2'], \
input_elt_1_n = config['obi']['inputelement1'], \
input_elt_2_n = config['obi']['inputelement2'], \
id_column_1_n = config['obi']['idcolumn1'], \
id_column_2_n = config['obi']['idcolumn2'], \
threshold = config['align']['threshold'], \
normalize = config['align']['normalize'], \
reference = config['align']['reflength'], \
similarity_mode = config['align']['similarity'], \
print_seq = config['align']['printseq'], \
print_count = config['align']['printcount'], \
comments = comments)
print("Done.")
\ No newline at end of file
\ No newline at end of file
......@@ -67,16 +67,8 @@ cdef class OBIView:
cdef object get_view_subclass(str view_type)
cdef class OBIView_NUC_SEQS(OBIView):
cpdef align(self,
OBIView oview,
OBIView iview2=*,
double threshold=*,
bint normalize=*,
int reference=*,
bint similarity_mode=*
)
cdef class OBIView_NUC_SEQS(OBIView) :
pass
cdef class OBIView_line :
......
......@@ -10,8 +10,6 @@ from .capi.obidmscolumn cimport obi_close_column, \
OBIDMS_column_header_p
from .capi.obiutils cimport obi_format_date
from .capi.obialign cimport obi_align_one_column
from .capi.obitypes cimport const_char_p, \
OBIType_t, \
......@@ -535,49 +533,6 @@ cdef class OBIView_NUC_SEQS(OBIView):
self[line_idx][key] = sequence_obj[key]
# TODO discuss
cpdef align(self, OBIView oview, OBIView iview2=None,
double threshold=0.0, bint normalize=True, int reference=0, bint similarity_mode=True) :
pass
#
# cdef OBIView iview1
#
# cdef Obiview_p iview1_p
# cdef Obiview_p iview2_p
# cdef Obiview_p oview_p
#
# cdef OBIDMS_column icol1
# cdef OBIDMS_column_p icol1_p
# cdef OBIDMS_column_p* icol1_pp
#
# cdef OBIDMS_column id1_col
# cdef OBIDMS_column_p id1_col_p
# cdef OBIDMS_column_p* id1_col_pp
#
# cdef OBIDMS_column id2_col
# cdef OBIDMS_column_p id2_col_p
# cdef OBIDMS_column_p* id2_col_pp
#
# cdef OBIDMS_column ocol
# cdef OBIDMS_column_p ocol_p
# cdef OBIDMS_column_p* ocol_pp
#
# cdef str id1_col_name
# cdef str id2_col_name
# cdef str score_col_name
#
# score_col_name = "score"
#
# iview1= self
# iview1_p = iview1._pointer
# icol1 = iview1[bytes2str(NUC_SEQUENCE_COLUMN)]
# icol1_pp = icol1._pointer
# icol1_p = icol1_pp[0]
#
# if obi_align_one_column(iview1_p, icol1_p, threshold, normalize, reference, similarity_mode) < 0 :
# raise Exception("Error aligning sequences")
######################################################################################################
......
......@@ -4,7 +4,6 @@ from .capi.obiview cimport obi_get_seq_with_elt_name_and_col_p_in_view, \
obi_get_seq_with_elt_idx_and_col_p_in_view, \
obi_set_seq_with_elt_name_and_col_p_in_view, \
obi_set_seq_with_elt_idx_and_col_p_in_view
from .capi.obialign cimport obi_align_one_column
from .capi.obierrno cimport obi_errno
from .capi.obitypes cimport OBISeq_NA, const_char_p
......
......@@ -102,12 +102,12 @@ cdef class OBI_Nuc_Seq_Stored(OBIView_line) :
return self[bytes2str(QUALITY_COLUMN)]
@quality.setter
def quality(self, object new_qual):
if (type(new_qual) == list) or (new_qual is None) :
if (type(new_qual) == list) or (new_qual is None) : # TODO check that quality column exists
self[bytes2str(QUALITY_COLUMN)] = new_qual
else : # Quality is in str form
(((self._view).columns)[bytes2str(QUALITY_COLUMN)]).set_str_line(self._index, new_qual)
cpdef object get_str_quality(self) : # TODO not ideal
cpdef object get_str_quality(self) : # TODO not ideal. Make quality_int and quality_str properties
return ((self._view).columns)[bytes2str(QUALITY_COLUMN)].get_str_line(self._index)
# cpdef str reverse_complement(self) : TODO in C ?
......
#cython: language_level=3
from ..capi.obiview cimport Obiview_p
from ..capi.obidmscolumn cimport OBIDMS_column_p
from obitools3.obidms.capi.obidms cimport OBIDMS_p
from obitools3.obidms.capi.obitypes cimport const_char_p
# Cython declarations of the C alignment functions exposed by obi_align.h.
# The block is declared `nogil`, so these functions are marked as callable
# without holding the GIL.
cdef extern from "obi_align.h" nogil:
# Pointer-based variant: takes view and column pointers for the sequences,
# the two ID columns and the score column.
# NOTE(review): presumably the earlier signature, superseded by
# obi_lcs_align_one_column below -- confirm it still exists in obi_align.h.
int obi_align_one_column(Obiview_p seq_view,
OBIDMS_column_p seq_column,
const char* seq_name,
Obiview_p score_view,
OBIDMS_column_p id1_column,
OBIDMS_column_p id2_column,
OBIDMS_column_p score_column,
double threshold,
bint normalize,
int reference,
bint similarity_mode)
# Name-based variant: takes the DMS pointer and the view / column / element /
# ID column names as C strings, plus the output view name and comments and the
# print_seq / print_count flags. The align() wrapper in this file treats a
# negative return value as an error.
int obi_lcs_align_one_column(OBIDMS_p dms,
const_char_p seq_view_name,
const_char_p seq_column_name,
const_char_p seq_elt_name,
const_char_p id_column_name,
const_char_p output_view_name,
const_char_p output_view_comments,
bint print_seq,
bint print_count,
double threshold,
bint normalize,
int reference,
bint similarity_mode)
......@@ -14,6 +14,7 @@
#include <stdio.h>
#include <stdbool.h>
#include "obi_align.h"
#include "obidebug.h"
#include "obierrno.h"
#include "obitypes.h"
......@@ -28,67 +29,227 @@
// TODO
// use openMP pragmas
// option pour ecrire en stdint?
// check NUC_SEQS view type? and score type (int or float if normalize)
// what's with multiple sequences/line columns?
int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, const char* seq_name,
Obiview_p score_view, OBIDMS_column_p id1_column, OBIDMS_column_p id2_column, OBIDMS_column_p score_column,
double threshold, bool normalize, int reference, bool similarity_mode)
int obi_lcs_align_one_column(OBIDMS_p dms, const char* seq_view_name, const char* seq_column_name, const char* seq_elt_name,
const char* id_column_name,
const char* output_view_name, const char* output_view_comments,
bool print_seq, bool print_count,
double threshold, bool normalize, int reference, bool similarity_mode)
{
index_t i, j, k;
index_t seq_count;
const char* id1;
const char* id2;
index_t id1_idx, id2_idx;
index_t seq1_idx, seq2_idx;
double score;
OBIDMS_column_p id_column;
int lcs_length;
int ali_length;
Kmer_table_p ktable;
Obi_blob_p blob1;
Obi_blob_p blob2;
int lcs_min;
index_t seq_idx;
index_t seq_elt_idx;
Obiview_p seq_view = NULL;
Obiview_p output_view = NULL;
OBIDMS_column_p iseq_column = NULL;
OBIDMS_column_p id_column;
OBIDMS_column_p id1_column = NULL;
OBIDMS_column_p id2_column = NULL;
OBIDMS_column_p seq1_column = NULL;
OBIDMS_column_p seq2_column = NULL;
//OBIDMS_column_p count1_column = NULL;
//OBIDMS_column_p count2_column = NULL;
OBIDMS_column_p idx1_column = NULL;
OBIDMS_column_p idx2_column = NULL;
OBIDMS_column_p lcs_length_column = NULL;
OBIDMS_column_p ali_length_column = NULL;
OBIDMS_column_p score_column = NULL;
k = 0;
// If no sequence column is given and the view has the type NUC_SEQS_VIEW, the default sequence column is aligned
if ((seq_column == NULL) && (strcmp((seq_view->infos)->view_type, VIEW_TYPE_NUC_SEQS) == 0))
// Open input view
seq_view = obi_open_view(dms, seq_view_name);
if (seq_view == NULL)
{
obidebug(1, "\nError opening the input view to align");
return -1;
}
// Open the sequence column to align
// If a column name wasn't given, open default sequence column
if (strcmp(seq_column_name, "") == 0)
{
seq_column = obi_view_get_column(seq_view, NUC_SEQUENCE_COLUMN);
if (seq_column == NULL)
if (strcmp((seq_view->infos)->view_type, VIEW_TYPE_NUC_SEQS) == 0)
iseq_column = obi_view_get_column(seq_view, NUC_SEQUENCE_COLUMN);
else
{
obi_set_errno(OBI_ALIGN_ERROR);
obidebug(1, "\nError: no column given to align");
return -1;
}
}
// Check that the given sequence column contains nucleotide sequences
else if ((seq_column->header)->returned_data_type != OBI_SEQ)
else
iseq_column = obi_view_get_column(seq_view, seq_column_name);
if (iseq_column == NULL)
{
obi_set_errno(OBI_ALIGN_ERROR);
obidebug(1, "\nTrying to align a column of a different type than OBI_SEQ");
obidebug(1, "\nError getting the column to align");
return -1;
}
if ((normalize && ((score_column->header)->returned_data_type != OBI_FLOAT)) ||
(!normalize && ((score_column->header)->returned_data_type != OBI_INT)))
// Get element index of the sequence to align in each line to compute it only once
if ((strcmp(seq_elt_name, "") != 0) && (seq_elt_name != NULL))
{
obi_set_errno(OBI_ALIGN_ERROR);
obidebug(1, "\nTrying to store alignment scores in a column of an inappropriate type");
seq_elt_idx = obi_column_get_element_index_from_name(iseq_column, seq_elt_name);
if (seq_elt_idx == OBIIdx_NA)
{
obidebug(1, "\nError getting the sequence index in a column line when aligning");
return -1;
}
}
else
seq_elt_idx = 0;
// Open the ID column, containing the identifiers of the sequences to align
// If a column name wasn't given, open default ID column
if (strcmp(id_column_name, "") == 0)
{
if (strcmp((seq_view->infos)->view_type, VIEW_TYPE_NUC_SEQS) == 0)
id_column = obi_view_get_column(seq_view, ID_COLUMN);
else
{
obi_set_errno(OBI_ALIGN_ERROR);
obidebug(1, "\nError: no ID column given");
return -1;
}
}
else
id_column = obi_view_get_column(seq_view, id_column_name);
if (id_column == NULL)
{
obidebug(1, "\nError getting the ID column");
return -1;
}
// Create the output view
output_view = obi_new_view(dms, output_view_name, NULL, NULL, output_view_comments);
if (output_view == NULL)
{
obidebug(1, "\nError creating the output view when aligning");
return -1;
}
// Get element index from element name to compute it only once
if (seq_name != NULL)
// Create the output columns
// Create the column for the ids of the 1st sequence aligned
if (obi_view_add_column(output_view, ID1_COLUMN_NAME, -1, ID1_COLUMN_NAME, OBI_STR, 0, 1, NULL, (id_column->header)->indexer_name, NULL, -1, ID1_COLUMN_COMMENTS, true) < 0)
{
obidebug(1, "\nError creating the first column for the sequence ids when aligning");
return -1;
}
id1_column = obi_view_get_column(output_view, ID1_COLUMN_NAME);
// Create the column for the ids of the 2nd sequence aligned
if (obi_view_add_column(output_view, ID2_COLUMN_NAME, -1, ID2_COLUMN_NAME, OBI_STR, 0, 1, NULL, (id_column->header)->indexer_name, NULL, -1, ID2_COLUMN_COMMENTS, true) < 0)
{
obidebug(1, "\nError creating the second column for the sequence ids when aligning");
return -1;
}
id2_column = obi_view_get_column(output_view, ID2_COLUMN_NAME);
// Create the column for the index (in the input view) of the first sequences aligned
if (obi_view_add_column(output_view, IDX1_COLUMN_NAME, -1, IDX1_COLUMN_NAME, OBI_INT, 0, 1, NULL, NULL, NULL, -1, IDX1_COLUMN_COMMENTS, true) < 0)
{
obidebug(1, "\nError creating the first column for the sequence indices when aligning");
return -1;
}
idx1_column = obi_view_get_column(output_view, IDX1_COLUMN_NAME);
// Create the column for the index (in the input view) of the second sequences aligned
if (obi_view_add_column(output_view, IDX2_COLUMN_NAME, -1, IDX2_COLUMN_NAME, OBI_INT, 0, 1, NULL, NULL, NULL, -1, IDX2_COLUMN_COMMENTS, true) < 0)
{
obidebug(1, "\nError creating the second column for the sequence indices when aligning");
return -1;
}
idx2_column = obi_view_get_column(output_view, IDX2_COLUMN_NAME);
// Create the column for the LCS length
if (obi_view_add_column(output_view, LCS_LENGTH_COLUMN_NAME, -1, LCS_LENGTH_COLUMN_NAME, OBI_INT, 0, 1, NULL, NULL, NULL, -1, LCS_LENGTH_COLUMN_COMMENTS, true) < 0)
{
obidebug(1, "\nError creating the column for the LCS length when aligning");
return -1;
}
lcs_length_column = obi_view_get_column(output_view, LCS_LENGTH_COLUMN_NAME);
// Create the column for the alignment length if it is computed
if ((reference == ALILEN) && (normalize || !similarity_mode))
{
seq_idx = obi_column_get_element_index_from_name(seq_column, seq_name);
if (seq_idx == OBIIdx_NA)
if (obi_view_add_column(output_view, ALI_LENGTH_COLUMN_NAME, -1, ALI_LENGTH_COLUMN_NAME, OBI_INT, 0, 1, NULL, NULL, NULL, -1, ALI_LENGTH_COLUMN_COMMENTS, true) < 0)
{
obidebug(1, "\nError getting the sequence index in a column line when aligning");
obidebug(1, "\nError creating the column for the alignment length when aligning");
return -1;
}
ali_length_column = obi_view_get_column(output_view, ALI_LENGTH_COLUMN_NAME);
}
// Create the column for the alignment score
if (normalize)
{
if (obi_view_add_column(output_view, SCORE_COLUMN_NAME, -1, SCORE_COLUMN_NAME, OBI_FLOAT, 0, 1, NULL, NULL, NULL, -1, SCORE_COLUMN_NAME, true) < 0)
{
obidebug(1, "\nError creating the column for the score when aligning");
return -1;
}
}
else
seq_idx = 0;
{
if (obi_view_add_column(output_view, SCORE_COLUMN_NAME, -1, SCORE_COLUMN_NAME, OBI_INT, 0, 1, NULL, NULL, NULL, -1, SCORE_COLUMN_NAME, true) < 0)
{
obidebug(1, "\nError creating the column for the score when aligning");
return -1;
}
}
score_column = obi_view_get_column(output_view, SCORE_COLUMN_NAME);
if (print_seq)
{
// Create the column for the first sequences aligned
if (obi_view_add_column(output_view, SEQ1_COLUMN_NAME, -1, SEQ1_COLUMN_NAME, OBI_SEQ, 0, 1, NULL, (iseq_column->header)->indexer_name, NULL, -1, SEQ1_COLUMN_COMMENTS, true) < 0)
{
obidebug(1, "\nError creating the first column for the sequences when aligning");
return -1;
}
seq1_column = obi_view_get_column(output_view, SEQ1_COLUMN_NAME);
// Create the column for the second sequences aligned
if (obi_view_add_column(output_view, SEQ2_COLUMN_NAME, -1, SEQ2_COLUMN_NAME, OBI_SEQ, 0, 1, NULL, (iseq_column->header)->indexer_name, NULL, -1, SEQ2_COLUMN_COMMENTS, true) < 0)
{
obidebug(1, "\nError creating the second column for the sequences when aligning");
return -1;
}
seq2_column = obi_view_get_column(output_view, SEQ2_COLUMN_NAME);
}
// if (print_count) // TODO count columns not implemented yet
// {
// // Create the column for the count of the first sequences aligned
// if (obi_view_add_column(output_view, COUNT1_COLUMN_NAME, -1, COUNT1_COLUMN_NAME, OBI_INT, 0, 1, NULL, NULL, NULL, -1, COUNT1_COLUMN_COMMENTS, true) < 0)
// {
// obidebug(1, "\nError creating the first column for the sequence counts when aligning");