Commit c7f5b8d9 by Celine Mercier

Alignpairedend: added alignment using shifting with best kmer similarity

(low level layer in C and Cython API)
parent 59017c0d
#cython: language_level=3
cdef object buildAlignment(object direct, object reverse)
\ No newline at end of file
......@@ -2,19 +2,20 @@
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
from obitools3.dms import DMS
from obitools3.dms.view.view cimport View
from obitools3.dms.view import RollbackException
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
from obitools3.dms.column.column cimport Column, Column_line
from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN, ID_COLUMN
from obitools3.dms.capi.obitypes cimport OBI_INT, OBI_STR, index_t, OBI_QUAL
from obitools3.dms.column.column cimport Column
from obitools3.dms.capi.obiview cimport QUALITY_COLUMN
from obitools3.dms.capi.obitypes cimport OBI_QUAL
from obitools3.apps.optiongroups import addMinimalInputOption, addMinimalOutputOption
from obitools3.uri.decode import open_uri
from obitools3.apps.config import logger
from obitools3.libalign._qsassemble import QSolexaReverseAssemble
from obitools3.libalign._qsrassemble import QSolexaRightReverseAssemble
from obitools3.libalign._solexapairend import buildConsensus, buildJoinedSequence
from obitools3.dms.obiseq cimport Nuc_Seq_Stored, Nuc_Seq
from obitools3.dms.obiseq cimport Nuc_Seq
from obitools3.libalign.shifted_ali cimport Kmer_similarity, Ali_shifted
import sys
import os
......@@ -27,6 +28,7 @@ REVERSE_QUALITY_COLUMN_NAME = b"REVERSE_QUALITY"
__title__="Aligns paired-ended reads"
def addOptions(parser):
addMinimalInputOption(parser)
......@@ -41,14 +43,6 @@ def addOptions(parser):
type=str,
help="URI to the reverse reads if they are in a different view than the forward reads")
# TODO
# group.add_argument('--index-file',
# action="store", dest="alignpairedend:index",
# metavar="<FILENAME>",
# default=None,
# type=str,
# help="URI to the index reads")
group.add_argument('--score-min',
action="store", dest="alignpairedend:smin",
metavar="#.###",
......@@ -56,12 +50,22 @@ def addOptions(parser):
type=float,
help="Minimum score for keeping alignments")
group.add_argument('-A', '--true-ali',
action="store_true", dest="alignpairedend:trueali",
default=False,
help="Performs gap free end alignment of sequences instead of using kmers to compute alignments (slower).")
group.add_argument('-k', '--kmer-size',
action="store", dest="alignpairedend:kmersize",
metavar="#",
default=3,
type=int,
help="K-mer size for kmer comparisons, between 1 and 4 (not when using -A option; default: 3)")
# TODO declarations, cdef, imports
la = QSolexaReverseAssemble()
ra = QSolexaRightReverseAssemble()
def buildAlignment(object direct, object reverse):
cdef object buildAlignment(object direct, object reverse):
if len(direct)==0 or len(reverse)==0:
return None
......@@ -82,9 +86,9 @@ def buildAlignment(object direct, object reverse):
ali = rali
return ali
def alignmentIterator(entries):
def alignmentIterator(entries, aligner):
if type(entries) == list:
two_views = True
......@@ -94,7 +98,7 @@ def alignmentIterator(entries):
else:
two_views = False
entries_len = len(entries)
for i in range(entries_len):
if two_views:
seqF = forward[i]
......@@ -104,9 +108,12 @@ def alignmentIterator(entries):
seqR = Nuc_Seq(seqF.id, seqF[REVERSE_SEQ_COLUMN_NAME], quality=seqF[REVERSE_QUALITY_COLUMN_NAME])
seqF.pop(REVERSE_SEQ_COLUMN_NAME)
seqF.pop(REVERSE_QUALITY_COLUMN_NAME)
ali = buildAlignment(seqF, seqR)
ali = aligner(seqF, seqR)
if ali is None:
continue
yield ali
......@@ -160,11 +167,6 @@ def run(config):
else:
entries_len = len(entries)
# if "index" in config["alignpairedend"]: # TODO
# index = open_uri(config["alignpairedend"]["index"])
# else:
# index = None
# Open the output
output = open_uri(config['obi']['outputURI'],
input=False,
......@@ -174,22 +176,30 @@ def run(config):
view = output[1]
Column.new_column(view, b"QUALITY", OBI_QUAL) #TODO output URI quality option
Column.new_column(view, QUALITY_COLUMN, OBI_QUAL) #TODO output URI quality option?
if 'smin' in config['alignpairedend']:
config['alignpairedend']['sminL'] = config['alignpairedend']['smin']
config['alignpairedend']['sminR'] = config['alignpairedend']['smin']
sminL = config['alignpairedend']['sminL']
sminR = config['alignpairedend']['sminR']
smin = config['alignpairedend']['smin']
else:
sminL = 0
sminR = 0
smin = 0
# Initialize the progress bar
pb = ProgressBar(entries_len, config, seconde=5)
ba = alignmentIterator(entries)
if config['alignpairedend']['trueali']:
kmer_ali = False
aligner = buildAlignment
else :
kmer_ali = True
if type(entries) == list:
forward = entries[0]
reverse = entries[1]
aligner = Kmer_similarity(forward, view2=reverse, kmer_size=config['alignpairedend']['kmersize'])
else:
aligner = Kmer_similarity(entries, kmer_size=config['alignpairedend']['kmersize'])
ba = alignmentIterator(entries, aligner)
i = 0
for ali in ba:
......@@ -197,31 +207,33 @@ def run(config):
consensus = view[i]
if sminL > 0:
if ((ali.direction=='left' and ali.score > sminL)
or (ali.score > sminR)):
buildConsensus(ali, consensus)
if not two_views:
seqF = entries[i]
else:
seqF = forward[i]
if smin > 0:
if (ali.score > smin) :
buildConsensus(ali, consensus, seqF)
else:
seqF = ali[0].wrapped
if not two_views:
seq = entries[i]
seqR = Nuc_Seq(seq.id, seq[REVERSE_SEQ_COLUMN_NAME], quality = seq[REVERSE_QUALITY_COLUMN_NAME])
seqR = Nuc_Seq(seqF.id, seqF[REVERSE_SEQ_COLUMN_NAME], quality = seqF[REVERSE_QUALITY_COLUMN_NAME])
else:
seqR = reverse[i]
buildJoinedSequence(ali, seqR, consensus)
buildJoinedSequence(ali, seqR, consensus, forward=seqF)
consensus[b"sminL"] = sminL
consensus[b"sminR"] = sminR
consensus[b"smin"] = smin
else:
buildConsensus(ali, consensus)
# TODO
# if "index" in config['alignpairedend'] and config['alignpairedend']['index'] is not None:
# idx = str(config['alignpairedend']['index'].next())
# consensus[b"illumina_index"] = idx
buildConsensus(ali, consensus, seqF)
if kmer_ali :
ali.free()
i+=1
if kmer_ali :
aligner.free()
# Save command config in View and DMS comments
command_line = " ".join(sys.argv[1:])
view.write_config(config, "alignpairedend", command_line, input_dms_name=input_dms_name, input_view_name=input_view_name)
......
#cython: language_level=3
from obitools3.dms.capi.obiview cimport Obiview_p
from obitools3.dms.capi.obidmscolumn cimport OBIDMS_column_p
from obitools3.dms.capi.obitypes cimport index_t
from libc.stdint cimport int32_t, uint8_t
cdef extern from "kmer_similarity.h" nogil:
struct Obi_ali_t :
double score
int consensus_length
int overlap_length
char* consensus_seq
uint8_t* consensus_qual
int shift
char* direction
ctypedef Obi_ali_t* Obi_ali_p
void obi_free_shifted_ali(Obi_ali_p ali)
Obi_ali_p kmer_similarity(Obiview_p view1,
OBIDMS_column_p column1,
index_t idx1,
index_t elt_idx1,
Obiview_p view2,
OBIDMS_column_p column2,
index_t idx2,
index_t elt_idx2,
uint8_t kmer_size,
int32_t* kmer_pos_array,
int32_t* kmer_pos_array_height_p,
int32_t* shift_array,
int32_t* shift_array_height_p,
int32_t* shift_count_array,
int32_t* shift_count_array_height_p,
bint build_consensus)
......@@ -6,6 +6,7 @@
../../../src/dna_seq_indexer.c
../../../src/encode.c
../../../src/hashtable.c
../../../src/kmer_similarity.c
../../../src/linked_list.c
../../../src/murmurhash2.c
../../../src/obi_lcs.c
......
......@@ -3,6 +3,18 @@
from cpython cimport array
from .solexapairend import iterOnAligment
from .shifted_ali cimport Ali_shifted
from obitools3.dms.capi.obiview cimport Obiview_p, QUALITY_COLUMN, \
obi_set_qual_int_with_elt_idx_and_col_p_in_view, \
obi_set_str_with_elt_idx_and_col_p_in_view
from obitools3.dms.capi.obidmscolumn cimport OBIDMS_column_p
from obitools3.dms.view.view cimport View
from obitools3.dms.column.column cimport Column
from math import log10
......@@ -146,7 +158,7 @@ cdef class IterOnConsensus:
seqBDeletion = property(get_seqBDeletion, None, None, "deletereverse's docstring")
def buildConsensus(ali, seq):
def buildConsensus(ali, seq, ref_tags=None):
cdef list quality
cdef char aseq[1000]
cdef int i=0
......@@ -156,61 +168,82 @@ def buildConsensus(ali, seq):
cdef double score
cdef bytes sseq
cdef double p
cdef OBIDMS_column_p col_p
cdef Obiview_p view_p
cdef View view
cdef Column column
quality = []
if type(ali) == Ali_shifted:
view = seq.view
view_p = view.pointer()
column = view[QUALITY_COLUMN]
col_p = column.pointer()
# doesn't work because uint8_t* are forced into bytes by cython (nothing read/kept beyond 0 values)
#obi_set_qual_int_with_elt_idx_and_col_p_in_view(view_p, col_p, seq.index, 0, ali.consensus_qual, ali.consensus_len)
seq.set(ref_tags.id+b"_CONS", ali.consensus_seq, quality=ali.consensus_qual, tags=ref_tags)
seq[b'ali_length'] = ali.consensus_len
seq[b'score_norm']=float(ali.score)/ali.consensus_len
seq[b'shift']=ali.shift
else:
if len(ali[0])>999: # TODO why?
raise AssertionError,"Too long alignemnt"
if len(ali[0])>999: # TODO why?
raise AssertionError,"Too long alignemnt"
ic=IterOnConsensus(ali)
for nuc,score in ic:
cnuc=nuc
quality.append(score)
aseq[i]=cnuc[0]
i+=1
aseq[i]=0
sseq=aseq
# Reconvert quality from array of probabilities to array of raw quality values
i=0
for p in quality:
quality[i] = min(42, round(-10*log10(p)))
i+=1
ic=IterOnConsensus(ali)
for nuc,score in ic:
cnuc=nuc
quality.append(score)
aseq[i]=cnuc[0]
i+=1
seq.set(ali[0].wrapped.id+b"_CONS", sseq, quality=quality, tags=ali[0].wrapped)
aseq[i]=0
sseq=aseq
# Reconvert quality from array of probabilities to array of raw quality values
i=0
for p in quality:
quality[i] = min(42, round(-10*log10(p)))
i+=1
seq.set(ali[0].wrapped.id+b"_CONS", sseq, quality=quality, tags=ali[0].wrapped)
if hasattr(ali, "counter"):
seq[b'alignement_id']=ali.counter
seq[b'seq_a_single']=ic.seqASingle
seq[b'seq_b_single']=ic.seqBSingle
seq[b'seq_ab_match']=ic.seqABMatch
seq[b'seq_a_mismatch']=ic.seqAMismatch
seq[b'seq_b_mismatch']=ic.seqBMismatch
seq[b'seq_a_insertion']=ic.seqAInsertion
seq[b'seq_b_insertion']=ic.seqBInsertion-ic.seqBSingle
seq[b'seq_a_deletion']=ic.seqADeletion
seq[b'seq_b_deletion']=ic.seqBDeletion
seq[b'ali_length']=len(seq)-ic.seqASingle-ic.seqBSingle
if seq[b'ali_length']>0:
seq[b'score_norm']=float(ali.score)/seq[b'ali_length']
if hasattr(ali, "direction"):
seq[b'direction']=ali.direction
if hasattr(ali, "counter"):
seq[b'alignement_id']=ali.counter
seq[b'seq_a_single']=ic.seqASingle
seq[b'seq_b_single']=ic.seqBSingle
seq[b'seq_ab_match']=ic.seqABMatch
seq[b'seq_a_mismatch']=ic.seqAMismatch
seq[b'seq_b_mismatch']=ic.seqBMismatch
seq[b'seq_a_insertion']=ic.seqAInsertion
seq[b'seq_b_insertion']=ic.seqBInsertion-ic.seqBSingle
seq[b'seq_a_deletion']=ic.seqADeletion
seq[b'seq_b_deletion']=ic.seqBDeletion
seq[b'ali_direction']=ali.direction
seq[b'score']=ali.score
seq[b'ali_length']=len(seq)-ic.seqASingle-ic.seqBSingle
if seq[b'ali_length']>0:
seq[b'score_norm']=float(ali.score)/seq[b'ali_length']
seq[b'mode']=b'alignment'
return seq
def buildJoinedSequence(ali, reverse, seq):
forward = ali[0].wrapped
def buildJoinedSequence(ali, reverse, seq, forward=None):
if forward is not None:
forward = forward
else:
forward = ali[0].wrapped
s = forward.seq + reverse.seq
quality = forward.quality
quality.extend(reverse.quality)
seq.set(forward.id +b"_PairedEnd", s, definition=forward.definition, quality=quality, tags=forward)
seq[b"score"]=ali.score
seq[b"ali_dir"]=ali.direction
seq[b"ali_direction"]=ali.direction
seq[b"mode"]=b"joined"
seq[b"pairedend_limit"]=len(forward)
return seq
......
#cython: language_level=3
from obitools3.dms.capi.kmer_similarity cimport kmer_similarity, Obi_ali_p
from libc.stdint cimport int32_t, uint8_t
from obitools3.dms.capi.obiview cimport Obiview_p
from obitools3.dms.capi.obidmscolumn cimport OBIDMS_column_p
cdef class Ali_shifted:
cdef Obi_ali_p _pointer
cdef inline Obi_ali_p pointer(self)
cpdef free(self)
@staticmethod
cdef Ali_shifted new_ali(Obi_ali_p ali_p)
cdef class Kmer_similarity:
cdef uint8_t kmer_size
cdef int32_t* kmer_pos_array_p
cdef int32_t kmer_pos_array_height_a[1]
cdef int32_t* shift_array_p
cdef int32_t shift_array_height_a[1]
cdef int32_t* shift_count_array_p
cdef int32_t shift_count_array_height_a[1]
cdef Obiview_p view1_p
cdef OBIDMS_column_p column1_p
cdef Obiview_p view2_p
cdef OBIDMS_column_p column2_p
cdef bint build_consensus
cpdef free(self)
\ No newline at end of file
#cython: language_level=3
from obitools3.dms.capi.obiview cimport NUC_SEQUENCE_COLUMN
from obitools3.dms.obiseq cimport Nuc_Seq_Stored
from obitools3.dms.capi.kmer_similarity cimport kmer_similarity, Obi_ali_p, obi_free_shifted_ali
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
from obitools3.dms.column.column cimport Column
from libc.stdlib cimport free
cdef class Ali_shifted:
cdef inline Obi_ali_p pointer(self) :
return <Obi_ali_p>(self._pointer)
@staticmethod
cdef Ali_shifted new_ali(Obi_ali_p ali_p) : # not __init__ method because need to pass pointer
ali = Ali_shifted()
ali._pointer = ali_p
return ali
@property
def score(self):
return self.pointer().score
@property
def consensus_len(self):
return self.pointer().consensus_length
@property
def overlap_len(self):
return self.pointer().overlap_length
@property
def consensus_seq(self):
return self.pointer().consensus_seq
@property
def shift(self):
return self.pointer().shift
@property
def consensus_qual(self): # must return list because uint8_t* are forced into bytes by cython which won't keep beyond the first 0 value
qual_list = []
for i in range(self.consensus_len):
qual_list.append((self.pointer().consensus_qual)[i])
return qual_list
@property
def direction(self):
return self.pointer().direction
cpdef free(self):
obi_free_shifted_ali(self.pointer())
cdef class Kmer_similarity:
def __init__(self, View_NUC_SEQS view1, Column column1=None, View_NUC_SEQS view2=None, Column column2=None, uint8_t kmer_size=3, build_consensus=True) :
cdef Column default_seq_col
if kmer_size < 1 or kmer_size > 3:
raise Exception("Kmer size to compute kmer similarity must be >=1 or <=4")
self.kmer_pos_array_p = NULL
self.shift_array_p = NULL
self.shift_count_array_p = NULL
self.kmer_pos_array_height_a[0] = 0
self.shift_array_height_a[0] = 0
self.shift_count_array_height_a[0] = 0
self.kmer_size = kmer_size
self.build_consensus = build_consensus
self.view1_p = view1.pointer()
if column1 is not None:
self.column1_p = column1.pointer()
else:
if type(view1) != View_NUC_SEQS:
raise Exception("Kmer similarity with no sequence column given must be given a NUC_SEQS view")
default_seq_col = view1[NUC_SEQUENCE_COLUMN]
self.column1_p = default_seq_col.pointer()
if view2 is not None:
self.view2_p = view2.pointer()
else:
view2 = view1
self.view2_p = self.view1_p
if column2 is not None:
self.column2_p = column2.pointer()
else:
if type(view2) != View_NUC_SEQS:
raise Exception("Kmer similarity with no sequence column given must be given a NUC_SEQS view")
default_seq_col = view2[NUC_SEQUENCE_COLUMN]
self.column2_p = default_seq_col.pointer()
def __call__(self, Nuc_Seq_Stored seq1, Nuc_Seq_Stored seq2):
cdef Obi_ali_p ali_p
cdef Ali_shifted ali
ali_p = kmer_similarity(self.view1_p, self.column1_p, seq1.index, 0, \
self.view2_p, self.column2_p, seq2.index, 0, \
self.kmer_size, \
self.kmer_pos_array_p, self.kmer_pos_array_height_a, \
self.shift_array_p, self.shift_array_height_a, \
self.shift_count_array_p, self.shift_count_array_height_a,
self.build_consensus)
ali = Ali_shifted.new_ali(ali_p)
return ali
cpdef free(self):
if self.kmer_pos_array_p is not NULL:
free(self.kmer_pos_array_p)
self.kmer_pos_array_height_a[0] = 0
if (self.shift_array_p is not NULL) :
free(self.shift_array_p)
self.shift_array_height_a[0] = 0
if (self.shift_count_array_p is not NULL) :
free(self.shift_count_array_p)
self.shift_count_array_height_a[0] = 0
......@@ -6,6 +6,7 @@
../../src/dna_seq_indexer.c
../../src/encode.c
../../src/hashtable.c
../../src/kmer_similarity.c
../../src/linked_list.c
../../src/murmurhash2.c
../../src/obi_lcs.c
......
/****************************************************************************
* Header file for Kmer similarity computation functions *
****************************************************************************/
/**
* @file kmer_similarity.h
* @author Celine Mercier (celine.mercier@metabarcoding.org)
* @date January 7th 2019
* @brief Header file for Kmer similarity computation functions.
*/
#ifndef KMER_SIMILARITY_H_
#define KMER_SIMILARITY_H_
#include <stdio.h>
#include "obitypes.h"
#include "obidmscolumn.h"
#include "obiview.h"
#define ARRAY_LENGTH (256)
/**
* @brief Alignment structure, with informations about the similarity and to rebuild the alignment.
*/
typedef struct Obi_ali {
double score; /**< Alignment score.
*/
int consensus_length; /**< Length of the final consensus sequence.
*/
int overlap_length; /**< Length of the overlap between the aligned sequences.
*/
char* consensus_seq; /**< Consensus sequence built with the computed overlap.
*/
uint8_t* consensus_qual; /**< Consensus quality built with the computed overlap.
*/
int shift; /**< Shift chosen to align the sequences (for shifted alignment).
*/
char direction[6]; /**< Alignement direction (positive/right or negative/left shift) (for shifted alignment).
*/ // TODO but sequences switched around depending on size..... discuss
} Obi_ali_t, *Obi_ali_p;
/**
* @brief Frees an Obi_ali_p structure and all its elements.
*
* @warning The pointer sent becomes unusable.
*
* @param ali The pointer on the Obi_ali_p structure to free.
*
* @since January 2019
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
void obi_free_shifted_ali(Obi_ali_p ali);
/**
* Function computing the kmer similarity of two sequences stored in views.
*
* The similarity is computing this way: the positions of identical kmers in both sequences are
* compared, and the most represented shift is chosen. The similarity is then calculated as:
* kmer_similarity = number_of_common_kmers_with_the_chosen_shift + kmer_size - 1
*
* @warning Several pointers and structures passed to or returned by the function have to be freed by the caller:
* - kmer_pos_array
* - shift_array
* - shift_count_array
* - the Obi_ali_p structure returned
*
* @param view1 A pointer on the view containing the first sequence.
* @param column1 A pointer on the column containing the first sequence.
* @param idx1 The index of the first sequence in view1.
* @param elt_idx1 The element index of the first sequence in column1.
* @param view2 A pointer on the view containing the second sequence.
* @param column2 A pointer on the column containing the second sequence.
* @param idx2 The index of the second sequence in view2.
* @param elt_idx2 The element index of the second sequence in column2.
* @param kmer_size The kmer length to use. Must be >= 1 <= 4.
* @param kmer_pos_array The array used to store kmer positions. If NULL, allocated by the function.
* If needed, reallocated to a bigger size.
* @param kmer_pos_array_height_p A pointer on an integer corresponding to the size (number of elements)
* allocated for kmer_pos_array. Updated by the function as needed.
* @param shift_array The array used to store kmer shifts. If NULL, allocated by the function.
* If needed, reallocated to a bigger size.
* @param shift_array_height_p A pointer on an integer corresponding to the size (number of elements)
* allocated for shift_array. Updated by the function as needed.
* @param shift_count_array The array used to store shift counts. If NULL, allocated by the function.
* If needed, reallocated to a bigger size.
* @param shift_count_array_height_p A pointer on an integer corresponding to the size (number of elements)
* allocated for shift_count_array. Updated by the function as needed.
* @param build_consensus A boolean indicating whether the function should build the consensus sequence and quality. // TODO option to build consensus without quality?
*
* @returns A pointer on an Obi_ali_p structure containing the results.
* @retval NULL if an error occurred.
*
* @since January 2019
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
Obi_ali_p kmer_similarity(Obiview_p view1,
OBIDMS_column_p column1,
index_t idx1,
index_t elt_idx1,
Obiview_p view2,
OBIDMS_column_p column2,
index_t idx2,
index_t elt_idx2,
uint8_t kmer_size,
int32_t* kmer_pos_array,
int32_t* kmer_pos_array_height_p,
int32_t* shift_array,
int32_t* shift_array_height_p,
int32_t* shift_count_array,
int32_t* shift_count_array_height_p,
bool build_consensus);
#endif /* KMER_SIMILARITY_H_ */
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment