Commit 99a397b8 by Celine Mercier

obi uniq: various improvements and fixes #66

parent f5c472ff
......@@ -7,12 +7,13 @@ from obitools3.dms.obiseq cimport Nuc_Seq_Stored
from obitools3.dms.view import RollbackException
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
from obitools3.dms.column.column cimport Column, Column_line
from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN, ID_COLUMN, TAXID_COLUMN
from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN, ID_COLUMN, TAXID_COLUMN, \
TAXID_DIST_COLUMN, MERGED_TAXID_COLUMN, MERGED_COLUMN, MERGED_PREFIX
from obitools3.dms.capi.obitypes cimport OBI_INT, OBI_STR, index_t
from obitools3.apps.optiongroups import addMinimalInputOption, addMinimalOutputOption, addTaxonomyOption
from obitools3.uri.decode import open_uri
from obitools3.apps.config import logger
from obitools3.utils cimport tobytes
from obitools3.utils cimport tobytes, tostr
import sys
......@@ -34,12 +35,12 @@ def addOptions(parser):
metavar="<TAG NAME>",
default=[],
type=str,
help="Attributes to merge.") # TODO must be a 1 elt/line column
help="Attributes to merge.") # note: must be a 1 elt/line column, but columns containing already merged information (name MERGED_*) are automatically remerged
group.add_argument('--merge-ids', '-e',
action="store_true", dest="uniq:mergeids",
default=False,
help="ONLY WORKING ON SMALL SETS FOR NOW Add the merged key with all ids of merged sequences.") # TODO ?
help="Add the merged key with all ids of merged sequences.")
group.add_argument('--category-attribute', '-c',
action="append", dest="uniq:categories",
......@@ -129,9 +130,9 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
)
for seq in o_view:
if b"merged_taxid" in seq :
if MERGED_TAXID_COLUMN in seq :
m_taxids = []
m_taxids_dict = seq[b"merged_taxid"]
m_taxids_dict = seq[MERGED_TAXID_COLUMN]
for k in m_taxids_dict.keys() :
if m_taxids_dict[k] is not None:
m_taxids.append(int(k))
......@@ -171,7 +172,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
seq[b"scientific_name"] = taxonomy.get_scientific_name(taxid)
cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=None, Taxonomy taxonomy=None, bint mergeIds=False, list categories=None, int max_elts=10000) :
cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=None, Taxonomy taxonomy=None, bint mergeIds=False, list categories=None, int max_elts=1000000) :
cdef int i
cdef int k
......@@ -215,9 +216,18 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
cdef object iter_view
cdef object mcol
cdef object to_merge
cdef list merged_list
uniques = {}
for column_name in view.keys():
if column_name[:7] == b"MERGED_":
info_to_merge = column_name[7:]
if mergedKeys_list is not None:
mergedKeys_list.append(tostr(info_to_merge))
else:
mergedKeys_list = [tostr(info_to_merge)]
mergedKeys_list_b = []
if mergedKeys_list is not None:
for key_str in mergedKeys_list:
......@@ -230,12 +240,12 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
mergedKeys_set.add(TAXID_COLUMN)
mergedKeys = list(mergedKeys_set)
k_count = len(mergedKeys)
mergedKeys_m = []
for k in range(k_count):
mergedKeys_m.append(b"merged_" + mergedKeys[k])
mergedKeys_m.append(MERGED_PREFIX + mergedKeys[k])
if categories is None:
categories = []
......@@ -245,8 +255,8 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
i_id_col = view[ID_COLUMN]
if TAXID_COLUMN in view:
i_taxid_col = view[TAXID_COLUMN]
if b"taxid_dist" in view:
i_taxid_dist_col = view[b"taxid_dist"]
if TAXID_DIST_COLUMN in view:
i_taxid_dist_col = view[TAXID_DIST_COLUMN]
# First browsing
......@@ -278,13 +288,13 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
for k in range(k_count):
key = mergedKeys[k]
mkey = mergedKeys_m[k]
# if mkey in i_seq: # TODO
# if mkey not in merged_infos:
# merged_infos[mkey] = {}
# mkey_infos = merged_infos[mkey]
# mkey_infos['nb_elts'] = 1
# mkey_infos['elt_names'] = [i_seq[key]]
if key in i_seq: # TODO what if mkey already in i_seq? --> should update
if mkey in i_seq:
if mkey not in merged_infos:
merged_infos[mkey] = {}
mkey_infos = merged_infos[mkey]
mkey_infos['nb_elts'] = view[mkey].nb_elements_per_line
mkey_infos['elt_names'] = view[mkey].elements_names
if key in i_seq:
if mkey not in merged_infos:
merged_infos[mkey] = {}
mkey_infos = merged_infos[mkey]
......@@ -304,6 +314,7 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
key = mergedKeys[k]
merged_col_name = mergedKeys_m[k]
i_col = view[key]
if merged_infos[merged_col_name]['nb_elts'] > max_elts:
str_merged_cols.append(merged_col_name)
Column.new_column(o_view,
......@@ -311,7 +322,7 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
OBI_STR,
to_eval=True,
comments=i_col.comments,
alias=merged_col_name # TODO what if it already exists
alias=merged_col_name
)
else:
......@@ -321,7 +332,7 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
nb_elements_per_line=merged_infos[merged_col_name]['nb_elts'],
elements_names=list(merged_infos[merged_col_name]['elt_names']),
comments=i_col.comments,
alias=merged_col_name # TODO what if it already exists
alias=merged_col_name
)
mkey_cols[merged_col_name] = o_view[merged_col_name]
......@@ -329,41 +340,39 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
# taxid_dist column
if mergeIds and TAXID_COLUMN in mergedKeys:
if len(view) > max_elts: #The number of different IDs corresponds to the number of sequences in the view
str_merged_cols.append(b"taxid_dist")
str_merged_cols.append(TAXID_DIST_COLUMN)
Column.new_column(o_view,
b"taxid_dist",
TAXID_DIST_COLUMN,
OBI_STR,
to_eval=True,
alias=b"taxid_dist" # TODO what if it already exists
alias=TAXID_DIST_COLUMN
)
else:
Column.new_column(o_view,
b"taxid_dist",
TAXID_DIST_COLUMN,
OBI_INT,
nb_elements_per_line=len(view),
elements_names=[id for id in i_id_col],
alias=b"taxid_dist" # TODO what if it already exists
alias=TAXID_DIST_COLUMN
)
del(merged_infos)
# Merged ids column
if mergeIds :
Column.new_column(o_view,
b"merged",
MERGED_COLUMN,
OBI_STR,
tuples=True,
alias=b"merged" # TODO what if it already exists
alias=MERGED_COLUMN
)
# Keep columns that are going to be used a lot in variables
o_id_col = o_view[ID_COLUMN]
if b"taxid_dist" in o_view:
o_taxid_dist_col = o_view[b"taxid_dist"]
if b"merged" in o_view:
o_merged_col = o_view[b"merged"]
if TAXID_DIST_COLUMN in o_view:
o_taxid_dist_col = o_view[TAXID_DIST_COLUMN]
if MERGED_COLUMN in o_view:
o_merged_col = o_view[MERGED_COLUMN]
print("\n") # TODO because in the middle of progress bar. Better solution?
logger("info", "Second browsing through the input")
......@@ -383,11 +392,15 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
o_id = o_seq.id
if mergeIds:
o_merged_col[o_idx] = [view[idx].id for idx in merged_sequences]
merged_list = [view[idx].id for idx in merged_sequences]
if MERGED_COLUMN in view: # merge all ids if there's already some merged ids info
merged_list.extend(view[MERGED_COLUMN][idx] for idx in merged_sequences)
merged_list = list(set(merged_list)) # deduplicate the list
o_merged_col[o_idx] = merged_list
o_seq[COUNT_COLUMN] = 0
if b"taxid_dist" in u_seq and i_taxid_dist_col[u_idx] is not None:
if TAXID_DIST_COLUMN in u_seq and i_taxid_dist_col[u_idx] is not None:
taxid_dist_dict = i_taxid_dist_col[u_idx]
else:
taxid_dist_dict = {}
......@@ -407,19 +420,19 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
i_count = i_seq[COUNT_COLUMN]
o_seq[COUNT_COLUMN] += i_count
for k in range(k_count):
key = mergedKeys[k]
mkey = mergedKeys_m[k]
if key==TAXID_COLUMN and mergeIds:
if b"taxid_dist" in i_seq:
if TAXID_DIST_COLUMN in i_seq:
taxid_dist_dict.update(i_taxid_dist_col[i_idx])
if TAXID_COLUMN in i_seq:
taxid_dist_dict[i_id] = i_taxid_col[i_idx]
#cas ou on met a jour les merged_keys mais il n'y a pas de merged_keys dans la sequence qui arrive
# merge relevant keys
if key in i_seq:
to_merge = i_seq[key]
if to_merge is not None:
......@@ -431,21 +444,20 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
else:
mcol[to_merge] = mcol[to_merge] + i_count
o_seq[key] = None
#cas ou merged_keys existe deja
else: # TODO is this a good else
if mkey in i_seq:
mcol = merged_dict[mkey]
i_mcol = i_seq[mkey]
if i_mcol is not None:
for key2 in i_mcol:
if mcol[key2] is None:
mcol[key2] = i_mcol[key2]
else:
mcol[key2] = mcol[key2] + i_mcol[key2]
# merged infos already in seq: merge the merged infos
if mkey in i_seq:
mcol = merged_dict[mkey] # dict
i_mcol = i_seq[mkey] # column line
if i_mcol.is_NA() == False:
for key2 in i_mcol:
if key2 not in mcol:
mcol[key2] = i_mcol[key2]
else:
mcol[key2] = mcol[key2] + i_mcol[key2]
# Write taxid_dist
if mergeIds and TAXID_COLUMN in mergedKeys:
if b"taxid_dist" in str_merged_cols:
if TAXID_DIST_COLUMN in str_merged_cols:
o_taxid_dist_col[o_idx] = str(taxid_dist_dict)
else:
o_taxid_dist_col[o_idx] = taxid_dist_dict
......@@ -456,7 +468,7 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
mkey_cols[mkey][o_idx] = str(merged_dict[mkey])
else:
mkey_cols[mkey][o_idx] = merged_dict[mkey]
# Sets NA values to 0 # TODO discuss, maybe keep as None and test for None instead of testing for 0 in tools
# Sets NA values to 0 # TODO discuss, for now keep as None and test for None instead of testing for 0 in tools
#for key in mkey_cols[mkey][o_idx]:
# if mkey_cols[mkey][o_idx][key] is None:
# mkey_cols[mkey][o_idx][key] = 0
......@@ -464,7 +476,8 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
for key in i_seq.keys():
# Delete informations that differ between the merged sequences
# TODO make special columns list?
if key != COUNT_COLUMN and key != ID_COLUMN and key != NUC_SEQUENCE_COLUMN and key in o_seq and o_seq[key] != i_seq[key] :
if key != COUNT_COLUMN and key != ID_COLUMN and key != NUC_SEQUENCE_COLUMN and key in o_seq and o_seq[key] != i_seq[key] \
and key not in merged_dict :
o_seq[key] = None
o_idx += 1
......
......@@ -26,6 +26,10 @@ cdef extern from "obiview.h" nogil:
extern const_char_p QUALITY_COLUMN
extern const_char_p COUNT_COLUMN
extern const_char_p TAXID_COLUMN
extern const_char_p MERGED_TAXID_COLUMN
extern const_char_p MERGED_PREFIX
extern const_char_p TAXID_DIST_COLUMN
extern const_char_p MERGED_COLUMN
struct Alias_column_pair_t :
Column_reference_t column_refs
......
......@@ -57,9 +57,14 @@
*/
#define TAXID_COLUMN "TAXID" /**< The name of the column containing the taxids. TODO subtype of INT column?
*/
#define MERGED_TAXID_COLUMN "merged_TAXID" /**< The name of the column containing the merged taxids information.
*/ // TODO maybe mix of lower and uppercase == bad
#define MERGED_TAXID_COLUMN "MERGED_TAXID" /**< The name of the column containing the merged taxids information.
*/
#define MERGED_PREFIX "MERGED_" /**< The prefix to prepend to column names when merging informations during obi uniq.
*/
#define TAXID_DIST_COLUMN "TAXID_DIST" /**< The name of the column containing a dictionary of taxid:[list of ids] when merging informations during obi uniq.
*/
#define MERGED_COLUMN "MERGED" /**< The name of the column containing a list of ids when merging informations during obi uniq.
*/
#define ID_PREFIX "seq" /**< The default prefix of sequence identifiers in automatic ID columns.
*/
#define PREDICATE_KEY "predicates" /**< The key used in the json-formatted view comments to store predicates.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment