Commit 3e45c344 authored by mercierc's avatar mercierc

import: now imports and adds taxids for SILVA and RDP files, added

import of lists, fixed skipping of errors (was not overwriting), and
fixed --no-progress-bar option
parent c2f3d90d
......@@ -49,7 +49,13 @@ def __addImportInputOption(optionManager):
action="store_const", dest="obi:inputformat",
default=None,
const=b'silva',
help="Input file is in SILVA fasta format")
help="Input file is in SILVA fasta format. If NCBI taxonomy provided with --taxonomy, taxid and scientific name will be added for each sequence.")
group.add_argument('--rdp-input',
action="store_const", dest="obi:inputformat",
default=None,
const=b'rdp',
help="Input file is in RDP training set fasta format. If NCBI taxonomy provided with --taxonomy, taxid and scientific name will be added for each sequence.")
group.add_argument('--embl-input',
action="store_const", dest="obi:inputformat",
......
......@@ -34,14 +34,17 @@ from obitools3.dms.capi.obidms cimport obi_import_view
from obitools3.dms.capi.obitypes cimport obitype_t, \
OBI_VOID, \
OBI_QUAL, \
OBI_STR
OBI_STR, \
OBI_INT
from obitools3.dms.capi.obierrno cimport obi_errno
from obitools3.apps.optiongroups import addImportInputOption, \
addTabularInputOption, \
addTaxdumpInputOption, \
addMinimalOutputOption
addMinimalOutputOption, \
addNoProgressBarOption, \
addTaxonomyOption
from obitools3.uri.decode import open_uri
......@@ -50,6 +53,7 @@ from obitools3.apps.config import logger
from cpython.exc cimport PyErr_CheckSignals
from io import BufferedWriter
import ast
__title__="Import sequences from different formats into a DMS"
......@@ -69,7 +73,9 @@ def addOptions(parser):
addImportInputOption(parser)
addTabularInputOption(parser)
addTaxdumpInputOption(parser)
addTaxonomyOption(parser)
addMinimalOutputOption(parser)
addNoProgressBarOption(parser)
group = parser.add_argument_group('obi import specific options')
......@@ -85,6 +91,10 @@ def addOptions(parser):
help="If importing a view into another DMS, do it by importing each line, saving disk space if the original view "
"has a line selection associated.")
# group.add_argument('--only-id',
# action="store", dest="import:onlyid",
# help="only id")
def run(config):
cdef tuple input
......@@ -97,6 +107,7 @@ def run(config):
cdef bint get_quality
cdef bint NUC_SEQS_view
cdef bint silva
cdef bint rdp
cdef int nb_elts
cdef object d
cdef View view
......@@ -180,6 +191,16 @@ def run(config):
logger("info", "Done.")
return
# Open taxonomy if there is one
if 'taxoURI' in config['obi'] and config['obi']['taxoURI'] is not None:
taxo_uri = open_uri(config['obi']['taxoURI'])
if taxo_uri is None or taxo_uri[2] == bytes:
raise Exception("Couldn't open taxonomy")
taxo = taxo_uri[1]
else :
taxo = None
# If importing a view between two DMS and not wanting to save space if line selection in original view, use C API
if isinstance(input[1], View) and not config['import']['space_priority']:
if obi_import_view(input[0].name_with_full_path, o_dms.name_with_full_path, input[1].name, tobytes((config['obi']['outputURI'].split('/'))[-1])) < 0 :
......@@ -192,8 +213,11 @@ def run(config):
logger("info", "Done.")
return
if entry_count >= 0:
# Reinitialize the progress bar
if entry_count >= 0 and config['obi']['noprogressbar'] == False:
pb = ProgressBar(entry_count, config)
else:
pb = None
NUC_SEQS_view = False
if isinstance(output[1], View) :
......@@ -202,20 +226,25 @@ def run(config):
NUC_SEQS_view = True
else:
raise NotImplementedError()
# Save basic columns in variables for optimization
if NUC_SEQS_view :
id_col = view[ID_COLUMN]
def_col = view[DEFINITION_COLUMN]
seq_col = view[NUC_SEQUENCE_COLUMN]
# Prepare taxon scientific name if SILVA file
if 'inputformat' in config['obi'] and config['obi']['inputformat'] == b"silva":
# Prepare taxon scientific name and taxid refs if RDP or SILVA file
silva = False
rdp = False
if 'inputformat' in config['obi'] and (config['obi']['inputformat'] == b"silva" or config['obi']['inputformat'] == b"rdp"):
#if taxo is None:
# raise Exception("A taxonomy (as built by 'obi import --taxdump') must be provided for SILVA and RDP files")
silva = True
sci_name_col = Column.new_column(view, SCIENTIFIC_NAME_COLUMN, OBI_STR)
else:
silva = False
rdp = True
if taxo is not None:
sci_name_col = Column.new_column(view, SCIENTIFIC_NAME_COLUMN, OBI_STR)
taxid_col = Column.new_column(view, TAXID_COLUMN, OBI_INT)
dcols = {}
# First read through the entries to prepare columns with dictionaries as they are very time-expensive to rewrite
......@@ -265,8 +294,13 @@ def run(config):
# Reinitialize the input
if isinstance(input[0], CompressedFile):
input_is_file = True
if entry_count >= 0:
# Reinitialize the progress bar
if entry_count >= 0 and config['obi']['noprogressbar'] == False:
pb = ProgressBar(entry_count, config)
else:
pb = None
try:
input[0].close()
except AttributeError:
......@@ -275,6 +309,11 @@ def run(config):
if input is None:
raise Exception("Could not open input URI")
# if 'onlyid' in config['import']:
# onlyid = tobytes(config['import']['onlyid'])
# else:
# onlyid = None
entries = input[1]
i = 0
for entry in entries :
......@@ -292,6 +331,9 @@ def run(config):
elif not i%50000:
logger("info", "Imported %d entries", i)
# if onlyid is not None and entry.id != onlyid:
# continue
try:
if NUC_SEQS_view:
......@@ -307,11 +349,18 @@ def run(config):
if get_quality:
qual_col[i] = entry.quality
# Parse taxon scientific name if SILVA file
if silva:
sci_name = entry.definition.split(b";")[-1]
sci_name_col[i] = sci_name
# Parse taxon scientific name if RDP file
if (rdp or silva) and taxo is not None:
sci_names = entry.definition.split(b";")
for sci_name in reversed(sci_names):
if sci_name.split()[0] != b'unidentified' and sci_name.split()[0] != b'uncultured' and sci_name.split()[0] != b'metagenome' :
taxon = taxo.get_taxon_by_name(sci_name)
if taxon is not None:
sci_name_col[i] = taxon.name
taxid_col[i] = taxon.taxid
#print(taxid_col[i], sci_name_col[i])
break
for tag in entry :
if tag != ID_COLUMN and tag != DEFINITION_COLUMN and tag != NUC_SEQUENCE_COLUMN and tag != QUALITY_COLUMN : # TODO dirty
......@@ -323,27 +372,39 @@ def run(config):
tag = COUNT_COLUMN
if tag[:7] == b"merged_":
tag = MERGED_PREFIX+tag[7:]
if type(value) == bytes and value[:1]==b"[" :
try:
if type(eval(value)) == list:
value = eval(value)
#print(value)
except:
pass
if tag not in dcols :
value_type = type(value)
nb_elts = 1
value_obitype = OBI_VOID
dict_col = False
if value_type == dict or value_type == list :
if value_type == dict :
nb_elts = len(value)
elt_names = list(value)
if value_type == dict :
dict_col = True
dict_col = True
else :
nb_elts = 1
elt_names = None
if value_type == list :
tuples = True
else:
tuples = False
value_obitype = get_obitype(value)
if value_obitype != OBI_VOID :
dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names, dict_column=dict_col), value_obitype)
dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names, dict_column=dict_col, tuples=tuples), value_obitype)
# Fill value
dcols[tag][0][i] = value
......@@ -419,11 +480,12 @@ def run(config):
dcols[tag][0][i] = value
except Exception as e:
print("\nCould not import sequence:", entry, "(error raised:", e, ")")
print("\nCould not import sequence:", repr(entry), "(error raised:", e, ")")
if 'skiperror' in config['obi'] and not config['obi']['skiperror']:
raise e
else:
pass
i-=1 # overwrite problematic entry
i+=1
......
......@@ -18,7 +18,7 @@ cdef object clean_empty_values_from_object(object value, exclude=*)
cdef obitype_t get_obitype_single_value(object value)
cdef obitype_t update_obitype(obitype_t obitype, object new_value)
cdef obitype_t get_obitype_iterable_value(object value)
cdef obitype_t get_obitype_iterable_value(object value, type t)
cdef obitype_t get_obitype(object value)
cdef object __etag__(bytes x, bytes nastring=*)
......@@ -260,38 +260,51 @@ cdef obitype_t update_obitype(obitype_t obitype, object new_value) :
new_type = type(new_value)
if obitype == OBI_INT :
if new_type == float or new_value > OBI_INT_MAX :
return OBI_FLOAT
#if new_type == NoneType: # doesn't work because Cython sucks
if new_value == None or new_type==list or new_type==dict or new_type==tuple:
return obitype
# TODO BOOL vers INT/FLOAT
elif new_type == str or new_type == bytes :
if new_type == str or new_type == bytes :
if obitype == OBI_SEQ and is_a_DNA_seq(tobytes(new_value)) :
pass
else :
return OBI_STR
elif obitype == OBI_INT :
if new_type == float or new_value > OBI_INT_MAX :
return OBI_FLOAT
return obitype
cdef obitype_t get_obitype_iterable_value(object value) :
cdef obitype_t get_obitype_iterable_value(object value, type t) :
cdef obitype_t value_obitype
value_obitype = OBI_VOID
for k in value :
if value_obitype == OBI_VOID :
value_obitype = get_obitype_single_value(value[k])
else :
value_obitype = update_obitype(value_obitype, value[k])
if t == dict:
for k in value :
if value_obitype == OBI_VOID :
value_obitype = get_obitype_single_value(value[k])
else :
value_obitype = update_obitype(value_obitype, value[k])
elif t == list or t == tuple:
for v in value :
if value_obitype == OBI_VOID :
value_obitype = get_obitype_single_value(v)
else :
value_obitype = update_obitype(value_obitype, v)
return value_obitype
cdef obitype_t get_obitype(object value) :
if type(value) == dict or type(value) == list or type(value) == tuple :
return get_obitype_iterable_value(value)
t = type(value)
if t == dict or t == list or t == tuple :
return get_obitype_iterable_value(value, t)
else :
return get_obitype_single_value(value)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment