#cython: language_level=3

import sys
import os

from obitools3.apps.progress cimport ProgressBar  # @UnresolvedImport
from obitools3.dms.view.view cimport View
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
from obitools3.dms.column.column cimport Column
from obitools3.dms.obiseq cimport Nuc_Seq
from obitools3.dms import DMS
from obitools3.dms.taxo.taxo cimport Taxonomy


from obitools3.utils cimport tobytes, \
                             get_obitype, \
                             update_obitype

from obitools3.dms.capi.obitypes cimport obitype_t, \
                                         OBI_VOID, \
                                         OBI_QUAL

from obitools3.dms.capi.obierrno cimport obi_errno

from obitools3.apps.optiongroups import addImportInputOption, \
                                        addTabularInputOption, \
                                        addTaxdumpInputOption, \
                                        addMinimalOutputOption

from obitools3.uri.decode import open_uri

from obitools3.apps.config import logger

# One-line description of this command.
__title__ = "Imports sequences from different formats into a DMS"


# Default values for the configuration keys of this command.
default_config = {
    'destview': None,
    'skip': 0,
    'only': None,
    'skiperror': False,
    'seqinformat': None,
    'moltype': 'nuc',
    'source': None,
}

def addOptions(parser):
    """
    Register the option groups used by the import command on `parser`.

    The parser is handed, in order, to the import-input, tabular-input,
    taxdump-input and minimal-output option-group helpers imported from
    obitools3.apps.optiongroups.

    @param parser: the option parser to populate
                   (presumably an argparse parser -- confirm against the caller)
    """

    addImportInputOption(parser)
    addTabularInputOption(parser)
    addTaxdumpInputOption(parser)
    addMinimalOutputOption(parser)
52

53

54
def run(config):
    """
    Import the object designated by config['obi']['inputURI'] into the DMS
    designated by config['obi']['outputURI'].

    Two modes, selected by config['obi']['taxdump']:

      - taxdump: the input URI is opened with Taxonomy.open_taxdump(), written
        under the output name, the command line is recorded in the DMS, the
        DMS is closed and the function returns.

      - otherwise: every entry yielded by the input is written as one line of
        the output view. For a View_NUC_SEQS output the ID, DEFINITION and
        NUC_SEQ columns are filled from the entry attributes, and a QUALITY
        column is created if the *first* entry carries a b"QUALITY" key. All
        the other tags get a column created the first time they are seen,
        typed with get_obitype(); a column is rewritten with new attributes
        when assigning a later value raises IndexError and the value needs a
        wider type, more elements per line or new element names.

    The tuples returned by open_uri() are used positionally here:
    input[0] (closed at the end), input[1] (iterable of entries),
    input[2] (entry class), input[4] (entry count);
    output[0] (DMS), output[1] (view), output[2] (view class).

    @param config: the configuration mapping of the command; the keys read
                   here are config['obi']['taxdump'], ['inputURI'] and
                   ['outputURI'] (config is also passed to ProgressBar and
                   view.write_config)

    @raise Exception: if the input URI cannot be opened or the output view
                      cannot be created (open_uri() returned None)
    @raise NotImplementedError: if the output object is not a View
    """

    cdef   tuple       input
    cdef   tuple       output
    cdef   int         i
    cdef   type        value_type
    cdef   obitype_t   value_obitype
    cdef   obitype_t   old_type
    cdef   obitype_t   new_type
    cdef   bint        get_quality
    cdef   bint        NUC_SEQS_view
    cdef   int         nb_elts
    cdef   View        view
    cdef   object      entries
    cdef   object      entry
    cdef   Column      id_col
    cdef   Column      def_col
    cdef   Column      seq_col
    cdef   Column      qual_col
    cdef   Column      old_column
    cdef   bint        rewrite
    cdef   dict        dcols
    cdef   bytes       tag
    cdef   object      value
    cdef   list        elt_names
    cdef   int         old_nb_elements_per_line
    cdef   int         new_nb_elements_per_line
    cdef   list        old_elements_names
    cdef   list        new_elements_names
    cdef   ProgressBar pb
    global             obi_errno

    DMS.obi_atexit()

    logger("info", "obi import: imports an object (file(s), obiview, taxonomy...) into a DMS")

    entry_count = -1

    if not config['obi']['taxdump']:
        input = open_uri(config['obi']['inputURI'])
        if input is None:  # TODO check for bytes instead now?
            raise Exception("Could not open input URI")

        entry_count = input[4]
        logger("info", "Importing %d entries", entry_count)

        # TODO a bit dirty
        # The class of the input entries decides the class of the view to create
        if input[2]==Nuc_Seq:
            v = View_NUC_SEQS
        else:
            v = View
    else:
        v = None

    output = open_uri(config['obi']['outputURI'],
                      input=False,
                      newviewtype=v)
    if output is None:
        raise Exception("Could not create output view")

    # Read taxdump
    if config['obi']['taxdump']:  # The input is a taxdump to import in a DMS
        taxo = Taxonomy.open_taxdump(output[0], config['obi']['inputURI'])
        taxo.write(output[1])
        taxo.close()
        output[0].record_command_line(" ".join(sys.argv[1:]))
        output[0].close()
        return

    pb = ProgressBar(entry_count, config, seconde=5)

    entries = input[1]

    NUC_SEQS_view = False
    if isinstance(output[1], View) :
        view = output[1]
        if output[2] == View_NUC_SEQS :
            NUC_SEQS_view = True
    else:
        raise NotImplementedError()

    # Save basic columns in variables for optimization
    if NUC_SEQS_view :
        id_col = view[b"ID"]            # TODO use macros or globals for column names
        def_col = view[b"DEFINITION"]
        seq_col = view[b"NUC_SEQ"]

    # Cache of the columns created for the other tags:
    # tag -> (Column, obitype of the column)
    dcols = {}

    # Explicitly initialised so that the flag never holds an undefined value,
    # even if the first-entry check below is not reached
    get_quality = False

    i = 0
    for entry in entries :

        pb(i)

        if NUC_SEQS_view:
            id_col[i] = entry.id
            def_col[i] = entry.definition
            seq_col[i] = entry.seq
            # Check if there is a sequencing quality associated by checking the first entry    # TODO haven't found a more robust solution yet
            if i == 0:
                get_quality = b"QUALITY" in entry
                if get_quality:
                    Column.new_column(view, b"QUALITY", OBI_QUAL)
                    qual_col = view[b"QUALITY"]
            if get_quality:
                qual_col[i] = entry.quality

        for tag in entry :

            if tag != b"ID" and tag != b"DEFINITION" and tag != b"NUC_SEQ" and tag != b"QUALITY" :  # TODO hmmm...

                value = entry[tag]
                # Lowercase taxid tags are stored in the TAXID column
                if tag == b"taxid":
                    tag = b"TAXID"

                if tag not in dcols :

                    # First time this tag is seen: create its column from the
                    # type and the size of this first value
                    value_type = type(value)
                    nb_elts = 1
                    value_obitype = OBI_VOID

                    if value_type == dict or value_type == list :
                        nb_elts = len(value)
                        elt_names = list(value)
                    else :
                        nb_elts = 1
                        elt_names = None

                    value_obitype = get_obitype(value)

                    if value_obitype != OBI_VOID :
                        dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)

                        # Fill value
                        dcols[tag][0][i] = value

                    # TODO else log error?

                else :

                    rewrite = False

                    # Check type adequation
                    old_type = dcols[tag][1]
                    new_type = OBI_VOID
                    new_type = update_obitype(old_type, value)
                    if old_type != new_type :
                        rewrite = True

                    try:
                        # Fill value
                        dcols[tag][0][i] = value

                    except IndexError :

                        # The value could not be stored in the column as it is:
                        # work out whether the column must be rewritten with
                        # more elements per line or new element names
                        value_type = type(value)
                        old_column = dcols[tag][0]
                        old_nb_elements_per_line = old_column.nb_elements_per_line
                        new_nb_elements_per_line = 0
                        old_elements_names = old_column.elements_names
                        new_elements_names = None

                        #####################################################################

                        # Check the length and keys of column lines if needed
                        if value_type == dict :    # Check dictionary keys
                            for k in value :
                                if k not in old_elements_names :
                                    new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value]))
                                    rewrite = True
                                    break

                        elif value_type == list or value_type == tuple :  # Check vector length
                            if old_nb_elements_per_line < len(value) :
                                new_nb_elements_per_line = len(value)
                                rewrite = True

                        #####################################################################

                        # NOTE(review): when rewrite is still False here, the
                        # value that raised IndexError is not stored -- confirm
                        # that this best-effort behaviour is intended
                        if rewrite :
                            if new_nb_elements_per_line == 0 and new_elements_names is not None :
                                new_nb_elements_per_line = len(new_elements_names)

                            # Reset obierrno
                            obi_errno = 0

                            # Record new_type (the type the column is rewritten
                            # with) in the cache, not value_obitype, which only
                            # reflects the last column *created* and may belong
                            # to another tag
                            dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name,
                                                                                   new_data_type=new_type,
                                                                                   new_nb_elements_per_line=new_nb_elements_per_line,
                                                                                   new_elements_names=new_elements_names,
                                                                                   rewrite_last_line=False),
                                          new_type)

                            # Update the dictionary:
                            # every cached column is fetched again from the view
                            # after the rewrite
                            for t in dcols :
                                dcols[t] = (view[t], dcols[t][1])

                            # Fill value
                            dcols[tag][0][i] = value

        i+=1

    pb(i, force=True)
    print("", file=sys.stderr)

    # Save command config in View and DMS comments
    command_line = " ".join(sys.argv[1:])
    view.write_config(config, "import", command_line, input_str=[os.path.abspath(config['obi']['inputURI'])])
    output[0].record_command_line(command_line)

    #print("\n\nOutput view:\n````````````", file=sys.stderr)
    #print(repr(view), file=sys.stderr)

    # The first element of the input/output tuples may not have a close()
    # method; in that case there is nothing to close
    try:
        input[0].close()
    except AttributeError:
        pass
    try:
        output[0].close()
    except AttributeError:
        pass

    logger("info", "Done.")