Commit 99c1cd60 by mercierc

export: now exports header for tabular files by default and added option

to only export specific columns
parent ce7ae4ac
......@@ -137,10 +137,10 @@ def __addImportInputOption(optionManager):
def __addTabularOption(optionManager):
group = optionManager.add_argument_group("Input and output format options for tabular files")
group.add_argument('--header',
action="store_true", dest="obi:header",
default=False,
help="First line of tabular file contains column names")
group.add_argument('--no-header',
action="store_false", dest="obi:header",
default=True,
help="Don't print the header (first line with column names")
group.add_argument('--sep',
action="store", dest="obi:sep",
......@@ -297,6 +297,12 @@ def __addExportOutputOption(optionManager):
const=b'tabular',
help="Output file is in tabular format")
group.add_argument('--only-keys',
action="append", dest="obi:only_keys",
type=str,
default=[],
help="Only export the given keys (columns).")
group.add_argument('--print-na',
action="store_true", dest="obi:printna",
default=False,
......
......@@ -4,6 +4,6 @@ cdef class TabFormat:
cdef bint header
cdef bint first_line
cdef bytes NAString
cdef list tags
cdef set tags
cdef bytes sep
cdef bint NAIntTo0
\ No newline at end of file
......@@ -10,7 +10,8 @@ import sys
cdef class TabFormat:
def __init__(self, header=True, bytes NAString=b"NA", bytes sep=b"\t", bint NAIntTo0=True):
def __init__(self, list tags=[], header=True, bytes NAString=b"NA", bytes sep=b"\t", bint NAIntTo0=True):
self.tags = set(tags)
self.header = header
self.first_line = True
self.NAString = NAString
......@@ -20,46 +21,53 @@ cdef class TabFormat:
@cython.boundscheck(False)
def __call__(self, object data):
line = []
cdef set ktags
cdef list tags = [key for key in data]
if self.first_line:
self.tags = [k for k in data.keys()]
line = []
if self.tags is not None and self.tags:
ktags = self.tags
else:
ktags = set(tags)
if self.header and self.first_line:
for k in self.tags:
if isinstance(data.view[k], Column_multi_elts):
keys = data.view[k].keys()
keys.sort()
for k2 in keys:
line.append(tobytes(k)+b':'+tobytes(k2))
else:
line.append(tobytes(k))
for k in ktags:
if k in tags:
if isinstance(data.view[k], Column_multi_elts):
keys = data.view[k].keys()
keys.sort()
for k2 in keys:
line.append(tobytes(k)+b':'+tobytes(k2))
else:
line.append(tobytes(k))
r = self.sep.join(value for value in line)
r += b'\n'
line = []
for k in self.tags:
value = data[k]
if isinstance(data.view[k], Column_multi_elts):
keys = data.view[k].keys()
keys.sort()
if value is None: # all keys at None
for k2 in keys: # TODO could be much more efficient
line.append(self.NAString)
else:
for k2 in keys: # TODO could be much more efficient
if value[k2] is not None:
line.append(str2bytes(str(bytes2str_object(value[k2])))) # genius programming
else:
if self.NAIntTo0 and isinstance(data.view[k], Column_multi_elts_int):
line.append(b"0")
for k in ktags:
if k in tags:
value = data[k]
if isinstance(data.view[k], Column_multi_elts):
keys = data.view[k].keys()
keys.sort()
if value is None: # all keys at None
for k2 in keys: # TODO could be much more efficient
line.append(self.NAString)
else:
for k2 in keys: # TODO could be much more efficient
if value[k2] is not None:
line.append(str2bytes(str(bytes2str_object(value[k2])))) # genius programming
else:
line.append(self.NAString)
else:
if value is not None or (self.NAIntTo0 and isinstance(data.view[k], Column_int)):
line.append(str2bytes(str(bytes2str_object(value))))
if self.NAIntTo0 and isinstance(data.view[k], Column_multi_elts_int):
line.append(b"0")
else:
line.append(self.NAString)
else:
line.append(self.NAString)
if value is not None or (self.NAIntTo0 and isinstance(data.view[k], Column_int)):
line.append(str2bytes(str(bytes2str_object(value))))
else:
line.append(self.NAString)
if self.header and self.first_line:
r += self.sep.join(value for value in line)
......
......@@ -476,6 +476,18 @@ def open_uri(uri,
except KeyError:
commentchar=b'#'
if b"only_keys" in qualifiers:
only_keys=qualifiers[b"only_keys"][0] # not sure that works but no one ever uses qualifiers
else:
try:
only_keys_str=config["obi"]["only_keys"]
only_keys=[]
for key in only_keys_str:
only_keys.append(tobytes(key))
except KeyError:
only_keys=[]
if format is not None:
if seqtype==b"nuc":
objclass = Nuc_Seq # Nuc_Seq_Stored? TODO
......@@ -486,7 +498,7 @@ def open_uri(uri,
only=only,
nastring=nastring)
else:
iseq = FastaNucWriter(FastaFormat(printNAKeys=printna, NAString=nastring),
iseq = FastaNucWriter(FastaFormat(tags=only_keys, printNAKeys=printna, NAString=nastring),
file,
skip=skip,
only=only)
......@@ -499,7 +511,7 @@ def open_uri(uri,
noquality=noquality,
nastring=nastring)
else:
iseq = FastqWriter(FastqFormat(printNAKeys=printna, NAString=nastring),
iseq = FastqWriter(FastqFormat(tags=only_keys, printNAKeys=printna, NAString=nastring),
file,
skip=skip,
only=only)
......@@ -535,7 +547,7 @@ def open_uri(uri,
skip = skip,
only = only)
else:
iseq = TabWriter(TabFormat(header=header, NAString=nastring, sep=sep, NAIntTo0=na_int_to_0),
iseq = TabWriter(TabFormat(tags=only_keys, header=header, NAString=nastring, sep=sep, NAIntTo0=na_int_to_0),
file,
skip=skip,
only=only,
......@@ -571,7 +583,7 @@ def open_uri(uri,
commentchar)
else: # default export is in fasta? or tab? TODO
objclass = Nuc_Seq # Nuc_Seq_Stored? TODO
iseq = FastaNucWriter(FastaFormat(printNAKeys=printna, NAString=nastring),
iseq = FastaNucWriter(FastaFormat(tags=only_keys, printNAKeys=printna, NAString=nastring),
file,
skip=skip,
only=only)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment