Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
O
OBITools3
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
23
Issues
23
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
Operations
Operations
Incidents
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
OBITools
OBITools3
Commits
d88390c6
Commit
d88390c6
authored
Mar 13, 2019
by
Celine Mercier
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Cython API: when importing a file in a DMS, its length is computed
beforehand for the progress bar
parent
50e7cd61
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
97 additions
and
48 deletions
+97
-48
python/obitools3/commands/import.pyx
python/obitools3/commands/import.pyx
+16
-7
python/obitools3/parsers/universal.pyx
python/obitools3/parsers/universal.pyx
+26
-21
python/obitools3/uri/decode.pyx
python/obitools3/uri/decode.pyx
+25
-20
python/obitools3/utils.pxd
python/obitools3/utils.pxd
+2
-0
python/obitools3/utils.pyx
python/obitools3/utils.pyx
+28
-0
No files found.
python/obitools3/commands/import.pyx
View file @
d88390c6
...
...
@@ -87,14 +87,19 @@ def run(config):
DMS
.
obi_atexit
()
logger
(
"info"
,
"obi import : imports file into a DMS"
)
logger
(
"info"
,
"obi import: imports an object (file(s), obiview, taxonomy...) into a DMS"
)
entry_count
=
-
1
if
not
config
[
'obi'
][
'taxdump'
]:
input
=
open_uri
(
config
[
'obi'
][
'inputURI'
])
if
input
is
None
:
# TODO check for bytes instead now?
raise
Exception
(
"Could not open input URI"
)
# TODO uuuuh
entry_count
=
input
[
4
]
logger
(
"info"
,
"Importing %d entries"
,
entry_count
)
# TODO a bit dirty
if
input
[
2
]
==
Nuc_Seq
:
v
=
View_NUC_SEQS
else
:
...
...
@@ -117,7 +122,7 @@ def run(config):
output
[
0
].
close
()
return
pb
=
ProgressBar
(
10000000
,
config
,
seconde
=
5
)
# TODO should be number of records in file
pb
=
ProgressBar
(
entry_count
,
config
,
seconde
=
5
)
entries
=
input
[
1
]
...
...
@@ -250,14 +255,17 @@ def run(config):
i
+=
1
pb
(
i
,
force
=
True
)
print
(
""
,
file
=
sys
.
stderr
)
# Save command config in View and DMS comments
command_line
=
" "
.
join
(
sys
.
argv
[
1
:])
view
.
write_config
(
config
,
"import"
,
command_line
,
input_str
=
[
os
.
path
.
abspath
(
config
[
'obi'
][
'inputURI'
])])
output
[
0
].
record_command_line
(
command_line
)
print
(
"
\n
"
)
print
(
view
.
__repr__
()
)
#print("\n\nOutput view:\n````````````", file=sys.stderr
)
#print(repr(view), file=sys.stderr
)
try
:
input
[
0
].
close
()
except
AttributeError
:
...
...
@@ -267,3 +275,4 @@ def run(config):
except
AttributeError
:
pass
logger
(
"info"
,
"Done."
)
python/obitools3/parsers/universal.pyx
View file @
d88390c6
...
...
@@ -23,20 +23,20 @@ def is_ngsfilter_line(line): # TODO doesn't work?
return
False
def
entryIteratorFactory
(
lineiterator
,
int
skip
=
0
,
only
=
None
,
bytes
seqtype
=
b
'nuc'
,
int
offset
=-
1
,
bint
noquality
=
False
,
bint
skiperror
=
True
,
bint
header
=
False
,
bytes
sep
=
None
,
bytes
dec
=
b
'.'
,
bytes
nastring
=
b
"NA"
,
bint
stripwhite
=
True
,
bint
blanklineskip
=
True
,
bytes
commentchar
=
b
"#"
,
int
buffersize
=
100000000
):
int
skip
=
0
,
only
=
None
,
bytes
seqtype
=
b
'nuc'
,
int
offset
=-
1
,
bint
noquality
=
False
,
bint
skiperror
=
True
,
bint
header
=
False
,
bytes
sep
=
None
,
bytes
dec
=
b
'.'
,
bytes
nastring
=
b
"NA"
,
bint
stripwhite
=
True
,
bint
blanklineskip
=
True
,
bytes
commentchar
=
b
"#"
,
int
buffersize
=
100000000
):
if
isinstance
(
lineiterator
,
(
str
,
bytes
)):
lineiterator
=
uopen
(
lineiterator
)
...
...
@@ -65,7 +65,7 @@ def entryIteratorFactory(lineiterator,
format
=
b
"embl"
elif
first
[
0
:
6
]
==
b
'LOCUS '
:
format
=
b
"genbank"
elif
first
[
0
:
11
]
==
b
'#@ecopcr-v2'
:
# TODO v2????
elif
first
[
0
:
8
]
==
b
'#@ecopcr'
:
format
=
b
"ecopcrfile"
elif
is_ngsfilter_line
(
first
):
format
=
b
"ngsfilter"
...
...
@@ -83,7 +83,8 @@ def entryIteratorFactory(lineiterator,
firstline
=
first
,
buffersize
=
buffersize
,
nastring
=
nastring
),
Nuc_Seq
)
Nuc_Seq
,
format
)
else
:
raise
NotImplementedError
()
elif
format
==
b
'fastq'
:
...
...
@@ -94,7 +95,8 @@ def entryIteratorFactory(lineiterator,
firstline
=
first
,
buffersize
=
buffersize
,
nastring
=
nastring
),
Nuc_Seq
)
Nuc_Seq
,
format
)
elif
format
==
b
'tabular'
:
return
(
tabIterator
(
lineiterator
,
header
=
header
,
...
...
@@ -108,7 +110,8 @@ def entryIteratorFactory(lineiterator,
only
=
only
,
firstline
=
first
,
buffersize
=
buffersize
),
dict
)
dict
,
format
)
elif
format
==
b
'ngsfilter'
:
return
(
ngsfilterIterator
(
lineiterator
,
sep
=
sep
,
...
...
@@ -121,7 +124,8 @@ def entryIteratorFactory(lineiterator,
only
=
only
,
firstline
=
first
,
buffersize
=
buffersize
),
dict
)
dict
,
format
)
elif
format
==
b
'embl'
:
return
(
emblIterator
(
lineiterator
,
...
...
@@ -129,7 +133,8 @@ def entryIteratorFactory(lineiterator,
only
=
only
,
firstline
=
first
,
buffersize
=
buffersize
),
dict
)
dict
,
format
)
raise
NotImplementedError
(
'File format
not yet implemented
'
)
raise
NotImplementedError
(
'File format
iterator not implemented yet
'
)
python/obitools3/uri/decode.pyx
View file @
d88390c6
...
...
@@ -20,7 +20,7 @@ from obitools3.format.fastq import FastqFormat
from
obitools3.dms.obiseq
import
Nuc_Seq
from
obitools3.apps.config
import
getConfiguration
,
logger
from
obitools3.apps.temp
import
get_temp_dms
from
obitools3.utils
cimport
tobytes
# TODO
because can't read options as bytes
from
obitools3.utils
cimport
tobytes
,
count_entries
# TODO tobytes
because can't read options as bytes
from
obitools3.dms.capi.obierrno
cimport
obi_errno
,
\
OBIVIEW_ALREADY_EXISTS_ERROR
...
...
@@ -159,6 +159,7 @@ Reads an URI and returns a tuple containing:
(2) The opened view or iterator on the opened file or writer
(3) The class of object returned or handled by (2)
(4) The original URI in bytes
(5) The number of entries (if input URI) or -1 if unavailable
'''
def
open_uri
(
uri
,
bint
input
=
True
,
...
...
@@ -209,7 +210,8 @@ def open_uri(uri,
return
(
dms
[
0
],
dms
[
1
],
type
(
dms
[
1
]),
urlunparse
(
urip
))
urlunparse
(
urip
),
len
(
dms
[
0
]))
try
:
resource
=
open_dms_element
(
dms
[
0
],
dms
[
1
],
...
...
@@ -230,7 +232,8 @@ def open_uri(uri,
return
(
resource
[
0
],
resource
[
1
],
type
(
resource
[
1
]),
urlunparse
(
urip
))
urlunparse
(
urip
),
len
(
resource
[
1
]))
except
Exception
as
e
:
global
obi_errno
if
obi_errno
==
OBIVIEW_ALREADY_EXISTS_ERROR
:
...
...
@@ -503,19 +506,19 @@ def open_uri(uri,
raise
NotImplementedError
(
'Output sequence file format not implemented'
)
else
:
if
input
:
iseq
,
objclass
=
entryIteratorFactory
(
file
,
skip
,
only
,
seqtype
,
offset
,
noquality
,
skiperror
,
header
,
sep
,
dec
,
nastring
,
stripwhite
,
blanklineskip
,
commentchar
)
iseq
,
objclass
,
format
=
entryIteratorFactory
(
file
,
skip
,
only
,
seqtype
,
offset
,
noquality
,
skiperror
,
header
,
sep
,
dec
,
nastring
,
stripwhite
,
blanklineskip
,
commentchar
)
else
:
# default export is in fasta? or tab? TODO
objclass
=
Nuc_Seq
# Nuc_Seq_Stored? TODO
iseq
=
FastaNucWriter
(
FastaFormat
(
printNAKeys
=
printna
,
NAString
=
nastring
),
...
...
@@ -524,7 +527,9 @@ def open_uri(uri,
only
=
only
)
#tmpdms = get_temp_dms()
return
(
file
,
iseq
,
objclass
,
urib
)
entry_count
=
-
1
if
input
:
entry_count
=
count_entries
(
file
,
format
)
return
(
file
,
iseq
,
objclass
,
urib
,
entry_count
)
python/obitools3/utils.pxd
View file @
d88390c6
...
...
@@ -2,6 +2,8 @@
from
obitools3.dms.capi.obitypes
cimport
obitype_t
,
index_t
cpdef
bytes
format_separator
(
bytes
format
)
cpdef
int
count_entries
(
file
,
bytes
format
)
cdef
obi_errno_to_exception
(
int
obi_errno
,
index_t
line_nb
=*
,
object
elt_id
=*
,
str
error_message
=*
)
...
...
python/obitools3/utils.pyx
View file @
d88390c6
...
...
@@ -16,6 +16,34 @@ from obitools3.dms.capi.obierrno cimport OBI_LINE_IDX_ERROR, \
#obi_errno
import
re
import
mmap
cpdef bytes format_separator(bytes format):
    '''
    Return the entry separator associated with a file format.

    The returned bytes object is a regular expression pattern (count_entries
    compiles it with re.compile), not a plain delimiter.

    @param format: the file format name, in bytes (e.g. b"fasta", b"fastq")
    @return: the separator pattern in bytes, or None if the format is unknown
    '''
    if format == b"fasta":
        return b"\n>"
    elif format == b"fastq":
        return b"\n@"
    elif format == b"ngsfilter" or format == b"tabular":
        # One entry per line
        return b"\n"
    elif format == b"genbank" or format == b"embl":
        return b"\n//"
    elif format == b"ecopcr" or format == b"ecopcrfile":
        # The universal parser names this format b"ecopcrfile"; both spellings
        # are accepted so that such files are not reported as unknown.
        # Pattern: a newline followed by any character other than '#'.
        return b"\n[^#]"
    else:
        return None
cpdef int count_entries(file, bytes format):
    '''
    Count the occurrences of the format's entry separator in an opened file.

    The file is memory-mapped read-only and scanned with the regular
    expression returned by format_separator(). The result is the number of
    non-overlapping matches of that pattern.

    @param file: an opened file object exposing fileno()
    @param format: the file format name, in bytes
    @return: the number of separator matches, or -1 if the format is unknown
             or if the file could not be mapped or scanned
    '''
    try:
        sep = format_separator(format)
        if sep is None:
            return -1
        pattern = re.compile(sep)
        count = 0
        # The context manager guarantees that the mapping is released, even if
        # the scan fails.
        with mmap.mmap(file.fileno(), 0, access=mmap.ACCESS_READ) as mmapped_file:
            # Iterate over the matches instead of materialising them all in a
            # list: only the count is needed.
            for _ in pattern.finditer(mmapped_file):
                count += 1
        return count
    except Exception:
        # Best effort: any failure (no file descriptor, empty file, ...) means
        # the count is unavailable. KeyboardInterrupt and SystemExit are
        # deliberately not caught here.
        return -1
# TODO RollbackException?
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment