Commit 23717f58 by Eric Coissac

Add a first version of the obisubset tool and tag the version 1.2.0

parent fdeaf595
......@@ -3,13 +3,12 @@ Sequence sampling and filtering
.. toctree::
:maxdepth: 2
scripts/obiextract
scripts/obigrep
scripts/obihead
scripts/obisample
scripts/obiselect
scripts/obisplit
scripts/obisubset
scripts/obitail
\ No newline at end of file
.. automodule:: obisubset
:py:mod:`obisubset` specific options
------------------------------------
.. cmdoption:: -s <TAGNAME>, --sample=<TAGNAME>,
The ``-s`` option specifies the tag containing the sample descriptions;
the default value is *merged_sample*.
*Example:*
.. code-block:: bash
> obiuniq -m sample seq1.fasta > seq2.fasta
> obisubset -s merged_sample -n sample1 seq2.fasta > seq3.fasta
After the dereplication of the sequences using the :py:mod:`obiuniq` command
with the ``-m sample`` option, the sample descriptions are stored
in the new attribute ``merged_sample``.
.. cmdoption:: -o <TAGNAME>, --other-tag=<TAGNAME>,
Another tag to clean according to the sample subset
*Example:*
.. code-block:: bash
> obisubset -s merged_sample -o obiclean_status -n sample1 seq2.fasta > seq3.fasta
.. cmdoption:: -l <FILENAME>, --sample-list=<FILENAME>,
File containing the samples names (one sample id per line).
*Example:*
.. code-block:: bash
> obisubset -s merged_sample -o obiclean_status -l ids.txt seq2.fasta > seq3.fasta
.. cmdoption:: -p <REGEX>, --sample-pattern=<REGEX>,
A regular expression pattern matching the sample ids to extract.
*Example:*
.. code-block:: bash
> obisubset -s merged_sample -o obiclean_status -p "negative_.*" seq2.fasta > seq3.fasta
.. cmdoption:: -n <SAMPLEIDS>, --sample-name=<SAMPLEIDS>,
A sample id to extract
*Example:*
.. code-block:: bash
> obisubset -s merged_sample -o obiclean_status -n sample1 seq2.fasta > seq3.fasta
.. include:: ../optionsSet/inputformat.txt
.. include:: ../optionsSet/outputformat.txt
.. include:: ../optionsSet/defaultoptions.txt
:py:mod:`obisubset` modifies sequence attributes
------------------------------------------------
.. hlist::
:columns: 3
- :doc:`count <../attributes/count>`
- :doc:`merged_* <../attributes/merged_star>`
:py:mod:`obisubset` used sequence attribute
-------------------------------------------
- :doc:`count <../attributes/count>`
- :doc:`merged_* <../attributes/merged_star>`
......@@ -19,7 +19,7 @@ from os import path
# Package metadata constants.
# NOTE(review): presumably consumed by a setup() call that is not visible here.
PACKAGE = "OBITools"
# NOTE(review): VERSION is assigned twice in this view; this looks like the
# removed and added lines of the diff. As written, the second assignment wins.
VERSION = "1.1.22"
VERSION = "1.2.0"
AUTHOR = 'Eric Coissac'
EMAIL = 'eric@coissac.eu'
URL = 'metabarcoding.org/obitools'
......
#!/usr/local/bin/python
'''
:py:mod:`obisubset`: extract a subset of samples
================================================
.. codeauthor:: Eric Coissac <eric.coissac@metabarcoding.org>
The :py:mod:`obisubset` command extracts a subset of samples from a sequence file
after its dereplication using :py:mod:`obiuniq` program.
'''
from obitools.format.options import addInOutputOption, sequenceWriterGenerator
from obitools.options import getOptionManager
import re
def addSubsetOptions(optionManager):
    '''
    Register the command line options specific to obisubset.

    The options are added to a dedicated option group named
    'obisubset specific options' created on *optionManager*.

    :param optionManager: option parser exposing an ``optparse``-style
                          ``add_option_group`` method.
    '''
    group = optionManager.add_option_group('obisubset specific options')

    # Tag holding the per-sample description of each sequence.
    # The type is spelled "string" like every other option of this group.
    group.add_option('-s', '--sample',
                     action="store", dest="sample",
                     metavar="<TAGNAME>",
                     type="string",
                     default='merged_sample',
                     help="Tag containing sample descriptions, the default value is set to *merged_sample*")

    # Repeatable: every occurrence appends one more tag name to ``taglist``.
    group.add_option('-o', '--other-tag',
                     action="append", dest="taglist",
                     metavar="<TAGNAME>",
                     type="string",
                     default=[],
                     help="Another tag to clean according to the sample subset")

    group.add_option('-l', '--sample-list',
                     action="store", dest="samplelist",
                     metavar="<FILENAME>",
                     type="string",
                     default=None,
                     help="File containing the samples names (one sample id per line)")

    group.add_option('-p', '--sample-pattern',
                     action="store", dest="samplepattern",
                     metavar="<REGEX>",
                     type="string",
                     default=None,
                     help="A regular expression pattern matching the sample ids to extract")

    # Repeatable: every occurrence appends one more sample id to ``samplename``.
    group.add_option('-n', '--sample-name',
                     action="append", dest="samplename",
                     metavar="<SAMPLEIDS>",
                     type="string",
                     default=[],
                     help="A sample id to extract")
def sequenceSelectorGenerator(options):
    '''
    Build the generator function restricting entries to a subset of samples.

    The sample ids to keep are the union of:

      - the ids listed in ``options.samplename``,
      - the non-empty lines of the file ``options.samplelist`` (when not None),
      - the ids of each entry matched by ``options.samplepattern``
        (when not None).

    :param options: object providing the ``sample``, ``taglist``,
                    ``samplelist``, ``samplepattern`` and ``samplename``
                    attributes.
    :return: a generator function taking an iterable of entries and yielding,
             modified in place, the entries having at least one kept sample.
    :raise IOError: if the sample list file cannot be opened.
    '''
    samplename = set(options.samplename)
    othertags = set(options.taglist)

    if options.samplelist is not None:
        with open(options.samplelist) as lname:
            for line in lname:
                name = line.strip()
                # A blank line is not a sample id: skip it instead of
                # registering the empty string as a sample to keep.
                if name:
                    samplename.add(name)

    if options.samplepattern is not None:
        samplepattern = re.compile(options.samplepattern)
    else:
        samplepattern = None

    def sequenceSelector(entries):
        '''
        Yield the entries present in at least one of the selected samples.

        For each yielded entry the ``count`` attribute, the sample tag and
        every other requested tag are reduced to the kept samples.
        '''
        for entry in entries:
            samples = entry[options.sample]
            present = set(samples.keys())
            tokeep = present & samplename

            if samplepattern is not None:
                # ``match`` anchors the pattern at the start of the sample id.
                tokeep.update(name for name in present
                              if samplepattern.match(name))

            # Entries with no kept sample are dropped from the output.
            # NOTE(review): the nesting of the yield was reconstructed from
            # an unindented listing - confirm against the repository.
            if not tokeep:
                continue

            # The new count is the sum of the values stored for kept samples.
            entry['count'] = sum(samples[name] for name in tokeep)
            entry[options.sample] = {name: samples[name] for name in tokeep}

            for tag in othertags:
                if tag in entry:
                    values = entry[tag]
                    entry[tag] = {name: values[name]
                                  for name in tokeep if name in values}

            yield entry

    return sequenceSelector
if __name__ == '__main__':
    # Build the command line parser from the generic input/output options
    # and the obisubset specific ones; the module docstring is passed as
    # the program documentation.
    parser = getOptionManager([addInOutputOption, addSubsetOptions],
                              progdoc=__doc__)

    # Calling the parser returns the parsed options and the input entries.
    (options, entries) = parser()

    # The writer is created before the selector, as in the original order.
    write = sequenceWriterGenerator(options)
    select = sequenceSelectorGenerator(options)

    # Write every entry yielded by the selector.
    for sequence in select(entries):
        write(sequence)
# Components of the package version string.
major = 1
# NOTE(review): minor and serial are each assigned twice in this view; this
# looks like the removed and added lines of the diff. As written, the later
# assignments (minor = 2, serial = '0') override the earlier ones.
minor = 1
serial= '22'
minor = 2
serial= '0'
# "%2d" right-aligns major in a 2-character field (space padded) and "%02d"
# zero-pads minor to 2 digits, so the values above yield " 1.02 0".
version = "%2d.%02d %s" % (major,minor,serial)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment