Commit 3b520e05 by Celine Mercier

doc added in obiaddtaxids.py

parent 6d4cda0e
#!/usr/local/bin/python
'''
Created on 5 oct. 2012
:py:mod:`obiaddtaxids` : Adding taxids to sequence records using an ecopcr database
=================================================================================
.. codeauthor:: Celine Mercier <celine.mercier@metabarcoding.org>
The :py:mod:`obiaddtaxids` command takes a sequence file in either fasta, SILVA or UNITE format and an ecopcr database as inputs.
If the sequence file is in fasta format, the user should specify where to find the taxon name associated with the sequence using
the ``-T`` option.
For each sequence record, :py:mod:`obiaddtaxids` will try to match its taxon name with one from the ecopcr database,
and will print it with the associated taxid if a match is found.
:py:mod:`obiaddtaxids` can associate a sequence record with a taxon from the ecopcr database in three different ways :
- If the taxon name matches exactly one in the ecopcr database, the sequence record is printed
with a new attribute having the key ``taxid``, and the taxid associated with the matching taxon as its value.
- If the ``-g`` option is set, the taxon name is composed of two words, and the first one matches a taxon name
from the ecopcr database, the sequence record is printed in the file specified by the ``-g`` option.
- If the ``-s`` option is set and neither the exact taxon name, nor its first word if the ``-g`` option was set, matched
a taxon from the ecopcr database, each word of the taxon name is searched for. The sequences identified this way
are written in the file set by the ``-s`` option.
If the ``-u`` option is set and a sequence was printed neither in the output, nor in the ``-g`` file, nor in the ``-s`` file,
it is printed in the file set by the ``-u`` option.
@author: celine mercier
'''
import sys
from obitools.fasta import fastaIterator,formatFasta
......@@ -93,18 +120,18 @@ def SILVAIterator(f, tax):
yield s
def PierreFileIterator(f) :
    '''
    Iterate over the '>'-delimited entries of `f` and yield one
    NucSequence per entry.

    The header (first line of the entry) is split on '|': the first field,
    minus its leading character, becomes the sequence identifier, and the
    last field, with every '+' replaced by a space, is stored under the
    ``species_name`` key. Only the line directly following the header is
    used as the sequence string.

    Each species name is also echoed on standard error.
    '''
    iterate_entries = genericEntryIteratorGenerator(startEntry='>')

    for record in iterate_entries(f) :
        lines = record.split('\n')
        header_fields = lines[0].split('|')

        # Drop the leading character of the first header field to get the id.
        sequence = NucSequence(header_fields[0][1:], lines[1])
        sequence['species_name'] = header_fields[-1].replace('+',' ')

        print>>sys.stderr, sequence['species_name']
        yield sequence
#def PierreFileIterator(f) :
# fastaEntryIterator = genericEntryIteratorGenerator(startEntry='>')
# for entry in fastaEntryIterator(f) :
# all = entry.split('\n')
# header = all[0]
# fields = header.split('|')
# id = fields[0][1:]
# seq = all[1]
# s = NucSequence(id, seq)
# s['species_name'] = fields[-1].replace('+',' ')
# print>>sys.stderr, s['species_name']
# yield s
def lookForSimilarSpeciesNameInGenus(species_name, species_list):
......@@ -141,7 +168,6 @@ def lookForSimilarNamesInTaxonomy(name, tax, ancestor):
def dirtyLookForSimilarNames(name, tax, ancestor):
taxonomy = tax._taxonomy
similar_name = ''
taxid = None
......@@ -159,19 +185,19 @@ def dirtyLookForSimilarNames(name, tax, ancestor):
return similar_name, taxid
def getAllSpeciesFromGenus(raw_list):
    '''
    Build a comma-separated, human-readable description of the records
    found in `raw_list`.

    The first element of `raw_list` is skipped (presumably the genus
    record itself -- confirm against the caller). Every following record
    contributes the text ``<record[3]> with taxid <record[0]>``.

    @param raw_list: sequence of indexable records; index 0 is rendered as
                     the taxid and index 3 as the name.
    @return: the descriptions joined by ", ", or the string "None" when
             no record follows the first element.
    '''
    # The loop variable was previously named `species` while the body read
    # `i`, which raised a NameError as soon as a record had to be rendered.
    descriptions = [species[3] + " with taxid " + str(species[0])
                    for species in raw_list[1:]]

    if not descriptions :
        return "None"

    return ", ".join(descriptions)
#def getAllSpeciesFromGenus(raw_list):
#
# speciesList = ""
#
# for species in raw_list[1:] :
# speciesList = speciesList + i[3] + " with taxid " + str(i[0]) + ", "
#
# if speciesList == "" :
# speciesList = "None"
# else :
# speciesList = speciesList[:-2]
#
# return speciesList
def getGenusTaxid(tax, species_name, ancestor):
......@@ -322,14 +348,14 @@ if __name__=='__main__':
entryIterator = SILVAIterator
entries = entryIterator(entries, tax)
options.tagname = 'species_name'
elif options.db_type == 'Pierre' :
entryIterator = PierreFileIterator
entries = entryIterator(entries)
# elif options.db_type == 'Pierre' :
# entryIterator = PierreFileIterator
# entries = entryIterator(entries)
#entries = entryIterator(entries)
openFiles(options)
if (options.db_type == 'raw') or (options.db_type == 'SILVA') or (options.db_type == 'Pierre') :
if (options.db_type == 'raw') or (options.db_type == 'SILVA') :
if options.res_anc == '' :
restricting_ancestor = None
......@@ -339,7 +365,7 @@ if __name__=='__main__':
for s in entries:
if options.res_anc != '' and not options.res_anc.isdigit():
restricting_ancestor = int(seq[options.res_anc])
restricting_ancestor = int(s[options.res_anc])
species_name = get_species_name(s, options)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment