Commit b9988181 by Celine Mercier

Closes #9: adds a parser in obiaddtaxids for the UNITE 'general FASTA release' format
parent df093b3f
......@@ -68,7 +68,6 @@ Otherwise,
'''
import sys
import re
from obitools.fasta import fastaIterator,formatFasta
......@@ -111,9 +110,9 @@ def addObiaddtaxidsOptions(optionManager):
metavar="<FORMAT>",
type="string",
default='raw',
help="type of the database with the taxa to be added. Possibilities : 'raw', 'UNITE' or 'SILVA'."
"The UNITE format must be the one used for the 'Full UNITE+INSD dataset'. Example :"
">UDB016651|k__Fungi;p__Basidiomycota;c__Agaricomycetes;o__Thelephorales;f__Thelephoraceae;g__Tomentella;s__Tomentella sp|SH200602.06FU"
help="type of the database with the taxa to be added. Possibilities : 'raw', 'UNITE_FULL', 'UNITE_GENERAL' or 'SILVA'."
"The UNITE_FULL format is the one used for the 'Full UNITE+INSD dataset', and the UNITE_GENERAL format is the "
"one used for the 'General FASTA release'."
" Default : raw.")
optionManager.add_option('-k','--key-name',
......@@ -145,7 +144,7 @@ def numberInStr(s) :
return containsNumber
def UNITEIterator(f):
def UNITEIterator_FULL(f):
fastaEntryIterator = genericEntryIteratorGenerator(startEntry='>')
for entry in fastaEntryIterator(f) :
......@@ -172,6 +171,28 @@ def UNITEIterator(f):
yield s
def UNITEIterator_GENERAL(f):
    '''
    Iterate over the entries of a UNITE 'General FASTA release' file.

    The header of each entry is split on '|'. The first field (without the
    leading '>') is used as the sequence identifier and the fifth field as
    the taxonomic path.

    @param f: the input handed to the entry iterator built by
              genericEntryIteratorGenerator(startEntry='>').

    @return: a generator yielding one NucSequence per entry, with two
             attributes set:
               - 'species_name': the identifier with '_' replaced by ' ';
               - 'path': the fifth header field with the '<letter>__'
                 prefixes removed and ';' turned into ','.

    @raise IndexError: if a header has fewer than five '|'-separated fields.
    '''
    fastaEntryIterator = genericEntryIteratorGenerator(startEntry='>')

    for entry in fastaEntryIterator(f):
        lines = entry.split('\n')
        fields = lines[0].split('|')

        seq_id = fields[0][1:]

        # Concatenate every line after the header: a sequence wrapped over
        # several lines must not be truncated to its first line.
        seq = ''.join(line.strip() for line in lines[1:])

        s = NucSequence(seq_id, seq)
        s['species_name'] = seq_id.replace("_", " ")

        # Strip the one-letter prefixes ('k__', 'p__', ...) from the path and
        # switch the separator from ';' to ','.
        path = re.sub(r'[a-z]__', '', fields[4])
        path = path.replace(';', ',')

        # Collapse doubled separators (presumably left by empty levels).
        s['path'] = path.replace(',,', ',')

        yield s
def SILVAIterator(f, tax):
fastaEntryIterator = genericEntryIteratorGenerator(startEntry='>')
......@@ -317,8 +338,10 @@ if __name__=='__main__':
if options.db_type == 'raw' :
entryIterator = fastaIterator
entries = entryIterator(entries)
elif options.db_type == 'UNITE' :
entryIterator = UNITEIterator
elif options.db_type == 'UNITE_FULL' :
entryIterator = UNITEIterator_FULL
elif options.db_type == 'UNITE_GENERAL' :
entryIterator = UNITEIterator_GENERAL
entries = entryIterator(entries)
elif options.db_type == 'SILVA' :
entryIterator = SILVAIterator
......@@ -353,6 +376,7 @@ if __name__=='__main__':
if options.genus_found is not None and len(species_name.split(' ')) >= 2 :
try:
genusTaxid = getGenusTaxid(tax, species_name, restricting_ancestor)
s['genus_taxid'] = genusTaxid
print>>options.genus_found, formatFasta(s)
genusFound = True
except KeyError :
......@@ -368,16 +392,17 @@ if __name__=='__main__':
print>>options.unidentified,formatFasta(s)
elif options.db_type == 'UNITE' :
elif ((options.db_type =='UNITE_FULL') or (options.db_type =='UNITE_GENERAL')) :
restricting_ancestor = tax.findTaxonByName('Fungi')[0][0]
for s in entries :
try :
species_name = s['species_name']
taxid = getTaxid(tax, species_name, restricting_ancestor)
s['taxid']=taxid
s['taxid'] = taxid
s['rank'] = tax.getRank(taxid)
print formatFasta(s)
......@@ -386,8 +411,7 @@ if __name__=='__main__':
genusFound = False
if options.genus_found is not None :
try:
genus_name = s['genus_name']
genusTaxid = getGenusTaxid(tax, genus_name, restricting_ancestor)
genusTaxid = getGenusTaxid(tax, species_name, restricting_ancestor)
s['genus_taxid'] = genusTaxid
print>>options.genus_found, formatFasta(s)
genusFound = True
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment