Commit df093b3f by Celine Mercier

updated the UNITE parser of obiaddtaxids

parent 9ba67985
...@@ -69,6 +69,7 @@ Otherwise, ...@@ -69,6 +69,7 @@ Otherwise,
import sys import sys
import re
from obitools.fasta import fastaIterator,formatFasta from obitools.fasta import fastaIterator,formatFasta
from obitools.options import getOptionManager from obitools.options import getOptionManager
...@@ -111,7 +112,9 @@ def addObiaddtaxidsOptions(optionManager): ...@@ -111,7 +112,9 @@ def addObiaddtaxidsOptions(optionManager):
type="string", type="string",
default='raw', default='raw',
help="type of the database with the taxa to be added. Possibilities : 'raw', 'UNITE' or 'SILVA'." help="type of the database with the taxa to be added. Possibilities : 'raw', 'UNITE' or 'SILVA'."
" Default : raw.") "The UNITE format must be the one used for the 'Full UNITE+INSD dataset'. Example :"
">UDB016651|k__Fungi;p__Basidiomycota;c__Agaricomycetes;o__Thelephorales;f__Thelephoraceae;g__Tomentella;s__Tomentella sp|SH200602.06FU"
" Default : raw.")
optionManager.add_option('-k','--key-name', optionManager.add_option('-k','--key-name',
action="store", dest="tagname", action="store", dest="tagname",
...@@ -148,16 +151,24 @@ def UNITEIterator(f): ...@@ -148,16 +151,24 @@ def UNITEIterator(f):
for entry in fastaEntryIterator(f) : for entry in fastaEntryIterator(f) :
all = entry.split('\n') all = entry.split('\n')
header = all[0] header = all[0]
fields = header.split('|') fields = header.split('|')
id = fields[0][1:] seq_id = fields[0][1:]
seq = all[1] seq = all[1]
s = NucSequence(id, seq) s = NucSequence(seq_id, seq)
s['ISDN_species_name'] = fields[1]
s['UNITE_species_name'] = fields[3] path = fields[1]
path1 = fields[2].replace(';', ',')
path2 = fields[4].replace(';', ',') species_name_loc = path.index('s__')
s['ISDN_path'] = path1 species_name_loc+=3
s['UNITE_path'] = path2 s['species_name'] = path[species_name_loc:]
genus_name_loc = path.index('g__')
genus_name_loc+=3
s['genus_name'] = path[genus_name_loc:species_name_loc-4]
path = re.sub('[a-z]__', '', path)
s['path'] = path.replace(';', ',')
yield s yield s
...@@ -358,53 +369,31 @@ if __name__=='__main__': ...@@ -358,53 +369,31 @@ if __name__=='__main__':
elif options.db_type == 'UNITE' : elif options.db_type == 'UNITE' :
restricting_ancestor = tax.findTaxonByName('Fungi')[0]
restricting_ancestor = tax.findTaxonByName('Fungi')[0][0]
for s in entries : for s in entries :
try: try :
species_name = s["UNITE_species_name"] species_name = s['species_name']
taxid = getTaxid(tax, species_name, restricting_ancestor) taxid = getTaxid(tax, species_name, restricting_ancestor)
s['taxid']=taxid s['taxid']=taxid
s["species_name"] = species_name
print formatFasta(s) print formatFasta(s)
except KeyError: except KeyError:
try:
species_name = s["ISDN_species_name"]
print species_name
taxid = getTaxid(tax, species_name, restricting_ancestor)
s['taxid']=taxid
s["species_name"] = species_name
print formatFasta(s)
except KeyError: genusFound = False
if options.genus_found is not None :
if s["UNITE_species_name"] != "-" and s["UNITE_species_name"] != "" : try:
s["species_name"] = s["UNITE_species_name"] genus_name = s['genus_name']
chosen = 'unite' genusTaxid = getGenusTaxid(tax, genus_name, restricting_ancestor)
s['genus_taxid'] = genusTaxid
elif s["ISDN_species_name"] != "-" and s["ISDN_species_name"] != "" : print>>options.genus_found, formatFasta(s)
s["species_name"] = s["ISDN_species_name"] genusFound = True
chosen = 'isdn'
else : except KeyError:
if s["UNITE_path"] != "-" and s["UNITE_path"] != "" : pass
chosen = 'unite'
s["species_name"] = (s["UNITE_path"].split(', '))[-1] if options.unidentified is not None and not genusFound :
print>>options.unidentified,formatFasta(s)
elif s["ISDN_path"] != "-" and s["ISDN_path"] != "" :
chosen = 'isdn'
s["species_name"] = (s["ISDN_path"].split(', '))[-1]
else :
print>>sys.stderr, "\n\nwarning : sequence without any identification at all\n\n"
if chosen == 'unite' :
s['path'] = s["UNITE_path"]
else :
s['path'] = s["ISDN_path"]
if options.unidentified is not None :
print>>options.unidentified,formatFasta(s)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment