Commit 5f4c894b by Eric Coissac

add fastaAddTaxid.py script

parent 619eebf6
from obitools.utils import ColumnFile
from obitools.utils import progressBar
from obitools.graph import Graph
from obitools.graph import selectEdgeAttributeFactory
from obitools.graph.algorithms.component import componentIterator
from obitools.graph.algorithms.compact import compactGraph
from obitools import NucSequence
from obitools.options import getOptionManager
import sys
def addGraphOptions(optionManager):
optionManager.add_option('-d','--ecopcrdb',
action="store", dest="db",
metavar="<FILENAME>",
type="string",
help="ecoPCR Database "
"name")
def ColIterator(stream):
return ColumnFile(stream, strip=True, types=(str,str,str,str,
int,
float,float,
float,float,
float,float,
int,int,float),
skip="primer_1")
def deltaEqual(d,p1,p2):
if d < 0:
pn=p1[-d:]
pp=p2[:d]
elif d > 0:
pn=p2[d:]
pp=p1[:-d]
else:
pn=p1
pp=p2
return pn==pp
def linkPrimerNode(delta,g,complement=False,**data):
word={}
nn = len(g)
pn = 0
print >> sys.stderr,"Indexing nodes..."
progressBar(1, nn, True)
for n in g:
pn+=1
progressBar(pn, nn)
for i in xrange(delta+1):
p = n.label[i:]
if p in word:
word[p].add(n.index)
else:
word[p]=set([n.index])
if i:
p = n.label[:-i]
if p in word:
word[p].add(n.index)
else:
word[p]=set([n.index])
print >> sys.stderr,"\nLook for related primers"
pn = 0
progressBar(1, nn, True)
for n in g:
if complement:
cp = str(NucSequence('x',n.label).complement()).lower()
else:
cp=n.label
cc=set()
pn+=1
progressBar(pn, nn)
for i in xrange(delta+1):
p = cp[i:]
if p in word:
cc|=word[p]
if i:
p = cp[:-i]
if p in word:
cc|=word[p]
for i in cc:
if complement or i!=n.index:
g.addEdge(index1=n.index,index2=i,**data)
#print >> sys.stderr, g.addEdge(index1=n.index,index2=i,color="red"), " ==> Complement(",n.label,":",cp,g.getNode(index=i).label,")"
print >>sys.stderr,""
def greenComponentIterator(g):
greenPredicat = selectEdgeAttributeFactory("color", "green")
return componentIterator(g,edgePredicat=greenPredicat)
if __name__=='__main__':
optionParser = getOptionManager([],
entryIterator=ColIterator
)
(options, entries) = optionParser()
G = Graph('primers')
print >>sys.stderr,"Read ecoPrimer result file...",
for amp in entries:
p1 = amp[1]
p2 = amp[3]
count=amp[4]
family_spe,family_sens = amp[5],amp[6]
genus_spe,genus_sens = amp[7],amp[8]
species_spe,species_sens = amp[9],amp[10]
avg_l=amp[13]
G.addEdge(p1, p2, count=count ,
species_spe=species_spe,species_sens=species_sens,
avg_l=avg_l,
color="blue")
print >> sys.stderr," %d primers and %d amplifia" % (len(G),G.edgeCount())
print >>sys.stderr,"Look for complementary primers..."
linkPrimerNode(2,G,True,color="red")
print >> sys.stderr,"Ok"
for C in componentIterator(G):
sub = G.subgraph(C)
linkPrimerNode(2, sub,color="green")
compactGraph(sub, greenComponentIterator)
\ No newline at end of file
...@@ -44,7 +44,7 @@ def addSearchOptions(optionManager): ...@@ -44,7 +44,7 @@ def addSearchOptions(optionManager):
type="float", type="float",
default=0.9, default=0.9,
help="minimum ratio between match length and" help="minimum ratio between match length and"
" query length . c in [0.0,1.0].") " query length . c in [0.0,1.0] default = 0.9.")
optionManager.add_option('-C','--db-coverage', optionManager.add_option('-C','--db-coverage',
action="store", dest="dbcov", action="store", dest="dbcov",
...@@ -52,7 +52,7 @@ def addSearchOptions(optionManager): ...@@ -52,7 +52,7 @@ def addSearchOptions(optionManager):
type="float", type="float",
default=0.9, default=0.9,
help="minimum ratio between match length and" help="minimum ratio between match length and"
" subject length . C in [0.0,1.0].") " subject length . C in [0.0,1.0] default = 0.9.")
optionManager.add_option('-p','--program', optionManager.add_option('-p','--program',
action="store", dest="program", action="store", dest="program",
......
#!/usr/local/bin/python
import re
def igrep(lines,pattern,exclude=False):
"""
igrep emule la commande unix grep.
@param lines: une collection de string
@type lines: iterator on str
@param pattern: une expression reguliere
@type pattern: str
@return: un iterateur sur le sous ensemble
des lignes possedant au moins une occurence
du pattern
@rtype: iterator on str
"""
automate = re.compile(pattern)
for l in lines:
match = automate.search(l) is not None
out = (match and not exclude) or (not match and exclude)
if out:
yield l
def wordIterator(sequence,lword,step=1,endIncluded=False):
L = len(sequence)
if endIncluded:
pmax=L
else:
pmax = L - lword + 1
pos = xrange(0,pmax,step)
for x in pos:
yield sequence[x:x+lword]
def fastaFormat(title,sequence):
width = 50
fasta = '>' + title + '\n'
iwords= wordIterator(sequence,width,width,True)
words =[w for w in iwords]
seq = '\n'.join(words)
return fasta + seq
def isqlines(lines):
for l in igrep(lines,'^ '):
yield l
def icleanSqLines(lines):
sq = isqlines(lines)
bad = re.compile('[ \t\n0-9]+')
for line in sq:
yield bad.sub('',line)
def getSeq(lines):
sq = icleanSqLines(lines)
allsq = [x for x in sq]
seq = ''.join(allsq)
return seq
if __name__=='__main__':
# ici commence le main de mon script
import sys
import optparse
parser = optparse.OptionParser()
parser.add_option("-b", "--begin",
type="int", dest="begin",
default=1,
help="start position of the cutted fragment")
parser.add_option("-e", "--end",
type="int", dest="end",
help="end position of the cutted fragment")
parser.add_option("-t", "--title",
dest="title",
help="title line of the fasta output")
(options, args) = parser.parse_args()
if len(args) > 0:
filename = args[0]
f = open(filename)
else:
f = sys.stdin
sq = getSeq(f)
start = options.begin - 1
stop = options.end
subseq= sq[start:stop]
fasta = fastaFormat(options.title,subseq)
print fasta
#!/usr/local/bin/python
import re
import sys
from obitools.fasta import fastaIterator,formatFasta
from obitools.options import getOptionManager
from obitools.ecopcr import taxonomy
def addTaxonomyOptions(optionManager):
optionManager.add_option('-d','--ecopcrdb',
action="store", dest="db",
metavar="<FILENAME>",
type="string",
help="ecoPCR Database "
"name")
optionManager.add_option('-u','--undefined',
action="store", dest="undefined",
metavar="<FILENAME>",
type="string",
default=None,
help="file used to store unidentified sequences")
optionManager.add_option('-t','--tag-name',
action="store", dest="tagname",
metavar="<tagname>",
type="string",
default='species',
help="file containing tag used")
if __name__=='__main__':
optionParser = getOptionManager([addTaxonomyOptions],
entryIterator=fastaIterator
)
(options, entries) = optionParser()
tax = taxonomy.Taxonomy(options.db)
if options.undefined is not None:
options.undefined=open(options.undefined,'w')
for s in entries:
if options.tagname in s:
sp = s[options.tagname]
try:
taxon = tax.findTaxonByName(sp)
s['taxid']=taxon[0]
print formatFasta(s)
except KeyError:
if options.undefined is not None:
print >> options.undefined,formatFasta(s)
\ No newline at end of file
...@@ -13,8 +13,6 @@ fastaLength.py [-h|--help] <fastafile>" ...@@ -13,8 +13,6 @@ fastaLength.py [-h|--help] <fastafile>"
------------------------------------------------------- -------------------------------------------------------
""" """
import fileinput
from obitools.fasta import fastaIterator,formatFasta from obitools.fasta import fastaIterator,formatFasta
from obitools.options import getOptionManager from obitools.options import getOptionManager
...@@ -28,7 +26,7 @@ if __name__=='__main__': ...@@ -28,7 +26,7 @@ if __name__=='__main__':
(options, entries) = optionParser() (options, entries) = optionParser()
for seq in entries: for seq in entries:
seq['seqLength']=str(len(seq)) seq['seqLength']=len(seq)
print formatFasta(seq) print formatFasta(seq)
\ No newline at end of file
#!/Library/Frameworks/Python.framework/Versions/2.6/Resources/Python.app/Contents/MacOS/Python
import sys
from obitools.fasta import fastaIterator,formatFasta
from obitools.ecopcr.taxonomy import Taxonomy
from obitools.options import getOptionManager
def addTaxonomyOptions(optionManager):
optionManager.add_option('-t','--taxonomy',
action="store", dest="taxonomy",
metavar="<FILENAME>",
type="string",
help="ecoPCR taxonomy Database name")
optionManager.add_option('-r','--rank',
action="store", dest="rank",
metavar="<RANKNAME>",
type="string",
help="taxonomic rank name "
"(i.e. genus, species...)")
if __name__=='__main__':
optionParser = getOptionManager([addTaxonomyOptions],
entryIterator=fastaIterator
)
(options, entries) = optionParser()
assert options.taxonomy is not None,"You must specify a -t option"
assert options.rank is not None,"You must specify a -r option"
taxonomy = Taxonomy(options.taxonomy)
found = set()
for seq in entries:
currenttaxid = seq['taxid']
ranktaxid = taxonomy.getTaxonAtRank(currenttaxid,options.rank)
if ranktaxid is not None:
if ranktaxid not in found:
found.add(ranktaxid)
seq[options.rank]=ranktaxid
seq[options.rank+'_sn']=taxonomy.getScientificName(ranktaxid)
print formatFasta(seq)
#!/usr/local/bin/python
from obitools.fasta import fastaIterator,formatFasta
from obitools.options import getOptionManager
from obitools.obischemas import initConnection,getConnection
from obitools.obischemas.options import addConnectionOptions
from obitools.table.csv import csvIterator
from obitools import NucSequence
import sys
import re
from logging import root,DEBUG
def addFastaTaxidOptions(optionManager):
optionManager.add_option('-r','--restrict',
action="store", dest="restrict",
metavar="<TAXID>",
type="string",
default=None,
help="taxid limiting to a taxonomic "
"subtree")
optionManager.add_option('-c','--csv',
dest="csv",
action="store_true",
default=False,
help="the input file is a Ludovic CSV format")
def lookForTaxon(taxon,restriction=None,connection=None):
if connection is None:
connection=getConnection()
cur = connection.cursor()
if restriction is None:
query='''
select n.taxid,r.scientific_name
from ncbi_taxonomy.name as n,
ncbi_taxonomy.taxon as r
where n.taxid=r.numid
and n.name = '%s'
''' % taxon.strip()
else:
query='''
select n.taxid,r.scientific_name
from ncbi_taxonomy.name as n,
ncbi_taxonomy.taxon as r
where n.taxid=r.numid
and r.path ~ '*.%s.*'
and n.name = '%s'
''' % (restriction,taxon.strip())
cur.execute(query)
return [x for x in cur]
def lookForGenus(genus,restriction=None,connection=None):
if connection is None:
connection=getConnection()
cur = connection.cursor()
if restriction is None:
query='''
select n.taxid,r.scientific_name
from ncbi_taxonomy.name as n,
ncbi_taxonomy.genus_rank as r
where n.taxid=r.numid
and n.name='%s'
''' % genus.strip()
else:
query='''
select n.taxid,r.scientific_name
from ncbi_taxonomy.name as n,
ncbi_taxonomy.genus_rank as r
where n.taxid=r.numid
and r.path ~ '*.%s.*'
and n.name='%s'
''' % (restriction,genus.strip())
cur.execute(query)
return [x for x in cur]
def lookForFamily(family,restriction=None,connection=None):
if connection is None:
connection=getConnection()
cur = connection.cursor()
if restriction is None:
query='''
select n.taxid,r.scientific_name
from ncbi_taxonomy.name as n,
ncbi_taxonomy.family_rank as r
where n.taxid=r.numid
and n.name='%s'
''' % family.strip()
else:
query='''
select n.taxid,r.scientific_name
from ncbi_taxonomy.name as n,
ncbi_taxonomy.family_rank as r
where n.taxid=r.numid
and r.path ~ '*.%s.*'
and n.name='%s'
''' % (restriction,family.strip())
cur.execute(query)
return [x for x in cur]
def csvEntryIterator(entries):
for data in csvIterator(entries):
seq = NucSequence(data[1],data[5])
for x in data[2:5]:
rank,name=x.split('=',1)
seq[rank]=name
yield seq
__missing_family = set()
__missing_genus = set()
__missing_species= set()
__multiple_family = set()
__multiple_genus = set()
__multiple_species= set()
def clearMissing():
global __missing_family, __missing_genus, __missing_species
global __multiple_family, __multiple_genus, __multiple_species
__missing_family=set()
__missing_genus=set()
__missing_species=set()
__multiple_family=set()
__multiple_genus=set()
__multiple_species=set()
def sortMissing(m1,m2):
if isinstance(m1, tuple):
m1=m1[0]
if isinstance(m2, tuple):
m2=m2[0]
return cmp(m1[0],m2[0])
def printMissing(out=sys.stderr):
sf = '%s'
tf = ' "%s" in %s (%s)'
if __missing_family:
print >>out,'Missing Family'
print >>out,'--------------'
print >>out
l = list(__missing_family)
l.sort()
for f in l:
print >> out,' ',f
print >>out
if __multiple_family:
print >>out,'Multiply defined Family'
print >>out,'-----------------------'
print >>out
l = list(__multiple_family)
l.sort()
for f in l:
print >> out,' ',f
print >>out
if __missing_genus:
print >>out,'Missing genus'
print >>out,'-------------'
print >>out
l = list(__missing_genus)
l.sort(sortMissing)
for f in l:
if isinstance(f,tuple):
print >>out,tf % f
else:
print >>out,f
print >>out
if __multiple_genus:
print >>out,'Multiply defined genus'
print >>out,'----------------------'
print >>out
l = list(__multiple_genus)
l.sort(sortMissing)
for f in l:
if isinstance(f,tuple):
print >>out,tf % f
else:
print >>out,f
print >>out
if __missing_species:
print >>out,'Missing species'
print >>out,'---------------'
print >>out
l = list(__missing_species)
l.sort(sortMissing)
for f in l:
if isinstance(f,tuple):
print >>out,tf % f
else:
print >>out,f
print >>out
if __multiple_species:
print >>out,'Multiply defined specie'
print >>out,'----------------------'
print >>out
l = list(__multiple_species)
l.sort(sortMissing)
for f in l:
if isinstance(f,tuple):
print >>out,tf % f
else:
print >>out,f
print >>out
def tagSequence(seq,restriction=None):
family = seq['Family']
genus = seq['Genus']
species = '%s %s' % (genus,seq['Species'])
genus_id=None
family_id=None
species_id = lookForTaxon(species,
restriction)
if len(species_id)==1:
seq['ncbi_scientific_name']=species_id[0][1]
seq['taxid']=species_id[0][0]
elif not species_id:
genus_id = lookForGenus(genus,
restriction)
if len(genus_id)==1:
seq['ncbi_genus']=genus_id[0][1]
genus_id = genus_id[0][0]
__missing_species.add((species,genus,genus_id))
else:
__missing_species.add(species)
genus_id=None
species_id=None
else:
genus_id = lookForGenus(genus,
restriction)
if len(genus_id)==1:
seq['ncbi_genus']=genus_id[0][1]
genus_id = genus_id[0][0]
__multiple_species.add((species,genus_id,genus))
else:
__multiple_species.add(species)
genus_id=None
species_id=None
if species_id is None and genus_id is None:
genus_id = lookForGenus(genus,
restriction)
if len(genus_id)==1:
seq['ncbi_genus']=genus_id[0][1]
genus_id = genus_id[0][0]
elif not genus_id:
family_id = lookForFamily(family, restriction)
if len(family_id)==1:
seq['ncbi_family']=family_id[0][1]
family_id=family_id[0][0]
__missing_genus.add((genus,family,family_id))
else:
__missing_genus.add(genus)
family_id=None
genus_id=None
else:
family_id = lookForFamily(family, restriction)
if len(family_id)==1:
seq['ncbi_family']=family_id[0][1]
family_id=family_id[0][0]
__multiple_genus.add((genus,family,family_id))
else:
__multiple_genus.add(genus)
genus_id=None
if family_id is not None and isinstance(family_id, list):
if not family_id:
__missing_family.add(family)
else:
__multiple_family.add(family)
return seq
if __name__=='__main__':
optionParser = getOptionManager([addFastaTaxidOptions,
addConnectionOptions],
)
(options, entries) = optionParser()
if options.csv:
entries = csvEntryIterator(entries)
else:
entries = fastaIterator(entries)
initConnection(options)
clearMissing()
for e in entries: