Commit 2d06d9be authored by Eric Coissac's avatar Eric Coissac

a first version of obiuniq

parent d1206c3b
......@@ -259,8 +259,6 @@ def readTagfile(filename):
dpartial[tags]=data
rpartial[tags]=data
print(primers, file=sys.stderr)
return primers
......
......@@ -226,7 +226,7 @@ cdef class BioSequence(object):
return 1
elif key==b'taxid' and self._hasTaxid:
self.extractTaxon()
return self._info['taxid']
return self._info[b'taxid']
else:
raise KeyError,key
p = re.compile(self._rawparser % key)
......@@ -240,7 +240,7 @@ cdef class BioSequence(object):
pass
self._info[key]=v
else:
if key=='count':
if key==b'count':
v=1
else:
raise KeyError,key
......
from functools import cmp_to_key
from obitools.utils import toBytes
def mergeTaxonomyClassification(uniqSeq,taxonomy):
for seq in uniqSeq:
if seq['merged_taxid']:
seq['taxid']=taxonomy.lastCommonTaxon(*list(seq['merged_taxid'].keys()))
tsp = taxonomy.getSpecies(seq['taxid'])
tgn = taxonomy.getGenus(seq['taxid'])
tfa = taxonomy.getFamily(seq['taxid'])
if seq[b'merged_taxid']:
seq[b'taxid']=taxonomy.lastCommonTaxon(*list(seq[b'merged_taxid'].keys()))
tsp = taxonomy.getSpecies(seq[b'taxid'])
tgn = taxonomy.getGenus(seq[b'taxid'])
tfa = taxonomy.getFamily(seq[b'taxid'])
if tsp is not None:
sp_sn = taxonomy.getScientificName(tsp)
else:
sp_sn="###"
sp_sn=b"###"
tsp=-1
if tgn is not None:
gn_sn = taxonomy.getScientificName(tgn)
else:
gn_sn="###"
gn_sn=b"###"
tgn=-1
if tfa is not None:
fa_sn = taxonomy.getScientificName(tfa)
else:
fa_sn="###"
fa_sn=b"###"
tfa=-1
seq['species']=tsp
seq['genus']=tgn
seq['family']=tfa
seq[b'species']=tsp
seq[b'genus']=tgn
seq[b'family']=tfa
seq['species_name']=sp_sn
seq['genus_name']=gn_sn
seq['family_name']=fa_sn
seq[b'species_name']=sp_sn
seq[b'genus_name']=gn_sn
seq[b'family_name']=fa_sn
seq['rank']=taxonomy.getRank(seq['taxid'])
seq['scientific_name']=fa_sn = taxonomy.getScientificName(seq['taxid'])
seq[b'rank']=taxonomy.getRank(seq[b'taxid'])
seq[b'scientific_name']=fa_sn = taxonomy.getScientificName(seq[b'taxid'])
def uniqSequence(seqIterator,taxonomy=None,mergedKey=None,mergeIds=False,categories=None):
uniques={}
......@@ -48,30 +51,30 @@ def uniqSequence(seqIterator,taxonomy=None,mergedKey=None,mergeIds=False,categor
mergedKey=set()
if taxonomy is not None:
mergedKey.add('taxid')
mergedKey.add(b'taxid')
for seq in seqIterator:
s = tuple(seq[x] for x in categories) + (str(seq),)
if s in uniques:
s = uniques[s]
if 'count' in seq:
s['count']+=seq['count']
if b'count' in seq:
s[b'count']+=seq[b'count']
else:
s['count']+=1
seq['count']=1
s[b'count']+=1
seq[b'count']=1
# if taxonomy is not None and 'taxid' in seq:
# s['merged_taxid'][seq['taxid']]=
for key in mergedKey:
if key=='taxid' and mergeIds:
if 'taxid_dist' in seq:
s["taxid_dist"].update(seq["taxid_dist"])
if key==b'taxid' and mergeIds:
if b'taxid_dist' in seq:
s[b"taxid_dist"].update(seq[b"taxid_dist"])
if 'taxid' in seq:
s["taxid_dist"][seq.id]=seq['taxid']
s[b"taxid_dist"][seq.id]=seq[b'taxid']
mkey = "merged_%s" % key
mkey = b"merged_%s" % key
#cas ou on met a jour les merged_keys mais il n'y a pas de merged_keys dans la sequence qui arrive
if key in seq:
s[mkey][seq[key]]=s[mkey].get(seq[key],0)+seq['count']
s[mkey][seq[key]]=s[mkey].get(seq[key],0)+seq[b'count']
#cas ou merged_keys existe deja
else:
if mkey in seq:
......@@ -81,31 +84,31 @@ def uniqSequence(seqIterator,taxonomy=None,mergedKey=None,mergeIds=False,categor
for key in seq.keys():
# Merger proprement l'attribut merged s'il exist
if key in s and s[key]!=seq[key] and key!='count' and key[0:7]!='merged_' and key!='merged':
if key in s and s[key]!=seq[key] and key!=b'count' and key[0:7]!=b'merged_' and key!=b'merged':
del(s[key])
if mergeIds:
s['merged'].append(seq.id)
s[b'merged'].append(seq.id)
else:
uniques[s]=seq
for key in mergedKey:
if key=='taxid' and mergeIds:
if 'taxid_dist' not in seq:
seq["taxid_dist"]={}
if 'taxid' in seq:
seq["taxid_dist"][seq.id]=seq['taxid']
mkey = "merged_%s" % key
if key==b'taxid' and mergeIds:
if b'taxid_dist' not in seq:
seq[b"taxid_dist"]={}
if b'taxid' in seq:
seq[b"taxid_dist"][seq.id]=seq[b'taxid']
mkey = b"merged_%s" % key
if mkey not in seq:
seq[mkey]={}
if key in seq:
seq[mkey][seq[key]]=seq[mkey].get(seq[key],0)+seq['count']
seq[mkey][seq[key]]=seq[mkey].get(seq[key],0)+seq[b'count']
del(seq[key])
if 'count' not in seq:
seq['count']=1
if b'count' not in seq:
seq[b'count']=1
if mergeIds:
seq['merged']=[seq.id]
seq[b'merged']=[seq.id]
uniqSeq.append(seq)
if taxonomy is not None:
......@@ -121,7 +124,13 @@ def uniqPrefixSequence(seqIterator,taxonomy=None,mergedKey=None,mergeIds=False,c
categories=[]
def cmpseq(s1,s2):
return cmp(str(s1),str(s2))
s1 = toBytes(s1)
s2 = toBytes(s2)
if s1==s2 :
return 0
if s1 < s2 :
return -1
return 1
if mergedKey is not None:
mergedKey=set(mergedKey)
......@@ -129,28 +138,28 @@ def uniqPrefixSequence(seqIterator,taxonomy=None,mergedKey=None,mergeIds=False,c
mergedKey=set()
if taxonomy is not None:
mergedKey.add('taxid')
mergedKey.add(b'taxid')
sequences=list(seqIterator)
if not sequences:
return []
sequences.sort(cmpseq)
sequences.sort(key=cmp_to_key(cmpseq))
old=sequences.pop()
uniqSeq=[old]
if 'count' not in old:
old['count']=1
if b'count' not in old:
old[b'count']=1
for key in mergedKey:
mkey = "merged_%s" % key
mkey = b"merged_%s" % key
if mkey not in old:
old[mkey]={}
if key in old:
old[mkey][old[key]]=old[mkey].get(old[key],0)+1
if mergeIds:
old['merged']=[old.id]
old[b'merged']=[old.id]
while(sequences):
......@@ -159,13 +168,13 @@ def uniqPrefixSequence(seqIterator,taxonomy=None,mergedKey=None,mergeIds=False,c
pold = str(old)[0:lseq]
if pold==str(seq):
if 'count' in seq:
old['count']+=seq['count']
if b'count' in seq:
old[b'count']+=seq[b'count']
else:
old['count']+=1
old[b'count']+=1
for key in mergedKey:
mkey = "merged_%s" % key
mkey = b"merged_%s" % key
if key in seq:
old[mkey][seq[key]]=old[mkey].get(seq[key],0)+1
if mkey in seq:
......@@ -181,22 +190,22 @@ def uniqPrefixSequence(seqIterator,taxonomy=None,mergedKey=None,mergeIds=False,c
if mergeIds:
old['merged'].append(seq.id)
old[b'merged'].append(seq.id)
else:
old=seq
for key in mergedKey:
mkey = "merged_%s" % key
mkey = b"merged_%s" % key
if mkey not in seq:
seq[mkey]={}
if key in seq:
seq[mkey][seq[key]]=seq[mkey].get(seq[key],0)+1
del(seq[key])
if 'count' not in seq:
seq['count']=1
if b'count' not in seq:
seq[b'count']=1
if mergeIds:
seq['merged']=[seq.id]
seq[b'merged']=[seq.id]
uniqSeq.append(seq)
if taxonomy is not None:
......@@ -229,6 +238,6 @@ def _cmpOnKeyGenerator(key,reverse=False):
def sortSequence(seqIterator,key,reverse=False):
seqs = list(seqIterator)
seqs.sort(_cmpOnKeyGenerator(key, reverse))
seqs.sort(key=cmp_to_key(_cmpOnKeyGenerator(key, reverse)))
return seqs
\ No newline at end of file
......@@ -50,6 +50,7 @@ attributes created (``species``, ``genus``, ``family``, ``species_name``, ``genu
from obitools.format.options import addInputFormatOption
from obitools.fasta import formatFasta
from obitools.utils.bioseq import uniqSequence,uniqPrefixSequence
from obitools.utils import toBytes
from obitools.options import getOptionManager
from obitools.options.taxonomyfilter import addTaxonomyDBOptions
from obitools.options.taxonomyfilter import loadTaxonomyDatabase
......@@ -101,6 +102,9 @@ if __name__=='__main__':
else:
usm= uniqSequence
if options.merge is not None:
options.merge = [toBytes(k) for k in options.merge]
uniqSeq=usm(entries,taxonomy,options.merge,options.mergeids,options.categories)
for seq in uniqSeq:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment