Commit 46b446f0 by Eric Coissac

Add capacity to filter on merged sequences

parent 6208cbee
......@@ -23,8 +23,11 @@ def minimum(seqs):
return min(s['select'] for s in seqs)
def maximum(seqs):
return max(s['select'] for s in seqs)
try:
return max(s['select'] for s in seqs)
except TypeError, e:
print >>sys.stderr, seqs
raise e
def mean(seqs):
ss= reduce(lambda x,y: x + y,(s['select'] for s in seqs),0)
return float(ss) / len(seqs)
......@@ -101,6 +104,13 @@ def addSelectOptions(optionManager):
default=[],
help="attributes to merge within each group")
group.add_option('-s','--sample',
action="store", dest="sample",
metavar="<TAGNAME>",
type="str",
default=None,
help="Tag containing sample descriptions, the default value is set to *merged_sample*")
group.add_option('--merge-ids',
action="store_true", dest="mergeids",
default=False,
......@@ -129,16 +139,22 @@ if __name__ == '__main__':
classes = {}
print >>sys.stderr,"\nLoading sequences...\n"
with_taxonomy=hasattr(options, 'taxonomy') and options.taxonomy is not None
nbseq=0
for s in entries:
nbseq+=1
category = []
if with_taxonomy:
environ = {'taxonomy' : options.taxonomy,'sequence':s,'random':random()}
else:
environ = {'sequence':s,'random':random()}
for c in options.categories:
try:
if hasattr(options, 'taxonomy') and options.taxonomy is not None:
environ = {'taxonomy' : options.taxonomy,'sequence':s,'random':random()}
else:
environ = {'sequence':s,'random':random()}
v = eval(c,environ,s)
category.append(v)
except:
......@@ -149,11 +165,6 @@ if __name__ == '__main__':
group.append(s)
classes[category]= group
if hasattr(options, 'taxonomy') and options.taxonomy is not None:
environ = {'taxonomy' : options.taxonomy,'sequence':s,'random':random()}
else:
environ = {'sequence':s, 'random':random()}
try:
select = eval(options.function,environ,s)
s['select']=select
......@@ -181,75 +192,87 @@ if __name__ == '__main__':
i+=1
progressBar(i,lclasses,False,"%15s" % ("/".join(map(str,c)),))
seqs = classes[c]
sortclass(seqs, options)
if len(c)==1:
c=c[0]
if options.sample is not None:
subsets = {}
for s in seqs:
for sid in s[options.sample]:
ss = subsets.get(sid,[])
ss.append(s)
subsets[sid]=ss
else:
subsets={"all":seqs}
if options.number==1:
s = seqs[0]
for seqs in subsets.values():
sortclass(seqs, options)
for key in mergedKey:
if key=='taxid' and mergeIds:
if 'taxid_dist' not in s:
s["taxid_dist"]={}
if 'taxid' in s:
s["taxid_dist"][s.id]=s['taxid']
mkey = "merged_%s" % key
if mkey not in s:
if key in s:
s[mkey]={s[key]:1}
else:
s[mkey]={}
if 'count' not in s:
s['count']=1
if mergeIds:
s['merged']=[s.id]
for seq in seqs[1:]:
if len(c)==1:
c=c[0]
s['count']+=seq['count']
if options.number==1 and options.sample is None:
s = seqs[0]
for key in mergedKey:
if key=='taxid' and mergeIds:
if 'taxid_dist' in seq:
s["taxid_dist"].update(seq["taxid_dist"])
if 'taxid' in seq:
s["taxid_dist"][seq.id]=seq['taxid']
if 'taxid_dist' not in s:
s["taxid_dist"]={}
if 'taxid' in s:
s["taxid_dist"][s.id]=s['taxid']
mkey = "merged_%s" % key
if mkey in seq:
m = seq[mkey]
else:
if key in seq:
m={seq[key]:1}
allmkey = set(m.keys()) | set(s[mkey].keys())
s[mkey] = dict((k,m.get(k,0)+s[mkey].get(k,0)) for k in allmkey)
# if mkey in seq:
# for skey in seq[mkey]:
# if skey in s:
# s[mkey][skey]=s[mkey].get(seq[skey],0)+seq[mkey][skey]
# else:
# s[mkey][skey]=seq[mkey][skey]
#for key in seq.iterkeys():
# # Merger proprement l'attribut merged s'il exist
# if key in s and s[key]!=seq[key] and key!='count' and key[0:7]!='merged_' and key!='merged' and key!='select':
# del(s[key])
if mkey not in s:
if key in s:
s[mkey]={s[key]:1}
else:
s[mkey]={}
if 'count' not in s:
s['count']=1
if mergeIds:
s['merged'].append(seq.id)
if taxonomy is not None:
mergeTaxonomyClassification(seqs, taxonomy)
s['merged']=[s.id]
for seq in seqs[1:]:
for s in seqs[0:options.number]:
s['class']=c
del s['select']
writer(s)
s['count']+=seq['count']
for key in mergedKey:
if key=='taxid' and mergeIds:
if 'taxid_dist' in seq:
s["taxid_dist"].update(seq["taxid_dist"])
if 'taxid' in seq:
s["taxid_dist"][seq.id]=seq['taxid']
mkey = "merged_%s" % key
if mkey in seq:
m = seq[mkey]
else:
if key in seq:
m={seq[key]:1}
allmkey = set(m.keys()) | set(s[mkey].keys())
s[mkey] = dict((k,m.get(k,0)+s[mkey].get(k,0)) for k in allmkey)
if mergeIds:
s['merged'].append(seq.id)
if taxonomy is not None:
mergeTaxonomyClassification(seqs, taxonomy)
for s in seqs[0:options.number]:
s['class']=c
s['__@TOWRITE@__']=True
print >>sys.stderr,"\Writing sequences...\n"
progressBar(1,nbseq,True,'Writing')
i=0
for c in classes:
seqs = classes[c]
for s in seqs:
i+=1
progressBar(i,nbseq,False,"Writing")
if '__@TOWRITE@__' in s:
del s['__@TOWRITE@__']
del s['select']
writer(s)
print >>sys.stderr
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment