bioseq.py 6.96 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
def mergeTaxonomyClassification(uniqSeq,taxonomy):
    for seq in uniqSeq:
        if seq['merged_taxid']:
            seq['taxid']=taxonomy.lastCommonTaxon(*seq['merged_taxid'].keys())
            tsp = taxonomy.getSpecies(seq['taxid'])
            tgn = taxonomy.getGenus(seq['taxid'])
            tfa = taxonomy.getFamily(seq['taxid'])
            
            if tsp is not None:
                sp_sn = taxonomy.getScientificName(tsp)
            else:
                sp_sn="###"
                tsp=-1
                
            if tgn is not None:
                gn_sn = taxonomy.getScientificName(tgn)
            else:
                gn_sn="###"
                tgn=-1
                
            if tfa is not None:
                fa_sn = taxonomy.getScientificName(tfa)
            else:
                fa_sn="###"
                tfa=-1
                
            seq['species']=tsp
            seq['genus']=tgn
            seq['family']=tfa
                
31 32 33
            seq['species_name']=sp_sn
            seq['genus_name']=gn_sn
            seq['family_name']=fa_sn
34 35 36
            
            seq['rank']=taxonomy.getRank(seq['taxid'])
            seq['scientific_name']=fa_sn = taxonomy.getScientificName(seq['taxid'])
Eric Coissac committed
37

38
def uniqSequence(seqIterator,taxonomy=None,mergedKey=None,mergeIds=False,categories=None):
Eric Coissac committed
39 40
    uniques={}
    uniqSeq=[]
41
     
42 43
    if categories is None:
        categories=[]
Eric Coissac committed
44
    
45 46 47
    if mergedKey is not None:
        mergedKey=set(mergedKey)
    else:
48
        mergedKey=set() 
49 50 51 52
    
    if taxonomy is not None:
        mergedKey.add('taxid')
                
Eric Coissac committed
53
    for seq in seqIterator:    
54
        s = tuple(seq[x] for x in categories) + (str(seq),)
Eric Coissac committed
55 56
        if s in uniques:
            s = uniques[s]
Eric Coissac committed
57 58 59 60
            if 'count' in seq:
                s['count']+=seq['count']
            else:
                s['count']+=1
61
                seq['count']=1
Eric Coissac committed
62 63
#            if taxonomy is not None and 'taxid' in seq:
#                s['merged_taxid'][seq['taxid']]=
64
            for key in mergedKey:
65 66 67 68 69 70
                if key=='taxid' and mergeIds:
                    if 'taxid_dist' in seq:
                        s["taxid_dist"].update(seq["taxid_dist"])
                    if 'taxid' in seq:
                        s["taxid_dist"][seq.id]=seq['taxid']
                        
71
                mkey = "merged_%s" % key 
Eric Coissac committed
72
                #cas ou on met a jour les merged_keys mais il n'y a pas de merged_keys dans la sequence qui arrive
73
                if key in seq:
74
                    s[mkey][seq[key]]=s[mkey].get(seq[key],0)+seq['count']
75 76 77 78 79 80
                #cas ou merged_keys existe deja
                else:
                    if mkey in seq:
                        for skey in seq[mkey]:
                            s[mkey][skey]=s[mkey].get(skey,0)+seq[mkey][skey]                            

81 82
                            
            for key in seq.iterkeys():
83 84
                # Merger proprement l'attribut merged s'il exist
                if key in s and s[key]!=seq[key] and key!='count' and key[0:7]!='merged_' and key!='merged':
85 86 87
                    del(s[key])
                
                            
88 89
            if mergeIds:        
                s['merged'].append(seq.id)
Eric Coissac committed
90 91
        else:
            uniques[s]=seq
92
            for key in mergedKey:
93 94 95 96 97
                if key=='taxid' and mergeIds:
                    if 'taxid_dist' not in seq:
                        seq["taxid_dist"]={}
                    if 'taxid' in seq:
                        seq["taxid_dist"][seq.id]=seq['taxid']
98 99 100 101
                mkey = "merged_%s" % key 
                if mkey not in seq:
                    seq[mkey]={}
                if key in seq:
102
                    seq[mkey][seq[key]]=seq[mkey].get(seq[key],0)+seq['count']
103
                    del(seq[key])
104

Eric Coissac committed
105 106
            if 'count' not in seq:
                seq['count']=1
107 108
            if mergeIds:        
                seq['merged']=[seq.id]
Eric Coissac committed
109 110
            uniqSeq.append(seq)

111
    if taxonomy is not None:
112
        mergeTaxonomyClassification(uniqSeq, taxonomy)
113
                    
114 115 116 117
                     

    return uniqSeq

118
def uniqPrefixSequence(seqIterator,taxonomy=None,mergedKey=None,mergeIds=False,categories=None):
119

120 121 122
    if categories is None:
        categories=[]
    
123 124 125 126 127 128 129 130 131 132
    def cmpseq(s1,s2):
        return cmp(str(s1),str(s2))

    if mergedKey is not None:
        mergedKey=set(mergedKey)
    else:
        mergedKey=set() 
    
    if taxonomy is not None:
        mergedKey.add('taxid')
133
                
134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165
    sequences=list(seqIterator)

    if not sequences:
        return []
    
    sequences.sort(cmpseq)
    
    
    old=sequences.pop()
    uniqSeq=[old]
    if 'count' not in old:
        old['count']=1
    for key in mergedKey:
        mkey = "merged_%s" % key 
        if mkey not in old:
            old[mkey]={}
        if key in old:
            old[mkey][old[key]]=old[mkey].get(old[key],0)+1
    if mergeIds:        
        old['merged']=[old.id]

     
    while(sequences):
        seq=sequences.pop()
        lseq=len(seq)
        pold = str(old)[0:lseq]
        if pold==str(seq):
            
            if 'count' in seq:
                old['count']+=seq['count']
            else:
                old['count']+=1
166
                
167 168 169 170 171 172 173 174 175 176
            for key in mergedKey:
                mkey = "merged_%s" % key 
                if key in seq:
                    old[mkey][seq[key]]=old[mkey].get(seq[key],0)+1
                if mkey in seq:
                    for skey in seq[mkey]:
                        if skey in old:
                            old[mkey][skey]=old[mkey].get(seq[skey],0)+seq[mkey][skey]
                        else:
                            old[mkey][skey]=seq[mkey][skey]
177 178 179 180 181 182

            for key in seq.iterkeys():
                if key in old and old[key]!=seq[key]:
                    del(old[key])
                

183 184 185 186 187 188 189 190 191 192 193
            if mergeIds:        
                old['merged'].append(seq.id)
        else:
            old=seq
            
            for key in mergedKey:
                mkey = "merged_%s" % key 
                if mkey not in seq:
                    seq[mkey]={}
                if key in seq:
                    seq[mkey][seq[key]]=seq[mkey].get(seq[key],0)+1
194
                    del(seq[key])
195 196 197 198 199 200 201 202 203

            if 'count' not in seq:
                seq['count']=1
            if mergeIds:        
                seq['merged']=[seq.id]
            uniqSeq.append(seq)
            
    if taxonomy is not None:
        mergeTaxonomyClassification(uniqSeq, taxonomy)
204

Eric Coissac committed
205
    return uniqSeq
206 207 208
           
    
    
Eric Coissac committed
209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234

def _cmpOnKeyGenerator(key,reverse=False):
    def compare(x,y):
        try:
            c1 = x[key]
        except KeyError:
            c1=None
            
        try:
            c2 = y[key]
        except KeyError:
            c2=None
            
        if reverse:
            s=c1
            c1=c2
            c2=s
        return cmp(c1,c2)
    
    return compare

def sortSequence(seqIterator,key,reverse=False):
    seqs = list(seqIterator)
    seqs.sort(_cmpOnKeyGenerator(key, reverse))
    return seqs