obisample.py 3.44 KB
Newer Older
1 2
#!/usr/local/bin/python
'''
Aurélie Bonin committed
3
:py:mod:`obisample`: randomly resamples sequence records
4
========================================================
Frédéric Boyer committed
5 6 7 8

.. codeauthor:: Eric Coissac <eric.coissac@metabarcoding.org>


9
:py:mod:`obisample` randomly resamples sequence records with or without replacement.
10 11 12

'''

Lucie Zinger committed
13

14
from obitools.options import getOptionManager
15
from obitools.sample import weigthedSample, weigthedSampleWithoutReplacement
Eric Coissac committed
16
from obitools.format.options import addInOutputOption, sequenceWriterGenerator
17
import random
18 19 20 21 22

def addSampleOptions(optionManager):
    optionManager.add_option('-s','--sample-size',
                             action="store", dest="size",
                             metavar="###",
23
                             type="float",
24
                             default=None,
25
                             help="Size of the generated sample. "
Frédéric Boyer committed
26
                                  "If -a option is set, size is expressed as fraction"
27 28 29 30 31 32
                             )
    optionManager.add_option('-a','--approx-sampling',
                             action="store_true", dest="approx",
                             default=False,
                             help="Switch to an approximative algorithm, "
                                  "useful for large files"
33 34
                             )

35 36 37 38 39 40 41 42
    optionManager.add_option('-w','--without-replacement',
                             action="store_true", dest="woreplace",
                             default=False,
                             help="Ask for sampling without replacement"
                            )
 
def rbinom(n,p):
    return sum((random.random() < p) for x in xrange(n))
43 44 45 46 47 48 49 50

if __name__ == '__main__':

    optionParser = getOptionManager([addSampleOptions,addInOutputOption]
                                    )
    
    (options, entries) = optionParser()
    
51
    if not options.approx:
52
    
53
        db = [s for s in entries]
54
        
55 56 57 58 59 60 61 62 63
        if options.size is None:
            options.size=len(db)
        else:
            options.size=int(options.size)
            
        distribution = {}
        idx=0
        total = 0
        for s in db:
64
            count = s['count']
65 66 67 68 69 70 71
            total+=count
            distribution[idx]=count
            idx+=1

        if options.woreplace:
            assert options.size <= total
            sp = weigthedSampleWithoutReplacement
72
        else:
73 74 75 76
            sp= weigthedSample
                        
        sample =sp(distribution, options.size)

77
        
78 79 80 81 82
    else:
        db = []
        distribution = {}
        idx = 0
        total = 0
Eric Coissac committed
83

84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109
        assert options.size is not None, \
            "You cannot specify option -a without option -s"
            
        assert options.size>=0 and options.size <=1, \
            "When used with -a options -s must be a probability"
          
        p = options.size * 1.5
        
        if p > 1.:
            p = 1.
            
        for seq in entries:
            count = seq['count']
            total+=count
            
            n = rbinom(count, p)
                        
            if n > 0:
                db.append(seq)
                distribution[idx]=n
                
                idx+=1
                
        size = int(total * options.size)  
        sample=weigthedSampleWithoutReplacement(distribution, size)
                    
Eric Coissac committed
110
    writer = sequenceWriterGenerator(options)
111 112 113 114
    
    for idx in sample:
        seq = db[idx]
        seq['count']=sample[idx]
Eric Coissac committed
115
        writer(seq)
116

117 118 119