obijoinpairedend.py 3.76 KB
Newer Older
1 2
#!/usr/local/bin/python
'''
Eric Coissac committed
3
:py:mod:`obijoinpairedend`: Joins paired-end reads
4 5 6 7 8 9 10
==================================================

.. codeauthor:: Eric Coissac <eric.coissac@metabarcoding.org>

:py:mod:`obijoinpairedend` aims at joining the two reads of a paired-end library.

For this purpose, it concatenates sequence merging the forward read and the 
11
reversed-complemented reverse read.
12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32

The program uses as input one or two sequences reads files. 

    - If two files are used one of them must be specified using the ``-r`` option. 
      Sequence records corresponding to the same read pair must be in the same order 
      in the two files.
      
    - If just one file is provided, sequence records are supposed to be all of the same length.
      The first half of the sequence is used as forward read, the second half is used as the reverse
      read.

    *Example:*
    
    .. code-block:: bash
    
       > obijoinpairedend -r seq3P.fastq seq5P.fastq > seq.fastq
       
    The ``seq5P.fastq`` sequence file contains the forward sequence records.
    The ``seq3P.fastq`` sequence file contains the reverse sequence records.
    Pairs of reads are joined together and the resulting sequence is stored in the
    `` seq.fastq`` file.
33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51

'''

from obitools.options import getOptionManager

from itertools import chain
from obitools import NucSequence
from obitools.format.options import sequenceWriterGenerator, autoEntriesIterator,\
    addInOutputOption
from obitools.utils import universalOpen

def addPairEndOptions(optionManager):
    optionManager.add_option('-r','--reverse-reads',
                             action="store", dest="reverse",
                             metavar="<FILENAME>",
                             type="string",
                             default=None,
                             help="Filename containing reverse solexa reads "
                            )
52

53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85
    
def cutDirectReverse(entries):
    first = []
    
    for i in xrange(10):
        first.append(entries.next())
        
    lens = [len(x) for x in first]
    clen = {}
    for i in lens:
        clen[i]=clen.get(i,0)+1
    freq = max(clen.values())
    freq = [k for k in clen if clen[k]==freq]
    assert len(freq)==1,"To many sequence length"
    freq = freq[0]
    assert freq % 2 == 0, ""
    lread = freq/2
    
    seqs = chain(first,entries)
    
    for s in seqs:
        d = s[0:lread]
        r = s[lread:]
        yield(d,r)

    
def seqPairs(direct,reverse):
    for d in direct:
        r = reverse.next()
        yield(d,r)


        
86 87
def buildJoinedSequence(sequences,options):
    
88 89 90
    for d,r in sequences:
        r=r.complement()
        
91
        s = str(d) + str(r)
92
        
93
        seq = NucSequence(d.id + '_PairEnd',s,d.definition,**d)
94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
        
        withqual = hasattr(d, 'quality') or hasattr(r, 'quality')
        
        if withqual:
            if hasattr(d, 'quality'):
                quality = d.quality
            else:
                quality = [10**-4] * len(d)
            
            if hasattr(r, 'quality'):
                quality.extend(r.quality)
            else:
                quality.extend([10**-4] * len(r))
                
            seq.quality=quality
109
            seq['pairend_limit']=len(d)
110

111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129
            
        yield seq
        
    
    
if __name__ == '__main__':
    optionParser = getOptionManager([addPairEndOptions,addInOutputOption])
    
    (options, direct) = optionParser()
    
    if options.reverse is None:
        sequences=cutDirectReverse(direct)
    else:
        reader = autoEntriesIterator(options)
        reverse = reader(universalOpen(options.reverse))
        sequences=seqPairs(direct,reverse)
    
    writer = sequenceWriterGenerator(options)

130
    for seq in buildJoinedSequence(sequences,options):
131 132 133 134
        writer(seq)