illuminapairedend.py 8.47 KB
Newer Older
1 2
#!/usr/local/bin/python
'''
Aurélie Bonin committed
3
:py:mod:`illuminapairedend`: aligns paired-end Illumina reads
4
=============================================================
Frédéric Boyer committed
5 6 7

.. codeauthor:: Eric Coissac <eric.coissac@metabarcoding.org>

Frédéric Boyer committed
8 9 10
.. IMPORTANT:: 

   :py:mod:`illuminapairedend` replaces ``solexapairend``.
Eric Coissac committed
11

Eric Coissac committed
12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
:py:mod:`illuminapairedend` aims at aligning the two reads of a pair-end library sequenced 
using an Illumina platform. 

    - If the two reads overlap, it returns the consensus sequence together with its quality 
    
    - Otherwise, it concatenates sequence merging the forward read and 
      the reversed-complemented reverse read.

The program uses as input one or two :doc:`fastq <../fastq>` sequences reads files. 

    - If two files are used one of them must be specified using the ``-r`` option. 
      Sequence records corresponding to the same read pair must be in the same order 
      in the two files.
      
    - If just one file is provided, sequence records are supposed to be all of the same length.
      The first half of the sequence is used as forward read, the second half is used as the reverse
      read.

:py:mod:`illuminapairedend` align the forward sequence record with the reverse complement of the 
reverse sequence record. The alignment algorithm takes into account the base qualities.

    *Example:*
    
    .. code-block:: bash
    
       > illuminapairedend -r seq3P.fastq seq5P.fastq > seq.fastq
       
    The ``seq5P.fastq`` sequence file contains the forward sequence records.
    The ``seq3P.fastq`` sequence file contains the reverse sequence records.
    Pairs of reads are aligned together and the consensus sequence is stored in the
    `` seq.fastq`` file.
43 44 45

'''

46
from obitools import NucSequence
47
from obitools.options import getOptionManager, allEntryIterator
48 49
from obitools.align import QSolexaReverseAssemble
from obitools.align import QSolexaRightReverseAssemble
Eric Coissac committed
50
from obitools.tools._solexapairend import buildConsensus
51
from obitools.format.options import addOutputFormatOption,\
52
    sequenceWriterGenerator
53

54
from itertools import chain
55 56
import cPickle
import math
57 58
from obitools.fastq._fastq import fastqIterator  # @UnresolvedImport

59

60 61 62 63
def addSolexaPairEndOptions(optionManager):
    optionManager.add_option('-r','--reverse-reads',
                             action="store", dest="reverse",
                             metavar="<FILENAME>",
64
                             type="str",
65 66 67
                             default=None,
                             help="Filename containing reverse solexa reads "
                            )
68 69 70 71 72 73 74 75

    optionManager.add_option('--index-file',
                             action="store", dest="indexfile",
                             metavar="<FILENAME>",
                             type="str",
                             default=None,
                             help="Filename containing illumina index reads "
                            )
76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93
    optionManager.add_option('--sanger',
                             action="store_const", dest="seqinformat",
                             default=None,
                             const='sanger',
                             help="input file is in sanger fastq nucleic format (standard fastq)")

    optionManager.add_option('--solexa',
                             action="store_const", dest="seqinformat",
                             default=None,
                             const='solexa',
                             help="input file is in fastq nucleic format produced by solexa sequencer")

    optionManager.add_option('--illumina',
                             action="store_const", dest="seqinformat",
                             default=None,
                             const='illumina',
                             help="input file is in fastq nucleic format produced by old solexa sequencer")

94 95 96 97 98 99 100 101
    optionManager.add_option('--score-min',
                             action="store", dest="smin",
                             metavar="#.###",
                             type="float",
                             default=None,
                             help="minimum score for keeping aligment")


102

103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130
def cutDirectReverse(entries):
    first = []
    
    for i in xrange(10):
        first.append(entries.next())
        
    lens = [len(x) for x in first]
    clen = {}
    for i in lens:
        clen[i]=clen.get(i,0)+1
    freq = max(clen.values())
    freq = [k for k in clen if clen[k]==freq]
    assert len(freq)==1,"To many sequence length"
    freq = freq[0]
    assert freq % 2 == 0, ""
    lread = freq/2
    
    seqs = chain(first,entries)
    
    for s in seqs:
        d = s[0:lread]
        r = s[lread:]
        yield(d,r)
    
def seqPairs(direct,reverse):
    for d in direct:
        r = reverse.next()
        yield(d,r)
131 132 133 134 135

def checkAlignOk(ali):
    #print not (ali[0][0]=='-' or ali[1][len(ali[1])-1]=='-')
    return not (ali[0][0]=='-' or ali[1][len(ali[1])-1]=='-')
    
136 137
la = QSolexaReverseAssemble()
ra = QSolexaRightReverseAssemble()
138

139 140 141 142
def buildAlignment(direct,reverse):
    
    if len(direct)==0 or len(reverse)==0:
        return None
143
        
144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159
    la.seqA=direct 
    la.seqB=reverse 
    ali=la()
    ali.direction='left'
    
    ra.seqA=direct
    ra.seqB=reverse
    rali=ra()
    rali.direction='right'
    
    if ali.score < rali.score:
        ali=rali

    return ali
        
def alignmentIterator(sequences):
160 161
    
    for d,r in sequences:
162 163
        ali = buildAlignment(d,r)
        if ali is None:
164
            continue
165 166
        yield ali
        
167 168 169 170 171 172 173 174
      
def buildJoinedSequence(ali,options):
    d = ali[0].getRoot()
    r = ali[1].getRoot()

    
    r=r.complement()
    
175
    s = str(d) + str(r)
176
    
177
    seq = NucSequence(d.id + '_PairEnd',s,d.definition,**d)
178 179 180 181 182 183 184 185
    
    withqual = hasattr(d, 'quality') or hasattr(r, 'quality')
    
    if withqual:
        if hasattr(d, 'quality'):
            quality = d.quality
        else:
            quality = [10**-4] * len(d)
186
                    
187 188 189 190 191 192
        if hasattr(r, 'quality'):
            quality.extend(r.quality)
        else:
            quality.extend([10**-4] * len(r))
            
        seq.quality=quality
193
        
194
    seq['score']=ali.score
Frédéric Boyer committed
195
    seq['ali_dir']=ali.direction
196
    seq['mode']='joined'
197
    seq['pairend_limit']=len(d)
198 199 200
    
    return seq

201 202 203
    
    
if __name__ == '__main__':
204
    optionParser = getOptionManager([addSolexaPairEndOptions,addOutputFormatOption],checkFormat=True
205 206 207
                                    )
    
    (options, direct) = optionParser()
Eric Coissac committed
208 209 210
    
    #WARNING TO REMOVE : DIRTY PATCH !
    options.proba = None
211 212
    options.skip  = None
    options.only  = None
Eric Coissac committed
213

214 215 216 217

    options.sminL = None
    options.sminR = None

218
    
219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241
    if options.proba is not None and options.smin is None:
        p = open(options.proba)
        options.nullLeft  = cPickle.load(p)
        options.nullRight = cPickle.load(p)
        
        assert options.pvalue is not None, "You have to indicate a pvalue or an score min"
        
        i = int(math.floor((1.0 - options.pvalue) * len(options.nullLeft)))
                           
        if i == len(options.nullLeft):
            i-=1
        options.sminL = options.nullLeft[i]
        
        i = int(math.floor((1.0 - options.pvalue) * len(options.nullRight)))
        if i == len(options.nullRight):
            i-=1
        options.sminR = options.nullRight[i]
        
    if options.smin is not None:
        options.sminL = options.smin
        options.sminR = options.smin
        
        
242 243 244
    if options.reverse is None:
        sequences=cutDirectReverse(direct)
    else:
245
        reverse = allEntryIterator([options.reverse],options.readerIterator)
246
        sequences=seqPairs(direct,reverse)
247
        
248 249 250 251 252
    if options.indexfile is not None:
        indexfile = fastqIterator(options.indexfile)
    else:
        indexfile = None
        
253
    writer = sequenceWriterGenerator(options)
254
    
255 256 257 258 259 260 261 262 263 264 265 266 267 268 269
    ba = alignmentIterator(sequences)
                
    for ali in ba:
        
        if options.sminL is not None:
            if (   (ali.direction=='left' and ali.score > options.sminL) 
                or (ali.score > options.sminR)):
                consensus = buildConsensus(ali)
            else:
                consensus = buildJoinedSequence(ali, options)
                
            consensus['sminL']=options.sminL
            consensus['sminR']=options.sminR
        else:
            consensus = buildConsensus(ali)
270 271 272 273
            
        if indexfile is not None:
            i = str(indexfile.next())
            consensus['illumina_index']=i
274
                        
275
        writer(consensus)
276 277
        
        
278 279
        
        
280