maurizio.py 8.86 KB
Newer Older
Alain Viari's avatar
MOD  
Alain Viari committed
1
#!/usr/bin/env python
Eric Coissac's avatar
Eric Coissac committed
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226
################################################
# setenv PYTHONPATH ./orgasm/build/lib.macosx-10.6-intel-2.7/
# echo $PYTHONPATH
# ./orgasm/build/lib.macosx-10.6-intel-2.7/
# python orgasm-1.py ./samples/GWM-261 ./samples/COX1_Machaon.fasta machaon

from orgasm.tango import tango, \
                         matchtoseed, \
                         cutLowCoverage, \
                         scaffold, \
                         coverageEstimate, \
                         fillGaps, \
                         path2fasta, \
                         unfoldAssembling, \
                         cutSNPs, \
                         selectGoodComponent, \
                         estimateDeadBrancheLength, pairEndedConstraints
from orgasm.indexer import Index
from orgasm.backtranslate.fasta import fasta
from orgasm.assembler import Assembler
#from orgasm.multialign import multiAlignReads, \
#                              consensus                              
import sys
import pickle    

def estimateMinRead(index,minoverlap,coverage):
    return (index.getReadSize() - minoverlap) * coverage / index.getReadSize()  / 2

if __name__ == '__main__':
    
    import argparse
    
    # Declare script options
    
    parser = argparse.ArgumentParser(description='Assembly program dedicated to organel genome reconstruction.')
    parser.add_argument('indexFilename', metavar='index', help='index root filename (produced by orgasmi)')
    
    parser.add_argument('seedsFilenameAA', metavar='seeds', help='fasta file containing seeds proteic sequences')

    parser.add_argument('outputFilename', metavar='output', help='output prefix')
    
    parser.add_argument('--minread', dest='minread', type=int, action='store', default=None, help='the minimum count of read to consider [default: %(default)d]')
    parser.add_argument('--minratio', dest='minratio', type=float, action='store', default=0.1, help='minimum ratio between occurrences of an extension and the occurrences of the most frequent extension to keep it. [default: %(default)f]')
    parser.add_argument('--mincov', dest='mincov', type=int, action='store', default=1, help='minimum occurrences of an extension to keep it. [default: %(default)d]')
    parser.add_argument('--minoverlap', dest='minoverlap', type=int, action='store', default=40, help='minimum length of the overlap between the sequence and reads to participate in the extension. [default: %(default)d]')
    parser.add_argument('--lowcomplexity', dest='lowcomplexity', action='store_true', default=False, help='Use also low complexity probes')
    parser.add_argument('--back', dest='back', type=int, action='store', default=500, help='the number of bases taken at the end of contigs to jump with pared-ends [default: %(default)d]')
    parser.add_argument('--snp', dest='snp', action='store_false', default=True, help='desactivate the SNP clearing mode')
    parser.add_argument('--logfile', dest='logfile', action='store_true', default=False, help='Create a logfile for the assembling')
    #parser.add_argument('--maxjump', dest='maxjump', type=int, action='store', default=1, help='COunt of pair end jump by extension branch')
    
    
    args = parser.parse_args()
    
    if args.logfile:
        logfile=open(args.outputFilename+'.log',"w")

    # Load the read index

    r = Index(args.indexFilename)
    
    # looks if seed protein are a file name or an internal list
    
    try:
        p = fasta(args.seedsFilenameAA)
        if args.logfile:
            print >>logfile,"Load protein sequences from file : %s" % args.seedsFilenameAA
    except IOError:
        import orgasm.samples
        p = getattr(orgasm.samples,args.seedsFilenameAA)
        if args.logfile:
            print >>logfile,"Load protein internal dataset : %s" % args.seedsFilenameAA
            
    if args.logfile:
        logfile.flush()
    
                
    # looks if the internal blast was already run            
                
    try:
        # yes, load the previous results
        x = pickle.load(open(args.outputFilename+'.match.omx'))
        if args.logfile:
            print >>logfile,"Load protein match from previous run : %d matches" % sum(len(x[i]) for i in x)
    except: 
        # no, run it and save the results
        if args.logfile:
            print >>logfile,"Running protein matching against reads...",
            logfile.flush()
        x = r.lookForSeeds(p,covmin=1)
        if args.logfile:
            print >>logfile," : %d matches" % sum(len(x[i]) for i in x)
            logfile.flush()
        pickle.dump(x,open(args.outputFilename+'.match.omx',"w"))

    # First esti;ation of the coverage based on protein matches
    
    common    = set(p.keys()) & set(x.keys())
    pcoverage = [len(x[i]) * r.getReadSize()  / len(p[i]) / 3 for i in common]
    pcoverage.sort()

    print >>sys.stderr,pcoverage

    #coverage=mode(pcoverage)
    coverage=pcoverage[len(pcoverage)/2]
    
    # according to the minread option estimate it from coverage or use the specified value
   
    if args.minread is None:
        minread = estimateMinRead(r,args.minoverlap,coverage)
    else:
        minread = args.minread

    print >>sys.stderr,"\nestimated coverage  : %d based on protein match (minread: %d)" % (coverage,minread)
    
    if args.logfile:
        print >>logfile,"\nestimated coverage  : %d based on protein match (minread: %d)" % (coverage,minread)
        logfile.flush()
        
        
    # Convert matches in sedd list    
    s = matchtoseed(x,r)
    
    # Create the assembler object
    asm = Assembler(r)
    
    # Run the first assembling pass
    a = tango(asm,s,mincov=args.mincov,
                    minread=minread,
                    minoverlap=args.minoverlap,
                    lowfilter=not args.lowcomplexity,
                    maxjump=0, cycle=1)
    
    # Clean small unsuccessful extensions
    asm.cleanDeadBranches(maxlength=10)
    
    # and too low covered assembling
    cutLowCoverage(asm,int(coverage/4),terminal=True)
    
    # cleanup snp bubble in the graph    
    if args.snp:
        cutSNPs(asm)
    
    # reestimate coverage
    score,length,coverage = coverageEstimate(asm)    

    if args.minread is None:
        minread = estimateMinRead(r,args.minoverlap,coverage)
    else:
        minread = args.minread

    print >>sys.stderr,"coverage estimated : %d based on %d bp (minread: %d)" %(coverage,length,minread)
    if args.logfile:
        print >>logfile,"coverage estimated : %d based on %d bp (minread: %d)" %(coverage,length,minread)
        logfile.flush()
        
        
    # Run the fill gap procedure    
    while fillGaps(asm,back=args.back,
                   minread=minread,
                   maxjump=0,
                   minoverlap=args.minoverlap,
                   cmincov=0,
                   gmincov=int(coverage/4),
                   snp=args.snp) > 0:
                   
        print >>sys.stderr
        print >>sys.stderr
        print >>sys.stderr
        print >>sys.stderr,"=================================================================="      
        print >>sys.stderr    # Clean small unsuccessful extensions
        asm.cleanDeadBranches(maxlength=10)
    
        # and too low covered assembling
        cutLowCoverage(asm,int(coverage/3),terminal=True)
    
    
        # reestimate coverage
        score,length,coverage = coverageEstimate(asm)    
        if args.minread is None:
            minread = estimateMinRead(r,args.minoverlap,coverage)
        else:
            minread = args.minread

        print >>sys.stderr,"coverage estimated : %d based on %d bp (minread: %d)" %(coverage,length,minread)

        cg = asm.compactAssembling(verbose=False)
        scaffold(asm,cg,minlink=100,back=int(args.back/2),addConnectedLink=False)
        print >>open(args.outputFilename+'.intermediate.gml','w'),cg.gml()

        print >>sys.stderr
        print >>sys.stderr,"=================================================================="      
        print >>sys.stderr
        
    cutSNPs(asm)
    
    smallbranches = estimateDeadBrancheLength(asm)
    asm.cleanDeadBranches(maxlength=smallbranches)
    cg = asm.compactAssembling(verbose=False)
    
    score,length,coverage = coverageEstimate(asm)
    if args.snp:
        cutSNPs(asm)
    cutLowCoverage(asm,int(coverage/2),terminal=True)
    cutLowCoverage(asm,int(coverage/10),terminal=False)
        
    cg = asm.compactAssembling(verbose=False)
    constraints = pairEndedConstraints(asm,cg,back=int(args.back))
     
        
    scaffold(asm,cg,minlink=5,back=int(args.back),addConnectedLink=False)
    print >>open(args.outputFilename+'.gml','w'),cg.gml()

    fastaout = open(args.outputFilename+'.fasta','w')
    
    gcc = selectGoodComponent(cg)
    for seeds in gcc:
        path = unfoldAssembling(asm,cg,seeds=seeds)
        fa = path2fasta(asm,cg,path[-1][0],
             identifier=args.outputFilename,
             back=args.back,
             minlink=5)
        print >>fastaout,fa