unfold.py 6.41 KB
Newer Older
1 2 3 4 5
'''
Created on 28 sept. 2014

@author: coissac
'''
6
import sys
7 8
from orgasm import getOutput,getIndex, getSeeds
from orgasm.tango import restoreGraph, estimateFragmentLength, genesincontig,\
9
    pathConstraints, scaffold, selectGoodComponent, unfoldAssembling, path2fasta,\
10
    path2sam, parseFocedScaffold
11 12 13 14

__title__="Universal assembling graph unfolder"

default_config = { 'circular' : False,
15
                   'force'    : False,
16 17
                   'format'   : "fasta",
                   "fscaffold": []
18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34
                 }

def addOptions(parser):
    parser.add_argument(dest='orgasm:indexfilename',  metavar='index', 
                        help='index root filename (produced by the oa index command)')
    
    parser.add_argument(dest='orgasm:outputfilename',     metavar='output', 
                                                          nargs='?', 
                                                          default=None,
                        help='output prefix' )
    
    
    parser.add_argument('--circular',         dest='unfold:circular', 
                                              action='store_true', 
                                              default=None, 
                        help='Wish a circular unfolding')

35 36 37 38 39
#     parser.add_argument('--sam',         dest='unfold:format', 
#                                               action='store_const', 
#                                               const="sam",
#                                               default=None, 
#                         help='Produce a SAM file of the assembly')
40 41 42 43 44 45 46

    parser.add_argument('--fasta',            dest='unfold:format', 
                                              action='store_const', 
                                              const="fasta",
                                              default=None, 
                        help='Produce a FASTA file of the assembly [default format]')

47 48 49 50 51
    parser.add_argument('--force',            dest='unfold:force', 
                                              action='store_true', 
                                              default=None, 
                        help='Force circular unfolding')

52 53 54 55 56
    parser.add_argument('--force-scaffold',   dest='unfold:fscaffold', 
                                              action='append', 
                                              default=None, 
                        help='Force circular unfolding')

57 58 59 60 61 62 63
    parser.add_argument('--back',             dest='orgasm:back', 
                                              type=int, 
                                              action='store', 
                                              default=None, 
                        help='the number of bases taken at the end of '
                             'contigs to jump with paired-ends [default: <estimated>]')
    
64 65 66 67 68 69 70 71
    parser.add_argument('--set-tag','-S',     dest ='fasta:tags', 
                                              metavar='tag', 
                                              action='append',
                                              default=[], 
                                              type=str, 
                        help='Allows to add a tag in the OBITools format '
                             'to the header of the fasta sequences')

72 73 74 75 76 77 78 79 80

def run(config):

    logger=config['orgasm']['logger']
    output = getOutput(config)

    if config['unfold']['force']:
        config['unfold']['circular']= True
        
81 82 83
    forcedscaffold = parseFocedScaffold(config['unfold']['fscaffold'])
    
         
84
    r = getIndex(config)
85
    coverage,x,newprobes = getSeeds(r,config)  # @UnusedVariable
86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112
    
    asm = restoreGraph(output+'.oax',r,x)

    logger.info("Evaluate fragment length")
    
    meanlength,sdlength = estimateFragmentLength(asm)
    
    if meanlength is not None:
        logger.info("Fragment length estimated : %f pb (sd: %f)" % (meanlength,sdlength))

    if config['orgasm']['back'] is not None:
        back = config['orgasm']['back']
    elif config['orgasm']['back'] is None and meanlength is not None:
        back = int(meanlength + 4 * sdlength)
        if back > 500:
            back=500
    else:
        back = 300
        
    logger.info("Evaluate pair-end constraints")
    
    cg = asm.compactAssembling(verbose=False)
    
    genesincontig(cg,r,x)

    minlink=config['orgasm']['minlink']
    scaffold(asm,cg,minlink=minlink,
113 114 115
             back=int(back),
             addConnectedLink=False,
             forcedLink=forcedscaffold,
116
             logger=logger)
117
     
118
    constraints = pathConstraints(asm,cg,back=int(back),minlink=minlink)
119 120 121 122 123 124 125 126 127

    #fastaout = open(output+".fasta","w")
    fastaout = sys.stdout
    pathout  = open(output+".path","w")
    
    
    logger.info("Select the good connected components")
    gcc = selectGoodComponent(cg)
    
128
    logger.info("Print the result as a %s file" % config['unfold']['format'])
129 130 131 132 133 134 135 136
    
    if config['unfold']['circular']:
        if config['unfold']['force']:
            logger.info("Force circular sequence")
        else:
            logger.info("Unfolding in circular mode")
        
    c=1
137 138
    seqid = config['orgasm']['outputfilename'].split('/')[-1]

139 140 141 142 143
    for seeds in gcc:
        path = unfoldAssembling(asm,cg,
                                seeds=seeds,
                                constraints=constraints,
                                circular=config['unfold']['circular'],
144 145
                                force=config['unfold']['force'],
                                logger=logger)
146 147
            
        path = path[-1][0]
148
        
149
        logger.info("Expanded path : %s" % str(path))
150 151
        print("====>",config['unfold']['format'])

152 153 154 155 156 157 158 159 160 161 162 163 164 165 166
        if config['unfold']['format']=="fasta":                   
            fa = path2fasta(asm,cg,path,
                 identifier="%s_%d" % (seqid,c),
                 back=back,
                 minlink=config['orgasm']['minlink'],
                 logger=logger,
                 tags=config['fasta']['tags'])
        if config['unfold']['format']=="sam":                   
            fa = path2sam(asm,cg,path,
                 identifier="%s_%d" % (seqid,c),
                 back=back,
                 minlink=config['orgasm']['minlink'],
                 logger=logger,
                 tags=config['fasta']['tags'])

167 168 169 170 171 172 173 174 175
        print(fa,file=fastaout)
        print(" ".join([str(x) for x in path]),file=pathout)

        c+=1
        
    with open(output +'.path.gml','w') as gmlfile:
        print(cg.gml(),file=gmlfile)