__init__.py 10.2 KB
Newer Older
Eric Coissac committed
1 2 3
import pickle
import sys

Eric Coissac committed
4
from .indexer import Index
5

Eric Coissac committed
6 7 8
from .backtranslate.fasta import fasta
from . import samples
from orgasm.utils.dna import isDNA
Eric Coissac committed
9

10 11
import os
import os.path
Eric Coissac committed
12

13 14 15 16 17
def getOutput(config):
    '''
    @param config: a configuration object
    @return: the path for the output files
    '''
18
    logger = config['orgasm']['logger']
19 20 21
    
    dirname = "%s.oas" % config['orgasm']['outputfilename']
    if not os.path.exists(dirname):
22 23 24 25 26 27
        if os.path.exists('%s.oax' % config['orgasm']['outputfilename']):
            
            if config['buildgraph']['reformat']:
                #
                # Reformating outpout according to the new format
                #
Eric Coissac committed
28 29 30 31

                # Try to load the index to test its format
                index=getIndex(config)
                
32 33
                os.makedirs(dirname) 
                for extension in ['oax','omx','gml','stats',
Eric Coissac committed
34
                                  'intermediate.gml','.intermediate.oax',
35
                                  'path.gml',
Eric Coissac committed
36
                                  'path']:
37 38 39 40 41 42
                    if os.path.exists('%s.%s' % (config['orgasm']['outputfilename'],
                                                 extension)):
                        os.renames('%s.%s' % (config['orgasm']['outputfilename'],
                                              extension), 
                                   '%s.oas/assembling.%s' % (config['orgasm']['outputfilename'],
                                                             extension))
Eric Coissac committed
43 44 45 46

                # Try to load the seeds to test its format
                seeds=getSeeds(index, config)
               
47
                sys.exit(0)
Eric Coissac committed
48
               
49 50 51 52 53 54 55 56 57 58 59
            else:
                #
                # Exit with an error because the format is obsolete.
                #
                logger.error("The %s assembly is not stored according to new format" % config['orgasm']['outputfilename'])
                logger.error('Run the oa buildgraph command with the --reformat option')
                sys.exit(1)
        else:
            os.makedirs(dirname)
            
            
60 61 62
        
    return "%s/assembling" % dirname
    
63

64 65 66 67 68 69 70
def getIndex(config):
    '''
    
    @param config: a configuration object
    @return: an Indexer instance
    '''
    
71 72 73 74 75 76 77 78 79 80 81 82 83 84 85
    logger=config['orgasm']['logger']    
    output=config['orgasm']['indexfilename']
    dirname="%s.odx" % output
    
    if (   (not os.path.exists(dirname))       and
           (os.path.exists('%s.ofx' % output ) and
            os.path.exists('%s.ogx' % output ) and
            os.path.exists('%s.opx' % output ) and
            os.path.exists('%s.orx' % output )
       )):
        logger.error('Index %s uses the old format please run the command' % output)
        logger.error('    oa index --reformat %s' % output)
        sys.exit(1)
    
    return Index("%s/index" % dirname)
86

Eric Coissac committed
87
def getProbes(config):
Eric Coissac committed
88 89 90 91 92 93 94 95 96 97
    '''
    According to the configuration file and the --seeds option this function return the set
    of sequence probes to use
    
     
    @param config: a configuration object
    @return: a dictionary with gene name as key and the 
             nucleic or protein sequence as value
    @rtype: dict
    '''
98 99 100
    logger=config['orgasm']['logger']
    seeds =config['orgasm']['seeds']
    
101 102
    probes = {}
    
Eric Coissac committed
103 104 105 106 107 108 109 110 111 112 113
    if seeds is not None:
    
        for s in seeds:
            try:
                p = fasta(s)
                logger.info("Load probe sequences from file : %s" % s)
            except IOError:
                p = getattr(samples,s)
                logger.info("Load probe internal dataset : %s" % s)
                
            probes[s]=[p,{}]
114 115 116
            
    else:
        logger.info("No new probe set specified")
117 118
                            
    return probes
Eric Coissac committed
119

120 121 122 123 124 125 126 127 128 129 130 131
def getAdapters(config):
    '''
    According to the configuration file and the --seeds option this function return the set
    of sequence probes to use
    
     
    @param config: a configuration object
    @return: a dictionary with gene name as key and the 
             nucleic or protein sequence as value
    @rtype: dict
    '''
    logger=config['orgasm']['logger']
Eric Coissac committed
132 133
    adapt5 =config['orgasm']['adapt5']
    adapt3 =config['orgasm']['adapt3']
134 135
    
    try:
Eric Coissac committed
136 137
        p3 = fasta(adapt3).values()
        logger.info("Load 3' adapter sequences from file : %s" % adapt3)
138
    except IOError:
Eric Coissac committed
139 140
        p3 = getattr(samples,adapt3)
        logger.info("Load 3' adapter internal dataset : %s" % adapt3)
141
                    
Eric Coissac committed
142 143 144 145 146 147 148 149
    try:
        p5 = fasta(adapt5).values()
        logger.info("Load 5' adapter sequences from file : %s" % adapt5)
    except IOError:
        p5 = getattr(samples,adapt5)
        logger.info("Load 5' adapter internal dataset : %s" % adapt5)
                    
    return (p3,p5)
150 151


152
def getSeeds(index,config):
Eric Coissac committed
153
    # looks if the internal blast was already run            
154 155
    output = config['orgasm']['outputfilename'] 
    logger=config['orgasm']['logger']
156
    kup=-1 if config['orgasm']['kup'] is None else config['orgasm']['kup']
157
    clean=config['buildgraph']['clean']
158 159
    forceseeds=config['buildgraph']['forceseeds']
    
Eric Coissac committed
160

161
    filename="%s.oas/assembling.omx" % output
162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177
    
    #
    # Check if the seeds are not correctly placed in the oas directory
    #

    oldfilename=output+'.omx'
    
    if (not os.path.exists(filename) and 
        os.path.exists(oldfilename)):
        if config['seeds']['reformat']:
            os.renames(oldfilename, filename)
        else:
            logger.error('Seed matches are not stored following the new format')
            logger.error('Run the oa seeds command with the --reformat option')
            sys.exit(1)
    
178
    
179 180 181 182 183 184 185 186 187
    #
    # Look for seeds with the new probe sets
    #

    newprobes = getProbes(config)
    
    # 
    # Load already run probe sets
    #
188
    reformated=[]
189
    if not clean or not newprobes:
190 191 192 193
        try:
            with open(filename,'rb') as fseeds:
                probes = pickle.load(fseeds)
            logger.info("Load matches from previous run : %d probe sets restored" % len(probes))
194
    
195 196 197 198 199 200 201
            if not isinstance(list(probes.values())[0][0], dict):
                logger.error("Too old version of probes matches that cannot be reformated")
                logger.error("Generate a new probe match set using the command oa seeds --seeds command")
                logger.error("The old unsuable seed match has been erased")
                os.remove(filename)
                sys.exit(1)
                
202 203 204 205
            oldversion=False
            for probename in probes:
                s = probes[probename][1]
                if len(s[list(s.keys())[0]][0]) != 6:
206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222
                    if config['seeds']['reformat']:
                        oldversion=True
        
                        logger.warning("Old probe version save on the disk. Recomputes probes %s" % probename)
                        
                        p = probes[probename][0]
                        logger.info("    -> probe set: %s" % probename)
             
                        seeds = index.lookForSeeds(p,
                                                   mincov=config['orgasm']['seedmincov'],
                                                   kup=kup,
                                                   identity=config['orgasm']['identity'],
                                                   logger=logger)
                        
                        probes[probename][1]=seeds
                        
                        reformated.append(probename)
223
                    
224 225 226 227 228
                        logger.info("==> %d matches" % sum(len(seeds[i]) for i in seeds))
                    else:
                        logger.error('Seed matches are not stored following the new format')
                        logger.error('Run the oa seeds command with the --reformat option')
                        sys.exit(1)
229 230 231 232
            
            if oldversion:
                with open(filename,"wb") as fseeds:
                    pickle.dump(probes,fseeds)
233
    
234 235 236 237 238 239
            nm=0
            for k in probes:
                for m in probes[k][1].values():
                    nm+=len(m)
            logger.info("   ==> A total of : %d" % nm)
        except FileNotFoundError: 
240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269
            logger.info("No previous matches loaded")
            probes={}
    else:
        if os.path.exists(filename):
            logger.info("Cleaning previous matches")
            os.remove(filename)
        probes={}

    
    if newprobes:
        # --seeds option on the command line -> look for these seeds
        logger.info("Running probes matching against reads...")
        
        for probename in newprobes:
            p = newprobes[probename][0]
            logger.info("    -> probe set: %s" % probename)
 
            seeds = index.lookForSeeds(p,
                                        mincov=config['orgasm']['seedmincov'],
                                        kup=kup,
                                        identity=config['orgasm']['identity'],
                                        logger=logger)
            
            nmatches = sum(len(seeds[i]) for i in seeds)
                        
            logger.info("==> %d matches" % nmatches)
            
            if nmatches:            
                probes[probename]=[p,seeds]
    
270
    newprobes = list(newprobes.keys()) 
271 272 273 274 275 276 277 278 279
     
    if not probes:
        logger.info("No --seeds option specified and not previous matches stored")
        sys.exit(1)
        
        
    if newprobes:
        with open(filename,"wb") as fseeds:
            pickle.dump(probes,fseeds)
280 281

    logger.info("Match list :") 
Eric Coissac committed
282

283 284 285 286 287 288
    covmax=0

    for probename in probes:
        p = probes[probename][0]
        s = probes[probename][1]
        nuc = all([isDNA(k) for k in p.values()])
289
        #print(s[list(s.keys())[0]])
Eric Coissac committed
290
        nbmatch = [(k,
291 292
                    sum(x[2] for x in s[k]),
                    sum(x[2] for x in s[k])*index.getReadSize() / len(p[k]) / (1 if nuc else 3)) for k in s]
293
                 
Eric Coissac committed
294
        nbmatch.sort(key=lambda x:-x[2])
295
        
Eric Coissac committed
296 297 298
        for gene, nb, cov in nbmatch:
            logger.info("     %-10s : %5d (%5.1fx)" % (gene,nb,cov))
        coverage=nbmatch[0][2]
299 300
        if coverage > covmax:
            covmax=coverage
Eric Coissac committed
301
        
302 303
    if forceseeds:
        newprobes=None
Eric Coissac committed
304

305
    return covmax,probes,newprobes
Eric Coissac committed
306 307

#def reloadAssembling