Commit 2202ed0f by Eric Coissac

Refactoring of the unfold command

- Create a new submodule for output format
- Remove the *path* and *fasta* commands integrated into unfold
parent d086f56c
'''
Created on 26 nov. 2014
@author: boyer
'''
from orgasm import getOutput,getIndex, getSeeds
from orgasm.tango import restoreGraph, genesincontig
from orgasm.utils import tags2str
import os
import sys
__title__="Build a fasta file from the assembling graph"
default_config = { 'tags' : None,
'extension' : None
}
def addOptions(parser):
parser.add_argument(dest='orgasm:indexfilename', metavar='<index>',
help='index root filename (produced by the oa index command)')
parser.add_argument(dest='orgasm:outputfilename', metavar='<output>',
nargs='?',
default=None,
help='output prefix' )
parser.add_argument('--set-tag','-S', dest ='fasta:tags',
metavar='tag',
action='append',
default=[],
type=str,
help='Allows to add a tag in the OBITools format '
'to the header of the fasta sequences')
parser.add_argument('--no-5ext', dest='fasta:extension',
action='store_false',
default=True,
help="Do not add the 5' end of the sequences")
def fastaFormat(edge, title=None, nchar=60, extension=False,tags=[]):
if title is None:
title = 'Seq'
lheader = []
for k in ('weight', 'label', 'length', 'stemid', 'ingene'):
lheader.append('%s=%s'%(k, edge[k]))
l = ['; '.join(lheader)+"; " + tags2str(tags)]
l[0] = '>%s_%d %s'%(title, edge['stemid'], l[0])
seq = edge['sequence']
if extension:
seq=edge['head'].lower() + seq
lseq = len(seq)
i=0
while i < lseq:
l.append(seq[i:i+60].decode('ascii'))
i += 60
return '\n'.join(l)
def run(config):
logger=config['orgasm']['logger']
output = getOutput(config)
r = getIndex(config)
ecoverage,x,newprobes = getSeeds(r,config)
asm = restoreGraph(output+'.oax',r,x)
cg = asm.compactAssembling(verbose=False)
genesincontig(cg,r,x)
# fastaout = open(output+".fasta","w")
fastaout = sys.stdout
logger.info("Print the result as a fasta file")
edges = [cg.getEdgeAttr(*i) for i in cg.edgeIterator(edgePredicate = lambda e : cg.getEdgeAttr(*e)['stemid']>0)]
head, tail = os.path.split(config['orgasm']['outputfilename'])
c=1
for e in edges:
print(fastaFormat(e, "%s_%d" % (tail,c),
extension=config['fasta']['extension'],
tags=config['fasta']['tags']
),
file=fastaout)
c+=1
'''
Created on 28 sept. 2014
@author: coissac
'''
import sys
from orgasm import getOutput,getIndex, getSeeds
from orgasm.tango import restoreGraph, estimateFragmentLength, genesincontig,\
scaffold, path2fasta, parseFocedScaffold
__title__="Build a fasta file from a path across the assembling graph"
default_config = { "pathfile" : None
}
def addOptions(parser):
parser.add_argument(dest='orgasm:indexfilename', metavar='<index>',
help='index root filename (produced by the oa index command)')
parser.add_argument(dest='orgasm:outputfilename', metavar='<output>',
nargs='?',
default=None,
help='output prefix' )
parser.add_argument('--path', dest='path:path',
action='store',
metavar='<edgeid>',
type=int,
nargs='+',
required=True,
default=None,
help='A list of edge id separated by space add -- at the end of the path')
parser.add_argument('--force-scaffold', dest='unfold:fscaffold',
action='append',
default=None,
help='Force circular unfolding')
parser.add_argument('--back', dest='orgasm:back',
metavar='<insert size>',
type=int,
action='store',
default=None,
help='the number of bases taken at the end of '
'contigs to jump with pared-ends [default: <estimated>]')
parser.add_argument('--set-tag','-S', dest ='fasta:tags',
metavar='tag',
action='append',
default=[],
type=str,
help='Allows to add a tag in the OBITools format '
'to the header of the fasta sequences')
def run(config):
logger=config['orgasm']['logger']
output = getOutput(config)
forcedscaffold = parseFocedScaffold(config['unfold']['fscaffold'])
r = getIndex(config)
ecoverage,x,newprobes = getSeeds(r,config) # @UnusedVariable
asm = restoreGraph(output+'.oax',r,x)
logger.info("Evaluate fragment length")
meanlength,sdlength = estimateFragmentLength(asm)
if meanlength is not None:
logger.info("Fragment length estimated : %f pb (sd: %f)" % (meanlength,sdlength))
if config['orgasm']['back'] is not None:
back = config['orgasm']['back']
elif config['orgasm']['back'] is None and meanlength is not None:
back = int(meanlength + 4 * sdlength)
if back > 500:
back=500
else:
back = 300
cg = asm.compactAssembling(verbose=False)
genesincontig(cg,r,x)
scaffold(asm,
cg,
minlink=config['orgasm']['minlink'],
back=int(back),
addConnectedLink=False,
forcedLink=forcedscaffold,
logger=logger)
# fastaout = open(output+".fasta","w")
fastaout = sys.stdout
pathout = open(output+".path","w")
logger.info("Print the result as a fasta file")
c=1
seqid = config['orgasm']['outputfilename'].split('/')[-1]
path = config['path']['path']
logger.info('Built path : %s' % str(path))
fa = path2fasta(asm,cg,path,
identifier="%s_%d" % (seqid,c),
back=back,
minlink=config['orgasm']['minlink'],
logger=logger,
tags=config['fasta']['tags'])
print(fa,file=fastaout)
print(" ".join([str(x) for x in path]),file=pathout)
print(cg.gml(),file=open(output +'.path.gml','w'))
......@@ -6,15 +6,20 @@ Created on 28 sept. 2014
import sys
from orgasm import getOutput,getIndex, getSeeds
from orgasm.tango import restoreGraph, estimateFragmentLength, genesincontig,\
pathConstraints, scaffold, selectGoodComponent, unfoldAssembling, path2fasta,\
path2sam, parseFocedScaffold
pathConstraints, scaffold, selectGoodComponent, unfoldAssembling, path2fasta, \
parseFocedScaffold
from orgasm.format.sam import path2sam,buildReadSequence
from orgasm.version import version
__title__="Universal assembling graph unfolder"
default_config = { 'circular' : False,
'force' : False,
'format' : "fasta",
"fscaffold": []
"fscaffold": [],
"path" : None,
"contigs" : False
}
def addOptions(parser):
......@@ -27,16 +32,29 @@ def addOptions(parser):
help='output prefix' )
parser.add_argument('--path', dest='unfold:path',
action='store',
metavar='<edgeid>',
type=int,
nargs='+',
default=None,
help='A list of edge id separated by space add -- at the end of the path')
parser.add_argument('--contigs', dest='unfold:contigs',
action='store_true',
default=None,
help='Outputs each stem of the assembly graph')
parser.add_argument('--circular', dest='unfold:circular',
action='store_true',
default=None,
help='Wish a circular unfolding')
# parser.add_argument('--sam', dest='unfold:format',
# action='store_const',
# const="sam",
# default=None,
# help='Produce a SAM file of the assembly')
parser.add_argument('--sam', dest='unfold:format',
action='store_const',
const="sam",
default=None,
help='Produce a SAM file of the assembly')
parser.add_argument('--fasta', dest='unfold:format',
action='store_const',
......@@ -70,6 +88,40 @@ def addOptions(parser):
'to the header of the fasta sequences')
def buildPath(asm,compactgraph,back,minlink,config):
logger=config['orgasm']['logger']
logger.info("Select the good connected components")
gcc = selectGoodComponent(compactgraph)
if config['unfold']['circular']:
if config['unfold']['force']:
logger.info("Force circular sequence")
else:
logger.info("Unfolding in circular mode")
constraints = pathConstraints(asm,compactgraph,
back=int(back),minlink=minlink)
paths = []
for seeds in gcc:
path = unfoldAssembling(asm,compactgraph,
seeds=seeds,
constraints=constraints,
circular=config['unfold']['circular'],
force=config['unfold']['force'],
logger=logger)
paths.append(path[-1][0])
return(paths)
def contigPath(asm,compactgraph,config):
edges = [compactgraph.getEdgeAttr(*i)['stemid']
for i in compactgraph.edgeIterator(edgePredicate = lambda e : compactgraph.getEdgeAttr(*e)['class']=="sequence")]
return [(i,) for i in edges if i > 0]
def run(config):
logger=config['orgasm']['logger']
......@@ -115,58 +167,77 @@ def run(config):
forcedLink=forcedscaffold,
logger=logger)
constraints = pathConstraints(asm,cg,back=int(back),minlink=minlink)
#fastaout = open(output+".fasta","w")
fastaout = sys.stdout
pathout = open(output+".path","w")
paths=[]
#### --->
if config['unfold']['path'] is not None:
paths.append(tuple(config['unfold']['path']))
if config['unfold']['contigs']:
paths.extend(contigPath(asm,cg,config))
logger.info("Select the good connected components")
gcc = selectGoodComponent(cg)
if not paths:
paths=buildPath(asm, cg, back, minlink, config)
logger.info("Print the result as a %s file" % config['unfold']['format'])
if config['unfold']['circular']:
if config['unfold']['force']:
logger.info("Force circular sequence")
else:
logger.info("Unfolding in circular mode")
c=1
paths.sort()
seqid = config['orgasm']['outputfilename'].split('/')[-1]
for seeds in gcc:
path = unfoldAssembling(asm,cg,
seeds=seeds,
constraints=constraints,
circular=config['unfold']['circular'],
force=config['unfold']['force'],
logger=logger)
if config['unfold']['format']=="sam":
c=1
printedPath=set()
print("@HD\tVN:1.5\tSO:coordinate",file=fastaout)
print("@PG\tID:ORG.Asm\tVN:%s" % version)
for path in paths:
rpath = tuple(-i for i in reversed(path))
if path not in printedPath and rpath not in printedPath:
printedPath.add(tuple(path))
identifier="%s_%d" % (seqid,c)
seqlength = len(buildReadSequence(asm,cg,identifier=identifier,
path=path,back=back,logger=logger))
print("@SQ\tSN:%s\tLN:%d" % (identifier,seqlength)) # <=== !!! ATTENTION !!!
c+=1
c=1
printedPath=set()
for path in paths:
rpath = tuple(-i for i in reversed(path))
if path not in printedPath and rpath not in printedPath:
path = path[-1][0]
printedPath.add(tuple(path))
logger.info("Expanded path : %s" % str(path))
if config['unfold']['format']=="fasta":
fa = path2fasta(asm,cg,path,
identifier="%s_%d" % (seqid,c),
back=back,
minlink=config['orgasm']['minlink'],
logger=logger,
tags=config['fasta']['tags'])
if config['unfold']['format']=="sam":
fa = path2sam(asm,cg,path,
identifier="%s_%d" % (seqid,c),
back=back,
minlink=config['orgasm']['minlink'],
logger=logger,
tags=config['fasta']['tags'])
print(fa,file=fastaout)
print(" ".join([str(x) for x in path]),file=pathout)
c+=1
logger.info("Expanded path : %s" % str(path))
if config['unfold']['format']=="fasta":
fa = path2fasta(asm,cg,path,
identifier="%s_%d" % (seqid,c),
back=back,
minlink=config['orgasm']['minlink'],
logger=logger,
tags=config['fasta']['tags'])
if config['unfold']['format']=="sam":
fa = path2sam(asm,cg,path,
identifier="%s_%d" % (seqid,c),
back=back,
minlink=config['orgasm']['minlink'],
logger=logger,
tags=config['fasta']['tags'])
print(fa,file=fastaout)
print(" ".join([str(x) for x in path]),file=pathout)
c+=1
with open(output +'.path.gml','w') as gmlfile:
print(cg.gml(),file=gmlfile)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment