Commit 1d533f22 authored by Eric Coissac's avatar Eric Coissac

Initial commit

parent 554c7110
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>OBITools</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.python.pydev.PyDevBuilder</name>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.python.pydev.pythonNature</nature>
</natures>
</projectDescription>
<?xml version="1.0" encoding="UTF-8"?>
<?eclipse-pydev version="1.0"?>
<pydev_project>
<pydev_pathproperty name="org.python.pydev.PROJECT_SOURCE_PATH">
<path>/OBITools/src</path>
</pydev_pathproperty>
<pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 2.4</pydev_property>
</pydev_project>
This diff is collapsed.
This diff is collapsed.
#!/usr/local/bin/python
import fileinput
import re
import getopt
import sys
from obitools.fasta import fastaIterator,writeFasta
def printHelp():
print "-----------------------------------"
print " fastaComplement.py"
print "-----------------------------------"
print "fastaComplement.py <fastafile>"
print " complement and reverse all sequence in the fasta file"
print "-----------------------------------"
print "-h --help : print this help"
print "-----------------------------------"
if __name__=='__main__':
fasta = fastaIterator(fileinput.input())
for seq in fasta:
print writeFasta(seq.complement())
\ No newline at end of file
#!/usr/local/bin/python
"""\
--------------------------------------------------------------------
fastaGrep.py
--------------------------------------------------------------------
fastaGrep.py [option] <argument>
--------------------------------------------------------------------
-h --help : print this help
-s --sequence=<pattern> : match the sequence with
a regular pattern
-i --identifier=<pattern> : match the sequence
identifier with a regular
pattern
-d --definition=<pattern> : match the sequence definition
with a regular pattern
-a --attribute=<name>:<pattern> : match the sequence attribute
<name> with a regular pattern
-l --lmin=## : keep sequences longer than
lmin
-L --lmax=## : keep sequences shorter than
lmax
-v : revert the sequence selection
--------------------------------------------------------------------
"""
import fileinput
import re
import getopt
import sys
from obitools.fasta import fastaIterator,writeFasta
from obitools.utils import checkHelpOption
def goodFastaGenerator():
o,filenames = getopt.getopt(sys.argv[1:],
'hs:i:d:a:l:L:v',
['help',
'sequence=',
'identifier=',
'definition=',
'attribute=',
'lmin=',
'lmax='])
sys.argv[1:]=filenames
haveSequencePattern = False
haveIdentifierPattern= False
haveDefinitionPattern= False
haveAttributePattern = False
haveLmin = False
haveLmax = False
isInverted = False
attributePatterns={}
for name,value in o:
if name in ('-s','--sequence'):
sequencePattern=re.compile(value,re.Ignore)
haveSequencePattern=True
elif name in ('-d','--definition'):
definitionPattern=re.compile(value)
haveDefinitionPattern=True
elif name in ('-i','--identifier'):
identifierPattern=re.compile(value)
haveIdentifierPattern=True
elif name in ('-a','--attribute'):
attribute,pattern=value.split(':',1)
attributePatterns[attribute]=re.compile(pattern)
haveAttributePattern=True
elif name in ('-l','--lmin'):
lmin=int(value)
haveLmin=True
elif name in ('-L','--lmax'):
lmax=int(value)
haveLmax=True
elif name in ('-v'):
isInverted=True
else:
raise ValueError,'Unknown option %s' % name
def sequenceSelector(seq):
good=True
if haveSequencePattern:
good = bool(sequencePattern.search(str(seq)))
if good and haveIdentifierPattern:
good = bool(identifierPattern.search(seq.id))
if good and haveDefinitionPattern:
good = bool(definitionPattern.search(seq.definition))
if good and haveAttributePattern:
good = (reduce(lambda x,y : x and y,
(bool(attributePatterns[p].search(seq[p]))
for p in attributePatterns
if p in seq),True)
and
reduce(lambda x,y : x and y,
(bool(p in seq)
for p in attributePatterns),True)
)
if good and haveLmin:
good = len(seq) >= lmin
if good and haveLmax:
good = len(seq) <= lmax
if isInverted:
good=not good
return good
return sequenceSelector
if __name__=='__main__':
checkHelpOption(__doc__)
goodFasta=goodFastaGenerator()
fasta = fastaIterator(fileinput.input())
for seq in fasta:
if goodFasta(seq):
print writeFasta(seq)
\ No newline at end of file
#!/usr/local/bin/python
"""\
-------------------------------------------------------
fastalength.py
-------------------------------------------------------
fastaLength.py [-h|--help] <fastafile>"
add length data on all sequences in the fasta file.
-------------------------------------------------------
-h --help : print this help
-------------------------------------------------------
"""
import fileinput
import re
import getopt
import sys
from obitools.fasta import fastaIterator,writeFasta
from obitools.utils import checkHelpOption
if __name__=='__main__':
checkHelpOption(__doc__)
fasta = fastaIterator(fileinput.input())
for seq in fasta:
seq['seqLength']=str(len(seq))
print writeFasta(seq)
\ No newline at end of file
#!/usr/local/bin/python
import fileinput
import re
import getopt
import sys
from obitools import complement
from obitools.fasta import fastaIterator,writeFasta
from obitools.fast import Fast
def strandOnRefGenerator(seqref,kup=2):
hashd = Fast(seqref,kup)
hashr = Fast(seqref.complement(),kup)
def isDirect(seq):
sdirect,p =hashd(seq)
sreverse,p=hashr(seq)
return sdirect > sreverse,sdirect,sreverse
return isDirect
def printHelp():
print "-----------------------------------"
print " fastaGrep.py"
print "-----------------------------------"
print "fastaGrep.py [option] <argument>"
print "-----------------------------------"
print "-h --help : print this help"
print "-r --reference=<fasta> : fasta file containing reference sequence"
print "-k --kup=## : word size used in fastn algorithm"
print "-----------------------------------"
if __name__=='__main__':
o,filenames = getopt.getopt(sys.argv[1:],
'hr:k:',
['help',
'reference=',
'kup:'])
kup = 6
sys.argv[1:]=filenames
for name,value in o:
if name in ('-h','--help'):
printHelp()
exit()
elif name in ('-r','--reference'):
rseq=fastaIterator(value).next()
elif name in ('-k','--kup'):
kup = int(value)
else:
raise ValueError,'Unknown option %s' % name
isDirect=strandOnRefGenerator(rseq,kup)
fasta = fastaIterator(fileinput.input())
for seq in fasta:
d,ds,rs=isDirect(seq)
seq['isDirect']=str(d)
seq['directScore']=str(ds)
seq['reverseScore']=str(rs)
seq['onReference']=rid
print writeFasta(seq)
\ No newline at end of file
#!/usr/local/bin/python
import fileinput
import re
import getopt
import sys
from obitools.fasta import fastaIterator,writeFasta
def annoteFastaGenerator():
o,filenames = getopt.getopt(sys.argv[1:],
'hs:i:d:a:l:L:t:v',
['help',
'sequence=',
'identifier=',
'definition=',
'attribute=',
'lmin=',
'lmax=',
'tag='])
sys.argv[1:]=filenames
haveSequencePattern = False
haveIdentifierPattern= False
haveDefinitionPattern= False
haveAttributePattern = False
haveLmin = False
haveLmax = False
haveTag = False
isInverted = False
attributePatterns={}
tags={}
for name,value in o:
if name in ('-h','--help'):
printHelp()
exit()
elif name in ('-s','--sequence'):
sequencePattern=re.compile(value,re.Ignore)
haveSequencePattern=True
elif name in ('-d','--definition'):
definitionPattern=re.compile(value)
haveDefinitionPattern=True
elif name in ('-i','--identifier'):
identifierPattern=re.compile(value)
haveIdentifierPattern=True
elif name in ('-a','--attribute'):
attribute,pattern=value.split(':',1)
attributePatterns[attribute]=re.compile(pattern)
haveAttributePattern=True
elif name in ('-t','--tag'):
attribute,data=value.split(':',1)
tags[attribute]=data.strip()
haveTag=True
elif name in ('-l','--lmin'):
lmin=int(value)
haveLmin=True
elif name in ('-L','--lmax'):
lmax=int(value)
haveLmax=True
elif name in ('-v'):
isInverted=True
else:
raise ValueError,'Unknown option %s' % name
def sequenceAnnotator(seq):
good=True
if haveSequencePattern:
good = bool(sequencePattern.search(str(seq)))
if good and haveIdentifierPattern:
good = bool(identifierPattern.search(seq.id))
if good and haveDefinitionPattern:
good = bool(definitionPattern.search(seqdefinition))
if good and haveAttributePattern:
good = (reduce(lambda x,y : x and y,
(bool(attributePatterns[p].search(seq[p]))
for p in attributePatterns
if p in seq),True)
and
reduce(lambda x,y : x and y,
(bool(p in seq)
for p in attributePatterns),True)
)
if good and haveLmin:
good = len(seq) >= lmin
if good and haveLmax:
good = len(seq) <= lmax
if isInverted:
good=not good
if good:
info.update(tags)
return info
assert haveTag,'You must specified at least one --tag option'
return sequenceAnnotator
def printHelp():
print "-----------------------------------"
print " fastaTag.py"
print "-----------------------------------"
print "fastaGrep.py [option] <argument>"
print "-----------------------------------"
print "-h --help : print this help"
print "-s --sequence=<pattern> : match the sequence with a regular pattern"
print "-i --identifier=<pattern> : match the sequence identifier with a regular pattern"
print "-d --definition=<pattern> : match the sequence definition with a regular pattern"
print "-a --attribute=<name>:<pattern> : match the sequence attribute <name> with a regular pattern"
print "-l --lmin=## : keep sequences longer than lmin"
print "-L --lmax=## : keep sequences shorter than lmax"
print "-v : revert the sequence selection"
print "-----------------------------------"
if __name__=='__main__':
annoteFasta=annoteFastaGenerator()
fasta = fastaIterator(fileinput.input())
for seq in fasta:
info=annoteFasta(seq)
print writeFasta(seq)
\ No newline at end of file
'''
'''
class BioSequence(object):
'''
BioSequence class is the base class for biological
sequence representation.
It provides storage of the sequence itself, of an identifier,
a defintion an manage a set of complementary information on
a key / value principle.
BioSequence is an abstract class and must be instanciate
from its subclasses
'''
def __init__(self,id,seq,definition=None,**info):
'''
BioSequence constructor.
@param id: sequence identifier
@type id: str
@param seq: the sequence
@type seq: str
@param definition: sequence defintion (optional)
@type definition: str
extra named parametters can be add to associtiate complementary
data to the sequence
'''
self.seq = seq
self.definition = definition
self.id = id
self._info = dict(info)
def __str__(self):
return self.seq
def __getitem__(self,key):
if isinstance(key, str):
return self._info[key]
elif isinstance(key, int):
return self.seq[key]
elif isinstance(key, slice):
subseq=self.seq[key]
info = dict(self._info)
if key.start is not None:
start = key.start +1
else:
start = 1
if key.stop is not None:
stop = key.stop+1
else:
stop = len(self.seq)
if key.step is not None:
step = key.step
else:
step = 1
info['cut']='[%d,%d,%s]' % (start,stop,step)
return bioSeqGenerator(self.id, subseq, self.definition,**info)
raise TypeError,'key must be an integer, a str or a slice'
def __setitem__(self,key,value):
self._info[key]=value
def __iter__(self):
return iter(self.seq)
def __len__(self):
return len(self.seq)
def isNucleotide(self):
raise NotImplementedError
class NucSequence(BioSequence):
_comp={'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A',
'R': 'Y', 'Y': 'R', 'K': 'M', 'M': 'K',
'S': 'S', 'W': 'W', 'B': 'V', 'D': 'H',
'H': 'D', 'V': 'B', 'N': 'N', 'U': 'A',
'-': '-'}
def complement(self):
cseq = [NucSequence._comp.get(x,'N') for x in self.seq]
cseq.reverse()
rep = NucSequence(self.id,''.join(cseq),self.definition,**self._info)
rep._info['complemented']=not rep._info.get('complemented',False)
return rep
def isNucleotide(self):
return True
class AASequence(BioSequence):
def isNucleotide(self):
return False
def _isNucSeq(text):
acgt = 0
notnuc = 0
ltot = len(text)
for c in text.upper():
if c in 'ACGT-':
acgt+=1
if c not in NucSequence._comp:
notnuc+=1
return notnuc==0 and float(acgt)/ltot > 0.8
def bioSeqGenerator(id,seq,definition=None,**info):
if _isNucSeq(seq):
return NucSequence(id,seq,definition,**info)
else:
return AASequence(id,seq,definition,**info)
\ No newline at end of file
from obitools import BioSequence
class Alignement(list):
def _assertData(self,data):
assert isinstance(data, BioSequence),'You must only add bioseq to an alignement'
if hasattr(self, '_alignlen'):
assert self._alignlen==len(data),'All aligned sequences must have the same length'
else:
self._alignlen=len(data)
return data
def append(self,data):
data = self._assertData(data)
list.append(self,data)
def __setitem__(self,index,data):
data = self._assertData(data)
list.__setitem__(self,index,data)
def getSite(self,key):
if isinstance(key,int):