Commit 1d533f22 by Eric Coissac

Initial commit

parent 554c7110
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>OBITools</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.python.pydev.PyDevBuilder</name>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.python.pydev.pythonNature</nature>
</natures>
</projectDescription>
<?xml version="1.0" encoding="UTF-8"?>
<?eclipse-pydev version="1.0"?>
<pydev_project>
<pydev_pathproperty name="org.python.pydev.PROJECT_SOURCE_PATH">
<path>/OBITools/src</path>
</pydev_pathproperty>
<pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 2.4</pydev_property>
</pydev_project>
#!/usr/local/bin/python
import fileinput
import re
import getopt
import sys
from obitools.fasta import fastaIterator,writeFasta
def printHelp():
print "-----------------------------------"
print " fastaComplement.py"
print "-----------------------------------"
print "fastaComplement.py <fastafile>"
print " complement and reverse all sequence in the fasta file"
print "-----------------------------------"
print "-h --help : print this help"
print "-----------------------------------"
if __name__=='__main__':
fasta = fastaIterator(fileinput.input())
for seq in fasta:
print writeFasta(seq.complement())
\ No newline at end of file
#!/usr/local/bin/python
"""\
--------------------------------------------------------------------
fastaGrep.py
--------------------------------------------------------------------
fastaGrep.py [option] <argument>
--------------------------------------------------------------------
-h --help : print this help
-s --sequence=<pattern> : match the sequence with
a regular pattern
-i --identifier=<pattern> : match the sequence
identifier with a regular
pattern
-d --definition=<pattern> : match the sequence definition
with a regular pattern
-a --attribute=<name>:<pattern> : match the sequence attribute
<name> with a regular pattern
-l --lmin=## : keep sequences longer than
lmin
-L --lmax=## : keep sequences shorter than
lmax
-v : revert the sequence selection
--------------------------------------------------------------------
"""
import fileinput
import re
import getopt
import sys
from obitools.fasta import fastaIterator,writeFasta
from obitools.utils import checkHelpOption
def goodFastaGenerator():
o,filenames = getopt.getopt(sys.argv[1:],
'hs:i:d:a:l:L:v',
['help',
'sequence=',
'identifier=',
'definition=',
'attribute=',
'lmin=',
'lmax='])
sys.argv[1:]=filenames
haveSequencePattern = False
haveIdentifierPattern= False
haveDefinitionPattern= False
haveAttributePattern = False
haveLmin = False
haveLmax = False
isInverted = False
attributePatterns={}
for name,value in o:
if name in ('-s','--sequence'):
sequencePattern=re.compile(value,re.Ignore)
haveSequencePattern=True
elif name in ('-d','--definition'):
definitionPattern=re.compile(value)
haveDefinitionPattern=True
elif name in ('-i','--identifier'):
identifierPattern=re.compile(value)
haveIdentifierPattern=True
elif name in ('-a','--attribute'):
attribute,pattern=value.split(':',1)
attributePatterns[attribute]=re.compile(pattern)
haveAttributePattern=True
elif name in ('-l','--lmin'):
lmin=int(value)
haveLmin=True
elif name in ('-L','--lmax'):
lmax=int(value)
haveLmax=True
elif name in ('-v'):
isInverted=True
else:
raise ValueError,'Unknown option %s' % name
def sequenceSelector(seq):
good=True
if haveSequencePattern:
good = bool(sequencePattern.search(str(seq)))
if good and haveIdentifierPattern:
good = bool(identifierPattern.search(seq.id))
if good and haveDefinitionPattern:
good = bool(definitionPattern.search(seq.definition))
if good and haveAttributePattern:
good = (reduce(lambda x,y : x and y,
(bool(attributePatterns[p].search(seq[p]))
for p in attributePatterns
if p in seq),True)
and
reduce(lambda x,y : x and y,
(bool(p in seq)
for p in attributePatterns),True)
)
if good and haveLmin:
good = len(seq) >= lmin
if good and haveLmax:
good = len(seq) <= lmax
if isInverted:
good=not good
return good
return sequenceSelector
if __name__=='__main__':
checkHelpOption(__doc__)
goodFasta=goodFastaGenerator()
fasta = fastaIterator(fileinput.input())
for seq in fasta:
if goodFasta(seq):
print writeFasta(seq)
\ No newline at end of file
#!/usr/local/bin/python
"""\
-------------------------------------------------------
fastalength.py
-------------------------------------------------------
fastaLength.py [-h|--help] <fastafile>"
add length data on all sequences in the fasta file.
-------------------------------------------------------
-h --help : print this help
-------------------------------------------------------
"""
import fileinput
import re
import getopt
import sys
from obitools.fasta import fastaIterator,writeFasta
from obitools.utils import checkHelpOption
if __name__=='__main__':
checkHelpOption(__doc__)
fasta = fastaIterator(fileinput.input())
for seq in fasta:
seq['seqLength']=str(len(seq))
print writeFasta(seq)
\ No newline at end of file
#!/usr/local/bin/python
import fileinput
import re
import getopt
import sys
from obitools import complement
from obitools.fasta import fastaIterator,writeFasta
from obitools.fast import Fast
def strandOnRefGenerator(seqref,kup=2):
hashd = Fast(seqref,kup)
hashr = Fast(seqref.complement(),kup)
def isDirect(seq):
sdirect,p =hashd(seq)
sreverse,p=hashr(seq)
return sdirect > sreverse,sdirect,sreverse
return isDirect
def printHelp():
print "-----------------------------------"
print " fastaGrep.py"
print "-----------------------------------"
print "fastaGrep.py [option] <argument>"
print "-----------------------------------"
print "-h --help : print this help"
print "-r --reference=<fasta> : fasta file containing reference sequence"
print "-k --kup=## : word size used in fastn algorithm"
print "-----------------------------------"
if __name__=='__main__':
o,filenames = getopt.getopt(sys.argv[1:],
'hr:k:',
['help',
'reference=',
'kup:'])
kup = 6
sys.argv[1:]=filenames
for name,value in o:
if name in ('-h','--help'):
printHelp()
exit()
elif name in ('-r','--reference'):
rseq=fastaIterator(value).next()
elif name in ('-k','--kup'):
kup = int(value)
else:
raise ValueError,'Unknown option %s' % name
isDirect=strandOnRefGenerator(rseq,kup)
fasta = fastaIterator(fileinput.input())
for seq in fasta:
d,ds,rs=isDirect(seq)
seq['isDirect']=str(d)
seq['directScore']=str(ds)
seq['reverseScore']=str(rs)
seq['onReference']=rid
print writeFasta(seq)
\ No newline at end of file
#!/usr/local/bin/python
import fileinput
import re
import getopt
import sys
from obitools.fasta import fastaIterator,writeFasta
def annoteFastaGenerator():
o,filenames = getopt.getopt(sys.argv[1:],
'hs:i:d:a:l:L:t:v',
['help',
'sequence=',
'identifier=',
'definition=',
'attribute=',
'lmin=',
'lmax=',
'tag='])
sys.argv[1:]=filenames
haveSequencePattern = False
haveIdentifierPattern= False
haveDefinitionPattern= False
haveAttributePattern = False
haveLmin = False
haveLmax = False
haveTag = False
isInverted = False
attributePatterns={}
tags={}
for name,value in o:
if name in ('-h','--help'):
printHelp()
exit()
elif name in ('-s','--sequence'):
sequencePattern=re.compile(value,re.Ignore)
haveSequencePattern=True
elif name in ('-d','--definition'):
definitionPattern=re.compile(value)
haveDefinitionPattern=True
elif name in ('-i','--identifier'):
identifierPattern=re.compile(value)
haveIdentifierPattern=True
elif name in ('-a','--attribute'):
attribute,pattern=value.split(':',1)
attributePatterns[attribute]=re.compile(pattern)
haveAttributePattern=True
elif name in ('-t','--tag'):
attribute,data=value.split(':',1)
tags[attribute]=data.strip()
haveTag=True
elif name in ('-l','--lmin'):
lmin=int(value)
haveLmin=True
elif name in ('-L','--lmax'):
lmax=int(value)
haveLmax=True
elif name in ('-v'):
isInverted=True
else:
raise ValueError,'Unknown option %s' % name
def sequenceAnnotator(seq):
good=True
if haveSequencePattern:
good = bool(sequencePattern.search(str(seq)))
if good and haveIdentifierPattern:
good = bool(identifierPattern.search(seq.id))
if good and haveDefinitionPattern:
good = bool(definitionPattern.search(seqdefinition))
if good and haveAttributePattern:
good = (reduce(lambda x,y : x and y,
(bool(attributePatterns[p].search(seq[p]))
for p in attributePatterns
if p in seq),True)
and
reduce(lambda x,y : x and y,
(bool(p in seq)
for p in attributePatterns),True)
)
if good and haveLmin:
good = len(seq) >= lmin
if good and haveLmax:
good = len(seq) <= lmax
if isInverted:
good=not good
if good:
info.update(tags)
return info
assert haveTag,'You must specified at least one --tag option'
return sequenceAnnotator
def printHelp():
print "-----------------------------------"
print " fastaTag.py"
print "-----------------------------------"
print "fastaGrep.py [option] <argument>"
print "-----------------------------------"
print "-h --help : print this help"
print "-s --sequence=<pattern> : match the sequence with a regular pattern"
print "-i --identifier=<pattern> : match the sequence identifier with a regular pattern"
print "-d --definition=<pattern> : match the sequence definition with a regular pattern"
print "-a --attribute=<name>:<pattern> : match the sequence attribute <name> with a regular pattern"
print "-l --lmin=## : keep sequences longer than lmin"
print "-L --lmax=## : keep sequences shorter than lmax"
print "-v : revert the sequence selection"
print "-----------------------------------"
if __name__=='__main__':
annoteFasta=annoteFastaGenerator()
fasta = fastaIterator(fileinput.input())
for seq in fasta:
info=annoteFasta(seq)
print writeFasta(seq)
\ No newline at end of file
'''
'''
class BioSequence(object):
'''
BioSequence class is the base class for biological
sequence representation.
It provides storage of the sequence itself, of an identifier,
a defintion an manage a set of complementary information on
a key / value principle.
BioSequence is an abstract class and must be instanciate
from its subclasses
'''
def __init__(self,id,seq,definition=None,**info):
'''
BioSequence constructor.
@param id: sequence identifier
@type id: str
@param seq: the sequence
@type seq: str
@param definition: sequence defintion (optional)
@type definition: str
extra named parametters can be add to associtiate complementary
data to the sequence
'''
self.seq = seq
self.definition = definition
self.id = id
self._info = dict(info)
def __str__(self):
return self.seq
def __getitem__(self,key):
if isinstance(key, str):
return self._info[key]
elif isinstance(key, int):
return self.seq[key]
elif isinstance(key, slice):
subseq=self.seq[key]
info = dict(self._info)
if key.start is not None:
start = key.start +1
else:
start = 1
if key.stop is not None:
stop = key.stop+1
else:
stop = len(self.seq)
if key.step is not None:
step = key.step
else:
step = 1
info['cut']='[%d,%d,%s]' % (start,stop,step)
return bioSeqGenerator(self.id, subseq, self.definition,**info)
raise TypeError,'key must be an integer, a str or a slice'
def __setitem__(self,key,value):
self._info[key]=value
def __iter__(self):
return iter(self.seq)
def __len__(self):
return len(self.seq)
def isNucleotide(self):
raise NotImplementedError
class NucSequence(BioSequence):
_comp={'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A',
'R': 'Y', 'Y': 'R', 'K': 'M', 'M': 'K',
'S': 'S', 'W': 'W', 'B': 'V', 'D': 'H',
'H': 'D', 'V': 'B', 'N': 'N', 'U': 'A',
'-': '-'}
def complement(self):
cseq = [NucSequence._comp.get(x,'N') for x in self.seq]
cseq.reverse()
rep = NucSequence(self.id,''.join(cseq),self.definition,**self._info)
rep._info['complemented']=not rep._info.get('complemented',False)
return rep
def isNucleotide(self):
return True
class AASequence(BioSequence):
def isNucleotide(self):
return False
def _isNucSeq(text):
acgt = 0
notnuc = 0
ltot = len(text)
for c in text.upper():
if c in 'ACGT-':
acgt+=1
if c not in NucSequence._comp:
notnuc+=1
return notnuc==0 and float(acgt)/ltot > 0.8
def bioSeqGenerator(id,seq,definition=None,**info):
if _isNucSeq(seq):
return NucSequence(id,seq,definition,**info)
else:
return AASequence(id,seq,definition,**info)
\ No newline at end of file
from obitools import BioSequence
class Alignement(list):
def _assertData(self,data):
assert isinstance(data, BioSequence),'You must only add bioseq to an alignement'
if hasattr(self, '_alignlen'):
assert self._alignlen==len(data),'All aligned sequences must have the same length'
else:
self._alignlen=len(data)
return data
def append(self,data):
data = self._assertData(data)
list.append(self,data)
def __setitem__(self,index,data):
data = self._assertData(data)
list.__setitem__(self,index,data)
def getSite(self,key):
if isinstance(key,int):
return [x[key] for x in self]
def isFullGapSite(self,key):
return reduce(lambda x,y: x and y,(z=='-' for z in self.getSite(key)),True)
def isGappedSite(self,key):
return reduce(lambda x,y: x or y,(z=='-' for z in self.getSite(key)),False)
def alignmentReader(file,sequenceIterator):
seqs = sequenceIterator(file)
alignement = Alignement()
for seq in seqs:
alignement.append(seq)
return alignement
class DistanceMatrix(object):
def __init__(self,alignment):
'''
DistanceMatrix constructor.
@param alignment: aligment used to compute distance matrix
@type alignment: obitools.align.Alignment
'''
self.aligment = alignment
self.matrix = [[None] * (x+1) for x in xrange(len(alignment))]
def evaluateDist(self,x,y):
raise NotImplementedError
def __getitem__(self,key):
assert isinstance(key,(tuple,list)) and len(key)==2, \
'key must be a tuple or a list of two integers'
x,y = key
if y < x:
z=x
x=y
y=z
rep = self.matrix[y][x]
if rep is None:
rep = self.evaluateDist(x,y)
self.matrix[y][x] = rep
return rep
\ No newline at end of file
'''
Module dedicated to compute observed divergeances from
an alignment. No distance correction is applied at all
'''
from itertools import imap
from obitools.distances import DistanceMatrix
class PaireWisedGapRemoval(DistanceMatrix):
'''
Observed divergeance matrix from an alignment.
Gap are removed from the alignemt on a pairewised
sequence base
'''
def evaluateDist(self,x,y):
'''
Compute the observed divergeance from two sequences
of an aligment.
@attention: For performance purpose this method should
be directly used. use instead the __getitem__
method from DistanceMatrix.
@see DistanceMatrix.__getitem__
@param x: number of the fisrt sequence in the aligment
@type x:int
@param y: umber of the second sequence in the aligment
@type y: int
'''
seq1 = self.aligment[x]
seq2 = self.aligment[y]
diff,tot = reduce(lambda x,y: (x[0]+y,x[1]+1),
(z[0]!=z[1] for z in imap(None,seq1,seq2)
if '-' not in z),(0,0))
return float(diff)/tot
\ No newline at end of file
import sys
from itertools import imap,count
def writePhylipMatrix(matrix):
names = [x.id for x in matrix.aligment]
pnames= [x[:10] for x in names]
unicity={}
redundent=[]
for n in pnames:
unicity[n]=unicity.get(n,0)+1
redundent.append(unicity[n])
print redundent
for i,n,r in imap(None,count(),pnames,redundent):
alternate = n
if r > 1:
while alternate in pnames:
lcut = 9 - len(str(r))
alternate = n[:lcut]+ '_%d' % r
r+=1
pnames[i]='%-10s' % alternate
firstline = '%5d' % len(matrix.aligment)