Commit e944a2c4 by Eric Coissac

Add --upper-case option and a first version of obiselect

parent 94905d41
......@@ -102,6 +102,9 @@ class build_scripts(ori_build_scripts):
file, oldmode, newmode)
os.chmod(file, newmode)
class build_filters(build_scripts):
pass
def findPackage(root,base=None):
modules=[]
......@@ -127,6 +130,8 @@ def findCython(root,base=None,pyrexs=None):
for pyrex in glob.glob(path.join(root,module,'*.pyx')):
pyrexs.append(Extension('.'.join(base+[module,path.splitext(path.basename(pyrex))[0]]),[pyrex]))
pyrexs[-1].sources.extend(glob.glob(os.path.splitext(pyrex)[0]+'.*.c'))
print pyrexs[-1].sources
Main.compile([pyrex],timestamps=True,recursion=True)
pyrexs.extend(findCython(path.join(root,module),base+[module]))
......@@ -147,18 +152,26 @@ def findC(root,base=None,pyrexs=None):
return pyrexs
VERSION = '0.1.999'
VERSION = '0.2.000'
AUTHOR = 'Eric Coissac'
EMAIL = 'eric@coissac.eu'
URL = 'www.grenoble.prabi.fr/trac/OBITools'
LICENSE = 'CeCILL-V2'
SRC = 'src'
SRC = 'src'
FILTERSRC = 'textwrangler/filter'
SCRIPTS = glob.glob('%s/*.py' % SRC)
FILTERS = glob.glob('%s/*.py' % FILTERSRC)
def rootname(x):
return os.path.splitext(x.sources[0])[0]
if has_cython:
EXTENTION=findCython(SRC)
CEXTENTION=findC(SRC)
cython_ext = set(rootname(x) for x in EXTENTION)
EXTENTION.extend(x for x in CEXTENTION if rootname(x) not in cython_ext)
else:
EXTENTION=findC(SRC)
......
......@@ -3,8 +3,6 @@
from obitools.fasta import formatFasta
from obitools.alignment.ace import contigIterator
import sys
from logging import root,DEBUG
from obitools.options import getOptionManager
......
#!/usr/local/bin/python
import re
def igrep(lines,pattern,exclude=False):
"""
igrep emule la commande unix grep.
@param lines: une collection de string
@type lines: iterator on str
@param pattern: une expression reguliere
@type pattern: str
@return: un iterateur sur le sous ensemble
des lignes possedant au moins une occurence
du pattern
@rtype: iterator on str
"""
automate = re.compile(pattern)
for l in lines:
match = automate.search(l) is not None
out = (match and not exclude) or (not match and exclude)
if out:
yield l
def wordIterator(sequence,lword,step=1,endIncluded=False):
L = len(sequence)
if endIncluded:
pmax=L
else:
pmax = L - lword + 1
pos = xrange(0,pmax,step)
for x in pos:
yield sequence[x:x+lword]
def fastaFormat(title,sequence):
width = 50
fasta = '>' + title + '\n'
iwords= wordIterator(sequence,width,width,True)
words =[w for w in iwords]
seq = '\n'.join(words)
return fasta + seq
def isqlines(lines):
for l in igrep(lines,'^ '):
yield l
def icleanSqLines(lines):
sq = isqlines(lines)
bad = re.compile('[ \t\n0-9]+')
for line in sq:
yield bad.sub('',line)
def getSeq(lines):
sq = icleanSqLines(lines)
allsq = [x for x in sq]
seq = ''.join(allsq)
return seq
if __name__=='__main__':
# ici commence le main de mon script
import sys
import optparse
parser = optparse.OptionParser()
parser.add_option("-b", "--begin",
type="int", dest="begin",
default=1,
help="start position of the cutted fragment")
parser.add_option("-e", "--end",
type="int", dest="end",
help="end position of the cutted fragment")
parser.add_option("-t", "--title",
dest="title",
help="title line of the fasta output")
(options, args) = parser.parse_args()
if len(args) > 0:
filename = args[0]
f = open(filename)
else:
f = sys.stdin
sq = getSeq(f)
start = options.begin - 1
stop = options.end
subseq= sq[start:stop]
fasta = fastaFormat(options.title,subseq)
print fasta
#!/usr/local/bin/python
'''
Created on 6 juil. 2010
@author: coissac
'''
from obitools.format.options import addInOutputOption, printOutput
from obitools.options import getOptionManager
from obitools.format.sequence import autoSequenceIterator
def addSelectOptions(optionManager):
optionManager.add_option('-i','--identifier',
action="store", dest="identifier",
metavar="<FILENAME>",
type="string",
default=None,
help="file containing sample sequences to select on "
"on the base of their identifier")
if __name__ == '__main__':
optionParser = getOptionManager([addSelectOptions,addInOutputOption])
(options, entries) = optionParser()
idset=set(x.id for x in autoSequenceIterator(options.identifier))
for seq in entries:
if seq.id in idset:
printOutput(options,seq)
/* Generated by Cython 0.12 on Wed May 5 00:11:20 2010 */
/* Generated by Cython 0.12 on Sat Jun 19 12:17:43 2010 */
#define PY_SSIZE_T_CLEAN
#include "Python.h"
......
/* Generated by Cython 0.12 on Wed May 5 00:11:21 2010 */
/* Generated by Cython 0.12 on Sat Jun 19 12:17:43 2010 */
#define PY_SSIZE_T_CLEAN
#include "Python.h"
......
/* Generated by Cython 0.12 on Wed May 5 00:11:21 2010 */
/* Generated by Cython 0.12 on Sat Jun 19 12:17:43 2010 */
#define PY_SSIZE_T_CLEAN
#include "Python.h"
......
/* Generated by Cython 0.12 on Wed May 5 00:11:21 2010 */
/* Generated by Cython 0.12 on Mon Jun 28 17:32:13 2010 */
#define PY_SSIZE_T_CLEAN
#include "Python.h"
......@@ -139,6 +139,7 @@
#endif
#include <math.h>
#define __PYX_HAVE_API__obitools__align___lcs
#include "_lcs.h"
#include "stdlib.h"
#include "string.h"
......@@ -1752,10 +1753,10 @@ PyMODINIT_FUNC PyInit__lcs(void)
Py_DECREF(__pyx_t_1); __pyx_t_1 = 0;
/*--- Execution code ---*/
/* "/Users/coissac/encours/OBITools/src/obitools/align/_dynamic.pxd":1
* cdef import from "stdlib.h": # <<<<<<<<<<<<<<
* void* malloc(int size) except NULL
* void* realloc(void* chunk,int size) except NULL
/* "/Users/coissac/encours/OBITools/src/obitools/align/_lcs.pxd":1
* cdef extern from *: # <<<<<<<<<<<<<<
* ctypedef char* const_char_ptr "const char*"
*
*/
goto __pyx_L0;
__pyx_L1_error:;
......
#include "_lcs.h"
#include <string.h>
#include <xmmintrin.h>
#include <stdint.h>
#include <stdlib.h>
#include <limits.h>
// Expands the 8 low weight 8 bit unsigned integer
// to 8 16bits signed integer
static inline vInt16 expand_8_to_16(vUInt8 data)
{
vUInt8 mask_00= _mm_setzero_si128();
return _mm_unpacklo_epi8(data,mask_00);
}
// Load an SSE register with the next 8 first symbols
static inline vInt16 load8Symboles(const char* seq)
{
vUInt8 s;
s = _mm_loadu_si128((vUInt8*)seq);
return expand_8_to_16(s);
}
// Allocate a band allowing to align sequences of length : 'length'
static inline band_t* allocateBand(int length,band_t *band)
{
int size;
size = (length + 7) * sizeof(vInt16);
if (band==NULL)
{
band = malloc(sizeof(band_t));
if (!band)
return NULL;
if ((posix_memalign(&(band->band),16,size)) ||
(!(band->band)))
{
free(band);
return NULL;
}
band->size = length;
}
else if (length > band->size)
{
vInt16 *old = band->band;
if ((posix_memalign(&(band->band),16,size)) ||
(!(band->band)))
{
band->band=old;
return NULL;
}
band->size=length;
free(old);
}
// SHRT_MIN
return band;
}
int fastLCSScore(const char* seq1, const char* seq2,band_t **band)
{
int lseq1,lseq2; // length of the both sequences
int itmp; // tmp variables for swap
const char* stmp; //
int nbands; // Number of bands of width eight in the score matrix
int lastband; // width of the last band
vInt16 *mainband;
int16_t *scores;
// Made seq1 the shortest sequences
lseq1=strlen(seq1);
lseq2=strlen(seq2);
if (lseq1 > lseq2)
{
itmp=lseq1;
lseq1=lseq2;
lseq2=itmp;
stmp=seq1;
seq1=seq2;
seq2=stmp;
}
nbands = lseq1 / 8; // You have 8 short int in a SSE register
lastband = lseq1 - (nbands * 8);
}
#include <xmmintrin.h>
#include <stdint.h>
#define ALIGN __attribute__((aligned(16)))
typedef __m128i vUInt8;
typedef __m128i vInt16;
typedef struct {
int16_t size;
vInt16 *band;
} band_t;
int fastLCSScore(const char* seq1, const char* seq2,band_t **band);
cdef extern from *:
ctypedef char* const_char_ptr "const char*"
cdef import from "_lcs.h":
int fastLCSScore(const_char_ptr seq1, const_char_ptr seq2)
/* Generated by Cython 0.12 on Wed May 5 00:11:21 2010 */
/* Generated by Cython 0.12 on Sat Jun 19 12:17:43 2010 */
#define PY_SSIZE_T_CLEAN
#include "Python.h"
......
/* Generated by Cython 0.12 on Wed May 5 00:11:21 2010 */
/* Generated by Cython 0.12 on Sat Jun 19 12:17:43 2010 */
#define PY_SSIZE_T_CLEAN
#include "Python.h"
......
/* Generated by Cython 0.12 on Wed May 5 00:11:22 2010 */
/* Generated by Cython 0.12 on Sat Jun 19 12:17:43 2010 */
#define PY_SSIZE_T_CLEAN
#include "Python.h"
......
/* Generated by Cython 0.12 on Wed May 5 00:11:22 2010 */
/* Generated by Cython 0.12 on Sat Jun 19 12:17:43 2010 */
#define PY_SSIZE_T_CLEAN
#include "Python.h"
......
/* Generated by Cython 0.12 on Sun Jun 20 14:56:36 2010 */
/* Generated by Cython 0.12 on Thu Jun 24 00:04:53 2010 */
#define PY_SSIZE_T_CLEAN
#include "Python.h"
......
......@@ -39,13 +39,12 @@ inline static uchar_v hash4m128(uchar_v frag)
{
uchar_v words;
vUInt8 mask_06= _mm_set1_epi8(0x06); // charge le registre avec 16x le meme octet
vUInt8 mask_03= _mm_set1_epi8(0x03); // charge le registre avec 16x le meme octet
vUInt8 mask_7F= _mm_set1_epi8(0x7F);
vUInt8 mask_FC= _mm_set1_epi8(0xFC);
frag.m = _mm_and_si128(frag.m,mask_06); // and sur les 128 bits
frag.m = _mm_srli_epi64(frag.m,1); // shift logic a droite sur 2 x 64 bits
frag.m = _mm_and_si128(frag.m,mask_7F);
frag.m = _mm_and_si128(frag.m,mask_03); // and sur les 128 bits
words.m= _mm_slli_epi64(frag.m,2);
......@@ -98,7 +97,6 @@ int buildTable(const char* sequence, unsigned char *table, int *count)
// encode ascii sequence with A : 00 C : 01 T: 10 G : 11
// for(frag.m=loadm128(s);! anyzerom128(frag.m);s+=12,frag.m=loadm128(s))
for(frag.m=_mm_loadu_si128((vUInt8*)s);
! anyzerom128(frag.m);
s+=12,frag.m=_mm_loadu_si128((vUInt8*)s))
......
......@@ -144,7 +144,7 @@ class Alignment(list):
return reduce(lambda x,y: x and y,(z=='-' for z in self.getSite(key)),True)
def isGappedSite(self,key):
return reduce(lambda x,y: x or y,(z=='-' for z in self.getSite(key)),False)
return '-' in self.getSite(key)
def __str__(self):
l = len(self[0])
......
......@@ -105,6 +105,13 @@ class Taxonomy(object):
return ancetre
def betterCommonTaxon(self,error=1,*taxids):
lca = self.lastCommonTaxon(*taxids)
idx = self._index[lca]
sublca = [t[0] for t in self._taxonomy if t[2]==idx]
dist={}
return sublca
def getScientificName(self,taxid):
return self.findTaxonByTaxid(taxid)[3]
......@@ -139,6 +146,18 @@ class Taxonomy(object):
for x in imap(None,self._ranks,xrange(len(self._ranks))):
yield x
def groupTaxa(self,taxa,groupname):
t=[self.findTaxonByTaxid(x) for x in taxa]
a=set(x[2] for x in t)
assert len(a)==1,"All taxa must have the same parent"
newtaxid=max([2999999]+[x[0] for x in self._taxonomy if x[0]>=3000000 and x[0]<4000000])+1
newidx=len(self._taxonomy)
if 'MOTU' not in self._ranks:
self._ranks.append('MOTU')
rankid=self._ranks.index('MOTU')
self._taxonomy.append((newtaxid,rankid,a.pop(),groupname))
for x in t:
x[2]=newidx
class EcoTaxonomyDB(Taxonomy,EcoPCRDBFile):
'''
......
......@@ -138,7 +138,7 @@ def fastaAAIterator(file,tagparser=_parseFastaTag):
'''
return fastaIterator(file, AASequence,tagparser)
def formatFasta(data,gbmode=False):
def formatFasta(data,gbmode=False,upper=False):
'''
Convert a seqence or a set of sequences in a
string following the fasta format
......@@ -164,7 +164,10 @@ def formatFasta(data,gbmode=False):
definition=''
else:
definition=sequence.definition
frgseq = '\n'.join([seq[x:x+60] for x in xrange(0,len(seq),60)])
if upper:
frgseq = '\n'.join([seq[x:x+60].upper() for x in xrange(0,len(seq),60)])
else:
frgseq = '\n'.join([seq[x:x+60] for x in xrange(0,len(seq),60)])
info='; '.join(['%s=%s' % x for x in sequence.rawiteritems()])
if info:
info=info+';'
......
/* Generated by Cython 0.12 on Mon May 17 13:45:18 2010 */
/* Generated by Cython 0.12 on Sat Jun 19 12:17:44 2010 */
#define PY_SSIZE_T_CLEAN
#include "Python.h"
......
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -167,7 +167,7 @@ cpdef str errorToSangerFastQStr(array.array quality):
free(<void *>result)
return code
cpdef str formatFastq(object data, bint gbmode=False):
cpdef str formatFastq(object data, bint gbmode=False, bint upper=False):
cdef list rep=[]
cdef str seq
cdef str definition
......@@ -181,6 +181,8 @@ cpdef str formatFastq(object data, bint gbmode=False):
for sequence in data:
seq = str(sequence)
if upper:
seq=seq.upper()
if sequence.definition is None:
definition=''
else:
......
......@@ -7,7 +7,7 @@ cpdef printOutput(options,seq,output=sys.stdout):
if options.output is not None:
r=options.output(seq)
elif options.outputFormater is not None:
r=options.outputFormater(seq)
r=options.outputFormater(seq,upper=options.uppercase)
else:
r=formatFasta(seq)
......
......@@ -79,6 +79,10 @@ def addInputFormatOption(optionManager):
default=None,
const='pep',
help="input file is protein sequences")
optionManager.add_option('--upper-case',
action='store_true',dest='uppercase',
default=False,
help="Print sequences in upper case (defualt is lower case)")
def addOutputFormatOption(optionManager):
......
from obitools.fasta import fastaIterator
from obitools.fastq import fastqSangerIterator
from obitools.seqdb.embl.parser import emblIterator
from obitools.seqdb.genbank.parser import genbankIterator
from itertools import chain
def autoSequenceIterator(lineiterator):
first = lineiterator.next()
if first[0]==">":
reader=fastaIterator
elif first[0]=='@':
reader=fastqSangerIterator
elif first[0:3]=='ID ':
reader=emblIterator
elif first[0:6]=='LOCUS ':
reader=genbankIterator
else:
raise AssertionError,'file is not in fasta, fasta, embl, or genbank format'
input = reader(chain([first],lineiterator))
return input
......@@ -70,7 +70,8 @@ def uniqSequence(seqIterator,taxonomy=None,mergedKey=None,mergeIds=False):
s[mkey][skey]=seq[mkey][skey]
for key in seq.iterkeys():
if key in s and s[key]!=seq[key]:
# Merger proprement l'attribut merged s'il exist
if key in s and s[key]!=seq[key] and key!='count' and key[0:7]!='merged_' and key!='merged':
del(s[key])
......
/* Generated by Cython 0.12 on Wed May 5 00:11:22 2010 */
/* Generated by Cython 0.12 on Sat Jun 19 12:17:44 2010 */
#define PY_SSIZE_T_CLEAN
#include "Python.h"
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment