Commit e944a2c4 by Eric Coissac

Add --upper-case option and a first version of obiselect

parent 94905d41
......@@ -102,6 +102,9 @@ class build_scripts(ori_build_scripts):
file, oldmode, newmode)
os.chmod(file, newmode)
class build_filters(build_scripts):
pass
def findPackage(root,base=None):
modules=[]
......@@ -127,6 +130,8 @@ def findCython(root,base=None,pyrexs=None):
for pyrex in glob.glob(path.join(root,module,'*.pyx')):
pyrexs.append(Extension('.'.join(base+[module,path.splitext(path.basename(pyrex))[0]]),[pyrex]))
pyrexs[-1].sources.extend(glob.glob(os.path.splitext(pyrex)[0]+'.*.c'))
print pyrexs[-1].sources
Main.compile([pyrex],timestamps=True,recursion=True)
pyrexs.extend(findCython(path.join(root,module),base+[module]))
......@@ -147,18 +152,26 @@ def findC(root,base=None,pyrexs=None):
return pyrexs
VERSION = '0.1.999'
VERSION = '0.2.000'
AUTHOR = 'Eric Coissac'
EMAIL = 'eric@coissac.eu'
URL = 'www.grenoble.prabi.fr/trac/OBITools'
LICENSE = 'CeCILL-V2'
SRC = 'src'
SRC = 'src'
FILTERSRC = 'textwrangler/filter'
SCRIPTS = glob.glob('%s/*.py' % SRC)
FILTERS = glob.glob('%s/*.py' % FILTERSRC)
def rootname(x):
return os.path.splitext(x.sources[0])[0]
if has_cython:
EXTENTION=findCython(SRC)
CEXTENTION=findC(SRC)
cython_ext = set(rootname(x) for x in EXTENTION)
EXTENTION.extend(x for x in CEXTENTION if rootname(x) not in cython_ext)
else:
EXTENTION=findC(SRC)
......
......@@ -3,8 +3,6 @@
from obitools.fasta import formatFasta
from obitools.alignment.ace import contigIterator
import sys
from logging import root,DEBUG
from obitools.options import getOptionManager
......
#!/usr/local/bin/python
import re
def igrep(lines,pattern,exclude=False):
"""
igrep emule la commande unix grep.
@param lines: une collection de string
@type lines: iterator on str
@param pattern: une expression reguliere
@type pattern: str
@return: un iterateur sur le sous ensemble
des lignes possedant au moins une occurence
du pattern
@rtype: iterator on str
"""
automate = re.compile(pattern)
for l in lines:
match = automate.search(l) is not None
out = (match and not exclude) or (not match and exclude)
if out:
yield l
def wordIterator(sequence,lword,step=1,endIncluded=False):
L = len(sequence)
if endIncluded:
pmax=L
else:
pmax = L - lword + 1
pos = xrange(0,pmax,step)
for x in pos:
yield sequence[x:x+lword]
def fastaFormat(title,sequence):
width = 50
fasta = '>' + title + '\n'
iwords= wordIterator(sequence,width,width,True)
words =[w for w in iwords]
seq = '\n'.join(words)
return fasta + seq
def isqlines(lines):
for l in igrep(lines,'^ '):
yield l
def icleanSqLines(lines):
sq = isqlines(lines)
bad = re.compile('[ \t\n0-9]+')
for line in sq:
yield bad.sub('',line)
def getSeq(lines):
sq = icleanSqLines(lines)
allsq = [x for x in sq]
seq = ''.join(allsq)
return seq
if __name__=='__main__':
# ici commence le main de mon script
import sys
import optparse
parser = optparse.OptionParser()
parser.add_option("-b", "--begin",
type="int", dest="begin",
default=1,
help="start position of the cutted fragment")
parser.add_option("-e", "--end",
type="int", dest="end",
help="end position of the cutted fragment")
parser.add_option("-t", "--title",
dest="title",
help="title line of the fasta output")
(options, args) = parser.parse_args()
if len(args) > 0:
filename = args[0]
f = open(filename)
else:
f = sys.stdin
sq = getSeq(f)
start = options.begin - 1
stop = options.end
subseq= sq[start:stop]
fasta = fastaFormat(options.title,subseq)
print fasta
#!/usr/local/bin/python
'''
Created on 6 juil. 2010
@author: coissac
'''
from obitools.format.options import addInOutputOption, printOutput
from obitools.options import getOptionManager
from obitools.format.sequence import autoSequenceIterator
def addSelectOptions(optionManager):
optionManager.add_option('-i','--identifier',
action="store", dest="identifier",
metavar="<FILENAME>",
type="string",
default=None,
help="file containing sample sequences to select on "
"on the base of their identifier")
if __name__ == '__main__':
optionParser = getOptionManager([addSelectOptions,addInOutputOption])
(options, entries) = optionParser()
idset=set(x.id for x in autoSequenceIterator(options.identifier))
for seq in entries:
if seq.id in idset:
printOutput(options,seq)
/* Generated by Cython 0.12 on Wed May 5 00:11:20 2010 */
/* Generated by Cython 0.12 on Sat Jun 19 12:17:43 2010 */
#define PY_SSIZE_T_CLEAN
#include "Python.h"
......
/* Generated by Cython 0.12 on Wed May 5 00:11:21 2010 */
/* Generated by Cython 0.12 on Sat Jun 19 12:17:43 2010 */
#define PY_SSIZE_T_CLEAN
#include "Python.h"
......
/* Generated by Cython 0.12 on Wed May 5 00:11:21 2010 */
/* Generated by Cython 0.12 on Sat Jun 19 12:17:43 2010 */
#define PY_SSIZE_T_CLEAN
#include "Python.h"
......
/* Generated by Cython 0.12 on Wed May 5 00:11:21 2010 */
/* Generated by Cython 0.12 on Mon Jun 28 17:32:13 2010 */
#define PY_SSIZE_T_CLEAN
#include "Python.h"
......@@ -139,6 +139,7 @@
#endif
#include <math.h>
#define __PYX_HAVE_API__obitools__align___lcs
#include "_lcs.h"
#include "stdlib.h"
#include "string.h"
......@@ -1752,10 +1753,10 @@ PyMODINIT_FUNC PyInit__lcs(void)
Py_DECREF(__pyx_t_1); __pyx_t_1 = 0;
/*--- Execution code ---*/
/* "/Users/coissac/encours/OBITools/src/obitools/align/_dynamic.pxd":1
* cdef import from "stdlib.h": # <<<<<<<<<<<<<<
* void* malloc(int size) except NULL
* void* realloc(void* chunk,int size) except NULL
/* "/Users/coissac/encours/OBITools/src/obitools/align/_lcs.pxd":1
* cdef extern from *: # <<<<<<<<<<<<<<
* ctypedef char* const_char_ptr "const char*"
*
*/
goto __pyx_L0;
__pyx_L1_error:;
......
#include "_lcs.h"
#include <string.h>
#include <xmmintrin.h>
#include <stdint.h>
#include <stdlib.h>
#include <limits.h>
// Expands the 8 low weight 8 bit unsigned integer
// to 8 16bits signed integer
static inline vInt16 expand_8_to_16(vUInt8 data)
{
vUInt8 mask_00= _mm_setzero_si128();
return _mm_unpacklo_epi8(data,mask_00);
}
// Load an SSE register with the next 8 first symbols
static inline vInt16 load8Symboles(const char* seq)
{
vUInt8 s;
s = _mm_loadu_si128((vUInt8*)seq);
return expand_8_to_16(s);
}
// Allocate a band allowing to align sequences of length : 'length'
static inline band_t* allocateBand(int length,band_t *band)
{
int size;
size = (length + 7) * sizeof(vInt16);
if (band==NULL)
{
band = malloc(sizeof(band_t));
if (!band)
return NULL;
if ((posix_memalign(&(band->band),16,size)) ||
(!(band->band)))
{
free(band);
return NULL;
}
band->size = length;
}
else if (length > band->size)
{
vInt16 *old = band->band;
if ((posix_memalign(&(band->band),16,size)) ||
(!(band->band)))
{
band->band=old;
return NULL;
}
band->size=length;
free(old);
}
// SHRT_MIN
return band;
}
int fastLCSScore(const char* seq1, const char* seq2,band_t **band)
{
int lseq1,lseq2; // length of the both sequences
int itmp; // tmp variables for swap
const char* stmp; //
int nbands; // Number of bands of width eight in the score matrix
int lastband; // width of the last band
vInt16 *mainband;
int16_t *scores;
// Made seq1 the shortest sequences
lseq1=strlen(seq1);
lseq2=strlen(seq2);
if (lseq1 > lseq2)
{
itmp=lseq1;
lseq1=lseq2;
lseq2=itmp;
stmp=seq1;
seq1=seq2;
seq2=stmp;
}
nbands = lseq1 / 8; // You have 8 short int in a SSE register
lastband = lseq1 - (nbands * 8);
}
#include <xmmintrin.h>
#include <stdint.h>
#define ALIGN __attribute__((aligned(16)))
typedef __m128i vUInt8;
typedef __m128i vInt16;
typedef struct {
int16_t size;
vInt16 *band;
} band_t;
int fastLCSScore(const char* seq1, const char* seq2,band_t **band);
cdef extern from *:
ctypedef char* const_char_ptr "const char*"
cdef import from "_lcs.h":
int fastLCSScore(const_char_ptr seq1, const_char_ptr seq2)
/* Generated by Cython 0.12 on Wed May 5 00:11:21 2010 */
/* Generated by Cython 0.12 on Sat Jun 19 12:17:43 2010 */
#define PY_SSIZE_T_CLEAN
#include "Python.h"
......
/* Generated by Cython 0.12 on Wed May 5 00:11:21 2010 */
/* Generated by Cython 0.12 on Sat Jun 19 12:17:43 2010 */
#define PY_SSIZE_T_CLEAN
#include "Python.h"
......
/* Generated by Cython 0.12 on Wed May 5 00:11:22 2010 */
/* Generated by Cython 0.12 on Sat Jun 19 12:17:43 2010 */
#define PY_SSIZE_T_CLEAN
#include "Python.h"
......
/* Generated by Cython 0.12 on Wed May 5 00:11:22 2010 */
/* Generated by Cython 0.12 on Sat Jun 19 12:17:43 2010 */
#define PY_SSIZE_T_CLEAN
#include "Python.h"
......
/* Generated by Cython 0.12 on Sun Jun 20 14:56:36 2010 */
/* Generated by Cython 0.12 on Thu Jun 24 00:04:53 2010 */
#define PY_SSIZE_T_CLEAN
#include "Python.h"
......
......@@ -39,13 +39,12 @@ inline static uchar_v hash4m128(uchar_v frag)
{
uchar_v words;
vUInt8 mask_06= _mm_set1_epi8(0x06); // charge le registre avec 16x le meme octet
vUInt8 mask_03= _mm_set1_epi8(0x03); // charge le registre avec 16x le meme octet
vUInt8 mask_7F= _mm_set1_epi8(0x7F);
vUInt8 mask_FC= _mm_set1_epi8(0xFC);
frag.m = _mm_and_si128(frag.m,mask_06); // and sur les 128 bits
frag.m = _mm_srli_epi64(frag.m,1); // shift logic a droite sur 2 x 64 bits
frag.m = _mm_and_si128(frag.m,mask_7F);
frag.m = _mm_and_si128(frag.m,mask_03); // and sur les 128 bits
words.m= _mm_slli_epi64(frag.m,2);
......@@ -98,7 +97,6 @@ int buildTable(const char* sequence, unsigned char *table, int *count)
// encode ascii sequence with A : 00 C : 01 T: 10 G : 11
// for(frag.m=loadm128(s);! anyzerom128(frag.m);s+=12,frag.m=loadm128(s))
for(frag.m=_mm_loadu_si128((vUInt8*)s);
! anyzerom128(frag.m);
s+=12,frag.m=_mm_loadu_si128((vUInt8*)s))
......
......@@ -144,7 +144,7 @@ class Alignment(list):
return reduce(lambda x,y: x and y,(z=='-' for z in self.getSite(key)),True)
def isGappedSite(self,key):
return reduce(lambda x,y: x or y,(z=='-' for z in self.getSite(key)),False)
return '-' in self.getSite(key)
def __str__(self):
l = len(self[0])
......
......@@ -105,6 +105,13 @@ class Taxonomy(object):
return ancetre
def betterCommonTaxon(self,error=1,*taxids):
lca = self.lastCommonTaxon(*taxids)
idx = self._index[lca]
sublca = [t[0] for t in self._taxonomy if t[2]==idx]
dist={}
return sublca
def getScientificName(self,taxid):
return self.findTaxonByTaxid(taxid)[3]
......@@ -139,6 +146,18 @@ class Taxonomy(object):
for x in imap(None,self._ranks,xrange(len(self._ranks))):
yield x
def groupTaxa(self,taxa,groupname):
t=[self.findTaxonByTaxid(x) for x in taxa]
a=set(x[2] for x in t)
assert len(a)==1,"All taxa must have the same parent"
newtaxid=max([2999999]+[x[0] for x in self._taxonomy if x[0]>=3000000 and x[0]<4000000])+1
newidx=len(self._taxonomy)
if 'MOTU' not in self._ranks:
self._ranks.append('MOTU')
rankid=self._ranks.index('MOTU')
self._taxonomy.append((newtaxid,rankid,a.pop(),groupname))
for x in t:
x[2]=newidx
class EcoTaxonomyDB(Taxonomy,EcoPCRDBFile):
'''
......
......@@ -138,7 +138,7 @@ def fastaAAIterator(file,tagparser=_parseFastaTag):
'''
return fastaIterator(file, AASequence,tagparser)
def formatFasta(data,gbmode=False):
def formatFasta(data,gbmode=False,upper=False):
'''
Convert a seqence or a set of sequences in a
string following the fasta format
......@@ -164,7 +164,10 @@ def formatFasta(data,gbmode=False):
definition=''
else:
definition=sequence.definition
frgseq = '\n'.join([seq[x:x+60] for x in xrange(0,len(seq),60)])
if upper:
frgseq = '\n'.join([seq[x:x+60].upper() for x in xrange(0,len(seq),60)])
else:
frgseq = '\n'.join([seq[x:x+60] for x in xrange(0,len(seq),60)])
info='; '.join(['%s=%s' % x for x in sequence.rawiteritems()])
if info:
info=info+';'
......
/* Generated by Cython 0.12 on Mon May 17 13:45:18 2010 */
/* Generated by Cython 0.12 on Sat Jun 19 12:17:44 2010 */
#define PY_SSIZE_T_CLEAN
#include "Python.h"
......
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -167,7 +167,7 @@ cpdef str errorToSangerFastQStr(array.array quality):
free(<void *>result)
return code
cpdef str formatFastq(object data, bint gbmode=False):
cpdef str formatFastq(object data, bint gbmode=False, bint upper=False):
cdef list rep=[]
cdef str seq
cdef str definition
......@@ -181,6 +181,8 @@ cpdef str formatFastq(object data, bint gbmode=False):
for sequence in data:
seq = str(sequence)
if upper:
seq=seq.upper()
if sequence.definition is None:
definition=''
else:
......
/* Generated by Cython 0.12 on Mon May 17 11:01:39 2010 */
/* Generated by Cython 0.12.1 on Wed Jul 7 07:04:27 2010 */
#define PY_SSIZE_T_CLEAN
#include "Python.h"
......@@ -6,6 +6,7 @@
#ifndef Py_PYTHON_H
#error Python headers needed to compile C extensions, please install development version of Python.
#else
#ifndef PY_LONG_LONG
#define PY_LONG_LONG LONG_LONG
#endif
......@@ -17,6 +18,7 @@
#define PyDict_CheckExact(op) (Py_TYPE(op) == &PyDict_Type)
#define PyDict_Contains(d,o) PySequence_Contains(d,o)
#endif
#if PY_VERSION_HEX < 0x02050000
typedef int Py_ssize_t;
#define PY_SSIZE_T_MAX INT_MAX
......@@ -26,7 +28,9 @@
#define PyInt_AsSsize_t(o) PyInt_AsLong(o)
#define PyNumber_Index(o) PyNumber_Int(o)
#define PyIndex_Check(o) PyNumber_Check(o)
#define PyErr_WarnEx(category, message, stacklevel) PyErr_Warn(category, message)
#endif
#if PY_VERSION_HEX < 0x02060000
#define Py_REFCNT(ob) (((PyObject*)(ob))->ob_refcnt)
#define Py_TYPE(ob) (((PyObject*)(ob))->ob_type)
......@@ -36,17 +40,17 @@
#define PyType_Modified(t)
typedef struct {
void *buf;
PyObject *obj;
Py_ssize_t len;
Py_ssize_t itemsize;
int readonly;
int ndim;
char *format;
Py_ssize_t *shape;
Py_ssize_t *strides;
Py_ssize_t *suboffsets;
void *internal;
void *buf;
PyObject *obj;
Py_ssize_t len;
Py_ssize_t itemsize;
int readonly;
int ndim;
char *format;
Py_ssize_t *shape;
Py_ssize_t *strides;
Py_ssize_t *suboffsets;
void *internal;
} Py_buffer;
#define PyBUF_SIMPLE 0
......@@ -60,18 +64,22 @@
#define PyBUF_INDIRECT (0x0100 | PyBUF_STRIDES)
#endif
#if PY_MAJOR_VERSION < 3
#define __Pyx_BUILTIN_MODULE_NAME "__builtin__"
#else
#define __Pyx_BUILTIN_MODULE_NAME "builtins"
#endif
#if PY_MAJOR_VERSION >= 3
#define Py_TPFLAGS_CHECKTYPES 0
#define Py_TPFLAGS_HAVE_INDEX 0
#endif
#if (PY_VERSION_HEX < 0x02060000) || (PY_MAJOR_VERSION >= 3)
#define Py_TPFLAGS_HAVE_NEWBUFFER 0
#endif
#if PY_MAJOR_VERSION >= 3
#define PyBaseString_Type PyUnicode_Type
#define PyString_Type PyUnicode_Type
......@@ -80,6 +88,7 @@
#define PyBytes_Type PyString_Type
#define PyBytes_CheckExact PyString_CheckExact
#endif
#if PY_MAJOR_VERSION >= 3
#define PyInt_Type PyLong_Type
#define PyInt_Check(op) PyLong_Check(op)
......@@ -99,10 +108,13 @@
#else
#define __Pyx_PyNumber_Divide(x,y) PyNumber_Divide(x,y)
#define __Pyx_PyNumber_InPlaceDivide(x,y) PyNumber_InPlaceDivide(x,y)
#endif
#if PY_MAJOR_VERSION >= 3
#define PyMethod_New(func, self, klass) PyInstanceMethod_New(func)
#endif
#if !defined(WIN32) && !defined(MS_WINDOWS)
#ifndef __stdcall
#define __stdcall
......@@ -116,6 +128,7 @@
#else
#define _USE_MATH_DEFINES
#endif
#if PY_VERSION_HEX < 0x02050000
#define __Pyx_GetAttrString(o,n) PyObject_GetAttrString((o),((char *)(n)))
#define __Pyx_SetAttrString(o,n,a) PyObject_SetAttrString((o),((char *)(n)),(a))
......@@ -125,6 +138,7 @@
#define __Pyx_SetAttrString(o,n,a) PyObject_SetAttrString((o),(n),(a))
#define __Pyx_DelAttrString(o,n) PyObject_DelAttrString((o),(n))
#endif
#if PY_VERSION_HEX < 0x02050000
#define __Pyx_NAMESTR(n) ((char *)(n))
#define __Pyx_DOCSTR(n) ((char *)(n))
......@@ -140,12 +154,14 @@
#include <math.h>
#define __PYX_HAVE_API__obitools__format___format
#ifdef __GNUC__
#define INLINE __inline__
#elif _WIN32
#define INLINE __inline
#else
#define INLINE
#ifndef CYTHON_INLINE
#if defined(__GNUC__)
#define CYTHON_INLINE __inline__
#elif defined(_MSC_VER)
#define CYTHON_INLINE __inline
#else
#define CYTHON_INLINE
#endif
#endif
typedef struct {PyObject **p; char *s; const long n; const char* encoding; const char is_unicode; const char is_str; const char intern; } __Pyx_StringTabEntry; /*proto*/
......@@ -167,8 +183,8 @@ typedef struct {PyObject **p; char *s; const long n; const char* encoding; const
#define __Pyx_PyBytes_AsUString(s) ((unsigned char*) __Pyx_PyBytes_AsString(s))
#define __Pyx_PyBool_FromLong(b) ((b) ? (Py_INCREF(Py_True), Py_True) : (Py_INCREF(Py_False), Py_False))
static INLINE int __Pyx_PyObject_IsTrue(PyObject*);
static INLINE PyObject* __Pyx_PyNumber_Int(PyObject* x);
static CYTHON_INLINE int __Pyx_PyObject_IsTrue(PyObject*);
static CYTHON_INLINE PyObject* __Pyx_PyNumber_Int(PyObject* x);
#if !defined(T_PYSSIZET)
#if PY_VERSION_HEX < 0x02050000
......@@ -232,9 +248,9 @@ static INLINE PyObject* __Pyx_PyNumber_Int(PyObject* x);
#endif
#endif
static INLINE Py_ssize_t __Pyx_PyIndex_AsSsize_t(PyObject*);
static INLINE PyObject * __Pyx_PyInt_FromSize_t(size_t);
static INLINE size_t __Pyx_PyInt_AsSize_t(PyObject*);
static CYTHON_INLINE Py_ssize_t __Pyx_PyIndex_AsSsize_t(PyObject*);
static CYTHON_INLINE PyObject * __Pyx_PyInt_FromSize_t(size_t);
static CYTHON_INLINE size_t __Pyx_PyInt_AsSize_t(PyObject*);
#define __pyx_PyFloat_AsDouble(x) (PyFloat_CheckExact(x) ? PyFloat_AS_DOUBLE(x) : PyFloat_AsDouble(x))
......@@ -326,59 +342,64 @@ struct __pyx_opt_args_8obitools_6format_7_format_printOutput {
#define __Pyx_XGOTREF(r) do { if((r) != NULL) {__Pyx_GOTREF(r);} } while(0)
#ifndef CYTHON_PROFILE
#define CYTHON_PROFILE 1
#define CYTHON_PROFILE 1
#endif
#ifndef CYTHON_PROFILE_REUSE_FRAME
#define CYTHON_PROFILE_REUSE_FRAME 0
#define CYTHON_PROFILE_REUSE_FRAME 0
#endif
#if CYTHON_PROFILE
#include "compile.h"
#include "frameobject.h"
#include "traceback.h"
#include "compile.h"
#include "frameobject.h"
#include "traceback.h"
#if CYTHON_PROFILE_REUSE_FRAME
#define CYTHON_FRAME_MODIFIER static
#define CYTHON_FRAME_DEL
#else
#define CYTHON_FRAME_MODIFIER
#define CYTHON_FRAME_DEL Py_DECREF(__pyx_frame)
#endif
#if CYTHON_PROFILE_REUSE_FRAME
#define CYTHON_FRAME_MODIFIER static
#define CYTHON_FRAME_DEL
#else
#define CYTHON_FRAME_MODIFIER
#define CYTHON_FRAME_DEL Py_DECREF(__pyx_frame)
#endif
#define __Pyx_TraceCall(funcname, srcfile, firstlineno) \
static PyCodeObject *__pyx_frame_code = NULL; \
CYTHON_FRAME_MODIFIER PyFrameObject *__pyx_frame = NULL; \
int __Pyx_use_tracing = 0; \
if (unlikely(PyThreadState_GET()->use_tracing && PyThreadState_GET()->c_profilefunc)) { \
__Pyx_use_tracing = __Pyx_TraceSetupAndCall(&__pyx_frame_code, &__pyx_frame, funcname, srcfile, firstlineno); \
}
#define __Pyx_TraceDeclarations \
static PyCodeObject *__pyx_frame_code = NULL; \
CYTHON_FRAME_MODIFIER PyFrameObject *__pyx_frame = NULL; \
int __Pyx_use_tracing = 0;