Commit 9b3166df by Eric Coissac

First draft of a sphinx doc

parent 18ced699
# Makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS  =
SPHINXBUILD = sphinx-build
PAPER       =
BUILDDIR    = build

# Internal variables.
# The paper-size option is looked up by name through $(PAPER): PAPER=a4
# selects PAPEROPT_a4, PAPER=letter selects PAPEROPT_letter, and an empty
# PAPER expands the undefined $(PAPEROPT_) to nothing.
PAPEROPT_a4     = -D latex_paper_size=a4
PAPEROPT_letter = -D latex_paper_size=letter
ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source

# None of the targets below names a real file.
.PHONY: help clean \
        html dirhtml pickle json htmlhelp qthelp \
        latex changes linkcheck doctest
# Default goal (first rule in the file): list the available targets.
help:
	@echo "Please use \`make <target>' where <target> is one of"
	@echo " html to make standalone HTML files"
	@echo " dirhtml to make HTML files named index.html in directories"
	@echo " pickle to make pickle files"
	@echo " json to make JSON files"
	@echo " htmlhelp to make HTML files and a HTML help project"
	@echo " qthelp to make HTML files and a qthelp project"
	@echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
	@echo " changes to make an overview of all changed/added/deprecated items"
	@echo " linkcheck to check all external links for integrity"
	@echo " doctest to run all doctests embedded in the documentation (if enabled)"
# Remove everything under the build directory.
# The first recipe line is a guard: if BUILDDIR is empty (e.g. `make clean
# BUILDDIR=`), the glob below would expand to `/*`, so make aborts instead.
# When BUILDDIR is set, the guard expands to nothing and is skipped.
# The leading `-` keeps the original best-effort behaviour: a failing rm
# does not fail the target.
clean:
	$(if $(strip $(BUILDDIR)),,$(error BUILDDIR is empty; refusing to run 'rm -rf /*'))
	-rm -rf $(BUILDDIR)/*
# Standalone HTML build. $@ is the target name, which doubles as the Sphinx
# builder name and the output subdirectory under $(BUILDDIR).
html:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
# HTML build with one index.html per directory. $@ (the target name) is used
# as both the builder name and the output subdirectory.
dirhtml:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
# Pickle build. $@ (the target name) is used as both the builder name and the
# output subdirectory.
pickle:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished; now you can process the pickle files."
# JSON build. $@ (the target name) is used as both the builder name and the
# output subdirectory.
json:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished; now you can process the JSON files."
# HTML Help project build. $@ (the target name) is used as both the builder
# name and the output subdirectory.
htmlhelp:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished; now you can run HTML Help Workshop with the" \
	      ".hhp project file in $(BUILDDIR)/htmlhelp."
# Qt help project build. $@ (the target name) is used as both the builder
# name and the output subdirectory.
# The quotes around qcollectiongenerator are escaped so the shell prints them;
# unescaped, they would end and reopen the string and vanish from the output.
# The `#` characters in the echoed examples sit inside a recipe line, so they
# are passed to the shell as text and are not make comments.
qthelp:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished; now you can run \"qcollectiongenerator\" with the" \
	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/OBITools.qhcp"
	@echo "To view the help file:"
	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/OBITools.qhc"
# LaTeX build; the paper size comes from PAPER through ALLSPHINXOPTS.
# $@ (the target name) is used as both the builder name and the output
# subdirectory.
latex:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
	@echo "Run \`make all-pdf' or \`make all-ps' in that directory to" \
	      "run these through (pdf)latex."
# Overview of changed/added/deprecated items. $@ (the target name) is used as
# both the builder name and the output subdirectory.
changes:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "The overview file is in $(BUILDDIR)/changes."
# External link check. $@ (the target name) is used as both the builder name
# and the output subdirectory.
linkcheck:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Link check complete; look for any errors in the above output " \
	      "or in $(BUILDDIR)/linkcheck/output.txt."
# Run the doctests embedded in the documentation. $@ (the target name) is
# used as both the builder name and the output subdirectory.
doctest:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo "Testing of doctests in the sources finished, look at the " \
	      "results in $(BUILDDIR)/doctest/output.txt."
# Sphinx build info version 1
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
config: c8cf90f918980d07f2eb7578215449db
tags: fbb0d17656682115ca4d033fb2f83ba1
File format conversions
=======================
Several OBITools exist for converting files from one format to another.
Because the :doc:`fasta file <fasta>` format is the central format for OBITools,
many of these converters convert to the :doc:`extended OBITools fasta format <obifasta>`.
Convert to extended OBITools fasta format
-----------------------------------------
.. toctree::
:maxdepth: 2
scripts/convert2fasta
scripts/ecopcr2fasta
Convert taxonomic data
----------------------
.. toctree::
:maxdepth: 2
scripts/buildOBITaxonomy
Convert to tabular data files
-----------------------------
.. toctree::
:maxdepth: 2
scripts/fasta2tab
\ No newline at end of file
The EMBL sequence format
========================
\ No newline at end of file
The fasta format
================
The fasta format is certainly the most widely used sequence file format.
This is certainly due to its great simplicity. It was originally created
for the *Lipman* and *Pearson* `FASTA program`_. In addition to
:ref:`the classical fasta format <classical-fasta>`, OBITools use several extended
versions of this format in which structured data are included in the title line.
.. toctree::
:maxdepth: 2
obifasta
.. _classical-fasta:
The classical fasta format
--------------------------
In fasta format, a sequence is represented by a title line beginning with a **>** character,
followed by the sequence itself, encoded following :doc:`iupac`. The sequence is usually split over several
lines of the same length (except for the last one) ::
>my_sequence this is my pretty sequence
ACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGT
GTGCTGACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGTGTTT
AACGACGTTGCAGTACGTTGCAGT
There is no special format for the title line, except that this line should be unique.
Usually the first word following the **>** character is considered as the sequence identifier.
The rest of the title line corresponds to a description of the sequence.
Several sequences can be concatenated in the same file. The entry for the next sequence
is simply appended after the entry for the previous one ::
>sequence_A this is my first pretty sequence
ACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGT
GTGCTGACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGTGTTT
AACGACGTTGCAGTACGTTGCAGT
>sequence_B this is my second pretty sequence
ACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGT
GTGCTGACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGTGTTT
AACGACGTTGCAGTACGTTGCAGT
>sequence_C this is my third pretty sequence
ACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGT
GTGCTGACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGTGTTT
AACGACGTTGCAGTACGTTGCAGT
.. _`FASTA program`: http://www.ncbi.nlm.nih.gov/pubmed/3162770?dopt=Citation
\ No newline at end of file
File formats usable with OBITools
=================================
The sequence files
------------------
Sequences can be stored in various formats. OBITools knows
some of them. The central format for sequence files manipulated by OBITools scripts
is the :doc:`fasta format <fasta>`. OBITools extends the fasta format by specifying
a syntax for including, in the definition line, data qualifying the sequence.
All file formats use the :doc:`IUPAC <iupac>` code for encoding nucleotides and
amino-acids.
.. toctree::
:maxdepth: 2
iupac
fasta
genbank
embl
The taxonomy files
------------------
Many OBITools are able to take into account taxonomic data. These data
are manipulated following the `NCBI taxonomy`_.
.. toctree::
:maxdepth: 2
taxdump
obitaxonomy
The ecoPCR files
----------------
ecoPCR_ is a program developed at LECA_. It simulates a PCR experiment by
selecting, in a sequence database, the sequences that simultaneously match two
primer sequences in a way that allows PCR amplification of a DNA region.
The ecoPrimer files
-------------------
The OBITools files
------------------
.. _ecoPCR: http://www.grenoble.prabi.fr/trac/ecoPCR
.. _LECA: http://www-leca.ujf-grenoble.fr
.. _`NCBI taxonomy`: http://www.ncbi.nlm.nih.gov/taxonomy
\ No newline at end of file
The genbank sequence format
===========================
\ No newline at end of file
.. OBITools documentation master file, created by
sphinx-quickstart on Tue Dec 8 21:30:02 2009.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
Welcome to OBITools's documentation!
====================================
Contents:
.. toctree::
:maxdepth: 2
The OBITools scripts <scripts>
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
The IUPAC code
==============
The International Union of Pure and Applied Chemistry (IUPAC_) defined
the standard code for representing protein or DNA sequences.
Nucleic IUPAC Code
------------------
======== =================================
**Code** **Nucleotide**
======== =================================
A Adenine
C Cytosine
G Guanine
T Thymine
U Uracil
R Purine (A or G)
Y Pyrimidine (C, T, or U)
M C or A
K T, U, or G
W T, U, or A
S C or G
B C, T, U, or G (not A)
D A, T, U, or G (not C)
H A, T, U, or C (not G)
V A, C, or G (not T, not U)
N Any base (A, C, G, T, or U)
======== =================================
Peptidic one and three letters IUPAC code
-----------------------------------------
============ ============= =======================================
**1-letter** **3-letters** **Amino acid**
============ ============= =======================================
A Ala Alanine
R Arg Arginine
N Asn Asparagine
D Asp Aspartic acid
C Cys Cysteine
Q Gln Glutamine
E Glu Glutamic acid
G Gly Glycine
H His Histidine
I Ile Isoleucine
L Leu Leucine
K Lys Lysine
M Met Methionine
F Phe Phenylalanine
P Pro Proline
S Ser Serine
T Thr Threonine
W Trp Tryptophan
Y Tyr Tyrosine
V Val Valine
B Asx Aspartic acid or Asparagine
Z Glx Glutamine or Glutamic acid
X Xaa Any amino acid
============ ============= =======================================
.. _IUPAC: http://www.iupac.org/
\ No newline at end of file
The extended OBITools fasta format
==================================
The *extended OBITools Fasta format* is a strict :doc:`fasta format file <fasta>`.
A file in *extended OBITools Fasta format* can be read by all programs
reading fasta files.
The only difference between the standard and the extended fasta format is the structure of the title
line. For OBITools, the title line is divided into three parts:
- Seqid : the sequence identifier
- key=value; : a set of key/value pairs
- the sequence definition
::
>my_sequence taxid=3456; direct=True; sample=A354; this is my pretty sequence
ACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGT
GTGCTGACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGTGTTT
AACGACGTTGCAGTACGTTGCAGT
Following these rules, the title line can be parsed :
- The sequence identifier of this sequence is *my_sequence*
- Three keys are assigned to this sequence :
- Key *taxid* with value *3456*
- Key *direct* with value *True*
- Key *sample* with value *A354*
- The definition of this sequence is *this is my pretty sequence*
A key value can be any valid python expression. If a key value cannot be evaluated as
a python expression, it is then treated as a simple string. Following this rule, the
taxid value is considered as an integer, the direct value as a boolean, and the sample
value, which is not a valid python expression, is considered as a string.
The OBITools formatted taxonomy
===============================
OBITools scripts
================
OBITools scripts are developed mainly for manipulating large sequence
files generated by the next generation sequencers.
Contents:
.. toctree::
:maxdepth: 2
Usable file formats with OBITools <formats>
File format conversions <conversions>
Convert NCBI taxdump to binary formatted OBITools taxonomy database
===================================================================
:command:`buildOBITaxonomy.py` -t <taxdump dir> -d <db name>
Convert a text dump directory of the NCBI Taxonomy database to the binary
format used by ecoPCR and many OBITools scripts. An archive corresponding to
this directory can be downloaded from the following URL:
`ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/ <ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/>`_
obitools common options
-----------------------
.. program:: obitools
.. cmdoption:: -h, --help
show this help message and exit
.. cmdoption:: --DEBUG
Set logging in debug mode
.. cmdoption:: --no-psyco
Don't use psyco even if it is installed
taxonomy related options
------------------------
.. program:: taxonomy
.. cmdoption:: -d <FILENAME>, --database=<FILENAME>
ecoPCR taxonomy Database name
.. cmdoption:: -t <FILENAME>, --taxonomy-dump=<FILENAME>
NCBI Taxonomy dump repository name
example
-------
for building a new taxonomy database named *ncbitaxonomy* from a taxdump dir ::
% curl ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz | tar zxf -
% buildOBITaxonomy.py --taxonomy-dump taxdump --database ncbitaxonomy
\ No newline at end of file
Convert sequence files to extended OBITools fasta format
========================================================
:command:`convert2fasta.py` [options] [filename 1] [filename 2] ...
Convert sequence files to the extended OBITools fasta format. If no
file names are specified, data are read from standard input.
obitools common options
-----------------------
.. program:: obitools
.. cmdoption:: -h, --help
show this help message and exit
.. cmdoption:: --DEBUG
Set logging in debug mode
.. cmdoption:: --no-psyco
Don't use psyco even if it is installed
convert2fasta.py specific options
---------------------------------
.. program:: convert2fasta.py
.. cmdoption:: --genbank
input file is in :doc:`genbank format <../genbank>`
.. cmdoption:: --embl
input file is in :doc:`embl format <../embl>`
.. cmdoption:: --fna
input file is in fasta nucleic format produced by 454 sequencer
pipeline
.. cmdoption:: --nuc
input file contains nucleic sequences
.. cmdoption:: --prot
input file contains protein sequences
example
-------
for converting a genbank file to fasta ::
% convert2fasta.py --genbank --nuc sequences.gb > sequences.fasta
Convert ecoPCR result files to extended OBITools fasta file
===========================================================
Convert extended OBITools fasta file to a tabular format
========================================================
\ No newline at end of file
The NCBI taxonomy dump files
============================
/**
* Sphinx stylesheet -- basic theme
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/
/* -- main layout ----------------------------------------------------------- */

/* Clears floats on both sides. */
div.clearer {
    clear: both;
}
/* -- relbar ---------------------------------------------------------------- */

div.related {
    width: 100%;
    font-size: 90%;
}

/* The heading inside the bar is not displayed. */
div.related h3 {
    display: none;
}

div.related ul {
    margin: 0;
    padding: 0 0 0 10px;
    list-style: none;
}

/* Entries are laid out inline. */
div.related li {
    display: inline;
}

/* Entries with class "right" float to the right edge. */
div.related li.right {
    float: right;
    margin-right: 5px;
}
/* -- sidebar --------------------------------------------------------------- */

div.sphinxsidebarwrapper {
    padding: 10px 5px 0 10px;
}

/* Fixed-width column floated left with a -100% left margin. */
div.sphinxsidebar {
    float: left;
    width: 230px;
    margin-left: -100%;
    font-size: 90%;
}

/* Top-level lists carry no bullets ... */
div.sphinxsidebar ul {
    list-style: none;
}

/* ... nested lists and "want-points" lists get square bullets. */
div.sphinxsidebar ul ul,
div.sphinxsidebar ul.want-points {
    margin-left: 20px;
    list-style: square;
}

div.sphinxsidebar ul ul {
    margin-top: 0;
    margin-bottom: 0;
}

div.sphinxsidebar form {
    margin-top: 10px;
}

div.sphinxsidebar input {
    border: 1px solid #98dbcc;
    font-family: sans-serif;
    font-size: 1em;
}

/* Images have no border. */
img {
    border: 0;
}
/* -- search page ----------------------------------------------------------- */

ul.search {
    margin: 10px 0 0 20px;
    padding: 0;
}

/* Each result shows file.png as a non-repeating background icon. */
ul.search li {
    padding: 5px 0 5px 20px;
    background-image: url(file.png);
    background-repeat: no-repeat;
    background-position: 0 7px;
}

ul.search li a {
    font-weight: bold;
}

/* Context excerpt shown under a result. */
ul.search li div.context {
    color: #888;
    margin: 2px 0 0 30px;
    text-align: left;
}

ul.keywordmatches li.goodmatch a {
    font-weight: bold;
}
/* -- index page ------------------------------------------------------------ */

table.contentstable {
    width: 90%;
}

table.contentstable p.biglink {
    line-height: 150%;
}

a.biglink {
    font-size: 1.3em;
}

/* Italic description text, slightly smaller than the surrounding text. */
span.linkdescr {
    font-style: italic;
    padding-top: 5px;
    font-size: 90%;
}
/* -- general index --------------------------------------------------------- */

table.indextable td {
    text-align: left;
    vertical-align: top;
}

table.indextable dl,
table.indextable dd {
    margin-top: 0;
    margin-bottom: 0;
}

table.indextable tr.pcap {
    height: 10px;
}

table.indextable tr.cap {
    margin-top: 10px;
    background-color: #f2f2f2;
}

/* Pointer cursor marks the toggler image as clickable. */
img.toggler {
    margin-right: 3px;
    margin-top: 3px;
    cursor: pointer;
}
/* -- general body styles --------------------------------------------------- */

/* Header links are hidden by default ... */
a.headerlink {
    visibility: hidden;
}

/* ... and become visible while the parent heading or dt is hovered. */
h1:hover > a.headerlink,
h2:hover > a.headerlink,
h3:hover > a.headerlink,
h4:hover > a.headerlink,
h5:hover > a.headerlink,
h6:hover > a.headerlink,
dt:hover > a.headerlink {
    visibility: visible;
}

div.body p.caption {
    text-align: inherit;
}
div.body td {
text-align: left;
}</