From: Filippos Giannakos
Date: Mon, 5 Aug 2013 13:42:49 +0000 (+0300)
Subject: docs: Add initial docs.
X-Git-Url: https://code.grnet.gr/git/archipelago/commitdiff_plain/d58d20bef538008e0af07cf2ff658c3a3ebe85b3

docs: Add initial docs.

Add initial Sphinx docs for Archipelago. Also add a CI script for
automatic doc compilation.
---

diff --git a/ci/make_docs.sh b/ci/make_docs.sh
new file mode 100755
index 0000000..33de928
--- /dev/null
+++ b/ci/make_docs.sh
@@ -0,0 +1,14 @@
+#!/bin/sh
+set -e
+
+DOCS_DIR=$1
+
+cd docs
+make clean
+make html
+cd -
+
+mkdir -p $DOCS_DIR
+mv -n docs/_build/html/* $DOCS_DIR
+
+echo "Moved docs to: $(pwd)/$DOCS_DIR"
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 0000000..38fd314
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,153 @@
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+PAPER         =
+BUILDDIR      = _build
+
+# Internal variables.
+PAPEROPT_a4     = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+# the i18n builder cannot share the environment and doctrees with the others
+I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+
+.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
+
+help:
+	@echo "Please use \`make <target>' where <target> is one of"
+	@echo "  html       to make standalone HTML files"
+	@echo "  dirhtml    to make HTML files named index.html in directories"
+	@echo "  singlehtml to make a single large HTML file"
+	@echo "  pickle     to make pickle files"
+	@echo "  json       to make JSON files"
+	@echo "  htmlhelp   to make HTML files and a HTML help project"
+	@echo "  qthelp     to make HTML files and a qthelp project"
+	@echo "  devhelp    to make HTML files and a Devhelp project"
+	@echo "  epub       to make an epub"
+	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
+	@echo "  text       to make text files"
+	@echo "  man        to make manual pages"
+	@echo "  texinfo    to make Texinfo files"
+	@echo "  info       to make Texinfo files and run them through makeinfo"
+	@echo "  gettext    to make PO message catalogs"
+	@echo "  changes    to make an overview of all changed/added/deprecated items"
+	@echo "  linkcheck  to check all external links for integrity"
+	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
+
+clean:
+	-rm -rf $(BUILDDIR)/*
+
+html:
+	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+dirhtml:
+	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+singlehtml:
+	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
+	@echo
+	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
+
+pickle:
+	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+	@echo
+	@echo "Build finished; now you can process the pickle files."
+
+json:
+	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+	@echo
+	@echo "Build finished; now you can process the JSON files."
+
+htmlhelp:
+	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+	@echo
+	@echo "Build finished; now you can run HTML Help Workshop with the" \
+	      ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+qthelp:
+	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+	@echo
+	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
+	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/synnefo.qhcp"
+	@echo "To view the help file:"
+	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/synnefo.qhc"
+
+devhelp:
+	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
+	@echo
+	@echo "Build finished."
+	@echo "To view the help file:"
+	@echo "# mkdir -p $$HOME/.local/share/devhelp/synnefo"
+	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/synnefo"
+	@echo "# devhelp"
+
+epub:
+	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
+	@echo
+	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
+
+latex:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo
+	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
+	@echo "Run \`make' in that directory to run these through (pdf)latex" \
+	      "(use \`make latexpdf' here to do that automatically)."
+
+latexpdf:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo "Running LaTeX files through pdflatex..."
+	$(MAKE) -C $(BUILDDIR)/latex all-pdf
+	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+text:
+	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
+	@echo
+	@echo "Build finished. The text files are in $(BUILDDIR)/text."
+
+man:
+	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
+	@echo
+	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
+
+texinfo:
+	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+	@echo
+	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
+	@echo "Run \`make' in that directory to run these through makeinfo" \
+	      "(use \`make info' here to do that automatically)."
+
+info:
+	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+	@echo "Running Texinfo files through makeinfo..."
+	make -C $(BUILDDIR)/texinfo info
+	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
+
+gettext:
+	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
+	@echo
+	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
+
+changes:
+	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
+	@echo
+	@echo "The overview file is in $(BUILDDIR)/changes."
+
+linkcheck:
+	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+	@echo
+	@echo "Link check complete; look for any errors in the above output " \
+	      "or in $(BUILDDIR)/linkcheck/output.txt."
+
+doctest:
+	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
+	@echo "Testing of doctests in the sources finished, look at the " \
+	      "results in $(BUILDDIR)/doctest/output.txt."
diff --git a/docs/archipelago.rst b/docs/archipelago.rst
new file mode 100644
index 0000000..9c4507c
--- /dev/null
+++ b/docs/archipelago.rst
@@ -0,0 +1,138 @@
+.. _archipelago:
+
+Volume Service (archipelago)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Problem Overview
+================
+
+In an IaaS cloud, VMs are provisioned and destroyed thousands of times a day,
+so these operations must be fast and efficient. Copying data between nodes
+takes time and bandwidth, which can become a bottleneck; moreover, the copied
+data takes up extra space. Fast VM provisioning with zero data movement and
+deduplicated data is therefore a huge win.
+Furthermore, support for VM snapshots, which can be used as VM backups or to
+create new VMs that include the changes made in the original VM, is an asset
+in terms of functionality and productivity.
+
+However, all this functionality should not interfere with the ability to
+migrate VMs between hardware nodes, or with keeping the actual data redundant
+in case of hardware failures.
+
+Archipelago tackles this problem by introducing a storage layer that adds the
+necessary logic between the client that uses the volume and the actual
+storage. As an added benefit, the actual storage used does not matter, as long
+as the appropriate storage driver is used.
+
+Archipelago Overview
+====================
+
+As mentioned before, Archipelago is a distributed storage layer that provides
+volumes, along with the ability to snapshot them and clone them to create new
+ones, independently from the actual storage. The Archipelago software stack is
+deployed on each node where the volumes will be used, for example acting as
+disks for VMs. Each volume is then exposed as an independent block device and
+accessed as such. The data of each volume can reside on any supported storage
+type. The software stack takes care of any coordination or concurrency control
+needed between the nodes running Archipelago.
+
+Archipelago's main asset is that it decouples the
+composition/snapshot/cloning/deduplication logic from the storage backend
+used. It provides a software stack where the aforementioned logic and the
+volume handling are implemented and, through pluggable storage drivers, it can
+operate over different storage types. Archipelago therefore greatly reduces
+the need for each individual storage manufacturer or developer to implement
+the same set of features for their storage solution.
+
+Archipelago Architecture
+========================
+
+.. image:: images/archipelago-architecture.png
+    :target: _images/archipelago-architecture.png
+
+
+Archipelago consists of several components, both in userspace and in
+kernelspace, which communicate through a custom-built shared memory segment
+communication mechanism. This mechanism, which is called XSEG, also defines a
+common communication protocol between these components and is provided by the
+library ``libxseg``. Each Archipelago component, which can be a kernelspace
+block driver or a userspace process, is an *xseg peer*. The segment provides
+*ports*, to which each peer binds. The peer then uses its port to communicate
+with the other peers on the same segment. The communication consists of
+*requests* that are submitted to the receiver's port and answered back to the
+submitter's port.
+
+This form of communication allows us to develop a distinct component for each
+Archipelago operation, while all components speak exactly the same protocol,
+independently of their domain (userspace or kernelspace).
+
+Archipelago components
+**********************
+
+Each Archipelago component serves a distinct purpose and coordinates with the
+other components to provide the final service.
+
+These components are described below.
+
+Volume composer (vlmcd)
+#######################
+
+The volume composer is responsible for volume composition. Xsegbd devices
+direct I/O requests on a volume to the volume composer. The volume composer
+then consults the mapper to get the actual objects on which it will perform
+the appropriate I/O. It then directs I/O requests for each individual object
+to the blocker and waits for their completion. Finally, it composes the
+individual responses into a single response to the original volume request
+from the xsegbd.
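+
+The sketch below is purely illustrative (it is neither the actual vlmcd code
+nor the real libxseg API), but it shows the fan-out/gather pattern described
+above, assuming a hypothetical ``mapper.resolve()`` helper that returns the
+objects backing a volume range:
+
+.. code-block:: python
+
+    def compose_io(mapper, blocker, volume, offset, length, op):
+        """Split one volume request into per-object requests and compose
+        the individual replies into a single response (illustrative only)."""
+        pending = []
+        # Hypothetical call: ask the mapper which objects back this range.
+        for obj, obj_offset, chunk_len in mapper.resolve(volume, offset, length):
+            # Hypothetical call: forward a sub-request to the blocker that
+            # stores this object's data.
+            pending.append(blocker.submit(op, obj, obj_offset, chunk_len))
+        # Wait for every sub-request and gather the replies in order.
+        return [req.wait() for req in pending]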
+
+Mapper (mapperd)
+################
+
+The mapper is responsible for keeping and updating the mappings from volume
+offsets to the individual objects that actually hold the data. It is also
+responsible for creating new volumes, snapshotting existing ones and creating
+new volumes based on a previously captured snapshot (clones). It stores the
+mappings on the storage backend, from which it reads and/or updates them,
+keeping them cached when appropriate. It also ensures that no action on a
+volume takes place unless the necessary volume locks have been acquired.
+
+File blocker (filed)
+####################
+
+The file blocker is responsible for storing each object as a single file in a
+specified directory. It serves the requests for each object as they come from
+the volume composer and the mapper components.
+
+Rados blocker (sosd)
+####################
+
+The RADOS blocker is another form of blocker, which stores each object as a
+single object in a RADOS pool. It can be used instead of the file blocker to
+create and use disks over RADOS storage.
+
+Block devices (xsegbd)
+######################
+
+Each Archipelago volume is exposed as a block device in the system's /dev
+directory. These special devices are nothing more than just another peer,
+which forwards the requests through the shared memory segment to the volume
+composer for completion.
+
+
+In a nutshell, each xsegbd device communicates with the volume composer
+through the shared memory segment. The volume composer then asks the mapper
+for the objects on which it should perform the I/O. The mapper applies all the
+necessary logic (taking locks, etc.) and retrieves the mappings from storage
+by requesting the appropriate objects from the blocker responsible for holding
+the maps. It then performs any copy-on-write operations needed and returns the
+mappings to the volume composer. The volume composer finally communicates with
+the blocker responsible for holding the objects where the actual data reside,
+and composes the responses to answer the original request.
+
+Archipelago APIs
+================
+
+Archipelago allows users to manage and access volumes backed by various
+storage types. To do so, Archipelago provides multiple endpoints for the user
+to interact with (block device driver, qemu driver, user-provided process,
+command line tool, etc.).
+
+Archipelago Integration with Synnefo and Ganeti
+===============================================
+
+How everything ties together in a real-world cloud infrastructure.
+
diff --git a/docs/archipelago_deploy.rst b/docs/archipelago_deploy.rst
new file mode 100644
index 0000000..37135a8
--- /dev/null
+++ b/docs/archipelago_deploy.rst
@@ -0,0 +1,251 @@
+Archipelago management
+======================
+
+This section describes basic Archipelago management and configuration.
+
+Archipelago installation
+************************
+
+Archipelago consists of the following packages:
+
+* ``libxseg0``: libxseg, used to communicate over shared memory segments
+* ``python-xseg``: Python bindings for libxseg
+* ``archipelago-kernel-dkms``: contains the Archipelago kernel modules that
+  provide block devices to be used as VM disks
+* ``archipelago-modules-source``: contains the Archipelago kernel module
+  sources, to build deb packages with the help of module-assistant
+* ``python-archipelago``: the Archipelago Python module; includes the
+  archipelago and vlmc functionality.
+* ``archipelago``: user space tools and peers for Archipelago management and
+  volume composition
+* ``archipelago-rados``: user space storage driver that enables RADOS support
+* ``archipelago-ganeti``: Ganeti ExtStorage scripts that enable Ganeti to
+  provision VMs over Archipelago
+
+
+Installing ``archipelago-ganeti`` from the apt repository should fetch all the
+necessary dependencies, based on the dkms infrastructure. Also install
+``archipelago-rados`` to enable the RADOS storage backend.
+
+.. code-block:: console
+
+    $ apt-get install archipelago-ganeti archipelago-rados
+
+.. tip:: Archipelago does not start automatically after installation. Please
+   review the configuration file, make any appropriate changes to the
+   default configuration (e.g. default max segment size) and start it
+   manually.
+
+If a dkms-based install is not desired, build your own archipelago-modules
+package by installing archipelago-modules-source and running:
+
+.. code-block:: console
+
+    $ m-a build --text-mode --kvers-list "target kernel to build" archipelago-modules
+
+.. note:: The kernel modules require a Linux kernel >= 3.2.
+
+.. warning:: Archipelago currently supports only the x86_64 architecture.
+
+Archipelago configuration
+*************************
+
+The Archipelago configuration file is located at:
+``/etc/archipelago/archipelago.conf``
+
+
+``SEGMENT_PORTS``
+  **Description** : Maximum number of ports in the segment.
+
+``SEGMENT_SIZE``
+  **Description** : Shared memory size, used for IPC.
+
+``XSEGBD_START``
+  **Description** : Start port of the xsegbd peers.
+
+``XSEGBD_END``
+  **Description** : End port of the xsegbd peers.
+
+``VTOOL_START``
+  **Description** : Start port of the vlmc tool.
+
+``VTOOL_END``
+  **Description** : End port of the vlmc tool.
+
+``roles``
+  **Description** : A list of (role_name, role_type) tuples, which is used to
+  deploy the Archipelago user space peers. Order matters.
+
+``role_name { 'setting': value }``
+  **Description** : A Python dictionary which holds the parameters of the
+  userspace peers.
+
+Common peer options:
+  * ``portno_start``: Start port of the peer.
+  * ``portno_end``: End port of the peer.
+  * ``log_level``: Logging level for the peer. Available logging levels are 0-3.
+  * ``nr_ops``: Number of operations each peer can have in flight.
+
+.. * ``logfile``:
+.. * ``pidfile``:
+
+Filed-specific options:
+  * ``nr_threads``: Number of I/O threads used to serve requests.
+  * ``archip_dir``: Directory where the files will reside.
+  * ``fdcache``: Number of file descriptors to be kept open.
+
+Rados-specific options:
+  * ``nr_threads``: Number of threads used to serve requests.
+  * ``pool``: RADOS pool where the objects will be stored.
+
+Mapper-specific options:
+  * ``blockerb_port``: Port for communication with the blocker responsible for
+    the data blocks.
+  * ``blockerm_port``: Port for communication with the blocker responsible for
+    the maps.
+
+Vlmc-specific options:
+  * ``blocker_port``: Port for communication with the blocker responsible for
+    the data blocks.
+  * ``mapper_port``: Port for communication with the mapper.
+
+Archipelago commands
+********************
+
+``archipelago`` provides the basic management functionality for Archipelago.
+
+Usage:
+
+.. code-block:: console
+
+    $ archipelago [-u] command
+
+
+Currently it supports the following commands:
+
+* ``start [role]``
+  Starts Archipelago or the specified peer.
+* ``stop [role]``
+  Stops Archipelago or the specified peer.
+* ``restart [role]``
+  Restarts Archipelago or the specified peer.
+* ``status``
+  Shows the status of Archipelago.
+
+``role`` is one of the roles defined in the configuration file.
+
+``start``, ``stop`` and ``restart`` can be combined with the ``-u / --user``
+option to affect only the userspace peers supporting Archipelago.
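+
+For example, a typical management session might look like the following. The
+``mapperd`` role name is only illustrative; it has to match a role defined in
+the configuration file:
+
+.. code-block:: console
+
+    $ archipelago start              # start Archipelago and all of its peers
+    $ archipelago status             # show the current status
+    $ archipelago restart mapperd    # restart only the peer of a single role
+    $ archipelago -u restart         # restart only the userspace peers
+    $ archipelago stop               # stop Archipelago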
+
+Archipelago advanced commands
+*****************************
+
+The ``vlmc`` tool provides a way to interact with Archipelago volumes.
+
+Usage:
+
+.. code-block:: console
+
+    $ vlmc command [args]
+
+Available commands:
+
+* **map**: maps the volume to an xsegbd device.
+
+  Usage: ``$ vlmc map <volumename>``
+
+* **unmap**: unmaps the specified device from the system.
+
+  Usage: ``$ vlmc unmap <device>``
+
+* **create**: creates a new volume, optionally with a specified size and/or
+  based on a specified snapshot.
+
+  Usage: ``$ vlmc create --snap <snapname> --size <size> <volumename>``
+
+  Usage: ``$ vlmc create --snap <snapname> <volumename>``
+
+  Usage: ``$ vlmc create --size <size> <volumename>``
+
+  The ``--snap`` and ``--size`` options are both optional, but at least one of
+  them must be given. If a snapshot is not specified, a blank volume with the
+  specified size is created. If a size is not specified, the new volume
+  inherits the size of the snapshot.
+
+* **remove**: removes the volume.
+
+  Usage: ``$ vlmc remove <volumename>``
+
+  This does not actually delete the blocks; it just makes the volume
+  inaccessible for further use. The actual blocks are removed later, when
+  garbage collection is invoked.
+
+* **list**: provides a list of the Archipelago volumes currently found on the
+  storage.
+
+  Usage: ``$ vlmc list``
+
+* **info**: shows volume information. Currently it returns only the volume
+  size.
+
+  Usage: ``$ vlmc info <volumename>``
+
+* **open**: opens an Archipelago volume; that is, it takes all the necessary
+  locks and also makes the rest of the infrastructure aware of the operation.
+
+  Usage: ``$ vlmc open <volumename>``
+
+  This operation also succeeds if the volume is already opened by the current
+  host.
+
+* **close**: closes an Archipelago volume; that is, it performs all the
+  necessary functions in the infrastructure to successfully release the
+  volume. It also releases all the acquired locks.
+
+  Usage: ``$ vlmc close <volumename>``
+
+  An explicit ``close`` command should follow an explicit ``open`` to release
+  the volume, unless another action has triggered an implicit ``close``.
+
+* **lock**: locks a volume. This allows the administrator to lock an
+  Archipelago volume, independently from the rest of the infrastructure.
+
+  Usage: ``$ vlmc lock <volumename>``
+
+  The locks are idempotent for the current owner of the lock. That is, a lock
+  operation will succeed when the volume is already locked by the same
+  blocker.
+
+* **unlock**: unlocks a volume. This allows the administrator to unlock a
+  volume, independently from the rest of the infrastructure.
+
+  Usage: ``$ vlmc unlock [-f] <volumename>``
+
+  An unlock can normally be performed only by the blocker that acquired the
+  lock in the first place. To unlock a volume from another blocker, the ``-f``
+  option must be used to break the lock.
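+
+Putting some of the above together, a typical volume lifecycle might look like
+the following session. The volume name, the size argument and the device path
+are illustrative, and the exact argument formats may differ:
+
+.. code-block:: console
+
+    $ vlmc create --size 10G myvolume   # create a new blank volume
+    $ vlmc map myvolume                 # expose it as an xsegbd block device
+    $ vlmc info myvolume                # print the volume size
+    $ vlmc unmap /dev/xsegbd0           # release the block device
+    $ vlmc remove myvolume              # make the volume inaccessible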
+
+Archipelago volume locking system
+*********************************
+
+Archipelago uses storage-based locks on volumes to get exclusive access to
+them. Since a volume can be active in only one VM at a time, locks are used to
+enforce that restriction. But since the locks are storage based, they are
+persistent and independent from the process or subsystem that acquired them.
+So, if a process or an Archipelago deployment on a node misbehaves or crashes,
+or even if the hypervisor management software (e.g. Ganeti) fails to perform a
+migration, there might be an inconsistency. Knowledge of Archipelago's locking
+behavior is necessary in order to overcome these problems.
+
+.. todo:: This section is incomplete.
+
+Draft notes:
+
+* Locking state is cached on the mapper.
+* Locks are persistent: they remain held if a process/blocker stops, fails or
+  crashes.
+* Locks are acquired in best-effort mode:
+
+  * reads: try to get the lock, but do not fail if it cannot be acquired;
+    just don't cache anything.
+  * writes: try to get the lock, and wait until the current owner frees it.
+  * snapshot/remove/create etc.: try to get the lock, and fail if it cannot
+    be acquired.
+
+* Behavior during migrations: to be documented.
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 0000000..202a018
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1,47 @@
+import sys, os
+
+
+cur_dir = os.path.dirname(os.path.abspath(__file__))
+
+project = u'archipelago'
+copyright = u'2012-2013, GRNET'
+version = open(os.path.join(cur_dir + '/../', 'version')).read().strip()
+release = version
+html_title = 'archipelago ' + version
+
+templates_path = ['_templates']
+source_suffix = '.rst'
+master_doc = 'index'
+exclude_patterns = ['_build']
+pygments_style = 'sphinx'
+html_theme = 'default'
+html_theme_options = {
+    'collapsiblesidebar': 'true',
+    'footerbgcolor': '#55b577',
+    'footertextcolor': '#000000',
+    'sidebarbgcolor': '#ffffff',
+    'sidebarbtncolor': '#f2f2f2',
+    'sidebartextcolor': '#000000',
+    'sidebarlinkcolor': '#328e4a',
+    'relbarbgcolor': '#55b577',
+    'relbartextcolor': '#ffffff',
+    'relbarlinkcolor': '#ffffff',
+    'bgcolor': '#ffffff',
+    'textcolor': '#000000',
+    'headbgcolor': '#ffffff',
+    'headtextcolor': '#000000',
+    'headlinkcolor': '#c60f0f',
+    'linkcolor': '#328e4a',
+    'visitedlinkcolor': '#63409b',
+    'codebgcolor': '#eeffcc',
+    'codetextcolor': '#333333',
+}
+
+#html_static_path = ['_static']
+htmlhelp_basename = 'archipelagodoc'
+
+ARCHIPELAGO_DOCS_BASE_URL = 'http://www.synnefo.org/docs'
+extensions = ['sphinx.ext.autodoc',
+              'sphinx.ext.intersphinx',
+              'sphinx.ext.todo',
+              'sphinx.ext.viewcode']
diff --git a/docs/images/archipelago-architecture.png b/docs/images/archipelago-architecture.png
new file mode 100644
index 0000000..bc29d1c
Binary files /dev/null and b/docs/images/archipelago-architecture.png differ
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 0000000..762d5b1
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,51 @@
+Archipelago
+^^^^^^^^^^^
+
+Archipelago is a distributed, software-defined storage solution that decouples
+the cloning and snapshotting logic from the actual storage used.
+
+Every volume inside a VM can be thought of as a linearly addressable set of
+fixed-size blocks. The storage of the actual blocks is orthogonal to the task
+of exposing a single block device for use by each VM. Bridging the gap between
+the VMs performing random access to volumes and the storage of the actual
+blocks is Archipelago: a custom storage handling layer which handles volumes
+as sets of distinct blocks in the backend, a process we call volume
+composition.
+
+For the actual storage of blocks, Archipelago is agnostic to the storage
+backend used. Through pluggable storage drivers, Archipelago can support
+multiple storage backends to suit the needs of each deployment. We currently
+provide two storage drivers: one for simple files, where each object is stored
+as a single file on a (shared) filesystem, and one for objects backed by
+RADOS. RADOS is the distributed object store that backs the Ceph parallel
+filesystem. With RADOS, we can solve the problem of reliable, fault-tolerant
+object storage through replication on multiple storage nodes.
+
+As mentioned before, Archipelago composes each volume out of individual
+blocks. This is accomplished by maintaining a map for each volume, which maps
+every offset in the volume to a single object. The exact offset inside that
+object is calculated statically from the fixed object size and the offset in
+the volume.
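+
+For example, with a purely illustrative object size of 4 MiB (the real object
+size is a deployment detail), the lookup boils down to simple integer
+arithmetic:
+
+.. code-block:: python
+
+    OBJECT_SIZE = 4 * 1024 * 1024      # illustrative fixed object size
+
+    def locate(volume_map, volume_offset):
+        """Translate a volume offset into (object, offset inside the object)."""
+        index = volume_offset // OBJECT_SIZE         # which map entry to use
+        in_object_offset = volume_offset % OBJECT_SIZE
+        return volume_map[index], in_object_offset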
+
+Having this map and the composition subsystem allows us to do much more than
+simple volume composition. Archipelago offers copy-on-write snapshottable
+volumes. Furthermore, each snapshot can be hashed, allowing deduplication to
+play its part and reducing the storage cost of each hashed object. Moreover,
+Archipelago can integrate with Pithos and use Pithos images to provision a
+volume with copy-on-write semantics (i.e. a clone). Since Pithos images are
+already hashed, we can store hashed Archipelago volumes, which are
+indistinguishable from a Pithos image, alongside the Pithos images to enable
+further deduplication, or even register an Archipelago hashed snapshot as a
+Pithos image file.
+
+Having stated all that, Archipelago is used by Cyclades and Ganeti for fast VM
+provisioning based on CoW volumes. Moreover, it enables live migration of
+thinly-provisioned VMs with no physically shared storage.
+
+
+Contents:
+*********
+
+.. toctree::
+   :maxdepth: 2
+   :numbered:
+   :glob:
+
+   archipelago
+   archipelago_deploy