# -*- coding: utf-8 -*-
"""
Helper methods to parse rst documents and extract data appropriate for
faq/guide entries creation.
"""

import os
import sys
import glob
import StringIO

from os.path import join
from collections import defaultdict

from docutils.core import publish_parts
from lxml import html


class SphinxImportException(Exception):
    """Base class for the errors raised by this module."""


class SphinxImportValidationError(SphinxImportException):
    """Raised when parsed content does not have the expected structure."""


# XPath selectors matching the section ``div`` that holds one entry, keyed by
# data type: faq entries are introduced by an ``h2``, userguide ones by ``h1``.
_SECTION_SELECTORS = {
    'faq': ".//div[h2][@class='section']",
    'userguide': ".//div[h1][@class='section']",
}


def rst2html(data):
    """
    Use docutils ``publish_parts`` to convert rst to html.

    ``sys.stderr`` is temporarily replaced by an in-memory buffer so that
    whatever gets written to it during the conversion can be handed back to
    the caller. The original stream is always restored, even if the
    conversion raises.

    Return a ``(body, stderr_output)`` tuple, where ``body`` is the ``body``
    part produced by the html writer.
    """
    captured = StringIO.StringIO()
    origstderr = sys.stderr
    sys.stderr = captured
    try:
        body = publish_parts(data, writer_name='html')['body']
    finally:
        # never leave the process with a hijacked stderr
        sys.stderr = origstderr
    return body, captured.getvalue()


def _rewrite_image(img, data_type):
    """
    Normalize the ``alt`` and ``src`` attributes of an ``img`` element in
    place and return an ``(alt, src)`` tuple.

    - A missing/empty ``alt`` becomes ``"okeanos iaas <data_type> image"``;
      otherwise it becomes ``"<data_type> <last path segment>"`` cut at the
      first dot.
    - A ``src`` starting with ``/images`` loses its leading slash.

    The returned ``alt`` is the value the element had *before* rewriting
    (possibly ``None``); the returned ``src`` is the rewritten one.
    """
    # NOTE(review): returning the pre-rewrite alt mirrors the long-standing
    # behaviour; confirm with the callers whether the rewritten value was
    # actually intended.
    original_alt = img.get('alt')
    if not original_alt:
        alt = "okeanos iaas " + data_type + " image"
    else:
        alt = data_type + " " + original_alt.split("/")[-1]
        alt = alt.split(".")[0]
    img.set('alt', alt)

    src = img.get('src')
    if src.startswith("/images"):
        src = src[1:]
    img.set('src', src)
    return original_alt, src


def parse_rst_data(data, data_type='faq'):
    """
    Parse given data from rst to html. Filter html and generate appropriate
    entries based on the data_type provided (``'faq'`` or ``'userguide'``).

    This is a generator. For each section that has an ``id`` it yields a
    tuple of:

    - **category** an ``(id, title)`` tuple (filled in for the `faq` data
      type only, since each question belongs to a specific category;
      ``(None, None)`` otherwise)
    - **slug** the slug of the entry (the ``id`` of the section)
    - **title** the title of the entry
    - **html_data** the html content of the entry, without its heading
    - **images** list of ``(img-alt, img-path)`` tuples
    - **output** what was written to stderr during the rst conversion

    Nothing is yielded when the converted document is empty. A ``KeyError``
    is raised for an unknown ``data_type`` and a
    ``SphinxImportValidationError`` when a faq section has no category
    heading.
    """
    html_data, output = rst2html(data)
    if not html_data.strip():
        # lxml refuses to parse an empty document; there is nothing to yield
        return

    doc = html.document_fromstring(html_data)
    heading_tag = 'h1' if data_type == 'userguide' else 'h2'

    # find first level sections
    for section in doc.findall(_SECTION_SELECTORS[data_type]):
        slug = section.get('id')
        if not slug:
            continue

        # the heading becomes the title and is dropped from the entry body
        heading = section.find(heading_tag)
        title = heading.text_content()
        section.remove(heading)

        # collected after the heading is removed, so heading images are out
        image_els = section.findall('.//img')

        entry_category = (None, None)
        if data_type == 'faq':
            # the category is described by the first h1 found under the
            # parent of the question section
            h1 = section.getparent().find(".//h1")
            if h1 is None:
                raise SphinxImportValidationError(
                    'Cannot find category heading for FAQ entry "%s"' % slug)
            entry_category = (h1.getparent().get('id'), h1.text_content())

        # rewrites the img attributes in place, before serialization
        images = [_rewrite_image(img, data_type) for img in image_els]

        entry_html = "".join(html.tostring(child, pretty_print=True)
                             for child in section)

        yield entry_category, slug, title, entry_html, images, output


def get_dir_rst_files(dirname):
    """
    Given a dir yield the paths of its ``*.rst`` files, skipping the ones
    whose file name starts with ``index``.
    """
    for path in glob.glob(join(dirname, '*.rst')):
        # glob returns paths prefixed with dirname, so check the file name
        if os.path.basename(path).startswith('index'):
            continue
        yield path


def generate_rst_contents_from_dir(rstdir):
    """
    Handle directory contents and run ``parse_rst_data`` for each file we
    want to parse.

    Valid structure of the dir contents so that appropriate files can be
    parsed::

        ├── README.rst
        └── source
            ├── conf.py
            ├── faq
            │   ├── cyclades.rst
            │   ├── index.rst
            │   ├── okeanos.rst
            │   └── pithos.rst
            ├── images
            │   ├── cyclades
            │   │   ├── image10.png
            │   │   └── image9.png
            │   ├── faq
            │   │   └── faq_image1.png
            │   ├── intro_img_cyclades.png
            │   └── pithos_guide
            │       └── image2.png
            ├── index.rst
            └── userguide
                ├── cyclades.rst
                ├── index.rst
                ├── pithos.rst
                └── quick-intro.rst

    This is a generator, so the directory validation runs when iteration
    starts. It raises ``SphinxImportException`` if the ``images``,
    ``userguide`` or ``faq`` directory is missing under ``source``.

    For every entry found it yields a tuple of::

        (data_type, rst_file_path, category, slug, title, html_data,
         images, stderr)

    where ``data_type`` is ``'userguide'`` or ``'faq'``, the middle items
    come from ``parse_rst_data`` and each image is a
    ``[alt, path, absolute_path]`` list.
    """
    def fpath(name):
        return join(rstdir, 'source', name)

    # validation
    required_dirs = (
        ('images', 'Cannot find images dir'),
        ('userguide', 'Cannot find guide dir'),
        ('faq', 'Cannot find FAQs dir'),
    )
    for name, message in required_dirs:
        if not os.path.isdir(fpath(name)):
            raise SphinxImportException(message)

    def fix_image_path(img):
        # make image path absolute: append the path resolved under
        # ``source`` (minus one leading slash, if any) as a third item
        img = list(img)
        src = img[1]
        img.append(fpath(src[1:] if src.startswith("/") else src))
        return img

    for d in ['userguide', 'faq']:
        for f in get_dir_rst_files(fpath(d)):
            # close the file as soon as its contents have been read
            with open(f) as rst_file:
                data = rst_file.read()
            for category, slug, title, html_data, \
                    images, stderr in parse_rst_data(data, d):
                # absolute image paths
                images = [fix_image_path(img) for img in images]
                yield d, f, category, slug, title, html_data, images, stderr