code.grnet.gr Git - snf-cloudcms/blob - cloudcms/rstutils.py

   1 # -*- coding: utf-8 -*-
   2
   3 """
   4 Helper methods to parse rst documents and extract data appropriate for faq/guide
   5 entries creation.
   6 """
   7
   8 import os
   9 import sys
  10 import glob
  11 import StringIO
  12
  13 from os.path import join
  14 from collections import defaultdict
  15 from docutils.core import publish_parts
  16 from lxml import html
  17
  18 class SphinxImportException(Exception):
  19     pass
  20
  21
  22 class SphinxImportValidationError(SphinxImportException):
  23     pass
  24
  25
  26 def rst2html(data):
  27     """
  28     Use docutils publis_parts to convert rst to html. Return parts body and error
  29     output tuple.
  30     """
  31     origstderr = sys.stderr
  32     sys.stderr = StringIO.StringIO()
  33
  34     parts = publish_parts(data, writer_name='html')['body']
  35     sys.stderr.seek(0)
  36     output = sys.stderr.read()
  37     sys.stderr = origstderr
  38
  39     return parts, output
  40
  41
  42 def parse_rst_data(data, data_type='faq'):
  43     """
  44     Parse given data from rst to html. Filter html and generate approriate
  45     entries based on data_type provided.
  46
  47     Generated content:
  48
  49         - **category** (used for `faq` data type since each question belongs to a
  50           specific category)
  51         - **slug** the slug of the entry
  52         - **title** the title of the entry
  53         - **html_data** the html content of the entry
  54         - **images** (img-alt, img-path) tuples list
  55     """
  56     html_data, output = rst2html(data)
  57     doc = html.document_fromstring("<html><body>" + html_data + "</body></html>")
  58
  59     category_selectors = {
  60         'faq': ".//div[h2][@class='section']",
  61         'userguide': ".//div[h1][@class='section']",
  62     }
  63
  64     # find first level sections
  65     sections = doc.findall(category_selectors[data_type])
  66     for section in sections:
  67         entry_category = (None, None)
  68
  69         attrs = dict(section.items())
  70         if not attrs.get('id', None):
  71             continue
  72
  73         slug = attrs.get('id')
  74         if data_type == 'userguide':
  75             title = section.find('h1').text_content()
  76             section.remove(section.find('h1'))
  77         else:
  78             title = section.find('h2').text_content()
  79             section.remove(section.find('h2'))
  80
  81         image_els = section.findall('.//img')
  82
  83         if data_type == 'faq':
  84             h1 = list(section.iterancestors())[0].find(".//h1")
  85             el_with_id = dict(h1.getparent().items())
  86             entry_category = (el_with_id.get('id', None), h1.text_content())
  87
  88
  89         def get_img_el_data(img):
  90             attrs = dict(img.items())
  91             alt = attrs.get('alt', None)
  92             if not alt:
  93                 alt = "okeanos iaas " + data_type + " image"
  94             else:
  95                 if len(alt.split("/")) > 0:
  96                     alt = data_type + " " + alt.split("/")[-1]
  97                 if len(alt.split(".")) > 0:
  98                     alt = alt.split(".")[0]
  99
 100             img.set('alt', alt)
 101
 102             src = attrs.get('src')
 103             if src.startswith("/images"):
 104                 src = src[1:]
 105                 img.set('src', src)
 106
 107             return attrs.get('alt', None), src
 108
 109         images = map(get_img_el_data, image_els)
 110
 111         html_data = ""
 112         for child in section.getchildren():
 113             html_data += html.tostring(child, pretty_print=True)
 114
 115         yield entry_category, slug, title, html_data, images, output
 116
 117
 118 def get_dir_rst_files(dirname):
 119     """
 120     Given a dir return the glob of *.rst files
 121     """
 122     for f in glob.glob(join(dirname, '*.rst')):
 123         if f.startswith('index'):
 124             continue
 125         yield f
 126
 127
 128 def generate_rst_contents_from_dir(rstdir):
 129     """
 130     Handle directory contents and run ``parse_rst_data`` for each file we want
 131     to parse.
 132
 133     Valid structure of the dir contents so that appropriate files can be parsed::
 134
 135         ├── README.rst
 136         └── source
 137             ├── conf.py
 138             ├── faq
 139             │   ├── cyclades.rst
 140             │   ├── index.rst
 141             │   ├── okeanos.rst
 142             │   └── pithos.rst
 143             ├── images
 144             │   ├── cyclades
 145             │   │   ├── image10.png
 146             │   │   └── image9.png
 147             │   ├── faq
 148             │   │   └── faq_image1.png
 149             │   ├── intro_img_cyclades.png
 150             │   └── pithos_guide
 151             │       └── image2.png
 152             ├── index.rst
 153             └── userguide
 154                 ├── cyclades.rst
 155                 ├── index.rst
 156                 ├── pithos.rst
 157                 └── quick-intro.rst
 158
 159     Will generate a tuple of,
 160
 161         ['faq', 'userguide'], </abs/path/filename.rst> + *<generated tuple members of ``parse_rst_data``>
 162
 163     """
 164
 165     #rstdir = "/tmp/tmphsl6bicloudcms-sphinx-exports"
 166
 167     fpath = lambda x: join(rstdir, 'source', x)
 168
 169     images_dir = fpath('images')
 170     guide_dir = fpath('userguide')
 171     faq_dir = fpath('faq')
 172
 173     # validation
 174     if not os.path.exists(images_dir) or not os.path.isdir(images_dir):
 175         raise SphinxImportException('Cannot find images dir')
 176
 177     if not os.path.exists(guide_dir) or not os.path.isdir(guide_dir):
 178         raise SphinxImportException('Cannot find guide dir')
 179
 180     if not os.path.exists(faq_dir) or not os.path.isdir(faq_dir):
 181         raise SphinxImportException('Cannot find FAQs dir')
 182
 183     def fix_image_path(img):
 184         # make image path absolute
 185         img = list(img)
 186         if img[1].startswith("/"):
 187             img.append(fpath(img[1][1:]))
 188         else:
 189             img.append(fpath(img[1]))
 190
 191         return img
 192
 193     for d in ['userguide', 'faq']:
 194         for f in get_dir_rst_files(fpath(d)):
 195             for category, slug, title, html_data, \
 196                     images, stderr in parse_rst_data(file(f).read(), d):
 197                 # absolute image paths
 198                 images = map(fix_image_path, images)
 199                 yield d, f, category, slug, title, html_data, images, stderr
 200
 201
 202