root / cloudcms / rstutils.py @ d240ebcb
History | View | Annotate | Download (5.8 kB)
1 | deb708bf | Kostas Papadimitriou | # -*- coding: utf-8 -*-
|
---|---|---|---|
2 | deb708bf | Kostas Papadimitriou | |
3 | deb708bf | Kostas Papadimitriou | """
|
4 | deb708bf | Kostas Papadimitriou | Helper methods to parse rst documents and extract data appropriate for faq/guide
|
5 | deb708bf | Kostas Papadimitriou | entries creation.
|
6 | deb708bf | Kostas Papadimitriou | """
|
7 | deb708bf | Kostas Papadimitriou | |
8 | deb708bf | Kostas Papadimitriou | import os |
9 | deb708bf | Kostas Papadimitriou | import sys |
10 | deb708bf | Kostas Papadimitriou | import glob |
11 | deb708bf | Kostas Papadimitriou | import StringIO |
12 | deb708bf | Kostas Papadimitriou | |
13 | deb708bf | Kostas Papadimitriou | from os.path import join |
14 | deb708bf | Kostas Papadimitriou | from collections import defaultdict |
15 | deb708bf | Kostas Papadimitriou | from docutils.core import publish_parts |
16 | deb708bf | Kostas Papadimitriou | from lxml import html |
17 | deb708bf | Kostas Papadimitriou | |
class SphinxImportException(Exception):
    """Base class of the errors raised while importing sphinx rst exports."""
|
20 | deb708bf | Kostas Papadimitriou | |
21 | deb708bf | Kostas Papadimitriou | |
class SphinxImportValidationError(SphinxImportException):
    """
    Specialization of ``SphinxImportException``.

    NOTE(review): presumably meant for invalid import data; no raise site is
    visible in this module.
    """
|
24 | deb708bf | Kostas Papadimitriou | |
25 | deb708bf | Kostas Papadimitriou | |
def rst2html(data):
    """
    Use docutils ``publish_parts`` to convert rst to html.

    While the conversion runs ``sys.stderr`` is replaced with an in-memory
    buffer, so that anything written to it (e.g. docutils warnings) can be
    handed back to the caller instead of being printed.

    :param data: the rst source text.
    :returns: ``(body, output)`` tuple, where ``body`` is the ``body`` part
              generated by the html writer and ``output`` is the text that
              was written to stderr during the conversion.
    """
    origstderr = sys.stderr
    captured = StringIO.StringIO()
    sys.stderr = captured
    try:
        parts = publish_parts(data, writer_name='html')['body']
    finally:
        # always put the real stderr back, even if docutils raises, otherwise
        # the whole process keeps writing to the temporary buffer
        sys.stderr = origstderr

    return parts, captured.getvalue()
|
40 | deb708bf | Kostas Papadimitriou | |
41 | deb708bf | Kostas Papadimitriou | |
def _normalize_img_el(img, data_type):
    """
    Rewrite the ``alt`` and ``src`` attributes of an ``img`` element in place.

    - ``alt`` becomes ``"<data_type> <file name without extension>"`` when the
      element already has an alt text, or a generic text otherwise.
    - a ``src`` starting with ``/images`` loses its leading slash.

    Returns an ``(alt, src)`` tuple.
    """
    attrs = dict(img.items())
    alt = attrs.get('alt', None)
    if not alt:
        alt = "okeanos iaas " + data_type + " image"
    else:
        # keep the last path component and drop everything after the first dot
        alt = data_type + " " + alt.split("/")[-1]
        alt = alt.split(".")[0]
    img.set('alt', alt)

    src = attrs.get('src')
    # src may be missing on hand written raw html; leave such images untouched
    if src and src.startswith("/images"):
        src = src[1:]
        img.set('src', src)

    # NOTE(review): the alt returned here is the one found in the document
    # (``attrs`` is a snapshot taken before ``img.set``), not the rewritten
    # one. Kept as is since callers may rely on it -- confirm the intent.
    return attrs.get('alt', None), src


def parse_rst_data(data, data_type='faq'):
    """
    Parse given data from rst to html. Filter html and generate appropriate
    entries based on data_type provided.

    One entry is generated for each ``div.section`` element that has an id and
    a direct ``h2`` (``faq``) or ``h1`` (``userguide``) child. Each entry is a
    tuple of:

    - **category** ``(id, title)`` tuple (used for `faq` data type since each
      question belongs to a specific category), ``(None, None)`` otherwise
    - **slug** the slug of the entry (the id of the section)
    - **title** the title of the entry
    - **html_data** the html content of the entry (title excluded)
    - **images** (img-alt, img-path) tuples list
    - **output** the stderr output captured during the rst conversion (the
      same value for every entry)

    :param data: the rst source text.
    :param data_type: ``'faq'`` or ``'userguide'``; any other value raises
                      ``KeyError``.
    """
    html_data, output = rst2html(data)
    doc = html.document_fromstring("<html><body>" + html_data + "</body></html>")

    category_selectors = {
        'faq': ".//div[h2][@class='section']",
        'userguide': ".//div[h1][@class='section']",
    }
    title_tag = 'h1' if data_type == 'userguide' else 'h2'

    # find first level sections
    for section in doc.findall(category_selectors[data_type]):
        slug = section.get('id')
        if not slug:
            continue

        # the heading is the entry title, it should not be part of the content
        title_el = section.find(title_tag)
        title = title_el.text_content()
        section.remove(title_el)

        entry_category = (None, None)
        if data_type == 'faq':
            # the category is described by the first h1 found under the
            # parent element of the question section
            h1 = section.getparent().find(".//h1")
            # lxml elements without children are falsy, compare with None
            if h1 is not None:
                entry_category = (h1.getparent().get('id', None),
                                  h1.text_content())

        # must be evaluated eagerly: the img elements are modified in place
        # and the changes should be included in the serialized html below
        images = [_normalize_img_el(img, data_type)
                  for img in section.findall('.//img')]

        entry_html = "".join(html.tostring(child, pretty_print=True)
                             for child in section)

        yield entry_category, slug, title, entry_html, images, output
|
116 | deb708bf | Kostas Papadimitriou | |
117 | deb708bf | Kostas Papadimitriou | |
def get_dir_rst_files(dirname):
    """
    Given a dir yield the paths of the ``*.rst`` files it directly contains.

    Files whose name starts with ``index`` are skipped.

    :param dirname: the directory to look into (subdirectories are ignored).
    :returns: generator of the matching paths, as returned by ``glob.glob``
              (i.e. prefixed with ``dirname``).
    """
    for f in glob.glob(join(dirname, '*.rst')):
        # glob results include the directory part, so the check has to be
        # made against the file name and not against the whole path
        if os.path.basename(f).startswith('index'):
            continue
        yield f
|
126 | deb708bf | Kostas Papadimitriou | |
127 | deb708bf | Kostas Papadimitriou | |
def generate_rst_contents_from_dir(rstdir):
    """
    Handle directory contents and run ``parse_rst_data`` for each file we want
    to parse.

    Valid structure of the dir contents so that appropriate files can be
    parsed::

        ├── README.rst
        └── source
            ├── conf.py
            ├── faq
            │   ├── cyclades.rst
            │   ├── index.rst
            │   ├── okeanos.rst
            │   └── pithos.rst
            ├── images
            │   ├── cyclades
            │   │   ├── image10.png
            │   │   └── image9.png
            │   ├── faq
            │   │   └── faq_image1.png
            │   ├── intro_img_cyclades.png
            │   └── pithos_guide
            │       └── image2.png
            ├── index.rst
            └── userguide
                ├── cyclades.rst
                ├── index.rst
                ├── pithos.rst
                └── quick-intro.rst

    For each entry found, a tuple of the following members is generated::

        (data_type, filepath, category, slug, title, html_data, images, stderr)

    where ``data_type`` is ``'userguide'`` or ``'faq'``, ``filepath`` is the
    path of the parsed rst file and the rest are the members generated by
    ``parse_rst_data``. Each item of ``images`` is extended to an
    ``[alt, src, path]`` list, ``path`` being the image location under
    ``<rstdir>/source``.

    This is a generator, so the directory validation takes place when the
    first item is requested.

    :param rstdir: the root directory of the sphinx export.
    :raises SphinxImportException: if the ``images``, ``userguide`` or ``faq``
                                   directory is missing under
                                   ``<rstdir>/source``.
    """

    def fpath(x):
        return join(rstdir, 'source', x)

    # validation
    required_dirs = (
        ('images', 'Cannot find images dir'),
        ('userguide', 'Cannot find guide dir'),
        ('faq', 'Cannot find FAQs dir'),
    )
    for name, error in required_dirs:
        # isdir is also False for paths that do not exist
        if not os.path.isdir(fpath(name)):
            raise SphinxImportException(error)

    def fix_image_path(img):
        # append the image path under the source dir to the (alt, src) pair
        img = list(img)
        src = img[1]
        # join would discard the source dir prefix for a src that starts
        # with a slash
        if src.startswith("/"):
            src = src[1:]
        img.append(fpath(src))
        return img

    for d in ['userguide', 'faq']:
        for f in get_dir_rst_files(fpath(d)):
            # read the whole file and close it before parsing
            with open(f) as rst_file:
                data = rst_file.read()

            for category, slug, title, html_data, \
                    images, stderr in parse_rst_data(data, d):
                images = [fix_image_path(img) for img in images]
                yield d, f, category, slug, title, html_data, images, stderr
|
200 | deb708bf | Kostas Papadimitriou | |
201 | deb708bf | Kostas Papadimitriou |