# Copyright (c) 2010, Panos Louridas, GRNET S.A.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of GRNET S.A, nor the names of its contributors may be
# used to endorse or promote products derived from this software without
# specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import xml.parsers.expat
import sys
import re
import csv
import getopt

#
# Canonicalize a target or a page title following the rules in:
# http://en.wikipedia.org/wiki/Help:Link#Conversion_to_canonical_form
#
# Also capitalises the first letter following the prefix.
#
# Returns a tuple consisting of the canonicalized target, the prefix
# (denoted by the first colon, so it may not be a real namespace, but
# that's fine) and the part following the colon separator. If no colon
# is found, the last two elements are empty.
#
def canonicalize(s):
    s = s.strip(' ')
    s = s.strip('_')
    s = s.replace('_', ' ')
    length = len(s)
    if length == 0:
        return (s, '', '')
    (prefix, sep, after) = s.partition(':')
    if after != '':
        s = prefix + sep + after[0].upper() + after[1:]
    else:
        prefix = ''
        if length > 1:
            s = s[0].upper() + s[1:]
        else:
            s = s[0].upper()
    return (s, prefix, after)

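# A few worked examples for canonicalize() (added for illustration,
# traced by hand against the code above; note that the prefix itself
# is left uncapitalised):
#
#   canonicalize("foo_bar")          -> ("Foo bar", "", "")
#   canonicalize("category:foo bar") -> ("category:Foo bar", "category", "foo bar")
#   canonicalize("Help:links")       -> ("Help:Links", "Help", "links")
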
class BodyParser:

    ARTICLE_CLASSES = ["{{Featured article}}",
                       "{{Featured list}}",
                       "{{A-Class}}",
                       "{{Good article}}",
                       "{{B-Class}}",
                       "{{C-Class}}",
                       "{{Start-Class}}",
                       "{{Stub-Class}}"
                       ]

    ARTICLE_CLASSES_RE = [ re.compile(x, re.I) for x in ARTICLE_CLASSES ]

    IMPORTANCE_CLASSES = ["{{Top-importance}}",
                          "{{High-importance}}",
                          "{{Mid-Importance}}",
                          "{{Low-Importance}}",
                          "{{Bottom-Importance}}",
                          "{{No-Importance}}",
                          "{{NA-Importance}}"
                          ]

    IMPORTANCE_CLASSES_RE = [ re.compile(x, re.I) for x in IMPORTANCE_CLASSES ]

    def init_state(self):
        self.word_count = 0
        self.title = ""
        self.importance_class = ""
        self.article_class = ""
        self.category = []
        self.is_redirect = False

    def __init__(self):
        self.init_state()
        self.scanner = re.Scanner([
            (r"{{[^}]+}}", self.template),
            (r"\[\[[^]]+\]\]", self.reference),
            (r"\[[^]]+\]", self.external_link),
            (r"(?i)#REDIRECT\s*\[\[[^]]+\]\]", self.redirect),
            (r"<[^>]+>", self.skip),
            (r"\s+", self.skip),
            (r"\b\w+\b", self.word),
            (r"([^[]|\W+)", self.skip),
        ])

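    # Note on re.Scanner (added): it is a little-documented helper in the
    # standard re module. scan(text) tries the patterns above in order at
    # each position, invokes the paired method as callback(scanner, token),
    # and returns a (token_list, unmatched_remainder) pair.
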
    def template(self, scanner, token):
        for c in BodyParser.ARTICLE_CLASSES_RE:
            if re.search(c, token) is not None:
                self.article_class = c.pattern
                break
        for c in BodyParser.IMPORTANCE_CLASSES_RE:
            if re.search(c, token) is not None:
                self.importance_class = c.pattern
                break
        return "TEMPLATE", token

    def redirect(self, scanner, token):
        self.is_redirect = True
        target = token.split("[[")[1].split("]]")[0]
        target = canonicalize(target)[0]
        print self.title + " #REDIRECT " + target
        return "REDIRECT", token

    def reference(self, scanner, token):
        target = token.split("|")[0][2:].rstrip("]")
        (target, prefix, rest) = canonicalize(target)
        if (prefix == "Category"):
            self.category.append(rest)
        print self.title + " => " + target
        return "REFERENCE", token

    def external_link(self, scanner, token):
        return "EXTERNAL_LINK", token

    def word(self, scanner, token):
        self.word_count = self.word_count + 1
        return "WORD", token

    def skip(self, scanner, token):
        return "SKIP", token

    def parse(self, title, input):
        self.init_state()
        self.title = canonicalize(title)[0]
        return self.scanner.scan(input)

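# Sketch of using BodyParser on its own (added; traced by hand against
# the scanner rules above):
#
#   bp = BodyParser()
#   tokens, rest = bp.parse("Example", "{{Good article}} Some [[target|text]].")
#   # prints "Example => Target"; afterwards bp.word_count == 1 and
#   # bp.article_class == "{{Good article}}"
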
class WikipediaParser:

    NS = "http://www.mediawiki.org/xml/export-0.4/"
    PAGE = "/mediawiki/page"
    TITLE = PAGE + "/title"
    REDIRECT = PAGE + "/redirect"
    TEXT = PAGE + "/revision/text"
    TIMESTAMP = PAGE + "/revision/timestamp"
    USERNAME = PAGE + "/revision/contributor/username"

    READ_SET = set([TITLE, TEXT, TIMESTAMP, USERNAME])

    NAMESPACES = (
        'Media:', 'Special:', 'Talk:', 'User:', 'User talk:', 'Wikipedia:',
        'Wikipedia talk:', 'File:', 'Image:', 'File talk:', 'MediaWiki:',
        'MediaWiki talk:', 'Template:', 'Template talk:', 'Help:',
        'Help talk:', 'Category:', 'Category talk:', 'Portal:',
        'Portal talk:', 'Book:', 'Book talk:'
    )

    def __init__(self, output_writer):
        self.output_writer = output_writer
        self.element_content = []
        self.article_class = ""
        self.importance_class = ""
        self.title = ""
        self.timestamp = ""
        self.username = ""
        self.current_element = ""
        self.debug = False
        self.redirect = False
        self.keep_categories = ()
        self.skip = False
        self.parser = BodyParser()

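    # Note (added): the parser tracks its position in the XML tree by
    # joining open tag names into a path such as "/mediawiki/page/title"
    # and comparing it against the PAGE/TITLE/... constants above.
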
    def add_element(self, tag):
        self.current_element = self.current_element + "/" + tag

    def remove_element(self, tag):
        self.element_content = []
        self.current_element = self.current_element.rpartition("/")[0]

    def start(self, tag, attrib):
        self.add_element(tag)
        if self.current_element == WikipediaParser.PAGE:
            self.redirect = False
            self.skip = False
        elif self.current_element == WikipediaParser.REDIRECT:
            self.redirect = True

    def data(self, data):
        if (not self.skip
                and self.current_element in WikipediaParser.READ_SET):
            self.element_content.append(data.encode('utf-8'))

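    # Note (added): end() fires on each closing tag. For </text> it runs
    # BodyParser over the page body and, unless the page is a redirect,
    # emits one semicolon-separated row, e.g. (hypothetical page):
    #
    #   Foo;SomeUser;2010-01-01T00:00:00Z;1234;{{Good article}};{{Top-importance}};['Bar']
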
    def end(self, tag):
        if self.skip:
            pass
        elif self.current_element == WikipediaParser.TITLE:
            self.title = ''.join(self.element_content).strip()
            if not self.keep_categories:
                if self.title.startswith(WikipediaParser.NAMESPACES):
                    self.skip = True
            else:
                if not self.title.startswith(self.keep_categories):
                    self.skip = True
        elif self.current_element == WikipediaParser.TIMESTAMP:
            self.timestamp = ''.join(self.element_content).strip()
        elif self.current_element == WikipediaParser.USERNAME:
            self.username = ''.join(self.element_content).strip()
        elif self.current_element == WikipediaParser.TEXT:
            self.article_class = ""
            self.importance_class = ""
            page_content = ''.join(self.element_content)
            tokens, remainder = self.parser.parse(self.title, page_content)
            #for token in tokens:
            #    print token
            if remainder != "":
                print "'" + remainder + "'"
                print "Exiting..."
                sys.exit(1)
            if not self.redirect:
                self.output_writer.writerow([self.parser.title,
                                             self.username,
                                             self.timestamp,
                                             self.parser.word_count,
                                             self.parser.article_class,
                                             self.parser.importance_class,
                                             self.parser.category])
        self.remove_element(tag)

    def close(self):
        pass

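# Example invocations (added; the dump file names are hypothetical).
# The script reads a MediaWiki XML dump on stdin and writes CSV to stdout:
#
#   bzcat enwiki-pages-articles.xml.bz2 | python parser.py > pages.csv
#   python parser.py --categories="Help, Portal" < dump.xml > pages.csv
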
def main():
    try:
        opts, args = getopt.getopt(sys.argv[1:], "c:", ["categories="])
    except getopt.GetoptError, err:
        print str(err)
        sys.exit(1)

    categories = ()
    for o, a in opts:
        if o in ("-c", "--categories"):
            # Split the comma-separated list of title prefixes to keep.
            categories = set(c + ":" for c in re.split(r',\s*', a))
        else:
            assert False, "unhandled option " + o

    output_writer = csv.writer(sys.stdout, delimiter=';', quotechar='"',
                               quoting=csv.QUOTE_MINIMAL)

    p = xml.parsers.expat.ParserCreate()
    p.buffer_text = True

    wikipedia_parser = WikipediaParser(output_writer)
    wikipedia_parser.keep_categories = tuple(categories)

    p.StartElementHandler = wikipedia_parser.start
    p.EndElementHandler = wikipedia_parser.end
    p.CharacterDataHandler = wikipedia_parser.data

    p.ParseFile(sys.stdin)

if __name__ == "__main__":
    main()