Revision 2:29b1f3811d49
b/src/parser.py | ||
---|---|---|
32 | 32 |
import re |
33 | 33 |
import csv |
34 | 34 |
|
35 |
from language_prefixes import languages |
|
36 |
|
|
35 | 37 |
def canonicalize(s): |
36 | 38 |
s = s.strip(' ') |
37 | 39 |
s = s.strip('_') |
38 | 40 |
s = s.replace('_', ' ') |
39 | 41 |
s = ' '.join(s.split()) |
40 |
s = s.capitalize() |
|
42 |
prefix = s.split(':')[0] |
|
43 |
if not prefix in languages: |
|
44 |
s = s.capitalize() |
|
41 | 45 |
return s |
42 | 46 |
|
43 | 47 |
class BodyParser: |
... | ... | |
78 | 82 |
(r"{{[^}]+}}", self.template), |
79 | 83 |
(r"\[\[[^]]+\]\]", self.reference), |
80 | 84 |
(r"\[[^]]+\]", self.external_link), |
81 |
(r"#REDIRECT \[\[[^]]+\]\]", self.redirect),
|
|
85 |
(r"(?i)#REDIRECT\s*\[\[[^]]+\]\]", self.redirect),
|
|
82 | 86 |
(r"<[^>]+>", self.skip), |
83 | 87 |
(r"\s+", self.skip), |
84 | 88 |
(r"\b\w+\b", self.word), |
... | ... | |
100 | 104 |
self.is_redirect = True |
101 | 105 |
target = token.split("[[")[1].split("]]")[0] |
102 | 106 |
target = canonicalize(target) |
103 |
print "'" + self.title + "' #REDIRECT '" + target + "'"
|
|
107 |
print self.title + " #REDIRECT " + target
|
|
104 | 108 |
return "REDIRECT", token |
105 | 109 |
|
106 | 110 |
def reference(self, scanner, token): |
107 | 111 |
target = token.split("|")[0][2:].rstrip("]"); |
108 | 112 |
target = canonicalize(target) |
109 |
print "'" + self.title + "' => '" + target + "'"
|
|
113 |
print self.title + " => " + target
|
|
110 | 114 |
return "REFERENCE", token |
111 | 115 |
|
112 | 116 |
def external_link(self, scanner, token): |
... | ... | |
154 | 158 |
self.username = "" |
155 | 159 |
self.current_element = "" |
156 | 160 |
self.debug = False |
157 |
self.redirect = True |
|
161 |
self.redirect = False |
|
162 |
self.skip = False |
|
158 | 163 |
self.parser = BodyParser() |
159 | 164 |
|
160 | 165 |
def add_element(self, tag): |
... | ... | |
168 | 173 |
self.add_element(tag) |
169 | 174 |
if self.current_element == WikipediaParser.PAGE: |
170 | 175 |
self.redirect = False |
176 |
self.skip = False |
|
171 | 177 |
elif self.current_element == WikipediaParser.REDIRECT: |
172 | 178 |
self.redirect = True |
173 | 179 |
|
174 | 180 |
def data(self, data): |
175 |
if self.current_element in WikipediaParser.READ_SET: |
|
181 |
if (not self.skip |
|
182 |
and self.current_element in WikipediaParser.READ_SET): |
|
176 | 183 |
self.element_content.append(data.encode('utf-8')) |
177 | 184 |
|
178 | 185 |
|
179 | 186 |
def end(self, tag): |
180 |
if self.current_element == WikipediaParser.TITLE: |
|
187 |
if self.skip: |
|
188 |
pass |
|
189 |
elif self.current_element == WikipediaParser.TITLE: |
|
181 | 190 |
self.title = ''.join(self.element_content).strip() |
182 | 191 |
if self.title.startswith(WikipediaParser.NAMESPACES): |
183 | 192 |
self.skip = True |
Also available in: Unified diff