Revision 2:29b1f3811d49

b/src/parser.py
32 32
import re
33 33
import csv
34 34

  
35
from language_prefixes import languages
36

  
35 37
def canonicalize(s):
36 38
    s = s.strip(' ')
37 39
    s = s.strip('_')
38 40
    s = s.replace('_', ' ')
39 41
    s = ' '.join(s.split())
40
    s = s.capitalize()
42
    prefix = s.split(':')[0]
43
    if not prefix in languages:
44
        s = s.capitalize()
41 45
    return s
42 46

  
43 47
class BodyParser:
......
78 82
            (r"{{[^}]+}}", self.template),
79 83
            (r"\[\[[^]]+\]\]", self.reference),
80 84
            (r"\[[^]]+\]", self.external_link),
81
            (r"#REDIRECT \[\[[^]]+\]\]", self.redirect),
85
            (r"(?i)#REDIRECT\s*\[\[[^]]+\]\]", self.redirect),
82 86
            (r"<[^>]+>", self.skip),
83 87
            (r"\s+", self.skip),
84 88
            (r"\b\w+\b", self.word),
......
100 104
        self.is_redirect = True
101 105
        target = token.split("[[")[1].split("]]")[0]
102 106
        target = canonicalize(target)
103
        print "'" + self.title + "' #REDIRECT '" + target + "'"
107
        print self.title + " #REDIRECT " + target
104 108
        return "REDIRECT", token
105 109
    
106 110
    def reference(self, scanner, token):
107 111
        target = token.split("|")[0][2:].rstrip("]");
108 112
        target = canonicalize(target)
109
        print "'" + self.title +  "' => '" + target + "'"
113
        print self.title +  " => " + target
110 114
        return "REFERENCE", token
111 115
    
112 116
    def external_link(self, scanner, token):
......
154 158
        self.username = ""
155 159
        self.current_element = ""
156 160
        self.debug = False
157
        self.redirect = True
161
        self.redirect = False
162
        self.skip = False
158 163
        self.parser = BodyParser()
159 164

  
160 165
    def add_element(self, tag):
......
168 173
        self.add_element(tag)
169 174
        if self.current_element == WikipediaParser.PAGE:
170 175
            self.redirect = False
176
            self.skip = False
171 177
        elif self.current_element == WikipediaParser.REDIRECT:
172 178
            self.redirect = True
173 179
                
174 180
    def data(self, data):
175
        if self.current_element in WikipediaParser.READ_SET:
181
        if (not self.skip
182
            and self.current_element in WikipediaParser.READ_SET):
176 183
            self.element_content.append(data.encode('utf-8'))
177 184

  
178 185

  
179 186
    def end(self, tag):
180
        if self.current_element == WikipediaParser.TITLE:
187
        if self.skip:
188
            pass
189
        elif self.current_element == WikipediaParser.TITLE:
181 190
            self.title =  ''.join(self.element_content).strip()
182 191
            if self.title.startswith(WikipediaParser.NAMESPACES):
183 192
                self.skip = True

Also available in: Unified diff