# Copyright (c) 2010, Panos Louridas, GRNET S.A.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of GRNET S.A, nor the names of its contributors may be
# used to endorse or promote products derived from this software without
# specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import xml.parsers.expat
import sys
import re
import csv
import getopt

#
# Canonicalize a target or a page title following the rules in:
# http://en.wikipedia.org/wiki/Help:Link#Conversion_to_canonical_form
#
# Also capitalises the first letter following the prefix.
#
# Returns a tuple consisting of the canonicalized target, the prefix
# (denoted by the first colon, so it may not be a real namespace, but
# that's fine) and the part following the colon separator. If no colon
# is found, the last two elements are empty.
#
def canonicalize(s):
    s = s.strip(' ')
    s = s.strip('_')
    s = s.replace('_', ' ')
    length = len(s)
    if length == 0:
        return (s, '', '')
    (prefix, sep, after) = s.partition(':')
    if after != '':
        s = prefix + sep + after[0].upper() + after[1:]
    else:
        prefix = ''
        if length > 1:
            s = s[0].upper() + s[1:]
        else:
            s = s[0].upper()
    return (s, prefix, after)
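
# A quick illustration of the behaviour (hypothetical inputs):
#
#   canonicalize("category:riemann_surfaces")
#   # => ('category:Riemann surfaces', 'category', 'Riemann surfaces')
#   canonicalize("hilbert space")
#   # => ('Hilbert space', '', '')
#
# Note that only the part after the colon is capitalised, not the prefix.
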
class BodyParser:

    # Article quality assessment templates, from best to worst.
    ARTICLE_CLASSES = ["{{Featured article}}",
                       "{{Featured list}}",
                       "{{A-Class}}",
                       "{{Good article}}",
                       "{{B-Class}}",
                       "{{C-Class}}",
                       "{{Start-Class}}",
                       "{{Stub-Class}}"
                       ]

    ARTICLE_CLASSES_RE = [re.compile(x, re.I) for x in ARTICLE_CLASSES]

    # Article importance assessment templates, from most to least important.
    IMPORTANCE_CLASSES = ["{{Top-importance}}",
                          "{{High-importance}}",
                          "{{Mid-importance}}",
                          "{{Low-importance}}",
                          "{{Bottom-importance}}",
                          "{{No-importance}}",
                          "{{NA-importance}}"
                          ]

    IMPORTANCE_CLASSES_RE = [re.compile(x, re.I) for x in IMPORTANCE_CLASSES]

    def init_state(self):
        self.word_count = 0
        self.title = ""
        self.importance_class = ""
        self.article_class = ""
        self.category = []
        self.is_redirect = False

    def __init__(self):
        self.init_state()
        # Token rules are tried in order; the first pattern that matches
        # at each position wins.
        self.scanner = re.Scanner([
            (r"{{[^}]+}}", self.template),
            (r"\[\[[^]]+\]\]", self.reference),
            (r"\[[^]]+\]", self.external_link),
            (r"(?i)#REDIRECT\s*\[\[[^]]+\]\]", self.redirect),
            (r"<[^>]+>", self.skip),
            (r"\s+", self.skip),
            (r"\b\w+\b", self.word),
            (r"([^[]|\W+)", self.skip),
            ])
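
    # A rough sketch of what the scanner produces (hypothetical input):
    #
    #   BodyParser().parse("Example", "{{Stub-Class}} See [[Other page]].")
    #
    # returns (tokens, remainder), where tokens holds pairs such as
    # ("TEMPLATE", "{{Stub-Class}}"), ("WORD", "See") and
    # ("REFERENCE", "[[Other page]]"); whitespace and stray punctuation
    # come back as "SKIP" tokens.
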
    def template(self, scanner, token):
        for c in BodyParser.ARTICLE_CLASSES_RE:
            if c.search(token) is not None:
                self.article_class = c.pattern
                break
        for c in BodyParser.IMPORTANCE_CLASSES_RE:
            if c.search(token) is not None:
                self.importance_class = c.pattern
                break
        return "TEMPLATE", token

    def redirect(self, scanner, token):
        self.is_redirect = True
        target = token.split("[[")[1].split("]]")[0]
        target = canonicalize(target)[0]
        print self.title + " #REDIRECT " + target
        return "REDIRECT", token

    def reference(self, scanner, token):
        target = token.split("|")[0][2:].rstrip("]")
        (target, prefix, rest) = canonicalize(target)
        if prefix == "Category":
            self.category.append(rest)
        print self.title + " => " + target
        return "REFERENCE", token

    def external_link(self, scanner, token):
        return "EXTERNAL_LINK", token

    def word(self, scanner, token):
        self.word_count += 1
        return "WORD", token

    def skip(self, scanner, token):
        return "SKIP", token

    def parse(self, title, text):
        self.init_state()
        self.title = canonicalize(title)[0]
        return self.scanner.scan(text)

class WikipediaParser:

    # Element paths in the MediaWiki XML export; the parser tracks its
    # position in the document by building up such paths.
    NS = "http://www.mediawiki.org/xml/export-0.4/"
    PAGE = "/mediawiki/page"
    TITLE = PAGE + "/title"
    REDIRECT = PAGE + "/redirect"
    TEXT = PAGE + "/revision/text"
    TIMESTAMP = PAGE + "/revision/timestamp"
    USERNAME = PAGE + "/revision/contributor/username"

    # Elements whose character data is collected.
    READ_SET = set([TITLE, TEXT, TIMESTAMP, USERNAME])

    # Title prefixes of pages outside the main article namespace; such
    # pages are skipped unless explicitly requested.
    NAMESPACES = (
        'Media:', 'Special:', 'Talk:', 'User:', 'User talk:', 'Wikipedia:',
        'Wikipedia talk:', 'File:', 'Image:', 'File talk:', 'MediaWiki:',
        'MediaWiki talk:', 'Template:', 'Template talk:', 'Help:',
        'Help talk:', 'Category:', 'Category talk:', 'Portal:',
        'Portal talk:', 'Book:', 'Book talk:'
        )

    def __init__(self, output_writer):
        self.output_writer = output_writer
        self.element_content = []
        self.article_class = ""
        self.importance_class = ""
        self.title = ""
        self.timestamp = ""
        self.username = ""
        self.current_element = ""
        self.debug = False
        self.redirect = False
        self.keep_categories = ()
        self.skip = False
        self.parser = BodyParser()

    def add_element(self, tag):
        self.current_element = self.current_element + "/" + tag

    def remove_element(self, tag):
        self.element_content = []
        self.current_element = self.current_element.rpartition("/")[0]

    def start(self, tag, attrib):
        self.add_element(tag)
        if self.current_element == WikipediaParser.PAGE:
            self.redirect = False
            self.skip = False
        elif self.current_element == WikipediaParser.REDIRECT:
            self.redirect = True

    def data(self, data):
        if (not self.skip
            and self.current_element in WikipediaParser.READ_SET):
            self.element_content.append(data.encode('utf-8'))

    def end(self, tag):
        if self.skip:
            pass
        elif self.current_element == WikipediaParser.TITLE:
            self.title = ''.join(self.element_content).strip()
            if not self.keep_categories:
                if self.title.startswith(WikipediaParser.NAMESPACES):
                    self.skip = True
            else:
                if not self.title.startswith(self.keep_categories):
                    self.skip = True
        elif self.current_element == WikipediaParser.TIMESTAMP:
            self.timestamp = ''.join(self.element_content).strip()
        elif self.current_element == WikipediaParser.USERNAME:
            self.username = ''.join(self.element_content).strip()
        elif self.current_element == WikipediaParser.TEXT:
            self.article_class = ""
            self.importance_class = ""
            page_content = ''.join(self.element_content)
            tokens, remainder = self.parser.parse(self.title, page_content)
            #for token in tokens:
            #    print token
            if remainder != "":
                print "'" + remainder + "'"
                print "Exiting..."
                sys.exit(1)
            if not self.redirect:
                self.output_writer.writerow([self.parser.title,
                                             self.username,
                                             self.timestamp,
                                             self.parser.word_count,
                                             self.parser.article_class,
                                             self.parser.importance_class,
                                             self.parser.category])
        self.remove_element(tag)

    def close(self):
        pass
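
# Each page that is kept produces one CSV row via output_writer; a
# hypothetical example, using the ';' delimiter configured in main():
#
#   Hilbert space;SomeUser;2010-01-01T00:00:00Z;4170;{{Good article}};{{Top-importance}};['Functional analysis']
#
# The columns are title, username, timestamp, word count, article class,
# importance class, and the list of categories.
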
def main():
    try:
        opts, args = getopt.getopt(sys.argv[1:], "c:", ["categories="])
    except getopt.GetoptError, err:
        print str(err)
        sys.exit(1)

    categories = ()
    for o, a in opts:
        if o in ("-c", "--categories"):
            # getopt has already stripped the option flag, so split the
            # argument itself on commas (with optional whitespace).
            categories = set(c + ":" for c in re.split(r',\s*', a))
        else:
            assert False, "unhandled option " + o

    output_writer = csv.writer(sys.stdout, delimiter=';', quotechar='"',
261
                               quoting=csv.QUOTE_MINIMAL)
262

    
263
    p = xml.parsers.expat.ParserCreate()
264
    p.buffer_text = True
265

    
266
    wikipedia_parser = WikipediaParser(output_writer)
267
    wikipedia_parser.keep_categories = tuple(categories)
268

    
269
    p.StartElementHandler = wikipedia_parser.start
270
    p.EndElementHandler = wikipedia_parser.end
271
    p.CharacterDataHandler = wikipedia_parser.data
272

    
273
    p.ParseFile(sys.stdin)
274

    
275
if __name__ == "__main__":
276
    main()
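
# Example invocation (the dump filename is hypothetical); the script reads
# a MediaWiki XML export on standard input and writes CSV to standard
# output, so a run might look like:
#
#   bzcat enwiki-pages-articles.xml.bz2 | python parser.py \
#       --categories="Mathematics, Physics" > articles.csv
#
# With --categories given, only pages whose titles start with one of the
# listed prefixes (here "Mathematics:" or "Physics:") are kept.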