# Copyright (c) 2010, Panos Louridas, GRNET S.A.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of GRNET S.A, nor the names of its contributors may be
# used to endorse or promote products derived from this software without
# specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import xml.parsers.expat
import sys
import re
import csv
import getopt

#
# Canonicalize a target or a page title following the rules in:
# http://en.wikipedia.org/wiki/Help:Link#Conversion_to_canonical_form
#
# Also capitalises the first letter following the prefix.
#
# Returns a tuple consisting of the canonicalized target, the prefix
# (denoted by the first colon, so it may not be a real namespace, but
# that's fine) and the part following the colon separator. If no colon
# is found, the last two elements are empty.
#
def canonicalize(s):
    s = s.strip(' ')
    s = s.strip('_')
    s = s.replace('_', ' ')
    length = len(s)
    if length == 0:
        return (s, '', '')
    (prefix, sep, after) = s.partition(':')
    if after != '':
        s = prefix + sep + after[0].upper() + after[1:]
    else:
        prefix = ''
        if length > 1:
            s = s[0].upper() + s[1:]
        else:
            s = s[0].upper()
    return (s, prefix, after)

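# A few worked examples for canonicalize() (added for illustration,
# traced by hand against the code above; note that the prefix itself
# is left uncapitalised):
#
#   canonicalize("foo_bar")          -> ("Foo bar", "", "")
#   canonicalize("category:foo bar") -> ("category:Foo bar", "category", "foo bar")
#   canonicalize("Help:links")       -> ("Help:Links", "Help", "links")
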
class BodyParser:

    ARTICLE_CLASSES = ["{{Featured article}}",
                       "{{Featured list}}",
                       "{{A-Class}}",
                       "{{Good article}}",
                       "{{B-Class}}",
                       "{{C-Class}}",
                       "{{Start-Class}}",
                       "{{Stub-Class}}"
                       ]

    ARTICLE_CLASSES_RE = [ re.compile(x, re.I) for x in ARTICLE_CLASSES ]

    IMPORTANCE_CLASSES = ["{{Top-importance}}",
                          "{{High-importance}}",
                          "{{Mid-Importance}}",
                          "{{Low-Importance}}",
                          "{{Bottom-Importance}}",
                          "{{No-Importance}}",
                          "{{NA-Importance}}"
                          ]

    IMPORTANCE_CLASSES_RE = [ re.compile(x, re.I) for x in IMPORTANCE_CLASSES ]

    def init_state(self):
        self.word_count = 0
        self.title = ""
        self.importance_class = ""
        self.article_class = ""
        self.category = []
        self.is_redirect = False

    def __init__(self):
        self.init_state()
        self.scanner = re.Scanner([
            (r"{{[^}]+}}", self.template),
            (r"\[\[[^]]+\]\]", self.reference),
            (r"\[[^]]+\]", self.external_link),
            (r"(?i)#REDIRECT\s*\[\[[^]]+\]\]", self.redirect),
            (r"<[^>]+>", self.skip),
            (r"\s+", self.skip),
            (r"\b\w+\b", self.word),
            (r"([^[]|\W+)", self.skip),
        ])

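    # Note on re.Scanner (added): it is a little-documented helper in the
    # standard re module. scan(text) tries the patterns above in order at
    # each position, invokes the paired method as callback(scanner, token),
    # and returns a (token_list, unmatched_remainder) pair.
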
    def template(self, scanner, token):
        for c in BodyParser.ARTICLE_CLASSES_RE:
            if re.search(c, token) is not None:
                self.article_class = c.pattern
                break
        for c in BodyParser.IMPORTANCE_CLASSES_RE:
            if re.search(c, token) is not None:
                self.importance_class = c.pattern
                break
        return "TEMPLATE", token

    def redirect(self, scanner, token):
        self.is_redirect = True
        target = token.split("[[")[1].split("]]")[0]
        target = canonicalize(target)[0]
        print self.title + " #REDIRECT " + target
        return "REDIRECT", token

    def reference(self, scanner, token):
        target = token.split("|")[0][2:].rstrip("]")
        (target, prefix, rest) = canonicalize(target)
        if (prefix == "Category"):
            self.category.append(rest)
        print self.title + " => " + target
        return "REFERENCE", token

    def external_link(self, scanner, token):
        return "EXTERNAL_LINK", token

    def word(self, scanner, token):
        self.word_count = self.word_count + 1
        return "WORD", token

    def skip(self, scanner, token):
        return "SKIP", token

    def parse(self, title, input):
        self.init_state()
        self.title = canonicalize(title)[0]
        return self.scanner.scan(input)

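# Sketch of using BodyParser on its own (added; traced by hand against
# the scanner rules above):
#
#   bp = BodyParser()
#   tokens, rest = bp.parse("Example", "{{Good article}} Some [[target|text]].")
#   # prints "Example => Target"; afterwards bp.word_count == 1 and
#   # bp.article_class == "{{Good article}}"
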
class WikipediaParser:

    NS = "http://www.mediawiki.org/xml/export-0.4/"
    PAGE = "/mediawiki/page"
    TITLE = PAGE + "/title"
    REDIRECT = PAGE + "/redirect"
    TEXT = PAGE + "/revision/text"
    TIMESTAMP = PAGE + "/revision/timestamp"
    USERNAME = PAGE + "/revision/contributor/username"

    READ_SET = set([TITLE, TEXT, TIMESTAMP, USERNAME])

    NAMESPACES = (
        'Media:', 'Special:', 'Talk:', 'User:', 'User talk:', 'Wikipedia:',
        'Wikipedia talk:', 'File:', 'Image:', 'File talk:', 'MediaWiki:',
        'MediaWiki talk:', 'Template:', 'Template talk:', 'Help:',
        'Help talk:', 'Category:', 'Category talk:', 'Portal:',
        'Portal talk:', 'Book:', 'Book talk:'
    )

    def __init__(self, output_writer):
        self.output_writer = output_writer
        self.element_content = []
        self.article_class = ""
        self.importance_class = ""
        self.title = ""
        self.timestamp = ""
        self.username = ""
        self.current_element = ""
        self.debug = False
        self.redirect = False
        self.keep_categories = ()
        self.skip = False
        self.parser = BodyParser()

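    # Note (added): the parser tracks its position in the XML tree by
    # joining open tag names into a path such as "/mediawiki/page/title"
    # and comparing it against the PAGE/TITLE/... constants above.
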
    def add_element(self, tag):
        self.current_element = self.current_element + "/" + tag

    def remove_element(self, tag):
        self.element_content = []
        self.current_element = self.current_element.rpartition("/")[0]

    def start(self, tag, attrib):
        self.add_element(tag)
        if self.current_element == WikipediaParser.PAGE:
            self.redirect = False
            self.skip = False
        elif self.current_element == WikipediaParser.REDIRECT:
            self.redirect = True

    def data(self, data):
        if (not self.skip
                and self.current_element in WikipediaParser.READ_SET):
            self.element_content.append(data.encode('utf-8'))

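    # Note (added): end() fires on each closing tag. For </text> it runs
    # BodyParser over the page body and, unless the page is a redirect,
    # emits one semicolon-separated row, e.g. (hypothetical page):
    #
    #   Foo;SomeUser;2010-01-01T00:00:00Z;1234;{{Good article}};{{Top-importance}};['Bar']
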
    def end(self, tag):
        if self.skip:
            pass
        elif self.current_element == WikipediaParser.TITLE:
            self.title = ''.join(self.element_content).strip()
            if not self.keep_categories:
                if self.title.startswith(WikipediaParser.NAMESPACES):
                    self.skip = True
            else:
                if not self.title.startswith(self.keep_categories):
                    self.skip = True
        elif self.current_element == WikipediaParser.TIMESTAMP:
            self.timestamp = ''.join(self.element_content).strip()
        elif self.current_element == WikipediaParser.USERNAME:
            self.username = ''.join(self.element_content).strip()
        elif self.current_element == WikipediaParser.TEXT:
            self.article_class = ""
            self.importance_class = ""
            page_content = ''.join(self.element_content)
            tokens, remainder = self.parser.parse(self.title, page_content)
            #for token in tokens:
            #    print token
            if remainder != "":
                print "'" + remainder + "'"
                print "Exiting..."
                sys.exit(1)
            if not self.redirect:
                self.output_writer.writerow([self.parser.title,
                                             self.username,
                                             self.timestamp,
                                             self.parser.word_count,
                                             self.parser.article_class,
                                             self.parser.importance_class,
                                             self.parser.category])
        self.remove_element(tag)

    def close(self):
        pass

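# Example invocations (added; the dump file names are hypothetical).
# The script reads a MediaWiki XML dump on stdin and writes CSV to stdout:
#
#   bzcat enwiki-pages-articles.xml.bz2 | python parser.py > pages.csv
#   python parser.py --categories="Help, Portal" < dump.xml > pages.csv
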
def main():
    try:
        opts, args = getopt.getopt(sys.argv[1:], "c:", ["categories="])
    except getopt.GetoptError, err:
        print str(err)
        sys.exit(1)

    categories = ()
    for o, a in opts:
        if o in ("-c", "--categories"):
            # Split the comma-separated list of title prefixes to keep.
            categories = set(c + ":" for c in re.split(r',\s*', a))
        else:
            assert False, "unhandled option " + o

    output_writer = csv.writer(sys.stdout, delimiter=';', quotechar='"',
                               quoting=csv.QUOTE_MINIMAL)

    p = xml.parsers.expat.ParserCreate()
    p.buffer_text = True

    wikipedia_parser = WikipediaParser(output_writer)
    wikipedia_parser.keep_categories = tuple(categories)

    p.StartElementHandler = wikipedia_parser.start
    p.EndElementHandler = wikipedia_parser.end
    p.CharacterDataHandler = wikipedia_parser.data

    p.ParseFile(sys.stdin)

if __name__ == "__main__":
    main()