Statistics
| Branch: | Revision:

root / mediawiki_parser / wiki.py @ 12:2ac9494ba25e

History | View | Annotate | Download (2.8 kB)

1
# -*- coding: utf-8 -*-
2

    
3
""" Wikitext format handler """
4
import os, re
5
from itertools import groupby
6
from transifex.txcommon.log import logger
7
from transifex.resources.formats.utils.decorators import *
8
from transifex.resources.formats.utils.hash_tag import hash_tag
9
from transifex.resources.formats.core import GenericTranslation, Handler, \
10
        StringSet, ParseError, CompileError
11

    
12

    
13
class WikiParseError(ParseError):
    """Parse error specific to the wikitext format handler."""
15

    
16

    
17
class WikiCompileError(CompileError):
    """Compile error specific to the wikitext format handler."""
19

    
20

    
21
class WikiHandler(Handler):
    """Format handler for wikitext (``.wiki``) files.

    The document is cut into paragraph-sized source strings, and each one is
    swapped for a ``<hash>_tr`` placeholder in the stored template.
    """

    name = "Wiki handler"
    mime_types = ['text/x-wiki']
    format = "Files extracted from Wikipedia (.wiki)"

    @classmethod
    def accepts(cls, filename=None, mime=None):
        """Tell whether this handler applies to ``filename`` or ``mime``.

        The result is truthy when the file name ends in ``.wiki`` or when
        the MIME type is listed in ``mime_types``.
        """
        return (filename and filename.endswith('.wiki')) or mime in cls.mime_types

    @classmethod
    def contents_check(cls, filename):
        """Do nothing: no content validation is implemented for wiki files."""
        pass

    @need_language
    @need_file
    def parse_file(self, is_source=False, lang_rules=None):
        """Read ``self.filename`` as UTF-8 and parse it as wikitext.

        Args:
            is_source: must be true; this handler only parses source files.
            lang_rules: accepted for interface compatibility, not used here.

        Raises:
            AssertionError: if ``is_source`` is false.
            WikiParseError: if reading, decoding or parsing fails.
        """
        assert is_source
        try:
            with open(self.filename, 'r') as fh:
                buf = fh.read().decode('utf-8')
            self._parse(buf)
        except Exception as e:
            logger.error("Error in wiki text: %s", e, exc_info=True)
            # Report every failure through the handler's own parse error type.
            raise WikiParseError(unicode(e))

    def _parse(self, content):
        """Split ``content`` into paragraphs and build the template.

        Paragraphs are separated by a blank line (two consecutive newlines).
        A separator that falls inside a ``{{ ... }}`` template does not split
        the text. Nested templates are not tracked: the first ``}}`` after an
        opener is taken as its end.

        Results are stored on the instance:
            ``self.stringset``   -- one ``GenericTranslation`` per non-empty
                                    paragraph, with translation equal to source.
            ``self.suggestions`` -- an empty ``StringSet``.
            ``self.template``    -- ``content`` with each paragraph replaced by
                                    ``<hash>_tr``, encoded as UTF-8.

        Args:
            content: the decoded text of the wiki file.
        """
        stringset = StringSet()
        suggestions = StringSet()

        par_splitter = "\n\n"
        template_open = "{{"
        template_ends = "}}"

        template = content
        context = ''

        scan_pos = 0    # where the search for the next delimiter resumes
        text_pos = 0    # where the current paragraph starts
        at_end = False
        while not at_end:
            par_pos = content.find(par_splitter, scan_pos)
            t_open_pos = content.find(template_open, scan_pos)

            if t_open_pos != -1 and (par_pos == -1 or t_open_pos < par_pos):
                # A template opens before the next paragraph break: jump past
                # its closing marker so breaks inside it are ignored. The
                # search starts after the opener and the cursor always moves
                # forward, which guarantees the loop terminates.
                t_end_pos = content.find(
                    template_ends, t_open_pos + len(template_open))
                if t_end_pos == -1:
                    # Unterminated template: the rest of the document
                    # belongs to the current paragraph.
                    scan_pos = len(content)
                else:
                    scan_pos = t_end_pos + len(template_ends)
                continue

            if par_pos == -1:
                # No more separators: the remainder is the last paragraph.
                source = content[text_pos:].strip()
                at_end = True
            else:
                source = content[text_pos:par_pos].strip()
                scan_pos = text_pos = par_pos + len(par_splitter)

            if not source:
                continue

            # NOTE(review): this replaces every occurrence of the paragraph
            # text in the template, so a short paragraph that also appears
            # inside a longer one will be substituted there too -- confirm
            # whether a position-based replacement is wanted.
            template = template.replace(
                source,
                "%(hash)s_tr" % {'hash': hash_tag(source, context)}
            )
            stringset.strings.append(
                GenericTranslation(source, source, context=context))

        self.stringset = stringset
        self.suggestions = suggestions
        self.template = template.encode('utf-8')