root / mediawiki_parser / wiki.py @ 12:2ac9494ba25e
History | View | Annotate | Download (2.8 kB)
1 |
# -*- coding: utf-8 -*-
|
---|---|
2 |
|
3 |
""" Wikitext format handler """
|
4 |
import os, re |
5 |
from itertools import groupby |
6 |
from transifex.txcommon.log import logger |
7 |
from transifex.resources.formats.utils.decorators import * |
8 |
from transifex.resources.formats.utils.hash_tag import hash_tag |
9 |
from transifex.resources.formats.core import GenericTranslation, Handler, \ |
10 |
StringSet, ParseError, CompileError |
11 |
|
12 |
|
13 |
class WikiParseError(ParseError):
    """Parse error specific to the wiki format handler."""
|
15 |
|
16 |
|
17 |
class WikiCompileError(CompileError):
    """Compile error specific to the wiki format handler."""
|
19 |
|
20 |
|
21 |
class WikiHandler(Handler):
    """Format handler for wikitext (``.wiki``) files.

    A source file is split into paragraphs on blank lines.  Every non-empty
    paragraph becomes one translatable string, and a copy of the file in
    which each paragraph is replaced by a ``<hash>_tr`` placeholder is kept
    as the template.
    """

    # Human-readable name of this handler.
    name = "Wiki handler"
    # MIME types that accepts() recognises.
    mime_types = ['text/x-wiki']
    # Description of the supported file format.
    format = "Files extracted from Wikipedia (.wiki)"

    @classmethod
    def accepts(cls, filename=None, mime=None):
        """Return True if the filename ends in ``.wiki`` or the MIME type
        is one of ``mime_types``."""
        return (filename and filename.endswith('.wiki')) or mime in cls.mime_types

    @classmethod
    def contents_check(cls, filename):
        """No content validation is performed for wiki files."""
        pass

    @need_language
    @need_file
    def parse_file(self, is_source=False, lang_rules=None):
        """Read ``self.filename`` as UTF-8 and parse it as a source file.

        Only source files are supported, hence the assertion on
        ``is_source``.  ``lang_rules`` is accepted but not used here.

        Raises:
            WikiParseError: if reading, decoding or parsing fails.  The
                underlying error is logged with its traceback first.
        """
        # Kept outside the try block so a failed assertion propagates as-is.
        assert is_source
        try:
            with open(self.filename, 'r') as fh:
                buf = fh.read().decode('utf-8')
            self._parse(buf)
        except Exception as e:
            logger.error("Error in wiki text: %s" % e, exc_info=True)
            # The original raised an undefined ``WikiError`` here, which
            # surfaced as a NameError instead of a parse error.
            raise WikiParseError(unicode(e))

    def _parse(self, content):
        """Split ``content`` into paragraphs and build the template.

        ``content`` is the decoded text of the file.  Paragraphs are
        separated by a blank line (``"\\n\\n"``); a blank line that falls
        between ``{{`` and the next ``}}`` does not end a paragraph.  Each
        non-empty, whitespace-stripped paragraph is appended to the string
        set (source and translation are the same text) and is replaced in
        the template by ``<hash>_tr``.

        Sets ``self.stringset``, ``self.suggestions`` (always empty) and
        ``self.template`` (UTF-8 encoded).
        """
        stringset = StringSet()
        suggestions = StringSet()

        par_splitter = "\n\n"
        template_open = "{{"
        template_ends = "}}"

        context = ''

        # The template is assembled from slices of ``content`` instead of
        # by search-and-replace over the whole text: a global substitution
        # lets a short paragraph rewrite text inside other paragraphs and
        # inside placeholders that were inserted earlier.
        pieces = []
        copied_pos = 0  # content[:copied_pos] is already covered by pieces

        text_pos = 0    # start of the paragraph being collected
        scan_pos = 0    # where the search for the next delimiter resumes
        while True:
            par_pos = content.find(par_splitter, scan_pos)
            t_open_pos = content.find(template_open, scan_pos)

            if par_pos != -1 and (t_open_pos == -1 or par_pos < t_open_pos):
                # A paragraph break comes first: close the paragraph there.
                chunk_end = par_pos
                next_pos = par_pos + len(par_splitter)
                at_end = False
            else:
                if t_open_pos != -1:
                    # Look for the closing marker after *this* opener and
                    # resume scanning past it, so the same "}}" is never
                    # found twice.
                    # NOTE: nested templates are not tracked; the first
                    # "}}" closes the template.
                    t_end_pos = content.find(
                        template_ends, t_open_pos + len(template_open)
                    )
                    if t_end_pos != -1:
                        scan_pos = t_end_pos + len(template_ends)
                        continue
                # No further paragraph break (or an unclosed template):
                # the rest of the document is the last paragraph.
                chunk_end = len(content)
                next_pos = chunk_end
                at_end = True

            chunk = content[text_pos:chunk_end]
            source = chunk.strip()
            if source:
                # Position of the stripped text inside ``content``; the
                # surrounding whitespace stays in the template untouched.
                begin = text_pos + len(chunk) - len(chunk.lstrip())
                pieces.append(content[copied_pos:begin])
                pieces.append(
                    "%(hash)s_tr" % {'hash': hash_tag(source, context)}
                )
                copied_pos = begin + len(source)
                stringset.strings.append(GenericTranslation(source,
                    source, context=context))

            if at_end:
                break
            text_pos = scan_pos = next_pos

        pieces.append(content[copied_pos:])
        template = ''.join(pieces)

        self.stringset = stringset
        self.suggestions = suggestions
        self.template = str(template.encode('utf-8'))