|
1 |
#
|
|
2 |
# ElementTree
|
|
3 |
# $Id: ElementTree.py 3224 2007-08-27 21:23:39Z fredrik $
|
|
4 |
#
|
|
5 |
# light-weight XML support for Python 1.5.2 and later.
|
|
6 |
#
|
|
7 |
# history:
|
|
8 |
# 2001-10-20 fl created (from various sources)
|
|
9 |
# 2001-11-01 fl return root from parse method
|
|
10 |
# 2002-02-16 fl sort attributes in lexical order
|
|
11 |
# 2002-04-06 fl TreeBuilder refactoring, added PythonDoc markup
|
|
12 |
# 2002-05-01 fl finished TreeBuilder refactoring
|
|
13 |
# 2002-07-14 fl added basic namespace support to ElementTree.write
|
|
14 |
# 2002-07-25 fl added QName attribute support
|
|
15 |
# 2002-10-20 fl fixed encoding in write
|
|
16 |
# 2002-11-24 fl changed default encoding to ascii; fixed attribute encoding
|
|
17 |
# 2002-11-27 fl accept file objects or file names for parse/write
|
|
18 |
# 2002-12-04 fl moved XMLTreeBuilder back to this module
|
|
19 |
# 2003-01-11 fl fixed entity encoding glitch for us-ascii
|
|
20 |
# 2003-02-13 fl added XML literal factory
|
|
21 |
# 2003-02-21 fl added ProcessingInstruction/PI factory
|
|
22 |
# 2003-05-11 fl added tostring/fromstring helpers
|
|
23 |
# 2003-05-26 fl added ElementPath support
|
|
24 |
# 2003-07-05 fl added makeelement factory method
|
|
25 |
# 2003-07-28 fl added more well-known namespace prefixes
|
|
26 |
# 2003-08-15 fl fixed typo in ElementTree.findtext (Thomas Dartsch)
|
|
27 |
# 2003-09-04 fl fall back on emulator if ElementPath is not installed
|
|
28 |
# 2003-10-31 fl markup updates
|
|
29 |
# 2003-11-15 fl fixed nested namespace bug
|
|
30 |
# 2004-03-28 fl added XMLID helper
|
|
31 |
# 2004-06-02 fl added default support to findtext
|
|
32 |
# 2004-06-08 fl fixed encoding of non-ascii element/attribute names
|
|
33 |
# 2004-08-23 fl take advantage of post-2.1 expat features
|
|
34 |
# 2005-02-01 fl added iterparse implementation
|
|
35 |
# 2005-03-02 fl fixed iterparse support for pre-2.2 versions
|
|
36 |
# 2006-11-18 fl added parser support for IronPython (ElementIron)
|
|
37 |
# 2007-08-27 fl fixed newlines in attributes
|
|
38 |
#
|
|
39 |
# Copyright (c) 1999-2007 by Fredrik Lundh. All rights reserved.
|
|
40 |
#
|
|
41 |
# fredrik@pythonware.com
|
|
42 |
# http://www.pythonware.com
|
|
43 |
#
|
|
44 |
# --------------------------------------------------------------------
|
|
45 |
# The ElementTree toolkit is
|
|
46 |
#
|
|
47 |
# Copyright (c) 1999-2007 by Fredrik Lundh
|
|
48 |
#
|
|
49 |
# By obtaining, using, and/or copying this software and/or its
|
|
50 |
# associated documentation, you agree that you have read, understood,
|
|
51 |
# and will comply with the following terms and conditions:
|
|
52 |
#
|
|
53 |
# Permission to use, copy, modify, and distribute this software and
|
|
54 |
# its associated documentation for any purpose and without fee is
|
|
55 |
# hereby granted, provided that the above copyright notice appears in
|
|
56 |
# all copies, and that both that copyright notice and this permission
|
|
57 |
# notice appear in supporting documentation, and that the name of
|
|
58 |
# Secret Labs AB or the author not be used in advertising or publicity
|
|
59 |
# pertaining to distribution of the software without specific, written
|
|
60 |
# prior permission.
|
|
61 |
#
|
|
62 |
# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
|
|
63 |
# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
|
|
64 |
# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
|
|
65 |
# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
|
|
66 |
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
|
67 |
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
|
68 |
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
|
69 |
# OF THIS SOFTWARE.
|
|
70 |
# --------------------------------------------------------------------
|
|
71 |
|
|
72 |
# Names exported by "from ElementTree import *".
__all__ = [
    # public symbols
    "Comment",
    "dump",
    "Element",
    "ElementTree",
    "fromstring",
    "iselement",
    "iterparse",
    "parse",
    "PI",
    "ProcessingInstruction",
    "QName",
    "SubElement",
    "tostring",
    "TreeBuilder",
    "VERSION",
    "XML",
    "XMLTreeBuilder",
]

# parser api override (None = use default)
parser_api = None

# TODO: add support for custom namespace resolvers/default namespaces
# TODO: add improved support for incremental parsing

# Version of this module.
VERSION = "1.2.7"
|
|
96 |
|
|
97 |
##
|
|
98 |
# The <b>Element</b> type is a flexible container object, designed to
|
|
99 |
# store hierarchical data structures in memory. The type can be
|
|
100 |
# described as a cross between a list and a dictionary.
|
|
101 |
# <p>
|
|
102 |
# Each element has a number of properties associated with it:
|
|
103 |
# <ul>
|
|
104 |
# <li>a <i>tag</i>. This is a string identifying what kind of data
|
|
105 |
# this element represents (the element type, in other words).</li>
|
|
106 |
# <li>a number of <i>attributes</i>, stored in a Python dictionary.</li>
|
|
107 |
# <li>a <i>text</i> string.</li>
|
|
108 |
# <li>an optional <i>tail</i> string.</li>
|
|
109 |
# <li>a number of <i>child elements</i>, stored in a Python sequence</li>
|
|
110 |
# </ul>
|
|
111 |
#
|
|
112 |
# To create an element instance, use the {@link #Element} or {@link
|
|
113 |
# #SubElement} factory functions.
|
|
114 |
# <p>
|
|
115 |
# The {@link #ElementTree} class can be used to wrap an element
|
|
116 |
# structure, and convert it from and to XML.
|
|
117 |
##
|
|
118 |
|
|
119 |
|
|
120 |
import sys, re
|
|
121 |
|
|
122 |
try:
    import string
except:
    # The string module could not be imported (this happens under
    # IronPython), so provide an object exposing the handful of
    # string functions this module relies on.
    class string(object):
        def strip(self, text, *args):
            return text.strip(*args)
        def split(self, text, *args):
            return text.split(*args)
        def replace(self, text, *args):
            return text.replace(*args)
        def join(self, seq, sep):
            return sep.join(seq)
    # rebind the name to an instance, so that "string.join(...)" etc.
    # can be called just like the module-level functions
    string = string()
|
|
136 |
|
|
137 |
class _SimpleElementPath:
    # Minimal stand-in for the ElementPath module: emulates the
    # pre-1.2 find/findtext/findall behaviour, which only looks at
    # direct children (plus a ".//tag" shortcut in findall).

    def find(self, element, tag):
        # first direct child carrying the given tag, or None
        for child in element:
            if child.tag == tag:
                return child
        return None

    def findtext(self, element, tag, default=None):
        # text of the first matching child ("" if it has no text),
        # or the default if no child matches
        match = self.find(element, tag)
        if match is None:
            return default
        return match.text or ""

    def findall(self, element, tag):
        # ".//tag" is delegated to the element's own tree iterator
        if tag[:3] == ".//":
            return element.getiterator(tag[3:])
        # otherwise collect matching direct children, in order
        matches = []
        for child in element:
            if child.tag == tag:
                matches.append(child)
        return matches

try:
    import ElementPath
except ImportError:
    # FIXME: issue warning in this case?
    # fall back on the simple emulation defined above
    ElementPath = _SimpleElementPath()
|
|
163 |
|
|
164 |
class DefaultParserAPI:
    # Default parser front end; the module-level parser_api object
    # is an instance of this class.

    def parse(self, source, parser=None):
        # Feeds the contents of source to a parser and returns whatever
        # the parser's close method returns.
        #
        # source is either an object with a read method, or something
        # that can be passed to open().  parser is an optional object
        # with feed and close methods; if omitted (or false), a new
        # XMLTreeBuilder is used.
        opened_here = not hasattr(source, "read")
        if opened_here:
            source = open(source, "rb")
        try:
            if not parser:
                parser = XMLTreeBuilder()
            while 1:
                data = source.read(32768)
                if not data:
                    break
                parser.feed(data)
            return parser.close()
        finally:
            # only close files opened by this method; file objects
            # passed in by the caller remain the caller's to manage
            if opened_here:
                source.close()

    def iterparse(self, source, events):
        # thin wrapper around the module-level _iterparse helper
        return _iterparse(source, events)

    def fromstring(self, text):
        # parses a complete document held in a single string
        parser = XMLTreeBuilder()
        parser.feed(text)
        return parser.close()

# the active parser api, and a reference to the built-in default
parser_api = default_parser_api = DefaultParserAPI()
|
|
187 |
|
|
188 |
##
|
|
189 |
# Internal element class. This class defines the Element interface,
|
|
190 |
# and provides a reference implementation of this interface.
|
|
191 |
# <p>
|
|
192 |
# You should not create instances of this class directly. Use the
|
|
193 |
# appropriate factory functions instead, such as {@link #Element}
|
|
194 |
# and {@link #SubElement}.
|
|
195 |
#
|
|
196 |
# @see Element
|
|
197 |
# @see SubElement
|
|
198 |
# @see Comment
|
|
199 |
# @see ProcessingInstruction
|
|
200 |
|
|
201 |
class _ElementInterface:
|
|
202 |
# <tag attrib>text<child/>...</tag>tail
|
|
203 |
|
|
204 |
##
|
|
205 |
# (Attribute) Element tag.
|
|
206 |
|
|
207 |
tag = None
|
|
208 |
|
|
209 |
##
|
|
210 |
# (Attribute) Element attribute dictionary. Where possible, use
|
|
211 |
# {@link #_ElementInterface.get},
|
|
212 |
# {@link #_ElementInterface.set},
|
|
213 |
# {@link #_ElementInterface.keys}, and
|
|
214 |
# {@link #_ElementInterface.items} to access
|
|
215 |
# element attributes.
|
|
216 |
|
|
217 |
attrib = None
|
|
218 |
|
|
219 |
##
|
|
220 |
# (Attribute) Text before first subelement. This is either a
|
|
221 |
# string or the value None, if there was no text.
|
|
222 |
|
|
223 |
text = None
|
|
224 |
|
|
225 |
##
|
|
226 |
# (Attribute) Text after this element's end tag, but before the
|
|
227 |
# next sibling element's start tag. This is either a string or
|
|
228 |
# the value None, if there was no text.
|
|
229 |
|
|
230 |
tail = None # text after end tag, if any
|
|
231 |
|
|
232 |
def __init__(self, tag, attrib):
|
|
233 |
self.tag = tag
|
|
234 |
self.attrib = attrib
|
|
235 |
self._children = []
|
|
236 |
|
|
237 |
def __repr__(self):
|
|
238 |
return "<Element %s at %x>" % (self.tag, id(self))
|
|
239 |
|
|
240 |
##
|
|
241 |
# Creates a new element object of the same type as this element.
|
|
242 |
#
|
|
243 |
# @param tag Element tag.
|
|
244 |
# @param attrib Element attributes, given as a dictionary.
|
|
245 |
# @return A new element instance.
|
|
246 |
|
|
247 |
def makeelement(self, tag, attrib):
|
|
248 |
return Element(tag, attrib)
|
|
249 |
|
|
250 |
##
|
|
251 |
# Returns the number of subelements.
|
|
252 |
#
|
|
253 |
# @return The number of subelements.
|
|
254 |
|
|
255 |
def __len__(self):
|
|
256 |
return len(self._children)
|
|
257 |
|
|
258 |
##
|
|
259 |
# Returns the given subelement.
|
|
260 |
#
|
|
261 |
# @param index What subelement to return.
|
|
262 |
# @return The given subelement.
|
|
263 |
# @exception IndexError If the given element does not exist.
|
|
264 |
|
|
265 |
def __getitem__(self, index):
|
|
266 |
return self._children[index]
|
|
267 |
|
|
268 |
##
|
|
269 |
# Replaces the given subelement.
|
|
270 |
#
|
|
271 |
# @param index What subelement to replace.
|
|
272 |
# @param element The new element value.
|
|
273 |
# @exception IndexError If the given element does not exist.
|
|
274 |
# @exception AssertionError If element is not a valid object.
|
|
275 |
|
|
276 |
def __setitem__(self, index, element):
|
|
277 |
assert iselement(element)
|
|
278 |
self._children[index] = element
|
|
279 |
|
|
280 |
##
|
|
281 |
# Deletes the given subelement.
|
|
282 |
#
|
|
283 |
# @param index What subelement to delete.
|
|
284 |
# @exception IndexError If the given element does not exist.
|
|
285 |
|
|
286 |
def __delitem__(self, index):
|
|
287 |
del self._children[index]
|
|
288 |
|
|
289 |
##
|
|
290 |
# Returns a list containing subelements in the given range.
|
|
291 |
#
|
|
292 |
# @param start The first subelement to return.
|
|
293 |
# @param stop The first subelement that shouldn't be returned.
|
|
294 |
# @return A sequence object containing subelements.
|
|
295 |
|
|
296 |
def __getslice__(self, start, stop):
|
|
297 |
return self._children[start:stop]
|
|
298 |
|
|
299 |
##
|
|
300 |
# Replaces a number of subelements with elements from a sequence.
|
|
301 |
#
|
|
302 |
# @param start The first subelement to replace.
|
|
303 |
# @param stop The first subelement that shouldn't be replaced.
|
|
304 |
# @param elements A sequence object with zero or more elements.
|
|
305 |
# @exception AssertionError If a sequence member is not a valid object.
|
|
306 |
|
|
307 |
def __setslice__(self, start, stop, elements):
|
|
308 |
for element in elements:
|
|
309 |
assert iselement(element)
|
|
310 |
self._children[start:stop] = list(elements)
|
|
311 |
|
|
312 |
##
|
|
313 |
# Deletes a number of subelements.
|
|
314 |
#
|
|
315 |
# @param start The first subelement to delete.
|
|
316 |
# @param stop The first subelement to leave in there.
|
|
317 |
|
|
318 |
def __delslice__(self, start, stop):
|
|
319 |
del self._children[start:stop]
|
|
320 |
|
|
321 |
##
|
|
322 |
# Adds a subelement to the end of this element.
|
|
323 |
#
|
|
324 |
# @param element The element to add.
|
|
325 |
# @exception AssertionError If a sequence member is not a valid object.
|
|
326 |
|
|
327 |
def append(self, element):
|
|
328 |
assert iselement(element)
|
|
329 |
self._children.append(element)
|
|
330 |
|
|
331 |
##
|
|
332 |
# Inserts a subelement at the given position in this element.
|
|
333 |
#
|
|
334 |
# @param index Where to insert the new subelement.
|
|
335 |
# @exception AssertionError If the element is not a valid object.
|
|
336 |
|
|
337 |
def insert(self, index, element):
|
|
338 |
assert iselement(element)
|
|
339 |
self._children.insert(index, element)
|
|
340 |
|
|
341 |
##
|
|
342 |
# Removes a matching subelement. Unlike the <b>find</b> methods,
|
|
343 |
# this method compares elements based on identity, not on tag
|
|
344 |
# value or contents.
|
|
345 |
#
|
|
346 |
# @param element What element to remove.
|
|
347 |
# @exception ValueError If a matching element could not be found.
|
|
348 |
# @exception AssertionError If the element is not a valid object.
|
|
349 |
|
|
350 |
def remove(self, element):
|
|
351 |
assert iselement(element)
|
|
352 |
self._children.remove(element)
|
|
353 |
|
|
354 |
##
|
|
355 |
# Returns all subelements. The elements are returned in document
|
|
356 |
# order.
|
|
357 |
#
|
|
358 |
# @return A list of subelements.
|
|
359 |
# @defreturn list of Element instances
|
|
360 |
|
|
361 |
def getchildren(self):
|
|
362 |
return self._children
|
|
363 |
|
|
364 |
##
|
|
365 |
# Finds the first matching subelement, by tag name or path.
|
|
366 |
#
|
|
367 |
# @param path What element to look for.
|
|
368 |
# @return The first matching element, or None if no element was found.
|
|
369 |
# @defreturn Element or None
|
|
370 |
|
|
371 |
def find(self, path):
|
|
372 |
return ElementPath.find(self, path)
|
|
373 |
|
|
374 |
##
|
|
375 |
# Finds text for the first matching subelement, by tag name or path.
|
|
376 |
#
|
|
377 |
# @param path What element to look for.
|
|
378 |
# @param default What to return if the element was not found.
|
|
379 |
# @return The text content of the first matching element, or the
|
|
380 |
# default value no element was found. Note that if the element
|
|
381 |
# has is found, but has no text content, this method returns an
|
|
382 |
# empty string.
|
|
383 |
# @defreturn string
|
|
384 |
|
|
385 |
def findtext(self, path, default=None):
|
|
386 |
return ElementPath.findtext(self, path, default)
|
|
387 |
|
|
388 |
##
|
|
389 |
# Finds all matching subelements, by tag name or path.
|
|
390 |
#
|
|
391 |
# @param path What element to look for.
|
|
392 |
# @return A list or iterator containing all matching elements,
|
|
393 |
# in document order.
|
|
394 |
# @defreturn list of Element instances
|
|
395 |
|
|
396 |
def findall(self, path):
|
|
397 |
return ElementPath.findall(self, path)
|
|
398 |
|
|
399 |
##
|
|
400 |
# Resets an element. This function removes all subelements, clears
|
|
401 |
# all attributes, and sets the text and tail attributes to None.
|
|
402 |
|
|
403 |
def clear(self):
|
|
404 |
self.attrib.clear()
|
|
405 |
self._children = []
|
|
406 |
self.text = self.tail = None
|
|
407 |
|
|
408 |
##
|
|
409 |
# Gets an element attribute.
|
|
410 |
#
|
|
411 |
# @param key What attribute to look for.
|
|
412 |
# @param default What to return if the attribute was not found.
|
|
413 |
# @return The attribute value, or the default value, if the
|
|
414 |
# attribute was not found.
|
|
415 |
# @defreturn string or None
|
|
416 |
|
|
417 |
def get(self, key, default=None):
|
|
418 |
return self.attrib.get(key, default)
|
|
419 |
|
|
420 |
##
|
|
421 |
# Sets an element attribute.
|
|
422 |
#
|
|
423 |
# @param key What attribute to set.
|
|
424 |
# @param value The attribute value.
|
|
425 |
|
|
426 |
def set(self, key, value):
|
|
427 |
self.attrib[key] = value
|
|
428 |
|
|
429 |
##
|
|
430 |
# Gets a list of attribute names. The names are returned in an
|
|
431 |
# arbitrary order (just like for an ordinary Python dictionary).
|
|
432 |
#
|
|
433 |
# @return A list of element attribute names.
|
|
434 |
# @defreturn list of strings
|
|
435 |
|
|
436 |
def keys(self):
|
|
437 |
return self.attrib.keys()
|
|
438 |
|
|
439 |
##
|
|
440 |
# Gets element attributes, as a sequence. The attributes are
|
|
441 |
# returned in an arbitrary order.
|
|
442 |
#
|
|
443 |
# @return A list of (name, value) tuples for all attributes.
|
|
444 |
# @defreturn list of (string, string) tuples
|
|
445 |
|
|
446 |
def items(self):
|
|
447 |
return self.attrib.items()
|
|
448 |
|
|
449 |
##
|
|
450 |
# Creates a tree iterator. The iterator loops over this element
|
|
451 |
# and all subelements, in document order, and returns all elements
|
|
452 |
# with a matching tag.
|
|
453 |
# <p>
|
|
454 |
# If the tree structure is modified during iteration, the result
|
|
455 |
# is undefined.
|
|
456 |
#
|
|
457 |
# @param tag What tags to look for (default is to return all elements).
|
|
458 |
# @return A list or iterator containing all the matching elements.
|
|
459 |
# @defreturn list or iterator
|
|
460 |
|
|
461 |
def getiterator(self, tag=None):
|
|
462 |
nodes = []
|
|
463 |
if tag == "*":
|
|
464 |
tag = None
|
|
465 |
if tag is None or self.tag == tag:
|
|
466 |
nodes.append(self)
|
|
467 |
for node in self._children:
|
|
468 |
nodes.extend(node.getiterator(tag))
|
|
469 |
return nodes
|
|
470 |
|
|
471 |
# compatibility
|
|
472 |
_Element = _ElementInterface
|
|
473 |
|
|
474 |
##
# Element factory.  Returns an object that implements the standard
# Element interface.  Which class or type that object has is an
# implementation detail, but it is always compatible with the
# {@link #_ElementInterface} class in this module.
# <p>
# The element name, attribute names, and attribute values can be
# either 8-bit ASCII strings or Unicode strings.
#
# @param tag The element name.
# @param attrib An optional dictionary, containing element attributes.
#     The dictionary is copied, not modified.
# @param **extra Additional attributes, given as keyword arguments.
# @return An element instance.
# @defreturn Element

def Element(tag, attrib={}, **extra):
    # work on a copy, so neither the caller's dictionary nor the
    # shared default is ever modified
    merged = attrib.copy()
    merged.update(extra)
    return _ElementInterface(tag, merged)
|
|
493 |
|
|
494 |
##
# Subelement factory.  Creates a new element through the parent's
# makeelement method, and appends it to the parent.
# <p>
# The element name, attribute names, and attribute values can be
# either 8-bit ASCII strings or Unicode strings.
#
# @param parent The parent element.
# @param tag The subelement name.
# @param attrib An optional dictionary, containing element attributes.
#     The dictionary is copied, not modified.
# @param **extra Additional attributes, given as keyword arguments.
# @return An element instance.
# @defreturn Element

def SubElement(parent, tag, attrib={}, **extra):
    # combine the dictionary and keyword attributes in a fresh copy
    combined = attrib.copy()
    combined.update(extra)
    child = parent.makeelement(tag, combined)
    parent.append(child)
    return child
|
|
514 |
|
|
515 |
##
# Comment element factory.  Creates a special element that is
# serialized as an XML comment.  The element uses this factory
# function itself as its tag.
# <p>
# The comment string can be either an 8-bit ASCII string or a Unicode
# string.
#
# @param text A string containing the comment string.
# @return An element instance, representing a comment.
# @defreturn Element

def Comment(text=None):
    comment = Element(Comment)
    comment.text = text
    return comment
|
|
530 |
|
|
531 |
##
# PI element factory.  Creates a special element that is serialized
# as an XML processing instruction.  The element uses this factory
# function itself as its tag, and stores the target (followed by a
# space and the contents, if any) in its text attribute.
#
# @param target A string containing the PI target.
# @param text A string containing the PI contents, if any.
# @return An element instance, representing a PI.
# @defreturn Element

def ProcessingInstruction(target, text=None):
    content = target
    if text:
        content = content + " " + text
    element = Element(ProcessingInstruction)
    element.text = content
    return element

# short alias
PI = ProcessingInstruction
|
|
548 |
|
|
549 |
##
# QName wrapper.  This can be used to wrap a QName attribute value, in
# order to get proper namespace handling on output.
# <p>
# QName objects compare and hash like their text value, so a QName
# is equal to another QName, or a plain string, with the same text.
#
# @param text A string containing the QName value, in the form {uri}local,
#     or, if the tag argument is given, the URI part of a QName.
# @param tag Optional tag.  If given, the first argument is interpreted as
#     an URI, and this argument is interpreted as a local name.
# @return An opaque object, representing the QName.

class QName:
    def __init__(self, text_or_uri, tag=None):
        if tag:
            text_or_uri = "{%s}%s" % (text_or_uri, tag)
        self.text = text_or_uri
    def __str__(self):
        return self.text
    def __hash__(self):
        return hash(self.text)
    def __cmp__(self, other):
        # three-way comparison, for interpreters that still use it
        if isinstance(other, QName):
            return cmp(self.text, other.text)
        return cmp(self.text, other)
    # Rich comparisons.  These mirror __cmp__ (compare by text, and
    # unwrap the other operand if it is a QName as well), so that
    # equality, ordering and dictionary lookups also work on
    # interpreters that ignore __cmp__.
    def __eq__(self, other):
        if isinstance(other, QName):
            other = other.text
        return self.text == other
    def __ne__(self, other):
        if isinstance(other, QName):
            other = other.text
        return self.text != other
    def __lt__(self, other):
        if isinstance(other, QName):
            other = other.text
        return self.text < other
    def __le__(self, other):
        if isinstance(other, QName):
            other = other.text
        return self.text <= other
    def __gt__(self, other):
        if isinstance(other, QName):
            other = other.text
        return self.text > other
    def __ge__(self, other):
        if isinstance(other, QName):
            other = other.text
        return self.text >= other
|
|
572 |
|
|
573 |
##
# ElementTree wrapper class.  This class represents an entire element
# hierarchy, and adds some extra support for serialization to and from
# standard XML.
#
# @param element Optional root element.
# @keyparam file Optional file handle or name.  If given, the
#     tree is initialized with the contents of this XML file.

class ElementTree:

    def __init__(self, element=None, file=None):
        assert element is None or iselement(element)
        self._root = element # first node
        if file:
            self.parse(file)

    ##
    # Gets the root element for this tree.
    #
    # @return An element instance.
    # @defreturn Element

    def getroot(self):
        return self._root

    ##
    # Replaces the root element for this tree.  This discards the
    # current contents of the tree, and replaces it with the given
    # element.  Use with care.
    #
    # @param element An element instance.

    def _setroot(self, element):
        assert iselement(element)
        self._root = element

    ##
    # Loads an external XML document into this element tree.
    #
    # @param source A file name or file object.
    # @param parser An optional parser instance.  If not given, the
    #     standard {@link XMLTreeBuilder} parser is used.
    # @return The document root element.
    # @defreturn Element

    def parse(self, source, parser=None):
        if parser:
            # an explicit parser always goes through the built-in
            # api, which accepts a parser argument
            tree = default_parser_api.parse(source, parser)
        else:
            # otherwise use whatever parser api is currently active
            tree = parser_api.parse(source)
        self._root = tree
        return tree

    ##
    # Creates a tree iterator for the root element.  The iterator loops
    # over all elements in this tree, in document order.
    #
    # @param tag What tags to look for (default is to return all elements)
    # @return An iterator.
    # @defreturn iterator

    def getiterator(self, tag=None):
        assert self._root is not None
        return self._root.getiterator(tag)

    ##
    # Finds the first toplevel element with given tag.
    # Same as getroot().find(path), except that a leading "/" in the
    # path is rewritten to "./".
    #
    # @param path What element to look for.
    # @return The first matching element, or None if no element was found.
    # @defreturn Element or None

    def find(self, path):
        assert self._root is not None
        if path[:1] == "/":
            path = "." + path
        return self._root.find(path)

    ##
    # Finds the element text for the first toplevel element with given
    # tag.  Same as getroot().findtext(path), except that a leading "/"
    # in the path is rewritten to "./".
    #
    # @param path What toplevel element to look for.
    # @param default What to return if the element was not found.
    # @return The text content of the first matching element, or the
    #     default value if no element was found.  Note that if the
    #     element is found, but has no text content, this method
    #     returns an empty string.
    # @defreturn string

    def findtext(self, path, default=None):
        assert self._root is not None
        if path[:1] == "/":
            path = "." + path
        return self._root.findtext(path, default)

    ##
    # Finds all toplevel elements with the given tag.
    # Same as getroot().findall(path), except that a leading "/" in the
    # path is rewritten to "./".
    #
    # @param path What element to look for.
    # @return A list or iterator containing all matching elements,
    #    in document order.
    # @defreturn list of Element instances

    def findall(self, path):
        assert self._root is not None
        if path[:1] == "/":
            path = "." + path
        return self._root.findall(path)

    ##
    # Writes the element tree to a file, as XML.  If a file name is
    # given, the file is opened in binary mode, and closed again once
    # the tree has been written (or if writing fails).  File objects
    # passed in by the caller are left open.
    #
    # @param file A file name, or a file object opened for writing.
    # @param encoding Optional output encoding (default is US-ASCII).

    def write(self, file, encoding="us-ascii"):
        assert self._root is not None
        opened_here = not hasattr(file, "write")
        if opened_here:
            file = open(file, "wb")
        try:
            if not encoding:
                encoding = "us-ascii"
            elif encoding != "utf-8" and encoding != "us-ascii":
                # an XML declaration is only emitted for encodings
                # other than utf-8 and us-ascii
                file.write("<?xml version='1.0' encoding='%s'?>\n" % encoding)
            self._write(file, self._root, encoding, {})
        finally:
            # only close what this method opened
            if opened_here:
                file.close()

    def _write(self, file, node, encoding, namespaces):
        # write XML to file.  namespaces maps namespace uris to the
        # prefixes that are in scope for this node.
        tag = node.tag
        if tag is Comment:
            file.write("<!-- %s -->" % _escape_cdata(node.text, encoding))
        elif tag is ProcessingInstruction:
            file.write("<?%s?>" % _escape_cdata(node.text, encoding))
        else:
            # sort a private copy, in case items() hands out a list
            # that is owned by the element (or no list at all)
            items = list(node.items())
            xmlns_items = [] # new namespaces in this scope
            try:
                if isinstance(tag, QName) or tag[:1] == "{":
                    tag, xmlns = fixtag(tag, namespaces)
                    if xmlns: xmlns_items.append(xmlns)
            except TypeError:
                _raise_serialization_error(tag)
            file.write("<" + _encode(tag, encoding))
            if items or xmlns_items:
                items.sort() # lexical order
                for k, v in items:
                    try:
                        if isinstance(k, QName) or k[:1] == "{":
                            k, xmlns = fixtag(k, namespaces)
                            if xmlns: xmlns_items.append(xmlns)
                    except TypeError:
                        _raise_serialization_error(k)
                    try:
                        if isinstance(v, QName):
                            v, xmlns = fixtag(v, namespaces)
                            if xmlns: xmlns_items.append(xmlns)
                    except TypeError:
                        _raise_serialization_error(v)
                    file.write(" %s=\"%s\"" % (_encode(k, encoding),
                                               _escape_attrib(v, encoding)))
                # namespace declarations go after the ordinary attributes
                for k, v in xmlns_items:
                    file.write(" %s=\"%s\"" % (_encode(k, encoding),
                                               _escape_attrib(v, encoding)))
            if node.text or len(node):
                file.write(">")
                if node.text:
                    file.write(_escape_cdata(node.text, encoding))
                for n in node:
                    self._write(file, n, encoding, namespaces)
                file.write("</" + _encode(tag, encoding) + ">")
            else:
                file.write(" />")
            # namespaces declared on this node go out of scope here
            # (v is the namespace uri)
            for k, v in xmlns_items:
                del namespaces[v]
        if node.tail:
            file.write(_escape_cdata(node.tail, encoding))
|
|
752 |
|
|
753 |
# --------------------------------------------------------------------
|
|
754 |
# helpers
|
|
755 |
|
|
756 |
##
# Checks if an object appears to be a valid element object.  Any
# object that is an instance of the internal element class, or that
# has a <b>tag</b> attribute, is accepted.
#
# @param element An element instance.
# @return A true value if this is an element object.
# @defreturn flag

def iselement(element):
    # FIXME: not sure about this; might be a better idea to look
    # for tag/attrib/text attributes
    known = isinstance(element, _ElementInterface)
    if known:
        return known
    return hasattr(element, "tag")
|
|
767 |
|
|
768 |
##
# Writes an element tree or element structure to sys.stdout.  This
# function should be used for debugging only.
# <p>
# The exact output format is implementation dependent.  In this
# version, it's written as an ordinary XML file, followed by a
# newline if the root element's tail doesn't already end with one.
#
# @param elem An element tree or an individual element.

def dump(elem):
    # debugging
    tree = elem
    if not isinstance(tree, ElementTree):
        # wrap bare elements, so they can be written like a tree
        tree = ElementTree(tree)
    tree.write(sys.stdout)
    trailing = tree.getroot().tail
    if trailing and trailing[-1] == "\n":
        return # output already ends with a newline
    sys.stdout.write("\n")
|
|
785 |
|
|
786 |
def _encode(s, encoding):
|
|
787 |
try:
|
|
788 |
return s.encode(encoding)
|
|
789 |
except AttributeError:
|
|
790 |
return s # 1.5.2: assume the string uses the right encoding
|
|
791 |
|
|
792 |
if sys.version[:3] == "1.5":
|
|
793 |
_escape = re.compile(r"[&<>\"\x80-\xff]+") # 1.5.2
|
|
794 |
else:
|
|
795 |
_escape = re.compile(eval(r'u"[&<>\"\u0080-\uffff]+"'))
|
|
796 |
|
|
797 |
_escape_map = {
|
|
798 |
"&": "&",
|
|
799 |
"<": "<",
|
|
800 |
">": ">",
|
|
801 |
'"': """,
|
|
802 |
}
|
|
803 |
|
|
804 |
_namespace_map = {
|
|
805 |
# "well-known" namespace prefixes
|
|
806 |
"http://www.w3.org/XML/1998/namespace": "xml",
|
|
807 |
"http://www.w3.org/1999/xhtml": "html",
|
|
808 |
"http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
|
|
809 |
"http://schemas.xmlsoap.org/wsdl/": "wsdl",
|
|
810 |
}
|
|
811 |
|
|
812 |
def _raise_serialization_error(text):
|
|
813 |
raise TypeError(
|
|
814 |
"cannot serialize %r (type %s)" % (text, type(text).__name__)
|
|
815 |
)
|
|
816 |
|
|
817 |
def _encode_entity(text, pattern=_escape):
    # Replaces every character matched by pattern: characters listed
    # in the escape map get the entity given there, all others become
    # numerical character references.  The result is encoded as ascii.
    def escape_entities(m, table=_escape_map):
        pieces = []
        for char in m.group():
            replacement = table.get(char)
            if replacement is None:
                # no named entity; use the character's ordinal
                replacement = "&#%d;" % ord(char)
            pieces.append(replacement)
        return string.join(pieces, "")
    try:
        escaped = pattern.sub(escape_entities, text)
        return _encode(escaped, "ascii")
    except TypeError:
        # text was not something the pattern could be applied to
        _raise_serialization_error(text)
|
|
832 |
|
|
833 |
#
|
|
834 |
# the following functions assume an ascii-compatible encoding
|
|
835 |
# (or "utf-16")
|
|
836 |
|
|
837 |
def _escape_cdata(text, encoding=None, replace=string.replace):
    # escape character data.  returns the text with & < > replaced by
    # entity references.  if an encoding is given, the text is encoded
    # first; text that cannot be represented in that encoding is
    # written using character references instead.  values that cannot
    # be handled as strings raise TypeError.
    try:
        if encoding:
            try:
                text = _encode(text, encoding)
            except UnicodeError:
                return _encode_entity(text)
        # "&" must be handled first, so that the ampersands introduced
        # by the other replacements are not escaped again
        if "&" in text:
            text = replace(text, "&", "&amp;")
        if "<" in text:
            text = replace(text, "<", "&lt;")
        if ">" in text:
            text = replace(text, ">", "&gt;")
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
|
|
854 |
|
|
855 |
def _escape_attrib(text, encoding=None, replace=string.replace):
    # escape attribute value.  returns the text with & " < > and
    # newlines replaced by entity or character references.  if an
    # encoding is given, the text is encoded first; text that cannot be
    # represented in that encoding is written using character
    # references instead.  values that cannot be handled as strings
    # raise TypeError.
    try:
        if encoding:
            try:
                text = _encode(text, encoding)
            except UnicodeError:
                return _encode_entity(text)
        # "&" must be handled first, so that the ampersands introduced
        # by the other replacements are not escaped again
        if "&" in text:
            text = replace(text, "&", "&amp;")
        if "\"" in text:
            text = replace(text, "\"", "&quot;")
        if "<" in text:
            text = replace(text, "<", "&lt;")
        if ">" in text:
            text = replace(text, ">", "&gt;")
        if "\n" in text:
            # a literal newline in an attribute value is turned into a
            # space by xml parsers; a character reference is not
            text = replace(text, "\n", "&#10;")
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
|
|
876 |
|
|
877 |
def fixtag(tag, namespaces):
    # given a decorated tag (of the form {uri}tag), return prefixed
    # tag and namespace declaration, if any.  the namespaces dictionary
    # maps uris to prefixes, and is updated when a new prefix is
    # assigned.  the declaration is an (attribute name, uri) tuple, or
    # None if the prefix was already known or is the reserved "xml"
    # prefix.
    if isinstance(tag, QName):
        tag = tag.text
    namespace_uri, tag = string.split(tag[1:], "}", 1)
    prefix = namespaces.get(namespace_uri)
    if prefix is None:
        # first use of this uri; pick a well-known prefix if there is
        # one, and generate a new one otherwise
        prefix = _namespace_map.get(namespace_uri)
        if prefix is None:
            prefix = "ns%d" % len(namespaces)
        namespaces[namespace_uri] = prefix
        if prefix == "xml":
            xmlns = None
        else:
            xmlns = ("xmlns:%s" % prefix, namespace_uri)
    else:
        xmlns = None
    return "%s:%s" % (prefix, tag), xmlns
|
|
896 |
|
|
897 |
##
# Parses an XML document into an element tree.
#
# @param source A filename or file object containing XML data.
# @param parser An optional parser instance.  If not given, the
#     standard {@link XMLTreeBuilder} parser is used.
# @return An ElementTree instance

def parse(source, parser=None):
    if parser:
        # an explicit parser is always driven by the default api
        tree = default_parser_api.parse(source, parser)
    else:
        tree = parser_api.parse(source)
    return ElementTree(tree)
|
|
911 |
|
|
912 |
##
# Parses an XML document into an element tree incrementally, and reports
# what's going on to the user.
#
# @param source A filename or file object containing XML data.
# @param events A list of events to report back.  If omitted, only "end"
#     events are reported.
# @return A (event, elem) iterator.

def iterparse(source, events=None):
    # the actual work is done by the active parser api
    return parser_api.iterparse(source, events)
|
|
923 |
|
|
924 |
class _iterparse:
    # incremental parser.  feeds the source to an XMLTreeBuilder in
    # 16k chunks, and hands out the (event, value) pairs queued by the
    # expat callbacks.  when all events have been delivered, the root
    # element is made available as the "root" attribute.

    def __init__(self, source, events):
        # source is a filename or an object with a read method.  events
        # is a list of event names ("start", "end", "start-ns",
        # "end-ns"), or None to get "end" events only.
        self._close_file = 0
        if not hasattr(source, "read"):
            source = open(source, "rb")
            # we opened the file, so we have to close it when done
            self._close_file = 1
        self._file = source
        self._events = []
        self._index = 0
        self.root = self._root = None
        self._parser = XMLTreeBuilder()
        # wire up the parser for event reporting
        parser = self._parser._parser
        append = self._events.append
        if events is None:
            events = ["end"]
        for event in events:
            if event == "start":
                try:
                    parser.ordered_attributes = 1
                    parser.specified_attributes = 1
                    def handler(tag, attrib_in, event=event, append=append,
                                start=self._parser._start_list):
                        append((event, start(tag, attrib_in)))
                    parser.StartElementHandler = handler
                except AttributeError:
                    # no support for list-style attributes; use the
                    # dictionary based handler instead
                    def handler(tag, attrib_in, event=event, append=append,
                                start=self._parser._start):
                        append((event, start(tag, attrib_in)))
                    parser.StartElementHandler = handler
            elif event == "end":
                def handler(tag, event=event, append=append,
                            end=self._parser._end):
                    append((event, end(tag)))
                parser.EndElementHandler = handler
            elif event == "start-ns":
                def handler(prefix, uri, event=event, append=append):
                    try:
                        uri = _encode(uri, "ascii")
                    except UnicodeError:
                        pass # keep the uri as is
                    append((event, (prefix or "", uri)))
                parser.StartNamespaceDeclHandler = handler
            elif event == "end-ns":
                def handler(prefix, event=event, append=append):
                    append((event, None))
                parser.EndNamespaceDeclHandler = handler

    def next(self):
        # returns the next (event, value) pair, reading and parsing
        # more data when the event buffer runs dry
        while 1:
            try:
                item = self._events[self._index]
            except IndexError:
                if self._parser is None:
                    # all data has been parsed and delivered
                    self.root = self._root
                    try:
                        raise StopIteration
                    except NameError:
                        # no StopIteration; end the __getitem__ loop
                        raise IndexError
                # load event buffer
                del self._events[:]
                self._index = 0
                data = self._file.read(16384)
                if data:
                    self._parser.feed(data)
                else:
                    # end of data.  closing the parser may queue more
                    # events, so go round the loop once more
                    try:
                        self._root = self._parser.close()
                        self._parser = None
                    finally:
                        if self._close_file:
                            self._close_file = 0
                            self._file.close()
            else:
                self._index = self._index + 1
                return item

    # use the iterator protocol if available, and fall back on the
    # sequence protocol if not
    try:
        iter
        def __iter__(self):
            return self
    except NameError:
        def __getitem__(self, index):
            return self.next()
|
|
1002 |
|
|
1003 |
##
# Parses an XML document from a string constant.  This function can
# be used to embed "XML literals" in Python code.
#
# @param text A string containing XML data.
# @return An Element instance.
# @defreturn Element

def XML(text):
    # use the active parser api, or the default one if none is set
    api = parser_api or default_parser_api
    return api.fromstring(text)
|
|
1014 |
|
|
1015 |
##
# Parses an XML document from a string constant, and also returns
# a dictionary which maps from element id:s to elements.
#
# @param text A string containing XML data.
# @return A tuple containing an Element instance and a dictionary.
# @defreturn (Element, dictionary)

def XMLID(text):
    api = parser_api or default_parser_api
    tree = api.fromstring(text)
    ids = {}
    for elem in tree.getiterator():
        # elements without an id attribute (or with an empty one) are
        # left out; if an id is used more than once, the last element
        # wins
        elem_id = elem.get("id")
        if elem_id:
            ids[elem_id] = elem
    return tree, ids
|
|
1032 |
|
|
1033 |
##
# Parses an XML document from a string constant.  Same as {@link #XML}.
#
# @def fromstring(text)
# @param text A string containing XML data.
# @return An Element instance.
# @defreturn Element

fromstring = XML
|
|
1042 |
|
|
1043 |
##
# Generates a string representation of an XML element, including all
# subelements.
#
# @param element An Element instance.
# @param encoding Optional output encoding.  This value is passed on
#     to the ElementTree write method.
# @return An encoded string containing the XML data.
# @defreturn string

def tostring(element, encoding=None):
    # minimal file-like object; write collects the output fragments
    class dummy:
        pass
    data = []
    file = dummy()
    file.write = data.append
    ElementTree(element).write(file, encoding)
    return string.join(data, "")
|
|
1059 |
|
|
1060 |
##
# Generic element structure builder.  This builder converts a sequence
# of {@link #TreeBuilder.start}, {@link #TreeBuilder.data}, and {@link
# #TreeBuilder.end} method calls to a well-formed element structure.
# <p>
# You can use this class to build an element structure using a custom XML
# parser, or a parser for some other XML-like format.
#
# @param element_factory Optional element factory.  This factory
#     is called to create new Element instances, as necessary.

class TreeBuilder:

    def __init__(self, element_factory=None):
        self._data = [] # data collector
        self._elem = [] # element stack
        self._last = None # last element
        self._tail = None # true if we're after an end tag
        if element_factory is None:
            element_factory = _ElementInterface
        self._factory = element_factory

    ##
    # Flushes the parser buffers, and returns the toplevel document
    # element.
    #
    # @return An Element instance.
    # @defreturn Element

    def close(self):
        assert len(self._elem) == 0, "missing end tags"
        # use an identity test, so that the check does not depend on
        # how the element class implements comparisons
        assert self._last is not None, "missing toplevel element"
        return self._last

    def _flush(self):
        # attach buffered character data to the last element, as text
        # if the element is still open, and as tail if it has been
        # closed.  data seen before the first element is dropped.
        if self._data:
            if self._last is not None:
                text = string.join(self._data, "")
                if self._tail:
                    assert self._last.tail is None, "internal error (tail)"
                    self._last.tail = text
                else:
                    assert self._last.text is None, "internal error (text)"
                    self._last.text = text
            self._data = []

    ##
    # Adds text to the current element.
    #
    # @param data A string.  This should be either an 8-bit string
    #    containing ASCII text, or a Unicode string.

    def data(self, data):
        self._data.append(data)

    ##
    # Opens a new element.
    #
    # @param tag The element name.
    # @param attrs A dictionary containing element attributes.
    # @return The opened element.
    # @defreturn Element

    def start(self, tag, attrs):
        self._flush()
        self._last = elem = self._factory(tag, attrs)
        if self._elem:
            # add the new element to the innermost open element
            self._elem[-1].append(elem)
        self._elem.append(elem)
        self._tail = 0
        return elem

    ##
    # Closes the current element.
    #
    # @param tag The element name.
    # @return The closed element.
    # @defreturn Element

    def end(self, tag):
        self._flush()
        self._last = self._elem.pop()
        assert self._last.tag == tag,\
               "end tag mismatch (expected %s, got %s)" % (
                   self._last.tag, tag)
        self._tail = 1
        return self._last
|
|
1147 |
|
|
1148 |
##
# Element structure builder for XML source data, based on the
# <b>expat</b> parser.
#
# @keyparam target Target object.  If omitted, the builder uses an
#     instance of the standard {@link #TreeBuilder} class.
# @keyparam html Predefine HTML entities.  This flag is not supported
#     by the current implementation.
# @see #ElementTree
# @see #TreeBuilder

class XMLTreeBuilder:

    def __init__(self, html=0, target=None):
        try:
            from xml.parsers import expat
        except ImportError:
            raise ImportError(
                "No module named expat; use SimpleXMLTreeBuilder instead"
                )
        # "}" is used as namespace separator, so that qualified names
        # are reported as "uri}name" (see _fixname)
        self._parser = parser = expat.ParserCreate(None, "}")
        if target is None:
            target = TreeBuilder()
        self._target = target
        self._names = {} # name memo cache
        # callbacks
        parser.DefaultHandlerExpand = self._default
        parser.StartElementHandler = self._start
        parser.EndElementHandler = self._end
        parser.CharacterDataHandler = self._data
        # let expat do the buffering, if supported
        try:
            self._parser.buffer_text = 1
        except AttributeError:
            pass
        # use new-style attribute handling, if supported
        try:
            self._parser.ordered_attributes = 1
            self._parser.specified_attributes = 1
            parser.StartElementHandler = self._start_list
        except AttributeError:
            pass
        self._doctype = None # doctype parts, while inside a doctype
        self.entity = {} # entity name -> replacement text

    def _fixtext(self, text):
        # convert text string to ascii, if possible
        try:
            return _encode(text, "ascii")
        except UnicodeError:
            return text

    def _fixname(self, key):
        # expand qname, and convert name string to ascii, if possible
        try:
            name = self._names[key]
        except KeyError:
            name = key
            if "}" in name:
                # "uri}name" -> "{uri}name"
                name = "{" + name
            self._names[key] = name = self._fixtext(name)
        return name

    def _start(self, tag, attrib_in):
        # start tag handler; attributes are passed in as a dictionary
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        for key, value in attrib_in.items():
            attrib[fixname(key)] = self._fixtext(value)
        return self._target.start(tag, attrib)

    def _start_list(self, tag, attrib_in):
        # start tag handler; attributes are passed in as a flat
        # [name, value, name, value, ...] list
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        if attrib_in:
            for i in range(0, len(attrib_in), 2):
                attrib[fixname(attrib_in[i])] = self._fixtext(attrib_in[i+1])
        return self._target.start(tag, attrib)

    def _data(self, text):
        return self._target.data(self._fixtext(text))

    def _end(self, tag):
        return self._target.end(self._fixname(tag))

    def _default(self, text):
        # handler for everything that has no handler of its own.  used
        # to resolve undefined entities via the entity dictionary, and
        # to pick up the contents of the doctype declaration.
        prefix = text[:1]
        if prefix == "&":
            # deal with undefined entities
            try:
                value = self.entity[text[1:-1]]
            except KeyError:
                from xml.parsers import expat
                raise expat.error(
                    "undefined entity %s: line %d, column %d" %
                    (text, self._parser.ErrorLineNumber,
                    self._parser.ErrorColumnNumber)
                    )
            # the target is called outside the try block, so that a
            # KeyError raised by the target itself is not reported as
            # an undefined entity
            self._target.data(value)
        elif prefix == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = [] # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if prefix == ">":
                self._doctype = None
                return
            text = string.strip(text)
            if not text:
                return
            self._doctype.append(text)
            n = len(self._doctype)
            if n > 2:
                kind = self._doctype[1]
                if kind == "PUBLIC" and n == 4:
                    name, kind, pubid, system = self._doctype
                elif kind == "SYSTEM" and n == 3:
                    name, kind, system = self._doctype
                    pubid = None
                else:
                    return
                # strip the quotes from the identifiers
                if pubid:
                    pubid = pubid[1:-1]
                self.doctype(name, pubid, system[1:-1])
                self._doctype = None

    ##
    # Handles a doctype declaration.
    #
    # @param name Doctype name.
    # @param pubid Public identifier.
    # @param system System identifier.

    def doctype(self, name, pubid, system):
        pass

    ##
    # Feeds data to the parser.
    #
    # @param data Encoded data.

    def feed(self, data):
        self._parser.Parse(data, 0)

    ##
    # Finishes feeding data to the parser.
    #
    # @return An element structure.
    # @defreturn Element

    def close(self):
        self._parser.Parse("", 1) # end of data
        tree = self._target.close()
        del self._target, self._parser # get rid of circular references
        return tree
|
|
1306 |
|
|
1307 |
|
|
1308 |
# --------------------------------------------------------------------
|
|
1309 |
# load platform specific extensions
|
|
1310 |
|
|
1311 |
if sys.platform == "cli":
|
|
1312 |
try:
|
|
1313 |
import ElementIron
|
|
1314 |
except ImportError:
|
|
1315 |
pass # fall back on optional pyexpat emulation
|
|
1316 |
else:
|
|
1317 |
parser_api = ElementIron.ParserAPI(TreeBuilder)
|
|
1318 |
|
|
1319 |
elif sys.platform.startswith("java"):
|
|
1320 |
try:
|
|
1321 |
import ElementJava
|
|
1322 |
except ImportError:
|
|
1323 |
pass
|
|
1324 |
else:
|
|
1325 |
parser_api = ElementJava.ParserAPI(TreeBuilder)
|