From 1fa30c6e4cf7018d1338d62ae8c0fcaedc4a391c Mon Sep 17 00:00:00 2001 From: Funda Wang Date: Mon, 15 Sep 2025 13:33:24 +0800 Subject: [PATCH] use lxml instead of libxml2 --- Use-pylxml.patch | 1623 +++++++++++++++++ ...ficiently-quoted-regular-expressions.patch | 73 + itstool.spec | 25 +- 3 files changed, 1711 insertions(+), 10 deletions(-) create mode 100644 Use-pylxml.patch create mode 100644 backport-Fix-insufficiently-quoted-regular-expressions.patch diff --git a/Use-pylxml.patch b/Use-pylxml.patch new file mode 100644 index 0000000..b0214e4 --- /dev/null +++ b/Use-pylxml.patch @@ -0,0 +1,1623 @@ +From 15027b5391e3d2c45846524721abbe978ca73def Mon Sep 17 00:00:00 2001 +From: Nick Wellnhofer +Date: Tue, 15 Apr 2025 02:51:25 +0200 +Subject: [PATCH 1/5] Switch from libxml2 to lxml + +Most of the transition is straight-forward, but some issues turned up. + +- lxml doesn't seem to expose the prefixes of attributes, requiring + an XPath evaluation as work-around. +- Serializing the internal subset is a bit hacky. +- lxml doesn't support attribute nodes, so we have to emulate them. +- lxml doesn't support attributes as XPath context nodes, so some use + cases aren't supported. Using an `.` expression on an attribute + works, though. + +Changes to expected test results are mostly cosmetic. + +- Whitespace before and after the document element is processed more + faithfully. +- lxml removes some superfluous namespace prefixes. +- There's one superfluous namespace declaration which isn't removed + anymore. + +Fixes #10. +--- + configure.ac | 2 +- + itstool.in | 1012 +++++++++++------------ + 2 files changed, 479 insertions(+), 577 deletions(-) + +diff --git a/configure.ac b/configure.ac +index 9d04372..d94bead 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -12,7 +12,7 @@ AC_SUBST([DATADIR]) + + AM_PATH_PYTHON([2.6]) + +-py_module=libxml2 ++py_module=lxml + AC_MSG_CHECKING(for python module $py_module) + echo "import $py_module" | $PYTHON - &>/dev/null + if test $? -ne 0; then +diff --git a/itstool.in b/itstool.in +index 4452616..052255e 100755 +--- a/itstool.in ++++ b/itstool.in +@@ -24,7 +24,8 @@ DATADIR="@DATADIR@" + + import gettext + import hashlib +-import libxml2 ++from copy import deepcopy ++from lxml import etree + import optparse + import os + import os.path +@@ -190,7 +191,7 @@ class Comment (object): + class Placeholder (object): + def __init__ (self, node): + self.node = node +- self.name = ustr(node.name, 'utf-8') ++ self.name = ustr(xml_localname(node), 'utf-8') + + + class Message (object): +@@ -243,32 +244,30 @@ class Message (object): + def add_start_tag (self, node): + if len(self._message) == 0 or not(isinstance(self._message[-1], string_types)): + self._message.append('') +- if node.ns() is not None and node.ns().name is not None: +- self._message[-1] += ('<%s:%s' % (ustr(node.ns().name, 'utf-8'), ustr(node.name, 'utf-8'))) +- else: +- self._message[-1] += ('<%s' % ustr(node.name, 'utf-8')) +- for prop in xml_attr_iter(node): +- name = prop.name +- if prop.ns() is not None: +- name = prop.ns().name + ':' + name +- atval = prop.content ++ self._message[-1] += ('<%s' % ustr(xml_qname(node), 'utf-8')) ++ for name, atval in node.items(): ++ qname = etree.QName(name) ++ if qname.namespace is not None: ++ # lxml doesn't expose the prefix of attributes, so we use ++ # an XPath expression to get the attribute's prefixed name. ++ # This is horribly inefficient. ++ expr = 'name(@*[local-name()="%s" and namespace-uri()="%s"])' % ( ++ qname.localname, qname.namespace) ++ name = node.xpath(expr) + if not isinstance(atval, ustr_type): + atval = ustr(atval, 'utf-8') + atval = atval.replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"') + self._message += " %s=\"%s\"" % (name, atval) +- if node.children is not None: ++ if len(node) > 0 or node.text: + self._message[-1] += '>' + else: + self._message[-1] += '/>' + + def add_end_tag (self, node): +- if node.children is not None: ++ if len(node) > 0 or node.text: + if len(self._message) == 0 or not(isinstance(self._message[-1], string_types)): + self._message.append('') +- if node.ns() is not None and node.ns().name is not None: +- self._message[-1] += ('' % (ustr(node.ns().name, 'utf-8'), ustr(node.name, 'utf-8'))) +- else: +- self._message[-1] += ('' % ustr(node.name, 'utf-8')) ++ self._message[-1] += ('' % ustr(xml_qname(node), 'utf-8')) + + def is_empty (self): + return self._empty +@@ -379,67 +378,84 @@ class Message (object): + return ret + + +-def xml_child_iter (node): +- child = node.children +- while child is not None: +- yield child +- child = child.next +- +-def xml_attr_iter (node): +- attr = node.get_properties() +- while attr is not None: +- yield attr +- attr = attr.next +- +-def xml_is_ns_name (node, ns, name): +- if node.type != 'element': +- return False +- return node.name == name and node.ns() is not None and node.ns().content == ns ++def xml_localname (node): ++ return etree.QName(node.tag).localname ++ ++def xml_qname (node): ++ qname = etree.QName(node.tag).localname ++ if node.prefix is not None: ++ qname = node.prefix + ':' + qname ++ return qname ++ ++def xml_content (node): ++ if isinstance(node, string_types): ++ return node ++ if isinstance(node, XMLAttr): ++ return node.parent.get(node.tag) ++ return etree.tostring(node, method='text', encoding='unicode') ++ ++def xml_delete_node (node): ++ parent = node.getparent() ++ prev = node.getprevious() ++ tail = node.tail ++ if parent is not None: ++ parent.remove(node) ++ if prev is not None: ++ if prev.tail is None or re.fullmatch(r'\s+', prev.tail): ++ prev.tail = tail ++ else: ++ prev.tail += tail ++ elif parent is not None: ++ if parent.text is None or re.fullmatch(r'\s+', parent.text): ++ parent.text = tail ++ else: ++ parent.text += tail + + def xml_get_node_path(node): + # The built-in nodePath() method only does numeric indexes + # when necessary for disambiguation. For various reasons, + # we prefer always using indexes. +- name = node.name +- if node.ns() is not None and node.ns().name is not None: +- name = node.ns().name + ':' + name +- if node.type == 'attribute': ++ name = xml_qname(node) ++ if isinstance(node, XMLAttr): + name = '@' + name + name = '/' + name +- if node.type == 'element' and node.parent.type == 'element': ++ if node.getparent() is not None: + count = 1 +- prev = node.previousElementSibling() ++ prev = node.getprevious() + while prev is not None: +- if prev.name == node.name: +- if prev.ns() is None: +- if node.ns() is None: +- count += 1 +- else: +- if node.ns() is not None: +- if prev.ns().name == node.ns().name: +- count += 1 +- prev = prev.previousElementSibling() ++ if prev.tag == node.tag: ++ count += 1 ++ prev = prev.getprevious() + name = '%s[%i]' % (name, count) +- if node.parent.type == 'element': +- name = xml_get_node_path(node.parent) + name ++ name = xml_get_node_path(node.getparent()) + name + return name + +-def xml_error_catcher(doc, error): +- doc._xml_err += " %s" % error + +-def fix_node_ns (node, nsdefs): +- childnsdefs = nsdefs.copy() +- nsdef = node.nsDefs() +- while nsdef is not None: +- nextnsdef = nsdef.next +- if nsdef.name in nsdefs and nsdefs[nsdef.name] == nsdef.content: +- node.removeNsDef(nsdef.content) +- else: +- childnsdefs[nsdef.name] = nsdef.content +- nsdef = nextnsdef +- for child in xml_child_iter(node): +- if child.type == 'element': +- fix_node_ns(child, childnsdefs) ++# lxml doesn't support attribute nodes, so we have to emulate them. ++class XMLAttr (object): ++ def __init__(self, element, tag): ++ self.parent = element ++ self.tag = tag ++ self.attrib = {} ++ self.sourceline = element.sourceline ++ ++ def __repr__(self): ++ return '%s@%s' % (repr(self.parent), self.tag) ++ ++ def __eq__(self, other): ++ return other and self.parent == other.parent and self.tag == other.tag ++ ++ def __ne__(self, other): ++ return not self.__eq__(other) ++ ++ def __hash__(self): ++ return hash(repr(self)) ++ ++ def getparent(self): ++ return self.parent ++ ++ def get(self, default=None): ++ return default + + + class LocNote (object): +@@ -464,82 +480,51 @@ class LocNote (object): + + class Document (object): + def __init__ (self, filename, messages, load_dtd=False, keep_entities=False): +- self._xml_err = '' +- libxml2.registerErrorHandler(xml_error_catcher, self) +- try: +- ctxt = libxml2.createFileParserCtxt(filename) +- except: +- sys.stderr.write('Error: cannot open XML file %s\n' % filename) +- sys.exit(1) +- ctxt.lineNumbers(1) + self._load_dtd = load_dtd + self._keep_entities = keep_entities +- if load_dtd: +- ctxt.loadSubset(1) +- if keep_entities: +- ctxt.loadSubset(1) +- ctxt.ctxtUseOptions(libxml2.XML_PARSE_DTDLOAD) +- ctxt.replaceEntities(0) +- else: +- ctxt.replaceEntities(1) +- ctxt.parseDocument() ++ parser = etree.XMLParser(load_dtd = load_dtd or keep_entities, ++ resolve_entities = not(keep_entities)) ++ doc = etree.parse(filename, parser) ++ doc.xinclude() + self._filename = filename +- self._doc = ctxt.doc() ++ self._doc = doc + self._localrules = [] +- def pre_process (node): +- for child in xml_child_iter(node): +- if xml_is_ns_name(child, 'http://www.w3.org/2001/XInclude', 'include'): +- if child.nsProp('parse', None) == 'text': +- child.xincludeProcessTree() +- elif xml_is_ns_name(child, NS_ITS, 'rules'): +- if child.hasNsProp('href', NS_XLINK): +- href = child.nsProp('href', NS_XLINK) +- fileref = os.path.join(os.path.dirname(filename), href) +- if not os.path.exists(fileref): +- if opts.itspath is not None: +- for pathdir in opts.itspath: +- fileref = os.path.join(pathdir, href) +- if os.path.exists(fileref): +- break +- if not os.path.exists(fileref): +- sys.stderr.write('Error: Could not locate ITS file %s\n' % href) +- sys.exit(1) +- hctxt = libxml2.createFileParserCtxt(fileref) +- hctxt.replaceEntities(1) +- hctxt.parseDocument() +- root = hctxt.doc().getRootElement() +- version = None +- if root.hasNsProp('version', None): +- version = root.nsProp('version', None) +- else: +- sys.stderr.write('Warning: ITS file %s missing version attribute\n' % +- os.path.basename(href)) +- if version is not None and version not in ('1.0', '2.0'): +- sys.stderr.write('Warning: Skipping ITS file %s with unknown version %s\n' % +- (os.path.basename(href), root.nsProp('version', None))) +- else: +- self._localrules.append(root) ++ for child in doc.iter(): ++ if child.tag == '{' + NS_ITS + '}rules': ++ href = child.get('{' + NS_XLINK + '}href') ++ if href is not None: ++ fileref = os.path.join(os.path.dirname(filename), href) ++ if not os.path.exists(fileref): ++ if opts.itspath is not None: ++ for pathdir in opts.itspath: ++ fileref = os.path.join(pathdir, href) ++ if os.path.exists(fileref): ++ break ++ if not os.path.exists(fileref): ++ sys.stderr.write('Error: Could not locate ITS file %s\n' % href) ++ sys.exit(1) ++ root = etree.parse(fileref).getroot() + version = None +- if child.hasNsProp('version', None): +- version = child.nsProp('version', None) ++ version = root.get('version') ++ if version is None: ++ sys.stderr.write('Warning: ITS file %s missing version attribute\n' % ++ os.path.basename(href)) ++ elif version not in ('1.0', '2.0'): ++ sys.stderr.write('Warning: Skipping ITS file %s with unknown version %s\n' % ++ (os.path.basename(href), root.get('version'))) + else: +- root = child.doc.getRootElement() +- if root.hasNsProp('version', NS_ITS): +- version = root.nsProp('version', NS_ITS) +- else: +- sys.stderr.write('Warning: Local ITS rules missing version attribute\n') +- if version is not None and version not in ('1.0', '2.0'): +- sys.stderr.write('Warning: Skipping local ITS rules with unknown version %s\n' % +- version) +- else: +- self._localrules.append(child) +- pre_process(child) +- pre_process(self._doc) +- try: +- self._check_errors() +- except libxml2.parserError as e: +- sys.stderr.write('Error: Could not parse document:\n%s\n' % ustr(e)) +- sys.exit(1) ++ self._localrules.append(root) ++ version = child.get('version') ++ if version is None: ++ root = child.getroottree() ++ version = root.get('{' + NS_ITS + '}version') ++ if version is None: ++ sys.stderr.write('Warning: Local ITS rules missing version attribute\n') ++ elif version not in ('1.0', '2.0'): ++ sys.stderr.write('Warning: Skipping local ITS rules with unknown version %s\n' % ++ version) ++ else: ++ self._localrules.append(child) + self._msgs = messages + self._its_translate_nodes = {} + self._its_within_text_nodes = {} +@@ -556,13 +541,6 @@ class Document (object): + + self._clear_cache() + +- def __del__ (self): +- self._doc.freeDoc() +- +- def _check_errors(self): +- if self._xml_err: +- raise libxml2.parserError(self._xml_err) +- + def _clear_cache(self): + self._its_translate_nodes_cache = {} + self._its_locale_filters_cache = {} +@@ -570,123 +548,107 @@ class Document (object): + + def get_its_params(self, rules): + params = {} +- for child in xml_child_iter(rules): +- if xml_is_ns_name(child, NS_ITS, 'param'): +- params[child.nsProp('name', None)] = child.getContent() ++ for child in rules.iterchildren(): ++ if child.tag == '{' + NS_ITS + '}param': ++ params[child.get('name')] = xml_content(child) + return params + +- def register_its_params(self, xpath, params, userparams={}): +- for param in params: +- if param in userparams: +- xpath.xpathRegisterVariable(name, None, userparams[param]) ++ def register_its_params(self, var, params, userparams={}): ++ for name in params: ++ if name in userparams: ++ var[name] = userparams[name] + else: +- xpath.xpathRegisterVariable(name, None, params[param]) ++ var[name] = params[name] + + def apply_its_rule(self, rule, xpath): + self._clear_cache() +- if rule.type != 'element': +- return +- if xml_is_ns_name(rule, NS_ITS, 'translateRule'): +- if rule.nsProp('selector', None) is not None: +- for node in self._try_xpath_eval(xpath, rule.nsProp('selector', None)): +- self._its_translate_nodes[node] = rule.nsProp('translate', None) +- elif xml_is_ns_name(rule, NS_ITS, 'withinTextRule'): +- if rule.nsProp('selector', None) is not None: +- for node in self._try_xpath_eval(xpath, rule.nsProp('selector', None)): +- self._its_within_text_nodes[node] = rule.nsProp('withinText', None) +- elif xml_is_ns_name(rule, NS_ITST, 'preserveSpaceRule'): +- if rule.nsProp('selector', None) is not None: +- for node in self._try_xpath_eval(xpath, rule.nsProp('selector', None)): +- val = rule.nsProp('preserveSpace', None) ++ if rule.tag == '{' + NS_ITS + '}translateRule': ++ sel = rule.get('selector') ++ if sel is not None: ++ for node in self._try_xpath_eval(xpath, sel): ++ self._its_translate_nodes[node] = rule.get('translate') ++ elif rule.tag == '{' + NS_ITS + '}withinTextRule': ++ sel = rule.get('selector') ++ if sel is not None: ++ for node in self._try_xpath_eval(xpath, sel): ++ self._its_within_text_nodes[node] = rule.get('withinText') ++ elif rule.tag == '{' + NS_ITST + '}preserveSpaceRule': ++ sel = rule.get('selector') ++ if sel is not None: ++ for node in self._try_xpath_eval(xpath, sel): ++ val = rule.get('preserveSpace') + if val == 'yes': + self._its_preserve_space_nodes[node] = 'preserve' +- elif xml_is_ns_name(rule, NS_ITS, 'preserveSpaceRule'): +- if rule.nsProp('selector', None) is not None: +- for node in self._try_xpath_eval(xpath, rule.nsProp('selector', None)): +- self._its_preserve_space_nodes[node] = rule.nsProp('space', None) +- elif xml_is_ns_name(rule, NS_ITS, 'localeFilterRule'): +- if rule.nsProp('selector', None) is not None: +- if rule.hasNsProp('localeFilterList', None): +- lst = rule.nsProp('localeFilterList', None) +- else: +- lst = '*' +- if rule.hasNsProp('localeFilterType', None): +- typ = rule.nsProp('localeFilterType', None) +- else: +- typ = 'include' +- for node in self._try_xpath_eval(xpath, rule.nsProp('selector', None)): ++ elif rule.tag == '{' + NS_ITS + '}preserveSpaceRule': ++ sel = rule.get('selector') ++ if sel is not None: ++ for node in self._try_xpath_eval(xpath, sel): ++ self._its_preserve_space_nodes[node] = rule.get('space') ++ elif rule.tag == '{' + NS_ITS + '}localeFilterRule': ++ sel = rule.get('selector') ++ if sel is not None: ++ lst = rule.get('localeFilterList', '*') ++ typ = rule.get('localeFilterType', 'include') ++ for node in self._try_xpath_eval(xpath, sel): + self._its_locale_filters[node] = (lst, typ) +- elif xml_is_ns_name(rule, NS_ITST, 'dropRule'): +- if rule.nsProp('selector', None) is not None: +- for node in self._try_xpath_eval(xpath, rule.nsProp('selector', None)): +- self._itst_drop_nodes[node] = rule.nsProp('drop', None) +- elif xml_is_ns_name(rule, NS_ITS, 'idValueRule'): +- sel = rule.nsProp('selector', None) +- idv = rule.nsProp('idValue', None) ++ elif rule.tag == '{' + NS_ITST + '}dropRule': ++ sel = rule.get('selector') ++ if sel is not None: ++ for node in self._try_xpath_eval(xpath, sel): ++ self._itst_drop_nodes[node] = rule.get('drop') ++ elif rule.tag == '{' + NS_ITS + '}idValueRule': ++ sel = rule.get('selector') ++ idv = rule.get('idValue') + if sel is not None and idv is not None: + for node in self._try_xpath_eval(xpath, sel): +- try: +- oldnode = xpath.contextNode() +- except: +- oldnode = None +- xpath.setContextNode(node) +- idvalue = self._try_xpath_eval(xpath, idv) ++ idvalue = self._try_xpath_eval(xpath, idv, node=node) + if isinstance(idvalue, string_types): + self._its_id_values[node] = idvalue + else: + for val in idvalue: +- self._its_id_values[node] = val.content ++ self._its_id_values[node] = xml_content(val) + break +- xpath.setContextNode(oldnode) + pass +- elif xml_is_ns_name(rule, NS_ITST, 'contextRule'): +- if rule.nsProp('selector', None) is not None: +- for node in self._try_xpath_eval(xpath, rule.nsProp('selector', None)): +- if rule.hasNsProp('context', None): +- self._itst_contexts[node] = rule.nsProp('context', None) +- elif rule.hasNsProp('contextPointer', None): +- try: +- oldnode = xpath.contextNode() +- except: +- oldnode = None +- xpath.setContextNode(node) +- ctxt = self._try_xpath_eval(xpath, rule.nsProp('contextPointer', None)) ++ elif rule.tag == '{' + NS_ITST + '}contextRule': ++ sel = rule.get('selector') ++ if sel is not None: ++ for node in self._try_xpath_eval(xpath, sel): ++ ctxt = rule.get('context') ++ cp = rule.get('contextPointer') ++ if ctxt is not None: ++ self._itst_contexts[node] = ctxt ++ elif cp is not None: ++ ctxt = self._try_xpath_eval(xpath, cp, node=node) + if isinstance(ctxt, string_types): + self._itst_contexts[node] = ctxt + else: + for ctxt in ctxt: +- self._itst_contexts[node] = ctxt.content ++ self._itst_contexts[node] = xml_content(ctxt) + break +- xpath.setContextNode(oldnode) +- elif xml_is_ns_name(rule, NS_ITS, 'locNoteRule'): ++ elif rule.tag == '{' + NS_ITS + '}locNoteRule': + locnote = None +- notetype = rule.nsProp('locNoteType', None) +- for child in xml_child_iter(rule): +- if xml_is_ns_name(child, NS_ITS, 'locNote'): +- locnote = LocNote(locnote=child.content, locnotetype=notetype) +- break ++ notetype = rule.get('locNoteType') ++ for child in rule.iterchildren('{' + NS_ITS + '}locNote'): ++ locnote = LocNote(locnote=xml_content(child), locnotetype=notetype) ++ break + if locnote is None: +- if rule.hasNsProp('locNoteRef', None): +- locnote = LocNote(locnoteref=rule.nsProp('locNoteRef', None), locnotetype=notetype) +- if rule.nsProp('selector', None) is not None: +- for node in self._try_xpath_eval(xpath, rule.nsProp('selector', None)): ++ if 'locNoteRef' in rule.attrib: ++ locnote = LocNote(locnoteref=rule.get('locNoteRef'), locnotetype=notetype) ++ sel = rule.get('selector') ++ if sel is not None: ++ for node in self._try_xpath_eval(xpath, sel): + if locnote is not None: + self._its_loc_notes.setdefault(node, []).append(locnote) + else: +- if rule.hasNsProp('locNotePointer', None): +- sel = rule.nsProp('locNotePointer', None) ++ if 'locNotePointer' in rule.attrib: ++ sel = rule.get('locNotePointer') + ref = False +- elif rule.hasNsProp('locNoteRefPointer', None): +- sel = rule.nsProp('locNoteRefPointer', None) ++ elif 'locNoteRefPointer' in rule.attrib: ++ sel = rule.get('locNoteRefPointer') + ref = True + else: + continue +- try: +- oldnode = xpath.contextNode() +- except: +- oldnode = None +- xpath.setContextNode(node) +- note = self._try_xpath_eval(xpath, sel) ++ note = self._try_xpath_eval(xpath, sel, node=node) + if isinstance(note, string_types): + if ref: + nodenote = LocNote(locnoteref=note, locnotetype=notetype) +@@ -695,55 +657,46 @@ class Document (object): + self._its_loc_notes.setdefault(node, []).append(nodenote) + else: + for note in note: ++ text = xml_content(note) + if ref: +- nodenote = LocNote(locnoteref=note.content, locnotetype=notetype) ++ nodenote = LocNote(locnoteref=text, locnotetype=notetype) + else: +- nodenote = LocNote(locnote=note.content, locnotetype=notetype, ++ nodenote = LocNote(locnote=text, locnotetype=notetype, + space=self.get_preserve_space(note)) + self._its_loc_notes.setdefault(node, []).append(nodenote) + break +- xpath.setContextNode(oldnode) +- elif xml_is_ns_name(rule, NS_ITS, 'langRule'): +- if rule.nsProp('selector', None) is not None and rule.nsProp('langPointer', None) is not None: +- for node in self._try_xpath_eval(xpath, rule.nsProp('selector', None)): +- try: +- oldnode = xpath.contextNode() +- except: +- oldnode = None +- xpath.setContextNode(node) +- res = self._try_xpath_eval(xpath, rule.nsProp('langPointer', None)) ++ elif rule.tag == '{' + NS_ITS + '}langRule': ++ sel = rule.get('selector') ++ lp = rule.get('langPointer') ++ if sel is not None and lp is not None: ++ for node in self._try_xpath_eval(xpath, sel): ++ res = self._try_xpath_eval(xpath, lp, node=node) + if len(res) > 0: +- self._its_lang[node] = res[0].content ++ self._its_lang[node] = xml_content(res[0]) + # We need to construct language attributes, not just read + # language information. Technically, langPointer could be + # any XPath expression. But if it looks like an attribute + # accessor, just use the attribute name. +- if rule.nsProp('langPointer', None)[0] == '@': +- self._itst_lang_attr[node] = rule.nsProp('langPointer', None)[1:] +- xpath.setContextNode(oldnode) +- elif xml_is_ns_name(rule, NS_ITST, 'credits'): +- if rule.nsProp('appendTo', None) is not None: +- for node in self._try_xpath_eval(xpath, rule.nsProp('appendTo', None)): ++ if lp[0] == '@': ++ self._itst_lang_attr[node] = lp[1:] ++ elif rule.tag == '{' + NS_ITST + '}credits': ++ sel = rule.get('appendTo') ++ if sel is not None: ++ for node in self._try_xpath_eval(xpath, sel): + self._itst_credits = (node, rule) + break +- elif (xml_is_ns_name(rule, NS_ITS, 'externalResourceRefRule') or +- xml_is_ns_name(rule, NS_ITST, 'externalRefRule')): +- sel = rule.nsProp('selector', None) +- if xml_is_ns_name(rule, NS_ITS, 'externalResourceRefRule'): +- ptr = rule.nsProp('externalResourceRefPointer', None) ++ elif (rule.tag == '{' + NS_ITS + '}externalResourceRefRule' or ++ rule.tag == '{' + NS_ITST + '}externalRefRule'): ++ sel = rule.get('selector') ++ if rule.tag == '{' + NS_ITS + '}externalResourceRefRule': ++ ptr = rule.get('externalResourceRefPointer') + else: +- ptr = rule.nsProp('refPointer', None) ++ ptr = rule.get('refPointer') + if sel is not None and ptr is not None: + for node in self._try_xpath_eval(xpath, sel): +- try: +- oldnode = xpath.contextNode() +- except: +- oldnode = None +- xpath.setContextNode(node) +- res = self._try_xpath_eval(xpath, ptr) ++ res = self._try_xpath_eval(xpath, ptr, node=node) + if len(res) > 0: +- self._its_externals[node] = res[0].content +- xpath.setContextNode(oldnode) ++ self._its_externals[node] = xml_content(res[0]) + + def apply_its_rules(self, builtins, userparams={}): + self._clear_cache() +@@ -773,94 +726,59 @@ class Document (object): + + def apply_its_file(self, filename, userparams={}): + self._clear_cache() +- doc = libxml2.parseFile(filename) +- root = doc.getRootElement() +- if not xml_is_ns_name(root, NS_ITS, 'rules'): ++ parser = etree.XMLParser(resolve_entities = False) ++ root = etree.parse(filename, parser).getroot() ++ if root.tag != '{' + NS_ITS + '}rules': + return +- version = None +- if root.hasNsProp('version', None): +- version = root.nsProp('version', None) +- else: ++ version = root.get('version') ++ if version is None: + sys.stderr.write('Warning: ITS file %s missing version attribute\n' % + os.path.basename(filename)) +- if version is not None and version not in ('1.0', '2.0'): ++ elif version not in ('1.0', '2.0'): + sys.stderr.write('Warning: Skipping ITS file %s with unknown version %s\n' % +- (os.path.basename(filename), root.nsProp('version', None))) ++ (os.path.basename(filename), root.get('version'))) + return + matched = True +- for match in xml_child_iter(root): +- if xml_is_ns_name(match, NS_ITST, 'match'): ++ for match in root.iterchildren(): ++ if match.tag == '{' + NS_ITST + '}match': + matched = False +- xpath = self._doc.xpathNewContext() +- par = match +- nss = {} +- while par is not None: +- nsdef = par.nsDefs() +- while nsdef is not None: +- if nsdef.name is not None: +- if nsdef.name not in nss: +- nss[nsdef.name] = nsdef.content +- xpath.xpathRegisterNs(nsdef.name, nsdef.content) +- nsdef = nsdef.next +- par = par.parent +- if match.hasNsProp('selector', None): +- if len(self._try_xpath_eval(xpath, match.nsProp('selector', None))) > 0: ++ sel = match.get('selector') ++ if sel is not None: ++ ns = { k: v for k, v in match.nsmap.items() if k is not None } ++ xpath = (ns, {}) ++ if len(self._try_xpath_eval(xpath, sel)) > 0: + matched = True + break + if matched == False: + return ++ ns = { k: v for k, v in match.nsmap.items() if k is not None } ++ var = {} + params = self.get_its_params(root) +- for rule in xml_child_iter(root): +- xpath = self._doc.xpathNewContext() +- par = match +- nss = {} +- while par is not None: +- nsdef = par.nsDefs() +- while nsdef is not None: +- if nsdef.name is not None: +- if nsdef.name not in nss: +- nss[nsdef.name] = nsdef.content +- xpath.xpathRegisterNs(nsdef.name, nsdef.content) +- nsdef = nsdef.next +- par = par.parent +- self.register_its_params(xpath, params, userparams=userparams) ++ self.register_its_params(var, params, userparams=userparams) ++ xpath = (ns, var) ++ for rule in root.iterchildren(): + self.apply_its_rule(rule, xpath) + + def apply_local_its_rules(self, userparams={}): + self._clear_cache() + for rules in self._localrules: +- def reg_ns(xpath, node): +- if node.parent is not None: +- reg_ns(xpath, node.parent) +- nsdef = node.nsDefs() +- while nsdef is not None: +- if nsdef.name is not None: +- xpath.xpathRegisterNs(nsdef.name, nsdef.content) +- nsdef = nsdef.next +- xpath = self._doc.xpathNewContext() +- reg_ns(xpath, rules) ++ var = {} + params = self.get_its_params(rules) +- self.register_its_params(xpath, params, userparams=userparams) +- for rule in xml_child_iter(rules): +- if rule.type != 'element': +- continue +- if rule.nsDefs() is not None: +- rule_xpath = self._doc.xpathNewContext() +- reg_ns(rule_xpath, rule) +- self.register_its_params(rule_xpath, params, userparams=userparams) +- else: +- rule_xpath = xpath ++ self.register_its_params(var, params, userparams=userparams) ++ for rule in rules.iterchildren(): ++ ns = { k: v for k, v in rule.nsmap.items() if k is not None } ++ rule_xpath = (ns, var) + self.apply_its_rule(rule, rule_xpath) + + def _append_credits(self, parent, node, trdata): +- if xml_is_ns_name(node, NS_ITST, 'for-each'): +- select = node.nsProp('select', None) ++ if node.tag == '{' + NS_ITST + '}for-each': ++ select = node.get('select') + if select == 'years': + for year in trdata[2].split(','): +- for child in xml_child_iter(node): ++ for child in node.iterchildren(): + self._append_credits(parent, child, trdata + (year.strip(),)) +- elif xml_is_ns_name(node, NS_ITST, 'value-of'): +- select = node.nsProp('select', None) ++ elif node.tag == '{' + NS_ITST + '}value-of': ++ select = node.get('select') + val = None + if select == 'name': + val = trdata[0] +@@ -875,9 +793,9 @@ class Document (object): + val = val.encode('utf-8') + parent.addContent(val) + else: +- newnode = node.copyNode(2) +- parent.addChild(newnode) +- for child in xml_child_iter(node): ++ newnode = parent.makeelement(node.tag, node.attrib) ++ parent.append(newnode) ++ for child in node.iterchildren(): + self._append_credits(newnode, child, trdata) + + def merge_credits(self, translations, language, node): +@@ -895,7 +813,7 @@ class Document (object): + if not match: + continue + trdata = match.groups() +- for node in xml_child_iter(self._itst_credits[1]): ++ for node in self._itst_credits[1].iterchildren(): + self._append_credits(self._itst_credits[0], node, trdata) + + def join_translations(self, translations, node=None, strict=False): +@@ -903,29 +821,30 @@ class Document (object): + if node is None: + is_root = True + self.generate_messages(comments=False) +- node = self._doc.getRootElement() +- if node is None or node.type != 'element': ++ node = self._doc.getroot() ++ if node is None: + return + if self.get_itst_drop(node) == 'yes': +- prev = node.prev +- node.unlinkNode() +- node.freeNode() +- if prev is not None and prev.isBlankNode(): +- prev.unlinkNode() +- prev.freeNode() ++ xml_delete_node(node) + return + msg = self._msgs.get_message_by_node(node) + if msg is None: +- self.translate_attrs(node, node) +- children = [child for child in xml_child_iter(node)] +- for child in children: ++ #self.translate_attrs(node, node) ++ for child in node.iterchildren(): + self.join_translations(translations, node=child, strict=strict) + else: +- prevnode = None +- if node.prev is not None and node.prev.type == 'text': +- prevtext = node.prev.content +- if re.sub(r'\s+', '', prevtext) == '': +- prevnode = node.prev ++ prevtext = None ++ prev = node.getprevious() ++ if prev is None: ++ parent = node.getparent() ++ if parent is not None: ++ prevtext = parent.text ++ else: ++ prevtext = prev.tail ++ if prevtext is not None: ++ if not re.fullmatch(r'\s+', prevtext): ++ prevtext = None ++ i = 0 + for lang in sorted(list(translations.keys()), reverse=True): + locale = self.get_its_locale_filter(node) + lmatch = match_locale_list(locale[0], lang) +@@ -933,24 +852,26 @@ class Document (object): + continue + newnode = self.get_translated(node, translations[lang], strict=strict, lang=lang) + if newnode != node: +- newnode.setProp('xml:lang', lang) +- node.addNextSibling(newnode) +- if prevnode is not None: +- node.addNextSibling(prevnode.copyNode(0)) +- if is_root: +- # Because of the way we create nodes and rewrite the document, +- # we end up with lots of redundant namespace definitions. We +- # kill them off in one fell swoop at the end. +- fix_node_ns(node, {}) +- self._check_errors() ++ newnode.set('{' + NS_XML + '}lang', lang) ++ node.addnext(newnode) ++ if i == 0: ++ # Move tail to first new node ++ newnode.tail = node.tail ++ if prevtext is not None: ++ node.tail = prevtext ++ else: ++ if prevtext is not None: ++ newnode.tail = prevtext ++ i += 1 ++ + + def merge_translations(self, translations, language, node=None, strict=False): + is_root = False + if node is None: + is_root = True + self.generate_messages(comments=False) +- node = self._doc.getRootElement() +- if node is None or node.type != 'element': ++ node = self._doc.getroot() ++ if node is None: + return + drop = False + locale = self.get_its_locale_filter(node) +@@ -962,26 +883,23 @@ class Document (object): + if match_locale_list(locale[0], language): + drop = True + if self.get_itst_drop(node) == 'yes' or drop: +- prev = node.prev +- node.unlinkNode() +- node.freeNode() +- if prev is not None and prev.isBlankNode(): +- prev.unlinkNode() +- prev.freeNode() ++ xml_delete_node(node) + return + if is_root: + self.merge_credits(translations, language, node) + msg = self._msgs.get_message_by_node(node) + if msg is None: + self.translate_attrs(node, node) +- children = [child for child in xml_child_iter(node)] +- for child in children: ++ for child in node.iterchildren(): + self.merge_translations(translations, language, node=child, strict=strict) + else: + newnode = self.get_translated(node, translations, strict=strict, lang=language) + if newnode != node: + self.translate_attrs(node, newnode) +- node.replaceNode(newnode) ++ newnode.tail = node.tail ++ parent = node.getparent() ++ if parent is not None: ++ parent.replace(node, newnode) + if is_root: + # Apply language attributes to untranslated nodes. We don't do + # this before processing, because then these attributes would +@@ -998,31 +916,27 @@ class Document (object): + origlang = self._its_lang.get(lcpar) + if origlang is not None: + break +- lcpar = lcpar.parent ++ lcpar = lcpar.getparent() + if origlang is not None: +- lcnode.setProp(attr, origlang) ++ lcnode.set(attr, origlang) + # And then set the language attribute on the root node. + if language is not None: + attr = self._itst_lang_attr.get(node) + if attr is not None: +- node.setProp(attr, language) +- # Because of the way we create nodes and rewrite the document, +- # we end up with lots of redundant namespace definitions. We +- # kill them off in one fell swoop at the end. +- fix_node_ns(node, {}) +- self._check_errors() ++ node.set(attr, language) + + def translate_attrs(self, oldnode, newnode): +- trans_attrs = [attr for attr in xml_attr_iter(oldnode) if self._its_translate_nodes.get(attr, 'no') == 'yes'] +- for attr in trans_attrs: +- srccontent = attr.get_content() ++ for attrname, srccontent in oldnode.items(): ++ attr = XMLAttr(oldnode, attrname) ++ if self._its_translate_nodes.get(attr, 'no') != 'yes': ++ continue + if not PY3: + srccontent = srccontent.decode('utf-8') + newcontent = translations.ugettext(srccontent) + if newcontent: + if not PY3: + newcontent = newcontent.encode('utf-8') +- newnode.setProp(attr.name, newcontent) ++ newnode.set(attrname, newcontent) + + def get_translated (self, node, translations, strict=False, lang=None): + msg = self._msgs.get_message_by_node(node) +@@ -1037,106 +951,84 @@ class Document (object): + trans = translations.ugettext(msgstr) + if trans is None: + return node +- nss = {} +- def reg_ns(node, nss): +- if node.parent is not None: +- reg_ns(node.parent, nss) +- nsdef = node.nsDefs() +- while nsdef is not None: +- nss[nsdef.name] = nsdef.content +- nsdef = nsdef.next +- reg_ns(node, nss) +- nss['_'] = NS_BLANK +- try: +- blurb = node.doc.intSubset().serialize('utf-8') +- except Exception: +- blurb = '' +- blurb += '<' + ustr(node.name, 'utf-8') +- for nsname in list(nss.keys()): ++ blurb = '' ++ doc = node.getroottree() ++ if doc.docinfo.internalDTD: ++ # This is an ugly hack to serialize the DTD. We copy the ++ # document, replace the document element, serialize the ++ # document and remove the last line which contains the ++ # document element, leaving only the DTD. ++ copy = deepcopy(doc) ++ root = copy.getroot() ++ newroot = root.makeelement(root.tag) ++ copy._setroot(newroot) ++ blurb = re.sub('.*$', '', etree.tostring(copy, encoding='unicode')) ++ localname = ustr(xml_localname(node), 'utf-8') ++ blurb += '<' + localname ++ blurb += ' xmlns:_="%s"' % NS_BLANK ++ for nsname, nsuri in node.nsmap.items(): + if nsname is None: +- blurb += ' xmlns="%s"' % nss[nsname] ++ blurb += ' xmlns="%s"' % nsuri + else: +- blurb += ' xmlns:%s="%s"' % (nsname, nss[nsname]) +- blurb += '>%s' % (trans, ustr(node.name, 'utf-8')) +- if not PY3: +- blurb = blurb.encode('utf-8') +- ctxt = libxml2.createDocParserCtxt(blurb) +- if self._load_dtd: +- ctxt.loadSubset(1) +- if self._keep_entities: +- ctxt.loadSubset(1) +- ctxt.ctxtUseOptions(libxml2.XML_PARSE_DTDLOAD) +- ctxt.replaceEntities(0) +- else: +- ctxt.replaceEntities(1) +- ctxt.parseDocument() +- trnode = ctxt.doc().getRootElement() ++ blurb += ' xmlns:%s="%s"' % (nsname, nsuri) ++ blurb += '>%s' % (trans, localname) ++ parser = etree.XMLParser(load_dtd = self._load_dtd or self._keep_entities, ++ resolve_entities = not(self._keep_entities)) + try: +- self._check_errors() +- except libxml2.parserError: ++ trnode = etree.fromstring(blurb, parser) ++ except: + if strict: + raise + else: + sys.stderr.write('Warning: Could not merge %stranslation for msgid:\n%s\n' % ( +- (lang + ' ') if lang is not None else '', +- msgstr.encode('utf-8'))) +- self._xml_err = '' ++ (lang + ' ') if lang is not None else '', ++ msgstr.encode('utf-8'))) + return node +- def scan_node(node): +- children = [child for child in xml_child_iter(node)] +- for child in children: +- if child.type != 'element': ++ try: ++ for child in trnode.iterdescendants(): ++ if isinstance(child, (etree._Entity, etree._Comment, etree._ProcessingInstruction)): + continue +- if child.ns() is not None and child.ns().content == NS_BLANK: +- ph_node = msg.get_placeholder(child.name).node +- if self.has_child_elements(ph_node): ++ qname = etree.QName(child.tag) ++ if qname.namespace == NS_BLANK: ++ ph_node = msg.get_placeholder(qname.localname).node ++ if len(ph_node): + self.merge_translations(translations, None, ph_node, strict=strict) +- newnode = ph_node.copyNode(1) +- newnode.setTreeDoc(self._doc) +- child.replaceNode(newnode) ++ newnode = deepcopy(ph_node) ++ newnode.tail = child.tail ++ child.getparent().replace(child, newnode) + else: + repl = self.get_translated(ph_node, translations, strict=strict, lang=lang) +- child.replaceNode(repl) +- scan_node(child) +- try: +- scan_node(trnode) ++ repl.tail = child.tail ++ child.getparent().replace(child, repl) + except: ++ raise + if strict: + raise + else: + sys.stderr.write('Warning: Could not merge %stranslation for msgid:\n%s\n' % ( + (lang + ' ') if lang is not None else '', + msgstr.encode('utf-8'))) +- self._xml_err = '' +- ctxt.doc().freeDoc() + return node +- retnode = node.copyNode(2) +- retnode.setTreeDoc(self._doc) +- for child in xml_child_iter(trnode): +- newnode = child.copyNode(1) +- newnode.setTreeDoc(self._doc) +- retnode.addChild(newnode) +- +- ctxt.doc().freeDoc() ++ retnode = self._doc.getroot().makeelement(node.tag, node.attrib, node.nsmap) ++ retnode.text = trnode.text ++ for child in trnode.iterchildren(): ++ retnode.append(child) ++ + return retnode + + def generate_messages(self, comments=True): + if self._itst_credits is not None: + self._msgs.add_credits() +- for child in xml_child_iter(self._doc): +- if child.type == 'element': +- self.generate_message(child, None, comments=comments) +- break ++ if self._doc is not None: ++ self.generate_message(self._doc.getroot(), None, comments=comments) + + def generate_message(self, node, msg, comments=True, path=None): +- if node.type in ('text', 'cdata') and msg is not None: +- msg.add_text(node.content) ++ if isinstance(node, etree._Entity): ++ msg.add_entity_ref(node.name) + return +- if node.type == 'entity_ref': +- msg.add_entity_ref(node.name); +- if node.type != 'element': ++ if isinstance(node, XMLAttr): + return +- if node.hasNsProp('drop', NS_ITST) and node.nsProp('drop', NS_ITST) == 'yes': ++ if node.get('{' + NS_ITST + '}drop', 'no') == 'yes': + return + if self._itst_drop_nodes.get(node, 'no') == 'yes': + return +@@ -1158,9 +1050,7 @@ class Document (object): + if msg is not None: + msg.add_placeholder(node) + msg = Message() +- ctxt = None +- if node.hasNsProp('context', NS_ITST): +- ctxt = node.nsProp('context', NS_ITST) ++ ctxt = node.get('{' + NS_ITST + '}context') + if ctxt is None: + ctxt = self._itst_contexts.get(node) + if ctxt is not None: +@@ -1173,27 +1063,38 @@ class Document (object): + msg.set_preserve_space() + if self.get_its_locale_filter(node) != ('*', 'include'): + msg.set_locale_filter(self.get_its_locale_filter(node)) +- msg.add_source('%s:%i' % (self._doc.name, node.lineNo())) +- msg.add_marker('%s/%s' % (ustr(node.parent.name, 'utf-8'), ustr(node.name, 'utf-8'))) ++ msg.add_source('%s:%i' % (self._doc.docinfo.URL, node.sourceline)) ++ parent = node.getparent() ++ if parent is None: ++ ptag = '#root' ++ else: ++ ptag = xml_localname(parent) ++ msg.add_marker('%s/%s' % (ustr(ptag, 'utf-8'), ustr(xml_localname(node), 'utf-8'))) + else: + withinText = True + msg.add_start_tag(node) + + if not withinText: + # Add msg for translatable node attributes +- for attr in xml_attr_iter(node): ++ for attrname, attrval in node.items(): ++ attr = XMLAttr(node, attrname) + if self._its_translate_nodes.get(attr, 'no') == 'yes': + attr_msg = Message() + if self.get_preserve_space(attr): + attr_msg.set_preserve_space() +- attr_msg.add_source('%s:%i' % (self._doc.name, node.lineNo())) +- attr_msg.add_marker('%s/%s@%s' % (node.parent.name, node.name, attr.name)) +- attr_msg.add_text(attr.content) ++ attr_msg.add_source('%s:%i' % (self._doc.docinfo.URL, node.sourceline)) ++ attr_msg.add_marker('%s/%s@%s' % ( ++ xml_localname(node.getparent()), ++ xml_localname(node), ++ etree.QName(attrname).localname)) ++ attr_msg.add_text(attrval) + if comments: + for locnote in self.get_its_loc_notes(attr): + comment = Comment(locnote) + comment.add_marker ('%s/%s@%s' % ( +- node.parent.name, node.name, attr.name)) ++ xml_localname(node.getparent()), ++ xml_localname(node), ++ etree.QName(attrname).localname)) + attr_msg.add_comment(comment) + self._msgs.add_message(attr_msg, attr) + +@@ -1204,15 +1105,16 @@ class Document (object): + for locnote in self.get_its_loc_notes(cnode, inherit=(not withinText)): + comment = Comment(locnote) + if withinText: +- comment.add_marker('.%s/%s' % (path, cnode.name)) ++ comment.add_marker('.%s/%s' % (path, xml_localname(cnode))) + msg.add_comment(comment) + hasnote = True + if hasnote or not is_unit: + break +- cnode = cnode.parent ++ cnode = cnode.getparent() + + self.generate_external_resource_message(node) +- for attr in xml_attr_iter(node): ++ for attrname in node.keys(): ++ attr = XMLAttr(node, attrname) + self.generate_external_resource_message(attr) + idvalue = self.get_its_id_value(attr) + if idvalue is not None: +@@ -1220,9 +1122,13 @@ class Document (object): + msg.add_id_value(basename + '#' + idvalue) + + if withinText: +- path = path + '/' + node.name +- for child in xml_child_iter(node): ++ path = path + '/' + node.tag ++ if node.text is not None and msg is not None: ++ msg.add_text(node.text) ++ for child in node.iterchildren(): + self.generate_message(child, msg, comments=comments, path=path) ++ if child.tail is not None and msg is not None: ++ msg.add_text(child.tail) + + if translate: + if is_unit and not msg.is_empty(): +@@ -1234,12 +1140,17 @@ class Document (object): + if node not in self._its_externals: + return + resref = self._its_externals[node] +- if node.type == 'element': +- translate = self.get_its_translate(node) +- marker = '%s/%s' % (node.parent.name, node.name) ++ if isinstance(node, XMLAttr): ++ elem = node.getparent() ++ translate = self.get_its_translate(elem) ++ marker = '%s/%s/@%s' % ( ++ xml_localname(elem.getparent()), ++ xml_localname(elem), ++ xml_localname(node)) + else: +- translate = self.get_its_translate(node.parent) +- marker = '%s/%s/@%s' % (node.parent.parent.name, node.parent.name, node.name) ++ translate = self.get_its_translate(node) ++ marker = '%s/%s' % (xml_localname(node.getparent()), ++ xml_localname(node)) + if translate == 'no': + return + msg = Message() +@@ -1253,7 +1164,7 @@ class Document (object): + txt = "external ref='%s' md5='%s'" % (resref, filemd5) + msg.set_context('_') + msg.add_text(txt) +- msg.add_source('%s:%i' % (self._doc.name, node.lineNo())) ++ msg.add_source('%s:%i' % (self._doc.docinfo.URL, node.sourceline)) + msg.add_marker(marker) + msg.add_comment(Comment('This is a reference to an external file such as an image or' + ' video. When the file changes, the md5 hash will change to' +@@ -1265,44 +1176,41 @@ class Document (object): + def is_translation_unit (self, node): + return self.get_its_within_text(node) != 'yes' + +- def has_child_elements(self, node): +- return len([child for child in xml_child_iter(node) if child.type=='element']) +- + def get_preserve_space (self, node): +- while node.type in ('attribute', 'element'): +- if node.getSpacePreserve() == 1: ++ while node is not None: ++ if node.get('{' + NS_XML + '}space') == 'preserve': + return True + if node in self._its_preserve_space_nodes: + return (self._its_preserve_space_nodes[node] == 'preserve') +- node = node.parent ++ node = node.getparent() + return False + + def get_its_translate(self, node): + if node in self._its_translate_nodes_cache: + return self._its_translate_nodes_cache[node] + val = None +- if node.hasNsProp('translate', NS_ITS): +- val = node.nsProp('translate', NS_ITS) +- elif xml_is_ns_name(node, NS_ITS, 'span') and node.hasNsProp('translate', None): +- val = node.nsProp('translate', None) ++ if '{' + NS_ITS + '}translate' in node.attrib: ++ val = node.get('{' + NS_ITS + '}translate') ++ elif node.tag == '{' + NS_ITS + '}span' and 'translate' in node.attrib: ++ val = node.get('translate') + elif node in self._its_translate_nodes: + val = self._its_translate_nodes[node] + if val is not None: + self._its_translate_nodes_cache[node] = val + return val +- if node.type == 'attribute': ++ if isinstance(node, XMLAttr): + return 'no' +- if node.parent.type == 'element': +- parval = self.get_its_translate(node.parent) ++ if node.getparent() is not None: ++ parval = self.get_its_translate(node.getparent()) + self._its_translate_nodes_cache[node] = parval + return parval + return 'yes' + + def get_its_within_text(self, node): +- if node.hasNsProp('withinText', NS_ITS): +- val = node.nsProp('withinText', NS_ITS) +- elif xml_is_ns_name(node, NS_ITS, 'span') and node.hasNsProp('withinText', None): +- val = node.nsProp('withinText', None) ++ if '{' + NS_ITS + '}withinText' in node.attrib: ++ val = node.get('{' + NS_ITS + '}withinText') ++ elif node.tag == '{' + NS_ITS + '}span' and 'withinText' in node.attrib: ++ val = node.get('withinText') + else: + return self._its_within_text_nodes.get(node, 'no') + if val in ('yes', 'nested'): +@@ -1312,73 +1220,63 @@ class Document (object): + def get_its_locale_filter(self, node): + if node in self._its_locale_filters_cache: + return self._its_locale_filters_cache[node] +- if node.hasNsProp('localeFilterList', NS_ITS) or node.hasNsProp('localeFilterType', NS_ITS): +- if node.hasNsProp('localeFilterList', NS_ITS): +- lst = node.nsProp('localeFilterList', NS_ITS) +- else: +- lst = '*' +- if node.hasNsProp('localeFilterType', NS_ITS): +- typ = node.nsProp('localeFilterType', NS_ITS) +- else: +- typ = 'include' ++ if ('{' + NS_ITS + '}localeFilterList' in node.attrib or ++ '{' + NS_ITS + '}localeFilterType' in node.attrib): ++ lst = node.get('{' + NS_ITS + '}localeFilterList', '*') ++ typ = node.get('{' + NS_ITS + '}localeFilterType', 'include') + return (lst, typ) +- if (xml_is_ns_name(node, NS_ITS, 'span') and +- (node.hasNsProp('localeFilterList', None) or node.hasNsProp('localeFilterType', None))): +- if node.hasNsProp('localeFilterList', None): +- lst = node.nsProp('localeFilterList', None) +- else: +- lst = '*' +- if node.hasNsProp('localeFilterType', None): +- typ = node.nsProp('localeFilterType', None) +- else: +- typ = 'include' ++ if (node.tag == '{' + NS_ITS + '}span' and ++ ('localeFilterList' in node.attrib or 'localeFilterType' in node.attrib)): ++ lst = node.get('localeFilterList', '*') ++ typ = node.get('localeFilterType', 'include') + return (lst, typ) + if node in self._its_locale_filters: + return self._its_locale_filters[node] +- if node.parent.type == 'element': +- parval = self.get_its_locale_filter(node.parent) ++ if node.getparent() is not None: ++ parval = self.get_its_locale_filter(node.getparent()) + self._its_locale_filters_cache[node] = parval + return parval + return ('*', 'include') + + def get_itst_drop(self, node): +- if node.hasNsProp('drop', NS_ITST) and node.nsProp('drop', NS_ITST) == 'yes': ++ if node.get('{' + NS_ITST + '}drop') == 'yes': + return 'yes' + if self._itst_drop_nodes.get(node, 'no') == 'yes': + return 'yes' + return 'no' + + def get_its_id_value(self, node): +- if node.hasNsProp('id', NS_XML): +- return node.nsProp('id', NS_XML) ++ if '{' + NS_XML + '}id' in node.attrib: ++ return node.get('{' + NS_XML + '}id') + return self._its_id_values.get(node, None) + + def get_its_loc_notes(self, node, inherit=True): + if node in self._its_loc_notes_cache: + return self._its_loc_notes_cache[node] + ret = [] +- if ( node.hasNsProp('locNote', NS_ITS) or +- node.hasNsProp('locNoteRef', NS_ITS) or +- node.hasNsProp('locNoteType', NS_ITS) ): +- notetype = node.nsProp('locNoteType', NS_ITS) +- if node.hasNsProp('locNote', NS_ITS): +- ret.append(LocNote(locnote=node.nsProp('locNote', NS_ITS), locnotetype=notetype)) +- elif node.hasNsProp('locNoteRef', NS_ITS): +- ret.append(LocNote(locnoteref=node.nsProp('locNoteRef', NS_ITS), locnotetype=notetype)) +- elif xml_is_ns_name(node, NS_ITS, 'span'): +- if ( node.hasNsProp('locNote', None) or +- node.hasNsProp('locNoteRef', None) or +- node.hasNsProp('locNoteType', None) ): +- notetype = node.nsProp('locNoteType', None) +- if node.hasNsProp('locNote', None): +- ret.append(LocNote(locnote=node.nsProp('locNote', None), locnotetype=notetype)) +- elif node.hasNsProp('locNoteRef', None): +- ret.append(LocNote(locnoteref=node.nsProp('locNoteRef', None), locnotetype=notetype)) ++ if ( '{' + NS_ITS + '}locNote' in node.attrib or ++ '{' + NS_ITS + '}locNoteRef' in node.attrib or ++ '{' + NS_ITS + '}locNoteType' in node.attrib ): ++ notetype = node.get('{' + NS_ITS + '}locNoteType') ++ if '{' + NS_ITS + '}locNote' in node.attrib: ++ ret.append(LocNote(locnote=node.get('{' + NS_ITS + '}locNote'), locnotetype=notetype)) ++ elif '{' + NS_ITS + '}locNoteRef' in node.attrib: ++ ret.append(LocNote(locnoteref=node.get('{' + NS_ITS + '}locNoteRef'), locnotetype=notetype)) ++ elif node.tag == '{' + NS_ITS + '}span': ++ if ( 'locNote' in node.attrib or ++ 'locNoteRef' in node.attrib or ++ 'locNoteType' in node.attrib ): ++ notetype = node.get('locNoteType') ++ if 'locNote' in node.attrib: ++ ret.append(LocNote(locnote=node.get('locNote'), locnotetype=notetype)) ++ elif 'locNoteRef' in node.attrib: ++ ret.append(LocNote(locnoteref=node.get('locNoteRef'), locnotetype=notetype)) + for locnote in reversed(self._its_loc_notes.get(node, [])): + ret.append(locnote) + if (len(ret) == 0 and inherit and +- node.type != 'attribute' and node.parent is not None and node.parent.type == 'element'): +- parval = self.get_its_loc_notes(node.parent) ++ not isinstance(node, XMLAttr) and ++ node.getparent() is not None): ++ parval = self.get_its_loc_notes(node.getparent()) + self._its_loc_notes_cache[node] = parval + return parval + self._its_loc_notes_cache[node] = ret +@@ -1386,12 +1284,12 @@ class Document (object): + + def output_test_data(self, category, out, node=None): + if node is None: +- node = self._doc.getRootElement() ++ node = self._doc.getroot() + compval = '' + if category == 'translate': + compval = 'translate="%s"' % self.get_its_translate(node) + elif category == 'withinText': +- if node.type != 'attribute': ++ if not isinstance(node, XMLAttr): + compval = 'withinText="%s"' % self.get_its_within_text(node) + elif category == 'localeFilter': + compval = 'localeFilterList="%s"\tlocaleFilterType="%s"' % self.get_its_locale_filter(node) +@@ -1422,16 +1320,32 @@ class Document (object): + out.write('%s\t%s\r\n' % (xml_get_node_path(node), compval)) + else: + out.write('%s\r\n' % (xml_get_node_path(node))) +- for attr in sorted(xml_attr_iter(node), key=ustr): ++ for attrname in sorted(node.keys(), key=ustr): ++ attr = XMLAttr(node, attrname) + self.output_test_data(category, out, attr) +- for child in xml_child_iter(node): +- if child.type == 'element': +- self.output_test_data(category, out, child) ++ for child in node.iterchildren(): ++ self.output_test_data(category, out, child) + +- @staticmethod +- def _try_xpath_eval (xpath, expr): ++ def _try_xpath_eval (self, xpath, expr, node=None): ++ if node is None: ++ node = self._doc ++ elif isinstance(node, XMLAttr): ++ # lxml doesn't support attributes as XPath context nodes. ++ if expr == '.': ++ return [ node ] ++ sys.stderr.write('Warning: Unsupported XPath on attribute: %s\n' % expr) ++ return [] + try: +- return xpath.xpathEval(expr) ++ result = node.xpath(expr, namespaces=xpath[0], **xpath[1]) ++ if not isinstance(result, str): ++ for i in range(len(result)): ++ val = result[i] ++ # Use lxml's "smart string" feature to determine ++ # the attribute node. ++ if (isinstance(val, etree._ElementUnicodeResult) and ++ val.is_attribute): ++ result[i] = XMLAttr(val.getparent(), val.attrname) ++ return result + except: + sys.stderr.write('Warning: Invalid XPath: %s\n' % expr) + return [] +@@ -1636,11 +1550,11 @@ if __name__ == '__main__': + raise + sys.stderr.write('Error: Could not merge translations:\n%s\n' % ustr(e)) + sys.exit(1) +- serialized = doc._doc.serialize('utf-8') +- if PY3: +- # For some reason, under py3, our serialized data is returns as a str. +- # Let's encode it to bytes +- serialized = serialized.encode('utf-8') ++ # lxml generates XML declarations with single quotes. ++ serialized = ( ++ b'\n' + ++ etree.tostring(doc._doc, encoding='utf-8') + ++ b'\n') + fout = out + fout_is_str = isinstance(fout, string_types) + if fout_is_str: +@@ -1675,11 +1589,11 @@ if __name__ == '__main__': + for itsfile in opts.itsfile: + doc.apply_its_file(itsfile, userparams=userparams) + doc.join_translations(translations, strict=opts.strict) +- serialized = doc._doc.serialize('utf-8') +- if PY3: +- # For some reason, under py3, our serialized data is returns as a str. +- # Let's encode it to bytes +- serialized = serialized.encode('utf-8') ++ # lxml generates XML declarations with single quotes. ++ serialized = ( ++ b'\n' + ++ etree.tostring(doc._doc, encoding='utf-8') + ++ b'\n') + out.write(serialized) + out.flush() + +From 30289d5b532bb888f2e6099c04976e441141dd01 Mon Sep 17 00:00:00 2001 +From: Nick Wellnhofer +Date: Mon, 12 May 2025 17:11:31 +0200 +Subject: [PATCH 2/5] Fix element check + +--- + itstool.in | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/itstool.in b/itstool.in +index 052255e..4c39660 100755 +--- a/itstool.in ++++ b/itstool.in +@@ -864,7 +864,6 @@ class Document (object): + newnode.tail = prevtext + i += 1 + +- + def merge_translations(self, translations, language, node=None, strict=False): + is_root = False + if node is None: +@@ -1026,7 +1025,8 @@ class Document (object): + if isinstance(node, etree._Entity): + msg.add_entity_ref(node.name) + return +- if isinstance(node, XMLAttr): ++ # Only allow elements ++ if isinstance(node, XMLAttr) or not isinstance(node.tag, str): + return + if node.get('{' + NS_ITST + '}drop', 'no') == 'yes': + return + +From 0d79db8eacc6787b397caa34f0d849afec52c582 Mon Sep 17 00:00:00 2001 +From: Nick Wellnhofer +Date: Tue, 27 May 2025 20:15:56 +0200 +Subject: [PATCH 3/5] Rewrite remaining call to `addContent` + +--- + itstool.in | 11 ++++++++++- + 1 file changed, 10 insertions(+), 1 deletion(-) + +diff --git a/itstool.in b/itstool.in +index 4c39660..78887ca 100755 +--- a/itstool.in ++++ b/itstool.in +@@ -791,7 +791,16 @@ class Document (object): + if val is not None: + if not PY3: + val = val.encode('utf-8') +- parent.addContent(val) ++ if len(parent): ++ if parent[-1].tail: ++ parent[-1].tail += val ++ else: ++ parent[-1].tail = val ++ else: ++ if parent.text: ++ parent.text += val ++ else: ++ parent.text = val + else: + newnode = parent.makeelement(node.tag, node.attrib) + parent.append(newnode) + +From d8c9a667e6af706cc7729e3d90b8f80b12c4f80e Mon Sep 17 00:00:00 2001 +From: Nick Wellnhofer +Date: Tue, 27 May 2025 21:45:06 +0200 +Subject: [PATCH 4/5] Fix extraction of namespaced attributes from langPointer + +--- + itstool.in | 12 +++++++++++- + 1 file changed, 11 insertions(+), 1 deletion(-) + +diff --git a/itstool.in b/itstool.in +index 78887ca..c7729fd 100755 +--- a/itstool.in ++++ b/itstool.in +@@ -677,8 +677,18 @@ class Document (object): + # language information. Technically, langPointer could be + # any XPath expression. But if it looks like an attribute + # accessor, just use the attribute name. ++ # TODO: This should probably be skipped if langPointer ++ # equals '@xml:lang' which is the default. + if lp[0] == '@': +- self._itst_lang_attr[node] = lp[1:] ++ name = lp[1:] ++ if ':' in name: ++ prefix, lname = name.split(':', 2) ++ nsuri = node.nsmap.get(prefix) ++ if nsuri is None: ++ name = lname ++ else: ++ name = '{' + nsuri + '}' + lname ++ self._itst_lang_attr[node] = name + elif rule.tag == '{' + NS_ITST + '}credits': + sel = rule.get('appendTo') + if sel is not None: + +From 0f6751a586422719442eb7e9ddfe635ec7ca06fb Mon Sep 17 00:00:00 2001 +From: Nick Wellnhofer +Date: Wed, 28 May 2025 19:46:44 +0200 +Subject: [PATCH 5/5] Handle missing placeholders more gracefully + +Print a warning to stderr instead of crashing. + +This is not related to the lxml migration but to issue #1. +--- + itstool.in | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +diff --git a/itstool.in b/itstool.in +index c7729fd..eebd181 100755 +--- a/itstool.in ++++ b/itstool.in +@@ -1008,7 +1008,12 @@ class Document (object): + continue + qname = etree.QName(child.tag) + if qname.namespace == NS_BLANK: +- ph_node = msg.get_placeholder(qname.localname).node ++ ph = msg.get_placeholder(qname.localname) ++ if ph is None: ++ sys.stderr.write('Warning: Could not find placeholder %s\n' % ( ++ qname.localname)) ++ continue ++ ph_node = ph.node + if len(ph_node): + self.merge_translations(translations, None, ph_node, strict=strict) + newnode = deepcopy(ph_node) diff --git a/backport-Fix-insufficiently-quoted-regular-expressions.patch b/backport-Fix-insufficiently-quoted-regular-expressions.patch new file mode 100644 index 0000000..7cffcbd --- /dev/null +++ b/backport-Fix-insufficiently-quoted-regular-expressions.patch @@ -0,0 +1,73 @@ +From 32c7d07664dc37765100285d1202d488cd6a27e8 Mon Sep 17 00:00:00 2001 +From: Nils Philippsen +Date: Mon, 9 Oct 2023 14:26:43 +0200 +Subject: [PATCH] Fix insufficiently quoted regular expressions + +These went under the radar until Python 3.12 started warning about them. + +Signed-off-by: Nils Philippsen +--- + itstool.in | 14 +++++++------- + 1 file changed, 7 insertions(+), 7 deletions(-) + +diff --git a/itstool.in b/itstool.in +index c21ad4b..4452616 100755 +--- a/itstool.in ++++ b/itstool.in +@@ -220,7 +220,7 @@ class Message (object): + if not isinstance(text, ustr_type): + text = ustr(text, 'utf-8') + self._message[-1] += text.replace('&', '&').replace('<', '<').replace('>', '>') +- if re.sub('\s+', ' ', text).strip() != '': ++ if re.sub(r'\s+', ' ', text).strip() != '': + self._empty = False + + def add_entity_ref (self, name): +@@ -318,7 +318,7 @@ class Message (object): + message += '<_:%s-%i/>' % (msg.name, placeholder) + placeholder += 1 + if not self._preserve: +- message = re.sub('\s+', ' ', message).strip() ++ message = re.sub(r'\s+', ' ', message).strip() + return message + + def get_preserve_space (self): +@@ -456,9 +456,9 @@ class LocNote (object): + if self._preserve_space: + return self.locnote + else: +- return re.sub('\s+', ' ', self.locnote).strip() ++ return re.sub(r'\s+', ' ', self.locnote).strip() + elif self.locnoteref is not None: +- return '(itstool) link: ' + re.sub('\s+', ' ', self.locnoteref).strip() ++ return '(itstool) link: ' + re.sub(r'\s+', ' ', self.locnoteref).strip() + return '' + + +@@ -889,7 +889,7 @@ class Document (object): + trans = translations.ugettext('_\x04translator-credits') + if trans is None or trans == 'translator-credits': + return +- regex = re.compile('(.*) \<(.*)\>, (.*)') ++ regex = re.compile(r'(.*) \<(.*)\>, (.*)') + for credit in trans.split('\n'): + match = regex.match(credit) + if not match: +@@ -924,7 +924,7 @@ class Document (object): + prevnode = None + if node.prev is not None and node.prev.type == 'text': + prevtext = node.prev.content +- if re.sub('\s+', '', prevtext) == '': ++ if re.sub(r'\s+', '', prevtext) == '': + prevnode = node.prev + for lang in sorted(list(translations.keys()), reverse=True): + locale = self.get_its_locale_filter(node) +@@ -1468,7 +1468,7 @@ def match_locale(extrange, locale): + localei += 1 + return True + +-_locale_pattern = re.compile('([a-zA-Z0-9-]+)(_[A-Za-z0-9]+)?(@[A-Za-z0-9]+)?(\.[A-Za-z0-9]+)?') ++_locale_pattern = re.compile(r'([a-zA-Z0-9-]+)(_[A-Za-z0-9]+)?(@[A-Za-z0-9]+)?(\.[A-Za-z0-9]+)?') + def convert_locale (locale): + # Automatically convert POSIX-style locales to BCP47 + match = _locale_pattern.match(locale) diff --git a/itstool.spec b/itstool.spec index ede0b55..efe06e4 100644 --- a/itstool.spec +++ b/itstool.spec @@ -1,29 +1,31 @@ Name: itstool Version: 2.0.7 -Release: 1 +Release: 2 Summary: ITS-based XML translation tool -License: GPLv3+ +License: GPL-3.0-or-later URL: http://itstool.org/ Source0: http://files.itstool.org/itstool/%{name}-%{version}.tar.bz2 +# https://github.com/itstool/itstool/pull/51 +Patch6001: backport-Fix-insufficiently-quoted-regular-expressions.patch +# https://github.com/itstool/itstool/pull/57 +Patch9001: Use-pylxml.patch BuildArch: noarch -BuildRequires: python3-libxml2 python3-devel -Requires: python3-libxml2 +BuildRequires: python3-lxml python3-devel +BuildRequires: autoconf automake libtool +Requires: python3-lxml %description ITS Tool allows you to translate your XML documents with PO files, using rules from the W3C Internationalization Tag Set (ITS)\ to determine what to translate and how to separate it into PO file messages -%package help -Summary: Help manual for %{name} - -%description help -The %{name}-help package conatins man manual etc +%package_help %prep %autosetup -n %{name}-%{version} -p1 %build +autoreconf -fi export PYTHON=%{__python3} %configure %make_build @@ -38,9 +40,12 @@ export PYTHON=%{__python3} %files help %doc NEWS -%doc %{_mandir}/man1/itstool.1.gz +%{_mandir}/man1/* %changelog +* Mon Sep 15 2025 Funda Wang - 2.0.7-2 +- Use lxml instead of libxml2 + * Wed Jun 15 2022 SimpleUpdate Robot - 2.0.7-1 - Upgrade to version 2.0.7 -- Gitee