diff --git a/0001-Adapt-to-libxml2-2.10.4-and-later.patch b/0001-Adapt-to-libxml2-2.10.4-and-later.patch
new file mode 100644
index 0000000000000000000000000000000000000000..5b47d41c6d293be4743fc58a78728b397ae755ba
--- /dev/null
+++ b/0001-Adapt-to-libxml2-2.10.4-and-later.patch
@@ -0,0 +1,251 @@
+From a1669cfdefa1e9762e7a8297f4413ecbf373888d Mon Sep 17 00:00:00 2001
+From: han-guangyu
+Date: Mon, 20 May 2024 13:57:13 +0800
+Subject: [PATCH 1/1] Adapt to libxml2 2.10.4 and later
+
+Change HTML "prefix" handling in ElementPath to let
+"element.find('part1:part2')" search for "part1:part2" instead of just
+"part2" with an unknown prefix.
+
+Also adapt the HTML "prefix" parsing test to make it work in libxml2
+2.10.4 and later, where HTML "prefixes" are kept as part of the tag
+name by the parser.
+---
+ CHANGES.txt | 12 ++++++++++++
+ src/lxml/_elementpath.py | 21 +++++++++++----------
+ src/lxml/apihelpers.pxi | 7 +++++++
+ src/lxml/etree.pyx | 8 ++++----
+ src/lxml/includes/tree.pxd | 11 +++++++++++
+ src/lxml/tests/test_etree.py | 26 ++++++++++++++++++++++----
+ 6 files changed, 67 insertions(+), 18 deletions(-)
+
+diff --git a/CHANGES.txt b/CHANGES.txt
+index 4dd1055..0e47581 100644
+--- a/CHANGES.txt
++++ b/CHANGES.txt
+@@ -24,6 +24,18 @@ Other changes
+ 
+ * Built with Cython 0.29.37.
+ 
++Bugs fixed in openEuler
++-----------------------
++
++* With libxml2 2.10.4 and later (as provided by the lxml 5.0 binary wheels),
++  parsing HTML tags with "prefixes" no longer builds a namespace dictionary
++  in ``nsmap`` but considers the ``prefix:name`` string the actual tag name.
++  With older libxml2 versions, since 2.9.11, the prefix was removed. Before
++  that, the prefix was parsed as XML prefix.
++
++  lxml 5.0 does not try to hide this difference but now changes the ElementPath
++  implementation to let ``element.find("part1:part2")`` search for the tag
++  ``part1:part2`` in documents parsed as HTML, instead of looking only for ``part2``.
+ 
+ 4.9.3 (2023-07-05)
+ ==================
+diff --git a/src/lxml/_elementpath.py b/src/lxml/_elementpath.py
+index eabd81c..24b8e2b 100644
+--- a/src/lxml/_elementpath.py
++++ b/src/lxml/_elementpath.py
+@@ -71,14 +71,14 @@ xpath_tokenizer_re = re.compile(
+     r"\s+"
+     )
+ 
+-def xpath_tokenizer(pattern, namespaces=None):
++def xpath_tokenizer(pattern, namespaces=None, with_prefixes=True):
+     # ElementTree uses '', lxml used None originally.
+     default_namespace = (namespaces.get(None) or namespaces.get('')) if namespaces else None
+     parsing_attribute = False
+     for token in xpath_tokenizer_re.findall(pattern):
+         ttype, tag = token
+         if tag and tag[0] != "{":
+-            if ":" in tag:
++            if ":" in tag and with_prefixes:
+                 prefix, uri = tag.split(":", 1)
+                 try:
+                     if not namespaces:
+@@ -251,7 +251,7 @@ ops = {
+ _cache = {}
+ 
+ 
+-def _build_path_iterator(path, namespaces):
++def _build_path_iterator(path, namespaces, with_prefixes=True):
+     """compile selector pattern"""
+     if path[-1:] == "/":
+         path += "*" # implicit all (FIXME: keep this?)
+@@ -279,7 +279,7 @@ def _build_path_iterator(path, namespaces):
+ 
+     if path[:1] == "/":
+         raise SyntaxError("cannot use absolute path on element")
+-    stream = iter(xpath_tokenizer(path, namespaces))
++    stream = iter(xpath_tokenizer(path, namespaces, with_prefixes=with_prefixes))
+     try:
+         _next = stream.next
+     except AttributeError:
+@@ -308,7 +308,7 @@ def _build_path_iterator(path, namespaces):
+ ##
+ # Iterate over the matching nodes
+ 
+-def iterfind(elem, path, namespaces=None):
+-    selector = _build_path_iterator(path, namespaces)
++def iterfind(elem, path, namespaces=None, with_prefixes=True):
++    selector = _build_path_iterator(path, namespaces, with_prefixes=with_prefixes)
+     result = iter((elem,))
+     for select in selector:
+@@ -319,8 +319,8 @@ def iterfind(elem, path, namespaces=None):
+ ##
+ # Find first matching object.
+ 
+-def find(elem, path, namespaces=None):
+-    it = iterfind(elem, path, namespaces)
++def find(elem, path, namespaces=None, with_prefixes=True):
++    it = iterfind(elem, path, namespaces, with_prefixes=with_prefixes)
+     try:
+         return next(it)
+     except StopIteration:
+@@ -330,15 +331,15 @@ def find(elem, path, namespaces=None):
+ ##
+ # Find all matching objects.
+ 
+-def findall(elem, path, namespaces=None):
+-    return list(iterfind(elem, path, namespaces))
++def findall(elem, path, namespaces=None, with_prefixes=True):
++    return list(iterfind(elem, path, namespaces, with_prefixes=with_prefixes))
+ 
+ 
+ ##
+ # Find text for first matching object.
+ 
+-def findtext(elem, path, default=None, namespaces=None):
+-    el = find(elem, path, namespaces)
++def findtext(elem, path, default=None, namespaces=None, with_prefixes=True):
++    el = find(elem, path, namespaces, with_prefixes=with_prefixes)
+     if el is None:
+         return default
+     else:
+diff --git a/src/lxml/apihelpers.pxi b/src/lxml/apihelpers.pxi
+index 9fae9fb..35b3187 100644
+--- a/src/lxml/apihelpers.pxi
++++ b/src/lxml/apihelpers.pxi
+@@ -15,6 +15,13 @@ cdef void displayNode(xmlNode* c_node, indent):
+     finally:
+         return # swallow any exceptions
+ 
++cdef inline bint _isHtmlDocument(_Element element) except -1:
++    cdef xmlNode* c_node = element._c_node
++    return (
++        c_node is not NULL and c_node.doc is not NULL and
++        c_node.doc.properties & tree.XML_DOC_HTML != 0
++    )
++
+ cdef inline int _assertValidNode(_Element element) except -1:
+     assert element._c_node is not NULL, u"invalid Element proxy at %s" % id(element)
+ 
+diff --git a/src/lxml/etree.pyx b/src/lxml/etree.pyx
+index ff8ec9f..a2a776c 100644
+--- a/src/lxml/etree.pyx
++++ b/src/lxml/etree.pyx
+@@ -1554,7 +1554,7 @@ cdef public class _Element [ type LxmlElementType, object LxmlElement ]:
+         """
+         if isinstance(path, QName):
+             path = (path).text
+-        return _elementpath.find(self, path, namespaces)
++        return _elementpath.find(self, path, namespaces, with_prefixes=not _isHtmlDocument(self))
+ 
+     def findtext(self, path, default=None, 
namespaces=None): + u"""findtext(self, path, default=None, namespaces=None) +@@ -1567,7 +1567,7 @@ cdef public class _Element [ type LxmlElementType, object LxmlElement ]: + """ + if isinstance(path, QName): + path = (path).text +- return _elementpath.findtext(self, path, default, namespaces) ++ return _elementpath.findtext(self, path, default, namespaces, with_prefixes=not _isHtmlDocument(self)) + + def findall(self, path, namespaces=None): + u"""findall(self, path, namespaces=None) +@@ -1580,7 +1580,7 @@ cdef public class _Element [ type LxmlElementType, object LxmlElement ]: + """ + if isinstance(path, QName): + path = (path).text +- return _elementpath.findall(self, path, namespaces) ++ return _elementpath.findall(self, path, namespaces, with_prefixes=not _isHtmlDocument(self)) + + def iterfind(self, path, namespaces=None): + u"""iterfind(self, path, namespaces=None) +@@ -1593,7 +1593,7 @@ cdef public class _Element [ type LxmlElementType, object LxmlElement ]: + """ + if isinstance(path, QName): + path = (path).text +- return _elementpath.iterfind(self, path, namespaces) ++ return _elementpath.iterfind(self, path, namespaces, with_prefixes=not _isHtmlDocument(self)) + + def xpath(self, _path, *, namespaces=None, extensions=None, + smart_strings=True, **_variables): +diff --git a/src/lxml/includes/tree.pxd b/src/lxml/includes/tree.pxd +index 010af80..5f21329 100644 +--- a/src/lxml/includes/tree.pxd ++++ b/src/lxml/includes/tree.pxd +@@ -153,6 +153,16 @@ cdef extern from "libxml/tree.h": + XML_INTERNAL_PARAMETER_ENTITY= 4 + XML_EXTERNAL_PARAMETER_ENTITY= 5 + XML_INTERNAL_PREDEFINED_ENTITY= 6 ++ ctypedef enum xmlDocProperties: ++ XML_DOC_WELLFORMED = 1 # /* document is XML well formed */ ++ XML_DOC_NSVALID = 2 # /* document is Namespace valid */ ++ XML_DOC_OLD10 = 4 # /* parsed with old XML-1.0 parser */ ++ XML_DOC_DTDVALID = 8 # /* DTD validation was successful */ ++ XML_DOC_XINCLUDE = 16 # /* XInclude substitution was done */ ++ XML_DOC_USERBUILT = 32 # /* 
Document was built using the API ++ # and not by parsing an instance */ ++ XML_DOC_INTERNAL = 64 # /* built for internal processing */ ++ XML_DOC_HTML = 128 # /* parsed or built HTML document */ + + ctypedef struct xmlNs: + const_xmlChar* href +@@ -274,6 +284,7 @@ cdef extern from "libxml/tree.h": + void* _private + xmlDtd* intSubset + xmlDtd* extSubset ++ int properties + + ctypedef struct xmlAttr: + void* _private +diff --git a/src/lxml/tests/test_etree.py b/src/lxml/tests/test_etree.py +index 9eab5bf..2afc07e 100644 +--- a/src/lxml/tests/test_etree.py ++++ b/src/lxml/tests/test_etree.py +@@ -3121,11 +3121,29 @@ class ETreeOnlyTestCase(HelperTestCase): + + def test_html_prefix_nsmap(self): + etree = self.etree +- el = etree.HTML('aa').find('.//page-description') +- if etree.LIBXML_VERSION < (2, 9, 11): +- self.assertEqual({'hha': None}, el.nsmap) ++ el = etree.HTML('aa') ++ pd = el[-1] ++ while len(pd): ++ pd = pd[-1] ++ ++ if etree.LIBXML_VERSION >= (2, 10, 4): ++ # "Prefix" is kept as part of the tag name. ++ self.assertEqual("hha:page-description", pd.tag) ++ self.assertIsNone(el.find('.//page-description')) ++ self.assertIsNotNone(el.find('.//hha:page-description')) # no namespaces! ++ for e in el.iter(): ++ self.assertEqual({}, e.nsmap) ++ elif etree.LIBXML_VERSION >= (2, 9, 11): ++ # "Prefix" is stripped. ++ self.assertEqual("page-description", pd.tag) ++ self.assertIsNotNone(el.find('.//page-description')) ++ for e in el.iter(): ++ self.assertEqual({}, e.nsmap) + else: +- self.assertEqual({}, el.nsmap) ++ # "Prefix" is parsed as XML prefix. 
++ self.assertEqual("page-description", pd.tag) ++ pd = el.find('.//page-description') ++ self.assertEqual({'hha': None}, pd.nsmap) + + def test_getchildren(self): + Element = self.etree.Element +-- +2.43.0 + diff --git a/0002-Fix-test_elementtree-with-Expat-2.6.0.patch b/0002-Fix-test_elementtree-with-Expat-2.6.0.patch new file mode 100644 index 0000000000000000000000000000000000000000..f95b981fa10cd385a81755e6d69c328726d1ae8d --- /dev/null +++ b/0002-Fix-test_elementtree-with-Expat-2.6.0.patch @@ -0,0 +1,58 @@ +diff --git a/src/lxml/tests/test_elementtree.py b/src/lxml/tests/test_elementtree.py +index 96426cb..ef3302e 100644 +--- a/src/lxml/tests/test_elementtree.py ++++ b/src/lxml/tests/test_elementtree.py +@@ -4396,29 +4396,30 @@ class _XMLPullParserTest(unittest.TestCase): + self.assertEqual([(action, elem.tag) for action, elem in events], + expected) + +- def test_simple_xml(self): +- for chunk_size in (None, 1, 5): +- #with self.subTest(chunk_size=chunk_size): +- parser = self.etree.XMLPullParser() +- self.assert_event_tags(parser, []) +- self._feed(parser, "\n", chunk_size) +- self.assert_event_tags(parser, []) +- self._feed(parser, +- "\n text\n", chunk_size) +- self.assert_event_tags(parser, [('end', 'element')]) +- self._feed(parser, "texttail\n", chunk_size) +- self._feed(parser, "\n", chunk_size) +- self.assert_event_tags(parser, [ +- ('end', 'element'), +- ('end', 'empty-element'), +- ]) +- self._feed(parser, "\n", chunk_size) +- self.assert_event_tags(parser, [('end', 'root')]) +- root = self._close_and_return_root(parser) +- self.assertEqual(root.tag, 'root') ++ def test_simple_xml(self, chunk_size=None): ++ parser = self.etree.XMLPullParser() ++ self.assert_event_tags(parser, []) ++ self._feed(parser, "\n", chunk_size) ++ self.assert_event_tags(parser, []) ++ self._feed(parser, ++ "\n text\n", chunk_size) ++ self.assert_event_tags(parser, [('end', 'element')]) ++ self._feed(parser, "texttail\n", chunk_size) ++ self._feed(parser, "\n", chunk_size) 
++ self.assert_event_tags(parser, [ ++ ('end', 'element'), ++ ('end', 'empty-element'), ++ ]) ++ self._feed(parser, "\n", chunk_size) ++ self.assert_event_tags(parser, [('end', 'root')]) ++ root = self._close_and_return_root(parser) ++ self.assertEqual(root.tag, 'root') ++ ++ def test_simple_xml_chunk_22(self): ++ self.test_simple_xml(chunk_size=22) + + def test_feed_while_iterating(self): + parser = self.etree.XMLPullParser() diff --git a/backport-CVE-2021-28957.patch b/backport-CVE-2021-28957.patch deleted file mode 100644 index de583e45171517623ab437a4267b20d427ab50b4..0000000000000000000000000000000000000000 --- a/backport-CVE-2021-28957.patch +++ /dev/null @@ -1,52 +0,0 @@ -From 2d01a1ba8984e0483ce6619b972832377f208a0d Mon Sep 17 00:00:00 2001 -From: Kevin Chung -Date: Sun, 21 Mar 2021 10:03:09 -0400 -Subject: [PATCH] Add HTML-5 "formaction" attribute to "defs.link_attrs" - (GH-316) - -Resolves https://bugs.launchpad.net/lxml/+bug/1888153 -See https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2021-28957 ---- - src/lxml/html/defs.py | 2 ++ - src/lxml/html/tests/test_clean.py | 15 +++++++++++++++ - 2 files changed, 17 insertions(+) - -diff --git a/src/lxml/html/defs.py b/src/lxml/html/defs.py -index 1b3a75b36..2058ea330 100644 ---- a/src/lxml/html/defs.py -+++ b/src/lxml/html/defs.py -@@ -23,6 +23,8 @@ - 'usemap', - # Not standard: - 'dynsrc', 'lowsrc', -+ # HTML5 formaction -+ 'formaction' - ]) - - # Not in the HTML 4 spec: -diff --git a/src/lxml/html/tests/test_clean.py b/src/lxml/html/tests/test_clean.py -index 0e669f98d..45c2e83ab 100644 ---- a/src/lxml/html/tests/test_clean.py -+++ b/src/lxml/html/tests/test_clean.py -@@ -123,6 +123,21 @@ def test_sneaky_js_in_math_style(self): - b'', - lxml.html.tostring(clean_html(s))) - -+ def test_formaction_attribute_in_button_input(self): -+ # The formaction attribute overrides the form's action and should be -+ # treated as a malicious link attribute -+ html = ('
' -+ '') -+ expected = ('
' -+ '
') -+ cleaner = Cleaner( -+ forms=False, -+ safe_attrs_only=False, -+ ) -+ self.assertEqual( -+ expected, -+ cleaner.clean_html(html)) -+ - - def test_suite(): - suite = unittest.TestSuite() diff --git a/lxml-4.6.2.tar.gz b/lxml-4.9.4.tar.gz similarity index 30% rename from lxml-4.6.2.tar.gz rename to lxml-4.9.4.tar.gz index 75bd7eefdbfc5fa2a97665c20f657d2e70b464dd..bf045929a9f3435068d4da19bee336cd8aa56239 100644 Binary files a/lxml-4.6.2.tar.gz and b/lxml-4.9.4.tar.gz differ diff --git a/python-lxml.spec b/python-lxml.spec index 47576f7822b3d7e48cce959e73f6c9ea6a8833e7..b2323d479b05cb4ce8f7ce3402b156d1e80c98f0 100644 --- a/python-lxml.spec +++ b/python-lxml.spec @@ -1,57 +1,87 @@ -%global modname lxml +%global _empty_manifest_terminate_build 0 %global _description \ The lxml XML toolkit is a Pythonic binding for the C libraries libxml2 and libxslt. \ It is unique in that it combines the speed and XML feature completeness of these libraries with \ the simplicity of a native Python API, mostly compatible but superior to the well-known ElementTree API. \ The latest release works with all CPython versions from 2.7 to 3.7. 
-Name: python-%{modname} -Version: 4.6.2 -Release: 2 -Summary: XML processing library combining libxml2/libxslt with the ElementTree API -License: BSD -URL: https://files.pythonhosted.org -Source0: https://files.pythonhosted.org/packages/db/f7/43fecb94d66959c1e23aa53d6161231dca0e93ec500224cf31b3c4073e37/lxml-4.6.2.tar.gz - -Patch6000: backport-CVE-2021-28957.patch - -BuildRequires: gcc libxml2-devel libxslt-devel - -%description %{_description} - - -%package -n python3-%{modname} -Summary: %{summary} -BuildRequires: python3-devel python3-setuptools python3-Cython -%{?python_provide:%python_provide python3-%{modname}} - -%description -n python3-%{modname} %{_description} - -%package_help +Name: python-lxml +Version: 4.9.4 +Release: 1 +Summary: XML processing library combining libxml2/libxslt with the ElementTree API +License: BSD +URL: https://github.com/lxml/lxml +Source0: https://files.pythonhosted.org/packages/84/14/c2070b5e37c650198de8328467dd3d1681e80986f81ba0fea04fc4ec9883/lxml-4.9.4.tar.gz + +Patch0: 0001-Adapt-to-libxml2-2.10.4-and-later.patch +Patch1: 0002-Fix-test_elementtree-with-Expat-2.6.0.patch + +%description +%{_description} + +%package -n python3-lxml +Summary: XML processing library combining libxml2/libxslt with the ElementTree API +Provides: python-lxml = %{version}-%{release} +BuildRequires: gcc +BuildRequires: libxml2-devel +BuildRequires: libxslt-devel +BuildRequires: python3-devel +BuildRequires: python3-setuptools +BuildRequires: python3-Cython +%description -n python3-lxml +%{_description} + +%package help +Summary: Development documents and examples for lxml +Provides: python3-lxml-doc +%description help +%{_description} %prep -%autosetup -n %{modname}-%{version} -p1 +%autosetup -n lxml-%{version} -p1 +find -type f -name '*.c' -print -delete %build -export WITH_CYTHON=true %py3_build %install %py3_install +pushd %{buildroot} +if [ -d usr/lib ]; then + find usr/lib -type f -printf "/%h/%f\n" >> filelist.lst +fi +if [ -d usr/lib64 ]; then + 
find usr/lib64 -type f -printf "/%h/%f\n" >> filelist.lst +fi +if [ -d usr/bin ]; then + find usr/bin -type f -printf "/%h/%f\n" >> filelist.lst +fi +if [ -d usr/sbin ]; then + find usr/sbin -type f -printf "/%h/%f\n" >> filelist.lst +fi +touch doclist.lst +if [ -d usr/share/man ]; then + find usr/share/man -type f -printf "/%h/%f.gz\n" >> doclist.lst +fi +popd +mv %{buildroot}/filelist.lst . +mv %{buildroot}/doclist.lst . %check -%{__python3} setup.py test +cp -a build/lib.%{python3_platform}-*/* src/ +python3 test.py +%files -n python3-lxml -f filelist.lst +%license doc/licenses/*.txt LICENSES.txt +%dir %{python3_sitearch}/* -%files -n python3-%{modname} -%license doc/licenses/ZopePublicLicense.txt LICENSES.txt -%{python3_sitearch}/%{modname}/ -%{python3_sitearch}/*.egg-info/ - -%files help +%files help -f doclist.lst %doc README.rst src/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/readme.txt -%changelog +%changelog +* Tue Nov 26 2024 tzing_t - 4.9.4-1 +- Upgrade to version 4.9.4 + * Wed Apr 14 2021 shixuantong - 4.6.2-2 - fix CVE-2021-28957