From 891c7fac3c845ed0c1675c5ae4ea5b68c517d9b8 Mon Sep 17 00:00:00 2001 From: zhuofeng Date: Fri, 14 Jun 2024 14:20:08 +0800 Subject: [PATCH] fix CVE-2024-37388 (cherry picked from commit e861ad4dc5e50f1a111f3becbadc6278622dca2b) --- Fix-test_elementtree-with-Expat-2.6.0.patch | 78 ++++ backport-CVE-2024-37388.patch | 372 ++++++++++++++++++ ...ix-handling-in-ElementPath-to-let-el.patch | 220 +++++++++++ python-lxml.spec | 11 +- 4 files changed, 680 insertions(+), 1 deletion(-) create mode 100644 Fix-test_elementtree-with-Expat-2.6.0.patch create mode 100644 backport-CVE-2024-37388.patch create mode 100644 backport-Change-HTML-prefix-handling-in-ElementPath-to-let-el.patch diff --git a/Fix-test_elementtree-with-Expat-2.6.0.patch b/Fix-test_elementtree-with-Expat-2.6.0.patch new file mode 100644 index 0000000..e7621f4 --- /dev/null +++ b/Fix-test_elementtree-with-Expat-2.6.0.patch @@ -0,0 +1,78 @@ +From e3012a702dea2b03830fe00a5e8f7a429bbc3f42 Mon Sep 17 00:00:00 2001 +From: Serhiy Storchaka +Date: Mon, 22 Apr 2024 16:52:26 +0800 +Subject: [PATCH] Fix test_elementtree with Expat 2.6.0 + +--- + src/lxml/tests/test_elementtree.py | 48 ++++++++++++++++-------------- + 1 file changed, 25 insertions(+), 23 deletions(-) + +diff --git a/src/lxml/tests/test_elementtree.py b/src/lxml/tests/test_elementtree.py +index 96426cb..d9cd47e 100644 +--- a/src/lxml/tests/test_elementtree.py ++++ b/src/lxml/tests/test_elementtree.py +@@ -14,6 +14,7 @@ import copy + import io + import operator + import os ++import pyexpat + import re + import sys + import textwrap +@@ -4396,29 +4397,30 @@ class _XMLPullParserTest(unittest.TestCase): + self.assertEqual([(action, elem.tag) for action, elem in events], + expected) + +- def test_simple_xml(self): +- for chunk_size in (None, 1, 5): +- #with self.subTest(chunk_size=chunk_size): +- parser = self.etree.XMLPullParser() +- self.assert_event_tags(parser, []) +- self._feed(parser, "\n", chunk_size) +- self.assert_event_tags(parser, []) +- self._feed(parser, +- "\n text\n", chunk_size) +- self.assert_event_tags(parser, [('end', 'element')]) +- self._feed(parser, "texttail\n", chunk_size) +- self._feed(parser, "\n", chunk_size) +- self.assert_event_tags(parser, [ +- ('end', 'element'), +- ('end', 'empty-element'), +- ]) +- self._feed(parser, "\n", chunk_size) +- self.assert_event_tags(parser, [('end', 'root')]) +- root = self._close_and_return_root(parser) +- self.assertEqual(root.tag, 'root') ++ def test_simple_xml(self, chunk_size=None): ++ parser = self.etree.XMLPullParser() ++ self.assert_event_tags(parser, []) ++ self._feed(parser, "\n", chunk_size) ++ self.assert_event_tags(parser, []) ++ self._feed(parser, ++ "\n text\n", chunk_size) ++ self.assert_event_tags(parser, [('end', 'element')]) ++ self._feed(parser, "texttail\n", chunk_size) ++ self._feed(parser, "\n", chunk_size) ++ self.assert_event_tags(parser, [ ++ ('end', 'element'), ++ ('end', 'empty-element'), ++ ]) ++ self._feed(parser, "\n", chunk_size) ++ self.assert_event_tags(parser, [('end', 'root')]) ++ root = self._close_and_return_root(parser) ++ self.assertEqual(root.tag, 'root') ++ ++ def test_simple_xml_chunk_22(self): ++ self.test_simple_xml(chunk_size=22) + + def test_feed_while_iterating(self): + parser = self.etree.XMLPullParser() +-- +2.33.0 + diff --git a/backport-CVE-2024-37388.patch b/backport-CVE-2024-37388.patch new file mode 100644 index 0000000..7d5b5db --- /dev/null +++ b/backport-CVE-2024-37388.patch @@ -0,0 +1,372 @@ +From b38cebf2f846e92bd63de4488fd3d1c8b568f397 Mon Sep 17 00:00:00 2001 +From: scoder +Date: Fri, 29 Dec 2023 14:21:23 +0100 +Subject: [PATCH] Disable external entity resolution (XXE) by default (GH-391) + +This prevents security risks that would allow loading arbitrary external files. + +Closes https://bugs.launchpad.net/lxml/+bug/1742885 +Supersedes https://github.com/lxml/lxml/pull/130 +--- + doc/FAQ.txt | 12 +++-- + src/lxml/includes/xmlparser.pxd | 18 +++++++- + src/lxml/parser.pxi | 70 ++++++++++++++++++++++++++-- + src/lxml/tests/test_etree.py | 81 +++++++++++++++++++++++++++++++++ + 4 files changed, 170 insertions(+), 11 deletions(-) + +diff --git a/doc/FAQ.txt b/doc/FAQ.txt +index 48f69a6..7f3a524 100644 +--- a/doc/FAQ.txt ++++ b/doc/FAQ.txt +@@ -1107,9 +1107,9 @@ useless for the data commonly sent through web services and + can simply be disabled, which rules out several types of + denial of service attacks at once. This also involves an attack + that reads local files from the server, as XML entities can be +-defined to expand into their content. Consequently, version +-1.2 of the SOAP standard explicitly disallows entity references +-in the XML stream. ++defined to expand into the content of external resources. ++Consequently, version 1.2 of the SOAP standard explicitly ++disallows entity references in the XML stream. + + To disable entity expansion, use an XML parser that is configured + with the option ``resolve_entities=False``. Then, after (or +@@ -1117,7 +1117,11 @@ while) parsing the document, use ``root.iter(etree.Entity)`` to + recursively search for entity references. If it contains any, + reject the entire input document with a suitable error response. + In lxml 3.x, you can also use the new DTD introspection API to +-apply your own restrictions on input documents. ++apply your own restrictions on input documents. Since version 5.x, ++lxml disables the expansion of external entities (XXE) by default. ++If you really want to allow loading external files into XML documents ++using this functionality, you have to explicitly set ++``resolve_entities=True``. + + Another attack to consider is compression bombs. If you allow + compressed input into your web service, attackers can try to send +diff --git a/src/lxml/includes/xmlparser.pxd b/src/lxml/includes/xmlparser.pxd +index 45acfc8..3945495 100644 +--- a/src/lxml/includes/xmlparser.pxd ++++ b/src/lxml/includes/xmlparser.pxd +@@ -1,9 +1,9 @@ + from libc.string cimport const_char + + from lxml.includes.tree cimport ( +- xmlDoc, xmlNode, xmlDict, xmlDtd, xmlChar, const_xmlChar) ++ xmlDoc, xmlNode, xmlEntity, xmlDict, xmlDtd, xmlChar, const_xmlChar) + from lxml.includes.tree cimport xmlInputReadCallback, xmlInputCloseCallback +-from lxml.includes.xmlerror cimport xmlError, xmlStructuredErrorFunc ++from lxml.includes.xmlerror cimport xmlError, xmlStructuredErrorFunc, xmlErrorLevel + + + cdef extern from "libxml/parser.h": +@@ -47,11 +47,14 @@ cdef extern from "libxml/parser.h": + + ctypedef void (*referenceSAXFunc)(void * ctx, const_xmlChar* name) + ++ ctypedef xmlEntity* (*getEntitySAXFunc)(void* ctx, const_xmlChar* name) ++ + cdef int XML_SAX2_MAGIC + + cdef extern from "libxml/tree.h": + ctypedef struct xmlParserInput: + int line ++ int col + int length + const_xmlChar* base + const_xmlChar* cur +@@ -76,6 +79,7 @@ cdef extern from "libxml/tree.h": + charactersSAXFunc characters + cdataBlockSAXFunc cdataBlock + referenceSAXFunc reference ++ getEntitySAXFunc getEntity + commentSAXFunc comment + processingInstructionSAXFunc processingInstruction + startDocumentSAXFunc startDocument +@@ -150,6 +154,8 @@ cdef extern from "libxml/parser.h": + int inSubset + int charset + xmlParserInput* input ++ int inputNr ++ xmlParserInput** inputTab + + ctypedef enum xmlParserOption: + XML_PARSE_RECOVER = 1 # recover on errors +@@ -212,6 +218,12 @@ cdef extern from "libxml/parser.h": + char* filename, const_char* encoding, + int options) nogil + ++ cdef void xmlErrParser(xmlParserCtxt* ctxt, xmlNode* node, ++ int domain, int code, xmlErrorLevel level, ++ const xmlChar *str1, const xmlChar *str2, const xmlChar *str3, ++ int int1, const char *msg, ...) ++ ++ + # iterparse: + + cdef xmlParserCtxt* xmlCreatePushParserCtxt(xmlSAXHandler* sax, +@@ -233,6 +245,8 @@ cdef extern from "libxml/parser.h": + cdef xmlExternalEntityLoader xmlGetExternalEntityLoader() nogil + cdef void xmlSetExternalEntityLoader(xmlExternalEntityLoader f) nogil + ++ cdef xmlEntity* xmlSAX2GetEntity(void* ctxt, const_xmlChar* name) nogil ++ + # DTDs: + + cdef xmlDtd* xmlParseDTD(const_xmlChar* ExternalID, const_xmlChar* SystemID) nogil +diff --git a/src/lxml/parser.pxi b/src/lxml/parser.pxi +index 3187a38..2f0ce80 100644 +--- a/src/lxml/parser.pxi ++++ b/src/lxml/parser.pxi +@@ -794,6 +794,7 @@ cdef inline int _fixHtmlDictNodeNames(tree.xmlDict* c_dict, + c_attr = c_attr.next + return 0 + ++ + @cython.internal + cdef class _BaseParser: + cdef ElementClassLookup _class_lookup +@@ -806,6 +807,7 @@ cdef class _BaseParser: + cdef bint _remove_pis + cdef bint _strip_cdata + cdef bint _collect_ids ++ cdef bint _resolve_external_entities + cdef XMLSchema _schema + cdef bytes _filename + cdef readonly object target +@@ -814,7 +816,7 @@ cdef class _BaseParser: + + def __init__(self, int parse_options, bint for_html, XMLSchema schema, + remove_comments, remove_pis, strip_cdata, collect_ids, +- target, encoding): ++ target, encoding, bint resolve_external_entities=True): + cdef tree.xmlCharEncodingHandler* enchandler + cdef int c_encoding + if not isinstance(self, (XMLParser, HTMLParser)): +@@ -827,6 +829,7 @@ cdef class _BaseParser: + self._remove_pis = remove_pis + self._strip_cdata = strip_cdata + self._collect_ids = collect_ids ++ self._resolve_external_entities = resolve_external_entities + self._schema = schema + + self._resolvers = _ResolverRegistry() +@@ -906,6 +909,8 @@ cdef class _BaseParser: + if self._strip_cdata: + # hard switch-off for CDATA nodes => makes them plain text + pctxt.sax.cdataBlock = NULL ++ if not self._resolve_external_entities: ++ pctxt.sax.getEntity = _getInternalEntityOnly + + cdef int _registerHtmlErrorHandler(self, xmlparser.xmlParserCtxt* c_ctxt) except -1: + cdef xmlparser.xmlSAXHandler* sax = c_ctxt.sax +@@ -1206,6 +1211,56 @@ cdef class _BaseParser: + finally: + context.cleanup() + ++cdef tree.xmlEntity* _getInternalEntityOnly(void* ctxt, const_xmlChar* name): ++ """ ++ Callback function to intercept the entity resolution when external entity loading is disabled. ++ """ ++ cdef tree.xmlEntity* entity = xmlparser.xmlSAX2GetEntity(ctxt, name) ++ if not entity: ++ return NULL ++ if entity.etype not in ( ++ tree.xmlEntityType.XML_EXTERNAL_GENERAL_PARSED_ENTITY, ++ tree.xmlEntityType.XML_EXTERNAL_GENERAL_UNPARSED_ENTITY, ++ tree.xmlEntityType.XML_EXTERNAL_PARAMETER_ENTITY): ++ return entity ++ ++ # Reject all external entities and fail the parsing instead. There is currently ++ # no way in libxml2 to just prevent the entity resolution in this case. ++ cdef xmlerror.xmlError c_error ++ cdef xmlerror.xmlStructuredErrorFunc err_func ++ cdef xmlparser.xmlParserInput* parser_input ++ cdef void* err_context ++ ++ c_ctxt = ctxt ++ err_func = xmlerror.xmlStructuredError ++ if err_func: ++ parser_input = c_ctxt.input ++ # Copied from xmlVErrParser() in libxml2: get current input from stack. ++ if parser_input and parser_input.filename is NULL and c_ctxt.inputNr > 1: ++ parser_input = c_ctxt.inputTab[c_ctxt.inputNr - 2] ++ ++ c_error = xmlerror.xmlError( ++ domain=xmlerror.xmlErrorDomain.XML_FROM_PARSER, ++ code=xmlerror.xmlParserErrors.XML_ERR_EXT_ENTITY_STANDALONE, ++ level=xmlerror.xmlErrorLevel.XML_ERR_FATAL, ++ message=b"External entity resolution is disabled for security reasons " ++ b"when resolving '&%s;'. Use 'XMLParser(resolve_entities=True)' " ++ b"if you consider it safe to enable it.", ++ file=parser_input.filename, ++ node=entity, ++ str1= name, ++ str2=NULL, ++ str3=NULL, ++ line=parser_input.line if parser_input else 0, ++ int1=0, ++ int2=parser_input.col if parser_input else 0, ++ ) ++ err_context = xmlerror.xmlStructuredErrorContext ++ err_func(err_context, &c_error) ++ ++ c_ctxt.wellFormed = 0 ++ # The entity was looked up and does not need to be freed. ++ return NULL + + cdef void _initSaxDocument(void* ctxt) with gil: + xmlparser.xmlSAX2StartDocument(ctxt) +@@ -1508,12 +1563,14 @@ cdef class XMLParser(_FeedParser): + - strip_cdata - replace CDATA sections by normal text content (default: True) + - compact - save memory for short text content (default: True) + - collect_ids - use a hash table of XML IDs for fast access (default: True, always True with DTD validation) +- - resolve_entities - replace entities by their text value (default: True) + - huge_tree - disable security restrictions and support very deep trees + and very long text content (only affects libxml2 2.7+) + + Other keyword arguments: +- ++ - resolve_entities - replace entities by their text value: False for keeping the ++ entity references, True for resolving them, and 'internal' for resolving ++ internal definitions only (no external file/URL access). ++ The default used to be True and was changed to 'internal' in lxml 5.0. + - encoding - override the document encoding + - target - a parser target object that will receive the parse events + - schema - an XMLSchema to validate against +@@ -1525,10 +1582,11 @@ cdef class XMLParser(_FeedParser): + def __init__(self, *, encoding=None, attribute_defaults=False, + dtd_validation=False, load_dtd=False, no_network=True, + ns_clean=False, recover=False, XMLSchema schema=None, +- huge_tree=False, remove_blank_text=False, resolve_entities=True, ++ huge_tree=False, remove_blank_text=False, resolve_entities='internal', + remove_comments=False, remove_pis=False, strip_cdata=True, + collect_ids=True, target=None, compact=True): + cdef int parse_options ++ cdef bint resolve_external = True + parse_options = _XML_DEFAULT_PARSE_OPTIONS + if load_dtd: + parse_options = parse_options | xmlparser.XML_PARSE_DTDLOAD +@@ -1553,12 +1611,14 @@ cdef class XMLParser(_FeedParser): + parse_options = parse_options ^ xmlparser.XML_PARSE_COMPACT + if not resolve_entities: + parse_options = parse_options ^ xmlparser.XML_PARSE_NOENT ++ elif resolve_entities == 'internal': ++ resolve_external = False + if not strip_cdata: + parse_options = parse_options ^ xmlparser.XML_PARSE_NOCDATA + + _BaseParser.__init__(self, parse_options, 0, schema, + remove_comments, remove_pis, strip_cdata, +- collect_ids, target, encoding) ++ collect_ids, target, encoding, resolve_external) + + + cdef class XMLPullParser(XMLParser): +diff --git a/src/lxml/tests/test_etree.py b/src/lxml/tests/test_etree.py +index 14b21f7..bc7548f 100644 +--- a/src/lxml/tests/test_etree.py ++++ b/src/lxml/tests/test_etree.py +@@ -12,11 +12,14 @@ from __future__ import absolute_import + from collections import OrderedDict + import os.path + import unittest ++import contextlib + import copy + import sys + import re + import gc + import operator ++import shutil ++import tempfile + import textwrap + import zlib + import gzip +@@ -1675,6 +1678,84 @@ class ETreeOnlyTestCase(HelperTestCase): + self.assertEqual(_bytes('&myentity;'), + tostring(root)) + ++ @contextlib.contextmanager ++ def _xml_test_file(self, name, content=b'XML'): ++ temp_dir = tempfile.mkdtemp() ++ try: ++ xml_file = os.path.join(temp_dir, name) ++ with open(xml_file, 'wb') as tmpfile: ++ tmpfile.write(content) ++ yield xml_file ++ finally: ++ shutil.rmtree(temp_dir) ++ ++ def test_entity_parse_external(self): ++ fromstring = self.etree.fromstring ++ tostring = self.etree.tostring ++ parser = self.etree.XMLParser(resolve_entities=True) ++ ++ with self._xml_test_file("entity.xml") as entity_file: ++ xml = ''' ++ ++ ]> ++ &my_external_entity; ++ ''' % path2url(entity_file) ++ root = fromstring(xml, parser) ++ ++ self.assertEqual(_bytes('XML'), ++ tostring(root)) ++ self.assertEqual(root.tag, 'doc') ++ self.assertEqual(root[0].tag, 'evil') ++ self.assertEqual(root[0].text, 'XML') ++ self.assertEqual(root[0].tail, None) ++ ++ def test_entity_parse_external_no_resolve(self): ++ fromstring = self.etree.fromstring ++ parser = self.etree.XMLParser(resolve_entities=False) ++ Entity = self.etree.Entity ++ ++ with self._xml_test_file("entity.xml") as entity_file: ++ xml = ''' ++ ++ ]> ++ &my_external_entity; ++ ''' % path2url(entity_file) ++ root = fromstring(xml, parser) ++ ++ self.assertEqual(root[0].tag, Entity) ++ self.assertEqual(root[0].text, "&my_external_entity;") ++ ++ def test_entity_parse_no_external_default(self): ++ fromstring = self.etree.fromstring ++ ++ with self._xml_test_file("entity.xml") as entity_file: ++ xml = ''' ++ ++ ]> ++ &my_failing_external_entity; ++ ''' % path2url(entity_file) ++ ++ try: ++ fromstring(xml) ++ except self.etree.XMLSyntaxError as exc: ++ exception = exc ++ else: ++ self.assertTrue(False, "XMLSyntaxError was not raised") ++ ++ self.assertIn("my_failing_external_entity", str(exception)) ++ self.assertTrue(exception.error_log) ++ # Depending on the libxml2 version, we get different errors here, ++ # not necessarily the one that lxml produced. But it should fail either way. ++ for error in exception.error_log: ++ if "my_failing_external_entity" in error.message: ++ self.assertEqual(5, error.line) ++ break ++ else: ++ self.assertFalse("entity error not found in parser error log") ++ + def test_entity_restructure(self): + xml = _bytes(''' ]> + +-- +2.33.0 + diff --git a/backport-Change-HTML-prefix-handling-in-ElementPath-to-let-el.patch b/backport-Change-HTML-prefix-handling-in-ElementPath-to-let-el.patch new file mode 100644 index 0000000..6e1e75c --- /dev/null +++ b/backport-Change-HTML-prefix-handling-in-ElementPath-to-let-el.patch @@ -0,0 +1,220 @@ +From 72f5a287a4016ecb405f2e8a4a03ae22a5b0b496 Mon Sep 17 00:00:00 2001 +From: Stefan Behnel +Date: Wed, 5 Jul 2023 22:10:45 +0200 +Subject: [PATCH] Change HTML "prefix" handling in ElementPath to let + "element.find('part1:part2')" search for "part1:part2" instead of just + "part2" with an unknown prefix. Also adapt the HTML "prefix" parsing test to + make it work in libxml2 2.10.4 and later, where HTML "prefixes" are kept as + part of the tag name by the parser. + +--- + src/lxml/_elementpath.py | 22 +++++++++++----------- + src/lxml/apihelpers.pxi | 7 +++++++ + src/lxml/etree.pyx | 8 ++++---- + src/lxml/includes/tree.pxd | 12 ++++++++++++ + src/lxml/tests/test_etree.py | 20 ++++++++++++++++---- + 5 files changed, 50 insertions(+), 19 deletions(-) + +diff --git a/src/lxml/_elementpath.py b/src/lxml/_elementpath.py +index eabd81c..001b345 100644 +--- a/src/lxml/_elementpath.py ++++ b/src/lxml/_elementpath.py +@@ -71,14 +71,14 @@ xpath_tokenizer_re = re.compile( + r"\s+" + ) + +-def xpath_tokenizer(pattern, namespaces=None): ++def xpath_tokenizer(pattern, namespaces=None, with_prefixes=True): + # ElementTree uses '', lxml used None originally. + default_namespace = (namespaces.get(None) or namespaces.get('')) if namespaces else None + parsing_attribute = False + for token in xpath_tokenizer_re.findall(pattern): + ttype, tag = token + if tag and tag[0] != "{": +- if ":" in tag: ++ if ":" in tag and with_prefixes: + prefix, uri = tag.split(":", 1) + try: + if not namespaces: +@@ -251,7 +251,7 @@ ops = { + _cache = {} + + +-def _build_path_iterator(path, namespaces): ++def _build_path_iterator(path, namespaces, with_prefixes=True): + """compile selector pattern""" + if path[-1:] == "/": + path += "*" # implicit all (FIXME: keep this?) +@@ -279,7 +279,7 @@ def _build_path_iterator(path, namespaces): + + if path[:1] == "/": + raise SyntaxError("cannot use absolute path on element") +- stream = iter(xpath_tokenizer(path, namespaces)) ++ stream = iter(xpath_tokenizer(path, namespaces, with_prefixes=with_prefixes)) + try: + _next = stream.next + except AttributeError: +@@ -308,8 +308,8 @@ def _build_path_iterator(path, namespaces): + ## + # Iterate over the matching nodes + +-def iterfind(elem, path, namespaces=None): +- selector = _build_path_iterator(path, namespaces) ++def iterfind(elem, path, namespaces=None, with_prefixes=True): ++ selector = _build_path_iterator(path, namespaces, with_prefixes=with_prefixes) + result = iter((elem,)) + for select in selector: + result = select(result) +@@ -319,8 +319,8 @@ def iterfind(elem, path, namespaces=None): + ## + # Find first matching object. + +-def find(elem, path, namespaces=None): +- it = iterfind(elem, path, namespaces) ++def find(elem, path, namespaces=None, with_prefixes=True): ++ it = iterfind(elem, path, namespaces, with_prefixes=with_prefixes) + try: + return next(it) + except StopIteration: +@@ -330,15 +330,15 @@ def find(elem, path, namespaces=None): + ## + # Find all matching objects. + +-def findall(elem, path, namespaces=None): ++def findall(elem, path, namespaces=None, with_prefixes=True): + return list(iterfind(elem, path, namespaces)) + + + ## + # Find text for first matching object. + +-def findtext(elem, path, default=None, namespaces=None): +- el = find(elem, path, namespaces) ++def findtext(elem, path, default=None, namespaces=None, with_prefixes=True): ++ el = find(elem, path, namespaces, with_prefixes=with_prefixes) + if el is None: + return default + else: +diff --git a/src/lxml/apihelpers.pxi b/src/lxml/apihelpers.pxi +index 88a031d..effd116 100644 +--- a/src/lxml/apihelpers.pxi ++++ b/src/lxml/apihelpers.pxi +@@ -15,6 +15,13 @@ cdef void displayNode(xmlNode* c_node, indent): + finally: + return # swallow any exceptions + ++cdef inline bint _isHtmlDocument(_Element element) except -1: ++ cdef xmlNode* c_node = element._c_node ++ return ( ++ c_node is not NULL and c_node.doc is not NULL and ++ c_node.doc.properties & tree.XML_DOC_HTML != 0 ++ ) ++ + cdef inline int _assertValidNode(_Element element) except -1: + assert element._c_node is not NULL, u"invalid Element proxy at %s" % id(element) + +diff --git a/src/lxml/etree.pyx b/src/lxml/etree.pyx +index 689c330..90753fc 100644 +--- a/src/lxml/etree.pyx ++++ b/src/lxml/etree.pyx +@@ -1544,7 +1544,7 @@ cdef public class _Element [ type LxmlElementType, object LxmlElement ]: + """ + if isinstance(path, QName): + path = (path).text +- return _elementpath.find(self, path, namespaces) ++ return _elementpath.find(self, path, namespaces, with_prefixes=not _isHtmlDocument(self)) + + def findtext(self, path, default=None, namespaces=None): + u"""findtext(self, path, default=None, namespaces=None) +@@ -1557,7 +1557,7 @@ cdef public class _Element [ type LxmlElementType, object LxmlElement ]: + """ + if isinstance(path, QName): + path = (path).text +- return _elementpath.findtext(self, path, default, namespaces) ++ return _elementpath.findtext(self, path, default, namespaces, with_prefixes=not _isHtmlDocument(self)) + + def findall(self, path, namespaces=None): + u"""findall(self, path, namespaces=None) +@@ -1570,7 +1570,7 @@ cdef public class _Element [ type LxmlElementType, object LxmlElement ]: + """ + if isinstance(path, QName): + path = (path).text +- return _elementpath.findall(self, path, namespaces) ++ return _elementpath.findall(self, path, namespaces, with_prefixes=not _isHtmlDocument(self)) + + def iterfind(self, path, namespaces=None): + u"""iterfind(self, path, namespaces=None) +@@ -1583,7 +1583,7 @@ cdef public class _Element [ type LxmlElementType, object LxmlElement ]: + """ + if isinstance(path, QName): + path = (path).text +- return _elementpath.iterfind(self, path, namespaces) ++ return _elementpath.iterfind(self, path, namespaces, with_prefixes=not _isHtmlDocument(self)) + + def xpath(self, _path, *, namespaces=None, extensions=None, + smart_strings=True, **_variables): +diff --git a/src/lxml/includes/tree.pxd b/src/lxml/includes/tree.pxd +index 010af80..d709313 100644 +--- a/src/lxml/includes/tree.pxd ++++ b/src/lxml/includes/tree.pxd +@@ -154,6 +154,17 @@ cdef extern from "libxml/tree.h": + XML_EXTERNAL_PARAMETER_ENTITY= 5 + XML_INTERNAL_PREDEFINED_ENTITY= 6 + ++ ctypedef enum xmlDocProperties: ++ XML_DOC_WELLFORMED = 1 # /* document is XML well formed */ ++ XML_DOC_NSVALID = 2 # /* document is Namespace valid */ ++ XML_DOC_OLD10 = 4 # /* parsed with old XML-1.0 parser */ ++ XML_DOC_DTDVALID = 8 # /* DTD validation was successful */ ++ XML_DOC_XINCLUDE = 16 # /* XInclude substitution was done */ ++ XML_DOC_USERBUILT = 32 # /* Document was built using the API ++ # and not by parsing an instance */ ++ XML_DOC_INTERNAL = 64 # /* built for internal processing */ ++ XML_DOC_HTML = 128 # /* parsed or built HTML document */ ++ + ctypedef struct xmlNs: + const_xmlChar* href + const_xmlChar* prefix +@@ -274,6 +285,7 @@ cdef extern from "libxml/tree.h": + void* _private + xmlDtd* intSubset + xmlDtd* extSubset ++ int properties + + ctypedef struct xmlAttr: + void* _private +diff --git a/src/lxml/tests/test_etree.py b/src/lxml/tests/test_etree.py +index bde496d..e969f3a 100644 +--- a/src/lxml/tests/test_etree.py ++++ b/src/lxml/tests/test_etree.py +@@ -3137,11 +3137,23 @@ class ETreeOnlyTestCase(HelperTestCase): + + def test_html_prefix_nsmap(self): + etree = self.etree +- el = etree.HTML('aa').find('.//page-description') +- if etree.LIBXML_VERSION < (2, 9, 11): +- self.assertEqual({'hha': None}, el.nsmap) ++ el = etree.HTML('aa') ++ pd = el[-1] ++ while len(pd): ++ pd = pd[-1] ++ ++ if etree.LIBXML_VERSION >= (2, 9, 11): ++ # "Prefix" is kept as part of the tag name. ++ self.assertEqual("hha:page-description", pd.tag) ++ self.assertIsNone(el.find('.//page-description')) ++ self.assertIsNotNone(el.find('.//hha:page-description')) # no namespaces! ++ for e in el.iter(): ++ self.assertEqual({}, e.nsmap) + else: +- self.assertEqual({}, el.nsmap) ++ # "Prefix" is parsed as XML prefix. ++ self.assertEqual("page-description", pd.tag) ++ pd = el.find('.//page-description') ++ self.assertEqual({'hha': None}, pd.nsmap) + + def test_getchildren(self): + Element = self.etree.Element +-- +2.33.0 + diff --git a/python-lxml.spec b/python-lxml.spec index b64153c..a13fdd6 100644 --- a/python-lxml.spec +++ b/python-lxml.spec @@ -7,7 +7,7 @@ The latest release works with all CPython versions from 2.7 to 3.7. Name: python-%{modname} Version: 4.7.1 -Release: 5 +Release: 6 Summary: XML processing library combining libxml2/libxslt with the ElementTree API License: BSD URL: https://files.pythonhosted.org @@ -15,6 +15,9 @@ Source0: https://files.pythonhosted.org/packages/source/l/lxml/lxml-%{ver Patch6000: backport-CVE-2022-2309.patch Patch6001: backport-Work-around-libxml2-bug-in-affected-versions.patch +Patch6002: Fix-test_elementtree-with-Expat-2.6.0.patch +Patch6003: backport-CVE-2024-37388.patch +Patch6004: backport-Change-HTML-prefix-handling-in-ElementPath-to-let-el.patch BuildRequires: gcc libxml2-devel libxslt-devel @@ -55,6 +58,12 @@ make test3 %doc README.rst src/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/readme.txt %changelog +* Wed Jun 12 2024 zhuofeng - 4.7.1-6 +- Type:CVE +- CVE:CVE-2024-37388 +- SUG:NA +- DESC:fix CVE-2024-37388 + * Wed Nov 16 2022 zhuofeng - 4.7.1-5 - change the Source0 -- Gitee