From 891c7fac3c845ed0c1675c5ae4ea5b68c517d9b8 Mon Sep 17 00:00:00 2001
From: zhuofeng <zhuofeng2@huawei.com>
Date: Fri, 14 Jun 2024 14:20:08 +0800
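Subject: [PATCH] fix CVE-2024-37388

Backport the upstream change that disables external entity resolution
(XXE) by default. Also add the test_elementtree fix for Expat 2.6.0 and
the ElementPath HTML "prefix" handling change that adapts the test suite
to libxml2 2.10.4 and later.

(cherry picked from commit e861ad4dc5e50f1a111f3becbadc6278622dca2b)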
---
 Fix-test_elementtree-with-Expat-2.6.0.patch   |  78 ++++
 backport-CVE-2024-37388.patch                 | 372 ++++++++++++++++++
 ...ix-handling-in-ElementPath-to-let-el.patch | 220 +++++++++++
 python-lxml.spec                              |  11 +-
 4 files changed, 680 insertions(+), 1 deletion(-)
 create mode 100644 Fix-test_elementtree-with-Expat-2.6.0.patch
 create mode 100644 backport-CVE-2024-37388.patch
 create mode 100644 backport-Change-HTML-prefix-handling-in-ElementPath-to-let-el.patch
diff --git a/Fix-test_elementtree-with-Expat-2.6.0.patch b/Fix-test_elementtree-with-Expat-2.6.0.patch
new file mode 100644
index 0000000..e7621f4
--- /dev/null
+++ b/Fix-test_elementtree-with-Expat-2.6.0.patch
@@ -0,0 +1,78 @@
+From e3012a702dea2b03830fe00a5e8f7a429bbc3f42 Mon Sep 17 00:00:00 2001
+From: Serhiy Storchaka <storchaka@gmail.com>
+Date: Mon, 22 Apr 2024 16:52:26 +0800
+Subject: [PATCH] Fix test_elementtree with Expat 2.6.0
+
+---
+ src/lxml/tests/test_elementtree.py | 48 ++++++++++++++++--------------
+ 1 file changed, 25 insertions(+), 23 deletions(-)
+
+diff --git a/src/lxml/tests/test_elementtree.py b/src/lxml/tests/test_elementtree.py
+index 96426cb..d9cd47e 100644
+--- a/src/lxml/tests/test_elementtree.py
++++ b/src/lxml/tests/test_elementtree.py
+@@ -14,6 +14,7 @@ import copy
+ import io
+ import operator
+ import os
++import pyexpat
+ import re
+ import sys
+ import textwrap
+@@ -4396,29 +4397,30 @@ class _XMLPullParserTest(unittest.TestCase):
+         self.assertEqual([(action, elem.tag) for action, elem in events],
+                          expected)
+ 
+-    def test_simple_xml(self):
+-        for chunk_size in (None, 1, 5):
+-            #with self.subTest(chunk_size=chunk_size):
+-                parser = self.etree.XMLPullParser()
+-                self.assert_event_tags(parser, [])
+-                self._feed(parser, "<!-- comment -->\n", chunk_size)
+-                self.assert_event_tags(parser, [])
+-                self._feed(parser,
+-                           "<root>\n  <element key='value'>text</element",
+-                           chunk_size)
+-                self.assert_event_tags(parser, [])
+-                self._feed(parser, ">\n", chunk_size)
+-                self.assert_event_tags(parser, [('end', 'element')])
+-                self._feed(parser, "<element>text</element>tail\n", chunk_size)
+-                self._feed(parser, "<empty-element/>\n", chunk_size)
+-                self.assert_event_tags(parser, [
+-                    ('end', 'element'),
+-                    ('end', 'empty-element'),
+-                    ])
+-                self._feed(parser, "</root>\n", chunk_size)
+-                self.assert_event_tags(parser, [('end', 'root')])
+-                root = self._close_and_return_root(parser)
+-                self.assertEqual(root.tag, 'root')
++    def test_simple_xml(self, chunk_size=None):
++        parser = self.etree.XMLPullParser()
++        self.assert_event_tags(parser, [])
++        self._feed(parser, "<!-- comment -->\n", chunk_size)
++        self.assert_event_tags(parser, [])
++        self._feed(parser,
++                   "<root>\n  <element key='value'>text</element",
++                   chunk_size)
++        self.assert_event_tags(parser, [])
++        self._feed(parser, ">\n", chunk_size)
++        self.assert_event_tags(parser, [('end', 'element')])
++        self._feed(parser, "<element>text</element>tail\n", chunk_size)
++        self._feed(parser, "<empty-element/>\n", chunk_size)
++        self.assert_event_tags(parser, [
++            ('end', 'element'),
++            ('end', 'empty-element'),
++            ])
++        self._feed(parser, "</root>\n", chunk_size)
++        self.assert_event_tags(parser, [('end', 'root')])
++        root = self._close_and_return_root(parser)
++        self.assertEqual(root.tag, 'root')
++
++    def test_simple_xml_chunk_22(self):
++        self.test_simple_xml(chunk_size=22)
+ 
+     def test_feed_while_iterating(self):
+         parser = self.etree.XMLPullParser()
+-- 
+2.33.0
+
diff --git a/backport-CVE-2024-37388.patch b/backport-CVE-2024-37388.patch
new file mode 100644
index 0000000..7d5b5db
--- /dev/null
+++ b/backport-CVE-2024-37388.patch
@@ -0,0 +1,372 @@
+From b38cebf2f846e92bd63de4488fd3d1c8b568f397 Mon Sep 17 00:00:00 2001
+From: scoder <stefan_ml@behnel.de>
+Date: Fri, 29 Dec 2023 14:21:23 +0100
+Subject: [PATCH] Disable external entity resolution (XXE) by default (GH-391)
+
+This prevents security risks that would allow loading arbitrary external files.
+
+Closes https://bugs.launchpad.net/lxml/+bug/1742885
+Supersedes https://github.com/lxml/lxml/pull/130
+---
+ doc/FAQ.txt                     | 12 +++--
+ src/lxml/includes/xmlparser.pxd | 18 +++++++-
+ src/lxml/parser.pxi             | 70 ++++++++++++++++++++++++++--
+ src/lxml/tests/test_etree.py    | 81 +++++++++++++++++++++++++++++++++
+ 4 files changed, 170 insertions(+), 11 deletions(-)
+
+diff --git a/doc/FAQ.txt b/doc/FAQ.txt
+index 48f69a6..7f3a524 100644
+--- a/doc/FAQ.txt
++++ b/doc/FAQ.txt
+@@ -1107,9 +1107,9 @@ useless for the data commonly sent through web services and
+ can simply be disabled, which rules out several types of
+ denial of service attacks at once.  This also involves an attack
+ that reads local files from the server, as XML entities can be
+-defined to expand into their content.   Consequently, version
+-1.2 of the SOAP standard explicitly disallows entity references
+-in the XML stream.
++defined to expand into the content of external resources.
++Consequently, version 1.2 of the SOAP standard explicitly
++disallows entity references in the XML stream.
+ 
+ To disable entity expansion, use an XML parser that is configured
+ with the option ``resolve_entities=False``.  Then, after (or
+@@ -1117,7 +1117,11 @@ while) parsing the document, use ``root.iter(etree.Entity)`` to
+ recursively search for entity references.  If it contains any,
+ reject the entire input document with a suitable error response.
+ In lxml 3.x, you can also use the new DTD introspection API to
+-apply your own restrictions on input documents.
++apply your own restrictions on input documents.  Since version 5.x
++(and in 4.x builds that carry the CVE-2024-37388 backport), lxml disables
++the expansion of external entities (XXE) by default.  If you really want
++to allow loading external files into XML documents using this
++functionality, you have to explicitly set ``resolve_entities=True``.
+ 
+ Another attack to consider is compression bombs.  If you allow
+ compressed input into your web service, attackers can try to send
+diff --git a/src/lxml/includes/xmlparser.pxd b/src/lxml/includes/xmlparser.pxd
+index 45acfc8..3945495 100644
+--- a/src/lxml/includes/xmlparser.pxd
++++ b/src/lxml/includes/xmlparser.pxd
+@@ -1,9 +1,9 @@
+ from libc.string cimport const_char
+ 
+ from lxml.includes.tree cimport (
+-    xmlDoc, xmlNode, xmlDict, xmlDtd, xmlChar, const_xmlChar)
++    xmlDoc, xmlNode, xmlEntity, xmlDict, xmlDtd, xmlChar, const_xmlChar)
+ from lxml.includes.tree cimport xmlInputReadCallback, xmlInputCloseCallback
+-from lxml.includes.xmlerror cimport xmlError, xmlStructuredErrorFunc
++from lxml.includes.xmlerror cimport xmlError, xmlStructuredErrorFunc, xmlErrorLevel
+ 
+ 
+ cdef extern from "libxml/parser.h":
+@@ -47,11 +47,14 @@ cdef extern from "libxml/parser.h":
+ 
+     ctypedef void (*referenceSAXFunc)(void * ctx, const_xmlChar* name)
+ 
++    ctypedef xmlEntity* (*getEntitySAXFunc)(void* ctx, const_xmlChar* name)
++
+     cdef int XML_SAX2_MAGIC
+ 
+ cdef extern from "libxml/tree.h":
+     ctypedef struct xmlParserInput:
+         int line
++        int col
+         int length
+         const_xmlChar* base
+         const_xmlChar* cur
+@@ -76,6 +79,7 @@ cdef extern from "libxml/tree.h":
+         charactersSAXFunc               characters
+         cdataBlockSAXFunc               cdataBlock
+         referenceSAXFunc                reference
++        getEntitySAXFunc                getEntity
+         commentSAXFunc                  comment
+         processingInstructionSAXFunc	processingInstruction
+         startDocumentSAXFunc            startDocument
+@@ -150,6 +154,8 @@ cdef extern from "libxml/parser.h":
+         int inSubset
+         int charset
+         xmlParserInput* input
++        int inputNr
++        xmlParserInput** inputTab
+ 
+     ctypedef enum xmlParserOption:
+         XML_PARSE_RECOVER = 1 # recover on errors
+@@ -212,6 +218,12 @@ cdef extern from "libxml/parser.h":
+                                    char* filename, const_char* encoding,
+                                    int options) nogil
+ 
++    cdef void xmlErrParser(xmlParserCtxt* ctxt, xmlNode* node,
++                           int domain, int code, xmlErrorLevel level,
++                           const xmlChar *str1, const xmlChar *str2, const xmlChar *str3,
++                           int int1, const char *msg, ...)
++
++
+ # iterparse:
+ 
+     cdef xmlParserCtxt* xmlCreatePushParserCtxt(xmlSAXHandler* sax,
+@@ -233,6 +245,8 @@ cdef extern from "libxml/parser.h":
+     cdef xmlExternalEntityLoader xmlGetExternalEntityLoader() nogil
+     cdef void xmlSetExternalEntityLoader(xmlExternalEntityLoader f) nogil
+ 
++    cdef xmlEntity* xmlSAX2GetEntity(void* ctxt, const_xmlChar* name) nogil
++
+ # DTDs:
+ 
+     cdef xmlDtd* xmlParseDTD(const_xmlChar* ExternalID, const_xmlChar* SystemID) nogil
+diff --git a/src/lxml/parser.pxi b/src/lxml/parser.pxi
+index 3187a38..2f0ce80 100644
+--- a/src/lxml/parser.pxi
++++ b/src/lxml/parser.pxi
+@@ -794,6 +794,7 @@ cdef inline int _fixHtmlDictNodeNames(tree.xmlDict* c_dict,
+         c_attr = c_attr.next
+     return 0
+ 
++
+ @cython.internal
+ cdef class _BaseParser:
+     cdef ElementClassLookup _class_lookup
+@@ -806,6 +807,7 @@ cdef class _BaseParser:
+     cdef bint _remove_pis
+     cdef bint _strip_cdata
+     cdef bint _collect_ids
++    cdef bint _resolve_external_entities
+     cdef XMLSchema _schema
+     cdef bytes _filename
+     cdef readonly object target
+@@ -814,7 +816,7 @@ cdef class _BaseParser:
+ 
+     def __init__(self, int parse_options, bint for_html, XMLSchema schema,
+                  remove_comments, remove_pis, strip_cdata, collect_ids,
+-                 target, encoding):
++                 target, encoding, bint resolve_external_entities=True):
+         cdef tree.xmlCharEncodingHandler* enchandler
+         cdef int c_encoding
+         if not isinstance(self, (XMLParser, HTMLParser)):
+@@ -827,6 +829,7 @@ cdef class _BaseParser:
+         self._remove_pis = remove_pis
+         self._strip_cdata = strip_cdata
+         self._collect_ids = collect_ids
++        self._resolve_external_entities = resolve_external_entities
+         self._schema = schema
+ 
+         self._resolvers = _ResolverRegistry()
+@@ -906,6 +909,8 @@ cdef class _BaseParser:
+         if self._strip_cdata:
+             # hard switch-off for CDATA nodes => makes them plain text
+             pctxt.sax.cdataBlock = NULL
++        if not self._resolve_external_entities:
++            pctxt.sax.getEntity = _getInternalEntityOnly
+ 
+     cdef int _registerHtmlErrorHandler(self, xmlparser.xmlParserCtxt* c_ctxt) except -1:
+         cdef xmlparser.xmlSAXHandler* sax = c_ctxt.sax
+@@ -1206,6 +1211,56 @@ cdef class _BaseParser:
+         finally:
+             context.cleanup()
+ 
++cdef tree.xmlEntity* _getInternalEntityOnly(void* ctxt, const_xmlChar* name):
++    """
++    Callback function to intercept the entity resolution when external entity loading is disabled.
++    """
++    cdef tree.xmlEntity* entity = xmlparser.xmlSAX2GetEntity(ctxt, name)
++    if not entity:
++        return NULL
++    if entity.etype not in (
++            tree.xmlEntityType.XML_EXTERNAL_GENERAL_PARSED_ENTITY,
++            tree.xmlEntityType.XML_EXTERNAL_GENERAL_UNPARSED_ENTITY,
++            tree.xmlEntityType.XML_EXTERNAL_PARAMETER_ENTITY):
++        return entity
++
++    # Reject all external entities and fail the parsing instead. There is currently
++    # no way in libxml2 to just prevent the entity resolution in this case.
++    cdef xmlerror.xmlError c_error
++    cdef xmlerror.xmlStructuredErrorFunc err_func
++    cdef xmlparser.xmlParserInput* parser_input
++    cdef void* err_context
++
++    c_ctxt = <xmlparser.xmlParserCtxt *> ctxt
++    err_func = xmlerror.xmlStructuredError
++    if err_func:
++        parser_input = c_ctxt.input
++        # Copied from xmlVErrParser() in libxml2: get current input from stack.
++        if parser_input and parser_input.filename is NULL and c_ctxt.inputNr > 1:
++            parser_input = c_ctxt.inputTab[c_ctxt.inputNr - 2]
++
++        c_error = xmlerror.xmlError(
++            domain=xmlerror.xmlErrorDomain.XML_FROM_PARSER,
++            code=xmlerror.xmlParserErrors.XML_ERR_EXT_ENTITY_STANDALONE,
++            level=xmlerror.xmlErrorLevel.XML_ERR_FATAL,
++            message=b"External entity resolution is disabled for security reasons "
++                    b"when resolving '&%s;'. Use 'XMLParser(resolve_entities=True)' "
++                    b"if you consider it safe to enable it.",
++            file=parser_input.filename if parser_input else NULL,
++            node=entity,
++            str1=<char*> name,
++            str2=NULL,
++            str3=NULL,
++            line=parser_input.line if parser_input else 0,
++            int1=0,
++            int2=parser_input.col if parser_input else 0,
++        )
++        err_context = xmlerror.xmlStructuredErrorContext
++        err_func(err_context, &c_error)
++
++    c_ctxt.wellFormed = 0
++    # The entity was looked up and does not need to be freed.
++    return NULL
+ 
+ cdef void _initSaxDocument(void* ctxt) with gil:
+     xmlparser.xmlSAX2StartDocument(ctxt)
+@@ -1508,12 +1563,14 @@ cdef class XMLParser(_FeedParser):
+     - strip_cdata        - replace CDATA sections by normal text content (default: True)
+     - compact            - save memory for short text content (default: True)
+     - collect_ids        - use a hash table of XML IDs for fast access (default: True, always True with DTD validation)
+-    - resolve_entities   - replace entities by their text value (default: True)
+     - huge_tree          - disable security restrictions and support very deep trees
+                            and very long text content (only affects libxml2 2.7+)
+ 
+     Other keyword arguments:
+-
++    - resolve_entities - replace entities by their text value: False for keeping the
++          entity references, True for resolving them, and 'internal' for resolving
++          internal definitions only (no external file/URL access).
++          The default used to be True; it is now 'internal' (upstream since lxml 5.0, backported here).
+     - encoding - override the document encoding
+     - target   - a parser target object that will receive the parse events
+     - schema   - an XMLSchema to validate against
+@@ -1525,10 +1582,11 @@ cdef class XMLParser(_FeedParser):
+     def __init__(self, *, encoding=None, attribute_defaults=False,
+                  dtd_validation=False, load_dtd=False, no_network=True,
+                  ns_clean=False, recover=False, XMLSchema schema=None,
+-                 huge_tree=False, remove_blank_text=False, resolve_entities=True,
++                 huge_tree=False, remove_blank_text=False, resolve_entities='internal',
+                  remove_comments=False, remove_pis=False, strip_cdata=True,
+                  collect_ids=True, target=None, compact=True):
+         cdef int parse_options
++        cdef bint resolve_external = True
+         parse_options = _XML_DEFAULT_PARSE_OPTIONS
+         if load_dtd:
+             parse_options = parse_options | xmlparser.XML_PARSE_DTDLOAD
+@@ -1553,12 +1611,14 @@ cdef class XMLParser(_FeedParser):
+             parse_options = parse_options ^ xmlparser.XML_PARSE_COMPACT
+         if not resolve_entities:
+             parse_options = parse_options ^ xmlparser.XML_PARSE_NOENT
++        elif resolve_entities == 'internal':
++            resolve_external = False
+         if not strip_cdata:
+             parse_options = parse_options ^ xmlparser.XML_PARSE_NOCDATA
+ 
+         _BaseParser.__init__(self, parse_options, 0, schema,
+                              remove_comments, remove_pis, strip_cdata,
+-                             collect_ids, target, encoding)
++                             collect_ids, target, encoding, resolve_external)
+ 
+ 
+ cdef class XMLPullParser(XMLParser):
+diff --git a/src/lxml/tests/test_etree.py b/src/lxml/tests/test_etree.py
+index 14b21f7..bc7548f 100644
+--- a/src/lxml/tests/test_etree.py
++++ b/src/lxml/tests/test_etree.py
+@@ -12,11 +12,14 @@ from __future__ import absolute_import
+ from collections import OrderedDict
+ import os.path
+ import unittest
++import contextlib
+ import copy
+ import sys
+ import re
+ import gc
+ import operator
++import shutil
++import tempfile
+ import textwrap
+ import zlib
+ import gzip
+@@ -1675,6 +1678,84 @@ class ETreeOnlyTestCase(HelperTestCase):
+         self.assertEqual(_bytes('<doc>&myentity;</doc>'),
+                           tostring(root))
+ 
++    @contextlib.contextmanager
++    def _xml_test_file(self, name, content=b'<evil>XML</evil>'):
++        temp_dir = tempfile.mkdtemp()
++        try:
++            xml_file = os.path.join(temp_dir, name)
++            with open(xml_file, 'wb') as tmpfile:
++                tmpfile.write(content)
++            yield xml_file
++        finally:
++            shutil.rmtree(temp_dir)
++
++    def test_entity_parse_external(self):
++        fromstring = self.etree.fromstring
++        tostring = self.etree.tostring
++        parser = self.etree.XMLParser(resolve_entities=True)
++
++        with self._xml_test_file("entity.xml") as entity_file:
++            xml = '''
++            <!DOCTYPE doc [
++                <!ENTITY my_external_entity SYSTEM "%s">
++            ]>
++            <doc>&my_external_entity;</doc>
++            ''' % path2url(entity_file)
++            root = fromstring(xml, parser)
++
++        self.assertEqual(_bytes('<doc><evil>XML</evil></doc>'),
++                          tostring(root))
++        self.assertEqual(root.tag, 'doc')
++        self.assertEqual(root[0].tag, 'evil')
++        self.assertEqual(root[0].text, 'XML')
++        self.assertEqual(root[0].tail, None)
++
++    def test_entity_parse_external_no_resolve(self):
++        fromstring = self.etree.fromstring
++        parser = self.etree.XMLParser(resolve_entities=False)
++        Entity = self.etree.Entity
++
++        with self._xml_test_file("entity.xml") as entity_file:
++            xml = '''
++            <!DOCTYPE doc [
++                <!ENTITY my_external_entity SYSTEM "%s">
++            ]>
++            <doc>&my_external_entity;</doc>
++            ''' % path2url(entity_file)
++            root = fromstring(xml, parser)
++
++        self.assertEqual(root[0].tag, Entity)
++        self.assertEqual(root[0].text, "&my_external_entity;")
++
++    def test_entity_parse_no_external_default(self):
++        fromstring = self.etree.fromstring
++
++        with self._xml_test_file("entity.xml") as entity_file:
++            xml = '''
++            <!DOCTYPE doc [
++                <!ENTITY my_failing_external_entity SYSTEM "%s">
++            ]>
++            <doc>&my_failing_external_entity;</doc>
++            ''' % path2url(entity_file)
++
++            try:
++                fromstring(xml)
++            except self.etree.XMLSyntaxError as exc:
++                exception = exc
++            else:
++                self.fail("XMLSyntaxError was not raised")
++
++        self.assertIn("my_failing_external_entity", str(exception))
++        self.assertTrue(exception.error_log)
++        # Depending on the libxml2 version, we get different errors here,
++        # not necessarily the one that lxml produced. But it should fail either way.
++        for error in exception.error_log:
++            if "my_failing_external_entity" in error.message:
++                self.assertEqual(5, error.line)
++                break
++        else:
++            self.fail("entity error not found in parser error log")
++
+     def test_entity_restructure(self):
+         xml = _bytes('''<!DOCTYPE root [ <!ENTITY nbsp "&#160;"> ]>
+             <root>
+-- 
+2.33.0
+
diff --git a/backport-Change-HTML-prefix-handling-in-ElementPath-to-let-el.patch b/backport-Change-HTML-prefix-handling-in-ElementPath-to-let-el.patch
new file mode 100644
index 0000000..6e1e75c
--- /dev/null
+++ b/backport-Change-HTML-prefix-handling-in-ElementPath-to-let-el.patch
@@ -0,0 +1,221 @@
+From 72f5a287a4016ecb405f2e8a4a03ae22a5b0b496 Mon Sep 17 00:00:00 2001
+From: Stefan Behnel <stefan_ml@behnel.de>
+Date: Wed, 5 Jul 2023 22:10:45 +0200
+Subject: [PATCH] Change HTML "prefix" handling in ElementPath to let
+ "element.find('part1:part2')" search for "part1:part2" instead of just
+ "part2" with an unknown prefix. Also adapt the HTML "prefix" parsing test to
+ make it work in libxml2 2.10.4 and later, where HTML "prefixes" are kept as
+ part of the tag name by the parser.
+
+---
+ src/lxml/_elementpath.py     | 24 ++++++++++++------------
+ src/lxml/apihelpers.pxi      |  7 +++++++
+ src/lxml/etree.pyx           |  8 ++++----
+ src/lxml/includes/tree.pxd   | 12 ++++++++++++
+ src/lxml/tests/test_etree.py | 20 ++++++++++++++++----
+ 5 files changed, 51 insertions(+), 20 deletions(-)
+
+diff --git a/src/lxml/_elementpath.py b/src/lxml/_elementpath.py
+index eabd81c..001b345 100644
+--- a/src/lxml/_elementpath.py
++++ b/src/lxml/_elementpath.py
+@@ -71,14 +71,14 @@ xpath_tokenizer_re = re.compile(
+     r"\s+"
+     )
+ 
+-def xpath_tokenizer(pattern, namespaces=None):
++def xpath_tokenizer(pattern, namespaces=None, with_prefixes=True):
+     # ElementTree uses '', lxml used None originally.
+     default_namespace = (namespaces.get(None) or namespaces.get('')) if namespaces else None
+     parsing_attribute = False
+     for token in xpath_tokenizer_re.findall(pattern):
+         ttype, tag = token
+         if tag and tag[0] != "{":
+-            if ":" in tag:
++            if ":" in tag and with_prefixes:
+                 prefix, uri = tag.split(":", 1)
+                 try:
+                     if not namespaces:
+@@ -251,7 +251,7 @@ ops = {
+ _cache = {}
+ 
+ 
+-def _build_path_iterator(path, namespaces):
++def _build_path_iterator(path, namespaces, with_prefixes=True):
+     """compile selector pattern"""
+     if path[-1:] == "/":
+         path += "*"  # implicit all (FIXME: keep this?)
+@@ -279,7 +279,7 @@ def _build_path_iterator(path, namespaces):
+ 
+     if path[:1] == "/":
+         raise SyntaxError("cannot use absolute path on element")
+-    stream = iter(xpath_tokenizer(path, namespaces))
++    stream = iter(xpath_tokenizer(path, namespaces, with_prefixes=with_prefixes))
+     try:
+         _next = stream.next
+     except AttributeError:
+@@ -308,8 +308,8 @@ def _build_path_iterator(path, namespaces):
+ ##
+ # Iterate over the matching nodes
+ 
+-def iterfind(elem, path, namespaces=None):
+-    selector = _build_path_iterator(path, namespaces)
++def iterfind(elem, path, namespaces=None, with_prefixes=True):
++    selector = _build_path_iterator(path, namespaces, with_prefixes=with_prefixes)
+     result = iter((elem,))
+     for select in selector:
+         result = select(result)
+@@ -319,8 +319,8 @@ def iterfind(elem, path, namespaces=None):
+ ##
+ # Find first matching object.
+ 
+-def find(elem, path, namespaces=None):
+-    it = iterfind(elem, path, namespaces)
++def find(elem, path, namespaces=None, with_prefixes=True):
++    it = iterfind(elem, path, namespaces, with_prefixes=with_prefixes)
+     try:
+         return next(it)
+     except StopIteration:
+@@ -330,15 +330,15 @@ def find(elem, path, namespaces=None):
+ ##
+ # Find all matching objects.
+ 
+-def findall(elem, path, namespaces=None):
+-    return list(iterfind(elem, path, namespaces))
++def findall(elem, path, namespaces=None, with_prefixes=True):
++    return list(iterfind(elem, path, namespaces, with_prefixes=with_prefixes))
+ 
+ 
+ ##
+ # Find text for first matching object.
+ 
+-def findtext(elem, path, default=None, namespaces=None):
+-    el = find(elem, path, namespaces)
++def findtext(elem, path, default=None, namespaces=None, with_prefixes=True):
++    el = find(elem, path, namespaces, with_prefixes=with_prefixes)
+     if el is None:
+         return default
+     else:
+diff --git a/src/lxml/apihelpers.pxi b/src/lxml/apihelpers.pxi
+index 88a031d..effd116 100644
+--- a/src/lxml/apihelpers.pxi
++++ b/src/lxml/apihelpers.pxi
+@@ -15,6 +15,13 @@ cdef void displayNode(xmlNode* c_node, indent):
+     finally:
+         return  # swallow any exceptions
+ 
++cdef inline bint _isHtmlDocument(_Element element) except -1:
++    cdef xmlNode* c_node = element._c_node
++    return (
++        c_node is not NULL and c_node.doc is not NULL and
++        c_node.doc.properties & tree.XML_DOC_HTML != 0
++    )
++
+ cdef inline int _assertValidNode(_Element element) except -1:
+     assert element._c_node is not NULL, u"invalid Element proxy at %s" % id(element)
+ 
+diff --git a/src/lxml/etree.pyx b/src/lxml/etree.pyx
+index 689c330..90753fc 100644
+--- a/src/lxml/etree.pyx
++++ b/src/lxml/etree.pyx
+@@ -1544,7 +1544,7 @@ cdef public class _Element [ type LxmlElementType, object LxmlElement ]:
+         """
+         if isinstance(path, QName):
+             path = (<QName>path).text
+-        return _elementpath.find(self, path, namespaces)
++        return _elementpath.find(self, path, namespaces, with_prefixes=not _isHtmlDocument(self))
+ 
+     def findtext(self, path, default=None, namespaces=None):
+         u"""findtext(self, path, default=None, namespaces=None)
+@@ -1557,7 +1557,7 @@ cdef public class _Element [ type LxmlElementType, object LxmlElement ]:
+         """
+         if isinstance(path, QName):
+             path = (<QName>path).text
+-        return _elementpath.findtext(self, path, default, namespaces)
++        return _elementpath.findtext(self, path, default, namespaces, with_prefixes=not _isHtmlDocument(self))
+ 
+     def findall(self, path, namespaces=None):
+         u"""findall(self, path, namespaces=None)
+@@ -1570,7 +1570,7 @@ cdef public class _Element [ type LxmlElementType, object LxmlElement ]:
+         """
+         if isinstance(path, QName):
+             path = (<QName>path).text
+-        return _elementpath.findall(self, path, namespaces)
++        return _elementpath.findall(self, path, namespaces, with_prefixes=not _isHtmlDocument(self))
+ 
+     def iterfind(self, path, namespaces=None):
+         u"""iterfind(self, path, namespaces=None)
+@@ -1583,7 +1583,7 @@ cdef public class _Element [ type LxmlElementType, object LxmlElement ]:
+         """
+         if isinstance(path, QName):
+             path = (<QName>path).text
+-        return _elementpath.iterfind(self, path, namespaces)
++        return _elementpath.iterfind(self, path, namespaces, with_prefixes=not _isHtmlDocument(self))
+ 
+     def xpath(self, _path, *, namespaces=None, extensions=None,
+               smart_strings=True, **_variables):
+diff --git a/src/lxml/includes/tree.pxd b/src/lxml/includes/tree.pxd
+index 010af80..d709313 100644
+--- a/src/lxml/includes/tree.pxd
++++ b/src/lxml/includes/tree.pxd
+@@ -154,6 +154,17 @@ cdef extern from "libxml/tree.h":
+         XML_EXTERNAL_PARAMETER_ENTITY=        5
+         XML_INTERNAL_PREDEFINED_ENTITY=       6
+ 
++    ctypedef enum xmlDocProperties:
++        XML_DOC_WELLFORMED          = 1    # /* document is XML well formed */
++        XML_DOC_NSVALID             = 2    # /* document is Namespace valid */
++        XML_DOC_OLD10               = 4    # /* parsed with old XML-1.0 parser */
++        XML_DOC_DTDVALID            = 8    # /* DTD validation was successful */
++        XML_DOC_XINCLUDE            = 16   # /* XInclude substitution was done */
++        XML_DOC_USERBUILT           = 32   # /* Document was built using the API
++                                           #    and not by parsing an instance */
++        XML_DOC_INTERNAL            = 64   # /* built for internal processing */
++        XML_DOC_HTML                = 128  # /* parsed or built HTML document */
++
+     ctypedef struct xmlNs:
+         const_xmlChar* href
+         const_xmlChar* prefix
+@@ -274,6 +285,7 @@ cdef extern from "libxml/tree.h":
+         void* _private
+         xmlDtd* intSubset
+         xmlDtd* extSubset
++        int properties
+         
+     ctypedef struct xmlAttr:
+         void* _private
+diff --git a/src/lxml/tests/test_etree.py b/src/lxml/tests/test_etree.py
+index bde496d..e969f3a 100644
+--- a/src/lxml/tests/test_etree.py
++++ b/src/lxml/tests/test_etree.py
+@@ -3137,11 +3137,23 @@ class ETreeOnlyTestCase(HelperTestCase):
+ 
+     def test_html_prefix_nsmap(self):
+         etree = self.etree
+-        el = etree.HTML('<hha:page-description>aa</hha:page-description>').find('.//page-description')
+-        if etree.LIBXML_VERSION < (2, 9, 11):
+-            self.assertEqual({'hha': None}, el.nsmap)
++        el = etree.HTML('<hha:page-description>aa</hha:page-description>')
++        pd = el[-1]
++        while len(pd):
++            pd = pd[-1]
++
++        if etree.LIBXML_VERSION >= (2, 9, 11):
++            # "Prefix" is kept as part of the tag name.
++            self.assertEqual("hha:page-description", pd.tag)
++            self.assertIsNone(el.find('.//page-description'))
++            self.assertIsNotNone(el.find('.//hha:page-description'))  # no namespaces!
++            for e in el.iter():
++                self.assertEqual({}, e.nsmap)
+         else:
+-            self.assertEqual({}, el.nsmap)
++            # "Prefix" is parsed as XML prefix.
++            self.assertEqual("page-description", pd.tag)
++            pd = el.find('.//page-description')
++            self.assertEqual({'hha': None}, pd.nsmap)
+ 
+     def test_getchildren(self):
+         Element = self.etree.Element
+-- 
+2.33.0
+
diff --git a/python-lxml.spec b/python-lxml.spec
index b64153c..a13fdd6 100644
--- a/python-lxml.spec
+++ b/python-lxml.spec
@@ -7,7 +7,7 @@ The latest release works with all CPython versions from 2.7 to 3.7.
 
 Name:           python-%{modname}
 Version:        4.7.1
-Release:        5
+Release:        6
 Summary:        XML processing library combining libxml2/libxslt with the ElementTree API
 License:        BSD
 URL:            https://files.pythonhosted.org
@@ -15,6 +15,9 @@ Source0:        https://files.pythonhosted.org/packages/source/l/lxml/lxml-%{ver
 
 Patch6000:      backport-CVE-2022-2309.patch
 Patch6001:      backport-Work-around-libxml2-bug-in-affected-versions.patch
+Patch6002:      Fix-test_elementtree-with-Expat-2.6.0.patch
+Patch6003:      backport-CVE-2024-37388.patch
+Patch6004:      backport-Change-HTML-prefix-handling-in-ElementPath-to-let-el.patch
 
 BuildRequires:  gcc libxml2-devel libxslt-devel
 
@@ -55,6 +58,12 @@ make test3
 %doc README.rst src/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/readme.txt
 
 %changelog
+* Wed Jun 12 2024 zhuofeng <zhuofeng2@huawei.com> - 4.7.1-6
+- Type:CVE
+- CVE:CVE-2024-37388
+- SUG:NA
+- DESC:fix CVE-2024-37388, fix test_elementtree with Expat 2.6.0 and HTML prefix handling in ElementPath
+
 * Wed Nov 16 2022 zhuofeng <zhuofeng@huawei.com> - 4.7.1-5
 - change the Source0
 
-- 
Gitee