diff --git a/backport-Cleaner-cover-some-more-cases-where-scripts-could-sn.patch b/backport-Cleaner-cover-some-more-cases-where-scripts-could-sn.patch new file mode 100644 index 0000000000000000000000000000000000000000..5b7dcef990d027c0474bf1b7fcc7ce465ac95ed3 --- /dev/null +++ b/backport-Cleaner-cover-some-more-cases-where-scripts-could-sn.patch @@ -0,0 +1,163 @@ +From 69a747356655158fdf9abaecea5feafb3bd6b5f5 Mon Sep 17 00:00:00 2001 +From: Stefan Behnel +Date: Sat, 11 Dec 2021 12:19:21 +0100 +Subject: [PATCH] Cleaner: cover some more cases where scripts could sneak + through in specially crafted style content. + +--- + src/lxml/html/clean.py | 20 ++++++------ + src/lxml/html/tests/test_clean.py | 65 ++++++++++++++++++++++++++++++++++++++- + 2 files changed, 73 insertions(+), 12 deletions(-) + +diff --git a/src/lxml/html/clean.py b/src/lxml/html/clean.py +index 4df10c2..0e96627 100644 +--- a/src/lxml/html/clean.py ++++ b/src/lxml/html/clean.py +@@ -74,22 +74,20 @@ _looks_like_tag_content = re.compile( + # All kinds of schemes besides just javascript: that can cause + # execution: + _find_image_dataurls = re.compile( +- r'^data:image/(.+);base64,', re.I).findall +-_is_possibly_malicious_scheme = re.compile( ++ r'data:image/(.+);base64,', re.I).findall ++_possibly_malicious_schemes = re.compile( + r'(javascript|jscript|livescript|vbscript|data|about|mocha):', + re.I).findall + # SVG images can contain script content +-_is_unsafe_image_type = re.compile(r"(xml|svg)", re.I).findall ++_is_unsafe_image_type = re.compile(r"(xml|svg)", re.I).search + +-def _is_javascript_scheme(s): +- is_image_url = False ++def _has_javascript_scheme(s): ++ safe_image_urls = 0 + for image_type in _find_image_dataurls(s): +- is_image_url = True + if _is_unsafe_image_type(image_type): + return True +- if is_image_url: +- return False +- return bool(_is_possibly_malicious_scheme(s)) ++ safe_image_urls += 1 ++ return len(_possibly_malicious_schemes(s)) > safe_image_urls + + _substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub + +@@ -521,7 +519,7 @@ class Cleaner(object): + def _remove_javascript_link(self, link): + # links like "j a v a s c r i p t:" might be interpreted in IE + new = _substitute_whitespace('', unquote_plus(link)) +- if _is_javascript_scheme(new): ++ if _has_javascript_scheme(new): + # FIXME: should this be None to delete? + return '' + return link +@@ -543,7 +541,7 @@ class Cleaner(object): + style = style.replace('\\', '') + style = _substitute_whitespace('', style) + style = style.lower() +- if 'javascript:' in style: ++ if _has_javascript_scheme(style): + return True + if 'expression(' in style: + return True +diff --git a/src/lxml/html/tests/test_clean.py b/src/lxml/html/tests/test_clean.py +index a05d967..aec87cd 100644 +--- a/src/lxml/html/tests/test_clean.py ++++ b/src/lxml/html/tests/test_clean.py +@@ -126,7 +126,7 @@ class CleanerTest(unittest.TestCase): + lxml.html.tostring(clean_html(s))) + + def test_sneaky_import_in_style(self): +- # Prevent "@@importimport" -> "@import" replacement. ++ # Prevent "@@importimport" -> "@import" replacement etc. + style_codes = [ + "@@importimport(extstyle.css)", + "@ @ import import(extstyle.css)", +@@ -134,6 +134,11 @@ class CleanerTest(unittest.TestCase): + "@@ import import(extstyle.css)", + "@ @import import(extstyle.css)", + "@@importimport()", ++ "@@importimport() ()", ++ "@/* ... */import()", ++ "@im/* ... */port()", ++ "@ @import/* ... */import()", ++ "@ /* ... */ import()", + ] + for style_code in style_codes: + html = '' % style_code +@@ -145,6 +150,41 @@ class CleanerTest(unittest.TestCase): + cleaned, + "%s -> %s" % (style_code, cleaned)) + ++ def test_sneaky_schemes_in_style(self): ++ style_codes = [ ++ "javasjavascript:cript:", ++ "javascriptjavascript::", ++ "javascriptjavascript:: :", ++ "vbjavascript:cript:", ++ ] ++ for style_code in style_codes: ++ html = '' % style_code ++ s = lxml.html.fragment_fromstring(html) ++ ++ cleaned = lxml.html.tostring(clean_html(s)) ++ self.assertEqual( ++ b'', ++ cleaned, ++ "%s -> %s" % (style_code, cleaned)) ++ ++ def test_sneaky_urls_in_style(self): ++ style_codes = [ ++ "url(data:image/svg+xml;base64,...)", ++ "url(javasjavascript:cript:)", ++ "url(javasjavascript:cript: ::)", ++ "url(vbjavascript:cript:)", ++ "url(vbjavascript:cript: :)", ++ ] ++ for style_code in style_codes: ++ html = '' % style_code ++ s = lxml.html.fragment_fromstring(html) ++ ++ cleaned = lxml.html.tostring(clean_html(s)) ++ self.assertEqual( ++ b'', ++ cleaned, ++ "%s -> %s" % (style_code, cleaned)) ++ + def test_svg_data_links(self): + # Remove SVG images with potentially insecure content. + svg = b'' +@@ -188,6 +228,29 @@ class CleanerTest(unittest.TestCase): + cleaned, + "%s -> %s" % (url, cleaned)) + ++ def test_image_data_links_in_style(self): ++ data = b'123' ++ data_b64 = base64.b64encode(data).decode('ASCII') ++ urls = [ ++ "data:image/jpeg;base64," + data_b64, ++ "data:image/apng;base64," + data_b64, ++ "data:image/png;base64," + data_b64, ++ "data:image/gif;base64," + data_b64, ++ "data:image/webp;base64," + data_b64, ++ "data:image/bmp;base64," + data_b64, ++ "data:image/tiff;base64," + data_b64, ++ "data:image/x-icon;base64," + data_b64, ++ ] ++ for url in urls: ++ html = '' % url ++ s = lxml.html.fragment_fromstring(html) ++ ++ cleaned = lxml.html.tostring(clean_html(s)) ++ self.assertEqual( ++ html.encode("UTF-8"), ++ cleaned, ++ "%s -> %s" % (url, cleaned)) ++ + def test_formaction_attribute_in_button_input(self): + # The formaction attribute overrides the form's action and should be + # treated as a malicious link attribute +-- +2.13.7 + diff --git a/python-lxml.spec b/python-lxml.spec index f48dbf7e0d8404a365e183efc82ee63560bcf839..23f4d82c70691d0f5fb8b86871961708c879c04f 100644 --- a/python-lxml.spec +++ b/python-lxml.spec @@ -7,7 +7,7 @@ The latest release works with all CPython versions from 2.7 to 3.7. Name: python-%{modname} Version: 4.5.2 -Release: 5 +Release: 6 Summary: XML processing library combining libxml2/libxslt with the ElementTree API License: BSD URL: http://lxml.de @@ -18,6 +18,7 @@ Patch6001: backport-CVE-2020-27783-2.patch Patch6002: backport-CVE-2021-28957.patch Patch6003: backport-0001-CVE-2021-43818.patch Patch6004: backport-0002-CVE-2021-43818.patch +Patch6005: backport-Cleaner-cover-some-more-cases-where-scripts-could-sn.patch BuildRequires: gcc libxml2-devel libxslt-devel @@ -68,6 +69,9 @@ make test3 %doc README.rst src/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/readme.txt %changelog +* Sat Jan 22 2022 shixuantong - 4.5.2-6 +- Cleaner: cover some more cases where scripts could sneak through in specially crafted style content. + * Wed Jan 19 2022 shixuantong - 4.5.2-5 - enable check