From 161cf19a8b674efdbb28a6f7831ec94f21671ce7 Mon Sep 17 00:00:00 2001 From: panxiaohe Date: Thu, 2 Dec 2021 14:40:14 +0800 Subject: [PATCH] backport upstream patches --- ...ormatting-newlines-to-XInclude-nodes.patch | 6 +- Fix-NodeDumpOutput-functions.patch | 66 ++++++ Fix-SEGV-in-xmlSAXParseFileWithData.patch | 26 +++ ...k-in-xmlParseElementMixedContentDecl.patch | 42 ++++ Fix-null-deref-in-xmlStringGetNodeList.patch | 30 +++ ...ression-in-xmlNodeDumpOutputInternal.patch | 46 ++++ ...parsing-of-HTML-with-encoding-errors.patch | 100 +++++++++ Fix-undefined-behavior-in-UTF16LEToUTF8.patch | 39 ++++ ...eger-overflow-in-htmlParseTryOrFinis.patch | 79 +++++++ ...-corrupted-documents-more-gracefully.patch | 50 +++++ More-NodeDumpOutput-fixes.patch | 55 +++++ ...coding-parameter-of-HTML-output-func.patch | 133 +++++++++++ Work-around-lxml-API-abuse.patch | 212 ++++++++++++++++++ ...leak-in-xmlRegisterCharEncodingHandl.patch | 55 +++++ libxml2.spec | 24 +- 15 files changed, 958 insertions(+), 5 deletions(-) create mode 100644 Fix-NodeDumpOutput-functions.patch create mode 100644 Fix-SEGV-in-xmlSAXParseFileWithData.patch create mode 100644 Fix-memory-leak-in-xmlParseElementMixedContentDecl.patch create mode 100644 Fix-null-deref-in-xmlStringGetNodeList.patch create mode 100644 Fix-regression-in-xmlNodeDumpOutputInternal.patch create mode 100644 Fix-slow-parsing-of-HTML-with-encoding-errors.patch create mode 100644 Fix-undefined-behavior-in-UTF16LEToUTF8.patch create mode 100644 Fix-unsigned-integer-overflow-in-htmlParseTryOrFinis.patch create mode 100644 Handle-dumps-of-corrupted-documents-more-gracefully.patch create mode 100644 More-NodeDumpOutput-fixes.patch create mode 100644 Remove-unused-encoding-parameter-of-HTML-output-func.patch create mode 100644 Work-around-lxml-API-abuse.patch create mode 100644 encoding-fix-memleak-in-xmlRegisterCharEncodingHandl.patch diff --git a/Don-t-add-formatting-newlines-to-XInclude-nodes.patch 
b/Don-t-add-formatting-newlines-to-XInclude-nodes.patch index b925431..e00b8f8 100644 --- a/Don-t-add-formatting-newlines-to-XInclude-nodes.patch +++ b/Don-t-add-formatting-newlines-to-XInclude-nodes.patch @@ -15,13 +15,13 @@ index f1d40b9..2225628 100644 while (1) { if (cur == root) return; -- if (ctxt->format == 1) { +- if (ctxt->format == 1) + if ((ctxt->format == 1) && + (cur->type != XML_XINCLUDE_START) && -+ (cur->type != XML_XINCLUDE_END)) { ++ (cur->type != XML_XINCLUDE_END)) xmlOutputBufferWrite(buf, 1, "\n"); - } if (cur->next != NULL) { + cur = cur->next; @@ -1224,7 +1226,9 @@ xmlDocContentDumpOutput(xmlSaveCtxtPtr ctxt, xmlDocPtr cur) { else #endif diff --git a/Fix-NodeDumpOutput-functions.patch b/Fix-NodeDumpOutput-functions.patch new file mode 100644 index 0000000..3f62d44 --- /dev/null +++ b/Fix-NodeDumpOutput-functions.patch @@ -0,0 +1,66 @@ +From 7b2e5172616406edcb5b84d048fa590c997784b3 Mon Sep 17 00:00:00 2001 +From: Nick Wellnhofer +Date: Tue, 28 Jul 2020 21:52:55 +0200 +Subject: [PATCH] Fix *NodeDumpOutput functions + +Only output end tag for elements. Should fix serialization of document +fragments. +--- + xmlsave.c | 54 ++++++++++++++++++++++++++++++------------------------ + 1 file changed, 30 insertions(+), 24 deletions(-) + +diff --git a/xmlsave.c b/xmlsave.c +index 2235c8f..f2e0ea8 100644 +--- a/xmlsave.c ++++ b/xmlsave.c +@@ -1049,9 +1049,8 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { + while (1) { + if (cur == root) + return; +- if (ctxt->format == 1) { ++ if (ctxt->format == 1) + xmlOutputBufferWrite(buf, 1, "\n"); +- } + if (cur->next != NULL) { + cur = cur->next; + break; +@@ -1065,21 +1064,25 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { + (ctxt->level > ctxt->indent_nr ? 
+ ctxt->indent_nr : ctxt->level), + ctxt->indent); ++ ++ if (cur->type == XML_ELEMENT_NODE) { ++ xmlOutputBufferWrite(buf, 2, "</"); ++ if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { ++ xmlOutputBufferWriteString(buf, ++ (const char *)cur->ns->prefix); ++ xmlOutputBufferWrite(buf, 1, ":"); ++ } ++ ++ xmlOutputBufferWriteString(buf, (const char *)cur->name); ++ if (ctxt->format == 2) ++ xmlOutputBufferWriteWSNonSig(ctxt, 0); ++ xmlOutputBufferWrite(buf, 1, ">"); ++ } ++ + if (cur == unformattedNode) { + ctxt->format = format; + unformattedNode = NULL; + } +- +- xmlOutputBufferWrite(buf, 2, "</"); +- if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { +- xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); +- xmlOutputBufferWrite(buf, 1, ":"); +- } +- +- xmlOutputBufferWriteString(buf, (const char *)cur->name); +- if (ctxt->format == 2) +- xmlOutputBufferWriteWSNonSig(ctxt, 0); +- xmlOutputBufferWrite(buf, 1, ">"); + } + } + } +-- +1.8.3.1 + diff --git a/Fix-SEGV-in-xmlSAXParseFileWithData.patch b/Fix-SEGV-in-xmlSAXParseFileWithData.patch new file mode 100644 index 0000000..b65894b --- /dev/null +++ b/Fix-SEGV-in-xmlSAXParseFileWithData.patch @@ -0,0 +1,26 @@ +From 7929f05710134b9b243952019b6c14066cd3ac9e Mon Sep 17 00:00:00 2001 +From: yanjinjq +Date: Sun, 30 Aug 2020 10:34:01 +0000 +Subject: [PATCH] Fix SEGV in xmlSAXParseFileWithData + +Fixes #181. 
+--- + parser.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/parser.c b/parser.c +index be14c32..f779eb6 100644 +--- a/parser.c ++++ b/parser.c +@@ -14077,7 +14077,7 @@ xmlSAXParseFileWithData(xmlSAXHandlerPtr sax, const char *filename, + + if ((ctxt->wellFormed) || recovery) { + ret = ctxt->myDoc; +- if (ret != NULL) { ++ if ((ret != NULL) && (ctxt->input->buf != NULL)) { + if (ctxt->input->buf->compressed > 0) + ret->compression = 9; + else +-- +1.8.3.1 + diff --git a/Fix-memory-leak-in-xmlParseElementMixedContentDecl.patch b/Fix-memory-leak-in-xmlParseElementMixedContentDecl.patch new file mode 100644 index 0000000..29b59a2 --- /dev/null +++ b/Fix-memory-leak-in-xmlParseElementMixedContentDecl.patch @@ -0,0 +1,42 @@ +From 45da175c1431d69e74e05a115f0b14cc8c97d886 Mon Sep 17 00:00:00 2001 +From: Nick Wellnhofer +Date: Fri, 18 Dec 2020 12:14:52 +0100 +Subject: [PATCH] Fix memory leak in xmlParseElementMixedContentDecl + +Free parsed content if malloc fails to avoid a memory leak. + +Found with libFuzzer. 
+--- + parser.c | 10 ++++++++-- + 1 file changed, 8 insertions(+), 2 deletions(-) + +diff --git a/parser.c b/parser.c +index 85494df..43b8835 100644 +--- a/parser.c ++++ b/parser.c +@@ -6082,14 +6082,20 @@ xmlParseElementMixedContentDecl(xmlParserCtxtPtr ctxt, int inputchk) { + NEXT; + if (elem == NULL) { + ret = xmlNewDocElementContent(ctxt->myDoc, NULL, XML_ELEMENT_CONTENT_OR); +- if (ret == NULL) return(NULL); ++ if (ret == NULL) { ++ xmlFreeDocElementContent(ctxt->myDoc, cur); ++ return(NULL); ++ } + ret->c1 = cur; + if (cur != NULL) + cur->parent = ret; + cur = ret; + } else { + n = xmlNewDocElementContent(ctxt->myDoc, NULL, XML_ELEMENT_CONTENT_OR); +- if (n == NULL) return(NULL); ++ if (n == NULL) { ++ xmlFreeDocElementContent(ctxt->myDoc, ret); ++ return(NULL); ++ } + n->c1 = xmlNewDocElementContent(ctxt->myDoc, elem, XML_ELEMENT_CONTENT_ELEMENT); + if (n->c1 != NULL) + n->c1->parent = n; +-- +1.8.3.1 + diff --git a/Fix-null-deref-in-xmlStringGetNodeList.patch b/Fix-null-deref-in-xmlStringGetNodeList.patch new file mode 100644 index 0000000..6e14174 --- /dev/null +++ b/Fix-null-deref-in-xmlStringGetNodeList.patch @@ -0,0 +1,30 @@ +From 1d73f07d67e32d8eaccd85bc46c5d277a1dc00c9 Mon Sep 17 00:00:00 2001 +From: Nick Wellnhofer +Date: Fri, 18 Dec 2020 00:55:00 +0100 +Subject: [PATCH] Fix null deref in xmlStringGetNodeList + +Check for malloc failure to avoid null deref. + +Found with libFuzzer. 
+--- + tree.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/tree.c b/tree.c +index 64572d9..2130d55 100644 +--- a/tree.c ++++ b/tree.c +@@ -1649,6 +1649,10 @@ xmlStringGetNodeList(const xmlDoc *doc, const xmlChar *value) { + + if (!xmlBufIsEmpty(buf)) { + node = xmlNewDocText(doc, NULL); ++ if (node == NULL) { ++ xmlBufFree(buf); ++ return(NULL); ++ } + node->content = xmlBufDetach(buf); + + if (last == NULL) { +-- +1.8.3.1 + diff --git a/Fix-regression-in-xmlNodeDumpOutputInternal.patch b/Fix-regression-in-xmlNodeDumpOutputInternal.patch new file mode 100644 index 0000000..c3c2bc1 --- /dev/null +++ b/Fix-regression-in-xmlNodeDumpOutputInternal.patch @@ -0,0 +1,46 @@ +From 13ad8736d294536da4cbcd70a96b0a2fbf47070c Mon Sep 17 00:00:00 2001 +From: Nick Wellnhofer +Date: Tue, 25 May 2021 10:55:25 +0200 +Subject: [PATCH] Fix regression in xmlNodeDumpOutputInternal + +Commit 85b1792e could cause additional whitespace if xmlNodeDump was +called with a non-zero starting level. +--- + xmlsave.c | 14 +++++++------- + 1 file changed, 7 insertions(+), 7 deletions(-) + +diff --git a/xmlsave.c b/xmlsave.c +index aedbd5e..489505f 100644 +--- a/xmlsave.c ++++ b/xmlsave.c +@@ -890,6 +890,13 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { + break; + + case XML_ELEMENT_NODE: ++ if ((cur != root) && (ctxt->format == 1) && ++ (xmlIndentTreeOutput)) ++ xmlOutputBufferWrite(buf, ctxt->indent_size * ++ (ctxt->level > ctxt->indent_nr ? ++ ctxt->indent_nr : ctxt->level), ++ ctxt->indent); ++ + /* + * Some users like lxml are known to pass nodes with a corrupted + * tree structure. Fall back to a recursive call to handle this +@@ -900,13 +907,6 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { + break; + } + +- if ((ctxt->level > 0) && (ctxt->format == 1) && +- (xmlIndentTreeOutput)) +- xmlOutputBufferWrite(buf, ctxt->indent_size * +- (ctxt->level > ctxt->indent_nr ? 
+- ctxt->indent_nr : ctxt->level), +- ctxt->indent); +- + xmlOutputBufferWrite(buf, 1, "<"); + if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { + xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); +-- +1.8.3.1 + diff --git a/Fix-slow-parsing-of-HTML-with-encoding-errors.patch b/Fix-slow-parsing-of-HTML-with-encoding-errors.patch new file mode 100644 index 0000000..72be38b --- /dev/null +++ b/Fix-slow-parsing-of-HTML-with-encoding-errors.patch @@ -0,0 +1,100 @@ +From dcb80b92da0417bc5b3d97ab8a61381973f1711b Mon Sep 17 00:00:00 2001 +From: Nick Wellnhofer +Date: Sat, 20 Feb 2021 20:30:43 +0100 +Subject: [PATCH] Fix slow parsing of HTML with encoding errors + +Under certain circumstances, the HTML parser would try to guess and +switch input encodings multiple times, leading to slow processing of +documents with encoding errors. The repeated scanning of the input +buffer when guessing encodings could even lead to quadratic behavior. + +The code htmlCurrentChar probably assumed that if there's an encoding +handler, it is guaranteed to produce valid UTF-8. This holds true in +general, but if the detected encoding was "UTF-8", the UTF8ToUTF8 +encoding handler simply invoked memcpy without checking for invalid +UTF-8. This still must be fixed, preferably by not using this handler +at all. + +Also leave a note that switching encodings twice seems impossible to +implement correctly. Add a check when handling UTF-8 encoding errors +in htmlCurrentChar to avoid this situation, even if encoders produce +invalid UTF-8. + +Found by OSS-Fuzz. 
+--- + HTMLparser.c | 18 ++++++++++++++++-- + encoding.c | 5 +++++ + parserInternals.c | 5 +++++ + 3 files changed, 26 insertions(+), 2 deletions(-) + +diff --git a/HTMLparser.c b/HTMLparser.c +index 14cc56f..c9a64c7 100644 +--- a/HTMLparser.c ++++ b/HTMLparser.c +@@ -457,7 +457,12 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { + ctxt->input->encoding = guess; + handler = xmlFindCharEncodingHandler((const char *) guess); + if (handler != NULL) { +- xmlSwitchToEncoding(ctxt, handler); ++ /* ++ * Don't use UTF-8 encoder which isn't required and ++ * can produce invalid UTF-8. ++ */ ++ if (!xmlStrEqual(BAD_CAST handler->name, BAD_CAST "UTF-8")) ++ xmlSwitchToEncoding(ctxt, handler); + } else { + htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, + "Unsupported encoding %s", guess, NULL); +@@ -570,7 +575,16 @@ encoding_error: + BAD_CAST buffer, NULL); + } + +- ctxt->charset = XML_CHAR_ENCODING_8859_1; ++ /* ++ * Don't switch encodings twice. Note that if there's an encoder, we ++ * shouldn't receive invalid UTF-8 anyway. ++ * ++ * Note that if ctxt->input->buf == NULL, switching encodings is ++ * impossible, see Gitlab issue #34. ++ */ ++ if ((ctxt->input->buf != NULL) && ++ (ctxt->input->buf->encoder == NULL)) ++ xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1); + *len = 1; + return((int) *ctxt->input->cur); + } +diff --git a/encoding.c b/encoding.c +index d67c16d..cdff6ae 100644 +--- a/encoding.c ++++ b/encoding.c +@@ -373,6 +373,11 @@ UTF8ToUTF8(unsigned char* out, int *outlen, + if (len < 0) + return(-1); + ++ /* ++ * FIXME: Conversion functions must assure valid UTF-8, so we have ++ * to check for UTF-8 validity. Preferably, this converter shouldn't ++ * be used at all. 
++ */ + memcpy(out, inb, len); + + *outlen = len; +diff --git a/parserInternals.c b/parserInternals.c +index b0629ef..cbcfde0 100644 +--- a/parserInternals.c ++++ b/parserInternals.c +@@ -1153,6 +1153,11 @@ xmlSwitchInputEncodingInt(xmlParserCtxtPtr ctxt, xmlParserInputPtr input, + * Note: this is a bit dangerous, but that's what it + * takes to use nearly compatible signature for different + * encodings. ++ * ++ * FIXME: Encoders might buffer partial byte sequences, so ++ * this probably can't work. We should return an error and ++ * make sure that callers never try to switch the encoding ++ * twice. + */ + xmlCharEncCloseFunc(input->buf->encoder); + input->buf->encoder = handler; +-- +1.8.3.1 + diff --git a/Fix-undefined-behavior-in-UTF16LEToUTF8.patch b/Fix-undefined-behavior-in-UTF16LEToUTF8.patch new file mode 100644 index 0000000..a5d0fa8 --- /dev/null +++ b/Fix-undefined-behavior-in-UTF16LEToUTF8.patch @@ -0,0 +1,39 @@ +From 2f9382033e4c398dd1c9aae4d24fa9f649fbf23d Mon Sep 17 00:00:00 2001 +From: Nick Wellnhofer +Date: Mon, 15 Jun 2020 15:45:47 +0200 +Subject: [PATCH] Fix undefined behavior in UTF16LEToUTF8 + +Don't perform arithmetic on null pointer. + +Found with libFuzzer and UBSan. 
+--- + encoding.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +diff --git a/encoding.c b/encoding.c +index 8b6f349..1a6386a 100644 +--- a/encoding.c ++++ b/encoding.c +@@ -496,13 +496,18 @@ UTF16LEToUTF8(unsigned char* out, int *outlen, + { + unsigned char* outstart = out; + const unsigned char* processed = inb; +- unsigned char* outend = out + *outlen; ++ unsigned char* outend; + unsigned short* in = (unsigned short*) inb; + unsigned short* inend; + unsigned int c, d, inlen; + unsigned char *tmp; + int bits; + ++ if (*outlen == 0) { ++ *inlenb = 0; ++ return(0); ++ } ++ outend = out + *outlen; + if ((*inlenb % 2) == 1) + (*inlenb)--; + inlen = *inlenb / 2; +-- +1.8.3.1 + diff --git a/Fix-unsigned-integer-overflow-in-htmlParseTryOrFinis.patch b/Fix-unsigned-integer-overflow-in-htmlParseTryOrFinis.patch new file mode 100644 index 0000000..029f70f --- /dev/null +++ b/Fix-unsigned-integer-overflow-in-htmlParseTryOrFinis.patch @@ -0,0 +1,79 @@ +From 681f094e5bd1d0f6b38b27701d0d1bf1ca7a9a26 Mon Sep 17 00:00:00 2001 +From: Nick Wellnhofer +Date: Mon, 15 Jun 2020 15:23:05 +0200 +Subject: [PATCH] Fix unsigned integer overflow in htmlParseTryOrFinish + +Cast to signed type before subtraction to avoid unsigned integer +overflow. Also use ptrdiff_t to avoid potential integer truncation. + +Found with libFuzzer and UBSan. 
+--- + HTMLparser.c | 17 +++++++++++------ + 1 file changed, 11 insertions(+), 6 deletions(-) + +diff --git a/HTMLparser.c b/HTMLparser.c +index be7e14f..9ade663 100644 +--- a/HTMLparser.c ++++ b/HTMLparser.c +@@ -5339,7 +5339,7 @@ static int + htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { + int ret = 0; + htmlParserInputPtr in; +- int avail = 0; ++ ptrdiff_t avail = 0; + xmlChar cur, next; + + htmlParserNodeInfo node_info; +@@ -5404,7 +5404,8 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { + if (in->buf == NULL) + avail = in->length - (in->cur - in->base); + else +- avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base); ++ avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) - ++ (in->cur - in->base); + if ((avail == 0) && (terminate)) { + htmlAutoCloseOnEnd(ctxt); + if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) { +@@ -5440,7 +5441,8 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { + if (in->buf == NULL) + avail = in->length - (in->cur - in->base); + else +- avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base); ++ avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) - ++ (in->cur - in->base); + } + if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) + ctxt->sax->setDocumentLocator(ctxt->userData, +@@ -5482,7 +5484,8 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { + if (in->buf == NULL) + avail = in->length - (in->cur - in->base); + else +- avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base); ++ avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) - ++ (in->cur - in->base); + /* + * no chars in buffer + */ +@@ -5555,7 +5558,8 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { + if (in->buf == NULL) + avail = in->length - (in->cur - in->base); + else +- avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base); ++ avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) - ++ (in->cur - in->base); + if (avail < 2) + goto done; + cur = in->cur[0]; +@@ -5596,7 +5600,8 @@ 
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { + if (in->buf == NULL) + avail = in->length - (in->cur - in->base); + else +- avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base); ++ avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) - ++ (in->cur - in->base); + if (avail < 1) + goto done; + cur = in->cur[0]; +-- +1.8.3.1 + diff --git a/Handle-dumps-of-corrupted-documents-more-gracefully.patch b/Handle-dumps-of-corrupted-documents-more-gracefully.patch new file mode 100644 index 0000000..8c0ba6d --- /dev/null +++ b/Handle-dumps-of-corrupted-documents-more-gracefully.patch @@ -0,0 +1,50 @@ +From 0b3c64d9f2f3e9ce1a98d8f19ee7a763c87e27d5 Mon Sep 17 00:00:00 2001 +From: Nick Wellnhofer +Date: Tue, 29 Sep 2020 18:08:37 +0200 +Subject: [PATCH] Handle dumps of corrupted documents more gracefully + +Check parent pointers for NULL after the non-recursive rewrite of the +serialization code. This avoids segfaults with corrupted documents +which can apparently be seen with lxml, see issue #187. +--- + HTMLtree.c | 6 ++++++ + xmlsave.c | 12 ++++++++++++ + 2 files changed, 18 insertions(+) + +diff --git a/HTMLtree.c b/HTMLtree.c +index cdb7f86..8d0c779 100644 +--- a/HTMLtree.c ++++ b/HTMLtree.c +@@ -903,6 +903,12 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, + break; + } + ++ /* ++ * The parent should never be NULL here but we want to handle ++ * corrupted documents gracefully. ++ */ ++ if (cur->parent == NULL) ++ return; + cur = cur->parent; + + if ((cur->type == XML_HTML_DOCUMENT_NODE) || +diff --git a/xmlsave.c b/xmlsave.c +index 2225628..61a4045 100644 +--- a/xmlsave.c ++++ b/xmlsave.c +@@ -1058,6 +1058,12 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { + break; + } + ++ /* ++ * The parent should never be NULL here but we want to handle ++ * corrupted documents gracefully. 
++ */ ++ if (cur->parent == NULL) ++ return; + cur = cur->parent; + + if (cur->type == XML_ELEMENT_NODE) { +-- +1.8.3.1 + diff --git a/More-NodeDumpOutput-fixes.patch b/More-NodeDumpOutput-fixes.patch new file mode 100644 index 0000000..26fefc4 --- /dev/null +++ b/More-NodeDumpOutput-fixes.patch @@ -0,0 +1,55 @@ +From 1a360c1c2ec950f478d55b31722ecf78f5698e97 Mon Sep 17 00:00:00 2001 +From: Nick Wellnhofer +Date: Wed, 29 Jul 2020 00:39:15 +0200 +Subject: [PATCH] More *NodeDumpOutput fixes + +When leaving nodes, restrict more operations to XML_ELEMENT_NODEs. +--- + xmlsave.c | 44 ++++++++++++++++++++++---------------------- + 1 file changed, 22 insertions(+), 22 deletions(-) + +diff --git a/xmlsave.c b/xmlsave.c +index f2e0ea8..f1d40b9 100644 +--- a/xmlsave.c ++++ b/xmlsave.c +@@ -1058,14 +1058,14 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { + + cur = cur->parent; + +- if (ctxt->level > 0) ctxt->level--; +- if ((xmlIndentTreeOutput) && (ctxt->format == 1)) +- xmlOutputBufferWrite(buf, ctxt->indent_size * +- (ctxt->level > ctxt->indent_nr ? +- ctxt->indent_nr : ctxt->level), +- ctxt->indent); +- + if (cur->type == XML_ELEMENT_NODE) { ++ if (ctxt->level > 0) ctxt->level--; ++ if ((xmlIndentTreeOutput) && (ctxt->format == 1)) ++ xmlOutputBufferWrite(buf, ctxt->indent_size * ++ (ctxt->level > ctxt->indent_nr ? 
++ ctxt->indent_nr : ctxt->level), ++ ctxt->indent); ++ ++ xmlOutputBufferWrite(buf, 2, "</"); + if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { + xmlOutputBufferWriteString(buf, +@@ -1077,11 +1077,11 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { + if (ctxt->format == 2) + xmlOutputBufferWriteWSNonSig(ctxt, 0); + xmlOutputBufferWrite(buf, 1, ">"); +- } + +- if (cur == unformattedNode) { +- ctxt->format = format; +- unformattedNode = NULL; ++ if (cur == unformattedNode) { ++ ctxt->format = format; ++ unformattedNode = NULL; ++ } + } + } + } +-- +1.8.3.1 + diff --git a/Remove-unused-encoding-parameter-of-HTML-output-func.patch b/Remove-unused-encoding-parameter-of-HTML-output-func.patch new file mode 100644 index 0000000..f5013ec --- /dev/null +++ b/Remove-unused-encoding-parameter-of-HTML-output-func.patch @@ -0,0 +1,133 @@ +From e6495e47890afacfc3513a9161671e8d228ccc76 Mon Sep 17 00:00:00 2001 +From: Nick Wellnhofer +Date: Sun, 7 Feb 2021 13:38:01 +0100 +Subject: [PATCH] Remove unused encoding parameter of HTML output functions + +The encoding string is unused. Encodings are set by way of the output +buffer. 
+--- + HTMLtree.c | 34 +++++++++++++++++----------------- + 1 file changed, 17 insertions(+), 17 deletions(-) + +diff --git a/HTMLtree.c b/HTMLtree.c +index 8d0c779..24434d4 100644 +--- a/HTMLtree.c ++++ b/HTMLtree.c +@@ -518,7 +518,7 @@ htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc, + buf = xmlOutputBufferCreateFile(out, handler); + if (buf == NULL) return(0); + +- htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format); ++ htmlNodeDumpFormatOutput(buf, doc, cur, NULL, format); + + ret = xmlOutputBufferClose(buf); + return(ret); +@@ -670,13 +670,11 @@ htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, + * @buf: the HTML buffer output + * @doc: the document + * @cur: the attribute pointer +- * @encoding: the encoding string + * + * Dump an HTML attribute + */ + static void +-htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, +- const char *encoding ATTRIBUTE_UNUSED) { ++htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) { + xmlChar *value; + + /* +@@ -737,14 +735,15 @@ htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, + * @buf: the HTML buffer output + * @doc: the document + * @cur: the current node +- * @encoding: the encoding string ++ * @encoding: the encoding string (unused) + * @format: should formatting spaces been added + * + * Dump an HTML node, recursive behaviour,children are printed too. 
+ */ + void + htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, +- xmlNodePtr cur, const char *encoding, int format) { ++ xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED, ++ int format) { + xmlNodePtr root; + xmlAttrPtr attr; + const htmlElemDesc * info; +@@ -788,7 +787,7 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, + xmlNsListDumpOutput(buf, cur->nsDef); + attr = cur->properties; + while (attr != NULL) { +- htmlAttrDumpOutput(buf, doc, attr, encoding); ++ htmlAttrDumpOutput(buf, doc, attr); + attr = attr->next; + } + +@@ -835,7 +834,7 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, + break; + + case XML_ATTRIBUTE_NODE: +- htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur, encoding); ++ htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur); + break; + + case HTML_TEXT_NODE: +@@ -955,44 +954,45 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, + * @buf: the HTML buffer output + * @doc: the document + * @cur: the current node +- * @encoding: the encoding string ++ * @encoding: the encoding string (unused) + * + * Dump an HTML node, recursive behaviour,children are printed too, + * and formatting returns/spaces are added. + */ + void + htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, +- xmlNodePtr cur, const char *encoding) { +- htmlNodeDumpFormatOutput(buf, doc, cur, encoding, 1); ++ xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED) { ++ htmlNodeDumpFormatOutput(buf, doc, cur, NULL, 1); + } + + /** + * htmlDocContentDumpFormatOutput: + * @buf: the HTML buffer output + * @cur: the document +- * @encoding: the encoding string ++ * @encoding: the encoding string (unused) + * @format: should formatting spaces been added + * + * Dump an HTML document. 
+ */ + void + htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, +- const char *encoding, int format) { +- htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, encoding, format); ++ const char *encoding ATTRIBUTE_UNUSED, ++ int format) { ++ htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, format); + } + + /** + * htmlDocContentDumpOutput: + * @buf: the HTML buffer output + * @cur: the document +- * @encoding: the encoding string ++ * @encoding: the encoding string (unused) + * + * Dump an HTML document. Formatting return/spaces are added. + */ + void + htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, +- const char *encoding) { +- htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, encoding, 1); ++ const char *encoding ATTRIBUTE_UNUSED) { ++ htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, 1); + } + + /************************************************************************ +-- +1.8.3.1 + diff --git a/Work-around-lxml-API-abuse.patch b/Work-around-lxml-API-abuse.patch new file mode 100644 index 0000000..8bb91b4 --- /dev/null +++ b/Work-around-lxml-API-abuse.patch @@ -0,0 +1,212 @@ +From 85b1792e37b131e7a51af98a37f92472e8de5f3f Mon Sep 17 00:00:00 2001 +From: Nick Wellnhofer +Date: Tue, 18 May 2021 20:08:28 +0200 +Subject: [PATCH] Work around lxml API abuse + +Make xmlNodeDumpOutput and htmlNodeDumpFormatOutput work with corrupted +parent pointers. This used to work with the old recursive code but the +non-recursive rewrite required parent pointers to be set correctly. + +Unfortunately, lxml relies on the old behavior and passes subtrees with +a corrupted structure. Fall back to a recursive function call if an +invalid parent pointer is detected. + +Fixes #255. 
+--- + HTMLtree.c | 46 ++++++++++++++++++++++++++++------------------ + xmlsave.c | 31 +++++++++++++++++++++---------- + 2 files changed, 49 insertions(+), 28 deletions(-) + +diff --git a/HTMLtree.c b/HTMLtree.c +index 24434d4..bdd639c 100644 +--- a/HTMLtree.c ++++ b/HTMLtree.c +@@ -744,7 +744,7 @@ void + htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, + xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED, + int format) { +- xmlNodePtr root; ++ xmlNodePtr root, parent; + xmlAttrPtr attr; + const htmlElemDesc * info; + +@@ -755,6 +755,7 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, + } + + root = cur; ++ parent = cur->parent; + while (1) { + switch (cur->type) { + case XML_HTML_DOCUMENT_NODE: +@@ -762,7 +763,9 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, + if (((xmlDocPtr) cur)->intSubset != NULL) { + htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL); + } +- if (cur->children != NULL) { ++ /* Always validate cur->parent when descending. */ ++ if ((cur->parent == parent) && (cur->children != NULL)) { ++ parent = cur; + cur = cur->children; + continue; + } +@@ -770,6 +773,16 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, + + case XML_ELEMENT_NODE: + /* ++ * Some users like lxml are known to pass nodes with a corrupted ++ * tree structure. Fall back to a recursive call to handle this ++ * case. ++ */ ++ if ((cur->parent != parent) && (cur->children != NULL)) { ++ htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format); ++ break; ++ } ++ ++ /* + * Get specific HTML info for that node. 
+ */ + if (cur->ns == NULL) +@@ -817,6 +830,7 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, + (cur->name != NULL) && + (cur->name[0] != 'p')) /* p, pre, param */ + xmlOutputBufferWriteString(buf, "\n"); ++ parent = cur; + cur = cur->children; + continue; + } +@@ -825,9 +839,9 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, + (info != NULL) && (!info->isinline)) { + if ((cur->next->type != HTML_TEXT_NODE) && + (cur->next->type != HTML_ENTITY_REF_NODE) && +- (cur->parent != NULL) && +- (cur->parent->name != NULL) && +- (cur->parent->name[0] != 'p')) /* p, pre, param */ ++ (parent != NULL) && ++ (parent->name != NULL) && ++ (parent->name[0] != 'p')) /* p, pre, param */ + xmlOutputBufferWriteString(buf, "\n"); + } + +@@ -842,9 +856,9 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, + break; + if (((cur->name == (const xmlChar *)xmlStringText) || + (cur->name != (const xmlChar *)xmlStringTextNoenc)) && +- ((cur->parent == NULL) || +- ((xmlStrcasecmp(cur->parent->name, BAD_CAST "script")) && +- (xmlStrcasecmp(cur->parent->name, BAD_CAST "style"))))) { ++ ((parent == NULL) || ++ ((xmlStrcasecmp(parent->name, BAD_CAST "script")) && ++ (xmlStrcasecmp(parent->name, BAD_CAST "style"))))) { + xmlChar *buffer; + + buffer = xmlEncodeEntitiesReentrant(doc, cur->content); +@@ -902,13 +916,9 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, + break; + } + +- /* +- * The parent should never be NULL here but we want to handle +- * corrupted documents gracefully. +- */ +- if (cur->parent == NULL) +- return; +- cur = cur->parent; ++ cur = parent; ++ /* cur->parent was validated when descending. 
*/ ++ parent = cur->parent; + + if ((cur->type == XML_HTML_DOCUMENT_NODE) || + (cur->type == XML_DOCUMENT_NODE)) { +@@ -939,9 +949,9 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, + (cur->next != NULL)) { + if ((cur->next->type != HTML_TEXT_NODE) && + (cur->next->type != HTML_ENTITY_REF_NODE) && +- (cur->parent != NULL) && +- (cur->parent->name != NULL) && +- (cur->parent->name[0] != 'p')) /* p, pre, param */ ++ (parent != NULL) && ++ (parent->name != NULL) && ++ (parent->name[0] != 'p')) /* p, pre, param */ + xmlOutputBufferWriteString(buf, "\n"); + } + } +diff --git a/xmlsave.c b/xmlsave.c +index 61a4045..aedbd5e 100644 +--- a/xmlsave.c ++++ b/xmlsave.c +@@ -847,7 +847,7 @@ htmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { + static void + xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { + int format = ctxt->format; +- xmlNodePtr tmp, root, unformattedNode = NULL; ++ xmlNodePtr tmp, root, unformattedNode = NULL, parent; + xmlAttrPtr attr; + xmlChar *start, *end; + xmlOutputBufferPtr buf; +@@ -856,6 +856,7 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { + buf = ctxt->buf; + + root = cur; ++ parent = cur->parent; + while (1) { + switch (cur->type) { + case XML_DOCUMENT_NODE: +@@ -868,7 +869,9 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { + break; + + case XML_DOCUMENT_FRAG_NODE: +- if (cur->children != NULL) { ++ /* Always validate cur->parent when descending. */ ++ if ((cur->parent == parent) && (cur->children != NULL)) { ++ parent = cur; + cur = cur->children; + continue; + } +@@ -887,7 +890,18 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { + break; + + case XML_ELEMENT_NODE: +- if ((cur != root) && (ctxt->format == 1) && (xmlIndentTreeOutput)) ++ /* ++ * Some users like lxml are known to pass nodes with a corrupted ++ * tree structure. Fall back to a recursive call to handle this ++ * case. 
++ */ ++ if ((cur->parent != parent) && (cur->children != NULL)) { ++ xmlNodeDumpOutputInternal(ctxt, cur); ++ break; ++ } ++ ++ if ((ctxt->level > 0) && (ctxt->format == 1) && ++ (xmlIndentTreeOutput)) + xmlOutputBufferWrite(buf, ctxt->indent_size * + (ctxt->level > ctxt->indent_nr ? + ctxt->indent_nr : ctxt->level), +@@ -942,6 +956,7 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { + xmlOutputBufferWrite(buf, 1, ">"); + if (ctxt->format == 1) xmlOutputBufferWrite(buf, 1, "\n"); + if (ctxt->level >= 0) ctxt->level++; ++ parent = cur; + cur = cur->children; + continue; + } +@@ -1058,13 +1073,9 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { + break; + } + +- /* +- * The parent should never be NULL here but we want to handle +- * corrupted documents gracefully. +- */ +- if (cur->parent == NULL) +- return; +- cur = cur->parent; ++ cur = parent; ++ /* cur->parent was validated when descending. */ ++ parent = cur->parent; + + if (cur->type == XML_ELEMENT_NODE) { + if (ctxt->level > 0) ctxt->level--; +-- +1.8.3.1 + diff --git a/encoding-fix-memleak-in-xmlRegisterCharEncodingHandl.patch b/encoding-fix-memleak-in-xmlRegisterCharEncodingHandl.patch new file mode 100644 index 0000000..317de2c --- /dev/null +++ b/encoding-fix-memleak-in-xmlRegisterCharEncodingHandl.patch @@ -0,0 +1,55 @@ +From 649d02eaa419fa72ae6b131718a4ac77063d7a5a Mon Sep 17 00:00:00 2001 +From: Xiaoming Ni +Date: Mon, 7 Dec 2020 20:19:53 +0800 +Subject: [PATCH] encoding: fix memleak in xmlRegisterCharEncodingHandler() + +The return type of xmlRegisterCharEncodingHandler() is void. The invoker +cannot determine whether xmlRegisterCharEncodingHandler() is executed +successfully. when nbCharEncodingHandler >= MAX_ENCODING_HANDLERS, the +"handler" is not added to the array "handlers". As a result, the memory +of "handler" cannot be managed and released: memory leakage. 
+ +so add "xmlfree(handler)" to fix memory leakage on the failure branch of +xmlRegisterCharEncodingHandler(). + +Reported-by: wuqing +Signed-off-by: Xiaoming Ni +--- + encoding.c | 13 +++++++++++-- + 1 file changed, 11 insertions(+), 2 deletions(-) + +diff --git a/encoding.c b/encoding.c +index 264f60b..d67c16d 100644 +--- a/encoding.c ++++ b/encoding.c +@@ -1488,16 +1488,25 @@ xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) { + if ((handler == NULL) || (handlers == NULL)) { + xmlEncodingErr(XML_I18N_NO_HANDLER, + "xmlRegisterCharEncodingHandler: NULL handler !\n", NULL); +- return; ++ goto free_handler; + } + + if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) { + xmlEncodingErr(XML_I18N_EXCESS_HANDLER, + "xmlRegisterCharEncodingHandler: Too many handler registered, see %s\n", + "MAX_ENCODING_HANDLERS"); +- return; ++ goto free_handler; + } + handlers[nbCharEncodingHandler++] = handler; ++ return; ++ ++free_handler: ++ if (handler != NULL) { ++ if (handler->name != NULL) { ++ xmlFree(handler->name); ++ } ++ xmlFree(handler); ++ } + } + + /** +-- +1.8.3.1 + diff --git a/libxml2.spec b/libxml2.spec index 6661d06..86b1183 100644 --- a/libxml2.spec +++ b/libxml2.spec @@ -1,7 +1,7 @@ Summary: Library providing XML and HTML support Name: libxml2 Version: 2.9.10 -Release: 21 +Release: 22 License: MIT Group: Development/Libraries Source: ftp://xmlsoft.org/libxml2/libxml2-%{version}.tar.gz @@ -93,7 +93,7 @@ Patch80: Remove-dead-code-in-xinclude.c.patch Patch81: Fix-regression-introduced-with-commit-74dcc10b.patch Patch82: Fix-regression-introduced-with-commit-d88df4b.patch Patch83: Make-xmlNodeDumpOutputInternal-non-recursive.patch -Patch84: Don-t-add-formatting-newlines-to-XInclude-nodes.patch +Patch84: Fix-NodeDumpOutput-functions.patch Patch85: Make-htmlNodeDumpFormatOutput-non-recursive.patch Patch86: Fix-memory-leaks-in-XPointer-string-range-function.patch Patch87: Fix-null-pointer-deref-in-xmlXPtrRangeInsideFunction.patch @@ -103,6 +103,20 @@ 
Patch90: Fix-XPath-recursion-limit.patch Patch91: Fix-Null-deref-in-xmlSchemaGetComponentTargetNs.patch Patch92: Fix-memleaks-in-xmlXIncludeProcessFlags.patch Patch93: Fix-heap-use-after-free-in-xmlAddNextSibling-and-xmlAddChild.patch +Patch94: Fix-unsigned-integer-overflow-in-htmlParseTryOrFinis.patch +Patch95: Fix-undefined-behavior-in-UTF16LEToUTF8.patch +Patch96: Fix-SEGV-in-xmlSAXParseFileWithData.patch +Patch97: encoding-fix-memleak-in-xmlRegisterCharEncodingHandl.patch +Patch98: Fix-null-deref-in-xmlStringGetNodeList.patch +Patch99: Fix-memory-leak-in-xmlParseElementMixedContentDecl.patch +Patch100: Fix-slow-parsing-of-HTML-with-encoding-errors.patch + +Patch101: More-NodeDumpOutput-fixes.patch +Patch102: Don-t-add-formatting-newlines-to-XInclude-nodes.patch +Patch103: Handle-dumps-of-corrupted-documents-more-gracefully.patch +Patch104: Remove-unused-encoding-parameter-of-HTML-output-func.patch +Patch105: Work-around-lxml-API-abuse.patch +Patch106: Fix-regression-in-xmlNodeDumpOutputInternal.patch BuildRoot: %{_tmppath}/%{name}-%{version}-root BuildRequires: python2-devel @@ -295,6 +309,12 @@ rm -fr %{buildroot} %changelog +* Thu Dec 2 2021 panxiaohe - 2.9.10-22 +- Type:bugfix +- ID:NA +- SUG:NA +- DESC:backport upstream patches + * Thu Nov 11 2021 panxiaohe - 2.9.10-21 - Type:bugfix - ID:NA -- Gitee