diff --git a/Binary-strings-preserve-UTF-8-and-UTF-16-errors.patch b/Binary-strings-preserve-UTF-8-and-UTF-16-errors.patch deleted file mode 100644 index 6863778adaa63ee495a2830f6446b82b2b3b9b31..0000000000000000000000000000000000000000 --- a/Binary-strings-preserve-UTF-8-and-UTF-16-errors.patch +++ /dev/null @@ -1,616 +0,0 @@ -From b2384ea878f484c48419fc0ec30380d0a5ffe3ce Mon Sep 17 00:00:00 2001 -From: Max Zerzouri -Date: Sat, 15 May 2021 08:32:27 +0000 -Subject: [PATCH] Binary strings: preserve UTF-8 and UTF-16 errors - -The internal string representation is changed from UTF-8 with replacement -characters to a modified form of "WTF-8" that is able to distinctly encode -UTF-8 errors and UTF-16 errors. - -This handles UTF-8 errors in raw string inputs and handles UTF-8 and UTF-16 -errors in JSON input. UTF-16 errors (using "\uXXXX") and UTF-8 errors (using -the original raw bytes) are maintained when emitting JSON. When emitting raw -strings, UTF-8 errors are maintained and UTF-16 errors are converted into -replacement characters. ---- - scripts/gen_utf8_tables.py | 3 +- - src/jv.c | 28 ++++++------ - src/jv.h | 1 + - src/jv_parse.c | 77 ++++++++++++++++++++++----------- - src/jv_print.c | 26 +++++++++++- - src/jv_unicode.c | 87 ++++++++++++++++++++++++++++++++++---- - src/jv_unicode.h | 11 +++++ - src/jv_utf8_tables.h | 4 +- - src/main.c | 29 ++++++++++++- - tests/jq.test | 5 +++ - tests/shtest | 9 ++++ - 11 files changed, 228 insertions(+), 52 deletions(-) - -diff --git a/scripts/gen_utf8_tables.py b/scripts/gen_utf8_tables.py -index 6fe0a53..7706462 100644 ---- a/scripts/gen_utf8_tables.py -+++ b/scripts/gen_utf8_tables.py -@@ -16,8 +16,7 @@ def print_table(type, name, t): - def utf8info(c): - if c < 0x80: return 1, mask(7) - if 0x80 <= c <= 0xBF: return 255, mask(6) -- if 0xC0 <= c <= 0xC1: return 0, 0 -- if 0xC2 <= c <= 0xDF: return 2, mask(5) -+ if 0xC0 <= c <= 0xDF: return 2, mask(5) - if 0xE0 <= c <= 0xEF: return 3, mask(4) - if 0xF0 <= c <= 0xF4: return 4, mask(3) - if 0xF4 <= c <= 0xFF: return 0, 0 -diff --git a/src/jv.c b/src/jv.c -index 1f1029e..e979cc6 100644 ---- a/src/jv.c -+++ b/src/jv.c -@@ -452,20 +452,24 @@ static jvp_string* jvp_string_alloc(uint32_t size) { - return s; - } - --/* Copy a UTF8 string, replacing all badly encoded points with U+FFFD */ -+/* Copy a UTF8 string, using WTF-8b to replace all UTF-8 errors */ - static jv jvp_string_copy_replace_bad(const char* data, uint32_t length) { - const char* end = data + length; - const char* i = data; - const char* cstart; - -- uint32_t maxlength = length * 3 + 1; // worst case: all bad bytes, each becomes a 3-byte U+FFFD -+ uint32_t maxlength = length * 2 + 1; // worst case: all bad bytes, each becomes a 2-byte overlong U+XX - jvp_string* s = jvp_string_alloc(maxlength); - char* out = s->data; - int c = 0; - -- while ((i = jvp_utf8_next((cstart = i), end, &c))) { -+ while ((i = jvp_utf8_extended_next((cstart = i), end, 0, &c))) { - if (c == -1) { -- c = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER -+ int error = (unsigned char)*cstart; -+ assert(error >= 0x80 && error <= 0xFF); -+ c = -error; -+ /* Ensure each UTF-8 error byte is consumed separately */ -+ i = cstart + 1; - } - out += jvp_utf8_encode(c, out); - assert(out < s->data + maxlength); -@@ -477,8 +481,8 @@ static jv jvp_string_copy_replace_bad(const char* data, uint32_t length) { - return r; - } - --/* Assumes valid UTF8 */ --static jv jvp_string_new(const char* data, uint32_t length) { -+/* Assumes valid WTF-8b */ -+jv jv_string_extended_sized(const char* data, int length) { - jvp_string* s = jvp_string_alloc(length); - s->length_hashed = length << 1; - if (data != NULL) -@@ -618,7 +622,7 @@ static int jvp_string_equal(jv a, jv b) { - jv jv_string_sized(const char* str, int len) { - return - jvp_utf8_is_valid(str, str+len) ? -- jvp_string_new(str, len) : -+ jv_string_extended_sized(str, len) : - jvp_string_copy_replace_bad(str, len); - } - -@@ -682,14 +686,14 @@ jv jv_string_split(jv j, jv sep) { - - if (seplen == 0) { - int c; -- while ((jstr = jvp_utf8_next(jstr, jend, &c))) -+ while ((jstr = jvp_utf8_extended_next(jstr, jend, JVP_UTF8_ERRORS_ALL, &c))) - a = jv_array_append(a, jv_string_append_codepoint(jv_string(""), c)); - } else { - for (p = jstr; p < jend; p = s + seplen) { - s = _jq_memmem(p, jend - p, sepstr, seplen); - if (s == NULL) - s = jend; -- a = jv_array_append(a, jv_string_sized(p, s - p)); -+ a = jv_array_append(a, jv_string_extended_sized(p, s - p)); - // Add an empty string to denote that j ends on a sep - if (s + seplen == jend && seplen != 0) - a = jv_array_append(a, jv_string("")); -@@ -760,7 +764,7 @@ jv jv_string_slice(jv j, int start, int end) { - - /* Look for byte offset corresponding to start codepoints */ - for (p = s, i = 0; i < start; i++) { -- p = jvp_utf8_next(p, s + len, &c); -+ p = jvp_utf8_extended_next(p, s + len, JVP_UTF8_ERRORS_ALL, &c); - if (p == NULL) { - jv_free(j); - return jv_string_empty(16); -@@ -772,7 +776,7 @@ jv jv_string_slice(jv j, int start, int end) { - } - /* Look for byte offset corresponding to end codepoints */ - for (e = p; e != NULL && i < end; i++) { -- e = jvp_utf8_next(e, s + len, &c); -+ e = jvp_utf8_extended_next(e, s + len, JVP_UTF8_ERRORS_ALL, &c); - if (e == NULL) { - e = s + len; - break; -@@ -790,7 +794,7 @@ jv jv_string_slice(jv j, int start, int end) { - * memory like a drunken navy programmer. There's probably nothing we - * can do about it. - */ -- res = jv_string_sized(p, e - p); -+ res = jv_string_extended_sized(p, e - p); - jv_free(j); - return res; - } -diff --git a/src/jv.h b/src/jv.h -index d111c80..2aed1ae 100644 ---- a/src/jv.h -+++ b/src/jv.h -@@ -104,6 +104,7 @@ jv jv_array_indexes(jv, jv); - - jv jv_string(const char*); - jv jv_string_sized(const char*, int); -+jv jv_string_extended_sized(const char*, int); - jv jv_string_empty(int len); - int jv_string_length_bytes(jv); - int jv_string_length_codepoints(jv); -diff --git a/src/jv_parse.c b/src/jv_parse.c -index 51ad9f0..194efaf 100644 ---- a/src/jv_parse.c -+++ b/src/jv_parse.c -@@ -397,7 +397,7 @@ static void tokenadd(struct jv_parser* p, char c) { - p->tokenbuf[p->tokenpos++] = c; - } - --static int unhex4(char* hex) { -+static int unhex4(const char* hex) { - int r = 0; - for (int i=0; i<4; i++) { - char c = *hex++; -@@ -413,15 +413,19 @@ static int unhex4(char* hex) { - } - - static pfunc found_string(struct jv_parser* p) { -- char* in = p->tokenbuf; -- char* out = p->tokenbuf; -- char* end = p->tokenbuf + p->tokenpos; -- -- while (in < end) { -- char c = *in++; -+ const char* in = p->tokenbuf; -+ // start by writing to tokenbuf, only allocate in case that output size is greater than input size (possible only when input has UTF-8 errors) -+ char* newbuf = NULL; -+ char* buf = p->tokenbuf; -+ char* out = buf; -+ const char* end = p->tokenbuf + p->tokenpos; -+ const char* cstart; -+ int c; -+ -+ while ((in = jvp_utf8_extended_next((cstart = in), end, 0, &c))) { - if (c == '\\') { - if (in >= end) -- return "Expected escape character at end of string"; -+ return jv_mem_free(newbuf), "Expected escape character at end of string"; - c = *in++; - switch (c) { - case '\\': -@@ -436,38 +440,61 @@ static pfunc found_string(struct jv_parser* p) { - case 'u': - /* ahh, the complicated case */ - if (in + 4 > end) -- return "Invalid \\uXXXX escape"; -+ return jv_mem_free(newbuf), "Invalid \\uXXXX escape"; - int hexvalue = unhex4(in); - if (hexvalue < 0) -- return "Invalid characters in \\uXXXX escape"; -+ return jv_mem_free(newbuf), "Invalid characters in \\uXXXX escape"; - unsigned long codepoint = (unsigned long)hexvalue; - in += 4; -+ // leading surrogate - if (0xD800 <= codepoint && codepoint <= 0xDBFF) { -- /* who thought UTF-16 surrogate pairs were a good idea? */ -- if (in + 6 > end || in[0] != '\\' || in[1] != 'u') -- return "Invalid \\uXXXX\\uXXXX surrogate pair escape"; -- unsigned long surrogate = unhex4(in+2); -- if (!(0xDC00 <= surrogate && surrogate <= 0xDFFF)) -- return "Invalid \\uXXXX\\uXXXX surrogate pair escape"; -- in += 6; -- codepoint = 0x10000 + (((codepoint - 0xD800) << 10) -- |(surrogate - 0xDC00)); -+ // look ahead for trailing surrogate and decode as UTF-16, otherwise encode this lone surrogate as WTF-8 -+ if (in + 6 <= end && in[0] == '\\' && in[1] == 'u') { -+ unsigned long surrogate = unhex4(in+2); -+ if (0xDC00 <= surrogate && surrogate <= 0xDFFF) { -+ in += 6; -+ codepoint = 0x10000 + (((codepoint - 0xD800) << 10) -+ |(surrogate - 0xDC00)); -+ } -+ } - } -- if (codepoint > 0x10FFFF) -- codepoint = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER -+ // UTF-16 surrogates can not encode a greater codepoint -+ assert(codepoint <= 0x10FFFF); -+ // NOTE: a leading or trailing surrogate here (0xD800 <= codepoint && codepoint <= 0xDFFF) is encoded as WTF-8 - out += jvp_utf8_encode(codepoint, out); - break; - - default: -- return "Invalid escape"; -+ return jv_mem_free(newbuf), "Invalid escape"; - } - } else { - if (c > 0 && c < 0x001f) -- return "Invalid string: control characters from U+0000 through U+001F must be escaped"; -- *out++ = c; -+ return jv_mem_free(newbuf), "Invalid string: control characters from U+0000 through U+001F must be escaped"; -+ if (c == -1) { -+ int error = (unsigned char)*cstart; -+ assert(error >= 0x80 && error <= 0xFF); -+ c = -error; -+ /* Ensure each UTF-8 error byte is consumed separately */ -+ const int wtf8_length = 2; -+ assert(jvp_utf8_encode_length(c) == wtf8_length); -+ in = cstart + 1; -+ if (newbuf == NULL && out + wtf8_length > in) { -+ /* Output is about to overflow input, move output to temporary buffer */ -+ int current_size = out - p->tokenbuf; -+ int remaining = end - cstart; -+ newbuf = jv_mem_alloc(current_size + remaining * wtf8_length); // worst case: all remaining bad bytes, each becomes a 2-byte overlong U+XX -+ memcpy(newbuf, buf, current_size); -+ buf = newbuf; -+ out = buf + current_size; -+ } -+ } else -+ assert(jvp_utf8_encode_length(c) == in - cstart); -+ out += jvp_utf8_encode(c, out); - } - } -- TRY(value(p, jv_string_sized(p->tokenbuf, out - p->tokenbuf))); -+ jv v = jv_string_extended_sized(buf, out - buf); -+ jv_mem_free(newbuf); -+ TRY(value(p, v)); - p->tokenpos = 0; - return 0; - } -diff --git a/src/jv_print.c b/src/jv_print.c -index 5ebc01e..dfa1f05 100644 ---- a/src/jv_print.c -+++ b/src/jv_print.c -@@ -98,6 +98,16 @@ static void put_char(char c, FILE* fout, jv* strout, int T) { - put_buf(&c, 1, fout, strout, T); - } - -+static void put_invalid_utf8_byte(int c, FILE* fout, jv* strout, int T) { -+ assert(c >= 0x80 && c <= 0xFF); -+ if (strout) { -+ // encode as an invalid UTF-8 byte in output -+ *strout = jv_string_append_codepoint(*strout, -c); -+ } else { -+ put_char(c, fout, strout, T); -+ } -+} -+ - static void put_str(const char* s, FILE* fout, jv* strout, int T) { - put_buf(s, strlen(s), fout, strout, T); - } -@@ -121,7 +131,7 @@ static void jvp_dump_string(jv str, int ascii_only, FILE* F, jv* S, int T) { - int c = 0; - char buf[32]; - put_char('"', F, S, T); -- while ((i = jvp_utf8_next((cstart = i), end, &c))) { -+ while ((i = jvp_utf8_extended_next((cstart = i), end, JVP_UTF8_ERRORS_ALL, &c))) { - assert(c != -1); - int unicode_escape = 0; - if (0x20 <= c && c <= 0x7E) { -@@ -130,6 +140,17 @@ static void jvp_dump_string(jv str, int ascii_only, FILE* F, jv* S, int T) { - put_char('\\', F, S, T); - } - put_char(c, F, S, T); -+ } else if (c >= -0xFF && c <= -0x80) { -+ // Invalid UTF-8 byte -+ if (ascii_only) { -+ // refusing to emit invalid UTF-8 -+ // TODO: convince the world to adopt a "\xXX" notation for JSON? -+ c = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER -+ unicode_escape = 1; -+ } else { -+ // pass through -+ put_invalid_utf8_byte(-c, F, S, T); -+ } - } else if (c < 0x20 || c == 0x7F) { - // ASCII control character - switch (c) { -@@ -160,6 +181,9 @@ static void jvp_dump_string(jv str, int ascii_only, FILE* F, jv* S, int T) { - } else { - if (ascii_only) { - unicode_escape = 1; -+ } else if (c >= 0xD800 && c <= 0xDFFF) { -+ // lone surrogate; can't be encoded to UTF-8 -+ unicode_escape = 1; - } else { - put_buf(cstart, i - cstart, F, S, T); - } -diff --git a/src/jv_unicode.c b/src/jv_unicode.c -index d197349..8c47536 100644 ---- a/src/jv_unicode.c -+++ b/src/jv_unicode.c -@@ -27,6 +27,56 @@ const char* jvp_utf8_backtrack(const char* start, const char* min, int *missing_ - } - - const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) { -+ return jvp_utf8_extended_next(in, end, JVP_UTF8_REPLACE, codepoint_ret); -+} -+ -+/* -+ The internal representation of jv strings uses an encoding that is hereby -+ referred to as "WTF-8b" (until someone demonstrates use of another term to -+ refer to the same encoding). -+ -+ WTF-8b is an extension of WTF-8, which is an extension of UTF-8. Any sequence -+ of Unicode scalar values is represented by the same bytes in UTF-8, WTF-8 and -+ WTF-8b, therefore any well-formed UTF-8 string is interpreted as the same -+ sequence of Unicode scalar values (roughly, code points) in WTF-8b. -+ -+ Like WTF-8, WTF-8b is able to encode UTF-16 errors (lone surrogates) using -+ the "generalized UTF-8" representation of code points between U+D800 and -+ U+DFFF. These errors occur in JSON terms such as: -+ "_\uD8AB_\uDBCD_" -+ -+ Unlike WTF-8, WTF-8b is also able to encode UTF-8 errors (bytes 0x80 to 0xFF -+ that are not part of a valid UTF-8 sequence) using the first 128 "overlong" -+ codings (unused 2-byte representations of U+00 to U+7F). These errors can -+ occur in any byte stream that is interpreted as UTF-8, for example: -+ "\xED\xA2\xAB" -+ The above example is in fact the WTF-8b (and WTF-8) encoding for the lone -+ UTF-16 surrogate "\uD8AB", which demonstrates the need for a distinct -+ encoding of UTF-8 errors. If a distinction were not made, then "\xED\xA2\xAB" -+ and "\uD8AB" would be interpreted as the same string, so at least one of the -+ forms would not be preserved when printed as JSON output. -+ -+ It should also be noted that the process of converting from invalid UTF-8 to -+ WTF-8b is not (and can not be) idempotent, since the "generalised UTF-8" -+ representation of UTF-16 surrogates are intentionally not able to be -+ generated from invalid UTF-8, only through some other means (usually "\uXXXX" -+ notation). -+ -+ Each UTF-16 error is encoded as 3 WTF-8b (or WTF-8) bytes. -+ Each UTF-8 error is encoded as 2 WTF-8b bytes. -+ -+ When iterating over code points using `JVP_UTF8_ERRORS_UTF16`, encoded UTF-16 -+ errors are emitted in the form of code points in the range U+D800 to U+DFFF. -+ These code points can be reencoded as usual using `jvp_utf8_encode`. -+ -+ When iterating over code points using `JVP_UTF8_ERRORS_UTF8`, encoded UTF-8 -+ errors are emitted in the form of code points in the negative range -0x80 to -+ -0xFF. These negative code points can be negated to determine the original -+ error bytes. These code points can be reencoded as usual using -+ `jvp_utf8_encode`. -+*/ -+ -+const char* jvp_utf8_extended_next(const char* in, const char* end, enum jvp_utf8_flags flags, int* codepoint_ret) { - assert(in <= end); - if (in == end) { - return 0; -@@ -40,9 +90,11 @@ const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) { - length = 1; - } else if (length == 0 || length == UTF8_CONTINUATION_BYTE) { - /* Bad single byte - either an invalid byte or an out-of-place continuation byte */ -+ if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: bad single byte"); - length = 1; - } else if (in + length > end) { - /* String ends before UTF8 sequence ends */ -+ if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: underrun"); - length = end - in; - } else { - codepoint = ((unsigned)in[0]) & utf8_coding_bits[first]; -@@ -50,6 +102,7 @@ const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) { - unsigned ch = (unsigned char)in[i]; - if (utf8_coding_length[ch] != UTF8_CONTINUATION_BYTE){ - /* Invalid UTF8 sequence - not followed by the right number of continuation bytes */ -+ if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: wrong bytes"); - codepoint = -1; - length = i; - break; -@@ -58,17 +111,29 @@ const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) { - } - if (codepoint < utf8_first_codepoint[length]) { - /* Overlong UTF8 sequence */ -- codepoint = -1; -+ if ((flags & JVP_UTF8_ERRORS_UTF8) && 0x00 <= codepoint && codepoint <= 0x7F) { -+ /* UTF-8 error is emitted as a negative codepoint */ -+ codepoint = -(codepoint + 0x80); -+ } else { -+ if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: overlong"); -+ codepoint = -1; -+ } - } - if (0xD800 <= codepoint && codepoint <= 0xDFFF) { -- /* Surrogate codepoints can't be encoded in UTF8 */ -- codepoint = -1; -+ /* Surrogate codepoints are allowed in WTF-8/WTF-8b */ -+ if (!(flags & JVP_UTF8_ERRORS_UTF16)) { -+ /* Surrogate codepoints can't be encoded in UTF8 */ -+ codepoint = -1; -+ } - } - if (codepoint > 0x10FFFF) { - /* Outside Unicode range */ -+ if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: out of range"); - codepoint = -1; - } - } -+ if (codepoint == -1 && (flags & JVP_UTF8_REPLACE)) -+ codepoint = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER - assert(length > 0); - *codepoint_ret = codepoint; - return in + length; -@@ -76,7 +141,7 @@ const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) { - - int jvp_utf8_is_valid(const char* in, const char* end) { - int codepoint; -- while ((in = jvp_utf8_next(in, end, &codepoint))) { -+ while ((in = jvp_utf8_extended_next(in, end, 0, &codepoint))) { - if (codepoint == -1) return 0; - } - return 1; -@@ -91,20 +156,24 @@ int jvp_utf8_decode_length(char startchar) { - } - - int jvp_utf8_encode_length(int codepoint) { -- if (codepoint <= 0x7F) return 1; -+ if (codepoint >= 0 && codepoint <= 0x7F) return 1; - else if (codepoint <= 0x7FF) return 2; - else if (codepoint <= 0xFFFF) return 3; - else return 4; - } - - int jvp_utf8_encode(int codepoint, char* out) { -- assert(codepoint >= 0 && codepoint <= 0x10FFFF); -+ assert((codepoint >= 0 && codepoint <= 0x10FFFF) || (codepoint >= -0xFF && codepoint <= -0x80)); - char* start = out; -- if (codepoint <= 0x7F) { -+ if (codepoint >= 0 && codepoint <= 0x7F) { - *out++ = codepoint; - } else if (codepoint <= 0x7FF) { -- *out++ = 0xC0 + ((codepoint & 0x7C0) >> 6); -- *out++ = 0x80 + ((codepoint & 0x03F)); -+ // encode UTF-8 errors as overlong representations of U+00 to U+7F -+ int cp = codepoint >= -0xFF && codepoint <= -0x80? -+ -codepoint - 0x80 : -+ codepoint; -+ *out++ = 0xC0 + ((cp & 0x7C0) >> 6); -+ *out++ = 0x80 + ((cp & 0x03F)); - } else if(codepoint <= 0xFFFF) { - *out++ = 0xE0 + ((codepoint & 0xF000) >> 12); - *out++ = 0x80 + ((codepoint & 0x0FC0) >> 6); -diff --git a/src/jv_unicode.h b/src/jv_unicode.h -index 558721a..37c7fc0 100644 ---- a/src/jv_unicode.h -+++ b/src/jv_unicode.h -@@ -1,7 +1,18 @@ - #ifndef JV_UNICODE_H - #define JV_UNICODE_H - -+enum jvp_utf8_flags { -+ /* Emit replacement character instead of -1 for errors */ -+ JVP_UTF8_REPLACE = 1, -+ /* Treat input as WTF-8b, emit 0xD800 to 0xDFFF to denote encoded UTF-16 errors */ -+ JVP_UTF8_ERRORS_UTF16 = 2, -+ /* Treat input as WTF-8b, emit -0x80 to -0xFF to denote encoded UTF-8 errors */ -+ JVP_UTF8_ERRORS_UTF8 = 4, -+ JVP_UTF8_ERRORS_ALL = JVP_UTF8_ERRORS_UTF16 | JVP_UTF8_ERRORS_UTF8 -+}; -+ - const char* jvp_utf8_backtrack(const char* start, const char* min, int *missing_bytes); -+const char* jvp_utf8_extended_next(const char* in, const char* end, enum jvp_utf8_flags flags, int* codepoint); - const char* jvp_utf8_next(const char* in, const char* end, int* codepoint); - int jvp_utf8_is_valid(const char* in, const char* end); - -diff --git a/src/jv_utf8_tables.h b/src/jv_utf8_tables.h -index f1a4252..7c68749 100644 ---- a/src/jv_utf8_tables.h -+++ b/src/jv_utf8_tables.h -@@ -12,7 +12,7 @@ static const unsigned char utf8_coding_length[] = - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, -- 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, -+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, - 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, - 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, - 0x04, 0x04, 0x04, 0x04, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; -@@ -29,7 +29,7 @@ static const unsigned char utf8_coding_bits[] = - 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, - 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, - 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, -- 0x00, 0x00, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, -+ 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, - 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, - 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, - 0x07, 0x07, 0x07, 0x07, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; -diff --git a/src/main.c b/src/main.c -index b154689..5fa5c4f 100644 ---- a/src/main.c -+++ b/src/main.c -@@ -30,6 +30,7 @@ - #include "jv.h" - #include "jq.h" - #include "jv_alloc.h" -+#include "jv_unicode.h" - #include "util.h" - #include "src/version.h" - -@@ -161,6 +162,30 @@ static const char *skip_shebang(const char *p) { - return n+1; - } - -+static void jvp_dump_raw_string(const char* start, const char* end, FILE* f) { -+ static const unsigned char UTF8_REPLACEMENT[] = {0xEF,0xBF,0xBD}; // U+FFFD REPLACEMENT CHARACTER -+ -+ const char* i = start; -+ const char* cstart; -+ int c; -+ -+ while ((i = jvp_utf8_extended_next((cstart = i), end, JVP_UTF8_ERRORS_ALL, &c))) { -+ if (c >= -0xFF && c <= -0x80) { -+ // invalid UTF-8 byte; pass through -+ fwrite(start, 1, cstart - start, f); -+ start = i; -+ fputc(-c, f); -+ } else if ((c >= 0xD800 && c <= 0xDFFF) || c == -1) { -+ // lone surrugate; can't be encoded to UTF-8 -+ fwrite(start, 1, cstart - start, f); -+ start = i; -+ fwrite(UTF8_REPLACEMENT, 1, sizeof(UTF8_REPLACEMENT), f); -+ } else -+ continue; -+ } -+ fwrite(start, 1, end - start, f); -+} -+ - static int process(jq_state *jq, jv value, int flags, int dumpopts) { - int ret = 14; // No valid results && -e -> exit(4) - jq_start(jq, value, flags); -@@ -170,7 +195,9 @@ static int process(jq_state *jq, jv value, int flags, int dumpopts) { - if (options & ASCII_OUTPUT) { - jv_dumpf(result, stdout, JV_PRINT_ASCII); - } else { -- fwrite(jv_string_value(result), 1, jv_string_length_bytes(jv_copy(result)), stdout); -+ const char *start = jv_string_value(result); -+ const char *end = start + jv_string_length_bytes(jv_copy(result)); -+ jvp_dump_raw_string(start, end, stdout); - } - ret = 0; - jv_free(result); -diff --git a/tests/jq.test b/tests/jq.test -index 7e2dd43..c882fd2 100644 ---- a/tests/jq.test -+++ b/tests/jq.test -@@ -57,6 +57,11 @@ null - "Aa\r\n\t\b\f\u03bc" - "Aa\u000d\u000a\u0009\u0008\u000c\u03bc" - -+# Check that unpaired surrogates are preserved in output -+"\u2200\ud800\u2203\udc00\u2205\udfff" -+null -+"∀\ud800∃\udc00∅\udfff" -+ - "inter\("pol" + "ation")" - null - "interpolation" -diff --git a/tests/shtest b/tests/shtest -index 86fec33..4c8b57e 100755 ---- a/tests/shtest -+++ b/tests/shtest -@@ -130,6 +130,15 @@ printf "[1,2][3,4]\n" | $JQ -cs add > $d/out 2>&1 - cmp $d/out $d/expected - - -+clean=false -+# Invalid UTF-8 bytes are preserved when encoding/decoding JSON -+dd if=/dev/urandom bs=1024 count=1024 >$d/rand 2>/dev/null -+$VALGRIND $Q $JQ -sR . $d/rand >$d/out.json -+$VALGRIND $Q $JQ -j . $d/out.json >$d/out -+cmp $d/out $d/rand -+clean=true -+ -+ - ## Test streaming parser - - ## If we add an option to stream to the `import ... as $symbol;` directive diff --git a/CVE-2024-53427-pre.patch b/CVE-2024-53427-pre.patch new file mode 100644 index 0000000000000000000000000000000000000000..4fc5a632205bed626c5ed3035defdda402513ac3 --- /dev/null +++ b/CVE-2024-53427-pre.patch @@ -0,0 +1,68 @@ +From b86ff49f46a4a37e5a8e75a140cb5fd6e1331384 Mon Sep 17 00:00:00 2001 +From: itchyny +Date: Sun, 16 Feb 2025 22:08:36 +0900 +Subject: [PATCH] fix: `jv_number_value` should cache the double value of + literal numbers (#3245) + +The code of `jv_number_value` is intended to cache the double value of +literal numbers, but it does not work because it accepts the `jv` struct +by value. This patch fixes the behavior by checking if the double value +is `NaN`, which indicates the unconverted value. This patch improves the +performance of major use cases; e.g. `range(1000000)` runs 25% faster. + +Origin: https://github.com/jqlang/jq/commit/b86ff49f46a4a37e5a8e75a140cb5fd6e1331384 +--- + src/jv.c | 14 +++++++------- + 1 file changed, 7 insertions(+), 7 deletions(-) + +diff --git a/src/jv.c b/src/jv.c +index e23d8ec..9329eae 100644 +--- a/src/jv.c ++++ b/src/jv.c +@@ -206,9 +206,6 @@ enum { + JVP_NUMBER_DECIMAL = 1 + }; + +-#define JV_NUMBER_SIZE_INIT (0) +-#define JV_NUMBER_SIZE_CONVERTED (1) +- + #define JVP_FLAGS_NUMBER_NATIVE JVP_MAKE_FLAGS(JV_KIND_NUMBER, JVP_MAKE_PFLAGS(JVP_NUMBER_NATIVE, 0)) + #define JVP_FLAGS_NUMBER_LITERAL JVP_MAKE_FLAGS(JV_KIND_NUMBER, JVP_MAKE_PFLAGS(JVP_NUMBER_DECIMAL, 1)) + +@@ -589,8 +586,12 @@ static jv jvp_literal_number_new(const char * literal) { + jv_mem_free(n); + return JV_INVALID; + } ++ if (decNumberIsNaN(&n->num_decimal)) { ++ jv_mem_free(n); ++ return jv_number(NAN); ++ } + +- jv r = {JVP_FLAGS_NUMBER_LITERAL, 0, 0, JV_NUMBER_SIZE_INIT, {&n->refcnt}}; ++ jv r = {JVP_FLAGS_NUMBER_LITERAL, 0, 0, 0, {&n->refcnt}}; + return r; + } + +@@ -698,9 +699,8 @@ double jv_number_value(jv j) { + if (JVP_HAS_FLAGS(j, JVP_FLAGS_NUMBER_LITERAL)) { + jvp_literal_number* n = jvp_literal_number_ptr(j); + +- if (j.size != JV_NUMBER_SIZE_CONVERTED) { ++ if (isnan(n->num_double)) { + n->num_double = jvp_literal_number_to_double(j); +- j.size = JV_NUMBER_SIZE_CONVERTED; + } + + return n->num_double; +@@ -731,7 +731,7 @@ int jvp_number_is_nan(jv n) { + return decNumberIsNaN(pdec); + } + #endif +- return n.u.number != n.u.number; ++ return isnan(n.u.number); + } + + int jvp_number_cmp(jv a, jv b) { +-- +2.48.1 + diff --git a/CVE-2024-53427.patch b/CVE-2024-53427.patch new file mode 100644 index 0000000000000000000000000000000000000000..6046eba7e4b4f000e931fb40f518999880dc212f --- /dev/null +++ b/CVE-2024-53427.patch @@ -0,0 +1,77 @@ +From a09a4dfd55e6c24d04b35062ccfe4509748b1dd3 Mon Sep 17 00:00:00 2001 +From: itchyny +Date: Wed, 5 Mar 2025 07:43:54 +0900 +Subject: [PATCH] Reject NaN with payload while parsing JSON + +This commit drops support for parsing NaN with payload in JSON like +`NaN123` and fixes CVE-2024-53427. Other JSON extensions like `NaN` and +`Infinity` are still supported. Fixes #3023, fixes #3196, fixes #3246. + +Origin: https://github.com/jqlang/jq/commit/a09a4dfd55e6c24d04b35062ccfe4509748b1dd3 +--- + src/jv.c | 5 +++++ + tests/jq.test | 14 ++++++++++---- + tests/shtest | 5 ----- + 3 files changed, 15 insertions(+), 9 deletions(-) + +diff --git a/src/jv.c b/src/jv.c +index 9329eae..e26f74d 100644 +--- a/src/jv.c ++++ b/src/jv.c +@@ -587,6 +587,11 @@ static jv jvp_literal_number_new(const char * literal) { + return JV_INVALID; + } + if (decNumberIsNaN(&n->num_decimal)) { ++ // Reject NaN with payload. ++ if (n->num_decimal.digits > 1 || *n->num_decimal.lsu != 0) { ++ jv_mem_free(n); ++ return JV_INVALID; ++ } + jv_mem_free(n); + return jv_number(NAN); + } +diff --git a/tests/jq.test b/tests/jq.test +index 7036df2..d052b22 100644 +--- a/tests/jq.test ++++ b/tests/jq.test +@@ -1938,11 +1938,17 @@ tojson | fromjson + {"a":nan} + {"a":null} + +-# also "nan with payload" #2985 +-fromjson | isnan +-"nan1234" ++# NaN with payload is not parsed ++.[] | try (fromjson | isnan) catch . ++["NaN","-NaN","NaN1","NaN10","NaN100","NaN1000","NaN10000","NaN100000"] + true +- ++true ++"Invalid numeric literal at EOF at line 1, column 4 (while parsing 'NaN1')" ++"Invalid numeric literal at EOF at line 1, column 5 (while parsing 'NaN10')" ++"Invalid numeric literal at EOF at line 1, column 6 (while parsing 'NaN100')" ++"Invalid numeric literal at EOF at line 1, column 7 (while parsing 'NaN1000')" ++"Invalid numeric literal at EOF at line 1, column 8 (while parsing 'NaN10000')" ++"Invalid numeric literal at EOF at line 1, column 9 (while parsing 'NaN100000')" + + # calling input/0, or debug/0 in a test doesn't crash jq + +diff --git a/tests/shtest b/tests/shtest +index 14aafbf..a471889 100755 +--- a/tests/shtest ++++ b/tests/shtest +@@ -594,11 +594,6 @@ if ! x=$($JQ -n "1 # foo$cr + 2") || [ "$x" != 1 ]; then + exit 1 + fi + +-# CVE-2023-50268: No stack overflow comparing a nan with a large payload +-$VALGRIND $Q $JQ '1 != .' <<\EOF >/dev/null +-Nan4000 +-EOF +- + # Allow passing the inline jq script before -- #2919 + if ! r=$($JQ --args -rn -- '$ARGS.positional[0]' bar) || [ "$r" != bar ]; then + echo "passing the inline script after -- didn't work" +-- +2.48.1 + diff --git a/Correct-UTF-8-and-UTF-16-errors-during-concatenation.patch b/Correct-UTF-8-and-UTF-16-errors-during-concatenation.patch deleted file mode 100644 index 6ceeda158004badc622261d49bed3e0b1d96fa42..0000000000000000000000000000000000000000 --- a/Correct-UTF-8-and-UTF-16-errors-during-concatenation.patch +++ /dev/null @@ -1,388 +0,0 @@ -From 8829368f14943b8d2674c75805b27e56a569ad2c Mon Sep 17 00:00:00 2001 -From: Max Zerzouri -Date: Tue, 25 May 2021 22:59:59 +1200 -Subject: [PATCH] Correct UTF-8 and UTF-16 errors during concatenation - -UTF-8 errors and UTF-16 errors that were previously encoded into the -ends of -strings will now potentially be used to form correct code points. - -This is mostly a matter of making string equality behave expectedly, since -without this normalisation, it is possible to produce `jv` strings that are -converted to UTF-8 or UTF-16 the same way but are not equal due well-formed -code units that may or may not be encoded as errors. ---- - src/jv.c | 13 ++- - src/jv_unicode.c | 248 ++++++++++++++++++++++++++++++++++++++--------- - src/jv_unicode.h | 3 + - tests/jq.test | 15 +++ - 4 files changed, 230 insertions(+), 49 deletions(-) - -diff --git a/src/jv.c b/src/jv.c -index e979cc6..67d86fb 100644 ---- a/src/jv.c -+++ b/src/jv.c -@@ -522,20 +522,27 @@ static jv jvp_string_append(jv string, const char* data, uint32_t len) { - jvp_string* s = jvp_string_ptr(string); - uint32_t currlen = jvp_string_length(s); - -+ char join_buf[4]; -+ int join_len = jvp_utf8_extended_join(s->data, &currlen, &data, &len, join_buf); -+ - if (jvp_refcnt_unshared(string.u.ptr) && -- jvp_string_remaining_space(s) >= len) { -+ jvp_string_remaining_space(s) >= join_len + len) { - // the next string fits at the end of a -+ memcpy(s->data + currlen, join_buf, join_len); -+ currlen += join_len; - memcpy(s->data + currlen, data, len); - s->data[currlen + len] = 0; - s->length_hashed = (currlen + len) << 1; - return string; - } else { - // allocate a bigger buffer and copy -- uint32_t allocsz = (currlen + len) * 2; -+ uint32_t allocsz = (currlen + join_len + len) * 2; - if (allocsz < 32) allocsz = 32; - jvp_string* news = jvp_string_alloc(allocsz); -- news->length_hashed = (currlen + len) << 1; -+ news->length_hashed = (currlen + join_len + len) << 1; - memcpy(news->data, s->data, currlen); -+ memcpy(news->data + currlen, join_buf, join_len); -+ currlen += join_len; - memcpy(news->data + currlen, data, len); - news->data[currlen + len] = 0; - jvp_string_free(string); -diff --git a/src/jv_unicode.c b/src/jv_unicode.c -index 8c47536..7d67300 100644 ---- a/src/jv_unicode.c -+++ b/src/jv_unicode.c -@@ -1,8 +1,72 @@ - #include -+#include - #include - #include "jv_unicode.h" - #include "jv_utf8_tables.h" - -+// length of encoding of erroneous UTF-8 byte -+#define UTF8_ERR_LEN 2 -+// length of encoding of erroneous UTF-16 surrogate -+#define UTF16_ERR_LEN 3 -+ -+#define U32(a, b, c, d) ( \ -+ (uint32_t) (a) << 0 | \ -+ (uint32_t) (b) << 8 | \ -+ (uint32_t) (c) << 16 | \ -+ (uint32_t) (d) << 24 \ -+) -+ -+#define BYTE(u32, n) ((uint32_t) (((u32) >> (n)*8) & 0xFF)) -+ -+#define B0 0x00 // 00000000 -+#define B1 0x80 // 10000000 -+#define B2 0xC0 // 11000000 -+#define B3 0xE0 // 11100000 -+#define B4 0xF0 // 11110000 -+#define B5 0xF8 // 11111000 -+ -+// NOTE: these flags are likely to be optimised out as `decode` gets inlined -+enum decode_flags { -+ DECODE_1 = 1, -+ DECODE_2 = 2, -+ DECODE_3 = 8, -+ DECODE_4 = 16 -+}; -+ -+// decode up to 4 bytes of "generalised UTF-8"; no checking for overlong -+// codings or out-of-range code points, works by testing all fixed bits in each -+// of the 4 coding patterns, then shifting the value bits according to the -+// pattern -+static int decode(enum decode_flags flags, uint32_t data, int* codepoint_ret) { -+ if((flags & DECODE_1) && (data & U32(B1, B0, B0, B0)) == 0){ -+ *codepoint_ret = BYTE(data, 0); -+ return 1; -+ } -+ if((flags & DECODE_2) && (data & U32(B3, B2, B0, B0)) == U32(B2, B1, B0, B0)){ -+ *codepoint_ret = -+ (BYTE(data, 0) & ~B3) << 6 | -+ (BYTE(data, 1) & ~B2) << 0; -+ return 2; -+ } -+ if((flags & DECODE_3) && (data & U32(B4, B2, B2, B0)) == U32(B3, B1, B1, B0)){ -+ *codepoint_ret = -+ (BYTE(data, 0) & ~B4) << 12 | -+ (BYTE(data, 1) & ~B2) << 6 | -+ (BYTE(data, 2) & ~B2) << 0; -+ return 3; -+ } -+ if((flags & DECODE_4) && (data & U32(B5, B2, B2, B2)) == U32(B4, B1, B1, B1)){ -+ *codepoint_ret = -+ (BYTE(data, 0) & ~B5) << 18 | -+ (BYTE(data, 1) & ~B2) << 12 | -+ (BYTE(data, 2) & ~B2) << 6 | -+ (BYTE(data, 3) & ~B2) << 0; -+ return 4; -+ } -+ *codepoint_ret = -1; -+ return 1; -+} -+ - // jvp_utf8_backtrack returns the beginning of the last codepoint in the - // string, assuming that start is the last byte in the string. - // If the last codepoint is incomplete, returns the number of missing bytes via -@@ -81,56 +145,42 @@ const char* jvp_utf8_extended_next(const char* in, const char* end, enum jvp_utf - if (in == end) { - return 0; - } -- int codepoint = -1; -- unsigned char first = (unsigned char)in[0]; -- int length = utf8_coding_length[first]; -- if ((first & 0x80) == 0) { -+ uint32_t data = in[0] & 0xFF; -+ if ((data & B1) == 0) { - /* Fast-path for ASCII */ -- codepoint = first; -- length = 1; -- } else if (length == 0 || length == UTF8_CONTINUATION_BYTE) { -- /* Bad single byte - either an invalid byte or an out-of-place continuation byte */ -- if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: bad single byte"); -- length = 1; -- } else if (in + length > end) { -- /* String ends before UTF8 sequence ends */ -- if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: underrun"); -- length = end - in; -- } else { -- codepoint = ((unsigned)in[0]) & utf8_coding_bits[first]; -- for (int i=1; i 0x10FFFF) { -- /* Outside Unicode range */ -- if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: out of range"); -+ } else if (0xD800 <= codepoint && codepoint <= 0xDFFF) { -+ /* Surrogate codepoints are allowed in WTF-8/WTF-8b */ -+ if (!(flags & JVP_UTF8_ERRORS_UTF16)) { -+ /* Surrogate codepoints can't be encoded in UTF8 */ - codepoint = -1; - } -+ } else if (codepoint > 0x10FFFF) { -+ /* Outside Unicode range */ -+ if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: out of range"); -+ codepoint = -1; - } - if (codepoint == -1 && (flags & JVP_UTF8_REPLACE)) - codepoint = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER -@@ -139,6 +189,112 @@ const char* jvp_utf8_extended_next(const char* in, const char* end, enum jvp_utf - return in + length; - } - -+// assumes two bytes are readable from `in` -+static int decode_utf8_error(const char* in) { -+ uint32_t data = U32(in[0] & 0xFF, in[1] & 0xFF, 0, 0); -+ int codepoint; -+ if (decode(DECODE_2, data, &codepoint) == UTF8_ERR_LEN && codepoint < 0x80) -+ return codepoint + 0x80; -+ return -1; -+} -+ -+// assumes three bytes are readable from `in` -+static int decode_utf16_error(const char* in) { -+ uint32_t data = U32(in[0] & 0xFF, in[1] & 0xFF, in[2] & 0xFF, 0); -+ int codepoint; -+ if (decode(DECODE_3, data, &codepoint) == UTF16_ERR_LEN && codepoint >= 0xD800 && codepoint < 0xDFFF) -+ return codepoint; -+ return -1; -+} -+ -+// jvp_utf8_extended_join attempts to turn errors at the end of `a` and the -+// beginning of `b` into a valid code point. if a correction is possible, -+// `*alen_io`, `*bstart_io` and `*blen_io` are updated to exclude the existing -+// errors, and the UTF-8 encoding of the code point to insert is stored in -+// `out`. the number of bytes that should be inserted from `out` into the -+// middle of the strings is returned (up to 4). this will be 0 if there are no -+// bytes to insert. -+int jvp_utf8_extended_join(const char* astart, uint32_t* alen_io, const char** bstart_io, uint32_t* blen_io, char* out) { -+ const char* aend = astart + *alen_io; -+ const char* bstart = *bstart_io; -+ const char* bend = bstart + *blen_io; -+ int bcp; -+ bstart = jvp_utf8_extended_next(bstart, bend, JVP_UTF8_ERRORS_ALL, &bcp); -+ if (!bstart) { -+ // end of string -+ return 0; -+ } -+ if (bcp >= 0xDC00 && bcp <= 0xDFFF) { -+ // UTF-16 tail surrogate, look for lead surrogate at the end of `a` -+ assert(bstart == *bstart_io + UTF16_ERR_LEN); -+ if (aend - astart < UTF16_ERR_LEN) -+ return 0; -+ int acp = decode_utf16_error(aend - UTF16_ERR_LEN); -+ if (acp >= 0xD800 && acp <= 0xDBFF) { -+ // UTF-16 lead surrogate, decode matching UTF-16 pair -+ *alen_io -= UTF16_ERR_LEN; -+ *blen_io -= UTF16_ERR_LEN; -+ *bstart_io += UTF16_ERR_LEN; -+ int codepoint = 0x10000 + (((acp - 0xD800) << 10) | (bcp - 0xDC00)); -+ return jvp_utf8_encode(codepoint, out); -+ } -+ return 0; -+ } -+ if (bcp >= -0xFF && bcp <= -0x80) { -+ // UTF-8 error, if it's a continuation byte, search backwards in `a` for the leading byte -+ bcp = -bcp; -+ assert(bstart == *bstart_io + UTF8_ERR_LEN); -+ if (utf8_coding_length[bcp] != UTF8_CONTINUATION_BYTE) -+ return 0; -+ // if there's a correctable error, we will consume up to 4 encoded error bytes total, with up to 3 bytes from each of `a` and `b` -+ unsigned char buf[6]; -+ unsigned char* bufstart = buf + 3; -+ unsigned char* bufend = bufstart; -+ *bufend++ = bcp; -+ int length; -+ // search backwards in `a` for a leading byte -+ for (;;) { -+ if (aend - astart < UTF8_ERR_LEN) -+ return 0; // `a` is too short -+ int acp = decode_utf8_error(aend - UTF8_ERR_LEN); -+ if (acp == -1) -+ return 0; // not a UTF-8 error -+ aend -= UTF8_ERR_LEN; -+ length = utf8_coding_length[acp]; -+ if (length == 0) -+ return 0; // not a possible UTF-8 byte -+ *--bufstart = acp; -+ if (length != UTF8_CONTINUATION_BYTE) -+ break; // found leading byte -+ if (bufstart == buf) -+ return 0; // too many continuation bytes -+ } -+ if (bufend - bufstart > length) -+ return 0; // too many continuation bytes -+ // search forwards in `b` for any more needed continuation bytes -+ while (bufend - bufstart < length) { -+ if (bend - bstart < UTF8_ERR_LEN) -+ return 0; // `b` is too short -+ bcp = decode_utf8_error(bstart); -+ if (bcp == -1 || utf8_coding_length[bcp] != UTF8_CONTINUATION_BYTE) -+ return 0; // not a UTF-8 error, didn't find enough continuation bytes -+ bstart += UTF8_ERR_LEN; -+ *bufend++ = bcp; -+ } -+ int codepoint; -+ // check that the bytes are strict UTF-8 -+ jvp_utf8_extended_next((char*)bufstart, (char*)bufend, 0, &codepoint); -+ if (codepoint != -1) { -+ memcpy(out, bufstart, 4); -+ *alen_io = aend - astart; -+ *blen_io = bend - bstart; -+ *bstart_io = bstart; -+ return bufend - bufstart; -+ } -+ } -+ return 0; -+} -+ - int jvp_utf8_is_valid(const char* in, const char* end) { - int codepoint; - while ((in = jvp_utf8_extended_next(in, end, 0, &codepoint))) { -diff --git a/src/jv_unicode.h b/src/jv_unicode.h -index 37c7fc0..ff2a437 100644 ---- a/src/jv_unicode.h -+++ b/src/jv_unicode.h -@@ -1,6 +1,8 @@ - #ifndef JV_UNICODE_H - #define JV_UNICODE_H - -+#include -+ - enum jvp_utf8_flags { - /* Emit replacement character instead of -1 for errors */ - JVP_UTF8_REPLACE = 1, -@@ -14,6 +16,7 @@ enum jvp_utf8_flags { - const char* jvp_utf8_backtrack(const char* start, const char* min, int *missing_bytes); - const char* jvp_utf8_extended_next(const char* in, const char* end, enum jvp_utf8_flags flags, int* codepoint); - const char* jvp_utf8_next(const char* in, const char* end, int* codepoint); -+int jvp_utf8_extended_join(const char* astart, uint32_t* alen, const char** bstart, uint32_t* blen, char* out); - int jvp_utf8_is_valid(const char* in, const char* end); - - int jvp_utf8_decode_length(char startchar); -diff --git a/tests/jq.test b/tests/jq.test -index c882fd2..9e6c896 100644 ---- a/tests/jq.test -+++ b/tests/jq.test -@@ -62,6 +62,11 @@ null - null - "∀\ud800∃\udc00∅\udfff" - -+# Check that unpaired surrogates are paired when concatenated -+add -+["\ud83d","\ude43","\ud83e","\udd11","\ud83e","\udd17","\ud83e","\udd14","\ud83e","\udd10","\ud83d","\ude44","\ud83e","\udd12","\ud83e","\udd15","\ud83e","\udd13","\ud83e","\udd16","\ud83e","\udd18","\ud83c","\udffb","\ud83c","\udffc"] -+"🙃🤑🤗🤔🤐🙄🤒🤕🤓🤖🤘🏻🏼" -+ - "inter\("pol" + "ation")" - null - "interpolation" -@@ -87,6 +92,16 @@ null - "Zm/Ds2Jhcgo=" - "foóbar\n" - -+# test correction of UTF-8 errors when concatenating as binary data (input is a random sequence of code points) -+. as $text | @base64 | . as $b64 | [range(0, 300)] | map($b64[(.*4):((. + 1)*4)] | @base64d) | add | . == $text -+"򍨼衍򙮬񪜁򻴠󖂡󔁰񗏷󛊭񢠃򍧝𭌞󹰞󙴋𿋓󧜹򳔎񦰓򅆹򽐟󂑛򶃯㾱ꕽ񂊛򉙲򅤎􃖣󻣸󁸦򴏜򽃿􄑏󠦱񄛲񄕵񡿚򮩒񡏂򨆯򶚒󎮆󉨗򡮟򆿴񬏪򻀅㫑񉒗󴍶󬪸񝶑񂾑򇔣򉩉􂞇𲡀𨫆򤵇𲺝\u001c񖂟񳐉󲔹𳨬􀮔𸒙񜶻㊬񓐊񽒬󑀧󗧚󞌶󦥥𗌽𘀍󴼹􌇺򫗛񂷶󏷕񜁍񥬟󼁁󓺉𗟒򷝊𩕃񞝏񧄀󁲩򐀄򳂸񲊷򃀋񃫫𝷏򏖝򷂍󢭣􋛨𞪒򁁅勸󯩥󵪭񚮚򻡍騎񾊯򪓚񗡈񎕫򡯬񋫠ᕴ𞨹󾄇񩠶𙯾񢥱𚯴񬥷󢶖񾹌񡈟򧓑񒾘𚸯񳗺񭟡𫸬񷤖񷆐𖋌񦰃椀𫎾󗚋𿋆󈝰񺥲򝕊𵯮򙧚󬱃󍗞󱆃󂟙󟆺񻢬󸮤󗗉񉛮𺵡𰣒􁋙񻍛􇡘ᮍ񕥸񨵂盕嗪𻸮򶆍򊈤񽓎󙴐𗬜󾱒󷹰􇡈񨦎􏥩񴲡𨑮򱏝𭢊󕁶򣙥󶡮󮰌󿙾氕񼻘􆔪񢕀񊿃󮨝񑛖󣴊󎎏򳞓㊁󒭀󇜳𯄌𻙩" -+true -+ -+# test preservation of binary data when concatenating (input is a random sequence of UTF-16 surrogates encoded in WTF-8, should be treated as regular UTF-8 errors) -+@base64d | . as $text | @base64 | . as $b64 | [range(0, 300)] | map($b64[(.*4):((. + 1)*4)] | @base64d) | add | . == $text -+"7bKv7aiz7auX7aG37aO77aOe7auy7bmm7bqk7aG87bSH7a6m7bmc7bum7bqj7au+7bqf7aap7buC7byq7aS37aCp7aSl7a+a7bur7aGV7bGl7b6M7biB7aOe7ayR7amW7aOX7b637a+P7bu+7ayP7bOw7ba/7ayp7b6G7aqd7bG37bK57b6O7bq27a+u7a2N7ayu7bKK" -+true -+ - @uri - "\u03bc" - "%CE%BC" diff --git a/Update-base64-utf8bytelength-and-fromjson-to-handlebinary-strings.patch b/Update-base64-utf8bytelength-and-fromjson-to-handlebinary-strings.patch deleted file mode 100644 index f3cd1dcf863aa1f9f21f8c98201015d360abca9f..0000000000000000000000000000000000000000 --- a/Update-base64-utf8bytelength-and-fromjson-to-handlebinary-strings.patch +++ /dev/null @@ -1,210 +0,0 @@ -From a6ccbaad05bea30c5700b10bd51e46d390496a9b Mon Sep 17 00:00:00 2001 -From: Max Zerzouri -Date: Sun, 16 May 2021 09:18:51 +0000 -Subject: [PATCH] Update `@base64`, `utf8bytelength` and `fromjson` to handle - binary strings - ---- - docs/content/3.manual/manual.yml | 1 - - src/builtin.c | 107 ++++++++++++++++++++++++++----- - tests/base64.test | 10 +++ - tests/shtest | 19 ++++-- - 4 files changed, 116 insertions(+), 21 deletions(-) - -diff --git a/docs/content/3.manual/manual.yml b/docs/content/3.manual/manual.yml -index bfb17f4..1258dbf 100644 ---- a/docs/content/3.manual/manual.yml -+++ b/docs/content/3.manual/manual.yml -@@ -1843,7 +1843,6 @@ sections: - * `@base64d`: - - The inverse of `@base64`, input is decoded as specified by RFC 4648. -- Note\: If the decoded string is not UTF-8, the results are undefined. - - This syntax can be combined with string interpolation in a - useful way. You can follow a `@foo` token with a string -diff --git a/src/builtin.c b/src/builtin.c -index c6c8c2e..975bf49 100644 ---- a/src/builtin.c -+++ b/src/builtin.c -@@ -409,10 +409,55 @@ static jv f_dump(jq_state *jq, jv input) { - static jv f_json_parse(jq_state *jq, jv input) { - if (jv_get_kind(input) != JV_KIND_STRING) - return type_error(input, "only strings can be parsed"); -- jv res = jv_parse_sized(jv_string_value(input), -- jv_string_length_bytes(jv_copy(input))); -+ -+ const char* i = jv_string_value(input); -+ const char* end = i + jv_string_length_bytes(jv_copy(input)); -+ -+ struct jv_parser* parser = jv_parser_new(0); -+ int count = 0; -+ jv value = jv_invalid(); -+ while (i != NULL) { -+ const int max_utf8_len = 4; -+ unsigned char buf[100 + max_utf8_len]; -+ int buflen = 0; -+ int c; -+ while ((buflen + max_utf8_len < sizeof(buf)) && (i = jvp_utf8_extended_next(i, end, JVP_UTF8_REPLACE | JVP_UTF8_ERRORS_UTF8, &c))) { -+ if (c >= -0xFF && c <= -0x80) { -+ // Invalid UTF-8 byte, pass through -+ buf[buflen++] = -c; -+ } else -+ buflen += jvp_utf8_encode(c, buf + buflen); -+ } -+ jv_parser_set_buf(parser, buf, buflen, i != NULL); -+ for (;;) { -+ jv next = jv_parser_next(parser); -+ if (!jv_is_valid(next)) { -+ if (jv_invalid_has_msg(jv_copy(next))) { -+ count++; -+ jv_free(value); -+ value = next; -+ i = NULL; -+ } -+ break; -+ } -+ jv_free(value); -+ if (count++ == 0) -+ value = next; -+ else { -+ jv_free(next); -+ value = jv_invalid_with_msg(jv_string("Unexpected extra JSON values")); -+ i = NULL; -+ break; -+ } -+ } -+ } -+ jv_parser_free(parser); - jv_free(input); -- return res; -+ if (count == 0) { -+ jv_free(value); -+ value = jv_invalid_with_msg(jv_string("Expected JSON value")); -+ } -+ return value; - } - - static jv f_tonumber(jq_state *jq, jv input) { -@@ -457,7 +502,19 @@ static jv f_tostring(jq_state *jq, jv input) { - static jv f_utf8bytelength(jq_state *jq, jv input) { - if (jv_get_kind(input) != JV_KIND_STRING) - return type_error(input, "only strings have UTF-8 byte length"); -- return jv_number(jv_string_length_bytes(input)); -+ const char* i = jv_string_value(input); -+ const char* end = i + jv_string_length_bytes(jv_copy(input)); -+ int len = 0; -+ int c; -+ while ((i = jvp_utf8_extended_next(i, end, JVP_UTF8_REPLACE | JVP_UTF8_ERRORS_UTF8, &c))) { -+ if (c >= -0xFF && c <= -0x80) { -+ // Invalid UTF-8 byte, will be passed through -+ len++; -+ } else -+ len += jvp_utf8_encode_length(c); -+ } -+ jv_free(input); -+ return jv_number(len); - } - - #define CHARS_ALPHANUM "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789" -@@ -632,21 +689,41 @@ static jv f_format(jq_state *jq, jv input, jv fmt) { - jv_free(fmt); - input = f_tostring(jq, input); - jv line = jv_string(""); -- const unsigned char* data = (const unsigned char*)jv_string_value(input); -- int len = jv_string_length_bytes(jv_copy(input)); -- for (int i=0; i= 3 ? 3 : len-i; -- for (int j=0; j<3; j++) { -+ const char* i = jv_string_value(input); -+ const char* end = i + jv_string_length_bytes(jv_copy(input)); -+ uint32_t code = 0; -+ int n = 0; -+ int c; -+ while ((i = jvp_utf8_extended_next(i, end, JVP_UTF8_REPLACE | JVP_UTF8_ERRORS_UTF8, &c))) { -+ unsigned char ubuf[4]; -+ int len = 0; -+ if (c >= -0xFF && c <= -0x80) { -+ // Invalid UTF-8 byte, pass through -+ ubuf[len++] = -c; -+ } else -+ len += jvp_utf8_encode(c, ubuf); -+ for (int x = 0; x < len; x++) { - code <<= 8; -- code |= j < n ? (unsigned)data[i+j] : 0; -+ code |= ubuf[x]; -+ if (++n == 3) { -+ char buf[4]; -+ for (int j = 0; j < 4; j++) -+ buf[j] = BASE64_ENCODE_TABLE[(code >> (18 - j*6)) & 0x3f]; -+ line = jv_string_append_buf(line, buf, sizeof(buf)); -+ n = 0; -+ code = 0; -+ } - } -+ } -+ if (n > 0) { -+ assert(n < 3); -+ code <<= 8*(3 - n); - char buf[4]; -- for (int j=0; j<4; j++) { -+ for (int j = 0; j < 4; j++) - buf[j] = BASE64_ENCODE_TABLE[(code >> (18 - j*6)) & 0x3f]; -- } -- if (n < 3) buf[3] = '='; -- if (n < 2) buf[2] = '='; -+ buf[3] = '='; -+ if (n < 2) -+ buf[2] = '='; - line = jv_string_append_buf(line, buf, sizeof(buf)); - } - jv_free(input); -diff --git a/tests/base64.test b/tests/base64.test -index 0f82b0b..6507bb8 100644 ---- a/tests/base64.test -+++ b/tests/base64.test -@@ -33,3 +33,13 @@ - . | try @base64d catch . - "QUJDa" - "string (\"QUJDa\") trailing base64 byte found" -+ -+# random binary data -+(. | @base64d | @base64) == . -+"zns0Su1i4JjDfGiR95WOcU8iiPMOrfJTUBm9P1ot2qIMiyk04b0WSIFNTMD7w9ziMV8nSbwpPqNl3JKF1eWZrRRg24rbvh66O1e7Z1xIGPNqTqm+jdzRCkWSryR+67wXRVgD6Q==" -+true -+ -+# replace lone surrogates -+@base64 -+"foo\udca9\ud83dbar" -+"Zm9v77+977+9YmFy" -diff --git a/tests/shtest b/tests/shtest -index 4c8b57e..7de61e4 100755 ---- a/tests/shtest -+++ b/tests/shtest -@@ -131,11 +131,20 @@ cmp $d/out $d/expected - - - clean=false --# Invalid UTF-8 bytes are preserved when encoding/decoding JSON --dd if=/dev/urandom bs=1024 count=1024 >$d/rand 2>/dev/null --$VALGRIND $Q $JQ -sR . $d/rand >$d/out.json --$VALGRIND $Q $JQ -j . $d/out.json >$d/out --cmp $d/out $d/rand -+# Invalid UTF-8 bytes are preserved when encoding/decoding JSON and base64 and concatenating binary strings -+if dd if=/dev/urandom bs=1024 count=1024 >$d/rand 2>/dev/null; then -+ $VALGRIND $Q $JQ -sR . $d/rand >$d/out.json -+ $VALGRIND $Q $JQ -j . $d/out.json >$d/out -+ cmp $d/out $d/rand -+ $VALGRIND $Q $JQ -jR fromjson $d/out.json >$d/out -+ cmp $d/out $d/rand -+ $VALGRIND $Q $JQ -j '@base64 | @base64d' $d/out.json >$d/out -+ cmp $d/out $d/rand -+ base64 $d/rand | $VALGRIND $Q $JQ -R '@base64d' | $VALGRIND $Q $JQ -sj 'add' >$d/out -+ cmp $d/out $d/rand -+ $VALGRIND $Q $JQ -nj '$a' --rawfile a $d/rand >$d/out -+ cmp $d/out $d/rand -+fi - clean=true - - diff --git a/jq.spec b/jq.spec index 37538216d0388ddcbc976ca29854f4e7cb363d8b..02a26b7229611717fe64c8fe60804d76169d310e 100644 --- a/jq.spec +++ b/jq.spec @@ -1,10 +1,12 @@ Name: jq Version: 1.7.1 -Release: 2 +Release: 3 Summary: A lightweight and flexible command-line JSON processor License: MIT and ASL 2.0 and CC-BY-3.0 and GPLv3 URL: http://stedolan.github.io/jq/ Source0: https://github.com/jqlang/jq/releases/download/jq-%{version}/jq-%{version}.tar.gz +Patch0: CVE-2024-53427-pre.patch +Patch1: CVE-2024-53427.patch BuildRequires: make flex bison gcc chrpath oniguruma-devel %ifarch %{valgrind_arches} BuildRequires: valgrind @@ -74,6 +76,9 @@ make check %changelog +* Fri Mar 07 2025 yaoxin <1024769339@qq.com> - 1.7.1-3 +- Fix CVE-2024-53427 + * Mon Sep 09 2024 laokz - 1.7.1-2 - Let valgrind depend on system arch macro