diff --git a/Binary-strings-preserve-UTF-8-and-UTF-16-errors.patch b/Binary-strings-preserve-UTF-8-and-UTF-16-errors.patch new file mode 100644 index 0000000000000000000000000000000000000000..6863778adaa63ee495a2830f6446b82b2b3b9b31 --- /dev/null +++ b/Binary-strings-preserve-UTF-8-and-UTF-16-errors.patch @@ -0,0 +1,616 @@ +From b2384ea878f484c48419fc0ec30380d0a5ffe3ce Mon Sep 17 00:00:00 2001 +From: Max Zerzouri +Date: Sat, 15 May 2021 08:32:27 +0000 +Subject: [PATCH] Binary strings: preserve UTF-8 and UTF-16 errors + +The internal string representation is changed from UTF-8 with replacement +characters to a modified form of "WTF-8" that is able to distinctly encode +UTF-8 errors and UTF-16 errors. + +This handles UTF-8 errors in raw string inputs and handles UTF-8 and UTF-16 +errors in JSON input. UTF-16 errors (using "\uXXXX") and UTF-8 errors (using +the original raw bytes) are maintained when emitting JSON. When emitting raw +strings, UTF-8 errors are maintained and UTF-16 errors are converted into +replacement characters. 
+--- + scripts/gen_utf8_tables.py | 3 +- + src/jv.c | 28 ++++++------ + src/jv.h | 1 + + src/jv_parse.c | 77 ++++++++++++++++++++++----------- + src/jv_print.c | 26 +++++++++++- + src/jv_unicode.c | 87 ++++++++++++++++++++++++++++++++++---- + src/jv_unicode.h | 11 +++++ + src/jv_utf8_tables.h | 4 +- + src/main.c | 29 ++++++++++++- + tests/jq.test | 5 +++ + tests/shtest | 9 ++++ + 11 files changed, 228 insertions(+), 52 deletions(-) + +diff --git a/scripts/gen_utf8_tables.py b/scripts/gen_utf8_tables.py +index 6fe0a53..7706462 100644 +--- a/scripts/gen_utf8_tables.py ++++ b/scripts/gen_utf8_tables.py +@@ -16,8 +16,7 @@ def print_table(type, name, t): + def utf8info(c): + if c < 0x80: return 1, mask(7) + if 0x80 <= c <= 0xBF: return 255, mask(6) +- if 0xC0 <= c <= 0xC1: return 0, 0 +- if 0xC2 <= c <= 0xDF: return 2, mask(5) ++ if 0xC0 <= c <= 0xDF: return 2, mask(5) + if 0xE0 <= c <= 0xEF: return 3, mask(4) + if 0xF0 <= c <= 0xF4: return 4, mask(3) + if 0xF4 <= c <= 0xFF: return 0, 0 +diff --git a/src/jv.c b/src/jv.c +index 1f1029e..e979cc6 100644 +--- a/src/jv.c ++++ b/src/jv.c +@@ -452,20 +452,24 @@ static jvp_string* jvp_string_alloc(uint32_t size) { + return s; + } + +-/* Copy a UTF8 string, replacing all badly encoded points with U+FFFD */ ++/* Copy a UTF8 string, using WTF-8b to replace all UTF-8 errors */ + static jv jvp_string_copy_replace_bad(const char* data, uint32_t length) { + const char* end = data + length; + const char* i = data; + const char* cstart; + +- uint32_t maxlength = length * 3 + 1; // worst case: all bad bytes, each becomes a 3-byte U+FFFD ++ uint32_t maxlength = length * 2 + 1; // worst case: all bad bytes, each becomes a 2-byte overlong U+XX + jvp_string* s = jvp_string_alloc(maxlength); + char* out = s->data; + int c = 0; + +- while ((i = jvp_utf8_next((cstart = i), end, &c))) { ++ while ((i = jvp_utf8_extended_next((cstart = i), end, 0, &c))) { + if (c == -1) { +- c = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER ++ int error = (unsigned 
char)*cstart; ++ assert(error >= 0x80 && error <= 0xFF); ++ c = -error; ++ /* Ensure each UTF-8 error byte is consumed separately */ ++ i = cstart + 1; + } + out += jvp_utf8_encode(c, out); + assert(out < s->data + maxlength); +@@ -477,8 +481,8 @@ static jv jvp_string_copy_replace_bad(const char* data, uint32_t length) { + return r; + } + +-/* Assumes valid UTF8 */ +-static jv jvp_string_new(const char* data, uint32_t length) { ++/* Assumes valid WTF-8b */ ++jv jv_string_extended_sized(const char* data, int length) { + jvp_string* s = jvp_string_alloc(length); + s->length_hashed = length << 1; + if (data != NULL) +@@ -618,7 +622,7 @@ static int jvp_string_equal(jv a, jv b) { + jv jv_string_sized(const char* str, int len) { + return + jvp_utf8_is_valid(str, str+len) ? +- jvp_string_new(str, len) : ++ jv_string_extended_sized(str, len) : + jvp_string_copy_replace_bad(str, len); + } + +@@ -682,14 +686,14 @@ jv jv_string_split(jv j, jv sep) { + + if (seplen == 0) { + int c; +- while ((jstr = jvp_utf8_next(jstr, jend, &c))) ++ while ((jstr = jvp_utf8_extended_next(jstr, jend, JVP_UTF8_ERRORS_ALL, &c))) + a = jv_array_append(a, jv_string_append_codepoint(jv_string(""), c)); + } else { + for (p = jstr; p < jend; p = s + seplen) { + s = _jq_memmem(p, jend - p, sepstr, seplen); + if (s == NULL) + s = jend; +- a = jv_array_append(a, jv_string_sized(p, s - p)); ++ a = jv_array_append(a, jv_string_extended_sized(p, s - p)); + // Add an empty string to denote that j ends on a sep + if (s + seplen == jend && seplen != 0) + a = jv_array_append(a, jv_string("")); +@@ -760,7 +764,7 @@ jv jv_string_slice(jv j, int start, int end) { + + /* Look for byte offset corresponding to start codepoints */ + for (p = s, i = 0; i < start; i++) { +- p = jvp_utf8_next(p, s + len, &c); ++ p = jvp_utf8_extended_next(p, s + len, JVP_UTF8_ERRORS_ALL, &c); + if (p == NULL) { + jv_free(j); + return jv_string_empty(16); +@@ -772,7 +776,7 @@ jv jv_string_slice(jv j, int start, int end) { + } + /* Look 
for byte offset corresponding to end codepoints */ + for (e = p; e != NULL && i < end; i++) { +- e = jvp_utf8_next(e, s + len, &c); ++ e = jvp_utf8_extended_next(e, s + len, JVP_UTF8_ERRORS_ALL, &c); + if (e == NULL) { + e = s + len; + break; +@@ -790,7 +794,7 @@ jv jv_string_slice(jv j, int start, int end) { + * memory like a drunken navy programmer. There's probably nothing we + * can do about it. + */ +- res = jv_string_sized(p, e - p); ++ res = jv_string_extended_sized(p, e - p); + jv_free(j); + return res; + } +diff --git a/src/jv.h b/src/jv.h +index d111c80..2aed1ae 100644 +--- a/src/jv.h ++++ b/src/jv.h +@@ -104,6 +104,7 @@ jv jv_array_indexes(jv, jv); + + jv jv_string(const char*); + jv jv_string_sized(const char*, int); ++jv jv_string_extended_sized(const char*, int); + jv jv_string_empty(int len); + int jv_string_length_bytes(jv); + int jv_string_length_codepoints(jv); +diff --git a/src/jv_parse.c b/src/jv_parse.c +index 51ad9f0..194efaf 100644 +--- a/src/jv_parse.c ++++ b/src/jv_parse.c +@@ -397,7 +397,7 @@ static void tokenadd(struct jv_parser* p, char c) { + p->tokenbuf[p->tokenpos++] = c; + } + +-static int unhex4(char* hex) { ++static int unhex4(const char* hex) { + int r = 0; + for (int i=0; i<4; i++) { + char c = *hex++; +@@ -413,15 +413,19 @@ static int unhex4(char* hex) { + } + + static pfunc found_string(struct jv_parser* p) { +- char* in = p->tokenbuf; +- char* out = p->tokenbuf; +- char* end = p->tokenbuf + p->tokenpos; +- +- while (in < end) { +- char c = *in++; ++ const char* in = p->tokenbuf; ++ // start by writing to tokenbuf, only allocate in case that output size is greater than input size (possible only when input has UTF-8 errors) ++ char* newbuf = NULL; ++ char* buf = p->tokenbuf; ++ char* out = buf; ++ const char* end = p->tokenbuf + p->tokenpos; ++ const char* cstart; ++ int c; ++ ++ while ((in = jvp_utf8_extended_next((cstart = in), end, 0, &c))) { + if (c == '\\') { + if (in >= end) +- return "Expected escape character at end of 
string"; ++ return jv_mem_free(newbuf), "Expected escape character at end of string"; + c = *in++; + switch (c) { + case '\\': +@@ -436,38 +440,61 @@ static pfunc found_string(struct jv_parser* p) { + case 'u': + /* ahh, the complicated case */ + if (in + 4 > end) +- return "Invalid \\uXXXX escape"; ++ return jv_mem_free(newbuf), "Invalid \\uXXXX escape"; + int hexvalue = unhex4(in); + if (hexvalue < 0) +- return "Invalid characters in \\uXXXX escape"; ++ return jv_mem_free(newbuf), "Invalid characters in \\uXXXX escape"; + unsigned long codepoint = (unsigned long)hexvalue; + in += 4; ++ // leading surrogate + if (0xD800 <= codepoint && codepoint <= 0xDBFF) { +- /* who thought UTF-16 surrogate pairs were a good idea? */ +- if (in + 6 > end || in[0] != '\\' || in[1] != 'u') +- return "Invalid \\uXXXX\\uXXXX surrogate pair escape"; +- unsigned long surrogate = unhex4(in+2); +- if (!(0xDC00 <= surrogate && surrogate <= 0xDFFF)) +- return "Invalid \\uXXXX\\uXXXX surrogate pair escape"; +- in += 6; +- codepoint = 0x10000 + (((codepoint - 0xD800) << 10) +- |(surrogate - 0xDC00)); ++ // look ahead for trailing surrogate and decode as UTF-16, otherwise encode this lone surrogate as WTF-8 ++ if (in + 6 <= end && in[0] == '\\' && in[1] == 'u') { ++ unsigned long surrogate = unhex4(in+2); ++ if (0xDC00 <= surrogate && surrogate <= 0xDFFF) { ++ in += 6; ++ codepoint = 0x10000 + (((codepoint - 0xD800) << 10) ++ |(surrogate - 0xDC00)); ++ } ++ } + } +- if (codepoint > 0x10FFFF) +- codepoint = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER ++ // UTF-16 surrogates can not encode a greater codepoint ++ assert(codepoint <= 0x10FFFF); ++ // NOTE: a leading or trailing surrogate here (0xD800 <= codepoint && codepoint <= 0xDFFF) is encoded as WTF-8 + out += jvp_utf8_encode(codepoint, out); + break; + + default: +- return "Invalid escape"; ++ return jv_mem_free(newbuf), "Invalid escape"; + } + } else { + if (c > 0 && c < 0x001f) +- return "Invalid string: control characters from U+0000 through 
U+001F must be escaped"; +- *out++ = c; ++ return jv_mem_free(newbuf), "Invalid string: control characters from U+0000 through U+001F must be escaped"; ++ if (c == -1) { ++ int error = (unsigned char)*cstart; ++ assert(error >= 0x80 && error <= 0xFF); ++ c = -error; ++ /* Ensure each UTF-8 error byte is consumed separately */ ++ const int wtf8_length = 2; ++ assert(jvp_utf8_encode_length(c) == wtf8_length); ++ in = cstart + 1; ++ if (newbuf == NULL && out + wtf8_length > in) { ++ /* Output is about to overflow input, move output to temporary buffer */ ++ int current_size = out - p->tokenbuf; ++ int remaining = end - cstart; ++ newbuf = jv_mem_alloc(current_size + remaining * wtf8_length); // worst case: all remaining bad bytes, each becomes a 2-byte overlong U+XX ++ memcpy(newbuf, buf, current_size); ++ buf = newbuf; ++ out = buf + current_size; ++ } ++ } else ++ assert(jvp_utf8_encode_length(c) == in - cstart); ++ out += jvp_utf8_encode(c, out); + } + } +- TRY(value(p, jv_string_sized(p->tokenbuf, out - p->tokenbuf))); ++ jv v = jv_string_extended_sized(buf, out - buf); ++ jv_mem_free(newbuf); ++ TRY(value(p, v)); + p->tokenpos = 0; + return 0; + } +diff --git a/src/jv_print.c b/src/jv_print.c +index 5ebc01e..dfa1f05 100644 +--- a/src/jv_print.c ++++ b/src/jv_print.c +@@ -98,6 +98,16 @@ static void put_char(char c, FILE* fout, jv* strout, int T) { + put_buf(&c, 1, fout, strout, T); + } + ++static void put_invalid_utf8_byte(int c, FILE* fout, jv* strout, int T) { ++ assert(c >= 0x80 && c <= 0xFF); ++ if (strout) { ++ // encode as an invalid UTF-8 byte in output ++ *strout = jv_string_append_codepoint(*strout, -c); ++ } else { ++ put_char(c, fout, strout, T); ++ } ++} ++ + static void put_str(const char* s, FILE* fout, jv* strout, int T) { + put_buf(s, strlen(s), fout, strout, T); + } +@@ -121,7 +131,7 @@ static void jvp_dump_string(jv str, int ascii_only, FILE* F, jv* S, int T) { + int c = 0; + char buf[32]; + put_char('"', F, S, T); +- while ((i = 
jvp_utf8_next((cstart = i), end, &c))) { ++ while ((i = jvp_utf8_extended_next((cstart = i), end, JVP_UTF8_ERRORS_ALL, &c))) { + assert(c != -1); + int unicode_escape = 0; + if (0x20 <= c && c <= 0x7E) { +@@ -130,6 +140,17 @@ static void jvp_dump_string(jv str, int ascii_only, FILE* F, jv* S, int T) { + put_char('\\', F, S, T); + } + put_char(c, F, S, T); ++ } else if (c >= -0xFF && c <= -0x80) { ++ // Invalid UTF-8 byte ++ if (ascii_only) { ++ // refusing to emit invalid UTF-8 ++ // TODO: convince the world to adopt a "\xXX" notation for JSON? ++ c = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER ++ unicode_escape = 1; ++ } else { ++ // pass through ++ put_invalid_utf8_byte(-c, F, S, T); ++ } + } else if (c < 0x20 || c == 0x7F) { + // ASCII control character + switch (c) { +@@ -160,6 +181,9 @@ static void jvp_dump_string(jv str, int ascii_only, FILE* F, jv* S, int T) { + } else { + if (ascii_only) { + unicode_escape = 1; ++ } else if (c >= 0xD800 && c <= 0xDFFF) { ++ // lone surrogate; can't be encoded to UTF-8 ++ unicode_escape = 1; + } else { + put_buf(cstart, i - cstart, F, S, T); + } +diff --git a/src/jv_unicode.c b/src/jv_unicode.c +index d197349..8c47536 100644 +--- a/src/jv_unicode.c ++++ b/src/jv_unicode.c +@@ -27,6 +27,56 @@ const char* jvp_utf8_backtrack(const char* start, const char* min, int *missing_ + } + + const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) { ++ return jvp_utf8_extended_next(in, end, JVP_UTF8_REPLACE, codepoint_ret); ++} ++ ++/* ++ The internal representation of jv strings uses an encoding that is hereby ++ referred to as "WTF-8b" (until someone demonstrates use of another term to ++ refer to the same encoding). ++ ++ WTF-8b is an extension of WTF-8, which is an extension of UTF-8. 
Any sequence + of Unicode scalar values is represented by the same bytes in UTF-8, WTF-8 and + WTF-8b, therefore any well-formed UTF-8 string is interpreted as the same + sequence of Unicode scalar values (roughly, code points) in WTF-8b. + + Like WTF-8, WTF-8b is able to encode UTF-16 errors (lone surrogates) using + the "generalized UTF-8" representation of code points between U+D800 and + U+DFFF. These errors occur in JSON terms such as: + "_\uD8AB_\uDBCD_" + + Unlike WTF-8, WTF-8b is also able to encode UTF-8 errors (bytes 0x80 to 0xFF + that are not part of a valid UTF-8 sequence) using the first 128 "overlong" + codings (unused 2-byte representations of U+00 to U+7F). These errors can + occur in any byte stream that is interpreted as UTF-8, for example: + "\xED\xA2\xAB" + The above example is in fact the WTF-8b (and WTF-8) encoding for the lone + UTF-16 surrogate "\uD8AB", which demonstrates the need for a distinct + encoding of UTF-8 errors. If a distinction were not made, then "\xED\xA2\xAB" + and "\uD8AB" would be interpreted as the same string, so at least one of the + forms would not be preserved when printed as JSON output. + + It should also be noted that the process of converting from invalid UTF-8 to + WTF-8b is not (and cannot be) idempotent, since the "generalized UTF-8" + representations of UTF-16 surrogates are intentionally not able to be + generated from invalid UTF-8, only through some other means (usually "\uXXXX" + notation). + + Each UTF-16 error is encoded as 3 WTF-8b (or WTF-8) bytes. + Each UTF-8 error is encoded as 2 WTF-8b bytes. + + When iterating over code points using `JVP_UTF8_ERRORS_UTF16`, encoded UTF-16 + errors are emitted in the form of code points in the range U+D800 to U+DFFF. + These code points can be reencoded as usual using `jvp_utf8_encode`.
++ ++ When iterating over code points using `JVP_UTF8_ERRORS_UTF8`, encoded UTF-8 ++ errors are emitted in the form of code points in the negative range -0x80 to ++ -0xFF. These negative code points can be negated to determine the original ++ error bytes. These code points can be reencoded as usual using ++ `jvp_utf8_encode`. ++*/ ++ ++const char* jvp_utf8_extended_next(const char* in, const char* end, enum jvp_utf8_flags flags, int* codepoint_ret) { + assert(in <= end); + if (in == end) { + return 0; +@@ -40,9 +90,11 @@ const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) { + length = 1; + } else if (length == 0 || length == UTF8_CONTINUATION_BYTE) { + /* Bad single byte - either an invalid byte or an out-of-place continuation byte */ ++ if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: bad single byte"); + length = 1; + } else if (in + length > end) { + /* String ends before UTF8 sequence ends */ ++ if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: underrun"); + length = end - in; + } else { + codepoint = ((unsigned)in[0]) & utf8_coding_bits[first]; +@@ -50,6 +102,7 @@ const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) { + unsigned ch = (unsigned char)in[i]; + if (utf8_coding_length[ch] != UTF8_CONTINUATION_BYTE){ + /* Invalid UTF8 sequence - not followed by the right number of continuation bytes */ ++ if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: wrong bytes"); + codepoint = -1; + length = i; + break; +@@ -58,17 +111,29 @@ const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) { + } + if (codepoint < utf8_first_codepoint[length]) { + /* Overlong UTF8 sequence */ +- codepoint = -1; ++ if ((flags & JVP_UTF8_ERRORS_UTF8) && 0x00 <= codepoint && codepoint <= 0x7F) { ++ /* UTF-8 error is emitted as a negative codepoint */ ++ codepoint = -(codepoint + 0x80); ++ } else { ++ if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid 
WTF-8b sequence: overlong"); ++ codepoint = -1; ++ } + } + if (0xD800 <= codepoint && codepoint <= 0xDFFF) { +- /* Surrogate codepoints can't be encoded in UTF8 */ +- codepoint = -1; ++ /* Surrogate codepoints are allowed in WTF-8/WTF-8b */ ++ if (!(flags & JVP_UTF8_ERRORS_UTF16)) { ++ /* Surrogate codepoints can't be encoded in UTF8 */ ++ codepoint = -1; ++ } + } + if (codepoint > 0x10FFFF) { + /* Outside Unicode range */ ++ if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: out of range"); + codepoint = -1; + } + } ++ if (codepoint == -1 && (flags & JVP_UTF8_REPLACE)) ++ codepoint = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER + assert(length > 0); + *codepoint_ret = codepoint; + return in + length; +@@ -76,7 +141,7 @@ const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) { + + int jvp_utf8_is_valid(const char* in, const char* end) { + int codepoint; +- while ((in = jvp_utf8_next(in, end, &codepoint))) { ++ while ((in = jvp_utf8_extended_next(in, end, 0, &codepoint))) { + if (codepoint == -1) return 0; + } + return 1; +@@ -91,20 +156,24 @@ int jvp_utf8_decode_length(char startchar) { + } + + int jvp_utf8_encode_length(int codepoint) { +- if (codepoint <= 0x7F) return 1; ++ if (codepoint >= 0 && codepoint <= 0x7F) return 1; + else if (codepoint <= 0x7FF) return 2; + else if (codepoint <= 0xFFFF) return 3; + else return 4; + } + + int jvp_utf8_encode(int codepoint, char* out) { +- assert(codepoint >= 0 && codepoint <= 0x10FFFF); ++ assert((codepoint >= 0 && codepoint <= 0x10FFFF) || (codepoint >= -0xFF && codepoint <= -0x80)); + char* start = out; +- if (codepoint <= 0x7F) { ++ if (codepoint >= 0 && codepoint <= 0x7F) { + *out++ = codepoint; + } else if (codepoint <= 0x7FF) { +- *out++ = 0xC0 + ((codepoint & 0x7C0) >> 6); +- *out++ = 0x80 + ((codepoint & 0x03F)); ++ // encode UTF-8 errors as overlong representations of U+00 to U+7F ++ int cp = codepoint >= -0xFF && codepoint <= -0x80? 
++ -codepoint - 0x80 : ++ codepoint; ++ *out++ = 0xC0 + ((cp & 0x7C0) >> 6); ++ *out++ = 0x80 + ((cp & 0x03F)); + } else if(codepoint <= 0xFFFF) { + *out++ = 0xE0 + ((codepoint & 0xF000) >> 12); + *out++ = 0x80 + ((codepoint & 0x0FC0) >> 6); +diff --git a/src/jv_unicode.h b/src/jv_unicode.h +index 558721a..37c7fc0 100644 +--- a/src/jv_unicode.h ++++ b/src/jv_unicode.h +@@ -1,7 +1,18 @@ + #ifndef JV_UNICODE_H + #define JV_UNICODE_H + ++enum jvp_utf8_flags { ++ /* Emit replacement character instead of -1 for errors */ ++ JVP_UTF8_REPLACE = 1, ++ /* Treat input as WTF-8b, emit 0xD800 to 0xDFFF to denote encoded UTF-16 errors */ ++ JVP_UTF8_ERRORS_UTF16 = 2, ++ /* Treat input as WTF-8b, emit -0x80 to -0xFF to denote encoded UTF-8 errors */ ++ JVP_UTF8_ERRORS_UTF8 = 4, ++ JVP_UTF8_ERRORS_ALL = JVP_UTF8_ERRORS_UTF16 | JVP_UTF8_ERRORS_UTF8 ++}; ++ + const char* jvp_utf8_backtrack(const char* start, const char* min, int *missing_bytes); ++const char* jvp_utf8_extended_next(const char* in, const char* end, enum jvp_utf8_flags flags, int* codepoint); + const char* jvp_utf8_next(const char* in, const char* end, int* codepoint); + int jvp_utf8_is_valid(const char* in, const char* end); + +diff --git a/src/jv_utf8_tables.h b/src/jv_utf8_tables.h +index f1a4252..7c68749 100644 +--- a/src/jv_utf8_tables.h ++++ b/src/jv_utf8_tables.h +@@ -12,7 +12,7 @@ static const unsigned char utf8_coding_length[] = + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, +- 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, ++ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 
+ 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, + 0x04, 0x04, 0x04, 0x04, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; +@@ -29,7 +29,7 @@ static const unsigned char utf8_coding_bits[] = + 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, + 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, + 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, +- 0x00, 0x00, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, ++ 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, + 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, + 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, + 0x07, 0x07, 0x07, 0x07, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; +diff --git a/src/main.c b/src/main.c +index b154689..5fa5c4f 100644 +--- a/src/main.c ++++ b/src/main.c +@@ -30,6 +30,7 @@ + #include "jv.h" + #include "jq.h" + #include "jv_alloc.h" ++#include "jv_unicode.h" + #include "util.h" + #include "src/version.h" + +@@ -161,6 +162,30 @@ static const char *skip_shebang(const char *p) { + return n+1; + } + ++static void jvp_dump_raw_string(const char* start, const char* end, FILE* f) { ++ static const unsigned char UTF8_REPLACEMENT[] = {0xEF,0xBF,0xBD}; // U+FFFD REPLACEMENT CHARACTER ++ ++ const char* i = start; ++ const char* cstart; ++ int c; ++ ++ while ((i = jvp_utf8_extended_next((cstart = i), end, JVP_UTF8_ERRORS_ALL, &c))) { ++ if (c >= -0xFF && c <= -0x80) { ++ // invalid UTF-8 byte; pass through ++ fwrite(start, 1, cstart - start, f); ++ start = i; ++ fputc(-c, f); ++ } else if ((c >= 0xD800 && c <= 0xDFFF) || c == -1) { ++ // lone surrogate; can't be encoded to UTF-8 ++
fwrite(start, 1, cstart - start, f); ++ start = i; ++ fwrite(UTF8_REPLACEMENT, 1, sizeof(UTF8_REPLACEMENT), f); ++ } else ++ continue; ++ } ++ fwrite(start, 1, end - start, f); ++} ++ + static int process(jq_state *jq, jv value, int flags, int dumpopts) { + int ret = 14; // No valid results && -e -> exit(4) + jq_start(jq, value, flags); +@@ -170,7 +195,9 @@ static int process(jq_state *jq, jv value, int flags, int dumpopts) { + if (options & ASCII_OUTPUT) { + jv_dumpf(result, stdout, JV_PRINT_ASCII); + } else { +- fwrite(jv_string_value(result), 1, jv_string_length_bytes(jv_copy(result)), stdout); ++ const char *start = jv_string_value(result); ++ const char *end = start + jv_string_length_bytes(jv_copy(result)); ++ jvp_dump_raw_string(start, end, stdout); + } + ret = 0; + jv_free(result); +diff --git a/tests/jq.test b/tests/jq.test +index 7e2dd43..c882fd2 100644 +--- a/tests/jq.test ++++ b/tests/jq.test +@@ -57,6 +57,11 @@ null + "Aa\r\n\t\b\f\u03bc" + "Aa\u000d\u000a\u0009\u0008\u000c\u03bc" + ++# Check that unpaired surrogates are preserved in output ++"\u2200\ud800\u2203\udc00\u2205\udfff" ++null ++"∀\ud800∃\udc00∅\udfff" ++ + "inter\("pol" + "ation")" + null + "interpolation" +diff --git a/tests/shtest b/tests/shtest +index 86fec33..4c8b57e 100755 +--- a/tests/shtest ++++ b/tests/shtest +@@ -130,6 +130,15 @@ printf "[1,2][3,4]\n" | $JQ -cs add > $d/out 2>&1 + cmp $d/out $d/expected + + ++clean=false ++# Invalid UTF-8 bytes are preserved when encoding/decoding JSON ++dd if=/dev/urandom bs=1024 count=1024 >$d/rand 2>/dev/null ++$VALGRIND $Q $JQ -sR . $d/rand >$d/out.json ++$VALGRIND $Q $JQ -j . $d/out.json >$d/out ++cmp $d/out $d/rand ++clean=true ++ ++ + ## Test streaming parser + + ## If we add an option to stream to the `import ... 
as $symbol;` directive diff --git a/Correct-UTF-8-and-UTF-16-errors-during-concatenation.patch b/Correct-UTF-8-and-UTF-16-errors-during-concatenation.patch new file mode 100644 index 0000000000000000000000000000000000000000..6ceeda158004badc622261d49bed3e0b1d96fa42 --- /dev/null +++ b/Correct-UTF-8-and-UTF-16-errors-during-concatenation.patch @@ -0,0 +1,388 @@ +From 8829368f14943b8d2674c75805b27e56a569ad2c Mon Sep 17 00:00:00 2001 +From: Max Zerzouri +Date: Tue, 25 May 2021 22:59:59 +1200 +Subject: [PATCH] Correct UTF-8 and UTF-16 errors during concatenation + +UTF-8 errors and UTF-16 errors that were previously encoded into the +ends of +strings will now potentially be used to form correct code points. + +This is mostly a matter of making string equality behave as expected, since +without this normalisation, it is possible to produce `jv` strings that are +converted to UTF-8 or UTF-16 the same way but are not equal due to well-formed +code units that may or may not be encoded as errors. +--- + src/jv.c | 13 ++- + src/jv_unicode.c | 248 ++++++++++++++++++++++++++++++++++++++--------- + src/jv_unicode.h | 3 + + tests/jq.test | 15 +++ + 4 files changed, 230 insertions(+), 49 deletions(-) + +diff --git a/src/jv.c b/src/jv.c +index e979cc6..67d86fb 100644 +--- a/src/jv.c ++++ b/src/jv.c +@@ -522,20 +522,27 @@ static jv jvp_string_append(jv string, const char* data, uint32_t len) { + jvp_string* s = jvp_string_ptr(string); + uint32_t currlen = jvp_string_length(s); + ++ char join_buf[4]; ++ int join_len = jvp_utf8_extended_join(s->data, &currlen, &data, &len, join_buf); ++ + if (jvp_refcnt_unshared(string.u.ptr) && +- jvp_string_remaining_space(s) >= len) { ++ jvp_string_remaining_space(s) >= join_len + len) { + // the next string fits at the end of a ++ memcpy(s->data + currlen, join_buf, join_len); ++ currlen += join_len; + memcpy(s->data + currlen, data, len); + s->data[currlen + len] = 0; + s->length_hashed = (currlen + len) << 1; + return string; + } else { + //
allocate a bigger buffer and copy +- uint32_t allocsz = (currlen + len) * 2; ++ uint32_t allocsz = (currlen + join_len + len) * 2; + if (allocsz < 32) allocsz = 32; + jvp_string* news = jvp_string_alloc(allocsz); +- news->length_hashed = (currlen + len) << 1; ++ news->length_hashed = (currlen + join_len + len) << 1; + memcpy(news->data, s->data, currlen); ++ memcpy(news->data + currlen, join_buf, join_len); ++ currlen += join_len; + memcpy(news->data + currlen, data, len); + news->data[currlen + len] = 0; + jvp_string_free(string); +diff --git a/src/jv_unicode.c b/src/jv_unicode.c +index 8c47536..7d67300 100644 +--- a/src/jv_unicode.c ++++ b/src/jv_unicode.c +@@ -1,8 +1,72 @@ + #include ++#include + #include + #include "jv_unicode.h" + #include "jv_utf8_tables.h" + ++// length of encoding of erroneous UTF-8 byte ++#define UTF8_ERR_LEN 2 ++// length of encoding of erroneous UTF-16 surrogate ++#define UTF16_ERR_LEN 3 ++ ++#define U32(a, b, c, d) ( \ ++ (uint32_t) (a) << 0 | \ ++ (uint32_t) (b) << 8 | \ ++ (uint32_t) (c) << 16 | \ ++ (uint32_t) (d) << 24 \ ++) ++ ++#define BYTE(u32, n) ((uint32_t) (((u32) >> (n)*8) & 0xFF)) ++ ++#define B0 0x00 // 00000000 ++#define B1 0x80 // 10000000 ++#define B2 0xC0 // 11000000 ++#define B3 0xE0 // 11100000 ++#define B4 0xF0 // 11110000 ++#define B5 0xF8 // 11111000 ++ ++// NOTE: these flags are likely to be optimised out as `decode` gets inlined ++enum decode_flags { ++ DECODE_1 = 1, ++ DECODE_2 = 2, ++ DECODE_3 = 8, ++ DECODE_4 = 16 ++}; ++ ++// decode up to 4 bytes of "generalised UTF-8"; no checking for overlong ++// codings or out-of-range code points, works by testing all fixed bits in each ++// of the 4 coding patterns, then shifting the value bits according to the ++// pattern ++static int decode(enum decode_flags flags, uint32_t data, int* codepoint_ret) { ++ if((flags & DECODE_1) && (data & U32(B1, B0, B0, B0)) == 0){ ++ *codepoint_ret = BYTE(data, 0); ++ return 1; ++ } ++ if((flags & DECODE_2) && (data & U32(B3, B2, B0, 
B0)) == U32(B2, B1, B0, B0)){ ++ *codepoint_ret = ++ (BYTE(data, 0) & ~B3) << 6 | ++ (BYTE(data, 1) & ~B2) << 0; ++ return 2; ++ } ++ if((flags & DECODE_3) && (data & U32(B4, B2, B2, B0)) == U32(B3, B1, B1, B0)){ ++ *codepoint_ret = ++ (BYTE(data, 0) & ~B4) << 12 | ++ (BYTE(data, 1) & ~B2) << 6 | ++ (BYTE(data, 2) & ~B2) << 0; ++ return 3; ++ } ++ if((flags & DECODE_4) && (data & U32(B5, B2, B2, B2)) == U32(B4, B1, B1, B1)){ ++ *codepoint_ret = ++ (BYTE(data, 0) & ~B5) << 18 | ++ (BYTE(data, 1) & ~B2) << 12 | ++ (BYTE(data, 2) & ~B2) << 6 | ++ (BYTE(data, 3) & ~B2) << 0; ++ return 4; ++ } ++ *codepoint_ret = -1; ++ return 1; ++} ++ + // jvp_utf8_backtrack returns the beginning of the last codepoint in the + // string, assuming that start is the last byte in the string. + // If the last codepoint is incomplete, returns the number of missing bytes via +@@ -81,56 +145,42 @@ const char* jvp_utf8_extended_next(const char* in, const char* end, enum jvp_utf + if (in == end) { + return 0; + } +- int codepoint = -1; +- unsigned char first = (unsigned char)in[0]; +- int length = utf8_coding_length[first]; +- if ((first & 0x80) == 0) { ++ uint32_t data = in[0] & 0xFF; ++ if ((data & B1) == 0) { + /* Fast-path for ASCII */ +- codepoint = first; +- length = 1; +- } else if (length == 0 || length == UTF8_CONTINUATION_BYTE) { +- /* Bad single byte - either an invalid byte or an out-of-place continuation byte */ +- if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: bad single byte"); +- length = 1; +- } else if (in + length > end) { +- /* String ends before UTF8 sequence ends */ +- if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: underrun"); +- length = end - in; +- } else { +- codepoint = ((unsigned)in[0]) & utf8_coding_bits[first]; +- for (int i=1; i 0x10FFFF) { +- /* Outside Unicode range */ +- if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: out of range"); ++ } else if (0xD800 <= codepoint && codepoint <= 0xDFFF) 
{ ++ /* Surrogate codepoints are allowed in WTF-8/WTF-8b */ ++ if (!(flags & JVP_UTF8_ERRORS_UTF16)) { ++ /* Surrogate codepoints can't be encoded in UTF8 */ + codepoint = -1; + } ++ } else if (codepoint > 0x10FFFF) { ++ /* Outside Unicode range */ ++ if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: out of range"); ++ codepoint = -1; + } + if (codepoint == -1 && (flags & JVP_UTF8_REPLACE)) + codepoint = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER +@@ -139,6 +189,112 @@ const char* jvp_utf8_extended_next(const char* in, const char* end, enum jvp_utf + return in + length; + } + ++// assumes two bytes are readable from `in` ++static int decode_utf8_error(const char* in) { ++ uint32_t data = U32(in[0] & 0xFF, in[1] & 0xFF, 0, 0); ++ int codepoint; ++ if (decode(DECODE_2, data, &codepoint) == UTF8_ERR_LEN && codepoint < 0x80) ++ return codepoint + 0x80; ++ return -1; ++} ++ ++// assumes three bytes are readable from `in` ++static int decode_utf16_error(const char* in) { ++ uint32_t data = U32(in[0] & 0xFF, in[1] & 0xFF, in[2] & 0xFF, 0); ++ int codepoint; ++ if (decode(DECODE_3, data, &codepoint) == UTF16_ERR_LEN && codepoint >= 0xD800 && codepoint < 0xDFFF) ++ return codepoint; ++ return -1; ++} ++ ++// jvp_utf8_extended_join attempts to turn errors at the end of `a` and the ++// beginning of `b` into a valid code point. if a correction is possible, ++// `*alen_io`, `*bstart_io` and `*blen_io` are updated to exclude the existing ++// errors, and the UTF-8 encoding of the code point to insert is stored in ++// `out`. the number of bytes that should be inserted from `out` into the ++// middle of the strings is returned (up to 4). this will be 0 if there are no ++// bytes to insert. 
++int jvp_utf8_extended_join(const char* astart, uint32_t* alen_io, const char** bstart_io, uint32_t* blen_io, char* out) { ++ const char* aend = astart + *alen_io; ++ const char* bstart = *bstart_io; ++ const char* bend = bstart + *blen_io; ++ int bcp; ++ bstart = jvp_utf8_extended_next(bstart, bend, JVP_UTF8_ERRORS_ALL, &bcp); ++ if (!bstart) { ++ // end of string ++ return 0; ++ } ++ if (bcp >= 0xDC00 && bcp <= 0xDFFF) { ++ // UTF-16 tail surrogate, look for lead surrogate at the end of `a` ++ assert(bstart == *bstart_io + UTF16_ERR_LEN); ++ if (aend - astart < UTF16_ERR_LEN) ++ return 0; ++ int acp = decode_utf16_error(aend - UTF16_ERR_LEN); ++ if (acp >= 0xD800 && acp <= 0xDBFF) { ++ // UTF-16 lead surrogate, decode matching UTF-16 pair ++ *alen_io -= UTF16_ERR_LEN; ++ *blen_io -= UTF16_ERR_LEN; ++ *bstart_io += UTF16_ERR_LEN; ++ int codepoint = 0x10000 + (((acp - 0xD800) << 10) | (bcp - 0xDC00)); ++ return jvp_utf8_encode(codepoint, out); ++ } ++ return 0; ++ } ++ if (bcp >= -0xFF && bcp <= -0x80) { ++ // UTF-8 error, if it's a continuation byte, search backwards in `a` for the leading byte ++ bcp = -bcp; ++ assert(bstart == *bstart_io + UTF8_ERR_LEN); ++ if (utf8_coding_length[bcp] != UTF8_CONTINUATION_BYTE) ++ return 0; ++ // if there's a correctable error, we will consume up to 4 encoded error bytes total, with up to 3 bytes from each of `a` and `b` ++ unsigned char buf[6]; ++ unsigned char* bufstart = buf + 3; ++ unsigned char* bufend = bufstart; ++ *bufend++ = bcp; ++ int length; ++ // search backwards in `a` for a leading byte ++ for (;;) { ++ if (aend - astart < UTF8_ERR_LEN) ++ return 0; // `a` is too short ++ int acp = decode_utf8_error(aend - UTF8_ERR_LEN); ++ if (acp == -1) ++ return 0; // not a UTF-8 error ++ aend -= UTF8_ERR_LEN; ++ length = utf8_coding_length[acp]; ++ if (length == 0) ++ return 0; // not a possible UTF-8 byte ++ *--bufstart = acp; ++ if (length != UTF8_CONTINUATION_BYTE) ++ break; // found leading byte ++ if (bufstart == buf) ++ 
return 0; // too many continuation bytes ++ } ++ if (bufend - bufstart > length) ++ return 0; // too many continuation bytes ++ // search forwards in `b` for any more needed continuation bytes ++ while (bufend - bufstart < length) { ++ if (bend - bstart < UTF8_ERR_LEN) ++ return 0; // `b` is too short ++ bcp = decode_utf8_error(bstart); ++ if (bcp == -1 || utf8_coding_length[bcp] != UTF8_CONTINUATION_BYTE) ++ return 0; // not a UTF-8 error, didn't find enough continuation bytes ++ bstart += UTF8_ERR_LEN; ++ *bufend++ = bcp; ++ } ++ int codepoint; ++ // check that the bytes are strict UTF-8 ++ jvp_utf8_extended_next((char*)bufstart, (char*)bufend, 0, &codepoint); ++ if (codepoint != -1) { ++ memcpy(out, bufstart, bufend - bufstart); // copy only the collected bytes; the rest of `buf` was never written ++ *alen_io = aend - astart; ++ *blen_io = bend - bstart; ++ *bstart_io = bstart; ++ return bufend - bufstart; ++ } ++ } ++ return 0; ++} ++ + int jvp_utf8_is_valid(const char* in, const char* end) { + int codepoint; + while ((in = jvp_utf8_extended_next(in, end, 0, &codepoint))) { +diff --git a/src/jv_unicode.h b/src/jv_unicode.h +index 37c7fc0..ff2a437 100644 +--- a/src/jv_unicode.h ++++ b/src/jv_unicode.h +@@ -1,6 +1,8 @@ + #ifndef JV_UNICODE_H + #define JV_UNICODE_H + ++#include <stdint.h> ++ + enum jvp_utf8_flags { + /* Emit replacement character instead of -1 for errors */ + JVP_UTF8_REPLACE = 1, +@@ -14,6 +16,7 @@ enum jvp_utf8_flags { + const char* jvp_utf8_backtrack(const char* start, const char* min, int *missing_bytes); + const char* jvp_utf8_extended_next(const char* in, const char* end, enum jvp_utf8_flags flags, int* codepoint); + const char* jvp_utf8_next(const char* in, const char* end, int* codepoint); ++int jvp_utf8_extended_join(const char* astart, uint32_t* alen, const char** bstart, uint32_t* blen, char* out); + int jvp_utf8_is_valid(const char* in, const char* end); + + int jvp_utf8_decode_length(char startchar); +diff --git a/tests/jq.test b/tests/jq.test +index c882fd2..9e6c896 100644 +--- a/tests/jq.test ++++ b/tests/jq.test +@@ -62,6
+62,11 @@ null + null + "∀\ud800∃\udc00∅\udfff" + ++# Check that unpaired surrogates are paired when concatenated ++add ++["\ud83d","\ude43","\ud83e","\udd11","\ud83e","\udd17","\ud83e","\udd14","\ud83e","\udd10","\ud83d","\ude44","\ud83e","\udd12","\ud83e","\udd15","\ud83e","\udd13","\ud83e","\udd16","\ud83e","\udd18","\ud83c","\udffb","\ud83c","\udffc"] ++"🙃🤑🤗🤔🤐🙄🤒🤕🤓🤖🤘🏻🏼" ++ + "inter\("pol" + "ation")" + null + "interpolation" +@@ -87,6 +92,16 @@ null + "Zm/Ds2Jhcgo=" + "foóbar\n" + ++# test correction of UTF-8 errors when concatenating as binary data (input is a random sequence of code points) ++. as $text | @base64 | . as $b64 | [range(0, 300)] | map($b64[(.*4):((. + 1)*4)] | @base64d) | add | . == $text ++"򍨼衍򙮬񪜁򻴠󖂡󔁰񗏷󛊭񢠃򍧝𭌞󹰞󙴋𿋓󧜹򳔎񦰓򅆹򽐟󂑛򶃯㾱ꕽ񂊛򉙲򅤎􃖣󻣸󁸦򴏜򽃿􄑏󠦱񄛲񄕵񡿚򮩒񡏂򨆯򶚒󎮆󉨗򡮟򆿴񬏪򻀅㫑񉒗󴍶󬪸񝶑񂾑򇔣򉩉􂞇𲡀𨫆򤵇𲺝\u001c񖂟񳐉󲔹𳨬􀮔𸒙񜶻㊬񓐊񽒬󑀧󗧚󞌶󦥥𗌽𘀍󴼹􌇺򫗛񂷶󏷕񜁍񥬟󼁁󓺉𗟒򷝊𩕃񞝏񧄀󁲩򐀄򳂸񲊷򃀋񃫫𝷏򏖝򷂍󢭣􋛨𞪒򁁅勸󯩥󵪭񚮚򻡍騎񾊯򪓚񗡈񎕫򡯬񋫠ᕴ𞨹󾄇񩠶𙯾񢥱𚯴񬥷󢶖񾹌񡈟򧓑񒾘𚸯񳗺񭟡𫸬񷤖񷆐𖋌񦰃椀𫎾󗚋𿋆󈝰񺥲򝕊𵯮򙧚󬱃󍗞󱆃󂟙󟆺񻢬󸮤󗗉񉛮𺵡𰣒􁋙񻍛􇡘ᮍ񕥸񨵂盕嗪𻸮򶆍򊈤񽓎󙴐𗬜󾱒󷹰􇡈񨦎􏥩񴲡𨑮򱏝𭢊󕁶򣙥󶡮󮰌󿙾氕񼻘􆔪񢕀񊿃󮨝񑛖󣴊󎎏򳞓㊁󒭀󇜳𯄌𻙩" ++true ++ ++# test preservation of binary data when concatenating (input is a random sequence of UTF-16 surrogates encoded in WTF-8, should be treated as regular UTF-8 errors) ++@base64d | . as $text | @base64 | . as $b64 | [range(0, 300)] | map($b64[(.*4):((. + 1)*4)] | @base64d) | add | . 
== $text ++"7bKv7aiz7auX7aG37aO77aOe7auy7bmm7bqk7aG87bSH7a6m7bmc7bum7bqj7au+7bqf7aap7buC7byq7aS37aCp7aSl7a+a7bur7aGV7bGl7b6M7biB7aOe7ayR7amW7aOX7b637a+P7bu+7ayP7bOw7ba/7ayp7b6G7aqd7bG37bK57b6O7bq27a+u7a2N7ayu7bKK" ++true ++ + @uri + "\u03bc" + "%CE%BC" diff --git a/Update-base64-utf8bytelength-and-fromjson-to-handlebinary-strings.patch b/Update-base64-utf8bytelength-and-fromjson-to-handlebinary-strings.patch new file mode 100644 index 0000000000000000000000000000000000000000..f3cd1dcf863aa1f9f21f8c98201015d360abca9f --- /dev/null +++ b/Update-base64-utf8bytelength-and-fromjson-to-handlebinary-strings.patch @@ -0,0 +1,210 @@ +From a6ccbaad05bea30c5700b10bd51e46d390496a9b Mon Sep 17 00:00:00 2001 +From: Max Zerzouri +Date: Sun, 16 May 2021 09:18:51 +0000 +Subject: [PATCH] Update `@base64`, `utf8bytelength` and `fromjson` to handle + binary strings + +--- + docs/content/3.manual/manual.yml | 1 - + src/builtin.c | 107 ++++++++++++++++++++++++++----- + tests/base64.test | 10 +++ + tests/shtest | 19 ++++-- + 4 files changed, 116 insertions(+), 21 deletions(-) + +diff --git a/docs/content/3.manual/manual.yml b/docs/content/3.manual/manual.yml +index bfb17f4..1258dbf 100644 +--- a/docs/content/3.manual/manual.yml ++++ b/docs/content/3.manual/manual.yml +@@ -1843,7 +1843,6 @@ sections: + * `@base64d`: + + The inverse of `@base64`, input is decoded as specified by RFC 4648. +- Note\: If the decoded string is not UTF-8, the results are undefined. + + This syntax can be combined with string interpolation in a + useful way. 
You can follow a `@foo` token with a string +diff --git a/src/builtin.c b/src/builtin.c +index c6c8c2e..975bf49 100644 +--- a/src/builtin.c ++++ b/src/builtin.c +@@ -409,10 +409,55 @@ static jv f_dump(jq_state *jq, jv input) { + static jv f_json_parse(jq_state *jq, jv input) { + if (jv_get_kind(input) != JV_KIND_STRING) + return type_error(input, "only strings can be parsed"); +- jv res = jv_parse_sized(jv_string_value(input), +- jv_string_length_bytes(jv_copy(input))); ++ ++ const char* i = jv_string_value(input); ++ const char* end = i + jv_string_length_bytes(jv_copy(input)); ++ ++ struct jv_parser* parser = jv_parser_new(0); ++ int count = 0; ++ jv value = jv_invalid(); ++ while (i != NULL) { ++ const int max_utf8_len = 4; ++ unsigned char buf[100 + max_utf8_len]; ++ int buflen = 0; ++ int c; ++ while ((buflen + max_utf8_len < sizeof(buf)) && (i = jvp_utf8_extended_next(i, end, JVP_UTF8_REPLACE | JVP_UTF8_ERRORS_UTF8, &c))) { ++ if (c >= -0xFF && c <= -0x80) { ++ // Invalid UTF-8 byte, pass through ++ buf[buflen++] = -c; ++ } else ++ buflen += jvp_utf8_encode(c, buf + buflen); ++ } ++ jv_parser_set_buf(parser, buf, buflen, i != NULL); ++ for (;;) { ++ jv next = jv_parser_next(parser); ++ if (!jv_is_valid(next)) { ++ if (jv_invalid_has_msg(jv_copy(next))) { ++ count++; ++ jv_free(value); ++ value = next; ++ i = NULL; ++ } ++ break; ++ } ++ jv_free(value); ++ if (count++ == 0) ++ value = next; ++ else { ++ jv_free(next); ++ value = jv_invalid_with_msg(jv_string("Unexpected extra JSON values")); ++ i = NULL; ++ break; ++ } ++ } ++ } ++ jv_parser_free(parser); + jv_free(input); +- return res; ++ if (count == 0) { ++ jv_free(value); ++ value = jv_invalid_with_msg(jv_string("Expected JSON value")); ++ } ++ return value; + } + + static jv f_tonumber(jq_state *jq, jv input) { +@@ -457,7 +502,19 @@ static jv f_tostring(jq_state *jq, jv input) { + static jv f_utf8bytelength(jq_state *jq, jv input) { + if (jv_get_kind(input) != JV_KIND_STRING) + return type_error(input, 
"only strings have UTF-8 byte length"); +- return jv_number(jv_string_length_bytes(input)); ++ const char* i = jv_string_value(input); ++ const char* end = i + jv_string_length_bytes(jv_copy(input)); ++ int len = 0; ++ int c; ++ while ((i = jvp_utf8_extended_next(i, end, JVP_UTF8_REPLACE | JVP_UTF8_ERRORS_UTF8, &c))) { ++ if (c >= -0xFF && c <= -0x80) { ++ // Invalid UTF-8 byte, will be passed through ++ len++; ++ } else ++ len += jvp_utf8_encode_length(c); ++ } ++ jv_free(input); ++ return jv_number(len); + } + + #define CHARS_ALPHANUM "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789" +@@ -632,21 +689,41 @@ static jv f_format(jq_state *jq, jv input, jv fmt) { + jv_free(fmt); + input = f_tostring(jq, input); + jv line = jv_string(""); +- const unsigned char* data = (const unsigned char*)jv_string_value(input); +- int len = jv_string_length_bytes(jv_copy(input)); +- for (int i=0; i<len; i+=3) { +- uint32_t code = 0; +- int n = len - i >= 3 ? 3 : len-i; +- for (int j=0; j<3; j++) { ++ const char* i = jv_string_value(input); ++ const char* end = i + jv_string_length_bytes(jv_copy(input)); ++ uint32_t code = 0; ++ int n = 0; ++ int c; ++ while ((i = jvp_utf8_extended_next(i, end, JVP_UTF8_REPLACE | JVP_UTF8_ERRORS_UTF8, &c))) { ++ unsigned char ubuf[4]; ++ int len = 0; ++ if (c >= -0xFF && c <= -0x80) { ++ // Invalid UTF-8 byte, pass through ++ ubuf[len++] = -c; ++ } else ++ len += jvp_utf8_encode(c, ubuf); ++ for (int x = 0; x < len; x++) { + code <<= 8; +- code |= j < n ? 
(unsigned)data[i+j] : 0; ++ code |= ubuf[x]; ++ if (++n == 3) { ++ char buf[4]; ++ for (int j = 0; j < 4; j++) ++ buf[j] = BASE64_ENCODE_TABLE[(code >> (18 - j*6)) & 0x3f]; ++ line = jv_string_append_buf(line, buf, sizeof(buf)); ++ n = 0; ++ code = 0; ++ } + } ++ } ++ if (n > 0) { ++ assert(n < 3); ++ code <<= 8*(3 - n); + char buf[4]; +- for (int j=0; j<4; j++) { ++ for (int j = 0; j < 4; j++) + buf[j] = BASE64_ENCODE_TABLE[(code >> (18 - j*6)) & 0x3f]; +- } +- if (n < 3) buf[3] = '='; +- if (n < 2) buf[2] = '='; ++ buf[3] = '='; ++ if (n < 2) ++ buf[2] = '='; + line = jv_string_append_buf(line, buf, sizeof(buf)); + } + jv_free(input); +diff --git a/tests/base64.test b/tests/base64.test +index 0f82b0b..6507bb8 100644 +--- a/tests/base64.test ++++ b/tests/base64.test +@@ -33,3 +33,13 @@ + . | try @base64d catch . + "QUJDa" + "string (\"QUJDa\") trailing base64 byte found" ++ ++# random binary data ++(. | @base64d | @base64) == . ++"zns0Su1i4JjDfGiR95WOcU8iiPMOrfJTUBm9P1ot2qIMiyk04b0WSIFNTMD7w9ziMV8nSbwpPqNl3JKF1eWZrRRg24rbvh66O1e7Z1xIGPNqTqm+jdzRCkWSryR+67wXRVgD6Q==" ++true ++ ++# replace lone surrogates ++@base64 ++"foo\udca9\ud83dbar" ++"Zm9v77+977+9YmFy" +diff --git a/tests/shtest b/tests/shtest +index 4c8b57e..7de61e4 100755 +--- a/tests/shtest ++++ b/tests/shtest +@@ -131,11 +131,20 @@ cmp $d/out $d/expected + + + clean=false +-# Invalid UTF-8 bytes are preserved when encoding/decoding JSON +-dd if=/dev/urandom bs=1024 count=1024 >$d/rand 2>/dev/null +-$VALGRIND $Q $JQ -sR . $d/rand >$d/out.json +-$VALGRIND $Q $JQ -j . $d/out.json >$d/out +-cmp $d/out $d/rand ++# Invalid UTF-8 bytes are preserved when encoding/decoding JSON and base64 and concatenating binary strings ++if dd if=/dev/urandom bs=1024 count=1024 >$d/rand 2>/dev/null; then ++ $VALGRIND $Q $JQ -sR . $d/rand >$d/out.json ++ $VALGRIND $Q $JQ -j . 
$d/out.json >$d/out ++ cmp $d/out $d/rand ++ $VALGRIND $Q $JQ -jR fromjson $d/out.json >$d/out ++ cmp $d/out $d/rand ++ $VALGRIND $Q $JQ -j '@base64 | @base64d' $d/out.json >$d/out ++ cmp $d/out $d/rand ++ base64 $d/rand | $VALGRIND $Q $JQ -R '@base64d' | $VALGRIND $Q $JQ -sj 'add' >$d/out ++ cmp $d/out $d/rand ++ $VALGRIND $Q $JQ -nj '$a' --rawfile a $d/rand >$d/out ++ cmp $d/out $d/rand ++fi + clean=true + + diff --git a/jq.spec b/jq.spec index d546e387e19a3eb9bb7278a95dbd0791b6131e58..922b1dd90d1ca348980864d9cddd3a0be2d53e6a 100644 --- a/jq.spec +++ b/jq.spec @@ -1,12 +1,17 @@ Name: jq Version: 1.6 -Release: 1 +Release: 2 Summary: A lightweight and flexible command-line JSON processor License: MIT and ASL 2.0 and CC-BY and GPLv3 URL: http://stedolan.github.io/jq/ Source0: https://github.com/stedolan/jq/releases/download/jq-%{version}/jq-%{version}.tar.gz BuildRequires: make flex bison valgrind gcc chrpath oniguruma-devel +Patch0001: jv_string_implode-avoid-producing-unprintable-string-fromreserved-code-points.patch +Patch0002: Binary-strings-preserve-UTF-8-and-UTF-16-errors.patch +Patch0003: Update-base64-utf8bytelength-and-fromjson-to-handlebinary-strings.patch +Patch0004: Correct-UTF-8-and-UTF-16-errors-during-concatenation.patch + %description jq is a lightweight and flexible command-line JSON processor. you can use it to slice and filter and map and transform structured data. @@ -28,15 +33,15 @@ BuildArch: noarch Documentation for jq package. 
%prep -%autosetup -n jq-%{version} +%autosetup -n jq-%{version} -p1 %build -%configure --disable-static +%configure %make_build %install %make_install -%delete_la +%delete_la_and_a chrpath -d %{buildroot}%{_bindir}/%{name} %check @@ -70,6 +75,9 @@ make check %changelog +* Mon Aug 30 2021 lingsheng - 1.6-2 +- Support binary strings: preserve UTF-8 and UTF-16 errors + * Wed Aug 25 2021 wangyue - 1.6-1 - Upgrade to 1.6 diff --git a/jv_string_implode-avoid-producing-unprintable-string-fromreserved-code-points.patch b/jv_string_implode-avoid-producing-unprintable-string-fromreserved-code-points.patch new file mode 100644 index 0000000000000000000000000000000000000000..a90b87511934433fa74951c442c9907f5f7d8fa0 --- /dev/null +++ b/jv_string_implode-avoid-producing-unprintable-string-fromreserved-code-points.patch @@ -0,0 +1,23 @@ +From e165542664e9fe3c155eeb13e16320a07dfbd5fd Mon Sep 17 00:00:00 2001 +From: Max Zerzouri +Date: Sat, 15 May 2021 10:50:15 +0000 +Subject: [PATCH] jv_string_implode: avoid producing unprintable string from + reserved code points + +--- + src/jv.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/src/jv.c b/src/jv.c +index 979d188..1f1029e 100644 +--- a/src/jv.c ++++ b/src/jv.c +@@ -725,7 +725,7 @@ jv jv_string_implode(jv j) { + jv n = jv_array_get(jv_copy(j), i); + assert(jv_get_kind(n) == JV_KIND_NUMBER); + int nv = jv_number_value(n); +- if (nv > 0x10FFFF) ++ if (nv < 0 || (nv >= 0xD800 && nv <= 0xDFFF) || nv > 0x10FFFF) + nv = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER + s = jv_string_append_codepoint(s, nv); + }