From 133123019833587229a5b1053fd1203cb9b15e87 Mon Sep 17 00:00:00 2001 From: zhaozhibo Date: Wed, 12 Jan 2022 15:53:12 +0800 Subject: [PATCH 1/6] Fix conversion function bug in utf_helper when utf8 characters do not end with '\0' and reconstruct ecmastring Signed-off-by: zhaozhibo --- ecmascript/base/number_helper.cpp | 11 +++----- ecmascript/base/utf_helper.cpp | 43 +++---------------------------- ecmascript/base/utf_helper.h | 4 +-- ecmascript/ecma_string-inl.h | 4 +-- ecmascript/ecma_string.cpp | 14 +++++----- ecmascript/ecma_string.h | 6 ++--- ecmascript/ecma_string_table.cpp | 2 +- 7 files changed, 22 insertions(+), 62 deletions(-) diff --git a/ecmascript/base/number_helper.cpp b/ecmascript/base/number_helper.cpp index a6c3410bab..c6b1bf458e 100644 --- a/ecmascript/base/number_helper.cpp +++ b/ecmascript/base/number_helper.cpp @@ -69,14 +69,9 @@ bool NumberHelper::GotoNonspace(uint8_t **ptr, const uint8_t *end) while (*ptr < end) { uint16_t c = **ptr; size_t size = 1; - if (**ptr > INT8_MAX) { - size = 0; - uint16_t utf8Bit = INT8_MAX + 1; // equal 0b1000'0000 - while (utf8Bit > 0 && (c & utf8Bit) == utf8Bit) { - ++size; - utf8Bit >>= 1UL; - } - if (base::utf_helper::ConvertRegionUtf8ToUtf16(*ptr, &c, 1, 0) <= 0) { + if (c > INT8_MAX) { + size = base::utf_helper::ConvertRegionUtf8ToUtf16(*ptr, &c, end - *ptr, 1, 0); + if (size <= 0) { return true; } } diff --git a/ecmascript/base/utf_helper.cpp b/ecmascript/base/utf_helper.cpp index 5cb168e06f..bcda59079e 100644 --- a/ecmascript/base/utf_helper.cpp +++ b/ecmascript/base/utf_helper.cpp @@ -216,48 +216,13 @@ std::pair ConvertUtf8ToUtf16Pair(const uint8_t *data, bool com return {pair, UtfLength::FOUR}; } -size_t Utf8ToUtf16Size(const uint8_t *utf8) +size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len) { - size_t res = 0; - while (*utf8 != '\0') { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) - auto [pair, nbytes] = ConvertUtf8ToUtf16Pair(utf8); - res += pair > 0xffff ? UtfLength::TWO : UtfLength::ONE; // NOLINT(readability-magic-numbers) - utf8 += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) - } - return res; + return utf::Utf8ToMUtf16Size(utf8, utf8Len); } -size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf16Len, size_t start) +size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len, size_t start) { - ASSERT(utf16Out != nullptr); - size_t outPos = 0; - while (*utf8In != '\0') { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) - auto [pair, nbytes] = ConvertUtf8ToUtf16Pair(utf8In); - auto [pHi, pLo] = utf::SplitUtf16Pair(pair); - - utf8In += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) - if (start > 0) { - start -= nbytes; - continue; - } - - if (pHi != 0) { - if (outPos >= utf16Len - 1) { // check for place for two uint16 - break; - } - outPos++; - *utf16Out++ = pHi; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) - } - if (outPos >= utf16Len) { - break; - } - outPos++; - *utf16Out++ = pLo; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) - if (outPos >= utf16Len) { - break; - } - } - - return outPos; + return utf::ConvertRegionMUtf8ToUtf16(utf8In, utf16Out, utf8Len, utf16Len, start); } } // namespace panda::ecmascript::base::utf_helper diff --git a/ecmascript/base/utf_helper.h b/ecmascript/base/utf_helper.h index 29abed4908..1808167543 100644 --- a/ecmascript/base/utf_helper.h +++ b/ecmascript/base/utf_helper.h @@ -69,9 +69,9 @@ size_t ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_ std::pair ConvertUtf8ToUtf16Pair(const uint8_t *data, bool combine = false); -size_t Utf8ToUtf16Size(const uint8_t *utf8); +size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len); -size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf16Len, size_t start); +size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len, size_t start); static inline uint32_t CombineTwoU16(uint16_t d0, uint16_t d1) { diff --git a/ecmascript/ecma_string-inl.h b/ecmascript/ecma_string-inl.h index 5cffa1c1b1..88e917a248 100644 --- a/ecmascript/ecma_string-inl.h +++ b/ecmascript/ecma_string-inl.h @@ -63,12 +63,12 @@ inline EcmaString *EcmaString::CreateFromUtf8(const uint8_t *utf8Data, uint32_t UNREACHABLE(); } } else { - auto utf16Len = base::utf_helper::Utf8ToUtf16Size(utf8Data); + auto utf16Len = base::utf_helper::Utf8ToUtf16Size(utf8Data, utf8Len); string = AllocStringObject(utf16Len, false, vm); ASSERT(string != nullptr); [[maybe_unused]] auto len = - base::utf_helper::ConvertRegionUtf8ToUtf16(utf8Data, string->GetDataUtf16Writable(), utf16Len, 0); + base::utf_helper::ConvertRegionUtf8ToUtf16(utf8Data, string->GetDataUtf16Writable(), utf8Len, utf16Len, 0); ASSERT(len == utf16Len); } diff --git a/ecmascript/ecma_string.cpp b/ecmascript/ecma_string.cpp index 7e9fa5fbba..4558a864e2 100644 --- a/ecmascript/ecma_string.cpp +++ b/ecmascript/ecma_string.cpp @@ -321,7 +321,7 @@ bool EcmaString::StringsAreEqualUtf8(const EcmaString *str1, const uint8_t *utf8 Span data2(utf8Data, utf8Len); return EcmaString::StringsAreEquals(data1, data2); } - return IsUtf8EqualsUtf16(utf8Data, str1->GetDataUtf16(), str1->GetLength()); + return IsUtf8EqualsUtf16(utf8Data, utf8Len, str1->GetDataUtf16(), str1->GetLength()); } /* static */ @@ -331,7 +331,7 @@ bool EcmaString::StringsAreEqualUtf16(const EcmaString *str1, const uint16_t *ut if (str1->GetLength() != utf16Len) { result = false; } else if (!str1->IsUtf16()) { - result = IsUtf8EqualsUtf16(str1->GetDataUtf8(), utf16Data, utf16Len); + result = IsUtf8EqualsUtf16(str1->GetDataUtf8(), str1->GetLength(), utf16Data, utf16Len); } else { Span data1(str1->GetDataUtf16(), str1->GetLength()); Span data2(utf16Data, utf16Len); @@ -422,15 +422,15 @@ uint32_t EcmaString::ComputeHashcode() const } /* static */ -uint32_t EcmaString::ComputeHashcodeUtf8(const uint8_t *utf8Data, bool canBeCompress) +uint32_t EcmaString::ComputeHashcodeUtf8(const uint8_t *utf8Data, size_t utf8Len, bool canBeCompress) { uint32_t hash; if (canBeCompress) { hash = ComputeHashForUtf8(utf8Data); } else { - auto utf16Len = base::utf_helper::Utf8ToUtf16Size(utf8Data); + auto utf16Len = base::utf_helper::Utf8ToUtf16Size(utf8Data, utf8Len); CVector tmpBuffer(utf16Len); - [[maybe_unused]] auto len = base::utf_helper::ConvertRegionUtf8ToUtf16(utf8Data, tmpBuffer.data(), utf16Len, 0); + [[maybe_unused]] auto len = base::utf_helper::ConvertRegionUtf8ToUtf16(utf8Data, tmpBuffer.data(), utf8Len, utf16Len, 0); ASSERT(len == utf16Len); hash = ComputeHashForData(tmpBuffer.data(), utf16Len); } @@ -444,12 +444,12 @@ uint32_t EcmaString::ComputeHashcodeUtf16(const uint16_t *utf16Data, uint32_t le } /* static */ -bool EcmaString::IsUtf8EqualsUtf16(const uint8_t *utf8Data, const uint16_t *utf16Data, uint32_t utf16Len) +bool EcmaString::IsUtf8EqualsUtf16(const uint8_t *utf8Data, size_t utf8Len, const uint16_t *utf16Data, uint32_t utf16Len) { // length is one more than compared utf16Data, don't need convert all utf8Data to utf16Data uint32_t utf8ConvertLength = utf16Len + 1; CVector tmpBuffer(utf8ConvertLength); - auto len = base::utf_helper::ConvertRegionUtf8ToUtf16(utf8Data, tmpBuffer.data(), utf8ConvertLength, 0); + auto len = base::utf_helper::ConvertRegionUtf8ToUtf16(utf8Data, tmpBuffer.data(), utf8Len, utf8ConvertLength, 0); if (len != utf16Len) { return false; } diff --git a/ecmascript/ecma_string.h b/ecmascript/ecma_string.h index 453b2d1250..d4e751dba4 100644 --- a/ecmascript/ecma_string.h +++ b/ecmascript/ecma_string.h @@ -171,7 +171,7 @@ public: } return length; } - return base::utf_helper::ConvertRegionUtf8ToUtf16(GetDataUtf8(), buf, maxLength, start); + return base::utf_helper::ConvertRegionUtf8ToUtf16(GetDataUtf8(), buf, len, maxLength, start); } // NOLINTNEXTLINE(modernize-avoid-c-arrays) @@ -245,7 +245,7 @@ public: * Compares strings by bytes, It doesn't check canonical unicode equivalence. */ static bool StringsAreEqualUtf16(const EcmaString *str1, const uint16_t *utf16Data, uint32_t utf16Len); - static uint32_t ComputeHashcodeUtf8(const uint8_t *utf8Data, bool canBeCompress); + static uint32_t ComputeHashcodeUtf8(const uint8_t *utf8Data, size_t utf8Len, bool canBeCompress); static uint32_t ComputeHashcodeUtf16(const uint16_t *utf16Data, uint32_t length); static void SetCompressedStringsEnabled(bool val) @@ -303,7 +303,7 @@ private: * str1 should have the same length as utf16_data. * Converts utf8Data to utf16 and compare it with given utf16_data. */ - static bool IsUtf8EqualsUtf16(const uint8_t *utf8Data, const uint16_t *utf16Data, uint32_t utf16Len); + static bool IsUtf8EqualsUtf16(const uint8_t *utf8Data, size_t utf8Len, const uint16_t *utf16Data, uint32_t utf16Len); template /** diff --git a/ecmascript/ecma_string_table.cpp b/ecmascript/ecma_string_table.cpp index e74381da57..33629dad26 100644 --- a/ecmascript/ecma_string_table.cpp +++ b/ecmascript/ecma_string_table.cpp @@ -26,7 +26,7 @@ EcmaStringTable::EcmaStringTable(const EcmaVM *vm) : vm_(vm) {} EcmaString *EcmaStringTable::GetString(const uint8_t *utf8Data, uint32_t utf8Len, bool canBeCompress) const { - uint32_t hashCode = EcmaString::ComputeHashcodeUtf8(utf8Data, canBeCompress); + uint32_t hashCode = EcmaString::ComputeHashcodeUtf8(utf8Data, utf8Len, canBeCompress); for (auto it = table_.find(hashCode); it != table_.end(); it++) { auto foundedString = it->second; if (EcmaString::StringsAreEqualUtf8(foundedString, utf8Data, utf8Len, canBeCompress)) { -- Gitee From 60b048ae126a405c2fc915e4292d065c66915480 Mon Sep 17 00:00:00 2001 From: zhaozhibo Date: Wed, 12 Jan 2022 16:19:49 +0800 Subject: [PATCH 2/6] fix conversion function bug in utf_helper when utf8 characters do not end with '\0' and reconstruct ecmastring Signed-off-by: zhaozhibo --- ecmascript/base/utf_helper.cpp | 5 +++-- ecmascript/base/utf_helper.h | 3 ++- ecmascript/ecma_string.cpp | 6 ++++-- ecmascript/ecma_string.h | 3 ++- 4 files changed, 11 insertions(+), 6 deletions(-) diff --git a/ecmascript/base/utf_helper.cpp b/ecmascript/base/utf_helper.cpp index bcda59079e..bdf4daeba3 100644 --- a/ecmascript/base/utf_helper.cpp +++ b/ecmascript/base/utf_helper.cpp @@ -218,10 +218,11 @@ std::pair ConvertUtf8ToUtf16Pair(const uint8_t *data, bool com size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len) { - return utf::Utf8ToMUtf16Size(utf8, utf8Len); + return utf::MUtf8ToUtf16Size(utf8, utf8Len); } -size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len, size_t start) +size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len, + size_t start) { return utf::ConvertRegionMUtf8ToUtf16(utf8In, utf16Out, utf8Len, utf16Len, start); } diff --git a/ecmascript/base/utf_helper.h b/ecmascript/base/utf_helper.h index 1808167543..6182b6dc86 100644 --- a/ecmascript/base/utf_helper.h +++ b/ecmascript/base/utf_helper.h @@ -71,7 +71,8 @@ std::pair ConvertUtf8ToUtf16Pair(const uint8_t *data, bool com size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len); -size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len, size_t start); +size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len, + size_t start); static inline uint32_t CombineTwoU16(uint16_t d0, uint16_t d1) { diff --git a/ecmascript/ecma_string.cpp b/ecmascript/ecma_string.cpp index 4558a864e2..d3d6504c3a 100644 --- a/ecmascript/ecma_string.cpp +++ b/ecmascript/ecma_string.cpp @@ -430,7 +430,8 @@ uint32_t EcmaString::ComputeHashcodeUtf8(const uint8_t *utf8Data, size_t utf8Len } else { auto utf16Len = base::utf_helper::Utf8ToUtf16Size(utf8Data, utf8Len); CVector tmpBuffer(utf16Len); - [[maybe_unused]] auto len = base::utf_helper::ConvertRegionUtf8ToUtf16(utf8Data, tmpBuffer.data(), utf8Len, utf16Len, 0); + [[maybe_unused]] auto len = base::utf_helper::ConvertRegionUtf8ToUtf16(utf8Data, tmpBuffer.data(), utf8Len, + utf16Len, 0); ASSERT(len == utf16Len); hash = ComputeHashForData(tmpBuffer.data(), utf16Len); } @@ -444,7 +445,8 @@ uint32_t EcmaString::ComputeHashcodeUtf16(const uint16_t *utf16Data, uint32_t le } /* static */ -bool EcmaString::IsUtf8EqualsUtf16(const uint8_t *utf8Data, size_t utf8Len, const uint16_t *utf16Data, uint32_t utf16Len) +bool EcmaString::IsUtf8EqualsUtf16(const uint8_t *utf8Data, size_t utf8Len, const uint16_t *utf16Data, + uint32_t utf16Len) { // length is one more than compared utf16Data, don't need convert all utf8Data to utf16Data uint32_t utf8ConvertLength = utf16Len + 1; diff --git a/ecmascript/ecma_string.h b/ecmascript/ecma_string.h index d4e751dba4..e7f6d7ef14 100644 --- a/ecmascript/ecma_string.h +++ b/ecmascript/ecma_string.h @@ -303,7 +303,8 @@ private: * str1 should have the same length as utf16_data. * Converts utf8Data to utf16 and compare it with given utf16_data. */ - static bool IsUtf8EqualsUtf16(const uint8_t *utf8Data, size_t utf8Len, const uint16_t *utf16Data, uint32_t utf16Len); + static bool IsUtf8EqualsUtf16(const uint8_t *utf8Data, size_t utf8Len, const uint16_t *utf16Data, + uint32_t utf16Len); template /** -- Gitee From 47e6e541d8b1e0d3840c3d99d24d4bbd9b3dc3f6 Mon Sep 17 00:00:00 2001 From: zhaozhibo Date: Wed, 12 Jan 2022 16:46:51 +0800 Subject: [PATCH 3/6] fix conversion function bug in 'utf_helper' when utf8 characters do not end with '\0' and reconstruct ecmastring Signed-off-by: zhaozhibo --- ecmascript/base/utf_helper.cpp | 2 +- ecmascript/base/utf_helper.h | 2 +- ecmascript/tests/ecma_string_test.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ecmascript/base/utf_helper.cpp b/ecmascript/base/utf_helper.cpp index bdf4daeba3..88602385d1 100644 --- a/ecmascript/base/utf_helper.cpp +++ b/ecmascript/base/utf_helper.cpp @@ -35,7 +35,7 @@ bool IsValidUTF8(const std::vector &data) uint32_t length = data.size(); switch (length) { case UtfLength::ONE: - if (data.at(0) > BIT_MASK_1) { + if (data.at(0) >= BIT_MASK_1) { return false; } break; diff --git a/ecmascript/base/utf_helper.h b/ecmascript/base/utf_helper.h index 6182b6dc86..64df9d09ce 100644 --- a/ecmascript/base/utf_helper.h +++ b/ecmascript/base/utf_helper.h @@ -71,7 +71,7 @@ std::pair ConvertUtf8ToUtf16Pair(const uint8_t *data, bool com size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len); -size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len, +size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len, size_t start); static inline uint32_t CombineTwoU16(uint16_t d0, uint16_t d1) diff --git a/ecmascript/tests/ecma_string_test.cpp b/ecmascript/tests/ecma_string_test.cpp index f3e39b5b2e..4bb105911d 100644 --- a/ecmascript/tests/ecma_string_test.cpp +++ b/ecmascript/tests/ecma_string_test.cpp @@ -1647,7 +1647,7 @@ HWTEST_F_L0(EcmaStringTest, ComputeHashcodeUtf8) for (uint32_t i = 0; i < lengthEcmaStrU8; i++) { hashExpect = hashExpect * 31 + arrayU8[i]; } - EXPECT_EQ(EcmaString::ComputeHashcodeUtf8(&arrayU8[0], lengthEcmaStrU8), static_cast(hashExpect)); + EXPECT_EQ(EcmaString::ComputeHashcodeUtf8(&arrayU8[0], lengthEcmaStrU8, true), static_cast(hashExpect)); } /* -- Gitee From c0602b8b7742df703d9e2f383d7f7e6aa5254708 Mon Sep 17 00:00:00 2001 From: zhaozhibo Date: Wed, 12 Jan 2022 17:45:37 +0800 Subject: [PATCH 4/6] Fix conversion function bug in 'utf_helper' when utf8 characters do not end with '\0' and reconstruct ecmastring Signed-off-by: zhaozhibo --- ecmascript/base/number_helper.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/ecmascript/base/number_helper.cpp b/ecmascript/base/number_helper.cpp index c6b1bf458e..140b475b02 100644 --- a/ecmascript/base/number_helper.cpp +++ b/ecmascript/base/number_helper.cpp @@ -70,8 +70,13 @@ bool NumberHelper::GotoNonspace(uint8_t **ptr, const uint8_t *end) uint16_t c = **ptr; size_t size = 1; if (c > INT8_MAX) { - size = base::utf_helper::ConvertRegionUtf8ToUtf16(*ptr, &c, end - *ptr, 1, 0); - if (size <= 0) { + size = 0; + uint16_t utf8Bit = INT8_MAX + 1; // equal 0b1000'0000 + while (utf8Bit > 0 && (c & utf8Bit) == utf8Bit) { + ++size; + utf8Bit >>= 1UL; + } + if (base::utf_helper::ConvertRegionUtf8ToUtf16(*ptr, &c, end - *ptr, 1, 0) <= 0) { return true; } } -- Gitee From 2e8f8e557002cd9d8404fad3f2364ad556ff4e2b Mon Sep 17 00:00:00 2001 From: zhaozhibo Date: Thu, 13 Jan 2022 11:38:03 +0800 Subject: [PATCH 5/6] fix conversion function bug in 'utf_helper' when utf8 characters do not end with '\0' and reconstruct ecmastring Signed-off-by: zhaozhibo --- ecmascript/base/number_helper.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ecmascript/base/number_helper.cpp b/ecmascript/base/number_helper.cpp index 140b475b02..4f6130c5c8 100644 --- a/ecmascript/base/number_helper.cpp +++ b/ecmascript/base/number_helper.cpp @@ -69,7 +69,7 @@ bool NumberHelper::GotoNonspace(uint8_t **ptr, const uint8_t *end) while (*ptr < end) { uint16_t c = **ptr; size_t size = 1; - if (c > INT8_MAX) { + if (**ptr > INT8_MAX) { size = 0; uint16_t utf8Bit = INT8_MAX + 1; // equal 0b1000'0000 while (utf8Bit > 0 && (c & utf8Bit) == utf8Bit) { -- Gitee From aa7c203e742f3f99f92f909e8c898fb8705bee8a Mon Sep 17 00:00:00 2001 From: zhaozhibo Date: Thu, 13 Jan 2022 11:39:09 +0800 Subject: [PATCH 6/6] fix conversion function bug in 'utf_helper' when utf8 characters do not end with '\0' and reconstruct ecmastring Signed-off-by: zhaozhibo --- ecmascript/base/number_helper.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ecmascript/base/number_helper.cpp b/ecmascript/base/number_helper.cpp index 4f6130c5c8..140b475b02 100644 --- a/ecmascript/base/number_helper.cpp +++ b/ecmascript/base/number_helper.cpp @@ -69,7 +69,7 @@ bool NumberHelper::GotoNonspace(uint8_t **ptr, const uint8_t *end) while (*ptr < end) { uint16_t c = **ptr; size_t size = 1; - if (**ptr > INT8_MAX) { + if (c > INT8_MAX) { size = 0; uint16_t utf8Bit = INT8_MAX + 1; // equal 0b1000'0000 while (utf8Bit > 0 && (c & utf8Bit) == utf8Bit) { -- Gitee