diff --git a/ecmascript/base/number_helper.cpp b/ecmascript/base/number_helper.cpp index a6c3410babc5fbb3f8ae8a33f4cd0e984ee2453d..23317cb969889bcc32dc0b9ecb9ffd63b49d0e64 100644 --- a/ecmascript/base/number_helper.cpp +++ b/ecmascript/base/number_helper.cpp @@ -69,14 +69,14 @@ bool NumberHelper::GotoNonspace(uint8_t **ptr, const uint8_t *end) while (*ptr < end) { uint16_t c = **ptr; size_t size = 1; - if (**ptr > INT8_MAX) { + if (c > INT8_MAX) { size = 0; uint16_t utf8Bit = INT8_MAX + 1; // equal 0b1000'0000 while (utf8Bit > 0 && (c & utf8Bit) == utf8Bit) { ++size; utf8Bit >>= 1UL; } - if (base::utf_helper::ConvertRegionUtf8ToUtf16(*ptr, &c, 1, 0) <= 0) { + if (utf_helper::ConvertRegionMUtf8ToUtf16(*ptr, &c, end - *ptr, 1, 0) <= 0) { return true; } } diff --git a/ecmascript/base/utf_helper.cpp b/ecmascript/base/utf_helper.cpp index 5cb168e06f8e10c0c3fccec8f7e428c0999166a5..be1520762d6eae28b5429e45cb980943dccaa283 100644 --- a/ecmascript/base/utf_helper.cpp +++ b/ecmascript/base/utf_helper.cpp @@ -180,6 +180,32 @@ size_t ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_ return utf8Pos; } +size_t ConvertRegionUtf16ToMUtf8(const uint16_t *utf16_in, uint8_t *mutf8_out, size_t utf16_len, size_t mutf8_len, + size_t start) +{ + return utf::ConvertRegionUtf16ToMUtf8(utf16_in, mutf8_out, utf16_len, mutf8_len, start); +} + +int CompareMUtf8ToMUtf8(const uint8_t *mutf8_1, const uint8_t *mutf8_2) +{ + return utf::CompareMUtf8ToMUtf8(mutf8_1, mutf8_2); +} + +int CompareUtf8ToUtf8(const uint8_t *utf8_1, size_t utf8_1_length, const uint8_t *utf8_2, size_t utf8_2_length) +{ + return utf::CompareUtf8ToUtf8(utf8_1, utf8_1_length, utf8_2, utf8_2_length); +} + +bool IsEqual(Span utf8_1, Span utf8_2) +{ + return utf::IsEqual(utf8_1, utf8_2); +} + +bool IsEqual(const uint8_t *mutf8_1, const uint8_t *mutf8_2) +{ + return utf::IsEqual(mutf8_1, mutf8_2); +} + std::pair ConvertUtf8ToUtf16Pair(const uint8_t *data, bool combine) { uint8_t d0 = data[0]; // 
NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) @@ -216,6 +242,21 @@ std::pair ConvertUtf8ToUtf16Pair(const uint8_t *data, bool com return {pair, UtfLength::FOUR}; } +std::pair ConvertMUtf8ToUtf16Pair(const uint8_t *data, size_t max_bytes) +{ + return utf::ConvertMUtf8ToUtf16Pair(data, max_bytes); +} + +bool IsMUtf8OnlySingleBytes(const uint8_t *mutf8_in) +{ + return utf::IsMUtf8OnlySingleBytes(mutf8_in); +} + +void ConvertMUtf8ToUtf16(const uint8_t *mutf8_in, size_t mutf8_len, uint16_t *utf16_out) +{ + utf::ConvertMUtf8ToUtf16(mutf8_in, mutf8_len, utf16_out); +} + size_t Utf8ToUtf16Size(const uint8_t *utf8) { size_t res = 0; @@ -227,6 +268,26 @@ size_t Utf8ToUtf16Size(const uint8_t *utf8) return res; } +size_t MUtf8ToUtf16Size(const uint8_t *mutf8) +{ + return utf::MUtf8ToUtf16Size(mutf8); +} + +size_t MUtf8ToUtf16Size(const uint8_t *mutf8, size_t mutf8_len) +{ + return utf::MUtf8ToUtf16Size(mutf8, mutf8_len); +} + +size_t Utf16ToMUtf8Size(const uint16_t *mutf16, uint32_t length) +{ + return utf::Utf16ToMUtf8Size(mutf16, length); +} + +size_t Mutf8Size(const uint8_t *mutf8) +{ + return utf::Mutf8Size(mutf8); +} + size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf16Len, size_t start) { ASSERT(utf16Out != nullptr); @@ -260,4 +321,10 @@ size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_ return outPos; } + +size_t ConvertRegionMUtf8ToUtf16(const uint8_t *mutf8_in, uint16_t *utf16_out, size_t mutf8_len, size_t utf16_len, + size_t start) +{ + return utf::ConvertRegionMUtf8ToUtf16(mutf8_in, utf16_out, mutf8_len, utf16_len, start); +} } // namespace panda::ecmascript::base::utf_helper diff --git a/ecmascript/base/utf_helper.h b/ecmascript/base/utf_helper.h index 29abed490829a3e6d433f3755d681e37f6a3e7ba..c737ad541e203be0efcbbeb2dc9be6f03c328362 100644 --- a/ecmascript/base/utf_helper.h +++ b/ecmascript/base/utf_helper.h @@ -67,12 +67,40 @@ size_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, 
bool modify = tru size_t ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, size_t utf8Len, size_t start, bool modify = true); +size_t ConvertRegionUtf16ToMUtf8(const uint16_t *utf16_in, uint8_t *mutf8_out, size_t utf16_len, size_t mutf8_len, + size_t start); + +int CompareMUtf8ToMUtf8(const uint8_t *mutf8_1, const uint8_t *mutf8_2); + +int CompareUtf8ToUtf8(const uint8_t *utf8_1, size_t utf8_1_length, const uint8_t *utf8_2, size_t utf8_2_length); + +bool IsEqual(Span utf8_1, Span utf8_2); + +bool IsEqual(const uint8_t *mutf8_1, const uint8_t *mutf8_2); + std::pair ConvertUtf8ToUtf16Pair(const uint8_t *data, bool combine = false); +std::pair ConvertMUtf8ToUtf16Pair(const uint8_t *data, size_t max_bytes = 4); + +bool IsMUtf8OnlySingleBytes(const uint8_t *mutf8_in); + +void ConvertMUtf8ToUtf16(const uint8_t *mutf8_in, size_t mutf8_len, uint16_t *utf16_out); + size_t Utf8ToUtf16Size(const uint8_t *utf8); +size_t MUtf8ToUtf16Size(const uint8_t *mutf8); + +size_t MUtf8ToUtf16Size(const uint8_t *mutf8, size_t mutf8_len); + +size_t Utf16ToMUtf8Size(const uint16_t *mutf16, uint32_t length); + +size_t Mutf8Size(const uint8_t *mutf8); + size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf16Len, size_t start); +size_t ConvertRegionMUtf8ToUtf16(const uint8_t *mutf8_in, uint16_t *utf16_out, size_t mutf8_len, size_t utf16_len, + size_t start); + static inline uint32_t CombineTwoU16(uint16_t d0, uint16_t d1) { uint32_t codePoint = d0 - utf::HI_SURROGATE_MIN; @@ -81,6 +109,34 @@ static inline uint32_t CombineTwoU16(uint16_t d0, uint16_t d1) codePoint += utf::LO_SUPPLEMENTS_MIN; return codePoint; } + +static inline uint32_t DecodeUtf16Pair(uint32_t pair) +{ + auto [lead, trail] = utf::SplitUtf16Pair(pair); + uint32_t codePoint = (lead - utf::U16_LEAD) << (utf::PAIR_ELEMENT_WIDTH - utf::DATA_WIDTH); + codePoint |= trail - utf::U16_TAIL; + return codePoint; +} + +inline const uint8_t *CStringAsMutf8(const char *str) 
+{ + return utf::CStringAsMutf8(str); +} + +inline const char *Mutf8AsCString(const uint8_t *mutf8) +{ + return utf::Mutf8AsCString(mutf8); +} + +inline constexpr bool IsAvailableNextUtf16Code(uint16_t val) +{ + return utf::IsAvailableNextUtf16Code(val); +} + +static inline std::pair SplitUtf16Pair(uint32_t pair) +{ + return utf::SplitUtf16Pair(pair); +} } // namespace panda::ecmascript::base::utf_helper #endif // ECMASCRIPT_BASE_UTF_HELPER_H \ No newline at end of file diff --git a/ecmascript/builtins/builtins_number.cpp b/ecmascript/builtins/builtins_number.cpp index dddc9e1c5b865d8f2b63f91d0681bc848796ac98..03f333e7c454bfa87a77c4a023eca730bf909db8 100644 --- a/ecmascript/builtins/builtins_number.cpp +++ b/ecmascript/builtins/builtins_number.cpp @@ -153,10 +153,10 @@ JSTaggedValue BuiltinsNumber::ParseFloat(EcmaRuntimeCallInfo *argv) RETURN_EXCEPTION_IF_ABRUPT_COMPLETION(thread); Span str; if (UNLIKELY(numberString->IsUtf16())) { - size_t len = base::utf_helper::Utf16ToUtf8Size(numberString->GetDataUtf16(), numberString->GetLength()) - 1; + size_t len = base::utf_helper::Utf16ToMUtf8Size(numberString->GetDataUtf16(), numberString->GetLength()) - 1; CVector buf(len); - len = base::utf_helper::ConvertRegionUtf16ToUtf8(numberString->GetDataUtf16(), buf.data(), - numberString->GetLength(), len, 0); + len = base::utf_helper::ConvertRegionUtf16ToMUtf8(numberString->GetDataUtf16(), buf.data(), + numberString->GetLength(), len, 0); str = Span(buf.data(), len); } else { str = Span(numberString->GetDataUtf8(), numberString->GetUtf8Length() - 1); @@ -192,10 +192,10 @@ JSTaggedValue BuiltinsNumber::ParseInt(EcmaRuntimeCallInfo *argv) RETURN_EXCEPTION_IF_ABRUPT_COMPLETION(thread); Span str; if (UNLIKELY(numberString->IsUtf16())) { - size_t len = base::utf_helper::Utf16ToUtf8Size(numberString->GetDataUtf16(), numberString->GetLength()) - 1; + size_t len = base::utf_helper::Utf16ToMUtf8Size(numberString->GetDataUtf16(), numberString->GetLength()) - 1; CVector buf(len); - len = 
base::utf_helper::ConvertRegionUtf16ToUtf8(numberString->GetDataUtf16(), buf.data(), - numberString->GetLength(), len, 0); + len = base::utf_helper::ConvertRegionUtf16ToMUtf8(numberString->GetDataUtf16(), buf.data(), + numberString->GetLength(), len, 0); str = Span(buf.data(), len); } else { str = Span(numberString->GetDataUtf8(), numberString->GetUtf8Length() - 1); diff --git a/ecmascript/builtins/builtins_string.cpp b/ecmascript/builtins/builtins_string.cpp index e98ac03ba09b5bceaebd2333d30b85b416a169e3..bff880d7d88aa1389a791b05c05ec9c1505bb5a6 100644 --- a/ecmascript/builtins/builtins_string.cpp +++ b/ecmascript/builtins/builtins_string.cpp @@ -312,7 +312,7 @@ JSTaggedValue BuiltinsString::CodePointAt(EcmaRuntimeCallInfo *argv) if (second < base::utf_helper::DECODE_TRAIL_LOW || second > base::utf_helper::DECODE_TRAIL_HIGH) { return GetTaggedInt(first); } - uint32_t res = base::utf_helper::UTF16Decode(first, second); + uint32_t res = base::utf_helper::CombineTwoU16(first, second); return GetTaggedInt(res); } diff --git a/ecmascript/ecma_string-inl.h b/ecmascript/ecma_string-inl.h index 5cffa1c1b1eaabc26f13cfeb27c8d6fb80082cbb..120350ee66e6f2c54c8e6eb63084cc6f78bcd481 100644 --- a/ecmascript/ecma_string-inl.h +++ b/ecmascript/ecma_string-inl.h @@ -63,12 +63,12 @@ inline EcmaString *EcmaString::CreateFromUtf8(const uint8_t *utf8Data, uint32_t UNREACHABLE(); } } else { - auto utf16Len = base::utf_helper::Utf8ToUtf16Size(utf8Data); + auto utf16Len = base::utf_helper::MUtf8ToUtf16Size(utf8Data); string = AllocStringObject(utf16Len, false, vm); ASSERT(string != nullptr); [[maybe_unused]] auto len = - base::utf_helper::ConvertRegionUtf8ToUtf16(utf8Data, string->GetDataUtf16Writable(), utf16Len, 0); + base::utf_helper::ConvertRegionMUtf8ToUtf16(utf8Data, string->GetDataUtf16Writable(), utf8Len, utf16Len, 0); ASSERT(len == utf16Len); } diff --git a/ecmascript/ecma_string.cpp b/ecmascript/ecma_string.cpp index 
7e9fa5fbbab0ba01b7f9f55df234b776687d61e3..2702e9cc5f3a9dc71db1c5a1b203adaa0a72eeb5 100644 --- a/ecmascript/ecma_string.cpp +++ b/ecmascript/ecma_string.cpp @@ -321,7 +321,7 @@ bool EcmaString::StringsAreEqualUtf8(const EcmaString *str1, const uint8_t *utf8 Span data2(utf8Data, utf8Len); return EcmaString::StringsAreEquals(data1, data2); } - return IsUtf8EqualsUtf16(utf8Data, str1->GetDataUtf16(), str1->GetLength()); + return IsUtf8EqualsUtf16(utf8Data, str1->GetDataUtf16(), utf8Len, str1->GetLength()); } /* static */ @@ -331,7 +331,7 @@ bool EcmaString::StringsAreEqualUtf16(const EcmaString *str1, const uint16_t *ut if (str1->GetLength() != utf16Len) { result = false; } else if (!str1->IsUtf16()) { - result = IsUtf8EqualsUtf16(str1->GetDataUtf8(), utf16Data, utf16Len); + result = IsUtf8EqualsUtf16(str1->GetDataUtf8(), utf16Data, str1->GetLength(), utf16Len); } else { Span data1(str1->GetDataUtf16(), str1->GetLength()); Span data2(utf16Data, utf16Len); @@ -422,15 +422,17 @@ uint32_t EcmaString::ComputeHashcode() const } /* static */ -uint32_t EcmaString::ComputeHashcodeUtf8(const uint8_t *utf8Data, bool canBeCompress) +uint32_t EcmaString::ComputeHashcodeUtf8(const uint8_t *utf8Data, uint32_t utf8Len) { + bool canBeCompressed = EcmaString::CanBeCompressed(utf8Data); uint32_t hash; - if (canBeCompress) { + if (canBeCompressed) { hash = ComputeHashForUtf8(utf8Data); } else { - auto utf16Len = base::utf_helper::Utf8ToUtf16Size(utf8Data); + auto utf16Len = base::utf_helper::MUtf8ToUtf16Size(utf8Data); CVector tmpBuffer(utf16Len); - [[maybe_unused]] auto len = base::utf_helper::ConvertRegionUtf8ToUtf16(utf8Data, tmpBuffer.data(), utf16Len, 0); + [[maybe_unused]] auto len = base::utf_helper::ConvertRegionMUtf8ToUtf16(utf8Data, tmpBuffer.data(), utf8Len, + utf16Len, 0); ASSERT(len == utf16Len); hash = ComputeHashForData(tmpBuffer.data(), utf16Len); } @@ -444,12 +446,13 @@ uint32_t EcmaString::ComputeHashcodeUtf16(const uint16_t *utf16Data, uint32_t le } /* static */ -bool 
EcmaString::IsUtf8EqualsUtf16(const uint8_t *utf8Data, const uint16_t *utf16Data, uint32_t utf16Len) +bool EcmaString::IsUtf8EqualsUtf16(const uint8_t *utf8Data, const uint16_t *utf16Data, uint32_t utf8Len, + uint32_t utf16Len) { // length is one more than compared utf16Data, don't need convert all utf8Data to utf16Data uint32_t utf8ConvertLength = utf16Len + 1; CVector tmpBuffer(utf8ConvertLength); - auto len = base::utf_helper::ConvertRegionUtf8ToUtf16(utf8Data, tmpBuffer.data(), utf8ConvertLength, 0); + auto len = base::utf_helper::ConvertRegionMUtf8ToUtf16(utf8Data, tmpBuffer.data(), utf8Len, utf8ConvertLength, 0); if (len != utf16Len) { return false; } diff --git a/ecmascript/ecma_string.h b/ecmascript/ecma_string.h index 453b2d12506ca847e00362f9a3d15826078887d4..d7243625028f9bb4b81cc200bbb1b1d640ff5201 100644 --- a/ecmascript/ecma_string.h +++ b/ecmascript/ecma_string.h @@ -108,7 +108,7 @@ public: if (!IsUtf16()) { return GetLength() + 1; // add place for zero in the end } - return base::utf_helper::Utf16ToUtf8Size(dataUtf16_, GetLength()); + return base::utf_helper::Utf16ToMUtf8Size(dataUtf16_, GetLength()); } size_t GetUtf16Length() const @@ -145,7 +145,7 @@ public: } return length; } - return base::utf_helper::ConvertRegionUtf16ToUtf8(GetDataUtf16(), buf, length, maxLength - 1, start); + return base::utf_helper::ConvertRegionUtf16ToMUtf8(GetDataUtf16(), buf, length, maxLength - 1, start); } inline uint32_t CopyDataUtf16(uint16_t *buf, uint32_t maxLength) const @@ -171,7 +171,7 @@ public: } return length; } - return base::utf_helper::ConvertRegionUtf8ToUtf16(GetDataUtf8(), buf, maxLength, start); + return base::utf_helper::ConvertRegionMUtf8ToUtf16(GetDataUtf8(), buf, maxLength, GetLength(), start); } // NOLINTNEXTLINE(modernize-avoid-c-arrays) @@ -245,7 +245,7 @@ public: * Compares strings by bytes, It doesn't check canonical unicode equivalence. 
*/ static bool StringsAreEqualUtf16(const EcmaString *str1, const uint16_t *utf16Data, uint32_t utf16Len); - static uint32_t ComputeHashcodeUtf8(const uint8_t *utf8Data, bool canBeCompress); + static uint32_t ComputeHashcodeUtf8(const uint8_t *utf8Data, uint32_t utf8Len); static uint32_t ComputeHashcodeUtf16(const uint16_t *utf16Data, uint32_t length); static void SetCompressedStringsEnabled(bool val) @@ -303,7 +303,8 @@ private: * str1 should have the same length as utf16_data. * Converts utf8Data to utf16 and compare it with given utf16_data. */ - static bool IsUtf8EqualsUtf16(const uint8_t *utf8Data, const uint16_t *utf16Data, uint32_t utf16Len); + static bool IsUtf8EqualsUtf16(const uint8_t *utf8Data, const uint16_t *utf16Data, uint32_t utf8Len, + uint32_t utf16Len); template /** diff --git a/ecmascript/ecma_string_table.cpp b/ecmascript/ecma_string_table.cpp index e74381da57e69604b35678e2681b3541a8c1d45c..92c77452ee92f6e3d679eee4b9be0bbd30c8e4fb 100644 --- a/ecmascript/ecma_string_table.cpp +++ b/ecmascript/ecma_string_table.cpp @@ -26,7 +26,7 @@ EcmaStringTable::EcmaStringTable(const EcmaVM *vm) : vm_(vm) {} EcmaString *EcmaStringTable::GetString(const uint8_t *utf8Data, uint32_t utf8Len, bool canBeCompress) const { - uint32_t hashCode = EcmaString::ComputeHashcodeUtf8(utf8Data, canBeCompress); + uint32_t hashCode = EcmaString::ComputeHashcodeUtf8(utf8Data, utf8Len); for (auto it = table_.find(hashCode); it != table_.end(); it++) { auto foundedString = it->second; if (EcmaString::StringsAreEqualUtf8(foundedString, utf8Data, utf8Len, canBeCompress)) { diff --git a/ecmascript/js_tagged_value-inl.h b/ecmascript/js_tagged_value-inl.h index 3e02edef54f132aa71f3ef16c236d8fd9bcc79c0..fbf13c89600985c9f544f6baf7f770df83b08afb 100644 --- a/ecmascript/js_tagged_value-inl.h +++ b/ecmascript/js_tagged_value-inl.h @@ -113,9 +113,9 @@ inline JSTaggedNumber JSTaggedValue::ToNumber(JSThread *thread, const JSHandle buf; // Span will use buf.data(), shouldn't define inside 'if' 
if (UNLIKELY(strObj->IsUtf16())) { - size_t len = base::utf_helper::Utf16ToUtf8Size(strObj->GetDataUtf16(), strLen) - 1; + size_t len = base::utf_helper::Utf16ToMUtf8Size(strObj->GetDataUtf16(), strLen) - 1; buf.reserve(len); - len = base::utf_helper::ConvertRegionUtf16ToUtf8(strObj->GetDataUtf16(), buf.data(), strLen, len, 0); + len = base::utf_helper::ConvertRegionUtf16ToMUtf8(strObj->GetDataUtf16(), buf.data(), strLen, len, 0); str = Span(buf.data(), len); } else { str = Span(strObj->GetDataUtf8(), strLen); diff --git a/ecmascript/regexp/regexp_parser.cpp b/ecmascript/regexp/regexp_parser.cpp index b005ced9fae1568b764d12b6fcb99311b6396381..764812924d630168a1443dbe80fbd12e9f055b2d 100644 --- a/ecmascript/regexp/regexp_parser.cpp +++ b/ecmascript/regexp/regexp_parser.cpp @@ -1141,13 +1141,16 @@ uint32_t RegExpParser::ParseClassAtom(RangeSet *atom) [[fallthrough]]; default: uint32_t value = c0_; - int u16_size = 0; if (c0_ > INT8_MAX) { // NOLINTNEXTLINE(readability-magic-numbers) pc_ -= 1; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) - auto u16_result = base::utf_helper::ConvertUtf8ToUtf16Pair(pc_, true); - value = u16_result.first; - u16_size = u16_result.second; - Advance(u16_size + 1); + auto [u32Value, u8Size] = base::utf_helper::ConvertMUtf8ToUtf16Pair(pc_, end_ - pc_); + if (u8Size == base::utf_helper::MAX_BYTES) { + // Decode to simple combined uint16 pair + value = base::utf_helper::DecodeUtf16Pair(u32Value); + } else { + value = u32Value; + } + Advance(static_cast(u8Size + 1)); } else { Advance(); }