From a2ed0703f554eca539737e37d2c7b130f6ec9f59 Mon Sep 17 00:00:00 2001 From: mmorozov Date: Tue, 20 Dec 2022 14:58:49 +0300 Subject: [PATCH] Add core string utf8 creation Signed-off-by: mmorozov --- runtime/CMakeLists.txt | 1 - runtime/base/json_parser.h | 2 +- runtime/base/number_helper.cpp | 6 +- runtime/base/string_helper.h | 6 +- runtime/base/utf_helper.cpp | 234 ------------------ runtime/base/utf_helper.h | 90 ------- runtime/builtins/builtins_global.cpp | 23 +- runtime/builtins/builtins_number.cpp | 14 +- runtime/builtins/builtins_string.cpp | 8 +- runtime/builtins/builtins_string_iterator.cpp | 6 +- runtime/ecma_string-inl.h | 6 +- runtime/ecma_string.cpp | 8 +- runtime/ecma_string.h | 10 +- runtime/interpreter/slow_runtime_stub.cpp | 4 +- runtime/js_tagged_value-inl.h | 4 +- runtime/mem/ecma_string.cpp | 4 +- runtime/regexp/regexp_parser.cpp | 2 +- subproject_sources.gn | 1 - 18 files changed, 45 insertions(+), 384 deletions(-) delete mode 100644 runtime/base/utf_helper.cpp delete mode 100644 runtime/base/utf_helper.h diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt index 3652b3dcd..ab7e86e75 100644 --- a/runtime/CMakeLists.txt +++ b/runtime/CMakeLists.txt @@ -60,7 +60,6 @@ set(ECMASCRIPT_SOURCES ${ECMA_SRC_DIR}/base/object_helper.cpp ${ECMA_SRC_DIR}/base/string_helper.cpp ${ECMA_SRC_DIR}/base/typed_array_helper.cpp - ${ECMA_SRC_DIR}/base/utf_helper.cpp ${ECMA_SRC_DIR}/builtins.cpp ${ECMA_SRC_DIR}/builtins/builtins_ark_tools.cpp ${ECMA_SRC_DIR}/builtins/builtins_array.cpp diff --git a/runtime/base/json_parser.h b/runtime/base/json_parser.h index 53315527a..99b685470 100644 --- a/runtime/base/json_parser.h +++ b/runtime/base/json_parser.h @@ -20,7 +20,6 @@ #include "plugins/ecmascript/runtime/base/builtins_base.h" #include "plugins/ecmascript/runtime/base/number_helper.h" #include "plugins/ecmascript/runtime/base/string_helper.h" -#include "plugins/ecmascript/runtime/base/utf_helper.h" #include "plugins/ecmascript/runtime/ecma_string-inl.h" #include "plugins/ecmascript/runtime/ecma_string.h" #include "plugins/ecmascript/runtime/internal_call_params.h" @@ -31,6 +30,7 @@ #include "plugins/ecmascript/runtime/js_tagged_value.h" #include "plugins/ecmascript/runtime/object_factory.h" #include "plugins/ecmascript/es2panda/util/helpers.h" +#include "libpandabase/utils/utf.h" namespace panda::ecmascript::base { constexpr unsigned int UNICODE_DIGIT_LENGTH = 4; diff --git a/runtime/base/number_helper.cpp b/runtime/base/number_helper.cpp index 2e2bdd1df..c5a942339 100644 --- a/runtime/base/number_helper.cpp +++ b/runtime/base/number_helper.cpp @@ -77,7 +77,7 @@ bool NumberHelper::GotoNonspace(uint8_t **ptr, const uint8_t *end) ++size; utf8_bit >>= 1UL; } - if (base::utf_helper::ConvertRegionUtf8ToUtf16(*ptr, &c, end - *ptr, 1, 0) <= 0) { + if (utf::ConvertRegionUtf8ToUtf16(*ptr, &c, end - *ptr, 1, 0) <= 0) { return true; } } @@ -336,9 +336,9 @@ JSTaggedValue NumberHelper::StringToBigInt(JSThread *thread, JSHandleIsUtf16())) { PandaVector buf; - size_t len = base::utf_helper::Utf16ToUtf8Size(strObj->GetDataUtf16(), strLen) - 1; + size_t len = utf::Utf16ToUtf8Size(strObj->GetDataUtf16(), strLen) - 1; buf.reserve(len); - len = base::utf_helper::ConvertRegionUtf16ToUtf8(strObj->GetDataUtf16(), buf.data(), strLen, len, 0); + len = utf::ConvertRegionUtf16ToUtf8(strObj->GetDataUtf16(), buf.data(), strLen, len, 0); str = Span(buf.data(), len); } else { str = Span(strObj->GetDataUtf8(), strLen); diff --git a/runtime/base/string_helper.h b/runtime/base/string_helper.h index f8e593858..c1184b40d 100644 --- a/runtime/base/string_helper.h +++ b/runtime/base/string_helper.h @@ -23,12 +23,12 @@ #include #include -#include "plugins/ecmascript/runtime/base/utf_helper.h" #include "plugins/ecmascript/runtime/ecma_string-inl.h" #include "plugins/ecmascript/runtime/ecma_vm.h" #include "plugins/ecmascript/runtime/js_thread.h" #include "plugins/ecmascript/runtime/object_factory.h" #include "icu4c/source/common/unicode/unistr.h" +#include "libpandabase/utils/utf.h" #include "libpandafile/file_items.h" namespace panda::ecmascript::base { @@ -222,11 +222,11 @@ public: for (int i = 0; i < l; i++) { // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) b = *p++; - if (b < utf_helper::UTF8_2B_SECOND || b >= utf_helper::UTF8_2B_FIRST) { + if (b < utf::UTF8_2B_SECOND || b >= utf::UTF8_2B_FIRST) { return INVALID_UNICODE_FROM_UTF8; } // NOLINTNEXTLINE(hicpp-signed-bitwise) - c = (c << 6) | (b & utf_helper::UTF8_2B_THIRD); // 6: Maximum Unicode range + c = (c << 6) | (b & utf::UTF8_2B_THIRD); // 6: Maximum Unicode range } if (c < UTF8_MIN_CODE[l - 1]) { return INVALID_UNICODE_FROM_UTF8; diff --git a/runtime/base/utf_helper.cpp b/runtime/base/utf_helper.cpp deleted file mode 100644 index c98f70d7a..000000000 --- a/runtime/base/utf_helper.cpp +++ /dev/null @@ -1,234 +0,0 @@ -/* - * Copyright (c) 2021-2022 Huawei Device Co., Ltd. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "plugins/ecmascript/runtime/base/utf_helper.h" - -// NOLINTNEXTLINE(cppcoreguidelines-macro-usage, hicpp-signed-bitwise) -static constexpr int32_t U16_SURROGATE_OFFSET = (0xd800 << 10UL) + 0xdc00 - 0x10000; -// NOLINTNEXTLINE(cppcoreguidelines-macro-usage) -#define U16_GET_SUPPLEMENTARY(lead, trail) \ - ((static_cast(lead) << 10UL) + static_cast(trail) - U16_SURROGATE_OFFSET) - -namespace panda::ecmascript::base::utf_helper { -uint32_t UTF16Decode(uint16_t lead, uint16_t trail) -{ - ASSERT((lead >= DECODE_LEAD_LOW && lead <= DECODE_LEAD_HIGH) && - (trail >= DECODE_TRAIL_LOW && trail <= DECODE_TRAIL_HIGH)); - uint32_t cp = (lead - DECODE_LEAD_LOW) * DECODE_FIRST_FACTOR + (trail - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR; - return cp; -} - -bool IsValidUTF8(const std::vector &data) -{ - uint32_t length = data.size(); - switch (length) { - case UtfLength::ONE: - if (data.at(0) >= BIT_MASK_1) { - return false; - } - break; - case UtfLength::TWO: - if ((data.at(0) & BIT_MASK_3) != BIT_MASK_2) { - return false; - } - break; - case UtfLength::THREE: - if ((data.at(0) & BIT_MASK_4) != BIT_MASK_3) { - return false; - } - break; - case UtfLength::FOUR: - if ((data.at(0) & BIT_MASK_5) != BIT_MASK_4) { - return false; - } - break; - default: - UNREACHABLE(); - break; - } - - for (uint32_t i = 1; i < length; i++) { - if ((data.at(i) & BIT_MASK_2) != BIT_MASK_1) { - return false; - } - } - return true; -} - -Utf8Char ConvertUtf16ToUtf8(uint16_t d0, uint16_t d1, bool modify) -{ - // when first utf16 code is in 0xd800-0xdfff and second utf16 code is 0, - // means that is a single code point, it needs to be represented by three UTF8 code. - if (d1 == 0 && d0 >= utf::HI_SURROGATE_MIN && d0 <= utf::LO_SURROGATE_MAX) { - auto ch0 = static_cast(UTF8_3B_FIRST | static_cast(d0 >> UtfOffset::TWELVE)); - auto ch1 = static_cast(UTF8_3B_SECOND | (static_cast(d0 >> UtfOffset::SIX) & utf::MASK_6BIT)); - auto ch2 = static_cast(UTF8_3B_THIRD | (d0 & utf::MASK_6BIT)); - return {UtfLength::THREE, {ch0, ch1, ch2}}; - } - - if (d0 == 0) { - if (modify) { - // special case for \u0000 ==> C080 - 1100'0000 1000'0000 - return {UtfLength::TWO, {UTF8_2B_FIRST, UTF8_2B_SECOND}}; - } - // For print string, just skip '\u0000' - return {0, {0x00U}}; - } - if (d0 <= UTF8_1B_MAX) { - return {UtfLength::ONE, {static_cast(d0)}}; - } - if (d0 <= UTF8_2B_MAX) { - auto ch0 = static_cast(UTF8_2B_FIRST | static_cast(d0 >> UtfOffset::SIX)); - auto ch1 = static_cast(UTF8_2B_SECOND | (d0 & utf::MASK_6BIT)); - return {UtfLength::TWO, {ch0, ch1}}; - } - if (d0 < utf::HI_SURROGATE_MIN || d0 > utf::HI_SURROGATE_MAX) { - auto ch0 = static_cast(UTF8_3B_FIRST | static_cast(d0 >> UtfOffset::TWELVE)); - auto ch1 = static_cast(UTF8_3B_SECOND | (static_cast(d0 >> UtfOffset::SIX) & utf::MASK_6BIT)); - auto ch2 = static_cast(UTF8_3B_THIRD | (d0 & utf::MASK_6BIT)); - return {UtfLength::THREE, {ch0, ch1, ch2}}; - } - if (d1 < utf::LO_SURROGATE_MIN || d1 > utf::LO_SURROGATE_MAX) { - // Bad sequence - UNREACHABLE(); - } - - uint32_t codePoint = CombineTwoU16(d0, d1); - - auto ch0 = static_cast((codePoint >> UtfOffset::EIGHTEEN) | UTF8_4B_FIRST); - auto ch1 = static_cast(((codePoint >> UtfOffset::TWELVE) & utf::MASK_6BIT) | utf::MASK1); - auto ch2 = static_cast(((codePoint >> UtfOffset::SIX) & utf::MASK_6BIT) | utf::MASK1); - auto ch3 = static_cast((codePoint & utf::MASK_6BIT) | utf::MASK1); - return {UtfLength::FOUR, {ch0, ch1, ch2, ch3}}; -} - -size_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify) -{ - size_t res = 1; // zero byte - // when utf16 data length is only 1 and code in 0xd800-0xdfff, - // means that is a single code point, it needs to be represented by three UTF8 code. - if (length == 1 && utf16[0] >= utf::HI_SURROGATE_MIN && // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) - utf16[0] <= utf::LO_SURROGATE_MAX) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) - res += UtfLength::THREE; - return res; - } - - for (uint32_t i = 0; i < length; ++i) { - if (utf16[i] == 0) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) - if (modify) { - res += UtfLength::TWO; // special case for U+0000 => C0 80 - } - } else if (utf16[i] <= UTF8_1B_MAX) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) - res += 1; - } else if (utf16[i] <= UTF8_2B_MAX) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) - res += UtfLength::TWO; - // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) - } else if (utf16[i] < utf::HI_SURROGATE_MIN || utf16[i] > utf::HI_SURROGATE_MAX) { - res += UtfLength::THREE; - } else { - if (i < length - 1 && - utf16[i + 1] >= utf::LO_SURROGATE_MIN && // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) - utf16[i + 1] <= utf::LO_SURROGATE_MAX) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) - res += UtfLength::FOUR; - ++i; - } else { - res += UtfLength::THREE; - } - } - } - return res; -} - -size_t ConvertRegionUtf16ToUtf8(const uint16_t *utf16_in, uint8_t *utf8_out, size_t utf16_len, size_t utf8_len, - size_t start, bool modify) -{ - size_t utf8_pos = 0; - if (utf16_in == nullptr || utf8_out == nullptr || utf8_len == 0) { - return 0; - } - size_t end = start + utf16_len; - for (size_t i = start; i < end; ++i) { - uint16_t next16_code = 0; - // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) - if ((i + 1) != end && utf::IsAvailableNextUtf16Code(utf16_in[i + 1])) { - next16_code = utf16_in[i + 1]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) - } - // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) - Utf8Char ch = ConvertUtf16ToUtf8(utf16_in[i], next16_code, modify); - if (utf8_pos + ch.n > utf8_len) { - break; - } - for (size_t c = 0; c < ch.n; ++c) { - utf8_out[utf8_pos++] = ch.ch[c]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) - } - if (ch.n == UtfLength::FOUR) { // Two UTF-16 chars are used - ++i; - } - } - return utf8_pos; -} - -std::pair ConvertUtf8ToUtf16Pair(const uint8_t *data, bool combine) -{ - uint8_t d0 = data[0]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) - if ((d0 & utf::MASK1) == 0) { - return {d0, 1}; - } - - uint8_t d1 = data[1]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) - if ((d0 & utf::MASK2) == 0) { - return {((d0 & utf::MASK_5BIT) << utf::DATA_WIDTH) | (d1 & utf::MASK_6BIT), UtfLength::TWO}; - } - - uint8_t d2 = data[UtfLength::TWO]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) - if ((d0 & utf::MASK3) == 0) { - return {((d0 & utf::MASK_4BIT) << UtfOffset::TWELVE) | ((d1 & utf::MASK_6BIT) << utf::DATA_WIDTH) | - (d2 & utf::MASK_6BIT), - UtfLength::THREE}; - } - - uint8_t d3 = data[UtfLength::THREE]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) - uint32_t codePoint = ((d0 & utf::MASK_4BIT) << UtfOffset::EIGHTEEN) | ((d1 & utf::MASK_6BIT) << UtfOffset::TWELVE) | - ((d2 & utf::MASK_6BIT) << utf::DATA_WIDTH) | (d3 & utf::MASK_6BIT); - - uint32_t pair = 0; - if (combine) { - uint32_t lead = ((codePoint >> (utf::PAIR_ELEMENT_WIDTH - utf::DATA_WIDTH)) + utf::U16_LEAD); - uint32_t tail = ((codePoint & utf::MASK_10BIT) + utf::U16_TAIL) & utf::MASK_16BIT; - pair = U16_GET_SUPPLEMENTARY(lead, tail); // NOLINT(hicpp-signed-bitwise) - } else { - pair |= ((codePoint >> (utf::PAIR_ELEMENT_WIDTH - utf::DATA_WIDTH)) + utf::U16_LEAD) << utf::PAIR_ELEMENT_WIDTH; - pair |= ((codePoint & utf::MASK_10BIT) + utf::U16_TAIL) & utf::MASK_16BIT; - } - - return {pair, UtfLength::FOUR}; -} - -size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8_len) -{ - return utf::MUtf8ToUtf16Size(utf8, utf8_len); -} - -size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8_in, uint16_t *utf16_out, size_t utf8_len, size_t utf16_len, - size_t start) -{ - return utf::ConvertRegionMUtf8ToUtf16(utf8_in, utf16_out, utf8_len, utf16_len, start); -} - -bool IsUTF16SurrogatePair(const uint16_t lead) -{ - return lead >= DECODE_LEAD_LOW && lead <= DECODE_LEAD_HIGH; -} -} // namespace panda::ecmascript::base::utf_helper diff --git a/runtime/base/utf_helper.h b/runtime/base/utf_helper.h deleted file mode 100644 index b166ddc3a..000000000 --- a/runtime/base/utf_helper.h +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Copyright (c) 2021-2022 Huawei Device Co., Ltd. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ECMASCRIPT_BASE_UTF_HELPER_H -#define ECMASCRIPT_BASE_UTF_HELPER_H - -#include -#include - -#include "libpandabase/utils/utf.h" - -namespace panda::ecmascript::base::utf_helper { -static constexpr uint16_t DECODE_LEAD_LOW = 0xD800; -static constexpr uint16_t DECODE_LEAD_HIGH = 0xDBFF; -static constexpr uint16_t DECODE_TRAIL_LOW = 0xDC00; -static constexpr uint16_t DECODE_TRAIL_HIGH = 0xDFFF; -static constexpr uint32_t DECODE_FIRST_FACTOR = 0x400; -static constexpr uint32_t DECODE_SECOND_FACTOR = 0x10000; - -static constexpr uint8_t BIT_MASK_1 = 0x80; -static constexpr uint8_t BIT_MASK_2 = 0xC0; -static constexpr uint8_t BIT_MASK_3 = 0xE0; -static constexpr uint8_t BIT_MASK_4 = 0xF0; -static constexpr uint8_t BIT_MASK_5 = 0xF8; - -static constexpr uint8_t UTF8_1B_MAX = 0x7f; - -static constexpr uint16_t UTF8_2B_MAX = 0x7ff; -static constexpr uint8_t UTF8_2B_FIRST = 0xc0; -static constexpr uint8_t UTF8_2B_SECOND = 0x80; -static constexpr uint8_t UTF8_2B_THIRD = 0x3f; - -static constexpr uint8_t UTF8_3B_FIRST = 0xe0; -static constexpr uint8_t UTF8_3B_SECOND = 0x80; -static constexpr uint8_t UTF8_3B_THIRD = 0x80; - -static constexpr uint8_t UTF8_4B_FIRST = 0xf0; - -enum UtfLength : uint8_t { ONE = 1, TWO = 2, THREE = 3, FOUR = 4 }; -enum UtfOffset : uint8_t { SIX = 6, TEN = 10, TWELVE = 12, EIGHTEEN = 18 }; - -static constexpr size_t MAX_BYTES = 4; -struct Utf8Char { - size_t n; - std::array ch; -}; - -uint32_t UTF16Decode(uint16_t lead, uint16_t trail); - -bool IsValidUTF8(const std::vector &data); - -Utf8Char ConvertUtf16ToUtf8(uint16_t d0, uint16_t d1, bool modify); - -size_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify = true); - -size_t ConvertRegionUtf16ToUtf8(const uint16_t *utf16_in, uint8_t *utf8_out, size_t utf16_len, size_t utf8_len, - size_t start, bool modify = true); - -std::pair ConvertUtf8ToUtf16Pair(const uint8_t *data, bool combine = false); - -size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8_len); - -size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8_in, uint16_t *utf16_out, size_t utf8_len, size_t utf16_len, - size_t start); - -bool IsUTF16SurrogatePair(uint16_t lead); - -static inline uint32_t CombineTwoU16(uint16_t d0, uint16_t d1) -{ - uint32_t code_point = d0 - utf::HI_SURROGATE_MIN; - code_point <<= UtfOffset::TEN; - code_point |= d1 - utf::LO_SURROGATE_MIN; - code_point += utf::LO_SUPPLEMENTS_MIN; - return code_point; -} -} // namespace panda::ecmascript::base::utf_helper - -#endif // ECMASCRIPT_BASE_UTF_HELPER_H \ No newline at end of file diff --git a/runtime/builtins/builtins_global.cpp b/runtime/builtins/builtins_global.cpp index d3ba3be6d..65c828d38 100644 --- a/runtime/builtins/builtins_global.cpp +++ b/runtime/builtins/builtins_global.cpp @@ -456,8 +456,7 @@ JSTaggedValue BuiltinsGlobal::Encode(JSThread *thread, const JSHandle= ecmascript::base::utf_helper::DECODE_TRAIL_LOW && - cc <= ecmascript::base::utf_helper::DECODE_TRAIL_HIGH) { + if (cc >= utf::DECODE_TRAIL_LOW && cc <= utf::DECODE_TRAIL_HIGH) { THROW_URI_ERROR_AND_RETURN(thread, "EncodeURI: The format of the URI to be parsed is incorrect", JSTaggedValue::Exception()); } @@ -471,8 +470,7 @@ JSTaggedValue BuiltinsGlobal::Encode(JSThread *thread, const JSHandle ecmascript::base::utf_helper::DECODE_LEAD_HIGH) { + if (cc < utf::DECODE_LEAD_LOW || cc > utf::DECODE_LEAD_HIGH) { vv = cc; } else { k++; @@ -480,12 +478,11 @@ JSTaggedValue BuiltinsGlobal::Encode(JSThread *thread, const JSHandleAt(k); - if (kc < ecmascript::base::utf_helper::DECODE_TRAIL_LOW || - kc > ecmascript::base::utf_helper::DECODE_TRAIL_HIGH) { + if (kc < utf::DECODE_TRAIL_LOW || kc > utf::DECODE_TRAIL_HIGH) { THROW_URI_ERROR_AND_RETURN(thread, "EncodeURI: The format of the URI to be parsed is incorrect", JSTaggedValue::Exception()); } - vv = ecmascript::base::utf_helper::UTF16Decode(cc, kc); + vv = utf::UTF16Decode(cc, kc); } // iv. Let Octets be the array of octets resulting by applying the UTF-8 transformation to V, @@ -673,23 +670,21 @@ JSTaggedValue BuiltinsGlobal::Decode(JSThread *thread, const JSHandle(&vv), 1); } else { sStr = StringHelper::StringToU16string(StringHelper::SubString(str, start, k - start + 1)); } } else { - uint16_t lv = (((vv - ecmascript::base::utf_helper::DECODE_SECOND_FACTOR) & BIT16_MASK) + - ecmascript::base::utf_helper::DECODE_TRAIL_LOW); - uint16_t hv = - ((((vv - ecmascript::base::utf_helper::DECODE_SECOND_FACTOR) >> 10U) & BIT16_MASK) + // NOLINT - ecmascript::base::utf_helper::DECODE_LEAD_LOW); // 10: means shift left by 10 digits + uint16_t lv = (((vv - utf::DECODE_SECOND_FACTOR) & BIT16_MASK) + utf::DECODE_TRAIL_LOW); + uint16_t hv = ((((vv - utf::DECODE_SECOND_FACTOR) >> 10U) & BIT16_MASK) + // NOLINT + utf::DECODE_LEAD_LOW); // 10: means shift left by 10 digits sStr = StringHelper::Append(StringHelper::Utf16ToU16String(&hv, 1), StringHelper::Utf16ToU16String(&lv, 1)); } diff --git a/runtime/builtins/builtins_number.cpp b/runtime/builtins/builtins_number.cpp index 723b72fdd..1c0f2a9bd 100644 --- a/runtime/builtins/builtins_number.cpp +++ b/runtime/builtins/builtins_number.cpp @@ -163,11 +163,10 @@ JSTaggedValue BuiltinsNumber::ParseFloat(EcmaRuntimeCallInfo *argv) RETURN_EXCEPTION_IF_ABRUPT_COMPLETION(thread); if (UNLIKELY(numberString->IsUtf16())) { - size_t len = - ecmascript::base::utf_helper::Utf16ToUtf8Size(numberString->GetDataUtf16(), numberString->GetLength()) - 1; + size_t len = utf::Utf16ToUtf8Size(numberString->GetDataUtf16(), numberString->GetLength()) - 1; PandaVector buf(len); - len = ecmascript::base::utf_helper::ConvertRegionUtf16ToUtf8(numberString->GetDataUtf16(), buf.data(), - numberString->GetLength(), len, 0); + len = + utf::ConvertRegionUtf16ToUtf8(numberString->GetDataUtf16(), buf.data(), numberString->GetLength(), len, 0); auto str = Span(buf.data(), len); return ParseFloatStr(str); } @@ -203,11 +202,10 @@ JSTaggedValue BuiltinsNumber::ParseInt(EcmaRuntimeCallInfo *argv) RETURN_EXCEPTION_IF_ABRUPT_COMPLETION(thread); if (UNLIKELY(numberString->IsUtf16())) { - size_t len = - ecmascript::base::utf_helper::Utf16ToUtf8Size(numberString->GetDataUtf16(), numberString->GetLength()) - 1; + size_t len = utf::Utf16ToUtf8Size(numberString->GetDataUtf16(), numberString->GetLength()) - 1; PandaVector buf(len); - len = ecmascript::base::utf_helper::ConvertRegionUtf16ToUtf8(numberString->GetDataUtf16(), buf.data(), - numberString->GetLength(), len, 0); + len = + utf::ConvertRegionUtf16ToUtf8(numberString->GetDataUtf16(), buf.data(), numberString->GetLength(), len, 0); auto str = Span(buf.data(), len); return ParseIntStr(str, radix); } diff --git a/runtime/builtins/builtins_string.cpp b/runtime/builtins/builtins_string.cpp index 993770e97..cf8d4efec 100644 --- a/runtime/builtins/builtins_string.cpp +++ b/runtime/builtins/builtins_string.cpp @@ -306,16 +306,14 @@ JSTaggedValue BuiltinsString::CodePointAt(EcmaRuntimeCallInfo *argv) return JSTaggedValue::Undefined(); } uint16_t first = thisHandle->At(pos); - if (first < ecmascript::base::utf_helper::DECODE_LEAD_LOW || - first > ecmascript::base::utf_helper::DECODE_LEAD_HIGH || pos + 1 == thisLen) { + if (first < utf::DECODE_LEAD_LOW || first > utf::DECODE_LEAD_HIGH || pos + 1 == thisLen) { return GetTaggedInt(first); } uint16_t second = thisHandle->At(pos + 1); - if (second < ecmascript::base::utf_helper::DECODE_TRAIL_LOW || - second > ecmascript::base::utf_helper::DECODE_TRAIL_HIGH) { + if (second < utf::DECODE_TRAIL_LOW || second > utf::DECODE_TRAIL_HIGH) { return GetTaggedInt(first); } - uint32_t res = ecmascript::base::utf_helper::UTF16Decode(first, second); + uint32_t res = utf::UTF16Decode(first, second); return GetTaggedInt(res); } diff --git a/runtime/builtins/builtins_string_iterator.cpp b/runtime/builtins/builtins_string_iterator.cpp index e90ff858a..6b0d73281 100644 --- a/runtime/builtins/builtins_string_iterator.cpp +++ b/runtime/builtins/builtins_string_iterator.cpp @@ -64,8 +64,7 @@ JSTaggedValue BuiltinsStringIterator::Next(EcmaRuntimeCallInfo *argv) // 10. If first < 0xD800 or first > 0xDBFF or position+1 = len, let resultString be the string consisting of the // single code unit first. ObjectFactory *factory = thread->GetEcmaVM()->GetFactory(); - if (position + 1 == len || first < ecmascript::base::utf_helper::DECODE_LEAD_LOW || - first > ecmascript::base::utf_helper::DECODE_LEAD_HIGH) { + if (position + 1 == len || first < utf::DECODE_LEAD_LOW || first > utf::DECODE_LEAD_HIGH) { std::vector resultString {first, 0x0}; result.Update(factory->NewFromUtf16UnCheck(resultString.data(), 1, true).GetTaggedValue()); } else { @@ -75,8 +74,7 @@ JSTaggedValue BuiltinsStringIterator::Next(EcmaRuntimeCallInfo *argv) // first. // c. Else, let resultString be the string consisting of the code unit first followed by the code unit second. uint16_t second = string.GetObject()->At(position + 1); - if (second < ecmascript::base::utf_helper::DECODE_TRAIL_LOW || - second > ecmascript::base::utf_helper::DECODE_TRAIL_HIGH) { + if (second < utf::DECODE_TRAIL_LOW || second > utf::DECODE_TRAIL_HIGH) { std::vector resultString {first, 0x0}; result.Update(factory->NewFromUtf16UnCheck(resultString.data(), 1, false).GetTaggedValue()); } else { diff --git a/runtime/ecma_string-inl.h b/runtime/ecma_string-inl.h index 5220bbb8f..d0b2d766a 100644 --- a/runtime/ecma_string-inl.h +++ b/runtime/ecma_string-inl.h @@ -63,12 +63,12 @@ inline EcmaString *EcmaString::CreateFromUtf8(const uint8_t *utf8_data, uint32_t UNREACHABLE(); } } else { - auto utf16_len = base::utf_helper::Utf8ToUtf16Size(utf8_data, utf8_len); + auto utf16_len = utf::Utf8ToUtf16Size(utf8_data, utf8_len); string = AllocStringObject(utf16_len, false, vm, space_type); ASSERT(string != nullptr); - [[maybe_unused]] auto len = base::utf_helper::ConvertRegionUtf8ToUtf16( - utf8_data, string->GetDataUtf16Writable(), utf8_len, utf16_len, 0); + [[maybe_unused]] auto len = + utf::ConvertRegionUtf8ToUtf16(utf8_data, string->GetDataUtf16Writable(), utf8_len, utf16_len, 0); ASSERT(len == utf16_len); } diff --git a/runtime/ecma_string.cpp b/runtime/ecma_string.cpp index d2cdf0c12..5d3928276 100644 --- a/runtime/ecma_string.cpp +++ b/runtime/ecma_string.cpp @@ -411,10 +411,9 @@ uint32_t EcmaString::ComputeHashcodeUtf8(const uint8_t *utf8_data, size_t utf8_l if (can_be_compress) { hash = ComputeHashForUtf8(utf8_data, utf8_len); } else { - auto utf16_len = base::utf_helper::Utf8ToUtf16Size(utf8_data, utf8_len); + auto utf16_len = utf::Utf8ToUtf16Size(utf8_data, utf8_len); PandaVector tmp_buffer(utf16_len); - [[maybe_unused]] auto len = - base::utf_helper::ConvertRegionUtf8ToUtf16(utf8_data, tmp_buffer.data(), utf8_len, utf16_len, 0); + [[maybe_unused]] auto len = utf::ConvertRegionUtf8ToUtf16(utf8_data, tmp_buffer.data(), utf8_len, utf16_len, 0); ASSERT(len == utf16_len); hash = ComputeHashForData(tmp_buffer.data(), utf16_len); } @@ -434,8 +433,7 @@ bool EcmaString::IsUtf8EqualsUtf16(const uint8_t *utf8_data, size_t utf8_len, co // length is one more than compared utf16_data, don't need convert all utf8_data to utf16_data uint32_t utf8_convert_length = utf16_len + 1; PandaVector tmp_buffer(utf8_convert_length); - auto len = - base::utf_helper::ConvertRegionUtf8ToUtf16(utf8_data, tmp_buffer.data(), utf8_len, utf8_convert_length, 0); + auto len = utf::ConvertRegionUtf8ToUtf16(utf8_data, tmp_buffer.data(), utf8_len, utf8_convert_length, 0); if (len != utf16_len) { return false; } diff --git a/runtime/ecma_string.h b/runtime/ecma_string.h index cde5bb1ea..bdd17113c 100644 --- a/runtime/ecma_string.h +++ b/runtime/ecma_string.h @@ -20,7 +20,7 @@ #include #include -#include "plugins/ecmascript/runtime/base/utf_helper.h" +#include "libpandabase/utils/utf.h" #include "plugins/ecmascript/runtime/ecma_macros.h" #include "plugins/ecmascript/runtime/js_tagged_value.h" #include "plugins/ecmascript/runtime/mem/tagged_object.h" @@ -132,7 +132,7 @@ public: if (!IsUtf16()) { return GetLength() + 1; // add place for zero in the end } - return base::utf_helper::Utf16ToUtf8Size(GetData(), GetLength()); + return utf::Utf16ToUtf8Size(GetData(), GetLength()); } size_t GetUtf16Length() const @@ -169,7 +169,7 @@ public: } return length; } - return base::utf_helper::ConvertRegionUtf16ToUtf8(GetDataUtf16(), buf, length, max_length - 1, start); + return utf::ConvertRegionUtf16ToUtf8(GetDataUtf16(), buf, length, max_length - 1, start); } inline uint32_t CopyDataUtf16(uint16_t *buf, uint32_t max_length) const @@ -195,7 +195,7 @@ public: } return length; } - return base::utf_helper::ConvertRegionUtf8ToUtf16(GetDataUtf8(), buf, len, max_length, start); + return utf::ConvertRegionUtf8ToUtf16(GetDataUtf8(), buf, len, max_length, start); } // NOLINTNEXTLINE(modernize-avoid-c-arrays) @@ -317,7 +317,7 @@ private: static bool IsASCIICharacter(uint16_t data) { // \0 is not considered ASCII in Ecma-Modified-UTF8 [only modify '\u0000'] - return data - 1U < base::utf_helper::UTF8_1B_MAX; + return data - 1U < utf::UTF8_1B_MAX; } /** diff --git a/runtime/interpreter/slow_runtime_stub.cpp b/runtime/interpreter/slow_runtime_stub.cpp index 1e2cf8246..7dad6ee9d 100644 --- a/runtime/interpreter/slow_runtime_stub.cpp +++ b/runtime/interpreter/slow_runtime_stub.cpp @@ -18,7 +18,6 @@ #include "js_tagged_value.h" #include "lexical_env.h" #include "plugins/ecmascript/runtime/base/number_helper.h" -#include "plugins/ecmascript/runtime/base/utf_helper.h" #include "plugins/ecmascript/runtime/builtins/builtins_regexp.h" #include "plugins/ecmascript/runtime/class_linker/program_object-inl.h" #include "plugins/ecmascript/runtime/ecma_module.h" @@ -49,6 +48,7 @@ #include "plugins/ecmascript/runtime/runtime_call_id.h" #include "plugins/ecmascript/runtime/template_string.h" #include "plugins/ecmascript/runtime/vmstat/runtime_stat.h" +#include "libpandabase/utils/utf.h" namespace panda::ecmascript { JSTaggedValue SlowRuntimeStub::CallSpreadDyn(JSThread *thread, JSTaggedValue func, JSTaggedValue obj, @@ -1954,7 +1954,7 @@ JSTaggedValue SlowRuntimeStub::StArraySpread(JSThread *thread, JSTaggedValue dst uint32_t prop_counter = 0; for (uint32_t i = 0; i < str_len; i++, prop_counter++) { uint16_t res = src_string->At(i); - if (UNLIKELY(base::utf_helper::IsUTF16SurrogatePair(res))) { + if (UNLIKELY(utf::IsUTF16SurrogatePair(res))) { std::array res_surrogate_pair {}; res_surrogate_pair[0] = src_string->At(i); res_surrogate_pair[1] = src_string->At(i + 1); diff --git a/runtime/js_tagged_value-inl.h b/runtime/js_tagged_value-inl.h index 649877be3..532706b58 100644 --- a/runtime/js_tagged_value-inl.h +++ b/runtime/js_tagged_value-inl.h @@ -135,9 +135,9 @@ inline JSTaggedNumber JSTaggedValue::ToNumber(JSThread *thread, const JSHandle buf; // Span will use buf.data(), shouldn't define inside 'if' if (UNLIKELY(str_obj->IsUtf16())) { - size_t len = base::utf_helper::Utf16ToUtf8Size(str_obj->GetDataUtf16(), str_len) - 1; + size_t len = utf::Utf16ToUtf8Size(str_obj->GetDataUtf16(), str_len) - 1; buf.reserve(len); - len = base::utf_helper::ConvertRegionUtf16ToUtf8(str_obj->GetDataUtf16(), buf.data(), str_len, len, 0); + len = utf::ConvertRegionUtf16ToUtf8(str_obj->GetDataUtf16(), buf.data(), str_len, len, 0); str = Span(buf.data(), len); } else { str = Span(str_obj->GetDataUtf8(), str_len); diff --git a/runtime/mem/ecma_string.cpp b/runtime/mem/ecma_string.cpp index d41bc89e4..2dc55699f 100644 --- a/runtime/mem/ecma_string.cpp +++ b/runtime/mem/ecma_string.cpp @@ -59,10 +59,10 @@ PandaString ConvertToPandaString(const EcmaString *s, uint32_t start, uint32_t l // Should convert utf-16 to utf-8, because uint16_t likely great than maxChar, will convert fail bool modify = (usage != StringConvertedUsage::PRINT); // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) - size_t len = base::utf_helper::Utf16ToUtf8Size(s->GetDataUtf16() + start, length, modify) - 1; + size_t len = utf::Utf16ToUtf8Size(s->GetDataUtf16() + start, length, modify) - 1; PandaVector buf(len); // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) - len = base::utf_helper::ConvertRegionUtf16ToUtf8(s->GetDataUtf16() + start, buf.data(), length, len, 0, modify); + len = utf::ConvertRegionUtf16ToUtf8(s->GetDataUtf16() + start, buf.data(), length, len, 0, modify); Span sp(buf.data(), len); return ConvertToPandaString(sp); } diff --git a/runtime/regexp/regexp_parser.cpp b/runtime/regexp/regexp_parser.cpp index 4e7f906a1..c8567d55c 100644 --- a/runtime/regexp/regexp_parser.cpp +++ b/runtime/regexp/regexp_parser.cpp @@ -1289,7 +1289,7 @@ uint32_t RegExpParser::ParseClassAtom(RangeSet *atom) size_t u16_size = 0; if (c0_ > INT8_MAX) { pc_ -= 1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) - auto u16_result = base::utf_helper::ConvertUtf8ToUtf16Pair(pc_, true); + auto u16_result = utf::ConvertUtf8ToUtf16Pair(pc_, true); value = u16_result.first; u16_size = u16_result.second; Advance(u16_size + 1); diff --git a/subproject_sources.gn b/subproject_sources.gn index d903a9a92..440fc1f5f 100644 --- a/subproject_sources.gn +++ b/subproject_sources.gn @@ -88,7 +88,6 @@ srcs_runtime = [ "runtime/base/object_helper.cpp", "runtime/base/string_helper.cpp", "runtime/base/typed_array_helper.cpp", - "runtime/base/utf_helper.cpp", "runtime/bridge/ecma_bridge_helpers.cpp", "runtime/builtins.cpp", "runtime/builtins/builtins_ark_tools.cpp", -- Gitee