From a2ed0703f554eca539737e37d2c7b130f6ec9f59 Mon Sep 17 00:00:00 2001
From: mmorozov <morozov.maxim@huawei-partners.com>
Date: Tue, 20 Dec 2022 14:58:49 +0300
Subject: [PATCH] Add core string utf8 creation

Signed-off-by: mmorozov <morozov.maxim@huawei-partners.com>
---
 runtime/CMakeLists.txt                        |   1 -
 runtime/base/json_parser.h                    |   2 +-
 runtime/base/number_helper.cpp                |   6 +-
 runtime/base/string_helper.h                  |   6 +-
 runtime/base/utf_helper.cpp                   | 234 ------------------
 runtime/base/utf_helper.h                     |  90 -------
 runtime/builtins/builtins_global.cpp          |  23 +-
 runtime/builtins/builtins_number.cpp          |  14 +-
 runtime/builtins/builtins_string.cpp          |   8 +-
 runtime/builtins/builtins_string_iterator.cpp |   6 +-
 runtime/ecma_string-inl.h                     |   6 +-
 runtime/ecma_string.cpp                       |   8 +-
 runtime/ecma_string.h                         |  10 +-
 runtime/interpreter/slow_runtime_stub.cpp     |   4 +-
 runtime/js_tagged_value-inl.h                 |   4 +-
 runtime/mem/ecma_string.cpp                   |   4 +-
 runtime/regexp/regexp_parser.cpp              |   2 +-
 subproject_sources.gn                         |   1 -
 18 files changed, 45 insertions(+), 384 deletions(-)
 delete mode 100644 runtime/base/utf_helper.cpp
 delete mode 100644 runtime/base/utf_helper.h
diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt
index 3652b3dcd..ab7e86e75 100644
--- a/runtime/CMakeLists.txt
+++ b/runtime/CMakeLists.txt
@@ -60,7 +60,6 @@ set(ECMASCRIPT_SOURCES
     ${ECMA_SRC_DIR}/base/object_helper.cpp
     ${ECMA_SRC_DIR}/base/string_helper.cpp
     ${ECMA_SRC_DIR}/base/typed_array_helper.cpp
-    ${ECMA_SRC_DIR}/base/utf_helper.cpp
     ${ECMA_SRC_DIR}/builtins.cpp
     ${ECMA_SRC_DIR}/builtins/builtins_ark_tools.cpp
     ${ECMA_SRC_DIR}/builtins/builtins_array.cpp
diff --git a/runtime/base/json_parser.h b/runtime/base/json_parser.h
index 53315527a..99b685470 100644
--- a/runtime/base/json_parser.h
+++ b/runtime/base/json_parser.h
@@ -20,7 +20,6 @@
 #include "plugins/ecmascript/runtime/base/builtins_base.h"
 #include "plugins/ecmascript/runtime/base/number_helper.h"
 #include "plugins/ecmascript/runtime/base/string_helper.h"
-#include "plugins/ecmascript/runtime/base/utf_helper.h"
 #include "plugins/ecmascript/runtime/ecma_string-inl.h"
 #include "plugins/ecmascript/runtime/ecma_string.h"
 #include "plugins/ecmascript/runtime/internal_call_params.h"
@@ -31,6 +30,7 @@
 #include "plugins/ecmascript/runtime/js_tagged_value.h"
 #include "plugins/ecmascript/runtime/object_factory.h"
 #include "plugins/ecmascript/es2panda/util/helpers.h"
+#include "libpandabase/utils/utf.h"
 
 namespace panda::ecmascript::base {
 constexpr unsigned int UNICODE_DIGIT_LENGTH = 4;
diff --git a/runtime/base/number_helper.cpp b/runtime/base/number_helper.cpp
index 2e2bdd1df..c5a942339 100644
--- a/runtime/base/number_helper.cpp
+++ b/runtime/base/number_helper.cpp
@@ -77,7 +77,7 @@ bool NumberHelper::GotoNonspace(uint8_t **ptr, const uint8_t *end)
                 ++size;
                 utf8_bit >>= 1UL;
             }
-            if (base::utf_helper::ConvertRegionUtf8ToUtf16(*ptr, &c, end - *ptr, 1, 0) <= 0) {
+            if (utf::ConvertRegionUtf8ToUtf16(*ptr, &c, end - *ptr, 1, 0) <= 0) {
                 return true;
             }
         }
@@ -336,9 +336,9 @@ JSTaggedValue NumberHelper::StringToBigInt(JSThread *thread, JSHandle<JSTaggedVa
     }
     if (UNLIKELY(strObj->IsUtf16())) {
         PandaVector<uint8_t> buf;
-        size_t len = base::utf_helper::Utf16ToUtf8Size(strObj->GetDataUtf16(), strLen) - 1;
+        size_t len = utf::Utf16ToUtf8Size(strObj->GetDataUtf16(), strLen) - 1;
         buf.reserve(len);
-        len = base::utf_helper::ConvertRegionUtf16ToUtf8(strObj->GetDataUtf16(), buf.data(), strLen, len, 0);
+        len = utf::ConvertRegionUtf16ToUtf8(strObj->GetDataUtf16(), buf.data(), strLen, len, 0);
         str = Span<const uint8_t>(buf.data(), len);
     } else {
         str = Span<const uint8_t>(strObj->GetDataUtf8(), strLen);
diff --git a/runtime/base/string_helper.h b/runtime/base/string_helper.h
index f8e593858..c1184b40d 100644
--- a/runtime/base/string_helper.h
+++ b/runtime/base/string_helper.h
@@ -23,12 +23,12 @@
 #include <string>
 #include <vector>
 
-#include "plugins/ecmascript/runtime/base/utf_helper.h"
 #include "plugins/ecmascript/runtime/ecma_string-inl.h"
 #include "plugins/ecmascript/runtime/ecma_vm.h"
 #include "plugins/ecmascript/runtime/js_thread.h"
 #include "plugins/ecmascript/runtime/object_factory.h"
 #include "icu4c/source/common/unicode/unistr.h"
+#include "libpandabase/utils/utf.h"
 #include "libpandafile/file_items.h"
 
 namespace panda::ecmascript::base {
@@ -222,11 +222,11 @@ public:
         for (int i = 0; i < l; i++) {
             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
             b = *p++;
-            if (b < utf_helper::UTF8_2B_SECOND || b >= utf_helper::UTF8_2B_FIRST) {
+            if (b < utf::UTF8_2B_SECOND || b >= utf::UTF8_2B_FIRST) {
                 return INVALID_UNICODE_FROM_UTF8;
             }
             // NOLINTNEXTLINE(hicpp-signed-bitwise)
-            c = (c << 6) | (b & utf_helper::UTF8_2B_THIRD);  // 6: Maximum Unicode range
+            c = (c << 6) | (b & utf::UTF8_2B_THIRD);  // 6: Maximum Unicode range
         }
         if (c < UTF8_MIN_CODE[l - 1]) {
             return INVALID_UNICODE_FROM_UTF8;
diff --git a/runtime/base/utf_helper.cpp b/runtime/base/utf_helper.cpp
deleted file mode 100644
index c98f70d7a..000000000
--- a/runtime/base/utf_helper.cpp
+++ /dev/null
@@ -1,234 +0,0 @@
-/*
- * Copyright (c) 2021-2022 Huawei Device Co., Ltd.
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "plugins/ecmascript/runtime/base/utf_helper.h"
-
-// NOLINTNEXTLINE(cppcoreguidelines-macro-usage, hicpp-signed-bitwise)
-static constexpr int32_t U16_SURROGATE_OFFSET = (0xd800 << 10UL) + 0xdc00 - 0x10000;
-// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
-#define U16_GET_SUPPLEMENTARY(lead, trail) \
-    ((static_cast<int32_t>(lead) << 10UL) + static_cast<int32_t>(trail) - U16_SURROGATE_OFFSET)
-
-namespace panda::ecmascript::base::utf_helper {
-uint32_t UTF16Decode(uint16_t lead, uint16_t trail)
-{
-    ASSERT((lead >= DECODE_LEAD_LOW && lead <= DECODE_LEAD_HIGH) &&
-           (trail >= DECODE_TRAIL_LOW && trail <= DECODE_TRAIL_HIGH));
-    uint32_t cp = (lead - DECODE_LEAD_LOW) * DECODE_FIRST_FACTOR + (trail - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR;
-    return cp;
-}
-
-bool IsValidUTF8(const std::vector<uint8_t> &data)
-{
-    uint32_t length = data.size();
-    switch (length) {
-        case UtfLength::ONE:
-            if (data.at(0) >= BIT_MASK_1) {
-                return false;
-            }
-            break;
-        case UtfLength::TWO:
-            if ((data.at(0) & BIT_MASK_3) != BIT_MASK_2) {
-                return false;
-            }
-            break;
-        case UtfLength::THREE:
-            if ((data.at(0) & BIT_MASK_4) != BIT_MASK_3) {
-                return false;
-            }
-            break;
-        case UtfLength::FOUR:
-            if ((data.at(0) & BIT_MASK_5) != BIT_MASK_4) {
-                return false;
-            }
-            break;
-        default:
-            UNREACHABLE();
-            break;
-    }
-
-    for (uint32_t i = 1; i < length; i++) {
-        if ((data.at(i) & BIT_MASK_2) != BIT_MASK_1) {
-            return false;
-        }
-    }
-    return true;
-}
-
-Utf8Char ConvertUtf16ToUtf8(uint16_t d0, uint16_t d1, bool modify)
-{
-    // when first utf16 code is in 0xd800-0xdfff and second utf16 code is 0,
-    // means that is a single code point, it needs to be represented by three UTF8 code.
-    if (d1 == 0 && d0 >= utf::HI_SURROGATE_MIN && d0 <= utf::LO_SURROGATE_MAX) {
-        auto ch0 = static_cast<uint8_t>(UTF8_3B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::TWELVE));
-        auto ch1 = static_cast<uint8_t>(UTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> UtfOffset::SIX) & utf::MASK_6BIT));
-        auto ch2 = static_cast<uint8_t>(UTF8_3B_THIRD | (d0 & utf::MASK_6BIT));
-        return {UtfLength::THREE, {ch0, ch1, ch2}};
-    }
-
-    if (d0 == 0) {
-        if (modify) {
-            // special case for \u0000 ==> C080 - 1100'0000 1000'0000
-            return {UtfLength::TWO, {UTF8_2B_FIRST, UTF8_2B_SECOND}};
-        }
-        // For print string, just skip '\u0000'
-        return {0, {0x00U}};
-    }
-    if (d0 <= UTF8_1B_MAX) {
-        return {UtfLength::ONE, {static_cast<uint8_t>(d0)}};
-    }
-    if (d0 <= UTF8_2B_MAX) {
-        auto ch0 = static_cast<uint8_t>(UTF8_2B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::SIX));
-        auto ch1 = static_cast<uint8_t>(UTF8_2B_SECOND | (d0 & utf::MASK_6BIT));
-        return {UtfLength::TWO, {ch0, ch1}};
-    }
-    if (d0 < utf::HI_SURROGATE_MIN || d0 > utf::HI_SURROGATE_MAX) {
-        auto ch0 = static_cast<uint8_t>(UTF8_3B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::TWELVE));
-        auto ch1 = static_cast<uint8_t>(UTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> UtfOffset::SIX) & utf::MASK_6BIT));
-        auto ch2 = static_cast<uint8_t>(UTF8_3B_THIRD | (d0 & utf::MASK_6BIT));
-        return {UtfLength::THREE, {ch0, ch1, ch2}};
-    }
-    if (d1 < utf::LO_SURROGATE_MIN || d1 > utf::LO_SURROGATE_MAX) {
-        // Bad sequence
-        UNREACHABLE();
-    }
-
-    uint32_t codePoint = CombineTwoU16(d0, d1);
-
-    auto ch0 = static_cast<uint8_t>((codePoint >> UtfOffset::EIGHTEEN) | UTF8_4B_FIRST);
-    auto ch1 = static_cast<uint8_t>(((codePoint >> UtfOffset::TWELVE) & utf::MASK_6BIT) | utf::MASK1);
-    auto ch2 = static_cast<uint8_t>(((codePoint >> UtfOffset::SIX) & utf::MASK_6BIT) | utf::MASK1);
-    auto ch3 = static_cast<uint8_t>((codePoint & utf::MASK_6BIT) | utf::MASK1);
-    return {UtfLength::FOUR, {ch0, ch1, ch2, ch3}};
-}
-
-size_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify)
-{
-    size_t res = 1;  // zero byte
-    // when utf16 data length is only 1 and code in 0xd800-0xdfff,
-    // means that is a single code point, it needs to be represented by three UTF8 code.
-    if (length == 1 && utf16[0] >= utf::HI_SURROGATE_MIN &&  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
-        utf16[0] <= utf::LO_SURROGATE_MAX) {                 // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
-        res += UtfLength::THREE;
-        return res;
-    }
-
-    for (uint32_t i = 0; i < length; ++i) {
-        if (utf16[i] == 0) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
-            if (modify) {
-                res += UtfLength::TWO;  // special case for U+0000 => C0 80
-            }
-        } else if (utf16[i] <= UTF8_1B_MAX) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
-            res += 1;
-        } else if (utf16[i] <= UTF8_2B_MAX) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
-            res += UtfLength::TWO;
-            // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
-        } else if (utf16[i] < utf::HI_SURROGATE_MIN || utf16[i] > utf::HI_SURROGATE_MAX) {
-            res += UtfLength::THREE;
-        } else {
-            if (i < length - 1 &&
-                utf16[i + 1] >= utf::LO_SURROGATE_MIN &&  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
-                utf16[i + 1] <= utf::LO_SURROGATE_MAX) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
-                res += UtfLength::FOUR;
-                ++i;
-            } else {
-                res += UtfLength::THREE;
-            }
-        }
-    }
-    return res;
-}
-
-size_t ConvertRegionUtf16ToUtf8(const uint16_t *utf16_in, uint8_t *utf8_out, size_t utf16_len, size_t utf8_len,
-                                size_t start, bool modify)
-{
-    size_t utf8_pos = 0;
-    if (utf16_in == nullptr || utf8_out == nullptr || utf8_len == 0) {
-        return 0;
-    }
-    size_t end = start + utf16_len;
-    for (size_t i = start; i < end; ++i) {
-        uint16_t next16_code = 0;
-        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
-        if ((i + 1) != end && utf::IsAvailableNextUtf16Code(utf16_in[i + 1])) {
-            next16_code = utf16_in[i + 1];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
-        }
-        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
-        Utf8Char ch = ConvertUtf16ToUtf8(utf16_in[i], next16_code, modify);
-        if (utf8_pos + ch.n > utf8_len) {
-            break;
-        }
-        for (size_t c = 0; c < ch.n; ++c) {
-            utf8_out[utf8_pos++] = ch.ch[c];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
-        }
-        if (ch.n == UtfLength::FOUR) {  // Two UTF-16 chars are used
-            ++i;
-        }
-    }
-    return utf8_pos;
-}
-
-std::pair<uint32_t, size_t> ConvertUtf8ToUtf16Pair(const uint8_t *data, bool combine)
-{
-    uint8_t d0 = data[0];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
-    if ((d0 & utf::MASK1) == 0) {
-        return {d0, 1};
-    }
-
-    uint8_t d1 = data[1];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
-    if ((d0 & utf::MASK2) == 0) {
-        return {((d0 & utf::MASK_5BIT) << utf::DATA_WIDTH) | (d1 & utf::MASK_6BIT), UtfLength::TWO};
-    }
-
-    uint8_t d2 = data[UtfLength::TWO];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
-    if ((d0 & utf::MASK3) == 0) {
-        return {((d0 & utf::MASK_4BIT) << UtfOffset::TWELVE) | ((d1 & utf::MASK_6BIT) << utf::DATA_WIDTH) |
-                    (d2 & utf::MASK_6BIT),
-                UtfLength::THREE};
-    }
-
-    uint8_t d3 = data[UtfLength::THREE];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
-    uint32_t codePoint = ((d0 & utf::MASK_4BIT) << UtfOffset::EIGHTEEN) | ((d1 & utf::MASK_6BIT) << UtfOffset::TWELVE) |
-                         ((d2 & utf::MASK_6BIT) << utf::DATA_WIDTH) | (d3 & utf::MASK_6BIT);
-
-    uint32_t pair = 0;
-    if (combine) {
-        uint32_t lead = ((codePoint >> (utf::PAIR_ELEMENT_WIDTH - utf::DATA_WIDTH)) + utf::U16_LEAD);
-        uint32_t tail = ((codePoint & utf::MASK_10BIT) + utf::U16_TAIL) & utf::MASK_16BIT;
-        pair = U16_GET_SUPPLEMENTARY(lead, tail);  // NOLINT(hicpp-signed-bitwise)
-    } else {
-        pair |= ((codePoint >> (utf::PAIR_ELEMENT_WIDTH - utf::DATA_WIDTH)) + utf::U16_LEAD) << utf::PAIR_ELEMENT_WIDTH;
-        pair |= ((codePoint & utf::MASK_10BIT) + utf::U16_TAIL) & utf::MASK_16BIT;
-    }
-
-    return {pair, UtfLength::FOUR};
-}
-
-size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8_len)
-{
-    return utf::MUtf8ToUtf16Size(utf8, utf8_len);
-}
-
-size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8_in, uint16_t *utf16_out, size_t utf8_len, size_t utf16_len,
-                                size_t start)
-{
-    return utf::ConvertRegionMUtf8ToUtf16(utf8_in, utf16_out, utf8_len, utf16_len, start);
-}
-
-bool IsUTF16SurrogatePair(const uint16_t lead)
-{
-    return lead >= DECODE_LEAD_LOW && lead <= DECODE_LEAD_HIGH;
-}
-}  // namespace panda::ecmascript::base::utf_helper
diff --git a/runtime/base/utf_helper.h b/runtime/base/utf_helper.h
deleted file mode 100644
index b166ddc3a..000000000
--- a/runtime/base/utf_helper.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2021-2022 Huawei Device Co., Ltd.
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef ECMASCRIPT_BASE_UTF_HELPER_H
-#define ECMASCRIPT_BASE_UTF_HELPER_H
-
-#include <cstdint>
-#include <vector>
-
-#include "libpandabase/utils/utf.h"
-
-namespace panda::ecmascript::base::utf_helper {
-static constexpr uint16_t DECODE_LEAD_LOW = 0xD800;
-static constexpr uint16_t DECODE_LEAD_HIGH = 0xDBFF;
-static constexpr uint16_t DECODE_TRAIL_LOW = 0xDC00;
-static constexpr uint16_t DECODE_TRAIL_HIGH = 0xDFFF;
-static constexpr uint32_t DECODE_FIRST_FACTOR = 0x400;
-static constexpr uint32_t DECODE_SECOND_FACTOR = 0x10000;
-
-static constexpr uint8_t BIT_MASK_1 = 0x80;
-static constexpr uint8_t BIT_MASK_2 = 0xC0;
-static constexpr uint8_t BIT_MASK_3 = 0xE0;
-static constexpr uint8_t BIT_MASK_4 = 0xF0;
-static constexpr uint8_t BIT_MASK_5 = 0xF8;
-
-static constexpr uint8_t UTF8_1B_MAX = 0x7f;
-
-static constexpr uint16_t UTF8_2B_MAX = 0x7ff;
-static constexpr uint8_t UTF8_2B_FIRST = 0xc0;
-static constexpr uint8_t UTF8_2B_SECOND = 0x80;
-static constexpr uint8_t UTF8_2B_THIRD = 0x3f;
-
-static constexpr uint8_t UTF8_3B_FIRST = 0xe0;
-static constexpr uint8_t UTF8_3B_SECOND = 0x80;
-static constexpr uint8_t UTF8_3B_THIRD = 0x80;
-
-static constexpr uint8_t UTF8_4B_FIRST = 0xf0;
-
-enum UtfLength : uint8_t { ONE = 1, TWO = 2, THREE = 3, FOUR = 4 };
-enum UtfOffset : uint8_t { SIX = 6, TEN = 10, TWELVE = 12, EIGHTEEN = 18 };
-
-static constexpr size_t MAX_BYTES = 4;
-struct Utf8Char {
-    size_t n;
-    std::array<uint8_t, MAX_BYTES> ch;
-};
-
-uint32_t UTF16Decode(uint16_t lead, uint16_t trail);
-
-bool IsValidUTF8(const std::vector<uint8_t> &data);
-
-Utf8Char ConvertUtf16ToUtf8(uint16_t d0, uint16_t d1, bool modify);
-
-size_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify = true);
-
-size_t ConvertRegionUtf16ToUtf8(const uint16_t *utf16_in, uint8_t *utf8_out, size_t utf16_len, size_t utf8_len,
-                                size_t start, bool modify = true);
-
-std::pair<uint32_t, size_t> ConvertUtf8ToUtf16Pair(const uint8_t *data, bool combine = false);
-
-size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8_len);
-
-size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8_in, uint16_t *utf16_out, size_t utf8_len, size_t utf16_len,
-                                size_t start);
-
-bool IsUTF16SurrogatePair(uint16_t lead);
-
-static inline uint32_t CombineTwoU16(uint16_t d0, uint16_t d1)
-{
-    uint32_t code_point = d0 - utf::HI_SURROGATE_MIN;
-    code_point <<= UtfOffset::TEN;
-    code_point |= d1 - utf::LO_SURROGATE_MIN;
-    code_point += utf::LO_SUPPLEMENTS_MIN;
-    return code_point;
-}
-}  // namespace panda::ecmascript::base::utf_helper
-
-#endif  // ECMASCRIPT_BASE_UTF_HELPER_H
\ No newline at end of file
diff --git a/runtime/builtins/builtins_global.cpp b/runtime/builtins/builtins_global.cpp
index d3ba3be6d..65c828d38 100644
--- a/runtime/builtins/builtins_global.cpp
+++ b/runtime/builtins/builtins_global.cpp
@@ -456,8 +456,7 @@ JSTaggedValue BuiltinsGlobal::Encode(JSThread *thread, const JSHandle<EcmaString
         } else {
             // i. If the code unit value of C is not less than 0xDC00 and not greater than 0xDFFF,
             //    throw a URIError exception.
-            if (cc >= ecmascript::base::utf_helper::DECODE_TRAIL_LOW &&
-                cc <= ecmascript::base::utf_helper::DECODE_TRAIL_HIGH) {
+            if (cc >= utf::DECODE_TRAIL_LOW && cc <= utf::DECODE_TRAIL_HIGH) {
                 THROW_URI_ERROR_AND_RETURN(thread, "EncodeURI: The format of the URI to be parsed is incorrect",
                                            JSTaggedValue::Exception());
             }
@@ -471,8 +470,7 @@ JSTaggedValue BuiltinsGlobal::Encode(JSThread *thread, const JSHandle<EcmaString
             //    4. If kChar is less than 0xDC00 or greater than 0xDFFF, throw a URIError exception.
             //    5. Let V be UTF16Decode(C, kChar).
             uint32_t vv;
-            if (cc < ecmascript::base::utf_helper::DECODE_LEAD_LOW ||
-                cc > ecmascript::base::utf_helper::DECODE_LEAD_HIGH) {
+            if (cc < utf::DECODE_LEAD_LOW || cc > utf::DECODE_LEAD_HIGH) {
                 vv = cc;
             } else {
                 k++;
@@ -480,12 +478,11 @@ JSTaggedValue BuiltinsGlobal::Encode(JSThread *thread, const JSHandle<EcmaString
                     THROW_URI_ERROR_AND_RETURN(thread, "k is invalid", JSTaggedValue::Exception());
                 }
                 uint16_t kc = str->At(k);
-                if (kc < ecmascript::base::utf_helper::DECODE_TRAIL_LOW ||
-                    kc > ecmascript::base::utf_helper::DECODE_TRAIL_HIGH) {
+                if (kc < utf::DECODE_TRAIL_LOW || kc > utf::DECODE_TRAIL_HIGH) {
                     THROW_URI_ERROR_AND_RETURN(thread, "EncodeURI: The format of the URI to be parsed is incorrect",
                                                JSTaggedValue::Exception());
                 }
-                vv = ecmascript::base::utf_helper::UTF16Decode(cc, kc);
+                vv = utf::UTF16Decode(cc, kc);
             }
 
             // iv. Let Octets be the array of octets resulting by applying the UTF-8 transformation to V,
@@ -673,23 +670,21 @@ JSTaggedValue BuiltinsGlobal::Decode(JSThread *thread, const JSHandle<EcmaString
                 // 8. Let V be the value obtained by applying the UTF-8 transformation to Octets, that is,
                 //     from an array of octets into a 21-bit value. If Octets does not contain a valid UTF-8 encoding of
                 //     a Unicode code point throw a URIError exception.
-                if (!ecmascript::base::utf_helper::IsValidUTF8(oct)) {
+                if (!utf::IsValidUTF8(oct)) {
                     THROW_URI_ERROR_AND_RETURN(thread, "DecodeURI: The format of the URI to be parsed is incorrect",
                                                JSTaggedValue::Exception());
                 }
                 uint32_t vv = StringHelper::Utf8ToU32String(oct);
-                if (vv < ecmascript::base::utf_helper::DECODE_SECOND_FACTOR) {
+                if (vv < utf::DECODE_SECOND_FACTOR) {
                     if (!IsInURISet(vv)) {
                         sStr = StringHelper::Utf16ToU16String(reinterpret_cast<uint16_t *>(&vv), 1);
                     } else {
                         sStr = StringHelper::StringToU16string(StringHelper::SubString(str, start, k - start + 1));
                     }
                 } else {
-                    uint16_t lv = (((vv - ecmascript::base::utf_helper::DECODE_SECOND_FACTOR) & BIT16_MASK) +
-                                   ecmascript::base::utf_helper::DECODE_TRAIL_LOW);
-                    uint16_t hv =
-                        ((((vv - ecmascript::base::utf_helper::DECODE_SECOND_FACTOR) >> 10U) & BIT16_MASK) +  // NOLINT
-                         ecmascript::base::utf_helper::DECODE_LEAD_LOW);  // 10: means shift left by 10 digits
+                    uint16_t lv = (((vv - utf::DECODE_SECOND_FACTOR) & BIT16_MASK) + utf::DECODE_TRAIL_LOW);
+                    uint16_t hv = ((((vv - utf::DECODE_SECOND_FACTOR) >> 10U) & BIT16_MASK) +  // NOLINT
+                                   utf::DECODE_LEAD_LOW);  // 10: means shift right by 10 bits
                     sStr = StringHelper::Append(StringHelper::Utf16ToU16String(&hv, 1),
                                                 StringHelper::Utf16ToU16String(&lv, 1));
                 }
diff --git a/runtime/builtins/builtins_number.cpp b/runtime/builtins/builtins_number.cpp
index 723b72fdd..1c0f2a9bd 100644
--- a/runtime/builtins/builtins_number.cpp
+++ b/runtime/builtins/builtins_number.cpp
@@ -163,11 +163,10 @@ JSTaggedValue BuiltinsNumber::ParseFloat(EcmaRuntimeCallInfo *argv)
     RETURN_EXCEPTION_IF_ABRUPT_COMPLETION(thread);
 
     if (UNLIKELY(numberString->IsUtf16())) {
-        size_t len =
-            ecmascript::base::utf_helper::Utf16ToUtf8Size(numberString->GetDataUtf16(), numberString->GetLength()) - 1;
+        size_t len = utf::Utf16ToUtf8Size(numberString->GetDataUtf16(), numberString->GetLength()) - 1;
         PandaVector<uint8_t> buf(len);
-        len = ecmascript::base::utf_helper::ConvertRegionUtf16ToUtf8(numberString->GetDataUtf16(), buf.data(),
-                                                                     numberString->GetLength(), len, 0);
+        len =
+            utf::ConvertRegionUtf16ToUtf8(numberString->GetDataUtf16(), buf.data(), numberString->GetLength(), len, 0);
         auto str = Span<const uint8_t>(buf.data(), len);
         return ParseFloatStr(str);
     }
@@ -203,11 +202,10 @@ JSTaggedValue BuiltinsNumber::ParseInt(EcmaRuntimeCallInfo *argv)
     RETURN_EXCEPTION_IF_ABRUPT_COMPLETION(thread);
 
     if (UNLIKELY(numberString->IsUtf16())) {
-        size_t len =
-            ecmascript::base::utf_helper::Utf16ToUtf8Size(numberString->GetDataUtf16(), numberString->GetLength()) - 1;
+        size_t len = utf::Utf16ToUtf8Size(numberString->GetDataUtf16(), numberString->GetLength()) - 1;
         PandaVector<uint8_t> buf(len);
-        len = ecmascript::base::utf_helper::ConvertRegionUtf16ToUtf8(numberString->GetDataUtf16(), buf.data(),
-                                                                     numberString->GetLength(), len, 0);
+        len =
+            utf::ConvertRegionUtf16ToUtf8(numberString->GetDataUtf16(), buf.data(), numberString->GetLength(), len, 0);
         auto str = Span<const uint8_t>(buf.data(), len);
         return ParseIntStr(str, radix);
     }
diff --git a/runtime/builtins/builtins_string.cpp b/runtime/builtins/builtins_string.cpp
index 993770e97..cf8d4efec 100644
--- a/runtime/builtins/builtins_string.cpp
+++ b/runtime/builtins/builtins_string.cpp
@@ -306,16 +306,14 @@ JSTaggedValue BuiltinsString::CodePointAt(EcmaRuntimeCallInfo *argv)
         return JSTaggedValue::Undefined();
     }
     uint16_t first = thisHandle->At<false>(pos);
-    if (first < ecmascript::base::utf_helper::DECODE_LEAD_LOW ||
-        first > ecmascript::base::utf_helper::DECODE_LEAD_HIGH || pos + 1 == thisLen) {
+    if (first < utf::DECODE_LEAD_LOW || first > utf::DECODE_LEAD_HIGH || pos + 1 == thisLen) {
         return GetTaggedInt(first);
     }
     uint16_t second = thisHandle->At<false>(pos + 1);
-    if (second < ecmascript::base::utf_helper::DECODE_TRAIL_LOW ||
-        second > ecmascript::base::utf_helper::DECODE_TRAIL_HIGH) {
+    if (second < utf::DECODE_TRAIL_LOW || second > utf::DECODE_TRAIL_HIGH) {
         return GetTaggedInt(first);
     }
-    uint32_t res = ecmascript::base::utf_helper::UTF16Decode(first, second);
+    uint32_t res = utf::UTF16Decode(first, second);
     return GetTaggedInt(res);
 }
 
diff --git a/runtime/builtins/builtins_string_iterator.cpp b/runtime/builtins/builtins_string_iterator.cpp
index e90ff858a..6b0d73281 100644
--- a/runtime/builtins/builtins_string_iterator.cpp
+++ b/runtime/builtins/builtins_string_iterator.cpp
@@ -64,8 +64,7 @@ JSTaggedValue BuiltinsStringIterator::Next(EcmaRuntimeCallInfo *argv)
     // 10. If first < 0xD800 or first > 0xDBFF or position+1 = len, let resultString be the string consisting of the
     // single code unit first.
     ObjectFactory *factory = thread->GetEcmaVM()->GetFactory();
-    if (position + 1 == len || first < ecmascript::base::utf_helper::DECODE_LEAD_LOW ||
-        first > ecmascript::base::utf_helper::DECODE_LEAD_HIGH) {
+    if (position + 1 == len || first < utf::DECODE_LEAD_LOW || first > utf::DECODE_LEAD_HIGH) {
         std::vector<uint16_t> resultString {first, 0x0};
         result.Update(factory->NewFromUtf16UnCheck(resultString.data(), 1, true).GetTaggedValue());
     } else {
@@ -75,8 +74,7 @@ JSTaggedValue BuiltinsStringIterator::Next(EcmaRuntimeCallInfo *argv)
         // first.
         // c. Else, let resultString be the string consisting of the code unit first followed by the code unit second.
         uint16_t second = string.GetObject<EcmaString>()->At<false>(position + 1);
-        if (second < ecmascript::base::utf_helper::DECODE_TRAIL_LOW ||
-            second > ecmascript::base::utf_helper::DECODE_TRAIL_HIGH) {
+        if (second < utf::DECODE_TRAIL_LOW || second > utf::DECODE_TRAIL_HIGH) {
             std::vector<uint16_t> resultString {first, 0x0};
             result.Update(factory->NewFromUtf16UnCheck(resultString.data(), 1, false).GetTaggedValue());
         } else {
diff --git a/runtime/ecma_string-inl.h b/runtime/ecma_string-inl.h
index 5220bbb8f..d0b2d766a 100644
--- a/runtime/ecma_string-inl.h
+++ b/runtime/ecma_string-inl.h
@@ -63,12 +63,12 @@ inline EcmaString *EcmaString::CreateFromUtf8(const uint8_t *utf8_data, uint32_t
             UNREACHABLE();
         }
     } else {
-        auto utf16_len = base::utf_helper::Utf8ToUtf16Size(utf8_data, utf8_len);
+        auto utf16_len = utf::Utf8ToUtf16Size(utf8_data, utf8_len);
         string = AllocStringObject(utf16_len, false, vm, space_type);
         ASSERT(string != nullptr);
 
-        [[maybe_unused]] auto len = base::utf_helper::ConvertRegionUtf8ToUtf16(
-            utf8_data, string->GetDataUtf16Writable(), utf8_len, utf16_len, 0);
+        [[maybe_unused]] auto len =
+            utf::ConvertRegionUtf8ToUtf16(utf8_data, string->GetDataUtf16Writable(), utf8_len, utf16_len, 0);
         ASSERT(len == utf16_len);
     }
 
diff --git a/runtime/ecma_string.cpp b/runtime/ecma_string.cpp
index d2cdf0c12..5d3928276 100644
--- a/runtime/ecma_string.cpp
+++ b/runtime/ecma_string.cpp
@@ -411,10 +411,9 @@ uint32_t EcmaString::ComputeHashcodeUtf8(const uint8_t *utf8_data, size_t utf8_l
     if (can_be_compress) {
         hash = ComputeHashForUtf8(utf8_data, utf8_len);
     } else {
-        auto utf16_len = base::utf_helper::Utf8ToUtf16Size(utf8_data, utf8_len);
+        auto utf16_len = utf::Utf8ToUtf16Size(utf8_data, utf8_len);
         PandaVector<uint16_t> tmp_buffer(utf16_len);
-        [[maybe_unused]] auto len =
-            base::utf_helper::ConvertRegionUtf8ToUtf16(utf8_data, tmp_buffer.data(), utf8_len, utf16_len, 0);
+        [[maybe_unused]] auto len = utf::ConvertRegionUtf8ToUtf16(utf8_data, tmp_buffer.data(), utf8_len, utf16_len, 0);
         ASSERT(len == utf16_len);
         hash = ComputeHashForData(tmp_buffer.data(), utf16_len);
     }
@@ -434,8 +433,7 @@ bool EcmaString::IsUtf8EqualsUtf16(const uint8_t *utf8_data, size_t utf8_len, co
     // length is one more than compared utf16_data, don't need convert all utf8_data to utf16_data
     uint32_t utf8_convert_length = utf16_len + 1;
     PandaVector<uint16_t> tmp_buffer(utf8_convert_length);
-    auto len =
-        base::utf_helper::ConvertRegionUtf8ToUtf16(utf8_data, tmp_buffer.data(), utf8_len, utf8_convert_length, 0);
+    auto len = utf::ConvertRegionUtf8ToUtf16(utf8_data, tmp_buffer.data(), utf8_len, utf8_convert_length, 0);
     if (len != utf16_len) {
         return false;
     }
diff --git a/runtime/ecma_string.h b/runtime/ecma_string.h
index cde5bb1ea..bdd17113c 100644
--- a/runtime/ecma_string.h
+++ b/runtime/ecma_string.h
@@ -20,7 +20,7 @@
 #include <cstdint>
 #include <cstring>
 
-#include "plugins/ecmascript/runtime/base/utf_helper.h"
+#include "libpandabase/utils/utf.h"
 #include "plugins/ecmascript/runtime/ecma_macros.h"
 #include "plugins/ecmascript/runtime/js_tagged_value.h"
 #include "plugins/ecmascript/runtime/mem/tagged_object.h"
@@ -132,7 +132,7 @@ public:
         if (!IsUtf16()) {
             return GetLength() + 1;  // add place for zero in the end
         }
-        return base::utf_helper::Utf16ToUtf8Size(GetData(), GetLength());
+        return utf::Utf16ToUtf8Size(GetData(), GetLength());
     }
 
     size_t GetUtf16Length() const
@@ -169,7 +169,7 @@ public:
             }
             return length;
         }
-        return base::utf_helper::ConvertRegionUtf16ToUtf8(GetDataUtf16(), buf, length, max_length - 1, start);
+        return utf::ConvertRegionUtf16ToUtf8(GetDataUtf16(), buf, length, max_length - 1, start);
     }
 
     inline uint32_t CopyDataUtf16(uint16_t *buf, uint32_t max_length) const
@@ -195,7 +195,7 @@ public:
             }
             return length;
         }
-        return base::utf_helper::ConvertRegionUtf8ToUtf16(GetDataUtf8(), buf, len, max_length, start);
+        return utf::ConvertRegionUtf8ToUtf16(GetDataUtf8(), buf, len, max_length, start);
     }
 
     // NOLINTNEXTLINE(modernize-avoid-c-arrays)
@@ -317,7 +317,7 @@ private:
     static bool IsASCIICharacter(uint16_t data)
     {
         // \0 is not considered ASCII in Ecma-Modified-UTF8 [only modify '\u0000']
-        return data - 1U < base::utf_helper::UTF8_1B_MAX;
+        return data - 1U < utf::UTF8_1B_MAX;
     }
 
     /**
diff --git a/runtime/interpreter/slow_runtime_stub.cpp b/runtime/interpreter/slow_runtime_stub.cpp
index 1e2cf8246..7dad6ee9d 100644
--- a/runtime/interpreter/slow_runtime_stub.cpp
+++ b/runtime/interpreter/slow_runtime_stub.cpp
@@ -18,7 +18,6 @@
 #include "js_tagged_value.h"
 #include "lexical_env.h"
 #include "plugins/ecmascript/runtime/base/number_helper.h"
-#include "plugins/ecmascript/runtime/base/utf_helper.h"
 #include "plugins/ecmascript/runtime/builtins/builtins_regexp.h"
 #include "plugins/ecmascript/runtime/class_linker/program_object-inl.h"
 #include "plugins/ecmascript/runtime/ecma_module.h"
@@ -49,6 +48,7 @@
 #include "plugins/ecmascript/runtime/runtime_call_id.h"
 #include "plugins/ecmascript/runtime/template_string.h"
 #include "plugins/ecmascript/runtime/vmstat/runtime_stat.h"
+#include "libpandabase/utils/utf.h"
 
 namespace panda::ecmascript {
 JSTaggedValue SlowRuntimeStub::CallSpreadDyn(JSThread *thread, JSTaggedValue func, JSTaggedValue obj,
@@ -1954,7 +1954,7 @@ JSTaggedValue SlowRuntimeStub::StArraySpread(JSThread *thread, JSTaggedValue dst
         uint32_t prop_counter = 0;
         for (uint32_t i = 0; i < str_len; i++, prop_counter++) {
             uint16_t res = src_string->At<false>(i);
-            if (UNLIKELY(base::utf_helper::IsUTF16SurrogatePair(res))) {
+            if (UNLIKELY(utf::IsUTF16SurrogatePair(res))) {
                 std::array<uint16_t, 2> res_surrogate_pair {};
                 res_surrogate_pair[0] = src_string->At<false>(i);
                 res_surrogate_pair[1] = src_string->At<false>(i + 1);
diff --git a/runtime/js_tagged_value-inl.h b/runtime/js_tagged_value-inl.h
index 649877be3..532706b58 100644
--- a/runtime/js_tagged_value-inl.h
+++ b/runtime/js_tagged_value-inl.h
@@ -135,9 +135,9 @@ inline JSTaggedNumber JSTaggedValue::ToNumber(JSThread *thread, const JSHandle<J
         }
         [[maybe_unused]] PandaVector<uint8_t> buf;  // Span will use buf.data(), shouldn't define inside 'if'
         if (UNLIKELY(str_obj->IsUtf16())) {
-            size_t len = base::utf_helper::Utf16ToUtf8Size(str_obj->GetDataUtf16(), str_len) - 1;
+            size_t len = utf::Utf16ToUtf8Size(str_obj->GetDataUtf16(), str_len) - 1;
             buf.reserve(len);
-            len = base::utf_helper::ConvertRegionUtf16ToUtf8(str_obj->GetDataUtf16(), buf.data(), str_len, len, 0);
+            len = utf::ConvertRegionUtf16ToUtf8(str_obj->GetDataUtf16(), buf.data(), str_len, len, 0);
             str = Span<const uint8_t>(buf.data(), len);
         } else {
             str = Span<const uint8_t>(str_obj->GetDataUtf8(), str_len);
diff --git a/runtime/mem/ecma_string.cpp b/runtime/mem/ecma_string.cpp
index d41bc89e4..2dc55699f 100644
--- a/runtime/mem/ecma_string.cpp
+++ b/runtime/mem/ecma_string.cpp
@@ -59,10 +59,10 @@ PandaString ConvertToPandaString(const EcmaString *s, uint32_t start, uint32_t l
         // Should convert utf-16 to utf-8, because uint16_t likely great than maxChar, will convert fail
         bool modify = (usage != StringConvertedUsage::PRINT);
         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
-        size_t len = base::utf_helper::Utf16ToUtf8Size(s->GetDataUtf16() + start, length, modify) - 1;
+        size_t len = utf::Utf16ToUtf8Size(s->GetDataUtf16() + start, length, modify) - 1;
         PandaVector<uint8_t> buf(len);
         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
-        len = base::utf_helper::ConvertRegionUtf16ToUtf8(s->GetDataUtf16() + start, buf.data(), length, len, 0, modify);
+        len = utf::ConvertRegionUtf16ToUtf8(s->GetDataUtf16() + start, buf.data(), length, len, 0, modify);
         Span<const uint8_t> sp(buf.data(), len);
         return ConvertToPandaString(sp);
     }
diff --git a/runtime/regexp/regexp_parser.cpp b/runtime/regexp/regexp_parser.cpp
index 4e7f906a1..c8567d55c 100644
--- a/runtime/regexp/regexp_parser.cpp
+++ b/runtime/regexp/regexp_parser.cpp
@@ -1289,7 +1289,7 @@ uint32_t RegExpParser::ParseClassAtom(RangeSet *atom)
             size_t u16_size = 0;
             if (c0_ > INT8_MAX) {
                 pc_ -= 1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
-                auto u16_result = base::utf_helper::ConvertUtf8ToUtf16Pair(pc_, true);
+                auto u16_result = utf::ConvertUtf8ToUtf16Pair(pc_, true);
                 value = u16_result.first;
                 u16_size = u16_result.second;
                 Advance(u16_size + 1);
diff --git a/subproject_sources.gn b/subproject_sources.gn
index d903a9a92..440fc1f5f 100644
--- a/subproject_sources.gn
+++ b/subproject_sources.gn
@@ -88,7 +88,6 @@ srcs_runtime = [
   "runtime/base/object_helper.cpp",
   "runtime/base/string_helper.cpp",
   "runtime/base/typed_array_helper.cpp",
-  "runtime/base/utf_helper.cpp",
   "runtime/bridge/ecma_bridge_helpers.cpp",
   "runtime/builtins.cpp",
   "runtime/builtins/builtins_ark_tools.cpp",
-- 
Gitee