diff --git a/.huawei-posix_spawn-clock_gettime-clock_getres-__cxa_thread_.patch.swp b/.huawei-posix_spawn-clock_gettime-clock_getres-__cxa_thread_.patch.swp new file mode 100644 index 0000000000000000000000000000000000000000..8d6264d8f93316497df2303e4b19a3f22a90b671 --- /dev/null +++ b/.huawei-posix_spawn-clock_gettime-clock_getres-__cxa_thread_.patch.swp @@ -0,0 +1,862 @@ +Date: Fri, 17 Oct 2025 12:19:34 +0800 +Subject: [PATCH] posix_spawn clock_gettime clock_getres + __cxa_thread_atexit_impl symbol downgrade + +--- + make/autoconf/flags-cflags.m4 | 19 +++ + make/autoconf/libraries.m4 | 9 -- + src/hotspot/os/aix/os_aix.cpp | 15 ++ + src/hotspot/os/aix/os_aix.inline.hpp | 5 + + src/hotspot/os/bsd/os_bsd.cpp | 71 +++++++++- + src/hotspot/os/bsd/os_bsd.hpp | 2 + + src/hotspot/os/bsd/os_bsd.inline.hpp | 8 ++ + src/hotspot/os/bsd/os_perf_bsd.cpp | 9 +- + src/hotspot/os/linux/os_linux.cpp | 84 +++++++++++- + src/hotspot/os/posix/os_posix.cpp | 167 +++++++++++++++-------- + src/hotspot/os/posix/os_posix.hpp | 17 +++ + src/hotspot/os/posix/os_posix.inline.hpp | 22 +++ + src/hotspot/share/prims/upcallLinker.cpp | 38 ++++++ + src/hotspot/share/prims/upcallLinker.hpp | 10 ++ + src/hotspot/share/runtime/javaThread.cpp | 2 +- + src/hotspot/share/runtime/os.hpp | 1 + + src/hotspot/share/runtime/threads.cpp | 4 + + 17 files changed, 400 insertions(+), 83 deletions(-) + +diff --git a/make/autoconf/flags-cflags.m4 b/make/autoconf/flags-cflags.m4 +index b0b317db6..485988ec6 100644 +--- a/make/autoconf/flags-cflags.m4 ++++ b/make/autoconf/flags-cflags.m4 +@@ -727,6 +727,25 @@ AC_DEFUN([FLAGS_SETUP_CFLAGS_HELPER], + fi + fi + ++ # Optional POSIX functionality needed by the JVM ++ # ++ # Check if clock_gettime is available and in which library. This indicates ++ # availability of CLOCK_MONOTONIC for hotspot. But we don't need to link, so ++ # don't let it update LIBS. ++ save_LIBS="$LIBS" ++ AC_SEARCH_LIBS(clock_gettime, rt, [HAS_CLOCK_GETTIME=true], []) ++ if test "x$LIBS" = "x-lrt "; then ++ CLOCK_GETTIME_IN_LIBRT=true ++ fi ++ LIBS="$save_LIBS" ++ ++ if test "x$HAS_CLOCK_GETTIME" = "xtrue"; then ++ OS_CFLAGS_JVM="$OS_CFLAGS_JVM -DSUPPORTS_CLOCK_MONOTONIC" ++ if test "x$CLOCK_GETTIME_IN_LIBRT" = "xtrue"; then ++ OS_CFLAGS_JVM="$OS_CFLAGS_JVM -DNEEDS_LIBRT" ++ fi ++ fi ++ + # Extra flags needed when building optional static versions of certain + # JDK libraries. + STATIC_LIBS_CFLAGS="-DSTATIC_BUILD=1" +diff --git a/make/autoconf/libraries.m4 b/make/autoconf/libraries.m4 +index b4f62ca19..23df5f1b5 100644 +--- a/make/autoconf/libraries.m4 ++++ b/make/autoconf/libraries.m4 +@@ -148,15 +148,6 @@ AC_DEFUN_ONCE([LIB_SETUP_LIBRARIES], + BASIC_JVM_LIBS="$BASIC_JVM_LIBS -lpthread" + fi + +- # librt for legacy clock_gettime +- if test "x$OPENJDK_TARGET_OS" = xlinux; then +- # Hotspot needs to link librt to get the clock_* functions. +- # But once our supported minimum build and runtime platform +- # has glibc 2.17, this can be removed as the functions are +- # in libc. 
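// ---- editorial aside (illustration only, not part of the patch) ----
// The AC_SEARCH_LIBS(clock_gettime, rt, ...) probe added above in
// flags-cflags.m4 effectively tries to link a tiny program such as the
// hypothetical one below, first against plain libc and then against -lrt.
// Success defines SUPPORTS_CLOCK_MONOTONIC; needing -lrt additionally defines
// NEEDS_LIBRT, which is why the unconditional -lrt link is being removed from
// libraries.m4 in this hunk.
#include <time.h>
int main() {
  struct timespec tp;
  return clock_gettime(CLOCK_MONOTONIC, &tp);  // does clock_gettime resolve at link time?
}
// ---- end editorial aside ----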
+- BASIC_JVM_LIBS="$BASIC_JVM_LIBS -lrt" +- fi +- + # perfstat lib + if test "x$OPENJDK_TARGET_OS" = xaix; then + BASIC_JVM_LIBS="$BASIC_JVM_LIBS -lperfstat" +diff --git a/src/hotspot/os/aix/os_aix.cpp b/src/hotspot/os/aix/os_aix.cpp +index a914422e5..f1990407d 100644 +--- a/src/hotspot/os/aix/os_aix.cpp ++++ b/src/hotspot/os/aix/os_aix.cpp +@@ -948,6 +948,21 @@ double os::elapsedVTime() { + } + } + ++jlong os::javaTimeMillis() { ++ timeval time; ++ int status = gettimeofday(&time, NULL); ++ assert(status != -1, "aix error at gettimeofday()"); ++ return jlong(time.tv_sec) * 1000 + jlong(time.tv_usec / 1000); ++} ++ ++void os::javaTimeSystemUTC(jlong &seconds, jlong &nanos) { ++ timeval time; ++ int status = gettimeofday(&time, NULL); ++ assert(status != -1, "aix error at gettimeofday()"); ++ seconds = jlong(time.tv_sec); ++ nanos = jlong(time.tv_usec) * 1000; ++} ++ + // We use mread_real_time here. + // On AIX: If the CPU has a time register, the result will be RTC_POWER and + // it has to be converted to real time. AIX documentations suggests to do +diff --git a/src/hotspot/os/aix/os_aix.inline.hpp b/src/hotspot/os/aix/os_aix.inline.hpp +index f7e7ee8ab..4ded87316 100644 +--- a/src/hotspot/os/aix/os_aix.inline.hpp ++++ b/src/hotspot/os/aix/os_aix.inline.hpp +@@ -56,4 +56,9 @@ inline void os::map_stack_shadow_pages(address sp) { + inline bool os::can_trim_native_heap() { return false; } + inline bool os::trim_native_heap(os::size_change_t* rss_change) { return false; } + ++inline bool os::supports_monotonic_clock() { ++ // mread_real_time() is monotonic on AIX (see os::javaTimeNanos() comments) ++ return true; ++} ++ + #endif // OS_AIX_OS_AIX_INLINE_HPP +diff --git a/src/hotspot/os/bsd/os_bsd.cpp b/src/hotspot/os/bsd/os_bsd.cpp +index eca9aa9c9..526cf82bc 100644 +--- a/src/hotspot/os/bsd/os_bsd.cpp ++++ b/src/hotspot/os/bsd/os_bsd.cpp +@@ -125,6 +125,8 @@ julong os::Bsd::_physical_memory = 0; + #ifdef __APPLE__ + mach_timebase_info_data_t os::Bsd::_timebase_info = {0, 0}; + volatile uint64_t os::Bsd::_max_abstime = 0; ++#else ++int (*os::Bsd::_clock_gettime)(clockid_t, struct timespec *) = NULL; + #endif + pthread_t os::Bsd::_main_thread; + +@@ -745,13 +747,40 @@ double os::elapsedVTime() { + return elapsedTime(); + } + ++jlong os::javaTimeMillis() { ++ timeval time; ++ int status = gettimeofday(&time, NULL); ++ assert(status != -1, "bsd error"); ++ return jlong(time.tv_sec) * 1000 + jlong(time.tv_usec / 1000); ++} ++ ++void os::javaTimeSystemUTC(jlong &seconds, jlong &nanos) { ++ timeval time; ++ int status = gettimeofday(&time, NULL); ++ assert(status != -1, "bsd error"); ++ seconds = jlong(time.tv_sec); ++ nanos = jlong(time.tv_usec) * 1000; ++} ++ ++#ifndef __APPLE__ ++ #ifndef CLOCK_MONOTONIC ++ #define CLOCK_MONOTONIC (1) ++ #endif ++#endif ++ + #ifdef __APPLE__ + void os::Bsd::clock_init() { + mach_timebase_info(&_timebase_info); + } + #else + void os::Bsd::clock_init() { +- // Nothing to do ++ struct timespec res; ++ struct timespec tp; ++ if (::clock_getres(CLOCK_MONOTONIC, &res) == 0 && ++ ::clock_gettime(CLOCK_MONOTONIC, &tp) == 0) { ++ // yes, monotonic clock is supported ++ _clock_gettime = ::clock_gettime; ++ } + } + #endif + +@@ -783,14 +812,44 @@ jlong os::javaTimeNanos() { + return (prev == obsv) ? 
now : obsv; + } + +-void os::javaTimeNanos_info(jvmtiTimerInfo *info_ptr) { +- info_ptr->max_value = ALL_64_BITS; +- info_ptr->may_skip_backward = false; // not subject to resetting or drifting +- info_ptr->may_skip_forward = false; // not subject to resetting or drifting +- info_ptr->kind = JVMTI_TIMER_ELAPSED; // elapsed not CPU time ++#else // __APPLE__ ++ ++jlong os::javaTimeNanos() { ++ if (os::supports_monotonic_clock()) { ++ struct timespec tp; ++ int status = Bsd::_clock_gettime(CLOCK_MONOTONIC, &tp); ++ assert(status == 0, "gettime error"); ++ jlong result = jlong(tp.tv_sec) * (1000 * 1000 * 1000) + jlong(tp.tv_nsec); ++ return result; ++ } else { ++ timeval time; ++ int status = gettimeofday(&time, NULL); ++ assert(status != -1, "bsd error"); ++ jlong usecs = jlong(time.tv_sec) * (1000 * 1000) + jlong(time.tv_usec); ++ return 1000 * usecs; ++ } + } + #endif // __APPLE__ + ++void os::javaTimeNanos_info(jvmtiTimerInfo *info_ptr) { ++ if (os::supports_monotonic_clock()) { ++ info_ptr->max_value = ALL_64_BITS; ++ ++ // CLOCK_MONOTONIC - amount of time since some arbitrary point in the past ++ info_ptr->may_skip_backward = false; // not subject to resetting or drifting ++ info_ptr->may_skip_forward = false; // not subject to resetting or drifting ++ } else { ++ // gettimeofday - based on time in seconds since the Epoch thus does not wrap ++ info_ptr->max_value = ALL_64_BITS; ++ ++ // gettimeofday is a real time clock so it skips ++ info_ptr->may_skip_backward = true; ++ info_ptr->may_skip_forward = true; ++ } ++ ++ info_ptr->kind = JVMTI_TIMER_ELAPSED; // elapsed not CPU time ++} ++ + // Information of current thread in variety of formats + pid_t os::Bsd::gettid() { + int retval = -1; +diff --git a/src/hotspot/os/bsd/os_bsd.hpp b/src/hotspot/os/bsd/os_bsd.hpp +index f79212bc4..7bf3ff83b 100644 +--- a/src/hotspot/os/bsd/os_bsd.hpp ++++ b/src/hotspot/os/bsd/os_bsd.hpp +@@ -36,6 +36,8 @@ class os::Bsd { + // mach_absolute_time + static mach_timebase_info_data_t _timebase_info; + static volatile uint64_t _max_abstime; ++#else ++ static int (*_clock_gettime)(clockid_t, struct timespec *); + #endif + + static GrowableArray* _cpu_to_node; +diff --git a/src/hotspot/os/bsd/os_bsd.inline.hpp b/src/hotspot/os/bsd/os_bsd.inline.hpp +index 2049b3371..706a8b861 100644 +--- a/src/hotspot/os/bsd/os_bsd.inline.hpp ++++ b/src/hotspot/os/bsd/os_bsd.inline.hpp +@@ -59,4 +59,12 @@ inline void os::map_stack_shadow_pages(address sp) { + inline bool os::can_trim_native_heap() { return false; } + inline bool os::trim_native_heap(os::size_change_t* rss_change) { return false; } + ++inline bool os::supports_monotonic_clock() { ++#ifdef __APPLE__ ++ return true; ++#else ++ return Bsd::_clock_gettime != NULL; ++#endif ++} ++ + #endif // OS_BSD_OS_BSD_INLINE_HPP +diff --git a/src/hotspot/os/bsd/os_perf_bsd.cpp b/src/hotspot/os/bsd/os_perf_bsd.cpp +index 631d2135b..8d5702378 100644 +--- a/src/hotspot/os/bsd/os_perf_bsd.cpp ++++ b/src/hotspot/os/bsd/os_perf_bsd.cpp +@@ -58,13 +58,12 @@ class CPUPerformanceInterface::CPUPerformance : public CHeapObj { + int _active_processor_count; + + bool now_in_nanos(uint64_t* resultp) { +- struct timespec tp; +- int status = clock_gettime(CLOCK_REALTIME, &tp); +- assert(status == 0, "clock_gettime error: %s", os::strerror(errno)); +- if (status != 0) { ++ timeval current_time; ++ if (gettimeofday(¤t_time, NULL) != 0) { ++ // Error getting current time + return false; + } +- *resultp = tp.tv_sec * NANOS_PER_SEC + tp.tv_nsec; ++ *resultp = current_time.tv_sec * NANOS_PER_SEC + 
1000L * current_time.tv_usec; + return true; + } + #endif +diff --git a/src/hotspot/os/linux/os_linux.cpp b/src/hotspot/os/linux/os_linux.cpp +index ddc9ac85c..f73459337 100644 +--- a/src/hotspot/os/linux/os_linux.cpp ++++ b/src/hotspot/os/linux/os_linux.cpp +@@ -1410,6 +1410,11 @@ void os::Linux::capture_initial_stack(size_t max_size) { + + //////////////////////////////////////////////////////////////////////////////// + // time support ++ ++#ifndef SUPPORTS_CLOCK_MONOTONIC ++#error "Build platform doesn't support clock_gettime and related functionality" ++#endif ++ + double os::elapsedVTime() { + struct rusage usage; + int retval = getrusage(RUSAGE_THREAD, &usage); +@@ -1421,6 +1426,38 @@ double os::elapsedVTime() { + } + } + ++jlong os::javaTimeMillis() { ++ if (os::Posix::supports_clock_gettime()) { ++ struct timespec ts; ++ int status = os::Posix::clock_gettime(CLOCK_REALTIME, &ts); ++ assert_status(status == 0, status, "gettime error"); ++ return jlong(ts.tv_sec) * MILLIUNITS + ++ jlong(ts.tv_nsec) / NANOUNITS_PER_MILLIUNIT; ++ } else { ++ timeval time; ++ int status = gettimeofday(&time, NULL); ++ assert(status != -1, "linux error"); ++ return jlong(time.tv_sec) * MILLIUNITS + ++ jlong(time.tv_usec) / (MICROUNITS / MILLIUNITS); ++ } ++} ++ ++void os::javaTimeSystemUTC(jlong &seconds, jlong &nanos) { ++ if (os::Posix::supports_clock_gettime()) { ++ struct timespec ts; ++ int status = os::Posix::clock_gettime(CLOCK_REALTIME, &ts); ++ assert_status(status == 0, status, "gettime error"); ++ seconds = jlong(ts.tv_sec); ++ nanos = jlong(ts.tv_nsec); ++ } else { ++ timeval time; ++ int status = gettimeofday(&time, NULL); ++ assert(status != -1, "linux error"); ++ seconds = jlong(time.tv_sec); ++ nanos = jlong(time.tv_usec) * (NANOUNITS / MICROUNITS); ++ } ++} ++ + void os::Linux::fast_thread_clock_init() { + if (!UseLinuxPosixThreadCPUClocks) { + return; +@@ -1441,12 +1478,47 @@ void os::Linux::fast_thread_clock_init() { + + if (pthread_getcpuclockid_func && + pthread_getcpuclockid_func(_main_thread, &clockid) == 0 && +- clock_getres(clockid, &tp) == 0 && tp.tv_sec == 0) { ++ os::Posix::clock_getres(clockid, &tp) == 0 && tp.tv_sec == 0) { + _supports_fast_thread_cpu_time = true; + _pthread_getcpuclockid = pthread_getcpuclockid_func; + } + } + ++jlong os::javaTimeNanos() { ++ if (os::supports_monotonic_clock()) { ++ struct timespec tp; ++ int status = os::Posix::clock_gettime(CLOCK_MONOTONIC, &tp); ++ assert(status == 0, "gettime error"); ++ jlong result = jlong(tp.tv_sec) * (1000 * 1000 * 1000) + jlong(tp.tv_nsec); ++ return result; ++ } else { ++ timeval time; ++ int status = gettimeofday(&time, NULL); ++ assert(status != -1, "linux error"); ++ jlong usecs = jlong(time.tv_sec) * (1000 * 1000) + jlong(time.tv_usec); ++ return 1000 * usecs; ++ } ++} ++ ++void os::javaTimeNanos_info(jvmtiTimerInfo *info_ptr) { ++ if (os::supports_monotonic_clock()) { ++ info_ptr->max_value = ALL_64_BITS; ++ ++ // CLOCK_MONOTONIC - amount of time since some arbitrary point in the past ++ info_ptr->may_skip_backward = false; // not subject to resetting or drifting ++ info_ptr->may_skip_forward = false; // not subject to resetting or drifting ++ } else { ++ // gettimeofday - based on time in seconds since the Epoch thus does not wrap ++ info_ptr->max_value = ALL_64_BITS; ++ ++ // gettimeofday is a real time clock so it skips ++ info_ptr->may_skip_backward = true; ++ info_ptr->may_skip_forward = true; ++ } ++ ++ info_ptr->kind = JVMTI_TIMER_ELAPSED; // elapsed not CPU time ++} ++ + // thread_id is kernel thread 
id (similar to Solaris LWP id) + intx os::current_thread_id() { return os::Linux::gettid(); } + int os::current_process_id() { +@@ -4412,8 +4484,8 @@ OSReturn os::get_native_priority(const Thread* const thread, + + jlong os::Linux::fast_thread_cpu_time(clockid_t clockid) { + struct timespec tp; +- int status = clock_gettime(clockid, &tp); +- assert(status == 0, "clock_gettime error: %s", os::strerror(errno)); ++ int rc = os::Posix::clock_gettime(clockid, &tp); ++ assert(rc == 0, "clock_gettime is expected to return 0 code"); + return (tp.tv_sec * NANOSECS_PER_SEC) + tp.tv_nsec; + } + +@@ -4533,6 +4605,12 @@ void os::init(void) { + FLAG_SET_DEFAULT(UseMadvPopulateWrite, (::madvise(0, 0, MADV_POPULATE_WRITE) == 0)); + + os::Posix::init(); ++ ++ // Always warn if no monotonic clock available ++ if (!os::Posix::supports_monotonic_clock()) { ++ warning("No monotonic clock was available - timed services may " \ ++ "be adversely affected if the time-of-day clock changes"); ++ } + } + + // To install functions for atexit system call +diff --git a/src/hotspot/os/posix/os_posix.cpp b/src/hotspot/os/posix/os_posix.cpp +index 7f95560a1..cb4122ab8 100644 +--- a/src/hotspot/os/posix/os_posix.cpp ++++ b/src/hotspot/os/posix/os_posix.cpp +@@ -23,7 +23,6 @@ + */ + + #include "classfile/classLoader.hpp" +-#include "jvm.h" + #include "jvmtifiles/jvmti.h" + #include "logging/log.hpp" + #include "memory/allocation.inline.hpp" +@@ -97,6 +96,12 @@ + #define assert_with_errno(cond, msg) check_with_errno(assert, cond, msg) + #define guarantee_with_errno(cond, msg) check_with_errno(guarantee, cond, msg) + ++#if defined(AMD64) ++ __asm__(".symver posix_spawn,posix_spawn@GLIBC_2.2.5"); ++#elif defined(AARCH64) ++ __asm__(".symver posix_spawn,posix_spawn@GLIBC_2.17"); ++#endif ++ + static jlong initial_time_count = 0; + + static int clock_tics_per_sec = 100; +@@ -832,6 +837,10 @@ int os::connect(int fd, struct sockaddr* him, socklen_t len) { + RESTARTABLE_RETURN_INT(::connect(fd, him, len)); + } + ++bool os::supports_monotonic_clock() { ++ return os::Posix::supports_monotonic_clock(); ++} ++ + void os::exit(int num) { + ALLOW_C_FUNCTION(::exit, ::exit(num);) + } +@@ -1223,6 +1232,23 @@ static void pthread_init_common(void) { + PlatformMutex::init(); + } + ++// Not all POSIX types and API's are available on all notionally "posix" ++// platforms. If we have build-time support then we will check for actual ++// runtime support via dlopen/dlsym lookup. This allows for running on an ++// older OS version compared to the build platform. But if there is no ++// build time support then there cannot be any runtime support as we do not ++// know what the runtime types would be (for example clockid_t might be an ++// int or int64_t). ++// ++#ifdef SUPPORTS_CLOCK_MONOTONIC ++ ++// This means we have clockid_t, clock_gettime et al and CLOCK_MONOTONIC ++ ++int (*os::Posix::_clock_gettime)(clockid_t, struct timespec *) = NULL; ++int (*os::Posix::_clock_getres)(clockid_t, struct timespec *) = NULL; ++ ++bool os::Posix::_supports_monotonic_clock = false; ++ + static int (*_pthread_condattr_setclock)(pthread_condattr_t *, clockid_t) = nullptr; + + static bool _use_clock_monotonic_condattr = false; +@@ -1238,7 +1264,44 @@ void os::Posix::init(void) { + // NOTE: no logging available when this is called. Put logging + // statements in init_2(). + +- // Check for pthread_condattr_setclock support. ++ // 1. Check for CLOCK_MONOTONIC support. 
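// ---- editorial sketch (illustration only, not part of the patch) ----
// The hunk below resolves clock_gettime/clock_getres through dlopen/dlsym
// rather than a direct link, so a JDK built where the functions live in libc
// (glibc >= 2.17) can still start where they live in librt. A self-contained
// version of the same probe (link with -ldl on older glibc):
#define _GNU_SOURCE
#include <dlfcn.h>
#include <time.h>
#include <stdio.h>

typedef int (*clock_gettime_fn)(clockid_t, struct timespec*);

int main() {
  void* handle = dlopen("librt.so.1", RTLD_LAZY);   // older glibc keeps it in librt
  if (handle == NULL) handle = RTLD_DEFAULT;        // newer glibc: plain libc
  clock_gettime_fn fn = (clock_gettime_fn)dlsym(handle, "clock_gettime");
  struct timespec tp;
  if (fn != NULL && fn(CLOCK_MONOTONIC, &tp) == 0) {
    printf("monotonic clock: %lld.%09ld\n", (long long)tp.tv_sec, tp.tv_nsec);
  }
  return 0;
}
// ---- end editorial sketch ----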
++ ++ void* handle = NULL; ++ ++ // For older linux we need librt, for other OS we can find ++ // this function in regular libc. ++#ifdef NEEDS_LIBRT ++ // We do dlopen's in this particular order due to bug in linux ++ // dynamic loader (see 6348968) leading to crash on exit. ++ handle = dlopen("librt.so.1", RTLD_LAZY); ++ if (handle == NULL) { ++ handle = dlopen("librt.so", RTLD_LAZY); ++ } ++#endif ++ ++ if (handle == NULL) { ++ handle = RTLD_DEFAULT; ++ } ++ ++ int (*clock_getres_func)(clockid_t, struct timespec*) = ++ (int(*)(clockid_t, struct timespec*))dlsym(handle, "clock_getres"); ++ int (*clock_gettime_func)(clockid_t, struct timespec*) = ++ (int(*)(clockid_t, struct timespec*))dlsym(handle, "clock_gettime"); ++ if (clock_getres_func != NULL && clock_gettime_func != NULL) { ++ _clock_gettime = clock_gettime_func; ++ _clock_getres = clock_getres_func; ++ // We assume that if both clock_gettime and clock_getres support ++ // CLOCK_MONOTONIC then the OS provides true high-res monotonic clock. ++ struct timespec res; ++ struct timespec tp; ++ if (clock_getres_func(CLOCK_MONOTONIC, &res) == 0 && ++ clock_gettime_func(CLOCK_MONOTONIC, &tp) == 0) { ++ // Yes, monotonic clock is supported. ++ _supports_monotonic_clock = true; ++ } ++ } ++ ++ // 2. Check for pthread_condattr_setclock support. + + // libpthread is already loaded. + int (*condattr_setclock_func)(pthread_condattr_t*, clockid_t) = +@@ -1253,7 +1316,7 @@ void os::Posix::init(void) { + pthread_init_common(); + + int status; +- if (_pthread_condattr_setclock != nullptr) { ++ if (_pthread_condattr_setclock != nullptr && _clock_gettime != nullptr) { + if ((status = _pthread_condattr_setclock(_condAttr, CLOCK_MONOTONIC)) != 0) { + if (status == EINVAL) { + _use_clock_monotonic_condattr = false; +@@ -1271,13 +1334,28 @@ void os::Posix::init(void) { + } + + void os::Posix::init_2(void) { +- log_info(os)("Use of CLOCK_MONOTONIC is supported"); ++ log_info(os)("Use of CLOCK_MONOTONIC is%s supported",(_clock_gettime != NULL ? "" : " not")); + log_info(os)("Use of pthread_condattr_setclock is%s supported", + (_pthread_condattr_setclock != nullptr ? "" : " not")); + log_info(os)("Relative timed-wait using pthread_cond_timedwait is associated with %s", + _use_clock_monotonic_condattr ? "CLOCK_MONOTONIC" : "the default clock"); + } + ++#else // !SUPPORTS_CLOCK_MONOTONIC ++ ++void os::Posix::init(void) { ++ pthread_init_common(); ++} ++ ++void os::Posix::init_2(void) { ++ log_info(os)("Use of CLOCK_MONOTONIC is not supported"); ++ log_info(os)("Use of pthread_condattr_setclock is not supported"); ++ log_info(os)("Relative timed-wait using pthread_cond_timedwait is associated with the default clock"); ++} ++ ++#endif // SUPPORTS_CLOCK_MONOTONIC ++ ++ + // Utility to convert the given timeout to an absolute timespec + // (based on the appropriate clock) to use with pthread_cond_timewait, + // and sem_timedwait(). +@@ -1329,6 +1406,7 @@ static void calc_rel_time(timespec* abstime, jlong timeout, jlong now_sec, + + // Unpack the given deadline in milliseconds since the epoch, into the given timespec. + // The current time in seconds is also passed in to enforce an upper bound as discussed above. ++// This is only used with gettimeofday, when clock_gettime is not available. 
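// ---- editorial sketch (illustration only, not part of the patch) ----
// The helpers in this hunk convert a timeout into the absolute timespec that
// pthread_cond_timedwait() and sem_timedwait() expect. The core of the
// relative case, as a hypothetical helper that omits the MAX_SECS clamping
// the real calc_rel_time() performs:
#include <ctime>

static void rel_to_abstime(timespec* abstime, long long timeout_ns,
                           time_t now_sec, long now_nsec) {
  time_t secs = now_sec + (time_t)(timeout_ns / 1000000000LL);
  long nanos = now_nsec + (long)(timeout_ns % 1000000000LL);
  if (nanos >= 1000000000L) {  // carry nanosecond overflow into the seconds field
    secs += 1;
    nanos -= 1000000000L;
  }
  abstime->tv_sec = secs;
  abstime->tv_nsec = nanos;
}
// ---- end editorial sketch ----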
+ static void unpack_abs_time(timespec* abstime, jlong deadline, jlong now_sec) { + time_t max_secs = now_sec + MAX_SECS; + +@@ -1363,21 +1441,38 @@ static void to_abstime(timespec* abstime, jlong timeout, + timeout = 0; + } + +- clockid_t clock = CLOCK_MONOTONIC; +- if (isAbsolute || (!_use_clock_monotonic_condattr || isRealtime)) { +- clock = CLOCK_REALTIME; +- } +- +- struct timespec now; +- int status = clock_gettime(clock, &now); +- assert(status == 0, "clock_gettime error: %s", os::strerror(errno)); ++#ifdef SUPPORTS_CLOCK_MONOTONIC + +- if (!isAbsolute) { ++ clockid_t clock = CLOCK_MONOTONIC; ++ // need to ensure we have a runtime check for clock_gettime support ++ if (!isAbsolute && os::Posix::supports_monotonic_clock()) { ++ if (!_use_clock_monotonic_condattr || isRealtime) { ++ clock = CLOCK_REALTIME; ++ } ++ struct timespec now; ++ int status = os::Posix::clock_gettime(clock, &now); ++ assert_status(status == 0, status, "clock_gettime"); + calc_rel_time(abstime, timeout, now.tv_sec, now.tv_nsec, NANOUNITS); ++ DEBUG_ONLY(max_secs += now.tv_sec;) + } else { +- unpack_abs_time(abstime, timeout, now.tv_sec); ++ ++#else ++ ++ { // Match the block scope. ++ ++#endif // SUPPORTS_CLOCK_MONOTONIC ++ ++ // Time-of-day clock is all we can reliably use. ++ struct timeval now; ++ int status = gettimeofday(&now, NULL); ++ assert_status(status == 0, errno, "gettimeofday"); ++ if (isAbsolute) { ++ unpack_abs_time(abstime, timeout, now.tv_sec); ++ } else { ++ calc_rel_time(abstime, timeout, now.tv_sec, now.tv_usec, MICROUNITS); ++ } ++ DEBUG_ONLY(max_secs += now.tv_sec;) + } +- DEBUG_ONLY(max_secs += now.tv_sec;) + + assert(abstime->tv_sec >= 0, "tv_sec < 0"); + assert(abstime->tv_sec <= max_secs, "tv_sec > max_secs"); +@@ -1393,50 +1486,6 @@ void os::Posix::to_RTC_abstime(timespec* abstime, int64_t millis) { + true /* use real-time clock */); + } + +-// Common (partly) shared time functions +- +-jlong os::javaTimeMillis() { +- struct timespec ts; +- int status = clock_gettime(CLOCK_REALTIME, &ts); +- assert(status == 0, "clock_gettime error: %s", os::strerror(errno)); +- return jlong(ts.tv_sec) * MILLIUNITS + +- jlong(ts.tv_nsec) / NANOUNITS_PER_MILLIUNIT; +-} +- +-void os::javaTimeSystemUTC(jlong &seconds, jlong &nanos) { +- struct timespec ts; +- int status = clock_gettime(CLOCK_REALTIME, &ts); +- assert(status == 0, "clock_gettime error: %s", os::strerror(errno)); +- seconds = jlong(ts.tv_sec); +- nanos = jlong(ts.tv_nsec); +-} +- +-// macOS and AIX have platform specific implementations for javaTimeNanos() +-// using native clock/timer access APIs. These have historically worked well +-// for those platforms, but it may be possible for them to switch to the +-// generic clock_gettime mechanism in the future. 
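// ---- editorial note (illustration only, not part of the patch) ----
// This deleted block is not lost: the patch re-creates javaTimeMillis(),
// javaTimeSystemUTC() and javaTimeNanos() per platform (see the os_aix.cpp,
// os_bsd.cpp and os_linux.cpp hunks above) so each can fall back to
// gettimeofday() when clock_gettime() is unavailable. The shared fallback
// arithmetic boils down to:
#include <sys/time.h>
#include <cstdint>

int64_t nanos_from_gettimeofday() {
  timeval time;
  gettimeofday(&time, NULL);
  int64_t usecs = int64_t(time.tv_sec) * (1000 * 1000) + int64_t(time.tv_usec);
  return 1000 * usecs;  // microseconds -> nanoseconds
}
// ---- end editorial note ----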
+-#if !defined(__APPLE__) && !defined(AIX) +- +-jlong os::javaTimeNanos() { +- struct timespec tp; +- int status = clock_gettime(CLOCK_MONOTONIC, &tp); +- assert(status == 0, "clock_gettime error: %s", os::strerror(errno)); +- jlong result = jlong(tp.tv_sec) * NANOSECS_PER_SEC + jlong(tp.tv_nsec); +- return result; +-} +- +-// for timer info max values which include all bits +-#define ALL_64_BITS CONST64(0xFFFFFFFFFFFFFFFF) +- +-void os::javaTimeNanos_info(jvmtiTimerInfo *info_ptr) { +- // CLOCK_MONOTONIC - amount of time since some arbitrary point in the past +- info_ptr->max_value = ALL_64_BITS; +- info_ptr->may_skip_backward = false; // not subject to resetting or drifting +- info_ptr->may_skip_forward = false; // not subject to resetting or drifting +- info_ptr->kind = JVMTI_TIMER_ELAPSED; // elapsed not CPU time +-} +-#endif // ! APPLE && !AIX +- + // Time since start-up in seconds to a fine granularity. + double os::elapsedTime() { + return ((double)os::elapsed_counter()) / os::elapsed_frequency(); // nanosecond resolution +diff --git a/src/hotspot/os/posix/os_posix.hpp b/src/hotspot/os/posix/os_posix.hpp +index 9e98f4316..0fb974634 100644 +--- a/src/hotspot/os/posix/os_posix.hpp ++++ b/src/hotspot/os/posix/os_posix.hpp +@@ -96,6 +96,23 @@ public: + static address ucontext_get_pc(const ucontext_t* ctx); + static void ucontext_set_pc(ucontext_t* ctx, address pc); + ++#ifdef SUPPORTS_CLOCK_MONOTONIC ++ ++private: ++ static bool _supports_monotonic_clock; ++ // These need to be members so we can access them from inline functions ++ static int (*_clock_gettime)(clockid_t, struct timespec *); ++ static int (*_clock_getres)(clockid_t, struct timespec *); ++public: ++ static bool supports_monotonic_clock(); ++ static bool supports_clock_gettime(); ++ static int clock_gettime(clockid_t clock_id, struct timespec *tp); ++ static int clock_getres(clockid_t clock_id, struct timespec *tp); ++#else ++ static bool supports_monotonic_clock() { return false; } ++ static bool supports_clock_gettime() { return false; } ++#endif ++ + static void to_RTC_abstime(timespec* abstime, int64_t millis); + + static bool handle_stack_overflow(JavaThread* thread, address addr, address pc, +diff --git a/src/hotspot/os/posix/os_posix.inline.hpp b/src/hotspot/os/posix/os_posix.inline.hpp +index cf81c5d62..52bdca757 100644 +--- a/src/hotspot/os/posix/os_posix.inline.hpp ++++ b/src/hotspot/os/posix/os_posix.inline.hpp +@@ -37,6 +37,28 @@ + // Aix does not have NUMA support but need these for compilation. + inline bool os::numa_has_group_homing() { AIX_ONLY(ShouldNotReachHere();) return false; } + ++#ifdef SUPPORTS_CLOCK_MONOTONIC ++ ++// Exported clock functionality ++ ++inline bool os::Posix::supports_monotonic_clock() { ++ return _supports_monotonic_clock; ++} ++ ++inline bool os::Posix::supports_clock_gettime() { ++ return _clock_gettime != NULL; ++} ++ ++inline int os::Posix::clock_gettime(clockid_t clock_id, struct timespec *tp) { ++ return _clock_gettime != NULL ? _clock_gettime(clock_id, tp) : -1; ++} ++ ++inline int os::Posix::clock_getres(clockid_t clock_id, struct timespec *tp) { ++ return _clock_getres != NULL ? 
_clock_getres(clock_id, tp) : -1; ++} ++ ++#endif // SUPPORTS_CLOCK_MONOTONIC ++ + // Platform Mutex/Monitor implementation + + inline void PlatformMutex::lock() { +diff --git a/src/hotspot/share/prims/upcallLinker.cpp b/src/hotspot/share/prims/upcallLinker.cpp +index d9692bd52..a1e39a4c1 100644 +--- a/src/hotspot/share/prims/upcallLinker.cpp ++++ b/src/hotspot/share/prims/upcallLinker.cpp +@@ -26,6 +26,9 @@ + #include "classfile/symbolTable.hpp" + #include "classfile/systemDictionary.hpp" + #include "compiler/compilationPolicy.hpp" ++#if defined(AARCH64) || defined(AMD64) ++#include "memory/allocation.hpp" ++#endif // AARCH64 || AMD64 + #include "memory/resourceArea.hpp" + #include "prims/upcallLinker.hpp" + #include "runtime/interfaceSupport.inline.hpp" +@@ -44,7 +47,11 @@ extern struct JavaVM_ main_vm; + // to keep track of the fact that we have attached a native thread to the VM. When the thread local + // storage is destroyed (which happens when the native threads is terminated), we check if the + // storage has an attached thread and, if so, we detach it from the VM. ++#if defined(AARCH64) || defined(AMD64) ++struct UpcallContext: public CHeapObj { ++#else + struct UpcallContext { ++#endif // AARCH64 || AMD64 + Thread* attachedThread; + + UpcallContext() {} // Explicit constructor to address XL C compiler bug. +@@ -56,9 +63,36 @@ struct UpcallContext { + } + }; + ++#if defined(AARCH64) || defined(AMD64) ++static unsigned int upcall_thread_key; ++ ++void ThreadLocalUpCall::upcall_destructor(void* threadContext) { ++ delete static_cast(threadContext); ++} ++ ++void ThreadLocalUpCall::init() { ++ int rslt = pthread_key_create(&upcall_thread_key, upcall_destructor); ++ assert_status(rslt == 0, rslt, "pthread_key_create"); ++} ++ ++void* ThreadLocalUpCall::get() { ++ return pthread_getspecific(upcall_thread_key); ++} ++ ++void ThreadLocalUpCall::set() { ++ if (!ThreadLocalUpCall::get()) { ++ UpcallContext* threadContext = new UpcallContext(); ++ pthread_setspecific(upcall_thread_key, threadContext); ++ } ++} ++#else + APPROVED_CPP_THREAD_LOCAL UpcallContext threadContext; ++#endif // AARCH64 || AMD64 + + JavaThread* UpcallLinker::maybe_attach_and_get_thread() { ++#if defined(AARCH64) || defined(AMD64) ++ ThreadLocalUpCall::set(); ++#endif // AARCH64 || AMD64 + JavaThread* thread = JavaThread::current_or_null(); + if (thread == nullptr) { + JavaVM_ *vm = (JavaVM *)(&main_vm); +@@ -66,7 +100,11 @@ JavaThread* UpcallLinker::maybe_attach_and_get_thread() { + jint result = vm->functions->AttachCurrentThreadAsDaemon(vm, (void**) &p_env, nullptr); + guarantee(result == JNI_OK, "Could not attach thread for upcall. 
JNI error code: %d", result); + thread = JavaThread::current(); ++#if defined(AARCH64) || defined(AMD64) ++ ((UpcallContext *)(ThreadLocalUpCall::get()))->attachedThread = thread; ++#else + threadContext.attachedThread = thread; ++#endif // AARCH64 || AMD64 + assert(!thread->has_last_Java_frame(), "newly-attached thread not expected to have last Java frame"); + } + return thread; +diff --git a/src/hotspot/share/prims/upcallLinker.hpp b/src/hotspot/share/prims/upcallLinker.hpp +index 3f8c717e5..417e16a48 100644 +--- a/src/hotspot/share/prims/upcallLinker.hpp ++++ b/src/hotspot/share/prims/upcallLinker.hpp +@@ -46,4 +46,14 @@ public: + bool needs_return_buffer, int ret_buf_size); + }; + ++#if defined(AARCH64) || defined(AMD64) ++class ThreadLocalUpCall { ++public: ++ static void upcall_destructor(void* threadContext); ++ static void init(); ++ static void set(); ++ static void* get(); ++}; ++#endif // AARCH64 || AMD64 ++ + #endif // SHARE_VM_PRIMS_UPCALLLINKER_HPP +diff --git a/src/hotspot/share/runtime/javaThread.cpp b/src/hotspot/share/runtime/javaThread.cpp +index 515a8d9d9..43da36495 100644 +--- a/src/hotspot/share/runtime/javaThread.cpp ++++ b/src/hotspot/share/runtime/javaThread.cpp +@@ -2026,7 +2026,7 @@ bool JavaThread::sleep_nanos(jlong nanos) { + if (newtime - prevtime < 0) { + // time moving backwards, should only happen if no monotonic clock + // not a guarantee() because JVM should not abort on kernel/glibc bugs +- assert(false, ++ assert(!os::supports_monotonic_clock(), + "unexpected time moving backwards detected in JavaThread::sleep()"); + } else { + nanos_remaining -= (newtime - prevtime); +diff --git a/src/hotspot/share/runtime/os.hpp b/src/hotspot/share/runtime/os.hpp +index 1cc59d275..10b53c3de 100644 +--- a/src/hotspot/share/runtime/os.hpp ++++ b/src/hotspot/share/runtime/os.hpp +@@ -262,6 +262,7 @@ class os: AllStatic { + static void javaTimeNanos_info(jvmtiTimerInfo *info_ptr); + static void javaTimeSystemUTC(jlong &seconds, jlong &nanos); + static void run_periodic_checks(outputStream* st); ++ static bool supports_monotonic_clock(); + + // Returns the elapsed time in seconds since the vm started. 
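// ---- editorial sketch (illustration only, not part of the patch) ----
// The upcallLinker change above swaps a C++ thread_local for a pthread key so
// that a destructor reliably runs when the native thread terminates,
// sidestepping the glibc __cxa_thread_atexit_impl dependency named in the
// patch subject. The underlying POSIX pattern, in a self-contained form:
#include <pthread.h>
#include <cstdio>

static pthread_key_t upcall_key;

static void on_thread_exit(void* ctx) {          // runs at native thread exit
  std::printf("cleaning up context %p\n", ctx);  // stand-in for detaching the VM thread
  delete static_cast<int*>(ctx);
}

static void* worker(void*) {
  pthread_setspecific(upcall_key, new int(42));  // attach per-thread context once
  return nullptr;                                // destructor fires after return
}

int main() {
  pthread_key_create(&upcall_key, on_thread_exit);
  pthread_t t;
  pthread_create(&t, nullptr, worker, nullptr);
  pthread_join(t, nullptr);
  return 0;
}
// ---- end editorial sketch ----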
+ static double elapsedTime(); +diff --git a/src/hotspot/share/runtime/threads.cpp b/src/hotspot/share/runtime/threads.cpp +index 8b7058405..0df523f76 100644 +--- a/src/hotspot/share/runtime/threads.cpp ++++ b/src/hotspot/share/runtime/threads.cpp +@@ -58,6 +58,9 @@ + #include "oops/symbol.hpp" + #include "prims/jvmtiAgentList.hpp" + #include "prims/jvm_misc.hpp" ++#if defined(AARCH64) || defined(AMD64) ++#include "prims/upcallLinker.hpp" ++#endif // AARCH64 || AMD64 + #include "runtime/arguments.hpp" + #include "runtime/fieldDescriptor.inline.hpp" + #include "runtime/flags/jvmFlagLimit.hpp" +@@ -424,6 +425,11 @@ jint Threads::create_vm(JavaVMInitArgs* args, bool* canTryAgain) { + // Initialize library-based TLS + ThreadLocalStorage::init(); + ++#if defined(AARCH64) || defined(AMD64) ++ // Initialize ThreadLocalUpCall ++ ThreadLocalUpCall::init(); ++ ++#endif // AARCH64 || AMD64 + // Initialize the output stream module + ostream_init(); + +-- +2.34.1 + diff --git a/Backport-JDK-8314125-RISC-V-implement-Base64-intrinsic.patch b/Backport-JDK-8314125-RISC-V-implement-Base64-intrinsic.patch new file mode 100644 index 0000000000000000000000000000000000000000..a0a5bba5a9897aa7fc292b9de5424f802d3e889d --- /dev/null +++ b/Backport-JDK-8314125-RISC-V-implement-Base64-intrinsic.patch @@ -0,0 +1,561 @@ +diff --git a/src/hotspot/cpu/riscv/assembler_riscv.hpp b/src/hotspot/cpu/riscv/assembler_riscv.hpp +index c6db58cfa..49a8fa25b 100644 +--- a/src/hotspot/cpu/riscv/assembler_riscv.hpp ++++ b/src/hotspot/cpu/riscv/assembler_riscv.hpp +@@ -1904,6 +1904,21 @@ enum Nf { + + #undef INSN + ++#define INSN(NAME, op, width, umop, mop, mew, nf) \ ++ void NAME(VectorRegister Vd_or_Vs3, Register Rs1, VectorMask vm = unmasked) { \ ++ patch_VLdSt(op, Vd_or_Vs3, width, Rs1, umop, vm, mop, mew, nf); \ ++ } ++ ++ // Vector Unit-Stride Segment Load Instructions ++ INSN(vlseg3e8_v, 0b0000111, 0b000, 0b00000, 0b00, 0b0, g3); ++ INSN(vlseg4e8_v, 0b0000111, 0b000, 0b00000, 0b00, 0b0, g4); ++ ++ // Vector Unit-Stride Segment Store Instructions ++ INSN(vsseg3e8_v, 0b0100111, 0b000, 0b00000, 0b00, 0b0, g3); ++ INSN(vsseg4e8_v, 0b0100111, 0b000, 0b00000, 0b00, 0b0, g4); ++ ++#undef INSN ++ + #define INSN(NAME, op, width, mop, mew) \ + void NAME(VectorRegister Vd, Register Rs1, VectorRegister Vs2, VectorMask vm = unmasked, Nf nf = g1) { \ + patch_VLdSt(op, Vd, width, Rs1, Vs2->raw_encoding(), vm, mop, mew, nf); \ +diff --git a/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp b/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp +index 1e93cf2b0..3cf7bae99 100644 +--- a/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp ++++ b/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp +@@ -5260,6 +5260,502 @@ class StubGenerator: public StubCodeGenerator { + return (address) start; + } + ++ /** ++ * vector registers: ++ * input VectorRegister's: intputV1-V3, for m2 they could be v2, v4, v6, for m1 they could be v1, v2, v3 ++ * index VectorRegister's: idxV1-V4, for m2 they could be v8, v10, v12, v14, for m1 they could be v4, v5, v6, v7 ++ * output VectorRegister's: outputV1-V4, for m2 they could be v16, v18, v20, v22, for m1 they could be v8, v9, v10, v11 ++ * ++ * NOTE: each field will occupy a vector register group ++ */ ++ void base64_vector_encode_round(Register src, Register dst, Register codec, ++ Register size, Register stepSrc, Register stepDst, ++ VectorRegister inputV1, VectorRegister inputV2, VectorRegister inputV3, ++ VectorRegister idxV1, VectorRegister idxV2, VectorRegister idxV3, VectorRegister idxV4, ++ VectorRegister outputV1, 
VectorRegister outputV2, VectorRegister outputV3, VectorRegister outputV4, ++ Assembler::LMUL lmul) { ++ // set vector register type/len ++ __ vsetvli(x0, size, Assembler::e8, lmul); ++ ++ // segmented load src into v registers: mem(src) => vr(3) ++ __ vlseg3e8_v(inputV1, src); ++ ++ // src = src + register_group_len_bytes * 3 ++ __ add(src, src, stepSrc); ++ ++ // encoding ++ // 1. compute index into lookup table: vr(3) => vr(4) ++ __ vsrl_vi(idxV1, inputV1, 2); ++ ++ __ vsrl_vi(idxV2, inputV2, 2); ++ __ vsll_vi(inputV1, inputV1, 6); ++ __ vor_vv(idxV2, idxV2, inputV1); ++ __ vsrl_vi(idxV2, idxV2, 2); ++ ++ __ vsrl_vi(idxV3, inputV3, 4); ++ __ vsll_vi(inputV2, inputV2, 4); ++ __ vor_vv(idxV3, inputV2, idxV3); ++ __ vsrl_vi(idxV3, idxV3, 2); ++ ++ __ vsll_vi(idxV4, inputV3, 2); ++ __ vsrl_vi(idxV4, idxV4, 2); ++ ++ // 2. indexed load: vr(4) => vr(4) ++ __ vluxei8_v(outputV1, codec, idxV1); ++ __ vluxei8_v(outputV2, codec, idxV2); ++ __ vluxei8_v(outputV3, codec, idxV3); ++ __ vluxei8_v(outputV4, codec, idxV4); ++ ++ // segmented store encoded data in v registers back to dst: vr(4) => mem(dst) ++ __ vsseg4e8_v(outputV1, dst); ++ ++ // dst = dst + register_group_len_bytes * 4 ++ __ add(dst, dst, stepDst); ++ } ++ ++ /** ++ * void j.u.Base64.Encoder.encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL) ++ * ++ * Input arguments: ++ * c_rarg0 - src, source array ++ * c_rarg1 - sp, src start offset ++ * c_rarg2 - sl, src end offset ++ * c_rarg3 - dst, dest array ++ * c_rarg4 - dp, dst start offset ++ * c_rarg5 - isURL, Base64 or URL character set ++ */ ++ address generate_base64_encodeBlock() { ++ alignas(64) static const char toBase64[64] = { ++ 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', ++ 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', ++ 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', ++ 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', ++ '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/' ++ }; ++ ++ alignas(64) static const char toBase64URL[64] = { ++ 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', ++ 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', ++ 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', ++ 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', ++ '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_' ++ }; ++ ++ __ align(CodeEntryAlignment); ++ StubCodeMark mark(this, "StubRoutines", "encodeBlock"); ++ address start = __ pc(); ++ __ enter(); ++ ++ Register src = c_rarg0; ++ Register soff = c_rarg1; ++ Register send = c_rarg2; ++ Register dst = c_rarg3; ++ Register doff = c_rarg4; ++ Register isURL = c_rarg5; ++ ++ Register codec = c_rarg6; ++ Register length = c_rarg7; // total length of src data in bytes ++ ++ Label ProcessData, Exit; ++ ++ // length should be multiple of 3 ++ __ sub(length, send, soff); ++ // real src/dst to process data ++ __ add(src, src, soff); ++ __ add(dst, dst, doff); ++ ++ // load the codec base address ++ __ la(codec, ExternalAddress((address) toBase64)); ++ __ beqz(isURL, ProcessData); ++ __ la(codec, ExternalAddress((address) toBase64URL)); ++ __ BIND(ProcessData); ++ ++ // vector version ++ if (UseRVV) { ++ Label ProcessM2, ProcessM1, ProcessScalar; ++ ++ Register size = soff; ++ Register stepSrcM1 = send; ++ Register stepSrcM2 = doff; ++ Register stepDst = isURL; ++ ++ __ mv(size, MaxVectorSize * 2); ++ __ mv(stepSrcM1, MaxVectorSize * 3); ++ __ slli(stepSrcM2, stepSrcM1, 1); ++ __ 
mv(stepDst, MaxVectorSize * 2 * 4); ++ ++ __ blt(length, stepSrcM2, ProcessM1); ++ ++ __ BIND(ProcessM2); ++ base64_vector_encode_round(src, dst, codec, ++ size, stepSrcM2, stepDst, ++ v2, v4, v6, // inputs ++ v8, v10, v12, v14, // indexes ++ v16, v18, v20, v22, // outputs ++ Assembler::m2); ++ ++ __ sub(length, length, stepSrcM2); ++ __ bge(length, stepSrcM2, ProcessM2); ++ ++ __ BIND(ProcessM1); ++ __ blt(length, stepSrcM1, ProcessScalar); ++ ++ __ srli(size, size, 1); ++ __ srli(stepDst, stepDst, 1); ++ base64_vector_encode_round(src, dst, codec, ++ size, stepSrcM1, stepDst, ++ v1, v2, v3, // inputs ++ v4, v5, v6, v7, // indexes ++ v8, v9, v10, v11, // outputs ++ Assembler::m1); ++ __ sub(length, length, stepSrcM1); ++ ++ __ BIND(ProcessScalar); ++ } ++ ++ // scalar version ++ { ++ Register byte1 = soff, byte0 = send, byte2 = doff; ++ Register combined24Bits = isURL; ++ ++ __ beqz(length, Exit); ++ ++ Label ScalarLoop; ++ __ BIND(ScalarLoop); ++ { ++ // plain: [byte0[7:0] : byte1[7:0] : byte2[7:0]] => ++ // encoded: [byte0[7:2] : byte0[1:0]+byte1[7:4] : byte1[3:0]+byte2[7:6] : byte2[5:0]] ++ ++ // load 3 bytes src data ++ __ lbu(byte0, Address(src, 0)); ++ __ lbu(byte1, Address(src, 1)); ++ __ lbu(byte2, Address(src, 2)); ++ __ addi(src, src, 3); ++ ++ // construct 24 bits from 3 bytes ++ __ slliw(byte0, byte0, 16); ++ __ slliw(byte1, byte1, 8); ++ __ orr(combined24Bits, byte0, byte1); ++ __ orr(combined24Bits, combined24Bits, byte2); ++ ++ // get codec index and encode(ie. load from codec by index) ++ __ slliw(byte0, combined24Bits, 8); ++ __ srliw(byte0, byte0, 26); ++ __ add(byte0, codec, byte0); ++ __ lbu(byte0, byte0); ++ ++ __ slliw(byte1, combined24Bits, 14); ++ __ srliw(byte1, byte1, 26); ++ __ add(byte1, codec, byte1); ++ __ lbu(byte1, byte1); ++ ++ __ slliw(byte2, combined24Bits, 20); ++ __ srliw(byte2, byte2, 26); ++ __ add(byte2, codec, byte2); ++ __ lbu(byte2, byte2); ++ ++ __ andi(combined24Bits, combined24Bits, 0x3f); ++ __ add(combined24Bits, codec, combined24Bits); ++ __ lbu(combined24Bits, combined24Bits); ++ ++ // store 4 bytes encoded data ++ __ sb(byte0, Address(dst, 0)); ++ __ sb(byte1, Address(dst, 1)); ++ __ sb(byte2, Address(dst, 2)); ++ __ sb(combined24Bits, Address(dst, 3)); ++ ++ __ sub(length, length, 3); ++ __ addi(dst, dst, 4); ++ // loop back ++ __ bnez(length, ScalarLoop); ++ } ++ } ++ ++ __ BIND(Exit); ++ ++ __ leave(); ++ __ ret(); ++ ++ return (address) start; ++ } ++ ++ /** ++ * vector registers: ++ * input VectorRegister's: intputV1-V4, for m2 they could be v2, v4, v6, for m1 they could be v2, v4, v6, v8 ++ * index VectorRegister's: idxV1-V3, for m2 they could be v8, v10, v12, v14, for m1 they could be v10, v12, v14, v16 ++ * output VectorRegister's: outputV1-V4, for m2 they could be v16, v18, v20, v22, for m1 they could be v18, v20, v22 ++ * ++ * NOTE: each field will occupy a single vector register group ++ */ ++ void base64_vector_decode_round(Register src, Register dst, Register codec, ++ Register size, Register stepSrc, Register stepDst, Register failedIdx, ++ VectorRegister inputV1, VectorRegister inputV2, VectorRegister inputV3, VectorRegister inputV4, ++ VectorRegister idxV1, VectorRegister idxV2, VectorRegister idxV3, VectorRegister idxV4, ++ VectorRegister outputV1, VectorRegister outputV2, VectorRegister outputV3, ++ Assembler::LMUL lmul) { ++ // set vector register type/len ++ __ vsetvli(x0, size, Assembler::e8, lmul, Assembler::ma, Assembler::ta); ++ ++ // segmented load src into v registers: mem(src) => vr(4) ++ __ vlseg4e8_v(inputV1, 
src); ++ ++ // src = src + register_group_len_bytes * 4 ++ __ add(src, src, stepSrc); ++ ++ // decoding ++ // 1. indexed load: vr(4) => vr(4) ++ __ vluxei8_v(idxV1, codec, inputV1); ++ __ vluxei8_v(idxV2, codec, inputV2); ++ __ vluxei8_v(idxV3, codec, inputV3); ++ __ vluxei8_v(idxV4, codec, inputV4); ++ ++ // 2. check wrong data ++ __ vor_vv(outputV1, idxV1, idxV2); ++ __ vor_vv(outputV2, idxV3, idxV4); ++ __ vor_vv(outputV1, outputV1, outputV2); ++ __ vmseq_vi(v0, outputV1, -1); ++ __ vfirst_m(failedIdx, v0); ++ Label NoFailure, FailureAtIdx0; ++ // valid value can only be -1 when < 0 ++ __ bltz(failedIdx, NoFailure); ++ // when the first data (at index 0) fails, no need to process data anymore ++ __ beqz(failedIdx, FailureAtIdx0); ++ __ vsetvli(x0, failedIdx, Assembler::e8, lmul, Assembler::mu, Assembler::tu); ++ __ slli(stepDst, failedIdx, 1); ++ __ add(stepDst, failedIdx, stepDst); ++ __ BIND(NoFailure); ++ ++ // 3. compute the decoded data: vr(4) => vr(3) ++ __ vsll_vi(idxV1, idxV1, 2); ++ __ vsrl_vi(outputV1, idxV2, 4); ++ __ vor_vv(outputV1, outputV1, idxV1); ++ ++ __ vsll_vi(idxV2, idxV2, 4); ++ __ vsrl_vi(outputV2, idxV3, 2); ++ __ vor_vv(outputV2, outputV2, idxV2); ++ ++ __ vsll_vi(idxV3, idxV3, 6); ++ __ vor_vv(outputV3, idxV4, idxV3); ++ ++ // segmented store encoded data in v registers back to dst: vr(3) => mem(dst) ++ __ vsseg3e8_v(outputV1, dst); ++ ++ // dst = dst + register_group_len_bytes * 3 ++ __ add(dst, dst, stepDst); ++ __ BIND(FailureAtIdx0); ++ } ++ ++ /** ++ * int j.u.Base64.Decoder.decodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL, boolean isMIME) ++ * ++ * Input arguments: ++ * c_rarg0 - src, source array ++ * c_rarg1 - sp, src start offset ++ * c_rarg2 - sl, src end offset ++ * c_rarg3 - dst, dest array ++ * c_rarg4 - dp, dst start offset ++ * c_rarg5 - isURL, Base64 or URL character set ++ * c_rarg6 - isMIME, Decoding MIME block ++ */ ++ address generate_base64_decodeBlock() { ++ ++ static const uint8_t fromBase64[256] = { ++ 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, ++ 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, ++ 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, ++ 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, ++ 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, ++ 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u, ++ 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, ++ 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, ++ 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, ++ 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, ++ 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, ++ 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, ++ 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, ++ 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, ++ 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, ++ 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, ++ }; ++ ++ static const uint8_t 
fromBase64URL[256] = { ++ 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, ++ 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, ++ 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, ++ 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, ++ 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, ++ 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u, ++ 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, ++ 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, ++ 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, ++ 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, ++ 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, ++ 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, ++ 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, ++ 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, ++ 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, ++ 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, ++ }; ++ ++ __ align(CodeEntryAlignment); ++ StubCodeMark mark(this, "StubRoutines", "decodeBlock"); ++ address start = __ pc(); ++ __ enter(); ++ ++ Register src = c_rarg0; ++ Register soff = c_rarg1; ++ Register send = c_rarg2; ++ Register dst = c_rarg3; ++ Register doff = c_rarg4; ++ Register isURL = c_rarg5; ++ Register isMIME = c_rarg6; ++ ++ Register codec = c_rarg7; ++ Register dstBackup = x31; ++ Register length = x28; // t3, total length of src data in bytes ++ ++ Label ProcessData, Exit; ++ Label ProcessScalar, ScalarLoop; ++ ++ // passed in length (send - soff) is guaranteed to be > 4, ++ // and in this intrinsic we only process data of length in multiple of 4, ++ // it's not guaranteed to be multiple of 4 by java level, so do it explicitly ++ __ sub(length, send, soff); ++ __ andi(length, length, -4); ++ // real src/dst to process data ++ __ add(src, src, soff); ++ __ add(dst, dst, doff); ++ // backup of dst, used to calculate the return value at exit ++ __ mv(dstBackup, dst); ++ ++ // load the codec base address ++ __ la(codec, ExternalAddress((address) fromBase64)); ++ __ beqz(isURL, ProcessData); ++ __ la(codec, ExternalAddress((address) fromBase64URL)); ++ __ BIND(ProcessData); ++ ++ // vector version ++ if (UseRVV) { ++ // for MIME case, it has a default length limit of 76 which could be ++ // different(smaller) from (send - soff), so in MIME case, we go through ++ // the scalar code path directly. 
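// ---- editorial sketch (illustration only, not part of the patch) ----
// The scalar loop further below reassembles four looked-up 6-bit values into
// three output bytes and detects bad input by sign, as in this hypothetical
// C++ helper (table entries of 255 mark invalid characters):
#include <cstdint>

static int decode4(const uint8_t table[256], const uint8_t in[4], uint8_t out[3]) {
  int32_t b0 = (int8_t)table[in[0]], b1 = (int8_t)table[in[1]];  // 255 -> -1
  int32_t b2 = (int8_t)table[in[2]], b3 = (int8_t)table[in[3]];
  int32_t combined = (int32_t)(((uint32_t)b0 << 18) | ((uint32_t)b1 << 12) |
                               ((uint32_t)b2 << 6) | (uint32_t)b3);
  if (combined < 0) return -1;       // any -1 lookup forces the sign bit on
  out[0] = (uint8_t)(combined >> 16);
  out[1] = (uint8_t)(combined >> 8);
  out[2] = (uint8_t)combined;
  return 0;
}
// ---- end editorial sketch ----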
++ __ bnez(isMIME, ScalarLoop); ++ ++ Label ProcessM1, ProcessM2; ++ ++ Register failedIdx = soff; ++ Register stepSrcM1 = send; ++ Register stepSrcM2 = doff; ++ Register stepDst = isURL; ++ Register size = x29; // t4 ++ ++ __ mv(size, MaxVectorSize * 2); ++ __ mv(stepSrcM1, MaxVectorSize * 4); ++ __ slli(stepSrcM2, stepSrcM1, 1); ++ __ mv(stepDst, MaxVectorSize * 2 * 3); ++ ++ __ blt(length, stepSrcM2, ProcessM1); ++ ++ ++ // Assembler::m2 ++ __ BIND(ProcessM2); ++ base64_vector_decode_round(src, dst, codec, ++ size, stepSrcM2, stepDst, failedIdx, ++ v2, v4, v6, v8, // inputs ++ v10, v12, v14, v16, // indexes ++ v18, v20, v22, // outputs ++ Assembler::m2); ++ __ sub(length, length, stepSrcM2); ++ ++ // error check ++ // valid value of failedIdx can only be -1 when < 0 ++ __ bgez(failedIdx, Exit); ++ ++ __ bge(length, stepSrcM2, ProcessM2); ++ ++ ++ // Assembler::m1 ++ __ BIND(ProcessM1); ++ __ blt(length, stepSrcM1, ProcessScalar); ++ ++ __ srli(size, size, 1); ++ __ srli(stepDst, stepDst, 1); ++ base64_vector_decode_round(src, dst, codec, ++ size, stepSrcM1, stepDst, failedIdx, ++ v1, v2, v3, v4, // inputs ++ v5, v6, v7, v8, // indexes ++ v9, v10, v11, // outputs ++ Assembler::m1); ++ __ sub(length, length, stepSrcM1); ++ ++ // error check ++ // valid value of failedIdx can only be -1 when < 0 ++ __ bgez(failedIdx, Exit); ++ ++ __ BIND(ProcessScalar); ++ __ beqz(length, Exit); ++ } ++ ++ // scalar version ++ { ++ Register byte0 = soff, byte1 = send, byte2 = doff, byte3 = isURL; ++ Register combined32Bits = x29; // t5 ++ ++ // encoded: [byte0[5:0] : byte1[5:0] : byte2[5:0]] : byte3[5:0]] => ++ // plain: [byte0[5:0]+byte1[5:4] : byte1[3:0]+byte2[5:2] : byte2[1:0]+byte3[5:0]] ++ __ BIND(ScalarLoop); ++ ++ // load 4 bytes encoded src data ++ __ lbu(byte0, Address(src, 0)); ++ __ lbu(byte1, Address(src, 1)); ++ __ lbu(byte2, Address(src, 2)); ++ __ lbu(byte3, Address(src, 3)); ++ __ addi(src, src, 4); ++ ++ // get codec index and decode (ie. load from codec by index) ++ __ add(byte0, codec, byte0); ++ __ add(byte1, codec, byte1); ++ __ lb(byte0, Address(byte0, 0)); ++ __ lb(byte1, Address(byte1, 0)); ++ __ add(byte2, codec, byte2); ++ __ add(byte3, codec, byte3); ++ __ lb(byte2, Address(byte2, 0)); ++ __ lb(byte3, Address(byte3, 0)); ++ __ slliw(byte0, byte0, 18); ++ __ slliw(byte1, byte1, 12); ++ __ orr(byte0, byte0, byte1); ++ __ orr(byte0, byte0, byte3); ++ __ slliw(byte2, byte2, 6); ++ // For performance consideration, `combined32Bits` is constructed for 2 purposes at the same time, ++ // 1. error check below ++ // 2. 
decode below ++ __ orr(combined32Bits, byte0, byte2); ++ ++ // error check ++ __ bltz(combined32Bits, Exit); ++ ++ // store 3 bytes decoded data ++ __ sraiw(byte0, combined32Bits, 16); ++ __ sraiw(byte1, combined32Bits, 8); ++ __ sb(byte0, Address(dst, 0)); ++ __ sb(byte1, Address(dst, 1)); ++ __ sb(combined32Bits, Address(dst, 2)); ++ ++ __ sub(length, length, 4); ++ __ addi(dst, dst, 3); ++ // loop back ++ __ bnez(length, ScalarLoop); ++ } ++ ++ __ BIND(Exit); ++ __ sub(c_rarg0, dst, dstBackup); ++ ++ __ leave(); ++ __ ret(); ++ ++ return (address) start; ++ } ++ + void adler32_process_bytes(Register buff, Register s1, Register s2, VectorRegister vtable, + VectorRegister vzero, VectorRegister vbytes, VectorRegister vs1acc, VectorRegister vs2acc, + Register temp0, Register temp1, Register temp2, Register temp3, +@@ -6128,6 +6624,11 @@ static const int64_t right_3_bits = right_n_bits(3); + StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); + } + ++ if (UseBASE64Intrinsics) { ++ StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock(); ++ StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock(); ++ } ++ + if (UseAdler32Intrinsics) { + StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); + } +diff --git a/src/hotspot/cpu/riscv/vm_version_riscv.cpp b/src/hotspot/cpu/riscv/vm_version_riscv.cpp +index 03e45da7f..c507a39c9 100644 +--- a/src/hotspot/cpu/riscv/vm_version_riscv.cpp ++++ b/src/hotspot/cpu/riscv/vm_version_riscv.cpp +@@ -311,6 +311,11 @@ void VM_Version::c2_initialize() { + FLAG_SET_DEFAULT(UseRVVForBigIntegerShiftIntrinsics, false); + } + ++ // Base64 ++ if (FLAG_IS_DEFAULT(UseBASE64Intrinsics)) { ++ FLAG_SET_DEFAULT(UseBASE64Intrinsics, true); ++ } ++ + if (UseRVV) { + if (FLAG_IS_DEFAULT(MaxVectorSize)) { + MaxVectorSize = _initial_vector_length; diff --git a/Backport-JDK-8317720-RISC-V-Implement-Adler32-intrinsic.patch b/Backport-JDK-8317720-RISC-V-Implement-Adler32-intrinsic.patch new file mode 100644 index 0000000000000000000000000000000000000000..f14eb494334d84ea374384e726537b9808a7e246 --- /dev/null +++ b/Backport-JDK-8317720-RISC-V-Implement-Adler32-intrinsic.patch @@ -0,0 +1,327 @@ +diff --git a/src/hotspot/cpu/riscv/assembler_riscv.hpp b/src/hotspot/cpu/riscv/assembler_riscv.hpp +index 5d0291f62..c6db58cfa 100644 +--- a/src/hotspot/cpu/riscv/assembler_riscv.hpp ++++ b/src/hotspot/cpu/riscv/assembler_riscv.hpp +@@ -1504,6 +1504,10 @@ enum VectorMask { + INSN(vredmaxu_vs, 0b1010111, 0b010, 0b000110); + INSN(vredmax_vs, 0b1010111, 0b010, 0b000111); + ++ // Vector Widening Integer Reduction Instructions ++ INSN(vwredsum_vs, 0b1010111, 0b000, 0b110001); ++ INSN(vwredsumu_vs, 0b1010111, 0b000, 0b110000); ++ + // Vector Floating-Point Compare Instructions + INSN(vmfle_vv, 0b1010111, 0b001, 0b011001); + INSN(vmflt_vv, 0b1010111, 0b001, 0b011011); +@@ -1541,6 +1545,10 @@ enum VectorMask { + INSN(vmulh_vv, 0b1010111, 0b010, 0b100111); + INSN(vmul_vv, 0b1010111, 0b010, 0b100101); + ++ // Vector Widening Integer Multiply Instructions ++ INSN(vwmul_vv, 0b1010111, 0b010, 0b111011); ++ INSN(vwmulu_vv, 0b1010111, 0b010, 0b111000); ++ + // Vector Integer Min/Max Instructions + INSN(vmax_vv, 0b1010111, 0b000, 0b000111); + INSN(vmaxu_vv, 0b1010111, 0b000, 0b000110); +diff --git a/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp b/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp +index 502010347..1e93cf2b0 100644 +--- a/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp ++++ b/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp +@@ 
-5260,6 +5260,262 @@ class StubGenerator: public StubCodeGenerator { + return (address) start; + } + ++ void adler32_process_bytes(Register buff, Register s1, Register s2, VectorRegister vtable, ++ VectorRegister vzero, VectorRegister vbytes, VectorRegister vs1acc, VectorRegister vs2acc, ++ Register temp0, Register temp1, Register temp2, Register temp3, ++ VectorRegister vtemp1, VectorRegister vtemp2, int step, Assembler::LMUL lmul) { ++ ++ assert((lmul == Assembler::m4 && step == 64) || ++ (lmul == Assembler::m2 && step == 32) || ++ (lmul == Assembler::m1 && step == 16), ++ "LMUL should be aligned with step: m4 and 64, m2 and 32 or m1 and 16"); ++ // Below is function for calculating Adler32 checksum with 64-, 32- or 16-byte step. LMUL=m4, m2 or m1 is used. ++ // The results are in v12, v13, ..., v22, v23. Example below is for 64-byte step case. ++ // We use b1, b2, ..., b64 to denote the 64 bytes loaded in each iteration. ++ // In non-vectorized code, we update s1 and s2 as: ++ // s1 <- s1 + b1 ++ // s2 <- s2 + s1 ++ // s1 <- s1 + b2 ++ // s2 <- s2 + b1 ++ // ... ++ // s1 <- s1 + b64 ++ // s2 <- s2 + s1 ++ // Putting above assignments together, we have: ++ // s1_new = s1 + b1 + b2 + ... + b64 ++ // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b64) = ++ // = s2 + s1 * 64 + (b1 * 64 + b2 * 63 + ... + b64 * 1) = ++ // = s2 + s1 * 64 + (b1, b2, ... b64) dot (64, 63, ... 1) ++ ++ __ mv(temp3, step); ++ // Load data ++ __ vsetvli(temp0, temp3, Assembler::e8, lmul); ++ __ vle8_v(vbytes, buff); ++ __ addi(buff, buff, step); ++ ++ // Upper bound reduction sum for s1_new: ++ // 0xFF * 64 = 0x3FC0, so: ++ // 1. Need to do vector-widening reduction sum ++ // 2. It is safe to perform sign-extension during vmv.x.s with 16-bits elements ++ __ vwredsumu_vs(vs1acc, vbytes, vzero); ++ // Multiplication for s2_new ++ __ vwmulu_vv(vs2acc, vtable, vbytes); ++ ++ // s2 = s2 + s1 * log2(step) ++ __ slli(temp1, s1, exact_log2(step)); ++ __ add(s2, s2, temp1); ++ ++ // Summing up calculated results for s2_new ++ if (MaxVectorSize > 16) { ++ __ vsetvli(temp0, temp3, Assembler::e16, lmul); ++ } else { ++ // Half of vector-widening multiplication result is in successor of vs2acc ++ // group for vlen == 16, in which case we need to double vector register ++ // group width in order to reduction sum all of them ++ Assembler::LMUL lmulx2 = (lmul == Assembler::m1) ? Assembler::m2 : ++ (lmul == Assembler::m2) ? Assembler::m4 : Assembler::m8; ++ __ vsetvli(temp0, temp3, Assembler::e16, lmulx2); ++ } ++ // Upper bound for reduction sum: ++ // 0xFF * (64 + 63 + ... + 2 + 1) = 0x817E0 max for whole register group, so: ++ // 1. Need to do vector-widening reduction sum ++ // 2. 
It is safe to perform sign-extension during vmv.x.s with 32-bits elements ++ __ vwredsumu_vs(vtemp1, vs2acc, vzero); ++ ++ // Extracting results for: ++ // s1_new ++ __ vmv_x_s(temp0, vs1acc); ++ __ add(s1, s1, temp0); ++ // s2_new ++ __ vsetvli(temp0, temp3, Assembler::e32, Assembler::m1); ++ __ vmv_x_s(temp1, vtemp1); ++ __ add(s2, s2, temp1); ++ } ++ ++ /*** ++ * int java.util.zip.Adler32.updateBytes(int adler, byte[] b, int off, int len) ++ * ++ * Arguments: ++ * ++ * Inputs: ++ * c_rarg0 - int adler ++ * c_rarg1 - byte* buff (b + off) ++ * c_rarg2 - int len ++ * ++ * Output: ++ * c_rarg0 - int adler result ++ */ ++ address generate_updateBytesAdler32() { ++ __ align(CodeEntryAlignment); ++ StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32"); ++ address start = __ pc(); ++ ++ Label L_nmax, L_nmax_loop, L_nmax_loop_entry, L_by16, L_by16_loop, ++ L_by16_loop_unroll, L_by1_loop, L_do_mod, L_combine, L_by1; ++ ++ // Aliases ++ Register adler = c_rarg0; ++ Register s1 = c_rarg0; ++ Register s2 = c_rarg3; ++ Register buff = c_rarg1; ++ Register len = c_rarg2; ++ Register nmax = c_rarg4; ++ Register base = c_rarg5; ++ Register count = c_rarg6; ++ Register temp0 = x28; // t3 ++ Register temp1 = x29; // t4 ++ Register temp2 = x30; // t5 ++ Register temp3 = x31; // t6 ++ ++ VectorRegister vzero = v31; ++ VectorRegister vbytes = v8; // group: v8, v9, v10, v11 ++ VectorRegister vs1acc = v12; // group: v12, v13, v14, v15 ++ VectorRegister vs2acc = v16; // group: v16, v17, v18, v19, v20, v21, v22, v23 ++ VectorRegister vtable_64 = v24; // group: v24, v25, v26, v27 ++ VectorRegister vtable_32 = v4; // group: v4, v5 ++ VectorRegister vtable_16 = v30; ++ VectorRegister vtemp1 = v28; ++ VectorRegister vtemp2 = v29; ++ ++ // Max number of bytes we can process before having to take the mod ++ // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 ++ const uint64_t BASE = 0xfff1; ++ const uint64_t NMAX = 0x15B0; ++ ++ // Loops steps ++ int step_64 = 64; ++ int step_32 = 32; ++ int step_16 = 16; ++ int step_1 = 1; ++ ++ __ enter(); // Required for proper stackwalking of RuntimeStub frame ++ __ mv(temp1, 64); ++ __ vsetvli(temp0, temp1, Assembler::e8, Assembler::m4); ++ ++ // Generating accumulation coefficients for further calculations ++ // vtable_64: ++ __ vid_v(vtemp1); ++ __ vrsub_vx(vtable_64, vtemp1, temp1); ++ // vtable_64 group now contains { 0x40, 0x3f, 0x3e, ..., 0x3, 0x2, 0x1 } ++ ++ // vtable_32: ++ __ mv(temp1, 32); ++ __ vsetvli(temp0, temp1, Assembler::e8, Assembler::m2); ++ __ vid_v(vtemp1); ++ __ vrsub_vx(vtable_32, vtemp1, temp1); ++ // vtable_32 group now contains { 0x20, 0x1f, 0x1e, ..., 0x3, 0x2, 0x1 } ++ ++ __ vsetivli(temp0, 16, Assembler::e8, Assembler::m1); ++ // vtable_16: ++ __ mv(temp1, 16); ++ __ vid_v(vtemp1); ++ __ vrsub_vx(vtable_16, vtemp1, temp1); ++ // vtable_16 now contains { 0x10, 0xf, 0xe, ..., 0x3, 0x2, 0x1 } ++ ++ __ vmv_v_i(vzero, 0); ++ ++ __ mv(base, BASE); ++ __ mv(nmax, NMAX); ++ ++ // s1 is initialized to the lower 16 bits of adler ++ // s2 is initialized to the upper 16 bits of adler ++ __ srliw(s2, adler, 16); // s2 = ((adler >> 16) & 0xffff) ++ __ zero_extend(s1, adler, 16); // s1 = (adler & 0xffff) ++ ++ // The pipelined loop needs at least 16 elements for 1 iteration ++ // It does check this, but it is more effective to skip to the cleanup loop ++ __ mv(temp0, step_16); ++ __ bgeu(len, temp0, L_nmax); ++ __ beqz(len, L_combine); ++ ++ // Jumping to L_by1_loop ++ __ sub(len, len, step_1); ++ __ j(L_by1_loop); ++ 
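// ---- editorial sketch (illustration only, not part of the patch) ----
// A scalar reference for the blocked update the vector rounds above perform:
// for an n-byte block, s1 grows by the plain byte sum and s2 by n*s1 plus the
// weighted sum b[0]*n + b[1]*(n-1) + ... + b[n-1]*1, i.e. the dot product
// with the {n, n-1, ..., 1} coefficients generated into the vtable register
// groups. A hypothetical helper:
#include <cstdint>

static void adler_block(uint32_t& s1, uint32_t& s2, const uint8_t* b, int n) {
  const uint32_t BASE = 65521;     // 0xfff1, as in the stub
  s2 += s1 * (uint32_t)n;          // s2 gains s1 once per byte in the block
  for (int i = 0; i < n; i++) {
    s1 += b[i];
    s2 += (uint32_t)b[i] * (uint32_t)(n - i);  // weights n, n-1, ..., 1
  }
  s1 %= BASE;                      // the stub defers this until NMAX bytes
  s2 %= BASE;
}
// ---- end editorial sketch ----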
++ __ bind(L_nmax);
++ __ sub(len, len, nmax);
++ __ sub(count, nmax, 16);
++ __ bltz(len, L_by16);
++
++ // Align L_nmax loop by 64
++ __ bind(L_nmax_loop_entry);
++ __ sub(count, count, 32);
++
++ __ bind(L_nmax_loop);
++ adler32_process_bytes(buff, s1, s2, vtable_64, vzero,
++ vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
++ vtemp1, vtemp2, step_64, Assembler::m4);
++ __ sub(count, count, step_64);
++ __ bgtz(count, L_nmax_loop);
++
++ // Three 16-byte iterations' worth (32 + 16 = 48 bytes) are left to do
++ adler32_process_bytes(buff, s1, s2, vtable_32, vzero,
++ vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
++ vtemp1, vtemp2, step_32, Assembler::m2);
++ adler32_process_bytes(buff, s1, s2, vtable_16, vzero,
++ vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
++ vtemp1, vtemp2, step_16, Assembler::m1);
++
++ // s1 = s1 % BASE
++ __ remuw(s1, s1, base);
++ // s2 = s2 % BASE
++ __ remuw(s2, s2, base);
++
++ __ sub(len, len, nmax);
++ __ sub(count, nmax, 16);
++ __ bgez(len, L_nmax_loop_entry);
++
++ __ bind(L_by16);
++ __ add(len, len, count);
++ __ bltz(len, L_by1);
++ // Trying to unroll
++ __ mv(temp3, step_64);
++ __ blt(len, temp3, L_by16_loop);
++
++ __ bind(L_by16_loop_unroll);
++ adler32_process_bytes(buff, s1, s2, vtable_64, vzero,
++ vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
++ vtemp1, vtemp2, step_64, Assembler::m4);
++ __ sub(len, len, step_64);
++ // By now, temp3 should still be 64
++ __ bge(len, temp3, L_by16_loop_unroll);
++
++ __ bind(L_by16_loop);
++ adler32_process_bytes(buff, s1, s2, vtable_16, vzero,
++ vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
++ vtemp1, vtemp2, step_16, Assembler::m1);
++ __ sub(len, len, step_16);
++ __ bgez(len, L_by16_loop);
++
++ __ bind(L_by1);
++ __ add(len, len, 15);
++ __ bltz(len, L_do_mod);
++
++ __ bind(L_by1_loop);
++ __ lbu(temp0, Address(buff, 0));
++ __ addi(buff, buff, step_1);
++ __ add(s1, temp0, s1);
++ __ add(s2, s2, s1);
++ __ sub(len, len, step_1);
++ __ bgez(len, L_by1_loop);
++
++ __ bind(L_do_mod);
++ // s1 = s1 % BASE
++ __ remuw(s1, s1, base);
++ // s2 = s2 % BASE
++ __ remuw(s2, s2, base);
++
++ // Combine lower bits and higher bits
++ // adler = s1 | (s2 << 16)
++ __ bind(L_combine);
++ __ slli(s2, s2, 16);
++ __ orr(s1, s1, s2);
++
++ __ leave(); // Required for proper stackwalking of RuntimeStub frame
++ __ ret();
++
++ return start;
++ }
++
+ #endif // COMPILER2_OR_JVMCI
+
+ /**
+@@ -5872,6 +6128,10 @@ static const int64_t right_3_bits = right_n_bits(3);
+ StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
+ }
+
++ if (UseAdler32Intrinsics) {
++ StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
++ }
++
+ #endif // COMPILER2_OR_JVMCI
+ }
+
+diff --git a/src/hotspot/cpu/riscv/vm_version_riscv.cpp b/src/hotspot/cpu/riscv/vm_version_riscv.cpp
+index 4f4f0ebf9..03e45da7f 100644
+--- a/src/hotspot/cpu/riscv/vm_version_riscv.cpp
++++ b/src/hotspot/cpu/riscv/vm_version_riscv.cpp
+@@ -201,6 +201,18 @@ void VM_Version::initialize() {
+ FLAG_SET_DEFAULT(UseZvbc, false);
+ }
+
++ // Adler32
++ if (UseRVV) {
++ if (FLAG_IS_DEFAULT(UseAdler32Intrinsics)) {
++ FLAG_SET_DEFAULT(UseAdler32Intrinsics, true);
++ }
++ } else if (UseAdler32Intrinsics) {
++ if (!FLAG_IS_DEFAULT(UseAdler32Intrinsics)) {
++ warning("Adler32 intrinsic requires RVV instructions (not available on this CPU).");
++ }
++ FLAG_SET_DEFAULT(UseAdler32Intrinsics, false);
++ }
++
+ // ChaCha20
+ if (UseRVV && MaxVectorSize >= 32) {
+ // performance tests on hardwares (MaxVectorSize == 16, 32) show that
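The NMAX bound quoted in the stub's comments is easiest to check against the plain scalar form of the checksum that the vector code reproduces. The sketch below is illustrative only (the class and method names are ours, not part of the patch); it models java.util.zip.Adler32.updateBytes, the method this stub intrinsifies, using long accumulators so the deferred modulo cannot overflow Java's signed int:

    // AdlerScalar.java -- scalar model of the arithmetic vectorized above.
    final class AdlerScalar {
        static int update(int adler, byte[] b, int off, int len) {
            final long BASE = 0xfff1L; // 65521, largest prime below 2^16
            final int NMAX = 0x15B0;   // 5552, bytes per deferred-modulo block
            long s1 = adler & 0xffff;          // low 16 bits of the checksum
            long s2 = (adler >>> 16) & 0xffff; // high 16 bits of the checksum
            while (len > 0) {
                int n = Math.min(len, NMAX);
                len -= n;
                for (int i = 0; i < n; i++) {
                    s1 += b[off++] & 0xff; // s1_new = s1 + byte
                    s2 += s1;              // s2_new = s2 + s1_new
                }
                // Before this reduction, s2 <= 255*n*(n+1)/2 + (n+1)*(BASE-1),
                // which stays below 2^32 - 1 for n <= 5552, so one modulo per
                // block is enough -- exactly the bound cited in the stub.
                s1 %= BASE;
                s2 %= BASE;
            }
            return (int) ((s2 << 16) | s1);
        }
    }

The vtable_64/vtable_32/vtable_16 coefficient vectors in the stub come from unrolling the inner loop: over a block of k bytes, the first byte is added into s2 k times and the last byte once, so the block contributes the dot product of { k, k-1, ..., 1 } with the bytes, matching the { 0x40, ..., 0x1 } contents generated with vid_v/vrsub_vx.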
diff --git a/Backport-JDK-8322209-8322179-8329641-RISC-V-Enable-sha-md5-tests.patch b/Backport-JDK-8322209-8322179-8329641-RISC-V-Enable-sha-md5-tests.patch index 84e6b279777d6e4179ced1f8975337f388f8e470..b66a23fba8fa062da634ed59936f4da76ea59eda 100644 --- a/Backport-JDK-8322209-8322179-8329641-RISC-V-Enable-sha-md5-tests.patch +++ b/Backport-JDK-8322209-8322179-8329641-RISC-V-Enable-sha-md5-tests.patch @@ -1,24 +1,3 @@ -diff --git a/test/hotspot/jtreg/compiler/intrinsics/sha/cli/TestUseMD5IntrinsicsOptionOnUnsupportedCPU.java b/test/hotspot/jtreg/compiler/intrinsics/sha/cli/TestUseMD5IntrinsicsOptionOnUnsupportedCPU.java -index e9ae2f6c1..cd5933ec9 100644 ---- a/test/hotspot/jtreg/compiler/intrinsics/sha/cli/TestUseMD5IntrinsicsOptionOnUnsupportedCPU.java -+++ b/test/hotspot/jtreg/compiler/intrinsics/sha/cli/TestUseMD5IntrinsicsOptionOnUnsupportedCPU.java -@@ -39,6 +39,7 @@ package compiler.intrinsics.sha.cli; - - import compiler.intrinsics.sha.cli.testcases.GenericTestCaseForOtherCPU; - import compiler.intrinsics.sha.cli.testcases.GenericTestCaseForUnsupportedAArch64CPU; -+import compiler.intrinsics.sha.cli.testcases.GenericTestCaseForUnsupportedRISCV64CPU; - import compiler.intrinsics.sha.cli.testcases.GenericTestCaseForUnsupportedX86CPU; - import compiler.intrinsics.sha.cli.testcases.UseSHAIntrinsicsSpecificTestCaseForUnsupportedCPU; - -@@ -49,6 +50,8 @@ public class TestUseMD5IntrinsicsOptionOnUnsupportedCPU { - DigestOptionsBase.USE_MD5_INTRINSICS_OPTION, /* checkUseSHA = */ false), - new GenericTestCaseForUnsupportedAArch64CPU( - DigestOptionsBase.USE_MD5_INTRINSICS_OPTION, /* checkUseSHA = */ false), -+ new GenericTestCaseForUnsupportedRISCV64CPU( -+ DigestOptionsBase.USE_MD5_INTRINSICS_OPTION, /* checkUseSHA = */ false), - new GenericTestCaseForOtherCPU( - DigestOptionsBase.USE_MD5_INTRINSICS_OPTION, /* checkUseSHA = */ false)).test(); - } diff --git a/test/hotspot/jtreg/compiler/testlibrary/sha/predicate/IntrinsicPredicates.java b/test/hotspot/jtreg/compiler/testlibrary/sha/predicate/IntrinsicPredicates.java index 689c7c8cc..27fe99892 100644 --- a/test/hotspot/jtreg/compiler/testlibrary/sha/predicate/IntrinsicPredicates.java diff --git a/huawei-posix_spawn-clock_gettime-clock_getres-__cxa_thread_.patch b/huawei-posix_spawn-clock_gettime-clock_getres-__cxa_thread_.patch index 8c0c7ec9e2510574744262ca1184f9f85dd19839..8d6264d8f93316497df2303e4b19a3f22a90b671 100644 --- a/huawei-posix_spawn-clock_gettime-clock_getres-__cxa_thread_.patch +++ b/huawei-posix_spawn-clock_gettime-clock_getres-__cxa_thread_.patch @@ -835,21 +835,25 @@ diff --git a/src/hotspot/share/runtime/threads.cpp b/src/hotspot/share/runtime/t index 8b7058405..0df523f76 100644 --- a/src/hotspot/share/runtime/threads.cpp +++ b/src/hotspot/share/runtime/threads.cpp -@@ -58,6 +58,7 @@ +@@ -58,6 +58,9 @@ #include "oops/symbol.hpp" #include "prims/jvmtiAgentList.hpp" #include "prims/jvm_misc.hpp" ++#if defined(AARCH64) || defined(AMD64) +#include "prims/upcallLinker.hpp" ++#endif // AARCH64 || AMD64 #include "runtime/arguments.hpp" #include "runtime/fieldDescriptor.inline.hpp" #include "runtime/flags/jvmFlagLimit.hpp" -@@ -424,6 +425,9 @@ jint Threads::create_vm(JavaVMInitArgs* args, bool* canTryAgain) { +@@ -424,6 +425,11 @@ jint Threads::create_vm(JavaVMInitArgs* args, bool* canTryAgain) { // Initialize library-based TLS ThreadLocalStorage::init(); - + ++#if defined(AARCH64) || defined(AMD64) + // Initialize ThreadLocalUpCall + ThreadLocalUpCall::init(); + ++#endif // AARCH64 || AMD64 // Initialize the output stream 
ostream_init();
diff --git a/openjdk-21.spec b/openjdk-21.spec
index 5378f6732a1262c7b2b21583e8d65b9a8b202992..91a1d6f368ca573503b73ece61615b9415b42cc9 100644
--- a/openjdk-21.spec
+++ b/openjdk-21.spec
@@ -905,7 +905,8 @@ Name: java-21-%{origin}
Version: %{newjavaver}.%{buildver}
# This package needs `.rolling` as part of Release so as to not conflict on install with
# java-X-openjdk. I.e. when latest rolling release is also an LTS release packaged as
-Release: 0
+Release: 1
+
# java-1.5.0-ibm from jpackage.org set Epoch to 1 for unknown reasons
# and this change was brought into RHEL-4. java-1.5.0-ibm packages
@@ -1068,6 +1069,8 @@
Patch3012: Backport-JDK-8315716-RISC-V-implement-ChaCha20-intrinsic.patch
Patch3013: Backport-JDK-8316592-RISC-V-implement-poly1305-intrinsic.patch
Patch3014: Backport-JDK-8317721-RISC-V-Implement-CRC32-intrinsic.patch
Patch3015: Backport-JDK-8347981-RISC-V-implement-Add-Zfa-zli-imm-loads.patch
+Patch3016: Backport-JDK-8317720-RISC-V-Implement-Adler32-intrinsic.patch
+Patch3017: Backport-JDK-8314125-RISC-V-implement-Base64-intrinsic.patch
BuildRequires: autoconf
BuildRequires: automake
@@ -1377,6 +1380,8 @@ pushd %{top_level_dir_name}
%patch3013 -p1
%patch3014 -p1
%patch3015 -p1
+%patch3016 -p1
+%patch3017 -p1
popd
%endif
@@ -1934,12 +1939,15 @@ cjc.mainProgram(args) -- the returns from copy_jdk_configs.lua should not affect
%changelog
+* Mon Oct 27 2025 zhangshihui - 1:21.0.9.10-1
+- RISC-V: add Adler32 and Base64 intrinsics
+
* Fri Oct 17 2025 Benshuai5D - 1:21.0.9.10-0
- update to jdk21.0.9-ga
- delete Backport-JDK-8339460-CDS-error-when-module-is-locate.patch
- delete Backport-JDK-7036144-GZIPInputStream-readTrailer-use.patch
- modify huawei-Add-KAE-zip-feature.patch
-- add huawei-posix_spawn-clock_gettime-clock_getres-__cxa_thread_.patch 
+- add huawei-posix_spawn-clock_gettime-clock_getres-__cxa_thread_.patch

* Fri Oct 17 2025 zhangzejian - 1:21.0.8.9-7
- RISC-V Add Zfa zli imm loads
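With the spec wiring above in place, the intrinsic can be sanity-checked on RVV hardware. The class below is a hypothetical smoke test, not part of the patch set; only the UseAdler32Intrinsics and UseRVV flag names come from the vm_version_riscv.cpp hunk:

    // Adler32Smoke.java -- illustrative check that the intrinsified path
    // agrees with the default implementation.
    //   java -XX:+PrintFlagsFinal -version | grep UseAdler32Intrinsics
    //   java Adler32Smoke                             (intrinsic on by default when UseRVV is set)
    //   java -XX:-UseAdler32Intrinsics Adler32Smoke   (forces the non-intrinsic path)
    import java.util.zip.Adler32;

    public class Adler32Smoke {
        public static void main(String[] args) {
            byte[] data = new byte[1 << 20]; // 1 MiB, enough to exercise the NMAX loop
            for (int i = 0; i < data.length; i++) {
                data[i] = (byte) i;
            }
            Adler32 a = new Adler32();
            a.update(data, 0, data.length);
            // The printed value must be identical with and without the flag.
            System.out.printf("adler32 = 0x%08x%n", a.getValue());
        }
    }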