diff --git a/common/inc/pwrerr.h b/common/inc/pwrerr.h
index 19166a04c6701fbb86c9829af0b5c18fef706c2d..68dda32c6e8eb113b61282d90a8a918259fbbfc5 100644
--- a/common/inc/pwrerr.h
+++ b/common/inc/pwrerr.h
@@ -64,5 +64,6 @@ enum PWR_RtnCode {
     PWR_ERR_FILE_OPEN_FAILED,
     PWR_ERR_FILE_SPRINTF_FAILED,
     PWR_ERR_HBM_NOT_SUPPORTED = 600,
+    PWR_ERR_HBM_SET_POWER_STATE_FAILED = 601,
 };
 #endif
\ No newline at end of file
diff --git a/common/src/pwrbuffer.c b/common/src/pwrbuffer.c
index fcc7688bf9ef0911b3f02ec9a68c7d1764d1b7ba..3a772bfbe8eba8555e41b4729d69c4ac65ae089b 100644
--- a/common/src/pwrbuffer.c
+++ b/common/src/pwrbuffer.c
@@ -18,7 +18,7 @@
 #include 
 #include "pwrerr.h"
 
-#define WAITING_RESULT_TIME_OUT 30 // second
+#define WAITING_RESULT_TIME_OUT 150 // second
 #define USEC_TO_NSEC 1000
 // queue private
 static void DeleteFromHead(PwrMsgBuffer *smb)
diff --git a/pwrapic/src/powerapi.c b/pwrapic/src/powerapi.c
index a3dc5465e4f07d64515446af0682fce37238a627..947413ff817b7c27799bc698b531158e58d1e99b 100644
--- a/pwrapic/src/powerapi.c
+++ b/pwrapic/src/powerapi.c
@@ -392,6 +392,8 @@ int PWR_HBM_GetSysState(PWR_HBM_SYS_STATE *hbmState)
 {
     CHECK_STATUS(STATUS_REGISTERTED);
 
+    CHECK_NULL_POINTER(hbmState);
+
     return GetHbmSysState(hbmState);
 }
 
diff --git a/pwrapis/src/hbmservice.c b/pwrapis/src/hbmservice.c
index 2516f75903be3cca49297a987ef1de4e001af3b5..3c2f3dbac81b2bfb2d2e8920db38948ce914e106 100644
--- a/pwrapis/src/hbmservice.c
+++ b/pwrapis/src/hbmservice.c
@@ -13,6 +13,12 @@
  * Description: provide hbm service
  * **************************************************************************** */
 
+#include <ctype.h>
+#include <dirent.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include "config.h"
 #include "string.h"
 #include "pwrerr.h"
 #include "server.h"
@@ -21,6 +27,11 @@
 #include "utils.h"
 #include "hbmservice.h"
 
+// Status polling budget: MAX_RETRY_COUNT probes, RETRY_INTERVAL_MS apart.
+#define MAX_RETRY_COUNT 50
+#define RETRY_INTERVAL_MS 100
+#define USEC_PER_MSEC 1000
+
 #define EXEC_COMMAND(cmd) \
     do { \
         FILE *fp = popen(cmd, "r"); \
@@ -30,32 +41,147 @@
         pclose(fp); \
     } while (0)
 
+// Returns 1 when the node at nodePath has an empty cpulist, 0 when it lists
+// CPUs or when the list cannot be read.
+static int IsNodeEmptyCpuList(const char *nodePath)
+{
+    char cpuListFile[MAX_FULL_NAME];
+    char cpuListBuf[256];
+    int isEmpty = 0;
+
+    int len = snprintf(cpuListFile, sizeof(cpuListFile), "%s/cpulist", nodePath);
+    if (len < 0 || len >= (int)sizeof(cpuListFile)) {
+        // A truncated path could name the wrong file; do not treat it as empty.
+        return 0;
+    }
+
+    FILE *cpuListFp = fopen(cpuListFile, "r");
+    if (cpuListFp == NULL) {
+        return 0;
+    }
+
+    // An empty list reads back as an empty string or a lone newline.
+    if (fgets(cpuListBuf, sizeof(cpuListBuf), cpuListFp) != NULL &&
+        (cpuListBuf[0] == '\0' || strcmp(cpuListBuf, "\n") == 0)) {
+        isEmpty = 1;
+    }
+
+    fclose(cpuListFp);
+    return isEmpty;
+}
+
+// Writes "offline" to <nodePath>/memory<N>/state for every memory block entry
+// found in nodePath. Returns PWR_SUCCESS, or PWR_ERR_COMMON when the directory
+// cannot be opened or a state file cannot be written.
+static int OfflineMemoryState(const char *nodePath)
+{
+    const size_t prefixLen = strlen("memory");
+    char memoryStateFile[MAX_FULL_NAME];
+    struct dirent *entry;
+    int result = PWR_SUCCESS;
+
+    DIR *dir = opendir(nodePath);
+    if (dir == NULL) {
+        Logger(ERROR, MD_NM_SVR_HBM, "Failed to open memory directory");
+        return PWR_ERR_COMMON;
+    }
+
+    while ((entry = readdir(dir)) != NULL) {
+        // Only memory<N> entries are memory blocks; skip anything else that
+        // merely shares the "memory" prefix.
+        if (strncmp(entry->d_name, "memory", prefixLen) != 0 ||
+            !isdigit((unsigned char)entry->d_name[prefixLen])) {
+            continue;
+        }
+
+        int len = snprintf(memoryStateFile, sizeof(memoryStateFile), "%s/%s/state", nodePath, entry->d_name);
+        if (len < 0 || len >= (int)sizeof(memoryStateFile)) {
+            Logger(ERROR, MD_NM_SVR_HBM, "Buffer overflow detected in memoryStateFile");
+            continue;
+        }
+        if (WriteFile(memoryStateFile, "offline", strlen("offline")) != PWR_SUCCESS) {
+            Logger(ERROR, MD_NM_SVR_HBM, "Failed to write to memory state file");
+            result = PWR_ERR_COMMON;
+            break; // leave through closedir() so the directory handle is not leaked
+        }
+    }
+
+    closedir(dir);
+    return result;
+}
+
+// Offlines the memory of every node under /sys/devices/system/node whose
+// cpulist is empty. A node that fails to go offline is skipped so the others
+// are still processed; only a failure to open the node directory is returned.
+static int OfflineAllHBMNode(void)
+{
+    char nodePath[MAX_FULL_NAME];
+    struct dirent *dirEntry;
+
+    DIR *dirPtr = opendir("/sys/devices/system/node");
+    if (dirPtr == NULL) {
+        Logger(ERROR, MD_NM_SVR_HBM, "Failed to open /sys/devices/system/node dir");
+        return PWR_ERR_FILE_OPEN_FAILED;
+    }
+
+    while ((dirEntry = readdir(dirPtr)) != NULL) {
+        if (strncmp(dirEntry->d_name, "node", strlen("node")) != 0) {
+            continue;
+        }
+
+        int len = snprintf(nodePath, sizeof(nodePath), "/sys/devices/system/node/%s", dirEntry->d_name);
+        if (len < 0 || len >= (int)sizeof(nodePath)) {
+            continue;
+        }
+
+        // Best effort: OfflineMemoryState() logs its own failures.
+        if (IsNodeEmptyCpuList(nodePath)) {
+            (void)OfflineMemoryState(nodePath);
+        }
+    }
+
+    closedir(dirPtr);
+    return PWR_SUCCESS;
+}
+
+// Reads the HBM mode from the MemoryOnChipMode EFI variable.
+// *state becomes PWR_HBM_CACHE_MOD when exactly one hexdump line matches the
+// pattern below, PWR_HBM_FLAT_MOD when none does, and stays
+// PWR_HBM_NOT_SUPPORT otherwise. Returns PWR_ERR_HBM_NOT_SUPPORTED when the
+// variable is absent, PWR_ERR_COMMON when the probe cannot be started, and
+// PWR_SUCCESS in every other case.
 static int GetHbmMode(PWR_HBM_SYS_STATE *state)
 {
-    char cache_mod_cmd[] = "find /sys/devices/LNXSYSTM* -name 'HISI04A1*'";
-    char flat_mod_cmd[] = "find /sys/devices/LNXSYSTM* -name 'PNP0C80*'";
     *state = PWR_HBM_NOT_SUPPORT;
 
-    FILE *cache_mod_fp = popen(cache_mod_cmd, "r");
-    if (cache_mod_fp == NULL) {
+    char hbmModeFile[] = "/sys/firmware/efi/efivars/MemoryOnChipMode-21f3b3c5-946d-41c1-838c-194e48aa41e2";
+    if (access(hbmModeFile, F_OK) != 0) {
+        return PWR_ERR_HBM_NOT_SUPPORTED;
+    }
+
+    char hbmCmd[] =
+        "hexdump /sys/firmware/efi/efivars/MemoryOnChipMode-21f3b3c5-946d-41c1-838c-194e48aa41e2 | grep '0000000 0007 0000 "
+        "0001' | wc -l";
+    FILE *fp = popen(hbmCmd, "r");
+    if (fp == NULL) {
+        Logger(ERROR, MD_NM_SVR_HBM, "Failed get hbm mode");
         return PWR_ERR_COMMON;
     }
-    char cache_buf[PWR_MAX_STRING_LEN] = {0};
-    if (fgets(cache_buf, PWR_MAX_STRING_LEN, cache_mod_fp) != NULL) {
-        *state |= PWR_HBM_FLAT_MOD;
-    }
-
-    FILE *flat_mod_fp = popen(flat_mod_cmd, "r");
-    if (flat_mod_fp == NULL) {
-        pclose(cache_mod_fp);
-        return PWR_ERR_COMMON;
-    }
-    char flat_buf[PWR_MAX_STRING_LEN] = {0};
-    if (fgets(flat_buf, PWR_MAX_STRING_LEN, flat_mod_fp) != NULL) {
-        *state |= PWR_HBM_CACHE_MOD;
-    }
-    pclose(cache_mod_fp);
-    pclose(flat_mod_fp);
+
+    char resultBuf[PWR_MAX_STRING_LEN] = {0};
+    if (fgets(resultBuf, sizeof(resultBuf), fp) != NULL) {
+        char *end = NULL;
+        long count = strtol(resultBuf, &end, 10);
+        // Unparsable output leaves *state at PWR_HBM_NOT_SUPPORT.
+        if (end != resultBuf && count == 0) {
+            *state = PWR_HBM_FLAT_MOD;
+        } else if (end != resultBuf && count == 1) {
+            *state = PWR_HBM_CACHE_MOD;
+        }
+    }
+
+    pclose(fp);
+
     return PWR_SUCCESS;
 }
 
@@ -76,38 +202,147 @@ void GetHbmSysState(PwrMsg *req)
     }
 }
 
+// Runs probeCmd (a find over the sysfs state files) and, when it prints
+// nothing, loads the kernel module with modprobeCmd. Returns PWR_SUCCESS when
+// the files exist or the module loaded, PWR_ERR_COMMON otherwise.
+static int LoadHbmModuleIfMissing(const char *probeCmd, const char *modprobeCmd)
+{
+    FILE *probeFp = popen(probeCmd, "r");
+    if (probeFp == NULL) {
+        Logger(ERROR, MD_NM_SVR_HBM, "Failed to run find command");
+        return PWR_ERR_COMMON;
+    }
+    int stateFilesMissing = (fgetc(probeFp) == EOF);
+    pclose(probeFp);
+
+    if (!stateFilesMissing) {
+        return PWR_SUCCESS;
+    }
+
+    Logger(INFO, MD_NM_SVR_HBM, "No hbm state files found, loading kernel module");
+    if (system(modprobeCmd) != 0) {
+        Logger(ERROR, MD_NM_SVR_HBM, "Failed to load hbm kernel module");
+        return PWR_ERR_COMMON;
+    }
+    return PWR_SUCCESS;
+}
+
+// Polls checkCmd every RETRY_INTERVAL_MS, at most MAX_RETRY_COUNT times, until
+// its first output line starts with "Success". Returns PWR_SUCCESS on a match,
+// PWR_ERR_COMMON when the command cannot be started, and
+// PWR_ERR_HBM_SET_POWER_STATE_FAILED when every attempt failed.
+static int WaitForHbmStatus(const char *checkCmd)
+{
+    for (int retryCount = 0; retryCount < MAX_RETRY_COUNT; retryCount++) {
+        FILE *fp = popen(checkCmd, "r");
+        if (fp == NULL) {
+            Logger(ERROR, MD_NM_SVR_HBM, "Failed to check hbm power state");
+            return PWR_ERR_COMMON;
+        }
+
+        char result[PWR_MAX_NAME_LEN] = {0};
+        int succeeded = fgets(result, sizeof(result), fp) != NULL &&
+                        strncmp(result, "Success", strlen("Success")) == 0;
+        pclose(fp);
+        if (succeeded) {
+            return PWR_SUCCESS;
+        }
+
+        usleep(RETRY_INTERVAL_MS * USEC_PER_MSEC); // milliseconds to microseconds
+    }
+
+    Logger(ERROR, MD_NM_SVR_HBM, "Failed to set hbm power state after retries");
+    return PWR_ERR_HBM_SET_POWER_STATE_FAILED;
+}
+
+// Switches the HBM cache on (powerState != 0) or off (powerState == 0) through
+// /sys/kernel/hbm_cache and waits until the status check reports success.
+static int HandleCacheMode(const int powerState)
+{
+    const char *stateStr = (powerState == 0) ? "offline" : "online";
+    // The check fails while any status line lacks '0' (offline) or '15' (online).
+    const char *checkCmd = (powerState == 0)
+        ? "find /sys/kernel/hbm_cache/*/firmware_node/status -type f | xargs -I {} cat {} 2>/dev/null | grep -q -v "
+          "'0' && echo \"Failure\" || echo \"Success\""
+        : "find /sys/kernel/hbm_cache/*/firmware_node/status -type f | xargs -I {} cat {} 2>/dev/null | grep -q -v "
+          "'15' && echo \"Failure\" || echo \"Success\"";
+    char cmd[PWR_MAX_STRING_LEN] = {0};
+
+    int ret = LoadHbmModuleIfMissing("find /sys/kernel/hbm_cache/*/state -type f", "modprobe hisi_hbmcache");
+    if (ret != PWR_SUCCESS) {
+        return ret;
+    }
+
+    snprintf(cmd, sizeof(cmd),
+             "find /sys/kernel/hbm_cache/*/state -type f | xargs -I {} sh -c "
+             "'echo \"%s\" > {}'",
+             stateStr);
+    EXEC_COMMAND(cmd);
+
+    return WaitForHbmStatus(checkCmd);
+}
+
+// Switches flat-mode HBM on or off through the PNP0A06 container devices.
+// Before going offline, the memory of every CPU-less node is offlined first.
+static int HandleFlatMode(const int powerState)
+{
+    const char *stateStr = (powerState == 0) ? "offline" : "online";
+    // The check fails while any status line lacks '13' (offline) or '15' (online).
+    const char *checkCmd = (powerState == 0)
+        ? "find /sys/devices/system/container/PNP0A06*/firmware_node/PNP0C80*/status -type f | xargs -I {} cat {} "
+          "2>/dev/null | grep -q -v '13' && echo \"Failure\" || echo \"Success\""
+        : "find /sys/devices/system/container/PNP0A06*/firmware_node/PNP0C80*/status -type f | xargs -I {} cat {} "
+          "2>/dev/null | grep -q -v '15' && echo \"Failure\" || echo \"Success\"";
+    char cmd[PWR_MAX_STRING_LEN] = {0};
+
+    int ret = LoadHbmModuleIfMissing("find /sys/devices/system/container/PNP0A06*/state -type f",
+                                     "modprobe hisi_hbmdev");
+    if (ret != PWR_SUCCESS) {
+        return ret;
+    }
+
+    // offline all memory
+    if (powerState == 0 && OfflineAllHBMNode() != PWR_SUCCESS) {
+        return PWR_ERR_COMMON;
+    }
+
+    // online/offline hbm node
+    snprintf(cmd, sizeof(cmd),
+             "find /sys/devices/system/container/PNP0A06*/state -type f | xargs -I {} sh -c 'echo "
+             "\"%s\" > {}'",
+             stateStr);
+    EXEC_COMMAND(cmd);
+
+    return WaitForHbmStatus(checkCmd);
+}
+
+// Applies powerState (0 = offline, otherwise online) using the handler that
+// matches the detected HBM mode. Returns the GetHbmMode() error when detection
+// fails, PWR_ERR_HBM_NOT_SUPPORTED when HBM is unsupported, the handler result
+// for cache/flat mode, and PWR_ERR_HBM_SET_POWER_STATE_FAILED for other modes.
 static int SetPowerState(int powerState)
 {
     PWR_HBM_SYS_STATE hbmState = PWR_HBM_NOT_SUPPORT;
-    if (GetHbmMode(&hbmState) != PWR_SUCCESS) {
+    int ret = GetHbmMode(&hbmState);
+    if (ret != PWR_SUCCESS) {
         Logger(ERROR, MD_NM_SVR_HBM, "GetHbmMode failed");
-        return PWR_ERR_COMMON;
+        return ret; // keep "not supported" distinct from a probe failure
     }
     if (hbmState == PWR_HBM_NOT_SUPPORT) {
         Logger(ERROR, MD_NM_SVR_HBM, "SetHbmAllPowerState: HBM is not support");
         return PWR_ERR_HBM_NOT_SUPPORTED;
     }
 
-    const char *state_str = (powerState == 0) ? "offline" : "online";
-    if (hbmState == PWR_HBM_CACHE_MOD || hbmState == PWR_HBM_HYBRID_MOD) {
-        char cmd[PWR_MAX_STRING_LEN] = {0};
-        snprintf(
-            cmd, sizeof(cmd),
-            "find /sys/kernel/hbm_cache/*/state -type f | xargs -I {} sh -c 'echo \"%s\" > {}'",
-            state_str);
-        EXEC_COMMAND(cmd);
+    if (hbmState == PWR_HBM_CACHE_MOD) {
+        return HandleCacheMode(powerState);
     }
 
-    if (hbmState == PWR_HBM_FLAT_MOD || hbmState == PWR_HBM_HYBRID_MOD) {
-        char cmd[PWR_MAX_STRING_LEN] = {0};
-        snprintf(cmd, sizeof(cmd),
-                 "find /sys/devices/system/container/PNP0A06*/state -type f | xargs -I {} sh -c 'echo "
-                 "\"%s\" > {}'",
-                 state_str);
-        EXEC_COMMAND(cmd);
+    if (hbmState == PWR_HBM_FLAT_MOD) {
+        return HandleFlatMode(powerState);
     }
 
-    return PWR_SUCCESS;
+    // No handler exists for any other mode.
+    return PWR_ERR_HBM_SET_POWER_STATE_FAILED;
 }
 
 void SetHbmAllPowerState(PwrMsg *req)