diff --git a/profiler/affinity_cpu_bind/README.md b/profiler/affinity_cpu_bind/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f51b6c21f85495ee8f2de62de3c11e8b9c154e83 --- /dev/null +++ b/profiler/affinity_cpu_bind/README.md @@ -0,0 +1,24 @@ +### **昇腾亲和性CPU绑核工具** + +### **介绍** +昇腾亲和性CPU绑核工具支持用户无需侵入式修改工程,直接运行工具即可实现按亲和性策略绑核,提升推理或训练性能。 + +### **使用方式** +1.命令行输入python3 bind_core.py -app/--application="inference/train cmd"(如果命令含多个参数,放在双引号中) +该方式会在拉起任务后,监测任务进程,并实施绑核,直至任务进程结束。 + +2.推理或训练任务已经拉起,命令行输入python3 bind_core.py。该方式会循环查找使用到NPU卡的任务进程,并实施绑核。 + +### **使用须知** +1.该脚本会在拉起后查找使用到NPU卡的进程,每次查找失败后等待5s再重试,最多查找5次。如果找不到进程,会超时退出。 + +2.使用工具前应提前安装pstree工具,参考命令yum install -y psmisc或apt -y install psmisc。 + +3.使用前手动执行npu-smi info -t topo,出现如下样例结果,说明环境支持绑核,否则请将环境驱动包升级到Ascend HDK 23.0.RC2以上版本。 + +![输入图片说明](image.png) + + + + + diff --git a/profiler/affinity_cpu_bind/bind_core.py b/profiler/affinity_cpu_bind/bind_core.py new file mode 100644 index 0000000000000000000000000000000000000000..46a7eb1b06c8cc3389986744937df75697a96c18 --- /dev/null +++ b/profiler/affinity_cpu_bind/bind_core.py @@ -0,0 +1,248 @@ +#! /usr/bin/python3 +# Copyright 2023 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
"""Bind the processes that use Ascend NPUs to the CPU cores affine to each NPU.

The script optionally launches a training/inference command, then repeatedly
looks up the processes running on every NPU (via ``npu-smi``), walks their
process trees (via ``pstree``) and pins them with ``taskset`` to the CPU range
that ``npu-smi info -t topo`` reports for that NPU. Every step is recorded in
a time-stamped log file in the current working directory.
"""

import argparse
import datetime
import os
import re
import subprocess
import sys
import time

# NOTE: "NUP_IDS" is a historical misspelling of "NPU_IDS"; the name is kept
# so that anything referring to this module-level list keeps working.
NUP_IDS = []
# {npu_id: [pid, ...]} - processes of the current user found on each NPU
RUNNING_PIDS = {}
# {npu_id: 'start-end'} - CPU range bound to each NPU
NPU_CPU_DICT = {}
BIND_CORES_WAIT_TIME = 0
RUNNING_USER_NAME = ''
# pid of the shell that runs the command given with -app (None if not used)
RUNNING_CMD_PID = None
# Popen handle of that shell; polled in the main loop so that the child is
# reaped instead of lingering as a zombie
LAUNCHED_PROCESS = None

# seconds to let a freshly launched application start before looking for it
APP_STARTUP_WAIT_TIME = 10
# how often / how long to retry when no process is found on any NPU
FIND_PROCESS_RETRY_TIMES = 5
FIND_PROCESS_RETRY_INTERVAL = 5
# pause between two scan-and-bind rounds, so the tool does not spin
BIND_CORES_POLL_INTERVAL = 1

# a plain CPU range such as "0-23"
CPU_RANGE_PATTERN = re.compile(r'(\d+)-(\d+)')
# "(1234)" as printed by `pstree -p`
PSTREE_PID_PATTERN = re.compile(r'\((\d+)\)')
# lines of `LC_ALL=C lscpu`; recent util-linux indents the NUMA line
LSCPU_CPU_PATTERN = re.compile(r'^[ \t]*CPU\(s\):[ \t]*(\d+)', re.MULTILINE)
LSCPU_NUMA_PATTERN = re.compile(r'^[ \t]*NUMA node\(s\):[ \t]*(\d+)', re.MULTILINE)

# binding core log file (name fields are deliberately not zero-padded, as before)
nowtime = datetime.datetime.now()
BIND_CORE_RESULT_FILE = 'bind_core_{}.txt'.format('_'.join(
    str(part) for part in (nowtime.year, nowtime.month, nowtime.day,
                           nowtime.hour, nowtime.minute, nowtime.second)))
# stdout as it was when the script started; final messages are written here
tmp = sys.stdout

PRINT_FILE = open(BIND_CORE_RESULT_FILE, 'w', encoding='utf-8')


def run_cmd(cmd):
    """Run an external command and return its stripped, decoded stdout.

    Args:
        cmd: the command as an argument list (no shell is involved).

    Returns:
        The command's stdout as a string. An empty string is returned when
        the executable cannot be started or prints nothing.
    """
    try:
        # LC_ALL=C keeps the tools' output in the untranslated format that
        # the parsers in this file expect.
        proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                              env=dict(os.environ, LC_ALL='C'))
    except OSError:
        return ''
    return proc.stdout.decode('utf-8', errors='replace').strip()


# get process user
def get_process_user(pid):
    """Return the name of the user owning process *pid* ('' if unknown).

    `ps -o user= -p <pid>` is used instead of grepping `ps -ef`, because a
    grep for the pid digits also matches other pids, parent pids and the
    grep process itself, and could therefore report the wrong user.
    """
    fields = run_cmd(['ps', '-o', 'user=', '-p', str(pid)]).split()
    return fields[0] if fields else ''


# print log to stdout
def print_log_to_stdout_and_exit(msg):
    """Print *msg* on the script's original stdout and terminate the script.

    Raises:
        SystemExit: always (exit status 0, as before).
    """
    print(msg, file=tmp)
    sys.exit()


# print log to logfile
def print_log_to_file(msg):
    """Append *msg* to the log file and flush it immediately.

    The tool normally runs until it is killed or finds no more work, so each
    line is flushed to make sure it is not lost in the file buffer.
    """
    print(msg, file=PRINT_FILE, flush=True)


# launch training or inference process
def launch_process(cmd):
    """Start the user's training/inference command in the background.

    Args:
        cmd: the complete command line. It is the operator's own input and is
            handed to the shell on purpose so that shell syntax keeps working.
    """
    global RUNNING_CMD_PID
    global LAUNCHED_PROCESS
    print_log_to_file('[INFO] Start to execute cmd: {}'.format(cmd))
    LAUNCHED_PROCESS = subprocess.Popen(cmd, shell=True)
    RUNNING_CMD_PID = LAUNCHED_PROCESS.pid


# parse input cmd
def args_parse(argv=None):
    """Parse the command line, launch the application and apply the wait time.

    Args:
        argv: argument list to parse; ``None`` (the default) parses
            ``sys.argv[1:]``.
    """
    global BIND_CORES_WAIT_TIME
    parser = argparse.ArgumentParser(description='This is a sample program.')
    parser.add_argument('-t', '--time', type=int, metavar='', nargs='+',
                        help='Wait time before bind cores that you want to set. The unit is \'s\'')
    parser.add_argument('-app', '--application', metavar='', nargs='+',
                        help='Training or inference command that you want to run.\" \"')
    args = parser.parse_args(argv)
    if args.application:
        application_cmd = ' '.join(args.application)
        launch_process(application_cmd)
        time.sleep(APP_STARTUP_WAIT_TIME)
    if args.time:
        BIND_CORES_WAIT_TIME = int(args.time[0])

    # if time is set, wait for setting time before bind cores
    # (a negative value would make time.sleep raise, so it is ignored)
    if BIND_CORES_WAIT_TIME > 0:
        time.sleep(BIND_CORES_WAIT_TIME)


def parse_lscpu_counts(lscpu_output):
    """Extract ``(cpu_count, numa_node_count)`` from ``LC_ALL=C lscpu`` output.

    Either element is ``None`` when the corresponding line is missing.
    """
    cpu_match = LSCPU_CPU_PATTERN.search(lscpu_output)
    numa_match = LSCPU_NUMA_PATTERN.search(lscpu_output)
    cpu_num = int(cpu_match.group(1)) if cpu_match else None
    node_num = int(numa_match.group(1)) if numa_match else None
    return cpu_num, node_num


def parse_npu_affinity(tokens, npu_ids, cpu_compensating=0):
    """Map NPU ids to the CPU ranges found in ``npu-smi info -t topo`` output.

    Args:
        tokens: whitespace-separated tokens of the command output.
        npu_ids: NPU ids; the n-th CPU range found belongs to the n-th id.
        cpu_compensating: value added to the end of every range.

    Returns:
        ``{npu_id: 'start-end'}``. Only tokens that are exactly
        ``<digits>-<digits>`` count as ranges, and ranges beyond the number
        of known NPUs are ignored.
    """
    matches = (CPU_RANGE_PATTERN.fullmatch(token) for token in tokens)
    ranges = (match for match in matches if match)
    return {
        npu_id: '{}-{}'.format(match.group(1), int(match.group(2)) + cpu_compensating)
        for npu_id, match in zip(npu_ids, ranges)
    }


# get npu affinity
def get_npu_affinity():
    """Fill NPU_CPU_DICT with the affinity CPU range of every NPU.

    Exits the script with an error message when the required information
    cannot be obtained.
    """
    cpu_compensating = 0

    # arm: need cpu compensating
    # NOTE(review): the end of every range is extended by the number of CPUs
    # per NUMA node on aarch64 - presumably to match how `npu-smi` reports
    # affinity there; confirm against the driver documentation.
    if run_cmd(['uname', '-m']) == 'aarch64':
        cpu_num, node_num = parse_lscpu_counts(run_cmd(['lscpu']))
        if not cpu_num or not node_num:
            print_log_to_stdout_and_exit(
                '[ERROR] Failed to get CPU number or NUMA node number from lscpu')
        cpu_compensating = cpu_num // node_num

    tokens = run_cmd(['npu-smi', 'info', '-t', 'topo']).split()
    NPU_CPU_DICT.update(parse_npu_affinity(tokens, NUP_IDS, cpu_compensating))
    if not NPU_CPU_DICT:
        print_log_to_stdout_and_exit(
            '[ERROR] Failed to get affinity CPU list of NPUs, please make sure '
            '"npu-smi info -t topo" is supported on this device')
    for npu_id, cpu_range in NPU_CPU_DICT.items():
        print_log_to_file('[INFO] Affinity CPU list {} for NPU {}'.format(cpu_range, npu_id))


def parse_npu_ids(npu_list_output):
    """Return the NPU ids listed on the "NPU ID" lines of ``npu-smi info -l``."""
    npu_ids = []
    for line in npu_list_output.splitlines():
        if 'NPU ID' in line:
            npu_ids.extend(int(token) for token in line.split() if token.isdigit())
    return npu_ids


# get total npu id
def get_total_npu_id():
    """Fill NUP_IDS with the ids of all NPUs; exit if there is none."""
    NUP_IDS.clear()
    NUP_IDS.extend(parse_npu_ids(run_cmd(['npu-smi', 'info', '-l'])))
    if not NUP_IDS:
        print_log_to_stdout_and_exit('[ERROR] Failed to get total NPU id list, please make sure there is NPU on this device')
    print_log_to_file('[INFO] NPU total id list: {}'.format(NUP_IDS))


def parse_pids_on_npu(tokens):
    """Return the pids named by ``id:<pid>`` tokens of ``npu-smi info -t proc-mem``.

    An empty list is returned when the output has no "Process" token.
    Malformed ``id:`` tokens are skipped.
    """
    if 'Process' not in tokens:
        return []
    pids = []
    for token in tokens:
        if token.startswith('id:'):
            value = token.split(':')[1]
            if value.isdigit():
                pids.append(int(value))
    return pids


def remove_duplicate_pids(running_pids, npu_ids):
    """Keep every pid only on the first NPU (in *npu_ids* order) that lists it.

    *running_pids* is modified in place; NPUs left without any pid are
    removed. The same dict is returned for convenience.
    """
    seen = set()
    for npu_id in npu_ids:
        if npu_id not in running_pids:
            continue
        unique = []
        for pid in running_pids[npu_id]:
            if pid not in seen:
                seen.add(pid)
                unique.append(pid)
        if unique:
            running_pids[npu_id] = unique
        else:
            del running_pids[npu_id]
    return running_pids


# get app pid on npu
def get_pid_on_npu():
    """Fill RUNNING_PIDS with the current user's processes on every NPU.

    The lookup is retried FIND_PROCESS_RETRY_TIMES times; the script exits
    when no process is found at all.
    """
    print_log_to_file('[INFO] Begin to find running process on all NPUs')
    RUNNING_PIDS.clear()
    # get process pid on NPUs, retry times : 5
    for times in range(FIND_PROCESS_RETRY_TIMES):
        for npu_id in NUP_IDS:
            tokens = run_cmd(['npu-smi', 'info', '-t', 'proc-mem',
                              '-i', str(int(npu_id)), '-c', '0']).split()
            for pid in parse_pids_on_npu(tokens):
                # only touch processes that belong to the user running this tool
                if get_process_user(pid) != RUNNING_USER_NAME:
                    continue
                RUNNING_PIDS.setdefault(npu_id, []).append(pid)

        if RUNNING_PIDS:
            break
        print_log_to_file('[WARNING] Found no running process on all NPUs, retry times: {}, wait for {} s'.format(
            times + 1, FIND_PROCESS_RETRY_INTERVAL))
        # wait 5 s for each time
        time.sleep(FIND_PROCESS_RETRY_INTERVAL)

    # no running process on NPUs, stop
    if not RUNNING_PIDS:
        print_log_to_file('[INFO] Found no running process on all NPUs, stop bind cores')
        print_log_to_stdout_and_exit('[INFO] Now there is no running process on all NPUs, stop bind cores, you can see bind core results in ' + BIND_CORE_RESULT_FILE)

    # delete repeat pid
    remove_duplicate_pids(RUNNING_PIDS, NUP_IDS)

    for npu_id, pids in RUNNING_PIDS.items():
        print_log_to_file('[INFO] Succeed to find running process {} on NPU {}'.format(pids, npu_id))


# get device info
def get_dev_info():
    """Collect the NPU id list and the NPU-to-CPU affinity table."""
    get_total_npu_id()
    get_npu_affinity()


# get process affinity
def get_process_affinity(pid):
    """Return the affinity list `taskset -pc <pid>` reports, or None.

    The list is the last whitespace-separated token of the output, e.g.
    ``'0-23'``. ``None`` is returned when taskset prints nothing.
    """
    fields = run_cmd(['taskset', '-pc', str(pid)]).split()
    if fields:
        return fields[-1]
    return None


def extract_pids_from_pstree(pstree_output):
    """Return all pids printed as ``(pid)`` in `pstree -p` output, in order."""
    return [int(pid) for pid in PSTREE_PID_PATTERN.findall(pstree_output)]


def is_affinity_applied(current_affinity, cpu_range):
    """Tell whether *current_affinity* already equals the range *cpu_range*.

    Both are strings in taskset's list format. A one-CPU range ``'7-7'`` is
    also satisfied by the single-CPU list ``'7'``.
    """
    start, _, end = cpu_range.partition('-')
    if start == end:
        return current_affinity in (start, cpu_range)
    return current_affinity == cpu_range


def set_process_affinity(pid, cpu_range):
    """Pin *pid* to *cpu_range* with taskset.

    Returns:
        ``(succeeded, output)`` where *output* is taskset's stdout on success
        and its stderr (or the OS error text) on failure.
    """
    try:
        result = subprocess.run(['taskset', '-pc', cpu_range, str(pid)],
                                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                                env=dict(os.environ, LC_ALL='C'))
    except OSError as err:
        return False, str(err)
    if result.returncode != 0:
        return False, result.stderr.decode('utf-8', errors='replace').strip()
    return True, result.stdout.decode('utf-8', errors='replace').strip()


# run bind core
def run_bind_core():
    """Bind every process tree in RUNNING_PIDS to its NPU's affinity CPUs.

    NOTE(review): `pstree -T` hides threads and `taskset -p` without `-a`
    changes a single task, so already-running threads of a process keep
    their old affinity - confirm whether that is intended.
    """
    for npu_id, pid_list in RUNNING_PIDS.items():
        cpu_range = NPU_CPU_DICT.get(npu_id)
        if cpu_range is None:
            print_log_to_file('[WARNING] Found no affinity CPU list for NPU {}, skip bind cores'.format(npu_id))
            continue
        start_cpu_id, _, end_cpu_id = cpu_range.partition('-')
        target_range = '{}-{}'.format(int(start_cpu_id), int(end_cpu_id))

        for pid in pid_list:
            tree = run_cmd(['pstree', '-p', '-T', str(pid)])
            for sub_p in extract_pids_from_pstree(tree):
                # if process has set to right affinity, continue
                current_affinity_cpu_list = get_process_affinity(sub_p)
                if not current_affinity_cpu_list:
                    continue
                if is_affinity_applied(current_affinity_cpu_list, cpu_range):
                    continue
                print_log_to_file('[INFO] Begin to bind cores for process {} on NPU {}'.format(str(sub_p), npu_id))
                succeeded, output = set_process_affinity(sub_p, target_range)
                if not succeeded:
                    print_log_to_file('[ERROR] Failed to bind process {} on NPU {}: {}'.format(
                        str(sub_p), npu_id, output))
                    continue
                if output:
                    print_log_to_file(output)

                print_log_to_file('[INFO] Succeed to bind process {} on NPU {} with cpu cores list {}'.format(str(sub_p), npu_id, cpu_range))


# get current process user
def get_current_user():
    """Store the name of the user running this script in RUNNING_USER_NAME.

    Exits when the name cannot be determined, because an empty name would
    make the per-user filter in get_pid_on_npu meaningless.
    """
    global RUNNING_USER_NAME
    RUNNING_USER_NAME = get_process_user(os.getpid())
    if not RUNNING_USER_NAME:
        print_log_to_stdout_and_exit('[ERROR] Failed to get the user of current process, please make sure ps is available')


def main():
    """Run the tool: parse arguments, then scan and bind until nothing is left.

    The loop ends through get_pid_on_npu (which exits the script once no
    process is found on any NPU) or when the user interrupts it.
    """
    print("[INFO] Begin to run bind-cores script...")
    try:
        args_parse()
        get_current_user()
        get_dev_info()

        while True:
            get_pid_on_npu()
            run_bind_core()
            # reap the launched command once it has finished
            if LAUNCHED_PROCESS is not None:
                LAUNCHED_PROCESS.poll()
            time.sleep(BIND_CORES_POLL_INTERVAL)
    except KeyboardInterrupt:
        print("[INFO] Finished to bind cores, you can see bind core results in " + BIND_CORE_RESULT_FILE)
    finally:
        PRINT_FILE.close()


if __name__ == '__main__':
    main()