From 9e0f8f28fb27c5b456f7a0f53dfdc6e997c621c6 Mon Sep 17 00:00:00 2001 From: sunyiming Date: Tue, 12 Mar 2024 09:46:08 +0000 Subject: [PATCH 1/3] runtime error bugfix Signed-off-by: sunyiming --- .../ptdbg_ascend/compare/acc_compare.py | 47 +++++++++++++------ 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/acc_compare.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/acc_compare.py index bb3980989..2c159a90f 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/acc_compare.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/acc_compare.py @@ -409,32 +409,51 @@ def read_dump_path(result_path): def _handle_multi_process(func, input_parma, result_path, lock): - process_num = int((multiprocessing.cpu_count() + 1) / 2) + try: + ulimit_output = subprocess.check_output(['ulimit', '-u'], shell=True) + max_user_processes = int(ulimit_output.strip()) + except subprocess.CalledProcessError as e: + print_warn_log(f"Failed to get ulimit: {e}") + max_user_processes = 1024 + cpu_count = multiprocessing.cpu_count() // 2 + estimated_max_processes = max_user_processes // 4 + process_num = min(max(1, cpu_count), estimated_max_processes) + if process_num < cpu_count: + print_warn_log(f"Reducing number of processes to {process_num} due to system limits.") op_name_mapping_dict = read_dump_path(result_path) - op_names = [] - for _ in range(process_num): - op_names.append([]) + op_names = [[] for _ in range(process_num)] all_op_names = list(op_name_mapping_dict.keys()) for i, op_name in enumerate(all_op_names): op_names[i % process_num].append(op_name) all_tasks = [] - pool = multiprocessing.Pool(process_num) + try: + pool = multiprocessing.Pool(process_num) + except RuntimeError as e: + print_error_log(f"Failed to start process pool: {e}") + return def err_call(args): print_error_log('multiprocess compare failed! Reason: {}'.format(args)) try: pool.terminate() - if os.path.exists(result_path): - os.remove(result_path) except OSError as e: print_error_log("pool terminate failed") - - for process_idx, fusion_op_names in enumerate(op_names): - idx = [process_num, process_idx] - task = pool.apply_async(func, - args=(idx, fusion_op_names, op_name_mapping_dict, result_path, lock, input_parma), - error_callback=err_call) - all_tasks.append(task) + try: + for process_idx, fusion_op_names in enumerate(op_names): + idx = [process_num, process_idx] + task = pool.apply_async(func, + args=(idx, fusion_op_names, op_name_mapping_dict, result_path, lock, input_parma), + error_callback=err_call) + all_tasks.append(task) + except RuntimeError as e: + print_error_log(f"Error starting new tasks: {e}") + pool.terminate() + if os.path.exists(result_path): + try: + os.remove(result_path) + except OSError as e: + print_error_log(f"Error removing result path: {e}") + return pool.close() pool.join() -- Gitee From bd4e9d8513d397567f9135921affd7414ed4280e Mon Sep 17 00:00:00 2001 From: sunyiming Date: Wed, 13 Mar 2024 01:19:54 +0000 Subject: [PATCH 2/3] update debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/acc_compare.py. Signed-off-by: sunyiming --- .../src/python/ptdbg_ascend/compare/acc_compare.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/acc_compare.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/acc_compare.py index 2c159a90f..c3c84d93e 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/acc_compare.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/acc_compare.py @@ -410,7 +410,7 @@ def read_dump_path(result_path): def _handle_multi_process(func, input_parma, result_path, lock): try: - ulimit_output = subprocess.check_output(['ulimit', '-u'], shell=True) + ulimit_output = subprocess.check_output(['bash', '-c', 'ulimit -u']) max_user_processes = int(ulimit_output.strip()) except subprocess.CalledProcessError as e: print_warn_log(f"Failed to get ulimit: {e}") @@ -449,10 +449,7 @@ def _handle_multi_process(func, input_parma, result_path, lock): print_error_log(f"Error starting new tasks: {e}") pool.terminate() if os.path.exists(result_path): - try: - os.remove(result_path) - except OSError as e: - print_error_log(f"Error removing result path: {e}") + os.remove(result_path) return pool.close() pool.join() -- Gitee From c5cf8b72841c24106c961a8b093601744e7a4c4b Mon Sep 17 00:00:00 2001 From: sunyiming Date: Fri, 15 Mar 2024 08:05:46 +0000 Subject: [PATCH 3/3] update debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/acc_compare.py. Signed-off-by: sunyiming --- .../ptdbg_ascend/src/python/ptdbg_ascend/compare/acc_compare.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/acc_compare.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/acc_compare.py index c3c84d93e..0692b8aad 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/acc_compare.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/acc_compare.py @@ -410,7 +410,7 @@ def read_dump_path(result_path): def _handle_multi_process(func, input_parma, result_path, lock): try: - ulimit_output = subprocess.check_output(['bash', '-c', 'ulimit -u']) + ulimit_output = subprocess.check_output(['ulimit', '-u'], shell=False) max_user_processes = int(ulimit_output.strip()) except subprocess.CalledProcessError as e: print_warn_log(f"Failed to get ulimit: {e}") -- Gitee