From a812041307c98736df60dd5624262e563bed21e2 Mon Sep 17 00:00:00 2001 From: lijiaojiao Date: Tue, 6 Aug 2024 20:14:01 +0800 Subject: [PATCH 1/2] =?UTF-8?q?=E6=96=B0=E5=A2=9Eonline=5Fdispatch?= =?UTF-8?q?=E6=B5=8B=E8=AF=95=E6=96=87=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../test_compare_online_dispatch.py | 106 ++++++++++ .../online_dispatch/test_dump_compare.py | 189 ++++++++++++++++++ .../online_dispatch/test_single_compare.py | 52 +++++ .../test_utils_online_dispatch.py | 99 +++++++++ 4 files changed, 446 insertions(+) create mode 100644 debug/accuracy_tools/msprobe/test/pytorch_ut/online_dispatch/test_compare_online_dispatch.py create mode 100644 debug/accuracy_tools/msprobe/test/pytorch_ut/online_dispatch/test_dump_compare.py create mode 100644 debug/accuracy_tools/msprobe/test/pytorch_ut/online_dispatch/test_single_compare.py create mode 100644 debug/accuracy_tools/msprobe/test/pytorch_ut/online_dispatch/test_utils_online_dispatch.py diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/online_dispatch/test_compare_online_dispatch.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/online_dispatch/test_compare_online_dispatch.py new file mode 100644 index 0000000000..a4da9db109 --- /dev/null +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/online_dispatch/test_compare_online_dispatch.py @@ -0,0 +1,106 @@ +# Copyright (c) 2024-2024 Huawei Technologies Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import torch +from msprobe.pytorch.online_dispatch.compare import get_json_contents, Saver + +import json +import csv +import os +import logging +import threading +from pathlib import Path +import pandas as pd +from unittest.mock import Mock, patch +from msprobe.core.common.utils import CompareException + +class TestCompare(unittest.TestCase): + def setUp(self): + self.dict_json_path = "./dict.json" + self.list_json_path = "./list.json" + Path(self.dict_json_path).touch() + Path(self.list_json_path).touch() + + def tearDown(self): + if os.path.exists(self.dict_json_path): + os.remove(self.dict_json_path) + if os.path.exists(self.list_json_path): + os.remove(self.list_json_path) + + def test_get_json_contents_when_get_json(self): + data = {"one":1} + with open(self.dict_json_path,'w') as f: + json.dump(data, f) + self.assertEqual(get_json_contents(self.dict_json_path),data) + + @patch('msprobe.core.common.log.BaseLogger.error') + def test_get_json_contents_when_get_list(self,mock_error): + data = [1,2] + with open(self.list_json_path,'w') as f: + json.dump(data, f) + with self.assertRaises(CompareException) as context: + get_json_contents(self.list_json_path) + self.assertEqual(context.exception.code, CompareException.INVALID_FILE_ERROR) + mock_error.assert_called_once_with('Json file %s, content is not a dictionary!' % self.list_json_path) + +class TestSaver(unittest.TestCase): + def setUp(self): + self.save_path = "./saver_save.csv" + self.detail_save_path = "./saver_detail.csv" + self.saver = Saver(self.save_path,self.detail_save_path,False) + Path(self.save_path).touch() + Path(self.detail_save_path).touch() + + def tearDown(self): + if os.path.exists(self.save_path): + os.remove(self.save_path) + if os.path.exists(self.detail_save_path): + os.remove(self.detail_save_path) + + def test_write_csv_title(self): + self.saver.write_csv_title() + mock_data_save = {self.saver.COLUMN_API_NAME:{}, + self.saver.COLUMN_FORWARD_SUCCESS:{}, + self.saver.COLUMN_BACKWARD_SUCCESS:{}, + "Message":{}} + mock_data_detail = {'Npu Name': {}, 'Bench Dtype': {}, 'NPU Dtype': {}, 'Shape': {}, 'error_balance': {}, 'max_abs_diff': {}, 'max_abs_idx': {}, 'max_rel_diff': {}, 'max_rel_idx': {}, 'eb_thd': {}, 'error_thd': {}, 'Status': {}, 'Message': {}} + self.assertTrue(pd.read_csv(self.save_path).to_dict()==mock_data_save and pd.read_csv(self.detail_save_path).to_dict()==mock_data_detail) + + def test_print_pretest_result(self): + pass + + def test_get_statistics_from_result_csv(self): + pass + + def test_write_summary_csv(self): + mock_test_result = Mock() + mock_test_result.api_name = "api_name" + mock_test_result.is_fwd_success = "SKIP" + mock_test_result.is_bwd_success = "NOSKIP" + mock_test_result.fwd_compare_alg_results = "result" + self.saver.write_summary_csv(mock_test_result) + mock_data_save = {"api_name":{}, "SKIP":{}, "NOSKIP":{}, "result":{}} + self.assertTrue(pd.read_csv(self.save_path).to_dict()==mock_data_save) + + def test_write_detail_csv(self): + mock_test_result = Mock() + mock_test_result.api_name = "api_name" + mock_test_result.fwd_compare_alg_results = ["f"] + mock_test_result.bwd_compare_alg_results = ["b"] + self.saver.write_detail_csv(mock_test_result) + mock_data_detail = {'api_name.forward.output.0': {0: 'api_name.backward.output.0'}, 'f': {0: 'b'}} + + self.assertTrue(pd.read_csv(self.detail_save_path).to_dict()==mock_data_detail) + diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/online_dispatch/test_dump_compare.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/online_dispatch/test_dump_compare.py new file mode 100644 index 0000000000..7f0fd8e747 --- /dev/null +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/online_dispatch/test_dump_compare.py @@ -0,0 +1,189 @@ +# Copyright (c) 2024-2024 Huawei Technologies Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import torch +import json +import os +import threading +from pathlib import Path +from unittest.mock import Mock, patch +import pandas as pd + +from msprobe.core.common.const import CompareConst +from msprobe.pytorch.online_dispatch.dump_compare import support_basic_type, dump_data, save_temp_summery, dispatch_workflow, get_torch_func, dispatch_multiprocess, error_call, save_csv + + + +class TestDumpCompare(unittest.TestCase): + def setUp(self): + self.summery_path = "summery.json" + Path(self.summery_path).touch() + self.csv_path = "test_save_csv.csv" + Path(self.csv_path).touch() + + def tearDown(self): + if os.path.exists(self.summery_path): + os.remove(self.summery_path) + if os.path.exists(self.csv_path): + os.remove(self.csv_path) + + def test_support_basic_type_should_return_true_when_is_instance(self): + self.assertTrue(support_basic_type(2.3)) + + def test_support_basic_type_should_return_false_when_isnot_instance(self): + self.assertFalse(support_basic_type("abcde")) + + def test_dump_data_should_return_when_data_is_tensor(self): + data=torch.tensor([1,2]) + self.assertIsNone(dump_data(data,1,1)) + + def test_save_temp_summery(self): + api_index='1' + single_api_summery="conv2d" + path = '' + data = [] + lock=threading.Lock() + + save_temp_summery(api_index=api_index,single_api_summery=single_api_summery,path=path,lock=lock) + + with open(self.summery_path) as f: + content = f.readlines() + for line in content: + data.append(json.loads(line)) + self.assertEqual([['1','conv2d']],data) + + @patch('msprobe.pytorch.online_dispatch.dump_compare.dump_data') + @patch('msprobe.pytorch.online_dispatch.dump_compare.save_temp_summery') + def test_dispatch_workflow_should_dump_when_flag_is_True(self,mock_save_temp_summery,mock_dump_data): + mock_run_param = Mock() + mock_run_param.aten_api="aten_api" + mock_run_param.single_api_index="single_api_index" + mock_run_param.root_npu_path="" + mock_data_info = Mock() + mock_data_info.cpu_args=None + mock_data_info.cpu_kwargs=[] + + mock_run_param.dump_flag=True + mock_run_param.process_num = 0 + mock_run_param.api_index = 1 + mock_data_info.all_summery=[1] + + dispatch_workflow(mock_run_param, mock_data_info) + mock_dump_data.assert_called() + mock_save_temp_summery.assert_not_called() + + @patch('msprobe.pytorch.online_dispatch.dump_compare.dump_data') + @patch('msprobe.pytorch.online_dispatch.dump_compare.save_temp_summery') + def test_dispatch_workflow_should_not_dump_when_flag_is_false(self,mock_save_temp_summery,mock_dump_data): + mock_run_param = Mock() + mock_run_param.aten_api="aten_api" + mock_run_param.single_api_index="single_api_index" + mock_run_param.root_npu_path="" + mock_data_info = Mock() + mock_data_info.cpu_args=None + mock_data_info.cpu_kwargs=[] + + mock_run_param.dump_flag=False + mock_run_param.auto_dump_flag=False + mock_run_param.process_num = 1 + mock_run_param.api_index = 1 + mock_data_info.all_summery=[1] + + dispatch_workflow(mock_run_param, mock_data_info) + mock_dump_data.assert_not_called() + mock_save_temp_summery.assert_called() + + def test_get_torch_func_should_return_None_when_outside_input(self): + mock_run_param = Mock() + mock_run_param.func_namespace="new_attr1" + mock_run_param.aten_api="new_attr2" + mock_run_param.aten_api_overload_name="new_attr3" + self.assertIsNone(get_torch_func(mock_run_param)) + + def test_get_torch_func_should_return_None_when_inside_input(self): + mock_run_param = Mock() + mock_run_param.func_namespace="aten" + mock_run_param.aten_api="add" + mock_run_param.aten_api_overload_name="Scalar" + self.assertEqual(get_torch_func(mock_run_param),torch.ops.aten.add.Scalar) + + @patch('msprobe.core.common.log.BaseLogger.error') + def test_dispatch_multiprocess_should_logger_error_when_wrong_api_input(self,mock_error): + mock_run_param = Mock() + mock_run_param.func_namespace="new_attr1" + mock_run_param.aten_api="new_attr2" + mock_run_param.aten_api_overload_name="new_attr3" + mock_dispatch_data_info=Mock() + dispatch_multiprocess(mock_run_param,mock_dispatch_data_info) + mock_error.assert_called_once_with(f'can not find suitable call api:{mock_run_param.aten_api}') + + @patch('msprobe.pytorch.online_dispatch.dump_compare.dispatch_workflow') + def test_dispatch_multiprocess_should_workflow_when_right_api_input(self,mock_workflow): + mock_run_param = Mock() + mock_run_param.func_namespace="aten" + mock_run_param.aten_api="add" + mock_run_param.aten_api_overload_name="Scalar" + mock_dispatch_data_info=Mock() + mock_workflow.return_value=1 + dispatch_multiprocess(mock_run_param,mock_dispatch_data_info) + mock_workflow.assert_called_once_with(mock_run_param,mock_dispatch_data_info) + + @patch('msprobe.core.common.log.BaseLogger.error') + def test_error_call(self,mock_error): + error_call("messages") + mock_error.assert_called_once_with("multiprocess messages") + + def test_save_csv(self): + data = {CompareConst.NPU_NAME: 1, + CompareConst.BENCH_NAME: 1, + CompareConst.NPU_DTYPE: 1, + CompareConst.BENCH_DTYPE: 11, + CompareConst.NPU_SHAPE: 1, + CompareConst.BENCH_SHAPE: 1, + CompareConst.NPU_MAX: 1, + CompareConst.NPU_MIN: 1, + CompareConst.NPU_MEAN: 1, + CompareConst.BENCH_MAX: 1, + CompareConst.BENCH_MIN: 1, + CompareConst.BENCH_MEAN: 1, + CompareConst.COSINE: 1, + CompareConst.MAX_ABS_ERR: 1, + CompareConst.MAX_RELATIVE_ERR: 1, + CompareConst.ACCURACY: 1, + CompareConst.ERROR_MESSAGE: 1} + + save_csv([[data]],[2],self.csv_path) + + df = pd.read_csv(self.csv_path) + data_gt = {CompareConst.NPU_NAME: 1, + CompareConst.BENCH_NAME: 1, + CompareConst.NPU_DTYPE: 1, + CompareConst.BENCH_DTYPE: 11, + CompareConst.NPU_SHAPE: 1, + CompareConst.BENCH_SHAPE: 1, + CompareConst.NPU_MAX: 1, + CompareConst.NPU_MIN: 1, + CompareConst.NPU_MEAN: 1, + CompareConst.BENCH_MAX: 1, + CompareConst.BENCH_MIN: 1, + CompareConst.BENCH_MEAN: 1, + CompareConst.COSINE: 1, + CompareConst.MAX_ABS_ERR: 1, + CompareConst.MAX_RELATIVE_ERR: 1, + CompareConst.ACCURACY: 1, + CompareConst.STACK: 2, + CompareConst.ERROR_MESSAGE: 1} + df_gt = pd.DataFrame.from_dict(data_gt, orient='index').T + self.assertTrue((df.all()==df_gt.all()).all()) \ No newline at end of file diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/online_dispatch/test_single_compare.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/online_dispatch/test_single_compare.py new file mode 100644 index 0000000000..9aaa14af6e --- /dev/null +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/online_dispatch/test_single_compare.py @@ -0,0 +1,52 @@ +# Copyright (c) 2024-2024 Huawei Technologies Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import torch +import logging +from unittest.mock import Mock, patch +from msprobe.pytorch.online_dispatch.single_compare import SingleBenchmarkCompareStandard,SingleBenchmarkAccuracyResult, SingleBenchmarkAccuracyCompare + +class TestSingleBenchmarkCompareStandard(unittest.TestCase): + def setUp(self): + self.single_benchmark_compare_standard = SingleBenchmarkCompareStandard() + @patch('logging.warning') + def test_get_error_thd_when_input_f64(self,mock_warning): + self.single_benchmark_compare_standard.get_error_thd(torch.float64) + mock_warning.assert_called_once_with("the output data of fp64 uses the same standard as fp32.") + + @patch('logging.error') + def test_get_error_thd_when_input_bool(self,mock_error): + self.single_benchmark_compare_standard.get_error_thd(torch.bool) + mock_error.assert_called_once_with("Single benchmark compare only supports floating point " + "in fp16, bf16, fp32. ") + + def test_get_eb_thd_when_input_f16(self): + self.assertEqual(self.single_benchmark_compare_standard.get_eb_thd(torch.float16),2 ** -10) + + def test_get_eb_thd_when_input_bool(self): + self.assertIsNone(self.single_benchmark_compare_standard.get_eb_thd(torch.bool)) + + +class TestSingleBenchmarkAccuracyResult(unittest.TestCase): + def setUp(self): + self.single_benchmark_accuracy_result = SingleBenchmarkAccuracyResult(True,1,1,1,1,1) + + def test_get_result_result_false(self): + self.single_benchmark_accuracy_result.get_result(0.5,0.5) + self.assertEqual(self.single_benchmark_accuracy_result.result,False) + + def test_get_result_result_true(self): + self.single_benchmark_accuracy_result.get_result(2,2) + self.assertEqual(self.single_benchmark_accuracy_result.result,True) \ No newline at end of file diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/online_dispatch/test_utils_online_dispatch.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/online_dispatch/test_utils_online_dispatch.py new file mode 100644 index 0000000000..14b28369be --- /dev/null +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/online_dispatch/test_utils_online_dispatch.py @@ -0,0 +1,99 @@ +# Copyright (c) 2024-2024 Huawei Technologies Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import inspect +import numpy as np +import os +import sys +import torch +import psutil +import logging +from pathlib import Path +from unittest.mock import patch, MagicMock + +from msprobe.pytorch.online_dispatch.utils import COLOR_RED, COLOR_CYAN, COLOR_YELLOW, COLOR_RESET, COMPARE_LOGO, get_callstack, np_save_data, data_to_cpu, logger_debug, logger_info, logger_warn, logger_error, logger_user, logger_logo, DispatchException + +cpu_device = torch._C.device("cpu") + +class FakeData: + def init(self): + self.numpy=np.random.rand(5,5) + +class FakeDataNoNumpy: + def init(self): + self.data=np.random.rand(5,5) + +class TestUtils(unittest.TestCase): + def setUp(self): + self.stack=inspect.stack() + self.data_path="" + self.file_name="data" + self.data=FakeData() + self.data_nonumpy=FakeDataNoNumpy() + self.dispatch_exception=DispatchException(err_code=1, err_msg="messages") + Path(os.path.join(self.data_path, f'{self.file_name}.npy')).touch() + def tearDown(self): + if os.path.exists(os.path.join(self.data_path, f'{self.file_name}.npy')): + os.remove(os.path.join(self.data_path, f'{self.file_name}.npy')) + + @patch('msprobe.core.common.file_check.change_mode') + def test_np_save_data_should_error_when_input_wrong(self,mock_change_mode): + np_save_data(self.data_nonumpy,self.file_name,self.data_path) + mock_change_mode.assert_not_called() + + def test_data_to_cpu_should_return_tensor_copy_when_input_tensor(self): + data = torch.tensor([1,2],device=cpu_device,dtype=torch.float16) + deep=1 + data_cpu=[] + self.assertEqual(data_to_cpu(data,deep,data_cpu).all(),data.clone().detach().float().all()) + + def test_data_to_cpu_should_return_list_when_input_list(self): + data=[1,2] + deep=0 + data_cpu=[] + self.assertEqual(data_to_cpu(data,deep,data_cpu),[1,2]) + + @patch('msprobe.pytorch.online_dispatch.utils.get_mp_logger') + def test_logger_debug(self,mock_inf0): + logger_debug("messages") + mock_inf0.return_value.assert_called_once_with("DEBUG messages") + + @patch('msprobe.pytorch.online_dispatch.utils.get_mp_logger') + def test_logger_info(self,mock_info): + logger_info("messages") + mock_info.return_value.assert_called_once_with("INFO messages") + + @patch('msprobe.pytorch.online_dispatch.utils.get_mp_logger') + def test_logger_warn(self,mock_info): + logger_warn("messages") + mock_info.return_value.assert_called_once_with(f'{COLOR_YELLOW}WARNING messages {COLOR_RESET}') + + @patch('msprobe.pytorch.online_dispatch.utils.get_mp_logger') + def test_logger_error(self,mock_info): + logger_error("messages") + mock_info.return_value.assert_called_once_with(f'{COLOR_RED}ERROR messages {COLOR_RESET}') + + @patch('msprobe.pytorch.online_dispatch.utils.get_mp_logger') + def test_logger_user(self,mock_info): + logger_user("messages") + mock_info.return_value.assert_called_once_with("messages") + + @patch('msprobe.pytorch.online_dispatch.utils.get_mp_logger') + def test_logger_logo(self,mock_info): + logger_logo() + mock_info.return_value.assert_called_once_with(f'{COLOR_CYAN}{COMPARE_LOGO} {COLOR_RESET}') + + def test_str(self): + self.assertEqual(self.dispatch_exception.__str__(),"messages") \ No newline at end of file -- Gitee From f21a2cbad1b7108273615baed094cbdd93ae2c41 Mon Sep 17 00:00:00 2001 From: lijiaojiao Date: Tue, 13 Aug 2024 20:25:09 +0800 Subject: [PATCH 2/2] =?UTF-8?q?=E3=80=90bugfix=E3=80=91=E8=B7=AF=E5=BE=84?= =?UTF-8?q?=E6=9D=83=E9=99=90=E8=AE=BE=E7=BD=AE=E9=94=99=E8=AF=AF=E4=BB=A5?= =?UTF-8?q?=E5=8F=8A=E6=BD=9C=E5=9C=A8=E7=9A=84cur=5Fpath=E6=9C=AA?= =?UTF-8?q?=E5=AE=9A=E4=B9=89=E9=94=99=E8=AF=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- debug/accuracy_tools/kj600/kj600/module_hook.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/kj600/kj600/module_hook.py index 74ef684a61..c0741b3605 100644 --- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ b/debug/accuracy_tools/kj600/kj600/module_hook.py @@ -132,8 +132,8 @@ class TrainerMon: cur_time = datetime.now().strftime('%b%d_%H-%M-%S') unique_id = str(uuid.uuid4())[:8] if dist.is_initialized(): + cur_path = os.path.join(output_base_dir, f"{cur_time}-rank{dist.get_rank()}-{unique_id}") if (dist.get_rank() in self.module_rank_list) or len(self.module_rank_list) == 0: - cur_path = os.path.join(output_base_dir, f"{cur_time}-rank{dist.get_rank()}-{unique_id}") check_path_length(cur_path) check_path_pattern_valid(cur_path) self.summary_writer = SummaryWriterWithAD( @@ -145,7 +145,7 @@ class TrainerMon: self.summary_writer = SummaryWriterWithAD(cur_path, self.alert_rules, unique_id, anomaly_inform) full_path = os.path.realpath(cur_path) - change_mode(full_path,FileCheckConst.DATA_FILE_AUTHORITY) + change_mode(full_path,FileCheckConst.DATA_DIR_AUTHORITY) # A HeatmapVisualizer instance is associated with an image self.update_heatmap_visualizer = defaultdict(HeatmapVisualizer) -- Gitee