From 90b2eeac9e6ed322aca24eb7eed751b01d59bfe9 Mon Sep 17 00:00:00 2001
From: pengxiaopeng
Date: Mon, 25 Mar 2024 14:14:31 +0800
Subject: [PATCH] grad tool ut

Add unit tests for grad_tool: CSV header and content generation for
levels L0-L3, plus an end-to-end monitor/save/compare case, together
with run_ut.py and run_test.sh drivers. Also guard the main_grad
access in GradientMonitor with hasattr(), since not every parameter
carries a main_grad attribute.

---
 .../accuracy_tools/grad_tool/grad_monitor.py  |  2 +-
 .../test/resources/test_grad_monitor.yaml     |  6 ++
 .../test/resources/test_save_grad.yaml        |  6 ++
 .../accuracy_tools/grad_tool/test/run_test.sh | 36 ++++++++
 debug/accuracy_tools/grad_tool/test/run_ut.py | 41 +++++++++
 .../grad_tool/test/ut/test_grad_csv.py        | 63 ++++++++++++++
 .../grad_tool/test/ut/test_grad_monitor.py    | 83 +++++++++++++++++++
 7 files changed, 236 insertions(+), 1 deletion(-)
 create mode 100644 debug/accuracy_tools/grad_tool/test/resources/test_grad_monitor.yaml
 create mode 100644 debug/accuracy_tools/grad_tool/test/resources/test_save_grad.yaml
 create mode 100644 debug/accuracy_tools/grad_tool/test/run_test.sh
 create mode 100644 debug/accuracy_tools/grad_tool/test/run_ut.py
 create mode 100644 debug/accuracy_tools/grad_tool/test/ut/test_grad_csv.py
 create mode 100644 debug/accuracy_tools/grad_tool/test/ut/test_grad_monitor.py

diff --git a/debug/accuracy_tools/grad_tool/grad_monitor.py b/debug/accuracy_tools/grad_tool/grad_monitor.py
index cb6ab15bf6..bf1a6da0ba 100644
--- a/debug/accuracy_tools/grad_tool/grad_monitor.py
+++ b/debug/accuracy_tools/grad_tool/grad_monitor.py
@@ -98,7 +98,7 @@ class GradientMonitor:
                 continue
             if param.grad is not None:
                 grad = param.grad
-            elif param.main_grad is not None:
+            elif hasattr(param, "main_grad") and param.main_grad is not None:
                 grad = param.main_grad
             else:
                 continue
diff --git a/debug/accuracy_tools/grad_tool/test/resources/test_grad_monitor.yaml b/debug/accuracy_tools/grad_tool/test/resources/test_grad_monitor.yaml
new file mode 100644
index 0000000000..a0c895e78f
--- /dev/null
+++ b/debug/accuracy_tools/grad_tool/test/resources/test_grad_monitor.yaml
@@ -0,0 +1,6 @@
+level: L2
+param_list:
+rank:
+step:
+bounds:
+output_path: ./output/test_grad_monitor
\ No newline at end of file
diff --git a/debug/accuracy_tools/grad_tool/test/resources/test_save_grad.yaml b/debug/accuracy_tools/grad_tool/test/resources/test_save_grad.yaml
new file mode 100644
index 0000000000..136803d26e
--- /dev/null
+++ b/debug/accuracy_tools/grad_tool/test/resources/test_save_grad.yaml
@@ -0,0 +1,6 @@
+level: L2
+param_list:
+rank:
+step:
+bounds:
+output_path: ./output/test_save_grad
\ No newline at end of file
diff --git a/debug/accuracy_tools/grad_tool/test/run_test.sh b/debug/accuracy_tools/grad_tool/test/run_test.sh
new file mode 100644
index 0000000000..ef21e1bb02
--- /dev/null
+++ b/debug/accuracy_tools/grad_tool/test/run_test.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+CUR_DIR=$(dirname $(readlink -f $0))
+TOP_DIR=${CUR_DIR}/..
+TEST_DIR=${TOP_DIR}/"test"
+SRC_DIR=${TOP_DIR}/../
+
+clean() {
+    cd ${TEST_DIR}
+
+    if [ -e ${TEST_DIR}/"report" ]; then
+        rm -r ${TEST_DIR}/"report"
+        echo "remove last ut_report successfully."
+    fi
+
+    if [ -e ${TEST_DIR}/"output" ]; then
+        rm -r ${TEST_DIR}/"output"
+        echo "remove last output successfully."
+    fi
+
+}
+
+run_ut() {
+    export PYTHONPATH=${SRC_DIR}:${PYTHONPATH}
+    python3 run_ut.py
+}
+
+main() {
+    clean
+    if [ "$1"x == "clean"x ]; then
+        return 0
+    fi
+
+    cd ${TEST_DIR} && run_ut
+}
+
+main "$@"
diff --git a/debug/accuracy_tools/grad_tool/test/run_ut.py b/debug/accuracy_tools/grad_tool/test/run_ut.py
new file mode 100644
index 0000000000..c739496979
--- /dev/null
+++ b/debug/accuracy_tools/grad_tool/test/run_ut.py
@@ -0,0 +1,41 @@
+import os
+import shutil
+import subprocess
+import sys
+
+def run_ut():
+    cur_dir = os.path.realpath(os.path.dirname(__file__))
+    top_dir = os.path.realpath(os.path.dirname(cur_dir))
+    ut_path = os.path.join(cur_dir, "ut/")
+    src_dir = top_dir
+    report_dir = os.path.join(cur_dir, "report")
+
+    if os.path.exists(report_dir):
+        shutil.rmtree(report_dir)
+
+    os.makedirs(report_dir)
+
+    cmd = ["python3", "-m", "pytest", ut_path, "--junitxml=" + report_dir + "/final.xml",
+           "--cov=" + src_dir, "--cov-branch", "--cov-report=xml:" + report_dir + "/coverage.xml"]
+
+    result_ut = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+
+    while result_ut.poll() is None:
+        line = result_ut.stdout.readline().decode(errors="replace").strip()
+        if line:
+            print(line)
+
+    ut_flag = False
+    if result_ut.returncode == 0:
+        ut_flag = True
+        print("run ut successfully.")
+    else:
+        print("run ut failed.")
+
+    return ut_flag
+
+if __name__ == "__main__":
+    if run_ut():
+        sys.exit(0)
+    else:
+        sys.exit(1)
diff --git a/debug/accuracy_tools/grad_tool/test/ut/test_grad_csv.py b/debug/accuracy_tools/grad_tool/test/ut/test_grad_csv.py
new file mode 100644
index 0000000000..a4b6d9dd6a
--- /dev/null
+++ b/debug/accuracy_tools/grad_tool/test/ut/test_grad_csv.py
@@ -0,0 +1,63 @@
+# coding=utf-8
+import unittest
+import os
+import torch
+from grad_tool.grad_stat_csv import GradStatCsv
+from grad_tool.level_adapter import LevelAdapter
+
+
+grad_tensor = torch.tensor([[-2, 2], [0.2, 0.3]])
+
+
+class TestGradCSV(unittest.TestCase):
+    def test_level_L0_header(self):
+        self.assertEqual(['param_name', 'MD5', 'max', 'min', 'norm', 'shape'],
+                         GradStatCsv.generate_csv_header(level=LevelAdapter.level_adapter("L0"), bounds=[-1, 0, 1]))
+
+    def test_level_L1_header(self):
+        self.assertEqual(['param_name', 'MD5', '(-inf, -1]', '(-1, 0]', '(0, 1]', '(1, inf)', '=0', 'max', 'min', 'norm', 'shape'],
+                         GradStatCsv.generate_csv_header(level=LevelAdapter.level_adapter("L1"), bounds=[-1, 0, 1]))
+
+    def test_level_L2_header(self):
+        self.assertEqual(['param_name', 'MD5', 'max', 'min', 'norm', 'shape'],
+                         GradStatCsv.generate_csv_header(level=LevelAdapter.level_adapter("L2"), bounds=[-1, 0, 1]))
+
+    def test_level_L3_header(self):
+        self.assertEqual(['param_name', 'MD5', '(-inf, -1]', '(-1, 0]', '(0, 1]', '(1, inf)', '=0', 'max', 'min', 'norm', 'shape'],
+                         GradStatCsv.generate_csv_header(level=LevelAdapter.level_adapter("L3"), bounds=[-1, 0, 1]))
+
+    def test_level_L0_content(self):
+        generated_csv_line = GradStatCsv.generate_csv_line(
+            level=LevelAdapter.level_adapter("L0"),
+            param_name="model.conv2d",
+            grad=grad_tensor,
+            bounds=[-1, 0, 1])
+        self.assertEqual(['model.conv2d', '678a6c7d9d9716682b56fda097d0936c', 2.0, -2.0, 2.851315498352051, [2, 2]],
+                         generated_csv_line)
+
+    def test_level_L1_content(self):
+        generated_csv_line = GradStatCsv.generate_csv_line(
+            level=LevelAdapter.level_adapter("L1"),
+            param_name="model.conv2d",
+            grad=grad_tensor,
+            bounds=[-1, 0, 1])
+        self.assertEqual(['model.conv2d', '678a6c7d9d9716682b56fda097d0936c', 0.25, 0.0, 0.5, 0.25, 0.0, 2.0, -2.0, 2.851315498352051, [2, 2]],
+                         generated_csv_line)
+
+    def test_level_L2_content(self):
+        generated_csv_line = GradStatCsv.generate_csv_line(
+            level=LevelAdapter.level_adapter("L2"),
+            param_name="model.conv2d",
+            grad=grad_tensor,
+            bounds=[-1, 0, 1])
+        self.assertEqual(['model.conv2d', '678a6c7d9d9716682b56fda097d0936c', 2.0, -2.0, 2.851315498352051, [2, 2]],
+                         generated_csv_line)
+
+    def test_level_L3_content(self):
+        generated_csv_line = GradStatCsv.generate_csv_line(
+            level=LevelAdapter.level_adapter("L3"),
+            param_name="model.conv2d",
+            grad=grad_tensor,
+            bounds=[-1, 0, 1])
+        self.assertEqual(['model.conv2d', '678a6c7d9d9716682b56fda097d0936c', 0.25, 0.0, 0.5, 0.25, 0.0, 2.0, -2.0, 2.851315498352051, [2, 2]],
+                         generated_csv_line)
diff --git a/debug/accuracy_tools/grad_tool/test/ut/test_grad_monitor.py b/debug/accuracy_tools/grad_tool/test/ut/test_grad_monitor.py
new file mode 100644
index 0000000000..f233997e73
--- /dev/null
+++ b/debug/accuracy_tools/grad_tool/test/ut/test_grad_monitor.py
@@ -0,0 +1,83 @@
+import os
+import random
+import unittest
+import hashlib
+import torch
+import numpy as np
+import torch.nn as nn
+from grad_tool.grad_monitor import GradientMonitor
+from grad_tool.grad_comparator import GradComparator
+
+
+def seed_all(seed=1234, mode=False):
+    random.seed(seed)
+    os.environ['PYTHONHASHSEED'] = str(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.use_deterministic_algorithms(mode)
+
+seed_all()
+
+base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+
+inputs = [torch.rand(10, 10) for _ in range(10)]
+labels = [torch.randint(0, 5, (10,)) for _ in range(10)]
+
+
+class TestModule(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.linear = nn.Linear(10, 5)
+        self.relu = nn.ReLU()
+
+    def forward(self, x):
+        x1 = self.linear(x)
+        x2 = self.relu(x1)
+        return x2
+
+
+def test_grad_monitor():
+    gm = GradientMonitor(os.path.join(base_dir, "resources/test_grad_monitor.yaml"))
+    loss_fun = nn.CrossEntropyLoss()
+    test_module = TestModule()
+    nn.init.constant_(test_module.linear.weight, 1.0)
+    nn.init.constant_(test_module.linear.bias, 1.0)
+    gm.monitor(test_module)
+    optimizer = torch.optim.SGD(test_module.parameters(), lr=1e-2)
+    for input_data, label in zip(inputs, labels):
+        output = test_module(input_data)
+        loss = loss_fun(output, label)
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+    return gm
+
+
+def test_save_grad():
+    gm = GradientMonitor(os.path.join(base_dir, "resources/test_save_grad.yaml"))
+    loss_fun = nn.CrossEntropyLoss()
+    test_module = TestModule()
+    nn.init.constant_(test_module.linear.weight, 1.0)
+    nn.init.constant_(test_module.linear.bias, 1.0)
+    optimizer = torch.optim.SGD(test_module.parameters(), lr=1e-2)
+    for input_data, label in zip([x + 0.1 for x in inputs], labels):
+        output = test_module(input_data)
+        loss = loss_fun(output, label)
+        optimizer.zero_grad()
+        loss.backward()
+        gm.save_grad(test_module)
+        optimizer.step()
+    return gm
+
+
+class TestGradMonitor(unittest.TestCase):
+    def test_compare(self):
+        gm1 = test_grad_monitor()
+        gm2 = test_save_grad()
+        compare_output_path = os.path.join(os.path.dirname(gm1._output_path), "grad_compare")
+        GradComparator.compare_distributed(gm1._output_path, gm2._output_path, compare_output_path)
+        items = os.listdir(compare_output_path)
+        self.assertEqual(len(items), 1)
+        with open(os.path.join(compare_output_path, items[0], "similarities.csv"), 'r') as f:
+            data = f.read()
"20441d98b8c8d14ee6f896ea29d01b14") -- Gitee