diff --git a/debug/accuracy_tools/atat/core/common_config.py b/debug/accuracy_tools/atat/core/common_config.py
index c78fc1a809afd2b1a55759f08ecd540508aa0190..53e271ec28b35b338da5893dcb29be2d112ed085 100644
--- a/debug/accuracy_tools/atat/core/common_config.py
+++ b/debug/accuracy_tools/atat/core/common_config.py
@@ -21,7 +21,7 @@ class CommonConfig:
             raise Exception("rank is invalid")
         if self.step is not None and not isinstance(self.step, list):
             raise Exception("step is invalid")
-        if self.level is not None and self.level not in ["L0", "L1", "L2"]:
+        if self.level and self.level not in Const.LEVEL_LIST:
             raise Exception("level is invalid")
         if self.seed is not None and not isinstance(self.seed, int):
             raise Exception("seed is invalid")
diff --git a/debug/accuracy_tools/atat/mindspore/__init__.py b/debug/accuracy_tools/atat/mindspore/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb3f93567542e93ff913edf3daabcd3aedb91ee3
--- /dev/null
+++ b/debug/accuracy_tools/atat/mindspore/__init__.py
@@ -0,0 +1 @@
+from atat.mindspore.debugger.precision_debugger import PrecisionDebugger
diff --git a/debug/accuracy_tools/atat/mindspore/debugger/__init__.py b/debug/accuracy_tools/atat/mindspore/debugger/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/debug/accuracy_tools/atat/mindspore/debugger/debugger_config.py b/debug/accuracy_tools/atat/mindspore/debugger/debugger_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..56a4b9bf758197d77ef04874f2865e2136d6f67c
--- /dev/null
+++ b/debug/accuracy_tools/atat/mindspore/debugger/debugger_config.py
@@ -0,0 +1,65 @@
+import os
+
+
+class DebuggerConfig:
+    """Validated debugger settings merged from the common and task configs."""
+
+    # Maps the user-facing level names to the internal dump granularity.
+    convert_map = {
+        "L0": "cell",
+        "L1": "api",
+        "L2": 'kernel'
+    }
+
+    def __init__(self, common_config, task_config):
+        """Copy settings from both configs, apply defaults and validate them.
+
+        Raises:
+            Exception: if the dump path or the level is invalid.
+            ValueError: if rank or step contains an invalid element.
+        """
+        self.dump_path = common_config.dump_path
+        self.task = common_config.task
+        self.rank = common_config.rank or []
+        self.step = common_config.step or []
+        if not common_config.level:
+            common_config.level = "L1"
+        # .get() leaves an unknown level as None so that check() reports it
+        # with a readable message instead of a bare KeyError.
+        self.level = DebuggerConfig.convert_map.get(common_config.level)
+        self.list = task_config.list or []
+        self.data_mode = task_config.data_mode or []
+        self.file_format = task_config.file_format
+        self.check_mode = task_config.check_mode
+
+        self.check()
+
+    def check(self):
+        """Validate the settings and fill in defaults; return True on success."""
+        if not self.dump_path:
+            raise Exception("Dump path is empty.")
+        if not os.path.isabs(self.dump_path):
+            raise Exception("Dump path must be absolute path.")
+        if not self.task:
+            self.task = "statistics"
+        if not self.level:
+            raise Exception("level must be L0, L1 or L2")
+        if not self.file_format:
+            self.file_format = "npy"
+        if not self.check_mode:
+            self.check_mode = "all"
+        self._check_rank()
+        self._check_step()
+        return True
+
+    def _check_rank(self):
+        """Ensure every rank id is a non-negative integer."""
+        for rank_id in self.rank:
+            if not isinstance(rank_id, int) or rank_id < 0:
+                raise ValueError(f"rank {rank_id} must be a non-negative integer.")
+
+    def _check_step(self):
+        """Ensure every step is an integer."""
+        for s in self.step:
+            if not isinstance(s, int):
+                raise ValueError(f"step element {s} should be int")
diff --git a/debug/accuracy_tools/atat/mindspore/debugger/precision_debugger.py b/debug/accuracy_tools/atat/mindspore/debugger/precision_debugger.py
new file mode 100644
index 0000000000000000000000000000000000000000..0099074762f0746c1bd8341047f37b3e5fe08855
--- /dev/null
+++ b/debug/accuracy_tools/atat/mindspore/debugger/precision_debugger.py
@@ -0,0 +1,50 @@
+import os
+from atat.mindspore.ms_config import parse_json_config
+from atat.mindspore.debugger.debugger_config import DebuggerConfig
+from atat.mindspore.task_handler_factory import TaskHandlerFactory
+
+
+class PrecisionDebugger:
+    """Singleton entry point that loads the config and starts the task."""
+
+    _instance = None
+
+    def __new__(cls, config_path=None):
+        """Create the shared instance on first use and return it afterwards."""
+        if not cls._instance:
+            cls._instance = super().__new__(cls)
+            cls._instance.initialized = False
+            cls._instance.config = None
+        return cls._instance
+
+    def __init__(self, config_path=None):
+        """Load the JSON config once; later constructions are no-ops.
+
+        If config_path is empty, config/config.json relative to this package
+        is used.
+        """
+        if self.initialized:
+            return
+        if not config_path:
+            config_path = os.path.join(os.path.dirname(__file__), "../../config/config.json")
+        common_config, task_config = parse_json_config(config_path)
+        self.config = DebuggerConfig(common_config, task_config)
+        self.initialized = True
+
+    @classmethod
+    def start(cls, target=None):
+        """Build the handler for the configured task and run it.
+
+        target is accepted but not used here.
+
+        Raises:
+            Exception: if no successfully initialized instance exists.
+        """
+        instance = cls._instance
+        if not instance:
+            raise Exception("No instance of PrecisionDebugger found.")
+        # A failed __init__ leaves the singleton allocated but without config.
+        if not instance.initialized:
+            raise Exception("PrecisionDebugger is not initialized.")
+        handler = TaskHandlerFactory.create(instance.config)
+        handler.handle()
diff --git a/debug/accuracy_tools/atat/mindspore/dump/__init__.py b/debug/accuracy_tools/atat/mindspore/dump/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/debug/accuracy_tools/atat/mindspore/dump/api_kbk_dump.py b/debug/accuracy_tools/atat/mindspore/dump/api_kbk_dump.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0f80f40e553a8b136144f515015d0f94c635f5d
--- /dev/null
+++ b/debug/accuracy_tools/atat/mindspore/dump/api_kbk_dump.py
@@ -0,0 +1,54 @@
+import os
+import json
+from atat.core.utils import make_dump_path_if_not_exists
+from atat.mindspore.debugger.debugger_config import DebuggerConfig
+from atat.core.log import print_info_log
+from atat.core.file_check_util import FileOpen
+
+
+class ApiKbkDump:
+    """Writes the dump JSON for API-level dump and exports it via env vars."""
+
+    def __init__(self, config: DebuggerConfig):
+        """Build the dump settings from defaults overridden by config."""
+        common = {
+            "dump_mode": 0,
+            "path": config.dump_path,
+            "net_name": "Net",
+            "iteration": "all",
+            "saved_data": "statistic",
+            "input_output": 0,
+            "kernels": [],
+            "support_device": [0, 1, 2, 3, 4, 5, 6, 7],
+        }
+        if config.list:
+            common["dump_mode"] = 1
+            common["kernels"] = config.list
+        if config.step:
+            common["iteration"] = "|".join(str(s) for s in config.step)
+        if config.rank:
+            common["support_device"] = config.rank
+        if config.task == "tensor":
+            common["saved_data"] = "tensor"
+        if len(config.data_mode) == 1:
+            # "all" (or anything else) keeps the default 0.
+            if config.data_mode[0] == "input":
+                common["input_output"] = 1
+            elif config.data_mode[0] == "output":
+                common["input_output"] = 2
+        self.dump_json = {
+            "common_dump_settings": common,
+            "e2e_dump_settings": {"enable": True, "trans_flag": True},
+        }
+
+    def handle(self):
+        """Write api_kbk_dump.json under the dump path and set the env vars."""
+        json_path = self.dump_json["common_dump_settings"]["path"]
+        make_dump_path_if_not_exists(json_path)
+        json_path = os.path.join(json_path, "api_kbk_dump.json")
+        with FileOpen(json_path, 'w') as f:
+            json.dump(self.dump_json, f)
+        print_info_log(json_path + " has been created.")
+        os.environ["GRAPH_OP_RUN"] = "1"
+        os.environ["MINDSPORE_DUMP_CONFIG"] = json_path
+        os.environ.pop("MS_ACL_DUMP_CFG_PATH", None)
diff --git a/debug/accuracy_tools/atat/mindspore/dump/dump_tool_factory.py b/debug/accuracy_tools/atat/mindspore/dump/dump_tool_factory.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab534edc243dfd5f44688358fe4ca8edb6a8a12d
--- /dev/null
+++ b/debug/accuracy_tools/atat/mindspore/dump/dump_tool_factory.py
@@ -0,0 +1,46 @@
+from atat.mindspore.debugger.debugger_config import DebuggerConfig
+from atat.mindspore.dump.api_kbk_dump import ApiKbkDump
+from atat.mindspore.dump.kernel_graph_dump import KernelGraphDump
+
+
+class DumpToolFactory:
+    """Selects the dump tool class for a level and run mode."""
+
+    # level -> run mode -> tool class; None marks an unsupported combination.
+    tools = {
+        "cell": {
+            "kbk": None,
+            "graph": None,
+            "pynative": None
+        },
+        "api": {
+            "kbk": ApiKbkDump,
+            "graph": None,
+            "pynative": None
+        },
+        "kernel": {
+            "kbk": None,
+            "graph": KernelGraphDump,
+            "pynative": None
+        }
+    }
+
+    @staticmethod
+    def create(config: DebuggerConfig):
+        """Return a dump tool instance for config.level.
+
+        Raises:
+            Exception: if the level is unknown or not supported.
+        """
+        tool = DumpToolFactory.tools.get(config.level)
+        if not tool:
+            raise Exception("valid level is needed.")
+        if config.level == "api":
+            tool = tool.get("kbk")
+        elif config.level == "kernel":
+            tool = tool.get("graph")
+        elif config.level == "cell":
+            raise Exception("Cell dump is not supported now.")
+        if not tool:
+            raise Exception("Data dump is not supported in this mode.")
+        return tool(config)
diff --git a/debug/accuracy_tools/atat/mindspore/dump/kernel_graph_dump.py b/debug/accuracy_tools/atat/mindspore/dump/kernel_graph_dump.py
new file mode 100644
index 0000000000000000000000000000000000000000..f8a10ec1b1f690931871895a47014d44594ac80a
--- /dev/null
+++ b/debug/accuracy_tools/atat/mindspore/dump/kernel_graph_dump.py
@@ -0,0 +1,64 @@
+import os
+import json
+from atat.core.utils import make_dump_path_if_not_exists
+from atat.mindspore.debugger.debugger_config import DebuggerConfig
+from atat.core.log import print_info_log
+from atat.core.file_check_util import FileOpen
+
+
+class KernelGraphDump:
+    """Writes the dump JSON for kernel-level dump in graph mode."""
+
+    def __init__(self, config: DebuggerConfig):
+        """Build the dump settings from defaults overridden by config."""
+        common = {
+            "dump_mode": 0,
+            "path": config.dump_path,
+            "net_name": "Net",
+            "iteration": "all",
+            "saved_data": "statistic",
+            "input_output": 0,
+            "kernels": [],
+            "support_device": [0, 1, 2, 3, 4, 5, 6, 7],
+            "op_debug_mode": 0,
+            "file_format": "npy",
+        }
+        if config.list:
+            common["dump_mode"] = 1
+            common["kernels"] = config.list
+        if config.step:
+            common["iteration"] = "|".join(str(s) for s in config.step)
+        if config.rank:
+            common["support_device"] = config.rank
+        if config.task == "tensor":
+            common["saved_data"] = "tensor"
+            common["file_format"] = config.file_format
+        if len(config.data_mode) == 1:
+            # "all" (or anything else) keeps the default 0.
+            if config.data_mode[0] == "input":
+                common["input_output"] = 1
+            elif config.data_mode[0] == "output":
+                common["input_output"] = 2
+        self.dump_json = {"common_dump_settings": common}
+
+    def handle(self):
+        """Write kernel_graph_dump.json and point the env vars at it.
+
+        Raises:
+            Exception: if GRAPH_OP_RUN is "1" (kbk mode).
+        """
+        if os.getenv("GRAPH_OP_RUN") == "1":
+            raise Exception("Must run in graph mode, not kbk mode")
+        settings = self.dump_json["common_dump_settings"]
+        json_path = settings["path"]
+        make_dump_path_if_not_exists(json_path)
+        json_path = os.path.join(json_path, "kernel_graph_dump.json")
+        with FileOpen(json_path, 'w') as f:
+            json.dump(self.dump_json, f)
+        print_info_log(json_path + " has been created.")
+        os.environ["MINDSPORE_DUMP_CONFIG"] = json_path
+        if settings["dump_mode"] == 0:
+            if settings["iteration"] != "all" or not settings["kernels"]:
+                os.environ["MS_ACL_DUMP_CFG_PATH"] = json_path
+        else:
+            os.environ.pop("MS_ACL_DUMP_CFG_PATH", None)
diff --git a/debug/accuracy_tools/atat/mindspore/ms_config.py b/debug/accuracy_tools/atat/mindspore/ms_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..deecaf1932c2e4014803a02fa2899a35f0117f9d
--- /dev/null
+++ b/debug/accuracy_tools/atat/mindspore/ms_config.py
@@ -0,0 +1,99 @@
+import json
+from atat.core.common_config import CommonConfig, BaseConfig
+from atat.core.file_check_util import FileOpen
+
+
+def _check_data_mode(data_mode):
+    """Raise unless data_mode is empty or a single all/input/output entry."""
+    if data_mode is not None and len(data_mode) > 0:
+        if len(data_mode) > 1 or data_mode[0] not in ["all", "input", "output"]:
+            raise Exception("data_mode must be all, input or output")
+
+
+class TensorConfig(BaseConfig):
+    """Task config for the "tensor" task."""
+
+    def __init__(self, json_config):
+        super().__init__(json_config)
+        self.check_mode = None
+        self.file_format = json_config.get("file_format")
+        self.check_config()
+        self._check_config()
+
+    def _check_config(self):
+        """Validate data_mode and file_format."""
+        _check_data_mode(self.data_mode)
+        if self.file_format not in ["npy", "bin"]:
+            raise Exception("file_format is invalid")
+
+
+class StatisticsConfig(BaseConfig):
+    """Task config for the "statistics" task."""
+
+    def __init__(self, json_config):
+        super().__init__(json_config)
+        self.file_format = None
+        self.check_mode = None
+        self.check_config()
+        self._check_config()
+
+    def _check_config(self):
+        """Validate data_mode."""
+        _check_data_mode(self.data_mode)
+
+
+class OverflowCheck(BaseConfig):
+    """Task config for the "overflow_check" task."""
+
+    def __init__(self, json_config):
+        super().__init__(json_config)
+        self.file_format = None
+        self.check_mode = json_config.get("check_mode")
+        self._check_config()
+
+    def _check_config(self):
+        """Validate data_mode and check_mode."""
+        _check_data_mode(self.data_mode)
+        if self.check_mode not in ["all", "aicore", "atomic"]:
+            raise Exception("check_mode is invalid")
+
+
+def parse_common_config(json_config):
+    """Wrap the parsed JSON in a CommonConfig."""
+    return CommonConfig(json_config)
+
+
+def parse_task_config(task, json_config):
+    """Build the task-specific config from the JSON section named task.
+
+    A missing or empty section is treated as an empty dict.
+
+    Raises:
+        Exception: if task is not tensor, statistics or overflow_check.
+    """
+    # .get(): the section is optional, e.g. when task fell back to the default.
+    task_map = json_config.get(task) or dict()
+    if task == "tensor":
+        return TensorConfig(task_map)
+    elif task == "statistics":
+        return StatisticsConfig(task_map)
+    elif task == "overflow_check":
+        return OverflowCheck(task_map)
+    else:
+        raise Exception("task is invalid.")
+
+
+def parse_json_config(json_file_path):
+    """Load the JSON file and return (common_config, task_config).
+
+    The task defaults to "statistics" when the config does not set one.
+    """
+    if not json_file_path:
+        raise Exception("json file path is None")
+    with FileOpen(json_file_path, 'r') as file:
+        json_config = json.load(file)
+    common_config = parse_common_config(json_config)
+    if not common_config.task:
+        common_config.task = "statistics"
+    task_config = parse_task_config(common_config.task, json_config)
+    return common_config, task_config
diff --git a/debug/accuracy_tools/atat/mindspore/overflow_check/__init__.py b/debug/accuracy_tools/atat/mindspore/overflow_check/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/debug/accuracy_tools/atat/mindspore/overflow_check/kernel_graph_overflow_check.py b/debug/accuracy_tools/atat/mindspore/overflow_check/kernel_graph_overflow_check.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ef005e59e8839e19f9af600c168343251580936
--- /dev/null
+++ b/debug/accuracy_tools/atat/mindspore/overflow_check/kernel_graph_overflow_check.py
@@ -0,0 +1,47 @@
+import os
+import json
+from atat.core.utils import make_dump_path_if_not_exists
+from atat.mindspore.debugger.debugger_config import DebuggerConfig
+from atat.core.log import print_warn_log, print_info_log
+from atat.core.file_check_util import FileOpen
+
+
+class KernelGraphOverflowCheck:
+    """Writes the dump JSON for overflow checking in graph mode."""
+
+    def __init__(self, config: DebuggerConfig):
+        """Build the dump settings; step is ignored and always "all"."""
+        if config.step:
+            print_warn_log("Step would change to all in this task.")
+        self.dump_json = {
+            "common_dump_settings": {
+                "dump_mode": 0,
+                "path": config.dump_path,
+                "net_name": "Net",
+                "iteration": "all",
+                "saved_data": "full",
+                "input_output": 0,
+                "kernels": [],
+                "support_device": config.rank or [0, 1, 2, 3, 4, 5, 6, 7],
+                # Any check_mode other than aicore/atomic keeps op_debug_mode 3.
+                "op_debug_mode": {"aicore": 1, "atomic": 2}.get(config.check_mode, 3),
+                "file_format": "npy",
+            }
+        }
+
+    def handle(self):
+        """Write kernel_graph_overflow_check.json and set the env vars.
+
+        Raises:
+            Exception: if GRAPH_OP_RUN is "1" (kbk mode).
+        """
+        if os.getenv("GRAPH_OP_RUN") == "1":
+            raise Exception("Must run in graph mode, not kbk mode")
+        json_path = self.dump_json["common_dump_settings"]["path"]
+        make_dump_path_if_not_exists(json_path)
+        json_path = os.path.join(json_path, "kernel_graph_overflow_check.json")
+        with FileOpen(json_path, 'w') as f:
+            json.dump(self.dump_json, f)
+        print_info_log(json_path + " has been created.")
+        os.environ["MINDSPORE_DUMP_CONFIG"] = json_path
+        os.environ.pop("MS_ACL_DUMP_CFG_PATH", None)
diff --git a/debug/accuracy_tools/atat/mindspore/overflow_check/overflow_check_tool_factory.py b/debug/accuracy_tools/atat/mindspore/overflow_check/overflow_check_tool_factory.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe53359be1ba1ecb73fb84138228415f68e1c2ce
--- /dev/null
+++ b/debug/accuracy_tools/atat/mindspore/overflow_check/overflow_check_tool_factory.py
@@ -0,0 +1,40 @@
+from atat.mindspore.debugger.debugger_config import DebuggerConfig
+from atat.mindspore.overflow_check.kernel_graph_overflow_check import KernelGraphOverflowCheck
+
+
+class OverflowCheckToolFactory:
+    """Selects the overflow-check tool class for a level."""
+
+    # level -> run mode -> tool class; None marks an unsupported combination.
+    tools = {
+        "cell": {
+            "kbk": None,
+            "graph": None,
+            "pynative": None
+        },
+        "api": {
+            "kbk": None,
+            "graph": None,
+            "pynative": None
+        },
+        "kernel": {
+            "kbk": None,
+            "graph": KernelGraphOverflowCheck,
+            "pynative": None
+        }
+    }
+
+    @staticmethod
+    def create(config: DebuggerConfig):
+        """Return the graph-mode overflow-check tool for config.level.
+
+        Raises:
+            Exception: if the level is unknown or has no graph-mode tool.
+        """
+        tool = OverflowCheckToolFactory.tools.get(config.level)
+        if not tool:
+            raise Exception("valid level is needed.")
+        tool = tool.get("graph")
+        if not tool:
+            raise Exception("Overflow check is not supported in this mode.")
+        return tool(config)
diff --git a/debug/accuracy_tools/atat/mindspore/task_handler_factory.py b/debug/accuracy_tools/atat/mindspore/task_handler_factory.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f80e4e89c92156762ea0e4c4ed3302cc5c31f5f
--- /dev/null
+++ b/debug/accuracy_tools/atat/mindspore/task_handler_factory.py
@@ -0,0 +1,28 @@
+from atat.mindspore.debugger.debugger_config import DebuggerConfig
+from atat.mindspore.dump.dump_tool_factory import DumpToolFactory
+from atat.mindspore.overflow_check.overflow_check_tool_factory import OverflowCheckToolFactory
+
+
+class TaskHandlerFactory:
+    """Dispatches a task name to the factory that builds its handler."""
+
+    tasks = {
+        "tensor": DumpToolFactory,
+        "statistics": DumpToolFactory,
+        "overflow_check": OverflowCheckToolFactory
+    }
+
+    @staticmethod
+    def create(config: DebuggerConfig):
+        """Return the handler for config.task.
+
+        Raises:
+            Exception: if the task is unknown or no handler could be built.
+        """
+        task = TaskHandlerFactory.tasks.get(config.task)
+        if not task:
+            raise Exception("valid task is needed.")
+        handler = task.create(config)
+        if not handler:
+            raise Exception("Can not find task handler")
+        return handler