From 3e9ba39f926ff49363d25e6dd23006e62a49a1b6 Mon Sep 17 00:00:00 2001
From: JianxinZhang
Date: Mon, 18 Aug 2025 17:09:14 +0800
Subject: [PATCH 1/4] start and end dump

---
 torch_npu/csrc/npu/Module.cpp | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp
index c66f2a8d52..3a6fa91e27 100644
--- a/torch_npu/csrc/npu/Module.cpp
+++ b/torch_npu/csrc/npu/Module.cpp
@@ -51,6 +51,7 @@
 #include "torch_npu/csrc/aten/common/from_blob.h"
 #include "torch_npu/csrc/profiler/combined_traceback.h"
 #include "torch_npu/csrc/profiler/python/combined_traceback.h"
+#include "torch_npu/csrc/framework/interface/AclInterface.h"
 
 struct NPUDeviceProp {
     std::string name;
@@ -1834,6 +1835,30 @@ static PyObject* THNPModule_reset_device_res_limit(PyObject* self, PyObject *arg
     END_HANDLE_TH_ERRORS
 }
 
+PyObject* THNPModule_aclop_start_dump(PyObject* self, PyObject* args)
+{
+    HANDLE_TH_ERRORS
+    uint32_t dumpType=0x00000001U;
+    std::string module = THPUtils_unpackString(args);
+    const char *path=module.c_str();
+    std::cout<<"AclopStartDumpArgs call starts!\n";
+    at_npu::native::AclopStartDumpArgs(dumpType, path);
+    std::cout<<"AclopStartDumpArgs call ends!\n";
+    Py_RETURN_NONE;
+    END_HANDLE_TH_ERRORS
+}
+
+PyObject* THNPModule_aclop_stop_dump(PyObject* self, PyObject* noargs)
+{
+    HANDLE_TH_ERRORS
+    uint32_t dumpType=0x00000001U;
+    std::cout<<"AclopStopDumpArgs call starts!\n";
+    at_npu::native::AclopStopDumpArgs(dumpType);
+    std::cout<<"AclopStopDumpArgs call ends!\n";
+    Py_RETURN_NONE;
+    END_HANDLE_TH_ERRORS
+}
+
 static struct PyMethodDef THNPModule_methods[] = {
     {"_npu_init", (PyCFunction)THNPModule_initExtension, METH_NOARGS, nullptr},
     {"_npu_set_run_yet_variable_to_false", (PyCFunction)THNPModule_set_run_yet_variable_to_false_wrap, METH_NOARGS, nullptr},
@@ -1902,6 +1927,8 @@ static struct PyMethodDef THNPModule_methods[] = {
     {"_npu_get_device_res_limit", (PyCFunction)THNPModule_get_device_res_limit, METH_VARARGS, nullptr},
     {"_npu_set_device_res_limit", (PyCFunction)THNPModule_set_device_res_limit, METH_VARARGS, nullptr},
     {"_npu_reset_device_res_limit", (PyCFunction)THNPModule_reset_device_res_limit, METH_O, nullptr},
+    {"_aclop_start_dump", (PyCFunction)THNPModule_aclop_start_dump, METH_O, nullptr},
+    {"_aclop_stop_dump", (PyCFunction)THNPModule_aclop_stop_dump, METH_NOARGS, nullptr},
     {nullptr}};
 
 TORCH_NPU_API PyMethodDef* THNPModule_get_methods()
-- 
Gitee

From 6cdd03ec831c8811f62a15a174b54e7d3c118881 Mon Sep 17 00:00:00 2001
From: JianxinZhang
Date: Mon, 18 Aug 2025 19:33:33 +0800
Subject: [PATCH 2/4] static kernel inductor

---
 torch_npu/__init__.py                    |   3 +
 torch_npu/_inductor/codegen/wrapper.py   |  15 ++
 torch_npu/_inductor/npu_static_kernel.py | 311 +++++++++++++++++++++++
 torch_npu/npu/_graph_tree.py             |  10 +-
 torch_npu/npu/aclnn/__init__.py          |   2 +
 torch_npu/utils/_graph_tree.py           |  12 +-
 6 files changed, 349 insertions(+), 4 deletions(-)
 create mode 100644 torch_npu/_inductor/npu_static_kernel.py

diff --git a/torch_npu/__init__.py b/torch_npu/__init__.py
index 9e7cb7b250..c2b4020bdb 100644
--- a/torch_npu/__init__.py
+++ b/torch_npu/__init__.py
@@ -18,6 +18,7 @@ import torch
 from torch.distributed.fsdp import sharded_grad_scaler
 from torch.utils.checkpoint import DefaultDeviceType
 import torch_npu
+from torch_npu._inductor import npu_static_kernel
 
 acc = torch._C._get_accelerator()
 if acc.type != "cpu":
@@ -276,6 +277,8 @@ def _npu_shutdown():
     torch_npu._C._npu_shutdown(success)
     _except_handler.handle_exception()
     torch_npu.asd.asd.matmul_check._cleanup()
+    if torch_npu.npu.aclnn.use_static_aclnn_kernel == True:
+        uninstall_static_kernel()
 
 
 # register npu shutdown hook on exit
diff --git a/torch_npu/_inductor/codegen/wrapper.py b/torch_npu/_inductor/codegen/wrapper.py
index a2a29fe645..e848a5e02d 100644
--- a/torch_npu/_inductor/codegen/wrapper.py
+++ b/torch_npu/_inductor/codegen/wrapper.py
@@ -15,6 +15,8 @@ from torch._subclasses.fake_tensor import FakeTensor, FakeTensorMode
 from torch.utils._sympy.singleton_int import SingletonInt
 
 from torch_npu._inductor import config as npu_config
+from torch_npu._inductor import npu_static_kernel
+import torch_npu.npu.aclnn
 
 
 class NPUWrapperCodeGen(PythonWrapperCodegen):
@@ -246,3 +248,16 @@ class NPUWrapperCodeGen(PythonWrapperCodegen):
         call_str = f"repro_run({', '.join(V.graph.graph_inputs.keys())})"
         output.writeline(f"fn = lambda: {call_str}")
         output.writeline("return fn()")
+
+    def write_prefix(self) -> None:
+        super().write_prefix()
+        if torch_npu.npu.aclnn.use_static_aclnn_kernel == True:
+            with self.prefix.indent():
+                self.prefix.writeline('static_kernel_complier = StaticKernelCompiler()')
+                self.prefix.writeline('static_kernel_complier.enter()')
+
+    def generate_return(self, output_refs: list[str]) -> None:
+        if torch_npu.npu.aclnn.use_static_aclnn_kernel == True:
+            self.wrapper_call.writeline('exc_info=(None, None, None)')
+            self.wrapper_call.writeline('static_kernel_complier.__exit__(*exc_info)')
+        super().generate_return(output_refs)
\ No newline at end of file
diff --git a/torch_npu/_inductor/npu_static_kernel.py b/torch_npu/_inductor/npu_static_kernel.py
new file mode 100644
index 0000000000..710552cdaf
--- /dev/null
+++ b/torch_npu/_inductor/npu_static_kernel.py
@@ -0,0 +1,311 @@
+import os
+import subprocess
+import datetime
+from enum import IntEnum
+from pathlib import Path
+
+import torch
+import torch_npu
+import torch_npu.npu.utils as utils
+from .config import log
+
+_uninstall_path = None
+
+class SocVersion(IntEnum):
+    UnsupportedSocVersion = -1
+    Ascend910PremiumA = 100
+    Ascend910ProA = 101
+    Ascend910A = 102
+    Ascend910ProB = 103
+    Ascend910B = 104
+    Ascend310P1 = 200
+    Ascend310P2 = 201
+    Ascend310P3 = 202
+    Ascend310P4 = 203
+    Ascend310P5 = 204
+    Ascend310P7 = 206
+    Ascend910B1 = 220
+    Ascend910B2 = 221
+    Ascend910B2C = 222
+    Ascend910B3 = 223
+    Ascend910B4 = 224
+    Ascend910B4_1 = 225
+    Ascend310B1 = 240
+    Ascend310B2 = 241
+    Ascend310B3 = 242
+    Ascend310B4 = 243
+    Ascend910_9391 = 250
+    Ascend910_9392 = 251
+    Ascend910_9381 = 252
+    Ascend910_9382 = 253
+    Ascend910_9372 = 254
+    Ascend910_9362 = 255
+
+def get_soc_version_name():
+    soc_version = utils.get_soc_version()
+    try:
+        return SocVersion(soc_version).name
+    except ValueError:
+        return SocVersion(-1).name
+
+def safe_resolve_output_dir(build_dir: str):
+    base_dir = Path.cwd().resolve()
+    if build_dir is not None:
+        if "\x00" in build_dir:
+            raise ValueError("build_dir contains null byte")
+
+        candidate = Path(build_dir)
+        if ".." in candidate.parts:
+            raise ValueError("build_dir must not contain '..'")
+
+        script_dir = candidate if candidate.is_absolute() else base_dir / candidate
+
+        cur = Path(script_dir.anchor)
+        for part in script_dir.parts[1:]:
+            cur = cur / part
+            if cur.exists() and cur.is_symlink():
+                raise ValueError(f"symlink detected in path: {cur}")
+
+        try:
+            script_dir = script_dir.resolve(strict=False)
+        except Exception as e:
+            raise ValueError(f"cannot resolve path {script_dir}: {e}")
+    else:
+        script_dir = base_dir  # default: create the temporary dump folder under the current directory to hold the generated operator-info JSON
+
+    timestamp = f"{datetime.datetime.now().strftime('%Y%m%d%H%M%S%f')}_{os.getpid()}"
+    result_root = script_dir / f"{timestamp}_kernel_aot_optimization_build_outputs"
+
+    try:
+        result_root.mkdir(exist_ok=True)
+    except (PermissionError, OSError) as e:
+        raise RuntimeError(f"failed to create output directory {result_root}: {e}") from e
+
+    return result_root
+
+class AclopDumpContext:
+    def __init__(self, save_path: str):
+        self.save_path = save_path
+
+    def __enter__(self):
+        torch_npu._C._aclop_start_dump(str(self.save_path))
+        return self  # optional
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        torch_npu._C._aclop_stop_dump()
+
+def save_uninstall_info(filename: str):
+    global _uninstall_path
+    latest = Path(os.environ["ASCEND_HOME_PATH"])
+    root = latest.parent
+    pattern = f"*/opp/static_kernel/ai_core/{filename}/uninstall.sh"
+    match = next(root.glob(pattern), None)
+    if match is None:
+        _uninstall_path = None
+        log.debug(f"can not find uninstall path, pattern: {pattern}")
+        print(f"can not find uninstall path, pattern: {pattern}")
+    else:
+        _uninstall_path = str(match)
+
+class StaticKernelCompiler:
+    """
+    Context manager that automates the dump, compilation and installation of static kernels.
+    """
+    def __init__(self, build_dir=None):
+        """
+        Initialize the compiler and create the directory that holds the build artifacts.
+        :param build_dir: Optional directory for the build artifacts. If None, it is created under the current working directory.
+        """
+        self.result_root = safe_resolve_output_dir(build_dir)
+        print(f"StaticKernelCompiler initialized. Build directory: {self.result_root}")
+
+    def __enter__(self):
+        """
+        Called on entering the with block; starts the aclop dump.
+        """
+        print("[INFO] Starting operator dump...")
+        # NOTE: this assumes torch_npu._C._aclop_start_dump is the correct dump start function
+        torch_npu._C._aclop_start_dump(str(self.result_root))
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """
+        Called on leaving the with block; stops the dump, then compiles and installs.
+        """
+        print("[INFO] Stopping operator dump.")
+        # Make sure the dump is stopped even if an exception occurred inside the with block
+        torch_npu._C._aclop_stop_dump()
+
+        if exc_type:
+            print(f" An exception occurred during model execution: {exc_val}")
+            print("[INFO] Skipping static kernel compilation due to the error.")
+            return  # do not suppress the exception; let the caller see the error
+
+        print("[INFO] Starting static kernel compilation process...")
+
+        debug_dirs = [d for d in self.result_root.iterdir()
+                      if d.is_dir() and d.name.endswith("_debug")]
+        if not debug_dirs:
+            print(" Can not find json of ops, skipping op_compiler.")
+            return
+
+        debug_dir = max(debug_dirs, key=lambda d: d.stat().st_mtime)
+        json_files = list(debug_dir.glob("*.json"))
+        if not json_files:
+            print(f" No JSON files in {debug_dir}, skipping op_compiler.")
+            return
+
+        cmd = [
+            "op_compiler",
+            "-p", str(debug_dir),
+            "-v", get_soc_version_name(),
+            "-l", "info",
+            "-j", "4",
+            "-o", str(self.result_root),
+        ]
+        try:
+            print(f" Executing op_compiler command: {' '.join(cmd)}")
+            res = subprocess.run(cmd, check=True, capture_output=True, text=True)
+            print(f" op_compiler execution successful, msg: {res.stdout}")
+        except subprocess.CalledProcessError as e:
+            print(f" op_compiler execution failed, msg: {e.stderr}")
+            return
+
+        for run_pkg in self.result_root.glob("*.run"):
+            filename = run_pkg.name
+            try:
+                print(f"[INFO] Installing static kernel package: {filename}")
+                result = subprocess.run([str(run_pkg)], check=True, capture_output=True, text=True)
+                print(f" {filename} install successful, msg: {result.stdout}")
+                save_uninstall_info(filename[:-4])
+                torch_npu.npu._aclnn_reselect_static_kernel()
+            except subprocess.CalledProcessError as e:
+                print(f" {filename} install failed, msg: {e.stderr}")
+
+def compile_static_kernel(build_dir = None, *args, **kwargs):
+    log.info(f"start compiler static kernel")
+    print(f"Start compiler static kernel")
+    result_root = safe_resolve_output_dir(build_dir)
+
+    # Run the operators eagerly to generate the operator-info JSON
+    torch_npu._C._aclop_start_dump(str(result_root))
+    try:
+        # torch.fx.Interpreter(fx_graph).run(*args, **kwargs)
+        test_model()
+    finally:
+        torch_npu._C._aclop_stop_dump()
+
+    with AclopDumpContext(result_root):
+        test_model()
+
+
+    debug_dirs = [d for d in result_root.iterdir()
+                  if d.is_dir() and d.name.endswith("_debug")]
+    if not debug_dirs:
+        log.error("Can not find json of ops, do not excute op_compiler")
+        print("Can not find json of ops, do not excute op_compiler")
+        return
+
+    debug_dir = max(debug_dirs, key=lambda d: d.stat().st_mtime)
+
+    json_files = list(debug_dir.glob("*.json"))
+    if not json_files:
+        log.error(f"[ERROR] No JSON files in {debug_dir}, skip op_compiler")
+        print(f"No JSON files in {debug_dir}, skip op_compiler")
+        return
+
+    cmd = [
+        "op_compiler",
+        "-p", str(debug_dir),
+        "-v", get_soc_version_name(),
+        "-l", "info",
+        "-j", "4",
+        "-o", str(result_root),
+    ]
+    try:
+        res = subprocess.run(cmd, check=True, capture_output=True, text=True)
+        log.debug(f"excute op_compiler, msg: {res.stdout}")
+        print(f"excute op_compiler, msg: {res.stdout}")
+    except subprocess.CalledProcessError as e:
+        log.warning(f"[WARNING] excute op_compiler error, msg: {e.stderr}")
+        print(f"excute op_compiler error, msg: {e.stderr}")
+        return
+
+    for run_pkg in result_root.glob("*.run"):
+        filepath = run_pkg
+        filename = run_pkg.name
+        try:
+            result = subprocess.run(
+                [str(filepath)],
+                check=True,
+                capture_output=True,
+                text=True
+            )
+            log.debug(f"{filename} static kernel run pkg install success, msg: {result.stdout}")
+            print(f"{filename} static kernel run pkg install success, msg: {result.stdout}")
+            save_uninstall_info(filename[:-4])
+            torch_npu.npu._aclnn_reselect_static_kernel()
+        except subprocess.CalledProcessError as e:
+            log.warning(f"{filename} static kernel run pkg install failed, msg: {e.stderr}")
+            print(f"{filename} static kernel run pkg install failed, msg: {e.stderr}")
+
+def uninstall_static_kernel():
+    global _uninstall_path
+    if not _uninstall_path:
+        log.debug(f"uninstall_path is none, skip uninstall static kernel")
+        print(f"uninstall_path is none, skip uninstall static kernel")
+        return
+
+    try:
+        result = subprocess.run(
+            [_uninstall_path],
+            check=True,
+            capture_output=True,
+            text=True,
+        )
+        log.debug(f"{_uninstall_path} uninstall success, msg: \n{result.stdout}")
+        print(f"[DEBUG] {_uninstall_path} uninstall success, msg: \n{result.stdout}")
+    except subprocess.CalledProcessError as e:
+        log.warning(f"{_uninstall_path} uninstall failed, msg: \n{e.stderr}")
+        print(f"[ERROR] {_uninstall_path} uninstall failed, msg: \n{e.stderr}")
+    finally:
+        _uninstall_path = None
+
+def test_model():
+    class Model(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.linear1 = torch.nn.Linear(1024, 2048)
+            self.relu1 = torch.nn.ReLU()
+            self.linear2 = torch.nn.Linear(2048, 1024)
+
+        def forward(self, x):
+            ln1 = self.linear1(x)
+            ln2 = self.relu1(ln1)
+            ln3 = self.linear2(ln2)
+            return ln3
+
+    model = Model().npu()
+    # config = CompilerConfig()
+    # config.mode = "reduce-overhead"
+    # config.debug.aclgraph.enable_aclnn_kernel = True
+    # aclgraph_backend = torchair.get_npu_backend(compiler_config=config)
+    # model = torch.compile(Model(), backend=aclgraph_backend, dynamic=False)
+    x = torch.randn(32, 1024).npu()
+    y = model(x)
+
+if __name__=="__main__":
+    static_kernel_complier = StaticKernelCompiler()
+    with static_kernel_complier:
+        test_model()
+    # exc_info=(None, None, None)
+    # try:
+    #     static_kernel_complier.__enter__()
+    #     test_model()
+    # except Exception:
+    #     print("ERROR")
+    # finally:
+    #     static_kernel_complier.__exit__(*exc_info)
+
+    uninstall_static_kernel()
+    
\ No newline at end of file
diff --git a/torch_npu/npu/_graph_tree.py b/torch_npu/npu/_graph_tree.py
index 1149dd53d2..b32e9d64b6 100644
--- a/torch_npu/npu/_graph_tree.py
+++ b/torch_npu/npu/_graph_tree.py
@@ -102,7 +102,8 @@ import torch_npu
 from torch_npu._C import (
     _npu_NPUAllocator_AllocatorState as AllocatorState,
     _set_cached_tensors_enabled as _set_cached_tensors_enabled)
-
+import torch_npu.npu.aclnn
+from torch_npu._inductor import npu_static_kernel
 
 if TYPE_CHECKING:
     from torch._inductor.utils import InputType
@@ -614,7 +615,12 @@ class NPUWarmupNode:
         ), disable_conv_cache_emptying(), clear_cublas_manager(), _use_npu_memory_pool_manager(
             self.device_index, self.npu_graphs_pool, self.stream
         ), get_history_recording():
-            out = self.wrapped_function.model(new_inputs)
+            if torch_npu.npu.aclnn.use_static_aclnn_kernel == True:
+                static_kernel_complier = StaticKernelCompiler()
+                with static_kernel_complier:
+                    out = self.wrapped_function.model(new_inputs)
+            else:
+                out = self.wrapped_function.model(new_inputs)
 
         # We need to know which outputs are allocated within the cudagraph pool
         # so that we can deallocate them at the beginning of the next cudagraph step,
diff --git a/torch_npu/npu/aclnn/__init__.py b/torch_npu/npu/aclnn/__init__.py
index 48fccdbce7..0c0f783267 100644
--- a/torch_npu/npu/aclnn/__init__.py
+++ b/torch_npu/npu/aclnn/__init__.py
@@ -11,6 +11,8 @@ from torch.backends import __allow_nonbracketed_mutation, ContextProp, PropModul
 import torch_npu._C
 from .backends import version
 
+# determine to use static aclnn kernel
+use_static_aclnn_kernel = False
 
 def _set_allow_conv_hf32(_enabled: bool):
     r"""Set the device supports conv operation hf32.
diff --git a/torch_npu/utils/_graph_tree.py b/torch_npu/utils/_graph_tree.py
index b031f62e9b..4bcaec0697 100644
--- a/torch_npu/utils/_graph_tree.py
+++ b/torch_npu/utils/_graph_tree.py
@@ -50,6 +50,8 @@ from torch._inductor.utils import (
     InputType,
 )
 from torch.multiprocessing.reductions import StorageWeakRef
+import torch_npu.npu.aclnn
+from torch_npu._inductor import npu_static_kernel
 
 
 def npugraph_mark_step_begin():
@@ -163,8 +165,14 @@ def npugraphify_impl(
     stream = torch.npu.Stream()
     stream.wait_stream(torch.npu.current_stream())
     # copy static_inputs because it will be cleared in model
-    with torch.npu.stream(stream):
-        model(list(static_inputs))
+    if torch_npu.npu.aclnn.use_static_aclnn_kernel == True:
+        static_kernel_complier = StaticKernelCompiler()
+        with static_kernel_complier:
+            with torch.npu.stream(stream):
+                model(list(static_inputs))
+    else:
+        with torch.npu.stream(stream):
+            model(list(static_inputs))
     stream.synchronize()
     torch.npu.current_stream().wait_stream(stream)
     torch.npu.synchronize()
-- 
Gitee

From b236524a885bd4be6003904442b3971d2ee6fa5f Mon Sep 17 00:00:00 2001
From: JianxinZhang
Date: Tue, 19 Aug 2025 21:09:07 +0800
Subject: [PATCH 3/4] 20250819

---
 torch_npu/__init__.py                  |  2 +-
 torch_npu/_inductor/codegen/wrapper.py | 28 ++++++++++++++++++--------
 torch_npu/npu/_graph_tree.py           |  5 +++--
 torch_npu/utils/_graph_tree.py         |  5 +++--
 4 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/torch_npu/__init__.py b/torch_npu/__init__.py
index c2b4020bdb..bfcf14de64 100644
--- a/torch_npu/__init__.py
+++ b/torch_npu/__init__.py
@@ -18,7 +18,6 @@ import torch
 from torch.distributed.fsdp import sharded_grad_scaler
 from torch.utils.checkpoint import DefaultDeviceType
 import torch_npu
-from torch_npu._inductor import npu_static_kernel
 
 acc = torch._C._get_accelerator()
 if acc.type != "cpu":
@@ -278,6 +277,7 @@ def _npu_shutdown():
     _except_handler.handle_exception()
     torch_npu.asd.asd.matmul_check._cleanup()
     if torch_npu.npu.aclnn.use_static_aclnn_kernel == True:
+        from torch_npu._inductor.npu_static_kernel import uninstall_static_kernel
         uninstall_static_kernel()
 
 
diff --git a/torch_npu/_inductor/codegen/wrapper.py b/torch_npu/_inductor/codegen/wrapper.py
index e848a5e02d..7e81590ffb 100644
--- a/torch_npu/_inductor/codegen/wrapper.py
+++ b/torch_npu/_inductor/codegen/wrapper.py
@@ -15,7 +15,6 @@ from torch._subclasses.fake_tensor import FakeTensor, FakeTensorMode
 from torch.utils._sympy.singleton_int import SingletonInt
 
 from torch_npu._inductor import config as npu_config
-from torch_npu._inductor import npu_static_kernel
 import torch_npu.npu.aclnn
 
 
@@ -54,6 +53,7 @@ class NPUWrapperCodeGen(PythonWrapperCodegen):
                 )
                 from torch_npu._inductor.npu_triton_heuristics import grid
                 import torch_npu
+                has_initialized = True
             """
         if config.triton.autotune_at_compile_time:
             self.kernel_autotune_calls.splice(import_str)
@@ -248,16 +248,28 @@ class NPUWrapperCodeGen(PythonWrapperCodegen):
         call_str = f"repro_run({', '.join(V.graph.graph_inputs.keys())})"
         output.writeline(f"fn = lambda: {call_str}")
         output.writeline("return fn()")
-
+    
     def write_prefix(self) -> None:
         super().write_prefix()
         if torch_npu.npu.aclnn.use_static_aclnn_kernel == True:
-            with self.prefix.indent():
-                self.prefix.writeline('static_kernel_complier = StaticKernelCompiler()')
-                self.prefix.writeline('static_kernel_complier.enter()')
+            with self.prefix.indent():
+                self.prefix.writeline('global has_initialized')
+                self.prefix.writeline('if not has_initialized:')
+                self.prefix.do_indent()
+                with self.prefix.indent():
+                    self.prefix.writeline('print("33"*20)')
+                    self.prefix.writeline('from torch_npu._inductor.npu_static_kernel import StaticKernelCompiler')
+                    self.prefix.writeline('static_kernel_complier = StaticKernelCompiler()')
+                    self.prefix.writeline('static_kernel_complier.__enter__()')
+                    self.prefix.writeline('has_initialized = True')
+                self.prefix.do_unindent()
 
     def generate_return(self, output_refs: list[str]) -> None:
         if torch_npu.npu.aclnn.use_static_aclnn_kernel == True:
-            self.wrapper_call.writeline('exc_info=(None, None, None)')
-            self.wrapper_call.writeline('static_kernel_complier.__exit__(*exc_info)')
-        super().generate_return(output_refs)
\ No newline at end of file
+            self.wrapper_call.do_unindent()
+            with self.wrapper_call.indent():
+                self.wrapper_call.writeline('if not has_initialized:')
+                self.wrapper_call.do_indent()
+                with self.wrapper_call.indent():
+                    self.wrapper_call.writeline('exc_info=(None, None, None)')
+                    self.wrapper_call.writeline('static_kernel_complier.__exit__(*exc_info)')
\ No newline at end of file
diff --git a/torch_npu/npu/_graph_tree.py b/torch_npu/npu/_graph_tree.py
index b32e9d64b6..e34c9242ce 100644
--- a/torch_npu/npu/_graph_tree.py
+++ b/torch_npu/npu/_graph_tree.py
@@ -102,8 +102,6 @@ import torch_npu
 from torch_npu._C import (
     _npu_NPUAllocator_AllocatorState as AllocatorState,
     _set_cached_tensors_enabled as _set_cached_tensors_enabled)
-import torch_npu.npu.aclnn
-from torch_npu._inductor import npu_static_kernel
 
 if TYPE_CHECKING:
     from torch._inductor.utils import InputType
@@ -615,7 +613,10 @@ class NPUWarmupNode:
         ), disable_conv_cache_emptying(), clear_cublas_manager(), _use_npu_memory_pool_manager(
             self.device_index, self.npu_graphs_pool, self.stream
         ), get_history_recording():
+            import torch_npu.npu.aclnn
             if torch_npu.npu.aclnn.use_static_aclnn_kernel == True:
+                print("22"*20)
+                from torch_npu._inductor.npu_static_kernel import StaticKernelCompiler
                 static_kernel_complier = StaticKernelCompiler()
                 with static_kernel_complier:
                     out = self.wrapped_function.model(new_inputs)
diff --git a/torch_npu/utils/_graph_tree.py b/torch_npu/utils/_graph_tree.py
index 4bcaec0697..9400537791 100644
--- a/torch_npu/utils/_graph_tree.py
+++ b/torch_npu/utils/_graph_tree.py
@@ -50,8 +50,6 @@ from torch._inductor.utils import (
     InputType,
 )
 from torch.multiprocessing.reductions import StorageWeakRef
-import torch_npu.npu.aclnn
-from torch_npu._inductor import npu_static_kernel
 
 
 def npugraph_mark_step_begin():
@@ -165,7 +163,10 @@ def npugraphify_impl(
     stream = torch.npu.Stream()
     stream.wait_stream(torch.npu.current_stream())
     # copy static_inputs because it will be cleared in model
+    import torch_npu.npu.aclnn
     if torch_npu.npu.aclnn.use_static_aclnn_kernel == True:
+        print("55"*20)
+        from torch_npu._inductor.npu_static_kernel import StaticKernelCompiler
         static_kernel_complier = StaticKernelCompiler()
         with static_kernel_complier:
             with torch.npu.stream(stream):
-- 
Gitee

From ab23bda58b0d0bb5ac8615957a040b2878cd6920 Mon Sep 17 00:00:00 2001
From: JianxinZhang
Date: Tue, 19 Aug 2025 23:37:31 +0800
Subject: [PATCH 4/4] fix bug

---
 torch_npu/_inductor/codegen/wrapper.py | 7 ++++---
 torch_npu/npu/_graph_tree.py           | 2 +-
 torch_npu/utils/_graph_tree.py         | 2 +-
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/torch_npu/_inductor/codegen/wrapper.py b/torch_npu/_inductor/codegen/wrapper.py
index 7e81590ffb..826c4d63f6 100644
--- a/torch_npu/_inductor/codegen/wrapper.py
+++ b/torch_npu/_inductor/codegen/wrapper.py
@@ -53,7 +53,7 @@ class NPUWrapperCodeGen(PythonWrapperCodegen):
                 )
                 from torch_npu._inductor.npu_triton_heuristics import grid
                 import torch_npu
-                has_initialized = True
+                has_initialized = False
             """
         if config.triton.autotune_at_compile_time:
             self.kernel_autotune_calls.splice(import_str)
@@ -248,7 +248,7 @@ class NPUWrapperCodeGen(PythonWrapperCodegen):
         call_str = f"repro_run({', '.join(V.graph.graph_inputs.keys())})"
         output.writeline(f"fn = lambda: {call_str}")
         output.writeline("return fn()")
-    
+
     def write_prefix(self) -> None:
         super().write_prefix()
         if torch_npu.npu.aclnn.use_static_aclnn_kernel == True:
@@ -272,4 +272,5 @@ class NPUWrapperCodeGen(PythonWrapperCodegen):
                 self.wrapper_call.do_indent()
                 with self.wrapper_call.indent():
                     self.wrapper_call.writeline('exc_info=(None, None, None)')
-                    self.wrapper_call.writeline('static_kernel_complier.__exit__(*exc_info)')
\ No newline at end of file
+                    self.wrapper_call.writeline('static_kernel_complier.__exit__(*exc_info)')
+        super().generate_return(output_refs)
\ No newline at end of file
diff --git a/torch_npu/npu/_graph_tree.py b/torch_npu/npu/_graph_tree.py
index e34c9242ce..ff2fc421a6 100644
--- a/torch_npu/npu/_graph_tree.py
+++ b/torch_npu/npu/_graph_tree.py
@@ -102,6 +102,7 @@ import torch_npu
 from torch_npu._C import (
     _npu_NPUAllocator_AllocatorState as AllocatorState,
     _set_cached_tensors_enabled as _set_cached_tensors_enabled)
+import torch_npu.npu.aclnn
 
 if TYPE_CHECKING:
     from torch._inductor.utils import InputType
@@ -613,7 +614,6 @@ class NPUWarmupNode:
         ), disable_conv_cache_emptying(), clear_cublas_manager(), _use_npu_memory_pool_manager(
             self.device_index, self.npu_graphs_pool, self.stream
         ), get_history_recording():
-            import torch_npu.npu.aclnn
             if torch_npu.npu.aclnn.use_static_aclnn_kernel == True:
                 print("22"*20)
                 from torch_npu._inductor.npu_static_kernel import StaticKernelCompiler
diff --git a/torch_npu/utils/_graph_tree.py b/torch_npu/utils/_graph_tree.py
index 9400537791..e40aa91942 100644
--- a/torch_npu/utils/_graph_tree.py
+++ b/torch_npu/utils/_graph_tree.py
@@ -50,6 +50,7 @@ from torch._inductor.utils import (
     InputType,
 )
 from torch.multiprocessing.reductions import StorageWeakRef
+import torch_npu.npu.aclnn
 
 
 def npugraph_mark_step_begin():
@@ -163,7 +164,6 @@ def npugraphify_impl(
     stream = torch.npu.Stream()
     stream.wait_stream(torch.npu.current_stream())
     # copy static_inputs because it will be cleared in model
-    import torch_npu.npu.aclnn
     if torch_npu.npu.aclnn.use_static_aclnn_kernel == True:
         print("55"*20)
         from torch_npu._inductor.npu_static_kernel import StaticKernelCompiler
-- 
Gitee