From 3e9ba39f926ff49363d25e6dd23006e62a49a1b6 Mon Sep 17 00:00:00 2001
From: JianxinZhang <zhangjianxin27@huawei.com>
Date: Mon, 18 Aug 2025 17:09:14 +0800
Subject: [PATCH 1/4] start and end dump

---
 torch_npu/csrc/npu/Module.cpp | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp
index c66f2a8d52..3a6fa91e27 100644
--- a/torch_npu/csrc/npu/Module.cpp
+++ b/torch_npu/csrc/npu/Module.cpp
@@ -51,6 +51,7 @@
 #include "torch_npu/csrc/aten/common/from_blob.h"
 #include "torch_npu/csrc/profiler/combined_traceback.h"
 #include "torch_npu/csrc/profiler/python/combined_traceback.h"
+#include "torch_npu/csrc/framework/interface/AclInterface.h"
 
 struct NPUDeviceProp {
     std::string name;
@@ -1834,6 +1835,30 @@ static PyObject* THNPModule_reset_device_res_limit(PyObject* self, PyObject *arg
     END_HANDLE_TH_ERRORS
 }
 
+static PyObject* THNPModule_aclop_start_dump(PyObject* self, PyObject* args)
+{
+    HANDLE_TH_ERRORS
+    // Registered as METH_O, so `args` is the single dump-path argument itself.
+    std::string module = THPUtils_unpackString(args);
+    const char *path = module.c_str();
+    // Dump-type flag forwarded unchanged to the ACL interface.
+    uint32_t dumpType = 0x00000001U;
+    at_npu::native::AclopStartDumpArgs(dumpType, path);
+    Py_RETURN_NONE;
+    END_HANDLE_TH_ERRORS
+}
+
+static PyObject* THNPModule_aclop_stop_dump(PyObject* self, PyObject* noargs)
+{
+    HANDLE_TH_ERRORS
+    // Registered as METH_NOARGS; `noargs` is not used.
+    // Same dump-type flag value that THNPModule_aclop_start_dump passes.
+    uint32_t dumpType = 0x00000001U;
+    at_npu::native::AclopStopDumpArgs(dumpType);
+    Py_RETURN_NONE;
+    END_HANDLE_TH_ERRORS
+}
+
 static struct PyMethodDef THNPModule_methods[] = {
     {"_npu_init", (PyCFunction)THNPModule_initExtension, METH_NOARGS, nullptr},
     {"_npu_set_run_yet_variable_to_false", (PyCFunction)THNPModule_set_run_yet_variable_to_false_wrap, METH_NOARGS, nullptr},
@@ -1902,6 +1927,8 @@ static struct PyMethodDef THNPModule_methods[] = {
     {"_npu_get_device_res_limit", (PyCFunction)THNPModule_get_device_res_limit, METH_VARARGS, nullptr},
     {"_npu_set_device_res_limit", (PyCFunction)THNPModule_set_device_res_limit, METH_VARARGS, nullptr},
     {"_npu_reset_device_res_limit", (PyCFunction)THNPModule_reset_device_res_limit, METH_O, nullptr},
+    {"_aclop_start_dump", (PyCFunction)THNPModule_aclop_start_dump, METH_O, nullptr},
+    {"_aclop_stop_dump", (PyCFunction)THNPModule_aclop_stop_dump, METH_NOARGS, nullptr},
     {nullptr}};
 
 TORCH_NPU_API PyMethodDef* THNPModule_get_methods()
-- 
Gitee


From 6cdd03ec831c8811f62a15a174b54e7d3c118881 Mon Sep 17 00:00:00 2001
From: JianxinZhang <zhangjianxin27@huawei.com>
Date: Mon, 18 Aug 2025 19:33:33 +0800
Subject: [PATCH 2/4] static kernel inductor

---
 torch_npu/__init__.py                    |   3 +
 torch_npu/_inductor/codegen/wrapper.py   |  15 ++
 torch_npu/_inductor/npu_static_kernel.py | 311 +++++++++++++++++++++++
 torch_npu/npu/_graph_tree.py             |  10 +-
 torch_npu/npu/aclnn/__init__.py          |   2 +
 torch_npu/utils/_graph_tree.py           |  12 +-
 6 files changed, 349 insertions(+), 4 deletions(-)
 create mode 100644 torch_npu/_inductor/npu_static_kernel.py

diff --git a/torch_npu/__init__.py b/torch_npu/__init__.py
index 9e7cb7b250..c2b4020bdb 100644
--- a/torch_npu/__init__.py
+++ b/torch_npu/__init__.py
@@ -18,6 +18,7 @@ import torch
 from torch.distributed.fsdp import sharded_grad_scaler
 from torch.utils.checkpoint import DefaultDeviceType
 import torch_npu
+from torch_npu._inductor import npu_static_kernel
 
 acc = torch._C._get_accelerator()
 if acc.type != "cpu":
@@ -276,6 +277,8 @@ def _npu_shutdown():
     torch_npu._C._npu_shutdown(success)
     _except_handler.handle_exception()
     torch_npu.asd.asd.matmul_check._cleanup()
+    if torch_npu.npu.aclnn.use_static_aclnn_kernel == True:
+        uninstall_static_kernel()
 
 
 # register npu shutdown hook on exit
diff --git a/torch_npu/_inductor/codegen/wrapper.py b/torch_npu/_inductor/codegen/wrapper.py
index a2a29fe645..e848a5e02d 100644
--- a/torch_npu/_inductor/codegen/wrapper.py
+++ b/torch_npu/_inductor/codegen/wrapper.py
@@ -15,6 +15,8 @@ from torch._subclasses.fake_tensor import FakeTensor, FakeTensorMode
 from torch.utils._sympy.singleton_int import SingletonInt
 
 from torch_npu._inductor import config as npu_config
+from torch_npu._inductor import npu_static_kernel
+import torch_npu.npu.aclnn
 
 
 class NPUWrapperCodeGen(PythonWrapperCodegen):
@@ -246,3 +248,16 @@ class NPUWrapperCodeGen(PythonWrapperCodegen):
             call_str = f"repro_run({', '.join(V.graph.graph_inputs.keys())})"
             output.writeline(f"fn = lambda: {call_str}")
             output.writeline("return fn()")
+
+    def write_prefix(self) -> None:
+        super().write_prefix()
+        if torch_npu.npu.aclnn.use_static_aclnn_kernel == True:
+        with self.prefix.indent():
+            self.prefix.writeline('static_kernel_complier = StaticKernelCompiler()')
+            self.prefix.writeline('static_kernel_complier.enter()')
+
+    def generate_return(self, output_refs: list[str]) -> None:
+        if torch_npu.npu.aclnn.use_static_aclnn_kernel == True:
+            self.wrapper_call.writeline('exc_info=(None, None, None)')
+            self.wrapper_call.writeline('static_kernel_complier.__exit__(*exc_info)')
+        super().generate_return(output_refs)
\ No newline at end of file
diff --git a/torch_npu/_inductor/npu_static_kernel.py b/torch_npu/_inductor/npu_static_kernel.py
new file mode 100644
index 0000000000..710552cdaf
--- /dev/null
+++ b/torch_npu/_inductor/npu_static_kernel.py
@@ -0,0 +1,311 @@
+import os
+import subprocess
+import datetime
+from enum import IntEnum
+from pathlib import Path
+
+import torch
+import torch_npu
+import torch_npu.npu.utils as utils
+from .config import log
+
+_uninstall_path = None
+
+class SocVersion(IntEnum):  # integer SoC ids and their names; looked up by get_soc_version_name()
+    UnsupportedSocVersion = -1  # placeholder get_soc_version_name() falls back to for unlisted ids
+    Ascend910PremiumA = 100
+    Ascend910ProA = 101
+    Ascend910A = 102
+    Ascend910ProB = 103
+    Ascend910B = 104
+    Ascend310P1 = 200
+    Ascend310P2 = 201
+    Ascend310P3 = 202
+    Ascend310P4 = 203
+    Ascend310P5 = 204
+    Ascend310P7 = 206
+    Ascend910B1 = 220
+    Ascend910B2 = 221
+    Ascend910B2C = 222
+    Ascend910B3 = 223
+    Ascend910B4 = 224
+    Ascend910B4_1 = 225
+    Ascend310B1 = 240
+    Ascend310B2 = 241
+    Ascend310B3 = 242
+    Ascend310B4 = 243
+    Ascend910_9391 = 250
+    Ascend910_9392 = 251
+    Ascend910_9381 = 252
+    Ascend910_9382 = 253
+    Ascend910_9372 = 254
+    Ascend910_9362 = 255
+
+def get_soc_version_name():  # returns the SocVersion member name for the id from utils.get_soc_version()
+    soc_version = utils.get_soc_version()  # used below as the SocVersion lookup value
+    try:
+        return SocVersion(soc_version).name
+    except ValueError:  # raised by the enum lookup when the id is not a listed member
+        return SocVersion(-1).name  # i.e. "UnsupportedSocVersion"
+
+def safe_resolve_output_dir(build_dir: str):
+    """Create and return a unique, timestamped output directory under ``build_dir`` (cwd if None).
+
+    Raises ValueError for a NUL byte, a '..' component, a symlinked component or an unresolvable
+    path, and RuntimeError when the directory cannot be created.
+    """
+    base_dir = Path.cwd().resolve()
+    if build_dir is not None:
+        if "\x00" in build_dir:
+            raise ValueError("build_dir contains null byte")
+        candidate = Path(build_dir)
+        if ".." in candidate.parts:
+            raise ValueError("build_dir must not contain '..'")
+        script_dir = candidate if candidate.is_absolute() else base_dir / candidate
+        cur = Path(script_dir.anchor)
+        for part in script_dir.parts[1:]:
+            cur = cur / part
+            # is_symlink() alone also catches dangling links, which exists() would report as absent.
+            if cur.is_symlink():
+                raise ValueError(f"symlink detected in path: {cur}")
+        try:
+            script_dir = script_dir.resolve(strict=False)
+        except Exception as e:
+            raise ValueError(f"cannot resolve path {script_dir}: {e}") from e
+    else:
+        script_dir = base_dir  # default: create the temporary dump folder (op-info JSON) in the cwd
+
+    timestamp = f"{datetime.datetime.now().strftime('%Y%m%d%H%M%S%f')}_{os.getpid()}"
+    result_root = script_dir / f"{timestamp}_kernel_aot_optimization_build_outputs"
+    try:
+        result_root.mkdir(exist_ok=True)
+    except OSError as e:  # PermissionError is a subclass of OSError
+        raise RuntimeError(f"failed to create output directory {result_root}: {e}") from e
+    return result_root
+
+class AclopDumpContext:  # context manager: the aclop dump is active for the duration of the with-block
+    def __init__(self, save_path: str):
+        self.save_path = save_path  # dump directory; converted to str when the dump is started
+
+    def __enter__(self):
+        torch_npu._C._aclop_start_dump(str(self.save_path))  # binding unpacks a string path
+        return self  # optional
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        torch_npu._C._aclop_stop_dump()  # returns None, so an exception from the block propagates
+
+def save_uninstall_info(filename: str):
+    """Find uninstall.sh of the static-kernel package ``filename`` and store its path globally."""
+    global _uninstall_path
+    install_root = Path(os.environ["ASCEND_HOME_PATH"]).parent
+    pattern = f"*/opp/static_kernel/ai_core/{filename}/uninstall.sh"
+    for script in install_root.glob(pattern):
+        _uninstall_path = str(script)  # the first match is the one that is kept
+        return
+    # Nothing matched: clear any previously stored path and report the pattern.
+    _uninstall_path = None
+    log.debug(f"can not find uninstall path, pattern: {pattern}")
+    print(f"can not find uninstall path, pattern: {pattern}")
+
+class StaticKernelCompiler:
+    """
+    Context manager that automates the dump, compilation and installation of static kernels.
+    """
+    def __init__(self, build_dir=None):
+        """
+        Initialise the compiler and create the directory that holds the build artifacts.
+        :param build_dir: optional output location; when None it is created under the cwd.
+        """
+        self.result_root = safe_resolve_output_dir(build_dir)
+        print(f"StaticKernelCompiler initialized. Build directory: {self.result_root}")
+
+    def __enter__(self):
+        """
+        Called on entering the with-block; starts the aclop dump.
+        """
+        print("[INFO] Starting operator dump...")
+        # NOTE: assumes torch_npu._C._aclop_start_dump is the correct entry point for starting the dump
+        torch_npu._C._aclop_start_dump(str(self.result_root))
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """
+        Called on leaving the with-block; stops the dump, then compiles and installs the kernels.
+        """
+        print("[INFO] Stopping operator dump.")
+        # Stop the dump first so it is stopped even when the with-block raised.
+        torch_npu._C._aclop_stop_dump()
+
+        if exc_type:
+            print(f" An exception occurred during model execution: {exc_val}")
+            print("[INFO] Skipping static kernel compilation due to the error.")
+            return  # falsy return: the exception is not suppressed and reaches the caller
+
+        print("[INFO] Starting static kernel compilation process...")
+
+        debug_dirs = [d for d in self.result_root.iterdir()
+                      if d.is_dir() and d.name.endswith("_debug")]
+        if not debug_dirs:
+            print(" Can not find json of ops, skipping op_compiler.")
+            return
+
+        debug_dir = max(debug_dirs, key=lambda d: d.stat().st_mtime)
+        json_files = list(debug_dir.glob("*.json"))
+        if not json_files:
+            print(f" No JSON files in {debug_dir}, skipping op_compiler.")
+            return
+
+        cmd = [
+            "op_compiler",
+            "-p", str(debug_dir),
+            "-v", get_soc_version_name(),
+            "-l", "info",
+            "-j", "4",
+            "-o", str(self.result_root),
+        ]
+        try:
+            print(f" Executing op_compiler command: {' '.join(cmd)}")
+            res = subprocess.run(cmd, check=True, capture_output=True, text=True)
+            print(f" op_compiler execution successful, msg: {res.stdout}")
+        except (subprocess.CalledProcessError, OSError) as e:  # OSError: op_compiler cannot be launched
+            print(f" op_compiler execution failed, msg: {getattr(e, 'stderr', None) or e}")
+            return
+
+        for run_pkg in self.result_root.glob("*.run"):
+            filename = run_pkg.name
+            try:
+                print(f"[INFO] Installing static kernel package: {filename}")
+                result = subprocess.run([str(run_pkg)], check=True, capture_output=True, text=True)
+                print(f" {filename} install successful, msg: {result.stdout}")
+                save_uninstall_info(filename[:-4])
+                torch_npu.npu._aclnn_reselect_static_kernel()
+            except (subprocess.CalledProcessError, OSError) as e:  # OSError: package cannot be launched
+                print(f" {filename} install failed, msg: {getattr(e, 'stderr', None) or e}")
+
+def compile_static_kernel(build_dir = None, *args, **kwargs):
+    """Dump op info while running the built-in sample model, then compile and install static kernels.
+
+    ``build_dir`` is forwarded to safe_resolve_output_dir(); ``*args`` and ``**kwargs`` are unused.
+    Returns None; a failing op_compiler run or package install is logged, not raised.
+    """
+    log.info(f"start compiler static kernel")
+    print(f"Start compiler static kernel")
+    result_root = safe_resolve_output_dir(build_dir)
+
+    # Run the ops once with the dump active to produce the operator-info JSON files.
+    torch_npu._C._aclop_start_dump(str(result_root))
+    try:
+        # TODO: run the caller's graph instead, e.g. torch.fx.Interpreter(fx_graph).run(*args, **kwargs)
+        test_model()
+    finally:
+        torch_npu._C._aclop_stop_dump()
+
+    debug_dirs = [d for d in result_root.iterdir()
+                  if d.is_dir() and d.name.endswith("_debug")]
+    if not debug_dirs:
+        log.error("Can not find json of ops, do not excute op_compiler")
+        print("Can not find json of ops, do not excute op_compiler")
+        return
+
+    debug_dir = max(debug_dirs, key=lambda d: d.stat().st_mtime)
+
+    json_files = list(debug_dir.glob("*.json"))
+    if not json_files:
+        log.error(f"[ERROR] No JSON files in {debug_dir}, skip op_compiler")
+        print(f"No JSON files in {debug_dir}, skip op_compiler")
+        return
+
+    cmd = [
+        "op_compiler",
+        "-p", str(debug_dir),
+        "-v", get_soc_version_name(),
+        "-l", "info",
+        "-j", "4",
+        "-o", str(result_root),
+    ]
+    try:
+        res = subprocess.run(cmd, check=True, capture_output=True, text=True)
+        log.debug(f"excute op_compiler, msg: {res.stdout}")
+        print(f"excute op_compiler, msg: {res.stdout}")
+    except (subprocess.CalledProcessError, OSError) as e:  # OSError: op_compiler cannot be launched
+        log.warning(f"[WARNING] excute op_compiler error, msg: {getattr(e, 'stderr', None) or e}")
+        print(f"excute op_compiler error, msg: {getattr(e, 'stderr', None) or e}")
+        return
+
+    for run_pkg in result_root.glob("*.run"):
+        filename = run_pkg.name
+        try:
+            result = subprocess.run(
+                [str(run_pkg)],
+                check=True,
+                capture_output=True,
+                text=True
+            )
+            log.debug(f"{filename} static kernel run pkg install success, msg: {result.stdout}")
+            print(f"{filename} static kernel run pkg install success, msg: {result.stdout}")
+            save_uninstall_info(filename[:-4])
+            torch_npu.npu._aclnn_reselect_static_kernel()
+        except (subprocess.CalledProcessError, OSError) as e:  # OSError: package cannot be launched
+            log.warning(f"{filename} static kernel run pkg install failed, msg: {getattr(e, 'stderr', None) or e}")
+            print(f"{filename} static kernel run pkg install failed, msg: {getattr(e, 'stderr', None) or e}")
+
+def uninstall_static_kernel():
+    """Run the uninstall script recorded by save_uninstall_info(), if any, then forget its path.
+
+    A failing or unlaunchable script is reported through the log and stdout, not raised.
+    """
+    global _uninstall_path
+    if not _uninstall_path:
+        log.debug(f"uninstall_path is none, skip uninstall static kernel")
+        print(f"uninstall_path is none, skip uninstall static kernel")
+        return
+
+    try:
+        result = subprocess.run([_uninstall_path], check=True, capture_output=True, text=True)
+        log.debug(f"{_uninstall_path} uninstall success, msg: \n{result.stdout}")
+        print(f"[DEBUG] {_uninstall_path} uninstall success, msg: \n{result.stdout}")
+    except (subprocess.CalledProcessError, OSError) as e:  # OSError: script cannot be launched
+        log.debug(f"{_uninstall_path} uninstall failed, msg: \n{getattr(e, 'stderr', None) or e}")
+        print(f"[ERROR] {_uninstall_path} uninstall failed, msg: \n{getattr(e, 'stderr', None) or e}")
+    finally:
+        # One-shot: the path is cleared whether or not the script succeeded.
+        _uninstall_path = None
+
+def test_model():  # runs one forward pass of a small sample model on the NPU; returns None
+    class Model(torch.nn.Module):  # Linear(1024 -> 2048) -> ReLU -> Linear(2048 -> 1024)
+        def __init__(self):
+            super().__init__()
+            self.linear1 = torch.nn.Linear(1024, 2048)
+            self.relu1 = torch.nn.ReLU()
+            self.linear2 = torch.nn.Linear(2048, 1024)
+
+        def forward(self, x):
+            ln1 = self.linear1(x)
+            ln2 = self.relu1(ln1)
+            ln3 = self.linear2(ln2)
+            return ln3
+
+    model = Model().npu()  # move the parameters to the NPU device
+    # config = CompilerConfig()
+    # config.mode = "reduce-overhead"
+    # config.debug.aclgraph.enable_aclnn_kernel = True
+    # aclgraph_backend = torchair.get_npu_backend(compiler_config=config)
+    # model = torch.compile(Model(), backend=aclgraph_backend, dynamic=False)
+    x = torch.randn(32, 1024).npu()  # random input: 32 rows of 1024 features
+    y = model(x)  # result is not used; the call is made for the ops it executes
+
+if __name__=="__main__":  # manual smoke test: dump/compile/install for the sample model, then uninstall
+    static_kernel_complier = StaticKernelCompiler()
+    with static_kernel_complier:
+        test_model()
+    # exc_info=(None, None, None)
+    # try:
+    #     static_kernel_complier.__enter__()
+    #     test_model()
+    # except Exception:
+    #     print("ERROR")
+    # finally:
+    #     static_kernel_complier.__exit__(*exc_info)
+
+    uninstall_static_kernel()
+    
\ No newline at end of file
diff --git a/torch_npu/npu/_graph_tree.py b/torch_npu/npu/_graph_tree.py
index 1149dd53d2..b32e9d64b6 100644
--- a/torch_npu/npu/_graph_tree.py
+++ b/torch_npu/npu/_graph_tree.py
@@ -102,7 +102,8 @@ import torch_npu
 from torch_npu._C import (
     _npu_NPUAllocator_AllocatorState as AllocatorState,
     _set_cached_tensors_enabled as _set_cached_tensors_enabled)
-
+import torch_npu.npu.aclnn
+from torch_npu._inductor import npu_static_kernel
 
 if TYPE_CHECKING:
     from torch._inductor.utils import InputType
@@ -614,7 +615,12 @@ class NPUWarmupNode:
         ), disable_conv_cache_emptying(), clear_cublas_manager(), _use_npu_memory_pool_manager(
             self.device_index, self.npu_graphs_pool, self.stream
         ), get_history_recording():
-            out = self.wrapped_function.model(new_inputs)
+            if torch_npu.npu.aclnn.use_static_aclnn_kernel == True:
+                static_kernel_complier = StaticKernelCompiler()
+                with static_kernel_complier:
+                    out = self.wrapped_function.model(new_inputs)
+            else:
+                out = self.wrapped_function.model(new_inputs)
 
         # We need to know which outputs are allocated within the cudagraph pool
         # so that we can deallocate them at the beginning of the next cudagraph step,
diff --git a/torch_npu/npu/aclnn/__init__.py b/torch_npu/npu/aclnn/__init__.py
index 48fccdbce7..0c0f783267 100644
--- a/torch_npu/npu/aclnn/__init__.py
+++ b/torch_npu/npu/aclnn/__init__.py
@@ -11,6 +11,8 @@ from torch.backends import __allow_nonbracketed_mutation, ContextProp, PropModul
 import torch_npu._C
 from .backends import version
 
+# determine to use static aclnn kernel
+use_static_aclnn_kernel = False
 
 def _set_allow_conv_hf32(_enabled: bool):
     r"""Set the device supports conv operation hf32.
diff --git a/torch_npu/utils/_graph_tree.py b/torch_npu/utils/_graph_tree.py
index b031f62e9b..4bcaec0697 100644
--- a/torch_npu/utils/_graph_tree.py
+++ b/torch_npu/utils/_graph_tree.py
@@ -50,6 +50,8 @@ from torch._inductor.utils import (
     InputType,
 )
 from torch.multiprocessing.reductions import StorageWeakRef
+import torch_npu.npu.aclnn
+from torch_npu._inductor import npu_static_kernel
 
 
 def npugraph_mark_step_begin():
@@ -163,8 +165,14 @@ def npugraphify_impl(
     stream = torch.npu.Stream()
     stream.wait_stream(torch.npu.current_stream())
     # copy static_inputs because it will be cleared in model
-    with torch.npu.stream(stream):
-        model(list(static_inputs))
+    if torch_npu.npu.aclnn.use_static_aclnn_kernel == True:
+        static_kernel_complier = StaticKernelCompiler()
+        with static_kernel_complier:
+            with torch.npu.stream(stream):
+                model(list(static_inputs))
+    else:
+        with torch.npu.stream(stream):
+            model(list(static_inputs))
     stream.synchronize()
     torch.npu.current_stream().wait_stream(stream)
     torch.npu.synchronize()
-- 
Gitee


From b236524a885bd4be6003904442b3971d2ee6fa5f Mon Sep 17 00:00:00 2001
From: JianxinZhang <zhangjianxin27@huawei.com>
Date: Tue, 19 Aug 2025 21:09:07 +0800
Subject: [PATCH 3/4] 20250819

---
 torch_npu/__init__.py                  |  2 +-
 torch_npu/_inductor/codegen/wrapper.py | 28 ++++++++++++++++++--------
 torch_npu/npu/_graph_tree.py           |  5 +++--
 torch_npu/utils/_graph_tree.py         |  5 +++--
 4 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/torch_npu/__init__.py b/torch_npu/__init__.py
index c2b4020bdb..bfcf14de64 100644
--- a/torch_npu/__init__.py
+++ b/torch_npu/__init__.py
@@ -18,7 +18,6 @@ import torch
 from torch.distributed.fsdp import sharded_grad_scaler
 from torch.utils.checkpoint import DefaultDeviceType
 import torch_npu
-from torch_npu._inductor import npu_static_kernel
 
 acc = torch._C._get_accelerator()
 if acc.type != "cpu":
@@ -278,6 +277,7 @@ def _npu_shutdown():
     _except_handler.handle_exception()
     torch_npu.asd.asd.matmul_check._cleanup()
     if torch_npu.npu.aclnn.use_static_aclnn_kernel == True:
+        from torch_npu._inductor.npu_static_kernel import uninstall_static_kernel
         uninstall_static_kernel()
 
 
diff --git a/torch_npu/_inductor/codegen/wrapper.py b/torch_npu/_inductor/codegen/wrapper.py
index e848a5e02d..7e81590ffb 100644
--- a/torch_npu/_inductor/codegen/wrapper.py
+++ b/torch_npu/_inductor/codegen/wrapper.py
@@ -15,7 +15,6 @@ from torch._subclasses.fake_tensor import FakeTensor, FakeTensorMode
 from torch.utils._sympy.singleton_int import SingletonInt
 
 from torch_npu._inductor import config as npu_config
-from torch_npu._inductor import npu_static_kernel
 import torch_npu.npu.aclnn
 
 
@@ -54,6 +53,7 @@ class NPUWrapperCodeGen(PythonWrapperCodegen):
             )
             from torch_npu._inductor.npu_triton_heuristics import grid
             import torch_npu
+            has_initialized = True
             """
         if config.triton.autotune_at_compile_time:
             self.kernel_autotune_calls.splice(import_str)
@@ -248,16 +248,28 @@ class NPUWrapperCodeGen(PythonWrapperCodegen):
             call_str = f"repro_run({', '.join(V.graph.graph_inputs.keys())})"
             output.writeline(f"fn = lambda: {call_str}")
             output.writeline("return fn()")
-
+    
     def write_prefix(self) -> None:
         super().write_prefix()
         if torch_npu.npu.aclnn.use_static_aclnn_kernel == True:
-        with self.prefix.indent():
-            self.prefix.writeline('static_kernel_complier = StaticKernelCompiler()')
-            self.prefix.writeline('static_kernel_complier.enter()')
+            with self.prefix.indent():
+                self.prefix.writeline('global has_initialized')
+                self.prefix.writeline('if not has_initialized:')
+            self.prefix.do_indent()
+            with self.prefix.indent():
+                self.prefix.writeline('print("33"*20)')
+                self.prefix.writeline('from torch_npu._inductor.npu_static_kernel import StaticKernelCompiler')
+                self.prefix.writeline('static_kernel_complier = StaticKernelCompiler()')
+                self.prefix.writeline('static_kernel_complier.__enter__()')
+                self.prefix.writeline('has_initialized = True')
+            self.prefix.do_unindent()
 
     def generate_return(self, output_refs: list[str]) -> None:
         if torch_npu.npu.aclnn.use_static_aclnn_kernel == True:
-            self.wrapper_call.writeline('exc_info=(None, None, None)')
-            self.wrapper_call.writeline('static_kernel_complier.__exit__(*exc_info)')
-        super().generate_return(output_refs)
\ No newline at end of file
+            self.wrapper_call.do_unindent()
+            with self.wrapper_call.indent():
+                self.wrapper_call.writeline('if not has_initialized:')
+            self.wrapper_call.do_indent()
+            with self.wrapper_call.indent():
+                self.wrapper_call.writeline('exc_info=(None, None, None)')
+                self.wrapper_call.writeline('static_kernel_complier.__exit__(*exc_info)')
\ No newline at end of file
diff --git a/torch_npu/npu/_graph_tree.py b/torch_npu/npu/_graph_tree.py
index b32e9d64b6..e34c9242ce 100644
--- a/torch_npu/npu/_graph_tree.py
+++ b/torch_npu/npu/_graph_tree.py
@@ -102,8 +102,6 @@ import torch_npu
 from torch_npu._C import (
     _npu_NPUAllocator_AllocatorState as AllocatorState,
     _set_cached_tensors_enabled as _set_cached_tensors_enabled)
-import torch_npu.npu.aclnn
-from torch_npu._inductor import npu_static_kernel
 
 if TYPE_CHECKING:
     from torch._inductor.utils import InputType
@@ -615,7 +613,10 @@ class NPUWarmupNode:
         ), disable_conv_cache_emptying(), clear_cublas_manager(), _use_npu_memory_pool_manager(
             self.device_index, self.npu_graphs_pool, self.stream
         ), get_history_recording():
+            import torch_npu.npu.aclnn
             if torch_npu.npu.aclnn.use_static_aclnn_kernel == True:
+                print("22"*20)
+                from torch_npu._inductor.npu_static_kernel import StaticKernelCompiler
                 static_kernel_complier = StaticKernelCompiler()
                 with static_kernel_complier:
                     out = self.wrapped_function.model(new_inputs)
diff --git a/torch_npu/utils/_graph_tree.py b/torch_npu/utils/_graph_tree.py
index 4bcaec0697..9400537791 100644
--- a/torch_npu/utils/_graph_tree.py
+++ b/torch_npu/utils/_graph_tree.py
@@ -50,8 +50,6 @@ from torch._inductor.utils import (
     InputType,
 )
 from torch.multiprocessing.reductions import StorageWeakRef
-import torch_npu.npu.aclnn
-from torch_npu._inductor import npu_static_kernel
 
 
 def npugraph_mark_step_begin():
@@ -165,7 +163,10 @@ def npugraphify_impl(
     stream = torch.npu.Stream()
     stream.wait_stream(torch.npu.current_stream())
     # copy static_inputs because it will be cleared in model
+    import torch_npu.npu.aclnn
     if torch_npu.npu.aclnn.use_static_aclnn_kernel == True:
+        print("55"*20)
+        from torch_npu._inductor.npu_static_kernel import StaticKernelCompiler
         static_kernel_complier = StaticKernelCompiler()
         with static_kernel_complier:
             with torch.npu.stream(stream):
-- 
Gitee


From ab23bda58b0d0bb5ac8615957a040b2878cd6920 Mon Sep 17 00:00:00 2001
From: JianxinZhang <zhangjianxin27@huawei.com>
Date: Tue, 19 Aug 2025 23:37:31 +0800
Subject: [PATCH 4/4] fix bug

---
 torch_npu/_inductor/codegen/wrapper.py | 7 ++++---
 torch_npu/npu/_graph_tree.py           | 2 +-
 torch_npu/utils/_graph_tree.py         | 2 +-
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/torch_npu/_inductor/codegen/wrapper.py b/torch_npu/_inductor/codegen/wrapper.py
index 7e81590ffb..826c4d63f6 100644
--- a/torch_npu/_inductor/codegen/wrapper.py
+++ b/torch_npu/_inductor/codegen/wrapper.py
@@ -53,7 +53,7 @@ class NPUWrapperCodeGen(PythonWrapperCodegen):
             )
             from torch_npu._inductor.npu_triton_heuristics import grid
             import torch_npu
-            has_initialized = True
+            has_initialized = False
             """
         if config.triton.autotune_at_compile_time:
             self.kernel_autotune_calls.splice(import_str)
@@ -248,7 +248,7 @@ class NPUWrapperCodeGen(PythonWrapperCodegen):
             call_str = f"repro_run({', '.join(V.graph.graph_inputs.keys())})"
             output.writeline(f"fn = lambda: {call_str}")
             output.writeline("return fn()")
-    
+
     def write_prefix(self) -> None:
         super().write_prefix()
         if torch_npu.npu.aclnn.use_static_aclnn_kernel == True:
@@ -272,4 +272,5 @@ class NPUWrapperCodeGen(PythonWrapperCodegen):
             self.wrapper_call.do_indent()
             with self.wrapper_call.indent():
                 self.wrapper_call.writeline('exc_info=(None, None, None)')
-                self.wrapper_call.writeline('static_kernel_complier.__exit__(*exc_info)')
\ No newline at end of file
+                self.wrapper_call.writeline('static_kernel_complier.__exit__(*exc_info)')
+        super().generate_return(output_refs)
\ No newline at end of file
diff --git a/torch_npu/npu/_graph_tree.py b/torch_npu/npu/_graph_tree.py
index e34c9242ce..ff2fc421a6 100644
--- a/torch_npu/npu/_graph_tree.py
+++ b/torch_npu/npu/_graph_tree.py
@@ -102,6 +102,7 @@ import torch_npu
 from torch_npu._C import (
     _npu_NPUAllocator_AllocatorState as AllocatorState,
     _set_cached_tensors_enabled as _set_cached_tensors_enabled)
+import torch_npu.npu.aclnn
 
 if TYPE_CHECKING:
     from torch._inductor.utils import InputType
@@ -613,7 +614,6 @@ class NPUWarmupNode:
         ), disable_conv_cache_emptying(), clear_cublas_manager(), _use_npu_memory_pool_manager(
             self.device_index, self.npu_graphs_pool, self.stream
         ), get_history_recording():
-            import torch_npu.npu.aclnn
             if torch_npu.npu.aclnn.use_static_aclnn_kernel == True:
                 print("22"*20)
                 from torch_npu._inductor.npu_static_kernel import StaticKernelCompiler
diff --git a/torch_npu/utils/_graph_tree.py b/torch_npu/utils/_graph_tree.py
index 9400537791..e40aa91942 100644
--- a/torch_npu/utils/_graph_tree.py
+++ b/torch_npu/utils/_graph_tree.py
@@ -50,6 +50,7 @@ from torch._inductor.utils import (
     InputType,
 )
 from torch.multiprocessing.reductions import StorageWeakRef
+import torch_npu.npu.aclnn
 
 
 def npugraph_mark_step_begin():
@@ -163,7 +164,6 @@ def npugraphify_impl(
     stream = torch.npu.Stream()
     stream.wait_stream(torch.npu.current_stream())
     # copy static_inputs because it will be cleared in model
-    import torch_npu.npu.aclnn
     if torch_npu.npu.aclnn.use_static_aclnn_kernel == True:
         print("55"*20)
         from torch_npu._inductor.npu_static_kernel import StaticKernelCompiler
-- 
Gitee