From d703649b3d68bb0178e4653512b44b4deae1a49b Mon Sep 17 00:00:00 2001
From: hli28146 <a1967953220@163.com>
Date: Sat, 6 Dec 2025 17:33:00 +0800
Subject: [PATCH 1/2] =?UTF-8?q?[Level=204=20=E7=A4=BA=E4=BE=8B=E5=BC=80?=
 =?UTF-8?q?=E5=8F=91]=20=E3=80=90tanhshrink=E3=80=91=E4=BD=BF=E7=94=A8?=
 =?UTF-8?q?=E6=A1=88=E4=BE=8B?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 examples/tanhshrink/README.md              | 122 +++++++++++++++
 examples/tanhshrink/example_tanh_shrink.py | 169 +++++++++++++++++++++
 examples/tanhshrink/test_tanh_shrink.py    |  13 ++
 3 files changed, 304 insertions(+)
 create mode 100644 examples/tanhshrink/README.md
 create mode 100644 examples/tanhshrink/example_tanh_shrink.py
 create mode 100644 examples/tanhshrink/test_tanh_shrink.py

diff --git a/examples/tanhshrink/README.md b/examples/tanhshrink/README.md
new file mode 100644
index 00000000..5ad26e0b
--- /dev/null
+++ b/examples/tanhshrink/README.md
@@ -0,0 +1,122 @@
+# TanhShrink Operator Implementation with TileLang
+
+## Operator Description
+
+###  Application Scenario
+`TanhShrink` is an activation function defined mathematically as:
+$$ f(x) = x - \tanh(x) $$
+
+It is commonly used in deep neural networks, particularly in scenarios involving residual learning or where sparse activation is beneficial. As an Element-wise operation, it is typically applied in:
+- Nonlinear activation for hidden layers.
+- Feature processing in Generative Adversarial Networks (GANs) or specific regression tasks.
+
+### Core Calculation Logic
+This operator is classified as a **Memory-Bound** kernel. The core implementation logic is as follows:
+
+* Data Loading: Utilizes TileLang's automatic vectorization to efficiently load data from Global Memory directly into Register Fragments.
+
+* Parallel Computation: The input tensor `(M, N)` is divided into tiles, each processed by a Thread Block. Inside the block, threads perform the $x - \tanh(x)$ calculation in parallel using SIMT execution.
+* Write Back: The computed results are written back directly to Global Memory.
+* Optimization: Since no cross-thread reduction is required, the implementation skips Shared Memory and computes directly in registers to minimize latency and maximize bandwidth utilization.
+
+## Interface Parameters
+
+### Input Parameters
+- A (Input Tensor):
+  - Shape: `(M, N)`, 2D Tensor.
+  - Data Type: `float` (float32).
+- B (Output Tensor):
+  - Shape: `(M, N)`, 2D Tensor (Buffer for results).
+  - Data Type: `float` (float32).
+
+### Optional Parameters (JIT Configuration)
+- block_M: Tile size for the M dimension (Recommended: 64).
+- block_N: Tile size for the N dimension (Recommended: 128, to ensure coalesced memory access).
+
+## Usage
+
+### Basic Example
+
+```python
+import torch
+import tilelang
+import tilelang.language as T
+import torch.nn.functional as F
+
+# 1. Define the JIT-compiled kernel using decorator
+# @tilelang.jit(target="maca") # explicit target or inferred
+@tilelang.jit
+def tanh_shrink(M, N, block_M, block_N):
+    dtype = "float"
+
+    @T.prim_func
+    def main(A: T.Tensor((M, N), dtype), B: T.Tensor((M, N), dtype)):
+        with T.Kernel(T.ceildiv(M, block_M), T.ceildiv(N, block_N), threads=128) as (bx, by):
+            val_local = T.alloc_fragment((block_M, block_N), dtype)
+            T.copy(A[bx * block_M, by * block_N], val_local)
+            for i, j in T.Parallel(block_M, block_N):
+                x = val_local[i, j]
+                val_local[i, j] = x - T.tanh(x)
+            T.copy(val_local, B[bx * block_M, by * block_N])
+
+    return main
+
+# 2. Define dimensions
+M, N = 4096, 4096
+block_M = 64
+block_N = 128
+
+# 3. Instantiate the kernel
+kernel = tanh_shrink(M, N, block_M, block_N)
+
+# 4. Prepare data
+device = "cuda"
+x = torch.randn(M, N, device=device, dtype=torch.float32)
+y = torch.empty(M, N, device=device, dtype=torch.float32) # Pre-allocate output buffer
+
+# 5. Run the kernel
+kernel(x, y)
+
+# 6. Validate correctness
+y_ref = F.tanhshrink(x)
+torch.testing.assert_close(y, y_ref, rtol=1e-3, atol=1e-3)
+print("Kernel output matches PyTorch reference.")
+
+# 7. Profile latency
+profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal)
+latency = profiler.do_bench()
+
+print(f"Latency: {latency:.3f} ms")
+```
+
+### Running Benchmarks
+
+The provided script includes a built-in benchmark suite that tests the operator across multiple scales and compares performance against PyTorch.
+
+```bash
+cd mcTileLang/examples/tanhshrink
+python example_tanh_shrink.py
+```
+
+### Running Tests
+
+using `tilelang.testing` for test
+
+```bash
+python test_tanh_shrink.py
+```
+
+---
+
+## Performance Report
+
+### Test Environment
+- **Hardware**: XiCloud C500
+- **Software**: TileLang 0.15, Python 3.10, MACA SDK 2.33.1.12
+
+### Benchmark Results
+| Scale (M, N)     | Ref Time (ms) | Tile Time (ms) | Speedup   |
+| :--------------- | :------------ | :------------- | :-------- |
+| **(1024, 1024)** | 0.032         | 0.026          | **1.21x** |
+| **(4096, 4096)** | 0.289         | 0.122          | **2.36x** |
+| **(8192, 8192)** | 0.956         | 0.436          | **2.19x** |
diff --git a/examples/tanhshrink/example_tanh_shrink.py b/examples/tanhshrink/example_tanh_shrink.py
new file mode 100644
index 00000000..65853f21
--- /dev/null
+++ b/examples/tanhshrink/example_tanh_shrink.py
@@ -0,0 +1,169 @@
+import torch
+import tilelang
+import tilelang.language as T
+import torch.nn.functional as F
+
+def tanh_shrink(M, N, block_M, block_N):
+    """
+    TanhShrink Operator Implementation using TileLang.
+    Formula: y = x - tanh(x)
+
+    Args:
+        M (int): Number of rows.
+        N (int): Number of columns.
+        block_M (int): Block size for M dimension.
+        block_N (int): Block size for N dimension.
+
+    Returns:
+        The compiled TileLang primitive function.
+    """
+    dtype = "float"
+
+    @T.prim_func
+    def main(A: T.Tensor((M, N), dtype), B: T.Tensor((M, N), dtype)):
+        with T.Kernel(T.ceildiv(M, block_M), T.ceildiv(N, block_N), threads=128) as (bx, by):
+            # Allocate Register Fragments
+            val_local = T.alloc_fragment((block_M, block_N), dtype)
+
+            # Load Data (Global -> Register)
+            T.copy(A[bx * block_M, by * block_N], val_local)
+
+            # Parallel Computation
+            for i, j in T.Parallel(block_M, block_N):
+                x = val_local[i, j]
+                # Formula: x - tanh(x)
+                val_local[i, j] = x - T.tanh(x)
+
+            # Store Data (Register -> Global)
+            T.copy(val_local, B[bx * block_M, by * block_N])
+
+    return main
+
+def ref_program(x):
+    """
+    PyTorch reference implementation for TanhShrink.
+    """
+    return F.tanhshrink(x)
+
+def run_benchmark(configs, block_config, target="maca"):
+    """
+    Main loop for executing benchmarks across different configurations.
+
+    Args:
+        configs (list): List of dictionaries containing dataset shapes (M, N).
+        block_config (dict): Dictionary containing block sizes (block_M, block_N).
+        target (str): Compilation target (default: "maca").
+    """
+    block_M = block_config["block_M"]
+    block_N = block_config["block_N"]
+
+    # Print table header
+    print(f"\n{'='*80}")
+    print(f"{'TanhShrink Operator Benchmark (TileLang vs PyTorch)':^80}")
+    print(f"{'='*80}")
+    print(f"Target: {target} | Block Config: M={block_M}, N={block_N}")
+    print(f"{'-'*80}")
+
+    # Store summary results
+    summary_results = []
+
+    for cfg in configs:
+        M, N = cfg["M"], cfg["N"]
+        scale_str = f"({M}, {N})"
+        
+        # Build and Compile TileLang Kernel
+        program = tanh_shrink(M, N, block_M, block_N)
+        
+        compile_success = False
+        error_msg = ""
+        
+        try:
+            kernel = tilelang.compile(
+                program,
+                out_idx=1,
+                target=target,
+                execution_backend="cython",
+                pass_configs={"tl.disable_tma_lower": True}
+            )
+            compile_success = True
+        except Exception as e:
+            error_msg = str(e)
+
+        if not compile_success:
+            print(f"| {scale_str:<18} | Compilation Failed: {error_msg} |")
+            summary_results.append((scale_str, "-", "-", "-", "Compile Err"))
+            continue
+
+        # Automatically manages Tensor generation and memory allocation
+        profiler = kernel.get_profiler()
+
+        # Correctness Verification
+        status = "PASSED"
+        try:
+            profiler.assert_allclose(ref_program, rtol=1e-2, atol=1e-2)
+        except Exception as e:
+            status = "FAILED"
+
+        # Performance Benchmarking
+        if status == "PASSED":
+            ref_latency = profiler.do_bench(ref_program, warmup=500)
+            tl_latency = profiler.do_bench(warmup=500)
+            speedup = ref_latency / tl_latency if tl_latency > 0 else 0.0
+            
+            print(f"Running Scale {scale_str:<15} ... Done.")
+            print(f"   > Status:    {status}(Correctness verified)")
+            print(f"   > Ref Time:  {ref_latency:.3f} ms")
+            print(f"   > Tile Time: {tl_latency:.3f} ms")
+            print(f"   > Speedup:   {speedup:.2f}x")
+            print(f"{'-'*80}")
+            
+            summary_results.append((scale_str, ref_latency, tl_latency, speedup, status))
+        else:
+            print(f"Running Scale {scale_str:<15} ... Failed Validation.")
+            print(f"{'-'*80}")
+            summary_results.append((scale_str, "-", "-", "-", status))
+
+    # Summary Result Display
+    print(f"{'='*80}")
+    # Table Header
+    header = f"| {'Scale (M, N)':<20} | {'Ref (ms)':>12} | {'Tile (ms)':>12} | {'Speedup':>10} | {'Status':^10} |"
+    print(header)
+    print(f"|{'-'*22}|{'-'*14}|{'-'*14}|{'-'*12}|{'-'*12}|")
+
+    for res in summary_results:
+        scale, ref, tile, spd, st = res
+        
+        # Format data
+        ref_str = f"{ref:.3f}" if isinstance(ref, float) else str(ref)
+        tile_str = f"{tile:.3f}" if isinstance(tile, float) else str(tile)
+        spd_str = f"{spd:.2f}x" if isinstance(spd, float) else str(spd)
+        
+        row = f"| {scale:<20} | {ref_str:>12} | {tile_str:>12} | {spd_str:>10} | {st:^10} |"
+        print(row)
+    print(f"{'='*80}\n")
+
+def main():
+    """
+    Entry point for the benchmark script.
+    Configures parameters and runs the benchmark suite.
+    """
+    # Block Configuration
+    BLOCK_CONFIG = {
+        "block_M": 64, 
+        "block_N": 128
+    }
+
+    # Dataset Scale Configuration
+    DATA_CONFIGS = [
+        {"M": 1024, "N": 1024, "desc": "Small"},
+        {"M": 4096, "N": 4096, "desc": "Medium"},
+        {"M": 8192, "N": 8192, "desc": "Large"}
+    ]
+
+    # Testing target
+    TARGET = "maca"
+    
+    run_benchmark(DATA_CONFIGS, BLOCK_CONFIG, TARGET)    
+
+if __name__ == "__main__":
+   main()
\ No newline at end of file
diff --git a/examples/tanhshrink/test_tanh_shrink.py b/examples/tanhshrink/test_tanh_shrink.py
new file mode 100644
index 00000000..1c9b6f85
--- /dev/null
+++ b/examples/tanhshrink/test_tanh_shrink.py
@@ -0,0 +1,13 @@
+import tilelang.testing
+
+import example_tanh_shrink
+from unittest import mock
+import sys
+
+def test_example_tanh_shrink():
+    with mock.patch.object(sys, 'argv', ["example_tanh_shrink.py"]):
+        example_tanh_shrink.main()
+
+
+if __name__ == "__main__":
+    tilelang.testing.main()
\ No newline at end of file
-- 
Gitee


From 5227ed2d47998847d2b769c01f91e3e396ef9944 Mon Sep 17 00:00:00 2001
From: hli28146 <a1967953220@163.com>
Date: Sun, 7 Dec 2025 02:09:41 +0800
Subject: [PATCH 2/2] =?UTF-8?q?[Level4=20=E7=A4=BA=E4=BE=8B=E5=BC=80?=
 =?UTF-8?q?=E5=8F=91]=20=E3=80=90tanhshrink=E3=80=91=E4=BD=BF=E7=94=A8?=
 =?UTF-8?q?=E6=A1=88=E4=BE=8B?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 examples/tanhshrink/README.md              | 65 ++++++++--------------
 examples/tanhshrink/example_tanh_shrink.py |  1 +
 2 files changed, 25 insertions(+), 41 deletions(-)

diff --git a/examples/tanhshrink/README.md b/examples/tanhshrink/README.md
index 5ad26e0b..65bc9570 100644
--- a/examples/tanhshrink/README.md
+++ b/examples/tanhshrink/README.md
@@ -29,9 +29,9 @@ This operator is classified as a **Memory-Bound** kernel. The core implementatio
   - Shape: `(M, N)`, 2D Tensor (Buffer for results).
   - Data Type: `float` (float32).
 
-### Optional Parameters (JIT Configuration)
-- block_M: Tile size for the M dimension (Recommended: 64).
-- block_N: Tile size for the N dimension (Recommended: 128, to ensure coalesced memory access).
+### Optional Parameters
+- block_M: Tile size for the M dimension.
+- block_N: Tile size for the N dimension.
 
 ## Usage
 
@@ -40,49 +40,32 @@ This operator is classified as a **Memory-Bound** kernel. The core implementatio
 ```python
 import torch
 import tilelang
-import tilelang.language as T
-import torch.nn.functional as F
-
-# 1. Define the JIT-compiled kernel using decorator
-# @tilelang.jit(target="maca") # explicit target or inferred
-@tilelang.jit
-def tanh_shrink(M, N, block_M, block_N):
-    dtype = "float"
-
-    @T.prim_func
-    def main(A: T.Tensor((M, N), dtype), B: T.Tensor((M, N), dtype)):
-        with T.Kernel(T.ceildiv(M, block_M), T.ceildiv(N, block_N), threads=128) as (bx, by):
-            val_local = T.alloc_fragment((block_M, block_N), dtype)
-            T.copy(A[bx * block_M, by * block_N], val_local)
-            for i, j in T.Parallel(block_M, block_N):
-                x = val_local[i, j]
-                val_local[i, j] = x - T.tanh(x)
-            T.copy(val_local, B[bx * block_M, by * block_N])
-
-    return main
-
-# 2. Define dimensions
-M, N = 4096, 4096
-block_M = 64
-block_N = 128
-
-# 3. Instantiate the kernel
-kernel = tanh_shrink(M, N, block_M, block_N)
+from example_tanh_shrink import tanh_shrink, ref_program
 
-# 4. Prepare data
-device = "cuda"
-x = torch.randn(M, N, device=device, dtype=torch.float32)
-y = torch.empty(M, N, device=device, dtype=torch.float32) # Pre-allocate output buffer
-
-# 5. Run the kernel
+# Define dimensions
+M, N = 4096, 4096
+block_M, block_N = 64, 128
+
+# Create kernel
+program = tanh_shrink(M, N, block_M, block_N)
+kernel = tilelang.compile(
+    program,
+    target="maca",
+    execution_backend="cython",
+    pass_configs={"tl.disable_tma_lower": True}
+)
+
+# Run kernel
+x = torch.randn(M, N, device="cuda", dtype=torch.float32)
+y = torch.empty_like(x)
 kernel(x, y)
 
-# 6. Validate correctness
-y_ref = F.tanhshrink(x)
-torch.testing.assert_close(y, y_ref, rtol=1e-3, atol=1e-3)
+# Validate
+y_ref = ref_program(x)
+torch.testing.assert_close(y, y_ref, rtol=1e-2, atol=1e-2)
 print("Kernel output matches PyTorch reference.")
 
-# 7. Profile latency
+# Profile latency
 profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal)
 latency = profiler.do_bench()
 
diff --git a/examples/tanhshrink/example_tanh_shrink.py b/examples/tanhshrink/example_tanh_shrink.py
index 65853f21..ff88271c 100644
--- a/examples/tanhshrink/example_tanh_shrink.py
+++ b/examples/tanhshrink/example_tanh_shrink.py
@@ -103,6 +103,7 @@ def run_benchmark(configs, block_config, target="maca"):
             profiler.assert_allclose(ref_program, rtol=1e-2, atol=1e-2)
         except Exception as e:
             status = "FAILED"
+            print(f"Validation Error: {e}") 
 
         # Performance Benchmarking
         if status == "PASSED":
-- 
Gitee