diff --git a/CONTRIBUTING.zh.md b/CONTRIBUTING.zh.md index b47775dd1aaf2af09b6d1b882cb83ec6555b8f13..d1d01a66e2dd2709414bffa623fc51f03bf6de35 100644 --- a/CONTRIBUTING.zh.md +++ b/CONTRIBUTING.zh.md @@ -52,9 +52,8 @@ import torch_npu import numpy as np - from torch_npu.testing.common_utils import TestCase, run_tests - from torch_npu.testing.common_device_type import instantiate_device_type_tests - from torch_npu.testing.util_test import create_common_tensor + from torch_npu.testing.testcase import TestCase, run_tests + from torch_npu.testing.common_utils import create_common_tensor # 定义add测试用例类 @@ -87,14 +86,13 @@ self.assertRtolEqual(cpu_output, npu_output) # 定义具体add场景的测试用例,用例函数需要以test_开头 - def test_add_shape_format_fp32_2d(self, device): + def test_add_shape_format_fp32_2d(self, device="npu"): format_list = [0, 3, 29] shape_format = [ [np.float32, i, [5, 256]] for i in format_list ] self.add_result(shape_format) - instantiate_device_type_tests(TestAdd, globals(), except_for="cpu") if __name__ == "__main__": run_tests() ``` diff --git a/docs/en/PyTorch Operator Development Guide/PyTorch Operator Development Guide.md b/docs/en/PyTorch Operator Development Guide/PyTorch Operator Development Guide.md index 4ecf8cc78d87ee3c1b26085286855785132785ea..f871d0db3f649630e0d143c83fcf4452ae9a8a40 100644 --- a/docs/en/PyTorch Operator Development Guide/PyTorch Operator Development Guide.md +++ b/docs/en/PyTorch Operator Development Guide/PyTorch Operator Development Guide.md @@ -711,13 +711,11 @@ This section describes how to test the functions of a PyTorch operator. ``` # Import the dependency library. - import sys - sys.path.append('..') import torch import numpy as np - from common_utils import TestCase, run_tests - from common_device_type import dtypes, instantiate_device_type_tests - from util_test import create_common_tensor + + from torch_npu.testing.testcase import TestCase, run_tests + from torch_npu.testing.common_utils import create_common_tensor # Define the add test case class. class TestAdd(TestCase): @@ -747,14 +745,13 @@ This section describes how to test the functions of a PyTorch operator. self.assertRtolEqual(cpu_output, npu_output) # Define a test case for a specific add scenario. The test case function must start with test_. 
- def test_add_shape_format_fp32_2d(self, device): + def test_add_shape_format_fp32_2d(self, device="npu"): format_list = [0, 3, 29] shape_format = [ [np.float32, i, [5, 256]] for i in format_list ] self.add_result(shape_format) - instantiate_device_type_tests(TestAdd, globals(), except_for="cpu") if __name__ == "__main__": run_tests() ``` diff --git "a/docs/zh/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227.md" "b/docs/zh/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227.md" index 7df1c04773f8b52fe879289fcb7c276fc74acef7..be18ba1e4b4c88e0e9b2b32e4137ce2230225b88 100644 --- "a/docs/zh/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227.md" +++ "b/docs/zh/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227.md" @@ -715,7 +715,6 @@ pip3 install --upgrade torch-1.5.0+ascend.post3-cp37-cp37m-linux_{arch}.whl import torch import numpy as np from common_utils import TestCase, run_tests - from common_device_type import dtypes, instantiate_device_type_tests from util_test import create_common_tensor # 定义add测试用例类 @@ -746,14 +745,13 @@ pip3 install --upgrade torch-1.5.0+ascend.post3-cp37-cp37m-linux_{arch}.whl self.assertRtolEqual(cpu_output, npu_output) # 定义具体add场景的测试用例,用例函数需要以test_开头 - def test_add_shape_format_fp32_2d(self, device): + def test_add_shape_format_fp32_2d(self, device="npu"): format_list = [0, 3, 29] shape_format = [ [np.float32, i, [5, 256]] for i in format_list ] self.add_result(shape_format) - instantiate_device_type_tests(TestAdd, globals(), except_for="cpu") if __name__ == "__main__": run_tests() ``` diff --git a/test/test_amp.py b/test/test_amp.py index b9232204621240bacd0d0cfea4b752e9858111cc..9548f1b9cbf9cfbdea84f6dcc81ab8162f411093 100644 --- a/test/test_amp.py +++ b/test/test_amp.py @@ -19,9 +19,7 @@ import torch import torch_npu from torch_npu.npu.amp import NpuGradScaler, NpuAutocast -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor # Set device dependency, need to be removed. +from torch_npu.testing.testcase import TestCase, run_tests class TestAmp(TestCase): @@ -29,7 +27,7 @@ class TestAmp(TestCase): float_tensor = torch.tensor([40000.0], dtype=torch.float16).npu() float_tensor = float_tensor + float_tensor - def test_grad_scaling_scale(self, device): + def test_grad_scaling_scale(self, device="npu"): scaler = NpuGradScaler(init_scale=2.) 
t0 = torch.full((1,), 4.0, dtype=torch.float32, device="npu") t1 = torch.full((1,), 4.0, dtype=torch.float32, device="npu") @@ -40,7 +38,7 @@ class TestAmp(TestCase): outputs[2][0] == 8.0 and outputs[2][1][0] == 8.0 and outputs[2][1][1] == 8.0) self.assertTrue(scaler._scale.device == t1.device) - def test_grad_scaling_state_dict(self, device): + def test_grad_scaling_state_dict(self, device="npu"): for lazy_init_scale in True, False: s0 = NpuGradScaler(init_scale=3., growth_factor=4., backoff_factor=.5, growth_interval=2) s1 = NpuGradScaler(init_scale=6., growth_factor=7., backoff_factor=.8, growth_interval=1) @@ -118,7 +116,7 @@ class TestAmp(TestCase): self.assertRtolEqual(c, s, atol) # Compares no scaling + no autocasting against scaling + autocasting. - def test_grad_scaling_autocast(self, device): + def test_grad_scaling_autocast(self, device="npu"): try_pickle = False def run(data, model, optimizer, scaler, loss_fn, skip_iter, try_scaling_api): @@ -147,7 +145,7 @@ class TestAmp(TestCase): try_pickle = True self._run_scaling_case(run, unskipped=3, skipped=1, atol=1e-3) - def test_grad_scaling_clipping(self, device): + def test_grad_scaling_clipping(self, device="npu"): def run(data, model, optimizer, scaler, loss_fn, skip_iter, try_scaling_api): max_norm = 0.2 # A reasonable value that actually has an effect, based on printouts of grads for i, (input_data, target) in enumerate(data): @@ -169,7 +167,7 @@ class TestAmp(TestCase): self._run_scaling_case(run, unskipped=3, skipped=1, atol=1e-6) - def test_grad_scaling_clipping_separate_unscale(self, device): + def test_grad_scaling_clipping_separate_unscale(self, device="npu"): def run(data, model, optimizer, scaler, loss_fn, skip_iter, try_scaling_api): max_norm = 0.2 # A reasonable value that actually has an effect, based on printouts of grads for i, (input_data, target) in enumerate(data): @@ -192,7 +190,7 @@ class TestAmp(TestCase): self._run_scaling_case(run, unskipped=3, skipped=1) - def test_grad_scaling_penalty(self, device): + def test_grad_scaling_penalty(self, device="npu"): def run(data, model, optimizer, scaler, loss_fn, skip_iter, try_scaling_api): for i, (input_data, target) in enumerate(data): optimizer.zero_grad() @@ -226,7 +224,7 @@ class TestAmp(TestCase): self._run_scaling_case(run, unskipped=3, skipped=1) - def test_grad_scaling_accumulation(self, device): + def test_grad_scaling_accumulation(self, device="npu"): def run(data, model, optimizer, scaler, loss_fn, skip_iter, try_scaling_api): iters_to_accumulate = 2 for i, (input_data, target) in enumerate(data): @@ -248,7 +246,7 @@ class TestAmp(TestCase): self._run_scaling_case(run, unskipped=2, skipped=0) - def test_grad_scaling_multiple(self, device): + def test_grad_scaling_multiple(self, device="npu"): # Tests gradient scaling with 2 models and 2 optimizers that both receive gradients from 2 losses. # Some of the logic here cannot reuse the generic helper functions created for the 1-optimizer cases. 
for enabled in True, False: @@ -300,6 +298,6 @@ class TestAmp(TestCase): s = s.cpu().to(torch.float).detach().numpy() self.assertRtolEqual(c, s, 1e-7) -instantiate_device_type_tests(TestAmp, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_aoe.py b/test/test_aoe.py index d465e043688c32c02385a0a04b3f99a43283af16..0dd2611c17e061001dae7b50d59697ecb1391073 100644 --- a/test/test_aoe.py +++ b/test/test_aoe.py @@ -15,9 +15,11 @@ import os import shutil import torch -from torch_npu.testing.common_utils import TestCase, run_tests import torch_npu +from torch_npu.testing.testcase import TestCase, run_tests + + class SmallModel(torch.nn.Module): def __init__(self, in_channel, out_channel): super(SmallModel, self).__init__() diff --git a/test/test_api/test_serialization.py b/test/test_api/test_serialization.py index 3d0b5d37dfe7b5e6b83e186e618278d0de260f13..3711f9d83ee64308e10ba1274fc4d7a42acc9f2b 100644 --- a/test/test_api/test_serialization.py +++ b/test/test_api/test_serialization.py @@ -19,8 +19,7 @@ import torch_npu import torch.nn as nn import torch.nn.functional as F -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.testcase import TestCase, run_tests class NpuMNIST(nn.Module): @@ -45,7 +44,7 @@ class TestSerialization(TestCase): The saved data is transferred to PyTorch CPU device before being saved, so a following `torch.load()` will load CPU data. ''' - def test_save(self, device): + def test_save(self, device="npu"): x = torch.randn(5).npu() with tempfile.TemporaryDirectory() as tmpdir: path = os.path.join(tmpdir, 'data.pt') @@ -54,7 +53,15 @@ x_loaded = x_loaded.npu() self.assertRtolEqual(x.cpu(), x_loaded.cpu()) + def test_save_value(self, device="npu"): + x = 44 + with tempfile.TemporaryDirectory() as tmpdir: + path = os.path.join(tmpdir, 'data.pt') + torch.save(x, path) + x_loaded = torch.load(path) + self.assertEqual(x, x_loaded) + - def test_save_tuple(self, device): + def test_save_tuple(self, device="npu"): x = torch.randn(5).npu() number = 3 with tempfile.TemporaryDirectory() as tmpdir: @@ -64,13 +71,13 @@ class TestSerialization(TestCase): x_loaded = x_loaded.npu() self.assertRtolEqual(x.cpu(), x_loaded.cpu()) self.assertTrue(number, number_loaded) - def test_save_error(self, device): + def test_save_error(self, device="npu"): a = 44 with self.assertRaisesRegex(RuntimeError, "torch.save received invalid input."): out = torch.save(a, 'a.pth')
- def test_serialization_model(self, device): + def test_serialization_model(self, device="npu"): with tempfile.TemporaryDirectory() as tmpdir: path = os.path.join(tmpdir, 'data.pt') model = NpuMNIST().npu() @@ -78,7 +85,7 @@ class TestSerialization(TestCase): loaded_model = torch.load(path) self.assertExpectedInline(str(model), str(loaded_model)) - def test_serialization_state_dict(self, device): + def test_serialization_state_dict(self, device="npu"): with tempfile.TemporaryDirectory() as tmpdir: path = os.path.join(tmpdir, 'data.pt') model = NpuMNIST().npu() @@ -100,6 +107,5 @@ class TestSerialization(TestCase): self.assertRtolEqual(before_save['fc2.bias'].cpu(), after_load['fc2.bias'].cpu()) -instantiate_device_type_tests(TestSerialization, globals(), except_for="cpu") if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_cann_profiler.py b/test/test_cann_profiler.py index b38899250ac88b12a3f881d07fcccf51b8220843..57e40a5e428a7d83f961575de000488f87f505ae 100644 --- a/test/test_cann_profiler.py +++ b/test/test_cann_profiler.py @@ -19,7 +19,9 @@ import shutil from itertools import combinations import torch import torch_npu -from torch.testing._internal.common_utils import TestCase, run_tests + +from torch_npu.testing.testcase import TestCase, run_tests + class SmallModel(torch.nn.Module): def __init__(self, in_channel=3, out_channel=12): diff --git a/test/test_custom_ops/test_float_status.py b/test/test_custom_ops/test_float_status.py index c46130db832fac7772e8125238809b98aff10a41..4da3b03ca6eac2472fc63b2931b3ad7f2254bfea 100644 --- a/test/test_custom_ops/test_float_status.py +++ b/test/test_custom_ops/test_float_status.py @@ -16,13 +16,12 @@ import torch import torch_npu -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.testcase import TestCase, run_tests class TestFloatStatus(TestCase): - def test_float_status(self, device): + def test_float_status(self, device="npu"): float_tensor = torch.tensor([40000.0], dtype=torch.float16).npu() float_tensor = float_tensor + float_tensor @@ -38,6 +37,6 @@ class TestFloatStatus(TestCase): local_float_status = torch_npu.npu_get_float_status(float_status) self.assertTrue(float_status.cpu()[0] == 0) -instantiate_device_type_tests(TestFloatStatus, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_custom_ops/test_npu_one_hot.py b/test/test_custom_ops/test_npu_one_hot.py index d7aaaf597e3ac776d849490d3f92291d17c8dad5..a5c02b8648a179cd79dc2dce6a26d7f6f9587365 100644 --- a/test/test_custom_ops/test_npu_one_hot.py +++ b/test/test_custom_ops/test_npu_one_hot.py @@ -16,11 +16,8 @@ import torch import torch_npu -import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests class TestNpuOneHot(TestCase): @@ -41,12 +38,12 @@ class TestNpuOneHot(TestCase): output = output.cpu().numpy() return output - def test_one_hot_1(self, device): + def test_one_hot_1(self, device="npu"): target = self.create_target_lable(10, (64, )) cpu_output = self.cpu_op_exec(target, 10, 0.9, 0.1) npu_output = self.npu_op_exec(target.npu(), 10, 0.9, 0.1) self.assertRtolEqual(cpu_output, npu_output)
-instantiate_device_type_tests(TestNpuOneHot, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test___and__.py b/test/test_network_ops/test___and__.py index 9d13f1db08e50b1e0fdc810ab826738182293f2c..03017da32004ae7302952b4a8ec3fbf990b85b78 100644 --- a/test/test_network_ops/test___and__.py +++ b/test/test_network_ops/test___and__.py @@ -17,9 +17,9 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class Test__And__(TestCase): def cpu_op_exec(self, input1, input2): @@ -35,7 +35,7 @@ class Test__And__(TestCase): output = output.to(torch.int32) return output.numpy() - def test___And___shape_format(self, device): + def test___And___shape_format(self, device="npu"): shape_format = [ [[np.int32, 0, [256, 1000]], [1]], [[np.int32, 0, [256, 1000]], [np.int32, 0, [256, 1000]]], @@ -59,6 +59,6 @@ class Test__And__(TestCase): npu_output = self.npu_op_exec(npu_input1, item[1][0]) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(Test__And__, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test___ior__.py b/test/test_network_ops/test___ior__.py index 21d3de85644b4479e8d41f62c47312064a57a69d..363aa108a81e125d6177a3c4ae7399ec4b1afca3 100644 --- a/test/test_network_ops/test___ior__.py +++ b/test/test_network_ops/test___ior__.py @@ -16,9 +16,8 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests + class TestIor(TestCase): def generate_bool_data(self, shape): @@ -63,159 +62,159 @@ class TestIor(TestCase): output = output.numpy() return output - def test___ior___bool(self, device): + def test___ior___bool(self, device="npu"): npu_input1 = self.generate_bool_data((1, 31, 149, 2)) npu_input2 = self.generate_bool_data((1, 31, 149, 2)) cpu_output = self.cpu_op_exec(npu_input1, npu_input2) npu_output = self.npu_op_exec(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) - def test___ior___bool_scalar(self, device): + def test___ior___bool_scalar(self, device="npu"): npu_input1 = self.generate_bool_data((1, 31, 149, 2)) npu_input2 = False cpu_output = self.cpu_op_exec(npu_input1, npu_input2) npu_output = self.npu_op_exec_scalar(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) - def test___ior___uint8(self, device): + def test___ior___uint8(self, device="npu"): npu_input1, npu_input2 = self.generate_data(0, 255, (1, 31, 149, 2), np.uint8) cpu_output = self.cpu_op_exec(npu_input1, npu_input2) npu_output = self.npu_op_exec(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) - def test___ior___int8(self, device): + def test___ior___int8(self, device="npu"): npu_input1, npu_input2 = self.generate_data(-128, 127, (1, 31, 149, 2), np.int8) cpu_output = self.cpu_op_exec(npu_input1, npu_input2) npu_output = self.npu_op_exec(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) 
- def test___ior___int32_001(self, device): + def test___ior___int32_001(self, device="npu"): npu_input1, npu_input2 = self.generate_data(-2147483648, -2147483648, (1, 31, 149, 2), np.int32) cpu_output = self.cpu_op_exec(npu_input1, npu_input2) npu_output = self.npu_op_exec(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) - def test___ior___int32_002(self, device): + def test___ior___int32_002(self, device="npu"): npu_input1, npu_input2 = self.generate_data(2147483647, 2147483647, (128), np.int32) cpu_output = self.cpu_op_exec(npu_input1, npu_input2) npu_output = self.npu_op_exec(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) - def test___ior___int32_003(self, device): + def test___ior___int32_003(self, device="npu"): npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (184965, 1), np.int32) cpu_output = self.cpu_op_exec(npu_input1, npu_input2) npu_output = self.npu_op_exec(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) - def test___ior___int32_004(self, device): + def test___ior___int32_004(self, device="npu"): npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (1, 31, 149, 2), np.int32) cpu_output = self.cpu_op_exec(npu_input1, npu_input2) npu_output = self.npu_op_exec(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) - def test___ior___int32_005(self, device): + def test___ior___int32_005(self, device="npu"): npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (2, 31, 149, 2), np.int32) cpu_output = self.cpu_op_exec(npu_input1, npu_input2) npu_output = self.npu_op_exec(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) - def test___ior___int32_006(self, device): + def test___ior___int32_006(self, device="npu"): npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (4, 31, 149, 2), np.int32) cpu_output = self.cpu_op_exec(npu_input1, npu_input2) npu_output = self.npu_op_exec(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) - def test___ior___int32_007(self, device): + def test___ior___int32_007(self, device="npu"): npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (2048, 31, 1, 2), np.int32) cpu_output = self.cpu_op_exec(npu_input1, npu_input2) npu_output = self.npu_op_exec(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) - def test___ior___int32_008(self, device): + def test___ior___int32_008(self, device="npu"): npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (8, 7, 149), np.int32) cpu_output = self.cpu_op_exec(npu_input1, npu_input2) npu_output = self.npu_op_exec(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) - def test___ior___int32_009(self, device): + def test___ior___int32_009(self, device="npu"): npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (65535,1,1,1), np.int32) cpu_output = self.cpu_op_exec(npu_input1, npu_input2) npu_output = self.npu_op_exec(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) - def test___ior___int32_010(self, device): + def test___ior___int32_010(self, device="npu"): npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (1,1,1,8192), np.int32) cpu_output = self.cpu_op_exec(npu_input1, npu_input2) npu_output = self.npu_op_exec(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) - def test___ior___int32_011(self, device): + def test___ior___int32_011(self, device="npu"): npu_input1, npu_input2 = 
self.generate_data(-2147483648, 2147483647, (1,1,1,16384), np.int32) cpu_output = self.cpu_op_exec(npu_input1, npu_input2) npu_output = self.npu_op_exec(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) - def test___ior___int32_012(self, device): + def test___ior___int32_012(self, device="npu"): npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (1,1,1,32768), np.int32) cpu_output = self.cpu_op_exec(npu_input1, npu_input2) npu_output = self.npu_op_exec(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) - def test___ior___int32_013(self, device): + def test___ior___int32_013(self, device="npu"): npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (1,1,1,65535), np.int32) cpu_output = self.cpu_op_exec(npu_input1, npu_input2) npu_output = self.npu_op_exec(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) - def test___ior___int32_014(self, device): + def test___ior___int32_014(self, device="npu"): npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (1,1,1,131072), np.int32) cpu_output = self.cpu_op_exec(npu_input1, npu_input2) npu_output = self.npu_op_exec(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) - def test___ior___int32_015(self, device): + def test___ior___int32_015(self, device="npu"): npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (1,1,1,196608), np.int32) cpu_output = self.cpu_op_exec(npu_input1, npu_input2) npu_output = self.npu_op_exec(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) - def test___ior___int32_016(self, device): + def test___ior___int32_016(self, device="npu"): npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (1,1,1,262144), np.int32) cpu_output = self.cpu_op_exec(npu_input1, npu_input2) npu_output = self.npu_op_exec(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) - def test___ior___int32_017(self, device): + def test___ior___int32_017(self, device="npu"): npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (1,1,1,393216), np.int32) cpu_output = self.cpu_op_exec(npu_input1, npu_input2) npu_output = self.npu_op_exec(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) - def test___ior___int32_018(self, device): + def test___ior___int32_018(self, device="npu"): npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (1,1,1,524288), np.int32) cpu_output = self.cpu_op_exec(npu_input1, npu_input2) npu_output = self.npu_op_exec(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) - def test___ior___int32_019(self, device): + def test___ior___int32_019(self, device="npu"): npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (1,1,1,655360), np.int32) cpu_output = self.cpu_op_exec(npu_input1, npu_input2) npu_output = self.npu_op_exec(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) - def test___ior___int32_020(self, device): + def test___ior___int32_020(self, device="npu"): npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (1,1,1,786432), np.int32) cpu_output = self.cpu_op_exec(npu_input1, npu_input2) npu_output = self.npu_op_exec(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) - def test___ior___int_scalar(self, device): + def test___ior___int_scalar(self, device="npu"): npu_input1 = self.generate_single_data(-2147483648, 2147483647, (1,31,149,2), np.int32) npu_input2 = self.generate_int_scalar(-2147483648, 2147483647) 
cpu_output = self.cpu_op_exec(npu_input1, npu_input2) npu_output = self.npu_op_exec_scalar(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestIor, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test___or__.py b/test/test_network_ops/test___or__.py index 11c111c33c03fd4fb38f21e63776f31ff9653672..74bd6c8d277e226a5002be8f347bb58376fa5014 100644 --- a/test/test_network_ops/test___or__.py +++ b/test/test_network_ops/test___or__.py @@ -18,9 +18,9 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class Test__Or__(TestCase): def cpu_op_exec(self, input1, input2): @@ -36,7 +36,7 @@ class Test__Or__(TestCase): output = output.to(torch.int32) return output.numpy() - def test___Or___shape_format(self, device): + def test___Or___shape_format(self, device="npu"): shape_format = [ [[np.int32, 0, [256, 1000]], [1]], [[np.int32, 0, [256, 1000]], [np.int32, 0, [256, 1000]]], @@ -67,6 +67,6 @@ class Test__Or__(TestCase): npu_output = self.npu_op_exec(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(Test__Or__, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test__iRshift__.py b/test/test_network_ops/test__iRshift__.py index 117c7a0e772e9504530c5e5f91ca9c1d34f1f271..aae06cf435e99c9d01bd7f6cf18d3e2d90744ef7 100644 --- a/test/test_network_ops/test__iRshift__.py +++ b/test/test_network_ops/test__iRshift__.py @@ -18,9 +18,9 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestiRshift(TestCase): def cpu_op_exec(self, input1, input2): @@ -34,7 +34,7 @@ class TestiRshift(TestCase): output = output.numpy() return output - def test_iRshift_tensor(self, device): + def test_iRshift_tensor(self, device="npu"): format_list = [0] shape_list = [(256, 32, 56)] shape_format = [ @@ -49,7 +49,7 @@ class TestiRshift(TestCase): cpu_output = cpu_output.astype(npu_output.dtype) self.assertRtolEqual(cpu_output, npu_output) - def test_iRshift_scalar(self, device): + def test_iRshift_scalar(self, device="npu"): format_list = [0] shape_list = [(256, 32, 56)] shape_format = [ @@ -64,6 +64,6 @@ class TestiRshift(TestCase): cpu_output = cpu_output.astype(npu_output.dtype) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestiRshift, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test__ilshift__.py b/test/test_network_ops/test__ilshift__.py index debdd59f0ba5766997ec0908b1b682adfad47ed8..db2450246c7ad07345e0a2279e227afea91425c1 100644 --- a/test/test_network_ops/test__ilshift__.py +++ b/test/test_network_ops/test__ilshift__.py @@ -18,9 +18,9 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, 
run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestiLshift(TestCase): def cpu_op_exec(self, input1, input2): @@ -34,7 +34,7 @@ class TestiLshift(TestCase): output = output.numpy() return output - def test_ilshift_tensor(self, device): + def test_ilshift_tensor(self, device="npu"): format_list = [0] shape_list = [(256, 32, 56)] shape_format = [ @@ -49,7 +49,7 @@ class TestiLshift(TestCase): cpu_output = cpu_output.astype(npu_output.dtype) self.assertRtolEqual(cpu_output, npu_output) - def test_ilshift_scalar(self, device): + def test_ilshift_scalar(self, device="npu"): format_list = [0] shape_list = [(256, 32, 56)] shape_format = [ @@ -64,6 +64,6 @@ class TestiLshift(TestCase): cpu_output = cpu_output.astype(npu_output.dtype) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestiLshift, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test__lshift__.py b/test/test_network_ops/test__lshift__.py index dfdf1ba3305c244fa035c9f07b47eb422bca5186..d19f6d5fad28978e2b3f5f0c47dfa129c79fdaf2 100644 --- a/test/test_network_ops/test__lshift__.py +++ b/test/test_network_ops/test__lshift__.py @@ -18,9 +18,9 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestLshift(TestCase): def cpu_op_exec(self, input1, input2): @@ -34,7 +34,7 @@ class TestLshift(TestCase): output = output.numpy() return output - def test_lshift_tensor(self, device): + def test_lshift_tensor(self, device="npu"): format_list = [0] shape_list = [(256, 32, 56)] shape_format = [ @@ -49,7 +49,7 @@ class TestLshift(TestCase): cpu_output = cpu_output.astype(npu_output.dtype) self.assertRtolEqual(cpu_output, npu_output) - def test_lshift_scalar(self, device): + def test_lshift_scalar(self, device="npu"): format_list = [0] shape_list = [(256, 32, 56)] shape_format = [ @@ -64,6 +64,6 @@ class TestLshift(TestCase): cpu_output = cpu_output.astype(npu_output.dtype) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestLshift, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test__rshift__.py b/test/test_network_ops/test__rshift__.py index 9d421865def87098385063e0a043016a1d7e67c9..72409eb46653311762f9acb0613fdc84384eccd2 100644 --- a/test/test_network_ops/test__rshift__.py +++ b/test/test_network_ops/test__rshift__.py @@ -18,9 +18,8 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor class TestRshift(TestCase): @@ -35,7 +34,7 @@ class TestRshift(TestCase): output = output.numpy() return output - def test_rshift_tensor(self, device): + def test_rshift_tensor(self, device="npu"): format_list = 
[0] shape_list = [(256, 32, 56)] shape_format = [ @@ -50,7 +49,7 @@ class TestRshift(TestCase): cpu_output = cpu_output.astype(npu_output.dtype) self.assertRtolEqual(cpu_output, npu_output) - def test_rshift_scalar(self, device): + def test_rshift_scalar(self, device="npu"): format_list = [0] shape_list = [(256, 32, 56)] shape_format = [ @@ -65,6 +64,6 @@ class TestRshift(TestCase): cpu_output = cpu_output.astype(npu_output.dtype) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestRshift, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_abs.py b/test/test_network_ops/test_abs.py index cf44ee4bc0cda0ca2958634cd735651558f60b5d..e1e4de92c0691025266d26a00466b95ac85663aa 100644 --- a/test/test_network_ops/test_abs.py +++ b/test/test_network_ops/test_abs.py @@ -15,9 +15,9 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor, test_2args_broadcast, create_dtype_tensor, UT_FAST_MODE +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestAbs(TestCase): def cpu_op_exec(self, input1): @@ -31,7 +31,7 @@ class TestAbs(TestCase): output = output.numpy() return output - def test_abs_shape_format_fp16(self, device): + def test_abs_shape_format_fp16(self, device="npu"): format_list = [0, 3] shape_list = [[5], [5, 10], [1, 3, 2], [52, 15, 15, 20]] shape_format = [ @@ -45,7 +45,7 @@ class TestAbs(TestCase): cpu_output = cpu_output.astype(np.float16) self.assertRtolEqual(cpu_output, npu_output) - def test_abs_shape_format_fp32(self, device): + def test_abs_shape_format_fp32(self, device="npu"): format_list = [0, 3] shape_list = [[5], [5, 10], [1, 3, 2], [52, 15, 15, 20]] shape_format = [ @@ -57,6 +57,6 @@ class TestAbs(TestCase): npu_output = self.npu_op_exec(npu_input) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestAbs, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_acos.py b/test/test_network_ops/test_acos.py index 36e9ffa8931ca0e8487451c78b1014ad0b3b7748..ef4a9a31ce4c7bee531c792cebac0b371cbf1cef 100644 --- a/test/test_network_ops/test_acos.py +++ b/test/test_network_ops/test_acos.py @@ -15,9 +15,9 @@ import torch import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor, test_2args_broadcast, create_dtype_tensor, UT_FAST_MODE +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestAcos(TestCase): def cpu_op_exec(self, input_para): @@ -31,7 +31,7 @@ class TestAcos(TestCase): output = output.numpy() return output - def test_acos_common_shape_format(self, device): + def test_acos_common_shape_format(self, device="npu"): shape_format = [ [[np.float32, -1, 1]], [[np.float32, -1, (64, 10)]], @@ -43,7 +43,7 @@ class TestAcos(TestCase): npu_output = self.npu_op_exec(npu_input) self.assertRtolEqual(cpu_output, npu_output) - def test_acos_float16_shape_format(self, device): + def test_acos_float16_shape_format(self, device="npu"): def cpu_op_exec_fp16(input_para): input_para = 
input_para.to(torch.float32) output = torch.acos(input_para) @@ -62,7 +62,7 @@ class TestAcos(TestCase): npu_output = self.npu_op_exec(npu_input) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestAcos, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_adaptive_avg_pool1d.py b/test/test_network_ops/test_adaptive_avg_pool1d.py index cfa89686b52707f038c525d28b1af36f87f20e89..af129c3e5fd82d9e332ce85c13f402aed2ff938e 100644 --- a/test/test_network_ops/test_adaptive_avg_pool1d.py +++ b/test/test_network_ops/test_adaptive_avg_pool1d.py @@ -17,9 +17,9 @@ import torch_npu import torch.nn as nn import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor, test_2args_broadcast, create_dtype_tensor, UT_FAST_MODE +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestAdaptiveAvgPool1d(TestCase): def cpu_op_exec(self, input1, output_size): @@ -32,7 +32,7 @@ class TestAdaptiveAvgPool1d(TestCase): output = m(input1) return output.cpu() - def test_AdaptiveAvgPool1d_shape_format_fp16(self, device): + def test_AdaptiveAvgPool1d_shape_format_fp16(self, device="npu"): shape_format = [ [np.float16, 0, (64, 10, 16)], [np.float16, -1, (256, 2048, 8)], @@ -46,7 +46,7 @@ class TestAdaptiveAvgPool1d(TestCase): npu_output = self.npu_op_exec(npu_input, output_size) self.assertRtolEqual(cpu_output, npu_output, prec16=0.002) - def test_AdaptiveAvgPool1d_shape_format_fp32(self, device): + def test_AdaptiveAvgPool1d_shape_format_fp32(self, device="npu"): shape_format = [ [np.float32, 0, (64, 10, 16)], [np.float32, -1, (256, 2048, 8)], @@ -60,6 +60,6 @@ class TestAdaptiveAvgPool1d(TestCase): npu_output = self.npu_op_exec(npu_input, output_size) self.assertRtolEqual(cpu_output, npu_output, 0.001) -instantiate_device_type_tests(TestAdaptiveAvgPool1d, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_adaptive_avg_pool2d_backward.py b/test/test_network_ops/test_adaptive_avg_pool2d_backward.py index 7ad252fea17386d5562a41caca73aabbaccf005e..7037973c859f8c7058f0eb09f3ce73e5a133aabc 100644 --- a/test/test_network_ops/test_adaptive_avg_pool2d_backward.py +++ b/test/test_network_ops/test_adaptive_avg_pool2d_backward.py @@ -17,9 +17,8 @@ import torch_npu import numpy as np from torch.nn import functional as F -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor, test_2args_broadcast, create_dtype_tensor, UT_FAST_MODE +from torch_npu.testing.testcase import TestCase, run_tests + class TestAdaptiveAvgPool2dBackward(TestCase): @@ -42,7 +41,7 @@ class TestAdaptiveAvgPool2dBackward(TestCase): out = output.detach().cpu(), input_x.grad.cpu() return out - def test_adaptiveAvgPool2d_backward_1(self, device): + def test_adaptiveAvgPool2d_backward_1(self, device="npu"): cpu_input = torch.randn((1, 8, 9), dtype=torch.float32) npu_input = cpu_input.npu() output_size = np.array((2, 3)) @@ -51,7 +50,7 @@ class TestAdaptiveAvgPool2dBackward(TestCase): self.assertRtolEqual(cpu_output[0], npu_output[0]) self.assertRtolEqual(cpu_output[1], npu_output[1]) - def 
test_adaptiveAvgPool2d_backward_2(self, device): + def test_adaptiveAvgPool2d_backward_2(self, device="npu"): cpu_input = torch.randn((1, 3, 3, 3), dtype=torch.float32) npu_input = cpu_input.npu() output_size = np.array((2, 2)) @@ -60,7 +59,7 @@ class TestAdaptiveAvgPool2dBackward(TestCase): self.assertRtolEqual(cpu_output[0], npu_output[0]) self.assertRtolEqual(cpu_output[1], npu_output[1]) - def test_adaptiveAvgPool2d_backward_fp16(self, device): + def test_adaptiveAvgPool2d_backward_fp16(self, device="npu"): input_x = np.random.uniform(0, 1, (1, 3, 6, 6)).astype(np.float16) cpu_input = torch.from_numpy(input_x) npu_input = cpu_input.npu() @@ -70,6 +69,6 @@ class TestAdaptiveAvgPool2dBackward(TestCase): self.assertRtolEqual(cpu_output[0], npu_output[0]) self.assertRtolEqual(cpu_output[1], npu_output[1]) -instantiate_device_type_tests(TestAdaptiveAvgPool2dBackward, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_adaptive_avg_pool3d.py b/test/test_network_ops/test_adaptive_avg_pool3d.py index fec1bc76915d0daa8cf3c10d96a68a5db5885cdb..83f39c5c183072a142bb2edf9912f6ba8ff6e988 100644 --- a/test/test_network_ops/test_adaptive_avg_pool3d.py +++ b/test/test_network_ops/test_adaptive_avg_pool3d.py @@ -17,9 +17,9 @@ import torch_npu import torch.nn as nn import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestAdaptiveAvgPool3d(TestCase): def cpu_op_exec(self, input1, output_size): @@ -32,7 +32,7 @@ class TestAdaptiveAvgPool3d(TestCase): output = m(input1).cpu() return output.numpy() - def test_adaptive_avg_pool3d_shape_format_fp16(self, device): + def test_adaptive_avg_pool3d_shape_format_fp16(self, device="npu"): shape_format = [ [np.float16, -1, (64, 10, 16, 32)], [np.float16, -1, (4, 16, 8, 4, 2)], @@ -49,7 +49,7 @@ class TestAdaptiveAvgPool3d(TestCase): cpu_output = cpu_output.astype(npu_output.dtype) self.assertRtolEqual(cpu_output, npu_output) - def test_adaptive_avg_pool3d_shape_format_fp32(self, device): + def test_adaptive_avg_pool3d_shape_format_fp32(self, device="npu"): shape_format = [ [np.float32, -1, (64, 10, 16, 32)], [np.float32, -1, (4, 2, 2, 4, 316)], @@ -64,7 +64,7 @@ class TestAdaptiveAvgPool3d(TestCase): npu_output = self.npu_op_exec(npu_input, output_size) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestAdaptiveAvgPool3d, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_adaptive_avg_pool3d_backward.py b/test/test_network_ops/test_adaptive_avg_pool3d_backward.py index 6417d21f446be4aac8adc20d685efe702a8a6c6d..e5d3fde9f332a257c3effceb9817e4956077a31b 100644 --- a/test/test_network_ops/test_adaptive_avg_pool3d_backward.py +++ b/test/test_network_ops/test_adaptive_avg_pool3d_backward.py @@ -17,9 +17,9 @@ import torch_npu import numpy as np from torch.nn import functional as F -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class 
TestAdaptiveAvgPool3dBackward(TestCase): @@ -41,7 +41,7 @@ class TestAdaptiveAvgPool3dBackward(TestCase): out = input_x.grad.cpu() return out.numpy() - def test_adaptive_avg_pool3d_backward(self, device): + def test_adaptive_avg_pool3d_backward(self, device="npu"): dtype_list = [np.float16, np.float32] format_list = [-1] shape_list = [ @@ -61,6 +61,6 @@ class TestAdaptiveAvgPool3dBackward(TestCase): npu_output = self.npu_op_exec(npu_input, output_size) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestAdaptiveAvgPool3dBackward, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_adaptive_max_pool2d.py b/test/test_network_ops/test_adaptive_max_pool2d.py index 1c077f1f41a9993fb1b779049ce44a9ea9bc4e73..0cfe2c8417d1357bdeeef8bdd55dd900e955de71 100644 --- a/test/test_network_ops/test_adaptive_max_pool2d.py +++ b/test/test_network_ops/test_adaptive_max_pool2d.py @@ -16,9 +16,9 @@ import torch.nn as nn import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestAdaptiveMaxPool2d(TestCase): def cpu_op_exec(self, input1, output_size): @@ -31,7 +31,7 @@ class TestAdaptiveMaxPool2d(TestCase): output = m(input1) return output.cpu().numpy() - def test_adaptive_max_pool2d_shape_format_fp32_6(self, device): + def test_adaptive_max_pool2d_shape_format_fp32_6(self, device="npu"): format_list = [-1] shape_list = [(1, 5, 9, 9)] shape_format = [ @@ -45,6 +45,6 @@ class TestAdaptiveMaxPool2d(TestCase): npu_output = self.npu_op_exec(npu_input, output_size) self.assertRtolEqual(cpu_output, npu_output, 0.0004) -instantiate_device_type_tests(TestAdaptiveMaxPool2d, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_adaptive_max_pool2d_backward.py b/test/test_network_ops/test_adaptive_max_pool2d_backward.py index f4d826499f3e47090f4203927469f4045d4f29a0..8fb62a32d563353abcbc839b6c93010f28af25f0 100644 --- a/test/test_network_ops/test_adaptive_max_pool2d_backward.py +++ b/test/test_network_ops/test_adaptive_max_pool2d_backward.py @@ -17,9 +17,9 @@ import torch_npu import torch.nn as nn import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestAdaptiveMaxPool2dBackward(TestCase): def cpu_op_exec(self, input_tensor, output_size): @@ -39,7 +39,7 @@ class TestAdaptiveMaxPool2dBackward(TestCase): npu_grad = npu_grad.to("cpu") return npu_grad - def test_adaptive_max_pool2d_shape_format_fp32_6(self, device): + def test_adaptive_max_pool2d_shape_format_fp32_6(self, device="npu"): format_list = [-1] shape_list = [(1, 3, 8, 9)] shape_format = [ @@ -55,6 +55,6 @@ class TestAdaptiveMaxPool2dBackward(TestCase): npu_output = self.npu_op_exec(npu_input, output_size) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestAdaptiveMaxPool2dBackward, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() diff --git 
a/test/test_network_ops/test_add.py b/test/test_network_ops/test_add.py index 015ffa65f6a3df3a09fee7dd7170cf0b7f0b3f1a..bc5c9a9c99975499cf8ecc8a173c773f7b75a384 100644 --- a/test/test_network_ops/test_add.py +++ b/test/test_network_ops/test_add.py @@ -16,9 +16,8 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor class TestAdd(TestCase): @@ -143,7 +142,7 @@ class TestAdd(TestCase): self.assertRtolEqual(cpu_output, npu_output) - def test_add_scalar_shape_format_fp16_1d(self, device): + def test_add_scalar_shape_format_fp16_1d(self, device="npu"): format_list = [0, 3] scalar_list = [0,1] shape_format = [ @@ -151,7 +150,7 @@ class TestAdd(TestCase): ] self.add_scalar_result(shape_format) - def test_add_scalar_shape_format_fp32_1d(self, device): + def test_add_scalar_shape_format_fp32_1d(self, device="npu"): format_list = [0, 3] scalar_list = [0,1] shape_format = [ @@ -159,7 +158,7 @@ class TestAdd(TestCase): ] self.add_scalar_result(shape_format) - def test_add_scalar_shape_format_fp16_2d(self, device): + def test_add_scalar_shape_format_fp16_2d(self, device="npu"): format_list = [0, 3, 29] scalar_list = [0,1] shape_format = [ @@ -167,7 +166,7 @@ class TestAdd(TestCase): ] self.add_scalar_result(shape_format) - def test_add_scalar_shape_format_fp32_2d(self, device): + def test_add_scalar_shape_format_fp32_2d(self, device="npu"): format_list = [0, 3, 29] scalar_list = [0,1] shape_format = [ @@ -175,7 +174,7 @@ class TestAdd(TestCase): ] self.add_scalar_result(shape_format) - def test_add_scalar_shape_format_fp16_3d(self, device): + def test_add_scalar_shape_format_fp16_3d(self, device="npu"): format_list = [0, 3, 29] scalar_list = [0,1] shape_format = [ @@ -183,7 +182,7 @@ class TestAdd(TestCase): ] self.add_scalar_result(shape_format) - def test_add_scalar_shape_format_fp32_3d(self, device): + def test_add_scalar_shape_format_fp32_3d(self, device="npu"): format_list = [0, 3, 29] scalar_list = [0,1] shape_format = [ @@ -191,7 +190,7 @@ class TestAdd(TestCase): ] self.add_scalar_result(shape_format) - def test_add_scalar_shape_format_fp16_4d(self, device): + def test_add_scalar_shape_format_fp16_4d(self, device="npu"): format_list = [0, 3, 29] scalar_list = [0,1] shape_format = [ @@ -199,7 +198,7 @@ class TestAdd(TestCase): ] self.add_scalar_result(shape_format) - def test_add_scalar_shape_format_fp32_4d(self, device): + def test_add_scalar_shape_format_fp32_4d(self, device="npu"): format_list = [0, 3, 29] scalar_list = [0,1] shape_format = [ @@ -207,7 +206,7 @@ class TestAdd(TestCase): ] self.add_scalar_result(shape_format) - def test_add_scalar_shape_format_fp16_1d(self, device): + def test_add_scalar_shape_format_fp16_1d(self, device="npu"): format_list = [0, 3] scalar_list = [0,1] shape_format = [ @@ -215,7 +214,7 @@ class TestAdd(TestCase): ] self.add_scalar_alpha_result(shape_format) - def test_add_scalar_shape_format_fp32_1d(self, device): + def test_add_scalar_shape_format_fp32_1d(self, device="npu"): format_list = [0, 3] scalar_list = [0,1] shape_format = [ @@ -223,7 +222,7 @@ class TestAdd(TestCase): ] self.add_scalar_alpha_result(shape_format) - def test_add_scalar_shape_format_fp16_2d(self, device): + def test_add_scalar_shape_format_fp16_2d(self, 
device="npu"): format_list = [0, 3, 29] scalar_list = [0,1] shape_format = [ @@ -231,7 +230,7 @@ class TestAdd(TestCase): ] self.add_scalar_alpha_result(shape_format) - def test_add_scalar_shape_format_fp32_2d(self, device): + def test_add_scalar_shape_format_fp32_2d(self, device="npu"): format_list = [0, 3, 29] scalar_list = [0,1] shape_format = [ @@ -239,7 +238,7 @@ class TestAdd(TestCase): ] self.add_scalar_alpha_result(shape_format) - def test_add_scalar_shape_format_fp16_3d(self, device): + def test_add_scalar_shape_format_fp16_3d(self, device="npu"): format_list = [0, 3, 29] scalar_list = [0,1] shape_format = [ @@ -247,7 +246,7 @@ class TestAdd(TestCase): ] self.add_scalar_alpha_result(shape_format) - def test_add_scalar_shape_format_fp32_3d(self, device): + def test_add_scalar_shape_format_fp32_3d(self, device="npu"): format_list = [0, 3, 29] scalar_list = [0,1] shape_format = [ @@ -255,7 +254,7 @@ class TestAdd(TestCase): ] self.add_scalar_alpha_result(shape_format) - def test_add_scalar_shape_format_fp16_4d(self, device): + def test_add_scalar_shape_format_fp16_4d(self, device="npu"): format_list = [0, 3, 29] scalar_list = [0,1] shape_format = [ @@ -263,7 +262,7 @@ class TestAdd(TestCase): ] self.add_scalar_alpha_result(shape_format) - def test_add_scalar_shape_format_fp32_4d(self, device): + def test_add_scalar_shape_format_fp32_4d(self, device="npu"): format_list = [0, 3, 29] scalar_list = [0,1] shape_format = [ @@ -271,119 +270,119 @@ class TestAdd(TestCase): ] self.add_scalar_alpha_result(shape_format) - def test_add_shape_format_fp16_1d(self, device): + def test_add_shape_format_fp16_1d(self, device="npu"): format_list = [0, 3] shape_format = [ [np.float16, i, [64]] for i in format_list ] self.add_result(shape_format) - def test_add_shape_format_fp32_1d(self, device): + def test_add_shape_format_fp32_1d(self, device="npu"): format_list = [0, 3] shape_format = [ [np.float32, i, [64]] for i in format_list ] self.add_result(shape_format) - def test_add_shape_format_fp16_2d(self, device): + def test_add_shape_format_fp16_2d(self, device="npu"): format_list = [0, 3, 29] shape_format = [ [np.float16, i, [5, 256]] for i in format_list ] self.add_result(shape_format) - def test_add_shape_format_fp32_2d(self, device): + def test_add_shape_format_fp32_2d(self, device="npu"): format_list = [0, 3, 29] shape_format = [ [np.float32, i, [5, 256]] for i in format_list ] self.add_result(shape_format) - def test_add_shape_format_fp16_3d(self, device): + def test_add_shape_format_fp16_3d(self, device="npu"): format_list = [0, 3, 29] shape_format = [ [np.float16, i, [32, 3, 3]] for i in format_list ] self.add_result(shape_format) - def test_add_shape_format_fp32_3d(self, device): + def test_add_shape_format_fp32_3d(self, device="npu"): format_list = [0, 3, 29] shape_format = [ [np.float32, i, [32, 3, 3]] for i in format_list ] self.add_result(shape_format) - def test_add_shape_format_fp16_4d(self, device): + def test_add_shape_format_fp16_4d(self, device="npu"): format_list = [0, 3, 29] shape_format = [ [np.float16, i, [64, 112, 7, 7]] for i in format_list ] self.add_result(shape_format) - def test_add_shape_format_fp32_4d(self, device): + def test_add_shape_format_fp32_4d(self, device="npu"): format_list = [0, 3, 29] shape_format = [ [np.float32, i, [64, 112, 7, 7]] for i in format_list ] self.add_result(shape_format) - def test_add_shape_format_fp16_1d(self, device): + def test_add_shape_format_fp16_1d(self, device="npu"): format_list = [0, 3] shape_format = [ [np.float16, i, [64]] for i in 
format_list ] self.add_alpha_result(shape_format) - def test_add_shape_format_fp32_1d(self, device): + def test_add_shape_format_fp32_1d(self, device="npu"): format_list = [0, 3] shape_format = [ [np.float32, i, [64]] for i in format_list ] self.add_alpha_result(shape_format) - def test_add_shape_format_fp16_2d(self, device): + def test_add_shape_format_fp16_2d(self, device="npu"): format_list = [0, 3, 29] shape_format = [ [np.float16, i, [5, 256]] for i in format_list ] self.add_alpha_result(shape_format) - def test_add_shape_format_fp32_2d(self, device): + def test_add_shape_format_fp32_2d(self, device="npu"): format_list = [0, 3, 29] shape_format = [ [np.float32, i, [5, 256]] for i in format_list ] self.add_alpha_result(shape_format) - def test_add_shape_format_fp16_3d(self, device): + def test_add_shape_format_fp16_3d(self, device="npu"): format_list = [0, 3, 29] shape_format = [ [np.float16, i, [32, 3, 3]] for i in format_list ] self.add_alpha_result(shape_format) - def test_add_shape_format_fp32_3d(self, device): + def test_add_shape_format_fp32_3d(self, device="npu"): format_list = [0, 3, 29] shape_format = [ [np.float32, i, [32, 3, 3]] for i in format_list ] self.add_alpha_result(shape_format) - def test_add_shape_format_fp16_4d(self, device): + def test_add_shape_format_fp16_4d(self, device="npu"): format_list = [0, 3, 29] shape_format = [ [np.float16, i, [64, 112, 7, 7]] for i in format_list ] self.add_alpha_result(shape_format) - def test_add_shape_format_fp32_4d(self, device): + def test_add_shape_format_fp32_4d(self, device="npu"): format_list = [0, 3, 29] shape_format = [ [np.float32, i, [64, 112, 7, 7]] for i in format_list ] self.add_alpha_result(shape_format) - def test_add_mix_dtype(self, device): + def test_add_mix_dtype(self, device="npu"): cpu_input1, npu_input1 = create_common_tensor([np.int32, 0, (2, 3)], 1, 100) cpu_input2, npu_input2 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100) cpu_output = torch.add(cpu_input1, cpu_input2) @@ -392,7 +391,5 @@ class TestAdd(TestCase): self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestAdd, globals(), except_for="cpu") - if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_addbmm.py b/test/test_network_ops/test_addbmm.py index c9fe853f909d2eb1c19035d9519d408012a49eeb..7b3a380146291596f237fecb51ddc9bcb0a7a75f 100644 --- a/test/test_network_ops/test_addbmm.py +++ b/test/test_network_ops/test_addbmm.py @@ -16,9 +16,9 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestAddbmm(TestCase): def generate_scalar(self, dtype, min_d, max_d): @@ -78,7 +78,7 @@ class TestAddbmm(TestCase): output = output.numpy() return output - def test_addbmm(self, device): + def test_addbmm(self, device="npu"): shape_format = [ [[np.float32, 0, [3, 5]], [np.float32, 0, [10, 3, 4]], [np.float32, 0, [10, 4, 5]], "float32"], [[np.int32, 0, [3, 5]], [np.int32, 0, [10, 3, 4]], [np.int32, 0, [10, 4, 5]], "int32"] @@ -103,7 +103,7 @@ class TestAddbmm(TestCase): self.assertRtolEqual(cpu_output, npu_output1) self.assertRtolEqual(cpu_output, npu_output2) - def test_addbmm_transpose(self, device): + def test_addbmm_transpose(self, device="npu"): shape_format = [ 
[[np.float32, 0, [4, 5]], [np.float32, 0, [10, 4, 7]], [np.float32, 0, [10, 5, 7]], "float32"], [[np.int32, 0, [4, 5]], [np.int32, 0, [10, 4, 7]], [np.int32, 0, [10, 5, 7]], "int32"] @@ -123,6 +123,5 @@ class TestAddbmm(TestCase): self.assertRtolEqual(cpu_transpose_output, npu_transpose_output) -instantiate_device_type_tests(TestAddbmm, globals(), except_for='cpu') if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_addmm.py b/test/test_network_ops/test_addmm.py index 21cfb51d226e94f16b3d7add7aa5c86367d83686..58db41361ac7d12596980bd9c99145497ab5280c 100644 --- a/test/test_network_ops/test_addmm.py +++ b/test/test_network_ops/test_addmm.py @@ -16,9 +16,8 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor class TestAddmm(TestCase): @@ -112,7 +111,7 @@ class TestAddmm(TestCase): output = output.numpy() return output - def test_addmm_shape_format_int(self, device): + def test_addmm_shape_format_int(self, device="npu"): format_list = [0] shape_list = [(3, 3), (3, 5), (5, 3)] shape_format1 = [ @@ -149,7 +148,7 @@ class TestAddmm(TestCase): self.assertRtolEqual(cpu_output, npu_output1) self.assertRtolEqual(cpu_output, npu_output2) - def test_addmm_shape_format_fp32(self, device): + def test_addmm_shape_format_fp32(self, device="npu"): format_list = [0] shape_list = [(3, 3), (3, 5), (5, 3)] shape_format1 = [ @@ -186,7 +185,7 @@ class TestAddmm(TestCase): self.assertRtolEqual(cpu_output, npu_output1) self.assertRtolEqual(cpu_output, npu_output2) - def test_addmm_shape_format_fp16(self, device): + def test_addmm_shape_format_fp16(self, device="npu"): format_list = [0] shape_list = [(3, 3), (3, 5), (5, 3)] shape_format1 = [ @@ -228,7 +227,7 @@ class TestAddmm(TestCase): self.assertRtolEqual(cpu_output, npu_output1) self.assertRtolEqual(cpu_output, npu_output2) - def test_addmm_transpose_shape_format_int(self, device): + def test_addmm_transpose_shape_format_int(self, device="npu"): format_list = [0] shape_list = [(4, 5), (4, 7), (5, 7)] shape_format1 = [ @@ -257,7 +256,7 @@ class TestAddmm(TestCase): self.assertRtolEqual(cpu_transpose_output, npu_transpose_output) - def test_addmm_transpose_shape_format_fp32(self, device): + def test_addmm_transpose_shape_format_fp32(self, device="npu"): format_list = [0] shape_list = [(4, 5), (4, 7), (5, 7)] shape_format1 = [ @@ -286,7 +285,7 @@ class TestAddmm(TestCase): self.assertRtolEqual(cpu_transpose_output, npu_transpose_output) - def test_addmm_transpose_shape_format_fp16(self, device): + def test_addmm_transpose_shape_format_fp16(self, device="npu"): format_list = [0] shape_list = [(4, 5), (4, 7), (5, 7)] shape_format1 = [ @@ -322,6 +321,5 @@ class TestAddmm(TestCase): self.assertRtolEqual(cpu_transpose_output, npu_transpose_output) -instantiate_device_type_tests(TestAddmm, globals(), except_for='cpu') if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_addmv.py b/test/test_network_ops/test_addmv.py index 73a787b05be58606fab85f2c087a39772394b097..b59052a87c5d5382906ed866b223f3d29de81652 100644 --- a/test/test_network_ops/test_addmv.py +++ b/test/test_network_ops/test_addmv.py @@ -16,9 +16,9 @@ import torch import torch_npu import numpy as np -from 
torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestAddmv(TestCase): def cpu_op_exec(self, a, b, c, alpha, beta): @@ -39,7 +39,7 @@ class TestAddmv(TestCase): output = output.numpy() return output - def test_addmv_fp16(self, device): + def test_addmv_fp16(self, device="npu"): shape_format = [ [[np.float16, 3, (2, 3)], [np.float16, 3, (3,)], [np.float16, 3, (2, )]] @@ -60,7 +60,7 @@ class TestAddmv(TestCase): cpu_output = cpu_output.astype(np.float16) self.assertRtolEqual(cpu_output, npu_output) - def test_addmv_out_fp16(self, device): + def test_addmv_out_fp16(self, device="npu"): shape_format = [ [[np.float16, 3, (2, 3)], [np.float16, 3, (3,)], [np.float16, 3, (2, )], [np.float16, 3, (10,)]] @@ -82,7 +82,7 @@ class TestAddmv(TestCase): self.assertRtolEqual(cpu_output, npu_output) - def test_addmv_fp32(self, device): + def test_addmv_fp32(self, device="npu"): shape_format = [ [[np.float32, 0, (2, 3)], [np.float32, 0, (3,)], [np.float32, 0, (2, )]], [[np.float32, 0, (3168, 320)], [np.float32, 0, (320,)], [np.float32, 0, (3168, )]], @@ -100,7 +100,6 @@ class TestAddmv(TestCase): self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestAddmv, globals(), except_for="cpu") if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_addr.py b/test/test_network_ops/test_addr.py index afec4c3a7e6599aa0e0ce20d3301e4469a7c75a3..3d37459509a17300ee35214096ab9af462edd0cd 100644 --- a/test/test_network_ops/test_addr.py +++ b/test/test_network_ops/test_addr.py @@ -16,9 +16,9 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestAddr(TestCase): def cpu_op_exec(self,input1, vec1, vec2, beta, alpha): @@ -38,7 +38,7 @@ class TestAddr(TestCase): output = output.numpy() return output - def test_addr_common_shape_format(self, device): + def test_addr_common_shape_format(self, device="npu"): shape_format = [ [[np.float32, 0, (5,3)], [np.float32, 0, (5)], [np.float32, 0, (3)]], [[np.int32, 0, (5,3)], [np.int32, 0, (5)], [np.int32, 0, (3)]], @@ -53,7 +53,7 @@ class TestAddr(TestCase): npu_output = self.npu_op_exec(npu_input1, npu_vec1, npu_vec2, beta, alpha) self.assertRtolEqual(cpu_output, npu_output) - def test_addr_out_common_shape_format(self, device): + def test_addr_out_common_shape_format(self, device="npu"): shape_format = [ [[np.float32, 0, (5,3)], [np.float32, 0, (5,3)], [np.float32, 0, (5)], [np.float32, 0, (3)]], [[np.int32, 0, (5,3)], [np.int32, 0, (5,3)], [np.int32, 0, (5)], [np.int32, 0, (3)]], @@ -69,6 +69,6 @@ class TestAddr(TestCase): npu_output = self.npu_op_exec_out(npu_input1, npu_input2, npu_vec1, npu_vec2, beta, alpha) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestAddr, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_affine_grid_generator.py b/test/test_network_ops/test_affine_grid_generator.py index 
eb19f60e0a36c34363fda91f9c981b3762c8e9d8..1c62d975d7518e8b7128832794d2ac406ca6d1f0 100644 --- a/test/test_network_ops/test_affine_grid_generator.py +++ b/test/test_network_ops/test_affine_grid_generator.py @@ -19,9 +19,7 @@ import torch_npu import numpy as np from torch.nn import functional as F -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests class TestAffineGridGenerator(TestCase): @@ -36,7 +34,7 @@ class TestAffineGridGenerator(TestCase): output = output.cpu().numpy() return output - def test_affine_grid_generator_2D(self, device): + def test_affine_grid_generator_2D(self, device="npu"): theta_list = [[1, 0, 0], [0, 1, 0], ] @@ -57,7 +55,7 @@ class TestAffineGridGenerator(TestCase): cpu_output = cpu_output.astype(npu_output.dtype) self.assertRtolEqual(cpu_output, npu_output, 0.001) - def test_affine_grid_generator_3D(self, device): + def test_affine_grid_generator_3D(self, device="npu"): theta_list = [[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], @@ -79,6 +77,6 @@ class TestAffineGridGenerator(TestCase): cpu_output = cpu_output.astype(npu_output.dtype) self.assertRtolEqual(cpu_output, npu_output, 0.001) -instantiate_device_type_tests(TestAffineGridGenerator, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_affine_grid_generator_backward.py b/test/test_network_ops/test_affine_grid_generator_backward.py index 7a6159f75dd1ccd64546d1ad6dea99ffc55fb8f2..e13250aaf008a79b22e2101406d7b4442579b65e 100644 --- a/test/test_network_ops/test_affine_grid_generator_backward.py +++ b/test/test_network_ops/test_affine_grid_generator_backward.py @@ -17,12 +17,12 @@ import torch_npu import numpy as np from torch.nn import functional as F -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor, test_2args_broadcast, create_dtype_tensor, UT_FAST_MODE +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestAffineGridGeneratorBackward(TestCase): - def test_affine_grid_generator_backward_common_shape(self, device): + def test_affine_grid_generator_backward_common_shape(self, device="npu"): shape_list = [[100, 2, 3], [10, 2, 3]] shape_format = [ [np.float32, -1, j] for j in shape_list @@ -36,7 +36,7 @@ class TestAffineGridGeneratorBackward(TestCase): npu_output = self.npu_op_exec(npu_input1, size) self.assertRtolEqual(cpu_output, npu_output) - def test_affine_grid_generator_backward_fp16(self, device): + def test_affine_grid_generator_backward_fp16(self, device="npu"): shape_list = [[100, 2, 3], [10, 2, 3]] shape_format = [ [np.float16, -1, j] for j in shape_list @@ -68,6 +68,6 @@ class TestAffineGridGeneratorBackward(TestCase): output = input1.grad.to("cpu").numpy() return output -instantiate_device_type_tests(TestAffineGridGeneratorBackward, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_all.py b/test/test_network_ops/test_all.py index 9d096ee453a14ed3ef2277f4ce8f5f93d8c67059..df40ae454567838b5bdac9b7ea3944a2765e87ca 100644 --- a/test/test_network_ops/test_all.py +++ b/test/test_network_ops/test_all.py @@ -15,9 +15,7 
@@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests class TestAll(TestCase): @@ -39,7 +37,7 @@ class TestAll(TestCase): output = output.numpy() return output - def test_all_shape_format(self, device): + def test_all_shape_format(self, device="npu"): shape_list = [[1024], [32, 1024], [32, 8, 1024], [128, 32, 8, 1024], [2, 0, 2]] for item in shape_list: cpu_input, npu_input = self.create_bool_tensor(item, 0, 1) @@ -73,7 +71,7 @@ class TestAll(TestCase): output1 = output1.to("cpu").numpy() return output0, output1 - def test_alld_shape_format(self, device): + def test_alld_shape_format(self, device="npu"): shape_list = [[1024], [32, 1024], [32, 8, 1024], [128, 32, 8, 1024]] for item in shape_list: cpu_input, npu_input = self.create_bool_tensor(item, 0, 1) @@ -85,6 +83,5 @@ class TestAll(TestCase): self.assertRtolEqual(cpu_output.astype(np.int32), npu_out1.astype(np.int32)) -instantiate_device_type_tests(TestAll, globals(), except_for="cpu") if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_anchorresponseflags.py b/test/test_network_ops/test_anchorresponseflags.py index ef99d4a49ce6eb6d288919b5f87b19ad626cc2fa..57847f467efc1e601a92b3b170b92ec368b86a82 100644 --- a/test/test_network_ops/test_anchorresponseflags.py +++ b/test/test_network_ops/test_anchorresponseflags.py @@ -16,9 +16,9 @@ import torch_npu import numpy as np from torch.nn import functional as F -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor, test_2args_broadcast, create_dtype_tensor, UT_FAST_MODE +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestAnchorResponseFlags(TestCase): def cpu_op_exec(self, gt_bboxes, featmap_size, strides, num_base_anchors): @@ -40,7 +40,7 @@ class TestAnchorResponseFlags(TestCase): out = out.to("cpu") return out.detach().numpy() - def test_anchor_response_flags(self, device): + def test_anchor_response_flags(self, device="npu"): shape_format = [ [[np.float32, -1, [100, 4]], [60, 60], [2, 2], 9], [[np.float16, -1, [200, 4]], [10, 10], [32, 32], 3], @@ -54,6 +54,6 @@ class TestAnchorResponseFlags(TestCase): npu_output = self.npu_op_exec(npu_input, *item[1:]) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestAnchorResponseFlags, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_any.py b/test/test_network_ops/test_any.py index c1cef3b617a58499ffb680fbe6f371970f5eb50b..6a2a34a1963aff61a40a4cd6da208c50ac0c080d 100644 --- a/test/test_network_ops/test_any.py +++ b/test/test_network_ops/test_any.py @@ -15,9 +15,8 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests + class TestAny(TestCase): def create_bool_tensor(self, shape, minValue, maxValue): @@ -37,7 +36,7 @@ class TestAny(TestCase): output = 
output.numpy() return output - def test_any_shape_format(self, device): + def test_any_shape_format(self, device="npu"): shape_list = [[], [1024], [32, 1024], @@ -75,7 +74,7 @@ class TestAny(TestCase): output1 = output1.to("cpu").numpy() return output0, output1 - def test_anyd_shape_format(self, device): + def test_anyd_shape_format(self, device="npu"): shape_list = [[ [1024], 0, False], [ [32, 1024], 1, False], [ [32, 8, 1024], 2, True ], @@ -90,6 +89,6 @@ class TestAny(TestCase): self.assertRtolEqual(cpu_output.astype(np.int32),npu_out0.astype(np.int32)) self.assertRtolEqual(cpu_output.astype(np.int32),npu_out1.astype(np.int32)) -instantiate_device_type_tests(TestAny, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_arange.py b/test/test_network_ops/test_arange.py index b2b0ca16b20b626b1e524f19312cb6fc0017bdc8..c4aa1a13e5dc571847467856d128a51e3b6d53c5 100644 --- a/test/test_network_ops/test_arange.py +++ b/test/test_network_ops/test_arange.py @@ -15,12 +15,11 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor class TestArange(TestCase): - def test_arange(self, device): + def test_arange(self, device="npu"): shape_format = [ [0, 100, 2, torch.float32], [1, 100, 1, torch.int32], @@ -34,7 +33,7 @@ class TestArange(TestCase): device="npu").cpu().numpy() self.assertRtolEqual(cpu_output, npu_output) - def test_arange_out(self, device): + def test_arange_out(self, device="npu"): shape_format = [ [0, 100, 1, torch.float32, [np.float32, 0, [10]]], [1, 100, 2, torch.int32, [np.int32, 0, [20]]], @@ -50,6 +49,5 @@ class TestArange(TestCase): self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestArange, globals(), except_for='cpu') if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_argmax.py b/test/test_network_ops/test_argmax.py index 36707cf0e94168230dcde42aa092cc94914478ae..5e0007727a8cff9e181d37666b62ae1fbe4fb6cb 100644 --- a/test/test_network_ops/test_argmax.py +++ b/test/test_network_ops/test_argmax.py @@ -15,9 +15,8 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor class TestArgmax(TestCase): @@ -32,7 +31,7 @@ class TestArgmax(TestCase): output = output.numpy() return output - def test_argmax_shape_format_fp16(self, device): + def test_argmax_shape_format_fp16(self, device="npu"): format_list = [0] shape_list = [[5], [2, 4], [2, 2, 4], [2, 3, 3, 4]] shape_format = [ @@ -46,7 +45,7 @@ class TestArgmax(TestCase): cpu_output = cpu_output.astype(npu_output.dtype) self.assertRtolEqual(cpu_output, npu_output) - def test_argmax_shape_format_fp32(self, device): + def test_argmax_shape_format_fp32(self, device="npu"): format_list = [0] shape_list = [[5], [2, 4], [2, 2, 4], [2, 3, 3, 4]] shape_format = [ @@ -70,7 +69,7 @@ class TestArgmax(TestCase): output = output.numpy() return output - def 
test_argmaxd_shape_format_fp16(self, device): + def test_argmaxd_shape_format_fp16(self, device="npu"): format_list = [0] shape_list = [[5], [2, 4], [2, 2, 4], [2, 3, 3, 4]] shape_format = [ @@ -84,7 +83,7 @@ class TestArgmax(TestCase): cpu_output = cpu_output.astype(npu_output.dtype) self.assertRtolEqual(cpu_output, npu_output) - def test_argmaxd_shape_format_fp32(self, device): + def test_argmaxd_shape_format_fp32(self, device="npu"): format_list = [0] shape_list = [[5], [2, 4], [2, 2, 4], [2, 3, 3, 4]] shape_format = [ @@ -97,6 +96,6 @@ class TestArgmax(TestCase): cpu_output = cpu_output.astype(npu_output.dtype) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestArgmax, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_argsort.py b/test/test_network_ops/test_argsort.py index 57f456937507b88fc10e22bc5c806c049dd2f280..e0fcfed5f8a964c93ed3e698ce5723a826a536c9 100644 --- a/test/test_network_ops/test_argsort.py +++ b/test/test_network_ops/test_argsort.py @@ -17,9 +17,8 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor class TestArgSort(TestCase): def cpu_op_exec(self, input1, dim, descending): @@ -39,7 +38,7 @@ class TestArgSort(TestCase): output = torch.argsort(input1) return output.cpu().numpy() - def test_sort_shape_format_fp32(self, device): + def test_sort_shape_format_fp32(self, device="npu"): shape_format = [ [[np.float32, 0, (8, 4, 3, 9)], 2, False], [[np.float32, 0, (2, 3)]], @@ -57,7 +56,7 @@ class TestArgSort(TestCase): npu_output = self.npu_default_op_exec(npu_input1) self.assertRtolEqual(cpu_output, npu_output) - def test_sort_shape_format_fp16(self, device): + def test_sort_shape_format_fp16(self, device="npu"): shape_format = [ [[np.float16, 0, (8, 4, 3, 9)], 2, False], [[np.float16, 0, (2, 3)]], @@ -76,6 +75,5 @@ class TestArgSort(TestCase): self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestArgSort, globals(), except_for="cpu") if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_asin.py b/test/test_network_ops/test_asin.py index 695010ca0d76549fec51aad943e248cd745fabd9..45d014c759eeb85986f92a7c2d4c819004effe98 100644 --- a/test/test_network_ops/test_asin.py +++ b/test/test_network_ops/test_asin.py @@ -15,9 +15,8 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor class TestAsin(TestCase): def cpu_op_exec(self,input1): @@ -37,7 +36,7 @@ class TestAsin(TestCase): output = output.numpy() return output - def test_asin_common_shape_format(self, device): + def test_asin_common_shape_format(self, device="npu"): shape_format = [ [[np.float32, 0, (5,3)]], ] @@ -47,7 +46,7 @@ class TestAsin(TestCase): npu_output = self.npu_op_exec(npu_input1) self.assertRtolEqual(cpu_output, npu_output) - def test_asin_out_common_shape_format(self, device): + def 
test_asin_out_common_shape_format(self, device="npu"): shape_format = [ [[np.float32, 0, (4,3)], [np.float32, 0, (4,3)]], ] @@ -58,6 +57,6 @@ class TestAsin(TestCase): npu_output = self.npu_op_exec_out(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestAsin, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_atan.py b/test/test_network_ops/test_atan.py index 9b7926e1e43a3a9d5fd4ae3e89782ce4b6d9a2bc..ad4289162208ebb7c7952ecf46bdfb67f5e8adb0 100644 --- a/test/test_network_ops/test_atan.py +++ b/test/test_network_ops/test_atan.py @@ -17,10 +17,10 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestAtan(TestCase): def cpu_op_exec(self, input1): output = torch.atan(input1) @@ -31,7 +31,7 @@ class TestAtan(TestCase): output = output.to("cpu") return output - def test_atan_shape_format(self, device): + def test_atan_shape_format(self, device="npu"): shape_format = [ [[np.float32, 0, 1]], [[np.float32, 0, (64, 10)]], @@ -45,6 +45,6 @@ class TestAtan(TestCase): npu_output = self.npu_op_exec(npu_input) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestAtan, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_avg_pool2d.py b/test/test_network_ops/test_avg_pool2d.py index b579cdbd06eac34d50456699b26509ec18d7f8d9..4d1c4e432be74d5d7a1bcb019a548ac0c83b8f8b 100644 --- a/test/test_network_ops/test_avg_pool2d.py +++ b/test/test_network_ops/test_avg_pool2d.py @@ -17,9 +17,8 @@ import torch_npu import torch.nn as nn import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor class TestAvgPool2d(TestCase): def cpu_op_exec(self, input1, ceil_mode): @@ -35,7 +34,7 @@ class TestAvgPool2d(TestCase): output = output.detach().numpy() return output - def test_avg_pool2d_backward_shape_format_fp16(self, device): + def test_avg_pool2d_backward_shape_format_fp16(self, device="npu"): shape_format = [ [[np.float16, 0, (1, 3, 147, 147)], True], [[np.float16, 0, (1, 3, 147, 147)], True] @@ -48,7 +47,7 @@ class TestAvgPool2d(TestCase): npu_output = self.npu_op_exec(npu_input, item[1]) self.assertRtolEqual(cpu_output, npu_output, prec16=0.002) - def test_avg_pool2d_backward_shape_format_fp32(self, device): + def test_avg_pool2d_backward_shape_format_fp32(self, device="npu"): shape_format = [ [[np.float32, 0, (1, 3, 147, 147)], True], [[np.float32, 0, (1, 3, 147, 147)], True] @@ -60,6 +59,6 @@ class TestAvgPool2d(TestCase): npu_output = self.npu_op_exec(npu_input, item[1]) self.assertRtolEqual(cpu_output, npu_output, 0.0009) -instantiate_device_type_tests(TestAvgPool2d, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_avg_pool2d_backward.py b/test/test_network_ops/test_avg_pool2d_backward.py index 
f673e65be51ae20ac3483d429a28767eec4fc11e..fb89146df2ee551923d1728df89896ad44b467a2 100644 --- a/test/test_network_ops/test_avg_pool2d_backward.py +++ b/test/test_network_ops/test_avg_pool2d_backward.py @@ -17,9 +17,8 @@ import torch_npu import torch.nn as nn import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor class TestAvgPool2dBackward(TestCase): @@ -47,7 +46,7 @@ class TestAvgPool2dBackward(TestCase): return output_grad, output - def test_avg_pool2d_backward_shape_format_fp16(self, device): + def test_avg_pool2d_backward_shape_format_fp16(self, device="npu"): format_list = [0, 3] shape_list = [(5, 20, 8, 8)] shape_format = [ @@ -64,7 +63,7 @@ class TestAvgPool2dBackward(TestCase): self.assertRtolEqual(cpu_output, npu_output) self.assertRtolEqual(cpu_output_grad, npu_output_grad) - def test_avg_pool2d_backward_shape_format_fp32(self, device): + def test_avg_pool2d_backward_shape_format_fp32(self, device="npu"): format_list = [0, 3] shape_list = [(5, 20, 8, 8)] shape_format = [ @@ -84,9 +83,5 @@ class TestAvgPool2dBackward(TestCase): self.assertRtolEqual(cpu_output_grad, npu_output_grad) -instantiate_device_type_tests( - TestAvgPool2dBackward, - globals(), - except_for="cpu") if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_avg_pool3d.py b/test/test_network_ops/test_avg_pool3d.py index 6efa9c0840ea10b447397cf9b5be677a0c4007bf..dd8e3188f44865f496508fa3c27ae814a86d1c7c 100644 --- a/test/test_network_ops/test_avg_pool3d.py +++ b/test/test_network_ops/test_avg_pool3d.py @@ -18,9 +18,8 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor class TestAvgPool3D(TestCase): @@ -40,7 +39,7 @@ class TestAvgPool3D(TestCase): output_data = m(input1) return output_data - def test_avg_pool_3d_fp32(self, device): + def test_avg_pool_3d_fp32(self, device="npu"): # shape_format:[[dtype, (input_shape)], kernel_size, stride] shape_format = [ [[np.float32, -1, (20, 16, 50, 44, 31)], (3, 2, 2), (2, 1, 2)], @@ -55,7 +54,7 @@ class TestAvgPool3D(TestCase): cpu_output = self.cpu_op_exec(item[1], item[2], cpu_input1) self.assertRtolEqual(cpu_output, npu_output.cpu(), 1.e-3) - def test_avg_pool_3d_fp16(self, device): + def test_avg_pool_3d_fp16(self, device="npu"): # shape_format:[[dtype, (input_shape)], kernel_size, stride] shape_format = [ [[np.float16, -1, (20, 16, 50, 44, 31)], (3, 2, 2), (2, 1, 2)], @@ -70,7 +69,7 @@ class TestAvgPool3D(TestCase): cpu_output = self.cpu_op_exec_fp16(item[1], item[2], cpu_input1) self.assertRtolEqual(cpu_output, npu_output.cpu()) -instantiate_device_type_tests(TestAvgPool3D, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_avg_pool3d_backward.py b/test/test_network_ops/test_avg_pool3d_backward.py index 16e4e940728c968e601c8b9a241716f392cc53c6..e86ecc7d2b57afad31e89bf164a73e64954c4982 100644 --- a/test/test_network_ops/test_avg_pool3d_backward.py +++ 
b/test/test_network_ops/test_avg_pool3d_backward.py @@ -18,9 +18,8 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor class TestAvgPool3DBackward(TestCase): @@ -58,7 +57,7 @@ class TestAvgPool3DBackward(TestCase): output = output.detach().numpy() return output_grad, output - def test_avg_pool_3d_fp32(self, device): + def test_avg_pool_3d_fp32(self, device="npu"): shape_format = [ [[np.float32, -1, (20, 16, 50, 44, 31)], (3, 2, 2), (2, 1, 2)], [[np.float32, -1, (2, 1, 4, 4, 4)], 3, 2], @@ -73,7 +72,7 @@ class TestAvgPool3DBackward(TestCase): self.assertRtolEqual(cpu_output, npu_output, 1.e-3) self.assertRtolEqual(cpu_output_grad, npu_output_grad, 1.e-3) - def test_avg_pool_3d_fp16(self, device): + def test_avg_pool_3d_fp16(self, device="npu"): shape_format = [ [[np.float16, -1, (20, 16, 50, 44, 31)], (3, 2, 2), (2, 1, 2)], [[np.float16, -1, (2, 1, 4, 4, 4)], 3, 2], @@ -88,7 +87,7 @@ class TestAvgPool3DBackward(TestCase): self.assertRtolEqual(cpu_output, npu_output) self.assertRtolEqual(cpu_output_grad, npu_output_grad) -instantiate_device_type_tests(TestAvgPool3DBackward, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_baddbmm.py b/test/test_network_ops/test_baddbmm.py index b59fcdcde5138d30cc2e3cf54492ae552a91b468..c419ff8a92aac98e79533ebbaa33429cc603dd3c 100644 --- a/test/test_network_ops/test_baddbmm.py +++ b/test/test_network_ops/test_baddbmm.py @@ -16,9 +16,9 @@ import torch_npu import numpy as np from torch.nn import functional as F -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor, test_2args_broadcast, create_dtype_tensor, UT_FAST_MODE +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestBaddBmm(TestCase): def generate_scalar(self, dtype, min1, max1): @@ -52,7 +52,7 @@ class TestBaddBmm(TestCase): input1 = input1.numpy() return input1 - def test_baddbmm_common_shape_format(self, device): + def test_baddbmm_common_shape_format(self, device="npu"): shape_format = [ [[np.float32, -1, (1, 3, 5)], [np.float32, -1, (1, 3, 4)], [np.float32, -1, (1, 4, 5)], "float32"], @@ -77,7 +77,7 @@ class TestBaddBmm(TestCase): npu_output_ = self.npu_op_exec_(npu_input1, npu_input2, npu_input3, scalar1, scalar2) self.assertRtolEqual(cpu_output_, npu_output_) - def test_baddbmm_float16_shape_format(self, device): + def test_baddbmm_float16_shape_format(self, device="npu"): def cpu_op_exec_fp16(input1, input2, input3, scalar1, scalar2): input1 = input1.to(torch.float32) input2 = input2.to(torch.float32) @@ -109,6 +109,5 @@ class TestBaddBmm(TestCase): self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestBaddBmm, globals(), except_for='cpu') if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_batch_norm_stats.py b/test/test_network_ops/test_batch_norm_stats.py index 88f1598df8fd9053d08cf402cf1458abc33fca7f..22143b876a5b4fa9bc454f5bf944a2f8ef288120 100644 --- a/test/test_network_ops/test_batch_norm_stats.py +++ 
b/test/test_network_ops/test_batch_norm_stats.py @@ -16,9 +16,9 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor, create_dtype_tensor, UT_FAST_MODE +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestBatchNormStats(TestCase): def cuda_op_exec(self, *args): @@ -36,7 +36,7 @@ class TestBatchNormStats(TestCase): out_invstd = npu_invstd.cpu().numpy() return out_mean, out_invstd - def test_batch_norm_stats(self, device): + def test_batch_norm_stats(self, device="npu"): shape_format = [ [[np.float16, -1, [2, 3, 12, 12]], 1e-5], ] @@ -57,6 +57,5 @@ class TestBatchNormStats(TestCase): self.assertRtolEqual(cpu_output[1], npu_outputfp32[1], 1e-2) -instantiate_device_type_tests(TestBatchNormStats, globals(), except_for='cpu') if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_batchnorm3d_backward.py b/test/test_network_ops/test_batchnorm3d_backward.py index 4872d2cdf7e0fcee61a8d8ed7434301e824e4490..af1eaad3826ba297960d31ab44fbe8e1b449f9c5 100644 --- a/test/test_network_ops/test_batchnorm3d_backward.py +++ b/test/test_network_ops/test_batchnorm3d_backward.py @@ -16,9 +16,9 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestBn2d(TestCase): def cpu_op_exec(self,input1, dim): @@ -48,7 +48,7 @@ class TestBn2d(TestCase): output = output.detach().numpy() return output, input_npu - def test_batchnorm3d_shape_format_fp16(self, device): + def test_batchnorm3d_shape_format_fp16(self, device="npu"): format_list = [30] shape_list = [[256, 164, 7, 7, 7],[148, 16, 28, 28, 28]] shape_format = [ @@ -64,7 +64,7 @@ class TestBn2d(TestCase): cpu_input = cpu_input.astype(npu_input.dtype) self.assertRtolEqual(cpu_input, npu_input) - def test_batchnorm3d_shape_format_fp32(self, device): + def test_batchnorm3d_shape_format_fp32(self, device="npu"): format_list = [30] shape_list = [(256, 32, 7, 7, 7)] shape_format = [ @@ -79,7 +79,7 @@ class TestBn2d(TestCase): cpu_input = cpu_input.astype(npu_input.dtype) self.assertRtolEqual(cpu_input, npu_input) -instantiate_device_type_tests(TestBn2d, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_batchnorm_backward.py b/test/test_network_ops/test_batchnorm_backward.py index 4d32547eb074a8ec0e6167fefa864d09cab49c31..de5e7d9bce0b97cf869bce05b82a2871c4f6c9d3 100644 --- a/test/test_network_ops/test_batchnorm_backward.py +++ b/test/test_network_ops/test_batchnorm_backward.py @@ -16,9 +16,9 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestBn2d(TestCase): def cpu_op_exec(self,input1, dim): @@ -48,7 +48,7 
@@ class TestBn2d(TestCase): output = output.detach().numpy() return output, input_npu - def test_batchnorm_shape_format_fp16(self, device): + def test_batchnorm_shape_format_fp16(self, device="npu"): format_list = [0] shape_list = [[256, 672, 7, 7],[1024, 58, 28, 28]] shape_format = [ @@ -64,7 +64,7 @@ class TestBn2d(TestCase): cpu_input = cpu_input.astype(npu_input.dtype) self.assertRtolEqual(cpu_input, npu_input) - def test_batchnorm_shape_format_fp32(self, device): + def test_batchnorm_shape_format_fp32(self, device="npu"): format_list = [0] shape_list = [(256, 32, 112, 112)] shape_format = [ @@ -79,7 +79,7 @@ class TestBn2d(TestCase): cpu_input = cpu_input.astype(npu_input.dtype) self.assertRtolEqual(cpu_input, npu_input) -instantiate_device_type_tests(TestBn2d, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_batchnorm_backward_elemt.py b/test/test_network_ops/test_batchnorm_backward_elemt.py index d85aaa5aefc449b84c7561edb107b5df1cf55fb3..39a5d5a36047a532d5876fc6c83ab35c46dadef0 100644 --- a/test/test_network_ops/test_batchnorm_backward_elemt.py +++ b/test/test_network_ops/test_batchnorm_backward_elemt.py @@ -14,12 +14,12 @@ import torch import torch_npu -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.testcase import TestCase, run_tests + class TestBatchNormBackwardElemt(TestCase): - def test_batch_norm_backward_elemt_4d(self, device): + def test_batch_norm_backward_elemt_4d(self, device="npu"): grad_output = torch.ones([2, 3, 1, 4]).npu() input1 = torch.ones([2, 3, 1, 4]).npu() mean = torch.tensor([8., 5., 9.]).npu() @@ -38,7 +38,7 @@ class TestBatchNormBackwardElemt(TestCase): [[2776., 2776., 2776, 2776.]]]]) self.assertRtolEqual(grad_input.cpu(), cuda_expect_out) - def test_batch_norm_backward_elemt_2d(self, device): + def test_batch_norm_backward_elemt_2d(self, device="npu"): grad_output = torch.ones([2, 3]).npu() input1 = torch.ones([2, 3]).npu() mean = torch.tensor([8., 5., 9.]).npu() @@ -53,7 +53,7 @@ class TestBatchNormBackwardElemt(TestCase): [110., 11., 2776.]]) self.assertRtolEqual(grad_input.cpu(), cuda_expect_out) - def test_batch_norm_backward_elemt_2d_fp(self, device): + def test_batch_norm_backward_elemt_2d_fp(self, device="npu"): grad_output = torch.ones([2, 3]).npu() input1 = torch.ones([2, 3]).npu() mean = torch.tensor([8.123456, 5.147125, 9.365778]).npu() @@ -68,6 +68,6 @@ class TestBatchNormBackwardElemt(TestCase): [361.5542, 41.5013, 4467.4121]]) self.assertRtolEqual(grad_input.cpu(), cuda_expect_out) -instantiate_device_type_tests(TestBatchNormBackwardElemt, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_batchnorm_backward_eval.py b/test/test_network_ops/test_batchnorm_backward_eval.py index a81e3fb06e331c2c2fd7eb2b38a141538b4c1fc4..5a8ae18f95214f7d4f76df092d3e17ff40970bbb 100644 --- a/test/test_network_ops/test_batchnorm_backward_eval.py +++ b/test/test_network_ops/test_batchnorm_backward_eval.py @@ -17,8 +17,7 @@ import torch import torch_npu import torch.nn as nn -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.testcase import TestCase, run_tests class Model(nn.Module): @@ -38,7 +37,7 @@ class Model(nn.Module): return x class TestBn2dEval(TestCase): - def 
test_batchnorm_backward_eval(self, device): + def test_batchnorm_backward_eval(self, device="npu"): model = Model(in_channels=256) cpu_tensor = torch.randn(32,256,14,14) npu_tensor = cpu_tensor.npu() @@ -74,6 +73,6 @@ class TestBn2dEval(TestCase): #精度未满足 self.assertRtolEqual(cpu_grad.numpy(), npu_grad.numpy()) self.assertRtolEqual(cpu_grad.numpy(), npu_grad.numpy(), 0.1) -instantiate_device_type_tests(TestBn2dEval, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_batchnorm_gather_stats_with_counts.py b/test/test_network_ops/test_batchnorm_gather_stats_with_counts.py index 52585e231104ab9daa514f5ea9831110172df118..71f12c1f623fecd390693ce1d166bb185aa64b5a 100644 --- a/test/test_network_ops/test_batchnorm_gather_stats_with_counts.py +++ b/test/test_network_ops/test_batchnorm_gather_stats_with_counts.py @@ -16,9 +16,9 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestBatchNormGatherStatsWithCounts(TestCase): def expect_cuda_out_fp16(self): @@ -61,7 +61,7 @@ class TestBatchNormGatherStatsWithCounts(TestCase): npu_counts = torch_npu.npu_format_cast(npu_counts, npu_format) return npu_counts - def test_batch_norm_gather_stats_with_counts(self, device): + def test_batch_norm_gather_stats_with_counts(self, device="npu"): shape_format = [ [[np.float16, -1, [2, 3, 12, 12]], [np.float32, -1, [4, 3]], [np.float32, -1, [4, 3]], \ [np.float32, -1, [3]], [np.float32, -1, [3]], 1e-3, 1e-5, [np.float32, -1, [4]], 0], @@ -99,6 +99,6 @@ class TestBatchNormGatherStatsWithCounts(TestCase): self.assertRtolEqual(npu_outputfp32[0], cuda_output[0]) self.assertRtolEqual(npu_outputfp32[1], cuda_output[1]) -instantiate_device_type_tests(TestBatchNormGatherStatsWithCounts, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_binary_cross_entropy.py b/test/test_network_ops/test_binary_cross_entropy.py index 565d7ef3d8fcc0667092e0d31b9ad9ab2c5fbc9b..d943017f87d93c88c36058862b93f7c2a0bdf6a0 100644 --- a/test/test_network_ops/test_binary_cross_entropy.py +++ b/test/test_network_ops/test_binary_cross_entropy.py @@ -13,14 +13,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import copy import torch import torch_npu -import torch.nn as nn import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.testcase import TestCase, run_tests LOWER = 0 UPPER = 1 @@ -49,7 +46,7 @@ class TestBinaryCrossEntropy(TestCase): res = res.to("cpu") return res.numpy() - def test_binary_cross_entropy_float32(self, device): + def test_binary_cross_entropy_float32(self, device="npu"): for shape, weight_shape, reduction in [ ((10, 64), None, "mean"), ((10, 64), (10, 1), "mean"), @@ -67,7 +64,7 @@ class TestBinaryCrossEntropy(TestCase): npu_output = self.npu_op_exec(predict, target, weight=weight, reduction=reduction) self.assertRtolEqual(cpu_output, npu_output) - def test_binary_cross_entropy_float16(self, device): + def test_binary_cross_entropy_float16(self, device="npu"): for shape, weight_shape, reduction in [ ((10, 64), (10, 64), "sum"), ((10, 64), (10, 64), "mean"), @@ -87,6 +84,6 @@ class TestBinaryCrossEntropy(TestCase): cpu_output = self.cpu_op_exec_half(predict_32, target_32, weight=weight_32, reduction=reduction) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestBinaryCrossEntropy, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_binary_cross_entropy_backward.py b/test/test_network_ops/test_binary_cross_entropy_backward.py index 3f377d21d199999af84e485b9929c83bfd4f2c40..e3ed88662d6de0608f03e51a413e296eca682d6b 100644 --- a/test/test_network_ops/test_binary_cross_entropy_backward.py +++ b/test/test_network_ops/test_binary_cross_entropy_backward.py @@ -17,11 +17,10 @@ import copy import torch import torch_npu -import torch.nn as nn import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.testcase import TestCase, run_tests + class TestBinaryCrossEntropyBackward(TestCase): def generate_data(self, min_val, max_val, shape, dtype): @@ -70,7 +69,7 @@ class TestBinaryCrossEntropyBackward(TestCase): res = res.numpy() return npu_input, res - def test_binary_cross_entropy_backward_float16(self, device): + def test_binary_cross_entropy_backward_float16(self, device="npu"): shape_list = [(10, 64)] reduction_list = ["none", "mean", "sum"] shape_format = [ @@ -87,7 +86,7 @@ class TestBinaryCrossEntropyBackward(TestCase): self.assertRtolEqual(cpu_output, npu_output) self.assertRtolEqual(cpu_grad, npu_grad) - def test_binary_cross_entropy_backward_float32(self, device): + def test_binary_cross_entropy_backward_float32(self, device="npu"): shape_list = [(10, 64)] reduction_list = ["none", "mean", "sum"] shape_format = [ @@ -104,7 +103,7 @@ class TestBinaryCrossEntropyBackward(TestCase): self.assertRtolEqual(cpu_output, npu_output) self.assertRtolEqual(cpu_grad, npu_grad) - def test_binary_cross_entropy_backward_with_weight_float16(self, device): + def test_binary_cross_entropy_backward_with_weight_float16(self, device="npu"): shape_list = [(10, 64)] reduction_list = ["none", "mean", "sum"] shape_format = [ @@ -122,7 +121,7 @@ class TestBinaryCrossEntropyBackward(TestCase): self.assertRtolEqual(cpu_output, npu_output) self.assertRtolEqual(cpu_grad, npu_grad) - def test_binary_cross_entropy_backward_with_weight_float32(self, device): + def test_binary_cross_entropy_backward_with_weight_float32(self, device="npu"): shape_list = [(10, 64)] 
reduction_list = ["none", "mean", "sum"] shape_format = [ @@ -140,6 +139,6 @@ class TestBinaryCrossEntropyBackward(TestCase): self.assertRtolEqual(cpu_output, npu_output) self.assertRtolEqual(cpu_grad, npu_grad) -instantiate_device_type_tests(TestBinaryCrossEntropyBackward, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_binary_cross_entropy_with_logits.py b/test/test_network_ops/test_binary_cross_entropy_with_logits.py index e93c3db30502d79fc8dbfa5e40f8e53b8f608b9e..8e0ce96ee13c2ae9f51bc51deb54b7838b26c3cf 100644 --- a/test/test_network_ops/test_binary_cross_entropy_with_logits.py +++ b/test/test_network_ops/test_binary_cross_entropy_with_logits.py @@ -11,14 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import copy import torch import torch_npu -import torch.nn as nn import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.testcase import TestCase, run_tests + class TestBinaryCrossEntropyWithLogits(TestCase): @@ -75,7 +73,7 @@ class TestBinaryCrossEntropyWithLogits(TestCase): res = res.to("cpu") return res.numpy() - def test_binary_cross_with_logits_float32(self, device): + def test_binary_cross_with_logits_float32(self, device="npu"): for shape, weight_shape, pos_weight_shape, reduction in [ ((10, 64), None, None, "mean"), ((10, 64), (10, 1), None, "mean"), @@ -102,7 +100,7 @@ class TestBinaryCrossEntropyWithLogits(TestCase): npu_output = self.npu_op_exec(input1, target, weight=weight, pos_weight=pos_weight, reduction=reduction) self.assertRtolEqual(cpu_output, npu_output) - def test_binary_cross_with_logits_float16(self, device): + def test_binary_cross_with_logits_float16(self, device="npu"): for shape, weight_shape, pos_weight_shape, reduction in [ ((10, 64), None, None, "mean"), ((10, 64), (10, 1), None, "mean"), @@ -139,7 +137,7 @@ class TestBinaryCrossEntropyWithLogits(TestCase): cpu_output = cpu_output.astype(np.float16) self.assertRtolEqual(cpu_output, npu_output) - def test_binary_cross_with_logits_function_float32(self, device): + def test_binary_cross_with_logits_function_float32(self, device="npu"): for shape, weight_shape, pos_weight_shape, reduction in [ ((10, 64), None, None, "mean"), ((10, 64), (10, 1), None, "mean"), @@ -166,7 +164,7 @@ class TestBinaryCrossEntropyWithLogits(TestCase): npu_output = self.npu_op_func_exec(input1, target, weight=weight, pos_weight=pos_weight, reduction=reduction) self.assertRtolEqual(cpu_output, npu_output) - def test_binary_cross_with_logits_function_float16(self, device): + def test_binary_cross_with_logits_function_float16(self, device="npu"): for shape, weight_shape, pos_weight_shape, reduction in [ ((10, 64), None, None, "mean"), ((10, 64), (10, 1), None, "mean"), @@ -204,6 +202,6 @@ class TestBinaryCrossEntropyWithLogits(TestCase): cpu_output = cpu_output.astype(np.float16) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestBinaryCrossEntropyWithLogits, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_binary_cross_entropy_with_logits_backward.py b/test/test_network_ops/test_binary_cross_entropy_with_logits_backward.py index bce792bdb8159c082ef736a6b388897c09b1576f..1e01253d07db20f1c511498f071713957188e840 100644 --- 
a/test/test_network_ops/test_binary_cross_entropy_with_logits_backward.py +++ b/test/test_network_ops/test_binary_cross_entropy_with_logits_backward.py @@ -17,8 +17,8 @@ import torch_npu import torch.nn as nn import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.testcase import TestCase, run_tests + def generate_data(min1, max1, shape, dtype): input1 = np.random.uniform(min1, max1, shape).astype(dtype) @@ -53,7 +53,7 @@ class TestBinaryCrossEntropyWithLogitsBackward(TestCase): res = res.numpy() return input_npu, res - def test_binary_cross_entropy_with_logits_backward_fp32(self, device): + def test_binary_cross_entropy_with_logits_backward_fp32(self, device="npu"): npu_input1, npu_target = generate_data(0, 100, (5, 3), np.float32) cpu_input1 = copy.deepcopy(npu_input1) cpu_target = copy.deepcopy(npu_target) @@ -62,7 +62,7 @@ class TestBinaryCrossEntropyWithLogitsBackward(TestCase): self.assertRtolEqual(cpu_output, npu_output) self.assertRtolEqual(cpu_grad_output, npu_grad_output) - def test_binary_cross_entropy_with_logits_backward_fp16(self, device): + def test_binary_cross_entropy_with_logits_backward_fp16(self, device="npu"): npu_input1, npu_target = generate_data(0, 100, (5, 3), np.float16) cpu_input1 = copy.deepcopy(npu_input1) cpu_target = copy.deepcopy(npu_target) @@ -75,7 +75,7 @@ class TestBinaryCrossEntropyWithLogitsBackward(TestCase): self.assertRtolEqual(cpu_output, npu_output) self.assertRtolEqual(cpu_grad_output, npu_grad_output) -instantiate_device_type_tests(TestBinaryCrossEntropyWithLogitsBackward, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_bitwise_and.py b/test/test_network_ops/test_bitwise_and.py index fe17bbb72ffd7f979d1279ae1e3d4297624d99c1..7299201eb52575aa00f5a14dfc9e6d0b33d67b2a 100644 --- a/test/test_network_ops/test_bitwise_and.py +++ b/test/test_network_ops/test_bitwise_and.py @@ -12,14 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import unittest import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor, test_2args_broadcast, create_dtype_tensor, UT_FAST_MODE +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestBitwiseAnd(TestCase): def generate_data(self, min_d, max_d, shape, dtype): @@ -144,7 +143,7 @@ class TestBitwiseAnd(TestCase): cpu_output_out = cpu_output_out.astype(npu_output_out.dtype) self.assertRtolEqual(cpu_output_out, npu_output_out) - def test_bitwise_and_tensor_out(self, device): + def test_bitwise_and_tensor_out(self, device="npu"): shape_format = [ [[np.int16, 0, [128, 3, 224, 224]], [np.int16, 0, [3, 3, 3]]], [[np.int16, 0, [128, 116, 14, 14]], [np.int16, 0, [128, 116, 14, 14]]], @@ -164,7 +163,7 @@ class TestBitwiseAnd(TestCase): cpu_output_out = cpu_output_out.astype(npu_output_out.dtype) self.assertRtolEqual(cpu_output_out, npu_output_out) - def test_bitwise_and_scalar_out(self, device): + def test_bitwise_and_scalar_out(self, device="npu"): shape_format = [ [[np.int16, 0, [16, 3, 1111, 1212]], [np.int16, 0, [3, 3, 3]]], [[np.int16, 0, [128, 116, 14, 14]], [np.int16, 0, [128, 116, 14, 14]]], @@ -173,26 +172,26 @@ class TestBitwiseAnd(TestCase): ] self.bitwise_and_scalar_out_result(shape_format) - def test_bitwise_and_bool_scalar(self, device): + def test_bitwise_and_bool_scalar(self, device="npu"): npu_input1, npu_input2 = self.generate_data(0, 100, (2,3), np.int32) cpu_output = self.cpu_op_exec_out(npu_input1, True,npu_input1) npu_output = self.npu_op_exec_scalar_out(npu_input1, True,npu_input1) self.assertRtolEqual(cpu_output, npu_output) - def test_bitwise_and_int16_diff(self, device): + def test_bitwise_and_int16_diff(self, device="npu"): npu_input1 = self.generate_single_data(0, 100, (1,6), np.int16) npu_input2 = self.generate_single_data(0, 100, (1,1), np.int16) cpu_output = self.cpu_op_exec_out(npu_input1, npu_input2, npu_input1) npu_output = self.npu_op_exec_out(npu_input1, npu_input2, npu_input1) self.assertRtolEqual(cpu_output, npu_output) - def test_bitwise_and_mix_dtype(self, device): + def test_bitwise_and_mix_dtype(self, device="npu"): npu_input1, npu_input2 = self.generate_data(0, 100, (1,6), np.int32) npu_input3, npu_input4 = self.generate_data(0, 100, (1,6), np.int16) cpu_output = self.cpu_op_exec(npu_input1, npu_input3) npu_output = self.npu_mix_op_exec(npu_input1, npu_input3) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestBitwiseAnd, globals(), except_for='cpu') + if __name__ == '__main__': run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_bitwise_not.py b/test/test_network_ops/test_bitwise_not.py index feb0d3e2c4a1d6b17555b9967a8db25fd071c49c..7ceccf49b276c372544fbf89ce27305f4dde51dd 100644 --- a/test/test_network_ops/test_bitwise_not.py +++ b/test/test_network_ops/test_bitwise_not.py @@ -15,9 +15,8 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests + class Test_Bitwise_Not(TestCase): def generate_data(self, min_d, max_d, shape, dtype): @@ -57,31 +56,31 @@ 
class Test_Bitwise_Not(TestCase): output = output.numpy() return output - def test_bitwise_not_bool(self, device): + def test_bitwise_not_bool(self, device="npu"): npu_input1 = self.generate_bool_data((2, 3)) cpu_output = self.cpu_op_exec(npu_input1) npu_output = self.npu_op_exec(npu_input1) self.assertRtolEqual(cpu_output, npu_output) - def test_bitwise_not_int16(self, device): + def test_bitwise_not_int16(self, device="npu"): npu_input1 = self.generate_data(0, 2342, (2, 3), np.int16) cpu_output = self.cpu_op_exec(npu_input1) npu_output = self.npu_op_exec(npu_input1) self.assertRtolEqual(cpu_output, npu_output) - def test_bitwise_not_int32(self, device): + def test_bitwise_not_int32(self, device="npu"): npu_input1 = self.generate_data(0, 34222, (2, 3), np.int32) cpu_output = self.cpu_op_exec(npu_input1) npu_output = self.npu_op_exec(npu_input1) self.assertRtolEqual(cpu_output, npu_output) - def test_bitwise_not_int64(self, device): + def test_bitwise_not_int64(self, device="npu"): npu_input1 = self.generate_data(0, 355553, (2, 3), np.int64) cpu_output = self.cpu_op_exec(npu_input1) npu_output = self.npu_op_exec(npu_input1) self.assertRtolEqual(cpu_output, npu_output) - def test_bitwise_not_out(self, device): + def test_bitwise_not_out(self, device="npu"): shape_format = [ [[0, 2342, [2, 3], np.int16], [0, 2342, [10, 20], np.int16]], [[0, 34222, [2, 3], np.int32], [0, 34222, [10, 20], np.int32]], @@ -96,6 +95,6 @@ class Test_Bitwise_Not(TestCase): self.assertRtolEqual(cpu_output, npu_output1) self.assertRtolEqual(cpu_output, npu_output1) -instantiate_device_type_tests(Test_Bitwise_Not, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_bitwise_or.py b/test/test_network_ops/test_bitwise_or.py index d2ebbad5aaf1b9bfcb018c3d03b9cac46b7bd45b..adce35bf31319f3ff5686498c9df15481c9edf52 100644 --- a/test/test_network_ops/test_bitwise_or.py +++ b/test/test_network_ops/test_bitwise_or.py @@ -16,9 +16,9 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestBitwiseOr(TestCase): def generate_data(self, min_d, max_d, shape, dtype): @@ -156,7 +156,7 @@ class TestBitwiseOr(TestCase): self.assertRtolEqual(cpu_output_out, npu_output_out) - def test_bitwise_or_tensor_out(self, device): + def test_bitwise_or_tensor_out(self, device="npu"): shape_format = [ [[np.int16, 0, [128, 3, 224, 224]], [np.int16, 0, [3, 3, 3]]], [[np.int16, 0, [128, 116, 14, 14]], [np.int16, 0, [128, 116, 14, 14]]], @@ -176,7 +176,7 @@ class TestBitwiseOr(TestCase): cpu_output_out = cpu_output_out.astype(npu_output_out.dtype) self.assertRtolEqual(cpu_output_out, npu_output_out) - def test_bitwise_or_scalar_out(self, device): + def test_bitwise_or_scalar_out(self, device="npu"): shape_format = [ [[np.int16, 0, [16, 3, 1111, 1212]], [np.int16, 0, [3, 3, 3]]], [[np.int16, 0, [128, 116, 14, 14]], [np.int16, 0, [128, 116, 14, 14]]], @@ -185,56 +185,56 @@ class TestBitwiseOr(TestCase): ] self.bitwise_or_scalar_out_result(shape_format) - def test_bitwise_or_int32(self, device): + def test_bitwise_or_int32(self, device="npu"): npu_input1, npu_input2 = self.generate_data(0, 100, (2,3), np.int32) cpu_output = self.cpu_op_exec_out(npu_input1, npu_input2, 
npu_input1) npu_output = self.npu_op_exec_out(npu_input1, npu_input2, npu_input1) self.assertRtolEqual(cpu_output, npu_output) - def test_bitwise_or_bool_scalar(self, device): + def test_bitwise_or_bool_scalar(self, device="npu"): npu_input1, npu_input2 = self.generate_data(0, 100, (2,3), np.int32) cpu_output = self.cpu_op_exec_out(npu_input1, True, npu_input1) npu_output = self.npu_op_exec_scalar_out(npu_input1, True, npu_input1) self.assertRtolEqual(cpu_output, npu_output) - def test_bitwise_or_int32_scalar(self, device): + def test_bitwise_or_int32_scalar(self, device="npu"): npu_input1, npu_input2 = self.generate_data(0, 100, (2,3), np.int32) cpu_output = self.cpu_op_exec_out(npu_input1, 1, npu_input1) npu_output = self.npu_op_exec_scalar_out(npu_input1, 1, npu_input1) self.assertRtolEqual(cpu_output, npu_output) - def test_bitwise_or_int16(self, device): + def test_bitwise_or_int16(self, device="npu"): npu_input1, npu_input2 = self.generate_data(0, 100, (1,6), np.int16) cpu_output = self.cpu_op_exec_out(npu_input1, npu_input2, npu_input1) npu_output = self.npu_op_exec_out(npu_input1, npu_input2, npu_input1) self.assertRtolEqual(cpu_output, npu_output) - def test_bitwise_or_int16_scalar(self, device): + def test_bitwise_or_int16_scalar(self, device="npu"): npu_input1, npu_input2 = self.generate_data(0, 100, (2,3), np.int16) cpu_output = self.cpu_op_exec_out(npu_input1, 1, npu_input1) npu_output = self.npu_op_exec_scalar_out(npu_input1, 1, npu_input1) self.assertRtolEqual(cpu_output, npu_output) - def test_bitwise_or_int16_diff(self, device): + def test_bitwise_or_int16_diff(self, device="npu"): npu_input1 = self.generate_single_data(0, 100, (1,6), np.int16) npu_input2 = self.generate_single_data(0, 100, (1,1), np.int16) cpu_output = self.cpu_op_exec_out(npu_input1, npu_input2, npu_input1) npu_output = self.npu_op_exec_out(npu_input1, npu_input2, npu_input1) self.assertRtolEqual(cpu_output, npu_output) - def test_bitwise_or_int16_out(self, device): + def test_bitwise_or_int16_out(self, device="npu"): npu_input1, npu_input2, npu_input3 = self.generate_three_data(0, 100, (4,3), np.int16) cpu_output = self.cpu_op_exec_out(npu_input1, npu_input2, npu_input3) npu_output = self.npu_op_exec_out(npu_input1, npu_input2, npu_input3) self.assertRtolEqual(cpu_output, npu_output) - def test_bitwise_or_mix_dtype(self, device): + def test_bitwise_or_mix_dtype(self, device="npu"): npu_input1 = self.generate_single_data(0, 100, (1,6), np.int32) npu_input2 = self.generate_single_data(0, 100, (1,6), np.int16) cpu_output = self.cpu_op_exec(npu_input1, npu_input2) npu_output = self.npu_mix_op_exec(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestBitwiseOr, globals(), except_for='cpu') + if __name__ == '__main__': run_tests() diff --git a/test/test_network_ops/test_bmm.py b/test/test_network_ops/test_bmm.py index de204834bb3e7644060ebc02e4125fdd1713ebba..1391cb40020786df47f6dc9a0033935d8fc760bb 100644 --- a/test/test_network_ops/test_bmm.py +++ b/test/test_network_ops/test_bmm.py @@ -15,9 +15,9 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestBatchMatMul(TestCase): def cpu_op_exec(self, input1, input2): @@ -44,7 
+44,7 @@ class TestBatchMatMul(TestCase): cpu_output = cpu_output.astype(npu_output.dtype) self.assertRtolEqual(cpu_output, npu_output) - def test_batchmatmul_shape_format_fp16_3d(self, device): + def test_batchmatmul_shape_format_fp16_3d(self, device="npu"): format_list = [0, 3, 29] shape_list = [(1, 3, 2)] shape_format1 = [[np.float16, i, j] @@ -56,7 +56,7 @@ class TestBatchMatMul(TestCase): shape_format = [[i, j] for i in shape_format1 for j in shape_format2] self.bmm_auto_list_exec(shape_format) - def test_batchmatmul_shape_format_fp32_3d(self, device): + def test_batchmatmul_shape_format_fp32_3d(self, device="npu"): format_list = [0, 3, 29] shape_list = [(1, 3, 2)] shape_format1 = [[np.float32, i, j] @@ -68,6 +68,6 @@ class TestBatchMatMul(TestCase): shape_format = [[i, j] for i in shape_format1 for j in shape_format2] self.bmm_auto_list_exec(shape_format) -instantiate_device_type_tests(TestBatchMatMul, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_bmmV2.py b/test/test_network_ops/test_bmmV2.py index 1b92a3ea27650eeea0bc9f8c483837cc0a60858d..25d2c6356db3f54ce03ece0a2637d0f46aadec0f 100644 --- a/test/test_network_ops/test_bmmV2.py +++ b/test/test_network_ops/test_bmmV2.py @@ -15,9 +15,8 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor class TestBatchMatMulV2(TestCase): @@ -45,7 +44,7 @@ class TestBatchMatMulV2(TestCase): cpu_output = cpu_output.astype(npu_output.dtype) self.assertRtolEqual(cpu_output, npu_output) - def test_batchmatmul_shape_format_fp16_3d(self, device): + def test_batchmatmul_shape_format_fp16_3d(self, device="npu"): format_list = [0, 3, 29] shape_list = [(1, 3, 2)] shape_format1 = [[np.float16, i, j] @@ -57,7 +56,7 @@ class TestBatchMatMulV2(TestCase): shape_format = [[i, j] for i in shape_format1 for j in shape_format2] self.bmm_auto_list_exec(shape_format) - def test_batchmatmul_shape_format_fp32_3d(self, device): + def test_batchmatmul_shape_format_fp32_3d(self, device="npu"): format_list = [0, 3, 29] shape_list = [(1, 3, 2)] shape_format1 = [[np.float32, i, j] @@ -69,6 +68,6 @@ class TestBatchMatMulV2(TestCase): shape_format = [[i, j] for i in shape_format1 for j in shape_format2] self.bmm_auto_list_exec(shape_format) -instantiate_device_type_tests(TestBatchMatMulV2, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_cdist.py b/test/test_network_ops/test_cdist.py index d3e95248a883e548f7b7b77f2820e48283c72d6e..d300f190295c077a8903acd936c2ac8cf84ec423 100644 --- a/test/test_network_ops/test_cdist.py +++ b/test/test_network_ops/test_cdist.py @@ -15,9 +15,8 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests + class Testcdist(TestCase): def generate_data(self, min_n, max_n, shape_x, shape_y, src_type): @@ -45,140 +44,140 @@ class Testcdist(TestCase): y = y.astype(np.float16) return y - def test_cdist_float16_1(self, 
device): + def test_cdist_float16_1(self, device="npu"): npu_input1, npu_input2 = self.generate_data(-1, 1, (5, 64), (4, 64), np.float16) cpu_output = self.op_exec(npu_input1, npu_input2, 0.0, 'cpu') npu_output = self.op_exec(npu_input1, npu_input2, 0.0, 'npu') self.assertRtolEqual(cpu_output, npu_output) - def test_cdist_float16_2(self, device): + def test_cdist_float16_2(self, device="npu"): npu_input1, npu_input2 = self.generate_data(-1, 1, (5, 10), (4, 10), np.float16) cpu_output = self.op_exec(npu_input1, npu_input2, 0.5, 'cpu') npu_output = self.op_exec(npu_input1, npu_input2, 0.5, 'npu') self.assertRtolEqual(cpu_output, npu_output) - def test_cdist_float16_3(self, device): + def test_cdist_float16_3(self, device="npu"): npu_input1, npu_input2 = self.generate_data(-1, 1, (5, 10), (4, 10), np.float16) cpu_output = self.op_exec(npu_input1, npu_input2, 1.0, 'cpu') npu_output = self.op_exec(npu_input1, npu_input2, 1.0, 'npu') self.assertRtolEqual(cpu_output, npu_output) - def test_cdist_float16_4(self, device): + def test_cdist_float16_4(self, device="npu"): npu_input1, npu_input2 = self.generate_data(-1, 1, (5, 10), (4, 10), np.float16) cpu_output = self.op_exec(npu_input1, npu_input2, 1.5, 'cpu') npu_output = self.op_exec(npu_input1, npu_input2, 1.5, 'npu') self.assertRtolEqual(cpu_output, npu_output) - def test_cdist_float16_5(self, device): + def test_cdist_float16_5(self, device="npu"): npu_input1, npu_input2 = self.generate_data(-1, 1, (5, 10), (4, 10), np.float16) cpu_output = self.op_exec(npu_input1, npu_input2, 2.0, 'cpu') npu_output = self.op_exec(npu_input1, npu_input2, 2.0, 'npu') self.assertRtolEqual(cpu_output, npu_output) - def test_cdist_float16_6(self, device): + def test_cdist_float16_6(self, device="npu"): npu_input1, npu_input2 = self.generate_data(-1, 1, (5, 10), (4, 10), np.float16) cpu_output = self.op_exec(npu_input1, npu_input2, 2.5, 'cpu') npu_output = self.op_exec(npu_input1, npu_input2, 2.5, 'npu') self.assertRtolEqual(cpu_output, npu_output) - def test_cdist_float16_7(self, device): + def test_cdist_float16_7(self, device="npu"): npu_input1, npu_input2 = self.generate_data(-1, 1, (3, 5, 500), (4, 500), np.float16) cpu_output = self.op_exec(npu_input1, npu_input2, 2.0, 'cpu') npu_output = self.op_exec(npu_input1, npu_input2, 2.0, 'npu') self.assertRtolEqual(cpu_output, npu_output) - def test_cdist_float32_1(self, device): + def test_cdist_float32_1(self, device="npu"): npu_input1, npu_input2 = self.generate_data(-1, 1, (5, 10), (4, 10), np.float32) cpu_output = self.op_exec(npu_input1, npu_input2, 0.0, 'cpu') npu_output = self.op_exec(npu_input1, npu_input2, 0.0, 'npu') self.assertRtolEqual(cpu_output, npu_output) - def test_cdist_float32_2(self, device): + def test_cdist_float32_2(self, device="npu"): npu_input1, npu_input2 = self.generate_data(-1, 1, (5, 10), (4, 10), np.float32) cpu_output = self.op_exec(npu_input1, npu_input2, 0.5, 'cpu') npu_output = self.op_exec(npu_input1, npu_input2, 0.5, 'npu') self.assertRtolEqual(cpu_output, npu_output) - def test_cdist_float32_3(self, device): + def test_cdist_float32_3(self, device="npu"): npu_input1, npu_input2 = self.generate_data(-1, 1, (5, 10), (4, 10), np.float32) cpu_output = self.op_exec(npu_input1, npu_input2, 1.0, 'cpu') npu_output = self.op_exec(npu_input1, npu_input2, 1.0, 'npu') self.assertRtolEqual(cpu_output, npu_output) - def test_cdist_float32_4(self, device): + def test_cdist_float32_4(self, device="npu"): npu_input1, npu_input2 = self.generate_data(-1, 1, (5, 10), (4, 10), np.float32) cpu_output 
= self.op_exec(npu_input1, npu_input2, 1.5, 'cpu') npu_output = self.op_exec(npu_input1, npu_input2, 1.5, 'npu') self.assertRtolEqual(cpu_output, npu_output) - def test_cdist_float32_5(self, device): + def test_cdist_float32_5(self, device="npu"): npu_input1, npu_input2 = self.generate_data(-1, 1, (5, 10), (4, 10), np.float32) cpu_output = self.op_exec(npu_input1, npu_input2, 2.0, 'cpu') npu_output = self.op_exec(npu_input1, npu_input2, 2.0, 'npu') self.assertRtolEqual(cpu_output, npu_output) - def test_cdist_float32_6(self, device): + def test_cdist_float32_6(self, device="npu"): npu_input1, npu_input2 = self.generate_data(-1, 1, (5, 10), (4, 10), np.float32) cpu_output = self.op_exec(npu_input1, npu_input2, 2.5, 'cpu') npu_output = self.op_exec(npu_input1, npu_input2, 2.5, 'npu') self.assertRtolEqual(cpu_output, npu_output) - def test_cdist_float32_7(self, device): + def test_cdist_float32_7(self, device="npu"): npu_input1, npu_input2 = self.generate_data(-1, 1, (5, 500), (3, 4, 500), np.float32) cpu_output = self.op_exec(npu_input1, npu_input2, 2.0, 'cpu') npu_output = self.op_exec(npu_input1, npu_input2, 2.0, 'npu') self.assertRtolEqual(cpu_output, npu_output) - def test_cdist_float32_8(self, device): + def test_cdist_float32_8(self, device="npu"): npu_input1, npu_input2 = self.generate_data(-100, 100, (5, 100), (3, 4, 100), np.float32) cpu_output = self.op_exec(npu_input1, npu_input2, 2.5, 'cpu') npu_output = self.op_exec(npu_input1, npu_input2, 2.5, 'npu') self.assertRtolEqual(cpu_output, npu_output) - def test_cdist_float32_9(self, device): + def test_cdist_float32_9(self, device="npu"): npu_input1, npu_input2 = self.generate_data(-1000, 1000, (5, 100), (3, 4, 100), np.float32) cpu_output = self.op_exec(npu_input1, npu_input2, 1.5, 'cpu') npu_output = self.op_exec(npu_input1, npu_input2, 1.5, 'npu') self.assertRtolEqual(cpu_output, npu_output) - def test_cdist_float32_10(self, device): + def test_cdist_float32_10(self, device="npu"): npu_input1, npu_input2 = self.generate_data(-0.1, 0.1, (5, 100), (3, 4, 100), np.float32) cpu_output = self.op_exec(npu_input1, npu_input2, 2.5, 'cpu') npu_output = self.op_exec(npu_input1, npu_input2, 2.5, 'npu') self.assertRtolEqual(cpu_output, npu_output) - def test_cdist_float32_11(self, device): + def test_cdist_float32_11(self, device="npu"): npu_input1, npu_input2 = self.generate_data(-0.1, 0.1, (5, 100), (3, 4, 100), np.float32) cpu_output = self.op_exec(npu_input1, npu_input2, 0.5, 'cpu') npu_output = self.op_exec(npu_input1, npu_input2, 0.5, 'npu') self.assertRtolEqual(cpu_output, npu_output) - def test_cdist_float32_12(self, device): + def test_cdist_float32_12(self, device="npu"): npu_input1, npu_input2 = self.generate_data(-0.1, 0.1, (16, 11, 17, 5, 84, 2), (16, 11, 17, 5, 84, 2), np.float32) cpu_output = self.op_exec(npu_input1, npu_input2, 2.0, 'cpu') npu_output = self.op_exec(npu_input1, npu_input2, 2.0, 'npu') self.assertRtolEqual(cpu_output, npu_output) - def test_cdist_float32_13(self, device): + def test_cdist_float32_13(self, device="npu"): npu_input1, npu_input2 = self.generate_data(-0.1, 0.1, (2, 2, 13, 39, 97, 14, 2, 7), (2, 2, 13, 39, 97, 14, 12, 7), np.float32) cpu_output = self.op_exec(npu_input1, npu_input2, 2.0, 'cpu') @@ -186,6 +185,5 @@ class Testcdist(TestCase): self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(Testcdist, globals(), except_for="cpu") if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_ceil.py b/test/test_network_ops/test_ceil.py index 
bfa32911ea6ddf20b3b8cbe1b91252e9112a9c5a..17f20c4fba5b663abeeb92381f6e0b4bfe18820b 100644
--- a/test/test_network_ops/test_ceil.py
+++ b/test/test_network_ops/test_ceil.py
@@ -17,9 +17,10 @@ import torch
 import torch_npu
 import numpy as np
 
-from torch_npu.testing.common_utils import TestCase, run_tests
-from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests
-from torch_npu.testing.util_test import create_common_tensor
+from torch_npu.testing.testcase import TestCase, run_tests
+from torch_npu.testing.common_utils import create_common_tensor
+from torch_npu.testing.decorator import Dtypes
+
 
 class TestCeil(TestCase):
     @Dtypes(torch.float)
@@ -41,7 +41,7 @@ class TestCeil(TestCase):
         output = output.to("cpu")
         return output
 
-    def test_ceil_shape_format(self, device):
+    def test_ceil_shape_format(self, device="npu"):
         shape_format = [
             [np.float32, 0, 10 ],
             [np.float32, 0, (64, 10) ],
@@ -55,6 +55,6 @@ class TestCeil(TestCase):
             npu_output = self.npu_op_exec(npu_input1)
             self.assertRtolEqual(cpu_output, npu_output)
 
-instantiate_device_type_tests(TestCeil, globals(), except_for="cpu")
+
 if __name__ == "__main__":
     run_tests()
diff --git a/test/test_network_ops/test_celu.py b/test/test_network_ops/test_celu.py
index 43d1ac2cda32dd6c8e5cf6c6080f8fab97f20bf1..631c16d1caac6e90b967a62226529abac7bb1901 100644
--- a/test/test_network_ops/test_celu.py
+++ b/test/test_network_ops/test_celu.py
@@ -15,9 +15,9 @@ import torch
 import torch_npu
 import numpy as np
 
-from torch_npu.testing.common_utils import TestCase, run_tests
-from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests
-from torch_npu.testing.util_test import create_common_tensor
+from torch_npu.testing.testcase import TestCase, run_tests
+from torch_npu.testing.common_utils import create_common_tensor
+
 
 class TestCelu(TestCase):
     def generate_data(self, min_d, max_d, shape, dtype):
@@ -77,25 +77,25 @@ class TestCelu(TestCase):
         output = output.astype(np.float16)
         return output
 
-    def test_celu_3_3_float32_alpha1(self, device):
+    def test_celu_3_3_float32_alpha1(self, device="npu"):
         input_x1 = self.generate_data(-1, 1, (3, 3), np.float32)
         cpu_output1 = self.cpu_op_exec(input_x1, 1.0)
         npu_output1 = self.npu_op_exec(input_x1, 1.0)
         self.assertRtolEqual(cpu_output1, npu_output1)
 
-    def test_celu_10_10_10_10_float32_alpha1(self, device):
+    def test_celu_10_10_10_10_float32_alpha1(self, device="npu"):
         input_x1 = self.generate_data(-1, 1, (10, 10, 10, 10), np.float32)
         cpu_output1 = self.cpu_op_exec(input_x1, 1.0)
         npu_output1 = self.npu_op_exec(input_x1, 1.0)
         self.assertRtolEqual(cpu_output1, npu_output1)
 
-    def test_celu_100_100_float32_alpha2(self, device):
+    def test_celu_100_100_float32_alpha2(self, device="npu"):
         input_x1 = self.generate_data(-1, 1, (100, 100), np.float32)
         cpu_output1 = self.cpu_op_exec(input_x1, 2.0)
         npu_output1 = self.npu_op_exec(input_x1, 2.0)
         self.assertRtolEqual(cpu_output1, npu_output1)
 
-    def test_celu_float16_alpha1(self, device):
+    def test_celu_float16_alpha1(self, device="npu"):
         shape_format = [
             [[np.float16, 0, (65535, 1, 1, 1)]],
             [[np.float16, 0, (1, 1, 1, 65535)]],
@@ -107,7 +107,7 @@ class TestCelu(TestCase):
             npu_output = self.npu_op_exec(npu_input1, 1.0)
             self.assertRtolEqual(cpu_output, npu_output)
 
-    def test_celu_float16_alpha2_success(self, device):
+    def test_celu_float16_alpha2_success(self, device="npu"):
         shape_format = [
             [[np.float16, 0, (65535, 1, 1, 1)]],
             [[np.float16, 0, (1, 1, 1, 65535)]],
@@ -119,7 +119,7 @@ class TestCelu(TestCase):
             npu_output = self.npu_op_exec(npu_input1, 2.0)
self.assertRtolEqual(cpu_output, npu_output) - def test_celu_float16_alpha2_fail(self, device): + def test_celu_float16_alpha2_fail(self, device="npu"): shape_format = [ [[np.float16, 0, (65535, 1, 1, 1)]], [[np.float16, 0, (1, 1, 1, 65535)]], @@ -131,7 +131,7 @@ class TestCelu(TestCase): npu_output = self.npu_op_exec(npu_input1, 2.0) self.assertRtolEqual(cpu_output, npu_output) - def test_celu_inplace_alpha1(self, device): + def test_celu_inplace_alpha1(self, device="npu"): shape_format = [ [[np.float32, 0, (65535, 1, 1, 1)]], [[np.float32, 0, (1, 1, 1, 65535)]], @@ -142,7 +142,7 @@ class TestCelu(TestCase): npu_output = self.npu_op_inplace_exec(npu_input1, 1.0) self.assertRtolEqual(cpu_output, npu_output) - def test_celu_inplace_alpha2(self, device): + def test_celu_inplace_alpha2(self, device="npu"): shape_format = [ [[np.float32, 0, (65535, 1, 1, 1)]], [[np.float32, 0, (1, 1, 1, 65535)]], @@ -153,7 +153,7 @@ class TestCelu(TestCase): npu_output = self.npu_op_inplace_exec(npu_input1, 2.0) self.assertRtolEqual(cpu_output, npu_output) - def test_celu_inplace_alpha2_fail(self, device): + def test_celu_inplace_alpha2_fail(self, device="npu"): shape_format = [ [[np.float32, 0, (65535, 1, 1, 1)]], [[np.float32, 0, (1, 1, 1, 65535)]], @@ -164,7 +164,7 @@ class TestCelu(TestCase): npu_output = self.npu_op_inplace_exec(npu_input1, 2.0) self.assertRtolEqual(cpu_output, npu_output) - def test_celu_inplace_shape_format_alpha_range(self, device): + def test_celu_inplace_shape_format_alpha_range(self, device="npu"): shape_format_alpha_range = [ # 注:[[dtype, format, shape], alpha, min, max] [[np.float16, 2, (16, 5, 7, 11)], 5.6, -2, 2], @@ -190,7 +190,7 @@ class TestCelu(TestCase): cpu_output = self.cpu_op_inplace_exec(cpu_input1, alpha) self.assertRtolEqual(cpu_output, npu_output) - def test_celu_inplace_shape_format_alpha_range(self, device): + def test_celu_inplace_shape_format_alpha_range(self, device="npu"): shape_format_alpha_range = [ # 注:[[dtype, format, shape], alpha, min, max] [[np.float32, 2, (16, 5, 7, 11)], 0.5, -2, 2], @@ -215,6 +215,6 @@ class TestCelu(TestCase): cpu_output = self.cpu_op_exec(cpu_input1, alpha) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestCelu, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_clamp.py b/test/test_network_ops/test_clamp.py index 3d655ee03cacde1e222376461015215bb0094009..fc32b89ec955cd707a1271bbc13e059abc9fc073 100644 --- a/test/test_network_ops/test_clamp.py +++ b/test/test_network_ops/test_clamp.py @@ -18,9 +18,8 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests + class TestClamp(TestCase): def generate_data(self, data): @@ -105,7 +104,7 @@ class TestClamp(TestCase): return output - def test_clamp_common(self, device): + def test_clamp_common(self, device="npu"): shape_format = [ [1, 100, (4, 3), np.float32], [1, 100, (4, 3), np.int32], @@ -130,7 +129,7 @@ class TestClamp(TestCase): self.assertRtolEqual(cpu_output, npu_out_output) self.assertRtolEqual(cpu_inp_uncon_output, npu_inp_uncon_output) - def test_clamp_float16(self, device): + def test_clamp_float16(self, device="npu"): shape_format = [ [1, 100, (4, 3), np.float16], ] @@ -154,6 +153,6 @@ class TestClamp(TestCase): 
self.assertRtolEqual(cpu_output, npu_out_output) self.assertRtolEqual(cpu_inp_uncon_output, npu_inp_uncon_output) -instantiate_device_type_tests(TestClamp, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_clamp_max.py b/test/test_network_ops/test_clamp_max.py index 675dd051dce09ec191edcad685ff7d8f3e545030..12c78852bf3987622d7b2589ca5c23dcbdfc471d 100644 --- a/test/test_network_ops/test_clamp_max.py +++ b/test/test_network_ops/test_clamp_max.py @@ -18,9 +18,8 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests + class TestClampMax(TestCase): def generate_data(self, data): @@ -105,7 +104,7 @@ class TestClampMax(TestCase): return output - def test_clamp_max_common(self, device): + def test_clamp_max_common(self, device="npu"): shape_format = [ [1, 100, (4, 3), np.float32], [1, 100, (4, 3), np.int32], @@ -130,7 +129,7 @@ class TestClampMax(TestCase): self.assertRtolEqual(cpu_output, npu_out_output) self.assertRtolEqual(cpu_inp_uncon_output, npu_inp_uncon_output) - def test_clamp_max_float16(self, device): + def test_clamp_max_float16(self, device="npu"): shape_format = [ [1, 100, (4, 3), np.float16], ] @@ -154,6 +153,6 @@ class TestClampMax(TestCase): self.assertRtolEqual(cpu_output, npu_out_output) self.assertRtolEqual(cpu_inp_uncon_output, npu_inp_uncon_output) -instantiate_device_type_tests(TestClampMax, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_clamp_min.py b/test/test_network_ops/test_clamp_min.py index b326ae5e836a300358a3272a3e858af9e1b3ebb8..8abf146177827e8bf6f34e375078f03e1ef667ba 100644 --- a/test/test_network_ops/test_clamp_min.py +++ b/test/test_network_ops/test_clamp_min.py @@ -18,9 +18,8 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests + class TestClampMin(TestCase): def generate_data(self, data): @@ -105,7 +104,7 @@ class TestClampMin(TestCase): return output - def test_clamp_min_common(self, device): + def test_clamp_min_common(self, device="npu"): shape_format2 = [ [1, 100, (4, 3), np.float32], [1, 100, (4, 3), np.int32], @@ -130,7 +129,7 @@ class TestClampMin(TestCase): self.assertRtolEqual(cpu_output, npu_out_output) self.assertRtolEqual(cpu_inp_uncon_output, npu_inp_uncon_output) - def test_clamp_min_float16(self, device): + def test_clamp_min_float16(self, device="npu"): shape_format3 = [ [1, 100, (4, 3), np.float16], ] @@ -154,6 +153,6 @@ class TestClampMin(TestCase): self.assertRtolEqual(cpu_output, npu_out_output) self.assertRtolEqual(cpu_inp_uncon_output, npu_inp_uncon_output) -instantiate_device_type_tests(TestClampMin, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_confusion_transpose.py b/test/test_network_ops/test_confusion_transpose.py index 578d75df492494dbe2f8ca812e870e9c2dafd274..09699710d0113ef2ae129a661970607b44727e81 100644 --- a/test/test_network_ops/test_confusion_transpose.py +++ b/test/test_network_ops/test_confusion_transpose.py @@ -19,9 
+19,9 @@ import torch
 import torch_npu
 import numpy as np
 
-from torch_npu.testing.common_utils import TestCase, run_tests
-from torch_npu.testing.common_device_type import instantiate_device_type_tests
-from torch_npu.testing.util_test import create_common_tensor
+from torch_npu.testing.testcase import TestCase, run_tests
+from torch_npu.testing.common_utils import create_common_tensor
+
 
 class TestConfusionTransposeD(TestCase):
     def npu_op_exec(self, input1, shape, perm, transpose_first):
@@ -37,7 +37,7 @@ class TestConfusionTransposeD(TestCase):
         output = output.numpy()
         return output
 
-    def test_confusion_transpose(self, device):
+    def test_confusion_transpose(self, device="npu"):
         shape_format = [
             [[np.float32, 0, [1, 576, 2560]],[1, 576, 32, 80], (0, 2, 1, 3), False],
             [[np.float32, 0, [1, 32, 576, 80]],[1, 576, 2560], (0, 2, 1, 3), True],
@@ -52,6 +52,6 @@ class TestConfusionTransposeD(TestCase):
             npu_output = self.npu_op_exec(npu_input, item[1], item[2], item[3])
             self.assertRtolEqual(cpu_output, npu_output)
 
-instantiate_device_type_tests(TestConfusionTransposeD, globals(), except_for='cpu')
+
 if __name__ == "__main__":
-    run_tests()
\ No newline at end of file
+    run_tests()
diff --git a/test/test_network_ops/test_confusion_transpose_backward.py b/test/test_network_ops/test_confusion_transpose_backward.py
index 81565ef3db458c53115dc6b17d9a0b81acf2a747..4921bcc4d4a4380b4ea3f3f45a09c813ab473374 100644
--- a/test/test_network_ops/test_confusion_transpose_backward.py
+++ b/test/test_network_ops/test_confusion_transpose_backward.py
@@ -18,9 +18,9 @@ import torch
 import torch_npu
 import numpy as np
 
-from torch_npu.testing.common_utils import TestCase, run_tests
-from torch_npu.testing.common_device_type import instantiate_device_type_tests
-from torch_npu.testing.util_test import create_common_tensor
+from torch_npu.testing.testcase import TestCase, run_tests
+from torch_npu.testing.common_utils import create_common_tensor
+
 
 class TestConfusionTransposeDBackward(TestCase):
     def npu_op_exec(self, input1, shape, perm, transpose_first):
@@ -42,7 +42,7 @@ class TestConfusionTransposeDBackward(TestCase):
         output2 = input1.grad.numpy()
         return output1, output2
 
-    def test_confusion_transpose_backward(self, device):
+    def test_confusion_transpose_backward(self, device="npu"):
         shape_format = [
             [[np.float32, 0, [1, 576, 2560]],[1, 576, 32, 80], (0, 2, 1, 3), False],
             [[np.float32, 0, [1, 32, 576, 80]],[1, 576, 2560], (0, 2, 1, 3), True],
@@ -56,6 +56,6 @@ class TestConfusionTransposeDBackward(TestCase):
             self.assertRtolEqual(cpu_output1, npu_output1)
             self.assertRtolEqual(cpu_output2, npu_output2)
 
-instantiate_device_type_tests(TestConfusionTransposeDBackward, globals(), except_for='cpu')
+
 if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_network_ops/test_conv2d.py b/test/test_network_ops/test_conv2d.py
index d7ace9e32468c8fdd4bfec0ed94ed243e9e227a1..e00838b97b4192eae99cca60bb2514c2aee0f620 100644
--- a/test/test_network_ops/test_conv2d.py
+++ b/test/test_network_ops/test_conv2d.py
@@ -17,9 +17,8 @@ import torch_npu
 import numpy as np
 import torch.nn as nn
 
-from torch_npu.testing.common_utils import TestCase, run_tests
-from torch_npu.testing.common_device_type import instantiate_device_type_tests
-from torch_npu.testing.util_test import create_common_tensor
+from torch_npu.testing.testcase import TestCase, run_tests
+from torch_npu.testing.common_utils import create_common_tensor
 
 class TestConv2d(TestCase):
 
@@ -94,7 +93,7 @@ class TestConv2d(TestCase):
self.assertRtolEqual(self.input_grad[0].numpy(), self.input_grad[1].numpy()) self.assertRtolEqual(self.weight_grad[0].numpy(), self.weight_grad[1].numpy()) - def test_conv2d_backward_shape_format_fp16(self, device): + def test_conv2d_backward_shape_format_fp16(self, device="npu"): shape_format = [ # input, weight, padding, stride, dilation, bias, groups # shuflenet [[np.float16, 3, [1024, 232, 7, 7]], [np.float16, 4, [232, 232, 1, 1]], 0, 1, 1, None, 1], @@ -111,102 +110,9 @@ class TestConv2d(TestCase): [[np.float16, 0, [4, 256, 75, 5]], [np.float16, 4, [128, 256, 3, 3]], [2,1], 1, 1, None, 1], [[np.float16, 0, [4, 384, 75, 1]], [np.float16, 4, [192, 384, 3, 1]], 0, 1, 1, None, 1], [[np.float16, 0, [4, 384, 1, 75]], [np.float16, 4, [192, 384, 1, 3]], 0, 1, 1, None, 1], - # 当前不支持kernel_size_h >= padding_h*2 + input_h和kernel_size_w >= padding_w*2 + input_w, 预计330支持 - # [[np.float16, 0, [4, 384, 75, 1]], [np.float16, 0, [192, 384, 3, 3]], 0, 1, 1, None, 1], - # [[np.float16, 0, [4, 384, 75, 1]], [np.float16, 0, [192, 384, 3, 3]], [1,1], 1, 1, None, 1], - # [[np.float16, 0, [4, 384, 1, 75]], [np.float16, 0, [192, 384, 3, 3]], 0, 1, 1, None, 1], - # [[np.float16, 0, [4, 384, 1, 75]], [np.float16, 0, [192, 384, 3, 3]], [1,1], 1, 1, None, 1], ] self.conv2d_backward_result(shape_format) - def test_conv2d_backward_shape_format_fp32(self, device): - shape_format = [ # input, weight, padding, stride, dilation, bias, groups - # mobilenet - [[np.float32, 3, [256, 960, 7, 7]], [np.float32, 0, [320, 960, 1, 1]], 0, 1, 1, None, 1], - [[np.float32, 0, [256, 3, 224, 224]], [np.float32, 0, [32, 3, 3, 3]], 1, 2, 1, None, 1], - [[np.float32, 0, [16, 3, 640, 640]], [np.float32, 4, [64, 3, 7, 7]], 3, 2, 1, None, 1], - [[np.float32, 0, [4, 8, 300, 40]], [np.float32, 0, [16, 8, 3, 3]], [2,1], 1, 1, None, 1], - [[np.float32, 0, [4, 64, 150, 10]], [np.float32, 0, [32, 64, 1, 1]], 0, 1, 1, None, 1], - [[np.float32, 0, [4, 128, 75, 10]], [np.float32, 0, [64, 128, 1, 1]], 0, 1, 1, None, 1], - [[np.float32, 0, [4, 256, 75, 5]], [np.float32, 0, [128, 256, 3, 3]], [2,1], 1, 1, None, 1], - [[np.float32, 0, [4, 384, 75, 1]], [np.float32, 0, [192, 384, 3, 1]], 0, 1, 1, None, 1], - [[np.float32, 0, [4, 384, 1, 75]], [np.float32, 0, [192, 384, 1, 3]], 0, 1, 1, None, 1], - [[np.float32, 3, [4, 256, 75, 5]], [np.float32, 0, [128, 256, 3, 3]], [2,1], 1, 1, None, 1], - [[np.float32, 3, [4, 384, 75, 1]], [np.float32, 0, [192, 384, 3, 1]], 0, 1, 1, None, 1], - [[np.float32, 3, [4, 384, 1, 75]], [np.float32, 0, [192, 384, 1, 3]], 0, 1, 1, None, 1], - [[np.float32, 0, [4, 256, 75, 5]], [np.float32, 4, [128, 256, 3, 3]], [2,1], 1, 1, None, 1], - [[np.float32, 0, [4, 384, 75, 1]], [np.float32, 4, [192, 384, 3, 1]], 0, 1, 1, None, 1], - [[np.float32, 0, [4, 384, 1, 75]], [np.float32, 4, [192, 384, 1, 3]], 0, 1, 1, None, 1], - # 当前不支持kernel_size_h >= padding_h*2 + input_h和kernel_size_w >= padding_w*2 + input_w, 预计330支持 - # [[np.float32, 0, [4, 384, 75, 1]], [np.float32, 0, [192, 384, 3, 3]], 0, 1, 1, None, 1], - # [[np.float32, 0, [4, 384, 75, 1]], [np.float32, 0, [192, 384, 3, 3]], [1,1], 1, 1, None, 1], - # [[np.float32, 0, [4, 384, 1, 75]], [np.float32, 0, [192, 384, 3, 3]], 0, 1, 1, None, 1], - # [[np.float32, 0, [4, 384, 1, 75]], [np.float32, 0, [192, 384, 3, 3]], [1,1], 1, 1, None, 1], - ] - #conv类算子不支持fp32数据的精度要求 - #self.conv2d_backward_result(shape_format) - - def test_group_conv2d_backward_shape_format_fp16(self, device): - shape_format = [ # input, weight, padding, stride, dilation, bias, groups - # KDXF - [[np.float16, 0, [4, 
64, 75, 10]], [np.float16, 0, [128, 16, 3, 3]], [2,1], 1, 1, None, 4], - [[np.float16, 0, [4, 128, 75, 10]], [np.float16, 0, [64, 32, 1, 1]], 0, 1, 1, None, 4], - [[np.float16, 0, [4, 128, 75, 5]], [np.float16, 0, [256, 32, 3, 3]], [2,1], 1, 1, None, 4], - [[np.float16, 0, [4, 256, 75, 1]], [np.float16, 0, [384, 64, 3, 1]], [1,0], 1, 1, None, 4], - [[np.float16, 0, [4, 192, 75, 1]], [np.float16, 0, [384, 48, 3, 1]], [2,0], 1, 1, None, 4], - [[np.float16, 0, [4, 128, 75, 1]], [np.float16, 0, [128, 32, 3, 1]], [2,0], 1, 1, None, 4], - [[np.float16, 0, [4, 128, 75, 5]], [np.float16, 0, [128, 32, 3, 3]], [2,1], 1, 1, None, 4], - [[np.float16, 3, [4, 192, 75, 1]], [np.float16, 0, [384, 48, 3, 1]], [2,0], 1, 1, None, 4], - [[np.float16, 3, [4, 128, 75, 1]], [np.float16, 0, [128, 32, 3, 1]], [2,0], 1, 1, None, 4], - [[np.float16, 3, [4, 128, 75, 5]], [np.float16, 0, [128, 32, 3, 3]], [2,1], 1, 1, None, 4], - [[np.float16, 3, [4, 192, 75, 1]], [np.float16, 4, [384, 48, 3, 1]], [2,0], 1, 1, None, 4], - [[np.float16, 3, [4, 128, 75, 1]], [np.float16, 4, [128, 32, 3, 1]], [2,0], 1, 1, None, 4], - [[np.float16, 3, [4, 128, 75, 5]], [np.float16, 4, [128, 32, 3, 3]], [2,1], 1, 1, None, 4], - [[np.float16, 0, [4, 64, 75, 5]], [np.float16, 0, [64, 1, 3, 3]], [2,1], 1, 1, None, 64], - [[np.float16, 0, [4, 64, 75, 1]], [np.float16, 0, [64, 1, 3, 1]], 0, 1, 1, None, 64], - [[np.float16, 0, [4, 64, 1, 75]], [np.float16, 0, [64, 1, 1, 3]], 0, 1, 1, None, 64], - # 当前不支持kernel_size_h >= padding_h*2 + input_h和kernel_size_w >= padding_w*2 + input_w, 预计330支持 - # [[np.float16, 0, [4, 64, 75, 1]], [np.float16, 0, [128, 16, 3, 3]], 0, 1, 1, None, 4], - # [[np.float16, 0, [4, 64, 75, 1]], [np.float16, 0, [128, 16, 3, 3]], [1,1], 1, 1, None, 4], - # [[np.float16, 0, [4, 64, 1, 75]], [np.float16, 0, [128, 16, 3, 3]], 0, 1, 1, None, 4], - # [[np.float16, 0, [4, 64, 1, 75]], [np.float16, 0, [128, 16, 3, 3]], [1,1], 1, 1, None, 4], - # 当前不支持in_channel == groups != out_channel - # [[np.float32, 0, [4, 64, 75, 5]], [np.float32, 0, [128, 1, 3, 3]], [2,1], 1, 1, None, 64], - # [[np.float32, 0, [4, 64, 75, 1]], [np.float32, 0, [128, 1, 3, 1]], 0, 1, 1, None, 64], - # [[np.float32, 0, [4, 64, 1, 75]], [np.float32, 0, [128, 1, 1, 3]], 0, 1, 1, None, 64], - ] - - def test_group_conv2d_backward_shape_format_fp32(self, device): - shape_format = [ # input, weight, padding, stride, dilation, bias, groups - # KDXF - [[np.float32, 0, [4, 64, 75, 10]], [np.float32, 0, [128, 16, 3, 3]], [2,1], 1, 1, None, 4], - [[np.float32, 0, [4, 128, 75, 10]], [np.float32, 0, [64, 32, 1, 1]], 0, 1, 1, None, 4], - [[np.float32, 0, [4, 128, 75, 5]], [np.float32, 0, [256, 32, 3, 3]], [2,1], 1, 1, None, 4], - [[np.float32, 0, [4, 256, 75, 1]], [np.float32, 0, [384, 64, 3, 1]], [1,0], 1, 1, None, 4], - [[np.float32, 0, [4, 192, 75, 1]], [np.float32, 0, [384, 48, 3, 1]], [2,0], 1, 1, None, 4], - [[np.float32, 0, [4, 128, 75, 1]], [np.float32, 0, [128, 32, 3, 1]], [2,0], 1, 1, None, 4], - [[np.float32, 0, [4, 128, 75, 5]], [np.float32, 0, [128, 32, 3, 3]], [2,1], 1, 1, None, 4], - [[np.float32, 3, [4, 192, 75, 1]], [np.float32, 0, [384, 48, 3, 1]], [2,0], 1, 1, None, 4], - [[np.float32, 3, [4, 128, 75, 1]], [np.float32, 0, [128, 32, 3, 1]], [2,0], 1, 1, None, 4], - [[np.float32, 3, [4, 128, 75, 5]], [np.float32, 0, [128, 32, 3, 3]], [2,1], 1, 1, None, 4], - [[np.float32, 3, [4, 192, 75, 1]], [np.float32, 4, [384, 48, 3, 1]], [2,0], 1, 1, None, 4], - [[np.float32, 3, [4, 128, 75, 1]], [np.float32, 4, [128, 32, 3, 1]], [2,0], 1, 1, None, 4], - [[np.float32, 3, 
[4, 128, 75, 5]], [np.float32, 4, [128, 32, 3, 3]], [2,1], 1, 1, None, 4], - [[np.float32, 0, [4, 64, 75, 5]], [np.float32, 0, [64, 1, 3, 3]], [2,1], 1, 1, None, 64], - [[np.float32, 0, [4, 64, 75, 1]], [np.float32, 0, [64, 1, 3, 1]], 0, 1, 1, None, 64], - [[np.float32, 0, [4, 64, 1, 75]], [np.float32, 0, [64, 1, 1, 3]], 0, 1, 1, None, 64], - # 当前不支持kernel_size_h >= padding_h*2 + input_h和kernel_size_w >= padding_w*2 + input_w - # [[np.float32, 0, [4, 64, 75, 1]], [np.float32, 0, [128, 16, 3, 3]], 0, 1, 1, None, 4], - # [[np.float32, 0, [4, 64, 75, 1]], [np.float32, 0, [128, 16, 3, 3]], [1,1], 1, 1, None, 4], - # [[np.float32, 0, [4, 64, 1, 75]], [np.float32, 0, [128, 16, 3, 3]], 0, 1, 1, None, 4], - # [[np.float32, 0, [4, 64, 1, 75]], [np.float32, 0, [128, 16, 3, 3]], [1,1], 1, 1, None, 4], - # 当前不支持in_channel == groups != out_channel - # [[np.float32, 0, [4, 64, 75, 5]], [np.float32, 0, [128, 1, 3, 3]], [2,1], 1, 1, None, 64], - # [[np.float32, 0, [4, 64, 75, 1]], [np.float32, 0, [128, 1, 3, 1]], 0, 1, 1, None, 64], - # [[np.float32, 0, [4, 64, 1, 75]], [np.float32, 0, [128, 1, 1, 3]], 0, 1, 1, None, 64], - ] - -instantiate_device_type_tests(TestConv2d, globals(), except_for='cpu') if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_conv3d.py b/test/test_network_ops/test_conv3d.py index 9aa3ead00c9844f6a712ddbf9f0fb646b56d3f12..04d9d6f658bd69cac3ef65deb6e34969a23d8dff 100644 --- a/test/test_network_ops/test_conv3d.py +++ b/test/test_network_ops/test_conv3d.py @@ -17,9 +17,8 @@ import torch_npu import numpy as np import torch.nn as nn -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor class TestConv3d(TestCase): weight_grad = [] @@ -90,20 +89,24 @@ class TestConv3d(TestCase): self.assertRtolEqual(self.input_grad[0].numpy(), self.input_grad[1].cpu().numpy()) self.assertRtolEqual(self.weight_grad[0].numpy(), self.weight_grad[1].cpu().numpy()) - def test_conv3d_backward_shape_format_fp16(self, device): + def test_conv3d_backward_shape_format_fp16(self, device="npu"): shape_format = [ # input, weight, padding, stride, dilation, bias, groups - [[np.float16, 30, [128, 128, 4, 14, 14]], [np.float16, 30, [128, 128, 3, 3, 3]], [1,1,1], [1,1,1], 1, None, 1], - [[np.float16, 30, [128, 64, 4, 14, 14]], [np.float16, 30, [128, 64, 3, 3, 3]], [1,1,1], [2,2,2], 1, None, 1], + [[np.float16, 30, [128, 128, 4, 14, 14]], + [np.float16, 30, [128, 128, 3, 3, 3]], [1,1,1], [1,1,1], 1, None, 1], + [[np.float16, 30, [128, 64, 4, 14, 14]], + [np.float16, 30, [128, 64, 3, 3, 3]], [1,1,1], [2,2,2], 1, None, 1], ] self.conv3d_backward_result(shape_format) - def test_conv3d_backward_shape_format_fp32(self, device): + def test_conv3d_backward_shape_format_fp32(self, device="npu"): shape_format = [ # input, weight, padding, stride, dilation, bias, groups - [[np.float32, 30, [128, 128, 4, 14, 14]], [np.float32, 30, [128, 128, 3, 3, 3]], [1,1,1], [1,1,1], 1, None, 1], - [[np.float32, 30, [128, 64, 4, 14, 14]], [np.float32, 30, [128, 64, 3, 3, 3]], [1,1,1], [2,2,2], 1, None, 1], + [[np.float32, 30, [128, 128, 4, 14, 14]], + [np.float32, 30, [128, 128, 3, 3, 3]], [1,1,1], [1,1,1], 1, None, 1], + [[np.float32, 30, [128, 64, 4, 14, 14]], + [np.float32, 30, [128, 64, 3, 3, 3]], [1,1,1], [2,2,2], 1, None, 1], ] 
self.conv3d_backward_result(shape_format) -instantiate_device_type_tests(TestConv3d, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_conv_depthwise2d_backward.py b/test/test_network_ops/test_conv_depthwise2d_backward.py index 5431fc4d36fd4f9abb067c4bc09f74f4c55e64bb..addc77a2e01a3478cf118fe309bae9ddb8cd4dab 100644 --- a/test/test_network_ops/test_conv_depthwise2d_backward.py +++ b/test/test_network_ops/test_conv_depthwise2d_backward.py @@ -17,9 +17,8 @@ import torch_npu import numpy as np import torch.nn as nn -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor class TestConvDepthwise2d(TestCase): @@ -95,7 +94,7 @@ class TestConvDepthwise2d(TestCase): self.assertRtolEqual(self.input_grad[0].numpy(), self.input_grad[1].numpy()) self.assertRtolEqual(self.weight_grad[0].numpy(), self.weight_grad[1].numpy()) - def test_conv_depthwise2d_backward_shape_format_fp16(self, device): + def test_conv_depthwise2d_backward_shape_format_fp16(self, device="npu"): shape_format = [ # input , weight, padding, stide, dilation, bias # shuflenet [[np.float16, 0, [1024, 116, 28, 28]], [np.float16, 0, [116, 1, 3, 3]], 1, 2, 1, 0], @@ -103,7 +102,7 @@ class TestConvDepthwise2d(TestCase): ] self.conv_depthwise2d_backward_result(shape_format) - def test_conv_depthwise2d_backward_shape_format_fp32(self, device): + def test_conv_depthwise2d_backward_shape_format_fp32(self, device="npu"): shape_format = [ # input , weight, padding, stide, dilation, bias # mobilenet [[np.float32, 3, [256, 32, 112, 112]], [np.float32, 0, [32, 1, 3, 3]], 1, 1, 1, None], @@ -113,6 +112,5 @@ class TestConvDepthwise2d(TestCase): #self.conv_depthwise2d_backward_result(shape_format) -instantiate_device_type_tests(TestConvDepthwise2d, globals(), except_for='cpu') if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_conv_transpose2d_backward.py b/test/test_network_ops/test_conv_transpose2d_backward.py index 4e092831f9ad53cfb07f7f133439940a5c348150..55f52b513db50007a4b3b7a8296dc2e49aef4470 100644 --- a/test/test_network_ops/test_conv_transpose2d_backward.py +++ b/test/test_network_ops/test_conv_transpose2d_backward.py @@ -20,9 +20,9 @@ import numpy as np import copy import torch.nn as nn -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestConvTranspose2dBackward(TestCase): weight_grad = [] @@ -105,13 +105,13 @@ class TestConvTranspose2dBackward(TestCase): self.assertRtolEqual(self.input_grad[0].numpy(), self.input_grad[1].numpy()) self.assertRtolEqual(self.weight_grad[0].numpy(), self.weight_grad[1].numpy()) - def test_conv_transpose2d_backward_shape_format_fp16(self, device): + def test_conv_transpose2d_backward_shape_format_fp16(self, device="npu"): shape_format = [ [[np.float16, 0, [1, 4, 5, 5]], [np.float16, 0, [4, 4, 3, 3]]] ] self.conv_transpose2d_backward_result(shape_format) - def test_conv_transpose2d_backward_shape_format_fp32(self, device): + def 
test_conv_transpose2d_backward_shape_format_fp32(self, device="npu"): shape_format = [ [[np.float32, 0, [1, 4, 5, 5]], [np.float32, 0, [4, 4, 3, 3]]] ] @@ -119,6 +119,5 @@ class TestConvTranspose2dBackward(TestCase): #self.conv_transpose2d_backward_result(shape_format) -instantiate_device_type_tests(TestConvTranspose2dBackward, globals(), except_for='cpu') if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_cos.py b/test/test_network_ops/test_cos.py index e3b45fc1e793c654e92c4d9859fe844bb51c3fbd..49cd18f9253f4fa9ff318f6dc51e4e38d60946d8 100644 --- a/test/test_network_ops/test_cos.py +++ b/test/test_network_ops/test_cos.py @@ -17,9 +17,9 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestCos(TestCase): @@ -51,7 +51,7 @@ class TestCos(TestCase): output = output.numpy() return output - def test_cos_common_shape_format(self, device): + def test_cos_common_shape_format(self, device="npu"): shape_format = [ [[np.float32, 0, (5,3)]], ] @@ -61,7 +61,7 @@ class TestCos(TestCase): npu_output = self.npu_op_exec(npu_input1) self.assertRtolEqual(cpu_output, npu_output) - def test_cos_out_common_shape_format(self, device): + def test_cos_out_common_shape_format(self, device="npu"): shape_format = [ [[np.float32, 0, (4,3)], [np.float32, 0, (4,3)]], ] @@ -72,7 +72,7 @@ class TestCos(TestCase): npu_output = self.npu_op_exec_out(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) - def test_cos_common_shape_format(self, device): + def test_cos_common_shape_format(self, device="npu"): shape_format = [ [[np.float32, 0, (5,3)]], ] @@ -82,6 +82,6 @@ class TestCos(TestCase): npu_output = self.npu_inp_op_exec(npu_input1) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestCos, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_ctc_loss.py b/test/test_network_ops/test_ctc_loss.py index 8348e8cd13d6ff7ccae0afb0b4ad60efeb81537d..33df1ec3f480b30ba5c7822950cc1ed891927625 100644 --- a/test/test_network_ops/test_ctc_loss.py +++ b/test/test_network_ops/test_ctc_loss.py @@ -18,9 +18,8 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests + class TestCtcLoss(TestCase): def generate_data(self, item): @@ -72,7 +71,7 @@ class TestCtcLoss(TestCase): return neg_log_likelihood - def test_ctc_loss(self, device): + def test_ctc_loss(self, device="npu"): sizes_list = [[50, 20, 16, 30, 10], [26, 37, 256, 18, 10]] para_reduction = ["sum", "mean", "none"] dtype = [np.float32, np.float16] @@ -89,6 +88,6 @@ class TestCtcLoss(TestCase): self.assertRtolEqual(neg_log_likelihood_cpu, neg_log_likelihood_npu, 1e-3) -instantiate_device_type_tests(TestCtcLoss, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_ctc_loss_backward.py b/test/test_network_ops/test_ctc_loss_backward.py index 
4885a0fc8062965bd46d676bf5ab2578ab14bb6a..9d0361901b23b190e561e1a4189f000c12283ab9 100644 --- a/test/test_network_ops/test_ctc_loss_backward.py +++ b/test/test_network_ops/test_ctc_loss_backward.py @@ -19,9 +19,8 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests + class TestCtcLossBackward(TestCase): def generate_data(self, item): @@ -79,7 +78,7 @@ class TestCtcLossBackward(TestCase): return grad - def test_ctc_loss_backward(self, device): + def test_ctc_loss_backward(self, device="npu"): sizes_list = [[50, 20, 16, 30, 10], [26, 37, 2560, 18, 10]] para_reduction = ["sum", "mean"] dtype = [np.float32] # Insufficient accuracy when use fp16 data @@ -95,6 +94,6 @@ class TestCtcLossBackward(TestCase): self.assertRtolEqual(grad_cpu, grad_npu, 1e-3) -instantiate_device_type_tests(TestCtcLossBackward, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_cumsum.py b/test/test_network_ops/test_cumsum.py index 2e9ff8df1e914292fe4f50370aaa929248be237d..ae1a686e324e894ac17f51b2f2db9d74585aed87 100644 --- a/test/test_network_ops/test_cumsum.py +++ b/test/test_network_ops/test_cumsum.py @@ -15,9 +15,9 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestCumsum(TestCase): def cpu_op_exec(self,input1, dim): @@ -44,7 +44,7 @@ class TestCumsum(TestCase): output = output.numpy() return output - def test_cumsum_common_shape_format(self, device): + def test_cumsum_common_shape_format(self, device="npu"): shape_format = [ [[np.float32, 0, (1, 2, 3, 4)]], [[np.float32, 0, (2, 3, 4)]], @@ -66,7 +66,7 @@ class TestCumsum(TestCase): cpu_output = cpu_output.astype(npu_output.dtype) self.assertRtolEqual(cpu_output, npu_output) - def test_cumsum_out_common_shape_format(self, device): + def test_cumsum_out_common_shape_format(self, device="npu"): shape_format = [ [[[np.float32, 0, (1, 2, 3, 4)], [np.float32, 0, (1, 2, 3, 4)]], [[np.float32, 0, (2, 3, 4)], [np.float32, 0, (2, 3, 4)]], @@ -91,6 +91,6 @@ class TestCumsum(TestCase): npu_output = self.npu_op_exec_out(npu_input1, npu_input2, dim) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestCumsum, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_default.py b/test/test_network_ops/test_default.py index c92a1664ba88855b82f20c466ddebad8c16d5c3b..f359925b947a6646c19ff128c281290c1ea303bb 100644 --- a/test/test_network_ops/test_default.py +++ b/test/test_network_ops/test_default.py @@ -15,12 +15,11 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests + class TestDefault(TestCase): - def test_isnan(self, device): + def 
test_isnan(self, device="npu"): cpu_input = torch.arange(1., 10) npu_input = cpu_input.npu() @@ -28,7 +27,7 @@ class TestDefault(TestCase): npu_output = torch.isnan(npu_input) self.assertRtolEqual(cpu_output, npu_output.cpu()) - def test_unfold(self, device): + def test_unfold(self, device="npu"): cpu_input = torch.arange(1., 8) npu_input = cpu_input.npu() @@ -36,6 +35,6 @@ class TestDefault(TestCase): npu_output = npu_input.unfold(0, 2, 1) self.assertRtolEqual(cpu_output, npu_output.cpu()) -instantiate_device_type_tests(TestDefault, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_div.py b/test/test_network_ops/test_div.py index 89a26edafa6c52e00ff612a8ac3f0d3750433258..ae1ba2685d81794d1980c8b69d596022fe6a3551 100644 --- a/test/test_network_ops/test_div.py +++ b/test/test_network_ops/test_div.py @@ -17,11 +17,12 @@ import torch_npu import numpy as np import unittest -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor, test_2args_broadcast, create_dtype_tensor, UT_FAST_MODE +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor, test_2args_broadcast, create_dtype_tensor, UT_FAST_MODE +from torch_npu.testing.decorator import Dtypes, instantiate_tests +@instantiate_tests class TestDiv(TestCase): def get_outputs(self, cpu_args, npu_args, dtype): # cpu not support fp16 div @@ -41,13 +42,13 @@ class TestDiv(TestCase): npu_output = npu_out.to("cpu").numpy() return cpu_output, npu_output - def test_div_broadcast(self, device): + def test_div_broadcast(self, device="npu"): for item in test_2args_broadcast(torch.div): self.assertRtolEqual(item[0], item[1]) # div not support bool @Dtypes(torch.float, torch.half, torch.int) - def test_div_dtype(self, device, dtype): + def test_div_dtype(self, dtype): cpu_input1, npu_input1 = create_dtype_tensor((2,3,4,5), dtype) # divisor can not be zero cpu_input2, npu_input2 = create_dtype_tensor((2,3,4,5), dtype, no_zero=True) @@ -60,7 +61,7 @@ class TestDiv(TestCase): self.assertRtolEqual(cpu_output, npu_output) @unittest.skipIf(UT_FAST_MODE, "Run UT in fast mode") - def test_div_shape_format_fp16(self, device): + def test_div_shape_format_fp16(self, device="npu"): format_list = [0, 3, 29] shape_list = [1, (64, 10), (32, 3, 3), (256, 2048, 7, 7)] shape_format = [ @@ -75,7 +76,7 @@ class TestDiv(TestCase): self.assertRtolEqual(cpu_output, npu_output) @unittest.skipIf(UT_FAST_MODE, "Run UT in fast mode") - def test_div_shape_format_fp32(self, device): + def test_div_shape_format_fp32(self, device="npu"): format_list = [0, 3, 29] shape_list = [1, (64, 10), (32, 3, 3), (256, 2048, 7, 7), (2, 0, 2)] shape_format = [ @@ -87,32 +88,32 @@ class TestDiv(TestCase): cpu_output, npu_output = self.get_outputs([cpu_input1, cpu_input2], [npu_input1, npu_input2], torch.float) self.assertRtolEqual(cpu_output, npu_output) - def test_div_mix_dtype_1(self, device): + def test_div_mix_dtype_1(self, device="npu"): npu_input1, npu_input2 = create_common_tensor([np.int32, 0, (2, 3)], 1, 100) npu_input3, npu_input4 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100) cpu_output, npu_output = self.get_outputs([npu_input1, npu_input3], [npu_input2, npu_input4], torch.float) self.assertRtolEqual(cpu_output, npu_output) - def test_div_mix_dtype_2(self, device): + def test_div_mix_dtype_2(self, 
device="npu"): npu_input1, npu_input2 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100) npu_input3 = torch.tensor(3).int() cpu_output, npu_output = self.get_outputs([npu_input1, npu_input3], [npu_input2, npu_input3], torch.float) self.assertRtolEqual(cpu_output, npu_output) - def test_div_scalar_dtype(self, device): + def test_div_scalar_dtype(self, device="npu"): cpu_input1, npu_input1 = create_common_tensor([np.int32, 0, (2, 3)], 1, 100) cpu_output = cpu_input1 / 0.5 npu_output = npu_input1 / 0.5 self.assertRtolEqual(cpu_output, npu_output.cpu()) - def test_div_npuscalar_dtype(self, device): + def test_div_npuscalar_dtype(self, device="npu"): cpu_input1, npu_input1 = create_common_tensor([np.int32, 0, (2, 3)], 1, 100) cpu_output = cpu_input1 / torch.tensor(0.5) npu_output = npu_input1 / torch.tensor(0.5).npu() self.assertRtolEqual(cpu_output, npu_output.cpu()) @unittest.skipIf(UT_FAST_MODE, "Run UT in fast mode") - def test_div_shape_format_fp32(self, device): + def test_div_shape_format_fp32(self, device="npu"): format_list = [0, 3, 29] shape_list = [1, (64, 10), (32, 3, 3), (256, 2048, 7, 7)] shape_format = [ @@ -124,6 +125,6 @@ class TestDiv(TestCase): cpu_output, npu_output = self.get_outputs_chk([cpu_input1, cpu_input2], [npu_input1, npu_input2], torch.float) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestDiv, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_dot.py b/test/test_network_ops/test_dot.py index 0540530bb5027c3df096059af6f6ec105358e9b6..eab188d62d63f18cfa0ea0163a21c0832a8d20b4 100644 --- a/test/test_network_ops/test_dot.py +++ b/test/test_network_ops/test_dot.py @@ -16,9 +16,7 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests class TestDot(TestCase): @@ -64,37 +62,37 @@ class TestDot(TestCase): output = output.numpy() return output - def test_dot_float32(self, device): + def test_dot_float32(self, device="npu"): npu_input1, npu_input2 = self.generate_data(0, 10, (3) , np.float32) cpu_output = self.cpu_op_exec(npu_input1, npu_input2) npu_output = self.npu_op_exec(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) - def test_dot_float32_out(self, device): + def test_dot_float32_out(self, device="npu"): npu_input1, npu_input2, npu_input3 = self.generate_three_data(0, 10, (3) , np.float32) cpu_output = self.cpu_op_exec(npu_input1, npu_input2) npu_output = self.npu_op_exec_out(npu_input1, npu_input2, npu_input3) self.assertRtolEqual(cpu_output, npu_output) - def test_dot_float16(self, device): + def test_dot_float16(self, device="npu"): npu_input1, npu_input2 = self.generate_data(0, 10, (3) , np.float16) cpu_output = self.cpu_op_exec(npu_input1.float(), npu_input2.float()).astype(np.float16) npu_output = self.npu_op_exec(npu_input1.float(), npu_input2.float()).astype(np.float16) self.assertRtolEqual(cpu_output, npu_output) - def test_dot_float16_out(self, device): + def test_dot_float16_out(self, device="npu"): npu_input1, npu_input2, npu_input3 = self.generate_three_data(0, 10, (3) , np.float16) cpu_output = self.cpu_op_exec(npu_input1.float(), npu_input2.float()).astype(np.float16) npu_output = self.npu_op_exec_out(npu_input1.float(), npu_input2.float(), npu_input3.float()).astype(np.float16) 
         self.assertRtolEqual(cpu_output, npu_output)
 
-    def test_big_scale_float32(self, device):
+    def test_big_scale_float32(self, device="npu"):
         npu_input1, npu_input2 = self.generate_data(0, 10, (10240) , np.float32)
         cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
         npu_output = self.npu_op_exec(npu_input1, npu_input2)
         self.assertRtolEqual(cpu_output, npu_output)
 
-instantiate_device_type_tests(TestDot, globals(), except_for='cpu')
+
 if __name__ == "__main__":
     run_tests()
diff --git a/test/test_network_ops/test_dropout.py b/test/test_network_ops/test_dropout.py
index 7d02c9849a3f989c6484ae252103676786871e0a..4595ae9bf274300dbcedde8907c6097cefb1d0bc 100644
--- a/test/test_network_ops/test_dropout.py
+++ b/test/test_network_ops/test_dropout.py
@@ -15,11 +15,10 @@
 import torch
 import torch_npu
 import numpy as np
-from torch.nn import functional as F
-from torch_npu.testing.common_utils import TestCase, run_tests
-from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests
-from torch_npu.testing.util_test import create_common_tensor, test_2args_broadcast, create_dtype_tensor, UT_FAST_MODE
+from torch_npu.testing.testcase import TestCase, run_tests
+from torch_npu.testing.common_utils import create_common_tensor
+
 
 class TestDropOutDoMask(TestCase):
     def cpu_op_exec(self, input1):
@@ -50,7 +49,7 @@ class TestDropOutDoMask(TestCase):
             else:
                 print(f'input = {item}, Successfully!')
 
-    def test_op_shape_format_fp16(self, device):
+    def test_op_shape_format_fp16(self, device="npu"):
         format_list = [-1]
         shape_list = [1, (256, 1280), (32, 3, 3), (256, 2048, 7, 7)]
         shape_format = [
@@ -58,7 +57,7 @@ class TestDropOutDoMask(TestCase):
         ]
         self.dropout_list_exec(shape_format)
 
-    def test_op_shape_format_fp32(self, device):
+    def test_op_shape_format_fp32(self, device="npu"):
         format_list = [-1]
         shape_list = [1, (256, 1280), (32, 3, 3), (256, 2048, 7, 7)]
         shape_format = [
@@ -66,6 +65,6 @@ class TestDropOutDoMask(TestCase):
         ]
         self.dropout_list_exec(shape_format)
 
-instantiate_device_type_tests(TestDropOutDoMask, globals(), except_for="cpu")
+
 if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_network_ops/test_dropout_backward.py b/test/test_network_ops/test_dropout_backward.py
index b93c87484437e3b72063993c9a7384493581e471..b2deb21bcd9fbc2077882d7c79c71d3dd84aabd7 100644
--- a/test/test_network_ops/test_dropout_backward.py
+++ b/test/test_network_ops/test_dropout_backward.py
@@ -17,9 +17,9 @@
 import torch_npu
 import numpy as np
 from torch.nn import functional as F
-from torch_npu.testing.common_utils import TestCase, run_tests
-from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests
-from torch_npu.testing.util_test import create_common_tensor, test_2args_broadcast, create_dtype_tensor, UT_FAST_MODE
+from torch_npu.testing.testcase import TestCase, run_tests
+from torch_npu.testing.common_utils import create_common_tensor
+
 
 class TestDropOutBackward(TestCase):
     def cpu_op_exec(self, input1):
@@ -66,7 +66,7 @@ class TestDropOutBackward(TestCase):
             else:
                 print(f'input = {item}, Successfully!')
 
-    def test_op_shape_format_fp16(self, device):
+    def test_op_shape_format_fp16(self, device="npu"):
         format_list = [-1]
         shape_list = [1, (32, 3, 3)]
         shape_format = [
@@ -74,7 +74,7 @@ class TestDropOutBackward(TestCase):
         ]
         self.dropout_list_exec(shape_format)
 
-    def test_op_shape_format_fp32(self, device):
+    def test_op_shape_format_fp32(self, device="npu"):
         format_list = [-1]
         shape_list = [1, (32, 3, 3)]
         shape_format = [
@@ -82,6 +82,6 @@ class TestDropOutBackward(TestCase):
         ]
         self.dropout_list_exec(shape_format)
 
-instantiate_device_type_tests(TestDropOutBackward, globals(), except_for="cpu")
+
 if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_network_ops/test_embedding.py b/test/test_network_ops/test_embedding.py
index 3497d28af257f8050a53e29bbb87fd12caed033f..4243bfb6357fcd5e784cdbdb78cbad87dfc29b42 100644
--- a/test/test_network_ops/test_embedding.py
+++ b/test/test_network_ops/test_embedding.py
@@ -17,9 +17,8 @@
 import torch_npu
 import numpy as np
 import torch.nn.functional as F
-from torch_npu.testing.common_utils import TestCase, run_tests
-from torch_npu.testing.common_device_type import instantiate_device_type_tests
-from torch_npu.testing.util_test import create_common_tensor
+from torch_npu.testing.testcase import TestCase, run_tests
+from torch_npu.testing.common_utils import create_common_tensor
 
 
 class TestEmbedding(TestCase):
@@ -34,7 +33,7 @@ class TestEmbedding(TestCase):
         out_npu = out.to("cpu")
         return out_npu.detach().numpy()
 
-    def test_shape_format(self, device):
+    def test_shape_format(self, device="npu"):
         shape_format = [
             [[np.float32, 0, [40,32]], [np.int64, 0, [40]]],
             [[np.float32, 0, [40,1024]], [np.int64, 0, [40]]],
@@ -64,6 +63,6 @@ class TestEmbedding(TestCase):
 
            self.assertRtolEqual(cpu_out, npu_out)
 
-instantiate_device_type_tests(TestEmbedding, globals(), except_for="cpu")
+
 if __name__ == "__main__":
     run_tests()
diff --git a/test/test_network_ops/test_embedding_backward.py b/test/test_network_ops/test_embedding_backward.py
index a5ea360c1021e019233ebb11219e88c00d77fbbb..8dd9e815c6f90891791db1efd9af691865acdc63 100644
--- a/test/test_network_ops/test_embedding_backward.py
+++ b/test/test_network_ops/test_embedding_backward.py
@@ -16,9 +16,9 @@
 import torch_npu
 import torch.nn.functional as F
 import numpy as np
-from torch_npu.testing.common_utils import TestCase, run_tests
-from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests
-from torch_npu.testing.util_test import create_common_tensor
+from torch_npu.testing.testcase import TestCase, run_tests
+from torch_npu.testing.common_utils import create_common_tensor
+
 
 class TestEmbeddingBackward(TestCase):
     def cpu_op_exec(self, weight, indices):
@@ -37,7 +37,7 @@ class TestEmbeddingBackward(TestCase):
         grad_npu = grad_npu.to("cpu")
         return out_npu.detach().numpy(), grad_npu.detach().numpy()
 
-    def test_embedding_backward_shape_format_fp32(self, device):
+    def test_embedding_backward_shape_format_fp32(self, device="npu"):
         format_list = [0]
         shape_list1 = [[40, 32], [40, 1024], [40000, 1024], [33712, 1024]]
         shape_list2 = [[40], [40], [40000], [33712]]
@@ -60,7 +60,7 @@ class TestEmbeddingBackward(TestCase):
             self.assertRtolEqual(cpu_out, npu_out)
             self.assertRtolEqual(cpu_grad, npu_grad)
 
-instantiate_device_type_tests(TestEmbeddingBackward, globals(), except_for="cpu")
+
 if __name__ == "__main__":
     run_tests()
diff --git a/test/test_network_ops/test_embeddingdensebackward.py b/test/test_network_ops/test_embeddingdensebackward.py
index ea27b1a9baf1c6f4f880d81343bc9a2df4a8f99c..e7d5fa767ec1a71935e8d806e8a4b3fc4d286cdd 100644
--- a/test/test_network_ops/test_embeddingdensebackward.py
+++ b/test/test_network_ops/test_embeddingdensebackward.py
@@ -17,9 +17,9 @@
 import torch_npu
 import numpy as np
 import torch.nn.functional as F
-from torch_npu.testing.common_utils import TestCase, run_tests
-from torch_npu.testing.common_device_type import instantiate_device_type_tests
-from torch_npu.testing.util_test import create_common_tensor
+from torch_npu.testing.testcase import TestCase, run_tests
+from torch_npu.testing.common_utils import create_common_tensor
+
 
 class TestEmbeddingDenseBackward(TestCase):
     def cpu_op_exec(self, weight, indices):
@@ -38,7 +38,7 @@ class TestEmbeddingDenseBackward(TestCase):
         grad_npu = grad_npu.to("cpu")
         return out_npu.detach().numpy(), grad_npu.detach().numpy()
 
-    def test_embedding_dense_backward_shape_format_fp32(self, device):
+    def test_embedding_dense_backward_shape_format_fp32(self, device="npu"):
         format_list = [0]
         shape_list1 = [[40, 32], [40, 1024], [40000, 1024], [33712, 1024]]
         shape_list2 = [[40], [40], [40000], [33712]]
@@ -62,7 +62,6 @@ class TestEmbeddingDenseBackward(TestCase):
             self.assertRtolEqual(cpu_grad, npu_grad)
 
 
-instantiate_device_type_tests(TestEmbeddingDenseBackward, globals(), except_for="cpu")
 if __name__ == "__main__":
     run_tests()
diff --git a/test/test_network_ops/test_equal.py b/test/test_network_ops/test_equal.py
index 18b984fcdd3cc5bde84419e5532f63c3667fc817..42dbea43d600ba1647ab6f747df91fd04ea97adc 100644
--- a/test/test_network_ops/test_equal.py
+++ b/test/test_network_ops/test_equal.py
@@ -19,9 +19,8 @@
 import torch
 import torch_npu
 import numpy as np
-from torch_npu.testing.common_utils import TestCase, run_tests
-from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests
-from torch_npu.testing.util_test import create_common_tensor
+from torch_npu.testing.testcase import TestCase, run_tests
+from torch_npu.testing.common_utils import create_common_tensor
 
 
 class TestTensorEqual(TestCase):
@@ -38,7 +37,7 @@ class TestTensorEqual(TestCase):
         output = output.numpy()
         return output
 
-    def test_tensor_equal_common_shape_format(self, device):
+    def test_tensor_equal_common_shape_format(self, device="npu"):
         shape_format = [
             [[np.float32, 0, (4, 3)], [np.float32, 0, (4, 3)]],
             [[np.float32, 29, (4, 3, 1)], [np.float32, 29, (4, 1, 5)]],
@@ -68,7 +67,7 @@ class TestTensorEqual(TestCase):
             npu_output = self.npu_op_exec(npu_input1, npu_input2)
             self.assertRtolEqual(cpu_output, npu_output)
 
-    def test_tensor_equal_float16_shape_format(self, device):
+    def test_tensor_equal_float16_shape_format(self, device="npu"):
         def cpu_op_exec_fp16(input1, input2):
             output = torch.equal(input1, input2)
             output = torch.tensor(output)
@@ -105,6 +104,5 @@ class TestTensorEqual(TestCase):
 
             self.assertRtolEqual(cpu_output, npu_output)
 
-instantiate_device_type_tests(TestTensorEqual, globals(), except_for="cpu")
 if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_network_ops/test_erfc.py b/test/test_network_ops/test_erfc.py
index c04881571d9641812a66c670f077d097d383d9db..9d218d9156ee16c2e29afc957725e2561a7d1881 100644
--- a/test/test_network_ops/test_erfc.py
+++ b/test/test_network_ops/test_erfc.py
@@ -16,9 +16,9 @@
 import torch
 import torch_npu
 import numpy as np
-from torch_npu.testing.common_utils import TestCase, run_tests
-from torch_npu.testing.common_device_type import instantiate_device_type_tests
-from torch_npu.testing.util_test import create_common_tensor
+from torch_npu.testing.testcase import TestCase, run_tests
+from torch_npu.testing.common_utils import create_common_tensor
+
 
 class TestErfc(TestCase):
     def get_shapeFormat(self):
@@ -65,7 +65,7 @@ class TestErfc(TestCase):
         output = output.numpy()
         return output
 
-    def test_erfc_float32_common_shape_format(self, device):
+    def test_erfc_float32_common_shape_format(self, device="npu"):
         shape_format = self.get_shapeFormat()
for item in shape_format: cpu_input1, npu_input1 = create_common_tensor(item, 1, 100) @@ -73,7 +73,7 @@ class TestErfc(TestCase): npu_output = self.npu_op_exec(npu_input1) self.assertRtolEqual(cpu_output, npu_output) - def test_erfc_float16_common_shape_format(self, device): + def test_erfc_float16_common_shape_format(self, device="npu"): shape_format = self.get_shapeFormat() for item in shape_format: cpu_input1, npu_input1 = create_common_tensor(item, 1, 100) @@ -83,7 +83,7 @@ class TestErfc(TestCase): cpu_output = cpu_output.astype(npu_output.dtype) self.assertRtolEqual(cpu_output, npu_output) - def test_erfc_float321_common_shape_format(self, device): + def test_erfc_float321_common_shape_format(self, device="npu"): shape_format = self.get_shapeFormat() for item in shape_format: cpu_input1, npu_input1 = create_common_tensor(item, 1, 100) @@ -91,7 +91,7 @@ class TestErfc(TestCase): npu_output = self.npu_op_exec_(npu_input1) self.assertRtolEqual(cpu_output, npu_output) - def test_erfc_float161_common_shape_format(self, device): + def test_erfc_float161_common_shape_format(self, device="npu"): shape_format = self.get_shapeFormat() for item in shape_format: cpu_input1, npu_input1 = create_common_tensor(item, 1, 100) @@ -101,7 +101,7 @@ class TestErfc(TestCase): cpu_output = cpu_output.astype(npu_output.dtype) self.assertRtolEqual(cpu_output, npu_output) - def test_erfc_out_float32_common_shape_format(self, device): + def test_erfc_out_float32_common_shape_format(self, device="npu"): shape_format = self.get_shapeFormat() for item in shape_format: cpu_input1, npu_input1 = create_common_tensor(item, 1, 100) @@ -110,7 +110,7 @@ class TestErfc(TestCase): npu_output = self.npu_op_exec_out(npu_input1, npu_out) self.assertRtolEqual(cpu_output, npu_output) - def test_erfc_out_float16_common_shape_format(self, device): + def test_erfc_out_float16_common_shape_format(self, device="npu"): shape_format = self.get_shapeFormat() for item in shape_format: cpu_input1, npu_input1 = create_common_tensor(item, 1, 100) @@ -122,6 +122,6 @@ class TestErfc(TestCase): cpu_output = cpu_output.astype(npu_output.dtype) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestErfc, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_exp.py b/test/test_network_ops/test_exp.py index 600827301ca0dae4271b1aa8860b9f39c2570ba3..f6b533f36d0acfd4502ae216ada48d6ae5ce4ade 100644 --- a/test/test_network_ops/test_exp.py +++ b/test/test_network_ops/test_exp.py @@ -16,9 +16,9 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestExp(TestCase): def cpu_op_exec(self, input1): @@ -32,7 +32,7 @@ class TestExp(TestCase): output = output.numpy() return output - def test_exp_shape_format_fp16(self, device): + def test_exp_shape_format_fp16(self, device="npu"): format_list = [0, 3] shape_list = [[5], [2, 4], [2, 2, 4], [2, 3, 3, 4]] shape_format = [ @@ -46,7 +46,7 @@ class TestExp(TestCase): cpu_output = cpu_output.astype(npu_output.dtype) self.assertRtolEqual(cpu_output, npu_output) - def test_exp_shape_format_fp32(self, device): + def test_exp_shape_format_fp32(self, device="npu"): format_list = [0, 3] shape_list = [[5], [2, 
4], [2, 2, 4], [2, 3, 3, 4]] shape_format = [ @@ -59,6 +59,5 @@ class TestExp(TestCase): self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestExp, globals(), except_for="cpu") if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_expm1.py b/test/test_network_ops/test_expm1.py index 2f32061883aeba2511f6e5e79295a3d8f37be770..cf68bae2e4793945dd08db86315cf2a3ef8577b5 100644 --- a/test/test_network_ops/test_expm1.py +++ b/test/test_network_ops/test_expm1.py @@ -15,9 +15,9 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestExpm1(TestCase): def get_shapeFormat1(self): @@ -86,7 +86,7 @@ class TestExpm1(TestCase): output = output.numpy() return output - def test_expm1_float32_common_shape_format(self, device): + def test_expm1_float32_common_shape_format(self, device="npu"): shape_format = self.get_shapeFormat1() for item in shape_format: cpu_input1, npu_input1 = create_common_tensor(item, 1, 10) @@ -94,7 +94,7 @@ class TestExpm1(TestCase): npu_output = self.npu_op_exec(npu_input1) self.assertRtolEqual(cpu_output, npu_output) - def test_expm1_float321_common_shape_format(self, device): + def test_expm1_float321_common_shape_format(self, device="npu"): shape_format = self.get_shapeFormat1() for item in shape_format: cpu_input1, npu_input1 = create_common_tensor(item, 1, 10) @@ -102,7 +102,7 @@ class TestExpm1(TestCase): npu_output = self.npu_op_exec_(npu_input1) self.assertRtolEqual(cpu_output, npu_output) - def test_expm1_out_float32_common_shape_format(self, device): + def test_expm1_out_float32_common_shape_format(self, device="npu"): shape_format = self.get_shapeFormat2() for item in shape_format: cpu_input1, npu_input1 = create_common_tensor(item, 1, 10) @@ -111,7 +111,7 @@ class TestExpm1(TestCase): npu_output = self.npu_op_exec_out(npu_input1,npu_out) self.assertRtolEqual(cpu_output, npu_output) - def test_expm1_float16_common_shape_format(self, device): + def test_expm1_float16_common_shape_format(self, device="npu"): shape_format = self.get_shapeFormat2() for item in shape_format: cpu_input1, npu_input1 = create_common_tensor(item, 1, 10) @@ -123,7 +123,7 @@ class TestExpm1(TestCase): cpu_output = cpu_output.astype(np.float16) self.assertRtolEqual(cpu_output, npu_output) - def test_expm1_float16__common_shape_format(self, device): + def test_expm1_float16__common_shape_format(self, device="npu"): shape_format = self.get_shapeFormat3() for item in shape_format: cpu_input1, npu_input1 = create_common_tensor(item, 1, 10) @@ -135,7 +135,7 @@ class TestExpm1(TestCase): cpu_output = cpu_output.astype(np.float16) self.assertRtolEqual(cpu_output, npu_output) - def test_expm1_out_float16_common_shape_format(self, device): + def test_expm1_out_float16_common_shape_format(self, device="npu"): shape_format = self.get_shapeFormat3() for item in shape_format: cpu_input1, npu_input1 = create_common_tensor(item, 1, 10) @@ -149,7 +149,6 @@ class TestExpm1(TestCase): cpu_output = cpu_output.astype(np.float16) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestExpm1, globals(), except_for="cpu") if __name__ == "__main__": run_tests() \ No newline at end of file 
diff --git a/test/test_network_ops/test_fast_gelu.py b/test/test_network_ops/test_fast_gelu.py index 3cbed325e24a62b21b116f5bb477dea65a741c3f..c8023da6efbefc6b3ced6bfe76738ec247a6fc03 100644 --- a/test/test_network_ops/test_fast_gelu.py +++ b/test/test_network_ops/test_fast_gelu.py @@ -15,9 +15,8 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests + class TestFastGelu(TestCase): def npu_op_exec(self, input1): @@ -26,12 +25,12 @@ class TestFastGelu(TestCase): output = output.numpy() return output - def test_fastgelu(self, device): + def test_fastgelu(self, device="npu"): input1 = torch.tensor([1.,2.,3.,4.]).npu() exoutput = torch.tensor([0.8458, 1.9357, 2.9819, 3.9956]) output = self.npu_op_exec(input1) self.assertRtolEqual(exoutput.numpy(), output) -instantiate_device_type_tests(TestFastGelu, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_fast_gelu_backward.py b/test/test_network_ops/test_fast_gelu_backward.py index 6fe30f1d21ae34d758dc5abd4a49258c05764c9a..51522f2ac300b5237bc45bfe9af2cf3ddf8b289b 100644 --- a/test/test_network_ops/test_fast_gelu_backward.py +++ b/test/test_network_ops/test_fast_gelu_backward.py @@ -15,9 +15,8 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests + class TestFastGelu(TestCase): def npu_op_exec(self, input1): @@ -30,7 +29,7 @@ class TestFastGelu(TestCase): output = output.cpu().detach().numpy() return output_grad, output - def test_fastgelu(self, device): + def test_fastgelu(self, device="npu"): input1 = torch.tensor([1.,2.,3.,4.]).npu() exoutputgrad = torch.tensor([1.0677795, 1.0738151, 1.0245483, 1.0064018]) exoutput = torch.tensor([0.8458, 1.9357, 2.9819, 3.9956]) @@ -38,6 +37,6 @@ class TestFastGelu(TestCase): self.assertRtolEqual(exoutputgrad.numpy(), outputgrad) self.assertRtolEqual(exoutput.numpy(), output) -instantiate_device_type_tests(TestFastGelu, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_fill_diagonal.py b/test/test_network_ops/test_fill_diagonal.py index e9894a60c295ea44fc566ca31ab23b3cb116f728..134dce64db596fe9e12988da273e47c7b05dee53 100644 --- a/test/test_network_ops/test_fill_diagonal.py +++ b/test/test_network_ops/test_fill_diagonal.py @@ -11,13 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import torch + import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestFillDiagonal(TestCase): def npu_op_exec(self, input1): @@ -56,7 +56,7 @@ class TestFillDiagonal(TestCase): output = input1.numpy() return output - def test_fill_diagonal_shape_format_fp32(self, device): + def test_fill_diagonal_shape_format_fp32(self, device="npu"): format_list = [0, 3] shape_list = ([7, 3], [3, 3, 3]) shape_format = [ @@ -73,7 +73,7 @@ class TestFillDiagonal(TestCase): self.assertRtolEqual(cpu_output2, npu_output2) self.assertRtolEqual(cpu_output3, npu_output3) - def test_fill_diagonal_shape_format_fp16(self, device): + def test_fill_diagonal_shape_format_fp16(self, device="npu"): format_list = [0, 3] shape_list = ([7, 3], [3, 3, 3]) shape_format = [ @@ -91,7 +91,7 @@ class TestFillDiagonal(TestCase): self.assertRtolEqual(cpu_output2, npu_output2) - def test_fill_diagonal_false_shape_format_fp32(self, device): + def test_fill_diagonal_false_shape_format_fp32(self, device="npu"): format_list1 = [0, 3] shape_list1 = ([7, 3], [3, 3, 3]) shape_format = [ @@ -108,7 +108,7 @@ class TestFillDiagonal(TestCase): self.assertRtolEqual(cpu_output3, npu_output3) self.assertRtolEqual(cpu_output4, npu_output4) - def test_fill_diagonal_false_shape_format_fp16(self, device): + def test_fill_diagonal_false_shape_format_fp16(self, device="npu"): format_list1 = [0, 3] shape_list1 = ([7, 3], [3, 3, 3]) shape_format = [ @@ -125,6 +125,6 @@ class TestFillDiagonal(TestCase): self.assertRtolEqual(cpu_output4, npu_output4) self.assertRtolEqual(cpu_output5, npu_output5) -instantiate_device_type_tests(TestFillDiagonal, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_flip.py b/test/test_network_ops/test_flip.py index 11d2800912e131ee046a55bdbdd3a7dd4c71c4ee..b368f362b4a9ab537c6ce1bb935f26cc4ed76b6f 100644 --- a/test/test_network_ops/test_flip.py +++ b/test/test_network_ops/test_flip.py @@ -16,9 +16,9 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestFlip(TestCase): def cpu_op_exec(self, input1, dims): @@ -32,7 +32,7 @@ class TestFlip(TestCase): output = output.numpy() return output - def test_flip_shape_format(self, device): + def test_flip_shape_format(self, device="npu"): shape_format = [ [[np.float32, 0, [2,2,2]], [0]], [[np.float32, 0, [2,2,2,4]], [-2]], @@ -45,6 +45,6 @@ class TestFlip(TestCase): npu_output = self.npu_op_exec(npu_input1, item[1]) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestFlip, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_format_cast.py b/test/test_network_ops/test_format_cast.py index 3e3ed6245b4e00233d78f37681f160a857fb4214..7bbff2d999db12603dae72f0b23db780c774e397 100644 --- a/test/test_network_ops/test_format_cast.py +++ 
b/test/test_network_ops/test_format_cast.py @@ -17,8 +17,8 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.testcase import TestCase, run_tests + class TestFormatCast(TestCase): def create_single_npu_tensor(self, item, minvalue, maxvalue): @@ -36,7 +36,7 @@ class TestFormatCast(TestCase): print("expectValue: ", expectValue, " resultValue: ", torch_npu.get_npu_format(retTensor)) sys.exit(-1) - def test_format_cast_tensor(self, device): + def test_format_cast_tensor(self, device="npu"): src_shape_format = [ [np.float16, 0, (2, 2, 4, 4)], [np.float16, 2, (2, 2, 4, 4)] @@ -55,7 +55,7 @@ class TestFormatCast(TestCase): result_tensor = torch_npu.npu_format_cast(src_tensor, dst_tensor) self.check_result(torch_npu.get_npu_format(dst_tensor), result_tensor) - def test_format_cast(self, device): + def test_format_cast(self, device="npu"): shape_format = [np.float16, -1, (2, 2, 4, 4)] npu_tensor = self.create_single_npu_tensor(shape_format, 1, 5) @@ -69,62 +69,20 @@ class TestFormatCast(TestCase): self.check_result(3, npu_tensor) npu_tensor = torch_npu.npu_format_cast(npu_tensor, 2) self.check_result(0, npu_tensor) - npu_tensor = torch_npu.npu_format_cast(npu_tensor, 0) - self.check_result(0, npu_tensor) - npu_tensor = torch_npu.npu_format_cast(npu_tensor, 2) - self.check_result(2, npu_tensor) - npu_tensor = torch_npu.npu_format_cast(npu_tensor, 0) - self.check_result(0, npu_tensor) - npu_tensor = torch_npu.npu_format_cast(npu_tensor, 4) - self.check_result(4, npu_tensor) - npu_tensor = torch_npu.npu_format_cast(npu_tensor, 0) - self.check_result(0, npu_tensor) - npu_tensor = torch_npu.npu_format_cast(npu_tensor, 29) - self.check_result(29, npu_tensor) - npu_tensor = torch_npu.npu_format_cast(npu_tensor, 0) - self.check_result(0, npu_tensor) - npu_tensor = torch_npu.npu_format_cast(npu_tensor, 2) - self.check_result(2, npu_tensor) - npu_tensor = torch_npu.npu_format_cast(npu_tensor, 4) - self.check_result(4, npu_tensor) - npu_tensor = torch_npu.npu_format_cast(npu_tensor, 0) - self.check_result(0, npu_tensor) - npu_tensor = torch_npu.npu_format_cast(npu_tensor, 2) - self.check_result(2, npu_tensor) - npu_tensor = torch_npu.npu_format_cast(npu_tensor, 29) - self.check_result(29, npu_tensor) - npu_tensor = torch_npu.npu_format_cast(npu_tensor, 0) - self.check_result(0, npu_tensor) + npu_format_list = [0, 2, 0, 4, 0, 29, 0, 2, 4, 0, 2, 29, 0] + for npu_format in npu_format_list: + npu_tensor = torch_npu.npu_format_cast(npu_tensor, npu_format) + self.check_result(npu_format, npu_tensor) npu_tensor = npu_tensor.view(2,2,2,2,4).clone() - npu_tensor = torch_npu.npu_format_cast(npu_tensor, 30) - self.check_result(30, npu_tensor) - npu_tensor = torch_npu.npu_format_cast(npu_tensor, 33) - self.check_result(33, npu_tensor) - npu_tensor = torch_npu.npu_format_cast(npu_tensor, 30) - self.check_result(30, npu_tensor) - npu_tensor = torch_npu.npu_format_cast(npu_tensor, 2) - self.check_result(2, npu_tensor) - npu_tensor = torch_npu.npu_format_cast(npu_tensor, 33) - self.check_result(33, npu_tensor) - npu_tensor = torch_npu.npu_format_cast(npu_tensor, 2) - self.check_result(2, npu_tensor) - npu_tensor = torch_npu.npu_format_cast(npu_tensor, 30) - self.check_result(30, npu_tensor) - npu_tensor = torch_npu.npu_format_cast(npu_tensor, 32) - self.check_result(32, npu_tensor) - npu_tensor = torch_npu.npu_format_cast(npu_tensor, 30) - 
self.check_result(30, npu_tensor) - npu_tensor = torch_npu.npu_format_cast(npu_tensor, 2) - self.check_result(2, npu_tensor) - npu_tensor = torch_npu.npu_format_cast(npu_tensor, 32) - self.check_result(32, npu_tensor) - npu_tensor = torch_npu.npu_format_cast(npu_tensor, 2) - self.check_result(2, npu_tensor) + npu_format_list = [30, 33, 30, 2, 33, 2, 30, 32, 30, 2, 32, 2] + for npu_format in npu_format_list: + npu_tensor = torch_npu.npu_format_cast(npu_tensor, npu_format) + self.check_result(npu_format, npu_tensor) - def test_format_cast_inplace(self, device): + def test_format_cast_inplace(self, device="npu"): shape_format = [np.float16, -1, (2, 2, 4, 4)] npu_tensor = self.create_single_npu_tensor(shape_format, 1, 5) @@ -138,63 +96,21 @@ class TestFormatCast(TestCase): self.check_result(3, npu_tensor) npu_tensor = torch_npu.npu_format_cast_(npu_tensor, 2) self.check_result(0, npu_tensor) - npu_tensor = torch_npu.npu_format_cast_(npu_tensor, 0) - self.check_result(0, npu_tensor) - npu_tensor = torch_npu.npu_format_cast_(npu_tensor, 2) - self.check_result(2, npu_tensor) - npu_tensor = torch_npu.npu_format_cast_(npu_tensor, 0) - self.check_result(0, npu_tensor) - npu_tensor = torch_npu.npu_format_cast_(npu_tensor, 4) - self.check_result(4, npu_tensor) - npu_tensor = torch_npu.npu_format_cast_(npu_tensor, 0) - self.check_result(0, npu_tensor) - npu_tensor = torch_npu.npu_format_cast_(npu_tensor, 29) - self.check_result(29, npu_tensor) - npu_tensor = torch_npu.npu_format_cast_(npu_tensor, 0) - self.check_result(0, npu_tensor) - npu_tensor = torch_npu.npu_format_cast_(npu_tensor, 2) - self.check_result(2, npu_tensor) - npu_tensor = torch_npu.npu_format_cast_(npu_tensor, 4) - self.check_result(4, npu_tensor) - npu_tensor = torch_npu.npu_format_cast_(npu_tensor, 0) - self.check_result(0, npu_tensor) - npu_tensor = torch_npu.npu_format_cast_(npu_tensor, 2) - self.check_result(2, npu_tensor) - npu_tensor = torch_npu.npu_format_cast_(npu_tensor, 29) - self.check_result(29, npu_tensor) - npu_tensor = torch_npu.npu_format_cast_(npu_tensor, 0) - self.check_result(0, npu_tensor) + npu_format_list = [0, 2, 0, 4, 0, 29, 0, 2, 4, 0, 2, 29, 0] + for npu_format in npu_format_list: + npu_tensor = torch_npu.npu_format_cast_(npu_tensor, npu_format) + self.check_result(npu_format, npu_tensor) npu_tensor = npu_tensor.view(2,2,2,2,4).clone() - npu_tensor = torch_npu.npu_format_cast_(npu_tensor, 30) - self.check_result(30, npu_tensor) - npu_tensor = torch_npu.npu_format_cast_(npu_tensor, 33) - self.check_result(33, npu_tensor) - npu_tensor = torch_npu.npu_format_cast_(npu_tensor, 30) - self.check_result(30, npu_tensor) - npu_tensor = torch_npu.npu_format_cast_(npu_tensor, 2) - self.check_result(2, npu_tensor) - npu_tensor = torch_npu.npu_format_cast_(npu_tensor, 33) - self.check_result(33, npu_tensor) - npu_tensor = torch_npu.npu_format_cast_(npu_tensor, 2) - self.check_result(2, npu_tensor) - npu_tensor = torch_npu.npu_format_cast_(npu_tensor, 30) - self.check_result(30, npu_tensor) - npu_tensor = torch_npu.npu_format_cast_(npu_tensor, 32) - self.check_result(32, npu_tensor) - npu_tensor = torch_npu.npu_format_cast_(npu_tensor, 30) - self.check_result(30, npu_tensor) - npu_tensor = torch_npu.npu_format_cast_(npu_tensor, 2) - self.check_result(2, npu_tensor) - npu_tensor = torch_npu.npu_format_cast_(npu_tensor, 32) - self.check_result(32, npu_tensor) - npu_tensor = torch_npu.npu_format_cast_(npu_tensor, 2) - self.check_result(2, npu_tensor) + npu_format_list = [30, 33, 30, 2, 33, 2, 30, 32, 30, 2, 32, 2] + for 
npu_format in npu_format_list: + npu_tensor = torch_npu.npu_format_cast_(npu_tensor, npu_format) + self.check_result(npu_format, npu_tensor) # UT for view + transdata scene - def test_format_cast_val(self, device): + def test_format_cast_val(self, device="npu"): shape_format = [np.float32, -1, (10, 4)] npu_tensor = self.create_single_npu_tensor(shape_format, 1, 5) npu_tensor = torch_npu.npu_format_cast(npu_tensor, 3) @@ -204,6 +120,6 @@ class TestFormatCast(TestCase): b = b.to("cpu") self.assertRtolEqual(a, b) -instantiate_device_type_tests(TestFormatCast, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_full.py b/test/test_network_ops/test_full.py index c779acf22a95f33d2e3c1a020b15746edde4b198..a02613b5d88b22643167c3c0f01620918ea936a9 100644 --- a/test/test_network_ops/test_full.py +++ b/test/test_network_ops/test_full.py @@ -16,12 +16,12 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestFull(TestCase): - def test_full_shape_format_fp16(self, device): + def test_full_shape_format_fp16(self, device="npu"): format_list = [0, 3] dtype_list = [torch.float32, torch.float16, torch.int32] shape_list = [[5, 8], [2, 4, 1, 1], [16]] @@ -36,7 +36,7 @@ class TestFull(TestCase): npu_output = npu_output.numpy() self.assertRtolEqual(cpu_output, npu_output) - def test_full_shape_format_fp32(self, device): + def test_full_shape_format_fp32(self, device="npu"): format_list = [0, 3] dtype_list = [torch.float32, torch.float16, torch.int32] shape_list = [[5, 8], [2, 4, 1, 1], [16]] @@ -51,7 +51,7 @@ class TestFull(TestCase): npu_output = npu_output.numpy() self.assertRtolEqual(cpu_output, npu_output) - def test_full_out(self, device): + def test_full_out(self, device="npu"): shape_format = [[[np.float32, 0, [5, 8]], torch.float32]] for item in shape_format: @@ -64,6 +64,6 @@ class TestFull(TestCase): npu_output = npu_output.numpy() self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestFull, globals(), except_for="cpu") + if __name__ == '__main__': run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_gather.py b/test/test_network_ops/test_gather.py index 9cc3a761dbbdb315a2fe05255badd22cbdeb4710..ee836b037ba95116bd033729b1a226deb2ce7209 100644 --- a/test/test_network_ops/test_gather.py +++ b/test/test_network_ops/test_gather.py @@ -14,14 +14,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import copy import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestGather(TestCase): def cpu_op_exec(self, input1, dim, index): @@ -35,7 +34,7 @@ class TestGather(TestCase): output = output.numpy() return output - def test_gather_shape_format(self, device): + def test_gather_shape_format(self, device="npu"): shape_format = [ [[np.int32, 0, (4, 3)], 0, torch.LongTensor([[0, 1, 1], [2, 0, 1]])], [[np.int64, 0, (2, 3)], 1, torch.LongTensor([[0, 1, 1], [0, 0, 1]])], @@ -51,6 +50,6 @@ class TestGather(TestCase): npu_output = self.npu_op_exec(npu_input1, item[1], npu_idx) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestGather, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_ge.py b/test/test_network_ops/test_ge.py index 5a9011620d456096f85442c0633a1e513dc2132e..487838c4945207aed3cc3979c30138f895283b78 100644 --- a/test/test_network_ops/test_ge.py +++ b/test/test_network_ops/test_ge.py @@ -15,9 +15,8 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor class TestGe(TestCase): @@ -109,7 +108,7 @@ class TestGe(TestCase): cpu_output_out = cpu_output_out.astype(npu_output_out.dtype) self.assertRtolEqual(cpu_output_out, npu_output_out) - def test_ge_tensor_out(self, device): + def test_ge_tensor_out(self, device="npu"): shape_format = [ [[np.float16, 0, [128, 116, 14, 14]], [np.float16, 0, [256, 116, 1, 1]]], [[np.float16, 0, [128, 3, 224, 224]], [np.float16, 0, [3, 3, 3]]], @@ -135,7 +134,7 @@ class TestGe(TestCase): cpu_output_out = cpu_output_out.astype(npu_output_out.dtype) self.assertRtolEqual(cpu_output_out, npu_output_out) - def test_ge_scalar_out(self, device): + def test_ge_scalar_out(self, device="npu"): shape_format = [ [[np.float16, 0, [4, 4, 128, 128]], [np.float16, 0, [256, 116, 1, 1]]], [[np.float16, 0, [12, 10, 14, 14]], [np.float16, 0, [256, 116, 1, 1]]], @@ -147,7 +146,7 @@ class TestGe(TestCase): ] self.ge_scalar_out_result(shape_format) - def test_ge_bool(self, device): + def test_ge_bool(self, device="npu"): format_list = [0] shape_list = [(5, 3), (2, 3, 4)] scalar_list = [True, False] @@ -166,7 +165,7 @@ class TestGe(TestCase): self.assertRtolEqual(cpu_output1, npu_output1) self.assertRtolEqual(cpu_output2, npu_output2) - def test_ge_scalar_float32(self, device): + def test_ge_scalar_float32(self, device="npu"): format_list = [0] shape_list = [(5, 3), (2, 3, 4)] shape_format = [ @@ -179,7 +178,7 @@ class TestGe(TestCase): npu_output = self.npu_op_exec_scalar(npu_input, scalar) self.assertRtolEqual(cpu_output, npu_output) - def test_ge_scalar_float16(self, device): + def test_ge_scalar_float16(self, device="npu"): format_list = [0] shape_list = [(5, 3), (2, 3, 4)] shape_format = [ @@ -193,7 +192,7 @@ class TestGe(TestCase): npu_output = self.npu_op_exec_scalar(npu_input, scalar) self.assertRtolEqual(cpu_output, npu_output) - def 
test_ge_scalar_int32(self, device): + def test_ge_scalar_int32(self, device="npu"): format_list = [0] shape_list = [(5, 3), (2, 3, 4)] shape_format = [ @@ -206,7 +205,7 @@ class TestGe(TestCase): npu_output = self.npu_op_exec_scalar(npu_input, scalar) self.assertRtolEqual(cpu_output, npu_output) - def test_ge_tensor_float32(self, device): + def test_ge_tensor_float32(self, device="npu"): format_list = [0] shape_list = [(5, 3), (2, 3, 4)] shape_format = [[[np.float32, i, j], [np.float32, i, j]] @@ -218,7 +217,7 @@ class TestGe(TestCase): npu_output = self.npu_op_exec(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) - def test_ge_tensor_float16(self, device): + def test_ge_tensor_float16(self, device="npu"): format_list = [0] shape_list = [(5, 3), (2, 3, 4)] shape_format = [[[np.float16, i, j], [np.float16, i, j]] @@ -232,7 +231,7 @@ class TestGe(TestCase): npu_output = self.npu_op_exec(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) - def test_ge_inplace_float32(self, device): + def test_ge_inplace_float32(self, device="npu"): format_list = [0] shape_list = [(5, 3), (2, 3, 4)] shape_format = [[[np.float32, i, j], [np.float32, i, j]] @@ -244,7 +243,7 @@ class TestGe(TestCase): npu_output = self.npu_op_inplace_exec(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) - def test_ge_inplace_float16(self, device): + def test_ge_inplace_float16(self, device="npu"): format_list = [0, 3] shape_list = [(5, 3), (2, 3, 4)] shape_format = [[[np.float16, i, j], [np.float16, i, j]] @@ -259,7 +258,7 @@ class TestGe(TestCase): cpu_output = cpu_output.astype(npu_output.dtype) self.assertRtolEqual(cpu_output, npu_output) - def test_ge_inplace_scalar_float32(self, device): + def test_ge_inplace_scalar_float32(self, device="npu"): format_list = [0] shape_list = [(5, 3), (2, 3, 4)] shape_format = [ @@ -272,7 +271,7 @@ class TestGe(TestCase): npu_output = self.npu_op_inplace_exec_scalar(npu_input, scalar) self.assertRtolEqual(cpu_output, npu_output) - def test_ge_inplace_scalar_float16(self, device): + def test_ge_inplace_scalar_float16(self, device="npu"): format_list = [0] shape_list = [(5, 3), (2, 3, 4)] shape_format = [ @@ -287,13 +286,13 @@ class TestGe(TestCase): cpu_output = cpu_output.astype(npu_output.dtype) self.assertRtolEqual(cpu_output, npu_output) - def test_ge_mix_dtype(self, device): + def test_ge_mix_dtype(self, device="npu"): cpu_input1, npu_input1 = create_common_tensor([np.float16, 0, (2, 3)], 1, 100) cpu_input2, npu_input2 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100) cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) npu_output = self.npu_op_exec(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestGe, globals(), except_for="cpu") + if __name__ == '__main__': run_tests() diff --git a/test/test_network_ops/test_gelu.py b/test/test_network_ops/test_gelu.py index 1a4d3b133bdcb1ba070809bfe4cddf999800305d..1820cbb90675e2873be7833a5fc14c32ab1bc44c 100644 --- a/test/test_network_ops/test_gelu.py +++ b/test/test_network_ops/test_gelu.py @@ -18,10 +18,8 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor -#pylint: disable=unused-argument +from torch_npu.testing.testcase import TestCase, run_tests + class TestGelu(TestCase): def generate_data(self, min_d, max_d, shape, dtype): @@ 
-57,41 +55,41 @@ class TestGelu(TestCase): output = output.numpy().astype(np.float16) return output - def test_gelu_float32_1(self, device): + def test_gelu_float32_1(self, device="npu"): input1 = self.generate_data(0, 100, (4,3), np.float32) cpu_input1 = copy.deepcopy(input1) cpu_output = self.cpu_op_exec(cpu_input1) npu_output = self.npu_op_exec(input1) self.assertRtolEqual(cpu_output, npu_output) - def test_gelu_float32_2(self, device): + def test_gelu_float32_2(self, device="npu"): input1 = self.generate_data(0, 1000, (4,3), np.float32) cpu_input1 = copy.deepcopy(input1) cpu_output = self.cpu_op_exec(cpu_input1) npu_output = self.npu_op_exec(input1) self.assertRtolEqual(cpu_output, npu_output) - def test_gelu_float16_1(self, device): + def test_gelu_float16_1(self, device="npu"): npu_input1 = self.generate_data(0, 100, (5,3), np.float16) cpu_input1 = copy.deepcopy(npu_input1) cpu_output = self.cpu_op_exec_fp16(cpu_input1) npu_output = self.npu_op_exec_fp16(npu_input1) self.assertRtolEqual(cpu_output, npu_output) - def test_gelu_float16_2(self, device): + def test_gelu_float16_2(self, device="npu"): npu_input1 = self.generate_data(0, 1000, (5,3), np.float16) cpu_input1 = copy.deepcopy(npu_input1) cpu_output = self.cpu_op_exec_fp16(cpu_input1) npu_output = self.npu_op_exec_fp16(npu_input1) self.assertRtolEqual(cpu_output, npu_output) - def test_gelu_float16_3(self, device): + def test_gelu_float16_3(self, device="npu"): npu_input1 = self.generate_data(0, 1000, (3,3), np.float16) cpu_input1 = copy.deepcopy(npu_input1) cpu_output = self.cpu_op_exec_fp16(cpu_input1) npu_output = self.npu_op_exec_fp16(npu_input1) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestGelu, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_gelu_backward.py b/test/test_network_ops/test_gelu_backward.py index 4952e4f602c2c3abbc653cb33dda8d28aa5e3281..910c140613a4420e6479ca9daff82f42f0100b7a 100644 --- a/test/test_network_ops/test_gelu_backward.py +++ b/test/test_network_ops/test_gelu_backward.py @@ -16,9 +16,8 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests + class TestGeluBackward(TestCase): def generate_single_data(self, min_val, max_val, shape, dtype): @@ -43,28 +42,28 @@ class TestGeluBackward(TestCase): res = input1.grad.to("cpu") return res.detach().numpy() - def test_gelu_backward_float32_1(self, device): + def test_gelu_backward_float32_1(self, device="npu"): input1 = self.generate_single_data(0, 100, (4, 3, 1, 1), np.float32) cpu_input1 = copy.deepcopy(input1) cpu_output = self.cpu_op_exec(cpu_input1) npu_output = self.npu_op_exec(input1) self.assertRtolEqual(cpu_output, npu_output) - def test_gelu_backward_float32_2(self, device): + def test_gelu_backward_float32_2(self, device="npu"): input1 = self.generate_single_data(0, 100, (15, 3, 1), np.float32) cpu_input1 = copy.deepcopy(input1) cpu_output = self.cpu_op_exec(cpu_input1) npu_output = self.npu_op_exec(input1) self.assertRtolEqual(cpu_output, npu_output) - def test_gelu_backward_float32_3(self, device): + def test_gelu_backward_float32_3(self, device="npu"): input1 = self.generate_single_data(0, 100, (4, 4), np.float32) cpu_input1 = copy.deepcopy(input1) 
cpu_output = self.cpu_op_exec(cpu_input1) npu_output = self.npu_op_exec(input1) self.assertRtolEqual(cpu_output, npu_output) - def test_gelu_backward_float16(self, device): + def test_gelu_backward_float16(self, device="npu"): input1 = self.generate_single_data(0, 100, (5, 10, 100), np.float16) cpu_input1 = input1.to(torch.float32) cpu_output = self.cpu_op_exec(cpu_input1) @@ -72,6 +71,6 @@ class TestGeluBackward(TestCase): npu_output = self.npu_op_exec(input1) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestGeluBackward, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_grid_assign_positive.py b/test/test_network_ops/test_grid_assign_positive.py index bacd6558a264e908f4e00d22882c606ecdbd62a2..2b7a0fc092dc6122f2292593cbd7741c33232a0c 100644 --- a/test/test_network_ops/test_grid_assign_positive.py +++ b/test/test_network_ops/test_grid_assign_positive.py @@ -13,10 +13,9 @@ # limitations under the License. import torch import torch_npu -import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.testcase import TestCase, run_tests + class TestGridAssignPositive(TestCase): def npu_op_exec(self, *args): @@ -24,7 +23,7 @@ class TestGridAssignPositive(TestCase): out = out.to("cpu") return out.detach().numpy() - def test_grid_assign_positive(self, device): + def test_grid_assign_positive(self, device="npu"): assigned_gt_inds = torch.rand((4,), dtype=torch.float32).to("npu") overlaps = torch.rand((2,4), dtype=torch.float32).to("npu") box_responsible_flags = torch.tensor([1,1,1,0], dtype=torch.uint8).to("npu") @@ -45,6 +44,6 @@ class TestGridAssignPositive(TestCase): npu_output = self.npu_op_exec(*params) self.assertRtolEqual(expect_cpu.numpy(), npu_output) -instantiate_device_type_tests(TestGridAssignPositive, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_grid_sampler_2d.py b/test/test_network_ops/test_grid_sampler_2d.py index af67846b8712f369613c2fd632713103e3395e4f..9ea285d514a0c4a80c652d0c28d915cac455538b 100644 --- a/test/test_network_ops/test_grid_sampler_2d.py +++ b/test/test_network_ops/test_grid_sampler_2d.py @@ -16,9 +16,9 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestGridSampler2D(TestCase): def cpu_op_exec(self, input1, grid): @@ -40,7 +40,7 @@ class TestGridSampler2D(TestCase): output = output.astype(np.float16) return output - def test_grid_sampler_2d_shape_format(self, device): + def test_grid_sampler_2d_shape_format(self, device="npu"): shape_format = [ [[np.float32, 0, (1,2,4,20)],[np.float32, 0, (1,10,8,2)]], [[np.float32, 0, (1,4,64, 10)],[np.float32, 0, (1,2,32,2)]], @@ -55,7 +55,7 @@ class TestGridSampler2D(TestCase): npu_output = self.npu_op_exec(npu_input, npu_grid) self.assertRtolEqual(cpu_output, npu_output) - def test_grid_sampler_2d_fp16_shape_format(self, device): + def test_grid_sampler_2d_fp16_shape_format(self, device="npu"): shape_format = [ [[np.float16, 0, (1,2,4,20)],[np.float16, 0, (1,10,8,2)]], [[np.float16, 0, (1,4,64, 10)],[np.float16, 0, 
(1,2,32,2)]], @@ -70,7 +70,7 @@ class TestGridSampler2D(TestCase): npu_output = self.npu_op_exec(npu_input, npu_grid) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestGridSampler2D, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_grid_sampler_2d_backward.py b/test/test_network_ops/test_grid_sampler_2d_backward.py index e18f97c0cdef33f53e378f45ab161a136fab2df6..9b5cc3783f40f59511ff03ae16a29dda114c4a9b 100644 --- a/test/test_network_ops/test_grid_sampler_2d_backward.py +++ b/test/test_network_ops/test_grid_sampler_2d_backward.py @@ -16,9 +16,9 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestGridSampler2dBackward(TestCase): def get_attrs(self): @@ -50,7 +50,7 @@ class TestGridSampler2dBackward(TestCase): dgrid = dgrid.to("cpu").numpy() return dx, dgrid - def test_grid_sampler_2d_backward_fp32(self, device): + def test_grid_sampler_2d_backward_fp32(self, device="npu"): shape_list = [[100, 1, 28, 28], [100, 64, 32, 28]] shape_format = [ [np.float32, -1, j] for j in shape_list @@ -66,7 +66,7 @@ class TestGridSampler2dBackward(TestCase): self.assertRtolEqual(cpu_output_dx, npu_output_dx) self.assertRtolEqual(cpu_output_dgrid, npu_output_dgrid) - def test_grid_sampler_2d_backward_fp16(self, device): + def test_grid_sampler_2d_backward_fp16(self, device="npu"): def cpu_op_fp16_exec(input1, sample, pad_mode, align): input1 = input1.to(torch.float32) sample = sample.to(torch.float32) @@ -95,6 +95,6 @@ class TestGridSampler2dBackward(TestCase): self.assertRtolEqual(cpu_output_dx, npu_output_dx) self.assertRtolEqual(cpu_output_dgrid, npu_output_dgrid) -instantiate_device_type_tests(TestGridSampler2dBackward, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_ifmr.py b/test/test_network_ops/test_ifmr.py index 9b73f45a5485546deabd5419b82a04015929224e..5153defdbef7d005ef05c8d3c795cebdefc50d86 100644 --- a/test/test_network_ops/test_ifmr.py +++ b/test/test_network_ops/test_ifmr.py @@ -18,9 +18,9 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestIFMR(TestCase): def cpu_op_exec(self, @@ -108,7 +108,7 @@ class TestIFMR(TestCase): return scale, offset - def test_ifrm_with_offset(self, device): + def test_ifrm_with_offset(self, device="npu"): format_list = [0, 3] shape_list = [(2, 2, 3, 4), (5, 5)] shape_format = [[np.float32, i, j] for i in format_list @@ -122,7 +122,7 @@ class TestIFMR(TestCase): self.assertTrue((scale_cpu - scale_npu[0]) / scale_cpu < 0.0001) self.assertEqual(offset_cpu, offset_npu[0]) - def test_ifrm_without_offset(self, device): + def test_ifrm_without_offset(self, device="npu"): format_list = [0, 3] shape_list = [(2, 2, 3, 4), (5, 5)] shape_format = [[np.float32, i, j] for i in format_list @@ -136,6 +136,6 
@@ class TestIFMR(TestCase): self.assertTrue((scale_cpu - scale_npu[0]) / scale_cpu < 0.0001) self.assertEqual(offset_cpu, offset_npu[0]) -instantiate_device_type_tests(TestIFMR, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_index.py b/test/test_network_ops/test_index.py index 9d8f938d6ea427fafb1fdeebfec376dbfd9cbb5c..81be633c160cfa975ec4334cb5aa226fe59e7a7c 100644 --- a/test/test_network_ops/test_index.py +++ b/test/test_network_ops/test_index.py @@ -15,9 +15,9 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestIndex(TestCase): def generate_index_data_bool(self, shape): @@ -56,7 +56,7 @@ class TestIndex(TestCase): output = output.cpu().numpy() return output - def test_index_ellip(self, device): + def test_index_ellip(self, device="npu"): dtype_list = [np.float32, np.float16, np.int32] format_list = [0] shape_list = [[5, 256, 256, 100]] @@ -71,7 +71,7 @@ class TestIndex(TestCase): npu_output = self.npu_op_exec_ellip(npu_input1, npu_index1) self.assertRtolEqual(cpu_output, npu_output) - def test_index_semi(self, device): + def test_index_semi(self, device="npu"): dtype_list = [np.float32, np.float16, np.int32] format_list = [0] shape_list = [[5, 256, 256, 100]] @@ -86,7 +86,7 @@ class TestIndex(TestCase): npu_output = self.npu_op_exec_semi(npu_input1, npu_index1) self.assertRtolEqual(cpu_output, npu_output) - def test_index_shape_format_tensor(self, device): + def test_index_shape_format_tensor(self, device="npu"): #test index is tensor dtype_list = [np.float32, np.float16, np.int32] format_list = [0] @@ -102,7 +102,7 @@ class TestIndex(TestCase): npu_output = self.npu_op_exec(npu_input1, npu_index1) self.assertRtolEqual(cpu_output, npu_output) - def test_index_shape_format_tensor_x(self, device): + def test_index_shape_format_tensor_x(self, device="npu"): # 注:test index is [tensor, x] , (x=1,bool,range) dtype_list = [np.float32, np.float16, np.int32] format_list = [0] @@ -119,7 +119,7 @@ class TestIndex(TestCase): npu_output = self.npu_op_exec(npu_input1, (npu_index1, i)) self.assertRtolEqual(cpu_output, npu_output) - def test_index_shape_format_tensor_tensor(self, device): + def test_index_shape_format_tensor_tensor(self, device="npu"): #test index is [tensor, tensor] dtype_list = [np.float32, np.float16, np.int32] format_list = [0] @@ -136,7 +136,7 @@ class TestIndex(TestCase): npu_output = self.npu_op_exec(npu_input1, (npu_index1, npu_index2)) self.assertRtolEqual(cpu_output, npu_output) - def test_index_shape_format_list(self, device): + def test_index_shape_format_list(self, device="npu"): #test index is list dtype_list = [np.float32, np.float16, np.int32] format_list = [0] @@ -151,7 +151,7 @@ class TestIndex(TestCase): npu_output = self.npu_op_exec(npu_input1, item[1]) self.assertRtolEqual(cpu_output, npu_output) - def test_index_shape_format_list_x(self, device): + def test_index_shape_format_list_x(self, device="npu"): # 注:test index is [list, x], (x=1,bool,range) dtype_list = [np.float32, np.float16, np.int32] format_list = [0] @@ -167,7 +167,7 @@ class TestIndex(TestCase): npu_output = self.npu_op_exec(npu_input1, (item[1], i)) self.assertRtolEqual(cpu_output, npu_output) - def 
test_index_shape_format_tensor_bool(self, device): + def test_index_shape_format_tensor_bool(self, device="npu"): # 注:test index is bool tensor dtype_list = [np.float32, np.float16, np.int32] format_list = [0] @@ -183,7 +183,7 @@ class TestIndex(TestCase): npu_output = self.npu_op_exec(npu_input1, npu_index) self.assertRtolEqual(cpu_output, npu_output) - def test_index_shape_format_bool_x(self, device): + def test_index_shape_format_bool_x(self, device="npu"): # 注:test index is [bool, x] , (x=1,bool,range) dtype_list = [np.float32, np.float16, np.int32] format_list = [0] @@ -199,6 +199,6 @@ class TestIndex(TestCase): npu_output = self.npu_op_exec(npu_input1, item[1]) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestIndex, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_index_add.py b/test/test_network_ops/test_index_add.py index 9d9777e56e9bf4e0be639f1bbe268495862a5e8a..b4be001a27f91b050314f279c8edae278ff5288b 100644 --- a/test/test_network_ops/test_index_add.py +++ b/test/test_network_ops/test_index_add.py @@ -12,15 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys -import copy import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestIndexAdd(TestCase): @@ -46,7 +44,7 @@ class TestIndexAdd(TestCase): output = output.numpy() return output - def test_index_add_float32(self, device): + def test_index_add_float32(self, device="npu"): shape_format = [ [[np.float32, -1, (5, 3)], [np.int32, -1, (3, )], [np.float32, -1, (3, 3)], 0], [[np.float32, -1, (6, 4)], [np.int32, -1, (5, )], [np.float32, -1, (5, 4)], 0], @@ -71,7 +69,7 @@ class TestIndexAdd(TestCase): npu_output = self.npu_op_inter_exec(npu_var, npu_index, npu_source, item[3]) self.assertRtolEqual(cpu_output, npu_output) - def test_index_add_int32(self, device): + def test_index_add_int32(self, device="npu"): shape_format = [ [[np.int32, -1, (5, 3)], [np.int32, -1, (3, )], [np.int32, -1, (3, 3)], 0], [[np.int32, -1, (6, 4)], [np.int32, -1, (5, )], [np.int32, -1, (5, 4)], 0], @@ -94,7 +92,7 @@ class TestIndexAdd(TestCase): npu_output = self.npu_op_inter_exec(npu_var, npu_index, npu_source, item[3]) self.assertRtolEqual(cpu_output, npu_output) - def test_index_add_int8(self, device): + def test_index_add_int8(self, device="npu"): shape_format = [ [[np.int8, -1, (5, 3)], [np.int32, -1, (3, )], [np.int8, -1, (3, 3)], 0], [[np.int8, -1, (6, 4)], [np.int32, -1, (5, )], [np.int8, -1, (5, 4)], 0], @@ -118,7 +116,7 @@ class TestIndexAdd(TestCase): npu_output = self.npu_op_inter_exec(npu_var, npu_index, npu_source, item[3]) self.assertRtolEqual(cpu_output, npu_output) - def test_index_add_uint8(self, device): + def test_index_add_uint8(self, device="npu"): shape_format = [ [[np.uint8, -1, (5, 3)], [np.int32, -1, (3, )], [np.uint8, -1, (3, 3)], 0], [[np.uint8, -1, (6, 4)], [np.int32, -1, (5, )], [np.uint8, -1, (5, 4)], 0], @@ -143,7 +141,7 @@ class TestIndexAdd(TestCase): npu_output = self.npu_op_inter_exec(npu_var, npu_index, npu_source, item[3]) self.assertRtolEqual(cpu_output, npu_output) - def test_index_add_fp16(self, 
device): + def test_index_add_fp16(self, device="npu"): shape_format = [ [[np.float16, -1, (5, 3)], [np.int32, -1, (3, )], [np.float16, -1, (3, 3)], 0], [[np.float16, -1, (3, 2)], [np.int32, -1, (2, )], [np.float16, -1, (2, 2)], 0], @@ -171,6 +169,6 @@ class TestIndexAdd(TestCase): cpu_output = cpu_output.astype(np.float16) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestIndexAdd, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_index_put.py b/test/test_network_ops/test_index_put.py index deb7c643f93acf39d98772a4a0d1df8900c8c1e9..25370000e79873da8d4eddfe194dae6682e5ef14 100644 --- a/test/test_network_ops/test_index_put.py +++ b/test/test_network_ops/test_index_put.py @@ -15,9 +15,9 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestIndexPut(TestCase): def cpu_op_exec(self, input1, indices, value): @@ -112,7 +112,7 @@ class TestIndexPut(TestCase): cpu_output = cpu_output.astype(np.float16) self.assertEqual(cpu_output, npu_output) - def test_index_put_shape_format_fp32(self, device): + def test_index_put_shape_format_fp32(self, device="npu"): format_list = [0] shape_list = [(5, 6)] shape_format = [[[np.float32, i, j], [np.int64, 0, [1, 2]], [ @@ -120,7 +120,7 @@ class TestIndexPut(TestCase): self.case_exec(shape_format) self.case_inp_exec(shape_format) - def test_index_put_shape_format_fp16(self, device): + def test_index_put_shape_format_fp16(self, device="npu"): format_list = [0] shape_list = [(5, 6)] shape_format = [[[np.float16, i, j], [np.int64, 0, [1, 2]], [ @@ -128,7 +128,7 @@ class TestIndexPut(TestCase): self.case_exec_fp16(shape_format) self.case_inp_exec_fp16(shape_format) - def test_index_put_null(self, device): + def test_index_put_null(self, device="npu"): cpu_input1 = torch.rand(2, 2) cpu_input2 = torch.rand(2, 2) cpu_mask_index = torch.tensor([[False, False], [False, False]]) @@ -139,6 +139,6 @@ class TestIndexPut(TestCase): npu_input1[npu_mask_index] = npu_input2.detach()[npu_mask_index] self.assertEqual(cpu_input1, npu_input1.to("cpu")) -instantiate_device_type_tests(TestIndexPut, globals(), except_for="cpu") +\ if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_index_select.py b/test/test_network_ops/test_index_select.py index 5567f96c811086cd107bb51a5727d9b74286af21..bbe0dab69f10519800e0f01bace6742ed356a041 100644 --- a/test/test_network_ops/test_index_select.py +++ b/test/test_network_ops/test_index_select.py @@ -18,9 +18,9 @@ import copy import torch import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestIndexSelect(TestCase): def cpu_op_exec(self, input1, axis, indices): @@ -47,7 +47,7 @@ class TestIndexSelect(TestCase): output = output.numpy() return output - def test_index_select(self, device): + def test_index_select(self, device="npu"): shape_format = [ [[np.float32, 0, (3, )], 
torch.tensor(0, dtype=torch.int64), 0], [[np.float32, 0, (3, )], torch.tensor([0, 1], dtype=torch.int64), 0], @@ -101,7 +101,7 @@ class TestIndexSelect(TestCase): self.assertRtolEqual(cpu_output, npu_output) self.assertRtolEqual(cpu_output, npu_output_out) - def test_index_select_fp16(self, device): + def test_index_select_fp16(self, device="npu"): shape_format = [ [[np.float16, 0, (3,)], torch.tensor([0, 1], dtype=torch.int64), 0], [[np.float16, 0, (2, 4)], torch.tensor([0, 1, 2], dtype=torch.int64), 1], @@ -118,6 +118,6 @@ class TestIndexSelect(TestCase): cpu_output = cpu_output.astype(np.float16) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestIndexSelect, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_is_nonzero.py b/test/test_network_ops/test_is_nonzero.py index b53a8e5d7244ca2d46bdd65d3aaf987ae57c6bb4..51bd57b237747959d1330478bea82ccc74706ba4 100644 --- a/test/test_network_ops/test_is_nonzero.py +++ b/test/test_network_ops/test_is_nonzero.py @@ -15,9 +15,9 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestIsNonzero(TestCase): def cpu_op_exec(self, input1): @@ -28,7 +28,7 @@ class TestIsNonzero(TestCase): output = torch.is_nonzero(input1) return output - def test_isnonzero_shape_format(self, device): + def test_isnonzero_shape_format(self, device="npu"): dtype_list = [np.float16, np.float32, np.int32, np.bool_] format_list = [0] shape_list = [[1], [1, 1, 1], [1, 1, 1, 1]] @@ -41,6 +41,6 @@ class TestIsNonzero(TestCase): npu_output = self.npu_op_exec(npu_input) cpu_output == npu_output -instantiate_device_type_tests(TestIsNonzero, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_isclose.py b/test/test_network_ops/test_isclose.py index 2e939b3009017d4464647548ed44d335f84ad979..70e0fef3d0d2f766620d53aaf4c157e5581c9127 100644 --- a/test/test_network_ops/test_isclose.py +++ b/test/test_network_ops/test_isclose.py @@ -16,9 +16,8 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests + class TestIsclose(TestCase): @@ -75,7 +74,7 @@ class TestIsclose(TestCase): output = output.numpy() return output - def test_isclose_int32_float32(self, device): + def test_isclose_int32_float32(self, device="npu"): npu_input1, npu_input2 = self.generate_data(0, 100, (4,3), np.int32) npu_input1 = npu_input1.to(torch.float32) npu_input2 = npu_input2.to(torch.float32) @@ -83,31 +82,31 @@ class TestIsclose(TestCase): npu_output = self.npu_op_exec_tensor_need_to_npu(npu_input1, npu_input2) self.assertRtolEqual(cpu_output,npu_output) - def test_isclose_equal_nan_false(self, device): + def test_isclose_equal_nan_false(self, device="npu"): npu_input1, npu_input2 = self.generate_nan((4,3), np.int32) cpu_output = self.cpu_op_exec_equal_nan(npu_input1, npu_input2, False) npu_output = self.npu_op_exec_tensor_need_to_npu_equal_nan(npu_input1, npu_input2, False) 
self.assertRtolEqual(cpu_output,npu_output) - def test_isclose_equal_nan_true(self, device): + def test_isclose_equal_nan_true(self, device="npu"): npu_input1, npu_input2 = self.generate_nan((4,3), np.int32) cpu_output = self.cpu_op_exec_equal_nan(npu_input1, npu_input2, True) npu_output = self.npu_op_exec_tensor_need_to_npu_equal_nan(npu_input1, npu_input2, True) self.assertRtolEqual(cpu_output,npu_output) - def test_isclose_int32_001(self, device): + def test_isclose_int32_001(self, device="npu"): npu_input1, npu_input2 = self.generate_data(0, 100, (4,3), np.int32) cpu_output = self.cpu_op_exec(npu_input1, npu_input2) npu_output = self.npu_op_exec_tensor_need_to_npu(npu_input1, npu_input2) self.assertRtolEqual(cpu_output,npu_output) - def test_isclose_int32_002(self, device): + def test_isclose_int32_002(self, device="npu"): npu_input1, npu_input2 = self.generate_data(100, 100, (4,3,2), np.int32) cpu_output = self.cpu_op_exec(npu_input1, npu_input2) npu_output = self.npu_op_exec_tensor_need_to_npu(npu_input1, npu_input2) self.assertRtolEqual(cpu_output,npu_output) - def test_isclose_int32_003(self, device): + def test_isclose_int32_003(self, device="npu"): npu_input1, npu_input2 = self.generate_data(0, 100, (4,3,2), np.int32) rtol = 8e-05 atol = 8e-08 @@ -115,19 +114,19 @@ class TestIsclose(TestCase): npu_output = self.npu_op_exec_tensor_need_to_npu_rtol_atol(npu_input1, npu_input2, rtol, atol) self.assertRtolEqual(cpu_output,npu_output) - def test_isclose_float32_001(self, device): + def test_isclose_float32_001(self, device="npu"): npu_input1, npu_input2 = self.generate_data(100, 100, (4,3), np.float32) cpu_output = self.cpu_op_exec(npu_input1, npu_input2) npu_output = self.npu_op_exec_tensor_need_to_npu(npu_input1, npu_input2) self.assertRtolEqual(cpu_output,npu_output) - def test_isclose_float32_002(self, device): + def test_isclose_float32_002(self, device="npu"): npu_input1, npu_input2 = self.generate_data(0, 100, (4,3,2), np.float32) cpu_output = self.cpu_op_exec(npu_input1, npu_input2) npu_output = self.npu_op_exec_tensor_need_to_npu(npu_input1, npu_input2) self.assertRtolEqual(cpu_output,npu_output) - def test_isclose_float32_003(self, device): + def test_isclose_float32_003(self, device="npu"): npu_input1, npu_input2 = self.generate_data(0, 100, (4,3,2), np.float32) rtol = 8e-05 atol = 8e-08 @@ -135,7 +134,7 @@ class TestIsclose(TestCase): npu_output = self.npu_op_exec_tensor_need_to_npu_rtol_atol(npu_input1, npu_input2, rtol, atol) self.assertRtolEqual(cpu_output,npu_output) - def test_isclose_float16_001(self, device): + def test_isclose_float16_001(self, device="npu"): def cpu_op_exec_fp16(input1, input2): input1 = input1.to(torch.float32) input2 = input2.to(torch.float32) @@ -148,7 +147,7 @@ class TestIsclose(TestCase): cpu_output = cpu_output.astype(npu_output.dtype) self.assertRtolEqual(cpu_output, npu_output) - def test_isclose_float16_002(self, device): + def test_isclose_float16_002(self, device="npu"): def cpu_op_exec_fp16(input1, input2): input1 = input1.to(torch.float32) input2 = input2.to(torch.float32) @@ -161,7 +160,7 @@ class TestIsclose(TestCase): cpu_output = cpu_output.astype(npu_output.dtype) self.assertRtolEqual(cpu_output, npu_output) - def test_isclose_float16_003(self, device): + def test_isclose_float16_003(self, device="npu"): def cpu_op_exec_fp16_rtol_atol(input1, input2, rtol, atol): input1 = input1.to(torch.float32) input2 = input2.to(torch.float32) @@ -176,6 +175,6 @@ class TestIsclose(TestCase): cpu_output = cpu_output.astype(npu_output.dtype) 
self.assertRtolEqual(cpu_output,npu_output) -instantiate_device_type_tests(TestIsclose, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_kl_div.py b/test/test_network_ops/test_kl_div.py index 127c91e64728020bb84520d01a08701fd86701e6..28271922b4f9eb22b7ea2d83306d04269fada717 100644 --- a/test/test_network_ops/test_kl_div.py +++ b/test/test_network_ops/test_kl_div.py @@ -17,9 +17,8 @@ import torch_npu import torch.nn.functional as F import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests + class TestKlDiv(TestCase): def cpu_op_exec(self, input1, input2, reduction): @@ -33,7 +32,7 @@ class TestKlDiv(TestCase): output = output.numpy() return output - def test_kl_div_shape_format_fp32(self, device): + def test_kl_div_shape_format_fp32(self, device="npu"): shape_format = [ [[torch.float32, 0, (192, 8)], [torch.float32, 0, (192, 8)], 1], [[torch.float32, 0, (192, 500)], [torch.float32, 0, (192, 500)], 1], @@ -53,7 +52,7 @@ class TestKlDiv(TestCase): npu_output = self.npu_op_exec(npu_input, npu_target, reduction) self.assertRtolEqual(cpu_output, npu_output) - def test_kl_div_shape_format_fp16(self, device): + def test_kl_div_shape_format_fp16(self, device="npu"): shape_format = [ [[torch.float16, 0, (192, 8)], [torch.float16, 0, (192, 8)], 1], [[torch.float16, 0, (192, 200)], [torch.float16, 0, (192, 200)], 1], @@ -73,6 +72,6 @@ class TestKlDiv(TestCase): npu_output = self.npu_op_exec(npu_input, npu_target, reduction) self.assertRtolEqual(cpu_output.astype(np.float16), npu_output) -instantiate_device_type_tests(TestKlDiv, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_kl_div_backward.py b/test/test_network_ops/test_kl_div_backward.py index 2af40f8fbe7aab908fcdd664cc52991b3be3ff91..aa0d749c1203715b45c82ce0cf8daa7845f1407d 100644 --- a/test/test_network_ops/test_kl_div_backward.py +++ b/test/test_network_ops/test_kl_div_backward.py @@ -17,9 +17,8 @@ import torch_npu import torch.nn.functional as F import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests + class TestKlDivBackward(TestCase): def cpu_op_exec(self, input1, input2, reduction): @@ -37,7 +36,7 @@ class TestKlDivBackward(TestCase): output = output.detach().numpy() return output, input1.grad - def test_kl_div_backward_shape_format_fp32(self, device): + def test_kl_div_backward_shape_format_fp32(self, device="npu"): shape_format = [ [[torch.float16, 0, (192, 8)], [torch.float16, 0, (192, 8)], 1], [[torch.float16, 0, (192, 50000)], [torch.float16, 0, (192, 50000)], 1], @@ -58,7 +57,7 @@ class TestKlDivBackward(TestCase): self.assertRtolEqual(cpu_output, npu_output) self.assertRtolEqual(cpu_input_grad, npu_input_grad.cpu()) - def test_kl_div_backward_shape_format_fp16(self, device): + def test_kl_div_backward_shape_format_fp16(self, device="npu"): shape_format = [ [[torch.float16, 0, (112, 8)], [torch.float16, 0, (112, 8)], 1], [[torch.float16, 0, (112, 50000)], [torch.float16, 0, (112, 50000)], 1], @@ -82,6 +81,6 @@ class TestKlDivBackward(TestCase): 
self.assertRtolEqual(cpu_output.astype(np.float16), npu_output) self.assertRtolEqual(cpu_input_grad.to(torch.float16), npu_input_grad.cpu()) -instantiate_device_type_tests(TestKlDivBackward, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_kthvalue.py b/test/test_network_ops/test_kthvalue.py index 51cd01eea0b4dd61ce9ad3a0276af9408f2028ed..6c23900e5cf6c9199d90f3f865087b32b227b309 100644 --- a/test/test_network_ops/test_kthvalue.py +++ b/test/test_network_ops/test_kthvalue.py @@ -17,9 +17,8 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests + class TestKthvalues(TestCase): def generate_data(self, min1, max1, shape, dtype): @@ -89,7 +88,7 @@ class TestKthvalues(TestCase): indices = indices.numpy() return y, indices - def test_kthvalues(self, device): + def test_kthvalues(self, device="npu"): x = self.generate_data(-100, 100, (3, 4, 5, 6), np.float32) k = self.generate_int_k(3) dim = self.generate_int_dim(4) @@ -99,7 +98,7 @@ class TestKthvalues(TestCase): self.assertRtolEqual(cpu_y, npu_y) self.assertRtolEqual(cpu_indices.astype(np.int32), npu_indices.astype(np.int32)) - def test_kthvalues_without_dim(self, device): + def test_kthvalues_without_dim(self, device="npu"): x = self.generate_data(-100, 100, (3, 4, 5, 6), np.int32) k = self.generate_int_k(3) keepdim = self.generate_bool_keepdim() @@ -108,7 +107,7 @@ class TestKthvalues(TestCase): self.assertRtolEqual(cpu_y, npu_y) self.assertRtolEqual(cpu_indices.astype(np.int32), npu_indices.astype(np.int32)) - def test_kthvalues_without_keepdim(self, device): + def test_kthvalues_without_keepdim(self, device="npu"): x = self.generate_data(-100, 100, (3, 4, 5, 6), np.float16) k = self.generate_int_k(3) dim = self.generate_int_dim(4) @@ -117,7 +116,7 @@ class TestKthvalues(TestCase): self.assertRtolEqual(cpu_y.astype(np.float16), npu_y) self.assertRtolEqual(cpu_indices.astype(np.int32), npu_indices.astype(np.int32)) - def test_kthvalues_out(self, device): + def test_kthvalues_out(self, device="npu"): x = self.generate_data(-100, 100, (3, 4, 5, 6), np.float32) k = self.generate_int_k(3) dim = self.generate_int_dim(4) @@ -131,7 +130,7 @@ class TestKthvalues(TestCase): self.assertRtolEqual(cpu_y.numpy(), npu_y.to("cpu").numpy()) self.assertRtolEqual(cpu_indices.numpy().astype(np.int32), npu_indices.to("cpu").numpy().astype(np.int32)) - def test_kthvalues_dimname(self, device): + def test_kthvalues_dimname(self, device="npu"): x = self.generate_data(-100, 100, (3, 4, 5, 6), np.float32) x.names = ['A', 'B', 'C', 'D'] k = self.generate_int_k(3) @@ -141,7 +140,7 @@ class TestKthvalues(TestCase): self.assertRtolEqual(cpu_y, npu_y) self.assertRtolEqual(cpu_indices.astype(np.int32), npu_indices.astype(np.int32)) - def test_kthvalues_dimname_without_dim(self, device): + def test_kthvalues_dimname_without_dim(self, device="npu"): x = self.generate_data(-100, 100, (3, 4, 5, 6), np.int32) x.names = ['A', 'B', 'C', 'D'] k = self.generate_int_k(3) @@ -151,7 +150,7 @@ class TestKthvalues(TestCase): self.assertRtolEqual(cpu_y, npu_y) self.assertRtolEqual(cpu_indices.astype(np.int32), npu_indices.astype(np.int32)) - def test_kthvalues_dimname_without_keepdim(self, device): + def test_kthvalues_dimname_without_keepdim(self, device="npu"): x = 
self.generate_data(-100, 100, (3, 4, 5, 6), np.float32) x.names = ['A', 'B', 'C', 'D'] k = self.generate_int_k(3) @@ -160,7 +159,7 @@ class TestKthvalues(TestCase): self.assertRtolEqual(cpu_y, npu_y) self.assertRtolEqual(cpu_indices.astype(np.int32), npu_indices.astype(np.int32)) - def test_kthvalues_dimname_out(self, device): + def test_kthvalues_dimname_out(self, device="npu"): x = self.generate_data(-100, 100, (3, 4, 5, 6), np.int32) x.names = ['A', 'B', 'C', 'D'] k = self.generate_int_k(3) @@ -175,6 +174,6 @@ class TestKthvalues(TestCase): self.assertRtolEqual(cpu_y.numpy(), npu_y.to("cpu").numpy()) self.assertRtolEqual(cpu_indices.numpy().astype(np.int32), npu_indices.to("cpu").numpy().astype(np.int32)) -instantiate_device_type_tests(TestKthvalues, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_l1_loss.py b/test/test_network_ops/test_l1_loss.py index 5f74ab9fbda0d148470246704abd663152f98f4a..21da5ba9f03e2d43f20986be88831e0f519dfb63 100644 --- a/test/test_network_ops/test_l1_loss.py +++ b/test/test_network_ops/test_l1_loss.py @@ -16,7 +16,7 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests +from torch_npu.testing.testcase import TestCase, run_tests class TestL1Loss(TestCase): diff --git a/test/test_network_ops/test_l1_loss_backward.py b/test/test_network_ops/test_l1_loss_backward.py index 40da5bfe3b4f3f6887f24e9dbda2764d8d4c82cb..4ca08824062f0684332ef393ea20394548198559 100644 --- a/test/test_network_ops/test_l1_loss_backward.py +++ b/test/test_network_ops/test_l1_loss_backward.py @@ -17,9 +17,9 @@ import torch_npu import torch.nn as nn import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestL1lossbackward(TestCase): def cpu_op_exec(self, input1, input2, input3, reduction): @@ -45,7 +45,7 @@ class TestL1lossbackward(TestCase): output = input2.grad.to("cpu").numpy() return output - def test_l1lossbackward_common_shape_format(self, device): + def test_l1lossbackward_common_shape_format(self, device="npu"): shape_format = [ [[np.float32, -1, (4)], [np.float32, -1, (4)], [np.float32, -1, (4)], "none"], @@ -85,7 +85,7 @@ class TestL1lossbackward(TestCase): npu_output = self.npu_op_exec(npu_input1, npu_input2, npu_input3, item[3]) self.assertRtolEqual(cpu_output, npu_output) - def test_l1lossbackward_float16_shape_format(self, device): + def test_l1lossbackward_float16_shape_format(self, device="npu"): def cpu_op_exec_fp16(input1, input2, input3, reduction): input1 = input1.to(torch.float32) input2 = input2.to(torch.float32) @@ -138,6 +138,6 @@ class TestL1lossbackward(TestCase): npu_output = self.npu_op_exec(npu_input1, npu_input2, npu_input3, item[3]) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestL1lossbackward, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_layer_norm.py b/test/test_network_ops/test_layer_norm.py index 743858a2f20da6862d547ade3ed8e570c8bdcdfe..3607a9190741435293147871603fcf4f4070c6ee 100644 --- a/test/test_network_ops/test_layer_norm.py +++ b/test/test_network_ops/test_layer_norm.py @@ -18,12 +18,12 @@ import torch_npu 
import torch.nn as nn import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestLayerNorm(TestCase): - def test_c10_layer_norm(self, device): + def test_c10_layer_norm(self, device="npu"): # test that we can call c10 ops and they return a reasonable result X = torch.rand(5, 5, dtype=torch.float, device="cpu") X = X.to("npu") @@ -55,7 +55,7 @@ class TestLayerNorm(TestCase): output = output.to("cpu") return output - def test_layer_norm_shape_format(self, device): + def test_layer_norm_shape_format(self, device="npu"): shape_format = [ [np.float32, 0, (64, 10)], [np.float32, 0, (256, 2048, 7, 7)], @@ -71,7 +71,7 @@ class TestLayerNorm(TestCase): npu_output = self.npu_op_exec(npu_input) self.assertRtolEqual(cpu_output.detach().numpy(), npu_output.detach().numpy()) - def test_layer_norm_float16_format(self, device): + def test_layer_norm_float16_format(self, device="npu"): shape_format = [ [np.float16, 0, (64, 10)], [np.float16, 0, (256, 2048, 7, 7)], @@ -89,6 +89,6 @@ class TestLayerNorm(TestCase): cpu_output = cpu_output.to(torch.float16) self.assertRtolEqual(cpu_output.detach().numpy(), npu_output.detach().numpy()) -instantiate_device_type_tests(TestLayerNorm, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_layer_norm_backward.py b/test/test_network_ops/test_layer_norm_backward.py index 1af28911a00d1c2be84428dba4fe613bc2cdc2bf..cd8117c63dbaa04c3e688b40074bf9e8021abe3e 100644 --- a/test/test_network_ops/test_layer_norm_backward.py +++ b/test/test_network_ops/test_layer_norm_backward.py @@ -15,9 +15,9 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestLayerNorm(TestCase): weight_grad = [] @@ -53,7 +53,7 @@ class TestLayerNorm(TestCase): grad_weight = m.weight.grad.cpu().detach().numpy() return grad_output, grad_weight, grad_bias - def test_layernorm_shape_format(self, device): + def test_layernorm_shape_format(self, device="npu"): shape_format = [ [np.float32, 3, [256, 32, 112, 112]], [np.float16, 3, [256, 672, 7, 7]], @@ -87,6 +87,5 @@ class TestLayerNorm(TestCase): self.assertRtolEqual(cpu_grad_bias, npu_grad_bias) -instantiate_device_type_tests(TestLayerNorm, globals(), except_for="cpu") if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_le.py b/test/test_network_ops/test_le.py index 5b7d933b5bf781347c066bf902dc171fe144e2eb..b264026033c8c9caef2abddef2337a5de8e1a8c1 100644 --- a/test/test_network_ops/test_le.py +++ b/test/test_network_ops/test_le.py @@ -16,9 +16,9 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor 
+ class TestLe(TestCase): def generate_scalar(self, min1, max1): @@ -143,7 +143,7 @@ class TestLe(TestCase): self.assertRtolEqual(cpu_output_out, npu_output_out) - def test_le_tensor_out(self, device): + def test_le_tensor_out(self, device="npu"): shape_format = [ [[np.float16, 0, [128, 116, 14, 14]], [np.float16, 0, [256, 116, 1, 1]]], [[np.float16, 0, [128, 3, 224, 224]], [np.float16, 0, [3, 3, 3]]], @@ -170,7 +170,7 @@ class TestLe(TestCase): cpu_output_out = cpu_output_out.astype(np.float16) self.assertRtolEqual(cpu_output_out, npu_output_out) - def test_le_scalar_out(self, device): + def test_le_scalar_out(self, device="npu"): shape_format = [ [[np.float16, 0, [12, 4, 12, 121]], [np.float16, 0, [256, 116, 1, 1]]], [[np.float16, 0, [12, 10, 14, 111]], [np.float16, 0, [256, 116, 1, 1]]], @@ -182,7 +182,7 @@ class TestLe(TestCase): ] self.le_scalar_out_result(shape_format) - def test_le_scalar_float32(self, device): + def test_le_scalar_float32(self, device="npu"): format_list = [0] shape_list = [(5, 3), (2, 3, 4)] shape_format = [ @@ -195,7 +195,7 @@ class TestLe(TestCase): npu_output = self.npu_op_exec_scalar(npu_input, scalar) self.assertRtolEqual(cpu_output, npu_output) - def test_le_scalar_int32(self, device): + def test_le_scalar_int32(self, device="npu"): format_list = [0] shape_list = [(5, 3), (2, 3, 4)] shape_format = [ @@ -208,7 +208,7 @@ class TestLe(TestCase): npu_output = self.npu_op_exec_scalar(npu_input, scalar) self.assertRtolEqual(cpu_output, npu_output) - def test_le_scalar_float16(self, device): + def test_le_scalar_float16(self, device="npu"): format_list = [0] shape_list = [(5, 3), (2, 3, 4)] shape_format = [ @@ -222,7 +222,7 @@ class TestLe(TestCase): npu_output = self.npu_op_exec_scalar(npu_input, scalar) self.assertRtolEqual(cpu_output, npu_output) - def test_le_tensor_float32(self, device): + def test_le_tensor_float32(self, device="npu"): format_list = [0] shape_list = [(5, 3), (2, 3, 4)] shape_format = [[[np.float32, i, j], [np.float32, i, j]] @@ -234,7 +234,7 @@ class TestLe(TestCase): npu_output = self.npu_op_exec(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) - def test_le_tensor_float16(self, device): + def test_le_tensor_float16(self, device="npu"): format_list = [0] shape_list = [(5, 3), (2, 3, 4)] shape_format = [[[np.float16, i, j], [np.float16, i, j]] @@ -248,7 +248,7 @@ class TestLe(TestCase): npu_output = self.npu_op_exec(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) - def test_le_inplace_float32(self, device): + def test_le_inplace_float32(self, device="npu"): format_list = [0, 3] shape_list = [(5, 3), (2, 3, 4)] shape_format = [[[np.float32, i, j], [np.float32, i, j]] @@ -260,7 +260,7 @@ class TestLe(TestCase): npu_output = self.npu_op_inplace_exec(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) - def test_le_inplace_float16(self, device): + def test_le_inplace_float16(self, device="npu"): format_list = [0, 3] shape_list = [(5, 3), (2, 3, 4)] shape_format = [[[np.float16, i, j], [np.float16, i, j]] @@ -275,7 +275,7 @@ class TestLe(TestCase): cpu_output = cpu_output.astype(np.float16) self.assertRtolEqual(cpu_output, npu_output) - def test_le_inplace_scalar_float32(self, device): + def test_le_inplace_scalar_float32(self, device="npu"): format_list = [0] shape_list = [(5, 3), (2, 3, 4)] shape_format = [ @@ -290,7 +290,7 @@ class TestLe(TestCase): npu_output = self.npu_op_inplace_exec_scalar(npu_input, scalar1) self.assertRtolEqual(cpu_output, npu_output) - def 
test_le_inplace_scalar_float16(self, device): + def test_le_inplace_scalar_float16(self, device="npu"): format_list = [0] shape_list = [(5, 3), (2, 3, 4)] shape_format = [ @@ -305,7 +305,7 @@ class TestLe(TestCase): cpu_output = cpu_output.astype(np.float16) self.assertRtolEqual(cpu_output, npu_output) - def test_le_mix_dtype(self, device): + def test_le_mix_dtype(self, device="npu"): cpu_input1, npu_input1 = create_common_tensor([np.float16, 0, (2, 3)], 1, 100) cpu_input2, npu_input2 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100) cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) @@ -313,6 +313,5 @@ class TestLe(TestCase): self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestLe, globals(), except_for="cpu") if __name__ == '__main__': run_tests() diff --git a/test/test_network_ops/test_lerp.py b/test/test_network_ops/test_lerp.py index e420524269a5db9c0ec7b1f03fbd12c6b2b0d861..97d3cb5d19916f1543febd6502af83fd639d3582 100644 --- a/test/test_network_ops/test_lerp.py +++ b/test/test_network_ops/test_lerp.py @@ -17,9 +17,9 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestLerp(TestCase): def cpu_op_exec(self, input1, input2, input3): @@ -95,7 +95,7 @@ class TestLerp(TestCase): output = output.numpy() return output - def test_lerp_common_shape_format(self, device): + def test_lerp_common_shape_format(self, device="npu"): shape_format = [ [[np.float32, -1, (4, 2, 2, 3)]], [[np.float32, -1, (2, 2, 3, 4)]], @@ -113,7 +113,7 @@ class TestLerp(TestCase): self.assertRtolEqual(cpu_output, npu_output) self.assertRtolEqual(cpu_output1, npu_output1) - def test_lerp_float16_shape_format(self, device): + def test_lerp_float16_shape_format(self, device="npu"): shape_format = [ [[np.float16, -1, (100, 4, 5, 5)]], [[np.float16, -1, (100, 5, 5, 4)]], @@ -129,7 +129,7 @@ class TestLerp(TestCase): self.assertRtolEqual(cpu_output, npu_output, prec=0.003, prec16=0.003) self.assertRtolEqual(cpu_output1, npu_output1, prec=0.003, prec16=0.003) - def test_lerp_scalar_common_shape_format(self, device): + def test_lerp_scalar_common_shape_format(self, device="npu"): shape_format = [ [[np.float32, -1, (4, 2, 2, 3)], 1.0], [[np.float32, -1, (2, 2, 3, 4)], 2.0], @@ -148,7 +148,7 @@ class TestLerp(TestCase): self.assertRtolEqual(cpu_output, npu_output) self.assertRtolEqual(cpu_output1, npu_output1) - def test_lerp_scalar_float16_shape_format(self, device): + def test_lerp_scalar_float16_shape_format(self, device="npu"): shape_format = [ [[np.float16, -1, (100, 4, 5, 5)], 1.2], [[np.float16, -1, (100, 5, 5, 4)], 1.2], @@ -165,6 +165,6 @@ class TestLerp(TestCase): self.assertRtolEqual(cpu_output, npu_output, prec16=0.02) self.assertRtolEqual(cpu_output1, npu_output1, prec16=0.02) -instantiate_device_type_tests(TestLerp, globals(), except_for='cpu') + if __name__ == '__main__': run_tests() diff --git a/test/test_network_ops/test_linspace.py b/test/test_network_ops/test_linspace.py index 00f4d44e2d78578040cd3cfb81c97d87d2c12f0e..20b4388653d2b2917eeb7f8fb2558d83600ec76b 100644 --- a/test/test_network_ops/test_linspace.py +++ b/test/test_network_ops/test_linspace.py @@ -14,14 +14,13 @@ import torch import torch_npu import numpy as np -from torch.nn 
import functional as F -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor, test_2args_broadcast, create_dtype_tensor, UT_FAST_MODE +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestLinspace(TestCase): - def test_linspace(self, device): + def test_linspace(self, device="npu"): shape_format = [ [0, 100, 10, torch.float32, torch.tensor([0.,11.111111, 22.222221, 33.333332, 44.444443, @@ -39,7 +38,7 @@ class TestLinspace(TestCase): benchmark15 = item[4] self.assertRtolEqual(benchmark15, npu_output) - def test_linspace_out(self, device): + def test_linspace_out(self, device="npu"): shape_format = [ [0, 100, 10, torch.float32, [np.float32, 0, [10]], torch.tensor([0.,11.111111, 22.222221, 33.333332, 44.444443, @@ -58,6 +57,6 @@ class TestLinspace(TestCase): benchmark15 = item[5] self.assertRtolEqual(benchmark15, npu_output) -instantiate_device_type_tests(TestLinspace, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_log.py b/test/test_network_ops/test_log.py index 304ab9645e4c61b9b18ed848dbd64dd8ec9e5ea7..64d42c7f04c3fab444d5ac6b3481efe7c7d46db3 100644 --- a/test/test_network_ops/test_log.py +++ b/test/test_network_ops/test_log.py @@ -16,9 +16,8 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor class TestLog(TestCase): @@ -68,8 +67,7 @@ class TestLog(TestCase): output = output.numpy() return output -# TestCase - def test_log_shape_format_fp32(self, device): + def test_log_shape_format_fp32(self, device="npu"): format_list = [3] shape_list = [(4, 4)] shape_format = [ @@ -81,7 +79,7 @@ class TestLog(TestCase): npu_output = self.npu_op_exec(npu_input1) self.assertRtolEqual(cpu_output, npu_output) - def test_log_shape_format_fp16(self, device): + def test_log_shape_format_fp16(self, device="npu"): format_list = [3] shape_list = [(4, 4)] shape_format = [ @@ -95,7 +93,7 @@ class TestLog(TestCase): cpu_output = cpu_output.astype(np.float16) self.assertRtolEqual(cpu_output, npu_output) - def test_log_inp_shape_format_fp32(self, device): + def test_log_inp_shape_format_fp32(self, device="npu"): format_list = [3] shape_list = [(4, 4)] shape_format = [ @@ -107,7 +105,7 @@ class TestLog(TestCase): npu_output = self.npu_inp_op_exec(npu_input1) self.assertRtolEqual(cpu_output, npu_output) - def test_log_inp_shape_format_fp16(self, device): + def test_log_inp_shape_format_fp16(self, device="npu"): format_list = [3] shape_list = [(4, 4)] shape_format = [ @@ -121,7 +119,7 @@ class TestLog(TestCase): cpu_output = cpu_output.astype(np.float16) self.assertRtolEqual(cpu_output, npu_output) - def test_log_inp_uncon_shape_format_fp32(self, device): + def test_log_inp_uncon_shape_format_fp32(self, device="npu"): format_list = [3] shape_list = [(8, 6)] shape_format = [ @@ -133,7 +131,7 @@ class TestLog(TestCase): npu_output = self.npu_inp_uncon_op_exec(npu_input1) self.assertRtolEqual(cpu_output, npu_output) - def test_log_inp_uncon_shape_format_fp16(self, 
device): + def test_log_inp_uncon_shape_format_fp16(self, device="npu"): format_list = [3] shape_list = [(8, 6)] shape_format = [ @@ -147,6 +145,6 @@ class TestLog(TestCase): cpu_output = cpu_output.astype(np.float16) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestLog, globals(), except_for="cpu") + if __name__ == '__main__': run_tests() diff --git a/test/test_network_ops/test_log10.py b/test/test_network_ops/test_log10.py index c6318620e0821e9d3062bf5d75681019d54f22fb..7285611f949a0796c891eef09df8e1fc3d9e9af6 100644 --- a/test/test_network_ops/test_log10.py +++ b/test/test_network_ops/test_log10.py @@ -16,9 +16,9 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestLog10(TestCase): def cpu_op_exec(self, input1): @@ -58,7 +58,7 @@ class TestLog10(TestCase): output = input1.to("cpu").numpy() return output - def test_log10_shape_format_fp32(self, device): + def test_log10_shape_format_fp32(self, device="npu"): format_list = [3] shape_list = [(4, 4)] shape_format = [ @@ -73,7 +73,7 @@ class TestLog10(TestCase): self.assertRtolEqual(cpu_output, npu_output) self.assertRtolEqual(cpu_output1, npu_output1) - def test_log10_shape_format_fp16(self, device): + def test_log10_shape_format_fp16(self, device="npu"): format_list = [3] shape_list = [(4, 4)] shape_format = [ @@ -91,7 +91,7 @@ class TestLog10(TestCase): self.assertRtolEqual(cpu_output, npu_output) self.assertRtolEqual(cpu_output1, npu_output1) - def test_log10_inp_uncon_shape_format_fp32(self, device): + def test_log10_inp_uncon_shape_format_fp32(self, device="npu"): format_list = [3] shape_list = [(8, 6)] shape_format = [ @@ -103,7 +103,7 @@ class TestLog10(TestCase): npu_output = self.npu_inp_uncon_op_exec(npu_input1) self.assertRtolEqual(cpu_output, npu_output) - def test_log10_inp_uncon_shape_format_fp16(self, device): + def test_log10_inp_uncon_shape_format_fp16(self, device="npu"): format_list = [3] shape_list = [(8, 6)] shape_format = [ @@ -117,7 +117,7 @@ class TestLog10(TestCase): cpu_output = cpu_output.astype(np.float16) self.assertRtolEqual(cpu_output, npu_output) - def test_log10_out_float32_shape_format(self, device): + def test_log10_out_float32_shape_format(self, device="npu"): shape_format = [ [[np.float32, 0, [1024, 32, 7, 7]], [np.float32, 0, [1024, 32, 7, 7]]], [[np.float32, 0, [1024, 32, 7]], [np.float32, 0, [1024, 32]]], @@ -135,7 +135,7 @@ class TestLog10(TestCase): npu_output = self.npu_op_exec_out(npu_input, npu_output) self.assertRtolEqual(cpu_output, npu_output) - def test_log10_out_float16_shape_format(self, device): + def test_log10_out_float16_shape_format(self, device="npu"): shape_format = [ [[np.float16, 0, [1024, 32, 7, 7]], [np.float16, 0, [1024, 32, 7, 7]]], [[np.float16, 0, [1024, 32, 7]], [np.float16, 0, [1024, 32]]], @@ -158,6 +158,6 @@ class TestLog10(TestCase): cpu_output = cpu_output.astype(np.float16) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestLog10, globals(), except_for="cpu") + if __name__ == '__main__': run_tests() diff --git a/test/test_network_ops/test_log1p.py b/test/test_network_ops/test_log1p.py index 878375d0faf4ac99dc1b7753d9a6b7a51daccd11..fcce0222182717b7b921a9ab3d6b9df54a315f92 
100644 --- a/test/test_network_ops/test_log1p.py +++ b/test/test_network_ops/test_log1p.py @@ -16,10 +16,10 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor - +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + + class TestLog1p(TestCase): def cpu_op_exec(self, input1): output = torch.log1p(input1) @@ -50,7 +50,7 @@ class TestLog1p(TestCase): output = output.astype(np.float16) return output - def test_log1p_common_shape_format(self, device): + def test_log1p_common_shape_format(self, device="npu"): shape_format = [ [[np.float32, 0, 1]], [[np.float32, 0, (64, 10)]], @@ -66,7 +66,7 @@ class TestLog1p(TestCase): self.assertRtolEqual(cpu_output, npu_output) self.assertRtolEqual(cpu_output1, npu_output1) - def test_log1p_float16_shape_format(self, device): + def test_log1p_float16_shape_format(self, device="npu"): shape_format = [ [[np.float16, -1, 1]], [[np.float16, -1, (64, 10)]], @@ -78,7 +78,7 @@ class TestLog1p(TestCase): npu_output = self.npu_op_exec(npu_input) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestLog1p, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_log_softmax.py b/test/test_network_ops/test_log_softmax.py index c9a98a3558973e235dfa338031d536ab1fe697e6..6a56b2b509fb1997b943f099a7ee1def26ad03d4 100644 --- a/test/test_network_ops/test_log_softmax.py +++ b/test/test_network_ops/test_log_softmax.py @@ -16,9 +16,8 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor class TestLogSoftmax(TestCase): @@ -45,57 +44,43 @@ class TestLogSoftmax(TestCase): npu_output = self.npu_op_exec_new(npu_input1, 0) cpu_output = cpu_output.astype(npu_output.dtype) self.assertRtolEqual(cpu_output, npu_output) - ''' - def test_logsoftmax_shape_format_fp16_1d(self, device): - format_list = [0, 3] - shape_format = [ - [np.float16, i, [1024]] for i in format_list - ] - self.logsoftmax_result(shape_format) - - def test_logsoftmax_shape_format_fp32_1d(self, device): - format_list = [0, 3] - shape_format = [ - [np.float32, i, [1024]] for i in format_list - ] - self.logsoftmax_result(shape_format) - ''' - def test_logsoftmax_shape_format_fp16_2d(self, device): + + def test_logsoftmax_shape_format_fp16_2d(self, device="npu"): format_list = [0, 3] shape_format = [ [np.float16, i, [256, 1000]] for i in format_list ] self.logsoftmax_result(shape_format) - def test_logsoftmax_shape_format_fp32_2d(self, device): + def test_logsoftmax_shape_format_fp32_2d(self, device="npu"): format_list = [0, 3] shape_format = [ [np.float32, i, [256, 1000]] for i in format_list ] self.logsoftmax_result(shape_format) - def test_logsoftmax_shape_format_fp16_3d(self, device): + def test_logsoftmax_shape_format_fp16_3d(self, device="npu"): format_list = [0, 3] shape_format = [ [np.float16, i, [32, 48, 64]] for i in format_list ] self.logsoftmax_result(shape_format) - def test_logsoftmax_shape_format_fp32_3d(self, 
device): + def test_logsoftmax_shape_format_fp32_3d(self, device="npu"): format_list = [0, 3] shape_format = [ [np.float32, i, [32, 48, 1024]] for i in format_list ] self.logsoftmax_result(shape_format) - def test_logsoftmax_shape_format_fp16_4d(self, device): + def test_logsoftmax_shape_format_fp16_4d(self, device="npu"): format_list = [0, 3] shape_format = [ [np.float16, i, [32, 24, 18, 1000]] for i in format_list ] self.logsoftmax_result(shape_format) - def test_logsoftmax_shape_format_fp32_4d(self, device): + def test_logsoftmax_shape_format_fp32_4d(self, device="npu"): format_list = [0, 3] shape_format = [ [np.float32, i, [32, 24, 18, 1000]] for i in format_list @@ -103,6 +88,5 @@ class TestLogSoftmax(TestCase): self.logsoftmax_result(shape_format) -instantiate_device_type_tests(TestLogSoftmax, globals(), except_for="cpu") if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_log_softmax_backward.py b/test/test_network_ops/test_log_softmax_backward.py index c395bef220867cce51de9f16bab56be078cebd6a..8090d142829c1b1fe27715409dddc3419e10f064 100644 --- a/test/test_network_ops/test_log_softmax_backward.py +++ b/test/test_network_ops/test_log_softmax_backward.py @@ -16,9 +16,9 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestLogSoftmaxBackward(TestCase): def cpu_op_exec(self, input1, input2, n): @@ -46,56 +46,56 @@ class TestLogSoftmaxBackward(TestCase): cpu_output = cpu_output.astype(npu_output.dtype) self.assertRtolEqual(cpu_output, npu_output) - def test_logsoftmax_backward_shape_format_fp16_1d(self, device): + def test_logsoftmax_backward_shape_format_fp16_1d(self, device="npu"): format_list = [0, 3] shape_format = [ [np.float16, i, [18]] for i in format_list ] self.logsoftmax_backward_result(shape_format, 0, 2) - def test_logsoftmax_backward_shape_format_fp32_1d(self, device): + def test_logsoftmax_backward_shape_format_fp32_1d(self, device="npu"): format_list = [0, 3] shape_format = [ [np.float32, i, [18]] for i in format_list ] self.logsoftmax_backward_result(shape_format, 0, 50) - def test_logsoftmax_backward_shape_format_fp16_2d(self, device): + def test_logsoftmax_backward_shape_format_fp16_2d(self, device="npu"): format_list = [0, 3, 29] shape_format = [ [np.float16, i, [256, 1000]] for i in format_list ] self.logsoftmax_backward_result(shape_format, 0, 2) - def test_logsoftmax_backward_shape_format_fp32_2d(self, device): + def test_logsoftmax_backward_shape_format_fp32_2d(self, device="npu"): format_list = [0, 3, 29] shape_format = [ [np.float32, i, [256, 1000]] for i in format_list ] self.logsoftmax_backward_result(shape_format, 0, 50) - def test_logsoftmax_backward_shape_format_fp16_3d(self, device): + def test_logsoftmax_backward_shape_format_fp16_3d(self, device="npu"): format_list = [0, 3, 29] shape_format = [ [np.float16, i, [32, 48, 64]] for i in format_list ] self.logsoftmax_backward_result(shape_format, 0, 2) - def test_logsoftmax_backward_shape_format_fp32_3d(self, device): + def test_logsoftmax_backward_shape_format_fp32_3d(self, device="npu"): format_list = [0, 3, 29] shape_format = [ [np.float32, i, [32, 48, 64]] for i in format_list ] self.logsoftmax_backward_result(shape_format, 0, 50) - def 
test_logsoftmax_backward_shape_format_fp16_4d(self, device): + def test_logsoftmax_backward_shape_format_fp16_4d(self, device="npu"): format_list = [0, 3] shape_format = [ [np.float16, i, [32, 24, 18, 18]] for i in format_list ] self.logsoftmax_backward_result(shape_format, 0, 2) - def test_logsoftmax_backward_shape_format_fp32_4d(self, device): + def test_logsoftmax_backward_shape_format_fp32_4d(self, device="npu"): format_list = [0, 3] shape_format = [ [np.float32, i, [32, 24, 18, 18]] for i in format_list @@ -103,6 +103,5 @@ class TestLogSoftmaxBackward(TestCase): self.logsoftmax_backward_result(shape_format, 0, 50) -instantiate_device_type_tests(TestLogSoftmaxBackward, globals(), except_for="cpu") if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_logical_and.py b/test/test_network_ops/test_logical_and.py index 5542a2971c32719cc5793dfa3fe74ad0e7d33334..9721e87d1e339a72f8713e83ded249522d195014 100644 --- a/test/test_network_ops/test_logical_and.py +++ b/test/test_network_ops/test_logical_and.py @@ -12,15 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys -import copy import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests + class TestLogicalAnd(TestCase): def generate_single_data(self, min_d, max_d, shape, dtype): @@ -93,7 +90,7 @@ class TestLogicalAnd(TestCase): npu_output_out = self.npu_op_exec_out(cpu_input1, cpu_input2, cpu_input3) self.assertRtolEqual(cpu_output_out, npu_output_out) - def test_logical_and_out(self, device): + def test_logical_and_out(self, device="npu"): shape_format = [ [[128, 116, 14, 14], [256, 116, 1, 1, 28]], [[128, 3, 224, 224], [3, 3, 3]], @@ -104,18 +101,18 @@ class TestLogicalAnd(TestCase): ] self.logical_and_out_result(shape_format) - def test_logical_and_bool(self, device): + def test_logical_and_bool(self, device="npu"): npu_input1, npu_input2 = self.generate_data(0, 2, (2, 5), np.bool) cpu_output = self.cpu_op_exec(npu_input1, npu_input2).astype(np.float32) npu_output = self.npu_op_exec(npu_input1, npu_input2).astype(np.float32) self.assertRtolEqual(cpu_output, npu_output) - def test_logical_and_inplace_bool(self, device): + def test_logical_and_inplace_bool(self, device="npu"): npu_input1, npu_input2 = self.generate_data(0, 2, (2, 5), np.bool) cpu_output = self.cpu_op_exec_(npu_input1, npu_input2).astype(np.float32) npu_output = self.npu_op_exec_(npu_input1, npu_input2).astype(np.float32) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestLogicalAnd, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_logical_not.py b/test/test_network_ops/test_logical_not.py index 22d6a1e3b54126cf2e41b5c1cbddabef7957718d..fbd67543933318b695374523a4cca5b2c6f6ac9d 100644 --- a/test/test_network_ops/test_logical_not.py +++ b/test/test_network_ops/test_logical_not.py @@ -16,9 +16,9 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils 
import create_common_tensor + class TestLogicalNot(TestCase): def cpu_op_exec(self, input1): @@ -38,7 +38,7 @@ class TestLogicalNot(TestCase): output = output.numpy() return output - def test_logical_not_common_shape_format(self, device): + def test_logical_not_common_shape_format(self, device="npu"): shape_format = [ [[np.int8, -1, 1]], [[np.int8, -1, (64, 10)]], @@ -65,7 +65,7 @@ class TestLogicalNot(TestCase): npu_output = self.npu_op_exec(npu_input) self.assertRtolEqual(cpu_output, npu_output) - def test_logical_not_out_common_shape_format(self, device): + def test_logical_not_out_common_shape_format(self, device="npu"): shape_format = [ [[np.float16, -1, (64, 10)], [np.float16, -1, (64, 1)]], [[np.float16, -1, (256, 2048, 7, 7)], [np.float16, -1, (256, 2048, 7)]], @@ -84,6 +84,6 @@ class TestLogicalNot(TestCase): cpu_output = cpu_output.astype(npu_output.dtype) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestLogicalNot, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_logical_or.py b/test/test_network_ops/test_logical_or.py index 87c6accafdbfc7be7fcc337fa415c12ae5b6e417..07f716da93051d93744b996d45bd34cf34c310c7 100644 --- a/test/test_network_ops/test_logical_or.py +++ b/test/test_network_ops/test_logical_or.py @@ -12,15 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys -import copy import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests + class TestLogicalOr(TestCase): def generate_single_data(self, min_d, max_d, shape, dtype): @@ -100,7 +97,7 @@ class TestLogicalOr(TestCase): npu_output_out = self.npu_op_exec_out(cpu_input1, cpu_input2, cpu_input3) self.assertRtolEqual(cpu_output_out, npu_output_out) - def test_logical_or_out(self, device): + def test_logical_or_out(self, device="npu"): shape_format = [ [[128, 116, 14, 14], [256, 116, 1, 1, 28]], [[128, 3, 224, 224], [3, 3, 3]], @@ -111,19 +108,19 @@ class TestLogicalOr(TestCase): ] self.logical_or_out_result(shape_format) - def test_logical_or_bool(self, device): + def test_logical_or_bool(self, device="npu"): npu_input1, npu_input2 = self.generate_bool_data(0, 2, (10, 64), np.bool) cpu_output = self.cpu_op_exec(npu_input1, npu_input2) npu_output = self.npu_op_exec(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) - def test_logical_or_inplace_bool(self, device): + def test_logical_or_inplace_bool(self, device="npu"): npu_input1, npu_input2 = self.generate_bool_data(0, 2, (10, 64), np.bool) cpu_output = self.cpu_op_exec_(npu_input1, npu_input2) npu_output = self.npu_op_exec_(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestLogicalOr, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_logspace.py b/test/test_network_ops/test_logspace.py index f6f13c779c7b770f51f74d43b3d2b1d0f1f94eee..960af0e186160d1a257ebb9bf7f482963ed3fefa 100644 --- a/test/test_network_ops/test_logspace.py +++ b/test/test_network_ops/test_logspace.py @@ -16,9 +16,9 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from 
torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestLogSpace(TestCase): def cpu_op_exec(self, start, end, steps, base): @@ -50,7 +50,7 @@ class TestLogSpace(TestCase): output = output.numpy() return output - def test_logspace_common_shape_format(self, device): + def test_logspace_common_shape_format(self, device="npu"): shape_format = [ [0.0, 1.0, 10, 0.2, torch.float32], [2.0, 3.0, 10, 0.05, torch.float32], @@ -69,7 +69,7 @@ class TestLogSpace(TestCase): npu_output = self.npu_op_exec(item[0], item[1], item[2], item[3]) self.assertRtolEqual(cpu_output, npu_output) - def test_logspace_out_common_shape_format(self, device): + def test_logspace_out_common_shape_format(self, device="npu"): shape_format = [ [0.0, 1.0, 10, 0.2, torch.float32, [np.float32, 0, [10, 2]]], [2.0, 3.0, 10, 0.05, torch.float32, [np.float32, 0, [10, 2, 5]]], @@ -89,7 +89,7 @@ class TestLogSpace(TestCase): cpu_output = cpu_output.astype(npu_output.dtype) self.assertRtolEqual(cpu_output, npu_output) - def test_logspace_float16_shape_format(self, device): + def test_logspace_float16_shape_format(self, device="npu"): shape_format = [ [-2.0, 2.0, 32, 32, torch.float16], [0.0, 1.0, 10, 0.2, torch.float16], @@ -104,6 +104,6 @@ class TestLogSpace(TestCase): npu_output = self.npu_op_exec_dtype(item[0], item[1], item[2], item[3], item[4]) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestLogSpace, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_logsumexp.py b/test/test_network_ops/test_logsumexp.py index e9114f61bf6f13c5823011f6bcd0fce378648fbd..8cc676175f99ebc2be5a2d2c1da83f4c1c15b801 100644 --- a/test/test_network_ops/test_logsumexp.py +++ b/test/test_network_ops/test_logsumexp.py @@ -16,9 +16,9 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestLogsumexp(TestCase): @@ -45,7 +45,7 @@ class TestLogsumexp(TestCase): output = out.to("cpu") return output - def test_logsumexp_shape_format(self, device): + def test_logsumexp_shape_format(self, device="npu"): shape_format = [ [[np.float32, 0, (3, 4, 2)], [np.float32, 0, (3, 4, 1)], 2, True], [[np.float32, 0, (3, 4, 2)], [np.float32, 0, (3, 4)], 2, False], @@ -77,7 +77,7 @@ class TestLogsumexp(TestCase): cpu_result = cpu_result.to(npu_result.dtype) self.assertRtolEqual(cpu_result.numpy(), npu_result.numpy()) - def test_logsumexp_dimname1(self, device): + def test_logsumexp_dimname1(self, device="npu"): cpu_input = self.generate_data(-10, 10, (2, 14, 69, 96, 1824), np.float32) cpu_input.names = ['A', 'B', 'C', 'D', 'E'] dim = ['C'] @@ -86,7 +86,7 @@ class TestLogsumexp(TestCase): npu_out = self.npu_op_exec(cpu_input.npu(), dim, keepdim) self.assertRtolEqual(cpu_out.numpy(), npu_out.numpy()) - def test_logsumexp_dimname2(self, device): + def test_logsumexp_dimname2(self, device="npu"): cpu_input = self.generate_data(-10, 10, (14, 69, 96, 1824), np.float32) cpu_input.names = ['A', 'B', 'C', 'D'] dim = ['B', 'C'] @@ -95,7 +95,7 @@ class 
TestLogsumexp(TestCase):
         npu_out = self.npu_op_exec(cpu_input.npu(), dim, keepdim)
         self.assertRtolEqual(cpu_out.numpy(), npu_out.numpy())
 
-    def test_logsumexp_dimname3(self, device):
+    def test_logsumexp_dimname3(self, device="npu"):
         cpu_input = self.generate_data(-10, 10, (14, 69, 96, 1824), np.float32)
         cpu_input.names = ['A', 'B', 'C', 'D']
         dim = ['B', 'C', 'D']
@@ -104,6 +104,6 @@ class TestLogsumexp(TestCase):
         npu_out = self.npu_op_exec(cpu_input.npu(), dim, keepdim)
         self.assertRtolEqual(cpu_out.numpy(), npu_out.numpy())
 
-instantiate_device_type_tests(TestLogsumexp, globals(), except_for="cpu")
+
 if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_network_ops/test_masked_fill.py b/test/test_network_ops/test_masked_fill.py
index 256e366b29b6533543b12d9db19d61fa5dd54e09..c31fefc93b7a184e840635a174e80a7cc676fc42 100644
--- a/test/test_network_ops/test_masked_fill.py
+++ b/test/test_network_ops/test_masked_fill.py
@@ -15,9 +15,9 @@
 import torch
 import torch_npu
 import numpy as np
-from torch_npu.testing.common_utils import TestCase, run_tests
-from torch_npu.testing.common_device_type import instantiate_device_type_tests
-from torch_npu.testing.util_test import create_common_tensor
+from torch_npu.testing.testcase import TestCase, run_tests
+from torch_npu.testing.common_utils import create_common_tensor
+
 
 class TestMaskedFill(TestCase):
     def create_bool_tensor(self, shape, minValue, maxValue):
@@ -49,7 +49,7 @@ class TestMaskedFill(TestCase):
         output = output.numpy()
         return output
 
-    def test_masked_fill_shape_format_fp16(self, device):
+    def test_masked_fill_shape_format_fp16(self, device="npu"):
         format_list = [0]
         shape_list = [[1024], [32, 1024], [32, 8, 1024], [128, 32, 8, 1024]]
         value_list = [1.25,
@@ -73,7 +73,7 @@ class TestMaskedFill(TestCase):
             cpu_output2 = cpu_output2.astype(npu_output2.dtype)
             self.assertRtolEqual(cpu_output2, npu_output2)
 
-    def test_masked_fill_shape_format_fp32(self, device):
+    def test_masked_fill_shape_format_fp32(self, device="npu"):
         format_list = [0]
         shape_list = [[1024], [32, 1024], [32, 8, 1024], [128, 32, 8, 1024]]
         value_list = [1.25,
@@ -94,7 +94,7 @@ class TestMaskedFill(TestCase):
             npu_output2 = self.npu_inp_op_exec(npu_input1, mask_npu, item[1])
             self.assertRtolEqual(cpu_output2, npu_output2)
 
-    def test_masked_fill_shape_format_int32(self, device):
+    def test_masked_fill_shape_format_int32(self, device="npu"):
         format_list = [0]
         shape_list = [[1024], [32, 1024], [32, 8, 1024], [128, 32, 8, 1024]]
         value_list = [1.25,
@@ -115,7 +115,7 @@ class TestMaskedFill(TestCase):
             npu_output2 = self.npu_inp_op_exec(npu_input1, mask_npu, item[1])
             self.assertRtolEqual(cpu_output2, npu_output2)
 
-    def test_masked_fill_shape_format_int64(self, device):
+    def test_masked_fill_shape_format_int64(self, device="npu"):
         format_list = [0]
         shape_list = [[1024], [32, 1024], [32, 8, 1024], [128, 32, 8, 1024]]
         value_list = [1.25,
@@ -140,6 +140,6 @@ class TestMaskedFill(TestCase):
             npu_output2 = npu_output2.astype(np.int32)
             self.assertRtolEqual(cpu_output2, npu_output2)
 
-instantiate_device_type_tests(TestMaskedFill, globals(), except_for="cpu")
+
 if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_network_ops/test_masked_fill_range.py b/test/test_network_ops/test_masked_fill_range.py
index 8da2a1a211054170557beaceb3d7fcd0280e46f3..8f5066bfcdab2581210857636072ddd3ca00e62c 100644
--- a/test/test_network_ops/test_masked_fill_range.py
+++ b/test/test_network_ops/test_masked_fill_range.py
@@ -16,9 +16,9 @@
 import torch
 import torch_npu
 import numpy as np
-from torch_npu.testing.common_utils import TestCase, run_tests
-from torch_npu.testing.common_device_type import instantiate_device_type_tests
-from torch_npu.testing.util_test import create_common_tensor
+from torch_npu.testing.testcase import TestCase, run_tests
+from torch_npu.testing.common_utils import create_common_tensor
+
 
 class TestMaskedFillRange(TestCase):
     def cpu_op_exec(self, input1, start, end, value, axis, dim):
@@ -48,7 +48,7 @@ class TestMaskedFillRange(TestCase):
         out = out.to("cpu")
         return out.detach().numpy()
 
-    def test_normalize_batch(self, device):
+    def test_normalize_batch(self, device="npu"):
         shape_format = [
            [[np.float32, -1, [32, 64, 1688]],
             [list(range(0, 32))],
@@ -79,6 +79,6 @@ class TestMaskedFillRange(TestCase):
             cpu_output = cpu_output.astype(npu_output.dtype)
             self.assertRtolEqual(cpu_output, npu_output)
 
-instantiate_device_type_tests(TestMaskedFillRange, globals(), except_for='cpu')
+
 if __name__ == "__main__":
     run_tests()
diff --git a/test/test_network_ops/test_masked_scatter.py b/test/test_network_ops/test_masked_scatter.py
index e509e089c390630c307350c2c0f4d80e613f43a3..a7ae68ab4a2d1f3f21d8518efd3b2f8c8db60a0a 100644
--- a/test/test_network_ops/test_masked_scatter.py
+++ b/test/test_network_ops/test_masked_scatter.py
@@ -18,9 +18,9 @@
 import torch
 import torch_npu
 import numpy as np
-from torch_npu.testing.common_utils import TestCase, run_tests
-from torch_npu.testing.common_device_type import instantiate_device_type_tests
-from torch_npu.testing.util_test import create_common_tensor
+from torch_npu.testing.testcase import TestCase, run_tests
+from torch_npu.testing.common_utils import create_common_tensor
+
 
 class TestMaskedScatter(TestCase):
     def cpu_op_exec(self, input1, maskbool, source):
@@ -45,7 +45,7 @@ class TestMaskedScatter(TestCase):
         npu_output = npu_output.to("cpu")
         return npu_output.numpy()
 
-    def test_masked_scatter_float(self, device):
+    def test_masked_scatter_float(self, device="npu"):
         dtype_list = [np.float32]
         format_list = [0, 3]
         shape_list = [[4, 5],[3, 4, 5], [2, 3, 4, 5]]
@@ -62,7 +62,7 @@ class TestMaskedScatter(TestCase):
             npu_output2 = self.npu_inp_op_exec(npu_input, maskbool, npu_source)
             self.assertRtolEqual(cpu_output2, npu_output2)
 
-    def test_masked_scatter_int(self, device):
+    def test_masked_scatter_int(self, device="npu"):
         dtype_list = [np.int32, np.int64]
         format_list = [0]
         shape_list = [[4, 5],[3, 4, 5], [2, 3, 4, 5]]
@@ -79,6 +79,6 @@ class TestMaskedScatter(TestCase):
             npu_output2 = self.npu_inp_op_exec(npu_input, maskbool, npu_source)
             self.assertRtolEqual(cpu_output2, npu_output2)
 
-instantiate_device_type_tests(TestMaskedScatter, globals(), except_for='cpu')
+
 if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_network_ops/test_masked_select.py b/test/test_network_ops/test_masked_select.py
index 9587eadd13a68876815e3ea5e6b390cc7db30361..ecd2d718dc1f59f3608e395b883d8c0d63ff2b39 100644
--- a/test/test_network_ops/test_masked_select.py
+++ b/test/test_network_ops/test_masked_select.py
@@ -14,14 +14,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from unittest import makeSuite
 import torch
 import torch_npu
 import numpy as np
-from torch_npu.testing.common_utils import TestCase, run_tests
-from torch_npu.testing.common_device_type import instantiate_device_type_tests
-from torch_npu.testing.util_test import create_common_tensor
+from torch_npu.testing.testcase import TestCase, run_tests
+from torch_npu.testing.common_utils import create_common_tensor
+
 
 
 class TestMaskedSelect(TestCase):
     def get_mask(self):
@@ -58,7 +57,7 @@ class TestMaskedSelect(TestCase):
         output = torch.masked_select(input1, mask, out=output)
         return output.detach().to("cpu").numpy()
 
-    def test_maskedselect_out_result(self, device):
+    def test_maskedselect_out_result(self, device="npu"):
         shape_format = [
             [[np.float16, 2, [15, 15, 15, 16]], [np.float16, 2, [15, 15, 15, 16]]],
             [[np.float16, 2, [15, 15, 15, 16]], [np.float16, 2, [3, 3, 7, 7]]],
@@ -80,7 +79,7 @@ class TestMaskedSelect(TestCase):
             cpu_output = cpu_output.astype(npu_output.dtype)
             self.assertRtolEqual(cpu_output, npu_output)
 
-    def test_maskedselect_shape_format_maskdiff(self, device):
+    def test_maskedselect_shape_format_maskdiff(self, device="npu"):
         dtype_list = [np.int64, np.int32, np.float32]
         format_list = [0]
         shape_list = [[3, 4, 5]]
@@ -94,7 +93,7 @@ class TestMaskedSelect(TestCase):
             npu_output = self.npu_op_exec(npu_input, mask_npu > 50)
             self.assertRtolEqual(cpu_output, npu_output)
 
-    def test_maskedselect_shape_format_fp32(self, device):
+    def test_maskedselect_shape_format_fp32(self, device="npu"):
         format_list = [0, 3]
         shape_list = [[3, 4, 5]]
         shape_format = [
@@ -108,7 +107,7 @@ class TestMaskedSelect(TestCase):
             npu_output = self.npu_op_exec(npu_input, mask)
             self.assertRtolEqual(cpu_output, npu_output)
 
-    def test_maskedselect_shape_format_int(self, device):
+    def test_maskedselect_shape_format_int(self, device="npu"):
         dtype_list = [np.int32, np.int64]
         format_list = [0]
         shape_list = [[3, 4, 5]]
@@ -123,6 +122,6 @@ class TestMaskedSelect(TestCase):
             npu_output = self.npu_op_exec(npu_input, mask)
             self.assertRtolEqual(cpu_output, npu_output)
 
-instantiate_device_type_tests(TestMaskedSelect, globals(), except_for="cpu")
+
 if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_network_ops/test_matmul.py b/test/test_network_ops/test_matmul.py
index 87b8841154d3559c6d4d2d79982a46ca18bc2393..9ce02898d3115c022eec8b33466dc14a1116e6a6 100644
--- a/test/test_network_ops/test_matmul.py
+++ b/test/test_network_ops/test_matmul.py
@@ -17,9 +17,8 @@
 import torch_npu
 import numpy as np
 import torch.nn as nn
-from torch_npu.testing.common_utils import TestCase, run_tests
-from torch_npu.testing.common_device_type import instantiate_device_type_tests
-from torch_npu.testing.util_test import create_common_tensor
+from torch_npu.testing.testcase import TestCase, run_tests
+from torch_npu.testing.common_utils import create_common_tensor
 
 
 class TestMatMul(TestCase):
@@ -62,7 +61,7 @@ class TestMatMul(TestCase):
         self.assertRtolEqual(cpu_mat1_grad.astype(npu_mat1_grad.dtype), npu_mat1_grad)
         self.assertRtolEqual(cpu_mat2_grad.astype(npu_mat2_grad.dtype), npu_mat2_grad)
 
-    def test_matmul_backward_shape_format_fp16_case1(self, device):
+    def test_matmul_backward_shape_format_fp16_case1(self, device="npu"):
         shape_format = [
             # mat1 1dim, mat2 1dim
             [[np.float16, 2, [5]], [np.float16, 2, [5]]],
@@ -70,7 +69,7 @@
         ]
         self.matmul_backward_result(shape_format)
 
-    def test_matmul_backward_shape_format_fp16_case3(self, device):
+    def
test_matmul_backward_shape_format_fp16_case3(self, device="npu"): shape_format = [ # mat1 1dim, mat2 2dim [[np.float16, 2, [5]], [np.float16, 2, [5,6]]], @@ -79,7 +78,7 @@ class TestMatMul(TestCase): ] self.matmul_backward_result(shape_format) - def test_matmul_backward_shape_format_fp16_case4(self, device): + def test_matmul_backward_shape_format_fp16_case4(self, device="npu"): shape_format = [ # mat1 1dim, mat2 2dim [[np.float16, 2, [5,7]], [np.float16, 2, [7,10]]], @@ -87,7 +86,7 @@ class TestMatMul(TestCase): ] self.matmul_backward_result(shape_format) - def test_matmul_backward_shape_format_fp16_case5(self, device): + def test_matmul_backward_shape_format_fp16_case5(self, device="npu"): shape_format = [ # mat1 1dim, mat2 2dim [[np.float16, 2, [4,5,10]], [np.float16, 2, [10]]], @@ -97,7 +96,7 @@ class TestMatMul(TestCase): ] self.matmul_backward_result(shape_format) - def test_matmul_backward_shape_format_fp16_case6(self, device): + def test_matmul_backward_shape_format_fp16_case6(self, device="npu"): shape_format = [ # mat1 >2dim, mat2 2dim [[np.float16, 2, [5,7,10]], [np.float16, 2, [10,16]]], @@ -106,7 +105,7 @@ class TestMatMul(TestCase): ] self.matmul_backward_result(shape_format) - def test_matmul_backward_shape_format_fp16_case7(self, device): + def test_matmul_backward_shape_format_fp16_case7(self, device="npu"): shape_format = [ # mat1 1dim, mat2 >2dim [[np.float16, 2, [3,]], [np.float16, 2, [2,3,2]]], @@ -114,7 +113,7 @@ class TestMatMul(TestCase): ] self.matmul_backward_result(shape_format) - def test_matmul_backward_shape_format_fp16_case8(self, device): + def test_matmul_backward_shape_format_fp16_case8(self, device="npu"): shape_format = [ # mat1 2dim, mat2 >2dim [[np.float16, 2, [2,3]], [np.float16, 2, [2,3,2]]], @@ -123,7 +122,7 @@ class TestMatMul(TestCase): ] self.matmul_backward_result(shape_format) - def test_matmul_backward_shape_format_fp16_case9(self, device): + def test_matmul_backward_shape_format_fp16_case9(self, device="npu"): shape_format = [ [[np.float16, 2, [5,7,10]], [np.float16, 2, [5,10,15]]], [[np.float16, 2, [68,75,16]], [np.float16, 2, [68,16,43]]], @@ -133,6 +132,6 @@ class TestMatMul(TestCase): ] self.matmul_backward_result(shape_format) -instantiate_device_type_tests(TestMatMul, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_max.py b/test/test_network_ops/test_max.py index 7a49ddced442962ce00a60b68503a02166532186..002204630a7713d056b9b030bb2f167dabf79037 100644 --- a/test/test_network_ops/test_max.py +++ b/test/test_network_ops/test_max.py @@ -11,14 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import copy + import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor class TestMax(TestCase): @@ -216,7 +215,7 @@ class TestMax(TestCase): self.assertRtolEqual(cpu_output_amax, npu_output_amax) self.assertRtolEqual(cpu_output_amax, npu_output_amax_out) - def test_max_out_result(self, device): + def test_max_out_result(self, device="npu"): shape_format = [ [[np.float16, 0, [9, 10, 14, 14]], [np.float16, 0, [7, 10, 1, 1]]], [[np.float16, 0, [9, 7, 12, 12]], [np.float16, 0, [7, 7, 1, 1]]], @@ -225,14 +224,14 @@ class TestMax(TestCase): ] self.max_out_result_other(shape_format) - def test_max_shape_format_fp16_1d(self, device): + def test_max_shape_format_fp16_1d(self, device="npu"): format_list = [0, 3] keepdim_list = [True, False] shape_format = [[[np.float16, i, [8]], np.random.randint(0, 1), j] for i in format_list for j in keepdim_list ] self.max_result(shape_format) - def test_max_shape_format_fp32_1d(self, device): + def test_max_shape_format_fp32_1d(self, device="npu"): format_list = [0, 3] keepdim_list = [True, False] shape_format = [[[np.float32, i, [8]], np.random.randint(0, 1), j] for i in format_list for j in @@ -240,7 +239,7 @@ class TestMax(TestCase): ] self.max_result(shape_format) - def test_max_shape_format_fp16_2d(self, device): + def test_max_shape_format_fp16_2d(self, device="npu"): format_list = [0, 3] keepdim_list = [True, False] shape_format = [[[np.float16, i, [8, 7]], np.random.randint(0, 2), j] for i in format_list for j in @@ -248,7 +247,7 @@ class TestMax(TestCase): ] self.max_result(shape_format) - def test_max_shape_format_fp32_2d(self, device): + def test_max_shape_format_fp32_2d(self, device="npu"): format_list = [0, 3] keepdim_list = [True, False] shape_format = [[[np.float32, i, [8, 7]], np.random.randint(0, 2), j] for i in format_list for j in @@ -256,7 +255,7 @@ class TestMax(TestCase): ] self.max_result(shape_format) - def test_max_shape_format_fp16_3d(self, device): + def test_max_shape_format_fp16_3d(self, device="npu"): format_list = [0, 3, 4, 29] keepdim_list = [True, False] shape_format = [[[np.float16, i, [8, 7, 9]], np.random.randint(0, 3), j] for i in format_list for j in @@ -264,7 +263,7 @@ class TestMax(TestCase): ] self.max_result(shape_format) - def test_max_shape_format_fp32_3d(self, device): + def test_max_shape_format_fp32_3d(self, device="npu"): format_list = [0, 3] keepdim_list = [True, False] shape_format = [[[np.float32, i, [8, 7, 9]], np.random.randint(0, 3), j] for i in format_list for j in @@ -272,7 +271,7 @@ class TestMax(TestCase): ] self.max_result(shape_format) - def test_max_shape_format_fp16_4d(self, device): + def test_max_shape_format_fp16_4d(self, device="npu"): format_list = [0, 3, 4, 29] keepdim_list = [True, False] shape_format = [[[np.float16, i, [8, 7, 9, 10]], np.random.randint(0, 4), j] for i in format_list for j @@ -280,7 +279,7 @@ class TestMax(TestCase): ] self.max_result(shape_format) - def test_max_shape_format_fp32_4d(self, device): + def test_max_shape_format_fp32_4d(self, device="npu"): format_list = [0, 3, 4, 29] keepdim_list = [True, False] shape_format = [[[np.float32, i, [8, 7, 9, 10]], np.random.randint(0, 4), j] for i in format_list for j @@ -289,14 +288,14 @@ class TestMax(TestCase): 
] self.max_result(shape_format) - def test_max_dim_shape_format_fp16_1d(self, device): + def test_max_dim_shape_format_fp16_1d(self, device="npu"): format_list = [0, 3, 4] keepdim_list = [True, False] shape_format = [[[np.float16, i, [8]], np.random.randint(0, 1), j] for i in format_list for j in keepdim_list ] self.max_result_dim(shape_format) - def test_max_dim_shape_format_fp32_1d(self, device): + def test_max_dim_shape_format_fp32_1d(self, device="npu"): format_list = [0, 3, 4] keepdim_list = [True, False] shape_format = [[[np.float32, i, [8]], np.random.randint(0, 1), j] for i in format_list for j in @@ -304,7 +303,7 @@ class TestMax(TestCase): ] self.max_result_dim(shape_format) - def test_max_dim_shape_format_fp16_2d(self, device): + def test_max_dim_shape_format_fp16_2d(self, device="npu"): format_list = [0, 3, 4, 29] keepdim_list = [True, False] shape_format = [[[np.float16, i, [8, 7]], np.random.randint(0, 2), j] for i in format_list for j in @@ -312,7 +311,7 @@ class TestMax(TestCase): ] self.max_result_dim(shape_format) - def test_max_dim_shape_format_fp32_2d(self, device): + def test_max_dim_shape_format_fp32_2d(self, device="npu"): format_list = [0, 3, 4, 29] keepdim_list = [True, False] shape_format = [[[np.float32, i, [8, 7]], np.random.randint(0, 2), j] for i in format_list for j in @@ -320,7 +319,7 @@ class TestMax(TestCase): ] self.max_result_dim(shape_format) - def test_max_dim_shape_format_fp16_3d(self, device): + def test_max_dim_shape_format_fp16_3d(self, device="npu"): format_list = [0, 3, 4, 29] keepdim_list = [True, False] shape_format = [[[np.float16, i, [8, 7, 9]], np.random.randint(0, 3), j] for i in format_list for j in @@ -328,7 +327,7 @@ class TestMax(TestCase): ] self.max_result_dim(shape_format) - def test_max_dim_shape_format_fp32_3d(self, device): + def test_max_dim_shape_format_fp32_3d(self, device="npu"): format_list = [0, 3, 4, 29] keepdim_list = [True, False] shape_format = [[[np.float32, i, [8, 7, 9]], np.random.randint(0, 3), j] for i in format_list for j in @@ -336,7 +335,7 @@ class TestMax(TestCase): ] self.max_result_dim(shape_format) - def test_max_dim_shape_format_fp16_4d(self, device): + def test_max_dim_shape_format_fp16_4d(self, device="npu"): format_list = [0, 3, 4, 29] keepdim_list = [True, False] shape_format = [[[np.float16, i, [8, 7, 9, 10]], np.random.randint(0, 4), j] for i in format_list for j @@ -344,7 +343,7 @@ class TestMax(TestCase): ] self.max_result_dim(shape_format) - def test_max_dim_shape_format_fp32_4d(self, device): + def test_max_dim_shape_format_fp32_4d(self, device="npu"): format_list = [0, 3, 4, 29] keepdim_list = [True, False] shape_format = [[[np.float32, i, [8, 7, 9, 10]], np.random.randint(0, 4), j] for i in format_list for j @@ -352,14 +351,14 @@ class TestMax(TestCase): ] self.max_result_dim(shape_format) - def test_max_other_shape_format_fp16_1d(self, device): + def test_max_other_shape_format_fp16_1d(self, device="npu"): format_list = [0, 3, 4] keepdim_list = [True, False] shape_format = [[[np.float16, i, [8]], np.random.randint(0, 1), j] for i in format_list for j in keepdim_list ] self.max_result_other(shape_format) - def test_max_other_shape_format_fp32_1d(self, device): + def test_max_other_shape_format_fp32_1d(self, device="npu"): format_list = [0, 3, 4] keepdim_list = [True, False] shape_format = [[[np.float32, i, [8]], np.random.randint(0, 1), j] for i in format_list for j in @@ -367,7 +366,7 @@ class TestMax(TestCase): ] self.max_result_other(shape_format) - def 
test_max_other_shape_format_fp16_2d(self, device): + def test_max_other_shape_format_fp16_2d(self, device="npu"): format_list = [0, 3, 4, 29] keepdim_list = [True, False] shape_format = [[[np.float16, i, [8, 7]], np.random.randint(0, 2), j] for i in format_list for j in @@ -375,7 +374,7 @@ class TestMax(TestCase): ] self.max_result_other(shape_format) - def test_max_other_shape_format_fp32_2d(self, device): + def test_max_other_shape_format_fp32_2d(self, device="npu"): format_list = [0, 3, 4, 29] keepdim_list = [True, False] shape_format = [[[np.float32, i, [8, 7]], np.random.randint(0, 2), j] for i in format_list for j in @@ -383,7 +382,7 @@ class TestMax(TestCase): ] self.max_result_other(shape_format) - def test_max_other_shape_format_fp16_3d(self, device): + def test_max_other_shape_format_fp16_3d(self, device="npu"): format_list = [0, 3, 4, 29] keepdim_list = [True, False] shape_format = [[[np.float16, i, [8, 7, 9]], np.random.randint(0, 3), j] for i in format_list for j in @@ -391,7 +390,7 @@ class TestMax(TestCase): ] self.max_result_other(shape_format) - def test_max_other_shape_format_fp32_3d(self, device): + def test_max_other_shape_format_fp32_3d(self, device="npu"): format_list = [0, 3, 4, 29] keepdim_list = [True, False] shape_format = [[[np.float32, i, [8, 7, 9]], np.random.randint(0, 3), j] for i in format_list for j in @@ -399,7 +398,7 @@ class TestMax(TestCase): ] self.max_result_other(shape_format) - def test_max_other_shape_format_fp16_4d(self, device): + def test_max_other_shape_format_fp16_4d(self, device="npu"): format_list = [0, 3, 4, 29] keepdim_list = [True, False] shape_format = [[[np.float16, i, [8, 7, 9, 10]], np.random.randint(0, 4), j] for i in format_list for j @@ -407,7 +406,7 @@ class TestMax(TestCase): ] self.max_result_other(shape_format) - def test_max_other_shape_format_fp32_4d(self, device): + def test_max_other_shape_format_fp32_4d(self, device="npu"): format_list = [0, 3, 4, 29] keepdim_list = [True, False] shape_format = [[[np.float32, i, [8, 7, 9, 10]], np.random.randint(0, 4), j] for i in format_list for j @@ -416,7 +415,7 @@ class TestMax(TestCase): ] self.max_result_other(shape_format) - def test_max_dimname_shape_format(self, device): + def test_max_dimname_shape_format(self, device="npu"): format_list = [0, 3, 4, 29] keepdim_list = [True, False] shape_format = [[[np.float32, i, [8, 7, 9, 10], ('N', 'C', 'H', 'W')], @@ -426,7 +425,7 @@ class TestMax(TestCase): ] self.max_name_result_other(shape_format) - def test_max_dimname_shape_format_fp16(self, device): + def test_max_dimname_shape_format_fp16(self, device="npu"): format_list = [0, 3, 4, 29] keepdim_list = [True, False] shape_format = [[[np.float16, i, [8, 7, 9, 10], ('N', 'C', 'H', 'W')], @@ -436,7 +435,7 @@ class TestMax(TestCase): ] self.max_name_result_other(shape_format) - def test_max_dimname_out_shape_format(self, device): + def test_max_dimname_out_shape_format(self, device="npu"): format_list = [0, 3, 4, 29] keepdim_list = [True, False] shape_format = [[[np.float32, i, [8, 7, 9, 10], ('N', 'C', 'H', 'W')], @@ -446,7 +445,7 @@ class TestMax(TestCase): ] self.max_name_out_result_other(shape_format) - def test_max_dimname_out_shape_format_fp16(self, device): + def test_max_dimname_out_shape_format_fp16(self, device="npu"): format_list = [0, 3, 4, 29] keepdim_list = [True, False] shape_format = [[[np.float16, i, [8, 7, 9, 10], ('N', 'C', 'H', 'W')], @@ -456,14 +455,14 @@ class TestMax(TestCase): ] self.max_name_out_result_other(shape_format) - def test_amax_shape_format_fp16_1d(self, 
device): + def test_amax_shape_format_fp16_1d(self, device="npu"): format_list = [0, 3, 4] keepdim_list = [True, False] shape_format = [[[np.float16, i, [8]], np.random.randint(0, 1), j] for i in format_list for j in keepdim_list ] self.amax_result(shape_format) - def test_amax_shape_format_fp32_1d(self, device): + def test_amax_shape_format_fp32_1d(self, device="npu"): format_list = [0, 3, 4] keepdim_list = [True, False] shape_format = [[[np.float32, i, [8]], np.random.randint(0, 1), j] for i in format_list for j in @@ -471,7 +470,7 @@ class TestMax(TestCase): ] self.amax_result(shape_format) - def test_amax_shape_format_fp16_2d(self, device): + def test_amax_shape_format_fp16_2d(self, device="npu"): format_list = [0, 3, 4] keepdim_list = [True, False] shape_format = [[[np.float16, i, [8, 7]], np.random.randint(0, 2), j] for i in format_list for j in @@ -479,7 +478,7 @@ class TestMax(TestCase): ] self.amax_result(shape_format) - def test_amax_shape_format_fp32_2d(self, device): + def test_amax_shape_format_fp32_2d(self, device="npu"): format_list = [0, 3, 4] keepdim_list = [True, False] shape_format = [[[np.float32, i, [8, 7]], np.random.randint(0, 2), j] for i in format_list for j in @@ -487,7 +486,7 @@ class TestMax(TestCase): ] self.amax_result(shape_format) - def test_amax_shape_format_fp16_3d(self, device): + def test_amax_shape_format_fp16_3d(self, device="npu"): format_list = [0, 3, 4, 29] keepdim_list = [True, False] shape_format = [[[np.float16, i, [8, 7, 9]], np.random.randint(0, 3), j] for i in format_list for j in @@ -495,7 +494,7 @@ class TestMax(TestCase): ] self.amax_result(shape_format) - def test_amax_shape_format_fp32_3d(self, device): + def test_amax_shape_format_fp32_3d(self, device="npu"): format_list = [0, 3, 4, 29] keepdim_list = [True, False] shape_format = [[[np.float32, i, [8, 7, 9]], np.random.randint(0, 3), j] for i in format_list for j in @@ -503,7 +502,7 @@ class TestMax(TestCase): ] self.amax_result(shape_format) - def test_amax_shape_format_fp16_4d(self, device): + def test_amax_shape_format_fp16_4d(self, device="npu"): format_list = [0, 3, 4, 29] keepdim_list = [True, False] shape_format = [[[np.float16, i, [8, 7, 9, 10]], np.random.randint(0, 4), j] for i in format_list for j @@ -511,7 +510,7 @@ class TestMax(TestCase): ] self.amax_result(shape_format) - def test_amax_shape_format_fp32_4d(self, device): + def test_amax_shape_format_fp32_4d(self, device="npu"): format_list = [0, 3, 4, 29] keepdim_list = [True, False] shape_format = [[[np.float32, i, [8, 7, 9, 10]], np.random.randint(0, 4), j] for i in format_list for j @@ -519,6 +518,6 @@ class TestMax(TestCase): ] self.amax_result(shape_format) -instantiate_device_type_tests(TestMax, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_max_pool2d_with_indices.py b/test/test_network_ops/test_max_pool2d_with_indices.py index 03a42c0978ff9454099a91216b72251b4175bf3c..f3cc1a1df6ed0bb3dd6c47cb677bf48587e50483 100644 --- a/test/test_network_ops/test_max_pool2d_with_indices.py +++ b/test/test_network_ops/test_max_pool2d_with_indices.py @@ -19,9 +19,8 @@ import torch_npu import numpy as np import torch.nn.functional as F -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor class 
TestMaxPool2dWithIndices(TestCase): @@ -37,7 +36,7 @@ class TestMaxPool2dWithIndices(TestCase): output2 = argMaxNpu.to("cpu").detach() return output1, output2 - def test_max_pool2d_with_indices_fp16(self, device): + def test_max_pool2d_with_indices_fp16(self, device="npu"): shape_format = [ [[np.float16, 0, [256, 64, 112, 112]], [3, 3], [2, 2], 1, 1, False], [[np.float16, 0, [1024, 24, 112, 112]], [3, 3], [2, 2], 1, 1, False], @@ -64,7 +63,7 @@ class TestMaxPool2dWithIndices(TestCase): self.assertRtolEqual(cpu_output.numpy(), npu_output.numpy()) - def test_max_pool2d_with_indices_fp32(self, device): + def test_max_pool2d_with_indices_fp32(self, device="npu"): shape_format = [ [[np.float32, 0, [256, 64, 112, 112]], [3, 3], [2, 2], 1, 1, False], [[np.float32, 0, [1024, 24, 112, 112]], [3, 3], [2, 2], 1, 1, False], @@ -88,6 +87,5 @@ class TestMaxPool2dWithIndices(TestCase): self.assertRtolEqual(cpu_output.numpy(), npu_output.numpy(), prec=1.e-3) -instantiate_device_type_tests(TestMaxPool2dWithIndices, globals(), except_for='cpu') if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_max_pool2d_with_indices_backward.py b/test/test_network_ops/test_max_pool2d_with_indices_backward.py index 35a9d575610dc588061113acc1d0ca12d972b010..4be67b71632f07680c7a1619f51471c5a81d5c62 100644 --- a/test/test_network_ops/test_max_pool2d_with_indices_backward.py +++ b/test/test_network_ops/test_max_pool2d_with_indices_backward.py @@ -19,9 +19,8 @@ import torch_npu import numpy as np import torch.nn.functional as F -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor class TestMaxPool2dWithIndicesBackward(TestCase): @@ -46,7 +45,7 @@ class TestMaxPool2dWithIndicesBackward(TestCase): output1 = dataNpu.to("cpu").detach() return output1, npu_grad - def test_max_pool2d_with_indices_backward_fp16(self, device): + def test_max_pool2d_with_indices_backward_fp16(self, device="npu"): shape_format = [ [[np.float16, 3, [256, 64, 112, 112]], [3, 3], [2, 2], 1, 1, False], [[np.float16, 3, [1024, 24, 112, 112]], [3, 3], [2, 2], 1, 1, False], @@ -67,7 +66,7 @@ class TestMaxPool2dWithIndicesBackward(TestCase): self.assertRtolEqual(cpu_grad.numpy(), npu_grad.numpy()) - def test_max_pool2d_with_indices_backward_fp32(self, device): + def test_max_pool2d_with_indices_backward_fp32(self, device="npu"): shape_format = [ [[np.float16, 3, [256, 64, 112, 112]], [3, 3], [2, 2], 1, 1, False], [[np.float16, 3, [1024, 24, 112, 112]], [3, 3], [2, 2], 1, 1, False], @@ -87,7 +86,7 @@ class TestMaxPool2dWithIndicesBackward(TestCase): self.assertRtolEqual(cpu_output.numpy(), npu_output.numpy(), prec=1.e-3) self.assertRtolEqual(cpu_grad.numpy(), npu_grad.numpy(), prec=1.e-3) -instantiate_device_type_tests(TestMaxPool2dWithIndicesBackward, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_max_pool3d_with_indices.py b/test/test_network_ops/test_max_pool3d_with_indices.py index de954330aa78203f5d3d493cb99d314917cf2b7a..669280a1a8f8e4f31a075b5c4760e34b6ea1595f 100644 --- a/test/test_network_ops/test_max_pool3d_with_indices.py +++ b/test/test_network_ops/test_max_pool3d_with_indices.py @@ -19,9 +19,9 @@ import torch_npu import numpy as np import torch.nn.functional as F -from 
torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestMaxPool3dWithIndices(TestCase): def cpu_op_exec(self, inputCpu, kernel_size, stride, padding, dilation, ceil_mode): @@ -36,7 +36,7 @@ class TestMaxPool3dWithIndices(TestCase): output2 = argMaxNpu.to("cpu").detach() return output1, output2 - def test_max_pool3d_with_indices(self, device): + def test_max_pool3d_with_indices(self, device="npu"): shape_format = [ [np.float16, 30, [1, 3, 19, 19, 19], [3, 3, 3], [2, 2, 2], 1, 1, False], [np.float16, 30, [3, 3, 124, 112, 112], 3, [2, 2, 2], 1, 1, True], @@ -56,6 +56,6 @@ class TestMaxPool3dWithIndices(TestCase): self.assertRtolEqual(cpu_output.numpy(), npu_output.numpy()) -instantiate_device_type_tests(TestMaxPool3dWithIndices, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_max_pool3d_withindices_backward.py b/test/test_network_ops/test_max_pool3d_withindices_backward.py index bf392bcd7654d0e067e694100fa179893a44f3f7..3ac21f893686c1d229ae84fe54c9dc10616473f1 100644 --- a/test/test_network_ops/test_max_pool3d_withindices_backward.py +++ b/test/test_network_ops/test_max_pool3d_withindices_backward.py @@ -19,9 +19,9 @@ import torch_npu import numpy as np import torch.nn.functional as F -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestMaxPool3dWithIndicesBackward(TestCase): def cpu_op_exec(self, inputCpu, kernel_size, stride, padding, dilation, ceil_mode): @@ -49,7 +49,7 @@ class TestMaxPool3dWithIndicesBackward(TestCase): output1 = dataNpu.to("cpu").detach() return output1, npu_grad - def test_max_pool3d_backward_shape_format(self, device): + def test_max_pool3d_backward_shape_format(self, device="npu"): shape_format = [ [np.float16, 30, [1, 3, 19, 19, 19], [3, 3, 3], [2, 2, 2], 1, 1, False], [np.float16, 30, [3, 3, 124, 112, 112], 3, [2, 2, 2], 1, 1, True], @@ -70,6 +70,6 @@ class TestMaxPool3dWithIndicesBackward(TestCase): self.assertRtolEqual(cpu_output.numpy(), npu_output.numpy()) self.assertRtolEqual(cpu_grad.numpy(), npu_grad.numpy()) -instantiate_device_type_tests(TestMaxPool3dWithIndicesBackward, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_max_unpool2d.py b/test/test_network_ops/test_max_unpool2d.py index ba14a812163b5eb8294bcc44c8078595c82baf56..33e174c17a11a708e73670055bbea972e216631d 100644 --- a/test/test_network_ops/test_max_unpool2d.py +++ b/test/test_network_ops/test_max_unpool2d.py @@ -15,11 +15,11 @@ import torch import torch_npu -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.testcase import TestCase, run_tests + class TestMaxunpool2d(TestCase): - def test_max_unpool2d(self, device): + def test_max_unpool2d(self, device="npu"): input1 = torch.tensor([[[[1., 2., 3., 4.], [5., 6., 7., 8.], [9., 10. , 11. 
, 12.], [13., 14., 15., 16.]]]]) pool2d = torch.nn.MaxPool2d(2, stride=2, return_indices=True) out, ind = pool2d(input1) @@ -28,6 +28,6 @@ class TestMaxunpool2d(TestCase): cpu_out = unpool2d(out, ind) self.assertRtolEqual(cpu_out, npu_out.cpu()) -instantiate_device_type_tests(TestMaxunpool2d, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_max_unpool2d_backward.py b/test/test_network_ops/test_max_unpool2d_backward.py index b01008ce7a0b58cbbb7d867a97b56148a6419883..7c00e2a9d71f6da5aa20d6e13767abbe105da688 100644 --- a/test/test_network_ops/test_max_unpool2d_backward.py +++ b/test/test_network_ops/test_max_unpool2d_backward.py @@ -15,11 +15,11 @@ import torch import torch_npu -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.testcase import TestCase, run_tests + class TestMaxunpool2dBackward(TestCase): - def test_maxunpool2d_backward(self, device): + def test_maxunpool2d_backward(self, device="npu"): input1 = torch.tensor([[[[1., 2, 3, 4], [5, 6, 7, 8], [9, 10 , 11 , 12], [13, 14, 15, 16]]]]) pool2d = torch.nn.MaxPool2d(2, stride = 2, return_indices = True) out, ind = pool2d(input1) @@ -45,6 +45,6 @@ class TestMaxunpool2dBackward(TestCase): npu_grad = npu_upinput.grad self.assertRtolEqual(cpu_grad, npu_grad.cpu()) -instantiate_device_type_tests(TestMaxunpool2dBackward, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_max_unpool3d.py b/test/test_network_ops/test_max_unpool3d.py index 70da0ba6473b8f86f3824c5ad730f0c479b28b2f..bf756805e3b9f449f759fb600897f0ea755374b8 100644 --- a/test/test_network_ops/test_max_unpool3d.py +++ b/test/test_network_ops/test_max_unpool3d.py @@ -17,9 +17,9 @@ import torch_npu import torch.nn as nn import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestMaxUnpool3d(TestCase): def cpu_op_exec(self, input1): @@ -41,7 +41,7 @@ class TestMaxUnpool3d(TestCase): unpooled_output = unpooled_output.cpu() return unpooled_output - def test_max_unpool3d_shape_format(self, device): + def test_max_unpool3d_shape_format(self, device="npu"): dtype_list = [np.float32, np.float16] format_list = [-1] shape_list = [(20, 16, 51, 33, 15)] @@ -57,6 +57,6 @@ class TestMaxUnpool3d(TestCase): npu_output = self.npu_op_exec(npu_input) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestMaxUnpool3d, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_max_unpool3d_backward.py b/test/test_network_ops/test_max_unpool3d_backward.py index 1bedd9cb772e54275d17e25fa8d3c96469fcc835..fb8feb1b73abf623ff2b5a9158ca321d4f04e03f 100644 --- a/test/test_network_ops/test_max_unpool3d_backward.py +++ b/test/test_network_ops/test_max_unpool3d_backward.py @@ -17,9 +17,9 @@ import torch_npu import torch.nn as nn import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests 
+from torch_npu.testing.common_utils import create_common_tensor + class TestMaxUnpool3dBackward(TestCase): def cpu_op_exec(self, input1): @@ -52,7 +52,7 @@ class TestMaxUnpool3dBackward(TestCase): unpooled_output = unpooled_output.cpu().detach() return unpooled_output, unpool_input_grad - def test_max_unpool3d_backward_shape_format(self, device): + def test_max_unpool3d_backward_shape_format(self, device="npu"): dtype_list = [np.float32, np.float16] format_list = [-1] shape_list = [(20, 16, 51, 33, 15)] @@ -71,6 +71,6 @@ class TestMaxUnpool3dBackward(TestCase): self.assertRtolEqual(cpu_output, npu_output) self.assertRtolEqual(cpu_unpool_input_grad, npu_unpool_input_grad) -instantiate_device_type_tests(TestMaxUnpool3dBackward, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_median.py b/test/test_network_ops/test_median.py index 50cae3ef79a15e32622a30a7740700504fd7b7ab..c9dc0f701ba51632c61d4f12e12af759bb462da5 100644 --- a/test/test_network_ops/test_median.py +++ b/test/test_network_ops/test_median.py @@ -14,14 +14,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor class TestMedian(TestCase): @@ -55,7 +53,7 @@ class TestMedian(TestCase): output2 = input3.to("cpu").numpy() return output1, output2 - def test_median_shape_format(self, device): + def test_median_shape_format(self, device="npu"): shape_format = [ [np.float16, -1, (10,)], [np.float16, 3, (4, 4, 4)], @@ -67,7 +65,7 @@ class TestMedian(TestCase): npu_output = self.npu_op_exec(npu_input) self.assertRtolEqual(cpu_output, npu_output) - def test_median_dim_shape_format(self, device): + def test_median_dim_shape_format(self, device="npu"): shape_format = [ [[np.float16, -1, (10,)], 0, False], [[np.float16, 0, (1, 2, 3, 4)], 1, False], @@ -86,6 +84,6 @@ class TestMedian(TestCase): self.assertRtolEqual(npu_output1_out, npu_output1) self.assertRtolEqual(npu_output2_out, npu_output2) -instantiate_device_type_tests(TestMedian, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_min.py b/test/test_network_ops/test_min.py index e2cedfa427d6521bd57eae08ecdf58e55ba125c0..f47a95de008efae0195dd6b0a7951c8012b7caa4 100644 --- a/test/test_network_ops/test_min.py +++ b/test/test_network_ops/test_min.py @@ -16,9 +16,8 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor class TestMin(TestCase): @@ -208,7 +207,7 @@ class TestMin(TestCase): self.assertRtolEqual(cpu_output_amin, npu_output_amin) self.assertRtolEqual(cpu_output_amin, npu_output_amin_out) - def test_min_out_result(self, device): + def test_min_out_result(self, device="npu"): shape_format = [ [[np.float16, 0, [9, 10, 14, 14]], [np.float16, 0, [7, 10, 1, 1]]], [[np.float16, 0, [9, 
7, 12, 12]], [np.float16, 0, [7, 7, 1, 1]]], @@ -217,14 +216,14 @@ class TestMin(TestCase): ] self.min_out_result_other(shape_format) - def test_min_shape_format_fp16_1d(self, device): + def test_min_shape_format_fp16_1d(self, device="npu"): format_list = [0, 3] keepdim_list = [True, False] shape_format = [[[np.float16, i, [8]], np.random.randint(0, 1), j] for i in format_list for j in keepdim_list ] self.min_result(shape_format) - def test_min_shape_format_fp32_1d(self, device): + def test_min_shape_format_fp32_1d(self, device="npu"): format_list = [0, 3] keepdim_list = [True, False] shape_format = [[[np.float32, i, [8]], np.random.randint(0, 1), j] for i in format_list for j in @@ -232,7 +231,7 @@ class TestMin(TestCase): ] self.min_result(shape_format) - def test_min_shape_format_fp16_2d(self, device): + def test_min_shape_format_fp16_2d(self, device="npu"): format_list = [0, 3] keepdim_list = [True, False] shape_format = [[[np.float16, i, [8, 7]], np.random.randint(0, 1), j] for i in format_list for j in @@ -240,7 +239,7 @@ class TestMin(TestCase): ] self.min_result(shape_format) - def test_min_shape_format_fp32_2d(self, device): + def test_min_shape_format_fp32_2d(self, device="npu"): format_list = [0, 3] keepdim_list = [True, False] shape_format = [[[np.float32, i, [8, 7]], np.random.randint(0, 1), j] for i in format_list for j in @@ -248,7 +247,7 @@ class TestMin(TestCase): ] self.min_result(shape_format) - def test_min_shape_format_fp16_3d(self, device): + def test_min_shape_format_fp16_3d(self, device="npu"): format_list = [0, 3, 29] keepdim_list = [True, False] shape_format = [[[np.float16, i, [8, 7, 9]], np.random.randint(0, 1), j] for i in format_list for j in @@ -256,7 +255,7 @@ class TestMin(TestCase): ] self.min_result(shape_format) - def test_min_shape_format_fp32_3d(self, device): + def test_min_shape_format_fp32_3d(self, device="npu"): format_list = [0, 3, 29] keepdim_list = [True, False] shape_format = [[[np.float32, i, [8, 7, 9]], np.random.randint(0, 1), j] for i in format_list for j in @@ -264,7 +263,7 @@ class TestMin(TestCase): ] self.min_result(shape_format) - def test_min_shape_format_fp16_4d(self, device): + def test_min_shape_format_fp16_4d(self, device="npu"): format_list = [0, 4, 3, 29] keepdim_list = [True, False] shape_format = [[[np.float16, i, [8, 7, 9, 10]], np.random.randint(0, 1), j] for i in format_list for j in @@ -272,7 +271,7 @@ class TestMin(TestCase): ] self.min_result(shape_format) - def test_min_shape_format_fp32_4d(self, device): + def test_min_shape_format_fp32_4d(self, device="npu"): format_list = [0, 3, 4, 29] keepdim_list = [True, False] shape_format = [[[np.float32, i, [8, 7, 9, 10]], np.random.randint(0, 1), j] for i in format_list for j in @@ -280,14 +279,14 @@ class TestMin(TestCase): ] self.min_result(shape_format) - def test_min_dim_shape_format_fp16_1d(self, device): + def test_min_dim_shape_format_fp16_1d(self, device="npu"): format_list = [0, 3, 4, 29] keepdim_list = [True, False] shape_format = [[[np.float16, i, [8]], np.random.randint(0, 1), j] for i in format_list for j in keepdim_list ] self.min_result_dim(shape_format) - def test_min_dim_shape_format_fp32_1d(self, device): + def test_min_dim_shape_format_fp32_1d(self, device="npu"): format_list = [0, 3, 4, 29] keepdim_list = [True, False] shape_format = [[[np.float32, i, [8]], np.random.randint(0, 1), j] for i in format_list for j in @@ -295,7 +294,7 @@ class TestMin(TestCase): ] self.min_result_dim(shape_format) - def test_min_dim_shape_format_fp16_2d(self, device): + def 
test_min_dim_shape_format_fp16_2d(self, device="npu"): format_list = [0, 3, 4, 29] keepdim_list = [True, False] shape_format = [[[np.float16, i, [8, 7]], np.random.randint(0, 2), j] for i in format_list for j in @@ -303,7 +302,7 @@ class TestMin(TestCase): ] self.min_result_dim(shape_format) - def test_min_dim_shape_format_fp32_2d(self, device): + def test_min_dim_shape_format_fp32_2d(self, device="npu"): format_list = [0, 3, 4, 29] keepdim_list = [True, False] shape_format = [[[np.float32, i, [8, 7]], np.random.randint(0, 2), j] for i in format_list for j in @@ -311,7 +310,7 @@ class TestMin(TestCase): ] self.min_result_dim(shape_format) - def test_min_dim_shape_format_fp16_3d(self, device): + def test_min_dim_shape_format_fp16_3d(self, device="npu"): format_list = [0, 3, 4, 29] keepdim_list = [True, False] shape_format = [[[np.float16, i, [8, 7, 9]], np.random.randint(0, 3), j] for i in format_list for j in @@ -319,7 +318,7 @@ class TestMin(TestCase): ] self.min_result_dim(shape_format) - def test_min_dim_shape_format_fp32_3d(self, device): + def test_min_dim_shape_format_fp32_3d(self, device="npu"): format_list = [0, 3, 4, 29] keepdim_list = [True, False] shape_format = [[[np.float32, i, [8, 7, 9]], np.random.randint(0, 3), j] for i in format_list for j in @@ -327,7 +326,7 @@ class TestMin(TestCase): ] self.min_result_dim(shape_format) - def test_min_dim_shape_format_fp16_4d(self, device): + def test_min_dim_shape_format_fp16_4d(self, device="npu"): format_list = [0, 4, 29] keepdim_list = [True, False] shape_format = [[[np.float16, i, [8, 7, 9, 10]], np.random.randint(0, 4), j] for i in format_list for j @@ -335,7 +334,7 @@ class TestMin(TestCase): ] self.min_result_dim(shape_format) - def test_min_dim_shape_format_fp32_4d(self, device): + def test_min_dim_shape_format_fp32_4d(self, device="npu"): format_list = [0, 3, 4, 29] keepdim_list = [True, False] shape_format = [[[np.float32, i, [8, 7, 9, 10]], np.random.randint(0, 4), j] for i in format_list for j @@ -343,14 +342,14 @@ class TestMin(TestCase): ] self.min_result_dim(shape_format) - def test_min_other_shape_format_fp16_1d(self, device): + def test_min_other_shape_format_fp16_1d(self, device="npu"): format_list = [0, 3, 4] keepdim_list = [True, False] shape_format = [[[np.float16, i, [8]], np.random.randint(0, 1), j] for i in format_list for j in keepdim_list ] self.min_result_other(shape_format) - def test_min_other_shape_format_fp32_1d(self, device): + def test_min_other_shape_format_fp32_1d(self, device="npu"): format_list = [0, 3, 4] keepdim_list = [True, False] shape_format = [[[np.float32, i, [8]], np.random.randint(0, 1), j] for i in format_list for j in @@ -358,7 +357,7 @@ class TestMin(TestCase): ] self.min_result_other(shape_format) - def test_min_other_shape_format_fp16_2d(self, device): + def test_min_other_shape_format_fp16_2d(self, device="npu"): format_list = [0, 3, 4, 29] keepdim_list = [True, False] shape_format = [[[np.float16, i, [8, 7]], np.random.randint(0, 2), j] for i in format_list for j in @@ -366,7 +365,7 @@ class TestMin(TestCase): ] self.min_result_other(shape_format) - def test_min_other_shape_format_fp32_2d(self, device): + def test_min_other_shape_format_fp32_2d(self, device="npu"): format_list = [0, 3, 4, 29] keepdim_list = [True, False] shape_format = [[[np.float32, i, [8, 7]], np.random.randint(0, 2), j] for i in format_list for j in @@ -374,7 +373,7 @@ class TestMin(TestCase): ] self.min_result_other(shape_format) - def test_min_other_shape_format_fp16_3d(self, device): + def 
test_min_other_shape_format_fp16_3d(self, device="npu"): format_list = [0, 3, 4, 29] keepdim_list = [True, False] shape_format = [[[np.float16, i, [8, 7, 9]], np.random.randint(0, 3), j] for i in format_list for j in @@ -382,7 +381,7 @@ class TestMin(TestCase): ] self.min_result_other(shape_format) - def test_min_other_shape_format_fp32_3d(self, device): + def test_min_other_shape_format_fp32_3d(self, device="npu"): format_list = [0, 3, 4, 29] keepdim_list = [True, False] shape_format = [[[np.float32, i, [8, 7, 9]], np.random.randint(0, 3), j] for i in format_list for j in @@ -390,7 +389,7 @@ class TestMin(TestCase): ] self.min_result_other(shape_format) - def test_min_other_shape_format_fp16_4d(self, device): + def test_min_other_shape_format_fp16_4d(self, device="npu"): format_list = [0, 3, 4, 29] keepdim_list = [True, False] shape_format = [[[np.float16, i, [8, 7, 9, 10]], np.random.randint(0, 4), j] for i in format_list for j @@ -398,7 +397,7 @@ class TestMin(TestCase): ] self.min_result_other(shape_format) - def test_min_other_shape_format_fp32_4d(self, device): + def test_min_other_shape_format_fp32_4d(self, device="npu"): format_list = [0, 3, 4, 29] keepdim_list = [True, False] shape_format = [[[np.float32, i, [8, 7, 9, 10]], np.random.randint(0, 4), j] for i in format_list for j @@ -406,7 +405,7 @@ class TestMin(TestCase): ] self.min_result_other(shape_format) - def test_min_dimname_shape_format(self, device): + def test_min_dimname_shape_format(self, device="npu"): format_list = [0, 3, 4, 29] keepdim_list = [True, False] shape_format = [[[np.float32, i, [8, 7, 9, 10], ('N', 'C', 'H', 'W')], @@ -416,7 +415,7 @@ class TestMin(TestCase): ] self.min_name_result_other(shape_format) - def test_min_dimname_shape_format_fp16(self, device): + def test_min_dimname_shape_format_fp16(self, device="npu"): format_list = [0, 3, 4, 29] keepdim_list = [True, False] shape_format = [[[np.float16, i, [8, 7, 9, 10], ('N', 'C', 'H', 'W')], @@ -426,7 +425,7 @@ class TestMin(TestCase): ] self.min_name_result_other(shape_format) - def test_min_dimname_out_shape_format(self, device): + def test_min_dimname_out_shape_format(self, device="npu"): format_list = [0, 3, 4, 29] keepdim_list = [True, False] shape_format = [[[np.float32, i, [8, 7, 9, 10], ('N', 'C', 'H', 'W')], @@ -436,7 +435,7 @@ class TestMin(TestCase): ] self.min_name_out_result_other(shape_format) - def test_min_dimname_out_shape_format_fp16(self, device): + def test_min_dimname_out_shape_format_fp16(self, device="npu"): format_list = [0, 3, 4, 29] keepdim_list = [True, False] shape_format = [[[np.float16, i, [8, 7, 9, 10], ('N', 'C', 'H', 'W')], @@ -446,14 +445,14 @@ class TestMin(TestCase): ] self.min_name_out_result_other(shape_format) - def test_amin_shape_format_fp16_1d(self, device): + def test_amin_shape_format_fp16_1d(self, device="npu"): format_list = [0, 3, 4] keepdim_list = [True, False] shape_format = [[[np.float16, i, [8]], np.random.randint(0, 1), j] for i in format_list for j in keepdim_list ] self.amin_result(shape_format) - def test_amin_shape_format_fp32_1d(self, device): + def test_amin_shape_format_fp32_1d(self, device="npu"): format_list = [0, 3, 4] keepdim_list = [True, False] shape_format = [[[np.float32, i, [8]], np.random.randint(0, 1), j] for i in format_list for j in @@ -461,7 +460,7 @@ class TestMin(TestCase): ] self.amin_result(shape_format) - def test_amin_shape_format_fp16_2d(self, device): + def test_amin_shape_format_fp16_2d(self, device="npu"): format_list = [0, 3, 4] keepdim_list = [True, False] shape_format = 
[[[np.float16, i, [8, 7]], np.random.randint(0, 2), j] for i in format_list for j in @@ -469,7 +468,7 @@ class TestMin(TestCase): ] self.amin_result(shape_format) - def test_amin_shape_format_fp32_2d(self, device): + def test_amin_shape_format_fp32_2d(self, device="npu"): format_list = [0, 3, 4] keepdim_list = [True, False] shape_format = [[[np.float32, i, [8, 7]], np.random.randint(0, 2), j] for i in format_list for j in @@ -477,7 +476,7 @@ class TestMin(TestCase): ] self.amin_result(shape_format) - def test_amin_shape_format_fp16_3d(self, device): + def test_amin_shape_format_fp16_3d(self, device="npu"): format_list = [0, 3, 4, 29] keepdim_list = [True, False] shape_format = [[[np.float16, i, [8, 7, 9]], np.random.randint(0, 3), j] for i in format_list for j in @@ -485,7 +484,7 @@ class TestMin(TestCase): ] self.amin_result(shape_format) - def test_amin_shape_format_fp32_3d(self, device): + def test_amin_shape_format_fp32_3d(self, device="npu"): format_list = [0, 3, 4, 29] keepdim_list = [True, False] shape_format = [[[np.float32, i, [8, 7, 9]], np.random.randint(0, 3), j] for i in format_list for j in @@ -493,7 +492,7 @@ class TestMin(TestCase): ] self.amin_result(shape_format) - def test_amin_shape_format_fp16_4d(self, device): + def test_amin_shape_format_fp16_4d(self, device="npu"): format_list = [0, 3, 4, 29] keepdim_list = [True, False] shape_format = [[[np.float16, i, [8, 7, 9, 10]], np.random.randint(0, 4), j] for i in format_list for j @@ -501,7 +500,7 @@ class TestMin(TestCase): ] self.amin_result(shape_format) - def test_amin_shape_format_fp32_4d(self, device): + def test_amin_shape_format_fp32_4d(self, device="npu"): format_list = [0, 3, 4, 29] keepdim_list = [True, False] shape_format = [[[np.float32, i, [8, 7, 9, 10]], np.random.randint(0, 4), j] for i in format_list for j @@ -510,6 +509,5 @@ class TestMin(TestCase): self.amin_result(shape_format) -instantiate_device_type_tests(TestMin, globals(), except_for="cpu") if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_mse_loss.py b/test/test_network_ops/test_mse_loss.py index 48754a197abcb51192d25e3205ff972d41dddb78..349cad2716c81497d1f6dbf00eb1485fd1d02b97 100644 --- a/test/test_network_ops/test_mse_loss.py +++ b/test/test_network_ops/test_mse_loss.py @@ -17,9 +17,9 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestMseLoss(TestCase): @@ -51,7 +51,7 @@ class TestMseLoss(TestCase): output = output.numpy() return output - def test_mse_loss_shape_format(self, device): + def test_mse_loss_shape_format(self, device="npu"): shape_format = [ [0, 100, (4,3), np.float32, ""], [0, 100, (4,3), np.float32, "mean"], @@ -65,13 +65,13 @@ class TestMseLoss(TestCase): npu_output = self.npu_op_exec(npu_input1, npu_input2, item[4]) self.assertRtolEqual(cpu_output, npu_output) - def test_mse_mix_dtype(self, device): + def test_mse_mix_dtype(self, device="npu"): npu_input1, npu_input2 = create_common_tensor([np.int32, 0, (2, 3)], 1, 100) npu_input3, npu_input4 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100) cpu_output = self.cpu_op_exec(npu_input1, npu_input3, "mean") npu_output = self.npu_op_exec(npu_input1, npu_input3, "mean") self.assertRtolEqual(cpu_output, npu_output) 
-instantiate_device_type_tests(TestMseLoss, globals(), except_for='cpu') + if __name__ == '__main__': run_tests() diff --git a/test/test_network_ops/test_mse_loss_backward.py b/test/test_network_ops/test_mse_loss_backward.py index 5e0fd01ba7d0373f0c67e4f233755c73a7c2d88b..2e86e77640a68ddc63892c9596c56bb843c6bd09 100644 --- a/test/test_network_ops/test_mse_loss_backward.py +++ b/test/test_network_ops/test_mse_loss_backward.py @@ -18,8 +18,7 @@ import torch_npu from torch.autograd import Variable import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.testcase import TestCase, run_tests class TestMseLossGrad(TestCase): @@ -113,36 +112,36 @@ class TestMseLossGrad(TestCase): output = grads['x'].to("cpu").detach().numpy() return output - def test_mse_loss_grad_float32(self, device): + def test_mse_loss_grad_float32(self, device="npu"): npu_input1, npu_input2 = self.generate_mse_grad_inputs(0, 100, (4,3), np.float32) cpu_output = self.cpu_op_exec_default(npu_input1, npu_input2) npu_output = self.npu_op_exec_default(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) - def test_mse_loss_grad_float32_mean(self, device): + def test_mse_loss_grad_float32_mean(self, device="npu"): npu_input1, npu_input2 = self.generate_mse_grad_inputs(0, 100, (4,3), np.float32) cpu_output = self.cpu_op_exec(npu_input1, npu_input2, "mean") npu_output = self.npu_op_exec(npu_input1, npu_input2, "mean") self.assertRtolEqual(cpu_output, npu_output) - def test_mse_loss_grad_float32_none(self, device): + def test_mse_loss_grad_float32_none(self, device="npu"): npu_input1, npu_input2 = self.generate_mse_grad_inputs(0, 100, (4,3), np.float32) cpu_output = self.cpu_op_exec(npu_input1, npu_input2, "none") npu_output = self.npu_op_exec(npu_input1, npu_input2, "none") self.assertRtolEqual(cpu_output, npu_output) - def test_mse_loss_grad_float32_sum(self, device): + def test_mse_loss_grad_float32_sum(self, device="npu"): npu_input1, npu_input2 = self.generate_mse_grad_inputs(0, 100, (4,3), np.float32) cpu_output = self.cpu_op_exec(npu_input1, npu_input2, "sum") npu_output = self.npu_op_exec(npu_input1, npu_input2, "sum") self.assertRtolEqual(cpu_output, npu_output) - def test_mse_loss_grad_shape_0(self, device): + def test_mse_loss_grad_shape_0(self, device="npu"): npu_input1, npu_input2 = self.generate_mse_grad_inputs(0, 100, (0,4), np.float32) cpu_output = self.cpu_op_exec(npu_input1, npu_input2, "mean") npu_output = self.npu_op_exec(npu_input1, npu_input2, "mean") self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestMseLossGrad, globals(), except_for='cpu') + if __name__ == '__main__': run_tests() diff --git a/test/test_network_ops/test_multilabel_margin_loss.py b/test/test_network_ops/test_multilabel_margin_loss.py index 9cae3e49e42dba0225d027c99bc3c88cb3fe3c2d..cd462c5457f1f0016e942103a16fb7e9ff92f47a 100644 --- a/test/test_network_ops/test_multilabel_margin_loss.py +++ b/test/test_network_ops/test_multilabel_margin_loss.py @@ -16,8 +16,7 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.testcase import TestCase, run_tests class TestMultilabelMarginLoss(TestCase): @@ -46,7 +45,7 @@ class TestMultilabelMarginLoss(TestCase): output = output.detach().numpy() return output - def 
test_multilabel_margin_loss_1(self, device): + def test_multilabel_margin_loss_1(self, device="npu"): data = torch.Tensor([[0.1, 0.2, 0.4, 0.8], [0.1, 0.2, 0.4, 0.8]]).to(torch.float32) target = torch.Tensor([[3, 0, -1, 1], [0, 1, 3, -1]]).to(torch.int64) @@ -58,7 +57,7 @@ class TestMultilabelMarginLoss(TestCase): self.assertRtolEqual(cpu_output, npu_output) - def test_multilabel_margin_loss_2(self, device): + def test_multilabel_margin_loss_2(self, device="npu"): data = torch.Tensor([[0.1, 0.2, 0.4, 0.8], [0.1, 0.2, 0.4, 0.8]]).to(torch.float32) target = torch.Tensor([[1, 1, 1, 1], [1, 1, 1, 1]]).to(torch.int64) @@ -69,7 +68,7 @@ class TestMultilabelMarginLoss(TestCase): npu_output = self.npu_op_exec(data_npu, target_npu, reduction) self.assertRtolEqual(cpu_output, npu_output) - def test_multilabel_margin_loss_3(self, device): + def test_multilabel_margin_loss_3(self, device="npu"): data = torch.Tensor([[0.1, 0.2, 0.4, 0.8, 0.1, 0.1, 0.1, 0.1, 0.1], [0.1, 0.2, 0.4, 0.8, 0.1, 0.1, 0.1, 0.1, 0.1]]).to(torch.float32) target = torch.Tensor([[3, 0, 7, 8, 1, -1, 1, 2, 2], [4, 5, -1, 1, 1, 1, 1, 2, 2]]).to(torch.int64) @@ -80,7 +79,7 @@ class TestMultilabelMarginLoss(TestCase): npu_output = self.npu_op_exec(data_npu, target_npu, reduction) self.assertRtolEqual(cpu_output, npu_output) - def test_multilabel_margin_loss_out(self, device): + def test_multilabel_margin_loss_out(self, device="npu"): data = torch.tensor([[-0.4191, 0.6214], [-0.3765, -0.4781], [0.2881, 0.4888]]).to(torch.float32) @@ -98,7 +97,7 @@ class TestMultilabelMarginLoss(TestCase): npu_output = self.npu_op_exec_out(data_npu, target_npu, c_npu, reduction) self.assertRtolEqual(cpu_output, npu_output) - def test_multilabel_margin_loss_float16_1(self, device): + def test_multilabel_margin_loss_float16_1(self, device="npu"): data = torch.Tensor([[0.1, 0.2, 0.4, 0.8], [0.1, 0.2, 0.4, 0.8]]).to(torch.float32) target = torch.Tensor([[3, 0, -1, 1], [0, 1, 3, -1]]).to(torch.int64) @@ -113,7 +112,7 @@ class TestMultilabelMarginLoss(TestCase): self.assertRtolEqual(cpu_output, npu_output) - def test_multilabel_margin_loss_float16_2(self, device): + def test_multilabel_margin_loss_float16_2(self, device="npu"): data = torch.Tensor([[0.1, 0.2, 0.4, 0.8, 0.1, 0.1, 0.1, 0.1, 0.1], [0.1, 0.2, 0.4, 0.8, 0.1, 0.1, 0.1, 0.1, 0.1]]).to(torch.float32) target = torch.Tensor([[3, 0, 7, 8, 1, -1, 1, 2, 2], [4, 5, -1, 1, 1, 1, 1, 2, 2]]).to(torch.int64) @@ -128,6 +127,6 @@ class TestMultilabelMarginLoss(TestCase): self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestMultilabelMarginLoss, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_multinomial.py b/test/test_network_ops/test_multinomial.py index 2411081e07c745d9258c108e7e1aa12a240c8fcd..6746d527b646459b085f5f4ad88e811eb51b6808 100644 --- a/test/test_network_ops/test_multinomial.py +++ b/test/test_network_ops/test_multinomial.py @@ -18,9 +18,9 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestMultinomial(TestCase): @@ -30,7 +30,7 @@ class TestMultinomial(TestCase): for index in sample: self.assertNotEqual(weight[index], 0) - def test_multinomial_1d_shape_format(self, device): + 
def test_multinomial_1d_shape_format(self, device="npu"): shape_format = [ [[np.float32, 0, (5,)], 0, 100, 5], [[np.float32, 0, (10,)], 0, 100, 10], @@ -53,7 +53,7 @@ class TestMultinomial(TestCase): for j in row: self.assertNotEqual(weight[i][j], 0) - def test_multinomial_2d_shape_format(self, device): + def test_multinomial_2d_shape_format(self, device="npu"): shape_format = [ [[np.float32, 0, (5,5)], 0, 100, 5], [[np.float32, 0, (5,10)], 0, 100, 10], @@ -70,6 +70,5 @@ class TestMultinomial(TestCase): self.sample_2d(npu_input1, item[3]) -instantiate_device_type_tests(TestMultinomial, globals(), except_for="cpu") if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_ne.py b/test/test_network_ops/test_ne.py index a403a208d6c46a77aa8e66ff93fe5a254fbd9c28..de4c0e9180873fda5375fabe971bbd39cf85a28c 100644 --- a/test/test_network_ops/test_ne.py +++ b/test/test_network_ops/test_ne.py @@ -15,9 +15,9 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestNe(TestCase): def cpu_op_exec(self, input1, input2): @@ -38,7 +38,7 @@ class TestNe(TestCase): output = output.numpy() return output - def test_ne_shape_format_fp32(self, device): + def test_ne_shape_format_fp32(self, device="npu"): dtype_list = [np.float32] format_list = [0, 3] shape_list = [[1024], [8, 128], [2, 8, 128], [2, 8, 128, 512]] @@ -52,7 +52,7 @@ class TestNe(TestCase): npu_output = self.npu_op_exec(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) - def test_ne_shape_format_fp16(self, device): + def test_ne_shape_format_fp16(self, device="npu"): dtype_list = [np.float16] format_list = [0, 3] shape_list = [[1024], [8, 128], [2, 8, 128], [2, 8, 128, 512]] @@ -70,7 +70,7 @@ class TestNe(TestCase): npu_output = self.npu_op_exec(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) - def test_ne_out_shape_format_fp32(self, device): + def test_ne_out_shape_format_fp32(self, device="npu"): dtype_list = [np.float32] format_list = [0] shape_list = [[1024], [8, 128], [2, 8, 128], [2, 8, 128, 512]] @@ -84,7 +84,7 @@ class TestNe(TestCase): cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) self.assertRtolEqual(cpu_output, npu_output_out) - def test_ne_scalar_out_shape_format_fp32(self, device): + def test_ne_scalar_out_shape_format_fp32(self, device="npu"): dtype_list = [np.float32] format_list = [0] shape_list = [[1024], [8, 128], [2, 8, 128], [2, 8, 128, 512]] @@ -97,13 +97,13 @@ class TestNe(TestCase): cpu_output = self.cpu_op_exec(cpu_input1, 5) self.assertRtolEqual(cpu_output, npu_output_out) - def test_ne_mix_dtype(self, device): + def test_ne_mix_dtype(self, device="npu"): cpu_input1, npu_input1 = create_common_tensor([np.float16, 0, (2, 3)], 1, 100) cpu_input2, npu_input2 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100) cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) npu_output = self.npu_op_exec(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestNe, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_nllloss2d.py b/test/test_network_ops/test_nllloss2d.py index 
index 28dcba646489f93415f6f88050b7c4c0ced3222a..4665a273afb1a412b73845fbf92566ae88b2803c 100644
--- a/test/test_network_ops/test_nllloss2d.py
+++ b/test/test_network_ops/test_nllloss2d.py
@@ -15,8 +15,7 @@
 import torch
 import torch_npu
-from torch_npu.testing.common_utils import TestCase, run_tests
-from torch_npu.testing.common_device_type import instantiate_device_type_tests
+from torch_npu.testing.testcase import TestCase, run_tests
 class TestNllloss2d(TestCase):
@@ -34,7 +33,7 @@ class TestNllloss2d(TestCase):
         output = output.detach().numpy()
         return output
-    def test_nll_loss2d_mean(self, device):
+    def test_nll_loss2d_mean(self, device="npu"):
         m = torch.nn.LogSoftmax(dim=1)
         dim_n, dim_c = 5, 4
         loss = torch.nn.NLLLoss()
@@ -51,7 +50,7 @@ class TestNllloss2d(TestCase):
         self.assertRtolEqual(cpu_output, npu_output)
-    def test_nll_loss2d_none(self, device):
+    def test_nll_loss2d_none(self, device="npu"):
         exp = torch.nn.LogSoftmax(dim=1)
         dim_n, dim_c = 5, 4
         loss = torch.nn.NLLLoss()
@@ -68,7 +67,7 @@ class TestNllloss2d(TestCase):
         self.assertRtolEqual(cpu_output, npu_output)
-    def test_nll_loss2d_sum(self, device):
+    def test_nll_loss2d_sum(self, device="npu"):
         exp = torch.nn.LogSoftmax(dim=1)
         dim_n, dim_c = 5, 4
         loss = torch.nn.NLLLoss()
@@ -85,6 +84,6 @@ class TestNllloss2d(TestCase):
         self.assertRtolEqual(cpu_output, npu_output)
-instantiate_device_type_tests(TestNllloss2d, globals(), except_for="cpu")
+
 if __name__ == "__main__":
     run_tests()
diff --git a/test/test_network_ops/test_nllloss2d_backward.py b/test/test_network_ops/test_nllloss2d_backward.py
index b5fcfe5e44db9cb99b03da6e33ea93d349ea64d0..d3ee2d605a466f206e9482d9d1f1b696fd17b6ac 100644
--- a/test/test_network_ops/test_nllloss2d_backward.py
+++ b/test/test_network_ops/test_nllloss2d_backward.py
@@ -17,8 +17,7 @@
 import torch_npu
 import numpy as np
 from torch.autograd import Variable
-from torch_npu.testing.common_utils import TestCase, run_tests
-from torch_npu.testing.common_device_type import instantiate_device_type_tests
+from torch_npu.testing.testcase import TestCase, run_tests
 class TestNllloss2dBackward(TestCase):
@@ -40,7 +39,7 @@ class TestNllloss2dBackward(TestCase):
         output = grads['x'].to("cpu").numpy()
         return output
-    def test_nll_loss2d_grad_mean(self, device):
+    def test_nll_loss2d_grad_mean(self, device="npu"):
         m = torch.nn.LogSoftmax(dim=1)
         N, C = 5, 4
         loss = torch.nn.NLLLoss()
@@ -57,7 +56,7 @@ class TestNllloss2dBackward(TestCase):
         self.assertRtolEqual(cpu_output, npu_output)
-    def test_nll_loss2d_grad_none(self, device):
+    def test_nll_loss2d_grad_none(self, device="npu"):
         m = torch.nn.LogSoftmax(dim=1)
         N, C = 5, 4
         loss = torch.nn.NLLLoss()
@@ -74,7 +73,7 @@ class TestNllloss2dBackward(TestCase):
         self.assertRtolEqual(cpu_output, npu_output)
-    def test_nll_loss2d_grad_sum(self, device):
+    def test_nll_loss2d_grad_sum(self, device="npu"):
         m = torch.nn.LogSoftmax(dim=1)
         N, C = 5, 4
         loss = torch.nn.NLLLoss()
@@ -91,6 +90,6 @@ class TestNllloss2dBackward(TestCase):
         self.assertRtolEqual(cpu_output, npu_output)
-instantiate_device_type_tests(TestNllloss2dBackward, globals(), except_for="cpu")
+
 if __name__ == "__main__":
     run_tests()
diff --git a/test/test_network_ops/test_nms_rotated.py b/test/test_network_ops/test_nms_rotated.py
index 86e459501ca8cef1d46f5ef0aada6bdabfe72a42..4b6180b61f8a9b3d330f9681512741b6ae2ebea7 100644
--- a/test/test_network_ops/test_nms_rotated.py
+++ b/test/test_network_ops/test_nms_rotated.py
@@ -15,15 +15,15 @@
 import torch
 import torch_npu
-from torch_npu.testing.common_utils import TestCase, run_tests
-from torch_npu.testing.common_device_type import instantiate_device_type_tests
+from torch_npu.testing.testcase import TestCase, run_tests
+
 class TestNmsRotated(TestCase):
     def npu_op_exec(self, det, score):
         output1, output2 = torch_npu.npu_nms_rotated(det.npu(), score.npu(), 0.2, 0, -1, 1)
         return output1, output2
-    def test_nms_rotated_float32(self, device):
+    def test_nms_rotated_float32(self, device="npu"):
         det = torch.tensor([[1.0382e+03, 3.1657e+02, 1.1556e+03, 4.4303e+02, 2.3674e+00],
                             [1.1503e+03, 3.0598e+02, 1.2602e+03, 4.3456e+02, 3.2729e-01],
                             [1.1508e+03, 3.0652e+02, 1.2607e+03, 4.3472e+02, 5.1713e-01],
@@ -53,6 +53,6 @@ class TestNmsRotated(TestCase):
         self.assertRtolEqual(expect_output1, npu_output1.cpu())
         self.assertRtolEqual(expect_output2, npu_output2.cpu())
-instantiate_device_type_tests(TestNmsRotated, globals(), except_for='cpu')
+
 if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_network_ops/test_nnpack_spatial_convolution.py b/test/test_network_ops/test_nnpack_spatial_convolution.py
index fa46a329db73aab364e6fae1b0e07e41a4a9974d..8b8d71fb665f1cb66062b6b05b53a9ff2d9e3d4f 100644
--- a/test/test_network_ops/test_nnpack_spatial_convolution.py
+++ b/test/test_network_ops/test_nnpack_spatial_convolution.py
@@ -17,9 +17,8 @@
 import torch
 import torch_npu
 import numpy as np
-from torch_npu.testing.common_utils import TestCase, run_tests
-from torch_npu.testing.common_device_type import instantiate_device_type_tests
-from torch_npu.testing.util_test import create_common_tensor
+from torch_npu.testing.testcase import TestCase, run_tests
+
 class TestNnpackSpatialConvolution(TestCase):
@@ -70,62 +69,62 @@ class TestNnpackSpatialConvolution(TestCase):
         output = output.numpy()
         return output
-    def test__nnpack_spatial_convolution_float16_1(self, device):
+    def test__nnpack_spatial_convolution_float16_1(self, device="npu"):
         getlist1 = self.generate_data(
             -2, 2, 1, 3, 4, 4, 2, 2, 2, np.float16)
         cpu_output = self.cpu_op_exec(getlist1[0], getlist1[1], getlist1[2], getlist1[3])
         npu_output = self.npu_op_exec(getlist1[0], getlist1[1], getlist1[2], getlist1[3])
         self.assertRtolEqual(cpu_output, npu_output)
-    def test__nnpack_spatial_convolution_float16_2(self, device):
+    def test__nnpack_spatial_convolution_float16_2(self, device="npu"):
         getlist1 = self.generate_data(
             -50, 50, 1, 3, 5, 5, 5, 2, 2, np.float16)
         cpu_output = self.cpu_op_exec(getlist1[0], getlist1[1], getlist1[2], getlist1[3])
         npu_output = self.npu_op_exec(getlist1[0], getlist1[1], getlist1[2], getlist1[3])
         self.assertRtolEqual(cpu_output, npu_output)
-    def test__nnpack_spatial_convolution_float16_3(self, device):
+    def test__nnpack_spatial_convolution_float16_3(self, device="npu"):
         getlist1 = self.generate_data(
             -50, 50, 1, 5, 1024, 1024, 5, 8, 8, np.float16)
         cpu_output = self.cpu_op_exec(getlist1[0], getlist1[1], getlist1[2], getlist1[3])
         npu_output = self.npu_op_exec(getlist1[0], getlist1[1], getlist1[2], getlist1[3])
         self.assertRtolEqual(cpu_output, npu_output)
-    def test__nnpack_spatial_convolution_float16_4(self, device):
+    def test__nnpack_spatial_convolution_float16_4(self, device="npu"):
         getlist1 = self.generate_data(
             -100, 100, 1, 5, 1024, 1024, 5, 8, 8, np.float16)
         cpu_output = self.cpu_op_exec(getlist1[0], getlist1[1], getlist1[2], getlist1[3])
         npu_output = self.npu_op_exec(getlist1[0], getlist1[1], getlist1[2], getlist1[3])
         self.assertRtolEqual(cpu_output, npu_output)
-    def test__nnpack_spatial_convolution_float32_1(self, device):
+    def test__nnpack_spatial_convolution_float32_1(self, device="npu"):
         getlist1 = self.generate_data(
             -2, 2, 1, 3, 4, 4, 2, 2, 2, np.float32)
         cpu_output = self.cpu_op_exec(getlist1[0], getlist1[1], getlist1[2], getlist1[3])
         npu_output = self.npu_op_exec(getlist1[0], getlist1[1], getlist1[2], getlist1[3])
         self.assertRtolEqual(cpu_output, npu_output)
-    def test__nnpack_spatial_convolution_float32_2(self, device):
+    def test__nnpack_spatial_convolution_float32_2(self, device="npu"):
         getlist1 = self.generate_data(
             -50, 50, 1, 3, 4, 4, 2, 2, 2, np.float32)
         cpu_output = self.cpu_op_exec(getlist1[0], getlist1[1], getlist1[2], getlist1[3])
         npu_output = self.npu_op_exec(getlist1[0], getlist1[1], getlist1[2], getlist1[3])
         self.assertRtolEqual(cpu_output, npu_output)
-    def test__nnpack_spatial_convolution_float32_3(self, device):
+    def test__nnpack_spatial_convolution_float32_3(self, device="npu"):
         getlist1 = self.generate_data(
             -50, 50, 1, 5, 512, 512, 5, 8, 8, np.float32)
         cpu_output = self.cpu_op_exec(getlist1[0], getlist1[1], getlist1[2], getlist1[3])
         npu_output = self.npu_op_exec(getlist1[0], getlist1[1], getlist1[2], getlist1[3])
         self.assertRtolEqual(cpu_output, npu_output)
-    def test__nnpack_spatial_convolution_float32_4(self, device):
+    def test__nnpack_spatial_convolution_float32_4(self, device="npu"):
         getlist1 = self.generate_data(
             -100, 100, 1, 5, 512, 512, 5, 8, 8, np.float32)
         cpu_output = self.cpu_op_exec(getlist1[0], getlist1[1], getlist1[2], getlist1[3])
         npu_output = self.npu_op_exec(getlist1[0], getlist1[1], getlist1[2], getlist1[3])
         self.assertRtolEqual(cpu_output, npu_output)
-instantiate_device_type_tests(TestNnpackSpatialConvolution, globals(), except_for='cpu')
+
 if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_network_ops/test_nonzero.py b/test/test_network_ops/test_nonzero.py
index 84c204a86fc4b2c4774be98ab9d5a14602e32d6a..11c8d5167e18d50c2ca244bdcbe05fe6b8a41330 100644
--- a/test/test_network_ops/test_nonzero.py
+++ b/test/test_network_ops/test_nonzero.py
@@ -17,9 +17,9 @@
 import torch
 import torch_npu
 import numpy as np
-from torch_npu.testing.common_utils import TestCase, run_tests
-from torch_npu.testing.common_device_type import instantiate_device_type_tests
-from torch_npu.testing.util_test import create_common_tensor
+from torch_npu.testing.testcase import TestCase, run_tests
+from torch_npu.testing.common_utils import create_common_tensor
+
 class TestNonzero(TestCase):
     def cpu_op_exec(self, input1):
@@ -33,7 +33,7 @@ class TestNonzero(TestCase):
         output = output.numpy().astype(np.int32)
         return output
-    def test_nonzero_shape_format(self, device):
+    def test_nonzero_shape_format(self, device="npu"):
         dtype_list = [np.float32, np.float16, np.int32, np.int64]
         format_list = [0]
         shape_list = [[256,10], [256,256,100],[5,256,256,100]]
@@ -48,6 +48,5 @@ class TestNonzero(TestCase):
             self.assertRtolEqual(cpu_output, npu_output)
-instantiate_device_type_tests(TestNonzero, globals(), except_for="cpu")
 if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_network_ops/test_npu_bertapplyadam.py b/test/test_network_ops/test_npu_bertapplyadam.py
index 2d6f602141bce74c28975491a65c49c265cbc8d9..e6634c655340693c45cf9aeab17071cd1c2a261e 100644
--- a/test/test_network_ops/test_npu_bertapplyadam.py
+++ b/test/test_network_ops/test_npu_bertapplyadam.py
@@ -14,14 +14,12 @@
 import torch
 import torch_npu
-import numpy as np
-from torch_npu.testing.common_utils import TestCase, run_tests
-from torch_npu.testing.common_device_type import instantiate_device_type_tests
-from torch_npu.testing.util_test import create_common_tensor
+from torch_npu.testing.testcase import TestCase, run_tests
+
 class TestNpuBertApplyAdam(TestCase):
-    def test_npu_bert_apply_adam(self, device):
+    def test_npu_bert_apply_adam(self, device="npu"):
         seed = 3
         torch.manual_seed(seed)
         torch.npu.manual_seed(seed)
@@ -51,6 +49,6 @@ class TestNpuBertApplyAdam(TestCase):
         self.assertRtolEqual(m_out[:3].cpu(), m_ans)
         self.assertRtolEqual(v_out[:3].cpu(), v_ans)
-instantiate_device_type_tests(TestNpuBertApplyAdam, globals(), except_for="cpu")
+
 if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_network_ops/test_npu_linear.py b/test/test_network_ops/test_npu_linear.py
index fbf7329694910ff8ffc1f54aec01b4211900cc32..45e66980d780b432fe4685253e4bf4e867497beb 100644
--- a/test/test_network_ops/test_npu_linear.py
+++ b/test/test_network_ops/test_npu_linear.py
@@ -15,9 +15,9 @@
 import torch
 import torch_npu
 import numpy as np
-from torch_npu.testing.common_utils import TestCase, run_tests
-from torch_npu.testing.common_device_type import instantiate_device_type_tests
-from torch_npu.testing.util_test import create_common_tensor
+from torch_npu.testing.testcase import TestCase, run_tests
+from torch_npu.testing.common_utils import create_common_tensor
+
 class TestNpuLinear(TestCase):
     def cpu_op_exec(self, x, weight, bias):
@@ -30,7 +30,7 @@ class TestNpuLinear(TestCase):
         output = output.cpu().numpy()
         return output
-    def test_npu_linear_shape_format_fp32(self, device):
+    def test_npu_linear_shape_format_fp32(self, device="npu"):
         shape_format = [
             [[np.float32, -1, (6144, 1024)], [np.float32, -1, (256, 1024)], [np.float32, -1, (256)]],
             [[np.float32, -1, (123, 456)], [np.float32, -1, (789, 456)], [np.float32, -1, (789)]],
@@ -44,7 +44,7 @@ class TestNpuLinear(TestCase):
             npu_output = self.npu_op_exec(npu_x, npu_w, npu_b)
             self.assertRtolEqual(cpu_output, npu_output, 0.0002)
-    def test_npu_linear_shape_format_fp16(self, device):
+    def test_npu_linear_shape_format_fp16(self, device="npu"):
         shape_format = [
             [[np.float16, -1, (6144, 1024)], [np.float16, -1, (256, 1024)], [np.float16, -1, (256)]],
             [[np.float16, -1, (123, 456)], [np.float16, -1, (789, 456)], [np.float16, -1, (789)]],
@@ -58,6 +58,6 @@ class TestNpuLinear(TestCase):
             npu_output = self.npu_op_exec(npu_x, npu_w, npu_b)
             self.assertRtolEqual(cpu_output, npu_output)
-instantiate_device_type_tests(TestNpuLinear, globals(), except_for="cpu")
+
 if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_network_ops/test_npu_linear_backward.py b/test/test_network_ops/test_npu_linear_backward.py
index df0014377560f7881a3f8681770774540eefddfe..b7f6a1cf1cac263f4c62c271ca1f13a4d0ca22d2 100644
--- a/test/test_network_ops/test_npu_linear_backward.py
+++ b/test/test_network_ops/test_npu_linear_backward.py
@@ -16,9 +16,9 @@
 import torch
 import torch_npu
 import numpy as np
-from torch_npu.testing.common_utils import TestCase, run_tests
-from torch_npu.testing.common_device_type import instantiate_device_type_tests
-from torch_npu.testing.util_test import create_common_tensor
+from torch_npu.testing.testcase import TestCase, run_tests
+from torch_npu.testing.common_utils import create_common_tensor
+
 class TestNpuLinearBackward(TestCase):
     def cpu_op_exec(self, x, weight, bias):
@@ -42,7 +42,7 @@ class TestNpuLinearBackward(TestCase):
                  bias.grad.cpu().numpy()]
         return list2
-    def test_npu_linear_backward_shape_format_fp32(self, device):
+    def test_npu_linear_backward_shape_format_fp32(self, device="npu"):
         shape_format = [
             [[np.float32, -1, (6144, 1024)], [np.float32, -1, (256, 1024)], [np.float32, -1, (256)]],
             [[np.float32, -1, (123, 456)], [np.float32, -1, (789, 456)], [np.float32, -1, (789)]],
@@ -59,7 +59,7 @@ class TestNpuLinearBackward(TestCase):
             self.assertRtolEqual(getlist1[2], getlist2[2])
             self.assertRtolEqual(getlist1[3], getlist2[3])
-    def test_npu_linear_shape_format_fp16(self, device):
+    def test_npu_linear_shape_format_fp16(self, device="npu"):
         shape_format = [
             [[np.float16, -1, (6144, 1024)], [np.float16, -1, (256, 1024)], [np.float16, -1, (256)]],
             [[np.float16, -1, (123, 456)], [np.float16, -1, (789, 456)], [np.float16, -1, (789)]],
@@ -76,6 +76,6 @@ class TestNpuLinearBackward(TestCase):
             self.assertRtolEqual(getlist1[2].astype(np.float16), getlist2[2])
             self.assertRtolEqual(getlist1[3].astype(np.float16), getlist2[3])
-instantiate_device_type_tests(TestNpuLinearBackward, globals(), except_for="cpu")
+
 if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_network_ops/test_one_hot.py b/test/test_network_ops/test_one_hot.py
index 3a1af57cccacc01e74108d0c323fddea6b3a0a73..52538e0a8ea46675d9589769814557ce02faf29f 100644
--- a/test/test_network_ops/test_one_hot.py
+++ b/test/test_network_ops/test_one_hot.py
@@ -16,11 +16,9 @@
 import torch
 import torch_npu
-import numpy as np
-from torch_npu.testing.util_test import create_common_tensor
-from torch_npu.testing.common_utils import TestCase, run_tests
-from torch_npu.testing.common_device_type import instantiate_device_type_tests
+from torch_npu.testing.testcase import TestCase, run_tests
+
 class TestOneHot(TestCase):
     def generate_single_data(self, low, high):
@@ -41,42 +39,42 @@ class TestOneHot(TestCase):
         output = output.numpy()
         return output
-    def test_one_hot_1(self, device):
+    def test_one_hot_1(self, device="npu"):
         input1 = self.generate_single_data(0, 5)
         cpu_output = self.cpu_op_exec(input1, 5)
         npu_output = self.npu_op_exec(input1, 5)
         self.assertRtolEqual(cpu_output, npu_output)
-    def test_one_hot_2(self, device):
+    def test_one_hot_2(self, device="npu"):
         input1 = self.generate_single_data(0, 5)
         npu_output = self.npu_op_exec(input1, -1)
         cpu_output = self.cpu_op_exec(input1, -1)
         self.assertRtolEqual(cpu_output, npu_output)
-    def test_one_hot_3(self, device):
+    def test_one_hot_3(self, device="npu"):
         input1 = self.generate_single_data(0, 5)
         npu_output = self.npu_op_exec(input1, 6)
         cpu_output = self.cpu_op_exec(input1, 6)
         self.assertRtolEqual(cpu_output, npu_output)
-    def test_one_hot_4(self, device):
+    def test_one_hot_4(self, device="npu"):
         input1 = self.generate_single_data(0, 10)
         cpu_output = self.cpu_op_exec(input1, 10)
         npu_output = self.npu_op_exec(input1, 10)
         self.assertRtolEqual(cpu_output, npu_output)
-    def test_one_hot_5(self, device):
+    def test_one_hot_5(self, device="npu"):
         input1 = self.generate_single_data(0, 10)
         cpu_output = self.cpu_op_exec(input1, -1)
         npu_output = self.npu_op_exec(input1, -1)
         self.assertRtolEqual(cpu_output, npu_output)
-    def test_one_hot_6(self, device):
+    def test_one_hot_6(self, device="npu"):
         input1 = self.generate_single_data(0, 10)
         cpu_output = self.cpu_op_exec(input1, 12)
         npu_output = self.npu_op_exec(input1, 12)
         self.assertRtolEqual(cpu_output, npu_output)
-instantiate_device_type_tests(TestOneHot, globals(), except_for='cpu')
+
 if __name__ == "__main__":
     run_tests()
diff --git a/test/test_network_ops/test_pack_padded_sequence.py b/test/test_network_ops/test_pack_padded_sequence.py
index 9bf880be398b77d678722c9412f2818e25f7d4ce..62f592e8fdf7d7646334a5209dabe45caaf59571 100644
--- a/test/test_network_ops/test_pack_padded_sequence.py
+++ b/test/test_network_ops/test_pack_padded_sequence.py
@@ -15,11 +15,11 @@
 import torch
 import torch_npu
-from torch_npu.testing.common_utils import TestCase, run_tests
-from torch_npu.testing.common_device_type import instantiate_device_type_tests
+from torch_npu.testing.testcase import TestCase, run_tests
+
 class TestPackPaddedSequence(TestCase):
-    def test_pack_padded_sequence_fp32(self, device):
+    def test_pack_padded_sequence_fp32(self, device="npu"):
         data = torch.randn(6, 3, 2, dtype = torch.float32).npu()
         lengths = torch.tensor([6, 5, 3], dtype = torch.int64)
         expect_dim2 = data.view(18, 2).cpu()
@@ -28,7 +28,7 @@ class TestPackPaddedSequence(TestCase):
         self.assertRtolEqual(expect_dim2, out_dim2.cpu())
         self.assertRtolEqual(expect_batch_sizes, batch_sizes.cpu())
-    def test_pack_padded_sequence_fp16(self, device):
+    def test_pack_padded_sequence_fp16(self, device="npu"):
         data = torch.randn(6, 3, 2, dtype = torch.float16).npu()
         lengths = torch.tensor([6, 5, 3], dtype = torch.int64)
         expect_dim2 = data.view(18, 2).cpu()
@@ -37,6 +37,6 @@ class TestPackPaddedSequence(TestCase):
         self.assertRtolEqual(expect_dim2, out_dim2.cpu())
         self.assertRtolEqual(expect_batch_sizes, batch_sizes.cpu())
-instantiate_device_type_tests(TestPackPaddedSequence, globals(), except_for="cpu")
+
 if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_network_ops/test_prelu.py b/test/test_network_ops/test_prelu.py
index 6a41dc94fafabc03a8e3a51b7a9a8f42e29d0acb..bf4885dbe4e0721a89798fa2049658d7d194140b 100644
--- a/test/test_network_ops/test_prelu.py
+++ b/test/test_network_ops/test_prelu.py
@@ -16,9 +16,9 @@
 import torch
 import torch_npu
 import numpy as np
-from torch_npu.testing.common_utils import TestCase, run_tests
-from torch_npu.testing.common_device_type import instantiate_device_type_tests
-from torch_npu.testing.util_test import create_common_tensor
+from torch_npu.testing.testcase import TestCase, run_tests
+from torch_npu.testing.common_utils import create_common_tensor
+
 class TestPrelu(TestCase):
@@ -33,7 +33,7 @@ class TestPrelu(TestCase):
         output = output.to(torch.float32)
         return output.numpy()
-    def test_prelu_shape_format(self, device):
+    def test_prelu_shape_format(self, device="npu"):
         shape_format = [
             [[np.float32, 0, [1, 1]], [np.float32, 0, 1]],
             [[np.float32, 0, [2, 2]], [np.float32, 0, 1]],
@@ -52,6 +52,6 @@ class TestPrelu(TestCase):
             cpu_output = cpu_output.astype(npu_output.dtype)
             self.assertRtolEqual(cpu_output, npu_output)
-instantiate_device_type_tests(TestPrelu, globals(), except_for='cpu')
+
 if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_network_ops/test_prelu_backward.py b/test/test_network_ops/test_prelu_backward.py
index 5b219038d528b2833831bb60c25061f88073fd81..0e428985256eff6971ce810c20aa30b4c4ca73da 100644
--- a/test/test_network_ops/test_prelu_backward.py
+++ b/test/test_network_ops/test_prelu_backward.py
@@ -16,9 +16,8 @@
 import torch
 import torch_npu
 import numpy as np
-from torch_npu.testing.common_utils import TestCase, run_tests
-from torch_npu.testing.common_device_type import instantiate_device_type_tests
-from torch_npu.testing.util_test import create_common_tensor
+from torch_npu.testing.testcase import TestCase, run_tests
+from torch_npu.testing.common_utils import create_common_tensor
 class TestPreluBackward(TestCase):
     def cpu_op_back_exec_ext(self,input1,weight):
@@ -44,7 +43,7 @@ class TestPreluBackward(TestCase):
         output = output.numpy()
         return output
-    def test_PreluBackward_shape_format_fp32(self, device):
+    def test_PreluBackward_shape_format_fp32(self, device="npu"):
         shape_format = [
             [np.float32, 0, (17, 12, 38, 15)],
             [np.float32, 0, (1, 12, 38, 5)],
@@ -59,7 +58,7 @@ class TestPreluBackward(TestCase):
             npu_output = self.npu_op_back_exec_ext(npu_input, npu_weight)
             self.assertRtolEqual(cpu_output, npu_output)
-    def test_PreluBackward_shape_format_fp16(self, device):
+    def test_PreluBackward_shape_format_fp16(self, device="npu"):
         def cpu_op_back_exec_fp16_ext(input1,weight):
             input1 = input1.to(torch.float32)
             weight = weight.to(torch.float32)
@@ -88,6 +87,6 @@ class TestPreluBackward(TestCase):
             npu_output = self.npu_op_back_exec_ext(npu_input, npu_weight)
             self.assertRtolEqual(cpu_output, npu_output)
-instantiate_device_type_tests(TestPreluBackward, globals(), except_for="cpu")
+
 if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_network_ops/test_ps_roi_pooling.py b/test/test_network_ops/test_ps_roi_pooling.py
index 5dcc4b8b52ec2b95b27a858096773e2e6ba84641..97ae487059b1845b75f1f0f55fbeb5e245cd8df7 100644
--- a/test/test_network_ops/test_ps_roi_pooling.py
+++ b/test/test_network_ops/test_ps_roi_pooling.py
@@ -13,15 +13,12 @@
 # limitations under the License.
 import torch
 import torch_npu
-import numpy as np
-from torch.nn import functional as F
-from torch_npu.testing.common_utils import TestCase, run_tests
-from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests
-from torch_npu.testing.util_test import create_common_tensor, test_2args_broadcast, create_dtype_tensor, UT_FAST_MODE
+from torch_npu.testing.testcase import TestCase, run_tests
+
 class TestPsRoiPooling(TestCase):
-    def test_ps_roi_pooling_fp16(self, device):
+    def test_ps_roi_pooling_fp16(self, device="npu"):
         roi = torch.tensor([[[1], [2], [3], [4], [5]],
                             [[6], [7], [8], [9], [10]]
                             ], dtype = torch.float16).npu()
@@ -38,6 +35,6 @@ class TestPsRoiPooling(TestCase):
                             ], dtype = torch.float16)
         self.assertRtolEqual(expect_out, out.cpu())
-instantiate_device_type_tests(TestPsRoiPooling, globals(), except_for="cpu")
+
 if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_network_ops/test_ps_roi_pooling_backward.py b/test/test_network_ops/test_ps_roi_pooling_backward.py
index a3c5b4fe2d7a368e30690a29a24ef2e3ce2f07a6..89db48c76da0c40a75018d333771a1eb6f97648f 100644
--- a/test/test_network_ops/test_ps_roi_pooling_backward.py
+++ b/test/test_network_ops/test_ps_roi_pooling_backward.py
@@ -13,15 +13,12 @@
 # limitations under the License.
 import torch
 import torch_npu
-import numpy as np
-from torch.nn import functional as F
-from torch_npu.testing.common_utils import TestCase, run_tests
-from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests
-from torch_npu.testing.util_test import create_common_tensor, test_2args_broadcast, create_dtype_tensor, UT_FAST_MODE
+from torch_npu.testing.testcase import TestCase, run_tests
+
 class TestPsRoiPoolingBackward(TestCase):
-    def test_ps_roi_pooling_backward_fp16(self, device):
+    def test_ps_roi_pooling_backward_fp16(self, device="npu"):
         roi = torch.tensor([[[1], [2], [3], [4], [5]],
                             [[6], [7], [8], [9], [10]]
                             ], dtype = torch.float16).npu()
@@ -48,6 +45,6 @@ class TestPsRoiPoolingBackward(TestCase):
         self.assertRtolEqual(expect_out, out.detach().cpu())
         self.assertRtolEqual(expect_gradout, gradout.cpu())
-instantiate_device_type_tests(TestPsRoiPoolingBackward, globals(), except_for="cpu")
+
 if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_network_ops/test_put.py b/test/test_network_ops/test_put.py
index 2081cdabe5a7e3917fd934ee6ce3996973135156..ec1463426bb3439e30df316576e6aed3f4e3afff 100644
--- a/test/test_network_ops/test_put.py
+++ b/test/test_network_ops/test_put.py
@@ -17,9 +17,9 @@
 import torch
 import torch_npu
 import numpy as np
-from torch_npu.testing.common_utils import TestCase, run_tests
-from torch_npu.testing.common_device_type import instantiate_device_type_tests
-from torch_npu.testing.util_test import create_common_tensor
+from torch_npu.testing.testcase import TestCase, run_tests
+from torch_npu.testing.common_utils import create_common_tensor
+
 class TestPut(TestCase):
@@ -60,7 +60,7 @@ class TestPut(TestCase):
             self.assertRtolEqual(cpu_output, npu_output)
-    def test_put_aicpu_common_shape_format_fp32(self, device):
+    def test_put_aicpu_common_shape_format_fp32(self, device="npu"):
         shape_format = [
             [[np.float32, -1, (4, 3)], [np.int64, -1, (4, 1)], [np.float32, -1, (4)]],
             [[np.float32, -1, (4, 3, 5)], [np.int64, -1, (4, 2)], [np.float32, -1, (4, 2)]],
@@ -71,7 +71,7 @@ class TestPut(TestCase):
         ]
         self.get_result(shape_format)
-    def test_put_aicore_common_shape_format_fp32(self, device):
+    def test_put_aicore_common_shape_format_fp32(self, device="npu"):
         shape_format = [
             [[np.float32, -1, (4, 3)], [np.int32, -1, (4, 1)], [np.float32, -1, (4)]],
             [[np.float32, -1, (4, 3, 5)], [np.int32, -1, (4, 2)], [np.float32, -1, (4, 2)]],
@@ -82,7 +82,7 @@ class TestPut(TestCase):
         ]
         self.get_result(shape_format)
-    def test_put_aicpu_common_shape_format_fp16(self, device):
+    def test_put_aicpu_common_shape_format_fp16(self, device="npu"):
         shape_format = [
             [[np.float16, -1, (4, 3)], [np.int64, -1, (4, 1)], [np.float16, -1, (4)]],
             [[np.float16, -1, (4, 3, 5)], [np.int64, -1, (4, 2)], [np.float16, -1, (4, 2)]],
@@ -95,7 +95,7 @@ class TestPut(TestCase):
         ]
         self.get_result(shape_format)
-    def test_put_aicore_common_shape_format_fp16(self, device):
+    def test_put_aicore_common_shape_format_fp16(self, device="npu"):
         shape_format = [
             [[np.float16, -1, (4, 3)], [np.int32, -1, (4, 1)], [np.float16, -1, (4)]],
             [[np.float16, -1, (4, 3, 5)], [np.int32, -1, (4, 2)], [np.float16, -1, (4, 2)]],
@@ -106,7 +106,7 @@ class TestPut(TestCase):
         ]
         self.get_result(shape_format)
-    def test_put_empty_shape(self, device):
+    def test_put_empty_shape(self, device="npu"):
         shape_format = [
             [np.float32, -1, (4, 3)],
             [np.float32, -1, (4, 3, 5)],
@@ -129,6 +129,6 @@ class TestPut(TestCase):
             npu_output = self.npu_op_exec(input_x_npu, index_npu, source_npu, accumulate)
             self.assertRtolEqual(cpu_output, npu_output)
-instantiate_device_type_tests(TestPut, globals(), except_for='cpu')
+
 if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_network_ops/test_qr.py b/test/test_network_ops/test_qr.py
index ec4c07e7eba851b5c04b2042ee90b4bb656b650d..cc527b272d1fee7eea6f21ffb1e448e816b1dc3c 100644
--- a/test/test_network_ops/test_qr.py
+++ b/test/test_network_ops/test_qr.py
@@ -18,9 +18,9 @@
 import torch
 import torch_npu
 import numpy as np
-from torch_npu.testing.common_utils import TestCase, run_tests
-from torch_npu.testing.common_device_type import instantiate_device_type_tests
-from torch_npu.testing.util_test import create_common_tensor
+from torch_npu.testing.testcase import TestCase, run_tests
+from torch_npu.testing.common_utils import create_common_tensor
+
 class TestQr(TestCase):
     def cpu_op_exec(self, input1, some):
@@ -57,7 +57,7 @@ class TestQr(TestCase):
         rout = input3.to("cpu").numpy()
         return qout, rout
-    def test_qr_shape_format(self, device):
+    def test_qr_shape_format(self, device="npu"):
         # TODO(ascend): the operator does not support fp16 yet; support is under development
         dtype_list = [np.float32]
         format_list = [-1]
@@ -92,7 +92,7 @@ class TestQr(TestCase):
             self.assertRtolEqual(npu_output1_out, npu_output1)
             self.assertRtolEqual(npu_output2_out, npu_output2)
-    def test_qr_common_shape_format(self, device):
+    def test_qr_common_shape_format(self, device="npu"):
         shape_format = [
             [np.float32, -1, (5, 3)],
             [np.float32, -1, (1, 64, 147, 147)],
@@ -117,6 +117,6 @@ class TestQr(TestCase):
             self.assertRtolEqual(cpu_output_r, npu_output_r)
             self.assertRtolEqual(cpu_input1.numpy(), npu_output)
-instantiate_device_type_tests(TestQr, globals(), except_for='cpu')
+
 if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_network_ops/test_quantize_per_channel.py b/test/test_network_ops/test_quantize_per_channel.py
index 9cc0a808e0fed650b78fff056509ffacd74b0276..f04463256c5ee770a9567089b116e4e016ca8d8f 100644
--- a/test/test_network_ops/test_quantize_per_channel.py
+++ b/test/test_network_ops/test_quantize_per_channel.py
@@ -16,9 +16,8 @@
 import torch
 import torch_npu
 import numpy as np
-from torch_npu.testing.common_utils import TestCase, run_tests
-from torch_npu.testing.common_device_type import instantiate_device_type_tests
-from torch_npu.testing.util_test import create_common_tensor
+from torch_npu.testing.testcase import TestCase, run_tests
+
 class TestQuantizePerChannel(TestCase):
     def generate_data_per_channel(self, min_d, max_d, shape_x, shape_scale, shape_zp, dtype_x, dtype_scale, dtype_zp):
@@ -44,28 +43,28 @@ class TestQuantizePerChannel(TestCase):
         output = output.numpy()
         return output
-    def test_per_channel_3_3_0_int32(self, device):
+    def test_per_channel_3_3_0_int32(self, device="npu"):
         input_x1, scales, zero_points = self.generate_data_per_channel(-1, 1, (3, 3), (3,), (3,), np.float32, np.float32, np.int32)
         cpu_output1 = self.cpu_op_exec_per_channel(input_x1, scales, zero_points, 0, torch.qint32)
         npu_output1 = self.npu_op_exec_per_channel(input_x1, scales, zero_points, 0, torch.qint32)
         self.assertRtolEqual(cpu_output1, npu_output1)
-    def test_per_channel_3_3_3_3_1_int8(self, device):
+    def test_per_channel_3_3_3_3_1_int8(self, device="npu"):
         input_x1, scales, zero_points = self.generate_data_per_channel(-1, 1, (3, 3), (3,), (3,), np.float32, np.float32, np.int8)
         cpu_output1 = self.cpu_op_exec_per_channel(input_x1, scales, zero_points, 1, torch.qint8).astype(np.int32)
         npu_output1 = self.npu_op_exec_per_channel(input_x1, scales, zero_points, 1, torch.qint8).astype(np.int32)
         self.assertRtolEqual(cpu_output1, npu_output1)
-    def test_per_channel_3_3_3_3_3_3_3_3_4_uint8(self, device):
+    def test_per_channel_3_3_3_3_3_3_3_3_4_uint8(self, device="npu"):
         input_x1, scales, zero_points = self.generate_data_per_channel(-1, 1, (3, 3, 3, 3, 3, 3, 3, 3), (3,), (3,), np.float32, np.float32, np.int32)
         cpu_output1 = self.cpu_op_exec_per_channel(input_x1, scales, zero_points, 4, torch.quint8)
         npu_output1 = self.npu_op_exec_per_channel(input_x1, scales, zero_points, 4, torch.quint8)
         self.assertRtolEqual(cpu_output1, npu_output1)
-    def test_per_channel_30_30_30_30_30_2_uint8(self, device):
+    def test_per_channel_30_30_30_30_30_2_uint8(self, device="npu"):
         input_x1, scales, zero_points = self.generate_data_per_channel(-1, 1, (30, 30, 30, 30), (30,), (30,), np.float16, np.float32, np.uint8)
         input_x1_cpu = input_x1.float()
@@ -73,6 +72,6 @@ class TestQuantizePerChannel(TestCase):
         npu_output1 = self.npu_op_exec_per_channel(input_x1, scales, zero_points, 2, torch.quint8)
         self.assertRtolEqual(cpu_output1, npu_output1)
-instantiate_device_type_tests(TestQuantizePerChannel, globals(), except_for='cpu')
+
 if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_network_ops/test_quantize_per_tensor.py b/test/test_network_ops/test_quantize_per_tensor.py
index 446e9f8f48e759c66362016c17d61ee3282d3e67..c7af4231fcd078ca5b7552a2aa72e08908b0314a 100644
--- a/test/test_network_ops/test_quantize_per_tensor.py
+++ b/test/test_network_ops/test_quantize_per_tensor.py
@@ -16,9 +16,8 @@
 import torch
 import torch_npu
 import numpy as np
-from torch_npu.testing.common_utils import TestCase, run_tests
-from torch_npu.testing.common_device_type import instantiate_device_type_tests
-from torch_npu.testing.util_test import create_common_tensor
+from torch_npu.testing.testcase import TestCase, run_tests
+
 class TestQuantizePerTensor(TestCase):
@@ -39,31 +38,31 @@ class TestQuantizePerTensor(TestCase):
         output = output.numpy()
         return output
-    def test_per_tensor_3_3_0p1_10_int32(self, device):
+    def test_per_tensor_3_3_0p1_10_int32(self, device="npu"):
         input_x1 = self.generate_data_per_tensor(-1, 1, (3, 3), np.float32)
         cpu_output1 = self.cpu_op_exec_per_tensor(input_x1, 0.1, 10, torch.qint32)
         npu_output1 = self.npu_op_exec_per_tensor(input_x1, 0.1, 10, torch.qint32)
         self.assertRtolEqual(cpu_output1, npu_output1)
-    def test_per_tensor_3_3_0p1_10_int8(self, device):
+    def test_per_tensor_3_3_0p1_10_int8(self, device="npu"):
         input_x1 = self.generate_data_per_tensor(-1, 1, (3, 3), np.float16)
         input_cpu = input_x1.float()
         cpu_output1 = self.cpu_op_exec_per_tensor(input_cpu, 0.1, 10, torch.qint8)
         npu_output1 = self.npu_op_exec_per_tensor(input_x1, 0.1, 10, torch.qint8)
         self.assertRtolEqual(cpu_output1, npu_output1)
-    def test_per_tensor_3_3_3_3_3_3_0p1_10_uint8(self, device):
+    def test_per_tensor_3_3_3_3_3_3_0p1_10_uint8(self, device="npu"):
         input_x1 = self.generate_data_per_tensor(-1, 1, (3, 3, 3, 3, 3, 3), np.float32)
         cpu_output1 = self.cpu_op_exec_per_tensor(input_x1, 0.1, 10, torch.quint8)
         npu_output1 = self.npu_op_exec_per_tensor(input_x1, 0.1, 10, torch.quint8)
         self.assertRtolEqual(cpu_output1, npu_output1)
-    def test_per_tensor_30_30_30_30_30_30_0p01_5_uint8(self, device):
+    def test_per_tensor_30_30_30_30_30_30_0p01_5_uint8(self, device="npu"):
         input_x1 = self.generate_data_per_tensor(-1, 1, (30, 30, 30, 30, 30, 30), np.float32)
         cpu_output1 = self.cpu_op_exec_per_tensor(input_x1, 0.01, 5, torch.quint8)
         npu_output1 = self.npu_op_exec_per_tensor(input_x1, 0.01, 5, torch.quint8)
         self.assertRtolEqual(cpu_output1, npu_output1)
-instantiate_device_type_tests(TestQuantizePerTensor, globals(), except_for='cpu')
+
 if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_network_ops/test_random.py b/test/test_network_ops/test_random.py
index a05a4b9d2254921647c902d7ed3f8b8399d108f1..d61caeda993a62576f2f1953120f7e2a51788973 100644
--- a/test/test_network_ops/test_random.py
+++ b/test/test_network_ops/test_random.py
@@ -13,18 +13,20 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+import ddt
 import torch
 import torch_npu
-import torch.nn as nn
-import numpy as np
-from torch_npu.testing.common_utils import TestCase, run_tests
-from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests
-from torch_npu.testing.util_test import create_common_tensor
+from torch_npu.testing.testcase import TestCase, run_tests
+from torch_npu.testing.decorator import Dtypes, instantiate_tests
+
+@instantiate_tests
 class TestRandom(TestCase):
+
     @Dtypes(torch.int32, torch.int64, torch.float, torch.float16)
-    def test_random_from_to(self, device, dtype):
+    def test_random_from_to(self, dtype):
         size = 2000
         alpha = 0.1
@@ -77,7 +79,7 @@ class TestRandom(TestCase):
         )
     @Dtypes(torch.int32, torch.int64, torch.float, torch.float16)
-    def test_random_to(self, device, dtype):
+    def test_random_to(self, dtype):
         size = 2000
         alpha = 0.1
@@ -120,7 +122,7 @@ class TestRandom(TestCase):
         )
     @Dtypes(torch.int32, torch.int64, torch.float, torch.float16)
-    def test_random_default(self, device, dtype):
+    def test_random_default(self, dtype):
         size = 2000
         alpha = 0.1
@@ -138,6 +140,5 @@ class TestRandom(TestCase):
         self.assertTrue((to_inc - alpha * to_inc) < t.to(torch.double).max() <= to_inc)
-instantiate_device_type_tests(TestRandom, globals(), except_for='cpu')
 if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_network_ops/test_range.py b/test/test_network_ops/test_range.py
index d7b64417a4b110b32aca80811f06f90f14bbc48b..d50c47f4a6f773a84cb7bd306f0b7ea9546bb589 100644
--- a/test/test_network_ops/test_range.py
+++ b/test/test_network_ops/test_range.py
@@ -16,11 +16,9 @@
 import torch
 import torch_npu
-import numpy as np
-from torch_npu.testing.common_utils import TestCase, run_tests
-from torch_npu.testing.common_device_type import instantiate_device_type_tests
-from torch_npu.testing.util_test import create_common_tensor
+from torch_npu.testing.testcase import TestCase, run_tests
+
 class TestRange(TestCase):
@@ -57,7 +55,7 @@ class TestRange(TestCase):
         output = output.numpy()
         return output
-    def test_range(self, device):
+    def test_range(self, device="npu"):
         shape_format = [
             [-10, 10, torch.float32],
             [50, 100, torch.int32],
@@ -70,7 +68,7 @@ class TestRange(TestCase):
             npu_output = self.npu_op_exec(item[0], item[1], item[2], 'npu')
             self.assertRtolEqual(cpu_output, npu_output)
-    def test_range_step(self, device):
+    def test_range_step(self, device="npu"):
         shape_format = [
             [-10, 10, 0.5, torch.float32],
             [1, 100, 1, torch.int32],
@@ -83,7 +81,7 @@ class TestRange(TestCase):
             npu_output = self.npu_op_step_exec(item[0], item[1], item[2], item[3], 'npu')
             self.assertRtolEqual(cpu_output, npu_output)
-    def test_range_out_step(self, device):
+    def test_range_out_step(self, device="npu"):
         shape_format = [
             [-10, 10, 0.5, torch.float32],
             [1, 100, 1, torch.int32],
@@ -98,6 +96,6 @@ class TestRange(TestCase):
             npu_output = self.npu_op_out_exec(item[0], item[1], item[2], item[3], npu_output)
             self.assertRtolEqual(cpu_output, npu_output)
-instantiate_device_type_tests(TestRange, globals(), except_for='cpu')
+
 if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_network_ops/test_reflection_pad1d.py b/test/test_network_ops/test_reflection_pad1d.py
index 13a6ddceb6b31d5d48191bb5104e263c64e75781..7f39ebe8c7fa7548251edcb42117dcc0e5a99130 100644
--- a/test/test_network_ops/test_reflection_pad1d.py
+++ b/test/test_network_ops/test_reflection_pad1d.py
@@ -16,9 +16,9 @@
 import torch
 import torch_npu
 import numpy as np
-from torch_npu.testing.common_utils import TestCase, run_tests
-from torch_npu.testing.common_device_type import instantiate_device_type_tests
-from torch_npu.testing.util_test import create_common_tensor
+from torch_npu.testing.testcase import TestCase, run_tests
+from torch_npu.testing.common_utils import create_common_tensor
+
 class TestReflectionPad1d(TestCase):
     def cpu_op_out_exec(self, input1, pad, output):
@@ -45,7 +45,7 @@ class TestReflectionPad1d(TestCase):
         output = output.numpy()
         return output
-    def test_reflection_pad1d_out_shape_format_fp16(self, device):
+    def test_reflection_pad1d_out_shape_format_fp16(self, device="npu"):
         shape_format = [
             [[np.float16, 2, (1, 2, 4)], [3, 1]],
             [[np.float16, 3, (1, 2, 4)], [3, 1]]
@@ -66,7 +66,7 @@ class TestReflectionPad1d(TestCase):
             npu_output = self.npu_op_out_exec(npu_input1, item[1], npuout)
             self.assertRtolEqual(cpu_output, npu_output)
-    def test_reflection_pad1d_out_shape_format_fp32(self, device):
+    def test_reflection_pad1d_out_shape_format_fp32(self, device="npu"):
         shape_format = [
             [[np.float32, 0, (1, 2, 4)], [3, 1]],
             [[np.float32, 2, (1, 2, 4)], [3, 1]]
@@ -80,7 +80,7 @@ class TestReflectionPad1d(TestCase):
             npu_output = self.npu_op_out_exec(npu_input1, item[1], npuout)
             self.assertRtolEqual(cpu_output, npu_output)
-    def test_reflection_pad1d_shape_format_fp16(self, device):
+    def test_reflection_pad1d_shape_format_fp16(self, device="npu"):
         shape_format = [
             [[np.float16, 0, (2, 10, 12)], [4, 3]],
             [[np.float16, 3, (2, 10, 12)], [4, 3]]
@@ -100,7 +100,7 @@ class TestReflectionPad1d(TestCase):
             npu_output = self.npu_op_exec(npu_input1, item[1])
             self.assertRtolEqual(cpu_output, npu_output)
-    def test_reflection_pad1d_shape_format_fp32(self, device):
+    def test_reflection_pad1d_shape_format_fp32(self, device="npu"):
         shape_format = [
             [[np.float32, 2, (2, 10, 12)], [4, 3]],
             [[np.float32, 2, (2, 10, 12)], [4, 3]]
@@ -112,6 +112,6 @@ class TestReflectionPad1d(TestCase):
             npu_output = self.npu_op_exec(npu_input1, item[1])
             self.assertRtolEqual(cpu_output, npu_output)
-instantiate_device_type_tests(TestReflectionPad1d, globals(), except_for="cpu")
+
 if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_network_ops/test_reflection_pad2d.py b/test/test_network_ops/test_reflection_pad2d.py
index dff95bf8c3065ee8fb5fc481e0d39f6704e42cde..5fac72ead7cf013f8805994b5eab065c01d6dc3f 100644
--- a/test/test_network_ops/test_reflection_pad2d.py
+++ b/test/test_network_ops/test_reflection_pad2d.py
@@ -16,9 +16,9 @@
 import torch
 import torch_npu
 import numpy as np
-from torch_npu.testing.common_utils import TestCase, run_tests
-from torch_npu.testing.common_device_type import instantiate_device_type_tests
-from torch_npu.testing.util_test import create_common_tensor
+from torch_npu.testing.testcase import TestCase, run_tests
+from torch_npu.testing.common_utils import create_common_tensor
+
 class TestReflectionPad2d(TestCase):
     def cpu_op_out_exec(self, input1, pad, output):
@@ -60,7 +60,7 @@ class TestReflectionPad2d(TestCase):
         output = output.astype(np.float16)
         return output
-    def test_reflection_pad2d_out_shape_format_fp16(self, device):
+    def test_reflection_pad2d_out_shape_format_fp16(self, device="npu"):
         shape_format = [
             [[np.float16, 0, (1, 1, 4, 3)], [2, 2, 2, 2]],
             [[np.float16, 3, (1, 1, 4, 3)], 2]
@@ -74,7 +74,7 @@ class TestReflectionPad2d(TestCase):
             npu_output = self.npu_op_out_exec(npu_input1, item[1], npuout)
             self.assertRtolEqual(cpu_output, npu_output)
-    def test_reflection_pad2d_shape_format_fp16(self, device):
+    def test_reflection_pad2d_shape_format_fp16(self, device="npu"):
         shape_format = [
             [[np.float16, 0, (1, 1, 4, 3)], [2, 2, 2, 2]],
             [[np.float16, 3, (1, 1, 4, 3)], 2]
@@ -86,7 +86,7 @@ class TestReflectionPad2d(TestCase):
             npu_output = self.npu_op_exec(npu_input1, item[1])
             self.assertRtolEqual(cpu_output, npu_output)
-    def test_reflection_pad2d_shape_format_fp32(self, device):
+    def test_reflection_pad2d_shape_format_fp32(self, device="npu"):
         shape_format = [
             [[np.float32, 0, (1, 1, 37, 37)], [2, 2, 2, 2]],
             [[np.float32, 3, (1, 1, 17, 17)], 2]
@@ -98,7 +98,7 @@ class TestReflectionPad2d(TestCase):
             npu_output = self.npu_op_exec(npu_input1, item[1])
             self.assertRtolEqual(cpu_output, npu_output)
-instantiate_device_type_tests(TestReflectionPad2d, globals(), except_for="cpu")
+
 if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_network_ops/test_reflection_pad2d_backward.py b/test/test_network_ops/test_reflection_pad2d_backward.py
index b9a8853c85ea7bfff3eb6abff60daf48bf1ed2e9..e7285d48e8e23e755c9e255ee2a4d45d440c9b8b 100644
--- a/test/test_network_ops/test_reflection_pad2d_backward.py
+++ b/test/test_network_ops/test_reflection_pad2d_backward.py
@@ -16,9 +16,9 @@
 import torch
 import torch_npu
 import numpy as np
-from torch_npu.testing.common_utils import TestCase, run_tests
-from torch_npu.testing.common_device_type import instantiate_device_type_tests
-from torch_npu.testing.util_test import create_common_tensor
+from torch_npu.testing.testcase import TestCase, run_tests
+from torch_npu.testing.common_utils import create_common_tensor
+
 class TestReflectionPad2dBackward(TestCase):
     def cpu_op_exec(self, input1, pad):
@@ -42,7 +42,7 @@ class TestReflectionPad2dBackward(TestCase):
         input_grad = input_grad.cpu().numpy()
         return output, input_grad
-    def test_reflectionPad2d_backward_shape_format_fp16(self, device):
+    def test_reflectionPad2d_backward_shape_format_fp16(self, device="npu"):
         shape_format = [
             [[np.float16, 0, (1, 1, 37, 37)], [2, 2, 2, 2]],
             [[np.float16, 3, (1, 1, 4, 3)], 2],
@@ -68,6 +68,6 @@ class TestReflectionPad2dBackward(TestCase):
             self.assertRtolEqual(cpu_output, npu_output)
             self.assertRtolEqual(cpu_grad, npu_grad)
-instantiate_device_type_tests(TestReflectionPad2dBackward, globals(), except_for="cpu")
+
 if __name__ == "__main__":
     run_tests()
diff --git a/test/test_network_ops/test_renorm.py b/test/test_network_ops/test_renorm.py
index 338517fe5e27eb6d3c7c77b2b3050fa6ca5fd7f6..b18f43e2a830925a20f2f1a9dee59bc6d534536e 100644
--- a/test/test_network_ops/test_renorm.py
+++ b/test/test_network_ops/test_renorm.py
@@ -16,9 +16,9 @@
 import torch
 import torch_npu
 import numpy as np
-from torch_npu.testing.common_utils import TestCase, run_tests
-from torch_npu.testing.common_device_type import instantiate_device_type_tests
-from torch_npu.testing.util_test import create_common_tensor
+from torch_npu.testing.testcase import TestCase, run_tests
+from torch_npu.testing.common_utils import create_common_tensor
+
 class TestRenorm(TestCase):
     def generate_data(self, min_d, max_d, shape, dtype):
@@ -70,110 +70,110 @@ class TestRenorm(TestCase):
         output = output.numpy()
         return output
-    def test_renorm_3_3_4_0_1(self, device):
+    def test_renorm_3_3_4_0_1(self, device="npu"):
         input_x1 = self.generate_data(-1, 1, (3, 3), np.float32)
         cpu_output1 = self.cpu_op_exec(input_x1, 4, 0, 1)
         npu_output1 = self.npu_op_exec(input_x1, 4, 0, 1)
         self.assertRtolEqual(cpu_output1, npu_output1)
-    def test_renorm_3_3_1_1_1(self, device):
+    def test_renorm_3_3_1_1_1(self, device="npu"):
         input_x1 = self.generate_data(-1, 1, (3, 3), np.float32)
         cpu_output1 = self.cpu_op_exec(input_x1, 1, 1, 1)
         npu_output1 = self.npu_op_exec(input_x1, 1, 1, 1)
         self.assertRtolEqual(cpu_output1, npu_output1)
-    def test_renorm_3_3_0_0_1_float16(self, device):
+    def test_renorm_3_3_0_0_1_float16(self, device="npu"):
         input_x1 = self.generate_data(-10, 10, (3, 3), np.float16)
         input_x1_cpu = input_x1.float()
         cpu_output1 = self.cpu_op_exec(input_x1_cpu, 0, 0, 1).astype(np.float16)
         npu_output1 = self.npu_op_exec(input_x1, 0, 0, 1)
         self.assertRtolEqual(cpu_output1, npu_output1)
-    def test_renorm_3_3_0_0_1(self, device):
+    def test_renorm_3_3_0_0_1(self, device="npu"):
         input_x1 = self.generate_data(-10, 10, (3, 3), np.float32)
         cpu_output1 = self.cpu_op_exec(input_x1, 0, 0, 1)
         npu_output1 = self.npu_op_exec(input_x1, 0, 0, 1)
         self.assertRtolEqual(cpu_output1, npu_output1)
-    def test_renorm_3_3_4_0_1_float16(self, device):
+    def test_renorm_3_3_4_0_1_float16(self, device="npu"):
         input_x1 = self.generate_data(-1, 1, (3, 3), np.float16)
         input_x1_cpu = input_x1.float()
         cpu_output1 = self.cpu_op_exec(input_x1_cpu, 4, 0, 1).astype(np.float16)
         npu_output1 = self.npu_op_exec(input_x1, 4, 0, 1)
         self.assertRtolEqual(cpu_output1, npu_output1)
-    def test_renorm_3_3_1_1_1_float16(self, device):
+    def test_renorm_3_3_1_1_1_float16(self, device="npu"):
         input_x1 = self.generate_data(-1, 1, (3, 3), np.float16)
         input_x1_cpu = input_x1.float()
         cpu_output1 = self.cpu_op_exec(input_x1_cpu, 1, 1, 1).astype(np.float16)
         npu_output1 = self.npu_op_exec(input_x1, 1, 1, 1)
         self.assertRtolEqual(cpu_output1, npu_output1)
-    def test_renorm_3_3_1_0_1(self, device):
+    def test_renorm_3_3_1_0_1(self, device="npu"):
         input_x1 = self.generate_data(-1, 1, (3, 3), np.float32)
         cpu_output1 = self.cpu_op_exec(input_x1, 1, 0, 1)
         npu_output1 = self.npu_op_exec(input_x1, 1, 0, 1)
         self.assertRtolEqual(cpu_output1, npu_output1)
-    def test_renorm_3_3_3_3_1_1(self, device):
+    def test_renorm_3_3_3_3_1_1(self, device="npu"):
         input_x1 = self.generate_data(-1, 1, (3, 3, 3), np.float32)
         cpu_output1 = self.cpu_op_exec(input_x1, 3, 1, 1)
         npu_output1 = self.npu_op_exec(input_x1, 3, 1, 1)
         self.assertRtolEqual(cpu_output1, npu_output1)
-    def test_renorm_3_3_3_2_2_1(self, device):
+    def test_renorm_3_3_3_2_2_1(self, device="npu"):
         input_x1 = self.generate_data(-1, 1, (3, 3, 3), np.float32)
         cpu_output1 = self.cpu_op_exec(input_x1, 2, 2, 1)
         npu_output1 = self.npu_op_exec(input_x1, 2, 2, 1)
         self.assertRtolEqual(cpu_output1, npu_output1)
-    def test_renorm_3_3_3_3_2_0_1(self, device):
+    def test_renorm_3_3_3_3_2_0_1(self, device="npu"):
         input_x1 = self.generate_data(-1, 1, (3, 3, 3, 3), np.float32)
         cpu_output1 = self.cpu_op_exec(input_x1, 2, 0, 1)
         npu_output1 = self.npu_op_exec(input_x1, 2, 0, 1)
         self.assertRtolEqual(cpu_output1, npu_output1)
-    def test_renorm_3_3_3_3_3_3_1(self, device):
+    def test_renorm_3_3_3_3_3_3_1(self, device="npu"):
         input_x1 = self.generate_data(-1, 1, (3, 3, 3, 3), np.float32)
         cpu_output1 = self.cpu_op_exec(input_x1, 3, 3, 1)
         npu_output1 = self.npu_op_exec(input_x1, 3, 3, 1)
         self.assertRtolEqual(cpu_output1, npu_output1)
-    def test_renorm_3_3_3_3_3_4_4_1(self, device):
+    def test_renorm_3_3_3_3_3_4_4_1(self, device="npu"):
         input_x1 = self.generate_data(-1, 1, (3, 3, 3, 3, 3), np.float32)
         cpu_output1 = self.cpu_op_exec(input_x1, 4, 4, 1)
         npu_output1 = self.npu_op_exec(input_x1, 4, 4, 1)
         self.assertRtolEqual(cpu_output1, npu_output1)
-    def test_renorm_3_3_4_0_1_out(self, device):
+    def test_renorm_3_3_4_0_1_out(self, device="npu"):
         input_x1 = self.generate_data(-1, 1, (3, 3), np.float32)
         output_y = self.generate_data(-1, 1, (3, 3), np.float32)
         cpu_output1 = self.cpu_op_exec(input_x1, 4, 0, 1)
         npu_output1 = self.npu_op_exec_out(input_x1, 4, 0, 1, output_y)
         self.assertRtolEqual(cpu_output1, npu_output1)
-    def test_renorm_3_3_1_1_1_out(self, device):
+    def test_renorm_3_3_1_1_1_out(self, device="npu"):
         input_x1 = self.generate_data(-1, 1, (3, 3), np.float32)
         output_y = self.generate_data(-1, 1, (3, 3), np.float32)
         cpu_output1 = self.cpu_op_exec(input_x1, 1, 1, 1)
         npu_output1 = self.npu_op_exec_out(input_x1, 1, 1, 1, output_y)
         self.assertRtolEqual(cpu_output1, npu_output1)
-    def test_renorm_3_3_1_0_1_out(self, device):
+    def test_renorm_3_3_1_0_1_out(self, device="npu"):
         input_x1 = self.generate_data(-1, 1, (3, 3), np.float32)
         output_y = self.generate_data(-1, 1, (3, 3), np.float32)
         cpu_output1 = self.cpu_op_exec(input_x1, 1, 0, 1)
         npu_output1 = self.npu_op_exec_out(input_x1, 1, 0, 1, output_y)
         self.assertRtolEqual(cpu_output1, npu_output1)
-    def test_renorm_3_3_3_3_1_1_out(self, device):
+    def test_renorm_3_3_3_3_1_1_out(self, device="npu"):
         input_x1 = self.generate_data(-1, 1, (3, 3, 3), np.float32)
         output_y = self.generate_data(-1, 1, (3, 3, 3), np.float32)
         cpu_output1 = self.cpu_op_exec(input_x1, 3, 1, 1)
         npu_output1 = self.npu_op_exec_out(input_x1, 3, 1, 1, output_y)
         self.assertRtolEqual(cpu_output1, npu_output1)
-    def test_renorm_30_40_50_2_1_1_out_fp16(self, device):
+    def test_renorm_30_40_50_2_1_1_out_fp16(self, device="npu"):
         input_x1 = self.generate_data(-1, 1, (30, 40, 50), np.float16)
         output_y = self.generate_data(-1, 1, (30, 40, 50), np.float16)
         input_cpu = input_x1.float()
@@ -182,7 +182,7 @@ class TestRenorm(TestCase):
         npu_output1 = self.npu_op_exec_out(input_x1, 2, 1, 1, output_y)
         self.assertRtolEqual(cpu_output1, npu_output1)
-    def test_renorm_30_40_50_2_0_2_out_fp16(self, device):
+    def test_renorm_30_40_50_2_0_2_out_fp16(self, device="npu"):
         input_x1 = self.generate_data(-1, 1, (30, 40, 50), np.float16)
         output_y = self.generate_data(-1, 1, (30, 40, 50), np.float16)
         input_cpu = input_x1.float()
@@ -191,82 +191,82 @@ class TestRenorm(TestCase):
         npu_output1 = self.npu_op_exec_out(input_x1, 2, 0, 2, output_y)
         self.assertRtolEqual(cpu_output1, npu_output1)
-    def test_renorm_3_3_3_2_2_1_out(self, device):
+    def test_renorm_3_3_3_2_2_1_out(self, device="npu"):
         input_x1 = self.generate_data(-1, 1, (3, 3, 3), np.float32)
         output_y = self.generate_data(-1, 1, (3, 3, 3), np.float32)
         cpu_output1 = self.cpu_op_exec(input_x1, 2, 2, 1)
         npu_output1 = self.npu_op_exec_out(input_x1, 2, 2, 1, output_y)
         self.assertRtolEqual(cpu_output1, npu_output1)
-    def test_renorm_3_3_3_3_2_0_1_out(self, device):
+    def test_renorm_3_3_3_3_2_0_1_out(self, device="npu"):
         input_x1 = self.generate_data(-1, 1, (3, 3, 3, 3), np.float32)
         output_y = self.generate_data(-1, 1, (3, 3, 3, 3), np.float32)
         cpu_output1 = self.cpu_op_exec(input_x1, 2, 0, 1)
         npu_output1 = self.npu_op_exec_out(input_x1, 2, 0, 1, output_y)
         self.assertRtolEqual(cpu_output1, npu_output1)
-    def test_renorm_3_3_3_3_3_3_1_out(self, device):
+    def test_renorm_3_3_3_3_3_3_1_out(self, device="npu"):
         input_x1 = self.generate_data(-1, 1, (3, 3, 3, 3), np.float32)
         output_y = self.generate_data(-1, 1, (3, 3, 3, 3), np.float32)
         cpu_output1 = self.cpu_op_exec(input_x1, 3, 3, 1)
         npu_output1 = self.npu_op_exec_out(input_x1, 3, 3, 1, output_y)
         self.assertRtolEqual(cpu_output1, npu_output1)
-    def test_renorm_3_3_3_3_3_4_4_1_out(self, device):
+    def test_renorm_3_3_3_3_3_4_4_1_out(self, device="npu"):
         input_x1 = self.generate_data(-1, 1, (3, 3, 3, 3, 3), np.float32)
         output_y = self.generate_data(-1, 1, (3, 3, 3, 3, 3), np.float32)
         cpu_output1 = self.cpu_op_exec(input_x1, 4, 4, 1)
         npu_output1 = self.npu_op_exec_out(input_x1, 4, 4, 1, output_y)
         self.assertRtolEqual(cpu_output1, npu_output1)
-    def test_renorm_3_3_4_0_1_inplace(self, device):
+    def test_renorm_3_3_4_0_1_inplace(self, device="npu"):
         input_x1 = self.generate_data(-1, 1, (3, 3), np.float32)
         cpu_output1 = self.cpu_op_exec(input_x1, 4, 0, 1)
         npu_output1 = self.npu_op_exec_inplace(input_x1, 4, 0, 1)
         self.assertRtolEqual(cpu_output1, npu_output1)
-    def test_renorm_3_3_1_1_1_inplace(self, device):
+    def test_renorm_3_3_1_1_1_inplace(self, device="npu"):
         input_x1 = self.generate_data(-1, 1, (3, 3), np.float32)
         cpu_output1 = self.cpu_op_exec(input_x1, 1, 1, 1)
         npu_output1 = self.npu_op_exec_inplace(input_x1, 1, 1, 1)
         self.assertRtolEqual(cpu_output1, npu_output1)
-    def test_renorm_3_3_1_0_1_inplace(self, device):
+    def test_renorm_3_3_1_0_1_inplace(self, device="npu"):
         input_x1 = self.generate_data(-1, 1, (3, 3), np.float32)
         cpu_output1 = self.cpu_op_exec(input_x1, 1, 0, 1)
         npu_output1 = self.npu_op_exec_inplace(input_x1, 1, 0, 1)
         self.assertRtolEqual(cpu_output1, npu_output1)
-    def test_renorm_3_3_3_3_1_1_inplace(self, device):
+    def test_renorm_3_3_3_3_1_1_inplace(self, device="npu"):
         input_x1 = self.generate_data(-1, 1, (3, 3, 3), np.float32)
         cpu_output1 = self.cpu_op_exec(input_x1, 3, 1, 1)
         npu_output1 = self.npu_op_exec_inplace(input_x1, 3, 1, 1)
         self.assertRtolEqual(cpu_output1, npu_output1)
-    def test_renorm_3_3_3_2_2_1_inplace(self, device):
+    def test_renorm_3_3_3_2_2_1_inplace(self, device="npu"):
         input_x1 = self.generate_data(-1, 1, (3, 3, 3), np.float32)
         cpu_output1 = self.cpu_op_exec(input_x1, 2, 2, 1)
         npu_output1 = self.npu_op_exec_inplace(input_x1, 2, 2, 1)
         self.assertRtolEqual(cpu_output1, npu_output1)
-    def test_renorm_3_3_3_3_2_0_1_inplace(self, device):
+    def test_renorm_3_3_3_3_2_0_1_inplace(self, device="npu"):
         input_x1 = self.generate_data(-1, 1, (3, 3, 3, 3), np.float32)
         cpu_output1 = self.cpu_op_exec(input_x1, 2, 0, 1)
         npu_output1 = self.npu_op_exec_inplace(input_x1, 2, 0, 1)
         self.assertRtolEqual(cpu_output1, npu_output1)
-    def test_renorm_3_3_3_3_3_3_1_inplace(self, device):
+    def test_renorm_3_3_3_3_3_3_1_inplace(self, device="npu"):
         input_x1 = self.generate_data(-1, 1, (3, 3, 3, 3), np.float32)
         cpu_output1 = self.cpu_op_exec(input_x1, 3, 3, 1)
         npu_output1 = self.npu_op_exec_inplace(input_x1, 3, 3, 1)
         self.assertRtolEqual(cpu_output1, npu_output1)
-    def test_renorm_3_3_3_3_3_4_4_1_inplace(self, device):
+    def test_renorm_3_3_3_3_3_4_4_1_inplace(self, device="npu"):
         input_x1 = self.generate_data(-1, 1, (3, 3, 3, 3, 3), np.float32)
         cpu_output1 = self.cpu_op_exec(input_x1, 4, 4, 1)
         npu_output1 = self.npu_op_exec_inplace(input_x1, 4, 4, 1)
         self.assertRtolEqual(cpu_output1, npu_output1)
-instantiate_device_type_tests(TestRenorm, globals(), except_for='cpu')
+
 if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_network_ops/test_repeat.py b/test/test_network_ops/test_repeat.py
index 61db71bf0d039db8d2f06a525ee21440ac201386..85ada44a5a33fab4f9a0822625be089b54f4b32c 100644
--- a/test/test_network_ops/test_repeat.py
+++ b/test/test_network_ops/test_repeat.py
@@ -16,9 +16,8 @@
 import torch
 import torch_npu
 import numpy as np
-from torch_npu.testing.common_utils import TestCase, run_tests
-from torch_npu.testing.common_device_type import instantiate_device_type_tests
-from torch_npu.testing.util_test import create_common_tensor
+from torch_npu.testing.testcase import TestCase, run_tests
+from torch_npu.testing.common_utils import create_common_tensor
 class TestRepeat(TestCase):
     def cpu_op_exec(self, input1, size):
@@ -32,7 +31,7 @@ class TestRepeat(TestCase):
         output = output.numpy()
         return output
-    def test_repeat_common_shape_format(self, device):
+    def test_repeat_common_shape_format(self, device="npu"):
         shape_format = [
             [[np.float32, -1, (1280, 4)], [2,3]],
             [[np.float32, 0, (1, 6, 4)], [2, 4, 8]],
@@ -56,6 +55,6 @@ class TestRepeat(TestCase):
             npu_output = self.npu_op_exec(npu_input1, item[1])
             self.assertRtolEqual(cpu_output, npu_output)
-instantiate_device_type_tests(TestRepeat, globals(), except_for='cpu')
+
 if __name__ == "__main__":
     run_tests()
diff --git a/test/test_network_ops/test_repeat_interleave.py b/test/test_network_ops/test_repeat_interleave.py
index f0673b66d3f726c60d1aac06e6eecfb249aa0928..b984c78b22a539b5b10e2fe2a083f3a0aad7c826 100644
--- a/test/test_network_ops/test_repeat_interleave.py
+++ b/test/test_network_ops/test_repeat_interleave.py
@@ -16,9 +16,8 @@
 import torch
 import torch_npu
 import numpy as np
-from torch_npu.testing.common_utils import TestCase, run_tests
-from torch_npu.testing.common_device_type import instantiate_device_type_tests
-from torch_npu.testing.util_test import create_common_tensor
+from torch_npu.testing.testcase import TestCase, run_tests
+
 class TestRepeatInterleave(TestCase):
@@ -49,7 +48,7 @@ class TestRepeatInterleave(TestCase):
         output = output.numpy()
         return output
-    def test_repeat_interleave_float16(self, device):
+    def test_repeat_interleave_float16(self, device="npu"):
         npu_input1 = self.generate_data(0, 100, (3, 3, 3), np.float16)
         npu_input2 = np.random.randint(1, 100)
         npu_input3 = np.random.randint(0, 2)
         cpu_output = self.cpu_op_exec(npu_input1, npu_input2, npu_input3)
         npu_output = self.npu_op_exec(npu_input1, npu_input2, npu_input3)
         self.assertRtolEqual(cpu_output, npu_output)
-    def test_repeat_interleave_float32(self, device):
+    def test_repeat_interleave_float32(self, device="npu"):
         npu_input1 = self.generate_data(0, 100, (3, 3, 3), np.float32)
         npu_input2 = np.random.randint(1, 100)
         npu_input3 = np.random.randint(0, 2)
@@ -65,7 +64,7 @@ class TestRepeatInterleave(TestCase):
         npu_output = self.npu_op_exec(npu_input1, npu_input2, npu_input3)
         self.assertRtolEqual(cpu_output, npu_output)
-    def test_repeat_interleave_int32(self, device):
+    def test_repeat_interleave_int32(self, device="npu"):
         npu_input1 = self.generate_data(0, 100, (3, 3, 3), np.int32)
         npu_input2 = np.random.randint(1, 100)
         npu_input3 = np.random.randint(0, 2)
@@ -73,13 +72,13 @@ class TestRepeatInterleave(TestCase):
         cpu_output = self.cpu_op_exec(npu_input1, npu_input2, npu_input3)
         npu_output = self.npu_op_exec(npu_input1, npu_input2, npu_input3)
npu_input3) self.assertRtolEqual(cpu_output, npu_output) - def test_repeat_interleave_int32_without_dim(self, device): + def test_repeat_interleave_int32_without_dim(self, device="npu"): npu_input1 = self.generate_data(0, 100, (3, 3, 3), np.int32) npu_input2 = np.random.randint(1, 100) cpu_output = self.cpu_op_exec_without_dim(npu_input1, npu_input2) npu_output = self.npu_op_exec_without_dim(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestRepeatInterleave, globals(), except_for='cpu') + if __name__ == '__main__': run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_replication_pad1d.py b/test/test_network_ops/test_replication_pad1d.py index c7531ac50e776a546bc05b6fee1508c9e3fe0738..7351909892c87f6ff2f28733ae7999006da83198 100644 --- a/test/test_network_ops/test_replication_pad1d.py +++ b/test/test_network_ops/test_replication_pad1d.py @@ -16,9 +16,8 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor class TestReplicationPad1d(TestCase): @@ -47,7 +46,7 @@ class TestReplicationPad1d(TestCase): m = m.numpy() return m - def test_replicationPad1d_shape_format_fp16(self, device): + def test_replicationPad1d_shape_format_fp16(self, device="npu"): shape_format = [ [[np.float16, 0, (1, 2, 4)], [3, 1]], [[np.float16, 2, (1, 2, 4)], [3, 1]] @@ -60,7 +59,7 @@ class TestReplicationPad1d(TestCase): npu_output = self.npu_op_exec(npu_input1, item[1]) self.assertRtolEqual(cpu_output, npu_output) - def test_replicationPad1d_shape_format_fp32(self, device): + def test_replicationPad1d_shape_format_fp32(self, device="npu"): shape_format = [ [[np.float32, 0, (1, 2, 4)], [3, 1]], [[np.float32, 2, (1, 2, 4)], [3, 1]] @@ -71,7 +70,7 @@ class TestReplicationPad1d(TestCase): npu_output = self.npu_op_exec(npu_input1, item[1]) self.assertRtolEqual(cpu_output, npu_output) - def test_replicationPad1d_out_shape_format_fp16(self, device): + def test_replicationPad1d_out_shape_format_fp16(self, device="npu"): shape_format = [ [[np.float16, 0, (2, 17, 20)], [4, 3]], [[np.float16, 3, (2, 17, 20)], [4, 3]] @@ -86,7 +85,7 @@ class TestReplicationPad1d(TestCase): npu_output = self.npu_op_out_exec(npu_input1, item[1], npuout) self.assertRtolEqual(cpu_output, npu_output) - def test_replicationPad1d_out_shape_format_fp32(self, device): + def test_replicationPad1d_out_shape_format_fp32(self, device="npu"): shape_format = [ [[np.float32, 0, (2, 17, 20)], [4, 3]], [[np.float32, 3, (2, 17, 20)], [4, 3]] @@ -99,6 +98,6 @@ class TestReplicationPad1d(TestCase): npu_output = self.npu_op_out_exec(npu_input1, item[1], npuout) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestReplicationPad1d, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_replication_pad2d.py b/test/test_network_ops/test_replication_pad2d.py index 8cbd5a98ab037b3f295604ada9eadf2c806f5095..d33248b1df8c15699696735a3e2b9ad488fc12ab 100644 --- a/test/test_network_ops/test_replication_pad2d.py +++ b/test/test_network_ops/test_replication_pad2d.py @@ -16,9 +16,8 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils 
import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor class TestReplicationPad2d(TestCase): @@ -36,7 +35,7 @@ class TestReplicationPad2d(TestCase): m_n = m_n.numpy() return m_n - def test_replicationPad2d_shape_format_fp16(self, device): + def test_replicationPad2d_shape_format_fp16(self, device="npu"): shape_format = [ [[np.float16, 0, (1, 1, 4, 3)], [2, 2, 2, 2]], [[np.float16, 3, (1, 1, 4, 3)], 3] @@ -56,7 +55,7 @@ class TestReplicationPad2d(TestCase): npu_output = self.npu_op_exec(npu_input1, item[1]) self.assertRtolEqual(cpu_output, npu_output) - def test_replicationPad2d_out_shape_format_fp16(self, device): + def test_replicationPad2d_out_shape_format_fp16(self, device="npu"): shape_format = [ [[np.float16, 0, (1, 1, 4, 3)], [2, 2, 2, 2]], [[np.float16, 3, (1, 1, 4, 3)], 2] @@ -78,6 +77,5 @@ class TestReplicationPad2d(TestCase): self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestReplicationPad2d, globals(), except_for="cpu") if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_replication_pad2d_backward.py b/test/test_network_ops/test_replication_pad2d_backward.py index 733cc56e9863e4e12b3cc78240a961dda759d545..f6214dadbdc6e992d944735f9e7ede98e9591885 100644 --- a/test/test_network_ops/test_replication_pad2d_backward.py +++ b/test/test_network_ops/test_replication_pad2d_backward.py @@ -16,9 +16,8 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor class TestReplicationPad2dBackward(TestCase): @@ -33,7 +32,7 @@ class TestReplicationPad2dBackward(TestCase): output = output.detach().numpy() return output, npu_grad.cpu().numpy() - def test_replicationPad2d_backward_shape_format_fp16(self, device): + def test_replicationPad2d_backward_shape_format_fp16(self, device="npu"): shape_format = [ [[np.float16, 0, (1, 1, 27, 27)], [2, 2, 2, 2]], [[np.float16, 0, (1, 1, 27, 27)], 3] @@ -58,6 +57,5 @@ class TestReplicationPad2dBackward(TestCase): self.assertRtolEqual(cpu_grad, npu_grad) -instantiate_device_type_tests(TestReplicationPad2dBackward, globals(), except_for="cpu") if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_roi_align.py b/test/test_network_ops/test_roi_align.py index 2c8f43b373a5ce40dd194b0b5ef7161ee309eaef..b47b006fe872df917ec1a9b1fa57d71ed63e88a8 100644 --- a/test/test_network_ops/test_roi_align.py +++ b/test/test_network_ops/test_roi_align.py @@ -13,15 +13,12 @@ # limitations under the License. 
import torch import torch_npu -import numpy as np -from torch.nn import functional as F -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor, test_2args_broadcast, create_dtype_tensor, UT_FAST_MODE +from torch_npu.testing.testcase import TestCase, run_tests + class TestRoiAlign(TestCase): - def test_roi_align_fp32(self, device): + def test_roi_align_fp32(self, device="npu"): _input = torch.FloatTensor([[[[1, 2, 3 , 4, 5, 6], [7, 8, 9, 10, 11, 12], [13, 14, 15, 16, 17, 18], @@ -35,6 +32,6 @@ class TestRoiAlign(TestCase): out = torch_npu.npu_roi_align(_input, rois, 0.25, 3, 3, 2, 0) self.assertRtolEqual(expect_out, out.cpu()) -instantiate_device_type_tests(TestRoiAlign, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_roi_align_backward.py b/test/test_network_ops/test_roi_align_backward.py index 262cf08a56f41da6c07044700b3b913506f5bf87..1aa732138f837e55ff7d90b9063171607ed26ca1 100644 --- a/test/test_network_ops/test_roi_align_backward.py +++ b/test/test_network_ops/test_roi_align_backward.py @@ -14,14 +14,12 @@ import torch import torch_npu import numpy as np -from torch.nn import functional as F -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor, test_2args_broadcast, create_dtype_tensor, UT_FAST_MODE +from torch_npu.testing.testcase import TestCase, run_tests + class TestRoiAlignBackward(TestCase): - def test_roi_align_backward_fp32(self, device): + def test_roi_align_backward_fp32(self, device="npu"): _input = torch.FloatTensor([[[[1, 2, 3 , 4, 5, 6], [7, 8, 9, 10, 11, 12], [13, 14, 15, 16, 17, 18], @@ -44,6 +42,6 @@ class TestRoiAlignBackward(TestCase): gradout = torch_npu.npu_roi_alignbk(out, rois, _input.size(), 3, 3, 0.25, 2) self.assertRtolEqual(expect_gradout, gradout.cpu()) -instantiate_device_type_tests(TestRoiAlignBackward, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_roll.py b/test/test_network_ops/test_roll.py index 3a90588b9acc9167abb61c0643cb480cf7aad66a..7584f3deea43cd2c7c4e1f3b51caabe35aed76f5 100644 --- a/test/test_network_ops/test_roll.py +++ b/test/test_network_ops/test_roll.py @@ -16,9 +16,8 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests + class TestRoll(TestCase): def generate_data(self, min_d, max_d, shape, dtype): @@ -37,37 +36,37 @@ class TestRoll(TestCase): output = output.numpy() return output - def test_roll_3_4_5_float32(self, device): + def test_roll_3_4_5_float32(self, device="npu"): input_x1 = self.generate_data(-1, 1, (3, 4, 5), np.float32) cpu_output1 = self.cpu_op_exec(input_x1, [2, 1], [0, 1]) npu_output1 = self.npu_op_exec(input_x1, [2, 1], [0, 1]) self.assertRtolEqual(cpu_output1, npu_output1) - def test_roll_3_4_5_float16(self, device): + def test_roll_3_4_5_float16(self, device="npu"): input_x1 = self.generate_data(-1, 1, (3, 4, 5), np.float16) input_cpu = input_x1.float() cpu_output1 = 
self.cpu_op_exec(input_cpu, [2, 1], [0, 1]).astype(np.float16)
         npu_output1 = self.npu_op_exec(input_x1, [2, 1], [0, 1])
         self.assertRtolEqual(cpu_output1, npu_output1)
 
-    def test_roll_30_40_50_int32(self, device):
+    def test_roll_30_40_50_int32(self, device="npu"):
         input_x1 = self.generate_data(-1, 1, (30, 40, 50), np.int32)
         cpu_output1 = self.cpu_op_exec(input_x1, [20], [])
         npu_output1 = self.npu_op_exec(input_x1, [20], [])
         self.assertRtolEqual(cpu_output1, npu_output1)
 
-    def test_roll_20_30_40_50_uint8(self, device):
+    def test_roll_20_30_40_50_uint8(self, device="npu"):
         input_x1 = self.generate_data(-1, 1, (20, 30, 40, 50), np.uint8)
         cpu_output1 = self.cpu_op_exec(input_x1, [-20, 30], [-1, 0])
         npu_output1 = self.npu_op_exec(input_x1, [-20, 30], [-1, 0])
         self.assertRtolEqual(cpu_output1, npu_output1)
 
-    def test_roll_20_30_40_50_flaot32(self, device):
+    def test_roll_20_30_40_50_float32(self, device="npu"):
         input_x1 = self.generate_data(-1, 1, (20, 30, 40, 50), np.float32)
         cpu_output1 = self.cpu_op_exec(input_x1, [30], [3])
         npu_output1 = self.npu_op_exec(input_x1, [30], [3])
         self.assertRtolEqual(cpu_output1, npu_output1)
 
-instantiate_device_type_tests(TestRoll, globals(), except_for='cpu')
+
 if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_network_ops/test_roll_6d.py b/test/test_network_ops/test_roll_6d.py
index ce70f20c909f308b7d4e0d18d328a7f78c2e2f44..f60236ea738fd75aa9c654ecd362dd546fe1bae7 100644
--- a/test/test_network_ops/test_roll_6d.py
+++ b/test/test_network_ops/test_roll_6d.py
@@ -16,9 +16,8 @@
 import torch
 import torch_npu
 import numpy as np
-from torch_npu.testing.common_utils import TestCase, run_tests
-from torch_npu.testing.common_device_type import instantiate_device_type_tests
-from torch_npu.testing.util_test import create_common_tensor
+from torch_npu.testing.testcase import TestCase, run_tests
+
 
 class TestRoll6d(TestCase):
     def generate_data(self, min_d1, max_d1, shape1, dtype1):
@@ -37,12 +36,12 @@ class TestRoll6d(TestCase):
         output1 = output1.numpy()
         return output1
 
-    def test_roll_10_10_10_10_10_10_int8(self, device):
+    def test_roll_10_10_10_10_10_10_int8(self, device="npu"):
         input1 = self.generate_data(-1, 1, (10, 10, 10, 10, 10, 10), np.int8)
         cpu_output1 = self.cpu_op_exec(input1, [-20, 30, 5], [-3, -4, -5])
         npu_output1 = self.npu_op_exec(input1, [-20, 30, 5], [-3, -4, -5])
         self.assertRtolEqual(cpu_output1, npu_output1)
 
-instantiate_device_type_tests(TestRoll6d, globals(), except_for='cpu')
+
 if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_network_ops/test_rsub.py b/test/test_network_ops/test_rsub.py
index 9b2167d78f65e871adda07d8e4daedb14cb8d53f..6cb8fd3574ce2becf851c75449745a9fd17ee199 100644
--- a/test/test_network_ops/test_rsub.py
+++ b/test/test_network_ops/test_rsub.py
@@ -15,9 +15,8 @@
 import torch
 import torch_npu
 import numpy as np
-from torch_npu.testing.common_utils import TestCase, run_tests
-from torch_npu.testing.common_device_type import instantiate_device_type_tests
-from torch_npu.testing.util_test import create_common_tensor
+from torch_npu.testing.testcase import TestCase, run_tests
+from torch_npu.testing.common_utils import create_common_tensor
 
 
 class TestRsub(TestCase):
@@ -63,109 +62,108 @@ class TestRsub(TestCase):
             cpu_output = cpu_output.astype(npu_output_scalar.dtype)
             self.assertRtolEqual(cpu_output, npu_output_scalar)
 
-    def test_sub_shape_format_fp16_1d(self, device):
+    def test_sub_shape_format_fp16_1d(self, device="npu"):
         format_list = [-1, 0, 3]
shape_format = [[[np.float16, i, [32]], [np.float16, i, [32]]] for i in format_list] self.rsub_result(shape_format) - def test_sub_shape_format_fp32_1d(self, device): + def test_sub_shape_format_fp32_1d(self, device="npu"): format_list = [-1, 0, 3] shape_format = [[[np.float16, i, [32]], [np.float16, i, [32]]] for i in format_list] self.rsub_result(shape_format) - def test_sub_shape_format_fp16_2d(self, device): + def test_sub_shape_format_fp16_2d(self, device="npu"): format_list = [-1, 0, 3, 29] shape_format = [[[np.float16, i, [5, 3]], [np.float16, i, [5, 3]]] for i in format_list] self.rsub_result(shape_format) - def test_sub_shape_format_fp32_2d(self, device): + def test_sub_shape_format_fp32_2d(self, device="npu"): format_list = [-1, 0, 3, 29] shape_format = [[[np.float16, i, [5, 3]], [np.float16, i, [5, 3]]] for i in format_list] self.rsub_result(shape_format) - def test_sub_shape_format_fp16_3d(self, device): + def test_sub_shape_format_fp16_3d(self, device="npu"): format_list = [-1, 0, 3, 29] shape_format = [[[np.float16, i, [256, 480, 14]], [np.float16, i, [256, 480, 14]]] for i in format_list] self.rsub_result(shape_format) - def test_sub_shape_format_fp32_3d(self, device): + def test_sub_shape_format_fp32_3d(self, device="npu"): format_list = [-1, 0, 3, 29] shape_format = [[[np.float16, i, [256, 480, 14]], [np.float16, i, [256, 480, 14]]] for i in format_list] self.rsub_result(shape_format) - def test_sub_shape_format_fp16_4d(self, device): + def test_sub_shape_format_fp16_4d(self, device="npu"): format_list = [-1, 0, 3, 29] shape_format = [[[np.float16, i, [32, 3, 3, 3]], [np.float16, i, [32, 3, 3, 3]]] for i in format_list] self.rsub_result(shape_format) - def test_sub_shape_format_fp32_4d(self, device): + def test_sub_shape_format_fp32_4d(self, device="npu"): format_list = [-1, 0, 3, 29] shape_format = [[[np.float16, i, [32, 3, 3, 3]], [np.float16, i, [32, 3, 3, 3]]] for i in format_list] self.rsub_result(shape_format) # int------------------------------------------------------------------------------- - def test_sub_shape_format_int32_1d(self, device): + def test_sub_shape_format_int32_1d(self, device="npu"): format_list = [-1, 0] shape_format = [[[np.int32, i, [32]], [np.int32, i, [32]]] for i in format_list] self.rsub_result(shape_format) - def test_sub_shape_format_int32_2d(self, device): + def test_sub_shape_format_int32_2d(self, device="npu"): format_list = [-1, 0] shape_format = [[[np.int32, i, [5, 3]], [np.int32, i, [5, 3]]] for i in format_list] self.rsub_result(shape_format) - def test_sub_shape_format_int32_3d(self, device): + def test_sub_shape_format_int32_3d(self, device="npu"): format_list = [-1, 0] shape_format = [[[np.int32, i, [256, 480, 14]], [np.int32, i, [256, 480, 14]]] for i in format_list] self.rsub_result(shape_format) - def test_sub_shape_format_int32_4d(self, device): + def test_sub_shape_format_int32_4d(self, device="npu"): format_list = [-1, 0] shape_format = [[[np.int32, i, [32, 3, 3, 3]], [np.int32, i, [32, 3, 3, 3]]] for i in format_list] self.rsub_result(shape_format) # scalar---------------------------------------------------------------------------- - def test_sub_scalar_shape_format_fp16_1d(self, device): + def test_sub_scalar_shape_format_fp16_1d(self, device="npu"): format_list = [-1, 0] shape_format = [[[np.float16, i, [32]]] for i in format_list] self.rsub_scalar_result(shape_format) - def test_sub_scalar_shape_format_fp32_1d(self, device): + def test_sub_scalar_shape_format_fp32_1d(self, device="npu"): format_list = [-1, 0] 
shape_format = [[[np.float16, i, [32]]] for i in format_list] self.rsub_scalar_result(shape_format) - def test_sub_scalar_shape_format_fp16_2d(self, device): + def test_sub_scalar_shape_format_fp16_2d(self, device="npu"): format_list = [] shape_format = [[[np.float16, i, [32, 64]]] for i in format_list] self.rsub_scalar_result(shape_format) - def test_sub_scalar_shape_format_fp32_2d(self, device): + def test_sub_scalar_shape_format_fp32_2d(self, device="npu"): format_list = [] shape_format = [[[np.float16, i, [32, 64]]] for i in format_list] self.rsub_scalar_result(shape_format) - def test_sub_scalar_shape_format_fp16_3d(self, device): + def test_sub_scalar_shape_format_fp16_3d(self, device="npu"): format_list = [] shape_format = [[[np.float16, i, [32, 64, 128]]] for i in format_list] self.rsub_scalar_result(shape_format) - def test_sub_scalar_shape_format_fp32_3d(self, device): + def test_sub_scalar_shape_format_fp32_3d(self, device="npu"): format_list = [] shape_format = [[[np.float16, i, [32, 64, 128]]] for i in format_list] self.rsub_scalar_result(shape_format) - def test_sub_scalar_shape_format_fp16_4d(self, device): + def test_sub_scalar_shape_format_fp16_4d(self, device="npu"): format_list = [] shape_format = [[[np.float16, i, [32, 64, 128, 28]]] for i in format_list] self.rsub_scalar_result(shape_format) - def test_sub_scalar_shape_format_fp32_4d(self, device): + def test_sub_scalar_shape_format_fp32_4d(self, device="npu"): format_list = [] shape_format = [[[np.float16, i, [32, 64, 128, 28]]] for i in format_list] self.rsub_scalar_result(shape_format) -instantiate_device_type_tests(TestRsub, globals(), except_for="cpu") if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_scatter.py b/test/test_network_ops/test_scatter.py index fc74ab543d20756921f8c7802d309010206624e9..1936d3761b2035b2c6073afe29bee2c083a15739 100644 --- a/test/test_network_ops/test_scatter.py +++ b/test/test_network_ops/test_scatter.py @@ -18,9 +18,9 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestScatter(TestCase): def cpu_op_exec(self, shape, dim, index, src): @@ -51,7 +51,7 @@ class TestScatter(TestCase): input1 = input1.cpu() return input1.numpy() - def test_scatter_shape_format(self, device): + def test_scatter_shape_format(self, device="npu"): shape_format = [ [0, [3, 5], [np.float32, 0, [2, 5]]], [0, [3, 5], [np.float32, 3, [2, 5]]], @@ -86,6 +86,6 @@ class TestScatter(TestCase): npu_output = self.npu_op_exec_inplace(item[1], item[0], index, 1.23, False) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestScatter, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_scatter_add.py b/test/test_network_ops/test_scatter_add.py index 64aae16c27e5c7b5e26fc38ea4b2a839e107e769..9443d71b754beeafd033130b931c6644fa57d851 100644 --- a/test/test_network_ops/test_scatter_add.py +++ b/test/test_network_ops/test_scatter_add.py @@ -16,9 +16,9 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import 
create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestScatterAdd(TestCase): def cpu_op_exec_inp(self, input1, dim, index, src): @@ -43,7 +43,7 @@ class TestScatterAdd(TestCase): output = output.numpy() return output - def test_scatter_add_common_shape_format(self, device): + def test_scatter_add_common_shape_format(self, device="npu"): shape_format = [ [0, [np.int64, 0, [10, 20]], [np.float32, 0, [10, 20]], [np.float32, 0, [10, 20]]], [1, [np.int64, 0, [10, 20]], [np.float32, 0, [10, 20]], [np.float32, 0, [10, 20]]], @@ -67,7 +67,7 @@ class TestScatterAdd(TestCase): npu_inp_output = self.npu_op_exec_inp(npu_input3, item[0], npu_input1, npu_input2) self.assertRtolEqual(cpu_inp_output, npu_inp_output) - def test_scatter_add_float16_shape_format(self, device): + def test_scatter_add_float16_shape_format(self, device="npu"): def cpu_op_exec_inp_fp16(input1, dim, index, src): input1 = input1.to(torch.float32) src = src.to(torch.float32) @@ -104,6 +104,6 @@ class TestScatterAdd(TestCase): npu_inp_output = self.npu_op_exec_inp(npu_input3, item[0], npu_input1, npu_input2) self.assertRtolEqual(cpu_inp_output, npu_inp_output) -instantiate_device_type_tests(TestScatterAdd, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_scatterv1.py b/test/test_network_ops/test_scatterv1.py index 38d66bbbd3956fbb70950f99f2a82b24ab12f603..678118e548a123a5ec383f9cac85fcaf2054f7bd 100644 --- a/test/test_network_ops/test_scatterv1.py +++ b/test/test_network_ops/test_scatterv1.py @@ -15,9 +15,8 @@ import torch import torch_npu -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests + class TestScatterV1(TestCase): def npu_op_exec(self, input1, indices, updates, dim): @@ -26,7 +25,7 @@ class TestScatterV1(TestCase): output = output.numpy() return output - def test_scatterv1(self, device): + def test_scatterv1(self, device="npu"): input1_list = [[[1.6279, 0.1226], [0.9041, 1.0980]]] indices_list = [[0, 1]] updates_list = [[-1.1993, -1.5247]] @@ -45,7 +44,6 @@ class TestScatterV1(TestCase): output = self.npu_op_exec(input1_tensor, indices_tensor, updates_tensor, dim) self.assertRtolEqual(exoutput_tensor.numpy(), output) - -instantiate_device_type_tests(TestScatterV1, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_slogdet.py b/test/test_network_ops/test_slogdet.py index 0ba0af08bdec785ed56148cd8ccd311f65977450..2ea343f4fd6b1cfee97901197630fe2108e2200f 100644 --- a/test/test_network_ops/test_slogdet.py +++ b/test/test_network_ops/test_slogdet.py @@ -18,9 +18,9 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestSlogdet(TestCase): def cpu_op_exec(self, input1): @@ -37,7 +37,7 @@ class TestSlogdet(TestCase): logabsdet = logabsdet.numpy() return sign, logabsdet - def test_slogdet_shape_format(self, device): 
+ def test_slogdet_shape_format(self, device="npu"): shape_format = [ [np.float32, -1, (3, 3)], [np.float32, -1, (4, 3, 3)], @@ -50,6 +50,6 @@ class TestSlogdet(TestCase): self.assertRtolEqual(cpu_output, npu_output) self.assertRtolEqual(cpu_indices, npu_indices) -instantiate_device_type_tests(TestSlogdet, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_softmaxcrossentropywithlogits.py b/test/test_network_ops/test_softmaxcrossentropywithlogits.py index 3dc58dd5b7d928f6b2e02bab0619a3c1acdb8444..67acbbe8664a0620df01dc44ca7d8fd55909d53b 100644 --- a/test/test_network_ops/test_softmaxcrossentropywithlogits.py +++ b/test/test_network_ops/test_softmaxcrossentropywithlogits.py @@ -13,11 +13,9 @@ # limitations under the License. import torch import torch_npu -import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests + class TestSoftmaxCrossentropyWithLogits(TestCase): def npu_op_exec(self, input1, label): @@ -26,13 +24,13 @@ class TestSoftmaxCrossentropyWithLogits(TestCase): output = output.numpy() return output - def test_softmaxcross(self, device): + def test_softmaxcross(self, device="npu"): input1 = torch.tensor([[1.,2.,3.,4.]]).npu() label = torch.tensor([[1.,2.,3.,4.]]).npu() exresult = torch.tensor([14.4019]) output = self.npu_op_exec(input1, label) self.assertRtolEqual(exresult.numpy(), output) -instantiate_device_type_tests(TestSoftmaxCrossentropyWithLogits, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_sort_without_indices.py b/test/test_network_ops/test_sort_without_indices.py index 12f552940c798d3d1e1160e0d0cf303e5f5bb4de..591f2232711b7ce3dde7f0dd04ba0df3c955f334 100644 --- a/test/test_network_ops/test_sort_without_indices.py +++ b/test/test_network_ops/test_sort_without_indices.py @@ -17,11 +17,10 @@ import torch import torch_npu import numpy as np -from torch.nn import functional as F -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor, test_2args_broadcast, create_dtype_tensor, UT_FAST_MODE +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestSortWithoutIndices(TestCase): def cpu_default_op_exec(self, input1): @@ -46,7 +45,7 @@ class TestSortWithoutIndices(TestCase): output = output.numpy() return output - def test_sort_v2_shape_format(self, device): + def test_sort_v2_shape_format(self, device="npu"): shape_format = [ [[np.float16, 0, (1, 5000)]], [[np.float16, 0, (1, 50000)]], @@ -64,7 +63,7 @@ class TestSortWithoutIndices(TestCase): npu_output = self.npu_op_exec(npu_input1, item[1]) self.assertRtolEqual(cpu_output, npu_output) - def test_sort_v2_shape_format_big_range(self, device): + def test_sort_v2_shape_format_big_range(self, device="npu"): shape_format = [ [[np.float16, 0, (1, 5000)]], [[np.float16, 0, (1, 50000)]], @@ -83,6 +82,5 @@ class TestSortWithoutIndices(TestCase): self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestSortWithoutIndices, globals(), except_for="cpu") if __name__ == "__main__": run_tests() \ No newline 
at end of file diff --git a/test/test_network_ops/test_sub_sample.py b/test/test_network_ops/test_sub_sample.py index f6d95510ee74f69b21ec9db5a4cfcdecf086fded..a6ae808ccbccaac51b83101d4fe216593b20ae4b 100644 --- a/test/test_network_ops/test_sub_sample.py +++ b/test/test_network_ops/test_sub_sample.py @@ -16,9 +16,8 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests + class TestSubSample(TestCase): def get_num(self, input1, output): @@ -60,7 +59,7 @@ class TestSubSample(TestCase): if input_num0 >= size - size * fraction and output_num0 != size - size * fraction: self.fail(error_name) - def test_subsample(self, device): + def test_subsample(self, device="npu"): for _ in range(20): input1 = np.random.randint(-1, 2, size = (10)) npu_input = torch.from_numpy(input1).to("npu") @@ -72,6 +71,5 @@ class TestSubSample(TestCase): self.nummore_equal(getlist[0], getlist[1], getlist[2], getlist[3], 5, 0.6) -instantiate_device_type_tests(TestSubSample, globals(), except_for='cpu') if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_sum.py b/test/test_network_ops/test_sum.py index b226d637dff1f6be514d6af1e90d8dbc69184984..94bb384110da2eae24e78c41e599989dcdc7c45e 100644 --- a/test/test_network_ops/test_sum.py +++ b/test/test_network_ops/test_sum.py @@ -16,9 +16,9 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestSum(TestCase): def cpu_op_exec(self, input1): @@ -88,56 +88,56 @@ class TestSum(TestCase): cpu_output_dim = cpu_output_dim.astype(npu_output_dim.dtype) self.assertRtolEqual(cpu_output_dim, npu_output_dim) - def test_sum_shape_format_fp16_1d(self, device): + def test_sum_shape_format_fp16_1d(self, device="npu"): format_list = [0, 3] shape_format = [ [np.float16, i, [18]] for i in format_list ] self.sum_result(shape_format) - def test_sum_shape_format_fp32_1d(self, device): + def test_sum_shape_format_fp32_1d(self, device="npu"): format_list = [0, 3] shape_format = [ [np.float32, i, [18]] for i in format_list ] self.sum_result(shape_format) - def test_sum_shape_format_fp16_2d(self, device): + def test_sum_shape_format_fp16_2d(self, device="npu"): format_list = [0, 3, 29] shape_format = [ [np.float16, i, [256, 1000]] for i in format_list ] self.sum_result(shape_format) - def test_sum_shape_format_fp32_2d(self, device): + def test_sum_shape_format_fp32_2d(self, device="npu"): format_list = [0, 3, 29] shape_format = [ [np.float32, i, [256, 1000]] for i in format_list ] self.sum_result(shape_format) - def test_sum_shape_format_fp16_3d(self, device): + def test_sum_shape_format_fp16_3d(self, device="npu"): format_list = [0, 3, 29] shape_format = [ [np.float16, i, [32, 48, 64]] for i in format_list ] self.sum_result(shape_format) - def test_sum_shape_format_fp32_3d(self, device): + def test_sum_shape_format_fp32_3d(self, device="npu"): format_list = [0, 3, 29] shape_format = [ [np.float32, i, [32, 48, 64]] for i in format_list ] 
self.sum_result(shape_format) - def test_sum_shape_format_fp16_4d(self, device): + def test_sum_shape_format_fp16_4d(self, device="npu"): format_list = [0, 4, 29] shape_format = [ [np.float16, i, [32, 24, 18, 18]] for i in format_list ] self.sum_result(shape_format) - def test_sum_shape_format_fp32_4d(self, device): + def test_sum_shape_format_fp32_4d(self, device="npu"): format_list = [0, 3, 4, 29] shape_format = [ [np.float32, i, [32, 24, 18, 18]] for i in format_list @@ -146,35 +146,35 @@ class TestSum(TestCase): # --------sum dim--------------------- - def test_sum_dim_shape_format_fp16_1d(self, device): + def test_sum_dim_shape_format_fp16_1d(self, device="npu"): format_list = [0, 3] shape_format = [ [np.float16, i, [18]] for i in format_list ] self.sum_dim_result(shape_format) - def test_sum_dim_shape_format_fp32_1d(self, device): + def test_sum_dim_shape_format_fp32_1d(self, device="npu"): format_list = [0, 3] shape_format = [ [np.float32, i, [18]] for i in format_list ] self.sum_dim_result(shape_format) - def test_sum_dim_shape_format_fp16_2d(self, device): + def test_sum_dim_shape_format_fp16_2d(self, device="npu"): format_list = [0, 3, 29] shape_format = [ [np.float16, i, [256, 1000]] for i in format_list ] self.sum_dim_result(shape_format) - def test_sum_dim_shape_format_fp32_2d(self, device): + def test_sum_dim_shape_format_fp32_2d(self, device="npu"): format_list = [0, 3, 29] shape_format = [ [np.float32, i, [256, 1000]] for i in format_list ] self.sum_dim_result(shape_format) - def test_sum_dim_shape_format_fp16_3d(self, device): + def test_sum_dim_shape_format_fp16_3d(self, device="npu"): # TODO(ascend): Insufficient precision #format=29精度不满足 format_list = [0, 3, 29] format_list = [0, 3] @@ -183,28 +183,28 @@ class TestSum(TestCase): ] self.sum_dim_result(shape_format) - def test_sum_dim_shape_format_fp32_3d(self, device): + def test_sum_dim_shape_format_fp32_3d(self, device="npu"): format_list = [0, 3, 29] shape_format = [ [np.float32, i, [32, 48, 64]] for i in format_list ] self.sum_dim_result(shape_format) - def test_sum_dim_shape_format_fp16_4d(self, device): + def test_sum_dim_shape_format_fp16_4d(self, device="npu"): format_list = [0, 3] shape_format = [ [np.float16, i, [16, 16, 9, 9]] for i in format_list ] self.sum_dim_result(shape_format) - def test_sum_dim_shape_format_fp32_4d(self, device): + def test_sum_dim_shape_format_fp32_4d(self, device="npu"): format_list = [0, 3, 4] shape_format = [ [np.float32, i, [32, 24, 18, 18]] for i in format_list ] self.sum_dim_result(shape_format) - def test_sum_dim_with_zero_shape_format(self, device): + def test_sum_dim_with_zero_shape_format(self, device="npu"): format_list = [0, 3, 4] shape_format = [ [np.float32, i, [2, 0, 3]] for i in format_list @@ -213,6 +213,5 @@ class TestSum(TestCase): self.sum_result(shape_format) -instantiate_device_type_tests(TestSum, globals(), except_for="cpu") if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_symeig.py b/test/test_network_ops/test_symeig.py index 6f3939fa3db5b040cdefb69b25db92fa2a0df653..1901423f20e3356dc9e814e6254872a6cecb0b43 100644 --- a/test/test_network_ops/test_symeig.py +++ b/test/test_network_ops/test_symeig.py @@ -17,8 +17,8 @@ import torch import torch_npu -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.testcase import TestCase, run_tests + class TestSymeig(TestCase): def op_exec(self, input1, eigenvectorsflag): @@ -37,28 
+37,28 @@ class TestSymeig(TestCase): self.op_exec(input1, False) self.op_exec(input1, True) - def test_symeig_null(self, device): + def test_symeig_null(self, device="npu"): a = torch.randn(0, 0) self.op_exec(a, False) self.op_exec(a, True) - def test_symeig_2d(self, device): + def test_symeig_2d(self, device="npu"): a = torch.randn(5, 5, dtype = torch.float32) self.case_exec(a) - def test_symeig_3d(self, device): + def test_symeig_3d(self, device="npu"): a = torch.randn(10, 5, 5, dtype = torch.float32) self.case_exec(a) - def test_symeig_4d(self, device): + def test_symeig_4d(self, device="npu"): a = torch.randn(10, 3, 5, 5, dtype = torch.float32) self.case_exec(a) - def test_symeig_5d(self, device): + def test_symeig_5d(self, device="npu"): a = torch.randn(2, 10, 3, 5, 5, dtype = torch.float32) self.case_exec(a) - def test_symeig_out(self, device): + def test_symeig_out(self, device="npu"): a = torch.randn(2, 3, 3, dtype = torch.float32) a = a + a.transpose(-2, -1) an = a.npu() @@ -68,6 +68,6 @@ class TestSymeig(TestCase): ret = torch.matmul(v, torch.matmul(e.diag_embed(), v.transpose(-2, -1))) self.assertRtolEqual(ret.cpu(), a, prec = 1e-3) -instantiate_device_type_tests(TestSymeig, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_take.py b/test/test_network_ops/test_take.py index 4454b5e1542ff2c5e791714cd7dddcfd78ebc46d..7cbe07bd589703f532c247d91925a02da79a3804 100644 --- a/test/test_network_ops/test_take.py +++ b/test/test_network_ops/test_take.py @@ -18,9 +18,9 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestTake(TestCase): def cpu_op_out_exec(self, input1,input2, out): @@ -44,7 +44,7 @@ class TestTake(TestCase): output = output.to("cpu").numpy() return output - def test_take_shape_format(self, device): + def test_take_shape_format(self, device="npu"): shape_format = [ [[np.float32, 0, (5,3)], [np.int64, 0, (3)],8], [[np.int8, 0, (64, 10)], [np.int64,0, (10)],74], @@ -64,7 +64,7 @@ class TestTake(TestCase): cpu_output = cpu_output.astype(np.float16) self.assertRtolEqual(cpu_output, npu_output) - def test_take_out_shape_format(self, device): + def test_take_out_shape_format(self, device="npu"): shape_format = [ [[np.float32, 0, (5,3)], [np.int64, 0, (3)],8, [np.float32, 0, (3)]], [[np.int8, 0, (64, 10)], [np.int64,0, (10)],74, [np.int8, 0, (10)]], @@ -86,6 +86,6 @@ class TestTake(TestCase): cpu_output = cpu_output.astype(np.float16) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestTake, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_tanh.py b/test/test_network_ops/test_tanh.py index 28c8c0789fcc21e214d57dcce013778c6d60fe92..a1bcfb09c6a1ca3a6d6b3376447157a6a4037584 100644 --- a/test/test_network_ops/test_tanh.py +++ b/test/test_network_ops/test_tanh.py @@ -15,9 +15,8 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import 
TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor class TestTanh(TestCase): @@ -33,7 +32,7 @@ class TestTanh(TestCase): output = output.numpy() return output - def test_tanh_common_shape_format(self, device): + def test_tanh_common_shape_format(self, device="npu"): shape_format = [ [[np.float32, -1, (4, 3, 3)], 1, 100], [[np.float32, -1, (7,5,5)], 21474836,21474837], @@ -67,7 +66,7 @@ class TestTanh(TestCase): npu_output = self.npu_op_exec(npu_input1) self.assertRtolEqual(cpu_output, npu_output) - def test_tanh_float16_shape_format(self, device): + def test_tanh_float16_shape_format(self, device="npu"): def cpu_op_exec_fp16(input1): input1 = input1.to(torch.float32) output = torch.tanh(input1) @@ -106,7 +105,7 @@ class TestTanh(TestCase): npu_output = self.npu_op_exec(npu_input1) self.assertRtolEqual(cpu_output, npu_output) - def test_tanh_inplace_common_shape_format(self, device): + def test_tanh_inplace_common_shape_format(self, device="npu"): def cpu_op_inplace_exec(input1): output = torch.tanh_(input1) output = output.numpy() @@ -133,7 +132,5 @@ class TestTanh(TestCase): self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestTanh, globals(), except_for='cpu') - if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_tanh_backward.py b/test/test_network_ops/test_tanh_backward.py index 1e108d1960a8eaad461210ea4b5fb355935edc11..35f4642acae4d3ecab873fffb1a1fe280653afb1 100644 --- a/test/test_network_ops/test_tanh_backward.py +++ b/test/test_network_ops/test_tanh_backward.py @@ -15,9 +15,8 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor class TestTanhBackward(TestCase): @@ -38,7 +37,7 @@ class TestTanhBackward(TestCase): output = output.numpy() return output - def test_tanh_backward_common_shape_format(self, device): + def test_tanh_backward_common_shape_format(self, device="npu"): shape_format = [ [[np.float32, -1, (4, 3)], 1, 100], [[np.float32, -1, (7, 5, 5)], 21474836,21474837], @@ -63,7 +62,7 @@ class TestTanhBackward(TestCase): npu_output = self.npu_op_exec(npu_input1) self.assertRtolEqual(cpu_output, npu_output) - def test_tanh_backward_float16_shape_format(self, device): + def test_tanh_backward_float16_shape_format(self, device="npu"): def cpu_op_exec_fp16(input1): input1 = input1.to(torch.float32) input1.requires_grad = True @@ -95,7 +94,5 @@ class TestTanhBackward(TestCase): self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestTanhBackward, globals(), except_for='cpu') - if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_thnn_conv_depthwise2d_backward.py b/test/test_network_ops/test_thnn_conv_depthwise2d_backward.py index 7652fc32a9d7d4bf1910d24250576eae5c6e2c42..b267e1e59effe246b03e99bd642939f21d56d88f 100644 --- a/test/test_network_ops/test_thnn_conv_depthwise2d_backward.py +++ b/test/test_network_ops/test_thnn_conv_depthwise2d_backward.py @@ -19,9 +19,9 @@ import torch_npu import numpy as np import torch.nn as nn -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from 
torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + #TODO:The accuracy of the operator is not up to standard class TestThnnConvDepthwise2d(TestCase): @@ -104,7 +104,7 @@ class TestThnnConvDepthwise2d(TestCase): self.assertRtolEqual(self.input_grad[0].numpy(), self.input_grad[1].numpy(), prec16=1e-3) self.assertRtolEqual(self.weight_grad[0].numpy(), self.weight_grad[1].numpy(), prec16=1e-2) - def test_conv_depthwise2d_backward_shape_format_fp16(self, device): + def test_conv_depthwise2d_backward_shape_format_fp16(self, device="npu"): shape_format = [ # input, weight, padding, stride, dilation, bias [[np.float16, 0, [32, 32, 112, 112]], [np.float16, 0, [32, 1, 3, 3]], 0, 1, 1, True], [[np.float16, 0, [128, 232, 14, 14]], [np.float16, 0, [232, 1, 3, 3]], 1, [2, 2], 1, None], @@ -113,7 +113,7 @@ class TestThnnConvDepthwise2d(TestCase): ] self.conv_depthwise2d_backward_result(shape_format) - def test_conv_depthwise2d_backward_shape_format_fp32(self, device): + def test_conv_depthwise2d_backward_shape_format_fp32(self, device="npu"): shape_format = [ # input, weight, padding, stride, dilation, bias [[np.float32, 3, [32, 32, 12, 12]], [np.float32, 0, [32, 1, 3, 3]], 1, 1, 1, None], [[np.float32, 0, [32, 32, 12, 12]], [np.float32, 0, [32, 1, 3, 3]], 0, 1, 1, None], @@ -122,6 +122,6 @@ class TestThnnConvDepthwise2d(TestCase): ] self.conv_depthwise2d_backward_result(shape_format) -instantiate_device_type_tests(TestThnnConvDepthwise2d, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_thnn_conv_depthwise2d_forward.py b/test/test_network_ops/test_thnn_conv_depthwise2d_forward.py index 5d79d4a12823ebf19341ea6f62e5c2f323fe77f9..10f1083a4ffb53651e5e43be496364388d1910f4 100644 --- a/test/test_network_ops/test_thnn_conv_depthwise2d_forward.py +++ b/test/test_network_ops/test_thnn_conv_depthwise2d_forward.py @@ -17,9 +17,8 @@ import torch_npu import numpy as np import torch.nn as nn -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor class TestThnnConvDepthwise2d(TestCase): @@ -104,26 +103,26 @@ class TestThnnConvDepthwise2d(TestCase): else: self.assertRtolEqual(cpu_output.detach().numpy(), npu_output.detach().numpy() ) - def test_thnn_conv_depthwise2d_0(self, device): + def test_thnn_conv_depthwise2d_0(self, device="npu"): item = self.thnn_conv_depthwise2d_format(0) self.thnn_conv_depthwise2d_execute(item, 3) - def test_thnn_conv_depthwise2d_1(self, device): + def test_thnn_conv_depthwise2d_1(self, device="npu"): item = self.thnn_conv_depthwise2d_format(1) self.thnn_conv_depthwise2d_execute(item, 3) - def test_thnn_conv_depthwise2d_2(self, device): + def test_thnn_conv_depthwise2d_2(self, device="npu"): item = self.thnn_conv_depthwise2d_format(2) self.thnn_conv_depthwise2d_execute(item, 3) - def test_thnn_conv_depthwise2d_3(self, device): + def test_thnn_conv_depthwise2d_3(self, device="npu"): item = self.thnn_conv_depthwise2d_format(3) self.thnn_conv_depthwise2d_execute(item, 6) - def test_thnn_conv_depthwise2d_4(self, device): + def test_thnn_conv_depthwise2d_4(self, device="npu"): item = self.thnn_conv_depthwise2d_format(4) 
self.thnn_conv_depthwise2d_execute(item, 6) -instantiate_device_type_tests(TestThnnConvDepthwise2d, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_threshold.py b/test/test_network_ops/test_threshold.py index 4a9600a5b2537ca7dbe76bbb537575d36dbafde8..febb6cf36d0f48b1365c9e07a861a702ecdb4bb3 100644 --- a/test/test_network_ops/test_threshold.py +++ b/test/test_network_ops/test_threshold.py @@ -16,9 +16,9 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestThreshold(TestCase): @@ -33,7 +33,7 @@ class TestThreshold(TestCase): output = output.numpy() return output - def test_threshold_common_shape_format(self, device): + def test_threshold_common_shape_format(self, device="npu"): shape_format = [ [[np.float32, 0, (1,5)], [1.0], [20.0]], [[np.int32, 0, (1,5)], [2], [20]], @@ -46,7 +46,7 @@ class TestThreshold(TestCase): npu_output = self.npu_op_exec(npu_input1, npu_threshold, npu_value) self.assertRtolEqual(cpu_output, npu_output) - def test_threshold_inplace_common_shape_format(self, device): + def test_threshold_inplace_common_shape_format(self, device="npu"): def cpu_op_inplace_exec(input1, threshold, value): torch.nn.functional.threshold_(input1, threshold, value) output = input1.numpy() @@ -70,6 +70,6 @@ class TestThreshold(TestCase): npu_output = npu_op_inplace_exec(npu_input1, npu_threshold, npu_value) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestThreshold, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_threshold_backward.py b/test/test_network_ops/test_threshold_backward.py index dab2bea060e7df1653219ab33b6ef15166d14248..1307e10ed3042c19683ef469b84df41c59384e8e 100644 --- a/test/test_network_ops/test_threshold_backward.py +++ b/test/test_network_ops/test_threshold_backward.py @@ -16,9 +16,9 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestThresholdBackward(TestCase): @@ -40,7 +40,7 @@ class TestThresholdBackward(TestCase): output = output.detach().to("cpu") return output.numpy(), out.numpy() - def test_threshold_backward_common_shape_format(self, device): + def test_threshold_backward_common_shape_format(self, device="npu"): shape_format = [ [[np.float32, 0, (1,5)], [1.0], [20.0]], [[np.float32, 0, (2,3,5)], [2.0], [20.0]], @@ -63,6 +63,6 @@ class TestThresholdBackward(TestCase): self.assertRtolEqual(npu_output1.astype(np.float32), cpu_output1) self.assertRtolEqual(npu_output2.astype(np.float32), cpu_output2) -instantiate_device_type_tests(TestThresholdBackward, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_topk.py b/test/test_network_ops/test_topk.py index 9f2d8a205aedc02aec4e81e31ae4f3e7b9fb1153..375c1f84240c56c8e9afa76d0a45affbbc00d981 100644 --- a/test/test_network_ops/test_topk.py +++ 
b/test/test_network_ops/test_topk.py @@ -16,9 +16,9 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestTopK(TestCase): @@ -47,62 +47,62 @@ class TestTopK(TestCase): # 目前只支持fp16,fp32降低阈值判断 self.assertRtolEqual(cpu_output, npu_output, prec=1.e-1) - def test_topk_shape_format_fp16_1d(self, device): + def test_topk_shape_format_fp16_1d(self, device="npu"): format_list = [0, 3, 4, 29] shape_format = [ [np.float16, i, [18]] for i in format_list ] self.topk_result(shape_format) - def test_topk_shape_format_fp32_1d(self, device): + def test_topk_shape_format_fp32_1d(self, device="npu"): format_list = [0, 3, 4, 29] shape_format = [ [np.float32, i, [18]] for i in format_list ] self.topk_result(shape_format) - def test_topk_shape_format_fp16_2d(self, device): + def test_topk_shape_format_fp16_2d(self, device="npu"): format_list = [0, 3, 4, 29] shape_format = [ [np.float16, i, [5, 256]] for i in format_list ] self.topk_result(shape_format) - def test_topk_shape_format_fp32_2d(self, device): + def test_topk_shape_format_fp32_2d(self, device="npu"): format_list = [0, 3, 4, 29] shape_format = [ [np.float32, i, [5, 256]] for i in format_list ] self.topk_result(shape_format) - def test_topk_shape_format_fp16_3d(self, device): + def test_topk_shape_format_fp16_3d(self, device="npu"): format_list = [0, 3, 4, 29] shape_format = [ [np.float16, i, [32, 8, 8]] for i in format_list ] self.topk_result(shape_format) - def test_topk_shape_format_fp32_3d(self, device): + def test_topk_shape_format_fp32_3d(self, device="npu"): format_list = [0, 3, 4, 29] shape_format = [ [np.float32, i, [32, 8, 8]] for i in format_list ] self.topk_result(shape_format) - def test_topk_shape_format_fp16_4d(self, device): + def test_topk_shape_format_fp16_4d(self, device="npu"): format_list = [0, 3, 4, 29] shape_format = [ [np.float16, i, [64, 112, 7, 7]] for i in format_list ] self.topk_result(shape_format) - def test_topk_shape_format_fp32_4d(self, device): + def test_topk_shape_format_fp32_4d(self, device="npu"): format_list = [0, 3, 4, 29] shape_format = [ [np.float32, i, [64, 112, 7, 7]] for i in format_list ] self.topk_result(shape_format) -instantiate_device_type_tests(TestTopK, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_transpose.py b/test/test_network_ops/test_transpose.py index 9b6e4bbc98eab0d0fb0b6b009f3ddccb5db883ae..798d7d5e326b5ad7d988029251a1b9d793ae109a 100644 --- a/test/test_network_ops/test_transpose.py +++ b/test/test_network_ops/test_transpose.py @@ -16,12 +16,12 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestTransepose(TestCase): - def test_transepose(self, device): + def test_transepose(self, device="npu"): def cpu_op_exec(input1, perm): output = input1.permute(perm) output = output.numpy() @@ -46,6 +46,5 @@ class TestTransepose(TestCase): 
self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestTransepose, globals(), except_for='cpu') if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_uniform_.py b/test/test_network_ops/test_uniform_.py index de4a3a96697a825ce0f10b9cfddb1d427f283de6..7fd70bdd34acea96ebb7b2aaec93453cf0b6523f 100644 --- a/test/test_network_ops/test_uniform_.py +++ b/test/test_network_ops/test_uniform_.py @@ -13,14 +13,12 @@ # limitations under the License. import torch import torch_npu -import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests + class TestUniform(TestCase): - def test_uniform(self, device): + def test_uniform(self, device="npu"): shape_format = [ [(20,300), -100, 100, torch.float32], [(20,300), -100, 100, torch.float16] @@ -32,7 +30,7 @@ class TestUniform(TestCase): self.assertTrue(item[1] <= input1.min()) self.assertTrue(item[2] >= input1.max()) - def test_uniform_trans(self, device): + def test_uniform_trans(self, device="npu"): shape_format = [ [(20,300), -100, 100, torch.float32], ] @@ -45,6 +43,5 @@ class TestUniform(TestCase): self.assertTrue(item[2] >= input1.max()) -instantiate_device_type_tests(TestUniform, globals(), except_for='cpu') if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_unique2.py b/test/test_network_ops/test_unique2.py index 22aab9be6a0c35a569fa9eb382cba37449ebaec6..4f3c07d218dbdef9f65e0e3220b6fb8a16919ef7 100644 --- a/test/test_network_ops/test_unique2.py +++ b/test/test_network_ops/test_unique2.py @@ -18,12 +18,11 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests + class TestUnique2(TestCase): - def test_unique2(self, device): + def test_unique2(self, device="npu"): shape_format = [ [[np.uint8, (2, 3)], True, True, True], [[np.int8, (2, 3)], True, True, True], @@ -53,6 +52,6 @@ class TestUnique2(TestCase): self.assertRtolEqual(cpu_yInverse.numpy().astype(np.float32), npu_yInverse.cpu().numpy().astype(np.float32)) self.assertRtolEqual(cpu_yCounts.numpy().astype(np.float32), npu_yCounts.cpu().numpy().astype(np.float32)) -instantiate_device_type_tests(TestUnique2, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_upsample_linear1d.py b/test/test_network_ops/test_upsample_linear1d.py index 022736f795844495604ea58f78de8b16b4ba5ca0..0232218341a79072d6b71c430c19a4252b6bc75c 100644 --- a/test/test_network_ops/test_upsample_linear1d.py +++ b/test/test_network_ops/test_upsample_linear1d.py @@ -16,9 +16,9 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestUpsampleLinear1D(TestCase): def cpu_op_exec(self, input1, size, align_corners): @@ -36,7 +36,7 @@ class 
TestUpsampleLinear1D(TestCase):
         out_result = out_result.to("cpu")
         return output.numpy(), out_result.numpy()
 
-    def creat_shape_format1(self, device):
+    def creat_shape_format1(self, device="npu"):
         test_cases = [
             [[np.float16, 0, (1, 1, 1, 2)], [4, ], True],
             [[np.float16, 0, (2, 1, 1, 4)], [8, ], True],
@@ -80,7 +80,7 @@ class TestUpsampleLinear1D(TestCase):
         ]
         return test_cases
 
-    def test_upsample_linear1d(self, device):
+    def test_upsample_linear1d(self, device="npu"):
         for item in self.creat_shape_format1(device):
 
             cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
@@ -105,6 +105,6 @@ class TestUpsampleLinear1D(TestCase):
             self.assertRtolEqual(cpu_output, npu_output)
             self.assertRtolEqual(cpu_out_result, npu_out_result)
 
-instantiate_device_type_tests(TestUpsampleLinear1D, globals(), except_for="cpu")
+
 if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_network_ops/test_upsample_linear1d_backward.py b/test/test_network_ops/test_upsample_linear1d_backward.py
index 27ea48b33b49a95a1d25902eea5f0055ddd57683..4cbfa26a31fd57e62824e18504af0527a0ec92d6 100644
--- a/test/test_network_ops/test_upsample_linear1d_backward.py
+++ b/test/test_network_ops/test_upsample_linear1d_backward.py
@@ -16,12 +16,12 @@
 import torch
 import torch_npu
 import numpy as np
-from torch_npu.testing.common_utils import TestCase, run_tests
-from torch_npu.testing.common_device_type import instantiate_device_type_tests
-from torch_npu.testing.util_test import create_common_tensor
+from torch_npu.testing.testcase import TestCase, run_tests
+from torch_npu.testing.common_utils import create_common_tensor
+
 
 class TestUpsampleLinear1DBackward(TestCase):
-    def creat_shape_format(self, device):
+    def creat_shape_format(self, device="npu"):
         format_list = [0]
         align_list = [True, False]
         dtype_list = [np.float16, np.float32]
@@ -51,7 +51,7 @@ class TestUpsampleLinear1DBackward(TestCase):
         output = output.to("cpu")
         return output.detach().numpy(), gradnpu.detach().numpy()
 
-    def test_upsample_linear1d_backward(self, device):
+    def test_upsample_linear1d_backward(self, device="npu"):
         for item in self.creat_shape_format(device):
 
             cpu_input, npu_input = create_common_tensor(item, 0, 100)
@@ -94,6 +94,5 @@ class TestUpsampleLinear1DBackward(TestCase):
             self.assertRtolEqual(cpu_output, npu_output)
             self.assertRtolEqual(cpu_grad, npu_grad)
 
-instantiate_device_type_tests(TestUpsampleLinear1DBackward, globals(), except_for="cpu")
 if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_network_ops/test_var.py b/test/test_network_ops/test_var.py
index 3eaa731092f95f189243e5fbda6be4eda04de77c..6ef4ffd19384601ae3f89a280c23b830d11ae7a7 100644
--- a/test/test_network_ops/test_var.py
+++ b/test/test_network_ops/test_var.py
@@ -18,9 +18,9 @@
 import torch
 import torch_npu
 import numpy as np
-from torch_npu.testing.common_utils import TestCase, run_tests
-from torch_npu.testing.common_device_type import instantiate_device_type_tests
-from torch_npu.testing.util_test import create_common_tensor
+from torch_npu.testing.testcase import TestCase, run_tests
+from torch_npu.testing.common_utils import create_common_tensor
+
 
 class TestVar(TestCase):
     def cpu_op_exec(self, input1, unbiased=True):
@@ -143,7 +143,7 @@ class TestVar(TestCase):
         npu_input = npu_input.npu_format_cast(npuformat)
         return cpu_input, npu_input
 
-    def test_var_shape_format_fp16(self, device):
+    def test_var_shape_format_fp16(self, device="npu"):
         format_list = [-1]
         shape_list = [[32, 24], [32, 8, 24]]
         unbiased_list = [True, False]
@@
-158,7 +158,7 @@ class TestVar(TestCase): npu_output = self.npu_op_exec(npu_input, item[3]) self.assertRtolEqual(cpu_output, npu_output) - def test_var_shape_format_fp32(self, device): + def test_var_shape_format_fp32(self, device="npu"): format_list = [-1] shape_list = [[32, 24], [32, 8, 24]] unbiased_list = [True, False] @@ -171,7 +171,7 @@ class TestVar(TestCase): npu_output = self.npu_op_exec(npu_input, item[3]) self.assertRtolEqual(cpu_output, npu_output) - def test_var_dim_shape_format_fp16(self, device): + def test_var_dim_shape_format_fp16(self, device="npu"): format_list = [-1] shape_list = [[32, 24], [32, 8, 24]] dim_list = [0] @@ -189,7 +189,7 @@ class TestVar(TestCase): npu_output = self.npu_op_dim_exec(npu_input, item[3], item[4], item[5]) self.assertRtolEqual(cpu_output, npu_output) - def test_var_dim_shape_format_fp32(self, device): + def test_var_dim_shape_format_fp32(self, device="npu"): format_list = [-1] shape_list = [[32, 24], [32, 8, 24]] dim_list = [0] @@ -205,7 +205,7 @@ class TestVar(TestCase): npu_output = self.npu_op_dim_exec(npu_input, item[3], item[4], item[5]) self.assertRtolEqual(cpu_output, npu_output) - def test_var_names_dim_shape_format_fp16(self, device): + def test_var_names_dim_shape_format_fp16(self, device="npu"): format_list = [-1] shape_list1 = [[32, 24], [32, 8, 24]] dim_list = [0] @@ -223,7 +223,7 @@ class TestVar(TestCase): npu_output = self.npu_op_names_dim_exec(npu_input, item[3], item[4], item[5]) self.assertRtolEqual(cpu_output, npu_output) - def test_var_names_dim_shape_format_fp32(self, device): + def test_var_names_dim_shape_format_fp32(self, device="npu"): format_list = [-1] shape_list1 = [[32, 24], [32, 8, 24]] dim_list = [0] @@ -240,7 +240,7 @@ class TestVar(TestCase): self.assertRtolEqual(cpu_output, npu_output) - def test_var_out_shape_format_fp16(self, device): + def test_var_out_shape_format_fp16(self, device="npu"): format_list1 = [-1] shape_list = [[32, 24], [32, 8, 24]] dim_list = [0] @@ -261,7 +261,7 @@ class TestVar(TestCase): cpu_output1 = cpu_output1.astype(np.float16) self.assertRtolEqual(cpu_output1, npu_output1) - def test_var_out_shape_format_fp32(self, device): + def test_var_out_shape_format_fp32(self, device="npu"): format_list1 = [-1] shape_list = [[32, 24], [32, 8, 24]] dim_list = [0] @@ -279,7 +279,7 @@ class TestVar(TestCase): npu_output1 = self.npu_op_out_exec(npu_input1, item[3], npu_output, item[4], item[5]) self.assertRtolEqual(cpu_output1, npu_output1) - def test__var_shape_format_fp16(self, device): + def test__var_shape_format_fp16(self, device="npu"): format_list = [-1] shape_list = [[32, 24], [32, 8, 24]] unbiased_list = [True, False] @@ -309,7 +309,7 @@ class TestVar(TestCase): npu_output = self.npu_op_var_exec(npu_input, item[3]) self.assertRtolEqual(cpu_output, npu_output) - def test_var_mean_shape_format_fp16(self, device): + def test_var_mean_shape_format_fp16(self, device="npu"): format_list = [-1] shape_list = [[32, 24], [32, 8, 24]] unbiased_list = [True, False] @@ -327,7 +327,7 @@ class TestVar(TestCase): self.assertRtolEqual(cpu_output1, npu_output1) self.assertRtolEqual(cpu_output2, npu_output2) - def test_var_mean_shape_format_fp32(self, device): + def test_var_mean_shape_format_fp32(self, device="npu"): format_list = [-1] shape_list = [[32, 24], [32, 8, 24]] unbiased_list = [True, False] @@ -342,7 +342,7 @@ class TestVar(TestCase): self.assertRtolEqual(cpu_output1, npu_output1) self.assertRtolEqual(cpu_output2, npu_output2) - def test_var_mean_dim_shape_format_fp16(self, device): + def 
test_var_mean_dim_shape_format_fp16(self, device="npu"): format_list1 = [-1] shape_list1 = [[32, 24], [32, 8, 24]] dim_list = [0] @@ -362,7 +362,7 @@ class TestVar(TestCase): self.assertRtolEqual(cpu_output1, npu_output1) self.assertRtolEqual(cpu_output2, npu_output2) - def test_var_mean_dim_shape_format_fp32(self, device): + def test_var_mean_dim_shape_format_fp32(self, device="npu"): format_list = [-1] shape_list = [[32, 1024], [32, 8, 1024]] dim_list = [0] @@ -379,7 +379,7 @@ class TestVar(TestCase): self.assertRtolEqual(cpu_output1, npu_output1) self.assertRtolEqual(cpu_output2, npu_output2) - def test_var_mean_names_dim_shape_format_fp16(self, device): + def test_var_mean_names_dim_shape_format_fp16(self, device="npu"): shape = (1024, 8, 32) dimlist = ['N', 'C', 'H'] cpu_input = torch.rand(shape, dtype=torch.float32) @@ -395,7 +395,7 @@ class TestVar(TestCase): self.assertRtolEqual(cpu_output1, npu_output1) self.assertRtolEqual(cpu_output2, npu_output2) - def test_var_mean_names_dim_shape_format_fp32(self, device): + def test_var_mean_names_dim_shape_format_fp32(self, device="npu"): shape = (1024, 8, 32) dimlist = ['N', 'C', 'H'] cpu_input = torch.rand(shape, dtype=torch.float32, names=('N', 'C', 'H')) @@ -407,7 +407,7 @@ class TestVar(TestCase): self.assertRtolEqual(cpu_output1, npu_output1) self.assertRtolEqual(cpu_output2, npu_output2) - def test_var_dim_shape_format_5d_fp16(self, device): + def test_var_dim_shape_format_5d_fp16(self, device="npu"): format_list = [-1] shape_list = [[2, 94, 4, 52, 192]] dim_list = [0] @@ -425,6 +425,6 @@ class TestVar(TestCase): npu_output1 = self.npu_op_dim_exec(npu_input1, item[3], item[4], item[5]) self.assertRtolEqual(cpu_output1, npu_output1, prec16=0.004) -instantiate_device_type_tests(TestVar, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_where.py b/test/test_network_ops/test_where.py index b43fe49dfe3bf5b3b749643c57584408fb60c55d..4db59af042f71db8eaa9883e5b9df95cb76f67aa 100644 --- a/test/test_network_ops/test_where.py +++ b/test/test_network_ops/test_where.py @@ -13,12 +13,11 @@ # limitations under the License. 
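The test_network_ops hunks above all apply the same migration: import TestCase and run_tests from torch_npu.testing.testcase, take create_common_tensor from torch_npu.testing.common_utils, give the device argument an "npu" default, and drop the instantiate_device_type_tests(...) call. A minimal sketch of a test written in the migrated style follows, assuming an installed torch_npu; the TestSample class, its test body, and the dtype/format/shape values are illustrative and are not taken from this patch.

```
import torch
import torch_npu
import numpy as np

from torch_npu.testing.testcase import TestCase, run_tests
from torch_npu.testing.common_utils import create_common_tensor


class TestSample(TestCase):
    def test_zeroslike_sample_fp32(self, device="npu"):
        # create_common_tensor takes [dtype, npu_format, shape] plus a value range
        # and returns a matching (cpu, npu) tensor pair.
        cpu_input, npu_input = create_common_tensor([np.float32, 0, (32, 3, 3)], 0, 100)
        cpu_output = torch.zeros_like(cpu_input).numpy()
        npu_output = torch.zeros_like(npu_input).to("cpu").numpy()
        self.assertRtolEqual(cpu_output, npu_output)


if __name__ == "__main__":
    run_tests()
```

Such a file can be run directly with python, with run_tests() handing control to the standard unittest runner, so no device-type instantiation step is needed anymore.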
import torch import torch_npu -import torch.nn as nn import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor + class TestWhere(TestCase): def cpu_op_exec(self, input1): @@ -82,46 +81,46 @@ class TestWhere(TestCase): self.assertRtolEqual(cpu_output_cond, npu_output_cond) self.assertRtolEqual(cpu_output_s, npu_output_s) - def test_where_shape_format_fp32_1d(self, device): + def test_where_shape_format_fp32_1d(self, device="npu"): format_list = [0, 3] shape_format = [[np.float32, i, [18]] for i in format_list] self.where_result(shape_format) - def test_where_shape_format_fp32_2d(self, device): + def test_where_shape_format_fp32_2d(self, device="npu"): format_list = [0] shape_format = [[np.float32, i, [5, 256]] for i in format_list] self.where_result(shape_format) - def test_where_shape_format_fp32_3d(self, device): + def test_where_shape_format_fp32_3d(self, device="npu"): format_list = [0] shape_format = [[np.float32, i, [32, 3, 3]] for i in format_list] self.where_result(shape_format) - def test_where_shape_format_fp32_4d(self, device): + def test_where_shape_format_fp32_4d(self, device="npu"): format_list = [0, 3] shape_format = [[np.float32, i, [64, 112, 7, 7]] for i in format_list] self.where_result(shape_format) - def test_where_shape_format_fp16_1d(self, device): + def test_where_shape_format_fp16_1d(self, device="npu"): format_list = [0, 3] shape_format = [[np.float16, i, [18]] for i in format_list] self.where_result(shape_format) - def test_where_shape_format_fp16_2d(self, device): + def test_where_shape_format_fp16_2d(self, device="npu"): format_list = [0, 3, 4, 29] shape_format = [[np.float16, i, [5, 256]] for i in format_list] self.where_result(shape_format) - def test_where_shape_format_fp16_3d(self, device): + def test_where_shape_format_fp16_3d(self, device="npu"): format_list = [0, 3, 4, 29] shape_format = [[np.float16, i, [32, 3, 3]] for i in format_list] self.where_result(shape_format) - def test_where_shape_format_fp16_4d(self, device): + def test_where_shape_format_fp16_4d(self, device="npu"): format_list = [0, 3, 4, 29] shape_format = [[np.float16, i, [64, 112, 7, 7]] for i in format_list] self.where_result(shape_format) -instantiate_device_type_tests(TestWhere, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() diff --git a/test/test_network_ops/test_xor.py b/test/test_network_ops/test_xor.py index 9f4f916fa6cdc40e9de9c72392295908e1e536eb..f0e254befeef509646bbd5053ad1fd5812b30dbc 100644 --- a/test/test_network_ops/test_xor.py +++ b/test/test_network_ops/test_xor.py @@ -16,9 +16,8 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests + class TestXor(TestCase): @@ -96,82 +95,82 @@ class TestXor(TestCase): output = output.numpy() return output - def test_xor_tensor_int32(self, device): + def test_xor_tensor_int32(self, device="npu"): npu_input1 = self.generate_single_data(0, 100, (10, 10), np.int32) npu_input2 = self.generate_single_data(0, 100, (10, 10), np.int32) cpu_output = 
self.cpu_op_exec(npu_input1, npu_input2) npu_output = self.npu_op_exec(npu_input1, npu_input2) self.assertEqual(cpu_output, npu_output) - def test_xor_tensor_int16(self, device): + def test_xor_tensor_int16(self, device="npu"): npu_input1 = self.generate_single_data(0, 100, (10, 10), np.int16) npu_input2 = self.generate_single_data(0, 100, (10, 10), np.int16) cpu_output = self.cpu_op_exec(npu_input1, npu_input2) npu_output = self.npu_op_exec(npu_input1, npu_input2) self.assertEqual(cpu_output, npu_output) - def test_xor_tensor_int8(self, device): + def test_xor_tensor_int8(self, device="npu"): npu_input1 = self.generate_single_data(0, 100, (10, 10), np.int8) npu_input2 = self.generate_single_data(0, 100, (10, 10), np.int8) cpu_output = self.cpu_op_exec(npu_input1, npu_input2) npu_output = self.npu_op_exec(npu_input1, npu_input2) self.assertEqual(cpu_output, npu_output) - def test_xor_scalar_int32(self, device): + def test_xor_scalar_int32(self, device="npu"): npu_input = self.generate_single_data(0, 100, (1, 10), np.int32) npu_input_scalr = np.random.randint(0, 100) cpu_output = self.cpu_op_exec(npu_input, npu_input_scalr) npu_output = self.npu_op_exec_scalar(npu_input, npu_input_scalr) self.assertEqual(cpu_output, npu_output) - def test_xor_scalar_int16(self, device): + def test_xor_scalar_int16(self, device="npu"): npu_input = self.generate_single_data(0, 100, (10, 20), np.int16) npu_input_scalr = np.random.randint(0, 100) cpu_output = self.cpu_op_exec(npu_input, npu_input_scalr) npu_output = self.npu_op_exec_scalar(npu_input, npu_input_scalr) self.assertEqual(cpu_output, npu_output) - def test_xor_scalar_int8(self, device): + def test_xor_scalar_int8(self, device="npu"): npu_input = self.generate_single_data(0, 100, (20, 10), np.int8) npu_input_scalr = np.random.randint(0, 100) cpu_output = self.cpu_op_exec(npu_input, npu_input_scalr) npu_output = self.npu_op_exec_scalar(npu_input, npu_input_scalr) self.assertEqual(cpu_output, npu_output) - def test_xor_tensor_uint8(self, device): + def test_xor_tensor_uint8(self, device="npu"): npu_input1 = self.generate_single_data(0, 100, (10, 10), np.uint8) npu_input2 = self.generate_single_data(0, 100, (10, 10), np.uint8) cpu_output = self.cpu_op_exec(npu_input1, npu_input2) npu_output = self.npu_op_exec(npu_input1, npu_input2) self.assertEqual(cpu_output, npu_output) - def test_xor_scalar_uint8(self, device): + def test_xor_scalar_uint8(self, device="npu"): npu_input = self.generate_single_data(0, 100, (5, 10), np.uint8) npu_input_scalr = np.random.randint(0, 100) cpu_output = self.cpu_op_exec(npu_input, npu_input_scalr) npu_output = self.npu_op_exec_scalar(npu_input, npu_input_scalr) self.assertEqual(cpu_output, npu_output) - def test_xor_scalar_bool1(self, device): + def test_xor_scalar_bool1(self, device="npu"): npu_input = self.generate_single_bool_data((10, 10)) npu_input_scalr = True cpu_output = self.cpu_op_exec(npu_input, npu_input_scalr) npu_output = self.npu_op_exec_scalar(npu_input, npu_input_scalr) self.assertEqual(cpu_output, npu_output) - def test_xor_scalar_bool2(self, device): + def test_xor_scalar_bool2(self, device="npu"): npu_input = self.generate_single_bool_data((10, 10)) npu_input_scalr = False cpu_output = self.cpu_op_exec(npu_input, npu_input_scalr) npu_output = self.npu_op_exec_scalar(npu_input, npu_input_scalr) self.assertEqual(cpu_output, npu_output) - def test_xor_tensor_bool(self, device): + def test_xor_tensor_bool(self, device="npu"): npu_input1, npu_input2 = self.generate_bool_data((10, 10)) cpu_output = 
self.cpu_op_exec(npu_input1, npu_input2) npu_output = self.npu_op_exec(npu_input1, npu_input2) self.assertEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestXor, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_yolo_boxes_encode.py b/test/test_network_ops/test_yolo_boxes_encode.py index 2755be59381cc31b6a534676f6baa9d6a5606b70..23656d7918b5ce1473f6360c7111e67e1ed1785e 100644 --- a/test/test_network_ops/test_yolo_boxes_encode.py +++ b/test/test_network_ops/test_yolo_boxes_encode.py @@ -15,9 +15,8 @@ import torch import torch_npu -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests + class TestYoloBoxesEncode(TestCase): def npu_op_exec(self, anchor_boxes, gt_bboxes, stride, impl_mode=False): @@ -25,7 +24,7 @@ class TestYoloBoxesEncode(TestCase): out = out.to("cpu") return out.detach().numpy() - def test_yolo_boxes_encode(self, device): + def test_yolo_boxes_encode(self, device="npu"): anchor_boxes = [(2, 4)] gt_bboxes = [(2 ,4)] stride = [[2, 2]] @@ -45,6 +44,5 @@ class TestYoloBoxesEncode(TestCase): self.assertRtolEqual(exoutput_cpu_tensor.numpy(), npu_output) -instantiate_device_type_tests(TestYoloBoxesEncode, globals(), except_for='cpu') if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_zeroslike.py b/test/test_network_ops/test_zeroslike.py index 5e239509c43f74eafbd7b05feb3226de3b68b8e8..a014707ab2fc1581286c5d9b5eb89f57ea6a2f2a 100644 --- a/test/test_network_ops/test_zeroslike.py +++ b/test/test_network_ops/test_zeroslike.py @@ -18,9 +18,8 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor class TestZerosLike(TestCase): @@ -35,7 +34,7 @@ class TestZerosLike(TestCase): output = output.numpy() return output - def test_zeroslike_fp32(self, device): + def test_zeroslike_fp32(self, device="npu"): format_list = [0, 3, 29] shape_list = [1, (1000, 1280), (32, 3, 3), (32, 144, 1, 1)] shape_format = [ @@ -47,7 +46,7 @@ class TestZerosLike(TestCase): npu_output = self.npu_op_exec(npu_input, torch.float32) self.assertRtolEqual(cpu_output, npu_output) - def test_zeroslike_fp16(self, device): + def test_zeroslike_fp16(self, device="npu"): format_list = [0, 3, 29] shape_list = [1, (1000, 1280), (32, 3, 3), (32, 144, 1, 1)] shape_format = [ @@ -62,6 +61,5 @@ class TestZerosLike(TestCase): self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestZerosLike, globals(), except_for='cpu') if __name__ == "__main__": run_tests() diff --git a/test/test_npu.py b/test/test_npu.py index 26f9a8522e7dd38a1b832da9420534eff0c6c061..b8f63f24abdfba51a1b89f1f56771ec814e0c6b0 100644 --- a/test/test_npu.py +++ b/test/test_npu.py @@ -20,7 +20,8 @@ import gc import torch import torch_npu -from torch_npu.testing.common_utils import TestCase, run_tests, freeze_rng_state, SkipIfRocm +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import freeze_rng_state class 
TestNpu(TestCase): @@ -363,7 +364,6 @@ class TestNpu(TestCase): with self.assertRaisesRegex(ValueError, "Expected a npu device, but"): torch_npu.npu.synchronize("cpu") - @SkipIfRocm() def test_streams(self): default_stream = torch_npu.npu.current_stream() user_stream = torch_npu.npu.Stream() diff --git a/test/test_pt_profiler.py b/test/test_pt_profiler.py index 175e86593d8813961ffcee42fe228f079329f523..919d25ee423a00cf3ee5d2267fb9b3d80ca9a786 100644 --- a/test/test_pt_profiler.py +++ b/test/test_pt_profiler.py @@ -13,9 +13,11 @@ # limitations under the License. import torch -from torch_npu.testing.common_utils import TestCase, run_tests import torch_npu +from torch_npu.testing.testcase import TestCase, run_tests + + class SmallModel(torch.nn.Module): def __init__(self, in_channel, out_channel): super(SmallModel, self).__init__() @@ -185,11 +187,6 @@ class TestProfiler(TestCase): self.train(steps) prof.export_chrome_trace("./test_trace.prof") + if __name__ == '__main__': - try: - # to init the device - torch.rand(2,3).npu() - except Exception: - print("there is no npu device") - exit() run_tests() diff --git a/test/test_tensor.py b/test/test_tensor.py index e71b687685b5ff1706ae09003fa6273d530ca833..2c9c4f6318b92de5e1938119e6ee7439797558dc 100644 --- a/test/test_tensor.py +++ b/test/test_tensor.py @@ -1,27 +1,29 @@ -import tempfile -from itertools import product, combinations, combinations_with_replacement, permutations - +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import itertools import torch import torch_npu -from torch.testing._internal.common_utils import TestCase, run_tests -from torch.testing._internal.common_device_type import device_type_test_bases, \ - DeviceTypeTestBase, onlyOn, dtypes, instantiate_device_type_tests - - -def onlyNPU(fn): - return onlyOn('npu')(fn) - +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.decorator import Dtypes, instantiate_tests -class NPUTestBase(DeviceTypeTestBase): - device_type = 'npu' - -device_type_test_bases.append(NPUTestBase) +@instantiate_tests class TestTensor(TestCase): - - @onlyNPU - def test_narrow_empty(self, device): + + def test_narrow_empty(self, device="npu"): x = torch.randn(2, 3, 4).to(device=device) for d in range(x.dim()): y = x.narrow(d, x.size(d), 0) @@ -29,8 +31,7 @@ class TestTensor(TestCase): sz[d] = 0 self.assertEqual(sz, y.size()) - @onlyNPU - def test_tensor_set(self, device): + def test_tensor_set(self): t1 = torch.Tensor() t2 = torch.Tensor(3, 4, 9, 10).uniform_() t1.set_(t2) @@ -66,8 +67,7 @@ class TestTensor(TestCase): t1.set_(t2) self.assertEqual(t1.storage()._cdata, t2.storage()._cdata) - @onlyNPU - @dtypes(torch.half, torch.float) + @Dtypes(torch.half, torch.float) def test_cat_all_dtypes_and_devices(self, device, dtype): x = torch.tensor([[1, 2], [3, 4]], dtype=dtype, device=device) @@ -77,15 +77,13 @@ class TestTensor(TestCase): expected2 = torch.tensor([[1, 2, 1, 2], [3, 4, 3, 4]], dtype=dtype, device=device) self.assertEqual(torch.cat((x, x), 1).to('cpu'), expected2.to('cpu')) - @onlyNPU - def test_cat_mem_overlap(self, device): + def test_cat_mem_overlap(self, device="npu"): x = torch.rand((1, 3)).to(device).expand((6, 3)) y = torch.rand((3, 3)).to(device) with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): torch.cat([y, y], out=x) - @onlyNPU - def test_cat(self, device): + def test_cat(self, device="npu"): SIZE = 10 for dim in range(-3, 3): pos_dim = dim if dim >= 0 else 3 + dim @@ -94,9 +92,9 @@ class TestTensor(TestCase): z = torch.rand(19, SIZE, SIZE).to(device).transpose(0, pos_dim) res1 = torch.cat((x, y, z), dim) - self.assertEqual(res1.narrow(pos_dim, 0, 13).to('cpu'), x.to('cpu'), atol=0, rtol=0) - self.assertEqual(res1.narrow(pos_dim, 13, 17).to('cpu'), y.to('cpu'), atol=0, rtol=0) - self.assertEqual(res1.narrow(pos_dim, 30, 19).to('cpu'), z.to('cpu'), atol=0, rtol=0) + self.assertEqual(res1.narrow(pos_dim, 0, 13).to('cpu'), x.to('cpu')) + self.assertEqual(res1.narrow(pos_dim, 13, 17).to('cpu'), y.to('cpu')) + self.assertEqual(res1.narrow(pos_dim, 30, 19).to('cpu'), z.to('cpu')) x = torch.randn(20, SIZE, SIZE).to(device) self.assertEqual(torch.cat(torch.split(x, 7)).to('cpu'), x.to('cpu')) @@ -106,8 +104,7 @@ class TestTensor(TestCase): z = torch.cat([x, y]) self.assertEqual(z.size(), (21, SIZE, SIZE)) - @onlyNPU - def test_zeros(self, device): + def test_zeros(self, device="npu"): res1 = torch.zeros(100, 100, device=device) res2 = torch.tensor((), device=device) torch.zeros(100, 100, device=device, out=res2) @@ -127,8 +124,7 @@ class TestTensor(TestCase): expected = torch.tensor([[0.]], device=device, dtype=torch.half) self.assertEqual(bfloat16Tensor.to('cpu'), expected.to('cpu')) - @onlyNPU - def test_zeros_out(self, device): + def test_zeros_out(self, device="npu"): shape = (3, 4) out = torch.zeros(shape, device=device) torch.zeros(shape, device=device, out=out) @@ -147,8 +143,7 @@ class TestTensor(TestCase): self.assertEqual(torch.zeros(shape, device=device).to('cpu'), torch.zeros(shape, device=device, 
out=out).to('cpu')) - @onlyNPU - def test_ones(self, device): + def test_ones(self, device="npu"): res1 = torch.ones(100, 100, device=device) res2 = torch.tensor((), device=device) torch.ones(100, 100, device=device, out=res2) @@ -159,8 +154,7 @@ class TestTensor(TestCase): expected = torch.tensor([[True, True]], device=device, dtype=torch.bool) self.assertEqual(res1.to('cpu'), expected.to('cpu')) - @onlyNPU - def test_empty_strided(self, device): + def test_empty_strided(self, device="npu"): for shape in [(2, 3, 4), (0, 2, 0)]: # some of these cases are pretty strange, just verifying that if as_strided # allows them then empty_strided can as well. @@ -174,8 +168,7 @@ class TestTensor(TestCase): self.assertEqual(empty_strided.shape, as_strided.shape) self.assertEqual(empty_strided.stride(), as_strided.stride()) - @onlyNPU - def test_empty_tensor_props(self, device): + def test_empty_tensor_props(self, device="npu"): sizes = [(0,), (0, 3), (5, 0), (5, 0, 3, 0, 2), (0, 3, 0, 2), (0, 5, 0, 2, 0)] for size in sizes: x = torch.empty(tuple(size), device=device) @@ -185,9 +178,8 @@ class TestTensor(TestCase): y = torch.empty(tuple(size_ones_instead_of_zeros), device=device) self.assertEqual(x.stride(), y.stride()) - @onlyNPU - @dtypes(torch.half, torch.float) - def test_full_inference(self, device, dtype): + @Dtypes(torch.half, torch.float) + def test_full_inference(self, dtype): size = (2, 2) prev_default = torch.get_default_dtype() @@ -207,8 +199,7 @@ class TestTensor(TestCase): torch.set_default_dtype(prev_default) - @onlyNPU - def test_full_out(self, device): + def test_full_out(self, device="npu"): size = (5,) o = torch.empty(size, device=device, dtype=torch.long) @@ -220,10 +211,7 @@ class TestTensor(TestCase): self.assertEqual(torch.full(o.shape, 1., out=o).dtype, o.dtype) self.assertEqual(torch.full(size, 1, out=o).dtype, o.dtype) - # TODO: this test should be updated - - @onlyNPU - def test_ones_like(self, device): + def test_ones_like(self, device="npu"): expected = torch.ones(100, 100, device=device) res1 = torch.ones_like(expected) @@ -234,14 +222,12 @@ class TestTensor(TestCase): res1 = torch.ones_like(expected) self.assertEqual(res1.to('cpu'), expected.to('cpu')) - @onlyNPU - def test_zeros_like(self, device): + def test_zeros_like(self, device="npu"): expected = torch.zeros((100, 100,), device=device) res1 = torch.zeros_like(expected) self.assertEqual(res1.to('cpu'), expected.to('cpu')) -instantiate_device_type_tests(TestTensor, globals(), only_for='npu') if __name__ == '__main__': run_tests() \ No newline at end of file diff --git a/test/test_testing.py b/test/test_testing.py index f0c27366a9207a7ba54d1cdd995aace9faf20800..4d55d547f6f0e28536ec65f8c26012499d482578 100644 --- a/test/test_testing.py +++ b/test/test_testing.py @@ -11,18 +11,21 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
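The test_tensor.py changes above, and the test_testing.py changes that follow, replace onlyNPU, @dtypes, and instantiate_device_type_tests with the class-level @instantiate_tests decorator and the @Dtypes decorator from torch_npu.testing.decorator. The sketch below shows that shape on its own; the TestDtypeSample class and its test are hypothetical, and it assumes the decorators supply the device and dtype arguments, as the decorated signatures kept in this patch (for example test_cat_all_dtypes_and_devices) imply.

```
import torch
import torch_npu

from torch_npu.testing.testcase import TestCase, run_tests
from torch_npu.testing.decorator import Dtypes, instantiate_tests


@instantiate_tests  # expands the decorated tests into per-dtype variants
class TestDtypeSample(TestCase):

    @Dtypes(torch.half, torch.float)
    def test_ones_sample(self, device, dtype):
        # device and dtype are assumed to be injected by the decorators above.
        result = torch.ones((2, 2), device=device, dtype=dtype)
        expected = torch.ones((2, 2), dtype=dtype)
        self.assertEqual(result.to("cpu"), expected)


if __name__ == "__main__":
    run_tests()
```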
+ +import itertools import torch import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests, Dtypes, Formats -from torch_npu.testing.util_test import create_dtype_tensor +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_dtype_tensor +from torch_npu.testing.decorator import Dtypes, Formats, instantiate_tests # For testing TestCase methods and torch_npu.testing functions +@instantiate_tests class TestTesting(TestCase): + # Ensure that assertTensorSlowEqual handles npu arrays properly - @Dtypes(torch.int32, torch.bool, torch.half, torch.float) @Formats(0, 3, 4) def test_assert_tensor_slow_equal(self, device, dtype, npu_format): @@ -122,10 +125,8 @@ class TestTesting(TestCase): self.assertNotEqual(a_cpu, b_cpu, message=msg) self.assertNotEqual(a_cpu, b_npu, message=msg) self.assertNotEqual(a_npu, b_cpu, message=msg) - self.assertNotEqual(a_npu, b_npu, message=msg) - + self.assertNotEqual(a_npu, b_npu, message=msg) -instantiate_device_type_tests(TestTesting, globals(), except_for="cpu") if __name__ == '__main__': run_tests() \ No newline at end of file diff --git a/test/test_trans_contiguous/test_as_strided_copy_to_contiguous.py b/test/test_trans_contiguous/test_as_strided_copy_to_contiguous.py index 78d05c1681d58d283d81e2747e234597c27d9a08..7c5d57acbb62b189b2c951b9370e6bd92e793eae 100644 --- a/test/test_trans_contiguous/test_as_strided_copy_to_contiguous.py +++ b/test/test_trans_contiguous/test_as_strided_copy_to_contiguous.py @@ -17,9 +17,8 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor, check_operators_in_prof +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor, check_operators_in_prof os.environ["COMBINED_ENABLE"] = "1" # Open combined-view cases optimization @@ -37,7 +36,7 @@ class TestAsStridedCopyToContiguous(TestCase): output = output.cpu().numpy() return output - def test_as_strided(self, device): + def test_as_strided(self, device="npu"): dtype_list = [np.bool, np.int32, np.float16, np.float32, np.int8, np.uint8, np.int64] format_list = [-1] small_shape_list = [ @@ -65,6 +64,6 @@ class TestAsStridedCopyToContiguous(TestCase): npu_output = self.npu_op_exec(npu_input, item[1], item[2], item[3]) self.assertRtolEqual(cpu_output, npu_output) -instantiate_device_type_tests(TestAsStridedCopyToContiguous, globals(), except_for="cpu") + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_trans_contiguous/test_combined_flatten_x__copy_to_contiguous.py b/test/test_trans_contiguous/test_combined_flatten_x__copy_to_contiguous.py index b968da35139fd7c1e7cb51663f20b7903cf4b867..d87caf659eb6f202cf575b26bdc9656fffe4b93e 100644 --- a/test/test_trans_contiguous/test_combined_flatten_x__copy_to_contiguous.py +++ b/test/test_trans_contiguous/test_combined_flatten_x__copy_to_contiguous.py @@ -17,15 +17,14 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor, check_operators_in_prof +from torch_npu.testing.testcase import 
TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor, check_operators_in_prof os.environ["COMBINED_ENABLE"] = "1" # Open combined-view cases optimization # Note: NPU only support trans-contiguous with base format, so format_list uses -1 class CombinedFlattenXCopyToContiguous(TestCase): - def test_flatten_select_copy_contiguous(self, device): + def test_flatten_select_copy_contiguous(self, device="npu"): dtype_list1 = [np.float16, np.float32] format_list1 = [-1] shape_list1 = [ @@ -52,7 +51,7 @@ class CombinedFlattenXCopyToContiguous(TestCase): cpu_out2 = cpu_input.select(2,1).flatten(1).contiguous() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) - def test_flatten_strideslice_copy_contiguous(self, device): + def test_flatten_strideslice_copy_contiguous(self, device="npu"): dtype_list2 = [np.float16, np.float32] format_list2 = [-1] shape_list2 = [ @@ -79,7 +78,6 @@ class CombinedFlattenXCopyToContiguous(TestCase): cpu_out2 = cpu_input[:,2:20:3].flatten().contiguous() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) - -instantiate_device_type_tests(CombinedFlattenXCopyToContiguous, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_trans_contiguous/test_combined_reshape_x_copy_to_contiguous.py b/test/test_trans_contiguous/test_combined_reshape_x_copy_to_contiguous.py index 9aabeb1afde492925a663051eda7ec9153c5bef2..80d8281ddf16934777ded842e684030e5a949e8c 100644 --- a/test/test_trans_contiguous/test_combined_reshape_x_copy_to_contiguous.py +++ b/test/test_trans_contiguous/test_combined_reshape_x_copy_to_contiguous.py @@ -17,15 +17,14 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor, check_operators_in_prof +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor, check_operators_in_prof os.environ["COMBINED_ENABLE"] = "1" # Open combined-view cases optimization # Note: NPU only support trans-contiguous with base format, so format_list uses -1 class CombinedReshapeXCopyToContiguous(TestCase): - def test_view_permute_copy_contiguous(self, device): + def test_view_permute_copy_contiguous(self, device="npu"): dtype_list1 = [np.float16, np.float32] format_list1 = [-1] shape_list1 = [ @@ -65,7 +64,7 @@ class CombinedReshapeXCopyToContiguous(TestCase): .contiguous() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) - def test_view_select_copy_contiguous(self, device): + def test_view_select_copy_contiguous(self, device="npu"): dtype_list2 = [np.float16, np.float32] format_list2 = [-1] shape_list2 = [ @@ -98,7 +97,7 @@ class CombinedReshapeXCopyToContiguous(TestCase): cpu_out2 = cpu_input.select(2, 1).view(npu_input.size(1), npu_input.size(0), -1).contiguous() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) - def test_view_narrow_copy_contiguous(self, device): + def test_view_narrow_copy_contiguous(self, device="npu"): dtype_list3 = [np.float16, np.float32] format_list3 = [-1] shape_list3 = [ @@ -125,7 +124,7 @@ class CombinedReshapeXCopyToContiguous(TestCase): cpu_out2 = cpu_input[:,10:19,:,:].view(20, 360, 16).contiguous() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) - def test_view_strideslice_copy_contiguous(self, device): + def 
test_view_strideslice_copy_contiguous(self, device="npu"): dtype_list4 = [np.float16, np.float32] format_list4 = [-1] shape_list4 = [ @@ -152,7 +151,6 @@ class CombinedReshapeXCopyToContiguous(TestCase): cpu_out2 = cpu_input[10:19:3,:,:].view(3, 2400, 5).contiguous() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) - -instantiate_device_type_tests(CombinedReshapeXCopyToContiguous, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_trans_contiguous/test_combined_squeeze_x_copy_to_contiguous.py b/test/test_trans_contiguous/test_combined_squeeze_x_copy_to_contiguous.py index 6b9ba8e2aa5cf18c9cf2073c2c049c12dd55d965..9e709019a48f7a1625180548f8a7eebcfcd22ab2 100644 --- a/test/test_trans_contiguous/test_combined_squeeze_x_copy_to_contiguous.py +++ b/test/test_trans_contiguous/test_combined_squeeze_x_copy_to_contiguous.py @@ -17,15 +17,14 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor, check_operators_in_prof +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor, check_operators_in_prof os.environ["COMBINED_ENABLE"] = "1" # Open combined-view cases optimization # Note: NPU only support trans-contiguous with base format, so format_list uses -1 class CombinedSqueezeXCopyToContiguous(TestCase): - def test_squeeze_permute_copy_contiguous(self, device): + def test_squeeze_permute_copy_contiguous(self, device="npu"): dtype_list1 = [np.float16, np.float32] format_list1 = [-1] shape_list1 = [ @@ -53,7 +52,7 @@ class CombinedSqueezeXCopyToContiguous(TestCase): cpu_out2 = cpu_input.permute(1,0,3,2).squeeze(0).contiguous() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) - def test_squeeze_narrow_copy_contiguous(self, device): + def test_squeeze_narrow_copy_contiguous(self, device="npu"): dtype_list2 = [np.float16, np.float32] format_list2 = [-1] shape_list2 = [ @@ -81,7 +80,7 @@ class CombinedSqueezeXCopyToContiguous(TestCase): cpu_out2 = cpu_input[:,:,:,10:19].squeeze(1).contiguous() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) - def test_squeeze_select_copy_contiguous(self, device): + def test_squeeze_select_copy_contiguous(self, device="npu"): dtype_list3 = [np.float16, np.float32] format_list3 = [-1] shape_list3 = [ @@ -108,7 +107,7 @@ class CombinedSqueezeXCopyToContiguous(TestCase): cpu_out2 = cpu_input.select(2,1).squeeze().contiguous() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) - def test_squeeze_strideslice_copy_contiguous(self, device): + def test_squeeze_strideslice_copy_contiguous(self, device="npu"): dtype_list4 = [np.float16, np.float32] format_list4 = [-1] shape_list4 = [ @@ -135,7 +134,6 @@ class CombinedSqueezeXCopyToContiguous(TestCase): cpu_out2 = cpu_input[:,:,10:19:3].squeeze(1).contiguous() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) - -instantiate_device_type_tests(CombinedSqueezeXCopyToContiguous, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_trans_contiguous/test_combined_unsqueeze_x_copy_to_contiguous.py b/test/test_trans_contiguous/test_combined_unsqueeze_x_copy_to_contiguous.py index 
e604f12331fc98175764f5cbffcee39514e77207..f3699034001576097df28329273835f94074d568 100644 --- a/test/test_trans_contiguous/test_combined_unsqueeze_x_copy_to_contiguous.py +++ b/test/test_trans_contiguous/test_combined_unsqueeze_x_copy_to_contiguous.py @@ -17,15 +17,14 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor, check_operators_in_prof +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor, check_operators_in_prof os.environ["COMBINED_ENABLE"] = "1" # Open combined-view cases optimization # Note: NPU only support trans-contiguous with base format, so format_list uses -1 class CombinedUnsqueezeXCopyToContiguous(TestCase): - def test_unsqueeze_permute_copy_contiguous(self, device): + def test_unsqueeze_permute_copy_contiguous(self, device="npu"): dtype_list1 = [np.float16, np.float32] format_list1 = [-1] shape_list1 = [ @@ -53,7 +52,7 @@ class CombinedUnsqueezeXCopyToContiguous(TestCase): cpu_out2 = cpu_input.permute(1,0,2,3).unsqueeze(0).contiguous() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) - def test_unsqueeze_narrow_copy_contiguous(self, device): + def test_unsqueeze_narrow_copy_contiguous(self, device="npu"): dtype_list2 = [np.float16, np.float32] format_list2 = [-1] shape_list2 = [ @@ -85,7 +84,7 @@ class CombinedUnsqueezeXCopyToContiguous(TestCase): cpu_out2 = cpu_input[:,1:10].unsqueeze(2).contiguous() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) - def test_unsqueeze_select_copy_contiguous(self, device): + def test_unsqueeze_select_copy_contiguous(self, device="npu"): dtype_list3 = [np.float16, np.float32] format_list3 = [-1] shape_list3 = [ @@ -114,7 +113,7 @@ class CombinedUnsqueezeXCopyToContiguous(TestCase): cpu_out2 = cpu_input.select(1,1).unsqueeze(0).contiguous() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) - def test_unsqueeze_strideslice_copy_contiguous(self, device): + def test_unsqueeze_strideslice_copy_contiguous(self, device="npu"): dtype_list5 = [np.float16, np.float32] format_list5 = [-1] shape_list5 = [ @@ -142,7 +141,6 @@ class CombinedUnsqueezeXCopyToContiguous(TestCase): cpu_out2 = cpu_input[:,:,10:19:3].unsqueeze(0).contiguous() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) - -instantiate_device_type_tests(CombinedUnsqueezeXCopyToContiguous, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_trans_contiguous/test_combined_views_copy_to_contiguous.py b/test/test_trans_contiguous/test_combined_views_copy_to_contiguous.py index 5b843c9d5640f55a1ce845e90385b42219a5920d..ecb088eed58986ffc950ccef1e2d9dff8056205d 100644 --- a/test/test_trans_contiguous/test_combined_views_copy_to_contiguous.py +++ b/test/test_trans_contiguous/test_combined_views_copy_to_contiguous.py @@ -17,15 +17,14 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor, check_operators_in_prof +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor, check_operators_in_prof 
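After the import migration, the test_trans_contiguous files above share one structure: set COMBINED_ENABLE to turn on the combined-view optimization, build a (cpu, npu) pair with create_common_tensor on a base-format tensor, apply the same chain of view ops on both sides, and compare the .contiguous() results. A condensed sketch is below; the class name, shape, and permute order are illustrative, and the check_operators_in_prof helper these files also import is omitted here.

```
import os
import torch
import torch_npu
import numpy as np

from torch_npu.testing.testcase import TestCase, run_tests
from torch_npu.testing.common_utils import create_common_tensor

os.environ["COMBINED_ENABLE"] = "1"  # Open combined-view cases optimization


class TestViewCopySample(TestCase):
    def test_permute_contiguous_sample(self, device="npu"):
        # Format -1 keeps the base format; NPU only supports trans-contiguous with base formats.
        cpu_input, npu_input = create_common_tensor([np.float32, -1, (2, 6, 9, 4)], 0, 100)
        cpu_output = cpu_input.permute(1, 0, 3, 2).contiguous()
        npu_output = npu_input.permute(1, 0, 3, 2).contiguous()
        self.assertRtolEqual(npu_output.to("cpu").numpy(), cpu_output.numpy())


if __name__ == "__main__":
    run_tests()
```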
os.environ["COMBINED_ENABLE"] = "1" # Open combined-view cases optimization # Note: NPU only support trans-contiguous with base format, so format_list uses -1 class CombinedViewsCopyToContiguous(TestCase): - def test_permute_narrow_copy_contiguous(self, device): + def test_permute_narrow_copy_contiguous(self, device="npu"): dtype_list1 = [np.float16] format_list1 = [-1] shape_list1 = [ @@ -53,7 +52,7 @@ class CombinedViewsCopyToContiguous(TestCase): cpu_out2 = cpu_input[:,1:10].permute(1,0,3,2).contiguous() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) - def test_permute_select_copy_contiguous(self, device): + def test_permute_select_copy_contiguous(self, device="npu"): dtype_list2 = [np.float32] format_list2 = [-1] shape_list2 = [ @@ -81,7 +80,7 @@ class CombinedViewsCopyToContiguous(TestCase): cpu_out2 = cpu_input.select(1,0).permute(1,0,2).contiguous() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) - def test_permute_strideslice_copy_contiguous(self, device): + def test_permute_strideslice_copy_contiguous(self, device="npu"): dtype_list3 = [np.float16] format_list3 = [-1] shape_list3 = [ @@ -109,7 +108,7 @@ class CombinedViewsCopyToContiguous(TestCase): cpu_out2 = cpu_input[:,1:10:3].permute(1,3,0,2).contiguous() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) - def test_narrow_select_copy_contiguous(self, device): + def test_narrow_select_copy_contiguous(self, device="npu"): dtype_list4 = [np.float16, np.float32] format_list4 = [0, 3, 29] shape_list4 = [ @@ -153,7 +152,7 @@ class CombinedViewsCopyToContiguous(TestCase): self.assertRtolEqual(npu_out3.to("cpu").numpy(), cpu_out3.numpy()) self.assertRtolEqual(npu_out4.to("cpu").numpy(), cpu_out4.numpy()) - def test_narrow_strideslice_copy_contiguous(self, device): + def test_narrow_strideslice_copy_contiguous(self, device="npu"): dtype_list5 = [np.float32] format_list5 = [-1] shape_list5 = [ @@ -204,7 +203,7 @@ class CombinedViewsCopyToContiguous(TestCase): self.assertRtolEqual(npu_out4.to("cpu").numpy(), cpu_out4.numpy()) self.assertRtolEqual(npu_out5.to("cpu").numpy(), cpu_out5.numpy()) - def test_strideslice_select_contiguous(self, device): + def test_strideslice_select_contiguous(self, device="npu"): dtype_list6 = [np.float16] format_list6 = [-1] shape_list6 = [ @@ -247,7 +246,7 @@ class CombinedViewsCopyToContiguous(TestCase): self.assertRtolEqual(npu_out3.to("cpu").numpy(), cpu_out3.numpy()) self.assertRtolEqual(npu_out4.to("cpu").numpy(), cpu_out4.numpy()) - def test_broadcast_permute_contiguous(self, device): + def test_broadcast_permute_contiguous(self, device="npu"): dtype_list7 = [np.float16, np.float32] format_list7 = [-1] shape_list7 = [ @@ -269,6 +268,6 @@ class CombinedViewsCopyToContiguous(TestCase): cpu_out1 = cpu_input.expand(item[2][1]).transpose(1,3).contiguous() self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) -instantiate_device_type_tests(CombinedViewsCopyToContiguous, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_trans_contiguous/test_single_broadcast_copy_to_contiguous.py b/test/test_trans_contiguous/test_single_broadcast_copy_to_contiguous.py index 3c4816316197f1f04f2abb5aab54ccaf21f63bb4..71c0364c88941fd975c490f089143d926e5dc77f 100644 --- a/test/test_trans_contiguous/test_single_broadcast_copy_to_contiguous.py +++ b/test/test_trans_contiguous/test_single_broadcast_copy_to_contiguous.py @@ -17,15 +17,14 @@ import torch import torch_npu import numpy as np -from 
torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor, check_operators_in_prof +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor, check_operators_in_prof os.environ["COMBINED_ENABLE"] = "1" # Open combined-view cases optimization # Optimized view Ops contains Transpose, permute, narrow, strideslice, select, unfold class SingleViewCopyToContiguous(TestCase): - def test_broadcast_copy_contiguous(self, device): + def test_broadcast_copy_contiguous(self, device="npu"): dtype_list = [np.float16, np.float32, np.int32, np.int8, np.uint8] format_list = [-1] shape_list = [ @@ -53,6 +52,6 @@ class SingleViewCopyToContiguous(TestCase): cpu_out1 = cpu_input.expand(item[2][1]).contiguous() self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) -instantiate_device_type_tests(SingleViewCopyToContiguous, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_trans_contiguous/test_single_permute_copy_to_contiguous.py b/test/test_trans_contiguous/test_single_permute_copy_to_contiguous.py index 1d6d3d9af02be96ecef39b3576193d2d62b21659..eeeeba55662e523a4b7845e43d9c94eb0119ad26 100644 --- a/test/test_trans_contiguous/test_single_permute_copy_to_contiguous.py +++ b/test/test_trans_contiguous/test_single_permute_copy_to_contiguous.py @@ -17,15 +17,14 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor, check_operators_in_prof +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor, check_operators_in_prof os.environ["COMBINED_ENABLE"] = "1" # Open combined-view cases optimization # Optimized view Ops contains Transpose, permute, narrow, strideslice, select, unfold class SingleViewCopyToContiguous(TestCase): - def test_permute_copy_contiguous(self, device): + def test_permute_copy_contiguous(self, device="npu"): dtype_list = [np.float16, np.float32] format_list = [-1] shape_list = [[2, 6, 9, 4]] @@ -51,6 +50,6 @@ class SingleViewCopyToContiguous(TestCase): self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) -instantiate_device_type_tests(SingleViewCopyToContiguous, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_trans_contiguous/test_single_reshape_copy_to_contiguous.py b/test/test_trans_contiguous/test_single_reshape_copy_to_contiguous.py index 9f5ba790a1baf8056d5b40bc7f63221303f1cfe3..61e7fe8a3fe0ad39f0db248198e8c7b77b0cd191 100644 --- a/test/test_trans_contiguous/test_single_reshape_copy_to_contiguous.py +++ b/test/test_trans_contiguous/test_single_reshape_copy_to_contiguous.py @@ -17,15 +17,14 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor, check_operators_in_prof +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor, 
check_operators_in_prof os.environ["COMBINED_ENABLE"] = "1" # Open combined-view cases optimization # Optimized view Ops contains Transpose, permute, narrow, strideslice, select, unfold class SingleViewCopyToContiguous(TestCase): - def test_view_copy(self, device): + def test_view_copy(self, device="npu"): dtype_list1 = [np.float16, np.float32] format_list1 = [0, 3, 29] shape_list1 = [ @@ -68,7 +67,7 @@ class SingleViewCopyToContiguous(TestCase): cpu_out2 = cpu_input.view(1, 6, cpu_input.size(2)*cpu_input.size(3), 1).clone() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) - def test_unsqueeze_copy(self, device): + def test_unsqueeze_copy(self, device="npu"): dtype_list2 = [np.float16, np.float32] format_list2 = [2, 3, 29] shape_list2 = [ @@ -98,7 +97,7 @@ class SingleViewCopyToContiguous(TestCase): cpu_out = cpu_input.unsqueeze(i).clone() self.assertRtolEqual(npu_out.to("cpu").numpy(), cpu_out.numpy()) - def test_flatten_copy(self, device): + def test_flatten_copy(self, device="npu"): dtype_list3 = [np.float16, np.float32] format_list3 = [0, 3, 29] shape_list3 = [ @@ -125,7 +124,7 @@ class SingleViewCopyToContiguous(TestCase): cpu_out = torch.flatten(cpu_input, 0, 1).clone() self.assertRtolEqual(npu_out.to("cpu").numpy(), cpu_out.numpy()) - def test_narrow_at_first_axis_copy(self, device): + def test_narrow_at_first_axis_copy(self, device="npu"): # this case: slice at the first dim, tensor with offset remains contiguous dtype_list4 = [np.float16, np.float32] format_list4 = [2, 3, 29] @@ -173,7 +172,7 @@ class SingleViewCopyToContiguous(TestCase): cpu_out2 = cpu_input[1:10,:,:].clone() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) - def test_select_at_first_axis_to_single_element_tensor_copy(self, device): + def test_select_at_first_axis_to_single_element_tensor_copy(self, device="npu"): dtype_list5 = [torch.float32] format_list5 = [2, 3, 29] shape_format5 = [ @@ -208,7 +207,6 @@ class SingleViewCopyToContiguous(TestCase): cpu_out2 = cpu_input[0] + 1 self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) - -instantiate_device_type_tests(SingleViewCopyToContiguous, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_trans_contiguous/test_single_slice_copy_to_contiguous.py b/test/test_trans_contiguous/test_single_slice_copy_to_contiguous.py index 5357aab9973a8dcb1caa233985431ab55619e1f7..b6f4e0a23cffc5bb5b9f5f6d4ef92fd56c02eb8c 100644 --- a/test/test_trans_contiguous/test_single_slice_copy_to_contiguous.py +++ b/test/test_trans_contiguous/test_single_slice_copy_to_contiguous.py @@ -17,15 +17,14 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor, check_operators_in_prof +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor, check_operators_in_prof os.environ["COMBINED_ENABLE"] = "1" # Open combined-view cases optimization # Optimized view Ops contains Transpose, permute, narrow, strideslice, select, unfold class SingleViewCopyToContiguous(TestCase): - def test_narrow_copy_contiguous(self, device): + def test_narrow_copy_contiguous(self, device="npu"): # AssertionError: required dtype in [np.bool, np.int32, np.float16, np.float32, np.int8, np.uint8, np.int64] # However, considering the dtypes that 
Transdata supports, only np.float16, np.float32 are tested. dtype_list1 = [np.float16, np.float32] @@ -69,7 +68,7 @@ class SingleViewCopyToContiguous(TestCase): self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) self.assertRtolEqual(npu_out3.to("cpu").numpy(), cpu_out3.numpy()) - def test_strideslice_copy_contiguous(self, device): + def test_strideslice_copy_contiguous(self, device="npu"): dtype_list2 = [np.float16, np.float32, np.int8, np.int32, np.uint8, np.bool] format_list2 = [-1] shape_list2 = [[10,32,16,9], [10,32,16,9,10]] @@ -120,7 +119,7 @@ class SingleViewCopyToContiguous(TestCase): npu_out6 = npu_input[:,:,:,:,1:7:3].contiguous() self.assertRtolEqual(npu_out6.to("cpu").numpy(), cpu_out6.numpy()) - def test_select_copy_contiguous(self, device): + def test_select_copy_contiguous(self, device="npu"): dtype_list = [np.float16, np.float32] format_list = [-1] shape_list = [[2,32,16,9], [2,32,16,9,10]] @@ -138,7 +137,7 @@ class SingleViewCopyToContiguous(TestCase): cpu_out = cpu_input.select(dim,1).contiguous() self.assertRtolEqual(npu_out.to("cpu").numpy(), cpu_out.numpy()) - def test_span_axis_strideslice_contiguous(self, device): + def test_span_axis_strideslice_contiguous(self, device="npu"): dtype_list = [np.float16, np.float32] format_list = [-1] shape_list = [[32,8,2], [(8,6,2), (5,4,1), 1]] @@ -158,6 +157,6 @@ class SingleViewCopyToContiguous(TestCase): shape_list[1][0], shape_list[1][1], shape_list[1][2]).contiguous() self.assertRtolEqual(npu_out.to("cpu").numpy(), cpu_out.numpy()) -instantiate_device_type_tests(SingleViewCopyToContiguous, globals(), except_for='cpu') + if __name__ == "__main__": run_tests() \ No newline at end of file diff --git a/test/test_trans_contiguous/test_tri_combined_views_copy_to_contiguous.py b/test/test_trans_contiguous/test_tri_combined_views_copy_to_contiguous.py index 115d6059c9f3b953bfff5ea8eeb53412a98f320d..7febaa856c73a79fd64f953149b2c4861b1b8291 100644 --- a/test/test_trans_contiguous/test_tri_combined_views_copy_to_contiguous.py +++ b/test/test_trans_contiguous/test_tri_combined_views_copy_to_contiguous.py @@ -17,14 +17,13 @@ import torch import torch_npu import numpy as np -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor, check_operators_in_prof +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import create_common_tensor, check_operators_in_prof os.environ["COMBINED_ENABLE"] = "1" # Open combined-view cases optimization class TestTriCombinedViewsCopyToContiguous(TestCase): - def test_view_narrow_permute_copy_contiguous(self, device): + def test_view_narrow_permute_copy_contiguous(self, device="npu"): dtype_list1 = [np.float16, np.float32] format_list1 = [-1] shape_list1 = [ @@ -58,7 +57,7 @@ class TestTriCombinedViewsCopyToContiguous(TestCase): [:,:,1:10].contiguous() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) - def test_view_select_permute_copy_contiguous(self, device): + def test_view_select_permute_copy_contiguous(self, device="npu"): dtype_list2 = [np.float16, np.float32] format_list2 = [-1] shape_list2 = [ @@ -92,6 +91,6 @@ class TestTriCombinedViewsCopyToContiguous(TestCase): [:,:,2].contiguous() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) -instantiate_device_type_tests(TestTriCombinedViewsCopyToContiguous, globals(), except_for='cpu') + if __name__ == "__main__": 
run_tests() \ No newline at end of file diff --git a/torch_npu/testing/common_device_type.py b/torch_npu/testing/common_device_type.py deleted file mode 100644 index 37e74ff3fd9a85ed314bc15e6eab69a9f7dc5114..0000000000000000000000000000000000000000 --- a/torch_npu/testing/common_device_type.py +++ /dev/null @@ -1,561 +0,0 @@ -# Copyright (c) 2020 Huawei Technologies Co., Ltd -# Copyright (c) 2019, Facebook CORPORATION. -# All rights reserved. -# -# Licensed under the BSD 3-Clause License (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://opensource.org/licenses/BSD-3-Clause -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -import threading -from functools import wraps -import unittest -import torch -from torch.testing._internal.common_utils import TEST_MKL -from torch_npu.testing.common_utils import TestCase - -# Note: Generic Device-Type Testing -# -# [WRITING TESTS] -# -# Write your test class as usual except: -# (1) Each test method should have one of four signatures: -# -# (1a) testX(self, device) -# -# (1b) @DeviceCountAtLeast() -# testX(self, devices) -# -# (1c) @Dtypes() -# testX(self, device, dtype) -# -# (1d) @DeviceCountAtLeast() -# @Dtypes() -# testX(self, devices, dtype) -# -# -# Note that the decorators are required for signatures (1b), (1c) and -# (1d). -# -# When a test like (1a) is called it will be given a device string, -# like 'cpu' or 'npu:0.' -# -# Tests like (1b) are called with a list of device strings, like -# ['npu:0', 'npu:1']. The first device string will be the -# primary device. These tests will be skipped if the device type -# has fewer available devices than the argument to @DeviceCountAtLeast. -# -# Tests like (1c) are called with a device string and a torch.dtype from -# the list of dtypes specified in the @Dtypes decorator. Device-specific -# dtype overrides can be specified using @DtypesIfCPU and @DtypesIfNPU. -# -# Tests like (1d) take a devices argument like (1b) and a dtype -# argument from (1c). -# -# (2) Prefer using test decorators defined in this file to others. -# For example, using the @SkipIfNoLapack decorator instead of the -# @SkipCPUIfNoLapack will cause the test to not run on NPU if -# LAPACK is not available, which is wrong. If you need to use a decorator -# you may want to ask about porting it to this framework. -# -# See the TestTorchDeviceType class in test_torch.py for an example. -# -# [RUNNING TESTS] -# -# After defining your test class call instantiate_device_type_tests on it -# and pass in globals() for the second argument. This will instantiate -# discoverable device-specific test classes from your generic class. It will -# also hide the tests in your generic class so they're not run. -# -# If you device-generic test class is TestClass then new classes with names -# TestClass will be created for each available device type. -# TestClassCPU and TestClassNPU, for example. Tests in these classes also -# have the device type and dtype, if provided, appended to their original -# name. testX, for instance, becomes testX_ or -# testX__. -# -# More concretely, TestTorchDeviceType becomes TestTorchDeviceTypeCPU, -# TestTorchDeviceTypeNPU, ... 
test_diagonal in TestTorchDeviceType becomes -# test_diagonal_cpu, test_diagonal_npu, ... test_erfinv, which accepts a dtype, -# becomes test_erfinv_cpu_float, test_erfinv_cpu_double, test_erfinv_npu_half, -# ... -# -# These tests can be run directly like normal tests: -# "python test_torch.py TestTorchDeviceTypeCPU.test_diagonal_cpu" -# -# All the tests for a particular device type can be run using the class, and -# other collections of tests can be run using pytest filtering, like -# -# "pytest test_torch.py -k 'test_diag'" -# -# which will run test_diag on every available device. -# -# To specify particular device types the 'and' keyword can be used: -# -# "pytest test_torch.py -k 'test_erfinv and cpu'" -# -# will run test_erfinv on all cpu dtypes. -# -# [ADDING A DEVICE TYPE] -# -# To add a device type: -# -# (1) Create a new "TestBase" extending DeviceTypeTestBase. -# See CPUTestBase and NPUTestBase below. -# (2) Define the "device_type" attribute of the base to be the -# appropriate string. -# (3) Add logic to this file that appends your base class to -# device_type_test_bases when your device type is available. -# (4) (Optional) Write setUpClass/tearDownClass class methods that -# instantiate dependencies (see MAGMA in NPUTestBase). -# (5) (Optional) Override the "instantiate_test" method for total -# control over how your class creates tests. -# -# setUpClass is called AFTER tests have been created and BEFORE and ONLY IF -# they are run. This makes it useful for initializing devices and dependencies. -# -# Note [Overriding methods in generic tests] -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# Device generic tests look a lot like normal test classes, but they differ -# from ordinary classes in some important ways. In particular, overriding -# methods in generic tests doesn't work quite the way you expect. -# -# class TestFooDeviceType(TestCase): -# # Intention is to override -# def assertEqual(self, x, y): -# # This DOESN'T WORK! -# super(TestFooDeviceType, self).assertEqual(x, y) -# -# If you try to run this code, you'll get an error saying that TestFooDeviceType -# is not in scope. This is because after instantiating our classes, we delete -# it from the parent scope. Instead, you need to hardcode a direct invocation -# of the desired subclass call, e.g., -# -# class TestFooDeviceType(TestCase): -# # Intention is to override -# def assertEqual(self, x, y): -# TestCase.assertEqual(x, y) -# -# However, a less error-prone way of customizing the behavior of TestCase -# is to either (1) add your functionality to TestCase and make it toggled -# by a class attribute, or (2) create your own subclass of TestCase, and -# then inherit from it for your generic test. -# - -# List of device type test bases that can be used to instantiate tests. -# See below for how this list is populated. If you're adding a device type -# you should check if it's available and (if it is) add it to this list. -device_type_test_bases = [] - - -class DeviceTypeTestBase(TestCase): - device_type = 'generic_device_type' - - # Precision is a thread-local setting since it may be overridden per test - _tls = threading.local() - _tls.precision = TestCase.precision - - @property - def precision(self): - return self._tls.precision - - @precision.setter - def precision(self, prec): - self._tls.precision = prec - - # Returns a string representing the device that single device tests should use. - # Note: single device tests use this device exclusively. 
- @classmethod - def get_primary_device(cls): - return cls.device_type - - # Returns a list of strings representing all available devices of this - # device type. The primary device must be the first string in the list - # and the list must contain no duplicates. - # Note: UNSTABLE API. Will be replaced once PyTorch has a device generic - # mechanism of acquiring all available devices. - @classmethod - def get_all_devices(cls): - return [cls.get_primary_device()] - - # Returns the dtypes the test has requested. - # Prefers device-specific dtype specifications over generic ones. - @classmethod - def _get_dtypes(cls, test): - if not hasattr(test, 'dtypes'): - return None - return test.dtypes.get(cls.device_type, test.dtypes.get('all', None)) - - @classmethod - def _get_formats(cls, test): - if not hasattr(test, 'formats'): - return None - return test.formats.get(cls.device_type, test.formats.get('all', None)) - - def _get_precision_override(self, test, dtype): - if not hasattr(test, 'precision_overrides'): - return self.precision - return test.precision_overrides.get(dtype, self.precision) - - # Creates device-specific tests. - @classmethod - def instantiate_test(cls, name, test): - test_name = name + "_" + cls.device_type - - dtypes = cls._get_dtypes(test) - formats = cls._get_formats(test) - if dtypes is None and formats is None: # Test has no dtype and npu_format variants - assert not hasattr(cls, test_name), "Redefinition of test {0}".format(test_name) - - @wraps(test) - def instantiated_test(self, test=test): - device_arg = cls.get_primary_device() if not hasattr(test, 'num_required_devices') else cls.get_all_devices() - return test(self, device_arg) - - setattr(cls, test_name, instantiated_test) - - elif dtypes is None and formats: # Test has npu_format variants - for npu_format in formats: - format_str = str(npu_format) - format_test_name = test_name + "_" + format_str - assert not hasattr(cls, format_test_name), "Redefinition of test {0}".format(format_test_name) - - @wraps(test) - def instantiated_test(self, test=test, npu_format=npu_format): - device_arg = cls.get_primary_device() if not hasattr(test, - 'num_required_devices') else cls.get_all_devices() - # Sets precision and runs test - # Note: precision is reset after the test is run - guard_precision = self.precision - try: - result = test(self, device_arg, npu_format) - finally: - self.precision = guard_precision - - return result - - setattr(cls, format_test_name, instantiated_test) - - elif formats and dtypes: # Test has dtype and npu_format variants - for npu_format in formats: - for dtype in dtypes: - dtype_str = str(dtype).split('.')[1] - format_str = str(npu_format) - format_dtype_test_name = test_name + "_" + dtype_str + "_" + format_str - assert not hasattr(cls, format_dtype_test_name), "Redefinition of test {0}".format(format_dtype_test_name) - - @wraps(test) - def instantiated_test(self, test=test, dtype=dtype, npu_format=npu_format): - device_arg = cls.get_primary_device() if not hasattr(test, - 'num_required_devices') else cls.get_all_devices() - # Sets precision and runs test - # Note: precision is reset after the test is run - guard_precision = self.precision - try: - self.precision = self._get_precision_override(test, dtype) - result = test(self, device_arg, dtype, npu_format) - finally: - self.precision = guard_precision - - return result - - setattr(cls, format_dtype_test_name, instantiated_test) - - elif formats is None and dtypes: # Test has dtype variants - for dtype in dtypes: - dtype_str = 
str(dtype).split('.')[1] - dtype_test_name = test_name + "_" + dtype_str - assert not hasattr(cls, dtype_test_name), "Redefinition of test {0}".format(dtype_test_name) - - @wraps(test) - def instantiated_test(self, test=test, dtype=dtype): - device_arg = cls.get_primary_device() if not hasattr(test, 'num_required_devices') else cls.get_all_devices() - # Sets precision and runs test - # Note: precision is reset after the test is run - guard_precision = self.precision - try : - self.precision = self._get_precision_override(test, dtype) - result = test(self, device_arg, dtype) - finally: - self.precision = guard_precision - - return result - - setattr(cls, dtype_test_name, instantiated_test) - - -class NPUTestBase(DeviceTypeTestBase): - device_type = 'npu' - - -class CPUTestBase(DeviceTypeTestBase): - device_type = 'cpu' - - -# Adds available device-type-specific test base classes -device_type_test_bases.append(CPUTestBase) -device_type_test_bases.append(NPUTestBase) - - -# Adds 'instantiated' device-specific test cases to the given scope. -# The tests in these test cases are derived from the generic tests in -# generic_test_class. -# See note "Generic Device Type Testing." -def instantiate_device_type_tests(generic_test_class, scope, except_for=None): - # Removes the generic test class from its enclosing scope so its tests - # are not discoverable. - del scope[generic_test_class.__name__] - - # Creates an 'empty' version of the generic_test_class - # Note: we don't inherit from the generic_test_class directly because - # that would add its tests to our test classes and they would be - # discovered (despite not being runnable). Inherited methods also - # can't be removed later, and we can't rely on load_tests because - # pytest doesn't support it (as of this writing). - empty_name = generic_test_class.__name__ + "_base" - empty_class = type(empty_name, generic_test_class.__bases__, {}) - - # Acquires members names - # See Note [Overriding methods in generic tests] - generic_members = set(generic_test_class.__dict__.keys()) - set(empty_class.__dict__.keys()) - generic_tests = [x for x in generic_members if x.startswith('test')] - - # Creates device-specific test cases - for base in device_type_test_bases: - # Skips bases listed in except_for - if except_for is not None and base.device_type in except_for: - continue - - class_name = generic_test_class.__name__ + base.device_type.upper() - device_type_test_class = type(class_name, (base, empty_class), {}) - - for name in generic_members: - if name in generic_tests: # Instantiates test member - # Requires tests be a function for Python2 compat - # (In Python2 tests are type checked methods wrapping functions) - test = getattr(generic_test_class, name) - if hasattr(test, '__func__'): - test = test.__func__ - assert inspect.isfunction(test), "Couldn't extract function from '{0}'".format(name) - - # Instantiates the device-specific tests - device_type_test_class.instantiate_test(name, test) - else: # Ports non-test member - assert name not in device_type_test_class.__dict__, "Redefinition of directly defined member {0}".format(name) - - # Unwraps to functions (when available) for Python2 compat - nontest = getattr(generic_test_class, name) - if hasattr(nontest, '__func__'): - nontest = nontest.__func__ - - setattr(device_type_test_class, name, nontest) - - # Mimics defining the instantiated class in the caller's file - # by setting its module to the given class's and adding - # the module to the given scope. 
- # This lets the instantiated class be discovered by unittest. - device_type_test_class.__module__ = generic_test_class.__module__ - scope[class_name] = device_type_test_class - - -# Decorator that skips a test if the given condition is true. -# Notes: -# (1) Skip conditions stack. -# (2) Skip conditions can be bools or strings. If a string the -# test base must have defined the corresponding attribute to be False -# for the test to run. If you want to use a string argument you should -# probably define a new decorator instead (see below). -# (3) Prefer the existing decorators to defining the 'device_type' kwarg. -class SkipIf(object): - - def __init__(self, dep, reason, device_type=None): - self.dep = dep - self.reason = reason - self.device_type = device_type - - def __call__(self, fn): - - @wraps(fn) - def dep_fn(slf, device, *args, **kwargs): - if self.device_type is None or self.device_type == slf.device_type: - if (isinstance(self.dep, str) and getattr(slf, self.dep, True)) or (isinstance(self.dep, bool) and self.dep): - raise unittest.SkipTest(self.reason) - - return fn(slf, device, *args, **kwargs) - return dep_fn - - -# Skips a test on CPU if the condition is true. -class SkipCPUIf(SkipIf): - - def __init__(self, dep, reason): - super(SkipCPUIf, self).__init__(dep, reason, device_type='cpu') - - -class ExpectedFailure(object): - - def __init__(self, device_type): - self.device_type = device_type - - def __call__(self, fn): - - @wraps(fn) - def efail_fn(slf, device, *args, **kwargs): - if self.device_type is None or self.device_type == slf.device_type: - try: - fn(slf, device, *args, **kwargs) - except Exception: - return - else: - slf.fail('expected test to fail, but it passed') - - return fn(slf, device, *args, **kwargs) - return efail_fn - - -class OnlyOn(object): - - def __init__(self, device_type): - self.device_type = device_type - - def __call__(self, fn): - - @wraps(fn) - def only_fn(slf, device, *args, **kwargs): - if self.device_type != slf.device_type: - reason = "Only runs on {0}".format(self.device_type) - raise unittest.SkipTest(reason) - - return fn(slf, device, *args, **kwargs) - - return only_fn - - -# Decorator that provides all available devices of the device type to the test -# as a list of strings instead of providing a single device string. -# Skips the test if the number of available devices of the variant's device -# type is less than the 'num_required_devices' arg. -class DeviceCountAtLeast(object): - - def __init__(self, num_required_devices): - self.num_required_devices = num_required_devices - - def __call__(self, fn): - assert not hasattr(fn, 'num_required_devices'), "DeviceCountAtLeast redefinition for {0}".format(fn.__name__) - fn.num_required_devices = self.num_required_devices - - @wraps(fn) - def multi_fn(slf, devices, *args, **kwargs): - if len(devices) < self.num_required_devices: - reason = "fewer than {0} devices detected".format(self.num_required_devices) - raise unittest.SkipTest(reason) - - return fn(slf, devices, *args, **kwargs) - - return multi_fn - - -# Specifies per-dtype precision overrides. -# Ex. -# -# @PrecisionOverride(torch.half : 1e-2, torch.float : 1e-4) -# @Dtypes(torch.half, torch.float, torch.double) -# def test_X(self, device, dtype): -# ... -# -# When the test is instantiated its class's precision will be set to the -# corresponding override, if it exists. -# self.precision can be accessed directly, and it also controls the behavior of -# functions like self.assertEqual(). 
-# -# Note that self.precision is a scalar value, so if you require multiple -# precisions (or are working with multiple dtypes) they should be specified -# explicitly and computed using self.precision (e.g. -# self.precision *2, max(1, self.precision)). -class PrecisionOverride(object): - - def __init__(self, d): - assert isinstance(d, dict), "PrecisionOverride not given a dtype : precision dict!" - for dtype, prec in d.items(): - assert isinstance(dtype, torch.dtype), "PrecisionOverride given unknown dtype {0}".format(dtype) - - self.d = d - - def __call__(self, fn): - fn.precision_overrides = self.d - return fn - - -# Decorator that instantiates a variant of the test for each given dtype. -# Notes: -# (1) Tests that accept the dtype argument MUST use this decorator. -# (2) Can be overridden for the CPU or NPU, respectively, using DtypesIfCPU -# or DtypesIfNPU. -# (3) Prefer the existing decorators to defining the 'device_type' kwarg. -class Dtypes(object): - - # Note: *args, **kwargs for Python2 compat. - # Python 3 allows (self, *args, device_type='all'). - def __init__(self, *args, **kwargs): - assert args is not None and len(args) != 0, "No dtypes given" - assert all(isinstance(arg, torch.dtype) for arg in args), "Unknown dtype in {0}".format(str(args)) - self.args = args - self.device_type = kwargs.get('device_type', 'all') - - def __call__(self, fn): - d = getattr(fn, 'dtypes', {}) - assert self.device_type not in d, "dtypes redefinition for {0}".format(self.device_type) - d[self.device_type] = self.args - fn.dtypes = d - return fn - -class Formats(object): - - def __init__(self, *args, **kwargs): - assert args is not None and len(args) != 0, "No formats given" - self.args = args - self.device_type = kwargs.get('device_type', 'all') - - def __call__(self, fn): - d = getattr(fn, 'formats', {}) - assert self.device_type not in d, "formats redefinition for {0}".format(self.device_type) - d[self.device_type] = self.args - fn.formats = d - return fn - -# Overrides specified Dtypes on the CPU. -class DtypesIfCPU(Dtypes): - - def __init__(self, *args): - super(DtypesIfCPU, self).__init__(*args, device_type='cpu') - - -def only_npu(fn): - return OnlyOn('npu')(fn) - - -def only_cpu(fn): - return OnlyOn('cpu')(fn) - - -# Skips a test on CPU if LAPACK is not available. -class SkipCPUIfNoLapack(object): - - def __call__(self, fn): - return SkipCPUIf(not torch._C.has_lapack, "PyTorch compiled without Lapack")(fn) - - -# Skips a test on CPU if MKL is not available. 
-class SkipCPUIfNoMkl(object): - - def __call__(fn): - return SkipCPUIf(not TEST_MKL, "PyTorch is built without MKL support")(fn) - diff --git a/torch_npu/testing/common_nn.py b/torch_npu/testing/common_nn.py deleted file mode 100644 index 9853c331204f810ee31e3b3e566d03c3d279c8a9..0000000000000000000000000000000000000000 --- a/torch_npu/testing/common_nn.py +++ /dev/null @@ -1,4763 +0,0 @@ -import math -import sys -import tempfile -import unittest - -from copy import deepcopy -from functools import reduce -from itertools import product -from operator import mul -from math import pi - -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch.nn.functional import _Reduction -from torch.autograd.gradcheck import get_numerical_jacobian, iter_tensors -from torch.autograd import Variable -import torch.backends.cudnn - -from torch_npu.testing.common_utils import (TestCase, freeze_rng_state, - is_iterable, TEST_WITH_ROCM, - _assertGradAndGradgradChecks, to_npu) - - -# tarfile module tries to obtain a file object name in python 3.3 -if sys.version_info[:2] == (3, 3): - TemporaryFile = tempfile.NamedTemporaryFile -else: - TemporaryFile = tempfile.TemporaryFile -PRECISION = 1e-5 -TEST_NPU = torch.npu.is_available() - - -def get_reduction(m): - result = getattr(m, 'reduction', None) - if result is None: - result = _Reduction.legacy_get_string(getattr(m, 'sizeAverage', None), True, emit_warning=False) - assert result is not None - return result - - -def get_weight(m): - result = getattr(m, 'weight', None) - if result is not None: - return result - return getattr(m, 'weights', None) - -# NOTE [How to check NN module / functional API parity between Python and C++ frontends] -# -# The way to check API parity is to add parity tests for the NN module / functional of interest. -# Here are the detailed steps: -# -# For NN module: -# 1. Make sure you already have a test dict with the module configuration you want to test. -# 2. Add `cpp_constructor_args` entry to the test dict, with its value exactly matching -# the Python module constructor arguments. For example, if in the test dict we pass -# `(10, 8)` to `torch.nn.Linear` constructor, then we should pass `torch::nn::LinearOptions(10, 8)` -# as the corresponding C++ constructor argument to `torch::nn::Linear`. -# 3. If in the process of performing the above step you referenced any variables -# in the `cpp_constructor_args` entry, you must add `cpp_var_map` entry -# to the test dict to make sure that those variables are populated with the right Python values. -# For example, if the Python constructor call is -# `torch.nn.FractionalMaxPool2d(2, output_ratio=0.5, _random_samples=random_samples)`, -# the corresponding C++ constructor argument is -# `torch::nn::FractionalMaxPool2dOptions(2).output_ratio(0.5)._random_samples(random_samples)`, -# and the `cpp_var_map` entry must be -# `{'random_samples': random_samples}` in order to populate the C++ variable `random_samples` -# used in the C++ constructor argument with the Python tensor value `random_samples`. -# -# For NN functional: -# 1. Make sure you already have a test dict with the functional configuration you want to test. -# 2. If the test dict's `constructor` entry looks like `wrap_functional(F.some_functional_name, ...)`, -# then you must add `cpp_options_args` entry to the test dict, with its value exactly matching the Python -# functional optional arguments. 
For example, if the test dict's `constructor` entry is -# `wrap_functional(F.interpolate, size=12, scale_factor=None, mode='nearest')`, -# then the `cpp_options_args` entry should be -# "F::InterpolateFuncOptions().size(std::vector<int64_t>({12})).scale_factor(c10::nullopt).mode(torch::kNearest)". -# 3. Otherwise, if the test dict's `constructor` entry looks like -# `wrap_functional(lambda i: F.some_functional_name(...))`, -# then you must add `cpp_function_call` entry to the test dict, with its value exactly matching the Python -# functional function call. For example, if the test dict's `constructor` entry is -# `wrap_functional(lambda i: F.poisson_nll_loss(i, t.type_as(i), reduction='none'))`, -# then the `cpp_function_call` entry should be -# "F::poisson_nll_loss(i, t.to(i.options()), F::PoissonNLLLossFuncOptions().reduction(torch::kNone))". -# 4. If in the process of performing the above two steps you referenced any variables -# in the `cpp_options_args` or `cpp_function_call` entry, you must -# add `cpp_var_map` entry to the test dict to make sure that those variables -# are populated with the right Python values. For example, if the test dict's `constructor` entry is -# `wrap_functional(lambda i: F.poisson_nll_loss(i, t.type_as(i), reduction='none'))`, -# then the `cpp_function_call` entry should be -# "F::poisson_nll_loss(i, t.to(i.options()), F::PoissonNLLLossFuncOptions().reduction(torch::kNone))". -# Notice that there are two variables `i` and `t` that need to have their values provided, -# and the way to do so is to add a `cpp_var_map` entry: `cpp_var_map={'i': '_get_input()', 't': t}`. -# (Note that for `i`, since we want it to take the Python input value, we pass '_get_input()' string as value -# and the C++ parity test mechanism will populate `i` with the Python input value correctly.) -# -# There are also a few optional flags in the test dict to control the C++ parity test behavior: -# -# - `test_cpp_api_parity`: if `False`, skips the C++ parity test for this test dict. Default: True. -# - `has_parity`: if `False`, expects this test dict to fail the C++ parity test. Default: True.
- - -module_tests = [ - dict( - module_name='Linear', - constructor_args=(10, 8), - cpp_constructor_args='torch::nn::LinearOptions(10, 8)', - input_size=(4, 10), - reference_fn=lambda i, p, _: torch.mm(i, p[0].t()) + p[1].view(1, -1).expand(4, 8), - ), - dict( - module_name='Linear', - constructor_args=(10, 8, False), - cpp_constructor_args='torch::nn::LinearOptions(10, 8).bias(false)', - input_size=(4, 10), - desc='no_bias', - reference_fn=lambda i, p, _: torch.mm(i, p[0].t()) - ), - dict( - module_name='Threshold', - constructor_args=(2., 1.), - cpp_constructor_args='torch::nn::ThresholdOptions(2., 1.)', - input_size=(2, 3, 4, 5), - check_inplace=True, - desc='threshold_value' - ), - dict( - module_name='Threshold', - constructor_args=(2., 10.), - cpp_constructor_args='torch::nn::ThresholdOptions(2., 10.)', - input_size=(2, 3, 4, 5), - desc='large_value' - ), - dict( - module_name='ReLU', - input_size=(2, 3, 4, 5), - check_inplace=True, - ), - dict( - module_name='ReLU6', - input_size=(2, 3, 4, 5), - check_inplace=True, - ), - dict( - module_name='RReLU', - input_size=(1, 2, 2), - ), - dict( - module_name='RReLU', - constructor_args=(0.1, 0.9), - cpp_constructor_args='torch::nn::RReLUOptions().lower(0.1).upper(0.9)', - input_size=(4, 4, 5), - desc='with_up_down', - ), - dict( - module_name='Hardtanh', - input_size=(3, 2, 5), - reference_fn=lambda i, *_: i.clamp(-1, 1), - ), - dict( - module_name='Sigmoid', - input_size=(2, 3, 4, 5), - ), - dict( - module_name='Tanh', - input_size=(2, 3, 4, 5), - ), - dict( - module_name='Flatten', - input_size=(2, 3, 4, 5), - reference_fn=lambda i, *_: torch.flatten(i, 1) - ), - dict( - module_name='Softmax', - constructor_args=(1,), - cpp_constructor_args='torch::nn::SoftmaxOptions(1)', - input_size=(10, 20), - reference_fn=lambda i, *_: torch.exp(i).div(torch.exp(i).sum(1, True).expand(10, 20)), - ), - dict( - module_name='Softmax2d', - input_size=(1, 3, 10, 20), - reference_fn=lambda i, *_: torch.exp(i).div(torch.exp(i).sum(1, False)), - ), - dict( - module_name='LogSoftmax', - constructor_args=(1,), - cpp_constructor_args='torch::nn::LogSoftmaxOptions(1)', - input_size=(10, 20), - reference_fn=lambda i, *_: torch.exp(i).div_(torch.exp(i).sum(1, True).expand(10, 20)).log_(), - ), - dict( - module_name='LogSoftmax', - constructor_args=(1,), - cpp_constructor_args='torch::nn::LogSoftmaxOptions(1)', - input_size=(1, 3, 10, 20), - reference_fn=lambda i, *_: torch.exp(i).div_(torch.exp(i).sum(1, False)).log_(), - desc='multiparam', - ), - dict( - module_name='ELU', - constructor_args=(2.,), - cpp_constructor_args='torch::nn::ELUOptions().alpha(2.)', - input_size=(3, 2, 5), - reference_fn=lambda x, *_: torch.where(x >= 0, x, 2 * (x.exp() - 1)), - ), - # TODO: reference function - dict( - module_name='Hardshrink', - constructor_args=(2.,), - cpp_constructor_args='torch::nn::HardshrinkOptions(2.)', - input_size=(4, 3, 2, 4), - ), - dict( - module_name='LeakyReLU', - input_size=(3, 2, 5), - check_inplace=True - ), - dict( - module_name='LeakyReLU', - constructor_args=(0.5,), - cpp_constructor_args='torch::nn::LeakyReLUOptions().negative_slope(0.5)', - input_size=(3, 2, 5), - check_inplace=True, - desc='with_negval' - ), - dict( - module_name='LogSigmoid', - input_size=(2, 3, 4), - reference_fn=lambda i, *_: i.sigmoid().log(), - ), - dict( - module_name='Softplus', - input_size=(10, 20), - reference_fn=lambda i, *_: torch.log(1 + torch.exp(i)), - ), - dict( - module_name='Softplus', - constructor_args=(2,), - 
cpp_constructor_args='torch::nn::SoftplusOptions().beta(2)', - input_size=(10, 20), - reference_fn=lambda i, *_: 1. / 2. * torch.log(1 + torch.exp(2 * i)), - desc='beta', - ), - dict( - module_name='Softplus', - constructor_args=(2, -100), - cpp_constructor_args='torch::nn::SoftplusOptions().beta(2).threshold(-100)', - input_size=(10, 20), - reference_fn=( - lambda i, *_: ((i * 2) > -100).type_as(i) * i - + ((i * 2) <= -100).type_as(i) * 1. / 2. * torch.log(1 + torch.exp(2 * i)) - ), - desc='beta_threshold', - ), - dict( - module_name='Softshrink', - input_size=(3, 2, 5), - ), - dict( - module_name='Softshrink', - constructor_args=(1,), - cpp_constructor_args='torch::nn::SoftshrinkOptions(1)', - input_size=(3, 2, 5), - desc='lambda', - ), - dict( - module_name='CrossMapLRN2d', - constructor_args=(5, 5e-3, 1e-3, 2), - cpp_constructor_args='torch::nn::CrossMapLRN2dOptions(5).alpha(5e-3).beta(1e-3).k(2)', - input_size=(2, 3, 6, 6), - check_gradgrad=False, - ), - dict( - module_name='PReLU', - input_size=(2, 3, 4), - reference_fn=lambda i, p, _: torch.clamp(i, min=0) + torch.clamp(i, max=0) * p[0][0], - desc='1d', - ), - dict( - module_name='PReLU', - constructor_args=(3,), - cpp_constructor_args='torch::nn::PReLUOptions().num_parameters(3)', - input_size=(2, 3, 4), - desc='1d_multiparam', - reference_fn=lambda i, p, _: torch.clamp(i, min=0) + torch.clamp(i, max=0) * p[0][0], - ), - dict( - module_name='PReLU', - input_size=(2, 3, 4, 5), - desc='2d', - reference_fn=lambda i, p, _: torch.clamp(i, min=0) + torch.clamp(i, max=0) * p[0][0], - ), - dict( - module_name='PReLU', - constructor_args=(3,), - cpp_constructor_args='torch::nn::PReLUOptions().num_parameters(3)', - input_size=(2, 3, 4, 5), - desc='2d_multiparam', - reference_fn=lambda i, p, _: torch.clamp(i, min=0) + torch.clamp(i, max=0) * p[0][0], - ), - dict( - module_name='PReLU', - input_size=(2, 3, 4, 5, 6), - reference_fn=lambda i, p, _: torch.clamp(i, min=0) + torch.clamp(i, max=0) * p[0][0], - desc='3d', - ), - dict( - module_name='PReLU', - constructor_args=(3,), - cpp_constructor_args='torch::nn::PReLUOptions().num_parameters(3)', - input_size=(2, 3, 4, 5, 6), - desc='3d_multiparam', - reference_fn=lambda i, p, _: torch.clamp(i, min=0) + torch.clamp(i, max=0) * p[0][0], - ), - dict( - module_name='Softsign', - input_size=(3, 2, 5), - reference_fn=lambda i, *_: i.div(1 + torch.abs(i)), - ), - dict( - module_name='Softmin', - constructor_args=(1,), - cpp_constructor_args='torch::nn::SoftminOptions(1)', - input_size=(10, 20), - ), - dict( - module_name='Softmin', - constructor_args=(1,), - cpp_constructor_args='torch::nn::SoftminOptions(1)', - input_size=(2, 3, 5, 10), - desc='multidim', - ), - dict( - module_name='Tanhshrink', - input_size=(2, 3, 4, 5), - ), -] - - -# Generates rand tensor with non-equal values. This ensures that duplicate -# values won't be causing test failure for modules like MaxPooling. -# size should be small, otherwise randperm fails / long overflows. 
-def _rand_tensor_non_equal(*size): - total = reduce(mul, size, 1) - return torch.randperm(total).view(*size).double() - - -def wrap_functional(fn, **kwargs): - class FunctionalModule(nn.Module): - def forward(self, *args): - return fn(*args, **kwargs) - return FunctionalModule - - -def poissonnllloss_no_reduce_test(): - t = torch.randn(10, 10) - return dict( - fullname='PoissonNLLLoss_no_reduce', - constructor=wrap_functional( - lambda i: F.poisson_nll_loss(i, t.type_as(i), reduction='none')), - cpp_function_call='F::poisson_nll_loss(' - 'i, t.to(i.options()), F::PoissonNLLLossFuncOptions().reduction(torch::kNone))', - input_fn=lambda: torch.rand(10, 10), - cpp_var_map={'i': '_get_input()', 't': t}, - reference_fn=lambda i, *_: i.exp() - t.mul(i), - pickle=False) - - -def bceloss_no_reduce_test(): - t = Variable(torch.randn(15, 10).gt(0).double()) - return dict( - fullname='BCELoss_no_reduce', - constructor=wrap_functional( - lambda i: F.binary_cross_entropy(i, t.type_as(i), reduction='none')), - cpp_function_call='F::binary_cross_entropy(' - 'i, t.to(i.options()), F::BinaryCrossEntropyFuncOptions().reduction(torch::kNone))', - input_fn=lambda: torch.rand(15, 10).clamp_(2.8e-2, 1 - 2.8e-2), - cpp_var_map={'i': '_get_input()', 't': t}, - reference_fn=lambda i, *_: -(t * i.log() + (1 - t) * (1 - i).log()), - pickle=False, - precision=7e-4) - - -def bceloss_no_reduce_scalar_test(): - t = torch.randn(()).gt(0).double() - return dict( - fullname='BCELoss_no_reduce_scalar', - constructor=wrap_functional( - lambda i: F.binary_cross_entropy(i, t.type_as(i), reduction='none')), - cpp_function_call='F::binary_cross_entropy(' - 'i, t.to(i.options()), F::BinaryCrossEntropyFuncOptions().reduction(torch::kNone))', - input_fn=lambda: torch.rand(()).clamp_(2.8e-2, 1 - 2.8e-2), - cpp_var_map={'i': '_get_input()', 't': t}, - reference_fn=lambda i, *_: -(t * i.log() + (1 - t) * (1 - i).log()), - pickle=False) - - -def bceloss_weights_no_reduce_test(): - t = Variable(torch.randn(15, 10).gt(0).double()) - weights = torch.rand(10) - return dict( - fullname='BCELoss_weights_no_reduce', - constructor=wrap_functional( - lambda i: F.binary_cross_entropy(i, t.type_as(i), - weight=weights.type_as(i), reduction='none')), - cpp_function_call='F::binary_cross_entropy(' - 'i, t.to(i.options()), ' - 'F::BinaryCrossEntropyFuncOptions().weight(weights.to(i.options())).reduction(torch::kNone))', - input_fn=lambda: torch.rand(15, 10).clamp_(2.8e-2, 1 - 2.8e-2), - cpp_var_map={'i': '_get_input()', 't': t, 'weights': weights}, - reference_fn=lambda i, p, m: -(t * i.log() + (1 - t) * (1 - i).log()) * weights, - pickle=False, - precision=3e-4 - ) - - -def bceloss_weights_no_reduce_scalar_test(): - t = torch.randn(()).double() - weights = torch.rand(()) - return dict( - fullname='BCELoss_weights_no_reduce_scalar', - constructor=wrap_functional( - lambda i: F.binary_cross_entropy(i, t.type_as(i), - weight=weights.type_as(i), reduction='none')), - cpp_function_call='''F::binary_cross_entropy( - i, t.to(i.options()), - F::BinaryCrossEntropyFuncOptions().weight(weights.to(i.options())).reduction(torch::kNone))''', - cpp_var_map={'i': '_get_input()', 't': t, 'weights': weights}, - input_fn=lambda: torch.rand(()).clamp_(2.8e-2, 1 - 2.8e-2), - reference_fn=lambda i, *_: -(t * i.log() + (1 - t) * (1 - i).log()) * weights, - pickle=False - ) - - -def bce_with_logistic_legacy_enum_test(): - t = Variable(torch.randn(15, 10).gt(0).double()) - sigmoid = nn.Sigmoid() - return dict( - fullname='BCEWithLogitsLoss_legacy_enum', - 
constructor=wrap_functional( - lambda i: F.binary_cross_entropy_with_logits(i, t.type_as(i), reduce=False)), - cpp_function_call='''F::binary_cross_entropy_with_logits( - i, t.to(i.options()), F::BinaryCrossEntropyWithLogitsFuncOptions().reduction(torch::kNone))''', - input_fn=lambda: torch.rand(15, 10).clamp_(2.8e-2, 1 - 2.8e-2), - cpp_var_map={'i': '_get_input()', 't': t}, - reference_fn=lambda i, *_: -(t * sigmoid(i).log() + (1 - t) * (1 - sigmoid(i)).log()), - check_gradgrad=False, - pickle=False, - ) - - -def bce_with_logistic_no_reduce_test(): - t = Variable(torch.randn(15, 10).gt(0).double()) - sigmoid = nn.Sigmoid() - return dict( - fullname='BCEWithLogitsLoss_no_reduce', - constructor=wrap_functional( - lambda i: F.binary_cross_entropy_with_logits(i, t.type_as(i), reduction='none')), - cpp_function_call='''F::binary_cross_entropy_with_logits( - i, t.to(i.options()), F::BinaryCrossEntropyWithLogitsFuncOptions().reduction(torch::kNone))''', - input_fn=lambda: torch.rand(15, 10).clamp_(2.8e-2, 1 - 2.8e-2), - cpp_var_map={'i': '_get_input()', 't': t}, - reference_fn=lambda i, *_: -(t * sigmoid(i).log() + (1 - t) * (1 - sigmoid(i)).log()), - check_gradgrad=False, - pickle=False, - ) - - -def bce_with_logistic_no_reduce_scalar_test(): - t = torch.randn(()).gt(0).double() - sigmoid = nn.Sigmoid() - return dict( - fullname='BCEWithLogitsLoss_no_reduce_scalar', - constructor=wrap_functional( - lambda i: F.binary_cross_entropy_with_logits(i, t.type_as(i), reduction='none')), - cpp_function_call='''F::binary_cross_entropy_with_logits( - i, t.to(i.options()), F::BinaryCrossEntropyWithLogitsFuncOptions().reduction(torch::kNone))''', - input_fn=lambda: torch.rand(()).clamp_(2.8e-2, 1 - 2.8e-2), - cpp_var_map={'i': '_get_input()', 't': t}, - reference_fn=lambda i, *_: -(t * sigmoid(i).log() + (1 - t) * (1 - sigmoid(i)).log()), - check_gradgrad=False, - pickle=False - ) - - -def kldivloss_with_target_no_reduce_test(): - i = torch.rand(10, 10).log() - return dict( - fullname='KLDivLoss_with_target_no_reduce', - constructor=wrap_functional( - lambda t: F.kl_div(i.type_as(t), t, reduction='none')), - cpp_function_call='F::kl_div(i.to(t.options()), t, F::KLDivFuncOptions().reduction(torch::kNone))', - input_fn=lambda: torch.rand(10, 10), - cpp_var_map={'i': i, 't': '_get_input()'}, - reference_fn=lambda t, *_: - loss_reference_fns['KLDivLoss'](i.type_as(t), t, reduction='none'), - pickle=False) - - -def kldivloss_no_reduce_test(): - t = torch.randn(10, 10) - return dict( - fullname='KLDivLoss_no_reduce', - constructor=wrap_functional( - lambda i: F.kl_div(i, t.type_as(i), reduction='none')), - cpp_function_call='F::kl_div(i, t.to(i.options()), F::KLDivFuncOptions().reduction(torch::kNone))', - input_fn=lambda: torch.rand(10, 10).log(), - cpp_var_map={'i': '_get_input()', 't': t}, - reference_fn=lambda i, *_: - loss_reference_fns['KLDivLoss'](i, t.type_as(i), reduction='none'), - pickle=False, - ) - - -def kldivloss_no_reduce_scalar_test(): - t = torch.randn(()) - return dict( - fullname='KLDivLoss_no_reduce_scalar', - constructor=wrap_functional( - lambda i: F.kl_div(i, t.type_as(i), reduction='none')), - cpp_function_call='F::kl_div(i, t.to(i.options()), F::KLDivFuncOptions().reduction(torch::kNone))', - input_fn=lambda: torch.rand(()).log(), - cpp_var_map={'i': '_get_input()', 't': t}, - reference_fn=lambda i, *_: - loss_reference_fns['KLDivLoss'](i, t.type_as(i), reduction='none'), - pickle=False) - - -def l1loss_no_reduce_test(): - t = torch.randn(2, 3, 4) - return dict( - 
fullname='L1Loss_no_reduce', - constructor=wrap_functional( - lambda i: F.l1_loss(i, t.type_as(i), reduction='none')), - cpp_function_call='F::l1_loss(i, t.to(i.options()), F::L1LossFuncOptions().reduction(torch::kNone))', - input_fn=lambda: torch.randn(2, 3, 4), - cpp_var_map={'i': '_get_input()', 't': t}, - reference_fn=lambda i, *_: (i - t.type_as(i)).abs(), - pickle=False) - - -def l1loss_no_reduce_scalar_test(): - t = torch.randn(()) - return dict( - fullname='L1Loss_no_reduce_scalar', - constructor=wrap_functional( - lambda i: F.l1_loss(i, t.type_as(i), reduction='none')), - cpp_function_call='F::l1_loss(i, t.to(i.options()), F::L1LossFuncOptions().reduction(torch::kNone))', - input_fn=lambda: torch.randn(()), - cpp_var_map={'i': '_get_input()', 't': t}, - reference_fn=lambda i, *_: (i - t.type_as(i)).abs(), - pickle=False) - - -def mseloss_no_reduce_test(): - input_size = (2, 3, 4, 5) - target = torch.randn(*input_size) - return dict( - fullname='MSELoss_no_reduce', - constructor=wrap_functional( - lambda i: F.mse_loss(i, target.type_as(i), reduction='none')), - cpp_function_call='F::mse_loss(i, target.to(i.options()), F::MSELossFuncOptions().reduction(torch::kNone))', - input_size=input_size, - cpp_var_map={'i': '_get_input()', 'target': target}, - reference_fn=lambda i, *_: (i - target).pow(2), - pickle=False) - - -def mseloss_no_reduce_scalar_test(): - input_size = () - target = torch.randn(input_size) - return dict( - fullname='MSELoss_no_reduce_scalar', - constructor=wrap_functional( - lambda i: F.mse_loss(i, target.type_as(i), reduction='none')), - cpp_function_call='F::mse_loss(i, target.to(i.options()), F::MSELossFuncOptions().reduction(torch::kNone))', - input_size=input_size, - cpp_var_map={'i': '_get_input()', 'target': target}, - reference_fn=lambda i, *_: (i - target).pow(2), - pickle=False) - - -def nllloss_no_reduce_test(): - t = Variable(torch.Tensor(15).uniform_().mul(10).floor().long()) - kwargs = {'reduction': 'none'} - return dict( - fullname='NLLLoss_no_reduce', - constructor=wrap_functional( - lambda i: F.nll_loss(i, t.type_as(i).long(), **kwargs)), - cpp_function_call='''F::nll_loss( - i, t.to(i.options()).to(torch::kLong), F::NLLLossFuncOptions().reduction(torch::kNone))''', - input_fn=lambda: torch.rand(15, 10).log(), - cpp_var_map={'i': '_get_input()', 't': t}, - reference_fn=lambda i, *_: - loss_reference_fns['NLLLoss'](i, t.type_as(i).long(), **kwargs), - pickle=False) - - -def nllloss_no_reduce_ignore_index_test(): - t = Variable(torch.Tensor(15).uniform_().mul(10).floor().long()) - kwargs = {'ignore_index': 2, 'reduction': 'none'} - return dict( - fullname='NLLLoss_no_reduce_ignore_index', - constructor=wrap_functional( - lambda i: F.nll_loss(i, t.type_as(i).long(), **kwargs)), - cpp_function_call='''F::nll_loss( - i, t.to(i.options()).to(torch::kLong), F::NLLLossFuncOptions().ignore_index(2).reduction(torch::kNone))''', - input_fn=lambda: torch.rand(15, 10).log(), - cpp_var_map={'i': '_get_input()', 't': t}, - reference_fn=lambda i, *_: - loss_reference_fns['NLLLoss'](i, t.type_as(i).long(), **kwargs), - pickle=False) - - -def nllloss_no_reduce_weights_test(): - t = Variable(torch.Tensor(15).uniform_().mul(10).floor().long()) - weight = torch.rand(10) - - def kwargs(i): - return {'weight': weight.type_as(i), 'reduction': 'none'} - - return dict( - fullname='NLLLoss_no_reduce_weights', - constructor=wrap_functional( - lambda i: F.nll_loss(i, t.type_as(i).long(), **kwargs(i))), - cpp_function_call='''F::nll_loss( - i, t.to(i.options()).to(torch::kLong), 
- F::NLLLossFuncOptions().weight(weight.to(i.options())).reduction(torch::kNone))''', - input_fn=lambda: torch.rand(15, 10).add(1e-2).log(), - cpp_var_map={'i': '_get_input()', 't': t, 'weight': weight}, - reference_fn=lambda i, *_: - loss_reference_fns['NLLLoss'](i, t.type_as(i).long(), **kwargs(i)), - pickle=False) - - -def nllloss_no_reduce_weights_ignore_index_test(): - t = Variable(torch.Tensor(15).uniform_().mul(10).floor().long()) - weight = torch.rand(10) - - def kwargs(i): - return {'weight': weight.type_as(i), 'reduction': 'none', - 'ignore_index': 2} - - return dict( - fullname='NLLLoss_no_reduce_weights_ignore_index', - constructor=wrap_functional( - lambda i: F.nll_loss(i, t.type_as(i).long(), **kwargs(i.data))), - cpp_function_call='''F::nll_loss( - i, t.to(i.options()).to(torch::kLong), - F::NLLLossFuncOptions().weight(weight.to(i.options())).reduction(torch::kNone).ignore_index(2))''', - input_fn=lambda: torch.rand(15, 10).add(1e-2).log(), - cpp_var_map={'i': '_get_input()', 't': t, 'weight': weight}, - reference_fn=lambda i, *_: - loss_reference_fns['NLLLoss'](i, t.type_as(i).long(), **kwargs(i)), - pickle=False) - - -def nllloss_no_reduce_weights_ignore_index_neg_test(): - t = Variable(torch.Tensor(15).uniform_().mul(10).floor().long()) - weight = torch.rand(10) - - def kwargs(i): - return {'weight': weight.type_as(i), 'reduction': 'none', - 'ignore_index': -1} - - return dict( - fullname='NLLLoss_no_reduce_weights_ignore_index_neg', - constructor=wrap_functional( - lambda i: F.nll_loss(i, t.type_as(i).long(), **kwargs(i))), - cpp_function_call='''F::nll_loss( - i, t.to(i.options()).to(torch::kLong), - F::NLLLossFuncOptions().weight(weight.to(i.options())).reduction(torch::kNone).ignore_index(-1))''', - input=torch.rand(15, 10).add(1e-2).log(), - cpp_var_map={'i': '_get_input()', 't': t, 'weight': weight}, - reference_fn=lambda i, *_: - loss_reference_fns['NLLLoss'](i, t.type_as(i).long(), **kwargs(i)), - pickle=False) - - -def nllloss2d_no_reduce_test(): - t = Variable(torch.rand(2, 5, 5).mul(3).floor().long()) - kwargs = {'reduction': 'none'} - return dict( - fullname='NLLLoss2d_no_reduce', - constructor=wrap_functional( - lambda i: F.nll_loss(i, t.type_as(i).long(), **kwargs)), - cpp_function_call='''F::nll_loss( - i, t.to(i.options()).to(torch::kLong), F::NLLLossFuncOptions().reduction(torch::kNone))''', - input_fn=lambda: torch.rand(2, 3, 5, 5).log(), - cpp_var_map={'i': '_get_input()', 't': t}, - reference_fn=lambda i, *_: - loss_reference_fns['NLLLossNd'](i, t.type_as(i).long(), **kwargs), - pickle=False) - - -def nllloss2d_no_reduce_ignore_index_test(): - t = Variable(torch.rand(2, 5, 5).mul(3).floor().long()) - kwargs = {'ignore_index': 1, 'reduction': 'none'} - return dict( - fullname='NLLLoss2d_no_reduce_ignore_index', - constructor=wrap_functional( - lambda i: F.nll_loss(i, t.type_as(i).long(), **kwargs)), - cpp_function_call='''F::nll_loss( - i, t.to(i.options()).to(torch::kLong), F::NLLLossFuncOptions().ignore_index(1).reduction(torch::kNone))''', - input_fn=lambda: torch.rand(2, 3, 5, 5).log(), - cpp_var_map={'i': '_get_input()', 't': t}, - reference_fn=lambda i, *_: - loss_reference_fns['NLLLossNd'](i, t.type_as(i).long(), **kwargs), - pickle=False) - - -def nllloss2d_no_reduce_weights_test(): - t = Variable(torch.rand(2, 5, 5).mul(3).floor().long()) - weight = torch.rand(3) - - def kwargs(i): - return {'weight': weight.type_as(i), 'reduction': 'none'} - - return dict( - fullname='NLLLoss2d_no_reduce_weights', - constructor=wrap_functional( - lambda i: 
F.nll_loss(i, t.type_as(i).long(), **kwargs(i))), - cpp_function_call='''F::nll_loss( - i, t.to(i.options()).to(torch::kLong), - F::NLLLossFuncOptions().weight(weight.to(i.options())).reduction(torch::kNone))''', - input_fn=lambda: torch.rand(2, 3, 5, 5).log(), - cpp_var_map={'i': '_get_input()', 't': t, 'weight': weight}, - reference_fn=lambda i, *_: - loss_reference_fns['NLLLossNd'](i, t.type_as(i).long(), **kwargs(i)), - pickle=False) - - -def nlllossNd_no_reduce_test(): - t = Variable(torch.rand(2, 5, 5, 2, 2).mul(3).floor().long()) - kwargs = {'reduction': 'none'} - return dict( - fullname='NLLLossNd_no_reduce', - constructor=wrap_functional( - lambda i: F.nll_loss(i, t.type_as(i).long(), **kwargs)), - cpp_function_call='''F::nll_loss( - i, t.to(i.options()).to(torch::kLong), F::NLLLossFuncOptions().reduction(torch::kNone))''', - input_fn=lambda: torch.rand(2, 3, 5, 5, 2, 2).log(), - cpp_var_map={'i': '_get_input()', 't': t}, - reference_fn=lambda i, *_: - loss_reference_fns['NLLLossNd'](i, t.type_as(i).long(), **kwargs), - pickle=False) - - -def nlllossNd_no_reduce_ignore_index_test(): - t = Variable(torch.rand(2, 5, 5, 2, 2).mul(3).floor().long()) - kwargs = {'ignore_index': 1, 'reduction': 'none'} - return dict( - fullname='NLLLossNd_no_reduce_ignore_index', - constructor=wrap_functional( - lambda i: F.nll_loss(i, t.type_as(i).long(), **kwargs)), - cpp_function_call='''F::nll_loss( - i, t.to(i.options()).to(torch::kLong), F::NLLLossFuncOptions().ignore_index(1).reduction(torch::kNone))''', - input_fn=lambda: torch.rand(2, 3, 5, 5, 2, 2).log(), - cpp_var_map={'i': '_get_input()', 't': t}, - reference_fn=lambda i, *_: - loss_reference_fns['NLLLossNd'](i, t.type_as(i).long(), **kwargs), - pickle=False) - - -def nlllossNd_no_reduce_weights_test(): - t = Variable(torch.rand(2, 5, 5, 2, 2).mul(3).floor().long()) - weight = torch.rand(3) - - def kwargs(i): - return {'weight': weight.type_as(i), 'reduction': 'none'} - - return dict( - fullname='NLLLossNd_no_reduce_weights', - constructor=wrap_functional( - lambda i: F.nll_loss(i, t.type_as(i).long(), **kwargs(i))), - cpp_function_call='''F::nll_loss( - i, t.to(i.options()).to(torch::kLong), - F::NLLLossFuncOptions().weight(weight.to(i.options())).reduction(torch::kNone))''', - input_fn=lambda: torch.rand(2, 3, 5, 5, 2, 2).log(), - cpp_var_map={'i': '_get_input()', 't': t, 'weight': weight}, - reference_fn=lambda i, *_: - loss_reference_fns['NLLLossNd'](i, t.type_as(i).long(), **kwargs(i)), - pickle=False) - - -def smoothl1loss_no_reduce_test(): - t = torch.randn(2, 3, 4) - return dict( - fullname='SmoothL1Loss_no_reduce', - constructor=wrap_functional( - lambda i: F.smooth_l1_loss(i, t.type_as(i), reduction='none')), - cpp_function_call='''F::smooth_l1_loss( - i, t.to(i.options()), F::SmoothL1LossFuncOptions().reduction(torch::kNone))''', - input_fn=lambda: torch.randn(2, 3, 4), - cpp_var_map={'i': '_get_input()', 't': t}, - reference_fn=lambda i, *_: - loss_reference_fns['SmoothL1Loss'](i, t.type_as(i), reduction='none'), - pickle=False) - - -def smoothl1loss_no_reduce_scalar_test(): - t = torch.randn(()) - return dict( - fullname='SmoothL1Loss_no_reduce_scalar', - constructor=wrap_functional( - lambda i: F.smooth_l1_loss(i, t.type_as(i), reduction='none')), - cpp_function_call='''F::smooth_l1_loss( - i, t.to(i.options()), F::SmoothL1LossFuncOptions().reduction(torch::kNone))''', - input_fn=lambda: torch.randn(()), - cpp_var_map={'i': '_get_input()', 't': t}, - reference_fn=lambda i, *_: - loss_reference_fns['SmoothL1Loss'](i, 
t.type_as(i), reduction='none'), - pickle=False) - - -def multilabelmarginloss_0d_no_reduce_test(): - t = torch.zeros(()).long() - return dict( - fullname='MultiLabelMarginLoss_0d_no_reduce', - constructor=wrap_functional( - lambda i: F.multilabel_margin_loss(i, t.type_as(i).long(), reduction='none')), - cpp_function_call='''F::multilabel_margin_loss( - i, t.to(i.options()).to(torch::kLong), F::MultilabelMarginLossFuncOptions().reduction(torch::kNone))''', - input_fn=lambda: torch.randn(()), - cpp_var_map={'i': '_get_input()', 't': t}, - reference_fn=lambda i, *_: - loss_reference_fns['MultiLabelMarginLoss'](i, t.data.type_as(i).long(), reduction='none'), - check_sum_reduction=True, - check_gradgrad=False, - pickle=False) - - -def multilabelmarginloss_1d_no_reduce_test(): - t = Variable(torch.rand(10).mul(10).floor().long()) - return dict( - fullname='MultiLabelMarginLoss_1d_no_reduce', - constructor=wrap_functional( - lambda i: F.multilabel_margin_loss(i, t.type_as(i).long(), reduction='none')), - cpp_function_call='''F::multilabel_margin_loss( - i, t.to(i.options()).to(torch::kLong), F::MultilabelMarginLossFuncOptions().reduction(torch::kNone))''', - input_fn=lambda: torch.randn(10), - cpp_var_map={'i': '_get_input()', 't': t}, - reference_fn=lambda i, *_: - loss_reference_fns['MultiLabelMarginLoss'](i, t.data.type_as(i).long(), reduction='none'), - check_sum_reduction=True, - check_gradgrad=False, - pickle=False) - - -def multilabelmarginloss_index_neg_test(): - t = Variable(torch.clamp(torch.rand(5, 10).add(-.5).mul(20).floor().long(), min=-1)) - return dict( - fullname='MultiLabelMarginLoss_index_neg', - constructor=wrap_functional( - lambda i: F.multilabel_margin_loss(i, t.type_as(i).long(), reduction='none')), - cpp_function_call='''F::multilabel_margin_loss( - i, t.to(i.options()).to(torch::kLong), F::MultilabelMarginLossFuncOptions().reduction(torch::kNone))''', - input_fn=lambda: torch.randn(5, 10), - cpp_var_map={'i': '_get_input()', 't': t}, - reference_fn=lambda i, *_: - loss_reference_fns['MultiLabelMarginLoss'](i, t.data.type_as(i).long(), reduction='none'), - check_sum_reduction=True, - check_gradgrad=False, - pickle=False) - - -def multilabelmarginloss_no_reduce_test(): - t = Variable(torch.rand(5, 10).mul(10).floor().long()) - return dict( - fullname='MultiLabelMarginLoss_no_reduce', - constructor=wrap_functional( - lambda i: F.multilabel_margin_loss(i, t.type_as(i).long(), reduction='none')), - cpp_function_call='''F::multilabel_margin_loss( - i, t.to(i.options()).to(torch::kLong), F::MultilabelMarginLossFuncOptions().reduction(torch::kNone))''', - input_fn=lambda: torch.randn(5, 10), - cpp_var_map={'i': '_get_input()', 't': t}, - reference_fn=lambda i, *_: - loss_reference_fns['MultiLabelMarginLoss'](i, t.data.type_as(i).long(), reduction='none'), - check_sum_reduction=True, - check_gradgrad=False, - pickle=False) - - -def hingeembeddingloss_no_reduce_test(): - t = Variable(torch.randn(10).gt(0).double().mul_(2).sub(1)) - return dict( - fullname='HingeEmbeddingLoss_no_reduce', - constructor=wrap_functional( - lambda i: F.hinge_embedding_loss(i, t.type_as(i), reduction='none')), - cpp_function_call='''F::hinge_embedding_loss( - i, t.to(i.options()), F::HingeEmbeddingLossFuncOptions().reduction(torch::kNone))''', - input_fn=lambda: torch.randn(10), - cpp_var_map={'i': '_get_input()', 't': t}, - reference_fn=lambda i, *_: - loss_reference_fns['HingeEmbeddingLoss'](i, t.type_as(i), reduction='none'), - check_sum_reduction=True, - pickle=False) - - -def 
hingeembeddingloss_margin_no_reduce_test(): - t = Variable(torch.randn(10).gt(0).double().mul_(2).sub(1)) - return dict( - fullname='HingeEmbeddingLoss_margin_no_reduce', - constructor=wrap_functional( - lambda i: F.hinge_embedding_loss(i, t.type_as(i), margin=0.5, reduction='none')), - cpp_function_call='''F::hinge_embedding_loss( - i, t.to(i.options()), F::HingeEmbeddingLossFuncOptions().margin(0.5).reduction(torch::kNone))''', - input_fn=lambda: torch.randn(10), - cpp_var_map={'i': '_get_input()', 't': t}, - reference_fn=lambda i, *_: - loss_reference_fns['HingeEmbeddingLoss'](i, t.type_as(i), margin=0.5, reduction='none'), - check_sum_reduction=True, - pickle=False) - - -def softmarginloss_no_reduce_test(): - t = torch.randn(5, 5) - return dict( - fullname='SoftMarginLoss_no_reduce', - constructor=wrap_functional( - lambda i: F.soft_margin_loss(i, t.type_as(i), reduction='none')), - cpp_function_call='''F::soft_margin_loss( - i, t.to(i.options()), F::SoftMarginLossFuncOptions().reduction(torch::kNone))''', - input_fn=lambda: torch.randn(5, 5), - cpp_var_map={'i': '_get_input()', 't': t}, - reference_fn=lambda i, *_: - loss_reference_fns['SoftMarginLoss'](i, t.type_as(i), reduction='none'), - pickle=False) - - -def multilabelsoftmarginloss_no_reduce_test(): - t = torch.rand(5, 10).mul(2).floor() - return dict( - fullname='MultiLabelSoftMarginLoss_no_reduce', - constructor=wrap_functional( - lambda i: F.multilabel_soft_margin_loss(i, t.type_as(i), reduction='none')), - cpp_function_call='''F::multilabel_soft_margin_loss( - i, t.to(i.options()), F::MultilabelSoftMarginLossFuncOptions().reduction(torch::kNone))''', - input_fn=lambda: torch.randn(5, 10), - cpp_var_map={'i': '_get_input()', 't': t}, - reference_fn=lambda i, *_: - (-(t * i.sigmoid().log() + (1 - t) * (-i).sigmoid().log())).sum(dim=1) / i.size(1), - check_gradgrad=False, - pickle=False) - - -def multilabelsoftmarginloss_weights_no_reduce_test(): - t = torch.rand(5, 10).mul(2).floor() - weights = torch.rand(10) - return dict( - fullname='MultiLabelSoftMarginLoss_weights_no_reduce', - constructor=wrap_functional( - lambda i: F.multilabel_soft_margin_loss(i, t.type_as(i), - weight=weights.type_as(i), reduction='none')), - cpp_function_call='''F::multilabel_soft_margin_loss( - i, t.to(i.options()), - F::MultilabelSoftMarginLossFuncOptions().weight(weights.to(i.options())).reduction(torch::kNone))''', - input_fn=lambda: torch.randn(5, 10), - cpp_var_map={'i': '_get_input()', 't': t, 'weights': weights}, - reference_fn=lambda i, *_: - (-(t * i.sigmoid().log() + (1 - t) * (-i).sigmoid().log()) * weights).sum(dim=1) / i.size(1), - check_sum_reduction=True, - check_gradgrad=False, - pickle=False) - - -def multimarginloss_no_reduce_test(): - t = torch.rand(5).mul(8).floor().long() - return dict( - fullname='MultiMarginLoss_no_reduce', - constructor=wrap_functional( - lambda i: F.multi_margin_loss(i, t.type_as(i).long(), reduction='none')), - cpp_function_call='''F::multi_margin_loss( - i, t.to(i.options()).to(torch::kLong), F::MultiMarginLossFuncOptions().reduction(torch::kNone))''', - input_fn=lambda: torch.randn(5, 10), - cpp_var_map={'i': '_get_input()', 't': t}, - reference_fn=lambda i, *_: - loss_reference_fns['MultiMarginLoss'](i, t.data.type_as(i).long(), reduction='none'), - check_sum_reduction=True, - check_gradgrad=False, - pickle=False) - - -def multimarginloss_1d_no_reduce_test(): - t = torch.rand(1).mul(8).floor().long() - return dict( - fullname='MultiMarginLoss_1d_no_reduce', - constructor=wrap_functional( - lambda i: 
F.multi_margin_loss(i, t.type_as(i).long(), reduction='none')), - cpp_function_call='''F::multi_margin_loss( - i, t.to(i.options()).to(torch::kLong), F::MultiMarginLossFuncOptions().reduction(torch::kNone))''', - input_fn=lambda: torch.randn(10), - cpp_var_map={'i': '_get_input()', 't': t}, - reference_fn=lambda i, *_: - loss_reference_fns['MultiMarginLoss'](i, t.data.type_as(i).long(), reduction='none'), - check_sum_reduction=True, - check_gradgrad=False, - pickle=False) - - -def multimarginloss_1d_input_0d_target_no_reduce_test(): - t = torch.rand(()).mul(8).floor().long() - return dict( - fullname='multimarginloss_1d_input_0d_target_no_reduce', - constructor=wrap_functional( - lambda i: F.multi_margin_loss(i, t.type_as(i).long(), reduction='none')), - cpp_function_call='''F::multi_margin_loss( - i, t.to(i.options()).to(torch::kLong), F::MultiMarginLossFuncOptions().reduction(torch::kNone))''', - input_fn=lambda: torch.randn(10), - cpp_var_map={'i': '_get_input()', 't': t}, - reference_fn=lambda i, *_: - loss_reference_fns['MultiMarginLoss'](i, t.data.type_as(i).long(), reduction='none'), - check_sum_reduction=True, - check_gradgrad=False, - pickle=False) - - -def multimarginloss_p_no_reduce_test(): - t = torch.rand(5).mul(8).floor().long() - return dict( - fullname='MultiMarginLoss_p_no_reduce', - constructor=wrap_functional( - lambda i: F.multi_margin_loss(i, t.type_as(i).long(), p=2, reduction='none')), - cpp_function_call='''F::multi_margin_loss( - i, t.to(i.options()).to(torch::kLong), F::MultiMarginLossFuncOptions().p(2).reduction(torch::kNone))''', - input_fn=lambda: torch.randn(5, 10).clamp_(1e-2, 1 - 1e-2), - cpp_var_map={'i': '_get_input()', 't': t}, - reference_fn=lambda i, *_: - loss_reference_fns['MultiMarginLoss'](i, t.data.type_as(i).long(), p=2, reduction='none'), - check_sum_reduction=True, - check_gradgrad=False, - pickle=False) - - -def multimarginloss_margin_no_reduce_test(): - t = torch.rand(5).mul(8).floor().long() - return dict( - fullname='MultiMarginLoss_margin_no_reduce', - constructor=wrap_functional( - lambda i: F.multi_margin_loss(i, t.type_as(i).long(), margin=0.5, reduction='none')), - cpp_function_call='''F::multi_margin_loss( - i, t.to(i.options()).to(torch::kLong), - F::MultiMarginLossFuncOptions().margin(0.5).reduction(torch::kNone))''', - input_fn=lambda: torch.randn(5, 10), - cpp_var_map={'i': '_get_input()', 't': t}, - reference_fn=lambda i, *_: - loss_reference_fns['MultiMarginLoss'](i, t.data.type_as(i).long(), - margin=0.5, reduction='none'), - check_sum_reduction=True, - check_gradgrad=False, - pickle=False) - - -def multimarginloss_weights_no_reduce_test(): - t = torch.rand(5).mul(8).floor().long() - weights = torch.rand(10) - return dict( - fullname='MultiMarginLoss_weights_no_reduce', - constructor=wrap_functional( - lambda i: F.multi_margin_loss(i, t.type_as(i).long(), weight=weights.type_as(i), - reduction='none')), - cpp_function_call='''F::multi_margin_loss( - i, t.to(i.options()).to(torch::kLong), - F::MultiMarginLossFuncOptions().weight(weights.to(i.options())).reduction(torch::kNone))''', - input_fn=lambda: torch.randn(5, 10), - cpp_var_map={'i': '_get_input()', 't': t, 'weights': weights}, - reference_fn=lambda i, *_: - loss_reference_fns['MultiMarginLoss'](i, t.data.type_as(i).long(), - weight=weights, reduction='none'), - check_sum_reduction=True, - check_gradgrad=False, - pickle=False) - - -def fractional_max_pool2d_test(test_case): - random_samples = torch.DoubleTensor(1, 3, 2).uniform_() - if test_case == 'ratio': - return dict( - 
constructor=lambda: nn.FractionalMaxPool2d( - 2, output_ratio=0.5, _random_samples=random_samples), - cpp_constructor_args='''torch::nn::FractionalMaxPool2dOptions(2) - .output_ratio(0.5) - ._random_samples(random_samples)''', - input_size=(1, 3, 5, 7), - cpp_var_map={'random_samples': random_samples}, - fullname='FractionalMaxPool2d_ratio') - elif test_case == 'size': - return dict( - constructor=lambda: nn.FractionalMaxPool2d((2, 3), output_size=( - 4, 3), _random_samples=random_samples), - cpp_constructor_args='''torch::nn::FractionalMaxPool2dOptions({2, 3}) - .output_size(std::vector<int64_t>({4, 3})) - ._random_samples(random_samples)''', - input_size=(1, 3, 7, 6), - cpp_var_map={'random_samples': random_samples}, - fullname='FractionalMaxPool2d_size') - - -def fractional_max_pool3d_test(test_case): - random_samples = torch.DoubleTensor(2, 4, 3).uniform_() - if test_case == 'ratio': - return dict( - constructor=lambda: nn.FractionalMaxPool3d( - 2, output_ratio=0.5, _random_samples=random_samples), - cpp_constructor_args='''torch::nn::FractionalMaxPool3dOptions(2) - .output_ratio(0.5) - ._random_samples(random_samples)''', - input_size=(2, 4, 5, 5, 5), - cpp_var_map={'random_samples': random_samples}, - fullname='FractionalMaxPool3d_ratio') - elif test_case == 'size': - return dict( - constructor=lambda: nn.FractionalMaxPool3d((2, 2, 2), output_size=( - 4, 4, 4), _random_samples=random_samples), - cpp_constructor_args='''torch::nn::FractionalMaxPool3dOptions({2, 2, 2}) - .output_size(std::vector<int64_t>({4, 4, 4})) - ._random_samples(random_samples)''', - input_size=(2, 4, 7, 7, 7), - cpp_var_map={'random_samples': random_samples}, - fullname='FractionalMaxPool3d_size') - elif test_case == 'asymsize': - return dict( - constructor=lambda: nn.FractionalMaxPool3d((4, 2, 3), output_size=( - 10, 3, 2), _random_samples=random_samples), - cpp_constructor_args='''torch::nn::FractionalMaxPool3dOptions({4, 2, 3}) - .output_size(std::vector<int64_t>({10, 3, 2})) - ._random_samples(random_samples)''', - input_size=(2, 4, 16, 7, 5), - cpp_var_map={'random_samples': random_samples}, - fullname='FractionalMaxPool3d_asymsize') - - -new_module_tests = [ - poissonnllloss_no_reduce_test(), - bceloss_no_reduce_test(), - bceloss_weights_no_reduce_test(), - bce_with_logistic_legacy_enum_test(), - bce_with_logistic_no_reduce_test(), - bceloss_no_reduce_scalar_test(), - bceloss_weights_no_reduce_scalar_test(), - bce_with_logistic_no_reduce_scalar_test(), - kldivloss_with_target_no_reduce_test(), - kldivloss_no_reduce_test(), - kldivloss_no_reduce_scalar_test(), - l1loss_no_reduce_test(), - l1loss_no_reduce_scalar_test(), - mseloss_no_reduce_test(), - mseloss_no_reduce_scalar_test(), - nllloss_no_reduce_test(), - nllloss_no_reduce_ignore_index_test(), - nllloss_no_reduce_weights_test(), - nllloss_no_reduce_weights_ignore_index_test(), - nllloss_no_reduce_weights_ignore_index_neg_test(), - nllloss2d_no_reduce_test(), - nllloss2d_no_reduce_weights_test(), - nllloss2d_no_reduce_ignore_index_test(), - nlllossNd_no_reduce_test(), - nlllossNd_no_reduce_weights_test(), - nlllossNd_no_reduce_ignore_index_test(), - smoothl1loss_no_reduce_test(), - smoothl1loss_no_reduce_scalar_test(), - multilabelmarginloss_0d_no_reduce_test(), - multilabelmarginloss_1d_no_reduce_test(), - multilabelmarginloss_index_neg_test(), - multilabelmarginloss_no_reduce_test(), - hingeembeddingloss_no_reduce_test(), - hingeembeddingloss_margin_no_reduce_test(), - softmarginloss_no_reduce_test(), - multilabelsoftmarginloss_no_reduce_test(), -
multilabelsoftmarginloss_weights_no_reduce_test(), - multimarginloss_no_reduce_test(), - multimarginloss_1d_no_reduce_test(), - multimarginloss_1d_input_0d_target_no_reduce_test(), - multimarginloss_p_no_reduce_test(), - multimarginloss_margin_no_reduce_test(), - multimarginloss_weights_no_reduce_test(), - fractional_max_pool2d_test('ratio'), - fractional_max_pool2d_test('size'), - fractional_max_pool3d_test('ratio'), - fractional_max_pool3d_test('size'), - fractional_max_pool3d_test('asymsize'), - dict( - module_name='BatchNorm1d', - constructor_args=(10,), - cpp_constructor_args='torch::nn::BatchNorm1dOptions(10)', - input_size=(4, 10), - cudnn=True, - check_eval=True, - desc='affine', - ), - dict( - module_name='BatchNorm1d', - constructor_args=(5,), - cpp_constructor_args='torch::nn::BatchNorm1dOptions(5)', - input_size=(4, 5, 3), - cudnn=True, - check_eval=True, - desc='3d_input', - ), - dict( - module_name='BatchNorm1d', - constructor_args=(10, 1e-3, None), - cpp_constructor_args='torch::nn::BatchNorm1dOptions(10).eps(1e-3).momentum(c10::nullopt)', - input_size=(4, 10), - cudnn=True, - check_eval=True, - desc='affine_simple_average', - ), - dict( - module_name='BatchNorm1d', - constructor_args=(10, 1e-3, 0.3, False), - cpp_constructor_args='torch::nn::BatchNorm1dOptions(10).eps(1e-3).momentum(0.3).affine(false)', - input_size=(4, 10), - cudnn=True, - check_eval=True, - desc='not_affine', - ), - dict( - module_name='BatchNorm1d', - constructor_args=(10, 1e-3, 0.3, True, False), - cpp_constructor_args='''torch::nn::BatchNorm1dOptions(10) - .eps(1e-3).momentum(0.3).affine(true).track_running_stats(false)''', - input_size=(4, 10), - cudnn=True, - check_eval=True, - desc='not_tracking_stats', - ), - dict( - module_name='BatchNorm1d', - constructor_args=(5, 1e-3, 0.3, False), - cpp_constructor_args='torch::nn::BatchNorm1dOptions(5).eps(1e-3).momentum(0.3).affine(false)', - input_size=(4, 5, 3), - cudnn=True, - check_eval=True, - desc='3d_input_not_affine', - ), - dict( - module_name='BatchNorm1d', - constructor_args=(5, 1e-3, 0.3, False), - cpp_constructor_args='torch::nn::BatchNorm1dOptions(5).eps(1e-3).momentum(0.3).affine(false)', - input_size=(0, 5, 9), - cudnn=True, - check_eval=True, - desc='zero_batch', - ), - dict( - module_name='BatchNorm2d', - constructor_args=(3,), - cpp_constructor_args='torch::nn::BatchNorm2dOptions(3)', - input_size=(2, 3, 6, 6), - cudnn=True, - check_eval=True, - ), - dict( - module_name='BatchNorm2d', - constructor_args=(3, 1e-3, None), - cpp_constructor_args='torch::nn::BatchNorm2dOptions(3).eps(1e-3).momentum(c10::nullopt)', - input_size=(2, 3, 6, 6), - cudnn=True, - check_eval=True, - desc='2d_simple_average', - ), - dict( - module_name='BatchNorm2d', - constructor_args=(3, 1e-3, 0.8), - cpp_constructor_args='torch::nn::BatchNorm2dOptions(3).eps(1e-3).momentum(0.8)', - input_size=(2, 3, 6, 6), - cudnn=True, - check_eval=True, - desc='momentum', - ), - dict( - module_name='BatchNorm2d', - constructor_args=(3, 1e-3, 0.8, False), - cpp_constructor_args='torch::nn::BatchNorm2dOptions(3).eps(1e-3).momentum(0.8).affine(false)', - input_size=(2, 3, 6, 6), - cudnn=True, - check_eval=True, - desc='not_affine', - ), - dict( - module_name='BatchNorm2d', - constructor_args=(3, 1e-3, 0.8, True, False), - cpp_constructor_args='''torch::nn::BatchNorm2dOptions(3) - .eps(1e-3).momentum(0.8).affine(true).track_running_stats(false)''', - input_size=(2, 3, 6, 6), - cudnn=True, - check_eval=True, - desc='not_tracking_stats', - ), - dict( - module_name='BatchNorm2d', - 
constructor_args=(5, 1e-3, 0.3, False), - cpp_constructor_args='torch::nn::BatchNorm2dOptions(5).eps(1e-3).momentum(0.3).affine(false)', - input_size=(0, 5, 2, 2), - cudnn=True, - check_eval=True, - desc='zero_batch', - ), - dict( - module_name='BatchNorm3d', - constructor_args=(3,), - cpp_constructor_args='torch::nn::BatchNorm3dOptions(3)', - input_size=(2, 3, 4, 4, 4), - cudnn=True, - check_eval=True, - ), - dict( - module_name='BatchNorm3d', - constructor_args=(3, 1e-3, None), - cpp_constructor_args='torch::nn::BatchNorm3dOptions(3).eps(1e-3).momentum(c10::nullopt)', - input_size=(2, 3, 4, 4, 4), - cudnn=True, - check_eval=True, - desc='3d_simple_average', - ), - dict( - module_name='BatchNorm3d', - constructor_args=(3, 1e-3, 0.7), - cpp_constructor_args='torch::nn::BatchNorm3dOptions(3).eps(1e-3).momentum(0.7)', - input_size=(2, 3, 4, 4, 4), - cudnn=True, - check_eval=True, - desc='momentum', - ), - dict( - module_name='BatchNorm3d', - constructor_args=(3, 1e-3, 0.7, False), - cpp_constructor_args='torch::nn::BatchNorm3dOptions(3).eps(1e-3).momentum(0.7).affine(false)', - input_size=(2, 3, 4, 4, 4), - cudnn=True, - check_eval=True, - desc='not_affine', - ), - dict( - module_name='BatchNorm3d', - constructor_args=(3, 1e-3, 0.7, True, False), - cpp_constructor_args='''torch::nn::BatchNorm3dOptions(3) - .eps(1e-3).momentum(0.7).affine(true).track_running_stats(false)''', - input_size=(2, 3, 4, 4, 4), - cudnn=True, - check_eval=True, - desc='not_tracking_stats', - ), - dict( - module_name='BatchNorm3d', - constructor_args=(5, 1e-3, 0.3, False), - cpp_constructor_args='torch::nn::BatchNorm3dOptions(5).eps(1e-3).momentum(0.3).affine(false)', - input_size=(0, 5, 2, 2, 2), - cudnn=True, - check_eval=True, - desc='zero_batch', - ), - dict( - module_name='InstanceNorm1d', - constructor_args=(3, 1e-3, 0.3), - cpp_constructor_args='torch::nn::InstanceNorm1dOptions(3).eps(1e-3).momentum(0.3)', - input_size=(4, 3, 15), - cudnn=True, - check_eval=True, - ), - dict( - module_name='InstanceNorm1d', - constructor_args=(3, 1e-3, 0.3, False, True), - cpp_constructor_args='''torch::nn::InstanceNorm1dOptions(3) - .eps(1e-3).momentum(0.3).affine(false).track_running_stats(true)''', - input_size=(4, 3, 15), - cudnn=True, - check_eval=True, - desc='tracking_stats', - ), - dict( - module_name='InstanceNorm2d', - constructor_args=(3, 1e-3, 0.3), - cpp_constructor_args='torch::nn::InstanceNorm2dOptions(3).eps(1e-3).momentum(0.3)', - input_size=(2, 3, 6, 6), - cudnn=True, - check_eval=True, - ), - dict( - module_name='InstanceNorm2d', - constructor_args=(3, 1e-3, 0.3, False, True), - cpp_constructor_args='''torch::nn::InstanceNorm2dOptions(3) - .eps(1e-3).momentum(0.3).affine(false).track_running_stats(true)''', - input_size=(2, 3, 6, 6), - cudnn=True, - check_eval=True, - desc='tracking_stats', - ), - dict( - module_name='InstanceNorm3d', - constructor_args=(3, 1e-3, 0.3), - cpp_constructor_args='torch::nn::InstanceNorm3dOptions(3).eps(1e-3).momentum(0.3)', - input_size=(2, 3, 4, 4, 4), - cudnn=True, - check_eval=True, - ), - dict( - module_name='InstanceNorm3d', - constructor_args=(3, 1e-3, 0.3, False, True), - cpp_constructor_args='''torch::nn::InstanceNorm3dOptions(3) - .eps(1e-3).momentum(0.3).affine(false).track_running_stats(true)''', - input_size=(2, 3, 4, 4, 4), - cudnn=True, - check_eval=True, - desc='tracking_stats', - ), - dict( - module_name='LayerNorm', - constructor_args=([5], 1e-3), - cpp_constructor_args='torch::nn::LayerNormOptions({5}).eps(1e-3)', - input_size=(4, 5, 5), - cudnn=True, - 
check_eval=True, - desc='1d_elementwise_affine', - ), - dict( - module_name='LayerNorm', - constructor_args=([5], 1e-3, False), - cpp_constructor_args='torch::nn::LayerNormOptions({5}).eps(1e-3).elementwise_affine(false)', - input_size=(4, 5, 5), - cudnn=True, - check_eval=True, - desc='1d_no_elementwise_affine', - ), - dict( - module_name='LayerNorm', - constructor_args=([2, 2, 5], 1e-3), - cpp_constructor_args='torch::nn::LayerNormOptions({2, 2, 5}).eps(1e-3)', - input_size=(4, 2, 2, 5), - cudnn=True, - check_eval=True, - desc='3d_elementwise_affine', - ), - dict( - module_name='LayerNorm', - constructor_args=([2, 2, 5], 1e-3, False), - cpp_constructor_args='torch::nn::LayerNormOptions({2, 2, 5}).eps(1e-3).elementwise_affine(false)', - input_size=(4, 2, 2, 5), - cudnn=True, - check_eval=True, - desc='3d_no_elementwise_affine', - ), - dict( - module_name='LayerNorm', - constructor_args=([5], 1e-3), - cpp_constructor_args='torch::nn::LayerNormOptions({5}).eps(1e-3)', - input_size=(0, 5), - cudnn=True, - check_eval=True, - desc='1d_empty_elementwise_affine', - ), - dict( - module_name='GroupNorm', - constructor_args=(3, 6, 1e-3), - cpp_constructor_args='torch::nn::GroupNormOptions(3, 6).eps(1e-3)', - input_size=(4, 6, 5), - cudnn=True, - check_eval=True, - desc='1d_affine', - ), - dict( - module_name='GroupNorm', - constructor_args=(5, 5, 1e-3, False), - cpp_constructor_args='torch::nn::GroupNormOptions(5, 5).eps(1e-3).affine(false)', - input_size=(4, 5, 5), - cudnn=True, - check_eval=True, - desc='1d_no_affine_IN', # this setting is equivalent with InstanceNormi - ), - dict( - module_name='GroupNorm', - constructor_args=(1, 5, 1e-3, False), - cpp_constructor_args='torch::nn::GroupNormOptions(1, 5).eps(1e-3).affine(false)', - input_size=(4, 5, 5), - cudnn=True, - check_eval=True, - desc='1d_no_affine_LN', # this setting is equivalent with LayerNorm - ), - dict( - module_name='GroupNorm', - constructor_args=(3, 6, 1e-3), - cpp_constructor_args='torch::nn::GroupNormOptions(3, 6).eps(1e-3)', - input_size=(4, 6, 2, 3), - cudnn=True, - check_eval=True, - desc='2d_affine', - ), - dict( - module_name='GroupNorm', - constructor_args=(3, 3, 1e-3, False), - cpp_constructor_args='torch::nn::GroupNormOptions(3, 3).eps(1e-3).affine(false)', - input_size=(4, 3, 2, 3), - cudnn=True, - check_eval=True, - desc='2d_no_affine_IN', # this setting is equivalent with InstanceNorm - ), - dict( - module_name='GroupNorm', - constructor_args=(1, 3, 1e-3, False), - cpp_constructor_args='torch::nn::GroupNormOptions(1, 3).eps(1e-3).affine(false)', - input_size=(4, 3, 2, 3), - cudnn=True, - check_eval=True, - desc='2d_no_affine_LN', # this setting is equivalent with LayerNorm - ), - dict( - module_name='Conv1d', - constructor_args=(4, 5, 3), - cpp_constructor_args='torch::nn::Conv1dOptions(4, 5, 3)', - input_size=(2, 4, 10), - cudnn=True, - ), - dict( - module_name='Conv1d', - constructor_args=(4, 5, 3, 2), - cpp_constructor_args='torch::nn::Conv1dOptions(4, 5, 3).stride(2)', - input_size=(2, 4, 10), - cudnn=True, - desc='stride', - ), - dict( - module_name='Conv1d', - constructor_args=(4, 5, 3, 1, 1), - cpp_constructor_args='torch::nn::Conv1dOptions(4, 5, 3).stride(1).padding(1)', - input_size=(2, 4, 10), - cudnn=True, - desc='pad1', - ), - dict( - module_name='Conv1d', - constructor_args=(4, 5, 5, 1, 2), - cpp_constructor_args='torch::nn::Conv1dOptions(4, 5, 5).stride(1).padding(2)', - input_size=(2, 4, 10), - cudnn=True, - desc='pad2', - ), - dict( - module_name='Conv1d', - constructor_args=(4, 4, 3, 1, 1), - 
cpp_constructor_args='torch::nn::Conv1dOptions(4, 4, 3).stride(1).padding(1)', - input_size=(1, 4, 1), - cudnn=True, - desc='pad1size1', - ), - dict( - module_name='Conv1d', - constructor_args=(4, 4, 5, 1, 2), - cpp_constructor_args='torch::nn::Conv1dOptions(4, 4, 5).stride(1).padding(2)', - input_size=(1, 4, 1), - cudnn=True, - desc='pad2size1', - ), - dict( - module_name='Conv1d', - constructor_args=(4, 5, 3), - cpp_constructor_args='torch::nn::Conv1dOptions(4, 5, 3)', - input_size=(0, 4, 10), - cudnn=True, - desc='zero_batch', - ), - dict( - fullname='Conv1d_dilated', - constructor=lambda: nn.Conv1d(4, 5, kernel_size=3, dilation=2), - cpp_constructor_args='torch::nn::Conv1dOptions(4, 5, 3).dilation(2)', - input_size=(2, 4, 10), - ), - dict( - fullname='Conv1d_groups', - constructor=lambda: nn.Conv1d(4, 6, kernel_size=3, groups=2), - cpp_constructor_args='torch::nn::Conv1dOptions(4, 6, 3).groups(2)', - input_size=(2, 4, 6), - cudnn=True, - ), - dict( - fullname='ConvTranspose1d', - constructor=lambda: nn.ConvTranspose1d(3, 4, kernel_size=3, stride=(3,), padding=1, output_padding=(1,)), - cpp_constructor_args='torch::nn::ConvTranspose1dOptions(3, 4, 3).stride(3).padding(1).output_padding(1)', - cudnn=True, - input_size=(1, 3, 7), - ), - dict( - module_name='ConvTranspose1d', - constructor_args=(3, 4, 3, 2, 1, 1, 1, False), - cpp_constructor_args='''torch::nn::ConvTranspose1dOptions(3, 4, 3) - .stride(2).padding(1).output_padding(1).groups(1).bias(false)''', - input_size=(1, 3, 6), - cudnn=True, - desc='no_bias', - ), - dict( - module_name='ConvTranspose1d', - constructor_args=(3, 4, 3, 2, 1, 1, 1, True, 2), - cpp_constructor_args='''torch::nn::ConvTranspose1dOptions(3, 4, 3) - .stride(2).padding(1).output_padding(1).groups(1).bias(true).dilation(2)''', - input_size=(1, 3, 6), - cudnn=True, - desc='dilated', - ), - dict( - fullname='ConvTranspose1d_groups', - constructor=lambda: nn.ConvTranspose1d(4, 6, 3, stride=(3,), padding=1, output_padding=(1,), groups=2), - cpp_constructor_args='''torch::nn::ConvTranspose1dOptions(4, 6, 3) - .stride(3).padding(1).output_padding(1).groups(2)''', - cudnn=True, - input_size=(2, 4, 7), - ), - dict( - module_name='MaxPool1d', - constructor_args=(4,), - cpp_constructor_args='torch::nn::MaxPool1dOptions(4)', - input_size=(2, 10, 4), - ), - dict( - module_name='MaxPool1d', - constructor_args=(4, 4), - cpp_constructor_args='torch::nn::MaxPool1dOptions(4).stride(4)', - input_size=(2, 10, 4), - desc='stride', - ), - dict( - module_name='Conv2d', - constructor_args=(3, 4, (3, 2)), - cpp_constructor_args='torch::nn::Conv2dOptions(3, 4, {3, 2})', - input_size=(2, 3, 7, 5), - cudnn=True, - check_with_long_tensor=True, - ), - dict( - module_name='Conv2d', - constructor_args=(3, 4, (3, 3), (2, 2)), - cpp_constructor_args='torch::nn::Conv2dOptions(3, 4, {3, 3}).stride({2, 2})', - input_size=(2, 3, 6, 6), - cudnn=True, - desc='strided', - check_with_long_tensor=True, - ), - dict( - module_name='Conv2d', - constructor_args=(3, 4, (3, 3), (2, 2), (1, 1)), - cpp_constructor_args='torch::nn::Conv2dOptions(3, 4, {3, 3}).stride({2, 2}).padding({1, 1})', - input_size=(2, 3, 6, 6), - cudnn=True, - desc='padding', - check_with_long_tensor=True, - ), - dict( - module_name='Conv2d', - constructor_args=(3, 2, (3, 3), (2, 2), (1, 1), (2, 2)), - cpp_constructor_args='torch::nn::Conv2dOptions(3, 2, {3, 3}).stride({2, 2}).padding({1, 1}).dilation({2, 2})', - input_size=(2, 3, 8, 8), - cudnn=True, - desc='dilated', - check_with_long_tensor=True, - ), - dict( - module_name='Conv2d', - 
constructor_args=(3, 4, (3, 2), 1, 0, 1, 1, False), - cpp_constructor_args='''torch::nn::Conv2dOptions(3, 4, {3, 2}) - .stride(1).padding(0).dilation(1).groups(1).bias(false)''', - input_size=(2, 3, 6, 5), - cudnn=True, - desc='no_bias', - check_with_long_tensor=True, - ), - dict( - module_name='Conv2d', - constructor_args=(3, 4, (3, 2)), - cpp_constructor_args='torch::nn::Conv2dOptions(3, 4, {3, 2})', - input_size=(0, 3, 7, 5), - cudnn=True, - desc='zero_batch', - check_with_long_tensor=True, - ), - dict( - fullname='Conv2d_groups', - constructor=lambda: nn.Conv2d(4, 6, (3, 2), groups=2), - cpp_constructor_args='torch::nn::Conv2dOptions(4, 6, {3, 2}).groups(2)', - input_size=(2, 4, 6, 5), - cudnn=True, - check_with_long_tensor=True, - ), - dict( - fullname='Conv2d_groups_thnn', - constructor=lambda: nn.Conv2d(4, 6, (3, 2), groups=2), - cpp_constructor_args='torch::nn::Conv2dOptions(4, 6, {3, 2}).groups(2)', - input_size=(2, 4, 6, 5), - check_with_long_tensor=True, - ), - dict( - module_name='ConvTranspose2d', - constructor_args=(3, 4, 3, (3, 2), 1, (1, 1)), - cpp_constructor_args='''torch::nn::ConvTranspose2dOptions(3, 4, 3) - .stride({3, 2}).padding(1).output_padding({1, 1})''', - cudnn=True, - input_size=(1, 3, 7, 6), - check_with_long_tensor=True, - ), - dict( - module_name='ConvTranspose2d', - constructor_args=(3, 4, 3, (2, 3), 1, (1, 1), 1, False, (2, 2)), - cpp_constructor_args='''torch::nn::ConvTranspose2dOptions(3, 4, 3) - .stride({2, 3}) - .padding(1) - .output_padding({1, 1}) - .groups(1) - .bias(false) - .dilation({2, 2})''', - input_size=(1, 3, 6, 7), - cudnn=True, - desc='dilated', - check_with_long_tensor=True, - ), - dict( - module_name='ConvTranspose2d', - constructor_args=(3, 4, 3, (2, 3), 1, (1, 1), 1, False), - cpp_constructor_args='''torch::nn::ConvTranspose2dOptions(3, 4, 3) - .stride({2, 3}).padding(1).output_padding({1, 1}).groups(1).bias(false)''', - input_size=(1, 3, 6, 7), - cudnn=True, - desc='no_bias', - check_with_long_tensor=True, - ), - dict( - fullname='ConvTranspose2d_groups', - constructor=lambda: nn.ConvTranspose2d(2, 4, (2, 3), groups=2), - cpp_constructor_args='torch::nn::ConvTranspose2dOptions(2, 4, {2, 3}).groups(2)', - input_size=(1, 2, 4, 5), - cudnn=True, - check_with_long_tensor=True, - ), - dict( - fullname='Conv2d_depthwise', - constructor=lambda: nn.Conv2d(4, 4, (3, 3), groups=4), - cpp_constructor_args='torch::nn::Conv2dOptions(4, 4, {3, 3}).groups(4)', - input_size=(2, 4, 6, 6), - ), - dict( - fullname='Conv2d_depthwise_with_multiplier', - constructor=lambda: nn.Conv2d(4, 8, (3, 3), groups=4), - cpp_constructor_args='torch::nn::Conv2dOptions(4, 8, {3, 3}).groups(4)', - input_size=(2, 4, 6, 6), - ), - dict( - fullname='Conv2d_depthwise_strided', - constructor=lambda: nn.Conv2d(4, 4, (3, 3), stride=(2, 2), groups=4), - cpp_constructor_args='torch::nn::Conv2dOptions(4, 4, {3, 3}).stride({2, 2}).groups(4)', - input_size=(2, 4, 6, 6), - ), - dict( - fullname='Conv2d_depthwise_padded', - constructor=lambda: nn.Conv2d(4, 4, (3, 3), padding=(1, 1), groups=4), - cpp_constructor_args='torch::nn::Conv2dOptions(4, 4, {3, 3}).padding({1, 1}).groups(4)', - input_size=(2, 4, 6, 6), - ), - dict( - fullname='Conv2d_depthwise_dilated', - constructor=lambda: nn.Conv2d(4, 4, (2, 2), dilation=(2, 2), groups=4), - cpp_constructor_args='torch::nn::Conv2dOptions(4, 4, {2, 2}).dilation({2, 2}).groups(4)', - input_size=(2, 4, 5, 5), - ), - dict( - module_name='MaxPool2d', - constructor_args=((3, 3), (2, 2), (1, 1)), - 
cpp_constructor_args='torch::nn::MaxPool2dOptions({3, 3}).stride({2, 2}).padding({1, 1})', - input_size=(3, 7, 7), - desc='3d_input' - ), - dict( - module_name='MaxPool2d', - constructor_args=((3, 3), (2, 2), (1, 1)), - cpp_constructor_args='torch::nn::MaxPool2dOptions({3, 3}).stride({2, 2}).padding({1, 1})', - input_size=(1, 3, 7, 7), - check_with_channels_last=True, - desc='4d_input' - ), - dict( - module_name='AvgPool1d', - constructor_args=(2,), - cpp_constructor_args='torch::nn::AvgPool1dOptions(2)', - input_size=(2, 3, 6), - ), - dict( - module_name='AvgPool1d', - constructor_args=((2,), (2,)), - cpp_constructor_args='torch::nn::AvgPool1dOptions(2).stride(2)', - input_size=(2, 3, 6), - desc='stride', - ), - dict( - module_name='AvgPool1d', - constructor_args=(2, 2, 1), - cpp_constructor_args='torch::nn::AvgPool1dOptions(2).stride(2).padding(1)', - input_size=(2, 3, 6), - desc='stride_pad', - ), - dict( - module_name='AvgPool2d', - constructor_args=((2, 2),), - cpp_constructor_args='torch::nn::AvgPool2dOptions({2, 2})', - input_size=(2, 3, 6, 6), - ), - dict( - module_name='AvgPool2d', - constructor_args=((2, 2), (2, 2)), - cpp_constructor_args='torch::nn::AvgPool2dOptions({2, 2}).stride({2, 2})', - input_size=(2, 3, 6, 6), - desc='stride', - ), - dict( - module_name='AvgPool2d', - constructor_args=((2, 2), (2, 2), (1, 1)), - cpp_constructor_args='torch::nn::AvgPool2dOptions({2, 2}).stride({2, 2}).padding({1, 1})', - input_size=(2, 3, 6, 6), - desc='stride_pad', - ), - dict( - fullname='AvgPool2d_divisor', - constructor=lambda: nn.AvgPool2d((2, 2), divisor_override=1), - cpp_constructor_args='torch::nn::AvgPool2dOptions({2, 2}).divisor_override(1)', - input_size=(2, 3, 6, 6), - check_with_long_tensor=True, - ), - dict( - fullname='AvgPool2d_divisor_stride', - constructor=lambda: nn.AvgPool2d((2, 2), (2, 2), divisor_override=1), - cpp_constructor_args='torch::nn::AvgPool2dOptions({2, 2}).stride({2, 2}).divisor_override(1)', - input_size=(2, 3, 6, 6), - check_with_long_tensor=True, - ), - dict( - fullname='AvgPool2d_divisor_stride_pad', - constructor=lambda: nn.AvgPool2d((2, 2), (2, 2), (1, 1), divisor_override=1), - cpp_constructor_args='torch::nn::AvgPool2dOptions({2, 2}).stride({2, 2}).padding({1, 1}).divisor_override(1)', - input_size=(2, 3, 6, 6), - check_with_long_tensor=True, - ), - dict( - module_name='LPPool2d', - constructor_args=(2, 2, 2), - cpp_constructor_args='torch::nn::LPPool2dOptions(2, 2).stride(2)', - input_size=(1, 3, 7, 7), - ), - dict( - module_name='LPPool2d', - constructor_args=(1.5, 2), - cpp_constructor_args='torch::nn::LPPool2dOptions(1.5, 2)', - input_fn=lambda: torch.rand(1, 3, 7, 7), - desc='norm', - ), - dict( - module_name='LPPool1d', - constructor_args=(1.5, 2), - cpp_constructor_args='torch::nn::LPPool1dOptions(1.5, 2)', - input_fn=lambda: torch.rand(1, 3, 7), - desc='norm', - ), - dict( - module_name='LPPool1d', - constructor_args=(2, 2, 3), - cpp_constructor_args='torch::nn::LPPool1dOptions(2, 2).stride(3)', - input_size=(1, 3, 7), - ), - dict( - module_name='LocalResponseNorm', - constructor_args=(3, ), - cpp_constructor_args='torch::nn::LocalResponseNormOptions(3)', - input_size=(1, 5, 7), - desc='1d', - ), - dict( - module_name='LocalResponseNorm', - constructor_args=(2, ), - cpp_constructor_args='torch::nn::LocalResponseNormOptions(2)', - input_size=(1, 5, 7, 7), - desc='2d_uneven_pad', - ), - dict( - module_name='LocalResponseNorm', - constructor_args=(1, 1., 0.5, 2.), - 
cpp_constructor_args='torch::nn::LocalResponseNormOptions(1).alpha(1.).beta(0.5).k(2.)', - input_size=(1, 5, 7, 7, 7), - desc='3d_custom_params', - ), - dict( - module_name='ReflectionPad1d', - constructor_args=((1, 2),), - cpp_constructor_args='torch::nn::ReflectionPad1dOptions({1, 2})', - input_size=(2, 3, 8), - ), - dict( - module_name='ReflectionPad2d', - constructor_args=((1, 2, 3, 4),), - cpp_constructor_args='torch::nn::ReflectionPad2dOptions({1, 2, 3, 4})', - input_size=(2, 3, 8, 8), - ), - dict( - module_name='ReplicationPad1d', - constructor_args=((1, 2),), - cpp_constructor_args='torch::nn::ReplicationPad1dOptions({1, 2})', - input_size=(2, 3, 4), - ), - dict( - module_name='ReplicationPad2d', - constructor_args=((1, 2, 3, 4),), - cpp_constructor_args='torch::nn::ReplicationPad2dOptions({1, 2, 3, 4})', - input_size=(2, 3, 4, 4), - ), - dict( - module_name='ZeroPad2d', - constructor_args=((1, 2, 3, 4),), - cpp_constructor_args='torch::nn::ZeroPad2dOptions({1, 2, 3, 4})', - input_size=(2, 3, 4, 4), - ), - dict( - module_name='ZeroPad2d', - constructor_args=((-1, -1, -1, -2),), - cpp_constructor_args='torch::nn::ZeroPad2dOptions({-1, -1, -1, -2})', - input_size=(2, 3, 4, 4), - desc='negative_dims' - ), - dict( - module_name='ConstantPad1d', - constructor_args=((1, 2), 2.), - cpp_constructor_args='torch::nn::ConstantPad1dOptions({1, 2}, 2.)', - input_size=(2, 3, 4), - ), - dict( - module_name='ConstantPad2d', - constructor_args=((1, 2, 3, 4), 2.), - cpp_constructor_args='torch::nn::ConstantPad2dOptions({1, 2, 3, 4}, 2.)', - input_size=(2, 3, 4, 4), - ), - dict( - module_name='ConstantPad3d', - constructor_args=((1, 2, 3, 4, 1, 0), 2.), - cpp_constructor_args='torch::nn::ConstantPad3dOptions({1, 2, 3, 4, 1, 0}, 2.)', - input_size=(2, 3, 4, 4, 5), - ), - dict( - module_name='Conv3d', - constructor_args=(3, 4, (2, 3, 4)), - cpp_constructor_args='torch::nn::Conv3dOptions(3, 4, {2, 3, 4})', - input_size=(2, 3, 3, 4, 5), - cudnn=True, - check_with_long_tensor=True, - ), - dict( - module_name='Conv3d', - constructor_args=(3, 4, (2, 3, 4), 1, 0, 1, 1, False), - cpp_constructor_args='''torch::nn::Conv3dOptions(3, 4, {2, 3, 4}) - .stride(1).padding(0).dilation(1).groups(1).bias(false)''', - input_size=(2, 3, 3, 4, 5), - cudnn=True, - desc='no_bias', - check_with_long_tensor=True, - ), - dict( - module_name='Conv3d', - constructor_args=(3, 4, 2, 2), - cpp_constructor_args='torch::nn::Conv3dOptions(3, 4, 2).stride(2)', - input_size=(2, 3, 5, 5, 5), - cudnn=True, - desc='stride', - check_with_long_tensor=True, - ), - dict( - module_name='Conv3d', - constructor_args=(3, 4, 2, 2, 1), - cpp_constructor_args='torch::nn::Conv3dOptions(3, 4, 2).stride(2).padding(1)', - input_size=(2, 3, 5, 5, 5), - cudnn=True, - desc='stride_padding', - check_with_long_tensor=True, - ), - dict( - module_name='Conv3d', - constructor_args=(3, 4, (2, 3, 4)), - cpp_constructor_args='torch::nn::Conv3dOptions(3, 4, {2, 3, 4})', - input_size=(0, 3, 3, 4, 5), - cudnn=True, - check_with_long_tensor=True, - desc='zero_batch', - ), - dict( - fullname='Conv3d_groups', - constructor=lambda: nn.Conv3d(4, 6, kernel_size=3, groups=2), - cpp_constructor_args='torch::nn::Conv3dOptions(4, 6, 3).groups(2)', - input_size=(2, 4, 4, 5, 4), - cudnn=True, - check_with_long_tensor=True, - ), - dict( - fullname='Conv3d_dilated', - constructor=lambda: nn.Conv3d(3, 4, kernel_size=2, dilation=2), - cpp_constructor_args='torch::nn::Conv3dOptions(3, 4, 2).dilation(2)', - input_size=(2, 3, 5, 5, 5), - ), - dict( - fullname='Conv3d_dilated_strided', - 
constructor=lambda: nn.Conv3d(3, 4, kernel_size=2, dilation=2, stride=2), - cpp_constructor_args='torch::nn::Conv3dOptions(3, 4, 2).dilation(2).stride(2)', - input_size=(2, 3, 5, 5, 5), - ), - dict( - module_name='ConvTranspose3d', - constructor_args=(2, 3, (2, 3, 2)), - cpp_constructor_args='torch::nn::ConvTranspose3dOptions(2, 3, {2, 3, 2})', - cudnn=True, - input_size=(1, 2, 4, 5, 4), - ), - dict( - module_name='ConvTranspose3d', - constructor_args=(2, 3, (2, 3, 2), 1, 0, 0, 1, True, (2, 2, 2)), - cpp_constructor_args='''torch::nn::ConvTranspose3dOptions(2, 3, {2, 3, 2}) - .stride(1).padding(0).output_padding(0).groups(1).bias(true).dilation({2, 2, 2})''', - cudnn=True, - input_size=(1, 2, 4, 5, 4), - desc='dilated', - ), - dict( - module_name='MaxPool3d', - constructor_args=((2, 2, 2),), - cpp_constructor_args='torch::nn::MaxPool3dOptions({2, 2, 2})', - input_size=(2, 3, 5, 5, 5), - ), - dict( - module_name='MaxPool3d', - constructor_args=(2, (2, 2, 2)), - cpp_constructor_args='torch::nn::MaxPool3dOptions(2).stride({2, 2, 2})', - input_size=(2, 3, 5, 5, 5), - desc='stride', - ), - dict( - module_name='MaxPool3d', - constructor_args=(2, 2, (1, 1, 1)), - cpp_constructor_args='torch::nn::MaxPool3dOptions(2).stride(2).padding({1, 1, 1})', - input_size=(2, 3, 5, 5, 5), - desc='stride_padding', - ), - dict( - module_name='AvgPool3d', - constructor_args=((2, 2, 2),), - cpp_constructor_args='torch::nn::AvgPool3dOptions({2, 2, 2})', - input_size=(2, 3, 4, 4, 4), - ), - dict( - module_name='AvgPool3d', - constructor_args=(2, (2, 2, 2)), - cpp_constructor_args='torch::nn::AvgPool3dOptions(2).stride({2, 2, 2})', - input_size=(2, 3, 5, 5, 5), - desc='stride', - ), - dict( - module_name='AvgPool3d', - constructor_args=(2, 2, (1, 1, 1)), - cpp_constructor_args='torch::nn::AvgPool3dOptions(2).stride(2).padding({1, 1, 1})', - input_size=(2, 3, 5, 5, 5), - desc='stride_pad', - ), - dict( - module_name='AvgPool3d', - constructor_args=(4, 2, (1, 2, 1)), - cpp_constructor_args='torch::nn::AvgPool3dOptions(4).stride(2).padding({1, 2, 1})', - input_size=(2, 3, 5, 5, 5), - desc='stride_pad_npu_fixedkw_output', - ), - dict( - module_name='AvgPool3d', - constructor_args=((2, 4, 8), 1, (1, 1, 2)), - cpp_constructor_args='torch::nn::AvgPool3dOptions({2, 4, 8}).stride(1).padding({1, 1, 2})', - input_size=(2, 3, 2, 4, 8), - desc='stride_pad_npu_general_output', - ), - dict( - module_name='AvgPool3d', - constructor_args=(3, 1, 0), - cpp_constructor_args='torch::nn::AvgPool3dOptions(3).stride(1).padding(0)', - input_size=(2, 3, 4, 4, 4), - desc='stride1_pad0_npu_input', - ), - dict( - module_name='AvgPool3d', - constructor_args=(2, 2, (1, 1, 1)), - cpp_constructor_args='torch::nn::AvgPool3dOptions(2).stride(2).padding({1, 1, 1})', - input_size=(2, 3, 4, 4, 4), - desc='stride_pad_npu_input_nooverlap', - ), - dict( - fullname='AvgPool3d_divisor', - constructor=lambda: nn.AvgPool3d((2, 2, 2), divisor_override=1), - cpp_constructor_args='torch::nn::AvgPool3dOptions({2, 2, 2}).divisor_override(1)', - input_size=(2, 3, 4, 4, 4), - check_with_long_tensor=True, - ), - dict( - fullname='AvgPool3d_divisor_stride', - constructor=lambda: nn.AvgPool3d(2, (2, 2, 2), divisor_override=1), - cpp_constructor_args='torch::nn::AvgPool3dOptions(2).stride({2, 2, 2}).divisor_override(1)', - input_size=(2, 3, 5, 5, 5), - check_with_long_tensor=True, - ), - dict( - fullname='AvgPool3d_divisor_stride_pad', - constructor=lambda: nn.AvgPool3d(2, 2, (1, 1, 1), divisor_override=1), - 
cpp_constructor_args='torch::nn::AvgPool3dOptions(2).stride(2).padding({1, 1, 1}).divisor_override(1)', - input_size=(2, 3, 5, 5, 5), - check_with_long_tensor=True, - ), - dict( - fullname='AvgPool3d_divisor_stride_pad_npu_fixedkw_output', - constructor=lambda: nn.AvgPool3d(4, 2, (1, 2, 1), divisor_override=1), - cpp_constructor_args='torch::nn::AvgPool3dOptions(4).stride(2).padding({1, 2, 1}).divisor_override(1)', - input_size=(2, 3, 5, 5, 5), - check_with_long_tensor=True, - ), - dict( - fullname='AvgPool3d_divisor_stride_pad_npu_general_output', - constructor=lambda: nn.AvgPool3d((2, 4, 8), 1, (1, 1, 2), divisor_override=1), - cpp_constructor_args='torch::nn::AvgPool3dOptions({2, 4, 8}).stride(1).padding({1, 1, 2}).divisor_override(1)', - input_size=(2, 3, 2, 4, 8), - check_with_long_tensor=True, - ), - dict( - fullname='AvgPool3d_divisor_stride1_pad0_npu_input', - constructor=lambda: nn.AvgPool3d(3, 1, 0, divisor_override=1), - cpp_constructor_args='torch::nn::AvgPool3dOptions(3).stride(1).padding(0).divisor_override(1)', - input_size=(2, 3, 4, 4, 4), - check_with_long_tensor=True, - ), - dict( - fullname='AvgPool3d_divisor_stride_pad_npu_input_nooverlap', - constructor=lambda: nn.AvgPool3d(2, 2, (1, 1, 1), divisor_override=1), - cpp_constructor_args='torch::nn::AvgPool3dOptions(2).stride(2).padding({1, 1, 1}).divisor_override(1)', - input_size=(2, 3, 4, 4, 4), - check_with_long_tensor=True, - ), - dict( - module_name='ReplicationPad3d', - constructor_args=((1, 2, 3, 4, 5, 6),), - cpp_constructor_args='torch::nn::ReplicationPad3dOptions({1, 2, 3, 4, 5, 6})', - input_size=(2, 3, 5, 5, 5), - ), - dict( - module_name='Embedding', - constructor_args=(4, 3), - cpp_constructor_args='torch::nn::EmbeddingOptions(4, 3)', - input_fn=lambda: torch.empty(2, 3, dtype=torch.long).random_(4), - jacobian_input=False, - check_gradgrad=False, - ), - dict( - module_name='EmbeddingBag', - constructor_args=(4, 3), - cpp_constructor_args='torch::nn::EmbeddingBagOptions(4, 3)', - input_fn=lambda: torch.empty(2, 3, dtype=torch.long).random_(4), - jacobian_input=False, - check_gradgrad=False, - desc='mean', - ), - dict( - module_name='EmbeddingBag', - constructor_args=(4, 3, None, 2., False, 'sum'), - cpp_constructor_args='''torch::nn::EmbeddingBagOptions(4, 3) - .max_norm(c10::nullopt).norm_type(2.).scale_grad_by_freq(false).mode(torch::kSum)''', - input_fn=lambda: torch.empty(2, 3, dtype=torch.long).random_(4), - jacobian_input=False, - check_gradgrad=False, - desc='sum', - ), - dict( - module_name='EmbeddingBag', - constructor_args=(4, 3, None, 2., False, 'max'), - cpp_constructor_args='''torch::nn::EmbeddingBagOptions(4, 3) - .max_norm(c10::nullopt).norm_type(2.).scale_grad_by_freq(false).mode(torch::kMax)''', - input_fn=lambda: torch.empty(2, 3, dtype=torch.long).random_(4), - jacobian_input=False, - check_gradgrad=False, - desc='max', - ), - dict( - fullname='EmbeddingBag_sparse', - constructor=lambda: nn.EmbeddingBag(4, 3, sparse=True), - cpp_constructor_args='torch::nn::EmbeddingBagOptions(4, 3).sparse(true)', - input_fn=lambda: torch.randperm(2).repeat(1, 2), - jacobian_input=False, - check_gradgrad=False, - ), - dict( - constructor=lambda: nn.Embedding(4, 3, sparse=True), - cpp_constructor_args='torch::nn::EmbeddingOptions(4, 3).sparse(true)', - input_fn=lambda: torch.randperm(2).repeat(1, 2), - jacobian_input=False, - fullname='Embedding_sparse', - check_gradgrad=False, - ), - dict( - module_name='PixelShuffle', - constructor_args=(3,), - cpp_constructor_args='torch::nn::PixelShuffleOptions(3)', - 
input_size=(1, 9, 4, 4), - ), - dict( - constructor=wrap_functional(F.interpolate, size=12, scale_factor=None, mode='nearest'), - cpp_options_args='''F::InterpolateFuncOptions() - .size(std::vector({12})).scale_factor(c10::nullopt).mode(torch::kNearest)''', - input_size=(1, 2, 4), - fullname='interpolate_nearest_1d', - pickle=False, - ), - dict( - constructor=wrap_functional(F.interpolate, size=12, scale_factor=None, mode='nearest'), - cpp_options_args='''F::InterpolateFuncOptions() - .size(std::vector({12})).scale_factor(c10::nullopt).mode(torch::kNearest)''', - input_size=(0, 2, 4), - fullname='interpolate_nearest_1d_zero_dim', - pickle=False, - ), - dict( - constructor=wrap_functional(F.interpolate, size=(12, ), scale_factor=None, mode='nearest'), - cpp_options_args='''F::InterpolateFuncOptions() - .size(std::vector({12})).scale_factor(c10::nullopt).mode(torch::kNearest)''', - input_size=(1, 2, 3), - fullname='interpolate_nearest_tuple_1d', - pickle=False, - ), - dict( - constructor=wrap_functional(F.interpolate, size=None, scale_factor=4., mode='nearest'), - cpp_options_args='''F::InterpolateFuncOptions() - .size(c10::nullopt).scale_factor(std::vector({4.})).mode(torch::kNearest)''', - input_size=(1, 2, 4), - fullname='interpolate_nearest_scale_1d', - pickle=False, - ), - dict( - constructor=wrap_functional(F.interpolate, size=12, scale_factor=None, mode='linear', align_corners=False), - cpp_options_args='''F::InterpolateFuncOptions() - .size(std::vector({12})) - .scale_factor(c10::nullopt) - .mode(torch::kLinear) - .align_corners(false)''', - input_size=(1, 2, 4), - fullname='interpolate_linear_1d', - pickle=False, - ), - dict( - constructor=wrap_functional(F.interpolate, size=(4, ), scale_factor=None, mode='linear', align_corners=False), - cpp_options_args='''F::InterpolateFuncOptions() - .size(std::vector({4})) - .scale_factor(c10::nullopt) - .mode(torch::kLinear) - .align_corners(false)''', - input_size=(1, 2, 3), - fullname='interpolate_linear_tuple_1d', - pickle=False, - ), - dict( - constructor=wrap_functional(F.interpolate, size=None, scale_factor=4., mode='linear', align_corners=False), - cpp_options_args='''F::InterpolateFuncOptions() - .size(c10::nullopt) - .scale_factor(std::vector({4.})) - .mode(torch::kLinear) - .align_corners(false)''', - input_size=(1, 2, 4), - fullname='interpolate_linear_scale_1d', - pickle=False, - ), - dict( - constructor=wrap_functional(F.interpolate, size=12, scale_factor=None, mode='linear', align_corners=False), - cpp_options_args='''F::InterpolateFuncOptions() - .size(std::vector({12})) - .scale_factor(c10::nullopt) - .mode(torch::kLinear) - .align_corners(false)''', - input_size=(0, 2, 4), - fullname='interpolate_linear_1d_zero_dim', - pickle=False, - ), - dict( - constructor=wrap_functional(F.interpolate, size=12, scale_factor=None, mode='linear', align_corners=True), - cpp_options_args='''F::InterpolateFuncOptions() - .size(std::vector({12})) - .scale_factor(c10::nullopt) - .mode(torch::kLinear) - .align_corners(true)''', - input_size=(1, 2, 4), - fullname='interpolate_linear_1d_align_corners', - pickle=False, - ), - dict( - constructor=wrap_functional(F.interpolate, size=None, scale_factor=4., mode='linear', align_corners=True), - cpp_options_args='''F::InterpolateFuncOptions() - .size(c10::nullopt) - .scale_factor(std::vector({4.})) - .mode(torch::kLinear) - .align_corners(true)''', - input_size=(1, 2, 4), - fullname='interpolate_linear_scale_1d_align_corners', - pickle=False, - ), - dict( - constructor=wrap_functional(F.interpolate, 
size=2, scale_factor=None, mode='nearest'), - cpp_options_args='''F::InterpolateFuncOptions() - .size(std::vector({2, 2})) - .scale_factor(c10::nullopt) - .mode(torch::kNearest)''', - input_size=(1, 128, 1, 1), - fullname='interpolate_nearest_2d_launch_configs', - pickle=False, - ), - dict( - constructor=wrap_functional(F.interpolate, size=12, scale_factor=None, mode='nearest'), - cpp_options_args='''F::InterpolateFuncOptions() - .size(std::vector({12, 12})) - .scale_factor(c10::nullopt) - .mode(torch::kNearest)''', - input_size=(1, 2, 4, 4), - fullname='interpolate_nearest_2d', - pickle=False, - ), - dict( - constructor=wrap_functional(F.interpolate, size=(12, 16), scale_factor=None, mode='nearest'), - cpp_options_args='''F::InterpolateFuncOptions() - .size(std::vector({12, 16})) - .scale_factor(c10::nullopt) - .mode(torch::kNearest)''', - input_size=(1, 2, 3, 4), - fullname='interpolate_nearest_tuple_2d', - pickle=False, - ), - dict( - constructor=wrap_functional(F.interpolate, size=None, scale_factor=4., mode='nearest'), - cpp_options_args='''F::InterpolateFuncOptions() - .size(c10::nullopt) - .scale_factor(std::vector({4., 4.})) - .mode(torch::kNearest)''', - input_size=(1, 2, 4, 4), - fullname='interpolate_nearest_scale_2d', - pickle=False, - ), - dict( - constructor=wrap_functional(F.interpolate, size=12, scale_factor=None, mode='nearest'), - cpp_options_args='''F::InterpolateFuncOptions() - .size(std::vector({12, 12})) - .scale_factor(c10::nullopt) - .mode(torch::kNearest)''', - input_size=(0, 2, 4, 4), - fullname='interpolate_nearest_2d_zero_dim', - pickle=False, - ), - dict( - constructor=wrap_functional(F.interpolate, size=12, scale_factor=None, mode='bilinear', align_corners=False), - cpp_options_args='''F::InterpolateFuncOptions() - .size(std::vector({12, 12})) - .scale_factor(c10::nullopt) - .mode(torch::kBilinear) - .align_corners(false)''', - input_size=(1, 2, 4, 4), - fullname='interpolate_bilinear_2d', - pickle=False, - ), - dict( - constructor=wrap_functional(F.interpolate, size=12, scale_factor=None, mode='bilinear', align_corners=False), - cpp_options_args='''F::InterpolateFuncOptions() - .size(std::vector({12, 12})) - .scale_factor(c10::nullopt) - .mode(torch::kBilinear) - .align_corners(false)''', - input_size=(0, 2, 4, 4), - fullname='interpolate_bilinear_2d_zero_dim', - pickle=False, - ), - dict( - constructor=wrap_functional(F.interpolate, size=(4, 6), scale_factor=None, - mode='bilinear', align_corners=False), - cpp_options_args='''F::InterpolateFuncOptions() - .size(std::vector({4, 6})) - .scale_factor(c10::nullopt) - .mode(torch::kBilinear) - .align_corners(false)''', - input_size=(1, 2, 2, 3), - fullname='interpolate_bilinear_tuple_2d', - pickle=False, - ), - dict( - constructor=wrap_functional(F.interpolate, size=None, scale_factor=4., - mode='bilinear', align_corners=False), - cpp_options_args='''F::InterpolateFuncOptions() - .size(c10::nullopt) - .scale_factor(std::vector({4., 4.})) - .mode(torch::kBilinear) - .align_corners(false)''', - input_size=(1, 2, 4, 4), - fullname='interpolate_bilinear_scale_2d', - pickle=False, - ), - dict( - constructor=wrap_functional(F.interpolate, size=None, scale_factor=(2., 2.), - mode='bilinear', align_corners=False), - cpp_options_args='''F::InterpolateFuncOptions() - .size(c10::nullopt) - .scale_factor(std::vector({2., 2.})) - .mode(torch::kBilinear) - .align_corners(false)''', - input_size=(1, 2, 4, 4), - fullname='interpolate_bilinear_scale_tuple_shared_2d', - pickle=False, - ), - dict( - 
constructor=wrap_functional(F.interpolate, size=None, scale_factor=(2., 1.), - mode='bilinear', align_corners=False), - cpp_options_args='''F::InterpolateFuncOptions() - .size(c10::nullopt) - .scale_factor(std::vector({2., 1.})) - .mode(torch::kBilinear) - .align_corners(false)''', - input_size=(1, 2, 4, 4), - fullname='interpolate_bilinear_scale_tuple_skewed_2d', - pickle=False, - ), - dict( - constructor=wrap_functional(F.interpolate, size=(4, 6), scale_factor=None, mode='bilinear', align_corners=True), - cpp_options_args='''F::InterpolateFuncOptions() - .size(std::vector({4, 6})) - .scale_factor(c10::nullopt) - .mode(torch::kBilinear) - .align_corners(true)''', - input_size=(1, 2, 4, 4), - fullname='interpolate_bilinear_tuple_2d_align_corners', - pickle=False, - ), - dict( - constructor=wrap_functional(F.interpolate, size=None, scale_factor=(2., 1.), - mode='bilinear', align_corners=True), - cpp_options_args='''F::InterpolateFuncOptions() - .size(c10::nullopt) - .scale_factor(std::vector({2., 1.})) - .mode(torch::kBilinear) - .align_corners(true)''', - input_size=(1, 2, 4, 4), - fullname='interpolate_bilinear_scale_tuple_skewed_2d_align_corners', - pickle=False, - ), - dict( - constructor=wrap_functional(F.interpolate, size=12, scale_factor=None, mode='bicubic', align_corners=False), - cpp_options_args='''F::InterpolateFuncOptions() - .size(std::vector({12, 12})) - .scale_factor(c10::nullopt) - .mode(torch::kBicubic) - .align_corners(false)''', - input_size=(1, 2, 4, 4), - fullname='interpolate_bicubic_2d', - pickle=False, - ), - dict( - constructor=wrap_functional(F.interpolate, size=12, scale_factor=None, mode='bicubic', align_corners=False), - cpp_options_args='''F::InterpolateFuncOptions() - .size(std::vector({12, 12})) - .scale_factor(c10::nullopt) - .mode(torch::kBicubic) - .align_corners(false)''', - input_size=(0, 2, 4, 4), - fullname='interpolate_bicubic_2d_zero_dim', - pickle=False, - ), - dict( - constructor=wrap_functional(F.interpolate, size=(4, 6), scale_factor=None, - mode='bicubic', align_corners=False), - cpp_options_args='''F::InterpolateFuncOptions() - .size(std::vector({4, 6})) - .scale_factor(c10::nullopt) - .mode(torch::kBicubic) - .align_corners(false)''', - input_size=(1, 2, 2, 3), - fullname='interpolate_bicubic_tuple_2d', - pickle=False, - ), - dict( - constructor=wrap_functional(F.interpolate, size=None, scale_factor=4., mode='bicubic', align_corners=False), - cpp_options_args='''F::InterpolateFuncOptions() - .size(c10::nullopt) - .scale_factor(std::vector({4., 4.})) - .mode(torch::kBicubic) - .align_corners(false)''', - input_size=(1, 2, 4, 4), - fullname='interpolate_bicubic_scale_2d', - pickle=False, - ), - dict( - constructor=wrap_functional(F.interpolate, size=None, scale_factor=(2., 2.), - mode='bicubic', align_corners=False), - cpp_options_args='''F::InterpolateFuncOptions() - .size(c10::nullopt) - .scale_factor(std::vector({2., 2.})) - .mode(torch::kBicubic) - .align_corners(false)''', - input_size=(1, 2, 4, 4), - fullname='interpolate_bicubic_scale_tuple_shared_2d', - pickle=False, - ), - dict( - constructor=wrap_functional(F.interpolate, size=None, scale_factor=(2., 1.), - mode='bicubic', align_corners=False), - cpp_options_args='''F::InterpolateFuncOptions() - .size(c10::nullopt) - .scale_factor(std::vector({2., 1.})) - .mode(torch::kBicubic) - .align_corners(false)''', - input_size=(1, 2, 4, 4), - fullname='interpolate_bicubic_scale_tuple_skewed_2d', - pickle=False, - ), - dict( - constructor=wrap_functional(F.interpolate, size=(4, 6), 
scale_factor=None, mode='bicubic', align_corners=True), - cpp_options_args='''F::InterpolateFuncOptions() - .size(std::vector({4, 6})) - .scale_factor(c10::nullopt) - .mode(torch::kBicubic) - .align_corners(true)''', - input_size=(1, 2, 4, 4), - fullname='interpolate_bicubic_tuple_2d_align_corners', - pickle=False, - ), - dict( - constructor=wrap_functional(F.interpolate, size=None, scale_factor=(2., 1.), - mode='bicubic', align_corners=True), - cpp_options_args='''F::InterpolateFuncOptions() - .size(c10::nullopt) - .scale_factor(std::vector({2., 1.})) - .mode(torch::kBicubic) - .align_corners(true)''', - input_size=(1, 2, 4, 4), - fullname='interpolate_bicubic_scale_tuple_skewed_2d_align_corners', - pickle=False, - ), - dict( - constructor=wrap_functional(F.interpolate, size=12, scale_factor=None, mode='nearest'), - cpp_options_args='''F::InterpolateFuncOptions() - .size(std::vector({12, 12, 12})) - .scale_factor(c10::nullopt) - .mode(torch::kNearest)''', - input_size=(1, 2, 4, 4, 4), - fullname='interpolate_nearest_3d', - pickle=False, - ), - dict( - constructor=wrap_functional(F.interpolate, size=12, scale_factor=None, mode='nearest'), - cpp_options_args='''F::InterpolateFuncOptions() - .size(std::vector({12, 12, 12})) - .scale_factor(c10::nullopt) - .mode(torch::kNearest)''', - input_size=(0, 2, 4, 4, 4), - fullname='interpolate_nearest_3d_zero_dim', - pickle=False, - ), - dict( - constructor=wrap_functional(F.interpolate, size=(12, 16, 16), scale_factor=None, mode='nearest'), - cpp_options_args='''F::InterpolateFuncOptions() - .size(std::vector({12, 16, 16})) - .scale_factor(c10::nullopt) - .mode(torch::kNearest)''', - input_size=(1, 2, 3, 4, 4), - fullname='interpolate_nearest_tuple_3d', - pickle=False, - ), - dict( - constructor=wrap_functional(F.interpolate, size=None, scale_factor=4., mode='nearest'), - cpp_options_args='''F::InterpolateFuncOptions() - .size(c10::nullopt) - .scale_factor(std::vector({4., 4., 4.})) - .mode(torch::kNearest)''', - input_size=(1, 2, 4, 4, 4), - fullname='interpolate_nearest_scale_3d', - pickle=False, - ), - dict( - constructor=wrap_functional(F.interpolate, size=12, scale_factor=None, mode='trilinear', align_corners=False), - cpp_options_args='''F::InterpolateFuncOptions() - .size(std::vector({12, 12, 12})) - .scale_factor(c10::nullopt) - .mode(torch::kTrilinear) - .align_corners(false)''', - input_size=(1, 2, 4, 4, 4), - fullname='interpolate_trilinear_3d', - pickle=False, - ), - dict( - constructor=wrap_functional(F.interpolate, size=12, scale_factor=None, mode='trilinear', align_corners=False), - cpp_options_args='''F::InterpolateFuncOptions() - .size(std::vector({12, 12, 12})) - .scale_factor(c10::nullopt) - .mode(torch::kTrilinear) - .align_corners(false)''', - input_size=(0, 2, 4, 4, 4), - fullname='interpolate_trilinear_3d_zero_dim', - pickle=False, - ), - dict( - constructor=wrap_functional(F.interpolate, size=(4, 6, 6), - scale_factor=None, mode='trilinear', align_corners=False), - cpp_options_args='''F::InterpolateFuncOptions() - .size(std::vector({4, 6, 6})) - .scale_factor(c10::nullopt) - .mode(torch::kTrilinear) - .align_corners(false)''', - input_size=(1, 2, 2, 3, 3), - fullname='interpolate_trilinear_tuple_3d', - pickle=False, - ), - dict( - constructor=wrap_functional(F.interpolate, size=None, scale_factor=3., mode='trilinear', align_corners=False), - cpp_options_args='''F::InterpolateFuncOptions() - .size(c10::nullopt) - .scale_factor(std::vector({3., 3., 3.})) - .mode(torch::kTrilinear) - .align_corners(false)''', - input_size=(1, 2, 
3, 4, 4), - fullname='interpolate_trilinear_scale_3d', - # See https://github.com/pytorch/pytorch/issues/5006 - precision=3e-4, - pickle=False, - ), - dict( - constructor=wrap_functional(F.interpolate, size=(4, 6, 6), scale_factor=None, - mode='trilinear', align_corners=True), - cpp_options_args='''F::InterpolateFuncOptions() - .size(std::vector({4, 6, 6})) - .scale_factor(c10::nullopt) - .mode(torch::kTrilinear) - .align_corners(true)''', - input_size=(1, 2, 2, 3, 3), - fullname='interpolate_trilinear_tuple_3d_align_corners', - pickle=False, - ), - dict( - constructor=wrap_functional(F.interpolate, size=None, scale_factor=3., mode='trilinear', align_corners=True), - cpp_options_args='''F::InterpolateFuncOptions() - .size(c10::nullopt) - .scale_factor(std::vector({3., 3., 3.})) - .mode(torch::kTrilinear) - .align_corners(true)''', - input_size=(1, 2, 3, 4, 4), - fullname='interpolate_trilinear_scale_3d_align_corners', - # See https://github.com/pytorch/pytorch/issues/5006 - precision=3e-4, - pickle=False, - ), - - - dict( - module_name='AdaptiveMaxPool1d', - constructor_args=(3,), - cpp_constructor_args='torch::nn::AdaptiveMaxPool1dOptions(3)', - input_fn=lambda: _rand_tensor_non_equal(1, 3, 5), - ), - dict( - module_name='AdaptiveMaxPool2d', - constructor_args=(3,), - cpp_constructor_args='torch::nn::AdaptiveMaxPool2dOptions(3)', - input_fn=lambda: _rand_tensor_non_equal(1, 3, 5, 6), - desc='single', - ), - dict( - module_name='AdaptiveMaxPool2d', - constructor_args=((3, 4),), - cpp_constructor_args='torch::nn::AdaptiveMaxPool2dOptions({3, 4})', - input_fn=lambda: _rand_tensor_non_equal(1, 3, 5, 6), - desc='tuple', - ), - dict( - module_name='AdaptiveMaxPool2d', - constructor_args=((3, None),), - cpp_constructor_args='torch::nn::AdaptiveMaxPool2dOptions({3, c10::nullopt})', - input_fn=lambda: _rand_tensor_non_equal(1, 3, 5, 6), - desc='tuple_none', - ), - dict( - module_name='AdaptiveMaxPool3d', - constructor_args=(3,), - cpp_constructor_args='torch::nn::AdaptiveMaxPool3dOptions(3)', - input_fn=lambda: _rand_tensor_non_equal(2, 3, 5, 6, 7), - desc='single', - ), - dict( - module_name='AdaptiveMaxPool3d', - constructor_args=((3, 4, 5),), - cpp_constructor_args='torch::nn::AdaptiveMaxPool3dOptions({3, 4, 5})', - input_fn=lambda: _rand_tensor_non_equal(2, 3, 5, 6, 7), - desc='tuple', - ), - dict( - module_name='AdaptiveMaxPool3d', - constructor_args=((3, None, 5),), - cpp_constructor_args='torch::nn::AdaptiveMaxPool3dOptions({3, c10::nullopt, 5})', - input_fn=lambda: _rand_tensor_non_equal(2, 3, 5, 6, 7), - desc='tuple_none', - ), - dict( - module_name='AdaptiveMaxPool3d', - constructor_args=(3,), - cpp_constructor_args='torch::nn::AdaptiveMaxPool3dOptions(3)', - input_fn=lambda: _rand_tensor_non_equal(2, 3, 12, 9, 3), - desc='single_nonatomic', - ), - dict( - module_name='AdaptiveMaxPool3d', - constructor_args=((3, 4, 5),), - cpp_constructor_args='torch::nn::AdaptiveMaxPool3dOptions({3, 4, 5})', - input_fn=lambda: _rand_tensor_non_equal(2, 3, 6, 4, 10), - desc='tuple_nonatomic', - ), - dict( - module_name='AdaptiveAvgPool1d', - constructor_args=(3,), - cpp_constructor_args='torch::nn::AdaptiveAvgPool1dOptions(3)', - input_fn=lambda: torch.rand(1, 3, 5), - ), - dict( - module_name='AdaptiveAvgPool1d', - constructor_args=(1,), - cpp_constructor_args='torch::nn::AdaptiveAvgPool1dOptions(1)', - input_fn=lambda: torch.rand(1, 3, 5), - desc='one_output', - ), - dict( - module_name='AdaptiveAvgPool2d', - constructor_args=(3,), - cpp_constructor_args='torch::nn::AdaptiveAvgPool2dOptions(3)', - 
input_fn=lambda: torch.rand(1, 3, 5, 6), - desc='single', - ), - dict( - module_name='AdaptiveAvgPool2d', - constructor_args=(1,), - cpp_constructor_args='torch::nn::AdaptiveAvgPool2dOptions(1)', - input_fn=lambda: torch.rand(1, 3, 5, 6), - desc='single_1x1output', - ), - dict( - module_name='AdaptiveAvgPool2d', - constructor_args=((3, 4),), - cpp_constructor_args='torch::nn::AdaptiveAvgPool2dOptions({3, 4})', - input_fn=lambda: torch.rand(1, 3, 5, 6), - desc='tuple', - ), - dict( - module_name='AdaptiveAvgPool2d', - constructor_args=((3, None),), - cpp_constructor_args='torch::nn::AdaptiveAvgPool2dOptions({3, c10::nullopt})', - input_fn=lambda: torch.rand(1, 3, 5, 6), - desc='tuple_none', - ), - dict( - module_name='AdaptiveAvgPool3d', - constructor_args=(3,), - cpp_constructor_args='torch::nn::AdaptiveAvgPool3dOptions(3)', - input_fn=lambda: torch.rand(2, 3, 5, 2, 7), - desc='single', - ), - dict( - module_name='AdaptiveAvgPool3d', - constructor_args=((3, 4, 5),), - cpp_constructor_args='torch::nn::AdaptiveAvgPool3dOptions({3, 4, 5})', - input_fn=lambda: torch.rand(2, 3, 5, 3, 7), - desc='tuple', - ), - dict( - module_name='AdaptiveAvgPool3d', - constructor_args=((None, 4, 5),), - cpp_constructor_args='torch::nn::AdaptiveAvgPool3dOptions({c10::nullopt, 4, 5})', - input_fn=lambda: torch.rand(2, 3, 5, 3, 7), - desc='tuple_none', - ), - dict( - module_name='SELU', - input_size=(3, 2, 5), - check_inplace=True - ), - dict( - module_name='SELU', - input_size=(), - check_inplace=True, - desc='scalar' - ), - dict( - module_name='CELU', - input_size=(3, 2, 5), - constructor_args=(2.,), - cpp_constructor_args='torch::nn::CELUOptions().alpha(2.)', - check_inplace=True, - reference_fn=lambda x, *_: torch.where(x >= 0, x, 2. * ((.5 * x).exp() - 1)), - ), - dict( - module_name='CELU', - input_size=(), - constructor_args=(2.,), - cpp_constructor_args='torch::nn::CELUOptions().alpha(2.)', - check_inplace=True, - reference_fn=lambda x, *_: torch.where(x >= 0, x, 2. 
* ((.5 * x).exp() - 1)), - desc='scalar' - ), - dict( - module_name='GLU', - input_size=(5, 6), - ), - dict( - module_name='GLU', - constructor_args=(1,), - cpp_constructor_args='torch::nn::GLUOptions(1)', - input_size=(5, 6, 7), - desc='dim', - ), - dict( - module_name='GELU', - input_size=(), - desc='scalar', - reference_fn=lambda x, *_: x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))), - ), - dict( - module_name='GELU', - input_size=(3, 2, 5), - reference_fn=lambda x, *_: x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))), - ), - dict( - constructor=wrap_functional(F.softmax, dim=-1), - cpp_options_args='F::SoftmaxFuncOptions(-1)', - input_size=(2, 128), # trigger the last-dim algo in NPU - fullname='softmax_lastdim', - pickle=False, - ), - dict( - constructor=wrap_functional(F.softmax, dim=1, dtype=torch.float64), - cpp_options_args='F::SoftmaxFuncOptions(1).dtype(torch::kFloat64)', - input_size=(2, 128), - fullname='softmax_lastdim_dtype', - pickle=False, - ), - dict( - constructor=wrap_functional(F.softmax, dim=1), - cpp_options_args='F::SoftmaxFuncOptions(1)', - input_size=(2, 128, 2, 2), # trigger special case of spatial NPU algo - fullname='softmax_spatial_special', - pickle=False, - ), - dict( - constructor=wrap_functional(F.softmax, dim=1), - cpp_options_args='F::SoftmaxFuncOptions(1)', - input_size=(2, 2, 4, 4), # regular spatial algorithm - fullname='softmax_spatial', - pickle=False, - ), - dict( - constructor=wrap_functional(F.softmax, dim=1, dtype=torch.float64), - cpp_options_args='F::SoftmaxFuncOptions(1).dtype(torch::kFloat64)', - input_size=(2, 2, 4, 4), # regular spatial algorithm - fullname='softmax_spatial_dtype', - pickle=False, - ), - dict( - constructor=wrap_functional(F.softmax, dim=0), - cpp_options_args='F::SoftmaxFuncOptions(0)', - input_size=(2, 3, 4, 5), - fullname='softmax_functional_dim0', - pickle=False, - ), - dict( - constructor=wrap_functional(F.softmax, dim=3), - cpp_options_args='F::SoftmaxFuncOptions(3)', - input_size=(2, 3, 4, 5), - fullname='softmax_functional_dim3', - pickle=False, - ), - dict( - constructor=wrap_functional(F.softmax, dim=-1), - cpp_options_args='F::SoftmaxFuncOptions(-1)', - input_size=(), - fullname='softmax_functional_scalar', - pickle=False, - ), - dict( - constructor=wrap_functional(F.log_softmax, dim=-1), - cpp_options_args='F::LogSoftmaxFuncOptions(-1)', - input_size=(2, 128), # trigger the last-dim algo in NPU - fullname='log_softmax_lastdim', - pickle=False, - ), - dict( - constructor=wrap_functional(F.log_softmax, dim=1), - cpp_options_args='F::LogSoftmaxFuncOptions(1)', - input_size=(2, 128, 2, 2), # trigger special case of spatial NPU algo - fullname='log_softmax_spatial_special', - pickle=False, - ), - dict( - constructor=wrap_functional(F.log_softmax, dim=1), - cpp_options_args='F::LogSoftmaxFuncOptions(1)', - input_size=(2, 2, 4, 4), # regular spatial algorithm - fullname='log_softmax_spatial', - pickle=False, - ), - dict( - constructor=wrap_functional(F.log_softmax, dim=0), - cpp_options_args='F::LogSoftmaxFuncOptions(0)', - input_size=(2, 3, 4, 5), - fullname='log_softmax_dim0', - pickle=False, - ), - dict( - constructor=wrap_functional(F.log_softmax, dim=3), - cpp_options_args='F::LogSoftmaxFuncOptions(3)', - input_size=(2, 3, 4, 5), - fullname='log_softmax_dim3', - pickle=False, - ), - dict( - constructor=wrap_functional(F.log_softmax, dim=0), - cpp_options_args='F::LogSoftmaxFuncOptions(0)', - input_size=(), - fullname='log_softmax_scalar', - pickle=False, - ), - - - dict( - fullname='Unfold', - constructor=lambda: 
nn.Unfold((2, 2), (1, 1), (0, 0), (1, 1)), - cpp_constructor_args='torch::nn::UnfoldOptions({2, 2}).dilation({1, 1}).padding({0, 0}).stride({1, 1})', - input_size=(2, 4, 3, 3), - check_gradgrad=False, - ), - dict( - fullname='Fold', - constructor=lambda: nn.Fold((3, 3), (2, 2), (1, 1), (0, 0), (1, 1)), - cpp_constructor_args='torch::nn::FoldOptions({3, 3}, {2, 2}).dilation({1, 1}).padding({0, 0}).stride({1, 1})', - input_size=(2, 16, 4), - check_gradgrad=False, - ), - dict( - fullname='Unfold_int_input', - constructor=lambda: nn.Unfold(2, 1, 0, 1), - cpp_constructor_args='torch::nn::UnfoldOptions(2).dilation(1).padding(0).stride(1)', - input_size=(2, 4, 3, 3), - check_gradgrad=False, - ), - dict( - fullname='Fold_int_input', - constructor=lambda: nn.Fold(3, 2, 1, 0, 1), - cpp_constructor_args='torch::nn::FoldOptions(3, 2).dilation(1).padding(0).stride(1)', - input_size=(2, 16, 4), - check_gradgrad=False, - ), - dict( - module_name='Threshold', - constructor_args=(2., 1.), - cpp_constructor_args='torch::nn::ThresholdOptions(2., 1.)', - input_size=(), - check_inplace=True, - desc='threshold_value_scalar' - ), - - dict( - module_name='ReLU', - input_size=(), - check_inplace=True, - desc='scalar' - ), - dict( - module_name='ReLU6', - input_size=(), - check_inplace=True, - desc='scalar' - ), - dict( - module_name='RReLU', - constructor_args=(0.1, 0.9), - cpp_constructor_args='torch::nn::RReLUOptions().lower(0.1).upper(0.9)', - input_size=(), - desc='with_up_down_scalar', - ), - dict( - module_name='Hardtanh', - input_size=(), - reference_fn=lambda i, *_: i.clamp(-1, 1), - desc='scalar' - ), - dict( - module_name='Sigmoid', - input_size=(), - desc='scalar', - ), - dict( - module_name='Tanh', - input_size=(), - desc='scalar', - ), - dict( - module_name='Softmax', - constructor_args=(0,), - cpp_constructor_args='torch::nn::SoftmaxOptions(0)', - input_size=(), - reference_fn=lambda i, *_: torch.exp(i).div(torch.exp(i).sum(0, True)), - desc='scalar', - ), - dict( - module_name='LogSoftmax', - constructor_args=(0,), - cpp_constructor_args='torch::nn::LogSoftmaxOptions(0)', - input_size=(), - reference_fn=lambda i, *_: torch.exp(i).div_(torch.exp(i).sum(0, False)).log_(), - desc='multiparam_scalar', - ), - dict( - module_name='ELU', - constructor_args=(2.,), - cpp_constructor_args='torch::nn::ELUOptions().alpha(2.)', - input_size=(), - desc='scalar', - ), - dict( - module_name='Hardshrink', - constructor_args=(2.,), - cpp_constructor_args='torch::nn::HardshrinkOptions(2.)', - input_size=(), - desc='scalar', - ), - dict( - module_name='LeakyReLU', - constructor_args=(0.5,), - cpp_constructor_args='torch::nn::LeakyReLUOptions().negative_slope(0.5)', - input_size=(), - check_inplace=True, - desc='with_negval_scalar' - ), - dict( - module_name='LogSigmoid', - input_size=(), - reference_fn=lambda i, *_: i.sigmoid().log(), - desc='scalar' - ), - dict( - module_name='Softplus', - constructor_args=(2, -100), - cpp_constructor_args='torch::nn::SoftplusOptions().beta(2).threshold(-100)', - input_size=(), - reference_fn=( - lambda i, *_: ((i * 2) > -100).type_as(i) * i - + ((i * 2) <= -100).type_as(i) * 1.0 / 2.0 * torch.log(1 + torch.exp(2 * i)) - ), - desc='beta_threshold_scalar', - ), - dict( - module_name='Softshrink', - constructor_args=(1,), - cpp_constructor_args='torch::nn::SoftshrinkOptions(1)', - input_size=(), - desc='lambda_scalar', - ), - dict( - module_name='PReLU', - input_size=(), - reference_fn=lambda i, p, _: torch.clamp(i, min=0) + torch.clamp(i, max=0) * p[0][0], - desc='scalar', - ), - dict( 
- module_name='Softsign', - input_size=(), - reference_fn=lambda i, *_: i.div(1 + torch.abs(i)), - desc='scalar', - ), - dict( - module_name='Softmin', - constructor_args=(0,), - cpp_constructor_args='torch::nn::SoftminOptions(0)', - input_size=(), - desc='scalar', - ), - dict( - module_name='Tanhshrink', - input_size=(), - desc='scalar', - ), - dict( - fullname='Padding12_1dcircular', - constructor=wrap_functional(F.pad, pad=(1, 2), mode='circular'), - cpp_options_args='F::PadFuncOptions({1, 2}).mode(torch::kCircular)', - input_fn=lambda: torch.arange(6, out=torch.DoubleTensor()).reshape([1, 2, 3]), - reference_fn=lambda i, *_: padding1d_circular(i, (1, 2)), - skip_double=TEST_WITH_ROCM, - pickle=False, - ), - dict( - fullname='Padding31_1dcircular', - constructor=wrap_functional(F.pad, pad=(3, 1), mode='circular'), - cpp_options_args='F::PadFuncOptions({3, 1}).mode(torch::kCircular)', - input_fn=lambda: torch.arange(6, out=torch.DoubleTensor()).reshape([1, 2, 3]), - reference_fn=lambda i, *_: padding1d_circular(i, (3, 1)), - skip_double=TEST_WITH_ROCM, - pickle=False, - ), - dict( - fullname='Padding33_1dcircular', - constructor=wrap_functional(F.pad, pad=(3, 3), mode='circular'), - cpp_options_args='F::PadFuncOptions({3, 3}).mode(torch::kCircular)', - input_fn=lambda: torch.arange(6, out=torch.DoubleTensor()).reshape([1, 2, 3]), - reference_fn=lambda i, *_: padding1d_circular(i, (3, 3)), - skip_double=TEST_WITH_ROCM, - pickle=False, - ), - dict( - fullname='Padding1221_2dcircular', - constructor=wrap_functional(F.pad, pad=(1, 2, 2, 1), mode='circular'), - cpp_options_args='F::PadFuncOptions({1, 2, 2, 1}).mode(torch::kCircular)', - input_fn=lambda: torch.arange(6, out=torch.DoubleTensor()).reshape([1, 1, 2, 3]), - reference_fn=lambda i, *_: padding2d_circular(i, (1, 2, 2, 1)), - skip_double=TEST_WITH_ROCM, - pickle=False, - ), - dict( - fullname='Padding2322_2dcircular', - constructor=wrap_functional(F.pad, pad=(2, 3, 2, 2), mode='circular'), - cpp_options_args='F::PadFuncOptions({2, 3, 2, 2}).mode(torch::kCircular)', - input_fn=lambda: torch.arange(6, out=torch.DoubleTensor()).reshape([1, 1, 2, 3]), - reference_fn=lambda i, *_: padding2d_circular(i, (2, 3, 2, 2)), - skip_double=TEST_WITH_ROCM, - pickle=False, - ), - dict( - fullname='Padding3331_2dcircular', - constructor=wrap_functional(F.pad, pad=(3, 3, 3, 1), mode='circular'), - cpp_options_args='F::PadFuncOptions({3, 3, 3, 1}).mode(torch::kCircular)', - input_fn=lambda: torch.arange(9, out=torch.DoubleTensor()).reshape([1, 1, 3, 3]), - reference_fn=lambda i, *_: padding2d_circular(i, (3, 3, 3, 1)), - skip_double=TEST_WITH_ROCM, - pickle=False, - ), - dict( - fullname='Padding122112_3dcircular', - constructor=wrap_functional(F.pad, pad=(1, 2, 2, 1, 1, 2), mode='circular'), - cpp_options_args='F::PadFuncOptions({1, 2, 2, 1, 1, 2}).mode(torch::kCircular)', - input_fn=lambda: torch.arange(12, out=torch.DoubleTensor()).reshape([1, 1, 2, 2, 3]), - reference_fn=lambda i, *_: padding3d_circular(i, (1, 2, 2, 1, 1, 2)), - skip_double=TEST_WITH_ROCM, - pickle=False, - ), - dict( - fullname='Padding322112_3dcircular', - constructor=wrap_functional(F.pad, pad=(3, 2, 2, 1, 1, 2), mode='circular'), - cpp_options_args='F::PadFuncOptions({3, 2, 2, 1, 1, 2}).mode(torch::kCircular)', - input_fn=lambda: torch.arange(12, out=torch.DoubleTensor()).reshape([1, 1, 2, 2, 3]), - reference_fn=lambda i, *_: padding3d_circular(i, (3, 2, 2, 1, 1, 2)), - skip_double=TEST_WITH_ROCM, - pickle=False, - ), - dict( - fullname='Padding332122_3dcircular', - 
constructor=wrap_functional(F.pad, pad=(3, 3, 2, 1, 2, 2), mode='circular'), - cpp_options_args='F::PadFuncOptions({3, 3, 2, 1, 2, 2}).mode(torch::kCircular)', - input_fn=lambda: torch.arange(12, out=torch.DoubleTensor()).reshape([1, 1, 2, 2, 3]), - reference_fn=lambda i, *_: padding3d_circular(i, (3, 3, 2, 1, 2, 2)), - skip_double=TEST_WITH_ROCM, - pickle=False, - ), -] - -# add conv padding mode tests: -for padding_mode, cpp_padding_mode in zip( - ['reflect', 'circular', 'replicate', 'zeros'], - ['torch::kReflect', 'torch::kCircular', 'torch::kReplicate', 'torch::kZeros']): - # conv signature: - # in_channels, out_channels, kernel_size, stride=1, - # padding=0, dilation=1, groups=1, - # bias=True, padding_mode='zeros' - for d in (1, 2, 3): - if d == 3 and padding_mode == 'reflect': - # FIXME: remove after implementing reflection pad 3d - # https://github.com/pytorch/pytorch/issues/27655 - continue - new_module_tests.append( - dict( - module_name='Conv{}d'.format(d), - constructor_args=(3, 4, 3, 2, 2, 1, 1, True, padding_mode), - cpp_constructor_args='''torch::nn::Conv{}dOptions(3, 4, 3) - .stride(2) - .padding(2) - .dilation(1) - .groups(1) - .bias(true) - .padding_mode({})'''.format(d, cpp_padding_mode), - input_size=(2, 3) + (3,) * d, - output_size=(2, 4) + (3,) * d, - cudnn=True, - desc='{}_stride2_pad2'.format(padding_mode), - ), - ) - - -def kldivloss_reference(x, target, reduction='mean'): - safe_target = target * (target > 0).type_as(target) - safe_target_log = (safe_target + (target <= 0).type_as(target)).log() - result = safe_target * (safe_target_log - x) - if reduction == 'mean': - return result.mean() - elif reduction == 'sum': - return result.sum() - elif reduction == 'batchmean' and result.dim() != 0: - return result.sum() / result.size(0) - return result - - -def nlllossNd_reference(x, target, weight=None, ignore_index=-100, - reduction='mean'): - assert x.dim() >= 3 - N = x.size(0) - C = x.size(1) - out_size = (N,) + x.size()[2:] - output = torch.zeros(out_size).type_as(x) - - if weight is None: - weight = torch.ones(C).type_as(x) - total_weight = 0 - for tup in product(*[range(size) for size in out_size]): - t_nx = target[tup] - norm = 0. 
if ignore_index == t_nx else weight[t_nx].item() - input_index = list(tup) - input_index.insert(1, t_nx) - output[tup] = -x[tuple(input_index)] * norm - total_weight += norm - - if reduction == 'mean': - return output.sum() / total_weight - elif reduction == 'sum': - return output.sum() - return output - - -def nllloss_reference(x, target, weight=None, ignore_index=-100, - reduction='mean'): - - def nll_loss_helper(x, target, weight, ignore_index): - if target == ignore_index: - return (0, 0) - norm = 1 if weight is None else weight[target] - result = -x[target] * norm - return (result, norm) - - losses_and_weights = [nll_loss_helper(i, t, weight, ignore_index) - for i, t in zip(x, target)] - losses, weights = zip(*losses_and_weights) - losses_tensor = x.new_tensor(losses) - if reduction == 'mean': - return sum(losses_tensor) / sum(weights) - elif reduction == 'sum': - return sum(losses_tensor) - else: - return losses_tensor - - -def smoothl1loss_reference(x, target, reduction='mean'): - abs_diff = (x - target).abs() - ge_one_mask = (abs_diff >= 1).type_as(abs_diff) - lt_one_mask = (abs_diff < 1).type_as(abs_diff) - output = ge_one_mask * (abs_diff - 0.5) + lt_one_mask * 0.5 * (abs_diff ** 2) - if reduction == 'mean': - return output.mean() - elif reduction == 'sum': - return output.sum() - return output - - -def _multilabelmarginloss_reference(x, target): - targets = [] - for target_index in target: - if target_index < 0: - break - targets.append(target_index) - - t_sum = 0 - for target_index in targets: - for i, item in enumerate(x): - if i not in targets: - t_sum += max(0, 1 - x[target_index] + item) - - return t_sum - - -def multilabelmarginloss_reference(x, target, reduction='mean'): - # make everything 2-dimensional - input_dim = x.dim() - if x.dim() < 2: - assert target.dim() < 2 - x = x.unsqueeze(0) if x.dim() == 1 else x.unsqueeze(0).unsqueeze(0) - target = target.unsqueeze(0) if target.dim() == 1 else target.unsqueeze(0).unsqueeze(0) - - n = x.size(0) - dim = x.size(1) - output = x.new(n).zero_() - for i in range(0, n): - output[i] = _multilabelmarginloss_reference(x[i], target[i]) - - if reduction == 'mean': - return output.mean() / dim - elif reduction == 'sum': - return output.sum() / dim - elif input_dim < 2: - # we know we have (1, C) X (1, C) -> (1,), so squeeze will get us - # back to correct dimensionality - return output.squeeze() / dim - else: - return output / dim - - -def hingeembeddingloss_reference(x, target, margin=1.0, reduction='mean'): - margin_clamp = (margin - x).clamp(min=0).type_as(x) - output = torch.where(target == 1, x, margin_clamp) - - if reduction == 'mean': - return output.mean() - elif reduction == 'sum': - return output.sum() - return output - - -def softmarginloss_reference(x, target, reduction='mean'): - output = (1 + (-x * target).exp()).log() - - if reduction == 'mean': - return output.mean() - elif reduction == 'sum': - return output.sum() - return output - - -def _multimarginloss_reference(x, target_idx, p, margin, weight): - if weight is None: - weight = x.new(len(x)).fill_(1) - - output = 0 - for i, item in enumerate(x): - if i != target_idx: - output += max(0, weight[target_idx] * (margin - x[target_idx] + item) ** p) - return output - - -def multimarginloss_reference(x, target, p=1, margin=1, weight=None, reduction='mean'): - if x.dim() < 2: - x = x.unsqueeze(0) if x.dim() == 1 else x.unsqueeze(0).unsqueeze(0) - - target_dim = target.dim() - if target.dim() == 0: - target = target.unsqueeze(0) - - n = x.size(0) - dim = x.size(1) - output = 
x.new(n) - for x in range(0, n): - output[x] = _multimarginloss_reference(x[x], target[x], p, margin, weight) - - if reduction == 'mean': - return output.mean() / dim - elif reduction == 'sum': - return output.sum() / dim - elif target_dim == 0: - return output.squeeze(0) / dim - return output / dim - - -def cosineembeddingloss_reference(input1, input2, target, margin=0, reduction='mean'): - def _cos(a, b): - cos = a.new(a.size(0)) - for i in range(0, a.size(0)): - cos[i] = (a[i] * b[i]).sum() / ((((a[i] * a[i]).sum() + 1e-12) * ((b[i] * b[i]).sum() + 1e-12)) ** 0.5) - return cos - - output = torch.where(target == 1, 1 - _cos(input1, input2), (_cos(input1, input2) - margin).clamp(min=0)) - - if reduction == 'mean': - return output.mean() - elif reduction == 'sum': - return output.sum() - return output - - -def tripletmarginloss_reference(anchor, positive, negative, margin=1.0, p=2, eps=1e-6, swap=False, - reduction='mean'): - d_p = torch.pairwise_distance(anchor, positive, p, eps) - d_n = torch.pairwise_distance(anchor, negative, p, eps) - if swap: - d_s = torch.pairwise_distance(positive, negative, p, eps) - d_n = torch.min(d_n, d_s) - - output = torch.clamp(margin + d_p - d_n, min=0.0) - if reduction == 'mean': - return output.mean() - elif reduction == 'sum': - return output.sum() - return output - - -def marginrankingloss_reference(input1, input2, target, margin=0, reduction='mean'): - output = (-target * (input1 - input2) + margin).clamp(min=0) - if reduction == 'mean': - return output.mean() - elif reduction == 'sum': - return output.sum() - return output - - -# this directly follows Graves et al's paper, in contrast to the production implementation, it does not use log-space -def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0, reduction='mean'): - input_lengths = torch.as_tensor(input_lengths, dtype=torch.long) - target_lengths = torch.as_tensor(target_lengths, dtype=torch.long) - dt = log_probs.dtype - log_probs = log_probs.double() # we need the accuracy as we are not in logspace - targets = targets.long() - cum_target_lengths = target_lengths.cumsum(0) - losses = [] - for i in range(log_probs.size(1)): - input_length = input_lengths[i].item() - target_length = target_lengths[i].item() - cum_target_length = cum_target_lengths[i].item() - targets_prime = targets.new_full((2 * target_length + 1,), blank) - if targets.dim() == 2: - targets_prime[1::2] = targets[i, :target_length] - else: - targets_prime[1::2] = targets[cum_target_length - target_length:cum_target_length] - probs = log_probs[:input_length, i].exp() - alpha = log_probs.new_zeros((target_length * 2 + 1,)) - alpha[0] = probs[0, blank] - alpha[1] = probs[0, targets_prime[1]] - mask_third = (targets_prime[:-2] != targets_prime[2:]) - for t in range(1, input_length): - alpha_next = alpha.clone() - alpha_next[1:] += alpha[:-1] - alpha_next[2:] += torch.where(mask_third, alpha[:-2], alpha.new_zeros(1)) - alpha = probs[t, targets_prime] * alpha_next - losses.append(-alpha[-2:].sum().log()[None]) - output = torch.cat(losses, 0) - if reduction == 'mean': - return (output / target_lengths.to(dtype=output.dtype, device=output.device)).mean() - elif reduction == 'sum': - return output.sum() - output = output.to(dt) - return output - - -def padding1d_circular(x, pad): - r""" x: - [[[0., 1., 2.], - [3., 4., 5.]]] - pad: (1, 2) - output: - [[[2., 0., 1., 2., 0., 1.], - [5., 3., 4., 5., 3., 4.]]] - """ - return torch.cat([x[:, :, -pad[0]:], x, - x[:, :, 0:pad[1]]], dim=2) - - -def padding2d_circular(x, 
pad): - r"""x: - [[[[0., 1., 2], - [3., 4., 5.]]]] - pad: (1, 2, 2, 1) - output: - [[[[2., 0., 1., 2., 0., 1.], - [5., 3., 4., 5., 3., 4.], - [2., 0., 1., 2., 0., 1.], - [5., 3., 4., 5., 3., 4.], - [2., 0., 1., 2., 0., 1.]]]] - """ - x = torch.cat([x[:, :, -pad[2]:], x, x[:, :, 0:pad[3]]], dim=2) - return torch.cat([x[:, :, :, -pad[0]:], x, x[:, :, :, 0:pad[1]]], dim=3) - - -def padding3d_circular(x, pad): - r"""x: - [[[[[ 0., 1., 2.], - [ 3., 4., 5.]], - [[ 6., 7., 8.], - [ 9., 10., 11.]]]]] - pad: (1, 2, 2, 1, 1, 2) - output: [[[[[ 8., 6., 7., 8., 6., 7.], - [11., 9., 10., 11., 9., 10.], - [ 8., 6., 7., 8., 6., 7.], - [11., 9., 10., 11., 9., 10.], - [ 8., 6., 7., 8., 6., 7.]], - - [[ 2., 0., 1., 2., 0., 1.], - [ 5., 3., 4., 5., 3., 4.], - [ 2., 0., 1., 2., 0., 1.], - [ 5., 3., 4., 5., 3., 4.], - [ 2., 0., 1., 2., 0., 1.]], - - [[ 8., 6., 7., 8., 6., 7.], - [11., 9., 10., 11., 9., 10.], - [ 8., 6., 7., 8., 6., 7.], - [11., 9., 10., 11., 9., 10.], - [ 8., 6., 7., 8., 6., 7.]], - - [[ 2., 0., 1., 2., 0., 1.], - [ 5., 3., 4., 5., 3., 4.], - [ 2., 0., 1., 2., 0., 1.], - [ 5., 3., 4., 5., 3., 4.], - [ 2., 0., 1., 2., 0., 1.]], - - [[ 8., 6., 7., 8., 6., 7.], - [11., 9., 10., 11., 9., 10.], - [ 8., 6., 7., 8., 6., 7.], - [11., 9., 10., 11., 9., 10.], - [ 8., 6., 7., 8., 6., 7.]]]]] - """ - x = torch.cat([x[:, :, -pad[4]:], x, x[:, :, 0:pad[5]]], dim=2) - x = torch.cat([x[:, :, :, -pad[2]:], x, x[:, :, :, 0:pad[3]]], dim=3) - return torch.cat([x[:, :, :, :, -pad[0]:], x, x[:, :, :, :, 0:pad[1]]], dim=4) - - -loss_reference_fns = { - 'KLDivLoss': kldivloss_reference, - 'NLLLoss': nllloss_reference, - 'NLLLossNd': nlllossNd_reference, - 'SmoothL1Loss': smoothl1loss_reference, - 'MultiLabelMarginLoss': multilabelmarginloss_reference, - 'HingeEmbeddingLoss': hingeembeddingloss_reference, - 'SoftMarginLoss': softmarginloss_reference, - 'MultiMarginLoss': multimarginloss_reference, - 'CosineEmbeddingLoss': cosineembeddingloss_reference, - 'TripletMarginLoss': tripletmarginloss_reference, - 'MarginRankingLoss': marginrankingloss_reference, - 'CTCLoss': ctcloss_reference, -} - - -criterion_tests = [ - dict( - module_name='L1Loss', - input_size=(2, 3, 4), - target_size=(2, 3, 4), - reference_fn=lambda i, t, _: 1. 
/ i.numel() * - sum((a - b).abs().sum() for a, b in zip(i, t)), - ), - dict( - module_name='NLLLoss', - input_fn=lambda: torch.rand(15, 10).log(), - target_fn=lambda: torch.Tensor(15).uniform_().mul(10).floor().long(), - reference_fn=lambda i, t, m: - nllloss_reference(i, t, reduction=get_reduction(m)), - check_sum_reduction=True, - check_bfloat16=TEST_WITH_ROCM, - ), - dict( - module_name='NLLLoss', - constructor_args=(None, None, 2), - cpp_constructor_args='torch::nn::NLLLossOptions().weight({}).ignore_index(2)', - input_fn=lambda: torch.rand(15, 10).log(), - target_fn=lambda: torch.Tensor(15).uniform_().mul(10).floor().long(), - reference_fn=lambda i, t, _: nllloss_reference(i, t, ignore_index=2), - desc='ignore_index', - check_bfloat16=TEST_WITH_ROCM, - ), - dict( - module_name='NLLLoss', - constructor_args_fn=lambda: (torch.rand(10),), - cpp_constructor_args='torch::nn::NLLLossOptions().weight(torch::rand(10))', - input_fn=lambda: torch.rand(15, 10).add(1e-2).log(), - target_fn=lambda: torch.Tensor(15).uniform_().mul(10).floor().long(), - reference_fn=lambda i, t, m: - nllloss_reference(i, t, weight=get_weight(m)), - desc='weights', - check_bfloat16=TEST_WITH_ROCM, - ), - dict( - module_name='NLLLoss', - constructor_args_fn=lambda: (torch.rand(10), None, 2), - cpp_constructor_args='torch::nn::NLLLossOptions().weight(torch::rand(10)).ignore_index(2)', - input_fn=lambda: torch.rand(15, 10).add(1e-2).log(), - target_fn=lambda: torch.Tensor(15).uniform_().mul(10).floor().long(), - reference_fn=lambda i, t, m: - nllloss_reference(i, t, weight=get_weight(m), ignore_index=2), - desc='weights_ignore_index', - check_bfloat16=TEST_WITH_ROCM, - ), - dict( - module_name='NLLLoss', - constructor_args_fn=lambda: (torch.rand(10), None, -1), - cpp_constructor_args='torch::nn::NLLLossOptions().weight(torch::rand(10)).ignore_index(-1)', - input_fn=lambda: torch.rand(15, 10).add(1e-2).log(), - target_fn=lambda: torch.Tensor(15).uniform_().mul(10 + 1).floor().long() - 1, - reference_fn=lambda i, t, m: - nllloss_reference(i, t, weight=get_weight(m), ignore_index=-1), - desc='weights_ignore_index_neg', - check_bfloat16=TEST_WITH_ROCM, - ), - dict( - module_name='KLDivLoss', - input_fn=lambda: torch.rand(10, 10).log(), - target_fn=lambda: torch.rand(10, 10), - reference_fn=lambda i, t, m: - kldivloss_reference(i, t, get_reduction(m)), - check_sum_reduction=True, - ), - dict( - module_name='MSELoss', - input_size=(2, 3, 4, 5), - target_size=(2, 3, 4, 5), - reference_fn=lambda i, t, m: ((i - t).abs().pow(2).sum() / (i.numel() - if get_reduction(m) == 'mean' else 1)), - check_sum_reduction=True, - ), - dict( - module_name='BCELoss', - input_fn=lambda: torch.rand(15, 10).clamp_(1e-2, 1 - 1e-2), - target_fn=lambda: torch.randn(15, 10).gt(0).double(), - reference_fn=lambda i, t, m: -(t * i.log() + (1 - t) * (1 - i).log()).sum() / - (i.numel() if get_reduction(m) else 1), - check_gradgrad=False, - check_bfloat16=TEST_WITH_ROCM, - ), - dict( - module_name='BCELoss', - constructor_args_fn=lambda: (torch.rand(10),), - cpp_constructor_args='torch::nn::BCELossOptions().weight(torch::rand(10))', - input_fn=lambda: torch.rand(15, 10).clamp_(1e-2, 1 - 1e-2), - target_fn=lambda: torch.randn(15, 10).gt(0).double(), - reference_fn=lambda i, t, m: -((t * i.log() + (1 - t) * (1 - i).log()) * get_weight(m)).sum() / - (i.numel() if get_reduction(m) else 1), - desc='weights', - check_gradgrad=False, - check_bfloat16=TEST_WITH_ROCM, - ), - dict( - module_name='CrossEntropyLoss', - input_size=(15, 10), - target_fn=lambda: 
torch.Tensor(15).uniform_().mul(10).floor().long(), - ), - dict( - module_name='CrossEntropyLoss', - constructor_args_fn=lambda: (torch.rand(10),), - cpp_constructor_args='torch::nn::CrossEntropyLossOptions().weight(torch::rand(10))', - input_size=(15, 10), - target_fn=lambda: torch.Tensor(15).uniform_().mul(10).floor().long(), - desc='weights', - ), - dict( - module_name='HingeEmbeddingLoss', - input_size=(10,), - target_fn=lambda: torch.randn(10).gt(0).double().mul_(2).sub(1), - reference_fn=lambda i, t, m: - hingeembeddingloss_reference(i, t, reduction=get_reduction(m)), - check_sum_reduction=True, - ), - dict( - module_name='HingeEmbeddingLoss', - constructor_args=(0.5,), - cpp_constructor_args='torch::nn::HingeEmbeddingLossOptions().margin(0.5)', - input_size=(10,), - target_fn=lambda: torch.randn(10).gt(0).double().mul_(2).sub(1), - reference_fn=lambda i, t, m: - hingeembeddingloss_reference(i, t, margin=0.5, reduction=get_reduction(m)), - desc='margin', - check_sum_reduction=True, - ), - dict( - module_name='MultiLabelMarginLoss', - input_size=(10,), - target_fn=lambda: torch.rand(10).mul(10).floor().long(), - reference_fn=lambda i, t, m: - multilabelmarginloss_reference(i, t, reduction=get_reduction(m)), - desc="1d", - check_sum_reduction=True, - check_gradgrad=False, - check_bfloat16=TEST_WITH_ROCM, - ), - dict( - module_name='MultiLabelMarginLoss', - input_size=(5, 10), - target_fn=lambda: torch.rand(5, 10).mul(10).floor().long(), - reference_fn=lambda i, t, m: - multilabelmarginloss_reference(i, t, reduction=get_reduction(m)), - check_sum_reduction=True, - check_gradgrad=False, - check_bfloat16=TEST_WITH_ROCM, - ), - dict( - module_name='MultiLabelSoftMarginLoss', - input_size=(5, 10), - target_fn=lambda: torch.rand(5, 10).mul(2).floor(), - reference_fn=lambda i, t, m: -(t * i.sigmoid().log() + (1 - t) * (-i).sigmoid().log()).sum() / i.numel(), - check_gradgrad=False, - ), - dict( - module_name='MultiMarginLoss', - input_size=(5, 10), - target_fn=lambda: torch.rand(5).mul(8).floor().long(), - reference_fn=lambda i, t, m: - multimarginloss_reference(i, t, reduction=get_reduction(m)), - check_sum_reduction=True, - check_gradgrad=False, - ), - dict( - module_name='MultiMarginLoss', - input_size=(10,), - target_fn=lambda: torch.rand(1).mul(8).floor().long(), - reference_fn=lambda i, t, m: - multimarginloss_reference(i, t, reduction=get_reduction(m)), - desc='1d', - check_sum_reduction=True, - check_gradgrad=False, - ), - dict( - module_name='MultiMarginLoss', - constructor_args=(2,), - cpp_constructor_args='torch::nn::MultiMarginLossOptions().p(2)', - input_fn=lambda: torch.rand(5, 10).clamp_(1e-2, 1 - 1e-2), - target_fn=lambda: torch.rand(5).mul(8).floor().long(), - reference_fn=lambda i, t, m: - multimarginloss_reference(i, t, p=2, reduction=get_reduction(m)), - desc='p', - check_sum_reduction=True, - check_gradgrad=False, - ), - dict( - module_name='MultiMarginLoss', - constructor_args=(1, 0.5), - cpp_constructor_args='torch::nn::MultiMarginLossOptions().p(1).margin(0.5)', - legacy_constructor_args=(1, None, 0.5), - input_size=(5, 10), - target_fn=lambda: torch.rand(5).mul(8).floor().long(), - reference_fn=lambda i, t, m: - multimarginloss_reference(i, t, margin=0.5, reduction=get_reduction(m)), - desc='margin', - check_sum_reduction=True, - check_gradgrad=False, - ), - dict( - module_name='MultiMarginLoss', - constructor_args=(1, 1., torch.rand(10)), - cpp_constructor_args='torch::nn::MultiMarginLossOptions().p(1).margin(1.).weight(torch::rand(10))', - 
legacy_constructor_args=(1, torch.rand(10)), - input_size=(5, 10), - target_fn=lambda: torch.rand(5).mul(8).floor().long(), - reference_fn=lambda i, t, m: - multimarginloss_reference(i, t, weight=get_weight(m), reduction=get_reduction(m)), - desc='weights', - check_sum_reduction=True, - check_gradgrad=False, - ), - dict( - module_name='SmoothL1Loss', - input_size=(5, 10), - target_size=(5, 10), - check_sum_reduction=True, - reference_fn=lambda i, t, m: - smoothl1loss_reference(i, t, reduction=get_reduction(m)), - ), - dict( - module_name='SoftMarginLoss', - input_size=(5, 5), - target_fn=lambda: torch.randn(5, 5).sign(), - reference_fn=lambda i, t, m: - softmarginloss_reference(i, t, reduction=get_reduction(m)), - check_sum_reduction=True, - ), - dict( - module_name='CosineEmbeddingLoss', - input_fn=lambda: (torch.rand(15, 10), torch.rand(15, 10)), - target_fn=lambda: torch.randn(15).sign(), - reference_fn=lambda i, t, m: - cosineembeddingloss_reference(i[0], i[1], t, reduction=get_reduction(m)), - check_sum_reduction=True, - ), - dict( - module_name='CosineEmbeddingLoss', - constructor_args=(0.7,), - cpp_constructor_args='torch::nn::CosineEmbeddingLossOptions().margin(0.7)', - input_fn=lambda: (torch.rand(15, 10), torch.rand(15, 10)), - target_fn=lambda: torch.randn(15).sign(), - reference_fn=lambda i, t, m: - cosineembeddingloss_reference(i[0], i[1], t, margin=0.7, reduction=get_reduction(m)), - desc='margin', - check_sum_reduction=True, - ), - dict( - module_name='MarginRankingLoss', - input_fn=lambda: (torch.randn(50).mul(10), torch.randn(50).mul(10)), - target_fn=lambda: torch.randn(50).sign(), - reference_fn=lambda i, t, m: - marginrankingloss_reference(i[0], i[1], t, reduction=get_reduction(m)), - check_sum_reduction=True, - ), - dict( - module_name='MarginRankingLoss', - constructor_args=(0.5,), - cpp_constructor_args='torch::nn::MarginRankingLossOptions().margin(0.5)', - input_fn=lambda: (torch.randn(50).mul(10), torch.randn(50).mul(10)), - target_fn=lambda: torch.randn(50).sign(), - reference_fn=lambda i, t, m: - marginrankingloss_reference(i[0], i[1], t, margin=0.5, reduction=get_reduction(m)), - desc='margin', - check_sum_reduction=True, - ), -] - -new_criterion_tests = [ - dict( - module_name='BCEWithLogitsLoss', - input_fn=lambda: torch.rand(15, 10).clamp_(1e-2, 1 - 1e-2), - target_fn=lambda: torch.randn(15, 10).gt(0).double(), - ), - dict( - module_name='BCEWithLogitsLoss', - constructor_args=(torch.rand(10),), - cpp_constructor_args='torch::nn::BCEWithLogitsLossOptions().weight(torch::rand(10))', - input_fn=lambda: torch.rand(15, 10).clamp_(1e-2, 1 - 1e-2), - target_fn=lambda: torch.randn(15, 10).gt(0).double(), - desc='weights', - ), - dict( - module_name='BCEWithLogitsLoss', - constructor_args=(torch.rand(()),), - cpp_constructor_args='torch::nn::BCEWithLogitsLossOptions().weight(torch::rand({}))', - input_fn=lambda: torch.rand(()).clamp_(1e-2, 1 - 1e-2), - target_fn=lambda: torch.randn(()).gt(0).double(), - desc='scalar_weights' - ), - dict( - module_name='NLLLoss', - input_size=(2, 3, 5, 5), - target_fn=lambda: torch.rand(2, 5, 5).mul(3).floor().long(), - reference_fn=lambda i, t, m: - loss_reference_fns['NLLLossNd'](i, t, reduction=get_reduction(m)), - check_sum_reduction=True, - desc='2d', - check_bfloat16=TEST_WITH_ROCM, - ), - dict( - module_name='NLLLoss', - constructor_args_fn=lambda: (torch.rand(3),), - cpp_constructor_args='torch::nn::NLLLossOptions().weight(torch::rand(3))', - input_size=(2, 3, 5, 5), - target=torch.rand(2, 5, 5).mul(3).floor().long(), - 
reference_fn=lambda i, t, m: - loss_reference_fns['NLLLossNd'](i, t, weight=get_weight(m)), - desc='2d_weights', - check_bfloat16=TEST_WITH_ROCM, - ), - dict( - module_name='NLLLoss', - constructor_args=(None, None, 1), - cpp_constructor_args='torch::nn::NLLLossOptions().weight({}).ignore_index(1)', - input_size=(2, 3, 5, 5), - target_fn=lambda: torch.rand(2, 5, 5).mul(3).floor().long(), - reference_fn=lambda i, t, m: - loss_reference_fns['NLLLossNd'](i, t, ignore_index=1), - desc='2d_ignore_index', - check_bfloat16=TEST_WITH_ROCM, - ), - dict( - module_name='NLLLoss', - input_size=(2, 3, 5, 5, 2, 2), - target_fn=lambda: torch.rand(2, 5, 5, 2, 2).mul(3).floor().long(), - reference_fn=lambda i, t, m: - loss_reference_fns['NLLLossNd'](i, t, reduction=get_reduction(m)), - check_sum_reduction=True, - desc='higher_dim', - check_bfloat16=TEST_WITH_ROCM, - ), - dict( - module_name='NLLLoss', - input_size=(2, 3, 5), - target_fn=lambda: torch.rand(2, 5).mul(3).floor().long(), - reference_fn=lambda i, t, m: - loss_reference_fns['NLLLossNd'](i, t, reduction=get_reduction(m)), - check_sum_reduction=True, - desc='dim_is_3', - check_bfloat16=TEST_WITH_ROCM, - ), - dict( - module_name='PoissonNLLLoss', # Default is log_input=True, full=False - input_size=(2, 3, 4, 5), - target_fn=lambda: torch.randn(2, 3, 4, 5).floor_().abs_(), - reference_fn=lambda i, t, _: (i.exp() - t.mul(i)).mean(), - desc='no_full_loss', - ), - dict( - module_name='PoissonNLLLoss', - constructor_args=(False, False), # log_input=False, full=False - cpp_constructor_args='torch::nn::PoissonNLLLossOptions().log_input(false).full(false)', - input_fn=lambda: torch.randn(2, 3, 4, 5).abs_().add_(0.001), - target_fn=lambda: torch.randn(2, 3, 4, 5).floor_().abs_(), - reference_fn=lambda i, t, _: (i - t.mul((i + 1e-8).log())).mean(), - desc='no_full_loss_no_log_input', - ), - dict( - module_name='PoissonNLLLoss', - constructor_args=(True, True), # log_input=True, full=True - cpp_constructor_args='torch::nn::PoissonNLLLossOptions().log_input(true).full(true)', - input_size=(2, 3, 4, 5), - target_fn=lambda: torch.randn(2, 3, 4, 5).floor_().abs_(), - reference_fn=lambda i, t, _: - (i.exp() - t.mul(i) + (t.mul(t.log()) - t + 0.5 * (2. * pi * t).log()).masked_fill(t <= 1, 0)).mean(), - desc='full_loss', - ), - dict( - module_name='PoissonNLLLoss', - constructor_args=(False, True), # log_input=False, full=True - cpp_constructor_args='torch::nn::PoissonNLLLossOptions().log_input(false).full(true)', - input_fn=lambda: torch.randn(2, 3, 4, 5).abs_().add_(0.001), - target_fn=lambda: torch.randn(2, 3, 4, 5).floor_().abs_(), - reference_fn=lambda i, t, _: ( - i - t.mul((i + 1e-8).log()) + (t.mul(t.log()) - t + 0.5 * (2. * pi * t).log()).masked_fill(t <= 1, 0) - ).mean(), - desc='full_loss_no_log_input', - ), - dict( - module_name='L1Loss', - input_size=(), - target_size=(), - reference_fn=lambda i, t, _: 1. 
/ i.numel() * (i - t).abs().sum(), - desc='scalar', - ), - dict( - module_name='KLDivLoss', - input_fn=lambda: torch.rand(()).log(), - target_fn=lambda: torch.rand(()), - reference_fn=lambda i, t, m: - kldivloss_reference(i, t, get_reduction(m)), - check_sum_reduction=True, - desc='scalar', - ), - dict( - module_name='MSELoss', - input_size=(), - target_size=(), - reference_fn=lambda i, t, m: ((i - t).abs().pow(2).sum() / - (i.numel() if get_reduction(m) == 'mean' else 1)), - check_sum_reduction=True, - desc='scalar', - check_bfloat16=TEST_WITH_ROCM, - ), - dict( - module_name='MSELoss', - input_fn=lambda: torch.ones(5, 68, 64, 64, dtype=torch.float) / 10, - target_fn=lambda: torch.zeros(5, 68, 64, 64, dtype=torch.float), - reference_fn=lambda i, t, m: ((i - t).abs().pow(2).sum() / - (i.numel() if get_reduction(m) == 'mean' else 1)), - check_forward_only=True, - desc='prec', - check_bfloat16=TEST_WITH_ROCM, - ), - dict( - module_name='BCELoss', - constructor_args_fn=lambda: (torch.rand(()),), - cpp_constructor_args='torch::nn::BCELossOptions().weight(torch::rand({}))', - input_fn=lambda: torch.rand(()).clamp_(1e-2, 1 - 1e-2), - target_fn=lambda: torch.rand(()).gt(0).double(), - reference_fn=lambda i, t, m: -((t * i.log() + (1 - t) * (1 - i).log()) * get_weight(m)).sum() / - (i.numel() if get_reduction(m) == 'mean' else 1), - desc='scalar_weights', - check_gradgrad=False, - check_bfloat16=TEST_WITH_ROCM, - ), - dict( - module_name='HingeEmbeddingLoss', - constructor_args=(0.5,), - cpp_constructor_args='torch::nn::HingeEmbeddingLossOptions().margin(0.5)', - input_size=(), - target_fn=lambda: torch.randn(()).gt(0).double().mul_(2).sub(1), - desc='scalar_margin', - check_sum_reduction=True, - ), - dict( - module_name='SmoothL1Loss', - input_size=(), - target_size=(), - check_sum_reduction=True, - reference_fn=lambda i, t, m: - smoothl1loss_reference(i, t, reduction=get_reduction(m)), - desc='scalar', - ), - dict( - module_name='MultiLabelSoftMarginLoss', - constructor_args=(torch.rand(10),), - cpp_constructor_args='torch::nn::MultiLabelSoftMarginLossOptions().weight(torch::rand(10))', - input_fn=lambda: torch.randn(5, 10), - target_fn=lambda: torch.rand(5, 10).mul(2).floor(), - reference_fn=lambda i, t, m: -((t * i.sigmoid().log() + (1 - t) * (-i).sigmoid().log()) * get_weight(m)).sum() / - (i.numel() if get_reduction(m) == 'mean' else i.size(1) if get_reduction(m) == 'sum' else 1), - desc='weights', - check_sum_reduction=True, - check_gradgrad=False, - ), - dict( - module_name='CTCLoss', - constructor_args=(14,), # blank=14 - extra_args=([50, 50, 50], [30, 25, 20]), # input_lengths, target_lengths - input_fn=lambda: torch.randn(50, 3, 15).log_softmax(2), - target_fn=lambda: torch.randint(0, 14, (3, 30), dtype=torch.long), - reference_fn=lambda i, t, il, tl, m: - ctcloss_reference(i, t, il, tl, blank=14, reduction=get_reduction(m)), - desc='lengths_intlists', - check_sum_reduction=True, - check_gradgrad=False, - check_half=False, - # `CTCLoss` in C++ frontend doesn't accept integer list for `input_lengths` or `target_lengths` - test_cpp_api_parity=False, - ), - dict( - module_name='CTCLoss', - constructor_args=(14,), # blank=14 - cpp_constructor_args='torch::nn::CTCLossOptions().blank(14)', - extra_args=(torch.tensor([50, 50, 50]), torch.tensor([30, 25, 20])), # input_lengths, target_lengths - input_fn=lambda: torch.randn(50, 3, 15).log_softmax(2), - target_fn=lambda: torch.randint(0, 14, (3, 30), dtype=torch.long), - reference_fn=lambda i, t, il, tl, m: - ctcloss_reference(i, t, il, tl, 
blank=14, reduction=get_reduction(m)), - desc='lengths_tensors', - check_sum_reduction=True, - check_gradgrad=False, - check_half=False, - ), - # Test is flaky - # See https://github.com/pytorch/pytorch/issues/29380. - # dict( - # module_name='CTCLoss', - # desc='1d_target', - # constructor_args=(14,), # blank=14 - # extra_args=([50, 50, 50], [30, 25, 20]), # input_lengths, target_lengths - # input_fn=lambda: torch.randn(50, 3, 15).log_softmax(2), - # target_fn=lambda: torch.randint(0, 14, (3, 30), dtype=torch.long), - # reference_fn=lambda i, t, il, tl, m: - # ctcloss_reference(i, t, il, tl, blank=14, reduction=get_reduction(m)), - # check_sum_reduction=True, - # check_gradgrad=False, - # check_half=False, - # ), - dict( - module_name='CTCLoss', - desc='2d_int_target_lengths_intlists', - constructor_args=(0,), # blank=0 - extra_args=([50, 50, 50], [30, 25, 20]), # input_lengths, target_lengths - input_fn=lambda: torch.randn(50, 3, 15).log_softmax(2), - target_fn=lambda: torch.randint(1, 15, (3, 30), dtype=torch.int), - reference_fn=lambda i, t, il, tl, m: - ctcloss_reference(i, t, il, tl, blank=0, reduction=get_reduction(m)), - check_sum_reduction=True, - check_gradgrad=False, - check_half=False, - convert_target=False, - # `CTCLoss` in C++ frontend doesn't accept integer list for `input_lengths` or `target_lengths` - test_cpp_api_parity=False, - ), - dict( - module_name='CTCLoss', - desc='2d_int_target_lengths_tensors', - constructor_args=(0,), # blank=0 - cpp_constructor_args='torch::nn::CTCLossOptions().blank(0)', - extra_args=(torch.tensor([50, 50, 50]), torch.tensor([30, 25, 20])), # input_lengths, target_lengths - input_fn=lambda: torch.randn(50, 3, 15).log_softmax(2), - target_fn=lambda: torch.randint(1, 15, (3, 30), dtype=torch.int), - reference_fn=lambda i, t, il, tl, m: - ctcloss_reference(i, t, il, tl, blank=0, reduction=get_reduction(m)), - check_sum_reduction=True, - check_gradgrad=False, - check_half=False, - convert_target=False, - ), - dict( - module_name='CTCLoss', - desc='2d_lengths_tensors', - constructor_args=(0,), # blank=0 - cpp_constructor_args='torch::nn::CTCLossOptions().blank(0)', - extra_args=(torch.tensor([50, 50, 50]), torch.tensor([30, 25, 20])), # input_lengths, target_lengths - input_fn=lambda: torch.randn(50, 3, 15).log_softmax(2), - target_fn=lambda: torch.randint(1, 15, (3, 30), dtype=torch.int), - reference_fn=lambda i, t, il, tl, m: - ctcloss_reference(i, t, il, tl, blank=0, reduction=get_reduction(m)), - check_sum_reduction=True, - check_gradgrad=False, - check_half=False, - convert_target=False, - ), -] - - -class NNTestCase(TestCase): - - def _jacobian(self, x, num_out): - if isinstance(x, tuple): - return tuple(self._jacobian(elem, num_out) for elem in x) - elif isinstance(x, list): - return [self._jacobian(elem, num_out) for elem in x] - else: - return torch.zeros(x.nelement(), num_out) - - def _flatten_tensors(self, x): - if isinstance(x, torch.Tensor): - if x.is_sparse: - return x.to_dense().view(-1) - else: - return x.view(-1) - else: - return tuple(self._flatten_tensors(a) for a in x) - - def _zero_grad_input(self, x): - if isinstance(x, torch.Tensor): - if x.requires_grad and x.grad is not None: - x.grad.zero_() - x.grad.detach_() - else: - for i in x: - self._zero_grad_input(i) - - def _analytical_jacobian(self, module, x, jacobian_input=True, jacobian_parameters=True): - output = self._forward(module, x) - output_size = output.nelement() - - if jacobian_input: - jacobian_inp = self._jacobian(x, output_size) - flat_jacobian_input = 
list(iter_tensors(jacobian_inp)) - - if jacobian_parameters: - num_param = sum(p.numel() for p in self._get_parameters(module)[0]) - jacobian_param = torch.zeros(num_param, output_size) - - for i in range(output_size): - param, d_param = self._get_parameters(module) - # make non grad zeros - d_param = [torch.zeros_like(p) if d is None else d for (p, d) in zip(param, d_param)] - - d_out = torch.zeros_like(output) - flat_d_out = d_out.view(-1) - flat_d_out[i] = 1 - - if jacobian_parameters: - self._zero_grad_parameters(module) - # Tensors will accumulate gradient from multiple steps - if jacobian_input: - self._zero_grad_input(x) - d_input = self._backward(module, x, output, d_out) - - if jacobian_input: - for jacobian_x, d_x in zip(flat_jacobian_input, iter_tensors(d_input)): - jacobian_x[:, i] = d_x.contiguous().view(-1) - if jacobian_parameters: - jacobian_param[:, i] = torch.cat(self._flatten_tensors(d_param), 0) - - res = tuple() - if jacobian_input: - res += jacobian_inp, - if jacobian_parameters: - res += jacobian_param, - - return res - - def _numerical_jacobian(self, module, x, jacobian_input=True, jacobian_parameters=True): - def fw(x): - return self._forward(module, x).detach() - - res = tuple() - if jacobian_input: - res += get_numerical_jacobian(fw, x, eps=1e-6), - if jacobian_parameters: - param, _ = self._get_parameters(module) - res += torch.cat([get_numerical_jacobian(fw, x, p, eps=1e-6) for p in param], 0), - return res - - def check_jacobian(self, module, x, jacobian_input=True): - jacobian_parameters = bool(self._get_parameters(module)[0]) - analytical = self._analytical_jacobian(module, x, jacobian_input, jacobian_parameters) - numerical = self._numerical_jacobian(module, x, jacobian_input, jacobian_parameters) - analytical_t = list(iter_tensors(analytical)) - numerical_t = list(iter_tensors(numerical)) - - # TODO: compare structure - if x.numel() != 0: - self.assertLessEqual( - max(a.add(n, alpha=-1).abs().max() for a, n in zip(analytical_t, numerical_t)), - PRECISION - ) - - def check_criterion_jacobian(self, criterion, x, target): - eps = 1e-6 - self._forward_criterion(criterion, x, target) - analytical_d_x = self._backward_criterion(criterion, x, target) - numerical_d_x = deepcopy(analytical_d_x) - - input_t = iter_tensors(x) - numerical_t = iter_tensors(numerical_d_x) - for x, d_x in zip(input_t, numerical_t): - x = x.view(-1).data - d_x = d_x.view(-1).data - for i in range(x.nelement()): - original = x[i].item() - x[i] = original + eps - fx1 = self._forward_criterion(criterion, x, target) - x[i] = original - eps - fx2 = self._forward_criterion(criterion, x, target) - deriv = (fx1 - fx2) / (2. 
* eps) - d_x[i] = float(deriv) - x[i] = original - - # TODO: check structure - analytical_t = list(iter_tensors(analytical_d_x)) - numerical_t = list(iter_tensors(numerical_d_x)) - - self.assertLessEqual( - max(a.add(n, alpha=-1).abs().max() for a, n in zip(analytical_t, numerical_t)), - PRECISION - ) - - -class TestBase(object): - - _required_arg_names = {'constructor_args', 'input', 'extra_args'} - - def __init__(self, constructor, desc='', reference_fn=None, fullname=None, **kwargs): - self.desc = desc - self.fullname = fullname - self.constructor = constructor - self.reference_fn = reference_fn - for name in self._required_arg_names: - if name not in kwargs and name + '_fn' not in kwargs and name + '_size' not in kwargs: - if name in {'constructor_args', 'extra_args'}: - kwargs[name] = tuple() - else: - raise ValueError("{}: Specify {} by a value, a function to generate it, or it's size!" - .format(self.get_name(), name)) - self._extra_kwargs = kwargs - self._arg_cache = {} - - def get_name(self): - if self.fullname is not None: - return 'test_' + self.fullname - - test_name = 'test_' + self.constructor.__name__ - if self.desc: - test_name += '_' + self.desc - return test_name - - def _unpack(self, value): - if isinstance(value, torch.Tensor): - return value - elif is_iterable(value): - return type(value)(self._unpack(v) for v in value) - else: - return value - - @property - def constructor_args(self): - return self._get_arg('constructor_args', True) - - @property - def extra_args(self): - return self._get_arg('extra_args', True) - - def _get_arg(self, name, unpack): - assert name in self._required_arg_names - - if name not in self._arg_cache: - fn_name = name + '_fn' - size_name = name + '_size' - - if name in self._extra_kwargs: - self._arg_cache[name] = self._extra_kwargs[name] - elif fn_name in self._extra_kwargs: - self._arg_cache[name] = self._extra_kwargs[fn_name]() - else: - assert size_name in self._extra_kwargs, \ - "Missing `{}`, `{}` or `{}` for {}".format(name, size_name, fn_name, self.get_name()) - - def map_tensor_sizes(sizes): - if isinstance(sizes, list): - return [map_tensor_sizes(s) for s in sizes] - elif isinstance(sizes, torch.Tensor): - return sizes.double() - else: - return torch.randn(sizes) - - self._arg_cache[name] = map_tensor_sizes(self._extra_kwargs[size_name]) - - return self._unpack(self._arg_cache[name]) if unpack else self._arg_cache[name] - - def _get_input(self, unpack=True): - return self._get_arg('input', unpack) - - def __call__(self, test_case): - raise NotImplementedError - - -class ModuleTest(TestBase): - - def __init__(self, *args, **kwargs): - super(ModuleTest, self).__init__(*args, **kwargs) - self.jacobian_input = kwargs.get('jacobian_input', True) - self.should_test_npu = kwargs.get('test_npu', True) - self.should_test_pickle = kwargs.get('pickle', True) - self.check_gradgrad = kwargs.get('check_gradgrad', True) - self.precision = kwargs.get('precision', 2e-4) - self.check_forward_only = kwargs.get('check_forward_only', False) - - def __call__(self, test_case): - module = self.constructor(*self.constructor_args) - x = self._get_input() - - if self.reference_fn is not None: - out = test_case._forward(module, x) - ref_input = deepcopy(x) - ref_module = deepcopy(module) - expected_out = self.reference_fn(ref_input, test_case._get_parameters(module)[0], ref_module) - test_case.assertEqual(out, expected_out) - if self.check_forward_only: - return - self.test_noncontig(test_case, module, x) - - if self.should_test_pickle: - # TODO: do this with 
in-memory files as soon as torch.save will support it - with TemporaryFile() as f: - test_case._forward(module, x) - torch.save(module, f) - f.seek(0) - module_copy = torch.load(f) - test_case.assertEqual(test_case._forward(module, x), test_case._forward(module_copy, x)) - - self._do_test(test_case, module, x) - - def noncontiguize(self, obj): - if isinstance(obj, list): - return [self.noncontiguize(o) for o in obj] - tensor = obj - ndim = tensor.dim() - # Always making only the last dimension noncontiguous is easy to hide - # bugs because .view(-1) will still work. So try to find a dim with size - # > 1 and make that non-contiguous, i.e., stack + select on the - # dimension directly after that. - dim = ndim - for _dim in range(ndim): - if tensor.size(_dim) > 1: - dim = _dim + 1 - break - noncontig = torch.stack([torch.empty_like(tensor), tensor], dim).select(dim, 1).detach() - assert noncontig.numel() == 1 or noncontig.numel() == 0 or not noncontig.is_contiguous() - noncontig.requires_grad = tensor.requires_grad - return noncontig - - def test_noncontig(self, test_case, module, x): - # check no scalars, can't make non-contig - if isinstance(x, torch.Tensor) and x.dim() == 0: - return - if any(i.dim() == 0 for i in x if isinstance(i, torch.Tensor)): - return - - test_case._zero_grad_parameters(module) - test_case._zero_grad_input(x) - with freeze_rng_state(): - output = test_case._forward(module, x) - grad_output = output.new(output.shape).normal_() - output = output.clone() - d_input = deepcopy(test_case._backward(module, x, output, grad_output)) - d_param = deepcopy(test_case._get_parameters(module)[1]) - - nc_input = self.noncontiguize(x) - nc_grad_output = self.noncontiguize(grad_output) - for contig_i, contig_g in product((True, False), repeat=2): - i = x if contig_i else nc_input - # Some ops, e.g., nn.Flatten, return gradient that shares - # storage with the grad_output. Hence we copy here. 
- go = deepcopy(grad_output if contig_g else nc_grad_output) - test_case._zero_grad_parameters(module) - test_case._zero_grad_input(i) - with freeze_rng_state(): - out = test_case._forward(module, i) - grad = test_case._backward(module, i, out, go) - - test_case.assertEqual(out, output) - test_case.assertEqual(grad, d_input, 1e-4) - test_case.assertEqual(test_case._get_parameters(module)[1], d_param) - - def test_npu(self, test_case): - if not self.should_test_npu: - raise unittest.SkipTest('Excluded from NPU tests') - try: - cpu_input = self._get_input() - type_map = {'torch.DoubleTensor': torch.npu.FloatTensor} - npu_input = to_npu(cpu_input, type_map=type_map) - - cpu_module = self.constructor(*self.constructor_args) - npu_module = self.constructor(*self.constructor_args).float().npu() - cpu_param = test_case._get_parameters(cpu_module) - npu_param = test_case._get_parameters(npu_module) - for cpu_p, npu_p in zip(cpu_param[0], npu_param[0]): - npu_p.data.copy_(cpu_p) - - test_case._zero_grad_input(cpu_input) - test_case._zero_grad_input(npu_input) - test_case._zero_grad_parameters(cpu_module) - test_case._zero_grad_parameters(npu_module) - cpu_output = test_case._forward(cpu_module, cpu_input) - npu_output = test_case._forward(npu_module, npu_input) - test_case.assertEqual(cpu_output, npu_output, self.precision) - - # Run backwards on CPU and NPU and compare results - for _ in range(5): - cpu_gradOutput = cpu_output.clone().normal_() - npu_gradOutput = cpu_gradOutput.to(torch.float32).npu() - cpu_gradInput = test_case._backward(cpu_module, cpu_input, cpu_output, cpu_gradOutput) - npu_gradInput = test_case._backward(npu_module, npu_input, npu_output, npu_gradOutput) - test_case.assertEqual(cpu_gradInput, npu_gradInput, self.precision) - for cpu_d_p, npu_d_p in zip(cpu_param[1], npu_param[1]): - test_case.assertEqual(cpu_d_p, npu_d_p, self.precision) - - # Run double-backwards on CPU and NPU and compare results - if self.check_gradgrad: - cpu_output = cpu_module(cpu_input) - npu_output = npu_module(npu_input) - - cpu_gradOutput = torch.randn_like(cpu_output, requires_grad=True) - npu_gradOutput = cpu_gradOutput.type_as(npu_output).detach() - npu_gradOutput.requires_grad = True - - cpu_gradInputs = torch.autograd.grad( - cpu_output, - (cpu_input,) + tuple(cpu_module.parameters()), - cpu_gradOutput, - create_graph=True) - npu_gradInputs = torch.autograd.grad( - npu_output, - (npu_input,) + tuple(npu_module.parameters()), - npu_gradOutput, - create_graph=True) - - for cpu_d_i, npu_d_i in zip(cpu_gradInputs, npu_gradInputs): - test_case.assertEqual(cpu_d_i, npu_d_i, self.precision) - - # We mix output into the second backwards computation so that - # torch.autograd.grad doesn't complain that some inputs - # are unreachable (which can happen if you differentiate - # only on the gradient. 
- cpu_gg = torch.autograd.grad( - cpu_output.sum() + sum(map(lambda x: x.sum(), cpu_gradInputs)), - (cpu_input, cpu_gradOutput) + tuple(cpu_module.parameters()), - retain_graph=True) - npu_gg = torch.autograd.grad( - npu_output.sum() + sum(map(lambda x: x.sum(), npu_gradInputs)), - (npu_input, npu_gradOutput) + tuple(npu_module.parameters()), - retain_graph=True) - - test_case.assertEqual(cpu_gradInput, npu_gradInput, self.precision) - for cpu_d_p, npu_d_p in zip(cpu_gg, npu_gg): - test_case.assertEqual(cpu_d_p, npu_d_p, self.precision) - - self.test_noncontig(test_case, npu_module, npu_input) - except NotImplementedError: - pass - # TODO: remove this after NPU scatter_ is implemented - except AttributeError as e: - if len(e.args) == 1 and "'FloatTensor' object has no attribute 'scatter_'" in e.args[0]: - pass - else: - raise - - -class CriterionTest(TestBase): - - _required_arg_names = TestBase._required_arg_names.union({'target'}) - - def __init__(self, *args, **kwargs): - super(CriterionTest, self).__init__(*args, **kwargs) - self.should_test_npu = kwargs.get('test_npu', True) - self.check_forward_only = kwargs.get('check_forward_only', True) - - def _get_target(self): - return self._get_arg('target', True) - - def __call__(self, test_case): - module = self.constructor(*self.constructor_args) - x = self._get_input() - - # Check that these methods don't raise errors - module.__repr__() - str(module) - - target = self._get_target() - - if self.reference_fn is not None: - out = test_case._forward_criterion(module, x, target, extra_args=self.extra_args) - ref_args = (deepcopy(x), deepcopy(target)) + self.extra_args + (module,) - expected_out = self.reference_fn(*ref_args) - test_case.assertEqual(out, expected_out) - - if self.check_forward_only: - return - - test_case.check_criterion_jacobian(module, x, target) - self._do_extra_tests(test_case, module, x, target) - - def test_npu(self, test_case): - if not self.should_test_npu: - raise unittest.SkipTest('Excluded from NPU tests') - try: - cpu_input = self._get_input() - type_map = { - 'torch.DoubleTensor': torch.npu.FloatTensor, - } - npu_input = to_npu(cpu_input, type_map=type_map) - - cpu_target = self._get_target() - npu_target = to_npu(cpu_target, type_map=type_map) - - cpu_module = self.constructor(*self.constructor_args) - npu_module = self.constructor(*self.constructor_args).float().npu() - - cpu_output = test_case._forward_criterion(cpu_module, cpu_input, cpu_target) - npu_output = test_case._forward_criterion(npu_module, npu_input, npu_target) - test_case.assertEqual(cpu_output, npu_output, 4e-4) - - gradOutput = torch.randn(()) - cpu_gradInput = test_case._backward_criterion(cpu_module, cpu_input, cpu_target, gradOutput) - npu_gradInput = test_case._backward_criterion(npu_module, npu_input, npu_target, gradOutput) - test_case.assertEqual(cpu_gradInput, npu_gradInput, 4e-4) - except NotImplementedError: - pass - - def _do_extra_tests(self, test_case, module, x, target): - pass - - -class InputVariableMixin(object): - def _get_input(self): - x = TestBase._get_input(self, False) - - def map_variables(i): - if isinstance(i, torch.Tensor): - if i.is_floating_point(): - i.requires_grad = True - return i - else: - return type(i)(map_variables(elem) for elem in i) - - return map_variables(x) - - -class NewModuleTest(InputVariableMixin, ModuleTest): - def __init__(self, *args, **kwargs): - super(NewModuleTest, self).__init__(*args, **kwargs) - self.cudnn = kwargs.get('cudnn', False) - self.check_inplace = kwargs.get('check_inplace', False) 
- self.check_gradgrad = kwargs.get('check_gradgrad', True) - self.skip_double = kwargs.get('skip_double', False) - - def _do_test(self, test_case, module, x): - test_case.check_jacobian(module, x, self.jacobian_input) - - if self.check_gradgrad: - # could probably unify check_jacobian above with this. - params = tuple(x for x in module.parameters()) - _assertGradAndGradgradChecks(test_case, - lambda x, *args, **kw: test_case._forward(module, x), (x,) + params) - - # check if module can be printed - module.__repr__() - - if self.check_inplace: - # check if the inplace variant of the module gives the same result - # as the out-of-place - - module_ip = self.constructor(*self.constructor_args, inplace=True) - - input_version = x._version - with freeze_rng_state(): - output = module(x) - test_case.assertEqual(x._version, input_version) - - input_ip = deepcopy(x) - input_ip_clone = input_ip.clone() - with freeze_rng_state(): - output_ip = module_ip(input_ip_clone) - test_case.assertNotEqual(input_ip_clone._version, input_version) - test_case.assertEqual(output, output_ip) - grad = output.data.clone().normal_() - x.grad.data.zero_() - output.backward(grad) - output_ip.backward(grad) - test_case.assertEqual(x.grad, input_ip.grad) - - if self.should_test_npu and TEST_NPU: - x = x.npu() - module.float().npu() - module(x) - for p in module.parameters(): - test_case.assertIsInstance(p, torch.npu.FloatTensor) - test_case.assertEqual(p.get_device(), 0) - - if torch.npu.device_count() > 1: - x = x.npu(1) - module.npu(1) - with torch.npu.device(1): - module(x) - for p in module.parameters(): - test_case.assertIsInstance(p, torch.npu.FloatTensor) - test_case.assertEqual(p.get_device(), 1) - else: - # to float - if not isinstance(x, torch.LongTensor): - x = x.float() - module.float() - module(x) - for p in module.parameters(): - test_case.assertIsInstance(p, torch.FloatTensor) - - # and back to double - if not isinstance(x, torch.LongTensor): - x = x.double() - module.double() - module(x) - for p in module.parameters(): - test_case.assertIsInstance(p, torch.DoubleTensor) - - if TEST_NPU and self.should_test_npu: - x = x.float().npu() - module.float().npu() - module(x) - for p in module.parameters(): - test_case.assertIsInstance(p, torch.npu.FloatTensor) - test_case.assertEqual(p.get_device(), 0) - - # to CPU - x = x.cpu() - module.cpu() - module(x) - for p in module.parameters(): - test_case.assertIsInstance(p, torch.FloatTensor) - - # back to NPU0 - x = x.npu() - module.npu() - module(x) - for p in module.parameters(): - test_case.assertIsInstance(p, torch.npu.FloatTensor) - test_case.assertEqual(p.get_device(), 0) - - if torch.npu.device_count() >= 2: - # test cross-NPU transfer works - # to NPU1 - x = x.npu(1) - module.npu(1) - with torch.npu.device(1): - module(x) - for p in module.parameters(): - test_case.assertIsInstance(p, torch.npu.FloatTensor) - test_case.assertEqual(p.get_device(), 1) - - - # test half() - x = x.half().npu() - module.half().npu() - module(x) - for p in module.parameters(): - test_case.assertIsInstance(p, torch.npu.HalfTensor) - test_case.assertEqual(p.get_device(), 0) - - def _get_target(self): - return self._get_arg('target', False) - - @property - def constructor_args(self): - return self._get_arg('constructor_args', False) - - -class NewCriterionTest(InputVariableMixin, CriterionTest): - # TODO: check that criterions don't ignore grad_output - - def __init__(self, *args, **kwargs): - super(NewCriterionTest, self).__init__(*args, **kwargs) - self.check_gradgrad = 
kwargs.get('check_gradgrad', True) - self.check_half = kwargs.get('check_half', True) - self.check_bfloat16 = kwargs.get('check_bfloat16', False) - self.convert_target = kwargs.get('convert_target', True) - - def _do_extra_tests(self, test_case, module, x, target): - if not self.check_gradgrad: - return - - test_case.assertFalse(target.requires_grad) - - params = tuple(x for x in module.parameters()) - if not isinstance(x, tuple): - xs = (x,) + params - - def apply_fn(x, *params): - return module(x, target) - else: - inputs = x + params - - def apply_fn(input1, input2, *params): - return module(input1, input2, target) - - # TODO: we don't pass `target` as part of inputs because we don't - # currently compute the gradient w.r.t. target for loss functions. - gradcheck(apply_fn, inputs) - gradgradcheck(apply_fn, inputs) - - def test_npu(self, test_case, dtype=None, extra_args=None): - def convert_dtype(obj, dtype, requires_grad=False): - if isinstance(obj, torch.Tensor): - return obj.detach().to(dtype=dtype).requires_grad_(requires_grad) - elif isinstance(obj, torch.Tensor): - return obj.to(dtype) - elif isinstance(obj, tuple): - return tuple(convert_dtype(o, dtype, requires_grad) for o in obj) - else: - return obj - - if not self.should_test_npu: - raise unittest.SkipTest('Excluded from NPU tests') - try: - cpu_input = self._get_input() - cpu_target = self._get_target() - cpu_module = self.constructor(*self.constructor_args) - npu_module = self.constructor(*self.constructor_args) - - # Convert input, target and module parameters to dtype - if dtype is not None: - cpu_input = convert_dtype(cpu_input, dtype, True) - # NLLLoss requires target to be LongTensor - if not isinstance(cpu_target, torch.LongTensor) and self.convert_target: - cpu_target = convert_dtype(cpu_target, dtype) - cpu_module.type(dtype) - npu_module.type(dtype) - - # NPU setup - npu_input = to_npu(cpu_input) - npu_target = to_npu(cpu_target) - npu_module.npu() - - # torch.HalfTensor doesn't support most operations, converting back to default - if dtype in {torch.half, torch.bfloat16}: - cpu_input = self._get_input() - cpu_target = self._get_target() - # Loss modules with weights require consistent input/module weight types - cpu_module = self.constructor(*self.constructor_args) - - cpu_output = test_case._forward_criterion(cpu_module, cpu_input, cpu_target, extra_args=extra_args) - npu_output = test_case._forward_criterion(npu_module, npu_input, npu_target, extra_args=extra_args) - # dtype can be None, so set precision in this way instead of a precision map - test_case.assertEqual(cpu_output, npu_output, 1e-1 if dtype in {torch.half, torch.bfloat16} else 4e-4) - - cpu_gradInput = test_case._backward_criterion(cpu_module, cpu_input, cpu_target, extra_args=extra_args) - npu_gradInput = test_case._backward_criterion(npu_module, npu_input, npu_target, extra_args=extra_args) - test_case.assertEqual(cpu_gradInput, npu_gradInput, 1e-1 if dtype in {torch.half, torch.bfloat16} else 4e-4) - except NotImplementedError: - pass - - def _get_target(self): - return self._get_arg('target', False) - - @property - def constructor_args(self): - return self._get_arg('constructor_args', False) - - @property - def extra_args(self): - return self._get_arg('extra_args', False) diff --git a/torch_npu/testing/common_utils.py b/torch_npu/testing/common_utils.py index d8149b98d43692deb82ac7dd5869da003d496bbb..f442d05b15cfc2f5c1f286c3bc7091847b5c0f51 100644 --- a/torch_npu/testing/common_utils.py +++ b/torch_npu/testing/common_utils.py @@ -1,5 +1,5 @@ # 
Copyright (c) 2020 Huawei Technologies Co., Ltd -# Copyright (c) 2019, Facebook CORPORATION. +# Copyright (c) 2019, Facebook CORPORATION. # All rights reserved. # # Licensed under the BSD 3-Clause License (the "License"); @@ -14,227 +14,22 @@ # See the License for the specific language governing permissions and # limitations under the License. -r"""Importing this file must **not** initialize NPU context. test_distributed -relies on this assumption to properly run. This means that when this is imported -no NPU calls shall be made, including torch.npu.device_count(), etc. - -torch.testing._internal.common_npu.py can freely initialize NPU context when imported. -""" +from functools import wraps +from itertools import product +from contextlib import contextmanager -import sys import os -import platform -import re -import gc -import types -from functools import partial -import inspect -import io -import argparse +import sys import unittest -import warnings -import random -import contextlib -import socket -import subprocess -import time -from collections import OrderedDict -from contextlib import contextmanager -from functools import wraps -from itertools import product -from copy import deepcopy -from numbers import Number import tempfile -import json -if sys.version_info[0] == 2: - from urllib2 import urlopen # noqa f811 -else: - from urllib.request import urlopen -import numpy as np -import __main__ -import errno - -from torch.testing._internal import expecttest - import torch -from torch._utils_internal import get_writable_path -from torch._six import string_classes, inf -import torch.backends.cudnn -import torch.backends.mkl -from enum import Enum -from torch.autograd import gradcheck -from torch.autograd.gradcheck import gradgradcheck -from torch_npu.testing.util_test import set_npu_device - -torch.backends.disable_global_flags() - -IS_SANDCASTLE = os.getenv('SANDCASTLE') == '1' or os.getenv('TW_JOB_USER') == 'sandcastle' - -class ProfilingMode(Enum): - LEGACY = 1 - SIMPLE = 2 - PROFILING = 3 - -@contextmanager -def enable_profiling_mode(): - if GRAPH_EXECUTOR == ProfilingMode.PROFILING: - old_prof_exec_state = torch._C._jit_set_profiling_executor(True) - old_prof_mode_state = torch._C._jit_set_profiling_mode(True) - try: - yield - finally: - if GRAPH_EXECUTOR == ProfilingMode.PROFILING: - torch._C._jit_set_profiling_executor(old_prof_exec_state) - torch._C._jit_set_profiling_mode(old_prof_mode_state) - -func_call = torch._C.ScriptFunction.__call__ -meth_call = torch._C.ScriptMethod.__call__ - -def prof_callable(call_fn, *args, **kwargs): - if 'profile_and_replay' in kwargs: - del kwargs['profile_and_replay'] - if GRAPH_EXECUTOR == ProfilingMode.PROFILING: - with enable_profiling_mode(): - call_fn(*args, **kwargs) - return call_fn(*args, **kwargs) - - return call_fn(*args, **kwargs) - -def prof_func_call(*args, **kwargs): - return prof_callable(func_call, *args, **kwargs) - -def prof_meth_call(*args, **kwargs): - return prof_callable(meth_call, *args, **kwargs) - -torch._C.ScriptFunction.__call__ = prof_func_call -torch._C.ScriptMethod.__call__ = prof_meth_call - -parser = argparse.ArgumentParser(add_help=False) -parser.add_argument('--subprocess', action='store_true', - help='whether to run each test in a subprocess') -parser.add_argument('--seed', type=int, default=1234) -parser.add_argument('--accept', action='store_true') -parser.add_argument('--ge_config', type=str) -parser.add_argument('--test_bailouts', action='store_true') - -GRAPH_EXECUTOR = ProfilingMode.SIMPLE if IS_SANDCASTLE 
else ProfilingMode.PROFILING -pargs, remaining = parser.parse_known_args() -if pargs.ge_config == 'legacy': - GRAPH_EXECUTOR = ProfilingMode.LEGACY -elif pargs.ge_config == 'simple': - GRAPH_EXECUTOR = ProfilingMode.SIMPLE - -TEST_BAILOUTS = pargs.test_bailouts -TEST_IN_SUBPROCESS = pargs.subprocess -SEED = pargs.seed -if not expecttest.ACCEPT: - expecttest.ACCEPT = pargs.accept -UNITTEST_ARGS = [sys.argv[0]] + remaining -torch.manual_seed(SEED) - - -def shell(command, cwd=None, env=None): - sys.stdout.flush() - sys.stderr.flush() - # The following cool snippet is copied from Py3 core library subprocess.call - # only the with - # 1. `except KeyboardInterrupt` block added for SIGINT handling. - # 2. In Py2, subprocess.Popen doesn't return a context manager, so we do - # `p.wait()` in a `final` block for the code to be portable. - # - # https://github.com/python/cpython/blob/71b6c1af727fbe13525fb734568057d78cea33f3/Lib/subprocess.py#L309-L323 - assert not isinstance(command, torch._six.string_classes), "Command to shell should be a list or tuple of tokens" - p = subprocess.Popen(command, universal_newlines=True, cwd=cwd, env=env) - try: - return p.wait() - except KeyboardInterrupt: - # Give `p` a chance to handle KeyboardInterrupt. Without this, - # `pytest` can't print errors it collected so far upon KeyboardInterrupt. - exit_status = p.wait(timeout=5) - if exit_status is not None: - return exit_status - else: - p.kill() - raise - except: # noqa E722, copied from python core library - p.kill() - raise - finally: - # Always call p.wait() to ensure exit - p.wait() - - -# Used to run the same test with different tensor types -def repeat_test_for_types(dtypes): - def repeat_helper(f): - @wraps(f) - def call_helper(self, *args): - for dtype in dtypes: - if PY34: - with TestCase.subTest(self, dtype=dtype): - f(self, *args, dtype=dtype) - else: - f(self, *args, dtype=dtype) - - return call_helper - return repeat_helper - -# Environment variable `IS_PYTORCH_CI` is set in `.jenkins/common.sh`. -IS_PYTORCH_CI = bool(os.environ.get('IS_PYTORCH_CI')) -IN_CIRCLECI = bool(os.environ.get('IN_CIRCLECI')) -TEST_REPORT_SOURCE_OVERRIDE = os.environ.get('TEST_REPORT_SOURCE_OVERRIDE') - -PY3 = sys.version_info > (3, 0) -PY34 = sys.version_info >= (3, 4) - -def run_tests(argv=UNITTEST_ARGS): - if TEST_IN_SUBPROCESS: - suite = unittest.TestLoader().loadTestsFromModule(__main__) - test_cases = [] - - def add_to_test_cases(suite_or_case): - if isinstance(suite_or_case, unittest.TestCase): - test_cases.append(suite_or_case) - else: - for element in suite_or_case: - add_to_test_cases(element) - - add_to_test_cases(suite) - failed_tests = [] - for case in test_cases: - test_case_full_name = case.id().split('.', 1)[1] - exitcode = shell([sys.executable] + argv + [test_case_full_name]) - if exitcode != 0: - failed_tests.append(test_case_full_name) - - assert len(failed_tests) == 0, "{} unit test(s) failed:\n\t{}".format( - len(failed_tests), '\n\t'.join(failed_tests)) - else: - if IN_CIRCLECI: - # import here so that non-CI doesn't need xmlrunner installed - import xmlrunner - # allow users to override the test file location. We need this - # because the distributed tests run the same test file multiple - # times with different configurations. 
- if TEST_REPORT_SOURCE_OVERRIDE is not None: - test_source = TEST_REPORT_SOURCE_OVERRIDE - else: - test_source = 'python-unittest' +import torch_npu +import numpy as np - test_report_path = os.path.join('test-reports', test_source) - if PY3: - os.makedirs(test_report_path, exist_ok=True) - else: - if not os.path.exists(test_report_path): - os.makedirs(test_report_path) +from torch.testing._internal.common_utils import TEST_MKL - unittest.main(argv=argv, testRunner=xmlrunner.XMLTestRunner(output=test_report_path)) - else: - unittest.main(argv=argv) IS_WINDOWS = sys.platform == "win32" -IS_MACOS = sys.platform == "darwin" -IS_PPC = platform.machine() == "ppc64le" if IS_WINDOWS: @contextmanager @@ -255,115 +50,278 @@ else: yield f.name -def _check_module_exists(name): - r"""Returns if a top-level module with :attr:`name` exists *without** - importing it. This is generally safer than try-catch block around a - `import X`. It avoids third party libraries breaking assumptions of some of - our tests, e.g., setting multiprocessing start method when imported - (see librosa/#747, torchvision/#544). - """ - if not PY3: # Python 2 - import imp - try: - imp.find_module(name) - return True - except ImportError: +@contextmanager +def freeze_rng_state(): + rng_state = torch.get_rng_state() + yield + torch.set_rng_state(rng_state) + + +def iter_indices(tensor): + if tensor.dim() == 0: + return range(0) + if tensor.dim() == 1: + return range(tensor.size(0)) + return product(*(range(s) for s in tensor.size())) + + +def is_iterable(obj): + try: + iter(obj) + return True + except TypeError: + return False + + +def set_npu_device(): + npu_device = get_npu_device() + torch.npu.set_device(npu_device) + print(f"Your device is {npu_device}") + return npu_device + + +def get_npu_device(): + npu_device = os.environ.get('SET_NPU_DEVICE') + if npu_device is None: + npu_device = "npu:0" + else: + npu_device = f"npu:{npu_device}" + return npu_device + + +def create_common_tensor(item, minValue, maxValue, device=None): + if device is None: + device = get_npu_device() + + dtype = item[0] + npu_format = item[1] + shape = item[2] + input1 = np.random.uniform(minValue, maxValue, shape).astype(dtype) + cpu_input = torch.from_numpy(input1) + npu_input = torch.from_numpy(input1).to(device) + if npu_format != -1: + npu_input = torch_npu.npu_format_cast(npu_input, npu_format) + return cpu_input, npu_input + +def compare_res_new(cpu_output, npu_output, testcase_name): + if cpu_output.shape != npu_output.shape: + return print("result shape error!", cpu_output.shape, npu_output.shape) + if cpu_output.dtype != npu_output.dtype: + return print("result dtype error!", cpu_output.dtype, npu_output.dtype) + if cpu_output.dtype == np.int32: + result = np.equal(cpu_output, npu_output) + if result is False: + return print('testcase_name={0}, npu datatype={1} shape={2} fails!'.format( + testcase_name, npu_output.dtype, npu_output.shape)) + elif cpu_output.dtype == np.float16: + result = np.allclose(npu_output, cpu_output, 0.0001, 0) + if result is False: + return print('testcase_name={0}, npu datatype={1} shape={2} fails!'.format( + testcase_name, npu_output.dtype, npu_output.shape)) + elif cpu_output.dtype == np.float32: + result = np.allclose(npu_output, cpu_output, 0.0001, 0) + print(npu_output, cpu_output) + print(result) + if not result: + return print('testcase_name={0}, npu datatype={1} shape={2} fails!'.format( + testcase_name, npu_output.dtype, npu_output.shape)) + print('testcase_name={0}, datatype={1} shape={2} 
pass!'.format(testcase_name, cpu_output.dtype, cpu_output.shape)) + + +def __generate_2args_broadcast_cases(device=None): + if device is None: + device = get_npu_device() + + # Set broadcast and no axis, i.e. broadcasting 1. + X = np.random.rand(2, 3, 4, 5).astype(np.float32) + Y = np.random.rand(1, 1, 1).astype(np.float32) + + cpu_x = torch.from_numpy(X) + npu_x = torch.from_numpy(X).to(device) + + cpu_y = torch.from_numpy(Y) + npu_y = torch.from_numpy(Y).to(device) + + yield cpu_x, cpu_y, npu_x, npu_y + + # broadcasting last two dimensions. + X = np.random.rand(2, 3, 4, 5).astype(np.float32) + Y = np.random.rand(4, 5).astype(np.float32) + + cpu_x = torch.from_numpy(X) + npu_x = torch.from_numpy(X).to(device) + + cpu_y = torch.from_numpy(Y) + npu_y = torch.from_numpy(Y).to(device) + + yield cpu_x, cpu_y, npu_x, npu_y + +def test_2args_broadcast(fn): + output_list = [] + for cpu_x, cpu_y, npu_x, npu_y in __generate_2args_broadcast_cases(): + cpu_out = fn(cpu_x, cpu_y).numpy() + npu_out = fn(npu_x, npu_y).to("cpu").numpy() + output_list.append([cpu_out, npu_out]) + + return output_list + + +def create_dtype_tensor(shape, dtype, npu_format=-1, min_value=-5, max_value=5, no_zero=False, device=None): + if device is None: + device = get_npu_device() + + if dtype == torch.bool: + x = np.random.randint(0, 2, size=shape).astype(bool) + + elif dtype == torch.half: + x = np.random.uniform(min_value, max_value, shape).astype(np.float16) + + elif dtype == torch.float: + x = np.random.uniform(min_value, max_value, shape).astype(np.float32) + + else: + x = np.random.randint(min_value, max_value+1, size = shape).astype(np.int32) + + if no_zero: + ones = np.ones_like(x) + x = np.where(x != 0, x, ones) + + cpu_input = torch.from_numpy(x) + npu_input = torch.from_numpy(x).to(device) + if npu_format != -1 and (dtype in [torch.float, torch.half]): + npu_input = torch_npu.npu_format_cast(npu_input, npu_format) + return cpu_input, npu_input + + +def check_operators_in_prof(expected_operators, prof, unexpected_operators=None): + unexpected_operators = unexpected_operators or [] + prof_key_averages = prof.key_averages() + if not prof_key_averages: + return print("torch profiling is empty, please check it") + for prof_item in prof_key_averages: + if prof_item.key in unexpected_operators: + # if unexpected oprators are called, pattern inferring in trans-contiguous is failed return False - elif not PY34: # Python [3, 3.4) - import importlib - loader = importlib.find_loader(name) - return loader is not None - else: # Python >= 3.4 - import importlib - import importlib.util - spec = importlib.util.find_spec(name) - return spec is not None - -TEST_NUMPY = _check_module_exists('numpy') -TEST_SCIPY = _check_module_exists('scipy') -TEST_MKL = torch.backends.mkl.is_available() -TEST_NUMBA = _check_module_exists('numba') - -# Skip the test until issue #28313 gets fixed on Py2. -TEST_DILL = _check_module_exists('dill') and PY3 - -# On Py2, importing librosa 0.6.1 triggers a TypeError (if using newest joblib) -# see librosa/librosa#729. 
-# TODO: allow Py2 when librosa 0.6.2 releases -TEST_LIBROSA = _check_module_exists('librosa') and PY3 - -# Python 2.7 doesn't have spawn -NO_MULTIPROCESSING_SPAWN = os.environ.get('NO_MULTIPROCESSING_SPAWN', '0') == '1' or sys.version_info[0] == 2 -TEST_WITH_ASAN = os.getenv('PYTORCH_TEST_WITH_ASAN', '0') == '1' -TEST_WITH_TSAN = os.getenv('PYTORCH_TEST_WITH_TSAN', '0') == '1' -TEST_WITH_UBSAN = os.getenv('PYTORCH_TEST_WITH_UBSAN', '0') == '1' -TEST_WITH_ROCM = os.getenv('PYTORCH_TEST_WITH_ROCM', '0') == '1' -# Enables tests that are slow to run (disabled by default) -TEST_WITH_SLOW = os.getenv('PYTORCH_TEST_WITH_SLOW', '0') == '1' - -# Disables non-slow tests (these tests enabled by default) -# This is usually used in conjunction with TEST_WITH_SLOW to -# run *only* slow tests. (I could have done an enum, but -# it felt a little awkward. -TEST_SKIP_FAST = os.getenv('PYTORCH_TEST_SKIP_FAST', '0') == '1' - -if TEST_NUMPY: - import numpy - -ALL_TENSORTYPES = [torch.float, - torch.double, - torch.half] - -# bfloat16 bringup is currently only available on ROCm -# ALL_TENSORTYPES2 will eventually be unified with ALL_TENSORTYPES -# when bfloat16 bringup is complete on all platforms -if TEST_WITH_ROCM: - ALL_TENSORTYPES2 = [torch.float, - torch.double, - torch.half, - torch.bfloat16] -else: - ALL_TENSORTYPES2 = ALL_TENSORTYPES + elif prof_item.key in expected_operators: + # if expected oprator is called, empty it in expected_operators list + expected_operators.remove(prof_item.key) + + # if expected_operators list is empty, all oprators have been called + if not expected_operators: + return True + return False -class SkipIfRocm(object): + +# Decorator that skips a test if the given condition is true. +# Notes: +# (1) Skip conditions stack. +# (2) Skip conditions can be bools or strings. If a string the +# test base must have defined the corresponding attribute to be False +# for the test to run. If you want to use a string argument you should +# probably define a new decorator instead (see below). +# (3) Prefer the existing decorators to defining the 'device_type' kwarg. +class SkipIf(object): + + def __init__(self, dep, reason, device_type=None): + self.dep = dep + self.reason = reason + self.device_type = device_type def __call__(self, fn): + @wraps(fn) - def wrapper(*args, **kwargs): - if TEST_WITH_ROCM: - raise unittest.SkipTest("test doesn't currently work on the ROCm stack") - else: - fn(*args, **kwargs) - return wrapper + def dep_fn(slf, device, *args, **kwargs): + if self.device_type is None or self.device_type == slf.device_type: + if ((isinstance(self.dep, str) and getattr(slf, self.dep, True)) + or (isinstance(self.dep, bool) and self.dep)): + raise unittest.SkipTest(self.reason) + + return fn(slf, device, *args, **kwargs) + return dep_fn + + +# Skips a test on CPU if the condition is true. 
+class SkipCPUIf(SkipIf): + + def __init__(self, dep, reason): + super(SkipCPUIf, self).__init__(dep, reason, device_type='cpu') -class SkipIfCompiledWithoutNumpy(object): +class ExpectedFailure(object): + + def __init__(self, device_type): + self.device_type = device_type def __call__(self, fn): - # Even if the numpy module is present, if `USE_NUMPY=0` is used during the - # build, numpy tests will fail - numpy_support = TEST_NUMPY - if numpy_support: - try: - # The numpy module is present, verify that PyTorch is compiled with - # numpy support - torch.from_numpy(numpy.array([2, 2])) - except RuntimeError: - numpy_support = False @wraps(fn) - def wrapper(*args, **kwargs): - if not numpy_support: - raise unittest.SkipTest("PyTorch was compiled without numpy support") - else: - fn(*args, **kwargs) - return wrapper + def efail_fn(slf, device, *args, **kwargs): + if self.device_type is None or self.device_type == slf.device_type: + try: + fn(slf, device, *args, **kwargs) + except Exception: + return + else: + slf.fail('expected test to fail, but it passed') + + return fn(slf, device, *args, **kwargs) + return efail_fn -def _test_function(fn, device): - def run_test_function(self): - return fn(self, device) - return run_test_function +class OnlyOn(object): + + def __init__(self, device_type): + self.device_type = device_type + + def __call__(self, fn): + + @wraps(fn) + def only_fn(slf, device, *args, **kwargs): + if self.device_type != slf.device_type: + reason = "Only runs on {0}".format(self.device_type) + raise unittest.SkipTest(reason) + + return fn(slf, device, *args, **kwargs) + + return only_fn + + +# Decorator that provides all available devices of the device type to the test +# as a list of strings instead of providing a single device string. +# Skips the test if the number of available devices of the variant's device +# type is less than the 'num_required_devices' arg. +class DeviceCountAtLeast(object): + + def __init__(self, num_required_devices): + self.num_required_devices = num_required_devices + + def __call__(self, fn): + assert not hasattr(fn, 'num_required_devices'), "DeviceCountAtLeast redefinition for {0}".format(fn.__name__) + fn.num_required_devices = self.num_required_devices + + @wraps(fn) + def multi_fn(slf, devices, *args, **kwargs): + if len(devices) < self.num_required_devices: + reason = "fewer than {0} devices detected".format(self.num_required_devices) + raise unittest.SkipTest(reason) + + return fn(slf, devices, *args, **kwargs) + + return multi_fn + + +# Skips a test on CPU if LAPACK is not available. +class SkipCPUIfNoLapack(object): + + def __call__(self, fn): + return SkipCPUIf(not torch._C.has_lapack, "PyTorch compiled without Lapack")(fn) + + +# Skips a test on CPU if MKL is not available. 
+class SkipCPUIfNoMkl(object): + + def __call__(fn): + return SkipCPUIf(not TEST_MKL, "PyTorch is built without MKL support")(fn) class SkipIfNoLapack(object): @@ -392,1146 +350,7 @@ class SkipIfNotRegistered(object): def __call__(op_name, message): try: from caffe2.python import core - skipper = unittest.skipIf(op_name not in core._REGISTERED_OPERATORS, - message) + skipper = unittest.skipIf(op_name not in core._REGISTERED_OPERATORS, message) except ImportError: skipper = unittest.skip("Cannot import `caffe2.python.core`") - return skipper - - -def slow_test(fn): - @wraps(fn) - def wrapper(*args, **kwargs): - if not TEST_WITH_SLOW: - raise unittest.SkipTest("test is slow; run with PYTORCH_TEST_WITH_SLOW to enable test") - else: - fn(*args, **kwargs) - wrapper.__dict__['slow_test'] = True - return wrapper - - -def suppress_warnings(fn): - @wraps(fn) - def wrapper(*args, **kwargs): - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - fn(*args, **kwargs) - return wrapper - - -def get_cpu_type(type_name): - module, name = type_name.rsplit('.', 1) - assert module == 'torch.npu' - return getattr(torch, name) - - -def get_npu_type(type_name): - if isinstance(type_name, type): - type_name = '{}.{}'.format(type_name.__module__, type_name.__name__) - module, name = type_name.rsplit('.', 1) - assert module == 'torch' - return getattr(torch.npu, name) - - -def to_npu(obj, type_map=None): - if type_map is None: - type_map = {} - if isinstance(obj, torch.Tensor): - assert obj.is_leaf - t = type_map.get(obj.type(), get_npu_type(obj.type())) - with torch.no_grad(): - res = obj.clone().to(torch.float32).npu() - res.requires_grad = obj.requires_grad - return res - elif torch.is_storage(obj): - return obj.new().resize_(obj.size()).copy_(obj) - elif isinstance(obj, list): - return [to_npu(o, type_map) for o in obj] - elif isinstance(obj, tuple): - return tuple(to_npu(o, type_map) for o in obj) - else: - return deepcopy(obj) - - -def get_function_arglist(func): - if sys.version_info > (3,): - return inspect.getfullargspec(func).args - else: - return inspect.getargspec(func).args - - -def set_rng_seed(seed): - torch.manual_seed(seed) - random.seed(seed) - if TEST_NUMPY: - numpy.random.seed(seed) - - -@contextlib.contextmanager -def freeze_rng_state(): - rng_state = torch.get_rng_state() - yield - torch.set_rng_state(rng_state) - - -def iter_indices(tensor): - if tensor.dim() == 0: - return range(0) - if tensor.dim() == 1: - return range(tensor.size(0)) - return product(*(range(s) for s in tensor.size())) - - -def is_iterable(obj): - try: - iter(obj) - return True - except TypeError: - return False - - -# "min_satisfying_examples" setting has been deprecated in hypythesis -# 3.56.0 and removed in hypothesis 4.x -try: - import hypothesis - if hypothesis.version.__version_info__ >= (3, 56, 0): - hypothesis.settings.register_profile( - "pytorch_ci", - hypothesis.settings( - derandomize=True, - suppress_health_check=[hypothesis.HealthCheck.too_slow], - database=None, - max_examples=100, - verbosity=hypothesis.Verbosity.normal)) - hypothesis.settings.register_profile( - "dev", - hypothesis.settings( - suppress_health_check=[hypothesis.HealthCheck.too_slow], - database=None, - max_examples=10, - verbosity=hypothesis.Verbosity.normal)) - hypothesis.settings.register_profile( - "debug", - hypothesis.settings( - suppress_health_check=[hypothesis.HealthCheck.too_slow], - database=None, - max_examples=1000, - verbosity=hypothesis.Verbosity.verbose)) - else: - hypothesis.settings.register_profile( - 
"pytorch_ci", - hypothesis.settings( - derandomize=True, - suppress_health_check=[hypothesis.HealthCheck.too_slow], - database=None, - max_examples=100, - min_satisfying_examples=1, - verbosity=hypothesis.Verbosity.normal)) - hypothesis.settings.register_profile( - "dev", - hypothesis.settings( - suppress_health_check=[hypothesis.HealthCheck.too_slow], - database=None, - max_examples=10, - min_satisfying_examples=1, - verbosity=hypothesis.Verbosity.normal)) - hypothesis.settings.register_profile( - "debug", - hypothesis.settings( - suppress_health_check=[hypothesis.HealthCheck.too_slow], - database=None, - max_examples=1000, - min_satisfying_examples=1, - verbosity=hypothesis.Verbosity.verbose)) - - hypothesis.settings.load_profile( - "pytorch_ci" if IS_PYTORCH_CI else os.getenv('PYTORCH_HYPOTHESIS_PROFILE', - 'dev') - ) -except ImportError: - print('Fail to import hypothesis in common_utils, tests are not derandomized') - -disabled_test_from_issues = None -def check_disabled(test_name): - global disabled_test_from_issues - if disabled_test_from_issues is None: - disabled_test_from_issues = {} - - def read_and_process(): - url = 'https://raw.githubusercontent.com/zdevito/pytorch_disabled_tests/master/result.json' - contents = urlopen(url, timeout=1).read().decode('utf-8') - the_response = json.loads(contents) - for item in the_response['items']: - title = item['title'] - key = 'DISABLED ' - if title.startswith(key): - test_name = title[len(key):].strip() - disabled_test_from_issues[test_name] = item['html_url'] - - if not IS_SANDCASTLE and os.getenv("PYTORCH_RUN_DISABLED_TESTS", "0") != "1": - try: - read_and_process() - except Exception: - print("Couldn't download test skip set, leaving all tests enabled...") - - - if test_name in disabled_test_from_issues: - raise unittest.SkipTest( - "Test is disabled because an issue exists disabling it: {}".format(disabled_test_from_issues[test_name]) + - " To enable set the environment variable PYTORCH_RUN_DISABLED_TESTS=1") - -class TestCase(expecttest.TestCase): - precision = 1e-5 - maxDiff = None - exact_dtype = False - - def __init__(self, method_name='runTest'): - super(TestCase, self).__init__(method_name) - - @classmethod - def setUpClass(self): - self.npu_device = set_npu_device() - - def setUp(self): - if TEST_SKIP_FAST: - if not getattr(self, self._testMethodName).__dict__.get('slow_test', False): - raise unittest.SkipTest("test is fast; we disabled it with PYTORCH_TEST_SKIP_FAST") - check_disabled(str(self)) - - set_rng_seed(SEED) - - def assertTensorsSlowEqual(self, x, y, prec=None, message=''): - self.assertEqual(x.size(), y.size()) - self.assertEqual(x.dtype, y.dtype) - y = y.type_as(x) - if x.dtype == torch.bool: - self.assertEqual(x, y) - else: - max_err = 0 - for index in iter_indices(x): - max_err = max(max_err, abs(x[index] - y[index])) - self.assertLessEqual(max_err, prec, message) - - def genSparseTensor(self, size, sparse_dim, nnz, is_uncoalesced, device='cpu'): - # Assert not given impossible combination, where the sparse dims have - # empty numel, but nnz > 0 makes the indices containing values. 
- assert all(size[d] > 0 for d in range(sparse_dim)) or nnz == 0, 'invalid arguments' - - v_size = [nnz] + list(size[sparse_dim:]) - v = torch.randn(*v_size, device=device) - i = torch.rand(sparse_dim, nnz, device=device) - i.mul_(torch.tensor(size[:sparse_dim]).unsqueeze(1).to(i)) - i = i.to(torch.long) - if is_uncoalesced: - v = torch.cat([v, torch.randn_like(v)], 0) - i = torch.cat([i, i], 1) - - x = torch.sparse_coo_tensor(i, v, torch.Size(size)) - - if not is_uncoalesced: - x = x.coalesce() - else: - # FIXME: `x` is a sparse view of `v`. Currently rebase_history for - # sparse views is not implemented, so this workaround is - # needed for inplace operations done on `x`, e.g., copy_(). - # Remove after implementing something equivalent to CopySlice - # for sparse views. - # NOTE: We do clone() after detach() here because we need to be able to change size/storage of x afterwards - x = x.detach().clone() - return x, x._indices().clone(), x._values().clone() - - def safeToDense(self, t): - r = self.safeCoalesce(t) - return r.to_dense() - - def safeCoalesce(self, t): - tc = t.coalesce() - self.assertEqual(tc.to_dense(), t.to_dense()) - self.assertTrue(tc.is_coalesced()) - - # Our code below doesn't work when nnz is 0, because - # then it's a 0D tensor, not a 2D tensor. - if t._nnz() == 0: - self.assertEqual(t._indices(), tc._indices()) - self.assertEqual(t._values(), tc._values()) - return tc - - value_map = {} - for idx, val in zip(t._indices().t(), t._values()): - idx_tup = tuple(idx.tolist()) - if idx_tup in value_map: - value_map[idx_tup] += val - else: - value_map[idx_tup] = val.clone() if isinstance(val, torch.Tensor) else val - - new_indices = sorted(list(value_map.keys())) - new_values = [value_map[idx] for idx in new_indices] - if t._values().ndimension() < 2: - new_values = t._values().new(new_values) - else: - new_values = torch.stack(new_values) - - new_indices = t._indices().new(new_indices).t() - tg = t.new(new_indices, new_values, t.size()) - - self.assertEqual(tc._indices(), tg._indices()) - self.assertEqual(tc._values(), tg._values()) - - if t.is_coalesced(): - self.assertEqual(tc._indices(), t._indices()) - self.assertEqual(tc._values(), t._values()) - - return tg - - def assertRtolEqual(self, x, y, prec=None, prec16=None): - def compare_res(pre, minimum): - result = np.abs(y - x) - deno = np.maximum(np.abs(x), np.abs(y)) - result_atol = np.less_equal(result, pre) - result_rtol = np.less_equal(result / np.add(deno, minimum), pre) - if result_rtol.all() == False and result_atol.all() == False: - if np.sum(result_rtol == False) > size * pre and np.sum(result_atol == False) > size * pre: - self.fail("result error") - threshold = 1.e-4 - threshold2 = 1.e-3 - minimum16 = 6e-8 - minimum = 10e-10 - if prec is None: - prec = threshold - if prec16 is None: - prec16 = threshold2 - if torch.is_tensor(x) and torch.is_tensor(y): - x = x.numpy() - y = y.numpy() - size = x.size - if (x.shape != y.shape): - self.fail("shpae error") - if (x.dtype != y.dtype): - self.fail("dtype error") - dtype_list = [np.bool, np.uint16, np.int16, np.int32, np.float16, np.float32, np.int8, np.uint8, np.int64] - if x.dtype not in dtype_list: - self.fail("required dtype in [np.bool, np.uint16, np.int16, np.int32, np.float16, np.float32, np.int8, np.uint8, np.int64]") - if x.dtype == np.bool: - result = np.equal(x, y) - if result.all() == False: - self.fail("result error") - elif (x.dtype == np.float16): - compare_res(prec16, minimum16) - elif (x.dtype in [np.float32, np.int8, np.uint8, np.uint16, np.int16, 
np.int32, np.int64]): - compare_res(prec, minimum) - else: - self.fail("required numpy object") - - def assertEqual(self, x, y, prec=None, message='', allow_inf=False, exact_dtype=None): - if exact_dtype is None: - exact_dtype = self.exact_dtype - - if isinstance(prec, str) and message == '': - message = prec - prec = None - if prec is None: - prec = self.precision - - if isinstance(x, torch.Tensor) and isinstance(y, Number): - self.assertEqual(x.item(), y, prec=prec, message=message, - allow_inf=allow_inf, exact_dtype=exact_dtype) - elif isinstance(y, torch.Tensor) and isinstance(x, Number): - self.assertEqual(x, y.item(), prec=prec, message=message, - allow_inf=allow_inf, exact_dtype=exact_dtype) - elif isinstance(x, torch.Tensor) and isinstance(y, numpy.bool_): - self.assertEqual(x.item(), y, prec=prec, message=message, - allow_inf=allow_inf, exact_dtype=exact_dtype) - elif isinstance(y, torch.Tensor) and isinstance(x, numpy.bool_): - self.assertEqual(x, y.item(), prec=prec, message=message, - allow_inf=allow_inf, exact_dtype=exact_dtype) - elif isinstance(x, torch.Tensor) and isinstance(y, torch.Tensor): - def assertTensorsEqual(a, b): - super(TestCase, self).assertEqual(a.size(), b.size(), message) - if exact_dtype: - self.assertEqual(a.dtype, b.dtype) - if a.numel() > 0: - if (a.device.type == 'cpu' and (a.dtype == torch.float16 or a.dtype == torch.bfloat16)): - # CPU half and bfloat16 tensors don't have the methods we need below - a = a.to(torch.float32) - b = b.to(a) - - if (a.dtype == torch.bool) != (b.dtype == torch.bool): - raise TypeError("Was expecting both tensors to be bool type.") - else: - if a.dtype == torch.bool and b.dtype == torch.bool: - # we want to respect precision but as bool doesn't support subtraction, - # boolean tensor has to be converted to int - a = a.to(torch.int) - b = b.to(torch.int) - - diff = a - b - if a.dtype.is_complex or a.dtype.is_floating_point: - # check that NaNs are in the same locations - nan_mask = torch.isnan(a) - self.assertTrue(torch.equal(nan_mask, torch.isnan(b)), message) - diff[nan_mask] = 0 - # inf check if allow_inf=True - if allow_inf: - inf_mask = torch.isinf(a) - inf_sign = inf_mask.sign() - self.assertTrue(torch.equal(inf_sign, torch.isinf(b).sign()), message) - diff[inf_mask] = 0 - # TODO: implement abs on CharTensor (int8) - # TODO: modify abs to return float/double for ComplexFloat/ComplexDouble - if diff.is_signed() and diff.dtype != torch.int8: - diff = diff.abs() - # if diff is complex, the imaginary component for diff will be 0 - # from the previous step, hence converting it to float and double is fine. 
- if diff.dtype == torch.complex64: - diff = diff.to(torch.float) - elif diff.dtype == torch.complex128: - diff = diff.to(torch.double) - max_err = diff.max() - self.assertLessEqual(max_err, prec, message) - super(TestCase, self).assertEqual(x.is_sparse, y.is_sparse, message) - super(TestCase, self).assertEqual(x.is_quantized, y.is_quantized, message) - if x.is_sparse: - x = self.safeCoalesce(x) - y = self.safeCoalesce(y) - assertTensorsEqual(x._indices(), y._indices()) - assertTensorsEqual(x._values(), y._values()) - elif x.is_quantized and y.is_quantized: - self.assertEqual(x.qscheme(), y.qscheme(), prec=prec, - message=message, allow_inf=allow_inf, - exact_dtype=exact_dtype) - if x.qscheme() == torch.per_tensor_affine: - self.assertEqual(x.q_scale(), y.q_scale(), prec=prec, - message=message, allow_inf=allow_inf, - exact_dtype=exact_dtype) - self.assertEqual(x.q_zero_point(), y.q_zero_point(), - prec=prec, message=message, - allow_inf=allow_inf, exact_dtype=exact_dtype) - elif x.qscheme() == torch.per_channel_affine: - self.assertEqual(x.q_per_channel_scales(), y.q_per_channel_scales(), prec=prec, - message=message, allow_inf=allow_inf, - exact_dtype=exact_dtype) - self.assertEqual(x.q_per_channel_zero_points(), y.q_per_channel_zero_points(), - prec=prec, message=message, - allow_inf=allow_inf, exact_dtype=exact_dtype) - self.assertEqual(x.q_per_channel_axis(), y.q_per_channel_axis(), - prec=prec, message=message) - self.assertEqual(x.dtype, y.dtype) - self.assertEqual(x.int_repr().to(torch.int32), - y.int_repr().to(torch.int32), prec=prec, - message=message, allow_inf=allow_inf, - exact_dtype=exact_dtype) - else: - assertTensorsEqual(x, y) - elif isinstance(x, string_classes) and isinstance(y, string_classes): - super(TestCase, self).assertEqual(x, y, message) - elif type(x) == set and type(y) == set: - super(TestCase, self).assertEqual(x, y, message) - elif isinstance(x, dict) and isinstance(y, dict): - if isinstance(x, OrderedDict) and isinstance(y, OrderedDict): - self.assertEqual(x.items(), y.items(), prec=prec, - message=message, allow_inf=allow_inf, - exact_dtype=exact_dtype) - else: - self.assertEqual(set(x.keys()), set(y.keys()), prec=prec, - message=message, allow_inf=allow_inf, - exact_dtype=exact_dtype) - key_list = list(x.keys()) - self.assertEqual([x[k] for k in key_list], - [y[k] for k in key_list], - prec=prec, message=message, - allow_inf=allow_inf, exact_dtype=exact_dtype) - elif is_iterable(x) and is_iterable(y): - super(TestCase, self).assertEqual(len(x), len(y), message) - for x_, y_ in zip(x, y): - self.assertEqual(x_, y_, prec=prec, message=message, - allow_inf=allow_inf, exact_dtype=exact_dtype) - elif isinstance(x, bool) and isinstance(y, bool): - super(TestCase, self).assertEqual(x, y, message) - elif isinstance(x, Number) and isinstance(y, Number): - if abs(x) == inf or abs(y) == inf: - if allow_inf: - super(TestCase, self).assertEqual(x, y, message) - else: - self.fail("Expected finite numeric values - x={}, y={}".format(x, y)) - return - super(TestCase, self).assertLessEqual(abs(x - y), prec, message) - else: - super(TestCase, self).assertEqual(x, y, message) - - def assertAlmostEqual(self, x, y, places=None, msg=None, delta=None, allow_inf=None): - prec = delta - if places: - prec = 10**(-places) - self.assertEqual(x, y, prec, msg, allow_inf) - - def assertNotEqual(self, x, y, prec=None, message=''): - if isinstance(prec, str) and message == '': - message = prec - prec = None - if prec is None: - prec = self.precision - - if isinstance(x, torch.Tensor) and 
isinstance(y, torch.Tensor): - if x.size() != y.size(): - super(TestCase, self).assertNotEqual(x.size(), y.size()) - self.assertGreater(x.numel(), 0) - y = y.type_as(x) - nan_mask = x != x - if torch.equal(nan_mask, y != y): - if x.dtype == torch.bool and y.dtype == torch.bool: - x = x.to(torch.int) - y = y.to(torch.int) - diff = x - y - if diff.is_signed(): - diff = diff.abs() - diff[nan_mask] = 0 - # Use `item()` to work around: - # https://github.com/pytorch/pytorch/issues/22301 - max_err = diff.max().item() - self.assertGreaterEqual(max_err, prec, message) - elif type(x) == str and type(y) == str: - super(TestCase, self).assertNotEqual(x, y) - elif is_iterable(x) and is_iterable(y): - super(TestCase, self).assertNotEqual(x, y) - else: - try: - self.assertGreaterEqual(abs(x - y), prec, message) - return - except (TypeError, AssertionError): - pass - super(TestCase, self).assertNotEqual(x, y, message) - - def assertObjectIn(self, obj, iterable): - for elem in iterable: - if id(obj) == id(elem): - return - raise AssertionError("object not found in iterable") - - # TODO: Support context manager interface - # NB: The kwargs forwarding to callable robs the 'subname' parameter. - # If you need it, manually apply your call_fn in a lambda instead. - def assertExpectedRaises(self, exc_type, call_fn, *args, **kwargs): - subname = None - if 'subname' in kwargs: - subname = kwargs['subname'] - del kwargs['subname'] - try: - call_fn(*args, **kwargs) - except exc_type as e: - self.assertExpected(str(e), subname) - return - # Don't put this in the try block; the AssertionError will catch it - self.fail(msg="Did not raise when expected to") - - def assertNotWarn(self, call_fn, msg=''): - r""" - Test if :attr:`call_fn` does not raise a warning. - """ - with self._reset_warning_registry(), warnings.catch_warnings(record=True) as ws: - warnings.simplefilter("always") # allow any warning to be raised - call_fn() - self.assertTrue(len(ws) == 0, msg) - - def assertWarns(self, call_fn, msg=''): - r""" - Test if :attr:`call_fn` raises a warning. - """ - with self._reset_warning_registry(), warnings.catch_warnings(record=True) as ws: - warnings.simplefilter("always") # allow any warning to be raised - call_fn() - self.assertTrue(len(ws) > 0, msg) - - def assertWarnsRegex(self, call_fn, regex, msg=''): - r""" - Test if :attr:`call_fn` raises any warning with message that contains - the regex pattern :attr:`regex`. - """ - with self._reset_warning_registry(), warnings.catch_warnings(record=True) as ws: - warnings.simplefilter("always") # allow any warning to be raised - call_fn() - self.assertTrue(len(ws) > 0, msg) - found = any(re.search(regex, str(w.message)) is not None for w in ws) - self.assertTrue(found, msg) - - @contextmanager - def maybeWarnsRegex(self, category, regex=''): - """Context manager for code that *may* warn, e.g. ``TORCH_WARN_ONCE``. - - This filters expected warnings from the test log and fails the test if - any unexpected warnings are caught. 
- """ - with self._reset_warning_registry(), warnings.catch_warnings(record=True) as ws: - warnings.simplefilter("always") # allow any warning to be raised - # Ignore expected warnings - warnings.filterwarnings("ignore", message=regex, category=category) - try: - yield - finally: - if len(ws) != 0: - msg = 'Caught unexpected warnings:\n' - for w in ws: - msg += warnings.formatwarning( - w.message, w.category, w.filename, w.lineno, w.line) - msg += '\n' - self.fail(msg) - - @contextmanager - def _reset_warning_registry(self): - r""" - warnings.catch_warnings() in Python 2 misses already registered - warnings. We need to manually clear the existing warning registries to - ensure catching warnings in a scope. - """ - # Python 3 has no problem. - if sys.version_info >= (3,): - yield - return - - # Backup and clear all existing warning registries. - backup = {} - for name, mod in list(sys.modules.items()): - try: - reg = mod.__warningregistry__ - except AttributeError: - continue - else: - backup[name] = reg.copy() - reg.clear() - - yield - - # Restore backed up warning registries. - for name, reg_orig in backup.items(): - try: - mod = sys.modules[name] - except KeyError: - continue - - try: - reg = mod.__warningregistry__ - except AttributeError: - mod.__warningregistry__ = reg_orig - else: - reg.clear() - reg.update(reg_orig) - - def assertExpected(self, s, subname=None): - r""" - Test that a string matches the recorded contents of a file - derived from the name of this test and subname. This file - is placed in the 'expect' directory in the same directory - as the test script. You can automatically update the recorded test - output using --accept. - - If you call this multiple times in a single function, you must - give a unique subname each time. - """ - if not (isinstance(s, str) or (sys.version_info[0] == 2 and isinstance(s, unicode))): - raise TypeError("assertExpected is strings only") - - def remove_prefix(text, prefix): - if text.startswith(prefix): - return text[len(prefix):] - return text - # NB: we take __file__ from the module that defined the test - # class, so we place the expect directory where the test script - # lives, NOT where test/common_utils_new.py lives. 
This doesn't matter in - # PyTorch where all test scripts are in the same directory as - # test/common_utils_new.py, but it matters in onnx-pytorch - module_id = self.__class__.__module__ - munged_id = remove_prefix(self.id(), module_id + ".") - test_file = os.path.realpath(sys.modules[module_id].__file__) - expected_file = os.path.join(os.path.dirname(test_file), - "expect", - munged_id) - - subname_output = "" - if subname: - expected_file += "-" + subname - subname_output = " ({})".format(subname) - expected_file += ".expect" - expected = None - - def accept_output(update_type): - print("Accepting {} for {}{}:\n\n{}".format(update_type, munged_id, subname_output, s)) - with open(expected_file, 'w') as f: - f.write(s) - - try: - with open(expected_file) as f: - expected = f.read() - except IOError as e: - if e.errno != errno.ENOENT: - raise - elif expecttest.ACCEPT: - return accept_output("output") - else: - raise RuntimeError( - ("I got this output for {}{}:\n\n{}\n\n" - "No expect file exists; to accept the current output, run:\n" - "python {} {} --accept").format(munged_id, subname_output, s, __main__.__file__, munged_id)) - - # a hack for JIT tests - if IS_WINDOWS: - expected = re.sub(r'CppOp\[(.+?)\]', 'CppOp[]', expected) - s = re.sub(r'CppOp\[(.+?)\]', 'CppOp[]', s) - - if expecttest.ACCEPT: - if expected != s: - return accept_output("updated output") - else: - if hasattr(self, "assertMultiLineEqual"): - # Python 2.7 only - # NB: Python considers lhs "old" and rhs "new". - self.assertMultiLineEqual(expected, s) - else: - self.assertEqual(s, expected) - - def assertExpectedStripMangled(self, s, subname=None): - s = re.sub(r'__torch__[^ ]+', '', s) - self.assertExpected(s, subname) - - # returns captured stderr - @staticmethod - def runWithPytorchAPIUsageStderr(code): - env = os.environ.copy() - env["PYTORCH_API_USAGE_STDERR"] = "1" - pipes = subprocess.Popen( - [sys.executable, '-c', code], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - env=env) - return pipes.communicate()[1].decode('ascii') - - if sys.version_info < (3, 2): - # assertRegexpMatches renamed to assertRegex in 3.2 - assertRegex = unittest.TestCase.assertRegexpMatches - # assertRaisesRegexp renamed to assertRaisesRegex in 3.2 - assertRaisesRegex = unittest.TestCase.assertRaisesRegexp - - if sys.version_info < (3, 5): - # assertNotRegexpMatches renamed to assertNotRegex in 3.5 - assertNotRegex = unittest.TestCase.assertNotRegexpMatches - - -def download_file(url, binary=True): - if sys.version_info < (3,): - from urlparse import urlsplit - import urllib2 - request = urllib2 - error = urllib2 - else: - from urllib.parse import urlsplit - from urllib import request, error - - filename = os.path.basename(urlsplit(url)[2]) - data_dir = get_writable_path(os.path.join(os.path.dirname(__file__), 'data')) - path = os.path.join(data_dir, filename) - - if os.path.exists(path): - return path - try: - data = request.urlopen(url, timeout=15).read() - with open(path, 'wb' if binary else 'w') as f: - f.write(data) - return path - except error.URLError: - msg = "could not download test file '{}'".format(url) - warnings.warn(msg, RuntimeWarning) - raise unittest.SkipTest(msg) - - -def find_free_port(): - sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - sock.bind(('localhost', 0)) - sockname = sock.getsockname() - sock.close() - return sockname[1] - -# Errors that we can get in c10d initialization for which we should retry tests for. 
-ADDRESS_IN_USE = "Address already in use" -CONNECT_TIMEOUT = "connect() timed out." - -def retry_on_connect_failures(func=None, connect_errors=(ADDRESS_IN_USE)): - """Reruns a test if the test returns a RuntimeError and the exception - matches exactly with one of the strings in connect_errors. - """ - # This if block is executed when using this function as a decorator with arguments. - if func is None: - return partial(retry_on_connect_failures, connect_errors=connect_errors) - - @wraps(func) - def wrapper(*args, **kwargs): - tries_remaining = 10 - while True: - try: - return func(*args, **kwargs) - except RuntimeError as error: - if str(error) in connect_errors: - tries_remaining -= 1 - if tries_remaining == 0: - raise - time.sleep(random.random()) - continue - raise - return wrapper - - -# Decorator to retry upon certain Exceptions. -def retry(ExceptionToCheck, tries=3, delay=3): - def deco_retry(f): - @wraps(f) - def f_retry(*args, **kwargs): - mtries, mdelay = tries, delay - while mtries > 1: - try: - return f(*args, **kwargs) - except ExceptionToCheck as e: - msg = "%s, Retrying in %d seconds..." % (str(e), mdelay) - print(msg) - time.sleep(mdelay) - mtries -= 1 - return f(*args, **kwargs) - return f_retry # true decorator - return deco_retry - - -# Methods for matrix generation -# Used in test_autograd.py and test_torch.py -def prod_single_zero(dim_size): - result = torch.randn(dim_size, dim_size) - result[0, 1] = 0 - return result - - -def random_square_matrix_of_rank(l, rank, dtype=torch.double, device='cpu'): - assert rank <= l - A = torch.randn(l, l, dtype=dtype, device=device) - u, s, v = A.svd() - for i in range(l): - if i >= rank: - s[i] = 0 - elif s[i] == 0: - s[i] = 1 - return u.mm(torch.diag(s)).mm(v.transpose(0, 1)) - - -def random_symmetric_matrix(l, *batches, **kwargs): - dtype = kwargs.get('dtype', torch.double) - device = kwargs.get('device', 'cpu') - A = torch.randn(*(batches + (l, l)), dtype=dtype, device=device) - A = (A + A.transpose(-2, -1)).div_(2) - return A - - -def random_symmetric_psd_matrix(l, *batches, **kwargs): - dtype = kwargs.get('dtype', torch.double) - device = kwargs.get('device', 'cpu') - A = torch.randn(*(batches + (l, l)), dtype=dtype, device=device) - return torch.matmul(A, A.transpose(-2, -1)) - - -def random_symmetric_pd_matrix(matrix_size, *batch_dims, **kwargs): - dtype = kwargs.get('dtype', torch.double) - device = kwargs.get('device', 'cpu') - A = torch.randn(*(batch_dims + (matrix_size, matrix_size)), - dtype=dtype, device=device) - return torch.matmul(A, A.transpose(-2, -1)) \ - + torch.eye(matrix_size, dtype=dtype, device=device) * 1e-5 - - -def make_nonzero_det(A, sign=None, min_singular_value=0.1): - u, s, v = A.svd() - s.clamp_(min=min_singular_value) - A = torch.matmul(u, torch.matmul(torch.diag_embed(s), v.transpose(-2, -1))) - det = A.det() - if sign is not None: - if A.dim() == 2: - det = det.item() - if (det < 0) ^ (sign < 0): - A[0, :].neg_() - else: - cond = ((det < 0) ^ (sign < 0)).nonzero() - if cond.size(0) > 0: - for i in range(cond.size(0)): - A[list(cond[i])][0, :].neg_() - return A - - -def random_fullrank_matrix_distinct_singular_value(matrix_size, *batch_dims, - **kwargs): - dtype = kwargs.get('dtype', torch.double) - device = kwargs.get('device', 'cpu') - silent = kwargs.get("silent", False) - if silent and not torch._C.has_lapack: - return torch.ones(matrix_size, matrix_size, dtype=dtype, device=device) - - A = torch.randn(batch_dims + (matrix_size, matrix_size), dtype=dtype, device=device) - u, _, v = A.svd() - s = 
torch.arange(1., matrix_size + 1, dtype=dtype, device=device).mul_(1.0 / (matrix_size + 1)).diag() - return u.matmul(s.expand(batch_dims + (matrix_size, matrix_size)).matmul(v.transpose(-2, -1))) - - -def random_matrix(rows, columns, *batch_dims, **kwargs): - """Return rectangular matrix or batches of rectangular matrices. - - Parameters: - dtype - the data type - device - the device kind - singular - when True, the output will be singular - """ - dtype = kwargs.get('dtype', torch.double) - device = kwargs.get('device', 'cpu') - silent = kwargs.get("silent", False) - singular = kwargs.get("singular", False) - if silent and not torch._C.has_lapack: - return torch.ones(rows, columns, dtype=dtype, device=device) - - A = torch.randn(batch_dims + (rows, columns), dtype=dtype, device=device) - u, _, v = A.svd(some=False) - s = torch.zeros(rows, columns, dtype=dtype, device=device) - k = min(rows, columns) - for i in range(k): - s[i, i] = float(i + 1) / (k + 1) - if singular: - # make matrix singular - s[k - 1, k - 1] = 0 - if k > 2: - # increase the order of singularity so that the pivoting - # in LU factorization will be non-trivial - s[0, 0] = 0 - return u.matmul(s.expand(batch_dims + (rows, columns)).matmul(v.transpose(-2, -1))) - - -def random_lowrank_matrix(rank, rows, columns, *batch_dims, **kwargs): - """Return rectangular matrix or batches of rectangular matrices with - given rank. - """ - B = random_matrix(rows, rank, *batch_dims, **kwargs) - C = random_matrix(rank, columns, *batch_dims, **kwargs) - return B.matmul(C) - - -def random_sparse_matrix(rows, columns, density=0.01, **kwargs): - """Return rectangular random sparse matrix within given density. - - The density of the result approaches to given density as the size - of the matrix is increased and a relatively small value of density - is specified but higher than min(rows, columns)/(rows * columns) - for non-singular matrices. - """ - dtype = kwargs.get('dtype', torch.double) - device = kwargs.get('device', 'cpu') - singular = kwargs.get("singular", False) - - k = min(rows, columns) - nonzero_elements = max(min(rows, columns), int(rows * columns * density)) - - row_indices = [i % rows for i in range(nonzero_elements)] - column_indices = [i % columns for i in range(nonzero_elements)] - random.shuffle(column_indices) - indices = [row_indices, column_indices] - values = torch.randn(nonzero_elements, dtype=dtype, device=device) - # ensure that the diagonal dominates - values *= torch.tensor([-float(i - j)**2 for i, j in zip(*indices)], dtype=dtype, device=device).exp() - A = torch.sparse_coo_tensor(indices, values, (rows, columns), device=device) - return A.coalesce() - - -def random_sparse_pd_matrix(matrix_size, density=0.01, **kwargs): - """Return random sparse positive-definite matrix with given density. 
- - The eigenvalues of the matrix are defined as:: - arange(1, matrix_size+1)/matrix_size - - Algorithm: - A = diag(arange(1, matrix_size+1)/matrix_size) - while : - - R = - A = R^T A R - """ - import math - torch_module = kwargs.get('torch', globals()['torch']) - dtype = kwargs.get('dtype', torch_module.double) - device = kwargs.get('device', 'cpu') - data = dict([((i, i), float(i + 1) / matrix_size) - for i in range(matrix_size)]) - - - def multiply(data, N, i, j, cs, sn, left=True): - for k in range(N): - if left: - ik, jk = (k, i), (k, j) - else: - ik, jk = (i, k), (j, k) - aik, ajk = data.get(ik, 0), data.get(jk, 0) - aik, ajk = cs * aik + sn * ajk, -sn * aik + cs * ajk - if aik: - data[ik] = aik - else: - data.pop(ik, None) - if ajk: - data[jk] = ajk - else: - data.pop(jk, None) - - target_nnz = density * matrix_size * matrix_size - while len(data) < target_nnz: - i = random.randint(0, matrix_size - 1) - j = random.randint(0, matrix_size - 1) - if i != j: - theta = random.uniform(0, 2 * math.pi) - cs = math.cos(theta) - sn = math.sin(theta) - multiply(data, matrix_size, i, j, cs, sn, left=True) - multiply(data, matrix_size, i, j, cs, sn, left=False) - icoords, jcoords, values = [], [], [] - for (i, j), v in sorted(data.items()): - icoords.append(i) - jcoords.append(j) - values.append(v) - indices = [icoords, jcoords] - return torch_module.sparse_coo_tensor(indices, values, (matrix_size, matrix_size), dtype=dtype, device=device) - - -def do_test_dtypes(self, dtypes, layout, device): - for dtype in dtypes: - if dtype != torch.float16: - out = torch.zeros((2, 3), dtype=dtype, layout=layout, device=device) - self.assertIs(dtype, out.dtype) - self.assertIs(layout, out.layout) - self.assertEqual(device, out.device) - - -def do_test_empty_full(self, dtypes, layout, device): - shape = torch.Size([2, 3]) - - def check_value(tensor, dtype, layout, device, value, requires_grad): - self.assertEqual(shape, tensor.shape) - self.assertIs(dtype, tensor.dtype) - self.assertIs(layout, tensor.layout) - self.assertEqual(tensor.requires_grad, requires_grad) - if value is not None: - fill = tensor.new(shape).fill_(value) - self.assertEqual(tensor, fill) - - def get_int64_dtype(dtype): - module = '.'.join(str(dtype).split('.')[1:-1]) - if not module: - return torch.int64 - return operator.attrgetter(module)(torch).int64 - - default_dtype = torch.get_default_dtype() - check_value(torch.empty(shape), default_dtype, torch.strided, -1, None, False) - check_value(torch.full(shape, -5), default_dtype, torch.strided, -1, None, False) - for dtype in dtypes: - for rg in {dtype.is_floating_point, False}: - int64_dtype = get_int64_dtype(dtype) - v = torch.empty(shape, dtype=dtype, device=device, layout=layout, requires_grad=rg) - check_value(v, dtype, layout, device, None, rg) - out = v.new() - check_value(torch.empty(shape, out=out, device=device, layout=layout, requires_grad=rg), - dtype, layout, device, None, rg) - check_value(v.new_empty(shape), dtype, layout, device, None, False) - check_value(v.new_empty(shape, dtype=int64_dtype, device=device, requires_grad=False), - int64_dtype, layout, device, None, False) - check_value(torch.empty_like(v), dtype, layout, device, None, False) - check_value(torch.empty_like(v, dtype=int64_dtype, layout=layout, device=device, requires_grad=False), - int64_dtype, layout, device, None, False) - - if dtype is not torch.float16 and layout != torch.sparse_coo: - fv = 3 - v = torch.full(shape, fv, dtype=dtype, layout=layout, device=device, requires_grad=rg) - check_value(v, dtype, 
layout, device, fv, rg) - check_value(v.new_full(shape, fv + 1), dtype, layout, device, fv + 1, False) - out = v.new() - check_value(torch.full(shape, fv + 2, out=out, device=device, layout=layout, requires_grad=rg), - dtype, layout, device, fv + 2, rg) - check_value(v.new_full(shape, fv + 3, dtype=int64_dtype, device=device, requires_grad=False), - int64_dtype, layout, device, fv + 3, False) - check_value(torch.full_like(v, fv + 4), dtype, layout, device, fv + 4, False) - check_value(torch.full_like(v, fv + 5, - dtype=int64_dtype, layout=layout, device=device, requires_grad=False), - int64_dtype, layout, device, fv + 5, False) - - - - -THESE_TAKE_WAY_TOO_LONG = { - 'test_Conv3d_groups', - 'test_conv_double_backward', - 'test_conv_double_backward_groups', - 'test_Conv3d_dilated', - 'test_Conv3d_stride_padding', - 'test_Conv3d_dilated_strided', - 'test_Conv3d', - 'test_Conv2d_dilated', - 'test_ConvTranspose3d_dilated', - 'test_ConvTranspose2d_dilated', - 'test_snli', - 'test_Conv2d', - 'test_Conv2d_padding', - 'test_ConvTranspose2d_no_bias', - 'test_ConvTranspose2d', - 'test_ConvTranspose3d', - 'test_Conv2d_no_bias', - 'test_matmul_4d_4d', - 'test_multinomial_invalid_probs', -} - - -running_script_path = None - - -def set_running_script_path(): - global running_script_path - try: - running_file = os.path.abspath(os.path.realpath(sys.argv[0])) - if running_file.endswith('.py'): # skip if the running file is not a script - running_script_path = running_file - except Exception: - pass - - -def check_test_defined_in_running_script(test_case): - if running_script_path is None: - return - test_case_class_file = os.path.abspath(os.path.realpath(inspect.getfile(test_case.__class__))) - assert test_case_class_file == running_script_path, "Class of loaded TestCase \"{}\" " \ - "is not defined in the running script \"{}\", but in \"{}\". Did you " \ - "accidentally import a unittest.TestCase from another file?".format( - test_case.id(), running_script_path, test_case_class_file) - - -def load_tests(loader, tests, pattern): - set_running_script_path() - test_suite = unittest.TestSuite() - for test_group in tests: - for test in test_group: - check_test_defined_in_running_script(test) - test_suite.addTest(test) - return test_suite - - -class BytesIOContext(io.BytesIO): - def __enter__(self): - return self - - def __exit__(self, *args): - pass - -def _assertGradAndGradgradChecks(test_case, apply_fn, inputs): - # call assert function rather than returning a bool since it's nicer - # if we get whether this failed on the gradcheck or the gradgradcheck. - test_case.assertTrue(gradcheck(apply_fn, inputs)) - test_case.assertTrue(gradgradcheck(apply_fn, inputs)) - - -# Using @PrecisionOverride specific to your test is the recommended way -# of doing this. These are just some values that worked for test_nn. -dtype2prec_DONTUSE = {torch.float: 1e-5, - torch.double: 1e-5, - torch.half: 1e-2, - torch.bfloat16: 1e-1} + return skipper \ No newline at end of file diff --git a/torch_npu/testing/decorator.py b/torch_npu/testing/decorator.py new file mode 100644 index 0000000000000000000000000000000000000000..baccd03e86c1d0ced91a9ece04ad903a83210d52 --- /dev/null +++ b/torch_npu/testing/decorator.py @@ -0,0 +1,94 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. +# All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import wraps + +import inspect +import itertools +import torch + + +def feed_data(func, new_name, *args, **kwargs): + """ + This internal method decorator feeds the test data item to the test. + """ + @wraps(func) + def wrapper(self): + return func(self, *args, **kwargs) + wrapper.__name__ = new_name + wrapper.__wrapped__ = func + return wrapper + + +def instantiate_tests(arg=None, **kwargs): + + def wrapper(cls): + for name, func in list(cls.__dict__.items()): + data = {} + if hasattr(func, "dtypes"): + data['dtype'] = func.dtypes + if hasattr(func, "formats"): + data['format'] = func.formats + + key_list = data.keys() + if not key_list: + continue + + func_args = inspect.getfullargspec(func).args + value_list = [data.get(key) for key in key_list] + for value in itertools.product(*value_list): + new_kwargs = dict(device="npu") if "device" in func_args else {} + test_name = name + for k, v in zip(key_list, value): + func_key = None + if k == "format": + test_name += ("_" + str(v)) + if k == "dtype": + test_name += ("_" + str(v).split('.')[1]) + for _func_key in func_args: + if k in _func_key: + assert func_key is None, f"Multiple matches for {k}" + func_key = _func_key + new_kwargs[func_key] = v + setattr(cls, test_name, feed_data(func, test_name, **new_kwargs)) + + delattr(cls, name) + return cls + + return wrapper(arg) + + +class Dtypes(object): + + def __init__(self, *args): + assert args is not None and len(args) != 0, "No dtypes given" + assert all(isinstance(arg, torch.dtype) for arg in args), "Unknown dtype in {0}".format(str(args)) + self.args = args + + def __call__(self, fn): + fn.dtypes = self.args + return fn + + +class Formats(object): + + def __init__(self, *args): + assert args is not None and len(args) != 0, "No formats given" + self.args = args + + def __call__(self, fn): + fn.formats = self.args + return fn diff --git a/torch_npu/testing/testcase.py b/torch_npu/testing/testcase.py new file mode 100644 index 0000000000000000000000000000000000000000..c080247e73842f504d71496bfc160e55938e1789 --- /dev/null +++ b/torch_npu/testing/testcase.py @@ -0,0 +1,495 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. +# All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +r"""Importing this file must **not** initialize NPU context. test_distributed +relies on this assumption to properly run. This means that when this is imported +no NPU calls shall be made, including torch.npu.device_count(), etc. + +torch.testing._internal.common_npu.py can freely initialize NPU context when imported. 
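The new `torch_npu/testing/decorator.py` shown above replaces the removed `instantiate_device_type_tests` flow with class-level instantiation. The following is a minimal sketch of how its `Dtypes`, `Formats` and `instantiate_tests` helpers appear intended to be combined; the class name, test name and tensor shapes are invented for illustration, and an available NPU device is assumed:

```
import torch
import torch_npu  # noqa: F401  (registers the NPU backend used by the testing helpers)

from torch_npu.testing.testcase import TestCase, run_tests
from torch_npu.testing.decorator import Dtypes, Formats, instantiate_tests


@instantiate_tests
class TestScale(TestCase):

    @Dtypes(torch.float32, torch.int32)
    @Formats(0, 3)
    def test_scale(self, device="npu", dtype=None, format=None):
        # instantiate_tests removes test_scale and generates one method per
        # (dtype, format) combination, e.g. test_scale_float32_0, feeding the
        # values in through the matching keyword arguments.
        x = torch.ones(2, 3, dtype=dtype)
        self.assertEqual((x * 2).dtype, dtype)


if __name__ == "__main__":
    run_tests()
```

Parameter values are routed by substring match on the argument names (`dtype`, `format`), so each generated method receives its combination without any manual bookkeeping in the test body.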
+""" +from collections import OrderedDict +from contextlib import contextmanager +from numbers import Number + +import sys +import os +import re +import unittest +import warnings +import random +import subprocess +import __main__ +import torch +import expecttest +import numpy as np + +from torch._six import string_classes, inf + +from torch_npu.testing.common_utils import set_npu_device, is_iterable, iter_indices, IS_WINDOWS + + +def run_tests(): + argv = [sys.argv[0]] + unittest.main(argv=argv) + + +class TestCase(expecttest.TestCase): + _precision = 1e-5 + maxDiff = None + exact_dtype = False + + def __init__(self, method_name='runTest'): + super(TestCase, self).__init__(method_name) + + @property + def precision(self): + return self._precision + + @precision.setter + def precision(self, prec): + self._precision = prec + + @classmethod + def setUpClass(self): + self.npu_device = set_npu_device() + + def setUp(self): + seed = int(os.getenv('SEED', "666")) + torch.manual_seed(seed) + random.seed(seed) + + def assertTensorsSlowEqual(self, x, y, prec=None, message=''): + self.assertEqual(x.size(), y.size()) + self.assertEqual(x.dtype, y.dtype) + y = y.type_as(x) + if x.dtype == torch.bool: + self.assertEqual(x, y) + else: + max_err = 0 + for index in iter_indices(x): + max_err = max(max_err, abs(x[index] - y[index])) + self.assertLessEqual(max_err, prec, message) + + def genSparseTensor(self, size, sparse_dim, nnz, is_uncoalesced, device='cpu'): + # Assert not given impossible combination, where the sparse dims have + # empty numel, but nnz > 0 makes the indices containing values. + assert all(size[d] > 0 for d in range(sparse_dim)) or nnz == 0, 'invalid arguments' + + v_size = [nnz] + list(size[sparse_dim:]) + v = torch.randn(*v_size, device=device) + i = torch.rand(sparse_dim, nnz, device=device) + i.mul_(torch.tensor(size[:sparse_dim]).unsqueeze(1).to(i)) + i = i.to(torch.long) + if is_uncoalesced: + v = torch.cat([v, torch.randn_like(v)], 0) + i = torch.cat([i, i], 1) + + x = torch.sparse_coo_tensor(i, v, torch.Size(size)) + + if not is_uncoalesced: + x = x.coalesce() + else: + # FIXME: `x` is a sparse view of `v`. Currently rebase_history for + # sparse views is not implemented, so this workaround is + # needed for inplace operations done on `x`, e.g., copy_(). + # Remove after implementing something equivalent to CopySlice for sparse views. + # NOTE: We do clone() after detach() here because we need to be able to change size/storage of x afterwards + x = x.detach().clone() + return x, x._indices().clone(), x._values().clone() + + def safeToDense(self, t): + r = self.safeCoalesce(t) + return r.to_dense() + + def safeCoalesce(self, t): + tc = t.coalesce() + self.assertEqual(tc.to_dense(), t.to_dense()) + self.assertTrue(tc.is_coalesced()) + + # Our code below doesn't work when nnz is 0, because + # then it's a 0D tensor, not a 2D tensor. 
+        if t._nnz() == 0:
+            self.assertEqual(t._indices(), tc._indices())
+            self.assertEqual(t._values(), tc._values())
+            return tc
+
+        value_map = {}
+        for idx, val in zip(t._indices().t(), t._values()):
+            idx_tup = tuple(idx.tolist())
+            if idx_tup in value_map:
+                value_map[idx_tup] += val
+            else:
+                value_map[idx_tup] = val.clone() if isinstance(val, torch.Tensor) else val
+
+        new_indices = sorted(list(value_map.keys()))
+        new_values = [value_map.get(idx) for idx in new_indices]
+        if t._values().ndimension() < 2:
+            new_values = t._values().new(new_values)
+        else:
+            new_values = torch.stack(new_values)
+
+        new_indices = t._indices().new(new_indices).t()
+        tg = t.new(new_indices, new_values, t.size())
+
+        self.assertEqual(tc._indices(), tg._indices())
+        self.assertEqual(tc._values(), tg._values())
+
+        if t.is_coalesced():
+            self.assertEqual(tc._indices(), t._indices())
+            self.assertEqual(tc._values(), t._values())
+
+        return tg
+
+    def assertRtolEqual(self, x, y, prec=None, prec16=None):
+        def compare_res(pre, minimum):
+            # Element-wise absolute/relative tolerance check; fail only when
+            # more than size * pre elements miss the relative tolerance and
+            # more than size * pre elements miss the absolute tolerance.
+            result = np.abs(y - x)
+            deno = np.maximum(np.abs(x), np.abs(y))
+            result_atol = np.less_equal(result, pre)
+            result_rtol = np.less_equal(result / np.add(deno, minimum), pre)
+            if not result_rtol.all() and not result_atol.all():
+                if np.sum(~result_rtol) > size * pre and np.sum(~result_atol) > size * pre:
+                    self.fail("result error")
+        threshold = 1.e-4
+        threshold2 = 1.e-3
+        minimum16 = 6e-8
+        minimum = 10e-10
+        if prec is None:
+            prec = threshold
+        if prec16 is None:
+            prec16 = threshold2
+        if torch.is_tensor(x) and torch.is_tensor(y):
+            x = x.numpy()
+            y = y.numpy()
+        size = x.size
+        if (x.shape != y.shape):
+            self.fail("shape error")
+        if (x.dtype != y.dtype):
+            self.fail("dtype error")
+        dtype_list = [np.bool, np.uint16, np.int16, np.int32, np.float16, np.float32, np.int8, np.uint8, np.int64]
+        if x.dtype not in dtype_list:
+            self.fail("required dtype in [np.bool, np.uint16, np.int16, " +
+                      "np.int32, np.float16, np.float32, np.int8, np.uint8, np.int64]")
+        if x.dtype == np.bool:
+            result = np.equal(x, y)
+            if not result.all():
+                self.fail("result error")
+        elif (x.dtype == np.float16):
+            compare_res(prec16, minimum16)
+        elif (x.dtype in [np.float32, np.int8, np.uint8, np.uint16, np.int16, np.int32, np.int64]):
+            compare_res(prec, minimum)
+        else:
+            self.fail("required numpy object")
+
+    def _assert_tensor_equal(self, a, b, message, exact_dtype, allow_inf, prec):
+        super(TestCase, self).assertEqual(a.size(), b.size(), message)
+        if exact_dtype:
+            self.assertEqual(a.dtype, b.dtype)
+        if a.numel() > 0:
+            if (a.device.type == 'cpu' and (a.dtype == torch.float16 or a.dtype == torch.bfloat16)):
+                # CPU half and bfloat16 tensors don't have the methods we need below
+                a = a.to(torch.float32)
+            b = b.to(a)
+
+            if (a.dtype == torch.bool) != (b.dtype == torch.bool):
+                raise TypeError("Was expecting both tensors to be bool type.")
+            else:
+                if a.dtype == torch.bool and b.dtype == torch.bool:
+                    # we want to respect precision but as bool doesn't support subtraction,
+                    # boolean tensor has to be converted to int
+                    a = a.to(torch.int)
+                    b = b.to(torch.int)
+
+                diff = a - b
+                if a.dtype.is_complex or a.dtype.is_floating_point:
+                    # check that NaNs are in the same locations
+                    nan_mask = torch.isnan(a)
+                    self.assertTrue(torch.equal(nan_mask, torch.isnan(b)), message)
+                    diff[nan_mask] = 0
+                    # inf check if allow_inf=True
+                    if allow_inf:
+                        inf_mask = torch.isinf(a)
+                        inf_sign = inf_mask.sign()
+                        self.assertTrue(torch.equal(inf_sign, torch.isinf(b).sign()), message)
+ diff[inf_mask] = 0 + # TODO: implement abs on CharTensor (int8) + # TODO: modify abs to return float/double for ComplexFloat/ComplexDouble + if diff.is_signed() and diff.dtype != torch.int8: + diff = diff.abs() + # if diff is complex, the imaginary component for diff will be 0 + # from the previous step, hence converting it to float and double is fine. + if diff.dtype == torch.complex64: + diff = diff.to(torch.float) + elif diff.dtype == torch.complex128: + diff = diff.to(torch.double) + max_err = diff.max() + self.assertLessEqual(max_err, prec, message) + + def _assertNumberEqual(self, x, y, prec=None, message='', allow_inf=False, exact_dtype=None): + if isinstance(x, torch.Tensor) and isinstance(y, Number): + self._assertNumberEqual(x.item(), y, prec=prec, message=message, + allow_inf=allow_inf, exact_dtype=exact_dtype) + + elif isinstance(y, torch.Tensor) and isinstance(x, Number): + self._assertNumberEqual(x, y.item(), prec=prec, message=message, + allow_inf=allow_inf, exact_dtype=exact_dtype) + + else: + if abs(x) == inf or abs(y) == inf: + if allow_inf: + super(TestCase, self).assertEqual(x, y, message) + else: + self.fail("Expected finite numeric values - x={}, y={}".format(x, y)) + return + super(TestCase, self).assertLessEqual(abs(x - y), prec, message) + + def _assertBoolEqual(self, x, y, prec=None, message='', allow_inf=False, exact_dtype=None): + if isinstance(x, torch.Tensor) and isinstance(y, np.bool_): + self._assertBoolEqual(x.item(), y, prec=prec, message=message, + allow_inf=allow_inf, exact_dtype=exact_dtype) + elif isinstance(y, torch.Tensor) and isinstance(x, np.bool_): + self._assertBoolEqual(x, y.item(), prec=prec, message=message, + allow_inf=allow_inf, exact_dtype=exact_dtype) + else: + super(TestCase, self).assertEqual(x, y, message) + + def _assertTensorsEqual(self, x, y, prec=None, message='', allow_inf=False, exact_dtype=None): + super(TestCase, self).assertEqual(x.is_sparse, y.is_sparse, message) + super(TestCase, self).assertEqual(x.is_quantized, y.is_quantized, message) + if x.is_sparse: + x = self.safeCoalesce(x) + y = self.safeCoalesce(y) + self._assert_tensor_equal(x._indices(), y._indices(), message, exact_dtype, allow_inf, prec) + self._assert_tensor_equal(x._values(), y._values(), message, exact_dtype, allow_inf, prec) + elif x.is_quantized and y.is_quantized: + self.assertEqual(x.qscheme(), y.qscheme(), prec=prec, + message=message, allow_inf=allow_inf, + exact_dtype=exact_dtype) + if x.qscheme() == torch.per_tensor_affine: + self.assertEqual(x.q_scale(), y.q_scale(), prec=prec, + message=message, allow_inf=allow_inf, + exact_dtype=exact_dtype) + self.assertEqual(x.q_zero_point(), y.q_zero_point(), + prec=prec, message=message, + allow_inf=allow_inf, exact_dtype=exact_dtype) + elif x.qscheme() == torch.per_channel_affine: + self.assertEqual(x.q_per_channel_scales(), y.q_per_channel_scales(), prec=prec, + message=message, allow_inf=allow_inf, + exact_dtype=exact_dtype) + self.assertEqual(x.q_per_channel_zero_points(), y.q_per_channel_zero_points(), + prec=prec, message=message, + allow_inf=allow_inf, exact_dtype=exact_dtype) + self.assertEqual(x.q_per_channel_axis(), y.q_per_channel_axis(), + prec=prec, message=message) + self.assertEqual(x.dtype, y.dtype) + self.assertEqual(x.int_repr().to(torch.int32), + y.int_repr().to(torch.int32), prec=prec, + message=message, allow_inf=allow_inf, + exact_dtype=exact_dtype) + else: + self._assert_tensor_equal(x, y, message, exact_dtype, allow_inf, prec) + + def assertEqual(self, x, y, prec=None, message='', 
allow_inf=False, exact_dtype=None): + if exact_dtype is None: + exact_dtype = self.exact_dtype + + if isinstance(prec, str) and message == '': + message = prec + prec = None + if prec is None: + prec = self.precision + + def _assertEqual(x, y, prec=None, message='', allow_inf=False, exact_dtype=None): + if isinstance(x, Number) or isinstance(y, Number): + self._assertNumberEqual(x, y, prec=prec, message=message, + allow_inf=allow_inf, exact_dtype=exact_dtype) + elif isinstance(x, np.bool_) or isinstance(y, np.bool_): + self._assertBoolEqual(x, y, prec=prec, message=message, + allow_inf=allow_inf, exact_dtype=exact_dtype) + elif isinstance(x, torch.Tensor) and isinstance(y, torch.Tensor): + self._assertTensorsEqual(x, y, prec=prec, message=message, + allow_inf=allow_inf, exact_dtype=exact_dtype) + elif isinstance(x, string_classes) and isinstance(y, string_classes): + super(TestCase, self).assertEqual(x, y, message) + elif type(x) == set and type(y) == set: + super(TestCase, self).assertEqual(x, y, message) + elif isinstance(x, dict) and isinstance(y, dict): + if isinstance(x, OrderedDict) and isinstance(y, OrderedDict): + _assertEqual(x.items(), y.items(), prec=prec, + message=message, allow_inf=allow_inf, + exact_dtype=exact_dtype) + else: + _assertEqual(set(x.keys()), set(y.keys()), prec=prec, + message=message, allow_inf=allow_inf, + exact_dtype=exact_dtype) + key_list = list(x.keys()) + _assertEqual([x[k] for k in key_list], + [y[k] for k in key_list], + prec=prec, message=message, + allow_inf=allow_inf, exact_dtype=exact_dtype) + elif is_iterable(x) and is_iterable(y): + super(TestCase, self).assertEqual(len(x), len(y), message) + for x_, y_ in zip(x, y): + _assertEqual(x_, y_, prec=prec, message=message, + allow_inf=allow_inf, exact_dtype=exact_dtype) + else: + super(TestCase, self).assertEqual(x, y, message) + + _assertEqual(x, y, prec=prec, message=message, allow_inf=allow_inf, exact_dtype=exact_dtype) + + def assertAlmostEqual(self, x, y, places=None, msg=None, delta=None, allow_inf=None): + prec = delta + if places: + prec = 10**(-places) + self.assertEqual(x, y, prec, msg, allow_inf) + + def assertNotEqual(self, x, y, prec=None, message=''): + if isinstance(prec, str) and message == '': + message = prec + prec = None + if prec is None: + prec = self.precision + + if isinstance(x, torch.Tensor) and isinstance(y, torch.Tensor): + if x.size() != y.size(): + super(TestCase, self).assertNotEqual(x.size(), y.size()) + self.assertGreater(x.numel(), 0) + y = y.type_as(x) + nan_mask = x != x + if torch.equal(nan_mask, y != y): + if x.dtype == torch.bool and y.dtype == torch.bool: + x = x.to(torch.int) + y = y.to(torch.int) + diff = x - y + if diff.is_signed(): + diff = diff.abs() + diff[nan_mask] = 0 + # Use `item()` to work around: + # https://github.com/pytorch/pytorch/issues/22301 + max_err = diff.max().item() + self.assertGreaterEqual(max_err, prec, message) + elif type(x) == str and type(y) == str: + super(TestCase, self).assertNotEqual(x, y) + elif is_iterable(x) and is_iterable(y): + super(TestCase, self).assertNotEqual(x, y) + else: + try: + self.assertGreaterEqual(abs(x - y), prec, message) + return + except (TypeError, AssertionError): + pass + super(TestCase, self).assertNotEqual(x, y, message) + + def assertObjectIn(self, obj, iterable): + for elem in iterable: + if id(obj) == id(elem): + return + raise AssertionError("object not found in iterable") + + # TODO: Support context manager interface + # NB: The kwargs forwarding to callable robs the 'subname' parameter. 
+ # If you need it, manually apply your call_fn in a lambda instead. + def assertExpectedRaises(self, exc_type, call_fn, *args, **kwargs): + subname = None + if 'subname' in kwargs: + subname = kwargs.get('subname') + del kwargs['subname'] + try: + call_fn(*args, **kwargs) + except exc_type as e: + self.assertExpected(str(e), subname) + return + # Don't put this in the try block; the AssertionError will catch it + self.fail(msg="Did not raise when expected to") + + def assertNotWarn(self, call_fn, msg=''): + r""" + Test if :attr:`call_fn` does not raise a warning. + """ + with self._reset_warning_registry(), warnings.catch_warnings(record=True) as ws: + warnings.simplefilter("always") # allow any warning to be raised + call_fn() + self.assertTrue(len(ws) == 0, msg) + + def assertWarns(self, call_fn, msg=''): + r""" + Test if :attr:`call_fn` raises a warning. + """ + with self._reset_warning_registry(), warnings.catch_warnings(record=True) as ws: + warnings.simplefilter("always") # allow any warning to be raised + call_fn() + self.assertTrue(len(ws) > 0, msg) + + def assertWarnsRegex(self, call_fn, regex, msg=''): + r""" + Test if :attr:`call_fn` raises any warning with message that contains + the regex pattern :attr:`regex`. + """ + with self._reset_warning_registry(), warnings.catch_warnings(record=True) as ws: + warnings.simplefilter("always") # allow any warning to be raised + call_fn() + self.assertTrue(len(ws) > 0, msg) + found = any(re.search(regex, str(w.message)) is not None for w in ws) + self.assertTrue(found, msg) + + @contextmanager + def maybeWarnsRegex(self, category, regex=''): + """Context manager for code that *may* warn, e.g. ``TORCH_WARN_ONCE``. + + This filters expected warnings from the test log and fails the test if + any unexpected warnings are caught. + """ + with self._reset_warning_registry(), warnings.catch_warnings(record=True) as ws: + warnings.simplefilter("always") # allow any warning to be raised + # Ignore expected warnings + warnings.filterwarnings("ignore", message=regex, category=category) + try: + yield + finally: + if len(ws) != 0: + msg = 'Caught unexpected warnings:\n' + for w in ws: + msg += warnings.formatwarning( + w.message, w.category, w.filename, w.lineno, w.line) + msg += '\n' + self.fail(msg) + + @contextmanager + def _reset_warning_registry(self): + r""" + warnings.catch_warnings() in Python 2 misses already registered + warnings. We need to manually clear the existing warning registries to + ensure catching warnings in a scope. + """ + # Python 3 has no problem. + if sys.version_info >= (3,): + yield + return + + def assertExpectedStripMangled(self, s, subname=None): + s = re.sub(r'__torch__[^ ]+', '', s) + self.assertExpected(s, subname) + + # returns captured stderr + @staticmethod + def runWithPytorchAPIUsageStderr(code): + env = os.environ.copy() + env["PYTORCH_API_USAGE_STDERR"] = "1" + pipes = subprocess.Popen( + [sys.executable, '-c', code], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env=env) + return pipes.communicate()[1].decode('ascii') diff --git a/torch_npu/testing/util_test.py b/torch_npu/testing/util_test.py deleted file mode 100644 index 1f0534f2da6db854c915f25c972f786ebba1f99d..0000000000000000000000000000000000000000 --- a/torch_npu/testing/util_test.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright (c) 2020 Huawei Technologies Co., Ltd -# Copyright (c) 2019, Facebook CORPORATION. -# All rights reserved. 
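The new `torch_npu/testing/testcase.py` above bundles the tolerance-based comparison and warning assertions that the migrated tests rely on. A rough usage sketch follows; the operator, shapes and warning message are placeholders, and an available NPU device is assumed:

```
import warnings

import numpy as np
import torch
import torch_npu  # noqa: F401  (registers the NPU backend)

from torch_npu.testing.testcase import TestCase, run_tests


class TestHelpers(TestCase):

    def test_mul_tolerance(self, device="npu"):
        x = np.random.uniform(-2, 2, (4, 16)).astype(np.float32)
        cpu_out = (torch.from_numpy(x) * 0.5).numpy()
        npu_out = (torch.from_numpy(x).npu() * 0.5).cpu().numpy()
        # assertRtolEqual checks shape and dtype first, then applies the fp32
        # (prec) or fp16 (prec16) tolerance element-wise.
        self.assertRtolEqual(cpu_out, npu_out)

    def test_warning_is_reported(self, device="npu"):
        # assertWarnsRegex runs the callable and requires at least one warning
        # whose message matches the given pattern.
        self.assertWarnsRegex(lambda: warnings.warn("fallback path is deprecated"),
                              "deprecated")


if __name__ == "__main__":
    run_tests()
```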
-# -# Licensed under the BSD 3-Clause License (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://opensource.org/licenses/BSD-3-Clause -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch -import torch_npu -import numpy as np -import os - - -UT_FAST_MODE = os.getenv('UT_FAST_MODE') == '1' - - -def set_npu_device(): - npu_device = get_npu_device() - torch.npu.set_device(npu_device) - print(f"Your device is {npu_device}") - return npu_device - - -def get_npu_device(): - npu_device = os.environ.get('SET_NPU_DEVICE') - if npu_device is None: - npu_device = "npu:0" - else: - npu_device = f"npu:{npu_device}" - return npu_device - - -def create_common_tensor(item, minValue, maxValue, device=None): - if device is None: - device = get_npu_device() - - dtype = item[0] - npu_format = item[1] - shape = item[2] - input1 = np.random.uniform(minValue, maxValue, shape).astype(dtype) - cpu_input = torch.from_numpy(input1) - npu_input = torch.from_numpy(input1).to(device) - if npu_format != -1: - npu_input = torch_npu.npu_format_cast(npu_input, npu_format) - return cpu_input, npu_input - -def compare_res_new(cpu_output, npu_output, testcase_name): - if cpu_output.shape != npu_output.shape: - return print("result shape error!", cpu_output.shape, npu_output.shape) - if cpu_output.dtype != npu_output.dtype: - return print("result dtype error!", cpu_output.dtype, npu_output.dtype) - if cpu_output.dtype == np.int32: - result = np.equal(cpu_output, npu_output) - if result is False: - return print('testcase_name={0}, npu datatype={1} shape={2} fails!'.format( - testcase_name, npu_output.dtype, npu_output.shape)) - elif cpu_output.dtype == np.float16: - result = np.allclose(npu_output, cpu_output, 0.0001, 0) - if result is False: - return print('testcase_name={0}, npu datatype={1} shape={2} fails!'.format( - testcase_name, npu_output.dtype, npu_output.shape)) - elif cpu_output.dtype == np.float32: - result = np.allclose(npu_output, cpu_output, 0.0001, 0) - print(npu_output, cpu_output) - print(result) - if not result: - return print('testcase_name={0}, npu datatype={1} shape={2} fails!'.format( - testcase_name, npu_output.dtype, npu_output.shape)) - print('testcase_name={0}, datatype={1} shape={2} pass!'.format(testcase_name, cpu_output.dtype, cpu_output.shape)) - - -def __generate_2args_broadcast_cases(device=None): - if device is None: - device = get_npu_device() - - # Set broadcast and no axis, i.e. broadcasting 1. - X = np.random.rand(2, 3, 4, 5).astype(np.float32) - Y = np.random.rand(1, 1, 1).astype(np.float32) - - cpu_x = torch.from_numpy(X) - npu_x = torch.from_numpy(X).to(device) - - cpu_y = torch.from_numpy(Y) - npu_y = torch.from_numpy(Y).to(device) - - yield cpu_x, cpu_y, npu_x, npu_y - - # broadcasting last two dimensions. 
- X = np.random.rand(2, 3, 4, 5).astype(np.float32) - Y = np.random.rand(4, 5).astype(np.float32) - - cpu_x = torch.from_numpy(X) - npu_x = torch.from_numpy(X).to(device) - - cpu_y = torch.from_numpy(Y) - npu_y = torch.from_numpy(Y).to(device) - - yield cpu_x, cpu_y, npu_x, npu_y - -def test_2args_broadcast(fn): - output_list = [] - for cpu_x, cpu_y, npu_x, npu_y in __generate_2args_broadcast_cases(): - cpu_out = fn(cpu_x, cpu_y).numpy() - npu_out = fn(npu_x, npu_y).to("cpu").numpy() - output_list.append([cpu_out, npu_out]) - - return output_list - - -def create_dtype_tensor(shape, dtype, npu_format=-1, min_value=-5, max_value=5, no_zero=False, device=None): - if device is None: - device = get_npu_device() - - if dtype == torch.bool: - x = np.random.randint(0, 2, size=shape).astype(bool) - - elif dtype == torch.half: - x = np.random.uniform(min_value, max_value, shape).astype(np.float16) - - elif dtype == torch.float: - x = np.random.uniform(min_value, max_value, shape).astype(np.float32) - - else: - x = np.random.randint(min_value, max_value+1, size = shape).astype(np.int32) - - if no_zero: - ones = np.ones_like(x) - x = np.where(x != 0, x, ones) - - cpu_input = torch.from_numpy(x) - npu_input = torch.from_numpy(x).to(device) - if npu_format != -1 and (dtype in [torch.float, torch.half]): - npu_input = torch_npu.npu_format_cast(npu_input, npu_format) - return cpu_input, npu_input - - -def check_operators_in_prof(expected_operators, prof, unexpected_operators=None): - unexpected_operators = unexpected_operators or [] - prof_key_averages = prof.key_averages() - if not prof_key_averages: - return print("torch profiling is empty, please check it") - for prof_item in prof_key_averages: - if prof_item.key in unexpected_operators: - # if unexpected oprators are called, pattern inferring in trans-contiguous is failed - return False - elif prof_item.key in expected_operators: - # if expected oprator is called, empty it in expected_operators list - expected_operators.remove(prof_item.key) - - # if expected_operators list is empty, all oprators have been called - if not expected_operators: - return True - return False \ No newline at end of file diff --git a/torch_npu/utils/serialization.py b/torch_npu/utils/serialization.py index ea859e5399a2d227800b26bccd45e0b856d69a5b..a029696e9ad622d2684c790317f406de5307a8a1 100644 --- a/torch_npu/utils/serialization.py +++ b/torch_npu/utils/serialization.py @@ -88,7 +88,7 @@ def save(obj, f, pickle_module=pickle, pickle_protocol=DEFAULT_PROTOCOL, _use_ne obj = obj.cpu() se.save(obj, f, pickle_module, pickle_protocol, _use_new_zipfile_serialization) else: - raise RuntimeError('torch.save received invalid input.') + se.save(obj, f, pickle_module, pickle_protocol, _use_new_zipfile_serialization) def load(f, map_location=None, pickle_module=pickle, **pickle_load_args): """Loads data previously saved with the `save()` API.
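The final hunk relaxes `torch_npu/utils/serialization.py`: inputs that do not take the NPU-tensor path (the branch that calls `obj.cpu()` before `se.save`) are now delegated to the underlying torch serializer instead of raising `RuntimeError`. A minimal sketch of what the changed branch permits, calling the module's `save`/`load` directly; how these functions are surfaced to end users is outside this hunk:

```
import torch_npu  # noqa: F401  (registers the NPU backend)
from torch_npu.utils import serialization

# An input with no NPU tensors: before this patch the else branch of save()
# rejected it with RuntimeError('torch.save received invalid input.'); with
# the patch it is presumably forwarded unchanged to torch's serializer.
checkpoint = {"step": 1, "lr": 0.01}
serialization.save(checkpoint, "checkpoint.pt")

restored = serialization.load("checkpoint.pt")
assert restored["step"] == 1
```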