diff --git a/test/test_optimized_lib/test_activations.py b/test/test_optimized_lib/test_activations.py
index ca1592ecdee2e7a165d2dd43cd087928be9dde16..95057ca59ead861ce69ef89e551d66c9aaefe16a 100644
--- a/test/test_optimized_lib/test_activations.py
+++ b/test/test_optimized_lib/test_activations.py
@@ -20,7 +20,7 @@ import numpy as np
 from torch_npu.testing.testcase import TestCase, run_tests
 from torch_npu.testing.common_utils import create_common_tensor
 
-from torch_npu.contrib.optimized_lib.module import Mish, SiLU
+from torch_npu.contrib.module import Mish, SiLU
 
 class TestActivations(TestCase):
 
diff --git a/test/test_optimized_lib/test_anchor_generator.py b/test/test_optimized_lib/test_anchor_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..28da5180a664b923527d7ccbc8586832613f105b
--- /dev/null
+++ b/test/test_optimized_lib/test_anchor_generator.py
@@ -0,0 +1,75 @@
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch_npu
+from torch_npu.testing.testcase import TestCase, run_tests
+from torch_npu.contrib.function import npu_single_level_responsible_flags
+
+class TestAnchorGenerator(TestCase):
+
+    def single_level_responsible_flags(self,
+                                       featmap_size,
+                                       gt_bboxes,
+                                       stride,
+                                       num_base_anchors,
+                                       device='cpu'):
+        """Generate the responsible flags of anchor in a single feature map.
+        Args:
+            featmap_size (tuple[int]): Feature map size, unpacked as (height, width).
+            gt_bboxes (Tensor): Ground truth boxes, shape (n, 4).
+            stride (tuple(int)): Stride of the current level, indexed as (x, y).
+            num_base_anchors (int): The number of base anchors.
+            device (str, optional): Device where the flags will be put on.
+                Defaults to 'cpu'.
+        Returns:
+            torch.Tensor: uint8 flags of length feat_h * feat_w * num_base_anchors;
+                1 marks every anchor of a grid cell that holds a box center.
+        """
+        feat_h, feat_w = featmap_size
+        gt_bboxes_cx = ((gt_bboxes[:, 0] + gt_bboxes[:, 2]) * 0.5).to(device)
+        gt_bboxes_cy = ((gt_bboxes[:, 1] + gt_bboxes[:, 3]) * 0.5).to(device)
+        gt_bboxes_grid_x = torch.floor(gt_bboxes_cx / stride[0]).long()
+        gt_bboxes_grid_y = torch.floor(gt_bboxes_cy / stride[1]).long()
+
+        # Flatten (row, column) cell coordinates into a row-major cell index.
+        gt_bboxes_grid_idx = gt_bboxes_grid_y * feat_w + gt_bboxes_grid_x
+        responsible_grid = torch.zeros(
+            feat_h * feat_w, dtype=torch.uint8, device=device)
+        responsible_grid[gt_bboxes_grid_idx] = 1
+        responsible_grid = responsible_grid[:, None].expand(
+            responsible_grid.size(0), num_base_anchors).contiguous().view(-1)
+        return responsible_grid
+
+    def test_anchor_generator(self):
+        """Compare the NPU responsible-flag op with the pure-torch reference per level."""
+        featmap_sizes = [[10, 10], [20, 20], [40, 40]]
+        stride = [[32, 32], [16, 16], [8, 8]]
+        num_base_anchors = 3
+        # Upload the boxes once; the reference moves what it needs back via .to(device).
+        gt_bboxes = torch.randint(0, 100, size=(128, 4)).npu()
+        for featmap_size, level_stride in zip(featmap_sizes, stride):
+            cpuout = self.single_level_responsible_flags(featmap_size,
+                                                         gt_bboxes,
+                                                         level_stride,
+                                                         num_base_anchors)
+            npuout = npu_single_level_responsible_flags(featmap_size,
+                                                        gt_bboxes,
+                                                        level_stride,
+                                                        num_base_anchors)
+            self.assertRtolEqual(cpuout, npuout.cpu())
+    
+if __name__ == "__main__":
+    run_tests()
\ No newline at end of file
diff --git a/test/test_optimized_lib/test_bbox_coder.py b/test/test_optimized_lib/test_bbox_coder.py
new file mode 100644
index 0000000000000000000000000000000000000000..10a19f9bfd85e6c7a80a9a099445fec8882a0928
--- /dev/null
+++ b/test/test_optimized_lib/test_bbox_coder.py
@@ -0,0 +1,77 @@
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch_npu
+from torch_npu.testing.testcase import TestCase, run_tests
+from torch_npu.contrib.function import npu_bbox_coder_encode_yolo, \
+     npu_bbox_coder_encode_xyxy2xywh, npu_bbox_coder_decode_xywh2xyxy
+
+class TestBboxCoder(TestCase):
+    def test_npu_bbox_coder_encode_xyxy2xywh(self):
+        """Encode raw boxes and boxes pre-divided by 512; both must match the golden values."""
+        # NOTE(review): golden values presumably rely on a fixed RNG seed - confirm.
+        bboxes = torch.randint(0, 512, size=(6, 4)).npu()
+        gt_bboxes = torch.randint(0, 512, size=(6, 4)).npu()
+        raw_out = npu_bbox_coder_encode_xyxy2xywh(bboxes, gt_bboxes)
+        normalized_out = npu_bbox_coder_encode_xyxy2xywh(
+            bboxes/512., gt_bboxes/512., is_normalized=True, normalized_scale=512.)
+        golden = torch.tensor([[-1.1562e+01, -1.4492e+00,  2.8105e+00, -1.1855e+00],
+                               [-3.1465e+00, -1.1826e+00,  1.2939e+00,  1.2314e+00],
+                               [-3.8696e-01,  1.1758e+00,  1.7346e-01, -2.8174e-01],
+                               [ 1.3086e+01,  1.6631e+00,  2.4902e+00,  5.6055e-01],
+                               [-1.6914e+00,  3.8188e+01,  7.7490e-01,  2.9453e+00],
+                               [ 3.2598e+00, -2.8019e-03,  1.5000e+00, -1.5342e+00]],
+                              dtype=torch.float32)
+        self.assertRtolEqual(golden.numpy(), raw_out.cpu().numpy())
+        self.assertRtolEqual(golden.numpy(), normalized_out.cpu().numpy())
+
+    def test_npu_bbox_coder_encode_yolo(self):
+        """Run the YOLO encoder on random boxes and strides and compare with golden values."""
+        # NOTE(review): golden values presumably rely on a fixed RNG seed - confirm.
+        bboxes = torch.randint(0, 512, size=(6, 4)).npu()
+        gt_bboxes = torch.randint(0, 512, size=(6, 4)).npu()
+        # One stride value per box: six boxes, six strides.
+        stride = torch.randint(0, 32, size=(6,)).npu()
+        encoded = npu_bbox_coder_encode_yolo(bboxes, gt_bboxes, stride)
+        golden = torch.tensor([[ 1.0000e+00,  1.0000e+00, -1.3816e+01, -1.3816e+01],
+                               [ 1.0000e+00,  1.0000e-06, -1.3816e+01, -1.3816e+01],
+                               [ 1.0000e-06,  1.0000e+00, -1.3816e+01, -2.8768e-01],
+                               [ 1.0000e+00,  1.0000e+00,  2.5421e+00, -1.3816e+01],
+                               [ 1.0000e-06,  1.0000e-06, -1.3816e+01, -1.3816e+01],
+                               [ 1.0000e-06,  4.0909e-01,  1.4889e+00, -1.3816e+01]],
+                              dtype=torch.float32)
+        self.assertRtolEqual(golden.numpy(), encoded.cpu().numpy())
+
+    def test_npu_bbox_coder_decode_xywh2xyxy(self):
+        """Decode random predictions against random boxes and compare with golden values."""
+        # NOTE(review): golden values presumably rely on a fixed RNG seed - confirm.
+        max_shape = 512
+        bboxes = torch.randint(0, max_shape, size=(6, 4)).npu()
+        pred_bboxes = torch.randn(6, 4).npu()
+        decoded = npu_bbox_coder_decode_xywh2xyxy(
+            bboxes, pred_bboxes, max_shape=(max_shape, max_shape))
+        golden = torch.tensor([[295.2500, 289.5000, 291.7500, 198.5000],
+                               [235.0000, 221.1250,  21.9375, 511.0000],
+                               [415.5000, 199.1250, 444.5000, 205.6250],
+                               [133.0000,  16.0000, 137.5000, 272.0000],
+                               [275.2500, 373.7500, 367.2500, 362.2500],
+                               [408.7500,   0.0000, 396.7500,  78.0000]],
+                              dtype=torch.float32)
+        self.assertRtolEqual(golden.numpy(), decoded.cpu().numpy())
+
+if __name__ == "__main__":
+    run_tests()
+
diff --git a/test/test_optimized_lib/test_bidirectional_lstm.py b/test/test_optimized_lib/test_bidirectional_lstm.py
index e03f40253e86831b18724f77c3a8805e04873050..7bfe90a03e62f67f3f51d9c6bd318269a966846c 100644
--- a/test/test_optimized_lib/test_bidirectional_lstm.py
+++ b/test/test_optimized_lib/test_bidirectional_lstm.py
@@ -18,7 +18,7 @@ import torch_npu
 
 from torch_npu.testing.testcase import TestCase, run_tests
 from torch_npu.testing.common_utils import create_common_tensor
-from torch_npu.contrib.optimized_lib.module import BiLSTM
+from torch_npu.contrib.module import BiLSTM
 
 class TestBidirectionalLstm(TestCase):
     
diff --git a/test/test_optimized_lib/test_channel_shuffle.py b/test/test_optimized_lib/test_channel_shuffle.py
index 501a82c598aacf14996595f36369264f62dc333b..f7bb5e53543916311f099233b28a3b406aecef1a 100644
--- a/test/test_optimized_lib/test_channel_shuffle.py
+++ b/test/test_optimized_lib/test_channel_shuffle.py
@@ -17,7 +17,7 @@ import torch
 import torch_npu
 from torch_npu.testing.testcase import TestCase, run_tests
 from torch_npu.testing.common_utils import create_common_tensor
-from torch_npu.contrib.optimized_lib.module import ChannelShuffle
+from torch_npu.contrib.module import ChannelShuffle
 
 class TestChannelShuffle(TestCase):
     def cpu_channel_shuffle(self, x, groups, split_shuffle):
diff --git a/test/test_optimized_lib/test_crossentropy.py b/test/test_optimized_lib/test_crossentropy.py
index 89dfe6b8f3f822e7e02cc50d185ee5bdb2ae0ab3..6803a15eccca995f77bec5d42d424eda2cc3a22e 100644
--- a/test/test_optimized_lib/test_crossentropy.py
+++ b/test/test_optimized_lib/test_crossentropy.py
@@ -18,7 +18,7 @@ import torch_npu
 
 from torch_npu.testing.testcase import TestCase, run_tests
 from torch_npu.testing.common_utils import create_common_tensor
-from torch_npu.contrib.optimized_lib.module import LabelSmoothingCrossEntropy
+from torch_npu.contrib.module import LabelSmoothingCrossEntropy
 
 class TestCrossentropy(TestCase):
     
diff --git a/test/test_optimized_lib/test_deform_conv.py b/test/test_optimized_lib/test_deform_conv.py
index aa375889723fda6beafa7d9f40881f0fe09ac768..1f875311ef876fdd27a3dfd14b769752d747125a 100644
--- a/test/test_optimized_lib/test_deform_conv.py
+++ b/test/test_optimized_lib/test_deform_conv.py
@@ -18,7 +18,7 @@ import torch_npu
 
 from torch_npu.testing.testcase import TestCase, run_tests
 from torch_npu.testing.common_utils import create_common_tensor
-from torch_npu.contrib.optimized_lib.module import DCNv2
+from torch_npu.contrib.module import DCNv2
 
 class TestDeformConv(TestCase):
     
diff --git a/test/test_optimized_lib/test_index_op.py b/test/test_optimized_lib/test_index_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..5716e4847c7a0d4ad11960f23aba81d5ada0b4f4
--- /dev/null
+++ b/test/test_optimized_lib/test_index_op.py
@@ -0,0 +1,55 @@
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch_npu
+import numpy as np
+
+from torch_npu.testing.testcase import TestCase, run_tests
+from torch_npu.testing.common_utils import create_common_tensor
+from torch_npu.contrib.function import npu_fast_condition_index_put
+
+class TestIndexOp(TestCase):
+    def npu_slow_index_op_exec(self, input1):
+        """Reference path: zero every element below 0.5 with in-place mask assignment."""
+        input1[input1 < 0.5] = 0.
+        return input1
+
+    def npu_fast_index_op_exec(self, input1):
+        """Optimized path: the same mask and fill value via npu_fast_condition_index_put."""
+        return npu_fast_condition_index_put(input1, input1 < 0.5, 0.)
+
+    def test_npu_index_op(self):
+        """Check that the slow and fast paths agree for each dtype/shape combination."""
+        dtype_list = [np.float16, np.float32]
+        format_list = [-1]
+        shape_list = [
+            [2, 3, 7, 7],
+            [1, 2, 3, 6, 6],
+            [6, 5, 8, 10],
+            [2, 5, 6, 8, 9]
+        ]
+        shape_format = [[i, j, k] for i in dtype_list for j in format_list for k in shape_list]
+        for item in shape_format:
+            # Assuming the last two arguments are value bounds (TODO confirm), 0..1 straddles
+            # the 0.5 threshold; the previous 1..10 never satisfied the mask.
+            _, npu_input = create_common_tensor(item, 0, 1)
+            # The slow path mutates its argument, so hand each path a private copy.
+            npu_slow_output = self.npu_slow_index_op_exec(npu_input.clone())
+            npu_fast_output = self.npu_fast_index_op_exec(npu_input.clone())
+            self.assertRtolEqual(npu_slow_output.cpu(), npu_fast_output.cpu())
+    
+if __name__ == "__main__":
+    run_tests()
\ No newline at end of file
diff --git a/test/test_optimized_lib/test_iou.py b/test/test_optimized_lib/test_iou.py
new file mode 100644
index 0000000000000000000000000000000000000000..f68df8e91240df600a996c8465efc9ac4dc89378
--- /dev/null
+++ b/test/test_optimized_lib/test_iou.py
@@ -0,0 +1,100 @@
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch_npu
+from torch_npu.testing.testcase import TestCase, run_tests
+from torch_npu.contrib.function import npu_iou, npu_giou
+
+class TestIou(TestCase):
+    def test_npu_iou_1(self):
+        """Explicit mode="iou" and the default call must both give the golden IoU."""
+        # Both calls receive the same pair of float boxes.
+        box1 = torch.FloatTensor([[10, 55, 85, 160]]).float().npu()
+        box2 = torch.FloatTensor([[18, 45, 80, 130], [38, 85, 70, 230]]).float().npu()
+        explicit_iou = npu_iou(box1, box2, mode="iou")
+        default_iou = npu_iou(box1, box2)
+        expected_explicit = torch.tensor([[0.5469, 0.2373]], dtype=torch.float32)
+        expected_default = torch.tensor([[0.5469, 0.2373]], dtype=torch.float32)
+        self.assertRtolEqual(expected_explicit, explicit_iou.cpu())
+        self.assertRtolEqual(expected_default, default_iou.cpu())
+
+    def test_npu_iou_2(self):
+        """Pairwise IoU of two random 8-box sets, explicit and default mode, against golden values."""
+        # NOTE(review): golden values presumably rely on a fixed RNG seed - confirm.
+        box1 = torch.randint(0, 256, size=(8, 4)).float().npu()
+        box2 = torch.randint(0, 256, size=(8, 4)).float().npu()
+        explicit_iou = npu_iou(box1, box2, mode="iou")
+        default_iou = npu_iou(box1, box2)
+        expected_explicit = torch.tensor([[0.0000, 0.0000, -0.0000, 0.0000, 0.0000, 0.0000, -0.0000, -0.0000],
+                                          [0.0000, -0.0000, -0.0000, -0.0000, 0.0000, 0.0000, -0.0000, -0.0000],
+                                          [0.0000, -0.0000, -0.0000, -0.0000, 0.0000, 0.0000, -0.0000, -0.0000],
+                                          [0.0000, 0.0000, -0.0000, 0.0000, 0.0238, 0.0575, -0.0000, -0.0000],
+                                          [0.0000, 0.0000, -0.0000, 0.0000, 0.0000, 0.0000, -0.0000, -0.0000],
+                                          [0.0000, -0.0000, -0.0000, -0.0000, 0.0000, 0.0000, -0.0000, -0.0000],
+                                          [0.0000, 0.0000, -0.0000, 0.0000, 0.0000, 0.0943, -0.0000, 0.0000],
+                                          [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, -0.0000, 0.0000]],
+                                         dtype=torch.float32)
+        expected_default = torch.tensor([[0.0000, 0.0000, -0.0000, 0.0000, 0.0000, 0.0000, -0.0000, -0.0000],
+                                         [0.0000, -0.0000, -0.0000, -0.0000, 0.0000, 0.0000, -0.0000, -0.0000],
+                                         [0.0000, -0.0000, -0.0000, -0.0000, 0.0000, 0.0000, -0.0000, -0.0000],
+                                         [0.0000, 0.0000, -0.0000, 0.0000, 0.0238, 0.0575, -0.0000, -0.0000],
+                                         [0.0000, 0.0000, -0.0000, 0.0000, 0.0000, 0.0000, -0.0000, -0.0000],
+                                         [0.0000, -0.0000, -0.0000, -0.0000, 0.0000, 0.0000, -0.0000, -0.0000],
+                                         [0.0000, 0.0000, -0.0000, 0.0000, 0.0000, 0.0943, -0.0000, 0.0000],
+                                         [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, -0.0000, 0.0000]],
+                                        dtype=torch.float32)
+        self.assertRtolEqual(expected_explicit, explicit_iou.cpu())
+        self.assertRtolEqual(expected_default, default_iou.cpu())
+
+    def test_npu_iou_3(self):
+        """Boxes divided by 100 with normalized_scale=100. must give the same golden IoU."""
+        # The division happens on the NPU, after the upload.
+        box1 = torch.FloatTensor([[10, 55, 85, 160]]).float().npu() / 100.
+        box2 = torch.FloatTensor([[18, 45, 80, 130], [38, 85, 70, 230]]).float().npu() / 100.
+        explicit_iou = npu_iou(box1, box2, mode="iou", is_normalized=True, normalized_scale=100.)
+        default_iou = npu_iou(box1, box2, is_normalized=True, normalized_scale=100.)
+        expected_explicit = torch.tensor([[0.5469, 0.2373]], dtype=torch.float32)
+        expected_default = torch.tensor([[0.5469, 0.2373]], dtype=torch.float32)
+        self.assertRtolEqual(expected_explicit, explicit_iou.cpu())
+        self.assertRtolEqual(expected_default, default_iou.cpu())
+
+    def test_npu_giou_1(self):
+        """GIoU of two random 16-box sets (first one requiring grad) against golden values."""
+        box1 = torch.randn(16, 4)
+        box1.requires_grad = True
+        box2 = torch.randn(16, 4).float().npu()
+        box1 = box1.float().npu()
+        giou = npu_giou(box1, box2)
+        expected_giou = torch.tensor([[-1.1377e+00],
+                                      [-7.3738e-01],
+                                      [ 1.5935e-01],
+                                      [-2.1271e+00],
+                                      [ 1.2136e+03],
+                                      [-7.2693e+00],
+                                      [-4.7243e-01],
+                                      [-2.0380e+00],
+                                      [ 5.1004e+00],
+                                      [-1.8952e+00],
+                                      [-3.2175e+00],
+                                      [-9.4184e-01],
+                                      [ 5.3800e-01],
+                                      [-7.9274e-01],
+                                      [-1.0181e+00],
+                                      [-1.6168e+00]], dtype=torch.float32)
+        self.assertRtolEqual(expected_giou, giou.cpu().detach())
+
+if __name__ == "__main__":
+    run_tests()
\ No newline at end of file
diff --git a/test/test_optimized_lib/test_multiclass_nms.py b/test/test_optimized_lib/test_multiclass_nms.py
new file mode 100644
index 0000000000000000000000000000000000000000..bdf2bba60acae74f86b88acfa2975ed674d23550
--- /dev/null
+++ b/test/test_optimized_lib/test_multiclass_nms.py
@@ -0,0 +1,78 @@
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch_npu
+from torch_npu.testing.testcase import TestCase, run_tests
+from torch_npu.contrib.function import npu_multiclass_nms, \
+     npu_batched_multiclass_nms
+
+class TestMultiClassNms(TestCase):
+    def test_npu_multiclass_nms_1(self):
+        """Multiclass NMS on random integer-valued half boxes, compared with golden values."""
+        # NOTE(review): golden values presumably rely on a fixed RNG seed - confirm.
+        boxes = torch.randint(1, 255, size=(1000, 4)).npu().half()
+        scores = torch.randn(1000, 81).npu().half()
+        det_bboxes, det_labels = npu_multiclass_nms(boxes, scores, score_thr=0.3, nms_thr=0.5, max_num=3)
+        expected_bboxes = torch.tensor([[ 57.0000, 198.8750,  45.9688, 221.8750,   4.1484],
+                                        [215.0000, 155.0000, 236.8750, 137.0000,   3.9023],
+                                        [208.8750, 221.0000, 228.0000,  17.0000,   3.8867]],
+                                       dtype=torch.float16)
+        expected_labels = torch.tensor([59.,  3., 75.], dtype=torch.float16)
+        self.assertRtolEqual(expected_bboxes, det_bboxes.cpu())
+        self.assertRtolEqual(expected_labels, det_labels.cpu())
+
+    def test_npu_multiclass_nms_2(self):
+        """Multiclass NMS on normally distributed half boxes, compared with golden values."""
+        # NOTE(review): golden values presumably rely on a fixed RNG seed - confirm.
+        boxes = torch.randn(1000, 4).npu().half()
+        scores = torch.randn(1000, 81).npu().half()
+        det_bboxes, det_labels = npu_multiclass_nms(boxes, scores, score_thr=0.3, nms_thr=0.5, max_num=3)
+        expected_bboxes = torch.tensor([[ 0.2231, -1.6943, -0.1172, -1.0547,  4.1484],
+                                        [ 0.2891,  0.4897, -0.3809, -0.7129,  3.9023],
+                                        [ 0.6694, -1.2266, -0.3027,  0.4639,  3.8867]],
+                                       dtype=torch.float16)
+        expected_labels = torch.tensor([59.,  3., 75.], dtype=torch.float16)
+        self.assertRtolEqual(expected_bboxes, det_bboxes.cpu())
+        self.assertRtolEqual(expected_labels, det_labels.cpu())
+
+    def test_npu_batched_multiclass_nms_1(self):
+        """Batched NMS on (4, 200, 80, 4) boxes and (4, 200, 81) scores against golden values."""
+        # NOTE(review): golden values presumably rely on a fixed RNG seed - confirm.
+        boxes = torch.randint(1, 255, size=(4, 200, 80, 4)).npu().half()
+        scores = torch.randn(4, 200, 81).npu().half()
+        det_bboxes, det_labels = npu_batched_multiclass_nms(boxes, scores, score_thr=0.3, nms_thr=0.5, max_num=3)
+        expected_bboxes = torch.tensor([[[221.8750,  60.0000, 183.0000,  22.0000,   3.8867],
+                                         [167.0000, 250.0000, 136.0000, 144.0000,   3.6445],
+                                         [ 45.9688, 147.0000,  67.0000, 241.8750,   3.4844]],
+                                        [[  5.0000, 178.0000, 243.8750, 138.0000,   3.7344],
+                                         [238.0000, 132.0000,  47.0000,  84.0000,   3.6836],
+                                         [ 32.0000, 110.0000, 131.0000,  73.0000,   3.6309]],
+                                        [[111.9375, 120.9375,  54.0000, 231.0000,   3.9219],
+                                         [147.0000, 162.0000,  78.0000,   1.0010,   3.9219],
+                                         [157.0000, 118.0000,  57.0000, 115.0000,   3.6523]],
+                                        [[ 80.0000, 126.9375,  54.0000, 246.8750,   3.7344],
+                                         [ 31.0000, 253.8750,  19.0000, 138.0000,   3.6328],
+                                         [ 54.0000, 253.8750,  78.0000,  75.0000,   3.5586]]],
+                                       dtype=torch.float16)
+        expected_labels = torch.tensor([[76.,  3., 32.],
+                                        [26., 66., 25.],
+                                        [34., 41., 30.],
+                                        [22., 27., 46.]], dtype=torch.float16)
+        self.assertRtolEqual(expected_bboxes, det_bboxes.cpu())
+        self.assertRtolEqual(expected_labels, det_labels.cpu())
+
+if __name__ == "__main__":
+    run_tests()
\ No newline at end of file
diff --git a/test/test_optimized_lib/test_ps_roi_pooling.py b/test/test_optimized_lib/test_ps_roi_pooling.py
index 65aca98031d385ba6c38a7390cd948bc4c30e2d4..84e63d1b8adadc22e5ed731fba2eb1573a9177a2 100644
--- a/test/test_optimized_lib/test_ps_roi_pooling.py
+++ b/test/test_optimized_lib/test_ps_roi_pooling.py
@@ -18,7 +18,7 @@ import torch_npu
 
 from torch_npu.testing.testcase import TestCase, run_tests
 from torch_npu.testing.common_utils import create_common_tensor
-from torch_npu.contrib.optimized_lib.module import PSROIPool
+from torch_npu.contrib.module import PSROIPool
 
 class TestPsRoiPooling(TestCase):
     def get_random_rois(self, shape):
diff --git a/test/test_optimized_lib/test_roi_align.py b/test/test_optimized_lib/test_roi_align.py
index 1509fc2b658e1b7b193644ef89777a175fc79a12..d0c3df74b2335f8c181fff21eaf151edf738e737 100644
--- a/test/test_optimized_lib/test_roi_align.py
+++ b/test/test_optimized_lib/test_roi_align.py
@@ -18,7 +18,7 @@ import torch_npu
 
 from torch_npu.testing.testcase import TestCase, run_tests
 from torch_npu.testing.common_utils import create_common_tensor
-from torch_npu.contrib.optimized_lib.module import ROIAlign
+from torch_npu.contrib.module import ROIAlign
 
 class TestRoiAlign(TestCase):
     
diff --git a/torch_npu/contrib/__init__.py b/torch_npu/contrib/__init__.py
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..1e039dfcf6a8626c09a11a9d3159071144110252 100644
--- a/torch_npu/contrib/__init__.py
+++ b/torch_npu/contrib/__init__.py
@@ -0,0 +1,46 @@
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .function import npu_iou, npu_ptiou, npu_giou, npu_multiclass_nms, npu_batched_multiclass_nms, \
+    npu_single_level_responsible_flags, npu_fast_condition_index_put, npu_bbox_coder_encode_yolo, \
+    npu_bbox_coder_encode_xyxy2xywh, npu_bbox_coder_decode_xywh2xyxy
+from .module import ChannelShuffle, Prefetcher, LabelSmoothingCrossEntropy, ROIAlign, DCNv2, \
+    ModulatedDeformConv, Mish, BiLSTM, PSROIPool, SiLU, Swish
+
+__all__ = [
+    # Names re-exported from the .function subpackage
+    "npu_iou",
+    "npu_ptiou",
+    "npu_giou",
+    "npu_multiclass_nms",
+    "npu_batched_multiclass_nms",
+    "npu_single_level_responsible_flags",
+    "npu_fast_condition_index_put",
+    "npu_bbox_coder_encode_yolo",
+    "npu_bbox_coder_encode_xyxy2xywh",
+    "npu_bbox_coder_decode_xywh2xyxy",
+
+    # Names re-exported from the .module subpackage
+    "ChannelShuffle",
+    "Prefetcher",
+    "LabelSmoothingCrossEntropy",
+    "ROIAlign",
+    "DCNv2",
+    "ModulatedDeformConv",
+    "Mish",
+    "BiLSTM",
+    "PSROIPool",
+    "SiLU",
+    "Swish",
+]
diff --git a/torch_npu/contrib/optimized_lib/function/__init__.py b/torch_npu/contrib/function/__init__.py
similarity index 100%
rename from torch_npu/contrib/optimized_lib/function/__init__.py
rename to torch_npu/contrib/function/__init__.py
diff --git a/torch_npu/contrib/optimized_lib/function/anchor_generator.py b/torch_npu/contrib/function/anchor_generator.py
similarity index 100%
rename from torch_npu/contrib/optimized_lib/function/anchor_generator.py
rename to torch_npu/contrib/function/anchor_generator.py
diff --git a/torch_npu/contrib/optimized_lib/function/bbox_coder.py b/torch_npu/contrib/function/bbox_coder.py
similarity index 100%
rename from torch_npu/contrib/optimized_lib/function/bbox_coder.py
rename to torch_npu/contrib/function/bbox_coder.py
diff --git a/torch_npu/contrib/optimized_lib/function/index_op.py b/torch_npu/contrib/function/index_op.py
similarity index 100%
rename from torch_npu/contrib/optimized_lib/function/index_op.py
rename to torch_npu/contrib/function/index_op.py
diff --git a/torch_npu/contrib/optimized_lib/function/iou.py b/torch_npu/contrib/function/iou.py
similarity index 100%
rename from torch_npu/contrib/optimized_lib/function/iou.py
rename to torch_npu/contrib/function/iou.py
diff --git a/torch_npu/contrib/optimized_lib/function/nms.py b/torch_npu/contrib/function/nms.py
similarity index 100%
rename from torch_npu/contrib/optimized_lib/function/nms.py
rename to torch_npu/contrib/function/nms.py
diff --git a/torch_npu/contrib/optimized_lib/module/__init__.py b/torch_npu/contrib/module/__init__.py
similarity index 100%
rename from torch_npu/contrib/optimized_lib/module/__init__.py
rename to torch_npu/contrib/module/__init__.py
diff --git a/torch_npu/contrib/optimized_lib/module/activations.py b/torch_npu/contrib/module/activations.py
similarity index 79%
rename from torch_npu/contrib/optimized_lib/module/activations.py
rename to torch_npu/contrib/module/activations.py
index 05116226153c04ec7988a0298af120312b55426b..2776d2a4769bce0ed979ef902cc544e111277b60 100644
--- a/torch_npu/contrib/optimized_lib/module/activations.py
+++ b/torch_npu/contrib/module/activations.py
@@ -80,42 +80,4 @@ class SiLU(nn.Module):
         x = torch_npu.npu_silu(x)
         return x
 
-Swish = SiLU
-
-if __name__ == '__main__':
-    torch.npu.set_device('npu:0')
-    input_tensor = torch.randn(2, 32, 4, 4)
-    input_tensor.requires_grad = True
-    model = Mish()
-
-    input_tensor = input_tensor.npu()
-    model = model.npu()
-
-    o = model(input_tensor)
-    l = o.sum()
-    l.backward()
-
-    o = model(input_tensor.half())
-    l = o.sum()
-    l.backward()
-
-    torch.npu.synchronize()
-    print('Mish test success.')
-
-    input_tensor = torch.randn(2, 32, 4, 4)
-    input_tensor.requires_grad = True
-    model = SiLU()
-
-    input_tensor = input_tensor.npu()
-    model = model.npu()
-
-    o = model(input_tensor)
-    l = o.sum()
-    l.backward()
-
-    o = model(input_tensor.half())
-    l = o.sum()
-    l.backward()
-
-    torch.npu.synchronize()
-    print('SiLU test success.')
+Swish = SiLU
\ No newline at end of file
diff --git a/torch_npu/contrib/optimized_lib/module/bidirectional_lstm.py b/torch_npu/contrib/module/bidirectional_lstm.py
similarity index 91%
rename from torch_npu/contrib/optimized_lib/module/bidirectional_lstm.py
rename to torch_npu/contrib/module/bidirectional_lstm.py
index e3ac0f52a9a0f0587cdce1ab34c88f5dc68bef11..2a97a652d93c5cf2db941b1f79f5103233991311 100644
--- a/torch_npu/contrib/optimized_lib/module/bidirectional_lstm.py
+++ b/torch_npu/contrib/module/bidirectional_lstm.py
@@ -84,19 +84,4 @@ class BiLSTM(torch.nn.Module):
         recurrent_bw = torch.flip(recurrent_bw, [0])
         recurrent = torch.cat((recurrent_fw, recurrent_bw), 2)
 
-        return recurrent
-
-
-if __name__ == '__main__':
-    x = torch.randn(26, 2560, 512)
-    x.requires_grad = True
-
-    torch.npu.set_device(0)
-    x = x.npu()
-    rnn = BiLSTM(512, 256).npu()
-    x.retain_grad()
-    output = rnn(x)
-    print('test forward: ', output)
-    output.backward(torch.ones(x.size(), dtype=torch.float).npu())
-    x_grad = x.grad
-    print('test grad ', x_grad)
+        return recurrent
\ No newline at end of file
diff --git a/torch_npu/contrib/optimized_lib/module/channel_shuffle.py b/torch_npu/contrib/module/channel_shuffle.py
similarity index 89%
rename from torch_npu/contrib/optimized_lib/module/channel_shuffle.py
rename to torch_npu/contrib/module/channel_shuffle.py
index c182124e733010f9f2685a34df1b2b56694eba44..07bc37b1d6b83d77ecc8134ca536c30f2f5f196c 100644
--- a/torch_npu/contrib/optimized_lib/module/channel_shuffle.py
+++ b/torch_npu/contrib/module/channel_shuffle.py
@@ -161,34 +161,4 @@ class IndexSelectHalfImplementation(torch.autograd.Function):
         grad_output = torch.cat([grad_output1, grad_output2], 1)
         out1 = grad_output.index_select(1, ctx.bp_index1)
         out2 = grad_output.index_select(1, ctx.bp_index2)
-        return out1, out2, None, None, None, None
-
-
-def main():
-    device = 'cpu'
-
-    if device.startswith('npu'):
-        torch.npu.set_device(device)
-
-
-    def tescase(split_shuffle=True):
-        x = torch.randn(2, 32, 7, 7)
-        conv = torch.nn.Conv2d(32, 32, 1)
-        model = ChannelShuffle(64, split_shuffle=split_shuffle)
-
-        x = x.to(device)
-        conv = conv.to(device)
-        model = model.to(device)
-
-        x1 = conv(x)
-        x2 = conv(x)
-        output = model(x1, x2)
-        loss = sum([i.sum() for i in output]) if split_shuffle else output.sum()
-        loss.backward()
-
-
-    tescase(split_shuffle=True)
-    tescase(split_shuffle=False)
-
-if __name__ == '__main__':
-    main()
+        return out1, out2, None, None, None, None
\ No newline at end of file
diff --git a/torch_npu/contrib/optimized_lib/module/crossentropy.py b/torch_npu/contrib/module/crossentropy.py
similarity index 79%
rename from torch_npu/contrib/optimized_lib/module/crossentropy.py
rename to torch_npu/contrib/module/crossentropy.py
index 6eba982fb8c1da817cb4650d0be86bdbdc333785..5cf2932b3808fc25b6ac5b34b8836bee038d9d66 100644
--- a/torch_npu/contrib/optimized_lib/module/crossentropy.py
+++ b/torch_npu/contrib/module/crossentropy.py
@@ -41,23 +41,4 @@ class LabelSmoothingCrossEntropy(nn.Module):
         loss = torch_npu.npu_softmax_cross_entropy_with_logits(pred, one_hot_label)
 
         loss = torch.mean(loss, [0], keepdim=False, dtype=torch.float32)
-        return loss
-
-
-if __name__ == '__main__':
-    x = torch.randn(2, 10)
-    x.requires_grad = True
-    y = torch.randint(0, 10, size=(2,))
-
-    torch.npu.set_device(0)
-    x = x.npu()
-    y = y.npu()
-    m = LabelSmoothingCrossEntropy(10)
-    l = m(x, y)
-    l.backward()
-    print('test ce ok, loss is ', l)
-
-    m = LabelSmoothingCrossEntropy(10, 0.1)
-    l = m(x, y)
-    l.backward()
-    print('test lsce ok, loss is ', l)
+        return loss
\ No newline at end of file
diff --git a/torch_npu/contrib/optimized_lib/module/deform_conv.py b/torch_npu/contrib/module/deform_conv.py
similarity index 100%
rename from torch_npu/contrib/optimized_lib/module/deform_conv.py
rename to torch_npu/contrib/module/deform_conv.py
diff --git a/torch_npu/contrib/optimized_lib/module/prefetcher.py b/torch_npu/contrib/module/prefetcher.py
similarity index 100%
rename from torch_npu/contrib/optimized_lib/module/prefetcher.py
rename to torch_npu/contrib/module/prefetcher.py
diff --git a/torch_npu/contrib/optimized_lib/module/ps_roi_pooling.py b/torch_npu/contrib/module/ps_roi_pooling.py
similarity index 75%
rename from torch_npu/contrib/optimized_lib/module/ps_roi_pooling.py
rename to torch_npu/contrib/module/ps_roi_pooling.py
index 14f93e627a7fbb1f58e48c274505cde211e1b6b9..05b708eab7034776b0d6e24e2c291090ecdda36f 100644
--- a/torch_npu/contrib/optimized_lib/module/ps_roi_pooling.py
+++ b/torch_npu/contrib/module/ps_roi_pooling.py
@@ -68,33 +68,4 @@ class PSROIPool(nn.Module):
         tmpstr += ", group_size=" + str(self.group_size)
         tmpstr += ", output_dim=" + str(self.output_dim)
         tmpstr += ")"
-        return tmpstr
-
-
-def get_random_rois(shape):
-    rois_init = torch.zeros(shape)
-    for i in range(shape[0]):
-        for j in range(shape[1]):
-            pi1 = torch.rand(1, 2).uniform_(0, 10)
-            pi2 = torch.rand(1, 2).uniform_(10, 100)
-            boxi = torch.cat((pi1, pi2), 1)
-            n = torch.tensor([[float(i)]])
-            boxi = torch.cat((n, boxi), 1)
-            rois_init[i, j, :] = boxi
-    return rois_init
-
-
-if __name__ == "__main__":
-    cls_feat = torch.randn(4, 1078, 84, 84).float()
-    cls_feat.requires_grad = True
-    rois_tensor = get_random_rois((4, 128, 5)).permute(0, 2, 1).float()
-
-    model = PSROIPool(pooled_height=7, pooled_width=7, spatial_scale=1 / 16.0, group_size=7, output_dim=22)
-
-    torch.npu.set_device(0)
-    cls_feat = cls_feat.npu()
-    rois_tensor = rois_tensor.npu()
-
-    x = model(cls_feat, rois_tensor)  # 512,22,7,7
-    l = x.sum()
-    l.backward()
+        return tmpstr
\ No newline at end of file
diff --git a/torch_npu/contrib/optimized_lib/module/roi_align.py b/torch_npu/contrib/module/roi_align.py
similarity index 100%
rename from torch_npu/contrib/optimized_lib/module/roi_align.py
rename to torch_npu/contrib/module/roi_align.py
diff --git a/torch_npu/contrib/optimized_lib/__init__.py b/torch_npu/contrib/optimized_lib/__init__.py
deleted file mode 100644
index 1e039dfcf6a8626c09a11a9d3159071144110252..0000000000000000000000000000000000000000
--- a/torch_npu/contrib/optimized_lib/__init__.py
+++ /dev/null
@@ -1,46 +0,0 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from .function import npu_iou, npu_ptiou, npu_giou, npu_multiclass_nms, npu_batched_multiclass_nms, \
-    npu_single_level_responsible_flags, npu_fast_condition_index_put, npu_bbox_coder_encode_yolo, \
-    npu_bbox_coder_encode_xyxy2xywh, npu_bbox_coder_decode_xywh2xyxy
-from .module import ChannelShuffle, Prefetcher, LabelSmoothingCrossEntropy, ROIAlign, DCNv2, \
-    ModulatedDeformConv, Mish, BiLSTM, PSROIPool, SiLU, Swish
-
-__all__ = [
-    # from function
-    "npu_iou",
-    "npu_ptiou",
-    "npu_giou",
-    "npu_multiclass_nms",
-    "npu_batched_multiclass_nms",
-    "npu_single_level_responsible_flags",
-    "npu_fast_condition_index_put",
-    "npu_bbox_coder_encode_yolo",
-    "npu_bbox_coder_encode_xyxy2xywh",
-    "npu_bbox_coder_decode_xywh2xyxy",
-
-    # from module
-    "ChannelShuffle",
-    "Prefetcher",
-    "LabelSmoothingCrossEntropy",
-    "ROIAlign",
-    "DCNv2",
-    "ModulatedDeformConv",
-    "Mish",
-    "BiLSTM",
-    "PSROIPool",
-    "SiLU",
-    "Swish",
-]