From 401877015fe33cb0f99395bf836d5cc2afc01ad6 Mon Sep 17 00:00:00 2001
From: XXX <xxx>
Date: Tue, 8 Feb 2022 17:27:46 +0800
Subject: [PATCH 01/12] =?UTF-8?q?maskedSelect=E7=AE=97=E5=AD=90=E8=BF=81?=
 =?UTF-8?q?=E7=A7=BB?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 test/test_network_ops/test_masked_select.py   | 138 ++++++++++++++++++
 .../csrc/aten/ops/MaskedSelectKernelNpu.cpp   | 107 ++++++++++++++
 2 files changed, 245 insertions(+)
 create mode 100644 test/test_network_ops/test_masked_select.py
 create mode 100644 torch_npu/csrc/aten/ops/MaskedSelectKernelNpu.cpp
diff --git a/test/test_network_ops/test_masked_select.py b/test/test_network_ops/test_masked_select.py
new file mode 100644
index 0000000000..f5aeaf16f5
--- /dev/null
+++ b/test/test_network_ops/test_masked_select.py
@@ -0,0 +1,138 @@
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import torch
+import torch_npu
+import numpy as np
+
+from torch_npu.testing.common_utils import TestCase, run_tests
+from torch_npu.testing.common_device_type import instantiate_device_type_tests
+from torch_npu.testing.util_test import create_common_tensor
+
+class TestMaskedSelect(TestCase):
+    def cpu_op_exec(self, input, mask):
+        output = torch.masked_select(input, mask)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input, mask):
+        mask = mask.to("npu")
+        output = torch.masked_select(input, mask)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_out(self, input, mask, output):
+        output = torch.masked_select(input, mask, out=output)
+        return output.detach().to("cpu").numpy()
+
+    def test_maskedselect_out_result(self, device):
+        shape_format = [
+            [[np.float16, 2, [15, 15, 15, 16]], [np.float16, 2, [15, 15, 15, 16]]],
+            [[np.float16, 2, [15, 15, 15, 16]], [np.float16, 2, [3, 3, 7, 7]]],
+            [[np.float16, 0, [15, 15, 15, 16]], [np.float16, 0, [15, 15, 15, 16]]],
+            [[np.float16, 0, [15, 15, 15, 16]], [np.float16, 0, [116, 116, 1, 1]]],
+            [[np.float32, 2, [15, 15, 15, 16]], [np.float32, 2, [15, 15, 15, 16]]],
+            [[np.float32, 2, [15, 15, 15, 16]], [np.float32, 2, [3, 3, 7, 7]]],
+            [[np.float32, 0, [15, 15, 15, 16]], [np.float32, 0, [15, 15, 15, 16]]],
+            [[np.float32, 0, [15, 15, 15, 16]], [np.float32, 0, [232, 232, 1, 1]]],
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], -2, 2)
+            cpu_input2, npu_input2 = create_common_tensor(item[0], -2, 2)
+            cpu_input3, npu_input3 = create_common_tensor(item[1], -2, 2)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2.to(torch.int32)>0)
+            npu_output = self.npu_op_exec_out(npu_input1, npu_input2.to(torch.int32)>0, npu_input3)
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_maskedselect_shape_format_maskdiff(self, device):
+        dtype_list = [np.int64, np.int32, np.float32]
+        format_list = [0]
+        shape_list = [[3, 4, 5]]
+        shape_format = [
+            [i, j, k] for i in dtype_list for j in format_list for k in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            mask_cpu, mask_npu = create_common_tensor((np.int32, 0, (3, 4, 1)), 0, 100)
+            cpu_output = self.cpu_op_exec(cpu_input, mask_cpu > 50)
+            npu_output = self.npu_op_exec(npu_input, mask_npu > 50)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_maskedselect_shape_format_fp32(self, device):
+        format_list = [0, 3]
+        shape_list = [[3, 4, 5]]
+        shape_format = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        mask = torch.tensor([[
+         [ True, False,  True,  True, False],
+         [ True, False, False,  True, False],
+         [False, False, False, False, False],
+         [ True, False, False, False, False]],
+
+        [[ True, False, False, False,  True],
+         [False,  True, False,  True,  True],
+         [False,  True, False,  True,  True],
+         [False, False, False, False, False]],
+
+        [[False,  True,  True, False,  True],
+         [False,  True,  True,  True,  True],
+         [False,  True, False,  True, False],
+         [False,  True,  True, False, False]]])
+
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            cpu_output = self.cpu_op_exec(cpu_input, mask)
+            npu_output = self.npu_op_exec(npu_input, mask)
+            self.assertRtolEqual(cpu_output, npu_output)
+            
+    def test_maskedselect_shape_format_int(self, device):
+        dtype_list = [np.int32, np.int64]
+        format_list = [0]
+        shape_list = [[3, 4, 5]]
+        shape_format = [
+            [i, j, k] for i in dtype_list for j in format_list for k in shape_list
+        ]
+        mask = torch.tensor([[
+         [ True, False,  True,  True, False],
+         [ True, False, False,  True, False],
+         [False, False, False, False, False],
+         [ True, False, False, False, False]],
+
+        [[ True, False, False, False,  True],
+         [False,  True, False,  True,  True],
+         [False,  True, False,  True,  True],
+         [False, False, False, False, False]],
+
+        [[False,  True,  True, False,  True],
+         [False,  True,  True,  True,  True],
+         [False,  True, False,  True, False],
+         [False,  True,  True, False, False]]])
+
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            cpu_output = self.cpu_op_exec(cpu_input, mask)
+            npu_output = self.npu_op_exec(npu_input, mask)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+instantiate_device_type_tests(TestMaskedSelect, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
\ No newline at end of file
diff --git a/torch_npu/csrc/aten/ops/MaskedSelectKernelNpu.cpp b/torch_npu/csrc/aten/ops/MaskedSelectKernelNpu.cpp
new file mode 100644
index 0000000000..508ba1c1bb
--- /dev/null
+++ b/torch_npu/csrc/aten/ops/MaskedSelectKernelNpu.cpp
@@ -0,0 +1,107 @@
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION. 
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "torch_npu/csrc/framework/utils/OpAdapter.h"
+#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+
+namespace at_npu {
+namespace native {
+
+// Output shape of masked_select: one dimension whose length is mask.sum().
+at::SmallVector<int64_t, SIZE> masked_select_npu_output_size(
+    const at::Tensor& self,
+    const at::Tensor& mask) {
+  // toLong() keeps the full 64-bit count; toInt() would go through a 32-bit int.
+  return {mask.sum().item().toLong()};
+}
+
+at::Tensor& masked_select_out_npu_nocheck(
+    at::Tensor& result,
+    const at::Tensor& self,
+    const at::Tensor& mask) {
+  at::Tensor maskBool = mask;
+  if (!(mask.dtype() == at::kBool)) {
+    maskBool = mask.to(at::kBool);
+  }
+
+  OpCommand cmd;
+  cmd.Name("MaskedSelect")
+      .Input(self)
+      .Input(maskBool)
+      .Output(result)
+      .Run();
+
+  return result;
+}
+
+at::Tensor& NPUNativeFunctions::masked_select_out(
+    const at::Tensor& self,
+    const at::Tensor& mask,
+    at::Tensor& result) {
+  at::Tensor dtypeCastOfSelf = self;
+  at::Tensor maskCast = mask;
+  if (maskCast.sizes() != dtypeCastOfSelf.sizes()) {
+    maskCast = NPUNativeFunctions::npu_broadcast(mask, dtypeCastOfSelf.sizes());
+  }
+  if (dtypeCastOfSelf.scalar_type() == ScalarType::Half) {
+    dtypeCastOfSelf = NPUNativeFunctions::npu_dtype_cast(dtypeCastOfSelf, at::ScalarType::Float);
+    result = result.to(ScalarType::Float);
+  }
+  auto outputSize = masked_select_npu_output_size(dtypeCastOfSelf, maskCast);
+
+  OpPreparation::CheckOut(
+      {dtypeCastOfSelf},
+      result,
+      dtypeCastOfSelf,
+      outputSize);
+
+  OpPipeWithDefinedOut pipe;
+  result = pipe.CheckMemory({dtypeCastOfSelf, maskCast}, {result})
+      .Func([&dtypeCastOfSelf, &maskCast](at::Tensor& result)
+      {masked_select_out_npu_nocheck(result, dtypeCastOfSelf, maskCast);})
+      .Call(result);
+
+  if (result.scalar_type() != self.scalar_type()) {
+    result = result.npu_dtype_cast(ScalarType::Half);
+  }
+  return result;
+}
+
+at::Tensor NPUNativeFunctions::masked_select(
+    const at::Tensor& self,
+    const at::Tensor& mask) {
+  at::Tensor dtypeCastOfSelf = self;
+  at::Tensor maskCast = mask;
+  if (maskCast.sizes() != dtypeCastOfSelf.sizes()) {
+    maskCast = NPUNativeFunctions::npu_broadcast(mask, dtypeCastOfSelf.sizes());
+  }
+  if (dtypeCastOfSelf.scalar_type() == ScalarType::Half) {
+    dtypeCastOfSelf = NPUNativeFunctions::npu_dtype_cast(dtypeCastOfSelf, at::ScalarType::Float);
+  }
+  auto outputSize = masked_select_npu_output_size(dtypeCastOfSelf, maskCast);
+
+  at::Tensor result = OpPreparation::ApplyTensor(dtypeCastOfSelf, outputSize);
+
+  masked_select_out_npu_nocheck(result, dtypeCastOfSelf, maskCast);
+
+  if (result.scalar_type() != self.scalar_type()) {
+    result = NPUNativeFunctions::npu_dtype_cast(result, at::ScalarType::Half);
+  }
+  return result;
+}
+
+} // namespace native
+} // namespace at_npu
-- 
Gitee


From bd0acebea0d4261838a3c15b2d4da31c84347b1b Mon Sep 17 00:00:00 2001
From: XXX <xxx>
Date: Tue, 8 Feb 2022 18:42:11 +0800
Subject: [PATCH 02/12] MaskedSelect: use npu_dtype_cast and qualify at::ScalarType

---
 torch_npu/csrc/aten/ops/MaskedSelectKernelNpu.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/torch_npu/csrc/aten/ops/MaskedSelectKernelNpu.cpp b/torch_npu/csrc/aten/ops/MaskedSelectKernelNpu.cpp
index 508ba1c1bb..7f8cbab6da 100644
--- a/torch_npu/csrc/aten/ops/MaskedSelectKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/MaskedSelectKernelNpu.cpp
@@ -34,7 +34,7 @@ at::Tensor& masked_select_out_npu_nocheck(
     const at::Tensor& mask) {
   at::Tensor maskBool = mask;
   if (!(mask.dtype() == at::kBool)) {
-    maskBool = mask.to(at::kBool);
+    maskBool = NPUNativeFunctions::npu_dtype_cast(mask, at::kBool);
   }
 
   OpCommand cmd;
@@ -56,9 +56,9 @@ at::Tensor& NPUNativeFunctions::masked_select_out(
   if (maskCast.sizes() != dtypeCastOfSelf.sizes()) {
     maskCast = NPUNativeFunctions::npu_broadcast(mask, dtypeCastOfSelf.sizes());
   }
-  if (dtypeCastOfSelf.scalar_type() == ScalarType::Half) {
+  if (dtypeCastOfSelf.scalar_type() == at::ScalarType::Half) {
     dtypeCastOfSelf = NPUNativeFunctions::npu_dtype_cast(dtypeCastOfSelf, at::ScalarType::Float);
-    result = result.to(ScalarType::Float);
+    result = NPUNativeFunctions::npu_dtype_cast(result, at::ScalarType::Float);
   }
   auto outputSize = masked_select_npu_output_size(dtypeCastOfSelf, maskCast);
 
@@ -75,7 +75,7 @@ at::Tensor& NPUNativeFunctions::masked_select_out(
       .Call(result);
 
   if (result.scalar_type() != self.scalar_type()) {
-    result = result.npu_dtype_cast(ScalarType::Half);
+    result = result.npu_dtype_cast(at::ScalarType::Half);
   }
   return result;
 }
@@ -88,7 +88,7 @@ at::Tensor NPUNativeFunctions::masked_select(
   if (maskCast.sizes() != dtypeCastOfSelf.sizes()) {
     maskCast = NPUNativeFunctions::npu_broadcast(mask, dtypeCastOfSelf.sizes());
   }
-  if (dtypeCastOfSelf.scalar_type() == ScalarType::Half) {
+  if (dtypeCastOfSelf.scalar_type() == at::ScalarType::Half) {
     dtypeCastOfSelf = NPUNativeFunctions::npu_dtype_cast(dtypeCastOfSelf, at::ScalarType::Float);
   }
   auto outputSize = masked_select_npu_output_size(dtypeCastOfSelf, maskCast);
-- 
Gitee


From f91585826daf0dc8b899885b50dbf425b9ab7a27 Mon Sep 17 00:00:00 2001
From: XXX <xxx>
Date: Tue, 8 Feb 2022 18:54:15 +0800
Subject: [PATCH 03/12] MaskedSelect: call NPUNativeFunctions::npu_dtype_cast for the final Half cast

---
 torch_npu/csrc/aten/ops/MaskedSelectKernelNpu.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch_npu/csrc/aten/ops/MaskedSelectKernelNpu.cpp b/torch_npu/csrc/aten/ops/MaskedSelectKernelNpu.cpp
index 7f8cbab6da..46a21a0a55 100644
--- a/torch_npu/csrc/aten/ops/MaskedSelectKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/MaskedSelectKernelNpu.cpp
@@ -75,7 +75,7 @@ at::Tensor& NPUNativeFunctions::masked_select_out(
       .Call(result);
 
   if (result.scalar_type() != self.scalar_type()) {
-    result = result.npu_dtype_cast(at::ScalarType::Half);
+    result = NPUNativeFunctions::npu_dtype_cast(result, at::ScalarType::Half);
   }
   return result;
 }
-- 
Gitee


From 02c71229b8dceb76d5348e07256cb4132dcda5e0 Mon Sep 17 00:00:00 2001
From: XXX <xxx>
Date: Tue, 8 Feb 2022 19:02:19 +0800
Subject: [PATCH 04/12] test_masked_select: rename input to input1, add spaces around >

---
 test/test_network_ops/test_masked_select.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/test/test_network_ops/test_masked_select.py b/test/test_network_ops/test_masked_select.py
index f5aeaf16f5..f741d72222 100644
--- a/test/test_network_ops/test_masked_select.py
+++ b/test/test_network_ops/test_masked_select.py
@@ -24,20 +24,20 @@ from torch_npu.testing.common_device_type import instantiate_device_type_tests
 from torch_npu.testing.util_test import create_common_tensor
 
 class TestMaskedSelect(TestCase):
-    def cpu_op_exec(self, input, mask):
-        output = torch.masked_select(input, mask)
+    def cpu_op_exec(self, input1, mask):
+        output = torch.masked_select(input1, mask)
         output = output.numpy()
         return output
 
-    def npu_op_exec(self, input, mask):
+    def npu_op_exec(self, input1, mask):
         mask = mask.to("npu")
-        output = torch.masked_select(input, mask)
+        output = torch.masked_select(input1, mask)
         output = output.to("cpu")
         output = output.numpy()
         return output
 
-    def npu_op_exec_out(self, input, mask, output):
-        output = torch.masked_select(input, mask, out=output)
+    def npu_op_exec_out(self, input1, mask, output):
+        output = torch.masked_select(input1, mask, out=output)
         return output.detach().to("cpu").numpy()
 
     def test_maskedselect_out_result(self, device):
@@ -57,8 +57,8 @@ class TestMaskedSelect(TestCase):
             cpu_input3, npu_input3 = create_common_tensor(item[1], -2, 2)
             if cpu_input1.dtype == torch.float16:
                 cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2.to(torch.int32)>0)
-            npu_output = self.npu_op_exec_out(npu_input1, npu_input2.to(torch.int32)>0, npu_input3)
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2.to(torch.int32) > 0)
+            npu_output = self.npu_op_exec_out(npu_input1, npu_input2.to(torch.int32) > 0, npu_input3)
             cpu_output = cpu_output.astype(npu_output.dtype)
             self.assertRtolEqual(cpu_output, npu_output)
 
-- 
Gitee


From 5ae1383a4817dca1939bfba68f9e87be2fec504d Mon Sep 17 00:00:00 2001
From: shenpengcheng <shenpengcheng5@huawei.com>
Date: Tue, 8 Feb 2022 19:22:30 +0800
Subject: [PATCH 05/12] test_masked_select: whitespace-only change above TestMaskedSelect

---
 test/test_network_ops/test_masked_select.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_network_ops/test_masked_select.py b/test/test_network_ops/test_masked_select.py
index f741d72222..69910ed236 100644
--- a/test/test_network_ops/test_masked_select.py
+++ b/test/test_network_ops/test_masked_select.py
@@ -22,7 +22,7 @@ import numpy as np
 from torch_npu.testing.common_utils import TestCase, run_tests
 from torch_npu.testing.common_device_type import instantiate_device_type_tests
 from torch_npu.testing.util_test import create_common_tensor
-
+ 
 class TestMaskedSelect(TestCase):
     def cpu_op_exec(self, input1, mask):
         output = torch.masked_select(input1, mask)
-- 
Gitee


From 32a8f42ba5bbe1cdec323f963647902f4821eda5 Mon Sep 17 00:00:00 2001
From: shenpengcheng <shenpengcheng5@huawei.com>
Date: Wed, 26 Jan 2022 13:59:33 +0800
Subject: [PATCH 06/12] =?UTF-8?q?reflection=5Fpad2d=5Fbackward=E7=AE=97?=
 =?UTF-8?q?=E5=AD=90=E7=A7=BB=E6=A4=8D?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../test_reflection_pad2d_backward.py         | 73 +++++++++++++++++
 .../ops/ReflectionPad2dBackwardKernelNpu.cpp  | 81 +++++++++++++++++++
 2 files changed, 154 insertions(+)
 create mode 100644 test/test_network_ops/test_reflection_pad2d_backward.py
 create mode 100644 torch_npu/csrc/aten/ops/ReflectionPad2dBackwardKernelNpu.cpp

diff --git a/test/test_network_ops/test_reflection_pad2d_backward.py b/test/test_network_ops/test_reflection_pad2d_backward.py
new file mode 100644
index 0000000000..b9a8853c85
--- /dev/null
+++ b/test/test_network_ops/test_reflection_pad2d_backward.py
@@ -0,0 +1,73 @@
+# Copyright (c) 2020, Huawei Technologies. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch_npu
+import numpy as np
+
+from torch_npu.testing.common_utils import TestCase, run_tests
+from torch_npu.testing.common_device_type import instantiate_device_type_tests
+from torch_npu.testing.util_test import create_common_tensor
+
+class TestReflectionPad2dBackward(TestCase):
+    """Check ReflectionPad2d forward output and input gradient on NPU against CPU."""
+
+    def cpu_op_exec(self, input1, pad):
+        """Run ReflectionPad2d and its backward on CPU; return (output, grad) arrays."""
+        m = torch.nn.ReflectionPad2d(pad)
+        input1.requires_grad = True
+        output = m(input1)
+        output.backward(torch.ones_like(output))
+        # detach() is required: numpy() raises on a tensor that requires grad.
+        return output.detach().numpy(), input1.grad.numpy()
+
+    def npu_op_exec(self, input1, pad):
+        """Run ReflectionPad2d and its backward on NPU; return (output, grad) arrays."""
+        m = torch.nn.ReflectionPad2d(pad).to("npu")
+        input1.requires_grad = True
+        output = m(input1)
+        output.backward(torch.ones_like(output))
+        input_grad = input1.grad.cpu().numpy()
+        output = output.detach().to("cpu").numpy()
+        return output, input_grad
+
+    def test_reflectionPad2d_backward_shape_format_fp16(self, device):
+        """float16 cases; the CPU reference is computed in float32 and cast back."""
+        shape_format = [
+            [[np.float16, 0, (1, 1, 37, 37)], [2, 2, 2, 2]],
+            [[np.float16, 3, (1, 1, 4, 3)], 2],
+            [[np.float16, 0, (1, 1, 17, 17)], [1, 2, 2, 2]],
+        ]
+
+        def cpu_op_exec_fp16(input1, pad):
+            input1 = input1.to(torch.float32)
+            input1.requires_grad = True
+            m = torch.nn.ReflectionPad2d(pad)
+            output = m(input1)
+            output.backward(torch.ones_like(output))
+            input_grad = input1.grad.numpy().astype(np.float16)
+            output = output.detach().numpy().astype(np.float16)
+            return output, input_grad
+
+        # item[0] is the tensor spec for create_common_tensor, item[1] the padding.
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
+            cpu_output, cpu_grad = cpu_op_exec_fp16(cpu_input1, item[1])
+            npu_output, npu_grad = self.npu_op_exec(npu_input1, item[1])
+            self.assertRtolEqual(cpu_output, npu_output)
+            self.assertRtolEqual(cpu_grad, npu_grad)
+
+instantiate_device_type_tests(TestReflectionPad2dBackward, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/torch_npu/csrc/aten/ops/ReflectionPad2dBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/ReflectionPad2dBackwardKernelNpu.cpp
new file mode 100644
index 0000000000..b0c2c3731c
--- /dev/null
+++ b/torch_npu/csrc/aten/ops/ReflectionPad2dBackwardKernelNpu.cpp
@@ -0,0 +1,81 @@
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION.
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "torch_npu/csrc/framework/utils/OpAdapter.h"
+#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+
+namespace at_npu {
+namespace native {
+
+// Runs PadV3Grad in "reflect" mode into `gradInput`; rejects float32 `input`.
+// `padding` is zero-extended to 2 * input.dim() values, then pair order is reversed.
+at::Tensor& reflection_pad2d_backward_out_npu_nocheck(
+    const at::Tensor& gradOutput,
+    const at::Tensor& input,
+    at::IntArrayRef padding,
+    at::Tensor& gradInput) {
+  TORCH_CHECK(input.scalar_type() != at::ScalarType::Float,
+      "PadV3Grad doesn't support torch.float!");
+  c10::SmallVector<int64_t, N> vectorInt;
+  c10::SmallVector<int64_t, N> paddingsVector = array_to_small_vector(padding);
+  paddingsVector.resize(2 * input.dim(), 0);
+  // Walk the padding pairs from the last pair to the first.
+  for (int64_t i = paddingsVector.size(); i > 0; i -= 2) {
+    vectorInt.emplace_back(paddingsVector[i - 2]);
+    vectorInt.emplace_back(paddingsVector[i - 1]);
+  }
+  OpCommand cmd;
+  cmd.Name("PadV3Grad")
+    .Input(gradOutput).Input(vectorInt, at::kInt).Output(gradInput)
+    .Attr("mode", (string)"reflect").Attr("paddings_contiguous", true)
+    .Run();
+  return gradInput;
+}
+
+// Out-variant of the reflection_pad2d backward.
+// Calls OpPreparation::CheckOut for `gradInput`, then runs the PadV3Grad
+// wrapper through OpPipeWithDefinedOut and returns the pipe's result.
+at::Tensor& NPUNativeFunctions::reflection_pad2d_backward_out(
+    const at::Tensor& gradOutput,
+    const at::Tensor& input,
+    at::IntArrayRef padding,
+    at::Tensor& gradInput) {
+  OpPreparation::CheckOut({input, gradOutput}, gradInput, input);
+
+  OpPipeWithDefinedOut pipe;
+  return pipe
+      .CheckMemory({input, gradOutput}, {gradInput})
+      .Func([&gradOutput, &input, &padding](at::Tensor& out) {
+        // `out` is the tensor the pipe passes to the callback.
+        reflection_pad2d_backward_out_npu_nocheck(gradOutput, input, padding, out);
+      })
+      .Call(gradInput);
+}
+
+// Functional variant: allocates the gradient tensor from `input` via
+// OpPreparation::ApplyTensor and fills it with the PadV3Grad wrapper.
+at::Tensor NPUNativeFunctions::reflection_pad2d_backward(
+    const at::Tensor& gradOutput,
+    const at::Tensor& input,
+    at::IntArrayRef padding) {
+  at::Tensor gradInput = OpPreparation::ApplyTensor(input);
+  // The wrapper returns a reference to the tensor it was given, so the
+  // freshly allocated gradient can be returned straight from the call.
+  return reflection_pad2d_backward_out_npu_nocheck(
+      gradOutput, input, padding, gradInput);
+}
+} // namespace native
+} // namespace at_npu
-- 
Gitee


From 4e4412492ccad2efd2ae9650dad05b3c6591d51c Mon Sep 17 00:00:00 2001
From: shenpengcheng <shenpengcheng5@huawei.com>
Date: Tue, 8 Feb 2022 17:27:46 +0800
Subject: [PATCH 07/12] =?UTF-8?q?maskedSelect=E7=AE=97=E5=AD=90=E8=BF=81?=
 =?UTF-8?q?=E7=A7=BB?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 test/test_network_ops/test_masked_select.py   | 138 ++++++++++++++++++
 .../csrc/aten/ops/MaskedSelectKernelNpu.cpp   | 107 ++++++++++++++
 2 files changed, 245 insertions(+)
 create mode 100644 test/test_network_ops/test_masked_select.py
 create mode 100644 torch_npu/csrc/aten/ops/MaskedSelectKernelNpu.cpp

diff --git a/test/test_network_ops/test_masked_select.py b/test/test_network_ops/test_masked_select.py
new file mode 100644
index 0000000000..f5aeaf16f5
--- /dev/null
+++ b/test/test_network_ops/test_masked_select.py
@@ -0,0 +1,138 @@
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import torch
+import torch_npu
+import numpy as np
+
+from torch_npu.testing.common_utils import TestCase, run_tests
+from torch_npu.testing.common_device_type import instantiate_device_type_tests
+from torch_npu.testing.util_test import create_common_tensor
+
+class TestMaskedSelect(TestCase):
+    def cpu_op_exec(self, input, mask):
+        output = torch.masked_select(input, mask)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input, mask):
+        mask = mask.to("npu")
+        output = torch.masked_select(input, mask)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_out(self, input, mask, output):
+        output = torch.masked_select(input, mask, out=output)
+        return output.detach().to("cpu").numpy()
+
+    def test_maskedselect_out_result(self, device):
+        shape_format = [
+            [[np.float16, 2, [15, 15, 15, 16]], [np.float16, 2, [15, 15, 15, 16]]],
+            [[np.float16, 2, [15, 15, 15, 16]], [np.float16, 2, [3, 3, 7, 7]]],
+            [[np.float16, 0, [15, 15, 15, 16]], [np.float16, 0, [15, 15, 15, 16]]],
+            [[np.float16, 0, [15, 15, 15, 16]], [np.float16, 0, [116, 116, 1, 1]]],
+            [[np.float32, 2, [15, 15, 15, 16]], [np.float32, 2, [15, 15, 15, 16]]],
+            [[np.float32, 2, [15, 15, 15, 16]], [np.float32, 2, [3, 3, 7, 7]]],
+            [[np.float32, 0, [15, 15, 15, 16]], [np.float32, 0, [15, 15, 15, 16]]],
+            [[np.float32, 0, [15, 15, 15, 16]], [np.float32, 0, [232, 232, 1, 1]]],
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], -2, 2)
+            cpu_input2, npu_input2 = create_common_tensor(item[0], -2, 2)
+            cpu_input3, npu_input3 = create_common_tensor(item[1], -2, 2)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2.to(torch.int32)>0)
+            npu_output = self.npu_op_exec_out(npu_input1, npu_input2.to(torch.int32)>0, npu_input3)
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_maskedselect_shape_format_maskdiff(self, device):
+        dtype_list = [np.int64, np.int32, np.float32]
+        format_list = [0]
+        shape_list = [[3, 4, 5]]
+        shape_format = [
+            [i, j, k] for i in dtype_list for j in format_list for k in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            mask_cpu, mask_npu = create_common_tensor((np.int32, 0, (3, 4, 1)), 0, 100)
+            cpu_output = self.cpu_op_exec(cpu_input, mask_cpu > 50)
+            npu_output = self.npu_op_exec(npu_input, mask_npu > 50)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_maskedselect_shape_format_fp32(self, device):
+        format_list = [0, 3]
+        shape_list = [[3, 4, 5]]
+        shape_format = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        mask = torch.tensor([[
+         [ True, False,  True,  True, False],
+         [ True, False, False,  True, False],
+         [False, False, False, False, False],
+         [ True, False, False, False, False]],
+
+        [[ True, False, False, False,  True],
+         [False,  True, False,  True,  True],
+         [False,  True, False,  True,  True],
+         [False, False, False, False, False]],
+
+        [[False,  True,  True, False,  True],
+         [False,  True,  True,  True,  True],
+         [False,  True, False,  True, False],
+         [False,  True,  True, False, False]]])
+
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            cpu_output = self.cpu_op_exec(cpu_input, mask)
+            npu_output = self.npu_op_exec(npu_input, mask)
+            self.assertRtolEqual(cpu_output, npu_output)
+            
+    def test_maskedselect_shape_format_int(self, device):
+        dtype_list = [np.int32, np.int64]
+        format_list = [0]
+        shape_list = [[3, 4, 5]]
+        shape_format = [
+            [i, j, k] for i in dtype_list for j in format_list for k in shape_list
+        ]
+        mask = torch.tensor([[
+         [ True, False,  True,  True, False],
+         [ True, False, False,  True, False],
+         [False, False, False, False, False],
+         [ True, False, False, False, False]],
+
+        [[ True, False, False, False,  True],
+         [False,  True, False,  True,  True],
+         [False,  True, False,  True,  True],
+         [False, False, False, False, False]],
+
+        [[False,  True,  True, False,  True],
+         [False,  True,  True,  True,  True],
+         [False,  True, False,  True, False],
+         [False,  True,  True, False, False]]])
+
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            cpu_output = self.cpu_op_exec(cpu_input, mask)
+            npu_output = self.npu_op_exec(npu_input, mask)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+instantiate_device_type_tests(TestMaskedSelect, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
\ No newline at end of file
diff --git a/torch_npu/csrc/aten/ops/MaskedSelectKernelNpu.cpp b/torch_npu/csrc/aten/ops/MaskedSelectKernelNpu.cpp
new file mode 100644
index 0000000000..508ba1c1bb
--- /dev/null
+++ b/torch_npu/csrc/aten/ops/MaskedSelectKernelNpu.cpp
@@ -0,0 +1,107 @@
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION. 
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "torch_npu/csrc/framework/utils/OpAdapter.h"
+#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+
+namespace at_npu {
+namespace native {
+
+at::SmallVector<int64_t, SIZE> masked_select_npu_output_size(
+    const at::Tensor& self,
+    const at::Tensor& mask) {
+  // Output is 1-D: its length is the sum over `mask`; `self` is not used here.
+  // Read the count with toLong(): toInt() would narrow it to a 32-bit int.
+  return {mask.sum().item().toLong()};
+}
+
+at::Tensor& masked_select_out_npu_nocheck(
+    at::Tensor& result,  // output tensor; it is not resized or checked in this function
+    const at::Tensor& self,
+    const at::Tensor& mask) {
+  at::Tensor maskBool = mask;
+  if (!(mask.dtype() == at::kBool)) {
+    maskBool = mask.to(at::kBool);
+  }
+
+  OpCommand cmd;
+  cmd.Name("MaskedSelect")  // name of the operator being launched
+      .Input(self)
+      .Input(maskBool)  // `mask` itself, or its bool conversion made above
+      .Output(result)
+      .Run();
+  // Hand back the same `result` reference that was passed in.
+  return result;
+}
+
+at::Tensor& NPUNativeFunctions::masked_select_out(
+    const at::Tensor& self,
+    const at::Tensor& mask,
+    at::Tensor& result) {  // NOTE(review): `result` is rebound (not written in place) on the Half path - confirm intended
+  at::Tensor dtypeCastOfSelf = self;  // swapped for a Float copy below when self is Half
+  at::Tensor maskCast = mask;  // swapped for a broadcast copy below when sizes differ
+  if (maskCast.sizes() != dtypeCastOfSelf.sizes()) {
+    maskCast = NPUNativeFunctions::npu_broadcast(mask, dtypeCastOfSelf.sizes());
+  }
+  if (dtypeCastOfSelf.scalar_type() == ScalarType::Half) {
+    dtypeCastOfSelf = NPUNativeFunctions::npu_dtype_cast(dtypeCastOfSelf, at::ScalarType::Float);
+    result = result.to(ScalarType::Float);
+  }
+  auto outputSize = masked_select_npu_output_size(dtypeCastOfSelf, maskCast);
+
+  OpPreparation::CheckOut(  // NOTE(review): presumably validates/resizes `result` for outputSize - confirm
+      {dtypeCastOfSelf},
+      result,
+      dtypeCastOfSelf,
+      outputSize);
+  // Run the kernel through the pipe; the lambda forwards to masked_select_out_npu_nocheck.
+  OpPipeWithDefinedOut pipe;
+  result = pipe.CheckMemory({dtypeCastOfSelf, maskCast}, {result})
+      .Func([&dtypeCastOfSelf, &maskCast](at::Tensor& result)
+      {masked_select_out_npu_nocheck(result, dtypeCastOfSelf, maskCast);})
+      .Call(result);
+
+  if (result.scalar_type() != self.scalar_type()) {
+    result = result.npu_dtype_cast(ScalarType::Half);
+  }
+  return result;
+}
+
+at::Tensor NPUNativeFunctions::masked_select(
+    const at::Tensor& self,
+    const at::Tensor& mask) {
+  at::Tensor dtypeCastOfSelf = self;  // swapped for a Float copy below when self is Half
+  at::Tensor maskCast = mask;  // NOTE(review): only the mask is broadcast (to self's sizes); self is never expanded
+  if (maskCast.sizes() != dtypeCastOfSelf.sizes()) {
+    maskCast = NPUNativeFunctions::npu_broadcast(mask, dtypeCastOfSelf.sizes());
+  }
+  if (dtypeCastOfSelf.scalar_type() == ScalarType::Half) {
+    dtypeCastOfSelf = NPUNativeFunctions::npu_dtype_cast(dtypeCastOfSelf, at::ScalarType::Float);
+  }
+  auto outputSize = masked_select_npu_output_size(dtypeCastOfSelf, maskCast);
+  // Allocate the result with outputSize; presumably dtype/format follow dtypeCastOfSelf - confirm.
+  at::Tensor result = OpPreparation::ApplyTensor(dtypeCastOfSelf, outputSize);
+
+  masked_select_out_npu_nocheck(result, dtypeCastOfSelf, maskCast);
+  // In this function the dtypes differ only when a Half self was computed as Float; cast back.
+  if (result.scalar_type() != self.scalar_type()) {
+    result = NPUNativeFunctions::npu_dtype_cast(result, at::ScalarType::Half);
+  }
+  return result;
+}
+
+} // namespace native
+} // namespace at_npu
-- 
Gitee


From 1e31b603d362334dc534c5119a075b0a6c061f43 Mon Sep 17 00:00:00 2001
From: shenpengcheng <shenpengcheng5@huawei.com>
Date: Tue, 8 Feb 2022 18:42:11 +0800
Subject: [PATCH 08/12] code fix: use npu_dtype_cast for casts and qualify at::ScalarType

---
 torch_npu/csrc/aten/ops/MaskedSelectKernelNpu.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/torch_npu/csrc/aten/ops/MaskedSelectKernelNpu.cpp b/torch_npu/csrc/aten/ops/MaskedSelectKernelNpu.cpp
index 508ba1c1bb..7f8cbab6da 100644
--- a/torch_npu/csrc/aten/ops/MaskedSelectKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/MaskedSelectKernelNpu.cpp
@@ -34,7 +34,7 @@ at::Tensor& masked_select_out_npu_nocheck(
     const at::Tensor& mask) {
   at::Tensor maskBool = mask;
   if (!(mask.dtype() == at::kBool)) {
-    maskBool = mask.to(at::kBool);
+    maskBool = NPUNativeFunctions::npu_dtype_cast(mask, at::kBool);
   }
 
   OpCommand cmd;
@@ -56,9 +56,9 @@ at::Tensor& NPUNativeFunctions::masked_select_out(
   if (maskCast.sizes() != dtypeCastOfSelf.sizes()) {
     maskCast = NPUNativeFunctions::npu_broadcast(mask, dtypeCastOfSelf.sizes());
   }
-  if (dtypeCastOfSelf.scalar_type() == ScalarType::Half) {
+  if (dtypeCastOfSelf.scalar_type() == at::ScalarType::Half) {
     dtypeCastOfSelf = NPUNativeFunctions::npu_dtype_cast(dtypeCastOfSelf, at::ScalarType::Float);
-    result = result.to(ScalarType::Float);
+    result = NPUNativeFunctions::npu_dtype_cast(result, at::ScalarType::Float);
   }
   auto outputSize = masked_select_npu_output_size(dtypeCastOfSelf, maskCast);
 
@@ -75,7 +75,7 @@ at::Tensor& NPUNativeFunctions::masked_select_out(
       .Call(result);
 
   if (result.scalar_type() != self.scalar_type()) {
-    result = result.npu_dtype_cast(ScalarType::Half);
+    result = result.npu_dtype_cast(at::ScalarType::Half);
   }
   return result;
 }
@@ -88,7 +88,7 @@ at::Tensor NPUNativeFunctions::masked_select(
   if (maskCast.sizes() != dtypeCastOfSelf.sizes()) {
     maskCast = NPUNativeFunctions::npu_broadcast(mask, dtypeCastOfSelf.sizes());
   }
-  if (dtypeCastOfSelf.scalar_type() == ScalarType::Half) {
+  if (dtypeCastOfSelf.scalar_type() == at::ScalarType::Half) {
     dtypeCastOfSelf = NPUNativeFunctions::npu_dtype_cast(dtypeCastOfSelf, at::ScalarType::Float);
   }
   auto outputSize = masked_select_npu_output_size(dtypeCastOfSelf, maskCast);
-- 
Gitee


From 48b1374750a4167e6d2d70220365c0a642081d47 Mon Sep 17 00:00:00 2001
From: shenpengcheng <shenpengcheng5@huawei.com>
Date: Tue, 8 Feb 2022 18:54:15 +0800
Subject: [PATCH 09/12] code fix2: call npu_dtype_cast via NPUNativeFunctions in masked_select_out

---
 torch_npu/csrc/aten/ops/MaskedSelectKernelNpu.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch_npu/csrc/aten/ops/MaskedSelectKernelNpu.cpp b/torch_npu/csrc/aten/ops/MaskedSelectKernelNpu.cpp
index 7f8cbab6da..46a21a0a55 100644
--- a/torch_npu/csrc/aten/ops/MaskedSelectKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/MaskedSelectKernelNpu.cpp
@@ -75,7 +75,7 @@ at::Tensor& NPUNativeFunctions::masked_select_out(
       .Call(result);
 
   if (result.scalar_type() != self.scalar_type()) {
-    result = result.npu_dtype_cast(at::ScalarType::Half);
+    result = NPUNativeFunctions::npu_dtype_cast(result, at::ScalarType::Half);
   }
   return result;
 }
-- 
Gitee


From 2a47e46dfc45f6ad60e79848d71bec496eb15315 Mon Sep 17 00:00:00 2001
From: shenpengcheng <shenpengcheng5@huawei.com>
Date: Tue, 8 Feb 2022 19:02:19 +0800
Subject: [PATCH 10/12] fix code check: rename input to input1, add spaces around '>'

---
 test/test_network_ops/test_masked_select.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/test/test_network_ops/test_masked_select.py b/test/test_network_ops/test_masked_select.py
index f5aeaf16f5..f741d72222 100644
--- a/test/test_network_ops/test_masked_select.py
+++ b/test/test_network_ops/test_masked_select.py
@@ -24,20 +24,20 @@ from torch_npu.testing.common_device_type import instantiate_device_type_tests
 from torch_npu.testing.util_test import create_common_tensor
 
 class TestMaskedSelect(TestCase):
-    def cpu_op_exec(self, input, mask):
-        output = torch.masked_select(input, mask)
+    def cpu_op_exec(self, input1, mask):
+        output = torch.masked_select(input1, mask)
         output = output.numpy()
         return output
 
-    def npu_op_exec(self, input, mask):
+    def npu_op_exec(self, input1, mask):
         mask = mask.to("npu")
-        output = torch.masked_select(input, mask)
+        output = torch.masked_select(input1, mask)
         output = output.to("cpu")
         output = output.numpy()
         return output
 
-    def npu_op_exec_out(self, input, mask, output):
-        output = torch.masked_select(input, mask, out=output)
+    def npu_op_exec_out(self, input1, mask, output):
+        output = torch.masked_select(input1, mask, out=output)
         return output.detach().to("cpu").numpy()
 
     def test_maskedselect_out_result(self, device):
@@ -57,8 +57,8 @@ class TestMaskedSelect(TestCase):
             cpu_input3, npu_input3 = create_common_tensor(item[1], -2, 2)
             if cpu_input1.dtype == torch.float16:
                 cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2.to(torch.int32)>0)
-            npu_output = self.npu_op_exec_out(npu_input1, npu_input2.to(torch.int32)>0, npu_input3)
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2.to(torch.int32) > 0)
+            npu_output = self.npu_op_exec_out(npu_input1, npu_input2.to(torch.int32) > 0, npu_input3)
             cpu_output = cpu_output.astype(npu_output.dtype)
             self.assertRtolEqual(cpu_output, npu_output)
 
-- 
Gitee


From 4f526756de03766002cfc53fb4ade32d80f86d82 Mon Sep 17 00:00:00 2001
From: shenpengcheng <shenpengcheng5@huawei.com>
Date: Tue, 8 Feb 2022 19:22:30 +0800
Subject: [PATCH 11/12] code fix3: whitespace-only change in test_masked_select.py

---
 test/test_network_ops/test_masked_select.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_network_ops/test_masked_select.py b/test/test_network_ops/test_masked_select.py
index f741d72222..69910ed236 100644
--- a/test/test_network_ops/test_masked_select.py
+++ b/test/test_network_ops/test_masked_select.py
@@ -22,7 +22,7 @@ import numpy as np
 from torch_npu.testing.common_utils import TestCase, run_tests
 from torch_npu.testing.common_device_type import instantiate_device_type_tests
 from torch_npu.testing.util_test import create_common_tensor
-
+ 
 class TestMaskedSelect(TestCase):
     def cpu_op_exec(self, input1, mask):
         output = torch.masked_select(input1, mask)
-- 
Gitee


From 1556be2550121c984470dc17fa2b15ee6669872b Mon Sep 17 00:00:00 2001
From: shenpengcheng <shenpengcheng5@huawei.com>
Date: Tue, 8 Feb 2022 19:56:23 +0800
Subject: [PATCH 12/12] cla fix: revert the whitespace-only change from patch 11

---
 test/test_network_ops/test_masked_select.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_network_ops/test_masked_select.py b/test/test_network_ops/test_masked_select.py
index 69910ed236..f741d72222 100644
--- a/test/test_network_ops/test_masked_select.py
+++ b/test/test_network_ops/test_masked_select.py
@@ -22,7 +22,7 @@ import numpy as np
 from torch_npu.testing.common_utils import TestCase, run_tests
 from torch_npu.testing.common_device_type import instantiate_device_type_tests
 from torch_npu.testing.util_test import create_common_tensor
- 
+
 class TestMaskedSelect(TestCase):
     def cpu_op_exec(self, input1, mask):
         output = torch.masked_select(input1, mask)
-- 
Gitee