diff --git a/test/test_trans_contiguous/test_combined_flatten_x__copy_to_contiguous.py b/test/test_trans_contiguous/test_combined_flatten_x__copy_to_contiguous.py
new file mode 100644
index 0000000000000000000000000000000000000000..b968da35139fd7c1e7cb51663f20b7903cf4b867
--- /dev/null
+++ b/test/test_trans_contiguous/test_combined_flatten_x__copy_to_contiguous.py
@@ -0,0 +1,85 @@
+# Copyright (c) 2020, Huawei Technologies. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import torch
+import torch_npu
+import numpy as np
+
+from torch_npu.testing.common_utils import TestCase, run_tests
+from torch_npu.testing.common_device_type import instantiate_device_type_tests
+from torch_npu.testing.util_test import create_common_tensor, check_operators_in_prof
+
+os.environ["COMBINED_ENABLE"] = "1"  # Enable combined-view cases optimization
+
+# Note: NPU only supports trans-contiguous with the base format, so format_list uses -1
+class CombinedFlattenXCopyToContiguous(TestCase):
+    def test_flatten_select_copy_contiguous(self, device):
+        dtype_list1 = [np.float16, np.float32]
+        format_list1 = [-1]
+        shape_list1 = [
+            [20, 30, 40, 16],
+        ]
+        shape_format1 = [
+            [i, j, k] for i in dtype_list1 for j in format_list1 for k in shape_list1
+        ]
+
+        for item in shape_format1:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            # case 1: flatten+select
+            with torch.autograd.profiler.profile(use_npu=True) as prof:
+                npu_out1 = npu_input.flatten(2).select(1, 1).contiguous()
+            self.assertEqual(check_operators_in_prof(['npuMatch', 'select_npuStridedSlice'], prof), \
+                True, "Error operators called!")
+            cpu_out1 = cpu_input.flatten(2).select(1, 1).contiguous()
+            self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy())
+            # case 2: select+flatten ==> can be optimized as a single select (npuCombined should not be called)
+            with torch.autograd.profiler.profile(use_npu=True) as prof:
+                npu_out2 = npu_input.select(2, 1).flatten(1).contiguous()
+            self.assertEqual(check_operators_in_prof(['select_npuStridedSlice'], prof, ['npuCombined']), \
+                True, "Error operators called!")
+            cpu_out2 = cpu_input.select(2, 1).flatten(1).contiguous()
+            self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy())
+
+    def test_flatten_strideslice_copy_contiguous(self, device):
+        dtype_list2 = [np.float16, np.float32]
+        format_list2 = [-1]
+        shape_list2 = [
+            [20, 30, 40, 16],
+        ]
+        shape_format2 = [
+            [i, j, k] for i in dtype_list2 for j in format_list2 for k in shape_list2
+        ]
+
+        for item in shape_format2:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            # case 1: flatten+strideslice ==> can be optimized as slice (contiguous with offset) + select
+            with torch.autograd.profiler.profile(use_npu=True) as prof:
+                npu_out1 = npu_input.flatten()[2:100:10].contiguous()
+            self.assertEqual(check_operators_in_prof(['View_d2dCopyAsync', 'select_npuStridedSlice'], prof), \
+                True, "Error operators called!")
+            cpu_out1 = cpu_input.flatten()[2:100:10].contiguous()
+            self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy())
+            # case 2: strideslice+flatten ==> can be optimized as a single strideslice (npuCombined should not be called)
+            with torch.autograd.profiler.profile(use_npu=True) as prof:
+                npu_out2 = npu_input[:,2:20:3].flatten().contiguous()
+            self.assertEqual(check_operators_in_prof(['npuStridedSlice'], prof, ['npuCombined']), \
+                True, "Error operators called!")
+            cpu_out2 = cpu_input[:,2:20:3].flatten().contiguous()
+            self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy())
+
+
+instantiate_device_type_tests(CombinedFlattenXCopyToContiguous, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
\ No newline at end of file
diff --git a/test/test_trans_contiguous/test_combined_reshape_x_copy_to_contiguous.py b/test/test_trans_contiguous/test_combined_reshape_x_copy_to_contiguous.py
new file mode 100644
index 0000000000000000000000000000000000000000..9aabeb1afde492925a663051eda7ec9153c5bef2
--- /dev/null
+++ b/test/test_trans_contiguous/test_combined_reshape_x_copy_to_contiguous.py
@@ -0,0 +1,158 @@
+# Copyright (c) 2020, Huawei Technologies. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import torch
+import torch_npu
+import numpy as np
+
+from torch_npu.testing.common_utils import TestCase, run_tests
+from torch_npu.testing.common_device_type import instantiate_device_type_tests
+from torch_npu.testing.util_test import create_common_tensor, check_operators_in_prof
+
+os.environ["COMBINED_ENABLE"] = "1"  # Enable combined-view cases optimization
+
+# Note: NPU only supports trans-contiguous with the base format, so format_list uses -1
+class CombinedReshapeXCopyToContiguous(TestCase):
+    def test_view_permute_copy_contiguous(self, device):
+        dtype_list1 = [np.float16, np.float32]
+        format_list1 = [-1]
+        shape_list1 = [
+            [200, 30, 40, 16],
+        ]
+        shape_format1 = [
+            [i, j, k] for i in dtype_list1 for j in format_list1 for k in shape_list1
+        ]
+
+        for item in shape_format1:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            # case 1: view+permute
+            with torch.autograd.profiler.profile(use_npu=True) as prof:
+                npu_out1 = npu_input \
+                    .view(npu_input.size(0) * npu_input.size(1), npu_input.size(2), npu_input.size(3)) \
+                    .transpose(0, 1) \
+                    .contiguous()
+            self.assertEqual(check_operators_in_prof(['npuMatch', 'npuTranspose'], prof), \
+                True, "Error operators called!")
+            cpu_out1 = cpu_input \
+                .view(cpu_input.size(0) * cpu_input.size(1), cpu_input.size(2), cpu_input.size(3)) \
+                .transpose(0, 1) \
+                .contiguous()
+            self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy())
+
+            # case 2: permute+view
+            with torch.autograd.profiler.profile(use_npu=True) as prof:
+                npu_out2 = npu_input \
+                    .permute(1, 0, 2, 3) \
+                    .view(npu_input.size(1), npu_input.size(0), npu_input.size(2) * npu_input.size(3)) \
+                    .contiguous()
+            self.assertEqual(check_operators_in_prof(['npuMatch', 'npuTranspose'], prof), \
+                True, "Error operators called!")
+            cpu_out2 = cpu_input \
+                .permute(1, 0, 2, 3) \
+                .view(cpu_input.size(1), cpu_input.size(0), cpu_input.size(2) * cpu_input.size(3)) \
+                .contiguous()
+            self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy())
+
+    def test_view_select_copy_contiguous(self, device):
+        dtype_list2 = [np.float16, np.float32]
+        format_list2 = [-1]
+        shape_list2 = [
+            [2, 3, 4, 5],
+        ]
+        shape_format2 = [
+            [i, j, k] for i in dtype_list2 for j in format_list2 for k in shape_list2
+        ]
+
+        for item in shape_format2:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            # case 1: view+select
+            with torch.autograd.profiler.profile(use_npu=True) as prof:
+                npu_out1 = npu_input \
+                    .view(npu_input.size(0), npu_input.size(1) * npu_input.size(2), npu_input.size(3)) \
+                    .select(2, 1) \
+                    .contiguous()
+            self.assertEqual(check_operators_in_prof(['npuMatch', 'select_npuStridedSlice'], prof), \
+                True, "Error operators called!")
+            cpu_out1 = cpu_input \
+                .view(cpu_input.size(0), cpu_input.size(1) * cpu_input.size(2), cpu_input.size(3)) \
+                .select(2, 1) \
+                .contiguous()
+            self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy())
+            # case 2: select+view ==> can be optimized as reshape+narrow
+            with torch.autograd.profiler.profile(use_npu=True) as prof:
+                npu_out2 = npu_input.select(2, 1).view(npu_input.size(1), npu_input.size(0), -1).contiguous()
+            self.assertEqual(check_operators_in_prof(['npuMatch', 'narrow_npuSlice'], prof), \
+                True, "Error operators called!")
+            cpu_out2 = cpu_input.select(2, 1).view(cpu_input.size(1), cpu_input.size(0), -1).contiguous()
+            self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy())
+
+    def test_view_narrow_copy_contiguous(self, device):
+        dtype_list3 = [np.float16, np.float32]
+        format_list3 = [-1]
+        shape_list3 = [
+            [20, 30, 40, 16],
+        ]
+        shape_format3 = [
+            [i, j, k] for i in dtype_list3 for j in format_list3 for k in shape_list3
+        ]
+
+        for item in shape_format3:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            # case 1: view + narrow
+            with torch.autograd.profiler.profile(use_npu=True) as prof:
+                npu_out1 = npu_input.view(20, 1200, 16)[:,20:150,:].contiguous()
+            self.assertEqual(check_operators_in_prof(['npuMatch', 'narrow_npuSlice'], prof), \
+                True, "Error operators called!")
+            cpu_out1 = cpu_input.view(20, 1200, 16)[:,20:150,:].contiguous()
+            self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy())
+            # case 2: narrow + view
+            with torch.autograd.profiler.profile(use_npu=True) as prof:
+                npu_out2 = npu_input[:,10:19,:,:].view(20, 360, 16).contiguous()
+            self.assertEqual(check_operators_in_prof(['npuMatch', 'narrow_npuSlice'], prof), \
+                True, "Error operators called!")
+            cpu_out2 = cpu_input[:,10:19,:,:].view(20, 360, 16).contiguous()
+            self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy())
+
+    def test_view_strideslice_copy_contiguous(self, device):
+        dtype_list4 = [np.float16, np.float32]
+        format_list4 = [-1]
+        shape_list4 = [
+            [20, 30, 40, 10],
+        ]
+        shape_format4 = [
+            [i, j, k] for i in dtype_list4 for j in format_list4 for k in shape_list4
+        ]
+
+        for item in shape_format4:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            # case 1: view + strideslice
+            with torch.autograd.profiler.profile(use_npu=True) as prof:
+                npu_out1 = npu_input.view(20, 1200, 10)[:,20:150:3,:].contiguous()
+            self.assertEqual(check_operators_in_prof(['npuMatch', 'narrow_npuSlice'], prof), \
+                True, "Error operators called!")
+            cpu_out1 = cpu_input.view(20, 1200, 10)[:,20:150:3,:].contiguous()
+            self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy())
+            # case 2: strideslice + view
+            with torch.autograd.profiler.profile(use_npu=True) as prof:
+                npu_out2 = npu_input[10:19:3,:,:].view(3, 2400, 5).contiguous()
+            self.assertEqual(check_operators_in_prof(['npuAsStrided'], prof), \
+                True, "Error operators called!")
+            cpu_out2 = cpu_input[10:19:3,:,:].view(3, 2400, 5).contiguous()
+            self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy())
+
+
+instantiate_device_type_tests(CombinedReshapeXCopyToContiguous, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
\ No newline at end of file
diff --git a/test/test_trans_contiguous/test_combined_squeeze_x_copy_to_contiguous.py b/test/test_trans_contiguous/test_combined_squeeze_x_copy_to_contiguous.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b9ba8e2aa5cf18c9cf2073c2c049c12dd55d965
--- /dev/null
+++ b/test/test_trans_contiguous/test_combined_squeeze_x_copy_to_contiguous.py
@@ -0,0 +1,141 @@
+# Copyright (c) 2020, Huawei Technologies. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import torch
+import torch_npu
+import numpy as np
+
+from torch_npu.testing.common_utils import TestCase, run_tests
+from torch_npu.testing.common_device_type import instantiate_device_type_tests
+from torch_npu.testing.util_test import create_common_tensor, check_operators_in_prof
+
+os.environ["COMBINED_ENABLE"] = "1"  # Enable combined-view cases optimization
+
+# Note: NPU only supports trans-contiguous with the base format, so format_list uses -1
+class CombinedSqueezeXCopyToContiguous(TestCase):
+    def test_squeeze_permute_copy_contiguous(self, device):
+        dtype_list1 = [np.float16, np.float32]
+        format_list1 = [-1]
+        shape_list1 = [
+            [2, 1, 3, 4],
+        ]
+        shape_format1 = [
+            [i, j, k] for i in dtype_list1 for j in format_list1 for k in shape_list1
+        ]
+
+        for item in shape_format1:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            # case 1: squeeze+permute ==> can be optimized as a single permute (npuCombined should not be called)
+            with torch.autograd.profiler.profile(use_npu=True) as prof:
+                npu_out1 = npu_input.squeeze(1).transpose(0, 1).contiguous()
+            self.assertEqual(check_operators_in_prof(['npuTranspose'], prof, ['npuCombined']), \
+                True, "Error operators called!")
+            cpu_out1 = cpu_input.squeeze(1).transpose(0, 1).contiguous()
+            self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy())
+
+            # case 2: permute+squeeze ==> can be optimized as a single permute (npuCombined should not be called)
+            with torch.autograd.profiler.profile(use_npu=True) as prof:
+                npu_out2 = npu_input.permute(1, 0, 3, 2).squeeze(0).contiguous()
+            self.assertEqual(check_operators_in_prof(['npuTranspose'], prof, ['npuCombined']), \
+                True, "Error operators called!")
+            cpu_out2 = cpu_input.permute(1, 0, 3, 2).squeeze(0).contiguous()
+            self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy())
+
+    def test_squeeze_narrow_copy_contiguous(self, device):
+        dtype_list2 = [np.float16, np.float32]
+        format_list2 = [-1]
+        shape_list2 = [
+            [20, 1, 30, 40, 16],
+            [20, 1, 30, 40]
+        ]
+        shape_format2 = [
+            [i, j, k] for i in dtype_list2 for j in format_list2 for k in shape_list2
+        ]
+
+        for item in shape_format2:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            # case 1: squeeze + narrow
+            with torch.autograd.profiler.profile(use_npu=True) as prof:
+                npu_out1 = npu_input.squeeze(1)[:,1:10,:].contiguous()
+            self.assertEqual(check_operators_in_prof(['npuMatch', 'narrow_npuSlice'], prof), \
+                True, "Error operators called!")
+            cpu_out1 = cpu_input.squeeze(1)[:,1:10,:].contiguous()
+            self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy())
+            # case 2: narrow + squeeze
+            with torch.autograd.profiler.profile(use_npu=True) as prof:
+                npu_out2 = npu_input[:,:,:,10:19].squeeze(1).contiguous()
+            self.assertEqual(check_operators_in_prof(['npuMatch', 'narrow_npuSlice'], prof), \
+                True, "Error operators called!")
+            cpu_out2 = cpu_input[:,:,:,10:19].squeeze(1).contiguous()
+            self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy())
+
+    def test_squeeze_select_copy_contiguous(self, device):
+        dtype_list3 = [np.float16, np.float32]
+        format_list3 = [-1]
+        shape_list3 = [
+            [20, 1, 40, 16],
+        ]
+        shape_format3 = [
+            [i, j, k] for i in dtype_list3 for j in format_list3 for k in shape_list3
+        ]
+
+        for item in shape_format3:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            # case 1: squeeze+select
+            with torch.autograd.profiler.profile(use_npu=True) as prof:
+                npu_out1 = npu_input.squeeze().select(2, 1).contiguous()
+            self.assertEqual(check_operators_in_prof(['npuMatch', 'select_npuStridedSlice'], prof), \
+                True, "Error operators called!")
+            cpu_out1 = cpu_input.squeeze().select(2, 1).contiguous()
+            self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy())
+            # case 2: select+squeeze
+            with torch.autograd.profiler.profile(use_npu=True) as prof:
+                npu_out2 = npu_input.select(2, 1).squeeze().contiguous()
+            self.assertEqual(check_operators_in_prof(['npuMatch', 'select_npuStridedSlice'], prof), \
+                True, "Error operators called!")
+            cpu_out2 = cpu_input.select(2, 1).squeeze().contiguous()
+            self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy())
+
+    def test_squeeze_strideslice_copy_contiguous(self, device):
+        dtype_list4 = [np.float16, np.float32]
+        format_list4 = [-1]
+        shape_list4 = [
+            [20, 1, 200, 40, 10],
+        ]
+        shape_format4 = [
+            [i, j, k] for i in dtype_list4 for j in format_list4 for k in shape_list4
+        ]
+
+        for item in shape_format4:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            # case 1: squeeze + strideslice ==> cannot be optimized (npuCombined should not be called)
+            with torch.autograd.profiler.profile(use_npu=True) as prof:
+                npu_out1 = npu_input.squeeze(1)[:,20:150:3].contiguous()
+            self.assertEqual(check_operators_in_prof(['npuAsStrided'], prof, ['npuCombined']), \
+                True, "Error operators called!")
+            cpu_out1 = cpu_input.squeeze(1)[:,20:150:3].contiguous()
+            self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy())
+            # case 2: strideslice + squeeze ==> cannot be optimized (npuCombined should not be called)
+            with torch.autograd.profiler.profile(use_npu=True) as prof:
+                npu_out2 = npu_input[:,:,10:19:3].squeeze(1).contiguous()
+            self.assertEqual(check_operators_in_prof(['npuAsStrided'], prof, ['npuCombined']), \
+                True, "Error operators called!")
+            cpu_out2 = cpu_input[:,:,10:19:3].squeeze(1).contiguous()
+            self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy())
+
+
+instantiate_device_type_tests(CombinedSqueezeXCopyToContiguous, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
\ No newline at end of file