diff --git a/test/test_trans_contiguous/test_combined_flatten_x__copy_to_contiguous.py b/test/test_trans_contiguous/test_combined_flatten_x__copy_to_contiguous.py new file mode 100644 index 0000000000000000000000000000000000000000..5211b303bafffbaaf15e1af02ab1fbe74ed5d02d --- /dev/null +++ b/test/test_trans_contiguous/test_combined_flatten_x__copy_to_contiguous.py @@ -0,0 +1,82 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import torch +import torch_npu +import numpy as np + +from torch_npu.testing.common_utils import TestCase, run_tests +from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.util_test import create_common_tensor, check_operators_in_prof + +os.environ["COMBINED_ENABLE"] = "1" # Open combined-view cases optimization +os.environ["PTCOPY_ENABLE"] = "1" + +# Note: NPU only support trans-contiguous with base format, so format_list uses -1 +class CombinedFlattenXCopyToContiguous(TestCase): + def test_flatten_select_copy_contiguous(self, device): + dtype_list = [np.float16, np.float32] + format_list = [-1] + shape_list = [ + [20, 30, 40, 16], + ] + shape_format = [ + [i, j, k] for i in dtype_list for j in format_list for k in shape_list + ] + + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + # case 1: flatten+select + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out1 = 
npu_input.flatten(2).select(1,1).contiguous() + self.assertEqual(check_operators_in_prof(['npuMatch', 'select_npuStridedSlice'], prof), True, "Error operators called!") + cpu_out1 = cpu_input.flatten(2).select(1,1).contiguous() + self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) + # case 2: select+flatten == can be optimized as single select(npuCombined should not be called) + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out2 = npu_input.select(2,1).flatten(1).contiguous() + self.assertEqual(check_operators_in_prof(['select_npuStridedSlice'], prof, ['npuCombined']), True, "Error operators called!") + cpu_out2 = cpu_input.select(2,1).flatten(1).contiguous() + self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) + + def test_flatten_strideslice_copy_contiguous(self, device): + dtype_list = [np.float16, np.float32] + format_list = [-1] + shape_list = [ + [20, 30, 40, 16], + ] + shape_format = [ + [i, j, k] for i in dtype_list for j in format_list for k in shape_list + ] + + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + # case 1: flatten+strideslice ==> can be optimized as slice(contiguous with offset) + select + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out1 = npu_input.flatten()[2:100:10].contiguous() + self.assertEqual(check_operators_in_prof(['View_d2dCopyAsync', 'select_npuStridedSlice'], prof), True, "Error operators called!") + cpu_out1 = cpu_input.flatten()[2:100:10].contiguous() + self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) + # case 2: strideslice+flatten==> can be optimized as single strideslice(npuCombined should not be called) + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out2 = npu_input[:,2:20:3].flatten().contiguous() + self.assertEqual(check_operators_in_prof(['npuStridedSlice'], prof, ['npuCombined']), True, "Error operators called!") + cpu_out2 = cpu_input[:,2:20:3].flatten().contiguous() + 
self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) + + +instantiate_device_type_tests(CombinedFlattenXCopyToContiguous, globals(), except_for='cpu') +if __name__ == "__main__": + run_tests() \ No newline at end of file diff --git a/test/test_trans_contiguous/test_combined_reshape_x_copy_to_contiguous.py b/test/test_trans_contiguous/test_combined_reshape_x_copy_to_contiguous.py new file mode 100644 index 0000000000000000000000000000000000000000..f1f41538aee0fd0d0b951af1d3bc2074e2fc8c3e --- /dev/null +++ b/test/test_trans_contiguous/test_combined_reshape_x_copy_to_contiguous.py @@ -0,0 +1,133 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import torch +import torch_npu +import numpy as np + +from torch_npu.testing.common_utils import TestCase, run_tests +from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.util_test import create_common_tensor, check_operators_in_prof + +os.environ["COMBINED_ENABLE"] = "1" # Open combined-view cases optimization +os.environ["PTCOPY_ENABLE"] = "1" + +# Note: NPU only support trans-contiguous with base format, so format_list uses -1 +class CombinedReshapeXCopyToContiguous(TestCase): + def test_view_permute_copy_contiguous(self, device): + dtype_list = [np.float16, np.float32] + format_list = [-1] + shape_list = [ + [200, 30, 40, 16], + ] + shape_format = [ + [i, j, k] for i in dtype_list for j in format_list for k in shape_list + ] + + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + # case 1: view+permute + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out1 = npu_input.view(npu_input.size(0) * npu_input.size(1), npu_input.size(2), npu_input.size(3)).transpose(0, 1).contiguous() + self.assertEqual(check_operators_in_prof(['npuMatch', 'npuTranspose'], prof), True, "Error operators called!") + cpu_out1 = cpu_input.view(cpu_input.size(0) * cpu_input.size(1), cpu_input.size(2), cpu_input.size(3)).transpose(0, 1).contiguous() + self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) + + # case 2: permute+view + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out2 = npu_input.permute(1, 0, 2, 3).view(npu_input.size(1), npu_input.size(0), npu_input.size(2)*npu_input.size(3)).contiguous() + self.assertEqual(check_operators_in_prof(['npuMatch', 'npuTranspose'], prof), True, "Error operators called!") + cpu_out2 = cpu_input.permute(1, 0, 2, 3).view(cpu_input.size(1), cpu_input.size(0), cpu_input.size(2)*cpu_input.size(3)).contiguous() + self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) + + def 
test_view_select_copy_contiguous(self, device): + dtype_list = [np.float16, np.float32] + format_list = [-1] + shape_list = [ + [2, 3, 4, 5], + ] + shape_format = [ + [i, j, k] for i in dtype_list for j in format_list for k in shape_list + ] + + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + # case 1: view+select + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out1 = npu_input.view(npu_input.size(0), npu_input.size(1) * npu_input.size(2), npu_input.size(3)).select(2, 1).contiguous() + self.assertEqual(check_operators_in_prof(['npuMatch', 'select_npuStridedSlice'], prof), True, "Error operators called!") + cpu_out1 = cpu_input.view(npu_input.size(0), npu_input.size(1) * npu_input.size(2), npu_input.size(3)).select(2, 1).contiguous() + self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) + # case 2: select+view ==> can be optimized as reshape+narrow + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out2 = npu_input.select(2, 1).view(npu_input.size(1), npu_input.size(0), -1).contiguous() + self.assertEqual(check_operators_in_prof(['npuMatch', 'narrow_npuSlice'], prof), True, "Error operators called!") + cpu_out2 = cpu_input.select(2, 1).view(npu_input.size(1), npu_input.size(0), -1).contiguous() + self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) + + def test_view_narrow_copy_contiguous(self, device): + dtype_list = [np.float16, np.float32] + format_list = [-1] + shape_list = [ + [20, 30, 40, 16], + ] + shape_format = [ + [i, j, k] for i in dtype_list for j in format_list for k in shape_list + ] + + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + # case 1: view + narrow + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out1 = npu_input.view(20, 1200, 16)[:,20:150,:].contiguous() + self.assertEqual(check_operators_in_prof(['npuMatch', 'narrow_npuSlice'], prof), True, "Error operators called!") + cpu_out1 
= cpu_input.view(20, 1200, 16)[:,20:150,:].contiguous() + self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) + # case 2: narrow + view + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out2 = npu_input[:,10:19,:,:].view(20, 360, 16).contiguous() + self.assertEqual(check_operators_in_prof(['npuMatch', 'narrow_npuSlice'], prof), True, "Error operators called!") + cpu_out2 = cpu_input[:,10:19,:,:].view(20, 360, 16).contiguous() + self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) + + def test_view_strideslice_copy_contiguous(self, device): + dtype_list = [np.float16, np.float32] + format_list = [-1] + shape_list = [ + [20, 30, 40, 10], + ] + shape_format = [ + [i, j, k] for i in dtype_list for j in format_list for k in shape_list + ] + + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + # case 1: view + strideslice + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out1 = npu_input.view(20, 1200, 10)[:,20:150:3,:].contiguous() + self.assertEqual(check_operators_in_prof(['npuMatch', 'narrow_npuSlice'], prof), True, "Error operators called!") + cpu_out1 = cpu_input.view(20, 1200, 10)[:,20:150:3,:].contiguous() + self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) + # case 2: strideslice + view + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out2 = npu_input[10:19:3,:,:].view(3, 2400, 5).contiguous() + self.assertEqual(check_operators_in_prof(['d2dCopyWithPTCopy'], prof), True, "Error operators called!") + cpu_out2 = cpu_input[10:19:3,:,:].view(3, 2400, 5).contiguous() + self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) + + +instantiate_device_type_tests(CombinedReshapeXCopyToContiguous, globals(), except_for='cpu') +if __name__ == "__main__": + run_tests() \ No newline at end of file diff --git a/test/test_trans_contiguous/test_combined_squeeze_x_copy_to_contiguous.py 
b/test/test_trans_contiguous/test_combined_squeeze_x_copy_to_contiguous.py new file mode 100644 index 0000000000000000000000000000000000000000..0184c215a941a392c654730e76c92e6f5c8de0c7 --- /dev/null +++ b/test/test_trans_contiguous/test_combined_squeeze_x_copy_to_contiguous.py @@ -0,0 +1,134 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import torch +import torch_npu +import numpy as np + +from torch_npu.testing.common_utils import TestCase, run_tests +from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.util_test import create_common_tensor, check_operators_in_prof + +os.environ["COMBINED_ENABLE"] = "1" # Open combined-view cases optimization +os.environ["PTCOPY_ENABLE"] = "1" + +# Note: NPU only support trans-contiguous with base format, so format_list uses -1 +class CombinedSqueezeXCopyToContiguous(TestCase): + def test_squeeze_permute_copy_contiguous(self, device): + dtype_list = [np.float16, np.float32] + format_list = [-1] + shape_list = [ + [2, 1, 3, 4], + ] + shape_format = [ + [i, j, k] for i in dtype_list for j in format_list for k in shape_list + ] + + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + # case 1: squeeze+permute ==> can be optimized as single permute(npuCombined should not be called) + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out1 = 
npu_input.squeeze(1).transpose(0,1).contiguous() + self.assertEqual(check_operators_in_prof(['npuTranspose'], prof, ['npuCombined']), True, "Error operators called!") + cpu_out1 = cpu_input.squeeze(1).transpose(0,1).contiguous() + self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) + + # case 2: permute+squeeze ==> can be optimized as single permute(npuCombined should not be called) + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out2 = npu_input.permute(1,0,3,2).squeeze(0).contiguous() + self.assertEqual(check_operators_in_prof(['npuTranspose'], prof, ['npuCombined']), True, "Error operators called!") + cpu_out2 = cpu_input.permute(1,0,3,2).squeeze(0).contiguous() + self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) + + def test_squeeze_narrow_copy_contiguous(self, device): + dtype_list = [np.float16, np.float32] + format_list = [-1] + shape_list = [ + [20, 1, 30, 40, 16], + [20, 1, 30, 40] + ] + shape_format = [ + [i, j, k] for i in dtype_list for j in format_list for k in shape_list + ] + + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + # case 1: squeeze + narrow + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out1 = npu_input.squeeze(1)[:,1:10,:].contiguous() + self.assertEqual(check_operators_in_prof(['npuMatch', 'narrow_npuSlice'], prof), True, "Error operators called!") + cpu_out1 = cpu_input.squeeze(1)[:,1:10,:].contiguous() + self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) + # case 2: narrow + squeeze + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out2 = npu_input[:,:,:,10:19].squeeze(1).contiguous() + self.assertEqual(check_operators_in_prof(['npuMatch', 'narrow_npuSlice'], prof), True, "Error operators called!") + cpu_out2 = cpu_input[:,:,:,10:19].squeeze(1).contiguous() + self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) + + def test_squeeze_select_copy_contiguous(self, device): + 
dtype_list = [np.float16, np.float32] + format_list = [-1] + shape_list = [ + [20, 1, 40, 16], + ] + shape_format = [ + [i, j, k] for i in dtype_list for j in format_list for k in shape_list + ] + + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + # case 1: squeeze+select + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out1 = npu_input.squeeze().select(2,1).contiguous() + self.assertEqual(check_operators_in_prof(['npuMatch', 'select_npuStridedSlice'], prof), True, "Error operators called!") + cpu_out1 = cpu_input.squeeze().select(2,1).contiguous() + self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) + # case 2: select+squeeze + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out2 = npu_input.select(2,1).squeeze().contiguous() + self.assertEqual(check_operators_in_prof(['npuMatch', 'select_npuStridedSlice'], prof), True, "Error operators called!") + cpu_out2 = cpu_input.select(2,1).squeeze().contiguous() + self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) + + def test_squeeze_strideslice_copy_contiguous(self, device): + dtype_list = [np.float16, np.float32] + format_list = [-1] + shape_list = [ + [20, 1, 200, 40, 10], + ] + shape_format = [ + [i, j, k] for i in dtype_list for j in format_list for k in shape_list + ] + + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + # case 1: squeeze + strideslice ==> cannot be optimized(npuCombined should not called) + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out1 = npu_input.squeeze(1)[:,20:150:3].contiguous() + self.assertEqual(check_operators_in_prof(['d2dCopyWithPTCopy'], prof, ['npuCombined']), True, "Error operators called!") + cpu_out1 = cpu_input.squeeze(1)[:,20:150:3].contiguous() + self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) + # case 2: strideslice + squeeze ==> cannot be optimized(npuCombined should not called) + with 
torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out2 = npu_input[:,:,10:19:3].squeeze(1).contiguous() + self.assertEqual(check_operators_in_prof(['d2dCopyWithPTCopy'], prof, ['npuCombined']), True, "Error operators called!") + cpu_out2 = cpu_input[:,:,10:19:3].squeeze(1).contiguous() + self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) + + +instantiate_device_type_tests(CombinedSqueezeXCopyToContiguous, globals(), except_for='cpu') +if __name__ == "__main__": + run_tests() \ No newline at end of file diff --git a/test/test_trans_contiguous/test_combined_unsqueeze_x_copy_to_contiguous.py b/test/test_trans_contiguous/test_combined_unsqueeze_x_copy_to_contiguous.py new file mode 100644 index 0000000000000000000000000000000000000000..f5fb5538623afdd32c1c3ad1570e8392eb29cf18 --- /dev/null +++ b/test/test_trans_contiguous/test_combined_unsqueeze_x_copy_to_contiguous.py @@ -0,0 +1,167 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import torch +import torch_npu +import numpy as np + +from torch_npu.testing.common_utils import TestCase, run_tests +from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.util_test import create_common_tensor, check_operators_in_prof + +os.environ["COMBINED_ENABLE"] = "1" # Open combined-view cases optimization +os.environ["PTCOPY_ENABLE"] = "1" + +# Note: NPU only support trans-contiguous with base format, so format_list uses -1 +class CombinedUnsqueezeXCopyToContiguous(TestCase): + def test_unsqueeze_permute_copy_contiguous(self, device): + dtype_list = [np.float16, np.float32] + format_list = [-1] + shape_list = [ + [2, 3, 4, 5], + ] + shape_format = [ + [i, j, k] for i in dtype_list for j in format_list for k in shape_list + ] + + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + # case 1: unsqueeze+permute ==> can be optimized as single permute(npuCombined should not be called) + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out1 = npu_input.unsqueeze(1).transpose(2,3).contiguous() + self.assertEqual(check_operators_in_prof(['npuTranspose'], prof, ['npuCombined']), True, "Error operators called!") + cpu_out1 = cpu_input.unsqueeze(1).transpose(2,3).contiguous() + self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) + + # case 2: permute+unsqueeze ==> can be optimized as single permute(npuCombined should not be called) + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out2 = npu_input.permute(1,0,2,3).unsqueeze(0).contiguous() + self.assertEqual(check_operators_in_prof(['npuTranspose'], prof, ['npuCombined']), True, "Error operators called!") + cpu_out2 = cpu_input.permute(1,0,2,3).unsqueeze(0).contiguous() + self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) + + def test_unsqueeze_narrow_copy_contiguous(self, device): + dtype_list = [np.float16, np.float32] + format_list = [-1] + shape_list = [ 
+ #3D-->4D-->3D + [20, 30, 40], + #4D-->5D-->4D test memory allocation + [2, 300, 400, 500], + #5D-->6D-->5D + [20, 30, 40, 50, 60] + ] + shape_format = [ + [i, j, k] for i in dtype_list for j in format_list for k in shape_list + ] + + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + # case 1: unsqueeze+narrow + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out1 = npu_input.unsqueeze(0)[:,:,1:10].contiguous() + self.assertEqual(check_operators_in_prof(['npuMatch', 'narrow_npuSlice'], prof), True, "Error operators called!") + cpu_out1 = cpu_input.unsqueeze(0)[:,:,1:10].contiguous() + self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) + # case 2: narrow+unsqueeze + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out2 = npu_input[:,1:10].unsqueeze(2).contiguous() + self.assertEqual(check_operators_in_prof(['npuMatch', 'narrow_npuSlice'], prof), True, "Error operators called!") + cpu_out2 = cpu_input[:,1:10].unsqueeze(2).contiguous() + self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) + + def test_unsqueeze_select_copy_contiguous(self, device): + dtype_list = [np.float16, np.float32] + format_list = [-1] + shape_list = [ + [2, 3, 4], + [2, 300, 400, 500], + [2, 3, 4, 5, 6] + ] + shape_format = [ + [i, j, k] for i in dtype_list for j in format_list for k in shape_list + ] + + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + # case 1: unsqueeze+select + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out1 = npu_input.unsqueeze(0).select(2,1).contiguous() + cpu_out1 = cpu_input.unsqueeze(0).select(2,1).contiguous() + self.assertEqual(check_operators_in_prof(['npuMatch', 'narrow_npuSlice'], prof), True, "Error operators called!") + self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) + # case 2: select+unsqueeze + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out2 = 
npu_input.select(1,1).unsqueeze(0).contiguous() + self.assertEqual(check_operators_in_prof(['npuMatch', 'narrow_npuSlice'], prof), True, "Error operators called!") + cpu_out2 = cpu_input.select(1,1).unsqueeze(0).contiguous() + self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) + + def test_unsqueeze_unfold_copy_contiguous(self, device): + dtype_list = [np.float16, np.float32] + format_list = [-1] + shape_list = [ + [4, 2, 4], + ] + shape_format = [ + [i, j, k] for i in dtype_list for j in format_list for k in shape_list + ] + + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + # case 1: unsqueeze+unfold:size==step ==> can be optimized as reshape+permute + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out1 = npu_input.unsqueeze(1).unfold(0,2,2).contiguous() + self.assertEqual(check_operators_in_prof(['npuMatch', 'npuTranspose'], prof), True, "Error operators called!") + cpu_out1 = cpu_input.unsqueeze(1).unfold(0,2,2).contiguous() + self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) + + # case 2: unfold+unsqueeze: size!=step ==> cannot be optimized(npuCombined should not be called) + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out2 = npu_input.unfold(2,2,3).unsqueeze(1).contiguous() + self.assertEqual(check_operators_in_prof(['d2dCopyWithPTCopy'], prof, ['npuCombined']), True, "Error operators called!") + cpu_out2 = cpu_input.unfold(2,2,3).unsqueeze(1).contiguous() + self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) + + def test_unsqueeze_strideslice_copy_contiguous(self, device): + dtype_list = [np.float16, np.float32] + format_list = [-1] + shape_list = [ + [20, 200, 40], + [20, 200, 40, 10] + ] + shape_format = [ + [i, j, k] for i in dtype_list for j in format_list for k in shape_list + ] + + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + # case 1: unsqueeze + strideslice ==> cannot be
optimized(npuCombined should not be called) + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out1 = npu_input.unsqueeze(1)[:,:,20:150:3].contiguous() + self.assertEqual(check_operators_in_prof(['d2dCopyWithPTCopy'], prof, ['npuCombined']), True, "Error operators called!") + cpu_out1 = cpu_input.unsqueeze(1)[:,:,20:150:3].contiguous() + self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) + # case 2: strideslice + unsqueeze ==> cannot be optimized(npuCombined should not be called) + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out2 = npu_input[:,:,10:19:3].unsqueeze(0).contiguous() + self.assertEqual(check_operators_in_prof(['d2dCopyWithPTCopy'], prof, ['npuCombined']), True, "Error operators called!") + cpu_out2 = cpu_input[:,:,10:19:3].unsqueeze(0).contiguous() + self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) + + +instantiate_device_type_tests(CombinedUnsqueezeXCopyToContiguous, globals(), except_for='cpu') +if __name__ == "__main__": + run_tests() \ No newline at end of file diff --git a/test/test_trans_contiguous/test_combined_views_copy_to_contiguous.py b/test/test_trans_contiguous/test_combined_views_copy_to_contiguous.py new file mode 100644 index 0000000000000000000000000000000000000000..73499a04b88189079e29f5194e887329c9966b23 --- /dev/null +++ b/test/test_trans_contiguous/test_combined_views_copy_to_contiguous.py @@ -0,0 +1,254 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import torch +import torch_npu +import numpy as np + +from torch_npu.testing.common_utils import TestCase, run_tests +from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.util_test import create_common_tensor, create_common_tensor_for_broadcast, check_operators_in_prof + +os.environ["COMBINED_ENABLE"] = "1" # Open combined-view cases optimization +os.environ["PTCOPY_ENABLE"] = "1" + +# Note: NPU only support trans-contiguous with base format, so format_list uses -1 +class CombinedViewsCopyToContiguous(TestCase): + def test_permute_narrow_copy_contiguous(self, device): + dtype_list = [np.float16] + format_list = [-1] + shape_list = [ + [20, 30, 40, 50], + ] + shape_format = [ + [i, j, k] for i in dtype_list for j in format_list for k in shape_list + ] + + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + # case 1: permute+narrow + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out1 = npu_input.permute(1,3,2,0)[:10].contiguous() + self.assertEqual(check_operators_in_prof(['narrow_npuSlice', 'npuTranspose'], prof), True, "Error operators called!") + cpu_out1 = cpu_input.permute(1,3,2,0)[:10].contiguous() + self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) + + # case 2: narrow+permute + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out2 = npu_input[:,1:10].permute(1,0,3,2).contiguous() + self.assertEqual(check_operators_in_prof(['narrow_npuSlice', 'npuTranspose'], prof), True, "Error operators called!") + cpu_out2 = cpu_input[:,1:10].permute(1,0,3,2).contiguous() + self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) + + def test_permute_select_copy_contiguous(self, device): + dtype_list = [np.float32] + format_list = [-1] + shape_list = [ + [20, 30, 40, 50], + ] + shape_format = [ + [i, j, k] for 
i in dtype_list for j in format_list for k in shape_list + ] + + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + # case 1: permute+select + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out1 = npu_input.permute(1,3,2,0).select(1,2).contiguous() + self.assertEqual(check_operators_in_prof(['select_npuStridedSlice', 'npuTranspose'], prof), True, "Error operators called!") + cpu_out1 = cpu_input.permute(1,3,2,0).select(1,2).contiguous() + self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) + + # case 2: select+permute + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out2 = npu_input.select(1,0).permute(1,0,2).contiguous() + self.assertEqual(check_operators_in_prof(['select_npuStridedSlice', 'npuTranspose'], prof), True, "Error operators called!") + cpu_out2 = cpu_input.select(1,0).permute(1,0,2).contiguous() + self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) + + def test_permute_strideslice_copy_contiguous(self, device): + dtype_list = [np.float16] + format_list = [-1] + shape_list = [ + [20, 30, 40, 50], + ] + shape_format = [ + [i, j, k] for i in dtype_list for j in format_list for k in shape_list + ] + + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + # case 1: permute+strideslice-no offset ==> all cannot be optimized(npuCombined should not be called) + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out1 = npu_input.permute(1,3,2,0)[::2].contiguous() + self.assertEqual(check_operators_in_prof(['d2dCopyWithPTCopy'], prof, ['npuCombined']), True, "Error operators called!") + cpu_out1 = cpu_input.permute(1,3,2,0)[::2].contiguous() + self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) + + # case 2: strideslice+permute-with offset ==> all cannot be optimized(npuCombined should not be called) + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out2 = 
npu_input[:,1:10:3].permute(1,3,0,2).contiguous() + self.assertEqual(check_operators_in_prof(['d2dCopyWithPTCopy'], prof, ['npuCombined']), True, "Error operators called!") + cpu_out2 = cpu_input[:,1:10:3].permute(1,3,0,2).contiguous() + self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) + + def test_narrow_select_copy_contiguous(self, device): + dtype_list = [np.float16, np.float32] + format_list = [0, 3, 29] + shape_list = [ + [20, 30, 40, 16], + ] + shape_format = [ + [i, j, k] for i in dtype_list for j in format_list for k in shape_list + ] + + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + # case 1: narrow+select + # narrow at any dim + select the last dim ==> narrow + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out1 = npu_input[:,2:4].select(3,1).contiguous() + self.assertEqual(check_operators_in_prof(['narrow_npuSlice'], prof), True, "Error operators called!") + cpu_out1 = cpu_input[:,2:4].select(3,1).contiguous() + # narrow at 0 dim + select the any dim ==> common copy + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out2 = npu_input[2:4].select(2,2).contiguous() + self.assertEqual(check_operators_in_prof(['d2dCopyWithPTCopy'], prof), True, "Error operators called!") + cpu_out2 = cpu_input[2:4].select(2,2).contiguous() + self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) + self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) + # case 2: select+narrow + # select the 0 dim + narrow at the 1 dim ==> reshape + select + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out3 = npu_input.select(0,2)[:,1:2].contiguous() + self.assertEqual(check_operators_in_prof(['npuMatch', 'narrow_npuSlice'], prof), True, "Error operators called!") + cpu_out3 = cpu_input.select(0,2)[:,1:2].contiguous() + # select the 0 dim + narrow at the last dim ==> reshape + select + with torch.autograd.profiler.profile(use_npu=True) as prof: + 
npu_out4 = npu_input.select(0,1)[:,:,1:2].contiguous() + self.assertEqual(check_operators_in_prof(['npuMatch', 'narrow_npuSlice'], prof), True, "Error operators called!") + cpu_out4 = cpu_input.select(0,1)[:,:,1:2].contiguous() + + self.assertRtolEqual(npu_out3.to("cpu").numpy(), cpu_out3.numpy()) + self.assertRtolEqual(npu_out4.to("cpu").numpy(), cpu_out4.numpy()) + + def test_narrow_strideslice_copy_contiguous(self, device): + dtype_list = [np.float32] + format_list = [-1] + shape_list = [ + [20, 30, 40, 16], + ] + shape_format = [ + [i, j, k] for i in dtype_list for j in format_list for k in shape_list + ] + + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + # case 1: narrow+strideslice + # slice at adjacent axes + strideslice at lower dim ==> cannot be optimized(npuCombined is called) + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out1 = npu_input[2:4,::2].contiguous() + self.assertEqual(check_operators_in_prof(['d2dCopyWithPTCopy'], prof), True, "Error operators called!") + cpu_out1 = cpu_input[2:4,::2].contiguous() + # strideslice at last dim ==> cannot be optimized(npuCombined should not be called) + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out2 = npu_input[:,2:4,:,1:10:2].contiguous() + self.assertEqual(check_operators_in_prof(['d2dCopyWithPTCopy'], prof, ['npuCombined']), True, "Error operators called!") + cpu_out2 = cpu_input[:,2:4,:,1:10:2].contiguous() + # narrow at 0 dim and strideslice at last dim==> can be optimized as slice(contiguous)+select + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out3 = npu_input[2:4,:,:,::2].contiguous() + self.assertEqual(check_operators_in_prof(['View_d2dCopyAsync', 'select_npuStridedSlice'], prof), True, "Error operators called!") + cpu_out3 = cpu_input[2:4,:,:,::2].contiguous() + self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) + self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) + 
self.assertRtolEqual(npu_out3.to("cpu").numpy(), cpu_out3.numpy()) + + # case 2: strideslice+narrow + # slice at adjacent axes + strideslice at higher dim ==> reshape+narrow + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out4 = npu_input[1:10:2,1:10].contiguous() + self.assertEqual(check_operators_in_prof(['npuMatch', 'narrow_npuSlice'], prof), True, "Error operators called!") + cpu_out4 = cpu_input[1:10:2,1:10].contiguous() + # slice at non-adjacent axes + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out5 = npu_input[::2,:,1:10].contiguous() + self.assertEqual(check_operators_in_prof(['npuMatch', 'narrow_npuSlice'], prof), True, "Error operators called!") + cpu_out5 = cpu_input[::2,:,1:10].contiguous() + self.assertRtolEqual(npu_out4.to("cpu").numpy(), cpu_out4.numpy()) + self.assertRtolEqual(npu_out5.to("cpu").numpy(), cpu_out5.numpy()) + + def test_strideslice_select_contiguous(self, device): + dtype_list = [np.float16] + format_list = [-1] + shape_list = [ + [20, 30, 40, 16], + ] + shape_format = [ + [i, j, k] for i in dtype_list for j in format_list for k in shape_list + ] + + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + # case 1: strideslice+select + # select at last dim ==> cannot be optimized(npuCombined is called) + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out1 = npu_input[:10:2].select(3,1).contiguous() + self.assertEqual(check_operators_in_prof(['d2dCopyWithPTCopy'], prof), True, "Error operators called!") + cpu_out1 = cpu_input[:10:2].select(3,1).contiguous() + # select at lower dims except last dim ==> reshape+narrow + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out2 = npu_input[1:10:2].select(2,1).contiguous() + cpu_out2 = cpu_input[1:10:2].select(2,1).contiguous() + self.assertEqual(check_operators_in_prof(['npuMatch', 'narrow_npuSlice'], prof), True, "Error operators called!") + 
self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) + self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) + # case 2: select+strideslice + # strideslice at lower dims except last dim ==> reshape+narrow + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out3 = npu_input.select(0,1)[1:10:2].contiguous() + self.assertEqual(check_operators_in_prof(['npuMatch', 'narrow_npuSlice'], prof), True, "Error operators called!") + cpu_out3 = cpu_input.select(0,1)[1:10:2].contiguous() + # strideslice at the last dim ==> cannot be optimized(npuCombined should not be called) + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out4 = npu_input.select(0,1)[:,:,::3].contiguous() + self.assertEqual(check_operators_in_prof(['d2dCopyWithPTCopy'], prof, ['npuCombined']), True, "Error operators called!") + cpu_out4 = cpu_input.select(0,1)[:,:,::3].contiguous() + self.assertRtolEqual(npu_out3.to("cpu").numpy(), cpu_out3.numpy()) + self.assertRtolEqual(npu_out4.to("cpu").numpy(), cpu_out4.numpy()) + + def test_broadcast_permute_contiguous(self, device): + dtype_list = [np.float16, np.float32] + format_list = [-1] + shape_list = [ + [[2, 1, 3], [1, 2, 4, 3]], + [[2, 1, 3], [5, 2, 4, 3]], + ] + shape_format = [ + [i, j, k] for i in dtype_list for j in format_list for k in shape_list + ] + + for item in shape_format: + cpu_input, npu_input = create_common_tensor_for_broadcast(item, 0, 100) + # Broadcast + permute all cannot be optimized(npuCombined should not be called) + with torch.autograd.profiler.profile(use_npu=True) as prof: + npu_out1 = npu_input.expand(item[2][1]).transpose(1,3).contiguous() + self.assertEqual(check_operators_in_prof(['d2dCopyWithPTCopy'], prof, ['npuCombined']), True, "Error operators called!") + cpu_out1 = cpu_input.expand(item[2][1]).transpose(1,3).contiguous() + self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) + +instantiate_device_type_tests(CombinedViewsCopyToContiguous, globals(), 
except_for='cpu') +if __name__ == "__main__": + run_tests() \ No newline at end of file