From b0f5f375f0d41e4cf7c04feefb256e7a7786c10e Mon Sep 17 00:00:00 2001 From: yuhaiyan Date: Fri, 29 Aug 2025 14:49:49 +0800 Subject: [PATCH] Update test_fault_mode.py --- test/distributed/test_fault_mode.py | 2 +- test/npu/test_fault_mode.py | 36 +++++++++++++++++++++-------- 2 files changed, 27 insertions(+), 11 deletions(-) diff --git a/test/distributed/test_fault_mode.py b/test/distributed/test_fault_mode.py index bb54d548bfe..da19b2197fd 100644 --- a/test/distributed/test_fault_mode.py +++ b/test/distributed/test_fault_mode.py @@ -145,7 +145,7 @@ class TestMode(TestCase): process.terminate() process.wait() self.assertIn( - "EI0002", + "The wait execution of the Notify register times out", message ) diff --git a/test/npu/test_fault_mode.py b/test/npu/test_fault_mode.py index 6877fced8f5..bd1edeec810 100644 --- a/test/npu/test_fault_mode.py +++ b/test/npu/test_fault_mode.py @@ -1,6 +1,6 @@ import os import subprocess - +import traceback import torch from torch.testing._internal.common_utils import TestCase, run_tests from torch.utils.checkpoint import checkpoint @@ -55,7 +55,7 @@ class TestMode(TestCase): torch.npu.set_option(option) def test_set_device(self): - with self.assertRaisesRegex(RuntimeError, "Invalid device ID.\n.+Check whether the device ID is valid."): + with self.assertRaisesRegex(RuntimeError, "The argument is invalid.+Set device failed, invalid device"): torch.npu.set_device(8) def test_distributed_init_param(self): @@ -141,23 +141,43 @@ class TestMode(TestCase): def test_max_memory_allocated(self): x = torch.tensor(2).npu() - with self.assertRaisesRegex(RuntimeError, "Invalid device argument"): + traceback_str = "" + try: torch.npu.max_memory_allocated(device="npu:8") + except Exception as e: + traceback_str = traceback.format_exc() + self.assertIn("in max_memory_allocated", traceback_str) + self.assertIn("Invalid device argument", traceback_str) def test_memory_allocated(self): x = torch.tensor(2).npu() - with self.assertRaisesRegex(RuntimeError, "Invalid device argument"): + traceback_str = "" + try: torch.npu.memory_allocated(device="npu:8") + except Exception as e: + traceback_str = traceback.format_exc() + self.assertIn("in memory_allocated", traceback_str) + self.assertIn("Invalid device argument", traceback_str) def test_memory_reserved(self): x = torch.tensor(2).npu() - with self.assertRaisesRegex(RuntimeError, "Invalid device argument"): + traceback_str = "" + try: torch.npu.memory_reserved(device="npu:8") + except Exception as e: + traceback_str = traceback.format_exc() + self.assertIn("in memory_reserved", traceback_str) + self.assertIn("Invalid device argument", traceback_str) def test_reset_max_memory_allocated(self): x = torch.tensor(2).npu() - with self.assertRaisesRegex(RuntimeError, "Invalid device argument"): + traceback_str = "" + try: torch.npu.reset_max_memory_allocated(device="npu:8") + except Exception as e: + traceback_str = traceback.format_exc() + self.assertIn("in reset_max_memory_allocated", traceback_str) + self.assertIn("Invalid device argument", traceback_str) @SupportedDevices(['Ascend910B']) def test_aclrtSetDevice(self): @@ -172,10 +192,6 @@ class TestMode(TestCase): "_npu_setDevice", message ) - self.assertIn( - "Initialize", - message - ) def test_checkpoint_module(self): class Net1(nn.Module): -- Gitee