diff --git a/test/profiler/test_dynamic_profiler.py b/test/profiler/test_dynamic_profiler.py
index f034ef5f93344956ebbd8e951cda5f149be61d5c..97236876f792bbafeb417088b00063027bb6d6c2 100644
--- a/test/profiler/test_dynamic_profiler.py
+++ b/test/profiler/test_dynamic_profiler.py
@@ -450,7 +450,7 @@ class TestDynamicProfiler(TestCase):
             PathManager.remove_path_safety(self.cfg_prof_dir)
         self.assertTrue(has_prof)
 
-    def test_dynamic_profiler_default(self):
+    def test_dynamic_profiler_default_start(self):
         cfg_json = copy.deepcopy(self.json_sample)
         cfg_json['prof_dir'] = self.default_prof_dir
         cfg_json['start_step'] = TestDynamicProfiler.start_step + 1
@@ -470,6 +470,24 @@ class TestDynamicProfiler(TestCase):
             PathManager.remove_path_safety(self.default_prof_dir)
         self.assertTrue(has_prof)
 
+    def test_dynamic_profiler_default_start_next_step(self):
+        cfg_json = copy.deepcopy(self.json_sample)
+        cfg_json['prof_dir'] = self.default_prof_dir
+        cfg_json['start_step'] = -1
+        with os.fdopen(os.open(self.cfg_path, self.flags, self.mode), 'w') as f:
+            time.sleep(1)
+            json.dump(cfg_json, f, indent=4)
+        time.sleep(3)
+        dp.step()
+        TestDynamicProfiler.start_step += 1
+        self.model_train.train_one_step()
+        dp.step()
+        TestDynamicProfiler.start_step += 1
+        has_prof = self.has_prof_dir(self.default_prof_dir)
+        if os.path.exists(self.default_prof_dir):
+            PathManager.remove_path_safety(self.default_prof_dir)
+        self.assertTrue(has_prof)
+
     def test_dynamic_profiler_rank(self):
         cfg_json = copy.deepcopy(self.json_sample)
         cfg_json['prof_dir'] = self.rank_prof_dir
diff --git a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py
index 0c5ab8c8263141451f9099c689f943f96b36cd53..8fbe112a79a047800e39040b6c75c7a957cfbb74 100644
--- a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py
+++ b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py
@@ -7,6 +7,7 @@ from ._dynamic_profiler_utils import DynamicProfilerUtils
 class ConfigContext:
     DEFAULT_ACTIVE_NUM = 1
     DEFAULT_START_STEP = 0
+    INSTANT_START_STEP = -1
     DEFAULT_WARMUP = 0
     DEADLINE_PROF_DIR = "./"
     BOOL_MAP = {'true': True, 'false': False}
@@ -65,7 +66,7 @@
         except ValueError:
             self._start_step = self.DEFAULT_START_STEP
 
-        if not isinstance(self._start_step, int) or self._start_step < 0:
+        if not isinstance(self._start_step, int) or self._start_step < self.INSTANT_START_STEP:
             DynamicProfilerUtils.out_log("Start step is not valid, will be reset to {}.".format(
                 self.DEFAULT_START_STEP), DynamicProfilerUtils.LoggerLevelEnum.INFO)
             self._start_step = self.DEFAULT_START_STEP
@@ -348,6 +349,9 @@
     def start_step(self) -> int:
         return self._start_step
 
+    def start(self) -> bool:
+        return self._start_step == self.INSTANT_START_STEP
+
     def experimental_config(self) -> _ExperimentalConfig:
         return self.experimental_config
 
diff --git a/torch_npu/profiler/dynamic_profile.py b/torch_npu/profiler/dynamic_profile.py
index e262fa2fa07e04a5ea4d694ced342e6945aa5411..645ebfc9cf5d85005efca79513493d2407e79492 100644
--- a/torch_npu/profiler/dynamic_profile.py
+++ b/torch_npu/profiler/dynamic_profile.py
@@ -92,10 +92,10 @@ class _DynamicProfile:
             DynamicProfilerUtils.out_log("Stop Dynamic Profiler at {} step.".format(
                 self.cur_step), DynamicProfilerUtils.LoggerLevelEnum.INFO)
         elif self.prof is None and self.cfg_ctx is not None:
-            if self.cur_step > self.cfg_ctx.start_step():
+            if self.cur_step > self.cfg_ctx.start_step() and not self.cfg_ctx.start():
                 print_warn_msg(f"Dynamic Profiler config is not effective. The start_step={self.cfg_ctx.start_step()}, "
                                f"current_step={self.cur_step}")
-            if self.cur_step == self.cfg_ctx.start_step():
+            if self.cur_step == self.cfg_ctx.start_step() or self.cfg_ctx.start():
                 self.step_num = self.cfg_ctx.active() + self.cfg_ctx.warmup()
                 self.enable_prof()
                 self.cfg_ctx = None
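
For reference, a minimal usage sketch of the new instant-start mode, assuming the documented `dynamic_profile` entry points (`dp.init(<config dir>)` and `dp.step()`) and a watched `profiler_config.json` file; the paths, file name, and loop below are illustrative, not part of this change:

```python
import json
import os

from torch_npu.profiler import dynamic_profile as dp

# Illustrative paths; any writable locations work.
cfg_dir = "./dyn_prof_cfg"
os.makedirs(cfg_dir, exist_ok=True)

# The dynamic profiler watches this directory for config changes.
dp.init(cfg_dir)

# start_step = -1 (the new INSTANT_START_STEP) means: begin profiling at the
# next dp.step() call instead of waiting for a fixed step number.
cfg = {"prof_dir": "./prof_out", "start_step": -1}
with open(os.path.join(cfg_dir, "profiler_config.json"), "w") as f:
    json.dump(cfg, f, indent=4)

for step in range(5):
    # ... run one training step here ...
    dp.step()  # profiling turns on at the first step after the config loads
```

This mirrors the new test: `_DynamicProfile.step()` now also enables profiling when `cfg_ctx.start()` is true, and the `cur_step > start_step` warning is suppressed for the `-1` case.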