diff --git a/debug/tools/profiling_analyse/README.md b/debug/tools/profiling_analyse/README.md
index 8f809597d69d468f851ed47c0db98869bb73ccb6..75b22562ce608ac2bfcb857f6911923b56e3620f 100644
--- a/debug/tools/profiling_analyse/README.md
+++ b/debug/tools/profiling_analyse/README.md
@@ -5,17 +5,13 @@
 1. 获取json文件中的traceEvents字段，获取所有cat为kernel的，且name中不包含nccl的event，将他们的耗时相加即为所有算子耗时all_time；
 2. 取算子中name包含gemm的为cube算子耗时cube_time.
 3. vector算子耗时即为(all_time - cube_time)
-### 大kernel算子
-待补充大kernel列表
 ### 通信
-此处的通信指通信未掩盖耗时，gpu上暂无明确的计算方法，故取的profiling展示图中的通信流耗时结果
-实际计算是取name中包含'ncclKernel_'的event，将他们的耗时相加
+此处的通信指通信未掩盖耗时，通过统计有通信kernel（name中包含nccl）执行而无计算kernel执行的时间戳个数获得
 ### 计算流e2e耗时
 按照时间也就是'ts'字段排序所有events，可以看到最后的event是Record Window End,故使用最后一个event的ts值减去第一个event的ts值作为计算流e2e耗时
 ### 调度
-由于gpu上目前没有较好的通信不可掩盖耗时算法，所以gpu上的调度耗时计算方法采用：调度耗时 = 计算流E2E耗时 - 计算流任务执行总耗时
-计算流为stream为7的流，实际计算取stream为7的event耗时相加；
-计算流的stream不一定是7，后续会适配通过观察kernel算子分布来判断计算流的方法
+gpu上的调度耗时计算方法采用：调度耗时 = 单步打屏时间 - 算子耗时 - 通信未掩盖耗时。
+单步打屏时间需要用户通过-gs参数输入（单位为秒）；当用户不输入时，采用e2e耗时代替单步打屏时间。
 获得调度耗时后，使用调度占比 = 调度耗时/E2E耗时 获取调度占比
 ### 内存分析
 gpu上的内存使用可以使用nvidia-smi查看，使用json文件分析时需要打开profile_memory=True开关
@@ -27,9 +23,6 @@ gpu上的内存使用可以使用nvidia-smi查看，使用json文件分析时需
 2、当前仅统计算子运行在vector和cube上的耗时。
 3、这2中算子于csv文件中的的TaskType均为AI_CORE，其中aiv_vec_time时间多表明为vector算子，aic_mac_time表明为cube算子。分别累加求和算子耗时进行输出。
 
-### 大kernel算子
-待补充大kernel算子列表
-
 ### 通信
 此处的通信为通信未掩盖耗时，对应为ASCEND_PROFILER_OUTPUT/trace_view.json下的EVENT_WAIT_SQE，对于多个Stream Id的结果，取Stream Id最小值。
 输出结果为该字段时间求和。
@@ -47,7 +40,7 @@ gpu上的内存使用可以使用nvidia-smi查看，使用json文件分析时需
 2、其值在模型训练趋于稳定时逐渐固定，整体偏差不大，因此输出结果为该列数据的最大值。
 
 ## 样例
-- step1:下载数据：https://onebox.huawei.com/v/2ad3400460fac22fa61f21f478edd116
+- step1:获取gpu和npu的profiling数据；若采集时未开启内存采集开关（如gpu侧的profile_memory=True），则结果中没有内存使用数据
 
-- 运行命令:python profiling_parse.py -g prof0704_best\gpu\gpu_trace_device0.json -n prof0704_best\Malluma_443350_20230704144255_ascend_pt
+- 运行命令:`python profiling_parse.py -g gpu\gpu_trace_device0.json -gs 0.9 -n npu\xxx_ascend_pt -ns 1.2`（-gs、-ns为可选参数，分别传入gpu、npu的单步打屏时间）
 - 输出结果：可以得到gpu与npu对照的打屏性能拆解数据
diff --git a/debug/tools/profiling_analyse/gpu_parser.py b/debug/tools/profiling_analyse/gpu_parser.py
index a269c143007056ec3d817576a4fbf1f581b78747..9d5ffbe4b407292412f0960bd2a523e71c8a1318 100644
--- a/debug/tools/profiling_analyse/gpu_parser.py
+++ b/debug/tools/profiling_analyse/gpu_parser.py
@@ -1,11 +1,13 @@
+from collections import defaultdict
 import pandas as pd
 
 import parser_helper
 
 
 class GpuProfilingParser:
-    def __init__(self, gpu_trace_file):
-        self.trace_events = self.read_profiling_json_file(gpu_trace_file)
+    def __init__(self, args):
+        self.trace_events = self.read_profiling_json_file(args.gpu)
+        self.one_step_time = args.gpu_step
         self.profiling_info = parser_helper.ProfilingInfo()
 
     @staticmethod
@@ -18,9 +20,9 @@ class GpuProfilingParser:
     def parse_events(self):
         cube_time = 0.0
         all_op_time = 0.0
-        communication_not_overlapped = 0.0
         op_list = []
         compute_stream_dur = 0.0  # 计算流耗时
+        marks = defaultdict(int) # mark for compute communication_not_overlapped time
 
         for event in self.trace_events:
             if not isinstance(event, dict):
@@ -31,24 +33,33 @@ class GpuProfilingParser:
                 continue
             name = event.get('name')
             dur = event.get('dur')
-            if 'nccl' in name:
-                if 'ncclKernel_' in name:
-                    communication_not_overlapped += float(dur)
-                continue
+            ts = event.get('ts')
             cat = event.get('cat')
             if cat.lower() != 'kernel':
                 continue
+            if 'nccl' in name:
+                for timestep in range(ts + 1, ts + dur + 1):
+                    marks[str(timestep)] += 1 # mark this timestep in communication stream
+                continue
+            else:
+                for timestep in range(ts + 1, ts + dur + 1):
+                    marks[str(timestep)] += -100   # mark this timestep in compute stream
             if 'gemm' in name:
                 cube_time += float(dur)
             all_op_time += float(dur)
-            op_list.append([event.get('ts'), name, cat, dur])
+            op_list.append([ts, name, cat, dur])
         op_dataframe = pd.DataFrame(op_list, columns=['time start', 'name', 'cat', 'dur'])
         op_dataframe.to_csv('gpu_perf.csv', index=False)
-        self.profiling_info.communication_not_overlapped = communication_not_overlapped / 10 ** 6
+        self.profiling_info.communication_not_overlapped = len([_ for _,value in marks.items() if value > 0]) / 10 ** 6
         self.profiling_info.cube_time = cube_time / 10 ** 6
         self.profiling_info.vector_time = (all_op_time - cube_time) / 10 ** 6
         self.parse_e2e_time()
-        self.profiling_info.scheduling_time = self.profiling_info.e2e_time - compute_stream_dur / 10 ** 6
+        if self.one_step_time:
+            self.profiling_info.scheduling_time = self.one_step_time - all_op_time / 10 ** 6 - \
+                                                  self.profiling_info.communication_not_overlapped
+        else:
+            self.profiling_info.scheduling_time = self.profiling_info.e2e_time - all_op_time / 10 ** 6 - \
+                                                  self.profiling_info.communication_not_overlapped
         self.profiling_info.scheduling_ratio = self.profiling_info.scheduling_time / self.profiling_info.e2e_time
         self.parse_memory_reserved()
 
diff --git a/debug/tools/profiling_analyse/parser_helper.py b/debug/tools/profiling_analyse/parser_helper.py
index 0b9a0e048b18b9416343863cf0afda471b1f328c..1477db6bcee6a3edf98a8fb8f7494f1f98556bb9 100644
--- a/debug/tools/profiling_analyse/parser_helper.py
+++ b/debug/tools/profiling_analyse/parser_helper.py
@@ -13,11 +13,6 @@ class ProfilingInfo:
         self.e2e_time = 0.0
         self.scheduling_time = 0.0
 
-    def __setattr__(self, key, value):
-        if value < 0:
-            raise ValueError(f"The {key} value shouldn't be less than 0.")
-        super().__setattr__(key, value)
-
 
 def read_json_file(path):
     if not os.path.isfile(path):
diff --git a/debug/tools/profiling_analyse/profiling_parse.py b/debug/tools/profiling_analyse/profiling_parse.py
index 9c260bfd67f1225c6f2ddf0a76f06f8c1b05aee8..d95553eb67a68ad667229910ea797403a0dbfc87 100644
--- a/debug/tools/profiling_analyse/profiling_parse.py
+++ b/debug/tools/profiling_analyse/profiling_parse.py
@@ -11,6 +11,7 @@ from parser_helper import ProfilingInfo
 def parse_command():
     parser = argparse.ArgumentParser()
     parser.add_argument('-g', '--gpu', required=False, default='', metavar='(FILE)', help='Gpu profiling json file.')
+    parser.add_argument('-gs', '--gpu_step', required=False, default=0, type=float, help='Gpu one step time(s)')
     parser.add_argument('-n', '--npu', required=False, default='', metavar='(FILE)',
                         help='Npu single core profiling root path.')
     parser.add_argument('-ns', '--npu_step', required=False, default='', metavar='(FILE)', type=float, 
@@ -36,7 +37,9 @@ def show_table(gpu_profiling_info, npu_profiling_info):
 
 def parse_gpu(args):
     if args.gpu:
-        gpu_parser = GpuProfilingParser(args.gpu)
+        if args.gpu_step < 0:
+            raise ValueError("Gpu one step time shouldn't less than 0.")
+        gpu_parser = GpuProfilingParser(args)
         gpu_parser.parse_events()
         return gpu_parser.profiling_info
     print('Gpu trace json file is not specified.')