From ae9a3d9cd135259f73f0e2e3edcdee4c917b4cb0 Mon Sep 17 00:00:00 2001 From: wuyulong11 Date: Thu, 27 Apr 2023 16:39:20 +0800 Subject: [PATCH] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E?= =?UTF-8?q?=E3=80=91=20=E5=85=BC=E5=AE=B9GPU=E6=95=B0=E6=8D=AE=E4=BF=AE?= =?UTF-8?q?=E6=94=B9=20=E3=80=90=E4=BF=AE=E6=94=B9=E4=BA=BA=E3=80=91=20wuy?= =?UTF-8?q?ulong=2030031080?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../tb_plugin/fe/src/api/generated/api.ts | 3 +- .../tb_plugin/fe/src/components/Kernel.tsx | 57 ++++++--- .../fe/src/components/TooltipDescriptions.ts | 4 +- .../profiling/tb_plugin/test/test_diffrun.py | 2 +- .../tb_plugin/torch_tb_profiler/consts.py | 2 +- .../tb_plugin/torch_tb_profiler/plugin.py | 29 +++-- .../torch_tb_profiler/profiler/data.py | 27 ++-- .../torch_tb_profiler/profiler/loader.py | 63 +++++++--- .../profiler/run_generator.py | 118 ++++++++++++++++-- .../tb_plugin/torch_tb_profiler/run.py | 3 +- .../tb_plugin/torch_tb_profiler/utils.py | 4 +- 11 files changed, 242 insertions(+), 70 deletions(-) diff --git a/tb_plugins/profiling/tb_plugin/fe/src/api/generated/api.ts b/tb_plugins/profiling/tb_plugin/fe/src/api/generated/api.ts index b33fe1b6a84..37a5eb3127d 100644 --- a/tb_plugins/profiling/tb_plugin/fe/src/api/generated/api.ts +++ b/tb_plugins/profiling/tb_plugin/fe/src/api/generated/api.ts @@ -465,7 +465,8 @@ export interface KernelGraph { * @type {Graph} * @memberof KernelGraph */ - total: Graph + total: Graph, + device_target: string } /** * diff --git a/tb_plugins/profiling/tb_plugin/fe/src/components/Kernel.tsx b/tb_plugins/profiling/tb_plugin/fe/src/components/Kernel.tsx index ea2fa63fe49..296f8aebec8 100644 --- a/tb_plugins/profiling/tb_plugin/fe/src/components/Kernel.tsx +++ b/tb_plugins/profiling/tb_plugin/fe/src/components/Kernel.tsx @@ -29,7 +29,8 @@ import { DataLoading } from './DataLoading' import { makeChartHeaderRenderer, useTooltipCommonStyles } from './helpers' import { GPUKernelTotalTimeTooltip, - TensorCoresPieChartTooltip + TensorCoresPieChartTooltip, + TensorCoresPieChartTooltipAscend } from './TooltipDescriptions' export interface IProps { @@ -74,11 +75,12 @@ export const Kernel: React.FC = (props) => { const [kernelTable, setKernelTable] = React.useState( undefined ) - const [groupBy, setGroupBy] = React.useState(KernelGroupBy.KernelNameAndOpName) + const [groupBy, setGroupBy] = React.useState(KernelGroupBy.Kernel) const [searchKernelName, setSearchKernelName] = React.useState('') const [searchOpName, setSearchOpName] = React.useState('') const [sortColumn, setSortColumn] = React.useState('') - const [hasStep, setHasStep] = React.useState(false); + const [hasStep, setHasStep] = React.useState(false) + const [deviceTarget, setDeviceTarget] = React.useState('GPU') const [topText, actualTop, useTop, setTopText, setUseTop] = useTopN({ defaultUseTop: UseTop.Use, @@ -111,6 +113,8 @@ export const Kernel: React.FC = (props) => { .kernelGet(run, worker, span, KernelGroupBy.Kernel) .then((resp) => { setKernelGraph(resp.total) + setDeviceTarget(resp.device_target) + setGroupBy(resp.device_target === 'Ascend' ? KernelGroupBy.KernelNameAndOpName : KernelGroupBy.Kernel) }) }, [run, worker, span]) @@ -123,7 +127,7 @@ export const Kernel: React.FC = (props) => { const [searchedKernelTable] = useSearch(searchKernelName, 'name', kernelTable) const [searchedOpTable] = useSearch( searchOpName, - 'step id', + deviceTarget === 'Ascend' ? 
'step id' : 'operator', searchedKernelTable ) @@ -157,12 +161,17 @@ export const Kernel: React.FC = (props) => { ) const TensorCoresTitle = React.useMemo( - () => + () => deviceTarget === 'Ascend' ? chartHeaderRenderer( 'AI Cores Utilization', + TensorCoresPieChartTooltipAscend + ) + : + chartHeaderRenderer( + 'Tensor Cores Utilization', TensorCoresPieChartTooltip ), - [chartHeaderRenderer] + [chartHeaderRenderer, deviceTarget] ) return ( @@ -240,10 +249,10 @@ export const Kernel: React.FC = (props) => { onChange={onGroupByChanged} > - Statistic + {deviceTarget === 'Ascend' ? 'Statistic' : 'Kernel Properties + Op Name'} - All + {deviceTarget === 'Ascend' ? 'All' : 'Kernel Name'} @@ -258,16 +267,28 @@ export const Kernel: React.FC = (props) => { label="Search by Name" /> - {groupBy === KernelGroupBy.Kernel && hasStep && - - - + {deviceTarget === 'Ascend' ? + (groupBy === KernelGroupBy.Kernel && hasStep && + + + ) + : + (groupBy === KernelGroupBy.KernelNameAndOpName && + + + ) } diff --git a/tb_plugins/profiling/tb_plugin/fe/src/components/TooltipDescriptions.ts b/tb_plugins/profiling/tb_plugin/fe/src/components/TooltipDescriptions.ts index f3fe25ff1a2..596e918eb82 100644 --- a/tb_plugins/profiling/tb_plugin/fe/src/components/TooltipDescriptions.ts +++ b/tb_plugins/profiling/tb_plugin/fe/src/components/TooltipDescriptions.ts @@ -21,7 +21,9 @@ export const HostTotalTimeTooltip = `The accumulated time spent on Host, includi export const GPUKernelTotalTimeTooltip = `The accumulated time of all calls of this kernel.` -export const TensorCoresPieChartTooltip = `The accumulated time of all kernels using or not using AI Cores.` +export const TensorCoresPieChartTooltip = `The accumulated time of all kernels using or not using Tensor Cores.` + +export const TensorCoresPieChartTooltipAscend = `The accumulated time of all kernels using or not using AI Cores.` export const DistributedGpuInfoTableTooltip = `Information about GPU hardware used during the run.` diff --git a/tb_plugins/profiling/tb_plugin/test/test_diffrun.py b/tb_plugins/profiling/tb_plugin/test/test_diffrun.py index f1bebbfbfa4..06fbf398a7d 100644 --- a/tb_plugins/profiling/tb_plugin/test/test_diffrun.py +++ b/tb_plugins/profiling/tb_plugin/test/test_diffrun.py @@ -9,7 +9,7 @@ from torch_tb_profiler.utils import timing def load_profile(worker, span, path): - return RunProfileData.parse(worker, span, path, '.') + return RunProfileData.parse_gpu(worker, span, path, '.') class TestDiffRun(unittest.TestCase): diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/consts.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/consts.py index 0a54ae5687c..486e5f05bf5 100644 --- a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/consts.py +++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/consts.py @@ -12,7 +12,7 @@ WORKER_PATTERN = re.compile(r"""^(.*?) 
# worker name (?:\.gz)?$""", re.X) # optional .gz extension TRACE_PATTERN = re.compile(r"""^trace_view\.json(\.gz)?$""") -WORKER_SPAN_PATTERN = re.compile(r"""([^\\]*)_(\d+)$""") +WORKER_SPAN_PATTERN = re.compile(r"""([^\\]*)_(\d+)_ascend_pt$""") NODE_PROCESS_PATTERN = re.compile(r"""^(.*)_(\d+)""") MONITOR_RUN_REFRESH_INTERNAL_IN_SECONDS = 10 diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/plugin.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/plugin.py index c5dcfb535fc..7fa352261de 100644 --- a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/plugin.py +++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/plugin.py @@ -438,9 +438,10 @@ class TorchProfilerPlugin(base_plugin.TBPlugin): # Assume no deletion on run directories, trigger async load if find a new run for run_dir in run_dirs: has_dir = True - if run_dir not in touched: - touched.add(run_dir) - logger.info('Find run directory %s', run_dir) + run_device = f'{run_dir["name"]}_{run_dir["device_target"]}' + if run_device not in touched: + touched.add(run_device) + logger.info('Find run directory %s', run_dir['name']) # Use threading to avoid UI stall and reduce data parsing time t = threading.Thread(target=self._load_run, args=(run_dir,)) t.start() @@ -472,7 +473,14 @@ class TorchProfilerPlugin(base_plugin.TBPlugin): def _get_run_dirs(self): """Scan logdir, find PyTorch Profiler run directories. - A directory is considered to be a run if it satisfies the following two conditions: + A directory is considered to be a gpu run if it contains 1 or more *.pt.trace.json[.gz]. + E.g. there are 2 runs: run1, run2 + /run1 + /[worker1].pt.trace.json.gz + /[worker2].pt.trace.json.gz + /run2 + /[worker1].pt.trace.json + A directory is considered to be an ascend run if it satisfies the following two conditions: 1.At least one subdirectory with the name in this format: {worker_span}. 2.The subdirectory in condition 1 has a 'ASCEND_PROFILER_OUTPUT' subdirectory which contains a 'trace_view.json(.gz)' file or a 'kernel_details.csv' file. 
@@ -490,21 +498,26 @@ class TorchProfilerPlugin(base_plugin.TBPlugin): /ASCEND_PROFILER_OUTPUT /kernel_details.csv """ - for root, subdirs, _ in io.walk(self.logdir): + for root, subdirs, files in io.walk(self.logdir): for subdir in subdirs: if str(subdir) == 'ASCEND_PROFILER_OUTPUT': match = consts.WORKER_SPAN_PATTERN.match(io.basename(root)) if match is not None: run_name = io.abspath(io.join(root, '..')) - yield run_name + yield {'name': run_name, 'device_target': 'Ascend'} break + for file in files: + if utils.is_gpu_chrome_trace_file(file): + yield {'name': root, 'device_target': 'GPU'} + break + def _load_run(self, run_dir): try: - name = self._get_run_name(run_dir) + name = self._get_run_name(run_dir['name']) logger.info('Load run %s', name) # Currently, assume run data is immutable, so just load once - loader = RunLoader(name, run_dir, self._cache) + loader = RunLoader(name, run_dir['name'], self._cache, run_dir['device_target']) run = loader.load() logger.info('Run %s loaded', name) self._queue.put(run) diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/data.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/data.py index 01864509c2a..4637527cda3 100644 --- a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/data.py +++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/data.py @@ -67,8 +67,8 @@ class RunProfileData(object): self.use_ddp: bool = False self.comm_lib = None self.has_runtime: bool = False - self.has_kernel: bool = False - self.has_trace: bool = False + self.has_kernel: bool = True + self.has_trace: bool = True self.has_communication: bool = False self.has_memcpy_or_memset: bool = False self.role_ranges = None @@ -101,15 +101,24 @@ class RunProfileData(object): self.recommendations = [] @staticmethod - def parse(worker, span, path, cache_dir): + def parse_gpu(worker, span, path, cache_dir): + trace_path, trace_json = RunProfileData._preprocess_file(path, cache_dir, 'GPU') + + profile = RunProfileData.from_json(worker, span, trace_json) + profile.trace_file_path = trace_path + return profile + + @staticmethod + def parse_npu(worker, span, path, cache_dir): trace_json = {} trace_path = path has_trace = False + has_kernel = False for file in io.listdir(path): - if utils.is_trace_path(file): + if utils.is_npu_trace_path(file): has_trace = True trace_file = io.join(path, file) - trace_path, trace_json = RunProfileData._preprocess_file(trace_file, cache_dir) + trace_path, trace_json = RunProfileData._preprocess_file(trace_file, cache_dir, 'Ascend') break profile = RunProfileData.from_json(worker, span, trace_json) @@ -118,8 +127,9 @@ class RunProfileData(object): for file in io.listdir(path): if str(file) == 'kernel_details.csv': - profile.has_kernel = True + has_kernel = True profile.kernel_file_path = io.join(path, file) + profile.has_kernel = has_kernel return profile @staticmethod @@ -131,7 +141,7 @@ class RunProfileData(object): return profile @staticmethod - def _preprocess_file(trace_path, cache_dir): + def _preprocess_file(trace_path, cache_dir, device_target): if not io.exists(trace_path): raise FileNotFoundError(trace_path) @@ -157,7 +167,8 @@ class RunProfileData(object): json_reencode = True # work-around to remove the 'Record Window End' events to avoid the huge end timestamp - trace_json = {'traceEvents': trace_json} + if device_target == 'Ascend': + trace_json = {'traceEvents': trace_json} event_list = trace_json['traceEvents'] end_index = None start_index = None diff --git 
a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/loader.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/loader.py index ac1501aa535..5a42ddaecc8 100644 --- a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/loader.py +++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/loader.py @@ -19,27 +19,45 @@ logger = utils.get_logger() class RunLoader(object): - def __init__(self, name, run_dir, caches: io.Cache): + def __init__(self, name, run_dir, caches: io.Cache, device_target="GPU"): self.run_name = name self.run_dir = run_dir self.caches = caches self.queue = Queue() + self.device_target = device_target def load(self): workers = [] spans_by_workers = defaultdict(list) - for path in io.listdir(self.run_dir): - if io.isdir(io.join(self.run_dir, path)) and utils.is_worker_span_dir(path): - data_path = io.join(self.run_dir, path, 'ASCEND_PROFILER_OUTPUT') - for file in io.listdir(data_path): - if utils.is_trace_path(file) or str(file) == 'kernel_details.csv': - match = consts.WORKER_SPAN_PATTERN.match(path) - worker = match.group(1) - span = match.group(2) - if span is not None: - bisect.insort(spans_by_workers[worker], span) - workers.append((worker, span, io.join(path, 'ASCEND_PROFILER_OUTPUT'))) - break + if self.device_target == 'Ascend': + for path in io.listdir(self.run_dir): + if io.isdir(io.join(self.run_dir, path)) and utils.is_worker_span_dir(path): + data_path = io.join(self.run_dir, path, 'ASCEND_PROFILER_OUTPUT') + for file in io.listdir(data_path): + if utils.is_npu_trace_path(file) or str(file) == 'kernel_details.csv': + match = consts.WORKER_SPAN_PATTERN.match(path) + worker = match.group(1) + span = match.group(2) + if span is not None: + bisect.insort(spans_by_workers[worker], span) + workers.append((worker, span, io.join(path, 'ASCEND_PROFILER_OUTPUT'))) + break + else: + for path in io.listdir(self.run_dir): + if io.isdir(io.join(self.run_dir, path)): + continue + match = consts.WORKER_PATTERN.match(path) + if not match: + continue + + worker = match.group(1) + span = match.group(2) + if span is not None: + # remove the starting dot (.) 
+ span = span[1:] + bisect.insort(spans_by_workers[worker], span) + + workers.append((worker, span, path)) span_index_map = {} for worker, span_array in spans_by_workers.items(): @@ -53,8 +71,8 @@ class RunLoader(object): p.start() logger.info('started all processing') - distributed_run = Run(self.run_name, self.run_dir) - run = Run(self.run_name, self.run_dir) + distributed_run = Run(self.run_name, self.run_dir, self.device_target) + run = Run(self.run_name, self.run_dir, self.device_target) num_items = len(workers) while num_items > 0: item: Tuple[RunProfile, DistributedRunProfileData] = self.queue.get() @@ -82,11 +100,14 @@ class RunLoader(object): try: logger.debug('Parse trace, run_dir=%s, data_dir=%s', self.run_dir, path) local_file = self.caches.get_remote_cache(io.join(self.run_dir, path)) - data = RunProfileData.parse(worker, span, local_file, self.caches.cache_dir) + if self.device_target == 'Ascend': + data = RunProfileData.parse_npu(worker, span, local_file, self.caches.cache_dir) + else: + data = RunProfileData.parse_gpu(worker, span, local_file, self.caches.cache_dir) if data.trace_file_path != local_file: self.caches.add_file(local_file, data.trace_file_path) - generator = RunGenerator(worker, span, data) + generator = RunGenerator(worker, span, data, self.device_target) profile = generator.generate_run_profile() dist_data = DistributedRunProfileData(data) @@ -96,8 +117,12 @@ class RunLoader(object): logger.warning('tb_plugin receive keyboard interrupt signal, process %d will exit' % (os.getpid())) sys.exit(1) except Exception as ex: - logger.warning('Failed to parse profile data for Run %s on %s_%s. Exception=%s', - self.run_name, worker, span_name, ex, exc_info=True) + if self.device_target == 'Ascend': + worker_name = f'{worker}_{span_name}' + else: + worker_name = worker + logger.warning('Failed to parse profile data for Run %s on %s. 
Exception=%s', + self.run_name, worker_name, ex, exc_info=True) self.queue.put((None, None)) logger.debug('finishing process data') diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/run_generator.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/run_generator.py index b4c7680214b..f8d54d678ec 100644 --- a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/run_generator.py +++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/run_generator.py @@ -16,12 +16,13 @@ logger = utils.get_logger() class RunGenerator(object): - def __init__(self, worker, span, profile_data: RunProfileData): + def __init__(self, worker, span, profile_data: RunProfileData, device_target="GPU"): self.worker = worker self.span = span self.profile_data = profile_data self.statistic_data = {} self.accelerator_data = {} + self.device_target = device_target def generate_run_profile(self): profile_run = RunProfile(self.worker, self.span) @@ -45,10 +46,16 @@ class RunGenerator(object): if self.profile_data.has_kernel: profile_run.views.append(consts.KERNEL_VIEW) - profile_run.kernel_table = self._generate_kernel_table() - profile_run.kernel_op_table = self._generate_kernel_op_table() - profile_run.kernel_pie = self._generate_kernel_pie() - profile_run.tc_pie = self._generate_tc_pie() + if self.device_target == 'Ascend': + profile_run.kernel_table = self._generate_kernel_table_npu() + profile_run.kernel_op_table = self._generate_kernel_op_table_npu() + profile_run.kernel_pie = self._generate_kernel_pie_npu() + profile_run.tc_pie = self._generate_tc_pie_npu() + else: + profile_run.kernel_table = self._generate_kernel_table_gpu() + profile_run.kernel_op_table = self._generate_kernel_op_table_gpu() + profile_run.kernel_pie = self._generate_kernel_pie_gpu() + profile_run.tc_pie = self._generate_tc_pie_gpu() if self.profile_data.has_trace: profile_run.views.append(consts.TRACE_VIEW) @@ -316,7 +323,98 @@ class RunGenerator(object): result[k] = self._generate_op_table(v, group_by_input_shape, True) return result - def _generate_kernel_op_table(self): + def _generate_kernel_op_table_gpu(self): + table = {} + result = { + 'metadata': { + 'sort': 'Total Duration (us)' + }, + 'data': table + } + table['columns'] = [{'type': 'string', 'name': 'Name'}, + {'type': 'string', 'name': 'Operator'}, + {'type': 'string', 'name': 'Grid'}, + {'type': 'string', 'name': 'Block'}, + {'type': 'number', 'name': 'Register Per Thread'}, + {'type': 'number', 'name': 'Shared Memory'}, + {'type': 'string', 'name': 'Kernel Uses Tensor Cores', + 'tooltip': consts.TOOLTIP_KERNEL_USES_TC}, + {'type': 'string', 'name': 'Op is Tensor Cores eligible', + 'tooltip': consts.TOOLTIP_KERNEL_OP_TC_ELIGIBLE}] + col_names = ['Calls', 'Total Duration (us)', 'Mean Duration (us)', 'Max Duration (us)', 'Min Duration (us)'] + for column in col_names: + table['columns'].append({'type': 'number', 'name': column}) + gpu_metrics_columns = self.profile_data.gpu_metrics_parser.get_gpu_metrics_columns() + table['columns'].extend(gpu_metrics_columns) + + table['rows'] = [] + kernel_list: List[KernelAggByNameOp] = sorted( + self.profile_data.kernel_list_groupby_name_op, key=lambda x: x.total_duration, reverse=True) + for agg_by_name_op in kernel_list: + kernel_op_row = [agg_by_name_op.name, agg_by_name_op.op_name, + str(agg_by_name_op.grid), str(agg_by_name_op.block), + str(agg_by_name_op.regs_per_thread or '0'), str(agg_by_name_op.shared_memory or '0'), + 'Yes' if agg_by_name_op.tc_used else 'No', + 'Yes' if 
agg_by_name_op.op_tc_eligible else 'No', + agg_by_name_op.calls, + agg_by_name_op.total_duration, round(agg_by_name_op.avg_duration), + agg_by_name_op.max_duration, agg_by_name_op.min_duration] + if self.profile_data.gpu_metrics_parser.has_blocks_per_sm: + kernel_op_row.append(round(agg_by_name_op.avg_blocks_per_sm, 2)) + if self.profile_data.gpu_metrics_parser.has_occupancy: + kernel_op_row.append(round(agg_by_name_op.avg_occupancy, 2)) + table['rows'].append(kernel_op_row) + return result + + def _generate_kernel_pie_gpu(self): + pie = {'columns': [{'type': 'string', 'name': 'name'}, {'type': 'number', 'name': 'value'}], 'rows': []} + for _id, (name, row) in enumerate(self.profile_data.kernel_stat.iterrows()): + pie['rows'].append([name, row['sum']]) + data = {'total': pie, 'device_target': self.device_target} + return data + + def _generate_kernel_table_gpu(self): + table = {} + result = { + 'metadata': { + 'sort': 'Total Duration (us)' + }, + 'data': table + } + table['columns'] = [{'type': 'string', 'name': 'Name'}, + {'type': 'string', 'name': 'Tensor Cores Used', + 'tooltip': consts.TOOLTIP_KERNEL_USES_TC}] + columns = ['count', 'sum', 'mean', 'max', 'min'] + round_digits = [0, 0, 0, 0, 0] + if self.profile_data.gpu_metrics_parser.has_blocks_per_sm: + columns.append('blocks_per_sm') + round_digits.append(2) + if self.profile_data.gpu_metrics_parser.has_occupancy: + columns.append('occupancy') + round_digits.append(2) + col_names = ['Calls', 'Total Duration (us)', 'Mean Duration (us)', 'Max Duration (us)', 'Min Duration (us)'] + for column in col_names: + table['columns'].append({'type': 'number', 'name': column}) + gpu_metrics_columns = self.profile_data.gpu_metrics_parser.get_gpu_metrics_columns() + table['columns'].extend(gpu_metrics_columns) + + table['rows'] = [] + for _id, (name, row) in enumerate(self.profile_data.kernel_stat.iterrows()): + kernel_row = [name, 'Yes' if row['tc_used'] else 'No'] + for i, column in enumerate(columns): + kernel_row.append(round(row[column]) if round_digits[i] == 0 + else round(row[column], round_digits[i])) + table['rows'].append(kernel_row) + return result + + def _generate_tc_pie_gpu(self): + pie = {'columns': [{'type': 'string', 'name': 'name'}, {'type': 'number', 'name': 'value'}], 'rows': []} + pie['rows'].append(['Using Tensor Cores', self.profile_data.tc_used_ratio]) + pie['rows'].append(['Not Using Tensor Cores', 1.0 - self.profile_data.tc_used_ratio]) + data = {'total': pie} + return data + + def _generate_kernel_op_table_npu(self): table = {} result = { 'metadata': { @@ -338,7 +436,7 @@ class RunGenerator(object): table['rows'].append(temp) return result - def _generate_kernel_pie(self): + def _generate_kernel_pie_npu(self): pie = {'columns': [{'type': 'string', 'name': 'name'}, {'type': 'number', 'name': 'value'}], 'rows': []} with open(self.profile_data.kernel_file_path, encoding="utf-8") as f: reader = csv.DictReader(f) @@ -347,10 +445,10 @@ class RunGenerator(object): data.append(row.get('Name')) data.append(float(row.get('Duration(us)'))) pie['rows'].append(data) - datas = {'total': pie} + datas = {'total': pie, 'device_target': self.device_target} return datas - def _generate_kernel_table(self): + def _generate_kernel_table_npu(self): display_columns = ('Step ID', 'Name', 'Type', 'Accelerator Core', 'Start Time', 'Duration(us)', 'Wait Time(us)', 'Block Dim', 'Input Shapes', 'Input Data Types', 'Input Formats', 'Output Shapes', 'Output Data Types', 'Output Formats') @@ -388,7 +486,7 @@ class RunGenerator(object): datas.append(row) 
return datas - def _generate_tc_pie(self): + def _generate_tc_pie_npu(self): pie = {'columns': [{'type': 'string', 'name': 'name'}, {'type': 'number', 'name': 'value'}], 'rows': []} for key, val in self.accelerator_data.items(): pie['rows'].append(['Using ' + key.replace('_', ' '), val]) diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/run.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/run.py index b86ea679874..3cc7ef7fd56 100644 --- a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/run.py +++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/run.py @@ -19,11 +19,12 @@ class Run(object): May contain profiling results from multiple workers. E.g. distributed scenario. """ - def __init__(self, name, run_dir): + def __init__(self, name, run_dir, device_target='GPU'): self.name = name self.run_dir = run_dir self.profiles: Dict[Tuple[str, str], RunProfile] = {} self.span_view = {} + self.device_target = device_target @property def workers(self): diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/utils.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/utils.py index 392e8bd61d6..7bd6e42bfa4 100644 --- a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/utils.py +++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/utils.py @@ -29,7 +29,7 @@ def get_logger(): return logger -def is_chrome_trace_file(path): +def is_gpu_chrome_trace_file(path): return consts.WORKER_PATTERN.match(path) @@ -37,7 +37,7 @@ def is_worker_span_dir(path): return consts.WORKER_SPAN_PATTERN.match(path) -def is_trace_path(path): +def is_npu_trace_path(path): return consts.TRACE_PATTERN.match(path) -- Gitee
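
Editorial note (appended after the patch, not part of it): the run-directory classification that `_get_run_dirs` describes in the docstring above can be illustrated with a small standalone sketch. This is only an approximation for readers: it uses `os.walk`/`os.listdir` instead of the plugin's `io` abstraction, and the GPU filename check below only tests the `*.pt.trace.json[.gz]` suffix rather than the full `consts.WORKER_PATTERN`. The Ascend pattern is copied from the patched `consts.WORKER_SPAN_PATTERN`.

    import os
    import re

    # Ascend worker-span directory name, mirroring consts.WORKER_SPAN_PATTERN in this patch.
    WORKER_SPAN_PATTERN = re.compile(r'([^\\]*)_(\d+)_ascend_pt$')
    # Simplified GPU chrome-trace check: only the *.pt.trace.json[.gz] suffix is tested here.
    GPU_TRACE_SUFFIX = re.compile(r'\.pt\.trace\.json(\.gz)?$')


    def classify_run_dirs(logdir):
        """Yield {'name': ..., 'device_target': 'GPU' | 'Ascend'} for every run found."""
        for root, subdirs, files in os.walk(logdir):
            # Ascend run: a '{worker}_{span}_ascend_pt' directory whose ASCEND_PROFILER_OUTPUT
            # subdirectory holds trace_view.json(.gz) or kernel_details.csv; the run name is
            # the parent directory, as in the plugin's io.join(root, '..').
            if 'ASCEND_PROFILER_OUTPUT' in subdirs and WORKER_SPAN_PATTERN.match(os.path.basename(root)):
                output_files = os.listdir(os.path.join(root, 'ASCEND_PROFILER_OUTPUT'))
                if any(f == 'kernel_details.csv' or f.startswith('trace_view.json') for f in output_files):
                    yield {'name': os.path.dirname(root), 'device_target': 'Ascend'}
            # GPU run: the directory itself contains one or more *.pt.trace.json[.gz] files.
            if any(GPU_TRACE_SUFFIX.search(f) for f in files):
                yield {'name': root, 'device_target': 'GPU'}


    if __name__ == '__main__':
        # './logdir' is a placeholder path; a run with several Ascend worker dirs may be
        # yielded more than once here, just as in the plugin, where the caller deduplicates
        # via the 'touched' set keyed on name + device_target.
        for run in classify_run_dirs('./logdir'):
            print(run['name'], run['device_target'])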
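
A second illustrative note: the kernel-view payload built by the `_generate_kernel_pie_gpu`/`_generate_kernel_pie_npu` methods (and the `KernelGraph` interface in `api.ts` above) now carries `device_target` next to `total`. The hedged sketch below shows how a consumer such as `Kernel.tsx` derives its defaults from that field; the `group_by` strings mirror the `KernelGroupBy` enum member names used in the diff, not necessarily the exact wire values.

    def pick_kernel_view_defaults(kernel_graph):
        """kernel_graph: the dict returned by _generate_kernel_pie_*, e.g.
        {'total': {...}, 'device_target': 'GPU'}."""
        if kernel_graph.get('device_target') == 'Ascend':
            # Ascend: default grouping KernelNameAndOpName (shown as 'Statistic' / 'All'),
            # pie chart titled 'AI Cores Utilization' with the Ascend-specific tooltip.
            return {'group_by': 'KernelNameAndOpName', 'chart_title': 'AI Cores Utilization'}
        # GPU: default grouping Kernel, pie chart titled 'Tensor Cores Utilization'.
        return {'group_by': 'Kernel', 'chart_title': 'Tensor Cores Utilization'}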