diff --git a/test/test_profiler/test_pt_profiler.py b/test/test_profiler/test_pt_profiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ec8aa1242bb182fbfe54208fea8cb9699d7ef5f
--- /dev/null
+++ b/test/test_profiler/test_pt_profiler.py
@@ -0,0 +1,210 @@
+# Copyright (c) 2020, Huawei Technologies. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+from torch_npu.testing.common_utils import TestCase, run_tests
+import torch_npu
+
+
+class SmallModel(torch.nn.Module):
+    def __init__(self, in_channel, out_channel):
+        super(SmallModel, self).__init__()
+        self.conv1 = torch.nn.Conv2d(in_channel, in_channel, 1)
+        self.relu1 = torch.nn.ReLU()
+        self.conv2 = torch.nn.Conv2d(in_channel, out_channel, 1)
+
+    def forward(self, input_1):
+        input_1 = self.conv1(input_1)
+        input_1 = self.relu1(input_1)
+        input_1 = self.conv2(input_1)
+        return input_1.reshape(input_1.shape[0], -1)
+
+
+class TestProfiler(TestCase):
+
+    def mm_op(self, device="cpu"):
+        a = torch.rand(5, 5).to(device)
+        b = torch.randn(5, 5).to(device)
+        c = torch.mm(a, b)
+
+    def test_cpu_op_profiler(self):
+        with torch.autograd.profiler.profile(use_npu=False) as prof:
+            self.mm_op()
+        found_mm = False
+
+        for e in prof.function_events:
+            if "mm" in e.name:
+                found_mm = True
+        self.assertTrue(found_mm)
+
+    def test_npu_op_profiler(self):
+        # test basic function for npu op
+        if torch.npu.is_available():
+            device = "npu:0"
+        else:
+            return
+        with torch.autograd.profiler.profile(use_npu=True) as prof:
+            self.mm_op(device)
+        found_mm = False
+
+        for e in prof.function_events:
+            if "mm" in e.name:
+                found_mm = True
+        self.assertTrue(found_mm)
+
+    def test_memory_profiler(self):
+        # test memory usage
+        def run_profiler(create_tensor, metric):
+            # collecting allocs / deallocs
+            with torch.autograd.profiler.profile(profile_memory=True,
+                    record_shapes=False, use_npu=True) as prof:
+                input_x = None
+                with torch.autograd.profiler.record_function("user_allocate"):
+                    input_x = create_tensor()
+                with torch.autograd.profiler.record_function("user_deallocate"):
+                    del input_x
+            return prof.key_averages()
+
+        def check_metrics(stats, metric, allocs=None, deallocs=None):
+            stat_metrics = {}
+            for stat in stats:
+                stat_metrics[stat.key] = getattr(stat, metric)
+            if allocs is not None:
+                for alloc_fn in allocs:
+                    self.assertTrue(alloc_fn in stat_metrics)
+                    self.assertTrue(stat_metrics[alloc_fn] > 0)
+            if deallocs is not None:
+                for dealloc_fn in deallocs:
+                    self.assertTrue(dealloc_fn in stat_metrics)
+                    self.assertTrue(stat_metrics[dealloc_fn] < 0)
+
+        def create_cpu_tensor():
+            return torch.rand(10, 10)
+
+        def create_npu_tensor():
+            return torch.rand(20, 30).npu()
+
+        stats = run_profiler(create_cpu_tensor, "cpu_memory_usage")
+        check_metrics(
+            stats,
+            "cpu_memory_usage",
+            allocs=[
+                "aten::empty",
+                "aten::rand",
+                "user_allocate",
+            ],
+            deallocs=[
+                "user_deallocate",
+            ]
+        )
+
+        if torch.npu.is_available():
+            create_npu_tensor()
+            stats = run_profiler(create_npu_tensor, "npu_memory_usage")
+            check_metrics(
+                stats,
+                "npu_memory_usage",
+                allocs=[
+                    "user_allocate",
+                    "aten::to",
+                    "aten::empty_strided",
+                ],
+                deallocs=[
+                    "user_deallocate",
+                ]
+            )
+            check_metrics(
+                stats,
+                "cpu_memory_usage",
+                allocs=[
+                    "aten::rand",
+                    "aten::empty",
+                ]
+            )
+
+    def test_model_profiler(self):
+        """Checks that model forward and backward passes run under the profiler."""
+        def train():
+            for index in range(steps):
+                x = torch.rand(input_shape).to(device)
+                y = torch.rand(out_shape).reshape(out_shape[0], -1).to(device)
+                y_pred = model(x)
+                loss = criterion(y_pred, y)
+                optimizer.zero_grad()
+                loss.backward()
+                optimizer.step()
+
+        input_shape = (4, 3, 24, 24)
+        out_shape = (4, 12, 24, 24)
+        steps = 5
+        device = "npu:0" if torch.npu.is_available() else "cpu"
+        model = SmallModel(input_shape[1], out_shape[1]).to(device)
+        criterion = torch.nn.MSELoss(reduction='sum')
+        optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
+
+        try:
+            train()
+        except Exception:
+            self.assertTrue(False, "Expected no exception without profiling.")
+
+        def judge(expected_event_count, prof):
+            actual_event_count = {}
+            for e in prof.function_events:
+                if "#" in e.name:
+                    key = e.name
+                    if key in expected_event_count.keys():
+                        actual_event_count[key] = actual_event_count.setdefault(key, 0) + 1
+            for key, count in expected_event_count.items():
+                self.assertTrue((key in actual_event_count.keys()) and (count == actual_event_count[key]))
+
+        with torch.autograd.profiler.profile(use_npu=True) as prof:
+            train()
+        expected_event_count = {
+            "Optimizer.step#SGD.step": steps,
+            "Optimizer.zero_grad#SGD.zero_grad": steps
+        }
+        judge(expected_event_count, prof)
+
+    def test_npu_simple_profiler(self):
+        def train():
+            for index in range(steps):
+                x = torch.rand(input_shape).to(device)
+                y = torch.rand(out_shape).reshape(out_shape[0], -1).to(device)
+                y_pred = model(x)
+                loss = criterion(y_pred, y)
+                optimizer.zero_grad()
+                loss.backward()
+                optimizer.step()
+
+        input_shape = (4, 3, 24, 24)
+        out_shape = (4, 12, 24, 24)
+        steps = 5
+        device = "npu:0" if torch.npu.is_available() else "cpu"
+        model = SmallModel(input_shape[1], out_shape[1]).to(device)
+        criterion = torch.nn.MSELoss(reduction='sum')
+        optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
+
+        try:
+            train()
+        except Exception:
+            self.assertTrue(False, "Expected no exception without profiling.")
+
+        with torch.autograd.profiler.profile(use_npu=True, use_npu_simple=True) as prof:
+            train()
+        prof.export_chrome_trace("./test_trace.prof")
+
+
+if __name__ == '__main__':
+    try:
+        # to init the device
+        torch.rand(2, 3).npu()
+    except Exception:
+        print("there is no npu device")
+        exit()
+    run_tests()
diff --git a/torch_npu/npu/profiler.py b/torch_npu/npu/profiler.py
index 0dbf9de39a5f6f3e3e5375fd786bf94ccfe76fab..b7c1474fa6b68a7e9540f4bce3ab067000598113 100644
--- a/torch_npu/npu/profiler.py
+++ b/torch_npu/npu/profiler.py
@@ -1165,7 +1165,7 @@ def parse_kineto_results(result):
         is_async = kineto_event.start_thread_id() != kineto_event.end_thread_id()
 
         fe = FunctionEvent(
-            id=kineto_event.correlation_id(),
+            id_event=kineto_event.correlation_id(),
             name=rewrite_name(name=kineto_event.name(), with_wildcard=True),
             trace_name=rewrite_name(name=kineto_event.name(), with_wildcard=False),
             thread=kineto_event.start_thread_id(),
@@ -1212,7 +1212,7 @@ def parse_kineto_results(result):
     for mem_record in mem_records:
         if not mem_record[1]:
             fe = FunctionEvent(
-                id=mem_record[0].handle(),
+                id_event=mem_record[0].handle(),
                 name="[memory]",
                 trace_name=None,  # not outputting in the trace
                 thread=mem_record[0].thread_id(),
@@ -1340,7 +1340,7 @@ def parse_legacy_records(thread_records):
                 start_flops = start.flops()
 
                 fe = FunctionEvent(
-                    id=record.handle(),
+                    id_event=record.handle(),
                     node_id=record.node_id(),
                     name=rewrite_name(name=start.name(), with_wildcard=True),
                     trace_name=rewrite_name(name=start.name(), with_wildcard=False),
@@ -1398,7 +1398,7 @@ def parse_legacy_records(thread_records):
             if num_open_handles_cpu == 0:
                 # output event as a top-level memory event
                 fe = FunctionEvent(
-                    id=0,
+                    id_event=0,
                     name="[memory]",
                     trace_name=None,
                     thread=0,
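
Note on the id -> id_event rename in profiler.py: the patch only shows the updated call sites, but the likely motivation (an assumption, not stated in the diff) is that a keyword parameter named `id` shadows Python's builtin id() inside FunctionEvent.__init__. A minimal sketch of the pattern, using a hypothetical stripped-down FunctionEvent rather than the torch_npu class:

# Minimal sketch, assuming the rename exists to avoid shadowing the
# builtin id(); this is NOT the torch_npu FunctionEvent, just the shape
# of the change.
class FunctionEvent:
    def __init__(self, id_event, name, trace_name=None, thread=0):
        self.id = id_event        # keep the familiar attribute name
        self.name = name
        self.trace_name = trace_name
        self.thread = thread

# Call sites then pass the correlation id by keyword, mirroring the patch:
fe = FunctionEvent(
    id_event=42,
    name="aten::mm",
    trace_name="aten::mm",
    thread=1,
)
print(fe.id, fe.name)  # -> 42 aten::mm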