From 90d7796bce754c3a321afe90a7eeb74b9763163a Mon Sep 17 00:00:00 2001
From: Codersheepchen
Date: Tue, 22 Jul 2025 21:53:33 -0400
Subject: [PATCH 1/9] add kernel func test and perftest

---
 .../fused_embedding_action_id_gather_test.py | 69 +++++++++
 .../fused_embedding_gather_test.py | 88 ++++++++++++
 .../fused_embedding_padding_test.py | 113 +++++++++++++++
 ...ed_embedding_sparse_dynamic_stitch_test.py | 97 +++++++++++++
 .../fused_embedding_sparse_reshape_test.py | 102 +++++++++++++
 ...ed_embedding_sparse_segment_reduce_test.py | 134 ++++++++++++++++++
 .../fused_embedding_sparse_select.py | 92 ++++++++++++
 .../fused_embedding_action_id_gather_test.py | 67 +++++++++
 .../fused_embedding_gather_test.py | 93 ++++++++++++
 .../fused_embedding_padding_test.py | 111 +++++++++++++++
 .../fused_embedding_select.py | 93 ++++++++++++
 ...ed_embedding_sparse_dynamic_stitch_test.py | 96 +++++++++++++
 .../fused_embedding_sparse_reshape_test.py | 100 +++++++++++++
 ...ed_embedding_sparse_segment_reduce_test.py | 132 +++++++++++++++++
 14 files changed, 1387 insertions(+)
 create mode 100644 tensorflow/python/grappler/embedding_fused_functest/fused_embedding_action_id_gather_test.py
 create mode 100644 tensorflow/python/grappler/embedding_fused_functest/fused_embedding_gather_test.py
 create mode 100644 tensorflow/python/grappler/embedding_fused_functest/fused_embedding_padding_test.py
 create mode 100644 tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_dynamic_stitch_test.py
 create mode 100644 tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_reshape_test.py
 create mode 100644 tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_segment_reduce_test.py
 create mode 100644 tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_select.py
 create mode 100644 tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_action_id_gather_test.py
 create mode 100644 tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_gather_test.py
 create mode 100644 tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_padding_test.py
 create mode 100644 tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_select.py
 create mode 100644 tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_dynamic_stitch_test.py
 create mode 100644 tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_reshape_test.py
 create mode 100644 tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_segment_reduce_test.py

diff --git a/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_action_id_gather_test.py b/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_action_id_gather_test.py
new file mode 100644
index 00000000..94db9d42
--- /dev/null
+++ b/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_action_id_gather_test.py
@@ -0,0 +1,69 @@
+import tensorflow as tf
+import numpy as np
+import unittest
+
+from tensorflow.python.ops import gen_embedding_fused_ops
+
+class TestFusedEmbeddingActionIdGather(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        """Initialize test data and custom op"""
+        # Load custom op
+        cls.custom_op = gen_embedding_fused_ops
+
+        # Base test data
+        np.random.seed(140)
+        indices1_shape = (8, 10)
+        indices2_shape = (5, 6)
+        params_shape = (80, 300)
+        cls.input0 = np.random.randint(0, params_shape[0], size=indices1_shape, dtype=np.int32)
+        cls.input1 =
np.random.random(params_shape).astype(np.float32) + cls.input2 = np.random.randint(0, indices1_shape[0], size=indices2_shape, dtype=np.int32) + cls.input3 = params_shape[0] + # Create tf session + cls.sess = tf.compat.v1.Session() + + @classmethod + def tearDownClass(cls): + cls.sess.close() + + def test_kp_fused_embedding_action_id_gather(self): + # execute custom op + custom_out = self.custom_op.KPFusedEmbeddingActionIdGather( + input0=tf.constant(self.input0, dtype=tf.int32), + input1=tf.constant(self.input1, dtype=tf.float32), + input2=tf.constant(self.input2, dtype=tf.int32), + input3=tf.constant(self.input3, dtype=tf.int32), + ) + + # tf native implementation + tf_out = self._tf_reference_impl( + input0=tf.constant(self.input0, dtype=tf.int32), + input1=tf.constant(self.input1, dtype=tf.float32), + input2=tf.constant(self.input2, dtype=tf.int32), + input3=tf.constant(self.input3, dtype=tf.int32), + ) + + custom_out_val = self.sess.run([custom_out]) + tf_out_val = self.sess.run([tf_out]) + + # Numerical comparison + np.testing.assert_array_equal( + custom_out_val, + tf_out_val, + err_msg="result mismatch" + ) + + def _tf_reference_impl(self, input0, input1, input2, input3): + gather1 = tf.gather(input1, input0, axis=0) + gather2 = tf.gather(gather1, input2, axis=0) + pack1 = tf.stack([input3, 1680], axis=0) + pack2 = tf.stack([input3, -1], axis=0) + reshape = tf.reshape(gather2, pack2) + fill = tf.fill(pack1, tf.constant(0, dtype=tf.float32)) + output = tf.concat([reshape, fill], axis=-1) + return output + +if __name__ == "__main__": + tf.compat.v1.disable_eager_execution() + unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_gather_test.py b/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_gather_test.py new file mode 100644 index 00000000..f47b70d2 --- /dev/null +++ b/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_gather_test.py @@ -0,0 +1,88 @@ +import tensorflow as tf +import numpy as np +import unittest + +from tensorflow.python.ops import gen_embedding_fused_ops + +class TestFusedGather(unittest.TestCase): + @classmethod + def setUpClass(cls): + """Initialize test data and custom op""" + # Load custom op + cls.custom_op = gen_embedding_fused_ops + + # Base test data + cls.base_data = np.linspace(0, 11, num=240, endpoint=False, dtype=np.float32).reshape(20, 12) + cls.base_slice_input = np.array([[0, 0], [0, 1], [1, 2]], dtype=np.int64) + cls.base_begin = [0, 1] + cls.base_end = [0, 2] + cls.base_strides = [1, 1] + # Create tf session + cls.sess = tf.compat.v1.Session() + + @classmethod + def tearDownClass(cls): + cls.sess.close() + + def test_custom(self): + # execute custom op + custom_out1, custom_out2, custom_out3= self.custom_op.KPFusedGather( + data=self.base_data, + slice_input=self.base_slice_input, + begin=self.base_begin, + ) + + # tf native implementation + tf_out1, tf_out2, tf_out3 = self._tf_reference_impl( + self.base_data, + self.base_slice_input, + self.base_begin, + ) + + custom_out_val1, custom_out_val2, custom_out_val3 = self.sess.run([custom_out1, custom_out2, custom_out3]) + tf_out_val1, tf_out_val2, tf_out_val3 = self.sess.run([tf_out1, tf_out2, tf_out3]) + + np.testing.assert_array_equal( + custom_out_val1, + tf_out_val1, + err_msg="Segment count mismatch" + ) + + np.testing.assert_array_equal( + custom_out_val2, + tf_out_val2, + err_msg="Segment count mismatch" + ) + + np.testing.assert_allclose( + custom_out_val3, + 
tf_out_val3, + rtol=1e-6, + err_msg="Output values mismatch" + ) + + def _tf_reference_impl(self, data, slice_input, begin): + slice_out = tf.strided_slice( + slice_input, + begin = begin, + end = [tf.shape(slice_input)[0], begin[1] + 2], + strides = [1, 1], + begin_mask = 1, + end_mask = 1, + shrink_axis_mask = 2 + ) + + slice_out, slice_out_indices = tf.unique(slice_out) + output_shape = tf.shape(slice_out) + slice_out = tf.reshape(slice_out, [-1]) + slice_out, _ = tf.unique(slice_out) + + gather1_result = tf.gather(data, slice_out) + gather1_result = tf.reshape(gather1_result, [-1, 12]) + + gather2_result = tf.gather(gather1_result, slice_out) + return output_shape, slice_out_indices, gather2_result + +if __name__ == "__main__": + tf.compat.v1.disable_eager_execution() + unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_padding_test.py b/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_padding_test.py new file mode 100644 index 00000000..78c01e17 --- /dev/null +++ b/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_padding_test.py @@ -0,0 +1,113 @@ +import tensorflow as tf +import numpy as np +import unittest + +from tensorflow.python.ops import gen_embedding_fused_ops + +class TestFusedEmbeddingPadding(unittest.TestCase): + @classmethod + def setUpClass(cls): + """Initialize test data and custom op""" + # Load custom op + cls.custom_op = gen_embedding_fused_ops + + # Base test data + np.random.seed(140) + cls.input0 = np.random.randint(0, 100, size=(2 * 3, 10), dtype=np.int64) + cls.input1 = np.random.rand(2 * 2, 10).astype(np.float) + cls.input2 = cls.input1.shape + cls.input3 = np.array([-1, 20]).astype(np.int32) + # Create tf session + cls.sess = tf.compat.v1.Session() + + @classmethod + def tearDownClass(cls): + cls.sess.close() + + def test_kp_fused_embedding_padding_fast(self): + # execute custom op + _, custom_out = self.custom_op.KPFusedEmbeddingPaddingFast( + input0=self.input0.shape, + input1=self.input1, + input2=self.input2[0], + input3=self.input3, + ) + + # tf native implementation + tf_out = self._fused_embedding_padding_fast_reference_impl( + tf.constant(self.input0.shape, dtype=tf.int64), + tf.constant(self.input1, dtype=tf.float32), + tf.constant(self.input2[0], dtype=tf.int32), + tf.constant(self.input3, dtype=tf.int32), + ) + + custom_out_val = self.sess.run([custom_out]) + tf_out_val = self.sess.run([tf_out]) + + # Numerical comparison + np.testing.assert_array_equal( + custom_out_val, + tf_out_val, + err_msg="result mismatch" + ) + + def test_kp_fused_embedding_padding(self): + # execute custom op + _, custom_out = self.custom_op.KPFusedEmbeddingPadding( + input0=self.input0.shape, + input1=self.input1, + input2=self.input2[0], + input3=self.input3, + ) + + # tf native implementation + tf_out = self._fused_embedding_padding_reference_impl( + tf.constant(self.input0.shape, dtype=tf.int64), + tf.constant(self.input1, dtype=tf.float32), + tf.constant(self.input2[0], dtype=tf.int32), + tf.constant(self.input3, dtype=tf.int32), + ) + + custom_out_val = self.sess.run([custom_out]) + tf_out_val = self.sess.run([tf_out]) + + # Numerical comparison + np.testing.assert_array_equal( + custom_out_val, + tf_out_val, + err_msg="result mismatch" + ) + + def _fused_embedding_padding_fast_reference_impl(self, input0, input1, input2, input3): + cast = tf.cast(input0, tf.int32) + begin = tf.constant([0], dtype=tf.int32) + end = tf.constant([1], 
dtype=tf.int32) + strides = tf.constant([1], dtype=tf.int32) + hash_rows = tf.strided_slice(cast, begin=begin, end=end, strides=strides, shrink_axis_mask=1) + sub_out = hash_rows - input2 + const = tf.constant(10, dtype=tf.int32) + pack = tf.stack([sub_out, const], axis=0) + fill = tf.fill(pack, tf.constant(0, dtype=tf.float32)) + concat = tf.concat([input1, fill], 0) + reshape = tf.reshape(concat, input3) + shape_tensor = tf.shape(reshape) + output = tf.strided_slice(shape_tensor, begin=begin, end=end, strides=strides, shrink_axis_mask=1) + return output + + def _fused_embedding_padding_reference_impl(self, input0, input1, input2, input3): + cast = tf.cast(input0, tf.int32) + begin = tf.constant([0], dtype=tf.int32) + end = tf.constant([1], dtype=tf.int32) + strides = tf.constant([1], dtype=tf.int32) + hash_rows = tf.strided_slice(cast, begin=begin, end=end, strides=strides, shrink_axis_mask=1) + sub_out = hash_rows - input2 + const = tf.constant(10, dtype=tf.int32) + pack = tf.stack([sub_out, const], axis=0) + fill = tf.fill(pack, tf.constant(0, dtype=tf.float32)) + concat = tf.concat([input1, fill], 0) + output = tf.reshape(concat, input3) + return output + +if __name__ == "__main__": + tf.compat.v1.disable_eager_execution() + unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_dynamic_stitch_test.py b/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_dynamic_stitch_test.py new file mode 100644 index 00000000..4de55241 --- /dev/null +++ b/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_dynamic_stitch_test.py @@ -0,0 +1,97 @@ +import os +import tensorflow as tf +import numpy as np +import unittest + +from tensorflow.python.ops import gen_embedding_fused_ops + +class TestSparseSegmentMeanSlice(unittest.TestCase): + @classmethod + def setUpClass(cls): + """Initialize test data and custom op""" + # Load custom op + cls.custom_op = gen_embedding_fused_ops + + cls.variables = [] + max_val = float('inf') + for i in range(12): + N_i = np.random.randint(1000000, 44739244) + max_val = min(N_i, max_val) + var = tf.Variable( + tf.random.normal([N_i, 10], dtype=tf.float32), # shape: (N_i, 10) + name=f"embedding_table_{i}" + ) + cls.variables.append(var) + print(f"Created variable {i}: shape={var.shape}") + + x_np = np.random.randint(0, 12*max_val, size=(10000, 12)) + cls.x = tf.constant(x_np, dtype=tf.int64) + + # Create tf session + cls.sess = tf.compat.v1.Session() + cls.sess.run(tf.compat.v1.global_variables_initializer()) + + @classmethod + def tearDownClass(cls): + cls.sess.close() + + def test_base(self): + x_first = self.sess.run(self.x) + var_first = self.sess.run(self.variables[0]) + + x_second = self.sess.run(self.x) + var_second = self.sess.run(self.variables[0]) + np.testing.assert_allclose( + x_first, + x_second, + rtol=1e-6, + err_msg="Input values mismatch" + ) + + np.testing.assert_allclose( + var_first, + var_second, + rtol=1e-6, + err_msg="Input values mismatch" + ) + + # execute custom op + custom_out = self.custom_op.KPFusedSparseDynamicStitch(x=self.x, variables=self.variables) + + # tf native implementation + tf_out = self._tf_reference_impl(x=self.x, variables=self.variables) + + custom_out_val = self.sess.run([custom_out]) + tf_out_val = self.sess.run([tf_out]) + print("custom_shape: ") + print(custom_out_val[0].shape) + print("tf_out shape: ") + print(tf_out_val[0].shape) + # Numerical comparison + 
np.testing.assert_allclose( + custom_out_val[0], + tf_out_val[0], + rtol=1e-6, + err_msg="Output values mismatch" + ) + + def _tf_reference_impl(self, x, variables): + x_1 = tf.reshape(x, shape=[-1]) + group_ids = tf.math.floormod(x_1, 12) + group_ids = tf.cast(group_ids, dtype=np.int32) + chunk_indices = tf.math.floordiv(x_1, 12) + + original_indices = tf.range(0,tf.size(x_1),1) + + a = tf.dynamic_partition(original_indices, group_ids, num_partitions=12) + b = tf.dynamic_partition(chunk_indices, group_ids, num_partitions=12) + + c = [tf.gather(variables[i], b[i]) for i in range(12)] + + d = tf.dynamic_stitch(a, c) + + return d + +if __name__ == "__main__": + tf.compat.v1.disable_eager_execution() + unittest.main(argv=[''], verbosity=1) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_reshape_test.py b/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_reshape_test.py new file mode 100644 index 00000000..37d27531 --- /dev/null +++ b/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_reshape_test.py @@ -0,0 +1,102 @@ +import tensorflow as tf +import numpy as np +import unittest + +from tensorflow.python.ops import gen_embedding_fused_ops + +class TestFusedSparseReshape(unittest.TestCase): + @classmethod + def setUpClass(cls): + """Initialize test data and custom op""" + # Load custom op + cls.custom_op = gen_embedding_fused_ops + + # Base test data + cls.base_slice_input = np.array([[0, 0], [0, 1], [1, 2], [3, 4]], dtype=np.int64) + cls.base_begin = [0, 1] + cls.base_end = [0, 2] + cls.base_strides = [1, 1] + cls.base_newshape = [2, 4] + # Create tf session + cls.sess = tf.compat.v1.Session() + + @classmethod + def tearDownClass(cls): + cls.sess.close() + + def test_custom(self): + # execute custom op + custom_out1, custom_out2, = self.custom_op.KPFusedSparseReshape( + slice_input=self.base_slice_input, + begin=self.base_begin, + new_shape=self.base_newshape + ) + + # tf native implementation + tf_out1, tf_out2, tf_out3 = self._tf_reference_impl( + self.base_slice_input, + self.base_begin, + self.base_newshape + ) + + custom_out_val1, custom_out_val2 = self.sess.run([custom_out1, custom_out2]) + tf_out_val1, tf_out_val2, tf_out_val3 = self.sess.run([tf_out1, tf_out2, tf_out3]) + + print("custom_out_val1: ", custom_out_val1) + print("custom_out_val2: ", custom_out_val2) + print("tf_out_val1: ", tf_out_val1) + print("tf_out_val2: ", tf_out_val2) + + np.testing.assert_array_equal( + custom_out_val1, + tf_out_val1, + err_msg="Segment count mismatch" + ) + + np.testing.assert_array_equal( + custom_out_val2, + tf_out_val2, + err_msg="Segment count mismatch" + ) + + def _tf_reference_impl(self, slice_input, begin, new_shape): + slice67_out = tf.strided_slice( + slice_input, + begin=begin, + end=[0, 2], + strides=[1, 1], + begin_mask=1, + end_mask=1, + shrink_axis_mask=2 + ) + + slice67_out = tf.reshape(slice67_out, [-1, 1]) + shape_out = tf.shape(slice67_out) + slice57_out = tf.strided_slice( + shape_out, + begin=[0], + end=[1], + strides=[1], + shrink_axis_mask=1 + ) + + const2 = tf.constant(2) + input_shape = tf.stack([slice57_out, const2]) + input_shape = tf.cast(input_shape, tf.int64) + + range_out = tf.range(0, slice57_out, 1) + range_out = tf.reshape(range_out, [-1, 1]) + range_out_64 = tf.cast(range_out, dtype=tf.int64) + concat_out = tf.concat([range_out_64, slice67_out], axis=-1) + + sparse_tensor = tf.SparseTensor( + indices=concat_out, + values=[1,2,3,4], + 
dense_shape=input_shape + ) + sparse_tensor_out = tf.sparse.reshape(sparse_tensor, new_shape) + return sparse_tensor_out.indices, sparse_tensor_out.dense_shape, concat_out + +if __name__ == "__main__": + tf.compat.v1.disable_eager_execution() + unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_segment_reduce_test.py b/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_segment_reduce_test.py new file mode 100644 index 00000000..69c7a114 --- /dev/null +++ b/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_segment_reduce_test.py @@ -0,0 +1,134 @@ +import tensorflow as tf +import numpy as np +import unittest + +from tensorflow.python.ops import gen_embedding_fused_ops + +class TestSparseSegmentMeanSlice(unittest.TestCase): + @classmethod + def setUpClass(cls): + """Initialize test data and custom op""" + # Load custom op + cls.custom_op = gen_embedding_fused_ops + + # Base test data + cls.base_data = np.array([[1.0, 2.0, 3.0], [3.0, 4.0,5.0], [5.0, 6.0, 7.0], [5.0, 6.0, 7.0]], dtype=np.float32) # shape {4, 3} + cls.base_indices = np.array([0, 1, 2], dtype=np.int64) # shape {3} + cls.base_slice_input = np.array([[0, 0], [0, 2], [1, 2]], dtype=np.int64) # shape {3, 2} + cls.base_begin = [0, 1] + cls.base_end = [0, 2] + cls.base_strides = [1, 2] + # Create tf session + cls.sess = tf.compat.v1.Session() + + @classmethod + def tearDownClass(cls): + cls.sess.close() + + def test_mean(self): + # execute custom op + custom_out, custom_slice_out = self.custom_op.KPFusedSparseSegmentReduce( + data=self.base_data, + indices=self.base_indices, + slice_input=self.base_slice_input, + begin=self.base_begin, + end = self.base_end, + strides = self.base_strides + ) + + # tf native implementation + tf_out, tf_slice_out = self._tf_reference_impl( + self.base_data, + self.base_indices, + self.base_slice_input, + self.base_begin, + self.base_end, + self.base_strides, + True + ) + + custom_out_val, custom_slice_out_val = self.sess.run([custom_out, custom_slice_out]) + tf_out_val, tf_slice_out_val = self.sess.run([tf_out, tf_slice_out]) + + # Numerical comparison + np.testing.assert_allclose( + custom_out_val, + tf_out_val, + rtol=1e-6, + err_msg="Output values mismatch" + ) + np.testing.assert_array_equal( + custom_slice_out_val, + tf_slice_out_val, + err_msg="Segment count mismatch" + ) + + def test_sum(self): + custom_out, custom_slice_out = self.custom_op.KPFusedSparseSegmentReduce( + data=self.base_data, + indices=self.base_indices, + slice_input=self.base_slice_input, + begin=self.base_begin, + end = self.base_end, + strides = self.base_strides, + combiner=0 + ) + + tf_out, tf_slice_out = self._tf_reference_impl( + self.base_data, + self.base_indices, + self.base_slice_input, + self.base_begin, + self.base_end, + self.base_strides, + False + ) + + custom_out_val, custom_slice_out_val = self.sess.run([custom_out, custom_slice_out]) + tf_out_val, tf_slice_out_val = self.sess.run([tf_out, tf_slice_out]) + + np.testing.assert_allclose( + custom_out_val, + tf_out_val, + rtol=1e-6, + err_msg="Output values mismatch" + ) + np.testing.assert_array_equal( + custom_slice_out_val, + tf_slice_out_val, + err_msg="Segment count mismatch" + ) + + def _tf_reference_impl(self, data, indices, slice_input, begin, end, strides, is_mean): + slice_out = tf.strided_slice( + slice_input, + begin= begin, + end= end, + strides= strides, + begin_mask=1, + end_mask=1, + 
shrink_axis_mask=2 + ) + + segment_ids = tf.cast(slice_out, dtype=tf.int32) + if is_mean: + output = tf.sparse.segment_mean( + data = data, + indices = indices, + segment_ids= segment_ids + ) + else: + output = tf.sparse.segment_sum( + data = data, + indices = indices, + segment_ids= segment_ids + ) + + output_shape = tf.shape(output) + slice_out = tf.strided_slice(output_shape, begin=[0], end=[1], strides=[1]) + + return output, slice_out + +if __name__ == "__main__": + tf.compat.v1.disable_eager_execution() + unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_select.py b/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_select.py new file mode 100644 index 00000000..6740811a --- /dev/null +++ b/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_select.py @@ -0,0 +1,92 @@ +import tensorflow as tf +import numpy as np +import unittest + +from tensorflow.python.ops import gen_embedding_fused_ops + +class TestKPFusedSparseSelect(unittest.TestCase): + @classmethod + def setUpClass(cls): + """Initialize test data and custom op""" + # Load custom op + cls.custom_op = gen_embedding_fused_ops + + # Base test data + cls.input_a = np.random.randint(-10, 10, size=(40, 50)).astype(np.int32) + cls.input_b = np.random.randint(0, 20000, size=(20, 100)).astype(np.int32) + cls.input_c = np.random.randint(0, 10, size=(50, 2, 20)).astype(np.int32) + # Create tf session + cls.sess = tf.compat.v1.Session() + + @classmethod + def tearDownClass(cls): + cls.sess.close() + + def test_custom(self): + # execute custom op + output_x, output_y, output_z = self.custom_op.KPFusedSparseSelect( + input_a=tf.constant(self.input_a, dtype=tf.int32), + input_b=tf.constant(self.input_b, dtype=tf.int32), + input_c=tf.constant(self.input_c, dtype=tf.int32), + ) + + # tf native implementation + ref_output_x, ref_output_y, ref_output_z = self._tf_reference_impl( + input_a=tf.constant(self.input_a, dtype=tf.int32), + input_b=tf.constant(self.input_b, dtype=tf.int32), + input_c=tf.constant(self.input_c, dtype=tf.int32), + ) + + op_x_val, op_y_val, op_z_val = self.sess.run([output_x, output_y, output_z]) + ref_x_val, ref_y_val, ref_z_val = self.sess.run([ref_output_x, ref_output_y, ref_output_z]) + + np.testing.assert_allclose( + op_x_val, + ref_x_val, + rtol=1e-5, + err_msg="Output values mismatch" + ) + + np.testing.assert_allclose( + op_y_val, + ref_y_val, + rtol=1e-5, + err_msg="Output values mismatch" + ) + + np.testing.assert_allclose( + op_z_val, + ref_z_val, + rtol=1e-5, + err_msg="Output values mismatch" + ) + + def _tf_reference_impl(self, input_a, input_b, input_c): + a = tf.reshape(input_a, [-1, 1]) + b = tf.reshape(input_b, [-1, 1]) + c = tf.reshape(input_c, [-1, 1]) + + greater_a = tf.greater(a, 0) + shape_reshape_a1 = tf.shape(a) + shape_reshape_a2 = tf.shape(a) + fill_a1 = tf.fill(shape_reshape_a1, tf.constant(1, dtype=tf.float32)) + realdiv = tf.realdiv(fill_a1, tf.constant(1, dtype=tf.float32)) + fill_a2 = tf.fill(shape_reshape_a2, tf.constant(0, dtype=tf.float32)) + cast_a = tf.cast(greater_a, tf.float32) + shape_a = tf.shape(cast_a) + fill_a = tf.fill(shape_a, tf.constant(1, dtype=tf.float32)) + equal_4563 = tf.equal(b, 4563) + equal_10831 = tf.equal(b, 10831) + equal_3 = tf.equal(c, 3) + select_1 = tf.where(equal_4563, fill_a, cast_a) + select_2 = tf.where(equal_10831, fill_a, select_1) + sub = tf.subtract(tf.constant(1, dtype=tf.float32), select_2) + mul = 
tf.multiply(tf.constant(1, dtype=tf.float32), select_2) + select_3 = tf.where(equal_3, realdiv, fill_a1) + concat = tf.concat([mul, select_3], axis=-1) + return fill_a2, sub, concat + + +if __name__ == "__main__": + tf.compat.v1.disable_eager_execution() + unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_action_id_gather_test.py b/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_action_id_gather_test.py new file mode 100644 index 00000000..c766085c --- /dev/null +++ b/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_action_id_gather_test.py @@ -0,0 +1,67 @@ +import tensorflow as tf +import numpy as np +import unittest + +class TestFusedEmbeddingActionIdGather(unittest.TestCase): + @classmethod + def setUpClass(cls): + """Initialize test data and custom op""" + # Load custom op + cls.custom_op = tf.load_op_library('../kernels/fused_embedding_action_id_gather.so') + + # Base test data + np.random.seed(140) + indices1_shape = (8, 10) + indices2_shape = (5, 6) + params_shape = (80, 300) + cls.input0 = np.random.randint(0, params_shape[0], size=indices1_shape, dtype=np.int32) + cls.input1 = np.random.random(params_shape).astype(np.float32) + cls.input2 = np.random.randint(0, indices1_shape[0], size=indices2_shape, dtype=np.int32) + cls.input3 = params_shape[0] + # Create tf session + cls.sess = tf.compat.v1.Session() + + @classmethod + def tearDownClass(cls): + cls.sess.close() + + def test_kp_fused_embedding_action_id_gather(self): + # execute custom op + custom_out = self.custom_op.kp_fused_embedding_action_id_gather( + input0=tf.constant(self.input0, dtype=tf.int32), + input1=tf.constant(self.input1, dtype=tf.float32), + input2=tf.constant(self.input2, dtype=tf.int32), + input3=tf.constant(self.input3, dtype=tf.int32), + ) + + # tf native implementation + tf_out = self._tf_reference_impl( + input0=tf.constant(self.input0, dtype=tf.int32), + input1=tf.constant(self.input1, dtype=tf.float32), + input2=tf.constant(self.input2, dtype=tf.int32), + input3=tf.constant(self.input3, dtype=tf.int32), + ) + + custom_out_val = self.sess.run([custom_out]) + tf_out_val = self.sess.run([tf_out]) + + # Numerical comparison + np.testing.assert_array_equal( + custom_out_val, + tf_out_val, + err_msg="result mismatch" + ) + + def _tf_reference_impl(self, input0, input1, input2, input3): + gather1 = tf.gather(input1, input0, axis=0) + gather2 = tf.gather(gather1, input2, axis=0) + pack1 = tf.stack([input3, 1680], axis=0) + pack2 = tf.stack([input3, -1], axis=0) + reshape = tf.reshape(gather2, pack2) + fill = tf.fill(pack1, tf.constant(0, dtype=tf.float32)) + output = tf.concat([reshape, fill], axis=-1) + return output + +if __name__ == "__main__": + tf.compat.v1.disable_eager_execution() + unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_gather_test.py b/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_gather_test.py new file mode 100644 index 00000000..981c3e3b --- /dev/null +++ b/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_gather_test.py @@ -0,0 +1,93 @@ +import tensorflow as tf +import numpy as np +import unittest + +class TestFusedGather(unittest.TestCase): + @classmethod + def setUpClass(cls): + """Initialize test data and custom op""" + # Load custom op + cls.custom_op = tf.load_op_library('kpfusedgather.so') + + # Base test data + 
cls.base_data = np.linspace(0, 11, num=240, endpoint=False, dtype=np.float32).reshape(20, 12) + cls.base_slice_input = np.array([[0, 0], [0, 1], [1, 2]], dtype=np.int64) + cls.base_begin = [0, 1] + cls.base_end = [0, 2] + cls.base_strides = [1, 1] + # Create tf session + cls.sess = tf.compat.v1.Session() + + @classmethod + def tearDownClass(cls): + cls.sess.close() + + def test_custom(self): + # execute custom op + custom_out1, custom_out2, custom_out3= self.custom_op.KPFusedGather( + data=self.base_data, + slice_input=self.base_slice_input, + begin=self.base_begin, + ) + + # tf native implementation + tf_out1, tf_out2, tf_out3 = self._tf_reference_impl( + self.base_data, + self.base_slice_input, + self.base_begin, + ) + + custom_out_val1, custom_out_val2, custom_out_val3 = self.sess.run([custom_out1, custom_out2, custom_out3]) + tf_out_val1, tf_out_val2, tf_out_val3 = self.sess.run([tf_out1, tf_out2, tf_out3]) + + print("custom_out_val1: ", custom_out_val1) + print("custom_out_val2: ", custom_out_val2) + print("custom_out_val3: ", custom_out_val3) + print("tf_out_val1: ", tf_out_val1) + print("tf_out_val2: ", tf_out_val2) + print("tf_out_val3: ", tf_out_val3) + + np.testing.assert_array_equal( + custom_out_val1, + tf_out_val1, + err_msg="Segment count mismatch" + ) + + np.testing.assert_array_equal( + custom_out_val2, + tf_out_val2, + err_msg="Segment count mismatch" + ) + + np.testing.assert_allclose( + custom_out_val3, + tf_out_val3, + rtol=1e-6, + err_msg="Output values mismatch" + ) + + def _tf_reference_impl(self, data, slice_input, begin): + slice_out = tf.strided_slice( + slice_input, + begin = begin, + end = [tf.shape(slice_input)[0], begin[1] + 2], + strides = [1, 1], + begin_mask = 1, + end_mask = 1, + shrink_axis_mask = 2 + ) + + slice_out, slice_out_indices = tf.unique(slice_out) + output_shape = tf.shape(slice_out) + slice_out = tf.reshape(slice_out, [-1]) + slice_out, _ = tf.unique(slice_out) + + gather1_result = tf.gather(data, slice_out) + gather1_result = tf.reshape(gather1_result, [-1, 12]) + + gather2_result = tf.gather(gather1_result, slice_out) + return output_shape, slice_out_indices, gather2_result + +if __name__ == "__main__": + tf.compat.v1.disable_eager_execution() + unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_padding_test.py b/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_padding_test.py new file mode 100644 index 00000000..75dce007 --- /dev/null +++ b/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_padding_test.py @@ -0,0 +1,111 @@ +import tensorflow as tf +import numpy as np +import unittest + +class TestFusedEmbeddingPadding(unittest.TestCase): + @classmethod + def setUpClass(cls): + """Initialize test data and custom op""" + # Load custom op + cls.custom_op = tf.load_op_library('../kernels/fused_embedding_padding.so') + + # Base test data + np.random.seed(140) + cls.input0 = np.random.randint(0, 100, size=(2 * 3, 10), dtype=np.int64) + cls.input1 = np.random.rand(2 * 2, 10).astype(np.float) + cls.input2 = cls.input1.shape + cls.input3 = np.array([-1, 20]).astype(np.int32) + # Create tf session + cls.sess = tf.compat.v1.Session() + + @classmethod + def tearDownClass(cls): + cls.sess.close() + + def test_kp_fused_embedding_padding_fast(self): + # execute custom op + _, custom_out = self.custom_op.kp_fused_embedding_padding_fast( + input0=self.input0.shape, + input1=self.input1, + input2=self.input2[0], + 
input3=self.input3, + ) + + # tf native implementation + tf_out = self._fused_embedding_padding_fast_reference_impl( + tf.constant(self.input0.shape, dtype=tf.int64), + tf.constant(self.input1, dtype=tf.float32), + tf.constant(self.input2[0], dtype=tf.int32), + tf.constant(self.input3, dtype=tf.int32), + ) + + custom_out_val = self.sess.run([custom_out]) + tf_out_val = self.sess.run([tf_out]) + + # Numerical comparison + np.testing.assert_array_equal( + custom_out_val, + tf_out_val, + err_msg="result mismatch" + ) + + def test_kp_fused_embedding_padding(self): + # execute custom op + _, custom_out = self.custom_op.kp_fused_embedding_padding( + input0=self.input0.shape, + input1=self.input1, + input2=self.input2[0], + input3=self.input3, + ) + + # tf native implementation + tf_out = self._fused_embedding_padding_reference_impl( + tf.constant(self.input0.shape, dtype=tf.int64), + tf.constant(self.input1, dtype=tf.float32), + tf.constant(self.input2[0], dtype=tf.int32), + tf.constant(self.input3, dtype=tf.int32), + ) + + custom_out_val = self.sess.run([custom_out]) + tf_out_val = self.sess.run([tf_out]) + + # Numerical comparison + np.testing.assert_array_equal( + custom_out_val, + tf_out_val, + err_msg="result mismatch" + ) + + def _fused_embedding_padding_fast_reference_impl(self, input0, input1, input2, input3): + cast = tf.cast(input0, tf.int32) + begin = tf.constant([0], dtype=tf.int32) + end = tf.constant([1], dtype=tf.int32) + strides = tf.constant([1], dtype=tf.int32) + hash_rows = tf.strided_slice(cast, begin=begin, end=end, strides=strides, shrink_axis_mask=1) + sub_out = hash_rows - input2 + const = tf.constant(10, dtype=tf.int32) + pack = tf.stack([sub_out, const], axis=0) + fill = tf.fill(pack, tf.constant(0, dtype=tf.float32)) + concat = tf.concat([input1, fill], 0) + reshape = tf.reshape(concat, input3) + shape_tensor = tf.shape(reshape) + output = tf.strided_slice(shape_tensor, begin=begin, end=end, strides=strides, shrink_axis_mask=1) + return output + + def _fused_embedding_padding_reference_impl(self, input0, input1, input2, input3): + cast = tf.cast(input0, tf.int32) + begin = tf.constant([0], dtype=tf.int32) + end = tf.constant([1], dtype=tf.int32) + strides = tf.constant([1], dtype=tf.int32) + hash_rows = tf.strided_slice(cast, begin=begin, end=end, strides=strides, shrink_axis_mask=1) + sub_out = hash_rows - input2 + const = tf.constant(10, dtype=tf.int32) + pack = tf.stack([sub_out, const], axis=0) + fill = tf.fill(pack, tf.constant(0, dtype=tf.float32)) + concat = tf.concat([input1, fill], 0) + output = tf.reshape(concat, input3) + return output + +if __name__ == "__main__": + tf.compat.v1.disable_eager_execution() + unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_select.py b/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_select.py new file mode 100644 index 00000000..981c3e3b --- /dev/null +++ b/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_select.py @@ -0,0 +1,93 @@ +import tensorflow as tf +import numpy as np +import unittest + +class TestFusedGather(unittest.TestCase): + @classmethod + def setUpClass(cls): + """Initialize test data and custom op""" + # Load custom op + cls.custom_op = tf.load_op_library('kpfusedgather.so') + + # Base test data + cls.base_data = np.linspace(0, 11, num=240, endpoint=False, dtype=np.float32).reshape(20, 12) + cls.base_slice_input = np.array([[0, 0], [0, 1], [1, 2]], dtype=np.int64) + 
cls.base_begin = [0, 1] + cls.base_end = [0, 2] + cls.base_strides = [1, 1] + # Create tf session + cls.sess = tf.compat.v1.Session() + + @classmethod + def tearDownClass(cls): + cls.sess.close() + + def test_custom(self): + # execute custom op + custom_out1, custom_out2, custom_out3= self.custom_op.KPFusedGather( + data=self.base_data, + slice_input=self.base_slice_input, + begin=self.base_begin, + ) + + # tf native implementation + tf_out1, tf_out2, tf_out3 = self._tf_reference_impl( + self.base_data, + self.base_slice_input, + self.base_begin, + ) + + custom_out_val1, custom_out_val2, custom_out_val3 = self.sess.run([custom_out1, custom_out2, custom_out3]) + tf_out_val1, tf_out_val2, tf_out_val3 = self.sess.run([tf_out1, tf_out2, tf_out3]) + + print("custom_out_val1: ", custom_out_val1) + print("custom_out_val2: ", custom_out_val2) + print("custom_out_val3: ", custom_out_val3) + print("tf_out_val1: ", tf_out_val1) + print("tf_out_val2: ", tf_out_val2) + print("tf_out_val3: ", tf_out_val3) + + np.testing.assert_array_equal( + custom_out_val1, + tf_out_val1, + err_msg="Segment count mismatch" + ) + + np.testing.assert_array_equal( + custom_out_val2, + tf_out_val2, + err_msg="Segment count mismatch" + ) + + np.testing.assert_allclose( + custom_out_val3, + tf_out_val3, + rtol=1e-6, + err_msg="Output values mismatch" + ) + + def _tf_reference_impl(self, data, slice_input, begin): + slice_out = tf.strided_slice( + slice_input, + begin = begin, + end = [tf.shape(slice_input)[0], begin[1] + 2], + strides = [1, 1], + begin_mask = 1, + end_mask = 1, + shrink_axis_mask = 2 + ) + + slice_out, slice_out_indices = tf.unique(slice_out) + output_shape = tf.shape(slice_out) + slice_out = tf.reshape(slice_out, [-1]) + slice_out, _ = tf.unique(slice_out) + + gather1_result = tf.gather(data, slice_out) + gather1_result = tf.reshape(gather1_result, [-1, 12]) + + gather2_result = tf.gather(gather1_result, slice_out) + return output_shape, slice_out_indices, gather2_result + +if __name__ == "__main__": + tf.compat.v1.disable_eager_execution() + unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_dynamic_stitch_test.py b/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_dynamic_stitch_test.py new file mode 100644 index 00000000..a85f9ae7 --- /dev/null +++ b/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_dynamic_stitch_test.py @@ -0,0 +1,96 @@ +import os +os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '0' +import tensorflow as tf +import numpy as np +import unittest + +class TestSparseSegmentMeanSlice(unittest.TestCase): + @classmethod + def setUpClass(cls): + """Initialize test data and custom op""" + # Load custom op + cls.custom_op = tf.load_op_library('kp_fused_sparse_dynamic_stitch.so') + + cls.variables = [] + max_val = float('inf') + for i in range(12): + N_i = np.random.randint(1000000, 44739244) + max_val = min(N_i, max_val) + var = tf.Variable( + tf.random.normal([N_i, 10], dtype=tf.float32), # shape: (N_i, 10) + name=f"embedding_table_{i}" + ) + cls.variables.append(var) + print(f"Created variable {i}: shape={var.shape}") + + x_np = np.random.randint(0, 12*max_val, size=(10000, 12)) + cls.x = tf.constant(x_np, dtype=tf.int64) + + # Create tf session + cls.sess = tf.compat.v1.Session() + cls.sess.run(tf.compat.v1.global_variables_initializer()) + + @classmethod + def tearDownClass(cls): + cls.sess.close() + + def test_base(self): + x_first = 
self.sess.run(self.x) + var_first = self.sess.run(self.variables[0]) + + x_second = self.sess.run(self.x) + var_second = self.sess.run(self.variables[0]) + np.testing.assert_allclose( + x_first, + x_second, + rtol=1e-6, + err_msg="Input values mismatch" + ) + + np.testing.assert_allclose( + var_first, + var_second, + rtol=1e-6, + err_msg="Input values mismatch" + ) + + # execute custom op + custom_out = self.custom_op.kp_fused_sparse_dynamic_stitch(self.x, self.variables) + + # tf native implementation + tf_out = self._tf_reference_impl(self.x, self.variables) + + custom_out_val = self.sess.run([custom_out]) + tf_out_val = self.sess.run([tf_out]) + print("custom_shape: ") + print(custom_out_val[0].shape) + print("tf_out shape: ") + print(tf_out_val[0].shape) + # Numerical comparison + np.testing.assert_allclose( + custom_out_val[0], + tf_out_val[0], + rtol=1e-6, + err_msg="Output values mismatch" + ) + + def _tf_reference_impl(self, x, variables): + x_1 = tf.reshape(x, shape=[-1]) + group_ids = tf.math.floormod(x_1, 12) + group_ids = tf.cast(group_ids, dtype=np.int32) + chunk_indices = tf.math.floordiv(x_1, 12) + + original_indices = tf.range(0,tf.size(x_1),1) + + a = tf.dynamic_partition(original_indices, group_ids, num_partitions=12) + b = tf.dynamic_partition(chunk_indices, group_ids, num_partitions=12) + + c = [tf.gather(variables[i], b[i]) for i in range(12)] + + d = tf.dynamic_stitch(a, c) + + return d + +if __name__ == "__main__": + tf.compat.v1.disable_eager_execution() + unittest.main(argv=[''], verbosity=1) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_reshape_test.py b/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_reshape_test.py new file mode 100644 index 00000000..e8922699 --- /dev/null +++ b/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_reshape_test.py @@ -0,0 +1,100 @@ +import tensorflow as tf +import numpy as np +import unittest + +class TestFusedSparseReshape(unittest.TestCase): + @classmethod + def setUpClass(cls): + """Initialize test data and custom op""" + # Load custom op + cls.custom_op = tf.load_op_library('kppattern6.so') + + # Base test data + cls.base_slice_input = np.array([[0, 0], [0, 1], [1, 2], [3, 4]], dtype=np.int64) + cls.base_begin = [0, 1] + cls.base_end = [0, 2] + cls.base_strides = [1, 1] + cls.base_newshape = [2, 4] + # Create tf session + cls.sess = tf.compat.v1.Session() + + @classmethod + def tearDownClass(cls): + cls.sess.close() + + def test_custom(self): + # execute custom op + custom_out1, custom_out2, = self.custom_op.KPFusedSparseReshape( + slice_input=self.base_slice_input, + begin=self.base_begin, + new_shape=self.base_newshape + ) + + # tf native implementation + tf_out1, tf_out2, tf_out3 = self._tf_reference_impl( + self.base_slice_input, + self.base_begin, + self.base_newshape + ) + + custom_out_val1, custom_out_val2 = self.sess.run([custom_out1, custom_out2]) + tf_out_val1, tf_out_val2, tf_out_val3 = self.sess.run([tf_out1, tf_out2, tf_out3]) + + print("custom_out_val1: ", custom_out_val1) + print("custom_out_val2: ", custom_out_val2) + print("tf_out_val1: ", tf_out_val1) + print("tf_out_val2: ", tf_out_val2) + + np.testing.assert_array_equal( + custom_out_val1, + tf_out_val1, + err_msg="Segment count mismatch" + ) + + np.testing.assert_array_equal( + custom_out_val2, + tf_out_val2, + err_msg="Segment count mismatch" + ) + + def _tf_reference_impl(self, slice_input, begin, new_shape): + slice67_out = 
tf.strided_slice( + slice_input, + begin=begin, + end=[0, 2], + strides=[1, 1], + begin_mask=1, + end_mask=1, + shrink_axis_mask=2 + ) + + slice67_out = tf.reshape(slice67_out, [-1, 1]) + shape_out = tf.shape(slice67_out) + slice57_out = tf.strided_slice( + shape_out, + begin=[0], + end=[1], + strides=[1], + shrink_axis_mask=1 + ) + + const2 = tf.constant(2) + input_shape = tf.stack([slice57_out, const2]) + input_shape = tf.cast(input_shape, tf.int64) + + range_out = tf.range(0, slice57_out, 1) + range_out = tf.reshape(range_out, [-1, 1]) + range_out_64 = tf.cast(range_out, dtype=tf.int64) + concat_out = tf.concat([range_out_64, slice67_out], axis=-1) + + sparse_tensor = tf.SparseTensor( + indices=concat_out, + values=[1,2,3,4], + dense_shape=input_shape + ) + sparse_tensor_out = tf.sparse.reshape(sparse_tensor, new_shape) + return sparse_tensor_out.indices, sparse_tensor_out.dense_shape, concat_out + +if __name__ == "__main__": + tf.compat.v1.disable_eager_execution() + unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_segment_reduce_test.py b/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_segment_reduce_test.py new file mode 100644 index 00000000..dcca2f81 --- /dev/null +++ b/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_segment_reduce_test.py @@ -0,0 +1,132 @@ +import tensorflow as tf +import numpy as np +import unittest + +class TestSparseSegmentMeanSlice(unittest.TestCase): + @classmethod + def setUpClass(cls): + """Initialize test data and custom op""" + # Load custom op + cls.custom_op = tf.load_op_library('kp_fused_sparse_segment_reduce.so') + + # Base test data + cls.base_data = np.array([[1.0, 2.0, 3.0], [3.0, 4.0,5.0], [5.0, 6.0, 7.0], [5.0, 6.0, 7.0]], dtype=np.float32) # shape {4, 3} + cls.base_indices = np.array([0, 1, 2], dtype=np.int64) # shape {3} + cls.base_slice_input = np.array([[0, 0], [0, 2], [1, 2]], dtype=np.int64) # shape {3, 2} + cls.base_begin = [0, 1] + cls.base_end = [0, 2] + cls.base_strides = [1, 2] + # Create tf session + cls.sess = tf.compat.v1.Session() + + @classmethod + def tearDownClass(cls): + cls.sess.close() + + def test_mean(self): + # execute custom op + custom_out, custom_slice_out = self.custom_op.kp_fused_sparse_segment_reduce( + data=self.base_data, + indices=self.base_indices, + slice_input=self.base_slice_input, + begin=self.base_begin, + end = self.base_end, + strides = self.base_strides + ) + + # tf native implementation + tf_out, tf_slice_out = self._tf_reference_impl( + self.base_data, + self.base_indices, + self.base_slice_input, + self.base_begin, + self.base_end, + self.base_strides, + True + ) + + custom_out_val, custom_slice_out_val = self.sess.run([custom_out, custom_slice_out]) + tf_out_val, tf_slice_out_val = self.sess.run([tf_out, tf_slice_out]) + + # Numerical comparison + np.testing.assert_allclose( + custom_out_val, + tf_out_val, + rtol=1e-6, + err_msg="Output values mismatch" + ) + np.testing.assert_array_equal( + custom_slice_out_val, + tf_slice_out_val, + err_msg="Segment count mismatch" + ) + + def test_sum(self): + custom_out, custom_slice_out = self.custom_op.kp_fused_sparse_segment_reduce( + data=self.base_data, + indices=self.base_indices, + slice_input=self.base_slice_input, + begin=self.base_begin, + end = self.base_end, + strides = self.base_strides, + combiner=0 + ) + + tf_out, tf_slice_out = self._tf_reference_impl( + self.base_data, + self.base_indices, + 
self.base_slice_input, + self.base_begin, + self.base_end, + self.base_strides, + False + ) + + custom_out_val, custom_slice_out_val = self.sess.run([custom_out, custom_slice_out]) + tf_out_val, tf_slice_out_val = self.sess.run([tf_out, tf_slice_out]) + + np.testing.assert_allclose( + custom_out_val, + tf_out_val, + rtol=1e-6, + err_msg="Output values mismatch" + ) + np.testing.assert_array_equal( + custom_slice_out_val, + tf_slice_out_val, + err_msg="Segment count mismatch" + ) + + def _tf_reference_impl(self, data, indices, slice_input, begin, end, strides, is_mean): + slice_out = tf.strided_slice( + slice_input, + begin= begin, + end= end, + strides= strides, + begin_mask=1, + end_mask=1, + shrink_axis_mask=2 + ) + + segment_ids = tf.cast(slice_out, dtype=tf.int32) + if is_mean: + output = tf.sparse.segment_mean( + data = data, + indices = indices, + segment_ids= segment_ids + ) + else: + output = tf.sparse.segment_sum( + data = data, + indices = indices, + segment_ids= segment_ids + ) + + output_shape = tf.shape(output) + slice_out = tf.strided_slice(output_shape, begin=[0], end=[1], strides=[1]) + + return output, slice_out + +if __name__ == "__main__": + tf.compat.v1.disable_eager_execution() + unittest.main(argv=[''], verbosity=2) \ No newline at end of file -- Gitee From 61cce88af0de1831a94584ab00138518c843bcf2 Mon Sep 17 00:00:00 2001 From: Codersheepchen Date: Wed, 23 Jul 2025 04:24:09 -0400 Subject: [PATCH 2/9] fix KPFusedEmbeddingActionIdGather type unmatched --- .../embedding_fused_action_id_gather.cc | 25 +++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/tensorflow/core/kernels/embedding_fused_action_id_gather.cc b/tensorflow/core/kernels/embedding_fused_action_id_gather.cc index b324f35f..e20b8e54 100644 --- a/tensorflow/core/kernels/embedding_fused_action_id_gather.cc +++ b/tensorflow/core/kernels/embedding_fused_action_id_gather.cc @@ -47,15 +47,20 @@ static void GatherV2Impl(OpKernelContext* context, const int num_indices = indices_shape.num_elements(); float* temp_data = temp->flat().data(); - VLOG(1) << "num_indices : " << num_indices; - OP_REQUIRES(context, axis == 0, errors::InvalidArgument("axis only support 0")); - const int slice_size = P1; - for (int i = 0; i < num_indices; ++i) { - Tindices idx = indices_data[i]; - OP_REQUIRES(context, (idx < 0 || idx >= P0), errors::InvalidArgument("GatherV2 axis=0: index out of range")); - std::memcpy(temp_data + i * slice_size, - params_data + idx * slice_size, - sizeof(float) * slice_size); + VLOG(2) << "num_indices : " << num_indices; + if (axis == 0) { + const int slice_size = P1; + for (int i = 0; i < num_indices; ++i) { + Tindices idx = indices_data[i]; + if (idx < 0 || idx >= P0) { + LOG(FATAL) << "GatherV2 axis=0: index out of range: " << idx; + } + std::memcpy(temp_data + i * slice_size, + params_data + idx * slice_size, + sizeof(float) * slice_size); + } + } else { + LOG(FATAL) << "Only axis=0 is supported"; } VLOG(1) << "temp value : " << temp->DebugString(100); } @@ -121,4 +126,4 @@ REGISTER_CPU_KERNEL(int32, int32) REGISTER_CPU_KERNEL(int64, int64) REGISTER_CPU_KERNEL(int32, int64) -} \ No newline at end of file +} -- Gitee From 5e66f13ca1de6bd6873d69d28b4277b0cbb8219c Mon Sep 17 00:00:00 2001 From: Codersheepchen Date: Wed, 23 Jul 2025 04:31:56 -0400 Subject: [PATCH 3/9] chenge test_kp_fused_embedding_action_id_gather input0 from int32 to int64 --- .../fused_embedding_action_id_gather_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_action_id_gather_test.py b/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_action_id_gather_test.py index 94db9d42..56f24fea 100644 --- a/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_action_id_gather_test.py +++ b/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_action_id_gather_test.py @@ -16,7 +16,7 @@ class TestFusedEmbeddingActionIdGather(unittest.TestCase): indices1_shape = (8, 10) indices2_shape = (5, 6) params_shape = (80, 300) - cls.input0 = np.random.randint(0, params_shape[0], size=indices1_shape, dtype=np.int32) + cls.input0 = np.random.randint(0, params_shape[0], size=indices1_shape, dtype=np.int64) cls.input1 = np.random.random(params_shape).astype(np.float32) cls.input2 = np.random.randint(0, indices1_shape[0], size=indices2_shape, dtype=np.int32) cls.input3 = params_shape[0] @@ -30,7 +30,7 @@ class TestFusedEmbeddingActionIdGather(unittest.TestCase): def test_kp_fused_embedding_action_id_gather(self): # execute custom op custom_out = self.custom_op.KPFusedEmbeddingActionIdGather( - input0=tf.constant(self.input0, dtype=tf.int32), + input0=tf.constant(self.input0, dtype=tf.int64), input1=tf.constant(self.input1, dtype=tf.float32), input2=tf.constant(self.input2, dtype=tf.int32), input3=tf.constant(self.input3, dtype=tf.int32), @@ -38,7 +38,7 @@ class TestFusedEmbeddingActionIdGather(unittest.TestCase): # tf native implementation tf_out = self._tf_reference_impl( - input0=tf.constant(self.input0, dtype=tf.int32), + input0=tf.constant(self.input0, dtype=tf.int64), input1=tf.constant(self.input1, dtype=tf.float32), input2=tf.constant(self.input2, dtype=tf.int32), input3=tf.constant(self.input3, dtype=tf.int32), -- Gitee From cb683f7e0e9f57cfbcb58054c3341b1fe438986e Mon Sep 17 00:00:00 2001 From: Codersheepchen Date: Fri, 25 Jul 2025 03:44:12 -0400 Subject: [PATCH 4/9] add perftest and refactor functest --- .../fused_embedding_action_id_gather_test.py | 69 --------- .../fused_embedding_gather_test.py | 88 ------------ .../fused_embedding_padding_test.py | 113 --------------- ...ed_embedding_sparse_dynamic_stitch_test.py | 97 ------------- .../fused_embedding_sparse_reshape_test.py | 102 ------------- ...ed_embedding_sparse_segment_reduce_test.py | 134 ------------------ .../fused_embedding_sparse_select.py | 92 ------------ .../fused_embedding_action_id_gather_test.py | 67 --------- .../fused_embedding_gather_test.py | 93 ------------ .../fused_embedding_padding_test.py | 111 --------------- .../fused_embedding_select.py | 93 ------------ ...ed_embedding_sparse_dynamic_stitch_test.py | 96 ------------- .../fused_embedding_sparse_reshape_test.py | 100 ------------- ...ed_embedding_sparse_segment_reduce_test.py | 132 ----------------- .../fused_embedding_gather_test.py | 4 +- .../fused_embedding_sparse_select.py | 113 +++++++++++++++ 16 files changed, 115 insertions(+), 1389 deletions(-) delete mode 100644 tensorflow/python/grappler/embedding_fused_functest/fused_embedding_action_id_gather_test.py delete mode 100644 tensorflow/python/grappler/embedding_fused_functest/fused_embedding_gather_test.py delete mode 100644 tensorflow/python/grappler/embedding_fused_functest/fused_embedding_padding_test.py delete mode 100644 tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_dynamic_stitch_test.py delete mode 100644 tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_reshape_test.py delete mode 100644 
tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_segment_reduce_test.py delete mode 100644 tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_select.py delete mode 100644 tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_action_id_gather_test.py delete mode 100644 tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_gather_test.py delete mode 100644 tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_padding_test.py delete mode 100644 tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_select.py delete mode 100644 tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_dynamic_stitch_test.py delete mode 100644 tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_reshape_test.py delete mode 100644 tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_segment_reduce_test.py create mode 100644 tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_select.py diff --git a/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_action_id_gather_test.py b/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_action_id_gather_test.py deleted file mode 100644 index 56f24fea..00000000 --- a/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_action_id_gather_test.py +++ /dev/null @@ -1,69 +0,0 @@ -import tensorflow as tf -import numpy as np -import unittest - -from tensorflow.python.ops import gen_embedding_fused_ops - -class TestFusedEmbeddingActionIdGather(unittest.TestCase): - @classmethod - def setUpClass(cls): - """Initialize test data and custom op""" - # Load custom op - cls.custom_op = gen_embedding_fused_ops - - # Base test data - np.random.seed(140) - indices1_shape = (8, 10) - indices2_shape = (5, 6) - params_shape = (80, 300) - cls.input0 = np.random.randint(0, params_shape[0], size=indices1_shape, dtype=np.int64) - cls.input1 = np.random.random(params_shape).astype(np.float32) - cls.input2 = np.random.randint(0, indices1_shape[0], size=indices2_shape, dtype=np.int32) - cls.input3 = params_shape[0] - # Create tf session - cls.sess = tf.compat.v1.Session() - - @classmethod - def tearDownClass(cls): - cls.sess.close() - - def test_kp_fused_embedding_action_id_gather(self): - # execute custom op - custom_out = self.custom_op.KPFusedEmbeddingActionIdGather( - input0=tf.constant(self.input0, dtype=tf.int64), - input1=tf.constant(self.input1, dtype=tf.float32), - input2=tf.constant(self.input2, dtype=tf.int32), - input3=tf.constant(self.input3, dtype=tf.int32), - ) - - # tf native implementation - tf_out = self._tf_reference_impl( - input0=tf.constant(self.input0, dtype=tf.int64), - input1=tf.constant(self.input1, dtype=tf.float32), - input2=tf.constant(self.input2, dtype=tf.int32), - input3=tf.constant(self.input3, dtype=tf.int32), - ) - - custom_out_val = self.sess.run([custom_out]) - tf_out_val = self.sess.run([tf_out]) - - # Numerical comparison - np.testing.assert_array_equal( - custom_out_val, - tf_out_val, - err_msg="result mismatch" - ) - - def _tf_reference_impl(self, input0, input1, input2, input3): - gather1 = tf.gather(input1, input0, axis=0) - gather2 = tf.gather(gather1, input2, axis=0) - pack1 = tf.stack([input3, 1680], axis=0) - pack2 = tf.stack([input3, -1], axis=0) - reshape = tf.reshape(gather2, pack2) - fill = tf.fill(pack1, tf.constant(0, dtype=tf.float32)) - output = tf.concat([reshape, fill], axis=-1) - return output - -if __name__ == 
"__main__": - tf.compat.v1.disable_eager_execution() - unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_gather_test.py b/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_gather_test.py deleted file mode 100644 index f47b70d2..00000000 --- a/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_gather_test.py +++ /dev/null @@ -1,88 +0,0 @@ -import tensorflow as tf -import numpy as np -import unittest - -from tensorflow.python.ops import gen_embedding_fused_ops - -class TestFusedGather(unittest.TestCase): - @classmethod - def setUpClass(cls): - """Initialize test data and custom op""" - # Load custom op - cls.custom_op = gen_embedding_fused_ops - - # Base test data - cls.base_data = np.linspace(0, 11, num=240, endpoint=False, dtype=np.float32).reshape(20, 12) - cls.base_slice_input = np.array([[0, 0], [0, 1], [1, 2]], dtype=np.int64) - cls.base_begin = [0, 1] - cls.base_end = [0, 2] - cls.base_strides = [1, 1] - # Create tf session - cls.sess = tf.compat.v1.Session() - - @classmethod - def tearDownClass(cls): - cls.sess.close() - - def test_custom(self): - # execute custom op - custom_out1, custom_out2, custom_out3= self.custom_op.KPFusedGather( - data=self.base_data, - slice_input=self.base_slice_input, - begin=self.base_begin, - ) - - # tf native implementation - tf_out1, tf_out2, tf_out3 = self._tf_reference_impl( - self.base_data, - self.base_slice_input, - self.base_begin, - ) - - custom_out_val1, custom_out_val2, custom_out_val3 = self.sess.run([custom_out1, custom_out2, custom_out3]) - tf_out_val1, tf_out_val2, tf_out_val3 = self.sess.run([tf_out1, tf_out2, tf_out3]) - - np.testing.assert_array_equal( - custom_out_val1, - tf_out_val1, - err_msg="Segment count mismatch" - ) - - np.testing.assert_array_equal( - custom_out_val2, - tf_out_val2, - err_msg="Segment count mismatch" - ) - - np.testing.assert_allclose( - custom_out_val3, - tf_out_val3, - rtol=1e-6, - err_msg="Output values mismatch" - ) - - def _tf_reference_impl(self, data, slice_input, begin): - slice_out = tf.strided_slice( - slice_input, - begin = begin, - end = [tf.shape(slice_input)[0], begin[1] + 2], - strides = [1, 1], - begin_mask = 1, - end_mask = 1, - shrink_axis_mask = 2 - ) - - slice_out, slice_out_indices = tf.unique(slice_out) - output_shape = tf.shape(slice_out) - slice_out = tf.reshape(slice_out, [-1]) - slice_out, _ = tf.unique(slice_out) - - gather1_result = tf.gather(data, slice_out) - gather1_result = tf.reshape(gather1_result, [-1, 12]) - - gather2_result = tf.gather(gather1_result, slice_out) - return output_shape, slice_out_indices, gather2_result - -if __name__ == "__main__": - tf.compat.v1.disable_eager_execution() - unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_padding_test.py b/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_padding_test.py deleted file mode 100644 index 78c01e17..00000000 --- a/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_padding_test.py +++ /dev/null @@ -1,113 +0,0 @@ -import tensorflow as tf -import numpy as np -import unittest - -from tensorflow.python.ops import gen_embedding_fused_ops - -class TestFusedEmbeddingPadding(unittest.TestCase): - @classmethod - def setUpClass(cls): - """Initialize test data and custom op""" - # Load custom op - cls.custom_op = gen_embedding_fused_ops - - # Base test data - 
np.random.seed(140) - cls.input0 = np.random.randint(0, 100, size=(2 * 3, 10), dtype=np.int64) - cls.input1 = np.random.rand(2 * 2, 10).astype(np.float) - cls.input2 = cls.input1.shape - cls.input3 = np.array([-1, 20]).astype(np.int32) - # Create tf session - cls.sess = tf.compat.v1.Session() - - @classmethod - def tearDownClass(cls): - cls.sess.close() - - def test_kp_fused_embedding_padding_fast(self): - # execute custom op - _, custom_out = self.custom_op.KPFusedEmbeddingPaddingFast( - input0=self.input0.shape, - input1=self.input1, - input2=self.input2[0], - input3=self.input3, - ) - - # tf native implementation - tf_out = self._fused_embedding_padding_fast_reference_impl( - tf.constant(self.input0.shape, dtype=tf.int64), - tf.constant(self.input1, dtype=tf.float32), - tf.constant(self.input2[0], dtype=tf.int32), - tf.constant(self.input3, dtype=tf.int32), - ) - - custom_out_val = self.sess.run([custom_out]) - tf_out_val = self.sess.run([tf_out]) - - # Numerical comparison - np.testing.assert_array_equal( - custom_out_val, - tf_out_val, - err_msg="result mismatch" - ) - - def test_kp_fused_embedding_padding(self): - # execute custom op - _, custom_out = self.custom_op.KPFusedEmbeddingPadding( - input0=self.input0.shape, - input1=self.input1, - input2=self.input2[0], - input3=self.input3, - ) - - # tf native implementation - tf_out = self._fused_embedding_padding_reference_impl( - tf.constant(self.input0.shape, dtype=tf.int64), - tf.constant(self.input1, dtype=tf.float32), - tf.constant(self.input2[0], dtype=tf.int32), - tf.constant(self.input3, dtype=tf.int32), - ) - - custom_out_val = self.sess.run([custom_out]) - tf_out_val = self.sess.run([tf_out]) - - # Numerical comparison - np.testing.assert_array_equal( - custom_out_val, - tf_out_val, - err_msg="result mismatch" - ) - - def _fused_embedding_padding_fast_reference_impl(self, input0, input1, input2, input3): - cast = tf.cast(input0, tf.int32) - begin = tf.constant([0], dtype=tf.int32) - end = tf.constant([1], dtype=tf.int32) - strides = tf.constant([1], dtype=tf.int32) - hash_rows = tf.strided_slice(cast, begin=begin, end=end, strides=strides, shrink_axis_mask=1) - sub_out = hash_rows - input2 - const = tf.constant(10, dtype=tf.int32) - pack = tf.stack([sub_out, const], axis=0) - fill = tf.fill(pack, tf.constant(0, dtype=tf.float32)) - concat = tf.concat([input1, fill], 0) - reshape = tf.reshape(concat, input3) - shape_tensor = tf.shape(reshape) - output = tf.strided_slice(shape_tensor, begin=begin, end=end, strides=strides, shrink_axis_mask=1) - return output - - def _fused_embedding_padding_reference_impl(self, input0, input1, input2, input3): - cast = tf.cast(input0, tf.int32) - begin = tf.constant([0], dtype=tf.int32) - end = tf.constant([1], dtype=tf.int32) - strides = tf.constant([1], dtype=tf.int32) - hash_rows = tf.strided_slice(cast, begin=begin, end=end, strides=strides, shrink_axis_mask=1) - sub_out = hash_rows - input2 - const = tf.constant(10, dtype=tf.int32) - pack = tf.stack([sub_out, const], axis=0) - fill = tf.fill(pack, tf.constant(0, dtype=tf.float32)) - concat = tf.concat([input1, fill], 0) - output = tf.reshape(concat, input3) - return output - -if __name__ == "__main__": - tf.compat.v1.disable_eager_execution() - unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_dynamic_stitch_test.py b/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_dynamic_stitch_test.py deleted file mode 100644 
index 4de55241..00000000 --- a/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_dynamic_stitch_test.py +++ /dev/null @@ -1,97 +0,0 @@ -import os -import tensorflow as tf -import numpy as np -import unittest - -from tensorflow.python.ops import gen_embedding_fused_ops - -class TestSparseSegmentMeanSlice(unittest.TestCase): - @classmethod - def setUpClass(cls): - """Initialize test data and custom op""" - # Load custom op - cls.custom_op = gen_embedding_fused_ops - - cls.variables = [] - max_val = float('inf') - for i in range(12): - N_i = np.random.randint(1000000, 44739244) - max_val = min(N_i, max_val) - var = tf.Variable( - tf.random.normal([N_i, 10], dtype=tf.float32), # shape: (N_i, 10) - name=f"embedding_table_{i}" - ) - cls.variables.append(var) - print(f"Created variable {i}: shape={var.shape}") - - x_np = np.random.randint(0, 12*max_val, size=(10000, 12)) - cls.x = tf.constant(x_np, dtype=tf.int64) - - # Create tf session - cls.sess = tf.compat.v1.Session() - cls.sess.run(tf.compat.v1.global_variables_initializer()) - - @classmethod - def tearDownClass(cls): - cls.sess.close() - - def test_base(self): - x_first = self.sess.run(self.x) - var_first = self.sess.run(self.variables[0]) - - x_second = self.sess.run(self.x) - var_second = self.sess.run(self.variables[0]) - np.testing.assert_allclose( - x_first, - x_second, - rtol=1e-6, - err_msg="Input values mismatch" - ) - - np.testing.assert_allclose( - var_first, - var_second, - rtol=1e-6, - err_msg="Input values mismatch" - ) - - # execute custom op - custom_out = self.custom_op.KPFusedSparseDynamicStitch(x=self.x, variables=self.variables) - - # tf native implementation - tf_out = self._tf_reference_impl(x=self.x, variables=self.variables) - - custom_out_val = self.sess.run([custom_out]) - tf_out_val = self.sess.run([tf_out]) - print("custom_shape: ") - print(custom_out_val[0].shape) - print("tf_out shape: ") - print(tf_out_val[0].shape) - # Numerical comparison - np.testing.assert_allclose( - custom_out_val[0], - tf_out_val[0], - rtol=1e-6, - err_msg="Output values mismatch" - ) - - def _tf_reference_impl(self, x, variables): - x_1 = tf.reshape(x, shape=[-1]) - group_ids = tf.math.floormod(x_1, 12) - group_ids = tf.cast(group_ids, dtype=np.int32) - chunk_indices = tf.math.floordiv(x_1, 12) - - original_indices = tf.range(0,tf.size(x_1),1) - - a = tf.dynamic_partition(original_indices, group_ids, num_partitions=12) - b = tf.dynamic_partition(chunk_indices, group_ids, num_partitions=12) - - c = [tf.gather(variables[i], b[i]) for i in range(12)] - - d = tf.dynamic_stitch(a, c) - - return d - -if __name__ == "__main__": - tf.compat.v1.disable_eager_execution() - unittest.main(argv=[''], verbosity=1) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_reshape_test.py b/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_reshape_test.py deleted file mode 100644 index 37d27531..00000000 --- a/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_reshape_test.py +++ /dev/null @@ -1,102 +0,0 @@ -import tensorflow as tf -import numpy as np -import unittest - -from tensorflow.python.ops import gen_embedding_fused_ops - -class TestFusedSparseReshape(unittest.TestCase): - @classmethod - def setUpClass(cls): - """Initialize test data and custom op""" - # Load custom op - cls.custom_op = gen_embedding_fused_ops - - # Base test data - cls.base_slice_input = np.array([[0, 0], [0, 1], [1, 2], [3, 4]], dtype=np.int64) 
- cls.base_begin = [0, 1] - cls.base_end = [0, 2] - cls.base_strides = [1, 1] - cls.base_newshape = [2, 4] - # Create tf session - cls.sess = tf.compat.v1.Session() - - @classmethod - def tearDownClass(cls): - cls.sess.close() - - def test_custom(self): - # execute custom op - custom_out1, custom_out2, = self.custom_op.KPFusedSparseReshape( - slice_input=self.base_slice_input, - begin=self.base_begin, - new_shape=self.base_newshape - ) - - # tf native implementation - tf_out1, tf_out2, tf_out3 = self._tf_reference_impl( - self.base_slice_input, - self.base_begin, - self.base_newshape - ) - - custom_out_val1, custom_out_val2 = self.sess.run([custom_out1, custom_out2]) - tf_out_val1, tf_out_val2, tf_out_val3 = self.sess.run([tf_out1, tf_out2, tf_out3]) - - print("custom_out_val1: ", custom_out_val1) - print("custom_out_val2: ", custom_out_val2) - print("tf_out_val1: ", tf_out_val1) - print("tf_out_val2: ", tf_out_val2) - - np.testing.assert_array_equal( - custom_out_val1, - tf_out_val1, - err_msg="Segment count mismatch" - ) - - np.testing.assert_array_equal( - custom_out_val2, - tf_out_val2, - err_msg="Segment count mismatch" - ) - - def _tf_reference_impl(self, slice_input, begin, new_shape): - slice67_out = tf.strided_slice( - slice_input, - begin=begin, - end=[0, 2], - strides=[1, 1], - begin_mask=1, - end_mask=1, - shrink_axis_mask=2 - ) - - slice67_out = tf.reshape(slice67_out, [-1, 1]) - shape_out = tf.shape(slice67_out) - slice57_out = tf.strided_slice( - shape_out, - begin=[0], - end=[1], - strides=[1], - shrink_axis_mask=1 - ) - - const2 = tf.constant(2) - input_shape = tf.stack([slice57_out, const2]) - input_shape = tf.cast(input_shape, tf.int64) - - range_out = tf.range(0, slice57_out, 1) - range_out = tf.reshape(range_out, [-1, 1]) - range_out_64 = tf.cast(range_out, dtype=tf.int64) - concat_out = tf.concat([range_out_64, slice67_out], axis=-1) - - sparse_tensor = tf.SparseTensor( - indices=concat_out, - values=[1,2,3,4], - dense_shape=input_shape - ) - sparse_tensor_out = tf.sparse.reshape(sparse_tensor, new_shape) - return sparse_tensor_out.indices, sparse_tensor_out.dense_shape, concat_out - -if __name__ == "__main__": - tf.compat.v1.disable_eager_execution() - unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_segment_reduce_test.py b/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_segment_reduce_test.py deleted file mode 100644 index 69c7a114..00000000 --- a/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_segment_reduce_test.py +++ /dev/null @@ -1,134 +0,0 @@ -import tensorflow as tf -import numpy as np -import unittest - -from tensorflow.python.ops import gen_embedding_fused_ops - -class TestSparseSegmentMeanSlice(unittest.TestCase): - @classmethod - def setUpClass(cls): - """Initialize test data and custom op""" - # Load custom op - cls.custom_op = gen_embedding_fused_ops - - # Base test data - cls.base_data = np.array([[1.0, 2.0, 3.0], [3.0, 4.0,5.0], [5.0, 6.0, 7.0], [5.0, 6.0, 7.0]], dtype=np.float32) # shape {4, 3} - cls.base_indices = np.array([0, 1, 2], dtype=np.int64) # shape {3} - cls.base_slice_input = np.array([[0, 0], [0, 2], [1, 2]], dtype=np.int64) # shape {3, 2} - cls.base_begin = [0, 1] - cls.base_end = [0, 2] - cls.base_strides = [1, 2] - # Create tf session - cls.sess = tf.compat.v1.Session() - - @classmethod - def tearDownClass(cls): - cls.sess.close() - - def test_mean(self): - # execute custom op - 
custom_out, custom_slice_out = self.custom_op.KPFusedSparseSegmentReduce( - data=self.base_data, - indices=self.base_indices, - slice_input=self.base_slice_input, - begin=self.base_begin, - end = self.base_end, - strides = self.base_strides - ) - - # tf native implementation - tf_out, tf_slice_out = self._tf_reference_impl( - self.base_data, - self.base_indices, - self.base_slice_input, - self.base_begin, - self.base_end, - self.base_strides, - True - ) - - custom_out_val, custom_slice_out_val = self.sess.run([custom_out, custom_slice_out]) - tf_out_val, tf_slice_out_val = self.sess.run([tf_out, tf_slice_out]) - - # Numerical comparison - np.testing.assert_allclose( - custom_out_val, - tf_out_val, - rtol=1e-6, - err_msg="Output values mismatch" - ) - np.testing.assert_array_equal( - custom_slice_out_val, - tf_slice_out_val, - err_msg="Segment count mismatch" - ) - - def test_sum(self): - custom_out, custom_slice_out = self.custom_op.KPFusedSparseSegmentReduce( - data=self.base_data, - indices=self.base_indices, - slice_input=self.base_slice_input, - begin=self.base_begin, - end = self.base_end, - strides = self.base_strides, - combiner=0 - ) - - tf_out, tf_slice_out = self._tf_reference_impl( - self.base_data, - self.base_indices, - self.base_slice_input, - self.base_begin, - self.base_end, - self.base_strides, - False - ) - - custom_out_val, custom_slice_out_val = self.sess.run([custom_out, custom_slice_out]) - tf_out_val, tf_slice_out_val = self.sess.run([tf_out, tf_slice_out]) - - np.testing.assert_allclose( - custom_out_val, - tf_out_val, - rtol=1e-6, - err_msg="Output values mismatch" - ) - np.testing.assert_array_equal( - custom_slice_out_val, - tf_slice_out_val, - err_msg="Segment count mismatch" - ) - - def _tf_reference_impl(self, data, indices, slice_input, begin, end, strides, is_mean): - slice_out = tf.strided_slice( - slice_input, - begin= begin, - end= end, - strides= strides, - begin_mask=1, - end_mask=1, - shrink_axis_mask=2 - ) - - segment_ids = tf.cast(slice_out, dtype=tf.int32) - if is_mean: - output = tf.sparse.segment_mean( - data = data, - indices = indices, - segment_ids= segment_ids - ) - else: - output = tf.sparse.segment_sum( - data = data, - indices = indices, - segment_ids= segment_ids - ) - - output_shape = tf.shape(output) - slice_out = tf.strided_slice(output_shape, begin=[0], end=[1], strides=[1]) - - return output, slice_out - -if __name__ == "__main__": - tf.compat.v1.disable_eager_execution() - unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_select.py b/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_select.py deleted file mode 100644 index 6740811a..00000000 --- a/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_select.py +++ /dev/null @@ -1,92 +0,0 @@ -import tensorflow as tf -import numpy as np -import unittest - -from tensorflow.python.ops import gen_embedding_fused_ops - -class TestKPFusedSparseSelect(unittest.TestCase): - @classmethod - def setUpClass(cls): - """Initialize test data and custom op""" - # Load custom op - cls.custom_op = gen_embedding_fused_ops - - # Base test data - cls.input_a = np.random.randint(-10, 10, size=(40, 50)).astype(np.int32) - cls.input_b = np.random.randint(0, 20000, size=(20, 100)).astype(np.int32) - cls.input_c = np.random.randint(0, 10, size=(50, 2, 20)).astype(np.int32) - # Create tf session - cls.sess = tf.compat.v1.Session() - - @classmethod - def 
tearDownClass(cls): - cls.sess.close() - - def test_custom(self): - # execute custom op - output_x, output_y, output_z = self.custom_op.KPFusedSparseSelect( - input_a=tf.constant(self.input_a, dtype=tf.int32), - input_b=tf.constant(self.input_b, dtype=tf.int32), - input_c=tf.constant(self.input_c, dtype=tf.int32), - ) - - # tf native implementation - ref_output_x, ref_output_y, ref_output_z = self._tf_reference_impl( - input_a=tf.constant(self.input_a, dtype=tf.int32), - input_b=tf.constant(self.input_b, dtype=tf.int32), - input_c=tf.constant(self.input_c, dtype=tf.int32), - ) - - op_x_val, op_y_val, op_z_val = self.sess.run([output_x, output_y, output_z]) - ref_x_val, ref_y_val, ref_z_val = self.sess.run([ref_output_x, ref_output_y, ref_output_z]) - - np.testing.assert_allclose( - op_x_val, - ref_x_val, - rtol=1e-5, - err_msg="Output values mismatch" - ) - - np.testing.assert_allclose( - op_y_val, - ref_y_val, - rtol=1e-5, - err_msg="Output values mismatch" - ) - - np.testing.assert_allclose( - op_z_val, - ref_z_val, - rtol=1e-5, - err_msg="Output values mismatch" - ) - - def _tf_reference_impl(self, input_a, input_b, input_c): - a = tf.reshape(input_a, [-1, 1]) - b = tf.reshape(input_b, [-1, 1]) - c = tf.reshape(input_c, [-1, 1]) - - greater_a = tf.greater(a, 0) - shape_reshape_a1 = tf.shape(a) - shape_reshape_a2 = tf.shape(a) - fill_a1 = tf.fill(shape_reshape_a1, tf.constant(1, dtype=tf.float32)) - realdiv = tf.realdiv(fill_a1, tf.constant(1, dtype=tf.float32)) - fill_a2 = tf.fill(shape_reshape_a2, tf.constant(0, dtype=tf.float32)) - cast_a = tf.cast(greater_a, tf.float32) - shape_a = tf.shape(cast_a) - fill_a = tf.fill(shape_a, tf.constant(1, dtype=tf.float32)) - equal_4563 = tf.equal(b, 4563) - equal_10831 = tf.equal(b, 10831) - equal_3 = tf.equal(c, 3) - select_1 = tf.where(equal_4563, fill_a, cast_a) - select_2 = tf.where(equal_10831, fill_a, select_1) - sub = tf.subtract(tf.constant(1, dtype=tf.float32), select_2) - mul = tf.multiply(tf.constant(1, dtype=tf.float32), select_2) - select_3 = tf.where(equal_3, realdiv, fill_a1) - concat = tf.concat([mul, select_3], axis=-1) - return fill_a2, sub, concat - - -if __name__ == "__main__": - tf.compat.v1.disable_eager_execution() - unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_action_id_gather_test.py b/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_action_id_gather_test.py deleted file mode 100644 index c766085c..00000000 --- a/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_action_id_gather_test.py +++ /dev/null @@ -1,67 +0,0 @@ -import tensorflow as tf -import numpy as np -import unittest - -class TestFusedEmbeddingActionIdGather(unittest.TestCase): - @classmethod - def setUpClass(cls): - """Initialize test data and custom op""" - # Load custom op - cls.custom_op = tf.load_op_library('../kernels/fused_embedding_action_id_gather.so') - - # Base test data - np.random.seed(140) - indices1_shape = (8, 10) - indices2_shape = (5, 6) - params_shape = (80, 300) - cls.input0 = np.random.randint(0, params_shape[0], size=indices1_shape, dtype=np.int32) - cls.input1 = np.random.random(params_shape).astype(np.float32) - cls.input2 = np.random.randint(0, indices1_shape[0], size=indices2_shape, dtype=np.int32) - cls.input3 = params_shape[0] - # Create tf session - cls.sess = tf.compat.v1.Session() - - @classmethod - def tearDownClass(cls): - cls.sess.close() - - def 
test_kp_fused_embedding_action_id_gather(self): - # execute custom op - custom_out = self.custom_op.kp_fused_embedding_action_id_gather( - input0=tf.constant(self.input0, dtype=tf.int32), - input1=tf.constant(self.input1, dtype=tf.float32), - input2=tf.constant(self.input2, dtype=tf.int32), - input3=tf.constant(self.input3, dtype=tf.int32), - ) - - # tf native implementation - tf_out = self._tf_reference_impl( - input0=tf.constant(self.input0, dtype=tf.int32), - input1=tf.constant(self.input1, dtype=tf.float32), - input2=tf.constant(self.input2, dtype=tf.int32), - input3=tf.constant(self.input3, dtype=tf.int32), - ) - - custom_out_val = self.sess.run([custom_out]) - tf_out_val = self.sess.run([tf_out]) - - # Numerical comparison - np.testing.assert_array_equal( - custom_out_val, - tf_out_val, - err_msg="result mismatch" - ) - - def _tf_reference_impl(self, input0, input1, input2, input3): - gather1 = tf.gather(input1, input0, axis=0) - gather2 = tf.gather(gather1, input2, axis=0) - pack1 = tf.stack([input3, 1680], axis=0) - pack2 = tf.stack([input3, -1], axis=0) - reshape = tf.reshape(gather2, pack2) - fill = tf.fill(pack1, tf.constant(0, dtype=tf.float32)) - output = tf.concat([reshape, fill], axis=-1) - return output - -if __name__ == "__main__": - tf.compat.v1.disable_eager_execution() - unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_gather_test.py b/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_gather_test.py deleted file mode 100644 index 981c3e3b..00000000 --- a/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_gather_test.py +++ /dev/null @@ -1,93 +0,0 @@ -import tensorflow as tf -import numpy as np -import unittest - -class TestFusedGather(unittest.TestCase): - @classmethod - def setUpClass(cls): - """Initialize test data and custom op""" - # Load custom op - cls.custom_op = tf.load_op_library('kpfusedgather.so') - - # Base test data - cls.base_data = np.linspace(0, 11, num=240, endpoint=False, dtype=np.float32).reshape(20, 12) - cls.base_slice_input = np.array([[0, 0], [0, 1], [1, 2]], dtype=np.int64) - cls.base_begin = [0, 1] - cls.base_end = [0, 2] - cls.base_strides = [1, 1] - # Create tf session - cls.sess = tf.compat.v1.Session() - - @classmethod - def tearDownClass(cls): - cls.sess.close() - - def test_custom(self): - # execute custom op - custom_out1, custom_out2, custom_out3= self.custom_op.KPFusedGather( - data=self.base_data, - slice_input=self.base_slice_input, - begin=self.base_begin, - ) - - # tf native implementation - tf_out1, tf_out2, tf_out3 = self._tf_reference_impl( - self.base_data, - self.base_slice_input, - self.base_begin, - ) - - custom_out_val1, custom_out_val2, custom_out_val3 = self.sess.run([custom_out1, custom_out2, custom_out3]) - tf_out_val1, tf_out_val2, tf_out_val3 = self.sess.run([tf_out1, tf_out2, tf_out3]) - - print("custom_out_val1: ", custom_out_val1) - print("custom_out_val2: ", custom_out_val2) - print("custom_out_val3: ", custom_out_val3) - print("tf_out_val1: ", tf_out_val1) - print("tf_out_val2: ", tf_out_val2) - print("tf_out_val3: ", tf_out_val3) - - np.testing.assert_array_equal( - custom_out_val1, - tf_out_val1, - err_msg="Segment count mismatch" - ) - - np.testing.assert_array_equal( - custom_out_val2, - tf_out_val2, - err_msg="Segment count mismatch" - ) - - np.testing.assert_allclose( - custom_out_val3, - tf_out_val3, - rtol=1e-6, - err_msg="Output values mismatch" - ) - - def 
_tf_reference_impl(self, data, slice_input, begin): - slice_out = tf.strided_slice( - slice_input, - begin = begin, - end = [tf.shape(slice_input)[0], begin[1] + 2], - strides = [1, 1], - begin_mask = 1, - end_mask = 1, - shrink_axis_mask = 2 - ) - - slice_out, slice_out_indices = tf.unique(slice_out) - output_shape = tf.shape(slice_out) - slice_out = tf.reshape(slice_out, [-1]) - slice_out, _ = tf.unique(slice_out) - - gather1_result = tf.gather(data, slice_out) - gather1_result = tf.reshape(gather1_result, [-1, 12]) - - gather2_result = tf.gather(gather1_result, slice_out) - return output_shape, slice_out_indices, gather2_result - -if __name__ == "__main__": - tf.compat.v1.disable_eager_execution() - unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_padding_test.py b/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_padding_test.py deleted file mode 100644 index 75dce007..00000000 --- a/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_padding_test.py +++ /dev/null @@ -1,111 +0,0 @@ -import tensorflow as tf -import numpy as np -import unittest - -class TestFusedEmbeddingPadding(unittest.TestCase): - @classmethod - def setUpClass(cls): - """Initialize test data and custom op""" - # Load custom op - cls.custom_op = tf.load_op_library('../kernels/fused_embedding_padding.so') - - # Base test data - np.random.seed(140) - cls.input0 = np.random.randint(0, 100, size=(2 * 3, 10), dtype=np.int64) - cls.input1 = np.random.rand(2 * 2, 10).astype(np.float) - cls.input2 = cls.input1.shape - cls.input3 = np.array([-1, 20]).astype(np.int32) - # Create tf session - cls.sess = tf.compat.v1.Session() - - @classmethod - def tearDownClass(cls): - cls.sess.close() - - def test_kp_fused_embedding_padding_fast(self): - # execute custom op - _, custom_out = self.custom_op.kp_fused_embedding_padding_fast( - input0=self.input0.shape, - input1=self.input1, - input2=self.input2[0], - input3=self.input3, - ) - - # tf native implementation - tf_out = self._fused_embedding_padding_fast_reference_impl( - tf.constant(self.input0.shape, dtype=tf.int64), - tf.constant(self.input1, dtype=tf.float32), - tf.constant(self.input2[0], dtype=tf.int32), - tf.constant(self.input3, dtype=tf.int32), - ) - - custom_out_val = self.sess.run([custom_out]) - tf_out_val = self.sess.run([tf_out]) - - # Numerical comparison - np.testing.assert_array_equal( - custom_out_val, - tf_out_val, - err_msg="result mismatch" - ) - - def test_kp_fused_embedding_padding(self): - # execute custom op - _, custom_out = self.custom_op.kp_fused_embedding_padding( - input0=self.input0.shape, - input1=self.input1, - input2=self.input2[0], - input3=self.input3, - ) - - # tf native implementation - tf_out = self._fused_embedding_padding_reference_impl( - tf.constant(self.input0.shape, dtype=tf.int64), - tf.constant(self.input1, dtype=tf.float32), - tf.constant(self.input2[0], dtype=tf.int32), - tf.constant(self.input3, dtype=tf.int32), - ) - - custom_out_val = self.sess.run([custom_out]) - tf_out_val = self.sess.run([tf_out]) - - # Numerical comparison - np.testing.assert_array_equal( - custom_out_val, - tf_out_val, - err_msg="result mismatch" - ) - - def _fused_embedding_padding_fast_reference_impl(self, input0, input1, input2, input3): - cast = tf.cast(input0, tf.int32) - begin = tf.constant([0], dtype=tf.int32) - end = tf.constant([1], dtype=tf.int32) - strides = tf.constant([1], dtype=tf.int32) - hash_rows = 
tf.strided_slice(cast, begin=begin, end=end, strides=strides, shrink_axis_mask=1) - sub_out = hash_rows - input2 - const = tf.constant(10, dtype=tf.int32) - pack = tf.stack([sub_out, const], axis=0) - fill = tf.fill(pack, tf.constant(0, dtype=tf.float32)) - concat = tf.concat([input1, fill], 0) - reshape = tf.reshape(concat, input3) - shape_tensor = tf.shape(reshape) - output = tf.strided_slice(shape_tensor, begin=begin, end=end, strides=strides, shrink_axis_mask=1) - return output - - def _fused_embedding_padding_reference_impl(self, input0, input1, input2, input3): - cast = tf.cast(input0, tf.int32) - begin = tf.constant([0], dtype=tf.int32) - end = tf.constant([1], dtype=tf.int32) - strides = tf.constant([1], dtype=tf.int32) - hash_rows = tf.strided_slice(cast, begin=begin, end=end, strides=strides, shrink_axis_mask=1) - sub_out = hash_rows - input2 - const = tf.constant(10, dtype=tf.int32) - pack = tf.stack([sub_out, const], axis=0) - fill = tf.fill(pack, tf.constant(0, dtype=tf.float32)) - concat = tf.concat([input1, fill], 0) - output = tf.reshape(concat, input3) - return output - -if __name__ == "__main__": - tf.compat.v1.disable_eager_execution() - unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_select.py b/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_select.py deleted file mode 100644 index 981c3e3b..00000000 --- a/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_select.py +++ /dev/null @@ -1,93 +0,0 @@ -import tensorflow as tf -import numpy as np -import unittest - -class TestFusedGather(unittest.TestCase): - @classmethod - def setUpClass(cls): - """Initialize test data and custom op""" - # Load custom op - cls.custom_op = tf.load_op_library('kpfusedgather.so') - - # Base test data - cls.base_data = np.linspace(0, 11, num=240, endpoint=False, dtype=np.float32).reshape(20, 12) - cls.base_slice_input = np.array([[0, 0], [0, 1], [1, 2]], dtype=np.int64) - cls.base_begin = [0, 1] - cls.base_end = [0, 2] - cls.base_strides = [1, 1] - # Create tf session - cls.sess = tf.compat.v1.Session() - - @classmethod - def tearDownClass(cls): - cls.sess.close() - - def test_custom(self): - # execute custom op - custom_out1, custom_out2, custom_out3= self.custom_op.KPFusedGather( - data=self.base_data, - slice_input=self.base_slice_input, - begin=self.base_begin, - ) - - # tf native implementation - tf_out1, tf_out2, tf_out3 = self._tf_reference_impl( - self.base_data, - self.base_slice_input, - self.base_begin, - ) - - custom_out_val1, custom_out_val2, custom_out_val3 = self.sess.run([custom_out1, custom_out2, custom_out3]) - tf_out_val1, tf_out_val2, tf_out_val3 = self.sess.run([tf_out1, tf_out2, tf_out3]) - - print("custom_out_val1: ", custom_out_val1) - print("custom_out_val2: ", custom_out_val2) - print("custom_out_val3: ", custom_out_val3) - print("tf_out_val1: ", tf_out_val1) - print("tf_out_val2: ", tf_out_val2) - print("tf_out_val3: ", tf_out_val3) - - np.testing.assert_array_equal( - custom_out_val1, - tf_out_val1, - err_msg="Segment count mismatch" - ) - - np.testing.assert_array_equal( - custom_out_val2, - tf_out_val2, - err_msg="Segment count mismatch" - ) - - np.testing.assert_allclose( - custom_out_val3, - tf_out_val3, - rtol=1e-6, - err_msg="Output values mismatch" - ) - - def _tf_reference_impl(self, data, slice_input, begin): - slice_out = tf.strided_slice( - slice_input, - begin = begin, - end = [tf.shape(slice_input)[0], begin[1] + 2], - 
strides = [1, 1], - begin_mask = 1, - end_mask = 1, - shrink_axis_mask = 2 - ) - - slice_out, slice_out_indices = tf.unique(slice_out) - output_shape = tf.shape(slice_out) - slice_out = tf.reshape(slice_out, [-1]) - slice_out, _ = tf.unique(slice_out) - - gather1_result = tf.gather(data, slice_out) - gather1_result = tf.reshape(gather1_result, [-1, 12]) - - gather2_result = tf.gather(gather1_result, slice_out) - return output_shape, slice_out_indices, gather2_result - -if __name__ == "__main__": - tf.compat.v1.disable_eager_execution() - unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_dynamic_stitch_test.py b/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_dynamic_stitch_test.py deleted file mode 100644 index a85f9ae7..00000000 --- a/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_dynamic_stitch_test.py +++ /dev/null @@ -1,96 +0,0 @@ -import os -os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '0' -import tensorflow as tf -import numpy as np -import unittest - -class TestSparseSegmentMeanSlice(unittest.TestCase): - @classmethod - def setUpClass(cls): - """Initialize test data and custom op""" - # Load custom op - cls.custom_op = tf.load_op_library('kp_fused_sparse_dynamic_stitch.so') - - cls.variables = [] - max_val = float('inf') - for i in range(12): - N_i = np.random.randint(1000000, 44739244) - max_val = min(N_i, max_val) - var = tf.Variable( - tf.random.normal([N_i, 10], dtype=tf.float32), # shape: (N_i, 10) - name=f"embedding_table_{i}" - ) - cls.variables.append(var) - print(f"Created variable {i}: shape={var.shape}") - - x_np = np.random.randint(0, 12*max_val, size=(10000, 12)) - cls.x = tf.constant(x_np, dtype=tf.int64) - - # Create tf session - cls.sess = tf.compat.v1.Session() - cls.sess.run(tf.compat.v1.global_variables_initializer()) - - @classmethod - def tearDownClass(cls): - cls.sess.close() - - def test_base(self): - x_first = self.sess.run(self.x) - var_first = self.sess.run(self.variables[0]) - - x_second = self.sess.run(self.x) - var_second = self.sess.run(self.variables[0]) - np.testing.assert_allclose( - x_first, - x_second, - rtol=1e-6, - err_msg="Input values mismatch" - ) - - np.testing.assert_allclose( - var_first, - var_second, - rtol=1e-6, - err_msg="Input values mismatch" - ) - - # execute custom op - custom_out = self.custom_op.kp_fused_sparse_dynamic_stitch(self.x, self.variables) - - # tf native implementation - tf_out = self._tf_reference_impl(self.x, self.variables) - - custom_out_val = self.sess.run([custom_out]) - tf_out_val = self.sess.run([tf_out]) - print("custom_shape: ") - print(custom_out_val[0].shape) - print("tf_out shape: ") - print(tf_out_val[0].shape) - # Numerical comparison - np.testing.assert_allclose( - custom_out_val[0], - tf_out_val[0], - rtol=1e-6, - err_msg="Output values mismatch" - ) - - def _tf_reference_impl(self, x, variables): - x_1 = tf.reshape(x, shape=[-1]) - group_ids = tf.math.floormod(x_1, 12) - group_ids = tf.cast(group_ids, dtype=np.int32) - chunk_indices = tf.math.floordiv(x_1, 12) - - original_indices = tf.range(0,tf.size(x_1),1) - - a = tf.dynamic_partition(original_indices, group_ids, num_partitions=12) - b = tf.dynamic_partition(chunk_indices, group_ids, num_partitions=12) - - c = [tf.gather(variables[i], b[i]) for i in range(12)] - - d = tf.dynamic_stitch(a, c) - - return d - -if __name__ == "__main__": - tf.compat.v1.disable_eager_execution() - unittest.main(argv=[''], 
verbosity=1) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_reshape_test.py b/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_reshape_test.py deleted file mode 100644 index e8922699..00000000 --- a/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_reshape_test.py +++ /dev/null @@ -1,100 +0,0 @@ -import tensorflow as tf -import numpy as np -import unittest - -class TestFusedSparseReshape(unittest.TestCase): - @classmethod - def setUpClass(cls): - """Initialize test data and custom op""" - # Load custom op - cls.custom_op = tf.load_op_library('kppattern6.so') - - # Base test data - cls.base_slice_input = np.array([[0, 0], [0, 1], [1, 2], [3, 4]], dtype=np.int64) - cls.base_begin = [0, 1] - cls.base_end = [0, 2] - cls.base_strides = [1, 1] - cls.base_newshape = [2, 4] - # Create tf session - cls.sess = tf.compat.v1.Session() - - @classmethod - def tearDownClass(cls): - cls.sess.close() - - def test_custom(self): - # execute custom op - custom_out1, custom_out2, = self.custom_op.KPFusedSparseReshape( - slice_input=self.base_slice_input, - begin=self.base_begin, - new_shape=self.base_newshape - ) - - # tf native implementation - tf_out1, tf_out2, tf_out3 = self._tf_reference_impl( - self.base_slice_input, - self.base_begin, - self.base_newshape - ) - - custom_out_val1, custom_out_val2 = self.sess.run([custom_out1, custom_out2]) - tf_out_val1, tf_out_val2, tf_out_val3 = self.sess.run([tf_out1, tf_out2, tf_out3]) - - print("custom_out_val1: ", custom_out_val1) - print("custom_out_val2: ", custom_out_val2) - print("tf_out_val1: ", tf_out_val1) - print("tf_out_val2: ", tf_out_val2) - - np.testing.assert_array_equal( - custom_out_val1, - tf_out_val1, - err_msg="Segment count mismatch" - ) - - np.testing.assert_array_equal( - custom_out_val2, - tf_out_val2, - err_msg="Segment count mismatch" - ) - - def _tf_reference_impl(self, slice_input, begin, new_shape): - slice67_out = tf.strided_slice( - slice_input, - begin=begin, - end=[0, 2], - strides=[1, 1], - begin_mask=1, - end_mask=1, - shrink_axis_mask=2 - ) - - slice67_out = tf.reshape(slice67_out, [-1, 1]) - shape_out = tf.shape(slice67_out) - slice57_out = tf.strided_slice( - shape_out, - begin=[0], - end=[1], - strides=[1], - shrink_axis_mask=1 - ) - - const2 = tf.constant(2) - input_shape = tf.stack([slice57_out, const2]) - input_shape = tf.cast(input_shape, tf.int64) - - range_out = tf.range(0, slice57_out, 1) - range_out = tf.reshape(range_out, [-1, 1]) - range_out_64 = tf.cast(range_out, dtype=tf.int64) - concat_out = tf.concat([range_out_64, slice67_out], axis=-1) - - sparse_tensor = tf.SparseTensor( - indices=concat_out, - values=[1,2,3,4], - dense_shape=input_shape - ) - sparse_tensor_out = tf.sparse.reshape(sparse_tensor, new_shape) - return sparse_tensor_out.indices, sparse_tensor_out.dense_shape, concat_out - -if __name__ == "__main__": - tf.compat.v1.disable_eager_execution() - unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_segment_reduce_test.py b/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_segment_reduce_test.py deleted file mode 100644 index dcca2f81..00000000 --- a/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_segment_reduce_test.py +++ /dev/null @@ -1,132 +0,0 @@ -import tensorflow as tf -import numpy as np -import unittest - -class 
TestSparseSegmentMeanSlice(unittest.TestCase): - @classmethod - def setUpClass(cls): - """Initialize test data and custom op""" - # Load custom op - cls.custom_op = tf.load_op_library('kp_fused_sparse_segment_reduce.so') - - # Base test data - cls.base_data = np.array([[1.0, 2.0, 3.0], [3.0, 4.0,5.0], [5.0, 6.0, 7.0], [5.0, 6.0, 7.0]], dtype=np.float32) # shape {4, 3} - cls.base_indices = np.array([0, 1, 2], dtype=np.int64) # shape {3} - cls.base_slice_input = np.array([[0, 0], [0, 2], [1, 2]], dtype=np.int64) # shape {3, 2} - cls.base_begin = [0, 1] - cls.base_end = [0, 2] - cls.base_strides = [1, 2] - # Create tf session - cls.sess = tf.compat.v1.Session() - - @classmethod - def tearDownClass(cls): - cls.sess.close() - - def test_mean(self): - # execute custom op - custom_out, custom_slice_out = self.custom_op.kp_fused_sparse_segment_reduce( - data=self.base_data, - indices=self.base_indices, - slice_input=self.base_slice_input, - begin=self.base_begin, - end = self.base_end, - strides = self.base_strides - ) - - # tf native implementation - tf_out, tf_slice_out = self._tf_reference_impl( - self.base_data, - self.base_indices, - self.base_slice_input, - self.base_begin, - self.base_end, - self.base_strides, - True - ) - - custom_out_val, custom_slice_out_val = self.sess.run([custom_out, custom_slice_out]) - tf_out_val, tf_slice_out_val = self.sess.run([tf_out, tf_slice_out]) - - # Numerical comparison - np.testing.assert_allclose( - custom_out_val, - tf_out_val, - rtol=1e-6, - err_msg="Output values mismatch" - ) - np.testing.assert_array_equal( - custom_slice_out_val, - tf_slice_out_val, - err_msg="Segment count mismatch" - ) - - def test_sum(self): - custom_out, custom_slice_out = self.custom_op.kp_fused_sparse_segment_reduce( - data=self.base_data, - indices=self.base_indices, - slice_input=self.base_slice_input, - begin=self.base_begin, - end = self.base_end, - strides = self.base_strides, - combiner=0 - ) - - tf_out, tf_slice_out = self._tf_reference_impl( - self.base_data, - self.base_indices, - self.base_slice_input, - self.base_begin, - self.base_end, - self.base_strides, - False - ) - - custom_out_val, custom_slice_out_val = self.sess.run([custom_out, custom_slice_out]) - tf_out_val, tf_slice_out_val = self.sess.run([tf_out, tf_slice_out]) - - np.testing.assert_allclose( - custom_out_val, - tf_out_val, - rtol=1e-6, - err_msg="Output values mismatch" - ) - np.testing.assert_array_equal( - custom_slice_out_val, - tf_slice_out_val, - err_msg="Segment count mismatch" - ) - - def _tf_reference_impl(self, data, indices, slice_input, begin, end, strides, is_mean): - slice_out = tf.strided_slice( - slice_input, - begin= begin, - end= end, - strides= strides, - begin_mask=1, - end_mask=1, - shrink_axis_mask=2 - ) - - segment_ids = tf.cast(slice_out, dtype=tf.int32) - if is_mean: - output = tf.sparse.segment_mean( - data = data, - indices = indices, - segment_ids= segment_ids - ) - else: - output = tf.sparse.segment_sum( - data = data, - indices = indices, - segment_ids= segment_ids - ) - - output_shape = tf.shape(output) - slice_out = tf.strided_slice(output_shape, begin=[0], end=[1], strides=[1]) - - return output, slice_out - -if __name__ == "__main__": - tf.compat.v1.disable_eager_execution() - unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_gather_test.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_gather_test.py index 1c73adc1..70b6f539 100644 --- 
a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_gather_test.py +++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_gather_test.py @@ -73,7 +73,7 @@ class TestFusedGather(unittest.TestCase): ) slice_out, slice_out_indices = tf.unique(slice_out) - output_shape = slice_out + output_shape = tf.shape(slice_out) slice_out = tf.reshape(slice_out, [-1]) slice_out, _ = tf.unique(slice_out) @@ -85,4 +85,4 @@ class TestFusedGather(unittest.TestCase): if __name__ == "__main__": tf.compat.v1.disable_eager_execution() - unittest.main(argv=[''], verbosity=2) \ No newline at end of file + unittest.main(argv=[''], verbosity=2) diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_select.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_select.py new file mode 100644 index 00000000..d37128cb --- /dev/null +++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_select.py @@ -0,0 +1,113 @@ +import tensorflow as tf +import numpy as np +import unittest + +from tensorflow.python.ops import gen_embedding_fused_ops +from utils.utils import perf_run, generate_timeline, wrapper_sess + + +def ori_fused_embedding_sparse_select_graph(input_a, input_b, input_c): + a = tf.reshape(input_a, [-1, 1]) + b = tf.reshape(input_b, [-1, 1]) + c = tf.reshape(input_c, [-1, 1]) + + greater_a = tf.greater(a, 0) + shape_reshape_a1 = tf.shape(a) + shape_reshape_a2 = tf.shape(a) + fill_a1 = tf.fill(shape_reshape_a1, tf.constant(1, dtype=tf.float32)) + realdiv = tf.realdiv(fill_a1, tf.constant(1, dtype=tf.float32)) + output_x = tf.fill(shape_reshape_a2, tf.constant(0, dtype=tf.float32)) + cast_a = tf.cast(greater_a, tf.float32) + shape_a = tf.shape(cast_a) + fill_a = tf.fill(shape_a, tf.constant(1, dtype=tf.float32)) + equal_4563 = tf.equal(b, 4563) + equal_10831 = tf.equal(b, 10831) + equal_3 = tf.equal(c, 3) + select_1 = tf.where(equal_4563, fill_a, cast_a) + select_2 = tf.where(equal_10831, fill_a, select_1) + output_y = tf.subtract(tf.constant(1, dtype=tf.float32), select_2) + mul = tf.multiply(tf.constant(1, dtype=tf.float32), select_2) + select_3 = tf.where(equal_3, realdiv, fill_a1) + output_z = tf.concat([mul, select_3], axis=-1) + return output_x, output_y, output_z + + +def opt_fused_embedding_sparse_select_graph(input_a, input_b, input_c): + output_x, output_y, output_z = gen_embedding_fused_ops.KPFusedSparseSelect( + input_a=input_a, input_b=input_b, input_c=input_c + ) + return output_x, output_y, output_z + + +class TestKPFusedSparseSelect(unittest.TestCase): + @classmethod + def setUpClass(cls): + """Initialize config""" + cls.config = tf.compat.v1.ConfigProto() + cls.config.intra_op_parallelism_threads = 16 + cls.config.inter_op_parallelism_threads = 16 + + cls.run_options = tf.compat.v1.RunOptions(trace_level=tf.compat.v1.RunOptions.FULL_TRACE) + cls.run_metadata_ori = tf.compat.v1.RunMetadata() + cls.run_metadata_opt = tf.compat.v1.RunMetadata() + + @classmethod + def tearDownClass(cls): + return + + def test_fused_embedding_sparse_select(self): + # Create Graph + with tf.Graph().as_default(): + input0 = tf.compat.v1.placeholder(tf.int32, shape=[None, None], name="input_a") + input1 = tf.compat.v1.placeholder(tf.int32, shape=[None, None], name="input_b") + input2 = tf.compat.v1.placeholder(tf.int32, shape=[None, None], name="input_c") + """Initialize test data""" + feed = { + input0: np.random.randint(0, 100, size=(100, 10)).astype(np.int32), + input1: np.random.randint(0, 100, size=(10, 
100)).astype(np.int32), + input2: np.random.randint(0, 100, size=(20, 50)).astype(np.int32), + } + with tf.name_scope("ori"): + out0_ori, out1_ori, out2_ori = ori_fused_embedding_sparse_select_graph(input0, input1, input2) + with tf.name_scope("opt"): + out0_opt, out1_opt, out2_opt = opt_fused_embedding_sparse_select_graph(input0, input1, input2) + + # Create tf session + with tf.compat.v1.Session(config=self.config) as sess: + # functest + out0_ori_val, out1_ori_val, out2_ori_val = sess.run([out0_ori, out1_ori, out2_ori], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_ori) + out0_opt_val, out1_opt_val, out2_opt_val = sess.run([out0_opt, out1_opt, out2_opt], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_opt) + + np.testing.assert_allclose( + out0_ori_val, + out0_opt_val, + rtol=1e-5, + err_msg="Output values mismatch" + ) + + np.testing.assert_allclose( + out1_ori_val, + out1_opt_val, + rtol=1e-5, + err_msg="Output values mismatch" + ) + + np.testing.assert_allclose( + out2_ori_val, + out2_opt_val, + rtol=1e-5, + err_msg="Output values mismatch" + ) + + generate_timeline(self.run_metadata_ori.step_stats, f"{self._testMethodName}_ori") + generate_timeline(self.run_metadata_opt.step_stats, f"{self._testMethodName}_opt") + + # perftest + perf_run(wrapper_sess(sess, [out0_ori, out1_ori, out2_ori], feed_dict=feed), + wrapper_sess(sess, [out0_opt, out1_opt, out2_opt], feed_dict=feed), + "KPFusedEmbeddingSparseSelect") + + +if __name__ == "__main__": + tf.compat.v1.disable_eager_execution() + unittest.main(argv=[''], verbosity=2) \ No newline at end of file -- Gitee From 6ea0ce42cbf7833f8ef7d850cbef44a9bb42ee61 Mon Sep 17 00:00:00 2001 From: rayshine <1324789704@qq.com> Date: Fri, 15 Aug 2025 09:49:57 +0800 Subject: [PATCH 5/9] =?UTF-8?q?=E4=BF=AE=E6=94=B9=20fused=20embedding?= =?UTF-8?q?=E7=AE=97=E5=AD=90=E7=9A=84=E6=97=B6=E5=BB=B6=E6=B5=8B=E8=AF=95?= =?UTF-8?q?=E6=96=B9=E5=BC=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../fused_embedding_action_id_gather_test.py | 51 ++- .../fused_embedding_gather_test.py | 219 ++++++++----- .../fused_embedding_padding_test.py | 107 +++++-- ...ed_embedding_sparse_dynamic_stitch_test.py | 196 ++++++++---- .../fused_embedding_sparse_reshape_test.py | 227 ++++++++----- ...ed_embedding_sparse_segment_reduce_test.py | 297 +++++++++++------- .../fused_embedding_sparse_select.py | 113 ------- .../fused_embedding_sparse_select_test.py | 73 +++-- .../embedding_fused_test/utils/utils.py | 30 +- 9 files changed, 820 insertions(+), 493 deletions(-) delete mode 100644 tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_select.py diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_action_id_gather_test.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_action_id_gather_test.py index d20628b0..1fa83bc9 100644 --- a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_action_id_gather_test.py +++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_action_id_gather_test.py @@ -3,7 +3,7 @@ import numpy as np import unittest from tensorflow.python.ops import gen_embedding_fused_ops -from utils.utils import perf_run, generate_timeline, wrapper_sess +from utils.utils import perf_run, generate_timeline, wrapper_sess, extract_op_dur, extract_op_total_time def ori_fused_embedding_action_id_gather_graph(input0, input1, input2, input3): @@ -33,7 +33,7 @@ class 
TestFusedEmbeddingActionIdGather(unittest.TestCase): """Initialize config""" cls.config = tf.compat.v1.ConfigProto() cls.config.intra_op_parallelism_threads = 16 - cls.config.inter_op_parallelism_threads = 16 + cls.config.inter_op_parallelism_threads = 1 cls.run_options = tf.compat.v1.RunOptions(trace_level=tf.compat.v1.RunOptions.FULL_TRACE) cls.run_metadata_ori = tf.compat.v1.RunMetadata() @@ -76,15 +76,48 @@ class TestFusedEmbeddingActionIdGather(unittest.TestCase): out_opt_val, err_msg="result mismatch" ) + + op_name = "KPFusedEmbeddingActionIdGather" + TF_origin = "----------TF_origin-----------" + print("-" * 60) + print("-" * 60) + print("new test") + # 多次生成 timeline 并统计平均值 + num_runs = 1000 + start_op = "ori/stack_1" + end_op = "ori/concat" + total_times_ori = 0 + total_times_opt = 0 + for i in range(num_runs): + sess.run( + [out_ori], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_ori + ) + sess.run( + [out_opt], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_opt + ) + filename_ori = f"{op_name}_ori" + filename_opt = f"{op_name}_opt" + generate_timeline(self.run_metadata_ori.step_stats, filename_ori) + generate_timeline(self.run_metadata_opt.step_stats, filename_opt) - generate_timeline(self.run_metadata_ori.step_stats, f"{self._testMethodName}_ori") - generate_timeline(self.run_metadata_opt.step_stats, f"{self._testMethodName}_opt") + total_times_ori += extract_op_total_time(filename_ori, start_op, end_op) + total_times_opt += extract_op_dur(filename_opt, op_name) - # perftest - perf_run(wrapper_sess(sess, [out_ori], feed_dict=feed), - wrapper_sess(sess, [out_opt], feed_dict=feed), - "KPFusedEmbeddingActionIdGather") - + avg_ori = total_times_ori / num_runs + avg_opt = total_times_opt / num_runs + speedup = (avg_ori - avg_opt) / avg_ori * 100 + + print(f"{TF_origin}: {avg_ori:.4f} us per run") + print(f"{op_name}: {avg_opt:.4f} us per run") + print(f"improve: {speedup:.2f}%") + + if __name__ == "__main__": tf.compat.v1.disable_eager_execution() diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_gather_test.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_gather_test.py index 70b6f539..f87ee1b5 100644 --- a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_gather_test.py +++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_gather_test.py @@ -3,85 +3,160 @@ import numpy as np import unittest from tensorflow.python.ops import gen_embedding_fused_ops +from utils.utils import perf_run, generate_timeline, wrapper_sess, extract_op_dur, extract_op_total_time + + +def ori_fused_embedding_gather_graph(data, slice_input, begin): + slice_out = tf.strided_slice( + slice_input, + begin = begin, + end = [tf.shape(slice_input)[0], begin[1] + 2], + strides = [1, 1], + begin_mask = 1, + end_mask = 1, + shrink_axis_mask = 2 + ) + + slice_out, slice_out_indices = tf.unique(slice_out) + output_shape = tf.shape(slice_out) + slice_out = tf.reshape(slice_out, [-1]) + slice_out, _ = tf.unique(slice_out) + + gather1_result = tf.gather(data, slice_out) + gather1_result = tf.reshape(gather1_result, [-1, 12]) + + gather2_result = tf.gather(gather1_result, slice_out) + return output_shape, slice_out_indices, gather2_result + + +def opt_fused_embedding_gather_graph(data, slice_input, begin): + custom_out1, custom_out2, custom_out3 = gen_embedding_fused_ops.KPFusedGather( + data=data, + slice_input=slice_input, + begin=begin + ) + return custom_out1, custom_out2, 
custom_out3 + class TestFusedGather(unittest.TestCase): @classmethod def setUpClass(cls): - """Initialize test data and custom op""" - # Load custom op - cls.custom_op = gen_embedding_fused_ops - - # Base test data - cls.base_data = np.linspace(0, 11, num=240, endpoint=False, dtype=np.float32).reshape(20, 12) - cls.base_slice_input = np.array([[0, 0], [0, 1], [1, 2]], dtype=np.int64) - cls.base_begin = [0, 1] - cls.base_end = [0, 2] - cls.base_strides = [1, 1] - # Create tf session - cls.sess = tf.compat.v1.Session() + """Initialize""" + cls.config = tf.compat.v1.ConfigProto() + cls.config.intra_op_parallelism_threads = 16 + cls.config.inter_op_parallelism_threads = 1 + + cls.run_options = tf.compat.v1.RunOptions(trace_level=tf.compat.v1.RunOptions.FULL_TRACE) + cls.run_metadata_ori = tf.compat.v1.RunMetadata() + cls.run_metadata_opt = tf.compat.v1.RunMetadata() @classmethod def tearDownClass(cls): - cls.sess.close() - - def test_custom(self): - # execute custom op - custom_out1, custom_out2, custom_out3= self.custom_op.KPFusedGather( - data=self.base_data, - slice_input=self.base_slice_input, - begin=self.base_begin, - ) - - # tf native implementation - tf_out1, tf_out2, tf_out3 = self._tf_reference_impl( - self.base_data, - self.base_slice_input, - self.base_begin, - ) - - custom_out_val1, custom_out_val2, custom_out_val3 = self.sess.run([custom_out1, custom_out2, custom_out3]) - tf_out_val1, tf_out_val2, tf_out_val3 = self.sess.run([tf_out1, tf_out2, tf_out3]) - - np.testing.assert_array_equal( - custom_out_val1, - tf_out_val1, - err_msg="Segment count mismatch" - ) - - np.testing.assert_array_equal( - custom_out_val2, - tf_out_val2, - err_msg="Segment count mismatch" - ) - - np.testing.assert_allclose( - custom_out_val3, - tf_out_val3, - rtol=1e-6, - err_msg="Output values mismatch" - ) - - def _tf_reference_impl(self, data, slice_input, begin): - slice_out = tf.strided_slice( - slice_input, - begin = begin, - end = [tf.shape(slice_input)[0], begin[1] + 2], - strides = [1, 1], - begin_mask = 1, - end_mask = 1, - shrink_axis_mask = 2 - ) - - slice_out, slice_out_indices = tf.unique(slice_out) - output_shape = tf.shape(slice_out) - slice_out = tf.reshape(slice_out, [-1]) - slice_out, _ = tf.unique(slice_out) - - gather1_result = tf.gather(data, slice_out) - gather1_result = tf.reshape(gather1_result, [-1, 12]) - - gather2_result = tf.gather(gather1_result, slice_out) - return output_shape, slice_out_indices, gather2_result + return + + def test_kp_embedding_gather(self): + with tf.Graph().as_default(): + data = tf.compat.v1.placeholder(tf.float32, shape=(20, 12), name="data") + slice_input = tf.compat.v1.placeholder(tf.int64, shape=(3, 2), name="slice_input") + begin = tf.compat.v1.placeholder(tf.int32, name="begin") + base_data = np.linspace(0, 11, num=240, endpoint=False, dtype=np.float32).reshape(20, 12) + base_slice_input = np.array([[0, 0], [0, 1], [1, 2]], dtype=np.int64) + base_begin = [0, 1] + feed = { + data: base_data, + slice_input: base_slice_input, + begin: base_begin + } + # original graph + with tf.name_scope("ori"): + out_ori1, out_ori2, out_ori3 = ori_fused_embedding_gather_graph( + data, + slice_input, + begin + ) + + # optimized graph + with tf.name_scope("opt"): + out_opt1, out_opt2, out_opt3 = opt_fused_embedding_gather_graph( + data=data, + slice_input=slice_input, + begin=begin + ) + + with tf.compat.v1.Session(config=self.config) as sess: + # run ori + out_ori_val1, out_ori_val2, out_ori_val3 = sess.run( + [out_ori1, out_ori2, out_ori3], + feed_dict=feed, + 
options=self.run_options, + run_metadata=self.run_metadata_ori + ) + # run opt + out_opt_val1, out_opt_val2, out_opt_val3 = sess.run( + [out_opt1, out_opt2, out_opt3], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_opt + ) + # 功能测试 + np.testing.assert_array_equal( + out_ori_val1, + out_opt_val1, + err_msg="Segment count mismatch" + ) + + np.testing.assert_array_equal( + out_ori_val2, + out_opt_val2, + err_msg="Segment count mismatch" + ) + np.testing.assert_allclose( + out_opt_val3, + out_ori_val3, + rtol=1e-6, + err_msg="Output values mismatch" + ) + + op_name = "KPFusedGather" + TF_origin = "--TF_origin--" + print("-" * 60) + print("-" * 60) + print("new test") + # 多次生成 timeline 并统计平均值 + num_runs = 1000 + start_op = "ori/strided_slice_1" + end_op = "ori/GatherV2_1" + total_times_ori = 0 + total_times_opt = 0 + for i in range(num_runs): + sess.run( + [out_ori1, out_ori2, out_ori3], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_ori + ) + sess.run( + [out_opt1, out_opt2], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_opt + ) + filename_ori = f"{op_name}_ori" + filename_opt = f"{op_name}_opt" + generate_timeline(self.run_metadata_ori.step_stats, filename_ori) + generate_timeline(self.run_metadata_opt.step_stats, filename_opt) + + total_times_ori += extract_op_total_time(filename_ori, start_op, end_op) + total_times_opt += extract_op_dur(filename_opt, op_name) + + avg_ori = total_times_ori / num_runs + avg_opt = total_times_opt / num_runs + speedup = (avg_ori - avg_opt) / avg_ori * 100 + + print(f"{TF_origin}: {avg_ori:.4f} us per run") + print(f"{op_name}: {avg_opt:.4f} us per run") + print(f"improve: {speedup:.2f}%") + if __name__ == "__main__": tf.compat.v1.disable_eager_execution() diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_padding_test.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_padding_test.py index b9950e51..ba6a87cd 100644 --- a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_padding_test.py +++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_padding_test.py @@ -3,15 +3,15 @@ import numpy as np import unittest from tensorflow.python.ops import gen_embedding_fused_ops -from utils.utils import perf_run, generate_timeline, wrapper_sess +from utils.utils import perf_run, generate_timeline, wrapper_sess, extract_op_dur, extract_op_total_time np.random.seed(140) def opt_fused_embedding_padding_fast_graph(input0, input1, input2, input3): - # execute custom op - _, custom_out = gen_embedding_fused_ops.kp_fused_embedding_padding_fast(input0, input1, input2, input3) - return custom_out + # execute custom op + _, custom_out = gen_embedding_fused_ops.kp_fused_embedding_padding_fast(input0, input1, input2, input3) + return custom_out def opt_fused_embedding_padding_graph(input0, input1, input2, input3): # execute custom op @@ -55,7 +55,7 @@ class TestFusedEmbeddingPadding(unittest.TestCase): """Initialize config""" cls.config = tf.compat.v1.ConfigProto() cls.config.intra_op_parallelism_threads = 16 - cls.config.inter_op_parallelism_threads = 16 + cls.config.inter_op_parallelism_threads = 1 cls.run_options = tf.compat.v1.RunOptions(trace_level=tf.compat.v1.RunOptions.FULL_TRACE) cls.run_metadata_ori = tf.compat.v1.RunMetadata() @@ -95,20 +95,47 @@ class TestFusedEmbeddingPadding(unittest.TestCase): opt_result, err_msg="result mismatch" ) - - from tensorflow.python.client import timeline - tl_ori = 
timeline.Timeline(self.run_metadata_ori.step_stats) - tl_opt = timeline.Timeline(self.run_metadata_opt.step_stats) - ctf_ori = tl_ori.generate_chrome_trace_format() - ctf_opt = tl_opt.generate_chrome_trace_format() - - with open("timeline_ori.json", "w") as f: - f.write(ctf_ori) - with open("timeline_opt.json", "w") as f: - f.write(ctf_opt) - # perftest - perf_run(wrapper_sess(sess, [out_ori], feed), wrapper_sess(sess, [out_opt], feed_dict=feed), "KPFusedEmbeddingPadding") + op_name = "KPFusedEmbeddingPadding" + TF_origin = "-------TF_origin-------" + print("-" * 60) + print("-" * 60) + print("new test") + # 多次生成 timeline 并统计平均值 + num_runs = 1000 + start_op = "ori/Cast" + end_op = "ori/Reshape" + total_times_ori = 0 + total_times_opt = 0 + for i in range(num_runs): + sess.run( + [out_ori], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_ori + ) + sess.run( + [out_opt], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_opt + ) + filename_ori = f"{op_name}_ori" + filename_opt = f"{op_name}_opt" + generate_timeline(self.run_metadata_ori.step_stats, filename_ori) + generate_timeline(self.run_metadata_opt.step_stats, filename_opt) + + total_times_ori += extract_op_total_time(filename_ori, start_op, end_op) + total_times_opt += extract_op_dur(filename_opt, op_name) + + avg_ori = total_times_ori / num_runs + avg_opt = total_times_opt / num_runs + speedup = (avg_ori - avg_opt) / avg_ori * 100 + + print(f"{TF_origin}: {avg_ori:.4f} us per run") + print(f"{op_name}: {avg_opt:.4f} us per run") + print(f"improve: {speedup:.2f}%") + def test_func_kp_fused_embedding_padding_fast(self): # Create Graph @@ -141,11 +168,45 @@ class TestFusedEmbeddingPadding(unittest.TestCase): err_msg="result mismatch" ) - generate_timeline(self.run_metadata_ori.step_stats, f"{self._testMethodName}_ori") - generate_timeline(self.run_metadata_opt.step_stats, f"{self._testMethodName}_opt") - - # perftest - perf_run(wrapper_sess(sess, [out_ori], feed), wrapper_sess(sess, [out_opt], feed_dict=feed), "KPFusedEmbeddingPaddingFast") + op_name = "KPFusedEmbeddingPaddingFast" + TF_origin = "---------TF_origin---------" + print("-" * 60) + print("-" * 60) + print("new test") + # 多次生成 timeline 并统计平均值 + num_runs = 1000 + start_op = "ori/Cast" + end_op = "ori/StridedSlice_1" + total_times_ori = 0 + total_times_opt = 0 + for i in range(num_runs): + sess.run( + [out_ori], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_ori + ) + sess.run( + [out_opt], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_opt + ) + filename_ori = f"{op_name}_ori" + filename_opt = f"{op_name}_opt" + generate_timeline(self.run_metadata_ori.step_stats, filename_ori) + generate_timeline(self.run_metadata_opt.step_stats, filename_opt) + + total_times_ori += extract_op_total_time(filename_ori, start_op, end_op) + total_times_opt += extract_op_dur(filename_opt, op_name) + + avg_ori = total_times_ori / num_runs + avg_opt = total_times_opt / num_runs + speedup = (avg_ori - avg_opt) / avg_ori * 100 + + print(f"{TF_origin}: {avg_ori:.4f} us per run") + print(f"{op_name}: {avg_opt:.4f} us per run") + print(f"improve: {speedup:.2f}%") if __name__ == "__main__": tf.compat.v1.disable_eager_execution() diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_dynamic_stitch_test.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_dynamic_stitch_test.py index 4de55241..5a81692c 100644 --- 
a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_dynamic_stitch_test.py +++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_dynamic_stitch_test.py @@ -4,28 +4,41 @@ import numpy as np import unittest from tensorflow.python.ops import gen_embedding_fused_ops +from utils.utils import perf_run, generate_timeline, wrapper_sess, extract_op_dur, extract_op_total_time -class TestSparseSegmentMeanSlice(unittest.TestCase): + +def ori_fused_sparse_dynamic_stitch_graph(x, emb_tables): + x_1 = tf.reshape(x, shape=[-1]) # 将输入 x 展平成一维向量 x_1 + group_ids = tf.math.floormod(x_1, 12) + group_ids = tf.cast(group_ids, dtype=np.int32) + chunk_indices = tf.math.floordiv(x_1, 12) + original_indices = tf.range(0, tf.size(x_1), 1) + a = tf.dynamic_partition(original_indices, group_ids, num_partitions=12) + b = tf.dynamic_partition(chunk_indices, group_ids, num_partitions=12) + c = [tf.gather(emb_tables[i], b[i]) for i in range(12)] + d = tf.dynamic_stitch(a, c) + return d + + +def opt_fused_sparse_dynamic_stitch_graph(x, emb_tables): + output = gen_embedding_fused_ops.KPFusedSparseDynamicStitch( + x = x, + variables = emb_tables + ) + return output + + +class TestSparseDynamicStitch(unittest.TestCase): @classmethod def setUpClass(cls): - """Initialize test data and custom op""" - # Load custom op - cls.custom_op = gen_embedding_fused_ops + """Initialize config""" + cls.config = tf.compat.v1.ConfigProto() + cls.config.intra_op_parallelism_threads = 16 + cls.config.inter_op_parallelism_threads = 1 - cls.variables = [] - max_val = float('inf') - for i in range(12): - N_i = np.random.randint(1000000, 44739244) - max_val = min(N_i, max_val) - var = tf.Variable( - tf.random.normal([N_i, 10], dtype=tf.float32), # shape: (N_i, 10) - name=f"embedding_table_{i}" - ) - cls.variables.append(var) - print(f"Created variable {i}: shape={var.shape}") - - x_np = np.random.randint(0, 12*max_val, size=(10000, 12)) - cls.x = tf.constant(x_np, dtype=tf.int64) + cls.run_options = tf.compat.v1.RunOptions(trace_level=tf.compat.v1.RunOptions.FULL_TRACE) + cls.run_metadata_ori = tf.compat.v1.RunMetadata() + cls.run_metadata_opt = tf.compat.v1.RunMetadata() # Create tf session cls.sess = tf.compat.v1.Session() @@ -36,11 +49,29 @@ class TestSparseSegmentMeanSlice(unittest.TestCase): cls.sess.close() def test_base(self): - x_first = self.sess.run(self.x) - var_first = self.sess.run(self.variables[0]) + variables = [] + max_val = float('inf') + for i in range(12): + N_i = np.random.randint(100000, 4473924) + max_val = min(N_i, max_val) + var = tf.Variable( + tf.random.normal([N_i, 10], dtype=tf.float32), # shape: (N_i, 10) + name=f"embedding_{i}" + ) + variables.append(var) + # print(f"Created variable {i}: shape={var.shape}") + + x_np = np.random.randint(0, 12*max_val, size=(10000, 12)) + x = tf.constant(x_np, dtype=tf.int64) + + self.sess.run(tf.compat.v1.variables_initializer(variables)) + + x_first = self.sess.run(x) + var_first = self.sess.run(variables[0]) - x_second = self.sess.run(self.x) - var_second = self.sess.run(self.variables[0]) + x_second = self.sess.run(x) + var_second = self.sess.run(variables[0]) + np.testing.assert_allclose( x_first, x_second, @@ -55,42 +86,93 @@ class TestSparseSegmentMeanSlice(unittest.TestCase): err_msg="Input values mismatch" ) - # execute custom op - custom_out = self.custom_op.KPFusedSparseDynamicStitch(x=self.x, variables=self.variables) - - # tf native implementation - tf_out = self._tf_reference_impl(x=self.x, variables=self.variables) - - 
custom_out_val = self.sess.run([custom_out]) - tf_out_val = self.sess.run([tf_out]) - print("custom_shape: ") - print(custom_out_val[0].shape) - print("tf_out shape: ") - print(tf_out_val[0].shape) - # Numerical comparison - np.testing.assert_allclose( - custom_out_val[0], - tf_out_val[0], - rtol=1e-6, - err_msg="Output values mismatch" - ) - - def _tf_reference_impl(self, x, variables): - x_1 = tf.reshape(x, shape=[-1]) - group_ids = tf.math.floormod(x_1, 12) - group_ids = tf.cast(group_ids, dtype=np.int32) - chunk_indices = tf.math.floordiv(x_1, 12) - - original_indices = tf.range(0,tf.size(x_1),1) - - a = tf.dynamic_partition(original_indices, group_ids, num_partitions=12) - b = tf.dynamic_partition(chunk_indices, group_ids, num_partitions=12) - - c = [tf.gather(variables[i], b[i]) for i in range(12)] - - d = tf.dynamic_stitch(a, c) + def test_kp_sparse_dynamic_stitch(self): + # Create Graph + with tf.Graph().as_default(): + num_tables = 12 + emb_dim = 10 + max_val = float('inf') + # 每张表的 placeholder,行数随机生成 + tables = [] + table_sizes = [] + for i in range(num_tables): + N_i = np.random.randint(1000000, 44739244) + table_sizes.append(N_i) + max_val = min(N_i, max_val) + table_ph = tf.compat.v1.placeholder( + tf.float32, shape=(N_i, emb_dim), name=f"embedding_table_{i}" + ) + tables.append(table_ph) + # 生成全局索引 placeholder + x_shape = (1000, num_tables) + input_x = tf.compat.v1.placeholder(tf.int64, shape=x_shape, name="input_x") + # 初始化 feed 数据 + feed = {} + rng = np.random.default_rng(12345) + # 为每张表生成随机 embedding 数据 + for i in range(num_tables): + feed[tables[i]] = rng.standard_normal(size=(table_sizes[i], emb_dim)).astype(np.float32) + # 生成索引数据(保持原逻辑:范围是 0 ~ num_tables * max_val - 1) + feed[input_x] = rng.integers( + low=0, high=num_tables * max_val, size=x_shape, dtype=np.int64 + ) + with tf.name_scope("ori"): + out_ori = ori_fused_sparse_dynamic_stitch_graph(input_x, tables) + with tf.name_scope("opt"): + out_opt = opt_fused_sparse_dynamic_stitch_graph(input_x, tables) + + # Create tf session + with tf.compat.v1.Session(config=self.config) as sess: + # functest + out_ori_val = sess.run([out_ori], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_ori) + out_opt_val = sess.run([out_opt], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_opt) + + np.testing.assert_array_equal( + out_ori_val, + out_opt_val, + err_msg="result mismatch" + ) + + op_name = "KPFusedSparseDynamicStitch" + TF_origin = "--------TF_origin---------" + print("-" * 60) + print("-" * 60) + print("new test") + # 多次生成 timeline 并统计平均值 + num_runs = 10 + start_op = "ori/Reshape" + end_op = "ori/DynamicStitch" + total_times_ori = 0 + total_times_opt = 0 + for i in range(num_runs): + sess.run( + [out_ori], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_ori + ) + sess.run( + [out_opt], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_opt + ) + filename_ori = f"{op_name}_ori" + filename_opt = f"{op_name}_opt" + generate_timeline(self.run_metadata_ori.step_stats, filename_ori) + generate_timeline(self.run_metadata_opt.step_stats, filename_opt) + + total_times_ori += extract_op_total_time(filename_ori, start_op, end_op) + total_times_opt += extract_op_dur(filename_opt, op_name) + + avg_ori = total_times_ori / num_runs + avg_opt = total_times_opt / num_runs + speedup = (avg_ori - avg_opt) / avg_ori * 100 + + print(f"{TF_origin}: {avg_ori:.4f} us per run") + print(f"{op_name}: {avg_opt:.4f} us per run") + print(f"improve: 
{speedup:.2f}%") - return d if __name__ == "__main__": tf.compat.v1.disable_eager_execution() diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_reshape_test.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_reshape_test.py index 37d27531..bb3d3727 100644 --- a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_reshape_test.py +++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_reshape_test.py @@ -2,65 +2,13 @@ import tensorflow as tf import numpy as np import unittest -from tensorflow.python.ops import gen_embedding_fused_ops - -class TestFusedSparseReshape(unittest.TestCase): - @classmethod - def setUpClass(cls): - """Initialize test data and custom op""" - # Load custom op - cls.custom_op = gen_embedding_fused_ops - - # Base test data - cls.base_slice_input = np.array([[0, 0], [0, 1], [1, 2], [3, 4]], dtype=np.int64) - cls.base_begin = [0, 1] - cls.base_end = [0, 2] - cls.base_strides = [1, 1] - cls.base_newshape = [2, 4] - # Create tf session - cls.sess = tf.compat.v1.Session() - - @classmethod - def tearDownClass(cls): - cls.sess.close() - - def test_custom(self): - # execute custom op - custom_out1, custom_out2, = self.custom_op.KPFusedSparseReshape( - slice_input=self.base_slice_input, - begin=self.base_begin, - new_shape=self.base_newshape - ) - - # tf native implementation - tf_out1, tf_out2, tf_out3 = self._tf_reference_impl( - self.base_slice_input, - self.base_begin, - self.base_newshape - ) - custom_out_val1, custom_out_val2 = self.sess.run([custom_out1, custom_out2]) - tf_out_val1, tf_out_val2, tf_out_val3 = self.sess.run([tf_out1, tf_out2, tf_out3]) - - print("custom_out_val1: ", custom_out_val1) - print("custom_out_val2: ", custom_out_val2) - print("tf_out_val1: ", tf_out_val1) - print("tf_out_val2: ", tf_out_val2) - - np.testing.assert_array_equal( - custom_out_val1, - tf_out_val1, - err_msg="Segment count mismatch" - ) +from tensorflow.python.ops import gen_embedding_fused_ops +from utils.utils import perf_run, generate_timeline, wrapper_sess, extract_op_dur, extract_op_total_time - np.testing.assert_array_equal( - custom_out_val2, - tf_out_val2, - err_msg="Segment count mismatch" - ) - def _tf_reference_impl(self, slice_input, begin, new_shape): - slice67_out = tf.strided_slice( +def ori_fused_embedding_sparse_reshape_graph(slice_input, begin, newshape): + slice67_out = tf.strided_slice( slice_input, begin=begin, end=[0, 2], @@ -70,32 +18,147 @@ class TestFusedSparseReshape(unittest.TestCase): shrink_axis_mask=2 ) - slice67_out = tf.reshape(slice67_out, [-1, 1]) - shape_out = tf.shape(slice67_out) - slice57_out = tf.strided_slice( - shape_out, - begin=[0], - end=[1], - strides=[1], - shrink_axis_mask=1 - ) - - const2 = tf.constant(2) - input_shape = tf.stack([slice57_out, const2]) - input_shape = tf.cast(input_shape, tf.int64) - - range_out = tf.range(0, slice57_out, 1) - range_out = tf.reshape(range_out, [-1, 1]) - range_out_64 = tf.cast(range_out, dtype=tf.int64) - concat_out = tf.concat([range_out_64, slice67_out], axis=-1) - - sparse_tensor = tf.SparseTensor( - indices=concat_out, - values=[1,2,3,4], - dense_shape=input_shape - ) - sparse_tensor_out = tf.sparse.reshape(sparse_tensor, new_shape) - return sparse_tensor_out.indices, sparse_tensor_out.dense_shape, concat_out + slice67_out = tf.reshape(slice67_out, [-1, 1]) + shape_out = tf.shape(slice67_out) + slice57_out = tf.strided_slice( + shape_out, + begin=[0], + end=[1], + strides=[1], + shrink_axis_mask=1 + ) + 
+ const2 = tf.constant(2) + input_shape = tf.stack([slice57_out, const2]) + input_shape = tf.cast(input_shape, tf.int64) + + range_out = tf.range(0, slice57_out, 1) + range_out = tf.reshape(range_out, [-1, 1]) + range_out_64 = tf.cast(range_out, dtype=tf.int64) + concat_out = tf.concat([range_out_64, slice67_out], axis=-1) + + sparse_tensor = tf.SparseTensor( + indices=concat_out, + values=[1,2,3,4], + dense_shape=input_shape + ) + sparse_tensor_out = tf.sparse.reshape(sparse_tensor, newshape) + return sparse_tensor_out.indices, sparse_tensor_out.dense_shape, concat_out + + +def opt_fused_sparse_reshape_graph(slice_input, begin, newshape): + custom_out1, custom_out2 = gen_embedding_fused_ops.KPFusedSparseReshape( + slice_input=slice_input, + begin=begin, + new_shape=newshape + ) + return custom_out1, custom_out2 + + +class TestFusedSparseReshape(unittest.TestCase): + @classmethod + def setUpClass(cls): + """Initialize""" + cls.config = tf.compat.v1.ConfigProto() + cls.config.intra_op_parallelism_threads = 16 + cls.config.inter_op_parallelism_threads = 1 + + cls.run_options = tf.compat.v1.RunOptions(trace_level=tf.compat.v1.RunOptions.FULL_TRACE) + cls.run_metadata_ori = tf.compat.v1.RunMetadata() + cls.run_metadata_opt = tf.compat.v1.RunMetadata() + + @classmethod + def tearDownClass(cls): + # cls.sess.close() + return + + def test_kp_sparse_reshape(self): + with tf.Graph().as_default(): + slice_input = tf.compat.v1.placeholder(tf.int64, shape=(4,2), name="slice_input") + begin = tf.compat.v1.placeholder(tf.int32, name="begin") + newshape = tf.compat.v1.placeholder(tf.int32, name="newshape") + base_slice_input = np.array([[0, 0], [0, 1], [1, 2], [3, 4]], dtype=np.int64) + base_begin = [0, 1] + base_end = [0, 2] + base_strides = [1, 1] + base_newshape = [2, 4] + feed = { + slice_input: base_slice_input, + begin: base_begin, + newshape: base_newshape + } + + with tf.name_scope("ori"): + out_ori1, out_ori2, out_ori3 = ori_fused_embedding_sparse_reshape_graph(slice_input, begin, newshape) + with tf.name_scope("opt"): + out_opt1, out_opt2 = opt_fused_sparse_reshape_graph(slice_input, begin, newshape) + + with tf.compat.v1.Session(config=self.config) as sess: + out_ori_val1, out_ori_val2, out_ori_val3 = sess.run( + [out_ori1, out_ori2, out_ori3], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_ori + ) + out_opt_val1, out_opt_val2 = sess.run( + [out_opt1,out_opt2], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_opt + ) + + # 功能测试 + np.testing.assert_array_equal( + out_opt_val1, + out_ori_val1, + err_msg="Segment count mismatch" + ) + np.testing.assert_array_equal( + out_opt_val2, + out_ori_val2, + err_msg="Segment count mismatch" + ) + + op_name = "KPFusedSparseReshape" + TF_origin = "-----TF_origin------" + print("-" * 60) + print("-" * 60) + print("new test") + # 多次生成 timeline 并统计平均值 + num_runs = 1000 + start_op = "ori/StridedSlice" + end_op = "ori/SparseReshape" + total_times_ori = 0 + total_times_opt = 0 + for i in range(num_runs): + sess.run( + [out_ori1, out_ori2, out_ori3], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_ori + ) + sess.run( + [out_opt1, out_opt2], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_opt + ) + filename_ori = f"{op_name}_ori" + filename_opt = f"{op_name}_opt" + generate_timeline(self.run_metadata_ori.step_stats, filename_ori) + generate_timeline(self.run_metadata_opt.step_stats, filename_opt) + + total_times_ori += 
extract_op_total_time(filename_ori, start_op, end_op) + total_times_opt += extract_op_dur(filename_opt, op_name) + + avg_ori = total_times_ori / num_runs + avg_opt = total_times_opt / num_runs + speedup = (avg_ori - avg_opt) / avg_ori * 100 + + print(f"{TF_origin}: {avg_ori:.4f} us per run") + print(f"{op_name}: {avg_opt:.4f} us per run") + print(f"improve: {speedup:.2f}%") + if __name__ == "__main__": tf.compat.v1.disable_eager_execution() diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_segment_reduce_test.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_segment_reduce_test.py index 69c7a114..c98760ae 100644 --- a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_segment_reduce_test.py +++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_segment_reduce_test.py @@ -3,131 +3,206 @@ import numpy as np import unittest from tensorflow.python.ops import gen_embedding_fused_ops +from utils.utils import perf_run, generate_timeline, wrapper_sess, extract_op_dur, extract_op_total_time + + +def ori_fused_embedding_sparse_segment_reduce_graph(data, indices, slice_input, begin, end, strides, is_mean): + slice_out = tf.strided_slice( + slice_input, + begin= begin, + end= end, + strides= strides, + begin_mask=1, + end_mask=1, + shrink_axis_mask=2 + ) + + segment_ids = tf.cast(slice_out, dtype=tf.int32) + if is_mean: + output = tf.sparse.segment_mean( + data = data, + indices = indices, + segment_ids= segment_ids + ) + else: + output = tf.sparse.segment_sum( + data = data, + indices = indices, + segment_ids= segment_ids + ) + + output_shape = tf.shape(output) + slice_out = tf.strided_slice(output_shape, begin=[0], end=[1], strides=[1]) + + return output, slice_out + + +def opt_fused_embedding_sparse_segment_reduce_graph(data, indices, slice_input, begin, end, strides, is_mean): + if is_mean: + custom_out, custom_slice_out = gen_embedding_fused_ops.KPFusedSparseSegmentReduce( + data=data, + indices=indices, + slice_input=slice_input, + begin=begin, + end = end, + strides = strides + ) + return custom_out, custom_slice_out + else: + custom_out, custom_slice_out = gen_embedding_fused_ops.KPFusedSparseSegmentReduce( + data=data, + indices=indices, + slice_input=slice_input, + begin=begin, + end = end, + strides = strides, + combiner = 0 + ) + return custom_out, custom_slice_out + class TestSparseSegmentMeanSlice(unittest.TestCase): @classmethod def setUpClass(cls): - """Initialize test data and custom op""" - # Load custom op - cls.custom_op = gen_embedding_fused_ops - - # Base test data - cls.base_data = np.array([[1.0, 2.0, 3.0], [3.0, 4.0,5.0], [5.0, 6.0, 7.0], [5.0, 6.0, 7.0]], dtype=np.float32) # shape {4, 3} - cls.base_indices = np.array([0, 1, 2], dtype=np.int64) # shape {3} - cls.base_slice_input = np.array([[0, 0], [0, 2], [1, 2]], dtype=np.int64) # shape {3, 2} - cls.base_begin = [0, 1] - cls.base_end = [0, 2] - cls.base_strides = [1, 2] - # Create tf session - cls.sess = tf.compat.v1.Session() + """Initialize""" + cls.config = tf.compat.v1.ConfigProto() + cls.config.intra_op_parallelism_threads = 16 + cls.config.inter_op_parallelism_threads = 1 + + cls.run_options = tf.compat.v1.RunOptions(trace_level=tf.compat.v1.RunOptions.FULL_TRACE) + cls.run_metadata_ori = tf.compat.v1.RunMetadata() + cls.run_metadata_opt = tf.compat.v1.RunMetadata() @classmethod def tearDownClass(cls): - cls.sess.close() + return def test_mean(self): - # execute custom op - custom_out, custom_slice_out = 
self.custom_op.KPFusedSparseSegmentReduce( - data=self.base_data, - indices=self.base_indices, - slice_input=self.base_slice_input, - begin=self.base_begin, - end = self.base_end, - strides = self.base_strides - ) + with tf.Graph().as_default(): + base_data = np.array([[1.0, 2.0, 3.0], [3.0, 4.0, 5.0], [5.0, 6.0, 7.0], [5.0, 6.0, 7.0]], dtype=np.float32) # shape {4, 3} + base_indices = np.array([0, 1, 2], dtype=np.int64) # shape {3} + base_slice_input = np.array([[0, 0], [0, 2], [1, 2]], dtype=np.int64) # shape {3, 2} + base_begin = [0, 1] + base_end = [0, 2] + base_strides = [1, 2] + data = tf.compat.v1.placeholder(tf.float32, shape=(4,3), name="data") + indices = tf.compat.v1.placeholder(tf.int32, name="indices") + slice_input = tf.compat.v1.placeholder(tf.int64, shape=(3,2), name="slice_input") + begin = tf.compat.v1.placeholder(tf.int32, name="begin") + end = tf.compat.v1.placeholder(tf.int32, name="end") + strides = tf.compat.v1.placeholder(tf.int32, name="strides") + feed = { + data: base_data, + indices: base_indices, + slice_input: base_slice_input, + begin: base_begin, + end: base_end, + strides: base_strides + } + with tf.name_scope("ori"): + out_ori1, out_ori2 = ori_fused_embedding_sparse_segment_reduce_graph(data,indices,slice_input,begin,end,strides,True) + with tf.name_scope("opt"): + out_opt1, out_opt2 = opt_fused_embedding_sparse_segment_reduce_graph(data,indices,slice_input,begin,end,strides,True) + + with tf.compat.v1.Session(config=self.config) as sess: + out_ori_val1, out_ori_val2 = sess.run([out_ori1, out_ori2], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_ori) + out_opt_val1, out_opt_val2 = sess.run([out_opt1, out_opt2], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_opt) - # tf native implementation - tf_out, tf_slice_out = self._tf_reference_impl( - self.base_data, - self.base_indices, - self.base_slice_input, - self.base_begin, - self.base_end, - self.base_strides, - True - ) + np.testing.assert_allclose( + out_opt_val1, + out_ori_val1, + rtol=1e-6, + err_msg="Output values mismatch" + ) + np.testing.assert_array_equal( + out_opt_val2, + out_ori_val2, + err_msg="Segment count mismatch" + ) + + TF_origin = "--------TF_origin---------" + op_name = "KPFusedSparseSegmentReduce" + print("-" * 60) + print("-" * 60) + print("new test") + # 多次生成 timeline 并统计平均值 + num_runs = 500 + start_op = "ori/StridedSlice" + end_op = "ori/StridedSlice_1" + total_times_ori = 0 + total_times_opt = 0 + for i in range(num_runs): + sess.run( + [out_ori1, out_ori2], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_ori + ) + sess.run( + [out_opt1, out_opt2], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_opt + ) + filename_ori = f"{op_name}_ori" + filename_opt = f"{op_name}_opt" + generate_timeline(self.run_metadata_ori.step_stats, filename_ori) + generate_timeline(self.run_metadata_opt.step_stats, filename_opt) + + total_times_ori += extract_op_total_time(filename_ori, start_op, end_op) + total_times_opt += extract_op_dur(filename_opt, op_name) + + avg_ori = total_times_ori / num_runs + avg_opt = total_times_opt / num_runs + speedup = (avg_ori - avg_opt) / avg_ori * 100 + + print(f"{TF_origin}: {avg_ori:.4f} us per run") + print(f"{op_name}: {avg_opt:.4f} us per run") + print(f"improve: {speedup:.2f}%") - custom_out_val, custom_slice_out_val = self.sess.run([custom_out, custom_slice_out]) - tf_out_val, tf_slice_out_val = self.sess.run([tf_out, tf_slice_out]) - - # Numerical 
comparison - np.testing.assert_allclose( - custom_out_val, - tf_out_val, - rtol=1e-6, - err_msg="Output values mismatch" - ) - np.testing.assert_array_equal( - custom_slice_out_val, - tf_slice_out_val, - err_msg="Segment count mismatch" - ) def test_sum(self): - custom_out, custom_slice_out = self.custom_op.KPFusedSparseSegmentReduce( - data=self.base_data, - indices=self.base_indices, - slice_input=self.base_slice_input, - begin=self.base_begin, - end = self.base_end, - strides = self.base_strides, - combiner=0 - ) - - tf_out, tf_slice_out = self._tf_reference_impl( - self.base_data, - self.base_indices, - self.base_slice_input, - self.base_begin, - self.base_end, - self.base_strides, - False - ) + with tf.Graph().as_default(): + base_data = np.array([[1.0, 2.0, 3.0], [3.0, 4.0,5.0], [5.0, 6.0, 7.0], [5.0, 6.0, 7.0]], dtype=np.float32) # shape {4, 3} + base_indices = np.array([0, 1, 2], dtype=np.int64) + base_slice_input = np.array([[0, 0], [0, 2], [1, 2]], dtype=np.int64) + base_begin = [0, 1] + base_end = [0, 2] + base_strides = [1, 2] + data = tf.compat.v1.placeholder(tf.float32, shape=(4,3), name="data") + indices = tf.compat.v1.placeholder(tf.int32, name="indices") + slice_input = tf.compat.v1.placeholder(tf.int64, shape=(3,2), name="slice_input") + begin = tf.compat.v1.placeholder(tf.int32, name="begin") + end = tf.compat.v1.placeholder(tf.int32, name="end") + strides = tf.compat.v1.placeholder(tf.int32, name="strides") + feed = { + data: base_data, + indices: base_indices, + slice_input: base_slice_input, + begin: base_begin, + end: base_end, + strides: base_strides + } + with tf.name_scope("ori"): + out_ori1, out_ori2 = ori_fused_embedding_sparse_segment_reduce_graph(data,indices,slice_input,begin,end,strides,False) + with tf.name_scope("opt"): + out_opt1, out_opt2 = opt_fused_embedding_sparse_segment_reduce_graph(data,indices,slice_input,begin,end,strides,False) + + with tf.compat.v1.Session(config=self.config) as sess: + out_ori_val1, out_ori_val2 = sess.run([out_ori1, out_ori2], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_ori) + out_opt_val1, out_opt_val2 = sess.run([out_opt1, out_opt2], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_opt) + np.testing.assert_allclose( + out_opt_val1, + out_ori_val1, + rtol=1e-6, + err_msg="Output values mismatch" + ) + np.testing.assert_array_equal( + out_opt_val2, + out_ori_val2, + err_msg="Segment count mismatch" + ) - custom_out_val, custom_slice_out_val = self.sess.run([custom_out, custom_slice_out]) - tf_out_val, tf_slice_out_val = self.sess.run([tf_out, tf_slice_out]) - - np.testing.assert_allclose( - custom_out_val, - tf_out_val, - rtol=1e-6, - err_msg="Output values mismatch" - ) - np.testing.assert_array_equal( - custom_slice_out_val, - tf_slice_out_val, - err_msg="Segment count mismatch" - ) - - def _tf_reference_impl(self, data, indices, slice_input, begin, end, strides, is_mean): - slice_out = tf.strided_slice( - slice_input, - begin= begin, - end= end, - strides= strides, - begin_mask=1, - end_mask=1, - shrink_axis_mask=2 - ) - - segment_ids = tf.cast(slice_out, dtype=tf.int32) - if is_mean: - output = tf.sparse.segment_mean( - data = data, - indices = indices, - segment_ids= segment_ids - ) - else: - output = tf.sparse.segment_sum( - data = data, - indices = indices, - segment_ids= segment_ids - ) - - output_shape = tf.shape(output) - slice_out = tf.strided_slice(output_shape, begin=[0], end=[1], strides=[1]) - - return output, slice_out if __name__ == "__main__": 
tf.compat.v1.disable_eager_execution() diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_select.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_select.py deleted file mode 100644 index d37128cb..00000000 --- a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_select.py +++ /dev/null @@ -1,113 +0,0 @@ -import tensorflow as tf -import numpy as np -import unittest - -from tensorflow.python.ops import gen_embedding_fused_ops -from utils.utils import perf_run, generate_timeline, wrapper_sess - - -def ori_fused_embedding_sparse_select_graph(input_a, input_b, input_c): - a = tf.reshape(input_a, [-1, 1]) - b = tf.reshape(input_b, [-1, 1]) - c = tf.reshape(input_c, [-1, 1]) - - greater_a = tf.greater(a, 0) - shape_reshape_a1 = tf.shape(a) - shape_reshape_a2 = tf.shape(a) - fill_a1 = tf.fill(shape_reshape_a1, tf.constant(1, dtype=tf.float32)) - realdiv = tf.realdiv(fill_a1, tf.constant(1, dtype=tf.float32)) - output_x = tf.fill(shape_reshape_a2, tf.constant(0, dtype=tf.float32)) - cast_a = tf.cast(greater_a, tf.float32) - shape_a = tf.shape(cast_a) - fill_a = tf.fill(shape_a, tf.constant(1, dtype=tf.float32)) - equal_4563 = tf.equal(b, 4563) - equal_10831 = tf.equal(b, 10831) - equal_3 = tf.equal(c, 3) - select_1 = tf.where(equal_4563, fill_a, cast_a) - select_2 = tf.where(equal_10831, fill_a, select_1) - output_y = tf.subtract(tf.constant(1, dtype=tf.float32), select_2) - mul = tf.multiply(tf.constant(1, dtype=tf.float32), select_2) - select_3 = tf.where(equal_3, realdiv, fill_a1) - output_z = tf.concat([mul, select_3], axis=-1) - return output_x, output_y, output_z - - -def opt_fused_embedding_sparse_select_graph(input_a, input_b, input_c): - output_x, output_y, output_z = gen_embedding_fused_ops.KPFusedSparseSelect( - input_a=input_a, input_b=input_b, input_c=input_c - ) - return output_x, output_y, output_z - - -class TestKPFusedSparseSelect(unittest.TestCase): - @classmethod - def setUpClass(cls): - """Initialize config""" - cls.config = tf.compat.v1.ConfigProto() - cls.config.intra_op_parallelism_threads = 16 - cls.config.inter_op_parallelism_threads = 16 - - cls.run_options = tf.compat.v1.RunOptions(trace_level=tf.compat.v1.RunOptions.FULL_TRACE) - cls.run_metadata_ori = tf.compat.v1.RunMetadata() - cls.run_metadata_opt = tf.compat.v1.RunMetadata() - - @classmethod - def tearDownClass(cls): - return - - def test_fused_embedding_sparse_select(self): - # Create Graph - with tf.Graph().as_default(): - input0 = tf.compat.v1.placeholder(tf.int32, shape=[None, None], name="input_a") - input1 = tf.compat.v1.placeholder(tf.int32, shape=[None, None], name="input_b") - input2 = tf.compat.v1.placeholder(tf.int32, shape=[None, None], name="input_c") - """Initialize test data""" - feed = { - input0: np.random.randint(0, 100, size=(100, 10)).astype(np.int32), - input1: np.random.randint(0, 100, size=(10, 100)).astype(np.int32), - input2: np.random.randint(0, 100, size=(20, 50)).astype(np.int32), - } - with tf.name_scope("ori"): - out0_ori, out1_ori, out2_ori = ori_fused_embedding_sparse_select_graph(input0, input1, input2) - with tf.name_scope("opt"): - out0_opt, out1_opt, out2_opt = opt_fused_embedding_sparse_select_graph(input0, input1, input2) - - # Create tf session - with tf.compat.v1.Session(config=self.config) as sess: - # functest - out0_ori_val, out1_ori_val, out2_ori_val = sess.run([out0_ori, out1_ori, out2_ori], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_ori) - out0_opt_val, 
out1_opt_val, out2_opt_val = sess.run([out0_opt, out1_opt, out2_opt], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_opt) - - np.testing.assert_allclose( - out0_ori_val, - out0_opt_val, - rtol=1e-5, - err_msg="Output values mismatch" - ) - - np.testing.assert_allclose( - out1_ori_val, - out1_opt_val, - rtol=1e-5, - err_msg="Output values mismatch" - ) - - np.testing.assert_allclose( - out2_ori_val, - out2_opt_val, - rtol=1e-5, - err_msg="Output values mismatch" - ) - - generate_timeline(self.run_metadata_ori.step_stats, f"{self._testMethodName}_ori") - generate_timeline(self.run_metadata_opt.step_stats, f"{self._testMethodName}_opt") - - # perftest - perf_run(wrapper_sess(sess, [out0_ori, out1_ori, out2_ori], feed_dict=feed), - wrapper_sess(sess, [out0_opt, out1_opt, out2_opt], feed_dict=feed), - "KPFusedEmbeddingSparseSelect") - - -if __name__ == "__main__": - tf.compat.v1.disable_eager_execution() - unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_select_test.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_select_test.py index d37128cb..45aa8ff7 100644 --- a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_select_test.py +++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_select_test.py @@ -3,7 +3,7 @@ import numpy as np import unittest from tensorflow.python.ops import gen_embedding_fused_ops -from utils.utils import perf_run, generate_timeline, wrapper_sess +from utils.utils import perf_run, generate_timeline, wrapper_sess, extract_op_dur, extract_op_total_time def ori_fused_embedding_sparse_select_graph(input_a, input_b, input_c): @@ -45,7 +45,7 @@ class TestKPFusedSparseSelect(unittest.TestCase): """Initialize config""" cls.config = tf.compat.v1.ConfigProto() cls.config.intra_op_parallelism_threads = 16 - cls.config.inter_op_parallelism_threads = 16 + cls.config.inter_op_parallelism_threads = 1 cls.run_options = tf.compat.v1.RunOptions(trace_level=tf.compat.v1.RunOptions.FULL_TRACE) cls.run_metadata_ori = tf.compat.v1.RunMetadata() @@ -68,46 +68,77 @@ class TestKPFusedSparseSelect(unittest.TestCase): input2: np.random.randint(0, 100, size=(20, 50)).astype(np.int32), } with tf.name_scope("ori"): - out0_ori, out1_ori, out2_ori = ori_fused_embedding_sparse_select_graph(input0, input1, input2) + out_ori1, out_ori2, out_ori3 = ori_fused_embedding_sparse_select_graph(input0, input1, input2) with tf.name_scope("opt"): - out0_opt, out1_opt, out2_opt = opt_fused_embedding_sparse_select_graph(input0, input1, input2) + out_opt1, out_opt2, out_opt3 = opt_fused_embedding_sparse_select_graph(input0, input1, input2) # Create tf session with tf.compat.v1.Session(config=self.config) as sess: # functest - out0_ori_val, out1_ori_val, out2_ori_val = sess.run([out0_ori, out1_ori, out2_ori], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_ori) - out0_opt_val, out1_opt_val, out2_opt_val = sess.run([out0_opt, out1_opt, out2_opt], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_opt) + out_ori_val1, out_ori_val2, out_ori_val3 = sess.run([out_ori1, out_ori2, out_ori3], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_ori) + out_opt_val1, out_opt_val2, out_opt_val3 = sess.run([out_opt1, out_opt2, out_opt3], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_opt) np.testing.assert_allclose( - out0_ori_val, - 
out0_opt_val, + out_ori_val1, + out_opt_val1, rtol=1e-5, err_msg="Output values mismatch" ) np.testing.assert_allclose( - out1_ori_val, - out1_opt_val, + out_ori_val2, + out_opt_val2, rtol=1e-5, err_msg="Output values mismatch" ) np.testing.assert_allclose( - out2_ori_val, - out2_opt_val, + out_ori_val3, + out_opt_val3, rtol=1e-5, err_msg="Output values mismatch" ) - generate_timeline(self.run_metadata_ori.step_stats, f"{self._testMethodName}_ori") - generate_timeline(self.run_metadata_opt.step_stats, f"{self._testMethodName}_opt") - - # perftest - perf_run(wrapper_sess(sess, [out0_ori, out1_ori, out2_ori], feed_dict=feed), - wrapper_sess(sess, [out0_opt, out1_opt, out2_opt], feed_dict=feed), - "KPFusedEmbeddingSparseSelect") - + op_name = "KPFusedSparseSelect" + TF_origin = "-----TF_origin-----" + print("-" * 60) + print("-" * 60) + print("new test") + # 多次生成 timeline 并统计平均值 + num_runs = 1000 + start_op = "ori/Reshape" + end_op = "ori/Sub" + total_times_ori = 0 + total_times_opt = 0 + for i in range(num_runs): + sess.run( + [out_ori1, out_ori2, out_ori3], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_ori + ) + sess.run( + [out_opt1, out_opt2, out_opt3], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_opt + ) + filename_ori = f"{op_name}_ori" + filename_opt = f"{op_name}_opt" + generate_timeline(self.run_metadata_ori.step_stats, filename_ori) + generate_timeline(self.run_metadata_opt.step_stats, filename_opt) + + total_times_ori += extract_op_total_time(filename_ori, start_op, end_op) + total_times_opt += extract_op_dur(filename_opt, op_name) + + avg_ori = total_times_ori / num_runs + avg_opt = total_times_opt / num_runs + speedup = (avg_ori - avg_opt) / avg_ori * 100 + + print(f"{TF_origin}: {avg_ori:.4f} us per run") + print(f"{op_name}: {avg_opt:.4f} us per run") + print(f"improve: {speedup:.2f}%") if __name__ == "__main__": tf.compat.v1.disable_eager_execution() - unittest.main(argv=[''], verbosity=2) \ No newline at end of file + unittest.main(argv=[''], verbosity=2) diff --git a/tensorflow/python/grappler/embedding_fused_test/utils/utils.py b/tensorflow/python/grappler/embedding_fused_test/utils/utils.py index 06f02d6b..cd42c5b4 100644 --- a/tensorflow/python/grappler/embedding_fused_test/utils/utils.py +++ b/tensorflow/python/grappler/embedding_fused_test/utils/utils.py @@ -1,10 +1,30 @@ import timeit +import json +import os from tensorflow.python.client import timeline -def perf_run(ori_func, opt_func, name, warmup=5, iters=50): - +def extract_op_dur(timeline_file, op_name): + """从 timeline JSON 文件中提取指定算子(fusedOp)的耗时(μs)""" + with open(f"timeline/{timeline_file}.json", "r") as f: + trace_events = json.load(f)["traceEvents"] # timeline.json的格式 + durations = [e["dur"] for e in trace_events if e.get("name") == op_name and "dur" in e] + return durations[0] + + +def extract_op_total_time(timeline_file, start_op, end_op): + """计算从 start_op 到 end_op 的总耗时(包含调度空隙)""" + with open(f"timeline/{timeline_file}.json", "r") as f: + trace_events = json.load(f)["traceEvents"] + start_event = next(e for e in trace_events if e.get("args", {}).get("name") == start_op) # 找到 timeline 里第一个 name 等于 start_op 的事件 + end_event = next(e for e in trace_events if e.get("args", {}).get("name") == end_op) # 找不到会报错 + start_time = start_event["ts"] + end_time = end_event["ts"] + end_event["dur"] # ts 是开始时间,dur是算子的持续时间 + return end_time - start_time + + +def perf_run(ori_func, opt_func, name, warmup=5, iters=5): print(f"\nWarmup ori: {warmup} iters") for 
_ in range(warmup): ori_func() @@ -12,7 +32,7 @@ def perf_run(ori_func, opt_func, name, warmup=5, iters=50): print(f"Running performance test: ori {iters} iters") total_time = timeit.timeit(ori_func, number=iters) ori_avg_time = total_time / iters * 1000 - print(f"{name}: {ori_avg_time:.2f} ms per run") + print(f"{name}: {ori_avg_time:.6f} ms per run") print(f"\nWarmup opt: {warmup} iters") for _ in range(warmup): @@ -21,7 +41,7 @@ def perf_run(ori_func, opt_func, name, warmup=5, iters=50): print(f"Running performance test: opt {iters} iters") total_time = timeit.timeit(opt_func, number=iters) opt_avg_time = total_time / iters * 1000 - print(f"{name}: {opt_avg_time:.2f} ms per run") + print(f"{name}: {opt_avg_time:.6f} ms per run") improvement = (ori_avg_time - opt_avg_time) / ori_avg_time * 100 print(f"improve: {improvement:.2f}%") @@ -36,4 +56,4 @@ def generate_timeline(step_stats, filename): def wrapper_sess(sess, fetches, feed_dict=None, options=None, run_metadata=None): - return lambda: sess.run(fetches, feed_dict=feed_dict, options=options, run_metadata=run_metadata) \ No newline at end of file + return lambda: sess.run(fetches, feed_dict=feed_dict, options=options, run_metadata=run_metadata) \ No newline at end of file -- Gitee From 7c3fb000408ff1cd803af0b0f8490bf8787a9584 Mon Sep 17 00:00:00 2001 From: rayshine <1324789704@qq.com> Date: Tue, 19 Aug 2025 12:34:34 +0800 Subject: [PATCH 6/9] =?UTF-8?q?=E5=88=9B=E5=BB=BAbenchmark=5Fop=E5=87=BD?= =?UTF-8?q?=E6=95=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../fused_embedding_action_id_gather_test.py | 57 +++----- .../fused_embedding_gather_test.py | 69 +++------ .../fused_embedding_padding_test.py | 114 +++++---------- ...ed_embedding_sparse_dynamic_stitch_test.py | 62 +++----- .../fused_embedding_sparse_reshape_test.py | 61 +++----- ...ed_embedding_sparse_segment_reduce_test.py | 132 +++++++++--------- .../fused_embedding_sparse_select_test.py | 58 +++----- .../embedding_fused_test/utils/utils.py | 58 ++++++++ 8 files changed, 247 insertions(+), 364 deletions(-) diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_action_id_gather_test.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_action_id_gather_test.py index 1fa83bc9..2bd76790 100644 --- a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_action_id_gather_test.py +++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_action_id_gather_test.py @@ -3,7 +3,9 @@ import numpy as np import unittest from tensorflow.python.ops import gen_embedding_fused_ops -from utils.utils import perf_run, generate_timeline, wrapper_sess, extract_op_dur, extract_op_total_time +from utils.utils import benchmark_op + +np.random.seed(140) def ori_fused_embedding_action_id_gather_graph(input0, input1, input2, input3): @@ -77,47 +79,20 @@ class TestFusedEmbeddingActionIdGather(unittest.TestCase): err_msg="result mismatch" ) - op_name = "KPFusedEmbeddingActionIdGather" - TF_origin = "----------TF_origin-----------" - print("-" * 60) - print("-" * 60) - print("new test") - # 多次生成 timeline 并统计平均值 - num_runs = 1000 - start_op = "ori/stack_1" - end_op = "ori/concat" - total_times_ori = 0 - total_times_opt = 0 - for i in range(num_runs): - sess.run( - [out_ori], - feed_dict=feed, - options=self.run_options, - run_metadata=self.run_metadata_ori - ) - sess.run( - [out_opt], - feed_dict=feed, - options=self.run_options, - run_metadata=self.run_metadata_opt - ) - filename_ori = 
f"{op_name}_ori" - filename_opt = f"{op_name}_opt" - generate_timeline(self.run_metadata_ori.step_stats, filename_ori) - generate_timeline(self.run_metadata_opt.step_stats, filename_opt) - - total_times_ori += extract_op_total_time(filename_ori, start_op, end_op) - total_times_opt += extract_op_dur(filename_opt, op_name) - - avg_ori = total_times_ori / num_runs - avg_opt = total_times_opt / num_runs - speedup = (avg_ori - avg_opt) / avg_ori * 100 - - print(f"{TF_origin}: {avg_ori:.4f} us per run") - print(f"{op_name}: {avg_opt:.4f} us per run") - print(f"improve: {speedup:.2f}%") + benchmark_op( + sess, + feed, + [out_ori], + [out_opt], + self.run_options, + self.run_metadata_ori, + self.run_metadata_opt, + op_name="KPFusedEmbeddingActionIdGather", + start_op="ori/stack_1", + end_op="ori/concat", + num_runs=1000 + ) - if __name__ == "__main__": tf.compat.v1.disable_eager_execution() diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_gather_test.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_gather_test.py index f87ee1b5..241a4a9a 100644 --- a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_gather_test.py +++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_gather_test.py @@ -3,18 +3,18 @@ import numpy as np import unittest from tensorflow.python.ops import gen_embedding_fused_ops -from utils.utils import perf_run, generate_timeline, wrapper_sess, extract_op_dur, extract_op_total_time +from utils.utils import benchmark_op def ori_fused_embedding_gather_graph(data, slice_input, begin): slice_out = tf.strided_slice( slice_input, - begin = begin, - end = [tf.shape(slice_input)[0], begin[1] + 2], - strides = [1, 1], - begin_mask = 1, - end_mask = 1, - shrink_axis_mask = 2 + begin=begin, + end=[tf.shape(slice_input)[0], begin[1] + 2], + strides=[1, 1], + begin_mask=1, + end_mask=1, + shrink_axis_mask=2 ) slice_out, slice_out_indices = tf.unique(slice_out) @@ -58,7 +58,7 @@ class TestFusedGather(unittest.TestCase): with tf.Graph().as_default(): data = tf.compat.v1.placeholder(tf.float32, shape=(20, 12), name="data") slice_input = tf.compat.v1.placeholder(tf.int64, shape=(3, 2), name="slice_input") - begin = tf.compat.v1.placeholder(tf.int32, name="begin") + begin = tf.compat.v1.placeholder(tf.int32, shape=(2,), name="begin") base_data = np.linspace(0, 11, num=240, endpoint=False, dtype=np.float32).reshape(20, 12) base_slice_input = np.array([[0, 0], [0, 1], [1, 2]], dtype=np.int64) base_begin = [0, 1] @@ -116,46 +116,21 @@ class TestFusedGather(unittest.TestCase): rtol=1e-6, err_msg="Output values mismatch" ) - - op_name = "KPFusedGather" - TF_origin = "--TF_origin--" - print("-" * 60) - print("-" * 60) - print("new test") - # 多次生成 timeline 并统计平均值 - num_runs = 1000 - start_op = "ori/strided_slice_1" - end_op = "ori/GatherV2_1" - total_times_ori = 0 - total_times_opt = 0 - for i in range(num_runs): - sess.run( - [out_ori1, out_ori2, out_ori3], - feed_dict=feed, - options=self.run_options, - run_metadata=self.run_metadata_ori - ) - sess.run( - [out_opt1, out_opt2], - feed_dict=feed, - options=self.run_options, - run_metadata=self.run_metadata_opt - ) - filename_ori = f"{op_name}_ori" - filename_opt = f"{op_name}_opt" - generate_timeline(self.run_metadata_ori.step_stats, filename_ori) - generate_timeline(self.run_metadata_opt.step_stats, filename_opt) - - total_times_ori += extract_op_total_time(filename_ori, start_op, end_op) - total_times_opt += extract_op_dur(filename_opt, op_name) - - avg_ori = total_times_ori / 
num_runs - avg_opt = total_times_opt / num_runs - speedup = (avg_ori - avg_opt) / avg_ori * 100 - print(f"{TF_origin}: {avg_ori:.4f} us per run") - print(f"{op_name}: {avg_opt:.4f} us per run") - print(f"improve: {speedup:.2f}%") + benchmark_op( + sess, + feed, + [out_ori1, out_ori2, out_ori3], + [out_opt1, out_opt2], + self.run_options, + self.run_metadata_ori, + self.run_metadata_opt, + op_name="KPFusedGather", + start_op="ori/strided_slice_1", + end_op="ori/GatherV2_1", + num_runs=1000, + tag="--TF_origin--" + ) if __name__ == "__main__": diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_padding_test.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_padding_test.py index ba6a87cd..61f00534 100644 --- a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_padding_test.py +++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_padding_test.py @@ -3,7 +3,7 @@ import numpy as np import unittest from tensorflow.python.ops import gen_embedding_fused_ops -from utils.utils import perf_run, generate_timeline, wrapper_sess, extract_op_dur, extract_op_total_time +from utils.utils import benchmark_op np.random.seed(140) @@ -12,12 +12,14 @@ def opt_fused_embedding_padding_fast_graph(input0, input1, input2, input3): # execute custom op _, custom_out = gen_embedding_fused_ops.kp_fused_embedding_padding_fast(input0, input1, input2, input3) return custom_out - + + def opt_fused_embedding_padding_graph(input0, input1, input2, input3): # execute custom op _, custom_out = gen_embedding_fused_ops.kp_fused_embedding_padding(input0, input1, input2, input3) return custom_out + def ori_fused_embedding_padding_fast_graph(input0, input1, input2, input3): cast = tf.cast(input0, tf.int32) begin = tf.constant([0], dtype=tf.int32) @@ -34,6 +36,7 @@ def ori_fused_embedding_padding_fast_graph(input0, input1, input2, input3): output = tf.strided_slice(shape_tensor, begin=begin, end=end, strides=strides, shrink_axis_mask=1) return output + def ori_fused_embedding_padding_graph(input0, input1, input2, input3): cast = tf.cast(input0, tf.int32) begin = tf.constant([0], dtype=tf.int32) @@ -96,45 +99,20 @@ class TestFusedEmbeddingPadding(unittest.TestCase): err_msg="result mismatch" ) - op_name = "KPFusedEmbeddingPadding" - TF_origin = "-------TF_origin-------" - print("-" * 60) - print("-" * 60) - print("new test") - # 多次生成 timeline 并统计平均值 - num_runs = 1000 - start_op = "ori/Cast" - end_op = "ori/Reshape" - total_times_ori = 0 - total_times_opt = 0 - for i in range(num_runs): - sess.run( - [out_ori], - feed_dict=feed, - options=self.run_options, - run_metadata=self.run_metadata_ori - ) - sess.run( - [out_opt], - feed_dict=feed, - options=self.run_options, - run_metadata=self.run_metadata_opt - ) - filename_ori = f"{op_name}_ori" - filename_opt = f"{op_name}_opt" - generate_timeline(self.run_metadata_ori.step_stats, filename_ori) - generate_timeline(self.run_metadata_opt.step_stats, filename_opt) - - total_times_ori += extract_op_total_time(filename_ori, start_op, end_op) - total_times_opt += extract_op_dur(filename_opt, op_name) - - avg_ori = total_times_ori / num_runs - avg_opt = total_times_opt / num_runs - speedup = (avg_ori - avg_opt) / avg_ori * 100 - - print(f"{TF_origin}: {avg_ori:.4f} us per run") - print(f"{op_name}: {avg_opt:.4f} us per run") - print(f"improve: {speedup:.2f}%") + benchmark_op( + sess, + feed, + [out_ori], + [out_opt], + self.run_options, + self.run_metadata_ori, + self.run_metadata_opt, + op_name="KPFusedEmbeddingPadding", + 
start_op="ori/Cast", + end_op="ori/Reshape", + num_runs=1000, + tag="-------TF_origin-------" + ) def test_func_kp_fused_embedding_padding_fast(self): @@ -167,46 +145,22 @@ class TestFusedEmbeddingPadding(unittest.TestCase): opt_result, err_msg="result mismatch" ) - - op_name = "KPFusedEmbeddingPaddingFast" - TF_origin = "---------TF_origin---------" - print("-" * 60) - print("-" * 60) - print("new test") - # 多次生成 timeline 并统计平均值 - num_runs = 1000 - start_op = "ori/Cast" - end_op = "ori/StridedSlice_1" - total_times_ori = 0 - total_times_opt = 0 - for i in range(num_runs): - sess.run( - [out_ori], - feed_dict=feed, - options=self.run_options, - run_metadata=self.run_metadata_ori - ) - sess.run( - [out_opt], - feed_dict=feed, - options=self.run_options, - run_metadata=self.run_metadata_opt - ) - filename_ori = f"{op_name}_ori" - filename_opt = f"{op_name}_opt" - generate_timeline(self.run_metadata_ori.step_stats, filename_ori) - generate_timeline(self.run_metadata_opt.step_stats, filename_opt) - - total_times_ori += extract_op_total_time(filename_ori, start_op, end_op) - total_times_opt += extract_op_dur(filename_opt, op_name) - avg_ori = total_times_ori / num_runs - avg_opt = total_times_opt / num_runs - speedup = (avg_ori - avg_opt) / avg_ori * 100 - - print(f"{TF_origin}: {avg_ori:.4f} us per run") - print(f"{op_name}: {avg_opt:.4f} us per run") - print(f"improve: {speedup:.2f}%") + benchmark_op( + sess, + feed, + [out_ori], + [out_opt], + self.run_options, + self.run_metadata_ori, + self.run_metadata_opt, + op_name="KPFusedEmbeddingPaddingFast", + start_op="ori/Cast", + end_op="ori/StridedSlice_1", + num_runs=1000, + tag="---------TF_origin---------" + ) + if __name__ == "__main__": tf.compat.v1.disable_eager_execution() diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_dynamic_stitch_test.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_dynamic_stitch_test.py index 5a81692c..17b0bb05 100644 --- a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_dynamic_stitch_test.py +++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_dynamic_stitch_test.py @@ -1,15 +1,16 @@ -import os import tensorflow as tf import numpy as np import unittest from tensorflow.python.ops import gen_embedding_fused_ops -from utils.utils import perf_run, generate_timeline, wrapper_sess, extract_op_dur, extract_op_total_time +from utils.utils import benchmark_op + +np.random.seed(140) def ori_fused_sparse_dynamic_stitch_graph(x, emb_tables): - x_1 = tf.reshape(x, shape=[-1]) # 将输入 x 展平成一维向量 x_1 - group_ids = tf.math.floormod(x_1, 12) + x_1 = tf.reshape(x, shape=[-1]) # 将输入 x 展平成一维向量 x_1 + group_ids = tf.math.floormod(x_1, 12) group_ids = tf.cast(group_ids, dtype=np.int32) chunk_indices = tf.math.floordiv(x_1, 12) original_indices = tf.range(0, tf.size(x_1), 1) @@ -132,46 +133,21 @@ class TestSparseDynamicStitch(unittest.TestCase): out_opt_val, err_msg="result mismatch" ) - - op_name = "KPFusedSparseDynamicStitch" - TF_origin = "--------TF_origin---------" - print("-" * 60) - print("-" * 60) - print("new test") - # 多次生成 timeline 并统计平均值 - num_runs = 10 - start_op = "ori/Reshape" - end_op = "ori/DynamicStitch" - total_times_ori = 0 - total_times_opt = 0 - for i in range(num_runs): - sess.run( - [out_ori], - feed_dict=feed, - options=self.run_options, - run_metadata=self.run_metadata_ori - ) - sess.run( - [out_opt], - feed_dict=feed, - options=self.run_options, - run_metadata=self.run_metadata_opt - ) - filename_ori = 
f"{op_name}_ori" - filename_opt = f"{op_name}_opt" - generate_timeline(self.run_metadata_ori.step_stats, filename_ori) - generate_timeline(self.run_metadata_opt.step_stats, filename_opt) - - total_times_ori += extract_op_total_time(filename_ori, start_op, end_op) - total_times_opt += extract_op_dur(filename_opt, op_name) - avg_ori = total_times_ori / num_runs - avg_opt = total_times_opt / num_runs - speedup = (avg_ori - avg_opt) / avg_ori * 100 - - print(f"{TF_origin}: {avg_ori:.4f} us per run") - print(f"{op_name}: {avg_opt:.4f} us per run") - print(f"improve: {speedup:.2f}%") + benchmark_op( + sess, + feed, + [out_ori], + [out_opt], + self.run_options, + self.run_metadata_ori, + self.run_metadata_opt, + op_name="KPFusedSparseDynamicStitch", + start_op="ori/Reshape", + end_op="ori/DynamicStitch", + num_runs=10, + tag="---------TF_origin---------" + ) if __name__ == "__main__": diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_reshape_test.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_reshape_test.py index bb3d3727..f8184f20 100644 --- a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_reshape_test.py +++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_reshape_test.py @@ -4,7 +4,7 @@ import unittest from tensorflow.python.ops import gen_embedding_fused_ops -from utils.utils import perf_run, generate_timeline, wrapper_sess, extract_op_dur, extract_op_total_time +from utils.utils import benchmark_op def ori_fused_embedding_sparse_reshape_graph(slice_input, begin, newshape): @@ -75,12 +75,10 @@ class TestFusedSparseReshape(unittest.TestCase): def test_kp_sparse_reshape(self): with tf.Graph().as_default(): slice_input = tf.compat.v1.placeholder(tf.int64, shape=(4,2), name="slice_input") - begin = tf.compat.v1.placeholder(tf.int32, name="begin") - newshape = tf.compat.v1.placeholder(tf.int32, name="newshape") + begin = tf.compat.v1.placeholder(tf.int32, shape=(2,), name="begin") + newshape = tf.compat.v1.placeholder(tf.int32, shape=(2,), name="newshape") base_slice_input = np.array([[0, 0], [0, 1], [1, 2], [3, 4]], dtype=np.int64) base_begin = [0, 1] - base_end = [0, 2] - base_strides = [1, 1] base_newshape = [2, 4] feed = { slice_input: base_slice_input, @@ -118,46 +116,21 @@ class TestFusedSparseReshape(unittest.TestCase): out_ori_val2, err_msg="Segment count mismatch" ) - - op_name = "KPFusedSparseReshape" - TF_origin = "-----TF_origin------" - print("-" * 60) - print("-" * 60) - print("new test") - # 多次生成 timeline 并统计平均值 - num_runs = 1000 - start_op = "ori/StridedSlice" - end_op = "ori/SparseReshape" - total_times_ori = 0 - total_times_opt = 0 - for i in range(num_runs): - sess.run( - [out_ori1, out_ori2, out_ori3], - feed_dict=feed, - options=self.run_options, - run_metadata=self.run_metadata_ori - ) - sess.run( - [out_opt1, out_opt2], - feed_dict=feed, - options=self.run_options, - run_metadata=self.run_metadata_opt - ) - filename_ori = f"{op_name}_ori" - filename_opt = f"{op_name}_opt" - generate_timeline(self.run_metadata_ori.step_stats, filename_ori) - generate_timeline(self.run_metadata_opt.step_stats, filename_opt) - - total_times_ori += extract_op_total_time(filename_ori, start_op, end_op) - total_times_opt += extract_op_dur(filename_opt, op_name) - - avg_ori = total_times_ori / num_runs - avg_opt = total_times_opt / num_runs - speedup = (avg_ori - avg_opt) / avg_ori * 100 - print(f"{TF_origin}: {avg_ori:.4f} us per run") - print(f"{op_name}: {avg_opt:.4f} us per run") - 
print(f"improve: {speedup:.2f}%") + benchmark_op( + sess, + feed, + [out_ori1, out_ori2, out_ori3], + [out_opt1, out_opt2], + self.run_options, + self.run_metadata_ori, + self.run_metadata_opt, + op_name="KPFusedSparseReshape", + start_op="ori/StridedSlice", + end_op="ori/SparseReshape", + num_runs=1000, + tag="------TF_origin-----" + ) if __name__ == "__main__": diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_segment_reduce_test.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_segment_reduce_test.py index c98760ae..0e492617 100644 --- a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_segment_reduce_test.py +++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_segment_reduce_test.py @@ -3,15 +3,15 @@ import numpy as np import unittest from tensorflow.python.ops import gen_embedding_fused_ops -from utils.utils import perf_run, generate_timeline, wrapper_sess, extract_op_dur, extract_op_total_time +from utils.utils import benchmark_op def ori_fused_embedding_sparse_segment_reduce_graph(data, indices, slice_input, begin, end, strides, is_mean): slice_out = tf.strided_slice( slice_input, - begin= begin, - end= end, - strides= strides, + begin=begin, + end=end, + strides=strides, begin_mask=1, end_mask=1, shrink_axis_mask=2 @@ -20,15 +20,15 @@ def ori_fused_embedding_sparse_segment_reduce_graph(data, indices, slice_input, segment_ids = tf.cast(slice_out, dtype=tf.int32) if is_mean: output = tf.sparse.segment_mean( - data = data, - indices = indices, - segment_ids= segment_ids + data=data, + indices=indices, + segment_ids=segment_ids ) else: output = tf.sparse.segment_sum( - data = data, - indices = indices, - segment_ids= segment_ids + data=data, + indices=indices, + segment_ids=segment_ids ) output_shape = tf.shape(output) @@ -44,10 +44,9 @@ def opt_fused_embedding_sparse_segment_reduce_graph(data, indices, slice_input, indices=indices, slice_input=slice_input, begin=begin, - end = end, - strides = strides + end=end, + strides=strides ) - return custom_out, custom_slice_out else: custom_out, custom_slice_out = gen_embedding_fused_ops.KPFusedSparseSegmentReduce( data=data, @@ -55,10 +54,10 @@ def opt_fused_embedding_sparse_segment_reduce_graph(data, indices, slice_input, slice_input=slice_input, begin=begin, end = end, - strides = strides, - combiner = 0 + strides=strides, + combiner=0 ) - return custom_out, custom_slice_out + return custom_out, custom_slice_out class TestSparseSegmentMeanSlice(unittest.TestCase): @@ -79,18 +78,20 @@ class TestSparseSegmentMeanSlice(unittest.TestCase): def test_mean(self): with tf.Graph().as_default(): + data = tf.compat.v1.placeholder(tf.float32, shape=(4,3), name="data") + indices = tf.compat.v1.placeholder(tf.int32, shape=(3,), name="indices") + slice_input = tf.compat.v1.placeholder(tf.int64, shape=(3,2), name="slice_input") + begin = tf.compat.v1.placeholder(tf.int32, shape=(2,), name="begin") + end = tf.compat.v1.placeholder(tf.int32, shape=(2,), name="end") + strides = tf.compat.v1.placeholder(tf.int32, shape=(2,), name="strides") + base_data = np.array([[1.0, 2.0, 3.0], [3.0, 4.0, 5.0], [5.0, 6.0, 7.0], [5.0, 6.0, 7.0]], dtype=np.float32) # shape {4, 3} base_indices = np.array([0, 1, 2], dtype=np.int64) # shape {3} base_slice_input = np.array([[0, 0], [0, 2], [1, 2]], dtype=np.int64) # shape {3, 2} base_begin = [0, 1] base_end = [0, 2] base_strides = [1, 2] - data = tf.compat.v1.placeholder(tf.float32, shape=(4,3), name="data") - indices = 
tf.compat.v1.placeholder(tf.int32, name="indices") - slice_input = tf.compat.v1.placeholder(tf.int64, shape=(3,2), name="slice_input") - begin = tf.compat.v1.placeholder(tf.int32, name="begin") - end = tf.compat.v1.placeholder(tf.int32, name="end") - strides = tf.compat.v1.placeholder(tf.int32, name="strides") + feed = { data: base_data, indices: base_indices, @@ -99,6 +100,7 @@ class TestSparseSegmentMeanSlice(unittest.TestCase): end: base_end, strides: base_strides } + with tf.name_scope("ori"): out_ori1, out_ori2 = ori_fused_embedding_sparse_segment_reduce_graph(data,indices,slice_input,begin,end,strides,True) with tf.name_scope("opt"): @@ -120,61 +122,38 @@ class TestSparseSegmentMeanSlice(unittest.TestCase): err_msg="Segment count mismatch" ) - TF_origin = "--------TF_origin---------" - op_name = "KPFusedSparseSegmentReduce" - print("-" * 60) - print("-" * 60) - print("new test") - # 多次生成 timeline 并统计平均值 - num_runs = 500 - start_op = "ori/StridedSlice" - end_op = "ori/StridedSlice_1" - total_times_ori = 0 - total_times_opt = 0 - for i in range(num_runs): - sess.run( - [out_ori1, out_ori2], - feed_dict=feed, - options=self.run_options, - run_metadata=self.run_metadata_ori - ) - sess.run( - [out_opt1, out_opt2], - feed_dict=feed, - options=self.run_options, - run_metadata=self.run_metadata_opt - ) - filename_ori = f"{op_name}_ori" - filename_opt = f"{op_name}_opt" - generate_timeline(self.run_metadata_ori.step_stats, filename_ori) - generate_timeline(self.run_metadata_opt.step_stats, filename_opt) - - total_times_ori += extract_op_total_time(filename_ori, start_op, end_op) - total_times_opt += extract_op_dur(filename_opt, op_name) - - avg_ori = total_times_ori / num_runs - avg_opt = total_times_opt / num_runs - speedup = (avg_ori - avg_opt) / avg_ori * 100 - - print(f"{TF_origin}: {avg_ori:.4f} us per run") - print(f"{op_name}: {avg_opt:.4f} us per run") - print(f"improve: {speedup:.2f}%") + benchmark_op( + sess, + feed, + [out_ori1, out_ori2], + [out_opt1, out_opt2], + self.run_options, + self.run_metadata_ori, + self.run_metadata_opt, + op_name="KPFusedSparseSegmentReduce", + start_op="ori/StridedSlice", + end_op="ori/StridedSlice_1", + num_runs=500, + tag="--------TF_origin---------" + ) def test_sum(self): with tf.Graph().as_default(): + data = tf.compat.v1.placeholder(tf.float32, shape=(4,3), name="data") + indices = tf.compat.v1.placeholder(tf.int32, shape=(3,), name="indices") + slice_input = tf.compat.v1.placeholder(tf.int64, shape=(3,2), name="slice_input") + begin = tf.compat.v1.placeholder(tf.int32, shape=(2,), name="begin") + end = tf.compat.v1.placeholder(tf.int32, shape=(2,), name="end") + strides = tf.compat.v1.placeholder(tf.int32, shape=(2,), name="strides") + base_data = np.array([[1.0, 2.0, 3.0], [3.0, 4.0,5.0], [5.0, 6.0, 7.0], [5.0, 6.0, 7.0]], dtype=np.float32) # shape {4, 3} base_indices = np.array([0, 1, 2], dtype=np.int64) base_slice_input = np.array([[0, 0], [0, 2], [1, 2]], dtype=np.int64) base_begin = [0, 1] base_end = [0, 2] base_strides = [1, 2] - data = tf.compat.v1.placeholder(tf.float32, shape=(4,3), name="data") - indices = tf.compat.v1.placeholder(tf.int32, name="indices") - slice_input = tf.compat.v1.placeholder(tf.int64, shape=(3,2), name="slice_input") - begin = tf.compat.v1.placeholder(tf.int32, name="begin") - end = tf.compat.v1.placeholder(tf.int32, name="end") - strides = tf.compat.v1.placeholder(tf.int32, name="strides") + feed = { data: base_data, indices: base_indices, @@ -184,9 +163,9 @@ class TestSparseSegmentMeanSlice(unittest.TestCase): 
strides: base_strides } with tf.name_scope("ori"): - out_ori1, out_ori2 = ori_fused_embedding_sparse_segment_reduce_graph(data,indices,slice_input,begin,end,strides,False) + out_ori1, out_ori2 = ori_fused_embedding_sparse_segment_reduce_graph(data, indices, slice_input, begin, end, strides, False) with tf.name_scope("opt"): - out_opt1, out_opt2 = opt_fused_embedding_sparse_segment_reduce_graph(data,indices,slice_input,begin,end,strides,False) + out_opt1, out_opt2 = opt_fused_embedding_sparse_segment_reduce_graph(data,indices, slice_input, begin, end, strides, False) with tf.compat.v1.Session(config=self.config) as sess: out_ori_val1, out_ori_val2 = sess.run([out_ori1, out_ori2], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_ori) @@ -202,6 +181,21 @@ class TestSparseSegmentMeanSlice(unittest.TestCase): out_ori_val2, err_msg="Segment count mismatch" ) + + benchmark_op( + sess, + feed, + [out_ori1, out_ori2], + [out_opt1, out_opt2], + self.run_options, + self.run_metadata_ori, + self.run_metadata_opt, + op_name="KPFusedSparseSegmentReduce", + start_op="ori/StridedSlice", + end_op="ori/StridedSlice_1", + num_runs=1000, + tag="---------TF_origin--------" + ) if __name__ == "__main__": diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_select_test.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_select_test.py index 45aa8ff7..de59d532 100644 --- a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_select_test.py +++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_select_test.py @@ -3,7 +3,9 @@ import numpy as np import unittest from tensorflow.python.ops import gen_embedding_fused_ops -from utils.utils import perf_run, generate_timeline, wrapper_sess, extract_op_dur, extract_op_total_time +from utils.utils import benchmark_op + +np.random.seed(140) def ori_fused_embedding_sparse_select_graph(input_a, input_b, input_c): @@ -98,46 +100,22 @@ class TestKPFusedSparseSelect(unittest.TestCase): rtol=1e-5, err_msg="Output values mismatch" ) - - op_name = "KPFusedSparseSelect" - TF_origin = "-----TF_origin-----" - print("-" * 60) - print("-" * 60) - print("new test") - # 多次生成 timeline 并统计平均值 - num_runs = 1000 - start_op = "ori/Reshape" - end_op = "ori/Sub" - total_times_ori = 0 - total_times_opt = 0 - for i in range(num_runs): - sess.run( - [out_ori1, out_ori2, out_ori3], - feed_dict=feed, - options=self.run_options, - run_metadata=self.run_metadata_ori - ) - sess.run( - [out_opt1, out_opt2, out_opt3], - feed_dict=feed, - options=self.run_options, - run_metadata=self.run_metadata_opt - ) - filename_ori = f"{op_name}_ori" - filename_opt = f"{op_name}_opt" - generate_timeline(self.run_metadata_ori.step_stats, filename_ori) - generate_timeline(self.run_metadata_opt.step_stats, filename_opt) - - total_times_ori += extract_op_total_time(filename_ori, start_op, end_op) - total_times_opt += extract_op_dur(filename_opt, op_name) - - avg_ori = total_times_ori / num_runs - avg_opt = total_times_opt / num_runs - speedup = (avg_ori - avg_opt) / avg_ori * 100 - print(f"{TF_origin}: {avg_ori:.4f} us per run") - print(f"{op_name}: {avg_opt:.4f} us per run") - print(f"improve: {speedup:.2f}%") + benchmark_op( + sess, + feed, + [out_ori1, out_ori2, out_ori3], + [out_opt1, out_opt2, out_opt3], + self.run_options, + self.run_metadata_ori, + self.run_metadata_opt, + op_name="KPFusedSparseSelect", + start_op="ori/Reshape", + end_op="ori/Sub", + num_runs=1000, + tag="-----TF_origin-----" + ) + 
 if __name__ == "__main__":
     tf.compat.v1.disable_eager_execution()
diff --git a/tensorflow/python/grappler/embedding_fused_test/utils/utils.py b/tensorflow/python/grappler/embedding_fused_test/utils/utils.py
index cd42c5b4..f982f5dd 100644
--- a/tensorflow/python/grappler/embedding_fused_test/utils/utils.py
+++ b/tensorflow/python/grappler/embedding_fused_test/utils/utils.py
@@ -24,6 +24,64 @@ def extract_op_total_time(timeline_file, start_op, end_op):
     return end_time - start_time
 
 
+def benchmark_op(
+    sess,
+    feed,
+    out_ori,
+    out_opt,
+    run_options,
+    run_metadata_ori,
+    run_metadata_opt,
+    op_name,
+    start_op,
+    end_op,
+    num_runs=500,
+    tag="--------TF_origin---------"
+):
+    print("-" * 60)
+    print("-" * 60)
+    print("new test")
+
+    total_times_ori = 0.0
+    total_times_opt = 0.0
+
+    for i in range(num_runs):
+        # Run the original (unfused) graph
+        sess.run(
+            out_ori,
+            feed_dict=feed,
+            options=run_options,
+            run_metadata=run_metadata_ori
+        )
+        # Run the optimized (fused) graph
+        sess.run(
+            out_opt,
+            feed_dict=feed,
+            options=run_options,
+            run_metadata=run_metadata_opt
+        )
+
+        # Generate timeline files for this run
+        filename_ori = f"{op_name}_ori"
+        filename_opt = f"{op_name}_opt"
+        generate_timeline(run_metadata_ori.step_stats, filename_ori)
+        generate_timeline(run_metadata_opt.step_stats, filename_opt)
+
+        # Accumulate per-run latencies
+        total_times_ori += extract_op_total_time(filename_ori, start_op, end_op)
+        total_times_opt += extract_op_dur(filename_opt, op_name)
+
+    # Compute averages and speedup
+    avg_ori = total_times_ori / num_runs
+    avg_opt = total_times_opt / num_runs
+    speedup = (avg_ori - avg_opt) / avg_ori * 100 if avg_ori > 0 else 0
+
+    # Print results
+    print(f"{tag}: {avg_ori:.4f} us per run")
+    print(f"{op_name}: {avg_opt:.4f} us per run")
+    print(f"improve: {speedup:.2f}%")
+
+
 def perf_run(ori_func, opt_func, name, warmup=5, iters=5):
     print(f"\nWarmup ori: {warmup} iters")
     for _ in range(warmup):
-- 
Gitee

From 461a4f6fe24736dbcd4bb1494ad9031d1d40882e Mon Sep 17 00:00:00 2001
From: rayshine <1324789704@qq.com>
Date: Wed, 20 Aug 2025 11:12:45 +0800
Subject: [PATCH 7/9] Reformat test scripts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../fused_embedding_action_id_gather_test.py  | 17 ++++--
 .../fused_embedding_gather_test.py            |  6 +-
 .../fused_embedding_padding_test.py           | 20 +++++--
 ...ed_embedding_sparse_dynamic_stitch_test.py | 16 +++--
 .../fused_embedding_sparse_reshape_test.py    |  7 ++-
 ...ed_embedding_sparse_segment_reduce_test.py | 58 +++++++++++++++----
 .../fused_embedding_sparse_select_test.py     | 18 +++++-
 7 files changed, 108 insertions(+), 34 deletions(-)

diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_action_id_gather_test.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_action_id_gather_test.py
index 2bd76790..1ae2c1f5 100644
--- a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_action_id_gather_test.py
+++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_action_id_gather_test.py
@@ -1,7 +1,9 @@
-import tensorflow as tf
-import numpy as np
+# Copyright 2025 The Huawei Technologies Co. Authors. All Rights Reserved.
import unittest +import numpy as np +import tensorflow as tf + from tensorflow.python.ops import gen_embedding_fused_ops from utils.utils import benchmark_op @@ -70,8 +72,12 @@ class TestFusedEmbeddingActionIdGather(unittest.TestCase): # Create tf session with tf.compat.v1.Session(config=self.config) as sess: # functest - out_ori_val = sess.run([out_ori], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_ori) - out_opt_val = sess.run([out_opt], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_opt) + out_ori_val = sess.run( + [out_ori], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_ori + ) + out_opt_val = sess.run( + [out_opt], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_opt + ) np.testing.assert_array_equal( out_ori_val, @@ -90,7 +96,8 @@ class TestFusedEmbeddingActionIdGather(unittest.TestCase): op_name="KPFusedEmbeddingActionIdGather", start_op="ori/stack_1", end_op="ori/concat", - num_runs=1000 + num_runs=10000, + tag="----------TF_origin-----------" ) diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_gather_test.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_gather_test.py index 241a4a9a..4e1755ef 100644 --- a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_gather_test.py +++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_gather_test.py @@ -1,6 +1,8 @@ +# Copyright 2025 The Huawei Technologies Co. Authors. All Rights Reserved. +import unittest + import tensorflow as tf import numpy as np -import unittest from tensorflow.python.ops import gen_embedding_fused_ops from utils.utils import benchmark_op @@ -128,7 +130,7 @@ class TestFusedGather(unittest.TestCase): op_name="KPFusedGather", start_op="ori/strided_slice_1", end_op="ori/GatherV2_1", - num_runs=1000, + num_runs=10000, tag="--TF_origin--" ) diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_padding_test.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_padding_test.py index 61f00534..0b943792 100644 --- a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_padding_test.py +++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_padding_test.py @@ -1,6 +1,8 @@ +# Copyright 2025 The Huawei Technologies Co. Authors. All Rights Reserved. 
+import unittest + import tensorflow as tf import numpy as np -import unittest from tensorflow.python.ops import gen_embedding_fused_ops from utils.utils import benchmark_op @@ -90,8 +92,12 @@ class TestFusedEmbeddingPadding(unittest.TestCase): # Create tf session with tf.compat.v1.Session(config=self.config) as sess: # functest - ori_result = sess.run([out_ori], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_ori) - opt_result = sess.run([out_opt], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_opt) + ori_result = sess.run( + [out_ori], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_ori + ) + opt_result = sess.run( + [out_opt], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_opt + ) np.testing.assert_array_equal( ori_result, @@ -137,8 +143,12 @@ class TestFusedEmbeddingPadding(unittest.TestCase): # Create tf session with tf.compat.v1.Session(config=self.config) as sess: # functest - ori_result = sess.run([out_ori], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_ori) - opt_result = sess.run([out_opt], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_opt) + ori_result = sess.run( + [out_ori], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_ori + ) + opt_result = sess.run( + [out_opt], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_opt + ) np.testing.assert_array_equal( ori_result, diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_dynamic_stitch_test.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_dynamic_stitch_test.py index 17b0bb05..ab471db8 100644 --- a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_dynamic_stitch_test.py +++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_dynamic_stitch_test.py @@ -1,6 +1,8 @@ +# Copyright 2025 The Huawei Technologies Co. Authors. All Rights Reserved. 
+import unittest + import tensorflow as tf import numpy as np -import unittest from tensorflow.python.ops import gen_embedding_fused_ops from utils.utils import benchmark_op @@ -125,8 +127,12 @@ class TestSparseDynamicStitch(unittest.TestCase): # Create tf session with tf.compat.v1.Session(config=self.config) as sess: # functest - out_ori_val = sess.run([out_ori], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_ori) - out_opt_val = sess.run([out_opt], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_opt) + out_ori_val = sess.run( + [out_ori], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_ori + ) + out_opt_val = sess.run( + [out_opt], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_opt + ) np.testing.assert_array_equal( out_ori_val, @@ -145,8 +151,8 @@ class TestSparseDynamicStitch(unittest.TestCase): op_name="KPFusedSparseDynamicStitch", start_op="ori/Reshape", end_op="ori/DynamicStitch", - num_runs=10, - tag="---------TF_origin---------" + num_runs=100, + tag="--------TF_origin---------" ) diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_reshape_test.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_reshape_test.py index f8184f20..f2ad66ff 100644 --- a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_reshape_test.py +++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_reshape_test.py @@ -1,7 +1,8 @@ -import tensorflow as tf -import numpy as np +# Copyright 2025 The Huawei Technologies Co. Authors. All Rights Reserved. import unittest +import tensorflow as tf +import numpy as np from tensorflow.python.ops import gen_embedding_fused_ops from utils.utils import benchmark_op @@ -128,7 +129,7 @@ class TestFusedSparseReshape(unittest.TestCase): op_name="KPFusedSparseReshape", start_op="ori/StridedSlice", end_op="ori/SparseReshape", - num_runs=1000, + num_runs=10000, tag="------TF_origin-----" ) diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_segment_reduce_test.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_segment_reduce_test.py index 0e492617..5536eb1c 100644 --- a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_segment_reduce_test.py +++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_segment_reduce_test.py @@ -1,6 +1,8 @@ +# Copyright 2025 The Huawei Technologies Co. Authors. All Rights Reserved. 
+import unittest + import tensorflow as tf import numpy as np -import unittest from tensorflow.python.ops import gen_embedding_fused_ops from utils.utils import benchmark_op @@ -85,7 +87,10 @@ class TestSparseSegmentMeanSlice(unittest.TestCase): end = tf.compat.v1.placeholder(tf.int32, shape=(2,), name="end") strides = tf.compat.v1.placeholder(tf.int32, shape=(2,), name="strides") - base_data = np.array([[1.0, 2.0, 3.0], [3.0, 4.0, 5.0], [5.0, 6.0, 7.0], [5.0, 6.0, 7.0]], dtype=np.float32) # shape {4, 3} + base_data = np.array( + [[1.0, 2.0, 3.0], [3.0, 4.0, 5.0], [5.0, 6.0, 7.0], [5.0, 6.0, 7.0]], + dtype=np.float32 + ) # shape {4, 3} base_indices = np.array([0, 1, 2], dtype=np.int64) # shape {3} base_slice_input = np.array([[0, 0], [0, 2], [1, 2]], dtype=np.int64) # shape {3, 2} base_begin = [0, 1] @@ -102,13 +107,27 @@ class TestSparseSegmentMeanSlice(unittest.TestCase): } with tf.name_scope("ori"): - out_ori1, out_ori2 = ori_fused_embedding_sparse_segment_reduce_graph(data,indices,slice_input,begin,end,strides,True) + out_ori1, out_ori2 = ori_fused_embedding_sparse_segment_reduce_graph( + data, indices, slice_input, begin, end, strides, True + ) with tf.name_scope("opt"): - out_opt1, out_opt2 = opt_fused_embedding_sparse_segment_reduce_graph(data,indices,slice_input,begin,end,strides,True) + out_opt1, out_opt2 = opt_fused_embedding_sparse_segment_reduce_graph( + data, indices, slice_input, begin, end, strides, True + ) with tf.compat.v1.Session(config=self.config) as sess: - out_ori_val1, out_ori_val2 = sess.run([out_ori1, out_ori2], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_ori) - out_opt_val1, out_opt_val2 = sess.run([out_opt1, out_opt2], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_opt) + out_ori_val1, out_ori_val2 = sess.run( + [out_ori1, out_ori2], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_ori + ) + out_opt_val1, out_opt_val2 = sess.run( + [out_opt1, out_opt2], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_opt + ) np.testing.assert_allclose( out_opt_val1, @@ -147,7 +166,10 @@ class TestSparseSegmentMeanSlice(unittest.TestCase): end = tf.compat.v1.placeholder(tf.int32, shape=(2,), name="end") strides = tf.compat.v1.placeholder(tf.int32, shape=(2,), name="strides") - base_data = np.array([[1.0, 2.0, 3.0], [3.0, 4.0,5.0], [5.0, 6.0, 7.0], [5.0, 6.0, 7.0]], dtype=np.float32) # shape {4, 3} + base_data = np.array( + [[1.0, 2.0, 3.0], [3.0, 4.0,5.0], [5.0, 6.0, 7.0], [5.0, 6.0, 7.0]], + dtype=np.float32 + ) # shape {4, 3} base_indices = np.array([0, 1, 2], dtype=np.int64) base_slice_input = np.array([[0, 0], [0, 2], [1, 2]], dtype=np.int64) base_begin = [0, 1] @@ -163,13 +185,27 @@ class TestSparseSegmentMeanSlice(unittest.TestCase): strides: base_strides } with tf.name_scope("ori"): - out_ori1, out_ori2 = ori_fused_embedding_sparse_segment_reduce_graph(data, indices, slice_input, begin, end, strides, False) + out_ori1, out_ori2 = ori_fused_embedding_sparse_segment_reduce_graph( + data, indices, slice_input, begin, end, strides, False + ) with tf.name_scope("opt"): - out_opt1, out_opt2 = opt_fused_embedding_sparse_segment_reduce_graph(data,indices, slice_input, begin, end, strides, False) + out_opt1, out_opt2 = opt_fused_embedding_sparse_segment_reduce_graph( + data,indices, slice_input, begin, end, strides, False + ) with tf.compat.v1.Session(config=self.config) as sess: - out_ori_val1, out_ori_val2 = sess.run([out_ori1, out_ori2], feed_dict=feed, 
options=self.run_options, run_metadata=self.run_metadata_ori) - out_opt_val1, out_opt_val2 = sess.run([out_opt1, out_opt2], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_opt) + out_ori_val1, out_ori_val2 = sess.run( + [out_ori1, out_ori2], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_ori + ) + out_opt_val1, out_opt_val2 = sess.run( + [out_opt1, out_opt2], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_opt + ) np.testing.assert_allclose( out_opt_val1, out_ori_val1, diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_select_test.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_select_test.py index de59d532..54c0926f 100644 --- a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_select_test.py +++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_select_test.py @@ -1,6 +1,8 @@ +# Copyright 2025 The Huawei Technologies Co. Authors. All Rights Reserved. +import unittest + import tensorflow as tf import numpy as np -import unittest from tensorflow.python.ops import gen_embedding_fused_ops from utils.utils import benchmark_op @@ -77,8 +79,18 @@ class TestKPFusedSparseSelect(unittest.TestCase): # Create tf session with tf.compat.v1.Session(config=self.config) as sess: # functest - out_ori_val1, out_ori_val2, out_ori_val3 = sess.run([out_ori1, out_ori2, out_ori3], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_ori) - out_opt_val1, out_opt_val2, out_opt_val3 = sess.run([out_opt1, out_opt2, out_opt3], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_opt) + out_ori_val1, out_ori_val2, out_ori_val3 = sess.run( + [out_ori1, out_ori2, out_ori3], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_ori + ) + out_opt_val1, out_opt_val2, out_opt_val3 = sess.run( + [out_opt1, out_opt2, out_opt3], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_opt + ) np.testing.assert_allclose( out_ori_val1, -- Gitee From 254a67febf3450112e325b1b2093e5d2d89e1ba7 Mon Sep 17 00:00:00 2001 From: rayshine <1324789704@qq.com> Date: Wed, 20 Aug 2025 11:34:26 +0800 Subject: [PATCH 8/9] =?UTF-8?q?=E8=9E=8D=E5=90=88=E7=AE=97=E5=AD=90?= =?UTF-8?q?=E4=BC=98=E5=8C=96=EF=BC=9AActionIdGather=E5=A4=9A=E7=BA=BF?= =?UTF-8?q?=E7=A8=8B=E3=80=81Gather=20SIMD=E5=B9=B6=E8=A1=8C=E3=80=81Resha?= =?UTF-8?q?pe=E5=A4=9A=E7=BA=BF=E7=A8=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../embedding_fused_action_id_gather.cc | 80 +++---- .../core/kernels/embedding_fused_gather.cc | 54 +++-- .../core/kernels/embedding_fused_padding.cc | 27 ++- .../embedding_fused_sparse_dynamic_stitch.cc | 8 +- .../kernels/embedding_fused_sparse_reshape.cc | 202 ++++-------------- .../embedding_fused_sparse_segment_reduce.cc | 10 +- .../kernels/embedding_fused_sparse_select.cc | 31 +-- .../core/profiler/lib/profiler_session.cc | 2 +- 8 files changed, 134 insertions(+), 280 deletions(-) diff --git a/tensorflow/core/kernels/embedding_fused_action_id_gather.cc b/tensorflow/core/kernels/embedding_fused_action_id_gather.cc index e20b8e54..af60b4ab 100644 --- a/tensorflow/core/kernels/embedding_fused_action_id_gather.cc +++ b/tensorflow/core/kernels/embedding_fused_action_id_gather.cc @@ -13,23 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include -#include - -#include "tensorflow/core/framework/common_shape_fns.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/types.h" #include "tensorflow/core/framework/op_kernel.h" -namespace tensorflow { +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/util/work_sharder.h" +namespace tensorflow { + template -static void GatherV2Impl(OpKernelContext* context, - const float* params_data, - const TensorShape& params_shape, - const Tindices* indices_data, - const TensorShape& indices_shape, - int axis, Tensor* temp) { +static void GatherV2Impl(OpKernelContext* context, const float* params_data, const TensorShape& params_shape, + const Tindices* indices_data, const TensorShape& indices_shape, int axis, Tensor* temp) { TensorShape temp_shape; const int P0 = params_shape.dim_size(0); int P1 = 1; @@ -41,13 +33,10 @@ static void GatherV2Impl(OpKernelContext* context, temp_shape.AddDim(params_shape.dim_size(d)); P1 *= params_shape.dim_size(d); } - OP_REQUIRES_OK(context, - context->allocate_temp(DT_FLOAT, temp_shape, temp)); - VLOG(1) << "temp shape: " << temp->shape().DebugString(); + OP_REQUIRES_OK(context, context->allocate_temp(DT_FLOAT, temp_shape, temp)); const int num_indices = indices_shape.num_elements(); float* temp_data = temp->flat().data(); - VLOG(2) << "num_indices : " << num_indices; if (axis == 0) { const int slice_size = P1; for (int i = 0; i < num_indices; ++i) { @@ -55,19 +44,19 @@ static void GatherV2Impl(OpKernelContext* context, if (idx < 0 || idx >= P0) { LOG(FATAL) << "GatherV2 axis=0: index out of range: " << idx; } - std::memcpy(temp_data + i * slice_size, - params_data + idx * slice_size, - sizeof(float) * slice_size); + std::memcpy( + temp_data + i * slice_size, params_data + idx * slice_size, sizeof(float) * slice_size + ); } } else { LOG(FATAL) << "Only axis=0 is supported"; } - VLOG(1) << "temp value : " << temp->DebugString(100); } + template class KPFusedEmbeddingActionIdGatherOp : public OpKernel { - public: +public: explicit KPFusedEmbeddingActionIdGatherOp(OpKernelConstruction* context) : OpKernel(context) {} void Compute(OpKernelContext* context) override { @@ -77,40 +66,41 @@ class KPFusedEmbeddingActionIdGatherOp : public OpKernel { const Tensor& indices2 = context->input(2); const Tensor& pack_dim = context->input(3); - VLOG(1) << "indices1 shape: " << indices1.shape().DebugString(); - VLOG(1) << "params shape: " << params.shape().DebugString(); - VLOG(1) << "indices2 shape: " << indices2.shape().DebugString(); OP_REQUIRES(context, indices1.dims() <= 2, errors::InvalidArgument("indices1 dims must <= 2")); OP_REQUIRES(context, indices2.dims() <= 2, errors::InvalidArgument("indices2 dims must <= 2")); OP_REQUIRES(context, params.dims() == 2, errors::InvalidArgument("params dims must = 2")); OP_REQUIRES(context, pack_dim.NumElements() == 1, errors::InvalidArgument("pack_dim NumElements must = 1")); Tensor temp; - GatherV2Impl(context, params.flat().data(), params.shape(), - indices1.flat().data(), indices1.shape(), - 0, &temp); + GatherV2Impl(context, params.flat().data(), params.shape(), indices1.flat().data(), + indices1.shape(), 0, &temp); Tensor temp1; - GatherV2Impl(context, temp.flat().data(), temp.shape(), - indices2.flat().data(), indices2.shape(), - 0, &temp1); + GatherV2Impl(context, temp.flat().data(), temp.shape(), indices2.flat().data(), + 
indices2.shape(), 0, &temp1); int pack_size = pack_dim.scalar()(); - VLOG(1) << "pack_size value: " << pack_size; int a_reshaped_cols = temp1.NumElements() / pack_size; auto a_reshaped = temp1.shaped({pack_size, a_reshaped_cols}); - VLOG(1) << "a_reshaped_cols : " << a_reshaped_cols; Tensor* output; int output_cols = a_reshaped_cols + 1680; OP_REQUIRES_OK(context, - context->allocate_output(0, TensorShape({pack_size, output_cols}), &output)); - VLOG(1) << "output shape: " << output->shape().DebugString(); - auto output_matrix = output->matrix(); - output_matrix.slice( - Eigen::array{0, 0}, - Eigen::array{pack_size, a_reshaped_cols}) = a_reshaped; - - output_matrix.slice( - Eigen::array{0, a_reshaped_cols}, - Eigen::array{pack_size, 1680}).setZero(); + context->allocate_output(0, TensorShape({pack_size, output_cols}), &output)); + + auto a_reshaped_data = a_reshaped.data(); + auto worker_threads = context->device()->tensorflow_cpu_worker_threads(); + const int64 cost_per_unit = a_reshaped_cols + 1680; + float* base = output->matrix().data(); + Shard(worker_threads->num_threads, worker_threads->workers, pack_size, cost_per_unit, + [&](int64 start_row, int64 end_row) { + for (int64 row = start_row; row < end_row; ++row) { + float* dst_row = base + row * (a_reshaped_cols + 1680); + std::memcpy( + dst_row, a_reshaped_data + row * a_reshaped_cols, sizeof(float) * a_reshaped_cols + ); + std::memset( + dst_row + a_reshaped_cols, 0, sizeof(float) * 1680 + ); + } + }); } }; @@ -119,7 +109,7 @@ class KPFusedEmbeddingActionIdGatherOp : public OpKernel { .Device(DEVICE_CPU) \ .TypeConstraint("Tindices1") \ .TypeConstraint("Tindices2"), \ - KPFusedEmbeddingActionIdGatherOp); + KPFusedEmbeddingActionIdGatherOp) REGISTER_CPU_KERNEL(int64, int32) REGISTER_CPU_KERNEL(int32, int32) diff --git a/tensorflow/core/kernels/embedding_fused_gather.cc b/tensorflow/core/kernels/embedding_fused_gather.cc index 51ec5776..6927d6b8 100644 --- a/tensorflow/core/kernels/embedding_fused_gather.cc +++ b/tensorflow/core/kernels/embedding_fused_gather.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/core/framework/common_shape_fns.h" -#include "tensorflow/core/framework/op.h" +#include + #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/util/work_sharder.h" @@ -22,7 +22,7 @@ limitations under the License. 
using namespace tensorflow; class KPFusedGather : public OpKernel { - public: +public: explicit KPFusedGather(OpKernelConstruction* context) : OpKernel(context) { } void Compute(OpKernelContext* context) override { @@ -33,16 +33,10 @@ class KPFusedGather : public OpKernel { OP_REQUIRES(context, slice_input.dims() == 2, errors::Internal("slice_input dims must == 2")); OP_REQUIRES(context, data.dims() == 2, errors::Internal("indentity dims must == 2")); - VLOG(1) << "Input indentity shape: " << data.shape().DebugString(); - VLOG(1) << "Input slice_input shape: " << slice_input.shape().DebugString(); - VLOG(1) << "Input begin value: " << begin.SummarizeValue(10); - int32 col = begin.flat().data()[1]; auto data_mat = data.matrix(); auto slice_input_mat = slice_input.matrix(); - VLOG(1) << "Column index from begin: " << col; - std::vector unique_values; std::vector indices(slice_input.dim_size(0)); std::unordered_map value_to_index; @@ -60,41 +54,41 @@ class KPFusedGather : public OpKernel { } Tensor* out_shape = nullptr; - Tensor* out_indices = nullptr; - Tensor* out_data = nullptr; - OP_REQUIRES_OK(context, context->allocate_output( - 0, TensorShape({unique_values.size()}), &out_shape)); - std::memcpy(out_shape->data(), unique_values.data(), unique_values.size() * sizeof(int64_t)); - + 0, TensorShape({1}), &out_shape)); + out_shape->flat()(0) = static_cast(unique_values.size()); + + Tensor* out_indices = nullptr; OP_REQUIRES_OK(context, context->allocate_output( - 1, TensorShape({static_cast(indices.size())}), &out_indices)); + 1, TensorShape({static_cast(indices.size())}), &out_indices)); std::memcpy(out_indices->data(), indices.data(), indices.size() * sizeof(int32_t)); - OP_REQUIRES(context, data.dim_size(1) * unique_values.size() % 12 == 0, + + OP_REQUIRES(context, data.dim_size(1) * unique_values.size() % 12 == 0, errors::Internal("cannot reshape to [-1, 12]")); - - std::vector gather1_result; - for (auto &indice : unique_values) { - for (int64_t i = 0; i < data.dim_size(1); ++i) { - gather1_result.push_back(data_mat(indice, i)); - } - } + Tensor* out_data = nullptr; OP_REQUIRES_OK(context, context->allocate_output( - 2, TensorShape({unique_values.size(), 12}), &out_data)); + 2, TensorShape({unique_values.size(), 12}), &out_data)); auto output_data = out_data->matrix(); - int cur_row = 0; - for (auto &indice : unique_values) { + + for (int64_t cur_row = 0; cur_row < unique_values.size(); ++cur_row) { + int64_t idx = unique_values[cur_row]; for (int i = 0; i < 12; ++i) { - output_data(cur_row, i) = gather1_result[12 * indice + i]; + const float* src = &data_mat(idx, 0); + float* dst = &output_data(cur_row, 0); + float32x4_t v0 = vld1q_f32(src); + float32x4_t v1 = vld1q_f32(src + 4); + float32x4_t v2 = vld1q_f32(src + 8); + vst1q_f32(dst, v0); + vst1q_f32(dst + 4, v1); + vst1q_f32(dst + 8, v2); } - cur_row++; } } }; REGISTER_KERNEL_BUILDER(Name("KPFusedGather").Device(DEVICE_CPU), - KPFusedGather); \ No newline at end of file + KPFusedGather); diff --git a/tensorflow/core/kernels/embedding_fused_padding.cc b/tensorflow/core/kernels/embedding_fused_padding.cc index e36fbf7f..98351004 100644 --- a/tensorflow/core/kernels/embedding_fused_padding.cc +++ b/tensorflow/core/kernels/embedding_fused_padding.cc @@ -13,21 +13,21 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include #include #include "tensorflow/core/framework/common_shape_fns.h" #include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/op_kernel.h" + namespace tensorflow { using shape_inference::InferenceContext; using shape_inference::ShapeHandle; class KPFusedEmbeddingPaddingOp : public OpKernel { - public: +public: explicit KPFusedEmbeddingPaddingOp(OpKernelConstruction* context) : OpKernel(context) { fast_ = (type_string() == "KPFusedEmbeddingPaddingFast"); } @@ -67,32 +67,29 @@ class KPFusedEmbeddingPaddingOp : public OpKernel { int output_rows = padding_rows + input.dim_size(0); int output_cols = input.dim_size(1); OP_REQUIRES( - context, - output_rows * output_cols % reshape_cols == 0, - errors::InvalidArgument("padding cannot reshape to [-1, ", reshape_cols, "]") + context, output_rows * output_cols % reshape_cols == 0, + errors::InvalidArgument("padding cannot reshape to [-1, ", reshape_cols, "]") ); int reshape_rows = output_rows * output_cols / reshape_cols; if (fast_) { - OP_REQUIRES_OK(context, - context->allocate_output(1, TensorShape({}), - &output1)); + OP_REQUIRES_OK(context, context->allocate_output(1, TensorShape({}), &output1)); output1->scalar()() = reshape_rows; return; } OP_REQUIRES_OK(context, context->allocate_temp(DT_FLOAT, TensorShape({padding_rows + input_rows_value, output_cols}), - &padding)); + &padding)); auto input_matrix = input.matrix(); auto padding_matrix = padding.matrix(); padding_matrix.slice( - Eigen::array{0, 0}, - Eigen::array{input_rows_value, output_cols}) = input_matrix; + Eigen::array{0, 0}, + Eigen::array{input_rows_value, output_cols}) = input_matrix; padding_matrix.slice( - Eigen::array{input_rows_value, 0}, - Eigen::array{padding_rows, output_cols}).setZero(); + Eigen::array{input_rows_value, 0}, + Eigen::array{padding_rows, output_cols}).setZero(); TensorShape reshaped_shape({reshape_rows, reshape_cols}); OP_REQUIRES_OK(context, @@ -100,7 +97,7 @@ class KPFusedEmbeddingPaddingOp : public OpKernel { output1->flat() = padding.flat(); } - private: +private: bool fast_; }; diff --git a/tensorflow/core/kernels/embedding_fused_sparse_dynamic_stitch.cc b/tensorflow/core/kernels/embedding_fused_sparse_dynamic_stitch.cc index 9937a07e..e1cdbc5c 100644 --- a/tensorflow/core/kernels/embedding_fused_sparse_dynamic_stitch.cc +++ b/tensorflow/core/kernels/embedding_fused_sparse_dynamic_stitch.cc @@ -13,12 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include - #include -#include "tensorflow/core/framework/common_shape_fns.h" -#include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/util/work_sharder.h" @@ -26,7 +22,7 @@ limitations under the License. 
using namespace tensorflow; class KPFusedSparseDynamicStitchOp : public OpKernel { - public: +public: explicit KPFusedSparseDynamicStitchOp(OpKernelConstruction* context) : OpKernel(context) {} @@ -78,4 +74,4 @@ class KPFusedSparseDynamicStitchOp : public OpKernel { }; REGISTER_KERNEL_BUILDER(Name("KPFusedSparseDynamicStitch").Device(DEVICE_CPU), - KPFusedSparseDynamicStitchOp); + KPFusedSparseDynamicStitchOp); \ No newline at end of file diff --git a/tensorflow/core/kernels/embedding_fused_sparse_reshape.cc b/tensorflow/core/kernels/embedding_fused_sparse_reshape.cc index 43428b88..219efaba 100644 --- a/tensorflow/core/kernels/embedding_fused_sparse_reshape.cc +++ b/tensorflow/core/kernels/embedding_fused_sparse_reshape.cc @@ -13,140 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/core/framework/common_shape_fns.h" -#include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/util/work_sharder.h" -#include "tensorflow/core/kernels/reshape_util.h" -#include "tensorflow/core/framework/register_types.h" -#include "tensorflow/core/framework/tensor_util.h" -#include "tensorflow/core/framework/types.h" -#include "tensorflow/core/lib/gtl/inlined_vector.h" using namespace tensorflow; -static void ReshapeKp(OpKernelContext *context, const Tensor &input_indices_in, - const Tensor &input_shape_in, const Tensor &target_shape_in, - int output_indices_idx, int output_shape_idx) { - OP_REQUIRES(context, TensorShapeUtils::IsMatrix(input_indices_in.shape()), - errors::InvalidArgument( - "Input indices should be a matrix but received shape ", - input_indices_in.shape().DebugString())); - OP_REQUIRES(context, TensorShapeUtils::IsVector(input_shape_in.shape()), - errors::InvalidArgument( - "Input shape should be a vector but received shape ", - input_shape_in.shape().DebugString())); - OP_REQUIRES(context, TensorShapeUtils::IsVector(target_shape_in.shape()), - errors::InvalidArgument( - "Target shape should be a vector but received shape ", - target_shape_in.shape().DebugString())); - - const int64 input_rank = input_shape_in.NumElements(); - const int64 output_rank = target_shape_in.NumElements(); - const TensorShape input_shape(input_shape_in.vec()); - const int64 dense_size = input_shape.num_elements(); - const int64 nnz = input_indices_in.shape().dim_size(0); - - TensorShape output_shape; - int64 product = 1; - int unknown_index = -1; - auto target_shape = target_shape_in.vec(); - for (int d = 0; d < output_rank; ++d) { - const int64 size = target_shape(d); - if (size == -1) { - OP_REQUIRES( - context, unknown_index == -1, - errors::InvalidArgument("only one output dimension may be -1, " - "not both ", - unknown_index, " and ", d)); - unknown_index = d; - output_shape.AddDim(1); - } else { - OP_REQUIRES(context, size >= 0, - errors::InvalidArgument("size ", d, - " must be non-negative, not ", size)); - product *= size; - output_shape.AddDim(size); - } - } - if (unknown_index != -1) { - OP_REQUIRES( - context, product > 0, - errors::InvalidArgument("reshape cannot infer the missing " - "input size for an empty tensor unless all " - "specified input sizes are non-zero")); - const int64 missing = dense_size / product; - OP_REQUIRES( - context, product * missing == dense_size, - errors::InvalidArgument( - "Input to reshape is a SparseTensor with 
", dense_size, - " dense values, but the requested shape requires a multiple of ", - product, ". input_shape=", input_shape.DebugString(), - " output_shape=", output_shape.DebugString())); - output_shape.set_dim(unknown_index, missing); - } - - OP_REQUIRES( - context, output_shape.num_elements() == dense_size, - errors::InvalidArgument("Input to reshape is a tensor with ", dense_size, - " dense values, but the requested shape has ", - output_shape.num_elements(), - ". input_shape=", input_shape.DebugString(), - " output_shape=", output_shape.DebugString())); - - if (input_shape == output_shape) { - context->set_output(output_indices_idx, input_indices_in); - context->set_output(output_shape_idx, input_shape_in); - return; - } - - gtl::InlinedVector input_strides(input_rank); - if (input_rank > 0) { - input_strides[input_rank - 1] = 1; - for (int d = input_rank - 2; d >= 0; --d) { - input_strides[d] = input_strides[d + 1] * input_shape.dim_size(d + 1); - } - } - - gtl::InlinedVector output_strides(output_rank); - if (output_rank > 0) { - output_strides[output_rank - 1] = 1; - for (int d = output_rank - 2; d >= 0; --d) { - output_strides[d] = output_strides[d + 1] * output_shape.dim_size(d + 1); - } - } - - Tensor *result_indices = nullptr; - OP_REQUIRES_OK(context, - context->allocate_output(output_indices_idx, - TensorShape({nnz, output_rank}), - &result_indices)); - auto input_ind = input_indices_in.matrix(); - auto output_ind = result_indices->matrix(); - for (int i = 0; i < nnz; ++i) { - int64 id = 0; - for (int j = 0; j < input_rank; ++j) { - id += input_ind(i, j) * input_strides[j]; - } - for (int j = 0; j < output_rank; ++j) { - output_ind(i, j) = id / output_strides[j]; - id %= output_strides[j]; - } - } - - Tensor *result_shape = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(output_shape_idx, - TensorShape({output_rank}), - &result_shape)); - auto output_shape_vec = result_shape->vec(); - for (int j = 0; j < output_shape.dims(); ++j) { - output_shape_vec(j) = output_shape.dim_size(j); - } -} class KPFusedSparseReshapeOp : public OpKernel { - public: +public: explicit KPFusedSparseReshapeOp(OpKernelConstruction* context) : OpKernel(context) { } void Compute(OpKernelContext* context) override { @@ -155,39 +30,54 @@ class KPFusedSparseReshapeOp : public OpKernel { const Tensor& new_shape = context->input(2); OP_REQUIRES(context, slice_input.dims() == 2, errors::Internal("slice_input dims must == 2")); - - VLOG(1) << "Input slice_input shape: " << slice_input.shape().DebugString(); - VLOG(1) << "Input begin value: " << begin.DebugString(); - VLOG(1) << "Input new_shape value: " << new_shape.DebugString(); - + int32 col = begin.flat().data()[1]; - int64_t stridedslice57_out = slice_input.dim_size(0); - auto slice_input_mat = slice_input.matrix(); - - VLOG(1) << "stridedslice57_out: " << stridedslice57_out; - VLOG(1) << "slice_input.dim_size(0): " << slice_input.dim_size(0); - VLOG(1) << "slice_input.dim_size(1): " << slice_input.dim_size(1); - OP_REQUIRES(context, stridedslice57_out == slice_input.dim_size(0), errors::Internal("concat shape mismatch")); - VLOG(1) << "Column index from begin: " << col; - VLOG(1) << "indices size: " << stridedslice57_out; - - Tensor shape_in(DT_INT64, TensorShape({2})); - auto tensor_flat = shape_in.flat(); - tensor_flat(0) = stridedslice57_out; - tensor_flat(1) = 2; - - Tensor indices_in(DT_INT64, TensorShape({stridedslice57_out, 2})); - auto indices_in_mat = indices_in.matrix(); - for (int i = 0; i < stridedslice57_out; ++i) { - 
indices_in_mat(i, 0) = i; - indices_in_mat(i, 1) = slice_input_mat(i, col); + int64 nnz = slice_input.dim_size(0); + TensorShape output_shape; + int64 product = 2 * nnz; + auto target_shape = new_shape.vec(); + + OP_REQUIRES(context, !(target_shape(0)==-1&&target_shape(1)==-1), errors::InvalidArgument("only one output dimension may be -1.")); + OP_REQUIRES(context, (target_shape(0)>0||target_shape(0)==-1) && (target_shape(1)>0||target_shape(1)==-1), errors::InvalidArgument("must be non-negative.")); + OP_REQUIRES(context, product%target_shape(0)==0 && product%target_shape(1)==0, errors::InvalidArgument("reshape cannot infer the missing.")); + + output_shape.AddDim(target_shape(0) == -1 ? product / target_shape(1) : target_shape(0)); + output_shape.AddDim(target_shape(1) == -1 ? product / target_shape(0) : target_shape(1)); + + if (output_shape.dim_size(0) == nnz && output_shape.dim_size(1) == 2) { + context->set_output(0, slice_input); + Tensor input_shape_in(DT_INT64, TensorShape({2})); + auto tensor_flat = input_shape_in.flat(); + tensor_flat(0) = nnz; + tensor_flat(1) = 2; + context->set_output(1, input_shape_in); + return; } - Tensor new_shape_in(DT_INT64, TensorShape({2})); - auto newshape_tensor_flat = new_shape_in.flat(); - newshape_tensor_flat(0) = new_shape.flat()(0); - newshape_tensor_flat(1) = new_shape.flat()(1); - ReshapeKp(context, indices_in, shape_in, new_shape_in, 0, 1); + Tensor *result_indices = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape({nnz, 2}), &result_indices)); + auto input_ind = slice_input.matrix(); + auto output_ind = result_indices->matrix(); + + const int32 target_shape1 = target_shape(1); + auto worker_threads = context->device()->tensorflow_cpu_worker_threads(); + const int64 cost_per_unit = 50; + + Shard(worker_threads->num_threads, worker_threads->workers, nnz, cost_per_unit, + [&](int64 start, int64 limit) { + for (int64 i = start; i < limit; ++i) { + int base_index = 2 * i + input_ind(i, col); + output_ind(i, 1) = base_index % target_shape1; + output_ind(i, 0) = base_index / target_shape1; + } + }); + + Tensor *result_shape = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(1, TensorShape({2}), &result_shape)); + auto output_shape_vec = result_shape->vec(); + for (int j = 0; j < output_shape.dims(); ++j) { + output_shape_vec(j) = output_shape.dim_size(j); + } } }; diff --git a/tensorflow/core/kernels/embedding_fused_sparse_segment_reduce.cc b/tensorflow/core/kernels/embedding_fused_sparse_segment_reduce.cc index 19cc7394..7472fbb9 100644 --- a/tensorflow/core/kernels/embedding_fused_sparse_segment_reduce.cc +++ b/tensorflow/core/kernels/embedding_fused_sparse_segment_reduce.cc @@ -15,8 +15,6 @@ limitations under the License. 
#include -#include "tensorflow/core/framework/common_shape_fns.h" -#include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/util/work_sharder.h" @@ -25,7 +23,7 @@ using namespace tensorflow; template class KPFusedSparseSegmentReduceOp : public OpKernel { - public: +public: explicit KPFusedSparseSegmentReduceOp(OpKernelConstruction* context) : OpKernel(context) { int combiner_mode; @@ -138,7 +136,7 @@ class KPFusedSparseSegmentReduceOp : public OpKernel { } } - private: +private: bool is_mean_; }; @@ -146,7 +144,7 @@ class KPFusedSparseSegmentReduceOp : public OpKernel { REGISTER_KERNEL_BUILDER(Name("KPFusedSparseSegmentReduce") \ .Device(DEVICE_CPU) \ .TypeConstraint("Tidx"), \ - KPFusedSparseSegmentReduceOp); + KPFusedSparseSegmentReduceOp) REGISTER_KERNEL(int64) REGISTER_KERNEL(int32) -#undef REGISTER_KERNEL +#undef REGISTER_KERNEL \ No newline at end of file diff --git a/tensorflow/core/kernels/embedding_fused_sparse_select.cc b/tensorflow/core/kernels/embedding_fused_sparse_select.cc index 086092d5..89a42d14 100644 --- a/tensorflow/core/kernels/embedding_fused_sparse_select.cc +++ b/tensorflow/core/kernels/embedding_fused_sparse_select.cc @@ -16,22 +16,19 @@ limitations under the License. #include #include -#include "tensorflow/core/framework/common_shape_fns.h" -#include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/platform/logging.h" using namespace tensorflow; class KPFusedSparseSelect : public OpKernel { - public: +public: explicit KPFusedSparseSelect(OpKernelConstruction* context) : OpKernel(context) { - } void Compute(OpKernelContext* context) override { - const Tensor& input_a = context->input(0); const Tensor& input_b = context->input(1); const Tensor& input_c = context->input(2); @@ -39,12 +36,10 @@ class KPFusedSparseSelect : public OpKernel { auto a_flat = input_a.flat(); auto b_flat = input_b.flat(); auto c_flat = input_c.flat(); - VLOG(1) << "input_a shape: " << input_a.shape().DebugString(); - VLOG(1) << "input_b shape: " << input_b.shape().DebugString(); - VLOG(1) << "input_c shape: " << input_c.shape().DebugString(); - OP_REQUIRES(context,input_a.NumElements() == input_b.NumElements(), + + OP_REQUIRES(context, input_a.NumElements() == input_b.NumElements(), errors::InvalidArgument("Input num elements must match")); - OP_REQUIRES(context,input_a.NumElements() == input_c.NumElements(), + OP_REQUIRES(context, input_a.NumElements() == input_c.NumElements(), errors::InvalidArgument("Input num elements must match")); auto N = input_a.NumElements(); @@ -58,10 +53,10 @@ class KPFusedSparseSelect : public OpKernel { auto b_equal_node0 = (b_reshaped_tensor == 4563); auto b_equal_node1 = (b_reshaped_tensor == 10831); - Eigen::Tensor tensor_ones(N, 1); + Eigen::Tensor tensor_ones(N, 1); tensor_ones.setConstant(1.0f); - Eigen::Tensor tensor_zeros(N, 1); + Eigen::Tensor tensor_zeros(N, 1); tensor_zeros.setConstant(0.0f); auto select_2412 = b_equal_node0.select(tensor_ones, a_greater_casted); @@ -74,13 +69,9 @@ class KPFusedSparseSelect : public OpKernel { Tensor* output_y = nullptr; Tensor* output_w = nullptr; - OP_REQUIRES_OK(context, - context->allocate_output(0,TensorShape({N, 1}), &output_x)); - OP_REQUIRES_OK(context, - context->allocate_output(1,TensorShape({N, 1}), &output_y)); - OP_REQUIRES_OK(context, - 
context->allocate_output(2,TensorShape({N, 2}), &output_w));
+    OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape({N, 1}), &output_x));
+    OP_REQUIRES_OK(context, context->allocate_output(1, TensorShape({N, 1}), &output_y));
+    OP_REQUIRES_OK(context, context->allocate_output(2, TensorShape({N, 2}), &output_w));
 
     Eigen::TensorMap<Eigen::Tensor<float, 2, Eigen::RowMajor>> map_output_x(
         output_x->flat<float>().data(),
@@ -102,9 +93,7 @@ class KPFusedSparseSelect : public OpKernel {
         output_w->dim_size(1)
     );
     map_output_w = concat_out;
-
   }
-
 };
 
 REGISTER_KERNEL_BUILDER(Name("KPFusedSparseSelect").Device(DEVICE_CPU),
diff --git a/tensorflow/core/profiler/lib/profiler_session.cc b/tensorflow/core/profiler/lib/profiler_session.cc
index 982a0f93..24b820af 100644
--- a/tensorflow/core/profiler/lib/profiler_session.cc
+++ b/tensorflow/core/profiler/lib/profiler_session.cc
@@ -156,7 +156,7 @@ ProfilerSession::ProfilerSession(const profiler::ProfilerOptions& options)
     return;
   }
 
-  LOG(INFO) << "Profiler session started.";
+  // LOG(INFO) << "Profiler session started.";
 
 #if !defined(IS_MOBILE_PLATFORM)
   CreateProfilers(options, &profilers_);
-- 
Gitee

From 541e1c7c34efd6f4574e10320c03994d802be8f6 Mon Sep 17 00:00:00 2001
From: rayshine <1324789704@qq.com>
Date: Wed, 20 Aug 2025 17:09:10 +0800
Subject: [PATCH 9/9] Fix the fused sparse reshape op's new_shape dtype: int32 --> int64
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tensorflow/core/kernels/embedding_fused_sparse_reshape.cc | 8 ++++----
 .../fused_embedding_sparse_reshape_test.py                | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tensorflow/core/kernels/embedding_fused_sparse_reshape.cc b/tensorflow/core/kernels/embedding_fused_sparse_reshape.cc
index 219efaba..fb0fa578 100644
--- a/tensorflow/core/kernels/embedding_fused_sparse_reshape.cc
+++ b/tensorflow/core/kernels/embedding_fused_sparse_reshape.cc
@@ -35,7 +35,7 @@ public:
     int64 nnz = slice_input.dim_size(0);
     TensorShape output_shape;
     int64 product = 2 * nnz;
-    auto target_shape = new_shape.vec<int32>();
+    auto target_shape = new_shape.vec<int64>();
 
     OP_REQUIRES(context, !(target_shape(0)==-1&&target_shape(1)==-1), errors::InvalidArgument("only one output dimension may be -1."));
     OP_REQUIRES(context, (target_shape(0)>0||target_shape(0)==-1) && (target_shape(1)>0||target_shape(1)==-1), errors::InvalidArgument("must be non-negative."));
@@ -59,14 +59,14 @@ public:
     auto input_ind = slice_input.matrix<int64>();
     auto output_ind = result_indices->matrix<int64>();
 
-    const int32 target_shape1 = target_shape(1);
+    const int64 target_shape1 = target_shape(1);
     auto worker_threads = context->device()->tensorflow_cpu_worker_threads();
     const int64 cost_per_unit = 50;
 
     Shard(worker_threads->num_threads, worker_threads->workers, nnz, cost_per_unit,
       [&](int64 start, int64 limit) {
         for (int64 i = start; i < limit; ++i) {
-          int base_index = 2 * i + input_ind(i, col);
+          int64 base_index = 2 * i + input_ind(i, col);
           output_ind(i, 1) = base_index % target_shape1;
           output_ind(i, 0) = base_index / target_shape1;
         }
@@ -75,7 +75,7 @@ public:
     Tensor *result_shape = nullptr;
     OP_REQUIRES_OK(context, context->allocate_output(1, TensorShape({2}), &result_shape));
     auto output_shape_vec = result_shape->vec<int64>();
-    for (int j = 0; j < output_shape.dims(); ++j) {
+    for (int64 j = 0; j < output_shape.dims(); ++j) {
       output_shape_vec(j) = output_shape.dim_size(j);
     }
   }
diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_reshape_test.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_reshape_test.py
index f2ad66ff..8236393d 100644
--- a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_reshape_test.py
+++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_reshape_test.py
@@ -77,7 +77,7 @@ class TestFusedSparseReshape(unittest.TestCase):
         with tf.Graph().as_default():
             slice_input = tf.compat.v1.placeholder(tf.int64, shape=(4,2), name="slice_input")
             begin = tf.compat.v1.placeholder(tf.int32, shape=(2,), name="begin")
-            newshape = tf.compat.v1.placeholder(tf.int32, shape=(2,), name="newshape")
+            newshape = tf.compat.v1.placeholder(tf.int64, shape=(2,), name="newshape")
             base_slice_input = np.array([[0, 0], [0, 1], [1, 2], [3, 4]], dtype=np.int64)
            base_begin = [0, 1]
            base_newshape = [2, 4]
-- 
Gitee
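
As a quick cross-check of the fused sparse-reshape kernel patched above, here is a small standalone NumPy sketch (not part of the patch series) of the index arithmetic that KPFusedSparseReshape appears to perform after patch 9: row i of slice_input is linearized as 2*i + slice_input[i, col] over the implicit [nnz, 2] shape, then re-expressed in the requested shape using int64 arithmetic. The helper name and the input values below are illustrative only.

import numpy as np

def fused_sparse_reshape_ref(slice_input, begin, new_shape):
    # Mirror of the kernel's int64 index math; the implicit input shape is [nnz, 2].
    col = begin[1]
    nnz = slice_input.shape[0]
    total = 2 * nnz
    d0, d1 = new_shape
    d0 = total // d1 if d0 == -1 else d0   # resolve a single -1, as the kernel does
    d1 = total // d0 if d1 == -1 else d1
    base = 2 * np.arange(nnz, dtype=np.int64) + slice_input[:, col]
    indices = np.stack([base // d1, base % d1], axis=1)
    return indices, np.array([d0, d1], dtype=np.int64)

# Illustrative inputs (not the unit-test values): 3 rows reshaped into [3, 2].
slice_input = np.array([[0, 0], [0, 1], [1, 0]], dtype=np.int64)
indices, shape = fused_sparse_reshape_ref(slice_input, begin=[0, 1], new_shape=[3, 2])
print(indices)  # [[0 0] [1 1] [2 0]]
print(shape)    # [3 2]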
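In the same spirit, a pure-Python/NumPy sketch of what the NEON-accelerated KPFusedGather kernel computes, as read from its diff: deduplicate the selected column of slice_input in first-appearance order, emit the unique count and the per-row remapping, and gather the matching rows of data as a [U, 12] block. Function name and sample inputs are again illustrative, not taken from the patch.

import numpy as np

def fused_gather_ref(data, slice_input, begin):
    # Deduplicate column begin[1] of slice_input, keeping first-appearance order.
    col = begin[1]
    unique_values, indices, seen = [], [], {}
    for v in slice_input[:, col].tolist():
        if v not in seen:
            seen[v] = len(unique_values)
            unique_values.append(v)
        indices.append(seen[v])
    # Gather the unique rows of data and view them as [-1, 12], matching the
    # kernel's [U, 12] float output.
    gathered = data[np.array(unique_values)].astype(np.float32).reshape(-1, 12)
    return (np.array([len(unique_values)], dtype=np.int64),
            np.array(indices, dtype=np.int32),
            gathered)

# Illustrative inputs: a 20x12 table, three rows selecting ids 0, 0, 1.
data = np.arange(240, dtype=np.float32).reshape(20, 12)
slice_input = np.array([[0, 0], [0, 0], [1, 1]], dtype=np.int64)
count, idx, rows = fused_gather_ref(data, slice_input, begin=[0, 1])
print(count, idx)   # [2] [0 0 1]
print(rows.shape)   # (2, 12)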