From 90d7796bce754c3a321afe90a7eeb74b9763163a Mon Sep 17 00:00:00 2001
From: Codersheepchen
Date: Tue, 22 Jul 2025 21:53:33 -0400
Subject: [PATCH 1/9] add kernel func test and perftest

---
 .../fused_embedding_action_id_gather_test.py | 69 +++++++++
 .../fused_embedding_gather_test.py | 88 ++++++++++++
 .../fused_embedding_padding_test.py | 113 +++++++++++++++
 ...ed_embedding_sparse_dynamic_stitch_test.py | 97 +++++++++++++
 .../fused_embedding_sparse_reshape_test.py | 102 +++++++++++++
 ...ed_embedding_sparse_segment_reduce_test.py | 134 ++++++++++++++++++
 .../fused_embedding_sparse_select.py | 92 ++++++++++++
 .../fused_embedding_action_id_gather_test.py | 67 +++++++++
 .../fused_embedding_gather_test.py | 93 ++++++++++++
 .../fused_embedding_padding_test.py | 111 +++++++++++++++
 .../fused_embedding_select.py | 93 ++++++++++++
 ...ed_embedding_sparse_dynamic_stitch_test.py | 96 +++++++++++++
 .../fused_embedding_sparse_reshape_test.py | 100 +++++++++++++
 ...ed_embedding_sparse_segment_reduce_test.py | 132 +++++++++++++++++
 14 files changed, 1387 insertions(+)
 create mode 100644 tensorflow/python/grappler/embedding_fused_functest/fused_embedding_action_id_gather_test.py
 create mode 100644 tensorflow/python/grappler/embedding_fused_functest/fused_embedding_gather_test.py
 create mode 100644 tensorflow/python/grappler/embedding_fused_functest/fused_embedding_padding_test.py
 create mode 100644 tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_dynamic_stitch_test.py
 create mode 100644 tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_reshape_test.py
 create mode 100644 tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_segment_reduce_test.py
 create mode 100644 tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_select.py
 create mode 100644 tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_action_id_gather_test.py
 create mode 100644 tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_gather_test.py
 create mode 100644 tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_padding_test.py
 create mode 100644 tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_select.py
 create mode 100644 tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_dynamic_stitch_test.py
 create mode 100644 tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_reshape_test.py
 create mode 100644 tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_segment_reduce_test.py

diff --git a/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_action_id_gather_test.py b/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_action_id_gather_test.py
new file mode 100644
index 00000000..94db9d42
--- /dev/null
+++ b/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_action_id_gather_test.py
@@ -0,0 +1,69 @@
+import tensorflow as tf
+import numpy as np
+import unittest
+
+from tensorflow.python.ops import gen_embedding_fused_ops
+
+class TestFusedEmbeddingActionIdGather(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        """Initialize test data and custom op"""
+        # Load custom op
+        cls.custom_op = gen_embedding_fused_ops
+
+        # Base test data
+        np.random.seed(140)
+        indices1_shape = (8, 10)
+        indices2_shape = (5, 6)
+        params_shape = (80, 300)
+        cls.input0 = np.random.randint(0, params_shape[0], size=indices1_shape, dtype=np.int32)
+        cls.input1 =
np.random.random(params_shape).astype(np.float32) + cls.input2 = np.random.randint(0, indices1_shape[0], size=indices2_shape, dtype=np.int32) + cls.input3 = params_shape[0] + # Create tf session + cls.sess = tf.compat.v1.Session() + + @classmethod + def tearDownClass(cls): + cls.sess.close() + + def test_kp_fused_embedding_action_id_gather(self): + # execute custom op + custom_out = self.custom_op.KPFusedEmbeddingActionIdGather( + input0=tf.constant(self.input0, dtype=tf.int32), + input1=tf.constant(self.input1, dtype=tf.float32), + input2=tf.constant(self.input2, dtype=tf.int32), + input3=tf.constant(self.input3, dtype=tf.int32), + ) + + # tf native implementation + tf_out = self._tf_reference_impl( + input0=tf.constant(self.input0, dtype=tf.int32), + input1=tf.constant(self.input1, dtype=tf.float32), + input2=tf.constant(self.input2, dtype=tf.int32), + input3=tf.constant(self.input3, dtype=tf.int32), + ) + + custom_out_val = self.sess.run([custom_out]) + tf_out_val = self.sess.run([tf_out]) + + # Numerical comparison + np.testing.assert_array_equal( + custom_out_val, + tf_out_val, + err_msg="result mismatch" + ) + + def _tf_reference_impl(self, input0, input1, input2, input3): + gather1 = tf.gather(input1, input0, axis=0) + gather2 = tf.gather(gather1, input2, axis=0) + pack1 = tf.stack([input3, 1680], axis=0) + pack2 = tf.stack([input3, -1], axis=0) + reshape = tf.reshape(gather2, pack2) + fill = tf.fill(pack1, tf.constant(0, dtype=tf.float32)) + output = tf.concat([reshape, fill], axis=-1) + return output + +if __name__ == "__main__": + tf.compat.v1.disable_eager_execution() + unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_gather_test.py b/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_gather_test.py new file mode 100644 index 00000000..f47b70d2 --- /dev/null +++ b/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_gather_test.py @@ -0,0 +1,88 @@ +import tensorflow as tf +import numpy as np +import unittest + +from tensorflow.python.ops import gen_embedding_fused_ops + +class TestFusedGather(unittest.TestCase): + @classmethod + def setUpClass(cls): + """Initialize test data and custom op""" + # Load custom op + cls.custom_op = gen_embedding_fused_ops + + # Base test data + cls.base_data = np.linspace(0, 11, num=240, endpoint=False, dtype=np.float32).reshape(20, 12) + cls.base_slice_input = np.array([[0, 0], [0, 1], [1, 2]], dtype=np.int64) + cls.base_begin = [0, 1] + cls.base_end = [0, 2] + cls.base_strides = [1, 1] + # Create tf session + cls.sess = tf.compat.v1.Session() + + @classmethod + def tearDownClass(cls): + cls.sess.close() + + def test_custom(self): + # execute custom op + custom_out1, custom_out2, custom_out3= self.custom_op.KPFusedGather( + data=self.base_data, + slice_input=self.base_slice_input, + begin=self.base_begin, + ) + + # tf native implementation + tf_out1, tf_out2, tf_out3 = self._tf_reference_impl( + self.base_data, + self.base_slice_input, + self.base_begin, + ) + + custom_out_val1, custom_out_val2, custom_out_val3 = self.sess.run([custom_out1, custom_out2, custom_out3]) + tf_out_val1, tf_out_val2, tf_out_val3 = self.sess.run([tf_out1, tf_out2, tf_out3]) + + np.testing.assert_array_equal( + custom_out_val1, + tf_out_val1, + err_msg="Segment count mismatch" + ) + + np.testing.assert_array_equal( + custom_out_val2, + tf_out_val2, + err_msg="Segment count mismatch" + ) + + np.testing.assert_allclose( + custom_out_val3, + 
tf_out_val3, + rtol=1e-6, + err_msg="Output values mismatch" + ) + + def _tf_reference_impl(self, data, slice_input, begin): + slice_out = tf.strided_slice( + slice_input, + begin = begin, + end = [tf.shape(slice_input)[0], begin[1] + 2], + strides = [1, 1], + begin_mask = 1, + end_mask = 1, + shrink_axis_mask = 2 + ) + + slice_out, slice_out_indices = tf.unique(slice_out) + output_shape = tf.shape(slice_out) + slice_out = tf.reshape(slice_out, [-1]) + slice_out, _ = tf.unique(slice_out) + + gather1_result = tf.gather(data, slice_out) + gather1_result = tf.reshape(gather1_result, [-1, 12]) + + gather2_result = tf.gather(gather1_result, slice_out) + return output_shape, slice_out_indices, gather2_result + +if __name__ == "__main__": + tf.compat.v1.disable_eager_execution() + unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_padding_test.py b/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_padding_test.py new file mode 100644 index 00000000..78c01e17 --- /dev/null +++ b/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_padding_test.py @@ -0,0 +1,113 @@ +import tensorflow as tf +import numpy as np +import unittest + +from tensorflow.python.ops import gen_embedding_fused_ops + +class TestFusedEmbeddingPadding(unittest.TestCase): + @classmethod + def setUpClass(cls): + """Initialize test data and custom op""" + # Load custom op + cls.custom_op = gen_embedding_fused_ops + + # Base test data + np.random.seed(140) + cls.input0 = np.random.randint(0, 100, size=(2 * 3, 10), dtype=np.int64) + cls.input1 = np.random.rand(2 * 2, 10).astype(np.float) + cls.input2 = cls.input1.shape + cls.input3 = np.array([-1, 20]).astype(np.int32) + # Create tf session + cls.sess = tf.compat.v1.Session() + + @classmethod + def tearDownClass(cls): + cls.sess.close() + + def test_kp_fused_embedding_padding_fast(self): + # execute custom op + _, custom_out = self.custom_op.KPFusedEmbeddingPaddingFast( + input0=self.input0.shape, + input1=self.input1, + input2=self.input2[0], + input3=self.input3, + ) + + # tf native implementation + tf_out = self._fused_embedding_padding_fast_reference_impl( + tf.constant(self.input0.shape, dtype=tf.int64), + tf.constant(self.input1, dtype=tf.float32), + tf.constant(self.input2[0], dtype=tf.int32), + tf.constant(self.input3, dtype=tf.int32), + ) + + custom_out_val = self.sess.run([custom_out]) + tf_out_val = self.sess.run([tf_out]) + + # Numerical comparison + np.testing.assert_array_equal( + custom_out_val, + tf_out_val, + err_msg="result mismatch" + ) + + def test_kp_fused_embedding_padding(self): + # execute custom op + _, custom_out = self.custom_op.KPFusedEmbeddingPadding( + input0=self.input0.shape, + input1=self.input1, + input2=self.input2[0], + input3=self.input3, + ) + + # tf native implementation + tf_out = self._fused_embedding_padding_reference_impl( + tf.constant(self.input0.shape, dtype=tf.int64), + tf.constant(self.input1, dtype=tf.float32), + tf.constant(self.input2[0], dtype=tf.int32), + tf.constant(self.input3, dtype=tf.int32), + ) + + custom_out_val = self.sess.run([custom_out]) + tf_out_val = self.sess.run([tf_out]) + + # Numerical comparison + np.testing.assert_array_equal( + custom_out_val, + tf_out_val, + err_msg="result mismatch" + ) + + def _fused_embedding_padding_fast_reference_impl(self, input0, input1, input2, input3): + cast = tf.cast(input0, tf.int32) + begin = tf.constant([0], dtype=tf.int32) + end = tf.constant([1], 
dtype=tf.int32) + strides = tf.constant([1], dtype=tf.int32) + hash_rows = tf.strided_slice(cast, begin=begin, end=end, strides=strides, shrink_axis_mask=1) + sub_out = hash_rows - input2 + const = tf.constant(10, dtype=tf.int32) + pack = tf.stack([sub_out, const], axis=0) + fill = tf.fill(pack, tf.constant(0, dtype=tf.float32)) + concat = tf.concat([input1, fill], 0) + reshape = tf.reshape(concat, input3) + shape_tensor = tf.shape(reshape) + output = tf.strided_slice(shape_tensor, begin=begin, end=end, strides=strides, shrink_axis_mask=1) + return output + + def _fused_embedding_padding_reference_impl(self, input0, input1, input2, input3): + cast = tf.cast(input0, tf.int32) + begin = tf.constant([0], dtype=tf.int32) + end = tf.constant([1], dtype=tf.int32) + strides = tf.constant([1], dtype=tf.int32) + hash_rows = tf.strided_slice(cast, begin=begin, end=end, strides=strides, shrink_axis_mask=1) + sub_out = hash_rows - input2 + const = tf.constant(10, dtype=tf.int32) + pack = tf.stack([sub_out, const], axis=0) + fill = tf.fill(pack, tf.constant(0, dtype=tf.float32)) + concat = tf.concat([input1, fill], 0) + output = tf.reshape(concat, input3) + return output + +if __name__ == "__main__": + tf.compat.v1.disable_eager_execution() + unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_dynamic_stitch_test.py b/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_dynamic_stitch_test.py new file mode 100644 index 00000000..4de55241 --- /dev/null +++ b/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_dynamic_stitch_test.py @@ -0,0 +1,97 @@ +import os +import tensorflow as tf +import numpy as np +import unittest + +from tensorflow.python.ops import gen_embedding_fused_ops + +class TestSparseSegmentMeanSlice(unittest.TestCase): + @classmethod + def setUpClass(cls): + """Initialize test data and custom op""" + # Load custom op + cls.custom_op = gen_embedding_fused_ops + + cls.variables = [] + max_val = float('inf') + for i in range(12): + N_i = np.random.randint(1000000, 44739244) + max_val = min(N_i, max_val) + var = tf.Variable( + tf.random.normal([N_i, 10], dtype=tf.float32), # shape: (N_i, 10) + name=f"embedding_table_{i}" + ) + cls.variables.append(var) + print(f"Created variable {i}: shape={var.shape}") + + x_np = np.random.randint(0, 12*max_val, size=(10000, 12)) + cls.x = tf.constant(x_np, dtype=tf.int64) + + # Create tf session + cls.sess = tf.compat.v1.Session() + cls.sess.run(tf.compat.v1.global_variables_initializer()) + + @classmethod + def tearDownClass(cls): + cls.sess.close() + + def test_base(self): + x_first = self.sess.run(self.x) + var_first = self.sess.run(self.variables[0]) + + x_second = self.sess.run(self.x) + var_second = self.sess.run(self.variables[0]) + np.testing.assert_allclose( + x_first, + x_second, + rtol=1e-6, + err_msg="Input values mismatch" + ) + + np.testing.assert_allclose( + var_first, + var_second, + rtol=1e-6, + err_msg="Input values mismatch" + ) + + # execute custom op + custom_out = self.custom_op.KPFusedSparseDynamicStitch(x=self.x, variables=self.variables) + + # tf native implementation + tf_out = self._tf_reference_impl(x=self.x, variables=self.variables) + + custom_out_val = self.sess.run([custom_out]) + tf_out_val = self.sess.run([tf_out]) + print("custom_shape: ") + print(custom_out_val[0].shape) + print("tf_out shape: ") + print(tf_out_val[0].shape) + # Numerical comparison + 
np.testing.assert_allclose( + custom_out_val[0], + tf_out_val[0], + rtol=1e-6, + err_msg="Output values mismatch" + ) + + def _tf_reference_impl(self, x, variables): + x_1 = tf.reshape(x, shape=[-1]) + group_ids = tf.math.floormod(x_1, 12) + group_ids = tf.cast(group_ids, dtype=np.int32) + chunk_indices = tf.math.floordiv(x_1, 12) + + original_indices = tf.range(0,tf.size(x_1),1) + + a = tf.dynamic_partition(original_indices, group_ids, num_partitions=12) + b = tf.dynamic_partition(chunk_indices, group_ids, num_partitions=12) + + c = [tf.gather(variables[i], b[i]) for i in range(12)] + + d = tf.dynamic_stitch(a, c) + + return d + +if __name__ == "__main__": + tf.compat.v1.disable_eager_execution() + unittest.main(argv=[''], verbosity=1) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_reshape_test.py b/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_reshape_test.py new file mode 100644 index 00000000..37d27531 --- /dev/null +++ b/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_reshape_test.py @@ -0,0 +1,102 @@ +import tensorflow as tf +import numpy as np +import unittest + +from tensorflow.python.ops import gen_embedding_fused_ops + +class TestFusedSparseReshape(unittest.TestCase): + @classmethod + def setUpClass(cls): + """Initialize test data and custom op""" + # Load custom op + cls.custom_op = gen_embedding_fused_ops + + # Base test data + cls.base_slice_input = np.array([[0, 0], [0, 1], [1, 2], [3, 4]], dtype=np.int64) + cls.base_begin = [0, 1] + cls.base_end = [0, 2] + cls.base_strides = [1, 1] + cls.base_newshape = [2, 4] + # Create tf session + cls.sess = tf.compat.v1.Session() + + @classmethod + def tearDownClass(cls): + cls.sess.close() + + def test_custom(self): + # execute custom op + custom_out1, custom_out2, = self.custom_op.KPFusedSparseReshape( + slice_input=self.base_slice_input, + begin=self.base_begin, + new_shape=self.base_newshape + ) + + # tf native implementation + tf_out1, tf_out2, tf_out3 = self._tf_reference_impl( + self.base_slice_input, + self.base_begin, + self.base_newshape + ) + + custom_out_val1, custom_out_val2 = self.sess.run([custom_out1, custom_out2]) + tf_out_val1, tf_out_val2, tf_out_val3 = self.sess.run([tf_out1, tf_out2, tf_out3]) + + print("custom_out_val1: ", custom_out_val1) + print("custom_out_val2: ", custom_out_val2) + print("tf_out_val1: ", tf_out_val1) + print("tf_out_val2: ", tf_out_val2) + + np.testing.assert_array_equal( + custom_out_val1, + tf_out_val1, + err_msg="Segment count mismatch" + ) + + np.testing.assert_array_equal( + custom_out_val2, + tf_out_val2, + err_msg="Segment count mismatch" + ) + + def _tf_reference_impl(self, slice_input, begin, new_shape): + slice67_out = tf.strided_slice( + slice_input, + begin=begin, + end=[0, 2], + strides=[1, 1], + begin_mask=1, + end_mask=1, + shrink_axis_mask=2 + ) + + slice67_out = tf.reshape(slice67_out, [-1, 1]) + shape_out = tf.shape(slice67_out) + slice57_out = tf.strided_slice( + shape_out, + begin=[0], + end=[1], + strides=[1], + shrink_axis_mask=1 + ) + + const2 = tf.constant(2) + input_shape = tf.stack([slice57_out, const2]) + input_shape = tf.cast(input_shape, tf.int64) + + range_out = tf.range(0, slice57_out, 1) + range_out = tf.reshape(range_out, [-1, 1]) + range_out_64 = tf.cast(range_out, dtype=tf.int64) + concat_out = tf.concat([range_out_64, slice67_out], axis=-1) + + sparse_tensor = tf.SparseTensor( + indices=concat_out, + values=[1,2,3,4], + 
dense_shape=input_shape + ) + sparse_tensor_out = tf.sparse.reshape(sparse_tensor, new_shape) + return sparse_tensor_out.indices, sparse_tensor_out.dense_shape, concat_out + +if __name__ == "__main__": + tf.compat.v1.disable_eager_execution() + unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_segment_reduce_test.py b/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_segment_reduce_test.py new file mode 100644 index 00000000..69c7a114 --- /dev/null +++ b/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_segment_reduce_test.py @@ -0,0 +1,134 @@ +import tensorflow as tf +import numpy as np +import unittest + +from tensorflow.python.ops import gen_embedding_fused_ops + +class TestSparseSegmentMeanSlice(unittest.TestCase): + @classmethod + def setUpClass(cls): + """Initialize test data and custom op""" + # Load custom op + cls.custom_op = gen_embedding_fused_ops + + # Base test data + cls.base_data = np.array([[1.0, 2.0, 3.0], [3.0, 4.0,5.0], [5.0, 6.0, 7.0], [5.0, 6.0, 7.0]], dtype=np.float32) # shape {4, 3} + cls.base_indices = np.array([0, 1, 2], dtype=np.int64) # shape {3} + cls.base_slice_input = np.array([[0, 0], [0, 2], [1, 2]], dtype=np.int64) # shape {3, 2} + cls.base_begin = [0, 1] + cls.base_end = [0, 2] + cls.base_strides = [1, 2] + # Create tf session + cls.sess = tf.compat.v1.Session() + + @classmethod + def tearDownClass(cls): + cls.sess.close() + + def test_mean(self): + # execute custom op + custom_out, custom_slice_out = self.custom_op.KPFusedSparseSegmentReduce( + data=self.base_data, + indices=self.base_indices, + slice_input=self.base_slice_input, + begin=self.base_begin, + end = self.base_end, + strides = self.base_strides + ) + + # tf native implementation + tf_out, tf_slice_out = self._tf_reference_impl( + self.base_data, + self.base_indices, + self.base_slice_input, + self.base_begin, + self.base_end, + self.base_strides, + True + ) + + custom_out_val, custom_slice_out_val = self.sess.run([custom_out, custom_slice_out]) + tf_out_val, tf_slice_out_val = self.sess.run([tf_out, tf_slice_out]) + + # Numerical comparison + np.testing.assert_allclose( + custom_out_val, + tf_out_val, + rtol=1e-6, + err_msg="Output values mismatch" + ) + np.testing.assert_array_equal( + custom_slice_out_val, + tf_slice_out_val, + err_msg="Segment count mismatch" + ) + + def test_sum(self): + custom_out, custom_slice_out = self.custom_op.KPFusedSparseSegmentReduce( + data=self.base_data, + indices=self.base_indices, + slice_input=self.base_slice_input, + begin=self.base_begin, + end = self.base_end, + strides = self.base_strides, + combiner=0 + ) + + tf_out, tf_slice_out = self._tf_reference_impl( + self.base_data, + self.base_indices, + self.base_slice_input, + self.base_begin, + self.base_end, + self.base_strides, + False + ) + + custom_out_val, custom_slice_out_val = self.sess.run([custom_out, custom_slice_out]) + tf_out_val, tf_slice_out_val = self.sess.run([tf_out, tf_slice_out]) + + np.testing.assert_allclose( + custom_out_val, + tf_out_val, + rtol=1e-6, + err_msg="Output values mismatch" + ) + np.testing.assert_array_equal( + custom_slice_out_val, + tf_slice_out_val, + err_msg="Segment count mismatch" + ) + + def _tf_reference_impl(self, data, indices, slice_input, begin, end, strides, is_mean): + slice_out = tf.strided_slice( + slice_input, + begin= begin, + end= end, + strides= strides, + begin_mask=1, + end_mask=1, + 
shrink_axis_mask=2 + ) + + segment_ids = tf.cast(slice_out, dtype=tf.int32) + if is_mean: + output = tf.sparse.segment_mean( + data = data, + indices = indices, + segment_ids= segment_ids + ) + else: + output = tf.sparse.segment_sum( + data = data, + indices = indices, + segment_ids= segment_ids + ) + + output_shape = tf.shape(output) + slice_out = tf.strided_slice(output_shape, begin=[0], end=[1], strides=[1]) + + return output, slice_out + +if __name__ == "__main__": + tf.compat.v1.disable_eager_execution() + unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_select.py b/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_select.py new file mode 100644 index 00000000..6740811a --- /dev/null +++ b/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_select.py @@ -0,0 +1,92 @@ +import tensorflow as tf +import numpy as np +import unittest + +from tensorflow.python.ops import gen_embedding_fused_ops + +class TestKPFusedSparseSelect(unittest.TestCase): + @classmethod + def setUpClass(cls): + """Initialize test data and custom op""" + # Load custom op + cls.custom_op = gen_embedding_fused_ops + + # Base test data + cls.input_a = np.random.randint(-10, 10, size=(40, 50)).astype(np.int32) + cls.input_b = np.random.randint(0, 20000, size=(20, 100)).astype(np.int32) + cls.input_c = np.random.randint(0, 10, size=(50, 2, 20)).astype(np.int32) + # Create tf session + cls.sess = tf.compat.v1.Session() + + @classmethod + def tearDownClass(cls): + cls.sess.close() + + def test_custom(self): + # execute custom op + output_x, output_y, output_z = self.custom_op.KPFusedSparseSelect( + input_a=tf.constant(self.input_a, dtype=tf.int32), + input_b=tf.constant(self.input_b, dtype=tf.int32), + input_c=tf.constant(self.input_c, dtype=tf.int32), + ) + + # tf native implementation + ref_output_x, ref_output_y, ref_output_z = self._tf_reference_impl( + input_a=tf.constant(self.input_a, dtype=tf.int32), + input_b=tf.constant(self.input_b, dtype=tf.int32), + input_c=tf.constant(self.input_c, dtype=tf.int32), + ) + + op_x_val, op_y_val, op_z_val = self.sess.run([output_x, output_y, output_z]) + ref_x_val, ref_y_val, ref_z_val = self.sess.run([ref_output_x, ref_output_y, ref_output_z]) + + np.testing.assert_allclose( + op_x_val, + ref_x_val, + rtol=1e-5, + err_msg="Output values mismatch" + ) + + np.testing.assert_allclose( + op_y_val, + ref_y_val, + rtol=1e-5, + err_msg="Output values mismatch" + ) + + np.testing.assert_allclose( + op_z_val, + ref_z_val, + rtol=1e-5, + err_msg="Output values mismatch" + ) + + def _tf_reference_impl(self, input_a, input_b, input_c): + a = tf.reshape(input_a, [-1, 1]) + b = tf.reshape(input_b, [-1, 1]) + c = tf.reshape(input_c, [-1, 1]) + + greater_a = tf.greater(a, 0) + shape_reshape_a1 = tf.shape(a) + shape_reshape_a2 = tf.shape(a) + fill_a1 = tf.fill(shape_reshape_a1, tf.constant(1, dtype=tf.float32)) + realdiv = tf.realdiv(fill_a1, tf.constant(1, dtype=tf.float32)) + fill_a2 = tf.fill(shape_reshape_a2, tf.constant(0, dtype=tf.float32)) + cast_a = tf.cast(greater_a, tf.float32) + shape_a = tf.shape(cast_a) + fill_a = tf.fill(shape_a, tf.constant(1, dtype=tf.float32)) + equal_4563 = tf.equal(b, 4563) + equal_10831 = tf.equal(b, 10831) + equal_3 = tf.equal(c, 3) + select_1 = tf.where(equal_4563, fill_a, cast_a) + select_2 = tf.where(equal_10831, fill_a, select_1) + sub = tf.subtract(tf.constant(1, dtype=tf.float32), select_2) + mul = 
tf.multiply(tf.constant(1, dtype=tf.float32), select_2) + select_3 = tf.where(equal_3, realdiv, fill_a1) + concat = tf.concat([mul, select_3], axis=-1) + return fill_a2, sub, concat + + +if __name__ == "__main__": + tf.compat.v1.disable_eager_execution() + unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_action_id_gather_test.py b/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_action_id_gather_test.py new file mode 100644 index 00000000..c766085c --- /dev/null +++ b/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_action_id_gather_test.py @@ -0,0 +1,67 @@ +import tensorflow as tf +import numpy as np +import unittest + +class TestFusedEmbeddingActionIdGather(unittest.TestCase): + @classmethod + def setUpClass(cls): + """Initialize test data and custom op""" + # Load custom op + cls.custom_op = tf.load_op_library('../kernels/fused_embedding_action_id_gather.so') + + # Base test data + np.random.seed(140) + indices1_shape = (8, 10) + indices2_shape = (5, 6) + params_shape = (80, 300) + cls.input0 = np.random.randint(0, params_shape[0], size=indices1_shape, dtype=np.int32) + cls.input1 = np.random.random(params_shape).astype(np.float32) + cls.input2 = np.random.randint(0, indices1_shape[0], size=indices2_shape, dtype=np.int32) + cls.input3 = params_shape[0] + # Create tf session + cls.sess = tf.compat.v1.Session() + + @classmethod + def tearDownClass(cls): + cls.sess.close() + + def test_kp_fused_embedding_action_id_gather(self): + # execute custom op + custom_out = self.custom_op.kp_fused_embedding_action_id_gather( + input0=tf.constant(self.input0, dtype=tf.int32), + input1=tf.constant(self.input1, dtype=tf.float32), + input2=tf.constant(self.input2, dtype=tf.int32), + input3=tf.constant(self.input3, dtype=tf.int32), + ) + + # tf native implementation + tf_out = self._tf_reference_impl( + input0=tf.constant(self.input0, dtype=tf.int32), + input1=tf.constant(self.input1, dtype=tf.float32), + input2=tf.constant(self.input2, dtype=tf.int32), + input3=tf.constant(self.input3, dtype=tf.int32), + ) + + custom_out_val = self.sess.run([custom_out]) + tf_out_val = self.sess.run([tf_out]) + + # Numerical comparison + np.testing.assert_array_equal( + custom_out_val, + tf_out_val, + err_msg="result mismatch" + ) + + def _tf_reference_impl(self, input0, input1, input2, input3): + gather1 = tf.gather(input1, input0, axis=0) + gather2 = tf.gather(gather1, input2, axis=0) + pack1 = tf.stack([input3, 1680], axis=0) + pack2 = tf.stack([input3, -1], axis=0) + reshape = tf.reshape(gather2, pack2) + fill = tf.fill(pack1, tf.constant(0, dtype=tf.float32)) + output = tf.concat([reshape, fill], axis=-1) + return output + +if __name__ == "__main__": + tf.compat.v1.disable_eager_execution() + unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_gather_test.py b/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_gather_test.py new file mode 100644 index 00000000..981c3e3b --- /dev/null +++ b/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_gather_test.py @@ -0,0 +1,93 @@ +import tensorflow as tf +import numpy as np +import unittest + +class TestFusedGather(unittest.TestCase): + @classmethod + def setUpClass(cls): + """Initialize test data and custom op""" + # Load custom op + cls.custom_op = tf.load_op_library('kpfusedgather.so') + + # Base test data + 
cls.base_data = np.linspace(0, 11, num=240, endpoint=False, dtype=np.float32).reshape(20, 12) + cls.base_slice_input = np.array([[0, 0], [0, 1], [1, 2]], dtype=np.int64) + cls.base_begin = [0, 1] + cls.base_end = [0, 2] + cls.base_strides = [1, 1] + # Create tf session + cls.sess = tf.compat.v1.Session() + + @classmethod + def tearDownClass(cls): + cls.sess.close() + + def test_custom(self): + # execute custom op + custom_out1, custom_out2, custom_out3= self.custom_op.KPFusedGather( + data=self.base_data, + slice_input=self.base_slice_input, + begin=self.base_begin, + ) + + # tf native implementation + tf_out1, tf_out2, tf_out3 = self._tf_reference_impl( + self.base_data, + self.base_slice_input, + self.base_begin, + ) + + custom_out_val1, custom_out_val2, custom_out_val3 = self.sess.run([custom_out1, custom_out2, custom_out3]) + tf_out_val1, tf_out_val2, tf_out_val3 = self.sess.run([tf_out1, tf_out2, tf_out3]) + + print("custom_out_val1: ", custom_out_val1) + print("custom_out_val2: ", custom_out_val2) + print("custom_out_val3: ", custom_out_val3) + print("tf_out_val1: ", tf_out_val1) + print("tf_out_val2: ", tf_out_val2) + print("tf_out_val3: ", tf_out_val3) + + np.testing.assert_array_equal( + custom_out_val1, + tf_out_val1, + err_msg="Segment count mismatch" + ) + + np.testing.assert_array_equal( + custom_out_val2, + tf_out_val2, + err_msg="Segment count mismatch" + ) + + np.testing.assert_allclose( + custom_out_val3, + tf_out_val3, + rtol=1e-6, + err_msg="Output values mismatch" + ) + + def _tf_reference_impl(self, data, slice_input, begin): + slice_out = tf.strided_slice( + slice_input, + begin = begin, + end = [tf.shape(slice_input)[0], begin[1] + 2], + strides = [1, 1], + begin_mask = 1, + end_mask = 1, + shrink_axis_mask = 2 + ) + + slice_out, slice_out_indices = tf.unique(slice_out) + output_shape = tf.shape(slice_out) + slice_out = tf.reshape(slice_out, [-1]) + slice_out, _ = tf.unique(slice_out) + + gather1_result = tf.gather(data, slice_out) + gather1_result = tf.reshape(gather1_result, [-1, 12]) + + gather2_result = tf.gather(gather1_result, slice_out) + return output_shape, slice_out_indices, gather2_result + +if __name__ == "__main__": + tf.compat.v1.disable_eager_execution() + unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_padding_test.py b/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_padding_test.py new file mode 100644 index 00000000..75dce007 --- /dev/null +++ b/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_padding_test.py @@ -0,0 +1,111 @@ +import tensorflow as tf +import numpy as np +import unittest + +class TestFusedEmbeddingPadding(unittest.TestCase): + @classmethod + def setUpClass(cls): + """Initialize test data and custom op""" + # Load custom op + cls.custom_op = tf.load_op_library('../kernels/fused_embedding_padding.so') + + # Base test data + np.random.seed(140) + cls.input0 = np.random.randint(0, 100, size=(2 * 3, 10), dtype=np.int64) + cls.input1 = np.random.rand(2 * 2, 10).astype(np.float) + cls.input2 = cls.input1.shape + cls.input3 = np.array([-1, 20]).astype(np.int32) + # Create tf session + cls.sess = tf.compat.v1.Session() + + @classmethod + def tearDownClass(cls): + cls.sess.close() + + def test_kp_fused_embedding_padding_fast(self): + # execute custom op + _, custom_out = self.custom_op.kp_fused_embedding_padding_fast( + input0=self.input0.shape, + input1=self.input1, + input2=self.input2[0], + 
input3=self.input3, + ) + + # tf native implementation + tf_out = self._fused_embedding_padding_fast_reference_impl( + tf.constant(self.input0.shape, dtype=tf.int64), + tf.constant(self.input1, dtype=tf.float32), + tf.constant(self.input2[0], dtype=tf.int32), + tf.constant(self.input3, dtype=tf.int32), + ) + + custom_out_val = self.sess.run([custom_out]) + tf_out_val = self.sess.run([tf_out]) + + # Numerical comparison + np.testing.assert_array_equal( + custom_out_val, + tf_out_val, + err_msg="result mismatch" + ) + + def test_kp_fused_embedding_padding(self): + # execute custom op + _, custom_out = self.custom_op.kp_fused_embedding_padding( + input0=self.input0.shape, + input1=self.input1, + input2=self.input2[0], + input3=self.input3, + ) + + # tf native implementation + tf_out = self._fused_embedding_padding_reference_impl( + tf.constant(self.input0.shape, dtype=tf.int64), + tf.constant(self.input1, dtype=tf.float32), + tf.constant(self.input2[0], dtype=tf.int32), + tf.constant(self.input3, dtype=tf.int32), + ) + + custom_out_val = self.sess.run([custom_out]) + tf_out_val = self.sess.run([tf_out]) + + # Numerical comparison + np.testing.assert_array_equal( + custom_out_val, + tf_out_val, + err_msg="result mismatch" + ) + + def _fused_embedding_padding_fast_reference_impl(self, input0, input1, input2, input3): + cast = tf.cast(input0, tf.int32) + begin = tf.constant([0], dtype=tf.int32) + end = tf.constant([1], dtype=tf.int32) + strides = tf.constant([1], dtype=tf.int32) + hash_rows = tf.strided_slice(cast, begin=begin, end=end, strides=strides, shrink_axis_mask=1) + sub_out = hash_rows - input2 + const = tf.constant(10, dtype=tf.int32) + pack = tf.stack([sub_out, const], axis=0) + fill = tf.fill(pack, tf.constant(0, dtype=tf.float32)) + concat = tf.concat([input1, fill], 0) + reshape = tf.reshape(concat, input3) + shape_tensor = tf.shape(reshape) + output = tf.strided_slice(shape_tensor, begin=begin, end=end, strides=strides, shrink_axis_mask=1) + return output + + def _fused_embedding_padding_reference_impl(self, input0, input1, input2, input3): + cast = tf.cast(input0, tf.int32) + begin = tf.constant([0], dtype=tf.int32) + end = tf.constant([1], dtype=tf.int32) + strides = tf.constant([1], dtype=tf.int32) + hash_rows = tf.strided_slice(cast, begin=begin, end=end, strides=strides, shrink_axis_mask=1) + sub_out = hash_rows - input2 + const = tf.constant(10, dtype=tf.int32) + pack = tf.stack([sub_out, const], axis=0) + fill = tf.fill(pack, tf.constant(0, dtype=tf.float32)) + concat = tf.concat([input1, fill], 0) + output = tf.reshape(concat, input3) + return output + +if __name__ == "__main__": + tf.compat.v1.disable_eager_execution() + unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_select.py b/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_select.py new file mode 100644 index 00000000..981c3e3b --- /dev/null +++ b/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_select.py @@ -0,0 +1,93 @@ +import tensorflow as tf +import numpy as np +import unittest + +class TestFusedGather(unittest.TestCase): + @classmethod + def setUpClass(cls): + """Initialize test data and custom op""" + # Load custom op + cls.custom_op = tf.load_op_library('kpfusedgather.so') + + # Base test data + cls.base_data = np.linspace(0, 11, num=240, endpoint=False, dtype=np.float32).reshape(20, 12) + cls.base_slice_input = np.array([[0, 0], [0, 1], [1, 2]], dtype=np.int64) + 
cls.base_begin = [0, 1] + cls.base_end = [0, 2] + cls.base_strides = [1, 1] + # Create tf session + cls.sess = tf.compat.v1.Session() + + @classmethod + def tearDownClass(cls): + cls.sess.close() + + def test_custom(self): + # execute custom op + custom_out1, custom_out2, custom_out3= self.custom_op.KPFusedGather( + data=self.base_data, + slice_input=self.base_slice_input, + begin=self.base_begin, + ) + + # tf native implementation + tf_out1, tf_out2, tf_out3 = self._tf_reference_impl( + self.base_data, + self.base_slice_input, + self.base_begin, + ) + + custom_out_val1, custom_out_val2, custom_out_val3 = self.sess.run([custom_out1, custom_out2, custom_out3]) + tf_out_val1, tf_out_val2, tf_out_val3 = self.sess.run([tf_out1, tf_out2, tf_out3]) + + print("custom_out_val1: ", custom_out_val1) + print("custom_out_val2: ", custom_out_val2) + print("custom_out_val3: ", custom_out_val3) + print("tf_out_val1: ", tf_out_val1) + print("tf_out_val2: ", tf_out_val2) + print("tf_out_val3: ", tf_out_val3) + + np.testing.assert_array_equal( + custom_out_val1, + tf_out_val1, + err_msg="Segment count mismatch" + ) + + np.testing.assert_array_equal( + custom_out_val2, + tf_out_val2, + err_msg="Segment count mismatch" + ) + + np.testing.assert_allclose( + custom_out_val3, + tf_out_val3, + rtol=1e-6, + err_msg="Output values mismatch" + ) + + def _tf_reference_impl(self, data, slice_input, begin): + slice_out = tf.strided_slice( + slice_input, + begin = begin, + end = [tf.shape(slice_input)[0], begin[1] + 2], + strides = [1, 1], + begin_mask = 1, + end_mask = 1, + shrink_axis_mask = 2 + ) + + slice_out, slice_out_indices = tf.unique(slice_out) + output_shape = tf.shape(slice_out) + slice_out = tf.reshape(slice_out, [-1]) + slice_out, _ = tf.unique(slice_out) + + gather1_result = tf.gather(data, slice_out) + gather1_result = tf.reshape(gather1_result, [-1, 12]) + + gather2_result = tf.gather(gather1_result, slice_out) + return output_shape, slice_out_indices, gather2_result + +if __name__ == "__main__": + tf.compat.v1.disable_eager_execution() + unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_dynamic_stitch_test.py b/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_dynamic_stitch_test.py new file mode 100644 index 00000000..a85f9ae7 --- /dev/null +++ b/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_dynamic_stitch_test.py @@ -0,0 +1,96 @@ +import os +os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '0' +import tensorflow as tf +import numpy as np +import unittest + +class TestSparseSegmentMeanSlice(unittest.TestCase): + @classmethod + def setUpClass(cls): + """Initialize test data and custom op""" + # Load custom op + cls.custom_op = tf.load_op_library('kp_fused_sparse_dynamic_stitch.so') + + cls.variables = [] + max_val = float('inf') + for i in range(12): + N_i = np.random.randint(1000000, 44739244) + max_val = min(N_i, max_val) + var = tf.Variable( + tf.random.normal([N_i, 10], dtype=tf.float32), # shape: (N_i, 10) + name=f"embedding_table_{i}" + ) + cls.variables.append(var) + print(f"Created variable {i}: shape={var.shape}") + + x_np = np.random.randint(0, 12*max_val, size=(10000, 12)) + cls.x = tf.constant(x_np, dtype=tf.int64) + + # Create tf session + cls.sess = tf.compat.v1.Session() + cls.sess.run(tf.compat.v1.global_variables_initializer()) + + @classmethod + def tearDownClass(cls): + cls.sess.close() + + def test_base(self): + x_first = 
self.sess.run(self.x) + var_first = self.sess.run(self.variables[0]) + + x_second = self.sess.run(self.x) + var_second = self.sess.run(self.variables[0]) + np.testing.assert_allclose( + x_first, + x_second, + rtol=1e-6, + err_msg="Input values mismatch" + ) + + np.testing.assert_allclose( + var_first, + var_second, + rtol=1e-6, + err_msg="Input values mismatch" + ) + + # execute custom op + custom_out = self.custom_op.kp_fused_sparse_dynamic_stitch(self.x, self.variables) + + # tf native implementation + tf_out = self._tf_reference_impl(self.x, self.variables) + + custom_out_val = self.sess.run([custom_out]) + tf_out_val = self.sess.run([tf_out]) + print("custom_shape: ") + print(custom_out_val[0].shape) + print("tf_out shape: ") + print(tf_out_val[0].shape) + # Numerical comparison + np.testing.assert_allclose( + custom_out_val[0], + tf_out_val[0], + rtol=1e-6, + err_msg="Output values mismatch" + ) + + def _tf_reference_impl(self, x, variables): + x_1 = tf.reshape(x, shape=[-1]) + group_ids = tf.math.floormod(x_1, 12) + group_ids = tf.cast(group_ids, dtype=np.int32) + chunk_indices = tf.math.floordiv(x_1, 12) + + original_indices = tf.range(0,tf.size(x_1),1) + + a = tf.dynamic_partition(original_indices, group_ids, num_partitions=12) + b = tf.dynamic_partition(chunk_indices, group_ids, num_partitions=12) + + c = [tf.gather(variables[i], b[i]) for i in range(12)] + + d = tf.dynamic_stitch(a, c) + + return d + +if __name__ == "__main__": + tf.compat.v1.disable_eager_execution() + unittest.main(argv=[''], verbosity=1) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_reshape_test.py b/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_reshape_test.py new file mode 100644 index 00000000..e8922699 --- /dev/null +++ b/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_reshape_test.py @@ -0,0 +1,100 @@ +import tensorflow as tf +import numpy as np +import unittest + +class TestFusedSparseReshape(unittest.TestCase): + @classmethod + def setUpClass(cls): + """Initialize test data and custom op""" + # Load custom op + cls.custom_op = tf.load_op_library('kppattern6.so') + + # Base test data + cls.base_slice_input = np.array([[0, 0], [0, 1], [1, 2], [3, 4]], dtype=np.int64) + cls.base_begin = [0, 1] + cls.base_end = [0, 2] + cls.base_strides = [1, 1] + cls.base_newshape = [2, 4] + # Create tf session + cls.sess = tf.compat.v1.Session() + + @classmethod + def tearDownClass(cls): + cls.sess.close() + + def test_custom(self): + # execute custom op + custom_out1, custom_out2, = self.custom_op.KPFusedSparseReshape( + slice_input=self.base_slice_input, + begin=self.base_begin, + new_shape=self.base_newshape + ) + + # tf native implementation + tf_out1, tf_out2, tf_out3 = self._tf_reference_impl( + self.base_slice_input, + self.base_begin, + self.base_newshape + ) + + custom_out_val1, custom_out_val2 = self.sess.run([custom_out1, custom_out2]) + tf_out_val1, tf_out_val2, tf_out_val3 = self.sess.run([tf_out1, tf_out2, tf_out3]) + + print("custom_out_val1: ", custom_out_val1) + print("custom_out_val2: ", custom_out_val2) + print("tf_out_val1: ", tf_out_val1) + print("tf_out_val2: ", tf_out_val2) + + np.testing.assert_array_equal( + custom_out_val1, + tf_out_val1, + err_msg="Segment count mismatch" + ) + + np.testing.assert_array_equal( + custom_out_val2, + tf_out_val2, + err_msg="Segment count mismatch" + ) + + def _tf_reference_impl(self, slice_input, begin, new_shape): + slice67_out = 
tf.strided_slice( + slice_input, + begin=begin, + end=[0, 2], + strides=[1, 1], + begin_mask=1, + end_mask=1, + shrink_axis_mask=2 + ) + + slice67_out = tf.reshape(slice67_out, [-1, 1]) + shape_out = tf.shape(slice67_out) + slice57_out = tf.strided_slice( + shape_out, + begin=[0], + end=[1], + strides=[1], + shrink_axis_mask=1 + ) + + const2 = tf.constant(2) + input_shape = tf.stack([slice57_out, const2]) + input_shape = tf.cast(input_shape, tf.int64) + + range_out = tf.range(0, slice57_out, 1) + range_out = tf.reshape(range_out, [-1, 1]) + range_out_64 = tf.cast(range_out, dtype=tf.int64) + concat_out = tf.concat([range_out_64, slice67_out], axis=-1) + + sparse_tensor = tf.SparseTensor( + indices=concat_out, + values=[1,2,3,4], + dense_shape=input_shape + ) + sparse_tensor_out = tf.sparse.reshape(sparse_tensor, new_shape) + return sparse_tensor_out.indices, sparse_tensor_out.dense_shape, concat_out + +if __name__ == "__main__": + tf.compat.v1.disable_eager_execution() + unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_segment_reduce_test.py b/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_segment_reduce_test.py new file mode 100644 index 00000000..dcca2f81 --- /dev/null +++ b/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_segment_reduce_test.py @@ -0,0 +1,132 @@ +import tensorflow as tf +import numpy as np +import unittest + +class TestSparseSegmentMeanSlice(unittest.TestCase): + @classmethod + def setUpClass(cls): + """Initialize test data and custom op""" + # Load custom op + cls.custom_op = tf.load_op_library('kp_fused_sparse_segment_reduce.so') + + # Base test data + cls.base_data = np.array([[1.0, 2.0, 3.0], [3.0, 4.0,5.0], [5.0, 6.0, 7.0], [5.0, 6.0, 7.0]], dtype=np.float32) # shape {4, 3} + cls.base_indices = np.array([0, 1, 2], dtype=np.int64) # shape {3} + cls.base_slice_input = np.array([[0, 0], [0, 2], [1, 2]], dtype=np.int64) # shape {3, 2} + cls.base_begin = [0, 1] + cls.base_end = [0, 2] + cls.base_strides = [1, 2] + # Create tf session + cls.sess = tf.compat.v1.Session() + + @classmethod + def tearDownClass(cls): + cls.sess.close() + + def test_mean(self): + # execute custom op + custom_out, custom_slice_out = self.custom_op.kp_fused_sparse_segment_reduce( + data=self.base_data, + indices=self.base_indices, + slice_input=self.base_slice_input, + begin=self.base_begin, + end = self.base_end, + strides = self.base_strides + ) + + # tf native implementation + tf_out, tf_slice_out = self._tf_reference_impl( + self.base_data, + self.base_indices, + self.base_slice_input, + self.base_begin, + self.base_end, + self.base_strides, + True + ) + + custom_out_val, custom_slice_out_val = self.sess.run([custom_out, custom_slice_out]) + tf_out_val, tf_slice_out_val = self.sess.run([tf_out, tf_slice_out]) + + # Numerical comparison + np.testing.assert_allclose( + custom_out_val, + tf_out_val, + rtol=1e-6, + err_msg="Output values mismatch" + ) + np.testing.assert_array_equal( + custom_slice_out_val, + tf_slice_out_val, + err_msg="Segment count mismatch" + ) + + def test_sum(self): + custom_out, custom_slice_out = self.custom_op.kp_fused_sparse_segment_reduce( + data=self.base_data, + indices=self.base_indices, + slice_input=self.base_slice_input, + begin=self.base_begin, + end = self.base_end, + strides = self.base_strides, + combiner=0 + ) + + tf_out, tf_slice_out = self._tf_reference_impl( + self.base_data, + self.base_indices, + 
self.base_slice_input, + self.base_begin, + self.base_end, + self.base_strides, + False + ) + + custom_out_val, custom_slice_out_val = self.sess.run([custom_out, custom_slice_out]) + tf_out_val, tf_slice_out_val = self.sess.run([tf_out, tf_slice_out]) + + np.testing.assert_allclose( + custom_out_val, + tf_out_val, + rtol=1e-6, + err_msg="Output values mismatch" + ) + np.testing.assert_array_equal( + custom_slice_out_val, + tf_slice_out_val, + err_msg="Segment count mismatch" + ) + + def _tf_reference_impl(self, data, indices, slice_input, begin, end, strides, is_mean): + slice_out = tf.strided_slice( + slice_input, + begin= begin, + end= end, + strides= strides, + begin_mask=1, + end_mask=1, + shrink_axis_mask=2 + ) + + segment_ids = tf.cast(slice_out, dtype=tf.int32) + if is_mean: + output = tf.sparse.segment_mean( + data = data, + indices = indices, + segment_ids= segment_ids + ) + else: + output = tf.sparse.segment_sum( + data = data, + indices = indices, + segment_ids= segment_ids + ) + + output_shape = tf.shape(output) + slice_out = tf.strided_slice(output_shape, begin=[0], end=[1], strides=[1]) + + return output, slice_out + +if __name__ == "__main__": + tf.compat.v1.disable_eager_execution() + unittest.main(argv=[''], verbosity=2) \ No newline at end of file -- Gitee From 61cce88af0de1831a94584ab00138518c843bcf2 Mon Sep 17 00:00:00 2001 From: Codersheepchen Date: Wed, 23 Jul 2025 04:24:09 -0400 Subject: [PATCH 2/9] fix KPFusedEmbeddingActionIdGather type unmatched --- .../embedding_fused_action_id_gather.cc | 25 +++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/tensorflow/core/kernels/embedding_fused_action_id_gather.cc b/tensorflow/core/kernels/embedding_fused_action_id_gather.cc index b324f35f..e20b8e54 100644 --- a/tensorflow/core/kernels/embedding_fused_action_id_gather.cc +++ b/tensorflow/core/kernels/embedding_fused_action_id_gather.cc @@ -47,15 +47,20 @@ static void GatherV2Impl(OpKernelContext* context, const int num_indices = indices_shape.num_elements(); float* temp_data = temp->flat().data(); - VLOG(1) << "num_indices : " << num_indices; - OP_REQUIRES(context, axis == 0, errors::InvalidArgument("axis only support 0")); - const int slice_size = P1; - for (int i = 0; i < num_indices; ++i) { - Tindices idx = indices_data[i]; - OP_REQUIRES(context, (idx < 0 || idx >= P0), errors::InvalidArgument("GatherV2 axis=0: index out of range")); - std::memcpy(temp_data + i * slice_size, - params_data + idx * slice_size, - sizeof(float) * slice_size); + VLOG(2) << "num_indices : " << num_indices; + if (axis == 0) { + const int slice_size = P1; + for (int i = 0; i < num_indices; ++i) { + Tindices idx = indices_data[i]; + if (idx < 0 || idx >= P0) { + LOG(FATAL) << "GatherV2 axis=0: index out of range: " << idx; + } + std::memcpy(temp_data + i * slice_size, + params_data + idx * slice_size, + sizeof(float) * slice_size); + } + } else { + LOG(FATAL) << "Only axis=0 is supported"; } VLOG(1) << "temp value : " << temp->DebugString(100); } @@ -121,4 +126,4 @@ REGISTER_CPU_KERNEL(int32, int32) REGISTER_CPU_KERNEL(int64, int64) REGISTER_CPU_KERNEL(int32, int64) -} \ No newline at end of file +} -- Gitee From 5e66f13ca1de6bd6873d69d28b4277b0cbb8219c Mon Sep 17 00:00:00 2001 From: Codersheepchen Date: Wed, 23 Jul 2025 04:31:56 -0400 Subject: [PATCH 3/9] chenge test_kp_fused_embedding_action_id_gather input0 from int32 to int64 --- .../fused_embedding_action_id_gather_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_action_id_gather_test.py b/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_action_id_gather_test.py index 94db9d42..56f24fea 100644 --- a/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_action_id_gather_test.py +++ b/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_action_id_gather_test.py @@ -16,7 +16,7 @@ class TestFusedEmbeddingActionIdGather(unittest.TestCase): indices1_shape = (8, 10) indices2_shape = (5, 6) params_shape = (80, 300) - cls.input0 = np.random.randint(0, params_shape[0], size=indices1_shape, dtype=np.int32) + cls.input0 = np.random.randint(0, params_shape[0], size=indices1_shape, dtype=np.int64) cls.input1 = np.random.random(params_shape).astype(np.float32) cls.input2 = np.random.randint(0, indices1_shape[0], size=indices2_shape, dtype=np.int32) cls.input3 = params_shape[0] @@ -30,7 +30,7 @@ class TestFusedEmbeddingActionIdGather(unittest.TestCase): def test_kp_fused_embedding_action_id_gather(self): # execute custom op custom_out = self.custom_op.KPFusedEmbeddingActionIdGather( - input0=tf.constant(self.input0, dtype=tf.int32), + input0=tf.constant(self.input0, dtype=tf.int64), input1=tf.constant(self.input1, dtype=tf.float32), input2=tf.constant(self.input2, dtype=tf.int32), input3=tf.constant(self.input3, dtype=tf.int32), @@ -38,7 +38,7 @@ class TestFusedEmbeddingActionIdGather(unittest.TestCase): # tf native implementation tf_out = self._tf_reference_impl( - input0=tf.constant(self.input0, dtype=tf.int32), + input0=tf.constant(self.input0, dtype=tf.int64), input1=tf.constant(self.input1, dtype=tf.float32), input2=tf.constant(self.input2, dtype=tf.int32), input3=tf.constant(self.input3, dtype=tf.int32), -- Gitee From cb683f7e0e9f57cfbcb58054c3341b1fe438986e Mon Sep 17 00:00:00 2001 From: Codersheepchen Date: Fri, 25 Jul 2025 03:44:12 -0400 Subject: [PATCH 4/9] add perftest and refactor functest --- .../fused_embedding_action_id_gather_test.py | 69 --------- .../fused_embedding_gather_test.py | 88 ------------ .../fused_embedding_padding_test.py | 113 --------------- ...ed_embedding_sparse_dynamic_stitch_test.py | 97 ------------- .../fused_embedding_sparse_reshape_test.py | 102 ------------- ...ed_embedding_sparse_segment_reduce_test.py | 134 ------------------ .../fused_embedding_sparse_select.py | 92 ------------ .../fused_embedding_action_id_gather_test.py | 67 --------- .../fused_embedding_gather_test.py | 93 ------------ .../fused_embedding_padding_test.py | 111 --------------- .../fused_embedding_select.py | 93 ------------ ...ed_embedding_sparse_dynamic_stitch_test.py | 96 ------------- .../fused_embedding_sparse_reshape_test.py | 100 ------------- ...ed_embedding_sparse_segment_reduce_test.py | 132 ----------------- .../fused_embedding_gather_test.py | 4 +- .../fused_embedding_sparse_select.py | 113 +++++++++++++++ 16 files changed, 115 insertions(+), 1389 deletions(-) delete mode 100644 tensorflow/python/grappler/embedding_fused_functest/fused_embedding_action_id_gather_test.py delete mode 100644 tensorflow/python/grappler/embedding_fused_functest/fused_embedding_gather_test.py delete mode 100644 tensorflow/python/grappler/embedding_fused_functest/fused_embedding_padding_test.py delete mode 100644 tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_dynamic_stitch_test.py delete mode 100644 tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_reshape_test.py delete mode 100644 
tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_segment_reduce_test.py delete mode 100644 tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_select.py delete mode 100644 tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_action_id_gather_test.py delete mode 100644 tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_gather_test.py delete mode 100644 tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_padding_test.py delete mode 100644 tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_select.py delete mode 100644 tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_dynamic_stitch_test.py delete mode 100644 tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_reshape_test.py delete mode 100644 tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_segment_reduce_test.py create mode 100644 tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_select.py diff --git a/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_action_id_gather_test.py b/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_action_id_gather_test.py deleted file mode 100644 index 56f24fea..00000000 --- a/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_action_id_gather_test.py +++ /dev/null @@ -1,69 +0,0 @@ -import tensorflow as tf -import numpy as np -import unittest - -from tensorflow.python.ops import gen_embedding_fused_ops - -class TestFusedEmbeddingActionIdGather(unittest.TestCase): - @classmethod - def setUpClass(cls): - """Initialize test data and custom op""" - # Load custom op - cls.custom_op = gen_embedding_fused_ops - - # Base test data - np.random.seed(140) - indices1_shape = (8, 10) - indices2_shape = (5, 6) - params_shape = (80, 300) - cls.input0 = np.random.randint(0, params_shape[0], size=indices1_shape, dtype=np.int64) - cls.input1 = np.random.random(params_shape).astype(np.float32) - cls.input2 = np.random.randint(0, indices1_shape[0], size=indices2_shape, dtype=np.int32) - cls.input3 = params_shape[0] - # Create tf session - cls.sess = tf.compat.v1.Session() - - @classmethod - def tearDownClass(cls): - cls.sess.close() - - def test_kp_fused_embedding_action_id_gather(self): - # execute custom op - custom_out = self.custom_op.KPFusedEmbeddingActionIdGather( - input0=tf.constant(self.input0, dtype=tf.int64), - input1=tf.constant(self.input1, dtype=tf.float32), - input2=tf.constant(self.input2, dtype=tf.int32), - input3=tf.constant(self.input3, dtype=tf.int32), - ) - - # tf native implementation - tf_out = self._tf_reference_impl( - input0=tf.constant(self.input0, dtype=tf.int64), - input1=tf.constant(self.input1, dtype=tf.float32), - input2=tf.constant(self.input2, dtype=tf.int32), - input3=tf.constant(self.input3, dtype=tf.int32), - ) - - custom_out_val = self.sess.run([custom_out]) - tf_out_val = self.sess.run([tf_out]) - - # Numerical comparison - np.testing.assert_array_equal( - custom_out_val, - tf_out_val, - err_msg="result mismatch" - ) - - def _tf_reference_impl(self, input0, input1, input2, input3): - gather1 = tf.gather(input1, input0, axis=0) - gather2 = tf.gather(gather1, input2, axis=0) - pack1 = tf.stack([input3, 1680], axis=0) - pack2 = tf.stack([input3, -1], axis=0) - reshape = tf.reshape(gather2, pack2) - fill = tf.fill(pack1, tf.constant(0, dtype=tf.float32)) - output = tf.concat([reshape, fill], axis=-1) - return output - -if __name__ == 
"__main__": - tf.compat.v1.disable_eager_execution() - unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_gather_test.py b/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_gather_test.py deleted file mode 100644 index f47b70d2..00000000 --- a/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_gather_test.py +++ /dev/null @@ -1,88 +0,0 @@ -import tensorflow as tf -import numpy as np -import unittest - -from tensorflow.python.ops import gen_embedding_fused_ops - -class TestFusedGather(unittest.TestCase): - @classmethod - def setUpClass(cls): - """Initialize test data and custom op""" - # Load custom op - cls.custom_op = gen_embedding_fused_ops - - # Base test data - cls.base_data = np.linspace(0, 11, num=240, endpoint=False, dtype=np.float32).reshape(20, 12) - cls.base_slice_input = np.array([[0, 0], [0, 1], [1, 2]], dtype=np.int64) - cls.base_begin = [0, 1] - cls.base_end = [0, 2] - cls.base_strides = [1, 1] - # Create tf session - cls.sess = tf.compat.v1.Session() - - @classmethod - def tearDownClass(cls): - cls.sess.close() - - def test_custom(self): - # execute custom op - custom_out1, custom_out2, custom_out3= self.custom_op.KPFusedGather( - data=self.base_data, - slice_input=self.base_slice_input, - begin=self.base_begin, - ) - - # tf native implementation - tf_out1, tf_out2, tf_out3 = self._tf_reference_impl( - self.base_data, - self.base_slice_input, - self.base_begin, - ) - - custom_out_val1, custom_out_val2, custom_out_val3 = self.sess.run([custom_out1, custom_out2, custom_out3]) - tf_out_val1, tf_out_val2, tf_out_val3 = self.sess.run([tf_out1, tf_out2, tf_out3]) - - np.testing.assert_array_equal( - custom_out_val1, - tf_out_val1, - err_msg="Segment count mismatch" - ) - - np.testing.assert_array_equal( - custom_out_val2, - tf_out_val2, - err_msg="Segment count mismatch" - ) - - np.testing.assert_allclose( - custom_out_val3, - tf_out_val3, - rtol=1e-6, - err_msg="Output values mismatch" - ) - - def _tf_reference_impl(self, data, slice_input, begin): - slice_out = tf.strided_slice( - slice_input, - begin = begin, - end = [tf.shape(slice_input)[0], begin[1] + 2], - strides = [1, 1], - begin_mask = 1, - end_mask = 1, - shrink_axis_mask = 2 - ) - - slice_out, slice_out_indices = tf.unique(slice_out) - output_shape = tf.shape(slice_out) - slice_out = tf.reshape(slice_out, [-1]) - slice_out, _ = tf.unique(slice_out) - - gather1_result = tf.gather(data, slice_out) - gather1_result = tf.reshape(gather1_result, [-1, 12]) - - gather2_result = tf.gather(gather1_result, slice_out) - return output_shape, slice_out_indices, gather2_result - -if __name__ == "__main__": - tf.compat.v1.disable_eager_execution() - unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_padding_test.py b/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_padding_test.py deleted file mode 100644 index 78c01e17..00000000 --- a/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_padding_test.py +++ /dev/null @@ -1,113 +0,0 @@ -import tensorflow as tf -import numpy as np -import unittest - -from tensorflow.python.ops import gen_embedding_fused_ops - -class TestFusedEmbeddingPadding(unittest.TestCase): - @classmethod - def setUpClass(cls): - """Initialize test data and custom op""" - # Load custom op - cls.custom_op = gen_embedding_fused_ops - - # Base test data - 
np.random.seed(140) - cls.input0 = np.random.randint(0, 100, size=(2 * 3, 10), dtype=np.int64) - cls.input1 = np.random.rand(2 * 2, 10).astype(np.float) - cls.input2 = cls.input1.shape - cls.input3 = np.array([-1, 20]).astype(np.int32) - # Create tf session - cls.sess = tf.compat.v1.Session() - - @classmethod - def tearDownClass(cls): - cls.sess.close() - - def test_kp_fused_embedding_padding_fast(self): - # execute custom op - _, custom_out = self.custom_op.KPFusedEmbeddingPaddingFast( - input0=self.input0.shape, - input1=self.input1, - input2=self.input2[0], - input3=self.input3, - ) - - # tf native implementation - tf_out = self._fused_embedding_padding_fast_reference_impl( - tf.constant(self.input0.shape, dtype=tf.int64), - tf.constant(self.input1, dtype=tf.float32), - tf.constant(self.input2[0], dtype=tf.int32), - tf.constant(self.input3, dtype=tf.int32), - ) - - custom_out_val = self.sess.run([custom_out]) - tf_out_val = self.sess.run([tf_out]) - - # Numerical comparison - np.testing.assert_array_equal( - custom_out_val, - tf_out_val, - err_msg="result mismatch" - ) - - def test_kp_fused_embedding_padding(self): - # execute custom op - _, custom_out = self.custom_op.KPFusedEmbeddingPadding( - input0=self.input0.shape, - input1=self.input1, - input2=self.input2[0], - input3=self.input3, - ) - - # tf native implementation - tf_out = self._fused_embedding_padding_reference_impl( - tf.constant(self.input0.shape, dtype=tf.int64), - tf.constant(self.input1, dtype=tf.float32), - tf.constant(self.input2[0], dtype=tf.int32), - tf.constant(self.input3, dtype=tf.int32), - ) - - custom_out_val = self.sess.run([custom_out]) - tf_out_val = self.sess.run([tf_out]) - - # Numerical comparison - np.testing.assert_array_equal( - custom_out_val, - tf_out_val, - err_msg="result mismatch" - ) - - def _fused_embedding_padding_fast_reference_impl(self, input0, input1, input2, input3): - cast = tf.cast(input0, tf.int32) - begin = tf.constant([0], dtype=tf.int32) - end = tf.constant([1], dtype=tf.int32) - strides = tf.constant([1], dtype=tf.int32) - hash_rows = tf.strided_slice(cast, begin=begin, end=end, strides=strides, shrink_axis_mask=1) - sub_out = hash_rows - input2 - const = tf.constant(10, dtype=tf.int32) - pack = tf.stack([sub_out, const], axis=0) - fill = tf.fill(pack, tf.constant(0, dtype=tf.float32)) - concat = tf.concat([input1, fill], 0) - reshape = tf.reshape(concat, input3) - shape_tensor = tf.shape(reshape) - output = tf.strided_slice(shape_tensor, begin=begin, end=end, strides=strides, shrink_axis_mask=1) - return output - - def _fused_embedding_padding_reference_impl(self, input0, input1, input2, input3): - cast = tf.cast(input0, tf.int32) - begin = tf.constant([0], dtype=tf.int32) - end = tf.constant([1], dtype=tf.int32) - strides = tf.constant([1], dtype=tf.int32) - hash_rows = tf.strided_slice(cast, begin=begin, end=end, strides=strides, shrink_axis_mask=1) - sub_out = hash_rows - input2 - const = tf.constant(10, dtype=tf.int32) - pack = tf.stack([sub_out, const], axis=0) - fill = tf.fill(pack, tf.constant(0, dtype=tf.float32)) - concat = tf.concat([input1, fill], 0) - output = tf.reshape(concat, input3) - return output - -if __name__ == "__main__": - tf.compat.v1.disable_eager_execution() - unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_dynamic_stitch_test.py b/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_dynamic_stitch_test.py deleted file mode 100644 
index 4de55241..00000000 --- a/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_dynamic_stitch_test.py +++ /dev/null @@ -1,97 +0,0 @@ -import os -import tensorflow as tf -import numpy as np -import unittest - -from tensorflow.python.ops import gen_embedding_fused_ops - -class TestSparseSegmentMeanSlice(unittest.TestCase): - @classmethod - def setUpClass(cls): - """Initialize test data and custom op""" - # Load custom op - cls.custom_op = gen_embedding_fused_ops - - cls.variables = [] - max_val = float('inf') - for i in range(12): - N_i = np.random.randint(1000000, 44739244) - max_val = min(N_i, max_val) - var = tf.Variable( - tf.random.normal([N_i, 10], dtype=tf.float32), # shape: (N_i, 10) - name=f"embedding_table_{i}" - ) - cls.variables.append(var) - print(f"Created variable {i}: shape={var.shape}") - - x_np = np.random.randint(0, 12*max_val, size=(10000, 12)) - cls.x = tf.constant(x_np, dtype=tf.int64) - - # Create tf session - cls.sess = tf.compat.v1.Session() - cls.sess.run(tf.compat.v1.global_variables_initializer()) - - @classmethod - def tearDownClass(cls): - cls.sess.close() - - def test_base(self): - x_first = self.sess.run(self.x) - var_first = self.sess.run(self.variables[0]) - - x_second = self.sess.run(self.x) - var_second = self.sess.run(self.variables[0]) - np.testing.assert_allclose( - x_first, - x_second, - rtol=1e-6, - err_msg="Input values mismatch" - ) - - np.testing.assert_allclose( - var_first, - var_second, - rtol=1e-6, - err_msg="Input values mismatch" - ) - - # execute custom op - custom_out = self.custom_op.KPFusedSparseDynamicStitch(x=self.x, variables=self.variables) - - # tf native implementation - tf_out = self._tf_reference_impl(x=self.x, variables=self.variables) - - custom_out_val = self.sess.run([custom_out]) - tf_out_val = self.sess.run([tf_out]) - print("custom_shape: ") - print(custom_out_val[0].shape) - print("tf_out shape: ") - print(tf_out_val[0].shape) - # Numerical comparison - np.testing.assert_allclose( - custom_out_val[0], - tf_out_val[0], - rtol=1e-6, - err_msg="Output values mismatch" - ) - - def _tf_reference_impl(self, x, variables): - x_1 = tf.reshape(x, shape=[-1]) - group_ids = tf.math.floormod(x_1, 12) - group_ids = tf.cast(group_ids, dtype=np.int32) - chunk_indices = tf.math.floordiv(x_1, 12) - - original_indices = tf.range(0,tf.size(x_1),1) - - a = tf.dynamic_partition(original_indices, group_ids, num_partitions=12) - b = tf.dynamic_partition(chunk_indices, group_ids, num_partitions=12) - - c = [tf.gather(variables[i], b[i]) for i in range(12)] - - d = tf.dynamic_stitch(a, c) - - return d - -if __name__ == "__main__": - tf.compat.v1.disable_eager_execution() - unittest.main(argv=[''], verbosity=1) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_reshape_test.py b/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_reshape_test.py deleted file mode 100644 index 37d27531..00000000 --- a/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_reshape_test.py +++ /dev/null @@ -1,102 +0,0 @@ -import tensorflow as tf -import numpy as np -import unittest - -from tensorflow.python.ops import gen_embedding_fused_ops - -class TestFusedSparseReshape(unittest.TestCase): - @classmethod - def setUpClass(cls): - """Initialize test data and custom op""" - # Load custom op - cls.custom_op = gen_embedding_fused_ops - - # Base test data - cls.base_slice_input = np.array([[0, 0], [0, 1], [1, 2], [3, 4]], dtype=np.int64) 
- cls.base_begin = [0, 1] - cls.base_end = [0, 2] - cls.base_strides = [1, 1] - cls.base_newshape = [2, 4] - # Create tf session - cls.sess = tf.compat.v1.Session() - - @classmethod - def tearDownClass(cls): - cls.sess.close() - - def test_custom(self): - # execute custom op - custom_out1, custom_out2, = self.custom_op.KPFusedSparseReshape( - slice_input=self.base_slice_input, - begin=self.base_begin, - new_shape=self.base_newshape - ) - - # tf native implementation - tf_out1, tf_out2, tf_out3 = self._tf_reference_impl( - self.base_slice_input, - self.base_begin, - self.base_newshape - ) - - custom_out_val1, custom_out_val2 = self.sess.run([custom_out1, custom_out2]) - tf_out_val1, tf_out_val2, tf_out_val3 = self.sess.run([tf_out1, tf_out2, tf_out3]) - - print("custom_out_val1: ", custom_out_val1) - print("custom_out_val2: ", custom_out_val2) - print("tf_out_val1: ", tf_out_val1) - print("tf_out_val2: ", tf_out_val2) - - np.testing.assert_array_equal( - custom_out_val1, - tf_out_val1, - err_msg="Segment count mismatch" - ) - - np.testing.assert_array_equal( - custom_out_val2, - tf_out_val2, - err_msg="Segment count mismatch" - ) - - def _tf_reference_impl(self, slice_input, begin, new_shape): - slice67_out = tf.strided_slice( - slice_input, - begin=begin, - end=[0, 2], - strides=[1, 1], - begin_mask=1, - end_mask=1, - shrink_axis_mask=2 - ) - - slice67_out = tf.reshape(slice67_out, [-1, 1]) - shape_out = tf.shape(slice67_out) - slice57_out = tf.strided_slice( - shape_out, - begin=[0], - end=[1], - strides=[1], - shrink_axis_mask=1 - ) - - const2 = tf.constant(2) - input_shape = tf.stack([slice57_out, const2]) - input_shape = tf.cast(input_shape, tf.int64) - - range_out = tf.range(0, slice57_out, 1) - range_out = tf.reshape(range_out, [-1, 1]) - range_out_64 = tf.cast(range_out, dtype=tf.int64) - concat_out = tf.concat([range_out_64, slice67_out], axis=-1) - - sparse_tensor = tf.SparseTensor( - indices=concat_out, - values=[1,2,3,4], - dense_shape=input_shape - ) - sparse_tensor_out = tf.sparse.reshape(sparse_tensor, new_shape) - return sparse_tensor_out.indices, sparse_tensor_out.dense_shape, concat_out - -if __name__ == "__main__": - tf.compat.v1.disable_eager_execution() - unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_segment_reduce_test.py b/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_segment_reduce_test.py deleted file mode 100644 index 69c7a114..00000000 --- a/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_segment_reduce_test.py +++ /dev/null @@ -1,134 +0,0 @@ -import tensorflow as tf -import numpy as np -import unittest - -from tensorflow.python.ops import gen_embedding_fused_ops - -class TestSparseSegmentMeanSlice(unittest.TestCase): - @classmethod - def setUpClass(cls): - """Initialize test data and custom op""" - # Load custom op - cls.custom_op = gen_embedding_fused_ops - - # Base test data - cls.base_data = np.array([[1.0, 2.0, 3.0], [3.0, 4.0,5.0], [5.0, 6.0, 7.0], [5.0, 6.0, 7.0]], dtype=np.float32) # shape {4, 3} - cls.base_indices = np.array([0, 1, 2], dtype=np.int64) # shape {3} - cls.base_slice_input = np.array([[0, 0], [0, 2], [1, 2]], dtype=np.int64) # shape {3, 2} - cls.base_begin = [0, 1] - cls.base_end = [0, 2] - cls.base_strides = [1, 2] - # Create tf session - cls.sess = tf.compat.v1.Session() - - @classmethod - def tearDownClass(cls): - cls.sess.close() - - def test_mean(self): - # execute custom op - 
custom_out, custom_slice_out = self.custom_op.KPFusedSparseSegmentReduce( - data=self.base_data, - indices=self.base_indices, - slice_input=self.base_slice_input, - begin=self.base_begin, - end = self.base_end, - strides = self.base_strides - ) - - # tf native implementation - tf_out, tf_slice_out = self._tf_reference_impl( - self.base_data, - self.base_indices, - self.base_slice_input, - self.base_begin, - self.base_end, - self.base_strides, - True - ) - - custom_out_val, custom_slice_out_val = self.sess.run([custom_out, custom_slice_out]) - tf_out_val, tf_slice_out_val = self.sess.run([tf_out, tf_slice_out]) - - # Numerical comparison - np.testing.assert_allclose( - custom_out_val, - tf_out_val, - rtol=1e-6, - err_msg="Output values mismatch" - ) - np.testing.assert_array_equal( - custom_slice_out_val, - tf_slice_out_val, - err_msg="Segment count mismatch" - ) - - def test_sum(self): - custom_out, custom_slice_out = self.custom_op.KPFusedSparseSegmentReduce( - data=self.base_data, - indices=self.base_indices, - slice_input=self.base_slice_input, - begin=self.base_begin, - end = self.base_end, - strides = self.base_strides, - combiner=0 - ) - - tf_out, tf_slice_out = self._tf_reference_impl( - self.base_data, - self.base_indices, - self.base_slice_input, - self.base_begin, - self.base_end, - self.base_strides, - False - ) - - custom_out_val, custom_slice_out_val = self.sess.run([custom_out, custom_slice_out]) - tf_out_val, tf_slice_out_val = self.sess.run([tf_out, tf_slice_out]) - - np.testing.assert_allclose( - custom_out_val, - tf_out_val, - rtol=1e-6, - err_msg="Output values mismatch" - ) - np.testing.assert_array_equal( - custom_slice_out_val, - tf_slice_out_val, - err_msg="Segment count mismatch" - ) - - def _tf_reference_impl(self, data, indices, slice_input, begin, end, strides, is_mean): - slice_out = tf.strided_slice( - slice_input, - begin= begin, - end= end, - strides= strides, - begin_mask=1, - end_mask=1, - shrink_axis_mask=2 - ) - - segment_ids = tf.cast(slice_out, dtype=tf.int32) - if is_mean: - output = tf.sparse.segment_mean( - data = data, - indices = indices, - segment_ids= segment_ids - ) - else: - output = tf.sparse.segment_sum( - data = data, - indices = indices, - segment_ids= segment_ids - ) - - output_shape = tf.shape(output) - slice_out = tf.strided_slice(output_shape, begin=[0], end=[1], strides=[1]) - - return output, slice_out - -if __name__ == "__main__": - tf.compat.v1.disable_eager_execution() - unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_select.py b/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_select.py deleted file mode 100644 index 6740811a..00000000 --- a/tensorflow/python/grappler/embedding_fused_functest/fused_embedding_sparse_select.py +++ /dev/null @@ -1,92 +0,0 @@ -import tensorflow as tf -import numpy as np -import unittest - -from tensorflow.python.ops import gen_embedding_fused_ops - -class TestKPFusedSparseSelect(unittest.TestCase): - @classmethod - def setUpClass(cls): - """Initialize test data and custom op""" - # Load custom op - cls.custom_op = gen_embedding_fused_ops - - # Base test data - cls.input_a = np.random.randint(-10, 10, size=(40, 50)).astype(np.int32) - cls.input_b = np.random.randint(0, 20000, size=(20, 100)).astype(np.int32) - cls.input_c = np.random.randint(0, 10, size=(50, 2, 20)).astype(np.int32) - # Create tf session - cls.sess = tf.compat.v1.Session() - - @classmethod - def 
tearDownClass(cls): - cls.sess.close() - - def test_custom(self): - # execute custom op - output_x, output_y, output_z = self.custom_op.KPFusedSparseSelect( - input_a=tf.constant(self.input_a, dtype=tf.int32), - input_b=tf.constant(self.input_b, dtype=tf.int32), - input_c=tf.constant(self.input_c, dtype=tf.int32), - ) - - # tf native implementation - ref_output_x, ref_output_y, ref_output_z = self._tf_reference_impl( - input_a=tf.constant(self.input_a, dtype=tf.int32), - input_b=tf.constant(self.input_b, dtype=tf.int32), - input_c=tf.constant(self.input_c, dtype=tf.int32), - ) - - op_x_val, op_y_val, op_z_val = self.sess.run([output_x, output_y, output_z]) - ref_x_val, ref_y_val, ref_z_val = self.sess.run([ref_output_x, ref_output_y, ref_output_z]) - - np.testing.assert_allclose( - op_x_val, - ref_x_val, - rtol=1e-5, - err_msg="Output values mismatch" - ) - - np.testing.assert_allclose( - op_y_val, - ref_y_val, - rtol=1e-5, - err_msg="Output values mismatch" - ) - - np.testing.assert_allclose( - op_z_val, - ref_z_val, - rtol=1e-5, - err_msg="Output values mismatch" - ) - - def _tf_reference_impl(self, input_a, input_b, input_c): - a = tf.reshape(input_a, [-1, 1]) - b = tf.reshape(input_b, [-1, 1]) - c = tf.reshape(input_c, [-1, 1]) - - greater_a = tf.greater(a, 0) - shape_reshape_a1 = tf.shape(a) - shape_reshape_a2 = tf.shape(a) - fill_a1 = tf.fill(shape_reshape_a1, tf.constant(1, dtype=tf.float32)) - realdiv = tf.realdiv(fill_a1, tf.constant(1, dtype=tf.float32)) - fill_a2 = tf.fill(shape_reshape_a2, tf.constant(0, dtype=tf.float32)) - cast_a = tf.cast(greater_a, tf.float32) - shape_a = tf.shape(cast_a) - fill_a = tf.fill(shape_a, tf.constant(1, dtype=tf.float32)) - equal_4563 = tf.equal(b, 4563) - equal_10831 = tf.equal(b, 10831) - equal_3 = tf.equal(c, 3) - select_1 = tf.where(equal_4563, fill_a, cast_a) - select_2 = tf.where(equal_10831, fill_a, select_1) - sub = tf.subtract(tf.constant(1, dtype=tf.float32), select_2) - mul = tf.multiply(tf.constant(1, dtype=tf.float32), select_2) - select_3 = tf.where(equal_3, realdiv, fill_a1) - concat = tf.concat([mul, select_3], axis=-1) - return fill_a2, sub, concat - - -if __name__ == "__main__": - tf.compat.v1.disable_eager_execution() - unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_action_id_gather_test.py b/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_action_id_gather_test.py deleted file mode 100644 index c766085c..00000000 --- a/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_action_id_gather_test.py +++ /dev/null @@ -1,67 +0,0 @@ -import tensorflow as tf -import numpy as np -import unittest - -class TestFusedEmbeddingActionIdGather(unittest.TestCase): - @classmethod - def setUpClass(cls): - """Initialize test data and custom op""" - # Load custom op - cls.custom_op = tf.load_op_library('../kernels/fused_embedding_action_id_gather.so') - - # Base test data - np.random.seed(140) - indices1_shape = (8, 10) - indices2_shape = (5, 6) - params_shape = (80, 300) - cls.input0 = np.random.randint(0, params_shape[0], size=indices1_shape, dtype=np.int32) - cls.input1 = np.random.random(params_shape).astype(np.float32) - cls.input2 = np.random.randint(0, indices1_shape[0], size=indices2_shape, dtype=np.int32) - cls.input3 = params_shape[0] - # Create tf session - cls.sess = tf.compat.v1.Session() - - @classmethod - def tearDownClass(cls): - cls.sess.close() - - def 
test_kp_fused_embedding_action_id_gather(self): - # execute custom op - custom_out = self.custom_op.kp_fused_embedding_action_id_gather( - input0=tf.constant(self.input0, dtype=tf.int32), - input1=tf.constant(self.input1, dtype=tf.float32), - input2=tf.constant(self.input2, dtype=tf.int32), - input3=tf.constant(self.input3, dtype=tf.int32), - ) - - # tf native implementation - tf_out = self._tf_reference_impl( - input0=tf.constant(self.input0, dtype=tf.int32), - input1=tf.constant(self.input1, dtype=tf.float32), - input2=tf.constant(self.input2, dtype=tf.int32), - input3=tf.constant(self.input3, dtype=tf.int32), - ) - - custom_out_val = self.sess.run([custom_out]) - tf_out_val = self.sess.run([tf_out]) - - # Numerical comparison - np.testing.assert_array_equal( - custom_out_val, - tf_out_val, - err_msg="result mismatch" - ) - - def _tf_reference_impl(self, input0, input1, input2, input3): - gather1 = tf.gather(input1, input0, axis=0) - gather2 = tf.gather(gather1, input2, axis=0) - pack1 = tf.stack([input3, 1680], axis=0) - pack2 = tf.stack([input3, -1], axis=0) - reshape = tf.reshape(gather2, pack2) - fill = tf.fill(pack1, tf.constant(0, dtype=tf.float32)) - output = tf.concat([reshape, fill], axis=-1) - return output - -if __name__ == "__main__": - tf.compat.v1.disable_eager_execution() - unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_gather_test.py b/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_gather_test.py deleted file mode 100644 index 981c3e3b..00000000 --- a/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_gather_test.py +++ /dev/null @@ -1,93 +0,0 @@ -import tensorflow as tf -import numpy as np -import unittest - -class TestFusedGather(unittest.TestCase): - @classmethod - def setUpClass(cls): - """Initialize test data and custom op""" - # Load custom op - cls.custom_op = tf.load_op_library('kpfusedgather.so') - - # Base test data - cls.base_data = np.linspace(0, 11, num=240, endpoint=False, dtype=np.float32).reshape(20, 12) - cls.base_slice_input = np.array([[0, 0], [0, 1], [1, 2]], dtype=np.int64) - cls.base_begin = [0, 1] - cls.base_end = [0, 2] - cls.base_strides = [1, 1] - # Create tf session - cls.sess = tf.compat.v1.Session() - - @classmethod - def tearDownClass(cls): - cls.sess.close() - - def test_custom(self): - # execute custom op - custom_out1, custom_out2, custom_out3= self.custom_op.KPFusedGather( - data=self.base_data, - slice_input=self.base_slice_input, - begin=self.base_begin, - ) - - # tf native implementation - tf_out1, tf_out2, tf_out3 = self._tf_reference_impl( - self.base_data, - self.base_slice_input, - self.base_begin, - ) - - custom_out_val1, custom_out_val2, custom_out_val3 = self.sess.run([custom_out1, custom_out2, custom_out3]) - tf_out_val1, tf_out_val2, tf_out_val3 = self.sess.run([tf_out1, tf_out2, tf_out3]) - - print("custom_out_val1: ", custom_out_val1) - print("custom_out_val2: ", custom_out_val2) - print("custom_out_val3: ", custom_out_val3) - print("tf_out_val1: ", tf_out_val1) - print("tf_out_val2: ", tf_out_val2) - print("tf_out_val3: ", tf_out_val3) - - np.testing.assert_array_equal( - custom_out_val1, - tf_out_val1, - err_msg="Segment count mismatch" - ) - - np.testing.assert_array_equal( - custom_out_val2, - tf_out_val2, - err_msg="Segment count mismatch" - ) - - np.testing.assert_allclose( - custom_out_val3, - tf_out_val3, - rtol=1e-6, - err_msg="Output values mismatch" - ) - - def 
_tf_reference_impl(self, data, slice_input, begin): - slice_out = tf.strided_slice( - slice_input, - begin = begin, - end = [tf.shape(slice_input)[0], begin[1] + 2], - strides = [1, 1], - begin_mask = 1, - end_mask = 1, - shrink_axis_mask = 2 - ) - - slice_out, slice_out_indices = tf.unique(slice_out) - output_shape = tf.shape(slice_out) - slice_out = tf.reshape(slice_out, [-1]) - slice_out, _ = tf.unique(slice_out) - - gather1_result = tf.gather(data, slice_out) - gather1_result = tf.reshape(gather1_result, [-1, 12]) - - gather2_result = tf.gather(gather1_result, slice_out) - return output_shape, slice_out_indices, gather2_result - -if __name__ == "__main__": - tf.compat.v1.disable_eager_execution() - unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_padding_test.py b/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_padding_test.py deleted file mode 100644 index 75dce007..00000000 --- a/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_padding_test.py +++ /dev/null @@ -1,111 +0,0 @@ -import tensorflow as tf -import numpy as np -import unittest - -class TestFusedEmbeddingPadding(unittest.TestCase): - @classmethod - def setUpClass(cls): - """Initialize test data and custom op""" - # Load custom op - cls.custom_op = tf.load_op_library('../kernels/fused_embedding_padding.so') - - # Base test data - np.random.seed(140) - cls.input0 = np.random.randint(0, 100, size=(2 * 3, 10), dtype=np.int64) - cls.input1 = np.random.rand(2 * 2, 10).astype(np.float) - cls.input2 = cls.input1.shape - cls.input3 = np.array([-1, 20]).astype(np.int32) - # Create tf session - cls.sess = tf.compat.v1.Session() - - @classmethod - def tearDownClass(cls): - cls.sess.close() - - def test_kp_fused_embedding_padding_fast(self): - # execute custom op - _, custom_out = self.custom_op.kp_fused_embedding_padding_fast( - input0=self.input0.shape, - input1=self.input1, - input2=self.input2[0], - input3=self.input3, - ) - - # tf native implementation - tf_out = self._fused_embedding_padding_fast_reference_impl( - tf.constant(self.input0.shape, dtype=tf.int64), - tf.constant(self.input1, dtype=tf.float32), - tf.constant(self.input2[0], dtype=tf.int32), - tf.constant(self.input3, dtype=tf.int32), - ) - - custom_out_val = self.sess.run([custom_out]) - tf_out_val = self.sess.run([tf_out]) - - # Numerical comparison - np.testing.assert_array_equal( - custom_out_val, - tf_out_val, - err_msg="result mismatch" - ) - - def test_kp_fused_embedding_padding(self): - # execute custom op - _, custom_out = self.custom_op.kp_fused_embedding_padding( - input0=self.input0.shape, - input1=self.input1, - input2=self.input2[0], - input3=self.input3, - ) - - # tf native implementation - tf_out = self._fused_embedding_padding_reference_impl( - tf.constant(self.input0.shape, dtype=tf.int64), - tf.constant(self.input1, dtype=tf.float32), - tf.constant(self.input2[0], dtype=tf.int32), - tf.constant(self.input3, dtype=tf.int32), - ) - - custom_out_val = self.sess.run([custom_out]) - tf_out_val = self.sess.run([tf_out]) - - # Numerical comparison - np.testing.assert_array_equal( - custom_out_val, - tf_out_val, - err_msg="result mismatch" - ) - - def _fused_embedding_padding_fast_reference_impl(self, input0, input1, input2, input3): - cast = tf.cast(input0, tf.int32) - begin = tf.constant([0], dtype=tf.int32) - end = tf.constant([1], dtype=tf.int32) - strides = tf.constant([1], dtype=tf.int32) - hash_rows = 
tf.strided_slice(cast, begin=begin, end=end, strides=strides, shrink_axis_mask=1) - sub_out = hash_rows - input2 - const = tf.constant(10, dtype=tf.int32) - pack = tf.stack([sub_out, const], axis=0) - fill = tf.fill(pack, tf.constant(0, dtype=tf.float32)) - concat = tf.concat([input1, fill], 0) - reshape = tf.reshape(concat, input3) - shape_tensor = tf.shape(reshape) - output = tf.strided_slice(shape_tensor, begin=begin, end=end, strides=strides, shrink_axis_mask=1) - return output - - def _fused_embedding_padding_reference_impl(self, input0, input1, input2, input3): - cast = tf.cast(input0, tf.int32) - begin = tf.constant([0], dtype=tf.int32) - end = tf.constant([1], dtype=tf.int32) - strides = tf.constant([1], dtype=tf.int32) - hash_rows = tf.strided_slice(cast, begin=begin, end=end, strides=strides, shrink_axis_mask=1) - sub_out = hash_rows - input2 - const = tf.constant(10, dtype=tf.int32) - pack = tf.stack([sub_out, const], axis=0) - fill = tf.fill(pack, tf.constant(0, dtype=tf.float32)) - concat = tf.concat([input1, fill], 0) - output = tf.reshape(concat, input3) - return output - -if __name__ == "__main__": - tf.compat.v1.disable_eager_execution() - unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_select.py b/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_select.py deleted file mode 100644 index 981c3e3b..00000000 --- a/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_select.py +++ /dev/null @@ -1,93 +0,0 @@ -import tensorflow as tf -import numpy as np -import unittest - -class TestFusedGather(unittest.TestCase): - @classmethod - def setUpClass(cls): - """Initialize test data and custom op""" - # Load custom op - cls.custom_op = tf.load_op_library('kpfusedgather.so') - - # Base test data - cls.base_data = np.linspace(0, 11, num=240, endpoint=False, dtype=np.float32).reshape(20, 12) - cls.base_slice_input = np.array([[0, 0], [0, 1], [1, 2]], dtype=np.int64) - cls.base_begin = [0, 1] - cls.base_end = [0, 2] - cls.base_strides = [1, 1] - # Create tf session - cls.sess = tf.compat.v1.Session() - - @classmethod - def tearDownClass(cls): - cls.sess.close() - - def test_custom(self): - # execute custom op - custom_out1, custom_out2, custom_out3= self.custom_op.KPFusedGather( - data=self.base_data, - slice_input=self.base_slice_input, - begin=self.base_begin, - ) - - # tf native implementation - tf_out1, tf_out2, tf_out3 = self._tf_reference_impl( - self.base_data, - self.base_slice_input, - self.base_begin, - ) - - custom_out_val1, custom_out_val2, custom_out_val3 = self.sess.run([custom_out1, custom_out2, custom_out3]) - tf_out_val1, tf_out_val2, tf_out_val3 = self.sess.run([tf_out1, tf_out2, tf_out3]) - - print("custom_out_val1: ", custom_out_val1) - print("custom_out_val2: ", custom_out_val2) - print("custom_out_val3: ", custom_out_val3) - print("tf_out_val1: ", tf_out_val1) - print("tf_out_val2: ", tf_out_val2) - print("tf_out_val3: ", tf_out_val3) - - np.testing.assert_array_equal( - custom_out_val1, - tf_out_val1, - err_msg="Segment count mismatch" - ) - - np.testing.assert_array_equal( - custom_out_val2, - tf_out_val2, - err_msg="Segment count mismatch" - ) - - np.testing.assert_allclose( - custom_out_val3, - tf_out_val3, - rtol=1e-6, - err_msg="Output values mismatch" - ) - - def _tf_reference_impl(self, data, slice_input, begin): - slice_out = tf.strided_slice( - slice_input, - begin = begin, - end = [tf.shape(slice_input)[0], begin[1] + 2], - 
strides = [1, 1], - begin_mask = 1, - end_mask = 1, - shrink_axis_mask = 2 - ) - - slice_out, slice_out_indices = tf.unique(slice_out) - output_shape = tf.shape(slice_out) - slice_out = tf.reshape(slice_out, [-1]) - slice_out, _ = tf.unique(slice_out) - - gather1_result = tf.gather(data, slice_out) - gather1_result = tf.reshape(gather1_result, [-1, 12]) - - gather2_result = tf.gather(gather1_result, slice_out) - return output_shape, slice_out_indices, gather2_result - -if __name__ == "__main__": - tf.compat.v1.disable_eager_execution() - unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_dynamic_stitch_test.py b/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_dynamic_stitch_test.py deleted file mode 100644 index a85f9ae7..00000000 --- a/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_dynamic_stitch_test.py +++ /dev/null @@ -1,96 +0,0 @@ -import os -os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '0' -import tensorflow as tf -import numpy as np -import unittest - -class TestSparseSegmentMeanSlice(unittest.TestCase): - @classmethod - def setUpClass(cls): - """Initialize test data and custom op""" - # Load custom op - cls.custom_op = tf.load_op_library('kp_fused_sparse_dynamic_stitch.so') - - cls.variables = [] - max_val = float('inf') - for i in range(12): - N_i = np.random.randint(1000000, 44739244) - max_val = min(N_i, max_val) - var = tf.Variable( - tf.random.normal([N_i, 10], dtype=tf.float32), # shape: (N_i, 10) - name=f"embedding_table_{i}" - ) - cls.variables.append(var) - print(f"Created variable {i}: shape={var.shape}") - - x_np = np.random.randint(0, 12*max_val, size=(10000, 12)) - cls.x = tf.constant(x_np, dtype=tf.int64) - - # Create tf session - cls.sess = tf.compat.v1.Session() - cls.sess.run(tf.compat.v1.global_variables_initializer()) - - @classmethod - def tearDownClass(cls): - cls.sess.close() - - def test_base(self): - x_first = self.sess.run(self.x) - var_first = self.sess.run(self.variables[0]) - - x_second = self.sess.run(self.x) - var_second = self.sess.run(self.variables[0]) - np.testing.assert_allclose( - x_first, - x_second, - rtol=1e-6, - err_msg="Input values mismatch" - ) - - np.testing.assert_allclose( - var_first, - var_second, - rtol=1e-6, - err_msg="Input values mismatch" - ) - - # execute custom op - custom_out = self.custom_op.kp_fused_sparse_dynamic_stitch(self.x, self.variables) - - # tf native implementation - tf_out = self._tf_reference_impl(self.x, self.variables) - - custom_out_val = self.sess.run([custom_out]) - tf_out_val = self.sess.run([tf_out]) - print("custom_shape: ") - print(custom_out_val[0].shape) - print("tf_out shape: ") - print(tf_out_val[0].shape) - # Numerical comparison - np.testing.assert_allclose( - custom_out_val[0], - tf_out_val[0], - rtol=1e-6, - err_msg="Output values mismatch" - ) - - def _tf_reference_impl(self, x, variables): - x_1 = tf.reshape(x, shape=[-1]) - group_ids = tf.math.floormod(x_1, 12) - group_ids = tf.cast(group_ids, dtype=np.int32) - chunk_indices = tf.math.floordiv(x_1, 12) - - original_indices = tf.range(0,tf.size(x_1),1) - - a = tf.dynamic_partition(original_indices, group_ids, num_partitions=12) - b = tf.dynamic_partition(chunk_indices, group_ids, num_partitions=12) - - c = [tf.gather(variables[i], b[i]) for i in range(12)] - - d = tf.dynamic_stitch(a, c) - - return d - -if __name__ == "__main__": - tf.compat.v1.disable_eager_execution() - unittest.main(argv=[''], 
verbosity=1) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_reshape_test.py b/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_reshape_test.py deleted file mode 100644 index e8922699..00000000 --- a/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_reshape_test.py +++ /dev/null @@ -1,100 +0,0 @@ -import tensorflow as tf -import numpy as np -import unittest - -class TestFusedSparseReshape(unittest.TestCase): - @classmethod - def setUpClass(cls): - """Initialize test data and custom op""" - # Load custom op - cls.custom_op = tf.load_op_library('kppattern6.so') - - # Base test data - cls.base_slice_input = np.array([[0, 0], [0, 1], [1, 2], [3, 4]], dtype=np.int64) - cls.base_begin = [0, 1] - cls.base_end = [0, 2] - cls.base_strides = [1, 1] - cls.base_newshape = [2, 4] - # Create tf session - cls.sess = tf.compat.v1.Session() - - @classmethod - def tearDownClass(cls): - cls.sess.close() - - def test_custom(self): - # execute custom op - custom_out1, custom_out2, = self.custom_op.KPFusedSparseReshape( - slice_input=self.base_slice_input, - begin=self.base_begin, - new_shape=self.base_newshape - ) - - # tf native implementation - tf_out1, tf_out2, tf_out3 = self._tf_reference_impl( - self.base_slice_input, - self.base_begin, - self.base_newshape - ) - - custom_out_val1, custom_out_val2 = self.sess.run([custom_out1, custom_out2]) - tf_out_val1, tf_out_val2, tf_out_val3 = self.sess.run([tf_out1, tf_out2, tf_out3]) - - print("custom_out_val1: ", custom_out_val1) - print("custom_out_val2: ", custom_out_val2) - print("tf_out_val1: ", tf_out_val1) - print("tf_out_val2: ", tf_out_val2) - - np.testing.assert_array_equal( - custom_out_val1, - tf_out_val1, - err_msg="Segment count mismatch" - ) - - np.testing.assert_array_equal( - custom_out_val2, - tf_out_val2, - err_msg="Segment count mismatch" - ) - - def _tf_reference_impl(self, slice_input, begin, new_shape): - slice67_out = tf.strided_slice( - slice_input, - begin=begin, - end=[0, 2], - strides=[1, 1], - begin_mask=1, - end_mask=1, - shrink_axis_mask=2 - ) - - slice67_out = tf.reshape(slice67_out, [-1, 1]) - shape_out = tf.shape(slice67_out) - slice57_out = tf.strided_slice( - shape_out, - begin=[0], - end=[1], - strides=[1], - shrink_axis_mask=1 - ) - - const2 = tf.constant(2) - input_shape = tf.stack([slice57_out, const2]) - input_shape = tf.cast(input_shape, tf.int64) - - range_out = tf.range(0, slice57_out, 1) - range_out = tf.reshape(range_out, [-1, 1]) - range_out_64 = tf.cast(range_out, dtype=tf.int64) - concat_out = tf.concat([range_out_64, slice67_out], axis=-1) - - sparse_tensor = tf.SparseTensor( - indices=concat_out, - values=[1,2,3,4], - dense_shape=input_shape - ) - sparse_tensor_out = tf.sparse.reshape(sparse_tensor, new_shape) - return sparse_tensor_out.indices, sparse_tensor_out.dense_shape, concat_out - -if __name__ == "__main__": - tf.compat.v1.disable_eager_execution() - unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_segment_reduce_test.py b/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_segment_reduce_test.py deleted file mode 100644 index dcca2f81..00000000 --- a/tensorflow/python/grappler/embedding_fused_perftest/fused_embedding_sparse_segment_reduce_test.py +++ /dev/null @@ -1,132 +0,0 @@ -import tensorflow as tf -import numpy as np -import unittest - -class 
TestSparseSegmentMeanSlice(unittest.TestCase): - @classmethod - def setUpClass(cls): - """Initialize test data and custom op""" - # Load custom op - cls.custom_op = tf.load_op_library('kp_fused_sparse_segment_reduce.so') - - # Base test data - cls.base_data = np.array([[1.0, 2.0, 3.0], [3.0, 4.0,5.0], [5.0, 6.0, 7.0], [5.0, 6.0, 7.0]], dtype=np.float32) # shape {4, 3} - cls.base_indices = np.array([0, 1, 2], dtype=np.int64) # shape {3} - cls.base_slice_input = np.array([[0, 0], [0, 2], [1, 2]], dtype=np.int64) # shape {3, 2} - cls.base_begin = [0, 1] - cls.base_end = [0, 2] - cls.base_strides = [1, 2] - # Create tf session - cls.sess = tf.compat.v1.Session() - - @classmethod - def tearDownClass(cls): - cls.sess.close() - - def test_mean(self): - # execute custom op - custom_out, custom_slice_out = self.custom_op.kp_fused_sparse_segment_reduce( - data=self.base_data, - indices=self.base_indices, - slice_input=self.base_slice_input, - begin=self.base_begin, - end = self.base_end, - strides = self.base_strides - ) - - # tf native implementation - tf_out, tf_slice_out = self._tf_reference_impl( - self.base_data, - self.base_indices, - self.base_slice_input, - self.base_begin, - self.base_end, - self.base_strides, - True - ) - - custom_out_val, custom_slice_out_val = self.sess.run([custom_out, custom_slice_out]) - tf_out_val, tf_slice_out_val = self.sess.run([tf_out, tf_slice_out]) - - # Numerical comparison - np.testing.assert_allclose( - custom_out_val, - tf_out_val, - rtol=1e-6, - err_msg="Output values mismatch" - ) - np.testing.assert_array_equal( - custom_slice_out_val, - tf_slice_out_val, - err_msg="Segment count mismatch" - ) - - def test_sum(self): - custom_out, custom_slice_out = self.custom_op.kp_fused_sparse_segment_reduce( - data=self.base_data, - indices=self.base_indices, - slice_input=self.base_slice_input, - begin=self.base_begin, - end = self.base_end, - strides = self.base_strides, - combiner=0 - ) - - tf_out, tf_slice_out = self._tf_reference_impl( - self.base_data, - self.base_indices, - self.base_slice_input, - self.base_begin, - self.base_end, - self.base_strides, - False - ) - - custom_out_val, custom_slice_out_val = self.sess.run([custom_out, custom_slice_out]) - tf_out_val, tf_slice_out_val = self.sess.run([tf_out, tf_slice_out]) - - np.testing.assert_allclose( - custom_out_val, - tf_out_val, - rtol=1e-6, - err_msg="Output values mismatch" - ) - np.testing.assert_array_equal( - custom_slice_out_val, - tf_slice_out_val, - err_msg="Segment count mismatch" - ) - - def _tf_reference_impl(self, data, indices, slice_input, begin, end, strides, is_mean): - slice_out = tf.strided_slice( - slice_input, - begin= begin, - end= end, - strides= strides, - begin_mask=1, - end_mask=1, - shrink_axis_mask=2 - ) - - segment_ids = tf.cast(slice_out, dtype=tf.int32) - if is_mean: - output = tf.sparse.segment_mean( - data = data, - indices = indices, - segment_ids= segment_ids - ) - else: - output = tf.sparse.segment_sum( - data = data, - indices = indices, - segment_ids= segment_ids - ) - - output_shape = tf.shape(output) - slice_out = tf.strided_slice(output_shape, begin=[0], end=[1], strides=[1]) - - return output, slice_out - -if __name__ == "__main__": - tf.compat.v1.disable_eager_execution() - unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_gather_test.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_gather_test.py index 1c73adc1..70b6f539 100644 --- 
a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_gather_test.py +++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_gather_test.py @@ -73,7 +73,7 @@ class TestFusedGather(unittest.TestCase): ) slice_out, slice_out_indices = tf.unique(slice_out) - output_shape = slice_out + output_shape = tf.shape(slice_out) slice_out = tf.reshape(slice_out, [-1]) slice_out, _ = tf.unique(slice_out) @@ -85,4 +85,4 @@ class TestFusedGather(unittest.TestCase): if __name__ == "__main__": tf.compat.v1.disable_eager_execution() - unittest.main(argv=[''], verbosity=2) \ No newline at end of file + unittest.main(argv=[''], verbosity=2) diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_select.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_select.py new file mode 100644 index 00000000..d37128cb --- /dev/null +++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_select.py @@ -0,0 +1,113 @@ +import tensorflow as tf +import numpy as np +import unittest + +from tensorflow.python.ops import gen_embedding_fused_ops +from utils.utils import perf_run, generate_timeline, wrapper_sess + + +def ori_fused_embedding_sparse_select_graph(input_a, input_b, input_c): + a = tf.reshape(input_a, [-1, 1]) + b = tf.reshape(input_b, [-1, 1]) + c = tf.reshape(input_c, [-1, 1]) + + greater_a = tf.greater(a, 0) + shape_reshape_a1 = tf.shape(a) + shape_reshape_a2 = tf.shape(a) + fill_a1 = tf.fill(shape_reshape_a1, tf.constant(1, dtype=tf.float32)) + realdiv = tf.realdiv(fill_a1, tf.constant(1, dtype=tf.float32)) + output_x = tf.fill(shape_reshape_a2, tf.constant(0, dtype=tf.float32)) + cast_a = tf.cast(greater_a, tf.float32) + shape_a = tf.shape(cast_a) + fill_a = tf.fill(shape_a, tf.constant(1, dtype=tf.float32)) + equal_4563 = tf.equal(b, 4563) + equal_10831 = tf.equal(b, 10831) + equal_3 = tf.equal(c, 3) + select_1 = tf.where(equal_4563, fill_a, cast_a) + select_2 = tf.where(equal_10831, fill_a, select_1) + output_y = tf.subtract(tf.constant(1, dtype=tf.float32), select_2) + mul = tf.multiply(tf.constant(1, dtype=tf.float32), select_2) + select_3 = tf.where(equal_3, realdiv, fill_a1) + output_z = tf.concat([mul, select_3], axis=-1) + return output_x, output_y, output_z + + +def opt_fused_embedding_sparse_select_graph(input_a, input_b, input_c): + output_x, output_y, output_z = gen_embedding_fused_ops.KPFusedSparseSelect( + input_a=input_a, input_b=input_b, input_c=input_c + ) + return output_x, output_y, output_z + + +class TestKPFusedSparseSelect(unittest.TestCase): + @classmethod + def setUpClass(cls): + """Initialize config""" + cls.config = tf.compat.v1.ConfigProto() + cls.config.intra_op_parallelism_threads = 16 + cls.config.inter_op_parallelism_threads = 16 + + cls.run_options = tf.compat.v1.RunOptions(trace_level=tf.compat.v1.RunOptions.FULL_TRACE) + cls.run_metadata_ori = tf.compat.v1.RunMetadata() + cls.run_metadata_opt = tf.compat.v1.RunMetadata() + + @classmethod + def tearDownClass(cls): + return + + def test_fused_embedding_sparse_select(self): + # Create Graph + with tf.Graph().as_default(): + input0 = tf.compat.v1.placeholder(tf.int32, shape=[None, None], name="input_a") + input1 = tf.compat.v1.placeholder(tf.int32, shape=[None, None], name="input_b") + input2 = tf.compat.v1.placeholder(tf.int32, shape=[None, None], name="input_c") + """Initialize test data""" + feed = { + input0: np.random.randint(0, 100, size=(100, 10)).astype(np.int32), + input1: np.random.randint(0, 100, size=(10, 
100)).astype(np.int32), + input2: np.random.randint(0, 100, size=(20, 50)).astype(np.int32), + } + with tf.name_scope("ori"): + out0_ori, out1_ori, out2_ori = ori_fused_embedding_sparse_select_graph(input0, input1, input2) + with tf.name_scope("opt"): + out0_opt, out1_opt, out2_opt = opt_fused_embedding_sparse_select_graph(input0, input1, input2) + + # Create tf session + with tf.compat.v1.Session(config=self.config) as sess: + # functest + out0_ori_val, out1_ori_val, out2_ori_val = sess.run([out0_ori, out1_ori, out2_ori], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_ori) + out0_opt_val, out1_opt_val, out2_opt_val = sess.run([out0_opt, out1_opt, out2_opt], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_opt) + + np.testing.assert_allclose( + out0_ori_val, + out0_opt_val, + rtol=1e-5, + err_msg="Output values mismatch" + ) + + np.testing.assert_allclose( + out1_ori_val, + out1_opt_val, + rtol=1e-5, + err_msg="Output values mismatch" + ) + + np.testing.assert_allclose( + out2_ori_val, + out2_opt_val, + rtol=1e-5, + err_msg="Output values mismatch" + ) + + generate_timeline(self.run_metadata_ori.step_stats, f"{self._testMethodName}_ori") + generate_timeline(self.run_metadata_opt.step_stats, f"{self._testMethodName}_opt") + + # perftest + perf_run(wrapper_sess(sess, [out0_ori, out1_ori, out2_ori], feed_dict=feed), + wrapper_sess(sess, [out0_opt, out1_opt, out2_opt], feed_dict=feed), + "KPFusedEmbeddingSparseSelect") + + +if __name__ == "__main__": + tf.compat.v1.disable_eager_execution() + unittest.main(argv=[''], verbosity=2) \ No newline at end of file -- Gitee From 6ea0ce42cbf7833f8ef7d850cbef44a9bb42ee61 Mon Sep 17 00:00:00 2001 From: rayshine <1324789704@qq.com> Date: Fri, 15 Aug 2025 09:49:57 +0800 Subject: [PATCH 5/9] =?UTF-8?q?=E4=BF=AE=E6=94=B9=20fused=20embedding?= =?UTF-8?q?=E7=AE=97=E5=AD=90=E7=9A=84=E6=97=B6=E5=BB=B6=E6=B5=8B=E8=AF=95?= =?UTF-8?q?=E6=96=B9=E5=BC=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../fused_embedding_action_id_gather_test.py | 51 ++- .../fused_embedding_gather_test.py | 219 ++++++++----- .../fused_embedding_padding_test.py | 107 +++++-- ...ed_embedding_sparse_dynamic_stitch_test.py | 196 ++++++++---- .../fused_embedding_sparse_reshape_test.py | 227 ++++++++----- ...ed_embedding_sparse_segment_reduce_test.py | 297 +++++++++++------- .../fused_embedding_sparse_select.py | 113 ------- .../fused_embedding_sparse_select_test.py | 73 +++-- .../embedding_fused_test/utils/utils.py | 30 +- 9 files changed, 820 insertions(+), 493 deletions(-) delete mode 100644 tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_select.py diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_action_id_gather_test.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_action_id_gather_test.py index d20628b0..1fa83bc9 100644 --- a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_action_id_gather_test.py +++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_action_id_gather_test.py @@ -3,7 +3,7 @@ import numpy as np import unittest from tensorflow.python.ops import gen_embedding_fused_ops -from utils.utils import perf_run, generate_timeline, wrapper_sess +from utils.utils import perf_run, generate_timeline, wrapper_sess, extract_op_dur, extract_op_total_time def ori_fused_embedding_action_id_gather_graph(input0, input1, input2, input3): @@ -33,7 +33,7 @@ class 
TestFusedEmbeddingActionIdGather(unittest.TestCase): """Initialize config""" cls.config = tf.compat.v1.ConfigProto() cls.config.intra_op_parallelism_threads = 16 - cls.config.inter_op_parallelism_threads = 16 + cls.config.inter_op_parallelism_threads = 1 cls.run_options = tf.compat.v1.RunOptions(trace_level=tf.compat.v1.RunOptions.FULL_TRACE) cls.run_metadata_ori = tf.compat.v1.RunMetadata() @@ -76,15 +76,48 @@ class TestFusedEmbeddingActionIdGather(unittest.TestCase): out_opt_val, err_msg="result mismatch" ) + + op_name = "KPFusedEmbeddingActionIdGather" + TF_origin = "----------TF_origin-----------" + print("-" * 60) + print("-" * 60) + print("new test") + # 多次生成 timeline 并统计平均值 + num_runs = 1000 + start_op = "ori/stack_1" + end_op = "ori/concat" + total_times_ori = 0 + total_times_opt = 0 + for i in range(num_runs): + sess.run( + [out_ori], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_ori + ) + sess.run( + [out_opt], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_opt + ) + filename_ori = f"{op_name}_ori" + filename_opt = f"{op_name}_opt" + generate_timeline(self.run_metadata_ori.step_stats, filename_ori) + generate_timeline(self.run_metadata_opt.step_stats, filename_opt) - generate_timeline(self.run_metadata_ori.step_stats, f"{self._testMethodName}_ori") - generate_timeline(self.run_metadata_opt.step_stats, f"{self._testMethodName}_opt") + total_times_ori += extract_op_total_time(filename_ori, start_op, end_op) + total_times_opt += extract_op_dur(filename_opt, op_name) - # perftest - perf_run(wrapper_sess(sess, [out_ori], feed_dict=feed), - wrapper_sess(sess, [out_opt], feed_dict=feed), - "KPFusedEmbeddingActionIdGather") - + avg_ori = total_times_ori / num_runs + avg_opt = total_times_opt / num_runs + speedup = (avg_ori - avg_opt) / avg_ori * 100 + + print(f"{TF_origin}: {avg_ori:.4f} us per run") + print(f"{op_name}: {avg_opt:.4f} us per run") + print(f"improve: {speedup:.2f}%") + + if __name__ == "__main__": tf.compat.v1.disable_eager_execution() diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_gather_test.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_gather_test.py index 70b6f539..f87ee1b5 100644 --- a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_gather_test.py +++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_gather_test.py @@ -3,85 +3,160 @@ import numpy as np import unittest from tensorflow.python.ops import gen_embedding_fused_ops +from utils.utils import perf_run, generate_timeline, wrapper_sess, extract_op_dur, extract_op_total_time + + +def ori_fused_embedding_gather_graph(data, slice_input, begin): + slice_out = tf.strided_slice( + slice_input, + begin = begin, + end = [tf.shape(slice_input)[0], begin[1] + 2], + strides = [1, 1], + begin_mask = 1, + end_mask = 1, + shrink_axis_mask = 2 + ) + + slice_out, slice_out_indices = tf.unique(slice_out) + output_shape = tf.shape(slice_out) + slice_out = tf.reshape(slice_out, [-1]) + slice_out, _ = tf.unique(slice_out) + + gather1_result = tf.gather(data, slice_out) + gather1_result = tf.reshape(gather1_result, [-1, 12]) + + gather2_result = tf.gather(gather1_result, slice_out) + return output_shape, slice_out_indices, gather2_result + + +def opt_fused_embedding_gather_graph(data, slice_input, begin): + custom_out1, custom_out2, custom_out3 = gen_embedding_fused_ops.KPFusedGather( + data=data, + slice_input=slice_input, + begin=begin + ) + return custom_out1, custom_out2, 
custom_out3 + class TestFusedGather(unittest.TestCase): @classmethod def setUpClass(cls): - """Initialize test data and custom op""" - # Load custom op - cls.custom_op = gen_embedding_fused_ops - - # Base test data - cls.base_data = np.linspace(0, 11, num=240, endpoint=False, dtype=np.float32).reshape(20, 12) - cls.base_slice_input = np.array([[0, 0], [0, 1], [1, 2]], dtype=np.int64) - cls.base_begin = [0, 1] - cls.base_end = [0, 2] - cls.base_strides = [1, 1] - # Create tf session - cls.sess = tf.compat.v1.Session() + """Initialize""" + cls.config = tf.compat.v1.ConfigProto() + cls.config.intra_op_parallelism_threads = 16 + cls.config.inter_op_parallelism_threads = 1 + + cls.run_options = tf.compat.v1.RunOptions(trace_level=tf.compat.v1.RunOptions.FULL_TRACE) + cls.run_metadata_ori = tf.compat.v1.RunMetadata() + cls.run_metadata_opt = tf.compat.v1.RunMetadata() @classmethod def tearDownClass(cls): - cls.sess.close() - - def test_custom(self): - # execute custom op - custom_out1, custom_out2, custom_out3= self.custom_op.KPFusedGather( - data=self.base_data, - slice_input=self.base_slice_input, - begin=self.base_begin, - ) - - # tf native implementation - tf_out1, tf_out2, tf_out3 = self._tf_reference_impl( - self.base_data, - self.base_slice_input, - self.base_begin, - ) - - custom_out_val1, custom_out_val2, custom_out_val3 = self.sess.run([custom_out1, custom_out2, custom_out3]) - tf_out_val1, tf_out_val2, tf_out_val3 = self.sess.run([tf_out1, tf_out2, tf_out3]) - - np.testing.assert_array_equal( - custom_out_val1, - tf_out_val1, - err_msg="Segment count mismatch" - ) - - np.testing.assert_array_equal( - custom_out_val2, - tf_out_val2, - err_msg="Segment count mismatch" - ) - - np.testing.assert_allclose( - custom_out_val3, - tf_out_val3, - rtol=1e-6, - err_msg="Output values mismatch" - ) - - def _tf_reference_impl(self, data, slice_input, begin): - slice_out = tf.strided_slice( - slice_input, - begin = begin, - end = [tf.shape(slice_input)[0], begin[1] + 2], - strides = [1, 1], - begin_mask = 1, - end_mask = 1, - shrink_axis_mask = 2 - ) - - slice_out, slice_out_indices = tf.unique(slice_out) - output_shape = tf.shape(slice_out) - slice_out = tf.reshape(slice_out, [-1]) - slice_out, _ = tf.unique(slice_out) - - gather1_result = tf.gather(data, slice_out) - gather1_result = tf.reshape(gather1_result, [-1, 12]) - - gather2_result = tf.gather(gather1_result, slice_out) - return output_shape, slice_out_indices, gather2_result + return + + def test_kp_embedding_gather(self): + with tf.Graph().as_default(): + data = tf.compat.v1.placeholder(tf.float32, shape=(20, 12), name="data") + slice_input = tf.compat.v1.placeholder(tf.int64, shape=(3, 2), name="slice_input") + begin = tf.compat.v1.placeholder(tf.int32, name="begin") + base_data = np.linspace(0, 11, num=240, endpoint=False, dtype=np.float32).reshape(20, 12) + base_slice_input = np.array([[0, 0], [0, 1], [1, 2]], dtype=np.int64) + base_begin = [0, 1] + feed = { + data: base_data, + slice_input: base_slice_input, + begin: base_begin + } + # original graph + with tf.name_scope("ori"): + out_ori1, out_ori2, out_ori3 = ori_fused_embedding_gather_graph( + data, + slice_input, + begin + ) + + # optimized graph + with tf.name_scope("opt"): + out_opt1, out_opt2, out_opt3 = opt_fused_embedding_gather_graph( + data=data, + slice_input=slice_input, + begin=begin + ) + + with tf.compat.v1.Session(config=self.config) as sess: + # run ori + out_ori_val1, out_ori_val2, out_ori_val3 = sess.run( + [out_ori1, out_ori2, out_ori3], + feed_dict=feed, + 
options=self.run_options, + run_metadata=self.run_metadata_ori + ) + # run opt + out_opt_val1, out_opt_val2, out_opt_val3 = sess.run( + [out_opt1, out_opt2, out_opt3], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_opt + ) + # 功能测试 + np.testing.assert_array_equal( + out_ori_val1, + out_opt_val1, + err_msg="Segment count mismatch" + ) + + np.testing.assert_array_equal( + out_ori_val2, + out_opt_val2, + err_msg="Segment count mismatch" + ) + np.testing.assert_allclose( + out_opt_val3, + out_ori_val3, + rtol=1e-6, + err_msg="Output values mismatch" + ) + + op_name = "KPFusedGather" + TF_origin = "--TF_origin--" + print("-" * 60) + print("-" * 60) + print("new test") + # 多次生成 timeline 并统计平均值 + num_runs = 1000 + start_op = "ori/strided_slice_1" + end_op = "ori/GatherV2_1" + total_times_ori = 0 + total_times_opt = 0 + for i in range(num_runs): + sess.run( + [out_ori1, out_ori2, out_ori3], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_ori + ) + sess.run( + [out_opt1, out_opt2], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_opt + ) + filename_ori = f"{op_name}_ori" + filename_opt = f"{op_name}_opt" + generate_timeline(self.run_metadata_ori.step_stats, filename_ori) + generate_timeline(self.run_metadata_opt.step_stats, filename_opt) + + total_times_ori += extract_op_total_time(filename_ori, start_op, end_op) + total_times_opt += extract_op_dur(filename_opt, op_name) + + avg_ori = total_times_ori / num_runs + avg_opt = total_times_opt / num_runs + speedup = (avg_ori - avg_opt) / avg_ori * 100 + + print(f"{TF_origin}: {avg_ori:.4f} us per run") + print(f"{op_name}: {avg_opt:.4f} us per run") + print(f"improve: {speedup:.2f}%") + if __name__ == "__main__": tf.compat.v1.disable_eager_execution() diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_padding_test.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_padding_test.py index b9950e51..ba6a87cd 100644 --- a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_padding_test.py +++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_padding_test.py @@ -3,15 +3,15 @@ import numpy as np import unittest from tensorflow.python.ops import gen_embedding_fused_ops -from utils.utils import perf_run, generate_timeline, wrapper_sess +from utils.utils import perf_run, generate_timeline, wrapper_sess, extract_op_dur, extract_op_total_time np.random.seed(140) def opt_fused_embedding_padding_fast_graph(input0, input1, input2, input3): - # execute custom op - _, custom_out = gen_embedding_fused_ops.kp_fused_embedding_padding_fast(input0, input1, input2, input3) - return custom_out + # execute custom op + _, custom_out = gen_embedding_fused_ops.kp_fused_embedding_padding_fast(input0, input1, input2, input3) + return custom_out def opt_fused_embedding_padding_graph(input0, input1, input2, input3): # execute custom op @@ -55,7 +55,7 @@ class TestFusedEmbeddingPadding(unittest.TestCase): """Initialize config""" cls.config = tf.compat.v1.ConfigProto() cls.config.intra_op_parallelism_threads = 16 - cls.config.inter_op_parallelism_threads = 16 + cls.config.inter_op_parallelism_threads = 1 cls.run_options = tf.compat.v1.RunOptions(trace_level=tf.compat.v1.RunOptions.FULL_TRACE) cls.run_metadata_ori = tf.compat.v1.RunMetadata() @@ -95,20 +95,47 @@ class TestFusedEmbeddingPadding(unittest.TestCase): opt_result, err_msg="result mismatch" ) - - from tensorflow.python.client import timeline - tl_ori = 
timeline.Timeline(self.run_metadata_ori.step_stats) - tl_opt = timeline.Timeline(self.run_metadata_opt.step_stats) - ctf_ori = tl_ori.generate_chrome_trace_format() - ctf_opt = tl_opt.generate_chrome_trace_format() - - with open("timeline_ori.json", "w") as f: - f.write(ctf_ori) - with open("timeline_opt.json", "w") as f: - f.write(ctf_opt) - # perftest - perf_run(wrapper_sess(sess, [out_ori], feed), wrapper_sess(sess, [out_opt], feed_dict=feed), "KPFusedEmbeddingPadding") + op_name = "KPFusedEmbeddingPadding" + TF_origin = "-------TF_origin-------" + print("-" * 60) + print("-" * 60) + print("new test") + # 多次生成 timeline 并统计平均值 + num_runs = 1000 + start_op = "ori/Cast" + end_op = "ori/Reshape" + total_times_ori = 0 + total_times_opt = 0 + for i in range(num_runs): + sess.run( + [out_ori], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_ori + ) + sess.run( + [out_opt], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_opt + ) + filename_ori = f"{op_name}_ori" + filename_opt = f"{op_name}_opt" + generate_timeline(self.run_metadata_ori.step_stats, filename_ori) + generate_timeline(self.run_metadata_opt.step_stats, filename_opt) + + total_times_ori += extract_op_total_time(filename_ori, start_op, end_op) + total_times_opt += extract_op_dur(filename_opt, op_name) + + avg_ori = total_times_ori / num_runs + avg_opt = total_times_opt / num_runs + speedup = (avg_ori - avg_opt) / avg_ori * 100 + + print(f"{TF_origin}: {avg_ori:.4f} us per run") + print(f"{op_name}: {avg_opt:.4f} us per run") + print(f"improve: {speedup:.2f}%") + def test_func_kp_fused_embedding_padding_fast(self): # Create Graph @@ -141,11 +168,45 @@ class TestFusedEmbeddingPadding(unittest.TestCase): err_msg="result mismatch" ) - generate_timeline(self.run_metadata_ori.step_stats, f"{self._testMethodName}_ori") - generate_timeline(self.run_metadata_opt.step_stats, f"{self._testMethodName}_opt") - - # perftest - perf_run(wrapper_sess(sess, [out_ori], feed), wrapper_sess(sess, [out_opt], feed_dict=feed), "KPFusedEmbeddingPaddingFast") + op_name = "KPFusedEmbeddingPaddingFast" + TF_origin = "---------TF_origin---------" + print("-" * 60) + print("-" * 60) + print("new test") + # 多次生成 timeline 并统计平均值 + num_runs = 1000 + start_op = "ori/Cast" + end_op = "ori/StridedSlice_1" + total_times_ori = 0 + total_times_opt = 0 + for i in range(num_runs): + sess.run( + [out_ori], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_ori + ) + sess.run( + [out_opt], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_opt + ) + filename_ori = f"{op_name}_ori" + filename_opt = f"{op_name}_opt" + generate_timeline(self.run_metadata_ori.step_stats, filename_ori) + generate_timeline(self.run_metadata_opt.step_stats, filename_opt) + + total_times_ori += extract_op_total_time(filename_ori, start_op, end_op) + total_times_opt += extract_op_dur(filename_opt, op_name) + + avg_ori = total_times_ori / num_runs + avg_opt = total_times_opt / num_runs + speedup = (avg_ori - avg_opt) / avg_ori * 100 + + print(f"{TF_origin}: {avg_ori:.4f} us per run") + print(f"{op_name}: {avg_opt:.4f} us per run") + print(f"improve: {speedup:.2f}%") if __name__ == "__main__": tf.compat.v1.disable_eager_execution() diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_dynamic_stitch_test.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_dynamic_stitch_test.py index 4de55241..5a81692c 100644 --- 
a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_dynamic_stitch_test.py +++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_dynamic_stitch_test.py @@ -4,28 +4,41 @@ import numpy as np import unittest from tensorflow.python.ops import gen_embedding_fused_ops +from utils.utils import perf_run, generate_timeline, wrapper_sess, extract_op_dur, extract_op_total_time -class TestSparseSegmentMeanSlice(unittest.TestCase): + +def ori_fused_sparse_dynamic_stitch_graph(x, emb_tables): + x_1 = tf.reshape(x, shape=[-1]) # 将输入 x 展平成一维向量 x_1 + group_ids = tf.math.floormod(x_1, 12) + group_ids = tf.cast(group_ids, dtype=np.int32) + chunk_indices = tf.math.floordiv(x_1, 12) + original_indices = tf.range(0, tf.size(x_1), 1) + a = tf.dynamic_partition(original_indices, group_ids, num_partitions=12) + b = tf.dynamic_partition(chunk_indices, group_ids, num_partitions=12) + c = [tf.gather(emb_tables[i], b[i]) for i in range(12)] + d = tf.dynamic_stitch(a, c) + return d + + +def opt_fused_sparse_dynamic_stitch_graph(x, emb_tables): + output = gen_embedding_fused_ops.KPFusedSparseDynamicStitch( + x = x, + variables = emb_tables + ) + return output + + +class TestSparseDynamicStitch(unittest.TestCase): @classmethod def setUpClass(cls): - """Initialize test data and custom op""" - # Load custom op - cls.custom_op = gen_embedding_fused_ops + """Initialize config""" + cls.config = tf.compat.v1.ConfigProto() + cls.config.intra_op_parallelism_threads = 16 + cls.config.inter_op_parallelism_threads = 1 - cls.variables = [] - max_val = float('inf') - for i in range(12): - N_i = np.random.randint(1000000, 44739244) - max_val = min(N_i, max_val) - var = tf.Variable( - tf.random.normal([N_i, 10], dtype=tf.float32), # shape: (N_i, 10) - name=f"embedding_table_{i}" - ) - cls.variables.append(var) - print(f"Created variable {i}: shape={var.shape}") - - x_np = np.random.randint(0, 12*max_val, size=(10000, 12)) - cls.x = tf.constant(x_np, dtype=tf.int64) + cls.run_options = tf.compat.v1.RunOptions(trace_level=tf.compat.v1.RunOptions.FULL_TRACE) + cls.run_metadata_ori = tf.compat.v1.RunMetadata() + cls.run_metadata_opt = tf.compat.v1.RunMetadata() # Create tf session cls.sess = tf.compat.v1.Session() @@ -36,11 +49,29 @@ class TestSparseSegmentMeanSlice(unittest.TestCase): cls.sess.close() def test_base(self): - x_first = self.sess.run(self.x) - var_first = self.sess.run(self.variables[0]) + variables = [] + max_val = float('inf') + for i in range(12): + N_i = np.random.randint(100000, 4473924) + max_val = min(N_i, max_val) + var = tf.Variable( + tf.random.normal([N_i, 10], dtype=tf.float32), # shape: (N_i, 10) + name=f"embedding_{i}" + ) + variables.append(var) + # print(f"Created variable {i}: shape={var.shape}") + + x_np = np.random.randint(0, 12*max_val, size=(10000, 12)) + x = tf.constant(x_np, dtype=tf.int64) + + self.sess.run(tf.compat.v1.variables_initializer(variables)) + + x_first = self.sess.run(x) + var_first = self.sess.run(variables[0]) - x_second = self.sess.run(self.x) - var_second = self.sess.run(self.variables[0]) + x_second = self.sess.run(x) + var_second = self.sess.run(variables[0]) + np.testing.assert_allclose( x_first, x_second, @@ -55,42 +86,93 @@ class TestSparseSegmentMeanSlice(unittest.TestCase): err_msg="Input values mismatch" ) - # execute custom op - custom_out = self.custom_op.KPFusedSparseDynamicStitch(x=self.x, variables=self.variables) - - # tf native implementation - tf_out = self._tf_reference_impl(x=self.x, variables=self.variables) - - 
custom_out_val = self.sess.run([custom_out]) - tf_out_val = self.sess.run([tf_out]) - print("custom_shape: ") - print(custom_out_val[0].shape) - print("tf_out shape: ") - print(tf_out_val[0].shape) - # Numerical comparison - np.testing.assert_allclose( - custom_out_val[0], - tf_out_val[0], - rtol=1e-6, - err_msg="Output values mismatch" - ) - - def _tf_reference_impl(self, x, variables): - x_1 = tf.reshape(x, shape=[-1]) - group_ids = tf.math.floormod(x_1, 12) - group_ids = tf.cast(group_ids, dtype=np.int32) - chunk_indices = tf.math.floordiv(x_1, 12) - - original_indices = tf.range(0,tf.size(x_1),1) - - a = tf.dynamic_partition(original_indices, group_ids, num_partitions=12) - b = tf.dynamic_partition(chunk_indices, group_ids, num_partitions=12) - - c = [tf.gather(variables[i], b[i]) for i in range(12)] - - d = tf.dynamic_stitch(a, c) + def test_kp_sparse_dynamic_stitch(self): + # Create Graph + with tf.Graph().as_default(): + num_tables = 12 + emb_dim = 10 + max_val = float('inf') + # 每张表的 placeholder,行数随机生成 + tables = [] + table_sizes = [] + for i in range(num_tables): + N_i = np.random.randint(1000000, 44739244) + table_sizes.append(N_i) + max_val = min(N_i, max_val) + table_ph = tf.compat.v1.placeholder( + tf.float32, shape=(N_i, emb_dim), name=f"embedding_table_{i}" + ) + tables.append(table_ph) + # 生成全局索引 placeholder + x_shape = (1000, num_tables) + input_x = tf.compat.v1.placeholder(tf.int64, shape=x_shape, name="input_x") + # 初始化 feed 数据 + feed = {} + rng = np.random.default_rng(12345) + # 为每张表生成随机 embedding 数据 + for i in range(num_tables): + feed[tables[i]] = rng.standard_normal(size=(table_sizes[i], emb_dim)).astype(np.float32) + # 生成索引数据(保持原逻辑:范围是 0 ~ num_tables * max_val - 1) + feed[input_x] = rng.integers( + low=0, high=num_tables * max_val, size=x_shape, dtype=np.int64 + ) + with tf.name_scope("ori"): + out_ori = ori_fused_sparse_dynamic_stitch_graph(input_x, tables) + with tf.name_scope("opt"): + out_opt = opt_fused_sparse_dynamic_stitch_graph(input_x, tables) + + # Create tf session + with tf.compat.v1.Session(config=self.config) as sess: + # functest + out_ori_val = sess.run([out_ori], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_ori) + out_opt_val = sess.run([out_opt], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_opt) + + np.testing.assert_array_equal( + out_ori_val, + out_opt_val, + err_msg="result mismatch" + ) + + op_name = "KPFusedSparseDynamicStitch" + TF_origin = "--------TF_origin---------" + print("-" * 60) + print("-" * 60) + print("new test") + # 多次生成 timeline 并统计平均值 + num_runs = 10 + start_op = "ori/Reshape" + end_op = "ori/DynamicStitch" + total_times_ori = 0 + total_times_opt = 0 + for i in range(num_runs): + sess.run( + [out_ori], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_ori + ) + sess.run( + [out_opt], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_opt + ) + filename_ori = f"{op_name}_ori" + filename_opt = f"{op_name}_opt" + generate_timeline(self.run_metadata_ori.step_stats, filename_ori) + generate_timeline(self.run_metadata_opt.step_stats, filename_opt) + + total_times_ori += extract_op_total_time(filename_ori, start_op, end_op) + total_times_opt += extract_op_dur(filename_opt, op_name) + + avg_ori = total_times_ori / num_runs + avg_opt = total_times_opt / num_runs + speedup = (avg_ori - avg_opt) / avg_ori * 100 + + print(f"{TF_origin}: {avg_ori:.4f} us per run") + print(f"{op_name}: {avg_opt:.4f} us per run") + print(f"improve: 
{speedup:.2f}%") - return d if __name__ == "__main__": tf.compat.v1.disable_eager_execution() diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_reshape_test.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_reshape_test.py index 37d27531..bb3d3727 100644 --- a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_reshape_test.py +++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_reshape_test.py @@ -2,65 +2,13 @@ import tensorflow as tf import numpy as np import unittest -from tensorflow.python.ops import gen_embedding_fused_ops - -class TestFusedSparseReshape(unittest.TestCase): - @classmethod - def setUpClass(cls): - """Initialize test data and custom op""" - # Load custom op - cls.custom_op = gen_embedding_fused_ops - - # Base test data - cls.base_slice_input = np.array([[0, 0], [0, 1], [1, 2], [3, 4]], dtype=np.int64) - cls.base_begin = [0, 1] - cls.base_end = [0, 2] - cls.base_strides = [1, 1] - cls.base_newshape = [2, 4] - # Create tf session - cls.sess = tf.compat.v1.Session() - - @classmethod - def tearDownClass(cls): - cls.sess.close() - - def test_custom(self): - # execute custom op - custom_out1, custom_out2, = self.custom_op.KPFusedSparseReshape( - slice_input=self.base_slice_input, - begin=self.base_begin, - new_shape=self.base_newshape - ) - - # tf native implementation - tf_out1, tf_out2, tf_out3 = self._tf_reference_impl( - self.base_slice_input, - self.base_begin, - self.base_newshape - ) - custom_out_val1, custom_out_val2 = self.sess.run([custom_out1, custom_out2]) - tf_out_val1, tf_out_val2, tf_out_val3 = self.sess.run([tf_out1, tf_out2, tf_out3]) - - print("custom_out_val1: ", custom_out_val1) - print("custom_out_val2: ", custom_out_val2) - print("tf_out_val1: ", tf_out_val1) - print("tf_out_val2: ", tf_out_val2) - - np.testing.assert_array_equal( - custom_out_val1, - tf_out_val1, - err_msg="Segment count mismatch" - ) +from tensorflow.python.ops import gen_embedding_fused_ops +from utils.utils import perf_run, generate_timeline, wrapper_sess, extract_op_dur, extract_op_total_time - np.testing.assert_array_equal( - custom_out_val2, - tf_out_val2, - err_msg="Segment count mismatch" - ) - def _tf_reference_impl(self, slice_input, begin, new_shape): - slice67_out = tf.strided_slice( +def ori_fused_embedding_sparse_reshape_graph(slice_input, begin, newshape): + slice67_out = tf.strided_slice( slice_input, begin=begin, end=[0, 2], @@ -70,32 +18,147 @@ class TestFusedSparseReshape(unittest.TestCase): shrink_axis_mask=2 ) - slice67_out = tf.reshape(slice67_out, [-1, 1]) - shape_out = tf.shape(slice67_out) - slice57_out = tf.strided_slice( - shape_out, - begin=[0], - end=[1], - strides=[1], - shrink_axis_mask=1 - ) - - const2 = tf.constant(2) - input_shape = tf.stack([slice57_out, const2]) - input_shape = tf.cast(input_shape, tf.int64) - - range_out = tf.range(0, slice57_out, 1) - range_out = tf.reshape(range_out, [-1, 1]) - range_out_64 = tf.cast(range_out, dtype=tf.int64) - concat_out = tf.concat([range_out_64, slice67_out], axis=-1) - - sparse_tensor = tf.SparseTensor( - indices=concat_out, - values=[1,2,3,4], - dense_shape=input_shape - ) - sparse_tensor_out = tf.sparse.reshape(sparse_tensor, new_shape) - return sparse_tensor_out.indices, sparse_tensor_out.dense_shape, concat_out + slice67_out = tf.reshape(slice67_out, [-1, 1]) + shape_out = tf.shape(slice67_out) + slice57_out = tf.strided_slice( + shape_out, + begin=[0], + end=[1], + strides=[1], + shrink_axis_mask=1 + ) + 
+ const2 = tf.constant(2) + input_shape = tf.stack([slice57_out, const2]) + input_shape = tf.cast(input_shape, tf.int64) + + range_out = tf.range(0, slice57_out, 1) + range_out = tf.reshape(range_out, [-1, 1]) + range_out_64 = tf.cast(range_out, dtype=tf.int64) + concat_out = tf.concat([range_out_64, slice67_out], axis=-1) + + sparse_tensor = tf.SparseTensor( + indices=concat_out, + values=[1,2,3,4], + dense_shape=input_shape + ) + sparse_tensor_out = tf.sparse.reshape(sparse_tensor, newshape) + return sparse_tensor_out.indices, sparse_tensor_out.dense_shape, concat_out + + +def opt_fused_sparse_reshape_graph(slice_input, begin, newshape): + custom_out1, custom_out2 = gen_embedding_fused_ops.KPFusedSparseReshape( + slice_input=slice_input, + begin=begin, + new_shape=newshape + ) + return custom_out1, custom_out2 + + +class TestFusedSparseReshape(unittest.TestCase): + @classmethod + def setUpClass(cls): + """Initialize""" + cls.config = tf.compat.v1.ConfigProto() + cls.config.intra_op_parallelism_threads = 16 + cls.config.inter_op_parallelism_threads = 1 + + cls.run_options = tf.compat.v1.RunOptions(trace_level=tf.compat.v1.RunOptions.FULL_TRACE) + cls.run_metadata_ori = tf.compat.v1.RunMetadata() + cls.run_metadata_opt = tf.compat.v1.RunMetadata() + + @classmethod + def tearDownClass(cls): + # cls.sess.close() + return + + def test_kp_sparse_reshape(self): + with tf.Graph().as_default(): + slice_input = tf.compat.v1.placeholder(tf.int64, shape=(4,2), name="slice_input") + begin = tf.compat.v1.placeholder(tf.int32, name="begin") + newshape = tf.compat.v1.placeholder(tf.int32, name="newshape") + base_slice_input = np.array([[0, 0], [0, 1], [1, 2], [3, 4]], dtype=np.int64) + base_begin = [0, 1] + base_end = [0, 2] + base_strides = [1, 1] + base_newshape = [2, 4] + feed = { + slice_input: base_slice_input, + begin: base_begin, + newshape: base_newshape + } + + with tf.name_scope("ori"): + out_ori1, out_ori2, out_ori3 = ori_fused_embedding_sparse_reshape_graph(slice_input, begin, newshape) + with tf.name_scope("opt"): + out_opt1, out_opt2 = opt_fused_sparse_reshape_graph(slice_input, begin, newshape) + + with tf.compat.v1.Session(config=self.config) as sess: + out_ori_val1, out_ori_val2, out_ori_val3 = sess.run( + [out_ori1, out_ori2, out_ori3], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_ori + ) + out_opt_val1, out_opt_val2 = sess.run( + [out_opt1,out_opt2], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_opt + ) + + # 功能测试 + np.testing.assert_array_equal( + out_opt_val1, + out_ori_val1, + err_msg="Segment count mismatch" + ) + np.testing.assert_array_equal( + out_opt_val2, + out_ori_val2, + err_msg="Segment count mismatch" + ) + + op_name = "KPFusedSparseReshape" + TF_origin = "-----TF_origin------" + print("-" * 60) + print("-" * 60) + print("new test") + # 多次生成 timeline 并统计平均值 + num_runs = 1000 + start_op = "ori/StridedSlice" + end_op = "ori/SparseReshape" + total_times_ori = 0 + total_times_opt = 0 + for i in range(num_runs): + sess.run( + [out_ori1, out_ori2, out_ori3], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_ori + ) + sess.run( + [out_opt1, out_opt2], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_opt + ) + filename_ori = f"{op_name}_ori" + filename_opt = f"{op_name}_opt" + generate_timeline(self.run_metadata_ori.step_stats, filename_ori) + generate_timeline(self.run_metadata_opt.step_stats, filename_opt) + + total_times_ori += 
extract_op_total_time(filename_ori, start_op, end_op) + total_times_opt += extract_op_dur(filename_opt, op_name) + + avg_ori = total_times_ori / num_runs + avg_opt = total_times_opt / num_runs + speedup = (avg_ori - avg_opt) / avg_ori * 100 + + print(f"{TF_origin}: {avg_ori:.4f} us per run") + print(f"{op_name}: {avg_opt:.4f} us per run") + print(f"improve: {speedup:.2f}%") + if __name__ == "__main__": tf.compat.v1.disable_eager_execution() diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_segment_reduce_test.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_segment_reduce_test.py index 69c7a114..c98760ae 100644 --- a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_segment_reduce_test.py +++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_segment_reduce_test.py @@ -3,131 +3,206 @@ import numpy as np import unittest from tensorflow.python.ops import gen_embedding_fused_ops +from utils.utils import perf_run, generate_timeline, wrapper_sess, extract_op_dur, extract_op_total_time + + +def ori_fused_embedding_sparse_segment_reduce_graph(data, indices, slice_input, begin, end, strides, is_mean): + slice_out = tf.strided_slice( + slice_input, + begin= begin, + end= end, + strides= strides, + begin_mask=1, + end_mask=1, + shrink_axis_mask=2 + ) + + segment_ids = tf.cast(slice_out, dtype=tf.int32) + if is_mean: + output = tf.sparse.segment_mean( + data = data, + indices = indices, + segment_ids= segment_ids + ) + else: + output = tf.sparse.segment_sum( + data = data, + indices = indices, + segment_ids= segment_ids + ) + + output_shape = tf.shape(output) + slice_out = tf.strided_slice(output_shape, begin=[0], end=[1], strides=[1]) + + return output, slice_out + + +def opt_fused_embedding_sparse_segment_reduce_graph(data, indices, slice_input, begin, end, strides, is_mean): + if is_mean: + custom_out, custom_slice_out = gen_embedding_fused_ops.KPFusedSparseSegmentReduce( + data=data, + indices=indices, + slice_input=slice_input, + begin=begin, + end = end, + strides = strides + ) + return custom_out, custom_slice_out + else: + custom_out, custom_slice_out = gen_embedding_fused_ops.KPFusedSparseSegmentReduce( + data=data, + indices=indices, + slice_input=slice_input, + begin=begin, + end = end, + strides = strides, + combiner = 0 + ) + return custom_out, custom_slice_out + class TestSparseSegmentMeanSlice(unittest.TestCase): @classmethod def setUpClass(cls): - """Initialize test data and custom op""" - # Load custom op - cls.custom_op = gen_embedding_fused_ops - - # Base test data - cls.base_data = np.array([[1.0, 2.0, 3.0], [3.0, 4.0,5.0], [5.0, 6.0, 7.0], [5.0, 6.0, 7.0]], dtype=np.float32) # shape {4, 3} - cls.base_indices = np.array([0, 1, 2], dtype=np.int64) # shape {3} - cls.base_slice_input = np.array([[0, 0], [0, 2], [1, 2]], dtype=np.int64) # shape {3, 2} - cls.base_begin = [0, 1] - cls.base_end = [0, 2] - cls.base_strides = [1, 2] - # Create tf session - cls.sess = tf.compat.v1.Session() + """Initialize""" + cls.config = tf.compat.v1.ConfigProto() + cls.config.intra_op_parallelism_threads = 16 + cls.config.inter_op_parallelism_threads = 1 + + cls.run_options = tf.compat.v1.RunOptions(trace_level=tf.compat.v1.RunOptions.FULL_TRACE) + cls.run_metadata_ori = tf.compat.v1.RunMetadata() + cls.run_metadata_opt = tf.compat.v1.RunMetadata() @classmethod def tearDownClass(cls): - cls.sess.close() + return def test_mean(self): - # execute custom op - custom_out, custom_slice_out = 
self.custom_op.KPFusedSparseSegmentReduce( - data=self.base_data, - indices=self.base_indices, - slice_input=self.base_slice_input, - begin=self.base_begin, - end = self.base_end, - strides = self.base_strides - ) + with tf.Graph().as_default(): + base_data = np.array([[1.0, 2.0, 3.0], [3.0, 4.0, 5.0], [5.0, 6.0, 7.0], [5.0, 6.0, 7.0]], dtype=np.float32) # shape {4, 3} + base_indices = np.array([0, 1, 2], dtype=np.int64) # shape {3} + base_slice_input = np.array([[0, 0], [0, 2], [1, 2]], dtype=np.int64) # shape {3, 2} + base_begin = [0, 1] + base_end = [0, 2] + base_strides = [1, 2] + data = tf.compat.v1.placeholder(tf.float32, shape=(4,3), name="data") + indices = tf.compat.v1.placeholder(tf.int32, name="indices") + slice_input = tf.compat.v1.placeholder(tf.int64, shape=(3,2), name="slice_input") + begin = tf.compat.v1.placeholder(tf.int32, name="begin") + end = tf.compat.v1.placeholder(tf.int32, name="end") + strides = tf.compat.v1.placeholder(tf.int32, name="strides") + feed = { + data: base_data, + indices: base_indices, + slice_input: base_slice_input, + begin: base_begin, + end: base_end, + strides: base_strides + } + with tf.name_scope("ori"): + out_ori1, out_ori2 = ori_fused_embedding_sparse_segment_reduce_graph(data,indices,slice_input,begin,end,strides,True) + with tf.name_scope("opt"): + out_opt1, out_opt2 = opt_fused_embedding_sparse_segment_reduce_graph(data,indices,slice_input,begin,end,strides,True) + + with tf.compat.v1.Session(config=self.config) as sess: + out_ori_val1, out_ori_val2 = sess.run([out_ori1, out_ori2], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_ori) + out_opt_val1, out_opt_val2 = sess.run([out_opt1, out_opt2], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_opt) - # tf native implementation - tf_out, tf_slice_out = self._tf_reference_impl( - self.base_data, - self.base_indices, - self.base_slice_input, - self.base_begin, - self.base_end, - self.base_strides, - True - ) + np.testing.assert_allclose( + out_opt_val1, + out_ori_val1, + rtol=1e-6, + err_msg="Output values mismatch" + ) + np.testing.assert_array_equal( + out_opt_val2, + out_ori_val2, + err_msg="Segment count mismatch" + ) + + TF_origin = "--------TF_origin---------" + op_name = "KPFusedSparseSegmentReduce" + print("-" * 60) + print("-" * 60) + print("new test") + # 多次生成 timeline 并统计平均值 + num_runs = 500 + start_op = "ori/StridedSlice" + end_op = "ori/StridedSlice_1" + total_times_ori = 0 + total_times_opt = 0 + for i in range(num_runs): + sess.run( + [out_ori1, out_ori2], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_ori + ) + sess.run( + [out_opt1, out_opt2], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_opt + ) + filename_ori = f"{op_name}_ori" + filename_opt = f"{op_name}_opt" + generate_timeline(self.run_metadata_ori.step_stats, filename_ori) + generate_timeline(self.run_metadata_opt.step_stats, filename_opt) + + total_times_ori += extract_op_total_time(filename_ori, start_op, end_op) + total_times_opt += extract_op_dur(filename_opt, op_name) + + avg_ori = total_times_ori / num_runs + avg_opt = total_times_opt / num_runs + speedup = (avg_ori - avg_opt) / avg_ori * 100 + + print(f"{TF_origin}: {avg_ori:.4f} us per run") + print(f"{op_name}: {avg_opt:.4f} us per run") + print(f"improve: {speedup:.2f}%") - custom_out_val, custom_slice_out_val = self.sess.run([custom_out, custom_slice_out]) - tf_out_val, tf_slice_out_val = self.sess.run([tf_out, tf_slice_out]) - - # Numerical 
comparison - np.testing.assert_allclose( - custom_out_val, - tf_out_val, - rtol=1e-6, - err_msg="Output values mismatch" - ) - np.testing.assert_array_equal( - custom_slice_out_val, - tf_slice_out_val, - err_msg="Segment count mismatch" - ) def test_sum(self): - custom_out, custom_slice_out = self.custom_op.KPFusedSparseSegmentReduce( - data=self.base_data, - indices=self.base_indices, - slice_input=self.base_slice_input, - begin=self.base_begin, - end = self.base_end, - strides = self.base_strides, - combiner=0 - ) - - tf_out, tf_slice_out = self._tf_reference_impl( - self.base_data, - self.base_indices, - self.base_slice_input, - self.base_begin, - self.base_end, - self.base_strides, - False - ) + with tf.Graph().as_default(): + base_data = np.array([[1.0, 2.0, 3.0], [3.0, 4.0,5.0], [5.0, 6.0, 7.0], [5.0, 6.0, 7.0]], dtype=np.float32) # shape {4, 3} + base_indices = np.array([0, 1, 2], dtype=np.int64) + base_slice_input = np.array([[0, 0], [0, 2], [1, 2]], dtype=np.int64) + base_begin = [0, 1] + base_end = [0, 2] + base_strides = [1, 2] + data = tf.compat.v1.placeholder(tf.float32, shape=(4,3), name="data") + indices = tf.compat.v1.placeholder(tf.int32, name="indices") + slice_input = tf.compat.v1.placeholder(tf.int64, shape=(3,2), name="slice_input") + begin = tf.compat.v1.placeholder(tf.int32, name="begin") + end = tf.compat.v1.placeholder(tf.int32, name="end") + strides = tf.compat.v1.placeholder(tf.int32, name="strides") + feed = { + data: base_data, + indices: base_indices, + slice_input: base_slice_input, + begin: base_begin, + end: base_end, + strides: base_strides + } + with tf.name_scope("ori"): + out_ori1, out_ori2 = ori_fused_embedding_sparse_segment_reduce_graph(data,indices,slice_input,begin,end,strides,False) + with tf.name_scope("opt"): + out_opt1, out_opt2 = opt_fused_embedding_sparse_segment_reduce_graph(data,indices,slice_input,begin,end,strides,False) + + with tf.compat.v1.Session(config=self.config) as sess: + out_ori_val1, out_ori_val2 = sess.run([out_ori1, out_ori2], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_ori) + out_opt_val1, out_opt_val2 = sess.run([out_opt1, out_opt2], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_opt) + np.testing.assert_allclose( + out_opt_val1, + out_ori_val1, + rtol=1e-6, + err_msg="Output values mismatch" + ) + np.testing.assert_array_equal( + out_opt_val2, + out_ori_val2, + err_msg="Segment count mismatch" + ) - custom_out_val, custom_slice_out_val = self.sess.run([custom_out, custom_slice_out]) - tf_out_val, tf_slice_out_val = self.sess.run([tf_out, tf_slice_out]) - - np.testing.assert_allclose( - custom_out_val, - tf_out_val, - rtol=1e-6, - err_msg="Output values mismatch" - ) - np.testing.assert_array_equal( - custom_slice_out_val, - tf_slice_out_val, - err_msg="Segment count mismatch" - ) - - def _tf_reference_impl(self, data, indices, slice_input, begin, end, strides, is_mean): - slice_out = tf.strided_slice( - slice_input, - begin= begin, - end= end, - strides= strides, - begin_mask=1, - end_mask=1, - shrink_axis_mask=2 - ) - - segment_ids = tf.cast(slice_out, dtype=tf.int32) - if is_mean: - output = tf.sparse.segment_mean( - data = data, - indices = indices, - segment_ids= segment_ids - ) - else: - output = tf.sparse.segment_sum( - data = data, - indices = indices, - segment_ids= segment_ids - ) - - output_shape = tf.shape(output) - slice_out = tf.strided_slice(output_shape, begin=[0], end=[1], strides=[1]) - - return output, slice_out if __name__ == "__main__": 
tf.compat.v1.disable_eager_execution() diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_select.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_select.py deleted file mode 100644 index d37128cb..00000000 --- a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_select.py +++ /dev/null @@ -1,113 +0,0 @@ -import tensorflow as tf -import numpy as np -import unittest - -from tensorflow.python.ops import gen_embedding_fused_ops -from utils.utils import perf_run, generate_timeline, wrapper_sess - - -def ori_fused_embedding_sparse_select_graph(input_a, input_b, input_c): - a = tf.reshape(input_a, [-1, 1]) - b = tf.reshape(input_b, [-1, 1]) - c = tf.reshape(input_c, [-1, 1]) - - greater_a = tf.greater(a, 0) - shape_reshape_a1 = tf.shape(a) - shape_reshape_a2 = tf.shape(a) - fill_a1 = tf.fill(shape_reshape_a1, tf.constant(1, dtype=tf.float32)) - realdiv = tf.realdiv(fill_a1, tf.constant(1, dtype=tf.float32)) - output_x = tf.fill(shape_reshape_a2, tf.constant(0, dtype=tf.float32)) - cast_a = tf.cast(greater_a, tf.float32) - shape_a = tf.shape(cast_a) - fill_a = tf.fill(shape_a, tf.constant(1, dtype=tf.float32)) - equal_4563 = tf.equal(b, 4563) - equal_10831 = tf.equal(b, 10831) - equal_3 = tf.equal(c, 3) - select_1 = tf.where(equal_4563, fill_a, cast_a) - select_2 = tf.where(equal_10831, fill_a, select_1) - output_y = tf.subtract(tf.constant(1, dtype=tf.float32), select_2) - mul = tf.multiply(tf.constant(1, dtype=tf.float32), select_2) - select_3 = tf.where(equal_3, realdiv, fill_a1) - output_z = tf.concat([mul, select_3], axis=-1) - return output_x, output_y, output_z - - -def opt_fused_embedding_sparse_select_graph(input_a, input_b, input_c): - output_x, output_y, output_z = gen_embedding_fused_ops.KPFusedSparseSelect( - input_a=input_a, input_b=input_b, input_c=input_c - ) - return output_x, output_y, output_z - - -class TestKPFusedSparseSelect(unittest.TestCase): - @classmethod - def setUpClass(cls): - """Initialize config""" - cls.config = tf.compat.v1.ConfigProto() - cls.config.intra_op_parallelism_threads = 16 - cls.config.inter_op_parallelism_threads = 16 - - cls.run_options = tf.compat.v1.RunOptions(trace_level=tf.compat.v1.RunOptions.FULL_TRACE) - cls.run_metadata_ori = tf.compat.v1.RunMetadata() - cls.run_metadata_opt = tf.compat.v1.RunMetadata() - - @classmethod - def tearDownClass(cls): - return - - def test_fused_embedding_sparse_select(self): - # Create Graph - with tf.Graph().as_default(): - input0 = tf.compat.v1.placeholder(tf.int32, shape=[None, None], name="input_a") - input1 = tf.compat.v1.placeholder(tf.int32, shape=[None, None], name="input_b") - input2 = tf.compat.v1.placeholder(tf.int32, shape=[None, None], name="input_c") - """Initialize test data""" - feed = { - input0: np.random.randint(0, 100, size=(100, 10)).astype(np.int32), - input1: np.random.randint(0, 100, size=(10, 100)).astype(np.int32), - input2: np.random.randint(0, 100, size=(20, 50)).astype(np.int32), - } - with tf.name_scope("ori"): - out0_ori, out1_ori, out2_ori = ori_fused_embedding_sparse_select_graph(input0, input1, input2) - with tf.name_scope("opt"): - out0_opt, out1_opt, out2_opt = opt_fused_embedding_sparse_select_graph(input0, input1, input2) - - # Create tf session - with tf.compat.v1.Session(config=self.config) as sess: - # functest - out0_ori_val, out1_ori_val, out2_ori_val = sess.run([out0_ori, out1_ori, out2_ori], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_ori) - out0_opt_val, 
out1_opt_val, out2_opt_val = sess.run([out0_opt, out1_opt, out2_opt], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_opt) - - np.testing.assert_allclose( - out0_ori_val, - out0_opt_val, - rtol=1e-5, - err_msg="Output values mismatch" - ) - - np.testing.assert_allclose( - out1_ori_val, - out1_opt_val, - rtol=1e-5, - err_msg="Output values mismatch" - ) - - np.testing.assert_allclose( - out2_ori_val, - out2_opt_val, - rtol=1e-5, - err_msg="Output values mismatch" - ) - - generate_timeline(self.run_metadata_ori.step_stats, f"{self._testMethodName}_ori") - generate_timeline(self.run_metadata_opt.step_stats, f"{self._testMethodName}_opt") - - # perftest - perf_run(wrapper_sess(sess, [out0_ori, out1_ori, out2_ori], feed_dict=feed), - wrapper_sess(sess, [out0_opt, out1_opt, out2_opt], feed_dict=feed), - "KPFusedEmbeddingSparseSelect") - - -if __name__ == "__main__": - tf.compat.v1.disable_eager_execution() - unittest.main(argv=[''], verbosity=2) \ No newline at end of file diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_select_test.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_select_test.py index d37128cb..45aa8ff7 100644 --- a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_select_test.py +++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_select_test.py @@ -3,7 +3,7 @@ import numpy as np import unittest from tensorflow.python.ops import gen_embedding_fused_ops -from utils.utils import perf_run, generate_timeline, wrapper_sess +from utils.utils import perf_run, generate_timeline, wrapper_sess, extract_op_dur, extract_op_total_time def ori_fused_embedding_sparse_select_graph(input_a, input_b, input_c): @@ -45,7 +45,7 @@ class TestKPFusedSparseSelect(unittest.TestCase): """Initialize config""" cls.config = tf.compat.v1.ConfigProto() cls.config.intra_op_parallelism_threads = 16 - cls.config.inter_op_parallelism_threads = 16 + cls.config.inter_op_parallelism_threads = 1 cls.run_options = tf.compat.v1.RunOptions(trace_level=tf.compat.v1.RunOptions.FULL_TRACE) cls.run_metadata_ori = tf.compat.v1.RunMetadata() @@ -68,46 +68,77 @@ class TestKPFusedSparseSelect(unittest.TestCase): input2: np.random.randint(0, 100, size=(20, 50)).astype(np.int32), } with tf.name_scope("ori"): - out0_ori, out1_ori, out2_ori = ori_fused_embedding_sparse_select_graph(input0, input1, input2) + out_ori1, out_ori2, out_ori3 = ori_fused_embedding_sparse_select_graph(input0, input1, input2) with tf.name_scope("opt"): - out0_opt, out1_opt, out2_opt = opt_fused_embedding_sparse_select_graph(input0, input1, input2) + out_opt1, out_opt2, out_opt3 = opt_fused_embedding_sparse_select_graph(input0, input1, input2) # Create tf session with tf.compat.v1.Session(config=self.config) as sess: # functest - out0_ori_val, out1_ori_val, out2_ori_val = sess.run([out0_ori, out1_ori, out2_ori], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_ori) - out0_opt_val, out1_opt_val, out2_opt_val = sess.run([out0_opt, out1_opt, out2_opt], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_opt) + out_ori_val1, out_ori_val2, out_ori_val3 = sess.run([out_ori1, out_ori2, out_ori3], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_ori) + out_opt_val1, out_opt_val2, out_opt_val3 = sess.run([out_opt1, out_opt2, out_opt3], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_opt) np.testing.assert_allclose( - out0_ori_val, - 
out0_opt_val, + out_ori_val1, + out_opt_val1, rtol=1e-5, err_msg="Output values mismatch" ) np.testing.assert_allclose( - out1_ori_val, - out1_opt_val, + out_ori_val2, + out_opt_val2, rtol=1e-5, err_msg="Output values mismatch" ) np.testing.assert_allclose( - out2_ori_val, - out2_opt_val, + out_ori_val3, + out_opt_val3, rtol=1e-5, err_msg="Output values mismatch" ) - generate_timeline(self.run_metadata_ori.step_stats, f"{self._testMethodName}_ori") - generate_timeline(self.run_metadata_opt.step_stats, f"{self._testMethodName}_opt") - - # perftest - perf_run(wrapper_sess(sess, [out0_ori, out1_ori, out2_ori], feed_dict=feed), - wrapper_sess(sess, [out0_opt, out1_opt, out2_opt], feed_dict=feed), - "KPFusedEmbeddingSparseSelect") - + op_name = "KPFusedSparseSelect" + TF_origin = "-----TF_origin-----" + print("-" * 60) + print("-" * 60) + print("new test") + # 多次生成 timeline 并统计平均值 + num_runs = 1000 + start_op = "ori/Reshape" + end_op = "ori/Sub" + total_times_ori = 0 + total_times_opt = 0 + for i in range(num_runs): + sess.run( + [out_ori1, out_ori2, out_ori3], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_ori + ) + sess.run( + [out_opt1, out_opt2, out_opt3], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_opt + ) + filename_ori = f"{op_name}_ori" + filename_opt = f"{op_name}_opt" + generate_timeline(self.run_metadata_ori.step_stats, filename_ori) + generate_timeline(self.run_metadata_opt.step_stats, filename_opt) + + total_times_ori += extract_op_total_time(filename_ori, start_op, end_op) + total_times_opt += extract_op_dur(filename_opt, op_name) + + avg_ori = total_times_ori / num_runs + avg_opt = total_times_opt / num_runs + speedup = (avg_ori - avg_opt) / avg_ori * 100 + + print(f"{TF_origin}: {avg_ori:.4f} us per run") + print(f"{op_name}: {avg_opt:.4f} us per run") + print(f"improve: {speedup:.2f}%") if __name__ == "__main__": tf.compat.v1.disable_eager_execution() - unittest.main(argv=[''], verbosity=2) \ No newline at end of file + unittest.main(argv=[''], verbosity=2) diff --git a/tensorflow/python/grappler/embedding_fused_test/utils/utils.py b/tensorflow/python/grappler/embedding_fused_test/utils/utils.py index 06f02d6b..cd42c5b4 100644 --- a/tensorflow/python/grappler/embedding_fused_test/utils/utils.py +++ b/tensorflow/python/grappler/embedding_fused_test/utils/utils.py @@ -1,10 +1,30 @@ import timeit +import json +import os from tensorflow.python.client import timeline -def perf_run(ori_func, opt_func, name, warmup=5, iters=50): - +def extract_op_dur(timeline_file, op_name): + """从 timeline JSON 文件中提取指定算子(fusedOp)的耗时(μs)""" + with open(f"timeline/{timeline_file}.json", "r") as f: + trace_events = json.load(f)["traceEvents"] # timeline.json的格式 + durations = [e["dur"] for e in trace_events if e.get("name") == op_name and "dur" in e] + return durations[0] + + +def extract_op_total_time(timeline_file, start_op, end_op): + """计算从 start_op 到 end_op 的总耗时(包含调度空隙)""" + with open(f"timeline/{timeline_file}.json", "r") as f: + trace_events = json.load(f)["traceEvents"] + start_event = next(e for e in trace_events if e.get("args", {}).get("name") == start_op) # 找到 timeline 里第一个 name 等于 start_op 的事件 + end_event = next(e for e in trace_events if e.get("args", {}).get("name") == end_op) # 找不到会报错 + start_time = start_event["ts"] + end_time = end_event["ts"] + end_event["dur"] # ts 是开始时间,dur是算子的持续时间 + return end_time - start_time + + +def perf_run(ori_func, opt_func, name, warmup=5, iters=5): print(f"\nWarmup ori: {warmup} iters") for 
_ in range(warmup): ori_func() @@ -12,7 +32,7 @@ def perf_run(ori_func, opt_func, name, warmup=5, iters=50): print(f"Running performance test: ori {iters} iters") total_time = timeit.timeit(ori_func, number=iters) ori_avg_time = total_time / iters * 1000 - print(f"{name}: {ori_avg_time:.2f} ms per run") + print(f"{name}: {ori_avg_time:.6f} ms per run") print(f"\nWarmup opt: {warmup} iters") for _ in range(warmup): @@ -21,7 +41,7 @@ def perf_run(ori_func, opt_func, name, warmup=5, iters=50): print(f"Running performance test: opt {iters} iters") total_time = timeit.timeit(opt_func, number=iters) opt_avg_time = total_time / iters * 1000 - print(f"{name}: {opt_avg_time:.2f} ms per run") + print(f"{name}: {opt_avg_time:.6f} ms per run") improvement = (ori_avg_time - opt_avg_time) / ori_avg_time * 100 print(f"improve: {improvement:.2f}%") @@ -36,4 +56,4 @@ def generate_timeline(step_stats, filename): def wrapper_sess(sess, fetches, feed_dict=None, options=None, run_metadata=None): - return lambda: sess.run(fetches, feed_dict=feed_dict, options=options, run_metadata=run_metadata) \ No newline at end of file + return lambda: sess.run(fetches, feed_dict=feed_dict, options=options, run_metadata=run_metadata) \ No newline at end of file -- Gitee From 7c3fb000408ff1cd803af0b0f8490bf8787a9584 Mon Sep 17 00:00:00 2001 From: rayshine <1324789704@qq.com> Date: Tue, 19 Aug 2025 12:34:34 +0800 Subject: [PATCH 6/9] =?UTF-8?q?=E5=88=9B=E5=BB=BAbenchmark=5Fop=E5=87=BD?= =?UTF-8?q?=E6=95=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../fused_embedding_action_id_gather_test.py | 57 +++----- .../fused_embedding_gather_test.py | 69 +++------ .../fused_embedding_padding_test.py | 114 +++++---------- ...ed_embedding_sparse_dynamic_stitch_test.py | 62 +++----- .../fused_embedding_sparse_reshape_test.py | 61 +++----- ...ed_embedding_sparse_segment_reduce_test.py | 132 +++++++++--------- .../fused_embedding_sparse_select_test.py | 58 +++----- .../embedding_fused_test/utils/utils.py | 58 ++++++++ 8 files changed, 247 insertions(+), 364 deletions(-) diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_action_id_gather_test.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_action_id_gather_test.py index 1fa83bc9..2bd76790 100644 --- a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_action_id_gather_test.py +++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_action_id_gather_test.py @@ -3,7 +3,9 @@ import numpy as np import unittest from tensorflow.python.ops import gen_embedding_fused_ops -from utils.utils import perf_run, generate_timeline, wrapper_sess, extract_op_dur, extract_op_total_time +from utils.utils import benchmark_op + +np.random.seed(140) def ori_fused_embedding_action_id_gather_graph(input0, input1, input2, input3): @@ -77,47 +79,20 @@ class TestFusedEmbeddingActionIdGather(unittest.TestCase): err_msg="result mismatch" ) - op_name = "KPFusedEmbeddingActionIdGather" - TF_origin = "----------TF_origin-----------" - print("-" * 60) - print("-" * 60) - print("new test") - # 多次生成 timeline 并统计平均值 - num_runs = 1000 - start_op = "ori/stack_1" - end_op = "ori/concat" - total_times_ori = 0 - total_times_opt = 0 - for i in range(num_runs): - sess.run( - [out_ori], - feed_dict=feed, - options=self.run_options, - run_metadata=self.run_metadata_ori - ) - sess.run( - [out_opt], - feed_dict=feed, - options=self.run_options, - run_metadata=self.run_metadata_opt - ) - filename_ori = 
f"{op_name}_ori" - filename_opt = f"{op_name}_opt" - generate_timeline(self.run_metadata_ori.step_stats, filename_ori) - generate_timeline(self.run_metadata_opt.step_stats, filename_opt) - - total_times_ori += extract_op_total_time(filename_ori, start_op, end_op) - total_times_opt += extract_op_dur(filename_opt, op_name) - - avg_ori = total_times_ori / num_runs - avg_opt = total_times_opt / num_runs - speedup = (avg_ori - avg_opt) / avg_ori * 100 - - print(f"{TF_origin}: {avg_ori:.4f} us per run") - print(f"{op_name}: {avg_opt:.4f} us per run") - print(f"improve: {speedup:.2f}%") + benchmark_op( + sess, + feed, + [out_ori], + [out_opt], + self.run_options, + self.run_metadata_ori, + self.run_metadata_opt, + op_name="KPFusedEmbeddingActionIdGather", + start_op="ori/stack_1", + end_op="ori/concat", + num_runs=1000 + ) - if __name__ == "__main__": tf.compat.v1.disable_eager_execution() diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_gather_test.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_gather_test.py index f87ee1b5..241a4a9a 100644 --- a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_gather_test.py +++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_gather_test.py @@ -3,18 +3,18 @@ import numpy as np import unittest from tensorflow.python.ops import gen_embedding_fused_ops -from utils.utils import perf_run, generate_timeline, wrapper_sess, extract_op_dur, extract_op_total_time +from utils.utils import benchmark_op def ori_fused_embedding_gather_graph(data, slice_input, begin): slice_out = tf.strided_slice( slice_input, - begin = begin, - end = [tf.shape(slice_input)[0], begin[1] + 2], - strides = [1, 1], - begin_mask = 1, - end_mask = 1, - shrink_axis_mask = 2 + begin=begin, + end=[tf.shape(slice_input)[0], begin[1] + 2], + strides=[1, 1], + begin_mask=1, + end_mask=1, + shrink_axis_mask=2 ) slice_out, slice_out_indices = tf.unique(slice_out) @@ -58,7 +58,7 @@ class TestFusedGather(unittest.TestCase): with tf.Graph().as_default(): data = tf.compat.v1.placeholder(tf.float32, shape=(20, 12), name="data") slice_input = tf.compat.v1.placeholder(tf.int64, shape=(3, 2), name="slice_input") - begin = tf.compat.v1.placeholder(tf.int32, name="begin") + begin = tf.compat.v1.placeholder(tf.int32, shape=(2,), name="begin") base_data = np.linspace(0, 11, num=240, endpoint=False, dtype=np.float32).reshape(20, 12) base_slice_input = np.array([[0, 0], [0, 1], [1, 2]], dtype=np.int64) base_begin = [0, 1] @@ -116,46 +116,21 @@ class TestFusedGather(unittest.TestCase): rtol=1e-6, err_msg="Output values mismatch" ) - - op_name = "KPFusedGather" - TF_origin = "--TF_origin--" - print("-" * 60) - print("-" * 60) - print("new test") - # 多次生成 timeline 并统计平均值 - num_runs = 1000 - start_op = "ori/strided_slice_1" - end_op = "ori/GatherV2_1" - total_times_ori = 0 - total_times_opt = 0 - for i in range(num_runs): - sess.run( - [out_ori1, out_ori2, out_ori3], - feed_dict=feed, - options=self.run_options, - run_metadata=self.run_metadata_ori - ) - sess.run( - [out_opt1, out_opt2], - feed_dict=feed, - options=self.run_options, - run_metadata=self.run_metadata_opt - ) - filename_ori = f"{op_name}_ori" - filename_opt = f"{op_name}_opt" - generate_timeline(self.run_metadata_ori.step_stats, filename_ori) - generate_timeline(self.run_metadata_opt.step_stats, filename_opt) - - total_times_ori += extract_op_total_time(filename_ori, start_op, end_op) - total_times_opt += extract_op_dur(filename_opt, op_name) - - avg_ori = total_times_ori / 
num_runs - avg_opt = total_times_opt / num_runs - speedup = (avg_ori - avg_opt) / avg_ori * 100 - print(f"{TF_origin}: {avg_ori:.4f} us per run") - print(f"{op_name}: {avg_opt:.4f} us per run") - print(f"improve: {speedup:.2f}%") + benchmark_op( + sess, + feed, + [out_ori1, out_ori2, out_ori3], + [out_opt1, out_opt2], + self.run_options, + self.run_metadata_ori, + self.run_metadata_opt, + op_name="KPFusedGather", + start_op="ori/strided_slice_1", + end_op="ori/GatherV2_1", + num_runs=1000, + tag="--TF_origin--" + ) if __name__ == "__main__": diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_padding_test.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_padding_test.py index ba6a87cd..61f00534 100644 --- a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_padding_test.py +++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_padding_test.py @@ -3,7 +3,7 @@ import numpy as np import unittest from tensorflow.python.ops import gen_embedding_fused_ops -from utils.utils import perf_run, generate_timeline, wrapper_sess, extract_op_dur, extract_op_total_time +from utils.utils import benchmark_op np.random.seed(140) @@ -12,12 +12,14 @@ def opt_fused_embedding_padding_fast_graph(input0, input1, input2, input3): # execute custom op _, custom_out = gen_embedding_fused_ops.kp_fused_embedding_padding_fast(input0, input1, input2, input3) return custom_out - + + def opt_fused_embedding_padding_graph(input0, input1, input2, input3): # execute custom op _, custom_out = gen_embedding_fused_ops.kp_fused_embedding_padding(input0, input1, input2, input3) return custom_out + def ori_fused_embedding_padding_fast_graph(input0, input1, input2, input3): cast = tf.cast(input0, tf.int32) begin = tf.constant([0], dtype=tf.int32) @@ -34,6 +36,7 @@ def ori_fused_embedding_padding_fast_graph(input0, input1, input2, input3): output = tf.strided_slice(shape_tensor, begin=begin, end=end, strides=strides, shrink_axis_mask=1) return output + def ori_fused_embedding_padding_graph(input0, input1, input2, input3): cast = tf.cast(input0, tf.int32) begin = tf.constant([0], dtype=tf.int32) @@ -96,45 +99,20 @@ class TestFusedEmbeddingPadding(unittest.TestCase): err_msg="result mismatch" ) - op_name = "KPFusedEmbeddingPadding" - TF_origin = "-------TF_origin-------" - print("-" * 60) - print("-" * 60) - print("new test") - # 多次生成 timeline 并统计平均值 - num_runs = 1000 - start_op = "ori/Cast" - end_op = "ori/Reshape" - total_times_ori = 0 - total_times_opt = 0 - for i in range(num_runs): - sess.run( - [out_ori], - feed_dict=feed, - options=self.run_options, - run_metadata=self.run_metadata_ori - ) - sess.run( - [out_opt], - feed_dict=feed, - options=self.run_options, - run_metadata=self.run_metadata_opt - ) - filename_ori = f"{op_name}_ori" - filename_opt = f"{op_name}_opt" - generate_timeline(self.run_metadata_ori.step_stats, filename_ori) - generate_timeline(self.run_metadata_opt.step_stats, filename_opt) - - total_times_ori += extract_op_total_time(filename_ori, start_op, end_op) - total_times_opt += extract_op_dur(filename_opt, op_name) - - avg_ori = total_times_ori / num_runs - avg_opt = total_times_opt / num_runs - speedup = (avg_ori - avg_opt) / avg_ori * 100 - - print(f"{TF_origin}: {avg_ori:.4f} us per run") - print(f"{op_name}: {avg_opt:.4f} us per run") - print(f"improve: {speedup:.2f}%") + benchmark_op( + sess, + feed, + [out_ori], + [out_opt], + self.run_options, + self.run_metadata_ori, + self.run_metadata_opt, + op_name="KPFusedEmbeddingPadding", + 
start_op="ori/Cast", + end_op="ori/Reshape", + num_runs=1000, + tag="-------TF_origin-------" + ) def test_func_kp_fused_embedding_padding_fast(self): @@ -167,46 +145,22 @@ class TestFusedEmbeddingPadding(unittest.TestCase): opt_result, err_msg="result mismatch" ) - - op_name = "KPFusedEmbeddingPaddingFast" - TF_origin = "---------TF_origin---------" - print("-" * 60) - print("-" * 60) - print("new test") - # 多次生成 timeline 并统计平均值 - num_runs = 1000 - start_op = "ori/Cast" - end_op = "ori/StridedSlice_1" - total_times_ori = 0 - total_times_opt = 0 - for i in range(num_runs): - sess.run( - [out_ori], - feed_dict=feed, - options=self.run_options, - run_metadata=self.run_metadata_ori - ) - sess.run( - [out_opt], - feed_dict=feed, - options=self.run_options, - run_metadata=self.run_metadata_opt - ) - filename_ori = f"{op_name}_ori" - filename_opt = f"{op_name}_opt" - generate_timeline(self.run_metadata_ori.step_stats, filename_ori) - generate_timeline(self.run_metadata_opt.step_stats, filename_opt) - - total_times_ori += extract_op_total_time(filename_ori, start_op, end_op) - total_times_opt += extract_op_dur(filename_opt, op_name) - avg_ori = total_times_ori / num_runs - avg_opt = total_times_opt / num_runs - speedup = (avg_ori - avg_opt) / avg_ori * 100 - - print(f"{TF_origin}: {avg_ori:.4f} us per run") - print(f"{op_name}: {avg_opt:.4f} us per run") - print(f"improve: {speedup:.2f}%") + benchmark_op( + sess, + feed, + [out_ori], + [out_opt], + self.run_options, + self.run_metadata_ori, + self.run_metadata_opt, + op_name="KPFusedEmbeddingPaddingFast", + start_op="ori/Cast", + end_op="ori/StridedSlice_1", + num_runs=1000, + tag="---------TF_origin---------" + ) + if __name__ == "__main__": tf.compat.v1.disable_eager_execution() diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_dynamic_stitch_test.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_dynamic_stitch_test.py index 5a81692c..17b0bb05 100644 --- a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_dynamic_stitch_test.py +++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_dynamic_stitch_test.py @@ -1,15 +1,16 @@ -import os import tensorflow as tf import numpy as np import unittest from tensorflow.python.ops import gen_embedding_fused_ops -from utils.utils import perf_run, generate_timeline, wrapper_sess, extract_op_dur, extract_op_total_time +from utils.utils import benchmark_op + +np.random.seed(140) def ori_fused_sparse_dynamic_stitch_graph(x, emb_tables): - x_1 = tf.reshape(x, shape=[-1]) # 将输入 x 展平成一维向量 x_1 - group_ids = tf.math.floormod(x_1, 12) + x_1 = tf.reshape(x, shape=[-1]) # 将输入 x 展平成一维向量 x_1 + group_ids = tf.math.floormod(x_1, 12) group_ids = tf.cast(group_ids, dtype=np.int32) chunk_indices = tf.math.floordiv(x_1, 12) original_indices = tf.range(0, tf.size(x_1), 1) @@ -132,46 +133,21 @@ class TestSparseDynamicStitch(unittest.TestCase): out_opt_val, err_msg="result mismatch" ) - - op_name = "KPFusedSparseDynamicStitch" - TF_origin = "--------TF_origin---------" - print("-" * 60) - print("-" * 60) - print("new test") - # 多次生成 timeline 并统计平均值 - num_runs = 10 - start_op = "ori/Reshape" - end_op = "ori/DynamicStitch" - total_times_ori = 0 - total_times_opt = 0 - for i in range(num_runs): - sess.run( - [out_ori], - feed_dict=feed, - options=self.run_options, - run_metadata=self.run_metadata_ori - ) - sess.run( - [out_opt], - feed_dict=feed, - options=self.run_options, - run_metadata=self.run_metadata_opt - ) - filename_ori = 
f"{op_name}_ori" - filename_opt = f"{op_name}_opt" - generate_timeline(self.run_metadata_ori.step_stats, filename_ori) - generate_timeline(self.run_metadata_opt.step_stats, filename_opt) - - total_times_ori += extract_op_total_time(filename_ori, start_op, end_op) - total_times_opt += extract_op_dur(filename_opt, op_name) - avg_ori = total_times_ori / num_runs - avg_opt = total_times_opt / num_runs - speedup = (avg_ori - avg_opt) / avg_ori * 100 - - print(f"{TF_origin}: {avg_ori:.4f} us per run") - print(f"{op_name}: {avg_opt:.4f} us per run") - print(f"improve: {speedup:.2f}%") + benchmark_op( + sess, + feed, + [out_ori], + [out_opt], + self.run_options, + self.run_metadata_ori, + self.run_metadata_opt, + op_name="KPFusedSparseDynamicStitch", + start_op="ori/Reshape", + end_op="ori/DynamicStitch", + num_runs=10, + tag="---------TF_origin---------" + ) if __name__ == "__main__": diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_reshape_test.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_reshape_test.py index bb3d3727..f8184f20 100644 --- a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_reshape_test.py +++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_reshape_test.py @@ -4,7 +4,7 @@ import unittest from tensorflow.python.ops import gen_embedding_fused_ops -from utils.utils import perf_run, generate_timeline, wrapper_sess, extract_op_dur, extract_op_total_time +from utils.utils import benchmark_op def ori_fused_embedding_sparse_reshape_graph(slice_input, begin, newshape): @@ -75,12 +75,10 @@ class TestFusedSparseReshape(unittest.TestCase): def test_kp_sparse_reshape(self): with tf.Graph().as_default(): slice_input = tf.compat.v1.placeholder(tf.int64, shape=(4,2), name="slice_input") - begin = tf.compat.v1.placeholder(tf.int32, name="begin") - newshape = tf.compat.v1.placeholder(tf.int32, name="newshape") + begin = tf.compat.v1.placeholder(tf.int32, shape=(2,), name="begin") + newshape = tf.compat.v1.placeholder(tf.int32, shape=(2,), name="newshape") base_slice_input = np.array([[0, 0], [0, 1], [1, 2], [3, 4]], dtype=np.int64) base_begin = [0, 1] - base_end = [0, 2] - base_strides = [1, 1] base_newshape = [2, 4] feed = { slice_input: base_slice_input, @@ -118,46 +116,21 @@ class TestFusedSparseReshape(unittest.TestCase): out_ori_val2, err_msg="Segment count mismatch" ) - - op_name = "KPFusedSparseReshape" - TF_origin = "-----TF_origin------" - print("-" * 60) - print("-" * 60) - print("new test") - # 多次生成 timeline 并统计平均值 - num_runs = 1000 - start_op = "ori/StridedSlice" - end_op = "ori/SparseReshape" - total_times_ori = 0 - total_times_opt = 0 - for i in range(num_runs): - sess.run( - [out_ori1, out_ori2, out_ori3], - feed_dict=feed, - options=self.run_options, - run_metadata=self.run_metadata_ori - ) - sess.run( - [out_opt1, out_opt2], - feed_dict=feed, - options=self.run_options, - run_metadata=self.run_metadata_opt - ) - filename_ori = f"{op_name}_ori" - filename_opt = f"{op_name}_opt" - generate_timeline(self.run_metadata_ori.step_stats, filename_ori) - generate_timeline(self.run_metadata_opt.step_stats, filename_opt) - - total_times_ori += extract_op_total_time(filename_ori, start_op, end_op) - total_times_opt += extract_op_dur(filename_opt, op_name) - - avg_ori = total_times_ori / num_runs - avg_opt = total_times_opt / num_runs - speedup = (avg_ori - avg_opt) / avg_ori * 100 - print(f"{TF_origin}: {avg_ori:.4f} us per run") - print(f"{op_name}: {avg_opt:.4f} us per run") - 
print(f"improve: {speedup:.2f}%") + benchmark_op( + sess, + feed, + [out_ori1, out_ori2, out_ori3], + [out_opt1, out_opt2], + self.run_options, + self.run_metadata_ori, + self.run_metadata_opt, + op_name="KPFusedSparseReshape", + start_op="ori/StridedSlice", + end_op="ori/SparseReshape", + num_runs=1000, + tag="------TF_origin-----" + ) if __name__ == "__main__": diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_segment_reduce_test.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_segment_reduce_test.py index c98760ae..0e492617 100644 --- a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_segment_reduce_test.py +++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_segment_reduce_test.py @@ -3,15 +3,15 @@ import numpy as np import unittest from tensorflow.python.ops import gen_embedding_fused_ops -from utils.utils import perf_run, generate_timeline, wrapper_sess, extract_op_dur, extract_op_total_time +from utils.utils import benchmark_op def ori_fused_embedding_sparse_segment_reduce_graph(data, indices, slice_input, begin, end, strides, is_mean): slice_out = tf.strided_slice( slice_input, - begin= begin, - end= end, - strides= strides, + begin=begin, + end=end, + strides=strides, begin_mask=1, end_mask=1, shrink_axis_mask=2 @@ -20,15 +20,15 @@ def ori_fused_embedding_sparse_segment_reduce_graph(data, indices, slice_input, segment_ids = tf.cast(slice_out, dtype=tf.int32) if is_mean: output = tf.sparse.segment_mean( - data = data, - indices = indices, - segment_ids= segment_ids + data=data, + indices=indices, + segment_ids=segment_ids ) else: output = tf.sparse.segment_sum( - data = data, - indices = indices, - segment_ids= segment_ids + data=data, + indices=indices, + segment_ids=segment_ids ) output_shape = tf.shape(output) @@ -44,10 +44,9 @@ def opt_fused_embedding_sparse_segment_reduce_graph(data, indices, slice_input, indices=indices, slice_input=slice_input, begin=begin, - end = end, - strides = strides + end=end, + strides=strides ) - return custom_out, custom_slice_out else: custom_out, custom_slice_out = gen_embedding_fused_ops.KPFusedSparseSegmentReduce( data=data, @@ -55,10 +54,10 @@ def opt_fused_embedding_sparse_segment_reduce_graph(data, indices, slice_input, slice_input=slice_input, begin=begin, end = end, - strides = strides, - combiner = 0 + strides=strides, + combiner=0 ) - return custom_out, custom_slice_out + return custom_out, custom_slice_out class TestSparseSegmentMeanSlice(unittest.TestCase): @@ -79,18 +78,20 @@ class TestSparseSegmentMeanSlice(unittest.TestCase): def test_mean(self): with tf.Graph().as_default(): + data = tf.compat.v1.placeholder(tf.float32, shape=(4,3), name="data") + indices = tf.compat.v1.placeholder(tf.int32, shape=(3,), name="indices") + slice_input = tf.compat.v1.placeholder(tf.int64, shape=(3,2), name="slice_input") + begin = tf.compat.v1.placeholder(tf.int32, shape=(2,), name="begin") + end = tf.compat.v1.placeholder(tf.int32, shape=(2,), name="end") + strides = tf.compat.v1.placeholder(tf.int32, shape=(2,), name="strides") + base_data = np.array([[1.0, 2.0, 3.0], [3.0, 4.0, 5.0], [5.0, 6.0, 7.0], [5.0, 6.0, 7.0]], dtype=np.float32) # shape {4, 3} base_indices = np.array([0, 1, 2], dtype=np.int64) # shape {3} base_slice_input = np.array([[0, 0], [0, 2], [1, 2]], dtype=np.int64) # shape {3, 2} base_begin = [0, 1] base_end = [0, 2] base_strides = [1, 2] - data = tf.compat.v1.placeholder(tf.float32, shape=(4,3), name="data") - indices = 
tf.compat.v1.placeholder(tf.int32, name="indices") - slice_input = tf.compat.v1.placeholder(tf.int64, shape=(3,2), name="slice_input") - begin = tf.compat.v1.placeholder(tf.int32, name="begin") - end = tf.compat.v1.placeholder(tf.int32, name="end") - strides = tf.compat.v1.placeholder(tf.int32, name="strides") + feed = { data: base_data, indices: base_indices, @@ -99,6 +100,7 @@ class TestSparseSegmentMeanSlice(unittest.TestCase): end: base_end, strides: base_strides } + with tf.name_scope("ori"): out_ori1, out_ori2 = ori_fused_embedding_sparse_segment_reduce_graph(data,indices,slice_input,begin,end,strides,True) with tf.name_scope("opt"): @@ -120,61 +122,38 @@ class TestSparseSegmentMeanSlice(unittest.TestCase): err_msg="Segment count mismatch" ) - TF_origin = "--------TF_origin---------" - op_name = "KPFusedSparseSegmentReduce" - print("-" * 60) - print("-" * 60) - print("new test") - # 多次生成 timeline 并统计平均值 - num_runs = 500 - start_op = "ori/StridedSlice" - end_op = "ori/StridedSlice_1" - total_times_ori = 0 - total_times_opt = 0 - for i in range(num_runs): - sess.run( - [out_ori1, out_ori2], - feed_dict=feed, - options=self.run_options, - run_metadata=self.run_metadata_ori - ) - sess.run( - [out_opt1, out_opt2], - feed_dict=feed, - options=self.run_options, - run_metadata=self.run_metadata_opt - ) - filename_ori = f"{op_name}_ori" - filename_opt = f"{op_name}_opt" - generate_timeline(self.run_metadata_ori.step_stats, filename_ori) - generate_timeline(self.run_metadata_opt.step_stats, filename_opt) - - total_times_ori += extract_op_total_time(filename_ori, start_op, end_op) - total_times_opt += extract_op_dur(filename_opt, op_name) - - avg_ori = total_times_ori / num_runs - avg_opt = total_times_opt / num_runs - speedup = (avg_ori - avg_opt) / avg_ori * 100 - - print(f"{TF_origin}: {avg_ori:.4f} us per run") - print(f"{op_name}: {avg_opt:.4f} us per run") - print(f"improve: {speedup:.2f}%") + benchmark_op( + sess, + feed, + [out_ori1, out_ori2], + [out_opt1, out_opt2], + self.run_options, + self.run_metadata_ori, + self.run_metadata_opt, + op_name="KPFusedSparseSegmentReduce", + start_op="ori/StridedSlice", + end_op="ori/StridedSlice_1", + num_runs=500, + tag="--------TF_origin---------" + ) def test_sum(self): with tf.Graph().as_default(): + data = tf.compat.v1.placeholder(tf.float32, shape=(4,3), name="data") + indices = tf.compat.v1.placeholder(tf.int32, shape=(3,), name="indices") + slice_input = tf.compat.v1.placeholder(tf.int64, shape=(3,2), name="slice_input") + begin = tf.compat.v1.placeholder(tf.int32, shape=(2,), name="begin") + end = tf.compat.v1.placeholder(tf.int32, shape=(2,), name="end") + strides = tf.compat.v1.placeholder(tf.int32, shape=(2,), name="strides") + base_data = np.array([[1.0, 2.0, 3.0], [3.0, 4.0,5.0], [5.0, 6.0, 7.0], [5.0, 6.0, 7.0]], dtype=np.float32) # shape {4, 3} base_indices = np.array([0, 1, 2], dtype=np.int64) base_slice_input = np.array([[0, 0], [0, 2], [1, 2]], dtype=np.int64) base_begin = [0, 1] base_end = [0, 2] base_strides = [1, 2] - data = tf.compat.v1.placeholder(tf.float32, shape=(4,3), name="data") - indices = tf.compat.v1.placeholder(tf.int32, name="indices") - slice_input = tf.compat.v1.placeholder(tf.int64, shape=(3,2), name="slice_input") - begin = tf.compat.v1.placeholder(tf.int32, name="begin") - end = tf.compat.v1.placeholder(tf.int32, name="end") - strides = tf.compat.v1.placeholder(tf.int32, name="strides") + feed = { data: base_data, indices: base_indices, @@ -184,9 +163,9 @@ class TestSparseSegmentMeanSlice(unittest.TestCase): 
strides: base_strides } with tf.name_scope("ori"): - out_ori1, out_ori2 = ori_fused_embedding_sparse_segment_reduce_graph(data,indices,slice_input,begin,end,strides,False) + out_ori1, out_ori2 = ori_fused_embedding_sparse_segment_reduce_graph(data, indices, slice_input, begin, end, strides, False) with tf.name_scope("opt"): - out_opt1, out_opt2 = opt_fused_embedding_sparse_segment_reduce_graph(data,indices,slice_input,begin,end,strides,False) + out_opt1, out_opt2 = opt_fused_embedding_sparse_segment_reduce_graph(data,indices, slice_input, begin, end, strides, False) with tf.compat.v1.Session(config=self.config) as sess: out_ori_val1, out_ori_val2 = sess.run([out_ori1, out_ori2], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_ori) @@ -202,6 +181,21 @@ class TestSparseSegmentMeanSlice(unittest.TestCase): out_ori_val2, err_msg="Segment count mismatch" ) + + benchmark_op( + sess, + feed, + [out_ori1, out_ori2], + [out_opt1, out_opt2], + self.run_options, + self.run_metadata_ori, + self.run_metadata_opt, + op_name="KPFusedSparseSegmentReduce", + start_op="ori/StridedSlice", + end_op="ori/StridedSlice_1", + num_runs=1000, + tag="---------TF_origin--------" + ) if __name__ == "__main__": diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_select_test.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_select_test.py index 45aa8ff7..de59d532 100644 --- a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_select_test.py +++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_select_test.py @@ -3,7 +3,9 @@ import numpy as np import unittest from tensorflow.python.ops import gen_embedding_fused_ops -from utils.utils import perf_run, generate_timeline, wrapper_sess, extract_op_dur, extract_op_total_time +from utils.utils import benchmark_op + +np.random.seed(140) def ori_fused_embedding_sparse_select_graph(input_a, input_b, input_c): @@ -98,46 +100,22 @@ class TestKPFusedSparseSelect(unittest.TestCase): rtol=1e-5, err_msg="Output values mismatch" ) - - op_name = "KPFusedSparseSelect" - TF_origin = "-----TF_origin-----" - print("-" * 60) - print("-" * 60) - print("new test") - # 多次生成 timeline 并统计平均值 - num_runs = 1000 - start_op = "ori/Reshape" - end_op = "ori/Sub" - total_times_ori = 0 - total_times_opt = 0 - for i in range(num_runs): - sess.run( - [out_ori1, out_ori2, out_ori3], - feed_dict=feed, - options=self.run_options, - run_metadata=self.run_metadata_ori - ) - sess.run( - [out_opt1, out_opt2, out_opt3], - feed_dict=feed, - options=self.run_options, - run_metadata=self.run_metadata_opt - ) - filename_ori = f"{op_name}_ori" - filename_opt = f"{op_name}_opt" - generate_timeline(self.run_metadata_ori.step_stats, filename_ori) - generate_timeline(self.run_metadata_opt.step_stats, filename_opt) - - total_times_ori += extract_op_total_time(filename_ori, start_op, end_op) - total_times_opt += extract_op_dur(filename_opt, op_name) - - avg_ori = total_times_ori / num_runs - avg_opt = total_times_opt / num_runs - speedup = (avg_ori - avg_opt) / avg_ori * 100 - print(f"{TF_origin}: {avg_ori:.4f} us per run") - print(f"{op_name}: {avg_opt:.4f} us per run") - print(f"improve: {speedup:.2f}%") + benchmark_op( + sess, + feed, + [out_ori1, out_ori2, out_ori3], + [out_opt1, out_opt2, out_opt3], + self.run_options, + self.run_metadata_ori, + self.run_metadata_opt, + op_name="KPFusedSparseSelect", + start_op="ori/Reshape", + end_op="ori/Sub", + num_runs=1000, + tag="-----TF_origin-----" + ) + 
 if __name__ == "__main__":
     tf.compat.v1.disable_eager_execution()
diff --git a/tensorflow/python/grappler/embedding_fused_test/utils/utils.py b/tensorflow/python/grappler/embedding_fused_test/utils/utils.py
index cd42c5b4..f982f5dd 100644
--- a/tensorflow/python/grappler/embedding_fused_test/utils/utils.py
+++ b/tensorflow/python/grappler/embedding_fused_test/utils/utils.py
@@ -24,6 +24,64 @@ def extract_op_total_time(timeline_file, start_op, end_op):
     return end_time - start_time
 
 
+def benchmark_op(
+    sess,
+    feed,
+    out_ori,
+    out_opt,
+    run_options,
+    run_metadata_ori,
+    run_metadata_opt,
+    op_name,
+    start_op,
+    end_op,
+    num_runs=500,
+    tag="--------TF_origin---------"
+):
+    print("-" * 60)
+    print("-" * 60)
+    print("new test")
+
+    total_times_ori = 0.0
+    total_times_opt = 0.0
+
+    for i in range(num_runs):
+        # Run the original (unfused) graph
+        sess.run(
+            out_ori,
+            feed_dict=feed,
+            options=run_options,
+            run_metadata=run_metadata_ori
+        )
+        # Run the optimized (fused) graph
+        sess.run(
+            out_opt,
+            feed_dict=feed,
+            options=run_options,
+            run_metadata=run_metadata_opt
+        )
+
+        # Generate timeline files for this run
+        filename_ori = f"{op_name}_ori"
+        filename_opt = f"{op_name}_opt"
+        generate_timeline(run_metadata_ori.step_stats, filename_ori)
+        generate_timeline(run_metadata_opt.step_stats, filename_opt)
+
+        # Accumulate per-run latencies
+        total_times_ori += extract_op_total_time(filename_ori, start_op, end_op)
+        total_times_opt += extract_op_dur(filename_opt, op_name)
+
+    # Compute averages and speedup
+    avg_ori = total_times_ori / num_runs
+    avg_opt = total_times_opt / num_runs
+    speedup = (avg_ori - avg_opt) / avg_ori * 100 if avg_ori > 0 else 0
+
+    # Print results
+    print(f"{tag}: {avg_ori:.4f} us per run")
+    print(f"{op_name}: {avg_opt:.4f} us per run")
+    print(f"improve: {speedup:.2f}%")
+
+
 def perf_run(ori_func, opt_func, name, warmup=5, iters=5):
     print(f"\nWarmup ori: {warmup} iters")
     for _ in range(warmup):
-- 
Gitee

From 461a4f6fe24736dbcd4bb1494ad9031d1d40882e Mon Sep 17 00:00:00 2001
From: rayshine <1324789704@qq.com>
Date: Wed, 20 Aug 2025 11:12:45 +0800
Subject: [PATCH 7/9] Reformat test scripts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../fused_embedding_action_id_gather_test.py  | 17 ++++--
 .../fused_embedding_gather_test.py            |  6 +-
 .../fused_embedding_padding_test.py           | 20 +++++--
 ...ed_embedding_sparse_dynamic_stitch_test.py | 16 +++--
 .../fused_embedding_sparse_reshape_test.py    |  7 ++-
 ...ed_embedding_sparse_segment_reduce_test.py | 58 +++++++++++++++----
 .../fused_embedding_sparse_select_test.py     | 18 +++++-
 7 files changed, 108 insertions(+), 34 deletions(-)

diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_action_id_gather_test.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_action_id_gather_test.py
index 2bd76790..1ae2c1f5 100644
--- a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_action_id_gather_test.py
+++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_action_id_gather_test.py
@@ -1,7 +1,9 @@
-import tensorflow as tf
-import numpy as np
+# Copyright 2025 The Huawei Technologies Co. Authors. All Rights Reserved.
import unittest +import numpy as np +import tensorflow as tf + from tensorflow.python.ops import gen_embedding_fused_ops from utils.utils import benchmark_op @@ -70,8 +72,12 @@ class TestFusedEmbeddingActionIdGather(unittest.TestCase): # Create tf session with tf.compat.v1.Session(config=self.config) as sess: # functest - out_ori_val = sess.run([out_ori], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_ori) - out_opt_val = sess.run([out_opt], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_opt) + out_ori_val = sess.run( + [out_ori], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_ori + ) + out_opt_val = sess.run( + [out_opt], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_opt + ) np.testing.assert_array_equal( out_ori_val, @@ -90,7 +96,8 @@ class TestFusedEmbeddingActionIdGather(unittest.TestCase): op_name="KPFusedEmbeddingActionIdGather", start_op="ori/stack_1", end_op="ori/concat", - num_runs=1000 + num_runs=10000, + tag="----------TF_origin-----------" ) diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_gather_test.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_gather_test.py index 241a4a9a..4e1755ef 100644 --- a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_gather_test.py +++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_gather_test.py @@ -1,6 +1,8 @@ +# Copyright 2025 The Huawei Technologies Co. Authors. All Rights Reserved. +import unittest + import tensorflow as tf import numpy as np -import unittest from tensorflow.python.ops import gen_embedding_fused_ops from utils.utils import benchmark_op @@ -128,7 +130,7 @@ class TestFusedGather(unittest.TestCase): op_name="KPFusedGather", start_op="ori/strided_slice_1", end_op="ori/GatherV2_1", - num_runs=1000, + num_runs=10000, tag="--TF_origin--" ) diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_padding_test.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_padding_test.py index 61f00534..0b943792 100644 --- a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_padding_test.py +++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_padding_test.py @@ -1,6 +1,8 @@ +# Copyright 2025 The Huawei Technologies Co. Authors. All Rights Reserved. 
+import unittest + import tensorflow as tf import numpy as np -import unittest from tensorflow.python.ops import gen_embedding_fused_ops from utils.utils import benchmark_op @@ -90,8 +92,12 @@ class TestFusedEmbeddingPadding(unittest.TestCase): # Create tf session with tf.compat.v1.Session(config=self.config) as sess: # functest - ori_result = sess.run([out_ori], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_ori) - opt_result = sess.run([out_opt], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_opt) + ori_result = sess.run( + [out_ori], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_ori + ) + opt_result = sess.run( + [out_opt], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_opt + ) np.testing.assert_array_equal( ori_result, @@ -137,8 +143,12 @@ class TestFusedEmbeddingPadding(unittest.TestCase): # Create tf session with tf.compat.v1.Session(config=self.config) as sess: # functest - ori_result = sess.run([out_ori], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_ori) - opt_result = sess.run([out_opt], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_opt) + ori_result = sess.run( + [out_ori], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_ori + ) + opt_result = sess.run( + [out_opt], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_opt + ) np.testing.assert_array_equal( ori_result, diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_dynamic_stitch_test.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_dynamic_stitch_test.py index 17b0bb05..ab471db8 100644 --- a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_dynamic_stitch_test.py +++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_dynamic_stitch_test.py @@ -1,6 +1,8 @@ +# Copyright 2025 The Huawei Technologies Co. Authors. All Rights Reserved. 
+import unittest + import tensorflow as tf import numpy as np -import unittest from tensorflow.python.ops import gen_embedding_fused_ops from utils.utils import benchmark_op @@ -125,8 +127,12 @@ class TestSparseDynamicStitch(unittest.TestCase): # Create tf session with tf.compat.v1.Session(config=self.config) as sess: # functest - out_ori_val = sess.run([out_ori], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_ori) - out_opt_val = sess.run([out_opt], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_opt) + out_ori_val = sess.run( + [out_ori], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_ori + ) + out_opt_val = sess.run( + [out_opt], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_opt + ) np.testing.assert_array_equal( out_ori_val, @@ -145,8 +151,8 @@ class TestSparseDynamicStitch(unittest.TestCase): op_name="KPFusedSparseDynamicStitch", start_op="ori/Reshape", end_op="ori/DynamicStitch", - num_runs=10, - tag="---------TF_origin---------" + num_runs=100, + tag="--------TF_origin---------" ) diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_reshape_test.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_reshape_test.py index f8184f20..f2ad66ff 100644 --- a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_reshape_test.py +++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_reshape_test.py @@ -1,7 +1,8 @@ -import tensorflow as tf -import numpy as np +# Copyright 2025 The Huawei Technologies Co. Authors. All Rights Reserved. import unittest +import tensorflow as tf +import numpy as np from tensorflow.python.ops import gen_embedding_fused_ops from utils.utils import benchmark_op @@ -128,7 +129,7 @@ class TestFusedSparseReshape(unittest.TestCase): op_name="KPFusedSparseReshape", start_op="ori/StridedSlice", end_op="ori/SparseReshape", - num_runs=1000, + num_runs=10000, tag="------TF_origin-----" ) diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_segment_reduce_test.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_segment_reduce_test.py index 0e492617..5536eb1c 100644 --- a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_segment_reduce_test.py +++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_segment_reduce_test.py @@ -1,6 +1,8 @@ +# Copyright 2025 The Huawei Technologies Co. Authors. All Rights Reserved. 
+import unittest + import tensorflow as tf import numpy as np -import unittest from tensorflow.python.ops import gen_embedding_fused_ops from utils.utils import benchmark_op @@ -85,7 +87,10 @@ class TestSparseSegmentMeanSlice(unittest.TestCase): end = tf.compat.v1.placeholder(tf.int32, shape=(2,), name="end") strides = tf.compat.v1.placeholder(tf.int32, shape=(2,), name="strides") - base_data = np.array([[1.0, 2.0, 3.0], [3.0, 4.0, 5.0], [5.0, 6.0, 7.0], [5.0, 6.0, 7.0]], dtype=np.float32) # shape {4, 3} + base_data = np.array( + [[1.0, 2.0, 3.0], [3.0, 4.0, 5.0], [5.0, 6.0, 7.0], [5.0, 6.0, 7.0]], + dtype=np.float32 + ) # shape {4, 3} base_indices = np.array([0, 1, 2], dtype=np.int64) # shape {3} base_slice_input = np.array([[0, 0], [0, 2], [1, 2]], dtype=np.int64) # shape {3, 2} base_begin = [0, 1] @@ -102,13 +107,27 @@ class TestSparseSegmentMeanSlice(unittest.TestCase): } with tf.name_scope("ori"): - out_ori1, out_ori2 = ori_fused_embedding_sparse_segment_reduce_graph(data,indices,slice_input,begin,end,strides,True) + out_ori1, out_ori2 = ori_fused_embedding_sparse_segment_reduce_graph( + data, indices, slice_input, begin, end, strides, True + ) with tf.name_scope("opt"): - out_opt1, out_opt2 = opt_fused_embedding_sparse_segment_reduce_graph(data,indices,slice_input,begin,end,strides,True) + out_opt1, out_opt2 = opt_fused_embedding_sparse_segment_reduce_graph( + data, indices, slice_input, begin, end, strides, True + ) with tf.compat.v1.Session(config=self.config) as sess: - out_ori_val1, out_ori_val2 = sess.run([out_ori1, out_ori2], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_ori) - out_opt_val1, out_opt_val2 = sess.run([out_opt1, out_opt2], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_opt) + out_ori_val1, out_ori_val2 = sess.run( + [out_ori1, out_ori2], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_ori + ) + out_opt_val1, out_opt_val2 = sess.run( + [out_opt1, out_opt2], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_opt + ) np.testing.assert_allclose( out_opt_val1, @@ -147,7 +166,10 @@ class TestSparseSegmentMeanSlice(unittest.TestCase): end = tf.compat.v1.placeholder(tf.int32, shape=(2,), name="end") strides = tf.compat.v1.placeholder(tf.int32, shape=(2,), name="strides") - base_data = np.array([[1.0, 2.0, 3.0], [3.0, 4.0,5.0], [5.0, 6.0, 7.0], [5.0, 6.0, 7.0]], dtype=np.float32) # shape {4, 3} + base_data = np.array( + [[1.0, 2.0, 3.0], [3.0, 4.0,5.0], [5.0, 6.0, 7.0], [5.0, 6.0, 7.0]], + dtype=np.float32 + ) # shape {4, 3} base_indices = np.array([0, 1, 2], dtype=np.int64) base_slice_input = np.array([[0, 0], [0, 2], [1, 2]], dtype=np.int64) base_begin = [0, 1] @@ -163,13 +185,27 @@ class TestSparseSegmentMeanSlice(unittest.TestCase): strides: base_strides } with tf.name_scope("ori"): - out_ori1, out_ori2 = ori_fused_embedding_sparse_segment_reduce_graph(data, indices, slice_input, begin, end, strides, False) + out_ori1, out_ori2 = ori_fused_embedding_sparse_segment_reduce_graph( + data, indices, slice_input, begin, end, strides, False + ) with tf.name_scope("opt"): - out_opt1, out_opt2 = opt_fused_embedding_sparse_segment_reduce_graph(data,indices, slice_input, begin, end, strides, False) + out_opt1, out_opt2 = opt_fused_embedding_sparse_segment_reduce_graph( + data,indices, slice_input, begin, end, strides, False + ) with tf.compat.v1.Session(config=self.config) as sess: - out_ori_val1, out_ori_val2 = sess.run([out_ori1, out_ori2], feed_dict=feed, 
options=self.run_options, run_metadata=self.run_metadata_ori) - out_opt_val1, out_opt_val2 = sess.run([out_opt1, out_opt2], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_opt) + out_ori_val1, out_ori_val2 = sess.run( + [out_ori1, out_ori2], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_ori + ) + out_opt_val1, out_opt_val2 = sess.run( + [out_opt1, out_opt2], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_opt + ) np.testing.assert_allclose( out_opt_val1, out_ori_val1, diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_select_test.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_select_test.py index de59d532..54c0926f 100644 --- a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_select_test.py +++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_select_test.py @@ -1,6 +1,8 @@ +# Copyright 2025 The Huawei Technologies Co. Authors. All Rights Reserved. +import unittest + import tensorflow as tf import numpy as np -import unittest from tensorflow.python.ops import gen_embedding_fused_ops from utils.utils import benchmark_op @@ -77,8 +79,18 @@ class TestKPFusedSparseSelect(unittest.TestCase): # Create tf session with tf.compat.v1.Session(config=self.config) as sess: # functest - out_ori_val1, out_ori_val2, out_ori_val3 = sess.run([out_ori1, out_ori2, out_ori3], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_ori) - out_opt_val1, out_opt_val2, out_opt_val3 = sess.run([out_opt1, out_opt2, out_opt3], feed_dict=feed, options=self.run_options, run_metadata=self.run_metadata_opt) + out_ori_val1, out_ori_val2, out_ori_val3 = sess.run( + [out_ori1, out_ori2, out_ori3], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_ori + ) + out_opt_val1, out_opt_val2, out_opt_val3 = sess.run( + [out_opt1, out_opt2, out_opt3], + feed_dict=feed, + options=self.run_options, + run_metadata=self.run_metadata_opt + ) np.testing.assert_allclose( out_ori_val1, -- Gitee From 254a67febf3450112e325b1b2093e5d2d89e1ba7 Mon Sep 17 00:00:00 2001 From: rayshine <1324789704@qq.com> Date: Wed, 20 Aug 2025 11:34:26 +0800 Subject: [PATCH 8/9] =?UTF-8?q?=E8=9E=8D=E5=90=88=E7=AE=97=E5=AD=90?= =?UTF-8?q?=E4=BC=98=E5=8C=96=EF=BC=9AActionIdGather=E5=A4=9A=E7=BA=BF?= =?UTF-8?q?=E7=A8=8B=E3=80=81Gather=20SIMD=E5=B9=B6=E8=A1=8C=E3=80=81Resha?= =?UTF-8?q?pe=E5=A4=9A=E7=BA=BF=E7=A8=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../embedding_fused_action_id_gather.cc | 80 +++---- .../core/kernels/embedding_fused_gather.cc | 54 +++-- .../core/kernels/embedding_fused_padding.cc | 27 ++- .../embedding_fused_sparse_dynamic_stitch.cc | 8 +- .../kernels/embedding_fused_sparse_reshape.cc | 202 ++++-------------- .../embedding_fused_sparse_segment_reduce.cc | 10 +- .../kernels/embedding_fused_sparse_select.cc | 31 +-- .../core/profiler/lib/profiler_session.cc | 2 +- 8 files changed, 134 insertions(+), 280 deletions(-) diff --git a/tensorflow/core/kernels/embedding_fused_action_id_gather.cc b/tensorflow/core/kernels/embedding_fused_action_id_gather.cc index e20b8e54..af60b4ab 100644 --- a/tensorflow/core/kernels/embedding_fused_action_id_gather.cc +++ b/tensorflow/core/kernels/embedding_fused_action_id_gather.cc @@ -13,23 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include -#include - -#include "tensorflow/core/framework/common_shape_fns.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/types.h" #include "tensorflow/core/framework/op_kernel.h" -namespace tensorflow { +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/util/work_sharder.h" +namespace tensorflow { + template -static void GatherV2Impl(OpKernelContext* context, - const float* params_data, - const TensorShape& params_shape, - const Tindices* indices_data, - const TensorShape& indices_shape, - int axis, Tensor* temp) { +static void GatherV2Impl(OpKernelContext* context, const float* params_data, const TensorShape& params_shape, + const Tindices* indices_data, const TensorShape& indices_shape, int axis, Tensor* temp) { TensorShape temp_shape; const int P0 = params_shape.dim_size(0); int P1 = 1; @@ -41,13 +33,10 @@ static void GatherV2Impl(OpKernelContext* context, temp_shape.AddDim(params_shape.dim_size(d)); P1 *= params_shape.dim_size(d); } - OP_REQUIRES_OK(context, - context->allocate_temp(DT_FLOAT, temp_shape, temp)); - VLOG(1) << "temp shape: " << temp->shape().DebugString(); + OP_REQUIRES_OK(context, context->allocate_temp(DT_FLOAT, temp_shape, temp)); const int num_indices = indices_shape.num_elements(); float* temp_data = temp->flat().data(); - VLOG(2) << "num_indices : " << num_indices; if (axis == 0) { const int slice_size = P1; for (int i = 0; i < num_indices; ++i) { @@ -55,19 +44,19 @@ static void GatherV2Impl(OpKernelContext* context, if (idx < 0 || idx >= P0) { LOG(FATAL) << "GatherV2 axis=0: index out of range: " << idx; } - std::memcpy(temp_data + i * slice_size, - params_data + idx * slice_size, - sizeof(float) * slice_size); + std::memcpy( + temp_data + i * slice_size, params_data + idx * slice_size, sizeof(float) * slice_size + ); } } else { LOG(FATAL) << "Only axis=0 is supported"; } - VLOG(1) << "temp value : " << temp->DebugString(100); } + template class KPFusedEmbeddingActionIdGatherOp : public OpKernel { - public: +public: explicit KPFusedEmbeddingActionIdGatherOp(OpKernelConstruction* context) : OpKernel(context) {} void Compute(OpKernelContext* context) override { @@ -77,40 +66,41 @@ class KPFusedEmbeddingActionIdGatherOp : public OpKernel { const Tensor& indices2 = context->input(2); const Tensor& pack_dim = context->input(3); - VLOG(1) << "indices1 shape: " << indices1.shape().DebugString(); - VLOG(1) << "params shape: " << params.shape().DebugString(); - VLOG(1) << "indices2 shape: " << indices2.shape().DebugString(); OP_REQUIRES(context, indices1.dims() <= 2, errors::InvalidArgument("indices1 dims must <= 2")); OP_REQUIRES(context, indices2.dims() <= 2, errors::InvalidArgument("indices2 dims must <= 2")); OP_REQUIRES(context, params.dims() == 2, errors::InvalidArgument("params dims must = 2")); OP_REQUIRES(context, pack_dim.NumElements() == 1, errors::InvalidArgument("pack_dim NumElements must = 1")); Tensor temp; - GatherV2Impl(context, params.flat().data(), params.shape(), - indices1.flat().data(), indices1.shape(), - 0, &temp); + GatherV2Impl(context, params.flat().data(), params.shape(), indices1.flat().data(), + indices1.shape(), 0, &temp); Tensor temp1; - GatherV2Impl(context, temp.flat().data(), temp.shape(), - indices2.flat().data(), indices2.shape(), - 0, &temp1); + GatherV2Impl(context, temp.flat().data(), temp.shape(), indices2.flat().data(), + 
indices2.shape(), 0, &temp1); int pack_size = pack_dim.scalar()(); - VLOG(1) << "pack_size value: " << pack_size; int a_reshaped_cols = temp1.NumElements() / pack_size; auto a_reshaped = temp1.shaped({pack_size, a_reshaped_cols}); - VLOG(1) << "a_reshaped_cols : " << a_reshaped_cols; Tensor* output; int output_cols = a_reshaped_cols + 1680; OP_REQUIRES_OK(context, - context->allocate_output(0, TensorShape({pack_size, output_cols}), &output)); - VLOG(1) << "output shape: " << output->shape().DebugString(); - auto output_matrix = output->matrix(); - output_matrix.slice( - Eigen::array{0, 0}, - Eigen::array{pack_size, a_reshaped_cols}) = a_reshaped; - - output_matrix.slice( - Eigen::array{0, a_reshaped_cols}, - Eigen::array{pack_size, 1680}).setZero(); + context->allocate_output(0, TensorShape({pack_size, output_cols}), &output)); + + auto a_reshaped_data = a_reshaped.data(); + auto worker_threads = context->device()->tensorflow_cpu_worker_threads(); + const int64 cost_per_unit = a_reshaped_cols + 1680; + float* base = output->matrix().data(); + Shard(worker_threads->num_threads, worker_threads->workers, pack_size, cost_per_unit, + [&](int64 start_row, int64 end_row) { + for (int64 row = start_row; row < end_row; ++row) { + float* dst_row = base + row * (a_reshaped_cols + 1680); + std::memcpy( + dst_row, a_reshaped_data + row * a_reshaped_cols, sizeof(float) * a_reshaped_cols + ); + std::memset( + dst_row + a_reshaped_cols, 0, sizeof(float) * 1680 + ); + } + }); } }; @@ -119,7 +109,7 @@ class KPFusedEmbeddingActionIdGatherOp : public OpKernel { .Device(DEVICE_CPU) \ .TypeConstraint("Tindices1") \ .TypeConstraint("Tindices2"), \ - KPFusedEmbeddingActionIdGatherOp); + KPFusedEmbeddingActionIdGatherOp) REGISTER_CPU_KERNEL(int64, int32) REGISTER_CPU_KERNEL(int32, int32) diff --git a/tensorflow/core/kernels/embedding_fused_gather.cc b/tensorflow/core/kernels/embedding_fused_gather.cc index 51ec5776..6927d6b8 100644 --- a/tensorflow/core/kernels/embedding_fused_gather.cc +++ b/tensorflow/core/kernels/embedding_fused_gather.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/core/framework/common_shape_fns.h" -#include "tensorflow/core/framework/op.h" +#include + #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/util/work_sharder.h" @@ -22,7 +22,7 @@ limitations under the License. 
using namespace tensorflow; class KPFusedGather : public OpKernel { - public: +public: explicit KPFusedGather(OpKernelConstruction* context) : OpKernel(context) { } void Compute(OpKernelContext* context) override { @@ -33,16 +33,10 @@ class KPFusedGather : public OpKernel { OP_REQUIRES(context, slice_input.dims() == 2, errors::Internal("slice_input dims must == 2")); OP_REQUIRES(context, data.dims() == 2, errors::Internal("indentity dims must == 2")); - VLOG(1) << "Input indentity shape: " << data.shape().DebugString(); - VLOG(1) << "Input slice_input shape: " << slice_input.shape().DebugString(); - VLOG(1) << "Input begin value: " << begin.SummarizeValue(10); - int32 col = begin.flat().data()[1]; auto data_mat = data.matrix(); auto slice_input_mat = slice_input.matrix(); - VLOG(1) << "Column index from begin: " << col; - std::vector unique_values; std::vector indices(slice_input.dim_size(0)); std::unordered_map value_to_index; @@ -60,41 +54,41 @@ class KPFusedGather : public OpKernel { } Tensor* out_shape = nullptr; - Tensor* out_indices = nullptr; - Tensor* out_data = nullptr; - OP_REQUIRES_OK(context, context->allocate_output( - 0, TensorShape({unique_values.size()}), &out_shape)); - std::memcpy(out_shape->data(), unique_values.data(), unique_values.size() * sizeof(int64_t)); - + 0, TensorShape({1}), &out_shape)); + out_shape->flat()(0) = static_cast(unique_values.size()); + + Tensor* out_indices = nullptr; OP_REQUIRES_OK(context, context->allocate_output( - 1, TensorShape({static_cast(indices.size())}), &out_indices)); + 1, TensorShape({static_cast(indices.size())}), &out_indices)); std::memcpy(out_indices->data(), indices.data(), indices.size() * sizeof(int32_t)); - OP_REQUIRES(context, data.dim_size(1) * unique_values.size() % 12 == 0, + + OP_REQUIRES(context, data.dim_size(1) * unique_values.size() % 12 == 0, errors::Internal("cannot reshape to [-1, 12]")); - - std::vector gather1_result; - for (auto &indice : unique_values) { - for (int64_t i = 0; i < data.dim_size(1); ++i) { - gather1_result.push_back(data_mat(indice, i)); - } - } + Tensor* out_data = nullptr; OP_REQUIRES_OK(context, context->allocate_output( - 2, TensorShape({unique_values.size(), 12}), &out_data)); + 2, TensorShape({unique_values.size(), 12}), &out_data)); auto output_data = out_data->matrix(); - int cur_row = 0; - for (auto &indice : unique_values) { + + for (int64_t cur_row = 0; cur_row < unique_values.size(); ++cur_row) { + int64_t idx = unique_values[cur_row]; for (int i = 0; i < 12; ++i) { - output_data(cur_row, i) = gather1_result[12 * indice + i]; + const float* src = &data_mat(idx, 0); + float* dst = &output_data(cur_row, 0); + float32x4_t v0 = vld1q_f32(src); + float32x4_t v1 = vld1q_f32(src + 4); + float32x4_t v2 = vld1q_f32(src + 8); + vst1q_f32(dst, v0); + vst1q_f32(dst + 4, v1); + vst1q_f32(dst + 8, v2); } - cur_row++; } } }; REGISTER_KERNEL_BUILDER(Name("KPFusedGather").Device(DEVICE_CPU), - KPFusedGather); \ No newline at end of file + KPFusedGather); diff --git a/tensorflow/core/kernels/embedding_fused_padding.cc b/tensorflow/core/kernels/embedding_fused_padding.cc index e36fbf7f..98351004 100644 --- a/tensorflow/core/kernels/embedding_fused_padding.cc +++ b/tensorflow/core/kernels/embedding_fused_padding.cc @@ -13,21 +13,21 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include #include #include "tensorflow/core/framework/common_shape_fns.h" #include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/op_kernel.h" + namespace tensorflow { using shape_inference::InferenceContext; using shape_inference::ShapeHandle; class KPFusedEmbeddingPaddingOp : public OpKernel { - public: +public: explicit KPFusedEmbeddingPaddingOp(OpKernelConstruction* context) : OpKernel(context) { fast_ = (type_string() == "KPFusedEmbeddingPaddingFast"); } @@ -67,32 +67,29 @@ class KPFusedEmbeddingPaddingOp : public OpKernel { int output_rows = padding_rows + input.dim_size(0); int output_cols = input.dim_size(1); OP_REQUIRES( - context, - output_rows * output_cols % reshape_cols == 0, - errors::InvalidArgument("padding cannot reshape to [-1, ", reshape_cols, "]") + context, output_rows * output_cols % reshape_cols == 0, + errors::InvalidArgument("padding cannot reshape to [-1, ", reshape_cols, "]") ); int reshape_rows = output_rows * output_cols / reshape_cols; if (fast_) { - OP_REQUIRES_OK(context, - context->allocate_output(1, TensorShape({}), - &output1)); + OP_REQUIRES_OK(context, context->allocate_output(1, TensorShape({}), &output1)); output1->scalar()() = reshape_rows; return; } OP_REQUIRES_OK(context, context->allocate_temp(DT_FLOAT, TensorShape({padding_rows + input_rows_value, output_cols}), - &padding)); + &padding)); auto input_matrix = input.matrix(); auto padding_matrix = padding.matrix(); padding_matrix.slice( - Eigen::array{0, 0}, - Eigen::array{input_rows_value, output_cols}) = input_matrix; + Eigen::array{0, 0}, + Eigen::array{input_rows_value, output_cols}) = input_matrix; padding_matrix.slice( - Eigen::array{input_rows_value, 0}, - Eigen::array{padding_rows, output_cols}).setZero(); + Eigen::array{input_rows_value, 0}, + Eigen::array{padding_rows, output_cols}).setZero(); TensorShape reshaped_shape({reshape_rows, reshape_cols}); OP_REQUIRES_OK(context, @@ -100,7 +97,7 @@ class KPFusedEmbeddingPaddingOp : public OpKernel { output1->flat() = padding.flat(); } - private: +private: bool fast_; }; diff --git a/tensorflow/core/kernels/embedding_fused_sparse_dynamic_stitch.cc b/tensorflow/core/kernels/embedding_fused_sparse_dynamic_stitch.cc index 9937a07e..e1cdbc5c 100644 --- a/tensorflow/core/kernels/embedding_fused_sparse_dynamic_stitch.cc +++ b/tensorflow/core/kernels/embedding_fused_sparse_dynamic_stitch.cc @@ -13,12 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include - #include -#include "tensorflow/core/framework/common_shape_fns.h" -#include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/util/work_sharder.h" @@ -26,7 +22,7 @@ limitations under the License. 
using namespace tensorflow; class KPFusedSparseDynamicStitchOp : public OpKernel { - public: +public: explicit KPFusedSparseDynamicStitchOp(OpKernelConstruction* context) : OpKernel(context) {} @@ -78,4 +74,4 @@ class KPFusedSparseDynamicStitchOp : public OpKernel { }; REGISTER_KERNEL_BUILDER(Name("KPFusedSparseDynamicStitch").Device(DEVICE_CPU), - KPFusedSparseDynamicStitchOp); + KPFusedSparseDynamicStitchOp); \ No newline at end of file diff --git a/tensorflow/core/kernels/embedding_fused_sparse_reshape.cc b/tensorflow/core/kernels/embedding_fused_sparse_reshape.cc index 43428b88..219efaba 100644 --- a/tensorflow/core/kernels/embedding_fused_sparse_reshape.cc +++ b/tensorflow/core/kernels/embedding_fused_sparse_reshape.cc @@ -13,140 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/core/framework/common_shape_fns.h" -#include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/util/work_sharder.h" -#include "tensorflow/core/kernels/reshape_util.h" -#include "tensorflow/core/framework/register_types.h" -#include "tensorflow/core/framework/tensor_util.h" -#include "tensorflow/core/framework/types.h" -#include "tensorflow/core/lib/gtl/inlined_vector.h" using namespace tensorflow; -static void ReshapeKp(OpKernelContext *context, const Tensor &input_indices_in, - const Tensor &input_shape_in, const Tensor &target_shape_in, - int output_indices_idx, int output_shape_idx) { - OP_REQUIRES(context, TensorShapeUtils::IsMatrix(input_indices_in.shape()), - errors::InvalidArgument( - "Input indices should be a matrix but received shape ", - input_indices_in.shape().DebugString())); - OP_REQUIRES(context, TensorShapeUtils::IsVector(input_shape_in.shape()), - errors::InvalidArgument( - "Input shape should be a vector but received shape ", - input_shape_in.shape().DebugString())); - OP_REQUIRES(context, TensorShapeUtils::IsVector(target_shape_in.shape()), - errors::InvalidArgument( - "Target shape should be a vector but received shape ", - target_shape_in.shape().DebugString())); - - const int64 input_rank = input_shape_in.NumElements(); - const int64 output_rank = target_shape_in.NumElements(); - const TensorShape input_shape(input_shape_in.vec()); - const int64 dense_size = input_shape.num_elements(); - const int64 nnz = input_indices_in.shape().dim_size(0); - - TensorShape output_shape; - int64 product = 1; - int unknown_index = -1; - auto target_shape = target_shape_in.vec(); - for (int d = 0; d < output_rank; ++d) { - const int64 size = target_shape(d); - if (size == -1) { - OP_REQUIRES( - context, unknown_index == -1, - errors::InvalidArgument("only one output dimension may be -1, " - "not both ", - unknown_index, " and ", d)); - unknown_index = d; - output_shape.AddDim(1); - } else { - OP_REQUIRES(context, size >= 0, - errors::InvalidArgument("size ", d, - " must be non-negative, not ", size)); - product *= size; - output_shape.AddDim(size); - } - } - if (unknown_index != -1) { - OP_REQUIRES( - context, product > 0, - errors::InvalidArgument("reshape cannot infer the missing " - "input size for an empty tensor unless all " - "specified input sizes are non-zero")); - const int64 missing = dense_size / product; - OP_REQUIRES( - context, product * missing == dense_size, - errors::InvalidArgument( - "Input to reshape is a SparseTensor with 
", dense_size, - " dense values, but the requested shape requires a multiple of ", - product, ". input_shape=", input_shape.DebugString(), - " output_shape=", output_shape.DebugString())); - output_shape.set_dim(unknown_index, missing); - } - - OP_REQUIRES( - context, output_shape.num_elements() == dense_size, - errors::InvalidArgument("Input to reshape is a tensor with ", dense_size, - " dense values, but the requested shape has ", - output_shape.num_elements(), - ". input_shape=", input_shape.DebugString(), - " output_shape=", output_shape.DebugString())); - - if (input_shape == output_shape) { - context->set_output(output_indices_idx, input_indices_in); - context->set_output(output_shape_idx, input_shape_in); - return; - } - - gtl::InlinedVector input_strides(input_rank); - if (input_rank > 0) { - input_strides[input_rank - 1] = 1; - for (int d = input_rank - 2; d >= 0; --d) { - input_strides[d] = input_strides[d + 1] * input_shape.dim_size(d + 1); - } - } - - gtl::InlinedVector output_strides(output_rank); - if (output_rank > 0) { - output_strides[output_rank - 1] = 1; - for (int d = output_rank - 2; d >= 0; --d) { - output_strides[d] = output_strides[d + 1] * output_shape.dim_size(d + 1); - } - } - - Tensor *result_indices = nullptr; - OP_REQUIRES_OK(context, - context->allocate_output(output_indices_idx, - TensorShape({nnz, output_rank}), - &result_indices)); - auto input_ind = input_indices_in.matrix(); - auto output_ind = result_indices->matrix(); - for (int i = 0; i < nnz; ++i) { - int64 id = 0; - for (int j = 0; j < input_rank; ++j) { - id += input_ind(i, j) * input_strides[j]; - } - for (int j = 0; j < output_rank; ++j) { - output_ind(i, j) = id / output_strides[j]; - id %= output_strides[j]; - } - } - - Tensor *result_shape = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(output_shape_idx, - TensorShape({output_rank}), - &result_shape)); - auto output_shape_vec = result_shape->vec(); - for (int j = 0; j < output_shape.dims(); ++j) { - output_shape_vec(j) = output_shape.dim_size(j); - } -} class KPFusedSparseReshapeOp : public OpKernel { - public: +public: explicit KPFusedSparseReshapeOp(OpKernelConstruction* context) : OpKernel(context) { } void Compute(OpKernelContext* context) override { @@ -155,39 +30,54 @@ class KPFusedSparseReshapeOp : public OpKernel { const Tensor& new_shape = context->input(2); OP_REQUIRES(context, slice_input.dims() == 2, errors::Internal("slice_input dims must == 2")); - - VLOG(1) << "Input slice_input shape: " << slice_input.shape().DebugString(); - VLOG(1) << "Input begin value: " << begin.DebugString(); - VLOG(1) << "Input new_shape value: " << new_shape.DebugString(); - + int32 col = begin.flat().data()[1]; - int64_t stridedslice57_out = slice_input.dim_size(0); - auto slice_input_mat = slice_input.matrix(); - - VLOG(1) << "stridedslice57_out: " << stridedslice57_out; - VLOG(1) << "slice_input.dim_size(0): " << slice_input.dim_size(0); - VLOG(1) << "slice_input.dim_size(1): " << slice_input.dim_size(1); - OP_REQUIRES(context, stridedslice57_out == slice_input.dim_size(0), errors::Internal("concat shape mismatch")); - VLOG(1) << "Column index from begin: " << col; - VLOG(1) << "indices size: " << stridedslice57_out; - - Tensor shape_in(DT_INT64, TensorShape({2})); - auto tensor_flat = shape_in.flat(); - tensor_flat(0) = stridedslice57_out; - tensor_flat(1) = 2; - - Tensor indices_in(DT_INT64, TensorShape({stridedslice57_out, 2})); - auto indices_in_mat = indices_in.matrix(); - for (int i = 0; i < stridedslice57_out; ++i) { - 
indices_in_mat(i, 0) = i; - indices_in_mat(i, 1) = slice_input_mat(i, col); + int64 nnz = slice_input.dim_size(0); + TensorShape output_shape; + int64 product = 2 * nnz; + auto target_shape = new_shape.vec(); + + OP_REQUIRES(context, !(target_shape(0)==-1&&target_shape(1)==-1), errors::InvalidArgument("only one output dimension may be -1.")); + OP_REQUIRES(context, (target_shape(0)>0||target_shape(0)==-1) && (target_shape(1)>0||target_shape(1)==-1), errors::InvalidArgument("must be non-negative.")); + OP_REQUIRES(context, product%target_shape(0)==0 && product%target_shape(1)==0, errors::InvalidArgument("reshape cannot infer the missing.")); + + output_shape.AddDim(target_shape(0) == -1 ? product / target_shape(1) : target_shape(0)); + output_shape.AddDim(target_shape(1) == -1 ? product / target_shape(0) : target_shape(1)); + + if (output_shape.dim_size(0) == nnz && output_shape.dim_size(1) == 2) { + context->set_output(0, slice_input); + Tensor input_shape_in(DT_INT64, TensorShape({2})); + auto tensor_flat = input_shape_in.flat(); + tensor_flat(0) = nnz; + tensor_flat(1) = 2; + context->set_output(1, input_shape_in); + return; } - Tensor new_shape_in(DT_INT64, TensorShape({2})); - auto newshape_tensor_flat = new_shape_in.flat(); - newshape_tensor_flat(0) = new_shape.flat()(0); - newshape_tensor_flat(1) = new_shape.flat()(1); - ReshapeKp(context, indices_in, shape_in, new_shape_in, 0, 1); + Tensor *result_indices = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape({nnz, 2}), &result_indices)); + auto input_ind = slice_input.matrix(); + auto output_ind = result_indices->matrix(); + + const int32 target_shape1 = target_shape(1); + auto worker_threads = context->device()->tensorflow_cpu_worker_threads(); + const int64 cost_per_unit = 50; + + Shard(worker_threads->num_threads, worker_threads->workers, nnz, cost_per_unit, + [&](int64 start, int64 limit) { + for (int64 i = start; i < limit; ++i) { + int base_index = 2 * i + input_ind(i, col); + output_ind(i, 1) = base_index % target_shape1; + output_ind(i, 0) = base_index / target_shape1; + } + }); + + Tensor *result_shape = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(1, TensorShape({2}), &result_shape)); + auto output_shape_vec = result_shape->vec(); + for (int j = 0; j < output_shape.dims(); ++j) { + output_shape_vec(j) = output_shape.dim_size(j); + } } }; diff --git a/tensorflow/core/kernels/embedding_fused_sparse_segment_reduce.cc b/tensorflow/core/kernels/embedding_fused_sparse_segment_reduce.cc index 19cc7394..7472fbb9 100644 --- a/tensorflow/core/kernels/embedding_fused_sparse_segment_reduce.cc +++ b/tensorflow/core/kernels/embedding_fused_sparse_segment_reduce.cc @@ -15,8 +15,6 @@ limitations under the License. 
#include -#include "tensorflow/core/framework/common_shape_fns.h" -#include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/util/work_sharder.h" @@ -25,7 +23,7 @@ using namespace tensorflow; template class KPFusedSparseSegmentReduceOp : public OpKernel { - public: +public: explicit KPFusedSparseSegmentReduceOp(OpKernelConstruction* context) : OpKernel(context) { int combiner_mode; @@ -138,7 +136,7 @@ class KPFusedSparseSegmentReduceOp : public OpKernel { } } - private: +private: bool is_mean_; }; @@ -146,7 +144,7 @@ class KPFusedSparseSegmentReduceOp : public OpKernel { REGISTER_KERNEL_BUILDER(Name("KPFusedSparseSegmentReduce") \ .Device(DEVICE_CPU) \ .TypeConstraint("Tidx"), \ - KPFusedSparseSegmentReduceOp); + KPFusedSparseSegmentReduceOp) REGISTER_KERNEL(int64) REGISTER_KERNEL(int32) -#undef REGISTER_KERNEL +#undef REGISTER_KERNEL \ No newline at end of file diff --git a/tensorflow/core/kernels/embedding_fused_sparse_select.cc b/tensorflow/core/kernels/embedding_fused_sparse_select.cc index 086092d5..89a42d14 100644 --- a/tensorflow/core/kernels/embedding_fused_sparse_select.cc +++ b/tensorflow/core/kernels/embedding_fused_sparse_select.cc @@ -16,22 +16,19 @@ limitations under the License. #include #include -#include "tensorflow/core/framework/common_shape_fns.h" -#include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/platform/logging.h" using namespace tensorflow; class KPFusedSparseSelect : public OpKernel { - public: +public: explicit KPFusedSparseSelect(OpKernelConstruction* context) : OpKernel(context) { - } void Compute(OpKernelContext* context) override { - const Tensor& input_a = context->input(0); const Tensor& input_b = context->input(1); const Tensor& input_c = context->input(2); @@ -39,12 +36,10 @@ class KPFusedSparseSelect : public OpKernel { auto a_flat = input_a.flat(); auto b_flat = input_b.flat(); auto c_flat = input_c.flat(); - VLOG(1) << "input_a shape: " << input_a.shape().DebugString(); - VLOG(1) << "input_b shape: " << input_b.shape().DebugString(); - VLOG(1) << "input_c shape: " << input_c.shape().DebugString(); - OP_REQUIRES(context,input_a.NumElements() == input_b.NumElements(), + + OP_REQUIRES(context, input_a.NumElements() == input_b.NumElements(), errors::InvalidArgument("Input num elements must match")); - OP_REQUIRES(context,input_a.NumElements() == input_c.NumElements(), + OP_REQUIRES(context, input_a.NumElements() == input_c.NumElements(), errors::InvalidArgument("Input num elements must match")); auto N = input_a.NumElements(); @@ -58,10 +53,10 @@ class KPFusedSparseSelect : public OpKernel { auto b_equal_node0 = (b_reshaped_tensor == 4563); auto b_equal_node1 = (b_reshaped_tensor == 10831); - Eigen::Tensor tensor_ones(N, 1); + Eigen::Tensor tensor_ones(N, 1); tensor_ones.setConstant(1.0f); - Eigen::Tensor tensor_zeros(N, 1); + Eigen::Tensor tensor_zeros(N, 1); tensor_zeros.setConstant(0.0f); auto select_2412 = b_equal_node0.select(tensor_ones, a_greater_casted); @@ -74,13 +69,9 @@ class KPFusedSparseSelect : public OpKernel { Tensor* output_y = nullptr; Tensor* output_w = nullptr; - OP_REQUIRES_OK(context, - context->allocate_output(0,TensorShape({N, 1}), &output_x)); - OP_REQUIRES_OK(context, - context->allocate_output(1,TensorShape({N, 1}), &output_y)); - OP_REQUIRES_OK(context, - 
context->allocate_output(2,TensorShape({N, 2}), &output_w));
+    OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape({N, 1}), &output_x));
+    OP_REQUIRES_OK(context, context->allocate_output(1, TensorShape({N, 1}), &output_y));
+    OP_REQUIRES_OK(context, context->allocate_output(2, TensorShape({N, 2}), &output_w));
 
     Eigen::TensorMap<Eigen::Tensor<float, 2, Eigen::RowMajor>> map_output_x(
         output_x->flat<float>().data(),
@@ -102,9 +93,7 @@ class KPFusedSparseSelect : public OpKernel {
         output_w->dim_size(1)
     );
     map_output_w = concat_out;
-
   }
-
 };
 
 REGISTER_KERNEL_BUILDER(Name("KPFusedSparseSelect").Device(DEVICE_CPU),
diff --git a/tensorflow/core/profiler/lib/profiler_session.cc b/tensorflow/core/profiler/lib/profiler_session.cc
index 982a0f93..24b820af 100644
--- a/tensorflow/core/profiler/lib/profiler_session.cc
+++ b/tensorflow/core/profiler/lib/profiler_session.cc
@@ -156,7 +156,7 @@ ProfilerSession::ProfilerSession(const profiler::ProfilerOptions& options)
     return;
   }
 
-  LOG(INFO) << "Profiler session started.";
+  // LOG(INFO) << "Profiler session started.";
 
 #if !defined(IS_MOBILE_PLATFORM)
   CreateProfilers(options, &profilers_);
-- 
Gitee

From 541e1c7c34efd6f4574e10320c03994d802be8f6 Mon Sep 17 00:00:00 2001
From: rayshine <1324789704@qq.com>
Date: Wed, 20 Aug 2025 17:09:10 +0800
Subject: [PATCH 9/9] Fix the fused sparse reshape op's new_shape dtype: int32 --> int64
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tensorflow/core/kernels/embedding_fused_sparse_reshape.cc | 8 ++++----
 .../fused_embedding_sparse_reshape_test.py                | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tensorflow/core/kernels/embedding_fused_sparse_reshape.cc b/tensorflow/core/kernels/embedding_fused_sparse_reshape.cc
index 219efaba..fb0fa578 100644
--- a/tensorflow/core/kernels/embedding_fused_sparse_reshape.cc
+++ b/tensorflow/core/kernels/embedding_fused_sparse_reshape.cc
@@ -35,7 +35,7 @@ public:
     int64 nnz = slice_input.dim_size(0);
     TensorShape output_shape;
     int64 product = 2 * nnz;
-    auto target_shape = new_shape.vec<int32>();
+    auto target_shape = new_shape.vec<int64>();
 
     OP_REQUIRES(context, !(target_shape(0)==-1&&target_shape(1)==-1), errors::InvalidArgument("only one output dimension may be -1."));
     OP_REQUIRES(context, (target_shape(0)>0||target_shape(0)==-1) && (target_shape(1)>0||target_shape(1)==-1), errors::InvalidArgument("must be non-negative."));
@@ -59,14 +59,14 @@ public:
     auto input_ind = slice_input.matrix<int64>();
     auto output_ind = result_indices->matrix<int64>();
 
-    const int32 target_shape1 = target_shape(1);
+    const int64 target_shape1 = target_shape(1);
     auto worker_threads = context->device()->tensorflow_cpu_worker_threads();
     const int64 cost_per_unit = 50;
 
     Shard(worker_threads->num_threads, worker_threads->workers, nnz, cost_per_unit,
       [&](int64 start, int64 limit) {
         for (int64 i = start; i < limit; ++i) {
-          int base_index = 2 * i + input_ind(i, col);
+          int64 base_index = 2 * i + input_ind(i, col);
           output_ind(i, 1) = base_index % target_shape1;
           output_ind(i, 0) = base_index / target_shape1;
         }
@@ -75,7 +75,7 @@ public:
     Tensor *result_shape = nullptr;
     OP_REQUIRES_OK(context, context->allocate_output(1, TensorShape({2}), &result_shape));
     auto output_shape_vec = result_shape->vec<int64>();
-    for (int j = 0; j < output_shape.dims(); ++j) {
+    for (int64 j = 0; j < output_shape.dims(); ++j) {
       output_shape_vec(j) = output_shape.dim_size(j);
     }
   }
diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_reshape_test.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_reshape_test.py
index f2ad66ff..8236393d 100644
--- a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_reshape_test.py
+++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_sparse_reshape_test.py
@@ -77,7 +77,7 @@ class TestFusedSparseReshape(unittest.TestCase):
         with tf.Graph().as_default():
             slice_input = tf.compat.v1.placeholder(tf.int64, shape=(4,2), name="slice_input")
             begin = tf.compat.v1.placeholder(tf.int32, shape=(2,), name="begin")
-            newshape = tf.compat.v1.placeholder(tf.int32, shape=(2,), name="newshape")
+            newshape = tf.compat.v1.placeholder(tf.int64, shape=(2,), name="newshape")
             base_slice_input = np.array([[0, 0], [0, 1], [1, 2], [3, 4]], dtype=np.int64)
            base_begin = [0, 1]
            base_newshape = [2, 4]
-- 
Gitee
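
As a quick cross-check of the fused sparse-reshape kernel patched above, here is a small standalone NumPy sketch (not part of the patch series) of the index arithmetic that KPFusedSparseReshape appears to perform after patch 9: row i of slice_input is linearized as 2*i + slice_input[i, col] over the implicit [nnz, 2] shape, then re-expressed in the requested shape using int64 arithmetic. The helper name and the input values below are illustrative only.

import numpy as np

def fused_sparse_reshape_ref(slice_input, begin, new_shape):
    # Mirror of the kernel's int64 index math; the implicit input shape is [nnz, 2].
    col = begin[1]
    nnz = slice_input.shape[0]
    total = 2 * nnz
    d0, d1 = new_shape
    d0 = total // d1 if d0 == -1 else d0   # resolve a single -1, as the kernel does
    d1 = total // d0 if d1 == -1 else d1
    base = 2 * np.arange(nnz, dtype=np.int64) + slice_input[:, col]
    indices = np.stack([base // d1, base % d1], axis=1)
    return indices, np.array([d0, d1], dtype=np.int64)

# Illustrative inputs (not the unit-test values): 3 rows reshaped into [3, 2].
slice_input = np.array([[0, 0], [0, 1], [1, 0]], dtype=np.int64)
indices, shape = fused_sparse_reshape_ref(slice_input, begin=[0, 1], new_shape=[3, 2])
print(indices)  # [[0 0] [1 1] [2 0]]
print(shape)    # [3 2]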
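In the same spirit, a pure-Python/NumPy sketch of what the NEON-accelerated KPFusedGather kernel computes, as read from its diff: deduplicate the selected column of slice_input in first-appearance order, emit the unique count and the per-row remapping, and gather the matching rows of data as a [U, 12] block. Function name and sample inputs are again illustrative, not taken from the patch.

import numpy as np

def fused_gather_ref(data, slice_input, begin):
    # Deduplicate column begin[1] of slice_input, keeping first-appearance order.
    col = begin[1]
    unique_values, indices, seen = [], [], {}
    for v in slice_input[:, col].tolist():
        if v not in seen:
            seen[v] = len(unique_values)
            unique_values.append(v)
        indices.append(seen[v])
    # Gather the unique rows of data and view them as [-1, 12], matching the
    # kernel's [U, 12] float output.
    gathered = data[np.array(unique_values)].astype(np.float32).reshape(-1, 12)
    return (np.array([len(unique_values)], dtype=np.int64),
            np.array(indices, dtype=np.int32),
            gathered)

# Illustrative inputs: a 20x12 table, three rows selecting ids 0, 0, 1.
data = np.arange(240, dtype=np.float32).reshape(20, 12)
slice_input = np.array([[0, 0], [0, 0], [1, 1]], dtype=np.int64)
count, idx, rows = fused_gather_ref(data, slice_input, begin=[0, 1])
print(count, idx)   # [2] [0 0 1]
print(rows.shape)   # (2, 12)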