diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 1d1b2c9a9a8d09b65e365c51770f880da7fb9847..c64fd3c8674d49bc6d54ded3f55f81ad8940b44b 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -4110,6 +4110,26 @@ tf_kernel_library(
     deps = MATH_DEPS,
 )
 
+tf_cc_test(
+    name = "embedding_fused_sparse_dynamic_stitch_test",
+    srcs = if_enable_annc([  # gated by if_enable_annc, like the kernel library srcs in this file
+        "embedding_fused_sparse_dynamic_stitch_test.cc",
+    ]),
+    deps = [
+        ":embedding_fused_sparse_dynamic_stitch_op",  # NOTE(review): presumably the kernel under test - confirm
+        ":host_constant_op",
+        ":ops_testutil",
+        ":ops_util",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_kernel_library(
     name = "embedding_fused_reshape_op",
     srcs = if_enable_annc([
@@ -4128,6 +4148,26 @@ tf_kernel_library(
     deps = MATH_DEPS,
 )
 
+tf_cc_test(
+    name = "embedding_fused_sparse_segment_reduce_test",
+    srcs = if_enable_annc([  # gated by if_enable_annc, like the kernel library srcs in this file
+        "embedding_fused_sparse_segment_reduce_test.cc",
+    ]),
+    deps = [
+        ":embedding_fused_sparse_segment_reduce_op",  # NOTE(review): presumably the kernel under test - confirm
+        ":host_constant_op",
+        ":ops_testutil",
+        ":ops_util",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_kernel_library(
     name = "embedding_fused_sparse_segment_reduce_nonzero_op",
     srcs = if_enable_annc([
@@ -4136,6 +4176,26 @@ tf_kernel_library(
     deps = MATH_DEPS,
 )
 
+tf_cc_test(
+    name = "embedding_fused_sparse_segment_reduce_nonzero_test",
+    srcs = if_enable_annc([  # gated by if_enable_annc, like the kernel library srcs in this file
+        "embedding_fused_sparse_segment_reduce_nonzero_test.cc",
+    ]),
+    deps = [
+        ":embedding_fused_sparse_segment_reduce_nonzero_op",  # tf_kernel_library declared in this BUILD file
+        ":host_constant_op",
+        ":ops_testutil",
+        ":ops_util",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_kernel_library(
     name = "embedding_fused_sparse_select_op",
     srcs = if_enable_annc([
diff --git a/tensorflow/core/kernels/embedding_fused_sparse_dynamic_stitch.cc b/tensorflow/core/kernels/embedding_fused_sparse_dynamic_stitch.cc
index c514980dd4bb448fc6109b1b0a5e32daaff366dc..6e12550d2aa082066398b8ec76ac90c1253573b6 100644
--- a/tensorflow/core/kernels/embedding_fused_sparse_dynamic_stitch.cc
+++ b/tensorflow/core/kernels/embedding_fused_sparse_dynamic_stitch.cc
@@ -69,7 +69,7 @@ public:
         const int64_t global_id = x_flat(i);
         const int64_t table_id = global_id % num_partitions;
         const int64_t row_id = global_id / num_partitions;
-        
+
         OP_REQUIRES(context, row_id < variable_rows[table_id], errors::InvalidArgument(
           "row_id out of range."));
 
diff --git a/tensorflow/core/kernels/embedding_fused_sparse_dynamic_stitch_test.cc b/tensorflow/core/kernels/embedding_fused_sparse_dynamic_stitch_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5449453e0d31bfd43ac6540cd95ea3dac848e922
--- /dev/null
+++ b/tensorflow/core/kernels/embedding_fused_sparse_dynamic_stitch_test.cc
@@ -0,0 +1,112 @@
+/* Copyright 2025 The Huawei Technologies Co. Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *     Unless required by applicable law or agreed to in writing, software
+ *     distributed under the License is distributed on an "AS IS" BASIS,
+ *     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *     See the License for the specific language governing permissions and
+ *     limitations under the License.
+ *     ==============================================================================*/
+
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/graph/testlib.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+namespace {
+
+class KPFusedSparseDynamicStitchOpTest : public OpsTestBase {
+ protected:
+  // Creates the op with one int64 input followed by `num_tables` float inputs.
+  void MakeOp(int num_tables) {
+    NodeDefBuilder builder("kp_fused_sparse_dynamic_stitch",
+                           "KPFusedSparseDynamicStitch");
+    builder.Input(FakeInput(DT_INT64)).Input(FakeInput(num_tables, DT_FLOAT));
+    TF_ASSERT_OK(builder.Finalize(node_def()));
+    TF_ASSERT_OK(InitOp());
+  }
+};
+
+// Valid lookup: id % 2 picks the table and id / 2 picks the row inside it.
+TEST_F(KPFusedSparseDynamicStitchOpTest, TestTwoTables) {
+  MakeOp(2);  // two partitions
+  AddInputFromArray<int64>(TensorShape({4}), {0, 3, 2, 1});
+  AddInputFromArray<float>(TensorShape({3, 2}),
+                           {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f});
+  AddInputFromArray<float>(TensorShape({2, 2}), {7.0f, 8.0f, 9.0f, 10.0f});
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor want(allocator(), DT_FLOAT, TensorShape({4, 2}));
+  test::FillValues<float>(&want,
+                          {1.0f, 2.0f, 9.0f, 10.0f, 3.0f, 4.0f, 7.0f, 8.0f});
+  test::ExpectTensorEqual<float>(want, *GetOutput(0));
+}
+
+// The second table has 4 columns instead of 2; the run is expected to fail.
+TEST_F(KPFusedSparseDynamicStitchOpTest, TestDifferentStride) {
+  MakeOp(2);
+  AddInputFromArray<int64>(TensorShape({4}), {0, 3, 2, 1});
+  AddInputFromArray<float>(TensorShape({3, 2}),
+                           {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f});
+  AddInputFromArray<float>(TensorShape({1, 4}), {7.0f, 8.0f, 9.0f, 10.0f});
+
+  const Status status = RunOpKernel();
+  EXPECT_FALSE(status.ok());
+  EXPECT_NE(
+      status.error_message().find("All inputs must have same second dimension"),
+      std::string::npos);
+}
+
+// Id 6 maps to table 0, row 3, which is past that table's 3 rows.
+TEST_F(KPFusedSparseDynamicStitchOpTest, TestIndicesOutOfBounds) {
+  MakeOp(2);
+  AddInputFromArray<int64>(TensorShape({4}), {0, 6, 2, 1});
+  AddInputFromArray<float>(TensorShape({3, 2}),
+                           {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f});
+  AddInputFromArray<float>(TensorShape({2, 2}), {7.0f, 8.0f, 9.0f, 10.0f});
+
+  const Status status = RunOpKernel();
+  EXPECT_FALSE(status.ok());
+  EXPECT_NE(status.error_message().find("row_id out of range"),
+            std::string::npos);
+}
+
+// Both tables are rank 3 here; the test expects the rank check to fail.
+TEST_F(KPFusedSparseDynamicStitchOpTest, TestInputDims) {
+  MakeOp(2);
+  AddInputFromArray<int64>(TensorShape({4}), {0, 6, 2, 1});
+  AddInputFromArray<float>(TensorShape({3, 2, 1}),
+                           {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f});
+  AddInputFromArray<float>(TensorShape({2, 2, 1}), {7.0f, 8.0f, 9.0f, 10.0f});
+
+  const Status status = RunOpKernel();
+  EXPECT_FALSE(status.ok());
+  EXPECT_NE(status.error_message().find("input dims must == 2"),
+            std::string::npos);
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/embedding_fused_sparse_segment_reduce.cc b/tensorflow/core/kernels/embedding_fused_sparse_segment_reduce.cc
index bc6c4084034e21a4873ec89e783f9f27198af48f..33bbd312bea3f388707cb57a23a1c27eb2c2bb4a 100644
--- a/tensorflow/core/kernels/embedding_fused_sparse_segment_reduce.cc
+++ b/tensorflow/core/kernels/embedding_fused_sparse_segment_reduce.cc
@@ -53,8 +53,8 @@ public:
 
     OP_REQUIRES(context, col >= 0 && col < slice_input.dim_size(1), 
                  errors::InvalidArgument("Column index out of range"));
-    OP_REQUIRES(context, num_indices <= slice_input.dim_size(0),
-                errors::InvalidArgument("indices out of range"));
+    OP_REQUIRES(context, num_indices == slice_input.dim_size(0),
+                errors::InvalidArgument("indices and slice_input.dim_zie(0) should have same size"));
 
     auto input_data = input_tensor.matrix<float>().data();
     auto indices_vec = indices.vec<Tidx>();
@@ -90,8 +90,8 @@ public:
       auto counts_vec = counts.flat<int32>();
 
       for (int64 i = 0; i < num_indices; ++i) {
-        const int32 seg_id = slice_input_mat(i, col);
-        const int32 data_row = indices_vec(i);
+        const int64 seg_id = slice_input_mat(i, col);
+        const Tidx data_row = indices_vec(i);
         counts_vec(seg_id) += 1;
 
         float* output_row = output_data + seg_id * embedding_size;
@@ -131,8 +131,8 @@ public:
       }
     } else {
       for (int64 i = 0; i < num_indices; ++i) {
-        const int32 seg_id = slice_input_mat(i, col);
-        const int32 data_row = indices_vec(i);
+        const int64 seg_id = slice_input_mat(i, col);
+        const Tidx data_row = indices_vec(i);
 
         float* output_row = output_data + seg_id * embedding_size;
         const float* input_data_row = input_data + data_row * embedding_size;
diff --git a/tensorflow/core/kernels/embedding_fused_sparse_segment_reduce_nonzero.cc b/tensorflow/core/kernels/embedding_fused_sparse_segment_reduce_nonzero.cc
index 8d68397a8589755ef0c26cbb7d50f6b16295a36c..cd1d341b6edb51c5193c936d94b7b464ac008685 100644
--- a/tensorflow/core/kernels/embedding_fused_sparse_segment_reduce_nonzero.cc
+++ b/tensorflow/core/kernels/embedding_fused_sparse_segment_reduce_nonzero.cc
@@ -51,8 +51,8 @@ public:
     
     OP_REQUIRES(context, col >= 0 && col < slice_input.dim_size(1), 
                  errors::InvalidArgument("Column index out of range"));
-    OP_REQUIRES(context, num_indices <= slice_input.dim_size(0),
-                errors::InvalidArgument("indices out of range"));
+    OP_REQUIRES(context, num_indices == slice_input.dim_size(0),
+                errors::InvalidArgument("indices and slice_input.dim_zie(0) should have same size"));
 
     auto input_data = input_tensor.flat<float>();
     auto indices_vec = indices.vec<Tidx>();
@@ -72,7 +72,7 @@ public:
     Tensor* output_shape = nullptr;
     OP_REQUIRES_OK(
         context, context->allocate_output(0, TensorShape({1}), &output_shape));
-    output_shape->flat<int64>()(0) = batch_size;
+    output_shape->flat<int32>()(0) = static_cast<int32>(batch_size);
 
     std::vector<std::pair<int64, float>> results(batch_size);
     int64 num_nonzero = 0;
@@ -86,8 +86,8 @@ public:
       auto counts_vec = counts.flat<int32>();
 
       for (int64 i = 0; i < num_indices; ++i) {
-        const int32 seg_id = slice_input_mat(i, col);
-        const int32 data_row = indices_vec(i);
+        const int64 seg_id = slice_input_mat(i, col);
+        const Tidx data_row = indices_vec(i);
         counts_vec(seg_id) += 1;
         temp_vec(seg_id) += input_data(data_row);
       }
@@ -104,8 +104,8 @@ public:
       }
     } else {
       for (int64 i = 0; i < num_indices; ++i) {
-        const int32 seg_id = slice_input_mat(i, col);
-        const int32 data_row = indices_vec(i);
+        const int64 seg_id = slice_input_mat(i, col);
+        const Tidx data_row = indices_vec(i);
         temp_vec(seg_id) += input_data(data_row);
       }
   
@@ -120,7 +120,7 @@ public:
     OP_REQUIRES_OK(context,
                    context->allocate_output(1, TensorShape({num_nonzero, 1}),
                                             &output_indices));
-    auto output_indices_data = output_indices->flat<int64>();
+    auto output_indices_data = output_indices->flat<int32>();
 
     Tensor* output_nonzero = nullptr;
     OP_REQUIRES_OK(context,
@@ -128,7 +128,7 @@ public:
                                             &output_nonzero));
     auto output_nonzero_data = output_nonzero->flat<float>();
     for (int64 i = 0; i < num_nonzero; ++i) {
-      output_indices_data(i) = results[i].first;
+      output_indices_data(i) = static_cast<int32>(results[i].first);
       output_nonzero_data(i) = results[i].second;
     }
 
diff --git a/tensorflow/core/kernels/embedding_fused_sparse_segment_reduce_nonzero_test.cc b/tensorflow/core/kernels/embedding_fused_sparse_segment_reduce_nonzero_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9d28a3b1f7186bb999737b7162599cdd4dac6002
--- /dev/null
+++ b/tensorflow/core/kernels/embedding_fused_sparse_segment_reduce_nonzero_test.cc
@@ -0,0 +1,186 @@
+/* Copyright 2025 The Huawei Technologies Co. Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *     Unless required by applicable law or agreed to in writing, software
+ *     distributed under the License is distributed on an "AS IS" BASIS,
+ *     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *     See the License for the specific language governing permissions and
+ *     limitations under the License.
+ *     ==============================================================================*/
+
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/graph/testlib.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+namespace {
+
+class KPFusedSparseSegmentReduceNonzeroOpTest : public OpsTestBase {
+ protected:
+  // Builds the op node; `combiner_mode` is 0 for SUM and 1 for MEAN.
+  void MakeOp(int combiner_mode) {
+    NodeDefBuilder builder("kp_fused_sparse_segment_reduce_nonzero",
+                           "KPFusedSparseSegmentReduceNonzero");
+    builder.Input(FakeInput(DT_FLOAT));  // data
+    builder.Input(FakeInput(DT_INT32));  // indices
+    builder.Input(FakeInput(DT_INT64));  // slice_input
+    builder.Input(FakeInput(DT_INT32));  // begin
+    TF_ASSERT_OK(builder.Attr("combiner", combiner_mode).Finalize(node_def()));
+    TF_ASSERT_OK(InitOp());
+  }
+};
+
+// Expected values assume segment ids {2, 2, 3}, i.e. column 2 of slice_input.
+TEST_F(KPFusedSparseSegmentReduceNonzeroOpTest, TestReduceMean) {
+  MakeOp(1);  // combiner: MEAN
+  AddInputFromArray<float>(TensorShape({8}),
+                           {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f});
+  AddInputFromArray<int32>(TensorShape({3}), {0, 2, 1});
+  AddInputFromArray<int64>(TensorShape({3, 4}),
+                           {1, 2, 2, 2, 1, 1, 2, 3, 2, 2, 3, 4});
+  AddInputFromArray<int32>(TensorShape({2}), {0, 2});
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor want_shape(allocator(), DT_INT32, TensorShape({1}));
+  test::FillValues<int32>(&want_shape, {4});
+  test::ExpectTensorEqual<int32>(want_shape, *GetOutput(0));
+
+  Tensor want_indices(allocator(), DT_INT32, TensorShape({2, 1}));
+  test::FillValues<int32>(&want_indices, {2, 3});
+  test::ExpectTensorEqual<int32>(want_indices, *GetOutput(1));
+
+  Tensor want_values(allocator(), DT_FLOAT, TensorShape({2}));
+  test::FillValues<float>(&want_values, {2, 2});
+  test::ExpectTensorEqual<float>(want_values, *GetOutput(2));
+}
+
+// Expected values assume segment ids {2, 2, 3}, i.e. column 2 of slice_input.
+TEST_F(KPFusedSparseSegmentReduceNonzeroOpTest, TestReduceSum) {
+  MakeOp(0);  // combiner: SUM
+  AddInputFromArray<float>(TensorShape({8}),
+                           {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f});
+  AddInputFromArray<int32>(TensorShape({3}), {0, 2, 1});
+  AddInputFromArray<int64>(TensorShape({3, 4}),
+                           {1, 2, 2, 2, 1, 1, 2, 3, 2, 2, 3, 4});
+  AddInputFromArray<int32>(TensorShape({2}), {0, 2});
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor want_shape(allocator(), DT_INT32, TensorShape({1}));
+  test::FillValues<int32>(&want_shape, {4});
+  test::ExpectTensorEqual<int32>(want_shape, *GetOutput(0));
+
+  Tensor want_indices(allocator(), DT_INT32, TensorShape({2, 1}));
+  test::FillValues<int32>(&want_indices, {2, 3});
+  test::ExpectTensorEqual<int32>(want_indices, *GetOutput(1));
+
+  Tensor want_values(allocator(), DT_FLOAT, TensorShape({2}));
+  test::FillValues<float>(&want_values, {4, 2});
+  test::ExpectTensorEqual<float>(want_values, *GetOutput(2));
+}
+
+// The data input is a 4x2 matrix here; the test expects a vector to be required.
+TEST_F(KPFusedSparseSegmentReduceNonzeroOpTest, TestInvalidData) {
+  MakeOp(0);
+  AddInputFromArray<float>(TensorShape({4, 2}),
+                           {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f});
+  AddInputFromArray<int32>(TensorShape({3}), {0, 2, 1});
+  AddInputFromArray<int64>(TensorShape({3, 4}),
+                           {1, 2, 2, 2, 1, 1, 2, 3, 2, 2, 3, 4});
+  AddInputFromArray<int32>(TensorShape({2}), {0, 2});
+
+  const Status status = RunOpKernel();
+  EXPECT_FALSE(status.ok());
+  EXPECT_NE(status.error_message().find("Input data must be a vector"),
+            std::string::npos);
+}
+
+// slice_input is rank 3 here; the test expects only 2-D input to be accepted.
+TEST_F(KPFusedSparseSegmentReduceNonzeroOpTest, TestInvalidSliceinput) {
+  MakeOp(0);
+  AddInputFromArray<float>(TensorShape({8}),
+                           {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f});
+  AddInputFromArray<int32>(TensorShape({3}), {0, 2, 1});
+  AddInputFromArray<int64>(TensorShape({3, 4, 1}),
+                           {1, 2, 2, 2, 1, 1, 2, 3, 2, 2, 3, 4});
+  AddInputFromArray<int32>(TensorShape({2}), {0, 2});
+
+  const Status status = RunOpKernel();
+  EXPECT_FALSE(status.ok());
+  EXPECT_NE(status.error_message().find("slice input must be 2-D"),
+            std::string::npos);
+}
+
+// begin carries three elements; the test expects exactly two to be required.
+TEST_F(KPFusedSparseSegmentReduceNonzeroOpTest, TestInvalidBegin) {
+  MakeOp(0);
+  AddInputFromArray<float>(TensorShape({8}),
+                           {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f});
+  AddInputFromArray<int32>(TensorShape({3}), {0, 2, 1});
+  AddInputFromArray<int64>(TensorShape({3, 4}),
+                           {1, 2, 2, 2, 1, 1, 2, 3, 2, 2, 3, 4});
+  AddInputFromArray<int32>(TensorShape({3}), {0, 2, 1});
+
+  const Status status = RunOpKernel();
+  EXPECT_FALSE(status.ok());
+  EXPECT_NE(status.error_message().find("begin must have 2 elements"),
+            std::string::npos);
+}
+
+// begin's second element (4) equals slice_input's column count, so it is out
+TEST_F(KPFusedSparseSegmentReduceNonzeroOpTest, TestColsOutOfBounds) {
+  MakeOp(0);  // of the valid column range [0, 4).
+  AddInputFromArray<float>(TensorShape({8}),
+                           {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f});
+  AddInputFromArray<int32>(TensorShape({3}), {0, 2, 1});
+  AddInputFromArray<int64>(TensorShape({3, 4}),
+                           {1, 2, 2, 2, 1, 1, 2, 3, 2, 2, 3, 4});
+  AddInputFromArray<int32>(TensorShape({2}), {0, 4});
+
+  const Status status = RunOpKernel();
+  EXPECT_FALSE(status.ok());
+  EXPECT_NE(status.error_message().find("Column index out of range"),
+            std::string::npos);
+}
+
+// Two indices against three slice_input rows: the kernel requires equal sizes.
+TEST_F(KPFusedSparseSegmentReduceNonzeroOpTest, TestIndicesSizeMismatch) {
+  MakeOp(0);
+  AddInputFromArray<float>(TensorShape({8}),
+                           {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f});
+  AddInputFromArray<int32>(TensorShape({2}), {0, 2});
+  AddInputFromArray<int64>(TensorShape({3, 4}),
+                           {1, 2, 2, 2, 1, 1, 2, 3, 2, 2, 3, 4});
+  AddInputFromArray<int32>(TensorShape({2}), {0, 1});
+  const Status status = RunOpKernel();
+  EXPECT_FALSE(status.ok());
+  // "dim_zie" is the kernel's own spelling; keep this in sync with its text.
+  EXPECT_NE(status.error_message().find(
+                "indices and slice_input.dim_zie(0) should have same size"),
+            std::string::npos);
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/embedding_fused_sparse_segment_reduce_test.cc b/tensorflow/core/kernels/embedding_fused_sparse_segment_reduce_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..95ef344dbf4162fb7baf45de82a3de6bac8cc765
--- /dev/null
+++ b/tensorflow/core/kernels/embedding_fused_sparse_segment_reduce_test.cc
@@ -0,0 +1,210 @@
+/* Copyright 2025 The Huawei Technologies Co. Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *     Unless required by applicable law or agreed to in writing, software
+ *     distributed under the License is distributed on an "AS IS" BASIS,
+ *     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *     See the License for the specific language governing permissions and
+ *     limitations under the License.
+ *     ==============================================================================*/
+
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/graph/testlib.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+namespace {
+
+class KPFusedSparseSegmentReduceOpTest : public OpsTestBase {
+ protected:
+  // Builds the op node; the tests pass 0 for SUM and 1 for MEAN.
+  void MakeOp(int combiner_mode) {
+    NodeDefBuilder builder("kp_fused_sparse_segment_reduce",
+                           "KPFusedSparseSegmentReduce");
+    builder.Input(FakeInput(DT_FLOAT));  // data
+    builder.Input(FakeInput(DT_INT32));  // indices
+    builder.Input(FakeInput(DT_INT64));  // slice_input
+    builder.Input(FakeInput(DT_INT32));  // begin
+    builder.Input(FakeInput(DT_INT32));  // begin_1
+    TF_ASSERT_OK(builder.Attr("combiner", combiner_mode).Finalize(node_def()));
+    TF_ASSERT_OK(InitOp());
+  }
+};
+
+// Expected values assume segment ids {2, 2, 3} (column 2 of slice_input);
+// output rows 0 and 1 receive no data rows and are expected to be zero.
+TEST_F(KPFusedSparseSegmentReduceOpTest, TestReduceMean) {
+  MakeOp(1);  // combiner: MEAN
+  AddInputFromArray<float>(TensorShape({4, 2}),
+                           {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f});
+  AddInputFromArray<int32>(TensorShape({3}), {0, 2, 1});
+  AddInputFromArray<int64>(TensorShape({3, 4}),
+                           {1, 2, 2, 2, 1, 1, 2, 3, 2, 2, 3, 4});
+  AddInputFromArray<int32>(TensorShape({2}), {0, 2});
+  AddInputFromArray<int32>(TensorShape({1}), {1});
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor want_rows(allocator(), DT_FLOAT, TensorShape({4, 2}));
+  test::FillValues<float>(&want_rows,
+                          {0.0f, 0.0f, 0.0f, 0.0f, 3.0f, 4.0f, 3.0f, 4.0f});
+  test::ExpectTensorEqual<float>(want_rows, *GetOutput(0));
+
+  Tensor want_scalar(allocator(), DT_INT32, TensorShape({}));
+  test::FillValues<int32>(&want_scalar, {2});
+  test::ExpectTensorEqual<int32>(want_scalar, *GetOutput(1));
+}
+
+// Expected values assume segment ids {2, 2, 3} (column 2 of slice_input);
+// output rows 0 and 1 receive no data rows and are expected to be zero.
+TEST_F(KPFusedSparseSegmentReduceOpTest, TestReduceSum) {
+  MakeOp(0);  // combiner: SUM
+  AddInputFromArray<float>(TensorShape({4, 2}),
+                           {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f});
+  AddInputFromArray<int32>(TensorShape({3}), {0, 2, 1});
+  AddInputFromArray<int64>(TensorShape({3, 4}),
+                           {1, 2, 2, 2, 1, 1, 2, 3, 2, 2, 3, 4});
+  AddInputFromArray<int32>(TensorShape({2}), {0, 2});
+  AddInputFromArray<int32>(TensorShape({1}), {0});
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor want_rows(allocator(), DT_FLOAT, TensorShape({4, 2}));
+  test::FillValues<float>(&want_rows,
+                          {0.0f, 0.0f, 0.0f, 0.0f, 6.0f, 8.0f, 3.0f, 4.0f});
+  test::ExpectTensorEqual<float>(want_rows, *GetOutput(0));
+
+  Tensor want_scalar(allocator(), DT_INT32, TensorShape({}));
+  test::FillValues<int32>(&want_scalar, {4});
+  test::ExpectTensorEqual<int32>(want_scalar, *GetOutput(1));
+}
+
+// begin's second element (5) is past slice_input's four columns.
+TEST_F(KPFusedSparseSegmentReduceOpTest, TestColsOutOfBounds) {
+  MakeOp(0);
+  AddInputFromArray<float>(TensorShape({4, 2}),
+                           {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f});
+  AddInputFromArray<int32>(TensorShape({3}), {0, 2, 1});
+  AddInputFromArray<int64>(TensorShape({3, 4}),
+                           {1, 2, 2, 2, 1, 1, 2, 3, 2, 2, 3, 4});
+  AddInputFromArray<int32>(TensorShape({2}), {0, 5});
+  AddInputFromArray<int32>(TensorShape({1}), {0});
+
+  const Status status = RunOpKernel();
+  EXPECT_FALSE(status.ok());
+  EXPECT_NE(status.error_message().find("Column index out of range"),
+            std::string::npos);
+}
+
+// Two indices against three slice_input rows: the kernel requires equal sizes.
+TEST_F(KPFusedSparseSegmentReduceOpTest, TestIndicesSizeMismatch) {
+  MakeOp(0);
+  AddInputFromArray<float>(TensorShape({4, 2}),
+                           {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f});
+  AddInputFromArray<int32>(TensorShape({2}), {0, 2});
+  AddInputFromArray<int64>(TensorShape({3, 4}),
+                           {1, 2, 2, 2, 1, 1, 2, 3, 2, 2, 3, 4});
+  AddInputFromArray<int32>(TensorShape({2}), {0, 2});
+  AddInputFromArray<int32>(TensorShape({1}), {0});
+
+  const Status status = RunOpKernel();
+  EXPECT_FALSE(status.ok());
+  // "dim_zie" is the kernel's own spelling; keep this in sync with its text.
+  EXPECT_NE(status.error_message().find(
+                "indices and slice_input.dim_zie(0) should have same size"),
+            std::string::npos);
+}
+
+// data is rank 3 here; the test expects only a 2-D matrix to be accepted.
+TEST_F(KPFusedSparseSegmentReduceOpTest, TestInvalidData) {
+  MakeOp(0);
+  AddInputFromArray<float>(TensorShape({4, 2, 1}),
+                           {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f});
+  AddInputFromArray<int32>(TensorShape({3}), {0, 2, 1});
+  AddInputFromArray<int64>(TensorShape({3, 4}),
+                           {1, 2, 2, 2, 1, 1, 2, 3, 2, 2, 3, 4});
+  AddInputFromArray<int32>(TensorShape({2}), {0, 2});
+  AddInputFromArray<int32>(TensorShape({1}), {0});
+
+  const Status status = RunOpKernel();
+  EXPECT_FALSE(status.ok());
+  EXPECT_NE(status.error_message().find("input must be 2-D"),
+            std::string::npos);
+}
+
+// slice_input is rank 3 here; the test expects the op to reject anything
+// that is not a 2-D tensor.
+TEST_F(KPFusedSparseSegmentReduceOpTest, TestInvalidSliceinput) {
+  MakeOp(0);
+  AddInputFromArray<float>(TensorShape({4, 2}),
+                           {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f});
+  AddInputFromArray<int32>(TensorShape({3}), {0, 2, 1});
+  AddInputFromArray<int64>(TensorShape({3, 4, 1}),
+                           {1, 2, 2, 2, 1, 1, 2, 3, 2, 2, 3, 4});
+  AddInputFromArray<int32>(TensorShape({2}), {0, 2});
+  AddInputFromArray<int32>(TensorShape({1}), {0});
+
+  const Status status = RunOpKernel();
+  EXPECT_FALSE(status.ok());
+  EXPECT_NE(status.error_message().find("slice input must be 2-D"),
+            std::string::npos);
+}
+
+// begin carries three elements here; the test expects the op to require
+// exactly two.
+TEST_F(KPFusedSparseSegmentReduceOpTest, TestInvalidBegin) {
+  MakeOp(0);
+  AddInputFromArray<float>(TensorShape({4, 2}),
+                           {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f});
+  AddInputFromArray<int32>(TensorShape({3}), {0, 2, 1});
+  AddInputFromArray<int64>(TensorShape({3, 4}),
+                           {1, 2, 2, 2, 1, 1, 2, 3, 2, 2, 3, 4});
+  AddInputFromArray<int32>(TensorShape({3}), {0, 2, 1});
+  AddInputFromArray<int32>(TensorShape({1}), {0});
+
+  const Status status = RunOpKernel();
+  EXPECT_FALSE(status.ok());
+  EXPECT_NE(status.error_message().find("begin must have 2 elements"),
+            std::string::npos);
+}
+
+// begin_1 carries two elements here; the test expects exactly one.
+TEST_F(KPFusedSparseSegmentReduceOpTest, TestInvalidBegin1) {
+  MakeOp(0);
+  AddInputFromArray<float>(TensorShape({4, 2}),
+                           {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f});
+  AddInputFromArray<int32>(TensorShape({3}), {0, 2, 1});
+  AddInputFromArray<int64>(TensorShape({3, 4}),
+                           {1, 2, 2, 2, 1, 1, 2, 3, 2, 2, 3, 4});
+  AddInputFromArray<int32>(TensorShape({2}), {0, 2});
+  AddInputFromArray<int32>(TensorShape({2}), {0, 1});
+
+  const Status status = RunOpKernel();
+  EXPECT_FALSE(status.ok());
+  EXPECT_NE(status.error_message().find("begin_1 must have 1 element"),
+            std::string::npos);
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/embedding_fused_ops.cc b/tensorflow/core/ops/embedding_fused_ops.cc
index 326007b8dd878e893e3f083e7087ea18cef72c07..3cadfb150ad57c1a3df9c9a2f96614e294f88404 100644
--- a/tensorflow/core/ops/embedding_fused_ops.cc
+++ b/tensorflow/core/ops/embedding_fused_ops.cc
@@ -45,8 +45,8 @@ REGISTER_OP("KPFusedSparseSegmentReduceNonzero")
     .Input("begin: int32")
     .Attr("combiner: int = 1")  // 0 for SUM, 1 for MEAN
     .Attr("Tidx: {int32, int64} = DT_INT32")
-    .Output("output_shape: int64")
-    .Output("output_indices: int64")
+    .Output("output_shape: int32")
+    .Output("output_indices: int32")
     .Output("output_nonzero: float")
     .SetShapeFn(shape_inference::UnknownShape);