diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 13d3a7641ad2498d173ed7d1bc6ffe03c79c50bc..aa2965c9e7e6c89579118fa27486b2c0346e5a17 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -4157,6 +4157,26 @@ cc_library(
     ]),
 )
 
+tf_cuda_cc_test(
+    name = "embedding_fused_padding_test",
+    size = "small",
+    srcs = if_enable_annc([  # NOTE(review): presumably empty unless ANNC is enabled -- confirm the target still builds then.
+        "embedding_fused_padding_test.cc",
+    ]),
+    deps = if_enable_annc([  # Gated by the same macro as srcs.
+        ":embedding_fused_padding_op",  # NOTE(review): assumed to provide the kernels under test -- confirm.
+        ":host_constant_op",
+        ":ops_testutil",
+        ":ops_util",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ]),
+)
 
 tf_kernel_library(
     name = "matmul_op",
diff --git a/tensorflow/core/kernels/embedding_fused_padding.cc b/tensorflow/core/kernels/embedding_fused_padding.cc
index 30208749b4865c80fd5189df1de9729ebf8c72c3..a0f14b2eb86bda358c82b360e15da8c7fedcbced 100644
--- a/tensorflow/core/kernels/embedding_fused_padding.cc
+++ b/tensorflow/core/kernels/embedding_fused_padding.cc
@@ -72,16 +72,17 @@ public:
     int output_cols = input.dim_size(1);
     OP_REQUIRES(context,
                 padding_rows >= 0,
-                errors::InvalidArgument("padding_rows must >= 0"));
+                errors::InvalidArgument("Pooling size(", input_rows_value,
+                ") is greater than Input size(", static_cast<int32>(origin_shape.flat<int64>()(0)), ")"));
     OP_REQUIRES(context,
                 reshape_cols > 0,
                 errors::InvalidArgument("reshape_cols must > 0"));
     OP_REQUIRES(context,
                 reshape_sizes.flat<int32>()(0) == -1,
-                errors::InvalidArgument("reshape first dim must be -1"));
+                errors::InvalidArgument("reshape[0] is not -1"));
     OP_REQUIRES(context,
                 pack.scalar<int32>()() == output_cols,
-                errors::InvalidArgument("pack must be output cols"));
+                errors::InvalidArgument("pack(", pack.scalar<int32>()(), ") is not equal to embedding dims"));
 
     Tensor* output0 = nullptr;
     Tensor* output1 = nullptr;
diff --git a/tensorflow/core/kernels/embedding_fused_padding_test.cc b/tensorflow/core/kernels/embedding_fused_padding_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5137d51300ad862d7ffabd425bf549f07a172ae3
--- /dev/null
+++ b/tensorflow/core/kernels/embedding_fused_padding_test.cc
@@ -0,0 +1,307 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/graph/testlib.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+
+// Fixture for the KPFusedEmbeddingPadding and KPFusedEmbeddingPaddingFast kernels.
+class KPFusedEmbeddingPaddingTest : public OpsTestBase {
+ protected:
+  // MakeOp / MakeFastOp build and initialize the regular / "Fast" op for the
+  // given input dtypes; a construction error is a fatal test failure.
+  void MakeOp(DataType input_shape_type, DataType pooling_type, DataType reshape_type, DataType const_type) {
+    TF_ASSERT_OK(BuildOp(/*fast=*/false, input_shape_type, pooling_type, reshape_type, const_type));
+  }
+  void MakeFastOp(DataType input_shape_type, DataType pooling_type, DataType reshape_type, DataType const_type) {
+    TF_ASSERT_OK(BuildOp(/*fast=*/true, input_shape_type, pooling_type, reshape_type, const_type));
+  }
+
+  // FeedAndRun / FeedAndRunFast build the regular / "Fast" op, feed the five
+  // inputs derived from the arguments, and return the kernel's run status.
+  Status FeedAndRun(const int embedding_dims, const int table_size,
+                    const int pooling_size, const int reshape_size) {
+    return BuildFeedAndRun(/*fast=*/false, embedding_dims, table_size, pooling_size, reshape_size);
+  }
+  Status FeedAndRunFast(const int embedding_dims, const int table_size,
+                        const int pooling_size, const int reshape_size) {
+    return BuildFeedAndRun(/*fast=*/true, embedding_dims, table_size, pooling_size, reshape_size);
+  }
+
+ private:
+  // Both variants take the same five inputs and differ only in node/op name.
+  Status BuildOp(bool fast, DataType input_shape_type, DataType pooling_type,
+                 DataType reshape_type, DataType const_type) {
+    TF_RETURN_IF_ERROR(NodeDefBuilder(fast ? "fused_padding_fast" : "fused_padding",
+                                      fast ? "KPFusedEmbeddingPaddingFast" : "KPFusedEmbeddingPadding")
+                           .Input(FakeInput(input_shape_type))
+                           .Input(FakeInput(pooling_type))
+                           .Input(FakeInput(const_type))
+                           .Input(FakeInput(reshape_type))
+                           .Input(FakeInput(const_type))
+                           .Finalize(node_def()));
+    return InitOp();
+  }
+
+  // Returns the build error rather than feeding inputs to an uninitialized op.
+  Status BuildFeedAndRun(bool fast, int embedding_dims, int table_size, int pooling_size, int reshape_size) {
+    TF_RETURN_IF_ERROR(BuildOp(fast, DT_INT64, DT_FLOAT, DT_INT32, DT_INT32));
+    AddInputFromArray<int64>(TensorShape({2}), {table_size, embedding_dims});
+    AddInput<float>(TensorShape({pooling_size, embedding_dims}),
+                    [](int i) -> float { return static_cast<float>(i + 1); });
+    AddInputFromArray<int32>(TensorShape({}), {pooling_size});
+    AddInputFromArray<int32>(TensorShape({2}), {-1, reshape_size});
+    AddInputFromArray<int32>(TensorShape({}), {embedding_dims});
+    return RunOpKernel();
+  }
+};
+
+TEST_F(KPFusedEmbeddingPaddingTest, FusedPaddingWithEmbeddingDims10_0) {
+  // table_size == pooling_size, so every expected element is non-zero.
+  const int embedding_dims = 10;
+  const int table_size = 151;
+  const int pooling_size = 151;
+  const int reshape_size = 1510;
+  TF_ASSERT_OK(FeedAndRun(embedding_dims, table_size, pooling_size, reshape_size));
+
+  // Output 0: int32 scalar holding table_size - pooling_size.
+  Tensor want_row_gap(allocator(), DT_INT32, TensorShape({}));
+  test::FillValues<int32>(&want_row_gap, {table_size - pooling_size});
+  test::ExpectTensorEqual<int32>(want_row_gap, *GetOutput(0));
+
+  // Output 1: element i is i + 1 for the fed elements and 0 for the rest.
+  const int fed_elements = pooling_size * embedding_dims;
+  const int want_rows = table_size * embedding_dims / reshape_size;
+  Tensor want_padded(allocator(), DT_FLOAT, TensorShape({want_rows, reshape_size}));
+  test::FillFn<float>(&want_padded, [=](int i) -> float {
+    return i < fed_elements ? static_cast<float>(i + 1) : 0.0f;
+  });
+  test::ExpectTensorNear<float>(want_padded, *GetOutput(1), 1e-5);
+}
+
+TEST_F(KPFusedEmbeddingPaddingTest, FusedPaddingWithEmbeddingDims10_1) {
+  // table_size is ten times pooling_size, so most expected elements are 0.
+  const int embedding_dims = 10;
+  const int table_size = 1510;
+  const int pooling_size = 151;
+  const int reshape_size = 1510;
+  TF_ASSERT_OK(FeedAndRun(embedding_dims, table_size, pooling_size, reshape_size));
+
+  // Output 0: int32 scalar holding table_size - pooling_size.
+  Tensor want_row_gap(allocator(), DT_INT32, TensorShape({}));
+  test::FillValues<int32>(&want_row_gap, {table_size - pooling_size});
+  test::ExpectTensorEqual<int32>(want_row_gap, *GetOutput(0));
+
+  // Output 1: element i is i + 1 for the fed elements and 0 for the rest.
+  const int fed_elements = pooling_size * embedding_dims;
+  const int want_rows = table_size * embedding_dims / reshape_size;
+  Tensor want_padded(allocator(), DT_FLOAT, TensorShape({want_rows, reshape_size}));
+  test::FillFn<float>(&want_padded, [=](int i) -> float {
+    return i < fed_elements ? static_cast<float>(i + 1) : 0.0f;
+  });
+  test::ExpectTensorNear<float>(want_padded, *GetOutput(1), 1e-5);
+}
+
+TEST_F(KPFusedEmbeddingPaddingTest, FusedPaddingWithEmbeddingDims12_0) {
+  // table_size == pooling_size, so every expected element is non-zero.
+  const int embedding_dims = 12;
+  const int table_size = 2;
+  const int pooling_size = 2;
+  const int reshape_size = 24;
+  TF_ASSERT_OK(FeedAndRun(embedding_dims, table_size, pooling_size, reshape_size));
+
+  // Output 0: int32 scalar holding table_size - pooling_size.
+  Tensor want_row_gap(allocator(), DT_INT32, TensorShape({}));
+  test::FillValues<int32>(&want_row_gap, {table_size - pooling_size});
+  test::ExpectTensorEqual<int32>(want_row_gap, *GetOutput(0));
+
+  // Output 1: element i is i + 1 for the fed elements and 0 for the rest.
+  const int fed_elements = pooling_size * embedding_dims;
+  const int want_rows = table_size * embedding_dims / reshape_size;
+  Tensor want_padded(allocator(), DT_FLOAT, TensorShape({want_rows, reshape_size}));
+  test::FillFn<float>(&want_padded, [=](int i) -> float {
+    return i < fed_elements ? static_cast<float>(i + 1) : 0.0f;
+  });
+  test::ExpectTensorNear<float>(want_padded, *GetOutput(1), 1e-5);
+}
+
+TEST_F(KPFusedEmbeddingPaddingTest, FusedPaddingWithEmbeddingDims12_1) {
+  // table_size is 100 times pooling_size, so most expected elements are 0.
+  const int embedding_dims = 12;
+  const int table_size = 200;
+  const int pooling_size = 2;
+  const int reshape_size = 24;
+  TF_ASSERT_OK(FeedAndRun(embedding_dims, table_size, pooling_size, reshape_size));
+
+  // Output 0: int32 scalar holding table_size - pooling_size.
+  Tensor want_row_gap(allocator(), DT_INT32, TensorShape({}));
+  test::FillValues<int32>(&want_row_gap, {table_size - pooling_size});
+  test::ExpectTensorEqual<int32>(want_row_gap, *GetOutput(0));
+
+  // Output 1: element i is i + 1 for the fed elements and 0 for the rest.
+  const int fed_elements = pooling_size * embedding_dims;
+  const int want_rows = table_size * embedding_dims / reshape_size;
+  Tensor want_padded(allocator(), DT_FLOAT, TensorShape({want_rows, reshape_size}));
+  test::FillFn<float>(&want_padded, [=](int i) -> float {
+    return i < fed_elements ? static_cast<float>(i + 1) : 0.0f;
+  });
+  test::ExpectTensorNear<float>(want_padded, *GetOutput(1), 1e-5);
+}
+
+TEST_F(KPFusedEmbeddingPaddingTest, FusedPaddingFastWithEmbeddingDims10_0) {
+  // Fast variant with table_size == pooling_size.
+  const int embedding_dims = 10;
+  const int table_size = 151;
+  const int pooling_size = 151;
+  const int reshape_size = 1510;
+  TF_ASSERT_OK(FeedAndRunFast(embedding_dims, table_size, pooling_size, reshape_size));
+
+  // Both outputs are int32 scalars computed from the sizes above.
+  Tensor want_row_gap(allocator(), DT_INT32, TensorShape({}));
+  test::FillValues<int32>(&want_row_gap, {table_size - pooling_size});
+  test::ExpectTensorEqual<int32>(want_row_gap, *GetOutput(0));
+  Tensor want_row_count(allocator(), DT_INT32, TensorShape({}));
+  test::FillValues<int32>(&want_row_count, {table_size * embedding_dims / reshape_size});
+  test::ExpectTensorEqual<int32>(want_row_count, *GetOutput(1));
+}
+
+TEST_F(KPFusedEmbeddingPaddingTest, FusedPaddingFastWithEmbeddingDims10_1) {
+  // Fast variant with table_size ten times pooling_size.
+  const int embedding_dims = 10;
+  const int table_size = 1510;
+  const int pooling_size = 151;
+  const int reshape_size = 1510;
+  TF_ASSERT_OK(FeedAndRunFast(embedding_dims, table_size, pooling_size, reshape_size));
+
+  // Both outputs are int32 scalars computed from the sizes above.
+  Tensor want_row_gap(allocator(), DT_INT32, TensorShape({}));
+  test::FillValues<int32>(&want_row_gap, {table_size - pooling_size});
+  test::ExpectTensorEqual<int32>(want_row_gap, *GetOutput(0));
+  Tensor want_row_count(allocator(), DT_INT32, TensorShape({}));
+  test::FillValues<int32>(&want_row_count, {table_size * embedding_dims / reshape_size});
+  test::ExpectTensorEqual<int32>(want_row_count, *GetOutput(1));
+}
+
+TEST_F(KPFusedEmbeddingPaddingTest, FusedPaddingFastWithEmbeddingDims12_0) {
+  // Fast variant with table_size == pooling_size.
+  const int embedding_dims = 12;
+  const int table_size = 2;
+  const int pooling_size = 2;
+  const int reshape_size = 24;
+  TF_ASSERT_OK(FeedAndRunFast(embedding_dims, table_size, pooling_size, reshape_size));
+
+  // Both outputs are int32 scalars computed from the sizes above.
+  Tensor want_row_gap(allocator(), DT_INT32, TensorShape({}));
+  test::FillValues<int32>(&want_row_gap, {table_size - pooling_size});
+  test::ExpectTensorEqual<int32>(want_row_gap, *GetOutput(0));
+  Tensor want_row_count(allocator(), DT_INT32, TensorShape({}));
+  test::FillValues<int32>(&want_row_count, {table_size * embedding_dims / reshape_size});
+  test::ExpectTensorEqual<int32>(want_row_count, *GetOutput(1));
+}
+
+TEST_F(KPFusedEmbeddingPaddingTest, FusedPaddingFastWithEmbeddingDims12_1) {
+  // Fast variant with table_size 100 times pooling_size.
+  const int embedding_dims = 12;
+  const int table_size = 200;
+  const int pooling_size = 2;
+  const int reshape_size = 24;
+  TF_ASSERT_OK(FeedAndRunFast(embedding_dims, table_size, pooling_size, reshape_size));
+
+  // Both outputs are int32 scalars computed from the sizes above.
+  Tensor want_row_gap(allocator(), DT_INT32, TensorShape({}));
+  test::FillValues<int32>(&want_row_gap, {table_size - pooling_size});
+  test::ExpectTensorEqual<int32>(want_row_gap, *GetOutput(0));
+  Tensor want_row_count(allocator(), DT_INT32, TensorShape({}));
+  test::FillValues<int32>(&want_row_count, {table_size * embedding_dims / reshape_size});
+  test::ExpectTensorEqual<int32>(want_row_count, *GetOutput(1));
+}
+
+TEST_F(KPFusedEmbeddingPaddingTest, FusedPaddingWithUnexpectReshape) {
+  // The reshape input starts with 10 rather than -1.
+  const int embedding_dims = 12;
+  const int table_size = 200;
+  const int pooling_size = 2;
+  const int reshape_size = 24;
+  MakeOp(DT_INT64, DT_FLOAT, DT_INT32, DT_INT32);
+  AddInputFromArray<int64>(TensorShape({2}), {table_size, embedding_dims});
+  AddInput<float>(TensorShape({pooling_size, embedding_dims}),
+                  [](int i) -> float { return static_cast<float>(i + 1); });
+  AddInputFromArray<int32>(TensorShape({}), {pooling_size});
+  AddInputFromArray<int32>(TensorShape({2}), {10, reshape_size});
+  AddInputFromArray<int32>(TensorShape({}), {embedding_dims});
+
+  const Status run_status = RunOpKernel();
+  EXPECT_TRUE(absl::StrContains(run_status.ToString(),
+                                "reshape[0] is not -1"))
+      << run_status;
+}
+
+TEST_F(KPFusedEmbeddingPaddingTest, FusedPaddingWithUnexpectPack) {
+  // The last (pack) input is 10 while embedding_dims is 12.
+  const int embedding_dims = 12;
+  const int table_size = 200;
+  const int pooling_size = 2;
+  const int reshape_size = 24;
+  MakeOp(DT_INT64, DT_FLOAT, DT_INT32, DT_INT32);
+  AddInputFromArray<int64>(TensorShape({2}), {table_size, embedding_dims});
+  AddInput<float>(TensorShape({pooling_size, embedding_dims}),
+                  [](int i) -> float { return static_cast<float>(i + 1); });
+  AddInputFromArray<int32>(TensorShape({}), {pooling_size});
+  AddInputFromArray<int32>(TensorShape({2}), {-1, reshape_size});
+  AddInputFromArray<int32>(TensorShape({}), {10});
+
+  const Status run_status = RunOpKernel();
+  EXPECT_TRUE(absl::StrContains(run_status.ToString(),
+                                "pack(10) is not equal to embedding dims"))
+      << run_status;
+}
+
+TEST_F(KPFusedEmbeddingPaddingTest, FusedPaddingWithPoolingSizeGreaterInput) {
+  // pooling_size (201) exceeds table_size (200).
+  const int embedding_dims = 12;
+  const int table_size = 200;
+  const int pooling_size = 201;
+  const int reshape_size = 24;
+  MakeOp(DT_INT64, DT_FLOAT, DT_INT32, DT_INT32);
+  AddInputFromArray<int64>(TensorShape({2}), {table_size, embedding_dims});
+  AddInput<float>(TensorShape({pooling_size, embedding_dims}),
+                  [](int i) -> float { return static_cast<float>(i + 1); });
+  AddInputFromArray<int32>(TensorShape({}), {pooling_size});
+  AddInputFromArray<int32>(TensorShape({2}), {-1, reshape_size});
+  AddInputFromArray<int32>(TensorShape({}), {embedding_dims});
+
+  const Status run_status = RunOpKernel();
+  EXPECT_TRUE(absl::StrContains(run_status.ToString(),
+                                "Pooling size(201) is greater than Input size(200)"))
+      << run_status;
+}
+
+}  // end namespace tensorflow
diff --git a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_padding_test.py b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_padding_test.py
index 8d051020daedf6568a75ab7aab7d8a1d8228142e..15e69f3f20bb8275f0f811ecc10a52e50c4503c9 100644
--- a/tensorflow/python/grappler/embedding_fused_test/fused_embedding_padding_test.py
+++ b/tensorflow/python/grappler/embedding_fused_test/fused_embedding_padding_test.py
@@ -68,20 +68,20 @@ class TestFusedEmbeddingPadding(unittest.TestCase):
     def tearDownClass(cls):
         return
     
-    def _run_kp_padding_test(self, input1_shape, input3_shape, num_runs=500):
+    def _run_kp_padding_test(self, input_shape, pooling_shape, reshape, num_runs=500):
         with tf.Graph().as_default():
             input0 = tf.compat.v1.placeholder(tf.int64, shape=(2,), name="input0")
-            input1 = tf.compat.v1.placeholder(tf.float32, shape=input1_shape, name="input1")
+            input1 = tf.compat.v1.placeholder(tf.float32, shape=pooling_shape, name="input1")
             input2 = tf.compat.v1.placeholder(tf.int32, shape=(), name="input2")
             input3 = tf.compat.v1.placeholder(tf.int32, shape=(2,), name="input3")
-            pack = tf.compat.v1.placeholder(tf.int32, shape=(), name="input3")
+            pack = tf.compat.v1.placeholder(tf.int32, shape=(), name="pack")
             """Initialize test data"""
             feed = {
-                input0: np.array([6, input1_shape[1]]).astype(np.int64),
-                input1: np.random.rand(*input1_shape).astype(np.float),
-                input2: input1_shape[0],
-                input3: np.array(input3_shape).astype(np.int32),
-                pack: input1_shape[1],
+                input0: np.array(input_shape).astype(np.int64),
+                input1: np.random.rand(*pooling_shape).astype(np.float),
+                input2: pooling_shape[0],
+                input3: np.array(reshape).astype(np.int32),
+                pack: pooling_shape[1],
             }
             with tf.name_scope("ori"):
                 out_ori = ori_padding_graph(input0, input1, input2, input3, pack)
@@ -120,20 +120,20 @@ class TestFusedEmbeddingPadding(unittest.TestCase):
                 )
 
 
-    def _run_kp_padding_fast_test(self, input1_shape, input3_shape, num_runs=500):
+    def _run_kp_padding_fast_test(self, input_shape, pooling_shape, reshape, num_runs=500):
         with tf.Graph().as_default():
             input0 = tf.compat.v1.placeholder(tf.int64, shape=(2,), name="input0")
-            input1 = tf.compat.v1.placeholder(tf.float32, shape=input1_shape, name="input1")
+            input1 = tf.compat.v1.placeholder(tf.float32, shape=pooling_shape, name="input1")
             input2 = tf.compat.v1.placeholder(tf.int32, shape=(), name="input2")
             input3 = tf.compat.v1.placeholder(tf.int32, shape=(2,), name="input3")
             pack = tf.compat.v1.placeholder(tf.int32, shape=(), name="pack")
             """Initialize test data"""
             feed = {
-                input0: np.array([6, input1_shape[1]]).astype(np.int64),
-                input1: np.random.rand(*input1_shape).astype(np.float),
-                input2: input1_shape[0],
-                input3: np.array(input3_shape).astype(np.int32),
-                pack: input1_shape[1],
+                input0: np.array(input_shape).astype(np.int64),
+                input1: np.random.rand(*pooling_shape).astype(np.float),
+                input2: pooling_shape[0],
+                input3: np.array(reshape).astype(np.int32),
+                pack: pooling_shape[1],
             }
             with tf.name_scope("ori"):
                 out_ori = ori_padding_fast_graph(input0, input1, input2, input3, pack)
@@ -172,25 +172,33 @@ class TestFusedEmbeddingPadding(unittest.TestCase):
                 )
 
     
-    def test_kp_padding_shape10(self):
-        input1_shape = (4, 10)
-        input3_shape = (-1, 20)
-        self._run_kp_padding_test(input1_shape, input3_shape, num_runs=100)
-
-    def test_kp_padding_shape12(self):
-        input1_shape = (1, 12)
-        input3_shape = (-1, 36)
-        self._run_kp_padding_test(input1_shape, input3_shape, num_runs=100)
+    def test_kp_padding_shape10_1(self):
+        # input_shape equals pooling_shape: 151 rows of width 10.
+        table_shape, pooled_shape = (151, 10), (151, 10)
+        target_shape = (-1, 1510)
+        for run_case in (self._run_kp_padding_test, self._run_kp_padding_fast_test):
+            run_case(table_shape, pooled_shape, target_shape, num_runs=100)
+    
+    def test_kp_padding_shape10_2(self):
+        # input_shape has 151000 rows, pooling_shape 1510; width is 10.
+        table_shape, pooled_shape = (151000, 10), (1510, 10)
+        target_shape = (-1, 1510)
+        for run_case in (self._run_kp_padding_test, self._run_kp_padding_fast_test):
+            run_case(table_shape, pooled_shape, target_shape, num_runs=100)
+
+    def test_kp_padding_shape12_1(self):
+        # input_shape equals pooling_shape: 2 rows of width 12.
+        table_shape, pooled_shape = (2, 12), (2, 12)
+        target_shape = (-1, 24)
+        for run_case in (self._run_kp_padding_test, self._run_kp_padding_fast_test):
+            run_case(table_shape, pooled_shape, target_shape, num_runs=100)
     
-    def test_kp_padding_fast_shape10(self):
-        input1_shape = (4, 10)
-        input3_shape = (-1, 20)
-        self._run_kp_padding_fast_test(input1_shape, input3_shape, num_runs=100)
-
-    def test_kp_padding_fast_shape12(self):
-        input1_shape = (1, 12)
-        input3_shape = (-1, 36)
-        self._run_kp_padding_fast_test(input1_shape, input3_shape, num_runs=100)
+    def test_kp_padding_shape12_2(self):
+        # input_shape has 2000 rows, pooling_shape 20; width is 12.
+        table_shape, pooled_shape = (2000, 12), (20, 12)
+        target_shape = (-1, 24)
+        for run_case in (self._run_kp_padding_test, self._run_kp_padding_fast_test):
+            run_case(table_shape, pooled_shape, target_shape, num_runs=100)
 
 
 if __name__ == "__main__":