diff --git a/CMakeLists.txt b/CMakeLists.txt
index d10890e77dc91e677c601735dd7c1e314e50106d..94444027fd15bd1b5b4f3070dd3cebb9771b80ff 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -138,6 +138,7 @@ include_directories(${PROJECT_SOURCE_DIR})
 include_directories(${PROJECT_SOURCE_DIR}/torch_npu/csrc/aten)
 include_directories(${PROJECT_SOURCE_DIR}/third_party/hccl/inc)
 include_directories(${PROJECT_SOURCE_DIR}/third_party/acl/inc)
+include_directories(${PROJECT_SOURCE_DIR}/patch/include)
 
 # Set installed PyTorch dir
 if(DEFINED PYTORCH_INSTALL_DIR)
diff --git a/patch/include/c10d/comm.hpp b/patch/include/c10d/comm.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..3a39baccc9532569c6833a5fc927a7135453c606
--- /dev/null
+++ b/patch/include/c10d/comm.hpp
@@ -0,0 +1,125 @@
+#pragma once
+
+#include <ATen/ATen.h>
+#include <ATen/core/ivalue.h>
+#include <c10d/ProcessGroup.hpp>
+
+namespace c10d {
+
+// Broadcast many tensors to all processes in the process group.
+void broadcast_coalesced(
+    c10::intrusive_ptr<ProcessGroup> process_group,
+    at::TensorList tensors,
+    size_t buffer_size,
+    int rank = 0);
+
+// This class passes bucket contents tensor (for multiple replicas) to
+// DDP communication hook.
+// Optionally in the future this can be enhanced with parameter to bucket
+// mappings as well.
+class GradBucket {
+ public:
+  explicit GradBucket(
+      size_t index,
+      const std::vector<at::Tensor>& tensors,
+      const std::vector<size_t>& offsets = {},
+      const std::vector<size_t>& lengths = {},
+      const std::vector<c10::IntArrayRef>& sizes_vec = {})
+      : index_(index),
+        tensors_(tensors),
+        offsets_(offsets),
+        lengths_(lengths),
+        sizes_vec_(sizes_vec) {}
+
+  // Returns the index of the bucket, which is unique across all the buckets.
+  size_t getIndex() const {
+    return index_;
+  }
+
+  // Each tensor in the list that getTensors returns refers to the replica on
+  // each device. There will be multiple replicas only in the case of single
+  // process multiple device mode. In the single process single device mode,
+  // this list would consist of only a single tensor.
+  const std::vector<at::Tensor>& getTensors() const {
+    return tensors_;
+  }
+
+  // Returns a mutable tensor vector compared with the above method.
+  std::vector<at::Tensor>& getTensorsRef() {
+    return tensors_;
+  }
+
+  // Returns the start index of each variable in tensors_[0].
+  const std::vector<size_t>& getOffsets() const {
+    return offsets_;
+  }
+
+  // Returns the total (i.e., flattened) length of each variable in
+  // tensors_[0].
+  const std::vector<size_t>& getLengths() const {
+    return lengths_;
+  }
+
+  // Returns the multi-dimensional sizes/shape of each variable in tensors_[0].
+  const std::vector<c10::IntArrayRef>& getSizesVec() const {
+    return sizes_vec_;
+  }
+
+ private:
+  size_t index_;
+  std::vector<at::Tensor> tensors_;
+
+  // Per-variable info in tensors_[0].
+  std::vector<size_t> offsets_;
+  std::vector<size_t> lengths_;
+  std::vector<c10::IntArrayRef> sizes_vec_;
+};
+
+// Base class of both `PythonCommHook` and `CppCommHook`.
+// Requires implementing 1) `runHook` method that communicates gradients
+// asynchronously, and 2) `parseHookResult` method that converts the hook
+// result into a tensor vector.
+class TORCH_PYTHON_API CommHookInterface {
+ public:
+  virtual ~CommHookInterface() {}
+
+  // Passes the input grad bucket to the registered communication hook.
+  // Once the tensors in the bucket are ready, kicks off the hook asynchronously
+  // and returns a future that holds the communication results.
+  virtual c10::intrusive_ptr<c10::ivalue::Future> runHook(
+      GradBucket& bucket) = 0;
+
+  // Returns the resulting tensors once the communication hook result is
+  // ready. The resulting tensors will then be copied to the grads of
+  // individual parameters.
+  virtual std::vector<at::Tensor> parseHookResult(
+      const c10::IValue& result) = 0;
+};
+
+// This CppCommHook interface only requires implementing runHook method that
+// potentially uses a state.
+// Still need TORCH_PYTHON_API instead of TORCH_API to support Windows platform.
+template <typename T>
+class TORCH_PYTHON_API CppCommHookInterface : public CommHookInterface {
+ public:
+  explicit CppCommHookInterface(T& state) : state_(state) {}
+
+  virtual ~CppCommHookInterface() {}
+
+  std::vector<at::Tensor> parseHookResult(const c10::IValue& result) override {
+    TORCH_INTERNAL_ASSERT(
+        result.isTensor() || result.isTensorList(),
+        "expected the hook result is either a Tensor or a TensorList");
+
+    if (result.isTensor()) {
+      return {result.toTensor()};
+    }
+
+    return result.toTensorVector();
+  }
+
+ protected:
+  T state_; // Not owned.
+};
+
+} // namespace c10d
diff --git a/patch/include/c10d/default_comm_hooks.hpp b/patch/include/c10d/default_comm_hooks.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..077d29bd977de9563e6824497bb20fd89374f187
--- /dev/null
+++ b/patch/include/c10d/default_comm_hooks.hpp
@@ -0,0 +1,33 @@
+#pragma once
+
+#include <c10d/ProcessGroup.hpp>
+#include <c10d/comm.hpp>
+
+namespace c10d {
+
+enum class BuiltinCommHookType {
+  ALLREDUCE = 1,
+  FP16_COMPRESS = 2,
+};
+
+class AllReduceCommHook : public CppCommHookInterface<ProcessGroup*> {
+ public:
+  explicit AllReduceCommHook(ProcessGroup* state)
+      : CppCommHookInterface<ProcessGroup*>(state) {}
+
+  ~AllReduceCommHook() override {}
+
+  c10::intrusive_ptr<c10::ivalue::Future> runHook(GradBucket& bucket) override;
+};
+
+class FP16CompressCommHook : public CppCommHookInterface<ProcessGroup*> {
+ public:
+  explicit FP16CompressCommHook(ProcessGroup* state)
+      : CppCommHookInterface<ProcessGroup*>(state) {}
+
+  ~FP16CompressCommHook() override {}
+
+  c10::intrusive_ptr<c10::ivalue::Future> runHook(GradBucket& bucket) override;
+};
+
+} // namespace c10d
diff --git a/patch/include/c10d/frontend.hpp b/patch/include/c10d/frontend.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d91ac3cb674eaccd448d7ba3f9cac2f0e4d59755
--- /dev/null
+++ b/patch/include/c10d/frontend.hpp
@@ -0,0 +1,261 @@
+#pragma once
+
+#include <c10d/ProcessGroup.hpp>
+#include <c10d/Store.hpp>
+#include <c10d/Types.hpp>
+#include <torch/custom_class.h>
+
+#include <chrono>
+#include <sstream>
+#include <unordered_map>
+#include <unordered_set>
+
+namespace c10d {
+
+#ifdef USE_C10D_GLOO
+constexpr char* GLOO_SOCKET_IFNAME_ENV = "GLOO_SOCKET_IFNAME";
+#endif
+
+inline std::vector<std::string> split(
+    char separator,
+    const std::string& string) {
+  std::vector<std::string> pieces;
+  std::stringstream ss(string);
+  std::string item;
+  while (std::getline(ss, item, separator)) {
+    pieces.push_back(std::move(item));
+  }
+  return pieces;
+}
+
+class Backend {
+ public:
+  // Maps to Backend.__new__ in Python.
+  static std::string get(const std::string&);
+
+  // TODO: How to support registering third_party backend?
+  static void registerBackend();
+
+ private:
+  // TODO: Should this be an enum list instead since this set doesn't
+  // change at all.
+  std::unordered_set<std::string> registered_backends_;
+};
+
+class TORCH_PYTHON_API DistributedC10d : public torch::CustomClassHolder {
+ public:
+  static c10::intrusive_ptr<DistributedC10d> get();
+
+  DistributedC10d() = default;
+
+  void initProcessGroup(
+      const std::string& backend,
+      const std::string& init_method,
+      const std::chrono::milliseconds& timeout,
+      int64_t world_size,
+      int64_t rank,
+      c10::intrusive_ptr<Store> store,
+      const std::string& group_name);
+
+  void destroyProcessGroup(c10::intrusive_ptr<ProcessGroup> group);
+  int64_t getRank(const c10::intrusive_ptr<ProcessGroup>& group) const;
+  int64_t getWorldSize(const c10::intrusive_ptr<ProcessGroup>& group) const;
+
+  c10::intrusive_ptr<ProcessGroup::Work> isend(
+      at::Tensor tensor,
+      int64_t dst,
+      const c10::intrusive_ptr<ProcessGroup>& group,
+      c10::optional<int64_t>& tag);
+
+  c10::intrusive_ptr<ProcessGroup::Work> irecv(
+      at::Tensor tensor,
+      int64_t src,
+      const c10::intrusive_ptr<ProcessGroup>& group,
+      c10::optional<int64_t>& tag);
+
+  void send(
+      at::Tensor tensor,
+      int64_t dst,
+      const c10::intrusive_ptr<ProcessGroup>& group,
+      c10::optional<int64_t>& tag);
+
+  int64_t recv(
+      at::Tensor tensor,
+      const c10::optional<int64_t>& src,
+      const c10::intrusive_ptr<ProcessGroup>& group,
+      c10::optional<int64_t>& tag);
+
+  c10::intrusive_ptr<ProcessGroup::Work> broadcastMultiGPU(
+      std::vector<at::Tensor>& tensor_list,
+      int64_t src,
+      const c10::intrusive_ptr<ProcessGroup>& group,
+      bool async_op = false,
+      int64_t src_tensor = 0);
+
+  c10::intrusive_ptr<ProcessGroup::Work> broadcast(
+      at::Tensor tensor,
+      int64_t src,
+      const c10::intrusive_ptr<ProcessGroup>& group,
+      bool async_op = false);
+
+  c10::intrusive_ptr<ProcessGroup::Work> allReduceMultiGPU(
+      std::vector<at::Tensor>& tensor_list,
+      const c10::intrusive_ptr<ProcessGroup>& group,
+      ReduceOp op = ReduceOp::SUM,
+      bool async_op = false);
+
+  c10::intrusive_ptr<ProcessGroup::Work> allReduce(
+      at::Tensor tensor,
+      const c10::intrusive_ptr<ProcessGroup>& group,
+      ReduceOp op = ReduceOp::SUM,
+      bool async_op = false);
+
+  c10::intrusive_ptr<ProcessGroup::Work> allReduceCoalesced(
+      std::vector<at::Tensor>& tensors,
+      const c10::intrusive_ptr<ProcessGroup>& group,
+      ReduceOp op = ReduceOp::SUM,
+      bool async_op = false);
+
+  c10::intrusive_ptr<ProcessGroup::Work> reduceMultiGPU(
+      std::vector<at::Tensor>& tensor_list,
+      int64_t dst,
+      const c10::intrusive_ptr<ProcessGroup>& group,
+      ReduceOp op = ReduceOp::SUM,
+      bool async_op = false,
+      int64_t dst_tensor = 0);
+
+  c10::intrusive_ptr<ProcessGroup::Work> reduce(
+      at::Tensor tensor,
+      int64_t dst,
+      const c10::intrusive_ptr<ProcessGroup>& group,
+      ReduceOp op = ReduceOp::SUM,
+      bool async_op = false);
+
+  c10::intrusive_ptr<ProcessGroup::Work> allGatherMultiGPU(
+      std::vector<std::vector<at::Tensor>>& output_tensor_lists,
+      std::vector<at::Tensor>& input_tensor_list,
+      const c10::intrusive_ptr<ProcessGroup>& group,
+      bool async_op = false);
+
+  c10::intrusive_ptr<ProcessGroup::Work> allGather(
+      std::vector<at::Tensor>& tensor_list,
+      at::Tensor tensor,
+      const c10::intrusive_ptr<ProcessGroup>& group,
+      bool async_op = false);
+
+  c10::intrusive_ptr<ProcessGroup::Work> allGatherCoalesced(
+      std::vector<std::vector<at::Tensor>>& output_tensor_lists,
+      std::vector<at::Tensor>& input_tensor_list,
+      const c10::intrusive_ptr<ProcessGroup>& group,
+      bool async_op = false);
+
+  c10::intrusive_ptr<ProcessGroup::Work> gather(
+      at::Tensor tensor,
+      const c10::optional<std::vector<at::Tensor>>& gather_list,
+      const c10::intrusive_ptr<ProcessGroup>& group,
+      int64_t dst = 0,
+      bool async_op = false);
+
+  c10::intrusive_ptr<ProcessGroup::Work> scatter(
+      at::Tensor tensor,
+      std::vector<at::Tensor>& scatter_list,
+      const c10::intrusive_ptr<ProcessGroup>& group,
+      int64_t src = 0,
+      bool async_op = false);
+
+  c10::intrusive_ptr<ProcessGroup::Work> reduceScatterMultiGPU(
+      std::vector<at::Tensor>& output_tensor_list,
+      std::vector<std::vector<at::Tensor>>& input_tensor_lists,
+      const c10::intrusive_ptr<ProcessGroup>& group,
+      ReduceOp op = ReduceOp::SUM,
+      bool async_op = false);
+
+  c10::intrusive_ptr<ProcessGroup::Work> reduceScatter(
+      at::Tensor output,
+      std::vector<at::Tensor>& input_tensor_list,
+      const c10::intrusive_ptr<ProcessGroup>& group,
+      ReduceOp op = ReduceOp::SUM,
+      bool async_op = false);
+
+  c10::intrusive_ptr<ProcessGroup::Work> allToAllSingle(
+      at::Tensor output,
+      at::Tensor input,
+      std::vector<int64_t>& output_split_sizes,
+      std::vector<int64_t>& input_split_sizes,
+      const c10::intrusive_ptr<ProcessGroup>& group,
+      bool async_op = false);
+
+  c10::intrusive_ptr<ProcessGroup::Work> allToAll(
+      std::vector<at::Tensor>& output_tensor_list,
+      std::vector<at::Tensor>& input_tensor_list,
+      const c10::intrusive_ptr<ProcessGroup>& group,
+      bool async_op = false);
+
+  c10::intrusive_ptr<ProcessGroup::Work> barrier(
+      const c10::intrusive_ptr<ProcessGroup>& group,
+      bool async_op = false);
+
+  c10::intrusive_ptr<ProcessGroup> newGroup(
+      std::vector<int64_t> ranks,
+      std::chrono::milliseconds timeout,
+      Backend backend);
+
+  c10::intrusive_ptr<ProcessGroup> worldProcessGroup();
+
+  c10::intrusive_ptr<ProcessGroup> newProcessGroupHelper(
+      const int64_t world_size,
+      const int64_t rank,
+      const std::vector<int64_t>& group_ranks,
+      const std::string& backend_str,
+      const c10::intrusive_ptr<Store>& store,
+      c10::optional<std::string> group_name,
+      int64_t timeout_milliseconds);
+
+  c10::intrusive_ptr<ProcessGroup> getProcessGroupByName(
+      const std::string& name) const;
+
+  std::string getNameOfProcessGroup(
+      const c10::intrusive_ptr<ProcessGroup>& pg) const;
+
+  void registerProcessGroupName(const c10::intrusive_ptr<ProcessGroup>& process_group, const std::string& name);
+
+ private:
+
+  bool rankNotInGroup(const c10::intrusive_ptr<ProcessGroup>& group) const;
+  int64_t getGroupRank(
+      const c10::intrusive_ptr<ProcessGroup>& group,
+      const int64_t rank) const;
+  int64_t getGlobalRank(
+      const c10::intrusive_ptr<ProcessGroup>& group,
+      const int64_t group_rank) const;
+  void checkDefaultPg() const;
+  int64_t getGroupSize(const c10::intrusive_ptr<ProcessGroup>& group) const;
+  std::string getBackend(const c10::intrusive_ptr<ProcessGroup>& group);
+
+  std::string backend_;
+  // TODO: Ask Alex what kind of equality we need. It determines whether we
+  // need to use ProcessGroup or ProcessGroup* as key.
+  std::unordered_map<
+      c10::intrusive_ptr<ProcessGroup>,
+      std::pair<std::string, c10::intrusive_ptr<Store>>>
+      pg_map_;
+
+  // Note, this is a different mapping relationship than the original Python
+  // implementation.
+  std::unordered_map<c10::intrusive_ptr<ProcessGroup>, std::string> pg_names_;
+
+  // Process group's global rank to local rank mapping
+  std::unordered_map<
+      c10::intrusive_ptr<ProcessGroup>,
+      std::unordered_map<int64_t, int64_t>>
+      pg_group_ranks_;
+
+  c10::intrusive_ptr<ProcessGroup> default_pg_;
+
+  // Default value should be "env://"
+  std::string default_pg_init_method_;
+
+  int64_t group_count_;
+};
+
+} // namespace c10d