From 548200899831c0d31fdee9c6ba351811fcc511e3 Mon Sep 17 00:00:00 2001
From: tangpeiqi96
Date: Sat, 27 Jul 2024 09:29:36 +0800
Subject: [PATCH] add flops computing interface

---
 CMakeLists.txt                             |   4 +-
 torch_npu/csrc/InitNpuBindings.cpp         |   2 +
 torch_npu/csrc/flopcount/CMakeLists.txt    |   6 ++
 torch_npu/csrc/flopcount/FlopCount.h       |  18 ++++
 torch_npu/csrc/flopcount/FlopCountImpl.cpp |  35 +++++++
 torch_npu/csrc/flopcount/FlopCountImpl.h   |  22 +++++
 torch_npu/csrc/flopcount/FlopCounter.cpp   | 105 +++++++++++++++++++++
 torch_npu/csrc/flopcount/FlopCounter.h     |  22 +++++
 torch_npu/csrc/flopcount/Init.cpp          |  55 +++++++++++
 torch_npu/csrc/flopcount/Init.h            |  11 +++
 torch_npu/utils/__init__.py                |   6 ++
 torch_npu/utils/flops_count.py             |  30 ++++++
 12 files changed, 315 insertions(+), 1 deletion(-)
 create mode 100644 torch_npu/csrc/flopcount/CMakeLists.txt
 create mode 100644 torch_npu/csrc/flopcount/FlopCount.h
 create mode 100644 torch_npu/csrc/flopcount/FlopCountImpl.cpp
 create mode 100644 torch_npu/csrc/flopcount/FlopCountImpl.h
 create mode 100644 torch_npu/csrc/flopcount/FlopCounter.cpp
 create mode 100644 torch_npu/csrc/flopcount/FlopCounter.h
 create mode 100644 torch_npu/csrc/flopcount/Init.cpp
 create mode 100644 torch_npu/csrc/flopcount/Init.h
 create mode 100644 torch_npu/utils/flops_count.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1b1a81efd08..d9732f54ad1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -196,6 +196,7 @@ set(FRAMEWORK_SRCS)
 
 if (NOT DEFINED BUILD_LIBTORCH)
     set(DIST_SRCS)
+    set(FLOP_SRCS)
     set(NPU_SRCS)
     set(PROF_SRCS)
     set(UTILS_SRCS)
@@ -216,6 +217,7 @@ if (NOT DEFINED BUILD_LIBTORCH)
     add_subdirectory(${TORCHNPU_ROOT}/profiler)
     add_subdirectory(${TORCHNPU_ROOT}/utils)
     add_subdirectory(${TORCHNPU_ROOT}/sanitizer)
+    add_subdirectory(${TORCHNPU_ROOT}/flopcount)
 endif()
 
 if (DEFINED BUILD_LIBTORCH)
@@ -238,7 +240,7 @@ if (DEFINED BUILD_LIBTORCH)
     set(CPP_SRCS ${ATEN_SRCS} ${CORE_SRCS} ${OPS_PLUGIN_SRCS} ${FRAMEWORK_SRCS} ${NPU_CPP_LIBS_SRCS})
 else()
     # Compile code with pybind11
-    set(CPP_SRCS ${ATEN_SRCS} ${CORE_SRCS} ${OPS_PLUGIN_SRCS} ${DIST_SRCS} ${FRAMEWORK_SRCS} ${NPU_SRCS} ${PROF_SRCS} ${UTILS_SRCS} ${SAN_SRCS})
+    set(CPP_SRCS ${ATEN_SRCS} ${CORE_SRCS} ${OPS_PLUGIN_SRCS} ${DIST_SRCS} ${FLOP_SRCS} ${FRAMEWORK_SRCS} ${NPU_SRCS} ${PROF_SRCS} ${UTILS_SRCS} ${SAN_SRCS})
 endif()
 
 add_library(${PLUGIN_NAME} SHARED ${CPP_SRCS})
diff --git a/torch_npu/csrc/InitNpuBindings.cpp b/torch_npu/csrc/InitNpuBindings.cpp
index 139fcb45c18..283c3c26ec4 100644
--- a/torch_npu/csrc/InitNpuBindings.cpp
+++ b/torch_npu/csrc/InitNpuBindings.cpp
@@ -12,6 +12,7 @@
 #include "torch_npu/csrc/core/npu/THNPUCachingHostAllocator.h"
 #include "torch_npu/csrc/distributed/Init.h"
 #include "torch_npu/csrc/profiler/init.h"
+#include "torch_npu/csrc/flopcount/Init.h"
 #include "torch_npu/csrc/npu/Module.h"
 #include "torch_npu/csrc/utils/TensorType.h"
 #include "torch_npu/csrc/utils/AutocastMode.h"
@@ -126,6 +127,7 @@ PyObject* initModule() {
     AddPyMethodDefs(methods, torch_npu::distributed::python_functions());
     AddPyMethodDefs(methods, torch_npu::utils::npu_extension_functions());
     AddPyMethodDefs(methods, torch_npu::autocast::autocast_mode_functions());
+    AddPyMethodDefs(methods, torch_npu::flopcount::flops_count_functions());
     static struct PyModuleDef torchnpu_module = {
         PyModuleDef_HEAD_INIT,
         "torch_npu._C",
diff --git a/torch_npu/csrc/flopcount/CMakeLists.txt b/torch_npu/csrc/flopcount/CMakeLists.txt
new file mode 100644
index 00000000000..769250a65fe
--- /dev/null
+++ b/torch_npu/csrc/flopcount/CMakeLists.txt
@@ -0,0 +1,6 @@
+FILE(GLOB _FLOP_SRCS *.cpp)
+
+LIST(APPEND FLOP_SRCS ${_FLOP_SRCS})
+
+# Pass to parent
+set(FLOP_SRCS ${FLOP_SRCS} PARENT_SCOPE)
diff --git a/torch_npu/csrc/flopcount/FlopCount.h b/torch_npu/csrc/flopcount/FlopCount.h
new file mode 100644
index 00000000000..f1a250a140c
--- /dev/null
+++ b/torch_npu/csrc/flopcount/FlopCount.h
@@ -0,0 +1,18 @@
+#ifndef FLOP_COUNT_H
+#define FLOP_COUNT_H
+
+#include "torch_npu/csrc/flopcount/FlopCountImpl.h"
+
+// Evaluate flopcount_func(__VA_ARGS__) and accumulate the result into the
+// process-wide singleton: traversedCount always accrues while counting is
+// enabled, recordedCount only while the counter is not paused.
+#define FLOP_COUNT(flopcount_func, ...)                          \
+do {                                                             \
+    FlopCountImpl& countImpl = FlopCountImpl::GetInstance();     \
+    if (countImpl.isEnabled) {                                   \
+        int64_t flops = flopcount_func(__VA_ARGS__);             \
+        countImpl.traversedCount += flops;                       \
+        if (!countImpl.isPaused) {                               \
+            countImpl.recordedCount += flops;                    \
+        }                                                        \
+    }                                                            \
+} while (0)
+
+#endif
diff --git a/torch_npu/csrc/flopcount/FlopCountImpl.cpp b/torch_npu/csrc/flopcount/FlopCountImpl.cpp
new file mode 100644
index 00000000000..99667e1efe0
--- /dev/null
+++ b/torch_npu/csrc/flopcount/FlopCountImpl.cpp
@@ -0,0 +1,35 @@
+#include "torch_npu/csrc/flopcount/FlopCountImpl.h"
+
+FlopCountImpl &FlopCountImpl::GetInstance()
+{
+    static FlopCountImpl instance;
+    return instance;
+}
+
+void FlopCountImpl::enable()
+{
+    isEnabled = true;
+}
+
+void FlopCountImpl::disable()
+{
+    isEnabled = false;
+}
+
+void FlopCountImpl::pause()
+{
+    isPaused = true;
+}
+
+void FlopCountImpl::resume()
+{
+    isPaused = false;
+}
+
+void FlopCountImpl::reset()
+{
+    isEnabled = false;
+    isPaused = false;
+    recordedCount = 0;
+    traversedCount = 0;
+}
diff --git a/torch_npu/csrc/flopcount/FlopCountImpl.h b/torch_npu/csrc/flopcount/FlopCountImpl.h
new file mode 100644
index 00000000000..86a8c9b55d8
--- /dev/null
+++ b/torch_npu/csrc/flopcount/FlopCountImpl.h
@@ -0,0 +1,22 @@
+#ifndef FLOP_COUNT_IMPL_H
+#define FLOP_COUNT_IMPL_H
+
+#include <cstdint>
+
+class FlopCountImpl {
+public:
+    bool isEnabled;
+    bool isPaused;
+    int64_t recordedCount;
+    int64_t traversedCount;
+
+    static FlopCountImpl& GetInstance();
+    void enable();
+    void disable();
+    void pause();
+    void resume();
+    void reset();
+
+private:
+    FlopCountImpl() : isEnabled(false), isPaused(false), recordedCount(0), traversedCount(0) {}
+};
+
+#endif // FLOP_COUNT_IMPL_H
diff --git a/torch_npu/csrc/flopcount/FlopCounter.cpp b/torch_npu/csrc/flopcount/FlopCounter.cpp
new file mode 100644
index 00000000000..dd4da56ba10
--- /dev/null
+++ b/torch_npu/csrc/flopcount/FlopCounter.cpp
@@ -0,0 +1,105 @@
+#include <functional>
+#include <numeric>
+#include <vector>
+#include "torch_npu/csrc/flopcount/FlopCounter.h"
+#include "torch_npu/csrc/core/npu/NPUException.h"
+
+int64_t FlopCounter::mm_flop(const at::Tensor &self, const at::Tensor &mat2)
+{
+    // Count flops for matmul: self is (m, k), mat2 is (k, n). Every output
+    // element takes k multiplies and k adds, so 2 * m * k * n flops in total.
+    int64_t m = self.size(0);
+    int64_t k = self.size(1);
+    int64_t k2 = mat2.size(0);
+    int64_t n = mat2.size(1);
+    TORCH_CHECK(k == k2, "mm: inner dimensions of self and mat2 must match", PTA_ERROR(ErrCode::VALUE));
+    return m * n * 2 * k;
+}
+
+int64_t FlopCounter::addmm_flop(const at::Tensor &mat1, const at::Tensor &mat2)
+{
+    // Only the matmul is counted; the bias addition is ignored.
+    return mm_flop(mat1, mat2);
+}
+
+int64_t FlopCounter::bmm_flop(const at::Tensor &self, const at::Tensor &mat2)
+{
+    // Count flops for the bmm operation: self is (b, m, k), mat2 is (b, k, n),
+    // giving 2 * b * m * k * n flops.
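+    // For example, (8, 64, 128) x (8, 128, 32) costs 2 * 8 * 64 * 128 * 32 = 4194304 flops.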
+    int64_t b = self.size(0);
+    int64_t m = self.size(1);
+    int64_t k = self.size(2);
+    int64_t b2 = mat2.size(0);
+    int64_t k2 = mat2.size(1);
+    int64_t n = mat2.size(2);
+    TORCH_CHECK(b == b2 && k == k2, "bmm: batch and inner dimensions of self and mat2 must match", PTA_ERROR(ErrCode::VALUE));
+    return b * m * n * 2 * k;
+}
+
+int64_t FlopCounter::baddbmm_flop(const at::Tensor &batch1, const at::Tensor &batch2)
+{
+    // As with addmm, the bias addition is ignored.
+    return bmm_flop(batch1, batch2);
+}
+
+int64_t conv_flop_count(std::vector<int64_t> x_shape, std::vector<int64_t> w_shape, std::vector<int64_t> out_shape, bool transposed)
+{
+    // Count flops for convolution. Each output element costs one multiply and
+    // one add per input channel and filter position, hence the factor of 2;
+    // computation for the bias is ignored. For a transposed convolution the
+    // product is taken over the full output shape instead of only its spatial part.
+    // Args:
+    //     x_shape (std::vector<int64_t>): The input shape before convolution.
+    //     w_shape (std::vector<int64_t>): The filter shape.
+    //     out_shape (std::vector<int64_t>): The output shape after convolution.
+    //     transposed (bool): whether the convolution is transposed.
+    // Returns:
+    //     int64_t: the number of flops.
+    int64_t batch_size = x_shape[0];
+    std::vector<int64_t> conv_shape = transposed ? out_shape : std::vector<int64_t>(out_shape.begin() + 2, out_shape.end());
+    int64_t c_out = w_shape[0];
+    int64_t c_in = w_shape[1];
+    int64_t filter_size = std::accumulate(w_shape.begin() + 2, w_shape.end(), static_cast<int64_t>(1), std::multiplies<int64_t>());
+
+    int64_t flop = std::accumulate(conv_shape.begin(), conv_shape.end(), static_cast<int64_t>(1), std::multiplies<int64_t>()) * filter_size * batch_size * c_out * c_in * 2;
+    return flop;
+}
+
+int64_t FlopCounter::conv_flop(const at::Tensor &input, const at::Tensor &weight, bool transposed, at::Tensor output)
+{
+    // Count flops for convolution.
+    std::vector<int64_t> out_shape(output.sizes().begin(), output.sizes().end());
+    std::vector<int64_t> x_shape(input.sizes().begin(), input.sizes().end());
+    std::vector<int64_t> w_shape(weight.sizes().begin(), weight.sizes().end());
+
+    return conv_flop_count(x_shape, w_shape, out_shape, transposed);
+}
+
+// Swap the first two dimensions of a 4-d shape (a shape-level transpose).
+std::vector<int64_t> t(std::vector<int64_t> shape)
+{
+    return {shape[1], shape[0], shape[2], shape[3]};
+}
+
+int64_t FlopCounter::conv_backward_flop(const at::Tensor &grad_output, const at::Tensor &input,
+    const at::Tensor &weight, bool transposed, ::std::array<bool, 3> output_mask,
+    at::Tensor gradInput, at::Tensor gradWeight)
+{
+    std::vector<int64_t> grad_output_shape(grad_output.sizes().begin(), grad_output.sizes().end());
+    std::vector<int64_t> w_shape(weight.sizes().begin(), weight.sizes().end());
+    std::vector<int64_t> input_shape(input.sizes().begin(), input.sizes().end());
+
+    int64_t flop_count = 0;
+
+    if (output_mask[0]) {
+        std::vector<int64_t> grad_input_shape(gradInput.sizes().begin(), gradInput.sizes().end());
+        flop_count += conv_flop_count(grad_output_shape, w_shape, grad_input_shape, !transposed);
+    }
+
+    if (output_mask[1]) {
+        std::vector<int64_t> grad_weight_shape(gradWeight.sizes().begin(), gradWeight.sizes().end());
+        if (transposed) {
+            flop_count += conv_flop_count(t(grad_output_shape), t(input_shape), t(grad_weight_shape), false);
+        } else {
+            flop_count += conv_flop_count(t(input_shape), t(grad_output_shape), t(grad_weight_shape), false);
+        }
+    }
+
+    return flop_count;
+}
diff --git a/torch_npu/csrc/flopcount/FlopCounter.h b/torch_npu/csrc/flopcount/FlopCounter.h
new file mode 100644
index 00000000000..b6f9caec922
--- /dev/null
+++ b/torch_npu/csrc/flopcount/FlopCounter.h
@@ -0,0 +1,22 @@
+#ifndef FLOP_COUNTER_H
+#define FLOP_COUNTER_H
+
+#include <array>
+#include <ATen/ATen.h>
+
+class FlopCounter {
+public:
+    FlopCounter() = default;
+    ~FlopCounter() = default;
+
+    static int64_t mm_flop(const at::Tensor &self, const at::Tensor &mat2);
+    static int64_t addmm_flop(const at::Tensor &mat1, const at::Tensor &mat2);
+    static int64_t bmm_flop(const at::Tensor &self, const at::Tensor &mat2);
+    static int64_t baddbmm_flop(const at::Tensor &batch1, const at::Tensor &batch2);
+    static int64_t conv_flop(const at::Tensor &input, const at::Tensor &weight, bool transposed, at::Tensor output);
+    static int64_t conv_backward_flop(const at::Tensor &grad_output, const at::Tensor &input,
+        const at::Tensor &weight, bool transposed, ::std::array<bool, 3> output_mask,
+        at::Tensor gradInput, at::Tensor gradWeight);
+};
+
+#endif
diff --git a/torch_npu/csrc/flopcount/Init.cpp b/torch_npu/csrc/flopcount/Init.cpp
new file mode 100644
index 00000000000..f7c103e47c8
--- /dev/null
+++ b/torch_npu/csrc/flopcount/Init.cpp
@@ -0,0 +1,55 @@
+#include "torch_npu/csrc/flopcount/Init.h"
+
+#include <memory>
+#include <pybind11/pybind11.h>
+#include <torch/csrc/python_headers.h>
+#include <torch/csrc/utils/object_ptr.h>
+#include <torch/csrc/utils/pybind.h>
+
+#include "torch_npu/csrc/flopcount/FlopCountImpl.h"
+
+namespace torch_npu {
+namespace flopcount {
+
+template <typename T>
+using shared_ptr_class_ = py::class_<T, std::shared_ptr<T>>;
+
+PyObject* flops_count_init(PyObject* _unused, PyObject* noargs)
+{
+    auto torch_npu_C_module = THPObjectPtr(PyImport_ImportModule("torch_npu._C"));
+    if (!torch_npu_C_module) {
+        return nullptr;
+    }
+    auto torch_npu_C_m = py::handle(torch_npu_C_module).cast<py::module>();
+    auto m = torch_npu_C_m.def_submodule("_flops_count", "flops count bindings");
+    auto module = py::handle(m).cast<py::module>();
+
+    shared_ptr_class_<FlopCountImpl>(module, "_FlopCountImpl")
+        .def_static("GetInstance", &FlopCountImpl::GetInstance, py::return_value_policy::reference)
+        .def("enable", &FlopCountImpl::enable)
+        .def("disable", &FlopCountImpl::disable)
+        .def("pause", &FlopCountImpl::pause)
+        .def("resume", &FlopCountImpl::resume)
+        .def("reset", &FlopCountImpl::reset)
+        .def_readonly("recordedCount", &FlopCountImpl::recordedCount)
+        .def_readonly("traversedCount", &FlopCountImpl::traversedCount);
+
+    Py_RETURN_TRUE;
+}
+
+// flops count methods exposed on torch_npu._C
+static PyMethodDef TorchFlopsMethods[] = { // NOLINT
+    {"_flops_count_init", flops_count_init, METH_NOARGS, nullptr},
+    {nullptr, nullptr, 0, nullptr}
+};
+
+PyMethodDef* flops_count_functions()
+{
+    return TorchFlopsMethods;
+}
+
+} // namespace flopcount
+} // namespace torch_npu
diff --git a/torch_npu/csrc/flopcount/Init.h b/torch_npu/csrc/flopcount/Init.h
new file mode 100644
index 00000000000..379f422ba45
--- /dev/null
+++ b/torch_npu/csrc/flopcount/Init.h
@@ -0,0 +1,11 @@
+#pragma once
+
+#include <torch/csrc/python_headers.h>
+#include "torch_npu/csrc/core/npu/NPUMacros.h"
+
+namespace torch_npu {
+namespace flopcount {
+TORCH_NPU_API PyMethodDef* flops_count_functions();
+
+} // namespace flopcount
+} // namespace torch_npu
diff --git a/torch_npu/utils/__init__.py b/torch_npu/utils/__init__.py
index 91030444b72..5b300a5c9f1 100644
--- a/torch_npu/utils/__init__.py
+++ b/torch_npu/utils/__init__.py
@@ -1,3 +1,4 @@
+import torch_npu
 from ._module import _apply_module_patch
 from .tensor_methods import _add_tensor_methods
 from .storage import _add_storage_methods
@@ -14,3 +15,8 @@ from .asd_detector import set_asd_loss_scale, register_asd_hook
 from ._step import add_perf_dump_patch
 
 __all__ = []
+
+
+# init flopcount
+from ._error_code import ErrCode, prof_error
+if not torch_npu._C._flops_count_init():
+    raise RuntimeError("flopcount initialization failed" + prof_error(ErrCode.UNAVAIL))
diff --git a/torch_npu/utils/flops_count.py b/torch_npu/utils/flops_count.py
new file mode 100644
index 00000000000..7e2f31308af
--- /dev/null
+++ b/torch_npu/utils/flops_count.py
@@ -0,0 +1,30 @@
+from torch_npu._C._flops_count import _FlopCountImpl
+
+
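+# Thin Python wrapper around the _FlopCountImpl singleton. Counting can be
+# driven explicitly via start()/stop() or scoped with a "with" block.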
+class FlopsCounter:
+    def __init__(self):
+        self.flop_count_instance = _FlopCountImpl.GetInstance()
+
+    def __enter__(self):
+        self.flop_count_instance.enable()
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.flop_count_instance.disable()
+
+    def start(self):
+        self.flop_count_instance.enable()
+
+    def stop(self):
+        self.flop_count_instance.disable()
+        self.flop_count_instance.reset()
+
+    def pause(self):
+        self.flop_count_instance.pause()
+
+    def resume(self):
+        self.flop_count_instance.resume()
+
+    def get_flops(self):
+        recorded_count = self.flop_count_instance.recordedCount
+        traversed_count = self.flop_count_instance.traversedCount
+        return [recorded_count, traversed_count]
-- 
Gitee
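
Usage sketch (assumptions: the plugin is built with this patch applied, and NPU
operator implementations invoke the FLOP_COUNT macro with the matching
FlopCounter formula -- those call sites are not added here, so which operators
actually contribute depends on the rest of the plugin):

    import torch
    import torch_npu
    from torch_npu.utils.flops_count import FlopsCounter

    counter = FlopsCounter()
    counter.start()

    # If matmul is instrumented, a (128, 256) x (256, 512) mm contributes
    # 2 * 128 * 256 * 512 = 33554432 flops to both counters.
    a = torch.randn(128, 256).npu()
    b = torch.randn(256, 512).npu()
    c = torch.matmul(a, b)

    counter.pause()          # traversedCount keeps accruing; recordedCount does not
    _ = torch.matmul(a, b)
    counter.resume()

    recorded, traversed = counter.get_flops()  # e.g. [33554432, 67108864]
    counter.stop()           # disables counting and resets both counters to zero

FlopsCounter also works as a context manager: __enter__ enables counting and
__exit__ disables it without resetting, so get_flops() can still be read after
the with block.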