From 548200899831c0d31fdee9c6ba351811fcc511e3 Mon Sep 17 00:00:00 2001
From: tangpeiqi96
Date: Sat, 27 Jul 2024 09:29:36 +0800
Subject: [PATCH] add flops computing interface

---
 CMakeLists.txt                             |   4 +-
 torch_npu/csrc/InitNpuBindings.cpp         |   2 +
 torch_npu/csrc/flopcount/CMakeLists.txt    |   6 ++
 torch_npu/csrc/flopcount/FlopCount.h       |  18 ++++
 torch_npu/csrc/flopcount/FlopCountImpl.cpp |  35 +++++++
 torch_npu/csrc/flopcount/FlopCountImpl.h   |  22 +++++
 torch_npu/csrc/flopcount/FlopCounter.cpp   | 105 +++++++++++++++++++++
 torch_npu/csrc/flopcount/FlopCounter.h     |  22 +++++
 torch_npu/csrc/flopcount/Init.cpp          |  55 +++++++++++
 torch_npu/csrc/flopcount/Init.h            |  11 +++
 torch_npu/utils/__init__.py                |   6 ++
 torch_npu/utils/flops_count.py             |  30 ++++++
 12 files changed, 315 insertions(+), 1 deletion(-)
 create mode 100644 torch_npu/csrc/flopcount/CMakeLists.txt
 create mode 100644 torch_npu/csrc/flopcount/FlopCount.h
 create mode 100644 torch_npu/csrc/flopcount/FlopCountImpl.cpp
 create mode 100644 torch_npu/csrc/flopcount/FlopCountImpl.h
 create mode 100644 torch_npu/csrc/flopcount/FlopCounter.cpp
 create mode 100644 torch_npu/csrc/flopcount/FlopCounter.h
 create mode 100644 torch_npu/csrc/flopcount/Init.cpp
 create mode 100644 torch_npu/csrc/flopcount/Init.h
 create mode 100644 torch_npu/utils/flops_count.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1b1a81efd08..d9732f54ad1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -196,6 +196,7 @@ set(FRAMEWORK_SRCS)
 
 if (NOT DEFINED BUILD_LIBTORCH)
     set(DIST_SRCS)
+    set(FLOP_SRCS)
     set(NPU_SRCS)
     set(PROF_SRCS)
     set(UTILS_SRCS)
@@ -216,6 +217,7 @@ if (NOT DEFINED BUILD_LIBTORCH)
     add_subdirectory(${TORCHNPU_ROOT}/profiler)
     add_subdirectory(${TORCHNPU_ROOT}/utils)
     add_subdirectory(${TORCHNPU_ROOT}/sanitizer)
+    add_subdirectory(${TORCHNPU_ROOT}/flopcount)
 endif()
 
 if (DEFINED BUILD_LIBTORCH)
@@ -238,7 +240,7 @@ if (DEFINED BUILD_LIBTORCH)
     set(CPP_SRCS ${ATEN_SRCS} ${CORE_SRCS} ${OPS_PLUGIN_SRCS} ${FRAMEWORK_SRCS} ${NPU_CPP_LIBS_SRCS})
 else()
     # Compile code with pybind11
-    set(CPP_SRCS ${ATEN_SRCS} ${CORE_SRCS} ${OPS_PLUGIN_SRCS} ${DIST_SRCS} ${FRAMEWORK_SRCS} ${NPU_SRCS} ${PROF_SRCS} ${UTILS_SRCS} ${SAN_SRCS})
+    set(CPP_SRCS ${ATEN_SRCS} ${CORE_SRCS} ${OPS_PLUGIN_SRCS} ${DIST_SRCS} ${FLOP_SRCS} ${FRAMEWORK_SRCS} ${NPU_SRCS} ${PROF_SRCS} ${UTILS_SRCS} ${SAN_SRCS})
 endif()
 
 add_library(${PLUGIN_NAME} SHARED ${CPP_SRCS})
diff --git a/torch_npu/csrc/InitNpuBindings.cpp b/torch_npu/csrc/InitNpuBindings.cpp
index 139fcb45c18..283c3c26ec4 100644
--- a/torch_npu/csrc/InitNpuBindings.cpp
+++ b/torch_npu/csrc/InitNpuBindings.cpp
@@ -12,6 +12,7 @@
 #include "torch_npu/csrc/core/npu/THNPUCachingHostAllocator.h"
 #include "torch_npu/csrc/distributed/Init.h"
 #include "torch_npu/csrc/profiler/init.h"
+#include "torch_npu/csrc/flopcount/Init.h"
 #include "torch_npu/csrc/npu/Module.h"
 #include "torch_npu/csrc/utils/TensorType.h"
 #include "torch_npu/csrc/utils/AutocastMode.h"
@@ -126,6 +127,7 @@ PyObject* initModule() {
     AddPyMethodDefs(methods, torch_npu::distributed::python_functions());
     AddPyMethodDefs(methods, torch_npu::utils::npu_extension_functions());
     AddPyMethodDefs(methods, torch_npu::autocast::autocast_mode_functions());
+    AddPyMethodDefs(methods, torch_npu::flopcount::flops_count_functions());
     static struct PyModuleDef torchnpu_module = {
         PyModuleDef_HEAD_INIT,
         "torch_npu._C",
diff --git a/torch_npu/csrc/flopcount/CMakeLists.txt b/torch_npu/csrc/flopcount/CMakeLists.txt
new file mode 100644
index 00000000000..769250a65fe
--- /dev/null
+++ b/torch_npu/csrc/flopcount/CMakeLists.txt
@@ -0,0 +1,6 @@
+FILE(GLOB _FLOP_SRCS *.cpp)
+
+LIST(APPEND FLOP_SRCS ${_FLOP_SRCS})
+
+# Pass to parent
+set(FLOP_SRCS ${FLOP_SRCS} PARENT_SCOPE)
diff --git a/torch_npu/csrc/flopcount/FlopCount.h b/torch_npu/csrc/flopcount/FlopCount.h
new file mode 100644
index 00000000000..f1a250a140c
--- /dev/null
+++ b/torch_npu/csrc/flopcount/FlopCount.h
@@ -0,0 +1,18 @@
+#ifndef FLOP_COUNT_H
+#define FLOP_COUNT_H
+
+#include "torch_npu/csrc/flopcount/FlopCountImpl.h"
+
+// Evaluate flopcount_func(__VA_ARGS__) and accumulate the result into the
+// process-wide singleton: traversedCount always accrues while counting is
+// enabled, recordedCount only while the counter is not paused.
+#define FLOP_COUNT(flopcount_func, ...)                          \
+do {                                                             \
+    FlopCountImpl& countImpl = FlopCountImpl::GetInstance();     \
+    if (countImpl.isEnabled) {                                   \
+        int64_t flops = flopcount_func(__VA_ARGS__);             \
+        countImpl.traversedCount += flops;                       \
+        if (!countImpl.isPaused) {                               \
+            countImpl.recordedCount += flops;                    \
+        }                                                        \
+    }                                                            \
+} while (0)
+
+#endif
diff --git a/torch_npu/csrc/flopcount/FlopCountImpl.cpp b/torch_npu/csrc/flopcount/FlopCountImpl.cpp
new file mode 100644
index 00000000000..99667e1efe0
--- /dev/null
+++ b/torch_npu/csrc/flopcount/FlopCountImpl.cpp
@@ -0,0 +1,35 @@
+#include "torch_npu/csrc/flopcount/FlopCountImpl.h"
+
+FlopCountImpl &FlopCountImpl::GetInstance()
+{
+    static FlopCountImpl instance;
+    return instance;
+}
+
+void FlopCountImpl::enable()
+{
+    isEnabled = true;
+}
+
+void FlopCountImpl::disable()
+{
+    isEnabled = false;
+}
+
+void FlopCountImpl::pause()
+{
+    isPaused = true;
+}
+
+void FlopCountImpl::resume()
+{
+    isPaused = false;
+}
+
+void FlopCountImpl::reset()
+{
+    isEnabled = false;
+    isPaused = false;
+    recordedCount = 0;
+    traversedCount = 0;
+}
diff --git a/torch_npu/csrc/flopcount/FlopCountImpl.h b/torch_npu/csrc/flopcount/FlopCountImpl.h
new file mode 100644
index 00000000000..86a8c9b55d8
--- /dev/null
+++ b/torch_npu/csrc/flopcount/FlopCountImpl.h
@@ -0,0 +1,22 @@
+#ifndef FLOP_COUNT_IMPL_H
+#define FLOP_COUNT_IMPL_H
+
+#include <cstdint>
+
+class FlopCountImpl {
+public:
+    bool isEnabled;
+    bool isPaused;
+    int64_t recordedCount;
+    int64_t traversedCount;
+
+    static FlopCountImpl& GetInstance();
+    void enable();
+    void disable();
+    void pause();
+    void resume();
+    void reset();
+
+private:
+    FlopCountImpl() : isEnabled(false), isPaused(false), recordedCount(0), traversedCount(0) {}
+};
+
+#endif // FLOP_COUNT_IMPL_H
diff --git a/torch_npu/csrc/flopcount/FlopCounter.cpp b/torch_npu/csrc/flopcount/FlopCounter.cpp
new file mode 100644
index 00000000000..dd4da56ba10
--- /dev/null
+++ b/torch_npu/csrc/flopcount/FlopCounter.cpp
@@ -0,0 +1,105 @@
+#include <functional>
+#include <numeric>
+#include <vector>
+#include "torch_npu/csrc/flopcount/FlopCounter.h"
+#include "torch_npu/csrc/core/npu/NPUException.h"
+
+int64_t FlopCounter::mm_flop(const at::Tensor &self, const at::Tensor &mat2)
+{
+    // Count flops for matmul: self is (m, k), mat2 is (k, n). Every output
+    // element takes k multiplies and k adds, so 2 * m * k * n flops in total.
+    int64_t m = self.size(0);
+    int64_t k = self.size(1);
+    int64_t k2 = mat2.size(0);
+    int64_t n = mat2.size(1);
+    TORCH_CHECK(k == k2, "mm: inner dimensions of self and mat2 must match", PTA_ERROR(ErrCode::VALUE));
+    return m * n * 2 * k;
+}
+
+int64_t FlopCounter::addmm_flop(const at::Tensor &mat1, const at::Tensor &mat2)
+{
+    // Only the matmul is counted; the bias addition is ignored.
+    return mm_flop(mat1, mat2);
+}
+
+int64_t FlopCounter::bmm_flop(const at::Tensor &self, const at::Tensor &mat2)
+{
+    // Count flops for the bmm operation: self is (b, m, k), mat2 is (b, k, n),
+    // giving 2 * b * m * k * n flops.
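+    // For example, (8, 64, 128) x (8, 128, 32) costs 2 * 8 * 64 * 128 * 32 = 4194304 flops.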
+    int64_t b = self.size(0);
+    int64_t m = self.size(1);
+    int64_t k = self.size(2);
+    int64_t b2 = mat2.size(0);
+    int64_t k2 = mat2.size(1);
+    int64_t n = mat2.size(2);
+    TORCH_CHECK(b == b2 && k == k2, "bmm: batch and inner dimensions of self and mat2 must match", PTA_ERROR(ErrCode::VALUE));
+    return b * m * n * 2 * k;
+}
+
+int64_t FlopCounter::baddbmm_flop(const at::Tensor &batch1, const at::Tensor &batch2)
+{
+    // As with addmm, the bias addition is ignored.
+    return bmm_flop(batch1, batch2);
+}
+
+int64_t conv_flop_count(std::vector<int64_t> x_shape, std::vector<int64_t> w_shape, std::vector<int64_t> out_shape, bool transposed)
+{
+    // Count flops for convolution. Each output element costs one multiply and
+    // one add per input channel and filter position, hence the factor of 2;
+    // computation for the bias is ignored. For a transposed convolution the
+    // product is taken over the full output shape instead of only its spatial part.
+    // Args:
+    //     x_shape (std::vector<int64_t>): The input shape before convolution.
+    //     w_shape (std::vector<int64_t>): The filter shape.
+    //     out_shape (std::vector<int64_t>): The output shape after convolution.
+    //     transposed (bool): whether the convolution is transposed.
+    // Returns:
+    //     int64_t: the number of flops.
+    int64_t batch_size = x_shape[0];
+    std::vector<int64_t> conv_shape = transposed ? out_shape : std::vector<int64_t>(out_shape.begin() + 2, out_shape.end());
+    int64_t c_out = w_shape[0];
+    int64_t c_in = w_shape[1];
+    int64_t filter_size = std::accumulate(w_shape.begin() + 2, w_shape.end(), static_cast<int64_t>(1), std::multiplies<int64_t>());
+
+    int64_t flop = std::accumulate(conv_shape.begin(), conv_shape.end(), static_cast<int64_t>(1), std::multiplies<int64_t>()) * filter_size * batch_size * c_out * c_in * 2;
+    return flop;
+}
+
+int64_t FlopCounter::conv_flop(const at::Tensor &input, const at::Tensor &weight, bool transposed, at::Tensor output)
+{
+    // Count flops for convolution.
+    std::vector<int64_t> out_shape(output.sizes().begin(), output.sizes().end());
+    std::vector<int64_t> x_shape(input.sizes().begin(), input.sizes().end());
+    std::vector<int64_t> w_shape(weight.sizes().begin(), weight.sizes().end());
+
+    return conv_flop_count(x_shape, w_shape, out_shape, transposed);
+}
+
+// Swap the first two dimensions of a 4-d shape (a shape-level transpose).
+std::vector<int64_t> t(std::vector<int64_t> shape)
+{
+    return {shape[1], shape[0], shape[2], shape[3]};
+}
+
+int64_t FlopCounter::conv_backward_flop(const at::Tensor &grad_output, const at::Tensor &input,
+    const at::Tensor &weight, bool transposed, ::std::array<bool, 3> output_mask,
+    at::Tensor gradInput, at::Tensor gradWeight)
+{
+    std::vector<int64_t> grad_output_shape(grad_output.sizes().begin(), grad_output.sizes().end());
+    std::vector<int64_t> w_shape(weight.sizes().begin(), weight.sizes().end());
+    std::vector<int64_t> input_shape(input.sizes().begin(), input.sizes().end());
+
+    int64_t flop_count = 0;
+
+    if (output_mask[0]) {
+        std::vector<int64_t> grad_input_shape(gradInput.sizes().begin(), gradInput.sizes().end());
+        flop_count += conv_flop_count(grad_output_shape, w_shape, grad_input_shape, !transposed);
+    }
+
+    if (output_mask[1]) {
+        std::vector<int64_t> grad_weight_shape(gradWeight.sizes().begin(), gradWeight.sizes().end());
+        if (transposed) {
+            flop_count += conv_flop_count(t(grad_output_shape), t(input_shape), t(grad_weight_shape), false);
+        } else {
+            flop_count += conv_flop_count(t(input_shape), t(grad_output_shape), t(grad_weight_shape), false);
+        }
+    }
+
+    return flop_count;
+}
diff --git a/torch_npu/csrc/flopcount/FlopCounter.h b/torch_npu/csrc/flopcount/FlopCounter.h
new file mode 100644
index 00000000000..b6f9caec922
--- /dev/null
+++ b/torch_npu/csrc/flopcount/FlopCounter.h
@@ -0,0 +1,22 @@
+#ifndef FLOP_COUNTER_H
+#define FLOP_COUNTER_H
+
+#include <array>
+#include <ATen/ATen.h>
+
+class FlopCounter {
+public:
+    FlopCounter() = default;
+    ~FlopCounter() = default;
+
+    static int64_t mm_flop(const at::Tensor &self, const at::Tensor &mat2);
+    static int64_t addmm_flop(const at::Tensor &mat1, const at::Tensor &mat2);
+    static int64_t bmm_flop(const at::Tensor &self, const at::Tensor &mat2);
+    static int64_t baddbmm_flop(const at::Tensor &batch1, const at::Tensor &batch2);
+    static int64_t conv_flop(const at::Tensor &input, const at::Tensor &weight, bool transposed, at::Tensor output);
+    static int64_t conv_backward_flop(const at::Tensor &grad_output, const at::Tensor &input,
+        const at::Tensor &weight, bool transposed, ::std::array<bool, 3> output_mask,
+        at::Tensor gradInput, at::Tensor gradWeight);
+};
+
+#endif
diff --git a/torch_npu/csrc/flopcount/Init.cpp b/torch_npu/csrc/flopcount/Init.cpp
new file mode 100644
index 00000000000..f7c103e47c8
--- /dev/null
+++ b/torch_npu/csrc/flopcount/Init.cpp
@@ -0,0 +1,55 @@
+#include "torch_npu/csrc/flopcount/Init.h"
+
+#include <memory>
+#include <pybind11/pybind11.h>
+#include <torch/csrc/python_headers.h>
+#include <torch/csrc/utils/object_ptr.h>
+#include <torch/csrc/utils/pybind.h>
+
+#include "torch_npu/csrc/flopcount/FlopCountImpl.h"
+
+namespace torch_npu {
+namespace flopcount {
+
+template <typename T>
+using shared_ptr_class_ = py::class_<T, std::shared_ptr<T>>;
+
+PyObject* flops_count_init(PyObject* _unused, PyObject* noargs)
+{
+    auto torch_npu_C_module = THPObjectPtr(PyImport_ImportModule("torch_npu._C"));
+    if (!torch_npu_C_module) {
+        return nullptr;
+    }
+    auto torch_npu_C_m = py::handle(torch_npu_C_module).cast<py::module>();
+    auto m = torch_npu_C_m.def_submodule("_flops_count", "flops count bindings");
+    auto module = py::handle(m).cast<py::module>();
+
+    shared_ptr_class_<FlopCountImpl>(module, "_FlopCountImpl")
+        .def_static("GetInstance", &FlopCountImpl::GetInstance, py::return_value_policy::reference)
+        .def("enable", &FlopCountImpl::enable)
+        .def("disable", &FlopCountImpl::disable)
+        .def("pause", &FlopCountImpl::pause)
+        .def("resume", &FlopCountImpl::resume)
+        .def("reset", &FlopCountImpl::reset)
+        .def_readonly("recordedCount", &FlopCountImpl::recordedCount)
+        .def_readonly("traversedCount", &FlopCountImpl::traversedCount);
+
+    Py_RETURN_TRUE;
+}
+
+// flops count methods exposed on torch_npu._C
+static PyMethodDef TorchFlopsMethods[] = { // NOLINT
+    {"_flops_count_init", flops_count_init, METH_NOARGS, nullptr},
+    {nullptr, nullptr, 0, nullptr}
+};
+
+PyMethodDef* flops_count_functions()
+{
+    return TorchFlopsMethods;
+}
+
+} // namespace flopcount
+} // namespace torch_npu
diff --git a/torch_npu/csrc/flopcount/Init.h b/torch_npu/csrc/flopcount/Init.h
new file mode 100644
index 00000000000..379f422ba45
--- /dev/null
+++ b/torch_npu/csrc/flopcount/Init.h
@@ -0,0 +1,11 @@
+#pragma once
+
+#include <torch/csrc/python_headers.h>
+#include "torch_npu/csrc/core/npu/NPUMacros.h"
+
+namespace torch_npu {
+namespace flopcount {
+TORCH_NPU_API PyMethodDef* flops_count_functions();
+
+} // namespace flopcount
+} // namespace torch_npu
diff --git a/torch_npu/utils/__init__.py b/torch_npu/utils/__init__.py
index 91030444b72..5b300a5c9f1 100644
--- a/torch_npu/utils/__init__.py
+++ b/torch_npu/utils/__init__.py
@@ -1,3 +1,4 @@
+import torch_npu
 from ._module import _apply_module_patch
 from .tensor_methods import _add_tensor_methods
 from .storage import _add_storage_methods
@@ -14,3 +15,8 @@ from .asd_detector import set_asd_loss_scale, register_asd_hook
 from ._step import add_perf_dump_patch
 
 __all__ = []
+
+
+# init flopcount
+from ._error_code import ErrCode, prof_error
+if not torch_npu._C._flops_count_init():
+    raise RuntimeError("flopcount initialization failed" + prof_error(ErrCode.UNAVAIL))
diff --git a/torch_npu/utils/flops_count.py b/torch_npu/utils/flops_count.py
new file mode 100644
index 00000000000..7e2f31308af
--- /dev/null
+++ b/torch_npu/utils/flops_count.py
@@ -0,0 +1,30 @@
+from torch_npu._C._flops_count import _FlopCountImpl
+
+
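+# Thin Python wrapper around the _FlopCountImpl singleton. Counting can be
+# driven explicitly via start()/stop() or scoped with a "with" block.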
+class FlopsCounter:
+    def __init__(self):
+        self.flop_count_instance = _FlopCountImpl.GetInstance()
+
+    def __enter__(self):
+        self.flop_count_instance.enable()
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.flop_count_instance.disable()
+
+    def start(self):
+        self.flop_count_instance.enable()
+
+    def stop(self):
+        self.flop_count_instance.disable()
+        self.flop_count_instance.reset()
+
+    def pause(self):
+        self.flop_count_instance.pause()
+
+    def resume(self):
+        self.flop_count_instance.resume()
+
+    def get_flops(self):
+        recorded_count = self.flop_count_instance.recordedCount
+        traversed_count = self.flop_count_instance.traversedCount
+        return [recorded_count, traversed_count]
-- 
Gitee
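
Usage sketch (assumptions: the plugin is built with this patch applied, and NPU
operator implementations invoke the FLOP_COUNT macro with the matching
FlopCounter formula -- those call sites are not added here, so which operators
actually contribute depends on the rest of the plugin):

    import torch
    import torch_npu
    from torch_npu.utils.flops_count import FlopsCounter

    counter = FlopsCounter()
    counter.start()

    # If matmul is instrumented, a (128, 256) x (256, 512) mm contributes
    # 2 * 128 * 256 * 512 = 33554432 flops to both counters.
    a = torch.randn(128, 256).npu()
    b = torch.randn(256, 512).npu()
    c = torch.matmul(a, b)

    counter.pause()          # traversedCount keeps accruing; recordedCount does not
    _ = torch.matmul(a, b)
    counter.resume()

    recorded, traversed = counter.get_flops()  # e.g. [33554432, 67108864]
    counter.stop()           # disables counting and resets both counters to zero

FlopsCounter also works as a context manager: __enter__ enables counting and
__exit__ disables it without resetting, so get_flops() can still be read after
the with block.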