diff --git a/patch/npu.patch b/patch/npu.patch index aa6bf6f24e25767bff292fc4c9e0733dcf63fee1..07b50fa05ffaa56598fcbfc103b6d44f5b19b930 100644 --- a/patch/npu.patch +++ b/patch/npu.patch @@ -2539,7 +2539,7 @@ diff -Nur '--exclude=.git' apex/setup.py apex-develop/setup.py + libraries = kwargs.get('libraries', []) + libraries.append('c10') + libraries.append('torch') -+ libraries.append('torch_npu') ++ + libraries.append('torch_cpu') + libraries.append('torch_python') + kwargs['libraries'] = libraries diff --git a/src/csrc/npu_float_status/common.h b/src/csrc/npu_float_status/common.h deleted file mode 100644 index e578bfcd50ab3518013644b99ba9bd3b32cd4a17..0000000000000000000000000000000000000000 --- a/src/csrc/npu_float_status/common.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) 2020, Huawei Technologies.All rights reserved. - * - * Licensed under the BSD 3-Clause License (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://opensource.org/licenses/BSD-3-Clause - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef COMMON_H -#define COMMON_H - -#include -#include -#include -#include -#include - -#include -#include - -#define RUN_SUCCESS 0 -#define RUN_FAILED 1 - -#define INFO_LOG(fmt, args...) fprintf(stdout, "[INFO] " fmt "\n", ##args) -#define WARN_LOG(fmt, args...) fprintf(stdout, "[WARN] " fmt "\n", ##args) -#define ERROR_LOG(fmt, args...) fprintf(stdout, "[ERROR] " fmt "\n", ##args) - -const std::string OP_TYPE_NPU_GET_FLOAT_STATUS = "NPUGetFloatStatus"; -const std::string OP_TYPE_NPU_CLEAR_FLOAT_STATUS = "NPUClearFloatStatus"; - -const int FLOAT_STATUS_OP_TENSOR_DIMS_SIZE = 8; -const int FLOAT_STATUS_OVERFLOW = 1; - -#endif // COMMON_H diff --git a/src/csrc/npu_float_status/op_float_status.cpp b/src/csrc/npu_float_status/op_float_status.cpp index 18767fec925a6f7a590c78fc8bccf9e80f9ee2fe..669fb06cc694bd5ac633168e63f22888e854249e 100644 --- a/src/csrc/npu_float_status/op_float_status.cpp +++ b/src/csrc/npu_float_status/op_float_status.cpp @@ -14,29 +14,9 @@ * limitations under the License. */ -#include -#include -#include -#include -#include +#include #include #include -#include "op_runner.h" -#include "common.h" - -OperatorDesc CreateFloatStatusOpDesc(const std::string opType) -{ - std::vector shape{FLOAT_STATUS_OP_TENSOR_DIMS_SIZE}; - aclDataType dataType = ACL_FLOAT; - aclFormat format = ACL_FORMAT_ND; - OperatorDesc opDesc(opType); - if ((opType == OP_TYPE_NPU_GET_FLOAT_STATUS) || - (opType == OP_TYPE_NPU_CLEAR_FLOAT_STATUS)) { - opDesc.AddInputTensorDesc(dataType, shape.size(), shape.data(), format); - } - opDesc.AddOutputTensorDesc(dataType, shape.size(), shape.data(), format); - return opDesc; -} /* This function is used for linking torch/acl .so files */ at::Tensor TestFlatten(std::vector tensors) diff --git a/src/csrc/npu_float_status/op_runner.cpp b/src/csrc/npu_float_status/op_runner.cpp deleted file mode 100644 index 39d63886ea84569f4ec8debaed3bafad2880adf7..0000000000000000000000000000000000000000 --- a/src/csrc/npu_float_status/op_runner.cpp +++ /dev/null @@ -1,206 +0,0 @@ -/* - * Copyright (c) 2020, Huawei Technologies.All rights reserved. - * - * Licensed under the BSD 3-Clause License (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://opensource.org/licenses/BSD-3-Clause - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "op_runner.h" -#include -#include "common.h" -#include -#include - -using namespace std; - -OpRunner::OpRunner(OperatorDesc *opDesc) : opDesc_(opDesc) -{ - numInputs_ = opDesc->inputDesc.size(); - numOutputs_ = opDesc->outputDesc.size(); -} - -OpRunner::~OpRunner() -{ - for (auto *inputBuf : inputBuffers_) { - aclDestroyDataBuffer(inputBuf); - } - - for (auto *devInput : devInputs_) { - aclrtFree(devInput); - } - - for (auto *hostInput : hostInputs_) { - aclrtFreeHost(hostInput); - } - - for (auto *outputBuf : outputBuffers_) { - aclDestroyDataBuffer(outputBuf); - } - - for (auto *devOutput : devOutputs_) { - aclrtFree(devOutput); - } - - for (auto *hostOutput : hostOutputs_) { - aclrtFreeHost(hostOutput); - } -} - -int OpRunner::Init() -{ - for (size_t i = 0; i < numInputs_; ++i) { - auto size = GetInputSize(i); - void *devMem = nullptr; - if (aclrtMalloc(&devMem, size, ACL_MEM_MALLOC_NORMAL_ONLY) != ACL_ERROR_NONE) { - ERROR_LOG("Malloc device memory for input[%zu] failed", i); - return RUN_FAILED; - } - - if (aclrtMemset(devMem, size, 0, size) != ACL_ERROR_NONE) { - ERROR_LOG("Set device memory for input[%zu] failed", i); - return RUN_FAILED; - } - - devInputs_.emplace_back(devMem); - inputBuffers_.emplace_back(aclCreateDataBuffer(devMem, size)); - - void *hostMem = nullptr; - if (aclrtMallocHost(&hostMem, size) != ACL_ERROR_NONE) { - ERROR_LOG("Malloc device memory for input[%zu] failed", i); - return RUN_FAILED; - } - if (hostMem == nullptr) { - ERROR_LOG("Malloc memory for input[%zu] failed", i); - return RUN_FAILED; - } - hostInputs_.emplace_back(hostMem); - } - - for (size_t i = 0; i < numOutputs_; ++i) { - auto size = GetOutputSize(i); - void *devMem = nullptr; - if (aclrtMalloc(&devMem, size, ACL_MEM_MALLOC_NORMAL_ONLY) != ACL_ERROR_NONE) { - ERROR_LOG("Malloc device memory for output[%zu] failed", i); - return RUN_FAILED; - } - - if (aclrtMemset(devMem, size, 0, size) != ACL_ERROR_NONE) { - ERROR_LOG("Set device memory for output[%zu] failed", i); - return RUN_FAILED; - } - - devOutputs_.emplace_back(devMem); - outputBuffers_.emplace_back(aclCreateDataBuffer(devMem, size)); - - void *hostOutput = nullptr; - if (aclrtMallocHost(&hostOutput, size) != ACL_ERROR_NONE) { - ERROR_LOG("Malloc device memory for output[%zu] failed", i); - return RUN_FAILED; - } - if (hostOutput == nullptr) { - ERROR_LOG("Malloc host memory for output[%zu] failed", i); - return RUN_FAILED; - } - hostOutputs_.emplace_back(hostOutput); - } - - return RUN_SUCCESS; -} - -size_t OpRunner::NumInputs() -{ - return numInputs_; -} - -size_t OpRunner::NumOutputs() -{ - return numOutputs_; -} - -size_t OpRunner::GetInputSize(size_t index) -{ - if (index >= opDesc_->inputDesc.size()) { - ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); - return 0; - } - - return aclGetTensorDescSize(opDesc_->inputDesc[index]); -} - -size_t OpRunner::GetOutputSize(size_t index) -{ - if (index >= opDesc_->outputDesc.size()) { - ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); - return 0; - } - - return aclGetTensorDescSize(opDesc_->outputDesc[index]); -} - -int OpRunner::RunOp() -{ - auto stream = c10_npu::getCurrentNPUStream(); - int holdGIL = PyGILState_Check(); - aclError ret = ACL_ERROR_NONE; - - if (holdGIL) { - Py_BEGIN_ALLOW_THREADS - ret = aclopCompileAndExecute(opDesc_->opType.c_str(), - numInputs_, - opDesc_->inputDesc.data(), - inputBuffers_.data(), - numOutputs_, - opDesc_->outputDesc.data(), - outputBuffers_.data(), - opDesc_->opAttr, - ACL_ENGINE_SYS, - ACL_COMPILE_SYS, - nullptr, - stream); - Py_END_ALLOW_THREADS - } else { - ret = aclopCompileAndExecute(opDesc_->opType.c_str(), - numInputs_, - opDesc_->inputDesc.data(), - inputBuffers_.data(), - numOutputs_, - opDesc_->outputDesc.data(), - outputBuffers_.data(), - opDesc_->opAttr, - ACL_ENGINE_SYS, - ACL_COMPILE_SYS, - nullptr, - stream); - } - - if (ret != ACL_ERROR_NONE) { - ERROR_LOG("Execute %s failed. ret = %d", opDesc_->opType.c_str(), ret); - return RUN_FAILED; - } - - if (opDesc_->opType == OP_TYPE_NPU_GET_FLOAT_STATUS) { - if (aclrtSynchronizeStream(stream) != ACL_ERROR_NONE) { - ERROR_LOG("Synchronize stream failed"); - return RUN_FAILED; - } - - for (size_t i = 0; i < numInputs_; ++i) { - auto size = GetInputSize(i); - if (aclrtMemcpy(hostInputs_[i], size, devInputs_[i], size, ACL_MEMCPY_DEVICE_TO_HOST) != ACL_ERROR_NONE) { - ERROR_LOG("Copy input[%zu] failed", i); - return RUN_FAILED; - } - } - } - - return RUN_SUCCESS; -} diff --git a/src/csrc/npu_float_status/op_runner.h b/src/csrc/npu_float_status/op_runner.h deleted file mode 100644 index 81b38b73573108a92bd53625682f7ce79377a5ad..0000000000000000000000000000000000000000 --- a/src/csrc/npu_float_status/op_runner.h +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (c) 2020, Huawei Technologies.All rights reserved. - * - * Licensed under the BSD 3-Clause License (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://opensource.org/licenses/BSD-3-Clause - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef OP_RUNNER_H -#define OP_RUNNER_H - -#include "common.h" -#include "operator_desc.h" - -/** - * Op Runner - */ -class OpRunner { -public: - /** - * @brief Constructor - * @param [in] opDesc: op description - */ - explicit OpRunner(OperatorDesc *opDesc); - - /** - * @brief Destructor - */ - ~OpRunner(); - - /** - * @brief Init op runner - */ - int Init(); - - /** - * @brief Get number of inputs - * @return number of inputs - */ - size_t NumInputs(); - - /** - * @brief Get number of outputs - * @return number of outputs - */ - size_t NumOutputs(); - - /** - * @brief Get input size by index - * @param [in] index: input index - * @return size of the input - */ - size_t GetInputSize(size_t index); - - /** - * @brief Get output size by index - * @param [in] index: output index - * @return size of the output - */ - size_t GetOutputSize(size_t index); - - /** - * @brief Get input buffer(host memory) by index - * @tparam T: data type - * @param [in] index: input index - * @return host address of the input - */ - template - T *GetInputBuffer(size_t index) - { - if (index >= numInputs_) { - ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); - return nullptr; - } - return reinterpret_cast(hostInputs_[index]); - } - - /** - * @brief Get output buffer(host memory) by index - * @tparam T: data type - * @param [in] index: output index - * @return host address of the output - */ - template - const T *GetOutputBuffer(size_t index) - { - if (index >= numOutputs_) { - ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); - return nullptr; - } - - return reinterpret_cast(hostOutputs_[index]); - } - - /** - * @brief Run op - * @return run result - */ - int RunOp(); - -private: - size_t numInputs_; - size_t numOutputs_; - - std::vector inputBuffers_; - std::vector outputBuffers_; - - std::vector devInputs_; - std::vector devOutputs_; - - std::vector hostInputs_; - std::vector hostOutputs_; - OperatorDesc *opDesc_; -}; - -#endif // OP_RUNNER_H diff --git a/src/csrc/npu_float_status/operator_desc.cpp b/src/csrc/npu_float_status/operator_desc.cpp deleted file mode 100644 index f55e6efdbe79383d82ccfa7696b9a82c0bff1979..0000000000000000000000000000000000000000 --- a/src/csrc/npu_float_status/operator_desc.cpp +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) 2020, Huawei Technologies.All rights reserved. - * - * Licensed under the BSD 3-Clause License (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://opensource.org/licenses/BSD-3-Clause - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "common.h" -#include "operator_desc.h" - -using namespace std; - -OperatorDesc::OperatorDesc(std::string opType) : opType(std::move(opType)) -{ - opAttr = aclopCreateAttr(); -} - -OperatorDesc::~OperatorDesc() -{ - for (auto *desc : inputDesc) { - aclDestroyTensorDesc(desc); - } - - for (auto *desc : outputDesc) { - aclDestroyTensorDesc(desc); - } - - aclopDestroyAttr(opAttr); -} - -OperatorDesc &OperatorDesc::AddInputTensorDesc(aclDataType dataType, - int numDims, - const int64_t *dims, - aclFormat format) -{ - if (numDims > 0 && dims == nullptr) { - ERROR_LOG("dims is nullptr while numDims > 0"); - return *this; - } - inputDesc.push_back(aclCreateTensorDesc(dataType, numDims, dims, format)); - return *this; -} - -OperatorDesc &OperatorDesc::AddOutputTensorDesc(aclDataType dataType, - int numDims, - const int64_t *dims, - aclFormat format) -{ - if (numDims > 0 && dims == nullptr) { - ERROR_LOG("dims is nullptr while numDims > 0"); - return *this; - } - - outputDesc.push_back(aclCreateTensorDesc(dataType, numDims, dims, format)); - return *this; -} diff --git a/src/csrc/npu_float_status/operator_desc.h b/src/csrc/npu_float_status/operator_desc.h deleted file mode 100644 index 9035ff71d376cdfae693f023e7559bd0a287ba00..0000000000000000000000000000000000000000 --- a/src/csrc/npu_float_status/operator_desc.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (c) 2020, Huawei Technologies.All rights reserved. - * - * Licensed under the BSD 3-Clause License (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://opensource.org/licenses/BSD-3-Clause - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef OPERATOR_DESC_H -#define OPERATOR_DESC_H - -#include -#include - -#include - -/** - * Op description - */ -struct OperatorDesc { - /** - * Constructor - * @param [in] opType: op type - */ - explicit OperatorDesc(std::string opType); - - /** - * Destructor - */ - ~OperatorDesc(); - - /** - * Add an input tensor description - * @param [in] dataType: data type - * @param [in] numDims: number of dims - * @param [in] dims: dims - * @param [in] format: format - * @return OperatorDesc - */ - OperatorDesc &AddInputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, aclFormat format); - - /** - * Add an output tensor description - * @param [in] dataType: data type - * @param [in] numDims: number of dims - * @param [in] dims: dims - * @param [in] format: format - * @return OperatorDesc - */ - OperatorDesc &AddOutputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, aclFormat format); - - std::string opType; - std::vector inputDesc; - std::vector outputDesc; - aclopAttr *opAttr; -}; - -#endif // OPERATOR_DESC_H