diff --git a/Cpp_example/C06_test_qr_code_detector/CMakeLists.txt b/Cpp_example/C06_test_qr_code_detector/CMakeLists.txt index a5e8335ef9c545ccefc201db60156cdbda311884..5bc112182bad8bea270137c6426b101469f7d190 100644 --- a/Cpp_example/C06_test_qr_code_detector/CMakeLists.txt +++ b/Cpp_example/C06_test_qr_code_detector/CMakeLists.txt @@ -25,10 +25,8 @@ find_package(LockzhinerVisionModule REQUIRED) # 定义 ZXing SDK 路径 set(ZXing_ROOT_PATH "${PROJECT_ROOT_PATH}/third_party/zxing-cpp-v2.2.1-lockzhiner-vision-module") -set(ZXing_DIR "${ZXing_ROOT_PATH}/lib/cmake/ZXing") set(ZXing_INCLUDE_DIRS "${ZXing_ROOT_PATH}/include") -find_package(ZXing REQUIRED) -set(ZXing_LIBRARIES "${ZXing_LIBS}") +set(ZXing_LIBRARIES "${ZXing_ROOT_PATH}/lib/libzxing.a") # 基本图像处理示例 add_executable(Test-qr_code-detector test_qr_code_detector.cc) diff --git a/Cpp_example/C07_test_bar_codeDetector/CMakeLists.txt b/Cpp_example/C07_test_bar_codeDetector/CMakeLists.txt index 12f88ccdae9dbafe8d836570a5494d94472135df..d76037a0a4c3d9b1c91f84fc8c66bb2329b2f8b6 100644 --- a/Cpp_example/C07_test_bar_codeDetector/CMakeLists.txt +++ b/Cpp_example/C07_test_bar_codeDetector/CMakeLists.txt @@ -25,10 +25,8 @@ find_package(LockzhinerVisionModule REQUIRED) # 定义 ZXing SDK 路径 set(ZXing_ROOT_PATH "${PROJECT_ROOT_PATH}/third_party/zxing-cpp-v2.2.1-lockzhiner-vision-module") -set(ZXing_DIR "${ZXing_ROOT_PATH}/lib/cmake/ZXing") set(ZXing_INCLUDE_DIRS "${ZXing_ROOT_PATH}/include") -find_package(ZXing REQUIRED) -set(ZXing_LIBRARIES "${ZXing_LIBS}") +set(ZXing_LIBRARIES "${ZXing_ROOT_PATH}/lib/libzxing.a") # 基本图像处理示例 add_executable(test-bar-codeDetector test_bar_codeDetector.cc) diff --git a/Cpp_example/D05_ocr_text_recognition/CMakeLists.txt b/Cpp_example/D05_ocr_text_recognition/CMakeLists.txt new file mode 100755 index 0000000000000000000000000000000000000000..fccef0fe18a75ba9b3d298ab1d67de3f47bd14b3 --- /dev/null +++ b/Cpp_example/D05_ocr_text_recognition/CMakeLists.txt @@ -0,0 +1,44 @@ +cmake_minimum_required(VERSION 3.10) 
+ +project(ocr_text_recognition) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +# 定义项目根目录路径 +set(PROJECT_ROOT_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../..") +message("PROJECT_ROOT_PATH = " ${PROJECT_ROOT_PATH}) + +include("${PROJECT_ROOT_PATH}/toolchains/arm-rockchip830-linux-uclibcgnueabihf.toolchain.cmake") + +# 定义 OpenCV SDK 路径 +set(OpenCV_ROOT_PATH "${PROJECT_ROOT_PATH}/third_party/opencv-mobile-4.10.0-lockzhiner-vision-module") +set(OpenCV_DIR "${OpenCV_ROOT_PATH}/lib/cmake/opencv4") +find_package(OpenCV REQUIRED) +set(OPENCV_LIBRARIES "${OpenCV_LIBS}") + +# 定义 LockzhinerVisionModule SDK 路径 +set(LockzhinerVisionModule_ROOT_PATH "${PROJECT_ROOT_PATH}/third_party/lockzhiner_vision_module_sdk") +set(LockzhinerVisionModule_DIR "${LockzhinerVisionModule_ROOT_PATH}/lib/cmake/lockzhiner_vision_module") +find_package(LockzhinerVisionModule REQUIRED) + +# ncnn配置 +set(NCNN_ROOT_DIR "${PROJECT_ROOT_PATH}/third_party/ncnn-20240820-lockzhiner-vision-module") # 确保third_party层级存在 +message(STATUS "Checking ncnn headers in: ${NCNN_ROOT_DIR}/include/ncnn") + +# 验证头文件存在 +if(NOT EXISTS "${NCNN_ROOT_DIR}/include/ncnn/net.h") + message(FATAL_ERROR "ncnn headers not found. Confirm the directory contains ncnn: ${NCNN_ROOT_DIR}") +endif() + +set(NCNN_INCLUDE_DIRS "${NCNN_ROOT_DIR}/include") +set(NCNN_LIBRARIES "${NCNN_ROOT_DIR}/lib/libncnn.a") + +add_executable(Test-ncnn_rec ncnn_rec.cc) +target_include_directories(Test-ncnn_rec PRIVATE ${LOCKZHINER_VISION_MODULE_INCLUDE_DIRS} ${NCNN_INCLUDE_DIRS}) +target_link_libraries(Test-ncnn_rec PRIVATE ${OPENCV_LIBRARIES} ${NCNN_LIBRARIES} ${LOCKZHINER_VISION_MODULE_LIBRARIES}) + +install( + TARGETS Test-ncnn_rec + RUNTIME DESTINATION . 
+) \ No newline at end of file diff --git a/Cpp_example/D05_ocr_text_recognition/README.md b/Cpp_example/D05_ocr_text_recognition/README.md new file mode 100755 index 0000000000000000000000000000000000000000..d7b86fb81cb68e02cbb48a6f94b1301c681b0bb1 --- /dev/null +++ b/Cpp_example/D05_ocr_text_recognition/README.md @@ -0,0 +1,397 @@ +# OCR 文字识别 +本章节在 Lockzhiner Vision Module 上基于OcrLiteNcnn模型, 实现了一个OCR文字识别系统。 +## 1. 基本知识讲解 +### 1.1 文字识别简介 +OCR(光学字符识别)是指通过电子设备读取并转换纸质文档或图像中的文字为可编辑和处理的数字文本的技术。它涉及图像预处理、字符分割、特征提取、字符识别及后处理等步骤,以实现高准确度的文字转换。OCR技术极大提升了信息数字化的效率,广泛应用于数字化图书馆、自动化数据录入、车牌识别系统及辅助阅读工具等领域,是现代办公与生活中不可或缺的一部分。 +### 1.2 文字识别常用方法 +- 模板匹配:通过与预定义字符模板比较来识别字符,适用于固定字体和字号。 +- 特征提取:从字符中提取关键特征(如线条、端点)并使用分类器识别,适应字体变化。 +- 神经网络:利用卷积神经网络自动学习字符特征,特别适合复杂背景和多变字体,提供高准确率。 +这些方法各有优势,选择取决于具体应用需求和文档特性。随着技术发展,基于神经网络的方法因其高性能而得到广泛应用。 + +--- + +## 2. C++ API 文档 +### 2.1 Net类 +#### 2.1.1 头文件 +```cpp +#include +``` +- 作用:用于声明Net类,使得Net类可以在当前文件中使用。 + +#### 2.1.2 构造类函数 +```cpp +ncnn::Net net; +``` +- 作用:创建一个Net类型的对象实例,用于实现文字识别。 +- 参数说明: + - 无 +- 返回值: + - 无 + +#### 2.1.3 load_param函数 +```cpp +int load_param(const DataReader& dr); +``` +- 参数说明: + - dr:传入的参数文件路径。 +- 返回值: + - 返回值为0表示加载参数文件成功。 + +#### 2.1.4 load_model函数 +```cpp +int load_model(const DataReader& dr); +``` +- 参数说明: + - dr:传入的模型文件路径。 +- 返回值:返回值为0表示加载模型成功。 + +#### 2.1.5 from_pixels函数 +```cpp +ncnn::Mat::from_pixels(srcResize.data, ncnn::Mat::PIXEL_BGR, srcResize.cols, srcResize.rows); +``` +- 参数说明: + - srcResize.data:输入图像的像素数据指针。 + - ncnn::Mat::PIXEL_BGR:输入像素数据的颜色格式。 + - srcResize.cols:输入图像的宽度。 + - srcResize.rows:输入图像的高度。 +- 返回值:适配成 NCNN 所需的格式的包含图像数据的新对象。 + +### 2.2 Extractor类 +#### 2.2.1 头文件 +```cpp +#include +``` +- 作用:用于声明Extractor类,使得Extractor类可以在当前文件中使用。 + +#### 2.2.2 构造类函数 +```cpp +ncnn::Extractor extractor = net.create_extractor(); +``` +- 作用:从已经加载了神经网络模型的 net 中创建一个 Extractor 实例,用于执行文字识别的推理任务。 +- 参数说明: + - 无 +- 返回值: + - 无 + +--- + +## 3. 
OCR 字符识别代码解析 +### 3.1 流程图 + + + +### 3.2 核心代码解析 +- 加载模型参数和权重 +```cpp +net.load_param(argv[2]); +net.load_model(argv[3]); +``` +- 读取字符集文件 +```cpp +std::ifstream in(argv[4]); +std::string line; +if (in) { +while (getline(in, line)) { // line中不包括每行的换行符 + keys.push_back(line); +} +} else { +printf("The keys.txt file was not found\n"); +return false; +} +if (keys.size() != 5531) { +fprintf(stderr, "missing keys\n"); +return false; +} +``` +- 对输入图像进行识别 +```cpp +for (int i = 0; i < out.h; i++) { + int maxIndex = 0; + float maxValue = -1000.f; + + // Softmax 计算 + std::vector exps(out.w); + for (int j = 0; j < out.w; j++) { + float expSingle = exp(outputData[i * out.w + j]); + exps.at(j) = expSingle; + } + float partition = accumulate(exps.begin(), exps.end(), 0.0); // 行总和 + + // 找到最大值及其索引 + auto maxElementIt = std::max_element(exps.begin(), exps.end()); + maxIndex = std::distance(exps.begin(), maxElementIt); + maxValue = *maxElementIt / partition; + + // 检测到有效字符 + if (maxIndex > 0 && maxIndex < keySize && + (!(i > 0 && maxIndex == lastIndex))) { + scores.emplace_back(maxValue); + strRes.append(keys[maxIndex - 1]); // 将字符追加到结果字符串中 + } + + lastIndex = maxIndex; + } +``` +自定义函数说明 +- OCR 文字识别 +```cpp +void processFrame(Mat &src, ncnn::Net &net, const std::vector &keys, const float meanValues[], const float normValues[], const int dstHeight) +``` +- 作用: + - 执行图像预处理,模型推理,Softmax解码,字符拼接全流程。 +- 参数说明: + - src:待识别的文本区域图像。 + - net:已加载的OCR识别模型。 + - keys:字符表(索引到字符的映射)。 + - meanValues:图像归一化均值。 + - normValues:图像归一化系数。 + - dstHeight:目标高度。 +- 返回值: + - 无 + +### 3.3 完整代码实现 +```cpp +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + + +using namespace cv; +using namespace std::chrono; +template +inline static size_t argmax(ForwardIterator first, ForwardIterator last) { + return std::distance(first, std::max_element(first, last)); +} + +// 后续处理函数 +void processFrame(Mat &src, ncnn::Net &net, const std::vector &keys, 
const float meanValues[], const float normValues[], const int dstHeight) { + float scale = (float)dstHeight / (float)src.rows; + int dstWidth = int((float)src.cols * scale); + std::cout << "resize" << std::endl; + cv::Mat srcResize; + resize(src, srcResize, cv::Size(dstWidth, dstHeight)); + std::cout << "resize success" << std::endl; + ncnn::Mat input = ncnn::Mat::from_pixels(srcResize.data, ncnn::Mat::PIXEL_RGB, + srcResize.cols, srcResize.rows); + std::cout << "input success" << std::endl; + input.substract_mean_normalize(meanValues, normValues); + + ncnn::Extractor extractor = net.create_extractor(); + // net.num_threads = 6; + extractor.input("input", input); + std::cout << "extract success" << std::endl; + ncnn::Mat out; + extractor.extract("out", out); + + float *floatArray = (float *)out.data; + std::vector outputData(floatArray, floatArray + out.h * out.w); + + int keySize = keys.size(); + std::string strRes; + std::vector scores; + int lastIndex = 0; + int maxIndex; + float maxValue; + std::cout << "开始检测" << std::endl; + high_resolution_clock::time_point start_time = + high_resolution_clock::now(); + for (int i = 0; i < out.h; i++) { + int maxIndex = 0; + float maxValue = -1000.f; + + // Softmax 计算 + std::vector exps(out.w); + for (int j = 0; j < out.w; j++) { + float expSingle = exp(outputData[i * out.w + j]); + exps.at(j) = expSingle; + } + // 行总和 + float partition = accumulate(exps.begin(), exps.end(), 0.0); + + // 找到最大值及其索引 + auto maxElementIt = std::max_element(exps.begin(), exps.end()); + maxIndex = std::distance(exps.begin(), maxElementIt); + maxValue = *maxElementIt / partition; + + // 检测到有效字符 + if (maxIndex > 0 && maxIndex < keySize && + (!(i > 0 && maxIndex == lastIndex))) { + scores.emplace_back(maxValue); + // 将字符追加到结果字符串中 + strRes.append(keys[maxIndex - 1]); + } + + lastIndex = maxIndex; + } + high_resolution_clock::time_point end_time = high_resolution_clock::now(); + auto time_span = duration_cast(end_time - start_time); + // 所有检测完成后,一次性输出结果 
+ std::cout << "检测完成,最终结果:" << std::endl; + std::cout << "识别的文本: " << strRes << std::endl; + std::cout << "单张图片推理时间(ms): " << time_span.count() << std::endl; + std::cout << std::endl; + +} + +int main(int argc, char *argv[]) { + if (argc != 5) { + std::cerr + << "Usage: ./ncnn_ocr " + << std::endl; + } + const float meanValues[3] = {127.5, 127.5, 127.5}; + const float normValues[3] = {1.0 / 127.5, 1.0 / 127.5, 1.0 / 127.5}; + const int dstHeight = 32; + ncnn::Net net; + std::vector keys; + // Mat src = imread(argv[1]); // default : BGR + // Step 2: 加载 .param 和 .bin 文件 + net.load_param(argv[2]); + net.load_model(argv[3]); + std::cout << "load model success" << std::endl; + std::ifstream in(argv[4]); + std::string line; + if (in) { + // line中不包括每行的换行符 + while (getline(in, line)) { + keys.push_back(line); + } + } else { + printf("The keys.txt file was not found\n"); + return false; + } + if (keys.size() != 5531) { + fprintf(stderr, "missing keys\n"); + return false; + } + printf("total keys size(%lu)\n", keys.size()); + + std::string argument(argv[1]); + // Default: BGR + Mat src = imread(argv[1]); + if (src.empty()) { + std::cerr << "Error opening image file" << std::endl; + return -1; + } + processFrame(src, net, keys, meanValues, normValues, dstHeight); + return 0; +} +``` + +--- + +## 4. 
编译调试 +### 4.1 编译环境搭建 +- 请确保你已经按照 [开发环境搭建指南](../../../../docs/introductory_tutorial/cpp_development_environment.md) 正确配置了开发环境。 +- 同时已经正确连接开发板。 +### 4.2 CMake 介绍 +```cmake +cmake_minimum_required(VERSION 3.10) + +project(ocr_text_recognition) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +# 定义项目根目录路径 +set(PROJECT_ROOT_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../..") +message("PROJECT_ROOT_PATH = " ${PROJECT_ROOT_PATH}) + +include("${PROJECT_ROOT_PATH}/toolchains/arm-rockchip830-linux-uclibcgnueabihf.toolchain.cmake") + +# 定义 OpenCV SDK 路径 +set(OpenCV_ROOT_PATH "${PROJECT_ROOT_PATH}/third_party/opencv-mobile-4.10.0-lockzhiner-vision-module") +set(OpenCV_DIR "${OpenCV_ROOT_PATH}/lib/cmake/opencv4") +find_package(OpenCV REQUIRED) +set(OPENCV_LIBRARIES "${OpenCV_LIBS}") + +# 定义 LockzhinerVisionModule SDK 路径 +set(LockzhinerVisionModule_ROOT_PATH "${PROJECT_ROOT_PATH}/third_party/lockzhiner_vision_module_sdk") +set(LockzhinerVisionModule_DIR "${LockzhinerVisionModule_ROOT_PATH}/lib/cmake/lockzhiner_vision_module") +find_package(LockzhinerVisionModule REQUIRED) + +# ncnn配置 +set(NCNN_ROOT_DIR "${PROJECT_ROOT_PATH}/third_party/ncnn-20240820-lockzhiner-vision-module") # 确保third_party层级存在 +message(STATUS "Checking ncnn headers in: ${NCNN_ROOT_DIR}/include/ncnn") + +# 验证头文件存在 +if(NOT EXISTS "${NCNN_ROOT_DIR}/include/ncnn/net.h") + message(FATAL_ERROR "ncnn headers not found. Confirm the directory contains ncnn: ${NCNN_ROOT_DIR}") +endif() + +set(NCNN_INCLUDE_DIRS "${NCNN_ROOT_DIR}/include") +set(NCNN_LIBRARIES "${NCNN_ROOT_DIR}/lib/libncnn.a") + +add_executable(Test-ncnn_rec ncnn_rec.cc) +target_include_directories(Test-ncnn_rec PRIVATE ${LOCKZHINER_VISION_MODULE_INCLUDE_DIRS} ${NCNN_INCLUDE_DIRS}) +target_link_libraries(Test-ncnn_rec PRIVATE ${OPENCV_LIBRARIES} ${NCNN_LIBRARIES} ${LOCKZHINER_VISION_MODULE_LIBRARIES}) + +install( + TARGETS Test-ncnn_rec + RUNTIME DESTINATION . 
+) +``` +### 4.3 编译项目 +使用 Docker Desktop 打开 LockzhinerVisionModule 容器并执行以下命令来编译项目 +```bash +# 进入Demo所在目录 +cd /LockzhinerVisionModuleWorkSpace/LockzhinerVisionModule/Cpp_example/D05_ocr_text_recognition +# 创建编译目录 +rm -rf build && mkdir build && cd build +# 配置交叉编译工具链 +export TOOLCHAIN_ROOT_PATH="/LockzhinerVisionModuleWorkSpace/arm-rockchip830-linux-uclibcgnueabihf" +# 使用cmake配置项目 +cmake .. +# 执行编译项目 +make -j8 && make install +``` + +在执行完上述命令后,会在build目录下生成可执行文件。 + +--- + +## 5. 执行结果 +### 5.1 运行前准备 +- 请确保你已经下载了 [凌智视觉模块文字识别参数文件](https://gitee.com/LockzhinerAI/LockzhinerVisionModule/releases/download/v0.0.6/crnn_lite_op.param) +- 请确保你已经下载了 [凌智视觉模块文字识别bin文件](https://gitee.com/LockzhinerAI/LockzhinerVisionModule/releases/download/v0.0.6/crnn_lite_op.bin) +- 请确保你已经下载了 [凌智视觉模块文字识别keys文件](https://gitee.com/LockzhinerAI/LockzhinerVisionModule/releases/download/v0.0.6/keys.txt) +### 5.2 运行过程 +```shell +chmod 777 Test-ncnn_rec +# 对图像进行识别 +./Test-ncnn_rec image_path crnn_lite_op.param crnn_lite_op.bin keys.txt +``` +### 5.3 运行效果 +#### 5.3.1 图像OCR识别 +- 原始图像 + +![title](./images/Test.jpg) + +- 识别结果 + +![title](./images/result.png) + +#### 5.3.2 注意事项 +由于本章节只训练了一个识别模型,并没有训练检测模型,所以只针对包含单行文本的图像效果比较好,对于包含多行文本的识别,效果并不是很好。 + +--- + +## 6. 
总结 +通过上述内容,我们成功实现了一个简单的OCR字符识别系统,包括: + +- 加载识别模型和检测图像。 +- 进行字符识别。 +- 将识别结果打印出来。 diff --git a/Cpp_example/D05_ocr_text_recognition/images/1.png b/Cpp_example/D05_ocr_text_recognition/images/1.png new file mode 100755 index 0000000000000000000000000000000000000000..367d62c6f9ad382e3517cc5d1654de9de8b717c1 Binary files /dev/null and b/Cpp_example/D05_ocr_text_recognition/images/1.png differ diff --git a/Cpp_example/D05_ocr_text_recognition/images/Test.jpg b/Cpp_example/D05_ocr_text_recognition/images/Test.jpg new file mode 100755 index 0000000000000000000000000000000000000000..c757f5b759c95cc23682692118b8be74100bf672 Binary files /dev/null and b/Cpp_example/D05_ocr_text_recognition/images/Test.jpg differ diff --git a/Cpp_example/D05_ocr_text_recognition/images/result.png b/Cpp_example/D05_ocr_text_recognition/images/result.png new file mode 100755 index 0000000000000000000000000000000000000000..a842880a63e6ebbbc15ea02e63ae8723dd97cd87 Binary files /dev/null and b/Cpp_example/D05_ocr_text_recognition/images/result.png differ diff --git a/Cpp_example/D05_ocr_text_recognition/ncnn_rec.cc b/Cpp_example/D05_ocr_text_recognition/ncnn_rec.cc new file mode 100755 index 0000000000000000000000000000000000000000..97e86068f0892f52030b0e1e409acb145945a282 --- /dev/null +++ b/Cpp_example/D05_ocr_text_recognition/ncnn_rec.cc @@ -0,0 +1,138 @@ +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + + +using namespace cv; +using namespace std::chrono; +template +inline static size_t argmax(ForwardIterator first, ForwardIterator last) { + return std::distance(first, std::max_element(first, last)); +} + +// Resize src to dstHeight (same scale for the width), run the recognition net, decode each output row to a character via keys and print the text. +void processFrame(Mat &src, ncnn::Net &net, const std::vector &keys, const float meanValues[], const float normValues[], const int dstHeight) { + float scale = (float)dstHeight / (float)src.rows; + int dstWidth = int((float)src.cols * scale); + std::cout << "resize" << std::endl; + cv::Mat 
srcResize; + resize(src, srcResize, cv::Size(dstWidth, dstHeight)); + std::cout << "resize success" << std::endl; + ncnn::Mat input = ncnn::Mat::from_pixels(srcResize.data, ncnn::Mat::PIXEL_RGB, + srcResize.cols, srcResize.rows); + std::cout << "input success" << std::endl; + input.substract_mean_normalize(meanValues, normValues); + + ncnn::Extractor extractor = net.create_extractor(); + // net.num_threads = 6; + extractor.input("input", input); + std::cout << "extract success" << std::endl; + ncnn::Mat out; + extractor.extract("out", out); + + float *floatArray = (float *)out.data; + std::vector outputData(floatArray, floatArray + out.h * out.w); + + int keySize = keys.size(); + std::string strRes; + std::vector scores; + int lastIndex = 0; + int maxIndex; + float maxValue; + std::cout << "开始检测" << std::endl; + high_resolution_clock::time_point start_time = + high_resolution_clock::now(); + for (int i = 0; i < out.h; i++) { + int maxIndex = 0; + float maxValue = -1000.f; + + // Softmax over the current output row + std::vector exps(out.w); + for (int j = 0; j < out.w; j++) { + float expSingle = exp(outputData[i * out.w + j]); + exps.at(j) = expSingle; + } + // Row sum (softmax denominator) + float partition = accumulate(exps.begin(), exps.end(), 0.0); + + // Find the largest entry and its index + auto maxElementIt = std::max_element(exps.begin(), exps.end()); + maxIndex = std::distance(exps.begin(), maxElementIt); + maxValue = *maxElementIt / partition; + + // Keep the character only if the index is non-zero, in range, and not a repeat of the previous row + if (maxIndex > 0 && maxIndex < keySize && + (!(i > 0 && maxIndex == lastIndex))) { + scores.emplace_back(maxValue); + // Append the character to the result string + strRes.append(keys[maxIndex - 1]); + } + + lastIndex = maxIndex; + } + high_resolution_clock::time_point end_time = high_resolution_clock::now(); + auto time_span = duration_cast(end_time - start_time); + // Print the result once, after every row has been decoded + std::cout << "检测完成,最终结果:" << std::endl; + std::cout << "识别的文本: " << strRes << std::endl; + std::cout << "单张图片推理时间(ms): " << time_span.count() << std::endl; + std::cout << std::endl; +} + +// Entry point. Arguments: image_path param_path bin_path keys_path. Returns 0 on success and -1 on any failure. +int main(int argc, char 
*argv[]) { + if (argc != 5) { + std::cerr << "Usage: " << argv[0] + << " image_path param_path bin_path keys_path" << std::endl; + return -1; + } + const float meanValues[3] = {127.5, 127.5, 127.5}; + const float normValues[3] = {1.0 / 127.5, 1.0 / 127.5, 1.0 / 127.5}; + const int dstHeight = 32; + ncnn::Net net; + std::vector keys; + // Load the .param and .bin files; stop if either load reports failure (non-zero) + if (net.load_param(argv[2]) != 0 || net.load_model(argv[3]) != 0) { + std::cerr << "Failed to load model" << std::endl; + return -1; + } + std::cout << "load model success" << std::endl; + std::ifstream in(argv[4]); + std::string line; + if (in) { + // getline drops the trailing newline of each line + while (getline(in, line)) { + keys.push_back(line); + } + } else { + printf("The keys.txt file was not found\n"); + return -1; + } + if (keys.size() != 5531) { + fprintf(stderr, "missing keys\n"); + return -1; + } + printf("total keys size(%zu)\n", keys.size()); + + // Default: BGR + Mat src = imread(argv[1]); + if (src.empty()) { + std::cerr << "Error opening image file" << std::endl; + return -1; + } + processFrame(src, net, keys, meanValues, normValues, dstHeight); + return 0; +} diff --git a/Cpp_example/D06_ocr_text_detection/CMakeLists.txt b/Cpp_example/D06_ocr_text_detection/CMakeLists.txt new file mode 100755 index 0000000000000000000000000000000000000000..1c9dc4065f5b40088a620018cbccb2545aee7367 --- /dev/null +++ b/Cpp_example/D06_ocr_text_detection/CMakeLists.txt @@ -0,0 +1,44 @@ +cmake_minimum_required(VERSION 3.10) + +project(ocr_text_detection) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +# 定义项目根目录路径 +set(PROJECT_ROOT_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../..") +message("PROJECT_ROOT_PATH = " ${PROJECT_ROOT_PATH}) + +include("${PROJECT_ROOT_PATH}/toolchains/arm-rockchip830-linux-uclibcgnueabihf.toolchain.cmake") + +# 定义 OpenCV SDK 路径 +set(OpenCV_ROOT_PATH "${PROJECT_ROOT_PATH}/third_party/opencv-mobile-4.10.0-lockzhiner-vision-module") +set(OpenCV_DIR "${OpenCV_ROOT_PATH}/lib/cmake/opencv4") +find_package(OpenCV REQUIRED) +set(OPENCV_LIBRARIES 
"${OpenCV_LIBS}") + +# 定义 LockzhinerVisionModule SDK 路径 +set(LockzhinerVisionModule_ROOT_PATH "${PROJECT_ROOT_PATH}/third_party/lockzhiner_vision_module_sdk") +set(LockzhinerVisionModule_DIR "${LockzhinerVisionModule_ROOT_PATH}/lib/cmake/lockzhiner_vision_module") +find_package(LockzhinerVisionModule REQUIRED) + +# ncnn配置 +set(NCNN_ROOT_DIR "${PROJECT_ROOT_PATH}/third_party/ncnn-20240820-lockzhiner-vision-module") # 确保third_party层级存在 +message(STATUS "Checking ncnn headers in: ${NCNN_ROOT_DIR}/include/ncnn") + +# 验证头文件存在 +if(NOT EXISTS "${NCNN_ROOT_DIR}/include/ncnn/net.h") + message(FATAL_ERROR "ncnn headers not found. Confirm the directory contains ncnn: ${NCNN_ROOT_DIR}") +endif() + +set(NCNN_INCLUDE_DIRS "${NCNN_ROOT_DIR}/include") +set(NCNN_LIBRARIES "${NCNN_ROOT_DIR}/lib/libncnn.a") + +add_executable(Test-ncnn_dbnet ncnn_dbnet.cc) +target_include_directories(Test-ncnn_dbnet PRIVATE ${LOCKZHINER_VISION_MODULE_INCLUDE_DIRS} ${NCNN_INCLUDE_DIRS}) +target_link_libraries(Test-ncnn_dbnet PRIVATE ${OPENCV_LIBRARIES} ${NCNN_LIBRARIES} ${LOCKZHINER_VISION_MODULE_LIBRARIES}) + +install( + TARGETS Test-ncnn_dbnet + RUNTIME DESTINATION . +) \ No newline at end of file diff --git a/Cpp_example/D06_ocr_text_detection/README.md b/Cpp_example/D06_ocr_text_detection/README.md new file mode 100755 index 0000000000000000000000000000000000000000..de54f5015e7c221a2fd90ed44f5c2409f68e9adc --- /dev/null +++ b/Cpp_example/D06_ocr_text_detection/README.md @@ -0,0 +1,450 @@ +# OCR 文本框检测 +本章节在Lockzhiner Vision Module 上基于OcrLiteNcnn模型, 实现了一个OCR文本框检测系统。 +## 1. 基本知识讲解 +### 1.1 文本检测简介 +文本检测是一种识别图像中文字位置的技术,作为光学字符识别(OCR)的一部分,它能准确找出并定位图片或视频中的文字区域。适用于文档数字化、车牌识别、实时翻译和辅助视障人士等场景,通过自动化提取文本信息,极大提升了信息处理的效率与便捷性。 +### 1.2 文本检测常用方法 +- 连通域分析:适用于前景与背景对比明显的图像,通过识别连通区域定位文本。 +- 边缘检测:利用Canny算子找到文本边缘,适合处理倾斜或弯曲的文本。 +- 深度学习:使用卷积神经网络(CNN)等模型自动提取特征,特别擅长自然场景中的多方向和多尺度文本检测。 + +--- + +## 2. 
C++ API 文档 +### 2.1 Net类 +#### 2.1.1 头文件 +```cpp +#include +``` +- 作用:用于声明Net类,使得Net类可以在当前文件中使用。 + +#### 2.1.2 构造类函数 +```cpp +ncnn::Net net; +``` +- 作用:创建一个Net类型的对象实例,用于实现文字区域的检测。 +- 参数说明: + - 无 +- 返回值: + - 无 + +#### 2.1.3 load_param函数 +```cpp +int load_param(const DataReader& dr); +``` +- 参数说明: + - dr:传入的参数文件路径。 +- 返回值: + - 返回值为0表示加载参数文件成功。 + +#### 2.1.4 load_model函数 +```cpp +int load_model(const DataReader& dr); +``` +- 参数说明: + - dr:传入的模型文件路径。 +- 返回值:返回值为0表示加载模型成功。 + +#### 2.1.5 from_pixels函数 +```cpp +ncnn::Mat::from_pixels(srcResize.data, ncnn::Mat::PIXEL_BGR, srcResize.cols, srcResize.rows); +``` +- 参数说明: + - srcResize.data:输入图像的像素数据指针。 + - ncnn::Mat::PIXEL_BGR:输入像素数据的颜色格式。 + - srcResize.cols:输入图像的宽度。 + - srcResize.rows:输入图像的高度。 +- 返回值:适配成 NCNN 所需的格式的包含图像数据的新对象。 + +### 2.2 Extractor类 +#### 2.2.1 头文件 +```cpp +#include +``` +- 作用:用于声明Extractor类,使得Extractor类可以在当前文件中使用。 + +#### 2.2.2 构造类函数 +```cpp +ncnn::Extractor extractor = net.create_extractor(); +``` +- 作用:从已经加载了神经网络模型的 net 中创建一个 Extractor 实例,用于执行文本区域框检测的推理任务。 +- 参数说明: + - 无 +- 返回值: + - 无 + +--- + +## 3. 
OCR 文本检测代码解析 +### 3.1 流程图 + + + +### 3.2 核心代码解析 +- 加载模型参数和权重 +```cpp +net.load_param(argv[2]); +net.load_model(argv[3]) +``` +自定义函数说明 +- 计算图像缩放比例以适应目标尺寸 +```cpp +ScaleParam getScaleParam(const cv::Mat &src, const int targetSize); +``` +- 参数说明: + - src:输入的原始图像。 + - targetSize:目标尺寸大小。 +- 返回值: + - 返回一个 ScaleParam 结构体,该结构体包含了原图尺寸、缩放后的尺寸以及缩放比例等信息。 + +- 向量归一化函数 +```cpp +cv::Point2f normalize(const cv::Point2f& v); +``` +- 作用: + - 计算向量长度并归一化,用于后续几何变换(如多边形扩展方向计算)。 +- 参数说明: + - v:待归一化的二维向量。 +- 返回值: + - 返回单位长度的向量,若输入向量为零向量则返回(0, 0)。 + +- 多边形扩张函数:用于扩大文本框边界 +```cpp +std::vector expandPolygon(const std::vector& inBox, float distance) +``` +- 作用: + - 基于邻边法线方向计算角平分线,向外扩展多边形边界,增强文本检测框的包容性。 +- 参数说明: + - inBox:输入的多边形顶点集合。 + - distance:扩展距离,控制文本框膨胀程度。 +- 返回值: + - 返回扩展后的新多边形顶点集合。 + +- 文本检测框提取 +```cpp +std::vector findRsBoxes(const cv::Mat &fMapMat, const cv::Mat &norfMapMat, ScaleParam &s, const float boxScoreThresh, const float unClipRatio); +``` +- 作用: + - 结合轮廓检测与UNet式后处理,生成最终文本检测框。 +- 参数说明: + - fMapMat:DBNet输出的概率图。 + - norfMapMat:二值化后的概率图。 + - s:图像缩放参数结构体。 + - boxScoreThresh:文本框置信度阈值。 + - unClipRatio:边界扩展系数。 +- 返回值: + - 返回包含文本框坐标、置信度的结构体列表。 + +### 3.3 完整代码实现 +```cpp +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace cv; +using namespace std::chrono; + + +// 定义必要的参数 +const float meanValues[3] = {0.485f * 255, 0.456f * 255, 0.406f * 255}; +const float normValues[3] = {1.0f / 0.229f / 255.0f, 1.0f / 0.224f / 255.0f, 1.0f / 0.225f / 255.0f}; + +// 图像缩放参数定义 +struct ScaleParam { + int srcWidth; + int srcHeight; + int dstWidth; + int dstHeight; + float ratioWidth; + float ratioHeight; +}; + +// 文本框结构体的定义 +struct TextBox { + std::vector boxPoint; + float score; +}; + +// 计算图像缩放比例以适应目标尺寸 +ScaleParam getScaleParam(const cv::Mat &src, const int targetSize) { + int imgHeight = src.rows; + int imgWidth = src.cols; + float ratio = std::min(static_cast(targetSize) / imgHeight, static_cast(targetSize) / imgWidth); + ScaleParam scaleParam; + 
scaleParam.srcHeight = imgHeight; + scaleParam.srcWidth = imgWidth; + scaleParam.dstHeight = static_cast(imgHeight * ratio); + scaleParam.dstWidth = static_cast(imgWidth * ratio); + scaleParam.ratioHeight = ratio; + scaleParam.ratioWidth = ratio; + return scaleParam; +} + +// 向量归一化函数 +cv::Point2f normalize(const cv::Point2f& v) { + float len = std::sqrt(v.x * v.x + v.y * v.y); + // 防止除以零 + if (len == 0) return cv::Point2f(0, 0); + return cv::Point2f(v.x / len, v.y / len); +} + +// 多边形扩张函数:用于扩大文本框边界 +std::vector expandPolygon(const std::vector& inBox, float distance) { + std::vector outBox; + int n = inBox.size(); + // 确保输入至少是一个三角形 + if (n < 3) return outBox; + + for (int i = 0; i < n; ++i) { + cv::Point2f prev = inBox[(i + n - 1) % n]; + cv::Point2f curr = inBox[i]; + cv::Point2f next = inBox[(i + 1) % n]; + + cv::Point2f v1 = cv::Point2f(curr.x - prev.x, curr.y - prev.y); + cv::Point2f v2 = cv::Point2f(next.x - curr.x, next.y - curr.y); + + cv::Point2f normal1(-v1.y, v1.x); + cv::Point2f normal2(-v2.y, v2.x); + + normal1 = normalize(normal1); + normal2 = normalize(normal2); + + cv::Point2f bisectorNormal = normal1 + normal2; + bisectorNormal = normalize(bisectorNormal); + + cv::Point2f newPoint = curr + bisectorNormal * distance; + + outBox.push_back(cv::Point(newPoint.x, newPoint.y)); + } + + return outBox; +} + +// 查找并处理文本框区域 +std::vector findRsBoxes(const cv::Mat &fMapMat, const cv::Mat &norfMapMat, ScaleParam &s, + const float boxScoreThresh, const float unClipRatio) { + float minArea = 3; + std::vector rsBoxes; + std::vector> contours; + cv::findContours(norfMapMat, contours, cv::RETR_LIST, cv::CHAIN_APPROX_SIMPLE); + + for (size_t i = 0; i < contours.size(); ++i) { + double perimeter = cv::arcLength(contours[i], true); + cv::RotatedRect minRect = cv::minAreaRect(contours[i]); + float minSideLen = std::min(minRect.size.width, minRect.size.height); + + if (minSideLen < minArea) + continue; + + // 创建一个掩码图像 + cv::Mat mask = cv::Mat::zeros(fMapMat.size(), 
CV_8UC1); + cv::drawContours(mask, contours, static_cast(i), cv::Scalar(255), cv::FILLED); + + // 计算掩码内部的平均得分 + cv::Scalar meanScore = cv::mean(fMapMat, mask); + float score = static_cast(meanScore[0]); + + if (score < boxScoreThresh) + continue; + + // 使用expandPolygon实现多边形扩张 + double area = cv::contourArea(contours[i]); + float distance = unClipRatio * area / static_cast(perimeter); + std::vector clipBox = expandPolygon(contours[i], distance); + + if (minSideLen < minArea + 2) + continue; + + for (auto &point : clipBox) { + point.x = std::max(0, std::min(static_cast(point.x / s.ratioWidth), s.srcWidth - 1)); + point.y = std::max(0, std::min(static_cast(point.y / s.ratioHeight), s.srcHeight - 1)); + } + + rsBoxes.emplace_back(TextBox{clipBox, score}); + } + return rsBoxes; +} + +// 绘制文本框到图像上 +void drawTextBox(cv::Mat &boxImg, const std::vector &box, int thickness) { + for (size_t i = 0; i < box.size(); ++i) { + cv::line(boxImg, box[i], box[(i + 1) % box.size()], cv::Scalar(0, 255, 0), thickness); + } +} + +int main(int argc, char** argv) { + if(argc != 4) { + std::cerr << "Usage: " << argv[0] << " " << std::endl; + return -1; + } + // 读取输入图像 + cv::Mat src = cv::imread(argv[1], cv::IMREAD_COLOR); + if(src.empty()) { + std::cerr << "Failed to read image: " << argv[1] << std::endl; + return -1; + } + + ncnn::Net net; + // 禁用一些可能不被支持的功能 + net.opt.use_vulkan_compute = false; + net.opt.use_bf16_storage = false; + net.opt.use_fp16_packed = false; + net.opt.use_fp16_storage = false; + net.opt.use_fp16_arithmetic = false; + + // 加载模型参数和权重文件 + if (net.load_param(argv[2]) != 0 || net.load_model(argv[3]) != 0) { + std::cerr << "Failed to load model from " << argv[2] << " and " << argv[3] << std::endl; + return -1; + } + std::cout << "Model loaded successfully." << std::endl; + // 736是自定义的参数,可以自己设置 + ScaleParam scaleParam = getScaleParam(src, 736); + + std::cout << " scaleParam successfully." 
<< std::endl; + float boxScoreThresh = 0.5f; + float boxThresh = 0.3f; + float unClipRatio = 3.5f; + + cv::Mat srcResize; + resize(src, srcResize, cv::Size(scaleParam.dstWidth, scaleParam.dstHeight)); + std::cout << " resize successfully." << std::endl; + + // 准备输入数据,并进行均值标准化 + ncnn::Mat input = ncnn::Mat::from_pixels(srcResize.data, ncnn::Mat::PIXEL_BGR, srcResize.cols, srcResize.rows); + input.substract_mean_normalize(meanValues, normValues); + + // 创建推理器并执行推理 + high_resolution_clock::time_point start_time = + high_resolution_clock::now(); + ncnn::Extractor extractor = net.create_extractor(); + extractor.input("input0", input); + ncnn::Mat out; + extractor.extract("out1", out); + high_resolution_clock::time_point end_time = high_resolution_clock::now(); + // 计算推理时间 + auto time_span = duration_cast(end_time - start_time); + std::cout << "单张图片推理时间(ms): " << time_span.count() << std::endl; + + // 创建一个单通道的 cv::Mat 来存储第一个通道的数据 + cv::Mat fMapMat(srcResize.rows, srcResize.cols, CV_32FC1); + + memcpy(fMapMat.data, (float *) out.data, srcResize.rows * srcResize.cols * sizeof(float)); + + cv::Mat norfMapMat; + norfMapMat = fMapMat > boxThresh; + + // 查找文本框并绘制在原图上 + std::vector textBoxes = findRsBoxes(fMapMat, norfMapMat, scaleParam, boxScoreThresh, unClipRatio); + + for (const auto &textBox : textBoxes) { + drawTextBox(src, textBox.boxPoint, 1); + } + cv::imshow("Detected Text Boxes", src); + cv::waitKey(0); + + return 0; +} +``` + +--- + +## 4. 
编译调试 +### 4.1 编译环境搭建 +- 请确保你已经按照 [开发环境搭建指南](../../../../docs/introductory_tutorial/cpp_development_environment.md) 正确配置了开发环境。 +- 同时已经正确连接开发板。 +### 4.2 Cmake介绍 +```cmake +cmake_minimum_required(VERSION 3.10) + +project(ocr_text_detection) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +# 定义项目根目录路径 +set(PROJECT_ROOT_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../..") +message("PROJECT_ROOT_PATH = " ${PROJECT_ROOT_PATH}) + +include("${PROJECT_ROOT_PATH}/toolchains/arm-rockchip830-linux-uclibcgnueabihf.toolchain.cmake") + +# 定义 OpenCV SDK 路径 +set(OpenCV_ROOT_PATH "${PROJECT_ROOT_PATH}/third_party/opencv-mobile-4.10.0-lockzhiner-vision-module") +set(OpenCV_DIR "${OpenCV_ROOT_PATH}/lib/cmake/opencv4") +find_package(OpenCV REQUIRED) +set(OPENCV_LIBRARIES "${OpenCV_LIBS}") + +# 定义 LockzhinerVisionModule SDK 路径 +set(LockzhinerVisionModule_ROOT_PATH "${PROJECT_ROOT_PATH}/third_party/lockzhiner_vision_module_sdk") +set(LockzhinerVisionModule_DIR "${LockzhinerVisionModule_ROOT_PATH}/lib/cmake/lockzhiner_vision_module") +find_package(LockzhinerVisionModule REQUIRED) + +# ncnn配置 +set(NCNN_ROOT_DIR "${PROJECT_ROOT_PATH}/third_party/ncnn-20240820-lockzhiner-vision-module") # 确保third_party层级存在 +message(STATUS "Checking ncnn headers in: ${NCNN_ROOT_DIR}/include/ncnn") + +# 验证头文件存在 +if(NOT EXISTS "${NCNN_ROOT_DIR}/include/ncnn/net.h") + message(FATAL_ERROR "ncnn headers not found. Confirm the directory contains ncnn: ${NCNN_ROOT_DIR}") +endif() + +set(NCNN_INCLUDE_DIRS "${NCNN_ROOT_DIR}/include") +set(NCNN_LIBRARIES "${NCNN_ROOT_DIR}/lib/libncnn.a") + +add_executable(Test-ncnn_dbnet ncnn_dbnet.cc) +target_include_directories(Test-ncnn_dbnet PRIVATE ${LOCKZHINER_VISION_MODULE_INCLUDE_DIRS} ${NCNN_INCLUDE_DIRS}) +target_link_libraries(Test-ncnn_dbnet PRIVATE ${OPENCV_LIBRARIES} ${NCNN_LIBRARIES} ${LOCKZHINER_VISION_MODULE_LIBRARIES}) + +install( + TARGETS Test-ncnn_dbnet + RUNTIME DESTINATION . 
+) +``` +### 4.3 编译项目 +使用 Docker Desktop 打开 LockzhinerVisionModule 容器并执行以下命令来编译项目 +```bash +# 进入Demo所在目录 +cd /LockzhinerVisionModuleWorkSpace/LockzhinerVisionModule/Cpp_example/D06_ocr_text_detection +# 创建编译目录 +rm -rf build && mkdir build && cd build +# 配置交叉编译工具链 +export TOOLCHAIN_ROOT_PATH="/LockzhinerVisionModuleWorkSpace/arm-rockchip830-linux-uclibcgnueabihf" +# 使用cmake配置项目 +cmake .. +# 执行编译项目 +make -j8 && make install +``` + +在执行完上述命令后,会在build目录下生成可执行文件。 + +--- + +## 5. 执行结果 +### 5.1 运行前准备 +- 请确保你已经下载了 [凌智视觉模块文本检测参数文件](https://gitee.com/LockzhinerAI/LockzhinerVisionModule/releases/download/v0.0.6/dbnet_op.param) +- 请确保你已经下载了 [凌智视觉模块文本检测bin文件](https://gitee.com/LockzhinerAI/LockzhinerVisionModule/releases/download/v0.0.6/dbnet_op.bin) +### 5.2 运行过程 +```shell +chmod 777 Test-ncnn_dbnet +./Test-ncnn_dbnet image_path dbnet_op.param dbnet_op.bin +``` +### 5.3 运行效果 +- 原始图像 +![title](./images/1.png) +- 检测结果 +![title](./images/result1.png) + +--- + +## 6. 总结 +通过上述内容,我们成功实现了一个简单的OCR文本检测系统,包括: + +- 加载检测模型和检测图像。 +- 进行文本检测推理。 +- 在原图上绘制检测结果并显示。 \ No newline at end of file diff --git a/Cpp_example/D06_ocr_text_detection/images/1.png b/Cpp_example/D06_ocr_text_detection/images/1.png new file mode 100755 index 0000000000000000000000000000000000000000..4b965257da890aaff23e19deee7ae5daa97fd529 Binary files /dev/null and b/Cpp_example/D06_ocr_text_detection/images/1.png differ diff --git a/Cpp_example/D06_ocr_text_detection/images/result1.png b/Cpp_example/D06_ocr_text_detection/images/result1.png new file mode 100755 index 0000000000000000000000000000000000000000..cc691ce9093fd258a397e9284e27120a74a96000 Binary files /dev/null and b/Cpp_example/D06_ocr_text_detection/images/result1.png differ diff --git a/Cpp_example/D06_ocr_text_detection/images/view.png b/Cpp_example/D06_ocr_text_detection/images/view.png new file mode 100755 index 0000000000000000000000000000000000000000..bfc0709fa80f69f80034499e7601766fbf1725b7 Binary files /dev/null and
b/Cpp_example/D06_ocr_text_detection/images/view.png differ diff --git a/Cpp_example/D06_ocr_text_detection/ncnn_dbnet.cc b/Cpp_example/D06_ocr_text_detection/ncnn_dbnet.cc new file mode 100755 index 0000000000000000000000000000000000000000..3862fc727124dfbea3a1ccaf318a5b5b58ef08e7 --- /dev/null +++ b/Cpp_example/D06_ocr_text_detection/ncnn_dbnet.cc @@ -0,0 +1,214 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace cv; +using namespace std::chrono; + + +// 定义必要的参数 +const float meanValues[3] = {0.485f * 255, 0.456f * 255, 0.406f * 255}; +const float normValues[3] = {1.0f / 0.229f / 255.0f, 1.0f / 0.224f / 255.0f, 1.0f / 0.225f / 255.0f}; + +// 图像缩放参数定义 +struct ScaleParam { + int srcWidth; + int srcHeight; + int dstWidth; + int dstHeight; + float ratioWidth; + float ratioHeight; +}; + +// 文本框结构体的定义 +struct TextBox { + std::vector boxPoint; + float score; +}; + +// 计算图像缩放比例以适应目标尺寸 +ScaleParam getScaleParam(const cv::Mat &src, const int targetSize) { + int imgHeight = src.rows; + int imgWidth = src.cols; + float ratio = std::min(static_cast(targetSize) / imgHeight, static_cast(targetSize) / imgWidth); + ScaleParam scaleParam; + scaleParam.srcHeight = imgHeight; + scaleParam.srcWidth = imgWidth; + scaleParam.dstHeight = static_cast(imgHeight * ratio); + scaleParam.dstWidth = static_cast(imgWidth * ratio); + scaleParam.ratioHeight = ratio; + scaleParam.ratioWidth = ratio; + return scaleParam; +} + +// 向量归一化函数 +cv::Point2f normalize(const cv::Point2f& v) { + float len = std::sqrt(v.x * v.x + v.y * v.y); + // 防止除以零 + if (len == 0) return cv::Point2f(0, 0); + return cv::Point2f(v.x / len, v.y / len); +} + +// 多边形扩张函数:用于扩大文本框边界 +std::vector expandPolygon(const std::vector& inBox, float distance) { + std::vector outBox; + int n = inBox.size(); + // 确保输入至少是一个三角形 + if (n < 3) return outBox; + + for (int i = 0; i < n; ++i) { + cv::Point2f prev = inBox[(i + n - 1) % n]; + cv::Point2f curr = inBox[i]; + cv::Point2f next = 
inBox[(i + 1) % n]; + + cv::Point2f v1 = cv::Point2f(curr.x - prev.x, curr.y - prev.y); + cv::Point2f v2 = cv::Point2f(next.x - curr.x, next.y - curr.y); + + cv::Point2f normal1(-v1.y, v1.x); + cv::Point2f normal2(-v2.y, v2.x); + + normal1 = normalize(normal1); + normal2 = normalize(normal2); + + cv::Point2f bisectorNormal = normal1 + normal2; + bisectorNormal = normalize(bisectorNormal); + + cv::Point2f newPoint = curr + bisectorNormal * distance; + + outBox.push_back(cv::Point(newPoint.x, newPoint.y)); + } + + return outBox; +} + +// 查找并处理文本框区域 +std::vector findRsBoxes(const cv::Mat &fMapMat, const cv::Mat &norfMapMat, ScaleParam &s, + const float boxScoreThresh, const float unClipRatio) { + float minArea = 3; + std::vector rsBoxes; + std::vector> contours; + cv::findContours(norfMapMat, contours, cv::RETR_LIST, cv::CHAIN_APPROX_SIMPLE); + + for (size_t i = 0; i < contours.size(); ++i) { + double perimeter = cv::arcLength(contours[i], true); + cv::RotatedRect minRect = cv::minAreaRect(contours[i]); + float minSideLen = std::min(minRect.size.width, minRect.size.height); + + if (minSideLen < minArea) + continue; + + // 创建一个掩码图像 + cv::Mat mask = cv::Mat::zeros(fMapMat.size(), CV_8UC1); + cv::drawContours(mask, contours, static_cast(i), cv::Scalar(255), cv::FILLED); + + // 计算掩码内部的平均得分 + cv::Scalar meanScore = cv::mean(fMapMat, mask); + float score = static_cast(meanScore[0]); + + if (score < boxScoreThresh) + continue; + + // 使用expandPolygon实现多边形扩张 + double area = cv::contourArea(contours[i]); + float distance = unClipRatio * area / static_cast(perimeter); + std::vector clipBox = expandPolygon(contours[i], distance); + + if (minSideLen < minArea + 2) + continue; + + for (auto &point : clipBox) { + point.x = std::max(0, std::min(static_cast(point.x / s.ratioWidth), s.srcWidth - 1)); + point.y = std::max(0, std::min(static_cast(point.y / s.ratioHeight), s.srcHeight - 1)); + } + + rsBoxes.emplace_back(TextBox{clipBox, score}); + } + return rsBoxes; +} + +// 绘制文本框到图像上 
+void drawTextBox(cv::Mat &boxImg, const std::vector &box, int thickness) { + for (size_t i = 0; i < box.size(); ++i) { + cv::line(boxImg, box[i], box[(i + 1) % box.size()], cv::Scalar(0, 255, 0), thickness); + } +} + +int main(int argc, char** argv) { + if(argc != 4) { + std::cerr << "Usage: " << argv[0] << " " << std::endl; + return -1; + } + // 读取输入图像 + cv::Mat src = cv::imread(argv[1], cv::IMREAD_COLOR); + if(src.empty()) { + std::cerr << "Failed to read image: " << argv[1] << std::endl; + return -1; + } + + ncnn::Net net; + // 禁用一些可能不被支持的功能 + net.opt.use_vulkan_compute = false; + net.opt.use_bf16_storage = false; + net.opt.use_fp16_packed = false; + net.opt.use_fp16_storage = false; + net.opt.use_fp16_arithmetic = false; + + // 加载模型参数和权重文件 + if (net.load_param(argv[2]) != 0 || net.load_model(argv[3]) != 0) { + std::cerr << "Failed to load model from " << argv[2] << " and " << argv[3] << std::endl; + return -1; + } + std::cout << "Model loaded successfully." << std::endl; + // 736是自定义的参数,可以自己设置 + ScaleParam scaleParam = getScaleParam(src, 736); + + std::cout << " scaleParam successfully." << std::endl; + float boxScoreThresh = 0.5f; + float boxThresh = 0.3f; + float unClipRatio = 3.5f; + + cv::Mat srcResize; + resize(src, srcResize, cv::Size(scaleParam.dstWidth, scaleParam.dstHeight)); + std::cout << " resize successfully." 
<< std::endl; + + // 准备输入数据,并进行均值标准化 + ncnn::Mat input = ncnn::Mat::from_pixels(srcResize.data, ncnn::Mat::PIXEL_BGR, srcResize.cols, srcResize.rows); + input.substract_mean_normalize(meanValues, normValues); + + // 创建推理器并执行推理 + high_resolution_clock::time_point start_time = + high_resolution_clock::now(); + ncnn::Extractor extractor = net.create_extractor(); + extractor.input("input0", input); + ncnn::Mat out; + extractor.extract("out1", out); + high_resolution_clock::time_point end_time = high_resolution_clock::now(); + // 计算推理时间 + auto time_span = duration_cast(end_time - start_time); + std::cout << "单张图片推理时间(ms): " << time_span.count() << std::endl; + + // 创建一个单通道的 cv::Mat 来存储第一个通道的数据 + cv::Mat fMapMat(srcResize.rows, srcResize.cols, CV_32FC1); + + memcpy(fMapMat.data, (float *) out.data, srcResize.rows * srcResize.cols * sizeof(float)); + + cv::Mat norfMapMat; + norfMapMat = fMapMat > boxThresh; + + // 查找文本框并绘制在原图上 + std::vector textBoxes = findRsBoxes(fMapMat, norfMapMat, scaleParam, boxScoreThresh, unClipRatio); + + for (const auto &textBox : textBoxes) { + drawTextBox(src, textBox.boxPoint, 1); + } + cv::imshow("Detected Text Boxes", src); + cv::waitKey(0); + + return 0; +} \ No newline at end of file diff --git a/Cpp_example/D07_ocr_synthesis/CMakeLists.txt b/Cpp_example/D07_ocr_synthesis/CMakeLists.txt new file mode 100755 index 0000000000000000000000000000000000000000..36ef5543ffe2f61ed2a6c5f0572385c109953666 --- /dev/null +++ b/Cpp_example/D07_ocr_synthesis/CMakeLists.txt @@ -0,0 +1,44 @@ +cmake_minimum_required(VERSION 3.10) + +project(ocr_synthesis) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +# 定义项目根目录路径 +set(PROJECT_ROOT_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../..") +message("PROJECT_ROOT_PATH = " ${PROJECT_ROOT_PATH}) + +include("${PROJECT_ROOT_PATH}/toolchains/arm-rockchip830-linux-uclibcgnueabihf.toolchain.cmake") + +# 定义 OpenCV SDK 路径 +set(OpenCV_ROOT_PATH 
"${PROJECT_ROOT_PATH}/third_party/opencv-mobile-4.10.0-lockzhiner-vision-module") +set(OpenCV_DIR "${OpenCV_ROOT_PATH}/lib/cmake/opencv4") +find_package(OpenCV REQUIRED) +set(OPENCV_LIBRARIES "${OpenCV_LIBS}") + +# 定义 LockzhinerVisionModule SDK 路径 +set(LockzhinerVisionModule_ROOT_PATH "${PROJECT_ROOT_PATH}/third_party/lockzhiner_vision_module_sdk") +set(LockzhinerVisionModule_DIR "${LockzhinerVisionModule_ROOT_PATH}/lib/cmake/lockzhiner_vision_module") +find_package(LockzhinerVisionModule REQUIRED) + +# ncnn配置 +set(NCNN_ROOT_DIR "${PROJECT_ROOT_PATH}/third_party/ncnn-20240820-lockzhiner-vision-module") # 确保third_party层级存在 +message(STATUS "Checking ncnn headers in: ${NCNN_ROOT_DIR}/include/ncnn") + +# 验证头文件存在 +if(NOT EXISTS "${NCNN_ROOT_DIR}/include/ncnn/net.h") + message(FATAL_ERROR "ncnn headers not found. Confirm the directory contains ncnn: ${NCNN_ROOT_DIR}") +endif() + +set(NCNN_INCLUDE_DIRS "${NCNN_ROOT_DIR}/include") +set(NCNN_LIBRARIES "${NCNN_ROOT_DIR}/lib/libncnn.a") + +add_executable(Test-OcrLite OcrLite.cc) +target_include_directories(Test-OcrLite PRIVATE ${LOCKZHINER_VISION_MODULE_INCLUDE_DIRS} ${NCNN_INCLUDE_DIRS}) +target_link_libraries(Test-OcrLite PRIVATE ${OPENCV_LIBRARIES} ${NCNN_LIBRARIES} ${LOCKZHINER_VISION_MODULE_LIBRARIES}) + +install( + TARGETS Test-OcrLite + RUNTIME DESTINATION . 
+) \ No newline at end of file diff --git a/Cpp_example/D07_ocr_synthesis/OcrLite.cc b/Cpp_example/D07_ocr_synthesis/OcrLite.cc new file mode 100755 index 0000000000000000000000000000000000000000..b9464a0db3f6b7015d8117e9fab4ab647516fe3c --- /dev/null +++ b/Cpp_example/D07_ocr_synthesis/OcrLite.cc @@ -0,0 +1,300 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace cv; +using namespace std::chrono; + +// 定义必要的参数 +const float dbnet_meanValues[3] = {0.485f * 255, 0.456f * 255, 0.406f * 255}; +const float dbnet_normValues[3] = {1.0f / 0.229f / 255.0f, 1.0f / 0.224f / 255.0f, 1.0f / 0.225f / 255.0f}; + +// 文本检测结构体 +struct TextBox { + std::vector boxPoint; + float score; + std::string text; +}; + +// 图像缩放参数定义 +struct ScaleParam { + int srcWidth; + int srcHeight; + int dstWidth; + int dstHeight; + float ratioWidth; + float ratioHeight; +}; + +// 文本检测相关函数 +ScaleParam getScaleParam(const cv::Mat &src, const int targetSize) { + int imgHeight = src.rows; + int imgWidth = src.cols; + float ratio = std::min(static_cast(targetSize) / imgHeight, static_cast(targetSize) / imgWidth); + ScaleParam scaleParam; + scaleParam.srcHeight = imgHeight; + scaleParam.srcWidth = imgWidth; + scaleParam.dstHeight = static_cast(imgHeight * ratio); + scaleParam.dstWidth = static_cast(imgWidth * ratio); + scaleParam.ratioHeight = ratio; + scaleParam.ratioWidth = ratio; + return scaleParam; +} + +cv::Point2f normalize(const cv::Point2f& v) { + float len = std::sqrt(v.x * v.x + v.y * v.y); + return (len == 0) ? 
cv::Point2f(0, 0) : cv::Point2f(v.x / len, v.y / len); +} + +// +std::vector expandPolygon(const std::vector& inBox, float distance) { + std::vector outBox; + int n = inBox.size(); + if (n < 3) return outBox; + + for (int i = 0; i < n; ++i) { + cv::Point2f prev = inBox[(i + n - 1) % n]; + cv::Point2f curr = inBox[i]; + cv::Point2f next = inBox[(i + 1) % n]; + + cv::Point2f v1 = curr - prev; + cv::Point2f v2 = next - curr; + + cv::Point2f normal1(-v1.y, v1.x); + cv::Point2f normal2(-v2.y, v2.x); + + normal1 = normalize(normal1); + normal2 = normalize(normal2); + + cv::Point2f bisectorNormal = normalize(normal1 + normal2); + cv::Point2f newPoint = curr + bisectorNormal * distance; + + outBox.push_back(cv::Point(newPoint.x, newPoint.y)); + } + return outBox; +} + +// 提取检测到文本的检测框 +std::vector findRsBoxes(const cv::Mat &fMapMat, const cv::Mat &norfMapMat, ScaleParam &s, + const float boxScoreThresh, const float unClipRatio) { + float minArea = 3; + std::vector rsBoxes; + std::vector> contours; + cv::findContours(norfMapMat, contours, cv::RETR_LIST, cv::CHAIN_APPROX_SIMPLE); + + for (const auto& contour : contours) { + double perimeter = cv::arcLength(contour, true); + cv::RotatedRect minRect = cv::minAreaRect(contour); + float minSideLen = std::min(minRect.size.width, minRect.size.height); + if (minSideLen < minArea) continue; + cv::Mat mask = cv::Mat::zeros(fMapMat.size(), CV_8UC1); + for (int contour_idx = 0; contour_idx < contours.size(); contour_idx++) { + cv::drawContours(mask, contours, contour_idx, cv::Scalar(255), cv::FILLED); + } + + cv::Scalar meanScore = cv::mean(fMapMat, mask); + float score = static_cast(meanScore[0]); + + if (score < boxScoreThresh) continue; + + double area = cv::contourArea(contour); + float distance = unClipRatio * static_cast(area) / static_cast(perimeter); + std::vector clipBox = expandPolygon(contour, distance); + + for (auto &point : clipBox) { + point.x = std::max(0, std::min(static_cast(point.x / s.ratioWidth), s.srcWidth - 1)); + 
point.y = std::max(0, std::min(static_cast(point.y / s.ratioHeight), s.srcHeight - 1)); + } + + rsBoxes.emplace_back(TextBox{clipBox, score}); + } + return rsBoxes; +} + +void drawTextBox(cv::Mat &boxImg, const std::vector &box, int thickness) { + for (size_t i = 0; i < box.size(); ++i) { + cv::line(boxImg, box[i], box[(i + 1) % box.size()], cv::Scalar(0, 255, 0), thickness); + } +} + +// 辅助函数 +template +inline static size_t argmax(ForwardIterator first, ForwardIterator last) { + return std::distance(first, std::max_element(first, last)); +} + +// 文本识别相关函数 +void processFrame(Mat &src, ncnn::Net &ocr_net, const std::vector &keys, + const float ocr_mean[], const float ocr_norm[], int dstHeight, std::string &outputText) { + float scale = (float)dstHeight / (float)src.rows; + int dstWidth = int((float)src.cols * scale); + Mat resized_patch; + cv::resize(src, resized_patch, cv::Size(dstWidth, dstHeight)); + ncnn::Mat input = ncnn::Mat::from_pixels(resized_patch.data, ncnn::Mat::PIXEL_BGR, + resized_patch.cols, resized_patch.rows); + input.substract_mean_normalize(ocr_mean, ocr_norm); + ncnn::Extractor ocr_extractor = ocr_net.create_extractor(); + ocr_extractor.input("input", input); + ncnn::Mat ocr_output; + ocr_extractor.extract("out", ocr_output); + float *floatArray = (float *)ocr_output.data; + std::vector outputData(floatArray, floatArray + ocr_output.h * ocr_output.w); + int keySize = keys.size(); + std::string strRes; + std::vector scores; + int lastIndex = 0; + int maxIndex; + float maxValue; + for (int i = 0; i < ocr_output.h; i++) { + int maxIndex = 0; + float maxValue = -1000.f; + // Softmax 计算 + std::vector exps(ocr_output.w); + for (int j = 0; j < ocr_output.w; j++) { + float expSingle = exp(outputData[i * ocr_output.w + j]); + exps.at(j) = expSingle; + } + // 行总和 + float partition = accumulate(exps.begin(), exps.end(), 0.0); + // 找到最大值及其索引 + auto maxElementIt = std::max_element(exps.begin(), exps.end()); + maxIndex = std::distance(exps.begin(), 
maxElementIt); + maxValue = *maxElementIt / partition; + // 检测到有效字符 + if (maxIndex > 0 && maxIndex < keySize && + (!(i > 0 && maxIndex == lastIndex))) { + scores.emplace_back(maxValue); + // 将字符追加到结果字符串中 + strRes.append(keys[maxIndex - 1]); + } + + lastIndex = maxIndex; + } + outputText = strRes; +} + +int main(int argc, char *argv[]) { + if (argc != 7) { + std::cerr << "Usage: " << argv[0] + << " " << std::endl; + return -1; + } + + // 读取输入图像 + Mat src = imread(argv[1], IMREAD_COLOR); + if (src.empty()) { + std::cerr << "Failed to read image." << std::endl; + return -1; + } + + // 加载OCR字符表 + std::vector keys; + std::ifstream in(argv[6]); + std::string line; + if (in) { + while (getline(in, line)) { + keys.push_back(line); + } + } else { + std::cerr << "The keys.txt file was not found." << std::endl; + return -1; + } + if (keys.size() != 5531) { + std::cerr << "Invalid keys.txt format." << std::endl; + return -1; + } + + // 初始化DBNet检测模型 + ncnn::Net dbnet; + dbnet.opt.use_vulkan_compute = false; + if (dbnet.load_param(argv[2]) != 0 || dbnet.load_model(argv[3]) != 0) { + std::cerr << "Failed to load DBNet model." << std::endl; + return -1; + } + + // 初始化OCR识别模型 + ncnn::Net ocr_net; + ocr_net.opt.use_vulkan_compute = false; + if (ocr_net.load_param(argv[4]) != 0 || ocr_net.load_model(argv[5]) != 0) { + std::cerr << "Failed to load OCR model." 
<< std::endl; + return -1; + } + + // 执行文本检测 + ScaleParam scaleParam = getScaleParam(src, 736); + Mat srcResize; + resize(src, srcResize, cv::Size(scaleParam.dstWidth, scaleParam.dstHeight)); + + ncnn::Mat dbnet_input = ncnn::Mat::from_pixels(srcResize.data, ncnn::Mat::PIXEL_BGR, + srcResize.cols, srcResize.rows); + dbnet_input.substract_mean_normalize(dbnet_meanValues, dbnet_normValues); + + ncnn::Extractor dbnet_extractor = dbnet.create_extractor(); + dbnet_extractor.input("input0", dbnet_input); + ncnn::Mat dbnet_output; + dbnet_extractor.extract("out1", dbnet_output); + + // 解析检测输出 + Mat fMapMat(srcResize.rows, srcResize.cols, CV_32FC1); + memcpy(fMapMat.data, (float *)dbnet_output.data, + srcResize.rows * srcResize.cols * sizeof(float)); + Mat norfMapMat = fMapMat > 0.3f; + + std::vector text_boxes = findRsBoxes(fMapMat, norfMapMat, scaleParam, 0.5f, 4.0f); + + // 初始化计数器 + int save_count = 0; + // 执行文本识别 + for (auto& box : text_boxes) { + // 裁剪文本区域 + Rect rect = cv::boundingRect(box.boxPoint); + Mat text_patch = src(rect).clone(); + + // 保存裁剪后的原始区域 + std::string filename = "cropped_" + std::to_string(save_count) + ".png"; + // 保存为PNG格式 + imwrite(filename, text_patch); + save_count++; + + // 执行OCR识别 + const float ocr_mean[3] = {127.5f, 127.5f, 127.5f}; + const float ocr_norm[3] = {1.0f / 127.5f, 1.0f / 127.5f, 1.0f / 127.5f}; + std::string textResult; + processFrame(text_patch, ocr_net, keys, ocr_mean, ocr_norm, 32, textResult); + + box.text = textResult; + } + + // 对文本框进行排序 + std::sort(text_boxes.begin(), text_boxes.end(), [](const TextBox &a, const TextBox &b) { + cv::Rect rectA = cv::boundingRect(a.boxPoint); + cv::Rect rectB = cv::boundingRect(b.boxPoint); + + int centerYA = rectA.y + rectA.height / 2; + int centerYB = rectB.y + rectB.height / 2; + + if (centerYA != centerYB) { + return centerYA < centerYB; + } + return (rectA.x + rectA.width / 2) < (rectB.x + rectB.width / 2); + }); + // 可视化结果 + for (const auto& box : text_boxes) { + drawTextBox(src, 
box.boxPoint, 1); + std::cout << box.text << std::endl; + // 由于opencv并不支持绘制中文,所以一下代码可以注释掉,感兴趣的可以自己尝试一下。 + // putText(src, box.text, box.boxPoint[0], + // FONT_HERSHEY_SIMPLEX, 0.8, + // Scalar(0, 0, 255), 1); + } + + // 显示结果 + imshow("Text_result", src); + waitKey(0); + return 0; +} \ No newline at end of file diff --git a/Cpp_example/D07_ocr_synthesis/README.md b/Cpp_example/D07_ocr_synthesis/README.md new file mode 100755 index 0000000000000000000000000000000000000000..a3869b0c13f6dd7707391ed8da7019f4c420d7bf --- /dev/null +++ b/Cpp_example/D07_ocr_synthesis/README.md @@ -0,0 +1,564 @@ +# OCR 中文字符识别 +本章节在 Lockzhiner Vision Module 上基于OcrLiteNcnn模型, 实现了一个OCR中文字符识别系统。 +## 1. 基本知识讲解 +### 1.1 OCR中文字符识别简介 +光学字符识别(OCR)是一种将图像中的文字转换为可编辑文本的技术,其中中文OCR因汉字结构复杂、字形多样而更具挑战性。通过图像预处理、深度学习模型(如CNN+RNN)及语言校正,系统能精准识别印刷体或手写中文,并适应不同字体、排版与复杂背景。该技术广泛应用于文档数字化、证件识别、智能办公等领域,尤其在处理海量纸质资料、提升信息处理效率方面具有重要意义。 +### 1.2 OCR中文字符识别的核心步骤 +要实现一个OCR中文字符识别,主要包含以下几个步骤: +- 图像预处理: + - 灰度化与二值化:将彩色图像转换为黑白图像,增强文字与背景的对比度。 + - 降噪与倾斜校正:去除噪声,校正因拍摄导致的倾斜角度。 +- 文字定位与区域检测: + - 一般利用深度学习模型来检测图像中的文字区域。 +- 字符分割: + - 将连续文字块拆分为单个字符,解决粘连字符问题。 +- 特征提取与识别: + - 提取字符的几何特征或通过深度学习模型生成抽象特征。 +- 后处理与校正 + +--- + +## 2. 
C++ API 文档 +### 2.1 Net类 +#### 2.1.1 头文件 +```cpp +#include +``` +- 作用:用于声明Net类,使得Net类可以在当前文件中使用。 + +#### 2.1.2 构造类函数 +```cpp +ncnn::Net dbnet; +``` +- 作用:创建一个Net类型的对象实例,用于实现文字区域的检测。 +- 参数说明: + - 无 +- 返回值: + - 无 + +```cpp +ncnn::Net ocr_net; +``` +- 作用:创建一个Net类型的对象实例,用于实现文字的识别。 +- 参数说明: + - 无 +- 返回值: + - 无 + +#### 2.1.3 load_param函数 +```cpp +int load_param(const DataReader& dr); +``` +- 参数说明: + - dr:传入的参数文件路径。 +- 返回值: + - 返回值为0表示加载参数文件成功。 + +#### 2.1.4 load_model函数 +```cpp +int load_model(const DataReader& dr); +``` +- 参数说明: + - dr:传入的模型文件路径。 +- 返回值:返回值为0表示加载模型成功。 + +#### 2.1.5 from_pixels函数 +```cpp +ncnn::Mat::from_pixels(srcResize.data, ncnn::Mat::PIXEL_BGR, srcResize.cols, srcResize.rows); +``` +- 参数说明: + - srcResize.data:输入图像的像素数据指针。 + - ncnn::Mat::PIXEL_BGR:输入像素数据的颜色格式。 + - srcResize.cols:输入图像的宽度。 + - srcResize.rows:输入图像的高度。 +- 返回值:适配成 NCNN 所需的格式的包含图像数据的新对象。 + +### 2.2 Extractor类 +#### 2.2.1 头文件 +```cpp +#include +``` +- 作用:用于声明Extractor类,使得Extractor类可以在当前文件中使用。 + +#### 2.2.2 构造类函数 +```cpp +ncnn::Extractor dbnet_extractor = dbnet.create_extractor(); +``` +- 作用:从已经加载了神经网络模型的 dbnet 中创建一个 Extractor 实例,用于执行文本区域检测的推理任务。 +- 参数说明: + - 无 +- 返回值: + - 无 + +```cpp +ncnn::Extractor ocr_extractor = ocr_net.create_extractor(); +``` +- 作用:从已经加载了神经网络模型的 ocr_net 中创建一个 Extractor 实例,用于执行文本识别的推理任务。 +- 参数说明: + - 无 +- 返回值: + - 无 + +## 3.
OCR 中文字符识别代码解析 +### 3.1 流程图 + + + +### 3.2 核心代码解析 +#### 3.2.1 向量归一化 +```cpp +cv::Point2f normalize(const cv::Point2f& v); +``` +- 作用: + - 计算向量长度并归一化,用于后续几何变换(如多边形扩展方向计算)。 +- 参数说明: + - v:待归一化的二维向量。 +- 返回值: + - 返回单位长度的向量,若输入向量为零向量则返回(0, 0)。 + +#### 3.2.2 多边形边界扩张 +```cpp +std::vector expandPolygon(const std::vector& inBox, float distance); +``` +- 作用: + - 基于邻边法线方向计算角平分线,向外扩展多边形边界,增强文本检测框的包容性。 +- 参数说明: + - inBox:输入的多边形顶点集合。 + - distance:扩展距离,控制文本框膨胀程度。 +- 返回值: + - 返回扩展后的新多边形顶点集合。 + +#### 3.2.3 文本检测框提取 +```cpp +std::vector findRsBoxes(const cv::Mat &fMapMat, const cv::Mat &norfMapMat, ScaleParam &s, const float boxScoreThresh, const float unClipRatio); +``` +- 作用: + - 结合轮廓检测与UNet式后处理,生成最终文本检测框。 +- 参数说明: + - fMapMat:DBNet输出的概率图。 + - norfMapMat:二值化后的概率图。 + - s:图像缩放参数结构体。 + - boxScoreThresh:文本框置信度阈值。 + - unClipRatio:边界扩展系数。 +- 返回值: + - 返回包含文本框坐标、置信度的结构体列表。 + +#### 3.2.4 OCR文本识别 +```cpp +void processFrame(Mat &src, ncnn::Net &ocr_net, const std::vector &keys, + const float ocr_mean[], const float ocr_norm[], int dstHeight, std::string &outputText); +``` +- 作用: + - 执行图像预处理,模型推理,Softmax解码,字符拼接全流程。 +- 参数说明: + - src:待识别的文本区域图像。 + - ocr_net:OCR识别模型。 + - keys:字符表(字符到索引映射)。 + - ocr_mean:图像归一化均值。 + - ocr_norm:图像归一化标准差。 + - dstHeight:目标高度。 + - outputText:输出识别结果字符串。 +- 返回值: + - 无 + +### 3.3 完整代码实现 +```cpp +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace cv; +using namespace std::chrono; + +// 定义必要的参数 +const float dbnet_meanValues[3] = {0.485f * 255, 0.456f * 255, 0.406f * 255}; +const float dbnet_normValues[3] = {1.0f / 0.229f / 255.0f, 1.0f / 0.224f / 255.0f, 1.0f / 0.225f / 255.0f}; + +// 文本检测结构体 +struct TextBox { + std::vector boxPoint; + float score; + std::string text; +}; + +// 图像缩放参数定义 +struct ScaleParam { + int srcWidth; + int srcHeight; + int dstWidth; + int dstHeight; + float ratioWidth; + float ratioHeight; +}; + +// 文本检测相关函数 +ScaleParam getScaleParam(const cv::Mat &src, const int targetSize) { + int imgHeight = src.rows; + 
int imgWidth = src.cols; + float ratio = std::min(static_cast(targetSize) / imgHeight, static_cast(targetSize) / imgWidth); + ScaleParam scaleParam; + scaleParam.srcHeight = imgHeight; + scaleParam.srcWidth = imgWidth; + scaleParam.dstHeight = static_cast(imgHeight * ratio); + scaleParam.dstWidth = static_cast(imgWidth * ratio); + scaleParam.ratioHeight = ratio; + scaleParam.ratioWidth = ratio; + return scaleParam; +} + +cv::Point2f normalize(const cv::Point2f& v) { + float len = std::sqrt(v.x * v.x + v.y * v.y); + return (len == 0) ? cv::Point2f(0, 0) : cv::Point2f(v.x / len, v.y / len); +} + +// 多边形边界扩展 +std::vector expandPolygon(const std::vector& inBox, float distance) { + std::vector outBox; + int n = inBox.size(); + if (n < 3) return outBox; + + for (int i = 0; i < n; ++i) { + cv::Point2f prev = inBox[(i + n - 1) % n]; + cv::Point2f curr = inBox[i]; + cv::Point2f next = inBox[(i + 1) % n]; + + cv::Point2f v1 = curr - prev; + cv::Point2f v2 = next - curr; + + cv::Point2f normal1(-v1.y, v1.x); + cv::Point2f normal2(-v2.y, v2.x); + + normal1 = normalize(normal1); + normal2 = normalize(normal2); + + cv::Point2f bisectorNormal = normalize(normal1 + normal2); + cv::Point2f newPoint = curr + bisectorNormal * distance; + + outBox.push_back(cv::Point(newPoint.x, newPoint.y)); + } + return outBox; +} + +// 提取检测到文本的检测框 +std::vector findRsBoxes(const cv::Mat &fMapMat, const cv::Mat &norfMapMat, ScaleParam &s, + const float boxScoreThresh, const float unClipRatio) { + float minArea = 3; + std::vector rsBoxes; + std::vector> contours; + cv::findContours(norfMapMat, contours, cv::RETR_LIST, cv::CHAIN_APPROX_SIMPLE); + + for (const auto& contour : contours) { + double perimeter = cv::arcLength(contour, true); + cv::RotatedRect minRect = cv::minAreaRect(contour); + float minSideLen = std::min(minRect.size.width, minRect.size.height); + if (minSideLen < minArea) continue; + cv::Mat mask = cv::Mat::zeros(fMapMat.size(), CV_8UC1); + for (int contour_idx = 0; contour_idx < 
contours.size(); contour_idx++) { + cv::drawContours(mask, contours, contour_idx, cv::Scalar(255), cv::FILLED); + } + + cv::Scalar meanScore = cv::mean(fMapMat, mask); + float score = static_cast(meanScore[0]); + + if (score < boxScoreThresh) continue; + + double area = cv::contourArea(contour); + float distance = unClipRatio * static_cast(area) / static_cast(perimeter); + std::vector clipBox = expandPolygon(contour, distance); + + for (auto &point : clipBox) { + point.x = std::max(0, std::min(static_cast(point.x / s.ratioWidth), s.srcWidth - 1)); + point.y = std::max(0, std::min(static_cast(point.y / s.ratioHeight), s.srcHeight - 1)); + } + + rsBoxes.emplace_back(TextBox{clipBox, score}); + } + return rsBoxes; +} + +void drawTextBox(cv::Mat &boxImg, const std::vector &box, int thickness) { + for (size_t i = 0; i < box.size(); ++i) { + cv::line(boxImg, box[i], box[(i + 1) % box.size()], cv::Scalar(0, 255, 0), thickness); + } +} + +// 辅助函数 +template +inline static size_t argmax(ForwardIterator first, ForwardIterator last) { + return std::distance(first, std::max_element(first, last)); +} + +// 文本识别相关函数 +void processFrame(Mat &src, ncnn::Net &ocr_net, const std::vector &keys, + const float ocr_mean[], const float ocr_norm[], int dstHeight, std::string &outputText) { + float scale = (float)dstHeight / (float)src.rows; + int dstWidth = int((float)src.cols * scale); + Mat resized_patch; + cv::resize(src, resized_patch, cv::Size(dstWidth, dstHeight)); + ncnn::Mat input = ncnn::Mat::from_pixels(resized_patch.data, ncnn::Mat::PIXEL_BGR, + resized_patch.cols, resized_patch.rows); + input.substract_mean_normalize(ocr_mean, ocr_norm); + ncnn::Extractor ocr_extractor = ocr_net.create_extractor(); + ocr_extractor.input("input", input); + ncnn::Mat ocr_output; + ocr_extractor.extract("out", ocr_output); + float *floatArray = (float *)ocr_output.data; + std::vector outputData(floatArray, floatArray + ocr_output.h * ocr_output.w); + int keySize = keys.size(); + std::string strRes; 
+ std::vector scores; + int lastIndex = 0; + int maxIndex; + float maxValue; + for (int i = 0; i < ocr_output.h; i++) { + int maxIndex = 0; + float maxValue = -1000.f; + // Softmax 计算 + std::vector exps(ocr_output.w); + for (int j = 0; j < ocr_output.w; j++) { + float expSingle = exp(outputData[i * ocr_output.w + j]); + exps.at(j) = expSingle; + } + // 行总和 + float partition = accumulate(exps.begin(), exps.end(), 0.0); + // 找到最大值及其索引 + auto maxElementIt = std::max_element(exps.begin(), exps.end()); + maxIndex = std::distance(exps.begin(), maxElementIt); + maxValue = *maxElementIt / partition; + // 检测到有效字符 + if (maxIndex > 0 && maxIndex < keySize && + (!(i > 0 && maxIndex == lastIndex))) { + scores.emplace_back(maxValue); + // 将字符追加到结果字符串中 + strRes.append(keys[maxIndex - 1]); + } + + lastIndex = maxIndex; + } + outputText = strRes; +} + +int main(int argc, char *argv[]) { + if (argc != 7) { + std::cerr << "Usage: " << argv[0] + << " " << std::endl; + return -1; + } + + // 读取输入图像 + Mat src = imread(argv[1], IMREAD_COLOR); + if (src.empty()) { + std::cerr << "Failed to read image." << std::endl; + return -1; + } + + // 加载OCR字符表 + std::vector keys; + std::ifstream in(argv[6]); + std::string line; + if (in) { + while (getline(in, line)) { + keys.push_back(line); + } + } else { + std::cerr << "The keys.txt file was not found." << std::endl; + return -1; + } + if (keys.size() != 5531) { + std::cerr << "Invalid keys.txt format." << std::endl; + return -1; + } + + // 初始化DBNet检测模型 + ncnn::Net dbnet; + dbnet.opt.use_vulkan_compute = false; + if (dbnet.load_param(argv[2]) != 0 || dbnet.load_model(argv[3]) != 0) { + std::cerr << "Failed to load DBNet model." << std::endl; + return -1; + } + + // 初始化OCR识别模型 + ncnn::Net ocr_net; + ocr_net.opt.use_vulkan_compute = false; + if (ocr_net.load_param(argv[4]) != 0 || ocr_net.load_model(argv[5]) != 0) { + std::cerr << "Failed to load OCR model." 
<< std::endl; + return -1; + } + + // 执行文本检测 + ScaleParam scaleParam = getScaleParam(src, 736); + Mat srcResize; + resize(src, srcResize, cv::Size(scaleParam.dstWidth, scaleParam.dstHeight)); + + ncnn::Mat dbnet_input = ncnn::Mat::from_pixels(srcResize.data, ncnn::Mat::PIXEL_BGR, + srcResize.cols, srcResize.rows); + dbnet_input.substract_mean_normalize(dbnet_meanValues, dbnet_normValues); + + ncnn::Extractor dbnet_extractor = dbnet.create_extractor(); + dbnet_extractor.input("input0", dbnet_input); + ncnn::Mat dbnet_output; + dbnet_extractor.extract("out1", dbnet_output); + + // 解析检测输出 + Mat fMapMat(srcResize.rows, srcResize.cols, CV_32FC1); + memcpy(fMapMat.data, (float *)dbnet_output.data, + srcResize.rows * srcResize.cols * sizeof(float)); + Mat norfMapMat = fMapMat > 0.3f; + + std::vector text_boxes = findRsBoxes(fMapMat, norfMapMat, scaleParam, 0.5f, 4.0f); + + // 初始化计数器 + int save_count = 0; + // 执行文本识别 + for (auto& box : text_boxes) { + // 裁剪文本区域 + Rect rect = cv::boundingRect(box.boxPoint); + Mat text_patch = src(rect).clone(); + + // 保存裁剪后的原始区域 + std::string filename = "cropped_" + std::to_string(save_count) + ".png"; + // 保存为PNG格式 + imwrite(filename, text_patch); + save_count++; + + // 执行OCR识别 + const float ocr_mean[3] = {127.5f, 127.5f, 127.5f}; + const float ocr_norm[3] = {1.0f / 127.5f, 1.0f / 127.5f, 1.0f / 127.5f}; + std::string textResult; + processFrame(text_patch, ocr_net, keys, ocr_mean, ocr_norm, 32, textResult); + + box.text = textResult; + } + + // 对文本框进行排序 + std::sort(text_boxes.begin(), text_boxes.end(), [](const TextBox &a, const TextBox &b) { + cv::Rect rectA = cv::boundingRect(a.boxPoint); + cv::Rect rectB = cv::boundingRect(b.boxPoint); + + int centerYA = rectA.y + rectA.height / 2; + int centerYB = rectB.y + rectB.height / 2; + + if (centerYA != centerYB) { + return centerYA < centerYB; + } + return (rectA.x + rectA.width / 2) < (rectB.x + rectB.width / 2); + }); + // 可视化结果 + for (const auto& box : text_boxes) { + drawTextBox(src, 
box.boxPoint, 1); + std::cout << box.text << std::endl; + // 由于opencv并不支持绘制中文,所以一下代码可以注释掉,感兴趣的可以自己尝试一下。 + // putText(src, box.text, box.boxPoint[0], + // FONT_HERSHEY_SIMPLEX, 0.8, + // Scalar(0, 0, 255), 1); + } + + // 显示结果 + imshow("Text_result", src); + waitKey(0); + return 0; +} +``` + +--- + +## 4. 编译调试 +### 4.1 编译环境搭建 +- 请确保你已经按照 [开发环境搭建指南](../../../../docs/introductory_tutorial/cpp_development_environment.md) 正确配置了开发环境。 +- 同时已经正确连接开发板。 +### 4.2 Cmake介绍 +```cmake +cmake_minimum_required(VERSION 3.10) + +project(ocr_synthesis) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +# 定义项目根目录路径 +set(PROJECT_ROOT_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../..") +message("PROJECT_ROOT_PATH = " ${PROJECT_ROOT_PATH}) + +include("${PROJECT_ROOT_PATH}/toolchains/arm-rockchip830-linux-uclibcgnueabihf.toolchain.cmake") + +# 定义 OpenCV SDK 路径 +set(OpenCV_ROOT_PATH "${PROJECT_ROOT_PATH}/third_party/opencv-mobile-4.10.0-lockzhiner-vision-module") +set(OpenCV_DIR "${OpenCV_ROOT_PATH}/lib/cmake/opencv4") +find_package(OpenCV REQUIRED) +set(OPENCV_LIBRARIES "${OpenCV_LIBS}") + +# 定义 LockzhinerVisionModule SDK 路径 +set(LockzhinerVisionModule_ROOT_PATH "${PROJECT_ROOT_PATH}/third_party/lockzhiner_vision_module_sdk") +set(LockzhinerVisionModule_DIR "${LockzhinerVisionModule_ROOT_PATH}/lib/cmake/lockzhiner_vision_module") +find_package(LockzhinerVisionModule REQUIRED) + +# ncnn配置 +set(NCNN_ROOT_DIR "${PROJECT_ROOT_PATH}/third_party/ncnn-20240820-lockzhiner-vision-module") # 确保third_party层级存在 +message(STATUS "Checking ncnn headers in: ${NCNN_ROOT_DIR}/include/ncnn") + +# 验证头文件存在 +if(NOT EXISTS "${NCNN_ROOT_DIR}/include/ncnn/net.h") + message(FATAL_ERROR "ncnn headers not found. 
Confirm the directory contains ncnn: ${NCNN_ROOT_DIR}") +endif() + +set(NCNN_INCLUDE_DIRS "${NCNN_ROOT_DIR}/include") +set(NCNN_LIBRARIES "${NCNN_ROOT_DIR}/lib/libncnn.a") + +add_executable(Test-OcrLite OcrLite.cc) +target_include_directories(Test-OcrLite PRIVATE ${LOCKZHINER_VISION_MODULE_INCLUDE_DIRS} ${NCNN_INCLUDE_DIRS}) +target_link_libraries(Test-OcrLite PRIVATE ${OPENCV_LIBRARIES} ${NCNN_LIBRARIES} ${LOCKZHINER_VISION_MODULE_LIBRARIES}) + +install( + TARGETS Test-OcrLite + RUNTIME DESTINATION . +) +``` +### 4.3 编译项目 +使用 Docker Desktop 打开 LockzhinerVisionModule 容器并执行以下命令来编译项目 +```bash +# 进入Demo所在目录 +cd /LockzhinerVisionModuleWorkSpace/LockzhinerVisionModule/Cpp_example/D07_ocr_synthesis +# 创建编译目录 +rm -rf build && mkdir build && cd build +# 配置交叉编译工具链 +export TOOLCHAIN_ROOT_PATH="/LockzhinerVisionModuleWorkSpace/arm-rockchip830-linux-uclibcgnueabihf" +# 使用cmake配置项目 +cmake .. +# 执行编译项目 +make -j8 && make install +``` + +在执行完上述命令后,会在build目录下生成可执行文件。 + +--- + +## 5. 执行结果 +### 5.1 运行前准备 +- 请确保已经下载了模型,并保存在当前目录下。 +### 5.2 运行过程 +```shell +chmod 777 Test-OcrLite +./Test-OcrLite dbnet_op.param dbnet_op.bin crnn_lite_op.param crnn_lite_op.bin keys.txt +``` +### 5.3 运行结果 +- 原始图像 + +![title](./images/test.png) + +- 识别结果 + +![title](./images/Result_Rec.png) + +--- + +## 6.
总结 +通过上述内容,我们成功实现了一个简单的OCR中文字符识别系统,包括: + +- 加载模型和待识别图像。 +- 进行文本区域检测推理和文本识别推理。 +- 打印并保存推理结果。 \ No newline at end of file diff --git a/Cpp_example/D07_ocr_synthesis/images/Result_Det.png b/Cpp_example/D07_ocr_synthesis/images/Result_Det.png new file mode 100755 index 0000000000000000000000000000000000000000..174859f525b59fe6de30c61db0e821d484486db6 Binary files /dev/null and b/Cpp_example/D07_ocr_synthesis/images/Result_Det.png differ diff --git a/Cpp_example/D07_ocr_synthesis/images/Result_Rec.png b/Cpp_example/D07_ocr_synthesis/images/Result_Rec.png new file mode 100755 index 0000000000000000000000000000000000000000..0de6f93c3eea0cb3c22ee58ee4fb99f76a01040a Binary files /dev/null and b/Cpp_example/D07_ocr_synthesis/images/Result_Rec.png differ diff --git a/Cpp_example/D07_ocr_synthesis/images/test.png b/Cpp_example/D07_ocr_synthesis/images/test.png new file mode 100755 index 0000000000000000000000000000000000000000..6819020d308f5f7749a53ed40c53b03cea9d12d1 Binary files /dev/null and b/Cpp_example/D07_ocr_synthesis/images/test.png differ diff --git a/Cpp_example/D07_ocr_synthesis/images/view.png b/Cpp_example/D07_ocr_synthesis/images/view.png new file mode 100755 index 0000000000000000000000000000000000000000..c5c42ad9cab056bef3fed8be3209386bdaeef0ef Binary files /dev/null and b/Cpp_example/D07_ocr_synthesis/images/view.png differ diff --git a/README.md b/README.md index 05a32a1b59ad56cf047b04383c0d5710a0712743..c1bc3d0692984595314c48f6fa64ce2c46ba5071 100644 --- a/README.md +++ b/README.md @@ -51,7 +51,7 @@ * [凌智视觉模块连接设备指南](./docs/introductory_tutorial/connect_device_using_ssh.md) * [凌智视觉模块WiFi配置指南](./docs/introductory_tutorial/wifi_config.md) * [凌智视觉模块 Python 开发环境搭建指南](./docs/introductory_tutorial/python_development_environment.md) - +* [凌智视觉模块 C++ 开发环境搭建指南](./docs/introductory_tutorial/cpp_development_environment.md)