diff --git a/CMakeLists.txt b/CMakeLists.txt index d281c2c676ff776313042df9a5eb6fa25e77eb73..7ad285811ccaef4c6ec3addc080ed19e702c4fef 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.18) project(MovingCorrelation) cmake_policy(SET CMP0104 NEW) -include("/usr/share/CMakeDependencyDiagram/CMakeDependencyDiagram.cmake") +# include("/usr/share/CMakeDependencyDiagram/CMakeDependencyDiagram.cmake") set(CMAKE_CXX_STANDARD 17) diff --git a/cuda_correlation.cu b/cuda_correlation.cu index 3762c8e19b3196208db6f2f2c074020bcf347f32..671273826d1093cb9bbceeb70f04526d673e524d 100644 --- a/cuda_correlation.cu +++ b/cuda_correlation.cu @@ -1,23 +1,17 @@ #include #include #include -#include // 包含 sqrt 函数 -#include -#include -#include #include #include "cuda_correlation.h" - -// cpu侧对 peekMaxKernel 计算结果还需要进一步处理 -// 则使能该宏定义,并在cpu侧增加相关逻辑处理 h_vecPeaks -// #define CPU_NEED_PROCESS_PEEKMAXKERNEL_RESULT +#include "cuda_kernel.h" using namespace std; template class CUDACorrelation; // 明确告诉编译器生成float特例 template class CUDACorrelation; // 明确告诉编译器生成double特例 +// CHECK_CUDA_ERROR:cuda api调用错误处理 #define CHECK_CUDA_ERROR(call) \ do { \ cudaError_t err = call; \ @@ -43,208 +37,7 @@ static constexpr cufftType getFFTType() { } } -// 单精度共轭乘核函数 -// 一次计算一个序列和所有信号的共轭乘 -__global__ void ConjugateMultiplyKernelFloat( - const cufftComplex* __restrict__ signalDatas, - const cufftComplex* __restrict__ sequenceFFT, - cufftComplex* __restrict__ outputs, int signalLength, int numChannels) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - - if (idx >= signalLength * numChannels) return; - - int pos = idx % signalLength; - - cufftComplex signal = signalDatas[idx]; - cufftComplex seq = sequenceFFT[pos]; - - // 计算 signal * conj(seq) - outputs[idx].x = signal.x * seq.x + signal.y * seq.y; // 实部 - outputs[idx].y = signal.y * seq.x - signal.x * seq.y; // 虚部 -} - -// 批处理:计算所有序列和信号的共轭乘 -__global__ void batchConjugateMultiplyKernelFloat( - const cufftComplex* __restrict__ 
signalDatas, - const cufftComplex* __restrict__ sequenceFFT, - cufftComplex* __restrict__ outputs, int signalLength, int numChannels, - int seqChannels) { - // 使用2D网格布局:x维度处理位置,y维度处理序列 - int pos = blockIdx.x * blockDim.x + threadIdx.x; - // 检查位置索引是否有效 - if (pos >= signalLength) return; - - int seqIdx = blockIdx.y; - if (seqIdx >= seqChannels) return; - - // 预计算常用偏移量 - const int signalOffset = pos; // 信号内存访问基础偏移 - const int sequenceOffset = seqIdx * signalLength + pos; - - // 预取序列FFT值(所有通道共享) - const cufftComplex seq = sequenceFFT[sequenceOffset]; - -// 处理所有通道 -#pragma unroll - for (int chIdx = 0; chIdx < numChannels; ++chIdx) { - // 计算当前通道的全局索引 - int globalIdx = - seqIdx * numChannels * signalLength + chIdx * signalLength + pos; - - // 读取信号数据(合并访问) - cufftComplex signal = signalDatas[chIdx * signalLength + signalOffset]; - - // 计算共轭乘法 - outputs[globalIdx].x = signal.x * seq.x + signal.y * seq.y; - outputs[globalIdx].y = signal.y * seq.x - signal.x * seq.y; - } -} - -// 双精度共轭乘核函数 -// 一次计算一个序列和所有信号的共轭乘 -__global__ void ConjugateMultiplyKernelDouble( - const cufftDoubleComplex* __restrict__ signalDatas, - const cufftDoubleComplex* __restrict__ sequenceFFT, - cufftDoubleComplex* __restrict__ outputs, int signalLength, - int numChannels) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - - if (idx >= signalLength * numChannels) return; - - int pos = idx % signalLength; - - cufftDoubleComplex signal = signalDatas[idx]; - cufftDoubleComplex seq = sequenceFFT[pos]; - - // 计算 signal * conj(seq) - outputs[idx].x = signal.x * seq.x + signal.y * seq.y; // 实部 - outputs[idx].y = signal.y * seq.x - signal.x * seq.y; // 虚部 -} - -// 批处理:计算所有序列和信号的共轭乘 -__global__ void batchConjugateMultiplyKernelDouble( - const cufftDoubleComplex* __restrict__ signalDatas, - const cufftDoubleComplex* __restrict__ sequenceFFT, - cufftDoubleComplex* __restrict__ outputs, int signalLength, int numChannels, - int seqChannels) { - // 使用2D网格布局:x维度处理位置,y维度处理序列 - int pos = blockIdx.x * blockDim.x + 
threadIdx.x; - // 检查位置索引是否有效 - if (pos >= signalLength) return; - - int seqIdx = blockIdx.y; - if (seqIdx >= seqChannels) return; - - // 预计算常用偏移量 - const int signalOffset = pos; // 信号内存访问基础偏移 - const int sequenceOffset = seqIdx * signalLength + pos; - - // 预取序列FFT值(所有通道共享) - const cufftDoubleComplex seq = sequenceFFT[sequenceOffset]; - -// 处理所有通道 -#pragma unroll - for (int chIdx = 0; chIdx < numChannels; ++chIdx) { - // 计算当前通道的全局索引 - int globalIdx = - seqIdx * numChannels * signalLength + chIdx * signalLength + pos; - - // 读取信号数据(合并访问) - cufftDoubleComplex signal = - signalDatas[chIdx * signalLength + signalOffset]; - - // 计算共轭乘法 - outputs[globalIdx].x = signal.x * seq.x + signal.y * seq.y; - outputs[globalIdx].y = signal.y * seq.x - signal.x * seq.y; - } -} - -#ifdef USE_PEEKSKERNEL -template -T hypot(T x, T y) { - // 使用 sqrt 和 fabs 来计算欧几里得范数 - return sqrt(fabs(x) * fabs(x) + fabs(y) * fabs(y)); -} - -// peekMax的核函数:参考CPU侧旧的计算逻辑实现 -__global__ void CalculatePeaksKernelFloat( - const cufftComplex* __restrict__ d_conjResults, - const uint* __restrict__ d_seqLen, uint seqChannels, uint signalChannels, - uint signalLength, uint* __restrict__ d_vecPeaks) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= seqChannels * signalChannels) return; - - // 根据GPU线程id,计算相应的序列id:seq_id - int seq_id = idx / signalChannels; - uint sequenceLength = d_seqLen[seq_id]; - const uint num_elements = signalLength - sequenceLength; - if (num_elements == 0) { - return; - } - - float max_abs = -10000; - // int max_index = -1; - float total_abs = 0.0; - - // d_conjResults的维度:[seqChannels * signalChannels][signalLength] - // 每个GPU线程(线程idx),处理相应的d_conjResults[idx][signalLength] - const auto& CorrelationValue = d_conjResults + idx * signalLength; - -#pragma unroll - for (int i = 0; i < num_elements; ++i) { - const float abs_val = hypot(CorrelationValue[i].x, CorrelationValue[i].y); - total_abs += abs_val; - - if (abs_val > max_abs) { - max_abs = abs_val; - // max_index = i; - } - } 
- - const float avg_abs = total_abs / num_elements; - int res = ((max_abs > (avg_abs * 7)) ? 1 : 0); - d_vecPeaks[idx] = res; -} - -__global__ void CalculatePeaksKernelDouble( - const cufftDoubleComplex* __restrict__ d_conjResults, - const uint* __restrict__ d_seqLen, uint seqChannels, uint signalChannels, - uint signalLength, uint* __restrict__ d_vecPeaks) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= seqChannels * signalChannels) return; - - // 根据GPU线程id,计算相应的序列id:seq_id - int seq_id = idx / signalChannels; - uint sequenceLength = d_seqLen[seq_id]; - const uint num_elements = signalLength - sequenceLength; - if (num_elements == 0) { - return; - } - - double max_abs = -10000; - // int max_index = -1; - double total_abs = 0.0; - - // d_conjResults的维度:[seqChannels * signalChannels][signalLength] - // 每个GPU线程(线程idx),处理相应的d_conjResults[idx][signalLength] - const auto& CorrelationValue = d_conjResults + idx * signalLength; - -#pragma unroll - for (int i = 0; i < num_elements; ++i) { - const double abs_val = hypot(CorrelationValue[i].x, CorrelationValue[i].y); - total_abs += abs_val; - - if (abs_val > max_abs) { - max_abs = abs_val; - // max_index = i; - } - } - - const double avg_abs = total_abs / num_elements; - d_vecPeaks[idx] = ((max_abs > (avg_abs * 7)) ? 
1 : 0); -} -#endif - +// CUDACorrelation类构造函数 template CUDACorrelation::CUDACorrelation() : sigfftPlan_(0), @@ -260,7 +53,7 @@ CUDACorrelation::CUDACorrelation() stream_ = cudaStreamDefault; CHECK_CUDA_ERROR(cudaGetDeviceProperties(&deviceProp_, 0)); - // 检查设备是否支持零拷贝 + // 检查gpu设备是否支持零拷贝 CHECK_CUDA_ERROR(cudaDeviceGetAttribute(&canMapHostMemory, cudaDevAttrCanMapHostMemory, 0)); @@ -275,6 +68,7 @@ CUDACorrelation::CUDACorrelation() } } +// CUDACorrelation类析构函数 template CUDACorrelation::~CUDACorrelation() { Cleanup(); @@ -290,11 +84,15 @@ void CUDACorrelation::FreeMemory() { cudaFreeAsync(d_sequence_fft, stream_); d_sequence_fft = nullptr; } + + // 释放signals的零拷贝内存 if (h_signals) { cudaFreeHost(h_signals); h_signals = nullptr; d_signals = nullptr; } + + // 释放sequence的零拷贝内存 if (h_sequence) { cudaFreeHost(h_sequence); h_sequence = nullptr; @@ -314,20 +112,34 @@ void CUDACorrelation::FreeMemory() { h_vecSeqLen = nullptr; d_vecSeqLen = nullptr; } - // peekmax核函数计算结果显存 -#ifdef CPU_NEED_PROCESS_PEEKMAXKERNEL_RESULT - if (h_vecPeaks) { - cudaFreeHost(h_vecPeaks); - h_vecPeaks = nullptr; - d_vecPeaks = nullptr; + + // 释放maxValue的零拷贝内存 + if (h_maxValue) { + cudaFreeHost(h_maxValue); + h_maxValue = nullptr; + d_maxValue = nullptr; } -#else - if (d_vecPeaks) { - cudaFreeAsync(d_vecPeaks, stream_); - d_vecPeaks = nullptr; + + // 释放maxValueIndex的零拷贝内存 + if (h_maxValueIndex) { + cudaFreeHost(h_maxValueIndex); + h_maxValueIndex = nullptr; + d_maxValueIndex = nullptr; + } + + // 释放maxValueIsValid的零拷贝内存 + if (h_maxValueIsValid) { + cudaFreeHost(h_maxValueIsValid); + h_maxValueIsValid = nullptr; + d_maxValueIsValid = nullptr; } -#endif + // 释放maxValueResults的零拷贝内存 + if (h_maxValueResults) { + cudaFreeHost(h_maxValueResults); + h_maxValueResults = nullptr; + d_maxValueResults = nullptr; + } #else if (h_results) { cudaFreeHost(h_results); @@ -361,7 +173,7 @@ void CUDACorrelation::Cleanup() { // 申请MappMemory(实现CPU与GPU的零拷贝:提升性能) template -bool CUDACorrelation::AllocMappMemory(void** 
host_ptr, void** device_ptr, +bool CUDACorrelation::AllocMappMemory(void **host_ptr, void **device_ptr, size_t size) { if (canMapHostMemory) { if (size == 0) { @@ -446,12 +258,12 @@ void CUDACorrelation::ComputeSequenceFFT(uint numSequence, uint fftLength) { if constexpr (std::is_same_v) { cufftStatus = cufftExecC2C( - seqfftPlan_, reinterpret_cast(d_sequence), - reinterpret_cast(d_sequence_fft), CUFFT_FORWARD); + seqfftPlan_, reinterpret_cast(d_sequence), + reinterpret_cast(d_sequence_fft), CUFFT_FORWARD); } else { cufftStatus = cufftExecZ2Z( - seqfftPlan_, reinterpret_cast(d_sequence), - reinterpret_cast(d_sequence_fft), CUFFT_FORWARD); + seqfftPlan_, reinterpret_cast(d_sequence), + reinterpret_cast(d_sequence_fft), CUFFT_FORWARD); } if (cufftStatus != CUFFT_SUCCESS) { @@ -478,11 +290,8 @@ void CUDACorrelation::ComputeSignalsFFT(uint numChannels, } uint signalsNum = numChannels * signalLength; - uint signalsSize = signalsNum * sizeof(Complex); - -#ifndef USE_PEEKSKERNEL + uint signalsSize = signalsNum * sizeof(CUDAComplex); signalsNum_ = signalsNum; -#endif // 共轭乘核函数线程配置 if (signalLength_ != signalLength) { @@ -533,16 +342,16 @@ void CUDACorrelation::ComputeSignalsFFT(uint numChannels, // 计算signals的fft if constexpr (std::is_same_v) { cufftStatus = cufftExecC2C( - sigfftPlan_, reinterpret_cast(d_signals), - reinterpret_cast(d_signals_fft), CUFFT_FORWARD); + sigfftPlan_, reinterpret_cast(d_signals), + reinterpret_cast(d_signals_fft), CUFFT_FORWARD); if (cufftStatus != CUFFT_SUCCESS) { throw std::runtime_error( "Failed to execute forward FFT on signalDatas"); } } else { cufftStatus = cufftExecZ2Z( - sigfftPlan_, reinterpret_cast(d_signals), - reinterpret_cast(d_signals_fft), CUFFT_FORWARD); + sigfftPlan_, reinterpret_cast(d_signals), + reinterpret_cast(d_signals_fft), CUFFT_FORWARD); if (cufftStatus != CUFFT_SUCCESS) { throw std::runtime_error( "Failed to execute forward FFT on signalDatas"); @@ -550,7 +359,7 @@ void CUDACorrelation::ComputeSignalsFFT(uint 
numChannels, } return; - } catch (const std::exception& e) { + } catch (const std::exception &e) { std::cerr << __FUNCTION__ << " CUDA error: " << e.what() << std::endl; cudaError_t lastError = cudaGetLastError(); @@ -566,8 +375,8 @@ void CUDACorrelation::ComputeSignalsFFT(uint numChannels, } /** - * 完成所有序列和信号的共轭乘、IFFT、peekmax计算 - * 计算所有序列sequenceFFT与signalsFFT的共轭乘、IFFT、peekMax等 + * 一次完成所有序列和信号的共轭乘、IFFT、peekMaxValue等计算 + * 计算所有序列sequenceFFT与signalsFFT的共轭乘、IFFT、peekMaxValue等 * 需预先计算sequence的fft * 需预先计算signals的fft * @@ -591,6 +400,7 @@ int CUDACorrelation::ComputeConjMul(void) { bool needNewPlan = (ifftPlan_ == 0) || (resultsSize != resultsSize_); #ifdef USE_PEEKSKERNEL + const int ResultsNumPerGroup = 11; // 使用 PeeksMaxKernel 核函数 if (d_vecSeqLen == nullptr) { std::cerr << __FUNCTION__ << " d_vecSeqLen ptr is null!" << std::endl; @@ -607,35 +417,56 @@ int CUDACorrelation::ComputeConjMul(void) { CHECK_CUDA_ERROR(cudaMallocAsync(&d_results, resultsSize_, stream_)); } - // 申请 PeeksMaxKernel 核函数 计算结果所需的显存 - uint vecPeaksSize = seqChannels_ * signalChannels_ * sizeof(uint); -#ifdef CPU_NEED_PROCESS_PEEKMAXKERNEL_RESULT - // 若cpu侧还需要进一步处理 - // 分配零拷贝内存,不需要显示调用cudaMemcpy 回传给cpu内存 - // 待GPU计算结束,则实现CPU侧相关逻辑,对 h_vecPeaks 进行进一步处理即可 - if (h_vecPeaks && (vecPeaksSize_ != vecPeaksSize || vecPeaksSize_ == 0)) { - cudaFreeHost(h_vecPeaks); - h_vecPeaks = nullptr; - } - if (h_vecPeaks == nullptr) { - vecPeaksSize_ = vecPeaksSize; - // 申请零拷贝内存,系统自动同步 - AllocMappMemory((void**)&h_vecPeaks, (void**)&d_vecPeaks, vecPeaksSize_); - } -#else - // 若cpu侧不需要进一步处理,则申请普通显存 - if (d_vecPeaks && (vecPeaksSize_ != vecPeaksSize || vecPeaksSize_ == 0)) { - if (d_vecPeaks) { - cudaFreeAsync(d_vecPeaks, stream_); - d_vecPeaks = nullptr; + uint totalChannels = seqChannels_ * signalChannels_; + if (totalChannels_ != totalChannels || totalChannels_ == 0) { + totalChannels_ = totalChannels; + + // 申请 maxValue 零拷贝内存 + if (h_maxValue) { + cudaFreeHost(h_maxValue); + h_maxValue = nullptr; + } + if (h_maxValue 
== nullptr) { + // 申请零拷贝内存,系统自动同步 + AllocMappMemory((void **)&h_maxValue, (void **)&d_maxValue, + totalChannels_ * sizeof(T)); } - } - if (!d_vecPeaks) { - vecPeaksSize_ = vecPeaksSize; - CHECK_CUDA_ERROR(cudaMallocAsync(&d_vecPeaks, vecPeaksSize_, stream_)); + // 申请 maxValueIndex 零拷贝内存 + if (h_maxValueIndex) { + cudaFreeHost(h_maxValueIndex); + h_maxValueIndex = nullptr; + } + if (h_maxValueIndex == nullptr) { + // 申请零拷贝内存,系统自动同步 + AllocMappMemory((void **)&h_maxValueIndex, (void **)&d_maxValueIndex, + totalChannels_ * sizeof(int)); + } + + // 申请 maxValueIsValid 零拷贝内存 + if (h_maxValueIsValid) { + cudaFreeHost(h_maxValueIsValid); + h_maxValueIsValid = nullptr; + } + if (h_maxValueIsValid == nullptr) { + // 申请零拷贝内存,系统自动同步 + AllocMappMemory((void **)&h_maxValueIsValid, (void **)&d_maxValueIsValid, + totalChannels_ * sizeof(int)); + } + + // 申请 maxValueResults 零拷贝内存: 8*11的矩阵数据 + if (h_maxValueResults) { + cudaFreeHost(h_maxValueResults); + h_maxValueResults = nullptr; + } + if (h_maxValueResults == nullptr) { + // 申请零拷贝内存,系统自动同步 + AllocMappMemory( + (void **)&h_maxValueResults, (void **)&d_maxValueResults, + totalChannels_ * ResultsNumPerGroup * sizeof(CUDAComplex)); + } } -#endif // CPU_NEED_PROCESS_PEEKMAXKERNEL_RESULT + #else // USE_PEEKSKERNEL // 不使用 PeeksMaxKernel 核函数 // 则需要把共轭乘IFFT结果copy回cpu内存,进行进一步处理 @@ -647,7 +478,7 @@ int CUDACorrelation::ComputeConjMul(void) { if (h_results == nullptr) { resultsSize_ = resultsSize; // 申请零拷贝内存,系统自动同步 - AllocMappMemory((void**)&h_results, (void**)&d_results, resultsSize_); + AllocMappMemory((void **)&h_results, (void **)&d_results, resultsSize_); } #endif // USE_PEEKSKERNEL @@ -682,61 +513,161 @@ int CUDACorrelation::ComputeConjMul(void) { } #ifdef USE_PEEKSKERNEL + const int groupNum = 8; // 后续处理:IFFT结果被分成8个一组,因此groupNum=8 // GPU计算peeksMax的线程数配置 dim3 peeksMax_block(signalChannels_); dim3 peeksMax_grid((seqChannels_ * signalChannels_ + peeksMax_block.x - 1) / peeksMax_block.x); + dim3 getMax_block(signalChannels_); + dim3 
getMax_grid( + ((seqChannels_ * signalChannels_ / groupNum) + getMax_block.x - 1) / + getMax_block.x); #endif if constexpr (std::is_same_v) { // 批量计算共轭乘 batchConjugateMultiplyKernelFloat<<>>( - reinterpret_cast(d_signals_fft), - reinterpret_cast(d_sequence_fft), - reinterpret_cast(d_results), signalLength_, + reinterpret_cast(d_signals_fft), + reinterpret_cast(d_sequence_fft), + reinterpret_cast(d_results), signalLength_, signalChannels_, seqChannels_); // 批量计算IFFT cufftStatus = cufftExecC2C( - ifftPlan_, reinterpret_cast(d_results), - reinterpret_cast(d_results), CUFFT_INVERSE); + ifftPlan_, reinterpret_cast(d_results), + reinterpret_cast(d_results), CUFFT_INVERSE); if (cufftStatus != CUFFT_SUCCESS) { throw std::runtime_error("Failed to execute inverse FFT"); } // GPU计算peeksMax #ifdef USE_PEEKSKERNEL - CalculatePeaksKernelFloat<<>>( - reinterpret_cast(d_results), d_vecSeqLen, seqChannels_, - signalChannels_, signalLength_, d_vecPeaks); + float thresholdRatio = 10.0; + ComputeMaxValueKernelFloat<<>>( + reinterpret_cast(d_results), d_vecSeqLen, + seqChannels_, signalChannels_, signalLength_, d_maxValue, + d_maxValueIndex, d_maxValueIsValid, thresholdRatio); + getGlobalMaxValueKernelFloat<<>>( + reinterpret_cast(d_results), seqChannels_, + signalChannels_, signalLength_, d_maxValue, d_maxValueIndex, + d_maxValueIsValid, + reinterpret_cast(d_maxValueResults)); + + // std::cout << "======test ComputeMaxValueKernelFloat=[8*12]=====" + // << std::endl; + // CUDAComplex *h_testData = nullptr; + // uint sigCh = 8; + // uint sigL = 12; + // h_testData = (CUDAComplex *)malloc((sigCh * sigL * + // sizeof(CUDAComplex))); + + // CUDAComplex *d_testData = nullptr; + // CHECK_CUDA_ERROR( + // cudaMalloc(&d_testData, (sigCh * sigL * sizeof(CUDAComplex)))); + + // for (int i = 0; i < sigCh; i++) { + // const auto &testData = h_testData + i * sigL; + // for (int j = 0; j < sigL; j++) { + // if (j == 5) { + // testData[j] = {(float)i * 100, (float)0}; + // } else if (j == sigL - 1) { 
+ // testData[j] = {(float)i, (float)0}; + // } else { + // testData[j] = {0.0, 0.0}; + // } + // } + // } + + // for (int i = 0; i < sigCh; i++) { + // const auto &testData = h_testData + i * sigL; + // for (int j = 0; j < sigL; j++) { + // std::cout << "(" << testData[j].x << "," << testData[j].y << ")" + // << "; "; + // } + // std::cout << std::endl; + // } + + // uint *h_SeqLen = nullptr; + // uint *d_SeqLen = nullptr; + // AllocMappMemory((void **)&h_SeqLen, (void **)&d_SeqLen, 1 * + // sizeof(uint)); h_SeqLen[0] = 0; + + // CHECK_CUDA_ERROR(cudaMemcpy(d_testData, h_testData, + // sigCh * sigL * sizeof(CUDAComplex), + // cudaMemcpyHostToDevice)); + + // CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); + + // dim3 test_block(sigCh); + // dim3 test_grid((1 * sigCh + test_block.x - 1) / test_block.x); + // ComputeMaxValueKernelFloat<<>>( + // reinterpret_cast(d_testData), d_SeqLen, 1, sigCh, + // sigL, d_maxValue, d_maxValueIndex, d_maxValueIsValid, + // thresholdRatio); + + // CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); + // for (int i = 0; i < sigCh; i++) { + // std::cout << "h_maxValue(" << h_maxValue[i] << ")" << std::endl; + // std::cout << "h_maxValueIndex(" << h_maxValueIndex[i] << ")" + // << std::endl; + // std::cout << "h_maxValueIsValid(" << h_maxValueIsValid[i] << ")" + // << std::endl; + // } + + // getGlobalMaxValueKernelFloat<<>>( + // reinterpret_cast(d_testData), 1, sigCh, sigL, + // d_maxValue, d_maxValueIndex, d_maxValueIsValid, + // reinterpret_cast(d_maxValueResults)); + + // for (int i = 0; i < sigCh; i++) { + // const auto &maxValueResults = h_maxValueResults + i * 11; + // for (int j = 0; j < 11; j++) { + // std::cout << "(" << maxValueResults[j].x << "," + // << maxValueResults[j].y << ")" + // << "; "; + // } + // std::cout << std::endl; + // } + + // free(h_testData); + // cudaFreeHost(h_SeqLen); #endif } else { // double类型 // 批量计算共轭乘 batchConjugateMultiplyKernelDouble<<>>( - reinterpret_cast(d_signals_fft), - 
reinterpret_cast(d_sequence_fft), - reinterpret_cast(d_results), signalLength_, + reinterpret_cast(d_signals_fft), + reinterpret_cast(d_sequence_fft), + reinterpret_cast(d_results), signalLength_, signalChannels_, seqChannels_); // 批量计算IFFT cufftStatus = cufftExecZ2Z( - ifftPlan_, reinterpret_cast(d_results), - reinterpret_cast(d_results), CUFFT_INVERSE); + ifftPlan_, reinterpret_cast(d_results), + reinterpret_cast(d_results), CUFFT_INVERSE); if (cufftStatus != CUFFT_SUCCESS) { throw std::runtime_error("Failed to execute inverse FFT"); } // GPU计算peeksMax #ifdef USE_PEEKSKERNEL - CalculatePeaksKernelDouble<<>>( - reinterpret_cast(d_results), d_vecSeqLen, - seqChannels_, signalChannels_, signalLength_, d_vecPeaks); + double thresholdRatio = 10.0; + ComputeMaxValueKernelDouble<<>>( + reinterpret_cast(d_results), d_vecSeqLen, + seqChannels_, signalChannels_, signalLength_, d_maxValue, + d_maxValueIndex, d_maxValueIsValid, thresholdRatio); + getGlobalMaxValueKernelDouble<<>>( + reinterpret_cast(d_results), seqChannels_, + signalChannels_, signalLength_, d_maxValue, d_maxValueIndex, + d_maxValueIsValid, + reinterpret_cast(d_maxValueResults)); #endif } CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); return 0; - } catch (const std::exception& e) { + } catch (const std::exception &e) { std::cerr << __FUNCTION__ << " CUDA error: " << e.what() << std::endl; cudaError_t lastError = cudaGetLastError(); @@ -753,7 +684,7 @@ int CUDACorrelation::ComputeConjMul(void) { template bool CUDACorrelation::ComputeNormalize(uint dataNum, T scale, - CUDAComplex* d_data) { + CUDAComplex *d_data) { // 1. 参数校验 if (dataNum == 0 || d_data == nullptr || scale == T(0)) { std::cerr << __FUNCTION__ << ": 无效参数" << std::endl; @@ -784,10 +715,10 @@ bool CUDACorrelation::ComputeNormalize(uint dataNum, T scale, // 3. 
执行归一化 if constexpr (std::is_same_v) { status = cublasCsscal(cublas_handle_, dataNum, &scale, - reinterpret_cast(d_data), 1); + reinterpret_cast(d_data), 1); } else { status = cublasZdscal(cublas_handle_, dataNum, &scale, - reinterpret_cast(d_data), 1); + reinterpret_cast(d_data), 1); } // 4. 错误处理 diff --git a/cuda_correlation.h b/cuda_correlation.h index 7b2079b0db0e1b1523ca9de6fbe0e978dcf34c5b..7ed59500077dcb4cce6d0be8501e0985d2d5f394 100644 --- a/cuda_correlation.h +++ b/cuda_correlation.h @@ -14,10 +14,6 @@ class CUDACorrelation { "Only float and double types are supported"); public: - using RealType = T; - using Complex = std::complex; - using Complex1D = std::vector; - using Complex2D = std::vector; using CUDAComplex = typename std::conditional::value, cufftComplex, cufftDoubleComplex>::type; @@ -51,8 +47,19 @@ class CUDACorrelation { bool ComputeNormalize(uint dataNum, T scale, CUDAComplex* d_data); #ifdef USE_PEEKSKERNEL - uint* h_vecPeaks = nullptr; - uint* d_vecPeaks = nullptr; + uint totalChannels_ = 0; // totalChannels_ = signalChannels_ * seqChannels_ + + T* h_maxValue = nullptr; + T* d_maxValue = nullptr; // 最大值 + + int* h_maxValueIndex = nullptr; + int* d_maxValueIndex = nullptr; // 最大值位置 + + int* h_maxValueIsValid = nullptr; + int* d_maxValueIsValid = nullptr; // 最大值是否有效 + + cufftComplex* h_maxValueResults = nullptr; + cufftComplex* d_maxValueResults = nullptr; // 8*11的矩阵数据 #endif CUDAComplex* h_signals = nullptr; @@ -72,12 +79,8 @@ class CUDACorrelation { uint signalChannels_ = 0; uint signalLength_ = 0; -#ifndef USE_PEEKSKERNEL - uint signalsNum_ = 0; // signalsNum_ = signalChannels_ * signalLength_ -#endif + uint signalsNum_ = 0; // signalsNum_ = signalChannels_ * signalLength_ uint signalsSize_ = 0; // signalsSize_ = signalsNum_ * sizeof(CUDAComplex) - - uint vecPeaksSize_ = 0; uint resultsSize_ = 0; // GPU设备是否可以进行内存map diff --git a/cuda_kernel.cu b/cuda_kernel.cu new file mode 100644 index 
0000000000000000000000000000000000000000..4f1781496bb99bfc7942c7db54e14d9682fbf3a8 --- /dev/null +++ b/cuda_kernel.cu @@ -0,0 +1,485 @@ +#include + +#include "cuda_kernel.h" + +template +T hypot(T x, T y) { + // 使用 sqrt 和 fabs 来计算欧几里得范数 + return sqrt(fabs(x) * fabs(x) + fabs(y) * fabs(y)); +} + +// 单精度共轭乘核函数 +// 一次计算一个序列和所有信号的共轭乘 +__global__ void ConjugateMultiplyKernelFloat( + const cufftComplex *__restrict__ signalDatas, + const cufftComplex *__restrict__ sequenceFFT, + cufftComplex *__restrict__ outputs, int signalLength, int numChannels) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx >= signalLength * numChannels) return; + + int pos = idx % signalLength; + + cufftComplex signal = signalDatas[idx]; + cufftComplex seq = sequenceFFT[pos]; + + // 计算 signal * conj(seq) + outputs[idx].x = signal.x * seq.x + signal.y * seq.y; // 实部 + outputs[idx].y = signal.y * seq.x - signal.x * seq.y; // 虚部 +} + +// 批处理:计算所有序列和信号的共轭乘 +__global__ void batchConjugateMultiplyKernelFloat( + const cufftComplex *__restrict__ signalDatas, + const cufftComplex *__restrict__ sequenceFFT, + cufftComplex *__restrict__ outputs, int signalLength, int numChannels, + int seqChannels) { + // 使用2D网格布局:x维度处理位置,y维度处理序列 + int pos = blockIdx.x * blockDim.x + threadIdx.x; + // 检查位置索引是否有效 + if (pos >= signalLength) return; + + int seqIdx = blockIdx.y; + if (seqIdx >= seqChannels) return; + + // 预计算常用偏移量 + const int signalOffset = pos; // 信号内存访问基础偏移 + const int sequenceOffset = seqIdx * signalLength + pos; + + // 预取序列FFT值(所有通道共享) + const cufftComplex seq = sequenceFFT[sequenceOffset]; + +// 处理所有通道 +#pragma unroll + for (int chIdx = 0; chIdx < numChannels; ++chIdx) { + // 计算当前通道的全局索引 + int globalIdx = + seqIdx * numChannels * signalLength + chIdx * signalLength + pos; + + // 读取信号数据(合并访问) + cufftComplex signal = signalDatas[chIdx * signalLength + signalOffset]; + + // 计算共轭乘法 + outputs[globalIdx].x = signal.x * seq.x + signal.y * seq.y; + outputs[globalIdx].y = signal.y * seq.x - signal.x * 
seq.y; + } +} + +// 双精度共轭乘核函数 +// 一次计算一个序列和所有信号的共轭乘 +__global__ void ConjugateMultiplyKernelDouble( + const cufftDoubleComplex *__restrict__ signalDatas, + const cufftDoubleComplex *__restrict__ sequenceFFT, + cufftDoubleComplex *__restrict__ outputs, int signalLength, + int numChannels) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx >= signalLength * numChannels) return; + + int pos = idx % signalLength; + + cufftDoubleComplex signal = signalDatas[idx]; + cufftDoubleComplex seq = sequenceFFT[pos]; + + // 计算 signal * conj(seq) + outputs[idx].x = signal.x * seq.x + signal.y * seq.y; // 实部 + outputs[idx].y = signal.y * seq.x - signal.x * seq.y; // 虚部 +} + +// 批处理:计算所有序列和信号的共轭乘 +__global__ void batchConjugateMultiplyKernelDouble( + const cufftDoubleComplex *__restrict__ signalDatas, + const cufftDoubleComplex *__restrict__ sequenceFFT, + cufftDoubleComplex *__restrict__ outputs, int signalLength, int numChannels, + int seqChannels) { + // 使用2D网格布局:x维度处理位置,y维度处理序列 + int pos = blockIdx.x * blockDim.x + threadIdx.x; + // 检查位置索引是否有效 + if (pos >= signalLength) return; + + int seqIdx = blockIdx.y; + if (seqIdx >= seqChannels) return; + + // 预计算常用偏移量 + const int signalOffset = pos; // 信号内存访问基础偏移 + const int sequenceOffset = seqIdx * signalLength + pos; + + // 预取序列FFT值(所有通道共享) + const cufftDoubleComplex seq = sequenceFFT[sequenceOffset]; + +// 处理所有通道 +#pragma unroll + for (int chIdx = 0; chIdx < numChannels; ++chIdx) { + // 计算当前通道的全局索引 + int globalIdx = + seqIdx * numChannels * signalLength + chIdx * signalLength + pos; + + // 读取信号数据(合并访问) + cufftDoubleComplex signal = + signalDatas[chIdx * signalLength + signalOffset]; + + // 计算共轭乘法 + outputs[globalIdx].x = signal.x * seq.x + signal.y * seq.y; + outputs[globalIdx].y = signal.y * seq.x - signal.x * seq.y; + } +} + +// peekMax的核函数:参考CPU侧旧的计算逻辑实现 +__global__ void CalculatePeaksKernelFloat( + const cufftComplex *__restrict__ d_conjResults, + const uint *__restrict__ d_seqLen, uint seqChannels, uint 
signalChannels, + uint signalLength, uint *__restrict__ d_vecPeaks) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= seqChannels * signalChannels) return; + + // 根据GPU线程id,计算相应的序列id:seq_id + int seq_id = idx / signalChannels; + uint sequenceLength = d_seqLen[seq_id]; + const uint num_elements = signalLength - sequenceLength; + if (num_elements == 0) { + return; + } + + float max_abs = -CUDA_INF_F; // float极小值 + float total_abs = 0.0; + + // d_conjResults的维度:[seqChannels * signalChannels][signalLength] + // 每个GPU线程(线程idx),处理相应的d_conjResults[idx][signalLength] + const auto &CorrelationValue = d_conjResults + idx * signalLength; + +#pragma unroll + for (int i = 0; i < num_elements; ++i) { + const float abs_val = hypot(CorrelationValue[i].x, CorrelationValue[i].y); + total_abs += abs_val; + + if (abs_val > max_abs) { + max_abs = abs_val; + } + } + + const float avg_abs = total_abs / num_elements; + int res = ((max_abs > (avg_abs * 7)) ? 1 : 0); + d_vecPeaks[idx] = res; +} + +// peekMax的核函数:参考CPU侧旧的计算逻辑实现 +__global__ void CalculatePeaksKernelDouble( + const cufftDoubleComplex *__restrict__ d_conjResults, + const uint *__restrict__ d_seqLen, uint seqChannels, uint signalChannels, + uint signalLength, uint *__restrict__ d_vecPeaks) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= seqChannels * signalChannels) return; + + // 根据GPU线程id,计算相应的序列id:seq_id + int seq_id = idx / signalChannels; + uint sequenceLength = d_seqLen[seq_id]; + const uint num_elements = signalLength - sequenceLength; + if (num_elements == 0) { + return; + } + + double max_abs = -CUDA_INF_D; // double极小值 + double total_abs = 0.0; + + // d_conjResults的维度:[seqChannels * signalChannels][signalLength] + // 每个GPU线程(线程idx),处理相应的d_conjResults[idx][signalLength] + const auto &CorrelationValue = d_conjResults + idx * signalLength; + +#pragma unroll + for (int i = 0; i < num_elements; ++i) { + const double abs_val = hypot(CorrelationValue[i].x, CorrelationValue[i].y); + total_abs 
+= abs_val;
+
+    if (abs_val > max_abs) {
+      max_abs = abs_val;
+    }
+  }
+
+  const double avg_abs = total_abs / num_elements;
+  d_vecPeaks[idx] = ((max_abs > (avg_abs * 7)) ? 1 : 0);
+}
+
+// ComputeMaxValueKernelFloat:
+// One thread per (sequence, signal) pair. For each row of
+// d_conjResults[seqChannels * signalChannels][signalLength] compute the
+// maximum magnitude, its index, and whether that peak is valid
+// (1: valid, 0: invalid). A peak is valid when it is not too close to
+// either end of the row and exceeds thresholdRatio times the average
+// magnitude of the ~1024 samples around it.
+__global__ void ComputeMaxValueKernelFloat(
+    const cufftComplex *__restrict__ d_conjResults,
+    const uint *__restrict__ d_seqLen, uint seqChannels, uint signalChannels,
+    uint signalLength, float *d_maxValue, int *d_maxValueIndex,
+    int *d_maxValueIsValid, float thresholdRatio = 10.0f) {
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= seqChannels * signalChannels) return;
+
+  // Map the GPU thread id to the corresponding sequence id.
+  int seq_id = idx / signalChannels;
+  uint sequenceLength = d_seqLen[seq_id];
+
+  // Clamp to 0 so the subtraction cannot wrap around when
+  // sequenceLength >= signalLength (both operands are unsigned).
+  const uint num_elements =
+      (signalLength > sequenceLength) ? signalLength - sequenceLength : 0u;
+  if (num_elements == 0) {
+    d_maxValue[idx] = 0.0f;
+    d_maxValueIndex[idx] = -1;
+    d_maxValueIsValid[idx] = 0;
+    return;
+  }
+
+  float max_abs = -CUDA_INF_F;  // smallest possible float
+  int max_index = -1;
+
+  // d_conjResults layout: [seqChannels * signalChannels][signalLength];
+  // each thread scans its own row d_conjResults[idx][0..num_elements).
+  const cufftComplex *CorrelationValue = d_conjResults + idx * signalLength;
+
+  // Scan magnitudes and record the maximum and its index.
+  // hypotf keeps the math in single precision (hypot would silently
+  // promote every sample to double).
+#pragma unroll
+  for (int i = 0; i < num_elements; ++i) {
+    const float abs_val = hypotf(CorrelationValue[i].x, CorrelationValue[i].y);
+    if (abs_val > max_abs) {
+      max_abs = abs_val;
+      max_index = i;
+    }
+  }
+
+  // Maximum value and its position.
+  d_maxValue[idx] = max_abs;
+  d_maxValueIndex[idx] = max_index;
+
+  // Reject peaks too close to either end of the row. The cast keeps the
+  // comparison signed so it cannot wrap when num_elements < 1000.
+  if ((max_index < 512) || (max_index > (int)num_elements - 1000)) {
+    d_maxValueIsValid[idx] = 0;  // invalid
+    return;
+  } else {
+    // Average roughly 1024 samples around the peak to judge its validity.
+    int startIdx = max_index - 24;
+    int stopIdx = min(max_index + 1000, (int)num_elements);
+    float total_abs = 0.0f;
+
+    // Sum of magnitudes of the samples around the peak.
+#pragma unroll
+    for (int i = startIdx; i < stopIdx; ++i) {
+      total_abs += hypotf(CorrelationValue[i].x, CorrelationValue[i].y);
+    }
+
+    // Mean magnitude of the window.
+    float avg_abs = total_abs / (stopIdx - startIdx);
+
+    // The peak is valid when it stands out by thresholdRatio (default 10).
+    d_maxValueIsValid[idx] = (max_abs > (avg_abs * thresholdRatio)) ? 1 : 0;
+  }
+  return;
+}
+
+// getGlobalMaxValueKernelFloat:
+// Post-processes ComputeMaxValueKernelFloat results. One thread per group of
+// groupNum (8) consecutive channels. Each thread finds the global peak across
+// its group, then extracts the 11 samples centred on that peak from every
+// channel of the group into d_maxValueResults, whose layout is
+// [(seqChannels * signalChannels) / 8][8][11].
+// NOTE(review): channels in a partial trailing group (when
+// seqChannels * signalChannels is not a multiple of 8) are never written —
+// confirm the caller guarantees divisibility by 8.
+__global__ void getGlobalMaxValueKernelFloat(
+    const cufftComplex *__restrict__ d_conjResults, uint seqChannels,
+    uint signalChannels, uint signalLength, float *d_maxValue,
+    int *d_maxValueIndex, int *d_maxValueIsValid,
+    cufftComplex *d_maxValueResults) {
+  const int groupNum = 8;
+  const int ResultsNumPerGroup = 11;
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= (seqChannels * signalChannels) / groupNum) return;
+
+  // A group is valid when at least one of its channels has a valid peak
+  // (d_maxValueIsValid: 0 = invalid, 1 = valid).
+  bool IsValid = false;
+#pragma unroll
+  for (int i = idx * groupNum; i < (idx + 1) * groupNum; i++) {
+    if (d_maxValueIsValid[i] == 1) {
+      IsValid = true;  // found a valid channel, stop searching
+      break;
+    }
+  }
+
+  // Invalid group: reset every output of the group to default values.
+  cufftComplex defaultVar = {-1.0f, -1.0f};  // real and imaginary = -1.0f
+  if (!IsValid) {
+#pragma unroll
+    for (int i = idx * groupNum; i < (idx + 1) * groupNum; i++) {
+      d_maxValue[i] = 0.0f;
+      d_maxValueIndex[i] = -1;
+
+      cufftComplex *maxValueResults =
+          d_maxValueResults + i * ResultsNumPerGroup;  // i * 11
+#pragma unroll
+      for (int j = 0; j < ResultsNumPerGroup; j++) {
+        maxValueResults[j] = defaultVar;
+      }
+    }
+
+    return;
+  }
+
+  float global_maxValue = -CUDA_INF_F;  // smallest possible float
+  int global_maxValueIndex = -1;
+
+  // Find the group-wide maximum and its index among the valid channels.
+#pragma unroll
+  for (int i = idx * groupNum; i < (idx + 1) * groupNum; i++) {
+    if ((d_maxValueIsValid[i] == 1) && (d_maxValue[i] > global_maxValue)) {
+      global_maxValue = d_maxValue[i];
+      global_maxValueIndex = d_maxValueIndex[i];
+    }
+  }
+
+  // Take 5 samples either side of the global peak: 11 samples in total.
+  // (startIdx >= 507 because a valid peak index is always >= 512.)
+  int startIdx = global_maxValueIndex - 5;
+  // int stopIdx = global_maxValueIndex + 5 + 1;
+
+#pragma unroll
+  for (int i = idx * groupNum; i < (idx + 1) * groupNum; i++) {
+    const cufftComplex *CorrelationValue =
+        d_conjResults + i * signalLength + startIdx;
+    cufftComplex *maxValueResults =
+        d_maxValueResults + i * ResultsNumPerGroup;  // i * 11
+#pragma unroll
+    for (int j = 0; j < ResultsNumPerGroup; j++) {
+      maxValueResults[j] = CorrelationValue[j];
+    }
+  }
+
+  return;
+}
+
+// ComputeMaxValueKernelDouble:
+// Double-precision counterpart of ComputeMaxValueKernelFloat. One thread per
+// (sequence, signal) pair: finds each row's maximum magnitude, its index,
+// and whether that peak is valid (1: valid, 0: invalid).
+__global__ void ComputeMaxValueKernelDouble(
+    const cufftDoubleComplex *__restrict__ d_conjResults,
+    const uint *__restrict__ d_seqLen, uint seqChannels, uint signalChannels,
+    uint signalLength, double *d_maxValue, int *d_maxValueIndex,
+    int *d_maxValueIsValid, double thresholdRatio = 10.0) {
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= seqChannels * signalChannels) return;
+
+  // Map the GPU thread id to the corresponding sequence id.
+  int seq_id = idx / signalChannels;
+  uint sequenceLength = d_seqLen[seq_id];
+
+  // Clamp to 0 so the subtraction cannot wrap around when
+  // sequenceLength >= signalLength (both operands are unsigned).
+  const uint num_elements =
+      (signalLength > sequenceLength) ? signalLength - sequenceLength : 0u;
+  if (num_elements == 0) {
+    d_maxValue[idx] = 0.0;
+    d_maxValueIndex[idx] = -1;
+    d_maxValueIsValid[idx] = 0;
+    return;
+  }
+
+  double max_abs = -CUDA_INF_D;  // smallest possible double
+  int max_index = -1;
+
+  // d_conjResults layout: [seqChannels * signalChannels][signalLength];
+  // each thread scans its own row d_conjResults[idx][0..num_elements).
+  const cufftDoubleComplex *CorrelationValue =
+      d_conjResults + idx * signalLength;
+
+  // Scan magnitudes and record the maximum and its index.
+#pragma unroll
+  for (int i = 0; i < num_elements; ++i) {
+    const double abs_val = hypot(CorrelationValue[i].x, CorrelationValue[i].y);
+    if (abs_val > max_abs) {
+      max_abs = abs_val;
+      max_index = i;
+    }
+  }
+
+  // Maximum value and its position.
+  d_maxValue[idx] = max_abs;
+  d_maxValueIndex[idx] = max_index;
+
+  // Reject peaks too close to either end of the row. The cast keeps the
+  // comparison signed so it cannot wrap when num_elements < 1000.
+  if ((max_index < 512) || (max_index > (int)num_elements - 1000)) {
+    d_maxValueIsValid[idx] = 0;  // invalid
+    return;
+  } else {
+    // Average roughly 1024 samples around the peak to judge its validity.
+    int startIdx = max_index - 24;
+    int stopIdx = min(max_index + 1000, (int)num_elements);
+    double total_abs = 0.0;
+
+    // Sum of magnitudes of the samples around the peak.
+#pragma unroll
+    for (int i = startIdx; i < stopIdx; ++i) {
+      total_abs += hypot(CorrelationValue[i].x, CorrelationValue[i].y);
+    }
+
+    // Mean magnitude of the window.
+    double avg_abs = total_abs / (stopIdx - startIdx);
+
+    // The peak is valid when it stands out by thresholdRatio (default 10).
+    d_maxValueIsValid[idx] = (max_abs > (avg_abs * thresholdRatio)) ? 1 : 0;
+  }
+  return;
+}
+
+// getGlobalMaxValueKernelDouble:
+// Post-processes ComputeMaxValueKernelDouble results. One thread per group of
+// groupNum (8) consecutive channels. Each thread finds the global peak across
+// its group, then extracts the 11 samples centred on that peak from every
+// channel of the group into d_maxValueResults, whose layout is
+// [(seqChannels * signalChannels) / 8][8][11].
+// NOTE(review): channels in a partial trailing group (when
+// seqChannels * signalChannels is not a multiple of 8) are never written —
+// confirm the caller guarantees divisibility by 8.
+__global__ void getGlobalMaxValueKernelDouble(
+    const cufftDoubleComplex *__restrict__ d_conjResults, uint seqChannels,
+    uint signalChannels, uint signalLength, double *d_maxValue,
+    int *d_maxValueIndex, int *d_maxValueIsValid,
+    cufftDoubleComplex *d_maxValueResults) {
+  const int groupNum = 8;
+  const int ResultsNumPerGroup = 11;
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= (seqChannels * signalChannels) / groupNum) return;
+
+  // A group is valid when at least one of its channels has a valid peak
+  // (d_maxValueIsValid: 0 = invalid, 1 = valid).
+  bool IsValid = false;
+#pragma unroll
+  for (int i = idx * groupNum; i < (idx + 1) * groupNum; i++) {
+    if (d_maxValueIsValid[i] == 1) {
+      IsValid = true;  // found a valid channel, stop searching
+      break;
+    }
+  }
+
+  // Invalid group: reset every output of the group to default values.
+  cufftDoubleComplex defaultVar = {-1.0, -1.0};  // real and imaginary = -1.0
+  if (!IsValid) {
+#pragma unroll
+    for (int i = idx * groupNum; i < (idx + 1) * groupNum; i++) {
+      d_maxValue[i] = 0.0;
+      d_maxValueIndex[i] = -1;
+
+      cufftDoubleComplex *maxValueResults =
+          d_maxValueResults + i * ResultsNumPerGroup;  // i * 11
+#pragma unroll
+      for (int j = 0; j < ResultsNumPerGroup; j++) {
+        maxValueResults[j] = defaultVar;
+      }
+    }
+
+    return;
+  }
+
+  double global_maxValue = -CUDA_INF_D;  // smallest possible double
+  int global_maxValueIndex = -1;
+
+  // Find the group-wide maximum and its index among the valid channels.
+#pragma unroll
+  for (int i = idx * groupNum; i < (idx + 1) * groupNum; i++) {
+    if ((d_maxValueIsValid[i] == 1) && (d_maxValue[i] > global_maxValue)) {
+      global_maxValue = d_maxValue[i];
+      global_maxValueIndex = d_maxValueIndex[i];
+    }
+  }
+
+  // Take 5 samples either side of the global peak: 11 samples in total.
+  // (startIdx >= 507 because a valid peak index is always >= 512.)
+  int startIdx = global_maxValueIndex - 5;
+  // int stopIdx = global_maxValueIndex + 5 + 1;
+
+#pragma unroll
+  for (int i = idx * groupNum; i < (idx + 1) * groupNum; i++) {
+    const cufftDoubleComplex *CorrelationValue =
+        d_conjResults + i * signalLength + startIdx;
+    cufftDoubleComplex *maxValueResults =
+        d_maxValueResults + i * ResultsNumPerGroup;  // i * 11
+#pragma unroll
+    for (int j = 0; j < ResultsNumPerGroup; j++) {
+      maxValueResults[j] = CorrelationValue[j];
+    }
+  }
+
+  return;
+}
diff --git a/cuda_kernel.h b/cuda_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..5768e946901ca00b686256a71281d8afe8967450
--- /dev/null
+++ b/cuda_kernel.h
@@ -0,0 +1,85 @@
+#ifndef CUDA_KERNEL_H
+#define CUDA_KERNEL_H
+
+#include <cuda_runtime.h>
+
+#include <cufft.h>
+
+// Header for the CUDA kernel functions.
+
+// IEEE-754 +infinity bit patterns, reinterpreted on the device.
+#define CUDA_INF_F __int_as_float(0x7f800000U)
+#define CUDA_INF_D __longlong_as_double(0x7ff0000000000000ULL)
+
+// Single-precision conjugate-multiply kernel:
+// multiplies one sequence with all signals in a single launch.
+__global__ void ConjugateMultiplyKernelFloat(
+    const cufftComplex* __restrict__ signalDatas,
+    const cufftComplex* __restrict__ sequenceFFT,
+    cufftComplex* __restrict__ outputs, int signalLength, int numChannels);
+// Batched version: multiplies all sequences with all signals in one launch.
+__global__ void batchConjugateMultiplyKernelFloat(
+    const cufftComplex* __restrict__ signalDatas,
+    const cufftComplex* __restrict__ sequenceFFT,
+    cufftComplex* __restrict__ outputs, int signalLength, int numChannels,
+    int seqChannels);
+
+// Double-precision conjugate-multiply kernel:
+// multiplies one sequence with all signals in a single launch.
+__global__ void ConjugateMultiplyKernelDouble(
+    const cufftDoubleComplex* __restrict__ signalDatas,
+    const cufftDoubleComplex* __restrict__ sequenceFFT,
+    cufftDoubleComplex* __restrict__ outputs, int signalLength,
+    int numChannels);
+// Batched version: multiplies all sequences with all signals in one launch.
+__global__ void batchConjugateMultiplyKernelDouble(
+    const cufftDoubleComplex* __restrict__ signalDatas,
+    const cufftDoubleComplex* __restrict__ sequenceFFT,
+    cufftDoubleComplex* __restrict__ outputs, int signalLength, int numChannels,
+    int seqChannels);
+
+// peekMax kernels (legacy version).
+__global__ void CalculatePeaksKernelFloat(
+    const cufftComplex* __restrict__ d_conjResults,
+    const uint* __restrict__ d_seqLen, uint seqChannels, uint signalChannels,
+    uint signalLength, uint* __restrict__ d_vecPeaks);
+__global__ void CalculatePeaksKernelDouble(
+    const cufftDoubleComplex* __restrict__ d_conjResults,
+    const uint* __restrict__ d_seqLen, uint seqChannels, uint signalChannels,
+    uint signalLength, uint* __restrict__ d_vecPeaks);
+
+// ComputeMaxValueKernelFloat:
+// For each row of d_conjResults[seqChannels * signalChannels][signalLength]
+// compute the maximum magnitude, its index, and validity (1: valid, 0: invalid).
+__global__ void ComputeMaxValueKernelFloat(
+    const cufftComplex* __restrict__ d_conjResults,
+    const uint* __restrict__ d_seqLen, uint seqChannels, uint signalChannels,
+    uint signalLength, float* d_maxValue, int* d_maxValueIndex,
+    int* d_maxValueIsValid, float thresholdRatio);
+
+// Post-processes ComputeMaxValueKernelFloat results,
+// producing the 8 * 11 data block d_maxValueResults:
+// d_maxValueResults[(seqChannels * signalChannels) / groupNum][groupNum][11]
+__global__ void getGlobalMaxValueKernelFloat(
+    const cufftComplex* __restrict__ d_conjResults, uint seqChannels,
+    uint signalChannels, uint signalLength, float* d_maxValue,
+    int* d_maxValueIndex, int* d_maxValueIsValid,
+    cufftComplex* d_maxValueResults);
+
+// ComputeMaxValueKernelDouble:
+// For each row of d_conjResults[seqChannels * signalChannels][signalLength]
+// compute the maximum magnitude, its index, and validity (1: valid, 0: invalid).
+__global__ void ComputeMaxValueKernelDouble(
+    const cufftDoubleComplex* __restrict__ d_conjResults,
+    const uint* __restrict__ d_seqLen, uint seqChannels, uint signalChannels,
+    uint signalLength, double* d_maxValue, int* d_maxValueIndex,
+    int* d_maxValueIsValid, double thresholdRatio);
+
+// Post-processes ComputeMaxValueKernelDouble results,
+// producing the 8 * 11 data block d_maxValueResults:
+// d_maxValueResults[(seqChannels * signalChannels) / groupNum][groupNum][11]
+__global__ void getGlobalMaxValueKernelDouble(
+    const cufftDoubleComplex* __restrict__ d_conjResults, uint seqChannels,
+    uint signalChannels, uint signalLength, double* d_maxValue,
+    int* d_maxValueIndex, int* d_maxValueIsValid,
+    cufftDoubleComplex* d_maxValueResults);
+#endif  // CUDA_KERNEL_H
diff --git a/mainwindow.cpp b/mainwindow.cpp
index 0d4c89bc5e2282d71a8afa666102079a16a57d61..81143458bc38b390c208d825d616d0a3b55dc88b 100644
--- a/mainwindow.cpp
+++ b/mainwindow.cpp
@@ -21,7 +21,7 @@ MainWindow::~MainWindow() {}
 
 void MainWindow::InitControlValues() {
   m_btnCalculate = new QPushButton(QStringLiteral("加载数据计算"), this);
-  basePath = "/../data1/";
+  basePath = "/../data/";
 }
 
 void MainWindow::InitUI() { setCentralWidget(m_btnCalculate); }
@@ -35,14 +35,14 @@ int MainWindow::CalculateRoutine(uint signalChannels, uint signalLength) {
 }
 
 void MainWindow::SlotCalculateClick() {
-  // QString strFileName =
-  //     "20250213105831_Detect_FreqList5720-5750MHz_Span15.36MHz_Point4096_2025-"
-  //     "02-13 10-58-31_0.dat";
+  QString strFileName =
+      "20250213105831_Detect_FreqList5720-5750MHz_Span15.36MHz_Point4096_2025-"
+      "02-13 10-58-31_0.dat";
   // QString strFileName =
   //     "20250416100611_Detect_FreqList2390-2480MHz_Span30.72MHz_Point16384_2025-04-16
   //     10-06-11_0_20.dat";
-  QString strFileName =
-      "20250909173329_Detect_FreqList5824-5844MHz_Span30.72_Point16384_2025-09-09 17-33-29_0_101.dat";
+  // QString strFileName =
+  //     "20250909173329_Detect_FreqList5824-5844MHz_Span30.72_Point16384_2025-09-09 17-33-29_0_101.dat";
 
   QString strIQFileName =
       QCoreApplication::applicationDirPath() + basePath + strFileName;