diff --git a/CMakeLists.txt b/CMakeLists.txt index 0671a7d4c44ce382a8f64d92b9c4586e6c964340..cbb7c0d4f77ef99929e26c1bb238eb8f97532447 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,8 +5,7 @@ cmake_policy(SET CMP0104 NEW) set(CMAKE_CXX_STANDARD 17) option(USE_CUDA "启用CUDA加速" ON) -option(USE_PRE_COMPUTE_MODE "启用预初始化sequence" OFF) -option(USE_OPT_CUDAMEMCPY "启用优化cudaMemcpy" OFF) +option(USE_PEEKSKERNEL "启用peekskernel计算" ON) if(USE_CUDA) add_compile_definitions(USE_CUDA) @@ -44,17 +43,12 @@ if(USE_CUDA) endif() endif() #if(USE_CUDA) -if(USE_CUDA AND USE_PRE_COMPUTE_MODE) - add_compile_definitions(USE_PRE_COMPUTE_MODE) -endif() - -if(USE_OPT_CUDAMEMCPY) - add_compile_definitions(USE_OPT_CUDAMEMCPY) +if(USE_PEEKSKERNEL) + add_compile_definitions(USE_PEEKSKERNEL) endif() message(STATUS "USE_CUDA=" ${USE_CUDA}) -message(STATUS "USE_PRE_COMPUTE_MODE=" ${USE_PRE_COMPUTE_MODE}) -message(STATUS "USE_OPT_CUDAMEMCPY=" ${USE_OPT_CUDAMEMCPY}) +message(STATUS "USE_PEEKSKERNEL=" ${USE_PEEKSKERNEL}) if(USE_CUDA) message(STATUS "CMAKE_CUDA_ARCHITECTURES=" ${CMAKE_CUDA_ARCHITECTURES}) endif() diff --git a/build.sh b/build.sh index d10cb49c4cd754395b5e766a52fae144c7197054..e2ef6e294369f847f357cf788857f4b72e8492bc 100755 --- a/build.sh +++ b/build.sh @@ -107,8 +107,7 @@ pushd ${BUILD_PATH} > /dev/null ${CMAKE} ../ -DCMAKE_BUILD_TYPE="${BUILD_MODE}" \ -DCMAKE_PREFIX_PATH=/usr/lib/x86_64-linux-gnu/qt5 \ -DUSE_CUDA=ON \ - -DUSE_PRE_COMPUTE_MODE=ON \ - -DUSE_OPT_CUDAMEMCPY=ON + -DUSE_PEEKSKERNEL=ON popd > /dev/null ${CMAKE} --build ${BUILD_PATH} -- -j${BUILD_JOBS} diff --git a/calculatemovingcorrelation.cpp b/calculatemovingcorrelation.cpp index d85c3dd4ff4f41e3e336a8403527d5599d052d75..5ed503e2222df5b858972dc6767cc40de90e20fd 100644 --- a/calculatemovingcorrelation.cpp +++ b/calculatemovingcorrelation.cpp @@ -1,6 +1,7 @@ #include #include +#include #include #include @@ -17,139 +18,118 @@ CalculateMovingCorrelation::CalculateMovingCorrelation() { CalculateMovingCorrelation::CalculateMovingCorrelation() {} #endif -int CalculateMovingCorrelation::CalMovingCorrlationRoutine( - const std::vector>> &vecsignal) { - const int numChannels = vecsignal.size(); - - qDebug() << __FUNCTION__ << "Signal processing start"; +CalculateMovingCorrelation::~CalculateMovingCorrelation() { + if (sequenceDatas_) { + free(sequenceDatas_); + sequenceDatas_ = nullptr; + } +} -#if defined(USE_CUDA) && defined(USE_PRE_COMPUTE_MODE) - // 预先计算所有batch的signals的fft值 - cudaCorrelation->ComputeSignalsFFT(vecsignal); +// 预计算sequence:初始化阶段预计算所有Sequence的FFT +void CalculateMovingCorrelation::ComputeAllSequence(uint fftLength) { +#if USE_CUDA + qDebug() << "Starting CUDA processing for compute all sequence fft"; + if (sequenceDatas_ == nullptr) { + std::cerr << __FUNCTION__ << " sequenceDatas_ ptr is null!" 
<< std::endl; + return; + } + cudaCorrelation->ComputeSequenceFFT(sequenceDatas_, numSequences_, fftLength); #endif +} - // 处理每个序列 +int CalculateMovingCorrelation::CalMovingCorrlationRoutine( + const cpuComplex *signalDatas, uint numChannels, uint signalLength) { int result = 0; - int numSequences = m_VecSequence.size(); - - for (int seqIdx = 0; seqIdx < numSequences; ++seqIdx) { - qDebug() << __FUNCTION__ << "seqIdx-----------" << seqIdx; - - std::vector>> correlationResults( - numChannels); - #if USE_CUDA - qDebug() << __FUNCTION__ << "Starting CUDA processing"; -#ifdef USE_PRE_COMPUTE_MODE - qDebug() << __FUNCTION__ - << "use pre compute FFT (sequenceFFT, signalsFFT) mode"; - try { - // 计算共轭乘 - correlationResults = cudaCorrelation->ComputeConjugateMultiply(seqIdx); - } catch (const std::exception &e) { - qDebug() << "CUDA processing error:" << e.what(); - throw; - } - qDebug() << "CUDA processing completed successfully"; -#else - auto &sequence = m_VecSequence[seqIdx]; - qDebug() << __FUNCTION__ << " sequence length: " << sequence.size(); - try { - correlationResults = cudaCorrelation->ProcessBatch(vecsignal, sequence); - } catch (const std::exception &e) { - qDebug() << "CUDA processing error:" << e.what(); - throw; - } - qDebug() << "CUDA processing completed successfully"; -#endif -#else -#pragma omp parallel for schedule(dynamic) - auto &sequence = m_VecSequence[seqIdx]; - std::cout << "Starting omp processing for sequence" << std::endl; - for (int channel = 0; channel < numChannels; ++channel) { - MovingCorrelation(vecsignal[channel], sequence, - correlationResults[channel]); - } -#endif - if (CalculatePeaks(correlationResults) == 1) { - result = 1; - // 找到后 直接退出循环 不需要继续循环后边的序列 - break; - } + // 预先计算所有batch的signals的fft值 + if (signalDatas == nullptr) { + std::cerr << __FUNCTION__ << " signalDatas ptr is null!" 
<< std::endl; + return 0; } - return result; -} + qDebug() << "Starting CUDA processing for compute all signals fft"; + cudaCorrelation->ComputeSignalsFFT(signalDatas, numChannels, signalLength); -int CalculateMovingCorrelation::CalMovingCorrlationRoutine( - QVector> &DownSamplingIData, - QVector> &DownSamplingQData) { - const int numChannels = DownSamplingIData.size(); - std::vector>> vecsignal(numChannels); - - // 填充每个通道的IQ数据 - for (int channel = 0; channel < numChannels; ++channel) { - const int dataLength = DownSamplingIData[channel].size(); - qDebug() << __FUNCTION__ << "dataLength:" << dataLength; - std::vector> channelData; - channelData.reserve(dataLength); - - for (int i = 0; i < dataLength; ++i) { - channelData.emplace_back(DownSamplingIData[channel][i], - DownSamplingQData[channel][i]); - } - qDebug() << __FUNCTION__ << "channelData.size:" << channelData.size(); - vecsignal[channel] = std::move(channelData); + qDebug() << __FUNCTION__ << "Starting CUDA processing for ComputeConjMul()"; + try { + // ComputeConjMul:函数完成共轭乘、IFFT、CalculatePeaks + result = cudaCorrelation->ComputeConjMul(); + } catch (const std::exception &e) { + qDebug() << "CUDA processing error:" << e.what(); + throw; } - return CalMovingCorrlationRoutine(vecsignal); + qDebug() << "CUDA processing completed successfully"; + +#ifndef USE_PEEKSKERNEL + if (CalculatePeaks(numSequences_) == 1) { + result = 1; + } +#endif +#endif + return result; } -int CalculateMovingCorrelation::CalculatePeaks( - std::vector>> &CorrelationResults) { +#ifndef USE_PEEKSKERNEL +int CalculateMovingCorrelation::CalculatePeaks(uint numSequences) { std::vector vecCaculatePeaks; - vecCaculatePeaks.reserve(CorrelationResults.size()); - - for (const auto &CorrelationValue : CorrelationResults) { - Real max_abs = -10000; - int max_index = -1; - Real total_abs = 0.0; - - const int num_elements = CorrelationValue.size(); + vecCaculatePeaks.reserve(cudaCorrelation->signalChannels_); + uint signalLength = cudaCorrelation->signalLength_; + int ret = 0; + for (int seqIdx = 0; seqIdx < numSequences; ++seqIdx) { + vecCaculatePeaks.clear(); + // 原始sequence的长度 + uint sequenceLength = cudaCorrelation->vecSequenceLength_[seqIdx]; + const uint num_elements = signalLength - sequenceLength; if (num_elements == 0) { return 0; } - for (int i = 0; i < num_elements; ++i) { - const Real abs_val = std::hypot(CorrelationValue.at(i).real(), - CorrelationValue.at(i).imag()); - total_abs += abs_val; + const auto &conj_results = + cudaCorrelation->cpu_results + seqIdx * cudaCorrelation->signalsNum_; - if (abs_val > max_abs) { - max_abs = abs_val; - max_index = i; + for (int res_i = 0; res_i < cudaCorrelation->signalChannels_; ++res_i) { + Real max_abs = -10000; + int max_index = -1; + Real total_abs = 0.0; + + const auto &CorrelationValue = conj_results + res_i * signalLength; + + for (int i = 0; i < num_elements; ++i) { + const Real abs_val = + std::hypot(CorrelationValue[i].x, CorrelationValue[i].y); + total_abs += abs_val; + + if (abs_val > max_abs) { + max_abs = abs_val; + max_index = i; + } } - } - const Real avg_abs = total_abs / num_elements; - // vecCaculateRes.push_back((max_abs > (avg_abs * 10)) ? 1 : 0); + const Real avg_abs = total_abs / num_elements; + // vecCaculateRes.push_back((max_abs > (avg_abs * 10)) ? 1 : 0); - vecCaculatePeaks.push_back( - (max_abs > (avg_abs * 7)) ? 1 - : 0); // 暂时以 峰值大于平均值的 7倍 作为判据 + vecCaculatePeaks.push_back( + (max_abs > (avg_abs * 7)) + ? 
1 + : 0); // 暂时以 峰值大于平均值的 7倍 作为判据 - qDebug() << " MaxValueIndex " << max_index << " Maxvalue " << max_abs - << " ValueAbsSumAverage " << avg_abs; - } + qDebug() << " MaxValueIndex " << max_index << " Maxvalue " << max_abs + << " ValueAbsSumAverage " << avg_abs; + } - qDebug() << __FUNCTION__ << "vecCaculatePeaks " << vecCaculatePeaks; + qDebug() << __FUNCTION__ << "vecCaculatePeaks " << vecCaculatePeaks; - return std::any_of(vecCaculatePeaks.begin(), vecCaculatePeaks.end(), - [](int val) { return val == 1; }) - ? 1 - : 0; + ret = std::any_of(vecCaculatePeaks.begin(), vecCaculatePeaks.end(), + [](int val) { return val == 1; }) + ? 1 + : 0; + if (ret) return ret; + } + return ret; } +#endif void CalculateMovingCorrelation::MovingCorrelation( std::vector> &SingleChannelData, @@ -236,18 +216,10 @@ void CalculateMovingCorrelation::MovingCorrelation( ifftResult.begin() + (fftLength - sequenceLength)); } -void CalculateMovingCorrelation::ComputeAllSequence(int fftLength) { -#if defined(USE_CUDA) && defined(USE_PRE_COMPUTE_MODE) - // 预计算sequence:初始化阶段预计算所有Sequence的FFT - qDebug() << "Starting CUDA processing for compute all sequence"; - cudaCorrelation->ComputeSequenceFFT(m_VecSequence, fftLength); -#endif -} - -void CalculateMovingCorrelation::LoadAllSequenceBin(QString basePath) { +void CalculateMovingCorrelation::LoadAllSequenceBin(QString basePath, + uint SamplePoint) { QString PathName = QCoreApplication::applicationDirPath() + basePath + "Sequence"; - qDebug() << __FUNCTION__ << "PathName" << PathName; QDir dir(PathName); if (!dir.exists()) { @@ -264,34 +236,38 @@ void CalculateMovingCorrelation::LoadAllSequenceBin(QString basePath) { return; } - m_VecSequence.clear(); + numSequences_ = list.size(); - int sizeoflist = list.size(); + // malloc申请空间,需手动free,防止内存泄漏(在析构函数中free) + if (sequenceDatas_ == nullptr) { + sequenceDatas_ = + (cpuComplex *)malloc(numSequences_ * SamplePoint * sizeof(cpuComplex)); + if (sequenceDatas_ == nullptr) { + std::cerr << __FUNCTION__ << " Memory allocation failed!" << std::endl; + return; + } + memset(sequenceDatas_, 0, numSequences_ * SamplePoint * sizeof(cpuComplex)); + } - for (int i = 0; i < sizeoflist; ++i) { + for (int i = 0; i < numSequences_; ++i) { QFileInfo fileInfo = list.at(i); - + cpuComplex *sequence = sequenceDatas_ + i * SamplePoint; if (!fileInfo.isDir()) { if (fileInfo.suffix() == "bin") { - ReadSequenceFile(PathName + "/" + fileInfo.fileName()); + ReadSequenceFile(PathName + "/" + fileInfo.fileName(), sequence, + cudaCorrelation->vecSequenceLength_); } } } - - qDebug() << __FUNCTION__ << "m_VecFileNameSequence.num " - << m_VecSequence.size(); - - for (int AllSequenceIndex = 0; AllSequenceIndex < m_VecSequence.size(); - AllSequenceIndex++) { - qDebug() << __FUNCTION__ << "m_VecFileNameSequence.Index " - << AllSequenceIndex; - qDebug() << __FUNCTION__ << "m_VecFileNameSequence size " - << m_VecSequence.at(AllSequenceIndex).size(); - } } -void CalculateMovingCorrelation::ReadSequenceFile(QString strFileName) { - qDebug() << __FUNCTION__ << strFileName; +void CalculateMovingCorrelation::ReadSequenceFile( + QString strFileName, cpuComplex *sequence, + std::vector &vecSequenceLength) { + if (sequence == nullptr) { + std::cerr << __FUNCTION__ << " sequence ptr is null!" 
<< std::endl; + return; + } std::ifstream inFile(strFileName.toStdString(), std::ios::in | std::ios::binary); // 二进制读方式打开 @@ -300,9 +276,6 @@ void CalculateMovingCorrelation::ReadSequenceFile(QString strFileName) { return; } - std::vector> VecComplex; - VecComplex.clear(); - inFile.seekg(0, std::ios::end); int len = inFile.tellg(); @@ -312,30 +285,30 @@ void CalculateMovingCorrelation::ReadSequenceFile(QString strFileName) { inFile.read(tempdata, len); QVector vecSequenceData; - BytesToDoubleInv(tempdata, len, vecSequenceData); + BytesToRealInv(tempdata, len, vecSequenceData); int sizeOfSequenceData = vecSequenceData.size(); + int SequenceLength = sizeOfSequenceData / 2; + vecSequenceLength.push_back(SequenceLength); - for (int index = 0; index < sizeOfSequenceData; index += 2) { - std::complex complexRI = {vecSequenceData[index], - vecSequenceData[index + 1]}; - VecComplex.push_back(complexRI); + for (int index = 0; index < SequenceLength; index++) { + cpuComplex data(vecSequenceData[index * 2], vecSequenceData[index * 2 + 1]); + sequence[index] = data; } - - m_VecSequence.push_back(VecComplex); } -void CalculateMovingCorrelation::BytesToDoubleInv(char *buf, int ReadFileLength, - QVector &VecReturn) { +void CalculateMovingCorrelation::BytesToRealInv(char *buf, int ReadFileLength, + QVector &VecReturn) { + int RealSize = sizeof(Real); for (int i = 0; i < ReadFileLength;) { Real real; Real imag; - memcpy(&real, buf + i, sizeof(Real)); - memcpy(&imag, buf + i + 8, sizeof(Real)); + memcpy(&real, buf + i, RealSize); + memcpy(&imag, buf + i + RealSize, RealSize); VecReturn.push_back(real); VecReturn.push_back(imag); - i += 16; + i += 2 * RealSize; } } diff --git a/calculatemovingcorrelation.h b/calculatemovingcorrelation.h index 2a11ff119b7ddb0ab1b3a87099b0ef97f3b2fc9c..69a31654c31c52c00804f93f1ef8face5852e9d3 100644 --- a/calculatemovingcorrelation.h +++ b/calculatemovingcorrelation.h @@ -15,51 +15,50 @@ // using Real = double; // 可随时替换为 float/double using Real = float; // 可随时替换为 float/double +using cpuComplex = std::complex; -// class CUDACorrelation; class CalculateMovingCorrelation { public: CalculateMovingCorrelation(); + ~CalculateMovingCorrelation(); // 初始时 读取 Sequence文件夹下的所有序列文件 保存到变量中 (仅一次) - void LoadAllSequenceBin(QString basePath); + void LoadAllSequenceBin(QString basePath, uint SamplePoint); // 计算所有序列的fft - void ComputeAllSequence(int fftLength); + void ComputeAllSequence(uint fftLength); // 计算滑动相关总流程 输入 8路 I数据 和 8路 Q数据 返回 1--找到相关峰 // 0--未找到相关峰 - int CalMovingCorrlationRoutine(QVector> &DownSamplingIData, - QVector> &DownSamplingQData); - - // 计算滑动相关总流程 输入 8路 I数据 和 8路 Q数据 返回 1--找到相关峰 - // 0--未找到相关峰 - int CalMovingCorrlationRoutine( - const std::vector>> &vecsignal); + int CalMovingCorrlationRoutine(const cpuComplex *signalDatas, + uint numChannels, uint signalLength); // 序列文件数据 - std::vector>> m_VecSequence; + cpuComplex *sequenceDatas_ = nullptr; + uint numSequences_ = 0; private: // 滑动相关函数 // 输入 一路数据的 IQ (组合成 复数) SingleChannelData // 输入 一个Sequence文件的 原始数据(组合成 复数) Sequence // 输出 相关数据CorrelationResults (复数) - void MovingCorrelation(std::vector> &SingleChannelData, - std::vector> &Sequence, - std::vector> &CorrelationResults); + void MovingCorrelation(std::vector &SingleChannelData, + std::vector &Sequence, + std::vector &CorrelationResults); // 计算是否有峰值(满足峰值条件) // 输入 前一步计算得到的 相关数据CorrelationResults // 返回值 1--找到相关峰 0--未找到相关峰 - int CalculatePeaks( - std::vector>> &CorrelationResults); +#ifndef USE_PEEKSKERNEL + int CalculatePeaks(uint numSequences); +#endif // 读取单个序列文件 - void 
ReadSequenceFile(QString strFileName); - // 字节转换成double - void BytesToDoubleInv(char *buf, int ReadFileLength, - QVector &VecReturn); + void ReadSequenceFile(QString strFileName, cpuComplex *sequence, + std::vector &vecSequenceLength); + + // 字节转换成Real(float/double) + void BytesToRealInv(char *buf, int ReadFileLength, QVector &VecReturn); #if USE_CUDA CUDACorrelation *cudaCorrelation; diff --git a/cuda_correlation.cu b/cuda_correlation.cu index 2a5823980f11be9c500a0c7934da2c47bdb5a8ac..0b946e591b5499088e7583e1ea8fd237a0497dc7 100644 --- a/cuda_correlation.cu +++ b/cuda_correlation.cu @@ -1,6 +1,7 @@ #include #include #include +#include // 包含 sqrt 函数 #include #include @@ -13,14 +14,15 @@ using namespace std; template class CUDACorrelation; // 明确告诉编译器生成float特例 template class CUDACorrelation; // 明确告诉编译器生成double特例 -#define CHECK_CUDA_ERROR(call) \ - do { \ - cudaError_t err = call; \ - if (err != cudaSuccess) { \ - std::cerr << "CUDA error in " << __FILE__ << ":" << __LINE__ << ": " \ - << cudaGetErrorString(err) << std::endl; \ - throw std::runtime_error("CUDA error"); \ - } \ +#define CHECK_CUDA_ERROR(call) \ + do { \ + cudaError_t err = call; \ + if (err != cudaSuccess) { \ + std::cerr << __FUNCTION__ << " CUDA error in " << __FILE__ << ":" \ + << __LINE__ << ": " << cudaGetErrorString(err) << std::endl; \ + throw std::runtime_error("CUDA error"); \ + Cleanup(); \ + } \ } while (0) // 获取对应的FFT类型 @@ -76,9 +78,166 @@ __global__ void batchConjugateMultiplyKernelDouble( outputs[idx].y = signal.y * seq.x - signal.x * seq.y; // 虚部 } +#ifdef USE_PEEKSKERNEL +template +T hypot(T x, T y) { + // 使用 sqrt 和 fabs 来计算欧几里得范数 + return sqrt(fabs(x) * fabs(x) + fabs(y) * fabs(y)); +} + +__global__ void CalculatePeaksKernelFloat(const cufftComplex* d_results, + const uint* d_seqLen, + uint seqChannels, uint signalChannels, + uint signalLength, uint* d_vecPeaks) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= seqChannels * signalChannels) return; + + // 根据GPU线程id,计算相应的序列id:seq_id + int seq_id = idx / signalChannels; + uint sequenceLength = d_seqLen[seq_id]; + const uint num_elements = signalLength - sequenceLength; + if (num_elements == 0) { + return; + } + + float max_abs = -10000; + // int max_index = -1; + float total_abs = 0.0; + + // d_results的维度:[seqChannels * signalChannels][signalLength] + // 每个GPU线程(线程idx),处理相应的d_results[idx][signalLength] + const auto& CorrelationValue = d_results + idx * signalLength; + + for (int i = 0; i < num_elements; ++i) { + const float abs_val = hypot(CorrelationValue[i].x, CorrelationValue[i].y); + total_abs += abs_val; + + if (abs_val > max_abs) { + max_abs = abs_val; + // max_index = i; + } + } + + const float avg_abs = total_abs / num_elements; + int res = ((max_abs > (avg_abs * 7)) ? 
1 : 0); + d_vecPeaks[idx] = res; +} + +__global__ void CalculatePeaksKernelDouble( + const cufftDoubleComplex* d_results, const uint* d_seqLen, uint seqChannels, + uint signalChannels, uint signalLength, uint* d_vecPeaks) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= seqChannels * signalChannels) return; + + // 根据GPU线程id,计算相应的序列id:seq_id + int seq_id = idx / signalChannels; + uint sequenceLength = d_seqLen[seq_id]; + const uint num_elements = signalLength - sequenceLength; + if (num_elements == 0) { + return; + } + + double max_abs = -10000; + // int max_index = -1; + double total_abs = 0.0; + + // d_results的维度:[seqChannels * signalChannels][signalLength] + // 每个GPU线程(线程idx),处理相应的d_results[idx][signalLength] + const auto& CorrelationValue = d_results + idx * signalLength; + + for (int i = 0; i < num_elements; ++i) { + const double abs_val = hypot(CorrelationValue[i].x, CorrelationValue[i].y); + total_abs += abs_val; + + if (abs_val > max_abs) { + max_abs = abs_val; + // max_index = i; + } + } + + const double avg_abs = total_abs / num_elements; + d_vecPeaks[idx] = ((max_abs > (avg_abs * 7)) ? 1 : 0); +} + +// std::map& +// CalculateMovingCorrelation::CalculatePeaksMaxKernelFloat( +// std::vector>>& CorrelationResults, +// std::string SequenceName, int multiple) { +// int sizeofResult = CorrelationResults.size(); +// int sizeofDataGroup = sizeofResult / 8; + +// std::vector vecTestCheck; +// vecTestCheck.reserve(sizeofResult); + +// for (int i = 0; i < sizeofDataGroup; i++) { +// int groupstartindex = i * 8; +// int groupstopIndex = groupstartindex + 8; + +// DataMaxinfostruct datamaxinfo; +// datamaxinfo.maxvaluePos.reserve(8); +// datamaxinfo.maxvalue.reserve(8); +// datamaxinfo.sequencyName = SequenceName; + +// for (int j = groupstartindex; j < groupstopIndex; j++) { +// auto CorrelationValue = CorrelationResults[j]; +// double max_abs = -10000; +// int max_index = -1; +// double total_abs = 0.0; + +// const int num_elements = CorrelationValue.size(); + +// if (num_elements == 0) continue; + +// for (int i = 0; i < num_elements; ++i) { +// const double abs_val = +// std::hypot(CorrelationValue[i].real(), +// CorrelationValue[i].imag()); +// // total_abs += abs_val; +// if (abs_val > max_abs) { +// max_abs = abs_val; +// max_index = i; +// } +// } + +// datamaxinfo.maxvalue.push_back(max_abs); +// datamaxinfo.maxvaluePos.push_back(max_index); + +// if (max_index < 512) continue; + +// if ((max_index + 1000) > (num_elements - 512)) { +// continue; +// } + +// int startindex = max_index - 24; +// int stopindex = max_index + 1000; +// int totalPoint = stopindex - startindex; + +// for (int j = startindex; j < stopindex; j++) { +// const auto& val = CorrelationValue[j]; +// total_abs += std::hypot(val.real(), val.imag()); +// } + +// const double avg_abs = total_abs / totalPoint; + +// int value = (max_abs > (avg_abs * multiple)) ? 
1 : 0; + +// if (value == 1) { +// auto itrfind = m_mapResultMaxinfoAndSequenceName.find(i); +// if (itrfind == m_mapResultMaxinfoAndSequenceName.end()) { +// m_mapResultMaxinfoAndSequenceName[i] = datamaxinfo; +// } +// } +// } +// } + +// return m_mapResultMaxinfoAndSequenceName; +// } + +#endif + // 动态计算最优配置 template -void CUDACorrelation::configureKernel(int dataSize) { +void CUDACorrelation::configureKernel(uint dataSize) { int minGridSize, bestBlockSize; if constexpr (std::is_same_v) { @@ -101,14 +260,22 @@ CUDACorrelation::CUDACorrelation() d_signals(nullptr), d_sequence(nullptr), d_results(nullptr), - h_results(nullptr), signalLength_(0), - numChannels_(0), + signalChannels_(0), signalsSize_(0), sequenceSize_(0), fftLength_(0) { + seqChannels_ = 0; signalsNum_ = 0; - cublas_handle_ = nullptr; + +#ifdef USE_PEEKSKERNEL + h_vecPeaks = nullptr; + d_vecPeaks = nullptr; + d_vecSeqLen = nullptr; +#else + cpu_results = nullptr; +#endif + CHECK_CUDA_ERROR(cudaStreamCreate(&stream_)); CHECK_CUDA_ERROR(cudaGetDeviceProperties(&deviceProp_, 0)); } @@ -116,22 +283,19 @@ CUDACorrelation::CUDACorrelation() template CUDACorrelation::~CUDACorrelation() { Cleanup(); - cudaStreamSynchronize(stream_); + cudaDeviceSynchronize(); cudaStreamDestroy(stream_); } template void CUDACorrelation::Cleanup() { - FreeMemory(); - if (fftPlan_) { cufftDestroy(fftPlan_); fftPlan_ = 0; } - if (cublas_handle_ != nullptr) { - cublasDestroy(cublas_handle_); - cublas_handle_ = nullptr; - } + FreeMemory(); + + cudaStreamSynchronize(stream_); } template @@ -148,55 +312,47 @@ void CUDACorrelation::FreeMemory() { cudaFreeAsync(d_results, stream_); d_results = nullptr; } - if (h_results) { - cudaFreeHost(h_results); - h_results = nullptr; - } -} - -template -bool CUDACorrelation::AllocateMemory(int length, int channels) { - size_t size = length * channels * sizeof(CUDAComplex); - - CHECK_CUDA_ERROR(cudaMallocAsync(&d_signals, size, stream_)); - CHECK_CUDA_ERROR( - cudaMallocAsync(&d_sequence, length * sizeof(CUDAComplex), stream_)); - CHECK_CUDA_ERROR(cudaMallocAsync(&d_results, size, stream_)); - - return true; -} -template -void CUDACorrelation::smartAllocPinned(CUDAComplex** pHost, size_t size) { - if (deviceProp_.major >= 8) { - CHECK_CUDA_ERROR(cudaHostAlloc( - pHost, size, cudaHostAllocMapped | cudaHostAllocWriteCombined)); - } else { - CHECK_CUDA_ERROR(cudaMallocHost(pHost, size)); +#ifdef USE_PEEKSKERNEL + if (d_vecSeqLen) { + cudaFreeAsync(d_vecSeqLen, stream_); + d_vecSeqLen = nullptr; + } + if (d_vecPeaks) { + cudaFreeAsync(d_vecPeaks, stream_); + d_vecPeaks = nullptr; + } + if (h_vecPeaks) { + free(h_vecPeaks); + h_vecPeaks = nullptr; + } +#else + if (cpu_results) { + free(cpu_results); + cpu_results = nullptr; } +#endif } template -void CUDACorrelation::ComputeSequenceFFT(Complex2D VecSequence, - int fftLength) { - CHECK_CUDA_ERROR(cudaDeviceSynchronize()); - Cleanup(); - +void CUDACorrelation::ComputeSequenceFFT(const Complex2D& VecSequence, + uint fftLength) { + uint numSequence = VecSequence.size(); fftLength_ = fftLength; - size_t numSequence = VecSequence.size(); - size_t sequenceSize = numSequence * fftLength * sizeof(CUDAComplex); + seqChannels_ = numSequence; + uint sequenceSize = numSequence * fftLength * sizeof(CUDAComplex); // 扁平化数据并填充零 std::vector hSequenceFlat(numSequence * fftLength, CUDAComplex{0.0, 0.0}); vecSequenceLength_.reserve(numSequence); - std::vector tmpLengths(numSequence); // 临时存储 + std::vector tmpLengths(numSequence); // 临时存储 #pragma omp parallel for num_threads(numSequence) { - for 
(size_t i = 0; i < numSequence; ++i) { + for (int i = 0; i < numSequence; ++i) { tmpLengths[i] = VecSequence[i].size(); // 无竞争写入 - int minlen = std::min(tmpLengths[i], fftLength); + uint minlen = std::min(tmpLengths[i], fftLength); memcpy(hSequenceFlat.data() + i * fftLength, VecSequence[i].data(), minlen * sizeof(Complex)); } @@ -204,16 +360,40 @@ void CUDACorrelation::ComputeSequenceFFT(Complex2D VecSequence, vecSequenceLength_.assign(tmpLengths.begin(), tmpLengths.end()); // 分配并拷贝数据到显存 + if (d_sequence) { + cudaFreeAsync(d_sequence, stream_); + CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); + d_sequence = nullptr; + } CHECK_CUDA_ERROR(cudaMallocAsync(&d_sequence, sequenceSize, stream_)); - CHECK_CUDA_ERROR(cudaMemcpy(d_sequence, hSequenceFlat.data(), sequenceSize, - cudaMemcpyHostToDevice)); + CHECK_CUDA_ERROR(cudaMemcpyAsync(d_sequence, hSequenceFlat.data(), + sequenceSize, cudaMemcpyHostToDevice, + stream_)); +#ifdef USE_PEEKSKERNEL + if (d_vecSeqLen) { + cudaFreeAsync(d_vecSeqLen, stream_); + CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); + d_vecSeqLen = nullptr; + } + CHECK_CUDA_ERROR( + cudaMallocAsync(&d_vecSeqLen, numSequence * sizeof(uint), stream_)); + CHECK_CUDA_ERROR(cudaMemcpyAsync(d_vecSeqLen, vecSequenceLength_.data(), + numSequence * sizeof(uint), + cudaMemcpyHostToDevice, stream_)); +#endif + + CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); + // 创建并执行FFT cufftResult cufftStatus; cufftHandle fftPlan; cufftStatus = cufftPlan1d(&fftPlan, fftLength, getFFTType(), numSequence); if (cufftStatus != CUFFT_SUCCESS) { - throw std::runtime_error("Failed to create CUFFT plan"); + std::cerr << __FUNCTION__ << " Failed to create CUFFT plan" << std::endl; + return; } + cufftSetStream(fftPlan, stream_); + if constexpr (std::is_same_v) { cufftStatus = cufftExecC2C( fftPlan, reinterpret_cast(d_sequence), @@ -224,92 +404,130 @@ void CUDACorrelation::ComputeSequenceFFT(Complex2D VecSequence, reinterpret_cast(d_sequence), CUFFT_FORWARD); } + cufftDestroy(fftPlan); if (cufftStatus != CUFFT_SUCCESS) { - cufftDestroy(fftPlan); - throw std::runtime_error("Failed to execute forward FFT"); + std::cerr << __FUNCTION__ << " Failed to execute forward FFT" << std::endl; + return; } +} -#ifndef USE_OPT_CUDAMEMCPY - // 分配主机内存并拷贝结果 - CUDAComplex* CompData = (CUDAComplex*)malloc(sequenceSize); - CHECK_CUDA_ERROR( - cudaMemcpy(CompData, d_sequence, sequenceSize, cudaMemcpyDeviceToHost)); +// 调用该接口需要先初始化vecSequenceLength_ +template +void CUDACorrelation::ComputeSequenceFFT(const Complex* sequenceDatas, + uint numSequence, uint fftLength) { + fftLength_ = fftLength; + seqChannels_ = numSequence; + uint sequenceSize = numSequence * fftLength * sizeof(CUDAComplex); - size_t sequenceFFTSize = fftLength * sizeof(CUDAComplex); - std::vector vSequence(fftLength); -#pragma omp parallel for num_threads(numSequence) - { - for (int i = 0; i < numSequence; ++i) { - memcpy(vSequence.data(), CompData + i * fftLength, sequenceFFTSize); - vecSequenceFFT_.emplace_back(vSequence); - } + // 分配并拷贝sequence数据到显存d_sequence + if (d_sequence) { + cudaFreeAsync(d_sequence, stream_); + CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); + d_sequence = nullptr; + } + CHECK_CUDA_ERROR(cudaMallocAsync(&d_sequence, sequenceSize, stream_)); + CHECK_CUDA_ERROR(cudaMemcpyAsync(d_sequence, sequenceDatas, sequenceSize, + cudaMemcpyHostToDevice, stream_)); +#ifdef USE_PEEKSKERNEL + if (d_vecSeqLen) { + cudaFreeAsync(d_vecSeqLen, stream_); + CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); + d_vecSeqLen = nullptr; + } + // 需要先初始化 
vecSequenceLength_ + // 分配并拷贝vecSequenceLength_数据到显存d_vecSeqLen + if (seqChannels_ > 0) { + CHECK_CUDA_ERROR( + cudaMallocAsync(&d_vecSeqLen, numSequence * sizeof(uint), stream_)); + CHECK_CUDA_ERROR(cudaMemcpyAsync(d_vecSeqLen, vecSequenceLength_.data(), + seqChannels_ * sizeof(uint), + cudaMemcpyHostToDevice, stream_)); } - free(CompData); - cudaFreeAsync(d_sequence, stream_); - d_sequence = nullptr; #endif + CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); + + // 创建并执行FFT + cufftResult cufftStatus; + cufftHandle fftPlan; + cufftStatus = cufftPlan1d(&fftPlan, fftLength, getFFTType(), seqChannels_); + if (cufftStatus != CUFFT_SUCCESS) { + std::cerr << __FUNCTION__ << " Failed to create CUFFT plan" << std::endl; + return; + } + cufftSetStream(fftPlan, stream_); + + if constexpr (std::is_same_v) { + cufftStatus = cufftExecC2C( + fftPlan, reinterpret_cast(d_sequence), + reinterpret_cast(d_sequence), CUFFT_FORWARD); + } else { + cufftStatus = cufftExecZ2Z( + fftPlan, reinterpret_cast(d_sequence), + reinterpret_cast(d_sequence), CUFFT_FORWARD); + } + cufftDestroy(fftPlan); + if (cufftStatus != CUFFT_SUCCESS) { + std::cerr << __FUNCTION__ << " Failed to execute forward FFT" << std::endl; + return; + } } -// 预计算一个batch的signals的FFT +// 预先计算signals的fft template void CUDACorrelation::ComputeSignalsFFT(const Complex2D& signalDatas) { - size_t numChannels = signalDatas.size(); - size_t signalLength = signalDatas[0].size(); - size_t signalsNum = numChannels * signalLength; - size_t signalsSize = signalsNum * sizeof(Complex); + uint numChannels = signalDatas.size(); + uint signalLength = signalDatas[0].size(); + uint signalsNum = numChannels * signalLength; + uint signalsSize = signalsNum * sizeof(Complex); signalsNum_ = signalsNum; configureKernel(signalsNum_); // 分配设备侧显存(优化:复用之前的显存,避免重复的显存分配和释放,带来的性能损失) - if ((signalsSize_ == 0) || (d_signals == nullptr) || (d_results == nullptr) || - (h_results == nullptr)) { + if ((signalsSize_ == 0) || (d_signals == nullptr) || (d_results == nullptr)) { signalsSize_ = signalsSize; if (d_signals) { cudaFreeAsync(d_signals, stream_); + CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); d_signals = nullptr; } if (d_results) { cudaFreeAsync(d_results, stream_); + CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); d_results = nullptr; } - if (h_results) { - cudaFreeHost(h_results); - h_results = nullptr; - } + CHECK_CUDA_ERROR(cudaMallocAsync(&d_signals, signalsSize_, stream_)); - CHECK_CUDA_ERROR(cudaMallocAsync(&d_results, signalsSize_, stream_)); - smartAllocPinned(&h_results, signalsSize_); + CHECK_CUDA_ERROR( + cudaMallocAsync(&d_results, seqChannels_ * signalsSize_, stream_)); } else { if (signalsSize_ != signalsSize) { signalsSize_ = signalsSize; if (d_signals) { cudaFreeAsync(d_signals, stream_); + CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); d_signals = nullptr; } if (d_results) { cudaFreeAsync(d_results, stream_); + CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); d_results = nullptr; } - if (h_results) { - cudaFreeHost(h_results); - h_results = nullptr; - } + CHECK_CUDA_ERROR(cudaMallocAsync(&d_signals, signalsSize_, stream_)); - CHECK_CUDA_ERROR(cudaMallocAsync(&d_results, signalsSize_, stream_)); - smartAllocPinned(&h_results, signalsSize_); + CHECK_CUDA_ERROR( + cudaMallocAsync(&d_results, seqChannels_ * signalsSize_, stream_)); } } try { -#ifdef USE_OPT_CUDAMEMCPY // 扁平化数据:二维转一维(二维vector内存不连续,不能直接copy到显存) Complex1D hSignalsFlat(signalsNum, Complex{0.0, 0.0}); - size_t copySize = signalLength * sizeof(Complex); + uint copySize = signalLength * 
sizeof(Complex); #pragma omp parallel for num_threads(numChannels) { - for (size_t i = 0; i < numChannels; ++i) { + for (uint i = 0; i < numChannels; ++i) { memcpy(hSignalsFlat.data() + i * signalLength, // 目标地址偏移 signalDatas[i].data(), // 源数据起始点 copySize // 拷贝字节数 @@ -318,51 +536,42 @@ void CUDACorrelation::ComputeSignalsFFT(const Complex2D& signalDatas) { } // 拷贝数据到显存:CPU->GPU - CHECK_CUDA_ERROR(cudaMemcpy(d_signals, hSignalsFlat.data(), signalsSize, - cudaMemcpyHostToDevice)); -#else - // 循环copy:将所有通道数据一次性拷贝到GPU - size_t copySize = signalLength * sizeof(CUDAComplex); -#pragma omp parallel for num_threads(numChannels) - { - for (int i = 0; i < numChannels; i++) { - CUDAComplex* dst1 = d_signals + i * signalLength; - const void* src1 = signalDatas[i].data(); - - CHECK_CUDA_ERROR( - cudaMemcpy(dst1, src1, copySize, cudaMemcpyHostToDevice)); - } - } -#endif + CHECK_CUDA_ERROR(cudaMemcpyAsync(d_signals, hSignalsFlat.data(), + signalsSize, cudaMemcpyHostToDevice, + stream_)); + CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); // 创建fftPlan cufftResult cufftStatus; if (fftPlan_ == 0) { signalLength_ = signalLength; - numChannels_ = numChannels; + signalChannels_ = numChannels; cufftStatus = cufftPlan1d(&fftPlan_, signalLength, getFFTType(), numChannels); if (cufftStatus != CUFFT_SUCCESS) { fftPlan_ = 0; throw std::runtime_error("Failed to create CUFFT fftPlan_"); } + cufftSetStream(fftPlan_, stream_); + } else { - if ((signalLength_ != signalLength) || (numChannels_ != numChannels)) { + if ((signalLength_ != signalLength) || (signalChannels_ != numChannels)) { if (fftPlan_) { cufftResult result = cufftDestroy(fftPlan_); fftPlan_ = 0; if (result != CUFFT_SUCCESS) { - std::cerr << "Error destroying FFT plan: " << result << std::endl; + throw std::runtime_error("Error destroying FFT plan"); } } signalLength_ = signalLength; - numChannels_ = numChannels; + signalChannels_ = numChannels; cufftStatus = cufftPlan1d(&fftPlan_, signalLength, getFFTType(), numChannels); if (cufftStatus != CUFFT_SUCCESS) { fftPlan_ = 0; throw std::runtime_error("Failed to create CUFFT fftPlan_"); } + cufftSetStream(fftPlan_, stream_); } } @@ -385,15 +594,16 @@ void CUDACorrelation::ComputeSignalsFFT(const Complex2D& signalDatas) { } } - CHECK_CUDA_ERROR(cudaDeviceSynchronize()); + CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); return; } catch (const std::exception& e) { - std::cerr << "CUDA error: " << e.what() << std::endl; + std::cerr << __FUNCTION__ << " CUDA error: " << e.what() << std::endl; cudaError_t lastError = cudaGetLastError(); if (lastError != cudaSuccess) { - std::cerr << "Last CUDA error: " << cudaGetErrorString(lastError) + std::cerr << __FUNCTION__ + << " Last CUDA error: " << cudaGetErrorString(lastError) << std::endl; } @@ -402,138 +612,266 @@ void CUDACorrelation::ComputeSignalsFFT(const Complex2D& signalDatas) { } } -// 需要预先计算完一个batch的 signalDatas 的FFT结果 -// 调用此接口,直接计算signalsFFT与sequenceFFT的共轭乘 +// template -typename CUDACorrelation::Complex2D -CUDACorrelation::ComputeConjugateMultiply(int seqIdx) { - if (d_signals == nullptr) { - std::cerr << "请先调用 ComputeSignalsFFT(signalDatas) 接口" << std::endl; - throw; - } - - // 原始sequence的长度 - size_t sequenceLength = vecSequenceLength_[seqIdx]; - CUDAComplex* sequence_ptr; -#ifdef USE_OPT_CUDAMEMCPY - if (d_sequence == nullptr) { - std::cerr << "请先调用 ComputeSequenceFFT(VecSequence) 接口" << std::endl; - throw; - } - // 计算sequenceFFT的显存地址(initSequence计算之后,d_sequence显存资源没有释放) - sequence_ptr = d_sequence + seqIdx * fftLength_; -#else - if 
(vecSequenceFFT_[seqIdx].size() == 0) { - std::cerr << "请先调用 ComputeSequenceFFT(VecSequence) 接口" << std::endl; - throw; - } - - // 获取相应的sequence的fft,并copy到显存中 - // 根据seqIdx,获取相应sequence的fft - std::vector& sequenceFFT = vecSequenceFFT_[seqIdx]; - size_t sequenceFFTLength = sequenceFFT.size(); - size_t sequenceSize = sequenceFFTLength * sizeof(CUDAComplex); +void CUDACorrelation::ComputeSignalsFFT(const Complex* signalDatas, + uint numChannels, + uint signalLength) { + uint signalsNum = numChannels * signalLength; + uint signalsSize = signalsNum * sizeof(Complex); + signalsNum_ = signalsNum; + configureKernel(signalsNum_); - // 分配显存空间 - if ((sequenceSize_ == 0) || (d_sequence == nullptr)) { - sequenceSize_ = sequenceSize; - if (d_sequence) { - cudaFreeAsync(d_sequence, stream_); - d_sequence = nullptr; + // 分配设备侧显存(优化:复用之前的显存,避免重复的显存分配和释放,带来的性能损失) + if ((signalsSize_ == 0) || (d_signals == nullptr) || (d_results == nullptr)) { + signalsSize_ = signalsSize; + if (d_signals) { + cudaFreeAsync(d_signals, stream_); + d_signals = nullptr; } - cudaMallocAsync(&d_sequence, sequenceSize_, stream_); + if (d_results) { + cudaFreeAsync(d_results, stream_); + d_results = nullptr; + } + CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); + CHECK_CUDA_ERROR(cudaMallocAsync(&d_signals, signalsSize_, stream_)); + CHECK_CUDA_ERROR( + cudaMallocAsync(&d_results, seqChannels_ * signalsSize_, stream_)); } else { - if (sequenceSize_ != sequenceSize) { - if (d_sequence) { - cudaFreeAsync(d_sequence, stream_); - d_sequence = nullptr; + if (signalsSize_ != signalsSize) { + signalsSize_ = signalsSize; + if (d_signals) { + cudaFreeAsync(d_signals, stream_); + d_signals = nullptr; + } + if (d_results) { + cudaFreeAsync(d_results, stream_); + d_results = nullptr; } - sequenceSize_ = sequenceSize; - cudaMallocAsync(&d_sequence, sequenceSize_, stream_); + CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); + CHECK_CUDA_ERROR(cudaMallocAsync(&d_signals, signalsSize_, stream_)); + CHECK_CUDA_ERROR( + cudaMallocAsync(&d_results, seqChannels_ * signalsSize_, stream_)); } } - // copy sequence 的 fft 到显存中 - sequence_ptr = d_sequence; - CHECK_CUDA_ERROR(cudaMemcpy(sequence_ptr, sequenceFFT.data(), sequenceSize_, - cudaMemcpyHostToDevice)); -#endif - - // 分配设备侧显存(优化:复用之前的显存,避免重复的显存分配和释放,带来的性能损失) - if (d_results == nullptr) { - CHECK_CUDA_ERROR(cudaMallocAsync(&d_results, signalsSize_, stream_)); - } - - if (h_results == nullptr) { - //分配锁页内存 - smartAllocPinned(&h_results, signalsSize_); - } - try { + // 拷贝数据到显存:CPU->GPU + CHECK_CUDA_ERROR(cudaMemcpyAsync(d_signals, signalDatas, signalsSize, + cudaMemcpyHostToDevice, stream_)); + CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); + // 创建fftPlan cufftResult cufftStatus; if (fftPlan_ == 0) { + signalLength_ = signalLength; + signalChannels_ = numChannels; cufftStatus = - cufftPlan1d(&fftPlan_, signalLength_, getFFTType(), numChannels_); + cufftPlan1d(&fftPlan_, signalLength, getFFTType(), numChannels); if (cufftStatus != CUFFT_SUCCESS) { fftPlan_ = 0; throw std::runtime_error("Failed to create CUFFT fftPlan_"); } - } + cufftSetStream(fftPlan_, stream_); - // 计算d_signals与d_sequence的共轭乘 - // configureKernel(signalLength_ * numChannels_); - if constexpr (std::is_same_v) { - batchConjugateMultiplyKernelFloat<<>>( - reinterpret_cast(d_signals), sequence_ptr, - reinterpret_cast(d_results), signalLength_, - numChannels_); } else { - batchConjugateMultiplyKernelDouble<<>>( - reinterpret_cast(d_signals), sequence_ptr, - reinterpret_cast(d_results), signalLength_, - numChannels_); + if 
((signalLength_ != signalLength) || (signalChannels_ != numChannels)) { + if (fftPlan_) { + cufftResult result = cufftDestroy(fftPlan_); + fftPlan_ = 0; + if (result != CUFFT_SUCCESS) { + throw std::runtime_error("Error destroying FFT plan"); + } + } + signalLength_ = signalLength; + signalChannels_ = numChannels; + cufftStatus = + cufftPlan1d(&fftPlan_, signalLength, getFFTType(), numChannels); + if (cufftStatus != CUFFT_SUCCESS) { + fftPlan_ = 0; + throw std::runtime_error("Failed to create CUFFT fftPlan_"); + } + cufftSetStream(fftPlan_, stream_); + } } - // 计算 d_results 的IFFT + // 计算signals的fft if constexpr (std::is_same_v) { cufftStatus = cufftExecC2C( - fftPlan_, reinterpret_cast(d_results), - reinterpret_cast(d_results), CUFFT_INVERSE); + fftPlan_, reinterpret_cast(d_signals), + reinterpret_cast(d_signals), CUFFT_FORWARD); if (cufftStatus != CUFFT_SUCCESS) { - throw std::runtime_error("Failed to execute inverse FFT"); + throw std::runtime_error( + "Failed to execute forward FFT on signalDatas"); } } else { cufftStatus = cufftExecZ2Z( - fftPlan_, reinterpret_cast(d_results), - reinterpret_cast(d_results), CUFFT_INVERSE); + fftPlan_, reinterpret_cast(d_signals), + reinterpret_cast(d_signals), CUFFT_FORWARD); if (cufftStatus != CUFFT_SUCCESS) { - throw std::runtime_error("Failed to execute inverse FFT"); + throw std::runtime_error( + "Failed to execute forward FFT on signalDatas"); } } - CHECK_CUDA_ERROR(cudaDeviceSynchronize()); + CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); + + return; + } catch (const std::exception& e) { + std::cerr << __FUNCTION__ << " CUDA error: " << e.what() << std::endl; + + cudaError_t lastError = cudaGetLastError(); + if (lastError != cudaSuccess) { + std::cerr << __FUNCTION__ + << " Last CUDA error: " << cudaGetErrorString(lastError) + << std::endl; + } + + Cleanup(); + throw; + } +} - // 拷贝结果回主机 - size_t resultLength = signalLength_ - sequenceLength; - size_t resultSize = resultLength * sizeof(CUDAComplex); - Complex2D results(numChannels_, Complex1D(resultLength)); +// 需要预先计算完一个batch的 signalDatas 的FFT结果 +// 调用此接口,直接计算signalsFFT与sequenceFFT的共轭乘 +template +int CUDACorrelation::ComputeConjMul(void) { + if (d_signals == nullptr) { + std::cerr << __FUNCTION__ << " 请先调用 ComputeSignalsFFT(signalDatas) 接口" + << std::endl; + return 0; + } + + if (d_sequence == nullptr) { + std::cerr << __FUNCTION__ + << " 请先调用 ComputeSequenceFFT(VecSequence) 接口" << std::endl; + return 0; + } + + // 分配设备侧显存(优化:复用之前的显存,避免重复的显存分配和释放,带来的性能损失) + if (d_results == nullptr) { CHECK_CUDA_ERROR( - cudaMemcpy(h_results, d_results, signalsSize_, cudaMemcpyDeviceToHost)); -#pragma omp parallel for num_threads(std::min(numChannels_, 16)) - { - for (int i = 0; i < numChannels_; i++) { - memcpy(results[i].data(), h_results + i * signalLength_, resultSize); + cudaMallocAsync(&d_results, seqChannels_ * signalsSize_, stream_)); + } + +#ifdef USE_PEEKSKERNEL + if (d_vecPeaks == nullptr) { + CHECK_CUDA_ERROR(cudaMallocAsync( + &d_vecPeaks, seqChannels_ * signalChannels_ * sizeof(uint), stream_)); + } + + if (h_vecPeaks == nullptr) { + h_vecPeaks = (uint*)malloc(seqChannels_ * signalChannels_ * sizeof(uint)); + } +#else + if (cpu_results == nullptr) { + cpu_results = (CUDAComplex*)malloc(seqChannels_ * signalsSize_); + } +#endif + + CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); + + try { + // 创建fftPlan + cufftResult cufftStatus; + if (fftPlan_ == 0) { + cufftStatus = cufftPlan1d(&fftPlan_, signalLength_, getFFTType(), + signalChannels_); + if (cufftStatus != CUFFT_SUCCESS) { + fftPlan_ = 0; + 
throw std::runtime_error("Failed to create CUFFT fftPlan_"); + } + cufftSetStream(fftPlan_, stream_); + } + + // sequence_fft的指针 + CUDAComplex* sequence_ptr = nullptr; + CUDAComplex* d_output = nullptr; + + // 循环计算每个序列与信号的共轭乘和IFFT + for (int seqIdx = 0; seqIdx < seqChannels_; ++seqIdx) { + // 计算sequenceFFT的显存地址(initSequence计算之后,d_sequence显存资源没有释放) + sequence_ptr = d_sequence + seqIdx * fftLength_; + + // 共轭乘结果的地址偏移 + d_output = d_results + seqIdx * signalsNum_; + + if constexpr (std::is_same_v) { + // 计算共轭乘 + batchConjugateMultiplyKernelFloat<<>>( + reinterpret_cast(d_signals), sequence_ptr, + reinterpret_cast(d_output), signalLength_, + signalChannels_); + + // 计算IFFT + cufftStatus = cufftExecC2C( + fftPlan_, reinterpret_cast(d_output), + reinterpret_cast(d_output), CUFFT_INVERSE); + if (cufftStatus != CUFFT_SUCCESS) { + throw std::runtime_error("Failed to execute inverse FFT"); + } + } else { + // 计算共轭乘 + batchConjugateMultiplyKernelDouble<<>>( + reinterpret_cast(d_signals), sequence_ptr, + reinterpret_cast(d_output), signalLength_, + signalChannels_); + + // 计算IFFT + cufftStatus = cufftExecZ2Z( + fftPlan_, reinterpret_cast(d_output), + reinterpret_cast(d_output), CUFFT_INVERSE); + if (cufftStatus != CUFFT_SUCCESS) { + throw std::runtime_error("Failed to execute inverse FFT"); + } } } - return std::move(results); +#ifdef USE_PEEKSKERNEL + if (d_vecSeqLen == nullptr) { + std::cerr << __FUNCTION__ << " d_vecSeqLen ptr is null!" << std::endl; + return 0; + } + + // GPU上计算PeeksMax + dim3 block(signalChannels_); + dim3 grid((seqChannels_ * signalChannels_ + block.x - 1) / block.x); + if constexpr (std::is_same_v) { + CalculatePeaksKernelFloat<<>>( + reinterpret_cast(d_results), d_vecSeqLen, seqChannels_, + signalChannels_, signalLength_, d_vecPeaks); + } else { + CalculatePeaksKernelDouble<<>>( + reinterpret_cast(d_results), d_vecSeqLen, + seqChannels_, signalChannels_, signalLength_, d_vecPeaks); + } + + CHECK_CUDA_ERROR(cudaMemcpyAsync( + h_vecPeaks, d_vecPeaks, seqChannels_ * signalChannels_ * sizeof(uint), + cudaMemcpyDeviceToHost, stream_)); + // for (int i = 0; i < signalChannels_; i++) { + // if (h_vecPeaks[i] == 1) { + // return 1; + // } + // } + CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); + + return 0; +#else + // 拷贝IFFT结果回主机,cpu侧计算PeeksMax + CHECK_CUDA_ERROR(cudaMemcpyAsync(cpu_results, d_results, + seqChannels_ * signalsSize_, + cudaMemcpyDeviceToHost, stream_)); + CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); + return 0; +#endif } catch (const std::exception& e) { - std::cerr << "CUDA error: " << e.what() << std::endl; + std::cerr << __FUNCTION__ << " CUDA error: " << e.what() << std::endl; cudaError_t lastError = cudaGetLastError(); if (lastError != cudaSuccess) { - std::cerr << "Last CUDA error: " << cudaGetErrorString(lastError) + std::cerr << __FUNCTION__ + << " Last CUDA error: " << cudaGetErrorString(lastError) << std::endl; } diff --git a/cuda_correlation.h b/cuda_correlation.h index ef29e8e83b18f4eec6b86a74ebe69ebc7c1877bf..b3286c4a2f5cd5dc14cc90e134ca40a30cc7dd49 100644 --- a/cuda_correlation.h +++ b/cuda_correlation.h @@ -8,14 +8,6 @@ #include #include -// =======条件编译宏====== -// 使用预计算sequence模式 -// #define USE_PRE_COMPUTE_MODE - -// 优化cudaMemcpy(减少cudaMemcpy调用,提升性能) -// 对于二维vector打平为一维(二维vector内存上不连续,不能直接copy到显存),便于一次copy数据到显存计算 -// #define USE_OPT_CUDAMEMCPY - template class CUDACorrelation { static_assert(std::is_same::value || std::is_same::value, @@ -34,49 +26,63 @@ class CUDACorrelation { ~CUDACorrelation(); // 预先计算sequence的fft - void 
ComputeSequenceFFT(Complex2D VecSequence, int fftLength); + void ComputeSequenceFFT(const Complex2D& VecSequence, uint fftLength); + + // 调用该接口时,需要先初始化 vecSequenceLength_ + // sequenceDatas:非vector类型,可在初始化时通过malloc分配并初始化 + // sequenceDatas 内存连续,可直接cudaMemcpy到显存中 + // 减少memcpy调用,提升性能 + void ComputeSequenceFFT(const Complex* sequenceDatas, uint numSequence, + uint fftLength); // 预先计算signals的fft void ComputeSignalsFFT(const Complex2D& signalDatas); + // signalDatas:非vector类型,可在初始化时通过malloc分配并初始化 + // signalDatas 内存连续,可直接cudaMemcpy到显存中 + // 减少memcpy调用,提升性能 + void ComputeSignalsFFT(const Complex* signalDatas, uint numChannels, + uint signalLength); + // 需预先计算sequence的fft // 需预先计算signals的fft - // 计算signalsFFT与相应的sequenceFFT(根据seqIdx)的共轭乘(归一化) - Complex2D ComputeConjugateMultiply(int seqIdx); + // 计算所有序列sequenceFFT与signalsFFT的共轭乘、IFFT + int ComputeConjMul(void); + +#ifdef USE_PEEKSKERNEL + uint* h_vecPeaks = nullptr; + uint* d_vecPeaks = nullptr; + uint* d_vecSeqLen = nullptr; +#else + CUDAComplex* cpu_results = nullptr; +#endif + + CUDAComplex* d_signals = nullptr; + CUDAComplex* d_sequence = nullptr; + CUDAComplex* d_results = nullptr; + + uint seqChannels_ = 0; + uint fftLength_ = 0; + uint sequenceSize_ = 0; + + uint signalChannels_ = 0; + uint signalLength_ = 0; + uint signalsNum_ = 0; // signalsNum_ = signalChannels_ * signalLength_ + uint signalsSize_ = 0; // signalsSize_ = signalsNum_ * sizeof(CUDAComplex) + + // 保存每个sequence的原始长度 + std::vector vecSequenceLength_; + std::vector> vecSequenceFFT_; private: cudaStream_t stream_; cudaDeviceProp deviceProp_; cufftHandle fftPlan_; - cublasHandle_t cublas_handle_; - - size_t signalLength_; - size_t numChannels_; - size_t signalsSize_; - size_t sequenceSize_; - size_t fftLength_; - size_t signalsNum_; - - CUDAComplex* d_signals; - CUDAComplex* d_sequence; - CUDAComplex* d_results; - CUDAComplex* h_results; - - // 保存原始每个sequence的长度 - std::vector vecSequenceLength_; - std::vector> vecSequenceFFT_; - dim3 block_; dim3 grid_; // 计算最优线程数配置 - void configureKernel(int dataSize); - - // 申请显存 - bool AllocateMemory(int length, int channels); - - // 分配锁页内存 - void smartAllocPinned(CUDAComplex** pHost, size_t size); + void configureKernel(uint dataSize); // 资源释放 void FreeMemory(); diff --git a/droneifiqparse.cpp b/droneifiqparse.cpp index cb0fe23b326137614d30f73d8fa4dd713a82f477..8e09ee56139f948910a025c732bfd693f69e0321 100644 --- a/droneifiqparse.cpp +++ b/droneifiqparse.cpp @@ -1,12 +1,14 @@ #include +#include #include "droneifiqparse.h" + DroneIFIQParse::DroneIFIQParse() {} DroneIFIQParse::~DroneIFIQParse() {} -unsigned int DroneIFIQParse::GetDataSize(char *buf) { +uint DroneIFIQParse::GetDataSize(char *buf) { quint32 a = (quint32)(buf[4 + 3] & 0xFF) << 24; quint32 b = (quint32)(buf[4 + 2] & 0xFF) << 16; quint32 c = (quint32)(buf[4 + 1] & 0xFF) << 8; @@ -15,9 +17,7 @@ unsigned int DroneIFIQParse::GetDataSize(char *buf) { return a | b | c | d; } -unsigned int DroneIFIQParse::GetDataTag(char *buf) { - return BytesToIntInv(buf, 8); -} +uint DroneIFIQParse::GetDataTag(char *buf) { return BytesToUintInv(buf, 8); } Real DroneIFIQParse::GetFrequency(char *buf) { Real freq = 0; @@ -26,140 +26,31 @@ Real DroneIFIQParse::GetFrequency(char *buf) { return freq; } -unsigned int DroneIFIQParse::GetSamplePoint(char *buf) { - return BytesToIntInv(buf, 20); +uint DroneIFIQParse::GetSamplePoint(char *buf) { + return BytesToUintInv(buf, 20); } -int DroneIFIQParse::GetChannelNumber(char *buf) { - return BytesToIntInv(buf, 24); +uint DroneIFIQParse::GetChannelNumber(char *buf) 
{ + return BytesToUintInv(buf, 24); } -void DroneIFIQParse::ResolveIQData( - char *buf, unsigned int SampleCount, int ChannelCount, - std::vector>> &vecsignal) { - int startpos = 28; - int offset = 4 * ChannelCount; +void DroneIFIQParse::ResolveIQData(char *buf, uint SampleCount, + uint ChannelCount, cpuComplex *signalDatas) { + if (signalDatas == nullptr) { + std::cerr << __FUNCTION__ << " signalDatas ptr is null!" << std::endl; + return; + } + + uint startpos = 28; + uint offset = 4 * ChannelCount; for (int c = 0; c < ChannelCount; ++c) { - std::vector> channelData; - channelData.reserve(SampleCount); for (int k = 0; k < SampleCount; ++k) { - channelData.emplace_back( - BytesToShortInv(buf, startpos + k * offset + c * 4 + 0), - BytesToShortInv(buf, startpos + k * offset + c * 4 + 2)); + cpuComplex data(BytesToShortInv(buf, startpos + k * offset + c * 4 + 0), + BytesToShortInv(buf, startpos + k * offset + c * 4 + 2)); + signalDatas[c * SampleCount + k] = data; } - vecsignal[c] = std::move(channelData); - } -} - -void DroneIFIQParse::Resolve8CHDFIQData(char *buf, unsigned int SampleCount, - int ChannelCount, - QVector> &WholeIdata, - QVector> &WholeQdata) { - int startpos = 28; - - QVector vecFrameIdata1; - vecFrameIdata1.reserve(SampleCount); - QVector vecFrameQdata1; - vecFrameQdata1.reserve(SampleCount); - QVector vecFrameIdata2; - vecFrameIdata2.reserve(SampleCount); - QVector vecFrameQdata2; - vecFrameQdata2.reserve(SampleCount); - QVector vecFrameIdata3; - vecFrameIdata3.reserve(SampleCount); - QVector vecFrameQdata3; - vecFrameQdata3.reserve(SampleCount); - QVector vecFrameIdata4; - vecFrameIdata4.reserve(SampleCount); - QVector vecFrameQdata4; - vecFrameQdata4.reserve(SampleCount); - QVector vecFrameIdata5; - vecFrameIdata5.reserve(SampleCount); - QVector vecFrameQdata5; - vecFrameQdata5.reserve(SampleCount); - QVector vecFrameIdata6; - vecFrameIdata6.reserve(SampleCount); - QVector vecFrameQdata6; - vecFrameQdata6.reserve(SampleCount); - QVector vecFrameIdata7; - vecFrameIdata7.reserve(SampleCount); - QVector vecFrameQdata7; - vecFrameQdata7.reserve(SampleCount); - QVector vecFrameIdata8; - vecFrameIdata8.reserve(SampleCount); - QVector vecFrameQdata8; - vecFrameQdata8.reserve(SampleCount); - - int dataindex = 0; - - for (int k = 0; k < SampleCount; - ++k) // 理论上 这是 一次 的 所有Idata1 Qdata1 Idata2 Qdata2 - { - vecFrameIdata1.push_back(BytesToShortInv(buf, startpos + 0 + dataindex)); - vecFrameQdata1.push_back(BytesToShortInv(buf, startpos + 2 + dataindex)); - - vecFrameIdata2.push_back(BytesToShortInv(buf, startpos + 4 + dataindex)); - vecFrameQdata2.push_back(BytesToShortInv(buf, startpos + 6 + dataindex)); - - vecFrameIdata3.push_back(BytesToShortInv(buf, startpos + 8 + dataindex)); - vecFrameQdata3.push_back(BytesToShortInv(buf, startpos + 10 + dataindex)); - - vecFrameIdata4.push_back(BytesToShortInv(buf, startpos + 12 + dataindex)); - vecFrameQdata4.push_back(BytesToShortInv(buf, startpos + 14 + dataindex)); - - vecFrameIdata5.push_back(BytesToShortInv(buf, startpos + 16 + dataindex)); - vecFrameQdata5.push_back(BytesToShortInv(buf, startpos + 18 + dataindex)); - - vecFrameIdata6.push_back(BytesToShortInv(buf, startpos + 20 + dataindex)); - vecFrameQdata6.push_back(BytesToShortInv(buf, startpos + 22 + dataindex)); - - vecFrameIdata7.push_back(BytesToShortInv(buf, startpos + 24 + dataindex)); - vecFrameQdata7.push_back(BytesToShortInv(buf, startpos + 26 + dataindex)); - - vecFrameIdata8.push_back(BytesToShortInv(buf, startpos + 28 + dataindex)); - 
vecFrameQdata8.push_back(BytesToShortInv(buf, startpos + 30 + dataindex)); - - dataindex += 32; } - - // 按顺序 将本轮的 FrameIdata1 FrameQdata1 FrameIdata2 FrameQdata2 - // FrameIdata3 FrameQdata3 FrameIdata4 FrameQdata4 放入到 总 vector 一次 - // 各塞入四行数据 - WholeIdata.push_back(vecFrameIdata1); - WholeQdata.push_back(vecFrameQdata1); - WholeIdata.push_back(vecFrameIdata2); - WholeQdata.push_back(vecFrameQdata2); - WholeIdata.push_back(vecFrameIdata3); - WholeQdata.push_back(vecFrameQdata3); - WholeIdata.push_back(vecFrameIdata4); - WholeQdata.push_back(vecFrameQdata4); - WholeIdata.push_back(vecFrameIdata5); - WholeQdata.push_back(vecFrameQdata5); - WholeIdata.push_back(vecFrameIdata6); - WholeQdata.push_back(vecFrameQdata6); - WholeIdata.push_back(vecFrameIdata7); - WholeQdata.push_back(vecFrameQdata7); - WholeIdata.push_back(vecFrameIdata8); - WholeQdata.push_back(vecFrameQdata8); - - vecFrameIdata1.clear(); - vecFrameQdata1.clear(); - vecFrameIdata2.clear(); - vecFrameQdata2.clear(); - vecFrameIdata3.clear(); - vecFrameQdata3.clear(); - vecFrameIdata4.clear(); - vecFrameQdata4.clear(); - - vecFrameIdata5.clear(); - vecFrameQdata5.clear(); - vecFrameIdata6.clear(); - vecFrameQdata6.clear(); - vecFrameIdata7.clear(); - vecFrameQdata7.clear(); - vecFrameIdata8.clear(); - vecFrameQdata8.clear(); } short DroneIFIQParse::BytesToShortInv(char *buf, int startpos) { @@ -177,3 +68,12 @@ int DroneIFIQParse::BytesToIntInv(char *buf, int startpos) { return a | b | c | d; } + +uint DroneIFIQParse::BytesToUintInv(char *buf, int startpos) { + uint a = (uint)(buf[startpos + 3] & 0xff) << 24; + uint b = (uint)(buf[startpos + 2] & 0xff) << 16; + uint c = (uint)(buf[startpos + 1] & 0xff) << 8; + uint d = (uint)(buf[startpos] & 0xff); + + return a | b | c | d; +} diff --git a/droneifiqparse.h b/droneifiqparse.h index ca81bca734c33aa847cc3674ea249abd2c3f0fcf..6c909eadfe5967d4db2370825d800b42717f936a 100644 --- a/droneifiqparse.h +++ b/droneifiqparse.h @@ -12,24 +12,19 @@ class DroneIFIQParse { DroneIFIQParse(); ~DroneIFIQParse(); - unsigned int GetDataSize(char* buf); - unsigned int GetDataTag(char* buf); + uint GetDataSize(char* buf); + uint GetDataTag(char* buf); Real GetFrequency(char* buf); - unsigned int GetSamplePoint(char* buf); - int GetChannelNumber(char* buf); + uint GetSamplePoint(char* buf); + uint GetChannelNumber(char* buf); - void ResolveIQData(char* buf, unsigned int SampleCount, int ChannelCount, - std::vector>>& vecsignal); - - void Resolve8CHDFIQData(char* buf, unsigned int SampleCount, int ChannelCount, - QVector>& WholeIdata, - QVector>& WholeQdata); - - std::vector>> vecsignal_; + void ResolveIQData(char* buf, uint SampleCount, uint ChannelCount, + cpuComplex* signalDatas); private: short BytesToShortInv(char* buf, int startpos); int BytesToIntInv(char* buf, int startpos); + uint BytesToUintInv(char* buf, int startpos); }; #endif // DRONEIFIQPARSE_H diff --git a/mainwindow.cpp b/mainwindow.cpp index 0c673143b71f34e4a2365120284c0be87442ad5a..00e529037bdad0cf7a3385950bead22ccf737996 100644 --- a/mainwindow.cpp +++ b/mainwindow.cpp @@ -16,15 +16,17 @@ MainWindow::MainWindow(QWidget *parent) : QMainWindow(parent) { InitConnect(); } -MainWindow::~MainWindow() {} +MainWindow::~MainWindow() { + if (signalDatas_) { + free(signalDatas_); + signalDatas_ = nullptr; + } +} void MainWindow::InitControlValues() { m_btnCalculate = new QPushButton(QStringLiteral("加载数据计算"), this); basePath = "/../data/"; - - // 初始化:读取已知序列 - m_calMC.LoadAllSequenceBin(basePath); } void MainWindow::InitUI() { 
setCentralWidget(m_btnCalculate); } @@ -33,14 +35,10 @@ void MainWindow::InitConnect() { connect(m_btnCalculate, SIGNAL(clicked()), this, SLOT(SlotCalculateClick())); } -int MainWindow::CalculateRoutine( - const std::vector>> &vecsignal) { - return m_calMC.CalMovingCorrlationRoutine(vecsignal); -} - -int MainWindow::CalculateRoutine(QVector> &WholeIdata, - QVector> &WholeQdata) { - return m_calMC.CalMovingCorrlationRoutine(WholeIdata, WholeQdata); +int MainWindow::CalculateRoutine(const cpuComplex *signalDatas, + uint numChannels, uint signalLength) { + return m_calMC.CalMovingCorrlationRoutine(signalDatas, numChannels, + signalLength); } void MainWindow::SlotCalculateClick() { @@ -57,7 +55,8 @@ void MainWindow::SlotCalculateClick() { QCoreApplication::applicationDirPath() + basePath + strFileName; if (QFile::exists(strIQFileName) != true) { - std::cerr << strIQFileName.toStdString() << ":文件不存在" << std::endl; + std::cerr << __FUNCTION__ << strIQFileName.toStdString() << ":文件不存在" + << std::endl; return; } @@ -67,9 +66,12 @@ void MainWindow::SlotCalculateClick() { GetReplayFileHeadPos(strIQFileName, m_vecReplayHeadposDetect, m_ReplayfilesizeDetect); -#if defined(USE_CUDA) && defined(USE_PRE_COMPUTE_MODE) + // 初始化:读取已知序列 + m_calMC.LoadAllSequenceBin(basePath, SamplePoint_); + +#if defined(USE_CUDA) // 初始化:提前计算完所有Sequence的fft - m_calMC.ComputeAllSequence(SamplePoint); + m_calMC.ComputeAllSequence(SamplePoint_); #endif int m_iframeCnt = m_vecReplayHeadposDetect.size(); @@ -130,8 +132,7 @@ void MainWindow::GetReplayFileHeadPos(QString ReplayFilePath, char *buff = new char[Replayfilesize]; replayfileforcalculate.read(buff, Replayfilesize); - SamplePoint = m_droneIQParse.GetSamplePoint(buff); - qDebug() << __FUNCTION__ << "SamplePoint" << SamplePoint; + SamplePoint_ = m_droneIQParse.GetSamplePoint(buff); std::string source(buff, Replayfilesize); string match(cHeader, 4); @@ -156,36 +157,35 @@ void MainWindow::GetReplayFileHeadPos(QString ReplayFilePath, } void MainWindow::ReplayIQDataParse(char *buf) { - unsigned int SamplePoints = m_droneIQParse.GetSamplePoint(buf); + uint SamplePoints = m_droneIQParse.GetSamplePoint(buf); if (SamplePoints > 0) { - int channelnumber = + uint channelnumber = m_droneIQParse.GetChannelNumber(buf); // 8->16 16->32 32->64 if (channelnumber == 32) { - // qint64 freq = m_droneIQParse.GetFrequency(buf); - - // QVector> vecIdata; - // QVector> vecQdata; - // vecIdata.reserve(8); - // vecQdata.reserve(8); - // m_droneIQParse.Resolve8CHDFIQData(buf, SamplePoints, channelnumber, - // vecIdata, vecQdata); - // QElapsedTimer tm; - // tm.start(); - // // 每帧 4096点 IQ 输入 - // // 计算总流程 获得最终结果 1--找到相关峰 0--未找到相关峰 - // int result = CalculateRoutine(vecIdata, vecQdata); - channelnumber = 8; //原逻辑也是只取了前8个通道 - std::vector>> vecsignal( - channelnumber, std::vector>(SamplePoints)); - m_droneIQParse.ResolveIQData(buf, SamplePoints, channelnumber, vecsignal); + + // malloc申请空间,需手动free,防止内存泄漏(在析构函数中free) + if (signalDatas_ == nullptr) { + signalDatas_ = (cpuComplex *)malloc(channelnumber * SamplePoints * + sizeof(cpuComplex)); + if (signalDatas_ == nullptr) { + std::cerr << __FUNCTION__ << " Memory allocation failed!" 
+ << std::endl; + return; + } + memset(signalDatas_, 0, + channelnumber * SamplePoints * sizeof(cpuComplex)); + } + + m_droneIQParse.ResolveIQData(buf, SamplePoints, channelnumber, + signalDatas_); QElapsedTimer tm; tm.start(); // 每帧 SamplePoints 个点 IQ 输入 // 计算总流程 获得最终结果 1--找到相关峰 0--未找到相关峰 - int result = CalculateRoutine(vecsignal); + int result = CalculateRoutine(signalDatas_, channelnumber, SamplePoints); std::cout << __FUNCTION__ << " result:" << result << " tm(ns):" << tm.nsecsElapsed() << std::endl; diff --git a/mainwindow.h b/mainwindow.h index f2c3db35b8840a996183297f11dae4f107992cc9..3742949e3608dfbc7c5ad5b7131c451c87194db9 100644 --- a/mainwindow.h +++ b/mainwindow.h @@ -26,11 +26,11 @@ class MainWindow : public QMainWindow { void InitConnect(); // 计算总流程 - int CalculateRoutine(QVector> &WholeIdata, - QVector> &WholeQdata); + int CalculateRoutine(const cpuComplex *signalDatas, uint numChannels, + uint signalLength); - int CalculateRoutine( - const std::vector>> &vecsignal); + cpuComplex *signalDatas_ = nullptr; + uint SamplePoint_; private: // 获取测试数据文件中 每一帧数据的帧头下标 @@ -54,7 +54,6 @@ class MainWindow : public QMainWindow { QString basePath; ifstream m_ReplayFile; int oneframesize; - unsigned int SamplePoint; public slots: void SlotCalculateClick();
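
Usage sketch (illustrative, not part of the patch): the hunks above replace the QVector-of-QVector entry points with one contiguous cpuComplex buffer per frame and pre-compute the sequence FFTs once at startup. A minimal host-side flow using only the interfaces declared in this patch could look like the following; the base path, the channel count of 8, and the per-call malloc/free are assumptions made for the sketch, not taken from the patch.

// usage_sketch.cpp -- assumes the project headers from this patch are available
#include <cstdlib>

#include "calculatemovingcorrelation.h"
#include "droneifiqparse.h"

int RunOneFrame(char *frameBuf) {
  static CalculateMovingCorrelation calMC;
  static DroneIFIQParse parser;
  static bool initialized = false;

  const unsigned int samplePoints = parser.GetSamplePoint(frameBuf);
  const unsigned int numChannels = 8;  // assumed channel count for this sketch

  if (!initialized) {
    // Read Sequence/*.bin once, then pre-compute all sequence FFTs on the GPU.
    calMC.LoadAllSequenceBin("/../data/", samplePoints);
    calMC.ComputeAllSequence(samplePoints);
    initialized = true;
  }

  // One contiguous block for all channels: channel c starts at c * samplePoints.
  cpuComplex *signals =
      (cpuComplex *)malloc(numChannels * samplePoints * sizeof(cpuComplex));
  if (signals == nullptr) return 0;

  parser.ResolveIQData(frameBuf, samplePoints, numChannels, signals);
  const int result =
      calMC.CalMovingCorrlationRoutine(signals, numChannels, samplePoints);

  free(signals);
  return result;  // 1 -- correlation peak found, 0 -- not found
}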
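
For reference, the decision rule applied by CalculatePeaksKernelFloat / CalculatePeaksKernelDouble (and by the CPU-side CalculatePeaks when USE_PEEKSKERNEL is off) is: over the valid range signalLength - sequenceLength, a peak is reported when the maximum magnitude exceeds 7 times the mean magnitude. A single-channel CPU sketch of the same test, assuming the correlation buffer (the IFFT of signalFFT times conj(sequenceFFT)) is already on the host:

// peak_test_sketch.cpp -- same 7x-average criterion as the kernels above
#include <cmath>
#include <complex>
#include <vector>

using Real = float;

// correlation: one channel's IFFT of signalFFT * conj(sequenceFFT)
// numElements: signalLength - sequenceLength (the valid part of the result)
int HasPeak(const std::vector<std::complex<Real>> &correlation,
            unsigned int numElements) {
  if (numElements == 0 || numElements > correlation.size()) return 0;

  Real maxAbs = -10000;
  Real totalAbs = 0;
  for (unsigned int i = 0; i < numElements; ++i) {
    const Real absVal =
        std::hypot(correlation[i].real(), correlation[i].imag());
    totalAbs += absVal;
    if (absVal > maxAbs) maxAbs = absVal;
  }
  const Real avgAbs = totalAbs / numElements;
  return (maxAbs > avgAbs * 7) ? 1 : 0;  // peak if max > 7x average magnitude
}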
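
The batchConjugateMultiplyKernelFloat/Double kernels compute, per sample, the product of a signal FFT bin with the conjugate of the corresponding sequence FFT bin (real part sx*qx + sy*qy, imaginary part sy*qx - sx*qy). Expressed with std::complex this is simply signal[i] * conj(seq[i]); a short host-side reference, assuming both FFTs are already available on the CPU:

// conj_multiply_sketch.cpp -- host reference of the per-sample operation done
// by batchConjugateMultiplyKernelFloat/Double
#include <algorithm>
#include <complex>
#include <cstddef>
#include <vector>

using Real = float;

std::vector<std::complex<Real>> ConjugateMultiply(
    const std::vector<std::complex<Real>> &signalFFT,
    const std::vector<std::complex<Real>> &sequenceFFT) {
  const std::size_t n = std::min(signalFFT.size(), sequenceFFT.size());
  std::vector<std::complex<Real>> out(n);
  for (std::size_t i = 0; i < n; ++i) {
    // (sx*qx + sy*qy) + j*(sy*qx - sx*qy) == signal * conj(seq)
    out[i] = signalFFT[i] * std::conj(sequenceFFT[i]);
  }
  return out;
}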