diff --git a/CMakeLists.txt b/CMakeLists.txt index 053676838c05c998adc853b934ff0740f06d0f98..6a977516a152ba1e305ff3a8e1f370fa8f3aa284 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,21 +2,31 @@ cmake_minimum_required(VERSION 3.18) project(MovingCorrelation) cmake_policy(SET CMP0104 NEW) +set(CMAKE_CXX_STANDARD 17) + option(USE_CUDA "启用CUDA加速" ON) option(USE_PRE_COMPUTE_MODE "启用预初始化sequence" OFF) option(USE_OPT_CUDAMEMCPY "启用优化cudaMemcpy" OFF) if(USE_CUDA) add_compile_definitions(USE_CUDA) + # 启用CUDA支持 + enable_language(CUDA) + + # 设置cuda的动态库路径 + set(CUDA_TOOLKIT_ROOT_DIR "/usr/local/cuda") + set(CUDA_LIBRARY_DIR "${CUDA_TOOLKIT_ROOT_DIR}/lib64") + link_directories(${CUDA_LIBRARY_DIR}) + # 不同N卡,架构不同,计算能力不同 # RTX3060 的 Compute Capability #set(CMAKE_CUDA_ARCHITECTURES 86) # RTX1660 的 Compute Capability - set(CMAKE_CUDA_ARCHITECTURES 75) # 默认 Turing 架构 + # set(CMAKE_CUDA_ARCHITECTURES 75) # 默认 Turing 架构 # set(CMAKE_CUDA_ARCHITECTURES "70;72;75;80;86") # 方法 1:使用 CMake 原生支持(cmake version > 3.18) - # set(CMAKE_CUDA_ARCHITECTURES "native") + # set(CMAKE_CUDA_ARCHITECTURES "native") # 方法 2:手动检测(旧版 CMake 备用) if(CMAKE_VERSION VERSION_LESS 3.18) @@ -30,14 +40,11 @@ if(USE_CUDA) string(REPLACE "." "" CUDA_ARCH ${COMPUTE_CAPABILITY}) set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH}) else() - set(CMAKE_CUDA_ARCHITECTURES "native") + set(CMAKE_CUDA_ARCHITECTURES "75") endif() else() - set(CMAKE_CUDA_ARCHITECTURES "native") + set(CMAKE_CUDA_ARCHITECTURES "75") endif() #if(CMAKE_VERSION VERSION_LESS 3.18) - - # 启用CUDA支持 - enable_language(CUDA) endif() #if(USE_CUDA) if(USE_CUDA AND USE_PRE_COMPUTE_MODE) @@ -91,13 +98,14 @@ include_directories("${CMAKE_CURRENT_SOURCE_DIR}") # 收集项目源文件 # aux_source_directory(. 
SRC_LIST) # 自动收集源文件 # add_executable(${PROJECT_NAME} ${SRC_LIST}) +file(GLOB INC "${CMAKE_CURRENT_SOURCE_DIR}/*.h") file(GLOB CPPSRCS "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp") file(GLOB CUSRCS "${CMAKE_CURRENT_SOURCE_DIR}/*.cu") if(USE_CUDA) - add_executable(${PROJECT_NAME} ${CPPSRCS} ${CUSRCS}) + add_executable(${PROJECT_NAME} ${CPPSRCS} ${CUSRCS} ${INC}) else() - add_executable(${PROJECT_NAME} ${CPPSRCS}) + add_executable(${PROJECT_NAME} ${CPPSRCS} ${INC}) endif() # debug or release @@ -118,15 +126,15 @@ endif() if(USE_CUDA) # 链接所需的动态库 -target_link_libraries(${PROJECT_NAME} PRIVATE - Qt5::Widgets +target_link_libraries(${PROJECT_NAME} PRIVATE + Qt5::Widgets Qt5::Core - cudart + cudart cufft cublas) else() # 链接所需的动态库 -target_link_libraries(${PROJECT_NAME} PRIVATE - Qt5::Widgets +target_link_libraries(${PROJECT_NAME} PRIVATE + Qt5::Widgets Qt5::Core) endif() \ No newline at end of file diff --git a/README b/README index 0931d7395de4ac85ecf922902a582544e107e825..7fff25b4bb80967116be53a762b0c7b9282385b1 100644 --- a/README +++ b/README @@ -1,4 +1,7 @@ +#1. 
安装GPU驱动和cuda toolkit等工具 + +#2.安装qt相关开发工具 sudo apt-get install qttools5-dev-tools -sudo apt-get install qtbase5-dev qttools5-dev +sudo apt-get install qtbase5-dev qtbase5-dev-tools sudo apt-get install qtcreator sudo ln -s /usr/lib/x86_64-linux-gnu/qt5/bin/moc /usr/lib/qt5/bin/moc diff --git a/calculatemovingcorrelation.cpp b/calculatemovingcorrelation.cpp index 80866707388046c98a15bb15cbf4a02f1bfb4bdd..d85c3dd4ff4f41e3e336a8403527d5599d052d75 100644 --- a/calculatemovingcorrelation.cpp +++ b/calculatemovingcorrelation.cpp @@ -1,25 +1,24 @@ -#include "cuda_correlation.h" -#include "calculatemovingcorrelation.h" -#include +#include #include #include #include - +#include "calculatemovingcorrelation.h" +#include "cuda_correlation.h" #include "fft.h" #include "log.h" #if USE_CUDA CalculateMovingCorrelation::CalculateMovingCorrelation() { - cudaCorrelation = new CUDACorrelation(); + cudaCorrelation = new CUDACorrelation(); } #else CalculateMovingCorrelation::CalculateMovingCorrelation() {} #endif int CalculateMovingCorrelation::CalMovingCorrlationRoutine( - const std::vector>> &vecsignal) { + const std::vector>> &vecsignal) { const int numChannels = vecsignal.size(); qDebug() << __FUNCTION__ << "Signal processing start"; @@ -36,7 +35,7 @@ int CalculateMovingCorrelation::CalMovingCorrlationRoutine( for (int seqIdx = 0; seqIdx < numSequences; ++seqIdx) { qDebug() << __FUNCTION__ << "seqIdx-----------" << seqIdx; - std::vector>> correlationResults( + std::vector>> correlationResults( numChannels); #if USE_CUDA @@ -45,7 +44,6 @@ int CalculateMovingCorrelation::CalMovingCorrlationRoutine( qDebug() << __FUNCTION__ << "use pre compute FFT (sequenceFFT, signalsFFT) mode"; try { - // correlationResults = cudaCorrelation->ProcessBatch(vecsignal, seqIdx); // 计算共轭乘 correlationResults = cudaCorrelation->ComputeConjugateMultiply(seqIdx); } catch (const std::exception &e) { @@ -87,13 +85,13 @@ int CalculateMovingCorrelation::CalMovingCorrlationRoutine( QVector> &DownSamplingIData, 
QVector> &DownSamplingQData) { const int numChannels = DownSamplingIData.size(); - std::vector>> vecsignal(numChannels); - + std::vector>> vecsignal(numChannels); + // 填充每个通道的IQ数据 for (int channel = 0; channel < numChannels; ++channel) { const int dataLength = DownSamplingIData[channel].size(); qDebug() << __FUNCTION__ << "dataLength:" << dataLength; - std::vector> channelData; + std::vector> channelData; channelData.reserve(dataLength); for (int i = 0; i < dataLength; ++i) { @@ -108,14 +106,14 @@ int CalculateMovingCorrelation::CalMovingCorrlationRoutine( } int CalculateMovingCorrelation::CalculatePeaks( - std::vector>> &CorrelationResults) { + std::vector>> &CorrelationResults) { std::vector vecCaculatePeaks; vecCaculatePeaks.reserve(CorrelationResults.size()); for (const auto &CorrelationValue : CorrelationResults) { - double max_abs = -10000; + Real max_abs = -10000; int max_index = -1; - double total_abs = 0.0; + Real total_abs = 0.0; const int num_elements = CorrelationValue.size(); @@ -124,8 +122,8 @@ int CalculateMovingCorrelation::CalculatePeaks( } for (int i = 0; i < num_elements; ++i) { - const double abs_val = std::hypot(CorrelationValue.at(i).real(), - CorrelationValue.at(i).imag()); + const Real abs_val = std::hypot(CorrelationValue.at(i).real(), + CorrelationValue.at(i).imag()); total_abs += abs_val; if (abs_val > max_abs) { @@ -134,7 +132,7 @@ int CalculateMovingCorrelation::CalculatePeaks( } } - const double avg_abs = total_abs / num_elements; + const Real avg_abs = total_abs / num_elements; // vecCaculateRes.push_back((max_abs > (avg_abs * 10)) ? 
1 : 0); vecCaculatePeaks.push_back( @@ -154,31 +152,29 @@ int CalculateMovingCorrelation::CalculatePeaks( } void CalculateMovingCorrelation::MovingCorrelation( - std::vector> &SingleChannelData, - std::vector> &Sequence, - std::vector> &CorrelationResults) { - std::vector> SingalfftResult = - FFT::fft(SingleChannelData); + std::vector> &SingleChannelData, + std::vector> &Sequence, + std::vector> &CorrelationResults) { + std::vector> SingalfftResult = FFT::fft(SingleChannelData); int fftLength = SingalfftResult.size(); int sequenceLength = Sequence.size(); // 零填充 sequence - std::vector> paddedSequence( + std::vector> paddedSequence( fftLength, - std::complex( + std::complex( 0, 0)); // 创建一个全是0+0i 的vector 对前sequenceLength赋值 int minlen = std::min(sequenceLength, fftLength); for (int i = 0; i < minlen; ++i) { paddedSequence[i] = - std::complex(Sequence[i].real(), Sequence[i].imag()); + std::complex(Sequence[i].real(), Sequence[i].imag()); } // 计算序列fft - std::vector> SequencefftResult = - FFT::fft(paddedSequence); + std::vector> SequencefftResult = FFT::fft(paddedSequence); #ifdef _DEBUG_ std::cout << "signalLength:" << fftLength << std::endl; std::cout << "sequenceLength:" << sequenceLength << std::endl; @@ -201,18 +197,18 @@ void CalculateMovingCorrelation::MovingCorrelation( #endif // 计算共轭 - std::vector> conjugateSequence(fftLength); + std::vector> conjugateSequence(fftLength); for (int i = 0; i < fftLength; ++i) { conjugateSequence[i] = std::conj(SequencefftResult[i]); } // 点对点相乘 - std::vector> multipliedResult(fftLength); + std::vector> multipliedResult(fftLength); for (int i = 0; i < fftLength; ++i) { multipliedResult[i] = SingalfftResult[i] * conjugateSequence[i]; } - std::vector> ifftResult = FFT::ifft(multipliedResult); + std::vector> ifftResult = FFT::ifft(multipliedResult); #ifdef _DEBUG_ std::cout << "共轭乘结果" << std::endl; @@ -244,7 +240,7 @@ void CalculateMovingCorrelation::ComputeAllSequence(int fftLength) { #if defined(USE_CUDA) && 
defined(USE_PRE_COMPUTE_MODE)
   // 预计算sequence:初始化阶段预计算所有Sequence的FFT
   qDebug() << "Starting CUDA processing for compute all sequence";
-  cudaCorrelation->InitSequence(m_VecSequence, fftLength);
+  cudaCorrelation->ComputeSequenceFFT(m_VecSequence, fftLength);
 #endif
 }
 
@@ -304,7 +300,7 @@ void CalculateMovingCorrelation::ReadSequenceFile(QString strFileName) {
     return;
   }
 
-  std::vector<std::complex<double>> VecComplex;
+  std::vector<std::complex<Real>> VecComplex;
   VecComplex.clear();
 
   inFile.seekg(0, std::ios::end);
@@ -315,14 +311,14 @@
   char *tempdata = new char[len];
   inFile.read(tempdata, len);
 
-  QVector<double> vecSequenceData;
+  QVector<Real> vecSequenceData;
   BytesToDoubleInv(tempdata, len, vecSequenceData);
 
   int sizeOfSequenceData = vecSequenceData.size();
 
   for (int index = 0; index < sizeOfSequenceData; index += 2) {
-    std::complex<double> complexRI = {vecSequenceData[index],
-                                      vecSequenceData[index + 1]};
+    std::complex<Real> complexRI = {vecSequenceData[index],
+                                    vecSequenceData[index + 1]};
     VecComplex.push_back(complexRI);
   }
 
@@ -330,12 +326,12 @@
 }
 
 void CalculateMovingCorrelation::BytesToDoubleInv(char *buf, int ReadFileLength,
-                                                  QVector<double> &VecReturn) {
+                                                  QVector<Real> &VecReturn) {
   for (int i = 0; i < ReadFileLength;) {
-    double real;
-    double imag;
-    memcpy(&real, buf + i, sizeof(double));
-    memcpy(&imag, buf + i + 8, sizeof(double));
+    double real;  // FIX(review): file stores 8-byte doubles at offsets 0/8 —
+    double imag;  // memcpy into 4-byte Real reinterprets raw bytes as garbage.
+    memcpy(&real, buf + i, sizeof(double));  // read as double; narrowing to
+    memcpy(&imag, buf + i + 8, sizeof(double));  // Real happens on push_back
 
     VecReturn.push_back(real);
     VecReturn.push_back(imag);
diff --git a/calculatemovingcorrelation.h b/calculatemovingcorrelation.h
index 069bfe149a40acd2d80d5726820db5db5a1152f2..2a11ff119b7ddb0ab1b3a87099b0ef97f3b2fc9c 100644
--- a/calculatemovingcorrelation.h
+++ b/calculatemovingcorrelation.h
@@ -9,7 +9,14 @@
 #include <complex>
 #include <vector>
 
-class CUDACorrelation;
+#if USE_CUDA
+#include "cuda_correlation.h"
+#endif
+
+// using Real = double; // 可随时替换为 float/double
+using Real = float; // 可随时替换为 float/double + +// class CUDACorrelation; class CalculateMovingCorrelation { public: CalculateMovingCorrelation(); @@ -28,34 +35,34 @@ class CalculateMovingCorrelation { // 计算滑动相关总流程 输入 8路 I数据 和 8路 Q数据 返回 1--找到相关峰 // 0--未找到相关峰 int CalMovingCorrlationRoutine( - const std::vector>> &vecsignal); + const std::vector>> &vecsignal); // 序列文件数据 - std::vector>> m_VecSequence; + std::vector>> m_VecSequence; private: // 滑动相关函数 // 输入 一路数据的 IQ (组合成 复数) SingleChannelData // 输入 一个Sequence文件的 原始数据(组合成 复数) Sequence // 输出 相关数据CorrelationResults (复数) - void MovingCorrelation(std::vector> &SingleChannelData, - std::vector> &Sequence, - std::vector> &CorrelationResults); + void MovingCorrelation(std::vector> &SingleChannelData, + std::vector> &Sequence, + std::vector> &CorrelationResults); // 计算是否有峰值(满足峰值条件) // 输入 前一步计算得到的 相关数据CorrelationResults // 返回值 1--找到相关峰 0--未找到相关峰 int CalculatePeaks( - std::vector>> &CorrelationResults); + std::vector>> &CorrelationResults); // 读取单个序列文件 void ReadSequenceFile(QString strFileName); // 字节转换成double void BytesToDoubleInv(char *buf, int ReadFileLength, - QVector &VecReturn); + QVector &VecReturn); #if USE_CUDA - CUDACorrelation *cudaCorrelation; + CUDACorrelation *cudaCorrelation; #endif }; diff --git a/cuda_correlation.cu b/cuda_correlation.cu index cd91c123a591051f0992b16307e0b06b8a483de9..2a5823980f11be9c500a0c7934da2c47bdb5a8ac 100644 --- a/cuda_correlation.cu +++ b/cuda_correlation.cu @@ -4,11 +4,14 @@ #include #include +#include #include #include "cuda_correlation.h" using namespace std; +template class CUDACorrelation; // 明确告诉编译器生成float特例 +template class CUDACorrelation; // 明确告诉编译器生成double特例 #define CHECK_CUDA_ERROR(call) \ do { \ @@ -20,48 +23,80 @@ using namespace std; } \ } while (0) -__global__ void batchConjugateMultiplyKernel(cufftDoubleComplex* signals, - cufftDoubleComplex* sequenceFFT, - cufftDoubleComplex* outputs, - int signalLength, - int numChannels) { - int idx = blockIdx.x * blockDim.x + 
threadIdx.x; - if (idx >= signalLength * numChannels) { - return; +// 获取对应的FFT类型 +template +static constexpr cufftType getFFTType() { + if constexpr (std::is_same_v) { + return CUFFT_C2C; + } else if constexpr (std::is_same_v) { + return CUFFT_Z2Z; + } else { + static_assert(std::is_same_v || std::is_same_v, + "Only float and double types are supported"); + return CUFFT_C2C; // Never reached, but avoids compiler warning } +} + +// 单精度核函数 +__global__ void batchConjugateMultiplyKernelFloat( + const cufftComplex* __restrict__ signalDatas, + const cufftComplex* __restrict__ sequenceFFT, + cufftComplex* __restrict__ outputs, int signalLength, int numChannels) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx >= signalLength * numChannels) return; - // int channel = idx / signalLength; int pos = idx % signalLength; - cufftDoubleComplex signal = signals[idx]; + cufftComplex signal = signalDatas[idx]; + cufftComplex seq = sequenceFFT[pos]; + + // 计算 signal * conj(seq) + outputs[idx].x = signal.x * seq.x + signal.y * seq.y; // 实部 + outputs[idx].y = signal.y * seq.x - signal.x * seq.y; // 虚部 +} + +// 双精度核函数 +__global__ void batchConjugateMultiplyKernelDouble( + const cufftDoubleComplex* __restrict__ signalDatas, + const cufftDoubleComplex* __restrict__ sequenceFFT, + cufftDoubleComplex* __restrict__ outputs, int signalLength, + int numChannels) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx >= signalLength * numChannels) return; + + int pos = idx % signalLength; + + cufftDoubleComplex signal = signalDatas[idx]; cufftDoubleComplex seq = sequenceFFT[pos]; // 计算 signal * conj(seq) - // 如果 a = x + yi 和 b = u + vi - // 则 a * conj(b) = (x + yi)(u - vi) = (xu + yv) + (yu - xv)i - outputs[idx].x = signal.x * seq.x + signal.y * seq.y; // 实部:x*u + y*v - outputs[idx].y = signal.y * seq.x - signal.x * seq.y; // 虚部:y*u - x*v + outputs[idx].x = signal.x * seq.x + signal.y * seq.y; // 实部 + outputs[idx].y = signal.y * seq.x - signal.x * seq.y; // 虚部 } // 
动态计算最优配置 -void CUDACorrelation::configureKernel(int dataSize) { - // 1. 自动计算最优Block维度(基于Occupancy API) +template +void CUDACorrelation::configureKernel(int dataSize) { int minGridSize, bestBlockSize; - cudaOccupancyMaxPotentialBlockSize( - &minGridSize, &bestBlockSize, - (void*)batchConjugateMultiplyKernel, // 替换为实际核函数指针 - 0, dataSize); - block_.x = bestBlockSize; // 自动适配硬件 - // 2. 校验Block限制(知乎示例中的maxThreadsPerBlock) - if (block_.x > deviceProp_.maxThreadsPerBlock) - block_.x = deviceProp_.maxThreadsPerBlock; + if constexpr (std::is_same_v) { + cudaOccupancyMaxPotentialBlockSize(&minGridSize, &bestBlockSize, + (void*)batchConjugateMultiplyKernelFloat, + 0, dataSize); + } else { + cudaOccupancyMaxPotentialBlockSize( + &minGridSize, &bestBlockSize, (void*)batchConjugateMultiplyKernelDouble, + 0, dataSize); + } - // 3. 计算Grid维度(保障数据全覆盖) + block_.x = std::min(bestBlockSize, (int)deviceProp_.maxThreadsPerBlock); grid_.x = (dataSize + block_.x - 1) / block_.x; } -CUDACorrelation::CUDACorrelation() +template +CUDACorrelation::CUDACorrelation() : fftPlan_(0), d_signals(nullptr), d_sequence(nullptr), @@ -72,638 +107,161 @@ CUDACorrelation::CUDACorrelation() signalsSize_(0), sequenceSize_(0), fftLength_(0) { - // 成员变量初始化 signalsNum_ = 0; cublas_handle_ = nullptr; - // 创建cuda流 - cudaStreamCreate(&stream_); - // 获取设备属性 + CHECK_CUDA_ERROR(cudaStreamCreate(&stream_)); CHECK_CUDA_ERROR(cudaGetDeviceProperties(&deviceProp_, 0)); } -CUDACorrelation::~CUDACorrelation() { +template +CUDACorrelation::~CUDACorrelation() { Cleanup(); cudaStreamSynchronize(stream_); cudaStreamDestroy(stream_); } -void CUDACorrelation::Cleanup() { +template +void CUDACorrelation::Cleanup() { FreeMemory(); if (fftPlan_) { - cufftResult result = cufftDestroy(fftPlan_); - if (result != CUFFT_SUCCESS) { - std::cerr << "Error destroying FFT plan: " << result << std::endl; - } + cufftDestroy(fftPlan_); fftPlan_ = 0; } - if (cublas_handle_ != nullptr) { cublasDestroy(cublas_handle_); cublas_handle_ = nullptr; } 
} -bool CUDACorrelation::AllocateMemory(int length, int channels) { - size_t size = length * channels * sizeof(cufftDoubleComplex); - - cudaMallocAsync(&d_signals, size, stream_); - cudaMallocAsync(&d_sequence, length * sizeof(cufftDoubleComplex), stream_); - cudaMallocAsync(&d_results, size, stream_); - - return true; -} - -void CUDACorrelation::FreeMemory() { +template +void CUDACorrelation::FreeMemory() { if (d_signals) { cudaFreeAsync(d_signals, stream_); d_signals = nullptr; } - if (d_sequence) { cudaFreeAsync(d_sequence, stream_); d_sequence = nullptr; } - if (d_results) { cudaFreeAsync(d_results, stream_); d_results = nullptr; } - if (h_results) { cudaFreeHost(h_results); h_results = nullptr; } } -// 一次计算完VecSequence中所有sequence的fft值 -void CUDACorrelation::InitSequence( - std::vector>> VecSequence, int fftLength) { - CHECK_CUDA_ERROR(cudaDeviceSynchronize()); +template +bool CUDACorrelation::AllocateMemory(int length, int channels) { + size_t size = length * channels * sizeof(CUDAComplex); - // 清理之前的资源 + CHECK_CUDA_ERROR(cudaMallocAsync(&d_signals, size, stream_)); + CHECK_CUDA_ERROR( + cudaMallocAsync(&d_sequence, length * sizeof(CUDAComplex), stream_)); + CHECK_CUDA_ERROR(cudaMallocAsync(&d_results, size, stream_)); + + return true; +} + +template +void CUDACorrelation::smartAllocPinned(CUDAComplex** pHost, size_t size) { + if (deviceProp_.major >= 8) { + CHECK_CUDA_ERROR(cudaHostAlloc( + pHost, size, cudaHostAllocMapped | cudaHostAllocWriteCombined)); + } else { + CHECK_CUDA_ERROR(cudaMallocHost(pHost, size)); + } +} + +template +void CUDACorrelation::ComputeSequenceFFT(Complex2D VecSequence, + int fftLength) { + CHECK_CUDA_ERROR(cudaDeviceSynchronize()); Cleanup(); fftLength_ = fftLength; size_t numSequence = VecSequence.size(); - size_t sequenceSize = numSequence * fftLength * sizeof(cufftDoubleComplex); + size_t sequenceSize = numSequence * fftLength * sizeof(CUDAComplex); - // 扁平化二维到一维(原二维VecSequence内存不连续,不能直接copy到显存) - // 并填充零,一次性copy数据到显存 - 
std::vector<cufftDoubleComplex> hSequenceFlat(numSequence * fftLength,
-                                                cufftDoubleComplex{0.0, 0.0});
+  // 扁平化数据并填充零
+  std::vector<CUDAComplex> hSequenceFlat(numSequence * fftLength,
+                                         CUDAComplex{0.0, 0.0});
 
   vecSequenceLength_.reserve(numSequence);
 
-  for (size_t i = 0; i < numSequence; ++i) {
-    // 记录 sequence 的原始长度
-    int SequenceLength = VecSequence[i].size();
-    vecSequenceLength_.emplace_back(SequenceLength);
-    // 扁平化二维到一维
-    int minlen = std::min(SequenceLength, fftLength);
-    memcpy(hSequenceFlat.data() + i * fftLength,  // 目标地址偏移
-           VecSequence[i].data(),                 // 源数据起始点
-           minlen * sizeof(std::complex<double>)  // 拷贝字节数
-    );
+  std::vector<int> tmpLengths(numSequence);  // 临时存储
+  // FIX(review): removed invalid `#pragma omp parallel for` — the construct must immediately precede a canonical for loop, not a `{}` block (compile error); the memcpy-bound flatten stays serial.
+  {
+    for (size_t i = 0; i < numSequence; ++i) {
+      tmpLengths[i] = VecSequence[i].size();  // 无竞争写入
+
+      int minlen = std::min(tmpLengths[i], fftLength);
+      memcpy(hSequenceFlat.data() + i * fftLength, VecSequence[i].data(),
+             minlen * sizeof(Complex));
+    }
+  }
+  vecSequenceLength_.assign(tmpLengths.begin(), tmpLengths.end());
 
-  // 分配显存空间 d_sequence
+  // 分配并拷贝数据到显存
   CHECK_CUDA_ERROR(cudaMallocAsync(&d_sequence, sequenceSize, stream_));
-
-  // 拷贝sequence数据到显存:CPU->GPU
   CHECK_CUDA_ERROR(cudaMemcpy(d_sequence, hSequenceFlat.data(), sequenceSize,
                               cudaMemcpyHostToDevice));
-
-  // 创建fftPlan
+  // 创建并执行FFT
   cufftResult cufftStatus;
   cufftHandle fftPlan;
-  cufftStatus = cufftPlan1d(&fftPlan, fftLength, CUFFT_Z2Z, numSequence);
+  cufftStatus = cufftPlan1d(&fftPlan, fftLength, getFFTType<T>(), numSequence);
   if (cufftStatus != CUFFT_SUCCESS) {
-    throw std::runtime_error("Failed to create CUFFT fftPlan_");
+    throw std::runtime_error("Failed to create CUFFT plan");
+  }
+  if constexpr (std::is_same_v<T, float>) {
+    cufftStatus = cufftExecC2C(
+        fftPlan, reinterpret_cast<cufftComplex*>(d_sequence),
+        reinterpret_cast<cufftComplex*>(d_sequence), CUFFT_FORWARD);
+  } else {
+    cufftStatus = cufftExecZ2Z(
+        fftPlan, reinterpret_cast<cufftDoubleComplex*>(d_sequence),
+        reinterpret_cast<cufftDoubleComplex*>(d_sequence), CUFFT_FORWARD);
   }
 
-  // 计算sequence的FFT
-  cufftStatus = cufftExecZ2Z(fftPlan, d_sequence,
d_sequence, CUFFT_FORWARD); if (cufftStatus != CUFFT_SUCCESS) { - throw std::runtime_error("Failed to execute forward FFT on sequenceFFT"); + cufftDestroy(fftPlan); + throw std::runtime_error("Failed to execute forward FFT"); } #ifndef USE_OPT_CUDAMEMCPY - // 分配cpu侧内存空间 - cufftDoubleComplex* CompData = (cufftDoubleComplex*)malloc(sequenceSize); - CHECK_CUDA_ERROR(cudaDeviceSynchronize()); - // 计算结果copy回内存 + // 分配主机内存并拷贝结果 + CUDAComplex* CompData = (CUDAComplex*)malloc(sequenceSize); CHECK_CUDA_ERROR( cudaMemcpy(CompData, d_sequence, sequenceSize, cudaMemcpyDeviceToHost)); - size_t sequenceFFTSize = fftLength * sizeof(cufftDoubleComplex); - std::vector vSequence(fftLength); - for (int i = 0; i < numSequence; ++i) { - memcpy(vSequence.data(), CompData + i * fftLength, sequenceFFTSize); - vecSequenceFFT_.emplace_back(vSequence); - } - - // 释放资源 - free(CompData); - if (d_sequence) { - cudaFreeAsync(d_sequence, stream_); - d_sequence = nullptr; - } -#endif - - // 释放fftPlan - if (fftPlan) { - cufftResult result = cufftDestroy(fftPlan); - if (result != CUFFT_SUCCESS) { - std::cerr << "Error destroying FFT plan: " << result << std::endl; - } - fftPlan = 0; - } -} - -std::vector>> CUDACorrelation::ProcessBatch( - const std::vector>>& signals, - const std::vector>& sequence) { - CHECK_CUDA_ERROR(cudaDeviceSynchronize()); - - // 清理之前的资源 - Cleanup(); - - int signalLength = signals[0].size(); - int sequenceLength = sequence.size(); - int numChannels = signals.size(); - size_t signalsNum = numChannels * signalLength; - - // 1. 
预先计算 FFT 长度
-  int fftLength =
-      static_cast<int>(std::pow(2, std::ceil(std::log2(signalLength))));
-
-  // 创建零填充的序列
-  std::vector<std::complex<double>> paddedSequence(fftLength,
-                                                   std::complex<double>(0, 0));
-  int minlen = std::min(sequenceLength, fftLength);
-  for (int i = 0; i < minlen; ++i) {
-    paddedSequence[i] = sequence[i];
-  }
-
-  if (!AllocateMemory(signalLength, numChannels)) {
-    throw std::runtime_error("GPU memory allocation failed");
-  }
-  try {
-    // 检查数据大小是否超过设备限制
-    // if (signalLength * sizeof(cufftDoubleComplex) >
-    // deviceProp_.maxTexture1D)
-    // {
-    //   throw std::runtime_error("Signal length exceeds device maximum
-    //   texture size");
-    // }
-
-    // 2. 将所有通道数据一次性拷贝到GPU
-    for (int i = 0; i < numChannels; i++) {
-      cufftDoubleComplex* dst1 = d_signals + i * signalLength;
-      const void* src1 = signals[i].data();
-      size_t copySize1 = signalLength * sizeof(cufftDoubleComplex);
-
-      CHECK_CUDA_ERROR(
-          cudaMemcpy(dst1, src1, copySize1, cudaMemcpyHostToDevice));
+  size_t sequenceFFTSize = fftLength * sizeof(CUDAComplex);
+  std::vector<CUDAComplex> vSequence(fftLength);
+  // FIX(review): removed invalid `#pragma omp parallel for` — it cannot attach to a `{}` block, and parallelizing would race on the shared vSequence scratch buffer and scramble vecSequenceFFT_ ordering; this copy-back must stay serial.
+  {
+    for (int i = 0; i < numSequence; ++i) {
+      memcpy(vSequence.data(), CompData + i * fftLength, sequenceFFTSize);
+      vecSequenceFFT_.emplace_back(vSequence);
+    }
-
-    // 3. 拷贝零填充后的序列数据
-    cufftDoubleComplex* dst2 = d_sequence;
-    const void* src2 = paddedSequence.data();
-    size_t copySize2 = fftLength * sizeof(cufftDoubleComplex);
-
-    CHECK_CUDA_ERROR(cudaMemcpy(dst2, src2, copySize2, cudaMemcpyHostToDevice));
-
-    // 4. 批量执行FFT
-    cufftResult cufftStatus;
-    cufftStatus = cufftPlan1d(&fftPlan_, signalLength, CUFFT_Z2Z, numChannels);
-    if (cufftStatus != CUFFT_SUCCESS) {
-      throw std::runtime_error("Failed to create CUFFT fftPlan_");
-    }
-
-    cufftStatus = cufftExecZ2Z(fftPlan_, d_signals, d_signals, CUFFT_FORWARD);
-    if (cufftStatus != CUFFT_SUCCESS) {
-      throw std::runtime_error("Failed to execute forward FFT on signals");
-    }
-
-    // 5. 
序列FFT - cufftStatus = cufftExecZ2Z(fftPlan_, d_sequence, d_sequence, CUFFT_FORWARD); - if (cufftStatus != CUFFT_SUCCESS) { - throw std::runtime_error("Failed to execute forward FFT on sequenceFFT"); - } - -#ifdef _DEBUG_ - std::vector sequenceFFT(fftLength); - CHECK_CUDA_ERROR(cudaDeviceSynchronize()); - CHECK_CUDA_ERROR(cudaMemcpy(sequenceFFT.data(), d_sequence, copySize2, - cudaMemcpyDeviceToHost)); - std::cout << "fftSequence length:" << fftLength << std::endl; - for (int i = 0; i < fftLength; i++) { - std::cout << "fftSequence[" << i << "]: (" << sequenceFFT[i].x << ", " - << sequenceFFT[i].y << "i)" << std::endl; - } -#endif - - // std::vector verify(signalLength); - // CHECK_CUDA_ERROR(cudaMemcpy(verify.data(), d_signals, - // signalLength * sizeof(cufftDoubleComplex), - // cudaMemcpyDeviceToHost)); - - dim3 block(256); // 每个块使用256个线程 - dim3 grid((signalLength * numChannels + block.x - 1) / block.x); - - // 检查 grid 配置是否超出设备限制 - if (grid.x > deviceProp_.maxGridSize[0]) { - throw std::runtime_error("Grid size exceeds device limits"); - } - - batchConjugateMultiplyKernel<<>>( - d_signals, d_sequence, d_results, signalLength, numChannels); - - CHECK_CUDA_ERROR(cudaDeviceSynchronize()); - CHECK_CUDA_ERROR(cudaGetLastError()); - - // 7. 批量IFFT - cufftStatus = cufftExecZ2Z(fftPlan_, d_results, d_results, CUFFT_INVERSE); - if (cufftStatus != CUFFT_SUCCESS) { - throw std::runtime_error("Failed to execute inverse FFT"); - } - - // 8. 对d_results进行归一化计算 - double scale = (double)(1.0) / signalLength; - if (cublas_handle_ == nullptr) { - cublasCreate(&cublas_handle_); - cublasSetStream(cublas_handle_, stream_); - } - cublasZdscal(cublas_handle_, signalsNum, &scale, d_results, 1); - CHECK_CUDA_ERROR(cudaDeviceSynchronize()); - - // 9. 
拷贝结果回主机 - std::vector>> results(numChannels); - size_t resultSize = - (signalLength - sequenceLength + 1) * sizeof(cufftDoubleComplex); - - for (int i = 0; i < numChannels; i++) { - results[i].resize(signalLength - sequenceLength + 1); - void* dst = results[i].data(); - cufftDoubleComplex* src = d_results + i * signalLength; - - CHECK_CUDA_ERROR( - cudaMemcpy(dst, src, resultSize, cudaMemcpyDeviceToHost)); - } - - return results; - } catch (const std::exception& e) { - std::cerr << "CUDA error: " << e.what() << std::endl; - - cudaError_t lastError = cudaGetLastError(); - if (lastError != cudaSuccess) { - std::cerr << "Last CUDA error: " << cudaGetErrorString(lastError) - << std::endl; - } - Cleanup(); - throw; - } -} - -std::vector>> CUDACorrelation::ProcessBatch( - const std::vector>>& signals, int seqIdx) { - size_t numChannels = signals.size(); - size_t signalLength = signals[0].size(); - size_t signalsNum = numChannels * signalLength; - size_t signalsSize = signalsNum * sizeof(std::complex); - -#ifdef _DEBUG_ - std::cout << "signalLength:" << signalLength << std::endl; -#endif - - // 分配设备侧显存(优化:复用之前的显存,避免重复的显存分配和释放,带来的性能损失) - if ((signalsSize_ == 0) || (d_signals == nullptr) || (d_results == nullptr) || - (h_results == nullptr)) { - signalsSize_ = signalsSize; - if (d_signals) { - cudaFreeAsync(d_signals, stream_); - d_signals = nullptr; - } - if (d_results) { - cudaFreeAsync(d_results, stream_); - d_results = nullptr; - } - if (h_results) { - cudaFreeHost(h_results); - h_results = nullptr; - } - CHECK_CUDA_ERROR(cudaMallocAsync(&d_signals, signalsSize_, stream_)); - CHECK_CUDA_ERROR(cudaMallocAsync(&d_results, signalsSize_, stream_)); - cudaMallocHost(&h_results, signalsSize_); - } else { - if (signalsSize_ != signalsSize) { - if (d_signals) { - cudaFreeAsync(d_signals, stream_); - d_signals = nullptr; - } - if (d_results) { - cudaFreeAsync(d_results, stream_); - d_results = nullptr; - } - if (h_results) { - cudaFreeHost(h_results); - h_results = nullptr; - } 
- signalsSize_ = signalsSize; - CHECK_CUDA_ERROR(cudaMallocAsync(&d_signals, signalsSize_, stream_)); - CHECK_CUDA_ERROR(cudaMallocAsync(&d_results, signalsSize_, stream_)); - cudaMallocHost(&h_results, signalsSize_); - } - } - - // 原始sequence的长度 - size_t sequenceLength = vecSequenceLength_[seqIdx]; -#ifdef _DEBUG_ - std::cout << "sequenceLength:" << sequenceLength << std::endl; -#endif - -#ifdef USE_OPT_CUDAMEMCPY - if (d_sequence == nullptr) { - std::cerr << "请先调用 InitSequence(VecSequence) 接口" << std::endl; - throw; - } - // 计算sequenceFFT的显存地址(initSequence计算之后,d_sequence显存资源没有释放) - cufftDoubleComplex* sequence_ptr = d_sequence + seqIdx * fftLength_; -#else - if (vecSequenceFFT_[seqIdx].size() == 0) { - std::cerr << "请先调用 InitSequence(VecSequence) 接口" << std::endl; - throw; } - // 获取相应的sequence的fft,并copy到显存中 - // 根据seqIdx,获取相应sequence的fft - std::vector& sequenceFFT = vecSequenceFFT_[seqIdx]; - size_t sequenceFFTLength = sequenceFFT.size(); - size_t sequenceSize = sequenceFFTLength * sizeof(cufftDoubleComplex); - - // 分配显存空间 - if ((sequenceSize_ == 0) || (d_sequence == nullptr)) { - sequenceSize_ = sequenceSize; - if (d_sequence) { - cudaFreeAsync(d_sequence, stream_); - d_sequence = nullptr; - } - cudaMallocAsync(&d_sequence, sequenceSize_, stream_); - } else { - if (sequenceSize_ != sequenceSize) { - if (d_sequence) { - cudaFreeAsync(d_sequence, stream_); - d_sequence = nullptr; - } - sequenceSize_ = sequenceSize; - cudaMallocAsync(&d_sequence, sequenceSize_, stream_); - } - } - - // copy sequence 的 fft 到显存中 - cufftDoubleComplex* sequence_ptr = d_sequence; - CHECK_CUDA_ERROR(cudaMemcpy(sequence_ptr, sequenceFFT.data(), sequenceSize_, - cudaMemcpyHostToDevice)); -#ifdef _DEBUG_ - std::cout << "sequenceFFTLength:" << sequenceFFTLength << std::endl; - std::cout << "sequence fft结果" << std::endl; - for (int i = 0; i < 10; i++) { - std::cout << "sequenceFFT[" << i << "]: (" << sequenceFFT[i].x << ", " - << sequenceFFT[i].y << "i)" << std::endl; - } -#endif -#endif - - 
try { - // 检查数据大小是否超过设备限制 - // if (signalLength * sizeof(cufftDoubleComplex) > - // deviceProp_.maxTexture1D) - // { - // throw std::runtime_error("Signal length exceeds device maximum - // texture size"); - // } - -#ifdef USE_OPT_CUDAMEMCPY - // 扁平化数据:二维转一维(二维vector内存不连续,不能直接copy到显存) - std::vector> hSignalsFlat( - signalsNum, std::complex(0.0, 0.0)); - size_t copySize = signalLength * sizeof(std::complex); - for (size_t i = 0; i < numChannels; ++i) { - memcpy(hSignalsFlat.data() + i * signalLength, // 目标地址偏移 - signals[i].data(), // 源数据起始点 - copySize // 拷贝字节数 - ); - } - - // 拷贝数据到显存:CPU->GPU - CHECK_CUDA_ERROR(cudaMemcpy(d_signals, hSignalsFlat.data(), signalsSize, - cudaMemcpyHostToDevice)); -#else - // 将所有通道数据一次性拷贝到GPU - for (int i = 0; i < numChannels; i++) { - cufftDoubleComplex* dst1 = d_signals + i * signalLength; - const void* src1 = signals[i].data(); - size_t copySize = signalLength * sizeof(cufftDoubleComplex); - - CHECK_CUDA_ERROR( - cudaMemcpy(dst1, src1, copySize, cudaMemcpyHostToDevice)); - } -#endif - - // 创建fftPlan - cufftResult cufftStatus; - if (fftPlan_ == 0) { - signalLength_ = signalLength; - numChannels_ = numChannels; - cufftStatus = - cufftPlan1d(&fftPlan_, signalLength, CUFFT_Z2Z, numChannels); - if (cufftStatus != CUFFT_SUCCESS) { - fftPlan_ = 0; - throw std::runtime_error("Failed to create CUFFT fftPlan_"); - } - } else { - if ((signalLength_ != signalLength) || (numChannels_ != numChannels)) { - if (fftPlan_) { - cufftResult result = cufftDestroy(fftPlan_); - fftPlan_ = 0; - if (result != CUFFT_SUCCESS) { - std::cerr << "Error destroying FFT plan: " << result << std::endl; - } - } - signalLength_ = signalLength; - numChannels_ = numChannels; - cufftStatus = - cufftPlan1d(&fftPlan_, signalLength, CUFFT_Z2Z, numChannels); - if (cufftStatus != CUFFT_SUCCESS) { - fftPlan_ = 0; - throw std::runtime_error("Failed to create CUFFT fftPlan_"); - } - } - } - - CHECK_CUDA_ERROR(cudaDeviceSynchronize()); - - // 计算signals的fft - cufftStatus = 
cufftExecZ2Z(fftPlan_, d_signals, d_signals, CUFFT_FORWARD); - if (cufftStatus != CUFFT_SUCCESS) { - throw std::runtime_error("Failed to execute forward FFT on signals"); - } - -#ifdef _DEBUG_ - CHECK_CUDA_ERROR(cudaDeviceSynchronize()); - std::vector h_signals(signalsNum); - CHECK_CUDA_ERROR(cudaMemcpy(h_signals.data(), d_signals, signalsSize_, - cudaMemcpyDeviceToHost)); - CHECK_CUDA_ERROR(cudaDeviceSynchronize()); - std::cout << "singnal 原始数据" << std::endl; - for (int i = 0; i < 10; i++) { - std::cout << "SingleData[" << i << "]: (" << signals[0][i].real() << ", " - << signals[0][i].imag() << "i)" << std::endl; - } - std::cout << "singnal fft结果" << std::endl; - for (int i = 0; i < 10; i++) { - std::cout << "SingalfftResult[" << i << "]: (" << h_signals[i].x << ", " - << h_signals[i].y << "i)" << std::endl; - } - - std::cout << "singnal [c=1]原始数据" << std::endl; - for (int i = 0; i < 10; i++) { - std::cout << "SingleData[" << i << "]: (" << signals[1][i].real() << ", " - << signals[1][i].imag() << "i)" << std::endl; - } - std::cout << "singnal [c=1] fft结果" << std::endl; - for (int i = 0; i < 10; i++) { - std::cout << "SingalfftResult[" << i << "]: (" - << h_signals[signalLength + i].x << ", " - << h_signals[signalLength + i].y << "i)" << std::endl; - } -#endif - - // 计算d_signals与sequence的共轭乘 - dim3 block(256); // 每个块使用256个线程 - dim3 grid((signalsNum + block.x - 1) / block.x); - -#ifdef _DEBUG_ - std::cout << "block(" << block.x << "," << block.y << "," << block.z << ")" - << std::endl; - std::cout << "grid(" << grid.x << "," << grid.y << "," << grid.z << ")" - << std::endl; -#endif - - // 检查 grid 配置是否超出设备限制 - if (grid.x > deviceProp_.maxGridSize[0]) { - throw std::runtime_error("Grid size exceeds device limits"); - } - - // 计算 d_signals 与 sequence_ptr 的共轭乘 - batchConjugateMultiplyKernel<<>>( - d_signals, sequence_ptr, d_results, signalLength, numChannels); - - CHECK_CUDA_ERROR(cudaDeviceSynchronize()); - CHECK_CUDA_ERROR(cudaGetLastError()); - -#ifdef _DEBUG_ - 
std::vector> h_res(signalsNum); - CHECK_CUDA_ERROR(cudaMemcpy(h_res.data(), d_results, signalsSize_, - cudaMemcpyDeviceToHost)); - CHECK_CUDA_ERROR(cudaDeviceSynchronize()); - std::cout << "共轭乘结果" << std::endl; - for (int i = 0; i < 10; i++) { - std::cout << "Conjugate-results[" << i << "]: (" << h_res[i].real() - << ", " << h_res[i].imag() << "i)" << std::endl; - } - std::cout << " [c=1] 共轭乘结果" << std::endl; - for (int i = 0; i < 10; i++) { - std::cout << "Conjugate-results[" << i << "]: (" - << h_res[signalLength + i].real() << ", " - << h_res[signalLength + i].imag() << "i)" << std::endl; - } -#endif - - // 计算 d_results 的IFFT - cufftStatus = cufftExecZ2Z(fftPlan_, d_results, d_results, CUFFT_INVERSE); - if (cufftStatus != CUFFT_SUCCESS) { - throw std::runtime_error("Failed to execute inverse FFT"); - } - - // 对 d_results 进行归一化计算(d_results/scale) - double scale = (double)(1.0) / signalLength; - if (cublas_handle_ == nullptr) { - cublasCreate(&cublas_handle_); - cublasSetStream(cublas_handle_, stream_); - } - cublasZdscal(cublas_handle_, signalsNum, &scale, d_results, 1); - CHECK_CUDA_ERROR(cudaDeviceSynchronize()); - - // 拷贝结果回主机 -#ifdef _DEBUG_ - std::cout << "(signalLength - sequenceLength):" - << (signalLength - sequenceLength) << std::endl; -#endif - -#ifdef USE_OPT_CUDAMEMCPY - // 方法1 - size_t resultLength = signalLength - sequenceLength; - size_t resultSize = resultLength * sizeof(cufftDoubleComplex); - std::vector>> results( - numChannels, std::vector>(resultLength)); - CHECK_CUDA_ERROR( - cudaMemcpy(h_results, d_results, signalsSize_, cudaMemcpyDeviceToHost)); - for (int i = 0; i < numChannels; i++) { - memcpy(results[i].data(), h_results + i * signalLength, resultSize); - } -#else - // 方法2(老方法) - std::vector>> results(numChannels); - size_t resultLength = signalLength - sequenceLength; - size_t resultSize = resultLength * sizeof(cufftDoubleComplex); - for (int i = 0; i < numChannels; i++) { - results[i].resize(resultLength); - void* dst = 
results[i].data(); - cufftDoubleComplex* src = d_results + i * signalLength; - - CHECK_CUDA_ERROR( - cudaMemcpy(dst, src, resultSize, cudaMemcpyDeviceToHost)); - } -#endif - -#ifdef _DEBUG_ - std::cout << "ifft结果" << std::endl; - for (int i = 0; i < 10; i++) { - std::cout << "CUFFT_INVERSE-results[" << i << "]: (" - << results[0][i].real() << ", " << results[0][i].imag() << "i)" - << std::endl; - } + free(CompData); + cudaFreeAsync(d_sequence, stream_); + d_sequence = nullptr; #endif - return results; - } catch (const std::exception& e) { - std::cerr << "CUDA error: " << e.what() << std::endl; - - cudaError_t lastError = cudaGetLastError(); - if (lastError != cudaSuccess) { - std::cerr << "Last CUDA error: " << cudaGetErrorString(lastError) - << std::endl; - } - - Cleanup(); - throw; - } + cufftDestroy(fftPlan); } // 预计算一个batch的signals的FFT -void CUDACorrelation::ComputeSignalsFFT( - const std::vector>>& signals) { - size_t numChannels = signals.size(); - size_t signalLength = signals[0].size(); +template +void CUDACorrelation::ComputeSignalsFFT(const Complex2D& signalDatas) { + size_t numChannels = signalDatas.size(); + size_t signalLength = signalDatas[0].size(); size_t signalsNum = numChannels * signalLength; - size_t signalsSize = signalsNum * sizeof(std::complex); + size_t signalsSize = signalsNum * sizeof(Complex); signalsNum_ = signalsNum; configureKernel(signalsNum_); -#ifdef _DEBUG_ - std::cout << "signalLength:" << signalLength << std::endl; -#endif - // 分配设备侧显存(优化:复用之前的显存,避免重复的显存分配和释放,带来的性能损失) if ((signalsSize_ == 0) || (d_signals == nullptr) || (d_results == nullptr) || (h_results == nullptr)) { @@ -722,9 +280,10 @@ void CUDACorrelation::ComputeSignalsFFT( } CHECK_CUDA_ERROR(cudaMallocAsync(&d_signals, signalsSize_, stream_)); CHECK_CUDA_ERROR(cudaMallocAsync(&d_results, signalsSize_, stream_)); - cudaMallocHost(&h_results, signalsSize_); + smartAllocPinned(&h_results, signalsSize_); } else { if (signalsSize_ != signalsSize) { + signalsSize_ = 
signalsSize; if (d_signals) { cudaFreeAsync(d_signals, stream_); d_signals = nullptr; @@ -737,32 +296,25 @@ void CUDACorrelation::ComputeSignalsFFT( cudaFreeHost(h_results); h_results = nullptr; } - signalsSize_ = signalsSize; CHECK_CUDA_ERROR(cudaMallocAsync(&d_signals, signalsSize_, stream_)); CHECK_CUDA_ERROR(cudaMallocAsync(&d_results, signalsSize_, stream_)); - cudaMallocHost(&h_results, signalsSize_); + smartAllocPinned(&h_results, signalsSize_); } } try { - // 检查数据大小是否超过设备限制 - // if (signalLength * sizeof(cufftDoubleComplex) > - // deviceProp_.maxTexture1D) - // { - // throw std::runtime_error("Signal length exceeds device maximum - // texture size"); - // } - #ifdef USE_OPT_CUDAMEMCPY // 扁平化数据:二维转一维(二维vector内存不连续,不能直接copy到显存) - std::vector> hSignalsFlat( - signalsNum, std::complex(0.0, 0.0)); - size_t copySize = signalLength * sizeof(std::complex); - for (size_t i = 0; i < numChannels; ++i) { - memcpy(hSignalsFlat.data() + i * signalLength, // 目标地址偏移 - signals[i].data(), // 源数据起始点 - copySize // 拷贝字节数 - ); + Complex1D hSignalsFlat(signalsNum, Complex{0.0, 0.0}); + size_t copySize = signalLength * sizeof(Complex); +#pragma omp parallel for num_threads(numChannels) + { + for (size_t i = 0; i < numChannels; ++i) { + memcpy(hSignalsFlat.data() + i * signalLength, // 目标地址偏移 + signalDatas[i].data(), // 源数据起始点 + copySize // 拷贝字节数 + ); + } } // 拷贝数据到显存:CPU->GPU @@ -770,13 +322,16 @@ void CUDACorrelation::ComputeSignalsFFT( cudaMemcpyHostToDevice)); #else // 循环copy:将所有通道数据一次性拷贝到GPU - for (int i = 0; i < numChannels; i++) { - cufftDoubleComplex* dst1 = d_signals + i * signalLength; - const void* src1 = signals[i].data(); - size_t copySize = signalLength * sizeof(cufftDoubleComplex); - - CHECK_CUDA_ERROR( - cudaMemcpy(dst1, src1, copySize, cudaMemcpyHostToDevice)); + size_t copySize = signalLength * sizeof(CUDAComplex); +#pragma omp parallel for num_threads(numChannels) + { + for (int i = 0; i < numChannels; i++) { + CUDAComplex* dst1 = d_signals + i * signalLength; + 
const void* src1 = signalDatas[i].data(); + + CHECK_CUDA_ERROR( + cudaMemcpy(dst1, src1, copySize, cudaMemcpyHostToDevice)); + } } #endif @@ -786,7 +341,7 @@ void CUDACorrelation::ComputeSignalsFFT( signalLength_ = signalLength; numChannels_ = numChannels; cufftStatus = - cufftPlan1d(&fftPlan_, signalLength, CUFFT_Z2Z, numChannels); + cufftPlan1d(&fftPlan_, signalLength, getFFTType(), numChannels); if (cufftStatus != CUFFT_SUCCESS) { fftPlan_ = 0; throw std::runtime_error("Failed to create CUFFT fftPlan_"); @@ -803,7 +358,7 @@ void CUDACorrelation::ComputeSignalsFFT( signalLength_ = signalLength; numChannels_ = numChannels; cufftStatus = - cufftPlan1d(&fftPlan_, signalLength, CUFFT_Z2Z, numChannels); + cufftPlan1d(&fftPlan_, signalLength, getFFTType(), numChannels); if (cufftStatus != CUFFT_SUCCESS) { fftPlan_ = 0; throw std::runtime_error("Failed to create CUFFT fftPlan_"); @@ -811,44 +366,27 @@ void CUDACorrelation::ComputeSignalsFFT( } } - CHECK_CUDA_ERROR(cudaDeviceSynchronize()); - // 计算signals的fft - cufftStatus = cufftExecZ2Z(fftPlan_, d_signals, d_signals, CUFFT_FORWARD); - if (cufftStatus != CUFFT_SUCCESS) { - throw std::runtime_error("Failed to execute forward FFT on signals"); + if constexpr (std::is_same_v) { + cufftStatus = cufftExecC2C( + fftPlan_, reinterpret_cast(d_signals), + reinterpret_cast(d_signals), CUFFT_FORWARD); + if (cufftStatus != CUFFT_SUCCESS) { + throw std::runtime_error( + "Failed to execute forward FFT on signalDatas"); + } + } else { + cufftStatus = cufftExecZ2Z( + fftPlan_, reinterpret_cast(d_signals), + reinterpret_cast(d_signals), CUFFT_FORWARD); + if (cufftStatus != CUFFT_SUCCESS) { + throw std::runtime_error( + "Failed to execute forward FFT on signalDatas"); + } } CHECK_CUDA_ERROR(cudaDeviceSynchronize()); -#ifdef _DEBUG_ - std::vector h_signals(signalsNum); - CHECK_CUDA_ERROR(cudaMemcpy(h_signals.data(), d_signals, signalsSize_, - cudaMemcpyDeviceToHost)); - CHECK_CUDA_ERROR(cudaDeviceSynchronize()); - std::cout << "singnal 
原始数据" << std::endl; - for (int i = 0; i < 10; i++) { - std::cout << "SingleData[" << i << "]: (" << signals[0][i].real() << ", " - << signals[0][i].imag() << "i)" << std::endl; - } - std::cout << "singnal fft结果" << std::endl; - for (int i = 0; i < 10; i++) { - std::cout << "SingalfftResult[" << i << "]: (" << h_signals[i].x << ", " - << h_signals[i].y << "i)" << std::endl; - } - - std::cout << "singnal [c=1]原始数据" << std::endl; - for (int i = 0; i < 10; i++) { - std::cout << "SingleData[" << i << "]: (" << signals[1][i].real() << ", " - << signals[1][i].imag() << "i)" << std::endl; - } - std::cout << "singnal [c=1] fft结果" << std::endl; - for (int i = 0; i < 10; i++) { - std::cout << "SingalfftResult[" << i << "]: (" - << h_signals[signalLength + i].x << ", " - << h_signals[signalLength + i].y << "i)" << std::endl; - } -#endif return; } catch (const std::exception& e) { std::cerr << "CUDA error: " << e.what() << std::endl; @@ -864,35 +402,37 @@ void CUDACorrelation::ComputeSignalsFFT( } } -// 需要预先计算完一个batch的 signals 的FFT结果 +// 需要预先计算完一个batch的 signalDatas 的FFT结果 // 调用此接口,直接计算signalsFFT与sequenceFFT的共轭乘 -std::vector>> -CUDACorrelation::ComputeConjugateMultiply(int seqIdx) { +template +typename CUDACorrelation::Complex2D +CUDACorrelation::ComputeConjugateMultiply(int seqIdx) { if (d_signals == nullptr) { - std::cerr << "请先调用 ComputeSignalsFFT(signals) 接口" << std::endl; + std::cerr << "请先调用 ComputeSignalsFFT(signalDatas) 接口" << std::endl; throw; } // 原始sequence的长度 size_t sequenceLength = vecSequenceLength_[seqIdx]; + CUDAComplex* sequence_ptr; #ifdef USE_OPT_CUDAMEMCPY if (d_sequence == nullptr) { - std::cerr << "请先调用 InitSequence(VecSequence) 接口" << std::endl; + std::cerr << "请先调用 ComputeSequenceFFT(VecSequence) 接口" << std::endl; throw; } // 计算sequenceFFT的显存地址(initSequence计算之后,d_sequence显存资源没有释放) - cufftDoubleComplex* sequence_ptr = d_sequence + seqIdx * fftLength_; + sequence_ptr = d_sequence + seqIdx * fftLength_; #else if (vecSequenceFFT_[seqIdx].size() == 0) { - 
std::cerr << "请先调用 InitSequence(VecSequence) 接口" << std::endl; + std::cerr << "请先调用 ComputeSequenceFFT(VecSequence) 接口" << std::endl; throw; } // 获取相应的sequence的fft,并copy到显存中 // 根据seqIdx,获取相应sequence的fft - std::vector& sequenceFFT = vecSequenceFFT_[seqIdx]; + std::vector& sequenceFFT = vecSequenceFFT_[seqIdx]; size_t sequenceFFTLength = sequenceFFT.size(); - size_t sequenceSize = sequenceFFTLength * sizeof(cufftDoubleComplex); + size_t sequenceSize = sequenceFFTLength * sizeof(CUDAComplex); // 分配显存空间 if ((sequenceSize_ == 0) || (d_sequence == nullptr)) { @@ -914,15 +454,19 @@ CUDACorrelation::ComputeConjugateMultiply(int seqIdx) { } // copy sequence 的 fft 到显存中 - cufftDoubleComplex* sequence_ptr = d_sequence; + sequence_ptr = d_sequence; CHECK_CUDA_ERROR(cudaMemcpy(sequence_ptr, sequenceFFT.data(), sequenceSize_, cudaMemcpyHostToDevice)); #endif // 分配设备侧显存(优化:复用之前的显存,避免重复的显存分配和释放,带来的性能损失) - if ((d_results == nullptr) || (h_results == nullptr)) { + if (d_results == nullptr) { CHECK_CUDA_ERROR(cudaMallocAsync(&d_results, signalsSize_, stream_)); - cudaMallocHost(&h_results, signalsSize_); + } + + if (h_results == nullptr) { + //分配锁页内存 + smartAllocPinned(&h_results, signalsSize_); } try { @@ -930,7 +474,7 @@ CUDACorrelation::ComputeConjugateMultiply(int seqIdx) { cufftResult cufftStatus; if (fftPlan_ == 0) { cufftStatus = - cufftPlan1d(&fftPlan_, signalLength_, CUFFT_Z2Z, numChannels_); + cufftPlan1d(&fftPlan_, signalLength_, getFFTType(), numChannels_); if (cufftStatus != CUFFT_SUCCESS) { fftPlan_ = 0; throw std::runtime_error("Failed to create CUFFT fftPlan_"); @@ -938,39 +482,52 @@ CUDACorrelation::ComputeConjugateMultiply(int seqIdx) { } // 计算d_signals与d_sequence的共轭乘 - batchConjugateMultiplyKernel<<>>( - d_signals, sequence_ptr, d_results, signalLength_, numChannels_); - - CHECK_CUDA_ERROR(cudaDeviceSynchronize()); - CHECK_CUDA_ERROR(cudaGetLastError()); + // configureKernel(signalLength_ * numChannels_); + if constexpr (std::is_same_v) { + 
batchConjugateMultiplyKernelFloat<<>>( + reinterpret_cast(d_signals), sequence_ptr, + reinterpret_cast(d_results), signalLength_, + numChannels_); + } else { + batchConjugateMultiplyKernelDouble<<>>( + reinterpret_cast(d_signals), sequence_ptr, + reinterpret_cast(d_results), signalLength_, + numChannels_); + } // 计算 d_results 的IFFT - cufftStatus = cufftExecZ2Z(fftPlan_, d_results, d_results, CUFFT_INVERSE); - if (cufftStatus != CUFFT_SUCCESS) { - throw std::runtime_error("Failed to execute inverse FFT"); + if constexpr (std::is_same_v) { + cufftStatus = cufftExecC2C( + fftPlan_, reinterpret_cast(d_results), + reinterpret_cast(d_results), CUFFT_INVERSE); + if (cufftStatus != CUFFT_SUCCESS) { + throw std::runtime_error("Failed to execute inverse FFT"); + } + } else { + cufftStatus = cufftExecZ2Z( + fftPlan_, reinterpret_cast(d_results), + reinterpret_cast(d_results), CUFFT_INVERSE); + if (cufftStatus != CUFFT_SUCCESS) { + throw std::runtime_error("Failed to execute inverse FFT"); + } } - // 对 d_results 进行归一化计算(d_results/scale) - double scale = (double)(1.0) / signalLength_; - if (cublas_handle_ == nullptr) { - cublasCreate(&cublas_handle_); - cublasSetStream(cublas_handle_, stream_); - } - cublasZdscal(cublas_handle_, signalsNum_, &scale, d_results, 1); CHECK_CUDA_ERROR(cudaDeviceSynchronize()); // 拷贝结果回主机 size_t resultLength = signalLength_ - sequenceLength; - size_t resultSize = resultLength * sizeof(cufftDoubleComplex); - std::vector>> results( - numChannels_, std::vector>(resultLength)); + size_t resultSize = resultLength * sizeof(CUDAComplex); + Complex2D results(numChannels_, Complex1D(resultLength)); CHECK_CUDA_ERROR( cudaMemcpy(h_results, d_results, signalsSize_, cudaMemcpyDeviceToHost)); - for (int i = 0; i < numChannels_; i++) { - memcpy(results[i].data(), h_results + i * signalLength_, resultSize); +#pragma omp parallel for num_threads(std::min(numChannels_, 16)) + { + for (int i = 0; i < numChannels_; i++) { + memcpy(results[i].data(), h_results + i * 
signalLength_, resultSize); + } } - return results; + return std::move(results); } catch (const std::exception& e) { std::cerr << "CUDA error: " << e.what() << std::endl; diff --git a/cuda_correlation.h b/cuda_correlation.h index 0ab00dbeadac198c877d502b54f12bc21fe9973b..ef29e8e83b18f4eec6b86a74ebe69ebc7c1877bf 100644 --- a/cuda_correlation.h +++ b/cuda_correlation.h @@ -5,6 +5,7 @@ #include #include +#include #include // =======条件编译宏====== @@ -15,34 +16,33 @@ // 对于二维vector打平为一维(二维vector内存上不连续,不能直接copy到显存),便于一次copy数据到显存计算 // #define USE_OPT_CUDAMEMCPY +template class CUDACorrelation { + static_assert(std::is_same::value || std::is_same::value, + "Only float and double types are supported"); + public: + using RealType = T; + using Complex = std::complex; + using Complex1D = std::vector; + using Complex2D = std::vector; + using CUDAComplex = + typename std::conditional::value, cufftComplex, + cufftDoubleComplex>::type; + CUDACorrelation(); ~CUDACorrelation(); // 预先计算sequence的fft - void InitSequence(std::vector>> VecSequence, - int fftLength); + void ComputeSequenceFFT(Complex2D VecSequence, int fftLength); // 预先计算signals的fft - void ComputeSignalsFFT( - const std::vector>>& signals); - - std::vector>> ProcessBatch( - const std::vector>>& signals, - const std::vector>& sequence); - - // 需预先计算sequence的fft - // 根据seqIdx,获取相应的sequence的fft,与signals计算共轭乘 - std::vector>> ProcessBatch( - const std::vector>>& signals, - int seqIdx); + void ComputeSignalsFFT(const Complex2D& signalDatas); // 需预先计算sequence的fft // 需预先计算signals的fft // 计算signalsFFT与相应的sequenceFFT(根据seqIdx)的共轭乘(归一化) - std::vector>> ComputeConjugateMultiply( - int seqIdx); + Complex2D ComputeConjugateMultiply(int seqIdx); private: cudaStream_t stream_; @@ -57,21 +57,28 @@ class CUDACorrelation { size_t fftLength_; size_t signalsNum_; - cufftDoubleComplex* d_signals; - cufftDoubleComplex* d_sequence; - cufftDoubleComplex* d_results; - cufftDoubleComplex* h_results; + CUDAComplex* d_signals; + CUDAComplex* d_sequence; 
+ CUDAComplex* d_results; + CUDAComplex* h_results; // 保存原始每个sequence的长度 std::vector vecSequenceLength_; - std::vector> vecSequenceFFT_; + std::vector> vecSequenceFFT_; dim3 block_; dim3 grid_; + // 计算最优线程数配置 void configureKernel(int dataSize); + // 申请显存 bool AllocateMemory(int length, int channels); + + // 分配锁页内存 + void smartAllocPinned(CUDAComplex** pHost, size_t size); + + // 资源释放 void FreeMemory(); void Cleanup(); }; diff --git a/droneifiqparse.cpp b/droneifiqparse.cpp index 9c1b568b92db975c7f1728efece540f3501d191b..cb0fe23b326137614d30f73d8fa4dd713a82f477 100644 --- a/droneifiqparse.cpp +++ b/droneifiqparse.cpp @@ -1,6 +1,6 @@ -#include "droneifiqparse.h" +#include -#include +#include "droneifiqparse.h" DroneIFIQParse::DroneIFIQParse() {} @@ -19,8 +19,8 @@ unsigned int DroneIFIQParse::GetDataTag(char *buf) { return BytesToIntInv(buf, 8); } -float DroneIFIQParse::GetFrequency(char *buf) { - float freq = 0; +Real DroneIFIQParse::GetFrequency(char *buf) { + Real freq = 0; memcpy(&freq, buf + 12, 4); return freq; @@ -36,12 +36,12 @@ int DroneIFIQParse::GetChannelNumber(char *buf) { void DroneIFIQParse::ResolveIQData( char *buf, unsigned int SampleCount, int ChannelCount, - std::vector>> &vecsignal) { + std::vector>> &vecsignal) { int startpos = 28; int offset = 4 * ChannelCount; for (int c = 0; c < ChannelCount; ++c) { - std::vector> channelData; + std::vector> channelData; channelData.reserve(SampleCount); for (int k = 0; k < SampleCount; ++k) { channelData.emplace_back( @@ -93,7 +93,8 @@ void DroneIFIQParse::Resolve8CHDFIQData(char *buf, unsigned int SampleCount, int dataindex = 0; - for (int k = 0; k < SampleCount; ++k) // 理论上 这是 一次 的 所有Idata1 Qdata1 Idata2 Qdata2 + for (int k = 0; k < SampleCount; + ++k) // 理论上 这是 一次 的 所有Idata1 Qdata1 Idata2 Qdata2 { vecFrameIdata1.push_back(BytesToShortInv(buf, startpos + 0 + dataindex)); vecFrameQdata1.push_back(BytesToShortInv(buf, startpos + 2 + dataindex)); diff --git a/droneifiqparse.h b/droneifiqparse.h index 
556cfe50fdf13b21d56131257976cf60cf250d76..ca81bca734c33aa847cc3674ea249abd2c3f0fcf 100644 --- a/droneifiqparse.h +++ b/droneifiqparse.h @@ -5,6 +5,8 @@ #include #include +#include "calculatemovingcorrelation.h" + class DroneIFIQParse { public: DroneIFIQParse(); @@ -12,18 +14,18 @@ class DroneIFIQParse { unsigned int GetDataSize(char* buf); unsigned int GetDataTag(char* buf); - float GetFrequency(char* buf); + Real GetFrequency(char* buf); unsigned int GetSamplePoint(char* buf); int GetChannelNumber(char* buf); void ResolveIQData(char* buf, unsigned int SampleCount, int ChannelCount, - std::vector>>& vecsignal); + std::vector>>& vecsignal); void Resolve8CHDFIQData(char* buf, unsigned int SampleCount, int ChannelCount, QVector>& WholeIdata, QVector>& WholeQdata); - std::vector>> vecsignal_; + std::vector>> vecsignal_; private: short BytesToShortInv(char* buf, int startpos); diff --git a/mainwindow.cpp b/mainwindow.cpp index 572ecb32e4c83911e1cee8d493413e2287292919..0c673143b71f34e4a2365120284c0be87442ad5a 100644 --- a/mainwindow.cpp +++ b/mainwindow.cpp @@ -1,6 +1,4 @@ -#include "mainwindow.h" - -#include +#include #include #include #include @@ -8,6 +6,7 @@ #include #include "log.h" +#include "mainwindow.h" using namespace std; @@ -35,7 +34,7 @@ void MainWindow::InitConnect() { } int MainWindow::CalculateRoutine( - const std::vector>> &vecsignal) { + const std::vector>> &vecsignal) { return m_calMC.CalMovingCorrlationRoutine(vecsignal); } @@ -177,9 +176,9 @@ void MainWindow::ReplayIQDataParse(char *buf) { // // 计算总流程 获得最终结果 1--找到相关峰 0--未找到相关峰 // int result = CalculateRoutine(vecIdata, vecQdata); - channelnumber = 8; //原逻辑也是只取了前8个通道 - std::vector>> vecsignal( - channelnumber, std::vector>(SamplePoints)); + channelnumber = 8; //原逻辑也是只取了前8个通道 + std::vector>> vecsignal( + channelnumber, std::vector>(SamplePoints)); m_droneIQParse.ResolveIQData(buf, SamplePoints, channelnumber, vecsignal); QElapsedTimer tm; diff --git a/mainwindow.h b/mainwindow.h index 
b0db523286d497a5ee9b120b6ccf783c2fafb534..f2c3db35b8840a996183297f11dae4f107992cc9 100644 --- a/mainwindow.h +++ b/mainwindow.h @@ -11,6 +11,7 @@ #include "calculatemovingcorrelation.h" #include "droneifiqparse.h" + using namespace std; class MainWindow : public QMainWindow { @@ -29,7 +30,7 @@ class MainWindow : public QMainWindow { QVector> &WholeQdata); int CalculateRoutine( - const std::vector>> &vecsignal); + const std::vector>> &vecsignal); private: // 获取测试数据文件中 每一帧数据的帧头下标