diff --git a/.gitignore b/.gitignore index 605f7b6e3fe0a2779e496e7d83b82eacd90dd987..ea7a941a6308c375e0ad824ba5cb2302e4113243 100644 --- a/.gitignore +++ b/.gitignore @@ -32,5 +32,6 @@ *.app /build +/build1 /fetchcontent /.vscode \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index ea3dd03f3f5df8af1674eb1c35213dcb0cb62a50..5485f9b587e53a1e044507ca50a48ef8fde4edf9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,6 +9,7 @@ if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0") endif() set(CMAKE_EXPORT_COMPILE_COMMANDS on) +set(CMAKE_VERBOSE_MAKEFILE ON) set(CUDA_KERNELS_DIR "${PROJECT_SOURCE_DIR}") @@ -19,9 +20,10 @@ set(TEST_DIR "${PROJECT_SOURCE_DIR}/test") list(APPEND CMAKE_MODULE_PATH ${CUDA_KERNELS_DIR}/cmake) include(FetchGTest) include(FetchSpdlog) -include(FetchTorch) -set(CMAKE_PREFIX_PATH ${TORCH_SOURCE_DIR}/share/cmake/Torch) -find_package(Torch REQUIRED) +# include(FetchTorch) +# set(CMAKE_PREFIX_PATH ${TORCH_SOURCE_DIR}/share/cmake/Torch) +# find_package(Torch REQUIRED) +set(CMAKE_BUILD_TYPE Release) function(build_cuda_kernel OP_NAME) file(GLOB_RECURSE OP_CPP_FILES "${SRC_DIR}/${OP_NAME}/*.cpp") @@ -32,7 +34,6 @@ function(build_cuda_kernel OP_NAME) PRIVATE spdlog gtest_main - ${TORCH_LIBRARIES} ) target_include_directories(test_${OP_NAME} PRIVATE ${INCLUDE_DIR}/${OP_NAME}) set_target_properties(test_${OP_NAME} PROPERTIES CUDA_ARCHITECTURES "80;90") @@ -41,4 +42,4 @@ function(build_cuda_kernel OP_NAME) endfunction(build_cuda_kernel) build_cuda_kernel(pointwise) -build_cuda_kernel(conv_fwd) \ No newline at end of file +# build_cuda_kernel(conv_fwd) \ No newline at end of file diff --git a/build.sh b/build.sh index 4c42d4c0df106daa73262637432470e33e3a6481..34a9df1a7671cc00d515b0ad9a5eca6a00db654a 100755 --- a/build.sh +++ b/build.sh @@ -3,8 +3,8 @@ script_path=$(readlink -f "$0") # 获取脚本所在目录 script_dir=$(dirname "$script_path") # 创建build目录 -build_dir="${script_dir}/build" -if [[ -d "$build_dir" ]]; then 
+build_dir="${script_dir}/build1" +if [ -d "$build_dir" ]; then cd $build_dir else mkdir -p $build_dir diff --git a/operator/pointwise/gmem_bw.cu b/operator/pointwise/gmem_bw.cu new file mode 100644 index 0000000000000000000000000000000000000000..3be0cd62c91c202fdcaa74d19a29cfdf8bc8c012 --- /dev/null +++ b/operator/pointwise/gmem_bw.cu @@ -0,0 +1,44 @@ + +#include "gmem_bw.h" +const size_t MAX_BLOCKS = 8192; +__global__ void cuda_add_ori(float *out, float *x, float y, size_t N) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t num_threads_in_grid = size_t(blockDim.x * gridDim.x); + for (size_t i = idx; i < N; i += num_threads_in_grid) { + out[i] = x[i] + y; + } +} +__global__ void cuda_add_opt1(float *out, float *x, float y, size_t N) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t num_threads_in_grid = size_t(blockDim.x * gridDim.x); + for(size_t i = idx; i < N/4; i += num_threads_in_grid) { + float4 x4 = reinterpret_cast<float4 *>(x)[i]; + float4 out4; + out4.x = x4.x + y; + out4.y = x4.y + y; + out4.z = x4.z + y; + out4.w = x4.w + y; + reinterpret_cast<float4 *>(out)[i] = out4; + } + + // in only one thread, process final elements (if there are any) + size_t remainder = N%4; + if (idx==N/4 && remainder!=0) { + while(remainder) { + size_t idx = N - remainder--; + out[idx] = x[idx] + y; + } + } +} + +void gmem_bw_ori(float *out, float *x, float y, cudaStream_t stream, size_t N) { + constexpr size_t threads_per_block = 128; + auto num_blocks = min(size_t((N + threads_per_block - 1) / threads_per_block), MAX_BLOCKS); + cuda_add_ori<<<num_blocks, threads_per_block, 0, stream>>>(out, x, y, N); +} + +void gmem_bw_opt1(float *out, float *x, float y, cudaStream_t stream, size_t N) { + constexpr size_t threads_per_block = 128; + auto num_blocks = min(size_t((N + threads_per_block * 4 - 1) / (threads_per_block * 4)), MAX_BLOCKS); + cuda_add_opt1<<<num_blocks, threads_per_block, 0, stream>>>(out, x, y, N); +} \ No newline at end of file diff --git a/operator/pointwise/gmem_bw.h b/operator/pointwise/gmem_bw.h new file mode 100644 index 
0000000000000000000000000000000000000000..44bfb90b4a56345033bea27f9fb11e4b56406ae3 --- /dev/null +++ b/operator/pointwise/gmem_bw.h @@ -0,0 +1,4 @@ +#pragma once +#include <cuda_runtime.h> +void gmem_bw_ori(float *out, float *x, float y, cudaStream_t stream, size_t N); +void gmem_bw_opt1(float *out, float *x, float y, cudaStream_t stream, size_t N); \ No newline at end of file diff --git a/operator/pointwise/l2mem_bw.cu b/operator/pointwise/l2mem_bw.cu new file mode 100644 index 0000000000000000000000000000000000000000..d6914a1745cc887a50cac8d82e2340556b289600 --- /dev/null +++ b/operator/pointwise/l2mem_bw.cu @@ -0,0 +1,54 @@ + +#include "l2mem_bw.h" +const size_t MAX_BLOCKS = 8192; +__global__ void cuda_add_l2_ori(float *out, float *x, float y, size_t N) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t num_threads_in_grid = size_t(blockDim.x * gridDim.x); + for (int j = 0; j < 100; j++) { + for (size_t i = idx; i < N; i += num_threads_in_grid) { + size_t iidx = (i + j * blockDim.x * 4) & 0x3fffff; + out[iidx] = x[iidx] + y; + } + } +} +__global__ void cuda_add_l2_opt1(float *out, float *x, float y, size_t N) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t num_threads_in_grid = size_t(blockDim.x * gridDim.x); + for (int j = 0; j < 100; j++) { + for (size_t i = idx; i < N / 4; i += num_threads_in_grid) { + float4 x4 = reinterpret_cast<float4 *>(x)[i]; + float4 out4; + out4.x = x4.x + y; + out4.y = x4.y + y; + out4.z = x4.z + y; + out4.w = x4.w + y; + reinterpret_cast<float4 *>(out)[i] = out4; + } + + // in only one thread, process final elements (if there are any) + size_t remainder = N % 4; + if (idx == N / 4 && remainder != 0) { + while (remainder) { + size_t idx = N - remainder--; + out[idx] = x[idx] + y; + } + } + } +} + +void l2mem_bw_ori(float *out, float *x, float y, cudaStream_t stream, + size_t N) { + constexpr size_t threads_per_block = 128; + auto num_blocks = + min(size_t((N + threads_per_block - 1) / threads_per_block), MAX_BLOCKS); + 
cuda_add_l2_ori<<<num_blocks, threads_per_block, 0, stream>>>(out, x, y, N); +} + +void l2mem_bw_opt1(float *out, float *x, float y, cudaStream_t stream, + size_t N) { + constexpr size_t threads_per_block = 128; + auto num_blocks = + min(size_t((N + threads_per_block * 4 - 1) / (threads_per_block * 4)), + MAX_BLOCKS); + cuda_add_l2_opt1<<<num_blocks, threads_per_block, 0, stream>>>(out, x, y, N); +} \ No newline at end of file diff --git a/operator/pointwise/l2mem_bw.h b/operator/pointwise/l2mem_bw.h new file mode 100644 index 0000000000000000000000000000000000000000..a108b70832f82724660a2c3d40b360eca478d7e3 --- /dev/null +++ b/operator/pointwise/l2mem_bw.h @@ -0,0 +1,5 @@ +#pragma once +#include <cuda_runtime.h> +void l2mem_bw_ori(float *out, float *x, float y, cudaStream_t stream, size_t N); +void l2mem_bw_opt1(float *out, float *x, float y, cudaStream_t stream, + size_t N); \ No newline at end of file diff --git a/operator/pointwise/reg_bw.cu b/operator/pointwise/reg_bw.cu new file mode 100644 index 0000000000000000000000000000000000000000..08cf0b7b54ba4232aa96e8541c1869fa636152b1 --- /dev/null +++ b/operator/pointwise/reg_bw.cu @@ -0,0 +1,38 @@ + +#include "reg_bw.h" +const size_t MAX_BLOCKS = 8192; +const int32_t THR_SIZE = 64; +__global__ void cuda_add_reg_ori(float *out, float *x, float y, size_t N) { + size_t num_in_grid = size_t(blockDim.x * gridDim.x * THR_SIZE); + float tmp[THR_SIZE]; + int tmp2[THR_SIZE]; + for (size_t n = 0; n < N; n += num_in_grid) { + for (size_t i = 0; i < THR_SIZE; i += 1) { + size_t idx = + n + blockIdx.x * blockDim.x * THR_SIZE + threadIdx.x + blockDim.x * i; + tmp[i] = x[idx]; + tmp2[i] = int(x[idx]); + } + int y2 = int(y); + for (size_t j = 0; j < 1000; j++) { +#pragma unroll + for (size_t i = 0; i < THR_SIZE; i += 1) { + tmp2[i] = tmp2[i] + y2; + tmp[i] = tmp[i] + y; + } + } + for (size_t i = 0; i < THR_SIZE; i += 1) { + size_t idx = + n + blockIdx.x * blockDim.x * THR_SIZE + threadIdx.x + blockDim.x * i; + out[idx] = tmp[i] + tmp2[i]; + } + } +} + +void reg_bw_ori(float *out, float *x, float y, cudaStream_t stream, size_t N) 
{ + constexpr size_t threads_per_block = 128; + auto num_blocks = min(size_t((N + threads_per_block * THR_SIZE - 1) / + (threads_per_block * THR_SIZE)), + MAX_BLOCKS); + cuda_add_reg_ori<<<num_blocks, threads_per_block, 0, stream>>>(out, x, y, N); +} diff --git a/operator/pointwise/reg_bw.h b/operator/pointwise/reg_bw.h new file mode 100644 index 0000000000000000000000000000000000000000..e5014092c286e1537e81b04bcf5774981ab4d953 --- /dev/null +++ b/operator/pointwise/reg_bw.h @@ -0,0 +1,3 @@ +#pragma once +#include <cuda_runtime.h> +void reg_bw_ori(float *out, float *x, float y, cudaStream_t stream, size_t N); diff --git a/operator/pointwise/shm_bw.cu b/operator/pointwise/shm_bw.cu new file mode 100644 index 0000000000000000000000000000000000000000..f34f909fadc1af861c438d0e6022606591a367ec --- /dev/null +++ b/operator/pointwise/shm_bw.cu @@ -0,0 +1,42 @@ + +#include "shm_bw.h" +const size_t MAX_BLOCKS = 8192; +const size_t THR_SIZE = 64; +__global__ void cuda_add_shm_ori(float *out, float *x, float y, size_t N) { + extern __shared__ float sharedMem[]; + size_t num_in_grid = size_t(blockDim.x * gridDim.x * THR_SIZE); + float tmp[THR_SIZE]; + for (size_t n = 0; n < N; n += num_in_grid) { + for (size_t i = threadIdx.x; i < blockDim.x * THR_SIZE; i += blockDim.x) { + sharedMem[i] = x[n + blockIdx.x * blockDim.x * THR_SIZE + i]; + } + __syncthreads(); +#pragma unroll + for (size_t j = 0; j < 1000; j++) { +#pragma unroll + for (size_t i = 0; i < THR_SIZE; i += 1) { + size_t idx = threadIdx.x + blockDim.x * i; + tmp[i] = sharedMem[idx] + y; + } +#pragma unroll + for (size_t i = 0; i < THR_SIZE; i += 1) { + size_t idx = threadIdx.x + blockDim.x * i; + sharedMem[idx] = tmp[i]; + } + } + __syncthreads(); + for (size_t i = threadIdx.x; i < blockDim.x * THR_SIZE; i += blockDim.x) { + out[n + blockIdx.x * blockDim.x * THR_SIZE + i] = sharedMem[i]; + } + __syncthreads(); + } +} + +void shm_bw_ori(float *out, float *x, float y, cudaStream_t stream, size_t N) { + constexpr size_t threads_per_block = 128; + auto num_blocks = min(size_t((N + 
threads_per_block * THR_SIZE - 1) / + (threads_per_block * THR_SIZE)), + MAX_BLOCKS); + cuda_add_shm_ori<<<num_blocks, threads_per_block, threads_per_block * THR_SIZE * sizeof(float), stream>>>(out, x, y, N); +} diff --git a/operator/pointwise/shm_bw.h b/operator/pointwise/shm_bw.h new file mode 100644 index 0000000000000000000000000000000000000000..60cd8471909435090be7a3db231a47f7389e8531 --- /dev/null +++ b/operator/pointwise/shm_bw.h @@ -0,0 +1,3 @@ +#pragma once +#include <cuda_runtime.h> +void shm_bw_ori(float *out, float *x, float y, cudaStream_t stream, size_t N); diff --git a/reg_bw.ncu-rep b/reg_bw.ncu-rep new file mode 100644 index 0000000000000000000000000000000000000000..3eb0695a969ca2b07112ed1fb71f9b2738060e0f Binary files /dev/null and b/reg_bw.ncu-rep differ diff --git a/reg_bw1.ncu-rep b/reg_bw1.ncu-rep new file mode 100644 index 0000000000000000000000000000000000000000..882218192b7d8903c77cff5c6314274d4a90d97a Binary files /dev/null and b/reg_bw1.ncu-rep differ diff --git a/test/pointwise/test_gmem_bw.cu b/test/pointwise/test_gmem_bw.cu new file mode 100644 index 0000000000000000000000000000000000000000..312e8c30c43176f8a09e53ca8eb4ded31f148886 --- /dev/null +++ b/test/pointwise/test_gmem_bw.cu @@ -0,0 +1,44 @@ +#include "gmem_bw.h" +#include <gtest/gtest.h> +#include <spdlog/spdlog.h> +#include <vector> + +bool TestGmemBW(size_t N) { + spdlog::info("Create tensors..."); + float *d_x = nullptr; + float *d_out = nullptr; + + cudaMalloc(&d_x, N * sizeof(*d_x)); + cudaMalloc(&d_out, N * sizeof(*d_x)); + + std::vector<float> x(N, 1.0f); + // Copy vectors to device + cudaMemcpy(d_x, x.data(), N * sizeof(*d_x), cudaMemcpyHostToDevice); + + // Perform the addition. + cudaStream_t stream; + cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); + gmem_bw_ori(d_out, d_x, 1.0, stream, N); + gmem_bw_ori(d_out, d_x, 1.0, stream, N); + gmem_bw_opt1(d_out, d_x, 1.0, stream, N); + cudaStreamSynchronize(stream); + std::vector<float> out(N, -1.0f); + cudaMemcpy(out.data(), d_out, N * sizeof(*d_out), cudaMemcpyDeviceToHost); + + cudaFree(d_x); + cudaFree(d_out); + + // Verify that returned results are OK. 
+ float max_err = -std::numeric_limits<float>::infinity(); + for (size_t i = 0; i < N; ++i) { + max_err = std::max(max_err, std::abs(out[i] - (x[i] + 1.0f))); + } + spdlog::info("[gmem_bw] Max error: {}", max_err); + + return true; +} + +TEST(test_gmem_bw, EXAMPLE) { + // EXPECT_TRUE(TestGmemBW(1024 * 1024 * 1024)); + EXPECT_TRUE(TestGmemBW(1024 * 1024 * 4)); +} diff --git a/test/pointwise/test_l2mem_bw.cu b/test/pointwise/test_l2mem_bw.cu new file mode 100644 index 0000000000000000000000000000000000000000..e72ac9af4dbcd8daeb7ea14aea3ceadfbbc05948 --- /dev/null +++ b/test/pointwise/test_l2mem_bw.cu @@ -0,0 +1,41 @@ +#include "l2mem_bw.h" +#include <gtest/gtest.h> +#include <spdlog/spdlog.h> +#include <vector> + +bool TestL2memBW(size_t N) { + spdlog::info("Create tensors..."); + float *d_x = nullptr; + float *d_out = nullptr; + + cudaMalloc(&d_x, N * sizeof(*d_x)); + cudaMalloc(&d_out, N * sizeof(*d_x)); + + std::vector<float> x(N, 1.0f); + // Copy vectors to device + cudaMemcpy(d_x, x.data(), N * sizeof(*d_x), cudaMemcpyHostToDevice); + + // Perform the addition. + cudaStream_t stream; + cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); + l2mem_bw_ori(d_out, d_x, 1.0, stream, N); + l2mem_bw_ori(d_out, d_x, 1.0, stream, N); + l2mem_bw_opt1(d_out, d_x, 1.0, stream, N); + cudaStreamSynchronize(stream); + std::vector<float> out(N, -1.0f); + cudaMemcpy(out.data(), d_out, N * sizeof(*d_out), cudaMemcpyDeviceToHost); + + cudaFree(d_x); + cudaFree(d_out); + + // Verify that returned results are OK. 
+ float max_err = -std::numeric_limits<float>::infinity(); + for (size_t i = 0; i < N; ++i) { + max_err = std::max(max_err, std::abs(out[i] - (x[i] + 1.0f))); + } + spdlog::info("[l2mem_bw] Max error: {}", max_err); + + return true; +} + +TEST(test_l2mem_bw, EXAMPLE) { EXPECT_TRUE(TestL2memBW(1024 * 1024 * 4)); } diff --git a/test/pointwise/test_reg_bw.cu b/test/pointwise/test_reg_bw.cu new file mode 100644 index 0000000000000000000000000000000000000000..8c8a6f7b82d26d72e60cf3590306c4a1909db6be --- /dev/null +++ b/test/pointwise/test_reg_bw.cu @@ -0,0 +1,40 @@ +#include "reg_bw.h" +#include <gtest/gtest.h> +#include <spdlog/spdlog.h> +#include <vector> + +bool TestRegBW(size_t N) { + spdlog::info("Create tensors..."); + float *d_x = nullptr; + float *d_out = nullptr; + + cudaMalloc(&d_x, N * sizeof(*d_x)); + cudaMalloc(&d_out, N * sizeof(*d_x)); + + std::vector<float> x(N, 1.0f); + // Copy vectors to device + cudaMemcpy(d_x, x.data(), N * sizeof(*d_x), cudaMemcpyHostToDevice); + + // Perform the addition. + cudaStream_t stream; + cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); + reg_bw_ori(d_out, d_x, 1.0, stream, N); + reg_bw_ori(d_out, d_x, 1.0, stream, N); + cudaStreamSynchronize(stream); + std::vector<float> out(N, -1.0f); + cudaMemcpy(out.data(), d_out, N * sizeof(*d_out), cudaMemcpyDeviceToHost); + + cudaFree(d_x); + cudaFree(d_out); + + // Verify that returned results are OK. 
+ float max_err = -std::numeric_limits<float>::infinity(); + for (size_t i = 0; i < N; ++i) { + max_err = std::max(max_err, std::abs(out[i] - (x[i] + 100.0f))); + } + spdlog::info("[reg_bw] Max error: {}", max_err); + + return true; +} + +TEST(test_reg_bw, EXAMPLE) { EXPECT_TRUE(TestRegBW(128 * 64 * 8192)); } diff --git a/test/pointwise/test_shm_bw.cu b/test/pointwise/test_shm_bw.cu new file mode 100644 index 0000000000000000000000000000000000000000..8141709106711e4ab3974339fcde3ea560092255 --- /dev/null +++ b/test/pointwise/test_shm_bw.cu @@ -0,0 +1,40 @@ +#include "shm_bw.h" +#include <gtest/gtest.h> +#include <spdlog/spdlog.h> +#include <vector> + +bool TestShmBW(size_t N) { + spdlog::info("Create tensors..."); + float *d_x = nullptr; + float *d_out = nullptr; + + cudaMalloc(&d_x, N * sizeof(*d_x)); + cudaMalloc(&d_out, N * sizeof(*d_x)); + + std::vector<float> x(N, 1.0f); + // Copy vectors to device + cudaMemcpy(d_x, x.data(), N * sizeof(*d_x), cudaMemcpyHostToDevice); + + // Perform the addition. + cudaStream_t stream; + cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); + shm_bw_ori(d_out, d_x, 1.0, stream, N); + shm_bw_ori(d_out, d_x, 1.0, stream, N); + cudaStreamSynchronize(stream); + std::vector<float> out(N, -1.0f); + cudaMemcpy(out.data(), d_out, N * sizeof(*d_out), cudaMemcpyDeviceToHost); + + cudaFree(d_x); + cudaFree(d_out); + + // Verify that returned results are OK. 
+ float max_err = -std::numeric_limits<float>::infinity(); + for (size_t i = 0; i < N; ++i) { + max_err = std::max(max_err, std::abs(out[i] - (x[i] + 100.0f))); + } + spdlog::info("[shm_bw] Max error: {}", max_err); + + return true; +} + +TEST(test_shm_bw, EXAMPLE) { EXPECT_TRUE(TestShmBW(128 * 64 * 8192)); } diff --git a/test/pointwise/test_vec_add.cu b/test/pointwise/test_vec_add.cu index fcb4f5806961ad1d24a44171e65f1e5674914acb..330ed448a07b2ee2737292d5029acb9183b5bb8c 100644 --- a/test/pointwise/test_vec_add.cu +++ b/test/pointwise/test_vec_add.cu @@ -2,7 +2,6 @@ #include <gtest/gtest.h> #include <spdlog/spdlog.h> #include <vector> -#include <torch/torch.h> bool TestVectorAdd(int N) { spdlog::info("Create tensors..."); @@ -42,4 +41,4 @@ bool TestVectorAdd(int N) { return true; } -TEST(test_vec_add, EXAMPLE) { EXPECT_TRUE(TestVectorAdd(10000)); } +TEST(test_vec_add, EXAMPLE) { EXPECT_TRUE(TestVectorAdd(1024 * 1024 * 1024)); }