diff --git a/.gitignore b/.gitignore index 605f7b6e3fe0a2779e496e7d83b82eacd90dd987..ea7a941a6308c375e0ad824ba5cb2302e4113243 100644 --- a/.gitignore +++ b/.gitignore @@ -32,5 +32,6 @@ *.app /build +/build1 /fetchcontent /.vscode \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index ea3dd03f3f5df8af1674eb1c35213dcb0cb62a50..5485f9b587e53a1e044507ca50a48ef8fde4edf9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,6 +9,7 @@ if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0") endif() set(CMAKE_EXPORT_COMPILE_COMMANDS on) +set(CMAKE_VERBOSE_MAKEFILE ON) set(CUDA_KERNELS_DIR "${PROJECT_SOURCE_DIR}") @@ -19,9 +20,10 @@ set(TEST_DIR "${PROJECT_SOURCE_DIR}/test") list(APPEND CMAKE_MODULE_PATH ${CUDA_KERNELS_DIR}/cmake) include(FetchGTest) include(FetchSpdlog) -include(FetchTorch) -set(CMAKE_PREFIX_PATH ${TORCH_SOURCE_DIR}/share/cmake/Torch) -find_package(Torch REQUIRED) +# include(FetchTorch) +# set(CMAKE_PREFIX_PATH ${TORCH_SOURCE_DIR}/share/cmake/Torch) +# find_package(Torch REQUIRED) +set(CMAKE_BUILD_TYPE Release) function(build_cuda_kernel OP_NAME) file(GLOB_RECURSE OP_CPP_FILES "${SRC_DIR}/${OP_NAME}/*.cpp") @@ -32,7 +34,6 @@ function(build_cuda_kernel OP_NAME) PRIVATE spdlog gtest_main - ${TORCH_LIBRARIES} ) target_include_directories(test_${OP_NAME} PRIVATE ${INCLUDE_DIR}/${OP_NAME}) set_target_properties(test_${OP_NAME} PROPERTIES CUDA_ARCHITECTURES "80;90") @@ -41,4 +42,4 @@ function(build_cuda_kernel OP_NAME) endfunction(build_cuda_kernel) build_cuda_kernel(pointwise) -build_cuda_kernel(conv_fwd) \ No newline at end of file +# build_cuda_kernel(conv_fwd) \ No newline at end of file diff --git a/build.sh b/build.sh index 4c42d4c0df106daa73262637432470e33e3a6481..34a9df1a7671cc00d515b0ad9a5eca6a00db654a 100755 --- a/build.sh +++ b/build.sh @@ -3,8 +3,8 @@ script_path=$(readlink -f "$0") # 获取脚本所在目录 script_dir=$(dirname "$script_path") # 创建build目录 -build_dir="${script_dir}/build" -if [[ -d "$build_dir" ]]; then 
+build_dir="${script_dir}/build1" +if [ -d "$build_dir" ]; then cd $build_dir else mkdir -p $build_dir diff --git a/operator/pointwise/gmem_bw.cu b/operator/pointwise/gmem_bw.cu new file mode 100644 index 0000000000000000000000000000000000000000..3be0cd62c91c202fdcaa74d19a29cfdf8bc8c012 --- /dev/null +++ b/operator/pointwise/gmem_bw.cu @@ -0,0 +1,44 @@ + +#include "gmem_bw.h" +const size_t MAX_BLOCKS = 8192; +__global__ void cuda_add_ori(float *out, float *x, float y, size_t N) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t num_threads_in_grid = size_t(blockDim.x * gridDim.x); + for (size_t i = idx; i < N; i += num_threads_in_grid) { + out[i] = x[i] + y; + } +} +__global__ void cuda_add_opt1(float *out, float *x, float y, size_t N) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t num_threads_in_grid = size_t(blockDim.x * gridDim.x); + for(size_t i = idx; i < N/4; i += num_threads_in_grid) { + float4 x4 = reinterpret_cast<float4 *>(x)[i]; + float4 out4; + out4.x = x4.x + y; + out4.y = x4.y + y; + out4.z = x4.z + y; + out4.w = x4.w + y; + reinterpret_cast<float4 *>(out)[i] = out4; + } + + // in only one thread, process final elements (if there are any) + size_t remainder = N%4; + if (idx==N/4 && remainder!=0) { + while(remainder) { + size_t idx = N - remainder--; + out[idx] = x[idx] + y; + } + } +} + +void gmem_bw_ori(float *out, float *x, float y, cudaStream_t stream, size_t N) { + constexpr size_t threads_per_block = 128; + auto num_blocks = min(size_t((N + threads_per_block - 1) / threads_per_block), MAX_BLOCKS); + cuda_add_ori<<<num_blocks, threads_per_block, 0, stream>>>(out, x, y, N); +} + +void gmem_bw_opt1(float *out, float *x, float y, cudaStream_t stream, size_t N) { + constexpr size_t threads_per_block = 128; + auto num_blocks = min(size_t((N + threads_per_block * 4 - 1) / (threads_per_block * 4)), MAX_BLOCKS); + cuda_add_opt1<<<num_blocks, threads_per_block, 0, stream>>>(out, x, y, N); +} \ No newline at end of file diff --git a/operator/pointwise/gmem_bw.h b/operator/pointwise/gmem_bw.h new file mode 100644 index 
0000000000000000000000000000000000000000..44bfb90b4a56345033bea27f9fb11e4b56406ae3 --- /dev/null +++ b/operator/pointwise/gmem_bw.h @@ -0,0 +1,4 @@ +#pragma once +#include <cuda_runtime.h> +void gmem_bw_ori(float *out, float *x, float y, cudaStream_t stream, size_t N); +void gmem_bw_opt1(float *out, float *x, float y, cudaStream_t stream, size_t N); \ No newline at end of file diff --git a/operator/pointwise/l2mem_bw.cu b/operator/pointwise/l2mem_bw.cu new file mode 100644 index 0000000000000000000000000000000000000000..d6914a1745cc887a50cac8d82e2340556b289600 --- /dev/null +++ b/operator/pointwise/l2mem_bw.cu @@ -0,0 +1,54 @@ + +#include "l2mem_bw.h" +const size_t MAX_BLOCKS = 8192; +__global__ void cuda_add_l2_ori(float *out, float *x, float y, size_t N) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t num_threads_in_grid = size_t(blockDim.x * gridDim.x); + for (int j = 0; j < 100; j++) { + for (size_t i = idx; i < N; i += num_threads_in_grid) { + size_t iidx = (i + j * blockDim.x * 4) & 0x3fffff; + out[iidx] = x[iidx] + y; + } + } +} +__global__ void cuda_add_l2_opt1(float *out, float *x, float y, size_t N) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t num_threads_in_grid = size_t(blockDim.x * gridDim.x); + for (int j = 0; j < 100; j++) { + for (size_t i = idx; i < N / 4; i += num_threads_in_grid) { + float4 x4 = reinterpret_cast<float4 *>(x)[i]; + float4 out4; + out4.x = x4.x + y; + out4.y = x4.y + y; + out4.z = x4.z + y; + out4.w = x4.w + y; + reinterpret_cast<float4 *>(out)[i] = out4; + } + + // in only one thread, process final elements (if there are any) + size_t remainder = N % 4; + if (idx == N / 4 && remainder != 0) { + while (remainder) { + size_t idx = N - remainder--; + out[idx] = x[idx] + y; + } + } + } +} + +void l2mem_bw_ori(float *out, float *x, float y, cudaStream_t stream, + size_t N) { + constexpr size_t threads_per_block = 128; + auto num_blocks = + min(size_t((N + threads_per_block - 1) / threads_per_block), MAX_BLOCKS); + 
cuda_add_l2_ori<<<num_blocks, threads_per_block, 0, stream>>>(out, x, y, N); +} + +void l2mem_bw_opt1(float *out, float *x, float y, cudaStream_t stream, + size_t N) { + constexpr size_t threads_per_block = 128; + auto num_blocks = + min(size_t((N + threads_per_block * 4 - 1) / (threads_per_block * 4)), + MAX_BLOCKS); + cuda_add_l2_opt1<<<num_blocks, threads_per_block, 0, stream>>>(out, x, y, N); +} \ No newline at end of file diff --git a/operator/pointwise/l2mem_bw.h b/operator/pointwise/l2mem_bw.h new file mode 100644 index 0000000000000000000000000000000000000000..a108b70832f82724660a2c3d40b360eca478d7e3 --- /dev/null +++ b/operator/pointwise/l2mem_bw.h @@ -0,0 +1,5 @@ +#pragma once +#include <cuda_runtime.h> +void l2mem_bw_ori(float *out, float *x, float y, cudaStream_t stream, size_t N); +void l2mem_bw_opt1(float *out, float *x, float y, cudaStream_t stream, + size_t N); \ No newline at end of file diff --git a/operator/pointwise/reg_bw.cu b/operator/pointwise/reg_bw.cu new file mode 100644 index 0000000000000000000000000000000000000000..08cf0b7b54ba4232aa96e8541c1869fa636152b1 --- /dev/null +++ b/operator/pointwise/reg_bw.cu @@ -0,0 +1,38 @@ + +#include "reg_bw.h" +const size_t MAX_BLOCKS = 8192; +const int32_t THR_SIZE = 64; +__global__ void cuda_add_reg_ori(float *out, float *x, float y, size_t N) { + size_t num_in_grid = size_t(blockDim.x * gridDim.x * THR_SIZE); + float tmp[THR_SIZE]; + int tmp2[THR_SIZE]; + for (size_t n = 0; n < N; n += num_in_grid) { + for (size_t i = 0; i < THR_SIZE; i += 1) { + size_t idx = + n + blockIdx.x * blockDim.x * THR_SIZE + threadIdx.x + blockDim.x * i; + tmp[i] = x[idx]; + tmp2[i] = int(x[idx]); + } + int y2 = int(y); + for (size_t j = 0; j < 1000; j++) { +#pragma unroll + for (size_t i = 0; i < THR_SIZE; i += 1) { + tmp2[i] = tmp2[i] + y2; + tmp[i] = tmp[i] + y; + } + } + for (size_t i = 0; i < THR_SIZE; i += 1) { + size_t idx = + n + blockIdx.x * blockDim.x * THR_SIZE + threadIdx.x + blockDim.x * i; + out[idx] = tmp[i] + tmp2[i]; + } + } +} + +void reg_bw_ori(float *out, float *x, float y, cudaStream_t stream, size_t N) 
{ + constexpr size_t threads_per_block = 128; + auto num_blocks = min(size_t((N + threads_per_block * THR_SIZE - 1) / + (threads_per_block * THR_SIZE)), + MAX_BLOCKS); + cuda_add_reg_ori<<<num_blocks, threads_per_block, 0, stream>>>(out, x, y, N); +} diff --git a/operator/pointwise/reg_bw.h b/operator/pointwise/reg_bw.h new file mode 100644 index 0000000000000000000000000000000000000000..e5014092c286e1537e81b04bcf5774981ab4d953 --- /dev/null +++ b/operator/pointwise/reg_bw.h @@ -0,0 +1,3 @@ +#pragma once +#include <cuda_runtime.h> +void reg_bw_ori(float *out, float *x, float y, cudaStream_t stream, size_t N); diff --git a/operator/pointwise/shm_bw.cu b/operator/pointwise/shm_bw.cu new file mode 100644 index 0000000000000000000000000000000000000000..f34f909fadc1af861c438d0e6022606591a367ec --- /dev/null +++ b/operator/pointwise/shm_bw.cu @@ -0,0 +1,42 @@ + +#include "shm_bw.h" +const size_t MAX_BLOCKS = 8192; +const size_t THR_SIZE = 64; +__global__ void cuda_add_shm_ori(float *out, float *x, float y, size_t N) { + extern __shared__ float sharedMem[]; + size_t num_in_grid = size_t(blockDim.x * gridDim.x * THR_SIZE); + float tmp[THR_SIZE]; + for (size_t n = 0; n < N; n += num_in_grid) { + for (size_t i = threadIdx.x; i < blockDim.x * THR_SIZE; i += blockDim.x) { + sharedMem[i] = x[n + blockIdx.x * blockDim.x * THR_SIZE + i]; + } + __syncthreads(); +#pragma unroll + for (size_t j = 0; j < 1000; j++) { +#pragma unroll + for (size_t i = 0; i < THR_SIZE; i += 1) { + size_t idx = threadIdx.x + blockDim.x * i; + tmp[i] = sharedMem[idx] + y; + } +#pragma unroll + for (size_t i = 0; i < THR_SIZE; i += 1) { + size_t idx = threadIdx.x + blockDim.x * i; + sharedMem[idx] = tmp[i]; + } + } + __syncthreads(); + for (size_t i = threadIdx.x; i < blockDim.x * THR_SIZE; i += blockDim.x) { + out[n + blockIdx.x * blockDim.x * THR_SIZE + i] = sharedMem[i]; + } + __syncthreads(); + } +} + +void shm_bw_ori(float *out, float *x, float y, cudaStream_t stream, size_t N) { + constexpr size_t threads_per_block = 128; + auto num_blocks = min(size_t((N + 
threads_per_block * THR_SIZE - 1) / + (threads_per_block * THR_SIZE)), + MAX_BLOCKS); + cuda_add_shm_ori<<<num_blocks, threads_per_block, threads_per_block * THR_SIZE * sizeof(float), stream>>>(out, x, y, N); +} diff --git a/operator/pointwise/shm_bw.h b/operator/pointwise/shm_bw.h new file mode 100644 index 0000000000000000000000000000000000000000..60cd8471909435090be7a3db231a47f7389e8531 --- /dev/null +++ b/operator/pointwise/shm_bw.h @@ -0,0 +1,3 @@ +#pragma once +#include <cuda_runtime.h> +void shm_bw_ori(float *out, float *x, float y, cudaStream_t stream, size_t N); diff --git a/reg_bw.ncu-rep b/reg_bw.ncu-rep new file mode 100644 index 0000000000000000000000000000000000000000..3eb0695a969ca2b07112ed1fb71f9b2738060e0f Binary files /dev/null and b/reg_bw.ncu-rep differ diff --git a/reg_bw1.ncu-rep b/reg_bw1.ncu-rep new file mode 100644 index 0000000000000000000000000000000000000000..882218192b7d8903c77cff5c6314274d4a90d97a Binary files /dev/null and b/reg_bw1.ncu-rep differ diff --git a/test/pointwise/test_gmem_bw.cu b/test/pointwise/test_gmem_bw.cu new file mode 100644 index 0000000000000000000000000000000000000000..312e8c30c43176f8a09e53ca8eb4ded31f148886 --- /dev/null +++ b/test/pointwise/test_gmem_bw.cu @@ -0,0 +1,44 @@ +#include "gmem_bw.h" +#include <gtest/gtest.h> +#include <spdlog/spdlog.h> +#include <vector> + +bool TestGmemBW(size_t N) { + spdlog::info("Create tensors..."); + float *d_x = nullptr; + float *d_out = nullptr; + + cudaMalloc(&d_x, N * sizeof(*d_x)); + cudaMalloc(&d_out, N * sizeof(*d_x)); + + std::vector<float> x(N, 1.0f); + // Copy vectors to device + cudaMemcpy(d_x, x.data(), N * sizeof(*d_x), cudaMemcpyHostToDevice); + + // Perform the addition. + cudaStream_t stream; + cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); + gmem_bw_ori(d_out, d_x, 1.0, stream, N); + gmem_bw_ori(d_out, d_x, 1.0, stream, N); + gmem_bw_opt1(d_out, d_x, 1.0, stream, N); + cudaStreamSynchronize(stream); + std::vector<float> out(N, -1.0f); + cudaMemcpy(out.data(), d_out, N * sizeof(*d_out), cudaMemcpyDeviceToHost); + + cudaFree(d_x); + cudaFree(d_out); + + // Verify that returned results are OK. 
+ float max_err = -std::numeric_limits<float>::infinity(); + for (size_t i = 0; i < N; ++i) { + max_err = std::max(max_err, std::abs(out[i] - (x[i] + 1.0f))); + } + spdlog::info("[gmem_bw] Max error: {}", max_err); + + return true; +} + +TEST(test_gmem_bw, EXAMPLE) { + // EXPECT_TRUE(TestGmemBW(1024 * 1024 * 1024)); + EXPECT_TRUE(TestGmemBW(1024 * 1024 * 4)); +} diff --git a/test/pointwise/test_l2mem_bw.cu b/test/pointwise/test_l2mem_bw.cu new file mode 100644 index 0000000000000000000000000000000000000000..e72ac9af4dbcd8daeb7ea14aea3ceadfbbc05948 --- /dev/null +++ b/test/pointwise/test_l2mem_bw.cu @@ -0,0 +1,41 @@ +#include "l2mem_bw.h" +#include <gtest/gtest.h> +#include <spdlog/spdlog.h> +#include <vector> + +bool TestL2memBW(size_t N) { + spdlog::info("Create tensors..."); + float *d_x = nullptr; + float *d_out = nullptr; + + cudaMalloc(&d_x, N * sizeof(*d_x)); + cudaMalloc(&d_out, N * sizeof(*d_x)); + + std::vector<float> x(N, 1.0f); + // Copy vectors to device + cudaMemcpy(d_x, x.data(), N * sizeof(*d_x), cudaMemcpyHostToDevice); + + // Perform the addition. + cudaStream_t stream; + cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); + l2mem_bw_ori(d_out, d_x, 1.0, stream, N); + l2mem_bw_ori(d_out, d_x, 1.0, stream, N); + l2mem_bw_opt1(d_out, d_x, 1.0, stream, N); + cudaStreamSynchronize(stream); + std::vector<float> out(N, -1.0f); + cudaMemcpy(out.data(), d_out, N * sizeof(*d_out), cudaMemcpyDeviceToHost); + + cudaFree(d_x); + cudaFree(d_out); + + // Verify that returned results are OK. 
+ float max_err = -std::numeric_limits<float>::infinity(); + for (size_t i = 0; i < N; ++i) { + max_err = std::max(max_err, std::abs(out[i] - (x[i] + 1.0f))); + } + spdlog::info("[l2mem_bw] Max error: {}", max_err); + + return true; +} + +TEST(test_l2mem_bw, EXAMPLE) { EXPECT_TRUE(TestL2memBW(1024 * 1024 * 4)); } diff --git a/test/pointwise/test_reg_bw.cu b/test/pointwise/test_reg_bw.cu new file mode 100644 index 0000000000000000000000000000000000000000..8c8a6f7b82d26d72e60cf3590306c4a1909db6be --- /dev/null +++ b/test/pointwise/test_reg_bw.cu @@ -0,0 +1,40 @@ +#include "reg_bw.h" +#include <gtest/gtest.h> +#include <spdlog/spdlog.h> +#include <vector> + +bool TestRegBW(size_t N) { + spdlog::info("Create tensors..."); + float *d_x = nullptr; + float *d_out = nullptr; + + cudaMalloc(&d_x, N * sizeof(*d_x)); + cudaMalloc(&d_out, N * sizeof(*d_x)); + + std::vector<float> x(N, 1.0f); + // Copy vectors to device + cudaMemcpy(d_x, x.data(), N * sizeof(*d_x), cudaMemcpyHostToDevice); + + // Perform the addition. + cudaStream_t stream; + cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); + reg_bw_ori(d_out, d_x, 1.0, stream, N); + reg_bw_ori(d_out, d_x, 1.0, stream, N); + cudaStreamSynchronize(stream); + std::vector<float> out(N, -1.0f); + cudaMemcpy(out.data(), d_out, N * sizeof(*d_out), cudaMemcpyDeviceToHost); + + cudaFree(d_x); + cudaFree(d_out); + + // Verify that returned results are OK. 
+ float max_err = -std::numeric_limits<float>::infinity(); + for (size_t i = 0; i < N; ++i) { + max_err = std::max(max_err, std::abs(out[i] - (x[i] + 100.0f))); + } + spdlog::info("[reg_bw] Max error: {}", max_err); + + return true; +} + +TEST(test_reg_bw, EXAMPLE) { EXPECT_TRUE(TestRegBW(128 * 64 * 8192)); } diff --git a/test/pointwise/test_shm_bw.cu b/test/pointwise/test_shm_bw.cu new file mode 100644 index 0000000000000000000000000000000000000000..8141709106711e4ab3974339fcde3ea560092255 --- /dev/null +++ b/test/pointwise/test_shm_bw.cu @@ -0,0 +1,40 @@ +#include "shm_bw.h" +#include <gtest/gtest.h> +#include <spdlog/spdlog.h> +#include <vector> + +bool TestShmBW(size_t N) { + spdlog::info("Create tensors..."); + float *d_x = nullptr; + float *d_out = nullptr; + + cudaMalloc(&d_x, N * sizeof(*d_x)); + cudaMalloc(&d_out, N * sizeof(*d_x)); + + std::vector<float> x(N, 1.0f); + // Copy vectors to device + cudaMemcpy(d_x, x.data(), N * sizeof(*d_x), cudaMemcpyHostToDevice); + + // Perform the addition. + cudaStream_t stream; + cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); + shm_bw_ori(d_out, d_x, 1.0, stream, N); + shm_bw_ori(d_out, d_x, 1.0, stream, N); + cudaStreamSynchronize(stream); + std::vector<float> out(N, -1.0f); + cudaMemcpy(out.data(), d_out, N * sizeof(*d_out), cudaMemcpyDeviceToHost); + + cudaFree(d_x); + cudaFree(d_out); + + // Verify that returned results are OK. 
+ float max_err = -std::numeric_limits<float>::infinity(); + for (size_t i = 0; i < N; ++i) { + max_err = std::max(max_err, std::abs(out[i] - (x[i] + 100.0f))); + } + spdlog::info("[shm_bw] Max error: {}", max_err); + + return true; +} + +TEST(test_shm_bw, EXAMPLE) { EXPECT_TRUE(TestShmBW(128 * 64 * 8192)); } diff --git a/test/pointwise/test_vec_add.cu b/test/pointwise/test_vec_add.cu index fcb4f5806961ad1d24a44171e65f1e5674914acb..330ed448a07b2ee2737292d5029acb9183b5bb8c 100644 --- a/test/pointwise/test_vec_add.cu +++ b/test/pointwise/test_vec_add.cu @@ -2,7 +2,6 @@ #include <gtest/gtest.h> #include <spdlog/spdlog.h> #include <vector> -#include <torch/torch.h> bool TestVectorAdd(int N) { spdlog::info("Create tensors..."); @@ -42,4 +41,4 @@ bool TestVectorAdd(int N) { return true; } -TEST(test_vec_add, EXAMPLE) { EXPECT_TRUE(TestVectorAdd(10000)); } +TEST(test_vec_add, EXAMPLE) { EXPECT_TRUE(TestVectorAdd(1024 * 1024 * 1024)); }