Update params for KernelConfig and add basic tests for getThreadId

2025-09-18 23:47:40 -04:00 · 2025-09-18 23:47:40 -04:00 · 8dec472929
commit 8dec472929
parent 9825c0d14d
5 changed files with 63 additions and 12 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -16,7 +16,7 @@ set(CMAKE_CXX_STANDARD 17)
 # Add debug configuration
 if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-    set(CMAKE_CUDA_FLAGS_DEBUG "-g -G -O0 --generate-line-info")
+    set(CMAKE_CUDA_FLAGS_DEBUG "-g -G -O0")
 elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
    set(CMAKE_CUDA_FLAGS_RELEASE "-O3 --use_fast_math")
 elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
--- a/kernels/kernel_config.cu
+++ b/kernels/kernel_config.cu
@ -13,12 +13,12 @@ void KernelConfig::print() const {
         total_threads());
 }
-KernelConfig get_launch_config(size_t n_elements, int threads_per_block,
+KernelConfig get_launch_config(size_t n_elements, size_t threads_per_block,
-                               int max_blocks_per_dim) {
+                               size_t max_blocks_per_dim) {
  // Ensure threads_per_block is valid
-  threads_per_block = std::min(threads_per_block, 1024);
+  threads_per_block = std::min(threads_per_block, (size_t)1024);
-  threads_per_block = std::max(threads_per_block, 32);
+  threads_per_block = std::max(threads_per_block, (size_t)32);
  // Calculate total blocks needed
  size_t total_blocks =
@ -66,8 +66,8 @@ KernelConfig get_launch_config_advanced(size_t n_elements, int device_id) {
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, device_id);
-  int threads_per_block = get_optimal_block_size(device_id);
+  size_t threads_per_block = get_optimal_block_size(device_id);
-  int max_blocks_per_dim = prop.maxGridSize[0];
+  size_t max_blocks_per_dim = prop.maxGridSize[0];
  return get_launch_config(n_elements, threads_per_block, max_blocks_per_dim);
 }
--- a/kernels/kernel_config.cuh
+++ b/kernels/kernel_config.cuh
@ -28,17 +28,19 @@ struct KernelConfig {
 * @param max_blocks_per_dim Maximum blocks per grid dimension (default: 65535)
 * @return KernelConfig with optimal grid and block dimensions
 */
-KernelConfig get_launch_config(size_t n_elements, int threads_per_block = 256,
+KernelConfig get_launch_config(size_t n_elements,
-                               int max_blocks_per_dim = 65535);
+                               size_t threads_per_block = 256,
                               size_t max_blocks_per_dim = 65535);
 /**
 * Calculate 1D thread index for kernels launched with get_launch_config()
 * Use this inside your CUDA kernels
 */
 __device__ inline size_t get_thread_id() {
-  return (size_t)blockIdx.z * gridDim.x * gridDim.y * blockDim.x +
+  size_t index = (size_t)blockIdx.z * gridDim.x * gridDim.y * blockDim.x +
-         (size_t)blockIdx.y * gridDim.x * blockDim.x +
+                 (size_t)blockIdx.y * gridDim.x * blockDim.x +
-         (size_t)blockIdx.x * blockDim.x + threadIdx.x;
+                 (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  return index;
 }
 /**
--- a/tests/cuda_unit_tests/CMakeLists.txt
+++ b/tests/cuda_unit_tests/CMakeLists.txt
@ -3,6 +3,7 @@ include_directories(${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR})
 add_executable(${NAME}_cuda_tests
    test_potential.cu
    test_forces.cu
    test_kernel_config.cu
 )
 target_link_libraries(${NAME}_cuda_tests gtest gtest_main)
--- a/tests/cuda_unit_tests/test_kernel_config.cu
+++ b/tests/cuda_unit_tests/test_kernel_config.cu
@ -0,0 +1,48 @@
 #include "kernel_config.cuh"
 #include <cuda_runtime.h>
 #include <gtest/gtest.h>
 #include <numeric>
 #include <vector>
 // Test kernel for get_thread_id(): every thread whose flat index is below
 // n_elements stores that index at output[index]; all other threads exit
 // without touching memory.
 __global__ void test_get_thread_id_kernel(size_t *output, size_t n_elements) {
  const size_t tid = get_thread_id();
  if (tid >= n_elements) {
    return;
  }
  output[tid] = tid;
 }
 // Fixture shared by the kernel config tests; it currently carries no state.
 class KernelConfigTest : public ::testing::Test {
 protected:
  // Hook for creating resources common to all tests (nothing needed yet).
  void SetUp() override {}
  // Hook for releasing those resources (nothing to release yet).
  void TearDown() override {}
 };
 // Launches test_get_thread_id_kernel with the configuration returned by
 // get_launch_config() and verifies that element i of the output holds i,
 // i.e. that get_thread_id() yields each index in [0, n_elements) and the
 // kernel writes it to the matching slot. Every CUDA call is checked so that
 // an allocation, launch or copy failure is reported as such instead of
 // showing up as an unexplained data mismatch.
 TEST_F(KernelConfigTest, GetThreadId) {
  // 10000 is not a multiple of the default threads_per_block (256), so the
  // kernel's bounds check matters if the grid is rounded up to whole blocks.
  const size_t n_elements = 10000;
  const size_t n_bytes = n_elements * sizeof(size_t);
  KernelConfig config = get_launch_config(n_elements);
  size_t *d_output = nullptr;
  const cudaError_t alloc_status = cudaMalloc(&d_output, n_bytes);
  ASSERT_EQ(alloc_status, cudaSuccess)
      << "cudaMalloc failed: " << cudaGetErrorString(alloc_status);
  test_get_thread_id_kernel<<<config.blocks, config.threads>>>(d_output,
                                                               n_elements);
  // A launch returns no status; an invalid configuration is only reported
  // through cudaGetLastError().
  const cudaError_t launch_status = cudaGetLastError();
  // The blocking copy waits for the kernel and reports execution errors.
  std::vector<size_t> h_output(n_elements);
  const cudaError_t copy_status =
      cudaMemcpy(h_output.data(), d_output, n_bytes, cudaMemcpyDeviceToHost);
  // Free before asserting so a failed check cannot leak the device buffer.
  const cudaError_t free_status = cudaFree(d_output);
  ASSERT_EQ(launch_status, cudaSuccess)
      << "kernel launch failed: " << cudaGetErrorString(launch_status);
  ASSERT_EQ(copy_status, cudaSuccess)
      << "cudaMemcpy failed: " << cudaGetErrorString(copy_status);
  EXPECT_EQ(free_status, cudaSuccess)
      << "cudaFree failed: " << cudaGetErrorString(free_status);
  std::vector<size_t> expected(n_elements);
  std::iota(expected.begin(), expected.end(), size_t{0});
  ASSERT_EQ(h_output, expected);
 }