Update params for KernelConfig and add basic tests for getThreadId

2025-09-18 23:47:40 -04:00 · 2025-09-18 23:47:40 -04:00 · 8dec472929
commit 8dec472929
parent 9825c0d14d
5 changed files with 63 additions and 12 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -16,7 +16,7 @@ set(CMAKE_CXX_STANDARD 17)

 # Add debug configuration
 if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-    set(CMAKE_CUDA_FLAGS_DEBUG "-g -G -O0 --generate-line-info")
+    set(CMAKE_CUDA_FLAGS_DEBUG "-g -G -O0")
 elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
    set(CMAKE_CUDA_FLAGS_RELEASE "-O3 --use_fast_math")
 elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
--- a/kernels/kernel_config.cu
+++ b/kernels/kernel_config.cu
@ -13,12 +13,12 @@ void KernelConfig::print() const {
         total_threads());
 }

-KernelConfig get_launch_config(size_t n_elements, int threads_per_block,
-                               int max_blocks_per_dim) {
+KernelConfig get_launch_config(size_t n_elements, size_t threads_per_block,
+                               size_t max_blocks_per_dim) {

  // Ensure threads_per_block is valid
-  threads_per_block = std::min(threads_per_block, 1024);
-  threads_per_block = std::max(threads_per_block, 32);
+  threads_per_block = std::min(threads_per_block, (size_t)1024);
+  threads_per_block = std::max(threads_per_block, (size_t)32);

  // Calculate total blocks needed
  size_t total_blocks =
@ -66,8 +66,8 @@ KernelConfig get_launch_config_advanced(size_t n_elements, int device_id) {
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, device_id);

-  int threads_per_block = get_optimal_block_size(device_id);
-  int max_blocks_per_dim = prop.maxGridSize[0];
+  size_t threads_per_block = get_optimal_block_size(device_id);
+  size_t max_blocks_per_dim = prop.maxGridSize[0];

  return get_launch_config(n_elements, threads_per_block, max_blocks_per_dim);
 }
--- a/kernels/kernel_config.cuh
+++ b/kernels/kernel_config.cuh
@ -28,17 +28,19 @@ struct KernelConfig {
 * @param max_blocks_per_dim Maximum blocks per grid dimension (default: 65535)
 * @return KernelConfig with optimal grid and block dimensions
 */
-KernelConfig get_launch_config(size_t n_elements, int threads_per_block = 256,
-                               int max_blocks_per_dim = 65535);
+KernelConfig get_launch_config(size_t n_elements,
+                               size_t threads_per_block = 256,
+                               size_t max_blocks_per_dim = 65535);

 /**
 * Calculate 1D thread index for kernels launched with get_launch_config()
 * Use this inside your CUDA kernels
 */
 __device__ inline size_t get_thread_id() {
-  return (size_t)blockIdx.z * gridDim.x * gridDim.y * blockDim.x +
-         (size_t)blockIdx.y * gridDim.x * blockDim.x +
-         (size_t)blockIdx.x * blockDim.x + threadIdx.x;
+  // Flatten the 3D block coordinate with x varying fastest, then y, then z.
+  // The leading cast makes the whole expression evaluate in size_t.
+  const size_t block_rank =
+      ((size_t)blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;
+  // Only blockDim.x and threadIdx.x take part in the result.
+  return block_rank * blockDim.x + threadIdx.x;
 }

 /**
--- a/tests/cuda_unit_tests/CMakeLists.txt
+++ b/tests/cuda_unit_tests/CMakeLists.txt
@ -3,6 +3,7 @@ include_directories(${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR})
 add_executable(${NAME}_cuda_tests
    test_potential.cu
    test_forces.cu
+    test_kernel_config.cu
 )

 target_link_libraries(${NAME}_cuda_tests gtest gtest_main)
--- a/tests/cuda_unit_tests/test_kernel_config.cu
+++ b/tests/cuda_unit_tests/test_kernel_config.cu
@ -0,0 +1,48 @@
+
+#include "kernel_config.cuh"
+#include <cuda_runtime.h>
+#include <gtest/gtest.h>
+#include <numeric>
+#include <vector>
+
+// Kernel exercising get_thread_id(): every thread whose flat index is below
+// n_elements stores that index at output[index]; other threads write nothing.
+__global__ void test_get_thread_id_kernel(size_t *output, size_t n_elements) {
+  const size_t tid = get_thread_id();
+  if (tid >= n_elements) {
+    return;
+  }
+  output[tid] = tid;
+}
+
+// Fixture for the kernel-config tests; it holds no shared state at present.
+class KernelConfigTest : public ::testing::Test {
+protected:
+  // Intentionally empty: there is nothing to prepare before a test runs.
+  void SetUp() override {}
+
+  // Intentionally empty: there is nothing to release after a test runs.
+  void TearDown() override {}
+};
+
+// Launches test_get_thread_id_kernel with the configuration returned by
+// get_launch_config() and verifies that element i of the output holds i.
+TEST_F(KernelConfigTest, GetThreadId) {
+  // 10000 is not a multiple of the default threads_per_block (256).
+  const size_t n_elements = 10000;
+  const size_t n_bytes = n_elements * sizeof(size_t);
+  KernelConfig config = get_launch_config(n_elements);
+
+  // Nothing is allocated yet, so a fatal assertion here cannot leak.
+  size_t *d_output = nullptr;
+  const cudaError_t alloc_status = cudaMalloc(&d_output, n_bytes);
+  ASSERT_EQ(alloc_status, cudaSuccess) << cudaGetErrorString(alloc_status);
+
+  test_get_thread_id_kernel<<<config.blocks, config.threads>>>(d_output,
+                                                               n_elements);
+  // A rejected launch is only reported through cudaGetLastError().
+  const cudaError_t launch_status = cudaGetLastError();
+
+  std::vector<size_t> h_output(n_elements);
+  const cudaError_t copy_status =
+      cudaMemcpy(h_output.data(), d_output, n_bytes, cudaMemcpyDeviceToHost);
+
+  // Release the buffer before any fatal assertion: a failing ASSERT_* returns
+  // from the test body immediately and would otherwise skip the cudaFree.
+  const cudaError_t free_status = cudaFree(d_output);
+
+  ASSERT_EQ(launch_status, cudaSuccess) << cudaGetErrorString(launch_status);
+  ASSERT_EQ(copy_status, cudaSuccess) << cudaGetErrorString(copy_status);
+  EXPECT_EQ(free_status, cudaSuccess) << cudaGetErrorString(free_status);
+
+  std::vector<size_t> expected(n_elements);
+  std::iota(expected.begin(), expected.end(), 0);
+
+  ASSERT_EQ(h_output, expected);
+}