diff --git a/CMakeLists.txt b/CMakeLists.txt
index 07aacb5..618c891 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,7 +16,7 @@ set(CMAKE_CXX_STANDARD 17)
 # Add debug configuration
 if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-  set(CMAKE_CUDA_FLAGS_DEBUG "-g -G -O0 --generate-line-info")
+  set(CMAKE_CUDA_FLAGS_DEBUG "-g -G -O0")
 elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
   set(CMAKE_CUDA_FLAGS_RELEASE "-O3 --use_fast_math")
 elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
diff --git a/kernels/kernel_config.cu b/kernels/kernel_config.cu
index 3c1644c..a98a1a1 100644
--- a/kernels/kernel_config.cu
+++ b/kernels/kernel_config.cu
@@ -13,12 +13,12 @@ void KernelConfig::print() const {
          total_threads());
 }
 
-KernelConfig get_launch_config(size_t n_elements, int threads_per_block,
-                               int max_blocks_per_dim) {
+KernelConfig get_launch_config(size_t n_elements, size_t threads_per_block,
+                               size_t max_blocks_per_dim) {
   // Ensure threads_per_block is valid
-  threads_per_block = std::min(threads_per_block, 1024);
-  threads_per_block = std::max(threads_per_block, 32);
+  threads_per_block = std::min(threads_per_block, (size_t)1024);
+  threads_per_block = std::max(threads_per_block, (size_t)32);
 
   // Calculate total blocks needed
   size_t total_blocks =
@@ -66,8 +66,8 @@ KernelConfig get_launch_config_advanced(size_t n_elements, int device_id) {
   cudaDeviceProp prop;
   cudaGetDeviceProperties(&prop, device_id);
 
-  int threads_per_block = get_optimal_block_size(device_id);
-  int max_blocks_per_dim = prop.maxGridSize[0];
+  size_t threads_per_block = get_optimal_block_size(device_id);
+  size_t max_blocks_per_dim = prop.maxGridSize[0];
 
   return get_launch_config(n_elements, threads_per_block, max_blocks_per_dim);
 }
diff --git a/kernels/kernel_config.cuh b/kernels/kernel_config.cuh
index 66364fe..13450cd 100644
--- a/kernels/kernel_config.cuh
+++ b/kernels/kernel_config.cuh
@@ -28,17 +28,19 @@ struct KernelConfig {
  * @param max_blocks_per_dim Maximum blocks per grid dimension (default: 65535)
  * @return LaunchConfig with optimal grid and block dimensions
  */
-KernelConfig get_launch_config(size_t n_elements, int threads_per_block = 256,
-                               int max_blocks_per_dim = 65535);
+KernelConfig get_launch_config(size_t n_elements,
+                               size_t threads_per_block = 256,
+                               size_t max_blocks_per_dim = 65535);
 
 /**
  * Calculate 1D thread index for kernels launched with get_launch_config()
  * Use this inside your CUDA kernels
  */
 __device__ inline size_t get_thread_id() {
-  return (size_t)blockIdx.z * gridDim.x * gridDim.y * blockDim.x +
-         (size_t)blockIdx.y * gridDim.x * blockDim.x +
-         (size_t)blockIdx.x * blockDim.x + threadIdx.x;
+  size_t index = (size_t)blockIdx.z * gridDim.x * gridDim.y * blockDim.x +
+                 (size_t)blockIdx.y * gridDim.x * blockDim.x +
+                 (size_t)blockIdx.x * blockDim.x + threadIdx.x;
+  return index;
 }
 
 /**
diff --git a/tests/cuda_unit_tests/CMakeLists.txt b/tests/cuda_unit_tests/CMakeLists.txt
index 3419e5e..4ead02b 100644
--- a/tests/cuda_unit_tests/CMakeLists.txt
+++ b/tests/cuda_unit_tests/CMakeLists.txt
@@ -3,6 +3,7 @@ include_directories(${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR})
 add_executable(${NAME}_cuda_tests
   test_potential.cu
   test_forces.cu
+  test_kernel_config.cu
 )
 
 target_link_libraries(${NAME}_cuda_tests gtest gtest_main)
diff --git a/tests/cuda_unit_tests/test_kernel_config.cu b/tests/cuda_unit_tests/test_kernel_config.cu
new file mode 100644
index 0000000..a616c19
--- /dev/null
+++ b/tests/cuda_unit_tests/test_kernel_config.cu
@@ -0,0 +1,48 @@
+
+#include "kernel_config.cuh"
+#include <cuda_runtime.h>
+#include <gtest/gtest.h>
+#include <numeric>
+#include <vector>
+
+// Kernel to test the get_thread_id() function
+__global__ void test_get_thread_id_kernel(size_t *output, size_t n_elements) {
+  size_t i = get_thread_id();
+  if (i < n_elements) {
+    output[i] = i;
+  }
+}
+
+// Test fixture for kernel config tests
+class KernelConfigTest : public ::testing::Test {
+protected:
+  void SetUp() override {
+    // Set up any common resources for the tests
+  }
+
+  void TearDown() override {
+    // Clean up any resources
+  }
+};
+
+TEST_F(KernelConfigTest, GetThreadId) {
+  const size_t n_elements = 10000;
+  KernelConfig config = get_launch_config(n_elements);
+
+  size_t *d_output;
+  cudaMalloc(&d_output, n_elements * sizeof(size_t));
+
+  test_get_thread_id_kernel<<<config.grid_dim, config.block_dim>>>(d_output,
+                                                                   n_elements);
+
+  std::vector<size_t> h_output(n_elements);
+  cudaMemcpy(h_output.data(), d_output, n_elements * sizeof(size_t),
+             cudaMemcpyDeviceToHost);
+
+  cudaFree(d_output);
+
+  std::vector<size_t> expected(n_elements);
+  std::iota(expected.begin(), expected.end(), 0);
+
+  ASSERT_EQ(h_output, expected);
+}
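
For reviewers, the sketch below shows the caller-side pattern these helpers are meant to support, mirroring the new test: the device side indexes with get_thread_id() and the host side sizes the launch with get_launch_config(). The kernel name, its arguments, and the KernelConfig member names grid_dim/block_dim are illustrative assumptions, not part of this diff.

// Hypothetical usage sketch (not part of the change). Assumes KernelConfig
// exposes grid_dim and block_dim members, matching the launch in the test.
#include "kernel_config.cuh"

__global__ void scale_kernel(float *data, float factor, size_t n_elements) {
  size_t i = get_thread_id();  // flattened 1D index over the (x,y,z) grid
  if (i < n_elements) {        // guard against the final partial block
    data[i] *= factor;
  }
}

void scale_on_device(float *d_data, float factor, size_t n_elements) {
  // Defaults: 256 threads per block, up to 65535 blocks per grid dimension.
  KernelConfig config = get_launch_config(n_elements);
  scale_kernel<<<config.grid_dim, config.block_dim>>>(d_data, factor, n_elements);
}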