Update parameter types for get_launch_config and add a basic test for get_thread_id
Some checks are pending
Build and Test / build-and-test (push) Waiting to run

This commit is contained in:
Alex Selimov 2025-09-18 23:47:40 -04:00
parent 9825c0d14d
commit 8dec472929
Signed by: aselimov
GPG key ID: 3DDB9C3E023F1F31
5 changed files with 63 additions and 12 deletions

View file

@ -16,7 +16,7 @@ set(CMAKE_CXX_STANDARD 17)
# Add debug configuration # Add debug configuration
if(CMAKE_BUILD_TYPE STREQUAL "Debug") if(CMAKE_BUILD_TYPE STREQUAL "Debug")
set(CMAKE_CUDA_FLAGS_DEBUG "-g -G -O0 --generate-line-info") set(CMAKE_CUDA_FLAGS_DEBUG "-g -G -O0")
elseif(CMAKE_BUILD_TYPE STREQUAL "Release") elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
set(CMAKE_CUDA_FLAGS_RELEASE "-O3 --use_fast_math") set(CMAKE_CUDA_FLAGS_RELEASE "-O3 --use_fast_math")
elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")

View file

@ -13,12 +13,12 @@ void KernelConfig::print() const {
total_threads()); total_threads());
} }
KernelConfig get_launch_config(size_t n_elements, int threads_per_block, KernelConfig get_launch_config(size_t n_elements, size_t threads_per_block,
int max_blocks_per_dim) { size_t max_blocks_per_dim) {
// Ensure threads_per_block is valid // Ensure threads_per_block is valid
threads_per_block = std::min(threads_per_block, 1024); threads_per_block = std::min(threads_per_block, (size_t)1024);
threads_per_block = std::max(threads_per_block, 32); threads_per_block = std::max(threads_per_block, (size_t)32);
// Calculate total blocks needed // Calculate total blocks needed
size_t total_blocks = size_t total_blocks =
@ -66,8 +66,8 @@ KernelConfig get_launch_config_advanced(size_t n_elements, int device_id) {
cudaDeviceProp prop; cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, device_id); cudaGetDeviceProperties(&prop, device_id);
int threads_per_block = get_optimal_block_size(device_id); size_t threads_per_block = get_optimal_block_size(device_id);
int max_blocks_per_dim = prop.maxGridSize[0]; size_t max_blocks_per_dim = prop.maxGridSize[0];
return get_launch_config(n_elements, threads_per_block, max_blocks_per_dim); return get_launch_config(n_elements, threads_per_block, max_blocks_per_dim);
} }

View file

@ -28,17 +28,19 @@ struct KernelConfig {
* @param max_blocks_per_dim Maximum blocks per grid dimension (default: 65535) * @param max_blocks_per_dim Maximum blocks per grid dimension (default: 65535)
* @return LaunchConfig with optimal grid and block dimensions * @return LaunchConfig with optimal grid and block dimensions
*/ */
KernelConfig get_launch_config(size_t n_elements, int threads_per_block = 256, KernelConfig get_launch_config(size_t n_elements,
int max_blocks_per_dim = 65535); size_t threads_per_block = 256,
size_t max_blocks_per_dim = 65535);
/** /**
* Calculate 1D thread index for kernels launched with get_launch_config() * Calculate 1D thread index for kernels launched with get_launch_config()
* Use this inside your CUDA kernels * Use this inside your CUDA kernels
*/ */
__device__ inline size_t get_thread_id() { __device__ inline size_t get_thread_id() {
return (size_t)blockIdx.z * gridDim.x * gridDim.y * blockDim.x + size_t index = (size_t)blockIdx.z * gridDim.x * gridDim.y * blockDim.x +
(size_t)blockIdx.y * gridDim.x * blockDim.x + (size_t)blockIdx.y * gridDim.x * blockDim.x +
(size_t)blockIdx.x * blockDim.x + threadIdx.x; (size_t)blockIdx.x * blockDim.x + threadIdx.x;
return index;
} }
/** /**

View file

@ -3,6 +3,7 @@ include_directories(${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR})
add_executable(${NAME}_cuda_tests add_executable(${NAME}_cuda_tests
test_potential.cu test_potential.cu
test_forces.cu test_forces.cu
test_kernel_config.cu
) )
target_link_libraries(${NAME}_cuda_tests gtest gtest_main) target_link_libraries(${NAME}_cuda_tests gtest gtest_main)

View file

@ -0,0 +1,48 @@
#include "kernel_config.cuh"
#include <cuda_runtime.h>
#include <gtest/gtest.h>
#include <numeric>
#include <vector>
// Test kernel for get_thread_id(): each thread stores the index returned by
// get_thread_id() into output at that same index, so the host can verify the
// mapping element by element.
//
// output      Buffer written by the kernel; indexed up to n_elements - 1.
// n_elements  Number of slots to fill; threads whose index is at or beyond
//             this value write nothing.
__global__ void test_get_thread_id_kernel(size_t *output, size_t n_elements) {
  const size_t tid = get_thread_id();
  // Surplus threads (index past the end of the buffer) exit without writing.
  if (tid >= n_elements) {
    return;
  }
  output[tid] = tid;
}
// GoogleTest fixture used by the kernel config tests. It holds no state at the
// moment; both hooks are intentionally empty placeholders.
class KernelConfigTest : public ::testing::Test {
protected:
  // Per-test setup hook: nothing to prepare yet.
  void SetUp() override {}

  // Per-test teardown hook: nothing to release yet.
  void TearDown() override {}
};
// Launches test_get_thread_id_kernel with the configuration returned by
// get_launch_config() and checks that slot i of the output buffer holds i for
// every i in [0, n_elements).
//
// Every CUDA status is captured and the device buffer is released BEFORE any
// assertion runs, so a failing assertion cannot leak device memory.
TEST_F(KernelConfigTest, GetThreadId) {
  const size_t n_elements = 10000;
  const size_t n_bytes = n_elements * sizeof(size_t);
  KernelConfig config = get_launch_config(n_elements);

  size_t *d_output = nullptr;
  const cudaError_t alloc_status = cudaMalloc(&d_output, n_bytes);
  ASSERT_EQ(alloc_status, cudaSuccess) << cudaGetErrorString(alloc_status);

  // Fill the buffer with 0xFF bytes so every slot starts as SIZE_MAX. A slot
  // the kernel never writes then differs from its expected index instead of
  // holding indeterminate data.
  const cudaError_t memset_status = cudaMemset(d_output, 0xFF, n_bytes);

  test_get_thread_id_kernel<<<config.blocks, config.threads>>>(d_output,
                                                               n_elements);
  // Launch-configuration errors are only reported through cudaGetLastError().
  const cudaError_t launch_status = cudaGetLastError();

  // The blocking copy also waits for the kernel to finish and surfaces any
  // error raised while it was executing.
  std::vector<size_t> h_output(n_elements);
  const cudaError_t copy_status =
      cudaMemcpy(h_output.data(), d_output, n_bytes, cudaMemcpyDeviceToHost);
  const cudaError_t free_status = cudaFree(d_output);

  ASSERT_EQ(memset_status, cudaSuccess) << cudaGetErrorString(memset_status);
  ASSERT_EQ(launch_status, cudaSuccess) << cudaGetErrorString(launch_status);
  ASSERT_EQ(copy_status, cudaSuccess) << cudaGetErrorString(copy_status);
  ASSERT_EQ(free_status, cudaSuccess) << cudaGetErrorString(free_status);

  std::vector<size_t> expected(n_elements);
  std::iota(expected.begin(), expected.end(), 0);
  ASSERT_EQ(h_output, expected);
}