Update params for KernelConfig and add basic tests for getThreadId
Some checks are pending
Build and Test / build-and-test (push) Waiting to run
Some checks are pending
Build and Test / build-and-test (push) Waiting to run
This commit is contained in:
parent
9825c0d14d
commit
8dec472929
5 changed files with 63 additions and 12 deletions
|
@ -16,7 +16,7 @@ set(CMAKE_CXX_STANDARD 17)
|
|||
|
||||
# Add debug configuration
|
||||
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
|
||||
set(CMAKE_CUDA_FLAGS_DEBUG "-g -G -O0 --generate-line-info")
|
||||
set(CMAKE_CUDA_FLAGS_DEBUG "-g -G -O0")
|
||||
elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
|
||||
set(CMAKE_CUDA_FLAGS_RELEASE "-O3 --use_fast_math")
|
||||
elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
|
||||
|
|
|
@ -13,12 +13,12 @@ void KernelConfig::print() const {
|
|||
total_threads());
|
||||
}
|
||||
|
||||
KernelConfig get_launch_config(size_t n_elements, int threads_per_block,
|
||||
int max_blocks_per_dim) {
|
||||
KernelConfig get_launch_config(size_t n_elements, size_t threads_per_block,
|
||||
size_t max_blocks_per_dim) {
|
||||
|
||||
// Ensure threads_per_block is valid
|
||||
threads_per_block = std::min(threads_per_block, 1024);
|
||||
threads_per_block = std::max(threads_per_block, 32);
|
||||
threads_per_block = std::min(threads_per_block, (size_t)1024);
|
||||
threads_per_block = std::max(threads_per_block, (size_t)32);
|
||||
|
||||
// Calculate total blocks needed
|
||||
size_t total_blocks =
|
||||
|
@ -66,8 +66,8 @@ KernelConfig get_launch_config_advanced(size_t n_elements, int device_id) {
|
|||
cudaDeviceProp prop;
|
||||
cudaGetDeviceProperties(&prop, device_id);
|
||||
|
||||
int threads_per_block = get_optimal_block_size(device_id);
|
||||
int max_blocks_per_dim = prop.maxGridSize[0];
|
||||
size_t threads_per_block = get_optimal_block_size(device_id);
|
||||
size_t max_blocks_per_dim = prop.maxGridSize[0];
|
||||
|
||||
return get_launch_config(n_elements, threads_per_block, max_blocks_per_dim);
|
||||
}
|
||||
|
|
|
@ -28,17 +28,19 @@ struct KernelConfig {
|
|||
* @param max_blocks_per_dim Maximum blocks per grid dimension (default: 65535)
|
||||
* @return KernelConfig with optimal grid and block dimensions
|
||||
*/
|
||||
KernelConfig get_launch_config(size_t n_elements, int threads_per_block = 256,
|
||||
int max_blocks_per_dim = 65535);
|
||||
KernelConfig get_launch_config(size_t n_elements,
|
||||
size_t threads_per_block = 256,
|
||||
size_t max_blocks_per_dim = 65535);
|
||||
|
||||
/**
|
||||
* Calculate 1D thread index for kernels launched with get_launch_config()
|
||||
* Use this inside your CUDA kernels
|
||||
*/
|
||||
__device__ inline size_t get_thread_id() {
  // Flatten the 3D block coordinate into one linear block number (x varies
  // fastest, then y, then z). The first operand is widened to size_t so every
  // multiplication and addition below is carried out in size_t.
  const size_t block_row = (size_t)blockIdx.z * gridDim.y + blockIdx.y;
  const size_t linear_block = block_row * gridDim.x + blockIdx.x;

  // Only the x extent of the block contributes: blockDim.x and threadIdx.x are
  // the sole per-thread terms used.
  return linear_block * blockDim.x + threadIdx.x;
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -3,6 +3,7 @@ include_directories(${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR})
|
|||
add_executable(${NAME}_cuda_tests
|
||||
test_potential.cu
|
||||
test_forces.cu
|
||||
test_kernel_config.cu
|
||||
)
|
||||
|
||||
target_link_libraries(${NAME}_cuda_tests gtest gtest_main)
|
||||
|
|
48
tests/cuda_unit_tests/test_kernel_config.cu
Normal file
48
tests/cuda_unit_tests/test_kernel_config.cu
Normal file
|
@ -0,0 +1,48 @@
|
|||
|
||||
#include "kernel_config.cuh"
|
||||
#include <cuda_runtime.h>
|
||||
#include <gtest/gtest.h>
|
||||
#include <numeric>
|
||||
#include <vector>
|
||||
|
||||
// Kernel to test the get_thread_id() function
|
||||
// Each thread stores its own get_thread_id() value at output[that value];
// threads whose id is at or beyond n_elements write nothing.
__global__ void test_get_thread_id_kernel(size_t *output, size_t n_elements) {
  const size_t tid = get_thread_id();
  if (tid >= n_elements) {
    return;  // outside the buffer: nothing to record
  }
  output[tid] = tid;
}
|
||||
|
||||
// Test fixture for kernel config tests
|
||||
// GoogleTest fixture used by the kernel-config tests. Both hooks are currently
// empty and act only as placeholders for shared setup/cleanup.
class KernelConfigTest : public ::testing::Test {
protected:
  // Placeholder: no common resources are created before a test.
  void SetUp() override {}

  // Placeholder: no resources are released after a test.
  void TearDown() override {}
};
|
||||
|
||||
// Launches test_get_thread_id_kernel with the configuration returned by
// get_launch_config(n_elements) and checks that slot i of the output buffer
// holds exactly i for every i in [0, n_elements).
//
// Every CUDA call is checked. The first failure is remembered in `status`,
// the remaining device work is skipped, the buffer is still freed, and only
// then is the failure reported, so a failing run does not leak the allocation.
TEST_F(KernelConfigTest, GetThreadId) {
  const size_t n_elements = 10000;
  const size_t n_bytes = n_elements * sizeof(size_t);
  KernelConfig config = get_launch_config(n_elements);

  size_t *d_output = nullptr;
  cudaError_t status = cudaMalloc(&d_output, n_bytes);
  // Nothing has been allocated if this fails, so returning here leaks nothing.
  ASSERT_EQ(status, cudaSuccess) << cudaGetErrorString(status);

  // Fill the buffer with 0xFF bytes so that any slot the kernel fails to write
  // reads back as an all-ones value, which can never equal a valid index.
  status = cudaMemset(d_output, 0xFF, n_bytes);

  if (status == cudaSuccess) {
    test_get_thread_id_kernel<<<config.blocks, config.threads>>>(d_output,
                                                                 n_elements);
    // A kernel launch returns no status itself; configuration errors are
    // reported through cudaGetLastError().
    status = cudaGetLastError();
  }
  if (status == cudaSuccess) {
    // Surface faults raised while the kernel was executing.
    status = cudaDeviceSynchronize();
  }

  std::vector<size_t> h_output(n_elements);
  if (status == cudaSuccess) {
    status =
        cudaMemcpy(h_output.data(), d_output, n_bytes, cudaMemcpyDeviceToHost);
  }

  // Release the device buffer before asserting: a fatal assertion returns from
  // the test body immediately.
  const cudaError_t free_status = cudaFree(d_output);

  ASSERT_EQ(status, cudaSuccess) << cudaGetErrorString(status);
  ASSERT_EQ(free_status, cudaSuccess) << cudaGetErrorString(free_status);

  std::vector<size_t> expected(n_elements);
  std::iota(expected.begin(), expected.end(), 0);

  ASSERT_EQ(h_output, expected);
}
|
Loading…
Add table
Add a link
Reference in a new issue