Update params for KernelConfig and add basic tests for getThreadId
Some checks are pending
Build and Test / build-and-test (push) Waiting to run
Some checks are pending
Build and Test / build-and-test (push) Waiting to run
This commit is contained in:
parent
9825c0d14d
commit
8dec472929
5 changed files with 63 additions and 12 deletions
|
@ -16,7 +16,7 @@ set(CMAKE_CXX_STANDARD 17)
|
||||||
|
|
||||||
# Add debug configuration
|
# Add debug configuration
|
||||||
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
|
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
|
||||||
set(CMAKE_CUDA_FLAGS_DEBUG "-g -G -O0 --generate-line-info")
|
set(CMAKE_CUDA_FLAGS_DEBUG "-g -G -O0")
|
||||||
elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
|
elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
|
||||||
set(CMAKE_CUDA_FLAGS_RELEASE "-O3 --use_fast_math")
|
set(CMAKE_CUDA_FLAGS_RELEASE "-O3 --use_fast_math")
|
||||||
elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
|
elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
|
||||||
|
|
|
@ -13,12 +13,12 @@ void KernelConfig::print() const {
|
||||||
total_threads());
|
total_threads());
|
||||||
}
|
}
|
||||||
|
|
||||||
KernelConfig get_launch_config(size_t n_elements, int threads_per_block,
|
KernelConfig get_launch_config(size_t n_elements, size_t threads_per_block,
|
||||||
int max_blocks_per_dim) {
|
size_t max_blocks_per_dim) {
|
||||||
|
|
||||||
// Ensure threads_per_block is valid
|
// Ensure threads_per_block is valid
|
||||||
threads_per_block = std::min(threads_per_block, 1024);
|
threads_per_block = std::min(threads_per_block, (size_t)1024);
|
||||||
threads_per_block = std::max(threads_per_block, 32);
|
threads_per_block = std::max(threads_per_block, (size_t)32);
|
||||||
|
|
||||||
// Calculate total blocks needed
|
// Calculate total blocks needed
|
||||||
size_t total_blocks =
|
size_t total_blocks =
|
||||||
|
@ -66,8 +66,8 @@ KernelConfig get_launch_config_advanced(size_t n_elements, int device_id) {
|
||||||
cudaDeviceProp prop;
|
cudaDeviceProp prop;
|
||||||
cudaGetDeviceProperties(&prop, device_id);
|
cudaGetDeviceProperties(&prop, device_id);
|
||||||
|
|
||||||
int threads_per_block = get_optimal_block_size(device_id);
|
size_t threads_per_block = get_optimal_block_size(device_id);
|
||||||
int max_blocks_per_dim = prop.maxGridSize[0];
|
size_t max_blocks_per_dim = prop.maxGridSize[0];
|
||||||
|
|
||||||
return get_launch_config(n_elements, threads_per_block, max_blocks_per_dim);
|
return get_launch_config(n_elements, threads_per_block, max_blocks_per_dim);
|
||||||
}
|
}
|
||||||
|
|
|
@ -28,17 +28,19 @@ struct KernelConfig {
|
||||||
* @param max_blocks_per_dim Maximum blocks per grid dimension (default: 65535)
|
* @param max_blocks_per_dim Maximum blocks per grid dimension (default: 65535)
|
||||||
* @return KernelConfig with optimal grid and block dimensions
|
* @return KernelConfig with optimal grid and block dimensions
|
||||||
*/
|
*/
|
||||||
KernelConfig get_launch_config(size_t n_elements, int threads_per_block = 256,
|
KernelConfig get_launch_config(size_t n_elements,
|
||||||
int max_blocks_per_dim = 65535);
|
size_t threads_per_block = 256,
|
||||||
|
size_t max_blocks_per_dim = 65535);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Calculate 1D thread index for kernels launched with get_launch_config()
|
* Calculate 1D thread index for kernels launched with get_launch_config()
|
||||||
* Use this inside your CUDA kernels
|
* Use this inside your CUDA kernels
|
||||||
*/
|
*/
|
||||||
__device__ inline size_t get_thread_id() {
|
__device__ inline size_t get_thread_id() {
|
||||||
return (size_t)blockIdx.z * gridDim.x * gridDim.y * blockDim.x +
|
size_t index = (size_t)blockIdx.z * gridDim.x * gridDim.y * blockDim.x +
|
||||||
(size_t)blockIdx.y * gridDim.x * blockDim.x +
|
(size_t)blockIdx.y * gridDim.x * blockDim.x +
|
||||||
(size_t)blockIdx.x * blockDim.x + threadIdx.x;
|
(size_t)blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
return index;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -3,6 +3,7 @@ include_directories(${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR})
|
||||||
add_executable(${NAME}_cuda_tests
|
add_executable(${NAME}_cuda_tests
|
||||||
test_potential.cu
|
test_potential.cu
|
||||||
test_forces.cu
|
test_forces.cu
|
||||||
|
test_kernel_config.cu
|
||||||
)
|
)
|
||||||
|
|
||||||
target_link_libraries(${NAME}_cuda_tests gtest gtest_main)
|
target_link_libraries(${NAME}_cuda_tests gtest gtest_main)
|
||||||
|
|
48
tests/cuda_unit_tests/test_kernel_config.cu
Normal file
48
tests/cuda_unit_tests/test_kernel_config.cu
Normal file
|
@ -0,0 +1,48 @@
|
||||||
|
|
||||||
|
#include "kernel_config.cuh"
|
||||||
|
#include <cuda_runtime.h>
|
||||||
|
#include <gtest/gtest.h>
|
||||||
|
#include <numeric>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
// Kernel to test the get_thread_id() function
|
||||||
|
__global__ void test_get_thread_id_kernel(size_t *output, size_t n_elements) {
|
||||||
|
size_t i = get_thread_id();
|
||||||
|
if (i < n_elements) {
|
||||||
|
output[i] = i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test fixture for kernel config tests
|
||||||
|
class KernelConfigTest : public ::testing::Test {
|
||||||
|
protected:
|
||||||
|
void SetUp() override {
|
||||||
|
// Set up any common resources for the tests
|
||||||
|
}
|
||||||
|
|
||||||
|
void TearDown() override {
|
||||||
|
// Clean up any resources
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
TEST_F(KernelConfigTest, GetThreadId) {
|
||||||
|
const size_t n_elements = 10000;
|
||||||
|
KernelConfig config = get_launch_config(n_elements);
|
||||||
|
|
||||||
|
size_t *d_output;
|
||||||
|
cudaMalloc(&d_output, n_elements * sizeof(size_t));
|
||||||
|
|
||||||
|
test_get_thread_id_kernel<<<config.blocks, config.threads>>>(d_output,
|
||||||
|
n_elements);
|
||||||
|
|
||||||
|
std::vector<size_t> h_output(n_elements);
|
||||||
|
cudaMemcpy(h_output.data(), d_output, n_elements * sizeof(size_t),
|
||||||
|
cudaMemcpyDeviceToHost);
|
||||||
|
|
||||||
|
cudaFree(d_output);
|
||||||
|
|
||||||
|
std::vector<size_t> expected(n_elements);
|
||||||
|
std::iota(expected.begin(), expected.end(), 0);
|
||||||
|
|
||||||
|
ASSERT_EQ(h_output, expected);
|
||||||
|
}
|
Loading…
Add table
Add a link
Reference in a new issue