diff --git a/kernels/CMakeLists.txt b/kernels/CMakeLists.txt
index f07f1a4..abcaadf 100644
--- a/kernels/CMakeLists.txt
+++ b/kernels/CMakeLists.txt
@@ -3,12 +3,14 @@ project(${NAME}_cuda_lib CUDA CXX)
 set(HEADER_FILES
   potentials/pair_potentials.cuh
   forces.cuh
+  kernel_config.cuh
 )
 set(SOURCE_FILES
+  kernel_config.cu
 )
 
 # The library contains header and source files.
-add_library(${NAME}_cuda_lib INTERFACE
+add_library(${NAME}_cuda_lib STATIC
   ${SOURCE_FILES}
   ${HEADER_FILES}
 )
diff --git a/kernels/forces.cuh b/kernels/forces.cuh
index d8b4c2e..e4f52f1 100644
--- a/kernels/forces.cuh
+++ b/kernels/forces.cuh
@@ -1,5 +1,6 @@
 #ifndef FORCES_CUH
 #define FORCES_CUH
+#include "kernel_config.cuh"
 #include "potentials/pair_potentials.cuh"
 #include "precision.hpp"
 #include <variant>
@@ -18,7 +19,7 @@ __global__ void calc_forces_and_energies(float4 *pos, float4 *force_energies,
                                          int n_particles, real *box_len,
                                          PotentialType potential) {
 
-  int i = blockIdx.x * blockDim.x + threadIdx.x;
+  int i = get_thread_id();
 
   if (i < n_particles) {
     float4 my_pos = pos[i]; // Loads 16 bytes in one transaction
@@ -54,7 +55,7 @@
 inline void launch_force_kernels(float4 *xs, float4 *force_energies,
                                  int n_particles, real *box_len,
                                  std::vector<PairPotential> potentials,
-                                 int grid_size, int block_size) {
+                                 dim3 blocks, dim3 threads_per_block) {
 
   reset_forces_and_energies(n_particles, force_energies);
 
@@ -62,8 +63,9 @@ inline void launch_force_kernels(float4 *xs, float4 *force_energies,
     std::visit(
         [&](const auto &potential) {
           using PotentialType = std::decay_t<decltype(potential)>;
-          calc_forces_and_energies<<<grid_size, block_size>>>(
-              xs, force_energies, n_particles, box_len, potential);
+          calc_forces_and_energies
+              <<<blocks, threads_per_block>>>(xs, force_energies, n_particles,
+                                              box_len, potential);
         },
         potential);
     cudaDeviceSynchronize();
diff --git a/kernels/kernel_config.cu b/kernels/kernel_config.cu
new file mode 100644
index 0000000..3c1644c
--- /dev/null
+++ b/kernels/kernel_config.cu
@@ -0,0 +1,73 @@
+#include "kernel_config.cuh"
+#include <algorithm>
+#include <cstdio>
+
+size_t KernelConfig::total_threads() const {
+  return (size_t)blocks.x * blocks.y * blocks.z * threads.x * threads.y *
+         threads.z;
+}
+
+void KernelConfig::print() const {
+  printf("Grid: (%u, %u, %u), Block: (%u, %u, %u), Total threads: %zu\n",
+         blocks.x, blocks.y, blocks.z, threads.x, threads.y, threads.z,
+         total_threads());
+}
+
+KernelConfig get_launch_config(size_t n_elements, int threads_per_block,
+                               int max_blocks_per_dim) {
+
+  // Ensure threads_per_block is valid
+  threads_per_block = std::min(threads_per_block, 1024);
+  threads_per_block = std::max(threads_per_block, 32);
+
+  // Calculate total blocks needed
+  size_t total_blocks =
+      (n_elements + threads_per_block - 1) / threads_per_block;
+
+  dim3 threads(threads_per_block);
+  dim3 blocks;
+
+  if (total_blocks <= max_blocks_per_dim) {
+    // Simple 1D grid
+    blocks = dim3(total_blocks);
+  } else {
+    // Use 2D grid
+    blocks.x = max_blocks_per_dim;
+    blocks.y = (total_blocks + max_blocks_per_dim - 1) / max_blocks_per_dim;
+
+    // If still too big, use 3D grid
+    if (blocks.y > max_blocks_per_dim) {
+      blocks.y = max_blocks_per_dim;
+      blocks.z =
+          (total_blocks + (size_t)max_blocks_per_dim * max_blocks_per_dim - 1) /
+          ((size_t)max_blocks_per_dim * max_blocks_per_dim);
+    }
+  }
+
+  return KernelConfig(blocks, threads);
+}
+
+int get_optimal_block_size(int device_id) {
+  cudaDeviceProp prop;
+  cudaGetDeviceProperties(&prop, device_id);
+
+  // Use a fraction of max threads per block for better occupancy
+  // Typically 256 or 512 work well for most kernels
+  if (prop.maxThreadsPerBlock >= 1024) {
+    return 256; // Good balance of occupancy and register usage
+  } else if (prop.maxThreadsPerBlock >= 512) {
+    return 256;
+  } else {
+    return prop.maxThreadsPerBlock / 2;
+  }
+}
+
+KernelConfig get_launch_config_advanced(size_t n_elements, int device_id) {
+  cudaDeviceProp prop;
+  cudaGetDeviceProperties(&prop, device_id);
+
+  int threads_per_block = get_optimal_block_size(device_id);
+  int max_blocks_per_dim = prop.maxGridSize[0];
+
+  return get_launch_config(n_elements, threads_per_block, max_blocks_per_dim);
+}
diff --git a/kernels/kernel_config.cuh b/kernels/kernel_config.cuh
new file mode 100644
index 0000000..66364fe
--- /dev/null
+++ b/kernels/kernel_config.cuh
@@ -0,0 +1,83 @@
+#ifndef KERNEL_CONFIG_CUH
+#define KERNEL_CONFIG_CUH
+#include <cstddef>
+#include <cuda_runtime.h>
+
+/**
+ * Structure to hold grid launch configuration
+ */
+struct KernelConfig {
+  dim3 blocks;
+  dim3 threads;
+
+  // Convenience constructor
+  KernelConfig(dim3 b, dim3 t) : blocks(b), threads(t) {}
+
+  // Total number of threads launched
+  size_t total_threads() const;
+
+  // Print configuration for debugging
+  void print() const;
+};
+
+/**
+ * Calculate optimal CUDA launch configuration for a 1D problem
+ *
+ * @param n_elements Number of elements to process
+ * @param threads_per_block Desired threads per block (default: 256)
+ * @param max_blocks_per_dim Maximum blocks per grid dimension (default: 65535)
+ * @return KernelConfig with optimal grid and block dimensions
+ */
+KernelConfig get_launch_config(size_t n_elements, int threads_per_block = 256,
+                               int max_blocks_per_dim = 65535);
+
+/**
+ * Calculate the 1D thread index for kernels launched with get_launch_config().
+ * Use this inside your CUDA kernels.
+ */
+__device__ inline size_t get_thread_id() {
+  return (size_t)blockIdx.z * gridDim.x * gridDim.y * blockDim.x +
+         (size_t)blockIdx.y * gridDim.x * blockDim.x +
+         (size_t)blockIdx.x * blockDim.x + threadIdx.x;
+}
+
+/**
+ * Alternative version that takes grid dimensions as parameters.
+ * Useful if you need the index calculation in multiple places.
+ */
+__device__ inline size_t get_thread_id(dim3 gridDim, dim3 blockDim,
+                                       dim3 blockIdx, dim3 threadIdx) {
+  return (size_t)blockIdx.z * gridDim.x * gridDim.y * blockDim.x +
+         (size_t)blockIdx.y * gridDim.x * blockDim.x +
+         (size_t)blockIdx.x * blockDim.x + threadIdx.x;
+}
+
+/**
+ * GPU device properties helper - gets optimal block size for current device
+ */
+int get_optimal_block_size(int device_id = 0);
+
+/**
+ * Advanced version that considers device properties
+ */
+KernelConfig get_launch_config_advanced(size_t n_elements, int device_id = 0);
+
+// Example usage in your kernel:
+/*
+template <typename PotentialType>
+__global__ void calc_forces_and_energies(float4 *pos, float4 *force_energies,
+                                         size_t n_particles, real *box_len,
+                                         PotentialType potential) {
+
+  size_t i = get_thread_id();
+
+  if (i < n_particles) {
+    // Your existing force calculation code here...
+    float4 my_pos = pos[i];
+    // ... rest of kernel unchanged
+  }
+}
+
+*/
+
+#endif
diff --git a/tests/cuda_unit_tests/test_forces.cu b/tests/cuda_unit_tests/test_forces.cu
index 8d7175e..923ebe5 100644
--- a/tests/cuda_unit_tests/test_forces.cu
+++ b/tests/cuda_unit_tests/test_forces.cu
@@ -5,14 +5,12 @@
 
 // Include your header files
 #include "forces.cuh"
+#include "kernel_config.cuh"
 #include "potentials/pair_potentials.cuh"
 #include "precision.hpp"
 
 class CudaForceKernelTest : public ::testing::Test {
 protected:
-  const int GRID_SIZE = 1;
-  const int BLOCK_SIZE = 4;
-
   void SetUp() override {
     // Set up CUDA device
     cudaError_t err = cudaSetDevice(0);
@@ -61,13 +59,15 @@
     std::vector<float4> force_energies(n_particles,
                                        make_float4(0.0, 0.0, 0.0, 0.0));
 
+    KernelConfig kernel_config = get_launch_config(n_particles);
     float4 *d_positions = allocateAndCopyToGPU(positions);
     float4 *d_force_energies = allocateAndCopyToGPU(force_energies);
     real *d_box_len = allocateAndCopyToGPU(box_dimensions);
 
     std::vector<PairPotential> potentials = {LennardJones(1.0, 1.0, 3.0)};
     CAC::launch_force_kernels(d_positions, d_force_energies, n_particles,
-                              d_box_len, potentials, GRID_SIZE, BLOCK_SIZE);
+                              d_box_len, potentials, kernel_config.blocks,
+                              kernel_config.threads);
 
     checkCudaError(cudaGetLastError(), "kernel launch");
     checkCudaError(cudaDeviceSynchronize(), "kernel execution");
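
Note: for reviewers, here is a minimal, self-contained sketch of how the new launch-config API is intended to be used end to end. The scale kernel, n, and factor are illustrative stand-ins, not part of this patch; only KernelConfig, get_launch_config(), and get_thread_id() come from the diff above.

#include "kernel_config.cuh"
#include <cuda_runtime.h>

// A 1D kernel launched with a config from get_launch_config() derives its
// element index from get_thread_id(), which flattens the (x, y, z) grid
// back to a linear index.
__global__ void scale(float *data, size_t n, float factor) {
  size_t i = get_thread_id();
  if (i < n) // The grid may be over-provisioned, so guard the tail
    data[i] *= factor;
}

int main() {
  const size_t n = 1 << 20;
  float *d_data;
  cudaMalloc(&d_data, n * sizeof(float));

  // 2^20 elements / 256 threads = 4096 blocks, which fits in a single grid
  // dimension; larger problems spill into y and then z automatically.
  KernelConfig cfg = get_launch_config(n);
  cfg.print(); // Grid: (4096, 1, 1), Block: (256, 1, 1), Total threads: 1048576

  scale<<<cfg.blocks, cfg.threads>>>(d_data, n, 2.0f);
  cudaDeviceSynchronize();
  cudaFree(d_data);
  return 0;
}

Because blocks stay one-dimensional and only the grid spills into y and z, existing 1D kernels can switch from blockIdx.x * blockDim.x + threadIdx.x to get_thread_id() without any other changes, which is exactly the substitution the forces.cuh hunk makes.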