#include "kernel_config.cuh"

// NOTE(review): the two system header names were missing from the mangled
// source. <algorithm> (std::min / std::max) and <cstdio> (printf / fprintf)
// are inferred from the names this file uses -- confirm against the original.
#include <algorithm>
#include <cstdio>

// Block size used when the device properties cannot be queried.
static const int kFallbackBlockSize = 256;

// Per-dimension grid limit used when the device properties cannot be queried.
// NOTE(review): 65535 is chosen as a limit that is valid for every grid
// dimension on all CUDA devices -- confirm against the Programming Guide.
static const int kFallbackMaxBlocksPerDim = 65535;

// Ceiling division that cannot wrap around, unlike (a + b - 1) / b.
// `denominator` must be non-zero.
static size_t ceil_div(size_t numerator, size_t denominator) {
    return numerator / denominator + (numerator % denominator != 0 ? 1 : 0);
}

// Maps a device's maxThreadsPerBlock to the block size this file prefers:
// 256 when the device supports at least 512 threads per block, otherwise half
// of the device maximum.
static int block_size_for_max_threads(int max_threads_per_block) {
    if (max_threads_per_block >= 512) {
        return 256;  // Good balance of occupancy and register usage
    }
    return max_threads_per_block / 2;
}

// Fills `*prop` for `device_id`. Returns false (after reporting the CUDA error
// on stderr) when the query fails, in which case `*prop` must not be read.
static bool query_device_properties(int device_id, cudaDeviceProp* prop) {
    const cudaError_t status = cudaGetDeviceProperties(prop, device_id);
    if (status != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceProperties(device %d) failed: %s\n",
                device_id, cudaGetErrorString(status));
        return false;
    }
    return true;
}

// Returns the total number of threads in the launch: the product of all six
// grid and block dimensions, accumulated in size_t.
size_t KernelConfig::total_threads() const {
    const size_t grid_size  = static_cast<size_t>(blocks.x) * blocks.y * blocks.z;
    const size_t block_size = static_cast<size_t>(threads.x) * threads.y * threads.z;
    return grid_size * block_size;
}

// Prints the grid dimensions, block dimensions and total thread count to stdout.
void KernelConfig::print() const {
    printf("Grid: (%u, %u, %u), Block: (%u, %u, %u), Total threads: %zu\n",
           blocks.x, blocks.y, blocks.z,
           threads.x, threads.y, threads.z,
           total_threads());
}

// Builds a launch configuration with a 1D block and a 1D, 2D or 3D grid that
// provides at least one thread per element.
//
// Parameters:
//   n_elements          number of elements to cover.
//   threads_per_block   requested block size; clamped to [32, 1024].
//   max_blocks_per_dim  largest value allowed in a grid dimension; values
//                       below 1 are treated as 1.
//
// The grid stays 1D while the required block count fits in
// max_blocks_per_dim, then spills into y and finally into z. The grid may
// contain more blocks than strictly required, so kernels must bounds-check
// their flattened index. At least one block is always returned, so
// n_elements == 0 still yields a launchable configuration.
KernelConfig get_launch_config(size_t n_elements, int threads_per_block, int max_blocks_per_dim) {
    // Ensure threads_per_block is valid
    threads_per_block = std::max(std::min(threads_per_block, 1024), 32);

    // A non-positive limit would divide by zero below, and a negative one
    // would turn into a huge unsigned value in the comparisons.
    const size_t dim_limit = static_cast<size_t>(std::max(max_blocks_per_dim, 1));

    // Overflow-free ceil-div; never fewer than one block because a grid
    // dimension of 0 is not a valid launch configuration.
    const size_t total_blocks = std::max<size_t>(
        ceil_div(n_elements, static_cast<size_t>(threads_per_block)), 1);

    dim3 threads(static_cast<unsigned int>(threads_per_block));
    dim3 blocks;  // dim3 defaults every component to 1

    if (total_blocks <= dim_limit) {
        // Simple 1D grid
        blocks.x = static_cast<unsigned int>(total_blocks);
    } else {
        // Use 2D grid
        blocks.x = static_cast<unsigned int>(dim_limit);

        // Decide in size_t before narrowing, so an oversized row count cannot
        // be truncated into a value that looks like it fits.
        const size_t rows = ceil_div(total_blocks, dim_limit);
        if (rows <= dim_limit) {
            blocks.y = static_cast<unsigned int>(rows);
        } else {
            // If still too big, use 3D grid
            blocks.y = static_cast<unsigned int>(dim_limit);
            // NOTE(review): blocks.z is not checked against the limit; a
            // request needing more than limit^3 blocks is passed through
            // as-is, exactly as before.
            blocks.z = static_cast<unsigned int>(ceil_div(total_blocks, dim_limit * dim_limit));
        }
    }

    return KernelConfig(blocks, threads);
}

// Returns the preferred block size for `device_id`, derived from the device's
// maxThreadsPerBlock. Falls back to 256 when the device properties cannot be
// queried.
int get_optimal_block_size(int device_id) {
    cudaDeviceProp prop;
    if (!query_device_properties(device_id, &prop)) {
        return kFallbackBlockSize;
    }

    // Use a fraction of max threads per block for better occupancy
    // Typically 256 or 512 work well for most kernels
    return block_size_for_max_threads(prop.maxThreadsPerBlock);
}

// Builds a launch configuration for `n_elements` using limits queried from
// `device_id`: the block size comes from the device's maxThreadsPerBlock and
// the per-dimension grid limit from maxGridSize[0]. Falls back to a block size
// of 256 and a limit of 65535 when the device properties cannot be queried.
KernelConfig get_launch_config_advanced(size_t n_elements, int device_id) {
    cudaDeviceProp prop;
    if (!query_device_properties(device_id, &prop)) {
        return get_launch_config(n_elements, kFallbackBlockSize, kFallbackMaxBlocksPerDim);
    }

    // Derive the block size from the properties already fetched instead of
    // querying the device a second time.
    const int threads_per_block = block_size_for_max_threads(prop.maxThreadsPerBlock);

    // NOTE(review): the x-dimension limit is applied to y and z as well. If
    // maxGridSize[1] or maxGridSize[2] is smaller on the target device, a grid
    // that spills past 1D could exceed it -- confirm whether the minimum of
    // the three limits should be used instead.
    const int max_blocks_per_dim = prop.maxGridSize[0];

    return get_launch_config(n_elements, threads_per_block, max_blocks_per_dim);
}