// cudaCAC/kernels/kernel_config.cu

#include "kernel_config.cuh"
#include <algorithm>
#include <cstdio>
// Returns the total number of threads described by this configuration:
// the product of the three grid dimensions and the three block dimensions.
size_t KernelConfig::total_threads() const {
  // Widen the first factor of each product to size_t so the whole
  // multiplication is carried out in size_t.
  const size_t blocks_in_grid =
      static_cast<size_t>(blocks.x) * blocks.y * blocks.z;
  const size_t threads_in_block =
      static_cast<size_t>(threads.x) * threads.y * threads.z;
  return blocks_in_grid * threads_in_block;
}
// Writes a one-line summary of the grid dimensions, block dimensions and
// total thread count to stdout.
void KernelConfig::print() const {
  const size_t thread_total = total_threads();
  printf("Grid: (%u, %u, %u), Block: (%u, %u, %u), Total threads: %zu\n",
         blocks.x, blocks.y, blocks.z,
         threads.x, threads.y, threads.z,
         thread_total);
}
// Builds a launch configuration with enough threads to cover `n_elements`,
// one thread per element.
//
// Parameters:
//   n_elements          Number of elements the kernel must cover.
//   threads_per_block   Requested 1D block size; clamped into [32, 1024].
//   max_blocks_per_dim  Upper bound applied to every grid dimension; a value
//                       of 0 is treated as 1.
//
// Returns a KernelConfig with a 1D block and a 1D grid when the block count
// fits within `max_blocks_per_dim`, otherwise a 2D grid, otherwise a 3D grid.
// The 2D/3D grids may contain more blocks than strictly required, so kernels
// launched with them must bounds-check their flattened index.
//
// At least one block is always returned (also for n_elements == 0), because
// a grid with a zero dimension is not a valid launch configuration.
//
// NOTE(review): if even the 3D grid needs more than `max_blocks_per_dim`
// blocks in z, the returned z dimension exceeds the limit; the caller is
// expected to detect that (the launch itself will fail).
KernelConfig get_launch_config(size_t n_elements, size_t threads_per_block,
                               size_t max_blocks_per_dim) {
  // Ensure threads_per_block is valid.
  threads_per_block = std::min(threads_per_block, (size_t)1024);
  threads_per_block = std::max(threads_per_block, (size_t)32);

  // A zero limit would make the divisions below divide by zero.
  max_blocks_per_dim = std::max(max_blocks_per_dim, (size_t)1);

  // Ceiling division written as quotient + remainder test so that it cannot
  // wrap around for n_elements close to SIZE_MAX.
  size_t total_blocks = n_elements / threads_per_block +
                        (n_elements % threads_per_block != 0 ? 1 : 0);
  total_blocks = std::max(total_blocks, (size_t)1);

  dim3 threads(static_cast<unsigned int>(threads_per_block));
  dim3 blocks;  // dim3 default-constructs to (1, 1, 1)

  if (total_blocks <= max_blocks_per_dim) {
    // Simple 1D grid.
    blocks.x = static_cast<unsigned int>(total_blocks);
  } else {
    // Use a 2D grid: x is saturated, y holds the number of x-rows needed.
    blocks.x = static_cast<unsigned int>(max_blocks_per_dim);
    const size_t rows = total_blocks / max_blocks_per_dim +
                        (total_blocks % max_blocks_per_dim != 0 ? 1 : 0);

    if (rows <= max_blocks_per_dim) {
      blocks.y = static_cast<unsigned int>(rows);
    } else {
      // Still too big: saturate y as well and spill into z. Dividing the row
      // count by the limit equals ceil(total_blocks / limit^2) but avoids
      // computing limit^2, which could overflow.
      blocks.y = static_cast<unsigned int>(max_blocks_per_dim);
      const size_t slices = rows / max_blocks_per_dim +
                            (rows % max_blocks_per_dim != 0 ? 1 : 0);
      blocks.z = static_cast<unsigned int>(slices);
    }
  }

  return KernelConfig(blocks, threads);
}
// Picks a block size for the given device.
//
// Parameters:
//   device_id  CUDA device ordinal to query.
//
// Returns 256 when the device supports at least 512 threads per block,
// otherwise half of the device's maximum threads per block. If the device
// properties cannot be queried, the failure is reported on stderr and 256 is
// returned as a fallback.
int get_optimal_block_size(int device_id) {
  // 256 threads per block is a common compromise between occupancy and
  // per-thread register budget.
  const int kPreferredBlockSize = 256;

  cudaDeviceProp prop;
  const cudaError_t status = cudaGetDeviceProperties(&prop, device_id);
  if (status != cudaSuccess) {
    // `prop` is not valid here; do not read it.
    fprintf(stderr,
            "get_optimal_block_size: cudaGetDeviceProperties(device %d) "
            "failed: %s\n",
            device_id, cudaGetErrorString(status));
    return kPreferredBlockSize;
  }

  // Use a fraction of the maximum block size rather than the maximum itself.
  if (prop.maxThreadsPerBlock >= 2 * kPreferredBlockSize) {
    return kPreferredBlockSize;
  }
  return prop.maxThreadsPerBlock / 2;
}
// Builds a launch configuration for `n_elements` using limits queried from
// the given device.
//
// Parameters:
//   n_elements  Number of elements the kernel must cover.
//   device_id   CUDA device ordinal to query.
//
// The block size comes from get_optimal_block_size(device_id) and the
// per-dimension block limit from the device's maxGridSize[0]. If the device
// properties cannot be queried, the failure is reported on stderr and
// conservative defaults (256 threads per block, 65535 blocks per dimension)
// are used instead.
//
// NOTE(review): maxGridSize[0] is the x-dimension limit, but
// get_launch_config applies the same bound to y and z; the y/z limits of a
// device may be smaller. Confirm this is acceptable for the element counts
// this project launches.
KernelConfig get_launch_config_advanced(size_t n_elements, int device_id) {
  cudaDeviceProp prop;
  const cudaError_t status = cudaGetDeviceProperties(&prop, device_id);
  if (status != cudaSuccess) {
    // `prop` is not valid here; fall back to fixed limits.
    fprintf(stderr,
            "get_launch_config_advanced: cudaGetDeviceProperties(device %d) "
            "failed: %s\n",
            device_id, cudaGetErrorString(status));
    return get_launch_config(n_elements, (size_t)256, (size_t)65535);
  }

  const size_t threads_per_block =
      static_cast<size_t>(get_optimal_block_size(device_id));
  const size_t max_blocks_per_dim = static_cast<size_t>(prop.maxGridSize[0]);
  return get_launch_config(n_elements, threads_per_block, max_blocks_per_dim);
}