74 lines
2.2 KiB
Text
74 lines
2.2 KiB
Text
|
#include "kernel_config.cuh"
|
||
|
#include <algorithm>
|
||
|
#include <cstdio>
|
||
|
|
||
|
// Returns the total number of threads this configuration launches: the
// product of all three grid dimensions and all three block dimensions.
// The arithmetic is carried out in size_t so the 32-bit dim3 members do not
// overflow while being multiplied together.
size_t KernelConfig::total_threads() const {
    const size_t blocks_in_grid = static_cast<size_t>(blocks.x) * blocks.y * blocks.z;
    const size_t threads_in_block = static_cast<size_t>(threads.x) * threads.y * threads.z;
    return blocks_in_grid * threads_in_block;
}
|
||
|
|
||
|
// Writes a one-line summary of the configuration to stdout: the grid
// dimensions, the block dimensions and the resulting total thread count.
void KernelConfig::print() const {
    const size_t thread_total = total_threads();
    printf("Grid: (%u, %u, %u), Block: (%u, %u, %u), Total threads: %zu\n",
           blocks.x, blocks.y, blocks.z,
           threads.x, threads.y, threads.z,
           thread_total);
}
|
||
|
|
||
|
// Builds a launch configuration with at least one thread per element.
//
// Parameters:
//   n_elements          number of work items to cover.
//   threads_per_block   requested 1D block size; clamped into [32, 1024].
//   max_blocks_per_dim  largest block count allowed in any single grid
//                       dimension; values below 1 are treated as 1.
//
// Returns a KernelConfig whose block is 1D and whose grid is 1D when the
// required block count fits into one dimension, otherwise 2D, otherwise 3D.
// The grid always contains at least one block, so the result is launchable
// even for n_elements == 0.
//
// NOTE: in the 2D/3D cases the grid is rounded up and may contain more
// blocks than strictly required, so kernels launched with it need to build
// their element index from every grid dimension in use and bounds-check it
// against n_elements.
// NOTE(review): if even the 3D grid is too small, blocks.z exceeds
// max_blocks_per_dim; this is not reported to the caller.
KernelConfig get_launch_config(size_t n_elements, int threads_per_block,
                               int max_blocks_per_dim) {
    // Ensure threads_per_block is valid.
    threads_per_block = std::max(std::min(threads_per_block, 1024), 32);

    // A limit of 0 would divide by zero below, and a negative limit would
    // wrap to a huge value once compared against an unsigned block count.
    max_blocks_per_dim = std::max(max_blocks_per_dim, 1);

    const size_t block_size = static_cast<size_t>(threads_per_block);
    const size_t dim_limit = static_cast<size_t>(max_blocks_per_dim);

    // Ceil-divide without forming n_elements + block_size - 1, which could
    // wrap around for n_elements close to SIZE_MAX.
    size_t total_blocks =
        n_elements / block_size + (n_elements % block_size != 0 ? 1 : 0);

    // A grid dimension of 0 is not a valid launch configuration, so an empty
    // input still gets a single block.
    total_blocks = std::max<size_t>(total_blocks, 1);

    dim3 threads(threads_per_block);
    dim3 blocks;  // dim3 components default to 1

    if (total_blocks <= dim_limit) {
        // Simple 1D grid.
        blocks.x = static_cast<unsigned int>(total_blocks);
    } else {
        // Use a 2D grid. The row count is kept in size_t until it has been
        // checked against the limit, so narrowing cannot hide an overflow.
        blocks.x = static_cast<unsigned int>(dim_limit);
        const size_t rows = (total_blocks - 1) / dim_limit + 1;

        if (rows <= dim_limit) {
            blocks.y = static_cast<unsigned int>(rows);
        } else {
            // Still too big: use a 3D grid with full x/y planes.
            blocks.y = static_cast<unsigned int>(dim_limit);
            const size_t blocks_per_plane = dim_limit * dim_limit;
            blocks.z = static_cast<unsigned int>(
                (total_blocks - 1) / blocks_per_plane + 1);
        }
    }

    return KernelConfig(blocks, threads);
}
|
||
|
|
||
|
// Chooses a default threads-per-block value for the given device.
//
// Parameters:
//   device_id  CUDA device ordinal to query.
//
// Returns 256 when the device supports at least 512 threads per block,
// otherwise half of the device's per-block maximum. If the device properties
// cannot be queried, a diagnostic is written to stderr and 256 is returned.
int get_optimal_block_size(int device_id) {
    // Good balance of occupancy and register usage for most kernels.
    const int default_block_size = 256;

    cudaDeviceProp prop;
    const cudaError_t status = cudaGetDeviceProperties(&prop, device_id);
    if (status != cudaSuccess) {
        // prop is not filled in after a failed query, so it must not be read.
        fprintf(stderr,
                "get_optimal_block_size: cudaGetDeviceProperties(%d) failed: %s\n",
                device_id, cudaGetErrorString(status));
        return default_block_size;
    }

    // Use a fraction of max threads per block for better occupancy.
    if (prop.maxThreadsPerBlock >= 512) {
        return default_block_size;
    }
    return prop.maxThreadsPerBlock / 2;
}
|
||
|
|
||
|
// Builds a launch configuration for n_elements using limits queried from the
// device itself.
//
// Parameters:
//   n_elements  number of work items to cover.
//   device_id   CUDA device ordinal to query.
//
// The block size comes from get_optimal_block_size(device_id) and the
// per-dimension block limit from the device's maxGridSize[0]; both are then
// forwarded to get_launch_config. If the device properties cannot be queried,
// a diagnostic is written to stderr and a limit of 65535 blocks per dimension
// is used instead.
KernelConfig get_launch_config_advanced(size_t n_elements, int device_id) {
    // 65535 is within the grid-size limit of every dimension on CUDA devices,
    // which makes it a safe stand-in when the real limit is unknown.
    int max_blocks_per_dim = 65535;

    cudaDeviceProp prop;
    const cudaError_t status = cudaGetDeviceProperties(&prop, device_id);
    if (status == cudaSuccess) {
        max_blocks_per_dim = prop.maxGridSize[0];
    } else {
        // prop is not filled in after a failed query, so it must not be read.
        fprintf(stderr,
                "get_launch_config_advanced: cudaGetDeviceProperties(%d) failed: %s\n",
                device_id, cudaGetErrorString(status));
    }

    const int threads_per_block = get_optimal_block_size(device_id);

    return get_launch_config(n_elements, threads_per_block, max_blocks_per_dim);
}
|