Add a kernel_config to calculate blocks and threads for launching kernels
Some checks failed
Build and Test / build-and-test (push) Failing after 5m3s
Some checks failed
Build and Test / build-and-test (push) Failing after 5m3s
This commit is contained in:
parent
130b613a7c
commit
8ba5714648
5 changed files with 169 additions and 9 deletions
73
kernels/kernel_config.cu
Normal file
73
kernels/kernel_config.cu
Normal file
|
@ -0,0 +1,73 @@
|
|||
#include "kernel_config.cuh"
|
||||
#include <algorithm>
|
||||
#include <cstdio>
|
||||
|
||||
// Returns the number of threads this configuration launches in total: the
// product of the three grid extents (blocks) and the three block extents
// (threads). The arithmetic is carried out in size_t.
size_t KernelConfig::total_threads() const {
  const size_t blocks_in_grid = (size_t)blocks.x * blocks.y * blocks.z;
  const size_t threads_in_block = (size_t)threads.x * threads.y * threads.z;
  return blocks_in_grid * threads_in_block;
}
|
||||
|
||||
// Prints a one-line summary of this configuration to stdout: the grid extents,
// the block extents, and the total thread count from total_threads().
void KernelConfig::print() const {
  const size_t thread_count = total_threads();
  printf("Grid: (%u, %u, %u), Block: (%u, %u, %u), Total threads: %zu\n",
         blocks.x, blocks.y, blocks.z, threads.x, threads.y, threads.z,
         thread_count);
}
|
||||
|
||||
// Ceiling division for size_t that cannot overflow (unlike (a + b - 1) / b).
// `divisor` must be non-zero.
static size_t launch_config_ceil_div(size_t value, size_t divisor) {
  return value / divisor + (value % divisor != 0 ? 1 : 0);
}

// Builds a launch configuration with a 1D thread block that covers at least
// `n_elements` threads.
//
// Parameters:
//   n_elements         - number of work items to cover. Zero still yields one
//                        block, because a grid extent of 0 is not a valid
//                        launch configuration.
//   threads_per_block  - requested block size; clamped to the range [32, 1024].
//   max_blocks_per_dim - largest extent allowed for blocks.x and blocks.y.
//                        Values <= 0 are treated as 1 (they would otherwise
//                        divide by zero or wrap in unsigned comparisons).
//
// Returns a KernelConfig whose grid is 1D when the required block count fits
// in one dimension, otherwise 2D, otherwise 3D. In the 2D/3D cases the grid
// may contain more blocks than strictly required, so it can launch more
// threads than n_elements.
//
// NOTE: blocks.z is not checked against max_blocks_per_dim; for extremely
// large n_elements it can exceed that limit.
KernelConfig get_launch_config(size_t n_elements, int threads_per_block,
                               int max_blocks_per_dim) {
  // Ensure threads_per_block is valid.
  threads_per_block = std::min(threads_per_block, 1024);
  threads_per_block = std::max(threads_per_block, 32);

  // Do all block-count arithmetic in size_t so that no comparison mixes
  // signed and unsigned operands and nothing is truncated before it is
  // compared against the limit.
  const size_t block_size = (size_t)threads_per_block;
  const size_t dim_limit = (size_t)std::max(max_blocks_per_dim, 1);

  // Total blocks needed, never less than one.
  const size_t total_blocks = std::max<size_t>(
      launch_config_ceil_div(n_elements, block_size), (size_t)1);

  dim3 threads(threads_per_block);
  dim3 blocks;  // dim3 default-constructs to (1, 1, 1)

  if (total_blocks <= dim_limit) {
    // Simple 1D grid.
    blocks.x = (unsigned int)total_blocks;
  } else {
    blocks.x = (unsigned int)dim_limit;
    const size_t rows = launch_config_ceil_div(total_blocks, dim_limit);

    if (rows <= dim_limit) {
      // 2D grid: full-width rows.
      blocks.y = (unsigned int)rows;
    } else {
      // Still too big: use a 3D grid of full x-y planes.
      const size_t blocks_per_plane = dim_limit * dim_limit;
      blocks.y = (unsigned int)dim_limit;
      blocks.z =
          (unsigned int)launch_config_ceil_div(total_blocks, blocks_per_plane);
    }
  }

  return KernelConfig(blocks, threads);
}
|
||||
|
||||
// Picks a threads-per-block value for the given device.
//
// Parameters:
//   device_id - CUDA device ordinal passed to cudaGetDeviceProperties.
//
// Returns 256 when the device allows at least 512 threads per block,
// otherwise half of the device's maxThreadsPerBlock. If the device properties
// cannot be queried, a warning is written to stderr and 256 is returned
// instead of reading an uninitialized cudaDeviceProp.
int get_optimal_block_size(int device_id) {
  const int kDefaultBlockSize = 256;

  cudaDeviceProp prop;
  const cudaError_t status = cudaGetDeviceProperties(&prop, device_id);
  if (status != cudaSuccess) {
    fprintf(stderr,
            "get_optimal_block_size: cudaGetDeviceProperties(%d) failed: %s; "
            "falling back to %d threads per block\n",
            device_id, cudaGetErrorString(status), kDefaultBlockSize);
    return kDefaultBlockSize;
  }

  // Use a fraction of max threads per block rather than the maximum;
  // 256 is used whenever the device supports at least 512.
  if (prop.maxThreadsPerBlock >= 512) {
    return kDefaultBlockSize;
  }
  return prop.maxThreadsPerBlock / 2;
}
|
||||
|
||||
// Builds a launch configuration for `n_elements` work items using limits
// queried from the given device.
//
// Parameters:
//   n_elements - number of work items to cover.
//   device_id  - CUDA device ordinal passed to cudaGetDeviceProperties.
//
// The block size comes from get_optimal_block_size(device_id) and the
// per-dimension block limit from the device's maxGridSize[0]. If the device
// properties cannot be queried, a warning is written to stderr and
// conservative defaults (256 threads per block, 65535 blocks per dimension)
// are used instead of reading an uninitialized cudaDeviceProp.
//
// NOTE(review): maxGridSize[0] is the x-dimension limit; get_launch_config
// applies the same limit to blocks.y — confirm this is intended for grids
// that spill into a second dimension.
KernelConfig get_launch_config_advanced(size_t n_elements, int device_id) {
  cudaDeviceProp prop;
  const cudaError_t status = cudaGetDeviceProperties(&prop, device_id);
  if (status != cudaSuccess) {
    const int kFallbackThreadsPerBlock = 256;
    const int kFallbackMaxBlocksPerDim = 65535;
    fprintf(stderr,
            "get_launch_config_advanced: cudaGetDeviceProperties(%d) failed: "
            "%s; using default launch limits\n",
            device_id, cudaGetErrorString(status));
    return get_launch_config(n_elements, kFallbackThreadsPerBlock,
                             kFallbackMaxBlocksPerDim);
  }

  const int threads_per_block = get_optimal_block_size(device_id);
  const int max_blocks_per_dim = prop.maxGridSize[0];

  return get_launch_config(n_elements, threads_per_block, max_blocks_per_dim);
}
|
Loading…
Add table
Add a link
Reference in a new issue