Add a kernel_config to calculate blocks and threads for launching kernels

2025-09-12 22:47:21 -04:00 · 2025-09-12 22:47:21 -04:00 · 8ba5714648
commit 8ba5714648
parent 130b613a7c
5 changed files with 169 additions and 9 deletions
--- a/kernels/kernel_config.cu
+++ b/kernels/kernel_config.cu
@ -0,0 +1,73 @@
+#include "kernel_config.cuh"
+#include <algorithm>
+#include <cstdio>
+
+// Returns the number of threads described by this configuration: the product
+// of the x/y/z components of `blocks` and the x/y/z components of `threads`.
+// The running product is held in a size_t from the first factor onwards, so
+// every multiplication is carried out at size_t width.
+size_t KernelConfig::total_threads() const {
+  size_t count = (size_t)blocks.x;
+  count *= blocks.y;
+  count *= blocks.z;
+  count *= threads.x;
+  count *= threads.y;
+  count *= threads.z;
+  return count;
+}
+
+// Writes a one-line summary of this configuration to stdout: the three
+// components of `blocks` (labelled "Grid"), the three components of `threads`
+// (labelled "Block"), and the value returned by total_threads().
+void KernelConfig::print() const {
+  const size_t thread_total = total_threads();
+  printf("Grid: (%u, %u, %u), Block: (%u, %u, %u), Total threads: %zu\n",
+         blocks.x, blocks.y, blocks.z, threads.x, threads.y, threads.z,
+         thread_total);
+}
+
+// Builds a launch configuration with enough threads to cover `n_elements`.
+//
+// Parameters:
+//   n_elements         - number of work items the launch must cover.
+//   threads_per_block  - requested 1D block size; clamped to [32, 1024].
+//   max_blocks_per_dim - largest block count allowed in any single grid
+//                        dimension; values below 1 are treated as 1.
+//
+// Returns a KernelConfig whose block is 1D and whose grid is 1D when the
+// required block count fits in one dimension, otherwise 2D, otherwise 3D.
+// The grid always holds at least one block, so n_elements == 0 still yields a
+// launchable configuration instead of a zero-sized grid.
+//
+// The 2D/3D grids are rounded up to whole rows/planes, so they may contain
+// more blocks than strictly needed; kernels launched with the result must
+// flatten their block index and bounds-check against n_elements.
+//
+// If even the 3D grid would need more than max_blocks_per_dim planes, the
+// computed z extent is returned as-is (it exceeds the limit); such a request
+// cannot be covered by a single launch with these limits.
+KernelConfig get_launch_config(size_t n_elements, int threads_per_block,
+                               int max_blocks_per_dim) {
+
+  // Ensure threads_per_block is valid
+  threads_per_block = std::min(threads_per_block, 1024);
+  threads_per_block = std::max(threads_per_block, 32);
+
+  // Do all grid arithmetic in size_t. A non-positive limit would otherwise
+  // divide by zero below (or be compared as a huge unsigned value).
+  const size_t block_size = (size_t)threads_per_block;
+  const size_t dim_limit = (size_t)std::max(max_blocks_per_dim, 1);
+
+  // Ceil-divide without forming n_elements + block_size - 1, which could wrap
+  // for n_elements close to SIZE_MAX.
+  size_t total_blocks =
+      n_elements / block_size + (n_elements % block_size != 0 ? 1 : 0);
+
+  // A grid dimension of 0 is not a valid launch configuration.
+  total_blocks = std::max(total_blocks, (size_t)1);
+
+  dim3 threads(threads_per_block);
+  dim3 blocks;
+
+  if (total_blocks <= dim_limit) {
+    // Simple 1D grid
+    blocks = dim3((unsigned int)total_blocks);
+  } else {
+    // Use 2D grid: full-width rows, as many rows as needed
+    const size_t rows = (total_blocks + dim_limit - 1) / dim_limit;
+    blocks.x = (unsigned int)dim_limit;
+
+    if (rows <= dim_limit) {
+      blocks.y = (unsigned int)rows;
+    } else {
+      // If still too big, use 3D grid: full x-y planes, as many as needed
+      const size_t plane = dim_limit * dim_limit;
+      blocks.y = (unsigned int)dim_limit;
+      blocks.z = (unsigned int)((total_blocks + plane - 1) / plane);
+    }
+  }
+
+  return KernelConfig(blocks, threads);
+}
+
+// Suggests a threads-per-block value for device `device_id`.
+//
+// Returns 256 when the device reports maxThreadsPerBlock >= 512, otherwise
+// half of the reported maxThreadsPerBlock. If the device properties cannot be
+// queried (for example, an invalid device_id or no usable CUDA device), a
+// warning is written to stderr and 256 is returned rather than reading an
+// uninitialized cudaDeviceProp.
+int get_optimal_block_size(int device_id) {
+  const int default_block_size = 256;
+
+  cudaDeviceProp prop;
+  const cudaError_t status = cudaGetDeviceProperties(&prop, device_id);
+  if (status != cudaSuccess) {
+    fprintf(stderr,
+            "get_optimal_block_size: cudaGetDeviceProperties(%d) failed: %s; "
+            "using %d threads per block\n",
+            device_id, cudaGetErrorString(status), default_block_size);
+    return default_block_size;
+  }
+
+  // Use a fraction of max threads per block for better occupancy
+  // Typically 256 or 512 work well for most kernels
+  if (prop.maxThreadsPerBlock >= 512) {
+    return default_block_size; // Good balance of occupancy and register usage
+  }
+  return prop.maxThreadsPerBlock / 2;
+}
+
+// Builds a launch configuration for `n_elements` work items using limits
+// queried from device `device_id`.
+//
+// The block size comes from get_optimal_block_size(device_id) and the
+// per-dimension block limit from the device's maxGridSize[0]; both are passed
+// to get_launch_config(). If the device properties cannot be queried, a
+// warning is written to stderr and a limit of 65535 blocks per dimension is
+// used instead of reading an uninitialized cudaDeviceProp.
+//
+// NOTE(review): maxGridSize[0] is the x-dimension limit, but
+// get_launch_config() applies the same limit to y and z. Devices usually
+// report smaller y/z limits, so the 2D/3D fallback could in principle exceed
+// them for extremely large n_elements - confirm whether that matters here.
+KernelConfig get_launch_config_advanced(size_t n_elements, int device_id) {
+  // Conservative fallback: 65535 does not exceed the grid-dimension limit of
+  // any CUDA compute capability.
+  int max_blocks_per_dim = 65535;
+
+  cudaDeviceProp prop;
+  const cudaError_t status = cudaGetDeviceProperties(&prop, device_id);
+  if (status == cudaSuccess) {
+    max_blocks_per_dim = prop.maxGridSize[0];
+  } else {
+    fprintf(stderr,
+            "get_launch_config_advanced: cudaGetDeviceProperties(%d) failed: "
+            "%s; using %d blocks per grid dimension\n",
+            device_id, cudaGetErrorString(status), max_blocks_per_dim);
+  }
+
+  const int threads_per_block = get_optimal_block_size(device_id);
+
+  return get_launch_config(n_elements, threads_per_block, max_blocks_per_dim);
+}