Add a kernel_config to calculate blocks and threads for launching kernels

2025-09-12 22:47:21 -04:00 · 2025-09-12 22:47:21 -04:00 · 8ba5714648
commit 8ba5714648
parent 130b613a7c
5 changed files with 169 additions and 9 deletions
--- a/kernels/kernel_config.cuh
+++ b/kernels/kernel_config.cuh
@ -0,0 +1,83 @@
+#ifndef KERNEL_CONFIG_CUH
+#define KERNEL_CONFIG_CUH
+#include <cstdio>
+#include <cuda_runtime.h>
+
+/**
+ * Structure to hold grid launch configuration
+ */
+struct KernelConfig {
+  dim3 blocks;
+  dim3 threads;
+
+  // Convenience constructor
+  KernelConfig(dim3 b, dim3 t) : blocks(b), threads(t) {}
+
+  // Total number of threads launched
+  size_t total_threads() const;
+
+  // Print configuration for debugging
+  void print() const;
+};
+
+/**
+ * Calculate optimal CUDA launch configuration for 1D problem
+ *
+ * @param n_elements Number of elements to process
+ * @param threads_per_block Desired threads per block (default: 256)
+ * @param max_blocks_per_dim Maximum blocks per grid dimension (default: 65535)
+ * @return LaunchConfig with optimal grid and block dimensions
+ */
+KernelConfig get_launch_config(size_t n_elements, int threads_per_block = 256,
+                               int max_blocks_per_dim = 65535);
+
+/**
+ * Calculate 1D thread index for kernels launched with get_launch_config()
+ * Use this inside your CUDA kernels
+ */
+__device__ inline size_t get_thread_id() {
+  return (size_t)blockIdx.z * gridDim.x * gridDim.y * blockDim.x +
+         (size_t)blockIdx.y * gridDim.x * blockDim.x +
+         (size_t)blockIdx.x * blockDim.x + threadIdx.x;
+}
+
+/**
+ * Alternative version that takes grid dimensions as parameters
+ * Useful if you need the index calculation in multiple places
+ */
+__device__ inline size_t get_thread_id(dim3 gridDim, dim3 blockDim,
+                                       dim3 blockIdx, dim3 threadIdx) {
+  return (size_t)blockIdx.z * gridDim.x * gridDim.y * blockDim.x +
+         (size_t)blockIdx.y * gridDim.x * blockDim.x +
+         (size_t)blockIdx.x * blockDim.x + threadIdx.x;
+}
+
+/**
+ * GPU device properties helper - gets optimal block size for current device
+ */
+int get_optimal_block_size(int device_id = 0);
+
+/**
+ * Advanced version that considers device properties
+ */
+KernelConfig get_launch_config_advanced(size_t n_elements, int device_id = 0);
+
+// Example usage in your kernel:
+/*
+template <typename PotentialType>
+__global__ void calc_forces_and_energies(float4 *pos, float4 *force_energies,
+                                         size_t n_particles, real *box_len,
+                                         PotentialType potential) {
+
+    size_t i = get_thread_id();
+
+    if (i < n_particles) {
+        // Your existing force calculation code here...
+        float4 my_pos = pos[i];
+        // ... rest of kernel unchanged
+    }
+}
+
+*/
+
+#endif