Add a kernel_config to calculate blocks and threads for launching kernels
Checks failed: Build and Test / build-and-test (push), failing after 5m3s

Alex Selimov 2025-09-12 22:47:21 -04:00
parent 130b613a7c
commit 8ba5714648
Signed by: aselimov (GPG key ID: 3DDB9C3E023F1F31)
5 changed files with 169 additions and 9 deletions

kernels/CMakeLists.txt

@@ -3,12 +3,14 @@ project(${NAME}_cuda_lib CUDA CXX)
 set(HEADER_FILES
     potentials/pair_potentials.cuh
     forces.cuh
+    kernel_config.cuh
 )
 set(SOURCE_FILES
+    kernel_config.cu
 )
 
 # The library contains header and source files.
-add_library(${NAME}_cuda_lib INTERFACE
+add_library(${NAME}_cuda_lib STATIC
     ${SOURCE_FILES}
     ${HEADER_FILES}
 )
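
Context on the INTERFACE-to-STATIC switch: an INTERFACE library carries no compiled objects, so once kernel_config.cu introduces a translation unit the target must become STATIC (or SHARED) for CMake to actually compile and link it.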

kernels/forces.cuh

@@ -1,5 +1,6 @@
 #ifndef FORCES_CUH
 #define FORCES_CUH
+#include "kernel_config.cuh"
 #include "potentials/pair_potentials.cuh"
 #include "precision.hpp"
 #include <cstdio>
@@ -18,7 +19,7 @@ __global__ void calc_forces_and_energies(float4 *pos, float4 *force_energies,
                                          int n_particles, real *box_len,
                                          PotentialType potential) {
-  int i = blockIdx.x * blockDim.x + threadIdx.x;
+  int i = get_thread_id();
   if (i < n_particles) {
     float4 my_pos = pos[i]; // Loads 16 bytes in one transaction
@@ -54,7 +55,7 @@ __global__ void calc_forces_and_energies(float4 *pos, float4 *force_energies,
 inline void launch_force_kernels(float4 *xs, float4 *force_energies,
                                  int n_particles, real *box_len,
                                  std::vector<PairPotentials> potentials,
-                                 int grid_size, int block_size) {
+                                 dim3 blocks, dim3 threads_per_block) {
   reset_forces_and_energies(n_particles, force_energies);
@@ -62,8 +63,9 @@ inline void launch_force_kernels(float4 *xs, float4 *force_energies,
   std::visit(
       [&](const auto &potential) {
         using PotentialType = std::decay_t<decltype(potential)>;
-        calc_forces_and_energies<PotentialType><<<grid_size, block_size>>>(
-            xs, force_energies, n_particles, box_len, potential);
+        calc_forces_and_energies<PotentialType>
+            <<<blocks, threads_per_block>>>(xs, force_energies, n_particles,
+                                            box_len, potential);
       },
       potential);
   cudaDeviceSynchronize();
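
With the signature change, call sites now pass a pair of dim3 values instead of raw ints, and the natural source for them is a KernelConfig from get_launch_config(). A minimal sketch of the new call pattern (host-side sketch mirroring the updated unit test at the bottom of this commit; the d_* buffers are assumed to be already allocated on the device):

    KernelConfig config = get_launch_config(n_particles);
    CAC::launch_force_kernels(d_positions, d_force_energies, n_particles,
                              d_box_len, potentials, config.blocks,
                              config.threads);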

kernels/kernel_config.cu (new file, +73)

@@ -0,0 +1,73 @@
+#include "kernel_config.cuh"
+
+#include <algorithm>
+#include <cstdio>
+
+size_t KernelConfig::total_threads() const {
+  return (size_t)blocks.x * blocks.y * blocks.z * threads.x * threads.y *
+         threads.z;
+}
+
+void KernelConfig::print() const {
+  printf("Grid: (%u, %u, %u), Block: (%u, %u, %u), Total threads: %zu\n",
+         blocks.x, blocks.y, blocks.z, threads.x, threads.y, threads.z,
+         total_threads());
+}
+
+KernelConfig get_launch_config(size_t n_elements, int threads_per_block,
+                               int max_blocks_per_dim) {
+  // Clamp threads_per_block to the valid range [32, 1024]
+  threads_per_block = std::min(threads_per_block, 1024);
+  threads_per_block = std::max(threads_per_block, 32);
+
+  // Calculate total blocks needed (ceiling division)
+  size_t total_blocks =
+      (n_elements + threads_per_block - 1) / threads_per_block;
+
+  dim3 threads(threads_per_block);
+  dim3 blocks;
+
+  if (total_blocks <= (size_t)max_blocks_per_dim) {
+    // Simple 1D grid
+    blocks = dim3(total_blocks);
+  } else {
+    // Use 2D grid
+    blocks.x = max_blocks_per_dim;
+    blocks.y = (total_blocks + max_blocks_per_dim - 1) / max_blocks_per_dim;
+
+    // If still too big, use 3D grid
+    if (blocks.y > (unsigned)max_blocks_per_dim) {
+      blocks.y = max_blocks_per_dim;
+      blocks.z =
+          (total_blocks + (size_t)max_blocks_per_dim * max_blocks_per_dim -
+           1) /
+          ((size_t)max_blocks_per_dim * max_blocks_per_dim);
+    }
+  }
+
+  return KernelConfig(blocks, threads);
+}
+
+int get_optimal_block_size(int device_id) {
+  cudaDeviceProp prop;
+  cudaGetDeviceProperties(&prop, device_id);
+
+  // Use a fraction of max threads per block for better occupancy;
+  // 256 typically balances occupancy and register usage well
+  if (prop.maxThreadsPerBlock >= 512) {
+    return 256;
+  } else {
+    return prop.maxThreadsPerBlock / 2;
+  }
+}
+
+KernelConfig get_launch_config_advanced(size_t n_elements, int device_id) {
+  cudaDeviceProp prop;
+  cudaGetDeviceProperties(&prop, device_id);
+
+  int threads_per_block = get_optimal_block_size(device_id);
+  int max_blocks_per_dim = prop.maxGridSize[0];
+
+  return get_launch_config(n_elements, threads_per_block, max_blocks_per_dim);
+}
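
To make the grid-folding logic concrete, a worked example (illustrative numbers, not part of the commit): with the defaults of 256 threads per block and 65535 blocks per dimension, 100,000,000 elements need (100000000 + 255) / 256 = 390,625 blocks. That exceeds one grid dimension, so get_launch_config() folds into 2D:

    // Illustrative host-side check of the 2D fallback path.
    KernelConfig cfg = get_launch_config(100000000);
    cfg.print();
    // Prints: Grid: (65535, 6, 1), Block: (256, 1, 1), Total threads: 100661760

The roughly 660,000 surplus threads are harmless as long as the kernel guards with if (i < n_elements), as calc_forces_and_energies does.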

kernels/kernel_config.cuh (new file, +83)

@@ -0,0 +1,83 @@
+#ifndef KERNEL_CONFIG_CUH
+#define KERNEL_CONFIG_CUH
+
+#include <cstdio>
+#include <cuda_runtime.h>
+
+/**
+ * Structure to hold grid launch configuration
+ */
+struct KernelConfig {
+  dim3 blocks;
+  dim3 threads;
+
+  // Convenience constructor
+  KernelConfig(dim3 b, dim3 t) : blocks(b), threads(t) {}
+
+  // Total number of threads launched
+  size_t total_threads() const;
+
+  // Print configuration for debugging
+  void print() const;
+};
+
+/**
+ * Calculate optimal CUDA launch configuration for a 1D problem
+ *
+ * @param n_elements Number of elements to process
+ * @param threads_per_block Desired threads per block (default: 256)
+ * @param max_blocks_per_dim Maximum blocks per grid dimension (default: 65535)
+ * @return KernelConfig with optimal grid and block dimensions
+ */
+KernelConfig get_launch_config(size_t n_elements, int threads_per_block = 256,
+                               int max_blocks_per_dim = 65535);
+
+/**
+ * Calculate the flattened 1D thread index for kernels launched with
+ * get_launch_config(). Use this inside your CUDA kernels.
+ */
+__device__ inline size_t get_thread_id() {
+  return (size_t)blockIdx.z * gridDim.x * gridDim.y * blockDim.x +
+         (size_t)blockIdx.y * gridDim.x * blockDim.x +
+         (size_t)blockIdx.x * blockDim.x + threadIdx.x;
+}
+
+/**
+ * Alternative version that takes the launch dimensions as parameters
+ * Useful if you need the index calculation in multiple places
+ */
+__device__ inline size_t get_thread_id(dim3 gridDim, dim3 blockDim,
+                                       dim3 blockIdx, dim3 threadIdx) {
+  return (size_t)blockIdx.z * gridDim.x * gridDim.y * blockDim.x +
+         (size_t)blockIdx.y * gridDim.x * blockDim.x +
+         (size_t)blockIdx.x * blockDim.x + threadIdx.x;
+}
+
+/**
+ * GPU device properties helper - gets optimal block size for current device
+ */
+int get_optimal_block_size(int device_id = 0);
+
+/**
+ * Advanced version that considers device properties
+ */
+KernelConfig get_launch_config_advanced(size_t n_elements, int device_id = 0);
+
+// Example usage in your kernel:
+/*
+template <typename PotentialType>
+__global__ void calc_forces_and_energies(float4 *pos, float4 *force_energies,
+                                         size_t n_particles, real *box_len,
+                                         PotentialType potential) {
+  size_t i = get_thread_id();
+  if (i < n_particles) {
+    // Your existing force calculation code here...
+    float4 my_pos = pos[i];
+    // ... rest of kernel unchanged
+  }
+}
+*/
+
+#endif // KERNEL_CONFIG_CUH
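
A minimal end-to-end sketch of how the two new pieces compose (hypothetical kernel, data, and sizes, not from this commit):

    #include "kernel_config.cuh"
    #include <cuda_runtime.h>

    // Hypothetical kernel: scales a device array in place.
    __global__ void scale(float *data, size_t n, float factor) {
      size_t i = get_thread_id(); // flattened index matching get_launch_config()
      if (i < n) {
        data[i] *= factor;
      }
    }

    int main() {
      const size_t n = 1 << 20;
      float *d_data;
      cudaMalloc(&d_data, n * sizeof(float));

      KernelConfig cfg = get_launch_config(n); // defaults: 256 threads/block
      scale<<<cfg.blocks, cfg.threads>>>(d_data, n, 2.0f);
      cudaDeviceSynchronize();

      cudaFree(d_data);
      return 0;
    }

Note that get_thread_id() only flattens over threadIdx.x, so it assumes the 1D thread blocks that get_launch_config() produces.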

Test source for CudaForceKernelTest (path not shown in this view)

@@ -5,14 +5,12 @@
 // Include your header files
 #include "forces.cuh"
+#include "kernel_config.cuh"
 #include "potentials/pair_potentials.cuh"
 #include "precision.hpp"
 
 class CudaForceKernelTest : public ::testing::Test {
 protected:
-  const int GRID_SIZE = 1;
-  const int BLOCK_SIZE = 4;
 
   void SetUp() override {
     // Set up CUDA device
     cudaError_t err = cudaSetDevice(0);
@@ -61,13 +59,15 @@ protected:
   std::vector<float4> force_energies(n_particles,
                                      make_float4(0.0, 0.0, 0.0, 0.0));
+  KernelConfig kernel_config = get_launch_config(n_particles);
 
   float4 *d_positions = allocateAndCopyToGPU(positions);
   float4 *d_force_energies = allocateAndCopyToGPU(force_energies);
   real *d_box_len = allocateAndCopyToGPU(box_dimensions);
 
   std::vector<PairPotentials> potentials = {LennardJones(1.0, 1.0, 3.0)};
 
   CAC::launch_force_kernels(d_positions, d_force_energies, n_particles,
-                            d_box_len, potentials, GRID_SIZE, BLOCK_SIZE);
+                            d_box_len, potentials, kernel_config.blocks,
+                            kernel_config.threads);
 
   checkCudaError(cudaGetLastError(), "kernel launch");
   checkCudaError(cudaDeviceSynchronize(), "kernel execution");