// cudaCAC/kernels/kernel_config.cuh

#ifndef KERNEL_CONFIG_CUH
#define KERNEL_CONFIG_CUH
#include <cstdio>
#include <cuda_runtime.h>

/**
 * Launch configuration for a kernel: one dim3 for the blocks and one dim3
 * for the threads.
 *
 * Only the constructor is defined inline; total_threads() and print() are
 * declared here and defined elsewhere.
 */
struct KernelConfig {
  dim3 blocks;   // presumably the grid dimensions (blocks per grid) - confirm
  dim3 threads;  // presumably the block dimensions (threads per block) - confirm

  // Build a configuration from an explicit blocks/threads pair.
  KernelConfig(dim3 block_count, dim3 thread_count)
      : blocks(block_count), threads(thread_count) {}

  // Total number of threads launched.
  size_t total_threads() const;

  // Print the configuration (debugging aid).
  void print() const;
};

/**
 * Calculate optimal CUDA launch configuration for 1D problem
 *
 * NOTE(review): get_thread_id() linearizes a 3D grid, so the definition
 * presumably spreads blocks over the y/z grid dimensions once more than
 * max_blocks_per_dim blocks are needed - confirm against the definition.
 *
 * @param n_elements Number of elements to process
 * @param threads_per_block Desired threads per block (default: 256)
 * @param max_blocks_per_dim Maximum blocks per grid dimension (default: 65535)
 * @return KernelConfig with the grid (blocks) and block (threads) dimensions
 */
KernelConfig get_launch_config(size_t n_elements, int threads_per_block = 256,
                               int max_blocks_per_dim = 65535);

/**
 * Flatten the block index (x, y and z) and the x thread index into a single
 * linear index. Meant for use inside kernels launched with
 * get_launch_config().
 *
 * Only blockDim.x and threadIdx.x take part in the computation; the y and z
 * components of the thread index are not used.
 *
 * @return ((blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x)
 *         * blockDim.x + threadIdx.x, evaluated in size_t
 */
__device__ inline size_t get_thread_id() {
  // Position of this block within the grid, x fastest. The cast on the
  // first operand widens the whole chain to size_t so none of the products
  // are computed in 32-bit arithmetic.
  const size_t block_rank =
      ((size_t)blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;

  return block_rank * blockDim.x + threadIdx.x;
}

/**
 * Overload of get_thread_id() that computes the same linear index from
 * explicitly supplied dimensions and indices.
 *
 * The parameters deliberately carry the same names as the CUDA built-in
 * variables, so inside this function those names refer to the arguments,
 * not to the built-ins.
 *
 * @param gridDim   Grid dimensions (x and y are used)
 * @param blockDim  Block dimensions (only x is used)
 * @param blockIdx  Block index (x, y and z are used)
 * @param threadIdx Thread index (only x is used)
 * @return ((blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x)
 *         * blockDim.x + threadIdx.x, evaluated in size_t
 */
__device__ inline size_t get_thread_id(dim3 gridDim, dim3 blockDim,
                                       dim3 blockIdx, dim3 threadIdx) {
  // Position of the block within the grid, x fastest; widened to size_t
  // from the first operand onward.
  const size_t block_rank =
      ((size_t)blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;

  return block_rank * blockDim.x + threadIdx.x;
}

/**
 * GPU device properties helper - gets a block size for the device selected
 * by device_id.
 *
 * NOTE(review): how "optimal" is determined lives in the definition, which
 * is not visible here - confirm which device properties are consulted.
 *
 * @param device_id Device to query (default: 0)
 * @return Block size as an int (presumably threads per block - confirm)
 */
int get_optimal_block_size(int device_id = 0);

/**
 * Advanced version that considers device properties
 *
 * NOTE(review): presumably the device-aware counterpart of
 * get_launch_config(), with the block size and grid limits taken from the
 * device rather than passed in - confirm against the definition.
 *
 * @param n_elements Number of elements to process
 * @param device_id Device whose properties are considered (default: 0)
 * @return KernelConfig for the launch
 */
KernelConfig get_launch_config_advanced(size_t n_elements, int device_id = 0);

// Example usage in your kernel:
/*
template <typename PotentialType>
__global__ void calc_forces_and_energies(float4 *pos, float4 *force_energies,
                                         size_t n_particles, real *box_len,
                                         PotentialType potential) {

    size_t i = get_thread_id();

    if (i < n_particles) {
        // Your existing force calculation code here...
        float4 my_pos = pos[i];
        // ... rest of kernel unchanged
    }
}

*/

#endif
Add a kernel_config to calculate blocks and threads for launching kernels 2025-09-12 22:47:21 -04:00			`#ifndef KERNEL_CONFIG_CUH`
			`#define KERNEL_CONFIG_CUH`
			`#include <cstdio>`
			`#include <cuda_runtime.h>`

			`/**`
			`* Structure to hold grid launch configuration`
			`*/`
			`struct KernelConfig {`
			`dim3 blocks;`
			`dim3 threads;`

			`// Convenience constructor`
			`KernelConfig(dim3 b, dim3 t) : blocks(b), threads(t) {}`

			`// Total number of threads launched`
			`size_t total_threads() const;`

			`// Print configuration for debugging`
			`void print() const;`
			`};`

			`/**`
			`* Calculate optimal CUDA launch configuration for 1D problem`
			`*`
			`* @param n_elements Number of elements to process`
			`* @param threads_per_block Desired threads per block (default: 256)`
			`* @param max_blocks_per_dim Maximum blocks per grid dimension (default: 65535)`
			`* @return LaunchConfig with optimal grid and block dimensions`
			`*/`
			`KernelConfig get_launch_config(size_t n_elements, int threads_per_block = 256,`
			`int max_blocks_per_dim = 65535);`

			`/**`
			`* Calculate 1D thread index for kernels launched with get_launch_config()`
			`* Use this inside your CUDA kernels`
			`*/`
			`__device__ inline size_t get_thread_id() {`
			`return (size_t)blockIdx.z * gridDim.x * gridDim.y * blockDim.x +`
			`(size_t)blockIdx.y * gridDim.x * blockDim.x +`
			`(size_t)blockIdx.x * blockDim.x + threadIdx.x;`
			`}`

			`/**`
			`* Alternative version that takes grid dimensions as parameters`
			`* Useful if you need the index calculation in multiple places`
			`*/`
			`__device__ inline size_t get_thread_id(dim3 gridDim, dim3 blockDim,`
			`dim3 blockIdx, dim3 threadIdx) {`
			`return (size_t)blockIdx.z * gridDim.x * gridDim.y * blockDim.x +`
			`(size_t)blockIdx.y * gridDim.x * blockDim.x +`
			`(size_t)blockIdx.x * blockDim.x + threadIdx.x;`
			`}`

			`/**`
			`* GPU device properties helper - gets optimal block size for current device`
			`*/`
			`int get_optimal_block_size(int device_id = 0);`

			`/**`
			`* Advanced version that considers device properties`
			`*/`
			`KernelConfig get_launch_config_advanced(size_t n_elements, int device_id = 0);`

			`// Example usage in your kernel:`
			`/*`
			`template <typename PotentialType>`
			`__global__ void calc_forces_and_energies(float4 pos, float4 force_energies,`
			`size_t n_particles, real *box_len,`
			`PotentialType potential) {`

			`size_t i = get_thread_id();`

			`if (i < n_particles) {`
			`// Your existing force calculation code here...`
			`float4 my_pos = pos[i];`
			`// ... rest of kernel unchanged`
			`}`
			`}`

			`*/`

			`#endif`