// cudaCAC/kernels/kernel_config.cu

#include "kernel_config.cuh"
#include <algorithm>
#include <cstdio>

// Returns the total number of threads this configuration launches: the
// product of all three grid dimensions and all three block dimensions.
// The arithmetic is carried out in size_t so the 32-bit dim3 fields are
// widened before they are multiplied.
size_t KernelConfig::total_threads() const {
  const size_t blocks_in_grid =
      static_cast<size_t>(blocks.x) * blocks.y * blocks.z;
  const size_t threads_in_block =
      static_cast<size_t>(threads.x) * threads.y * threads.z;
  return blocks_in_grid * threads_in_block;
}

// Writes a one-line summary of this configuration to stdout: the grid
// dimensions, the block dimensions, and the total thread count.
void KernelConfig::print() const {
  const size_t thread_count = total_threads();
  printf("Grid: (%u, %u, %u), Block: (%u, %u, %u), Total threads: %zu\n",
         blocks.x, blocks.y, blocks.z, threads.x, threads.y, threads.z,
         thread_count);
}

// Builds a launch configuration (grid + 1D block) covering n_elements with
// one thread per element.
//
// Parameters:
//   n_elements          Number of elements the kernel must cover.
//   threads_per_block   Requested block size; clamped to [32, 1024].
//   max_blocks_per_dim  Upper bound applied to every grid dimension; values
//                       below 1 are treated as 1.
//
// Returns:
//   A KernelConfig whose grid is 1D when the required block count fits in
//   max_blocks_per_dim, otherwise 2D, otherwise 3D. The grid always has at
//   least one block, so n_elements == 0 still yields a launchable
//   configuration.
//
// Notes:
//   - 2D/3D grids are sized with ceiling division, so they may contain more
//     blocks than strictly required; kernels must flatten the block index and
//     bounds-check against n_elements.
//   - If even max_blocks_per_dim^3 blocks are not enough, the z dimension is
//     saturated at max_blocks_per_dim instead of being silently truncated; in
//     that case the grid does not cover every element with one thread each.
KernelConfig get_launch_config(size_t n_elements, int threads_per_block,
                               int max_blocks_per_dim) {

  // Ensure threads_per_block is valid
  threads_per_block = std::min(threads_per_block, 1024);
  threads_per_block = std::max(threads_per_block, 32);

  // A non-positive limit would divide by zero below (or turn into a huge
  // unsigned value in comparisons), so treat it as a limit of one block.
  const size_t max_dim = static_cast<size_t>(std::max(max_blocks_per_dim, 1));
  const size_t block_size = static_cast<size_t>(threads_per_block);

  // Ceiling division written so that it cannot wrap when n_elements is close
  // to SIZE_MAX. A grid with zero blocks is not launchable, so use at least 1.
  size_t total_blocks =
      n_elements / block_size + (n_elements % block_size != 0 ? 1 : 0);
  total_blocks = std::max<size_t>(total_blocks, 1);

  dim3 threads(threads_per_block);
  dim3 blocks(1, 1, 1);

  if (total_blocks <= max_dim) {
    // Simple 1D grid
    blocks.x = static_cast<unsigned int>(total_blocks);
  } else {
    // Use 2D grid. The row count is kept in size_t until it has been checked
    // against the limit, so it cannot be truncated before the comparison.
    blocks.x = static_cast<unsigned int>(max_dim);
    const size_t rows =
        total_blocks / max_dim + (total_blocks % max_dim != 0 ? 1 : 0);

    if (rows <= max_dim) {
      blocks.y = static_cast<unsigned int>(rows);
    } else {
      // Still too big: use 3D grid
      blocks.y = static_cast<unsigned int>(max_dim);
      const size_t blocks_per_layer = max_dim * max_dim;
      const size_t layers = total_blocks / blocks_per_layer +
                            (total_blocks % blocks_per_layer != 0 ? 1 : 0);
      // Saturate rather than let the narrowing conversion wrap around.
      blocks.z = static_cast<unsigned int>(std::min(layers, max_dim));
    }
  }

  return KernelConfig(blocks, threads);
}

// Picks a default threads-per-block value for the given device.
//
// Parameters:
//   device_id  CUDA device ordinal to query.
//
// Returns:
//   256 when the device supports at least 512 threads per block, otherwise
//   half of the device's maximum threads per block. If the device query
//   fails, a message is written to stderr and 256 is returned instead of
//   reading an uninitialized cudaDeviceProp.
int get_optimal_block_size(int device_id) {
  // Good balance of occupancy and register usage for most kernels.
  const int kDefaultBlockSize = 256;

  cudaDeviceProp prop;
  const cudaError_t err = cudaGetDeviceProperties(&prop, device_id);
  if (err != cudaSuccess) {
    fprintf(stderr,
            "get_optimal_block_size: cudaGetDeviceProperties(%d) failed: %s\n",
            device_id, cudaGetErrorString(err));
    return kDefaultBlockSize;
  }

  // Use a fraction of max threads per block for better occupancy.
  if (prop.maxThreadsPerBlock >= 512) {
    return kDefaultBlockSize;
  }
  return prop.maxThreadsPerBlock / 2;
}

// Builds a launch configuration for n_elements using limits queried from the
// given device.
//
// Parameters:
//   n_elements  Number of elements the kernel must cover.
//   device_id   CUDA device ordinal to query.
//
// Returns:
//   The result of get_launch_config() called with the block size chosen by
//   get_optimal_block_size() and the device's maxGridSize[0] as the per-
//   dimension block limit. If the device query fails, a message is written to
//   stderr and a limit of 65535 blocks per dimension is used instead of
//   reading an uninitialized cudaDeviceProp.
//
// NOTE(review): maxGridSize[0] is the x-dimension limit, but it is passed on
// as the bound for every grid dimension. The y/z limits in maxGridSize[1] and
// maxGridSize[2] may be smaller; this only matters once a grid needs more
// than maxGridSize[0] blocks. Confirm whether that range must be supported.
KernelConfig get_launch_config_advanced(size_t n_elements, int device_id) {
  // Conservative per-dimension limit used when the device cannot be queried.
  int max_blocks_per_dim = 65535;

  cudaDeviceProp prop;
  const cudaError_t err = cudaGetDeviceProperties(&prop, device_id);
  if (err == cudaSuccess) {
    max_blocks_per_dim = prop.maxGridSize[0];
  } else {
    fprintf(stderr,
            "get_launch_config_advanced: cudaGetDeviceProperties(%d) failed: "
            "%s\n",
            device_id, cudaGetErrorString(err));
  }

  const int threads_per_block = get_optimal_block_size(device_id);

  return get_launch_config(n_elements, threads_per_block, max_blocks_per_dim);
}
Add a kernel_config to calculate blocks and threads for launching kernels 2025-09-12 22:47:21 -04:00			`#include "kernel_config.cuh"`
			`#include <algorithm>`
			`#include <cstdio>`

			`size_t KernelConfig::total_threads() const {`
			`return (size_t)blocks.x * blocks.y * blocks.z * threads.x * threads.y *`
			`threads.z;`
			`}`

			`void KernelConfig::print() const {`
			`printf("Grid: (%u, %u, %u), Block: (%u, %u, %u), Total threads: %zu\n",`
			`blocks.x, blocks.y, blocks.z, threads.x, threads.y, threads.z,`
			`total_threads());`
			`}`

			`KernelConfig get_launch_config(size_t n_elements, int threads_per_block,`
			`int max_blocks_per_dim) {`

			`// Ensure threads_per_block is valid`
			`threads_per_block = std::min(threads_per_block, 1024);`
			`threads_per_block = std::max(threads_per_block, 32);`

			`// Calculate total blocks needed`
			`size_t total_blocks =`
			`(n_elements + threads_per_block - 1) / threads_per_block;`

			`dim3 threads(threads_per_block);`
			`dim3 blocks;`

			`if (total_blocks <= max_blocks_per_dim) {`
			`// Simple 1D grid`
			`blocks = dim3(total_blocks);`
			`} else {`
			`// Use 2D grid`
			`blocks.x = max_blocks_per_dim;`
			`blocks.y = (total_blocks + max_blocks_per_dim - 1) / max_blocks_per_dim;`

			`// If still too big, use 3D grid`
			`if (blocks.y > max_blocks_per_dim) {`
			`blocks.y = max_blocks_per_dim;`
			`blocks.z =`
			`(total_blocks + (size_t)max_blocks_per_dim * max_blocks_per_dim - 1) /`
			`((size_t)max_blocks_per_dim * max_blocks_per_dim);`
			`}`
			`}`

			`return KernelConfig(blocks, threads);`
			`}`

			`int get_optimal_block_size(int device_id) {`
			`cudaDeviceProp prop;`
			`cudaGetDeviceProperties(&prop, device_id);`

			`// Use a fraction of max threads per block for better occupancy`
			`// Typically 256 or 512 work well for most kernels`
			`if (prop.maxThreadsPerBlock >= 1024) {`
			`return 256; // Good balance of occupancy and register usage`
			`} else if (prop.maxThreadsPerBlock >= 512) {`
			`return 256;`
			`} else {`
			`return prop.maxThreadsPerBlock / 2;`
			`}`
			`}`

			`KernelConfig get_launch_config_advanced(size_t n_elements, int device_id) {`
			`cudaDeviceProp prop;`
			`cudaGetDeviceProperties(&prop, device_id);`

			`int threads_per_block = get_optimal_block_size(device_id);`
			`int max_blocks_per_dim = prop.maxGridSize[0];`

			`return get_launch_config(n_elements, threads_per_block, max_blocks_per_dim);`
			`}`