Add a kernel_config to calculate blocks and threads for launching kernels
Some checks failed
Build and Test / build-and-test (push) Failing after 5m3s
Some checks failed
Build and Test / build-and-test (push) Failing after 5m3s
This commit is contained in:
parent
130b613a7c
commit
8ba5714648
5 changed files with 169 additions and 9 deletions
83
kernels/kernel_config.cuh
Normal file
83
kernels/kernel_config.cuh
Normal file
|
@ -0,0 +1,83 @@
|
|||
#ifndef KERNEL_CONFIG_CUH
|
||||
#define KERNEL_CONFIG_CUH
|
||||
#include <cstdio>
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
/**
|
||||
* Structure to hold grid launch configuration
|
||||
*/
|
||||
struct KernelConfig {
|
||||
dim3 blocks;
|
||||
dim3 threads;
|
||||
|
||||
// Convenience constructor
|
||||
KernelConfig(dim3 b, dim3 t) : blocks(b), threads(t) {}
|
||||
|
||||
// Total number of threads launched
|
||||
size_t total_threads() const;
|
||||
|
||||
// Print configuration for debugging
|
||||
void print() const;
|
||||
};
|
||||
|
||||
/**
|
||||
* Calculate optimal CUDA launch configuration for 1D problem
|
||||
*
|
||||
* @param n_elements Number of elements to process
|
||||
* @param threads_per_block Desired threads per block (default: 256)
|
||||
* @param max_blocks_per_dim Maximum blocks per grid dimension (default: 65535)
|
||||
* @return LaunchConfig with optimal grid and block dimensions
|
||||
*/
|
||||
KernelConfig get_launch_config(size_t n_elements, int threads_per_block = 256,
|
||||
int max_blocks_per_dim = 65535);
|
||||
|
||||
/**
|
||||
* Calculate 1D thread index for kernels launched with get_launch_config()
|
||||
* Use this inside your CUDA kernels
|
||||
*/
|
||||
__device__ inline size_t get_thread_id() {
|
||||
return (size_t)blockIdx.z * gridDim.x * gridDim.y * blockDim.x +
|
||||
(size_t)blockIdx.y * gridDim.x * blockDim.x +
|
||||
(size_t)blockIdx.x * blockDim.x + threadIdx.x;
|
||||
}
|
||||
|
||||
/**
|
||||
* Alternative version that takes grid dimensions as parameters
|
||||
* Useful if you need the index calculation in multiple places
|
||||
*/
|
||||
__device__ inline size_t get_thread_id(dim3 gridDim, dim3 blockDim,
|
||||
dim3 blockIdx, dim3 threadIdx) {
|
||||
return (size_t)blockIdx.z * gridDim.x * gridDim.y * blockDim.x +
|
||||
(size_t)blockIdx.y * gridDim.x * blockDim.x +
|
||||
(size_t)blockIdx.x * blockDim.x + threadIdx.x;
|
||||
}
|
||||
|
||||
/**
|
||||
* GPU device properties helper - gets optimal block size for current device
|
||||
*/
|
||||
int get_optimal_block_size(int device_id = 0);
|
||||
|
||||
/**
|
||||
* Advanced version that considers device properties
|
||||
*/
|
||||
KernelConfig get_launch_config_advanced(size_t n_elements, int device_id = 0);
|
||||
|
||||
// Example usage in your kernel:
|
||||
/*
|
||||
template <typename PotentialType>
|
||||
__global__ void calc_forces_and_energies(float4 *pos, float4 *force_energies,
|
||||
size_t n_particles, real *box_len,
|
||||
PotentialType potential) {
|
||||
|
||||
size_t i = get_thread_id();
|
||||
|
||||
if (i < n_particles) {
|
||||
// Your existing force calculation code here...
|
||||
float4 my_pos = pos[i];
|
||||
// ... rest of kernel unchanged
|
||||
}
|
||||
}
|
||||
|
||||
*/
|
||||
|
||||
#endif
|
Loading…
Add table
Add a link
Reference in a new issue