Add a kernel_config to calculate blocks and threads for launching kernels
Some checks failed
Build and Test / build-and-test (push) Failing after 5m3s
Some checks failed
Build and Test / build-and-test (push) Failing after 5m3s
This commit is contained in:
parent
130b613a7c
commit
8ba5714648
5 changed files with 169 additions and 9 deletions
|
@ -3,12 +3,14 @@ project(${NAME}_cuda_lib CUDA CXX)
|
||||||
set(HEADER_FILES
|
set(HEADER_FILES
|
||||||
potentials/pair_potentials.cuh
|
potentials/pair_potentials.cuh
|
||||||
forces.cuh
|
forces.cuh
|
||||||
|
kernel_config.cuh
|
||||||
)
|
)
|
||||||
set(SOURCE_FILES
|
set(SOURCE_FILES
|
||||||
|
kernel_config.cu
|
||||||
)
|
)
|
||||||
|
|
||||||
# The library contains header and source files.
|
# The library contains header and source files.
|
||||||
add_library(${NAME}_cuda_lib INTERFACE
|
add_library(${NAME}_cuda_lib STATIC
|
||||||
${SOURCE_FILES}
|
${SOURCE_FILES}
|
||||||
${HEADER_FILES}
|
${HEADER_FILES}
|
||||||
)
|
)
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
#ifndef FORCES_CUH
|
#ifndef FORCES_CUH
|
||||||
#define FORCES_CUH
|
#define FORCES_CUH
|
||||||
|
#include "kernel_config.cuh"
|
||||||
#include "potentials/pair_potentials.cuh"
|
#include "potentials/pair_potentials.cuh"
|
||||||
#include "precision.hpp"
|
#include "precision.hpp"
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
|
@ -18,7 +19,7 @@ __global__ void calc_forces_and_energies(float4 *pos, float4 *force_energies,
|
||||||
int n_particles, real *box_len,
|
int n_particles, real *box_len,
|
||||||
PotentialType potential) {
|
PotentialType potential) {
|
||||||
|
|
||||||
int i = blockIdx.x * blockDim.x + threadIdx.x;
|
int i = get_thread_id();
|
||||||
|
|
||||||
if (i < n_particles) {
|
if (i < n_particles) {
|
||||||
float4 my_pos = pos[i]; // Loads 16 bytes in one transaction
|
float4 my_pos = pos[i]; // Loads 16 bytes in one transaction
|
||||||
|
@ -54,7 +55,7 @@ __global__ void calc_forces_and_energies(float4 *pos, float4 *force_energies,
|
||||||
inline void launch_force_kernels(float4 *xs, float4 *force_energies,
|
inline void launch_force_kernels(float4 *xs, float4 *force_energies,
|
||||||
int n_particles, real *box_len,
|
int n_particles, real *box_len,
|
||||||
std::vector<PairPotentials> potentials,
|
std::vector<PairPotentials> potentials,
|
||||||
int grid_size, int block_size) {
|
dim3 blocks, dim3 threads_per_block) {
|
||||||
|
|
||||||
reset_forces_and_energies(n_particles, force_energies);
|
reset_forces_and_energies(n_particles, force_energies);
|
||||||
|
|
||||||
|
@ -62,8 +63,9 @@ inline void launch_force_kernels(float4 *xs, float4 *force_energies,
|
||||||
std::visit(
|
std::visit(
|
||||||
[&](const auto &potential) {
|
[&](const auto &potential) {
|
||||||
using PotentialType = std::decay_t<decltype(potential)>;
|
using PotentialType = std::decay_t<decltype(potential)>;
|
||||||
calc_forces_and_energies<PotentialType><<<grid_size, block_size>>>(
|
calc_forces_and_energies<PotentialType>
|
||||||
xs, force_energies, n_particles, box_len, potential);
|
<<<blocks, threads_per_block>>>(xs, force_energies, n_particles,
|
||||||
|
box_len, potential);
|
||||||
},
|
},
|
||||||
potential);
|
potential);
|
||||||
cudaDeviceSynchronize();
|
cudaDeviceSynchronize();
|
||||||
|
|
73
kernels/kernel_config.cu
Normal file
73
kernels/kernel_config.cu
Normal file
|
@ -0,0 +1,73 @@
|
||||||
|
#include "kernel_config.cuh"
|
||||||
|
#include <algorithm>
|
||||||
|
#include <cstdio>
|
||||||
|
|
||||||
|
size_t KernelConfig::total_threads() const {
|
||||||
|
return (size_t)blocks.x * blocks.y * blocks.z * threads.x * threads.y *
|
||||||
|
threads.z;
|
||||||
|
}
|
||||||
|
|
||||||
|
void KernelConfig::print() const {
|
||||||
|
printf("Grid: (%u, %u, %u), Block: (%u, %u, %u), Total threads: %zu\n",
|
||||||
|
blocks.x, blocks.y, blocks.z, threads.x, threads.y, threads.z,
|
||||||
|
total_threads());
|
||||||
|
}
|
||||||
|
|
||||||
|
KernelConfig get_launch_config(size_t n_elements, int threads_per_block,
|
||||||
|
int max_blocks_per_dim) {
|
||||||
|
|
||||||
|
// Ensure threads_per_block is valid
|
||||||
|
threads_per_block = std::min(threads_per_block, 1024);
|
||||||
|
threads_per_block = std::max(threads_per_block, 32);
|
||||||
|
|
||||||
|
// Calculate total blocks needed
|
||||||
|
size_t total_blocks =
|
||||||
|
(n_elements + threads_per_block - 1) / threads_per_block;
|
||||||
|
|
||||||
|
dim3 threads(threads_per_block);
|
||||||
|
dim3 blocks;
|
||||||
|
|
||||||
|
if (total_blocks <= max_blocks_per_dim) {
|
||||||
|
// Simple 1D grid
|
||||||
|
blocks = dim3(total_blocks);
|
||||||
|
} else {
|
||||||
|
// Use 2D grid
|
||||||
|
blocks.x = max_blocks_per_dim;
|
||||||
|
blocks.y = (total_blocks + max_blocks_per_dim - 1) / max_blocks_per_dim;
|
||||||
|
|
||||||
|
// If still too big, use 3D grid
|
||||||
|
if (blocks.y > max_blocks_per_dim) {
|
||||||
|
blocks.y = max_blocks_per_dim;
|
||||||
|
blocks.z =
|
||||||
|
(total_blocks + (size_t)max_blocks_per_dim * max_blocks_per_dim - 1) /
|
||||||
|
((size_t)max_blocks_per_dim * max_blocks_per_dim);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return KernelConfig(blocks, threads);
|
||||||
|
}
|
||||||
|
|
||||||
|
int get_optimal_block_size(int device_id) {
|
||||||
|
cudaDeviceProp prop;
|
||||||
|
cudaGetDeviceProperties(&prop, device_id);
|
||||||
|
|
||||||
|
// Use a fraction of max threads per block for better occupancy
|
||||||
|
// Typically 256 or 512 work well for most kernels
|
||||||
|
if (prop.maxThreadsPerBlock >= 1024) {
|
||||||
|
return 256; // Good balance of occupancy and register usage
|
||||||
|
} else if (prop.maxThreadsPerBlock >= 512) {
|
||||||
|
return 256;
|
||||||
|
} else {
|
||||||
|
return prop.maxThreadsPerBlock / 2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
KernelConfig get_launch_config_advanced(size_t n_elements, int device_id) {
|
||||||
|
cudaDeviceProp prop;
|
||||||
|
cudaGetDeviceProperties(&prop, device_id);
|
||||||
|
|
||||||
|
int threads_per_block = get_optimal_block_size(device_id);
|
||||||
|
int max_blocks_per_dim = prop.maxGridSize[0];
|
||||||
|
|
||||||
|
return get_launch_config(n_elements, threads_per_block, max_blocks_per_dim);
|
||||||
|
}
|
83
kernels/kernel_config.cuh
Normal file
83
kernels/kernel_config.cuh
Normal file
|
@ -0,0 +1,83 @@
|
||||||
|
#ifndef KERNEL_CONFIG_CUH
|
||||||
|
#define KERNEL_CONFIG_CUH
|
||||||
|
#include <cstdio>
|
||||||
|
#include <cuda_runtime.h>
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Structure to hold grid launch configuration
|
||||||
|
*/
|
||||||
|
struct KernelConfig {
|
||||||
|
dim3 blocks;
|
||||||
|
dim3 threads;
|
||||||
|
|
||||||
|
// Convenience constructor
|
||||||
|
KernelConfig(dim3 b, dim3 t) : blocks(b), threads(t) {}
|
||||||
|
|
||||||
|
// Total number of threads launched
|
||||||
|
size_t total_threads() const;
|
||||||
|
|
||||||
|
// Print configuration for debugging
|
||||||
|
void print() const;
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Calculate optimal CUDA launch configuration for 1D problem
|
||||||
|
*
|
||||||
|
* @param n_elements Number of elements to process
|
||||||
|
* @param threads_per_block Desired threads per block (default: 256)
|
||||||
|
* @param max_blocks_per_dim Maximum blocks per grid dimension (default: 65535)
|
||||||
|
* @return LaunchConfig with optimal grid and block dimensions
|
||||||
|
*/
|
||||||
|
KernelConfig get_launch_config(size_t n_elements, int threads_per_block = 256,
|
||||||
|
int max_blocks_per_dim = 65535);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Calculate 1D thread index for kernels launched with get_launch_config()
|
||||||
|
* Use this inside your CUDA kernels
|
||||||
|
*/
|
||||||
|
__device__ inline size_t get_thread_id() {
|
||||||
|
return (size_t)blockIdx.z * gridDim.x * gridDim.y * blockDim.x +
|
||||||
|
(size_t)blockIdx.y * gridDim.x * blockDim.x +
|
||||||
|
(size_t)blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Alternative version that takes grid dimensions as parameters
|
||||||
|
* Useful if you need the index calculation in multiple places
|
||||||
|
*/
|
||||||
|
__device__ inline size_t get_thread_id(dim3 gridDim, dim3 blockDim,
|
||||||
|
dim3 blockIdx, dim3 threadIdx) {
|
||||||
|
return (size_t)blockIdx.z * gridDim.x * gridDim.y * blockDim.x +
|
||||||
|
(size_t)blockIdx.y * gridDim.x * blockDim.x +
|
||||||
|
(size_t)blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* GPU device properties helper - gets optimal block size for current device
|
||||||
|
*/
|
||||||
|
int get_optimal_block_size(int device_id = 0);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Advanced version that considers device properties
|
||||||
|
*/
|
||||||
|
KernelConfig get_launch_config_advanced(size_t n_elements, int device_id = 0);
|
||||||
|
|
||||||
|
// Example usage in your kernel:
|
||||||
|
/*
|
||||||
|
template <typename PotentialType>
|
||||||
|
__global__ void calc_forces_and_energies(float4 *pos, float4 *force_energies,
|
||||||
|
size_t n_particles, real *box_len,
|
||||||
|
PotentialType potential) {
|
||||||
|
|
||||||
|
size_t i = get_thread_id();
|
||||||
|
|
||||||
|
if (i < n_particles) {
|
||||||
|
// Your existing force calculation code here...
|
||||||
|
float4 my_pos = pos[i];
|
||||||
|
// ... rest of kernel unchanged
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
#endif
|
|
@ -5,14 +5,12 @@
|
||||||
|
|
||||||
// Include your header files
|
// Include your header files
|
||||||
#include "forces.cuh"
|
#include "forces.cuh"
|
||||||
|
#include "kernel_config.cuh"
|
||||||
#include "potentials/pair_potentials.cuh"
|
#include "potentials/pair_potentials.cuh"
|
||||||
#include "precision.hpp"
|
#include "precision.hpp"
|
||||||
|
|
||||||
class CudaForceKernelTest : public ::testing::Test {
|
class CudaForceKernelTest : public ::testing::Test {
|
||||||
protected:
|
protected:
|
||||||
const int GRID_SIZE = 1;
|
|
||||||
const int BLOCK_SIZE = 4;
|
|
||||||
|
|
||||||
void SetUp() override {
|
void SetUp() override {
|
||||||
// Set up CUDA device
|
// Set up CUDA device
|
||||||
cudaError_t err = cudaSetDevice(0);
|
cudaError_t err = cudaSetDevice(0);
|
||||||
|
@ -61,13 +59,15 @@ protected:
|
||||||
std::vector<float4> force_energies(n_particles,
|
std::vector<float4> force_energies(n_particles,
|
||||||
make_float4(0.0, 0.0, 0.0, 0.0));
|
make_float4(0.0, 0.0, 0.0, 0.0));
|
||||||
|
|
||||||
|
KernelConfig kernel_config = get_launch_config(n_particles);
|
||||||
float4 *d_positions = allocateAndCopyToGPU(positions);
|
float4 *d_positions = allocateAndCopyToGPU(positions);
|
||||||
float4 *d_force_energies = allocateAndCopyToGPU(force_energies);
|
float4 *d_force_energies = allocateAndCopyToGPU(force_energies);
|
||||||
real *d_box_len = allocateAndCopyToGPU(box_dimensions);
|
real *d_box_len = allocateAndCopyToGPU(box_dimensions);
|
||||||
|
|
||||||
std::vector<PairPotentials> potentials = {LennardJones(1.0, 1.0, 3.0)};
|
std::vector<PairPotentials> potentials = {LennardJones(1.0, 1.0, 3.0)};
|
||||||
CAC::launch_force_kernels(d_positions, d_force_energies, n_particles,
|
CAC::launch_force_kernels(d_positions, d_force_energies, n_particles,
|
||||||
d_box_len, potentials, GRID_SIZE, BLOCK_SIZE);
|
d_box_len, potentials, kernel_config.blocks,
|
||||||
|
kernel_config.threads);
|
||||||
|
|
||||||
checkCudaError(cudaGetLastError(), "kernel launch");
|
checkCudaError(cudaGetLastError(), "kernel launch");
|
||||||
checkCudaError(cudaDeviceSynchronize(), "kernel execution");
|
checkCudaError(cudaDeviceSynchronize(), "kernel execution");
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue