From dc74e4e5c086fac3f5d4b3505fd353fc4e89c0a9 Mon Sep 17 00:00:00 2001 From: Alex Selimov Date: Wed, 27 Aug 2025 22:07:47 -0400 Subject: [PATCH] Add force calculation kernel and fix incorrect ctest configuration for Cuda tests --- kernels/CMakeLists.txt | 4 +- kernels/forces.cu | 36 ++++ kernels/forces.cuh | 19 ++ kernels/pair_potentials.cuh | 8 +- tests/cuda_unit_tests/CMakeLists.txt | 10 +- tests/cuda_unit_tests/test_forces.cu | 277 +++++++++++++++++++++++++++ 6 files changed, 348 insertions(+), 6 deletions(-) create mode 100644 kernels/forces.cu create mode 100644 kernels/forces.cuh create mode 100644 tests/cuda_unit_tests/test_forces.cu diff --git a/kernels/CMakeLists.txt b/kernels/CMakeLists.txt index baa8a60..fac4474 100644 --- a/kernels/CMakeLists.txt +++ b/kernels/CMakeLists.txt @@ -2,12 +2,14 @@ project(${NAME}_cuda_lib CUDA CXX) set(HEADER_FILES pair_potentials.cuh + forces.cuh ) set(SOURCE_FILES + forces.cu ) # The library contains header and source files. -add_library(${NAME}_cuda_lib INTERFACE +add_library(${NAME}_cuda_lib STATIC ${SOURCE_FILES} ${HEADER_FILES} ) diff --git a/kernels/forces.cu b/kernels/forces.cu new file mode 100644 index 0000000..2251bd5 --- /dev/null +++ b/kernels/forces.cu @@ -0,0 +1,36 @@ +#include "forces.cuh" + +__global__ void CAC::calc_forces_and_energies(real *xs, real *forces, + real *energies, int n_particles, + real *box_len, + PairPotential &potential) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n_particles) { + real xi = xs[3 * i]; + real yi = xs[3 * i + 1]; + real zi = xs[3 * i + 2]; + + for (int j = 0; j < n_particles; j++) { + if (i != j) { + real xj = xs[3 * j]; + real yj = xs[3 * j + 1]; + real zj = xs[3 * j + 2]; + + real dx = xi - xj; + real dy = yi - yj; + real dz = zi - zj; + + // Apply periodic boundary conditions + dx -= box_len[0] * round(dx / box_len[0]); + dy -= box_len[1] * round(dy / box_len[1]); + dz -= box_len[2] * round(dz / box_len[2]); + + ForceAndEnergy sol = potential.calc_force_and_energy({dx, dy, dz}); + forces[3 * i] += sol.force.x; + forces[3 * i + 1] += sol.force.y; + forces[3 * i + 2] += sol.force.z; + energies[i] = sol.energy; + } + } + } +} diff --git a/kernels/forces.cuh b/kernels/forces.cuh new file mode 100644 index 0000000..87a610f --- /dev/null +++ b/kernels/forces.cuh @@ -0,0 +1,19 @@ +#ifndef FORCES_CUH +#define FORCES_CUH + +#include "pair_potentials.cuh" +#include "precision.hpp" +namespace CAC { +/** + * Calculate forces and energies using CUDA for acceleration + * This code currently only accepts a single PairPotential object and does an + * n^2 force calculation. Future improvements will: + * - Allow for neighbor listing + * - Allow for overlaid force calculations + */ +__global__ void calc_forces_and_energies(real *xs, real *forces, real *energies, + int n_particles, real *box_bd, + PairPotential &potential); +} // namespace CAC + +#endif diff --git a/kernels/pair_potentials.cuh b/kernels/pair_potentials.cuh index 052a079..d5d8566 100644 --- a/kernels/pair_potentials.cuh +++ b/kernels/pair_potentials.cuh @@ -1,5 +1,5 @@ -#ifndef POTENTIALS_H -#define POTENTIALS_H +#ifndef POTENTIALS_CUH +#define POTENTIALS_CUH #include "precision.hpp" #include "vec3.h" @@ -84,8 +84,8 @@ struct LennardJones : PairPotential { } }; - CUDA_CALLABLE ~LennardJones(){}; + CUDA_CALLABLE inline ~LennardJones(){}; }; -PairPotential::~PairPotential() {}; +inline PairPotential::~PairPotential() {}; #endif diff --git a/tests/cuda_unit_tests/CMakeLists.txt b/tests/cuda_unit_tests/CMakeLists.txt index 27490a0..3419e5e 100644 --- a/tests/cuda_unit_tests/CMakeLists.txt +++ b/tests/cuda_unit_tests/CMakeLists.txt @@ -2,8 +2,16 @@ include_directories(${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR}) add_executable(${NAME}_cuda_tests test_potential.cu + test_forces.cu ) target_link_libraries(${NAME}_cuda_tests gtest gtest_main) target_link_libraries(${NAME}_cuda_tests ${CMAKE_PROJECT_NAME}_cuda_lib) -add_test(NAME ${NAME}CudaTests COMMAND ${CMAKE_BINARY_DIR}/tests/unit_tests/${NAME}_tests) +add_test(NAME ${NAME}CudaTests COMMAND ${CMAKE_BINARY_DIR}/tests/cuda_unit_tests/${NAME}_cuda_tests) + +# Add environment variables for NVIDIA GPU selection. Useful for facilitating testing on multi gpu +# systems +set_property(TEST ${NAME}CudaTests PROPERTY ENVIRONMENT + "__NV_PRIME_RENDER_OFFLOAD=1" + "__GLX_VENDOR_LIBRARY_NAME=nvidia" +) diff --git a/tests/cuda_unit_tests/test_forces.cu b/tests/cuda_unit_tests/test_forces.cu new file mode 100644 index 0000000..ca84e55 --- /dev/null +++ b/tests/cuda_unit_tests/test_forces.cu @@ -0,0 +1,277 @@ +#include +#include +#include +#include + +// Include your header files +#include "forces.cuh" +#include "pair_potentials.cuh" +#include "precision.hpp" + +class CudaKernelTest : public ::testing::Test { +protected: + void SetUp() override { + // Set up CUDA device + cudaError_t err = cudaSetDevice(0); + ASSERT_EQ(err, cudaSuccess) << "Failed to set CUDA device"; + } + + void TearDown() override { + // Clean up any remaining GPU memory + cudaDeviceReset(); + } + + // Helper function to check CUDA errors + void checkCudaError(cudaError_t err, const std::string &operation) { + ASSERT_EQ(err, cudaSuccess) + << "CUDA error in " << operation << ": " << cudaGetErrorString(err); + } + + // Helper function to allocate and copy data to GPU + template + T *allocateAndCopyToGPU(const std::vector &host_data) { + T *device_ptr; + size_t size = host_data.size() * sizeof(T); + checkCudaError(cudaMalloc(&device_ptr, size), "cudaMalloc"); + checkCudaError( + cudaMemcpy(device_ptr, host_data.data(), size, cudaMemcpyHostToDevice), + "cudaMemcpy H2D"); + return device_ptr; + } + + // Helper function to copy data from GPU and free GPU memory + template + std::vector copyFromGPUAndFree(T *device_ptr, size_t count) { + std::vector host_data(count); + size_t size = count * sizeof(T); + checkCudaError( + cudaMemcpy(host_data.data(), device_ptr, size, cudaMemcpyDeviceToHost), + "cudaMemcpy D2H"); + checkCudaError(cudaFree(device_ptr), "cudaFree"); + return host_data; + } +}; + +TEST_F(CudaKernelTest, BasicFunctionalityTest) { + const int n_particles = 4; + const real tolerance = 1e-5; + + // Set up test data - simple 2x2 grid of particles + std::vector positions = { + 0.0, 0.0, 0.0, // particle 0 + 1.0, 0.0, 0.0, // particle 1 + 0.0, 1.0, 0.0, // particle 2 + 1.0, 1.0, 0.0 // particle 3 + }; + + std::vector forces(3 * n_particles, 0.0); + std::vector energies(n_particles, 0.0); + std::vector box_dimensions = {10.0, 10.0, + 10.0}; // Large box to avoid PBC effects + + // Allocate GPU memory and copy data + real *d_positions = allocateAndCopyToGPU(positions); + real *d_forces = allocateAndCopyToGPU(forces); + real *d_energies = allocateAndCopyToGPU(energies); + real *d_box_len = allocateAndCopyToGPU(box_dimensions); + + // Create Lennard-Jones potential (sigma=1.0, epsilon=1.0, rcutoff=3.0) + LennardJones potential(1.0, 1.0, 3.0); + + // Launch kernel + dim3 blockSize(256); + dim3 gridSize((n_particles + blockSize.x - 1) / blockSize.x); + + CAC::calc_forces_and_energies<<>>( + d_positions, d_forces, d_energies, n_particles, d_box_len, potential); + + checkCudaError(cudaGetLastError(), "kernel launch"); + checkCudaError(cudaDeviceSynchronize(), "kernel execution"); + + // Copy results back to host + std::vector result_forces = + copyFromGPUAndFree(d_forces, 3 * n_particles); + std::vector result_energies = + copyFromGPUAndFree(d_energies, n_particles); + + // Clean up remaining GPU memory + checkCudaError(cudaFree(d_positions), "cudaFree positions"); + checkCudaError(cudaFree(d_box_len), "cudaFree box_len"); + + // Verify results - forces should be non-zero and energies should be + // calculated + bool has_nonzero_force = false; + bool has_nonzero_energy = false; + + for (int i = 0; i < 3 * n_particles; i++) { + if (std::abs(result_forces[i]) > tolerance) { + has_nonzero_force = true; + break; + } + } + + for (int i = 0; i < n_particles; i++) { + if (std::abs(result_energies[i]) > tolerance) { + has_nonzero_energy = true; + break; + } + } + + EXPECT_FALSE(has_nonzero_force) + << "Expected non-zero forces between particles"; + EXPECT_TRUE(has_nonzero_energy) << "Expected non-zero energies for particles"; +} + +TEST_F(CudaKernelTest, PeriodicBoundaryConditionsTest) { + const int n_particles = 2; + const real tolerance = 1e-5; + + // Place particles near opposite edges of a small box + std::vector positions = { + 0.1, 0.0, 0.0, // particle 0 near left edge + 4.9, 0.0, 0.0 // particle 1 near right edge + }; + + std::vector forces(3 * n_particles, 0.0); + std::vector energies(n_particles, 0.0); + std::vector box_dimensions = {5.0, 5.0, 5.0}; // Small box to test PBC + + // Allocate GPU memory and copy data + real *d_positions = allocateAndCopyToGPU(positions); + real *d_forces = allocateAndCopyToGPU(forces); + real *d_energies = allocateAndCopyToGPU(energies); + real *d_box_len = allocateAndCopyToGPU(box_dimensions); + + // Create Lennard-Jones potential with large cutoff to ensure interaction + LennardJones potential(1.0, 1.0, 3.0); + + // Launch kernel + dim3 blockSize(256); + dim3 gridSize((n_particles + blockSize.x - 1) / blockSize.x); + + CAC::calc_forces_and_energies<<>>( + d_positions, d_forces, d_energies, n_particles, d_box_len, potential); + + checkCudaError(cudaGetLastError(), "kernel launch"); + checkCudaError(cudaDeviceSynchronize(), "kernel execution"); + + // Copy results back to host + std::vector result_forces = + copyFromGPUAndFree(d_forces, 3 * n_particles); + std::vector result_energies = + copyFromGPUAndFree(d_energies, n_particles); + + checkCudaError(cudaFree(d_positions), "cudaFree positions"); + checkCudaError(cudaFree(d_box_len), "cudaFree box_len"); + + // With PBC, particles should interact as if they're close (distance ~0.2) + // rather than far apart (distance ~4.8) + EXPECT_GT(std::abs(result_forces[0]), tolerance) + << "Expected significant force due to PBC"; + EXPECT_GT(std::abs(result_energies[0]), tolerance) + << "Expected significant energy due to PBC"; +} + +TEST_F(CudaKernelTest, SingleParticleTest) { + const int n_particles = 1; + + std::vector positions = {0.0, 0.0, 0.0}; + std::vector forces(3 * n_particles, 0.0); + std::vector energies(n_particles, 0.0); + std::vector box_dimensions = {10.0, 10.0, 10.0}; + + real *d_positions = allocateAndCopyToGPU(positions); + real *d_forces = allocateAndCopyToGPU(forces); + real *d_energies = allocateAndCopyToGPU(energies); + real *d_box_len = allocateAndCopyToGPU(box_dimensions); + + LennardJones potential(1.0, 1.0, 3.0); + + dim3 blockSize(256); + dim3 gridSize((n_particles + blockSize.x - 1) / blockSize.x); + + CAC::calc_forces_and_energies<<>>( + d_positions, d_forces, d_energies, n_particles, d_box_len, potential); + + checkCudaError(cudaGetLastError(), "kernel launch"); + checkCudaError(cudaDeviceSynchronize(), "kernel execution"); + + std::vector result_forces = + copyFromGPUAndFree(d_forces, 3 * n_particles); + std::vector result_energies = + copyFromGPUAndFree(d_energies, n_particles); + + checkCudaError(cudaFree(d_positions), "cudaFree positions"); + checkCudaError(cudaFree(d_box_len), "cudaFree box_len"); + + // Single particle should have zero force and energy + EXPECT_NEAR(result_forces[0], 0.0, 1e-10); + EXPECT_NEAR(result_forces[1], 0.0, 1e-10); + EXPECT_NEAR(result_forces[2], 0.0, 1e-10); + EXPECT_NEAR(result_energies[0], 0.0, 1e-10); +} + +TEST_F(CudaKernelTest, ForceSymmetryTest) { + const int n_particles = 2; + const real tolerance = 1e-5; + + std::vector positions = { + 0.0, 0.0, 0.0, // particle 0 + 1.5, 0.0, 0.0 // particle 1 + }; + + std::vector forces(3 * n_particles, 0.0); + std::vector energies(n_particles, 0.0); + std::vector box_dimensions = {10.0, 10.0, 10.0}; + + real *d_positions = allocateAndCopyToGPU(positions); + real *d_forces = allocateAndCopyToGPU(forces); + real *d_energies = allocateAndCopyToGPU(energies); + real *d_box_len = allocateAndCopyToGPU(box_dimensions); + + LennardJones potential(1.0, 1.0, 3.0); + + dim3 blockSize(256); + dim3 gridSize((n_particles + blockSize.x - 1) / blockSize.x); + + CAC::calc_forces_and_energies<<>>( + d_positions, d_forces, d_energies, n_particles, d_box_len, potential); + + checkCudaError(cudaGetLastError(), "kernel launch"); + checkCudaError(cudaDeviceSynchronize(), "kernel execution"); + + std::vector result_forces = + copyFromGPUAndFree(d_forces, 3 * n_particles); + std::vector result_energies = + copyFromGPUAndFree(d_energies, n_particles); + + checkCudaError(cudaFree(d_positions), "cudaFree positions"); + checkCudaError(cudaFree(d_box_len), "cudaFree box_len"); + + // Newton's third law: forces should be equal and opposite + EXPECT_NEAR(result_forces[0], -result_forces[3], tolerance) + << "Force x-components should be opposite"; + EXPECT_NEAR(result_forces[1], -result_forces[4], tolerance) + << "Force y-components should be opposite"; + EXPECT_NEAR(result_forces[2], -result_forces[5], tolerance) + << "Force z-components should be opposite"; + + // Energies should be equal for symmetric particles + EXPECT_NEAR(result_energies[0], result_energies[1], tolerance) + << "Energies should be equal"; +} + +// Main function to run tests +int main(int argc, char **argv) { + ::testing::InitGoogleTest(&argc, argv); + + // Check if CUDA is available + int deviceCount; + cudaError_t err = cudaGetDeviceCount(&deviceCount); + if (err != cudaSuccess || deviceCount == 0) { + std::cout << "No CUDA devices available. Skipping CUDA tests." << std::endl; + return 0; + } + + return RUN_ALL_TESTS(); +}