From dc74e4e5c086fac3f5d4b3505fd353fc4e89c0a9 Mon Sep 17 00:00:00 2001
From: Alex Selimov <alex@alexselimov.com>
Date: Wed, 27 Aug 2025 22:07:47 -0400
Subject: [PATCH] Add force calculation kernel and fix incorrect ctest
 configuration for Cuda tests

---
 kernels/CMakeLists.txt               |   4 +-
 kernels/forces.cu                    |  36 ++++
 kernels/forces.cuh                   |  19 ++
 kernels/pair_potentials.cuh          |   8 +-
 tests/cuda_unit_tests/CMakeLists.txt |  10 +-
 tests/cuda_unit_tests/test_forces.cu | 277 +++++++++++++++++++++++++++
 6 files changed, 348 insertions(+), 6 deletions(-)
 create mode 100644 kernels/forces.cu
 create mode 100644 kernels/forces.cuh
 create mode 100644 tests/cuda_unit_tests/test_forces.cu

diff --git a/kernels/CMakeLists.txt b/kernels/CMakeLists.txt
index baa8a60..fac4474 100644
--- a/kernels/CMakeLists.txt
+++ b/kernels/CMakeLists.txt
@@ -2,12 +2,14 @@ project(${NAME}_cuda_lib CUDA CXX)
 
 set(HEADER_FILES
     pair_potentials.cuh
+    forces.cuh
 )
 set(SOURCE_FILES
+    forces.cu
 )
 
 # The library contains header and source files.
-add_library(${NAME}_cuda_lib INTERFACE
+add_library(${NAME}_cuda_lib STATIC
     ${SOURCE_FILES}
     ${HEADER_FILES}
 )
diff --git a/kernels/forces.cu b/kernels/forces.cu
new file mode 100644
index 0000000..2251bd5
--- /dev/null
+++ b/kernels/forces.cu
@@ -0,0 +1,36 @@
+#include "forces.cuh"
+
+__global__ void CAC::calc_forces_and_energies(real *xs, real *forces,
+                                              real *energies, int n_particles,
+                                              real *box_len,
+                                              PairPotential &potential) {
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < n_particles) {
+    real xi = xs[3 * i];
+    real yi = xs[3 * i + 1];
+    real zi = xs[3 * i + 2];
+
+    for (int j = 0; j < n_particles; j++) {
+      if (i != j) {
+        real xj = xs[3 * j];
+        real yj = xs[3 * j + 1];
+        real zj = xs[3 * j + 2];
+
+        real dx = xi - xj;
+        real dy = yi - yj;
+        real dz = zi - zj;
+
+        // Apply periodic boundary conditions
+        dx -= box_len[0] * round(dx / box_len[0]);
+        dy -= box_len[1] * round(dy / box_len[1]);
+        dz -= box_len[2] * round(dz / box_len[2]);
+
+        ForceAndEnergy sol = potential.calc_force_and_energy({dx, dy, dz});
+        forces[3 * i] += sol.force.x;
+        forces[3 * i + 1] += sol.force.y;
+        forces[3 * i + 2] += sol.force.z;
+        energies[i] = sol.energy;
+      }
+    }
+  }
+}
diff --git a/kernels/forces.cuh b/kernels/forces.cuh
new file mode 100644
index 0000000..87a610f
--- /dev/null
+++ b/kernels/forces.cuh
@@ -0,0 +1,19 @@
+#ifndef FORCES_CUH
+#define FORCES_CUH
+
+#include "pair_potentials.cuh"
+#include "precision.hpp"
+namespace CAC {
+/**
+ * Calculate forces and energies using CUDA for acceleration
+ * This code currently only accepts a single PairPotential object and does an
+ * n^2 force calculation. Future improvements will:
+ *   - Allow for neighbor listing
+ *   - Allow for overlaid force calculations
+ */
+__global__ void calc_forces_and_energies(real *xs, real *forces, real *energies,
+                                         int n_particles, real *box_bd,
+                                         PairPotential &potential);
+} // namespace CAC
+
+#endif
diff --git a/kernels/pair_potentials.cuh b/kernels/pair_potentials.cuh
index 052a079..d5d8566 100644
--- a/kernels/pair_potentials.cuh
+++ b/kernels/pair_potentials.cuh
@@ -1,5 +1,5 @@
-#ifndef POTENTIALS_H
-#define POTENTIALS_H
+#ifndef POTENTIALS_CUH
+#define POTENTIALS_CUH
 
 #include "precision.hpp"
 #include "vec3.h"
@@ -84,8 +84,8 @@ struct LennardJones : PairPotential {
     }
   };
 
-  CUDA_CALLABLE ~LennardJones(){};
+  CUDA_CALLABLE inline ~LennardJones(){};
 };
 
-PairPotential::~PairPotential() {};
+inline PairPotential::~PairPotential() {};
 #endif
diff --git a/tests/cuda_unit_tests/CMakeLists.txt b/tests/cuda_unit_tests/CMakeLists.txt
index 27490a0..3419e5e 100644
--- a/tests/cuda_unit_tests/CMakeLists.txt
+++ b/tests/cuda_unit_tests/CMakeLists.txt
@@ -2,8 +2,16 @@ include_directories(${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR})
 
 add_executable(${NAME}_cuda_tests
     test_potential.cu
+    test_forces.cu
 )
 
 target_link_libraries(${NAME}_cuda_tests gtest gtest_main)
 target_link_libraries(${NAME}_cuda_tests ${CMAKE_PROJECT_NAME}_cuda_lib)
-add_test(NAME ${NAME}CudaTests COMMAND ${CMAKE_BINARY_DIR}/tests/unit_tests/${NAME}_tests)
+add_test(NAME ${NAME}CudaTests COMMAND ${CMAKE_BINARY_DIR}/tests/cuda_unit_tests/${NAME}_cuda_tests)
+
+# Add environment variables for NVIDIA GPU selection. Useful for facilitating testing on multi gpu
+# systems
+set_property(TEST ${NAME}CudaTests PROPERTY ENVIRONMENT
+    "__NV_PRIME_RENDER_OFFLOAD=1"
+    "__GLX_VENDOR_LIBRARY_NAME=nvidia"
+)
diff --git a/tests/cuda_unit_tests/test_forces.cu b/tests/cuda_unit_tests/test_forces.cu
new file mode 100644
index 0000000..ca84e55
--- /dev/null
+++ b/tests/cuda_unit_tests/test_forces.cu
@@ -0,0 +1,277 @@
+#include <cmath>
+#include <cuda_runtime.h>
+#include <gtest/gtest.h>
+#include <vector>
+
+// Include your header files
+#include "forces.cuh"
+#include "pair_potentials.cuh"
+#include "precision.hpp"
+
+class CudaKernelTest : public ::testing::Test {
+protected:
+  void SetUp() override {
+    // Set up CUDA device
+    cudaError_t err = cudaSetDevice(0);
+    ASSERT_EQ(err, cudaSuccess) << "Failed to set CUDA device";
+  }
+
+  void TearDown() override {
+    // Clean up any remaining GPU memory
+    cudaDeviceReset();
+  }
+
+  // Helper function to check CUDA errors
+  void checkCudaError(cudaError_t err, const std::string &operation) {
+    ASSERT_EQ(err, cudaSuccess)
+        << "CUDA error in " << operation << ": " << cudaGetErrorString(err);
+  }
+
+  // Helper function to allocate and copy data to GPU
+  template <typename T>
+  T *allocateAndCopyToGPU(const std::vector<T> &host_data) {
+    T *device_ptr;
+    size_t size = host_data.size() * sizeof(T);
+    checkCudaError(cudaMalloc(&device_ptr, size), "cudaMalloc");
+    checkCudaError(
+        cudaMemcpy(device_ptr, host_data.data(), size, cudaMemcpyHostToDevice),
+        "cudaMemcpy H2D");
+    return device_ptr;
+  }
+
+  // Helper function to copy data from GPU and free GPU memory
+  template <typename T>
+  std::vector<T> copyFromGPUAndFree(T *device_ptr, size_t count) {
+    std::vector<T> host_data(count);
+    size_t size = count * sizeof(T);
+    checkCudaError(
+        cudaMemcpy(host_data.data(), device_ptr, size, cudaMemcpyDeviceToHost),
+        "cudaMemcpy D2H");
+    checkCudaError(cudaFree(device_ptr), "cudaFree");
+    return host_data;
+  }
+};
+
+TEST_F(CudaKernelTest, BasicFunctionalityTest) {
+  const int n_particles = 4;
+  const real tolerance = 1e-5;
+
+  // Set up test data - simple 2x2 grid of particles
+  std::vector<real> positions = {
+      0.0, 0.0, 0.0, // particle 0
+      1.0, 0.0, 0.0, // particle 1
+      0.0, 1.0, 0.0, // particle 2
+      1.0, 1.0, 0.0  // particle 3
+  };
+
+  std::vector<real> forces(3 * n_particles, 0.0);
+  std::vector<real> energies(n_particles, 0.0);
+  std::vector<real> box_dimensions = {10.0, 10.0,
+                                      10.0}; // Large box to avoid PBC effects
+
+  // Allocate GPU memory and copy data
+  real *d_positions = allocateAndCopyToGPU(positions);
+  real *d_forces = allocateAndCopyToGPU(forces);
+  real *d_energies = allocateAndCopyToGPU(energies);
+  real *d_box_len = allocateAndCopyToGPU(box_dimensions);
+
+  // Create Lennard-Jones potential (sigma=1.0, epsilon=1.0, rcutoff=3.0)
+  LennardJones potential(1.0, 1.0, 3.0);
+
+  // Launch kernel
+  dim3 blockSize(256);
+  dim3 gridSize((n_particles + blockSize.x - 1) / blockSize.x);
+
+  CAC::calc_forces_and_energies<<<gridSize, blockSize>>>(
+      d_positions, d_forces, d_energies, n_particles, d_box_len, potential);
+
+  checkCudaError(cudaGetLastError(), "kernel launch");
+  checkCudaError(cudaDeviceSynchronize(), "kernel execution");
+
+  // Copy results back to host
+  std::vector<real> result_forces =
+      copyFromGPUAndFree(d_forces, 3 * n_particles);
+  std::vector<real> result_energies =
+      copyFromGPUAndFree(d_energies, n_particles);
+
+  // Clean up remaining GPU memory
+  checkCudaError(cudaFree(d_positions), "cudaFree positions");
+  checkCudaError(cudaFree(d_box_len), "cudaFree box_len");
+
+  // Verify results - forces should be non-zero and energies should be
+  // calculated
+  bool has_nonzero_force = false;
+  bool has_nonzero_energy = false;
+
+  for (int i = 0; i < 3 * n_particles; i++) {
+    if (std::abs(result_forces[i]) > tolerance) {
+      has_nonzero_force = true;
+      break;
+    }
+  }
+
+  for (int i = 0; i < n_particles; i++) {
+    if (std::abs(result_energies[i]) > tolerance) {
+      has_nonzero_energy = true;
+      break;
+    }
+  }
+
+  EXPECT_FALSE(has_nonzero_force)
+      << "Expected non-zero forces between particles";
+  EXPECT_TRUE(has_nonzero_energy) << "Expected non-zero energies for particles";
+}
+
+TEST_F(CudaKernelTest, PeriodicBoundaryConditionsTest) {
+  const int n_particles = 2;
+  const real tolerance = 1e-5;
+
+  // Place particles near opposite edges of a small box
+  std::vector<real> positions = {
+      0.1, 0.0, 0.0, // particle 0 near left edge
+      4.9, 0.0, 0.0  // particle 1 near right edge
+  };
+
+  std::vector<real> forces(3 * n_particles, 0.0);
+  std::vector<real> energies(n_particles, 0.0);
+  std::vector<real> box_dimensions = {5.0, 5.0, 5.0}; // Small box to test PBC
+
+  // Allocate GPU memory and copy data
+  real *d_positions = allocateAndCopyToGPU(positions);
+  real *d_forces = allocateAndCopyToGPU(forces);
+  real *d_energies = allocateAndCopyToGPU(energies);
+  real *d_box_len = allocateAndCopyToGPU(box_dimensions);
+
+  // Create Lennard-Jones potential with large cutoff to ensure interaction
+  LennardJones potential(1.0, 1.0, 3.0);
+
+  // Launch kernel
+  dim3 blockSize(256);
+  dim3 gridSize((n_particles + blockSize.x - 1) / blockSize.x);
+
+  CAC::calc_forces_and_energies<<<gridSize, blockSize>>>(
+      d_positions, d_forces, d_energies, n_particles, d_box_len, potential);
+
+  checkCudaError(cudaGetLastError(), "kernel launch");
+  checkCudaError(cudaDeviceSynchronize(), "kernel execution");
+
+  // Copy results back to host
+  std::vector<real> result_forces =
+      copyFromGPUAndFree(d_forces, 3 * n_particles);
+  std::vector<real> result_energies =
+      copyFromGPUAndFree(d_energies, n_particles);
+
+  checkCudaError(cudaFree(d_positions), "cudaFree positions");
+  checkCudaError(cudaFree(d_box_len), "cudaFree box_len");
+
+  // With PBC, particles should interact as if they're close (distance ~0.2)
+  // rather than far apart (distance ~4.8)
+  EXPECT_GT(std::abs(result_forces[0]), tolerance)
+      << "Expected significant force due to PBC";
+  EXPECT_GT(std::abs(result_energies[0]), tolerance)
+      << "Expected significant energy due to PBC";
+}
+
+TEST_F(CudaKernelTest, SingleParticleTest) {
+  const int n_particles = 1;
+
+  std::vector<real> positions = {0.0, 0.0, 0.0};
+  std::vector<real> forces(3 * n_particles, 0.0);
+  std::vector<real> energies(n_particles, 0.0);
+  std::vector<real> box_dimensions = {10.0, 10.0, 10.0};
+
+  real *d_positions = allocateAndCopyToGPU(positions);
+  real *d_forces = allocateAndCopyToGPU(forces);
+  real *d_energies = allocateAndCopyToGPU(energies);
+  real *d_box_len = allocateAndCopyToGPU(box_dimensions);
+
+  LennardJones potential(1.0, 1.0, 3.0);
+
+  dim3 blockSize(256);
+  dim3 gridSize((n_particles + blockSize.x - 1) / blockSize.x);
+
+  CAC::calc_forces_and_energies<<<gridSize, blockSize>>>(
+      d_positions, d_forces, d_energies, n_particles, d_box_len, potential);
+
+  checkCudaError(cudaGetLastError(), "kernel launch");
+  checkCudaError(cudaDeviceSynchronize(), "kernel execution");
+
+  std::vector<real> result_forces =
+      copyFromGPUAndFree(d_forces, 3 * n_particles);
+  std::vector<real> result_energies =
+      copyFromGPUAndFree(d_energies, n_particles);
+
+  checkCudaError(cudaFree(d_positions), "cudaFree positions");
+  checkCudaError(cudaFree(d_box_len), "cudaFree box_len");
+
+  // Single particle should have zero force and energy
+  EXPECT_NEAR(result_forces[0], 0.0, 1e-10);
+  EXPECT_NEAR(result_forces[1], 0.0, 1e-10);
+  EXPECT_NEAR(result_forces[2], 0.0, 1e-10);
+  EXPECT_NEAR(result_energies[0], 0.0, 1e-10);
+}
+
+TEST_F(CudaKernelTest, ForceSymmetryTest) {
+  const int n_particles = 2;
+  const real tolerance = 1e-5;
+
+  std::vector<real> positions = {
+      0.0, 0.0, 0.0, // particle 0
+      1.5, 0.0, 0.0  // particle 1
+  };
+
+  std::vector<real> forces(3 * n_particles, 0.0);
+  std::vector<real> energies(n_particles, 0.0);
+  std::vector<real> box_dimensions = {10.0, 10.0, 10.0};
+
+  real *d_positions = allocateAndCopyToGPU(positions);
+  real *d_forces = allocateAndCopyToGPU(forces);
+  real *d_energies = allocateAndCopyToGPU(energies);
+  real *d_box_len = allocateAndCopyToGPU(box_dimensions);
+
+  LennardJones potential(1.0, 1.0, 3.0);
+
+  dim3 blockSize(256);
+  dim3 gridSize((n_particles + blockSize.x - 1) / blockSize.x);
+
+  CAC::calc_forces_and_energies<<<gridSize, blockSize>>>(
+      d_positions, d_forces, d_energies, n_particles, d_box_len, potential);
+
+  checkCudaError(cudaGetLastError(), "kernel launch");
+  checkCudaError(cudaDeviceSynchronize(), "kernel execution");
+
+  std::vector<real> result_forces =
+      copyFromGPUAndFree(d_forces, 3 * n_particles);
+  std::vector<real> result_energies =
+      copyFromGPUAndFree(d_energies, n_particles);
+
+  checkCudaError(cudaFree(d_positions), "cudaFree positions");
+  checkCudaError(cudaFree(d_box_len), "cudaFree box_len");
+
+  // Newton's third law: forces should be equal and opposite
+  EXPECT_NEAR(result_forces[0], -result_forces[3], tolerance)
+      << "Force x-components should be opposite";
+  EXPECT_NEAR(result_forces[1], -result_forces[4], tolerance)
+      << "Force y-components should be opposite";
+  EXPECT_NEAR(result_forces[2], -result_forces[5], tolerance)
+      << "Force z-components should be opposite";
+
+  // Energies should be equal for symmetric particles
+  EXPECT_NEAR(result_energies[0], result_energies[1], tolerance)
+      << "Energies should be equal";
+}
+
+// Main function to run tests
+int main(int argc, char **argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+
+  // Check if CUDA is available
+  int deviceCount;
+  cudaError_t err = cudaGetDeviceCount(&deviceCount);
+  if (err != cudaSuccess || deviceCount == 0) {
+    std::cout << "No CUDA devices available. Skipping CUDA tests." << std::endl;
+    return 0;
+  }
+
+  return RUN_ALL_TESTS();
+}