Change default precision to float and use float4 for force and potential calculations

2025-09-12 21:44:41 -04:00 · 2025-09-12 21:44:41 -04:00 · 130b613a7c
commit 130b613a7c
parent dd83fc6330
9 changed files with 151 additions and 362 deletions
--- a/kernels/forces.cuh
+++ b/kernels/forces.cuh
@ -3,72 +3,71 @@
 #include "potentials/pair_potentials.cuh"
 #include "precision.hpp"
 #include <cstdio>
-#include <type_traits>
-#include <variant>
+#include <cuda_runtime.h>
 #include <vector>

 namespace CAC {

-inline void reset_forces_and_energies(int n_particles, real *forces,
-                                      real *energies) {
-  cudaMemset(forces, 0, n_particles * sizeof(real) * 3);
-  cudaMemset(energies, 0, n_particles * sizeof(real));
+inline void reset_forces_and_energies(int n_particles,
+                                      float4 *forces_energies) {
+  cudaMemset(forces_energies, 0, n_particles * sizeof(float4));
 }

 template <typename PotentialType>
-__global__ void calc_forces_and_energies(real *xs, real *forces, real *energies,
+__global__ void calc_forces_and_energies(float4 *pos, float4 *force_energies,
                                         int n_particles, real *box_len,
                                         PotentialType potential) {
+
  int i = blockIdx.x * blockDim.x + threadIdx.x;

  if (i < n_particles) {
-    real xi = xs[3 * i];
-    real yi = xs[3 * i + 1];
-    real zi = xs[3 * i + 2];
+    float4 my_pos = pos[i]; // Loads 16 bytes in one transaction
+    real xi = my_pos.x;
+    real yi = my_pos.y;
+    real zi = my_pos.z;
+
+    real total_fx = 0, total_fy = 0, total_fz = 0, total_energy = 0;

    for (int j = 0; j < n_particles; j++) {
      if (i != j) {
-        real xj = xs[3 * j];
-        real yj = xs[3 * j + 1];
-        real zj = xs[3 * j + 2];
-
-        real dx = xi - xj;
-        real dy = yi - yj;
-        real dz = zi - zj;
+        float4 other_pos = pos[j];
+        real dx = xi - other_pos.x;
+        real dy = yi - other_pos.y;
+        real dz = zi - other_pos.z;

        // Apply periodic boundary conditions
        dx -= box_len[0] * round(dx / box_len[0]);
        dy -= box_len[1] * round(dy / box_len[1]);
        dz -= box_len[2] * round(dz / box_len[2]);

-        ForceAndEnergy sol = potential.calc_force_and_energy({dx, dy, dz});
-        forces[3 * i] += sol.force.x;
-        forces[3 * i + 1] += sol.force.y;
-        forces[3 * i + 2] += sol.force.z;
-        energies[i] += sol.energy;
+        float4 sol = potential.calc_force_and_energy({dx, dy, dz});
+        total_fx += sol.x;
+        total_fy += sol.y;
+        total_fz += sol.z;
+        total_energy += sol.w;
      }
    }
+
+    force_energies[i] = make_float4(total_fx, total_fy, total_fz, total_energy);
  }
 }
-
-inline void launch_force_kernels(real *xs, real *forces, real *energies,
+inline void launch_force_kernels(float4 *xs, float4 *force_energies,
                                 int n_particles, real *box_len,
                                 std::vector<PairPotentials> potentials,
                                 int grid_size, int block_size) {

-  reset_forces_and_energies(n_particles, forces, energies);
+  reset_forces_and_energies(n_particles, force_energies);

  for (const auto &potential : potentials) {
    std::visit(
        [&](const auto &potential) {
          using PotentialType = std::decay_t<decltype(potential)>;
          calc_forces_and_energies<PotentialType><<<grid_size, block_size>>>(
-              xs, forces, energies, n_particles, box_len, potential);
+              xs, force_energies, n_particles, box_len, potential);
        },
        potential);
    cudaDeviceSynchronize();
  }
 }
 } // namespace CAC
-
 #endif