#ifndef FORCES_CUH #define FORCES_CUH #include "kernel_config.cuh" #include "potentials/pair_potentials.cuh" #include "precision.hpp" #include #include #include namespace CAC { inline void reset_forces_and_energies(int n_particles, float4 *forces_energies) { cudaMemset(forces_energies, 0, n_particles * sizeof(float4)); } template __global__ void calc_forces_and_energies(float4 *pos, float4 *force_energies, int n_particles, real *box_len, PotentialType potential) { int i = get_thread_id(); if (i < n_particles) { float4 my_pos = pos[i]; // Loads 16 bytes in one transaction real xi = my_pos.x; real yi = my_pos.y; real zi = my_pos.z; real total_fx = 0, total_fy = 0, total_fz = 0, total_energy = 0; for (int j = 0; j < n_particles; j++) { if (i < j) { float4 other_pos = pos[j]; real dx = xi - other_pos.x; real dy = yi - other_pos.y; real dz = zi - other_pos.z; // Apply periodic boundary conditions dx -= box_len[0] * round(dx / box_len[0]); dy -= box_len[1] * round(dy / box_len[1]); dz -= box_len[2] * round(dz / box_len[2]); float4 sol = potential.calc_force_and_energy({dx, dy, dz}); total_fx += sol.x; total_fy += sol.y; total_fz += sol.z; total_energy += sol.w; atomicAdd(&force_energies[j].x, -sol.x); atomicAdd(&force_energies[j].y, -sol.y); atomicAdd(&force_energies[j].z, -sol.z); atomicAdd(&force_energies[j].w, sol.w); } } atomicAdd(&force_energies[i].x, total_fx); atomicAdd(&force_energies[i].y, total_fy); atomicAdd(&force_energies[i].z, total_fz); atomicAdd(&force_energies[i].w, total_energy); } } inline void launch_force_kernels(float4 *xs, float4 *force_energies, int n_particles, real *box_len, std::vector potentials, dim3 blocks, dim3 threads_per_block) { reset_forces_and_energies(n_particles, force_energies); for (const auto &potential : potentials) { std::visit( [&](const auto &potential) { using PotentialType = std::decay_t; calc_forces_and_energies <<>>(xs, force_energies, n_particles, box_len, potential); }, potential); cudaDeviceSynchronize(); } } } // namespace CAC #endif