#ifndef NEIGHBOR_LIST_CUH
#define NEIGHBOR_LIST_CUH

#include "kernel_config.cuh" // From previous artifact
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>

/**
 * Simple kernel to initialize neighbor offsets
 */
__global__ void init_neighbor_offsets(int *offsets, size_t n_particles,
                                      int max_neighbors) {
  size_t i = get_thread_id();
  if (i <= n_particles) { // Note: <= because we need n_particles + 1 elements
    offsets[i] = i * max_neighbors;
  }
}

/**
 * Neighbor list data structure
 * Uses a compact format similar to CSR sparse matrices
 */
struct NeighborList {
  int *neighbor_offsets; // Size: n_particles + 1, offset into neighbor_indices
  int *neighbor_indices; // Size: total_neighbors, actual neighbor particle IDs
  int *neighbor_counts;  // Size: n_particles, number of neighbors per particle
  size_t max_neighbors;  // Maximum neighbors allocated per particle
  size_t n_particles;

  // Constructor
  NeighborList(size_t n_particles, size_t max_neighbors_per_particle)
      : max_neighbors(max_neighbors_per_particle), n_particles(n_particles) {

    cudaMalloc(&neighbor_offsets, (n_particles + 1) * sizeof(int));
    cudaMalloc(&neighbor_indices, n_particles * max_neighbors * sizeof(int));
    cudaMalloc(&neighbor_counts, n_particles * sizeof(int));

    // Initialize offsets
    auto kernel_config = get_launch_config(n_particles);
    init_neighbor_offsets<<<kernel_config.blocks, kernel_config.threads>>>(
        neighbor_offsets, n_particles, max_neighbors);
  }

  ~NeighborList() {
    cudaFree(neighbor_offsets);
    cudaFree(neighbor_indices);
    cudaFree(neighbor_counts);
  }

  // Get neighbors for particle i (device function)
  __device__ int *get_neighbors(int i) const {
    return &neighbor_indices[neighbor_offsets[i]];
  }

  __device__ int get_neighbor_count(int i) const { return neighbor_counts[i]; }
};

/**
 * Cell list structure for spatial hashing
 */
struct CellList {
  int *cell_starts;      // Size: total_cells, start index in sorted_particles
  int *cell_ends;        // Size: total_cells, end index in sorted_particles
  int *sorted_particles; // Size: n_particles, particle IDs sorted by cell
  int *particle_cells; // Size: n_particles, which cell each particle belongs to

  int3 grid_size;   // Number of cells in each dimension
  float3 cell_size; // Size of each cell
  float3 box_min;   // Minimum corner of simulation box
  size_t total_cells;
  size_t n_particles;

  CellList(size_t n_particles, float3 box_size, float cutoff)
      : n_particles(n_particles) {

    // Calculate grid dimensions (cell size slightly larger than cutoff)
    float cell_margin = 1.001f; // Small safety margin
    cell_size = make_float3(cutoff * cell_margin, cutoff * cell_margin,
                            cutoff * cell_margin);

    grid_size.x = (int)ceilf(box_size.x / cell_size.x);
    grid_size.y = (int)ceilf(box_size.y / cell_size.y);
    grid_size.z = (int)ceilf(box_size.z / cell_size.z);

    total_cells = (size_t)grid_size.x * grid_size.y * grid_size.z;
    box_min = make_float3(0.0f, 0.0f, 0.0f); // Assume box starts at origin

    // Allocate memory
    cudaMalloc(&cell_starts, total_cells * sizeof(int));
    cudaMalloc(&cell_ends, total_cells * sizeof(int));
    cudaMalloc(&sorted_particles, n_particles * sizeof(int));
    cudaMalloc(&particle_cells, n_particles * sizeof(int));
  }

  ~CellList() {
    cudaFree(cell_starts);
    cudaFree(cell_ends);
    cudaFree(sorted_particles);
    cudaFree(particle_cells);
  }

  // Get cell index from 3D coordinates
  __device__ int get_cell_index(int x, int y, int z) const {
    return z * grid_size.x * grid_size.y + y * grid_size.x + x;
  }

  // Get cell coordinates from position
  __device__ int3 get_cell_coords(float3 pos) const {
    return make_int3((int)((pos.x - box_min.x) / cell_size.x),
                     (int)((pos.y - box_min.y) / cell_size.y),
                     (int)((pos.z - box_min.z) / cell_size.z));
  }
};

// Forward declarations
void build_cell_list(const float4 *positions, CellList &cell_list,
                     const float3 *box_lengths);

void build_neighbor_list(const float4 *positions, const CellList &cell_list,
                         NeighborList &neighbor_list, float cutoff_squared,
                         const float3 *box_lengths);

/**
 * High-level interface - builds complete neighbor list
 */
void build_neighbor_list_optimized(const float4 *positions, size_t n_particles,
                                   const float3 *box_lengths, float cutoff,
                                   NeighborList &neighbor_list);

#endif