diff --git a/CMakeLists.txt b/CMakeLists.txt
index 07aacb5..618c891 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,7 +16,7 @@ set(CMAKE_CXX_STANDARD 17)
 # Add debug configuration
 if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-  set(CMAKE_CUDA_FLAGS_DEBUG "-g -G -O0 --generate-line-info")
+  set(CMAKE_CUDA_FLAGS_DEBUG "-g -G -O0")
 elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
   set(CMAKE_CUDA_FLAGS_RELEASE "-O3 --use_fast_math")
 elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
diff --git a/kernels/kernel_config.cu b/kernels/kernel_config.cu
index 3c1644c..a98a1a1 100644
--- a/kernels/kernel_config.cu
+++ b/kernels/kernel_config.cu
@@ -13,12 +13,12 @@ void KernelConfig::print() const {
          total_threads());
 }
 
-KernelConfig get_launch_config(size_t n_elements, int threads_per_block,
-                               int max_blocks_per_dim) {
+KernelConfig get_launch_config(size_t n_elements, size_t threads_per_block,
+                               size_t max_blocks_per_dim) {
   // Ensure threads_per_block is valid
-  threads_per_block = std::min(threads_per_block, 1024);
-  threads_per_block = std::max(threads_per_block, 32);
+  threads_per_block = std::min(threads_per_block, (size_t)1024);
+  threads_per_block = std::max(threads_per_block, (size_t)32);
 
   // Calculate total blocks needed
   size_t total_blocks =
@@ -66,8 +66,8 @@ KernelConfig get_launch_config_advanced(size_t n_elements, int device_id) {
   cudaDeviceProp prop;
   cudaGetDeviceProperties(&prop, device_id);
 
-  int threads_per_block = get_optimal_block_size(device_id);
-  int max_blocks_per_dim = prop.maxGridSize[0];
+  size_t threads_per_block = get_optimal_block_size(device_id);
+  size_t max_blocks_per_dim = prop.maxGridSize[0];
 
   return get_launch_config(n_elements, threads_per_block, max_blocks_per_dim);
 }
diff --git a/kernels/kernel_config.cuh b/kernels/kernel_config.cuh
index 66364fe..13450cd 100644
--- a/kernels/kernel_config.cuh
+++ b/kernels/kernel_config.cuh
@@ -28,17 +28,19 @@ struct KernelConfig {
  * @param max_blocks_per_dim Maximum blocks per grid dimension (default: 65535)
  * @return LaunchConfig with optimal grid and block dimensions
  */
-KernelConfig get_launch_config(size_t n_elements, int threads_per_block = 256,
-                               int max_blocks_per_dim = 65535);
+KernelConfig get_launch_config(size_t n_elements,
+                               size_t threads_per_block = 256,
+                               size_t max_blocks_per_dim = 65535);
 
 /**
  * Calculate 1D thread index for kernels launched with get_launch_config()
  * Use this inside your CUDA kernels
  */
 __device__ inline size_t get_thread_id() {
-  return (size_t)blockIdx.z * gridDim.x * gridDim.y * blockDim.x +
-         (size_t)blockIdx.y * gridDim.x * blockDim.x +
-         (size_t)blockIdx.x * blockDim.x + threadIdx.x;
+  size_t index = (size_t)blockIdx.z * gridDim.x * gridDim.y * blockDim.x +
+                 (size_t)blockIdx.y * gridDim.x * blockDim.x +
+                 (size_t)blockIdx.x * blockDim.x + threadIdx.x;
+  return index;
 }
 
 /**
diff --git a/tests/cuda_unit_tests/CMakeLists.txt b/tests/cuda_unit_tests/CMakeLists.txt
index 3419e5e..4ead02b 100644
--- a/tests/cuda_unit_tests/CMakeLists.txt
+++ b/tests/cuda_unit_tests/CMakeLists.txt
@@ -3,6 +3,7 @@ include_directories(${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR})
 add_executable(${NAME}_cuda_tests
   test_potential.cu
   test_forces.cu
+  test_kernel_config.cu
 )
 
 target_link_libraries(${NAME}_cuda_tests gtest gtest_main)
diff --git a/tests/cuda_unit_tests/test_kernel_config.cu b/tests/cuda_unit_tests/test_kernel_config.cu
new file mode 100644
index 0000000..a616c19
--- /dev/null
+++ b/tests/cuda_unit_tests/test_kernel_config.cu
@@ -0,0 +1,48 @@
+
+#include "kernel_config.cuh"
+#include <cuda_runtime.h>
+#include <gtest/gtest.h>
+#include <numeric>
+#include <vector>
+
+// Kernel to test the get_thread_id() function
+__global__ void test_get_thread_id_kernel(size_t *output, size_t n_elements) {
+  size_t i = get_thread_id();
+  if (i < n_elements) {
+    output[i] = i;
+  }
+}
+
+// Test fixture for kernel config tests
+class KernelConfigTest : public ::testing::Test {
+protected:
+  void SetUp() override {
+    // Set up any common resources for the tests
+  }
+
+  void TearDown() override {
+    // Clean up any resources
+  }
+};
+
+TEST_F(KernelConfigTest, GetThreadId) {
+  const size_t n_elements = 10000;
+  KernelConfig config = get_launch_config(n_elements);
+
+  size_t *d_output;
+  cudaMalloc(&d_output, n_elements * sizeof(size_t));
+
+  test_get_thread_id_kernel<<<config.grid_dim, config.block_dim>>>(d_output,
+                                                                   n_elements);
+
+  std::vector<size_t> h_output(n_elements);
+  cudaMemcpy(h_output.data(), d_output, n_elements * sizeof(size_t),
+             cudaMemcpyDeviceToHost);
+
+  cudaFree(d_output);
+
+  std::vector<size_t> expected(n_elements);
+  std::iota(expected.begin(), expected.end(), 0);
+
+  ASSERT_EQ(h_output, expected);
+}
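
For reviewers, the sketch below shows the caller-side pattern these helpers are meant to support, mirroring the new test: the device side indexes with get_thread_id() and the host side sizes the launch with get_launch_config(). The kernel name, its arguments, and the KernelConfig member names grid_dim/block_dim are illustrative assumptions, not part of this diff.

// Hypothetical usage sketch (not part of the change). Assumes KernelConfig
// exposes grid_dim and block_dim members, matching the launch in the test.
#include "kernel_config.cuh"

__global__ void scale_kernel(float *data, float factor, size_t n_elements) {
  size_t i = get_thread_id();  // flattened 1D index over the (x,y,z) grid
  if (i < n_elements) {        // guard against the final partial block
    data[i] *= factor;
  }
}

void scale_on_device(float *d_data, float factor, size_t n_elements) {
  // Defaults: 256 threads per block, up to 65535 blocks per grid dimension.
  KernelConfig config = get_launch_config(n_elements);
  scale_kernel<<<config.grid_dim, config.block_dim>>>(d_data, factor, n_elements);
}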