// CUDA version of 1D smoothing convolution
//
// To compile and run:
//   nvcc -O3 cuda.cu -o cuda
//   ./cuda

#include <chrono>
#include <cstdlib>
#include <iomanip>
#include <iostream>

// Number of float elements in the input and output arrays.
// The ' digit separators require C++14 or newer.
#define SIZE 500'000'000

// Clock and duration types used for the elapsed-time measurements below.
using std::chrono::duration;
using std::chrono::high_resolution_clock;

// CUDA kernel: out[i] = (in[i - 1] + in[i] + in[i + 1]) / 3
// for every interior index 1 <= i <= n - 2.
//
// Launch layout: 1D grid of 1D blocks, one thread per interior element, so
// the launch must supply at least n - 2 threads in total. No shared memory is
// used. out[0] and out[n - 1] are never written by this kernel; the caller is
// responsible for the two boundary elements.
//
// in  - device pointer to n readable floats
// out - device pointer to n writable floats (must not alias in)
// n   - number of elements in each array
__global__ void smoothKernel(const float* __restrict__ in,
                             float* __restrict__ out, int n) {
  // The index is built in 64 bits so that the rounded-up tail of the grid
  // cannot overflow a 32-bit int when n is close to INT_MAX.
  const long long i =
      static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x + 1;

  // The grid size is rounded up to a whole number of blocks, so the last
  // block may contain threads past the final interior index. Those threads
  // must not touch memory: in[i + 1] and out[i] would be out of bounds.
  if (i >= static_cast<long long>(n) - 1) return;

  // Main case
  out[i] = (in[i - 1] + in[i] + in[i + 1]) / 3.0f;
}

// Reports a failed CUDA runtime call on stderr and terminates the program.
// `what` names the operation that produced `status`.
static void checkCuda(cudaError_t status, const char* what) {
  if (status != cudaSuccess) {
    std::cerr << "CUDA error during " << what << ": "
              << cudaGetErrorString(status) << std::endl;
    std::exit(EXIT_FAILURE);
  }
}

// Smooths host_in into host_out with a 3-point average, running the interior
// elements on the GPU and the two boundary elements on the CPU.
//
// host_in  - n input floats in host memory
// host_out - n output floats in host memory (overwritten)
// n        - number of elements in each array
//
// Returns the time taken by the GPU kernel (launch through completion) in
// milliseconds; host<->device copies and allocations are not included.
// Any CUDA runtime failure is reported on stderr and terminates the program.
//
// For n < 3 no full 3-element window exists: every output element is set to
// the mean of the available inputs, no GPU work is done, and 0 is returned.
static double smooth(float* host_in, float* host_out, int n) {
  if (n <= 0) return 0.0;
  if (n < 3) {
    float sum = 0.0f;
    for (int i = 0; i < n; i++) sum += host_in[i];
    for (int i = 0; i < n; i++) host_out[i] = sum / n;
    return 0.0;
  }

  // Size everything from the n argument (not a global constant) and compute
  // the byte count in size_t so large n cannot overflow int arithmetic.
  const std::size_t bytes = static_cast<std::size_t>(n) * sizeof(float);

  float* device_in = nullptr;
  float* device_out = nullptr;
  checkCuda(cudaMalloc(&device_in, bytes), "cudaMalloc(device_in)");
  checkCuda(cudaMalloc(&device_out, bytes), "cudaMalloc(device_out)");

  // Copy input to GPU
  checkCuda(cudaMemcpy(device_in, host_in, bytes, cudaMemcpyHostToDevice),
            "host-to-device copy");

  // Wait for copy to finish so it is not attributed to the kernel timing
  checkCuda(cudaDeviceSynchronize(), "synchronize after copy");

  auto kernel_start = high_resolution_clock::now();

  // One thread per interior element (indices 1 .. n-2), rounded up to a
  // whole number of blocks.
  int blockSize = 256;
  int gridSize = ((n - 2) + blockSize - 1) / blockSize;

  // "Fork"
  smoothKernel<<<gridSize, blockSize>>>(device_in, device_out, n);
  // Launch-configuration errors are only visible through cudaGetLastError.
  checkCuda(cudaGetLastError(), "smoothKernel launch");

  // "Join" - also surfaces any fault raised while the kernel was running
  checkCuda(cudaDeviceSynchronize(), "smoothKernel execution");

  auto kernel_end = high_resolution_clock::now();

  double kernel_elapsedMS =
      duration<double, std::milli>(kernel_end - kernel_start).count();

  // Copy GPU result back to main memory
  checkCuda(cudaMemcpy(host_out, device_out, bytes, cudaMemcpyDeviceToHost),
            "device-to-host copy");

  // Handle boundaries on CPU: each end uses the 3-element window shifted
  // inward so that it stays inside the array.
  host_out[0] = (host_in[0] + host_in[1] + host_in[2]) / 3.0f;
  host_out[n - 1] = (host_in[n - 3] + host_in[n - 2] + host_in[n - 1]) / 3.0f;

  // Cleanup GPU memory allocations
  checkCuda(cudaFree(device_in), "cudaFree(device_in)");
  checkCuda(cudaFree(device_out), "cudaFree(device_out)");
  return kernel_elapsedMS;
}

// Runs one full smoothing pass over SIZE elements, then prints the total
// elapsed time, the kernel-only time reported by smooth(), and a checksum of
// the output so that runs can be compared.
//
// host_in  - SIZE input floats in host memory
// host_out - SIZE output floats in host memory (overwritten)
static void runTest(float* host_in, float* host_out) {
  auto start = high_resolution_clock::now();

  double kernel_elapsedMS = smooth(host_in, host_out, SIZE);

  auto end = high_resolution_clock::now();
  double elapsedMS = duration<double, std::milli>(end - start).count();

  // Calculate a simple hash checksum.
  // The hash is meant to wrap around, so it is accumulated in unsigned
  // arithmetic: overflow of a signed int is undefined behaviour, whereas
  // unsigned overflow is well-defined modulo 2^N.
  unsigned int hash = 17u;
  for (int i = 0; i < SIZE; i++) {
    hash = hash * 31u + static_cast<unsigned int>(static_cast<int>(host_out[i]));
  }
  // Printed as a signed value to keep the output format of a wrapping int.
  int checksum = static_cast<int>(hash);

  std::cout << "CUDA: Total time taken: " << std::fixed << std::setprecision(3)
            << elapsedMS << " ms, Kernel time: " << std::fixed
            << std::setprecision(3) << kernel_elapsedMS
            << " ms, checksum: " << checksum << std::endl;
}

// Fills data[0 .. n-1] with the ramp 0, 1, 2, ... converted to float.
// Nothing is written when n <= 0.
void generateData(float* data, int n) {
  int index = 0;
  while (index < n) {
    data[index] = static_cast<float>(index);
    ++index;
  }
}

// Program entry point: allocates the host input/output arrays of SIZE floats,
// fills the input, runs the benchmark repeatedly, and releases the arrays.
int main() {
  float* input = new float[SIZE];
  float* output = new float[SIZE];

  generateData(input, SIZE);

  // Run the test five times in a row; the earlier runs warm up the GPU.
  int remaining = 5;
  while (remaining-- > 0) {
    runTest(input, output);
  }

  delete[] input;
  delete[] output;

  return EXIT_SUCCESS;
}
