// Copyright (c) 2012-2017 VideoStitch SAS
// Copyright (c) 2018 stitchEm

#include "gpu/image/reduce.hpp"

#include "backend/common/imageOps.hpp"
#include "backend/cuda/deviceBuffer.hpp"

#include <cuda/error.hpp>
#include <cuda/util.hpp>

#include <cuda_runtime.h>
#include <cassert>
#include <stdint.h>
#include <stdio.h>

namespace VideoStitch {
namespace Image {

// See "Optimizing Parallel Reduction in CUDA" Mark Haris

template <uint32_t (*getValue)(uint32_t)>
__global__ void reduceKernel(uint32_t* __restrict__ dst, const uint32_t* __restrict__ src, unsigned size) {
  extern __shared__ uint32_t sdata[];
  const unsigned tid = threadIdx.x;
  unsigned i = blockIdx.x * (BLOCKSIZE * 2) + tid;
  const unsigned gridSize = BLOCKSIZE * 2 * gridDim.x;
  uint32_t startVal = 0;
  while (i < size) {
    startVal += getValue(src[i]);
    if (i + BLOCKSIZE < size) {
      startVal += getValue(src[i + BLOCKSIZE]);
    }
    i += gridSize;
  }
  sdata[tid] = startVal;
  __syncthreads();
  // Tree reduction in shared memory; the fixed offsets below assume BLOCKSIZE == 256.
  if (tid < 128) {
    sdata[tid] += sdata[tid + 128];
  }
  __syncthreads();
  if (tid < 64) {
    sdata[tid] += sdata[tid + 64];
  }
  __syncthreads();
  if (tid < 32) {
    // No need to sync within a single warp, but Fermi needs the volatile qualifier.
    volatile uint32_t* localSdata = sdata;
    localSdata[tid] += localSdata[tid + 32];
    localSdata[tid] += localSdata[tid + 16];
    localSdata[tid] += localSdata[tid + 8];
    localSdata[tid] += localSdata[tid + 4];
    localSdata[tid] += localSdata[tid + 2];
    localSdata[tid] += localSdata[tid + 1];
  }
  if (tid == 0) {
    dst[blockIdx.x] = sdata[0];
  }
}

namespace {
// Plain sum: pass the value through unchanged.
inline __device__ uint32_t identity(uint32_t value) { return value; }

// 1 for solid pixels (alpha > 0), 0 otherwise.
inline __device__ uint32_t isSolid(uint32_t value) { return RGBA::a(value) > 0 ? 1 : 0; }

// Average of the RGB channels for solid pixels, 0 otherwise.
inline __device__ uint32_t solidIdentityOrZero(uint32_t value) {
  return RGBA::a(value) > 0 ? (RGBA::r(value) + RGBA::g(value) + RGBA::b(value)) / 3 : 0;
}
}  // namespace

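// Reduces `size` elements of `src` to a single value: getValue is applied to every element
// and the results are summed. `work` is scratch space for the per-block partial sums; each
// pass appends its output after the previous one, so `work` is assumed to hold at least
// ceilDiv(size, 2 * BLOCKSIZE) + ceilDiv(ceilDiv(size, 2 * BLOCKSIZE), 2 * BLOCKSIZE) + ...
// elements (one element minimum).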
template <uint32_t (*getValue)(uint32_t)>
Status reduce(const uint32_t* src, uint32_t* work, std::size_t size, uint32_t& result) {
  const dim3 dimBlock(BLOCKSIZE);
  // The first pass applies getValue to every element. It also runs for size == 1,
  // so that getValue is applied even when no further reduction is needed.
  if (size >= 1) {
    const dim3 dimGrid((unsigned)Cuda::ceilDiv(size, 2 * BLOCKSIZE));
    reduceKernel<getValue><<<dimGrid, dimBlock, BLOCKSIZE * sizeof(uint32_t)>>>(work, src, (unsigned)size);
    size = dimGrid.x;
    src = work;
    work = work + size;
  }
  // Other passes simply sum the per-block partial results.
  while (size > 1) {
    const dim3 dimGrid((unsigned)Cuda::ceilDiv(size, 2 * BLOCKSIZE));
    reduceKernel<identity><<<dimGrid, dimBlock, BLOCKSIZE * sizeof(uint32_t)>>>(work, src, (unsigned)size);
    size = dimGrid.x;
    src = work;
    work = work + size;
  }
  FAIL_RETURN(CUDA_STATUS);
  return CUDA_ERROR(cudaMemcpy(&result, src, sizeof(uint32_t), cudaMemcpyDeviceToHost));
}

// Sum of all values in the buffer.
Status reduceSum(GPU::Buffer<const uint32_t> src, GPU::Buffer<uint32_t> work, std::size_t size, uint32_t& result) {
  return reduce<identity>(src.get(), work.get(), size, result);
}

// Sum over solid (alpha > 0) pixels of the average of their RGB channels.
Status reduceSumSolid(GPU::Buffer<const uint32_t> src, GPU::Buffer<uint32_t> work, std::size_t size, uint32_t& result) {
  return reduce<solidIdentityOrZero>(src.get(), work.get(), size, result);
}

// Number of solid (alpha > 0) pixels.
Status reduceCountSolid(GPU::Buffer<const uint32_t> src, GPU::Buffer<uint32_t> work, std::size_t size,
                        uint32_t& result) {
  return reduce<isSolid>(src.get(), work.get(), size, result);
}
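
// Illustrative usage sketch (not part of the library): driving the file-local reduce<>
// helper with raw device pointers. Names and sizes below are hypothetical; BLOCKSIZE is
// the launch block size defined by this backend.
//
//   std::size_t size = width * height;  // number of RGBA pixels in src
//   std::size_t workElems = 0;
//   for (std::size_t s = size; s > 1; s = Cuda::ceilDiv(s, 2 * BLOCKSIZE)) {
//     workElems += Cuda::ceilDiv(s, 2 * BLOCKSIZE);  // per-pass partial sums
//   }
//   uint32_t *src = nullptr, *work = nullptr;
//   cudaMalloc(&src, size * sizeof(uint32_t));
//   cudaMalloc(&work, (workElems ? workElems : 1) * sizeof(uint32_t));
//   // ... fill src with RGBA pixels ...
//   uint32_t solidPixels = 0;
//   Status status = reduce<isSolid>(src, work, size, solidPixels);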

}  // namespace Image
}  // namespace VideoStitch