// Copyright (c) 2012-2017 VideoStitch SAS
// Copyright (c) 2018 stitchEm

#include "gpu/core1/voronoi.hpp"

#include "../kernel.hpp"

namespace VideoStitch {
namespace Core {

// The included file is textually pasted inside an anonymous namespace, so whatever it defines
// has internal linkage in this translation unit.
// NOTE(review): presumably an xxd-generated byte array holding the voronoi OpenCL program — confirm.
namespace {
#include "voronoi.xxd"
}

// Registers the `voronoi` program so it can be looked up below through PROGRAM(voronoi).
// NOTE(review): the meaning of the second argument (`true`) is defined by the macro, which is not
// visible in this file — confirm against its definition.
INDIRECT_REGISTER_OPENCL_PROGRAM(voronoi, true);

namespace {
/**
 * Returns the largest power of two strictly smaller than @a v.
 *
 * Examples: 2 -> 1, 3 -> 2, 4 -> 2, 5 -> 4, 1024 -> 512, 1025 -> 1024.
 *
 * @param v Upper bound (exclusive).
 * @return The largest power of two p with p < v, or 0 when no such power exists (v is 0 or 1).
 */
unsigned largestPowerOfTwoLessThan(unsigned v) {
  if (v < 2) {
    // No power of two is strictly below 0 or 1.
    return 0;
  }
  unsigned res = 1;
  // Same as "while (2 * res < v)", but written without computing 2 * res in the condition.
  // The naive doubling wraps around to 0 once v exceeds the highest representable power of two,
  // which made the loop run forever. Here res < v always holds, so v - res cannot underflow.
  while (res < v - res) {
    res *= 2;
  }
  return res;
}
}  // namespace

/**
 * Runs the voronoi compute kernel once per step size, ping-ponging between two buffers.
 *
 * The step starts at the largest power of two strictly below max(width, height) and is halved
 * after every pass until it reaches zero. Each pass reads one buffer and writes the other; the
 * two are then swapped so the next pass consumes the freshly written data.
 *
 * @param src Buffer read by the first pass.
 * @param work Buffer written by the first pass.
 * @param width Width passed to the kernel.
 * @param height Height passed to the kernel.
 * @param hWrap Selects the "Wrap" kernel variant when true, the "NoWrap" variant otherwise.
 * @param stream Stream on which the kernel is set up.
 * @return The buffer written by the last pass, which is either @a src or @a work depending on
 *         the number of passes. When no pass runs (max(width, height) <= 1), @a src is returned.
 *
 * NOTE(review): a failing enqueue is only caught by assert(), i.e. it goes unnoticed when
 * asserts are compiled out. The return type leaves no room to report it.
 */
GPU::Buffer<uint32_t> distanceMap(GPU::Buffer<uint32_t> src, GPU::Buffer<uint32_t> work, std::size_t width,
                                  std::size_t height, bool hWrap, GPU::Stream stream) {
  GPU::Buffer<uint32_t> input = src;
  GPU::Buffer<uint32_t> output = work;

  const std::string kernelName = hWrap ? KERNEL_STR(voronoiComputeWrap) : KERNEL_STR(voronoiComputeNoWrap);

  auto passKernel = GPU::Kernel::get(PROGRAM(voronoi), kernelName)
                        .setup2D(stream, static_cast<unsigned>(width), static_cast<unsigned>(height));

  unsigned step = largestPowerOfTwoLessThan(static_cast<unsigned>(std::max(width, height)));
  while (step > 0) {
    const Status computeStatus = passKernel.enqueueWithKernelArgs(output, input, static_cast<unsigned>(width),
                                                                  static_cast<unsigned>(height), step);
    assert(computeStatus.ok());
    // What was just written becomes the input of the next pass.
    std::swap(output, input);
    step /= 2;
  }

  // After the final swap, `input` names the buffer written last.
  return input;
}

/**
 * Compute the generalized voronoi diagram of @a src.
 *
 * NOT IMPLEMENTED in this backend: the body is empty, every argument is ignored and @a dst is
 * left untouched. The parameter descriptions below document the intended contract only.
 *
 * @param dst Output buffer for the voronoi diagram. Only two values: 0 and 255.
 * @param src Source buffer containing a setup image (i.e. the i-th bit of a pixel represents the i-th input).
 * @param work A work buffer.
 * @param width Width of the previous buffers.
 * @param height Height of the previous buffers.
 * @param fromIdMask Bit mask of the first input (e.g. if 0x00000004, the first input will be input 2, starting at 0).
 * @param toIdMask Bit mask of the second input.
 * @param hWrap If true, we consider the buffer to wrap horizontally.
 * @param blockSize Unused here. NOTE(review): presumably the kernel block size of the CUDA implementation — confirm.
 * @param stream GPU stream where to run the kernels.
 * @note Intended to be asynchronous once implemented; currently the call does nothing.
 */
void voronoiCompute(unsigned char* /*dst*/, uint32_t* /*src*/, uint32_t* /*work*/, std::size_t /*width*/,
                    std::size_t /*height*/, uint32_t /*fromIdMask*/, uint32_t /*toIdMask*/, bool /*hWrap*/,
                    unsigned /*blockSize*/, GPU::Stream /*stream*/) {
  // TODO_OPENCL_IMPL
}

/**
 * Compute the euclidian distance transform of @a src and turn it into a transition mask.
 *
 * Sequence of operations, all set up on @a stream:
 *  1. the edtInit kernel fills @a workBuffer1 from @a src with (fromIdMask, toIdMask) and
 *     @a workBuffer2 from @a src with the two masks swapped;
 *  2. distanceMap() is run on each of the two buffers;
 *  3. the edtMakeMask kernel combines both results into @a dst.
 *
 * @param dst Output buffer for the mask. Output values are in [0;255].
 * @param src Source buffer containing a setup image (i.e. the i-th bit of a pixel represents the i-th input).
 *            It is handed to distanceMap() as a work buffer after step 1, so its contents are
 *            overwritten as soon as at least one distance pass runs.
 * @param workBuffer1 Work buffer seeded with the (fromIdMask, toIdMask) initialisation.
 * @param workBuffer2 Work buffer seeded with the (toIdMask, fromIdMask) initialisation.
 *                    NOTE(review): both work buffers presumably need to hold width * height elements — confirm.
 * @param width Width of the previous buffers.
 * @param height Height of the previous buffers.
 * @param fromIdMask Bit mask of the first input (e.g. if 0x00000004, the first input will be input 2, starting at 0).
 * @param toIdMask Bit mask of the second input.
 * @param hWrap If true, we consider the buffer to wrap horizontally.
 * @param maxTransitionDistance Maximum width of the transition / overlap.
 * @param power Parameter of the p-norm that's used to calculate the transition. Should be >= 2.0 to use at
 *              least L2. Steeper transition with larger power.
 * @param stream Stream where to run the kernels.
 * @return The first failing status of the edtInit enqueues, otherwise the status of the final
 *         edtMakeMask enqueue.
 * @note This call is asynchronous.
 */
Status edtCompute(GPU::Buffer<unsigned char> dst, GPU::Buffer<uint32_t> src, GPU::Buffer<uint32_t> workBuffer1,
                  GPU::Buffer<uint32_t> workBuffer2, std::size_t width, std::size_t height, uint32_t fromIdMask,
                  uint32_t toIdMask, bool hWrap, int maxTransitionDistance, float power, GPU::Stream stream) {
  // TODO_OPENCL_IMPL merge this with CUDA code, create backend shared header, impl

  // The CUDA version asserted that width and height are multiples of the block size (with a
  // FIXME questioning whether that holds); no such check is performed here.

  const auto blackSeed = workBuffer1;
  const auto whiteSeed = workBuffer2;

  // Step 1: seed both distance maps from the setup image.
  auto seedKernel = GPU::Kernel::get(PROGRAM(voronoi), KERNEL_STR(edtInit))
                        .setup2D(stream, static_cast<unsigned>(width), static_cast<unsigned>(height));
  PROPAGATE_FAILURE_STATUS(seedKernel.enqueueWithKernelArgs(blackSeed, src, static_cast<unsigned>(width),
                                                            static_cast<unsigned>(height), fromIdMask, toIdMask));
  PROPAGATE_FAILURE_STATUS(seedKernel.enqueueWithKernelArgs(whiteSeed, src, static_cast<unsigned>(width),
                                                            static_cast<unsigned>(height), toIdMask, fromIdMask));

  // Step 2a: black map. src is no longer needed as an input, so it serves as the work buffer.
  const auto blackResult = distanceMap(blackSeed, src, width, height, hWrap, stream);

  // The black result lives in either blackSeed or src; whichever one it does NOT occupy is free
  // to be used as the work buffer of the white pass.
  const auto whiteScratch = (blackResult == blackSeed) ? src : blackSeed;

  // Step 2b: white map.
  const auto whiteResult = distanceMap(whiteSeed, whiteScratch, width, height, hWrap, stream);

  // Step 3: pick the mask kernel matching the wrapping mode and produce the output.
  std::string maskKernelName;
  if (hWrap) {
    maskKernelName = KERNEL_STR(edtMakeMaskKernel_extractDistWrap);
  } else {
    maskKernelName = KERNEL_STR(edtMakeMaskKernel_extractDistNoWrap);
  }

  auto maskKernel = GPU::Kernel::get(PROGRAM(voronoi), maskKernelName)
                        .setup2D(stream, static_cast<unsigned>(width), static_cast<unsigned>(height));
  return maskKernel.enqueueWithKernelArgs(dst, blackResult, whiteResult, static_cast<unsigned>(width),
                                          static_cast<unsigned>(height), maxTransitionDistance, power);
}

/**
 * Compute the euclidian distance transform of src to src.
 *
 * Sets up the edtReflexiveKernel over a width x height 2D range on @a stream and enqueues it once.
 *
 * @param dst Output buffer. Output values are in [0;255].
 * @param src Source buffer containing a setup image (i.e. the i-th bit of a pixel represents the i-th input).
 * @param width Width of the previous buffers.
 * @param height Height of the previous buffers.
 * @param fromIdMask Bit mask of the input (e.g. if 0x00000004, the input will be input 2, starting at 0).
 * @param stream Stream where to run the kernel.
 * @return The status of the kernel enqueue.
 * @note This call is asynchronous.
 */
Status edtReflexive(GPU::Buffer<unsigned char> dst, GPU::Buffer<uint32_t> src, std::size_t width, std::size_t height,
                    uint32_t fromIdMask, GPU::Stream stream) {
  auto reflexiveKernel = GPU::Kernel::get(PROGRAM(voronoi), KERNEL_STR(edtReflexiveKernel))
                             .setup2D(stream, static_cast<unsigned>(width), static_cast<unsigned>(height));
  return reflexiveKernel.enqueueWithKernelArgs(dst, src, static_cast<unsigned>(width),
                                               static_cast<unsigned>(height), fromIdMask);
}

}  // namespace Core
}  // namespace VideoStitch