// Copyright (c) 2012-2017 VideoStitch SAS
// Copyright (c) 2018 stitchEm

#include "../../../gpu/image/sampling.hpp"

#include "../kernel.hpp"

namespace {
#include "sampling.xxd"
}

INDIRECT_REGISTER_OPENCL_PROGRAM(sampling, true);

#define BLOCK_SIZE 16

namespace VideoStitch {
namespace Image {

/**
 * Subsample a buffer by a factor of two, picking the topleft value for every 2x2 pixels blocks.
 * WARNING: no antialiasing filter ! Blur first !
 * @param dst subsampled buffer, size (srcWidth / 2) * (srcHeight / 2).
 * @param src subsampled buffer, size srcWidth * srcHeight.
 * @param srcWidth Source width.
 * @param srcHeight Source height.
 * @param stream Cuda stream to run in.
 */
template <typename T>
Status subsample22(GPU::Buffer<T> dst, GPU::Buffer<const T> src, std::size_t srcWidth, std::size_t srcHeight,
                   GPU::Stream stream) {
  std::size_t dstWidth = (srcWidth + 1) / 2;
  std::size_t dstHeight = (srcHeight + 1) / 2;
  // interior
  {
    auto kernel2D = GPU::Kernel::get(PROGRAM(sampling), KERNEL_STR(subsample22RegularKernel))
                        .setup2D(stream, (unsigned)dstWidth, (unsigned)dstHeight);
    FAIL_RETURN(kernel2D.enqueueWithKernelArgs(dst, src, (unsigned)srcWidth, (unsigned)srcHeight, (unsigned)dstWidth,
                                               (unsigned)dstHeight));
  }
  // right boundary
  if (srcWidth & 1) {
    auto kernel2D = GPU::Kernel::get(PROGRAM(sampling), KERNEL_STR(subsample22RightBoundaryKernel))
                        .setup1D(stream, (unsigned)dstHeight);
    FAIL_RETURN(kernel2D.enqueueWithKernelArgs(dst, src, (unsigned)srcWidth, (unsigned)srcHeight, (unsigned)dstWidth,
                                               (unsigned)dstHeight));
  }
  // bottom boundary
  if (srcHeight & 1) {
    auto kernel2D = GPU::Kernel::get(PROGRAM(sampling), KERNEL_STR(subsample22RightBoundaryKernel))
                        .setup1D(stream, (unsigned)dstWidth);
    FAIL_RETURN(kernel2D.enqueueWithKernelArgs(dst, src, (unsigned)srcWidth, (unsigned)srcHeight, (unsigned)dstWidth,
                                               (unsigned)dstHeight));
  }
  if ((srcWidth & 1) && (srcHeight & 1)) {
    // simple copy of the last element
    return CL_ERROR(clEnqueueCopyBuffer(stream.get(), src.get(), dst.get(), srcHeight * srcWidth - 1,
                                        dstWidth * dstHeight - 1, sizeof(T), 0, nullptr, nullptr));
  }
  return Status::OK();
}

/**
 * Subsample a buffer by a factor of two, picking the topleft value for every 2x2 pixels blocks.
 * WARNING: no antialiasing filter ! Blur first !
 * @param dst subsampled buffer, size (srcWidth / 2) * (srcHeight / 2). Pixels in RGB210.
 * @param src subsampled buffer, size srcWidth * srcHeight. Pixel in RGBA.
 * @param srcWidth Source width.
 * @param srcHeight Source height.
 * @param stream Cuda stream to run in.
 */
Status subsample22RGBA(GPU::Buffer<uint32_t> dst, GPU::Buffer<const uint32_t> src, std::size_t srcWidth,
                       std::size_t srcHeight, GPU::Stream stream) {
  std::size_t dstWidth = (srcWidth + 1) / 2;
  std::size_t dstHeight = (srcHeight + 1) / 2;
  // interior
  {
    auto kernel2D = GPU::Kernel::get(PROGRAM(sampling), KERNEL_STR(subsample22RGBARegularKernel))
                        .setup2D(stream, (unsigned)dstWidth, (unsigned)dstHeight);
    FAIL_RETURN(kernel2D.enqueueWithKernelArgs(dst, src, (unsigned)srcWidth, (unsigned)srcHeight, (unsigned)dstWidth));
  }
  // right boundary
  if (srcWidth & 1) {
    auto kernel2D = GPU::Kernel::get(PROGRAM(sampling), KERNEL_STR(subsample22RGBARightBoundaryKernel))
                        .setup1D(stream, (unsigned)dstHeight);
    FAIL_RETURN(kernel2D.enqueueWithKernelArgs(dst, src, (unsigned)srcWidth, (unsigned)srcHeight, (unsigned)dstWidth));
  }
  // bottom boundary
  if (srcHeight & 1) {
    auto kernel2D = GPU::Kernel::get(PROGRAM(sampling), KERNEL_STR(subsample22RGBABottomBoundaryKernel))
                        .setup1D(stream, (unsigned)dstWidth);
    FAIL_RETURN(kernel2D.enqueueWithKernelArgs(dst, src, (unsigned)srcWidth, (unsigned)srcHeight, (unsigned)dstWidth));
  }
  if ((srcWidth & 1) && (srcHeight & 1)) {
    // simple copy of the last element
    return CL_ERROR(clEnqueueCopyBuffer(stream.get(), src.get(), dst.get(), srcHeight * srcWidth - 1,
                                        dstWidth * dstHeight - 1, sizeof(uint32_t), 0, nullptr, nullptr));
  }
  return Status::OK();
}

/**
 * Upsamples a buffer.
 * @param dst subsampled buffer, size dstWidth * dstHeight.
 * @param src subsampled buffer, size (dstWidth / 2) * (dstHeight / 2).
 * @param dstWidth Destination width.
 * @param dstHeight Destination height.
 * @param stream Cuda stream to run in.
 */
template <typename T>
Status upsample22(GPU::Buffer<T> dst, GPU::Buffer<const T> src, std::size_t dstWidth, std::size_t dstHeight, bool wrap,
                  GPU::Stream stream) {
  const unsigned srcWidth = ((unsigned)dstWidth + 1) / 2;
  const unsigned srcHeight = ((unsigned)dstHeight + 1) / 2;
  auto kernel2D = GPU::Kernel::get(PROGRAM(sampling), KERNEL_STR(upsample22KernelScalar))
                      .setup2D(stream, (unsigned)srcWidth, (unsigned)srcHeight, BLOCK_SIZE);
  return kernel2D.enqueueWithKernelArgs(dst, src, (unsigned)dstWidth, (unsigned)dstHeight, (unsigned)srcWidth,
                                        (unsigned)srcHeight, (int)wrap);
}

/**
 * Upsamples an image in RGB210.
 * @param dst subsampled buffer, size dstWidth * dstHeight.
 * @param src subsampled buffer, size (dstWidth / 2) * (dstHeight / 2).
 * @param dstWidth Destination width.
 * @param dstHeight Destination height.
 * @param stream Cuda stream to run in.
 */
Status upsample22RGBA210(GPU::Buffer<uint32_t> dst, GPU::Buffer<const uint32_t> src, std::size_t dstWidth,
                         std::size_t dstHeight, bool wrap, GPU::Stream stream) {
  const unsigned srcWidth = ((unsigned)dstWidth + 1) / 2;
  const unsigned srcHeight = ((unsigned)dstHeight + 1) / 2;
  auto kernel2D = GPU::Kernel::get(PROGRAM(sampling), KERNEL_STR(upsample22KernelRGB210))
                      .setup2D(stream, (unsigned)srcWidth, (unsigned)srcHeight, BLOCK_SIZE);
  return kernel2D.enqueueWithKernelArgs(dst, src, (unsigned)dstWidth, (unsigned)dstHeight, (unsigned)srcWidth,
                                        (unsigned)srcHeight, (int)wrap);
}

/**
 * Upsamples an image in RGBA.
 * @param dst subsampled buffer, size dstWidth * dstHeight.
 * @param src subsampled buffer, size (dstWidth / 2) * (dstHeight / 2).
 * @param dstWidth Destination width.
 * @param dstHeight Destination height.
 * @param blockSize Cuda block size (effective size is blockSize * blockSize)
 * @param stream Cuda stream to run in.
 */
Status upsample22RGBA(GPU::Buffer<uint32_t> dst, GPU::Buffer<const uint32_t> src, std::size_t dstWidth,
                      std::size_t dstHeight, bool wrap, GPU::Stream stream) {
  const unsigned srcWidth = ((unsigned)dstWidth + 1) / 2;
  const unsigned srcHeight = ((unsigned)dstHeight + 1) / 2;
  auto kernel2D = GPU::Kernel::get(PROGRAM(sampling), KERNEL_STR(upsample22KernelRGBA))
                      .setup2D(stream, (unsigned)srcWidth, (unsigned)srcHeight, BLOCK_SIZE);
  return kernel2D.enqueueWithKernelArgs(dst, src, (unsigned)dstWidth, (unsigned)dstHeight, (unsigned)srcWidth,
                                        (unsigned)srcHeight, (int)wrap);
}

/**
 * Subsamples the given mask by a factor of two. For each 2x2 pixel block, the output pixel is masked out if any of the
 * input pixels is masked out (i.e. any pixel has value 1).
 * @param dst subsampled buffer, size (srcWidth / 2) * (srcHeight / 2).
 * @param src subsampled buffer, size srcWidth * srcHeight.
 * @param srcWidth Source width.
 * @param srcHeight Source height.
 * @param blockSize Cuda block size (effective size is blockSize * blockSize)
 * @param stream Cuda stream to run in.
 */
Status subsampleMask22(GPU::Buffer<unsigned char> /*dst*/, GPU::Buffer<const unsigned char> /*src*/,
                       std::size_t /*srcWidth*/, std::size_t /*srcHeight*/, unsigned int /*blockSize*/,
                       GPU::Stream /*stream*/) {
  // TODO_OPENCL_IMPL
  return {Origin::Stitcher, ErrType::UnsupportedAction, "Masked subsampling not implemented in OpenCL backend"};
}

#include "../../common/sampling.inst"

}  // namespace Image
}  // namespace VideoStitch