// Copyright (c) 2012-2017 VideoStitch SAS
// Copyright (c) 2018 stitchEm

#include "image/unpack.hpp"
#include "colorArrayDevice.hpp"

#include "backend/cuda/deviceBuffer.hpp"
#include "backend/cuda/deviceBuffer2D.hpp"
#include "backend/cuda/surface.hpp"
#include "backend/cuda/deviceStream.hpp"

#include "cuda/util.hpp"

#include "unpackKernel.cu"

#include <cuda_runtime.h>
#include <cassert>

const unsigned int CudaBlockSize = 16;

namespace VideoStitch {
namespace Image {

// ---------------- Convert RGBA -> other colorspace --------------------------

Status unpackRGB(GPU::Buffer2D& dst, const GPU::Buffer<const uint32_t>& array, std::size_t width, std::size_t height,
                 GPU::Stream s) {
  const dim3 dimBlock(CudaBlockSize, CudaBlockSize, 1);
  const dim3 dimGrid((unsigned)Cuda::ceilDiv(width, dimBlock.x), (unsigned)Cuda::ceilDiv(height, dimBlock.y), 1);
  unpackKernelRGB<<<dimGrid, dimBlock, 0, s.get()>>>(dst.get().raw(), (unsigned)dst.getPitch(), array.get(),
                                                     (unsigned)width, (unsigned)height);
  return CUDA_STATUS;
}

Status unpackRGB(GPU::Buffer2D& dst, const GPU::Surface& surf, std::size_t width, std::size_t height, GPU::Stream s) {
  const dim3 dimBlock(CudaBlockSize, CudaBlockSize, 1);
  const dim3 dimGrid((unsigned)Cuda::ceilDiv(width, dimBlock.x), (unsigned)Cuda::ceilDiv(height, dimBlock.y), 1);
  unpackSourceKernelRGB<<<dimGrid, dimBlock, 0, s.get()>>>(dst.get().raw(), (unsigned)dst.getPitch(),
                                                           surf.get().surface(), (unsigned)width, (unsigned)height);
  return CUDA_STATUS;
}

Status unpackRGBA(GPU::Buffer2D& dst, const GPU::Buffer<const uint32_t>& array, std::size_t /*width*/,
                  std::size_t /*height*/, GPU::Stream s) {
  return CUDA_ERROR(cudaMemcpy2DAsync(dst.get().raw(), (unsigned)dst.getPitch(), array.get(), dst.getWidth(),
                                      dst.getWidth(), dst.getHeight(), cudaMemcpyDeviceToDevice, s.get()));
}

Status unpackRGBA(GPU::Buffer2D& dst, const GPU::Surface& surf, std::size_t width, std::size_t height, GPU::Stream s) {
  const dim3 dimBlock(CudaBlockSize, CudaBlockSize, 1);
  const dim3 dimGrid((unsigned)Cuda::ceilDiv(width, dimBlock.x), (unsigned)Cuda::ceilDiv(height, dimBlock.y), 1);
  unpackSourceKernelRGBA<<<dimGrid, dimBlock, 0, s.get()>>>(
      (uint32_t*)dst.get().raw(), (unsigned)dst.getPitch() / sizeof(uint32_t),  // pitch is in bytes
      surf.get().surface(), (unsigned)width, (unsigned)height);
  return CUDA_STATUS;
}

Status unpackF32C1(GPU::Buffer2D& dst, const GPU::Buffer<const uint32_t>& array, std::size_t /*width*/,
                   std::size_t /*height*/, GPU::Stream s) {
  return CUDA_ERROR(cudaMemcpy2DAsync(dst.get().raw(), (unsigned)dst.getPitch(), array.get(), dst.getWidth(),
                                      dst.getWidth(), dst.getHeight(), cudaMemcpyDeviceToDevice, s.get()));
}

Status unpackF32C1(GPU::Buffer2D& dst, const GPU::Surface& surf, std::size_t width, std::size_t height, GPU::Stream s) {
  const dim3 dimBlock(CudaBlockSize, CudaBlockSize, 1);
  const dim3 dimGrid((unsigned)Cuda::ceilDiv(width, dimBlock.x), (unsigned)Cuda::ceilDiv(height, dimBlock.y), 1);
  unpackSourceKernelF32C1<<<dimGrid, dimBlock, 0, s.get()>>>(
      (float*)dst.get().raw(), (unsigned)dst.getPitch() / sizeof(float),  // pitch is in bytes
      surf.get().surface(), (unsigned)width, (unsigned)height);
  return CUDA_STATUS;
}

Status unpackGrayscale16(GPU::Buffer2D& /* dst */, const GPU::Buffer<const uint32_t>& /* input */, size_t /* width*/,
                         size_t /* height */, GPU::Stream /* s */) {
  // TODO
  return {Origin::GPU, ErrType::UnsupportedAction,
          "Color space conversion for Grayscale16 not implemented from buffer"};
}

Status unpackGrayscale16(GPU::Buffer2D& dst, const GPU::Surface& surf, size_t width, size_t height, GPU::Stream s) {
  const dim3 dimBlock(CudaBlockSize, CudaBlockSize, 1);
  const dim3 dimGrid((unsigned)Cuda::ceilDiv(width, dimBlock.x), (unsigned)Cuda::ceilDiv(height, dimBlock.y), 1);
  unpackSourceKernelGrayscale16<<<dimGrid, dimBlock, 0, s.get()>>>(
      (uint16_t*)dst.get().raw(), (unsigned)dst.getPitch() / sizeof(uint16_t),  // pitch is in bytes
      surf.get().surface(), (unsigned)width, (unsigned)height);
  return CUDA_STATUS;
}

Status unpackDepth(GPU::Buffer2D& yDst, GPU::Buffer2D& uDst, GPU::Buffer2D& vDst,
                   const GPU::Buffer<const uint32_t>& array, std::size_t width, std::size_t height, GPU::Stream s) {
  assert(!(width & 1));
  assert(!(height & 1));
  const dim3 dimBlock(CudaBlockSize, CudaBlockSize, 1);
  const dim3 dimGrid((unsigned)Cuda::ceilDiv((width + 1) / 2, dimBlock.x),
                     (unsigned)Cuda::ceilDiv((height + 1) / 2, dimBlock.y), 1);
  unpackKernelDepth<<<dimGrid, dimBlock, 0, s.get()>>>(
      yDst.get().raw(), (unsigned)yDst.getPitch(), uDst.get().raw(), (unsigned)uDst.getPitch(), vDst.get().raw(),
      (unsigned)vDst.getPitch(), (float*)array.get().raw(), (unsigned)width, (unsigned)height);
  return CUDA_STATUS;
}

Status unpackDepth(GPU::Buffer2D& yDst, GPU::Buffer2D& uDst, GPU::Buffer2D& vDst, const GPU::Surface& surf,
                   std::size_t width, std::size_t height, GPU::Stream s) {
  assert(!(width & 1));
  assert(!(height & 1));
  const dim3 dimBlock(CudaBlockSize, CudaBlockSize, 1);
  const dim3 dimGrid((unsigned)Cuda::ceilDiv((width + 1) / 2, dimBlock.x),
                     (unsigned)Cuda::ceilDiv((height + 1) / 2, dimBlock.y), 1);
  unpackSourceKernelDepth<<<dimGrid, dimBlock, 0, s.get()>>>(
      yDst.get().raw(), (unsigned)yDst.getPitch(), uDst.get().raw(), (unsigned)uDst.getPitch(), vDst.get().raw(),
      (unsigned)vDst.getPitch(), surf.get().surface(), (unsigned)width, (unsigned)height);
  return CUDA_STATUS;
}

Status unpackYV12(GPU::Buffer2D& yDst, GPU::Buffer2D& uDst, GPU::Buffer2D& vDst,
                  const GPU::Buffer<const uint32_t>& array, std::size_t width, std::size_t height, GPU::Stream s) {
  const dim3 dimBlock(CudaBlockSize, CudaBlockSize, 1);
  const dim3 dimGrid((unsigned)Cuda::ceilDiv((width + 1) / 2, dimBlock.x),
                     (unsigned)Cuda::ceilDiv((height + 1) / 2, dimBlock.y), 1);
  unpackKernelYV12<<<dimGrid, dimBlock, 0, s.get()>>>(
      yDst.get().raw(), (unsigned)yDst.getPitch(), uDst.get().raw(), (unsigned)uDst.getPitch(), vDst.get().raw(),
      (unsigned)vDst.getPitch(), array.get(), (unsigned)width, (unsigned)height);
  return CUDA_STATUS;
}

Status unpackYV12(GPU::Buffer2D& yDst, GPU::Buffer2D& uDst, GPU::Buffer2D& vDst, const GPU::Surface& surf,
                  std::size_t width, std::size_t height, GPU::Stream s) {
  const dim3 dimBlock(CudaBlockSize, CudaBlockSize, 1);
  const dim3 dimGrid((unsigned)Cuda::ceilDiv((width + 1) / 2, dimBlock.x),
                     (unsigned)Cuda::ceilDiv((height + 1) / 2, dimBlock.y), 1);
  unpackSourceKernelYV12<<<dimGrid, dimBlock, 0, s.get()>>>(
      yDst.get().raw(), (unsigned)yDst.getPitch(), uDst.get().raw(), (unsigned)uDst.getPitch(), vDst.get().raw(),
      (unsigned)vDst.getPitch(), surf.get().surface(), (unsigned)width, (unsigned)height);
  return CUDA_STATUS;
}

Status unpackNV12(GPU::Buffer2D& yDst, GPU::Buffer2D& uvDst, const GPU::Surface& surf, std::size_t width,
                  std::size_t height, GPU::Stream s) {
  const dim3 dimBlock(CudaBlockSize, CudaBlockSize, 1);
  const dim3 dimGrid((unsigned)Cuda::ceilDiv((width + 1) / 2, dimBlock.x),
                     (unsigned)Cuda::ceilDiv((height + 1) / 2, dimBlock.y), 1);
  unpackSourceKernelNV12<<<dimGrid, dimBlock, 0, s.get()>>>(yDst.get().raw(), (unsigned)yDst.getPitch(),
                                                            uvDst.get().raw(), (unsigned)uvDst.getPitch(),
                                                            surf.get().surface(), (unsigned)width, (unsigned)height);
  return CUDA_STATUS;
}

Status unpackNV12(GPU::Buffer2D& yDst, GPU::Buffer2D& uvDst, const GPU::Buffer<const uint32_t>& array,
                  std::size_t width, std::size_t height, GPU::Stream s) {
  const dim3 dimBlock(CudaBlockSize, CudaBlockSize, 1);
  const dim3 dimGrid((unsigned)Cuda::ceilDiv((width + 1) / 2, dimBlock.x),
                     (unsigned)Cuda::ceilDiv((height + 1) / 2, dimBlock.y), 1);
  unpackKernelNV12<<<dimGrid, dimBlock, 0, s.get()>>>(yDst.get().raw(), (unsigned)yDst.getPitch(), uvDst.get().raw(),
                                                      (unsigned)uvDst.getPitch(), array.get(), (unsigned)width,
                                                      (unsigned)height);
  return CUDA_STATUS;
}

Status unpackYUY2(GPU::Buffer2D& dst, const GPU::Buffer<const uint32_t>& src, std::size_t width, std::size_t height,
                  GPU::Stream stream) {
  const dim3 dimBlock(CudaBlockSize, CudaBlockSize, 1);
  const dim3 dimGrid((unsigned)Cuda::ceilDiv((width + 1) / 2, dimBlock.x), (unsigned)Cuda::ceilDiv(height, dimBlock.y),
                     1);
  unpackYUY2Kernel<<<dimGrid, dimBlock, 0, stream.get()>>>(dst.get().raw(), (unsigned)dst.getPitch(), src.get(),
                                                           (unsigned)width, (unsigned)height);
  return CUDA_STATUS;
}

Status unpackYUY2(GPU::Buffer2D&, const GPU::Surface&, std::size_t, std::size_t, GPU::Stream) {
  return Status{Origin::GPU, ErrType::ImplementationError, "Unpacking not implemented from Surface"};
}

Status unpackUYVY(GPU::Buffer2D& dst, const GPU::Buffer<const uint32_t>& src, std::size_t width, std::size_t height,
                  GPU::Stream stream) {
  const dim3 dimBlock(CudaBlockSize, CudaBlockSize, 1);
  const dim3 dimGrid((unsigned)Cuda::ceilDiv((width + 1) / 2, dimBlock.x), (unsigned)Cuda::ceilDiv(height, dimBlock.y),
                     1);
  unpackUYVYKernel<<<dimGrid, dimBlock, 0, stream.get()>>>(dst.get().raw(), (unsigned)dst.getPitch(), src.get(),
                                                           (unsigned)width, (unsigned)height);
  return CUDA_STATUS;
}

Status unpackUYVY(GPU::Buffer2D&, const GPU::Surface&, std::size_t, std::size_t, GPU::Stream) {
  return Status{Origin::GPU, ErrType::ImplementationError, "Unpacking not implemented from Surface"};
}

Status convertGrayscale(GPU::Buffer<uint32_t> dst, GPU::Buffer<const unsigned char> src, std::size_t width,
                        std::size_t height, GPU::Stream stream) {
  const dim3 dimBlock2D(CudaBlockSize, CudaBlockSize, 1);
  const dim3 dimGrid2D((unsigned)Cuda::ceilDiv(width, dimBlock2D.x), (unsigned)Cuda::ceilDiv(height, dimBlock2D.y), 1);
  convertKernelGrayscale<<<dimGrid2D, dimBlock2D, 0, stream.get()>>>(dst.get(), src.get(), (unsigned)width,
                                                                     (unsigned)height);
  return CUDA_STATUS;
}

Status unpackYUV422P10(GPU::Buffer2D& yDst, GPU::Buffer2D& uDst, GPU::Buffer2D& vDst,
                       const GPU::Buffer<const uint32_t>& src, std::size_t width, std::size_t height,
                       GPU::Stream stream) {
  const dim3 dimBlock(CudaBlockSize, CudaBlockSize, 1);
  const dim3 dimGrid((unsigned)Cuda::ceilDiv((width + 1) / 2, dimBlock.x), (unsigned)Cuda::ceilDiv(height, dimBlock.y),
                     1);
  unpackYUV422P10Kernel<<<dimGrid, dimBlock, 0, stream.get()>>>(
      reinterpret_cast<uint16_t*>(yDst.get().raw()), (unsigned)yDst.getPitch() / 2,
      reinterpret_cast<uint16_t*>(uDst.get().raw()), (unsigned)uDst.getPitch() / 2,
      reinterpret_cast<uint16_t*>(vDst.get().raw()), (unsigned)vDst.getPitch() / 2, src.get(), (unsigned)width,
      (unsigned)height);
  return CUDA_STATUS;
}

Status unpackYUV422P10(GPU::Buffer2D&, GPU::Buffer2D&, GPU::Buffer2D&, const GPU::Surface&, std::size_t, std::size_t,
                       GPU::Stream) {
  return Status{Origin::GPU, ErrType::ImplementationError, "Unpacking not implemented from Surface"};
}

Status unpackGrayscale(GPU::Buffer2D& dst, const GPU::Surface& src, std::size_t width, std::size_t height,
                       GPU::Stream stream) {
  const dim3 dimBlock2D(CudaBlockSize, CudaBlockSize, 1);
  const dim3 dimGrid2D((unsigned)Cuda::ceilDiv(width, dimBlock2D.x), (unsigned)Cuda::ceilDiv(height, dimBlock2D.y), 1);
  unpackKernelGrayscale<<<dimGrid2D, dimBlock2D, 0, stream.get()>>>(
      dst.get().raw(), (unsigned)dst.getPitch(), src.get().surface(), (unsigned)width, (unsigned)height);
  return CUDA_STATUS;
}

// ---------------- Convert other colorspace -> RGBA --------------------------

Status convertRGBToRGBA(GPU::Surface& dst, GPU::Buffer<const unsigned char> src, std::size_t width, std::size_t height,
                        GPU::Stream stream) {
  const dim3 dimBlock2D(CudaBlockSize, CudaBlockSize, 1);
  const dim3 dimGrid2D((unsigned)Cuda::ceilDiv(width, dimBlock2D.x), (unsigned)Cuda::ceilDiv(height, dimBlock2D.y), 1);
  convertRGBToRGBAKernel<<<dimGrid2D, dimBlock2D, 0, stream.get()>>>(dst.get().surface(), src.get(), (unsigned)width,
                                                                     (unsigned)height);
  return CUDA_STATUS;
}

Status convertRGB210ToRGBA(GPU::Surface& dst, GPU::Buffer<const uint32_t> src, std::size_t width, std::size_t height,
                           GPU::Stream stream) {
  const dim3 dimBlock2D(CudaBlockSize, CudaBlockSize, 1);
  const dim3 dimGrid2D((unsigned)Cuda::ceilDiv(width, dimBlock2D.x), (unsigned)Cuda::ceilDiv(height, dimBlock2D.y), 1);
  convertRGB210ToRGBAKernel<<<dimGrid2D, dimBlock2D, 0, stream.get()>>>(dst.get().surface(), src.get(), (unsigned)width,
                                                                        (unsigned)height);
  return CUDA_STATUS;
}

Status convertBGRToRGBA(GPU::Buffer<uint32_t> dst, GPU::Buffer<const unsigned char> src, std::size_t width,
                        std::size_t height, GPU::Stream stream) {
  const dim3 dimBlock2D(CudaBlockSize, CudaBlockSize, 1);
  const dim3 dimGrid2D((unsigned)Cuda::ceilDiv(width, dimBlock2D.x), (unsigned)Cuda::ceilDiv(height, dimBlock2D.y), 1);
  convertBGRToRGBAKernel<<<dimGrid2D, dimBlock2D, 0, stream.get()>>>(dst.get(), src.get(), (unsigned)width,
                                                                     (unsigned)height);
  return CUDA_STATUS;
}

Status convertBGRUToRGBA(GPU::Buffer<uint32_t> dst, GPU::Buffer<const unsigned char> src, std::size_t width,
                         std::size_t height, GPU::Stream stream) {
  const dim3 dimBlock2D(CudaBlockSize, CudaBlockSize, 1);
  const dim3 dimGrid2D((unsigned)Cuda::ceilDiv(width / 2, dimBlock2D.x),
                       (unsigned)Cuda::ceilDiv(height / 2, dimBlock2D.y), 1);
  convertBGRUToRGBAKernel<<<dimGrid2D, dimBlock2D, 0, stream.get()>>>(dst.get(), src.get(), (unsigned)width,
                                                                      (unsigned)height);
  return CUDA_STATUS;
}

Status convertBayerRGGBToRGBA(GPU::Buffer<uint32_t> dst, GPU::Buffer<const unsigned char> src, std::size_t width,
                              std::size_t height, GPU::Stream stream) {
  assert(!(width & 1));
  assert(!(height & 1));
  const dim3 dimBlock2D(CudaBlockSize, CudaBlockSize, 1);
  const dim3 dimGrid2D((unsigned)Cuda::ceilDiv(width / 2, dimBlock2D.x),
                       (unsigned)Cuda::ceilDiv(height / 2, dimBlock2D.y), 1);
  convertBayerRGGBToRGBAKernel<<<dimGrid2D, dimBlock2D, sizeof(uint32_t) * (dimBlock2D.x + 1) * (dimBlock2D.y + 1),
                                 stream.get()>>>(dst.get(), src.get(), (unsigned)width, (unsigned)height);
  return CUDA_STATUS;
}

Status convertBayerBGGRToRGBA(GPU::Buffer<uint32_t> dst, GPU::Buffer<const unsigned char> src, std::size_t width,
                              std::size_t height, GPU::Stream stream) {
  assert(!(width & 1));
  assert(!(height & 1));
  const dim3 dimBlock2D(CudaBlockSize, CudaBlockSize, 1);
  const dim3 dimGrid2D((unsigned)Cuda::ceilDiv(width / 2, dimBlock2D.x),
                       (unsigned)Cuda::ceilDiv(height / 2, dimBlock2D.y), 1);
  convertBayerBGGRToRGBAKernel<<<dimGrid2D, dimBlock2D, sizeof(uint32_t) * (dimBlock2D.x + 1) * (dimBlock2D.y + 1),
                                 stream.get()>>>(dst.get(), src.get(), (unsigned)width, (unsigned)height);
  return CUDA_STATUS;
}

Status convertBayerGRBGToRGBA(GPU::Buffer<uint32_t> dst, GPU::Buffer<const unsigned char> src, std::size_t width,
                              std::size_t height, GPU::Stream stream) {
  assert(!(width & 1));
  assert(!(height & 1));
  const dim3 dimBlock2D(CudaBlockSize, CudaBlockSize, 1);
  const dim3 dimGrid2D((unsigned)Cuda::ceilDiv(width / 2, dimBlock2D.x),
                       (unsigned)Cuda::ceilDiv(height / 2, dimBlock2D.y), 1);
  convertBayerGRBGToRGBAKernel<<<dimGrid2D, dimBlock2D, sizeof(uint32_t) * (dimBlock2D.x + 1) * (dimBlock2D.y + 1),
                                 stream.get()>>>(dst.get(), src.get(), (unsigned)width, (unsigned)height);
  return CUDA_STATUS;
}

Status convertBayerGBRGToRGBA(GPU::Buffer<uint32_t> dst, GPU::Buffer<const unsigned char> src, std::size_t width,
                              std::size_t height, GPU::Stream stream) {
  assert(!(width & 1));
  assert(!(height & 1));
  const dim3 dimBlock2D(CudaBlockSize, CudaBlockSize, 1);
  const dim3 dimGrid2D((unsigned)Cuda::ceilDiv(width / 2, dimBlock2D.x),
                       (unsigned)Cuda::ceilDiv(height / 2, dimBlock2D.y), 1);
  convertBayerGBRGToRGBAKernel<<<dimGrid2D, dimBlock2D, sizeof(uint32_t) * (dimBlock2D.x + 1) * (dimBlock2D.y + 1),
                                 stream.get()>>>(dst.get(), src.get(), (unsigned)width, (unsigned)height);
  return CUDA_STATUS;
}

Status convertUYVYToRGBA(GPU::Surface& dst, GPU::Buffer<const unsigned char> src, std::size_t width, std::size_t height,
                         GPU::Stream stream) {
  const dim3 dimBlock(16, 16, 1);
  assert(!(width & 1));
  assert(!(height & 1));
  const dim3 dimGrid((unsigned)Cuda::ceilDiv(width / 2, dimBlock.x), (unsigned)Cuda::ceilDiv(height, dimBlock.y), 1);
  convertUYVYToRGBAKernel<<<dimGrid, dimBlock, 0, stream.get()>>>(dst.get().surface(), src.get(), (unsigned)width,
                                                                  (unsigned)height);
  return CUDA_STATUS;
}

Status convertYUV422P10ToRGBA(GPU::Surface& dst, GPU::Buffer<const unsigned char> src, std::size_t width,
                              std::size_t height, GPU::Stream stream) {
  assert(!(width & 1));
  const dim3 dimBlock(CudaBlockSize, CudaBlockSize, 1);
  const dim3 dimGrid((unsigned)Cuda::ceilDiv(width / 2, dimBlock.x), (unsigned)Cuda::ceilDiv(height, dimBlock.y), 1);
  convertYUV422P10ToRGBAKernel<<<dimGrid, dimBlock, 0, stream.get()>>>(
      dst.get().surface(), src.as<const uint16_t>().get(), (unsigned)width, (unsigned)height);
  return CUDA_STATUS;
}

Status convertYUY2ToRGBA(GPU::Surface& dst, GPU::Buffer<const unsigned char> src, std::size_t width, std::size_t height,
                         GPU::Stream stream) {
  const dim3 dimBlock(16, 16, 1);
  assert(!(width & 1));
  assert(!(height & 1));
  const dim3 dimGrid((unsigned)Cuda::ceilDiv(width / 2, dimBlock.x), (unsigned)Cuda::ceilDiv(height, dimBlock.y), 1);
  convertYUY2ToRGBAKernel<<<dimGrid, dimBlock, 0, stream.get()>>>(dst.get().surface(), src.get(), (unsigned)width,
                                                                  (unsigned)height);
  return CUDA_STATUS;
}

Status convertYV12ToRGBA(GPU::Surface& dst, GPU::Buffer<const unsigned char> src, std::size_t width, std::size_t height,
                         GPU::Stream stream) {
  const dim3 dimBlock(CudaBlockSize, CudaBlockSize, 1);
  assert(!(width & 1));
  assert(!(height & 1));
  const dim3 dimGrid((unsigned)Cuda::ceilDiv(width / 2, dimBlock.x), (unsigned)Cuda::ceilDiv(height / 2, dimBlock.y),
                     1);
  convertYV12ToRGBAKernel<<<dimGrid, dimBlock, 0, stream.get()>>>(dst.get().surface(), src.get(), (unsigned)width,
                                                                  (unsigned)height);
  return CUDA_STATUS;
}

Status convertNV12ToRGBA(GPU::Surface& dst, GPU::Buffer<const unsigned char> src, std::size_t width, std::size_t height,
                         GPU::Stream stream) {
  const dim3 dimBlock(CudaBlockSize, CudaBlockSize, 1);
  assert(!(width & 1));
  assert(!(height & 1));
  const dim3 dimGrid((unsigned)Cuda::ceilDiv(width / 2, dimBlock.x), (unsigned)Cuda::ceilDiv(height / 2, dimBlock.y),
                     1);
  convertNV12ToRGBAKernel<<<dimGrid, dimBlock, 0, stream.get()>>>(dst.get().surface(), src.get(), (unsigned)width,
                                                                  (unsigned)height);
  return CUDA_STATUS;
}

Status convertYUV420ToMono(GPU::Buffer<unsigned char> dst, GPU::Buffer<const unsigned char> src, std::size_t width,
                           std::size_t height, GPU::Stream stream) {
  const dim3 dimBlock(CudaBlockSize, CudaBlockSize, 1);
  assert(!(width & 1));
  assert(!(height & 1));
  const dim3 dimGrid((unsigned)Cuda::ceilDiv(width / 2, dimBlock.x), (unsigned)Cuda::ceilDiv(height / 2, dimBlock.y),
                     1);
  unpackMonoKernelYUV420P<<<dimGrid, dimBlock, 0, stream.get()>>>(dst.get(), src.get(), (unsigned)width,
                                                                  (unsigned)height);
  return CUDA_STATUS;
}

Status convertGrayscaleToRGBA(GPU::Surface& dst, GPU::Buffer<const unsigned char> src, std::size_t width,
                              std::size_t height, GPU::Stream stream) {
  const dim3 dimBlock(CudaBlockSize, CudaBlockSize, 1);
  const dim3 dimGrid((unsigned)Cuda::ceilDiv(width, dimBlock.x), (unsigned)Cuda::ceilDiv(height, dimBlock.y), 1);
  convertGrayscaleKernel<<<dimGrid, dimBlock, 0, stream.get()>>>(dst.get().surface(), src.get(), (unsigned)width,
                                                                 (unsigned)height);
  return CUDA_STATUS;
}
}  // namespace Image
}  // namespace VideoStitch