// Copyright (c) 2012-2017 VideoStitch SAS
// Copyright (c) 2018 stitchEm

#pragma once

#include "2dBuffer.hpp"
#include "buffer.hpp"
#include "surface.hpp"
#include "hostBuffer.hpp"
#include "stream.hpp"

namespace VideoStitch {
namespace GPU {

// Device --> Device
// * async
// Copies copySize from the device buffer src into the device buffer dst.
// Declaration only; the definition is not part of this header.
// NOTE(review): copySize is presumably a byte count, as in the blocking counterpart — confirm.
// NOTE(review): presumably the copy is enqueued on `stream` and may still be pending on return — confirm.
template <typename T>
Status memcpyAsync(Buffer<T> dst, Buffer<const T> src, size_t copySize, const Stream& stream);

// Convenience overload accepting a mutable source buffer: takes the const view of src
// and hands it, together with the untouched remaining arguments, to memcpyAsync again.
template <typename T>
inline Status memcpyAsync(Buffer<T> dst, Buffer<T> src, size_t copySize, const Stream& stream) {
  const auto readOnlySrc = src.as_const();
  return memcpyAsync(dst, readOnlySrc, copySize, stream);
}

// * blocking
// Blocking device-to-device copy of copySize bytes from src into dst
// (the inline overloads in this header pass src.byteSize() as copySize).
// Declaration only; the definition is not part of this header.
template <typename T>
Status memcpyBlocking(Buffer<T> dst, Buffer<const T> src, size_t copySize);

// Convenience overload accepting a mutable source buffer: takes the const view of src
// and calls memcpyBlocking again with the same dst and copySize.
template <typename T>
inline Status memcpyBlocking(Buffer<T> dst, Buffer<T> src, size_t copySize) {
  const auto readOnlySrc = src.as_const();
  return memcpyBlocking(dst, readOnlySrc, copySize);
}

// Whole-buffer copy from a mutable source buffer: takes the const view of src
// and calls the two-argument memcpyBlocking with it.
template <typename T>
inline Status memcpyBlocking(Buffer<T> dst, Buffer<T> src) {
  const auto readOnlySrc = src.as_const();
  return memcpyBlocking(dst, readOnlySrc);
}

// Blocking copy of the entire src buffer into dst.
// Returns an ImplementationError status, without copying, when dst.byteSize() is
// smaller than src.byteSize(); otherwise copies src.byteSize() bytes.
template <typename T>
inline Status memcpyBlocking(Buffer<T> dst, Buffer<const T> src) {
  const auto srcBytes = src.byteSize();
  if (srcBytes > dst.byteSize()) {
    return {Origin::GPU, ErrType::ImplementationError, "Copy destination is too small"};
  }
  return memcpyBlocking(dst, src, srcBytes);
}

// Host --> Device
// * async
// Copies copySize bytes from the host pointer src into the device buffer dst
// (the inline overloads in this header pass a byteSize() as copySize).
// Declaration only; the definition is not part of this header.
// NOTE(review): presumably the copy is enqueued on `stream`, so src must stay valid until it completes — confirm.
template <typename T>
Status memcpyAsync(Buffer<T> dst, const T* src, size_t copySize, const Stream& stream);

// Host-to-device copy that fills dst completely: the copy size is dst.byteSize(),
// so src must point to at least that many readable bytes (not checked here).
template <typename T>
inline Status memcpyAsync(Buffer<T> dst, const T* src, const Stream& stream) {
  const auto dstBytes = dst.byteSize();
  return memcpyAsync(dst, src, dstBytes, stream);
}

// Host-to-device copy of the entire host buffer src into dst.
// Returns an ImplementationError status, without copying, when dst.byteSize() is
// smaller than src.byteSize(); otherwise copies src.byteSize() bytes from src.hostPtr().
template <typename T>
inline Status memcpyAsync(Buffer<T> dst, HostBuffer<const T> src, const Stream& stream) {
  const auto srcBytes = src.byteSize();
  if (srcBytes > dst.byteSize()) {
    return {Origin::GPU, ErrType::ImplementationError, "Copy destination is too small"};
  }
  return memcpyAsync(dst, src.hostPtr(), srcBytes, stream);
}

// Host-to-device copy of the first copySize bytes of the host buffer src into dst.
// Returns an ImplementationError status, without copying, when copySize exceeds
// src.byteSize(); otherwise forwards to the raw-pointer overload with src.hostPtr().
template <typename T>
inline Status memcpyAsync(Buffer<T> dst, HostBuffer<const T> src, size_t copySize, const Stream& stream) {
  // Debug builds still trap immediately on an oversized request.
  assert(copySize <= src.byteSize());
  // The assert is compiled out under NDEBUG, so also reject the request at runtime
  // instead of reading past the end of the host buffer.
  if (copySize > src.byteSize()) {
    return {Origin::GPU, ErrType::ImplementationError, "Copy source is too small"};
  }
  return memcpyAsync(dst, src.hostPtr(), copySize, stream);
}

// Convenience overload accepting a mutable host buffer: takes the const view of src
// and calls memcpyAsync again with the same dst, copySize and stream.
template <typename T>
inline Status memcpyAsync(Buffer<T> dst, HostBuffer<T> src, size_t copySize, const Stream& stream) {
  const auto readOnlySrc = src.as_const();
  return memcpyAsync(dst, readOnlySrc, copySize, stream);
}

// * blocking
// Blocking copy of copySize bytes from the host pointer src into the device buffer dst
// (the inline overloads in this header pass dst.byteSize() as copySize).
// Declaration only; the definition is not part of this header.
template <typename T>
Status memcpyBlocking(Buffer<T> dst, const T* src, size_t copySize);

// Blocking host-to-device copy that fills dst completely: the copy size is dst.byteSize(),
// so src must point to at least that many readable bytes (not checked here).
template <typename T>
inline Status memcpyBlocking(Buffer<T> dst, const T* src) {
  const auto dstBytes = dst.byteSize();
  return memcpyBlocking(dst, src, dstBytes);
}

// Blocking host-to-device copy into a buffer wrapped in a PotentialBuffer.
// When the status check on dst rejects it, that status is returned and nothing is copied;
// otherwise the wrapped buffer is filled completely from src (copy size = its byteSize()).
template <typename T>
inline Status memcpyBlocking(typename Buffer<T>::PotentialBuffer dst, const T* src) {
  // NOTE(review): relies on Status supporting operator!; presumably true when dst carries a failure — confirm.
  if (!dst.status()) {
    return dst.status();
  }
  // Query the size on the wrapped buffer itself: byteSize() is a buffer member,
  // and is not shown anywhere in this header to exist on the PotentialBuffer wrapper.
  auto buffer = dst.value();
  return memcpyBlocking(buffer, src, buffer.byteSize());
}

// Copy memory Device --> Host
// * async
// Copies copySize bytes from the device buffer src to the host pointer dst
// (the inline overloads in this header pass src.byteSize() as copySize).
// Declaration only; the definition is not part of this header.
// NOTE(review): presumably the copy is enqueued on `stream`, so dst is only valid to read once it completes — confirm.
template <typename T>
Status memcpyAsync(T* dst, Buffer<const T> src, size_t copySize, const Stream& stream);

// Device-to-host copy of the entire src buffer: the copy size is src.byteSize(),
// so dst must point to at least that many writable bytes (not checked here).
template <typename T>
inline Status memcpyAsync(T* dst, Buffer<const T> src, const Stream& stream) {
  const auto srcBytes = src.byteSize();
  return memcpyAsync(dst, src, srcBytes, stream);
}

// Device-to-host copy of copySize bytes from src into the host buffer dst.
// Returns an ImplementationError status, without copying, when copySize exceeds
// dst.byteSize(); otherwise forwards to the raw-pointer overload with dst.hostPtr().
template <typename T>
inline Status memcpyAsync(HostBuffer<T> dst, Buffer<const T> src, size_t copySize, const Stream& stream) {
  // Debug builds still trap immediately on an oversized request.
  assert(copySize <= dst.byteSize());
  // The assert is compiled out under NDEBUG, so also reject the request at runtime
  // instead of writing past the end of the host buffer.
  if (copySize > dst.byteSize()) {
    return {Origin::GPU, ErrType::ImplementationError, "Copy destination is too small"};
  }
  return memcpyAsync(dst.hostPtr(), src, copySize, stream);
}

// Device-to-host copy of the entire src buffer into the host buffer dst.
// Returns an ImplementationError status, without copying, when dst.byteSize() is
// smaller than src.byteSize(); otherwise copies src.byteSize() bytes to dst.hostPtr().
template <typename T>
inline Status memcpyAsync(HostBuffer<T> dst, Buffer<const T> src, const Stream& stream) {
  const auto srcBytes = src.byteSize();
  if (srcBytes > dst.byteSize()) {
    return {Origin::GPU, ErrType::ImplementationError, "Copy destination is too small"};
  }
  return memcpyAsync(dst.hostPtr(), src, srcBytes, stream);
}

// Copies the 2D device buffer src to the host pointer dst. Declaration only.
// NOTE(review): the HostBuffer overload in this header requires width * height bytes of room,
// so dst presumably receives that many tightly packed bytes — confirm against the definition.
Status memcpyAsync(unsigned char* dst, Buffer2D src, const Stream& stream);
// Copies the 2D device buffer src into the host buffer dst.
// Returns an ImplementationError status, without copying, when dst.byteSize() is smaller
// than the product of src's width and height; otherwise forwards dst.hostPtr() to the
// raw-pointer overload.
inline Status memcpyAsync(HostBuffer<unsigned char> dst, Buffer2D src, const Stream& stream) {
  const auto requiredSize = src.getWidth() * src.getHeight();
  if (dst.byteSize() < requiredSize) {
    return {Origin::GPU, ErrType::ImplementationError, "Copy destination is too small"};
  }
  const auto hostDst = dst.hostPtr();
  return memcpyAsync(hostDst, src, stream);
}

// * blocking
// Blocking copy of copySize bytes from the device buffer src to the host pointer dst
// (the inline overloads in this header pass src.byteSize() as copySize).
// Declaration only; the definition is not part of this header.
template <typename T>
Status memcpyBlocking(T* dst, Buffer<const T> src, size_t copySize);

// Blocking device-to-host copy of the entire src buffer: the copy size is src.byteSize(),
// so dst must point to at least that many writable bytes (not checked here).
template <typename T>
inline Status memcpyBlocking(T* dst, Buffer<const T> src) {
  const auto srcBytes = src.byteSize();
  return memcpyBlocking(dst, src, srcBytes);
}

// Convenience overload accepting a mutable source buffer: takes the const view of src
// and calls the two-argument memcpyBlocking with the same host pointer.
template <typename T>
inline Status memcpyBlocking(T* dst, Buffer<T> src) {
  const auto readOnlySrc = src.as_const();
  return memcpyBlocking(dst, readOnlySrc);
}

// Blocking device-to-host copy of the entire src buffer into the host buffer dst.
// Returns an ImplementationError status, without copying, when dst.byteSize() is
// smaller than src.byteSize(); otherwise forwards dst.hostPtr() to the raw-pointer overload.
template <typename T>
inline Status memcpyBlocking(HostBuffer<T> dst, Buffer<const T> src) {
  if (src.byteSize() > dst.byteSize()) {
    return {Origin::GPU, ErrType::ImplementationError, "Copy destination is too small"};
  }
  const auto hostDst = dst.hostPtr();
  return memcpyBlocking(hostDst, src);
}

// Blocking copy of the 2D device buffer src to the host pointer dst. Declaration only.
// NOTE(review): the HostBuffer overload in this header requires width * height bytes of room,
// so dst presumably receives that many tightly packed bytes — confirm against the definition.
Status memcpyBlocking(unsigned char* dst, Buffer2D src);
// Blocking copy of the 2D device buffer src into the host buffer dst.
// Returns an ImplementationError status, without copying, when dst.byteSize() is smaller
// than the product of src's width and height; otherwise forwards dst.hostPtr() to the
// raw-pointer overload.
inline Status memcpyBlocking(HostBuffer<unsigned char> dst, Buffer2D src) {
  const auto requiredSize = src.getWidth() * src.getHeight();
  if (dst.byteSize() < requiredSize) {
    return {Origin::GPU, ErrType::ImplementationError, "Copy destination is too small"};
  }
  const auto hostDst = dst.hostPtr();
  return memcpyBlocking(hostDst, src);
}

// Blocking copy from the host pointer src into the 2D device buffer dst. Declaration only.
// NOTE(review): the HostBuffer overload in this header requires the source to hold at least
// width * height bytes, so presumably that many bytes are read from src — confirm.
Status memcpyBlocking(Buffer2D dst, const unsigned char* src);
// Blocking copy from the host buffer src into the 2D device buffer dst.
// Returns an ImplementationError status, without copying, when src.byteSize() is smaller
// than the product of dst's width and height; otherwise forwards src.hostPtr() to the
// raw-pointer overload.
inline Status memcpyBlocking(Buffer2D dst, HostBuffer<unsigned char> src) {
  if (src.byteSize() < dst.getWidth() * dst.getHeight()) {
    // Here it is the host *source* that cannot supply enough data, so report that
    // rather than blaming the destination.
    return {Origin::GPU, ErrType::ImplementationError, "Copy source is too small"};
  }
  return memcpyBlocking(dst, src.hostPtr());
}

// Memset
// Note: memset to value implemented in render.hpp

// Sets settingSize bytes of the GPU buffer devPtr to 0.
// Declaration only; the definition is not part of this header.
// NOTE(review): presumably the operation is enqueued on `stream` and may still be pending on return — confirm.
template <typename T>
Status memsetToZeroAsync(Buffer<T> devPtr, size_t settingSize, const Stream& stream);

// Zeroes the whole buffer: uses devPtr.byteSize() as the number of bytes to set.
template <typename T>
inline Status memsetToZeroAsync(Buffer<T> devPtr, const Stream& stream) {
  const auto totalBytes = devPtr.byteSize();
  return memsetToZeroAsync(devPtr, totalBytes, stream);
}

// Blocking memset. Declaration only; the definition is not part of this header.
// NOTE(review): presumably sets settingSize bytes of devPtr to 0, like memsetToZeroAsync — confirm.
template <typename T>
Status memsetToZeroBlocking(Buffer<T> devPtr, size_t settingSize);

// Surface variant of memsetToZeroAsync. Declaration only.
// NOTE(review): presumably zeroes the entire surface on `stream`; no size argument is taken — confirm.
Status memsetToZeroAsync(Surface& dst, const Stream& stream);

// Device buffer to texture memory
// Copies the device buffer src into the surface (texture memory) dst. Declaration only.
// NOTE(review): no size argument is taken; presumably the extent comes from dst — confirm.
template <typename T>
Status memcpyAsync(Surface& dst, Buffer<const T> src, const Stream& stream);
// Blocking copy of a device buffer into the surface dst; unlike the async variant this
// overload is not a template and only accepts uint32_t elements. Declaration only.
Status memcpyBlocking(Surface& dst, Buffer<const uint32_t> src);

// Texture memory to device buffer
// Copies the surface (texture memory) src into the device buffer dst. Declaration only.
// NOTE(review): no size argument is taken; presumably dst must be large enough for the whole surface — confirm.
template <typename T>
Status memcpyAsync(Buffer<T> dst, Surface& src, const Stream& stream);
// Blocking copy of the surface (texture memory) src into the device buffer dst. Declaration only.
// NOTE(review): no size argument is taken; presumably dst must be large enough for the whole surface — confirm.
template <typename T>
Status memcpyBlocking(Buffer<T> dst, Surface& src);

// Texture memory to host buffer
// Copies the surface (texture memory) src to the host pointer dst. Declaration only.
// NOTE(review): no size argument is taken; presumably dst must have room for the whole surface
// and stay valid until the work on `stream` completes — confirm.
template <typename T>
Status memcpyAsync(T* dst, Surface& src, const Stream& stream);

// Blocking copy of the surface (texture memory) src to the host pointer dst. Declaration only.
// NOTE(review): no size argument is taken; presumably dst must have room for the whole surface — confirm.
template <typename T>
Status memcpyBlocking(T* dst, Surface& src);

// Host to texture memory
// Copies uint32_t data from the host pointer src into the surface dst. Declaration only.
// NOTE(review): src is taken as a non-const pointer although it is presumably only read — confirm.
Status memcpyAsync(Surface& dst, uint32_t* src, const Stream& stream);
// Blocking copy of uint32_t data from the host pointer src into the surface dst. Declaration only.
// NOTE(review): src is taken as a non-const pointer although it is presumably only read — confirm.
Status memcpyBlocking(Surface& dst, uint32_t* src);

// 2D copy between two uint32_t device buffers. Declaration only.
// NOTE(review): presumably copies a region_width x region_height rectangle from src, starting at
// (src_origin_width, src_origin_height), to dst at (dst_origin_width, dst_origin_height), with
// src_pitch / dst_pitch as the row strides. Whether these are in bytes or elements is not
// visible here — confirm against the definition.
Status memcpy2DAsync(Buffer<uint32_t> dst, Buffer<uint32_t> src, size_t src_origin_width, size_t src_origin_height,
                     size_t dst_origin_width, size_t dst_origin_height, size_t region_width, size_t region_height,
                     size_t src_pitch, size_t dst_pitch, const Stream& stream);

// Copies six uint32_t device buffers into the cubemap surface dst. Declaration only.
// NOTE(review): presumably srcXP/srcXN/srcYP/srcYN/srcZP/srcZN are the +X/-X/+Y/-Y/+Z/-Z faces
// and faceDim is the edge length of each square face — confirm against the definition.
Status memcpyCubemapAsync(CubemapSurface& dst, Buffer<uint32_t> srcXP, Buffer<uint32_t> srcXN, Buffer<uint32_t> srcYP,
                          Buffer<uint32_t> srcYN, Buffer<uint32_t> srcZP, Buffer<uint32_t> srcZN, size_t faceDim,
                          const Stream& stream);

}  // namespace GPU
}  // namespace VideoStitch