// Copyright (c) 2012-2017 VideoStitch SAS
// Copyright (c) 2018 stitchEm

#ifndef __APPLE__
#define GLEW_STATIC
#include <GL/glew.h>
#include <GL/gl.h>
#else
#include <GL/glew.h>
#include <OpenGL/gl.h>
#endif
#include "cuda/error.hpp"
#include "../common/glAllocator.hpp"
#include <cuda_gl_interop.h>
#include <cuda_runtime.h>

#include "surface.hpp"
#include "deviceBuffer.hpp"

#include "../common/allocStats.hpp"

#include "gpu/allocator.hpp"
#include "gpu/memcpy.hpp"

namespace VideoStitch {
namespace Core {

unsigned int getCudaGLMemAllocType(OpenGLAllocator::BufferAllocType flag) {
  switch (flag) {
    case OpenGLAllocator::BufferAllocType::ReadWrite:
      return cudaGraphicsMapFlagsNone;
    case OpenGLAllocator::BufferAllocType::ReadOnly:
      return cudaGraphicsMapFlagsReadOnly;
    case OpenGLAllocator::BufferAllocType::WriteOnly:
      return cudaGraphicsMapFlagsWriteDiscard;
  }

  assert(false);
  return cudaGraphicsMapFlagsNone;
}

Potential<SourceSurface> OffscreenAllocator::createAlphaSurface(size_t width, size_t height, const char* name) {
  cudaArray_t array;
  cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsigned);
  FAIL_RETURN(CUDA_ERROR(cudaMallocArray(&array, &channelDesc, width, height, cudaArraySurfaceLoadStore)));
  deviceStats.addPtr(name, array, width * height);

  // Specify surface
  struct cudaResourceDesc resDesc = {};
  resDesc.resType = cudaResourceTypeArray;
  // Create the surface objects
  resDesc.res.array.array = array;
  cudaSurfaceObject_t surface;
  FAIL_RETURN(CUDA_ERROR(cudaCreateSurfaceObject(&surface, &resDesc)));
  // create a texture
  cudaTextureObject_t tex;
  cudaTextureDesc texDesc = {};
  texDesc.filterMode = cudaFilterModeLinear;
  texDesc.addressMode[0] = cudaAddressModeWrap;
  texDesc.addressMode[1] = cudaAddressModeClamp;
  texDesc.normalizedCoords = false;
  texDesc.readMode = cudaReadModeNormalizedFloat;
  cudaResourceViewDesc resViewDesc = {};
  resViewDesc.format = cudaResViewFormatUnsignedChar1;
  resViewDesc.width = width;
  resViewDesc.height = height;
  cudaCreateTextureObject(&tex, &resDesc, &texDesc, &resViewDesc);

  auto gsurface = new GPU::Surface(new GPU::DeviceSurface(array, tex, surface, true), width, height);

  Potential<SourceSurface::Pimpl> impl = SourceSurface::Pimpl::create(gsurface);
  FAIL_RETURN(impl.status());

  return new SourceSurface(impl.release());
}

namespace {
Potential<GPU::Surface> makeSurface(std::string name, size_t width, size_t height) {
  cudaArray_t array;
  cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsigned);
  FAIL_RETURN(CUDA_ERROR(cudaMallocArray(&array, &channelDesc, width, height, cudaArraySurfaceLoadStore)));
  deviceStats.addPtr(name.c_str(), array, width * height * 4);

  // Specify surface
  struct cudaResourceDesc resDesc = {};
  resDesc.resType = cudaResourceTypeArray;
  // Create the surface objects
  resDesc.res.array.array = array;
  cudaSurfaceObject_t surface;
  FAIL_RETURN(CUDA_ERROR(cudaCreateSurfaceObject(&surface, &resDesc)));
  // create a texture
  cudaTextureObject_t tex;
  cudaTextureDesc texDesc = {};
  texDesc.filterMode = cudaFilterModeLinear;
  texDesc.addressMode[0] = cudaAddressModeWrap;
  texDesc.addressMode[1] = cudaAddressModeClamp;
  texDesc.normalizedCoords = false;
  texDesc.readMode = cudaReadModeNormalizedFloat;
  cudaResourceViewDesc resViewDesc = {};
  resViewDesc.format = cudaResViewFormatUnsignedChar4;
  resViewDesc.width = width;
  resViewDesc.height = height;
  FAIL_RETURN(CUDA_ERROR(cudaCreateTextureObject(&tex, &resDesc, &texDesc, &resViewDesc)));
  return new GPU::Surface(new GPU::DeviceSurface(array, tex, surface, true), width, height);
}

Potential<GPU::Surface> makeSurface_F32_C1(std::string name, size_t width, size_t height) {
  cudaArray_t array;
  cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
  FAIL_RETURN(CUDA_ERROR(cudaMallocArray(&array, &channelDesc, width, height, cudaArraySurfaceLoadStore)));
  deviceStats.addPtr(name.c_str(), array, width * height * sizeof(float));

  // Specify surface
  struct cudaResourceDesc resDesc;
  memset(&resDesc, 0, sizeof(resDesc));
  resDesc.resType = cudaResourceTypeArray;
  // Create the surface objects
  resDesc.res.array.array = array;
  cudaSurfaceObject_t surface;
  FAIL_RETURN(CUDA_ERROR(cudaCreateSurfaceObject(&surface, &resDesc)));
  // create a texture
  cudaTextureObject_t tex;
  cudaTextureDesc texDesc = {};
  texDesc.filterMode = cudaFilterModeLinear;
  texDesc.addressMode[0] = cudaAddressModeWrap;
  texDesc.addressMode[1] = cudaAddressModeClamp;
  texDesc.normalizedCoords = false;
  texDesc.readMode = cudaReadModeElementType;
  cudaResourceViewDesc resViewDesc;
  memset(&resViewDesc, 0, sizeof(cudaResourceViewDesc));
  resViewDesc.format = cudaResViewFormatFloat1;
  resViewDesc.width = width;
  resViewDesc.height = height;
  FAIL_RETURN(CUDA_ERROR(cudaCreateTextureObject(&tex, &resDesc, &texDesc, &resViewDesc)));
  return new GPU::Surface(new GPU::DeviceSurface(array, tex, surface, true), width, height);
}
}  // namespace

Potential<SourceSurface> OffscreenAllocator::createSourceSurface(size_t width, size_t height, const char* name) {
  Potential<GPU::Surface> potSurf = makeSurface(name, width, height);
  if (!potSurf.ok()) {
    return potSurf.status();
  }

  Potential<SourceSurface::Pimpl> impl = SourceSurface::Pimpl::create(potSurf.release());
  FAIL_RETURN(impl.status());

  return new SourceSurface(impl.release());
}

Potential<SourceSurface> OffscreenAllocator::createDepthSurface(size_t width, size_t height, const char* name) {
  Potential<GPU::Surface> potSurf = makeSurface_F32_C1(name, width, height);
  if (!potSurf.ok()) {
    return potSurf.status();
  }

  Potential<SourceSurface::Pimpl> impl = SourceSurface::Pimpl::create(potSurf.release());
  FAIL_RETURN(impl.status());

  return new SourceSurface(impl.release());
}

Potential<SourceSurface> OffscreenAllocator::createCoordSurface(size_t width, size_t height, const char* name) {
  cudaArray_t array;
  cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 32, 0, 0, cudaChannelFormatKindFloat);
  FAIL_RETURN(CUDA_ERROR(cudaMallocArray(&array, &channelDesc, width, height, cudaArraySurfaceLoadStore)));
  deviceStats.addPtr(name, array, width * height * 8);

  // Specify surface
  struct cudaResourceDesc resDesc = {};
  resDesc.resType = cudaResourceTypeArray;
  // Create the surface objects
  resDesc.res.array.array = array;
  cudaSurfaceObject_t surface;
  FAIL_RETURN(CUDA_ERROR(cudaCreateSurfaceObject(&surface, &resDesc)));
  // create a texture
  cudaTextureObject_t tex;
  cudaTextureDesc texDesc = {};
  texDesc.filterMode = cudaFilterModeLinear;
  texDesc.addressMode[0] = cudaAddressModeWrap;
  texDesc.addressMode[1] = cudaAddressModeClamp;
  texDesc.normalizedCoords = false;
  texDesc.readMode = cudaReadModeElementType;
  cudaResourceViewDesc resViewDesc = {};
  resViewDesc.format = cudaResViewFormatFloat2;
  resViewDesc.width = width;
  resViewDesc.height = height;
  FAIL_RETURN(CUDA_ERROR(cudaCreateTextureObject(&tex, &resDesc, &texDesc, &resViewDesc)));

  auto gsurface = new GPU::Surface(new GPU::DeviceSurface(array, tex, surface, true), width, height);

  Potential<SourceSurface::Pimpl> impl = SourceSurface::Pimpl::create(gsurface);
  FAIL_RETURN(impl.status());

  return new SourceSurface(impl.release());
}

Potential<SourceOpenGLSurface> OpenGLAllocator::createSourceSurface(size_t width, size_t height) {
  auto allocPotSurf = [](GLuint texture, size_t width, size_t height) -> Potential<SourceOpenGLSurface> {
    cudaGraphicsResource* image;
    FAIL_RETURN(CUDA_ERROR(
        cudaGraphicsGLRegisterImage(&image, texture, GL_TEXTURE_2D, cudaGraphicsRegisterFlagsSurfaceLoadStore)))
    FAIL_RETURN(CUDA_ERROR(cudaGraphicsMapResources(1, &image, cudaStreamPerThread)))

    cudaArray_t array;
    FAIL_RETURN(CUDA_ERROR(cudaGraphicsSubResourceGetMappedArray(&array, image, 0, 0)))
    FAIL_RETURN(CUDA_ERROR(cudaGraphicsUnmapResources(1, &image, cudaStreamPerThread)))

    // Specify surface
    struct cudaResourceDesc resDesc = {};
    resDesc.resType = cudaResourceTypeArray;
    // Create the surface objects
    resDesc.res.array.array = array;
    cudaSurfaceObject_t surface;
    FAIL_RETURN(CUDA_ERROR(cudaCreateSurfaceObject(&surface, &resDesc)))

    // create a texture
    cudaTextureObject_t tex;
    cudaTextureDesc texDesc = {};
    texDesc.filterMode = cudaFilterModeLinear;
    texDesc.addressMode[0] = cudaAddressModeClamp;
    texDesc.addressMode[1] = cudaAddressModeClamp;
    texDesc.normalizedCoords = false;
    texDesc.readMode = cudaReadModeNormalizedFloat;

    cudaResourceViewDesc resViewDesc = {};
    resViewDesc.format = cudaResViewFormatUnsignedChar4;
    resViewDesc.width = width;
    resViewDesc.height = height;
    FAIL_RETURN(CUDA_ERROR(cudaCreateTextureObject(&tex, &resDesc, &texDesc, &resViewDesc)))

    {
      auto gsurface = std::make_unique<GPU::Surface>(new GPU::DeviceSurface(array, tex, surface, false), width, height);
      Potential<SourceOpenGLSurface::Pimpl> impl = SourceOpenGLSurface::Pimpl::create(image, std::move(gsurface));
      FAIL_RETURN(impl.status())

      SourceOpenGLSurface* surf = new SourceOpenGLSurface(impl.release());
      surf->texture = texture;

      return surf;
    }
  };

  PotentialValue<GLuint> potTexture = createSourceSurfaceTexture(width, height);
  FAIL_RETURN(potTexture.status());

  GLuint texture = potTexture.value();

  auto potSurf = allocPotSurf(texture, width, height);

  if (!potSurf.ok()) {
    glDeleteTextures(1, &texture);
    return Status{Origin::GPU, ErrType::RuntimeError, "Could not map OpenGL surface to CUDA.", potSurf.status()};
  }

  return potSurf;
}

Potential<PanoOpenGLSurface> OpenGLAllocator::createPanoSurface(size_t width, size_t height, BufferAllocType flag) {
  auto allocPotSurf = [](GLuint pixelbuffer, size_t width, size_t height,
                         BufferAllocType flag) -> Potential<PanoOpenGLSurface> {
    cudaGraphicsResource* pbo;
    unsigned int memFlag = getCudaGLMemAllocType(flag);

    FAIL_RETURN(CUDA_ERROR(cudaGraphicsGLRegisterBuffer(&pbo, pixelbuffer, memFlag)))
    FAIL_RETURN(CUDA_ERROR(cudaGraphicsMapResources(1, &pbo, cudaStreamPerThread)))

    void* devPtr;
    size_t size;
    FAIL_RETURN(CUDA_ERROR(cudaGraphicsResourceGetMappedPointer(&devPtr, &size, pbo)))

    FAIL_RETURN(CUDA_ERROR(cudaGraphicsUnmapResources(1, &pbo, cudaStreamPerThread)))

    GPU::Buffer<uint32_t> buffer = GPU::DeviceBuffer<uint32_t>::createBuffer((uint32_t*)devPtr, width * height);
    FAIL_RETURN(GPU::memsetToZeroBlocking(buffer, width * height * sizeof(uint32_t)))

    Potential<GPU::Surface> potremapSurf = makeSurface("Remap Buffer", width, height);
    FAIL_RETURN(potremapSurf.status())

    Potential<PanoOpenGLSurface::Pimpl> impl =
        PanoOpenGLSurface::Pimpl::create(pbo, buffer, potremapSurf.object(), width, height);
    FAIL_RETURN(impl.status())

    // ownership transferred to impl
    potremapSurf.release();

    PanoOpenGLSurface* surf = new PanoOpenGLSurface(impl.release());
    surf->pixelbuffer = pixelbuffer;
    surf->pimpl->externalAlloc = true;

    return surf;
  };

  auto pbtex = createPanoSurfacePB(width, height);
  FAIL_RETURN(pbtex.status());

  GLuint pixelbuffer = pbtex.value();

  auto potSurf = allocPotSurf(pixelbuffer, width, height, flag);

  if (!potSurf.ok()) {
    glDeleteBuffers(1, &pixelbuffer);
    return Status{Origin::GPU, ErrType::RuntimeError, "Could not map OpenGL surface to CUDA.", potSurf.status()};
  }

  return potSurf;
}

namespace {
Potential<GPU::CubemapSurface> makeCubemapSurface(std::string name, size_t width) {
  cudaArray_t array;
  cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsigned);
  FAIL_RETURN(CUDA_ERROR(cudaMalloc3DArray(&array, &channelDesc, make_cudaExtent(width, width, 6),
                                           cudaArrayCubemap | cudaArraySurfaceLoadStore)));
  deviceStats.addPtr("Remap Buffer", array, width * width * 6);

  // Specify surface
  struct cudaResourceDesc resDesc = {};
  resDesc.resType = cudaResourceTypeArray;
  // Create the surface objects
  resDesc.res.array.array = array;
  cudaSurfaceObject_t surface;
  FAIL_RETURN(CUDA_ERROR(cudaCreateSurfaceObject(&surface, &resDesc)));
  // create a texture
  cudaTextureObject_t tex;

  cudaTextureDesc texDesc = {};
  texDesc.filterMode = cudaFilterModeLinear;
  texDesc.addressMode[0] = cudaAddressModeClamp;
  texDesc.addressMode[1] = cudaAddressModeClamp;
  texDesc.normalizedCoords = false;
  texDesc.readMode = cudaReadModeNormalizedFloat;

  cudaResourceViewDesc resViewDesc = {};
  resViewDesc.format = cudaResViewFormatUnsignedChar4;
  resViewDesc.width = width;
  resViewDesc.height = width;
  resViewDesc.depth = 6;
  cudaCreateTextureObject(&tex, &resDesc, &texDesc, &resViewDesc);

  return new GPU::CubemapSurface(new GPU::DeviceCubemapSurface(array, tex, surface, true), width);
}
}  // namespace

Potential<CubemapOpenGLSurface> OpenGLAllocator::createCubemapSurface(size_t width, bool equiangular,
                                                                      BufferAllocType flag) {
  std::array<GLuint, 6> pbo;

  glewInit();
  glEnable(GL_TEXTURE_CUBE_MAP);
#ifdef GL_VERSION_3_2
  glEnable(GL_TEXTURE_CUBE_MAP_SEAMLESS);
#endif
  {
    // clear error flag before mapping to CUDA
    GLenum glerr = glGetError();
    while (glerr != GL_NO_ERROR) glerr = glGetError();
  }

  auto allocPotSurf = [](const std::array<GLuint, 6>& pbo, size_t width, bool equiangular,
                         BufferAllocType flag) -> Potential<CubemapOpenGLSurface> {
    GLenum glerr = glGetError();
    if (glerr != GL_NO_ERROR) {
      return {Origin::GPU, ErrType::RuntimeError, "Could not allocate OpenGL buffer."};
    }

    cudaGraphicsResource* resources[6];
    GPU::Buffer<uint32_t> buffers[6];
    unsigned int memFlag = getCudaGLMemAllocType(flag);
    for (int i = 0; i < 6; ++i) {
      FAIL_RETURN(CUDA_ERROR(cudaGraphicsGLRegisterBuffer(&resources[i], pbo[i], memFlag)))
      FAIL_RETURN(CUDA_ERROR(cudaGraphicsMapResources(1, &resources[i], cudaStreamPerThread)))

      void* devPtr;
      size_t size;
      FAIL_RETURN(CUDA_ERROR(cudaGraphicsResourceGetMappedPointer(&devPtr, &size, resources[i])))

      buffers[i] = GPU::DeviceBuffer<uint32_t>::createBuffer((uint32_t*)devPtr, width * width);
      FAIL_RETURN(CUDA_ERROR(cudaGraphicsUnmapResources(1, &resources[i], cudaStreamPerThread)))
    }

    PotentialValue<GPU::Buffer<uint32_t>> buf = GPU::Buffer<uint32_t>::allocate(6 * width * width, "Offscreen Surface");
    FAIL_RETURN(buf.status())

    PotentialValue<GPU::Buffer<uint32_t>> potBuf = GPU::Buffer<uint32_t>::allocate(width * width, "Cubemap");
    FAIL_RETURN(potBuf.status())

    Potential<GPU::CubemapSurface> remapSurf = makeCubemapSurface("Remap Buffer", width);
    FAIL_RETURN(remapSurf.status())

    Potential<CubemapOpenGLSurface::Pimpl> impl = CubemapOpenGLSurface::Pimpl::create(
        resources, buffers, buf.value(), remapSurf.release(), potBuf.value(), width, equiangular);
    FAIL_RETURN(impl.status())
    impl->externalAlloc = true;

    return new CubemapOpenGLSurface(impl.release(), (int*)pbo.data());
  };

  for (int i = 0; i < 6; ++i) {
    glGenBuffers(1, &pbo[i]);
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo[i]);
    glBufferData(GL_PIXEL_UNPACK_BUFFER, width * width * 4, NULL, GL_STREAM_DRAW);
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
  }

  auto potSurf = allocPotSurf(pbo, width, equiangular, flag);

  if (!potSurf.ok()) {
    glDeleteBuffers(6, pbo.data());
    return Status{Origin::GPU, ErrType::RuntimeError, "Could not allocate OpenGL Surface.", potSurf.status()};
  }

  return potSurf;
}

Potential<PanoSurface> OffscreenAllocator::createPanoSurface(size_t width, size_t height, const char* name) {
  Status status;
  GPU::Buffer<uint32_t> buf;
  GPU::Surface* remapSurf = nullptr;

  PotentialValue<GPU::Buffer<uint32_t>> potbuf = GPU::Buffer<uint32_t>::allocate(width * height, name);
  status = potbuf.status();
  if (!status.ok()) {
    return status;
  }
  buf = potbuf.value();
  GPU::memsetToZeroBlocking(buf, width * height * sizeof(uint32_t));

  {
    Potential<GPU::Surface> potremapSurf = makeSurface("Remap Buffer", width, height);
    status = potremapSurf.status();
    if (!status.ok()) {
      goto error;
    }
    remapSurf = potremapSurf.release();
    Potential<PanoPimpl> impl = PanoPimpl::create(buf, remapSurf, width, height);
    status = impl.status();
    if (!status.ok()) {
      goto error;
    }
    PanoSurface* surf = new PanoSurface(impl.release());

    surf->pimpl->externalAlloc = false;

    return Potential<PanoSurface>(surf);
  }

error:
  buf.release();
  delete remapSurf;
  return status;
}

Potential<CubemapSurface> OffscreenAllocator::createCubemapSurface(size_t width, const char* name, bool equiangular) {
  GPU::Stream stream;
  GPU::Buffer<uint32_t> faces[6];
  GPU::Buffer<uint32_t> buf, tmp;
  Status status;

  for (int i = 0; i < 6; ++i) {
    PotentialValue<GPU::Buffer<uint32_t>> buf = GPU::Buffer<uint32_t>::allocate(width * width, name);
    status = buf.status();
    if (!status.ok()) {
      goto error_1;
    }
    GPU::memsetToZeroBlocking(buf.value(), width * width * sizeof(uint32_t));
    faces[i] = buf.value();
  }

  {
    PotentialValue<GPU::Stream> potStream = GPU::Stream::create();
    status = potStream.status();
    if (!status.ok()) {
      goto error_1;
    }
    stream = potStream.value();

    PotentialValue<GPU::Buffer<uint32_t>> potBuf =
        GPU::Buffer<uint32_t>::allocate(6 * width * width, "Offscreen Surface");
    status = potBuf.status();
    if (!status.ok()) {
      goto error_2;
    }
    buf = potBuf.value();

    potBuf = GPU::Buffer<uint32_t>::allocate(width * width, "Cubemap");
    status = potBuf.status();
    if (!status.ok()) {
      goto error_3;
    }
    tmp = potBuf.value();

    Potential<GPU::CubemapSurface> remapSurf = makeCubemapSurface("Remap Buffer", width);
    status = remapSurf.status();
    if (!status.ok()) {
      goto error_4;
    }

    CubemapPimpl* impl = new CubemapPimpl(equiangular, stream, &faces[0], buf, remapSurf.release(), tmp, width);
    CubemapSurface* surf = new CubemapSurface(impl);

    surf->pimpl->externalAlloc = false;

    return Potential<CubemapSurface>(surf);
  }

error_4:
  tmp.release();
error_3:
  buf.release();
error_2:
  stream.destroy();
error_1:
  for (int i = 0; i < 6; ++i) {
    faces[i].release();
  }

  return status;
}

SourceOpenGLSurface::SourceOpenGLSurface(Pimpl* pimpl) : SourceSurface(pimpl), texture(0) {}

SourceOpenGLSurface::~SourceOpenGLSurface() { glDeleteTextures(1, (GLuint*)&texture); }

PanoOpenGLSurface::PanoOpenGLSurface(Pimpl* pimpl) : PanoSurface(pimpl), pixelbuffer(0) {}

PanoOpenGLSurface::~PanoOpenGLSurface() { glDeleteBuffers(1, (GLuint*)&pixelbuffer); }

CubemapOpenGLSurface::CubemapOpenGLSurface(Pimpl* pimpl, int* f) : CubemapSurface(pimpl) {
  for (int i = 0; i < 6; ++i) {
    faces[i] = f[i];
  }
}

CubemapOpenGLSurface::~CubemapOpenGLSurface() {
  for (int i = 0; i < 6; ++i) {
    glDeleteBuffers(1, (GLuint*)&faces[i]);
  }
}

}  // namespace Core

}  // namespace VideoStitch