// Copyright (c) 2012-2017 VideoStitch SAS
// Copyright (c) 2018 stitchEm

#include "libvideostitch/opengl.hpp"

#include "gpu/buffer.hpp"
#include "backend/cuda/deviceBuffer.hpp"
#include "backend/cuda/deviceStream.hpp"
#include "cuda/memory.hpp"
#include "cuda/error.hpp"

#include "libvideostitch/input.hpp"
#include "libvideostitch/logging.hpp"

#ifndef __APPLE__
#define GLEW_STATIC
#include <GL/glew.h>
#include <GL/gl.h>
#else
#include <GL/glew.h>
#include <OpenGL/gl.h>
#endif

#include <cuda.h>
#include <cudaGL.h>

namespace VideoStitch {

// Progress marker for the lazy OpenGL/CUDA interop setup.
// The enumerators are listed in the order in which the setup reaches them;
// the teardown code walks them backwards.
enum OpenGLInitState {
  NOT_INITIALIZED = 0,     // nothing has been created yet
  CONTEXT_CREATED = 1,     // the CUDA context exists
  GL_BUFFER_CREATED = 2,   // the GL buffer object has been generated
  CUDA_INTEROP = 3,        // the GL buffer is registered with CUDA
  GL_TEXTURE_CREATED = 4,  // the GL texture exists and is configured
  PREPARED = 5,            // setup finished, uploads are allowed
};

std::vector<int> getGLDevices() {
  std::vector<int> glDevices;
  unsigned int cudaDeviceCount;
  int cudaDevices[1];

  CUresult result = cuGLGetDevices(&cudaDeviceCount, cudaDevices, 1, CU_GL_DEVICE_LIST_ALL);
  if (result != CUDA_SUCCESS) {
    // iMac / OS X will return "operation not supported" on the cuGLGetDevices call
    // fall back to device 0
    glDevices.push_back(0);
    return glDevices;
  }

  for (unsigned i = 0; i < cudaDeviceCount; ++i) {
    glDevices.push_back(cudaDevices[i]);
  }

  return glDevices;
}

/**
 * Private implementation of OpenGLUpload.
 *
 * Owns the CUDA context, the GL buffer object and the GL texture used to
 * move a frame that lives in CUDA device memory into an OpenGL texture.
 * The resources are created lazily on the first upload() call and torn down
 * by cleanState() (also invoked from the destructor).
 */
class OpenGLUpload::Pimpl {
 public:
  Pimpl();
  ~Pimpl();

  // Copies the device frame `video` into the GL texture, running the storage
  // setup first when `state` is NOT_INITIALIZED.
  Status upload(PixelFormat fmt, int width, int height, const char *video);

  // Creates the CUDA context, the GL buffer, its CUDA registration and the GL
  // texture for frames of the given size. Starts by calling cleanState().
  Status initializeStorageConfiguration(PixelFormat fmt, int frameWidth, int frameHeight);
  // Releases whatever initializeStorageConfiguration() created, according to
  // `state`, and resets `state` to NOT_INITIALIZED.
  void cleanState();

  // Frame size the texture was allocated for; reset to 0 by cleanState().
  int textureWidth, textureHeight;
  // GL texture name obtained from glGenTextures.
  GLuint textureId;

  // How far the lazy setup has progressed.
  OpenGLInitState state;
  // Staging buffer, allocated on first use by upload() when the source frame
  // does not live on `openGLDevice`.
  Cuda::DeviceUniquePtr<uint32_t> transfer;
  // Intermediate buffer allocated by the setup for the YV12 format.
  Cuda::DeviceUniquePtr<uint32_t> rgbaPanorama;
  // First device returned by getGLDevices().
  CUdevice openGLDevice;
  // GL buffer object name obtained from glGenBuffers and registered with CUDA.
  unsigned int bufferId;
  // CUDA context created with cuGLCtxCreate on `openGLDevice`.
  CUcontext ctx;
  // Stream created in the constructor and destroyed in the destructor.
  GPU::Stream gpuStream;
  // Value of gpuStream.get(), cached by the constructor.
  CUstream stream;
};

/**
 * Builds an implementation object with no GL/CUDA interop resources yet
 * (state NOT_INITIALIZED) and creates the GPU stream it keeps for its
 * whole lifetime.
 *
 * Every plain member is given a defined value, in declaration order, so that
 * no handle is ever read while indeterminate.
 */
OpenGLUpload::Pimpl::Pimpl()
    : textureWidth(0),
      textureHeight(0),
      textureId(0),
      state(NOT_INITIALIZED),
      openGLDevice(0),
      bufferId(0),
      ctx(nullptr),
      stream(nullptr) {
  // NOTE(review): the result of create() is used without checking that the
  // creation succeeded - confirm what value() yields on failure.
  gpuStream = GPU::Stream::create().value();
  stream = gpuStream.get();
}

/**
 * Releases the GL/CUDA interop resources first, then destroys the stream
 * that was created by the constructor.
 */
OpenGLUpload::Pimpl::~Pimpl() {
  this->cleanState();
  this->gpuStream.destroy();
}

void OpenGLUpload::Pimpl::cleanState() {
  if (state == NOT_INITIALIZED) return;

  GLenum err;
  if (state == GL_TEXTURE_CREATED || state == PREPARED) {
    err = glGetError();
    if (err != GL_NO_ERROR) {
      Logger::get(Logger::Error) << "OpenGL error " << err << " at cleanState" << std::endl;
    }
    // no need to unbind the texture, it is unbind
    // after glDeleteTextures
    glDeleteTextures(1, &textureId);
    err = glGetError();
    if (err != GL_NO_ERROR) {
      Logger::get(Logger::Error) << "OpenGL error " << err << " on glDeleteTextures" << std::endl;
    }
    state = CUDA_INTEROP;
  }

  if (state == CUDA_INTEROP) {
    cuGLUnregisterBufferObject(bufferId);
    state = GL_BUFFER_CREATED;
  }

  if (state == GL_BUFFER_CREATED) {
    // no need to unbind buffers, it is unbind
    // after glDeleteTextures
    glDeleteBuffers(1, &bufferId);
    err = glGetError();
    if (err != GL_NO_ERROR) {
      Logger::get(Logger::Error) << "OpenGL error " << err << " on glDeleteBuffers" << std::endl;
    }

    state = CONTEXT_CREATED;
  }

  if (state == CONTEXT_CREATED) {
    cuCtxDestroy(ctx);
    textureWidth = 0;
    textureHeight = 0;
  }

  state = NOT_INITIALIZED;
}

/**
 * Creates the GL/CUDA interop storage for frames of the given size.
 *
 * Steps, in order: GLEW and GL capability checks, creation of a CUDA context
 * on the device that drives OpenGL, allocation of a GL pixel-unpack buffer of
 * frameWidth * frameHeight * 4 bytes, registration of that buffer with CUDA,
 * and creation of an RGBA8 texture of the same size. `state` records how far
 * the setup went; on success it is PREPARED.
 *
 * Any previously created storage is released first. On failure everything
 * created so far is released again and `state` is NOT_INITIALIZED.
 *
 * @param fmt         Pixel format of the frames that will be uploaded; YV12
 *                    additionally allocates the intermediate RGBA buffer.
 * @param frameWidth  Frame width in pixels, must be > 0.
 * @param frameHeight Frame height in pixels, must be > 0.
 * @return Status::OK() on success, the failure otherwise.
 */
Status OpenGLUpload::Pimpl::initializeStorageConfiguration(PixelFormat fmt, int frameWidth, int frameHeight) {
  Status ret;

  cleanState();

  if (frameWidth <= 0 || frameHeight <= 0) {
    return {Origin::GPU, ErrType::SetupFailure, "The frame size must be strictly positive"};
  }

  /*Enable OpenGL extensions*/
  GLenum glErr = glewInit();
  if (glErr != GLEW_OK) {
    return {Origin::GPU, ErrType::SetupFailure, "Unable to initialize glew"};
  }

  /*Enable texture 2d for opengl*/
  glEnable(GL_TEXTURE_2D);
  glErr = glGetError();
  if (glErr != GL_NO_ERROR) {
    return {Origin::GPU, ErrType::SetupFailure, "GL_TEXTURE_2D is not available."};
  }

  /*Get the maximum available texture size*/
  GLint maxTextureSize = 0;
  glGetIntegerv(GL_MAX_TEXTURE_SIZE, &maxTextureSize);
  if (frameWidth > maxTextureSize || frameHeight > maxTextureSize) {
    return {Origin::GPU, ErrType::SetupFailure,
            "The frame size is not supported by OpenGL (maxTextureSize = " + std::to_string(maxTextureSize) + ")"};
  }

  // Sizes are computed in size_t: width * height * 4 does not fit in an int
  // for the largest textures OpenGL may accept.
  const size_t pixelCount = static_cast<size_t>(frameWidth) * static_cast<size_t>(frameHeight);

  /*Retrieve the device which do the openGL work*/
  const std::vector<int> devices = getGLDevices();
  if (devices.empty()) {
    return {Origin::GPU, ErrType::SetupFailure, "No CUDA device is associated with the OpenGL context"};
  }
  openGLDevice = devices[0];

  ret = CUDA_ERROR(cuGLCtxCreate(&ctx, CU_CTX_SCHED_AUTO, openGLDevice));
  if (!ret.ok()) {
    cleanState();
    return ret;
  }

  state = CONTEXT_CREATED;

  // NOTE(review): context creation presumably already makes `ctx` current, in
  // which case this push leaves one extra entry on the stack after the pop at
  // the end of this function - confirm whether that is intended.
  ret = CUDA_ERROR(cuCtxPushCurrent(ctx));
  if (!ret.ok()) {
    cleanState();
    return ret;
  }

  textureWidth = frameWidth;
  textureHeight = frameHeight;

  /*Intermediate buffer for YV12 conversion*/
  if (fmt == VideoStitch::YV12) {
    ret = rgbaPanorama.alloc(pixelCount, "OpenGL");
    if (!ret.ok()) {
      cleanState();
      return ret;
    }
  }

  /* 1- Allocate a GL buffer the size of the image. */
  glGenBuffers(1, &bufferId);
  glErr = glGetError();
  if (glErr != GL_NO_ERROR) {
    cleanState();
    return {Origin::GPU, ErrType::UnsupportedAction, "Error on glGenBuffers"};
  }
  state = GL_BUFFER_CREATED;

  /* Make this the current UNPACK buffer (OpenGL is state-based).*/
  glBindBuffer(GL_PIXEL_UNPACK_BUFFER, bufferId);
  glErr = glGetError();
  if (glErr != GL_NO_ERROR) {
    cleanState();
    return {Origin::GPU, ErrType::UnsupportedAction, "GL_PIXEL_UNPACK_BUFFER is not available"};
  }

  /* Allocate data for the buffer. 4-channel 8-bit image */
  glBufferData(GL_PIXEL_UNPACK_BUFFER, static_cast<GLsizeiptr>(pixelCount) * 4, NULL, GL_DYNAMIC_DRAW);
  glErr = glGetError();
  if (glErr != GL_NO_ERROR) {
    cleanState();
    return {Origin::GPU, ErrType::UnsupportedAction, "Unable to allocate the GL_PIXEL_UNPACK_BUFFER storage"};
  }

  /*Associate GL buffer with cuda memory space */
  ret = CUDA_ERROR(cuGLRegisterBufferObject(bufferId));
  if (!ret.ok()) {
    cleanState();
    return ret;
  }

  state = CUDA_INTEROP;

  // 2- Allocate a GL texture the size of the image
  glGenTextures(1, &textureId);
  glErr = glGetError();
  if (glErr != GL_NO_ERROR) {
    cleanState();
    return {Origin::GPU, ErrType::UnsupportedAction, "glGenTextures returned error"};
  }

  /*Activate texture*/
  glBindTexture(GL_TEXTURE_2D, textureId);
  glErr = glGetError();
  if (glErr != GL_NO_ERROR) {
    cleanState();
    return {Origin::GPU, ErrType::UnsupportedAction, "GL_TEXTURE_2D is not available"};
  }

  glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, textureWidth, textureHeight, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
  glErr = glGetError();
  if (glErr != GL_NO_ERROR) {
    cleanState();
    return {Origin::GPU, ErrType::SetupFailure, "Unable to initialize OpenGL texture"};
  }

  /*Texture parameters*/
  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
  glErr = glGetError();
  if (glErr != GL_NO_ERROR) {
    cleanState();
    return {Origin::GPU, ErrType::SetupFailure, "Unable to set OpenGL texture min filter to GL_LINEAR"};
  }
  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
  glErr = glGetError();
  if (glErr != GL_NO_ERROR) {
    cleanState();
    return {Origin::GPU, ErrType::SetupFailure, "Unable to set OpenGL texture mag filter to GL_LINEAR"};
  }
  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
  glErr = glGetError();
  if (glErr != GL_NO_ERROR) {
    cleanState();
    return {Origin::GPU, ErrType::SetupFailure, "Unable to set OpenGL texture wrap S to GL_CLAMP_TO_EDGE"};
  }
  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
  glErr = glGetError();
  if (glErr != GL_NO_ERROR) {
    cleanState();
    return {Origin::GPU, ErrType::SetupFailure, "Unable to set OpenGL texture wrap T to GL_CLAMP_TO_EDGE"};
  }

  state = GL_TEXTURE_CREATED;

  // Pop into a scratch variable so that the stored handle cannot be replaced
  // by whatever context happens to be on top of the stack
  CUcontext popped;
  ret = CUDA_ERROR(cuCtxPopCurrent(&popped));
  if (!ret.ok()) {
    cleanState();
    return ret;
  }

  state = PREPARED;

  return Status::OK();
}

std::mutex opengl_mutex;

Status OpenGLUpload::Pimpl::upload(VideoStitch::PixelFormat fmt, int width, int height, const char *video) {
  // late storage configuration
  if (state == NOT_INITIALIZED) {
    /*
      Need to create a mutex locker here to prevent computer with old GPUs (for example: GT 640) from freezing.
      The problem likely comes from a bug of the old graphic card driver that hangs computer when multiple threads are
      calling the "cuGLCtxCreate" function.
      */
    std::lock_guard<std::mutex> guard(opengl_mutex);
    Status initStatus = initializeStorageConfiguration(fmt, width, height);
    if (!initStatus.ok()) {
      cleanState();
    }
    FAIL_RETURN(initStatus);
  }

  if (state != PREPARED) {
    return {Origin::GPU, ErrType::ImplementationError, "OpenGL output has not been initialized completely"};
  }

  Status ret;

  PROPAGATE_FAILURE_STATUS(CUDA_ERROR(cuCtxPushCurrent(ctx)));

  // 3- Map the GL buffer to CUDA memory
  CUdeviceptr glBuffer;
  size_t size;
  PROPAGATE_FAILURE_STATUS(CUDA_ERROR(cuGLMapBufferObject(&glBuffer, &size, bufferId)));

  // 4- Write the image from CUDA to the mapped memory

  // Which GPU owns the video buffer ?
  CUcontext origin;
  PROPAGATE_FAILURE_STATUS(
      CUDA_ERROR(cuPointerGetAttribute(&origin, CU_POINTER_ATTRIBUTE_CONTEXT, (CUdeviceptr)video)));
  // If its not on the same GPU, first copy the video to the current gpu
  CUdeviceptr buffer = (CUdeviceptr)video;

  PROPAGATE_FAILURE_STATUS(CUDA_ERROR(cuCtxPushCurrent(origin)));
  CUdevice originDevice;
  PROPAGATE_FAILURE_STATUS(CUDA_ERROR(cuCtxGetDevice(&originDevice)));
  PROPAGATE_FAILURE_STATUS(CUDA_ERROR(cuCtxPopCurrent(&origin)));
  if (originDevice != openGLDevice) {
    buffer = (CUdeviceptr)transfer.get();

    // lazy alloc
    if (!buffer) {
      ret = transfer.alloc(width * height, "OpenGL");
      if (!ret.ok()) {
        cuGLUnmapBufferObject(bufferId);
        cuCtxPopCurrent(&ctx);
        return ret;
      }
      buffer = (CUdeviceptr)transfer.get();
    }
    size_t transfer_size;
    switch (fmt) {
      case VideoStitch::YV12: {
        transfer_size = (width * height * 3) / 2;
        break;
      }
      case VideoStitch::RGBA:
      default: {
        transfer_size = width * height * 4;
        break;
      }
    }
    ret = CUDA_ERROR(cuMemcpy(buffer, (CUdeviceptr)video, transfer_size));
    if (!ret.ok()) {
      cuGLUnmapBufferObject(bufferId);
      cuCtxPopCurrent(&ctx);
      return ret;
    }
  }

  CUdeviceptr rgbaPtr;
  switch (fmt) {
      /*
          case VideoStitch::YV12: {
            auto gpuRGBAGPano = GPU::DeviceBuffer<uint32_t>::createBuffer(rgbaPanorama.get(), width *height);
            auto gpuBuffer = GPU::DeviceBuffer<unsigned char>::createBuffer((unsigned char *)buffer, (width * height) *
         3 / 2);
            // Unpack and copy to texture
            VideoStitch::Input::VideoReader::unpackDevBuffer(VideoStitch::YV12,
                                                             gpuRGBAGPano,
                                                             gpuBuffer,
                                                             width, height, gpuStream);
            rgbaPtr = (CUdeviceptr) rgbaPanorama.get();
            break;
          }
          */
    case VideoStitch::RGBA: {
      rgbaPtr = buffer;
      break;
    }
    default:
      rgbaPtr = buffer;
      break;
  }
  PROPAGATE_FAILURE_STATUS(CUDA_ERROR(cuMemcpy(glBuffer, rgbaPtr, width * height * 4)));

  // 5- Unmap the GL buffer
  PROPAGATE_FAILURE_STATUS(CUDA_ERROR(cuGLUnmapBufferObject(bufferId)));

  // 6- Create a Texture From the Buffer
  glBindBuffer(GL_PIXEL_UNPACK_BUFFER, bufferId);
  GLenum glErr = glGetError();
  if (glErr != GL_NO_ERROR) {
    return {Origin::GPU, ErrType::RuntimeError, "Unable to bind buffer"};
  }
  glBindTexture(GL_TEXTURE_2D, textureId);
  glErr = glGetError();
  if (glErr != GL_NO_ERROR) {
    return {Origin::GPU, ErrType::RuntimeError, "Unable to bind texture"};
  }
  glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
  glErr = glGetError();
  if (glErr != GL_NO_ERROR) {
    return {Origin::GPU, ErrType::RuntimeError, "Unable to tex sub image"};
  }
  // It's necessary to flush our texture upload commands here,
  // else the rendering might end up using an outdated texture
  // (eg. you sync, and everything but the openGL display report
  // the correct frame)
  glFlush();
  glErr = glGetError();
  if (glErr != GL_NO_ERROR) {
    return {Origin::GPU, ErrType::RuntimeError, "Unable to flush"};
  }
  // unbind openGL texture
  glBindTexture(GL_TEXTURE_2D, 0);

  PROPAGATE_FAILURE_STATUS(CUDA_ERROR(cuCtxPopCurrent(&ctx)));
  return Status::OK();
}

// Allocates the private implementation holding all GL/CUDA state.
OpenGLUpload::OpenGLUpload() {
  pimpl = new Pimpl();
}

// Destroys the private implementation; its destructor performs the cleanup.
OpenGLUpload::~OpenGLUpload() {
  delete pimpl;
}

// Forwards the frame to the private implementation and hands its status back.
Status OpenGLUpload::upload(VideoStitch::PixelFormat fmt, int width, int height, const char *video) {
  Status uploadStatus = pimpl->upload(fmt, width, height, video);
  return uploadStatus;
}

// Releases the interop resources held by the private implementation.
void OpenGLUpload::cleanState() {
  pimpl->cleanState();
}

// Width the texture was allocated with (0 while no storage exists).
int OpenGLUpload::getTexWidth() const {
  const int width = pimpl->textureWidth;
  return width;
}

// Height the texture was allocated with (0 while no storage exists).
int OpenGLUpload::getTexHeight() const {
  const int height = pimpl->textureHeight;
  return height;
}

// OpenGL name of the texture, converted to int.
int OpenGLUpload::getTexId() const {
  return static_cast<int>(pimpl->textureId);
}

}  // namespace VideoStitch