/*
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 *
 * Copyright 2011, Blender Foundation.
 */

#include <cstdio>
#include <list>
#include <vector>

#include "COM_CPUDevice.h"
#include "COM_OpenCLDevice.h"
#include "COM_OpenCLKernels.cl.h"
#include "COM_WorkScheduler.h"
#include "COM_WriteBufferOperation.h"
#include "COM_compositor.h"
#include "clew.h"

#include "MEM_guardedalloc.h"

#include "BLI_threads.h"
#include "PIL_time.h"

#include "BKE_global.h"

#if COM_CURRENT_THREADING_MODEL == COM_TM_NOTHREAD
# ifndef DEBUG /* Test this so we don't get warnings in debug builds. */
#  warning COM_CURRENT_THREADING_MODEL COM_TM_NOTHREAD is activated. Use only for debugging.
# endif
#elif COM_CURRENT_THREADING_MODEL == COM_TM_QUEUE
/* do nothing - default */
#else
# error COM_CURRENT_THREADING_MODEL No threading model selected
#endif

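/* Per-thread storage for the CPUDevice that runs on the current worker thread. It is set in
 * thread_execute_cpu() and read back by WorkScheduler::current_thread_id(). */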
static ThreadLocal(CPUDevice *) g_thread_device;

static struct {
  /** \brief List of all CPUDevices. For every hardware thread an instance of CPUDevice is
   * created. */
  std::vector<CPUDevice *> cpu_devices;

#if COM_CURRENT_THREADING_MODEL == COM_TM_QUEUE
  /** \brief List of threads. For every CPUDevice in cpu_devices a thread exists. */
  ListBase cpu_threads;
  bool cpu_initialized = false;
  /** \brief All scheduled work for the CPU. */
  ThreadQueue *cpu_queue;
  /** \brief All scheduled work for the GPU. */
  ThreadQueue *gpu_queue;
# ifdef COM_OPENCL_ENABLED
  cl_context opencl_context;
  cl_program opencl_program;
  /** \brief List of all OpenCLDevices. For every OpenCL GPU device an instance of OpenCLDevice
   * is created. */
  std::vector<OpenCLDevice *> gpu_devices;
  /** \brief List of threads. For every OpenCLDevice in gpu_devices a thread exists. */
  ListBase gpu_threads;
  /** \brief Whether work is currently scheduled onto the OpenCL devices. */
  bool opencl_active = false;
  bool opencl_initialized = false;
# endif
#endif

} g_work_scheduler;

#if COM_CURRENT_THREADING_MODEL == COM_TM_QUEUE
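/* Main loop of a CPU worker thread: registers its CPUDevice in thread-local storage, then keeps
 * popping WorkPackages from the CPU queue and executing them until the queue is shut down. */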
void *WorkScheduler::thread_execute_cpu(void *data)
{
  CPUDevice *device = (CPUDevice *)data;
  WorkPackage *work;
  BLI_thread_local_set(g_thread_device, device);
  while ((work = (WorkPackage *)BLI_thread_queue_pop(g_work_scheduler.cpu_queue))) {
    device->execute(work);
    delete work;
  }

  return nullptr;
}

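/* Main loop of a GPU worker thread: pops WorkPackages from the GPU queue and executes them on
 * the associated OpenCLDevice until the queue is shut down. */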
void *WorkScheduler::thread_execute_gpu(void *data)
{
  Device *device = (Device *)data;
  WorkPackage *work;

  while ((work = (WorkPackage *)BLI_thread_queue_pop(g_work_scheduler.gpu_queue))) {
    device->execute(work);
    delete work;
  }

  return nullptr;
}
#endif

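/* Schedule one chunk of an ExecutionGroup. Without threading the work is executed immediately on
 * a temporary CPUDevice; with the queue model it is pushed to the GPU queue when the group can
 * run on OpenCL and OpenCL is active, otherwise to the CPU queue. */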
void WorkScheduler::schedule(ExecutionGroup *group, int chunkNumber)
{
  WorkPackage *package = new WorkPackage(group, chunkNumber);
#if COM_CURRENT_THREADING_MODEL == COM_TM_NOTHREAD
  CPUDevice device(0);
  device.execute(package);
  delete package;
#elif COM_CURRENT_THREADING_MODEL == COM_TM_QUEUE
# ifdef COM_OPENCL_ENABLED
  if (group->isOpenCL() && g_work_scheduler.opencl_active) {
    BLI_thread_queue_push(g_work_scheduler.gpu_queue, package);
  }
  else {
    BLI_thread_queue_push(g_work_scheduler.cpu_queue, package);
  }
# else
  BLI_thread_queue_push(g_work_scheduler.cpu_queue, package);
# endif
#endif
}

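/* Start the worker threads: one thread per CPUDevice and, when the context reports active
 * OpenCL devices, one thread per OpenCLDevice as well. */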
void WorkScheduler::start(CompositorContext &context)
{
#if COM_CURRENT_THREADING_MODEL == COM_TM_QUEUE
  unsigned int index;
  g_work_scheduler.cpu_queue = BLI_thread_queue_init();
  BLI_threadpool_init(
      &g_work_scheduler.cpu_threads, thread_execute_cpu, g_work_scheduler.cpu_devices.size());
  for (index = 0; index < g_work_scheduler.cpu_devices.size(); index++) {
    Device *device = g_work_scheduler.cpu_devices[index];
    BLI_threadpool_insert(&g_work_scheduler.cpu_threads, device);
  }
# ifdef COM_OPENCL_ENABLED
  if (context.getHasActiveOpenCLDevices()) {
    g_work_scheduler.gpu_queue = BLI_thread_queue_init();
    BLI_threadpool_init(
        &g_work_scheduler.gpu_threads, thread_execute_gpu, g_work_scheduler.gpu_devices.size());
    for (index = 0; index < g_work_scheduler.gpu_devices.size(); index++) {
      Device *device = g_work_scheduler.gpu_devices[index];
      BLI_threadpool_insert(&g_work_scheduler.gpu_threads, device);
    }
    g_work_scheduler.opencl_active = true;
  }
  else {
    g_work_scheduler.opencl_active = false;
  }
# endif
#endif
}

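/* Block until all queued work (CPU and, when active, GPU) has been handled. */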
void WorkScheduler::finish()
{
#if COM_CURRENT_THREADING_MODEL == COM_TM_QUEUE
# ifdef COM_OPENCL_ENABLED
  if (g_work_scheduler.opencl_active) {
    BLI_thread_queue_wait_finish(g_work_scheduler.gpu_queue);
    BLI_thread_queue_wait_finish(g_work_scheduler.cpu_queue);
  }
  else {
    BLI_thread_queue_wait_finish(g_work_scheduler.cpu_queue);
  }
# else
  BLI_thread_queue_wait_finish(g_work_scheduler.cpu_queue);
# endif
#endif
}

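/* Shut down the queues, join all worker threads and free the queues afterwards. */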
void WorkScheduler::stop()
{
#if COM_CURRENT_THREADING_MODEL == COM_TM_QUEUE
  BLI_thread_queue_nowait(g_work_scheduler.cpu_queue);
  BLI_threadpool_end(&g_work_scheduler.cpu_threads);
  BLI_thread_queue_free(g_work_scheduler.cpu_queue);
  g_work_scheduler.cpu_queue = nullptr;
# ifdef COM_OPENCL_ENABLED
  if (g_work_scheduler.opencl_active) {
    BLI_thread_queue_nowait(g_work_scheduler.gpu_queue);
    BLI_threadpool_end(&g_work_scheduler.gpu_threads);
    BLI_thread_queue_free(g_work_scheduler.gpu_queue);
    g_work_scheduler.gpu_queue = nullptr;
  }
# endif
#endif
}

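/* Report whether any OpenCL GPU devices were found during initialization. */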
bool WorkScheduler::has_gpu_devices()
{
#if COM_CURRENT_THREADING_MODEL == COM_TM_QUEUE
# ifdef COM_OPENCL_ENABLED
  return !g_work_scheduler.gpu_devices.empty();
# else
  return false;
# endif
#else
  return false;
#endif
}

#if COM_CURRENT_THREADING_MODEL == COM_TM_QUEUE
|
|
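/* Error callback passed to clCreateContext(); simply prints the OpenCL error message. */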
static void CL_CALLBACK clContextError(const char *errinfo,
                                       const void * /*private_info*/,
                                       size_t /*cb*/,
                                       void * /*user_data*/)
{
  printf("OPENCL error: %s\n", errinfo);
}
#endif

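/* Create the CPU devices (one per requested thread) and, when OpenCL is requested, enumerate the
 * OpenCL GPU devices per platform, build the compositor kernels and create an OpenCLDevice for
 * each GPU. May be called repeatedly; the CPU devices are re-created when the requested thread
 * count changes. */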
void WorkScheduler::initialize(bool use_opencl, int num_cpu_threads)
{
#if COM_CURRENT_THREADING_MODEL == COM_TM_QUEUE
  /* Deinitialize if the number of threads doesn't match. */
  if (g_work_scheduler.cpu_devices.size() != num_cpu_threads) {
    Device *device;

    while (!g_work_scheduler.cpu_devices.empty()) {
      device = g_work_scheduler.cpu_devices.back();
      g_work_scheduler.cpu_devices.pop_back();
      device->deinitialize();
      delete device;
    }
    if (g_work_scheduler.cpu_initialized) {
      BLI_thread_local_delete(g_thread_device);
    }
    g_work_scheduler.cpu_initialized = false;
  }

  /* Initialize CPU threads. */
  if (!g_work_scheduler.cpu_initialized) {
    for (int index = 0; index < num_cpu_threads; index++) {
      CPUDevice *device = new CPUDevice(index);
      device->initialize();
      g_work_scheduler.cpu_devices.push_back(device);
    }
    BLI_thread_local_create(g_thread_device);
    g_work_scheduler.cpu_initialized = true;
  }

# ifdef COM_OPENCL_ENABLED
  /* Initialize OpenCL GPU devices. */
  if (use_opencl && !g_work_scheduler.opencl_initialized) {
    g_work_scheduler.opencl_context = nullptr;
    g_work_scheduler.opencl_program = nullptr;

    /* This will check for errors and skip if already initialized. */
    if (clewInit() != CLEW_SUCCESS) {
      return;
    }

    if (clCreateContextFromType) {
      cl_uint numberOfPlatforms = 0;
      cl_int error;
      error = clGetPlatformIDs(0, nullptr, &numberOfPlatforms);
      if (error == -1001) {
        /* -1001 == CL_PLATFORM_NOT_FOUND_KHR: GPU not supported. */
      }
      else if (error != CL_SUCCESS) {
        printf("CLERROR[%d]: %s\n", error, clewErrorString(error));
      }
      if (G.f & G_DEBUG) {
        printf("%u number of platforms\n", numberOfPlatforms);
      }
      cl_platform_id *platforms = (cl_platform_id *)MEM_mallocN(
          sizeof(cl_platform_id) * numberOfPlatforms, __func__);
      error = clGetPlatformIDs(numberOfPlatforms, platforms, nullptr);
      unsigned int indexPlatform;
      for (indexPlatform = 0; indexPlatform < numberOfPlatforms; indexPlatform++) {
        cl_platform_id platform = platforms[indexPlatform];
        cl_uint numberOfDevices = 0;
        clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, nullptr, &numberOfDevices);
        if (numberOfDevices <= 0) {
          continue;
        }

        cl_device_id *cldevices = (cl_device_id *)MEM_mallocN(
            sizeof(cl_device_id) * numberOfDevices, __func__);
        clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numberOfDevices, cldevices, nullptr);

        g_work_scheduler.opencl_context = clCreateContext(
            nullptr, numberOfDevices, cldevices, clContextError, nullptr, &error);
        if (error != CL_SUCCESS) {
          printf("CLERROR[%d]: %s\n", error, clewErrorString(error));
        }
        const char *cl_str[2] = {datatoc_COM_OpenCLKernels_cl, nullptr};
        g_work_scheduler.opencl_program = clCreateProgramWithSource(
            g_work_scheduler.opencl_context, 1, cl_str, nullptr, &error);
        error = clBuildProgram(g_work_scheduler.opencl_program,
                               numberOfDevices,
                               cldevices,
                               nullptr,
                               nullptr,
                               nullptr);
        if (error != CL_SUCCESS) {
          cl_int error2;
          size_t ret_val_size = 0;
          printf("CLERROR[%d]: %s\n", error, clewErrorString(error));
          error2 = clGetProgramBuildInfo(g_work_scheduler.opencl_program,
                                         cldevices[0],
                                         CL_PROGRAM_BUILD_LOG,
                                         0,
                                         nullptr,
                                         &ret_val_size);
          if (error2 != CL_SUCCESS) {
            printf("CLERROR[%d]: %s\n", error2, clewErrorString(error2));
          }
          char *build_log = (char *)MEM_mallocN(sizeof(char) * (ret_val_size + 1), __func__);
          error2 = clGetProgramBuildInfo(g_work_scheduler.opencl_program,
                                         cldevices[0],
                                         CL_PROGRAM_BUILD_LOG,
                                         ret_val_size,
                                         build_log,
                                         nullptr);
          if (error2 != CL_SUCCESS) {
            printf("CLERROR[%d]: %s\n", error2, clewErrorString(error2));
          }
          build_log[ret_val_size] = '\0';
          printf("%s", build_log);
          MEM_freeN(build_log);
        }
        else {
          unsigned int indexDevices;
          for (indexDevices = 0; indexDevices < numberOfDevices; indexDevices++) {
            cl_device_id device = cldevices[indexDevices];
            cl_int vendorID = 0;
            cl_int error2 = clGetDeviceInfo(
                device, CL_DEVICE_VENDOR_ID, sizeof(cl_int), &vendorID, nullptr);
            if (error2 != CL_SUCCESS) {
              printf("CLERROR[%d]: %s\n", error2, clewErrorString(error2));
            }
            OpenCLDevice *clDevice = new OpenCLDevice(g_work_scheduler.opencl_context,
                                                      device,
                                                      g_work_scheduler.opencl_program,
                                                      vendorID);
            clDevice->initialize();
            g_work_scheduler.gpu_devices.push_back(clDevice);
          }
        }
        MEM_freeN(cldevices);
      }
      MEM_freeN(platforms);
    }

    g_work_scheduler.opencl_initialized = true;
  }
# endif
#endif
}

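/* Tear down everything created by initialize(): CPU devices, thread-local storage, OpenCL
 * devices, program and context. */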
void WorkScheduler::deinitialize()
{
#if COM_CURRENT_THREADING_MODEL == COM_TM_QUEUE
  /* Deinitialize CPU threads. */
  if (g_work_scheduler.cpu_initialized) {
    Device *device;
    while (!g_work_scheduler.cpu_devices.empty()) {
      device = g_work_scheduler.cpu_devices.back();
      g_work_scheduler.cpu_devices.pop_back();
      device->deinitialize();
      delete device;
    }
    BLI_thread_local_delete(g_thread_device);
    g_work_scheduler.cpu_initialized = false;
  }

# ifdef COM_OPENCL_ENABLED
  /* Deinitialize OpenCL GPU devices. */
  if (g_work_scheduler.opencl_initialized) {
    Device *device;
    while (!g_work_scheduler.gpu_devices.empty()) {
      device = g_work_scheduler.gpu_devices.back();
      g_work_scheduler.gpu_devices.pop_back();
      device->deinitialize();
      delete device;
    }
    if (g_work_scheduler.opencl_program) {
      clReleaseProgram(g_work_scheduler.opencl_program);
      g_work_scheduler.opencl_program = nullptr;
    }
    if (g_work_scheduler.opencl_context) {
      clReleaseContext(g_work_scheduler.opencl_context);
      g_work_scheduler.opencl_context = nullptr;
    }

    g_work_scheduler.opencl_initialized = false;
  }
# endif
#endif
}

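/* Return the id of the CPUDevice bound to the calling worker thread. This assumes the caller is
 * one of the scheduler's own CPU worker threads, since the thread-local device is dereferenced
 * without a null check. */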
int WorkScheduler::current_thread_id()
{
  CPUDevice *device = (CPUDevice *)BLI_thread_local_get(g_thread_device);
  return device->thread_id();
}