blender-archive/intern/cycles/device/queue.h
Michael Jones 654e1e901b Cycles: Use local atomics for faster shader sorting (enabled on Metal)
This patch adds two new kernels: SORT_BUCKET_PASS and SORT_WRITE_PASS. These replace PREFIX_SUM and SORTED_PATHS_ARRAY on supported devices (currently implemented on Metal, but will be trivial to enable on the other backends). The new kernels exploit sort partitioning (see D15331) by sorting each partition separately using local atomics. This can give an overall render speedup of 2-3% depending on architecture. As before, we fall back to the original non-partitioned sorting when the shader count is "too high".

Reviewed By: brecht

Differential Revision: https://developer.blender.org/D16909
2023-02-06 11:18:26 +00:00
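
For readers unfamiliar with the technique, here is a minimal single-threaded C++ sketch of the two-pass scheme the commit message describes. The names (sort_partition, shader_keys) are illustrative rather than the actual kernel code: the real SORT_BUCKET_PASS and SORT_WRITE_PASS kernels run these loops in parallel on the GPU, using workgroup-local atomics within each sort partition.

#include <atomic>
#include <cstdint>
#include <vector>

/* Sort path indices by shader key within one partition.
 * Pass 1 (bucket pass): count paths per shader, then turn the counts into
 * exclusive prefix offsets. Pass 2 (write pass): scatter each path index
 * into its bucket's slot; fetch_add keeps the scatter safe when threads
 * run concurrently. */
void sort_partition(const std::vector<uint32_t> &shader_keys,
                    std::vector<uint32_t> &sorted_indices,
                    const int num_shaders)
{
  std::vector<std::atomic<uint32_t>> offsets(num_shaders);
  for (auto &offset : offsets) {
    offset.store(0);
  }

  /* Bucket pass: histogram of shader keys. */
  for (const uint32_t key : shader_keys) {
    offsets[key].fetch_add(1);
  }

  /* Exclusive prefix sum turns per-bucket counts into start offsets. */
  uint32_t running_total = 0;
  for (auto &offset : offsets) {
    const uint32_t count = offset.exchange(running_total);
    running_total += count;
  }

  /* Write pass: scatter each path index to its sorted position. */
  const uint32_t num_paths = (uint32_t)shader_keys.size();
  sorted_indices.resize(num_paths);
  for (uint32_t i = 0; i < num_paths; i++) {
    const uint32_t slot = offsets[shader_keys[i]].fetch_add(1);
    sorted_indices[slot] = i;
  }
}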


/* SPDX-License-Identifier: Apache-2.0
 * Copyright 2011-2022 Blender Foundation */

#pragma once

#include "device/kernel.h"

#include "device/graphics_interop.h"

#include "util/debug.h"
#include "util/log.h"
#include "util/map.h"
#include "util/string.h"
#include "util/unique_ptr.h"

CCL_NAMESPACE_BEGIN

class Device;
class device_memory;

struct KernelWorkTile;

/* Container for device kernel arguments with type correctness ensured by API. */
struct DeviceKernelArguments {
  enum Type {
    POINTER,
    INT32,
    FLOAT32,
    BOOLEAN,
    KERNEL_FILM_CONVERT,
  };

  static const int MAX_ARGS = 18;

  Type types[MAX_ARGS];
  void *values[MAX_ARGS];
  size_t sizes[MAX_ARGS];
  size_t count = 0;

  DeviceKernelArguments()
  {
  }
  template<class T> DeviceKernelArguments(const T *arg)
  {
    add(arg);
  }
  template<class T, class... Args> DeviceKernelArguments(const T *first, Args... args)
  {
    add(first);
    add(args...);
  }

  void add(const KernelFilmConvert *value)
  {
    add(KERNEL_FILM_CONVERT, value, sizeof(KernelFilmConvert));
  }
  void add(const device_ptr *value)
  {
    add(POINTER, value, sizeof(device_ptr));
  }
  void add(const int32_t *value)
  {
    add(INT32, value, sizeof(int32_t));
  }
  void add(const float *value)
  {
    add(FLOAT32, value, sizeof(float));
  }
  void add(const bool *value)
  {
    /* Booleans are passed to the kernel as 4-byte values. */
    add(BOOLEAN, value, 4);
  }
  void add(const Type type, const void *value, size_t size)
  {
    assert(count < MAX_ARGS);

    types[count] = type;
    values[count] = (void *)value;
    sizes[count] = size;
    count++;
  }
  template<typename T, typename... Args> void add(const T *first, Args... args)
  {
    add(first);
    add(args...);
  }
};
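
/* Example (an illustrative sketch, not part of this header): building arguments and
 * enqueueing a kernel. The buffer and work size are hypothetical placeholders; the
 * pattern follows the enqueue() contract declared below.
 *
 *   device_ptr buffer = mem.device_pointer;
 *   int work_size = 1024;
 *
 *   DeviceKernelArguments args(&buffer, &work_size);
 *   queue->enqueue(DEVICE_KERNEL_INTEGRATOR_RESET, work_size, args);
 */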
/* Abstraction of a command queue for a device.
 * Provides an API to schedule kernel execution in a specific queue with the minimal
 * possible overhead on the driver side.
 *
 * This class encapsulates all properties needed for command execution. */
class DeviceQueue {
 public:
  virtual ~DeviceQueue();
  /* Number of concurrent states to process for the integrator,
   * based on the number of cores and/or available memory. */
  virtual int num_concurrent_states(const size_t state_size) const = 0;

  /* Number of states which keep the device occupied with work without losing performance.
   * The renderer will add more work (when available) when the number of active paths falls
   * below this value. */
  virtual int num_concurrent_busy_states(const size_t state_size) const = 0;

  /* Number of elements in a partition of sorted shaders. Partitioned sorting improves memory
   * locality of integrator state fetches at the cost of decreased coherence for shader kernel
   * execution. */
  virtual int num_sort_partition_elements() const
  {
    return 65536;
  }

  /* Does the device support the local atomic sorting kernels (INTEGRATOR_SORT_BUCKET_PASS
   * and INTEGRATOR_SORT_WRITE_PASS)? */
  virtual bool supports_local_atomic_sort() const
  {
    return false;
  }
  /* Initialize execution of kernels on this queue.
   *
   * Will, for example, load all data required by the kernels from the device to global or
   * path state.
   *
   * Use this method after device synchronization has finished and before enqueueing any
   * kernels. */
  virtual void init_execution() = 0;

  /* Enqueue kernel execution.
   *
   * Execute the kernel work_size times on the device.
   * Supported argument types:
   * - int: pass a pointer to the int.
   * - device memory: pass a pointer to device_memory.device_pointer.
   * Returns false if there was an error executing this or a previous kernel. */
  virtual bool enqueue(DeviceKernel kernel,
                       const int work_size,
                       DeviceKernelArguments const &args) = 0;
  /* Wait until all enqueued kernels have finished execution.
   * Returns false if there was an error executing any of the enqueued kernels. */
  virtual bool synchronize() = 0;

  /* Copy memory to/from the device as part of the command queue, to ensure
   * operations are done in order without having to synchronize. */
  virtual void zero_to_device(device_memory &mem) = 0;
  virtual void copy_to_device(device_memory &mem) = 0;
  virtual void copy_from_device(device_memory &mem) = 0;
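
  /* Example (an illustrative sketch): because commands on one queue execute in order, a
   * zero, a kernel launch and a read-back can be issued back to back, with a single
   * synchronize() at the end:
   *
   *   queue->zero_to_device(mem);
   *   queue->enqueue(kernel, work_size, args);
   *   queue->copy_from_device(mem);
   *   queue->synchronize();
   */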
  /* Graphics resources interoperability.
   *
   * Interoperability here means that the device is capable of computing its result directly
   * into an OpenGL (or other graphics library) buffer. */

  /* Create a graphics interoperability context which will take care of mapping a graphics
   * resource as a buffer writable by kernels of this device. */
  virtual unique_ptr<DeviceGraphicsInterop> graphics_interop_create()
  {
    LOG(FATAL) << "Request of GPU interop of a device which does not support it.";
    return nullptr;
  }
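
  /* A backend that supports interop overrides graphics_interop_create(). As a sketch
   * (class names here follow the CUDA backend, but are given only as an example):
   *
   *   unique_ptr<DeviceGraphicsInterop> CUDADeviceQueue::graphics_interop_create()
   *   {
   *     return make_unique<CUDADeviceGraphicsInterop>(this);
   *   }
   */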
  /* Device this queue has been created for. */
  Device *device;
 protected:
  /* Hide construction so that allocation via the `Device` API is enforced. */
  explicit DeviceQueue(Device *device);

  /* Implementations call these from the corresponding methods to generate debugging logs. */
  void debug_init_execution();
  void debug_enqueue_begin(DeviceKernel kernel, const int work_size);
  void debug_enqueue_end();
  void debug_synchronize();
  string debug_active_kernels();

  /* Combination of kernels enqueued together since the last synchronize. */
  DeviceKernelMask last_kernels_enqueued_;

  /* Time of the last synchronize call. */
  double last_sync_time_;

  /* Accumulated execution time for combinations of kernels launched together. */
  map<DeviceKernelMask, double> stats_kernel_time_;

  /* If true, performance statistics in the debugging logs will focus on kernels, and an
   * explicit queue synchronization will be added after each kernel execution. */
  bool is_per_kernel_performance_;
};
CCL_NAMESPACE_END