Merge branch 'master' into blender2.8

This commit is contained in:
2018-10-09 08:46:00 +11:00
17 changed files with 297 additions and 141 deletions

View File

@@ -184,11 +184,11 @@ public:
KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_detect_outliers_kernel; KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_detect_outliers_kernel;
KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_combine_halves_kernel; KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_combine_halves_kernel;
KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel; KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel;
KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_blur_kernel; KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_blur_kernel;
KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_calc_weight_kernel; KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_calc_weight_kernel;
KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int)> filter_nlm_update_output_kernel; KernelFunctions<void(*)(int, int, float*, float*, float*, float*, float*, int*, int, int)> filter_nlm_update_output_kernel;
KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel; KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel;
KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)> filter_construct_transform_kernel; KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)> filter_construct_transform_kernel;
KernelFunctions<void(*)(int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int)> filter_nlm_construct_gramian_kernel; KernelFunctions<void(*)(int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int)> filter_nlm_construct_gramian_kernel;
@@ -499,6 +499,7 @@ public:
filter_nlm_update_output_kernel()(dx, dy, filter_nlm_update_output_kernel()(dx, dy,
blurDifference, blurDifference,
(float*) image_ptr, (float*) image_ptr,
difference,
(float*) out_ptr, (float*) out_ptr,
weightAccum, weightAccum,
local_rect, local_rect,

View File

@@ -1397,18 +1397,14 @@ public:
int h = task->reconstruction_state.source_h; int h = task->reconstruction_state.source_h;
int stride = task->buffer.stride; int stride = task->buffer.stride;
int shift_stride = stride*h; int pass_stride = task->buffer.pass_stride;
int num_shifts = (2*r+1)*(2*r+1); int num_shifts = (2*r+1)*(2*r+1);
int mem_size = sizeof(float)*shift_stride*num_shifts;
device_only_memory<uchar> temporary_mem(this, "Denoising temporary_mem");
temporary_mem.alloc_to_device(2*mem_size);
if(have_error()) if(have_error())
return false; return false;
CUdeviceptr difference = cuda_device_ptr(temporary_mem.device_pointer); CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer);
CUdeviceptr blurDifference = difference + mem_size; CUdeviceptr blurDifference = difference + sizeof(float)*pass_stride*num_shifts;
{ {
CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian; CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian;
@@ -1426,9 +1422,9 @@ public:
task->reconstruction_state.source_w * task->reconstruction_state.source_h, task->reconstruction_state.source_w * task->reconstruction_state.source_h,
num_shifts); num_shifts);
void *calc_difference_args[] = {&color_ptr, &color_variance_ptr, &difference, &w, &h, &stride, &shift_stride, &r, &task->buffer.pass_stride, &a, &k_2}; void *calc_difference_args[] = {&color_ptr, &color_variance_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &pass_stride, &a, &k_2};
void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &shift_stride, &r, &f}; void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &shift_stride, &r, &f}; void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
void *construct_gramian_args[] = {&blurDifference, void *construct_gramian_args[] = {&blurDifference,
&task->buffer.mem.device_pointer, &task->buffer.mem.device_pointer,
&task->storage.transform.device_pointer, &task->storage.transform.device_pointer,
@@ -1437,9 +1433,8 @@ public:
&task->storage.XtWY.device_pointer, &task->storage.XtWY.device_pointer,
&task->reconstruction_state.filter_window, &task->reconstruction_state.filter_window,
&w, &h, &stride, &w, &h, &stride,
&shift_stride, &r, &pass_stride, &r,
&f, &f};
&task->buffer.pass_stride};
CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
@@ -1448,8 +1443,6 @@ public:
CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args); CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args);
} }
temporary_mem.free();
{ {
CUfunction cuFinalize; CUfunction cuFinalize;
cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize")); cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));

View File

@@ -99,14 +99,18 @@ void DenoisingTask::setup_denoising_buffer()
buffer.mem.alloc_to_device(mem_size, false); buffer.mem.alloc_to_device(mem_size, false);
/* CPUs process shifts sequentially while GPUs process them in parallel. */ /* CPUs process shifts sequentially while GPUs process them in parallel. */
int num_shifts = 1; int num_layers;
if(buffer.gpu_temporary_mem) { if(buffer.gpu_temporary_mem) {
/* Shadowing prefiltering uses a radius of 6, so allocate at least that much. */ /* Shadowing prefiltering uses a radius of 6, so allocate at least that much. */
int max_radius = max(radius, 6); int max_radius = max(radius, 6);
num_shifts = (2*max_radius + 1) * (2*max_radius + 1); int num_shifts = (2*max_radius + 1) * (2*max_radius + 1);
num_layers = 2*num_shifts + 1;
}
else {
num_layers = 3;
} }
/* Allocate two layers per shift as well as one for the weight accumulation. */ /* Allocate two layers per shift as well as one for the weight accumulation. */
buffer.temporary_mem.alloc_to_device((2*num_shifts + 1) * buffer.pass_stride); buffer.temporary_mem.alloc_to_device(num_layers * buffer.pass_stride);
} }
void DenoisingTask::prefilter_shadowing() void DenoisingTask::prefilter_shadowing()

View File

@@ -865,38 +865,38 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
int h = task->reconstruction_state.source_h; int h = task->reconstruction_state.source_h;
int stride = task->buffer.stride; int stride = task->buffer.stride;
int shift_stride = stride*h; int r = task->radius;
int num_shifts = (2*task->radius + 1)*(2*task->radius + 1); int pass_stride = task->buffer.pass_stride;
int mem_size = sizeof(float)*shift_stride*num_shifts; int num_shifts = (2*r+1)*(2*r+1);
cl_mem difference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr); device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride*num_shifts);
opencl_assert_err(ciErr, "clCreateBuffer denoising_reconstruct"); device_sub_ptr blurDifference(task->buffer.temporary_mem, pass_stride*num_shifts, pass_stride*num_shifts);
cl_mem blurDifference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr); cl_mem difference_mem = CL_MEM_PTR(*difference);
opencl_assert_err(ciErr, "clCreateBuffer denoising_reconstruct"); cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference);
kernel_set_args(ckNLMCalcDifference, 0, kernel_set_args(ckNLMCalcDifference, 0,
color_mem, color_mem,
color_variance_mem, color_variance_mem,
difference, difference_mem,
w, h, stride, w, h, stride,
shift_stride, pass_stride,
task->radius, r,
task->buffer.pass_stride, pass_stride,
1.0f, task->nlm_k_2); 1.0f, task->nlm_k_2);
kernel_set_args(ckNLMBlur, 0, kernel_set_args(ckNLMBlur, 0,
difference, difference_mem,
blurDifference, blurDifference_mem,
w, h, stride, w, h, stride,
shift_stride, pass_stride,
task->radius, 4); r, 4);
kernel_set_args(ckNLMCalcWeight, 0, kernel_set_args(ckNLMCalcWeight, 0,
blurDifference, blurDifference_mem,
difference, difference_mem,
w, h, stride, w, h, stride,
shift_stride, pass_stride,
task->radius, 4); r, 4);
kernel_set_args(ckNLMConstructGramian, 0, kernel_set_args(ckNLMConstructGramian, 0,
blurDifference, blurDifference_mem,
buffer_mem, buffer_mem,
transform_mem, transform_mem,
rank_mem, rank_mem,
@@ -904,9 +904,8 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
XtWY_mem, XtWY_mem,
task->reconstruction_state.filter_window, task->reconstruction_state.filter_window,
w, h, stride, w, h, stride,
shift_stride, pass_stride,
task->radius, 4, r, 4);
task->buffer.pass_stride);
enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true); enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true);
enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); enqueue_kernel(ckNLMBlur, w*h, num_shifts, true);
@@ -914,9 +913,6 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); enqueue_kernel(ckNLMBlur, w*h, num_shifts, true);
enqueue_kernel(ckNLMConstructGramian, w*h, num_shifts, true, 256); enqueue_kernel(ckNLMConstructGramian, w*h, num_shifts, true, 256);
opencl_assert(clReleaseMemObject(difference));
opencl_assert(clReleaseMemObject(blurDifference));
kernel_set_args(ckFinalize, 0, kernel_set_args(ckFinalize, 0,
output_mem, output_mem,
rank_mem, rank_mem,

View File

@@ -16,6 +16,9 @@
CCL_NAMESPACE_BEGIN CCL_NAMESPACE_BEGIN
#define load4_a(buf, ofs) (*((float4*) ((buf) + (ofs))))
#define load4_u(buf, ofs) load_float4((buf)+(ofs))
ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy, ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
const float *ccl_restrict weight_image, const float *ccl_restrict weight_image,
const float *ccl_restrict variance_image, const float *ccl_restrict variance_image,
@@ -26,20 +29,28 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
float a, float a,
float k_2) float k_2)
{ {
/* Strides need to be aligned to 16 bytes. */
kernel_assert((stride % 4) == 0 && (channel_offset % 4) == 0);
int aligned_lowx = rect.x & (~3);
const int numChannels = (channel_offset > 0)? 3 : 1;
const float4 channel_fac = make_float4(1.0f / numChannels);
for(int y = rect.y; y < rect.w; y++) { for(int y = rect.y; y < rect.w; y++) {
for(int x = rect.x; x < rect.z; x++) { int idx_p = y*stride + aligned_lowx;
float diff = 0.0f; int idx_q = (y+dy)*stride + aligned_lowx + dx;
int numChannels = channel_offset? 3 : 1; for(int x = aligned_lowx; x < rect.z; x += 4, idx_p += 4, idx_q += 4) {
for(int c = 0; c < numChannels; c++) { float4 diff = make_float4(0.0f);
float cdiff = weight_image[c*channel_offset + y*stride + x] - weight_image[c*channel_offset + (y+dy)*stride + (x+dx)]; for(int c = 0, chan_ofs = 0; c < numChannels; c++, chan_ofs += channel_offset) {
float pvar = variance_image[c*channel_offset + y*stride + x]; /* idx_p is guaranteed to be aligned, but idx_q isn't. */
float qvar = variance_image[c*channel_offset + (y+dy)*stride + (x+dx)]; float4 color_p = load4_a(weight_image, idx_p + chan_ofs);
diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar)); float4 color_q = load4_u(weight_image, idx_q + chan_ofs);
float4 cdiff = color_p - color_q;
float4 var_p = load4_a(variance_image, idx_p + chan_ofs);
float4 var_q = load4_u(variance_image, idx_q + chan_ofs);
diff += (cdiff*cdiff - a*(var_p + min(var_p, var_q))) / (make_float4(1e-8f) + k_2*(var_p+var_q));
} }
if(numChannels > 1) { load4_a(difference_image, idx_p) = diff*channel_fac;
diff *= 1.0f/numChannels;
}
difference_image[y*stride + x] = diff;
} }
} }
} }
@@ -50,23 +61,61 @@ ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict differen
int stride, int stride,
int f) int f)
{ {
int aligned_lowx = rect.x / 4; int aligned_lowx = round_down(rect.x, 4);
int aligned_highx = (rect.z + 3) / 4;
for(int y = rect.y; y < rect.w; y++) { for(int y = rect.y; y < rect.w; y++) {
const int low = max(rect.y, y-f); const int low = max(rect.y, y-f);
const int high = min(rect.w, y+f+1); const int high = min(rect.w, y+f+1);
for(int x = rect.x; x < rect.z; x++) { for(int x = aligned_lowx; x < rect.z; x += 4) {
out_image[y*stride + x] = 0.0f; load4_a(out_image, y*stride + x) = make_float4(0.0f);
} }
for(int y1 = low; y1 < high; y1++) { for(int y1 = low; y1 < high; y1++) {
float4* out_image4 = (float4*)(out_image + y*stride); for(int x = aligned_lowx; x < rect.z; x += 4) {
float4* difference_image4 = (float4*)(difference_image + y1*stride); load4_a(out_image, y*stride + x) += load4_a(difference_image, y1*stride + x);
for(int x = aligned_lowx; x < aligned_highx; x++) {
out_image4[x] += difference_image4[x];
} }
} }
for(int x = rect.x; x < rect.z; x++) { float fac = 1.0f/(high - low);
out_image[y*stride + x] *= 1.0f/(high - low); for(int x = aligned_lowx; x < rect.z; x += 4) {
load4_a(out_image, y*stride + x) *= fac;
}
}
}
ccl_device_inline void nlm_blur_horizontal(const float *ccl_restrict difference_image,
float *out_image,
int4 rect,
int stride,
int f)
{
int aligned_lowx = round_down(rect.x, 4);
for(int y = rect.y; y < rect.w; y++) {
for(int x = aligned_lowx; x < rect.z; x += 4) {
load4_a(out_image, y*stride + x) = make_float4(0.0f);
}
}
for(int dx = -f; dx <= f; dx++) {
aligned_lowx = round_down(rect.x - min(0, dx), 4);
int highx = rect.z - max(0, dx);
int4 lowx4 = make_int4(rect.x - min(0, dx));
int4 highx4 = make_int4(rect.z - max(0, dx));
for(int y = rect.y; y < rect.w; y++) {
for(int x = aligned_lowx; x < highx; x += 4) {
int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3);
int4 active = (x4 >= lowx4) & (x4 < highx4);
float4 diff = load4_u(difference_image, y*stride + x + dx);
load4_a(out_image, y*stride + x) += mask(active, diff);
}
}
}
aligned_lowx = round_down(rect.x, 4);
for(int y = rect.y; y < rect.w; y++) {
for(int x = aligned_lowx; x < rect.z; x += 4) {
float4 x4 = make_float4(x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f);
float4 low = max(make_float4(rect.x), x4 - make_float4(f));
float4 high = min(make_float4(rect.z), x4 + make_float4(f+1));
load4_a(out_image, y*stride + x) *= rcp(high - low);
} }
} }
} }
@@ -77,25 +126,12 @@ ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict d
int stride, int stride,
int f) int f)
{ {
nlm_blur_horizontal(difference_image, out_image, rect, stride, f);
int aligned_lowx = round_down(rect.x, 4);
for(int y = rect.y; y < rect.w; y++) { for(int y = rect.y; y < rect.w; y++) {
for(int x = rect.x; x < rect.z; x++) { for(int x = aligned_lowx; x < rect.z; x += 4) {
out_image[y*stride + x] = 0.0f; load4_a(out_image, y*stride + x) = fast_expf4(-max(load4_a(out_image, y*stride + x), make_float4(0.0f)));
}
}
for(int dx = -f; dx <= f; dx++) {
int pos_dx = max(0, dx);
int neg_dx = min(0, dx);
for(int y = rect.y; y < rect.w; y++) {
for(int x = rect.x-neg_dx; x < rect.z-pos_dx; x++) {
out_image[y*stride + x] += difference_image[y*stride + x+dx];
}
}
}
for(int y = rect.y; y < rect.w; y++) {
for(int x = rect.x; x < rect.z; x++) {
const int low = max(rect.x, x-f);
const int high = min(rect.z, x+f+1);
out_image[y*stride + x] = fast_expf(-max(out_image[y*stride + x] * (1.0f/(high - low)), 0.0f));
} }
} }
} }
@@ -103,23 +139,29 @@ ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict d
ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy, ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy,
const float *ccl_restrict difference_image, const float *ccl_restrict difference_image,
const float *ccl_restrict image, const float *ccl_restrict image,
float *temp_image,
float *out_image, float *out_image,
float *accum_image, float *accum_image,
int4 rect, int4 rect,
int stride, int stride,
int f) int f)
{ {
nlm_blur_horizontal(difference_image, temp_image, rect, stride, f);
int aligned_lowx = round_down(rect.x, 4);
for(int y = rect.y; y < rect.w; y++) { for(int y = rect.y; y < rect.w; y++) {
for(int x = rect.x; x < rect.z; x++) { for(int x = aligned_lowx; x < rect.z; x += 4) {
const int low = max(rect.x, x-f); int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3);
const int high = min(rect.z, x+f+1); int4 active = (x4 >= make_int4(rect.x)) & (x4 < make_int4(rect.z));
float sum = 0.0f;
for(int x1 = low; x1 < high; x1++) { int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx);
sum += difference_image[y*stride + x1];
} float4 weight = load4_a(temp_image, idx_p);
float weight = sum * (1.0f/(high - low)); load4_a(accum_image, idx_p) += mask(active, weight);
accum_image[y*stride + x] += weight;
out_image[y*stride + x] += weight*image[(y+dy)*stride + (x+dx)]; float4 val = load4_u(image, idx_q);
load4_a(out_image, idx_p) += mask(active, weight*val);
} }
} }
} }
@@ -177,4 +219,7 @@ ccl_device_inline void kernel_filter_nlm_normalize(float *out_image,
} }
} }
#undef load4_a
#undef load4_u
CCL_NAMESPACE_END CCL_NAMESPACE_END

View File

@@ -95,6 +95,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
int dy, int dy,
float *difference_image, float *difference_image,
float *image, float *image,
float *temp_image,
float *out_image, float *out_image,
float *accum_image, float *accum_image,
int* rect, int* rect,

View File

@@ -191,6 +191,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
int dy, int dy,
float *difference_image, float *difference_image,
float *image, float *image,
float *temp_image,
float *out_image, float *out_image,
float *accum_image, float *accum_image,
int *rect, int *rect,
@@ -200,7 +201,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
#ifdef KERNEL_STUB #ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output); STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output);
#else #else
kernel_filter_nlm_update_output(dx, dy, difference_image, image, out_image, accum_image, load_int4(rect), stride, f); kernel_filter_nlm_update_output(dx, dy, difference_image, image, temp_image, out_image, accum_image, load_int4(rect), stride, f);
#endif #endif
} }

View File

@@ -140,7 +140,7 @@ kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image,
int w, int w,
int h, int h,
int stride, int stride,
int shift_stride, int pass_stride,
int r, int r,
int channel_offset, int channel_offset,
float a, float a,
@@ -148,7 +148,7 @@ kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image,
{ {
int4 co, rect; int4 co, rect;
int ofs; int ofs;
if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w, kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w,
weight_image, weight_image,
variance_image, variance_image,
@@ -165,13 +165,13 @@ kernel_cuda_filter_nlm_blur(const float *ccl_restrict difference_image,
int w, int w,
int h, int h,
int stride, int stride,
int shift_stride, int pass_stride,
int r, int r,
int f) int f)
{ {
int4 co, rect; int4 co, rect;
int ofs; int ofs;
if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
kernel_filter_nlm_blur(co.x, co.y, kernel_filter_nlm_blur(co.x, co.y,
difference_image + ofs, difference_image + ofs,
out_image + ofs, out_image + ofs,
@@ -186,13 +186,13 @@ kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict difference_image,
int w, int w,
int h, int h,
int stride, int stride,
int shift_stride, int pass_stride,
int r, int r,
int f) int f)
{ {
int4 co, rect; int4 co, rect;
int ofs; int ofs;
if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
kernel_filter_nlm_calc_weight(co.x, co.y, kernel_filter_nlm_calc_weight(co.x, co.y,
difference_image + ofs, difference_image + ofs,
out_image + ofs, out_image + ofs,
@@ -209,13 +209,13 @@ kernel_cuda_filter_nlm_update_output(const float *ccl_restrict difference_image,
int w, int w,
int h, int h,
int stride, int stride,
int shift_stride, int pass_stride,
int r, int r,
int f) int f)
{ {
int4 co, rect; int4 co, rect;
int ofs; int ofs;
if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w, kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w,
difference_image + ofs, difference_image + ofs,
image, image,
@@ -252,14 +252,13 @@ kernel_cuda_filter_nlm_construct_gramian(const float *ccl_restrict difference_im
int w, int w,
int h, int h,
int stride, int stride,
int shift_stride, int pass_stride,
int r, int r,
int f, int f)
int pass_stride)
{ {
int4 co, rect; int4 co, rect;
int ofs; int ofs;
if(get_nlm_coords_window(w, h, r, shift_stride, &rect, &co, &ofs, filter_window)) { if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) {
kernel_filter_nlm_construct_gramian(co.x, co.y, kernel_filter_nlm_construct_gramian(co.x, co.y,
co.z, co.w, co.z, co.w,
difference_image + ofs, difference_image + ofs,

View File

@@ -132,7 +132,7 @@ __kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_
int w, int w,
int h, int h,
int stride, int stride,
int shift_stride, int pass_stride,
int r, int r,
int channel_offset, int channel_offset,
float a, float a,
@@ -140,7 +140,7 @@ __kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_
{ {
int4 co, rect; int4 co, rect;
int ofs; int ofs;
if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w, kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w,
weight_image, weight_image,
variance_image, variance_image,
@@ -155,13 +155,13 @@ __kernel void kernel_ocl_filter_nlm_blur(const ccl_global float *ccl_restrict di
int w, int w,
int h, int h,
int stride, int stride,
int shift_stride, int pass_stride,
int r, int r,
int f) int f)
{ {
int4 co, rect; int4 co, rect;
int ofs; int ofs;
if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
kernel_filter_nlm_blur(co.x, co.y, kernel_filter_nlm_blur(co.x, co.y,
difference_image + ofs, difference_image + ofs,
out_image + ofs, out_image + ofs,
@@ -174,13 +174,13 @@ __kernel void kernel_ocl_filter_nlm_calc_weight(const ccl_global float *ccl_rest
int w, int w,
int h, int h,
int stride, int stride,
int shift_stride, int pass_stride,
int r, int r,
int f) int f)
{ {
int4 co, rect; int4 co, rect;
int ofs; int ofs;
if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
kernel_filter_nlm_calc_weight(co.x, co.y, kernel_filter_nlm_calc_weight(co.x, co.y,
difference_image + ofs, difference_image + ofs,
out_image + ofs, out_image + ofs,
@@ -195,13 +195,13 @@ __kernel void kernel_ocl_filter_nlm_update_output(const ccl_global float *ccl_re
int w, int w,
int h, int h,
int stride, int stride,
int shift_stride, int pass_stride,
int r, int r,
int f) int f)
{ {
int4 co, rect; int4 co, rect;
int ofs; int ofs;
if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w, kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w,
difference_image + ofs, difference_image + ofs,
image, image,
@@ -234,14 +234,13 @@ __kernel void kernel_ocl_filter_nlm_construct_gramian(const ccl_global float *cc
int w, int w,
int h, int h,
int stride, int stride,
int shift_stride, int pass_stride,
int r, int r,
int f, int f)
int pass_stride)
{ {
int4 co, rect; int4 co, rect;
int ofs; int ofs;
if(get_nlm_coords_window(w, h, r, shift_stride, &rect, &co, &ofs, filter_window)) { if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) {
kernel_filter_nlm_construct_gramian(co.x, co.y, kernel_filter_nlm_construct_gramian(co.x, co.y,
co.z, co.w, co.z, co.w,
difference_image + ofs, difference_image + ofs,

View File

@@ -220,6 +220,30 @@ ccl_device_inline float __uint_as_float(uint i)
u.i = i; u.i = i;
return u.f; return u.f;
} }
ccl_device_inline int4 __float4_as_int4(float4 f)
{
#ifdef __KERNEL_SSE__
return int4(_mm_castps_si128(f.m128));
#else
return make_int4(__float_as_int(f.x),
__float_as_int(f.y),
__float_as_int(f.z),
__float_as_int(f.w));
#endif
}
ccl_device_inline float4 __int4_as_float4(int4 i)
{
#ifdef __KERNEL_SSE__
return float4(_mm_castsi128_ps(i.m128));
#else
return make_float4(__int_as_float(i.x),
__int_as_float(i.y),
__int_as_float(i.z),
__int_as_float(i.w));
#endif
}
#endif /* __KERNEL_OPENCL__ */ #endif /* __KERNEL_OPENCL__ */
/* Versions of functions which are safe for fast math. */ /* Versions of functions which are safe for fast math. */

View File

@@ -58,6 +58,11 @@ ccl_device_inline float madd(const float a, const float b, const float c)
return a * b + c; return a * b + c;
} }
ccl_device_inline float4 madd4(const float4 a, const float4 b, const float4 c)
{
return a * b + c;
}
/* /*
* FAST & APPROXIMATE MATH * FAST & APPROXIMATE MATH
* *
@@ -438,6 +443,29 @@ ccl_device_inline float fast_expf(float x)
return fast_exp2f(x / M_LN2_F); return fast_exp2f(x / M_LN2_F);
} }
#ifndef __KERNEL_GPU__
ccl_device float4 fast_exp2f4(float4 x)
{
const float4 one = make_float4(1.0f);
const float4 limit = make_float4(126.0f);
x = clamp(x, -limit, limit);
int4 m = make_int4(x);
x = one - (one - (x - make_float4(m)));
float4 r = make_float4(1.33336498402e-3f);
r = madd4(x, r, make_float4(9.810352697968e-3f));
r = madd4(x, r, make_float4(5.551834031939e-2f));
r = madd4(x, r, make_float4(0.2401793301105f));
r = madd4(x, r, make_float4(0.693144857883f));
r = madd4(x, r, make_float4(1.0f));
return __int4_as_float4(__float4_as_int4(r) + (m << 23));
}
ccl_device_inline float4 fast_expf4(float4 x)
{
return fast_exp2f4(x / M_LN2_F);
}
#endif
ccl_device_inline float fast_exp10(float x) ccl_device_inline float fast_exp10(float x)
{ {
/* Examined 2217701018 values of exp10 on [-37.9290009,37.9290009]: /* Examined 2217701018 values of exp10 on [-37.9290009,37.9290009]:

View File

@@ -38,6 +38,7 @@ ccl_device_inline float4 operator+(const float4& a, const float4& b);
ccl_device_inline float4 operator-(const float4& a, const float4& b); ccl_device_inline float4 operator-(const float4& a, const float4& b);
ccl_device_inline float4 operator+=(float4& a, const float4& b); ccl_device_inline float4 operator+=(float4& a, const float4& b);
ccl_device_inline float4 operator*=(float4& a, const float4& b); ccl_device_inline float4 operator*=(float4& a, const float4& b);
ccl_device_inline float4 operator*=(float4& a, float f);
ccl_device_inline float4 operator/=(float4& a, float f); ccl_device_inline float4 operator/=(float4& a, float f);
ccl_device_inline int4 operator<(const float4& a, const float4& b); ccl_device_inline int4 operator<(const float4& a, const float4& b);
@@ -58,6 +59,7 @@ ccl_device_inline float4 normalize(const float4& a);
ccl_device_inline float4 safe_normalize(const float4& a); ccl_device_inline float4 safe_normalize(const float4& a);
ccl_device_inline float4 min(const float4& a, const float4& b); ccl_device_inline float4 min(const float4& a, const float4& b);
ccl_device_inline float4 max(const float4& a, const float4& b); ccl_device_inline float4 max(const float4& a, const float4& b);
ccl_device_inline float4 clamp(const float4& a, const float4& mn, const float4& mx);
ccl_device_inline float4 fabs(const float4& a); ccl_device_inline float4 fabs(const float4& a);
#endif /* !__KERNEL_OPENCL__*/ #endif /* !__KERNEL_OPENCL__*/
@@ -168,6 +170,11 @@ ccl_device_inline float4 operator*=(float4& a, const float4& b)
return a = a * b; return a = a * b;
} }
ccl_device_inline float4 operator*=(float4& a, float f)
{
return a = a * f;
}
ccl_device_inline float4 operator/=(float4& a, float f) ccl_device_inline float4 operator/=(float4& a, float f)
{ {
return a = a / f; return a = a / f;
@@ -333,6 +340,11 @@ ccl_device_inline float4 max(const float4& a, const float4& b)
#endif #endif
} }
ccl_device_inline float4 clamp(const float4& a, const float4& mn, const float4& mx)
{
return min(max(a, mn), mx);
}
ccl_device_inline float4 fabs(const float4& a) ccl_device_inline float4 fabs(const float4& a)
{ {
#ifdef __KERNEL_SSE__ #ifdef __KERNEL_SSE__

View File

@@ -31,6 +31,10 @@ CCL_NAMESPACE_BEGIN
ccl_device_inline int4 operator+(const int4& a, const int4& b); ccl_device_inline int4 operator+(const int4& a, const int4& b);
ccl_device_inline int4 operator+=(int4& a, const int4& b); ccl_device_inline int4 operator+=(int4& a, const int4& b);
ccl_device_inline int4 operator>>(const int4& a, int i); ccl_device_inline int4 operator>>(const int4& a, int i);
ccl_device_inline int4 operator<<(const int4& a, int i);
ccl_device_inline int4 operator<(const int4& a, const int4& b);
ccl_device_inline int4 operator>=(const int4& a, const int4& b);
ccl_device_inline int4 operator&(const int4& a, const int4& b);
ccl_device_inline int4 min(int4 a, int4 b); ccl_device_inline int4 min(int4 a, int4 b);
ccl_device_inline int4 max(int4 a, int4 b); ccl_device_inline int4 max(int4 a, int4 b);
ccl_device_inline int4 clamp(const int4& a, const int4& mn, const int4& mx); ccl_device_inline int4 clamp(const int4& a, const int4& mn, const int4& mx);
@@ -65,6 +69,42 @@ ccl_device_inline int4 operator>>(const int4& a, int i)
#endif #endif
} }
ccl_device_inline int4 operator<<(const int4& a, int i)
{
#ifdef __KERNEL_SSE__
return int4(_mm_slli_epi32(a.m128, i));
#else
return make_int4(a.x << i, a.y << i, a.z << i, a.w << i);
#endif
}
ccl_device_inline int4 operator<(const int4& a, const int4& b)
{
#ifdef __KERNEL_SSE__
return int4(_mm_cmplt_epi32(a.m128, b.m128));
#else
return make_int4(a.x < b.x, a.y < b.y, a.z < b.z, a.w < b.w);
#endif
}
ccl_device_inline int4 operator>=(const int4& a, const int4& b)
{
#ifdef __KERNEL_SSE__
return int4(_mm_xor_si128(_mm_set1_epi32(0xffffffff), _mm_cmplt_epi32(a.m128, b.m128)));
#else
return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w);
#endif
}
ccl_device_inline int4 operator&(const int4& a, const int4& b)
{
#ifdef __KERNEL_SSE__
return int4(_mm_and_si128(a.m128, b.m128));
#else
return make_int4(a.x & b.x, a.y & b.y, a.z & b.z, a.w & b.w);
#endif
}
ccl_device_inline int4 min(int4 a, int4 b) ccl_device_inline int4 min(int4 a, int4 b)
{ {
#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) #if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)

View File

@@ -26,6 +26,7 @@ CCL_NAMESPACE_BEGIN
#ifndef __KERNEL_GPU__ #ifndef __KERNEL_GPU__
struct float3; struct float3;
struct float4;
struct ccl_try_align(16) int4 { struct ccl_try_align(16) int4 {
#ifdef __KERNEL_SSE__ #ifdef __KERNEL_SSE__
@@ -53,6 +54,7 @@ struct ccl_try_align(16) int4 {
ccl_device_inline int4 make_int4(int i); ccl_device_inline int4 make_int4(int i);
ccl_device_inline int4 make_int4(int x, int y, int z, int w); ccl_device_inline int4 make_int4(int x, int y, int z, int w);
ccl_device_inline int4 make_int4(const float3& f); ccl_device_inline int4 make_int4(const float3& f);
ccl_device_inline int4 make_int4(const float4& f);
ccl_device_inline void print_int4(const char *label, const int4& a); ccl_device_inline void print_int4(const char *label, const int4& a);
#endif /* __KERNEL_GPU__ */ #endif /* __KERNEL_GPU__ */

View File

@@ -104,6 +104,16 @@ ccl_device_inline int4 make_int4(const float3& f)
return a; return a;
} }
ccl_device_inline int4 make_int4(const float4& f)
{
#ifdef __KERNEL_SSE__
int4 a(_mm_cvtps_epi32(f.m128));
#else
int4 a = {(int)f.x, (int)f.y, (int)f.z, (int)f.w};
#endif
return a;
}
ccl_device_inline void print_int4(const char *label, const int4& a) ccl_device_inline void print_int4(const char *label, const int4& a)
{ {
printf("%s: %d %d %d %d\n", label, a.x, a.y, a.z, a.w); printf("%s: %d %d %d %d\n", label, a.x, a.y, a.z, a.w);

View File

@@ -182,15 +182,12 @@ static void interp_slerp_co_no_v3(
/* calculate sphere 'center' */ /* calculate sphere 'center' */
{ {
/* use point on plane to */ /* use point on plane to */
float plane_a[4], plane_b[4], plane_c[4];
float no_mid[3], no_ortho[3]; float no_mid[3], no_ortho[3];
/* pass this as an arg instead */ /* pass this as an arg instead */
#if 0 #if 0
float no_dir[3]; float no_dir[3];
#endif #endif
float v_a_no_ortho[3], v_b_no_ortho[3];
add_v3_v3v3(no_mid, no_a, no_b); add_v3_v3v3(no_mid, no_a, no_b);
normalize_v3(no_mid); normalize_v3(no_mid);
@@ -200,24 +197,28 @@ static void interp_slerp_co_no_v3(
#endif #endif
/* axis of slerp */ /* axis of slerp */
bool center_ok = false;
cross_v3_v3v3(no_ortho, no_mid, no_dir); cross_v3_v3v3(no_ortho, no_mid, no_dir);
normalize_v3(no_ortho); if (normalize_v3(no_ortho) != 0.0f) {
float plane_a[4], plane_b[4], plane_c[4];
float v_a_no_ortho[3], v_b_no_ortho[3];
/* create planes */ /* create planes */
cross_v3_v3v3(v_a_no_ortho, no_ortho, no_a); cross_v3_v3v3(v_a_no_ortho, no_ortho, no_a);
cross_v3_v3v3(v_b_no_ortho, no_ortho, no_b); cross_v3_v3v3(v_b_no_ortho, no_ortho, no_b);
project_v3_plane(v_a_no_ortho, no_ortho, v_a_no_ortho); project_v3_plane(v_a_no_ortho, no_ortho, v_a_no_ortho);
project_v3_plane(v_b_no_ortho, no_ortho, v_b_no_ortho); project_v3_plane(v_b_no_ortho, no_ortho, v_b_no_ortho);
plane_from_point_normal_v3(plane_a, co_a, v_a_no_ortho); plane_from_point_normal_v3(plane_a, co_a, v_a_no_ortho);
plane_from_point_normal_v3(plane_b, co_b, v_b_no_ortho); plane_from_point_normal_v3(plane_b, co_b, v_b_no_ortho);
plane_from_point_normal_v3(plane_c, co_b, no_ortho); plane_from_point_normal_v3(plane_c, co_b, no_ortho);
/* find the sphere center from 3 planes */ /* find the sphere center from 3 planes */
if (isect_plane_plane_plane_v3(plane_a, plane_b, plane_c, center)) { if (isect_plane_plane_plane_v3(plane_a, plane_b, plane_c, center)) {
/* pass */ center_ok = true;
}
} }
else { if (center_ok == false) {
mid_v3_v3v3(center, co_a, co_b); mid_v3_v3v3(center, co_a, co_b);
} }
} }

View File

@@ -374,7 +374,7 @@ class Report:
testname = test_get_name(filepath) testname = test_get_name(filepath)
print_message(testname, 'SUCCESS', 'RUN') print_message(testname, 'SUCCESS', 'RUN')
time_start = time.time() time_start = time.time()
tmp_filepath = os.path.join(self.output_dir, "tmp") tmp_filepath = os.path.join(self.output_dir, "tmp_" + testname)
error = render_cb(filepath, tmp_filepath) error = render_cb(filepath, tmp_filepath)
status = "FAIL" status = "FAIL"