Work around what appears to be a compiler bug, just changing the code a bit without any functional changes.
		
			
				
	
	
		
			232 lines
		
	
	
		
			5.5 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			232 lines
		
	
	
		
			5.5 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
| /*
 | |
|  * Copyright 2011-2013 Blender Foundation
 | |
|  *
 | |
|  * Licensed under the Apache License, Version 2.0 (the "License");
 | |
|  * you may not use this file except in compliance with the License.
 | |
|  * You may obtain a copy of the License at
 | |
|  *
 | |
|  * http://www.apache.org/licenses/LICENSE-2.0
 | |
|  *
 | |
|  * Unless required by applicable law or agreed to in writing, software
 | |
|  * distributed under the License is distributed on an "AS IS" BASIS,
 | |
|  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | |
|  * See the License for the specific language governing permissions and
 | |
|  * limitations under the License.
 | |
|  */
 | |
| 
 | |
| /* TODO(sergey): Consider moving portable ctz/clz stuff to util. */
 | |
| 
 | |
| CCL_NAMESPACE_BEGIN
 | |
| 
 | |
| /* "Correlated Multi-Jittered Sampling"
 | |
|  * Andrew Kensler, Pixar Technical Memo 13-01, 2013 */
 | |
| 
 | |
| /* todo: find good value, suggested 64 gives pattern on cornell box ceiling */
 | |
| #define CMJ_RANDOM_OFFSET_LIMIT 4096
 | |
| 
 | |
| ccl_device_inline bool cmj_is_pow2(int i)
 | |
| {
 | |
|   return (i > 1) && ((i & (i - 1)) == 0);
 | |
| }
 | |
| 
 | |
| ccl_device_inline int cmj_fast_mod_pow2(int a, int b)
 | |
| {
 | |
|   return (a & (b - 1));
 | |
| }
 | |
| 
 | |
| /* b must be > 1 */
 | |
| ccl_device_inline int cmj_fast_div_pow2(int a, int b)
 | |
| {
 | |
|   kernel_assert(b > 1);
 | |
|   return a >> count_trailing_zeros(b);
 | |
| }
 | |
| 
 | |
| ccl_device_inline uint cmj_w_mask(uint w)
 | |
| {
 | |
|   kernel_assert(w > 1);
 | |
|   return ((1 << (32 - count_leading_zeros(w))) - 1);
 | |
| }
 | |
| 
 | |
| ccl_device_inline uint cmj_permute(uint i, uint l, uint p)
 | |
| {
 | |
|   uint w = l - 1;
 | |
| 
 | |
|   if ((l & w) == 0) {
 | |
|     /* l is a power of two (fast) */
 | |
|     i ^= p;
 | |
|     i *= 0xe170893d;
 | |
|     i ^= p >> 16;
 | |
|     i ^= (i & w) >> 4;
 | |
|     i ^= p >> 8;
 | |
|     i *= 0x0929eb3f;
 | |
|     i ^= p >> 23;
 | |
|     i ^= (i & w) >> 1;
 | |
|     i *= 1 | p >> 27;
 | |
|     i *= 0x6935fa69;
 | |
|     i ^= (i & w) >> 11;
 | |
|     i *= 0x74dcb303;
 | |
|     i ^= (i & w) >> 2;
 | |
|     i *= 0x9e501cc3;
 | |
|     i ^= (i & w) >> 2;
 | |
|     i *= 0xc860a3df;
 | |
|     i &= w;
 | |
|     i ^= i >> 5;
 | |
| 
 | |
|     return (i + p) & w;
 | |
|   }
 | |
|   else {
 | |
|     /* l is not a power of two (slow) */
 | |
|     w = cmj_w_mask(w);
 | |
| 
 | |
|     do {
 | |
|       i ^= p;
 | |
|       i *= 0xe170893d;
 | |
|       i ^= p >> 16;
 | |
|       i ^= (i & w) >> 4;
 | |
|       i ^= p >> 8;
 | |
|       i *= 0x0929eb3f;
 | |
|       i ^= p >> 23;
 | |
|       i ^= (i & w) >> 1;
 | |
|       i *= 1 | p >> 27;
 | |
|       i *= 0x6935fa69;
 | |
|       i ^= (i & w) >> 11;
 | |
|       i *= 0x74dcb303;
 | |
|       i ^= (i & w) >> 2;
 | |
|       i *= 0x9e501cc3;
 | |
|       i ^= (i & w) >> 2;
 | |
|       i *= 0xc860a3df;
 | |
|       i &= w;
 | |
|       i ^= i >> 5;
 | |
|     } while (i >= l);
 | |
| 
 | |
|     return (i + p) % l;
 | |
|   }
 | |
| }
 | |
| 
 | |
| ccl_device_inline uint cmj_hash(uint i, uint p)
 | |
| {
 | |
|   i ^= p;
 | |
|   i ^= i >> 17;
 | |
|   i ^= i >> 10;
 | |
|   i *= 0xb36534e5;
 | |
|   i ^= i >> 12;
 | |
|   i ^= i >> 21;
 | |
|   i *= 0x93fc4795;
 | |
|   i ^= 0xdf6e307f;
 | |
|   i ^= i >> 17;
 | |
|   i *= 1 | p >> 18;
 | |
| 
 | |
|   return i;
 | |
| }
 | |
| 
 | |
| ccl_device_inline uint cmj_hash_simple(uint i, uint p)
 | |
| {
 | |
|   i = (i ^ 61) ^ p;
 | |
|   i += i << 3;
 | |
|   i ^= i >> 4;
 | |
|   i *= 0x27d4eb2d;
 | |
|   return i;
 | |
| }
 | |
| 
 | |
| ccl_device_inline float cmj_randfloat(uint i, uint p)
 | |
| {
 | |
|   return cmj_hash(i, p) * (1.0f / 4294967808.0f);
 | |
| }
 | |
| 
 | |
| #ifdef __CMJ__
 | |
| ccl_device float cmj_sample_1D(int s, int N, int p)
 | |
| {
 | |
|   kernel_assert(s < N);
 | |
| 
 | |
|   uint x = cmj_permute(s, N, p * 0x68bc21eb);
 | |
|   float jx = cmj_randfloat(s, p * 0x967a889b);
 | |
| 
 | |
|   float invN = 1.0f / N;
 | |
|   return (x + jx) * invN;
 | |
| }
 | |
| 
 | |
| /* TODO(sergey): Do some extra tests and consider moving to util_math.h. */
 | |
| ccl_device_inline int cmj_isqrt(int value)
 | |
| {
 | |
| #  if defined(__KERNEL_CUDA__)
 | |
|   return float_to_int(__fsqrt_ru(value));
 | |
| #  elif defined(__KERNEL_GPU__)
 | |
|   return float_to_int(sqrtf(value));
 | |
| #  else
 | |
|   /* This is a work around for fast-math on CPU which might replace sqrtf()
 | |
|    * with am approximated version.
 | |
|    */
 | |
|   return float_to_int(sqrtf(value) + 1e-6f);
 | |
| #  endif
 | |
| }
 | |
| 
 | |
| ccl_device void cmj_sample_2D(int s, int N, int p, float *fx, float *fy)
 | |
| {
 | |
|   kernel_assert(s < N);
 | |
| 
 | |
|   int m = cmj_isqrt(N);
 | |
|   int n = (N - 1) / m + 1;
 | |
|   float invN = 1.0f / N;
 | |
|   float invm = 1.0f / m;
 | |
|   float invn = 1.0f / n;
 | |
| 
 | |
|   s = cmj_permute(s, N, p * 0x51633e2d);
 | |
| 
 | |
|   int sdivm, smodm;
 | |
| 
 | |
|   if (cmj_is_pow2(m)) {
 | |
|     sdivm = cmj_fast_div_pow2(s, m);
 | |
|     smodm = cmj_fast_mod_pow2(s, m);
 | |
|   }
 | |
|   else {
 | |
|     /* Doing s*inmv gives precision issues here. */
 | |
|     sdivm = s / m;
 | |
|     smodm = s - sdivm * m;
 | |
|   }
 | |
| 
 | |
|   uint sx = cmj_permute(smodm, m, p * 0x68bc21eb);
 | |
|   uint sy = cmj_permute(sdivm, n, p * 0x02e5be93);
 | |
| 
 | |
|   float jx = cmj_randfloat(s, p * 0x967a889b);
 | |
|   float jy = cmj_randfloat(s, p * 0x368cc8b7);
 | |
| 
 | |
|   *fx = (sx + (sy + jx) * invn) * invm;
 | |
|   *fy = (s + jy) * invN;
 | |
| }
 | |
| #endif
 | |
| 
 | |
| ccl_device float pmj_sample_1D(KernelGlobals *kg, int sample, int rng_hash, int dimension)
 | |
| {
 | |
|   /* Fallback to random */
 | |
|   if (sample >= NUM_PMJ_SAMPLES) {
 | |
|     const int p = rng_hash + dimension;
 | |
|     return cmj_randfloat(sample, p);
 | |
|   }
 | |
|   else {
 | |
|     const uint mask = cmj_hash_simple(dimension, rng_hash) & 0x007fffff;
 | |
|     const int index = ((dimension % NUM_PMJ_PATTERNS) * NUM_PMJ_SAMPLES + sample) * 2;
 | |
|     return __uint_as_float(kernel_tex_fetch(__sample_pattern_lut, index) ^ mask) - 1.0f;
 | |
|   }
 | |
| }
 | |
| 
 | |
| ccl_device float2 pmj_sample_2D(KernelGlobals *kg, int sample, int rng_hash, int dimension)
 | |
| {
 | |
|   if (sample >= NUM_PMJ_SAMPLES) {
 | |
|     const int p = rng_hash + dimension;
 | |
|     const float fx = cmj_randfloat(sample, p);
 | |
|     const float fy = cmj_randfloat(sample, p + 1);
 | |
|     return make_float2(fx, fy);
 | |
|   }
 | |
|   else {
 | |
|     const int index = ((dimension % NUM_PMJ_PATTERNS) * NUM_PMJ_SAMPLES + sample) * 2;
 | |
|     const uint maskx = cmj_hash_simple(dimension, rng_hash) & 0x007fffff;
 | |
|     const uint masky = cmj_hash_simple(dimension + 1, rng_hash) & 0x007fffff;
 | |
|     const float fx = __uint_as_float(kernel_tex_fetch(__sample_pattern_lut, index) ^ maskx) - 1.0f;
 | |
|     const float fy = __uint_as_float(kernel_tex_fetch(__sample_pattern_lut, index + 1) ^ masky) -
 | |
|                      1.0f;
 | |
|     return make_float2(fx, fy);
 | |
|   }
 | |
| }
 | |
| 
 | |
| CCL_NAMESPACE_END
 |