Cycles: Distance optimization for QBVH
This commit implements heuristic which allows to skip nodes pushed to the stack from intersection if distance to them is larger than the distance to the current intersection. This should solve speed regression which i didn't notice in the original QBVH commit (which could have because i had WIP version of this patch applied in my local branch). From quick tests speed seems to be much closer to what is was with regular BVH. There's still some possible code cleanup, but they'll need a bit of assembly code check and now i want to make it so artists can happily use Cycles over the holidays.
This commit is contained in:
@@ -391,6 +391,38 @@ ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ra
|
||||
*t *= len;
|
||||
}
|
||||
|
||||
#ifdef __QBVH__
|
||||
/* Same as above, but optimized for QBVH scene intersection,
|
||||
* which needs to modify two max distances.
|
||||
*
|
||||
* TODO(sergey): Investigate if passing NULL instead of t1 gets optimized
|
||||
* so we can avoid having this duplication.
|
||||
*/
|
||||
ccl_device_inline void qbvh_instance_push(KernelGlobals *kg,
|
||||
int object,
|
||||
const Ray *ray,
|
||||
float3 *P,
|
||||
float3 *dir,
|
||||
float3 *idir,
|
||||
float *t,
|
||||
float *t1)
|
||||
{
|
||||
Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
|
||||
|
||||
*P = transform_point(&tfm, ray->P);
|
||||
|
||||
float len;
|
||||
*dir = bvh_clamp_direction(normalize_len(transform_direction(&tfm, ray->D), &len));
|
||||
*idir = bvh_inverse_direction(*dir);
|
||||
|
||||
if(*t != FLT_MAX)
|
||||
*t *= len;
|
||||
|
||||
if(*t1 != -FLT_MAX)
|
||||
*t1 *= len;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Transorm ray to exit static object in BVH */
|
||||
|
||||
ccl_device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t)
|
||||
@@ -436,6 +468,33 @@ ccl_device_inline void bvh_instance_motion_push(KernelGlobals *kg, int object, c
|
||||
*t *= len;
|
||||
}
|
||||
|
||||
#ifdef __QBVH__
|
||||
/* Same as above, but optimized for QBVH scene intersection,
|
||||
* which needs to modify two max distances.
|
||||
*
|
||||
* TODO(sergey): Investigate if passing NULL instead of t1 gets optimized
|
||||
* so we can avoid having this duplication.
|
||||
*/
|
||||
ccl_device_inline void qbvh_instance_motion_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t, float *t1, Transform *tfm)
|
||||
{
|
||||
Transform itfm;
|
||||
*tfm = object_fetch_transform_motion_test(kg, object, ray->time, &itfm);
|
||||
|
||||
*P = transform_point(&itfm, ray->P);
|
||||
|
||||
float len;
|
||||
*dir = bvh_clamp_direction(normalize_len(transform_direction(&itfm, ray->D), &len));
|
||||
*idir = bvh_inverse_direction(*dir);
|
||||
|
||||
|
||||
if(*t != FLT_MAX)
|
||||
*t *= len;
|
||||
|
||||
if(*t1 != -FLT_MAX)
|
||||
*t1 *= len;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Transorm ray to exit motion blurred object in BVH */
|
||||
|
||||
ccl_device_inline void bvh_instance_motion_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t, Transform *tfm)
|
||||
|
||||
@@ -14,32 +14,31 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
ccl_device_inline void qbvh_stack_sort(int *__restrict s1,
|
||||
int *__restrict s2,
|
||||
int *__restrict s3,
|
||||
float *__restrict d1,
|
||||
float *__restrict d2,
|
||||
float *__restrict d3)
|
||||
struct QBVHStackItem {
|
||||
int addr;
|
||||
float dist;
|
||||
};
|
||||
|
||||
/* TOOD(sergey): Investigate if using instrinsics helps here. */
|
||||
ccl_device_inline void qbvh_stack_sort(QBVHStackItem *__restrict s1,
|
||||
QBVHStackItem *__restrict s2,
|
||||
QBVHStackItem *__restrict s3)
|
||||
{
|
||||
if(*d2 < *d1) { util_swap(s2, s1); util_swap(d2, d1); }
|
||||
if(*d3 < *d2) { util_swap(s3, s2); util_swap(d3, d2); }
|
||||
if(*d2 < *d1) { util_swap(s2, s1); util_swap(d2, d1); }
|
||||
if(s2->dist < s1->dist) { util_swap(s2, s1); }
|
||||
if(s3->dist < s2->dist) { util_swap(s3, s2); }
|
||||
if(s2->dist < s1->dist) { util_swap(s2, s1); }
|
||||
}
|
||||
|
||||
ccl_device_inline void qbvh_stack_sort(int *__restrict s1,
|
||||
int *__restrict s2,
|
||||
int *__restrict s3,
|
||||
int *__restrict s4,
|
||||
float *__restrict d1,
|
||||
float *__restrict d2,
|
||||
float *__restrict d3,
|
||||
float *__restrict d4)
|
||||
ccl_device_inline void qbvh_stack_sort(QBVHStackItem *__restrict s1,
|
||||
QBVHStackItem *__restrict s2,
|
||||
QBVHStackItem *__restrict s3,
|
||||
QBVHStackItem *__restrict s4)
|
||||
{
|
||||
if(*d2 < *d1) { util_swap(s2, s1); util_swap(d2, d1); }
|
||||
if(*d4 < *d3) { util_swap(s4, s3); util_swap(d4, d3); }
|
||||
if(*d3 < *d1) { util_swap(s3, s1); util_swap(d3, d1); }
|
||||
if(*d4 < *d2) { util_swap(s4, s2); util_swap(d4, d2); }
|
||||
if(*d3 < *d2) { util_swap(s3, s2); util_swap(d3, d2); }
|
||||
if(s2->dist < s1->dist) { util_swap(s2, s1); }
|
||||
if(s4->dist < s3->dist) { util_swap(s4, s3); }
|
||||
if(s3->dist < s1->dist) { util_swap(s3, s1); }
|
||||
if(s4->dist < s2->dist) { util_swap(s4, s2); }
|
||||
if(s3->dist < s2->dist) { util_swap(s3, s2); }
|
||||
}
|
||||
|
||||
ccl_device_inline int qbvh_node_intersect(KernelGlobals *__restrict kg,
|
||||
|
||||
@@ -39,8 +39,8 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
*/
|
||||
|
||||
/* Traversal stack in CUDA thread-local memory. */
|
||||
int traversalStack[BVH_STACK_SIZE];
|
||||
traversalStack[0] = ENTRYPOINT_SENTINEL;
|
||||
QBVHStackItem traversalStack[BVH_STACK_SIZE];
|
||||
traversalStack[0].addr = ENTRYPOINT_SENTINEL;
|
||||
|
||||
/* Traversal variables in registers. */
|
||||
int stackPtr = 0;
|
||||
@@ -128,13 +128,15 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
if(d1 < d0) {
|
||||
nodeAddr = c1;
|
||||
++stackPtr;
|
||||
traversalStack[stackPtr] = c0;
|
||||
traversalStack[stackPtr].addr = c0;
|
||||
traversalStack[stackPtr].dist = d0;
|
||||
continue;
|
||||
}
|
||||
else {
|
||||
nodeAddr = c0;
|
||||
++stackPtr;
|
||||
traversalStack[stackPtr] = c1;
|
||||
traversalStack[stackPtr].addr = c1;
|
||||
traversalStack[stackPtr].dist = d1;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
@@ -143,9 +145,11 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
* all nodes onto the stack to sort them there.
|
||||
*/
|
||||
++stackPtr;
|
||||
traversalStack[stackPtr] = c1;
|
||||
traversalStack[stackPtr].addr = c1;
|
||||
traversalStack[stackPtr].dist = c1;
|
||||
++stackPtr;
|
||||
traversalStack[stackPtr] = c0;
|
||||
traversalStack[stackPtr].addr = c0;
|
||||
traversalStack[stackPtr].dist = c0;
|
||||
|
||||
/* Three children are hit, push all onto stack and sort 3
|
||||
* stack items, continue with closest child.
|
||||
@@ -155,12 +159,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
float d2 = ((float*)&dist)[r];
|
||||
if(traverseChild == 0) {
|
||||
++stackPtr;
|
||||
traversalStack[stackPtr] = c2;
|
||||
traversalStack[stackPtr].addr = c2;
|
||||
traversalStack[stackPtr].dist = d2;
|
||||
qbvh_stack_sort(&traversalStack[stackPtr],
|
||||
&traversalStack[stackPtr - 1],
|
||||
&traversalStack[stackPtr - 2],
|
||||
&d2, &d1, &d0);
|
||||
nodeAddr = traversalStack[stackPtr];
|
||||
&traversalStack[stackPtr - 2]);
|
||||
nodeAddr = traversalStack[stackPtr].addr;
|
||||
--stackPtr;
|
||||
continue;
|
||||
}
|
||||
@@ -172,17 +176,18 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
int c3 = __float_as_int(cnodes[r]);
|
||||
float d3 = ((float*)&dist)[r];
|
||||
++stackPtr;
|
||||
traversalStack[stackPtr] = c3;
|
||||
traversalStack[stackPtr].addr = c3;
|
||||
traversalStack[stackPtr].dist = d3;
|
||||
++stackPtr;
|
||||
traversalStack[stackPtr] = c2;
|
||||
traversalStack[stackPtr].addr = c2;
|
||||
traversalStack[stackPtr].dist = d2;
|
||||
qbvh_stack_sort(&traversalStack[stackPtr],
|
||||
&traversalStack[stackPtr - 1],
|
||||
&traversalStack[stackPtr - 2],
|
||||
&traversalStack[stackPtr - 3],
|
||||
&d3, &d2, &d1, &d0);
|
||||
&traversalStack[stackPtr - 3]);
|
||||
}
|
||||
|
||||
nodeAddr = traversalStack[stackPtr];
|
||||
nodeAddr = traversalStack[stackPtr].addr;
|
||||
--stackPtr;
|
||||
}
|
||||
|
||||
@@ -197,7 +202,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
int primAddr2 = __float_as_int(leaf.y);
|
||||
|
||||
/* Pop. */
|
||||
nodeAddr = traversalStack[stackPtr];
|
||||
nodeAddr = traversalStack[stackPtr].addr;
|
||||
--stackPtr;
|
||||
|
||||
#ifdef __VISIBILITY_FLAG__
|
||||
@@ -315,7 +320,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
triangle_intersect_precalc(dir, &isect_precalc);
|
||||
|
||||
++stackPtr;
|
||||
traversalStack[stackPtr] = ENTRYPOINT_SENTINEL;
|
||||
traversalStack[stackPtr].addr = ENTRYPOINT_SENTINEL;
|
||||
|
||||
nodeAddr = kernel_tex_fetch(__object_node, object);
|
||||
|
||||
@@ -368,7 +373,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
triangle_intersect_precalc(dir, &isect_precalc);
|
||||
|
||||
object = OBJECT_NONE;
|
||||
nodeAddr = traversalStack[stackPtr];
|
||||
nodeAddr = traversalStack[stackPtr].addr;
|
||||
--stackPtr;
|
||||
}
|
||||
#endif /* FEATURE(BVH_INSTANCING) */
|
||||
|
||||
@@ -42,8 +42,8 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
*/
|
||||
|
||||
/* Traversal stack in CUDA thread-local memory. */
|
||||
int traversalStack[BVH_STACK_SIZE];
|
||||
traversalStack[0] = ENTRYPOINT_SENTINEL;
|
||||
QBVHStackItem traversalStack[BVH_STACK_SIZE];
|
||||
traversalStack[0].addr = ENTRYPOINT_SENTINEL;
|
||||
|
||||
/* Traversal variables in registers. */
|
||||
int stackPtr = 0;
|
||||
@@ -124,13 +124,15 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
if(d1 < d0) {
|
||||
nodeAddr = c1;
|
||||
++stackPtr;
|
||||
traversalStack[stackPtr] = c0;
|
||||
traversalStack[stackPtr].addr = c0;
|
||||
traversalStack[stackPtr].dist = d0;
|
||||
continue;
|
||||
}
|
||||
else {
|
||||
nodeAddr = c0;
|
||||
++stackPtr;
|
||||
traversalStack[stackPtr] = c1;
|
||||
traversalStack[stackPtr].addr = c1;
|
||||
traversalStack[stackPtr].dist = d1;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
@@ -139,9 +141,11 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
* all nodes onto the stack to sort them there.
|
||||
*/
|
||||
++stackPtr;
|
||||
traversalStack[stackPtr] = c1;
|
||||
traversalStack[stackPtr].addr = c1;
|
||||
traversalStack[stackPtr].dist = d1;
|
||||
++stackPtr;
|
||||
traversalStack[stackPtr] = c0;
|
||||
traversalStack[stackPtr].addr = c0;
|
||||
traversalStack[stackPtr].dist = d0;
|
||||
|
||||
/* Three children are hit, push all onto stack and sort 3
|
||||
* stack items, continue with closest child.
|
||||
@@ -151,12 +155,12 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
float d2 = ((float*)&dist)[r];
|
||||
if(traverseChild == 0) {
|
||||
++stackPtr;
|
||||
traversalStack[stackPtr] = c2;
|
||||
traversalStack[stackPtr].addr = c2;
|
||||
traversalStack[stackPtr].dist = d2;
|
||||
qbvh_stack_sort(&traversalStack[stackPtr],
|
||||
&traversalStack[stackPtr - 1],
|
||||
&traversalStack[stackPtr - 2],
|
||||
&d2, &d1, &d0);
|
||||
nodeAddr = traversalStack[stackPtr];
|
||||
&traversalStack[stackPtr - 2]);
|
||||
nodeAddr = traversalStack[stackPtr].addr;
|
||||
--stackPtr;
|
||||
continue;
|
||||
}
|
||||
@@ -168,17 +172,18 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
int c3 = __float_as_int(cnodes[r]);
|
||||
float d3 = ((float*)&dist)[r];
|
||||
++stackPtr;
|
||||
traversalStack[stackPtr] = c3;
|
||||
traversalStack[stackPtr].addr = c3;
|
||||
traversalStack[stackPtr].dist = d3;
|
||||
++stackPtr;
|
||||
traversalStack[stackPtr] = c2;
|
||||
traversalStack[stackPtr].addr = c2;
|
||||
traversalStack[stackPtr].dist = d2;
|
||||
qbvh_stack_sort(&traversalStack[stackPtr],
|
||||
&traversalStack[stackPtr - 1],
|
||||
&traversalStack[stackPtr - 2],
|
||||
&traversalStack[stackPtr - 3],
|
||||
&d3, &d2, &d1, &d0);
|
||||
&traversalStack[stackPtr - 3]);
|
||||
}
|
||||
|
||||
nodeAddr = traversalStack[stackPtr];
|
||||
nodeAddr = traversalStack[stackPtr].addr;
|
||||
--stackPtr;
|
||||
}
|
||||
|
||||
@@ -193,7 +198,7 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
int primAddr2 = __float_as_int(leaf.y);
|
||||
|
||||
/* Pop. */
|
||||
nodeAddr = traversalStack[stackPtr];
|
||||
nodeAddr = traversalStack[stackPtr].addr;
|
||||
--stackPtr;
|
||||
|
||||
/* Primitive intersection. */
|
||||
@@ -250,13 +255,13 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
triangle_intersect_precalc(dir, &isect_precalc);
|
||||
|
||||
++stackPtr;
|
||||
traversalStack[stackPtr] = ENTRYPOINT_SENTINEL;
|
||||
traversalStack[stackPtr].addr = ENTRYPOINT_SENTINEL;
|
||||
|
||||
nodeAddr = kernel_tex_fetch(__object_node, object);
|
||||
}
|
||||
else {
|
||||
/* Pop. */
|
||||
nodeAddr = traversalStack[stackPtr];
|
||||
nodeAddr = traversalStack[stackPtr].addr;
|
||||
--stackPtr;
|
||||
}
|
||||
|
||||
@@ -290,7 +295,7 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
triangle_intersect_precalc(dir, &isect_precalc);
|
||||
|
||||
object = OBJECT_NONE;
|
||||
nodeAddr = traversalStack[stackPtr];
|
||||
nodeAddr = traversalStack[stackPtr].addr;
|
||||
--stackPtr;
|
||||
}
|
||||
#endif /* FEATURE(BVH_INSTANCING) */
|
||||
|
||||
@@ -47,12 +47,14 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
*/
|
||||
|
||||
/* Traversal stack in CUDA thread-local memory. */
|
||||
int traversalStack[BVH_STACK_SIZE];
|
||||
traversalStack[0] = ENTRYPOINT_SENTINEL;
|
||||
QBVHStackItem traversalStack[BVH_STACK_SIZE];
|
||||
traversalStack[0].addr = ENTRYPOINT_SENTINEL;
|
||||
traversalStack[0].dist = -FLT_MAX;
|
||||
|
||||
/* Traversal variables in registers. */
|
||||
int stackPtr = 0;
|
||||
int nodeAddr = kernel_data.bvh.root;
|
||||
float nodeDist = -FLT_MAX;
|
||||
|
||||
/* Ray parameters in registers. */
|
||||
float3 P = ray->P;
|
||||
@@ -100,6 +102,14 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
do {
|
||||
/* Traverse internal nodes. */
|
||||
while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
|
||||
if(UNLIKELY(nodeDist > isect->t)) {
|
||||
/* Pop. */
|
||||
nodeAddr = traversalStack[stackPtr].addr;
|
||||
nodeDist = traversalStack[stackPtr].dist;
|
||||
--stackPtr;
|
||||
continue;
|
||||
}
|
||||
|
||||
int traverseChild;
|
||||
ssef dist;
|
||||
|
||||
@@ -152,8 +162,10 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
|
||||
/* One child is hit, continue with that child. */
|
||||
int r = __bscf(traverseChild);
|
||||
float d0 = ((float*)&dist)[r];
|
||||
if(traverseChild == 0) {
|
||||
nodeAddr = __float_as_int(cnodes[r]);
|
||||
nodeDist = d0;
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -161,7 +173,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
* closer child.
|
||||
*/
|
||||
int c0 = __float_as_int(cnodes[r]);
|
||||
float d0 = ((float*)&dist)[r];
|
||||
r = __bscf(traverseChild);
|
||||
int c1 = __float_as_int(cnodes[r]);
|
||||
float d1 = ((float*)&dist)[r];
|
||||
@@ -169,13 +180,15 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
if(d1 < d0) {
|
||||
nodeAddr = c1;
|
||||
++stackPtr;
|
||||
traversalStack[stackPtr] = c0;
|
||||
traversalStack[stackPtr].addr = c0;
|
||||
traversalStack[stackPtr].dist = d0;
|
||||
continue;
|
||||
}
|
||||
else {
|
||||
nodeAddr = c0;
|
||||
++stackPtr;
|
||||
traversalStack[stackPtr] = c1;
|
||||
traversalStack[stackPtr].addr = c1;
|
||||
traversalStack[stackPtr].dist = d1;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
@@ -184,9 +197,11 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
* all nodes onto the stack to sort them there.
|
||||
*/
|
||||
++stackPtr;
|
||||
traversalStack[stackPtr] = c1;
|
||||
traversalStack[stackPtr].addr = c1;
|
||||
traversalStack[stackPtr].dist = d1;
|
||||
++stackPtr;
|
||||
traversalStack[stackPtr] = c0;
|
||||
traversalStack[stackPtr].addr = c0;
|
||||
traversalStack[stackPtr].dist = d0;
|
||||
|
||||
/* Three children are hit, push all onto stack and sort 3
|
||||
* stack items, continue with closest child.
|
||||
@@ -196,12 +211,13 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
float d2 = ((float*)&dist)[r];
|
||||
if(traverseChild == 0) {
|
||||
++stackPtr;
|
||||
traversalStack[stackPtr] = c2;
|
||||
traversalStack[stackPtr].addr = c2;
|
||||
traversalStack[stackPtr].dist = d2;
|
||||
qbvh_stack_sort(&traversalStack[stackPtr],
|
||||
&traversalStack[stackPtr - 1],
|
||||
&traversalStack[stackPtr - 2],
|
||||
&d2, &d1, &d0);
|
||||
nodeAddr = traversalStack[stackPtr];
|
||||
&traversalStack[stackPtr - 2]);
|
||||
nodeAddr = traversalStack[stackPtr].addr;
|
||||
nodeDist = traversalStack[stackPtr].dist;
|
||||
--stackPtr;
|
||||
continue;
|
||||
}
|
||||
@@ -213,22 +229,32 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
int c3 = __float_as_int(cnodes[r]);
|
||||
float d3 = ((float*)&dist)[r];
|
||||
++stackPtr;
|
||||
traversalStack[stackPtr] = c3;
|
||||
traversalStack[stackPtr].addr = c3;
|
||||
traversalStack[stackPtr].dist = d3;
|
||||
++stackPtr;
|
||||
traversalStack[stackPtr] = c2;
|
||||
traversalStack[stackPtr].addr = c2;
|
||||
traversalStack[stackPtr].dist = d2;
|
||||
qbvh_stack_sort(&traversalStack[stackPtr],
|
||||
&traversalStack[stackPtr - 1],
|
||||
&traversalStack[stackPtr - 2],
|
||||
&traversalStack[stackPtr - 3],
|
||||
&d3, &d2, &d1, &d0);
|
||||
&traversalStack[stackPtr - 3]);
|
||||
}
|
||||
|
||||
nodeAddr = traversalStack[stackPtr];
|
||||
nodeAddr = traversalStack[stackPtr].addr;
|
||||
nodeDist = traversalStack[stackPtr].dist;
|
||||
--stackPtr;
|
||||
}
|
||||
|
||||
/* If node is leaf, fetch triangle list. */
|
||||
if(nodeAddr < 0) {
|
||||
if(UNLIKELY(nodeDist > isect->t)) {
|
||||
/* Pop. */
|
||||
nodeAddr = traversalStack[stackPtr].addr;
|
||||
nodeDist = traversalStack[stackPtr].dist;
|
||||
--stackPtr;
|
||||
continue;
|
||||
}
|
||||
|
||||
float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_QNODE_SIZE+6);
|
||||
int primAddr = __float_as_int(leaf.x);
|
||||
|
||||
@@ -238,7 +264,8 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
int primAddr2 = __float_as_int(leaf.y);
|
||||
|
||||
/* Pop. */
|
||||
nodeAddr = traversalStack[stackPtr];
|
||||
nodeAddr = traversalStack[stackPtr].addr;
|
||||
nodeDist = traversalStack[stackPtr].dist;
|
||||
--stackPtr;
|
||||
|
||||
#ifdef __VISIBILITY_FLAG__
|
||||
@@ -299,9 +326,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
object = kernel_tex_fetch(__prim_object, -primAddr-1);
|
||||
|
||||
#if BVH_FEATURE(BVH_MOTION)
|
||||
bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_tfm);
|
||||
qbvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &nodeDist, &ob_tfm);
|
||||
#else
|
||||
bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t);
|
||||
qbvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t, &nodeDist);
|
||||
#endif
|
||||
|
||||
if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
|
||||
@@ -318,7 +345,8 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
triangle_intersect_precalc(dir, &isect_precalc);
|
||||
|
||||
++stackPtr;
|
||||
traversalStack[stackPtr] = ENTRYPOINT_SENTINEL;
|
||||
traversalStack[stackPtr].addr = ENTRYPOINT_SENTINEL;
|
||||
traversalStack[stackPtr].dist = -FLT_MAX;
|
||||
|
||||
nodeAddr = kernel_tex_fetch(__object_node, object);
|
||||
}
|
||||
@@ -351,7 +379,8 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
triangle_intersect_precalc(dir, &isect_precalc);
|
||||
|
||||
object = OBJECT_NONE;
|
||||
nodeAddr = traversalStack[stackPtr];
|
||||
nodeAddr = traversalStack[stackPtr].addr;
|
||||
nodeDist = traversalStack[stackPtr].dist;
|
||||
--stackPtr;
|
||||
}
|
||||
#endif /* FEATURE(BVH_INSTANCING) */
|
||||
|
||||
@@ -38,8 +38,8 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
*/
|
||||
|
||||
/* Traversal stack in CUDA thread-local memory. */
|
||||
int traversalStack[BVH_STACK_SIZE];
|
||||
traversalStack[0] = ENTRYPOINT_SENTINEL;
|
||||
QBVHStackItem traversalStack[BVH_STACK_SIZE];
|
||||
traversalStack[0].addr = ENTRYPOINT_SENTINEL;
|
||||
|
||||
/* Traversal variables in registers. */
|
||||
int stackPtr = 0;
|
||||
@@ -130,13 +130,15 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
if(d1 < d0) {
|
||||
nodeAddr = c1;
|
||||
++stackPtr;
|
||||
traversalStack[stackPtr] = c0;
|
||||
traversalStack[stackPtr].addr = c0;
|
||||
traversalStack[stackPtr].dist = d0;
|
||||
continue;
|
||||
}
|
||||
else {
|
||||
nodeAddr = c0;
|
||||
++stackPtr;
|
||||
traversalStack[stackPtr] = c1;
|
||||
traversalStack[stackPtr].addr = c1;
|
||||
traversalStack[stackPtr].dist = d1;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
@@ -145,9 +147,11 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
* all nodes onto the stack to sort them there.
|
||||
*/
|
||||
++stackPtr;
|
||||
traversalStack[stackPtr] = c1;
|
||||
traversalStack[stackPtr].addr = c1;
|
||||
traversalStack[stackPtr].dist = d1;
|
||||
++stackPtr;
|
||||
traversalStack[stackPtr] = c0;
|
||||
traversalStack[stackPtr].addr = c0;
|
||||
traversalStack[stackPtr].dist = d0;
|
||||
|
||||
/* Three children are hit, push all onto stack and sort 3
|
||||
* stack items, continue with closest child.
|
||||
@@ -157,12 +161,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
float d2 = ((float*)&dist)[r];
|
||||
if(traverseChild == 0) {
|
||||
++stackPtr;
|
||||
traversalStack[stackPtr] = c2;
|
||||
traversalStack[stackPtr].addr = c2;
|
||||
traversalStack[stackPtr].dist = d2;
|
||||
qbvh_stack_sort(&traversalStack[stackPtr],
|
||||
&traversalStack[stackPtr - 1],
|
||||
&traversalStack[stackPtr - 2],
|
||||
&d2, &d1, &d0);
|
||||
nodeAddr = traversalStack[stackPtr];
|
||||
&traversalStack[stackPtr - 2]);
|
||||
nodeAddr = traversalStack[stackPtr].addr;
|
||||
--stackPtr;
|
||||
continue;
|
||||
}
|
||||
@@ -174,17 +178,18 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
int c3 = __float_as_int(cnodes[r]);
|
||||
float d3 = ((float*)&dist)[r];
|
||||
++stackPtr;
|
||||
traversalStack[stackPtr] = c3;
|
||||
traversalStack[stackPtr].addr = c3;
|
||||
traversalStack[stackPtr].dist = d3;
|
||||
++stackPtr;
|
||||
traversalStack[stackPtr] = c2;
|
||||
traversalStack[stackPtr].addr = c2;
|
||||
traversalStack[stackPtr].dist = d2;
|
||||
qbvh_stack_sort(&traversalStack[stackPtr],
|
||||
&traversalStack[stackPtr - 1],
|
||||
&traversalStack[stackPtr - 2],
|
||||
&traversalStack[stackPtr - 3],
|
||||
&d3, &d2, &d1, &d0);
|
||||
&traversalStack[stackPtr - 3]);
|
||||
}
|
||||
|
||||
nodeAddr = traversalStack[stackPtr];
|
||||
nodeAddr = traversalStack[stackPtr].addr;
|
||||
--stackPtr;
|
||||
}
|
||||
|
||||
@@ -199,7 +204,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
int primAddr2 = __float_as_int(leaf.y);
|
||||
|
||||
/* Pop. */
|
||||
nodeAddr = traversalStack[stackPtr];
|
||||
nodeAddr = traversalStack[stackPtr].addr;
|
||||
--stackPtr;
|
||||
|
||||
/* Primitive intersection. */
|
||||
@@ -270,14 +275,14 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
triangle_intersect_precalc(dir, &isect_precalc);
|
||||
|
||||
++stackPtr;
|
||||
traversalStack[stackPtr] = ENTRYPOINT_SENTINEL;
|
||||
traversalStack[stackPtr].addr = ENTRYPOINT_SENTINEL;
|
||||
|
||||
nodeAddr = kernel_tex_fetch(__object_node, object);
|
||||
}
|
||||
else {
|
||||
/* Pop. */
|
||||
object = OBJECT_NONE;
|
||||
nodeAddr = traversalStack[stackPtr];
|
||||
nodeAddr = traversalStack[stackPtr].addr;
|
||||
--stackPtr;
|
||||
}
|
||||
}
|
||||
@@ -310,7 +315,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
triangle_intersect_precalc(dir, &isect_precalc);
|
||||
|
||||
object = OBJECT_NONE;
|
||||
nodeAddr = traversalStack[stackPtr];
|
||||
nodeAddr = traversalStack[stackPtr].addr;
|
||||
--stackPtr;
|
||||
}
|
||||
#endif /* FEATURE(BVH_INSTANCING) */
|
||||
|
||||
Reference in New Issue
Block a user