index-of-nearest-104619 #2

Merged
Iliya Katushenock merged 62 commits from HooglyBoogly/blender:index-of-nearest-104619 into index_of_nearest 2023-04-20 21:19:53 +02:00
18 changed files with 191 additions and 85 deletions
Showing only changes of commit dda4c0721c - Show all commits

View File

@ -590,7 +590,7 @@ void dof_gather_accumulator(sampler2D color_tx,
* The full pixel neighborhood is gathered.
* \{ */
void dof_slight_focus_gather(sampler2D depth_tx,
void dof_slight_focus_gather(depth2D depth_tx,
sampler2D color_tx,
sampler2D bkh_lut_tx, /* Renamed because of ugly macro job. */
float radius,

View File

@ -62,7 +62,7 @@ void main()
int mask_shift = 1;
#define downsample_level(out_mip__, lod_) \
active_thread = all(lessThan(local_px, gl_WorkGroupSize.xy >> uint(mask_shift))); \
active_thread = all(lessThan(uvec2(local_px), gl_WorkGroupSize.xy >> uint(mask_shift))); \
barrier(); /* Wait for previous writes to finish. */ \
if (active_thread) { \
max_depth = max_v4(load_local_depths(local_px)); \
@ -89,12 +89,12 @@ void main()
}
finished_tile_counter = 0u;
ivec2 iter = divide_ceil(imageSize(out_mip_5), ivec2(gl_WorkGroupSize * 2u));
ivec2 iter = divide_ceil(imageSize(out_mip_5), ivec2(gl_WorkGroupSize.xy * 2u));
ivec2 image_border = imageSize(out_mip_5) - 1;
for (int y = 0; y < iter.y; y++) {
for (int x = 0; x < iter.x; x++) {
/* Load result of the other work groups. */
kernel_origin = ivec2(gl_WorkGroupSize) * ivec2(x, y);
kernel_origin = ivec2(gl_WorkGroupSize.xy) * ivec2(x, y);
src_px = ivec2(kernel_origin + local_px) * 2;
vec4 samp;
samp.x = imageLoad(out_mip_5, min(src_px + ivec2(0, 1), image_border)).x;

View File

@ -168,13 +168,15 @@ void main()
}
/* Fallthrough to the hemispheric case. */
case LIGHT_RECT:
case LIGHT_ELLIPSE:
case LIGHT_ELLIPSE: {
vec3 v000 = vP - v_right * radius - v_up * radius;
vec3 v100 = v000 + v_right * (radius * 2.0);
vec3 v010 = v000 + v_up * (radius * 2.0);
vec3 v001 = v000 - v_back * radius;
Box bbox = shape_box(v000, v100, v010, v001);
intersect_tile = intersect_tile && intersect(tile, bbox);
break;
}
default:
break;
}

View File

@ -74,8 +74,10 @@ void main()
vec4 max_motion = imageLoad(in_tiles_img, src_tile);
MotionPayload payload_prv = motion_blur_tile_indirection_pack_payload(max_motion.xy, src_tile);
MotionPayload payload_nxt = motion_blur_tile_indirection_pack_payload(max_motion.zw, src_tile);
MotionPayload payload_prv = motion_blur_tile_indirection_pack_payload(max_motion.xy,
uvec2(src_tile));
MotionPayload payload_nxt = motion_blur_tile_indirection_pack_payload(max_motion.zw,
uvec2(src_tile));
if (true) {
/* Rectangular area (in tiles) where the motion vector spreads. */
MotionRect motion_rect = compute_motion_rect(src_tile, max_motion.xy);
@ -85,17 +87,20 @@ void main()
for (int y = 0; y < motion_rect.extent.y; y++) {
ivec2 tile = motion_rect.bottom_left + ivec2(x, y);
if (is_inside_motion_line(tile, motion_line)) {
motion_blur_tile_indirection_store(tile_indirection_buf, MOTION_PREV, tile, payload_prv);
motion_blur_tile_indirection_store(
tile_indirection_buf, MOTION_PREV, uvec2(tile), payload_prv);
/* FIXME: This is a bit weird, but for some reason, we need to store the same vector in
* the motion next so that weighting in gather pass is better. */
motion_blur_tile_indirection_store(tile_indirection_buf, MOTION_NEXT, tile, payload_nxt);
motion_blur_tile_indirection_store(
tile_indirection_buf, MOTION_NEXT, uvec2(tile), payload_nxt);
}
}
}
}
if (true) {
MotionPayload payload = motion_blur_tile_indirection_pack_payload(max_motion.zw, src_tile);
MotionPayload payload = motion_blur_tile_indirection_pack_payload(max_motion.zw,
uvec2(src_tile));
/* Rectangular area (in tiles) where the motion vector spreads. */
MotionRect motion_rect = compute_motion_rect(src_tile, max_motion.zw);
MotionLine motion_line = compute_motion_line(src_tile, max_motion.zw);
@ -104,10 +109,12 @@ void main()
for (int y = 0; y < motion_rect.extent.y; y++) {
ivec2 tile = motion_rect.bottom_left + ivec2(x, y);
if (is_inside_motion_line(tile, motion_line)) {
motion_blur_tile_indirection_store(tile_indirection_buf, MOTION_NEXT, tile, payload_nxt);
motion_blur_tile_indirection_store(
tile_indirection_buf, MOTION_NEXT, uvec2(tile), payload_nxt);
/* FIXME: This is a bit weird, but for some reason, we need to store the same vector in
* the motion previous so that weighting in gather pass is better. */
motion_blur_tile_indirection_store(tile_indirection_buf, MOTION_PREV, tile, payload_prv);
motion_blur_tile_indirection_store(
tile_indirection_buf, MOTION_PREV, uvec2(tile), payload_prv);
}
}
}

View File

@ -178,10 +178,10 @@ void main()
vec4 max_motion;
/* Load dilation result from the indirection table. */
ivec2 tile_prev;
motion_blur_tile_indirection_load(tile_indirection_buf, MOTION_PREV, tile, tile_prev);
motion_blur_tile_indirection_load(tile_indirection_buf, MOTION_PREV, uvec2(tile), tile_prev);
max_motion.xy = imageLoad(in_tiles_img, tile_prev).xy;
ivec2 tile_next;
motion_blur_tile_indirection_load(tile_indirection_buf, MOTION_NEXT, tile, tile_next);
motion_blur_tile_indirection_load(tile_indirection_buf, MOTION_NEXT, uvec2(tile), tile_next);
max_motion.zw = imageLoad(in_tiles_img, tile_next).zw;
Accumulator accum;

View File

@ -242,13 +242,13 @@ void output_aov(vec4 color, float value, uint hash)
#if defined(MAT_AOV_SUPPORT) && defined(GPU_FRAGMENT_SHADER)
for (int i = 0; i < AOV_MAX && i < aov_buf.color_len; i++) {
if (aov_buf.hash_color[i] == hash) {
imageStore(aov_color_img, ivec3(gl_FragCoord.xy, i), color);
imageStore(aov_color_img, ivec3(ivec2(gl_FragCoord.xy), i), color);
return;
}
}
for (int i = 0; i < AOV_MAX && i < aov_buf.value_len; i++) {
if (aov_buf.hash_value[i] == hash) {
imageStore(aov_value_img, ivec3(gl_FragCoord.xy, i), vec4(value));
imageStore(aov_value_img, ivec3(ivec2(gl_FragCoord.xy), i), vec4(value));
return;
}
}

View File

@ -65,7 +65,7 @@ void main()
}
AABB aabb_tag;
AABB aabb_map = AABB(vec3(-0.99999), vec3(0.99999));
AABB aabb_map = shape_aabb(vec3(-0.99999), vec3(0.99999));
/* Directional winmats have no correct near/far in the Z dimension at this point.
* Do not clip in this dimension. */
@ -87,7 +87,7 @@ void main()
for (int y = box_min.y; y <= box_max.y; y++) {
for (int x = box_min.x; x <= box_max.x; x++) {
int tile_index = shadow_tile_offset(ivec2(x, y), tilemap.tiles_index, lod);
atomicOr(tiles_buf[tile_index], SHADOW_DO_UPDATE);
atomicOr(tiles_buf[tile_index], uint(SHADOW_DO_UPDATE));
}
}
}

View File

@ -21,7 +21,7 @@ void shadow_tag_usage_tile(LightData light, ivec2 tile_co, int lod, int tilemap_
tile_co >>= lod;
int tile_index = shadow_tile_offset(tile_co, tilemaps_buf[tilemap_index].tiles_index, lod);
atomicOr(tiles_buf[tile_index], SHADOW_IS_USED);
atomicOr(tiles_buf[tile_index], uint(SHADOW_IS_USED));
}
void shadow_tag_usage_tilemap_directional(uint l_idx, vec3 P, vec3 V, float radius)

View File

@ -44,7 +44,7 @@ bool is_visible(IsectBox box)
bool intersects_near_plane(IsectBox box)
{
vec4 near_plane = drw_view_culling.planes[4];
vec4 near_plane = drw_view_culling.frustum_planes.planes[4];
bool on_positive_side = false;
bool on_negative_side = false;

View File

@ -21,6 +21,8 @@ typedef struct DispatchCommand DispatchCommand;
typedef struct DRWDebugPrintBuffer DRWDebugPrintBuffer;
typedef struct DRWDebugVert DRWDebugVert;
typedef struct DRWDebugDrawBuffer DRWDebugDrawBuffer;
typedef struct FrustumCorners FrustumCorners;
typedef struct FrustumPlanes FrustumPlanes;
/* __cplusplus is true when compiling with MSL. */
# if defined(__cplusplus) && !defined(GPU_SHADER)
@ -94,11 +96,27 @@ uint drw_view_id = 0;
# define DRW_VIEW_FROM_RESOURCE_ID drw_view_id = (drw_ResourceID & DRW_VIEW_MASK)
#endif
/**
 * The eight corner positions of a view frustum, wrapped in a struct so the whole set can be
 * passed around and copied as a single value.
 * NOTE(review): presumably each entry is a vec3 position padded to float4 (only `.xyz` carries
 * data) -- confirm against the code that fills and reads it.
 */
struct FrustumCorners {
float4 corners[8];
};
/* Compile-time check tied to the 16-byte requirement passed as second argument. */
BLI_STATIC_ASSERT_ALIGN(FrustumCorners, 16)
/**
 * The six planes bounding a view frustum, wrapped in a struct so the whole set can be passed
 * around and copied as a single value.
 * NOTE(review): each float4 presumably holds a plane equation (normal in `.xyz`, offset in
 * `.w`) -- confirm against the code that computes them.
 */
struct FrustumPlanes {
/* Meaning of each index:
 * [0] left
 * [1] right
 * [2] bottom
 * [3] top
 * [4] near
 * [5] far */
float4 planes[6];
};
/* Compile-time check tied to the 16-byte requirement passed as second argument. */
BLI_STATIC_ASSERT_ALIGN(FrustumPlanes, 16)
struct ViewCullingData {
/** \note vec3 array padded to vec4. */
/** Frustum corners. */
float4 corners[8];
float4 planes[6];
FrustumCorners frustum_corners;
FrustumPlanes frustum_planes;
float4 bound_sphere;
};
BLI_STATIC_ASSERT_ALIGN(ViewCullingData, 16)

View File

@ -50,7 +50,8 @@ void View::frustum_boundbox_calc(int view_id)
}
#endif
MutableSpan<float4> corners = {culling_[view_id].corners, ARRAY_SIZE(culling_[view_id].corners)};
MutableSpan<float4> corners = {culling_[view_id].frustum_corners.corners,
ARRAY_SIZE(culling_[view_id].frustum_corners.corners)};
float left, right, bottom, top, near, far;
bool is_persp = data_[view_id].winmat[3][3] == 0.0f;
@ -89,15 +90,15 @@ void View::frustum_culling_planes_calc(int view_id)
{
float4x4 persmat = data_[view_id].winmat * data_[view_id].viewmat;
planes_from_projmat(persmat.ptr(),
culling_[view_id].planes[0],
culling_[view_id].planes[5],
culling_[view_id].planes[1],
culling_[view_id].planes[3],
culling_[view_id].planes[4],
culling_[view_id].planes[2]);
culling_[view_id].frustum_planes.planes[0],
culling_[view_id].frustum_planes.planes[5],
culling_[view_id].frustum_planes.planes[1],
culling_[view_id].frustum_planes.planes[3],
culling_[view_id].frustum_planes.planes[4],
culling_[view_id].frustum_planes.planes[2]);
/* Normalize. */
for (float4 &plane : culling_[view_id].planes) {
for (float4 &plane : culling_[view_id].frustum_planes.planes) {
plane.w /= normalize_v3(plane);
}
}
@ -105,7 +106,8 @@ void View::frustum_culling_planes_calc(int view_id)
void View::frustum_culling_sphere_calc(int view_id)
{
BoundSphere &bsphere = *reinterpret_cast<BoundSphere *>(&culling_[view_id].bound_sphere);
Span<float4> corners = {culling_[view_id].corners, ARRAY_SIZE(culling_[view_id].corners)};
Span<float4> corners = {culling_[view_id].frustum_corners.corners,
ARRAY_SIZE(culling_[view_id].frustum_corners.corners)};
/* Extract Bounding Sphere */
if (data_[view_id].winmat[3][3] != 0.0f) {

View File

@ -9,6 +9,14 @@ struct AABB {
vec3 min, max;
};
/**
 * Build an #AABB from its minimum and maximum corners.
 *
 * The members are assigned one by one instead of using the `AABB(min, max)` constructor syntax.
 * NOTE(review): presumably this avoids relying on an implicit struct constructor that is not
 * available on every shader backend -- confirm.
 */
AABB shape_aabb(vec3 min, vec3 max)
{
AABB aabb;
aabb.min = min;
aabb.max = max;
return aabb;
}
AABB aabb_init_min_max()
{
AABB aabb;

View File

@ -136,7 +136,7 @@ bool intersect_view(Pyramid pyramid)
for (int p = 0; p < 6; ++p) {
bool is_any_vertex_on_positive_side = false;
for (int v = 0; v < 5; ++v) {
float test = dot(drw_view_culling.planes[p], vec4(pyramid.corners[v], 1.0));
float test = dot(drw_view_culling.frustum_planes.planes[p], vec4(pyramid.corners[v], 1.0));
if (test > 0.0) {
is_any_vertex_on_positive_side = true;
break;
@ -158,7 +158,8 @@ bool intersect_view(Pyramid pyramid)
for (int p = 0; p < 5; ++p) {
bool is_any_vertex_on_positive_side = false;
for (int v = 0; v < 8; ++v) {
float test = dot(i_pyramid.planes[p], vec4(drw_view_culling.corners[v].xyz, 1.0));
float test = dot(i_pyramid.planes[p],
vec4(drw_view_culling.frustum_corners.corners[v].xyz, 1.0));
if (test > 0.0) {
is_any_vertex_on_positive_side = true;
break;
@ -181,7 +182,7 @@ bool intersect_view(Box box)
for (int p = 0; p < 6; ++p) {
bool is_any_vertex_on_positive_side = false;
for (int v = 0; v < 8; ++v) {
float test = dot(drw_view_culling.planes[p], vec4(box.corners[v], 1.0));
float test = dot(drw_view_culling.frustum_planes.planes[p], vec4(box.corners[v], 1.0));
if (test > 0.0) {
is_any_vertex_on_positive_side = true;
break;
@ -203,7 +204,8 @@ bool intersect_view(Box box)
for (int p = 0; p < 6; ++p) {
bool is_any_vertex_on_positive_side = false;
for (int v = 0; v < 8; ++v) {
float test = dot(i_box.planes[p], vec4(drw_view_culling.corners[v].xyz, 1.0));
float test = dot(i_box.planes[p],
vec4(drw_view_culling.frustum_corners.corners[v].xyz, 1.0));
if (test > 0.0) {
is_any_vertex_on_positive_side = true;
break;
@ -227,7 +229,7 @@ bool intersect_view(IsectBox i_box)
for (int p = 0; p < 6; ++p) {
bool is_any_vertex_on_positive_side = false;
for (int v = 0; v < 8; ++v) {
float test = dot(drw_view_culling.planes[p], vec4(i_box.corners[v], 1.0));
float test = dot(drw_view_culling.frustum_planes.planes[p], vec4(i_box.corners[v], 1.0));
if (test > 0.0) {
is_any_vertex_on_positive_side = true;
break;
@ -247,7 +249,8 @@ bool intersect_view(IsectBox i_box)
for (int p = 0; p < 6; ++p) {
bool is_any_vertex_on_positive_side = false;
for (int v = 0; v < 8; ++v) {
float test = dot(i_box.planes[p], vec4(drw_view_culling.corners[v].xyz, 1.0));
float test = dot(i_box.planes[p],
vec4(drw_view_culling.frustum_corners.corners[v].xyz, 1.0));
if (test > 0.0) {
is_any_vertex_on_positive_side = true;
break;
@ -268,7 +271,7 @@ bool intersect_view(Sphere sphere)
bool intersects = true;
for (int p = 0; p < 6 && intersects; ++p) {
float dist_to_plane = dot(drw_view_culling.planes[p], vec4(sphere.center, 1.0));
float dist_to_plane = dot(drw_view_culling.frustum_planes.planes[p], vec4(sphere.center, 1.0));
if (dist_to_plane < -sphere.radius) {
intersects = false;
}

View File

@ -18,7 +18,10 @@ struct Circle {
Circle shape_circle(vec2 center, float radius)
{
return Circle(center, radius);
Circle circle;
circle.center = center;
circle.radius = radius;
return circle;
}
/** \} */
@ -34,7 +37,10 @@ struct Sphere {
Sphere shape_sphere(vec3 center, float radius)
{
return Sphere(center, radius);
Sphere sphere;
sphere.center = center;
sphere.radius = radius;
return sphere;
}
/** \} */
@ -192,6 +198,14 @@ Frustum shape_frustum(vec3 corners[8])
/**
 * Cone described by a direction vector and the cosine of an angle.
 * NOTE(review): whether `angle_cos` is the cosine of the half-angle or of the full aperture is
 * not visible here -- confirm against the users of this struct.
 */
struct Cone {
vec3 direction;
float angle_cos;
#ifdef GPU_METAL
/* Explicit constructors, only compiled when GPU_METAL is defined: a defaulted one and a
 * member-wise one taking (direction, angle_cos).
 * NOTE(review): presumably needed so `Cone(direction, angle_cos)` constructor syntax keeps
 * compiling on that backend -- confirm. */
inline Cone() = default;
inline Cone(vec3 in_direction, float in_angle_cos)
: direction(in_direction), angle_cos(in_angle_cos)
{
}
#endif
};
Cone shape_cone(vec3 direction, float angle_cosine)

View File

@ -33,18 +33,19 @@ void projmat_dimensions(mat4 winmat,
}
}
void frustum_boundbox_calc(mat4 winmat, mat4 viewinv, out vec4 corners[8])
void frustum_boundbox_calc(mat4 winmat, mat4 viewinv, out FrustumCorners frustum_corners)
{
float left, right, bottom, top, near, far;
bool is_persp = winmat[3][3] == 0.0;
projmat_dimensions(winmat, left, right, bottom, top, near, far);
corners[0][2] = corners[3][2] = corners[7][2] = corners[4][2] = -near;
corners[0][0] = corners[3][0] = left;
corners[4][0] = corners[7][0] = right;
corners[0][1] = corners[4][1] = bottom;
corners[7][1] = corners[3][1] = top;
frustum_corners.corners[0][2] = frustum_corners.corners[3][2] = frustum_corners.corners[7][2] =
frustum_corners.corners[4][2] = -near;
frustum_corners.corners[0][0] = frustum_corners.corners[3][0] = left;
frustum_corners.corners[4][0] = frustum_corners.corners[7][0] = right;
frustum_corners.corners[0][1] = frustum_corners.corners[4][1] = bottom;
frustum_corners.corners[7][1] = frustum_corners.corners[3][1] = top;
/* Get the coordinates of the far plane. */
if (is_persp) {
@ -55,25 +56,20 @@ void frustum_boundbox_calc(mat4 winmat, mat4 viewinv, out vec4 corners[8])
top *= sca_far;
}
corners[1][2] = corners[2][2] = corners[6][2] = corners[5][2] = -far;
corners[1][0] = corners[2][0] = left;
corners[6][0] = corners[5][0] = right;
corners[1][1] = corners[5][1] = bottom;
corners[2][1] = corners[6][1] = top;
frustum_corners.corners[1][2] = frustum_corners.corners[2][2] = frustum_corners.corners[6][2] =
frustum_corners.corners[5][2] = -far;
frustum_corners.corners[1][0] = frustum_corners.corners[2][0] = left;
frustum_corners.corners[6][0] = frustum_corners.corners[5][0] = right;
frustum_corners.corners[1][1] = frustum_corners.corners[5][1] = bottom;
frustum_corners.corners[2][1] = frustum_corners.corners[6][1] = top;
/* Transform into world space. */
for (int i = 0; i < 8; i++) {
corners[i].xyz = transform_point(viewinv, corners[i].xyz);
frustum_corners.corners[i].xyz = transform_point(viewinv, frustum_corners.corners[i].xyz);
}
}
void planes_from_projmat(mat4 mat,
out vec4 left,
out vec4 right,
out vec4 bottom,
out vec4 top,
out vec4 near,
out vec4 far)
void planes_from_projmat(mat4 mat, out FrustumPlanes frustum_planes)
{
/* References:
*
@ -81,35 +77,35 @@ void planes_from_projmat(mat4 mat,
* http://www8.cs.umu.se/kurser/5DV051/HT12/lab/plane_extraction.pdf
*/
mat = transpose(mat);
left = mat[3] + mat[0];
right = mat[3] - mat[0];
bottom = mat[3] + mat[1];
top = mat[3] - mat[1];
near = mat[3] + mat[2];
far = mat[3] - mat[2];
frustum_planes.planes[0] = mat[3] + mat[0];
frustum_planes.planes[1] = mat[3] - mat[0];
frustum_planes.planes[2] = mat[3] + mat[1];
frustum_planes.planes[3] = mat[3] - mat[1];
frustum_planes.planes[4] = mat[3] + mat[2];
frustum_planes.planes[5] = mat[3] - mat[2];
}
void frustum_culling_planes_calc(mat4 winmat, mat4 viewmat, out vec4 planes[6])
void frustum_culling_planes_calc(mat4 winmat, mat4 viewmat, out FrustumPlanes frustum_planes)
{
mat4 persmat = winmat * viewmat;
planes_from_projmat(persmat, planes[0], planes[5], planes[1], planes[3], planes[4], planes[2]);
planes_from_projmat(persmat, frustum_planes);
/* Normalize. */
for (int p = 0; p < 6; p++) {
planes[p] /= length(planes[p].xyz);
frustum_planes.planes[p] /= length(frustum_planes.planes[p].xyz);
}
}
vec4 frustum_culling_sphere_calc(vec4 corners[8])
vec4 frustum_culling_sphere_calc(FrustumCorners frustum_corners)
{
/* Extract Bounding Sphere */
/* TODO(fclem): This is significantly less precise than CPU, but it isn't used in most cases. */
vec4 bsphere;
bsphere.xyz = (corners[0].xyz + corners[6].xyz) * 0.5;
bsphere.xyz = (frustum_corners.corners[0].xyz + frustum_corners.corners[6].xyz) * 0.5;
bsphere.w = 0.0;
for (int i = 0; i < 8; i++) {
bsphere.w = max(bsphere.w, distance(bsphere.xyz, corners[i].xyz));
bsphere.w = max(bsphere.w, distance(bsphere.xyz, frustum_corners.corners[i].xyz));
}
return bsphere;
}
@ -125,11 +121,15 @@ void main()
return;
}
frustum_boundbox_calc(drw_view.winmat, drw_view.viewinv, view_culling_buf[drw_view_id].corners);
/* Read frustum_corners from device memory, update, and write back. */
FrustumCorners frustum_corners = view_culling_buf[drw_view_id].frustum_corners;
frustum_boundbox_calc(drw_view.winmat, drw_view.viewinv, frustum_corners);
view_culling_buf[drw_view_id].frustum_corners = frustum_corners;
frustum_culling_planes_calc(
drw_view.winmat, drw_view.viewmat, view_culling_buf[drw_view_id].planes);
/* Read frustum_planes from device memory, update, and write back. */
FrustumPlanes frustum_planes = view_culling_buf[drw_view_id].frustum_planes;
frustum_culling_planes_calc(drw_view.winmat, drw_view.viewmat, frustum_planes);
view_culling_buf[drw_view_id].bound_sphere = frustum_culling_sphere_calc(
view_culling_buf[drw_view_id].corners);
view_culling_buf[drw_view_id].frustum_planes = frustum_planes;
view_culling_buf[drw_view_id].bound_sphere = frustum_culling_sphere_calc(frustum_corners);
}

View File

@ -34,8 +34,9 @@ void main()
bounds.bounding_corners[1].xyz,
bounds.bounding_corners[2].xyz,
bounds.bounding_corners[3].xyz);
Sphere bounding_sphere = Sphere(bounds.bounding_sphere.xyz, bounds.bounding_sphere.w);
Sphere inscribed_sphere = Sphere(bounds.bounding_sphere.xyz, bounds._inner_sphere_radius);
Sphere bounding_sphere = shape_sphere(bounds.bounding_sphere.xyz, bounds.bounding_sphere.w);
Sphere inscribed_sphere = shape_sphere(bounds.bounding_sphere.xyz,
bounds._inner_sphere_radius);
for (drw_view_id = 0; drw_view_id < view_len; drw_view_id++) {
if (drw_view_culling.bound_sphere.w == -1.0) {

View File

@ -101,10 +101,18 @@ struct constexp_uvec3 {
return 0;
}
}
inline operator uint3() const
constexpr inline operator uint3() const
{
return xyz;
}
constexpr inline operator uint2() const
{
return xy;
}
constexpr inline operator uint() const
{
return x;
}
};
constexpr constexp_uvec3 __internal_workgroupsize_get()
@ -140,6 +148,10 @@ template<typename T> T atomicSub(threadgroup T &mem, T data)
{
return atomic_fetch_sub_explicit((threadgroup _atomic<T> *)&mem, data, memory_order_relaxed);
}
/* GLSL-style `atomicAnd` for values living in threadgroup memory.
 * Forwards to `atomic_fetch_and_explicit` with relaxed memory ordering and returns its result
 * (the value stored in `mem` before the AND was applied). */
template<typename T> T atomicAnd(threadgroup T &mem, T data)
{
return atomic_fetch_and_explicit((threadgroup _atomic<T> *)&mem, data, memory_order_relaxed);
}
template<typename T> T atomicOr(threadgroup T &mem, T data)
{
return atomic_fetch_or_explicit((threadgroup _atomic<T> *)&mem, data, memory_order_relaxed);
@ -152,29 +164,41 @@ template<typename T> T atomicXor(threadgroup T &mem, T data)
/* Device memory. */
template<typename T> T atomicMax(device T &mem, T data)
{
return atomic_fetch_max_explicit((threadgroup _atomic<T> *)&mem, data, memory_order_relaxed);
return atomic_fetch_max_explicit((device _atomic<T> *)&mem, data, memory_order_relaxed);
}
template<typename T> T atomicMin(device T &mem, T data)
{
return atomic_fetch_min_explicit((threadgroup _atomic<T> *)&mem, data, memory_order_relaxed);
return atomic_fetch_min_explicit((device _atomic<T> *)&mem, data, memory_order_relaxed);
}
template<typename T> T atomicAdd(device T &mem, T data)
{
return atomic_fetch_add_explicit((threadgroup _atomic<T> *)&mem, data, memory_order_relaxed);
return atomic_fetch_add_explicit((device _atomic<T> *)&mem, data, memory_order_relaxed);
}
template<typename T> T atomicSub(device T &mem, T data)
{
return atomic_fetch_sub_explicit((threadgroup _atomic<T> *)&mem, data, memory_order_relaxed);
return atomic_fetch_sub_explicit((device _atomic<T> *)&mem, data, memory_order_relaxed);
}
/* GLSL-style `atomicAnd` for values living in device memory.
 * Forwards to `atomic_fetch_and_explicit` with relaxed memory ordering and returns its result
 * (the value stored in `mem` before the AND was applied). */
template<typename T> T atomicAnd(device T &mem, T data)
{
return atomic_fetch_and_explicit((device _atomic<T> *)&mem, data, memory_order_relaxed);
}
template<typename T> T atomicOr(device T &mem, T data)
{
return atomic_fetch_or_explicit((threadgroup _atomic<T> *)&mem, data, memory_order_relaxed);
return atomic_fetch_or_explicit((device _atomic<T> *)&mem, data, memory_order_relaxed);
}
template<typename T> T atomicXor(device T &mem, T data)
{
return atomic_fetch_xor_explicit((threadgroup _atomic<T> *)&mem, data, memory_order_relaxed);
return atomic_fetch_xor_explicit((device _atomic<T> *)&mem, data, memory_order_relaxed);
}
/* Unblock texture atomic compilation.
 * TODO(Metal): This is not correct for global atomic behavior, but will be safe within a single
 * thread. We need to re-visit the solution for this use-case and use a 2D texture buffer instead.
 *
 * Emulation: read the texel, write back the smaller of the stored value and `data`, then fence
 * the texture. This is a plain read-modify-write, NOT an atomic operation.
 *
 * Macro hygiene:
 * - The body is wrapped in `do { ... } while (0)` so the expansion is exactly one statement. It
 *   can be used several times in the same scope and under an un-braced `if`/`else` without
 *   redeclaring the temporary or leaking it into the caller's scope.
 * - The temporary uses a macro-private name so it cannot shadow a caller argument.
 * - `data` is parenthesized so compound expressions keep their meaning. It may still be
 *   evaluated more than once, so avoid arguments with side effects.
 * - Unlike GLSL `imageAtomicMin`, this expansion is a statement and yields no value. */
#define imageAtomicMin(tex, coord, data) \
  do { \
    uint _image_atomic_min_prev = _texelFetch_internal(tex, coord, 0).r; \
    _texture_write_internal( \
        tex, \
        coord, \
        uint4((_image_atomic_min_prev < (data)) ? _image_atomic_min_prev : (data))); \
    tex.texture->fence(); \
  } while (0)
/* Used to replace 'out' in function parameters with threadlocal reference
* shortened to avoid expanding the glsl source string. */
#define THD thread
@ -1126,6 +1150,27 @@ inline float4 uintBitsToFloat(uint4 f)
return as_type<float4>(f);
}
#define bitfieldReverse reverse_bits
#define bitfieldExtract extract_bits
#define bitfieldInsert insert_bits
#define bitCount popcount
/* GLSL-style `findLSB`: index of the least significant set bit of `x`. */
template<typename T> T findLSB(T x)
{
/* `ctz` returns the number of trailing zeroes, which is also the index of the least significant
 * set bit. A zero input has no set bit and is filtered out explicitly so that the result is
 * `T(-1)`, matching GLSL functionality. */
return (x == T(0)) ? T(-1) : T(ctz(x));
}
/* GLSL-style `findMSB`: index of the most significant set bit of `x`. */
template<typename T> T findMSB(T x)
{
/* `clz` returns the number of leading zeroes. The index of the MSB is the bit width of the type
 * minus that count, minus one; `clz(T(0))` is used here to obtain the bit width. A zero input
 * has no set bit and is filtered out explicitly so that the result is `T(-1)`, matching GLSL
 * functionality.
 * NOTE(review): for a negative signed `x` the sign bit is set, so this yields the sign-bit
 * index, whereas GLSL `findMSB` on negative values looks for the most significant 0 bit.
 * Confirm whether negative signed inputs can reach this function. */
return (x == T(0)) ? T(-1) : (clz(T(0)) - clz(x) - T(1));
}
/* Texture size functions. Add texture types as needed. */
#define imageSize(image) textureSize(image, 0)

View File

@ -15,6 +15,12 @@
#define depthCubeArray samplerCubeArray
#define depth2DArrayShadow sampler2DArrayShadow
/* Memory scope and pass by reference types.
 * NOTE: These are required by Metal, but are not required in all cases by GLSL.
 *
 * `device` and `threadgroup` are defined with an empty replacement, so the qualifiers vanish
 * from the source on this backend. `OUT(type, name, array_len)` expands to a regular `out` array
 * parameter declaration. */
#define device
#define threadgroup
#define OUT(type, name, array_len) out type name[array_len]
/* Backend Functions. */
#define select(A, B, mask) mix(A, B, mask)