This repository was archived on 2023-10-09. It is read-only: you can view files and clone it, but you cannot open issues or pull requests, or push commits.
blender-archive/intern/cycles/util/math_float8.h
Brecht Van Lommel e1b3d91127 Refactor: replace Cycles sse/avx types by vectorized float4/int4/float8/int8
The distinction existed for legacy reasons, to make it easy to port the Embree
intersection code without affecting the main vector types. However, we are now
using SIMD for these types as well, so there is no good reason to keep the distinction.

Also, more consistently pass these vector types by value in inline functions.
Previously this was done only for functions used by Metal, to avoid having
to add address space qualifiers; it is simple enough to do it everywhere.

Also remove the function declarations from the vector math headers, as they
serve no real purpose.

Differential Revision: https://developer.blender.org/D16146
2022-11-08 12:28:40 +01:00
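
For illustration, the following is a minimal standalone sketch (not the Cycles code itself) of the pattern this commit settles on: a single vector type whose storage is a SIMD register when the target supports it and plain scalars otherwise, with inline operators taking their arguments by value so that no address space qualifiers are needed. The names f8, HAVE_AVX, and make_f8 are hypothetical, chosen just for this sketch.

// Sketch of a vectorized float8-style type with an AVX fast path and a
// scalar fallback, mirroring the structure of the header below.
#include <cstdio>

#if defined(__AVX__)
#  define HAVE_AVX 1
#  include <immintrin.h>
#endif

struct f8 {
#ifdef HAVE_AVX
  __m256 m256; /* Eight floats in one AVX register. */
  explicit f8(__m256 v) : m256(v) {}
#else
  float v[8]; /* Scalar fallback storage. */
#endif
  f8() = default;
};

inline f8 make_f8(float x)
{
#ifdef HAVE_AVX
  return f8(_mm256_set1_ps(x));
#else
  f8 r;
  for (int i = 0; i < 8; i++) r.v[i] = x;
  return r;
#endif
}

/* Passed by value: no address space qualifiers needed on GPU backends. */
inline f8 operator+(const f8 a, const f8 b)
{
#ifdef HAVE_AVX
  return f8(_mm256_add_ps(a.m256, b.m256));
#else
  f8 r;
  for (int i = 0; i < 8; i++) r.v[i] = a.v[i] + b.v[i];
  return r;
#endif
}

int main()
{
  f8 c = make_f8(1.0f) + make_f8(2.0f);
#ifdef HAVE_AVX
  float out[8];
  _mm256_storeu_ps(out, c.m256);
  printf("%g\n", out[0]); /* 3 */
#else
  printf("%g\n", c.v[0]); /* 3 */
#endif
  return 0;
}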

483 lines
13 KiB
C++

/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2013 Intel Corporation
* Copyright 2022 Blender Foundation */
#ifndef __UTIL_MATH_FLOAT8_H__
#define __UTIL_MATH_FLOAT8_H__
#ifndef __UTIL_MATH_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
CCL_NAMESPACE_BEGIN
ccl_device_inline vfloat8 zero_vfloat8()
{
#ifdef __KERNEL_AVX__
return vfloat8(_mm256_setzero_ps());
#else
return make_vfloat8(0.0f);
#endif
}
ccl_device_inline vfloat8 one_vfloat8()
{
return make_vfloat8(1.0f);
}
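/* Component-wise arithmetic. Each operator compiles to a single AVX intrinsic
 * when __KERNEL_AVX__ is defined and to eight scalar operations otherwise. */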
ccl_device_inline vfloat8 operator+(const vfloat8 a, const vfloat8 b)
{
#ifdef __KERNEL_AVX__
return vfloat8(_mm256_add_ps(a.m256, b.m256));
#else
return make_vfloat8(
a.a + b.a, a.b + b.b, a.c + b.c, a.d + b.d, a.e + b.e, a.f + b.f, a.g + b.g, a.h + b.h);
#endif
}
ccl_device_inline vfloat8 operator+(const vfloat8 a, const float f)
{
return a + make_vfloat8(f);
}
ccl_device_inline vfloat8 operator+(const float f, const vfloat8 a)
{
return make_vfloat8(f) + a;
}
ccl_device_inline vfloat8 operator-(const vfloat8 a)
{
#ifdef __KERNEL_AVX__
__m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
return vfloat8(_mm256_xor_ps(a.m256, mask));
#else
return make_vfloat8(-a.a, -a.b, -a.c, -a.d, -a.e, -a.f, -a.g, -a.h);
#endif
}
ccl_device_inline vfloat8 operator-(const vfloat8 a, const vfloat8 b)
{
#ifdef __KERNEL_AVX__
return vfloat8(_mm256_sub_ps(a.m256, b.m256));
#else
return make_vfloat8(
a.a - b.a, a.b - b.b, a.c - b.c, a.d - b.d, a.e - b.e, a.f - b.f, a.g - b.g, a.h - b.h);
#endif
}
ccl_device_inline vfloat8 operator-(const vfloat8 a, const float f)
{
return a - make_vfloat8(f);
}
ccl_device_inline vfloat8 operator-(const float f, const vfloat8 a)
{
return make_vfloat8(f) - a;
}
ccl_device_inline vfloat8 operator*(const vfloat8 a, const vfloat8 b)
{
#ifdef __KERNEL_AVX__
return vfloat8(_mm256_mul_ps(a.m256, b.m256));
#else
return make_vfloat8(
a.a * b.a, a.b * b.b, a.c * b.c, a.d * b.d, a.e * b.e, a.f * b.f, a.g * b.g, a.h * b.h);
#endif
}
ccl_device_inline vfloat8 operator*(const vfloat8 a, const float f)
{
return a * make_vfloat8(f);
}
ccl_device_inline vfloat8 operator*(const float f, const vfloat8 a)
{
return make_vfloat8(f) * a;
}
ccl_device_inline vfloat8 operator/(const vfloat8 a, const vfloat8 b)
{
#ifdef __KERNEL_AVX__
return vfloat8(_mm256_div_ps(a.m256, b.m256));
#else
return make_vfloat8(
a.a / b.a, a.b / b.b, a.c / b.c, a.d / b.d, a.e / b.e, a.f / b.f, a.g / b.g, a.h / b.h);
#endif
}
ccl_device_inline vfloat8 operator/(const vfloat8 a, const float f)
{
return a / make_vfloat8(f);
}
ccl_device_inline vfloat8 operator/(const float f, const vfloat8 a)
{
return make_vfloat8(f) / a;
}
ccl_device_inline vfloat8 operator+=(vfloat8 a, const vfloat8 b)
{
return a = a + b;
}
ccl_device_inline vfloat8 operator-=(vfloat8 a, const vfloat8 b)
{
return a = a - b;
}
ccl_device_inline vfloat8 operator*=(vfloat8 a, const vfloat8 b)
{
return a = a * b;
}
ccl_device_inline vfloat8 operator*=(vfloat8 a, float f)
{
return a = a * f;
}
ccl_device_inline vfloat8 operator/=(vfloat8 a, float f)
{
return a = a / f;
}
ccl_device_inline bool operator==(const vfloat8 a, const vfloat8 b)
{
#ifdef __KERNEL_AVX__
return (_mm256_movemask_ps(_mm256_castsi256_ps(
_mm256_cmpeq_epi32(_mm256_castps_si256(a.m256), _mm256_castps_si256(b.m256)))) &
0b11111111) == 0b11111111;
#else
return (a.a == b.a && a.b == b.b && a.c == b.c && a.d == b.d && a.e == b.e && a.f == b.f &&
a.g == b.g && a.h == b.h);
#endif
}
ccl_device_inline const vfloat8 operator^(const vfloat8 a, const vfloat8 b)
{
#ifdef __KERNEL_AVX__
return vfloat8(_mm256_xor_ps(a.m256, b.m256));
#else
return make_vfloat8(__uint_as_float(__float_as_uint(a.a) ^ __float_as_uint(b.a)),
__uint_as_float(__float_as_uint(a.b) ^ __float_as_uint(b.b)),
__uint_as_float(__float_as_uint(a.c) ^ __float_as_uint(b.c)),
__uint_as_float(__float_as_uint(a.d) ^ __float_as_uint(b.d)),
__uint_as_float(__float_as_uint(a.e) ^ __float_as_uint(b.e)),
__uint_as_float(__float_as_uint(a.f) ^ __float_as_uint(b.f)),
__uint_as_float(__float_as_uint(a.g) ^ __float_as_uint(b.g)),
__uint_as_float(__float_as_uint(a.h) ^ __float_as_uint(b.h)));
#endif
}
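/* Note: _mm256_rcp_ps is an approximation (roughly 12 bits of precision),
 * while the scalar fallback is a full-precision divide. */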
ccl_device_inline vfloat8 rcp(const vfloat8 a)
{
#ifdef __KERNEL_AVX__
return vfloat8(_mm256_rcp_ps(a.m256));
#else
return make_vfloat8(1.0f / a.a,
1.0f / a.b,
1.0f / a.c,
1.0f / a.d,
1.0f / a.e,
1.0f / a.f,
1.0f / a.g,
1.0f / a.h);
#endif
}
ccl_device_inline vfloat8 sqrt(const vfloat8 a)
{
#ifdef __KERNEL_AVX__
return vfloat8(_mm256_sqrt_ps(a.m256));
#else
return make_vfloat8(sqrtf(a.a),
sqrtf(a.b),
sqrtf(a.c),
sqrtf(a.d),
sqrtf(a.e),
sqrtf(a.f),
sqrtf(a.g),
sqrtf(a.h));
#endif
}
ccl_device_inline vfloat8 sqr(const vfloat8 a)
{
return a * a;
}
ccl_device_inline bool is_zero(const vfloat8 a)
{
return a == make_vfloat8(0.0f);
}
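/* Horizontal reduction: two hadd rounds leave each 128-bit lane holding the
 * sum of its own four components, and h[0] + h[4] combines the two lanes. */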
ccl_device_inline float reduce_add(const vfloat8 a)
{
#ifdef __KERNEL_AVX__
vfloat8 b(_mm256_hadd_ps(a.m256, a.m256));
vfloat8 h(_mm256_hadd_ps(b.m256, b.m256));
return h[0] + h[4];
#else
return a.a + a.b + a.c + a.d + a.e + a.f + a.g + a.h;
#endif
}
ccl_device_inline float average(const vfloat8 a)
{
return reduce_add(a) / 8.0f;
}
ccl_device_inline vfloat8 min(const vfloat8 a, const vfloat8 b)
{
#ifdef __KERNEL_AVX__
return vfloat8(_mm256_min_ps(a.m256, b.m256));
#else
return make_vfloat8(min(a.a, b.a),
min(a.b, b.b),
min(a.c, b.c),
min(a.d, b.d),
min(a.e, b.e),
min(a.f, b.f),
min(a.g, b.g),
min(a.h, b.h));
#endif
}
ccl_device_inline vfloat8 max(const vfloat8 a, const vfloat8 b)
{
#ifdef __KERNEL_AVX__
return vfloat8(_mm256_max_ps(a.m256, b.m256));
#else
return make_vfloat8(max(a.a, b.a),
max(a.b, b.b),
max(a.c, b.c),
max(a.d, b.d),
max(a.e, b.e),
max(a.f, b.f),
max(a.g, b.g),
max(a.h, b.h));
#endif
}
ccl_device_inline vfloat8 clamp(const vfloat8 a, const vfloat8 mn, const vfloat8 mx)
{
return min(max(a, mn), mx);
}
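/* Per-component select: returns a where the mask lane is set, b otherwise.
 * The AVX path uses _mm256_blendv_ps, which tests only the sign bit of each
 * mask lane, so masks are expected to be all-ones or all-zeros per component. */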
ccl_device_inline vfloat8 select(const vint8 mask, const vfloat8 a, const vfloat8 b)
{
#ifdef __KERNEL_AVX__
return vfloat8(_mm256_blendv_ps(b, a, _mm256_castsi256_ps(mask)));
#else
return make_vfloat8((mask.a) ? a.a : b.a,
(mask.b) ? a.b : b.b,
(mask.c) ? a.c : b.c,
(mask.d) ? a.d : b.d,
(mask.e) ? a.e : b.e,
(mask.f) ? a.f : b.f,
(mask.g) ? a.g : b.g,
(mask.h) ? a.h : b.h);
#endif
}
ccl_device_inline vfloat8 fabs(const vfloat8 a)
{
#ifdef __KERNEL_AVX__
return vfloat8(_mm256_and_ps(a.m256, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))));
#else
return make_vfloat8(fabsf(a.a),
fabsf(a.b),
fabsf(a.c),
fabsf(a.d),
fabsf(a.e),
fabsf(a.f),
fabsf(a.g),
fabsf(a.h));
#endif
}
ccl_device_inline vfloat8 mix(const vfloat8 a, const vfloat8 b, float t)
{
return a + t * (b - a);
}
ccl_device_inline vfloat8 mix(const vfloat8 a, const vfloat8 b, vfloat8 t)
{
return a + t * (b - a);
}
ccl_device_inline vfloat8 saturate(const vfloat8 a)
{
return clamp(a, make_vfloat8(0.0f), make_vfloat8(1.0f));
}
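/* Transcendental functions have no intrinsic path and always evaluate per
 * component with the scalar math library. */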
ccl_device_inline vfloat8 exp(vfloat8 v)
{
return make_vfloat8(
expf(v.a), expf(v.b), expf(v.c), expf(v.d), expf(v.e), expf(v.f), expf(v.g), expf(v.h));
}
ccl_device_inline vfloat8 log(vfloat8 v)
{
return make_vfloat8(
logf(v.a), logf(v.b), logf(v.c), logf(v.d), logf(v.e), logf(v.f), logf(v.g), logf(v.h));
}
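/* _mm256_dp_ps computes an independent dot product within each 128-bit lane,
 * so t[0] and t[4] hold the partial sums of the low and high halves. */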
ccl_device_inline float dot(const vfloat8 a, const vfloat8 b)
{
#ifdef __KERNEL_AVX__
vfloat8 t(_mm256_dp_ps(a.m256, b.m256, 0xFF));
return t[0] + t[4];
#else
return (a.a * b.a) + (a.b * b.b) + (a.c * b.c) + (a.d * b.d) + (a.e * b.e) + (a.f * b.f) +
(a.g * b.g) + (a.h * b.h);
#endif
}
ccl_device_inline vfloat8 pow(vfloat8 v, float e)
{
return make_vfloat8(powf(v.a, e),
powf(v.b, e),
powf(v.c, e),
powf(v.d, e),
powf(v.e, e),
powf(v.f, e),
powf(v.g, e),
powf(v.h, e));
}
ccl_device_inline float reduce_min(const vfloat8 a)
{
return min(min(min(a.a, a.b), min(a.c, a.d)), min(min(a.e, a.f), min(a.g, a.h)));
}
ccl_device_inline float reduce_max(const vfloat8 a)
{
return max(max(max(a.a, a.b), max(a.c, a.d)), max(max(a.e, a.f), max(a.g, a.h)));
}
ccl_device_inline bool isequal(const vfloat8 a, const vfloat8 b)
{
return a == b;
}
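/* Division that yields zero for any component where the divisor is zero. */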
ccl_device_inline vfloat8 safe_divide(const vfloat8 a, const float b)
{
return (b != 0.0f) ? a / b : make_vfloat8(0.0f);
}
ccl_device_inline vfloat8 safe_divide(const vfloat8 a, const vfloat8 b)
{
return make_vfloat8((b.a != 0.0f) ? a.a / b.a : 0.0f,
(b.b != 0.0f) ? a.b / b.b : 0.0f,
(b.c != 0.0f) ? a.c / b.c : 0.0f,
(b.d != 0.0f) ? a.d / b.d : 0.0f,
(b.e != 0.0f) ? a.e / b.e : 0.0f,
(b.f != 0.0f) ? a.f / b.f : 0.0f,
(b.g != 0.0f) ? a.g / b.g : 0.0f,
(b.h != 0.0f) ? a.h / b.h : 0.0f);
}
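/* Apply the scalar ensure_finite / isfinite_safe helpers to each component. */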
ccl_device_inline vfloat8 ensure_finite(vfloat8 v)
{
v.a = ensure_finite(v.a);
v.b = ensure_finite(v.b);
v.c = ensure_finite(v.c);
v.d = ensure_finite(v.d);
v.e = ensure_finite(v.e);
v.f = ensure_finite(v.f);
v.g = ensure_finite(v.g);
v.h = ensure_finite(v.h);
return v;
}
ccl_device_inline bool isfinite_safe(vfloat8 v)
{
return isfinite_safe(v.a) && isfinite_safe(v.b) && isfinite_safe(v.c) && isfinite_safe(v.d) &&
isfinite_safe(v.e) && isfinite_safe(v.f) && isfinite_safe(v.g) && isfinite_safe(v.h);
}
ccl_device_inline vint8 cast(const vfloat8 a)
{
#ifdef __KERNEL_AVX__
return vint8(_mm256_castps_si256(a));
#else
return make_vint8(__float_as_int(a.a),
__float_as_int(a.b),
__float_as_int(a.c),
__float_as_int(a.d),
__float_as_int(a.e),
__float_as_int(a.f),
__float_as_int(a.g),
__float_as_int(a.h));
#endif
}
#ifdef __KERNEL_SSE__
ccl_device_forceinline float4 low(const vfloat8 a)
{
# ifdef __KERNEL_AVX__
return float4(_mm256_extractf128_ps(a.m256, 0));
# else
  return make_float4(a.a, a.b, a.c, a.d);
# endif
}
ccl_device_forceinline float4 high(const vfloat8 a)
{
# ifdef __KERNEL_AVX__
return float4(_mm256_extractf128_ps(a.m256, 1));
# else
  return make_float4(a.e, a.f, a.g, a.h);
# endif
}
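/* Eight-index shuffle with _mm256_permutevar_ps semantics: indices select
 * within each 128-bit half, so i0-i3 address the low half and i4-i7 the high
 * half (only the low two bits of each index are used). */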
template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
ccl_device_forceinline const vfloat8 shuffle(const vfloat8 a)
{
# ifdef __KERNEL_AVX__
return vfloat8(_mm256_permutevar_ps(a, _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0)));
# else
return make_vfloat8(a[i0], a[i1], a[i2], a[i3], a[i4 + 4], a[i5 + 4], a[i6 + 4], a[i7 + 4]);
# endif
}
template<size_t i0, size_t i1, size_t i2, size_t i3>
ccl_device_forceinline const vfloat8 shuffle(const vfloat8 a, const vfloat8 b)
{
# ifdef __KERNEL_AVX__
return vfloat8(_mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)));
# else
  return make_vfloat8(shuffle<i0, i1, i2, i3>(low(a), low(b)),
                      shuffle<i0, i1, i2, i3>(high(a), high(b)));
# endif
}
template<size_t i0, size_t i1, size_t i2, size_t i3>
ccl_device_forceinline const vfloat8 shuffle(const vfloat8 a)
{
return shuffle<i0, i1, i2, i3>(a, a);
}
template<size_t i0> ccl_device_forceinline const vfloat8 shuffle(const vfloat8 a, const vfloat8 b)
{
return shuffle<i0, i0, i0, i0>(a, b);
}
template<size_t i0> ccl_device_forceinline const vfloat8 shuffle(const vfloat8 a)
{
return shuffle<i0>(a, a);
}
template<size_t i> ccl_device_forceinline float extract(const vfloat8 a)
{
# ifdef __KERNEL_AVX__
__m256 b = shuffle<i, i, i, i>(a).m256;
return _mm256_cvtss_f32(b);
# else
return a[i];
# endif
}
#endif
CCL_NAMESPACE_END
#endif /* __UTIL_MATH_FLOAT8_H__ */