2 changed files with 106 additions and 25 deletions
--- a/source/blender/gpu/vulkan/vk_data_conversion.hh
+++ b/source/blender/gpu/vulkan/vk_data_conversion.hh
@ -110,12 +110,14 @@ class FloatingPointFormat {
  static constexpr uint8_t MantissaLen = MantissaBitLen;
  static constexpr uint8_t MantissaShift = 0;
  static constexpr uint32_t MantissaMask = (1 << MantissaBitLen) - 1;
+  static constexpr uint32_t MantissaNanMask = MantissaMask; /* Mantissa stored when the converted value is NaN. */
  static constexpr uint8_t ExponentShift = MantissaBitLen;
  static constexpr uint8_t ExponentLen = ExponentBitLen;
  static constexpr uint32_t ExponentMask = (1 << ExponentBitLen) - 1;
+  static constexpr uint32_t ExponentBias = (1 << (ExponentBitLen - 1)) - 1; /* Offset between stored and actual exponent. */
+  static constexpr int32_t ExponentSpecialMask = ExponentMask; /* All exponent bits set (Inf/NaN); signed as it is compared with an `int32_t` exponent. */
  static constexpr uint8_t SignShift = MantissaBitLen + ExponentBitLen;
  static constexpr uint32_t SignMask = HasSignBit ? 1 : 0;
-  static constexpr uint32_t ExponentBias = (1 << (ExponentBitLen - 1)) - 1;

  static uint32_t get_mantissa(uint32_t floating_point_number)
  {
@ -161,7 +163,7 @@ class FloatingPointFormat {

  static uint32_t set_sign(bool sign, uint32_t floating_point_number)
  {
-    if constexpr (HasSignBit) {
+    if constexpr (!HasSignBit) {
      return floating_point_number;
    }
    uint32_t result = clear_sign(floating_point_number);
@ -182,9 +184,6 @@ using FormatF10 = FloatingPointFormat<false, 5, 5>;
 * convert between the formats. Additional conversion rules can be applied to the conversion
 * function. Due to the implementation the compiler would make an optimized version depending on
 * the actual possibilities.
- *
- * NOTE: Implementation should be extended to support Nan, Inf, -Inf and clamping to min/max when
- * values don't fit in the destination.
 */
 template<
    /**
@ -199,7 +198,7 @@ template<

    /**
     * Should negative values be clamped to zero when DestinationFormat doesn't contain a sign
-     * bit.
+     * bit. Also -Inf will be clamped to zero.
     *
     * When set to `false` and DestinationFormat doesn't contain a sign bit the value will be
     * made absolute.
@ -211,25 +210,46 @@ uint32_t convert_float_formats(uint32_t value)
  uint32_t mantissa = SourceFormat::get_mantissa(value);
  int32_t exponent = SourceFormat::get_exponent(value);

+  const bool is_nan = (exponent == SourceFormat::ExponentSpecialMask) && mantissa;
+  const bool is_inf = (exponent == SourceFormat::ExponentSpecialMask) && (mantissa == 0);
+
  /* Sign conversion */
  if constexpr (!DestinationFormat::HasSign && ClampNegativeToZero) {
-    if (is_signed) {
+    if (is_signed && !is_nan) {
      return 0;
    }
  }

-  /* Mantissa conversion */
-  if constexpr (SourceFormat::MantissaLen > DestinationFormat::MantissaLen) {
-    mantissa = mantissa >> (SourceFormat::MantissaLen - DestinationFormat::MantissaLen);
+  if (is_inf) {
+    exponent = DestinationFormat::ExponentSpecialMask;
  }
-  else if constexpr (SourceFormat::MantissaLen < DestinationFormat::MantissaLen) {
-    mantissa = mantissa << (DestinationFormat::MantissaLen - SourceFormat::MantissaLen);
+  else if (is_nan) {
+    exponent = DestinationFormat::ExponentSpecialMask;
+    mantissa = DestinationFormat::MantissaNanMask;
  }
+  else {
+    /* Exponent conversion */
+    exponent -= SourceFormat::ExponentBias;
+    /* Clamp when the destination has a smaller exponent range than the source.
+     * `exponent` holds the unbiased (actual) exponent at this point. */
+    if constexpr (SourceFormat::ExponentLen > DestinationFormat::ExponentLen) {
+      /* `ExponentBias` is unsigned. Cast it before comparing, otherwise a negative exponent
+       * (any value below 1.0) is converted to a huge unsigned number and wrongly treated as
+       * an overflow. The bias also equals the largest exponent of a finite value. */
+      constexpr int32_t max_exponent = int32_t(DestinationFormat::ExponentBias);
+      if (exponent > max_exponent) {
+        /* Too large for the destination: clamp to its largest finite value
+         * (largest finite exponent, all mantissa bits set). */
+        exponent = max_exponent;
+        mantissa = SourceFormat::MantissaMask;
+      }
+      else if (exponent < -max_exponent) {
+        /* Too small for the destination: flush to zero. */
+        return 0;
+      }
+    }
+    exponent += DestinationFormat::ExponentBias;

-  /* Exponent conversion */
-  exponent += DestinationFormat::ExponentBias - SourceFormat::ExponentBias;
-  // TODO: Clamp to min/max value? only when Destination::ExponentBias <
-  // SourceFormat::ExponentBias.
+    /* Mantissa conversion */
+    if constexpr (SourceFormat::MantissaLen > DestinationFormat::MantissaLen) {
+      mantissa = mantissa >> (SourceFormat::MantissaLen - DestinationFormat::MantissaLen);
+    }
+    else if constexpr (SourceFormat::MantissaLen < DestinationFormat::MantissaLen) {
+      mantissa = mantissa << (DestinationFormat::MantissaLen - SourceFormat::MantissaLen);
+    }
+  }

  uint32_t result = 0;
  result = DestinationFormat::set_sign(is_signed, result);
--- a/source/blender/gpu/vulkan/vk_data_conversion_test.cc
+++ b/source/blender/gpu/vulkan/vk_data_conversion_test.cc
@ -5,22 +5,83 @@
 namespace blender::gpu::tests {
 TEST(VulkanDataConversion, ConvertF32F16)
 {
-  uint32_t f32_2 = 0b01000000000000000000000000000000;
-  uint32_t f16_2_expected = 0b0100000000000000;
-  uint32_t f16_2 = convert_float_formats<FormatF16, FormatF32>(f32_2);
+  const uint32_t f32_2 = 0b01000000000000000000000000000000; /* 2.0 as 32 bit float. */
+  const uint32_t f16_2_expected = 0b0100000000000000; /* 2.0 as 16 bit float. */
+  const uint32_t f16_2 = convert_float_formats<FormatF16, FormatF32>(f32_2);
  EXPECT_EQ(f16_2, f16_2_expected);

-  uint32_t f32_3 = 0b01000000010000000000000000000000;
-  uint32_t f16_3_expected = 0b0100001000000000;
-  uint32_t f16_3 = convert_float_formats<FormatF16, FormatF32>(f32_3);
+  const uint32_t f32_3 = 0b01000000010000000000000000000000; /* 3.0 as 32 bit float. */
+  const uint32_t f16_3_expected = 0b0100001000000000; /* 3.0 as 16 bit float. */
+  const uint32_t f16_3 = convert_float_formats<FormatF16, FormatF32>(f32_3);
  EXPECT_EQ(f16_3, f16_3_expected);

-  uint32_t f32_4 = 0b01000000100000000000000000000000;
-  uint32_t f16_4_expected = 0b0100010000000000;
-  uint32_t f16_4 = convert_float_formats<FormatF16, FormatF32>(f32_4);
+  const uint32_t f32_4 = 0b01000000100000000000000000000000; /* 4.0 as 32 bit float. */
+  const uint32_t f16_4_expected = 0b0100010000000000; /* 4.0 as 16 bit float. */
+  const uint32_t f16_4 = convert_float_formats<FormatF16, FormatF32>(f32_4);
  EXPECT_EQ(f16_4, f16_4_expected);
 }

+TEST(VulkanDataConversion, clamp_negative_to_zero)
+{
+  const uint32_t f32_2 = 0b11000000000000000000000000000000; /* -2.0 */
+  const uint32_t f32_inf_min = 0b11111111100000000000000000000000; /* -Inf */
+  const uint32_t f32_inf_max = 0b01111111100000000000000000000000; /* +Inf */
+  const uint32_t f32_nan = 0b11111111111111111111111111111111; /* NaN with the sign bit set. */
+
+  /* F32(-2) fits in F16: the sign bit is kept whether or not clamping is requested. */
+  const uint32_t f16_2_expected = 0b1100000000000000;
+  const uint32_t f16_2a = convert_float_formats<FormatF16, FormatF32, true>(f32_2);
+  EXPECT_EQ(f16_2a, f16_2_expected);
+
+  const uint32_t f16_2b = convert_float_formats<FormatF16, FormatF32, false>(f32_2);
+  EXPECT_EQ(f16_2b, f16_2_expected);
+
+  /* Expected F11 bit patterns. F32(-2) doesn't fit in F11 as F11 only supports unsigned values. */
+  const uint32_t f11_0_expected = 0b00000000000; /* Zero. */
+  const uint32_t f11_2_expected = 0b10000000000; /* 2.0 */
+  const uint32_t f11_inf_expected = 0b11111000000; /* Inf: all exponent bits set, mantissa zero. */
+  const uint32_t f11_nan_expected = 0b11111111111; /* NaN: all exponent and mantissa bits set. */
+  { /* Clamp to zero: -2 and -Inf become zero, +Inf and NaN are kept. */
+    const uint32_t f11_0 = convert_float_formats<FormatF11, FormatF32, true>(f32_2);
+    EXPECT_EQ(f11_0, f11_0_expected);
+    const uint32_t f11_0b = convert_float_formats<FormatF11, FormatF32, true>(f32_inf_min);
+    EXPECT_EQ(f11_0b, f11_0_expected);
+    const uint32_t f11_inf = convert_float_formats<FormatF11, FormatF32, true>(f32_inf_max);
+    EXPECT_EQ(f11_inf, f11_inf_expected);
+    const uint32_t f11_nan = convert_float_formats<FormatF11, FormatF32, true>(f32_nan);
+    EXPECT_EQ(f11_nan, f11_nan_expected);
+  }
+
+  /* F32(-2) doesn't fit in F11 as F11 only supports unsigned values. Make absolute. */
+  { /* Without clamping the sign is dropped: -2 becomes 2 and -Inf becomes Inf. */
+    const uint32_t f11_2 = convert_float_formats<FormatF11, FormatF32, false>(f32_2);
+    EXPECT_EQ(f11_2, f11_2_expected);
+    const uint32_t f11_inf = convert_float_formats<FormatF11, FormatF32, false>(f32_inf_min);
+    EXPECT_EQ(f11_inf, f11_inf_expected);
+    const uint32_t f11_infb = convert_float_formats<FormatF11, FormatF32, false>(f32_inf_max);
+    EXPECT_EQ(f11_infb, f11_inf_expected);
+    const uint32_t f11_nan = convert_float_formats<FormatF11, FormatF32, false>(f32_nan);
+    EXPECT_EQ(f11_nan, f11_nan_expected);
+  }
+}
+
+TEST(VulkanDataConversion, infinity_upper)
+{
+  const uint32_t f32_inf = 0b01111111100000000000000000000000; /* +Inf as 32 bit float. */
+
+  const uint32_t f16_inf_expected = 0b0111110000000000; /* Sign clear, exponent all ones, mantissa zero. */
+  const uint32_t f16_inf = convert_float_formats<FormatF16, FormatF32, true>(f32_inf);
+  EXPECT_EQ(f16_inf, f16_inf_expected);
+
+  const uint32_t f11_inf_expected = 0b11111000000; /* Exponent all ones, mantissa zero. */
+  const uint32_t f11_inf = convert_float_formats<FormatF11, FormatF32, true>(f32_inf);
+  EXPECT_EQ(f11_inf, f11_inf_expected);
+
+  const uint32_t f10_inf_expected = 0b1111100000; /* Exponent all ones, mantissa zero. */
+  const uint32_t f10_inf = convert_float_formats<FormatF10, FormatF32, true>(f32_inf);
+  EXPECT_EQ(f10_inf, f10_inf_expected);
+}
+
 // TODO: add test case for Nan, Inf, -Inf, Clamping

 }  // namespace blender::gpu::tests