glsl: Fix software 64-bit integer to 32-bit float conversions.

The current implementation was broken for any integers between 2^24 and 2^30 (it would return zero for me on ICL). The reason is that for such integers we wouldn't take the 'if (0 <= shiftCount)' early return path; however, 'shiftCount + 7' would be positive, leading to a negative 'count' argument passed to __shift64RightJamming(), which would give undefined results.

This reworks the affected conversion functions to use either __shortShift64Left() or __shift64RightJamming() based on the sign of the final shift count, which should avoid the problem. In addition this should qualify as a clean-up/optimization -- this implementation of the conversion functions translates to 7 fewer instructions than the original on Intel hardware.

This fixes the 'KHR-GL46.shader_ballot_tests.ShaderBallotFunctionBallot' conformance tests on soft fp64 hardware with large enough subgroup size (>16).

Fixes: d5cf6e92b4f7 "glsl: Add built-in functions to do uint64_to_fp32(uint64_t)"
Fixes: c9d333a6b76e "glsl: Add built-in functions to do int64_to_fp32(int64_t)"
Cc: Sagar Ghuge <sagar.ghuge@intel.com>
Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com>
author: Francisco Jerez <currojerez@riseup.net> 2019-12-27 14:10:31 -0800
committer: Francisco Jerez <currojerez@riseup.net> 2020-01-10 10:51:58 -0800
commit: a30bb25a7a495db7b7cb3be50431029f48019fc3 (patch)
tree: 4923a1c68ab80d427b1b8e3037b84a982ced18b0
parent: 8b7a42d6d0b15508940e095642136c53d0c7dcee (diff)
1 files changed, 14 insertions, 22 deletions
diff --git a/src/compiler/glsl/float64.glsl b/src/compiler/glsl/float64.glsl
index 0433d925a39..5f201c8f725 100644
--- a/src/compiler/glsl/float64.glsl
+++ b/src/compiler/glsl/float64.glsl
@@ -1299,43 +1299,35 @@ __fp64_to_fp32(uint64_t __a)
 float
 __uint64_to_fp32(uint64_t __a)
 {
-   uint zFrac = 0u;
    uvec2 aFrac = unpackUint2x32(__a);
-   int shiftCount = __countLeadingZeros32(mix(aFrac.y, aFrac.x, aFrac.y == 0u));
-   shiftCount -= mix(40, 8, aFrac.y == 0u);
+   int shiftCount = mix(__countLeadingZeros32(aFrac.y) - 33,
+                        __countLeadingZeros32(aFrac.x) - 1,
+                        aFrac.y == 0u);
 
-   if (0 <= shiftCount) {
+   if (0 <= shiftCount)
       __shortShift64Left(aFrac.y, aFrac.x, shiftCount, aFrac.y, aFrac.x);
-      bool is_zero = (aFrac.y | aFrac.x) == 0u;
-      return mix(__packFloat32(0u, 0x95 - shiftCount, aFrac.x), 0, is_zero);
-   }
+   else
+      __shift64RightJamming(aFrac.y, aFrac.x, -shiftCount, aFrac.y, aFrac.x);
 
-   shiftCount += 7;
-   __shift64RightJamming(aFrac.y, aFrac.x, -shiftCount, aFrac.y, aFrac.x);
-   zFrac = mix(aFrac.x<<shiftCount, aFrac.x, shiftCount < 0);
-   return __roundAndPackFloat32(0u, 0x9C - shiftCount, zFrac);
+   return __roundAndPackFloat32(0u, 0x9C - shiftCount, aFrac.x);
 }
 
 float
 __int64_to_fp32(int64_t __a)
 {
-   uint zFrac = 0u;
    uint aSign = uint(__a < 0);
    uint64_t absA = mix(uint64_t(__a), uint64_t(-__a), __a < 0);
    uvec2 aFrac = unpackUint2x32(absA);
-   int shiftCount = __countLeadingZeros32(mix(aFrac.y, aFrac.x, aFrac.y == 0u));
-   shiftCount -= mix(40, 8, aFrac.y == 0u);
+   int shiftCount = mix(__countLeadingZeros32(aFrac.y) - 33,
+                        __countLeadingZeros32(aFrac.x) - 1,
+                        aFrac.y == 0u);
 
-   if (0 <= shiftCount) {
+   if (0 <= shiftCount)
       __shortShift64Left(aFrac.y, aFrac.x, shiftCount, aFrac.y, aFrac.x);
-      bool is_zero = (aFrac.y | aFrac.x) == 0u;
-      return mix(__packFloat32(aSign, 0x95 - shiftCount, aFrac.x), 0, absA == 0u);
-   }
+   else
+      __shift64RightJamming(aFrac.y, aFrac.x, -shiftCount, aFrac.y, aFrac.x);
 
-   shiftCount += 7;
-   __shift64RightJamming(aFrac.y, aFrac.x, -shiftCount, aFrac.y, aFrac.x);
-   zFrac = mix(aFrac.x<<shiftCount, aFrac.x, shiftCount < 0);
-   return __roundAndPackFloat32(aSign, 0x9C - shiftCount, zFrac);
+   return __roundAndPackFloat32(aSign, 0x9C - shiftCount, aFrac.x);
 }
 
 /* Returns the result of converting the single-precision floating-point value
author	Francisco Jerez <currojerez@riseup.net>	2019-12-27 14:10:31 -0800
committer	Francisco Jerez <currojerez@riseup.net>	2020-01-10 10:51:58 -0800
commit	a30bb25a7a495db7b7cb3be50431029f48019fc3 (patch)
tree	4923a1c68ab80d427b1b8e3037b84a982ced18b0
parent	8b7a42d6d0b15508940e095642136c53d0c7dcee (diff)