translate: improve sse2 32-bit unsigned -> float conversion

The existing logic shifted the 32-bit unsigned value right by one so that it fit in a signed integer, which dropped the low bit. Instead, mask off the high bit, do the signed conversion, and then add 2147483648.0 back in if the value originally had the high bit set.

Fixes KHR-GL45.direct_state_access.vertex_arrays_attribute_format on drivers that use this module to handle the format conversion.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Acked-by: Emma Anholt <emma@anholt.net>
Tested-By: Mike Blumenkrantz <michael.blumenkrantz@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14922>
author: Ilia Mirkin <imirkin@alum.mit.edu> 2022-02-07 23:40:25 -0500
committer: Marge Bot <emma+marge@anholt.net> 2022-02-09 06:04:25 +0000
commit: 5200e1c2120f3143dcd11eb0bc0b4ddf8ee62f86 (patch)
tree: b693bea059407b5db70e69ee71cbdf156038bf37
parent: 0b69f7b15d526fc763f1dd4aafbc358449aa9ac6 (diff)
2 files changed, 42 insertions, 14 deletions
diff --git a/src/gallium/auxiliary/translate/translate_sse.c b/src/gallium/auxiliary/translate/translate_sse.c
index c128ac3da7e..91f0ea6e4cd 100644
--- a/src/gallium/auxiliary/translate/translate_sse.c
+++ b/src/gallium/auxiliary/translate/translate_sse.c
@@ -64,7 +64,8 @@ struct translate_buffer_variant
 
 #define ELEMENT_BUFFER_INSTANCE_ID  1001
 
-#define NUM_CONSTS 7
+#define NUM_FLOAT_CONSTS 9
+#define NUM_UNSIGNED_CONSTS 1
 
 enum
 {
@@ -74,22 +75,32 @@ enum
    CONST_INV_32767,
    CONST_INV_65535,
    CONST_INV_2147483647,
-   CONST_255
+   CONST_INV_4294967295,
+   CONST_255,
+   CONST_2147483648,
+   /* float consts end */
+   CONST_2147483647_INT,
 };
 
 #define C(v) {(float)(v), (float)(v), (float)(v), (float)(v)}
-static float consts[NUM_CONSTS][4] = {
+static float consts[NUM_FLOAT_CONSTS][4] = {
    {0, 0, 0, 1},
    C(1.0 / 127.0),
    C(1.0 / 255.0),
    C(1.0 / 32767.0),
    C(1.0 / 65535.0),
    C(1.0 / 2147483647.0),
-   C(255.0)
+   C(1.0 / 4294967295.0),
+   C(255.0),
+   C(2147483648.0),
 };
 
 #undef C
 
+static unsigned uconsts[NUM_UNSIGNED_CONSTS][4] = {
+   {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff},
+};
+
 struct translate_sse
 {
    struct translate translate;
@@ -100,9 +111,10 @@ struct translate_sse
    struct x86_function elt8_func;
    struct x86_function *func;
 
-     PIPE_ALIGN_VAR(16) float consts[NUM_CONSTS][4];
+   PIPE_ALIGN_VAR(16) float consts[NUM_FLOAT_CONSTS][4];
+   PIPE_ALIGN_VAR(16) float uconsts[NUM_UNSIGNED_CONSTS][4];
    int8_t reg_to_const[16];
-   int8_t const_to_reg[NUM_CONSTS];
+   int8_t const_to_reg[NUM_FLOAT_CONSTS + NUM_UNSIGNED_CONSTS];
 
    struct translate_buffer buffer[TRANSLATE_MAX_ATTRIBS];
    unsigned nr_buffers;
@@ -165,9 +177,13 @@ get_const(struct translate_sse *p, unsigned id)
    p->const_to_reg[id] = i;
 
    /* TODO: this should happen outside the loop, if possible */
+   const void *c;
+   if (id < NUM_FLOAT_CONSTS)
+      c = &p->consts[id][0];
+   else
+      c = &p->uconsts[id - NUM_FLOAT_CONSTS][0];
    sse_movaps(p->func, reg,
-              x86_make_disp(p->machine_EDI,
-                            get_offset(p, &p->consts[id][0])));
+              x86_make_disp(p->machine_EDI, get_offset(p, c)));
 
    return reg;
 }
@@ -508,6 +524,7 @@ translate_attr_convert(struct translate_sse *p,
         || a->output_format == PIPE_FORMAT_R32G32B32_FLOAT
         || a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT)) {
       struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
+      struct x86_reg auxXMM;
 
       for (i = 0; i < output_desc->nr_channels; ++i) {
          if (swizzle[i] == PIPE_SWIZZLE_0
@@ -544,12 +561,26 @@ translate_attr_convert(struct translate_sse *p,
                sse2_punpcklwd(p->func, dataXMM, get_const(p, CONST_IDENTITY));
                break;
             case 32:           /* we lose precision here */
-               sse2_psrld_imm(p->func, dataXMM, 1);
+               /* No unsigned conversion (except in AVX512F), so we check if
+                * it's negative, and stick the high bit as a separate float
+                * value in an aux register: */
+               auxXMM = x86_make_reg(file_XMM, 1);
+               /* aux = 0 */
+               sse_xorps(p->func, auxXMM, auxXMM);
+               /* aux = aux > data ? 0xffffffff : 0 */
+               sse2_pcmpgtd(p->func, auxXMM, dataXMM);
+               /* data = data & 0x7fffffff */
+               sse_andps(p->func, dataXMM, get_const(p, CONST_2147483647_INT));
+               /* aux = aux & 2147483648.0 */
+               sse_andps(p->func, auxXMM, get_const(p, CONST_2147483648));
                break;
             default:
                return FALSE;
             }
             sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
+            if (input_desc->channel[0].size == 32)
+               /* add in the high bit's worth of float that we AND'd away */
+               sse_addps(p->func, dataXMM, auxXMM);
             if (input_desc->channel[0].normalized) {
                struct x86_reg factor;
                switch (input_desc->channel[0].size) {
@@ -560,7 +591,7 @@ translate_attr_convert(struct translate_sse *p,
                   factor = get_const(p, CONST_INV_65535);
                   break;
                case 32:
-                  factor = get_const(p, CONST_INV_2147483647);
+                  factor = get_const(p, CONST_INV_4294967295);
                   break;
                default:
                   assert(0);
@@ -572,9 +603,6 @@ translate_attr_convert(struct translate_sse *p,
                }
                sse_mulps(p->func, dataXMM, factor);
             }
-            else if (input_desc->channel[0].size == 32)
-               /* compensate for the bit we threw away to fit u32 into s32 */
-               sse_addps(p->func, dataXMM, dataXMM);
             break;
          case UTIL_FORMAT_TYPE_SIGNED:
             if (!(x86_target_caps(p->func) & X86_SSE2))
@@ -1491,6 +1519,7 @@ translate_sse2_create(const struct translate_key *key)
 
    memset(p, 0, sizeof(*p));
    memcpy(p->consts, consts, sizeof(consts));
+   memcpy(p->uconsts, uconsts, sizeof(uconsts));
 
    p->translate.key = *key;
    p->translate.release = translate_sse_release;
diff --git a/src/gallium/drivers/zink/ci/zink-lvp-fails.txt b/src/gallium/drivers/zink/ci/zink-lvp-fails.txt
index 103d922a57b..1ab12943331 100644
--- a/src/gallium/drivers/zink/ci/zink-lvp-fails.txt
+++ b/src/gallium/drivers/zink/ci/zink-lvp-fails.txt
@@ -1,5 +1,4 @@
 KHR-GL46.compute_shader.conditional-dispatching,Fail
-KHR-GL46.direct_state_access.vertex_arrays_attribute_format,Fail
 KHR-GL46.gpu_shader_fp64.builtin.mod_dvec2,Fail
 KHR-GL46.gpu_shader_fp64.builtin.mod_dvec3,Fail
 KHR-GL46.gpu_shader_fp64.builtin.mod_dvec4,Fail
author	Ilia Mirkin <imirkin@alum.mit.edu>	2022-02-07 23:40:25 -0500
committer	Marge Bot <emma+marge@anholt.net>	2022-02-09 06:04:25 +0000
commit	5200e1c2120f3143dcd11eb0bc0b4ddf8ee62f86 (patch)
tree	b693bea059407b5db70e69ee71cbdf156038bf37
parent	0b69f7b15d526fc763f1dd4aafbc358449aa9ac6 (diff)