1 files changed, 77 insertions, 15 deletions
diff --git a/src/mesa/tnl/t_vb_arbprogram_sse.c b/src/mesa/tnl/t_vb_arbprogram_sse.c
index 19061c0d8d1..b9126d6d886 100644
--- a/src/mesa/tnl/t_vb_arbprogram_sse.c
+++ b/src/mesa/tnl/t_vb_arbprogram_sse.c
@@ -294,11 +294,12 @@ static GLboolean emit_RSW( struct compilation *cp, union instruction op )
 {
    struct x86_reg arg0 = get_arg(cp, op.rsw.file0, op.rsw.idx0);
    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.rsw.dst);
-   GLuint swz = op.rsw.swz;
+   GLuint swz = GET_SWZ(op.rsw.swz, 0) | (GET_SWZ(op.rsw.swz, 1) << 2) |
+		(GET_SWZ(op.rsw.swz, 2) << 4| (GET_SWZ(op.rsw.swz, 3) << 6));
    GLuint neg = op.rsw.neg;
 
    emit_pshufd(cp, dst, arg0, swz);
-   
+
    if (neg) {
       struct x86_reg negs = get_arg(cp, FILE_REG, REG_SWZ);
       struct x86_reg tmp = get_xmm_reg(cp);
@@ -306,6 +307,7 @@ static GLboolean emit_RSW( struct compilation *cp, union instruction op )
        * Use neg as arg to pshufd
        * Multiply
        */
+      /* is the emit_pshufd necessary? only SWZ can negate individual components */
       emit_pshufd(cp, tmp, negs, 
 		  SHUF((neg & 1) ? 1 : 0,
 		       (neg & 2) ? 1 : 0,
@@ -317,6 +319,64 @@ static GLboolean emit_RSW( struct compilation *cp, union instruction op )
    return GL_TRUE;
 }
 
+/* Perform a full swizzle
+ */
+static GLboolean emit_SWZ( struct compilation *cp, union instruction op ) 
+{
+   struct x86_reg arg0 = get_arg(cp, op.rsw.file0, op.rsw.idx0);
+   struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.rsw.dst);
+   struct x86_reg negs = get_arg(cp, FILE_REG, REG_SWZ);
+   struct x86_reg tmp = get_xmm_reg(cp);
+   GLubyte neg = op.rsw.neg;
+   GLubyte shuf2, swz, savepos, savemask, swizzle[4];
+
+   swizzle[0] = GET_SWZ(op.rsw.swz, 0);
+   swizzle[1] = GET_SWZ(op.rsw.swz, 1);
+   swizzle[2] = GET_SWZ(op.rsw.swz, 2);
+   swizzle[3] = GET_SWZ(op.rsw.swz, 3);
+
+   swz = SHUF((swizzle[0] & 3), (swizzle[1] & 3),
+	      (swizzle[2] & 3), (swizzle[3] & 3));
+
+   emit_pshufd(cp, dst, arg0, swz);
+
+   /* can handle negation and replace with zero with the same shuffle/mul */
+   shuf2 = SHUF(swizzle[0] == 4 ? 2 : (neg & 1),
+	        swizzle[1] == 4 ? 2 : ((neg & 2) >> 1),
+	        swizzle[2] == 4 ? 2 : ((neg & 4) >> 2),
+	        swizzle[3] == 4 ? 2 : ((neg & 8) >> 3));
+
+   /* now the hard part is getting those 1's in there... */
+   savepos = 0;
+   savemask = 0;
+   if (swizzle[0] == 5) savepos = 1;
+   if (swizzle[1] == 5) savepos = 2;
+   else savemask |= 1 << 2;
+   if (swizzle[2] == 5) savepos = 3;
+   else savemask |= 2 << 4;
+   if (swizzle[3] == 5) savepos = 4;
+   else savemask |= 3 << 6;
+   if (savepos) {
+      /* need a mov first as movss from memory will overwrite high bits of xmm reg */
+      sse_movups(&cp->func, tmp, negs);
+      /* can only replace lowest 32bits, thus move away that part first */
+      emit_pshufd(cp, dst, dst, savemask);
+      sse_movss(&cp->func, dst, tmp);
+      emit_pshufd(cp, dst, dst, (savepos - 1) | (savemask & 0xfc));
+   }
+
+   if (shuf2) {
+      /* Load 1,-1,0,0
+       * Use neg as arg to pshufd
+       * Multiply
+       */
+      emit_pshufd(cp, tmp, negs, shuf2);
+      sse_mulps(&cp->func, dst, tmp);
+   }
+
+   return GL_TRUE;
+}
+
 /* Helper for writemask:
  */
 static GLboolean emit_shuf_copy1( struct compilation *cp,
@@ -595,20 +655,19 @@ static GLboolean emit_DPH( struct compilation *cp, union instruction op )
    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0); 
    struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1); 
    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
-   struct x86_reg ones = get_reg_ptr(FILE_REG, REG_ONES);
-   struct x86_reg tmp = get_xmm_reg(cp);      
+   struct x86_reg tmp = get_xmm_reg(cp);
 
-   emit_pshufd(cp, dst, arg0, SHUF(W,X,Y,Z));
-   sse_movss(&cp->func, dst, ones);
-   emit_pshufd(cp, dst, dst, SHUF(W,X,Y,Z));
+   sse_movups(&cp->func, dst, arg0);
    sse_mulps(&cp->func, dst, arg1);
-   
-   /* Now the hard bit: sum the values (from DP4):
+
+   /* Now the hard bit: sum the values (from DP3):
     */ 
    sse_movhlps(&cp->func, tmp, dst);
-   sse_addps(&cp->func, dst, tmp); /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */
+   sse_addss(&cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */
    emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
    sse_addss(&cp->func, dst, tmp);
+   emit_pshufd(cp, tmp, arg1, SHUF(W,W,W,W));
+   sse_addss(&cp->func, dst, tmp);
    sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));
    return GL_TRUE;
 }
@@ -985,15 +1044,18 @@ static GLboolean emit_RSQ( struct compilation *cp, union instruction op )
 {
    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
-
-   /* TODO: Calculate absolute value
-    */
 #if 0
+   struct x86_reg neg = get_reg_ptr(FILE_REG, REG_NEG);
+
+/* get abs value first. This STILL doesn't work.
+   Looks like we get bogus neg values ?
+*/
    sse_movss(&cp->func, dst, arg0);
    sse_mulss(&cp->func, dst, neg);
    sse_maxss(&cp->func, dst, arg0);
-#endif
 
+   sse_rsqrtss(&cp->func, dst, dst);
+#endif
    sse_rsqrtss(&cp->func, dst, arg0);
    sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));
    return GL_TRUE;
@@ -1132,7 +1194,7 @@ static GLboolean (* const emit_func[])(struct compilation *, union instruction)
    emit_NOP, /* SSG */
    emit_NOP, /* STR */
    emit_SUB,
-   emit_RSW, /* SWZ */
+   emit_SWZ, /* SWZ */
    emit_NOP, /* TEX */
    emit_NOP, /* TXB */
    emit_NOP, /* TXD */