diff options
Diffstat (limited to 'src/gallium/drivers/llvmpipe/lp_bld_interp.c')
-rw-r--r-- | src/gallium/drivers/llvmpipe/lp_bld_interp.c | 429 |
1 files changed, 343 insertions, 86 deletions
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.c b/src/gallium/drivers/llvmpipe/lp_bld_interp.c index 0d51ccb0349..d108f35f719 100644 --- a/src/gallium/drivers/llvmpipe/lp_bld_interp.c +++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.c @@ -61,6 +61,9 @@ * # | # | # * ################# * + * If we iterate over multiple quads at once, quads 01 and 23 are processed + * together. + * * Within each quad, we have four pixels which are represented in SOA * order: * @@ -72,6 +75,10 @@ * * So the green channel (for example) of the four pixels is stored in * a single vector register: {g0, g1, g2, g3}. + * The order stays the same even with multiple quads: + * 0 1 4 5 + * 2 3 6 7 + * is stored as g0..g7 */ @@ -102,8 +109,8 @@ #define PERSPECTIVE_DIVIDE_PER_QUAD 0 -static const unsigned char quad_offset_x[4] = {0, 1, 0, 1}; -static const unsigned char quad_offset_y[4] = {0, 0, 1, 1}; +static const unsigned char quad_offset_x[16] = {0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3}; +static const unsigned char quad_offset_y[16] = {0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3}; static void @@ -115,132 +122,353 @@ attrib_name(LLVMValueRef val, unsigned attrib, unsigned chan, const char *suffix lp_build_name(val, "input%u.%c%s", attrib - 1, "xyzw"[chan], suffix); } - -/** - * Initialize the bld->a0, dadx, dady fields. This involves fetching - * those values from the arrays which are passed into the JIT function. +/* Much easier, and significantly less instructions in the per-stamp + * part (less than half) but overall more instructions so a loss if + * most quads are active. Might be a win though with larger vectors. + * No ability to do per-quad divide (doable but not implemented) + * Could be made to work with passed in pixel offsets (i.e. active quad merging). */ static void -coeffs_init(struct lp_build_interp_soa_context *bld, - LLVMValueRef a0_ptr, - LLVMValueRef dadx_ptr, - LLVMValueRef dady_ptr) +coeffs_init_simple(struct lp_build_interp_soa_context *bld, + LLVMValueRef a0_ptr, + LLVMValueRef dadx_ptr, + LLVMValueRef dady_ptr) { struct lp_build_context *coeff_bld = &bld->coeff_bld; + struct lp_build_context *setup_bld = &bld->setup_bld; struct gallivm_state *gallivm = coeff_bld->gallivm; LLVMBuilderRef builder = gallivm->builder; - LLVMValueRef zero = LLVMConstNull(coeff_bld->elem_type); - LLVMValueRef one = LLVMConstReal(coeff_bld->elem_type, 1.0); - LLVMValueRef i0 = lp_build_const_int32(gallivm, 0); - LLVMValueRef i1 = lp_build_const_int32(gallivm, 1); - LLVMValueRef i2 = lp_build_const_int32(gallivm, 2); - LLVMValueRef i3 = lp_build_const_int32(gallivm, 3); unsigned attrib; - unsigned chan; - - /* TODO: Use more vector operations */ for (attrib = 0; attrib < bld->num_attribs; ++attrib) { + /* + * always fetch all 4 values for performance/simplicity + * Note: we do that here because it seems to generate better + * code. It generates a lot of moves initially but less + * moves later. As far as I can tell this looks like a + * llvm issue, instead of simply reloading the values from + * the passed in pointers it if it runs out of registers + * it spills/reloads them. Maybe some optimization passes + * would help. + * Might want to investigate this again later. + */ + const unsigned interp = bld->interp[attrib]; + LLVMValueRef index = lp_build_const_int32(gallivm, + attrib * TGSI_NUM_CHANNELS); + LLVMValueRef ptr; + LLVMValueRef dadxaos = setup_bld->zero; + LLVMValueRef dadyaos = setup_bld->zero; + LLVMValueRef a0aos = setup_bld->zero; + + switch (interp) { + case LP_INTERP_PERSPECTIVE: + /* fall-through */ + + case LP_INTERP_LINEAR: + ptr = LLVMBuildGEP(builder, dadx_ptr, &index, 1, ""); + ptr = LLVMBuildBitCast(builder, ptr, + LLVMPointerType(setup_bld->vec_type, 0), ""); + dadxaos = LLVMBuildLoad(builder, ptr, ""); + + ptr = LLVMBuildGEP(builder, dady_ptr, &index, 1, ""); + ptr = LLVMBuildBitCast(builder, ptr, + LLVMPointerType(setup_bld->vec_type, 0), ""); + dadyaos = LLVMBuildLoad(builder, ptr, ""); + + attrib_name(dadxaos, attrib, 0, ".dadxaos"); + attrib_name(dadyaos, attrib, 0, ".dadyaos"); + /* fall-through */ + + case LP_INTERP_CONSTANT: + case LP_INTERP_FACING: + ptr = LLVMBuildGEP(builder, a0_ptr, &index, 1, ""); + ptr = LLVMBuildBitCast(builder, ptr, + LLVMPointerType(setup_bld->vec_type, 0), ""); + a0aos = LLVMBuildLoad(builder, ptr, ""); + attrib_name(a0aos, attrib, 0, ".a0aos"); + break; + + case LP_INTERP_POSITION: + /* Nothing to do as the position coeffs are already setup in slot 0 */ + continue; + + default: + assert(0); + break; + } + bld->a0aos[attrib] = a0aos; + bld->dadxaos[attrib] = dadxaos; + bld->dadyaos[attrib] = dadyaos; + } +} + +/** + * Interpolate the shader input attribute values. + * This is called for each (group of) quad(s). + */ +static void +attribs_update_simple(struct lp_build_interp_soa_context *bld, + struct gallivm_state *gallivm, + int quad_start_index, + int start, + int end) +{ + LLVMBuilderRef builder = gallivm->builder; + struct lp_build_context *coeff_bld = &bld->coeff_bld; + struct lp_build_context *setup_bld = &bld->setup_bld; + LLVMValueRef oow = NULL; + unsigned attrib, i; + LLVMValueRef pixoffx; + LLVMValueRef pixoffy; + unsigned num_pix = coeff_bld->type.length; + + /* could do this with code-generated passed in pixel offsets */ + pixoffx = coeff_bld->undef; + pixoffy = coeff_bld->undef; + for (i = 0; i < coeff_bld->type.length; i++) { + LLVMValueRef nr = lp_build_const_int32(gallivm, i); + LLVMValueRef pixxf = lp_build_const_float(gallivm, quad_offset_x[i % num_pix] + + (quad_start_index & 1) * 2); + LLVMValueRef pixyf = lp_build_const_float(gallivm, quad_offset_y[i % num_pix] + + (quad_start_index & 2)); + pixoffx = LLVMBuildInsertElement(builder, pixoffx, pixxf, nr, ""); + pixoffy = LLVMBuildInsertElement(builder, pixoffy, pixyf, nr, ""); + } + + pixoffx = LLVMBuildFAdd(builder, pixoffx, + lp_build_broadcast_scalar(coeff_bld, bld->x), ""); + pixoffy = LLVMBuildFAdd(builder, pixoffy, + lp_build_broadcast_scalar(coeff_bld, bld->y), ""); + + for (attrib = start; attrib < end; attrib++) { const unsigned mask = bld->mask[attrib]; const unsigned interp = bld->interp[attrib]; - for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) { + unsigned chan; + + for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { if (mask & (1 << chan)) { - LLVMValueRef index = lp_build_const_int32(gallivm, - attrib * TGSI_NUM_CHANNELS + chan); - LLVMValueRef a0 = zero; - LLVMValueRef dadx = zero; - LLVMValueRef dady = zero; - LLVMValueRef dadxy = zero; - LLVMValueRef dadq; - LLVMValueRef dadq2; - LLVMValueRef a; + LLVMValueRef index; + LLVMValueRef dadx = coeff_bld->zero; + LLVMValueRef dady = coeff_bld->zero; + LLVMValueRef a = coeff_bld->zero; + index = lp_build_const_int32(gallivm, chan); switch (interp) { case LP_INTERP_PERSPECTIVE: /* fall-through */ case LP_INTERP_LINEAR: if (attrib == 0 && chan == 0) { - dadxy = dadx = one; + dadx = coeff_bld->one; } else if (attrib == 0 && chan == 1) { - dadxy = dady = one; + dady = coeff_bld->one; } else { - dadx = LLVMBuildLoad(builder, LLVMBuildGEP(builder, dadx_ptr, &index, 1, ""), ""); - dady = LLVMBuildLoad(builder, LLVMBuildGEP(builder, dady_ptr, &index, 1, ""), ""); - dadxy = LLVMBuildFAdd(builder, dadx, dady, ""); - attrib_name(dadx, attrib, chan, ".dadx"); - attrib_name(dady, attrib, chan, ".dady"); - attrib_name(dadxy, attrib, chan, ".dadxy"); + dadx = lp_build_extract_broadcast(gallivm, setup_bld->type, + coeff_bld->type, bld->dadxaos[attrib], + index); + dady = lp_build_extract_broadcast(gallivm, setup_bld->type, + coeff_bld->type, bld->dadyaos[attrib], + index); + a = lp_build_extract_broadcast(gallivm, setup_bld->type, + coeff_bld->type, bld->a0aos[attrib], + index); } - /* fall-through */ + /* + * a = a0 + (x * dadx + y * dady) + */ + dadx = LLVMBuildFMul(builder, dadx, pixoffx, ""); + dady = LLVMBuildFMul(builder, dady, pixoffy, ""); + a = LLVMBuildFAdd(builder, a, dadx, ""); + a = LLVMBuildFAdd(builder, a, dady, ""); + + if (interp == LP_INTERP_PERSPECTIVE) { + if (oow == NULL) { + LLVMValueRef w = bld->attribs[0][3]; + assert(attrib != 0); + assert(bld->mask[0] & TGSI_WRITEMASK_W); + oow = lp_build_rcp(coeff_bld, w); + } + a = lp_build_mul(coeff_bld, a, oow); + } + break; case LP_INTERP_CONSTANT: case LP_INTERP_FACING: - a0 = LLVMBuildLoad(builder, LLVMBuildGEP(builder, a0_ptr, &index, 1, ""), ""); - attrib_name(a0, attrib, chan, ".a0"); + a = lp_build_extract_broadcast(gallivm, setup_bld->type, + coeff_bld->type, bld->a0aos[attrib], + index); break; case LP_INTERP_POSITION: - /* Nothing to do as the position coeffs are already setup in slot 0 */ - continue; + assert(attrib > 0); + a = bld->attribs[0][chan]; + break; default: assert(0); break; } - /* - * dadq = {0, dadx, dady, dadx + dady} - */ + if ((attrib == 0) && (chan == 2)){ + /* FIXME: Depth values can exceed 1.0, due to the fact that + * setup interpolation coefficients refer to (0,0) which causes + * precision loss. So we must clamp to 1.0 here to avoid artifacts + */ + a = lp_build_min(coeff_bld, a, coeff_bld->one); + } + bld->attribs[attrib][chan] = a; + } + } + } +} - dadq = coeff_bld->undef; - dadq = LLVMBuildInsertElement(builder, dadq, zero, i0, ""); - dadq = LLVMBuildInsertElement(builder, dadq, dadx, i1, ""); - dadq = LLVMBuildInsertElement(builder, dadq, dady, i2, ""); - dadq = LLVMBuildInsertElement(builder, dadq, dadxy, i3, ""); +/** + * Initialize the bld->a, dadq fields. This involves fetching + * those values from the arrays which are passed into the JIT function. + */ +static void +coeffs_init(struct lp_build_interp_soa_context *bld, + LLVMValueRef a0_ptr, + LLVMValueRef dadx_ptr, + LLVMValueRef dady_ptr) +{ + struct lp_build_context *coeff_bld = &bld->coeff_bld; + struct lp_build_context *setup_bld = &bld->setup_bld; + struct gallivm_state *gallivm = coeff_bld->gallivm; + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef pixoffx, pixoffy; + unsigned attrib; + unsigned chan; + unsigned i; + + pixoffx = coeff_bld->undef; + pixoffy = coeff_bld->undef; + for (i = 0; i < coeff_bld->type.length; i++) { + LLVMValueRef nr = lp_build_const_int32(gallivm, i); + LLVMValueRef pixxf = lp_build_const_float(gallivm, quad_offset_x[i]); + LLVMValueRef pixyf = lp_build_const_float(gallivm, quad_offset_y[i]); + pixoffx = LLVMBuildInsertElement(builder, pixoffx, pixxf, nr, ""); + pixoffy = LLVMBuildInsertElement(builder, pixoffy, pixyf, nr, ""); + } - /* - * dadq2 = 2 * dq - */ - dadq2 = LLVMBuildFAdd(builder, dadq, dadq, ""); + for (attrib = 0; attrib < bld->num_attribs; ++attrib) { + const unsigned mask = bld->mask[attrib]; + const unsigned interp = bld->interp[attrib]; + LLVMValueRef index = lp_build_const_int32(gallivm, + attrib * TGSI_NUM_CHANNELS); + LLVMValueRef ptr; + LLVMValueRef dadxaos = setup_bld->zero; + LLVMValueRef dadyaos = setup_bld->zero; + LLVMValueRef a0aos = setup_bld->zero; + + /* always fetch all 4 values for performance/simplicity */ + switch (interp) { + case LP_INTERP_PERSPECTIVE: + /* fall-through */ + + case LP_INTERP_LINEAR: + ptr = LLVMBuildGEP(builder, dadx_ptr, &index, 1, ""); + ptr = LLVMBuildBitCast(builder, ptr, + LLVMPointerType(setup_bld->vec_type, 0), ""); + dadxaos = LLVMBuildLoad(builder, ptr, ""); + + ptr = LLVMBuildGEP(builder, dady_ptr, &index, 1, ""); + ptr = LLVMBuildBitCast(builder, ptr, + LLVMPointerType(setup_bld->vec_type, 0), ""); + dadyaos = LLVMBuildLoad(builder, ptr, ""); + + attrib_name(dadxaos, attrib, 0, ".dadxaos"); + attrib_name(dadyaos, attrib, 0, ".dadyaos"); + /* fall-through */ + + case LP_INTERP_CONSTANT: + case LP_INTERP_FACING: + ptr = LLVMBuildGEP(builder, a0_ptr, &index, 1, ""); + ptr = LLVMBuildBitCast(builder, ptr, + LLVMPointerType(setup_bld->vec_type, 0), ""); + a0aos = LLVMBuildLoad(builder, ptr, ""); + attrib_name(a0aos, attrib, 0, ".a0aos"); + break; + + case LP_INTERP_POSITION: + /* Nothing to do as the position coeffs are already setup in slot 0 */ + continue; + + default: + assert(0); + break; + } - /* - * a = a0 + (x * dadx + y * dady) - */ + /* + * a = a0 + (x * dadx + y * dady) + * a0aos is the attrib value at top left corner of stamp + */ + if (interp != LP_INTERP_CONSTANT && + interp != LP_INTERP_FACING) { + LLVMValueRef axaos, ayaos; + axaos = LLVMBuildFMul(builder, lp_build_broadcast_scalar(setup_bld, bld->x), + dadxaos, ""); + ayaos = LLVMBuildFMul(builder, lp_build_broadcast_scalar(setup_bld, bld->y), + dadyaos, ""); + a0aos = LLVMBuildFAdd(builder, a0aos, ayaos, ""); + a0aos = LLVMBuildFAdd(builder, a0aos, axaos, ""); + } + + /* + * dadq = {0, dadx, dady, dadx + dady} + * for two quads (side by side) this is: + * {0, dadx, dady, dadx+dady, 2*dadx, 2*dadx+dady, 3*dadx+dady} + */ + for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) { + /* this generates a CRAPLOAD of shuffles... */ + if (mask & (1 << chan)) { + LLVMValueRef dadx, dady; + LLVMValueRef dadq, dadq2; + LLVMValueRef a; + LLVMValueRef chan_index = lp_build_const_int32(gallivm, chan); if (attrib == 0 && chan == 0) { - a = bld->x; + a = lp_build_broadcast_scalar(coeff_bld, bld->x); + dadx = coeff_bld->one; + dady = coeff_bld->zero; } else if (attrib == 0 && chan == 1) { - a = bld->y; + a = lp_build_broadcast_scalar(coeff_bld, bld->y); + dady = coeff_bld->one; + dadx = coeff_bld->zero; } else { - a = a0; - if (interp != LP_INTERP_CONSTANT && - interp != LP_INTERP_FACING) { - LLVMValueRef ax, ay, axy; - ax = LLVMBuildFMul(builder, bld->x, dadx, ""); - ay = LLVMBuildFMul(builder, bld->y, dady, ""); - axy = LLVMBuildFAdd(builder, ax, ay, ""); - a = LLVMBuildFAdd(builder, a, axy, ""); - } - } + dadx = lp_build_extract_broadcast(gallivm, setup_bld->type, + coeff_bld->type, dadxaos, chan_index); + dady = lp_build_extract_broadcast(gallivm, setup_bld->type, + coeff_bld->type, dadyaos, chan_index); - /* - * a = {a, a, a, a} - */ + /* + * a = {a, a, a, a} + */ + a = lp_build_extract_broadcast(gallivm, setup_bld->type, + coeff_bld->type, a0aos, chan_index); + } - a = lp_build_broadcast(gallivm, coeff_bld->vec_type, a); + dadx = LLVMBuildFMul(builder, dadx, pixoffx, ""); + dady = LLVMBuildFMul(builder, dady, pixoffy, ""); + dadq = LLVMBuildFAdd(builder, dadx, dady, ""); /* - * Compute the attrib values on the upper-left corner of each quad. + * Compute the attrib values on the upper-left corner of each + * group of quads. + * Note that if we process 2 quads at once this doesn't + * really exactly to what we want. + * We need to access elem 0 and 2 respectively later if we process + * 2 quads at once. */ if (interp != LP_INTERP_CONSTANT && interp != LP_INTERP_FACING) { + dadq2 = LLVMBuildFAdd(builder, dadq, dadq, ""); a = LLVMBuildFAdd(builder, a, dadq2, ""); } @@ -249,6 +477,12 @@ coeffs_init(struct lp_build_interp_soa_context *bld, * a *= 1 / w */ + /* + * XXX since we're only going to access elements 0,2 out of 8 + * if we have 8-wide vectors we should do the division only 4-wide. + * a is really a 2-elements in a 4-wide vector disguised as 8-wide + * in this case. + */ if (interp == LP_INTERP_PERSPECTIVE) { LLVMValueRef w = bld->a[0][3]; assert(attrib != 0); @@ -279,18 +513,18 @@ coeffs_init(struct lp_build_interp_soa_context *bld, static void attribs_update(struct lp_build_interp_soa_context *bld, struct gallivm_state *gallivm, - int quad_index, + int quad_start_index, int start, int end) { LLVMBuilderRef builder = gallivm->builder; struct lp_build_context *coeff_bld = &bld->coeff_bld; - LLVMValueRef shuffle = lp_build_const_int_vec(gallivm, coeff_bld->type, quad_index); + LLVMValueRef shuffle = lp_build_const_int_vec(gallivm, coeff_bld->type, quad_start_index); LLVMValueRef oow = NULL; unsigned attrib; unsigned chan; - assert(quad_index < 4); + assert(quad_start_index < 4); for(attrib = start; attrib < end; ++attrib) { const unsigned mask = bld->mask[attrib]; @@ -412,6 +646,7 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld, LLVMValueRef y0) { struct lp_type coeff_type; + struct lp_type setup_type; unsigned attrib; unsigned chan; @@ -421,19 +656,26 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld, coeff_type.floating = TRUE; coeff_type.sign = TRUE; coeff_type.width = 32; - coeff_type.length = TGSI_QUAD_SIZE; + coeff_type.length = type.length; + + memset(&setup_type, 0, sizeof setup_type); + setup_type.floating = TRUE; + setup_type.sign = TRUE; + setup_type.width = 32; + setup_type.length = TGSI_NUM_CHANNELS; + /* XXX: we don't support interpolating into any other types */ assert(memcmp(&coeff_type, &type, sizeof coeff_type) == 0); lp_build_context_init(&bld->coeff_bld, gallivm, coeff_type); + lp_build_context_init(&bld->setup_bld, gallivm, setup_type); /* For convenience */ bld->pos = bld->attribs[0]; bld->inputs = (const LLVMValueRef (*)[TGSI_NUM_CHANNELS]) bld->attribs[1]; /* Position */ - bld->num_attribs = 1; bld->mask[0] = TGSI_WRITEMASK_XYZW; bld->interp[0] = LP_INTERP_LINEAR; @@ -453,7 +695,12 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld, pos_init(bld, x0, y0); - coeffs_init(bld, a0_ptr, dadx_ptr, dady_ptr); + if (coeff_type.length > 4) { + coeffs_init_simple(bld, a0_ptr, dadx_ptr, dady_ptr); + } + else { + coeffs_init(bld, a0_ptr, dadx_ptr, dady_ptr); + } } @@ -463,20 +710,30 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld, void lp_build_interp_soa_update_inputs(struct lp_build_interp_soa_context *bld, struct gallivm_state *gallivm, - int quad_index) + int quad_start_index) { - assert(quad_index < 4); + assert(quad_start_index < 4); - attribs_update(bld, gallivm, quad_index, 1, bld->num_attribs); + if (bld->coeff_bld.type.length > 4) { + attribs_update_simple(bld, gallivm, quad_start_index, 1, bld->num_attribs); + } + else { + attribs_update(bld, gallivm, quad_start_index, 1, bld->num_attribs); + } } void lp_build_interp_soa_update_pos(struct lp_build_interp_soa_context *bld, struct gallivm_state *gallivm, - int quad_index) + int quad_start_index) { - assert(quad_index < 4); + assert(quad_start_index < 4); - attribs_update(bld, gallivm, quad_index, 0, 1); + if (bld->coeff_bld.type.length > 4) { + attribs_update_simple(bld, gallivm, quad_start_index, 0, 1); + } + else { + attribs_update(bld, gallivm, quad_start_index, 0, 1); + } } |