Diffstat (limited to 'src/gallium/auxiliary/gallivm')
36 files changed, 3837 insertions, 1210 deletions
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c index 9fc57629822..d226dab5b81 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c @@ -75,9 +75,9 @@ lp_build_min_simple(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef b) { - LLVMBuilderRef builder = bld->gallivm->builder; const struct lp_type type = bld->type; const char *intrinsic = NULL; + unsigned intr_size; LLVMValueRef cond; assert(lp_check_value(type, a)); @@ -85,31 +85,71 @@ lp_build_min_simple(struct lp_build_context *bld, /* TODO: optimize the constant case */ - if(type.width * type.length == 128) { - if(type.floating) { - if(type.width == 32 && util_cpu_caps.has_sse) + if (type.floating && util_cpu_caps.has_sse) { + if (type.width == 32) { + if (type.length == 1) { + intrinsic = "llvm.x86.sse.min.ss"; + intr_size = 128; + } + else if (type.length <= 4 || !util_cpu_caps.has_avx) { intrinsic = "llvm.x86.sse.min.ps"; - if(type.width == 64 && util_cpu_caps.has_sse2) + intr_size = 128; + } + else { + intrinsic = "llvm.x86.avx.min.ps.256"; + intr_size = 256; + } + } + if (type.width == 64 && util_cpu_caps.has_sse2) { + if (type.length == 1) { + intrinsic = "llvm.x86.sse2.min.sd"; + intr_size = 128; + } + else if (type.length == 2 || !util_cpu_caps.has_avx) { intrinsic = "llvm.x86.sse2.min.pd"; + intr_size = 128; + } + else { + intrinsic = "llvm.x86.avx.min.pd.256"; + intr_size = 256; + } } - else { - if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2) - intrinsic = "llvm.x86.sse2.pminu.b"; - if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1) + } + else if (util_cpu_caps.has_sse2 && type.length >= 2) { + intr_size = 128; + if ((type.width == 8 || type.width == 16) && + (type.width * type.length <= 64) && + (gallivm_debug & GALLIVM_DEBUG_PERF)) { + debug_printf("%s: inefficient code, bogus shuffle due to packing\n", + __FUNCTION__); + } + if (type.width == 8 && !type.sign) { + intrinsic = "llvm.x86.sse2.pminu.b"; + } + else if (type.width == 16 && type.sign) { + intrinsic = "llvm.x86.sse2.pmins.w"; + } + if (util_cpu_caps.has_sse4_1) { + if (type.width == 8 && type.sign) { intrinsic = "llvm.x86.sse41.pminsb"; - if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1) + } + if (type.width == 16 && !type.sign) { intrinsic = "llvm.x86.sse41.pminuw"; - if(type.width == 16 && type.sign && util_cpu_caps.has_sse2) - intrinsic = "llvm.x86.sse2.pmins.w"; - if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1) + } + if (type.width == 32 && !type.sign) { intrinsic = "llvm.x86.sse41.pminud"; - if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1) + } + if (type.width == 32 && type.sign) { intrinsic = "llvm.x86.sse41.pminsd"; + } } } - if(intrinsic) - return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b); + if(intrinsic) { + return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic, + type, + intr_size, a, b); + } cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b); return lp_build_select(bld, cond, a, b); @@ -125,9 +165,9 @@ lp_build_max_simple(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef b) { - LLVMBuilderRef builder = bld->gallivm->builder; const struct lp_type type = bld->type; const char *intrinsic = NULL; + unsigned intr_size; LLVMValueRef cond; assert(lp_check_value(type, a)); @@ -135,31 +175,72 @@ lp_build_max_simple(struct lp_build_context *bld, /* TODO: optimize the constant case */ - if(type.width * type.length == 
128) { - if(type.floating) { - if(type.width == 32 && util_cpu_caps.has_sse) + if (type.floating && util_cpu_caps.has_sse) { + if (type.width == 32) { + if (type.length == 1) { + intrinsic = "llvm.x86.sse.max.ss"; + intr_size = 128; + } + else if (type.length <= 4 || !util_cpu_caps.has_avx) { intrinsic = "llvm.x86.sse.max.ps"; - if(type.width == 64 && util_cpu_caps.has_sse2) + intr_size = 128; + } + else { + intrinsic = "llvm.x86.avx.max.ps.256"; + intr_size = 256; + } + } + if (type.width == 64 && util_cpu_caps.has_sse2) { + if (type.length == 1) { + intrinsic = "llvm.x86.sse2.max.sd"; + intr_size = 128; + } + else if (type.length == 2 || !util_cpu_caps.has_avx) { intrinsic = "llvm.x86.sse2.max.pd"; + intr_size = 128; + } + else { + intrinsic = "llvm.x86.avx.max.pd.256"; + intr_size = 256; + } } - else { - if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2) - intrinsic = "llvm.x86.sse2.pmaxu.b"; - if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1) + } + else if (util_cpu_caps.has_sse2 && type.length >= 2) { + intr_size = 128; + if ((type.width == 8 || type.width == 16) && + (type.width * type.length <= 64) && + (gallivm_debug & GALLIVM_DEBUG_PERF)) { + debug_printf("%s: inefficient code, bogus shuffle due to packing\n", + __FUNCTION__); + } + if (type.width == 8 && !type.sign) { + intrinsic = "llvm.x86.sse2.pmaxu.b"; + intr_size = 128; + } + else if (type.width == 16 && type.sign) { + intrinsic = "llvm.x86.sse2.pmaxs.w"; + } + if (util_cpu_caps.has_sse4_1) { + if (type.width == 8 && type.sign) { intrinsic = "llvm.x86.sse41.pmaxsb"; - if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1) + } + if (type.width == 16 && !type.sign) { intrinsic = "llvm.x86.sse41.pmaxuw"; - if(type.width == 16 && type.sign && util_cpu_caps.has_sse2) - intrinsic = "llvm.x86.sse2.pmaxs.w"; - if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1) + } + if (type.width == 32 && !type.sign) { intrinsic = "llvm.x86.sse41.pmaxud"; - if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1) + } + if (type.width == 32 && type.sign) { intrinsic = "llvm.x86.sse41.pmaxsd"; + } } } - if(intrinsic) - return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b); + if(intrinsic) { + return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic, + type, + intr_size, a, b); + } cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b); return lp_build_select(bld, cond, a, b); @@ -265,15 +346,20 @@ lp_build_add(struct lp_build_context *bld, } -/** Return the scalar sum of the elements of a */ +/** Return the scalar sum of the elements of a. + * Should avoid this operation whenever possible. + */ LLVMValueRef -lp_build_sum_vector(struct lp_build_context *bld, - LLVMValueRef a) +lp_build_horizontal_add(struct lp_build_context *bld, + LLVMValueRef a) { LLVMBuilderRef builder = bld->gallivm->builder; const struct lp_type type = bld->type; LLVMValueRef index, res; - unsigned i; + unsigned i, length; + LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2]; + LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2]; + LLVMValueRef vecres, elem2; assert(lp_check_value(type, a)); @@ -283,26 +369,191 @@ lp_build_sum_vector(struct lp_build_context *bld, assert(!bld->type.norm); - index = lp_build_const_int32(bld->gallivm, 0); - res = LLVMBuildExtractElement(builder, a, index, ""); + /* + * for byte vectors can do much better with psadbw. + * Using repeated shuffle/adds here. 
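For illustration only (not part of this patch): the repeated shuffle/add reduction that lp_build_horizontal_add builds out of LLVM shufflevector/add pairs, written as scalar C. It assumes a power-of-two vector length, as the real code does; the helper name is hypothetical.

```c
/* Hypothetical scalar sketch of lp_build_horizontal_add's reduction:
 * fold the upper half of the vector onto the lower half, halving the
 * width each step, until a single element remains. */
#include <stdio.h>

static float
horizontal_add_f32(float v[], unsigned length)   /* length: power of two */
{
   unsigned i, half;

   while (length > 1) {
      half = length / 2;
      for (i = 0; i < half; i++)
         v[i] += v[i + half];   /* vec1 + vec2 in the LLVM version */
      length = half;
   }
   return v[0];
}

int
main(void)
{
   float v[8] = {1, 2, 3, 4, 5, 6, 7, 8};
   printf("%g\n", horizontal_add_f32(v, 8));   /* prints 36 */
   return 0;
}
```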
Note with multiple vectors + * this can be done more efficiently as outlined in the intel + * optimization manual. + * Note: could cause data rearrangement if used with smaller element + * sizes. + */ - for (i = 1; i < type.length; i++) { - index = lp_build_const_int32(bld->gallivm, i); - if (type.floating) - res = LLVMBuildFAdd(builder, res, - LLVMBuildExtractElement(builder, - a, index, ""), - ""); - else - res = LLVMBuildAdd(builder, res, - LLVMBuildExtractElement(builder, - a, index, ""), - ""); + vecres = a; + length = type.length / 2; + while (length > 1) { + LLVMValueRef vec1, vec2; + for (i = 0; i < length; i++) { + shuffles1[i] = lp_build_const_int32(bld->gallivm, i); + shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length); + } + vec1 = LLVMBuildShuffleVector(builder, vecres, vecres, + LLVMConstVector(shuffles1, length), ""); + vec2 = LLVMBuildShuffleVector(builder, vecres, vecres, + LLVMConstVector(shuffles2, length), ""); + if (type.floating) { + vecres = LLVMBuildFAdd(builder, vec1, vec2, ""); + } + else { + vecres = LLVMBuildAdd(builder, vec1, vec2, ""); + } + length = length >> 1; } + /* always have vector of size 2 here */ + assert(length == 1); + + index = lp_build_const_int32(bld->gallivm, 0); + res = LLVMBuildExtractElement(builder, vecres, index, ""); + index = lp_build_const_int32(bld->gallivm, 1); + elem2 = LLVMBuildExtractElement(builder, vecres, index, ""); + + if (type.floating) + res = LLVMBuildFAdd(builder, res, elem2, ""); + else + res = LLVMBuildAdd(builder, res, elem2, ""); + return res; } +/** + * Return the horizontal sums of 4 float vectors as a float4 vector. + * This uses the technique as outlined in Intel Optimization Manual. + */ +static LLVMValueRef +lp_build_horizontal_add4x4f(struct lp_build_context *bld, + LLVMValueRef src[4]) +{ + struct gallivm_state *gallivm = bld->gallivm; + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef shuffles[4]; + LLVMValueRef tmp[4]; + LLVMValueRef sumtmp[2], shuftmp[2]; + + /* lower half of regs */ + shuffles[0] = lp_build_const_int32(gallivm, 0); + shuffles[1] = lp_build_const_int32(gallivm, 1); + shuffles[2] = lp_build_const_int32(gallivm, 4); + shuffles[3] = lp_build_const_int32(gallivm, 5); + tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1], + LLVMConstVector(shuffles, 4), ""); + tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3], + LLVMConstVector(shuffles, 4), ""); + + /* upper half of regs */ + shuffles[0] = lp_build_const_int32(gallivm, 2); + shuffles[1] = lp_build_const_int32(gallivm, 3); + shuffles[2] = lp_build_const_int32(gallivm, 6); + shuffles[3] = lp_build_const_int32(gallivm, 7); + tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1], + LLVMConstVector(shuffles, 4), ""); + tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3], + LLVMConstVector(shuffles, 4), ""); + + sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], ""); + sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], ""); + + shuffles[0] = lp_build_const_int32(gallivm, 0); + shuffles[1] = lp_build_const_int32(gallivm, 2); + shuffles[2] = lp_build_const_int32(gallivm, 4); + shuffles[3] = lp_build_const_int32(gallivm, 6); + shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1], + LLVMConstVector(shuffles, 4), ""); + + shuffles[0] = lp_build_const_int32(gallivm, 1); + shuffles[1] = lp_build_const_int32(gallivm, 3); + shuffles[2] = lp_build_const_int32(gallivm, 5); + shuffles[3] = lp_build_const_int32(gallivm, 7); + shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1], + 
LLVMConstVector(shuffles, 4), ""); + + return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], ""); +} + + +/* + * partially horizontally add 2-4 float vectors with length nx4, + * i.e. only four adjacent values in each vector will be added, + * assuming values are really grouped in 4 which also determines + * output order. + * + * Return a vector of the same length as the initial vectors, + * with the excess elements (if any) being undefined. + * The element order is independent of number of input vectors. + * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7 + * the output order thus will be + * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4z7,undef + */ +LLVMValueRef +lp_build_hadd_partial4(struct lp_build_context *bld, + LLVMValueRef vectors[], + unsigned num_vecs) +{ + struct gallivm_state *gallivm = bld->gallivm; + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef ret_vec; + LLVMValueRef tmp[4]; + const char *intrinsic = NULL; + + assert(num_vecs >= 2 && num_vecs <= 4); + assert(bld->type.floating); + + /* only use this with at least 2 vectors, as it is sort of expensive + * (depending on cpu) and we always need two horizontal adds anyway, + * so a shuffle/add approach might be better. + */ + + tmp[0] = vectors[0]; + tmp[1] = vectors[1]; + + tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0]; + tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0]; + + if (util_cpu_caps.has_sse3 && bld->type.width == 32 && + bld->type.length == 4) { + intrinsic = "llvm.x86.sse3.hadd.ps"; + } + else if (util_cpu_caps.has_avx && bld->type.width == 32 && + bld->type.length == 8) { + intrinsic = "llvm.x86.avx.hadd.ps.256"; + } + if (intrinsic) { + tmp[0] = lp_build_intrinsic_binary(builder, intrinsic, + lp_build_vec_type(gallivm, bld->type), + tmp[0], tmp[1]); + if (num_vecs > 2) { + tmp[1] = lp_build_intrinsic_binary(builder, intrinsic, + lp_build_vec_type(gallivm, bld->type), + tmp[2], tmp[3]); + } + else { + tmp[1] = tmp[0]; + } + return lp_build_intrinsic_binary(builder, intrinsic, + lp_build_vec_type(gallivm, bld->type), + tmp[0], tmp[1]); + } + + if (bld->type.length == 4) { + ret_vec = lp_build_horizontal_add4x4f(bld, tmp); + } + else { + LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4]; + unsigned j; + unsigned num_iter = bld->type.length / 4; + struct lp_type parttype = bld->type; + parttype.length = 4; + for (j = 0; j < num_iter; j++) { + LLVMValueRef partsrc[4]; + unsigned i; + for (i = 0; i < 4; i++) { + partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4); + } + partres[j] = lp_build_horizontal_add4x4f(bld, partsrc); + } + ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter); + } + return ret_vec; +} /** * Generate a - b @@ -553,7 +804,7 @@ lp_build_mul_imm(struct lp_build_context *bld, if(bld->type.floating) { #if 0 /* - * Power of two multiplication by directly manipulating the mantissa. + * Power of two multiplication by directly manipulating the exponent. 
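The `#if 0` block this hunk touches multiplies by a power of two by adjusting the exponent field directly. A scalar sketch of that trick, for illustration only (hypothetical helper; it breaks for zero, denormals, Inf and NaN, exactly as the XXX comment that follows warns):

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Multiply a float by 2^n by adding n to the biased exponent field
 * (bits 23..30 of an IEEE-754 binary32).  No safety checks. */
static float
mul_pow2(float x, int n)
{
   uint32_t bits;

   memcpy(&bits, &x, sizeof bits);   /* type-pun without aliasing UB */
   bits += (uint32_t)n << 23;
   memcpy(&x, &bits, sizeof x);
   return x;
}

int
main(void)
{
   printf("%g\n", mul_pow2(3.0f, 4));   /* prints 48 */
   return 0;
}
```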
* * XXX: This might not be always faster, it will introduce a small error * for multiplication by zero, and it will produce wrong results @@ -612,7 +863,8 @@ lp_build_div(struct lp_build_context *bld, return LLVMConstUDiv(a, b); } - if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4 && + if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) || + (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) && type.floating) return lp_build_mul(bld, a, lp_build_rcp(bld, b)); @@ -871,6 +1123,12 @@ lp_build_abs(struct lp_build_context *bld, return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a); } } + else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 && + (gallivm_debug & GALLIVM_DEBUG_PERF) && + (type.width == 8 || type.width == 16 || type.width == 32)) { + debug_printf("%s: inefficient code, should split vectors manually\n", + __FUNCTION__); + } return lp_build_max(bld, a, LLVMBuildNeg(builder, a, "")); } @@ -934,6 +1192,7 @@ lp_build_sgn(struct lp_build_context *bld, else { /* signed int/norm/fixed point */ + /* could use psign with sse3 and appropriate vectors here */ LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0); cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero); res = lp_build_select(bld, cond, bld->one, minus_one); @@ -1000,7 +1259,16 @@ lp_build_int_to_float(struct lp_build_context *bld, return LLVMBuildSIToFP(builder, a, vec_type, ""); } +static boolean +sse41_rounding_available(const struct lp_type type) +{ + if ((util_cpu_caps.has_sse4_1 && + (type.length == 1 || type.width*type.length == 128)) || + (util_cpu_caps.has_avx && type.width*type.length == 256)) + return TRUE; + return FALSE; +} enum lp_build_round_sse41_mode { @@ -1065,18 +1333,34 @@ lp_build_round_sse41(struct lp_build_context *bld, res = LLVMBuildExtractElement(builder, res, index0, ""); } else { - assert(type.width*type.length == 128); - - switch(type.width) { - case 32: - intrinsic = "llvm.x86.sse41.round.ps"; - break; - case 64: - intrinsic = "llvm.x86.sse41.round.pd"; - break; - default: - assert(0); - return bld->undef; + if (type.width * type.length == 128) { + switch(type.width) { + case 32: + intrinsic = "llvm.x86.sse41.round.ps"; + break; + case 64: + intrinsic = "llvm.x86.sse41.round.pd"; + break; + default: + assert(0); + return bld->undef; + } + } + else { + assert(type.width * type.length == 256); + assert(util_cpu_caps.has_avx); + + switch(type.width) { + case 32: + intrinsic = "llvm.x86.avx.round.ps.256"; + break; + case 64: + intrinsic = "llvm.x86.avx.round.pd.256"; + break; + default: + assert(0); + return bld->undef; + } } res = lp_build_intrinsic_binary(builder, intrinsic, @@ -1125,10 +1409,15 @@ lp_build_iround_nearest_sse2(struct lp_build_context *bld, ret_type, arg); } else { - assert(type.width*type.length == 128); - - intrinsic = "llvm.x86.sse2.cvtps2dq"; + if (type.width* type.length == 128) { + intrinsic = "llvm.x86.sse2.cvtps2dq"; + } + else { + assert(type.width*type.length == 256); + assert(util_cpu_caps.has_avx); + intrinsic = "llvm.x86.avx.cvt.ps2dq.256"; + } res = lp_build_intrinsic_unary(builder, intrinsic, ret_type, a); } @@ -1152,8 +1441,7 @@ lp_build_trunc(struct lp_build_context *bld, assert(type.floating); assert(lp_check_value(type, a)); - if (util_cpu_caps.has_sse4_1 && - (type.length == 1 || type.width*type.length == 128)) { + if (sse41_rounding_available(type)) { return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE); } else { @@ -1183,8 +1471,7 @@ 
lp_build_round(struct lp_build_context *bld, assert(type.floating); assert(lp_check_value(type, a)); - if (util_cpu_caps.has_sse4_1 && - (type.length == 1 || type.width*type.length == 128)) { + if (sse41_rounding_available(type)) { return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST); } else { @@ -1212,8 +1499,7 @@ lp_build_floor(struct lp_build_context *bld, assert(type.floating); assert(lp_check_value(type, a)); - if (util_cpu_caps.has_sse4_1 && - (type.length == 1 || type.width*type.length == 128)) { + if (sse41_rounding_available(type)) { return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR); } else { @@ -1241,8 +1527,7 @@ lp_build_ceil(struct lp_build_context *bld, assert(type.floating); assert(lp_check_value(type, a)); - if (util_cpu_caps.has_sse4_1 && - (type.length == 1 || type.width*type.length == 128)) { + if (sse41_rounding_available(type)) { return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL); } else { @@ -1269,6 +1554,34 @@ lp_build_fract(struct lp_build_context *bld, /** + * Prevent returning a fractional part of 1.0 for very small negative values of + * 'a' by clamping against 0.99999(9). + */ +static inline LLVMValueRef +clamp_fract(struct lp_build_context *bld, LLVMValueRef fract) +{ + LLVMValueRef max; + + /* this is the largest number smaller than 1.0 representable as float */ + max = lp_build_const_vec(bld->gallivm, bld->type, + 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1))); + return lp_build_min(bld, fract, max); +} + + +/** + * Same as lp_build_fract, but guarantees that the result is always smaller + * than one. + */ +LLVMValueRef +lp_build_fract_safe(struct lp_build_context *bld, + LLVMValueRef a) +{ + return clamp_fract(bld, lp_build_fract(bld, a)); +} + + +/** * Return the integer part of a float (vector) value (== round toward zero). * The returned value is an integer (vector). 
* Ex: itrunc(-1.5) = -1 @@ -1307,12 +1620,12 @@ lp_build_iround(struct lp_build_context *bld, assert(lp_check_value(type, a)); - if (util_cpu_caps.has_sse2 && - ((type.width == 32) && (type.length == 1 || type.length == 4))) { + if ((util_cpu_caps.has_sse2 && + ((type.width == 32) && (type.length == 1 || type.length == 4))) || + (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) { return lp_build_iround_nearest_sse2(bld, a); } - else if (util_cpu_caps.has_sse4_1 && - (type.length == 1 || type.width*type.length == 128)) { + if (sse41_rounding_available(type)) { res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST); } else { @@ -1362,14 +1675,12 @@ lp_build_ifloor(struct lp_build_context *bld, assert(type.floating); assert(lp_check_value(type, a)); - if (util_cpu_caps.has_sse4_1 && - (type.length == 1 || type.width*type.length == 128)) { - res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR); - } - else { - res = a; - - if (type.sign) { + res = a; + if (type.sign) { + if (sse41_rounding_available(type)) { + res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR); + } + else { /* Take the sign bit and add it to 1 constant */ LLVMTypeRef vec_type = bld->vec_type; unsigned mantissa = lp_mantissa(type); @@ -1423,8 +1734,7 @@ lp_build_iceil(struct lp_build_context *bld, assert(type.floating); assert(lp_check_value(type, a)); - if (util_cpu_caps.has_sse4_1 && - (type.length == 1 || type.width*type.length == 128)) { + if (sse41_rounding_available(type)) { res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL); } else { @@ -1470,7 +1780,7 @@ lp_build_iceil(struct lp_build_context *bld, * Combined ifloor() & fract(). * * Preferred to calling the functions separately, as it will ensure that the - * stratergy (floor() vs ifloor()) that results in less redundant work is used. + * strategy (floor() vs ifloor()) that results in less redundant work is used. */ void lp_build_ifloor_fract(struct lp_build_context *bld, @@ -1485,8 +1795,7 @@ lp_build_ifloor_fract(struct lp_build_context *bld, assert(type.floating); assert(lp_check_value(type, a)); - if (util_cpu_caps.has_sse4_1 && - (type.length == 1 || type.width*type.length == 128)) { + if (sse41_rounding_available(type)) { /* * floor() is easier. */ @@ -1507,6 +1816,21 @@ lp_build_ifloor_fract(struct lp_build_context *bld, } +/** + * Same as lp_build_ifloor_fract, but guarantees that the fractional part is + * always smaller than one. 
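clamp_fract above clamps against 1.0 - 1.0/2^(mantissa+1). For 32-bit floats (23-bit mantissa) that constant is exactly the largest representable value below 1.0, which a quick check confirms (illustrative only, not part of the patch):

```c
#include <assert.h>
#include <math.h>

int
main(void)
{
   /* 1.0 - 1.0/2^(23 + 1), the clamp constant clamp_fract builds */
   float max_fract = (float)(1.0 - 1.0 / (1LL << (23 + 1)));

   /* it is the float immediately below 1.0 */
   assert(max_fract == nextafterf(1.0f, 0.0f));
   assert(max_fract < 1.0f);
   return 0;
}
```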
+ */ +void +lp_build_ifloor_fract_safe(struct lp_build_context *bld, + LLVMValueRef a, + LLVMValueRef *out_ipart, + LLVMValueRef *out_fpart) +{ + lp_build_ifloor_fract(bld, a, out_ipart, out_fpart); + *out_fpart = clamp_fract(bld, *out_fpart); +} + + LLVMValueRef lp_build_sqrt(struct lp_build_context *bld, LLVMValueRef a) @@ -1519,10 +1843,14 @@ lp_build_sqrt(struct lp_build_context *bld, assert(lp_check_value(type, a)); /* TODO: optimize the constant case */ - /* TODO: optimize the constant case */ assert(type.floating); - util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width); + if (type.length == 1) { + util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.f%u", type.width); + } + else { + util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width); + } return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a); } @@ -1586,19 +1914,28 @@ lp_build_rcp(struct lp_build_context *bld, * - it doesn't even get the reciprocate of 1.0 exactly * - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf * - for recent processors the benefit over DIVPS is marginal, a case - * depedent + * dependent * * We could still use it on certain processors if benchmarks show that the * RCPPS plus necessary workarounds are still preferrable to DIVPS; or for * particular uses that require less workarounds. */ - if (FALSE && util_cpu_caps.has_sse && type.width == 32 && type.length == 4) { + if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) || + (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){ const unsigned num_iterations = 0; LLVMValueRef res; unsigned i; + const char *intrinsic = NULL; - res = lp_build_intrinsic_unary(builder, "llvm.x86.sse.rcp.ps", bld->vec_type, a); + if (type.length == 4) { + intrinsic = "llvm.x86.sse.rcp.ps"; + } + else { + intrinsic = "llvm.x86.avx.rcp.ps.256"; + } + + res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a); for (i = 0; i < num_iterations; ++i) { res = lp_build_rcp_refine(bld, a, res); @@ -1653,12 +1990,22 @@ lp_build_rsqrt(struct lp_build_context *bld, assert(type.floating); - if (util_cpu_caps.has_sse && type.width == 32 && type.length == 4) { + if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) || + (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) { const unsigned num_iterations = 1; LLVMValueRef res; unsigned i; + const char *intrinsic = NULL; + + if (type.length == 4) { + intrinsic = "llvm.x86.sse.rsqrt.ps"; + } + else { + intrinsic = "llvm.x86.avx.rsqrt.ps.256"; + } + + res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a); - res = lp_build_intrinsic_unary(builder, "llvm.x86.sse.rsqrt.ps", bld->vec_type, a); for (i = 0; i < num_iterations; ++i) { res = lp_build_rsqrt_refine(bld, a, res); diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.h b/src/gallium/auxiliary/gallivm/lp_bld_arit.h index aeb987ff352..60b9907e60f 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.h @@ -57,8 +57,13 @@ lp_build_add(struct lp_build_context *bld, LLVMValueRef b); LLVMValueRef -lp_build_sum_vector(struct lp_build_context *bld, - LLVMValueRef a); +lp_build_horizontal_add(struct lp_build_context *bld, + LLVMValueRef a); + +LLVMValueRef +lp_build_hadd_partial4(struct lp_build_context *bld, + LLVMValueRef vectors[], + unsigned num_vecs); LLVMValueRef lp_build_sub(struct lp_build_context *bld, @@ -157,6 +162,10 @@ lp_build_fract(struct 
lp_build_context *bld, LLVMValueRef a); LLVMValueRef +lp_build_fract_safe(struct lp_build_context *bld, + LLVMValueRef a); + +LLVMValueRef lp_build_ifloor(struct lp_build_context *bld, LLVMValueRef a); LLVMValueRef @@ -177,6 +186,12 @@ lp_build_ifloor_fract(struct lp_build_context *bld, LLVMValueRef *out_ipart, LLVMValueRef *out_fpart); +void +lp_build_ifloor_fract_safe(struct lp_build_context *bld, + LLVMValueRef a, + LLVMValueRef *out_ipart, + LLVMValueRef *out_fpart); + LLVMValueRef lp_build_sqrt(struct lp_build_context *bld, LLVMValueRef a); diff --git a/src/gallium/auxiliary/gallivm/lp_bld_const.c b/src/gallium/auxiliary/gallivm/lp_bld_const.c index 59e8fb2ed6e..35799a1ef8e 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_const.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_const.c @@ -37,6 +37,7 @@ #include "util/u_debug.h" #include "util/u_math.h" +#include "util/u_half.h" #include "lp_bld_type.h" #include "lp_bld_const.h" @@ -50,10 +51,12 @@ lp_mantissa(struct lp_type type) if(type.floating) { switch(type.width) { + case 16: + return 10; case 32: return 23; case 64: - return 53; + return 52; default: assert(0); return 0; @@ -136,6 +139,8 @@ lp_const_min(struct lp_type type) if (type.floating) { switch(type.width) { + case 16: + return -65504; case 32: return -FLT_MAX; case 64: @@ -169,6 +174,8 @@ lp_const_max(struct lp_type type) if (type.floating) { switch(type.width) { + case 16: + return 65504; case 32: return FLT_MAX; case 64: @@ -196,6 +203,8 @@ lp_const_eps(struct lp_type type) { if (type.floating) { switch(type.width) { + case 16: + return 2E-10; case 32: return FLT_EPSILON; case 64: @@ -247,7 +256,9 @@ lp_build_one(struct gallivm_state *gallivm, struct lp_type type) elem_type = lp_build_elem_type(gallivm, type); - if(type.floating) + if(type.floating && type.width == 16) + elems[0] = LLVMConstInt(elem_type, util_float_to_half(1.0f), 0); + else if(type.floating) elems[0] = LLVMConstReal(elem_type, 1.0); else if(type.fixed) elems[0] = LLVMConstInt(elem_type, 1LL << (type.width/2), 0); @@ -292,7 +303,9 @@ lp_build_const_elem(struct gallivm_state *gallivm, LLVMTypeRef elem_type = lp_build_elem_type(gallivm, type); LLVMValueRef elem; - if(type.floating) { + if(type.floating && type.width == 16) { + elem = LLVMConstInt(elem_type, util_float_to_half((float)val), 0); + } else if(type.floating) { elem = LLVMConstReal(elem_type, val); } else { @@ -364,20 +377,10 @@ lp_build_const_aos(struct gallivm_state *gallivm, if(swizzle == NULL) swizzle = default_swizzle; - if(type.floating) { - elems[swizzle[0]] = LLVMConstReal(elem_type, r); - elems[swizzle[1]] = LLVMConstReal(elem_type, g); - elems[swizzle[2]] = LLVMConstReal(elem_type, b); - elems[swizzle[3]] = LLVMConstReal(elem_type, a); - } - else { - double dscale = lp_const_scale(type); - - elems[swizzle[0]] = LLVMConstInt(elem_type, round(r*dscale), 0); - elems[swizzle[1]] = LLVMConstInt(elem_type, round(g*dscale), 0); - elems[swizzle[2]] = LLVMConstInt(elem_type, round(b*dscale), 0); - elems[swizzle[3]] = LLVMConstInt(elem_type, round(a*dscale), 0); - } + elems[swizzle[0]] = lp_build_const_elem(gallivm, type, r); + elems[swizzle[1]] = lp_build_const_elem(gallivm, type, g); + elems[swizzle[2]] = lp_build_const_elem(gallivm, type, b); + elems[swizzle[3]] = lp_build_const_elem(gallivm, type, a); for(i = 4; i < type.length; ++i) elems[i] = elems[i % 4]; @@ -452,7 +455,7 @@ lp_build_const_string(struct gallivm_state *gallivm, /** * Build a callable function pointer. 
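The lp_bld_const.c hunks in this range add IEEE binary16 cases: a 10-bit mantissa and a finite range of ±65504. As a sanity check (illustrative, not from the patch), the limit follows directly from the format: the largest finite half is (2 - 2^-10) * 2^15.

```c
#include <assert.h>

int
main(void)
{
   /* binary16: 10 mantissa bits, 5 exponent bits, bias 15,
    * largest finite exponent 15 -> (2 - 2^-10) * 2^15 = 65504 */
   double max_half = (2.0 - 1.0 / 1024.0) * 32768.0;

   assert(max_half == 65504.0);
   return 0;
}
```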
* - * We this casts instead of LLVMAddGlobalMapping() + * We use function pointer constants instead of LLVMAddGlobalMapping() * to work around a bug in LLVM 2.6, and for efficiency/simplicity. */ LLVMValueRef diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c index 0973e1f16f3..0399709faad 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c @@ -70,6 +70,66 @@ #include "lp_bld_arit.h" #include "lp_bld_pack.h" #include "lp_bld_conv.h" +#include "lp_bld_logic.h" + + +/** + * Converts int16 half-float to float32 + * Note this can be performed in 1 instruction if vcvtph2ps exists (sse5 i think?) + * [llvm.x86.vcvtph2ps / _mm_cvtph_ps] + * + * @param src_type <vector> type of int16 + * @param src value to convert + * + * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/ + */ +LLVMValueRef +lp_build_half_to_float(struct gallivm_state *gallivm, + struct lp_type src_type, + LLVMValueRef src) +{ + struct lp_type f32_type = lp_type_float_vec(32, 32 * src_type.length); + struct lp_type i32_type = lp_type_int_vec(32, 32 * src_type.length); + + LLVMBuilderRef builder = gallivm->builder; + LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type); + LLVMTypeRef float_vec_type = lp_build_vec_type(gallivm, f32_type); + + /* Constants */ + LLVMValueRef i32_13 = lp_build_const_int_vec(gallivm, i32_type, 13); + LLVMValueRef i32_16 = lp_build_const_int_vec(gallivm, i32_type, 16); + LLVMValueRef i32_mask_nosign = lp_build_const_int_vec(gallivm, i32_type, 0x7fff); + LLVMValueRef i32_was_infnan = lp_build_const_int_vec(gallivm, i32_type, 0x7bff); + LLVMValueRef i32_exp_infnan = lp_build_const_int_vec(gallivm, i32_type, 0xff << 23); + LLVMValueRef f32_magic = LLVMBuildBitCast(builder, + lp_build_const_int_vec(gallivm, i32_type, (254 - 15) << 23), + float_vec_type, ""); + + /* Convert int16 vector to int32 vector by zero ext */ + LLVMValueRef h = LLVMBuildZExt(builder, src, int_vec_type, ""); + + /* Exponent / mantissa bits */ + LLVMValueRef expmant = LLVMBuildAnd(builder, i32_mask_nosign, h, ""); + LLVMValueRef shifted = LLVMBuildBitCast(builder, LLVMBuildShl(builder, expmant, i32_13, ""), float_vec_type, ""); + + /* Exponent adjust */ + LLVMValueRef scaled = LLVMBuildBitCast(builder, LLVMBuildFMul(builder, shifted, f32_magic, ""), int_vec_type, ""); + + /* Make sure Inf/NaN survive */ + LLVMValueRef b_wasinfnan = lp_build_compare(gallivm, i32_type, PIPE_FUNC_GREATER, expmant, i32_was_infnan); + LLVMValueRef infnanexp = LLVMBuildAnd(builder, b_wasinfnan, i32_exp_infnan, ""); + + /* Sign bit */ + LLVMValueRef justsign = LLVMBuildXor(builder, h, expmant, ""); + LLVMValueRef sign = LLVMBuildShl(builder, justsign, i32_16, ""); + + /* Combine result */ + LLVMValueRef sign_inf = LLVMBuildOr(builder, sign, infnanexp, ""); + LLVMValueRef final = LLVMBuildOr(builder, scaled, sign_inf, ""); + + /* Cast from int32 vector to float32 vector */ + return LLVMBuildBitCast(builder, final, float_vec_type, ""); +} /** @@ -334,6 +394,8 @@ lp_build_conv(struct gallivm_state *gallivm, dst_type.width == 8 && dst_type.length == 16 && + 4 * num_dsts == num_srcs && + util_cpu_caps.has_sse2) { struct lp_build_context bld; @@ -371,6 +433,76 @@ lp_build_conv(struct gallivm_state *gallivm, return; } + /* Special case 2x8f --> 1x16ub + */ + else if (src_type.floating == 1 && + src_type.fixed == 0 && + src_type.sign == 1 && + src_type.norm == 0 && + src_type.width == 32 && + src_type.length == 8 && + + 
dst_type.floating == 0 && + dst_type.fixed == 0 && + dst_type.sign == 0 && + dst_type.norm == 1 && + dst_type.width == 8 && + dst_type.length == 16 && + + 2 * num_dsts == num_srcs && + + util_cpu_caps.has_avx) { + + struct lp_build_context bld; + struct lp_type int16_type = dst_type; + struct lp_type int32_type = dst_type; + LLVMValueRef const_255f; + unsigned i; + + lp_build_context_init(&bld, gallivm, src_type); + + int16_type.width *= 2; + int16_type.length /= 2; + int16_type.sign = 1; + + int32_type.width *= 4; + int32_type.length /= 4; + int32_type.sign = 1; + + const_255f = lp_build_const_vec(gallivm, src_type, 255.0f); + + for (i = 0; i < num_dsts; ++i, src += 2) { + LLVMValueRef lo, hi, a, b; + + a = LLVMBuildFMul(builder, src[0], const_255f, ""); + b = LLVMBuildFMul(builder, src[1], const_255f, ""); + + a = lp_build_iround(&bld, a); + b = lp_build_iround(&bld, b); + + tmp[0] = lp_build_extract_range(gallivm, a, 0, 4); + tmp[1] = lp_build_extract_range(gallivm, a, 4, 4); + tmp[2] = lp_build_extract_range(gallivm, b, 0, 4); + tmp[3] = lp_build_extract_range(gallivm, b, 4, 4); + + /* relying on clamping behavior of sse2 intrinsics here */ + lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]); + hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]); + dst[i] = lp_build_pack2(gallivm, int16_type, dst_type, lo, hi); + } + return; + } + + /* Pre convert half-floats to floats + */ + else if (src_type.floating && src_type.width == 16) + { + for(i = 0; i < num_tmps; ++i) + tmp[i] = lp_build_half_to_float(gallivm, src_type, tmp[i]); + + tmp_type.width = 32; + } + /* * Clamp if necessary */ @@ -580,7 +712,7 @@ lp_build_conv(struct gallivm_state *gallivm, * This will convert the integer masks that match the given types. * * The mask values should 0 or -1, i.e., all bits either set to zero or one. - * Any other value will likely cause in unpredictable results. + * Any other value will likely cause unpredictable results. * * This is basically a very trimmed down version of lp_build_conv. */ @@ -591,8 +723,6 @@ lp_build_conv_mask(struct gallivm_state *gallivm, const LLVMValueRef *src, unsigned num_srcs, LLVMValueRef *dst, unsigned num_dsts) { - /* Register width must remain constant */ - assert(src_type.width * src_type.length == dst_type.width * dst_type.length); /* We must not loose or gain channels. 
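lp_build_half_to_float above vectorizes a well-known bit trick (Giesen's "half to float done quick", linked in its comment). A scalar C rendering of the same steps, for reference only; the function name is hypothetical:

```c
#include <stdint.h>
#include <string.h>

static float
half_to_float(uint16_t h)
{
   uint32_t expmant = h & 0x7fffu;            /* exponent + mantissa bits */
   uint32_t justsign = h ^ expmant;           /* isolated sign bit */
   uint32_t shifted = expmant << 13;          /* move into binary32 position */
   uint32_t magic_bits = (254u - 15u) << 23;  /* rebias 15 -> 127 via multiply */
   uint32_t bits;
   float f, magic;

   memcpy(&magic, &magic_bits, sizeof magic);
   memcpy(&f, &shifted, sizeof f);
   f *= magic;                                /* exponent adjust, fixes denormals */
   memcpy(&bits, &f, sizeof bits);

   if (expmant > 0x7bffu)                     /* was Inf/NaN: force max exponent */
      bits |= 0xffu << 23;
   bits |= justsign << 16;                    /* restore the sign bit */

   memcpy(&f, &bits, sizeof f);
   return f;
}
```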
Only precision */ assert(src_type.length * num_srcs == dst_type.length * num_dsts); @@ -617,16 +747,5 @@ lp_build_conv_mask(struct gallivm_state *gallivm, * Truncate or expand bit width */ - if(src_type.width > dst_type.width) { - assert(num_dsts == 1); - dst[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, src, num_srcs); - } - else if(src_type.width < dst_type.width) { - assert(num_srcs == 1); - lp_build_unpack(gallivm, src_type, dst_type, src[0], dst, num_dsts); - } - else { - assert(num_srcs == num_dsts); - memcpy(dst, src, num_dsts * sizeof *dst); - } + lp_build_resize(gallivm, src_type, dst_type, src, num_srcs, dst, num_dsts); } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.h b/src/gallium/auxiliary/gallivm/lp_bld_conv.h index cec655980fa..c830fbef5f2 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_conv.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.h @@ -42,6 +42,10 @@ struct lp_type; +LLVMValueRef +lp_build_half_to_float(struct gallivm_state *gallivm, + struct lp_type src_type, + LLVMValueRef src); LLVMValueRef lp_build_clamped_float_to_unsigned_norm(struct gallivm_state *gallivm, diff --git a/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp b/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp index 444b70a678c..93505f3da45 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp +++ b/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp @@ -35,10 +35,8 @@ #if HAVE_LLVM >= 0x0300 #include <llvm/Support/TargetRegistry.h> -#include <llvm/Support/TargetSelect.h> #else /* HAVE_LLVM < 0x0300 */ #include <llvm/Target/TargetRegistry.h> -#include <llvm/Target/TargetSelect.h> #endif /* HAVE_LLVM < 0x0300 */ #if HAVE_LLVM >= 0x0209 @@ -183,7 +181,7 @@ lp_disassemble(const void* func) /* * Limit disassembly to this extent */ - const uint64_t extent = 0x10000; + const uint64_t extent = 96 * 1024; uint64_t max_pc = 0; @@ -200,24 +198,6 @@ lp_disassemble(const void* func) std::string Error; const Target *T = TargetRegistry::lookupTarget(Triple, Error); -#if HAVE_LLVM >= 0x0208 - InitializeNativeTargetAsmPrinter(); -#elif defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) - LLVMInitializeX86AsmPrinter(); -#elif defined(PIPE_ARCH_ARM) - LLVMInitializeARMAsmPrinter(); -#elif defined(PIPE_ARCH_PPC) - LLVMInitializePowerPCAsmPrinter(); -#endif - -#if HAVE_LLVM >= 0x0301 - InitializeNativeTargetDisassembler(); -#elif defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) - LLVMInitializeX86Disassembler(); -#elif defined(PIPE_ARCH_ARM) - LLVMInitializeARMDisassembler(); -#endif - #if HAVE_LLVM >= 0x0300 OwningPtr<const MCAsmInfo> AsmInfo(T->createMCAsmInfo(Triple)); #else diff --git a/src/gallium/auxiliary/gallivm/lp_bld_flow.c b/src/gallium/auxiliary/gallivm/lp_bld_flow.c index d2b3713ed2d..30da44e5b9c 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_flow.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_flow.c @@ -131,6 +131,15 @@ lp_build_mask_check(struct lp_build_mask_context *mask) value = lp_build_mask_value(mask); + /* + * XXX this doesn't quite generate the most efficient code possible, if + * the masks are vectors which have all bits set to the same value + * in each element. + * movmskps/pmovmskb would be more efficient to get the required value + * into ordinary reg (certainly with 8 floats). + * Not sure if llvm could figure that out on its own. 
+ */ + /* cond = (mask == 0) */ cond = LLVMBuildICmp(builder, LLVMIntEQ, diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format.h b/src/gallium/auxiliary/gallivm/lp_bld_format.h index 04142d905b1..3608a68202f 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_format.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_format.h @@ -67,6 +67,13 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, LLVMValueRef i, LLVMValueRef j); +LLVMValueRef +lp_build_fetch_rgba_aos_array(struct gallivm_state *gallivm, + const struct util_format_description *format_desc, + struct lp_type type, + LLVMValueRef base_ptr, + LLVMValueRef offset); + /* * SoA diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c index e4b8da6bcfd..9591bcfb2c7 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c @@ -470,6 +470,11 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, return lp_build_format_swizzle_aos(format_desc, &bld, res); } + /* If all channels are of same type and we are not using half-floats */ + if (util_format_is_array(format_desc)) { + return lp_build_fetch_rgba_aos_array(gallivm, format_desc, type, base_ptr, offset); + } + /* * YUV / subsampled formats */ @@ -601,7 +606,6 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, return res; } - /* * Fallback to util_format_description::fetch_rgba_float(). */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c new file mode 100644 index 00000000000..b8ec379d76f --- /dev/null +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c @@ -0,0 +1,102 @@ +/************************************************************************** + * + * Copyright 2012 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#include "lp_bld_const.h" +#include "lp_bld_struct.h" +#include "lp_bld_format.h" +#include "lp_bld_debug.h" +#include "lp_bld_type.h" +#include "lp_bld_conv.h" +#include "lp_bld_pack.h" + +#include "util/u_memory.h" +#include "util/u_format.h" +#include "pipe/p_state.h" + +/** + * @brief lp_build_fetch_rgba_aos_array + * + * \param format_desc describes format of the image we're fetching from + * \param dst_type output type + * \param base_ptr address of the pixel block (or the texel if uncompressed) + * \param offset ptr offset + */ +LLVMValueRef +lp_build_fetch_rgba_aos_array(struct gallivm_state *gallivm, + const struct util_format_description *format_desc, + struct lp_type dst_type, + LLVMValueRef base_ptr, + LLVMValueRef offset) +{ + struct lp_build_context bld; + LLVMBuilderRef builder = gallivm->builder; + LLVMTypeRef src_elem_type, src_vec_type; + LLVMValueRef ptr, res = NULL; + struct lp_type src_type; + + memset(&src_type, 0, sizeof src_type); + src_type.floating = format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT; + src_type.fixed = format_desc->channel[0].type == UTIL_FORMAT_TYPE_FIXED; + src_type.sign = format_desc->channel[0].type != UTIL_FORMAT_TYPE_UNSIGNED; + src_type.norm = format_desc->channel[0].normalized; + src_type.width = format_desc->channel[0].size; + src_type.length = format_desc->nr_channels; + + assert(src_type.length <= dst_type.length); + + src_elem_type = lp_build_elem_type(gallivm, src_type); + src_vec_type = lp_build_vec_type(gallivm, src_type); + + /* Read whole vector from memory, unaligned */ + if (!res) { + ptr = LLVMBuildGEP(builder, base_ptr, &offset, 1, ""); + ptr = LLVMBuildPointerCast(builder, ptr, LLVMPointerType(src_vec_type, 0), ""); + res = LLVMBuildLoad(builder, ptr, ""); + lp_set_load_alignment(res, src_type.width / 8); + } + + /* Truncate doubles to float */ + if (src_type.floating && src_type.width == 64) { + src_type.width = 32; + src_vec_type = lp_build_vec_type(gallivm, src_type); + + res = LLVMBuildFPTrunc(builder, res, src_vec_type, ""); + } + + /* Expand to correct length */ + if (src_type.length < dst_type.length) { + res = lp_build_pad_vector(gallivm, res, src_type, dst_type.length); + src_type.length = dst_type.length; + } + + /* Convert to correct format */ + lp_build_conv(gallivm, src_type, dst_type, &res, 1, &res, 1); + + /* Swizzle it */ + lp_build_context_init(&bld, gallivm, dst_type); + return lp_build_format_swizzle_aos(format_desc, &bld, res); +} diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c index 0a57b3ce794..afeb34079bf 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c @@ -359,7 +359,8 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, */ if (util_format_fits_8unorm(format_desc) && - type.floating && type.width == 32 && type.length == 4) { + type.floating && type.width == 32 && + (type.length == 1 || (type.length % 4 == 0))) { struct lp_type tmp_type; LLVMValueRef tmp; diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c index ccc83207004..f77eb1212b1 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c @@ -84,7 +84,7 @@ uyvy_to_yuv_soa(struct gallivm_state *gallivm, * per element. 
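The AoS fetch and conversion paths in this part of the diff ultimately scale between floats and 8-bit unorms (note the const_255f multiply and the reliance on the pack instructions' clamping earlier in the diff). A scalar sketch of both directions, for illustration only:

```c
#include <stdint.h>

static uint8_t
float_to_unorm8(float f)
{
   float v = f * 255.0f + 0.5f;     /* scale and round to nearest */
   if (v < 0.0f)   v = 0.0f;        /* the clamping the pack path relies on */
   if (v > 255.0f) v = 255.0f;
   return (uint8_t)v;
}

static float
unorm8_to_float(uint8_t u)
{
   return (float)u * (1.0f / 255.0f);
}
```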
Didn't measure performance but cuts shader size * by quite a bit (less difference if cpu has no sse4.1 support). */ - if (util_cpu_caps.has_sse2 && n == 4) { + if (util_cpu_caps.has_sse2 && n > 1) { LLVMValueRef sel, tmp, tmp2; struct lp_build_context bld32; @@ -152,7 +152,7 @@ yuyv_to_yuv_soa(struct gallivm_state *gallivm, * per element. Didn't measure performance but cuts shader size * by quite a bit (less difference if cpu has no sse4.1 support). */ - if (util_cpu_caps.has_sse2 && n == 4) { + if (util_cpu_caps.has_sse2 && n > 1) { LLVMValueRef sel, tmp; struct lp_build_context bld32; diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c b/src/gallium/auxiliary/gallivm/lp_bld_init.c index 768d935dae5..5bf4bcfab3b 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_init.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c @@ -26,15 +26,44 @@ **************************************************************************/ +#include "pipe/p_config.h" #include "pipe/p_compiler.h" #include "util/u_cpu_detect.h" #include "util/u_debug.h" #include "util/u_memory.h" #include "util/u_simple_list.h" +#include "lp_bld.h" #include "lp_bld_debug.h" +#include "lp_bld_misc.h" #include "lp_bld_init.h" +#include <llvm-c/Analysis.h> #include <llvm-c/Transforms/Scalar.h> +#include <llvm-c/BitWriter.h> + + +/** + * AVX is supported in: + * - standard JIT from LLVM 3.2 onwards + * - MC-JIT from LLVM 3.1 + * - MC-JIT supports limited OSes (MacOSX and Linux) + * - standard JIT in LLVM 3.1, with backports + */ +#if HAVE_LLVM >= 0x0301 && (defined(PIPE_OS_LINUX) || defined(PIPE_OS_APPLE)) +# define USE_MCJIT 1 +# define HAVE_AVX 1 +#elif HAVE_LLVM >= 0x0302 || (HAVE_LLVM == 0x0301 && defined(HAVE_JIT_AVX_SUPPORT)) +# define USE_MCJIT 0 +# define HAVE_AVX 1 +#else +# define USE_MCJIT 0 +# define HAVE_AVX 0 +#endif + + +#if USE_MCJIT +void LLVMLinkInMCJIT(); +#endif #ifdef DEBUG @@ -57,6 +86,8 @@ DEBUG_GET_ONCE_FLAGS_OPTION(gallivm_debug, "GALLIVM_DEBUG", lp_bld_debug_flags, static boolean gallivm_initialized = FALSE; +unsigned lp_native_vector_width; + /* * Optimization values are: @@ -81,25 +112,13 @@ enum LLVM_CodeGenOpt_Level { }; +#if HAVE_LLVM <= 0x0206 /** - * LLVM 2.6 permits only one ExecutionEngine to be created. This is it. - */ -static LLVMExecutionEngineRef GlobalEngine = NULL; - -/** - * Same gallivm state shared by all contexts. + * LLVM 2.6 permits only one ExecutionEngine to be created. So use the + * same gallivm state everywhere. 
*/ static struct gallivm_state *GlobalGallivm = NULL; - - - - -extern void -lp_register_oprofile_jit_event_listener(LLVMExecutionEngineRef EE); - -extern void -lp_set_target_options(void); - +#endif /** @@ -111,6 +130,7 @@ static boolean create_pass_manager(struct gallivm_state *gallivm) { assert(!gallivm->passmgr); + assert(gallivm->target); gallivm->passmgr = LLVMCreateFunctionPassManager(gallivm->provider); if (!gallivm->passmgr) @@ -174,33 +194,37 @@ free_gallivm_state(struct gallivm_state *gallivm) &mod, &error); #endif + if (gallivm->passmgr) { + LLVMDisposePassManager(gallivm->passmgr); + } + #if 0 /* XXX this seems to crash with all versions of LLVM */ if (gallivm->provider) LLVMDisposeModuleProvider(gallivm->provider); #endif - if (gallivm->passmgr) - LLVMDisposePassManager(gallivm->passmgr); - -#if HAVE_LLVM >= 0x207 - if (gallivm->module) - LLVMDisposeModule(gallivm->module); -#endif - -#if 0 - /* Don't free the exec engine, it's a global/singleton */ - if (gallivm->engine) + if (HAVE_LLVM >= 0x207 && gallivm->engine) { + /* This will already destroy any associated module */ LLVMDisposeExecutionEngine(gallivm->engine); -#endif + } else { + LLVMDisposeModule(gallivm->module); + } -#if 0 +#if !USE_MCJIT /* Don't free the TargetData, it's owned by the exec engine */ - LLVMDisposeTargetData(gallivm->target); +#else + if (gallivm->target) { + LLVMDisposeTargetData(gallivm->target); + } #endif + /* Never free the LLVM context. + */ +#if 0 if (gallivm->context) LLVMContextDispose(gallivm->context); +#endif if (gallivm->builder) LLVMDisposeBuilder(gallivm->builder); @@ -215,37 +239,14 @@ free_gallivm_state(struct gallivm_state *gallivm) } -/** - * Allocate gallivm LLVM objects. - * \return TRUE for success, FALSE for failure - */ static boolean -init_gallivm_state(struct gallivm_state *gallivm) +init_gallivm_engine(struct gallivm_state *gallivm) { - assert(!gallivm->context); - assert(!gallivm->module); - assert(!gallivm->provider); - - lp_build_init(); - - gallivm->context = LLVMContextCreate(); - if (!gallivm->context) - goto fail; - - gallivm->module = LLVMModuleCreateWithNameInContext("gallivm", - gallivm->context); - if (!gallivm->module) - goto fail; - - gallivm->provider = - LLVMCreateModuleProviderForExistingModule(gallivm->module); - if (!gallivm->provider) - goto fail; - - if (!GlobalEngine) { + if (1) { /* We can only create one LLVMExecutionEngine (w/ LLVM 2.6 anyway) */ enum LLVM_CodeGenOpt_Level optlevel; char *error = NULL; + int ret; if (gallivm_debug & GALLIVM_DEBUG_NO_OPT) { optlevel = None; @@ -254,135 +255,162 @@ init_gallivm_state(struct gallivm_state *gallivm) optlevel = Default; } - if (LLVMCreateJITCompiler(&GlobalEngine, gallivm->provider, - (unsigned) optlevel, &error)) { +#if USE_MCJIT + ret = lp_build_create_mcjit_compiler_for_module(&gallivm->engine, + gallivm->module, + (unsigned) optlevel, + &error); +#else + ret = LLVMCreateJITCompiler(&gallivm->engine, gallivm->provider, + (unsigned) optlevel, &error); +#endif + if (ret) { _debug_printf("%s\n", error); LLVMDisposeMessage(error); goto fail; } #if defined(DEBUG) || defined(PROFILE) - lp_register_oprofile_jit_event_listener(GlobalEngine); + lp_register_oprofile_jit_event_listener(gallivm->engine); #endif } - gallivm->engine = GlobalEngine; - LLVMAddModuleProvider(gallivm->engine, gallivm->provider);//new +#if !USE_MCJIT gallivm->target = LLVMGetExecutionEngineTargetData(gallivm->engine); if (!gallivm->target) goto fail; +#else + if (0) { + /* + * Dump the data layout strings. 
+ */ - if (!create_pass_manager(gallivm)) - goto fail; + LLVMTargetDataRef target = LLVMGetExecutionEngineTargetData(gallivm->engine); + char *data_layout; + char *engine_data_layout; - gallivm->builder = LLVMCreateBuilderInContext(gallivm->context); - if (!gallivm->builder) - goto fail; + data_layout = LLVMCopyStringRepOfTargetData(gallivm->target); + engine_data_layout = LLVMCopyStringRepOfTargetData(target); + + if (1) { + debug_printf("module target data = %s\n", data_layout); + debug_printf("engine target data = %s\n", engine_data_layout); + } + + free(data_layout); + free(engine_data_layout); + } +#endif return TRUE; fail: - free_gallivm_state(gallivm); return FALSE; } -struct callback -{ - garbage_collect_callback_func func; - void *cb_data; - struct callback *prev, *next; -}; - - -/** list of all garbage collector callbacks */ -static struct callback callback_list = {NULL, NULL, NULL, NULL}; +/** + * Singleton + * + * We must never free LLVM contexts, because LLVM has several global caches + * which pointing/derived from objects owned by the context, causing false + * memory leaks and false cache hits when these objects are destroyed. + * + * TODO: For thread safety on multi-threaded OpenGL we should use one LLVM + * context per thread, and put them in a pool when threads are destroyed. + */ +static LLVMContextRef gallivm_context = NULL; /** - * Register a function with gallivm which will be called when we - * do garbage collection. + * Allocate gallivm LLVM objects. + * \return TRUE for success, FALSE for failure */ -void -gallivm_register_garbage_collector_callback(garbage_collect_callback_func func, - void *cb_data) +static boolean +init_gallivm_state(struct gallivm_state *gallivm) { - struct callback *cb; - - if (!callback_list.prev) { - make_empty_list(&callback_list); - } + assert(!gallivm->context); + assert(!gallivm->module); + assert(!gallivm->provider); - /* see if already in list */ - foreach(cb, &callback_list) { - if (cb->func == func && cb->cb_data == cb_data) - return; - } + lp_build_init(); - /* add to list */ - cb = CALLOC_STRUCT(callback); - if (cb) { - cb->func = func; - cb->cb_data = cb_data; - insert_at_head(&callback_list, cb); + if (!gallivm_context) { + gallivm_context = LLVMContextCreate(); } -} + gallivm->context = gallivm_context; + if (!gallivm->context) + goto fail; + gallivm->module = LLVMModuleCreateWithNameInContext("gallivm", + gallivm->context); + if (!gallivm->module) + goto fail; -/** - * Remove a callback. - */ -void -gallivm_remove_garbage_collector_callback(garbage_collect_callback_func func, - void *cb_data) -{ - struct callback *cb; - - /* search list */ - foreach(cb, &callback_list) { - if (cb->func == func && cb->cb_data == cb_data) { - /* found, remove it */ - remove_from_list(cb); - FREE(cb); - return; - } - } -} + gallivm->provider = + LLVMCreateModuleProviderForExistingModule(gallivm->module); + if (!gallivm->provider) + goto fail; + gallivm->builder = LLVMCreateBuilderInContext(gallivm->context); + if (!gallivm->builder) + goto fail; -/** - * Call the callback functions (which are typically in the - * draw module and llvmpipe driver. - */ -static void -call_garbage_collector_callbacks(void) -{ - struct callback *cb; - foreach(cb, &callback_list) { - cb->func(cb->cb_data); + /* FIXME: MC-JIT only allows compiling one module at a time, and it must be + * complete when MC-JIT is created. So defer the MC-JIT engine creation for + * now. 
+ */ +#if !USE_MCJIT + if (!init_gallivm_engine(gallivm)) { + goto fail; } -} +#else + /* + * MC-JIT engine compiles the module immediately on creation, so we can't + * obtain the target data from it. Instead we create a target data layout + * from a string. + * + * The produced layout strings are not precisely the same, but should make + * no difference for the kind of optimization passes we run. + * + * For reference this is the layout string on x64: + * + * e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-f128:128:128-n8:16:32:64 + * + * See also: + * - http://llvm.org/docs/LangRef.html#datalayout + */ + + { + const unsigned pointer_size = 8 * sizeof(void *); + char layout[512]; + util_snprintf(layout, sizeof layout, "%c-p:%u:%u:%u-i64:64:64-a0:0:%u-s0:%u:%u", +#ifdef PIPE_ARCH_LITTLE_ENDIAN + 'e', // little endian +#else + 'E', // big endian +#endif + pointer_size, pointer_size, pointer_size, // pointer size, abi alignment, preferred alignment + pointer_size, // aggregate preferred alignment + pointer_size, pointer_size); // stack objects abi alignment, preferred alignment + gallivm->target = LLVMCreateTargetData(layout); + if (!gallivm->target) { + return FALSE; + } + } +#endif + if (!create_pass_manager(gallivm)) + goto fail; -/** - * Other gallium components using gallivm should call this periodically - * to let us do garbage collection (or at least try to free memory - * accumulated by the LLVM libraries). - */ -void -gallivm_garbage_collect(struct gallivm_state *gallivm) -{ - if (gallivm->context) { - if (gallivm_debug & GALLIVM_DEBUG_GC) - debug_printf("***** Doing LLVM garbage collection\n"); + return TRUE; - call_garbage_collector_callbacks(); - free_gallivm_state(gallivm); - init_gallivm_state(gallivm); - } +fail: + free_gallivm_state(gallivm); + return FALSE; } @@ -398,12 +426,27 @@ lp_build_init(void) lp_set_target_options(); - LLVMInitializeNativeTarget(); - +#if USE_MCJIT + LLVMLinkInMCJIT(); +#else LLVMLinkInJIT(); +#endif util_cpu_detect(); + + if (HAVE_AVX && + util_cpu_caps.has_avx) { + lp_native_vector_width = 256; + } else { + /* Leave it at 128, even when no SIMD extensions are available. + * Really needs to be a multiple of 128 so can fit 4 floats. + */ + lp_native_vector_width = 128; + } + lp_native_vector_width = debug_get_num_option("LP_NATIVE_VECTOR_WIDTH", + lp_native_vector_width); + gallivm_initialized = TRUE; #if 0 @@ -423,16 +466,27 @@ lp_build_init(void) struct gallivm_state * gallivm_create(void) { - if (!GlobalGallivm) { - GlobalGallivm = CALLOC_STRUCT(gallivm_state); - if (GlobalGallivm) { - if (!init_gallivm_state(GlobalGallivm)) { - FREE(GlobalGallivm); - GlobalGallivm = NULL; - } + struct gallivm_state *gallivm; + +#if HAVE_LLVM <= 0x206 + if (GlobalGallivm) { + return GlobalGallivm; + } +#endif + + gallivm = CALLOC_STRUCT(gallivm_state); + if (gallivm) { + if (!init_gallivm_state(gallivm)) { + FREE(gallivm); + gallivm = NULL; } } - return GlobalGallivm; + +#if HAVE_LLVM <= 0x206 + GlobalGallivm = gallivm; +#endif + + return gallivm; } @@ -442,6 +496,132 @@ gallivm_create(void) void gallivm_destroy(struct gallivm_state *gallivm) { +#if HAVE_LLVM <= 0x0206 /* No-op: don't destroy the singleton */ (void) gallivm; +#else + free_gallivm_state(gallivm); + FREE(gallivm); +#endif +} + + +/** + * Validate and optimze a function. 
+ */ +static void +gallivm_optimize_function(struct gallivm_state *gallivm, + LLVMValueRef func) +{ + if (0) { + debug_printf("optimizing %s...\n", LLVMGetValueName(func)); + } + + assert(gallivm->passmgr); + + /* Apply optimizations to LLVM IR */ + LLVMRunFunctionPassManager(gallivm->passmgr, func); + + if (0) { + if (gallivm_debug & GALLIVM_DEBUG_IR) { + /* Print the LLVM IR to stderr */ + lp_debug_dump_value(func); + debug_printf("\n"); + } + } +} + + +/** + * Validate a function. + */ +void +gallivm_verify_function(struct gallivm_state *gallivm, + LLVMValueRef func) +{ + /* Verify the LLVM IR. If invalid, dump and abort */ +#ifdef DEBUG + if (LLVMVerifyFunction(func, LLVMPrintMessageAction)) { + lp_debug_dump_value(func); + assert(0); + return; + } +#endif + + gallivm_optimize_function(gallivm, func); + + if (gallivm_debug & GALLIVM_DEBUG_IR) { + /* Print the LLVM IR to stderr */ + lp_debug_dump_value(func); + debug_printf("\n"); + } +} + + +void +gallivm_compile_module(struct gallivm_state *gallivm) +{ +#if HAVE_LLVM > 0x206 + assert(!gallivm->compiled); +#endif + + /* Dump byte code to a file */ + if (0) { + LLVMWriteBitcodeToFile(gallivm->module, "llvmpipe.bc"); + debug_printf("llvmpipe.bc written\n"); + debug_printf("Invoke as \"llc -o - llvmpipe.bc\"\n"); + } + +#if USE_MCJIT + assert(!gallivm->engine); + if (!init_gallivm_engine(gallivm)) { + assert(0); + } +#endif + assert(gallivm->engine); + + ++gallivm->compiled; +} + + +func_pointer +gallivm_jit_function(struct gallivm_state *gallivm, + LLVMValueRef func) +{ + void *code; + func_pointer jit_func; + + assert(gallivm->compiled); + assert(gallivm->engine); + + code = LLVMGetPointerToGlobal(gallivm->engine, func); + assert(code); + jit_func = pointer_to_func(code); + + if (gallivm_debug & GALLIVM_DEBUG_ASM) { + lp_disassemble(code); + } + + /* Free the function body to save memory */ + lp_func_delete_body(func); + + return jit_func; +} + + +/** + * Free the function (and its machine code). 
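 *
 * A minimal usage sketch pairing this with gallivm_jit_function() (the
 * caller code here is illustrative only):
 *
 *    func_pointer f = gallivm_jit_function(gallivm, func);
 *    // ... call through f ...
 *    gallivm_free_function(gallivm, func, (const void *)f);
 *
 * Note the body below is compiled out for MC-JIT, where the engine owns
 * the machine code until it is destroyed.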
+ */ +void +gallivm_free_function(struct gallivm_state *gallivm, + LLVMValueRef func, + const void *code) +{ +#if !USE_MCJIT + if (code) { + LLVMFreeMachineCodeForFunction(gallivm->engine, func); + } + + LLVMDeleteFunction(func); +#endif } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.h b/src/gallium/auxiliary/gallivm/lp_bld_init.h index 5fc0f996c64..7edea616c4e 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_init.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_init.h @@ -31,6 +31,7 @@ #include "pipe/p_compiler.h" +#include "util/u_pointer.h" // for func_pointer #include "lp_bld.h" #include <llvm-c/ExecutionEngine.h> @@ -44,6 +45,7 @@ struct gallivm_state LLVMPassManagerRef passmgr; LLVMContextRef context; LLVMBuilderRef builder; + unsigned compiled; }; @@ -51,35 +53,28 @@ void lp_build_init(void); -extern void -lp_func_delete_body(LLVMValueRef func); - +struct gallivm_state * +gallivm_create(void); void -gallivm_garbage_collect(struct gallivm_state *gallivm); - +gallivm_destroy(struct gallivm_state *gallivm); -typedef void (*garbage_collect_callback_func)(void *cb_data); void -gallivm_register_garbage_collector_callback(garbage_collect_callback_func func, - void *cb_data); +gallivm_verify_function(struct gallivm_state *gallivm, + LLVMValueRef func); void -gallivm_remove_garbage_collector_callback(garbage_collect_callback_func func, - void *cb_data); +gallivm_compile_module(struct gallivm_state *gallivm); - -struct gallivm_state * -gallivm_create(void); +func_pointer +gallivm_jit_function(struct gallivm_state *gallivm, + LLVMValueRef func); void -gallivm_destroy(struct gallivm_state *gallivm); - - -extern LLVMValueRef -lp_build_load_volatile(LLVMBuilderRef B, LLVMValueRef PointerVal, - const char *Name); +gallivm_free_function(struct gallivm_state *gallivm, + LLVMValueRef func, + const void * code); void lp_set_load_alignment(LLVMValueRef Inst, diff --git a/src/gallium/auxiliary/gallivm/lp_bld_intr.c b/src/gallium/auxiliary/gallivm/lp_bld_intr.c index 2323f124ae4..2bf1211bcd7 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_intr.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_intr.c @@ -48,6 +48,8 @@ #include "lp_bld_const.h" #include "lp_bld_intr.h" +#include "lp_bld_type.h" +#include "lp_bld_pack.h" LLVMValueRef @@ -129,6 +131,95 @@ lp_build_intrinsic_binary(LLVMBuilderRef builder, } +/** + * Call intrinsic with arguments adapted to intrinsic vector length. + * + * Split vectors which are too large for the hw, or expand them if they + * are too small, so a caller calling a function which might use intrinsics + * doesn't need to do splitting/expansion on its own. + * This only supports intrinsics where src and dst types match. 
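 *
 * For example (values illustrative, SSE-style 128 bit intrinsics assumed):
 * with src_type = 8 x float32 and intr_size = 128 the operands are split
 * into two 4-wide halves and the intrinsic is invoked twice; with
 * src_type = 1 x float32 the operands are widened to 4 elements (upper
 * lanes undefined) and the scalar result is extracted again afterwards.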
+ */ +LLVMValueRef +lp_build_intrinsic_binary_anylength(struct gallivm_state *gallivm, + const char *name, + struct lp_type src_type, + unsigned intr_size, + LLVMValueRef a, + LLVMValueRef b) +{ + unsigned i; + struct lp_type intrin_type = src_type; + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context)); + LLVMValueRef anative, bnative; + unsigned intrin_length = intr_size / src_type.width; + + intrin_type.length = intrin_length; + + if (intrin_length > src_type.length) { + LLVMValueRef elems[LP_MAX_VECTOR_LENGTH]; + LLVMValueRef constvec, tmp; + + for (i = 0; i < src_type.length; i++) { + elems[i] = lp_build_const_int32(gallivm, i); + } + for (; i < intrin_length; i++) { + elems[i] = i32undef; + } + if (src_type.length == 1) { + LLVMTypeRef elem_type = lp_build_elem_type(gallivm, intrin_type); + a = LLVMBuildBitCast(builder, a, LLVMVectorType(elem_type, 1), ""); + b = LLVMBuildBitCast(builder, b, LLVMVectorType(elem_type, 1), ""); + } + constvec = LLVMConstVector(elems, intrin_length); + anative = LLVMBuildShuffleVector(builder, a, a, constvec, ""); + bnative = LLVMBuildShuffleVector(builder, b, b, constvec, ""); + tmp = lp_build_intrinsic_binary(builder, name, + lp_build_vec_type(gallivm, intrin_type), + anative, bnative); + if (src_type.length > 1) { + constvec = LLVMConstVector(elems, src_type.length); + return LLVMBuildShuffleVector(builder, tmp, tmp, constvec, ""); + } + else { + return LLVMBuildExtractElement(builder, tmp, elems[0], ""); + } + } + else if (intrin_length < src_type.length) { + unsigned num_vec = src_type.length / intrin_length; + LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH]; + + /* don't support arbitrary size here as this is so yuck */ + if (src_type.length % intrin_length) { + /* FIXME: This is something which should be supported + * but there doesn't seem to be any need for it currently + * so crash and burn.
+ */ + debug_printf("%s: should handle arbitrary vector size\n", + __FUNCTION__); + assert(0); + return NULL; + } + + for (i = 0; i < num_vec; i++) { + anative = lp_build_extract_range(gallivm, a, i*intrin_length, + intrin_length); + bnative = lp_build_extract_range(gallivm, b, i*intrin_length, + intrin_length); + tmp[i] = lp_build_intrinsic_binary(builder, name, + lp_build_vec_type(gallivm, intrin_type), + anative, bnative); + } + return lp_build_concat(gallivm, tmp, intrin_type, num_vec); + } + else { + return lp_build_intrinsic_binary(builder, name, + lp_build_vec_type(gallivm, src_type), + a, b); + } +} + + LLVMValueRef lp_build_intrinsic_map(struct gallivm_state *gallivm, const char *name, diff --git a/src/gallium/auxiliary/gallivm/lp_bld_intr.h b/src/gallium/auxiliary/gallivm/lp_bld_intr.h index b73dd700362..38c5c29c980 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_intr.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_intr.h @@ -78,6 +78,15 @@ lp_build_intrinsic_binary(LLVMBuilderRef builder, LLVMValueRef +lp_build_intrinsic_binary_anylength(struct gallivm_state *gallivm, + const char *name, + struct lp_type src_type, + unsigned intr_size, + LLVMValueRef a, + LLVMValueRef b); + + +LLVMValueRef lp_build_intrinsic_map(struct gallivm_state *gallivm, const char *name, LLVMTypeRef ret_type, diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.c b/src/gallium/auxiliary/gallivm/lp_bld_logic.c index 69796149aaa..7a4a5bb11d3 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_logic.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c @@ -52,8 +52,8 @@ * * select <4 x i1> %C, %A, %B * - * is valid IR (e.g. llvm/test/Assembler/vector-select.ll), but it is not - * supported on any backend. + * is valid IR (e.g. llvm/test/Assembler/vector-select.ll), but it is only + * supported on some backends (x86) starting with llvm 3.1. * * Expanding the boolean vector to full SIMD register width, as in * @@ -485,8 +485,10 @@ lp_build_select(struct lp_build_context *bld, } res = LLVMBuildSelect(builder, mask, a, b, ""); } - else if (util_cpu_caps.has_sse4_1 && - type.width * type.length == 128 && + else if (((util_cpu_caps.has_sse4_1 && + type.width * type.length == 128) || + (util_cpu_caps.has_avx && + type.width * type.length == 256 && type.width >= 32)) && !LLVMIsConstant(a) && !LLVMIsConstant(b) && !LLVMIsConstant(mask)) { @@ -494,8 +496,22 @@ lp_build_select(struct lp_build_context *bld, LLVMTypeRef arg_type; LLVMValueRef args[3]; - if (type.floating && - type.width == 64) { + /* + * There's only float blend in AVX but can just cast i32/i64 + * to float. + */ + if (type.width * type.length == 256) { + if (type.width == 64) { + intrinsic = "llvm.x86.avx.blendv.pd.256"; + arg_type = LLVMVectorType(LLVMDoubleTypeInContext(lc), 4); + } + else { + intrinsic = "llvm.x86.avx.blendv.ps.256"; + arg_type = LLVMVectorType(LLVMFloatTypeInContext(lc), 8); + } + } + else if (type.floating && + type.width == 64) { intrinsic = "llvm.x86.sse41.blendvpd"; arg_type = LLVMVectorType(LLVMDoubleTypeInContext(lc), 2); } else if (type.floating && @@ -591,3 +607,35 @@ lp_build_select_aos(struct lp_build_context *bld, return lp_build_select(bld, mask_vec, a, b); } } + + +/** + * Return (scalar-cast)val ? 
true : false; + */ +LLVMValueRef +lp_build_any_true_range(struct lp_build_context *bld, + unsigned real_length, + LLVMValueRef val) +{ + LLVMBuilderRef builder = bld->gallivm->builder; + LLVMTypeRef scalar_type; + LLVMTypeRef true_type; + + assert(real_length <= bld->type.length); + + true_type = LLVMIntTypeInContext(bld->gallivm->context, + bld->type.width * real_length); + scalar_type = LLVMIntTypeInContext(bld->gallivm->context, + bld->type.width * bld->type.length); + val = LLVMBuildBitCast(builder, val, scalar_type, ""); + /* + * We're always using native types so we can use intrinsics. + * However, if we don't do per-element calculations, we must ensure + * the excess elements aren't used since they may contain garbage. + */ + if (real_length < bld->type.length) { + val = LLVMBuildTrunc(builder, val, true_type, ""); + } + return LLVMBuildICmp(builder, LLVMIntNE, + val, LLVMConstNull(true_type), ""); } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.h b/src/gallium/auxiliary/gallivm/lp_bld_logic.h index ef33a653682..64c0a1f5946 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_logic.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.h @@ -82,4 +82,9 @@ lp_build_select_aos(struct lp_build_context *bld, LLVMValueRef b); +LLVMValueRef +lp_build_any_true_range(struct lp_build_context *bld, + unsigned real_length, + LLVMValueRef val); + #endif /* !LP_BLD_LOGIC_H */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp index 6c4586c4212..dd2c6120afb 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp +++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp @@ -26,6 +26,12 @@ **************************************************************************/ +/** + * The purpose of this module is to expose LLVM functionality not available + * through the C bindings. + */ + + #ifndef __STDC_LIMIT_MACROS #define __STDC_LIMIT_MACROS #endif @@ -41,11 +47,24 @@ #include <llvm/Target/TargetOptions.h> #include <llvm/ExecutionEngine/ExecutionEngine.h> #include <llvm/ExecutionEngine/JITEventListener.h> +#if HAVE_LLVM >= 0x0301 +#include <llvm/ADT/Triple.h> +#include <llvm/ExecutionEngine/JITMemoryManager.h> +#endif #include <llvm/Support/CommandLine.h> #include <llvm/Support/PrettyStackTrace.h> +#if HAVE_LLVM >= 0x0300 +#include <llvm/Support/TargetSelect.h> +#else /* HAVE_LLVM < 0x0300 */ +#include <llvm/Target/TargetSelect.h> +#endif /* HAVE_LLVM < 0x0300 */ + #include "pipe/p_config.h" #include "util/u_debug.h" +#include "util/u_cpu_detect.h" + +#include "lp_bld_misc.h" /** @@ -99,6 +118,9 @@ lp_set_target_options(void) #if defined(DEBUG) || defined(PROFILE) llvm::NoFramePointerElim = true; +#if HAVE_LLVM >= 0x0208 + llvm::NoFramePointerElimNonLeaf = true; +#endif #endif llvm::NoExcessFPPrecision = false; @@ -146,6 +168,30 @@ lp_set_target_options(void) * shared object where the gallium driver resides. */ llvm::DisablePrettyStackTrace = true; + + // If we have a native target, initialize it to ensure it is linked in and + // usable by the JIT.
+ llvm::InitializeNativeTarget(); + +#if HAVE_LLVM >= 0x0208 + llvm::InitializeNativeTargetAsmPrinter(); +#elif defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) + LLVMInitializeX86AsmPrinter(); +#elif defined(PIPE_ARCH_ARM) + LLVMInitializeARMAsmPrinter(); +#elif defined(PIPE_ARCH_PPC) + LLVMInitializePowerPCAsmPrinter(); +#endif + +#if HAVE_LLVM >= 0x0207 +# if HAVE_LLVM >= 0x0301 + llvm::InitializeNativeTargetDisassembler(); +# elif defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) + LLVMInitializeX86Disassembler(); +# elif defined(PIPE_ARCH_ARM) + LLVMInitializeARMDisassembler(); +# endif +#endif } @@ -165,6 +211,7 @@ lp_build_load_volatile(LLVMBuilderRef B, LLVMValueRef PointerVal, return llvm::wrap(llvm::unwrap(B)->CreateLoad(llvm::unwrap(PointerVal), true, Name)); } + extern "C" void lp_set_load_alignment(LLVMValueRef Inst, @@ -180,3 +227,67 @@ lp_set_store_alignment(LLVMValueRef Inst, { llvm::unwrap<llvm::StoreInst>(Inst)->setAlignment(Align); } + + +#if HAVE_LLVM >= 0x301 + +/** + * Same as LLVMCreateJITCompilerForModule, but using MCJIT and enabling AVX + * feature where available. + * + * See also: + * - llvm/lib/ExecutionEngine/ExecutionEngineBindings.cpp + * - llvm/tools/lli/lli.cpp + * - http://markmail.org/message/ttkuhvgj4cxxy2on#query:+page:1+mid:aju2dggerju3ivd3+state:results + */ +extern "C" +LLVMBool +lp_build_create_mcjit_compiler_for_module(LLVMExecutionEngineRef *OutJIT, + LLVMModuleRef M, + unsigned OptLevel, + char **OutError) +{ + using namespace llvm; + + std::string Error; + EngineBuilder builder(unwrap(M)); + builder.setEngineKind(EngineKind::JIT) + .setErrorStr(&Error) + .setOptLevel((CodeGenOpt::Level)OptLevel); + + builder.setUseMCJIT(true); + + llvm::SmallVector<std::string, 1> MAttrs; + if (util_cpu_caps.has_avx) { + /* + * AVX feature is not automatically detected from CPUID by the X86 target + * yet, because the old (yet default) JIT engine is not capable of + * emitting the opcodes. But as we're using MCJIT here, it is safe to + * set this attribute. + */ + MAttrs.push_back("+avx"); + builder.setMAttrs(MAttrs); + } + builder.setJITMemoryManager(JITMemoryManager::CreateDefaultMemManager()); + + ExecutionEngine *JIT; +#if 0 + JIT = builder.create(); +#else + /* + * Workaround http://llvm.org/bugs/show_bug.cgi?id=12833 + */ + StringRef MArch = ""; + StringRef MCPU = ""; + Triple TT(unwrap(M)->getTargetTriple()); + JIT = builder.create(builder.selectTarget(TT, MArch, MCPU, MAttrs)); +#endif + if (JIT) { + *OutJIT = wrap(JIT); + return 0; + } + *OutError = strdup(Error.c_str()); + return 1; +} + +#endif /* HAVE_LLVM >= 0x301 */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_misc.h b/src/gallium/auxiliary/gallivm/lp_bld_misc.h new file mode 100644 index 00000000000..4f80b38280c --- /dev/null +++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.h @@ -0,0 +1,70 @@ +/************************************************************************** + * + * Copyright 2012 VMware, Inc. + * All Rights Reserved.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#ifndef LP_BLD_MISC_H +#define LP_BLD_MISC_H + + +#include "lp_bld.h" +#include <llvm-c/ExecutionEngine.h> + + +#ifdef __cplusplus +extern "C" { +#endif + + + +extern void +lp_register_oprofile_jit_event_listener(LLVMExecutionEngineRef EE); + +extern void +lp_set_target_options(void); + + +extern void +lp_func_delete_body(LLVMValueRef func); + + +extern LLVMValueRef +lp_build_load_volatile(LLVMBuilderRef B, LLVMValueRef PointerVal, + const char *Name); + +extern int +lp_build_create_mcjit_compiler_for_module(LLVMExecutionEngineRef *OutJIT, + LLVMModuleRef M, + unsigned OptLevel, + char **OutError); + + +#ifdef __cplusplus +} +#endif + + +#endif /* !LP_BLD_MISC_H */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.c b/src/gallium/auxiliary/gallivm/lp_bld_pack.c index fde6bb594f1..b18f7841ccb 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c @@ -69,6 +69,7 @@ #include "util/u_debug.h" #include "util/u_math.h" #include "util/u_cpu_detect.h" +#include "util/u_memory.h" #include "lp_bld_type.h" #include "lp_bld_const.h" @@ -76,6 +77,7 @@ #include "lp_bld_intr.h" #include "lp_bld_arit.h" #include "lp_bld_pack.h" +#include "lp_bld_swizzle.h" /** @@ -101,6 +103,30 @@ lp_build_const_unpack_shuffle(struct gallivm_state *gallivm, return LLVMConstVector(elems, n); } +/** + * Similar to lp_build_const_unpack_shuffle but for special AVX 256bit unpack. + * See comment above lp_build_interleave2_half for more details. + */ +static LLVMValueRef +lp_build_const_unpack_shuffle_half(struct gallivm_state *gallivm, + unsigned n, unsigned lo_hi) +{ + LLVMValueRef elems[LP_MAX_VECTOR_LENGTH]; + unsigned i, j; + + assert(n <= LP_MAX_VECTOR_LENGTH); + assert(lo_hi < 2); + + for (i = 0, j = lo_hi*(n/4); i < n; i += 2, ++j) { + if (i == (n / 2)) + j += n / 4; + + elems[i + 0] = lp_build_const_int32(gallivm, 0 + j); + elems[i + 1] = lp_build_const_int32(gallivm, n + j); + } + + return LLVMConstVector(elems, n); +} /** * Build shuffle vectors that match PACKxx instructions. 
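 *
 * (The body is elided by this diff; as an illustration of the intent only:
 * a shuffle of elems[i] = 2*i over the two bitcast inputs keeps the low,
 * least significant, half of every widened lane, producing the
 * "res = l0 l1 .. ln h0 h1 .. hn" layout pictured at lp_build_pack2()
 * below.)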
@@ -119,6 +145,71 @@ lp_build_const_pack_shuffle(struct gallivm_state *gallivm, unsigned n) return LLVMConstVector(elems, n); } +/** + * Return a vector with elements src[start:start+size] + * Most useful for getting half the values out of a 256bit sized vector, + * otherwise may cause data rearrangement to happen. + */ +LLVMValueRef +lp_build_extract_range(struct gallivm_state *gallivm, + LLVMValueRef src, + unsigned start, + unsigned size) +{ + LLVMValueRef elems[LP_MAX_VECTOR_LENGTH]; + unsigned i; + + assert(size <= Elements(elems)); + + for (i = 0; i < size; ++i) + elems[i] = lp_build_const_int32(gallivm, i + start); + + if (size == 1) { + return LLVMBuildExtractElement(gallivm->builder, src, elems[0], ""); + } + else { + return LLVMBuildShuffleVector(gallivm->builder, src, src, + LLVMConstVector(elems, size), ""); + } +} + +/** + * Concatenates several (must be a power of 2) vectors (of same type) + * into a larger one. + * Most useful for building up a 256bit sized vector out of two 128bit ones. + */ +LLVMValueRef +lp_build_concat(struct gallivm_state *gallivm, + LLVMValueRef src[], + struct lp_type src_type, + unsigned num_vectors) +{ + unsigned new_length, i; + LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH/2]; + LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH]; + + assert(src_type.length * num_vectors <= Elements(shuffles)); + assert(util_is_power_of_two(num_vectors)); + + new_length = src_type.length; + + for (i = 0; i < num_vectors; i++) + tmp[i] = src[i]; + + while (num_vectors > 1) { + num_vectors >>= 1; + new_length <<= 1; + for (i = 0; i < new_length; i++) { + shuffles[i] = lp_build_const_int32(gallivm, i); + } + for (i = 0; i < num_vectors; i++) { + tmp[i] = LLVMBuildShuffleVector(gallivm->builder, tmp[i*2], tmp[i*2 + 1], + LLVMConstVector(shuffles, new_length), ""); + } + } + + return tmp[0]; +} /** * Interleave vector elements. @@ -139,6 +230,40 @@ lp_build_interleave2(struct gallivm_state *gallivm, return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, ""); } +/** + * Interleave vector elements, but with 256 bit vectors treat it as an + * interleave of the 2 concatenated 128 bit halves. + * + * This differs from lp_build_interleave2, as that function would do the following (for lo): + * a0 b0 a1 b1 a2 b2 a3 b3, and this does not compile into an AVX unpack instruction. + * + * + * An example interleave of 8x float with 8x float on AVX 256bit unpack: + * a0 a1 a2 a3 a4 a5 a6 a7 <-> b0 b1 b2 b3 b4 b5 b6 b7 + * + * Equivalent to interleaving 2x 128 bit vectors + * a0 a1 a2 a3 <-> b0 b1 b2 b3 concatenated with a4 a5 a6 a7 <-> b4 b5 b6 b7 + * + * So interleave-lo would result in: + * a0 b0 a1 b1 a4 b4 a5 b5 + * + * And interleave-hi would result in: + * a2 b2 a3 b3 a6 b6 a7 b7 + */ +LLVMValueRef +lp_build_interleave2_half(struct gallivm_state *gallivm, + struct lp_type type, + LLVMValueRef a, + LLVMValueRef b, + unsigned lo_hi) +{ + if (type.length * type.width == 256) { + LLVMValueRef shuffle = lp_build_const_unpack_shuffle_half(gallivm, type.length, lo_hi); + return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, ""); + } else { + return lp_build_interleave2(gallivm, type, a, b, lo_hi); + } } /** * Double the bit width. @@ -237,9 +362,9 @@ lp_build_unpack(struct gallivm_state *gallivm, * Non-interleaved pack. * * This will move values as - * - * lo = __ l0 __ l1 __ l2 __.. __ ln - * hi = __ h0 __ h1 __ h2 __.. __ hn + * (LSB) (MSB) + * lo = l0 __ l1 __ l2 __.. __ ln __ + * hi = h0 __ h1 __ h2 __.. __ hn __ + * res = l0 l1 l2 .. ln h0 h1 h2 ..
hn * * This will only change the number of bits the values are represented, not the @@ -257,12 +382,14 @@ lp_build_pack2(struct gallivm_state *gallivm, LLVMValueRef hi) { LLVMBuilderRef builder = gallivm->builder; -#if HAVE_LLVM < 0x0207 - LLVMTypeRef src_vec_type = lp_build_vec_type(gallivm, src_type); -#endif LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, dst_type); LLVMValueRef shuffle; LLVMValueRef res = NULL; + struct lp_type intr_type = dst_type; + +#if HAVE_LLVM < 0x0207 + intr_type = src_type; +#endif assert(!src_type.floating); assert(!dst_type.floating); @@ -270,50 +397,81 @@ lp_build_pack2(struct gallivm_state *gallivm, assert(src_type.length * 2 == dst_type.length); /* Check for special cases first */ - if(util_cpu_caps.has_sse2 && src_type.width * src_type.length == 128) { + if(util_cpu_caps.has_sse2 && src_type.width * src_type.length >= 128) { + const char *intrinsic = NULL; + switch(src_type.width) { case 32: if(dst_type.sign) { -#if HAVE_LLVM >= 0x0207 - res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packssdw.128", dst_vec_type, lo, hi); -#else - res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packssdw.128", src_vec_type, lo, hi); -#endif + intrinsic = "llvm.x86.sse2.packssdw.128"; } else { if (util_cpu_caps.has_sse4_1) { - return lp_build_intrinsic_binary(builder, "llvm.x86.sse41.packusdw", dst_vec_type, lo, hi); - } - else { - /* use generic shuffle below */ - res = NULL; + intrinsic = "llvm.x86.sse41.packusdw"; +#if HAVE_LLVM < 0x0207 + /* llvm < 2.7 has inconsistent signatures except for packusdw */ + intr_type = dst_type; +#endif } } break; - case 16: - if(dst_type.sign) -#if HAVE_LLVM >= 0x0207 - res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packsswb.128", dst_vec_type, lo, hi); -#else - res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packsswb.128", src_vec_type, lo, hi); -#endif - else -#if HAVE_LLVM >= 0x0207 - res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packuswb.128", dst_vec_type, lo, hi); -#else - res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packuswb.128", src_vec_type, lo, hi); -#endif - break; - - default: - assert(0); - return LLVMGetUndef(dst_vec_type); + if (dst_type.sign) { + intrinsic = "llvm.x86.sse2.packsswb.128"; + } + else { + intrinsic = "llvm.x86.sse2.packuswb.128"; + } break; + /* default uses generic shuffle below */ } - - if (res) { - res = LLVMBuildBitCast(builder, res, dst_vec_type, ""); + if (intrinsic) { + if (src_type.width * src_type.length == 128) { + LLVMTypeRef intr_vec_type = lp_build_vec_type(gallivm, intr_type); + res = lp_build_intrinsic_binary(builder, intrinsic, intr_vec_type, lo, hi); + if (dst_vec_type != intr_vec_type) { + res = LLVMBuildBitCast(builder, res, dst_vec_type, ""); + } + } + else { + int num_split = src_type.width * src_type.length / 128; + int i; + int nlen = 128 / src_type.width; + struct lp_type ndst_type = lp_type_unorm(dst_type.width, 128); + struct lp_type nintr_type = lp_type_unorm(intr_type.width, 128); + LLVMValueRef tmpres[LP_MAX_VECTOR_WIDTH / 128]; + LLVMValueRef tmplo, tmphi; + LLVMTypeRef ndst_vec_type = lp_build_vec_type(gallivm, ndst_type); + LLVMTypeRef nintr_vec_type = lp_build_vec_type(gallivm, nintr_type); + + assert(num_split <= LP_MAX_VECTOR_WIDTH / 128); + + for (i = 0; i < num_split / 2; i++) { + tmplo = lp_build_extract_range(gallivm, + lo, i*nlen*2, nlen); + tmphi = lp_build_extract_range(gallivm, + lo, i*nlen*2 + nlen, nlen); + tmpres[i] = lp_build_intrinsic_binary(builder, intrinsic, + nintr_vec_type, tmplo, tmphi); + if 
(ndst_vec_type != nintr_vec_type) { + tmpres[i] = LLVMBuildBitCast(builder, tmpres[i], ndst_vec_type, ""); + } + } + for (i = 0; i < num_split / 2; i++) { + tmplo = lp_build_extract_range(gallivm, + hi, i*nlen*2, nlen); + tmphi = lp_build_extract_range(gallivm, + hi, i*nlen*2 + nlen, nlen); + tmpres[i+num_split/2] = lp_build_intrinsic_binary(builder, intrinsic, + nintr_vec_type, + tmplo, tmphi); + if (ndst_vec_type != nintr_vec_type) { + tmpres[i+num_split/2] = LLVMBuildBitCast(builder, tmpres[i+num_split/2], + ndst_vec_type, ""); + } + } + res = lp_build_concat(gallivm, tmpres, ndst_type, num_split); + } return res; } } @@ -357,8 +515,9 @@ lp_build_packs2(struct gallivm_state *gallivm, /* All X86 SSE non-interleaved pack instructions take signed inputs and * saturate them, so no need to clamp for those cases. */ if(util_cpu_caps.has_sse2 && - src_type.width * src_type.length == 128 && - src_type.sign) + src_type.width * src_type.length >= 128 && + src_type.sign && + (src_type.width == 32 || src_type.width == 16)) clamp = FALSE; if(clamp) { @@ -395,7 +554,6 @@ lp_build_pack(struct gallivm_state *gallivm, LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH]; unsigned i; - /* Register width must remain constant */ assert(src_type.width * src_type.length == dst_type.width * dst_type.length); @@ -487,21 +645,44 @@ lp_build_resize(struct gallivm_state *gallivm, /* * Register width remains constant -- use vector packing intrinsics */ - tmp[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, src, num_srcs); } else { - /* - * Do it element-wise. - */ - - assert(src_type.length == dst_type.length); - tmp[0] = lp_build_undef(gallivm, dst_type); - for (i = 0; i < dst_type.length; ++i) { - LLVMValueRef index = lp_build_const_int32(gallivm, i); - LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], index, ""); - val = LLVMBuildTrunc(builder, val, lp_build_elem_type(gallivm, dst_type), ""); - tmp[0] = LLVMBuildInsertElement(builder, tmp[0], val, index, ""); + if (src_type.width / dst_type.width > num_srcs) { + /* + * First change src vectors size (with shuffle) so they have the + * same size as the destination vector, then pack normally. + * Note: cannot use cast/extract because llvm generates atrocious code. + */ + unsigned size_ratio = (src_type.width * src_type.length) / + (dst_type.length * dst_type.width); + unsigned new_length = src_type.length / size_ratio; + + for (i = 0; i < size_ratio * num_srcs; i++) { + unsigned start_index = (i % size_ratio) * new_length; + tmp[i] = lp_build_extract_range(gallivm, src[i / size_ratio], + start_index, new_length); + } + num_srcs *= size_ratio; + src_type.length = new_length; + tmp[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, tmp, num_srcs); + } + else { + /* + * Truncate bit width but expand vector size - first pack + * then expand simply because this should be more AVX-friendly + * for the cases we probably hit. + */ + unsigned size_ratio = (dst_type.width * dst_type.length) / + (src_type.length * src_type.width); + unsigned num_pack_srcs = num_srcs / size_ratio; + dst_type.length = dst_type.length / size_ratio; + + for (i = 0; i < size_ratio; i++) { + tmp[i] = lp_build_pack(gallivm, src_type, dst_type, TRUE, + &src[i*num_pack_srcs], num_pack_srcs); + } + tmp[0] = lp_build_concat(gallivm, tmp, dst_type, size_ratio); } } } @@ -522,19 +703,24 @@ lp_build_resize(struct gallivm_state *gallivm, /* * Do it element-wise. 
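 *
 * E.g. (illustrative sizes, not from this change): resizing one 8 x i16
 * source into two 4 x i32 destinations copies source element i into
 * tmp[i / 4] at position i % 4, sign- or zero-extending each value on
 * the way.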
 */ + assert(src_type.length * num_srcs == dst_type.length * num_dsts); + + for (i = 0; i < num_dsts; i++) { + tmp[i] = lp_build_undef(gallivm, dst_type); + } - assert(src_type.length == dst_type.length); - tmp[0] = lp_build_undef(gallivm, dst_type); - for (i = 0; i < dst_type.length; ++i) { - LLVMValueRef index = lp_build_const_int32(gallivm, i); - LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], index, ""); + for (i = 0; i < src_type.length; ++i) { + unsigned j = i / dst_type.length; + LLVMValueRef srcindex = lp_build_const_int32(gallivm, i); + LLVMValueRef dstindex = lp_build_const_int32(gallivm, i % dst_type.length); + LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], srcindex, ""); if (src_type.sign && dst_type.sign) { val = LLVMBuildSExt(builder, val, lp_build_elem_type(gallivm, dst_type), ""); } else { val = LLVMBuildZExt(builder, val, lp_build_elem_type(gallivm, dst_type), ""); } - tmp[0] = LLVMBuildInsertElement(builder, tmp[0], val, index, ""); + tmp[j] = LLVMBuildInsertElement(builder, tmp[j], val, dstindex, ""); } } } @@ -554,3 +740,38 @@ lp_build_resize(struct gallivm_state *gallivm, } +/** + * Expands src vector from src.length to dst_length + */ +LLVMValueRef +lp_build_pad_vector(struct gallivm_state *gallivm, + LLVMValueRef src, + struct lp_type src_type, + unsigned dst_length) +{ + LLVMValueRef undef = LLVMGetUndef(lp_build_vec_type(gallivm, src_type)); + LLVMValueRef elems[LP_MAX_VECTOR_LENGTH]; + unsigned i; + + assert(dst_length <= Elements(elems)); + assert(dst_length >= src_type.length); + + if (src_type.length == dst_length) + return src; + + /* If it's a single scalar type, no need to reinvent the wheel */ + if (src_type.length == 1) { + return lp_build_broadcast(gallivm, LLVMVectorType(lp_build_elem_type(gallivm, src_type), dst_length), src); + } + + /* All elements from src vector */ + for (i = 0; i < src_type.length; ++i) + elems[i] = lp_build_const_int32(gallivm, i); + + /* Undef fill remaining space */ + for (i = src_type.length; i < dst_length; ++i) + elems[i] = lp_build_const_int32(gallivm, src_type.length); + + /* Combine the two vectors */ + return LLVMBuildShuffleVector(gallivm->builder, src, undef, LLVMConstVector(elems, dst_length), ""); +} diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.h b/src/gallium/auxiliary/gallivm/lp_bld_pack.h index d58da4f01b3..73f299cca11 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_pack.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.h @@ -44,6 +44,12 @@ struct lp_type; +LLVMValueRef +lp_build_interleave2_half(struct gallivm_state *gallivm, + struct lp_type type, + LLVMValueRef a, + LLVMValueRef b, + unsigned lo_hi); LLVMValueRef lp_build_interleave2(struct gallivm_state *gallivm, @@ -69,6 +75,17 @@ lp_build_unpack(struct gallivm_state *gallivm, LLVMValueRef src, LLVMValueRef *dst, unsigned num_dsts); +LLVMValueRef +lp_build_extract_range(struct gallivm_state *gallivm, + LLVMValueRef src, + unsigned start, + unsigned size); + +LLVMValueRef +lp_build_concat(struct gallivm_state *gallivm, + LLVMValueRef src[], + struct lp_type src_type, + unsigned num_vectors); LLVMValueRef lp_build_packs2(struct gallivm_state *gallivm, @@ -102,4 +119,10 @@ lp_build_resize(struct gallivm_state *gallivm, LLVMValueRef *dst, unsigned num_dsts); +LLVMValueRef +lp_build_pad_vector(struct gallivm_state *gallivm, + LLVMValueRef src, + struct lp_type src_type, + unsigned dst_length); + #endif /* !LP_BLD_PACK_H */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_quad.c b/src/gallium/auxiliary/gallivm/lp_bld_quad.c index
b0a5bc0267f..b1ba7c72655 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_quad.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_quad.c @@ -26,6 +26,7 @@ **************************************************************************/ +#include "u_cpu_detect.h" #include "lp_bld_type.h" #include "lp_bld_arit.h" #include "lp_bld_const.h" @@ -77,34 +78,82 @@ lp_build_ddy(struct lp_build_context *bld, return lp_build_sub(bld, a_bottom, a_top); } - +/* + * To be able to handle multiple quads at once in texture sampling and + * do lod calculations per quad, it is necessary to get the per-quad + * derivatives into the lp_build_rho function. + * For 8-wide vectors the packed derivative values for 3 coords would + * look like this; this scales to an arbitrary (multiple of 4) vector size: + * ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy + * dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ + * The second vector will be unused for 1d and 2d textures. + */ LLVMValueRef -lp_build_scalar_ddx(struct lp_build_context *bld, - LLVMValueRef a) +lp_build_packed_ddx_ddy_onecoord(struct lp_build_context *bld, + LLVMValueRef a) { - LLVMBuilderRef builder = bld->gallivm->builder; - LLVMValueRef idx_left = lp_build_const_int32(bld->gallivm, LP_BLD_QUAD_TOP_LEFT); - LLVMValueRef idx_right = lp_build_const_int32(bld->gallivm, LP_BLD_QUAD_TOP_RIGHT); - LLVMValueRef a_left = LLVMBuildExtractElement(builder, a, idx_left, "left"); - LLVMValueRef a_right = LLVMBuildExtractElement(builder, a, idx_right, "right"); + struct gallivm_state *gallivm = bld->gallivm; + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef vec1, vec2; + + /* same packing as _twocoord, but can use aos swizzle helper */ + + /* + * XXX could make swizzle1 a noop swizzle by using right top/bottom + * pair for ddy + */ + static const unsigned char swizzle1[] = { + LP_BLD_QUAD_TOP_LEFT, LP_BLD_QUAD_TOP_LEFT, + LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE + }; + static const unsigned char swizzle2[] = { + LP_BLD_QUAD_TOP_RIGHT, LP_BLD_QUAD_BOTTOM_LEFT, + LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE + }; + + vec1 = lp_build_swizzle_aos(bld, a, swizzle1); + vec2 = lp_build_swizzle_aos(bld, a, swizzle2); + if (bld->type.floating) - return LLVMBuildFSub(builder, a_right, a_left, "ddx"); + return LLVMBuildFSub(builder, vec2, vec1, "ddxddy"); else - return LLVMBuildSub(builder, a_right, a_left, "ddx"); + return LLVMBuildSub(builder, vec2, vec1, "ddxddy"); } LLVMValueRef -lp_build_scalar_ddy(struct lp_build_context *bld, - LLVMValueRef a) +lp_build_packed_ddx_ddy_twocoord(struct lp_build_context *bld, + LLVMValueRef a, LLVMValueRef b) { - LLVMBuilderRef builder = bld->gallivm->builder; - LLVMValueRef idx_top = lp_build_const_int32(bld->gallivm, LP_BLD_QUAD_TOP_LEFT); - LLVMValueRef idx_bottom = lp_build_const_int32(bld->gallivm, LP_BLD_QUAD_BOTTOM_LEFT); - LLVMValueRef a_top = LLVMBuildExtractElement(builder, a, idx_top, "top"); - LLVMValueRef a_bottom = LLVMBuildExtractElement(builder, a, idx_bottom, "bottom"); + struct gallivm_state *gallivm = bld->gallivm; + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH/4]; + LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH/4]; + LLVMValueRef vec1, vec2; + unsigned length, num_quads, i; + + /* XXX: do hsub version */ + length = bld->type.length; + num_quads = length / 4; + for (i = 0; i < num_quads; i++) { + unsigned s1 = 4 * i; + unsigned s2 = 4 * i + length; + shuffles1[4*i + 0] = lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_LEFT + s1); + shuffles1[4*i + 1] =
lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_LEFT + s1); + shuffles1[4*i + 2] = lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_LEFT + s2); + shuffles1[4*i + 3] = lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_LEFT + s2); + shuffles2[4*i + 0] = lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_RIGHT + s1); + shuffles2[4*i + 1] = lp_build_const_int32(gallivm, LP_BLD_QUAD_BOTTOM_LEFT + s1); + shuffles2[4*i + 2] = lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_RIGHT + s2); + shuffles2[4*i + 3] = lp_build_const_int32(gallivm, LP_BLD_QUAD_BOTTOM_LEFT + s2); + } + vec1 = LLVMBuildShuffleVector(builder, a, b, + LLVMConstVector(shuffles1, length), ""); + vec2 = LLVMBuildShuffleVector(builder, a, b, + LLVMConstVector(shuffles2, length), ""); if (bld->type.floating) - return LLVMBuildFSub(builder, a_bottom, a_top, "ddy"); + return LLVMBuildFSub(builder, vec2, vec1, "ddxddyddxddy"); else - return LLVMBuildSub(builder, a_bottom, a_top, "ddy"); + return LLVMBuildSub(builder, vec2, vec1, "ddxddyddxddy"); } + diff --git a/src/gallium/auxiliary/gallivm/lp_bld_quad.h b/src/gallium/auxiliary/gallivm/lp_bld_quad.h index b7992912927..be6a1efc396 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_quad.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_quad.h @@ -78,19 +78,15 @@ lp_build_ddy(struct lp_build_context *bld, /* - * Scalar derivatives. - * - * Same as getting the first value of above. + * Packed derivatives (one derivative for each direction per quad) */ - LLVMValueRef -lp_build_scalar_ddx(struct lp_build_context *bld, - LLVMValueRef a); - +lp_build_packed_ddx_ddy_twocoord(struct lp_build_context *bld, + LLVMValueRef a, LLVMValueRef b); LLVMValueRef -lp_build_scalar_ddy(struct lp_build_context *bld, - LLVMValueRef a); +lp_build_packed_ddx_ddy_onecoord(struct lp_build_context *bld, + LLVMValueRef a); #endif /* LP_BLD_QUAD_H_ */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c index d966788d74e..85211161f3c 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c @@ -44,6 +44,8 @@ #include "lp_bld_sample.h" #include "lp_bld_swizzle.h" #include "lp_bld_type.h" +#include "lp_bld_logic.h" +#include "lp_bld_pack.h" /* @@ -175,67 +177,89 @@ lp_sampler_static_state(struct lp_sampler_static_state *state, /** * Generate code to compute coordinate gradient (rho). - * \param ddx partial derivatives of (s, t, r, q) with respect to X - * \param ddy partial derivatives of (s, t, r, q) with respect to Y + * \param derivs partial derivatives of (s, t, r, q) with respect to X and Y * - * XXX: The resulting rho is scalar, so we ignore all but the first element of - * derivatives that are passed by the shader. + * The resulting rho is scalar per quad. 
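 *
 * For instance, for a 2d texture and 8-wide vectors, ddx_ddy[0] holds
 * ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy (see the packing
 * comment in lp_bld_quad.c), so the swizzles below yield
 * rho_xvec = |dsdx| |dtdx| .. and rho_yvec = |dsdy| |dtdy| .. per quad.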
*/ static LLVMValueRef lp_build_rho(struct lp_build_sample_context *bld, unsigned unit, - const LLVMValueRef ddx[4], - const LLVMValueRef ddy[4]) + const struct lp_derivatives *derivs) { + struct gallivm_state *gallivm = bld->gallivm; struct lp_build_context *int_size_bld = &bld->int_size_bld; struct lp_build_context *float_size_bld = &bld->float_size_bld; struct lp_build_context *float_bld = &bld->float_bld; + struct lp_build_context *coord_bld = &bld->coord_bld; + struct lp_build_context *perquadf_bld = &bld->perquadf_bld; + const LLVMValueRef *ddx_ddy = derivs->ddx_ddy; const unsigned dims = bld->dims; LLVMBuilderRef builder = bld->gallivm->builder; LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context); LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0); LLVMValueRef index1 = LLVMConstInt(i32t, 1, 0); LLVMValueRef index2 = LLVMConstInt(i32t, 2, 0); - LLVMValueRef dsdx, dsdy, dtdx, dtdy, drdx, drdy; - LLVMValueRef rho_x, rho_y; LLVMValueRef rho_vec; LLVMValueRef int_size, float_size; LLVMValueRef rho; LLVMValueRef first_level, first_level_vec; + LLVMValueRef abs_ddx_ddy[2]; + unsigned length = coord_bld->type.length; + unsigned num_quads = length / 4; + unsigned i; + LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context)); + LLVMValueRef rho_xvec, rho_yvec; + + abs_ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]); + if (dims > 2) { + abs_ddx_ddy[1] = lp_build_abs(coord_bld, ddx_ddy[1]); + } - dsdx = ddx[0]; - dsdy = ddy[0]; - - if (dims <= 1) { - rho_x = dsdx; - rho_y = dsdy; + if (dims == 1) { + static const unsigned char swizzle1[] = { + 0, LP_BLD_SWIZZLE_DONTCARE, + LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE + }; + static const unsigned char swizzle2[] = { + 1, LP_BLD_SWIZZLE_DONTCARE, + LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE + }; + rho_xvec = lp_build_swizzle_aos(coord_bld, abs_ddx_ddy[0], swizzle1); + rho_yvec = lp_build_swizzle_aos(coord_bld, abs_ddx_ddy[0], swizzle2); + } + else if (dims == 2) { + static const unsigned char swizzle1[] = { + 0, 2, + LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE + }; + static const unsigned char swizzle2[] = { + 1, 3, + LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE + }; + rho_xvec = lp_build_swizzle_aos(coord_bld, abs_ddx_ddy[0], swizzle1); + rho_yvec = lp_build_swizzle_aos(coord_bld, abs_ddx_ddy[0], swizzle2); } else { - rho_x = float_size_bld->undef; - rho_y = float_size_bld->undef; - - rho_x = LLVMBuildInsertElement(builder, rho_x, dsdx, index0, ""); - rho_y = LLVMBuildInsertElement(builder, rho_y, dsdy, index0, ""); - - dtdx = ddx[1]; - dtdy = ddy[1]; - - rho_x = LLVMBuildInsertElement(builder, rho_x, dtdx, index1, ""); - rho_y = LLVMBuildInsertElement(builder, rho_y, dtdy, index1, ""); - - if (dims >= 3) { - drdx = ddx[2]; - drdy = ddy[2]; - - rho_x = LLVMBuildInsertElement(builder, rho_x, drdx, index2, ""); - rho_y = LLVMBuildInsertElement(builder, rho_y, drdy, index2, ""); + LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH]; + LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH]; + assert(dims == 3); + for (i = 0; i < num_quads; i++) { + shuffles1[4*i + 0] = lp_build_const_int32(gallivm, 4*i); + shuffles1[4*i + 1] = lp_build_const_int32(gallivm, 4*i + 2); + shuffles1[4*i + 2] = lp_build_const_int32(gallivm, length + 4*i); + shuffles1[4*i + 3] = i32undef; + shuffles2[4*i + 0] = lp_build_const_int32(gallivm, 4*i + 1); + shuffles2[4*i + 1] = lp_build_const_int32(gallivm, 4*i + 3); + shuffles2[4*i + 2] = lp_build_const_int32(gallivm, length + 4*i + 1); + shuffles2[4*i + 3] = i32undef; } + rho_xvec = 
LLVMBuildShuffleVector(builder, abs_ddx_ddy[0], abs_ddx_ddy[1], + LLVMConstVector(shuffles1, length), ""); + rho_yvec = LLVMBuildShuffleVector(builder, abs_ddx_ddy[0], abs_ddx_ddy[1], + LLVMConstVector(shuffles2, length), ""); } - rho_x = lp_build_abs(float_size_bld, rho_x); - rho_y = lp_build_abs(float_size_bld, rho_y); - - rho_vec = lp_build_max(float_size_bld, rho_x, rho_y); + rho_vec = lp_build_max(coord_bld, rho_xvec, rho_yvec); first_level = bld->dynamic_state->first_level(bld->dynamic_state, bld->gallivm, unit); @@ -243,22 +267,77 @@ lp_build_rho(struct lp_build_sample_context *bld, int_size = lp_build_minify(int_size_bld, bld->int_size, first_level_vec); float_size = lp_build_int_to_float(float_size_bld, int_size); - rho_vec = lp_build_mul(float_size_bld, rho_vec, float_size); + if (bld->coord_type.length > 4) { + /* expand size to each quad */ + if (dims > 1) { + /* could use some broadcast_vector helper for this? */ + int num_quads = bld->coord_type.length / 4; + LLVMValueRef src[LP_MAX_VECTOR_LENGTH/4]; + for (i = 0; i < num_quads; i++) { + src[i] = float_size; + } + float_size = lp_build_concat(bld->gallivm, src, float_size_bld->type, num_quads); + } + else { + float_size = lp_build_broadcast_scalar(coord_bld, float_size); + } + rho_vec = lp_build_mul(coord_bld, rho_vec, float_size); - if (dims <= 1) { - rho = rho_vec; + if (dims <= 1) { + rho = rho_vec; + } + else { + if (dims >= 2) { + static const unsigned char swizzle1[] = { + 0, LP_BLD_SWIZZLE_DONTCARE, + LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE + }; + static const unsigned char swizzle2[] = { + 1, LP_BLD_SWIZZLE_DONTCARE, + LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE + }; + LLVMValueRef rho_s, rho_t, rho_r; + + rho_s = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1); + rho_t = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle2); + + rho = lp_build_max(coord_bld, rho_s, rho_t); + + if (dims >= 3) { + static const unsigned char swizzle3[] = { + 2, LP_BLD_SWIZZLE_DONTCARE, + LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE + }; + rho_r = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle3); + rho = lp_build_max(coord_bld, rho, rho_r); + } + } + } + rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type, + perquadf_bld->type, rho); } else { - if (dims >= 2) { - LLVMValueRef rho_s, rho_t, rho_r; + if (dims <= 1) { + rho_vec = LLVMBuildExtractElement(builder, rho_vec, index0, ""); + } + rho_vec = lp_build_mul(float_size_bld, rho_vec, float_size); - rho_s = LLVMBuildExtractElement(builder, rho_vec, index0, ""); - rho_t = LLVMBuildExtractElement(builder, rho_vec, index1, ""); + if (dims <= 1) { + rho = rho_vec; + } + else { + if (dims >= 2) { + LLVMValueRef rho_s, rho_t, rho_r; + + rho_s = LLVMBuildExtractElement(builder, rho_vec, index0, ""); + rho_t = LLVMBuildExtractElement(builder, rho_vec, index1, ""); - rho = lp_build_max(float_bld, rho_s, rho_t); - if (dims >= 3) { - rho_r = LLVMBuildExtractElement(builder, rho_vec, index2, ""); - rho = lp_build_max(float_bld, rho, rho_r); + rho = lp_build_max(float_bld, rho_s, rho_t); + + if (dims >= 3) { + rho_r = LLVMBuildExtractElement(builder, rho_vec, index2, ""); + rho = lp_build_max(float_bld, rho, rho_r); + } } } } @@ -396,22 +475,20 @@ lp_build_brilinear_rho(struct lp_build_context *bld, /** * Generate code to compute texture level of detail (lambda). 
- * \param ddx partial derivatives of (s, t, r, q) with respect to X - * \param ddy partial derivatives of (s, t, r, q) with respect to Y + * \param derivs partial derivatives of (s, t, r, q) with respect to X and Y * \param lod_bias optional float vector with the shader lod bias * \param explicit_lod optional float vector with the explicit lod * \param width scalar int texture width * \param height scalar int texture height * \param depth scalar int texture depth * - * XXX: The resulting lod is scalar, so ignore all but the first element of - * derivatives, lod_bias, etc that are passed by the shader. + * The resulting lod is scalar per quad, so only the first value per quad + * passed in from lod_bias, explicit_lod is used. */ void lp_build_lod_selector(struct lp_build_sample_context *bld, unsigned unit, - const LLVMValueRef ddx[4], - const LLVMValueRef ddy[4], + const struct lp_derivatives *derivs, LLVMValueRef lod_bias, /* optional */ LLVMValueRef explicit_lod, /* optional */ unsigned mip_filter, @@ -420,11 +497,11 @@ lp_build_lod_selector(struct lp_build_sample_context *bld, { LLVMBuilderRef builder = bld->gallivm->builder; - struct lp_build_context *float_bld = &bld->float_bld; + struct lp_build_context *perquadf_bld = &bld->perquadf_bld; LLVMValueRef lod; - *out_lod_ipart = bld->int_bld.zero; - *out_lod_fpart = bld->float_bld.zero; + *out_lod_ipart = bld->perquadi_bld.zero; + *out_lod_fpart = perquadf_bld->zero; if (bld->static_state->min_max_lod_equal) { /* User is forcing sampling from a particular mipmap level. @@ -433,21 +510,17 @@ lp_build_lod_selector(struct lp_build_sample_context *bld, LLVMValueRef min_lod = bld->dynamic_state->min_lod(bld->dynamic_state, bld->gallivm, unit); - lod = min_lod; + lod = lp_build_broadcast_scalar(perquadf_bld, min_lod); } else { - LLVMValueRef sampler_lod_bias = - bld->dynamic_state->lod_bias(bld->dynamic_state, bld->gallivm, unit); - LLVMValueRef index0 = lp_build_const_int32(bld->gallivm, 0); - if (explicit_lod) { - lod = LLVMBuildExtractElement(builder, explicit_lod, - index0, ""); + lod = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type, + perquadf_bld->type, explicit_lod); } else { LLVMValueRef rho; - rho = lp_build_rho(bld, unit, ddx, ddy); + rho = lp_build_rho(bld, unit, derivs); /* * Compute lod = log2(rho) @@ -465,66 +538,72 @@ lp_build_lod_selector(struct lp_build_sample_context *bld, if (mip_filter == PIPE_TEX_MIPFILTER_NONE || mip_filter == PIPE_TEX_MIPFILTER_NEAREST) { - *out_lod_ipart = lp_build_ilog2(float_bld, rho); - *out_lod_fpart = bld->float_bld.zero; + *out_lod_ipart = lp_build_ilog2(perquadf_bld, rho); + *out_lod_fpart = perquadf_bld->zero; return; } if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR && !(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) { - lp_build_brilinear_rho(float_bld, rho, BRILINEAR_FACTOR, + lp_build_brilinear_rho(perquadf_bld, rho, BRILINEAR_FACTOR, out_lod_ipart, out_lod_fpart); return; } } if (0) { - lod = lp_build_log2(float_bld, rho); + lod = lp_build_log2(perquadf_bld, rho); } else { - lod = lp_build_fast_log2(float_bld, rho); + lod = lp_build_fast_log2(perquadf_bld, rho); } /* add shader lod bias */ if (lod_bias) { - lod_bias = LLVMBuildExtractElement(builder, lod_bias, - index0, ""); + lod_bias = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type, + perquadf_bld->type, lod_bias); lod = LLVMBuildFAdd(builder, lod, lod_bias, "shader_lod_bias"); } } /* add sampler lod bias */ - if (bld->static_state->lod_bias_non_zero) + if (bld->static_state->lod_bias_non_zero) { + LLVMValueRef 
sampler_lod_bias = + bld->dynamic_state->lod_bias(bld->dynamic_state, bld->gallivm, unit); + sampler_lod_bias = lp_build_broadcast_scalar(perquadf_bld, + sampler_lod_bias); lod = LLVMBuildFAdd(builder, lod, sampler_lod_bias, "sampler_lod_bias"); - + } /* clamp lod */ if (bld->static_state->apply_max_lod) { LLVMValueRef max_lod = bld->dynamic_state->max_lod(bld->dynamic_state, bld->gallivm, unit); + max_lod = lp_build_broadcast_scalar(perquadf_bld, max_lod); - lod = lp_build_min(float_bld, lod, max_lod); + lod = lp_build_min(perquadf_bld, lod, max_lod); } if (bld->static_state->apply_min_lod) { LLVMValueRef min_lod = bld->dynamic_state->min_lod(bld->dynamic_state, bld->gallivm, unit); + min_lod = lp_build_broadcast_scalar(perquadf_bld, min_lod); - lod = lp_build_max(float_bld, lod, min_lod); + lod = lp_build_max(perquadf_bld, lod, min_lod); } } if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { if (!(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) { - lp_build_brilinear_lod(float_bld, lod, BRILINEAR_FACTOR, + lp_build_brilinear_lod(perquadf_bld, lod, BRILINEAR_FACTOR, out_lod_ipart, out_lod_fpart); } else { - lp_build_ifloor_fract(float_bld, lod, out_lod_ipart, out_lod_fpart); + lp_build_ifloor_fract(perquadf_bld, lod, out_lod_ipart, out_lod_fpart); } lp_build_name(*out_lod_fpart, "lod_fpart"); } else { - *out_lod_ipart = lp_build_iround(float_bld, lod); + *out_lod_ipart = lp_build_iround(perquadf_bld, lod); } lp_build_name(*out_lod_ipart, "lod_ipart"); @@ -536,8 +615,8 @@ lp_build_lod_selector(struct lp_build_sample_context *bld, /** * For PIPE_TEX_MIPFILTER_NEAREST, convert float LOD to integer * mipmap level index. - * Note: this is all scalar code. - * \param lod scalar float texture level of detail + * Note: this is all scalar per quad code. + * \param lod_ipart int texture level of detail * \param level_out returns integer */ void @@ -546,26 +625,27 @@ lp_build_nearest_mip_level(struct lp_build_sample_context *bld, LLVMValueRef lod_ipart, LLVMValueRef *level_out) { - struct lp_build_context *int_bld = &bld->int_bld; + struct lp_build_context *perquadi_bld = &bld->perquadi_bld; LLVMValueRef first_level, last_level, level; first_level = bld->dynamic_state->first_level(bld->dynamic_state, bld->gallivm, unit); last_level = bld->dynamic_state->last_level(bld->dynamic_state, bld->gallivm, unit); + first_level = lp_build_broadcast_scalar(perquadi_bld, first_level); + last_level = lp_build_broadcast_scalar(perquadi_bld, last_level); - /* convert float lod to integer */ - level = lp_build_add(int_bld, lod_ipart, first_level); + level = lp_build_add(perquadi_bld, lod_ipart, first_level); /* clamp level to legal range of levels */ - *level_out = lp_build_clamp(int_bld, level, first_level, last_level); + *level_out = lp_build_clamp(perquadi_bld, level, first_level, last_level); } /** - * For PIPE_TEX_MIPFILTER_LINEAR, convert float LOD to integer to - * two (adjacent) mipmap level indexes. Later, we'll sample from those - * two mipmap levels and interpolate between them. + * For PIPE_TEX_MIPFILTER_LINEAR, convert per-quad int LOD(s) to two (per-quad) + * (adjacent) mipmap level indexes, and fix up float lod part accordingly. + * Later, we'll sample from those two mipmap levels and interpolate between them. 
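 *
 * E.g. (illustrative values): with first_level 0, last_level 3 and a
 * per-quad lod_ipart of 3, level0 becomes 3 and level1 would be 4, so
 * level1 is clamped back to 3 and the corresponding lod_fpart is forced
 * to zero, ensuring no out-of-range mipmap is ever sampled.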
*/ void lp_build_linear_mip_levels(struct lp_build_sample_context *bld, @@ -576,20 +656,21 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld, LLVMValueRef *level1_out) { LLVMBuilderRef builder = bld->gallivm->builder; - struct lp_build_context *int_bld = &bld->int_bld; - struct lp_build_context *float_bld = &bld->float_bld; + struct lp_build_context *perquadi_bld = &bld->perquadi_bld; + struct lp_build_context *perquadf_bld = &bld->perquadf_bld; LLVMValueRef first_level, last_level; LLVMValueRef clamp_min; LLVMValueRef clamp_max; first_level = bld->dynamic_state->first_level(bld->dynamic_state, bld->gallivm, unit); - - *level0_out = lp_build_add(int_bld, lod_ipart, first_level); - *level1_out = lp_build_add(int_bld, *level0_out, int_bld->one); - last_level = bld->dynamic_state->last_level(bld->dynamic_state, bld->gallivm, unit); + first_level = lp_build_broadcast_scalar(perquadi_bld, first_level); + last_level = lp_build_broadcast_scalar(perquadi_bld, last_level); + + *level0_out = lp_build_add(perquadi_bld, lod_ipart, first_level); + *level1_out = lp_build_add(perquadi_bld, *level0_out, perquadi_bld->one); /* * Clamp both *level0_out and *level1_out to [first_level, last_level], with @@ -597,6 +678,15 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld, * ends in the process. */ + /* + * This code (vector select in particular) only works with llvm 3.1 + * (if there's more than one quad, with x86 backend). Might consider + * converting to our lp_bld_logic helpers. + */ +#if HAVE_LLVM < 0x0301 + assert(perquadi_bld->type.length == 1); +#endif + /* *level0_out < first_level */ clamp_min = LLVMBuildICmp(builder, LLVMIntSLT, *level0_out, first_level, @@ -609,7 +699,7 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld, first_level, *level1_out, ""); *lod_fpart_inout = LLVMBuildSelect(builder, clamp_min, - float_bld->zero, *lod_fpart_inout, ""); + perquadf_bld->zero, *lod_fpart_inout, ""); /* *level0_out >= last_level */ clamp_max = LLVMBuildICmp(builder, LLVMIntSGE, @@ -623,7 +713,7 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld, last_level, *level1_out, ""); *lod_fpart_inout = LLVMBuildSelect(builder, clamp_max, - float_bld->zero, *lod_fpart_inout, ""); + perquadf_bld->zero, *lod_fpart_inout, ""); lp_build_name(*level0_out, "sampler%u_miplevel0", unit); lp_build_name(*level1_out, "sampler%u_miplevel1", unit); @@ -651,15 +741,6 @@ lp_build_get_mipmap_level(struct lp_build_sample_context *bld, } -LLVMValueRef -lp_build_get_const_mipmap_level(struct lp_build_sample_context *bld, - int level) -{ - LLVMValueRef lvl = lp_build_const_int32(bld->gallivm, level); - return lp_build_get_mipmap_level(bld, lvl); -} - - /** * Codegen equivalent for u_minify(). * Return max(1, base_size >> level); @@ -748,8 +829,7 @@ lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld, * bld->int_size_type or bld->float_size_type) * @param coord_type type of the texture size vector (either * bld->int_coord_type or bld->coord_type) - * @param int_size vector with the integer texture size (width, height, - * depth) + * @param size vector with the texture size (width, height, depth) */ void lp_build_extract_image_sizes(struct lp_build_sample_context *bld, @@ -788,7 +868,7 @@ lp_build_extract_image_sizes(struct lp_build_sample_context *bld, /** * Unnormalize coords. 
* - * @param int_size vector with the integer texture size (width, height, depth) + * @param flt_size vector with the texture size as floats (width, height, depth) */ void lp_build_unnormalized_coords(struct lp_build_sample_context *bld, @@ -823,7 +903,18 @@ lp_build_unnormalized_coords(struct lp_build_sample_context *bld, /** Helper used by lp_build_cube_lookup() */ static LLVMValueRef -lp_build_cube_ima(struct lp_build_context *coord_bld, LLVMValueRef coord) +lp_build_cube_imapos(struct lp_build_context *coord_bld, LLVMValueRef coord) +{ + /* ima = +0.5 / abs(coord); */ + LLVMValueRef posHalf = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, 0.5); + LLVMValueRef absCoord = lp_build_abs(coord_bld, coord); + LLVMValueRef ima = lp_build_div(coord_bld, posHalf, absCoord); + return ima; +} + +/** Helper used by lp_build_cube_lookup() */ +static LLVMValueRef +lp_build_cube_imaneg(struct lp_build_context *coord_bld, LLVMValueRef coord) { /* ima = -0.5 / abs(coord); */ LLVMValueRef negHalf = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, -0.5); @@ -832,9 +923,12 @@ lp_build_cube_ima(struct lp_build_context *coord_bld, LLVMValueRef coord) return ima; } - /** * Helper used by lp_build_cube_lookup() + * FIXME: the sign here can also be 0. + * Arithmetically this could definitely make a difference. Either + * fix the comment or use another (simpler) sign function, not sure + * which one it should be. * \param sign scalar +1 or -1 * \param coord float vector * \param ima float vector @@ -898,58 +992,186 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld, LLVMValueRef *face_s, LLVMValueRef *face_t) { - struct lp_build_context *float_bld = &bld->float_bld; struct lp_build_context *coord_bld = &bld->coord_bld; LLVMBuilderRef builder = bld->gallivm->builder; + struct gallivm_state *gallivm = bld->gallivm; LLVMValueRef rx, ry, rz; - LLVMValueRef arx, ary, arz; - LLVMValueRef c25 = lp_build_const_float(bld->gallivm, 0.25); - LLVMValueRef arx_ge_ary, arx_ge_arz; - LLVMValueRef ary_ge_arx, ary_ge_arz; - LLVMValueRef arx_ge_ary_arz, ary_ge_arx_arz; - - assert(bld->coord_bld.type.length == 4); + LLVMValueRef tmp[4], rxyz, arxyz; /* * Use the average of the four pixels' texcoords to choose the face. + * Slight simplification: just calculate the sum, skip scaling.
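 *
 * As a sketch of the selection below: the face is chosen by the summed
 * coordinate with the largest magnitude, e.g. if |rx| >= |ry| and
 * max(|rx|, |ry|) >= |rz| the face is +/-X, with s = sign(rx) * -rz and
 * t = -ry, both scaled by ima = 0.5 / |major| and biased by +0.5 into
 * the [0, 1] range.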
*/ - rx = lp_build_mul(float_bld, c25, - lp_build_sum_vector(&bld->coord_bld, s)); - ry = lp_build_mul(float_bld, c25, - lp_build_sum_vector(&bld->coord_bld, t)); - rz = lp_build_mul(float_bld, c25, - lp_build_sum_vector(&bld->coord_bld, r)); + tmp[0] = s; + tmp[1] = t; + tmp[2] = r; + rxyz = lp_build_hadd_partial4(&bld->coord_bld, tmp, 3); + arxyz = lp_build_abs(&bld->coord_bld, rxyz); + + if (coord_bld->type.length > 4) { + struct lp_build_context *cint_bld = &bld->int_coord_bld; + struct lp_type intctype = cint_bld->type; + LLVMValueRef signrxs, signrys, signrzs, signrxyz, sign; + LLVMValueRef arxs, arys, arzs; + LLVMValueRef arx_ge_ary, maxarxsarys, arz_ge_arx_ary; + LLVMValueRef snewx, tnewx, snewy, tnewy, snewz, tnewz; + LLVMValueRef ryneg, rzneg; + LLVMValueRef ma, ima; + LLVMValueRef posHalf = lp_build_const_vec(gallivm, coord_bld->type, 0.5); + LLVMValueRef signmask = lp_build_const_int_vec(gallivm, intctype, + 1 << (intctype.width - 1)); + LLVMValueRef signshift = lp_build_const_int_vec(gallivm, intctype, + intctype.width - 1); + LLVMValueRef facex = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_X); + LLVMValueRef facey = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Y); + LLVMValueRef facez = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Z); + + assert(PIPE_TEX_FACE_NEG_X == PIPE_TEX_FACE_POS_X + 1); + assert(PIPE_TEX_FACE_NEG_Y == PIPE_TEX_FACE_POS_Y + 1); + assert(PIPE_TEX_FACE_NEG_Z == PIPE_TEX_FACE_POS_Z + 1); + + rx = LLVMBuildBitCast(builder, s, lp_build_vec_type(gallivm, intctype), ""); + ry = LLVMBuildBitCast(builder, t, lp_build_vec_type(gallivm, intctype), ""); + rz = LLVMBuildBitCast(builder, r, lp_build_vec_type(gallivm, intctype), ""); + ryneg = LLVMBuildXor(builder, ry, signmask, ""); + rzneg = LLVMBuildXor(builder, rz, signmask, ""); + + /* the sign bit comes from the averaged vector (per quad), + * as does the decision of which face to use */ + signrxyz = LLVMBuildBitCast(builder, rxyz, lp_build_vec_type(gallivm, intctype), ""); + signrxyz = LLVMBuildAnd(builder, signrxyz, signmask, ""); + + arxs = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 0); + arys = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 1); + arzs = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 2); - arx = lp_build_abs(float_bld, rx); - ary = lp_build_abs(float_bld, ry); - arz = lp_build_abs(float_bld, rz); + /* + * select x if x >= y else select y + * select previous result if max(x,y) >= z else select z + */ + arx_ge_ary = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, arxs, arys); + maxarxsarys = lp_build_max(coord_bld, arxs, arys); + arz_ge_arx_ary = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, maxarxsarys, arzs); - /* - * Compare sign/magnitude of rx,ry,rz to determine face - */ - arx_ge_ary = LLVMBuildFCmp(builder, LLVMRealUGE, arx, ary, ""); - arx_ge_arz = LLVMBuildFCmp(builder, LLVMRealUGE, arx, arz, ""); - ary_ge_arx = LLVMBuildFCmp(builder, LLVMRealUGE, ary, arx, ""); - ary_ge_arz = LLVMBuildFCmp(builder, LLVMRealUGE, ary, arz, ""); + /* + * compute all possible new s/t coords + * snewx = signrx * -rz; + * tnewx = -ry; + * snewy = rx; + * tnewy = signry * rz; + * snewz = signrz * rx; + * tnewz = -ry; + */ + signrxs = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 0); + snewx = LLVMBuildXor(builder, signrxs, rzneg, ""); + tnewx = ryneg; + + signrys = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 1); + snewy = rx; + tnewy = LLVMBuildXor(builder, signrys, rz, ""); + + signrzs = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 2); + snewz =
LLVMBuildXor(builder, signrzs, rx, ""); + tnewz = ryneg; + + /* XXX on x86 it is unclear if we should cast the values back to float + * or not - on some cpus (nehalem) pblendvb has twice the throughput + * of blendvps, though on others there might just be domain + * transition penalties when using it (this depends on what llvm + * will choose for the bit ops above, so there appears to be no "right way", + * but given the boatload of selects let's just use the int type). + * + * Unfortunately we also need the sign bit of the summed coords. + */ + *face_s = lp_build_select(cint_bld, arx_ge_ary, snewx, snewy); + *face_t = lp_build_select(cint_bld, arx_ge_ary, tnewx, tnewy); + ma = lp_build_select(coord_bld, arx_ge_ary, s, t); + *face = lp_build_select(cint_bld, arx_ge_ary, facex, facey); + sign = lp_build_select(cint_bld, arx_ge_ary, signrxs, signrys); + + *face_s = lp_build_select(cint_bld, arz_ge_arx_ary, *face_s, snewz); + *face_t = lp_build_select(cint_bld, arz_ge_arx_ary, *face_t, tnewz); + ma = lp_build_select(coord_bld, arz_ge_arx_ary, ma, r); + *face = lp_build_select(cint_bld, arz_ge_arx_ary, *face, facez); + sign = lp_build_select(cint_bld, arz_ge_arx_ary, sign, signrzs); + + *face_s = LLVMBuildBitCast(builder, *face_s, + lp_build_vec_type(gallivm, coord_bld->type), ""); + *face_t = LLVMBuildBitCast(builder, *face_t, + lp_build_vec_type(gallivm, coord_bld->type), ""); + + /* add +1 for neg face */ + /* XXX with AVX we probably want to use another select here - + * as long as we ensure vblendvps gets used, we can actually + * skip the comparison and just use sign as a "mask" directly. + */ + sign = LLVMBuildLShr(builder, sign, signshift, ""); + *face = LLVMBuildOr(builder, *face, sign, "face"); - arx_ge_ary_arz = LLVMBuildAnd(builder, arx_ge_ary, arx_ge_arz, ""); - ary_ge_arx_arz = LLVMBuildAnd(builder, ary_ge_arx, ary_ge_arz, ""); + ima = lp_build_cube_imapos(coord_bld, ma); + + *face_s = lp_build_mul(coord_bld, *face_s, ima); + *face_s = lp_build_add(coord_bld, *face_s, posHalf); + *face_t = lp_build_mul(coord_bld, *face_t, ima); + *face_t = lp_build_add(coord_bld, *face_t, posHalf); + } - { + else { struct lp_build_if_state if_ctx; LLVMValueRef face_s_var; LLVMValueRef face_t_var; LLVMValueRef face_var; - - face_s_var = lp_build_alloca(bld->gallivm, bld->coord_bld.vec_type, "face_s_var"); - face_t_var = lp_build_alloca(bld->gallivm, bld->coord_bld.vec_type, "face_t_var"); - face_var = lp_build_alloca(bld->gallivm, bld->int_bld.vec_type, "face_var"); - - lp_build_if(&if_ctx, bld->gallivm, arx_ge_ary_arz); + LLVMValueRef arx_ge_ary_arz, ary_ge_arx_arz; + LLVMValueRef shuffles[4]; + LLVMValueRef arxy_ge_aryx, arxy_ge_arzz, arxy_ge_arxy_arzz; + LLVMValueRef arxyxy, aryxzz, arxyxy_ge_aryxzz; + struct lp_build_context *float_bld = &bld->float_bld; + + assert(bld->coord_bld.type.length == 4); + + shuffles[0] = lp_build_const_int32(gallivm, 0); + shuffles[1] = lp_build_const_int32(gallivm, 1); + shuffles[2] = lp_build_const_int32(gallivm, 0); + shuffles[3] = lp_build_const_int32(gallivm, 1); + arxyxy = LLVMBuildShuffleVector(builder, arxyz, arxyz, LLVMConstVector(shuffles, 4), ""); + shuffles[0] = lp_build_const_int32(gallivm, 1); + shuffles[1] = lp_build_const_int32(gallivm, 0); + shuffles[2] = lp_build_const_int32(gallivm, 2); + shuffles[3] = lp_build_const_int32(gallivm, 2); + aryxzz = LLVMBuildShuffleVector(builder, arxyz, arxyz, LLVMConstVector(shuffles, 4), ""); + arxyxy_ge_aryxzz = lp_build_cmp(&bld->coord_bld, PIPE_FUNC_GEQUAL, arxyxy, aryxzz); + + shuffles[0] = lp_build_const_int32(gallivm, 0); + 
shuffles[1] = lp_build_const_int32(gallivm, 1); + arxy_ge_aryx = LLVMBuildShuffleVector(builder, arxyxy_ge_aryxzz, arxyxy_ge_aryxzz, + LLVMConstVector(shuffles, 2), ""); + shuffles[0] = lp_build_const_int32(gallivm, 2); + shuffles[1] = lp_build_const_int32(gallivm, 3); + arxy_ge_arzz = LLVMBuildShuffleVector(builder, arxyxy_ge_aryxzz, arxyxy_ge_aryxzz, + LLVMConstVector(shuffles, 2), ""); + arxy_ge_arxy_arzz = LLVMBuildAnd(builder, arxy_ge_aryx, arxy_ge_arzz, ""); + + arx_ge_ary_arz = LLVMBuildExtractElement(builder, arxy_ge_arxy_arzz, + lp_build_const_int32(gallivm, 0), ""); + arx_ge_ary_arz = LLVMBuildICmp(builder, LLVMIntNE, arx_ge_ary_arz, + lp_build_const_int32(gallivm, 0), ""); + ary_ge_arx_arz = LLVMBuildExtractElement(builder, arxy_ge_arxy_arzz, + lp_build_const_int32(gallivm, 1), ""); + ary_ge_arx_arz = LLVMBuildICmp(builder, LLVMIntNE, ary_ge_arx_arz, + lp_build_const_int32(gallivm, 0), ""); + face_s_var = lp_build_alloca(gallivm, bld->coord_bld.vec_type, "face_s_var"); + face_t_var = lp_build_alloca(gallivm, bld->coord_bld.vec_type, "face_t_var"); + face_var = lp_build_alloca(gallivm, bld->int_bld.vec_type, "face_var"); + + lp_build_if(&if_ctx, gallivm, arx_ge_ary_arz); { /* +/- X face */ - LLVMValueRef sign = lp_build_sgn(float_bld, rx); - LLVMValueRef ima = lp_build_cube_ima(coord_bld, s); + LLVMValueRef sign, ima; + rx = LLVMBuildExtractElement(builder, rxyz, + lp_build_const_int32(gallivm, 0), ""); + /* +/- X face */ + sign = lp_build_sgn(float_bld, rx); + ima = lp_build_cube_imaneg(coord_bld, s); *face_s = lp_build_cube_coord(coord_bld, sign, +1, r, ima); *face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima); *face = lp_build_cube_face(bld, rx, @@ -963,11 +1185,14 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld, { struct lp_build_if_state if_ctx2; - lp_build_if(&if_ctx2, bld->gallivm, ary_ge_arx_arz); + lp_build_if(&if_ctx2, gallivm, ary_ge_arx_arz); { + LLVMValueRef sign, ima; /* +/- Y face */ - LLVMValueRef sign = lp_build_sgn(float_bld, ry); - LLVMValueRef ima = lp_build_cube_ima(coord_bld, t); + ry = LLVMBuildExtractElement(builder, rxyz, + lp_build_const_int32(gallivm, 1), ""); + sign = lp_build_sgn(float_bld, ry); + ima = lp_build_cube_imaneg(coord_bld, t); *face_s = lp_build_cube_coord(coord_bld, NULL, -1, s, ima); *face_t = lp_build_cube_coord(coord_bld, sign, -1, r, ima); *face = lp_build_cube_face(bld, ry, @@ -980,8 +1205,11 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld, lp_build_else(&if_ctx2); { /* +/- Z face */ - LLVMValueRef sign = lp_build_sgn(float_bld, rz); - LLVMValueRef ima = lp_build_cube_ima(coord_bld, r); + LLVMValueRef sign, ima; + rz = LLVMBuildExtractElement(builder, rxyz, + lp_build_const_int32(gallivm, 2), ""); + sign = lp_build_sgn(float_bld, rz); + ima = lp_build_cube_imaneg(coord_bld, r); *face_s = lp_build_cube_coord(coord_bld, sign, -1, s, ima); *face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima); *face = lp_build_cube_face(bld, rz, @@ -999,6 +1227,7 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld, *face_s = LLVMBuildLoad(builder, face_s_var, "face_s"); *face_t = LLVMBuildLoad(builder, face_t_var, "face_t"); *face = LLVMBuildLoad(builder, face_var, "face"); + *face = lp_build_broadcast_scalar(&bld->int_coord_bld, *face); } } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h index dad138abee0..0f3d8ae6cb5 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h @@ -52,6 +52,15 @@ 
struct lp_build_context; /** + * Helper struct holding all derivatives needed for sampling + */ +struct lp_derivatives +{ + LLVMValueRef ddx_ddy[2]; +}; + + +/** * Sampler static state. * * These are the bits of state from pipe_resource and pipe_sampler_state that @@ -192,6 +201,9 @@ struct lp_build_sample_context /* See texture_dims() */ unsigned dims; + /** SIMD vector width */ + unsigned vector_width; + /** regular scalar float type */ struct lp_type float_type; struct lp_build_context float_bld; @@ -199,7 +211,7 @@ struct lp_build_sample_context /** float vector type */ struct lp_build_context float_vec_bld; - /** regular scalar float type */ + /** regular scalar int type */ struct lp_type int_type; struct lp_build_context int_bld; @@ -223,10 +235,15 @@ struct lp_build_sample_context struct lp_type texel_type; struct lp_build_context texel_bld; + /** Float per-quad type */ + struct lp_type perquadf_type; + struct lp_build_context perquadf_bld; + + /** Int per-quad type */ + struct lp_type perquadi_type; + struct lp_build_context perquadi_bld; + /* Common dynamic state values */ - LLVMValueRef width; - LLVMValueRef height; - LLVMValueRef depth; LLVMValueRef row_stride_array; LLVMValueRef img_stride_array; LLVMValueRef data_array; @@ -305,8 +322,7 @@ lp_sampler_static_state(struct lp_sampler_static_state *state, void lp_build_lod_selector(struct lp_build_sample_context *bld, unsigned unit, - const LLVMValueRef ddx[4], - const LLVMValueRef ddy[4], + const struct lp_derivatives *derivs, LLVMValueRef lod_bias, /* optional */ LLVMValueRef explicit_lod, /* optional */ unsigned mip_filter, @@ -331,10 +347,6 @@ LLVMValueRef lp_build_get_mipmap_level(struct lp_build_sample_context *bld, LLVMValueRef level); -LLVMValueRef -lp_build_get_const_mipmap_level(struct lp_build_sample_context *bld, - int level); - void lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld, @@ -402,22 +414,35 @@ lp_build_sample_soa(struct gallivm_state *gallivm, unsigned unit, unsigned num_coords, const LLVMValueRef *coords, - const LLVMValueRef *ddx, - const LLVMValueRef *ddy, + const struct lp_derivatives *derivs, LLVMValueRef lod_bias, LLVMValueRef explicit_lod, LLVMValueRef texel_out[4]); + +void +lp_build_coord_repeat_npot_linear(struct lp_build_sample_context *bld, + LLVMValueRef coord_f, + LLVMValueRef length_i, + LLVMValueRef length_f, + LLVMValueRef *coord0_i, + LLVMValueRef *weight_f); + + void lp_build_size_query_soa(struct gallivm_state *gallivm, const struct lp_sampler_static_state *static_state, struct lp_sampler_dynamic_state *dynamic_state, + struct lp_type int_type, unsigned unit, LLVMValueRef explicit_lod, LLVMValueRef *sizes_out); void -lp_build_sample_nop(struct gallivm_state *gallivm, struct lp_type type, +lp_build_sample_nop(struct gallivm_state *gallivm, + struct lp_type type, + unsigned num_coords, + const LLVMValueRef *coords, LLVMValueRef texel_out[4]); diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c index 74858bc9718..ad1b29cf096 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c @@ -27,7 +27,7 @@ /** * @file - * Texture sampling -- SoA. + * Texture sampling -- AoS. 
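+ * (AoS: the r,g,b,a channels of each texel stay packed together in one + * vector, unlike the SoA path which keeps a separate vector per channel.)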
* * @author Jose Fonseca <jfonseca@vmware.com> * @author Brian Paul <brianp@vmware.com> @@ -40,6 +40,7 @@ #include "util/u_memory.h" #include "util/u_math.h" #include "util/u_format.h" +#include "util/u_cpu_detect.h" #include "lp_bld_debug.h" #include "lp_bld_type.h" #include "lp_bld_const.h" @@ -75,6 +76,7 @@ static void lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld, unsigned block_length, LLVMValueRef coord, + LLVMValueRef coord_f, LLVMValueRef length, LLVMValueRef stride, boolean is_pot, @@ -93,10 +95,11 @@ lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld, if(is_pot) coord = LLVMBuildAnd(builder, coord, length_minus_one, ""); else { - /* Add a bias to the texcoord to handle negative coords */ - LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024); - coord = LLVMBuildAdd(builder, coord, bias, ""); - coord = LLVMBuildURem(builder, coord, length, ""); + struct lp_build_context *coord_bld = &bld->coord_bld; + LLVMValueRef length_f = lp_build_int_to_float(coord_bld, length); + coord = lp_build_fract_safe(coord_bld, coord_f); + coord = lp_build_mul(coord_bld, coord, length_f); + coord = lp_build_itrunc(coord_bld, coord); } break; @@ -121,6 +124,56 @@ lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld, /** + * Build LLVM code for texture coord wrapping, for nearest filtering, + * for float texcoords. + * \param coord the incoming texcoord (s,t,r or q) + * \param length the texture size along one dimension + * \param is_pot if TRUE, length is a power of two + * \param wrap_mode one of PIPE_TEX_WRAP_x + * \param icoord the texcoord after wrapping, as int + */ +static void +lp_build_sample_wrap_nearest_float(struct lp_build_sample_context *bld, + LLVMValueRef coord, + LLVMValueRef length, + boolean is_pot, + unsigned wrap_mode, + LLVMValueRef *icoord) +{ + struct lp_build_context *coord_bld = &bld->coord_bld; + LLVMValueRef length_minus_one; + + switch(wrap_mode) { + case PIPE_TEX_WRAP_REPEAT: + /* take fraction, unnormalize */ + coord = lp_build_fract_safe(coord_bld, coord); + coord = lp_build_mul(coord_bld, coord, length); + *icoord = lp_build_itrunc(coord_bld, coord); + break; + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + length_minus_one = lp_build_sub(coord_bld, length, coord_bld->one); + if (bld->static_state->normalized_coords) { + /* scale coord to length */ + coord = lp_build_mul(coord_bld, coord, length); + } + coord = lp_build_clamp(coord_bld, coord, coord_bld->zero, + length_minus_one); + *icoord = lp_build_itrunc(coord_bld, coord); + break; + + case PIPE_TEX_WRAP_CLAMP: + case PIPE_TEX_WRAP_CLAMP_TO_BORDER: + case PIPE_TEX_WRAP_MIRROR_REPEAT: + case PIPE_TEX_WRAP_MIRROR_CLAMP: + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: + default: + assert(0); + } +} + + +/** * Build LLVM code for texture coord wrapping, for linear filtering, * for scaled integer texcoords. * \param block_length is the length of the pixel block along the @@ -139,6 +192,8 @@ static void lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld, unsigned block_length, LLVMValueRef coord0, + LLVMValueRef *weight_i, + LLVMValueRef coord_f, LLVMValueRef length, LLVMValueRef stride, boolean is_pot, @@ -153,58 +208,85 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld, LLVMValueRef length_minus_one; LLVMValueRef lmask, umask, mask; - if (block_length != 1) { - /* - * If the pixel block covers more than one pixel then there is no easy - * way to calculate offset1 relative to offset0. 
Instead, compute them - * independently. - */ - - LLVMValueRef coord1; - - lp_build_sample_wrap_nearest_int(bld, - block_length, - coord0, - length, - stride, - is_pot, - wrap_mode, - offset0, i0); - - coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); + /* + * If the pixel block covers more than one pixel then there is no easy + * way to calculate offset1 relative to offset0. Instead, compute them + * independently. Otherwise, try to compute offset0 and offset1 with + * a single stride multiplication. + */ - lp_build_sample_wrap_nearest_int(bld, - block_length, - coord1, - length, - stride, - is_pot, - wrap_mode, - offset1, i1); + length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one); + if (block_length != 1) { + LLVMValueRef coord1; + switch(wrap_mode) { + case PIPE_TEX_WRAP_REPEAT: + if (is_pot) { + coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); + coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, ""); + coord1 = LLVMBuildAnd(builder, coord1, length_minus_one, ""); + } + else { + LLVMValueRef mask; + LLVMValueRef weight; + LLVMValueRef length_f = lp_build_int_to_float(&bld->coord_bld, length); + lp_build_coord_repeat_npot_linear(bld, coord_f, + length, length_f, + &coord0, &weight); + mask = lp_build_compare(bld->gallivm, int_coord_bld->type, + PIPE_FUNC_NOTEQUAL, coord0, length_minus_one); + coord1 = LLVMBuildAnd(builder, + lp_build_add(int_coord_bld, coord0, + int_coord_bld->one), + mask, ""); + weight = lp_build_mul_imm(&bld->coord_bld, weight, 256); + *weight_i = lp_build_itrunc(&bld->coord_bld, weight); + } + break; + + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); + coord0 = lp_build_clamp(int_coord_bld, coord0, int_coord_bld->zero, + length_minus_one); + coord1 = lp_build_clamp(int_coord_bld, coord1, int_coord_bld->zero, + length_minus_one); + break; + + case PIPE_TEX_WRAP_CLAMP: + case PIPE_TEX_WRAP_CLAMP_TO_BORDER: + case PIPE_TEX_WRAP_MIRROR_REPEAT: + case PIPE_TEX_WRAP_MIRROR_CLAMP: + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: + default: + assert(0); + coord0 = int_coord_bld->zero; + coord1 = int_coord_bld->zero; + break; + } + lp_build_sample_partial_offset(int_coord_bld, block_length, coord0, stride, + offset0, i0); + lp_build_sample_partial_offset(int_coord_bld, block_length, coord1, stride, + offset1, i1); return; } - /* - * Scalar pixels -- try to compute offset0 and offset1 with a single stride - * multiplication. 
- */ - *i0 = int_coord_bld->zero; *i1 = int_coord_bld->zero; - length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one); - switch(wrap_mode) { case PIPE_TEX_WRAP_REPEAT: if (is_pot) { coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, ""); } else { - /* Add a bias to the texcoord to handle negative coords */ - LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024); - coord0 = LLVMBuildAdd(builder, coord0, bias, ""); - coord0 = LLVMBuildURem(builder, coord0, length, ""); + LLVMValueRef weight; + LLVMValueRef length_f = lp_build_int_to_float(&bld->coord_bld, length); + lp_build_coord_repeat_npot_linear(bld, coord_f, + length, length_f, + &coord0, &weight); + weight = lp_build_mul_imm(&bld->coord_bld, weight, 256); + *weight_i = lp_build_itrunc(&bld->coord_bld, weight); } mask = lp_build_compare(bld->gallivm, int_coord_bld->type, @@ -217,6 +299,11 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld, break; case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + /* XXX this might be slower than the separate path + * on some newer cpus. With sse41 this is 8 instructions vs. 7 + * - at least on SNB this is almost certainly slower since + * min/max are cheaper than selects, and the muls aren't bad. + */ lmask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type, PIPE_FUNC_GEQUAL, coord0, int_coord_bld->zero); umask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type, @@ -249,6 +336,176 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld, /** + * Build LLVM code for texture coord wrapping, for linear filtering, + * for float texcoords. + * \param block_length is the length of the pixel block along the + * coordinate axis + * \param coord the incoming texcoord (s,t,r or q) + * \param length the texture size along one dimension + * \param is_pot if TRUE, length is a power of two + * \param wrap_mode one of PIPE_TEX_WRAP_x + * \param coord0 the first texcoord after wrapping, as int + * \param coord1 the second texcoord after wrapping, as int + * \param weight the filter weight as int (0-255) + * \param force_nearest if this coord actually uses nearest filtering + */ +static void +lp_build_sample_wrap_linear_float(struct lp_build_sample_context *bld, + unsigned block_length, + LLVMValueRef coord, + LLVMValueRef length, + boolean is_pot, + unsigned wrap_mode, + LLVMValueRef *coord0, + LLVMValueRef *coord1, + LLVMValueRef *weight, + unsigned force_nearest) +{ + struct lp_build_context *int_coord_bld = &bld->int_coord_bld; + struct lp_build_context *coord_bld = &bld->coord_bld; + LLVMBuilderRef builder = bld->gallivm->builder; + LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5); + LLVMValueRef length_minus_one = lp_build_sub(coord_bld, length, coord_bld->one); + + switch(wrap_mode) { + case PIPE_TEX_WRAP_REPEAT: + if (is_pot) { + /* mul by size and subtract 0.5 */ + coord = lp_build_mul(coord_bld, coord, length); + if (!force_nearest) + coord = lp_build_sub(coord_bld, coord, half); + *coord1 = lp_build_add(coord_bld, coord, coord_bld->one); + /* convert to int, compute lerp weight */ + lp_build_ifloor_fract(coord_bld, coord, coord0, weight); + *coord1 = lp_build_ifloor(coord_bld, *coord1); + /* repeat wrap */ + length_minus_one = lp_build_itrunc(coord_bld, length_minus_one); + *coord0 = LLVMBuildAnd(builder, *coord0, length_minus_one, ""); + *coord1 = LLVMBuildAnd(builder, *coord1, length_minus_one, ""); + } + else { + LLVMValueRef mask; + /* wrap with normalized floats is just fract */ + coord = 
lp_build_fract(coord_bld, coord); + /* unnormalize */ + coord = lp_build_mul(coord_bld, coord, length); + /* + * we avoided the 0.5/length division, have to fix up wrong + * edge cases with selects + */ + *coord1 = lp_build_add(coord_bld, coord, half); + coord = lp_build_sub(coord_bld, coord, half); + *weight = lp_build_fract(coord_bld, coord); + mask = lp_build_compare(coord_bld->gallivm, coord_bld->type, + PIPE_FUNC_LESS, coord, coord_bld->zero); + *coord0 = lp_build_select(coord_bld, mask, length_minus_one, coord); + *coord0 = lp_build_itrunc(coord_bld, *coord0); + mask = lp_build_compare(coord_bld->gallivm, coord_bld->type, + PIPE_FUNC_LESS, *coord1, length); + *coord1 = lp_build_select(coord_bld, mask, *coord1, coord_bld->zero); + *coord1 = lp_build_itrunc(coord_bld, *coord1); + } + break; + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + if (bld->static_state->normalized_coords) { + /* mul by tex size */ + coord = lp_build_mul(coord_bld, coord, length); + } + /* subtract 0.5 */ + if (!force_nearest) { + coord = lp_build_sub(coord_bld, coord, half); + } + /* clamp to [0, length - 1] */ + coord = lp_build_min(coord_bld, coord, length_minus_one); + coord = lp_build_max(coord_bld, coord, coord_bld->zero); + *coord1 = lp_build_add(coord_bld, coord, coord_bld->one); + /* convert to int, compute lerp weight */ + lp_build_ifloor_fract(coord_bld, coord, coord0, weight); + /* coord1 = min(coord1, length-1) */ + *coord1 = lp_build_min(coord_bld, *coord1, length_minus_one); + *coord1 = lp_build_itrunc(coord_bld, *coord1); + break; + default: + assert(0); + *coord0 = int_coord_bld->zero; + *coord1 = int_coord_bld->zero; + *weight = coord_bld->zero; + break; + } + *weight = lp_build_mul_imm(coord_bld, *weight, 256); + *weight = lp_build_itrunc(coord_bld, *weight); + return; +} + + +/** + * Fetch texels for image with nearest sampling. + * Return filtered color as two vectors of 16-bit fixed point values. + */ +static void +lp_build_sample_fetch_image_nearest(struct lp_build_sample_context *bld, + LLVMValueRef data_ptr, + LLVMValueRef offset, + LLVMValueRef x_subcoord, + LLVMValueRef y_subcoord, + LLVMValueRef *colors_lo, + LLVMValueRef *colors_hi) +{ + /* + * Fetch the pixels as 4 x 32bit (rgba order might differ): + * + * rgba0 rgba1 rgba2 rgba3 + * + * bit cast them into 16 x u8 + * + * r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3 + * + * unpack them into two 8 x i16: + * + * r0 g0 b0 a0 r1 g1 b1 a1 + * r2 g2 b2 a2 r3 g3 b3 a3 + * + * The higher 8 bits of the resulting elements will be zero. + */ + LLVMBuilderRef builder = bld->gallivm->builder; + LLVMValueRef rgba8; + struct lp_build_context h16, u8n; + LLVMTypeRef u8n_vec_type; + + lp_build_context_init(&h16, bld->gallivm, lp_type_ufixed(16, bld->vector_width)); + lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8, bld->vector_width)); + u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type); + + if (util_format_is_rgba8_variant(bld->format_desc)) { + /* + * Given the format is a rgba8, just read the pixels as is, + * without any swizzling. Swizzling will be done later. 
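+ * (The gather fetches one 32-bit texel per element; the result is then + * bitcast to a u8 vector so it can be unpacked to 16 bits per channel.)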
+ */ + rgba8 = lp_build_gather(bld->gallivm, + bld->texel_type.length, + bld->format_desc->block.bits, + bld->texel_type.width, + data_ptr, offset); + + rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, ""); + } + else { + rgba8 = lp_build_fetch_rgba_aos(bld->gallivm, + bld->format_desc, + u8n.type, + data_ptr, offset, + x_subcoord, + y_subcoord); + } + + /* Expand one 4*rgba8 to two 2*rgba16 */ + lp_build_unpack2(bld->gallivm, u8n.type, h16.type, + rgba8, + colors_lo, colors_hi); +} + + +/** * Sample a single texture image with nearest sampling. * If sampling a cube texture, r = cube face in [0,5]. * Return filtered color as two vectors of 16-bit fixed point values. @@ -267,21 +524,19 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld, { const unsigned dims = bld->dims; LLVMBuilderRef builder = bld->gallivm->builder; - struct lp_build_context i32, h16, u8n; - LLVMTypeRef i32_vec_type, u8n_vec_type; + struct lp_build_context i32; + LLVMTypeRef i32_vec_type; LLVMValueRef i32_c8; LLVMValueRef width_vec, height_vec, depth_vec; LLVMValueRef s_ipart, t_ipart = NULL, r_ipart = NULL; + LLVMValueRef s_float, t_float = NULL, r_float = NULL; LLVMValueRef x_stride; LLVMValueRef x_offset, offset; LLVMValueRef x_subcoord, y_subcoord, z_subcoord; - lp_build_context_init(&i32, bld->gallivm, lp_type_int_vec(32)); - lp_build_context_init(&h16, bld->gallivm, lp_type_ufixed(16)); - lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8)); + lp_build_context_init(&i32, bld->gallivm, lp_type_int_vec(32, bld->vector_width)); i32_vec_type = lp_build_vec_type(bld->gallivm, i32.type); - u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type); lp_build_extract_image_sizes(bld, bld->int_size_type, @@ -291,6 +546,8 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld, &height_vec, &depth_vec); + s_float = s; t_float = t; r_float = r; + if (bld->static_state->normalized_coords) { LLVMValueRef scaled_size; LLVMValueRef flt_size; @@ -334,7 +591,8 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld, /* Do texcoord wrapping, compute texel offset */ lp_build_sample_wrap_nearest_int(bld, bld->format_desc->block.width, - s_ipart, width_vec, x_stride, + s_ipart, s_float, + width_vec, x_stride, bld->static_state->pot_width, bld->static_state->wrap_s, &x_offset, &x_subcoord); @@ -343,7 +601,8 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld, LLVMValueRef y_offset; lp_build_sample_wrap_nearest_int(bld, bld->format_desc->block.height, - t_ipart, height_vec, row_stride_vec, + t_ipart, t_float, + height_vec, row_stride_vec, bld->static_state->pot_height, bld->static_state->wrap_t, &y_offset, &y_subcoord); @@ -352,7 +611,8 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld, LLVMValueRef z_offset; lp_build_sample_wrap_nearest_int(bld, 1, /* block length (depth) */ - r_ipart, depth_vec, img_stride_vec, + r_ipart, r_float, + depth_vec, img_stride_vec, bld->static_state->pot_depth, bld->static_state->wrap_r, &z_offset, &z_subcoord); @@ -366,6 +626,196 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld, } } + lp_build_sample_fetch_image_nearest(bld, data_ptr, offset, + x_subcoord, y_subcoord, + colors_lo, colors_hi); +} + + +/** + * Sample a single texture image with nearest sampling. + * If sampling a cube texture, r = cube face in [0,5]. + * Return filtered color as two vectors of 16-bit fixed point values. + * Does address calcs (except offsets) with floats. 
+ * Useful for AVX which has support for 8x32 floats but not 8x32 ints. + */ +static void +lp_build_sample_image_nearest_afloat(struct lp_build_sample_context *bld, + LLVMValueRef int_size, + LLVMValueRef row_stride_vec, + LLVMValueRef img_stride_vec, + LLVMValueRef data_ptr, + LLVMValueRef s, + LLVMValueRef t, + LLVMValueRef r, + LLVMValueRef *colors_lo, + LLVMValueRef *colors_hi) + { + const unsigned dims = bld->dims; + LLVMValueRef width_vec, height_vec, depth_vec; + LLVMValueRef offset; + LLVMValueRef x_subcoord, y_subcoord; + LLVMValueRef x_icoord, y_icoord, z_icoord; + LLVMValueRef flt_size; + + flt_size = lp_build_int_to_float(&bld->float_size_bld, int_size); + + lp_build_extract_image_sizes(bld, + bld->float_size_type, + bld->coord_type, + flt_size, + &width_vec, + &height_vec, + &depth_vec); + + /* Do texcoord wrapping */ + lp_build_sample_wrap_nearest_float(bld, + s, width_vec, + bld->static_state->pot_width, + bld->static_state->wrap_s, + &x_icoord); + + if (dims >= 2) { + lp_build_sample_wrap_nearest_float(bld, + t, height_vec, + bld->static_state->pot_height, + bld->static_state->wrap_t, + &y_icoord); + + if (dims >= 3) { + lp_build_sample_wrap_nearest_float(bld, + r, depth_vec, + bld->static_state->pot_depth, + bld->static_state->wrap_r, + &z_icoord); + } + else if (bld->static_state->target == PIPE_TEXTURE_CUBE) { + z_icoord = r; + } + } + + /* + * From here on we deal with ints, and we should split up the 256bit + * vectors manually for better generated code. + */ + + /* + * compute texel offsets - + * cannot do offset calc with floats, difficult for block-based formats, + * and not enough precision anyway. + */ + lp_build_sample_offset(&bld->int_coord_bld, + bld->format_desc, + x_icoord, y_icoord, + z_icoord, + row_stride_vec, img_stride_vec, + &offset, + &x_subcoord, &y_subcoord); + + lp_build_sample_fetch_image_nearest(bld, data_ptr, offset, + x_subcoord, y_subcoord, + colors_lo, colors_hi); +} + + +/** + * Fetch texels for image with linear sampling. + * Return filtered color as two vectors of 16-bit fixed point values. 
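+ * The filter weights are expected in 8.8 fixed point (only the low 8 bits + * are used), so each 1-D lerp step below is, per channel, roughly + * v0 + ((weight * (v1 - v0)) >> 8) - a sketch of the intent; the exact + * instruction sequence is whatever lp_build_lerp emits for the h16 type.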
+ */ +static void +lp_build_sample_fetch_image_linear(struct lp_build_sample_context *bld, + LLVMValueRef data_ptr, + LLVMValueRef offset[2][2][2], + LLVMValueRef x_subcoord[2], + LLVMValueRef y_subcoord[2], + LLVMValueRef s_fpart, + LLVMValueRef t_fpart, + LLVMValueRef r_fpart, + LLVMValueRef *colors_lo, + LLVMValueRef *colors_hi) +{ + const unsigned dims = bld->dims; + LLVMBuilderRef builder = bld->gallivm->builder; + struct lp_build_context h16, u8n; + LLVMTypeRef h16_vec_type, u8n_vec_type; + LLVMTypeRef elem_type = LLVMInt32TypeInContext(bld->gallivm->context); + LLVMValueRef shuffles_lo[LP_MAX_VECTOR_LENGTH]; + LLVMValueRef shuffles_hi[LP_MAX_VECTOR_LENGTH]; + LLVMValueRef shuffle_lo, shuffle_hi; + LLVMValueRef s_fpart_lo, s_fpart_hi; + LLVMValueRef t_fpart_lo = NULL, t_fpart_hi = NULL; + LLVMValueRef r_fpart_lo = NULL, r_fpart_hi = NULL; + LLVMValueRef neighbors_lo[2][2][2]; /* [z][y][x] */ + LLVMValueRef neighbors_hi[2][2][2]; /* [z][y][x] */ + LLVMValueRef packed_lo, packed_hi; + unsigned i, j, k; + unsigned numj, numk; + + lp_build_context_init(&h16, bld->gallivm, lp_type_ufixed(16, bld->vector_width)); + lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8, bld->vector_width)); + h16_vec_type = lp_build_vec_type(bld->gallivm, h16.type); + u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type); + + /* + * Transform 4 x i32 in + * + * s_fpart = {s0, s1, s2, s3} + * + * into 8 x i16 + * + * s_fpart = {00, s0, 00, s1, 00, s2, 00, s3} + * + * into two 8 x i16 + * + * s_fpart_lo = {s0, s0, s0, s0, s1, s1, s1, s1} + * s_fpart_hi = {s2, s2, s2, s2, s3, s3, s3, s3} + * + * and likewise for t_fpart. There is no risk of losing precision here + * since the fractional parts only use the lower 8 bits. + */ + s_fpart = LLVMBuildBitCast(builder, s_fpart, h16_vec_type, ""); + if (dims >= 2) + t_fpart = LLVMBuildBitCast(builder, t_fpart, h16_vec_type, ""); + if (dims >= 3) + r_fpart = LLVMBuildBitCast(builder, r_fpart, h16_vec_type, ""); + + for (j = 0; j < h16.type.length; j += 4) { +#ifdef PIPE_ARCH_LITTLE_ENDIAN + unsigned subindex = 0; +#else + unsigned subindex = 1; +#endif + LLVMValueRef index; + + index = LLVMConstInt(elem_type, j/2 + subindex, 0); + for (i = 0; i < 4; ++i) + shuffles_lo[j + i] = index; + + index = LLVMConstInt(elem_type, h16.type.length/2 + j/2 + subindex, 0); + for (i = 0; i < 4; ++i) + shuffles_hi[j + i] = index; + } + + shuffle_lo = LLVMConstVector(shuffles_lo, h16.type.length); + shuffle_hi = LLVMConstVector(shuffles_hi, h16.type.length); + + s_fpart_lo = LLVMBuildShuffleVector(builder, s_fpart, h16.undef, + shuffle_lo, ""); + s_fpart_hi = LLVMBuildShuffleVector(builder, s_fpart, h16.undef, + shuffle_hi, ""); + if (dims >= 2) { + t_fpart_lo = LLVMBuildShuffleVector(builder, t_fpart, h16.undef, + shuffle_lo, ""); + t_fpart_hi = LLVMBuildShuffleVector(builder, t_fpart, h16.undef, + shuffle_hi, ""); + } + if (dims >= 3) { + r_fpart_lo = LLVMBuildShuffleVector(builder, r_fpart, h16.undef, + shuffle_lo, ""); + r_fpart_hi = LLVMBuildShuffleVector(builder, r_fpart, h16.undef, + shuffle_hi, ""); + } + /* * Fetch the pixels as 4 x 32bit (rgba order might differ): * * rgba0 rgba1 rgba2 rgba3 * * bit cast them into 16 x u8 * * r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3 * * unpack them into two 8 x i16: * * r0 g0 b0 a0 r1 g1 b1 a1 * r2 g2 b2 a2 r3 g3 b3 a3 * * The higher 8 bits of the resulting elements will be zero. */ - { - LLVMValueRef rgba8; + numj = 1 + (dims >= 2); + numk = 1 + (dims >= 3); - if (util_format_is_rgba8_variant(bld->format_desc)) { - /* - * Given the format is a rgba8, just read the pixels as is, - * without any swizzling. Swizzling will be done later. 
- */ - rgba8 = lp_build_gather(bld->gallivm, - bld->texel_type.length, - bld->format_desc->block.bits, - bld->texel_type.width, - data_ptr, offset); + for (k = 0; k < numk; k++) { + for (j = 0; j < numj; j++) { + for (i = 0; i < 2; i++) { + LLVMValueRef rgba8; + + if (util_format_is_rgba8_variant(bld->format_desc)) { + /* + * Given the format is a rgba8, just read the pixels as is, + * without any swizzling. Swizzling will be done later. + */ + rgba8 = lp_build_gather(bld->gallivm, + bld->texel_type.length, + bld->format_desc->block.bits, + bld->texel_type.width, + data_ptr, offset[k][j][i]); + + rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, ""); + } + else { + rgba8 = lp_build_fetch_rgba_aos(bld->gallivm, + bld->format_desc, + u8n.type, + data_ptr, offset[k][j][i], + x_subcoord[i], + y_subcoord[j]); + } - rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, ""); + /* Expand one 4*rgba8 to two 2*rgba16 */ + lp_build_unpack2(bld->gallivm, u8n.type, h16.type, + rgba8, + &neighbors_lo[k][j][i], &neighbors_hi[k][j][i]); + } } - else { - rgba8 = lp_build_fetch_rgba_aos(bld->gallivm, - bld->format_desc, - u8n.type, - data_ptr, offset, - x_subcoord, - y_subcoord); + } + + /* + * Linear interpolation with 8.8 fixed point. + */ + if (bld->static_state->force_nearest_s) { + /* special case 1-D lerp */ + packed_lo = lp_build_lerp(&h16, + t_fpart_lo, + neighbors_lo[0][0][0], + neighbors_lo[0][0][1]); + + packed_hi = lp_build_lerp(&h16, + t_fpart_hi, + neighbors_hi[0][0][0], + neighbors_hi[0][0][1]); + } + else if (bld->static_state->force_nearest_t) { + /* special case 1-D lerp */ + packed_lo = lp_build_lerp(&h16, + s_fpart_lo, + neighbors_lo[0][0][0], + neighbors_lo[0][0][1]); + + packed_hi = lp_build_lerp(&h16, + s_fpart_hi, + neighbors_hi[0][0][0], + neighbors_hi[0][0][1]); + } + else { + /* general 1/2/3-D lerping */ + if (dims == 1) { + packed_lo = lp_build_lerp(&h16, + s_fpart_lo, + neighbors_lo[0][0][0], + neighbors_lo[0][0][1]); + + packed_hi = lp_build_lerp(&h16, + s_fpart_hi, + neighbors_hi[0][0][0], + neighbors_hi[0][0][1]); } + else { + /* 2-D lerp */ + packed_lo = lp_build_lerp_2d(&h16, + s_fpart_lo, t_fpart_lo, + neighbors_lo[0][0][0], + neighbors_lo[0][0][1], + neighbors_lo[0][1][0], + neighbors_lo[0][1][1]); + + packed_hi = lp_build_lerp_2d(&h16, + s_fpart_hi, t_fpart_hi, + neighbors_hi[0][0][0], + neighbors_hi[0][0][1], + neighbors_hi[0][1][0], + neighbors_hi[0][1][1]); + + if (dims >= 3) { + LLVMValueRef packed_lo2, packed_hi2; + + /* lerp in the second z slice */ + packed_lo2 = lp_build_lerp_2d(&h16, + s_fpart_lo, t_fpart_lo, + neighbors_lo[1][0][0], + neighbors_lo[1][0][1], + neighbors_lo[1][1][0], + neighbors_lo[1][1][1]); - /* Expand one 4*rgba8 to two 2*rgba16 */ - lp_build_unpack2(bld->gallivm, u8n.type, h16.type, - rgba8, - colors_lo, colors_hi); + packed_hi2 = lp_build_lerp_2d(&h16, + s_fpart_hi, t_fpart_hi, + neighbors_hi[1][0][0], + neighbors_hi[1][0][1], + neighbors_hi[1][1][0], + neighbors_hi[1][1][1]); + /* interp between two z slices */ + packed_lo = lp_build_lerp(&h16, r_fpart_lo, + packed_lo, packed_lo2); + packed_hi = lp_build_lerp(&h16, r_fpart_hi, + packed_hi, packed_hi2); + } + } } -} + *colors_lo = packed_lo; + *colors_hi = packed_hi; +} /** * Sample a single texture image with (bi-)(tri-)linear sampling. 
@@ -433,33 +974,24 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, { const unsigned dims = bld->dims; LLVMBuilderRef builder = bld->gallivm->builder; - struct lp_build_context i32, h16, u8n; - LLVMTypeRef i32_vec_type, h16_vec_type, u8n_vec_type; + struct lp_build_context i32; + LLVMTypeRef i32_vec_type; LLVMValueRef i32_c8, i32_c128, i32_c255; LLVMValueRef width_vec, height_vec, depth_vec; - LLVMValueRef s_ipart, s_fpart, s_fpart_lo, s_fpart_hi; - LLVMValueRef t_ipart = NULL, t_fpart = NULL, t_fpart_lo = NULL, t_fpart_hi = NULL; - LLVMValueRef r_ipart = NULL, r_fpart = NULL, r_fpart_lo = NULL, r_fpart_hi = NULL; + LLVMValueRef s_ipart, s_fpart, s_float; + LLVMValueRef t_ipart = NULL, t_fpart = NULL, t_float = NULL; + LLVMValueRef r_ipart = NULL, r_fpart = NULL, r_float = NULL; LLVMValueRef x_stride, y_stride, z_stride; LLVMValueRef x_offset0, x_offset1; LLVMValueRef y_offset0, y_offset1; LLVMValueRef z_offset0, z_offset1; LLVMValueRef offset[2][2][2]; /* [z][y][x] */ LLVMValueRef x_subcoord[2], y_subcoord[2], z_subcoord[2]; - LLVMValueRef neighbors_lo[2][2][2]; /* [z][y][x] */ - LLVMValueRef neighbors_hi[2][2][2]; /* [z][y][x] */ - LLVMValueRef packed_lo, packed_hi; unsigned x, y, z; - unsigned i, j, k; - unsigned numj, numk; - lp_build_context_init(&i32, bld->gallivm, lp_type_int_vec(32)); - lp_build_context_init(&h16, bld->gallivm, lp_type_ufixed(16)); - lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8)); + lp_build_context_init(&i32, bld->gallivm, lp_type_int_vec(32, bld->vector_width)); i32_vec_type = lp_build_vec_type(bld->gallivm, i32.type); - h16_vec_type = lp_build_vec_type(bld->gallivm, h16.type); - u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type); lp_build_extract_image_sizes(bld, bld->int_size_type, @@ -469,6 +1001,8 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, &height_vec, &depth_vec); + s_float = s; t_float = t; r_float = r; + if (bld->static_state->normalized_coords) { LLVMValueRef scaled_size; LLVMValueRef flt_size; @@ -533,7 +1067,8 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, /* do texcoord wrapping and compute texel offsets */ lp_build_sample_wrap_linear_int(bld, bld->format_desc->block.width, - s_ipart, width_vec, x_stride, + s_ipart, &s_fpart, s_float, + width_vec, x_stride, bld->static_state->pot_width, bld->static_state->wrap_s, &x_offset0, &x_offset1, @@ -548,7 +1083,8 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, if (dims >= 2) { lp_build_sample_wrap_linear_int(bld, bld->format_desc->block.height, - t_ipart, height_vec, y_stride, + t_ipart, &t_fpart, t_float, + height_vec, y_stride, bld->static_state->pot_height, bld->static_state->wrap_t, &y_offset0, &y_offset1, @@ -567,7 +1103,8 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, if (dims >= 3) { lp_build_sample_wrap_linear_int(bld, bld->format_desc->block.height, - r_ipart, depth_vec, z_stride, + r_ipart, &r_fpart, r_float, + depth_vec, z_stride, bld->static_state->pot_depth, bld->static_state->wrap_r, &z_offset0, &z_offset1, @@ -593,212 +1130,175 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, } } - /* - * Transform 4 x i32 in - * - * s_fpart = {s0, s1, s2, s3} - * - * into 8 x i16 - * - * s_fpart = {00, s0, 00, s1, 00, s2, 00, s3} - * - * into two 8 x i16 - * - * s_fpart_lo = {s0, s0, s0, s0, s1, s1, s1, s1} - * s_fpart_hi = {s2, s2, s2, s2, s3, s3, s3, s3} - * - * and likewise for t_fpart. 
There is no risk of loosing precision here - * since the fractional parts only use the lower 8bits. - */ - s_fpart = LLVMBuildBitCast(builder, s_fpart, h16_vec_type, ""); - if (dims >= 2) - t_fpart = LLVMBuildBitCast(builder, t_fpart, h16_vec_type, ""); - if (dims >= 3) - r_fpart = LLVMBuildBitCast(builder, r_fpart, h16_vec_type, ""); + lp_build_sample_fetch_image_linear(bld, data_ptr, offset, + x_subcoord, y_subcoord, + s_fpart, t_fpart, r_fpart, + colors_lo, colors_hi); +} - { - LLVMTypeRef elem_type = LLVMInt32TypeInContext(bld->gallivm->context); - LLVMValueRef shuffles_lo[LP_MAX_VECTOR_LENGTH]; - LLVMValueRef shuffles_hi[LP_MAX_VECTOR_LENGTH]; - LLVMValueRef shuffle_lo; - LLVMValueRef shuffle_hi; - for (j = 0; j < h16.type.length; j += 4) { -#ifdef PIPE_ARCH_LITTLE_ENDIAN - unsigned subindex = 0; -#else - unsigned subindex = 1; -#endif - LLVMValueRef index; +/** + * Sample a single texture image with (bi-)(tri-)linear sampling. + * Return filtered color as two vectors of 16-bit fixed point values. + * Does address calcs (except offsets) with floats. + * Useful for AVX which has support for 8x32 floats but not 8x32 ints. + */ +static void +lp_build_sample_image_linear_afloat(struct lp_build_sample_context *bld, + LLVMValueRef int_size, + LLVMValueRef row_stride_vec, + LLVMValueRef img_stride_vec, + LLVMValueRef data_ptr, + LLVMValueRef s, + LLVMValueRef t, + LLVMValueRef r, + LLVMValueRef *colors_lo, + LLVMValueRef *colors_hi) +{ + const unsigned dims = bld->dims; + LLVMValueRef width_vec, height_vec, depth_vec; + LLVMValueRef s_fpart; + LLVMValueRef t_fpart = NULL; + LLVMValueRef r_fpart = NULL; + LLVMValueRef x_stride, y_stride, z_stride; + LLVMValueRef x_offset0, x_offset1; + LLVMValueRef y_offset0, y_offset1; + LLVMValueRef z_offset0, z_offset1; + LLVMValueRef offset[2][2][2]; /* [z][y][x] */ + LLVMValueRef x_subcoord[2], y_subcoord[2]; + LLVMValueRef flt_size; + LLVMValueRef x_icoord0, x_icoord1; + LLVMValueRef y_icoord0, y_icoord1; + LLVMValueRef z_icoord0, z_icoord1; + unsigned x, y, z; - index = LLVMConstInt(elem_type, j/2 + subindex, 0); - for (i = 0; i < 4; ++i) - shuffles_lo[j + i] = index; + flt_size = lp_build_int_to_float(&bld->float_size_bld, int_size); - index = LLVMConstInt(elem_type, h16.type.length/2 + j/2 + subindex, 0); - for (i = 0; i < 4; ++i) - shuffles_hi[j + i] = index; - } + lp_build_extract_image_sizes(bld, + bld->float_size_type, + bld->coord_type, + flt_size, + &width_vec, + &height_vec, + &depth_vec); - shuffle_lo = LLVMConstVector(shuffles_lo, h16.type.length); - shuffle_hi = LLVMConstVector(shuffles_hi, h16.type.length); + /* do texcoord wrapping and compute texel offsets */ + lp_build_sample_wrap_linear_float(bld, + bld->format_desc->block.width, + s, width_vec, + bld->static_state->pot_width, + bld->static_state->wrap_s, + &x_icoord0, &x_icoord1, + &s_fpart, + bld->static_state->force_nearest_s); + + if (dims >= 2) { + lp_build_sample_wrap_linear_float(bld, + bld->format_desc->block.height, + t, height_vec, + bld->static_state->pot_height, + bld->static_state->wrap_t, + &y_icoord0, &y_icoord1, + &t_fpart, + bld->static_state->force_nearest_t); - s_fpart_lo = LLVMBuildShuffleVector(builder, s_fpart, h16.undef, - shuffle_lo, ""); - s_fpart_hi = LLVMBuildShuffleVector(builder, s_fpart, h16.undef, - shuffle_hi, ""); - if (dims >= 2) { - t_fpart_lo = LLVMBuildShuffleVector(builder, t_fpart, h16.undef, - shuffle_lo, ""); - t_fpart_hi = LLVMBuildShuffleVector(builder, t_fpart, h16.undef, - shuffle_hi, ""); - } if (dims >= 3) { - r_fpart_lo = 
LLVMBuildShuffleVector(builder, r_fpart, h16.undef, - shuffle_lo, ""); - r_fpart_hi = LLVMBuildShuffleVector(builder, r_fpart, h16.undef, - shuffle_hi, ""); + lp_build_sample_wrap_linear_float(bld, + bld->format_desc->block.height, + r, depth_vec, + bld->static_state->pot_depth, + bld->static_state->wrap_r, + &z_icoord0, &z_icoord1, + &r_fpart, 0); } } /* - * Fetch the pixels as 4 x 32bit (rgba order might differ): - * - * rgba0 rgba1 rgba2 rgba3 - * - * bit cast them into 16 x u8 - * - * r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3 - * - * unpack them into two 8 x i16: - * - * r0 g0 b0 a0 r1 g1 b1 a1 - * r2 g2 b2 a2 r3 g3 b3 a3 - * - * The higher 8 bits of the resulting elements will be zero. + * From here on we deal with ints, and we should split up the 256bit + * vectors manually for better generated code. */ - numj = 1 + (dims >= 2); - numk = 1 + (dims >= 3); - for (k = 0; k < numk; k++) { - for (j = 0; j < numj; j++) { - for (i = 0; i < 2; i++) { - LLVMValueRef rgba8; - - if (util_format_is_rgba8_variant(bld->format_desc)) { - /* - * Given the format is a rgba8, just read the pixels as is, - * without any swizzling. Swizzling will be done later. - */ - rgba8 = lp_build_gather(bld->gallivm, - bld->texel_type.length, - bld->format_desc->block.bits, - bld->texel_type.width, - data_ptr, offset[k][j][i]); - - rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, ""); - } - else { - rgba8 = lp_build_fetch_rgba_aos(bld->gallivm, - bld->format_desc, - u8n.type, - data_ptr, offset[k][j][i], - x_subcoord[i], - y_subcoord[j]); - } - - /* Expand one 4*rgba8 to two 2*rgba16 */ - lp_build_unpack2(bld->gallivm, u8n.type, h16.type, - rgba8, - &neighbors_lo[k][j][i], &neighbors_hi[k][j][i]); - } - } - } + /* get pixel, row and image strides */ + x_stride = lp_build_const_vec(bld->gallivm, + bld->int_coord_bld.type, + bld->format_desc->block.bits/8); + y_stride = row_stride_vec; + z_stride = img_stride_vec; /* - * Linear interpolation with 8.8 fixed point. + * compute texel offset - + * cannot do offset calc with floats, difficult for block-based formats, + * and not enough precision anyway. 
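+ * (a single precision float carries only 24 significand bits, while byte + * offsets into large mipmapped textures may need the full 32 bits)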
*/ - if (bld->static_state->force_nearest_s) { - /* special case 1-D lerp */ - packed_lo = lp_build_lerp(&h16, - t_fpart_lo, - neighbors_lo[0][0][0], - neighbors_lo[0][0][1]); - - packed_hi = lp_build_lerp(&h16, - t_fpart_hi, - neighbors_hi[0][1][0], - neighbors_hi[0][1][0]); + lp_build_sample_partial_offset(&bld->int_coord_bld, + bld->format_desc->block.width, + x_icoord0, x_stride, + &x_offset0, &x_subcoord[0]); + lp_build_sample_partial_offset(&bld->int_coord_bld, + bld->format_desc->block.width, + x_icoord1, x_stride, + &x_offset1, &x_subcoord[1]); + for (z = 0; z < 2; z++) { + for (y = 0; y < 2; y++) { + offset[z][y][0] = x_offset0; + offset[z][y][1] = x_offset1; + } } - else if (bld->static_state->force_nearest_t) { - /* special case 1-D lerp */ - packed_lo = lp_build_lerp(&h16, - s_fpart_lo, - neighbors_lo[0][0][0], - neighbors_lo[0][0][1]); - packed_hi = lp_build_lerp(&h16, - s_fpart_hi, - neighbors_hi[0][0][0], - neighbors_hi[0][0][1]); + if (dims >= 2) { + lp_build_sample_partial_offset(&bld->int_coord_bld, + bld->format_desc->block.height, + y_icoord0, y_stride, + &y_offset0, &y_subcoord[0]); + lp_build_sample_partial_offset(&bld->int_coord_bld, + bld->format_desc->block.height, + y_icoord1, y_stride, + &y_offset1, &y_subcoord[1]); + for (z = 0; z < 2; z++) { + for (x = 0; x < 2; x++) { + offset[z][0][x] = lp_build_add(&bld->int_coord_bld, + offset[z][0][x], y_offset0); + offset[z][1][x] = lp_build_add(&bld->int_coord_bld, + offset[z][1][x], y_offset1); + } + } } - else { - /* general 1/2/3-D lerping */ - if (dims == 1) { - packed_lo = lp_build_lerp(&h16, - s_fpart_lo, - neighbors_lo[0][0][0], - neighbors_lo[0][0][1]); - packed_hi = lp_build_lerp(&h16, - s_fpart_hi, - neighbors_hi[0][0][0], - neighbors_hi[0][0][1]); + if (dims >= 3) { + LLVMValueRef z_subcoord[2]; + lp_build_sample_partial_offset(&bld->int_coord_bld, + 1, + z_icoord0, z_stride, + &z_offset0, &z_subcoord[0]); + lp_build_sample_partial_offset(&bld->int_coord_bld, + 1, + z_icoord1, z_stride, + &z_offset1, &z_subcoord[1]); + for (y = 0; y < 2; y++) { + for (x = 0; x < 2; x++) { + offset[0][y][x] = lp_build_add(&bld->int_coord_bld, + offset[0][y][x], z_offset0); + offset[1][y][x] = lp_build_add(&bld->int_coord_bld, + offset[1][y][x], z_offset1); + } } - else { - /* 2-D lerp */ - packed_lo = lp_build_lerp_2d(&h16, - s_fpart_lo, t_fpart_lo, - neighbors_lo[0][0][0], - neighbors_lo[0][0][1], - neighbors_lo[0][1][0], - neighbors_lo[0][1][1]); - - packed_hi = lp_build_lerp_2d(&h16, - s_fpart_hi, t_fpart_hi, - neighbors_hi[0][0][0], - neighbors_hi[0][0][1], - neighbors_hi[0][1][0], - neighbors_hi[0][1][1]); - - if (dims >= 3) { - LLVMValueRef packed_lo2, packed_hi2; - - /* lerp in the second z slice */ - packed_lo2 = lp_build_lerp_2d(&h16, - s_fpart_lo, t_fpart_lo, - neighbors_lo[1][0][0], - neighbors_lo[1][0][1], - neighbors_lo[1][1][0], - neighbors_lo[1][1][1]); - - packed_hi2 = lp_build_lerp_2d(&h16, - s_fpart_hi, t_fpart_hi, - neighbors_hi[1][0][0], - neighbors_hi[1][0][1], - neighbors_hi[1][1][0], - neighbors_hi[1][1][1]); - /* interp between two z slices */ - packed_lo = lp_build_lerp(&h16, r_fpart_lo, - packed_lo, packed_lo2); - packed_hi = lp_build_lerp(&h16, r_fpart_hi, - packed_hi, packed_hi2); + } + else if (bld->static_state->target == PIPE_TEXTURE_CUBE) { + LLVMValueRef z_offset; + z_offset = lp_build_mul(&bld->int_coord_bld, r, img_stride_vec); + for (y = 0; y < 2; y++) { + for (x = 0; x < 2; x++) { + /* The r coord is the cube face in [0,5] */ + offset[0][y][x] = lp_build_add(&bld->int_coord_bld, + 
offset[0][y][x], z_offset); } } } - *colors_lo = packed_lo; - *colors_hi = packed_hi; + lp_build_sample_fetch_image_linear(bld, data_ptr, offset, + x_subcoord, y_subcoord, + s_fpart, t_fpart, r_fpart, + colors_lo, colors_hi); } @@ -824,10 +1324,10 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld, LLVMBuilderRef builder = bld->gallivm->builder; LLVMValueRef size0; LLVMValueRef size1; - LLVMValueRef row_stride0_vec; - LLVMValueRef row_stride1_vec; - LLVMValueRef img_stride0_vec; - LLVMValueRef img_stride1_vec; + LLVMValueRef row_stride0_vec = NULL; + LLVMValueRef row_stride1_vec = NULL; + LLVMValueRef img_stride0_vec = NULL; + LLVMValueRef img_stride1_vec = NULL; LLVMValueRef data_ptr0; LLVMValueRef data_ptr1; LLVMValueRef colors0_lo, colors0_hi; @@ -838,20 +1338,39 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld, &size0, &row_stride0_vec, &img_stride0_vec); data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0); - if (img_filter == PIPE_TEX_FILTER_NEAREST) { - lp_build_sample_image_nearest(bld, - size0, - row_stride0_vec, img_stride0_vec, - data_ptr0, s, t, r, - &colors0_lo, &colors0_hi); + if (util_cpu_caps.has_avx && bld->coord_type.length > 4) { + if (img_filter == PIPE_TEX_FILTER_NEAREST) { + lp_build_sample_image_nearest_afloat(bld, + size0, + row_stride0_vec, img_stride0_vec, + data_ptr0, s, t, r, + &colors0_lo, &colors0_hi); + } + else { + assert(img_filter == PIPE_TEX_FILTER_LINEAR); + lp_build_sample_image_linear_afloat(bld, + size0, + row_stride0_vec, img_stride0_vec, + data_ptr0, s, t, r, + &colors0_lo, &colors0_hi); + } } else { - assert(img_filter == PIPE_TEX_FILTER_LINEAR); - lp_build_sample_image_linear(bld, - size0, - row_stride0_vec, img_stride0_vec, - data_ptr0, s, t, r, - &colors0_lo, &colors0_hi); + if (img_filter == PIPE_TEX_FILTER_NEAREST) { + lp_build_sample_image_nearest(bld, + size0, + row_stride0_vec, img_stride0_vec, + data_ptr0, s, t, r, + &colors0_lo, &colors0_hi); + } + else { + assert(img_filter == PIPE_TEX_FILTER_LINEAR); + lp_build_sample_image_linear(bld, + size0, + row_stride0_vec, img_stride0_vec, + data_ptr0, s, t, r, + &colors0_lo, &colors0_hi); + } } /* Store the first level's colors in the output variables */ @@ -859,74 +1378,138 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld, LLVMBuildStore(builder, colors0_hi, colors_hi_var); if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { - LLVMValueRef h16_scale = lp_build_const_float(bld->gallivm, 256.0); - LLVMTypeRef i32_type = LLVMIntTypeInContext(bld->gallivm->context, 32); + LLVMValueRef h16vec_scale = lp_build_const_vec(bld->gallivm, + bld->perquadf_bld.type, 256.0); + LLVMTypeRef i32vec_type = lp_build_vec_type(bld->gallivm, bld->perquadi_bld.type); struct lp_build_if_state if_ctx; LLVMValueRef need_lerp; + unsigned num_quads = bld->coord_bld.type.length / 4; + unsigned i; - lod_fpart = LLVMBuildFMul(builder, lod_fpart, h16_scale, ""); - lod_fpart = LLVMBuildFPToSI(builder, lod_fpart, i32_type, "lod_fpart.fixed16"); + lod_fpart = LLVMBuildFMul(builder, lod_fpart, h16vec_scale, ""); + lod_fpart = LLVMBuildFPToSI(builder, lod_fpart, i32vec_type, "lod_fpart.fixed16"); /* need_lerp = lod_fpart > 0 */ - need_lerp = LLVMBuildICmp(builder, LLVMIntSGT, - lod_fpart, LLVMConstNull(i32_type), - "need_lerp"); + if (num_quads == 1) { + need_lerp = LLVMBuildICmp(builder, LLVMIntSGT, + lod_fpart, bld->perquadi_bld.zero, + "need_lerp"); + } + else { + /* + * We'll do mip filtering if any of the quads need it. 
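+ * A single branch covers all the quads, so quads whose lod_fpart ended + * up as zero may get fetched and filtered needlessly.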
+ * It might be better to split the vectors here and only fetch/filter + * quads which need it. */ + /* + * We need to clamp lod_fpart here since we can get negative + * values which would screw up filtering if not all + * lod_fpart values have the same sign. + * We can however then skip the greater-than comparison. + */ + lod_fpart = lp_build_max(&bld->perquadi_bld, lod_fpart, + bld->perquadi_bld.zero); + need_lerp = lp_build_any_true_range(&bld->perquadi_bld, num_quads, lod_fpart); + } lp_build_if(&if_ctx, bld->gallivm, need_lerp); { struct lp_build_context h16_bld; - lp_build_context_init(&h16_bld, bld->gallivm, lp_type_ufixed(16)); + lp_build_context_init(&h16_bld, bld->gallivm, lp_type_ufixed(16, bld->vector_width)); /* sample the second mipmap level */ lp_build_mipmap_level_sizes(bld, ilevel1, &size1, &row_stride1_vec, &img_stride1_vec); data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1); - if (img_filter == PIPE_TEX_FILTER_NEAREST) { - lp_build_sample_image_nearest(bld, - size1, - row_stride1_vec, img_stride1_vec, - data_ptr1, s, t, r, - &colors1_lo, &colors1_hi); + + if (util_cpu_caps.has_avx && bld->coord_type.length > 4) { + if (img_filter == PIPE_TEX_FILTER_NEAREST) { + lp_build_sample_image_nearest_afloat(bld, + size1, + row_stride1_vec, img_stride1_vec, + data_ptr1, s, t, r, + &colors1_lo, &colors1_hi); + } + else { + lp_build_sample_image_linear_afloat(bld, + size1, + row_stride1_vec, img_stride1_vec, + data_ptr1, s, t, r, + &colors1_lo, &colors1_hi); + } } else { - lp_build_sample_image_linear(bld, - size1, - row_stride1_vec, img_stride1_vec, - data_ptr1, s, t, r, - &colors1_lo, &colors1_hi); + if (img_filter == PIPE_TEX_FILTER_NEAREST) { + lp_build_sample_image_nearest(bld, + size1, + row_stride1_vec, img_stride1_vec, + data_ptr1, s, t, r, + &colors1_lo, &colors1_hi); + } + else { + lp_build_sample_image_linear(bld, + size1, + row_stride1_vec, img_stride1_vec, + data_ptr1, s, t, r, + &colors1_lo, &colors1_hi); + } } /* interpolate samples from the two mipmap levels */ - lod_fpart = LLVMBuildTrunc(builder, lod_fpart, h16_bld.elem_type, ""); - lod_fpart = lp_build_broadcast_scalar(&h16_bld, lod_fpart); + if (num_quads == 1) { + lod_fpart = LLVMBuildTrunc(builder, lod_fpart, h16_bld.elem_type, ""); + lod_fpart = lp_build_broadcast_scalar(&h16_bld, lod_fpart); #if HAVE_LLVM == 0x208 - /* This is a work-around for a bug in LLVM 2.8. - * Evidently, something goes wrong in the construction of the - * lod_fpart short[8] vector. Adding this no-effect shuffle seems - * to force the vector to be properly constructed. - * Tested with mesa-demos/src/tests/mipmap_limits.c (press t, f). - */ - { - LLVMValueRef shuffles[8], shuffle; - int i; - assert(h16_bld.type.length <= Elements(shuffles)); - for (i = 0; i < h16_bld.type.length; i++) - shuffles[i] = lp_build_const_int32(bld->gallivm, 2 * (i & 1)); - shuffle = LLVMConstVector(shuffles, h16_bld.type.length); - lod_fpart = LLVMBuildShuffleVector(builder, - lod_fpart, lod_fpart, - shuffle, ""); - } + /* This is a work-around for a bug in LLVM 2.8. + * Evidently, something goes wrong in the construction of the + * lod_fpart short[8] vector. Adding this no-effect shuffle seems + * to force the vector to be properly constructed. + * Tested with mesa-demos/src/tests/mipmap_limits.c (press t, f). 
+ */ + { + LLVMValueRef shuffles[8], shuffle; + assert(h16_bld.type.length <= Elements(shuffles)); + for (i = 0; i < h16_bld.type.length; i++) + shuffles[i] = lp_build_const_int32(bld->gallivm, 2 * (i & 1)); + shuffle = LLVMConstVector(shuffles, h16_bld.type.length); + lod_fpart = LLVMBuildShuffleVector(builder, + lod_fpart, lod_fpart, + shuffle, ""); + } #endif - colors0_lo = lp_build_lerp(&h16_bld, lod_fpart, - colors0_lo, colors1_lo); - colors0_hi = lp_build_lerp(&h16_bld, lod_fpart, - colors0_hi, colors1_hi); + colors0_lo = lp_build_lerp(&h16_bld, lod_fpart, + colors0_lo, colors1_lo); + colors0_hi = lp_build_lerp(&h16_bld, lod_fpart, + colors0_hi, colors1_hi); + } + else { + LLVMValueRef lod_parts[LP_MAX_VECTOR_LENGTH/16]; + struct lp_type perquadi16_type = bld->perquadi_bld.type; + perquadi16_type.width /= 2; + perquadi16_type.length *= 2; + lod_fpart = LLVMBuildBitCast(builder, lod_fpart, + lp_build_vec_type(bld->gallivm, + perquadi16_type), ""); + /* XXX this only works for exactly 2 quads. More quads need shuffle */ + assert(num_quads == 2); + for (i = 0; i < num_quads; i++) { + LLVMValueRef indexi2 = lp_build_const_int32(bld->gallivm, i*2); + lod_parts[i] = lp_build_extract_broadcast(bld->gallivm, + perquadi16_type, + h16_bld.type, + lod_fpart, + indexi2); + } + colors0_lo = lp_build_lerp(&h16_bld, lod_parts[0], + colors0_lo, colors1_lo); + colors0_hi = lp_build_lerp(&h16_bld, lod_parts[1], + colors0_hi, colors1_hi); + } LLVMBuildStore(builder, colors0_lo, colors_lo_var); LLVMBuildStore(builder, colors0_hi, colors_hi_var); @@ -948,10 +1531,10 @@ lp_build_sample_aos(struct lp_build_sample_context *bld, LLVMValueRef s, LLVMValueRef t, LLVMValueRef r, - const LLVMValueRef *ddx, - const LLVMValueRef *ddy, - LLVMValueRef lod_bias, /* optional */ - LLVMValueRef explicit_lod, /* optional */ + LLVMValueRef lod_ipart, + LLVMValueRef lod_fpart, + LLVMValueRef ilevel0, + LLVMValueRef ilevel1, LLVMValueRef texel_out[4]) { struct lp_build_context *int_bld = &bld->int_bld; @@ -960,14 +1543,9 @@ lp_build_sample_aos(struct lp_build_sample_context *bld, const unsigned min_filter = bld->static_state->min_img_filter; const unsigned mag_filter = bld->static_state->mag_img_filter; const unsigned dims = bld->dims; - LLVMValueRef lod_ipart = NULL, lod_fpart = NULL; - LLVMValueRef ilevel0, ilevel1 = NULL; LLVMValueRef packed, packed_lo, packed_hi; LLVMValueRef unswizzled[4]; - LLVMValueRef face_ddx[4], face_ddy[4]; struct lp_build_context h16_bld; - LLVMValueRef first_level; - LLVMValueRef i32t_zero = lp_build_const_int32(bld->gallivm, 0); /* we only support the common/simple wrap modes at this time */ assert(lp_is_simple_wrap_mode(bld->static_state->wrap_s)); @@ -978,81 +1556,7 @@ lp_build_sample_aos(struct lp_build_sample_context *bld, /* make 16-bit fixed-pt builder context */ - lp_build_context_init(&h16_bld, bld->gallivm, lp_type_ufixed(16)); - - /* cube face selection, compute pre-face coords, etc. 
*/ - if (bld->static_state->target == PIPE_TEXTURE_CUBE) { - LLVMValueRef face, face_s, face_t; - lp_build_cube_lookup(bld, s, t, r, &face, &face_s, &face_t); - s = face_s; /* vec */ - t = face_t; /* vec */ - /* use 'r' to indicate cube face */ - r = lp_build_broadcast_scalar(&bld->int_coord_bld, face); /* vec */ - - /* recompute ddx, ddy using the new (s,t) face texcoords */ - face_ddx[0] = lp_build_scalar_ddx(&bld->coord_bld, s); - face_ddx[1] = lp_build_scalar_ddx(&bld->coord_bld, t); - face_ddx[2] = NULL; - face_ddx[3] = NULL; - face_ddy[0] = lp_build_scalar_ddy(&bld->coord_bld, s); - face_ddy[1] = lp_build_scalar_ddy(&bld->coord_bld, t); - face_ddy[2] = NULL; - face_ddy[3] = NULL; - ddx = face_ddx; - ddy = face_ddy; - } - - /* - * Compute the level of detail (float). - */ - if (min_filter != mag_filter || - mip_filter != PIPE_TEX_MIPFILTER_NONE) { - /* Need to compute lod either to choose mipmap levels or to - * distinguish between minification/magnification with one mipmap level. - */ - lp_build_lod_selector(bld, unit, ddx, ddy, - lod_bias, explicit_lod, - mip_filter, - &lod_ipart, &lod_fpart); - } else { - lod_ipart = i32t_zero; - } - - /* - * Compute integer mipmap level(s) to fetch texels from: ilevel0, ilevel1 - */ - switch (mip_filter) { - default: - assert(0 && "bad mip_filter value in lp_build_sample_aos()"); - /* fall-through */ - case PIPE_TEX_MIPFILTER_NONE: - /* always use mip level 0 */ - if (bld->static_state->target == PIPE_TEXTURE_CUBE) { - /* XXX this is a work-around for an apparent bug in LLVM 2.7. - * We should be able to set ilevel0 = const(0) but that causes - * bad x86 code to be emitted. - */ - assert(lod_ipart); - lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0); - } - else { - first_level = bld->dynamic_state->first_level(bld->dynamic_state, - bld->gallivm, unit); - ilevel0 = first_level; - } - break; - case PIPE_TEX_MIPFILTER_NEAREST: - assert(lod_ipart); - lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0); - break; - case PIPE_TEX_MIPFILTER_LINEAR: - assert(lod_ipart); - assert(lod_fpart); - lp_build_linear_mip_levels(bld, unit, - lod_ipart, &lod_fpart, - &ilevel0, &ilevel1); - break; - } + lp_build_context_init(&h16_bld, bld->gallivm, lp_type_ufixed(16, bld->vector_width)); /* * Get/interpolate texture colors. 
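With cube face selection, lod computation and mip level choice hoisted out of the aos path, a caller now looks roughly like this. A sketch only: lp_build_sample_common() is static in lp_bld_sample_soa.c, so this wrapper is purely illustrative.

#include "lp_bld_sample.h"
#include "lp_bld_sample_aos.h"

static void
sample_texture_aos(struct lp_build_sample_context *bld, unsigned unit,
                   LLVMValueRef s, LLVMValueRef t, LLVMValueRef r,
                   const struct lp_derivatives *derivs,
                   LLVMValueRef lod_bias, LLVMValueRef explicit_lod,
                   LLVMValueRef texel_out[4])
{
   LLVMValueRef lod_ipart, lod_fpart, ilevel0, ilevel1;

   /* shared setup: cube face selection, lod, integer mip levels */
   lp_build_sample_common(bld, unit, &s, &t, &r, derivs,
                          lod_bias, explicit_lod,
                          &lod_ipart, &lod_fpart, &ilevel0, &ilevel1);
   /* fixed-point filtering with precomputed levels */
   lp_build_sample_aos(bld, unit, s, t, r, lod_ipart, lod_fpart,
                       ilevel0, ilevel1, texel_out);
}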
@@ -1062,7 +1566,7 @@ lp_build_sample_aos(struct lp_build_sample_context *bld, packed_hi = lp_build_alloca(bld->gallivm, h16_bld.vec_type, "packed_hi"); if (min_filter == mag_filter) { - /* no need to distinquish between minification and magnification */ + /* no need to distinguish between minification and magnification */ lp_build_sample_mipmap(bld, min_filter, mip_filter, s, t, r, @@ -1106,7 +1610,7 @@ lp_build_sample_aos(struct lp_build_sample_context *bld, * into 'packed' */ packed = lp_build_pack2(bld->gallivm, - h16_bld.type, lp_type_unorm(8), + h16_bld.type, lp_type_unorm(8, bld->vector_width), LLVMBuildLoad(builder, packed_lo, ""), LLVMBuildLoad(builder, packed_hi, "")); diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h index 5d9ecac4d50..55b3bc1c09a 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h @@ -46,10 +46,10 @@ lp_build_sample_aos(struct lp_build_sample_context *bld, LLVMValueRef s, LLVMValueRef t, LLVMValueRef r, - const LLVMValueRef *ddx, - const LLVMValueRef *ddy, - LLVMValueRef lod_bias, /* optional */ - LLVMValueRef explicit_lod, /* optional */ + LLVMValueRef lod_ipart, + LLVMValueRef lod_fpart, + LLVMValueRef ilevel0, + LLVMValueRef ilevel1, LLVMValueRef texel_out[4]); diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c index 73dc3e77083..aaef7970635 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c @@ -41,6 +41,7 @@ #include "util/u_memory.h" #include "util/u_math.h" #include "util/u_format.h" +#include "util/u_cpu_detect.h" #include "lp_bld_debug.h" #include "lp_bld_type.h" #include "lp_bld_const.h" @@ -57,6 +58,7 @@ #include "lp_bld_sample_aos.h" #include "lp_bld_struct.h" #include "lp_bld_quad.h" +#include "lp_bld_pack.h" /** @@ -221,6 +223,41 @@ lp_build_coord_mirror(struct lp_build_sample_context *bld, /** + * Helper to compute the first coord and the weight for + * linear wrap repeat npot textures + */ +void +lp_build_coord_repeat_npot_linear(struct lp_build_sample_context *bld, + LLVMValueRef coord_f, + LLVMValueRef length_i, + LLVMValueRef length_f, + LLVMValueRef *coord0_i, + LLVMValueRef *weight_f) +{ + struct lp_build_context *coord_bld = &bld->coord_bld; + struct lp_build_context *int_coord_bld = &bld->int_coord_bld; + LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5); + LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length_i, + int_coord_bld->one); + LLVMValueRef mask; + /* wrap with normalized floats is just fract */ + coord_f = lp_build_fract(coord_bld, coord_f); + /* mul by size and subtract 0.5 */ + coord_f = lp_build_mul(coord_bld, coord_f, length_f); + coord_f = lp_build_sub(coord_bld, coord_f, half); + /* + * we avoided the 0.5/length division before the repeat wrap, + * now need to fix up edge cases with selects + */ + /* convert to int, compute lerp weight */ + lp_build_ifloor_fract(coord_bld, coord_f, coord0_i, weight_f); + mask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type, + PIPE_FUNC_LESS, *coord0_i, int_coord_bld->zero); + *coord0_i = lp_build_select(int_coord_bld, mask, length_minus_one, *coord0_i); +} + + +/** * Build LLVM code for texture wrap mode for linear filtering. 
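A scalar model of the new lp_build_coord_repeat_npot_linear() helper above, showing the fract/scale/floor-split and the negative-texel fixup (sketch, not patch code):

#include <math.h>

static void
coord_repeat_npot_linear(float coord, int length,
                         int *coord0, float *weight)
{
   float f = coord - floorf(coord);   /* repeat wrap == fract */
   f = f * (float)length - 0.5f;      /* unnormalize, center on texels */
   *coord0 = (int)floorf(f);          /* ifloor */
   *weight = f - floorf(f);           /* fract -> lerp weight */
   if (*coord0 < 0)
      *coord0 = length - 1;           /* fix up the left edge case */
}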
* \param x0_out returns first integer texcoord * \param x1_out returns second integer texcoord @@ -246,28 +283,27 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld, switch(wrap_mode) { case PIPE_TEX_WRAP_REPEAT: - /* mul by size and subtract 0.5 */ - coord = lp_build_mul(coord_bld, coord, length_f); - coord = lp_build_sub(coord_bld, coord, half); - /* convert to int, compute lerp weight */ - lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight); - /* repeat wrap */ if (is_pot) { + /* mul by size and subtract 0.5 */ + coord = lp_build_mul(coord_bld, coord, length_f); + coord = lp_build_sub(coord_bld, coord, half); + /* convert to int, compute lerp weight */ + lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight); coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); + /* repeat wrap */ coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, ""); coord1 = LLVMBuildAnd(builder, coord1, length_minus_one, ""); } else { - /* Add a bias to the texcoord to handle negative coords */ - LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024); LLVMValueRef mask; - coord0 = LLVMBuildAdd(builder, coord0, bias, ""); - coord0 = LLVMBuildURem(builder, coord0, length, ""); - mask = lp_build_compare(bld->gallivm, int_coord_bld->type, + lp_build_coord_repeat_npot_linear(bld, coord, + length, length_f, + &coord0, &weight); + mask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type, PIPE_FUNC_NOTEQUAL, coord0, length_minus_one); coord1 = LLVMBuildAnd(builder, - lp_build_add(int_coord_bld, coord0, int_coord_bld->one), - mask, ""); + lp_build_add(int_coord_bld, coord0, int_coord_bld->one), + mask, ""); } break; @@ -444,15 +480,16 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld, switch(wrap_mode) { case PIPE_TEX_WRAP_REPEAT: - coord = lp_build_mul(coord_bld, coord, length_f); - icoord = lp_build_ifloor(coord_bld, coord); - if (is_pot) + if (is_pot) { + coord = lp_build_mul(coord_bld, coord, length_f); + icoord = lp_build_ifloor(coord_bld, coord); icoord = LLVMBuildAnd(builder, icoord, length_minus_one, ""); + } else { - /* Add a bias to the texcoord to handle negative coords */ - LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024); - icoord = LLVMBuildAdd(builder, icoord, bias, ""); - icoord = LLVMBuildURem(builder, icoord, length, ""); + /* take fraction, unnormalize */ + coord = lp_build_fract_safe(coord_bld, coord); + coord = lp_build_mul(coord_bld, coord, length_f); + icoord = lp_build_itrunc(coord_bld, coord); } break; @@ -473,7 +510,7 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld, break; case PIPE_TEX_WRAP_CLAMP_TO_BORDER: - /* Note: this is the same as CLAMP_TO_EDGE, except min = -min */ + /* Note: this is the same as CLAMP_TO_EDGE, except min = -1 */ { LLVMValueRef min, max; @@ -873,12 +910,32 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld, if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { struct lp_build_if_state if_ctx; LLVMValueRef need_lerp; + unsigned num_quads = bld->coord_bld.type.length / 4; /* need_lerp = lod_fpart > 0 */ - need_lerp = LLVMBuildFCmp(builder, LLVMRealUGT, - lod_fpart, - bld->float_bld.zero, - "need_lerp"); + if (num_quads == 1) { + need_lerp = LLVMBuildFCmp(builder, LLVMRealUGT, + lod_fpart, bld->perquadf_bld.zero, + "need_lerp"); + } + else { + /* + * We'll do mip filtering if any of the quads need it. + * It might be better to split the vectors here and only fetch/filter + * quads which need it. 
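The reworked npot nearest repeat path above boils down to a fract, an unnormalize and a truncate; a scalar sketch, where the final guard stands in for what lp_build_fract_safe() guarantees (the product never reaching length):

#include <math.h>

static int
wrap_nearest_repeat_npot(float coord, int length)
{
   float f = coord - floorf(coord);         /* fract */
   int icoord = (int)(f * (float)length);   /* itrunc */
   if (icoord >= length)                    /* "safe" clamp */
      icoord = length - 1;
   return icoord;
}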
+ */ + /* + * We unfortunately need to clamp lod_fpart here since we can get + * negative values which would screw up filtering if not all + * lod_fpart values have same sign. + */ + lod_fpart = lp_build_max(&bld->perquadf_bld, lod_fpart, + bld->perquadf_bld.zero); + need_lerp = lp_build_compare(bld->gallivm, bld->perquadf_bld.type, + PIPE_FUNC_GREATER, + lod_fpart, bld->perquadf_bld.zero); + need_lerp = lp_build_any_true_range(&bld->perquadi_bld, num_quads, need_lerp); + } lp_build_if(&if_ctx, bld->gallivm, need_lerp); { @@ -904,7 +961,10 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld, /* interpolate samples from the two mipmap levels */ - lod_fpart = lp_build_broadcast_scalar(&bld->texel_bld, lod_fpart); + lod_fpart = lp_build_unpack_broadcast_aos_scalars(bld->gallivm, + bld->perquadf_bld.type, + bld->texel_bld.type, + lod_fpart); for (chan = 0; chan < 4; chan++) { colors0[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart, @@ -916,37 +976,28 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld, } } - - /** - * General texture sampling codegen. - * This function handles texture sampling for all texture targets (1D, - * 2D, 3D, cube) and all filtering modes. + * Calculate cube face, lod, mip levels. */ static void -lp_build_sample_general(struct lp_build_sample_context *bld, - unsigned unit, - LLVMValueRef s, - LLVMValueRef t, - LLVMValueRef r, - const LLVMValueRef *ddx, - const LLVMValueRef *ddy, - LLVMValueRef lod_bias, /* optional */ - LLVMValueRef explicit_lod, /* optional */ - LLVMValueRef *colors_out) +lp_build_sample_common(struct lp_build_sample_context *bld, + unsigned unit, + LLVMValueRef *s, + LLVMValueRef *t, + LLVMValueRef *r, + const struct lp_derivatives *derivs, + LLVMValueRef lod_bias, /* optional */ + LLVMValueRef explicit_lod, /* optional */ + LLVMValueRef *lod_ipart, + LLVMValueRef *lod_fpart, + LLVMValueRef *ilevel0, + LLVMValueRef *ilevel1) { - struct lp_build_context *int_bld = &bld->int_bld; - LLVMBuilderRef builder = bld->gallivm->builder; const unsigned mip_filter = bld->static_state->min_mip_filter; const unsigned min_filter = bld->static_state->min_img_filter; const unsigned mag_filter = bld->static_state->mag_img_filter; - LLVMValueRef lod_ipart = NULL, lod_fpart = NULL; - LLVMValueRef ilevel0, ilevel1 = NULL; - LLVMValueRef face_ddx[4], face_ddy[4]; - LLVMValueRef texels[4]; LLVMValueRef first_level; - LLVMValueRef i32t_zero = lp_build_const_int32(bld->gallivm, 0); - unsigned chan; + struct lp_derivatives face_derivs; /* printf("%s mip %d min %d mag %d\n", __FUNCTION__, @@ -958,23 +1009,16 @@ lp_build_sample_general(struct lp_build_sample_context *bld, */ if (bld->static_state->target == PIPE_TEXTURE_CUBE) { LLVMValueRef face, face_s, face_t; - lp_build_cube_lookup(bld, s, t, r, &face, &face_s, &face_t); - s = face_s; /* vec */ - t = face_t; /* vec */ + lp_build_cube_lookup(bld, *s, *t, *r, &face, &face_s, &face_t); + *s = face_s; /* vec */ + *t = face_t; /* vec */ /* use 'r' to indicate cube face */ - r = lp_build_broadcast_scalar(&bld->int_coord_bld, face); /* vec */ + *r = face; /* vec */ /* recompute ddx, ddy using the new (s,t) face texcoords */ - face_ddx[0] = lp_build_scalar_ddx(&bld->coord_bld, s); - face_ddx[1] = lp_build_scalar_ddx(&bld->coord_bld, t); - face_ddx[2] = NULL; - face_ddx[3] = NULL; - face_ddy[0] = lp_build_scalar_ddy(&bld->coord_bld, s); - face_ddy[1] = lp_build_scalar_ddy(&bld->coord_bld, t); - face_ddy[2] = NULL; - face_ddy[3] = NULL; - ddx = face_ddx; - ddy = face_ddy; + face_derivs.ddx_ddy[0] = 
lp_build_packed_ddx_ddy_twocoord(&bld->coord_bld, *s, *t); + face_derivs.ddx_ddy[1] = NULL; + derivs = &face_derivs; } /* @@ -985,12 +1029,12 @@ lp_build_sample_general(struct lp_build_sample_context *bld, /* Need to compute lod either to choose mipmap levels or to * distinguish between minification/magnification with one mipmap level. */ - lp_build_lod_selector(bld, unit, ddx, ddy, + lp_build_lod_selector(bld, unit, derivs, lod_bias, explicit_lod, mip_filter, - &lod_ipart, &lod_fpart); + lod_ipart, lod_fpart); } else { - lod_ipart = i32t_zero; + *lod_ipart = bld->perquadi_bld.zero; } /* @@ -1006,28 +1050,56 @@ lp_build_sample_general(struct lp_build_sample_context *bld, /* XXX this is a work-around for an apparent bug in LLVM 2.7. * We should be able to set ilevel0 = const(0) but that causes * bad x86 code to be emitted. + * XXX should probably disable that on other llvm versions. */ - assert(lod_ipart); - lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0); + assert(*lod_ipart); + lp_build_nearest_mip_level(bld, unit, *lod_ipart, ilevel0); } else { first_level = bld->dynamic_state->first_level(bld->dynamic_state, bld->gallivm, unit); - ilevel0 = first_level; + first_level = lp_build_broadcast_scalar(&bld->perquadi_bld, first_level); + *ilevel0 = first_level; } break; case PIPE_TEX_MIPFILTER_NEAREST: - assert(lod_ipart); - lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0); + assert(*lod_ipart); + lp_build_nearest_mip_level(bld, unit, *lod_ipart, ilevel0); break; case PIPE_TEX_MIPFILTER_LINEAR: - assert(lod_ipart); - assert(lod_fpart); + assert(*lod_ipart); + assert(*lod_fpart); lp_build_linear_mip_levels(bld, unit, - lod_ipart, &lod_fpart, - &ilevel0, &ilevel1); + *lod_ipart, lod_fpart, + ilevel0, ilevel1); break; } +} + +/** + * General texture sampling codegen. + * This function handles texture sampling for all texture targets (1D, + * 2D, 3D, cube) and all filtering modes. + */ +static void +lp_build_sample_general(struct lp_build_sample_context *bld, + unsigned unit, + LLVMValueRef s, + LLVMValueRef t, + LLVMValueRef r, + LLVMValueRef lod_ipart, + LLVMValueRef lod_fpart, + LLVMValueRef ilevel0, + LLVMValueRef ilevel1, + LLVMValueRef *colors_out) +{ + struct lp_build_context *int_bld = &bld->int_bld; + LLVMBuilderRef builder = bld->gallivm->builder; + const unsigned mip_filter = bld->static_state->min_mip_filter; + const unsigned min_filter = bld->static_state->min_img_filter; + const unsigned mag_filter = bld->static_state->mag_img_filter; + LLVMValueRef texels[4]; + unsigned chan; /* * Get/interpolate texture colors. @@ -1039,7 +1111,7 @@ lp_build_sample_general(struct lp_build_sample_context *bld, } if (min_filter == mag_filter) { - /* no need to distinquish between minification and magnification */ + /* no need to distinguish between minification and magnification */ lp_build_sample_mipmap(bld, unit, min_filter, mip_filter, s, t, r, @@ -1135,7 +1207,10 @@ lp_build_sample_compare(struct lp_build_sample_context *bld, * For debugging. */ void -lp_build_sample_nop(struct gallivm_state *gallivm, struct lp_type type, +lp_build_sample_nop(struct gallivm_state *gallivm, + struct lp_type type, + unsigned num_coords, + const LLVMValueRef *coords, LLVMValueRef texel_out[4]) { LLVMValueRef one = lp_build_one(gallivm, type); @@ -1152,8 +1227,7 @@ lp_build_sample_nop(struct gallivm_state *gallivm, struct lp_type type, * 'texel' will return a vector of four LLVMValueRefs corresponding to * R, G, B, A. * \param type vector float type to use for coords, etc. 
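A rough scalar model of the mip level choice made in the switch above; rounding and clamping against first/last level are done by lp_build_nearest_mip_level()/lp_build_linear_mip_levels() and are omitted from this sketch:

#include "pipe/p_defines.h"

static void
choose_mip_levels(unsigned mip_filter, int first_level,
                  int lod_ipart, int *level0, int *level1)
{
   switch (mip_filter) {
   case PIPE_TEX_MIPFILTER_NONE:
      *level0 = first_level;              /* always base level */
      break;
   case PIPE_TEX_MIPFILTER_NEAREST:
      *level0 = first_level + lod_ipart;  /* single level */
      break;
   case PIPE_TEX_MIPFILTER_LINEAR:
      *level0 = first_level + lod_ipart;  /* lerp between level0 ... */
      *level1 = *level0 + 1;              /* ... and level1 by lod_fpart */
      break;
   }
}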
- * \param ddx partial derivatives of (s,t,r,q) with respect to x - * \param ddy partial derivatives of (s,t,r,q) with respect to y + * \param derivs partial derivatives of (s,t,r,q) with respect to x and y */ void lp_build_sample_soa(struct gallivm_state *gallivm, @@ -1163,8 +1237,7 @@ lp_build_sample_soa(struct gallivm_state *gallivm, unsigned unit, unsigned num_coords, const LLVMValueRef *coords, - const LLVMValueRef ddx[4], - const LLVMValueRef ddy[4], + const struct lp_derivatives *derivs, LLVMValueRef lod_bias, /* optional */ LLVMValueRef explicit_lod, /* optional */ LLVMValueRef texel_out[4]) @@ -1173,10 +1246,10 @@ lp_build_sample_soa(struct gallivm_state *gallivm, struct lp_build_sample_context bld; LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context); LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef tex_width, tex_height, tex_depth; LLVMValueRef s; LLVMValueRef t; LLVMValueRef r; - struct lp_type float_vec_type; if (0) { enum pipe_format fmt = static_state->format; @@ -1193,6 +1266,8 @@ lp_build_sample_soa(struct gallivm_state *gallivm, bld.format_desc = util_format_description(static_state->format); bld.dims = dims; + bld.vector_width = lp_type_width(type); + bld.float_type = lp_type_float(32); bld.int_type = lp_type_int(32); bld.coord_type = type; @@ -1201,22 +1276,26 @@ lp_build_sample_soa(struct gallivm_state *gallivm, bld.float_size_type.length = dims > 1 ? 4 : 1; bld.int_size_type = lp_int_type(bld.float_size_type); bld.texel_type = type; - - float_vec_type = lp_type_float_vec(32); + bld.perquadf_type = type; + /* we want native vector size to be able to use our intrinsics */ + bld.perquadf_type.length = type.length > 4 ? ((type.length + 15) / 16) * 4 : 1; + bld.perquadi_type = lp_int_type(bld.perquadf_type); lp_build_context_init(&bld.float_bld, gallivm, bld.float_type); - lp_build_context_init(&bld.float_vec_bld, gallivm, float_vec_type); + lp_build_context_init(&bld.float_vec_bld, gallivm, type); lp_build_context_init(&bld.int_bld, gallivm, bld.int_type); lp_build_context_init(&bld.coord_bld, gallivm, bld.coord_type); lp_build_context_init(&bld.int_coord_bld, gallivm, bld.int_coord_type); lp_build_context_init(&bld.int_size_bld, gallivm, bld.int_size_type); lp_build_context_init(&bld.float_size_bld, gallivm, bld.float_size_type); lp_build_context_init(&bld.texel_bld, gallivm, bld.texel_type); + lp_build_context_init(&bld.perquadf_bld, gallivm, bld.perquadf_type); + lp_build_context_init(&bld.perquadi_bld, gallivm, bld.perquadi_type); /* Get the dynamic state */ - bld.width = dynamic_state->width(dynamic_state, gallivm, unit); - bld.height = dynamic_state->height(dynamic_state, gallivm, unit); - bld.depth = dynamic_state->depth(dynamic_state, gallivm, unit); + tex_width = dynamic_state->width(dynamic_state, gallivm, unit); + tex_height = dynamic_state->height(dynamic_state, gallivm, unit); + tex_depth = dynamic_state->depth(dynamic_state, gallivm, unit); bld.row_stride_array = dynamic_state->row_stride(dynamic_state, gallivm, unit); bld.img_stride_array = dynamic_state->img_stride(dynamic_state, gallivm, unit); bld.data_array = dynamic_state->data_ptr(dynamic_state, gallivm, unit); @@ -1228,37 +1307,40 @@ lp_build_sample_soa(struct gallivm_state *gallivm, /* width, height, depth as single int vector */ if (dims <= 1) { - bld.int_size = bld.width; + bld.int_size = tex_width; } else { bld.int_size = LLVMBuildInsertElement(builder, bld.int_size_bld.undef, - bld.width, LLVMConstInt(i32t, 0, 0), ""); + tex_width, LLVMConstInt(i32t, 0, 0), ""); if (dims >= 2) { 
bld.int_size = LLVMBuildInsertElement(builder, bld.int_size, - bld.height, LLVMConstInt(i32t, 1, 0), ""); + tex_height, LLVMConstInt(i32t, 1, 0), ""); if (dims >= 3) { bld.int_size = LLVMBuildInsertElement(builder, bld.int_size, - bld.depth, LLVMConstInt(i32t, 2, 0), ""); + tex_depth, LLVMConstInt(i32t, 2, 0), ""); } } } if (0) { /* For debug: no-op texture sampling */ - lp_build_sample_nop(gallivm, bld.texel_type, texel_out); - } - else if (util_format_fits_8unorm(bld.format_desc) && - lp_is_simple_wrap_mode(static_state->wrap_s) && - lp_is_simple_wrap_mode(static_state->wrap_t)) { - /* do sampling/filtering with fixed pt arithmetic */ - lp_build_sample_aos(&bld, unit, s, t, r, ddx, ddy, - lod_bias, explicit_lod, + lp_build_sample_nop(gallivm, + bld.texel_type, + num_coords, + coords, texel_out); } - else { + LLVMValueRef lod_ipart = NULL, lod_fpart = NULL; + LLVMValueRef ilevel0 = NULL, ilevel1 = NULL; + unsigned num_quads = type.length / 4; + const unsigned mip_filter = bld.static_state->min_mip_filter; + boolean use_aos = util_format_fits_8unorm(bld.format_desc) && + lp_is_simple_wrap_mode(static_state->wrap_s) && + lp_is_simple_wrap_mode(static_state->wrap_t); + if ((gallivm_debug & GALLIVM_DEBUG_PERF) && - util_format_fits_8unorm(bld.format_desc)) { + !use_aos && util_format_fits_8unorm(bld.format_desc)) { debug_printf("%s: using floating point linear filtering for %s\n", __FUNCTION__, bld.format_desc->short_name); debug_printf(" min_img %d mag_img %d mip %d wraps %d wrapt %d\n", @@ -1269,9 +1351,203 @@ lp_build_sample_soa(struct gallivm_state *gallivm, static_state->wrap_t); } - lp_build_sample_general(&bld, unit, s, t, r, ddx, ddy, - lod_bias, explicit_lod, - texel_out); + lp_build_sample_common(&bld, unit, + &s, &t, &r, + derivs, lod_bias, explicit_lod, + &lod_ipart, &lod_fpart, + &ilevel0, &ilevel1); + + /* + * we only try 8-wide sampling with soa as it appears to + * be a loss with aos with AVX. 
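When the 8-wide path can't be used, the sampler below falls back to splitting the coordinates into 4-wide quads, sampling each, and concatenating the results; the split itself has this shape (a scalar sketch of what lp_build_extract_range() does):

static void
split_into_quads(const float *coords, unsigned num_quads,
                 float quads[][4])
{
   unsigned q, i;
   for (q = 0; q < num_quads; q++)
      for (i = 0; i < 4; i++)
         quads[q][i] = coords[q * 4 + i];  /* extract_range(v, 4*q, 4) */
}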
+ */ + if (num_quads == 1 || (mip_filter == PIPE_TEX_MIPFILTER_NONE && + !use_aos)) { + + if (num_quads > 1) { + LLVMValueRef index0 = lp_build_const_int32(gallivm, 0); + /* These parameters are the same for all quads */ + lod_ipart = LLVMBuildExtractElement(builder, lod_ipart, index0, ""); + ilevel0 = LLVMBuildExtractElement(builder, ilevel0, index0, ""); + } + if (use_aos) { + /* do sampling/filtering with fixed pt arithmetic */ + lp_build_sample_aos(&bld, unit, + s, t, r, + lod_ipart, lod_fpart, + ilevel0, ilevel1, + texel_out); + } + + else { + lp_build_sample_general(&bld, unit, + s, t, r, + lod_ipart, lod_fpart, + ilevel0, ilevel1, + texel_out); + } + } + else { + struct lp_build_if_state if_ctx; + LLVMValueRef notsame_levels, notsame; + LLVMValueRef index0 = lp_build_const_int32(gallivm, 0); + LLVMValueRef texels[4]; + LLVMValueRef texelout[4]; + unsigned j; + + texels[0] = lp_build_alloca(gallivm, bld.texel_bld.vec_type, "texr"); + texels[1] = lp_build_alloca(gallivm, bld.texel_bld.vec_type, "texg"); + texels[2] = lp_build_alloca(gallivm, bld.texel_bld.vec_type, "texb"); + texels[3] = lp_build_alloca(gallivm, bld.texel_bld.vec_type, "texa"); + + /* only build the if if we MAY split, otherwise always split */ + if (!use_aos) { + notsame = lp_build_extract_broadcast(gallivm, + bld.perquadi_bld.type, + bld.perquadi_bld.type, + ilevel0, index0); + notsame = lp_build_sub(&bld.perquadi_bld, ilevel0, notsame); + notsame_levels = lp_build_any_true_range(&bld.perquadi_bld, num_quads, + notsame); + if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { + notsame = lp_build_extract_broadcast(gallivm, + bld.perquadi_bld.type, + bld.perquadi_bld.type, + ilevel1, index0); + notsame = lp_build_sub(&bld.perquadi_bld, ilevel1, notsame); + notsame = lp_build_any_true_range(&bld.perquadi_bld, num_quads, notsame); + notsame_levels = LLVMBuildOr(builder, notsame_levels, notsame, ""); + } + lp_build_if(&if_ctx, gallivm, notsame_levels); + } + + { + struct lp_build_sample_context bld4; + struct lp_type type4 = type; + unsigned i; + LLVMValueRef texelout4[4]; + LLVMValueRef texelouttmp[4][LP_MAX_VECTOR_LENGTH/16]; + + type4.length = 4; + + /* Setup our build context */ + memset(&bld4, 0, sizeof bld4); + bld4.gallivm = bld.gallivm; + bld4.static_state = bld.static_state; + bld4.dynamic_state = bld.dynamic_state; + bld4.format_desc = bld.format_desc; + bld4.dims = bld.dims; + bld4.row_stride_array = bld.row_stride_array; + bld4.img_stride_array = bld.img_stride_array; + bld4.data_array = bld.data_array; + bld4.int_size = bld.int_size; + + bld4.vector_width = lp_type_width(type4); + + bld4.float_type = lp_type_float(32); + bld4.int_type = lp_type_int(32); + bld4.coord_type = type4; + bld4.int_coord_type = lp_int_type(type4); + bld4.float_size_type = lp_type_float(32); + bld4.float_size_type.length = dims > 1 ? 
4 : 1; + bld4.int_size_type = lp_int_type(bld4.float_size_type); + bld4.texel_type = type4; + bld4.perquadf_type = type4; + /* we want native vector size to be able to use our intrinsics */ + bld4.perquadf_type.length = 1; + bld4.perquadi_type = lp_int_type(bld4.perquadf_type); + + lp_build_context_init(&bld4.float_bld, gallivm, bld4.float_type); + lp_build_context_init(&bld4.float_vec_bld, gallivm, type4); + lp_build_context_init(&bld4.int_bld, gallivm, bld4.int_type); + lp_build_context_init(&bld4.coord_bld, gallivm, bld4.coord_type); + lp_build_context_init(&bld4.int_coord_bld, gallivm, bld4.int_coord_type); + lp_build_context_init(&bld4.int_size_bld, gallivm, bld4.int_size_type); + lp_build_context_init(&bld4.float_size_bld, gallivm, bld4.float_size_type); + lp_build_context_init(&bld4.texel_bld, gallivm, bld4.texel_type); + lp_build_context_init(&bld4.perquadf_bld, gallivm, bld4.perquadf_type); + lp_build_context_init(&bld4.perquadi_bld, gallivm, bld4.perquadi_type); + + for (i = 0; i < num_quads; i++) { + LLVMValueRef s4, t4, r4; + LLVMValueRef lod_iparts, lod_fparts = NULL; + LLVMValueRef ilevel0s, ilevel1s = NULL; + LLVMValueRef indexi = lp_build_const_int32(gallivm, i); + + s4 = lp_build_extract_range(gallivm, s, 4*i, 4); + t4 = lp_build_extract_range(gallivm, t, 4*i, 4); + r4 = lp_build_extract_range(gallivm, r, 4*i, 4); + lod_iparts = LLVMBuildExtractElement(builder, lod_ipart, indexi, ""); + ilevel0s = LLVMBuildExtractElement(builder, ilevel0, indexi, ""); + if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { + ilevel1s = LLVMBuildExtractElement(builder, ilevel1, indexi, ""); + lod_fparts = LLVMBuildExtractElement(builder, lod_fpart, indexi, ""); + } + + if (use_aos) { + /* do sampling/filtering with fixed pt arithmetic */ + lp_build_sample_aos(&bld4, unit, + s4, t4, r4, + lod_iparts, lod_fparts, + ilevel0s, ilevel1s, + texelout4); + } + + else { + lp_build_sample_general(&bld4, unit, + s4, t4, r4, + lod_iparts, lod_fparts, + ilevel0s, ilevel1s, + texelout4); + } + for (j = 0; j < 4; j++) { + texelouttmp[j][i] = texelout4[j]; + } + } + for (j = 0; j < 4; j++) { + texelout[j] = lp_build_concat(gallivm, texelouttmp[j], type4, num_quads); + LLVMBuildStore(builder, texelout[j], texels[j]); + } + } + if (!use_aos) { + LLVMValueRef ilevel0s, lod_iparts, ilevel1s = NULL; + + lp_build_else(&if_ctx); + + /* These parameters are the same for all quads */ + lod_iparts = LLVMBuildExtractElement(builder, lod_ipart, index0, ""); + ilevel0s = LLVMBuildExtractElement(builder, ilevel0, index0, ""); + if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { + ilevel1s = LLVMBuildExtractElement(builder, ilevel1, index0, ""); + } + + if (use_aos) { + /* do sampling/filtering with fixed pt arithmetic */ + lp_build_sample_aos(&bld, unit, + s, t, r, + lod_iparts, lod_fpart, + ilevel0s, ilevel1s, + texelout); + } + + else { + lp_build_sample_general(&bld, unit, + s, t, r, + lod_iparts, lod_fpart, + ilevel0s, ilevel1s, + texelout); + } + for (j = 0; j < 4; j++) { + LLVMBuildStore(builder, texelout[j], texels[j]); + } + + lp_build_endif(&if_ctx); + } + + for (j = 0; j < 4; j++) { + texel_out[j] = LLVMBuildLoad(builder, texels[j], ""); + } + } } lp_build_sample_compare(&bld, r, texel_out); @@ -1283,6 +1559,7 @@ void lp_build_size_query_soa(struct gallivm_state *gallivm, const struct lp_sampler_static_state *static_state, struct lp_sampler_dynamic_state *dynamic_state, + struct lp_type int_type, unsigned unit, LLVMValueRef explicit_lod, LLVMValueRef *sizes_out) @@ -1311,7 +1588,9 @@ lp_build_size_query_soa(struct 
gallivm_state *gallivm, return; } - lp_build_context_init(&bld_int_vec, gallivm, lp_type_int_vec(32)); + assert(!int_type.floating); + + lp_build_context_init(&bld_int_vec, gallivm, lp_type_int_vec(32, 128)); if (explicit_lod) { LLVMValueRef first_level; @@ -1345,7 +1624,7 @@ lp_build_size_query_soa(struct gallivm_state *gallivm, size = lp_build_minify(&bld_int_vec, size, lod); for (i=0; i < dims; i++) { - sizes_out[i] = lp_build_extract_broadcast(gallivm, bld_int_vec.type, bld_int_vec.type, + sizes_out[i] = lp_build_extract_broadcast(gallivm, bld_int_vec.type, int_type, size, lp_build_const_int32(gallivm, i)); } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c index 5d4406812c7..641c960431d 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c @@ -40,6 +40,7 @@ #include "lp_bld_init.h" #include "lp_bld_logic.h" #include "lp_bld_swizzle.h" +#include "lp_bld_pack.h" LLVMValueRef @@ -95,7 +96,7 @@ lp_build_broadcast_scalar(struct lp_build_context *bld, /** - * Combined extract and broadcast (or a mere shuffle when the two types match) + * Combined extract and broadcast (mere shuffle in most cases) */ LLVMValueRef lp_build_extract_broadcast(struct gallivm_state *gallivm, @@ -132,9 +133,9 @@ lp_build_extract_broadcast(struct gallivm_state *gallivm, } } else { - if (dst_type.length == src_type.length) { + if (dst_type.length > 1) { /* - * Special shuffle of the same size. + * shuffle - result can be of different length. */ LLVMValueRef shuffle; @@ -142,28 +143,14 @@ lp_build_extract_broadcast(struct gallivm_state *gallivm, LLVMVectorType(i32t, dst_type.length), index); res = LLVMBuildShuffleVector(gallivm->builder, vector, - LLVMGetUndef(lp_build_vec_type(gallivm, dst_type)), + LLVMGetUndef(lp_build_vec_type(gallivm, src_type)), shuffle, ""); } else { - LLVMValueRef scalar; - scalar = LLVMBuildExtractElement(gallivm->builder, vector, index, ""); - if (dst_type.length == 1) { - /* - * Trivial extract scalar from vector. - */ - - res = scalar; - } - else { - /* - * General case of different sized vectors. - */ - - res = lp_build_broadcast(gallivm, - lp_build_vec_type(gallivm, dst_type), - vector); - } + /* + * Trivial extract scalar from vector. 
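A scalar model of what lp_build_extract_broadcast() now covers in all the cases above: pick one source element and splat it across a destination of possibly different length (sketch, not patch code):

static void
extract_broadcast(const int *src, unsigned index,
                  int *dst, unsigned dst_length)
{
   unsigned i;
   for (i = 0; i < dst_length; i++)
      dst[i] = src[index];
}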
+ */ + res = LLVMBuildExtractElement(gallivm->builder, vector, index, ""); } } @@ -290,6 +277,8 @@ lp_build_swizzle_aos(struct lp_build_context *bld, return bld->zero; case PIPE_SWIZZLE_ONE: return bld->one; + case LP_BLD_SWIZZLE_DONTCARE: + return bld->undef; default: assert(0); return bld->undef; @@ -319,21 +308,26 @@ lp_build_swizzle_aos(struct lp_build_context *bld, case PIPE_SWIZZLE_BLUE: case PIPE_SWIZZLE_ALPHA: shuffle = j + swizzles[i]; + shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0); break; case PIPE_SWIZZLE_ZERO: shuffle = type.length + 0; + shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0); if (!aux[0]) { aux[0] = lp_build_const_elem(bld->gallivm, type, 0.0); } break; case PIPE_SWIZZLE_ONE: shuffle = type.length + 1; + shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0); if (!aux[1]) { aux[1] = lp_build_const_elem(bld->gallivm, type, 1.0); } break; + case LP_BLD_SWIZZLE_DONTCARE: + shuffles[j + i] = LLVMGetUndef(i32t); + break; } - shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0); } } @@ -508,3 +502,127 @@ lp_build_swizzle_soa_inplace(struct lp_build_context *bld, lp_build_swizzle_soa(bld, unswizzled, swizzles, values); } + + +/** + * Transpose from AOS <-> SOA + * + * @param single_type_lp type of pixels + * @param src the 4 * n pixel input + * @param dst the 4 * n pixel output + */ +void +lp_build_transpose_aos(struct gallivm_state *gallivm, + struct lp_type single_type_lp, + const LLVMValueRef src[4], + LLVMValueRef dst[4]) +{ + struct lp_type double_type_lp = single_type_lp; + LLVMTypeRef single_type; + LLVMTypeRef double_type; + LLVMValueRef t0, t1, t2, t3; + + double_type_lp.length >>= 1; + double_type_lp.width <<= 1; + + double_type = lp_build_vec_type(gallivm, double_type_lp); + single_type = lp_build_vec_type(gallivm, single_type_lp); + + /* Interleave x, y, z, w -> xy and zw */ + t0 = lp_build_interleave2_half(gallivm, single_type_lp, src[0], src[1], 0); + t1 = lp_build_interleave2_half(gallivm, single_type_lp, src[2], src[3], 0); + t2 = lp_build_interleave2_half(gallivm, single_type_lp, src[0], src[1], 1); + t3 = lp_build_interleave2_half(gallivm, single_type_lp, src[2], src[3], 1); + + /* Cast to double width type for second interleave */ + t0 = LLVMBuildBitCast(gallivm->builder, t0, double_type, "t0"); + t1 = LLVMBuildBitCast(gallivm->builder, t1, double_type, "t1"); + t2 = LLVMBuildBitCast(gallivm->builder, t2, double_type, "t2"); + t3 = LLVMBuildBitCast(gallivm->builder, t3, double_type, "t3"); + + /* Interleave xy, zw -> xyzw */ + dst[0] = lp_build_interleave2_half(gallivm, double_type_lp, t0, t1, 0); + dst[1] = lp_build_interleave2_half(gallivm, double_type_lp, t0, t1, 1); + dst[2] = lp_build_interleave2_half(gallivm, double_type_lp, t2, t3, 0); + dst[3] = lp_build_interleave2_half(gallivm, double_type_lp, t2, t3, 1); + + /* Cast back to original single width type */ + dst[0] = LLVMBuildBitCast(gallivm->builder, dst[0], single_type, "dst0"); + dst[1] = LLVMBuildBitCast(gallivm->builder, dst[1], single_type, "dst1"); + dst[2] = LLVMBuildBitCast(gallivm->builder, dst[2], single_type, "dst2"); + dst[3] = LLVMBuildBitCast(gallivm->builder, dst[3], single_type, "dst3"); +} + + +/** + * Pack first element of aos values, + * pad out to destination size. + * i.e. 
x1 _ _ _ x2 _ _ _ will become x1 x2 _ _ + */ +LLVMValueRef +lp_build_pack_aos_scalars(struct gallivm_state *gallivm, + struct lp_type src_type, + struct lp_type dst_type, + const LLVMValueRef src) +{ + LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context); + LLVMValueRef undef = LLVMGetUndef(i32t); + LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH]; + unsigned num_src = src_type.length / 4; + unsigned num_dst = dst_type.length; + unsigned i; + + assert(num_src <= num_dst); + + for (i = 0; i < num_src; i++) { + shuffles[i] = LLVMConstInt(i32t, i * 4, 0); + } + for (i = num_src; i < num_dst; i++) { + shuffles[i] = undef; + } + + if (num_dst == 1) { + return LLVMBuildExtractElement(gallivm->builder, src, shuffles[0], ""); + } + else { + return LLVMBuildShuffleVector(gallivm->builder, src, src, + LLVMConstVector(shuffles, num_dst), ""); + } +} + + +/** + * Unpack and broadcast packed aos values consisting of only the + * first value, i.e. x1 x2 _ _ will become x1 x1 x1 x1 x2 x2 x2 x2 + */ +LLVMValueRef +lp_build_unpack_broadcast_aos_scalars(struct gallivm_state *gallivm, + struct lp_type src_type, + struct lp_type dst_type, + const LLVMValueRef src) +{ + LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context); + LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH]; + unsigned num_dst = dst_type.length; + unsigned num_src = dst_type.length / 4; + unsigned i; + + assert(num_dst / 4 <= src_type.length); + + for (i = 0; i < num_src; i++) { + shuffles[i*4] = LLVMConstInt(i32t, i, 0); + shuffles[i*4+1] = LLVMConstInt(i32t, i, 0); + shuffles[i*4+2] = LLVMConstInt(i32t, i, 0); + shuffles[i*4+3] = LLVMConstInt(i32t, i, 0); + } + + if (num_src == 1) { + return lp_build_extract_broadcast(gallivm, src_type, dst_type, + src, shuffles[0]); + } + else { + return LLVMBuildShuffleVector(gallivm->builder, src, src, + LLVMConstVector(shuffles, num_dst), ""); + } +} + diff --git a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h index c366a65103e..0bf4ce988a2 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h @@ -44,6 +44,9 @@ struct lp_type; struct lp_build_context; +#define LP_BLD_SWIZZLE_DONTCARE 0xFF + + LLVMValueRef lp_build_broadcast(struct gallivm_state *gallivm, LLVMTypeRef vec_type, @@ -103,4 +106,25 @@ lp_build_swizzle_soa_inplace(struct lp_build_context *bld, const unsigned char swizzles[4]); +void +lp_build_transpose_aos(struct gallivm_state *gallivm, + struct lp_type type, + const LLVMValueRef src[4], + LLVMValueRef dst[4]); + + +LLVMValueRef +lp_build_pack_aos_scalars(struct gallivm_state *gallivm, + struct lp_type src_type, + struct lp_type dst_type, + const LLVMValueRef src); + + +LLVMValueRef +lp_build_unpack_broadcast_aos_scalars(struct gallivm_state *gallivm, + struct lp_type src_type, + struct lp_type dst_type, + const LLVMValueRef src); + + #endif /* !LP_BLD_SWIZZLE_H */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h index 4423bc5dedd..e292420a61a 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h @@ -60,6 +60,7 @@ struct tgsi_token; struct tgsi_shader_info; struct lp_build_mask_context; struct gallivm_state; +struct lp_derivatives; enum lp_build_tex_modifier { @@ -174,8 +175,7 @@ struct lp_build_sampler_soa unsigned unit, unsigned num_coords, const LLVMValueRef *coords, - const LLVMValueRef *ddx, - const LLVMValueRef *ddy, + const struct lp_derivatives *derivs, LLVMValueRef 
lod_bias, /* optional */ LLVMValueRef explicit_lod, /* optional */ LLVMValueRef *texel); @@ -183,6 +183,7 @@ struct lp_build_sampler_soa void (*emit_size_query)( const struct lp_build_sampler_soa *sampler, struct gallivm_state *gallivm, + struct lp_type type, unsigned unit, LLVMValueRef explicit_lod, /* optional */ LLVMValueRef *sizes_out); @@ -197,8 +198,7 @@ struct lp_build_sampler_aos unsigned target, /* TGSI_TEXTURE_* */ unsigned unit, LLVMValueRef coords, - LLVMValueRef ddx, - LLVMValueRef ddy, + const struct lp_derivatives derivs, enum lp_build_tex_modifier modifier); }; diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c index 24bc13a9be8..0666bba7fbd 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c @@ -56,6 +56,7 @@ #include "lp_bld_quad.h" #include "lp_bld_tgsi.h" #include "lp_bld_debug.h" +#include "lp_bld_sample.h" /** @@ -363,6 +364,7 @@ emit_tex(struct lp_build_tgsi_aos_context *bld, LLVMValueRef coords; LLVMValueRef ddx; LLVMValueRef ddy; + struct lp_derivatives derivs; if (!bld->sampler) { _debug_printf("warning: found texture instruction but no sampler generator supplied\n"); @@ -373,7 +375,7 @@ emit_tex(struct lp_build_tgsi_aos_context *bld, coords = lp_build_emit_fetch( &bld->bld_base, inst, 0 , LP_CHAN_ALL); - if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV) { + if (0 && modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV) { ddx = lp_build_emit_fetch( &bld->bld_base, inst, 1 , LP_CHAN_ALL); ddy = lp_build_emit_fetch( &bld->bld_base, inst, 2 , LP_CHAN_ALL); unit = inst->Src[3].Register.Index; @@ -383,8 +385,8 @@ emit_tex(struct lp_build_tgsi_aos_context *bld, ddy = lp_build_ddy( &bld->bld_base.base, coords ); #else /* TODO */ - ddx = bld->bld_base.base.one; - ddy = bld->bld_base.base.one; + derivs.ddx_ddy[0] = bld->bld_base.base.one; + derivs.ddx_ddy[1] = bld->bld_base.base.one; #endif unit = inst->Src[1].Register.Index; } @@ -392,7 +394,7 @@ emit_tex(struct lp_build_tgsi_aos_context *bld, return bld->sampler->emit_fetch_texel(bld->sampler, &bld->bld_base.base, target, unit, - coords, ddx, ddy, + coords, derivs, modifier); } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c index d9faaf20273..85a4401b534 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c @@ -62,6 +62,7 @@ #include "lp_bld_limits.h" #include "lp_bld_debug.h" #include "lp_bld_printf.h" +#include "lp_bld_sample.h" static void lp_exec_mask_init(struct lp_exec_mask *mask, struct lp_build_context *bld) @@ -763,7 +764,7 @@ emit_fetch_temporary( else { LLVMValueRef temp_ptr; if (stype != TGSI_TYPE_FLOAT && stype != TGSI_TYPE_UNTYPED) { - LLVMTypeRef itype = LLVMPointerType(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4), 0); + LLVMTypeRef itype = LLVMPointerType(bld->bld_base.int_bld.vec_type, 0); LLVMValueRef tint_ptr = lp_get_temp_ptr_soa(bld, reg->Register.Index, swizzle); temp_ptr = LLVMBuildBitCast(builder, tint_ptr, itype, ""); @@ -1068,7 +1069,7 @@ emit_store_chan( switch (dtype) { case TGSI_TYPE_UNSIGNED: case TGSI_TYPE_SIGNED: { - LLVMTypeRef itype = LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4); + LLVMTypeRef itype = bld_base->int_bld.vec_type; LLVMTypeRef ivtype = LLVMPointerType(itype, 0); LLVMValueRef tint_ptr = lp_get_temp_ptr_soa(bld, reg->Register.Index, chan_index); @@ -1141,13 +1142,14 @@ emit_tex( struct 
lp_build_tgsi_soa_context *bld, LLVMValueRef *texel) { LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder; + struct gallivm_state *gallivm = bld->bld_base.base.gallivm; unsigned unit; LLVMValueRef lod_bias, explicit_lod; LLVMValueRef oow = NULL; LLVMValueRef coords[3]; - LLVMValueRef ddx[3]; - LLVMValueRef ddy[3]; + struct lp_derivatives derivs; unsigned num_coords; + unsigned dims; unsigned i; if (!bld->sampler) { @@ -1158,26 +1160,42 @@ emit_tex( struct lp_build_tgsi_soa_context *bld, return; } + derivs.ddx_ddy[0] = bld->bld_base.base.undef; + derivs.ddx_ddy[1] = bld->bld_base.base.undef; + switch (inst->Texture.Texture) { case TGSI_TEXTURE_1D: num_coords = 1; + dims = 1; break; case TGSI_TEXTURE_1D_ARRAY: + num_coords = 2; + dims = 1; + break; case TGSI_TEXTURE_2D: case TGSI_TEXTURE_RECT: num_coords = 2; + dims = 2; break; case TGSI_TEXTURE_SHADOW1D: case TGSI_TEXTURE_SHADOW1D_ARRAY: + num_coords = 3; + dims = 1; + break; case TGSI_TEXTURE_SHADOW2D: case TGSI_TEXTURE_SHADOWRECT: case TGSI_TEXTURE_2D_ARRAY: - case TGSI_TEXTURE_3D: case TGSI_TEXTURE_CUBE: num_coords = 3; + dims = 2; + break; + case TGSI_TEXTURE_3D: + num_coords = 3; + dims = 3; break; case TGSI_TEXTURE_SHADOW2D_ARRAY: num_coords = 4; + dims = 2; break; default: assert(0); @@ -1212,31 +1230,66 @@ emit_tex( struct lp_build_tgsi_soa_context *bld, } if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV) { - LLVMValueRef index0 = lp_build_const_int32(bld->bld_base.base.gallivm, 0); - for (i = 0; i < num_coords; i++) { - LLVMValueRef src1 = lp_build_emit_fetch( &bld->bld_base, inst, 1, i ); - LLVMValueRef src2 = lp_build_emit_fetch( &bld->bld_base, inst, 2, i ); - ddx[i] = LLVMBuildExtractElement(builder, src1, index0, ""); - ddy[i] = LLVMBuildExtractElement(builder, src2, index0, ""); + LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context)); + LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH]; + LLVMValueRef ddxdyonec[3]; + unsigned length = bld->bld_base.base.type.length; + unsigned num_quads = length / 4; + unsigned dim; + unsigned quad; + + for (dim = 0; dim < dims; ++dim) { + LLVMValueRef srcx = lp_build_emit_fetch( &bld->bld_base, inst, 1, dim ); + LLVMValueRef srcy = lp_build_emit_fetch( &bld->bld_base, inst, 2, dim ); + for (quad = 0; quad < num_quads; ++quad) { + unsigned s1 = 4*quad; + unsigned s2 = 4*quad + length; + shuffles[4*quad + 0] = lp_build_const_int32(gallivm, s1); + shuffles[4*quad + 1] = lp_build_const_int32(gallivm, s2); + shuffles[4*quad + 2] = i32undef; + shuffles[4*quad + 3] = i32undef; + } + ddxdyonec[dim] = LLVMBuildShuffleVector(builder, srcx, srcy, + LLVMConstVector(shuffles, length), ""); + } + if (dims == 1) { + derivs.ddx_ddy[0] = ddxdyonec[0]; + } + else if (dims >= 2) { + for (quad = 0; quad < num_quads; ++quad) { + unsigned s1 = 4*quad; + unsigned s2 = 4*quad + length; + shuffles[4*quad + 0] = lp_build_const_int32(gallivm, s1); + shuffles[4*quad + 1] = lp_build_const_int32(gallivm, s1 + 1); + shuffles[4*quad + 2] = lp_build_const_int32(gallivm, s2); + shuffles[4*quad + 3] = lp_build_const_int32(gallivm, s2 + 1); + } + derivs.ddx_ddy[0] = LLVMBuildShuffleVector(builder, ddxdyonec[0], ddxdyonec[1], + LLVMConstVector(shuffles, length), ""); + if (dims == 3) { + derivs.ddx_ddy[1] = ddxdyonec[2]; + } } unit = inst->Src[3].Register.Index; } else { - for (i = 0; i < num_coords; i++) { - ddx[i] = lp_build_scalar_ddx( &bld->bld_base.base, coords[i] ); - ddy[i] = lp_build_scalar_ddy( &bld->bld_base.base, coords[i] ); + if (dims == 1) { + derivs.ddx_ddy[0] = 
lp_build_packed_ddx_ddy_onecoord(&bld->bld_base.base, coords[0]); + } + else if (dims >= 2) { + derivs.ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(&bld->bld_base.base, + coords[0], coords[1]); + if (dims == 3) { + derivs.ddx_ddy[1] = lp_build_packed_ddx_ddy_onecoord(&bld->bld_base.base, coords[2]); + } } unit = inst->Src[1].Register.Index; } - for (i = num_coords; i < 3; i++) { - ddx[i] = LLVMGetUndef(bld->bld_base.base.elem_type); - ddy[i] = LLVMGetUndef(bld->bld_base.base.elem_type); - } bld->sampler->emit_fetch_texel(bld->sampler, bld->bld_base.base.gallivm, bld->bld_base.base.type, unit, num_coords, coords, - ddx, ddy, + &derivs, lod_bias, explicit_lod, texel); } @@ -1310,6 +1363,7 @@ emit_txq( struct lp_build_tgsi_soa_context *bld, bld->sampler->emit_size_query(bld->sampler, bld->bld_base.base.gallivm, + bld->bld_base.int_bld.type, inst->Src[1].Register.Index, explicit_lod, sizes_out); diff --git a/src/gallium/auxiliary/gallivm/lp_bld_type.c b/src/gallium/auxiliary/gallivm/lp_bld_type.c index 413e69bedac..6c3aa38bfb1 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_type.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_type.c @@ -38,6 +38,9 @@ lp_build_elem_type(struct gallivm_state *gallivm, struct lp_type type) { if (type.floating) { switch(type.width) { + case 16: + return LLVMIntTypeInContext(gallivm->context, 16); + break; case 32: return LLVMFloatTypeInContext(gallivm->context); break; @@ -85,6 +88,10 @@ lp_check_elem_type(struct lp_type type, LLVMTypeRef elem_type) if (type.floating) { switch(type.width) { + case 16: + if(elem_kind != LLVMIntegerTypeKind) + return FALSE; + break; case 32: if(elem_kind != LLVMFloatTypeKind) return FALSE; @@ -168,27 +175,6 @@ lp_build_int_vec_type(struct gallivm_state *gallivm, struct lp_type type) /** - * Build int32[4] vector type - */ -LLVMTypeRef -lp_build_int32_vec4_type(struct gallivm_state *gallivm) -{ - struct lp_type t; - LLVMTypeRef type; - - memset(&t, 0, sizeof(t)); - t.floating = FALSE; /* floating point values */ - t.sign = TRUE; /* values are signed */ - t.norm = FALSE; /* values are not limited to [0,1] or [-1,1] */ - t.width = 32; /* 32-bit int */ - t.length = 4; /* 4 elements per vector */ - - type = lp_build_int_elem_type(gallivm, t); - return LLVMVectorType(type, t.length); -} - - -/** * Create element of vector type */ struct lp_type diff --git a/src/gallium/auxiliary/gallivm/lp_bld_type.h b/src/gallium/auxiliary/gallivm/lp_bld_type.h index f11a190e7cc..75310e05f3e 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_type.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_type.h @@ -40,21 +40,35 @@ #include "pipe/p_compiler.h" #include "gallivm/lp_bld.h" +/** + * Native SIMD architecture width available at runtime. + * + * Using this width should give the best performance, + * and it determines the necessary alignment of vector variables. + */ +extern unsigned lp_native_vector_width; +/** + * Maximum supported vector width (not necessarily supported at run-time). + * + * Should only be used when lp_native_vector_width isn't available, + * i.e. sizing/alignment of non-malloced variables. + */ +#define LP_MAX_VECTOR_WIDTH 256 /** - * Native SIMD register width. + * Minimum vector alignment for static variable alignment * - * 128 for all architectures we care about. + * It should always be a constant equal to LP_MAX_VECTOR_WIDTH/8. An + * expression is non-portable. */ -#define LP_NATIVE_VECTOR_WIDTH 128 +#define LP_MIN_VECTOR_ALIGN 32 /** * Several functions can only cope with vectors of length up to this value. 
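lp_native_vector_width is the new runtime knob here; one plausible initialization based on the CPU caps already consulted throughout this series (a hedged sketch, the real initialization lives elsewhere in gallivm and may honor debug overrides):

#include <assert.h>
#include "util/u_cpu_detect.h"
#include "lp_bld_type.h"

static void
init_native_vector_width(void)
{
   /* assumption: 256-bit vectors only pay off with AVX */
   lp_native_vector_width = util_cpu_caps.has_avx ? 256 : 128;
   assert(lp_native_vector_width <= LP_MAX_VECTOR_WIDTH);
}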
* You may need to increase that value if you want to represent bigger vectors. */ -#define LP_MAX_VECTOR_LENGTH 16 - +#define LP_MAX_VECTOR_LENGTH (LP_MAX_VECTOR_WIDTH/8) /** * The LLVM type system can't conveniently express all the things we care about @@ -151,6 +165,13 @@ struct lp_build_context }; +static INLINE unsigned +lp_type_width(struct lp_type type) +{ + return type.width * type.length; +} + + /** Create scalar float type */ static INLINE struct lp_type lp_type_float(unsigned width) @@ -169,7 +190,7 @@ lp_type_float(unsigned width) /** Create vector of float type */ static INLINE struct lp_type -lp_type_float_vec(unsigned width) +lp_type_float_vec(unsigned width, unsigned total_width) { struct lp_type res_type; @@ -177,7 +198,7 @@ lp_type_float_vec(unsigned width) res_type.floating = TRUE; res_type.sign = TRUE; res_type.width = width; - res_type.length = LP_NATIVE_VECTOR_WIDTH / width; + res_type.length = total_width / width; return res_type; } @@ -200,14 +221,14 @@ lp_type_int(unsigned width) /** Create vector int type */ static INLINE struct lp_type -lp_type_int_vec(unsigned width) +lp_type_int_vec(unsigned width, unsigned total_width) { struct lp_type res_type; memset(&res_type, 0, sizeof res_type); res_type.sign = TRUE; res_type.width = width; - res_type.length = LP_NATIVE_VECTOR_WIDTH / width; + res_type.length = total_width / width; return res_type; } @@ -229,34 +250,34 @@ lp_type_uint(unsigned width) /** Create vector uint type */ static INLINE struct lp_type -lp_type_uint_vec(unsigned width) +lp_type_uint_vec(unsigned width, unsigned total_width) { struct lp_type res_type; memset(&res_type, 0, sizeof res_type); res_type.width = width; - res_type.length = LP_NATIVE_VECTOR_WIDTH / width; + res_type.length = total_width / width; return res_type; } static INLINE struct lp_type -lp_type_unorm(unsigned width) +lp_type_unorm(unsigned width, unsigned total_width) { struct lp_type res_type; memset(&res_type, 0, sizeof res_type); res_type.norm = TRUE; res_type.width = width; - res_type.length = LP_NATIVE_VECTOR_WIDTH / width; + res_type.length = total_width / width; return res_type; } static INLINE struct lp_type -lp_type_fixed(unsigned width) +lp_type_fixed(unsigned width, unsigned total_width) { struct lp_type res_type; @@ -264,21 +285,21 @@ lp_type_fixed(unsigned width) res_type.sign = TRUE; res_type.fixed = TRUE; res_type.width = width; - res_type.length = LP_NATIVE_VECTOR_WIDTH / width; + res_type.length = total_width / width; return res_type; } static INLINE struct lp_type -lp_type_ufixed(unsigned width) +lp_type_ufixed(unsigned width, unsigned total_width) { struct lp_type res_type; memset(&res_type, 0, sizeof res_type); res_type.fixed = TRUE; res_type.width = width; - res_type.length = LP_NATIVE_VECTOR_WIDTH / width; + res_type.length = total_width / width; return res_type; } @@ -312,10 +333,6 @@ LLVMTypeRef lp_build_int_vec_type(struct gallivm_state *gallivm, struct lp_type type); -LLVMTypeRef -lp_build_int32_vec4_type(struct gallivm_state *gallivm); - - static INLINE struct lp_type lp_float32_vec4_type(void) {
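Usage sketch for the width-parameterized type constructors above: sizing build contexts to the runtime SIMD width (hypothetical helper, assuming the gallivm headers):

#include "lp_bld_init.h"
#include "lp_bld_type.h"

static void
init_sample_contexts(struct gallivm_state *gallivm,
                     struct lp_build_context *h16_bld,
                     struct lp_build_context *u8n_bld)
{
   unsigned w = lp_native_vector_width;  /* 128 on SSE, 256 with AVX */

   /* 16-bit unsigned fixed point, w/16 lanes */
   lp_build_context_init(h16_bld, gallivm, lp_type_ufixed(16, w));
   /* 8-bit normalized, w/8 lanes */
   lp_build_context_init(u8n_bld, gallivm, lp_type_unorm(8, w));
}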