Diffstat (limited to 'src/gallium/auxiliary/gallivm')
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_arit.c             |  545
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_arit.h             |   19
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_const.c            |   39
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_conv.c             |  149
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_conv.h             |    4
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_debug.cpp          |   22
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_flow.c             |    9
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_format.h           |    7
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_format_aos.c       |    6
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c |  102
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_format_soa.c       |    3
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c       |    4
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_init.c             |  488
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_init.h             |   33
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_intr.c             |   91
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_intr.h             |    9
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_logic.c            |   60
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_logic.h            |    5
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_misc.cpp           |  111
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_misc.h             |   70
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_pack.c             |  339
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_pack.h             |   23
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_quad.c             |   87
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_quad.h             |   14
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_sample.c           |  527
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_sample.h           |   51
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c       | 1344
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h       |    8
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c       |  493
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_swizzle.c          |  164
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_swizzle.h          |   24
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_tgsi.h             |    8
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c         |   10
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c         |   92
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_type.c             |   28
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_type.h             |   59
36 files changed, 3837 insertions, 1210 deletions
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index 9fc57629822..d226dab5b81 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -75,9 +75,9 @@ lp_build_min_simple(struct lp_build_context *bld,
LLVMValueRef a,
LLVMValueRef b)
{
- LLVMBuilderRef builder = bld->gallivm->builder;
const struct lp_type type = bld->type;
const char *intrinsic = NULL;
+ unsigned intr_size;
LLVMValueRef cond;
assert(lp_check_value(type, a));
@@ -85,31 +85,71 @@ lp_build_min_simple(struct lp_build_context *bld,
/* TODO: optimize the constant case */
- if(type.width * type.length == 128) {
- if(type.floating) {
- if(type.width == 32 && util_cpu_caps.has_sse)
+ if (type.floating && util_cpu_caps.has_sse) {
+ if (type.width == 32) {
+ if (type.length == 1) {
+ intrinsic = "llvm.x86.sse.min.ss";
+ intr_size = 128;
+ }
+ else if (type.length <= 4 || !util_cpu_caps.has_avx) {
intrinsic = "llvm.x86.sse.min.ps";
- if(type.width == 64 && util_cpu_caps.has_sse2)
+ intr_size = 128;
+ }
+ else {
+ intrinsic = "llvm.x86.avx.min.ps.256";
+ intr_size = 256;
+ }
+ }
+ if (type.width == 64 && util_cpu_caps.has_sse2) {
+ if (type.length == 1) {
+ intrinsic = "llvm.x86.sse2.min.sd";
+ intr_size = 128;
+ }
+ else if (type.length == 2 || !util_cpu_caps.has_avx) {
intrinsic = "llvm.x86.sse2.min.pd";
+ intr_size = 128;
+ }
+ else {
+ intrinsic = "llvm.x86.avx.min.pd.256";
+ intr_size = 256;
+ }
}
- else {
- if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
- intrinsic = "llvm.x86.sse2.pminu.b";
- if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
+ }
+ else if (util_cpu_caps.has_sse2 && type.length >= 2) {
+ intr_size = 128;
+ if ((type.width == 8 || type.width == 16) &&
+ (type.width * type.length <= 64) &&
+ (gallivm_debug & GALLIVM_DEBUG_PERF)) {
+ debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
+ __FUNCTION__);
+ }
+ if (type.width == 8 && !type.sign) {
+ intrinsic = "llvm.x86.sse2.pminu.b";
+ }
+ else if (type.width == 16 && type.sign) {
+ intrinsic = "llvm.x86.sse2.pmins.w";
+ }
+ if (util_cpu_caps.has_sse4_1) {
+ if (type.width == 8 && type.sign) {
intrinsic = "llvm.x86.sse41.pminsb";
- if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
+ }
+ if (type.width == 16 && !type.sign) {
intrinsic = "llvm.x86.sse41.pminuw";
- if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
- intrinsic = "llvm.x86.sse2.pmins.w";
- if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
+ }
+ if (type.width == 32 && !type.sign) {
intrinsic = "llvm.x86.sse41.pminud";
- if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
+ }
+ if (type.width == 32 && type.sign) {
intrinsic = "llvm.x86.sse41.pminsd";
+ }
}
}
- if(intrinsic)
- return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
+ if(intrinsic) {
+ return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
+ type,
+ intr_size, a, b);
+ }
cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
return lp_build_select(bld, cond, a, b);
@@ -125,9 +165,9 @@ lp_build_max_simple(struct lp_build_context *bld,
LLVMValueRef a,
LLVMValueRef b)
{
- LLVMBuilderRef builder = bld->gallivm->builder;
const struct lp_type type = bld->type;
const char *intrinsic = NULL;
+ unsigned intr_size;
LLVMValueRef cond;
assert(lp_check_value(type, a));
@@ -135,31 +175,72 @@ lp_build_max_simple(struct lp_build_context *bld,
/* TODO: optimize the constant case */
- if(type.width * type.length == 128) {
- if(type.floating) {
- if(type.width == 32 && util_cpu_caps.has_sse)
+ if (type.floating && util_cpu_caps.has_sse) {
+ if (type.width == 32) {
+ if (type.length == 1) {
+ intrinsic = "llvm.x86.sse.max.ss";
+ intr_size = 128;
+ }
+ else if (type.length <= 4 || !util_cpu_caps.has_avx) {
intrinsic = "llvm.x86.sse.max.ps";
- if(type.width == 64 && util_cpu_caps.has_sse2)
+ intr_size = 128;
+ }
+ else {
+ intrinsic = "llvm.x86.avx.max.ps.256";
+ intr_size = 256;
+ }
+ }
+ if (type.width == 64 && util_cpu_caps.has_sse2) {
+ if (type.length == 1) {
+ intrinsic = "llvm.x86.sse2.max.sd";
+ intr_size = 128;
+ }
+ else if (type.length == 2 || !util_cpu_caps.has_avx) {
intrinsic = "llvm.x86.sse2.max.pd";
+ intr_size = 128;
+ }
+ else {
+ intrinsic = "llvm.x86.avx.max.pd.256";
+ intr_size = 256;
+ }
}
- else {
- if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
- intrinsic = "llvm.x86.sse2.pmaxu.b";
- if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
+ }
+ else if (util_cpu_caps.has_sse2 && type.length >= 2) {
+ intr_size = 128;
+ if ((type.width == 8 || type.width == 16) &&
+ (type.width * type.length <= 64) &&
+ (gallivm_debug & GALLIVM_DEBUG_PERF)) {
+ debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
+ __FUNCTION__);
+ }
+ if (type.width == 8 && !type.sign) {
+ intrinsic = "llvm.x86.sse2.pmaxu.b";
+ intr_size = 128;
+ }
+ else if (type.width == 16 && type.sign) {
+ intrinsic = "llvm.x86.sse2.pmaxs.w";
+ }
+ if (util_cpu_caps.has_sse4_1) {
+ if (type.width == 8 && type.sign) {
intrinsic = "llvm.x86.sse41.pmaxsb";
- if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
+ }
+ if (type.width == 16 && !type.sign) {
intrinsic = "llvm.x86.sse41.pmaxuw";
- if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
- intrinsic = "llvm.x86.sse2.pmaxs.w";
- if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
+ }
+ if (type.width == 32 && !type.sign) {
intrinsic = "llvm.x86.sse41.pmaxud";
- if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
+ }
+ if (type.width == 32 && type.sign) {
intrinsic = "llvm.x86.sse41.pmaxsd";
+ }
}
}
- if(intrinsic)
- return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
+ if(intrinsic) {
+ return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
+ type,
+ intr_size, a, b);
+ }
cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
return lp_build_select(bld, cond, a, b);
@@ -265,15 +346,20 @@ lp_build_add(struct lp_build_context *bld,
}
-/** Return the scalar sum of the elements of a */
+/** Return the scalar sum of the elements of a.
+ * Callers should avoid this operation whenever possible.
+ */
LLVMValueRef
-lp_build_sum_vector(struct lp_build_context *bld,
- LLVMValueRef a)
+lp_build_horizontal_add(struct lp_build_context *bld,
+ LLVMValueRef a)
{
LLVMBuilderRef builder = bld->gallivm->builder;
const struct lp_type type = bld->type;
LLVMValueRef index, res;
- unsigned i;
+ unsigned i, length;
+ LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
+ LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
+ LLVMValueRef vecres, elem2;
assert(lp_check_value(type, a));
@@ -283,26 +369,191 @@ lp_build_sum_vector(struct lp_build_context *bld,
assert(!bld->type.norm);
- index = lp_build_const_int32(bld->gallivm, 0);
- res = LLVMBuildExtractElement(builder, a, index, "");
+ /*
+    * For byte vectors we could do much better with psadbw.
+    * We use repeated shuffles/adds here. Note that with multiple vectors
+    * this can be done more efficiently, as outlined in the Intel
+    * optimization manual.
+    * Note: this could cause data rearrangement if used with smaller element
+    * sizes.
+ */
- for (i = 1; i < type.length; i++) {
- index = lp_build_const_int32(bld->gallivm, i);
- if (type.floating)
- res = LLVMBuildFAdd(builder, res,
- LLVMBuildExtractElement(builder,
- a, index, ""),
- "");
- else
- res = LLVMBuildAdd(builder, res,
- LLVMBuildExtractElement(builder,
- a, index, ""),
- "");
+ vecres = a;
+ length = type.length / 2;
+ while (length > 1) {
+ LLVMValueRef vec1, vec2;
+ for (i = 0; i < length; i++) {
+ shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
+ shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
+ }
+ vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
+ LLVMConstVector(shuffles1, length), "");
+ vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
+ LLVMConstVector(shuffles2, length), "");
+ if (type.floating) {
+ vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
+ }
+ else {
+ vecres = LLVMBuildAdd(builder, vec1, vec2, "");
+ }
+ length = length >> 1;
}
+ /* always have vector of size 2 here */
+ assert(length == 1);
+
+ index = lp_build_const_int32(bld->gallivm, 0);
+ res = LLVMBuildExtractElement(builder, vecres, index, "");
+ index = lp_build_const_int32(bld->gallivm, 1);
+ elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
+
+ if (type.floating)
+ res = LLVMBuildFAdd(builder, res, elem2, "");
+ else
+ res = LLVMBuildAdd(builder, res, elem2, "");
+
return res;
}
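
The loop above performs a log2(n) pairwise reduction: each iteration splits the live elements in half and adds the two halves together. A scalar model of the same dataflow (an illustrative sketch assuming a power-of-two vector length; not part of the patch):

#include <string.h>

/* Hypothetical scalar equivalent of lp_build_horizontal_add's shuffle loop. */
static float
horizontal_add_sketch(const float *v, unsigned length)
{
   float tmp[32];                        /* assumes length <= 32 */
   unsigned i;

   memcpy(tmp, v, length * sizeof tmp[0]);
   while (length > 1) {
      length /= 2;                       /* the vec1/vec2 shuffles above */
      for (i = 0; i < length; i++)
         tmp[i] += tmp[i + length];      /* the FAdd/Add of the two halves */
   }
   return tmp[0];
}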
+/**
+ * Return the horizontal sums of 4 float vectors as a float4 vector.
+ * This uses the technique as outlined in Intel Optimization Manual.
+ */
+static LLVMValueRef
+lp_build_horizontal_add4x4f(struct lp_build_context *bld,
+ LLVMValueRef src[4])
+{
+ struct gallivm_state *gallivm = bld->gallivm;
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef shuffles[4];
+ LLVMValueRef tmp[4];
+ LLVMValueRef sumtmp[2], shuftmp[2];
+
+ /* lower half of regs */
+ shuffles[0] = lp_build_const_int32(gallivm, 0);
+ shuffles[1] = lp_build_const_int32(gallivm, 1);
+ shuffles[2] = lp_build_const_int32(gallivm, 4);
+ shuffles[3] = lp_build_const_int32(gallivm, 5);
+ tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
+ LLVMConstVector(shuffles, 4), "");
+ tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
+ LLVMConstVector(shuffles, 4), "");
+
+ /* upper half of regs */
+ shuffles[0] = lp_build_const_int32(gallivm, 2);
+ shuffles[1] = lp_build_const_int32(gallivm, 3);
+ shuffles[2] = lp_build_const_int32(gallivm, 6);
+ shuffles[3] = lp_build_const_int32(gallivm, 7);
+ tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
+ LLVMConstVector(shuffles, 4), "");
+ tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
+ LLVMConstVector(shuffles, 4), "");
+
+ sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
+ sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
+
+ shuffles[0] = lp_build_const_int32(gallivm, 0);
+ shuffles[1] = lp_build_const_int32(gallivm, 2);
+ shuffles[2] = lp_build_const_int32(gallivm, 4);
+ shuffles[3] = lp_build_const_int32(gallivm, 6);
+ shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
+ LLVMConstVector(shuffles, 4), "");
+
+ shuffles[0] = lp_build_const_int32(gallivm, 1);
+ shuffles[1] = lp_build_const_int32(gallivm, 3);
+ shuffles[2] = lp_build_const_int32(gallivm, 5);
+ shuffles[3] = lp_build_const_int32(gallivm, 7);
+ shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
+ LLVMConstVector(shuffles, 4), "");
+
+ return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
+}
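
In SSE intrinsic form the dataflow built above looks roughly like this (a sketch of the Intel-manual technique, assuming SSE1; not code from this patch):

#include <xmmintrin.h>

/* Returns {hsum(a), hsum(b), hsum(c), hsum(d)}. */
static __m128
hadd4x4f_sketch(__m128 a, __m128 b, __m128 c, __m128 d)
{
   __m128 lo01 = _mm_movelh_ps(a, b);               /* a0 a1 b0 b1 */
   __m128 hi01 = _mm_movehl_ps(b, a);               /* a2 a3 b2 b3 */
   __m128 lo23 = _mm_movelh_ps(c, d);               /* c0 c1 d0 d1 */
   __m128 hi23 = _mm_movehl_ps(d, c);               /* c2 c3 d2 d3 */
   __m128 sum01 = _mm_add_ps(lo01, hi01);           /* a0+a2 a1+a3 b0+b2 b1+b3 */
   __m128 sum23 = _mm_add_ps(lo23, hi23);           /* c0+c2 c1+c3 d0+d2 d1+d3 */
   /* gather the even/odd partial sums and add them */
   __m128 even = _mm_shuffle_ps(sum01, sum23, _MM_SHUFFLE(2, 0, 2, 0));
   __m128 odd  = _mm_shuffle_ps(sum01, sum23, _MM_SHUFFLE(3, 1, 3, 1));
   return _mm_add_ps(even, odd);
}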
+
+
+/*
+ * Partially horizontally add 2-4 float vectors with length nx4,
+ * i.e. only four adjacent values in each vector will be added,
+ * assuming values are really grouped in 4 which also determines
+ * output order.
+ *
+ * Return a vector of the same length as the initial vectors,
+ * with the excess elements (if any) being undefined.
+ * The element order is independent of the number of input vectors.
+ * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
+ * the output order thus will be
+ * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
+ */
+LLVMValueRef
+lp_build_hadd_partial4(struct lp_build_context *bld,
+ LLVMValueRef vectors[],
+ unsigned num_vecs)
+{
+ struct gallivm_state *gallivm = bld->gallivm;
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef ret_vec;
+ LLVMValueRef tmp[4];
+ const char *intrinsic = NULL;
+
+ assert(num_vecs >= 2 && num_vecs <= 4);
+ assert(bld->type.floating);
+
+   /* Only use this with at least 2 vectors, as it is sort of expensive
+    * (depending on the cpu) and we always need two horizontal adds anyway;
+    * for a single vector a shuffle/add approach might be better.
+    */
+
+ tmp[0] = vectors[0];
+ tmp[1] = vectors[1];
+
+ tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
+ tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
+
+ if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
+ bld->type.length == 4) {
+ intrinsic = "llvm.x86.sse3.hadd.ps";
+ }
+ else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
+ bld->type.length == 8) {
+ intrinsic = "llvm.x86.avx.hadd.ps.256";
+ }
+ if (intrinsic) {
+ tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
+ lp_build_vec_type(gallivm, bld->type),
+ tmp[0], tmp[1]);
+ if (num_vecs > 2) {
+ tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
+ lp_build_vec_type(gallivm, bld->type),
+ tmp[2], tmp[3]);
+ }
+ else {
+ tmp[1] = tmp[0];
+ }
+ return lp_build_intrinsic_binary(builder, intrinsic,
+ lp_build_vec_type(gallivm, bld->type),
+ tmp[0], tmp[1]);
+ }
+
+ if (bld->type.length == 4) {
+ ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
+ }
+ else {
+ LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
+ unsigned j;
+ unsigned num_iter = bld->type.length / 4;
+ struct lp_type parttype = bld->type;
+ parttype.length = 4;
+ for (j = 0; j < num_iter; j++) {
+ LLVMValueRef partsrc[4];
+ unsigned i;
+ for (i = 0; i < 4; i++) {
+ partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
+ }
+ partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
+ }
+ ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
+ }
+ return ret_vec;
+}
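
A hypothetical call site (names are illustrative only), e.g. summing each group of four products when computing several dot products at once:

/* Hypothetical helper sketch, not part of the patch. */
static LLVMValueRef
sum_groups_of_four_sketch(struct lp_build_context *bld,
                          LLVMValueRef x, LLVMValueRef y, LLVMValueRef z)
{
   LLVMValueRef vecs[3];
   vecs[0] = x;   /* x0..x7 */
   vecs[1] = y;   /* y0..y7 */
   vecs[2] = z;   /* z0..z7 */
   /* -> {sum x0-x3, sum y0-y3, sum z0-z3, undef, sum x4-x7, ...} */
   return lp_build_hadd_partial4(bld, vecs, 3);
}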
/**
* Generate a - b
@@ -553,7 +804,7 @@ lp_build_mul_imm(struct lp_build_context *bld,
if(bld->type.floating) {
#if 0
/*
- * Power of two multiplication by directly manipulating the mantissa.
+ * Power of two multiplication by directly manipulating the exponent.
*
* XXX: This might not be always faster, it will introduce a small error
* for multiplication by zero, and it will produce wrong results
@@ -612,7 +863,8 @@ lp_build_div(struct lp_build_context *bld,
return LLVMConstUDiv(a, b);
}
- if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4 &&
+ if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
+ (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
type.floating)
return lp_build_mul(bld, a, lp_build_rcp(bld, b));
@@ -871,6 +1123,12 @@ lp_build_abs(struct lp_build_context *bld,
return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
}
}
+ else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
+ (gallivm_debug & GALLIVM_DEBUG_PERF) &&
+ (type.width == 8 || type.width == 16 || type.width == 32)) {
+ debug_printf("%s: inefficient code, should split vectors manually\n",
+ __FUNCTION__);
+ }
return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
}
@@ -934,6 +1192,7 @@ lp_build_sgn(struct lp_build_context *bld,
else
{
/* signed int/norm/fixed point */
+ /* could use psign with sse3 and appropriate vectors here */
LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
res = lp_build_select(bld, cond, bld->one, minus_one);
@@ -1000,7 +1259,16 @@ lp_build_int_to_float(struct lp_build_context *bld,
return LLVMBuildSIToFP(builder, a, vec_type, "");
}
+static boolean
+sse41_rounding_available(const struct lp_type type)
+{
+ if ((util_cpu_caps.has_sse4_1 &&
+ (type.length == 1 || type.width*type.length == 128)) ||
+ (util_cpu_caps.has_avx && type.width*type.length == 256))
+ return TRUE;
+ return FALSE;
+}
enum lp_build_round_sse41_mode
{
@@ -1065,18 +1333,34 @@ lp_build_round_sse41(struct lp_build_context *bld,
res = LLVMBuildExtractElement(builder, res, index0, "");
}
else {
- assert(type.width*type.length == 128);
-
- switch(type.width) {
- case 32:
- intrinsic = "llvm.x86.sse41.round.ps";
- break;
- case 64:
- intrinsic = "llvm.x86.sse41.round.pd";
- break;
- default:
- assert(0);
- return bld->undef;
+ if (type.width * type.length == 128) {
+ switch(type.width) {
+ case 32:
+ intrinsic = "llvm.x86.sse41.round.ps";
+ break;
+ case 64:
+ intrinsic = "llvm.x86.sse41.round.pd";
+ break;
+ default:
+ assert(0);
+ return bld->undef;
+ }
+ }
+ else {
+ assert(type.width * type.length == 256);
+ assert(util_cpu_caps.has_avx);
+
+ switch(type.width) {
+ case 32:
+ intrinsic = "llvm.x86.avx.round.ps.256";
+ break;
+ case 64:
+ intrinsic = "llvm.x86.avx.round.pd.256";
+ break;
+ default:
+ assert(0);
+ return bld->undef;
+ }
}
res = lp_build_intrinsic_binary(builder, intrinsic,
@@ -1125,10 +1409,15 @@ lp_build_iround_nearest_sse2(struct lp_build_context *bld,
ret_type, arg);
}
else {
- assert(type.width*type.length == 128);
-
- intrinsic = "llvm.x86.sse2.cvtps2dq";
+      if (type.width * type.length == 128) {
+ intrinsic = "llvm.x86.sse2.cvtps2dq";
+ }
+ else {
+ assert(type.width*type.length == 256);
+ assert(util_cpu_caps.has_avx);
+ intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
+ }
res = lp_build_intrinsic_unary(builder, intrinsic,
ret_type, a);
}
@@ -1152,8 +1441,7 @@ lp_build_trunc(struct lp_build_context *bld,
assert(type.floating);
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
+ if (sse41_rounding_available(type)) {
return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
}
else {
@@ -1183,8 +1471,7 @@ lp_build_round(struct lp_build_context *bld,
assert(type.floating);
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
+ if (sse41_rounding_available(type)) {
return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
}
else {
@@ -1212,8 +1499,7 @@ lp_build_floor(struct lp_build_context *bld,
assert(type.floating);
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
+ if (sse41_rounding_available(type)) {
return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
}
else {
@@ -1241,8 +1527,7 @@ lp_build_ceil(struct lp_build_context *bld,
assert(type.floating);
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
+ if (sse41_rounding_available(type)) {
return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
}
else {
@@ -1269,6 +1554,34 @@ lp_build_fract(struct lp_build_context *bld,
/**
+ * Prevent returning a fractional part of 1.0 for very small negative values of
+ * 'a' by clamping against 0.99999(9).
+ */
+static inline LLVMValueRef
+clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
+{
+ LLVMValueRef max;
+
+   /* this is the largest number smaller than 1.0 representable in the
+    * given float type, e.g. 1.0 - 2^-24 = 0.99999994f for 32-bit floats */
+ max = lp_build_const_vec(bld->gallivm, bld->type,
+ 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
+ return lp_build_min(bld, fract, max);
+}
+
+
+/**
+ * Same as lp_build_fract, but guarantees that the result is always smaller
+ * than one.
+ */
+LLVMValueRef
+lp_build_fract_safe(struct lp_build_context *bld,
+ LLVMValueRef a)
+{
+ return clamp_fract(bld, lp_build_fract(bld, a));
+}
+
+
+/**
* Return the integer part of a float (vector) value (== round toward zero).
* The returned value is an integer (vector).
* Ex: itrunc(-1.5) = -1
@@ -1307,12 +1620,12 @@ lp_build_iround(struct lp_build_context *bld,
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse2 &&
- ((type.width == 32) && (type.length == 1 || type.length == 4))) {
+ if ((util_cpu_caps.has_sse2 &&
+ ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
+ (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
return lp_build_iround_nearest_sse2(bld, a);
}
- else if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
+ if (sse41_rounding_available(type)) {
res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
}
else {
@@ -1362,14 +1675,12 @@ lp_build_ifloor(struct lp_build_context *bld,
assert(type.floating);
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
- res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
- }
- else {
- res = a;
-
- if (type.sign) {
+ res = a;
+ if (type.sign) {
+ if (sse41_rounding_available(type)) {
+ res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
+ }
+ else {
/* Take the sign bit and add it to 1 constant */
LLVMTypeRef vec_type = bld->vec_type;
unsigned mantissa = lp_mantissa(type);
@@ -1423,8 +1734,7 @@ lp_build_iceil(struct lp_build_context *bld,
assert(type.floating);
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
+ if (sse41_rounding_available(type)) {
res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
}
else {
@@ -1470,7 +1780,7 @@ lp_build_iceil(struct lp_build_context *bld,
* Combined ifloor() & fract().
*
* Preferred to calling the functions separately, as it will ensure that the
- * stratergy (floor() vs ifloor()) that results in less redundant work is used.
+ * strategy (floor() vs ifloor()) that results in less redundant work is used.
*/
void
lp_build_ifloor_fract(struct lp_build_context *bld,
@@ -1485,8 +1795,7 @@ lp_build_ifloor_fract(struct lp_build_context *bld,
assert(type.floating);
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
+ if (sse41_rounding_available(type)) {
/*
* floor() is easier.
*/
@@ -1507,6 +1816,21 @@ lp_build_ifloor_fract(struct lp_build_context *bld,
}
+/**
+ * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
+ * always smaller than one.
+ */
+void
+lp_build_ifloor_fract_safe(struct lp_build_context *bld,
+ LLVMValueRef a,
+ LLVMValueRef *out_ipart,
+ LLVMValueRef *out_fpart)
+{
+ lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
+ *out_fpart = clamp_fract(bld, *out_fpart);
+}
+
+
LLVMValueRef
lp_build_sqrt(struct lp_build_context *bld,
LLVMValueRef a)
@@ -1519,10 +1843,14 @@ lp_build_sqrt(struct lp_build_context *bld,
assert(lp_check_value(type, a));
/* TODO: optimize the constant case */
- /* TODO: optimize the constant case */
assert(type.floating);
- util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
+ if (type.length == 1) {
+ util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.f%u", type.width);
+ }
+ else {
+ util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
+ }
return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
}
@@ -1586,19 +1914,28 @@ lp_build_rcp(struct lp_build_context *bld,
* - it doesn't even get the reciprocate of 1.0 exactly
* - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf
* - for recent processors the benefit over DIVPS is marginal, a case
- * depedent
+ * dependent
*
* We could still use it on certain processors if benchmarks show that the
* RCPPS plus necessary workarounds are still preferrable to DIVPS; or for
* particular uses that require less workarounds.
*/
- if (FALSE && util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
+ if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
+ (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
const unsigned num_iterations = 0;
LLVMValueRef res;
unsigned i;
+ const char *intrinsic = NULL;
- res = lp_build_intrinsic_unary(builder, "llvm.x86.sse.rcp.ps", bld->vec_type, a);
+ if (type.length == 4) {
+ intrinsic = "llvm.x86.sse.rcp.ps";
+ }
+ else {
+ intrinsic = "llvm.x86.avx.rcp.ps.256";
+ }
+
+ res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
for (i = 0; i < num_iterations; ++i) {
res = lp_build_rcp_refine(bld, a, res);
@@ -1653,12 +1990,22 @@ lp_build_rsqrt(struct lp_build_context *bld,
assert(type.floating);
- if (util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
+ if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
+ (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
const unsigned num_iterations = 1;
LLVMValueRef res;
unsigned i;
+ const char *intrinsic = NULL;
+
+ if (type.length == 4) {
+ intrinsic = "llvm.x86.sse.rsqrt.ps";
+ }
+ else {
+ intrinsic = "llvm.x86.avx.rsqrt.ps.256";
+ }
+
+ res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
- res = lp_build_intrinsic_unary(builder, "llvm.x86.sse.rsqrt.ps", bld->vec_type, a);
for (i = 0; i < num_iterations; ++i) {
res = lp_build_rsqrt_refine(bld, a, res);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.h b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
index aeb987ff352..60b9907e60f 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
@@ -57,8 +57,13 @@ lp_build_add(struct lp_build_context *bld,
LLVMValueRef b);
LLVMValueRef
-lp_build_sum_vector(struct lp_build_context *bld,
- LLVMValueRef a);
+lp_build_horizontal_add(struct lp_build_context *bld,
+ LLVMValueRef a);
+
+LLVMValueRef
+lp_build_hadd_partial4(struct lp_build_context *bld,
+ LLVMValueRef vectors[],
+ unsigned num_vecs);
LLVMValueRef
lp_build_sub(struct lp_build_context *bld,
@@ -157,6 +162,10 @@ lp_build_fract(struct lp_build_context *bld,
LLVMValueRef a);
LLVMValueRef
+lp_build_fract_safe(struct lp_build_context *bld,
+ LLVMValueRef a);
+
+LLVMValueRef
lp_build_ifloor(struct lp_build_context *bld,
LLVMValueRef a);
LLVMValueRef
@@ -177,6 +186,12 @@ lp_build_ifloor_fract(struct lp_build_context *bld,
LLVMValueRef *out_ipart,
LLVMValueRef *out_fpart);
+void
+lp_build_ifloor_fract_safe(struct lp_build_context *bld,
+ LLVMValueRef a,
+ LLVMValueRef *out_ipart,
+ LLVMValueRef *out_fpart);
+
LLVMValueRef
lp_build_sqrt(struct lp_build_context *bld,
LLVMValueRef a);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_const.c b/src/gallium/auxiliary/gallivm/lp_bld_const.c
index 59e8fb2ed6e..35799a1ef8e 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_const.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_const.c
@@ -37,6 +37,7 @@
#include "util/u_debug.h"
#include "util/u_math.h"
+#include "util/u_half.h"
#include "lp_bld_type.h"
#include "lp_bld_const.h"
@@ -50,10 +51,12 @@ lp_mantissa(struct lp_type type)
if(type.floating) {
switch(type.width) {
+ case 16:
+ return 10;
case 32:
return 23;
case 64:
- return 53;
+ return 52;
default:
assert(0);
return 0;
@@ -136,6 +139,8 @@ lp_const_min(struct lp_type type)
if (type.floating) {
switch(type.width) {
+ case 16:
+ return -65504;
case 32:
return -FLT_MAX;
case 64:
@@ -169,6 +174,8 @@ lp_const_max(struct lp_type type)
if (type.floating) {
switch(type.width) {
+ case 16:
+ return 65504;
case 32:
return FLT_MAX;
case 64:
@@ -196,6 +203,8 @@ lp_const_eps(struct lp_type type)
{
if (type.floating) {
switch(type.width) {
+ case 16:
+      return 9.765625E-4; /* 2^-10, half-float machine epsilon */
case 32:
return FLT_EPSILON;
case 64:
@@ -247,7 +256,9 @@ lp_build_one(struct gallivm_state *gallivm, struct lp_type type)
elem_type = lp_build_elem_type(gallivm, type);
- if(type.floating)
+ if(type.floating && type.width == 16)
+ elems[0] = LLVMConstInt(elem_type, util_float_to_half(1.0f), 0);
+ else if(type.floating)
elems[0] = LLVMConstReal(elem_type, 1.0);
else if(type.fixed)
elems[0] = LLVMConstInt(elem_type, 1LL << (type.width/2), 0);
@@ -292,7 +303,9 @@ lp_build_const_elem(struct gallivm_state *gallivm,
LLVMTypeRef elem_type = lp_build_elem_type(gallivm, type);
LLVMValueRef elem;
- if(type.floating) {
+ if(type.floating && type.width == 16) {
+ elem = LLVMConstInt(elem_type, util_float_to_half((float)val), 0);
+ } else if(type.floating) {
elem = LLVMConstReal(elem_type, val);
}
else {
@@ -364,20 +377,10 @@ lp_build_const_aos(struct gallivm_state *gallivm,
if(swizzle == NULL)
swizzle = default_swizzle;
- if(type.floating) {
- elems[swizzle[0]] = LLVMConstReal(elem_type, r);
- elems[swizzle[1]] = LLVMConstReal(elem_type, g);
- elems[swizzle[2]] = LLVMConstReal(elem_type, b);
- elems[swizzle[3]] = LLVMConstReal(elem_type, a);
- }
- else {
- double dscale = lp_const_scale(type);
-
- elems[swizzle[0]] = LLVMConstInt(elem_type, round(r*dscale), 0);
- elems[swizzle[1]] = LLVMConstInt(elem_type, round(g*dscale), 0);
- elems[swizzle[2]] = LLVMConstInt(elem_type, round(b*dscale), 0);
- elems[swizzle[3]] = LLVMConstInt(elem_type, round(a*dscale), 0);
- }
+ elems[swizzle[0]] = lp_build_const_elem(gallivm, type, r);
+ elems[swizzle[1]] = lp_build_const_elem(gallivm, type, g);
+ elems[swizzle[2]] = lp_build_const_elem(gallivm, type, b);
+ elems[swizzle[3]] = lp_build_const_elem(gallivm, type, a);
for(i = 4; i < type.length; ++i)
elems[i] = elems[i % 4];
@@ -452,7 +455,7 @@ lp_build_const_string(struct gallivm_state *gallivm,
/**
* Build a callable function pointer.
*
- * We this casts instead of LLVMAddGlobalMapping()
+ * We use function pointer constants instead of LLVMAddGlobalMapping()
* to work around a bug in LLVM 2.6, and for efficiency/simplicity.
*/
LLVMValueRef
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
index 0973e1f16f3..0399709faad 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -70,6 +70,66 @@
#include "lp_bld_arit.h"
#include "lp_bld_pack.h"
#include "lp_bld_conv.h"
+#include "lp_bld_logic.h"
+
+
+/**
+ * Converts a vector of half-floats (stored in int16 lanes) to float32.
+ * Note this can be performed in one instruction where vcvtph2ps is
+ * available (the F16C extension) [llvm.x86.vcvtph2ps / _mm_cvtph_ps].
+ *
+ * @param src_type <vector> type of int16
+ * @param src value to convert
+ *
+ * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
+ */
+LLVMValueRef
+lp_build_half_to_float(struct gallivm_state *gallivm,
+ struct lp_type src_type,
+ LLVMValueRef src)
+{
+ struct lp_type f32_type = lp_type_float_vec(32, 32 * src_type.length);
+ struct lp_type i32_type = lp_type_int_vec(32, 32 * src_type.length);
+
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type);
+ LLVMTypeRef float_vec_type = lp_build_vec_type(gallivm, f32_type);
+
+ /* Constants */
+ LLVMValueRef i32_13 = lp_build_const_int_vec(gallivm, i32_type, 13);
+ LLVMValueRef i32_16 = lp_build_const_int_vec(gallivm, i32_type, 16);
+ LLVMValueRef i32_mask_nosign = lp_build_const_int_vec(gallivm, i32_type, 0x7fff);
+ LLVMValueRef i32_was_infnan = lp_build_const_int_vec(gallivm, i32_type, 0x7bff);
+ LLVMValueRef i32_exp_infnan = lp_build_const_int_vec(gallivm, i32_type, 0xff << 23);
+ LLVMValueRef f32_magic = LLVMBuildBitCast(builder,
+ lp_build_const_int_vec(gallivm, i32_type, (254 - 15) << 23),
+ float_vec_type, "");
+
+ /* Convert int16 vector to int32 vector by zero ext */
+ LLVMValueRef h = LLVMBuildZExt(builder, src, int_vec_type, "");
+
+ /* Exponent / mantissa bits */
+ LLVMValueRef expmant = LLVMBuildAnd(builder, i32_mask_nosign, h, "");
+ LLVMValueRef shifted = LLVMBuildBitCast(builder, LLVMBuildShl(builder, expmant, i32_13, ""), float_vec_type, "");
+
+ /* Exponent adjust */
+ LLVMValueRef scaled = LLVMBuildBitCast(builder, LLVMBuildFMul(builder, shifted, f32_magic, ""), int_vec_type, "");
+
+ /* Make sure Inf/NaN survive */
+ LLVMValueRef b_wasinfnan = lp_build_compare(gallivm, i32_type, PIPE_FUNC_GREATER, expmant, i32_was_infnan);
+ LLVMValueRef infnanexp = LLVMBuildAnd(builder, b_wasinfnan, i32_exp_infnan, "");
+
+ /* Sign bit */
+ LLVMValueRef justsign = LLVMBuildXor(builder, h, expmant, "");
+ LLVMValueRef sign = LLVMBuildShl(builder, justsign, i32_16, "");
+
+ /* Combine result */
+ LLVMValueRef sign_inf = LLVMBuildOr(builder, sign, infnanexp, "");
+ LLVMValueRef final = LLVMBuildOr(builder, scaled, sign_inf, "");
+
+ /* Cast from int32 vector to float32 vector */
+ return LLVMBuildBitCast(builder, final, float_vec_type, "");
+}
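
For reference, the same bit trick in scalar form (a sketch assuming IEEE-754 binary32 with round-to-nearest; this helper is illustrative, not part of the patch):

#include <stdint.h>

static float
half_to_float_sketch(uint16_t h)
{
   union { uint32_t u; float f; } magic, o;

   magic.u = (uint32_t)(254 - 15) << 23;  /* 2^112: rebias exponent 15 -> 127 */
   o.u = (uint32_t)(h & 0x7fff) << 13;    /* exponent/mantissa into float position */
   o.f *= magic.f;                        /* adjust exponent; renormalizes denormals */
   if ((h & 0x7fff) > 0x7bff)             /* source was Inf/NaN */
      o.u |= 0xffu << 23;                 /* force the exponent field to all-ones */
   o.u |= (uint32_t)(h & 0x8000) << 16;   /* reapply the sign bit */
   return o.f;
}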
/**
@@ -334,6 +394,8 @@ lp_build_conv(struct gallivm_state *gallivm,
dst_type.width == 8 &&
dst_type.length == 16 &&
+ 4 * num_dsts == num_srcs &&
+
util_cpu_caps.has_sse2)
{
struct lp_build_context bld;
@@ -371,6 +433,76 @@ lp_build_conv(struct gallivm_state *gallivm,
return;
}
+ /* Special case 2x8f --> 1x16ub
+ */
+ else if (src_type.floating == 1 &&
+ src_type.fixed == 0 &&
+ src_type.sign == 1 &&
+ src_type.norm == 0 &&
+ src_type.width == 32 &&
+ src_type.length == 8 &&
+
+ dst_type.floating == 0 &&
+ dst_type.fixed == 0 &&
+ dst_type.sign == 0 &&
+ dst_type.norm == 1 &&
+ dst_type.width == 8 &&
+ dst_type.length == 16 &&
+
+ 2 * num_dsts == num_srcs &&
+
+ util_cpu_caps.has_avx) {
+
+ struct lp_build_context bld;
+ struct lp_type int16_type = dst_type;
+ struct lp_type int32_type = dst_type;
+ LLVMValueRef const_255f;
+ unsigned i;
+
+ lp_build_context_init(&bld, gallivm, src_type);
+
+ int16_type.width *= 2;
+ int16_type.length /= 2;
+ int16_type.sign = 1;
+
+ int32_type.width *= 4;
+ int32_type.length /= 4;
+ int32_type.sign = 1;
+
+ const_255f = lp_build_const_vec(gallivm, src_type, 255.0f);
+
+ for (i = 0; i < num_dsts; ++i, src += 2) {
+ LLVMValueRef lo, hi, a, b;
+
+ a = LLVMBuildFMul(builder, src[0], const_255f, "");
+ b = LLVMBuildFMul(builder, src[1], const_255f, "");
+
+ a = lp_build_iround(&bld, a);
+ b = lp_build_iround(&bld, b);
+
+ tmp[0] = lp_build_extract_range(gallivm, a, 0, 4);
+ tmp[1] = lp_build_extract_range(gallivm, a, 4, 4);
+ tmp[2] = lp_build_extract_range(gallivm, b, 0, 4);
+ tmp[3] = lp_build_extract_range(gallivm, b, 4, 4);
+
+ /* relying on clamping behavior of sse2 intrinsics here */
+ lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
+ hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
+ dst[i] = lp_build_pack2(gallivm, int16_type, dst_type, lo, hi);
+ }
+ return;
+ }
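
The clamping relied on above comes from the saturating SSE2 pack instructions that lp_build_pack2 emits; in intrinsic form the two-step pack looks roughly like this (a sketch, not part of the patch):

#include <emmintrin.h>

/* Four int32x4 -> one uint8x16, with clamping for free: packssdw saturates
 * int32 -> int16 and packuswb saturates int16 -> uint8, so any rounded
 * value outside [0,255] ends up clamped by the hardware. */
static __m128i
pack_4xi32_to_16xu8_sketch(__m128i a, __m128i b, __m128i c, __m128i d)
{
   __m128i lo = _mm_packs_epi32(a, b);
   __m128i hi = _mm_packs_epi32(c, d);
   return _mm_packus_epi16(lo, hi);
}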
+
+   /* Pre-convert half-floats to floats
+    */
+ else if (src_type.floating && src_type.width == 16)
+ {
+ for(i = 0; i < num_tmps; ++i)
+ tmp[i] = lp_build_half_to_float(gallivm, src_type, tmp[i]);
+
+ tmp_type.width = 32;
+ }
+
/*
* Clamp if necessary
*/
@@ -580,7 +712,7 @@ lp_build_conv(struct gallivm_state *gallivm,
* This will convert the integer masks that match the given types.
*
 * The mask values should be 0 or -1, i.e., all bits either set to zero or one.
- * Any other value will likely cause in unpredictable results.
+ * Any other value will likely cause unpredictable results.
*
* This is basically a very trimmed down version of lp_build_conv.
*/
@@ -591,8 +723,6 @@ lp_build_conv_mask(struct gallivm_state *gallivm,
const LLVMValueRef *src, unsigned num_srcs,
LLVMValueRef *dst, unsigned num_dsts)
{
- /* Register width must remain constant */
- assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
/* We must not lose or gain channels. Only precision */
assert(src_type.length * num_srcs == dst_type.length * num_dsts);
@@ -617,16 +747,5 @@ lp_build_conv_mask(struct gallivm_state *gallivm,
* Truncate or expand bit width
*/
- if(src_type.width > dst_type.width) {
- assert(num_dsts == 1);
- dst[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, src, num_srcs);
- }
- else if(src_type.width < dst_type.width) {
- assert(num_srcs == 1);
- lp_build_unpack(gallivm, src_type, dst_type, src[0], dst, num_dsts);
- }
- else {
- assert(num_srcs == num_dsts);
- memcpy(dst, src, num_dsts * sizeof *dst);
- }
+ lp_build_resize(gallivm, src_type, dst_type, src, num_srcs, dst, num_dsts);
}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.h b/src/gallium/auxiliary/gallivm/lp_bld_conv.h
index cec655980fa..c830fbef5f2 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.h
@@ -42,6 +42,10 @@
struct lp_type;
+LLVMValueRef
+lp_build_half_to_float(struct gallivm_state *gallivm,
+ struct lp_type src_type,
+ LLVMValueRef src);
LLVMValueRef
lp_build_clamped_float_to_unsigned_norm(struct gallivm_state *gallivm,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp b/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp
index 444b70a678c..93505f3da45 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp
+++ b/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp
@@ -35,10 +35,8 @@
#if HAVE_LLVM >= 0x0300
#include <llvm/Support/TargetRegistry.h>
-#include <llvm/Support/TargetSelect.h>
#else /* HAVE_LLVM < 0x0300 */
#include <llvm/Target/TargetRegistry.h>
-#include <llvm/Target/TargetSelect.h>
#endif /* HAVE_LLVM < 0x0300 */
#if HAVE_LLVM >= 0x0209
@@ -183,7 +181,7 @@ lp_disassemble(const void* func)
/*
* Limit disassembly to this extent
*/
- const uint64_t extent = 0x10000;
+ const uint64_t extent = 96 * 1024;
uint64_t max_pc = 0;
@@ -200,24 +198,6 @@ lp_disassemble(const void* func)
std::string Error;
const Target *T = TargetRegistry::lookupTarget(Triple, Error);
-#if HAVE_LLVM >= 0x0208
- InitializeNativeTargetAsmPrinter();
-#elif defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
- LLVMInitializeX86AsmPrinter();
-#elif defined(PIPE_ARCH_ARM)
- LLVMInitializeARMAsmPrinter();
-#elif defined(PIPE_ARCH_PPC)
- LLVMInitializePowerPCAsmPrinter();
-#endif
-
-#if HAVE_LLVM >= 0x0301
- InitializeNativeTargetDisassembler();
-#elif defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
- LLVMInitializeX86Disassembler();
-#elif defined(PIPE_ARCH_ARM)
- LLVMInitializeARMDisassembler();
-#endif
-
#if HAVE_LLVM >= 0x0300
OwningPtr<const MCAsmInfo> AsmInfo(T->createMCAsmInfo(Triple));
#else
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_flow.c b/src/gallium/auxiliary/gallivm/lp_bld_flow.c
index d2b3713ed2d..30da44e5b9c 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_flow.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_flow.c
@@ -131,6 +131,15 @@ lp_build_mask_check(struct lp_build_mask_context *mask)
value = lp_build_mask_value(mask);
+ /*
+ * XXX this doesn't quite generate the most efficient code possible, if
+ * the masks are vectors which have all bits set to the same value
+ * in each element.
+ * movmskps/pmovmskb would be more efficient to get the required value
+    * into an ordinary register (certainly with 8 floats).
+ * Not sure if llvm could figure that out on its own.
+ */
+
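
What the comment suggests, in intrinsic form (a sketch assuming a 4-wide float mask whose lanes are all-zeros or all-ones; not part of the patch):

#include <xmmintrin.h>

/* Collapse a per-lane mask into a scalar: movmskps gathers one sign bit
 * per lane into an ordinary register, so "any lane live" is one compare. */
static int
any_lane_live_sketch(__m128 mask)
{
   return _mm_movemask_ps(mask) != 0;
}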
/* cond = (mask == 0) */
cond = LLVMBuildICmp(builder,
LLVMIntEQ,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format.h b/src/gallium/auxiliary/gallivm/lp_bld_format.h
index 04142d905b1..3608a68202f 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format.h
@@ -67,6 +67,13 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
LLVMValueRef i,
LLVMValueRef j);
+LLVMValueRef
+lp_build_fetch_rgba_aos_array(struct gallivm_state *gallivm,
+ const struct util_format_description *format_desc,
+ struct lp_type type,
+ LLVMValueRef base_ptr,
+ LLVMValueRef offset);
+
/*
* SoA
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
index e4b8da6bcfd..9591bcfb2c7 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
@@ -470,6 +470,11 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
return lp_build_format_swizzle_aos(format_desc, &bld, res);
}
+   /* If all channels are of the same type and we are not using half-floats */
+ if (util_format_is_array(format_desc)) {
+ return lp_build_fetch_rgba_aos_array(gallivm, format_desc, type, base_ptr, offset);
+ }
+
/*
* YUV / subsampled formats
*/
@@ -601,7 +606,6 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
return res;
}
-
/*
* Fallback to util_format_description::fetch_rgba_float().
*/
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c
new file mode 100644
index 00000000000..b8ec379d76f
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c
@@ -0,0 +1,102 @@
+/**************************************************************************
+ *
+ * Copyright 2012 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "lp_bld_const.h"
+#include "lp_bld_struct.h"
+#include "lp_bld_format.h"
+#include "lp_bld_debug.h"
+#include "lp_bld_type.h"
+#include "lp_bld_conv.h"
+#include "lp_bld_pack.h"
+
+#include "util/u_memory.h"
+#include "util/u_format.h"
+#include "pipe/p_state.h"
+
+/**
+ * \brief lp_build_fetch_rgba_aos_array
+ *
+ * \param format_desc describes format of the image we're fetching from
+ * \param dst_type output type
+ * \param base_ptr address of the pixel block (or the texel if uncompressed)
+ * \param offset ptr offset
+ */
+LLVMValueRef
+lp_build_fetch_rgba_aos_array(struct gallivm_state *gallivm,
+ const struct util_format_description *format_desc,
+ struct lp_type dst_type,
+ LLVMValueRef base_ptr,
+ LLVMValueRef offset)
+{
+ struct lp_build_context bld;
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMTypeRef src_elem_type, src_vec_type;
+ LLVMValueRef ptr, res = NULL;
+ struct lp_type src_type;
+
+ memset(&src_type, 0, sizeof src_type);
+ src_type.floating = format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT;
+ src_type.fixed = format_desc->channel[0].type == UTIL_FORMAT_TYPE_FIXED;
+ src_type.sign = format_desc->channel[0].type != UTIL_FORMAT_TYPE_UNSIGNED;
+ src_type.norm = format_desc->channel[0].normalized;
+ src_type.width = format_desc->channel[0].size;
+ src_type.length = format_desc->nr_channels;
+
+ assert(src_type.length <= dst_type.length);
+
+ src_elem_type = lp_build_elem_type(gallivm, src_type);
+ src_vec_type = lp_build_vec_type(gallivm, src_type);
+
+ /* Read whole vector from memory, unaligned */
+ if (!res) {
+ ptr = LLVMBuildGEP(builder, base_ptr, &offset, 1, "");
+ ptr = LLVMBuildPointerCast(builder, ptr, LLVMPointerType(src_vec_type, 0), "");
+ res = LLVMBuildLoad(builder, ptr, "");
+ lp_set_load_alignment(res, src_type.width / 8);
+ }
+
+ /* Truncate doubles to float */
+ if (src_type.floating && src_type.width == 64) {
+ src_type.width = 32;
+ src_vec_type = lp_build_vec_type(gallivm, src_type);
+
+ res = LLVMBuildFPTrunc(builder, res, src_vec_type, "");
+ }
+
+ /* Expand to correct length */
+ if (src_type.length < dst_type.length) {
+ res = lp_build_pad_vector(gallivm, res, src_type, dst_type.length);
+ src_type.length = dst_type.length;
+ }
+
+ /* Convert to correct format */
+ lp_build_conv(gallivm, src_type, dst_type, &res, 1, &res, 1);
+
+ /* Swizzle it */
+ lp_build_context_init(&bld, gallivm, dst_type);
+ return lp_build_format_swizzle_aos(format_desc, &bld, res);
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
index 0a57b3ce794..afeb34079bf 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
@@ -359,7 +359,8 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
*/
if (util_format_fits_8unorm(format_desc) &&
- type.floating && type.width == 32 && type.length == 4) {
+ type.floating && type.width == 32 &&
+ (type.length == 1 || (type.length % 4 == 0))) {
struct lp_type tmp_type;
LLVMValueRef tmp;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
index ccc83207004..f77eb1212b1 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
@@ -84,7 +84,7 @@ uyvy_to_yuv_soa(struct gallivm_state *gallivm,
* per element. Didn't measure performance but cuts shader size
* by quite a bit (less difference if cpu has no sse4.1 support).
*/
- if (util_cpu_caps.has_sse2 && n == 4) {
+ if (util_cpu_caps.has_sse2 && n > 1) {
LLVMValueRef sel, tmp, tmp2;
struct lp_build_context bld32;
@@ -152,7 +152,7 @@ yuyv_to_yuv_soa(struct gallivm_state *gallivm,
* per element. Didn't measure performance but cuts shader size
* by quite a bit (less difference if cpu has no sse4.1 support).
*/
- if (util_cpu_caps.has_sse2 && n == 4) {
+ if (util_cpu_caps.has_sse2 && n > 1) {
LLVMValueRef sel, tmp;
struct lp_build_context bld32;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c b/src/gallium/auxiliary/gallivm/lp_bld_init.c
index 768d935dae5..5bf4bcfab3b 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c
@@ -26,15 +26,44 @@
**************************************************************************/
+#include "pipe/p_config.h"
#include "pipe/p_compiler.h"
#include "util/u_cpu_detect.h"
#include "util/u_debug.h"
#include "util/u_memory.h"
#include "util/u_simple_list.h"
+#include "lp_bld.h"
#include "lp_bld_debug.h"
+#include "lp_bld_misc.h"
#include "lp_bld_init.h"
+#include <llvm-c/Analysis.h>
#include <llvm-c/Transforms/Scalar.h>
+#include <llvm-c/BitWriter.h>
+
+
+/**
+ * AVX is supported in:
+ * - standard JIT from LLVM 3.2 onwards
+ * - MC-JIT from LLVM 3.1
+ * - MC-JIT supports limited OSes (MacOSX and Linux)
+ * - standard JIT in LLVM 3.1, with backports
+ */
+#if HAVE_LLVM >= 0x0301 && (defined(PIPE_OS_LINUX) || defined(PIPE_OS_APPLE))
+# define USE_MCJIT 1
+# define HAVE_AVX 1
+#elif HAVE_LLVM >= 0x0302 || (HAVE_LLVM == 0x0301 && defined(HAVE_JIT_AVX_SUPPORT))
+# define USE_MCJIT 0
+# define HAVE_AVX 1
+#else
+# define USE_MCJIT 0
+# define HAVE_AVX 0
+#endif
+
+
+#if USE_MCJIT
+void LLVMLinkInMCJIT();
+#endif
#ifdef DEBUG
@@ -57,6 +86,8 @@ DEBUG_GET_ONCE_FLAGS_OPTION(gallivm_debug, "GALLIVM_DEBUG", lp_bld_debug_flags,
static boolean gallivm_initialized = FALSE;
+unsigned lp_native_vector_width;
+
/*
* Optimization values are:
@@ -81,25 +112,13 @@ enum LLVM_CodeGenOpt_Level {
};
+#if HAVE_LLVM <= 0x0206
/**
- * LLVM 2.6 permits only one ExecutionEngine to be created. This is it.
- */
-static LLVMExecutionEngineRef GlobalEngine = NULL;
-
-/**
- * Same gallivm state shared by all contexts.
+ * LLVM 2.6 permits only one ExecutionEngine to be created. So use the
+ * same gallivm state everywhere.
*/
static struct gallivm_state *GlobalGallivm = NULL;
-
-
-
-
-extern void
-lp_register_oprofile_jit_event_listener(LLVMExecutionEngineRef EE);
-
-extern void
-lp_set_target_options(void);
-
+#endif
/**
@@ -111,6 +130,7 @@ static boolean
create_pass_manager(struct gallivm_state *gallivm)
{
assert(!gallivm->passmgr);
+ assert(gallivm->target);
gallivm->passmgr = LLVMCreateFunctionPassManager(gallivm->provider);
if (!gallivm->passmgr)
@@ -174,33 +194,37 @@ free_gallivm_state(struct gallivm_state *gallivm)
&mod, &error);
#endif
+ if (gallivm->passmgr) {
+ LLVMDisposePassManager(gallivm->passmgr);
+ }
+
#if 0
/* XXX this seems to crash with all versions of LLVM */
if (gallivm->provider)
LLVMDisposeModuleProvider(gallivm->provider);
#endif
- if (gallivm->passmgr)
- LLVMDisposePassManager(gallivm->passmgr);
-
-#if HAVE_LLVM >= 0x207
- if (gallivm->module)
- LLVMDisposeModule(gallivm->module);
-#endif
-
-#if 0
- /* Don't free the exec engine, it's a global/singleton */
- if (gallivm->engine)
+ if (HAVE_LLVM >= 0x207 && gallivm->engine) {
+ /* This will already destroy any associated module */
LLVMDisposeExecutionEngine(gallivm->engine);
-#endif
+ } else {
+ LLVMDisposeModule(gallivm->module);
+ }
-#if 0
+#if !USE_MCJIT
/* Don't free the TargetData, it's owned by the exec engine */
- LLVMDisposeTargetData(gallivm->target);
+#else
+ if (gallivm->target) {
+ LLVMDisposeTargetData(gallivm->target);
+ }
#endif
+ /* Never free the LLVM context.
+ */
+#if 0
if (gallivm->context)
LLVMContextDispose(gallivm->context);
+#endif
if (gallivm->builder)
LLVMDisposeBuilder(gallivm->builder);
@@ -215,37 +239,14 @@ free_gallivm_state(struct gallivm_state *gallivm)
}
-/**
- * Allocate gallivm LLVM objects.
- * \return TRUE for success, FALSE for failure
- */
static boolean
-init_gallivm_state(struct gallivm_state *gallivm)
+init_gallivm_engine(struct gallivm_state *gallivm)
{
- assert(!gallivm->context);
- assert(!gallivm->module);
- assert(!gallivm->provider);
-
- lp_build_init();
-
- gallivm->context = LLVMContextCreate();
- if (!gallivm->context)
- goto fail;
-
- gallivm->module = LLVMModuleCreateWithNameInContext("gallivm",
- gallivm->context);
- if (!gallivm->module)
- goto fail;
-
- gallivm->provider =
- LLVMCreateModuleProviderForExistingModule(gallivm->module);
- if (!gallivm->provider)
- goto fail;
-
- if (!GlobalEngine) {
+ if (1) {
/* We can only create one LLVMExecutionEngine (w/ LLVM 2.6 anyway) */
enum LLVM_CodeGenOpt_Level optlevel;
char *error = NULL;
+ int ret;
if (gallivm_debug & GALLIVM_DEBUG_NO_OPT) {
optlevel = None;
@@ -254,135 +255,162 @@ init_gallivm_state(struct gallivm_state *gallivm)
optlevel = Default;
}
- if (LLVMCreateJITCompiler(&GlobalEngine, gallivm->provider,
- (unsigned) optlevel, &error)) {
+#if USE_MCJIT
+ ret = lp_build_create_mcjit_compiler_for_module(&gallivm->engine,
+ gallivm->module,
+ (unsigned) optlevel,
+ &error);
+#else
+ ret = LLVMCreateJITCompiler(&gallivm->engine, gallivm->provider,
+ (unsigned) optlevel, &error);
+#endif
+ if (ret) {
_debug_printf("%s\n", error);
LLVMDisposeMessage(error);
goto fail;
}
#if defined(DEBUG) || defined(PROFILE)
- lp_register_oprofile_jit_event_listener(GlobalEngine);
+ lp_register_oprofile_jit_event_listener(gallivm->engine);
#endif
}
- gallivm->engine = GlobalEngine;
-
LLVMAddModuleProvider(gallivm->engine, gallivm->provider);//new
+#if !USE_MCJIT
gallivm->target = LLVMGetExecutionEngineTargetData(gallivm->engine);
if (!gallivm->target)
goto fail;
+#else
+ if (0) {
+ /*
+ * Dump the data layout strings.
+ */
- if (!create_pass_manager(gallivm))
- goto fail;
+ LLVMTargetDataRef target = LLVMGetExecutionEngineTargetData(gallivm->engine);
+ char *data_layout;
+ char *engine_data_layout;
- gallivm->builder = LLVMCreateBuilderInContext(gallivm->context);
- if (!gallivm->builder)
- goto fail;
+ data_layout = LLVMCopyStringRepOfTargetData(gallivm->target);
+ engine_data_layout = LLVMCopyStringRepOfTargetData(target);
+
+ if (1) {
+ debug_printf("module target data = %s\n", data_layout);
+ debug_printf("engine target data = %s\n", engine_data_layout);
+ }
+
+ free(data_layout);
+ free(engine_data_layout);
+ }
+#endif
return TRUE;
fail:
- free_gallivm_state(gallivm);
return FALSE;
}
-struct callback
-{
- garbage_collect_callback_func func;
- void *cb_data;
- struct callback *prev, *next;
-};
-
-
-/** list of all garbage collector callbacks */
-static struct callback callback_list = {NULL, NULL, NULL, NULL};
+/**
+ * Singleton
+ *
+ * We must never free LLVM contexts, because LLVM has several global caches
+ * which point to, or are derived from, objects owned by the context, causing
+ * false memory leaks and false cache hits when these objects are destroyed.
+ *
+ * TODO: For thread safety on multi-threaded OpenGL we should use one LLVM
+ * context per thread, and put them in a pool when threads are destroyed.
+ */
+static LLVMContextRef gallivm_context = NULL;
/**
- * Register a function with gallivm which will be called when we
- * do garbage collection.
+ * Allocate gallivm LLVM objects.
+ * \return TRUE for success, FALSE for failure
*/
-void
-gallivm_register_garbage_collector_callback(garbage_collect_callback_func func,
- void *cb_data)
+static boolean
+init_gallivm_state(struct gallivm_state *gallivm)
{
- struct callback *cb;
-
- if (!callback_list.prev) {
- make_empty_list(&callback_list);
- }
+ assert(!gallivm->context);
+ assert(!gallivm->module);
+ assert(!gallivm->provider);
- /* see if already in list */
- foreach(cb, &callback_list) {
- if (cb->func == func && cb->cb_data == cb_data)
- return;
- }
+ lp_build_init();
- /* add to list */
- cb = CALLOC_STRUCT(callback);
- if (cb) {
- cb->func = func;
- cb->cb_data = cb_data;
- insert_at_head(&callback_list, cb);
+ if (!gallivm_context) {
+ gallivm_context = LLVMContextCreate();
}
-}
+ gallivm->context = gallivm_context;
+ if (!gallivm->context)
+ goto fail;
+ gallivm->module = LLVMModuleCreateWithNameInContext("gallivm",
+ gallivm->context);
+ if (!gallivm->module)
+ goto fail;
-/**
- * Remove a callback.
- */
-void
-gallivm_remove_garbage_collector_callback(garbage_collect_callback_func func,
- void *cb_data)
-{
- struct callback *cb;
-
- /* search list */
- foreach(cb, &callback_list) {
- if (cb->func == func && cb->cb_data == cb_data) {
- /* found, remove it */
- remove_from_list(cb);
- FREE(cb);
- return;
- }
- }
-}
+ gallivm->provider =
+ LLVMCreateModuleProviderForExistingModule(gallivm->module);
+ if (!gallivm->provider)
+ goto fail;
+ gallivm->builder = LLVMCreateBuilderInContext(gallivm->context);
+ if (!gallivm->builder)
+ goto fail;
-/**
- * Call the callback functions (which are typically in the
- * draw module and llvmpipe driver.
- */
-static void
-call_garbage_collector_callbacks(void)
-{
- struct callback *cb;
- foreach(cb, &callback_list) {
- cb->func(cb->cb_data);
+ /* FIXME: MC-JIT only allows compiling one module at a time, and it must be
+ * complete when MC-JIT is created. So defer the MC-JIT engine creation for
+ * now.
+ */
+#if !USE_MCJIT
+ if (!init_gallivm_engine(gallivm)) {
+ goto fail;
}
-}
+#else
+ /*
+ * MC-JIT engine compiles the module immediately on creation, so we can't
+ * obtain the target data from it. Instead we create a target data layout
+ * from a string.
+ *
+ * The produced layout strings are not precisely the same, but should make
+ * no difference for the kind of optimization passes we run.
+ *
+ * For reference this is the layout string on x64:
+ *
+ * e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-f128:128:128-n8:16:32:64
+ *
+ * See also:
+ * - http://llvm.org/docs/LangRef.html#datalayout
+ */
+
+ {
+ const unsigned pointer_size = 8 * sizeof(void *);
+ char layout[512];
+ util_snprintf(layout, sizeof layout, "%c-p:%u:%u:%u-i64:64:64-a0:0:%u-s0:%u:%u",
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+ 'e', // little endian
+#else
+ 'E', // big endian
+#endif
+ pointer_size, pointer_size, pointer_size, // pointer size, abi alignment, preferred alignment
+ pointer_size, // aggregate preferred alignment
+ pointer_size, pointer_size); // stack objects abi alignment, preferred alignment
+ gallivm->target = LLVMCreateTargetData(layout);
+ if (!gallivm->target) {
+         goto fail;
+ }
+ }
+#endif
+ if (!create_pass_manager(gallivm))
+ goto fail;
-/**
- * Other gallium components using gallivm should call this periodically
- * to let us do garbage collection (or at least try to free memory
- * accumulated by the LLVM libraries).
- */
-void
-gallivm_garbage_collect(struct gallivm_state *gallivm)
-{
- if (gallivm->context) {
- if (gallivm_debug & GALLIVM_DEBUG_GC)
- debug_printf("***** Doing LLVM garbage collection\n");
+ return TRUE;
- call_garbage_collector_callbacks();
- free_gallivm_state(gallivm);
- init_gallivm_state(gallivm);
- }
+fail:
+ free_gallivm_state(gallivm);
+ return FALSE;
}
@@ -398,12 +426,27 @@ lp_build_init(void)
lp_set_target_options();
- LLVMInitializeNativeTarget();
-
+#if USE_MCJIT
+ LLVMLinkInMCJIT();
+#else
LLVMLinkInJIT();
+#endif
util_cpu_detect();
+
+ if (HAVE_AVX &&
+ util_cpu_caps.has_avx) {
+ lp_native_vector_width = 256;
+ } else {
+ /* Leave it at 128, even when no SIMD extensions are available.
+       * It really needs to be a multiple of 128 so it can fit 4 floats.
+ */
+ lp_native_vector_width = 128;
+ }
+ lp_native_vector_width = debug_get_num_option("LP_NATIVE_VECTOR_WIDTH",
+ lp_native_vector_width);
+
gallivm_initialized = TRUE;
#if 0
@@ -423,16 +466,27 @@ lp_build_init(void)
struct gallivm_state *
gallivm_create(void)
{
- if (!GlobalGallivm) {
- GlobalGallivm = CALLOC_STRUCT(gallivm_state);
- if (GlobalGallivm) {
- if (!init_gallivm_state(GlobalGallivm)) {
- FREE(GlobalGallivm);
- GlobalGallivm = NULL;
- }
+ struct gallivm_state *gallivm;
+
+#if HAVE_LLVM <= 0x206
+ if (GlobalGallivm) {
+ return GlobalGallivm;
+ }
+#endif
+
+ gallivm = CALLOC_STRUCT(gallivm_state);
+ if (gallivm) {
+ if (!init_gallivm_state(gallivm)) {
+ FREE(gallivm);
+ gallivm = NULL;
}
}
- return GlobalGallivm;
+
+#if HAVE_LLVM <= 0x206
+ GlobalGallivm = gallivm;
+#endif
+
+ return gallivm;
}
@@ -442,6 +496,132 @@ gallivm_create(void)
void
gallivm_destroy(struct gallivm_state *gallivm)
{
+#if HAVE_LLVM <= 0x0206
/* No-op: don't destroy the singleton */
(void) gallivm;
+#else
+ free_gallivm_state(gallivm);
+ FREE(gallivm);
+#endif
+}
+
+
+/**
+ * Optimize a function.
+ */
+static void
+gallivm_optimize_function(struct gallivm_state *gallivm,
+ LLVMValueRef func)
+{
+ if (0) {
+ debug_printf("optimizing %s...\n", LLVMGetValueName(func));
+ }
+
+ assert(gallivm->passmgr);
+
+ /* Apply optimizations to LLVM IR */
+ LLVMRunFunctionPassManager(gallivm->passmgr, func);
+
+ if (0) {
+ if (gallivm_debug & GALLIVM_DEBUG_IR) {
+ /* Print the LLVM IR to stderr */
+ lp_debug_dump_value(func);
+ debug_printf("\n");
+ }
+ }
+}
+
+
+/**
+ * Verify a function's IR and run the optimization passes on it.
+ */
+void
+gallivm_verify_function(struct gallivm_state *gallivm,
+ LLVMValueRef func)
+{
+ /* Verify the LLVM IR. If invalid, dump and abort */
+#ifdef DEBUG
+ if (LLVMVerifyFunction(func, LLVMPrintMessageAction)) {
+ lp_debug_dump_value(func);
+ assert(0);
+ return;
+ }
+#endif
+
+ gallivm_optimize_function(gallivm, func);
+
+ if (gallivm_debug & GALLIVM_DEBUG_IR) {
+ /* Print the LLVM IR to stderr */
+ lp_debug_dump_value(func);
+ debug_printf("\n");
+ }
+}
+
+
+void
+gallivm_compile_module(struct gallivm_state *gallivm)
+{
+#if HAVE_LLVM > 0x206
+ assert(!gallivm->compiled);
+#endif
+
+ /* Dump byte code to a file */
+ if (0) {
+ LLVMWriteBitcodeToFile(gallivm->module, "llvmpipe.bc");
+ debug_printf("llvmpipe.bc written\n");
+ debug_printf("Invoke as \"llc -o - llvmpipe.bc\"\n");
+ }
+
+#if USE_MCJIT
+ assert(!gallivm->engine);
+ if (!init_gallivm_engine(gallivm)) {
+ assert(0);
+ }
+#endif
+ assert(gallivm->engine);
+
+ ++gallivm->compiled;
+}
+
+
+func_pointer
+gallivm_jit_function(struct gallivm_state *gallivm,
+ LLVMValueRef func)
+{
+ void *code;
+ func_pointer jit_func;
+
+ assert(gallivm->compiled);
+ assert(gallivm->engine);
+
+ code = LLVMGetPointerToGlobal(gallivm->engine, func);
+ assert(code);
+ jit_func = pointer_to_func(code);
+
+ if (gallivm_debug & GALLIVM_DEBUG_ASM) {
+ lp_disassemble(code);
+ }
+
+ /* Free the function body to save memory */
+ lp_func_delete_body(func);
+
+ return jit_func;
+}
+
+
+/**
+ * Free the function (and its machine code).
+ */
+void
+gallivm_free_function(struct gallivm_state *gallivm,
+ LLVMValueRef func,
+ const void *code)
+{
+#if !USE_MCJIT
+ if (code) {
+ LLVMFreeMachineCodeForFunction(gallivm->engine, func);
+ }
+
+ LLVMDeleteFunction(func);
+#endif
}
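Taken together, these entry points replace the old garbage-collection scheme
with an explicit lifecycle. A minimal sketch of a caller, where
build_my_shader() is a hypothetical stand-in for whatever emits IR into
gallivm->module:

   struct gallivm_state *gallivm = gallivm_create();
   LLVMValueRef func = build_my_shader(gallivm);   /* hypothetical IR emitter */
   func_pointer fp;

   gallivm_verify_function(gallivm, func);    /* verify IR, run the passes */
   gallivm_compile_module(gallivm);           /* creates the MC-JIT engine if deferred */
   fp = gallivm_jit_function(gallivm, func);  /* obtain the machine code */

   /* ... call through fp ... */

   gallivm_destroy(gallivm);                  /* singleton no-op with LLVM <= 2.6 */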
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.h b/src/gallium/auxiliary/gallivm/lp_bld_init.h
index 5fc0f996c64..7edea616c4e 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.h
@@ -31,6 +31,7 @@
#include "pipe/p_compiler.h"
+#include "util/u_pointer.h" // for func_pointer
#include "lp_bld.h"
#include <llvm-c/ExecutionEngine.h>
@@ -44,6 +45,7 @@ struct gallivm_state
LLVMPassManagerRef passmgr;
LLVMContextRef context;
LLVMBuilderRef builder;
+ unsigned compiled;
};
@@ -51,35 +53,28 @@ void
lp_build_init(void);
-extern void
-lp_func_delete_body(LLVMValueRef func);
-
+struct gallivm_state *
+gallivm_create(void);
void
-gallivm_garbage_collect(struct gallivm_state *gallivm);
-
+gallivm_destroy(struct gallivm_state *gallivm);
-typedef void (*garbage_collect_callback_func)(void *cb_data);
void
-gallivm_register_garbage_collector_callback(garbage_collect_callback_func func,
- void *cb_data);
+gallivm_verify_function(struct gallivm_state *gallivm,
+ LLVMValueRef func);
void
-gallivm_remove_garbage_collector_callback(garbage_collect_callback_func func,
- void *cb_data);
+gallivm_compile_module(struct gallivm_state *gallivm);
-
-struct gallivm_state *
-gallivm_create(void);
+func_pointer
+gallivm_jit_function(struct gallivm_state *gallivm,
+ LLVMValueRef func);
void
-gallivm_destroy(struct gallivm_state *gallivm);
-
-
-extern LLVMValueRef
-lp_build_load_volatile(LLVMBuilderRef B, LLVMValueRef PointerVal,
- const char *Name);
+gallivm_free_function(struct gallivm_state *gallivm,
+ LLVMValueRef func,
+ const void * code);
void
lp_set_load_alignment(LLVMValueRef Inst,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_intr.c b/src/gallium/auxiliary/gallivm/lp_bld_intr.c
index 2323f124ae4..2bf1211bcd7 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_intr.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_intr.c
@@ -48,6 +48,8 @@
#include "lp_bld_const.h"
#include "lp_bld_intr.h"
+#include "lp_bld_type.h"
+#include "lp_bld_pack.h"
LLVMValueRef
@@ -129,6 +131,95 @@ lp_build_intrinsic_binary(LLVMBuilderRef builder,
}
+/**
+ * Call intrinsic with arguments adapted to intrinsic vector length.
+ *
+ * Split vectors which are too large for the hardware, or expand them if
+ * they are too small, so that callers of functions which may use intrinsics
+ * don't need to do the splitting/expansion themselves.
+ * This only supports intrinsics where src and dst types match.
+ */
+LLVMValueRef
+lp_build_intrinsic_binary_anylength(struct gallivm_state *gallivm,
+ const char *name,
+ struct lp_type src_type,
+ unsigned intr_size,
+ LLVMValueRef a,
+ LLVMValueRef b)
+{
+ unsigned i;
+ struct lp_type intrin_type = src_type;
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
+ LLVMValueRef anative, bnative;
+ unsigned intrin_length = intr_size / src_type.width;
+
+ intrin_type.length = intrin_length;
+
+ if (intrin_length > src_type.length) {
+ LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
+ LLVMValueRef constvec, tmp;
+
+ for (i = 0; i < src_type.length; i++) {
+ elems[i] = lp_build_const_int32(gallivm, i);
+ }
+ for (; i < intrin_length; i++) {
+ elems[i] = i32undef;
+ }
+ if (src_type.length == 1) {
+ LLVMTypeRef elem_type = lp_build_elem_type(gallivm, intrin_type);
+ a = LLVMBuildBitCast(builder, a, LLVMVectorType(elem_type, 1), "");
+ b = LLVMBuildBitCast(builder, b, LLVMVectorType(elem_type, 1), "");
+ }
+ constvec = LLVMConstVector(elems, intrin_length);
+ anative = LLVMBuildShuffleVector(builder, a, a, constvec, "");
+ bnative = LLVMBuildShuffleVector(builder, b, b, constvec, "");
+ tmp = lp_build_intrinsic_binary(builder, name,
+ lp_build_vec_type(gallivm, intrin_type),
+ anative, bnative);
+ if (src_type.length > 1) {
+ constvec = LLVMConstVector(elems, src_type.length);
+ return LLVMBuildShuffleVector(builder, tmp, tmp, constvec, "");
+ }
+ else {
+ return LLVMBuildExtractElement(builder, tmp, elems[0], "");
+ }
+ }
+ else if (intrin_length < src_type.length) {
+ unsigned num_vec = src_type.length / intrin_length;
+ LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
+
+   /* arbitrary sizes are not supported here (see the FIXME below) */
+ if (src_type.length % intrin_length) {
+ /* FIXME: This is something which should be supported
+ * but there doesn't seem to be any need for it currently
+ * so crash and burn.
+ */
+ debug_printf("%s: should handle arbitrary vector size\n",
+ __FUNCTION__);
+ assert(0);
+ return NULL;
+ }
+
+ for (i = 0; i < num_vec; i++) {
+ anative = lp_build_extract_range(gallivm, a, i*intrin_length,
+ intrin_length);
+ bnative = lp_build_extract_range(gallivm, b, i*intrin_length,
+ intrin_length);
+ tmp[i] = lp_build_intrinsic_binary(builder, name,
+ lp_build_vec_type(gallivm, intrin_type),
+ anative, bnative);
+ }
+ return lp_build_concat(gallivm, tmp, intrin_type, num_vec);
+ }
+ else {
+ return lp_build_intrinsic_binary(builder, name,
+ lp_build_vec_type(gallivm, src_type),
+ a, b);
+ }
+}
+
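As an illustration, minimizing two 8-wide float vectors on an SSE-only
machine can be handed to this helper wholesale; a sketch, assuming a and b
are existing 8 x f32 values and gallivm is the current state:

   struct lp_type type;
   memset(&type, 0, sizeof type);
   type.floating = TRUE;
   type.sign = TRUE;
   type.width = 32;
   type.length = 8;    /* 8 x f32 = 256 bits of data */

   /* SSE minps is only 128 bits wide; the helper splits the 8-wide
    * vectors into two halves, issues two intrinsic calls, and
    * concatenates the two results back together. */
   res = lp_build_intrinsic_binary_anylength(gallivm, "llvm.x86.sse.min.ps",
                                             type, 128, a, b);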
+
LLVMValueRef
lp_build_intrinsic_map(struct gallivm_state *gallivm,
const char *name,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_intr.h b/src/gallium/auxiliary/gallivm/lp_bld_intr.h
index b73dd700362..38c5c29c980 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_intr.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_intr.h
@@ -78,6 +78,15 @@ lp_build_intrinsic_binary(LLVMBuilderRef builder,
LLVMValueRef
+lp_build_intrinsic_binary_anylength(struct gallivm_state *gallivm,
+ const char *name,
+ struct lp_type src_type,
+ unsigned intr_size,
+ LLVMValueRef a,
+ LLVMValueRef b);
+
+
+LLVMValueRef
lp_build_intrinsic_map(struct gallivm_state *gallivm,
const char *name,
LLVMTypeRef ret_type,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.c b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
index 69796149aaa..7a4a5bb11d3 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_logic.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
@@ -52,8 +52,8 @@
*
* select <4 x i1> %C, %A, %B
*
- * is valid IR (e.g. llvm/test/Assembler/vector-select.ll), but it is not
- * supported on any backend.
+ * is valid IR (e.g. llvm/test/Assembler/vector-select.ll), but it is only
+ * supported on some backends (x86) starting with llvm 3.1.
*
* Expanding the boolean vector to full SIMD register width, as in
*
@@ -485,8 +485,10 @@ lp_build_select(struct lp_build_context *bld,
}
res = LLVMBuildSelect(builder, mask, a, b, "");
}
- else if (util_cpu_caps.has_sse4_1 &&
- type.width * type.length == 128 &&
+ else if (((util_cpu_caps.has_sse4_1 &&
+ type.width * type.length == 128) ||
+ (util_cpu_caps.has_avx &&
+ type.width * type.length == 256 && type.width >= 32)) &&
!LLVMIsConstant(a) &&
!LLVMIsConstant(b) &&
!LLVMIsConstant(mask)) {
@@ -494,8 +496,22 @@ lp_build_select(struct lp_build_context *bld,
LLVMTypeRef arg_type;
LLVMValueRef args[3];
- if (type.floating &&
- type.width == 64) {
+ /*
+ * There's only float blend in AVX but can just cast i32/i64
+ * to float.
+ */
+ if (type.width * type.length == 256) {
+ if (type.width == 64) {
+ intrinsic = "llvm.x86.avx.blendv.pd.256";
+ arg_type = LLVMVectorType(LLVMDoubleTypeInContext(lc), 4);
+ }
+ else {
+ intrinsic = "llvm.x86.avx.blendv.ps.256";
+ arg_type = LLVMVectorType(LLVMFloatTypeInContext(lc), 8);
+ }
+ }
+ else if (type.floating &&
+ type.width == 64) {
intrinsic = "llvm.x86.sse41.blendvpd";
arg_type = LLVMVectorType(LLVMDoubleTypeInContext(lc), 2);
} else if (type.floating &&
@@ -591,3 +607,35 @@ lp_build_select_aos(struct lp_build_context *bld,
return lp_build_select(bld, mask_vec, a, b);
}
}
+
+
+/**
+ * Return true if any of the first real_length elements of val is non-zero.
+ */
+LLVMValueRef
+lp_build_any_true_range(struct lp_build_context *bld,
+ unsigned real_length,
+ LLVMValueRef val)
+{
+ LLVMBuilderRef builder = bld->gallivm->builder;
+ LLVMTypeRef scalar_type;
+ LLVMTypeRef true_type;
+
+ assert(real_length <= bld->type.length);
+
+ true_type = LLVMIntTypeInContext(bld->gallivm->context,
+ bld->type.width * real_length);
+ scalar_type = LLVMIntTypeInContext(bld->gallivm->context,
+ bld->type.width * bld->type.length);
+ val = LLVMBuildBitCast(builder, val, scalar_type, "");
+ /*
+    * We're always using native types here, so we can use intrinsics.
+ * However, if we don't do per-element calculations, we must ensure
+ * the excess elements aren't used since they may contain garbage.
+ */
+ if (real_length < bld->type.length) {
+ val = LLVMBuildTrunc(builder, val, true_type, "");
+ }
+ return LLVMBuildICmp(builder, LLVMIntNE,
+ val, LLVMConstNull(true_type), "");
+}
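For example, checking whether any pixel in a partially filled mask survives;
a sketch assuming mask_bld is an 8 x i32 lp_build_context and mask holds
per-pixel masks of which only the first four lanes are valid:

   /* Bitcasts mask to i256, truncates to i128 so the four garbage
    * lanes are dropped, then compares against zero. */
   LLVMValueRef any_active = lp_build_any_true_range(mask_bld, 4, mask);
   /* any_active is an i1, usable directly as a branch condition */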
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.h b/src/gallium/auxiliary/gallivm/lp_bld_logic.h
index ef33a653682..64c0a1f5946 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_logic.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.h
@@ -82,4 +82,9 @@ lp_build_select_aos(struct lp_build_context *bld,
LLVMValueRef b);
+LLVMValueRef
+lp_build_any_true_range(struct lp_build_context *bld,
+ unsigned real_length,
+ LLVMValueRef val);
+
#endif /* !LP_BLD_LOGIC_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
index 6c4586c4212..dd2c6120afb 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
+++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
@@ -26,6 +26,12 @@
**************************************************************************/
+/**
+ * The purpose of this module is to expose LLVM functionality not available
+ * through the C++ bindings.
+ */
+
+
#ifndef __STDC_LIMIT_MACROS
#define __STDC_LIMIT_MACROS
#endif
@@ -41,11 +47,24 @@
#include <llvm/Target/TargetOptions.h>
#include <llvm/ExecutionEngine/ExecutionEngine.h>
#include <llvm/ExecutionEngine/JITEventListener.h>
+#if HAVE_LLVM >= 0x0301
+#include <llvm/ADT/Triple.h>
+#include <llvm/ExecutionEngine/JITMemoryManager.h>
+#endif
#include <llvm/Support/CommandLine.h>
#include <llvm/Support/PrettyStackTrace.h>
+#if HAVE_LLVM >= 0x0300
+#include <llvm/Support/TargetSelect.h>
+#else /* HAVE_LLVM < 0x0300 */
+#include <llvm/Target/TargetSelect.h>
+#endif /* HAVE_LLVM < 0x0300 */
+
#include "pipe/p_config.h"
#include "util/u_debug.h"
+#include "util/u_cpu_detect.h"
+
+#include "lp_bld_misc.h"
/**
@@ -99,6 +118,9 @@ lp_set_target_options(void)
#if defined(DEBUG) || defined(PROFILE)
llvm::NoFramePointerElim = true;
+#if HAVE_LLVM >= 0x0208
+ llvm::NoFramePointerElimNonLeaf = true;
+#endif
#endif
llvm::NoExcessFPPrecision = false;
@@ -146,6 +168,30 @@ lp_set_target_options(void)
* shared object where the gallium driver resides.
*/
llvm::DisablePrettyStackTrace = true;
+
+ // If we have a native target, initialize it to ensure it is linked in and
+ // usable by the JIT.
+ llvm::InitializeNativeTarget();
+
+#if HAVE_LLVM >= 0x0208
+ llvm::InitializeNativeTargetAsmPrinter();
+#elif defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+ LLVMInitializeX86AsmPrinter();
+#elif defined(PIPE_ARCH_ARM)
+ LLVMInitializeARMAsmPrinter();
+#elif defined(PIPE_ARCH_PPC)
+ LLVMInitializePowerPCAsmPrinter();
+#endif
+
+#if HAVE_LLVM >= 0x0207
+# if HAVE_LLVM >= 0x0301
+ llvm::InitializeNativeTargetDisassembler();
+# elif defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+ LLVMInitializeX86Disassembler();
+# elif defined(PIPE_ARCH_ARM)
+ LLVMInitializeARMDisassembler();
+# endif
+#endif
}
@@ -165,6 +211,7 @@ lp_build_load_volatile(LLVMBuilderRef B, LLVMValueRef PointerVal,
return llvm::wrap(llvm::unwrap(B)->CreateLoad(llvm::unwrap(PointerVal), true, Name));
}
+
extern "C"
void
lp_set_load_alignment(LLVMValueRef Inst,
@@ -180,3 +227,67 @@ lp_set_store_alignment(LLVMValueRef Inst,
{
llvm::unwrap<llvm::StoreInst>(Inst)->setAlignment(Align);
}
+
+
+#if HAVE_LLVM >= 0x301
+
+/**
+ * Same as LLVMCreateJITCompilerForModule, but using MCJIT and enabling the
+ * AVX feature where available.
+ *
+ * See also:
+ * - llvm/lib/ExecutionEngine/ExecutionEngineBindings.cpp
+ * - llvm/tools/lli/lli.cpp
+ * - http://markmail.org/message/ttkuhvgj4cxxy2on#query:+page:1+mid:aju2dggerju3ivd3+state:results
+ */
+extern "C"
+LLVMBool
+lp_build_create_mcjit_compiler_for_module(LLVMExecutionEngineRef *OutJIT,
+ LLVMModuleRef M,
+ unsigned OptLevel,
+ char **OutError)
+{
+ using namespace llvm;
+
+ std::string Error;
+ EngineBuilder builder(unwrap(M));
+ builder.setEngineKind(EngineKind::JIT)
+ .setErrorStr(&Error)
+ .setOptLevel((CodeGenOpt::Level)OptLevel);
+
+ builder.setUseMCJIT(true);
+
+ llvm::SmallVector<std::string, 1> MAttrs;
+ if (util_cpu_caps.has_avx) {
+ /*
+       * The AVX feature is not automatically detected from CPUID by the X86
+       * target yet, because the old (but still default) JIT engine is not
+       * capable of emitting the opcodes. But as we're using MCJIT here, it
+       * is safe to set this attribute.
+ */
+ MAttrs.push_back("+avx");
+ builder.setMAttrs(MAttrs);
+ }
+ builder.setJITMemoryManager(JITMemoryManager::CreateDefaultMemManager());
+
+ ExecutionEngine *JIT;
+#if 0
+ JIT = builder.create();
+#else
+ /*
+ * Workaround http://llvm.org/bugs/show_bug.cgi?id=12833
+ */
+ StringRef MArch = "";
+ StringRef MCPU = "";
+ Triple TT(unwrap(M)->getTargetTriple());
+ JIT = builder.create(builder.selectTarget(TT, MArch, MCPU, MAttrs));
+#endif
+ if (JIT) {
+ *OutJIT = wrap(JIT);
+ return 0;
+ }
+ *OutError = strdup(Error.c_str());
+ return 1;
+}
+
+#endif /* HAVE_LLVM >= 0x301 */
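The wrapper keeps the calling convention of LLVMCreateJITCompilerForModule,
so a caller's error handling stays the same; a sketch, with module standing
in for the caller's LLVMModuleRef:

   LLVMExecutionEngineRef engine;
   char *error = NULL;

   if (lp_build_create_mcjit_compiler_for_module(&engine, module,
                                                 2 /* opt level */, &error)) {
      debug_printf("JIT engine creation failed: %s\n", error);
      free(error);   /* OutError is strdup'ed on the failure path */
   }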
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_misc.h b/src/gallium/auxiliary/gallivm/lp_bld_misc.h
new file mode 100644
index 00000000000..4f80b38280c
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.h
@@ -0,0 +1,70 @@
+/**************************************************************************
+ *
+ * Copyright 2012 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#ifndef LP_BLD_MISC_H
+#define LP_BLD_MISC_H
+
+
+#include "lp_bld.h"
+#include <llvm-c/ExecutionEngine.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+extern void
+lp_register_oprofile_jit_event_listener(LLVMExecutionEngineRef EE);
+
+extern void
+lp_set_target_options(void);
+
+
+extern void
+lp_func_delete_body(LLVMValueRef func);
+
+
+extern LLVMValueRef
+lp_build_load_volatile(LLVMBuilderRef B, LLVMValueRef PointerVal,
+ const char *Name);
+
+extern int
+lp_build_create_mcjit_compiler_for_module(LLVMExecutionEngineRef *OutJIT,
+ LLVMModuleRef M,
+ unsigned OptLevel,
+ char **OutError);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif /* !LP_BLD_MISC_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.c b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
index fde6bb594f1..b18f7841ccb 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
@@ -69,6 +69,7 @@
#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_cpu_detect.h"
+#include "util/u_memory.h"
#include "lp_bld_type.h"
#include "lp_bld_const.h"
@@ -76,6 +77,7 @@
#include "lp_bld_intr.h"
#include "lp_bld_arit.h"
#include "lp_bld_pack.h"
+#include "lp_bld_swizzle.h"
/**
@@ -101,6 +103,30 @@ lp_build_const_unpack_shuffle(struct gallivm_state *gallivm,
return LLVMConstVector(elems, n);
}
+/**
+ * Similar to lp_build_const_unpack_shuffle but for the special AVX 256-bit unpack.
+ * See comment above lp_build_interleave2_half for more details.
+ */
+static LLVMValueRef
+lp_build_const_unpack_shuffle_half(struct gallivm_state *gallivm,
+ unsigned n, unsigned lo_hi)
+{
+ LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
+ unsigned i, j;
+
+ assert(n <= LP_MAX_VECTOR_LENGTH);
+ assert(lo_hi < 2);
+
+ for (i = 0, j = lo_hi*(n/4); i < n; i += 2, ++j) {
+ if (i == (n / 2))
+ j += n / 4;
+
+ elems[i + 0] = lp_build_const_int32(gallivm, 0 + j);
+ elems[i + 1] = lp_build_const_int32(gallivm, n + j);
+ }
+
+ return LLVMConstVector(elems, n);
+}
/**
* Build shuffle vectors that match PACKxx instructions.
@@ -119,6 +145,71 @@ lp_build_const_pack_shuffle(struct gallivm_state *gallivm, unsigned n)
return LLVMConstVector(elems, n);
}
+/**
+ * Return a vector with elements src[start:start+size].
+ * Most useful for getting half the values out of a 256-bit sized vector,
+ * otherwise it may cause data rearrangement.
+ */
+LLVMValueRef
+lp_build_extract_range(struct gallivm_state *gallivm,
+ LLVMValueRef src,
+ unsigned start,
+ unsigned size)
+{
+ LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
+ unsigned i;
+
+ assert(size <= Elements(elems));
+
+ for (i = 0; i < size; ++i)
+ elems[i] = lp_build_const_int32(gallivm, i + start);
+
+ if (size == 1) {
+ return LLVMBuildExtractElement(gallivm->builder, src, elems[0], "");
+ }
+ else {
+ return LLVMBuildShuffleVector(gallivm->builder, src, src,
+ LLVMConstVector(elems, size), "");
+ }
+}
+
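A typical use, pulling the upper half out of an 8 x f32 AVX vector (sketch;
v8 is assumed to be an existing 8-wide value):

   /* lanes 4..7; on AVX this boils down to a single vextractf128 */
   LLVMValueRef hi = lp_build_extract_range(gallivm, v8, 4, 4);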
+/**
+ * Concatenate several (must be a power of two) vectors of the same type
+ * into a larger one.
+ * Most useful for building up a 256-bit sized vector out of two 128-bit ones.
+ */
+LLVMValueRef
+lp_build_concat(struct gallivm_state *gallivm,
+ LLVMValueRef src[],
+ struct lp_type src_type,
+ unsigned num_vectors)
+{
+ unsigned new_length, i;
+ LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH/2];
+ LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
+
+ assert(src_type.length * num_vectors <= Elements(shuffles));
+ assert(util_is_power_of_two(num_vectors));
+
+ new_length = src_type.length;
+
+ for (i = 0; i < num_vectors; i++)
+ tmp[i] = src[i];
+
+ while (num_vectors > 1) {
+ num_vectors >>= 1;
+ new_length <<= 1;
+ for (i = 0; i < new_length; i++) {
+ shuffles[i] = lp_build_const_int32(gallivm, i);
+ }
+ for (i = 0; i < num_vectors; i++) {
+ tmp[i] = LLVMBuildShuffleVector(gallivm->builder, tmp[i*2], tmp[i*2 + 1],
+ LLVMConstVector(shuffles, new_length), "");
+ }
+ }
+
+ return tmp[0];
+}
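And in the other direction (sketch; lo and hi are assumed 4 x f32 values and
f32x4 an lp_type describing that 4 x f32 layout):

   LLVMValueRef halves[2] = { lo, hi };
   /* stitch the two 128-bit halves into one 8 x f32 vector */
   LLVMValueRef v8 = lp_build_concat(gallivm, halves, f32x4, 2);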
/**
* Interleave vector elements.
@@ -139,6 +230,40 @@ lp_build_interleave2(struct gallivm_state *gallivm,
return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
}
+/**
+ * Interleave vector elements, but for 256-bit vectors:
+ * treat the input as two concatenated 128-bit vectors and interleave those.
+ *
+ * This differs from lp_build_interleave2, which would produce the following
+ * (for lo): a0 b0 a1 b1 a2 b2 a3 b3, a pattern that does not compile into
+ * an AVX unpack instruction.
+ *
+ *
+ * An example: interleaving 8x float with 8x float using AVX 256-bit unpack:
+ * a0 a1 a2 a3 a4 a5 a6 a7 <-> b0 b1 b2 b3 b4 b5 b6 b7
+ *
+ * Equivalent to interleaving 2x 128 bit vectors
+ * a0 a1 a2 a3 <-> b0 b1 b2 b3 concatenated with a4 a5 a6 a7 <-> b4 b5 b6 b7
+ *
+ * So interleave-lo would result in:
+ * a0 b0 a1 b1 a4 b4 a5 b5
+ *
+ * And interleave-hi would result in:
+ * a2 b2 a3 b3 a6 b6 a7 b7
+ */
+LLVMValueRef
+lp_build_interleave2_half(struct gallivm_state *gallivm,
+ struct lp_type type,
+ LLVMValueRef a,
+ LLVMValueRef b,
+ unsigned lo_hi)
+{
+ if (type.length * type.width == 256) {
+ LLVMValueRef shuffle = lp_build_const_unpack_shuffle_half(gallivm, type.length, lo_hi);
+ return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
+ } else {
+ return lp_build_interleave2(gallivm, type, a, b, lo_hi);
+ }
+}
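Spelled out for an 8-wide type, lp_build_const_unpack_shuffle_half generates
the following shuffle indices (a supplies elements 0..7, b supplies 8..15):

   lo_hi == 0:   0  8  1  9   4 12  5 13   ->  a0 b0 a1 b1 a4 b4 a5 b5
   lo_hi == 1:   2 10  3 11   6 14  7 15   ->  a2 b2 a3 b3 a6 b6 a7 b7

which is exactly the per-128-bit-lane behaviour of the AVX
vunpcklps/vunpckhps instructions.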
/**
* Double the bit width.
@@ -237,9 +362,9 @@ lp_build_unpack(struct gallivm_state *gallivm,
* Non-interleaved pack.
*
* This will move values as
- *
- * lo = __ l0 __ l1 __ l2 __.. __ ln
- * hi = __ h0 __ h1 __ h2 __.. __ hn
+ * (LSB) (MSB)
+ * lo = l0 __ l1 __ l2 __.. __ ln __
+ * hi = h0 __ h1 __ h2 __.. __ hn __
* res = l0 l1 l2 .. ln h0 h1 h2 .. hn
*
* This will only change the number of bits the values are represented, not the
@@ -257,12 +382,14 @@ lp_build_pack2(struct gallivm_state *gallivm,
LLVMValueRef hi)
{
LLVMBuilderRef builder = gallivm->builder;
-#if HAVE_LLVM < 0x0207
- LLVMTypeRef src_vec_type = lp_build_vec_type(gallivm, src_type);
-#endif
LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, dst_type);
LLVMValueRef shuffle;
LLVMValueRef res = NULL;
+ struct lp_type intr_type = dst_type;
+
+#if HAVE_LLVM < 0x0207
+ intr_type = src_type;
+#endif
assert(!src_type.floating);
assert(!dst_type.floating);
@@ -270,50 +397,81 @@ lp_build_pack2(struct gallivm_state *gallivm,
assert(src_type.length * 2 == dst_type.length);
/* Check for special cases first */
- if(util_cpu_caps.has_sse2 && src_type.width * src_type.length == 128) {
+ if(util_cpu_caps.has_sse2 && src_type.width * src_type.length >= 128) {
+ const char *intrinsic = NULL;
+
switch(src_type.width) {
case 32:
if(dst_type.sign) {
-#if HAVE_LLVM >= 0x0207
- res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packssdw.128", dst_vec_type, lo, hi);
-#else
- res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packssdw.128", src_vec_type, lo, hi);
-#endif
+ intrinsic = "llvm.x86.sse2.packssdw.128";
}
else {
if (util_cpu_caps.has_sse4_1) {
- return lp_build_intrinsic_binary(builder, "llvm.x86.sse41.packusdw", dst_vec_type, lo, hi);
- }
- else {
- /* use generic shuffle below */
- res = NULL;
+ intrinsic = "llvm.x86.sse41.packusdw";
+#if HAVE_LLVM < 0x0207
+ /* llvm < 2.7 has inconsistent signatures except for packusdw */
+ intr_type = dst_type;
+#endif
}
}
break;
-
case 16:
- if(dst_type.sign)
-#if HAVE_LLVM >= 0x0207
- res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packsswb.128", dst_vec_type, lo, hi);
-#else
- res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packsswb.128", src_vec_type, lo, hi);
-#endif
- else
-#if HAVE_LLVM >= 0x0207
- res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packuswb.128", dst_vec_type, lo, hi);
-#else
- res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packuswb.128", src_vec_type, lo, hi);
-#endif
- break;
-
- default:
- assert(0);
- return LLVMGetUndef(dst_vec_type);
+ if (dst_type.sign) {
+ intrinsic = "llvm.x86.sse2.packsswb.128";
+ }
+ else {
+ intrinsic = "llvm.x86.sse2.packuswb.128";
+ }
break;
+ /* default uses generic shuffle below */
}
-
- if (res) {
- res = LLVMBuildBitCast(builder, res, dst_vec_type, "");
+ if (intrinsic) {
+ if (src_type.width * src_type.length == 128) {
+ LLVMTypeRef intr_vec_type = lp_build_vec_type(gallivm, intr_type);
+ res = lp_build_intrinsic_binary(builder, intrinsic, intr_vec_type, lo, hi);
+ if (dst_vec_type != intr_vec_type) {
+ res = LLVMBuildBitCast(builder, res, dst_vec_type, "");
+ }
+ }
+ else {
+ int num_split = src_type.width * src_type.length / 128;
+ int i;
+ int nlen = 128 / src_type.width;
+ struct lp_type ndst_type = lp_type_unorm(dst_type.width, 128);
+ struct lp_type nintr_type = lp_type_unorm(intr_type.width, 128);
+ LLVMValueRef tmpres[LP_MAX_VECTOR_WIDTH / 128];
+ LLVMValueRef tmplo, tmphi;
+ LLVMTypeRef ndst_vec_type = lp_build_vec_type(gallivm, ndst_type);
+ LLVMTypeRef nintr_vec_type = lp_build_vec_type(gallivm, nintr_type);
+
+ assert(num_split <= LP_MAX_VECTOR_WIDTH / 128);
+
+ for (i = 0; i < num_split / 2; i++) {
+ tmplo = lp_build_extract_range(gallivm,
+ lo, i*nlen*2, nlen);
+ tmphi = lp_build_extract_range(gallivm,
+ lo, i*nlen*2 + nlen, nlen);
+ tmpres[i] = lp_build_intrinsic_binary(builder, intrinsic,
+ nintr_vec_type, tmplo, tmphi);
+ if (ndst_vec_type != nintr_vec_type) {
+ tmpres[i] = LLVMBuildBitCast(builder, tmpres[i], ndst_vec_type, "");
+ }
+ }
+ for (i = 0; i < num_split / 2; i++) {
+ tmplo = lp_build_extract_range(gallivm,
+ hi, i*nlen*2, nlen);
+ tmphi = lp_build_extract_range(gallivm,
+ hi, i*nlen*2 + nlen, nlen);
+ tmpres[i+num_split/2] = lp_build_intrinsic_binary(builder, intrinsic,
+ nintr_vec_type,
+ tmplo, tmphi);
+ if (ndst_vec_type != nintr_vec_type) {
+ tmpres[i+num_split/2] = LLVMBuildBitCast(builder, tmpres[i+num_split/2],
+ ndst_vec_type, "");
+ }
+ }
+ res = lp_build_concat(gallivm, tmpres, ndst_type, num_split);
+ }
return res;
}
}
@@ -357,8 +515,9 @@ lp_build_packs2(struct gallivm_state *gallivm,
/* All X86 SSE non-interleaved pack instructions take signed inputs and
* saturate them, so no need to clamp for those cases. */
if(util_cpu_caps.has_sse2 &&
- src_type.width * src_type.length == 128 &&
- src_type.sign)
+ src_type.width * src_type.length >= 128 &&
+ src_type.sign &&
+ (src_type.width == 32 || src_type.width == 16))
clamp = FALSE;
if(clamp) {
@@ -395,7 +554,6 @@ lp_build_pack(struct gallivm_state *gallivm,
LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
unsigned i;
-
/* Register width must remain constant */
assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
@@ -487,21 +645,44 @@ lp_build_resize(struct gallivm_state *gallivm,
/*
* Register width remains constant -- use vector packing intrinsics
*/
-
tmp[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, src, num_srcs);
}
else {
- /*
- * Do it element-wise.
- */
-
- assert(src_type.length == dst_type.length);
- tmp[0] = lp_build_undef(gallivm, dst_type);
- for (i = 0; i < dst_type.length; ++i) {
- LLVMValueRef index = lp_build_const_int32(gallivm, i);
- LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], index, "");
- val = LLVMBuildTrunc(builder, val, lp_build_elem_type(gallivm, dst_type), "");
- tmp[0] = LLVMBuildInsertElement(builder, tmp[0], val, index, "");
+ if (src_type.width / dst_type.width > num_srcs) {
+ /*
+ * First change src vectors size (with shuffle) so they have the
+ * same size as the destination vector, then pack normally.
+ * Note: cannot use cast/extract because llvm generates atrocious code.
+ */
+ unsigned size_ratio = (src_type.width * src_type.length) /
+ (dst_type.length * dst_type.width);
+ unsigned new_length = src_type.length / size_ratio;
+
+ for (i = 0; i < size_ratio * num_srcs; i++) {
+ unsigned start_index = (i % size_ratio) * new_length;
+ tmp[i] = lp_build_extract_range(gallivm, src[i / size_ratio],
+ start_index, new_length);
+ }
+ num_srcs *= size_ratio;
+ src_type.length = new_length;
+ tmp[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, tmp, num_srcs);
+ }
+ else {
+ /*
+ * Truncate bit width but expand vector size - first pack
+ * then expand simply because this should be more AVX-friendly
+ * for the cases we probably hit.
+ */
+ unsigned size_ratio = (dst_type.width * dst_type.length) /
+ (src_type.length * src_type.width);
+ unsigned num_pack_srcs = num_srcs / size_ratio;
+ dst_type.length = dst_type.length / size_ratio;
+
+ for (i = 0; i < size_ratio; i++) {
+ tmp[i] = lp_build_pack(gallivm, src_type, dst_type, TRUE,
+ &src[i*num_pack_srcs], num_pack_srcs);
+ }
+ tmp[0] = lp_build_concat(gallivm, tmp, dst_type, size_ratio);
}
}
}
@@ -522,19 +703,24 @@ lp_build_resize(struct gallivm_state *gallivm,
/*
* Do it element-wise.
*/
+ assert(src_type.length * num_srcs == dst_type.length * num_dsts);
+
+ for (i = 0; i < num_dsts; i++) {
+ tmp[i] = lp_build_undef(gallivm, dst_type);
+ }
- assert(src_type.length == dst_type.length);
- tmp[0] = lp_build_undef(gallivm, dst_type);
- for (i = 0; i < dst_type.length; ++i) {
- LLVMValueRef index = lp_build_const_int32(gallivm, i);
- LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], index, "");
+ for (i = 0; i < src_type.length; ++i) {
+ unsigned j = i / dst_type.length;
+ LLVMValueRef srcindex = lp_build_const_int32(gallivm, i);
+ LLVMValueRef dstindex = lp_build_const_int32(gallivm, i % dst_type.length);
+ LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], srcindex, "");
if (src_type.sign && dst_type.sign) {
val = LLVMBuildSExt(builder, val, lp_build_elem_type(gallivm, dst_type), "");
} else {
val = LLVMBuildZExt(builder, val, lp_build_elem_type(gallivm, dst_type), "");
}
- tmp[0] = LLVMBuildInsertElement(builder, tmp[0], val, index, "");
+ tmp[j] = LLVMBuildInsertElement(builder, tmp[j], val, dstindex, "");
}
}
}
@@ -554,3 +740,38 @@ lp_build_resize(struct gallivm_state *gallivm,
}
+/**
+ * Expand the src vector from src.length elements to dst_length elements.
+ */
+LLVMValueRef
+lp_build_pad_vector(struct gallivm_state *gallivm,
+ LLVMValueRef src,
+ struct lp_type src_type,
+ unsigned dst_length)
+{
+ LLVMValueRef undef = LLVMGetUndef(lp_build_vec_type(gallivm, src_type));
+ LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
+ unsigned i;
+
+ assert(dst_length <= Elements(elems));
+   assert(dst_length >= src_type.length);
+
+ if (src_type.length == dst_length)
+ return src;
+
+   /* If it's a single scalar type, no need to reinvent the wheel */
+ if (src_type.length == 1) {
+ return lp_build_broadcast(gallivm, LLVMVectorType(lp_build_elem_type(gallivm, src_type), dst_length), src);
+ }
+
+ /* All elements from src vector */
+ for (i = 0; i < src_type.length; ++i)
+ elems[i] = lp_build_const_int32(gallivm, i);
+
+ /* Undef fill remaining space */
+ for (i = src_type.length; i < dst_length; ++i)
+ elems[i] = lp_build_const_int32(gallivm, src_type.length);
+
+ /* Combine the two vectors */
+ return LLVMBuildShuffleVector(gallivm->builder, src, undef, LLVMConstVector(elems, dst_length), "");
+}
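For instance, widening a 4 x i32 value to an 8-wide native width when the
upper lanes will never be read (sketch; src and src_type are assumed to
describe the 4 x i32 input):

   /* result is 8 x i32 with lanes 4..7 undefined; cheaper than
    * inserting zeros when the extra lanes are never used */
   LLVMValueRef padded = lp_build_pad_vector(gallivm, src, src_type, 8);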
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.h b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
index d58da4f01b3..73f299cca11 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
@@ -44,6 +44,12 @@
struct lp_type;
+LLVMValueRef
+lp_build_interleave2_half(struct gallivm_state *gallivm,
+ struct lp_type type,
+ LLVMValueRef a,
+ LLVMValueRef b,
+ unsigned lo_hi);
LLVMValueRef
lp_build_interleave2(struct gallivm_state *gallivm,
@@ -69,6 +75,17 @@ lp_build_unpack(struct gallivm_state *gallivm,
LLVMValueRef src,
LLVMValueRef *dst, unsigned num_dsts);
+LLVMValueRef
+lp_build_extract_range(struct gallivm_state *gallivm,
+ LLVMValueRef src,
+ unsigned start,
+ unsigned size);
+
+LLVMValueRef
+lp_build_concat(struct gallivm_state *gallivm,
+ LLVMValueRef src[],
+ struct lp_type src_type,
+ unsigned num_vectors);
LLVMValueRef
lp_build_packs2(struct gallivm_state *gallivm,
@@ -102,4 +119,10 @@ lp_build_resize(struct gallivm_state *gallivm,
LLVMValueRef *dst, unsigned num_dsts);
+LLVMValueRef
+lp_build_pad_vector(struct gallivm_state *gallivm,
+ LLVMValueRef src,
+ struct lp_type src_type,
+ unsigned dst_length);
+
#endif /* !LP_BLD_PACK_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_quad.c b/src/gallium/auxiliary/gallivm/lp_bld_quad.c
index b0a5bc0267f..b1ba7c72655 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_quad.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_quad.c
@@ -26,6 +26,7 @@
**************************************************************************/
+#include "u_cpu_detect.h"
#include "lp_bld_type.h"
#include "lp_bld_arit.h"
#include "lp_bld_const.h"
@@ -77,34 +78,82 @@ lp_build_ddy(struct lp_build_context *bld,
return lp_build_sub(bld, a_bottom, a_top);
}
-
+/*
+ * To be able to handle multiple quads at once in texture sampling and
+ * do lod calculations per quad, it is necessary to get the per-quad
+ * derivatives into the lp_build_rho function.
+ * For 8-wide vectors the packed derivative values for 3 coords would
+ * look like this; this scales to an arbitrary (multiple of 4) vector size:
+ * ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy
+ * dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____
+ * The second vector will be unused for 1d and 2d textures.
+ */
LLVMValueRef
-lp_build_scalar_ddx(struct lp_build_context *bld,
- LLVMValueRef a)
+lp_build_packed_ddx_ddy_onecoord(struct lp_build_context *bld,
+ LLVMValueRef a)
{
- LLVMBuilderRef builder = bld->gallivm->builder;
- LLVMValueRef idx_left = lp_build_const_int32(bld->gallivm, LP_BLD_QUAD_TOP_LEFT);
- LLVMValueRef idx_right = lp_build_const_int32(bld->gallivm, LP_BLD_QUAD_TOP_RIGHT);
- LLVMValueRef a_left = LLVMBuildExtractElement(builder, a, idx_left, "left");
- LLVMValueRef a_right = LLVMBuildExtractElement(builder, a, idx_right, "right");
+ struct gallivm_state *gallivm = bld->gallivm;
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef vec1, vec2;
+
+ /* same packing as _twocoord, but can use aos swizzle helper */
+
+ /*
+    * XXX could make swizzle1 a no-op swizzle by using the right
+    * top/bottom pair for ddy
+ */
+ static const unsigned char swizzle1[] = {
+ LP_BLD_QUAD_TOP_LEFT, LP_BLD_QUAD_TOP_LEFT,
+ LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+ };
+ static const unsigned char swizzle2[] = {
+ LP_BLD_QUAD_TOP_RIGHT, LP_BLD_QUAD_BOTTOM_LEFT,
+ LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+ };
+
+ vec1 = lp_build_swizzle_aos(bld, a, swizzle1);
+ vec2 = lp_build_swizzle_aos(bld, a, swizzle2);
+
if (bld->type.floating)
- return LLVMBuildFSub(builder, a_right, a_left, "ddx");
+ return LLVMBuildFSub(builder, vec2, vec1, "ddxddy");
else
- return LLVMBuildSub(builder, a_right, a_left, "ddx");
+ return LLVMBuildSub(builder, vec2, vec1, "ddxddy");
}
LLVMValueRef
-lp_build_scalar_ddy(struct lp_build_context *bld,
- LLVMValueRef a)
+lp_build_packed_ddx_ddy_twocoord(struct lp_build_context *bld,
+ LLVMValueRef a, LLVMValueRef b)
{
- LLVMBuilderRef builder = bld->gallivm->builder;
- LLVMValueRef idx_top = lp_build_const_int32(bld->gallivm, LP_BLD_QUAD_TOP_LEFT);
- LLVMValueRef idx_bottom = lp_build_const_int32(bld->gallivm, LP_BLD_QUAD_BOTTOM_LEFT);
- LLVMValueRef a_top = LLVMBuildExtractElement(builder, a, idx_top, "top");
- LLVMValueRef a_bottom = LLVMBuildExtractElement(builder, a, idx_bottom, "bottom");
+ struct gallivm_state *gallivm = bld->gallivm;
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH/4];
+ LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH/4];
+ LLVMValueRef vec1, vec2;
+ unsigned length, num_quads, i;
+
+ /* XXX: do hsub version */
+ length = bld->type.length;
+ num_quads = length / 4;
+ for (i = 0; i < num_quads; i++) {
+ unsigned s1 = 4 * i;
+ unsigned s2 = 4 * i + length;
+ shuffles1[4*i + 0] = lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_LEFT + s1);
+ shuffles1[4*i + 1] = lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_LEFT + s1);
+ shuffles1[4*i + 2] = lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_LEFT + s2);
+ shuffles1[4*i + 3] = lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_LEFT + s2);
+ shuffles2[4*i + 0] = lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_RIGHT + s1);
+ shuffles2[4*i + 1] = lp_build_const_int32(gallivm, LP_BLD_QUAD_BOTTOM_LEFT + s1);
+ shuffles2[4*i + 2] = lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_RIGHT + s2);
+ shuffles2[4*i + 3] = lp_build_const_int32(gallivm, LP_BLD_QUAD_BOTTOM_LEFT + s2);
+ }
+ vec1 = LLVMBuildShuffleVector(builder, a, b,
+ LLVMConstVector(shuffles1, length), "");
+ vec2 = LLVMBuildShuffleVector(builder, a, b,
+ LLVMConstVector(shuffles2, length), "");
if (bld->type.floating)
- return LLVMBuildFSub(builder, a_bottom, a_top, "ddy");
+ return LLVMBuildFSub(builder, vec2, vec1, "ddxddyddxddy");
else
- return LLVMBuildSub(builder, a_bottom, a_top, "ddy");
+ return LLVMBuildSub(builder, vec2, vec1, "ddxddyddxddy");
}
+
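Reading the shuffles above, each quad's 4-element slot of the result works
out as follows (s taken from a, t from b):

   vec1        =  s(TL)  s(TL)  t(TL)  t(TL)
   vec2        =  s(TR)  s(BL)  t(TR)  t(BL)
   vec2 - vec1 =  ds/dx  ds/dy  dt/dx  dt/dy

which matches the packed layout described in the comment above.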
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_quad.h b/src/gallium/auxiliary/gallivm/lp_bld_quad.h
index b7992912927..be6a1efc396 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_quad.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_quad.h
@@ -78,19 +78,15 @@ lp_build_ddy(struct lp_build_context *bld,
/*
- * Scalar derivatives.
- *
- * Same as getting the first value of above.
+ * Packed derivatives (one derivative for each direction per quad)
*/
-
LLVMValueRef
-lp_build_scalar_ddx(struct lp_build_context *bld,
- LLVMValueRef a);
-
+lp_build_packed_ddx_ddy_twocoord(struct lp_build_context *bld,
+ LLVMValueRef a, LLVMValueRef b);
LLVMValueRef
-lp_build_scalar_ddy(struct lp_build_context *bld,
- LLVMValueRef a);
+lp_build_packed_ddx_ddy_onecoord(struct lp_build_context *bld,
+ LLVMValueRef a);
#endif /* LP_BLD_QUAD_H_ */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index d966788d74e..85211161f3c 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -44,6 +44,8 @@
#include "lp_bld_sample.h"
#include "lp_bld_swizzle.h"
#include "lp_bld_type.h"
+#include "lp_bld_logic.h"
+#include "lp_bld_pack.h"
/*
@@ -175,67 +177,89 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
/**
* Generate code to compute coordinate gradient (rho).
- * \param ddx partial derivatives of (s, t, r, q) with respect to X
- * \param ddy partial derivatives of (s, t, r, q) with respect to Y
+ * \param derivs partial derivatives of (s, t, r, q) with respect to X and Y
*
- * XXX: The resulting rho is scalar, so we ignore all but the first element of
- * derivatives that are passed by the shader.
+ * The resulting rho is scalar per quad.
*/
static LLVMValueRef
lp_build_rho(struct lp_build_sample_context *bld,
unsigned unit,
- const LLVMValueRef ddx[4],
- const LLVMValueRef ddy[4])
+ const struct lp_derivatives *derivs)
{
+ struct gallivm_state *gallivm = bld->gallivm;
struct lp_build_context *int_size_bld = &bld->int_size_bld;
struct lp_build_context *float_size_bld = &bld->float_size_bld;
struct lp_build_context *float_bld = &bld->float_bld;
+ struct lp_build_context *coord_bld = &bld->coord_bld;
+ struct lp_build_context *perquadf_bld = &bld->perquadf_bld;
+ const LLVMValueRef *ddx_ddy = derivs->ddx_ddy;
const unsigned dims = bld->dims;
LLVMBuilderRef builder = bld->gallivm->builder;
LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
LLVMValueRef index1 = LLVMConstInt(i32t, 1, 0);
LLVMValueRef index2 = LLVMConstInt(i32t, 2, 0);
- LLVMValueRef dsdx, dsdy, dtdx, dtdy, drdx, drdy;
- LLVMValueRef rho_x, rho_y;
LLVMValueRef rho_vec;
LLVMValueRef int_size, float_size;
LLVMValueRef rho;
LLVMValueRef first_level, first_level_vec;
+ LLVMValueRef abs_ddx_ddy[2];
+ unsigned length = coord_bld->type.length;
+ unsigned num_quads = length / 4;
+ unsigned i;
+ LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
+ LLVMValueRef rho_xvec, rho_yvec;
+
+ abs_ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]);
+ if (dims > 2) {
+ abs_ddx_ddy[1] = lp_build_abs(coord_bld, ddx_ddy[1]);
+ }
- dsdx = ddx[0];
- dsdy = ddy[0];
-
- if (dims <= 1) {
- rho_x = dsdx;
- rho_y = dsdy;
+ if (dims == 1) {
+ static const unsigned char swizzle1[] = {
+ 0, LP_BLD_SWIZZLE_DONTCARE,
+ LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+ };
+ static const unsigned char swizzle2[] = {
+ 1, LP_BLD_SWIZZLE_DONTCARE,
+ LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+ };
+ rho_xvec = lp_build_swizzle_aos(coord_bld, abs_ddx_ddy[0], swizzle1);
+ rho_yvec = lp_build_swizzle_aos(coord_bld, abs_ddx_ddy[0], swizzle2);
+ }
+ else if (dims == 2) {
+ static const unsigned char swizzle1[] = {
+ 0, 2,
+ LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+ };
+ static const unsigned char swizzle2[] = {
+ 1, 3,
+ LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+ };
+ rho_xvec = lp_build_swizzle_aos(coord_bld, abs_ddx_ddy[0], swizzle1);
+ rho_yvec = lp_build_swizzle_aos(coord_bld, abs_ddx_ddy[0], swizzle2);
}
else {
- rho_x = float_size_bld->undef;
- rho_y = float_size_bld->undef;
-
- rho_x = LLVMBuildInsertElement(builder, rho_x, dsdx, index0, "");
- rho_y = LLVMBuildInsertElement(builder, rho_y, dsdy, index0, "");
-
- dtdx = ddx[1];
- dtdy = ddy[1];
-
- rho_x = LLVMBuildInsertElement(builder, rho_x, dtdx, index1, "");
- rho_y = LLVMBuildInsertElement(builder, rho_y, dtdy, index1, "");
-
- if (dims >= 3) {
- drdx = ddx[2];
- drdy = ddy[2];
-
- rho_x = LLVMBuildInsertElement(builder, rho_x, drdx, index2, "");
- rho_y = LLVMBuildInsertElement(builder, rho_y, drdy, index2, "");
+ LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH];
+ LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH];
+ assert(dims == 3);
+ for (i = 0; i < num_quads; i++) {
+ shuffles1[4*i + 0] = lp_build_const_int32(gallivm, 4*i);
+ shuffles1[4*i + 1] = lp_build_const_int32(gallivm, 4*i + 2);
+ shuffles1[4*i + 2] = lp_build_const_int32(gallivm, length + 4*i);
+ shuffles1[4*i + 3] = i32undef;
+ shuffles2[4*i + 0] = lp_build_const_int32(gallivm, 4*i + 1);
+ shuffles2[4*i + 1] = lp_build_const_int32(gallivm, 4*i + 3);
+ shuffles2[4*i + 2] = lp_build_const_int32(gallivm, length + 4*i + 1);
+ shuffles2[4*i + 3] = i32undef;
}
+ rho_xvec = LLVMBuildShuffleVector(builder, abs_ddx_ddy[0], abs_ddx_ddy[1],
+ LLVMConstVector(shuffles1, length), "");
+ rho_yvec = LLVMBuildShuffleVector(builder, abs_ddx_ddy[0], abs_ddx_ddy[1],
+ LLVMConstVector(shuffles2, length), "");
}
- rho_x = lp_build_abs(float_size_bld, rho_x);
- rho_y = lp_build_abs(float_size_bld, rho_y);
-
- rho_vec = lp_build_max(float_size_bld, rho_x, rho_y);
+ rho_vec = lp_build_max(coord_bld, rho_xvec, rho_yvec);
first_level = bld->dynamic_state->first_level(bld->dynamic_state,
bld->gallivm, unit);
@@ -243,22 +267,77 @@ lp_build_rho(struct lp_build_sample_context *bld,
int_size = lp_build_minify(int_size_bld, bld->int_size, first_level_vec);
float_size = lp_build_int_to_float(float_size_bld, int_size);
- rho_vec = lp_build_mul(float_size_bld, rho_vec, float_size);
+ if (bld->coord_type.length > 4) {
+ /* expand size to each quad */
+ if (dims > 1) {
+ /* could use some broadcast_vector helper for this? */
+ int num_quads = bld->coord_type.length / 4;
+ LLVMValueRef src[LP_MAX_VECTOR_LENGTH/4];
+ for (i = 0; i < num_quads; i++) {
+ src[i] = float_size;
+ }
+ float_size = lp_build_concat(bld->gallivm, src, float_size_bld->type, num_quads);
+ }
+ else {
+ float_size = lp_build_broadcast_scalar(coord_bld, float_size);
+ }
+ rho_vec = lp_build_mul(coord_bld, rho_vec, float_size);
- if (dims <= 1) {
- rho = rho_vec;
+ if (dims <= 1) {
+ rho = rho_vec;
+ }
+ else {
+ if (dims >= 2) {
+ static const unsigned char swizzle1[] = {
+ 0, LP_BLD_SWIZZLE_DONTCARE,
+ LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+ };
+ static const unsigned char swizzle2[] = {
+ 1, LP_BLD_SWIZZLE_DONTCARE,
+ LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+ };
+ LLVMValueRef rho_s, rho_t, rho_r;
+
+ rho_s = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);
+ rho_t = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle2);
+
+ rho = lp_build_max(coord_bld, rho_s, rho_t);
+
+ if (dims >= 3) {
+ static const unsigned char swizzle3[] = {
+ 2, LP_BLD_SWIZZLE_DONTCARE,
+ LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+ };
+ rho_r = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle3);
+ rho = lp_build_max(coord_bld, rho, rho_r);
+ }
+ }
+ }
+ rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
+ perquadf_bld->type, rho);
}
else {
- if (dims >= 2) {
- LLVMValueRef rho_s, rho_t, rho_r;
+ if (dims <= 1) {
+ rho_vec = LLVMBuildExtractElement(builder, rho_vec, index0, "");
+ }
+ rho_vec = lp_build_mul(float_size_bld, rho_vec, float_size);
- rho_s = LLVMBuildExtractElement(builder, rho_vec, index0, "");
- rho_t = LLVMBuildExtractElement(builder, rho_vec, index1, "");
+ if (dims <= 1) {
+ rho = rho_vec;
+ }
+ else {
+ if (dims >= 2) {
+ LLVMValueRef rho_s, rho_t, rho_r;
+
+ rho_s = LLVMBuildExtractElement(builder, rho_vec, index0, "");
+ rho_t = LLVMBuildExtractElement(builder, rho_vec, index1, "");
- rho = lp_build_max(float_bld, rho_s, rho_t);
- if (dims >= 3) {
- rho_r = LLVMBuildExtractElement(builder, rho_vec, index2, "");
- rho = lp_build_max(float_bld, rho, rho_r);
+ rho = lp_build_max(float_bld, rho_s, rho_t);
+
+ if (dims >= 3) {
+ rho_r = LLVMBuildExtractElement(builder, rho_vec, index2, "");
+ rho = lp_build_max(float_bld, rho, rho_r);
+ }
}
}
}
@@ -396,22 +475,20 @@ lp_build_brilinear_rho(struct lp_build_context *bld,
/**
* Generate code to compute texture level of detail (lambda).
- * \param ddx partial derivatives of (s, t, r, q) with respect to X
- * \param ddy partial derivatives of (s, t, r, q) with respect to Y
+ * \param derivs partial derivatives of (s, t, r, q) with respect to X and Y
* \param lod_bias optional float vector with the shader lod bias
* \param explicit_lod optional float vector with the explicit lod
* \param width scalar int texture width
* \param height scalar int texture height
* \param depth scalar int texture depth
*
- * XXX: The resulting lod is scalar, so ignore all but the first element of
- * derivatives, lod_bias, etc that are passed by the shader.
+ * The resulting lod is scalar per quad, so only the first value per quad
+ * passed in from lod_bias and explicit_lod is used.
*/
void
lp_build_lod_selector(struct lp_build_sample_context *bld,
unsigned unit,
- const LLVMValueRef ddx[4],
- const LLVMValueRef ddy[4],
+ const struct lp_derivatives *derivs,
LLVMValueRef lod_bias, /* optional */
LLVMValueRef explicit_lod, /* optional */
unsigned mip_filter,
@@ -420,11 +497,11 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
{
LLVMBuilderRef builder = bld->gallivm->builder;
- struct lp_build_context *float_bld = &bld->float_bld;
+ struct lp_build_context *perquadf_bld = &bld->perquadf_bld;
LLVMValueRef lod;
- *out_lod_ipart = bld->int_bld.zero;
- *out_lod_fpart = bld->float_bld.zero;
+ *out_lod_ipart = bld->perquadi_bld.zero;
+ *out_lod_fpart = perquadf_bld->zero;
if (bld->static_state->min_max_lod_equal) {
/* User is forcing sampling from a particular mipmap level.
@@ -433,21 +510,17 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
LLVMValueRef min_lod =
bld->dynamic_state->min_lod(bld->dynamic_state, bld->gallivm, unit);
- lod = min_lod;
+ lod = lp_build_broadcast_scalar(perquadf_bld, min_lod);
}
else {
- LLVMValueRef sampler_lod_bias =
- bld->dynamic_state->lod_bias(bld->dynamic_state, bld->gallivm, unit);
- LLVMValueRef index0 = lp_build_const_int32(bld->gallivm, 0);
-
if (explicit_lod) {
- lod = LLVMBuildExtractElement(builder, explicit_lod,
- index0, "");
+ lod = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
+ perquadf_bld->type, explicit_lod);
}
else {
LLVMValueRef rho;
- rho = lp_build_rho(bld, unit, ddx, ddy);
+ rho = lp_build_rho(bld, unit, derivs);
/*
* Compute lod = log2(rho)
@@ -465,66 +538,72 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
if (mip_filter == PIPE_TEX_MIPFILTER_NONE ||
mip_filter == PIPE_TEX_MIPFILTER_NEAREST) {
- *out_lod_ipart = lp_build_ilog2(float_bld, rho);
- *out_lod_fpart = bld->float_bld.zero;
+ *out_lod_ipart = lp_build_ilog2(perquadf_bld, rho);
+ *out_lod_fpart = perquadf_bld->zero;
return;
}
if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR &&
!(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) {
- lp_build_brilinear_rho(float_bld, rho, BRILINEAR_FACTOR,
+ lp_build_brilinear_rho(perquadf_bld, rho, BRILINEAR_FACTOR,
out_lod_ipart, out_lod_fpart);
return;
}
}
if (0) {
- lod = lp_build_log2(float_bld, rho);
+ lod = lp_build_log2(perquadf_bld, rho);
}
else {
- lod = lp_build_fast_log2(float_bld, rho);
+ lod = lp_build_fast_log2(perquadf_bld, rho);
}
/* add shader lod bias */
if (lod_bias) {
- lod_bias = LLVMBuildExtractElement(builder, lod_bias,
- index0, "");
+ lod_bias = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
+ perquadf_bld->type, lod_bias);
lod = LLVMBuildFAdd(builder, lod, lod_bias, "shader_lod_bias");
}
}
/* add sampler lod bias */
- if (bld->static_state->lod_bias_non_zero)
+ if (bld->static_state->lod_bias_non_zero) {
+ LLVMValueRef sampler_lod_bias =
+ bld->dynamic_state->lod_bias(bld->dynamic_state, bld->gallivm, unit);
+ sampler_lod_bias = lp_build_broadcast_scalar(perquadf_bld,
+ sampler_lod_bias);
lod = LLVMBuildFAdd(builder, lod, sampler_lod_bias, "sampler_lod_bias");
-
+ }
/* clamp lod */
if (bld->static_state->apply_max_lod) {
LLVMValueRef max_lod =
bld->dynamic_state->max_lod(bld->dynamic_state, bld->gallivm, unit);
+ max_lod = lp_build_broadcast_scalar(perquadf_bld, max_lod);
- lod = lp_build_min(float_bld, lod, max_lod);
+ lod = lp_build_min(perquadf_bld, lod, max_lod);
}
if (bld->static_state->apply_min_lod) {
LLVMValueRef min_lod =
bld->dynamic_state->min_lod(bld->dynamic_state, bld->gallivm, unit);
+ min_lod = lp_build_broadcast_scalar(perquadf_bld, min_lod);
- lod = lp_build_max(float_bld, lod, min_lod);
+ lod = lp_build_max(perquadf_bld, lod, min_lod);
}
}
if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
if (!(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) {
- lp_build_brilinear_lod(float_bld, lod, BRILINEAR_FACTOR,
+ lp_build_brilinear_lod(perquadf_bld, lod, BRILINEAR_FACTOR,
out_lod_ipart, out_lod_fpart);
}
else {
- lp_build_ifloor_fract(float_bld, lod, out_lod_ipart, out_lod_fpart);
+ lp_build_ifloor_fract(perquadf_bld, lod, out_lod_ipart, out_lod_fpart);
}
lp_build_name(*out_lod_fpart, "lod_fpart");
}
else {
- *out_lod_ipart = lp_build_iround(float_bld, lod);
+ *out_lod_ipart = lp_build_iround(perquadf_bld, lod);
}
lp_build_name(*out_lod_ipart, "lod_ipart");
@@ -536,8 +615,8 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
/**
* For PIPE_TEX_MIPFILTER_NEAREST, convert float LOD to integer
* mipmap level index.
- * Note: this is all scalar code.
- * \param lod scalar float texture level of detail
+ * Note: this is all scalar per quad code.
+ * \param lod_ipart int texture level of detail
* \param level_out returns integer
*/
void
@@ -546,26 +625,27 @@ lp_build_nearest_mip_level(struct lp_build_sample_context *bld,
LLVMValueRef lod_ipart,
LLVMValueRef *level_out)
{
- struct lp_build_context *int_bld = &bld->int_bld;
+ struct lp_build_context *perquadi_bld = &bld->perquadi_bld;
LLVMValueRef first_level, last_level, level;
first_level = bld->dynamic_state->first_level(bld->dynamic_state,
bld->gallivm, unit);
last_level = bld->dynamic_state->last_level(bld->dynamic_state,
bld->gallivm, unit);
+ first_level = lp_build_broadcast_scalar(perquadi_bld, first_level);
+ last_level = lp_build_broadcast_scalar(perquadi_bld, last_level);
- /* convert float lod to integer */
- level = lp_build_add(int_bld, lod_ipart, first_level);
+ level = lp_build_add(perquadi_bld, lod_ipart, first_level);
/* clamp level to legal range of levels */
- *level_out = lp_build_clamp(int_bld, level, first_level, last_level);
+ *level_out = lp_build_clamp(perquadi_bld, level, first_level, last_level);
}
/**
- * For PIPE_TEX_MIPFILTER_LINEAR, convert float LOD to integer to
- * two (adjacent) mipmap level indexes. Later, we'll sample from those
- * two mipmap levels and interpolate between them.
+ * For PIPE_TEX_MIPFILTER_LINEAR, convert per-quad int LOD(s) to two (per-quad)
+ * (adjacent) mipmap level indexes, and fix up float lod part accordingly.
+ * Later, we'll sample from those two mipmap levels and interpolate between them.
*/
void
lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
@@ -576,20 +656,21 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
LLVMValueRef *level1_out)
{
LLVMBuilderRef builder = bld->gallivm->builder;
- struct lp_build_context *int_bld = &bld->int_bld;
- struct lp_build_context *float_bld = &bld->float_bld;
+ struct lp_build_context *perquadi_bld = &bld->perquadi_bld;
+ struct lp_build_context *perquadf_bld = &bld->perquadf_bld;
LLVMValueRef first_level, last_level;
LLVMValueRef clamp_min;
LLVMValueRef clamp_max;
first_level = bld->dynamic_state->first_level(bld->dynamic_state,
bld->gallivm, unit);
-
- *level0_out = lp_build_add(int_bld, lod_ipart, first_level);
- *level1_out = lp_build_add(int_bld, *level0_out, int_bld->one);
-
last_level = bld->dynamic_state->last_level(bld->dynamic_state,
bld->gallivm, unit);
+ first_level = lp_build_broadcast_scalar(perquadi_bld, first_level);
+ last_level = lp_build_broadcast_scalar(perquadi_bld, last_level);
+
+ *level0_out = lp_build_add(perquadi_bld, lod_ipart, first_level);
+ *level1_out = lp_build_add(perquadi_bld, *level0_out, perquadi_bld->one);
/*
* Clamp both *level0_out and *level1_out to [first_level, last_level], with
@@ -597,6 +678,15 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
* ends in the process.
*/
+ /*
+    * This code (vector select in particular) only works with llvm 3.1 or
+    * later (if there's more than one quad, with the x86 backend). Might
+    * consider converting to our lp_bld_logic helpers.
+ */
+#if HAVE_LLVM < 0x0301
+ assert(perquadi_bld->type.length == 1);
+#endif
+
/* *level0_out < first_level */
clamp_min = LLVMBuildICmp(builder, LLVMIntSLT,
*level0_out, first_level,
@@ -609,7 +699,7 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
first_level, *level1_out, "");
*lod_fpart_inout = LLVMBuildSelect(builder, clamp_min,
- float_bld->zero, *lod_fpart_inout, "");
+ perquadf_bld->zero, *lod_fpart_inout, "");
/* *level0_out >= last_level */
clamp_max = LLVMBuildICmp(builder, LLVMIntSGE,
@@ -623,7 +713,7 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
last_level, *level1_out, "");
*lod_fpart_inout = LLVMBuildSelect(builder, clamp_max,
- float_bld->zero, *lod_fpart_inout, "");
+ perquadf_bld->zero, *lod_fpart_inout, "");
lp_build_name(*level0_out, "sampler%u_miplevel0", unit);
lp_build_name(*level1_out, "sampler%u_miplevel1", unit);
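/*
 * A minimal scalar sketch of the clamping above, mirroring the vector
 * selects: whenever a level gets clamped, the lerp weight is zeroed so
 * only the clamped level contributes. Hypothetical helper.
 */
static void linear_mip_levels(int lod_ipart, float lod_fpart,
                              int first_level, int last_level,
                              int *level0, int *level1, float *weight)
{
   *level0 = lod_ipart + first_level;
   *level1 = *level0 + 1;
   *weight = lod_fpart;
   if (*level0 < first_level) {        /* *level0_out < first_level */
      *level0 = *level1 = first_level;
      *weight = 0.0f;
   }
   if (*level0 >= last_level) {        /* *level0_out >= last_level */
      *level0 = *level1 = last_level;
      *weight = 0.0f;
   }
}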
@@ -651,15 +741,6 @@ lp_build_get_mipmap_level(struct lp_build_sample_context *bld,
}
-LLVMValueRef
-lp_build_get_const_mipmap_level(struct lp_build_sample_context *bld,
- int level)
-{
- LLVMValueRef lvl = lp_build_const_int32(bld->gallivm, level);
- return lp_build_get_mipmap_level(bld, lvl);
-}
-
-
/**
* Codegen equivalent for u_minify().
* Return max(1, base_size >> level);
@@ -748,8 +829,7 @@ lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
* bld->int_size_type or bld->float_size_type)
* @param coord_type type of the texture size vector (either
* bld->int_coord_type or bld->coord_type)
- * @param int_size vector with the integer texture size (width, height,
- * depth)
+ * @param size vector with the texture size (width, height, depth)
*/
void
lp_build_extract_image_sizes(struct lp_build_sample_context *bld,
@@ -788,7 +868,7 @@ lp_build_extract_image_sizes(struct lp_build_sample_context *bld,
/**
* Unnormalize coords.
*
- * @param int_size vector with the integer texture size (width, height, depth)
+ * @param flt_size vector with the integer texture size (width, height, depth)
*/
void
lp_build_unnormalized_coords(struct lp_build_sample_context *bld,
@@ -823,7 +903,18 @@ lp_build_unnormalized_coords(struct lp_build_sample_context *bld,
/** Helper used by lp_build_cube_lookup() */
static LLVMValueRef
-lp_build_cube_ima(struct lp_build_context *coord_bld, LLVMValueRef coord)
+lp_build_cube_imapos(struct lp_build_context *coord_bld, LLVMValueRef coord)
+{
+ /* ima = +0.5 / abs(coord); */
+ LLVMValueRef posHalf = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, 0.5);
+ LLVMValueRef absCoord = lp_build_abs(coord_bld, coord);
+ LLVMValueRef ima = lp_build_div(coord_bld, posHalf, absCoord);
+ return ima;
+}
+
+/** Helper used by lp_build_cube_lookup() */
+static LLVMValueRef
+lp_build_cube_imaneg(struct lp_build_context *coord_bld, LLVMValueRef coord)
{
/* ima = -0.5 / abs(coord); */
LLVMValueRef negHalf = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, -0.5);
@@ -832,9 +923,12 @@ lp_build_cube_ima(struct lp_build_context *coord_bld, LLVMValueRef coord)
return ima;
}
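/*
 * A minimal scalar sketch of the two helpers above: the reciprocal
 * major-axis magnitude used to project cube map coords, with the
 * positive and negative 0.5 matching the two variants.
 */
#include <math.h>

static float cube_imapos(float coord) { return  0.5f / fabsf(coord); }
static float cube_imaneg(float coord) { return -0.5f / fabsf(coord); }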
-
/**
* Helper used by lp_build_cube_lookup()
+ * FIXME: the sign here can also be 0.
+ * Arithmetically this could definitely make a difference. Either
+ * fix the comment or use another (simpler) sign function; not sure
+ * which it should be.
* \param sign scalar +1 or -1
* \param coord float vector
* \param ima float vector
@@ -898,58 +992,186 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
LLVMValueRef *face_s,
LLVMValueRef *face_t)
{
- struct lp_build_context *float_bld = &bld->float_bld;
struct lp_build_context *coord_bld = &bld->coord_bld;
LLVMBuilderRef builder = bld->gallivm->builder;
+ struct gallivm_state *gallivm = bld->gallivm;
LLVMValueRef rx, ry, rz;
- LLVMValueRef arx, ary, arz;
- LLVMValueRef c25 = lp_build_const_float(bld->gallivm, 0.25);
- LLVMValueRef arx_ge_ary, arx_ge_arz;
- LLVMValueRef ary_ge_arx, ary_ge_arz;
- LLVMValueRef arx_ge_ary_arz, ary_ge_arx_arz;
-
- assert(bld->coord_bld.type.length == 4);
+ LLVMValueRef tmp[4], rxyz, arxyz;
/*
* Use the average of the four pixel's texcoords to choose the face.
+ * Slight simplification: just calculate the sum, skip the scaling.
*/
- rx = lp_build_mul(float_bld, c25,
- lp_build_sum_vector(&bld->coord_bld, s));
- ry = lp_build_mul(float_bld, c25,
- lp_build_sum_vector(&bld->coord_bld, t));
- rz = lp_build_mul(float_bld, c25,
- lp_build_sum_vector(&bld->coord_bld, r));
+ tmp[0] = s;
+ tmp[1] = t;
+ tmp[2] = r;
+ rxyz = lp_build_hadd_partial4(&bld->coord_bld, tmp, 3);
+ arxyz = lp_build_abs(&bld->coord_bld, rxyz);
+
+ if (coord_bld->type.length > 4) {
+ struct lp_build_context *cint_bld = &bld->int_coord_bld;
+ struct lp_type intctype = cint_bld->type;
+ LLVMValueRef signrxs, signrys, signrzs, signrxyz, sign;
+ LLVMValueRef arxs, arys, arzs;
+ LLVMValueRef arx_ge_ary, maxarxsarys, arz_ge_arx_ary;
+ LLVMValueRef snewx, tnewx, snewy, tnewy, snewz, tnewz;
+ LLVMValueRef ryneg, rzneg;
+ LLVMValueRef ma, ima;
+ LLVMValueRef posHalf = lp_build_const_vec(gallivm, coord_bld->type, 0.5);
+ LLVMValueRef signmask = lp_build_const_int_vec(gallivm, intctype,
+ 1 << (intctype.width - 1));
+ LLVMValueRef signshift = lp_build_const_int_vec(gallivm, intctype,
+ intctype.width - 1);
+ LLVMValueRef facex = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_X);
+ LLVMValueRef facey = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Y);
+ LLVMValueRef facez = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Z);
+
+ assert(PIPE_TEX_FACE_NEG_X == PIPE_TEX_FACE_POS_X + 1);
+ assert(PIPE_TEX_FACE_NEG_Y == PIPE_TEX_FACE_POS_Y + 1);
+ assert(PIPE_TEX_FACE_NEG_Z == PIPE_TEX_FACE_POS_Z + 1);
+
+ rx = LLVMBuildBitCast(builder, s, lp_build_vec_type(gallivm, intctype), "");
+ ry = LLVMBuildBitCast(builder, t, lp_build_vec_type(gallivm, intctype), "");
+ rz = LLVMBuildBitCast(builder, r, lp_build_vec_type(gallivm, intctype), "");
+ ryneg = LLVMBuildXor(builder, ry, signmask, "");
+ rzneg = LLVMBuildXor(builder, rz, signmask, "");
+
+ /* the sign bit comes from the averaged vector (per quad),
+ * as does the decision about which face to use */
+ signrxyz = LLVMBuildBitCast(builder, rxyz, lp_build_vec_type(gallivm, intctype), "");
+ signrxyz = LLVMBuildAnd(builder, signrxyz, signmask, "");
+
+ arxs = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 0);
+ arys = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 1);
+ arzs = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 2);
- arx = lp_build_abs(float_bld, rx);
- ary = lp_build_abs(float_bld, ry);
- arz = lp_build_abs(float_bld, rz);
+ /*
+ * select x if x >= y else select y
+ * select previous result if y >= max(x,y) else select z
+ */
+ arx_ge_ary = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, arxs, arys);
+ maxarxsarys = lp_build_max(coord_bld, arxs, arys);
+ arz_ge_arx_ary = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, maxarxsarys, arzs);
- /*
- * Compare sign/magnitude of rx,ry,rz to determine face
- */
- arx_ge_ary = LLVMBuildFCmp(builder, LLVMRealUGE, arx, ary, "");
- arx_ge_arz = LLVMBuildFCmp(builder, LLVMRealUGE, arx, arz, "");
- ary_ge_arx = LLVMBuildFCmp(builder, LLVMRealUGE, ary, arx, "");
- ary_ge_arz = LLVMBuildFCmp(builder, LLVMRealUGE, ary, arz, "");
+ /*
+ * compute all possible new s/t coords
+ * snewx = signrx * -rz;
+ * tnewx = -ry;
+ * snewy = rx;
+ * tnewy = signry * rz;
+ * snewz = signrz * rx;
+ * tnewz = -ry;
+ */
+ signrxs = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 0);
+ snewx = LLVMBuildXor(builder, signrxs, rzneg, "");
+ tnewx = ryneg;
+
+ signrys = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 1);
+ snewy = rx;
+ tnewy = LLVMBuildXor(builder, signrys, rz, "");
+
+ signrzs = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 2);
+ snewz = LLVMBuildXor(builder, signrzs, rx, "");
+ tnewz = ryneg;
+
+ /* XXX on x86 unclear if we should cast the values back to float
+ * or not - on some cpus (nehalem) pblendvb has twice the throughput
+ * of blendvps though on others there just might be domain
+ * transition penalties when using it (this depends on what llvm
+ * will choose for the bit ops above, so there appears to be no "right way",
+ * but given the boatload of selects let's just use the int type).
+ *
+ * Unfortunately we also need the sign bit of the summed coords.
+ */
+ *face_s = lp_build_select(cint_bld, arx_ge_ary, snewx, snewy);
+ *face_t = lp_build_select(cint_bld, arx_ge_ary, tnewx, tnewy);
+ ma = lp_build_select(coord_bld, arx_ge_ary, s, t);
+ *face = lp_build_select(cint_bld, arx_ge_ary, facex, facey);
+ sign = lp_build_select(cint_bld, arx_ge_ary, signrxs, signrys);
+
+ *face_s = lp_build_select(cint_bld, arz_ge_arx_ary, *face_s, snewz);
+ *face_t = lp_build_select(cint_bld, arz_ge_arx_ary, *face_t, tnewz);
+ ma = lp_build_select(coord_bld, arz_ge_arx_ary, ma, r);
+ *face = lp_build_select(cint_bld, arz_ge_arx_ary, *face, facez);
+ sign = lp_build_select(cint_bld, arz_ge_arx_ary, sign, signrzs);
+
+ *face_s = LLVMBuildBitCast(builder, *face_s,
+ lp_build_vec_type(gallivm, coord_bld->type), "");
+ *face_t = LLVMBuildBitCast(builder, *face_t,
+ lp_build_vec_type(gallivm, coord_bld->type), "");
+
+ /* add +1 for neg face */
+ /* XXX with AVX probably want to use another select here -
+ * as long as we ensure vblendvps gets used we can actually
+ * skip the comparison and just use sign as a "mask" directly.
+ */
+ sign = LLVMBuildLShr(builder, sign, signshift, "");
+ *face = LLVMBuildOr(builder, *face, sign, "face");
- arx_ge_ary_arz = LLVMBuildAnd(builder, arx_ge_ary, arx_ge_arz, "");
- ary_ge_arx_arz = LLVMBuildAnd(builder, ary_ge_arx, ary_ge_arz, "");
+ ima = lp_build_cube_imapos(coord_bld, ma);
+
+ *face_s = lp_build_mul(coord_bld, *face_s, ima);
+ *face_s = lp_build_add(coord_bld, *face_s, posHalf);
+ *face_t = lp_build_mul(coord_bld, *face_t, ima);
+ *face_t = lp_build_add(coord_bld, *face_t, posHalf);
+ }
- {
+ else {
struct lp_build_if_state if_ctx;
LLVMValueRef face_s_var;
LLVMValueRef face_t_var;
LLVMValueRef face_var;
-
- face_s_var = lp_build_alloca(bld->gallivm, bld->coord_bld.vec_type, "face_s_var");
- face_t_var = lp_build_alloca(bld->gallivm, bld->coord_bld.vec_type, "face_t_var");
- face_var = lp_build_alloca(bld->gallivm, bld->int_bld.vec_type, "face_var");
-
- lp_build_if(&if_ctx, bld->gallivm, arx_ge_ary_arz);
+ LLVMValueRef arx_ge_ary_arz, ary_ge_arx_arz;
+ LLVMValueRef shuffles[4];
+ LLVMValueRef arxy_ge_aryx, arxy_ge_arzz, arxy_ge_arxy_arzz;
+ LLVMValueRef arxyxy, aryxzz, arxyxy_ge_aryxzz;
+ struct lp_build_context *float_bld = &bld->float_bld;
+
+ assert(bld->coord_bld.type.length == 4);
+
+ shuffles[0] = lp_build_const_int32(gallivm, 0);
+ shuffles[1] = lp_build_const_int32(gallivm, 1);
+ shuffles[2] = lp_build_const_int32(gallivm, 0);
+ shuffles[3] = lp_build_const_int32(gallivm, 1);
+ arxyxy = LLVMBuildShuffleVector(builder, arxyz, arxyz, LLVMConstVector(shuffles, 4), "");
+ shuffles[0] = lp_build_const_int32(gallivm, 1);
+ shuffles[1] = lp_build_const_int32(gallivm, 0);
+ shuffles[2] = lp_build_const_int32(gallivm, 2);
+ shuffles[3] = lp_build_const_int32(gallivm, 2);
+ aryxzz = LLVMBuildShuffleVector(builder, arxyz, arxyz, LLVMConstVector(shuffles, 4), "");
+ arxyxy_ge_aryxzz = lp_build_cmp(&bld->coord_bld, PIPE_FUNC_GEQUAL, arxyxy, aryxzz);
+
+ shuffles[0] = lp_build_const_int32(gallivm, 0);
+ shuffles[1] = lp_build_const_int32(gallivm, 1);
+ arxy_ge_aryx = LLVMBuildShuffleVector(builder, arxyxy_ge_aryxzz, arxyxy_ge_aryxzz,
+ LLVMConstVector(shuffles, 2), "");
+ shuffles[0] = lp_build_const_int32(gallivm, 2);
+ shuffles[1] = lp_build_const_int32(gallivm, 3);
+ arxy_ge_arzz = LLVMBuildShuffleVector(builder, arxyxy_ge_aryxzz, arxyxy_ge_aryxzz,
+ LLVMConstVector(shuffles, 2), "");
+ arxy_ge_arxy_arzz = LLVMBuildAnd(builder, arxy_ge_aryx, arxy_ge_arzz, "");
+
+ arx_ge_ary_arz = LLVMBuildExtractElement(builder, arxy_ge_arxy_arzz,
+ lp_build_const_int32(gallivm, 0), "");
+ arx_ge_ary_arz = LLVMBuildICmp(builder, LLVMIntNE, arx_ge_ary_arz,
+ lp_build_const_int32(gallivm, 0), "");
+ ary_ge_arx_arz = LLVMBuildExtractElement(builder, arxy_ge_arxy_arzz,
+ lp_build_const_int32(gallivm, 1), "");
+ ary_ge_arx_arz = LLVMBuildICmp(builder, LLVMIntNE, ary_ge_arx_arz,
+ lp_build_const_int32(gallivm, 0), "");
+ face_s_var = lp_build_alloca(gallivm, bld->coord_bld.vec_type, "face_s_var");
+ face_t_var = lp_build_alloca(gallivm, bld->coord_bld.vec_type, "face_t_var");
+ face_var = lp_build_alloca(gallivm, bld->int_bld.vec_type, "face_var");
+
+ lp_build_if(&if_ctx, gallivm, arx_ge_ary_arz);
{
/* +/- X face */
- LLVMValueRef sign = lp_build_sgn(float_bld, rx);
- LLVMValueRef ima = lp_build_cube_ima(coord_bld, s);
+ LLVMValueRef sign, ima;
+ rx = LLVMBuildExtractElement(builder, rxyz,
+ lp_build_const_int32(gallivm, 0), "");
+ /* +/- X face */
+ sign = lp_build_sgn(float_bld, rx);
+ ima = lp_build_cube_imaneg(coord_bld, s);
*face_s = lp_build_cube_coord(coord_bld, sign, +1, r, ima);
*face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
*face = lp_build_cube_face(bld, rx,
@@ -963,11 +1185,14 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
{
struct lp_build_if_state if_ctx2;
- lp_build_if(&if_ctx2, bld->gallivm, ary_ge_arx_arz);
+ lp_build_if(&if_ctx2, gallivm, ary_ge_arx_arz);
{
+ LLVMValueRef sign, ima;
/* +/- Y face */
- LLVMValueRef sign = lp_build_sgn(float_bld, ry);
- LLVMValueRef ima = lp_build_cube_ima(coord_bld, t);
+ ry = LLVMBuildExtractElement(builder, rxyz,
+ lp_build_const_int32(gallivm, 1), "");
+ sign = lp_build_sgn(float_bld, ry);
+ ima = lp_build_cube_imaneg(coord_bld, t);
*face_s = lp_build_cube_coord(coord_bld, NULL, -1, s, ima);
*face_t = lp_build_cube_coord(coord_bld, sign, -1, r, ima);
*face = lp_build_cube_face(bld, ry,
@@ -980,8 +1205,11 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
lp_build_else(&if_ctx2);
{
/* +/- Z face */
- LLVMValueRef sign = lp_build_sgn(float_bld, rz);
- LLVMValueRef ima = lp_build_cube_ima(coord_bld, r);
+ LLVMValueRef sign, ima;
+ rz = LLVMBuildExtractElement(builder, rxyz,
+ lp_build_const_int32(gallivm, 2), "");
+ sign = lp_build_sgn(float_bld, rz);
+ ima = lp_build_cube_imaneg(coord_bld, r);
*face_s = lp_build_cube_coord(coord_bld, sign, -1, s, ima);
*face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
*face = lp_build_cube_face(bld, rz,
@@ -999,6 +1227,7 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
*face_s = LLVMBuildLoad(builder, face_s_var, "face_s");
*face_t = LLVMBuildLoad(builder, face_t_var, "face_t");
*face = LLVMBuildLoad(builder, face_var, "face");
+ *face = lp_build_broadcast_scalar(&bld->int_coord_bld, *face);
}
}
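/*
 * An editor's sketch of the face selection implemented above, as plain
 * scalar C (assumed semantics; face numbering per the PIPE_TEX_FACE_*
 * asserts: +x=0, -x=1, +y=2, -y=3, +z=4, -z=5). s and t come out in [0,1].
 */
#include <math.h>

static unsigned cube_face(float rx, float ry, float rz, float *s, float *t)
{
   float arx = fabsf(rx), ary = fabsf(ry), arz = fabsf(rz);
   float ma, sc, tc;
   unsigned face;
   if (arx >= ary && arx >= arz) {            /* +/- X face */
      ma = arx;  face = rx >= 0.0f ? 0 : 1;
      sc = rx >= 0.0f ? -rz : rz;  tc = -ry;  /* snewx, tnewx */
   } else if (ary >= arx && ary >= arz) {     /* +/- Y face */
      ma = ary;  face = ry >= 0.0f ? 2 : 3;
      sc = rx;  tc = ry >= 0.0f ? rz : -rz;   /* snewy, tnewy */
   } else {                                   /* +/- Z face */
      ma = arz;  face = rz >= 0.0f ? 4 : 5;
      sc = rz >= 0.0f ? rx : -rx;  tc = -ry;  /* snewz, tnewz */
   }
   *s = sc * (0.5f / ma) + 0.5f;              /* ima = +0.5/|ma|, then bias */
   *t = tc * (0.5f / ma) + 0.5f;
   return face;
}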
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
index dad138abee0..0f3d8ae6cb5 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
@@ -52,6 +52,15 @@ struct lp_build_context;
/**
+ * Helper struct holding all derivatives needed for sampling
+ */
+struct lp_derivatives
+{
+ LLVMValueRef ddx_ddy[2];
+};
+
+
+/**
* Sampler static state.
*
* These are the bits of state from pipe_resource and pipe_sampler_state that
@@ -192,6 +201,9 @@ struct lp_build_sample_context
/* See texture_dims() */
unsigned dims;
+ /** SIMD vector width */
+ unsigned vector_width;
+
/** regular scalar float type */
struct lp_type float_type;
struct lp_build_context float_bld;
@@ -199,7 +211,7 @@ struct lp_build_sample_context
/** float vector type */
struct lp_build_context float_vec_bld;
- /** regular scalar float type */
+ /** regular scalar int type */
struct lp_type int_type;
struct lp_build_context int_bld;
@@ -223,10 +235,15 @@ struct lp_build_sample_context
struct lp_type texel_type;
struct lp_build_context texel_bld;
+ /** Float per-quad type */
+ struct lp_type perquadf_type;
+ struct lp_build_context perquadf_bld;
+
+ /** Int per-quad type */
+ struct lp_type perquadi_type;
+ struct lp_build_context perquadi_bld;
+
/* Common dynamic state values */
- LLVMValueRef width;
- LLVMValueRef height;
- LLVMValueRef depth;
LLVMValueRef row_stride_array;
LLVMValueRef img_stride_array;
LLVMValueRef data_array;
@@ -305,8 +322,7 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
void
lp_build_lod_selector(struct lp_build_sample_context *bld,
unsigned unit,
- const LLVMValueRef ddx[4],
- const LLVMValueRef ddy[4],
+ const struct lp_derivatives *derivs,
LLVMValueRef lod_bias, /* optional */
LLVMValueRef explicit_lod, /* optional */
unsigned mip_filter,
@@ -331,10 +347,6 @@ LLVMValueRef
lp_build_get_mipmap_level(struct lp_build_sample_context *bld,
LLVMValueRef level);
-LLVMValueRef
-lp_build_get_const_mipmap_level(struct lp_build_sample_context *bld,
- int level);
-
void
lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
@@ -402,22 +414,35 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
unsigned unit,
unsigned num_coords,
const LLVMValueRef *coords,
- const LLVMValueRef *ddx,
- const LLVMValueRef *ddy,
+ const struct lp_derivatives *derivs,
LLVMValueRef lod_bias,
LLVMValueRef explicit_lod,
LLVMValueRef texel_out[4]);
+
+void
+lp_build_coord_repeat_npot_linear(struct lp_build_sample_context *bld,
+ LLVMValueRef coord_f,
+ LLVMValueRef length_i,
+ LLVMValueRef length_f,
+ LLVMValueRef *coord0_i,
+ LLVMValueRef *weight_f);
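/*
 * A minimal scalar sketch of this helper's assumed semantics: npot REPEAT
 * wrapping for linear filtering, yielding the first texel index plus the
 * lerp weight (coord1 is derived by the callers).
 */
#include <math.h>

static void coord_repeat_npot_linear(float coord, int length, float length_f,
                                     int *coord0, float *weight)
{
   float c = (coord - floorf(coord)) * length_f - 0.5f;  /* fract, scale */
   float f = floorf(c);
   *coord0 = (int)f;
   if (*coord0 < 0)
      *coord0 = length - 1;      /* underflow from the -0.5 wraps around */
   *weight = c - f;
}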
+
+
void
lp_build_size_query_soa(struct gallivm_state *gallivm,
const struct lp_sampler_static_state *static_state,
struct lp_sampler_dynamic_state *dynamic_state,
+ struct lp_type int_type,
unsigned unit,
LLVMValueRef explicit_lod,
LLVMValueRef *sizes_out);
void
-lp_build_sample_nop(struct gallivm_state *gallivm, struct lp_type type,
+lp_build_sample_nop(struct gallivm_state *gallivm,
+ struct lp_type type,
+ unsigned num_coords,
+ const LLVMValueRef *coords,
LLVMValueRef texel_out[4]);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
index 74858bc9718..ad1b29cf096 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
@@ -27,7 +27,7 @@
/**
* @file
- * Texture sampling -- SoA.
+ * Texture sampling -- AoS.
*
* @author Jose Fonseca <jfonseca@vmware.com>
* @author Brian Paul <brianp@vmware.com>
@@ -40,6 +40,7 @@
#include "util/u_memory.h"
#include "util/u_math.h"
#include "util/u_format.h"
+#include "util/u_cpu_detect.h"
#include "lp_bld_debug.h"
#include "lp_bld_type.h"
#include "lp_bld_const.h"
@@ -75,6 +76,7 @@ static void
lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld,
unsigned block_length,
LLVMValueRef coord,
+ LLVMValueRef coord_f,
LLVMValueRef length,
LLVMValueRef stride,
boolean is_pot,
@@ -93,10 +95,11 @@ lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld,
if(is_pot)
coord = LLVMBuildAnd(builder, coord, length_minus_one, "");
else {
- /* Add a bias to the texcoord to handle negative coords */
- LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024);
- coord = LLVMBuildAdd(builder, coord, bias, "");
- coord = LLVMBuildURem(builder, coord, length, "");
+ struct lp_build_context *coord_bld = &bld->coord_bld;
+ LLVMValueRef length_f = lp_build_int_to_float(coord_bld, length);
+ coord = lp_build_fract_safe(coord_bld, coord_f);
+ coord = lp_build_mul(coord_bld, coord, length_f);
+ coord = lp_build_itrunc(coord_bld, coord);
}
break;
@@ -121,6 +124,56 @@ lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld,
/**
+ * Build LLVM code for texture coord wrapping, for nearest filtering,
+ * for float texcoords.
+ * \param coord the incoming texcoord (s,t,r or q)
+ * \param length the texture size along one dimension
+ * \param is_pot if TRUE, length is a power of two
+ * \param wrap_mode one of PIPE_TEX_WRAP_x
+ * \param icoord the texcoord after wrapping, as int
+ */
+static void
+lp_build_sample_wrap_nearest_float(struct lp_build_sample_context *bld,
+ LLVMValueRef coord,
+ LLVMValueRef length,
+ boolean is_pot,
+ unsigned wrap_mode,
+ LLVMValueRef *icoord)
+{
+ struct lp_build_context *coord_bld = &bld->coord_bld;
+ LLVMValueRef length_minus_one;
+
+ switch(wrap_mode) {
+ case PIPE_TEX_WRAP_REPEAT:
+ /* take fraction, unnormalize */
+ coord = lp_build_fract_safe(coord_bld, coord);
+ coord = lp_build_mul(coord_bld, coord, length);
+ *icoord = lp_build_itrunc(coord_bld, coord);
+ break;
+ case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+ length_minus_one = lp_build_sub(coord_bld, length, coord_bld->one);
+ if (bld->static_state->normalized_coords) {
+ /* scale coord to length */
+ coord = lp_build_mul(coord_bld, coord, length);
+ }
+ coord = lp_build_clamp(coord_bld, coord, coord_bld->zero,
+ length_minus_one);
+ *icoord = lp_build_itrunc(coord_bld, coord);
+ break;
+
+ case PIPE_TEX_WRAP_CLAMP:
+ case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+ case PIPE_TEX_WRAP_MIRROR_REPEAT:
+ case PIPE_TEX_WRAP_MIRROR_CLAMP:
+ case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+ case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+ default:
+ assert(0);
+ }
+}
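/*
 * A minimal scalar sketch of the float nearest-wrapping above, assuming
 * normalized coords; hypothetical helpers. (The real lp_build_fract_safe
 * also clamps the fraction to just below 1.0 so the truncated result
 * stays inside the texture.)
 */
#include <math.h>

static int wrap_nearest_repeat(float coord, float length)
{
   float f = coord - floorf(coord);   /* take fraction */
   return (int)(f * length);          /* unnormalize, itrunc */
}

static int wrap_nearest_clamp_to_edge(float coord, float length)
{
   float c = coord * length;          /* scale coord to length */
   if (c < 0.0f)
      c = 0.0f;
   if (c > length - 1.0f)
      c = length - 1.0f;              /* clamp to [0, length-1] */
   return (int)c;
}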
+
+
+/**
* Build LLVM code for texture coord wrapping, for linear filtering,
* for scaled integer texcoords.
* \param block_length is the length of the pixel block along the
@@ -139,6 +192,8 @@ static void
lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
unsigned block_length,
LLVMValueRef coord0,
+ LLVMValueRef *weight_i,
+ LLVMValueRef coord_f,
LLVMValueRef length,
LLVMValueRef stride,
boolean is_pot,
@@ -153,58 +208,85 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
LLVMValueRef length_minus_one;
LLVMValueRef lmask, umask, mask;
- if (block_length != 1) {
- /*
- * If the pixel block covers more than one pixel then there is no easy
- * way to calculate offset1 relative to offset0. Instead, compute them
- * independently.
- */
-
- LLVMValueRef coord1;
-
- lp_build_sample_wrap_nearest_int(bld,
- block_length,
- coord0,
- length,
- stride,
- is_pot,
- wrap_mode,
- offset0, i0);
-
- coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
+ /*
+ * If the pixel block covers more than one pixel then there is no easy
+ * way to calculate offset1 relative to offset0. Instead, compute them
+ * independently. Otherwise, try to compute offset0 and offset1 with
+ * a single stride multiplication.
+ */
- lp_build_sample_wrap_nearest_int(bld,
- block_length,
- coord1,
- length,
- stride,
- is_pot,
- wrap_mode,
- offset1, i1);
+ length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
+ if (block_length != 1) {
+ LLVMValueRef coord1;
+ switch(wrap_mode) {
+ case PIPE_TEX_WRAP_REPEAT:
+ if (is_pot) {
+ coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
+ coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, "");
+ coord1 = LLVMBuildAnd(builder, coord1, length_minus_one, "");
+ }
+ else {
+ LLVMValueRef mask;
+ LLVMValueRef weight;
+ LLVMValueRef length_f = lp_build_int_to_float(&bld->coord_bld, length);
+ lp_build_coord_repeat_npot_linear(bld, coord_f,
+ length, length_f,
+ &coord0, &weight);
+ mask = lp_build_compare(bld->gallivm, int_coord_bld->type,
+ PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);
+ coord1 = LLVMBuildAnd(builder,
+ lp_build_add(int_coord_bld, coord0,
+ int_coord_bld->one),
+ mask, "");
+ weight = lp_build_mul_imm(&bld->coord_bld, weight, 256);
+ *weight_i = lp_build_itrunc(&bld->coord_bld, weight);
+ }
+ break;
+
+ case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+ coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
+ coord0 = lp_build_clamp(int_coord_bld, coord0, int_coord_bld->zero,
+ length_minus_one);
+ coord1 = lp_build_clamp(int_coord_bld, coord1, int_coord_bld->zero,
+ length_minus_one);
+ break;
+
+ case PIPE_TEX_WRAP_CLAMP:
+ case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+ case PIPE_TEX_WRAP_MIRROR_REPEAT:
+ case PIPE_TEX_WRAP_MIRROR_CLAMP:
+ case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+ case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+ default:
+ assert(0);
+ coord0 = int_coord_bld->zero;
+ coord1 = int_coord_bld->zero;
+ break;
+ }
+ lp_build_sample_partial_offset(int_coord_bld, block_length, coord0, stride,
+ offset0, i0);
+ lp_build_sample_partial_offset(int_coord_bld, block_length, coord1, stride,
+ offset1, i1);
return;
}
- /*
- * Scalar pixels -- try to compute offset0 and offset1 with a single stride
- * multiplication.
- */
-
*i0 = int_coord_bld->zero;
*i1 = int_coord_bld->zero;
- length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
-
switch(wrap_mode) {
case PIPE_TEX_WRAP_REPEAT:
if (is_pot) {
coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, "");
}
else {
- /* Add a bias to the texcoord to handle negative coords */
- LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024);
- coord0 = LLVMBuildAdd(builder, coord0, bias, "");
- coord0 = LLVMBuildURem(builder, coord0, length, "");
+ LLVMValueRef weight;
+ LLVMValueRef length_f = lp_build_int_to_float(&bld->coord_bld, length);
+ lp_build_coord_repeat_npot_linear(bld, coord_f,
+ length, length_f,
+ &coord0, &weight);
+ weight = lp_build_mul_imm(&bld->coord_bld, weight, 256);
+ *weight_i = lp_build_itrunc(&bld->coord_bld, weight);
}
mask = lp_build_compare(bld->gallivm, int_coord_bld->type,
@@ -217,6 +299,11 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
break;
case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+ /* XXX this might be slower than the separate path
+ * on some newer cpus. With sse41 this is 8 instructions vs. 7
+ * - at least on SNB this is almost certainly slower since
+ * min/max are cheaper than selects, and the muls aren't bad.
+ */
lmask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
PIPE_FUNC_GEQUAL, coord0, int_coord_bld->zero);
umask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
@@ -249,6 +336,176 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
/**
+ * Build LLVM code for texture coord wrapping, for linear filtering,
+ * for float texcoords.
+ * \param block_length is the length of the pixel block along the
+ * coordinate axis
+ * \param coord the incoming texcoord (s,t,r or q)
+ * \param length the texture size along one dimension
+ * \param is_pot if TRUE, length is a power of two
+ * \param wrap_mode one of PIPE_TEX_WRAP_x
+ * \param coord0 the first texcoord after wrapping, as int
+ * \param coord1 the second texcoord after wrapping, as int
+ * \param weight the filter weight as int (0-255)
+ * \param force_nearest if this coord actually uses nearest filtering
+ */
+static void
+lp_build_sample_wrap_linear_float(struct lp_build_sample_context *bld,
+ unsigned block_length,
+ LLVMValueRef coord,
+ LLVMValueRef length,
+ boolean is_pot,
+ unsigned wrap_mode,
+ LLVMValueRef *coord0,
+ LLVMValueRef *coord1,
+ LLVMValueRef *weight,
+ unsigned force_nearest)
+{
+ struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
+ struct lp_build_context *coord_bld = &bld->coord_bld;
+ LLVMBuilderRef builder = bld->gallivm->builder;
+ LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
+ LLVMValueRef length_minus_one = lp_build_sub(coord_bld, length, coord_bld->one);
+
+ switch(wrap_mode) {
+ case PIPE_TEX_WRAP_REPEAT:
+ if (is_pot) {
+ /* mul by size and subtract 0.5 */
+ coord = lp_build_mul(coord_bld, coord, length);
+ if (!force_nearest)
+ coord = lp_build_sub(coord_bld, coord, half);
+ *coord1 = lp_build_add(coord_bld, coord, coord_bld->one);
+ /* convert to int, compute lerp weight */
+ lp_build_ifloor_fract(coord_bld, coord, coord0, weight);
+ *coord1 = lp_build_ifloor(coord_bld, *coord1);
+ /* repeat wrap */
+ length_minus_one = lp_build_itrunc(coord_bld, length_minus_one);
+ *coord0 = LLVMBuildAnd(builder, *coord0, length_minus_one, "");
+ *coord1 = LLVMBuildAnd(builder, *coord1, length_minus_one, "");
+ }
+ else {
+ LLVMValueRef mask;
+ /* wrap with normalized floats is just fract */
+ coord = lp_build_fract(coord_bld, coord);
+ /* unnormalize */
+ coord = lp_build_mul(coord_bld, coord, length);
+ /*
+ * we avoided the 0.5/length division, so we have to fix up the
+ * incorrect edge cases with selects
+ */
+ *coord1 = lp_build_add(coord_bld, coord, half);
+ coord = lp_build_sub(coord_bld, coord, half);
+ *weight = lp_build_fract(coord_bld, coord);
+ mask = lp_build_compare(coord_bld->gallivm, coord_bld->type,
+ PIPE_FUNC_LESS, coord, coord_bld->zero);
+ *coord0 = lp_build_select(coord_bld, mask, length_minus_one, coord);
+ *coord0 = lp_build_itrunc(coord_bld, *coord0);
+ mask = lp_build_compare(coord_bld->gallivm, coord_bld->type,
+ PIPE_FUNC_LESS, *coord1, length);
+ *coord1 = lp_build_select(coord_bld, mask, *coord1, coord_bld->zero);
+ *coord1 = lp_build_itrunc(coord_bld, *coord1);
+ }
+ break;
+ case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+ if (bld->static_state->normalized_coords) {
+ /* mul by tex size */
+ coord = lp_build_mul(coord_bld, coord, length);
+ }
+ /* subtract 0.5 */
+ if (!force_nearest) {
+ coord = lp_build_sub(coord_bld, coord, half);
+ }
+ /* clamp to [0, length - 1] */
+ coord = lp_build_min(coord_bld, coord, length_minus_one);
+ coord = lp_build_max(coord_bld, coord, coord_bld->zero);
+ *coord1 = lp_build_add(coord_bld, coord, coord_bld->one);
+ /* convert to int, compute lerp weight */
+ lp_build_ifloor_fract(coord_bld, coord, coord0, weight);
+ /* coord1 = min(coord1, length-1) */
+ *coord1 = lp_build_min(coord_bld, *coord1, length_minus_one);
+ *coord1 = lp_build_itrunc(coord_bld, *coord1);
+ break;
+ default:
+ assert(0);
+ *coord0 = int_coord_bld->zero;
+ *coord1 = int_coord_bld->zero;
+ *weight = coord_bld->zero;
+ break;
+ }
+ *weight = lp_build_mul_imm(coord_bld, *weight, 256);
+ *weight = lp_build_itrunc(coord_bld, *weight);
+ return;
+}
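/*
 * A minimal scalar sketch of the power-of-two REPEAT case above, assuming
 * normalized coords; hypothetical helper. The two wrapped integer coords
 * plus an 8.8 fixed-point weight feed the lerp later.
 */
#include <math.h>

static void wrap_linear_repeat_pot(float coord, int length,
                                   int *coord0, int *coord1, int *weight)
{
   float c = coord * (float)length - 0.5f;   /* mul by size, sub 0.5 */
   float f = floorf(c);
   *coord0 = ((int)f)     & (length - 1);    /* repeat wrap (pot) */
   *coord1 = ((int)f + 1) & (length - 1);
   *weight = (int)((c - f) * 256.0f);        /* 8.8 fixed point, 0..255 */
}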
+
+
+/**
+ * Fetch texels for image with nearest sampling.
+ * Return filtered color as two vectors of 16-bit fixed point values.
+ */
+static void
+lp_build_sample_fetch_image_nearest(struct lp_build_sample_context *bld,
+ LLVMValueRef data_ptr,
+ LLVMValueRef offset,
+ LLVMValueRef x_subcoord,
+ LLVMValueRef y_subcoord,
+ LLVMValueRef *colors_lo,
+ LLVMValueRef *colors_hi)
+{
+ /*
+ * Fetch the pixels as 4 x 32bit (rgba order might differ):
+ *
+ * rgba0 rgba1 rgba2 rgba3
+ *
+ * bit cast them into 16 x u8
+ *
+ * r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3
+ *
+ * unpack them into two 8 x i16:
+ *
+ * r0 g0 b0 a0 r1 g1 b1 a1
+ * r2 g2 b2 a2 r3 g3 b3 a3
+ *
+ * The higher 8 bits of the resulting elements will be zero.
+ */
+ LLVMBuilderRef builder = bld->gallivm->builder;
+ LLVMValueRef rgba8;
+ struct lp_build_context h16, u8n;
+ LLVMTypeRef u8n_vec_type;
+
+ lp_build_context_init(&h16, bld->gallivm, lp_type_ufixed(16, bld->vector_width));
+ lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8, bld->vector_width));
+ u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type);
+
+ if (util_format_is_rgba8_variant(bld->format_desc)) {
+ /*
+ * Given the format is a rgba8, just read the pixels as is,
+ * without any swizzling. Swizzling will be done later.
+ */
+ rgba8 = lp_build_gather(bld->gallivm,
+ bld->texel_type.length,
+ bld->format_desc->block.bits,
+ bld->texel_type.width,
+ data_ptr, offset);
+
+ rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
+ }
+ else {
+ rgba8 = lp_build_fetch_rgba_aos(bld->gallivm,
+ bld->format_desc,
+ u8n.type,
+ data_ptr, offset,
+ x_subcoord,
+ y_subcoord);
+ }
+
+ /* Expand one 4*rgba8 to two 2*rgba16 */
+ lp_build_unpack2(bld->gallivm, u8n.type, h16.type,
+ rgba8,
+ colors_lo, colors_hi);
+}
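/*
 * A minimal scalar sketch of the unpack described in the comment above:
 * four packed rgba8 texels widened to two vectors of 16-bit lanes
 * (stand-in for lp_build_unpack2 on a 4-wide quad; assumed layout).
 */
#include <stdint.h>

static void unpack2_rgba8(const uint32_t rgba8[4],
                          uint16_t lo[8], uint16_t hi[8])
{
   const uint8_t *bytes = (const uint8_t *)rgba8;
   unsigned i;
   /* lo = r0 g0 b0 a0 r1 g1 b1 a1,  hi = r2 g2 b2 a2 r3 g3 b3 a3 */
   for (i = 0; i < 8; i++) {
      lo[i] = bytes[i];         /* zero-extended; high 8 bits are zero */
      hi[i] = bytes[i + 8];
   }
}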
+
+
+/**
* Sample a single texture image with nearest sampling.
* If sampling a cube texture, r = cube face in [0,5].
* Return filtered color as two vectors of 16-bit fixed point values.
@@ -267,21 +524,19 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
{
const unsigned dims = bld->dims;
LLVMBuilderRef builder = bld->gallivm->builder;
- struct lp_build_context i32, h16, u8n;
- LLVMTypeRef i32_vec_type, u8n_vec_type;
+ struct lp_build_context i32;
+ LLVMTypeRef i32_vec_type;
LLVMValueRef i32_c8;
LLVMValueRef width_vec, height_vec, depth_vec;
LLVMValueRef s_ipart, t_ipart = NULL, r_ipart = NULL;
+ LLVMValueRef s_float, t_float = NULL, r_float = NULL;
LLVMValueRef x_stride;
LLVMValueRef x_offset, offset;
LLVMValueRef x_subcoord, y_subcoord, z_subcoord;
- lp_build_context_init(&i32, bld->gallivm, lp_type_int_vec(32));
- lp_build_context_init(&h16, bld->gallivm, lp_type_ufixed(16));
- lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8));
+ lp_build_context_init(&i32, bld->gallivm, lp_type_int_vec(32, bld->vector_width));
i32_vec_type = lp_build_vec_type(bld->gallivm, i32.type);
- u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type);
lp_build_extract_image_sizes(bld,
bld->int_size_type,
@@ -291,6 +546,8 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
&height_vec,
&depth_vec);
+ s_float = s; t_float = t; r_float = r;
+
if (bld->static_state->normalized_coords) {
LLVMValueRef scaled_size;
LLVMValueRef flt_size;
@@ -334,7 +591,8 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
/* Do texcoord wrapping, compute texel offset */
lp_build_sample_wrap_nearest_int(bld,
bld->format_desc->block.width,
- s_ipart, width_vec, x_stride,
+ s_ipart, s_float,
+ width_vec, x_stride,
bld->static_state->pot_width,
bld->static_state->wrap_s,
&x_offset, &x_subcoord);
@@ -343,7 +601,8 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
LLVMValueRef y_offset;
lp_build_sample_wrap_nearest_int(bld,
bld->format_desc->block.height,
- t_ipart, height_vec, row_stride_vec,
+ t_ipart, t_float,
+ height_vec, row_stride_vec,
bld->static_state->pot_height,
bld->static_state->wrap_t,
&y_offset, &y_subcoord);
@@ -352,7 +611,8 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
LLVMValueRef z_offset;
lp_build_sample_wrap_nearest_int(bld,
1, /* block length (depth) */
- r_ipart, depth_vec, img_stride_vec,
+ r_ipart, r_float,
+ depth_vec, img_stride_vec,
bld->static_state->pot_depth,
bld->static_state->wrap_r,
&z_offset, &z_subcoord);
@@ -366,6 +626,196 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
}
}
+ lp_build_sample_fetch_image_nearest(bld, data_ptr, offset,
+ x_subcoord, y_subcoord,
+ colors_lo, colors_hi);
+}
+
+
+/**
+ * Sample a single texture image with nearest sampling.
+ * If sampling a cube texture, r = cube face in [0,5].
+ * Return filtered color as two vectors of 16-bit fixed point values.
+ * Does address calcs (except offsets) with floats.
+ * Useful for AVX, which has support for 8x32 floats but not 8x32 ints.
+ */
+static void
+lp_build_sample_image_nearest_afloat(struct lp_build_sample_context *bld,
+ LLVMValueRef int_size,
+ LLVMValueRef row_stride_vec,
+ LLVMValueRef img_stride_vec,
+ LLVMValueRef data_ptr,
+ LLVMValueRef s,
+ LLVMValueRef t,
+ LLVMValueRef r,
+ LLVMValueRef *colors_lo,
+ LLVMValueRef *colors_hi)
+{
+ const unsigned dims = bld->dims;
+ LLVMValueRef width_vec, height_vec, depth_vec;
+ LLVMValueRef offset;
+ LLVMValueRef x_subcoord, y_subcoord;
+ LLVMValueRef x_icoord, y_icoord, z_icoord;
+ LLVMValueRef flt_size;
+
+ flt_size = lp_build_int_to_float(&bld->float_size_bld, int_size);
+
+ lp_build_extract_image_sizes(bld,
+ bld->float_size_type,
+ bld->coord_type,
+ flt_size,
+ &width_vec,
+ &height_vec,
+ &depth_vec);
+
+ /* Do texcoord wrapping */
+ lp_build_sample_wrap_nearest_float(bld,
+ s, width_vec,
+ bld->static_state->pot_width,
+ bld->static_state->wrap_s,
+ &x_icoord);
+
+ if (dims >= 2) {
+ lp_build_sample_wrap_nearest_float(bld,
+ t, height_vec,
+ bld->static_state->pot_height,
+ bld->static_state->wrap_t,
+ &y_icoord);
+
+ if (dims >= 3) {
+ lp_build_sample_wrap_nearest_float(bld,
+ r, depth_vec,
+ bld->static_state->pot_depth,
+ bld->static_state->wrap_r,
+ &z_icoord);
+ }
+ else if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
+ z_icoord = r;
+ }
+ }
+
+ /*
+ * From here on we deal with ints, and we should split up the 256-bit
+ * vectors manually for better generated code.
+ */
+
+ /*
+ * compute texel offsets -
+ * cannot do offset calc with floats, difficult for block-based formats,
+ * and not enough precision anyway.
+ */
+ lp_build_sample_offset(&bld->int_coord_bld,
+ bld->format_desc,
+ x_icoord, y_icoord,
+ z_icoord,
+ row_stride_vec, img_stride_vec,
+ &offset,
+ &x_subcoord, &y_subcoord);
+
+ lp_build_sample_fetch_image_nearest(bld, data_ptr, offset,
+ x_subcoord, y_subcoord,
+ colors_lo, colors_hi);
+}
+
+
+/**
+ * Fetch texels for image with linear sampling.
+ * Return filtered color as two vectors of 16-bit fixed point values.
+ */
+static void
+lp_build_sample_fetch_image_linear(struct lp_build_sample_context *bld,
+ LLVMValueRef data_ptr,
+ LLVMValueRef offset[2][2][2],
+ LLVMValueRef x_subcoord[2],
+ LLVMValueRef y_subcoord[2],
+ LLVMValueRef s_fpart,
+ LLVMValueRef t_fpart,
+ LLVMValueRef r_fpart,
+ LLVMValueRef *colors_lo,
+ LLVMValueRef *colors_hi)
+{
+ const unsigned dims = bld->dims;
+ LLVMBuilderRef builder = bld->gallivm->builder;
+ struct lp_build_context h16, u8n;
+ LLVMTypeRef h16_vec_type, u8n_vec_type;
+ LLVMTypeRef elem_type = LLVMInt32TypeInContext(bld->gallivm->context);
+ LLVMValueRef shuffles_lo[LP_MAX_VECTOR_LENGTH];
+ LLVMValueRef shuffles_hi[LP_MAX_VECTOR_LENGTH];
+ LLVMValueRef shuffle_lo, shuffle_hi;
+ LLVMValueRef s_fpart_lo, s_fpart_hi;
+ LLVMValueRef t_fpart_lo = NULL, t_fpart_hi = NULL;
+ LLVMValueRef r_fpart_lo = NULL, r_fpart_hi = NULL;
+ LLVMValueRef neighbors_lo[2][2][2]; /* [z][y][x] */
+ LLVMValueRef neighbors_hi[2][2][2]; /* [z][y][x] */
+ LLVMValueRef packed_lo, packed_hi;
+ unsigned i, j, k;
+ unsigned numj, numk;
+
+ lp_build_context_init(&h16, bld->gallivm, lp_type_ufixed(16, bld->vector_width));
+ lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8, bld->vector_width));
+ h16_vec_type = lp_build_vec_type(bld->gallivm, h16.type);
+ u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type);
+
+ /*
+ * Transform 4 x i32 in
+ *
+ * s_fpart = {s0, s1, s2, s3}
+ *
+ * into 8 x i16
+ *
+ * s_fpart = {00, s0, 00, s1, 00, s2, 00, s3}
+ *
+ * into two 8 x i16
+ *
+ * s_fpart_lo = {s0, s0, s0, s0, s1, s1, s1, s1}
+ * s_fpart_hi = {s2, s2, s2, s2, s3, s3, s3, s3}
+ *
+ * and likewise for t_fpart. There is no risk of losing precision here
+ * since the fractional parts only use the lower 8 bits.
+ */
+ s_fpart = LLVMBuildBitCast(builder, s_fpart, h16_vec_type, "");
+ if (dims >= 2)
+ t_fpart = LLVMBuildBitCast(builder, t_fpart, h16_vec_type, "");
+ if (dims >= 3)
+ r_fpart = LLVMBuildBitCast(builder, r_fpart, h16_vec_type, "");
+
+ for (j = 0; j < h16.type.length; j += 4) {
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+ unsigned subindex = 0;
+#else
+ unsigned subindex = 1;
+#endif
+ LLVMValueRef index;
+
+ index = LLVMConstInt(elem_type, j/2 + subindex, 0);
+ for (i = 0; i < 4; ++i)
+ shuffles_lo[j + i] = index;
+
+ index = LLVMConstInt(elem_type, h16.type.length/2 + j/2 + subindex, 0);
+ for (i = 0; i < 4; ++i)
+ shuffles_hi[j + i] = index;
+ }
+
+ shuffle_lo = LLVMConstVector(shuffles_lo, h16.type.length);
+ shuffle_hi = LLVMConstVector(shuffles_hi, h16.type.length);
+
+ s_fpart_lo = LLVMBuildShuffleVector(builder, s_fpart, h16.undef,
+ shuffle_lo, "");
+ s_fpart_hi = LLVMBuildShuffleVector(builder, s_fpart, h16.undef,
+ shuffle_hi, "");
+ if (dims >= 2) {
+ t_fpart_lo = LLVMBuildShuffleVector(builder, t_fpart, h16.undef,
+ shuffle_lo, "");
+ t_fpart_hi = LLVMBuildShuffleVector(builder, t_fpart, h16.undef,
+ shuffle_hi, "");
+ }
+ if (dims >= 3) {
+ r_fpart_lo = LLVMBuildShuffleVector(builder, r_fpart, h16.undef,
+ shuffle_lo, "");
+ r_fpart_hi = LLVMBuildShuffleVector(builder, r_fpart, h16.undef,
+ shuffle_hi, "");
+ }
+
/*
* Fetch the pixels as 4 x 32bit (rgba order might differ):
*
@@ -382,38 +832,129 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
*
* The higher 8 bits of the resulting elements will be zero.
*/
- {
- LLVMValueRef rgba8;
+ numj = 1 + (dims >= 2);
+ numk = 1 + (dims >= 3);
- if (util_format_is_rgba8_variant(bld->format_desc)) {
- /*
- * Given the format is a rgba8, just read the pixels as is,
- * without any swizzling. Swizzling will be done later.
- */
- rgba8 = lp_build_gather(bld->gallivm,
- bld->texel_type.length,
- bld->format_desc->block.bits,
- bld->texel_type.width,
- data_ptr, offset);
+ for (k = 0; k < numk; k++) {
+ for (j = 0; j < numj; j++) {
+ for (i = 0; i < 2; i++) {
+ LLVMValueRef rgba8;
+
+ if (util_format_is_rgba8_variant(bld->format_desc)) {
+ /*
+ * Given the format is a rgba8, just read the pixels as is,
+ * without any swizzling. Swizzling will be done later.
+ */
+ rgba8 = lp_build_gather(bld->gallivm,
+ bld->texel_type.length,
+ bld->format_desc->block.bits,
+ bld->texel_type.width,
+ data_ptr, offset[k][j][i]);
+
+ rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
+ }
+ else {
+ rgba8 = lp_build_fetch_rgba_aos(bld->gallivm,
+ bld->format_desc,
+ u8n.type,
+ data_ptr, offset[k][j][i],
+ x_subcoord[i],
+ y_subcoord[j]);
+ }
- rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
+ /* Expand one 4*rgba8 to two 2*rgba16 */
+ lp_build_unpack2(bld->gallivm, u8n.type, h16.type,
+ rgba8,
+ &neighbors_lo[k][j][i], &neighbors_hi[k][j][i]);
+ }
}
- else {
- rgba8 = lp_build_fetch_rgba_aos(bld->gallivm,
- bld->format_desc,
- u8n.type,
- data_ptr, offset,
- x_subcoord,
- y_subcoord);
+ }
+
+ /*
+ * Linear interpolation with 8.8 fixed point.
+ */
+ if (bld->static_state->force_nearest_s) {
+ /* special case 1-D lerp */
+ packed_lo = lp_build_lerp(&h16,
+ t_fpart_lo,
+ neighbors_lo[0][0][0],
+ neighbors_lo[0][1][0]);
+
+ packed_hi = lp_build_lerp(&h16,
+ t_fpart_hi,
+ neighbors_hi[0][0][0],
+ neighbors_hi[0][1][0]);
+ }
+ else if (bld->static_state->force_nearest_t) {
+ /* special case 1-D lerp */
+ packed_lo = lp_build_lerp(&h16,
+ s_fpart_lo,
+ neighbors_lo[0][0][0],
+ neighbors_lo[0][0][1]);
+
+ packed_hi = lp_build_lerp(&h16,
+ s_fpart_hi,
+ neighbors_hi[0][0][0],
+ neighbors_hi[0][0][1]);
+ }
+ else {
+ /* general 1/2/3-D lerping */
+ if (dims == 1) {
+ packed_lo = lp_build_lerp(&h16,
+ s_fpart_lo,
+ neighbors_lo[0][0][0],
+ neighbors_lo[0][0][1]);
+
+ packed_hi = lp_build_lerp(&h16,
+ s_fpart_hi,
+ neighbors_hi[0][0][0],
+ neighbors_hi[0][0][1]);
}
+ else {
+ /* 2-D lerp */
+ packed_lo = lp_build_lerp_2d(&h16,
+ s_fpart_lo, t_fpart_lo,
+ neighbors_lo[0][0][0],
+ neighbors_lo[0][0][1],
+ neighbors_lo[0][1][0],
+ neighbors_lo[0][1][1]);
+
+ packed_hi = lp_build_lerp_2d(&h16,
+ s_fpart_hi, t_fpart_hi,
+ neighbors_hi[0][0][0],
+ neighbors_hi[0][0][1],
+ neighbors_hi[0][1][0],
+ neighbors_hi[0][1][1]);
+
+ if (dims >= 3) {
+ LLVMValueRef packed_lo2, packed_hi2;
+
+ /* lerp in the second z slice */
+ packed_lo2 = lp_build_lerp_2d(&h16,
+ s_fpart_lo, t_fpart_lo,
+ neighbors_lo[1][0][0],
+ neighbors_lo[1][0][1],
+ neighbors_lo[1][1][0],
+ neighbors_lo[1][1][1]);
- /* Expand one 4*rgba8 to two 2*rgba16 */
- lp_build_unpack2(bld->gallivm, u8n.type, h16.type,
- rgba8,
- colors_lo, colors_hi);
+ packed_hi2 = lp_build_lerp_2d(&h16,
+ s_fpart_hi, t_fpart_hi,
+ neighbors_hi[1][0][0],
+ neighbors_hi[1][0][1],
+ neighbors_hi[1][1][0],
+ neighbors_hi[1][1][1]);
+ /* interp between two z slices */
+ packed_lo = lp_build_lerp(&h16, r_fpart_lo,
+ packed_lo, packed_lo2);
+ packed_hi = lp_build_lerp(&h16, r_fpart_hi,
+ packed_hi, packed_hi2);
+ }
+ }
}
-}
+ *colors_lo = packed_lo;
+ *colors_hi = packed_hi;
+}
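/*
 * A minimal scalar sketch of the 8.8 fixed-point interpolation above,
 * assuming lp_build_lerp computes a + ((b - a) * w >> 8) per lane;
 * hypothetical helpers.
 */
#include <stdint.h>

static uint16_t lerp_8_8(int w, int a, int b)
{
   /* w in [0, 256), a and b are 8-bit values held in 16-bit lanes */
   return (uint16_t)(a + (((b - a) * w) >> 8));
}

static uint16_t lerp_2d_8_8(int ws, int wt,
                            int t00, int t01, int t10, int t11)
{
   /* lerp along s in both rows, then along t between the rows */
   return lerp_8_8(wt, lerp_8_8(ws, t00, t01),
                       lerp_8_8(ws, t10, t11));
}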
/**
* Sample a single texture image with (bi-)(tri-)linear sampling.
@@ -433,33 +974,24 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
{
const unsigned dims = bld->dims;
LLVMBuilderRef builder = bld->gallivm->builder;
- struct lp_build_context i32, h16, u8n;
- LLVMTypeRef i32_vec_type, h16_vec_type, u8n_vec_type;
+ struct lp_build_context i32;
+ LLVMTypeRef i32_vec_type;
LLVMValueRef i32_c8, i32_c128, i32_c255;
LLVMValueRef width_vec, height_vec, depth_vec;
- LLVMValueRef s_ipart, s_fpart, s_fpart_lo, s_fpart_hi;
- LLVMValueRef t_ipart = NULL, t_fpart = NULL, t_fpart_lo = NULL, t_fpart_hi = NULL;
- LLVMValueRef r_ipart = NULL, r_fpart = NULL, r_fpart_lo = NULL, r_fpart_hi = NULL;
+ LLVMValueRef s_ipart, s_fpart, s_float;
+ LLVMValueRef t_ipart = NULL, t_fpart = NULL, t_float = NULL;
+ LLVMValueRef r_ipart = NULL, r_fpart = NULL, r_float = NULL;
LLVMValueRef x_stride, y_stride, z_stride;
LLVMValueRef x_offset0, x_offset1;
LLVMValueRef y_offset0, y_offset1;
LLVMValueRef z_offset0, z_offset1;
LLVMValueRef offset[2][2][2]; /* [z][y][x] */
LLVMValueRef x_subcoord[2], y_subcoord[2], z_subcoord[2];
- LLVMValueRef neighbors_lo[2][2][2]; /* [z][y][x] */
- LLVMValueRef neighbors_hi[2][2][2]; /* [z][y][x] */
- LLVMValueRef packed_lo, packed_hi;
unsigned x, y, z;
- unsigned i, j, k;
- unsigned numj, numk;
- lp_build_context_init(&i32, bld->gallivm, lp_type_int_vec(32));
- lp_build_context_init(&h16, bld->gallivm, lp_type_ufixed(16));
- lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8));
+ lp_build_context_init(&i32, bld->gallivm, lp_type_int_vec(32, bld->vector_width));
i32_vec_type = lp_build_vec_type(bld->gallivm, i32.type);
- h16_vec_type = lp_build_vec_type(bld->gallivm, h16.type);
- u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type);
lp_build_extract_image_sizes(bld,
bld->int_size_type,
@@ -469,6 +1001,8 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
&height_vec,
&depth_vec);
+ s_float = s; t_float = t; r_float = r;
+
if (bld->static_state->normalized_coords) {
LLVMValueRef scaled_size;
LLVMValueRef flt_size;
@@ -533,7 +1067,8 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
/* do texcoord wrapping and compute texel offsets */
lp_build_sample_wrap_linear_int(bld,
bld->format_desc->block.width,
- s_ipart, width_vec, x_stride,
+ s_ipart, &s_fpart, s_float,
+ width_vec, x_stride,
bld->static_state->pot_width,
bld->static_state->wrap_s,
&x_offset0, &x_offset1,
@@ -548,7 +1083,8 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
if (dims >= 2) {
lp_build_sample_wrap_linear_int(bld,
bld->format_desc->block.height,
- t_ipart, height_vec, y_stride,
+ t_ipart, &t_fpart, t_float,
+ height_vec, y_stride,
bld->static_state->pot_height,
bld->static_state->wrap_t,
&y_offset0, &y_offset1,
@@ -567,7 +1103,8 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
if (dims >= 3) {
lp_build_sample_wrap_linear_int(bld,
bld->format_desc->block.height,
- r_ipart, depth_vec, z_stride,
+ r_ipart, &r_fpart, r_float,
+ depth_vec, z_stride,
bld->static_state->pot_depth,
bld->static_state->wrap_r,
&z_offset0, &z_offset1,
@@ -593,212 +1130,175 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
}
}
- /*
- * Transform 4 x i32 in
- *
- * s_fpart = {s0, s1, s2, s3}
- *
- * into 8 x i16
- *
- * s_fpart = {00, s0, 00, s1, 00, s2, 00, s3}
- *
- * into two 8 x i16
- *
- * s_fpart_lo = {s0, s0, s0, s0, s1, s1, s1, s1}
- * s_fpart_hi = {s2, s2, s2, s2, s3, s3, s3, s3}
- *
- * and likewise for t_fpart. There is no risk of loosing precision here
- * since the fractional parts only use the lower 8bits.
- */
- s_fpart = LLVMBuildBitCast(builder, s_fpart, h16_vec_type, "");
- if (dims >= 2)
- t_fpart = LLVMBuildBitCast(builder, t_fpart, h16_vec_type, "");
- if (dims >= 3)
- r_fpart = LLVMBuildBitCast(builder, r_fpart, h16_vec_type, "");
+ lp_build_sample_fetch_image_linear(bld, data_ptr, offset,
+ x_subcoord, y_subcoord,
+ s_fpart, t_fpart, r_fpart,
+ colors_lo, colors_hi);
+}
- {
- LLVMTypeRef elem_type = LLVMInt32TypeInContext(bld->gallivm->context);
- LLVMValueRef shuffles_lo[LP_MAX_VECTOR_LENGTH];
- LLVMValueRef shuffles_hi[LP_MAX_VECTOR_LENGTH];
- LLVMValueRef shuffle_lo;
- LLVMValueRef shuffle_hi;
- for (j = 0; j < h16.type.length; j += 4) {
-#ifdef PIPE_ARCH_LITTLE_ENDIAN
- unsigned subindex = 0;
-#else
- unsigned subindex = 1;
-#endif
- LLVMValueRef index;
+/**
+ * Sample a single texture image with (bi-)(tri-)linear sampling.
+ * Return filtered color as two vectors of 16-bit fixed point values.
+ * Does address calcs (except offsets) with floats.
+ * Useful for AVX, which has support for 8x32 floats but not 8x32 ints.
+ */
+static void
+lp_build_sample_image_linear_afloat(struct lp_build_sample_context *bld,
+ LLVMValueRef int_size,
+ LLVMValueRef row_stride_vec,
+ LLVMValueRef img_stride_vec,
+ LLVMValueRef data_ptr,
+ LLVMValueRef s,
+ LLVMValueRef t,
+ LLVMValueRef r,
+ LLVMValueRef *colors_lo,
+ LLVMValueRef *colors_hi)
+{
+ const unsigned dims = bld->dims;
+ LLVMValueRef width_vec, height_vec, depth_vec;
+ LLVMValueRef s_fpart;
+ LLVMValueRef t_fpart = NULL;
+ LLVMValueRef r_fpart = NULL;
+ LLVMValueRef x_stride, y_stride, z_stride;
+ LLVMValueRef x_offset0, x_offset1;
+ LLVMValueRef y_offset0, y_offset1;
+ LLVMValueRef z_offset0, z_offset1;
+ LLVMValueRef offset[2][2][2]; /* [z][y][x] */
+ LLVMValueRef x_subcoord[2], y_subcoord[2];
+ LLVMValueRef flt_size;
+ LLVMValueRef x_icoord0, x_icoord1;
+ LLVMValueRef y_icoord0, y_icoord1;
+ LLVMValueRef z_icoord0, z_icoord1;
+ unsigned x, y, z;
- index = LLVMConstInt(elem_type, j/2 + subindex, 0);
- for (i = 0; i < 4; ++i)
- shuffles_lo[j + i] = index;
+ flt_size = lp_build_int_to_float(&bld->float_size_bld, int_size);
- index = LLVMConstInt(elem_type, h16.type.length/2 + j/2 + subindex, 0);
- for (i = 0; i < 4; ++i)
- shuffles_hi[j + i] = index;
- }
+ lp_build_extract_image_sizes(bld,
+ bld->float_size_type,
+ bld->coord_type,
+ flt_size,
+ &width_vec,
+ &height_vec,
+ &depth_vec);
- shuffle_lo = LLVMConstVector(shuffles_lo, h16.type.length);
- shuffle_hi = LLVMConstVector(shuffles_hi, h16.type.length);
+ /* do texcoord wrapping and compute texel offsets */
+ lp_build_sample_wrap_linear_float(bld,
+ bld->format_desc->block.width,
+ s, width_vec,
+ bld->static_state->pot_width,
+ bld->static_state->wrap_s,
+ &x_icoord0, &x_icoord1,
+ &s_fpart,
+ bld->static_state->force_nearest_s);
+
+ if (dims >= 2) {
+ lp_build_sample_wrap_linear_float(bld,
+ bld->format_desc->block.height,
+ t, height_vec,
+ bld->static_state->pot_height,
+ bld->static_state->wrap_t,
+ &y_icoord0, &y_icoord1,
+ &t_fpart,
+ bld->static_state->force_nearest_t);
- s_fpart_lo = LLVMBuildShuffleVector(builder, s_fpart, h16.undef,
- shuffle_lo, "");
- s_fpart_hi = LLVMBuildShuffleVector(builder, s_fpart, h16.undef,
- shuffle_hi, "");
- if (dims >= 2) {
- t_fpart_lo = LLVMBuildShuffleVector(builder, t_fpart, h16.undef,
- shuffle_lo, "");
- t_fpart_hi = LLVMBuildShuffleVector(builder, t_fpart, h16.undef,
- shuffle_hi, "");
- }
if (dims >= 3) {
- r_fpart_lo = LLVMBuildShuffleVector(builder, r_fpart, h16.undef,
- shuffle_lo, "");
- r_fpart_hi = LLVMBuildShuffleVector(builder, r_fpart, h16.undef,
- shuffle_hi, "");
+ lp_build_sample_wrap_linear_float(bld,
+ bld->format_desc->block.height,
+ r, depth_vec,
+ bld->static_state->pot_depth,
+ bld->static_state->wrap_r,
+ &z_icoord0, &z_icoord1,
+ &r_fpart, 0);
}
}
/*
- * Fetch the pixels as 4 x 32bit (rgba order might differ):
- *
- * rgba0 rgba1 rgba2 rgba3
- *
- * bit cast them into 16 x u8
- *
- * r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3
- *
- * unpack them into two 8 x i16:
- *
- * r0 g0 b0 a0 r1 g1 b1 a1
- * r2 g2 b2 a2 r3 g3 b3 a3
- *
- * The higher 8 bits of the resulting elements will be zero.
+ * From here on we deal with ints, and we should split up the 256-bit
+ * vectors manually for better generated code.
*/
- numj = 1 + (dims >= 2);
- numk = 1 + (dims >= 3);
- for (k = 0; k < numk; k++) {
- for (j = 0; j < numj; j++) {
- for (i = 0; i < 2; i++) {
- LLVMValueRef rgba8;
-
- if (util_format_is_rgba8_variant(bld->format_desc)) {
- /*
- * Given the format is a rgba8, just read the pixels as is,
- * without any swizzling. Swizzling will be done later.
- */
- rgba8 = lp_build_gather(bld->gallivm,
- bld->texel_type.length,
- bld->format_desc->block.bits,
- bld->texel_type.width,
- data_ptr, offset[k][j][i]);
-
- rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
- }
- else {
- rgba8 = lp_build_fetch_rgba_aos(bld->gallivm,
- bld->format_desc,
- u8n.type,
- data_ptr, offset[k][j][i],
- x_subcoord[i],
- y_subcoord[j]);
- }
-
- /* Expand one 4*rgba8 to two 2*rgba16 */
- lp_build_unpack2(bld->gallivm, u8n.type, h16.type,
- rgba8,
- &neighbors_lo[k][j][i], &neighbors_hi[k][j][i]);
- }
- }
- }
+ /* get pixel, row and image strides */
+ x_stride = lp_build_const_vec(bld->gallivm,
+ bld->int_coord_bld.type,
+ bld->format_desc->block.bits/8);
+ y_stride = row_stride_vec;
+ z_stride = img_stride_vec;
/*
- * Linear interpolation with 8.8 fixed point.
+ * compute texel offset -
+ * cannot do offset calc with floats, difficult for block-based formats,
+ * and not enough precision anyway.
*/
- if (bld->static_state->force_nearest_s) {
- /* special case 1-D lerp */
- packed_lo = lp_build_lerp(&h16,
- t_fpart_lo,
- neighbors_lo[0][0][0],
- neighbors_lo[0][0][1]);
-
- packed_hi = lp_build_lerp(&h16,
- t_fpart_hi,
- neighbors_hi[0][1][0],
- neighbors_hi[0][1][0]);
+ lp_build_sample_partial_offset(&bld->int_coord_bld,
+ bld->format_desc->block.width,
+ x_icoord0, x_stride,
+ &x_offset0, &x_subcoord[0]);
+ lp_build_sample_partial_offset(&bld->int_coord_bld,
+ bld->format_desc->block.width,
+ x_icoord1, x_stride,
+ &x_offset1, &x_subcoord[1]);
+ for (z = 0; z < 2; z++) {
+ for (y = 0; y < 2; y++) {
+ offset[z][y][0] = x_offset0;
+ offset[z][y][1] = x_offset1;
+ }
}
- else if (bld->static_state->force_nearest_t) {
- /* special case 1-D lerp */
- packed_lo = lp_build_lerp(&h16,
- s_fpart_lo,
- neighbors_lo[0][0][0],
- neighbors_lo[0][0][1]);
- packed_hi = lp_build_lerp(&h16,
- s_fpart_hi,
- neighbors_hi[0][0][0],
- neighbors_hi[0][0][1]);
+ if (dims >= 2) {
+ lp_build_sample_partial_offset(&bld->int_coord_bld,
+ bld->format_desc->block.height,
+ y_icoord0, y_stride,
+ &y_offset0, &y_subcoord[0]);
+ lp_build_sample_partial_offset(&bld->int_coord_bld,
+ bld->format_desc->block.height,
+ y_icoord1, y_stride,
+ &y_offset1, &y_subcoord[1]);
+ for (z = 0; z < 2; z++) {
+ for (x = 0; x < 2; x++) {
+ offset[z][0][x] = lp_build_add(&bld->int_coord_bld,
+ offset[z][0][x], y_offset0);
+ offset[z][1][x] = lp_build_add(&bld->int_coord_bld,
+ offset[z][1][x], y_offset1);
+ }
+ }
}
- else {
- /* general 1/2/3-D lerping */
- if (dims == 1) {
- packed_lo = lp_build_lerp(&h16,
- s_fpart_lo,
- neighbors_lo[0][0][0],
- neighbors_lo[0][0][1]);
- packed_hi = lp_build_lerp(&h16,
- s_fpart_hi,
- neighbors_hi[0][0][0],
- neighbors_hi[0][0][1]);
+ if (dims >= 3) {
+ LLVMValueRef z_subcoord[2];
+ lp_build_sample_partial_offset(&bld->int_coord_bld,
+ 1,
+ z_icoord0, z_stride,
+ &z_offset0, &z_subcoord[0]);
+ lp_build_sample_partial_offset(&bld->int_coord_bld,
+ 1,
+ z_icoord1, z_stride,
+ &z_offset1, &z_subcoord[1]);
+ for (y = 0; y < 2; y++) {
+ for (x = 0; x < 2; x++) {
+ offset[0][y][x] = lp_build_add(&bld->int_coord_bld,
+ offset[0][y][x], z_offset0);
+ offset[1][y][x] = lp_build_add(&bld->int_coord_bld,
+ offset[1][y][x], z_offset1);
+ }
}
- else {
- /* 2-D lerp */
- packed_lo = lp_build_lerp_2d(&h16,
- s_fpart_lo, t_fpart_lo,
- neighbors_lo[0][0][0],
- neighbors_lo[0][0][1],
- neighbors_lo[0][1][0],
- neighbors_lo[0][1][1]);
-
- packed_hi = lp_build_lerp_2d(&h16,
- s_fpart_hi, t_fpart_hi,
- neighbors_hi[0][0][0],
- neighbors_hi[0][0][1],
- neighbors_hi[0][1][0],
- neighbors_hi[0][1][1]);
-
- if (dims >= 3) {
- LLVMValueRef packed_lo2, packed_hi2;
-
- /* lerp in the second z slice */
- packed_lo2 = lp_build_lerp_2d(&h16,
- s_fpart_lo, t_fpart_lo,
- neighbors_lo[1][0][0],
- neighbors_lo[1][0][1],
- neighbors_lo[1][1][0],
- neighbors_lo[1][1][1]);
-
- packed_hi2 = lp_build_lerp_2d(&h16,
- s_fpart_hi, t_fpart_hi,
- neighbors_hi[1][0][0],
- neighbors_hi[1][0][1],
- neighbors_hi[1][1][0],
- neighbors_hi[1][1][1]);
- /* interp between two z slices */
- packed_lo = lp_build_lerp(&h16, r_fpart_lo,
- packed_lo, packed_lo2);
- packed_hi = lp_build_lerp(&h16, r_fpart_hi,
- packed_hi, packed_hi2);
+ }
+ else if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
+ LLVMValueRef z_offset;
+ z_offset = lp_build_mul(&bld->int_coord_bld, r, img_stride_vec);
+ for (y = 0; y < 2; y++) {
+ for (x = 0; x < 2; x++) {
+ /* The r coord is the cube face in [0,5] */
+ offset[0][y][x] = lp_build_add(&bld->int_coord_bld,
+ offset[0][y][x], z_offset);
}
}
}
- *colors_lo = packed_lo;
- *colors_hi = packed_hi;
+ lp_build_sample_fetch_image_linear(bld, data_ptr, offset,
+ x_subcoord, y_subcoord,
+ s_fpart, t_fpart, r_fpart,
+ colors_lo, colors_hi);
}
@@ -824,10 +1324,10 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
LLVMBuilderRef builder = bld->gallivm->builder;
LLVMValueRef size0;
LLVMValueRef size1;
- LLVMValueRef row_stride0_vec;
- LLVMValueRef row_stride1_vec;
- LLVMValueRef img_stride0_vec;
- LLVMValueRef img_stride1_vec;
+ LLVMValueRef row_stride0_vec = NULL;
+ LLVMValueRef row_stride1_vec = NULL;
+ LLVMValueRef img_stride0_vec = NULL;
+ LLVMValueRef img_stride1_vec = NULL;
LLVMValueRef data_ptr0;
LLVMValueRef data_ptr1;
LLVMValueRef colors0_lo, colors0_hi;
@@ -838,20 +1338,39 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
&size0,
&row_stride0_vec, &img_stride0_vec);
data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0);
- if (img_filter == PIPE_TEX_FILTER_NEAREST) {
- lp_build_sample_image_nearest(bld,
- size0,
- row_stride0_vec, img_stride0_vec,
- data_ptr0, s, t, r,
- &colors0_lo, &colors0_hi);
+ if (util_cpu_caps.has_avx && bld->coord_type.length > 4) {
+ if (img_filter == PIPE_TEX_FILTER_NEAREST) {
+ lp_build_sample_image_nearest_afloat(bld,
+ size0,
+ row_stride0_vec, img_stride0_vec,
+ data_ptr0, s, t, r,
+ &colors0_lo, &colors0_hi);
+ }
+ else {
+ assert(img_filter == PIPE_TEX_FILTER_LINEAR);
+ lp_build_sample_image_linear_afloat(bld,
+ size0,
+ row_stride0_vec, img_stride0_vec,
+ data_ptr0, s, t, r,
+ &colors0_lo, &colors0_hi);
+ }
}
else {
- assert(img_filter == PIPE_TEX_FILTER_LINEAR);
- lp_build_sample_image_linear(bld,
- size0,
- row_stride0_vec, img_stride0_vec,
- data_ptr0, s, t, r,
- &colors0_lo, &colors0_hi);
+ if (img_filter == PIPE_TEX_FILTER_NEAREST) {
+ lp_build_sample_image_nearest(bld,
+ size0,
+ row_stride0_vec, img_stride0_vec,
+ data_ptr0, s, t, r,
+ &colors0_lo, &colors0_hi);
+ }
+ else {
+ assert(img_filter == PIPE_TEX_FILTER_LINEAR);
+ lp_build_sample_image_linear(bld,
+ size0,
+ row_stride0_vec, img_stride0_vec,
+ data_ptr0, s, t, r,
+ &colors0_lo, &colors0_hi);
+ }
}
/* Store the first level's colors in the output variables */
@@ -859,74 +1378,138 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
LLVMBuildStore(builder, colors0_hi, colors_hi_var);
if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
- LLVMValueRef h16_scale = lp_build_const_float(bld->gallivm, 256.0);
- LLVMTypeRef i32_type = LLVMIntTypeInContext(bld->gallivm->context, 32);
+ LLVMValueRef h16vec_scale = lp_build_const_vec(bld->gallivm,
+ bld->perquadf_bld.type, 256.0);
+ LLVMTypeRef i32vec_type = lp_build_vec_type(bld->gallivm, bld->perquadi_bld.type);
struct lp_build_if_state if_ctx;
LLVMValueRef need_lerp;
+ unsigned num_quads = bld->coord_bld.type.length / 4;
+ unsigned i;
- lod_fpart = LLVMBuildFMul(builder, lod_fpart, h16_scale, "");
- lod_fpart = LLVMBuildFPToSI(builder, lod_fpart, i32_type, "lod_fpart.fixed16");
+ lod_fpart = LLVMBuildFMul(builder, lod_fpart, h16vec_scale, "");
+ lod_fpart = LLVMBuildFPToSI(builder, lod_fpart, i32vec_type, "lod_fpart.fixed16");
/* need_lerp = lod_fpart > 0 */
- need_lerp = LLVMBuildICmp(builder, LLVMIntSGT,
- lod_fpart, LLVMConstNull(i32_type),
- "need_lerp");
+ if (num_quads == 1) {
+ need_lerp = LLVMBuildICmp(builder, LLVMIntSGT,
+ lod_fpart, bld->perquadi_bld.zero,
+ "need_lerp");
+ }
+ else {
+ /*
+ * We'll do mip filtering if any of the quads need it.
+ * It might be better to split the vectors here and only fetch/filter
+ * quads which need it.
+ */
+ /*
+ * We need to clamp lod_fpart here since we can get negative
+ * values, which would screw up filtering if not all
+ * lod_fpart values have the same sign.
+ * Having clamped, we can then skip the greater-than comparison.
+ */
+ lod_fpart = lp_build_max(&bld->perquadi_bld, lod_fpart,
+ bld->perquadi_bld.zero);
+ need_lerp = lp_build_any_true_range(&bld->perquadi_bld, num_quads, lod_fpart);
+ }
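
A scalar sketch of the clamp-then-reduce above, assuming lp_build_any_true_range simply ORs the per-quad lanes together (hypothetical helper name):

static int
any_quad_needs_lerp(int lod_fpart[], unsigned num_quads)
{
   unsigned i;
   int need_lerp = 0;
   for (i = 0; i < num_quads; i++) {
      if (lod_fpart[i] < 0)
         lod_fpart[i] = 0;   /* clamp; makes an explicit > 0 test redundant */
      need_lerp |= (lod_fpart[i] != 0);
   }
   return need_lerp;
}
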
lp_build_if(&if_ctx, bld->gallivm, need_lerp);
{
struct lp_build_context h16_bld;
- lp_build_context_init(&h16_bld, bld->gallivm, lp_type_ufixed(16));
+ lp_build_context_init(&h16_bld, bld->gallivm, lp_type_ufixed(16, bld->vector_width));
/* sample the second mipmap level */
lp_build_mipmap_level_sizes(bld, ilevel1,
&size1,
&row_stride1_vec, &img_stride1_vec);
data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1);
- if (img_filter == PIPE_TEX_FILTER_NEAREST) {
- lp_build_sample_image_nearest(bld,
- size1,
- row_stride1_vec, img_stride1_vec,
- data_ptr1, s, t, r,
- &colors1_lo, &colors1_hi);
+
+ if (util_cpu_caps.has_avx && bld->coord_type.length > 4) {
+ if (img_filter == PIPE_TEX_FILTER_NEAREST) {
+ lp_build_sample_image_nearest_afloat(bld,
+ size1,
+ row_stride1_vec, img_stride1_vec,
+ data_ptr1, s, t, r,
+ &colors1_lo, &colors1_hi);
+ }
+ else {
+ lp_build_sample_image_linear_afloat(bld,
+ size1,
+ row_stride1_vec, img_stride1_vec,
+ data_ptr1, s, t, r,
+ &colors1_lo, &colors1_hi);
+ }
}
else {
- lp_build_sample_image_linear(bld,
- size1,
- row_stride1_vec, img_stride1_vec,
- data_ptr1, s, t, r,
- &colors1_lo, &colors1_hi);
+ if (img_filter == PIPE_TEX_FILTER_NEAREST) {
+ lp_build_sample_image_nearest(bld,
+ size1,
+ row_stride1_vec, img_stride1_vec,
+ data_ptr1, s, t, r,
+ &colors1_lo, &colors1_hi);
+ }
+ else {
+ lp_build_sample_image_linear(bld,
+ size1,
+ row_stride1_vec, img_stride1_vec,
+ data_ptr1, s, t, r,
+ &colors1_lo, &colors1_hi);
+ }
}
/* interpolate samples from the two mipmap levels */
- lod_fpart = LLVMBuildTrunc(builder, lod_fpart, h16_bld.elem_type, "");
- lod_fpart = lp_build_broadcast_scalar(&h16_bld, lod_fpart);
+ if (num_quads == 1) {
+ lod_fpart = LLVMBuildTrunc(builder, lod_fpart, h16_bld.elem_type, "");
+ lod_fpart = lp_build_broadcast_scalar(&h16_bld, lod_fpart);
#if HAVE_LLVM == 0x208
- /* This is a work-around for a bug in LLVM 2.8.
- * Evidently, something goes wrong in the construction of the
- * lod_fpart short[8] vector. Adding this no-effect shuffle seems
- * to force the vector to be properly constructed.
- * Tested with mesa-demos/src/tests/mipmap_limits.c (press t, f).
- */
- {
- LLVMValueRef shuffles[8], shuffle;
- int i;
- assert(h16_bld.type.length <= Elements(shuffles));
- for (i = 0; i < h16_bld.type.length; i++)
- shuffles[i] = lp_build_const_int32(bld->gallivm, 2 * (i & 1));
- shuffle = LLVMConstVector(shuffles, h16_bld.type.length);
- lod_fpart = LLVMBuildShuffleVector(builder,
- lod_fpart, lod_fpart,
- shuffle, "");
- }
+ /* This is a work-around for a bug in LLVM 2.8.
+ * Evidently, something goes wrong in the construction of the
+ * lod_fpart short[8] vector. Adding this no-effect shuffle seems
+ * to force the vector to be properly constructed.
+ * Tested with mesa-demos/src/tests/mipmap_limits.c (press t, f).
+ */
+ {
+ LLVMValueRef shuffles[8], shuffle;
+ assert(h16_bld.type.length <= Elements(shuffles));
+ for (i = 0; i < h16_bld.type.length; i++)
+ shuffles[i] = lp_build_const_int32(bld->gallivm, 2 * (i & 1));
+ shuffle = LLVMConstVector(shuffles, h16_bld.type.length);
+ lod_fpart = LLVMBuildShuffleVector(builder,
+ lod_fpart, lod_fpart,
+ shuffle, "");
+ }
#endif
- colors0_lo = lp_build_lerp(&h16_bld, lod_fpart,
- colors0_lo, colors1_lo);
- colors0_hi = lp_build_lerp(&h16_bld, lod_fpart,
- colors0_hi, colors1_hi);
+ colors0_lo = lp_build_lerp(&h16_bld, lod_fpart,
+ colors0_lo, colors1_lo);
+ colors0_hi = lp_build_lerp(&h16_bld, lod_fpart,
+ colors0_hi, colors1_hi);
+ }
+ else {
+ LLVMValueRef lod_parts[LP_MAX_VECTOR_LENGTH/16];
+ struct lp_type perquadi16_type = bld->perquadi_bld.type;
+ perquadi16_type.width /= 2;
+ perquadi16_type.length *= 2;
+ lod_fpart = LLVMBuildBitCast(builder, lod_fpart,
+ lp_build_vec_type(bld->gallivm,
+ perquadi16_type), "");
+         /* XXX this only works for exactly 2 quads; more quads would need an extra shuffle */
+ assert(num_quads == 2);
+ for (i = 0; i < num_quads; i++) {
+ LLVMValueRef indexi2 = lp_build_const_int32(bld->gallivm, i*2);
+ lod_parts[i] = lp_build_extract_broadcast(bld->gallivm,
+ perquadi16_type,
+ h16_bld.type,
+ lod_fpart,
+ indexi2);
+ }
+ colors0_lo = lp_build_lerp(&h16_bld, lod_parts[0],
+ colors0_lo, colors1_lo);
+ colors0_hi = lp_build_lerp(&h16_bld, lod_parts[1],
+ colors0_hi, colors1_hi);
+ }
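
The bitcast-and-extract above can be pictured scalar-wise as taking the low 16 bits of each per-quad 8.8 fixed-point weight; index 2*i selects the low half assuming little-endian lane order (a sketch, not the actual codegen):

#include <stdint.h>

/* hypothetical scalar view of one quad's 16-bit lerp weight */
static uint16_t
quad_weight(const uint32_t lod_fpart32[], unsigned quad)
{
   return (uint16_t)(lod_fpart32[quad] & 0xffff);
}
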
LLVMBuildStore(builder, colors0_lo, colors_lo_var);
LLVMBuildStore(builder, colors0_hi, colors_hi_var);
@@ -948,10 +1531,10 @@ lp_build_sample_aos(struct lp_build_sample_context *bld,
LLVMValueRef s,
LLVMValueRef t,
LLVMValueRef r,
- const LLVMValueRef *ddx,
- const LLVMValueRef *ddy,
- LLVMValueRef lod_bias, /* optional */
- LLVMValueRef explicit_lod, /* optional */
+ LLVMValueRef lod_ipart,
+ LLVMValueRef lod_fpart,
+ LLVMValueRef ilevel0,
+ LLVMValueRef ilevel1,
LLVMValueRef texel_out[4])
{
struct lp_build_context *int_bld = &bld->int_bld;
@@ -960,14 +1543,9 @@ lp_build_sample_aos(struct lp_build_sample_context *bld,
const unsigned min_filter = bld->static_state->min_img_filter;
const unsigned mag_filter = bld->static_state->mag_img_filter;
const unsigned dims = bld->dims;
- LLVMValueRef lod_ipart = NULL, lod_fpart = NULL;
- LLVMValueRef ilevel0, ilevel1 = NULL;
LLVMValueRef packed, packed_lo, packed_hi;
LLVMValueRef unswizzled[4];
- LLVMValueRef face_ddx[4], face_ddy[4];
struct lp_build_context h16_bld;
- LLVMValueRef first_level;
- LLVMValueRef i32t_zero = lp_build_const_int32(bld->gallivm, 0);
/* we only support the common/simple wrap modes at this time */
assert(lp_is_simple_wrap_mode(bld->static_state->wrap_s));
@@ -978,81 +1556,7 @@ lp_build_sample_aos(struct lp_build_sample_context *bld,
/* make 16-bit fixed-pt builder context */
- lp_build_context_init(&h16_bld, bld->gallivm, lp_type_ufixed(16));
-
- /* cube face selection, compute pre-face coords, etc. */
- if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
- LLVMValueRef face, face_s, face_t;
- lp_build_cube_lookup(bld, s, t, r, &face, &face_s, &face_t);
- s = face_s; /* vec */
- t = face_t; /* vec */
- /* use 'r' to indicate cube face */
- r = lp_build_broadcast_scalar(&bld->int_coord_bld, face); /* vec */
-
- /* recompute ddx, ddy using the new (s,t) face texcoords */
- face_ddx[0] = lp_build_scalar_ddx(&bld->coord_bld, s);
- face_ddx[1] = lp_build_scalar_ddx(&bld->coord_bld, t);
- face_ddx[2] = NULL;
- face_ddx[3] = NULL;
- face_ddy[0] = lp_build_scalar_ddy(&bld->coord_bld, s);
- face_ddy[1] = lp_build_scalar_ddy(&bld->coord_bld, t);
- face_ddy[2] = NULL;
- face_ddy[3] = NULL;
- ddx = face_ddx;
- ddy = face_ddy;
- }
-
- /*
- * Compute the level of detail (float).
- */
- if (min_filter != mag_filter ||
- mip_filter != PIPE_TEX_MIPFILTER_NONE) {
- /* Need to compute lod either to choose mipmap levels or to
- * distinguish between minification/magnification with one mipmap level.
- */
- lp_build_lod_selector(bld, unit, ddx, ddy,
- lod_bias, explicit_lod,
- mip_filter,
- &lod_ipart, &lod_fpart);
- } else {
- lod_ipart = i32t_zero;
- }
-
- /*
- * Compute integer mipmap level(s) to fetch texels from: ilevel0, ilevel1
- */
- switch (mip_filter) {
- default:
- assert(0 && "bad mip_filter value in lp_build_sample_aos()");
- /* fall-through */
- case PIPE_TEX_MIPFILTER_NONE:
- /* always use mip level 0 */
- if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
- /* XXX this is a work-around for an apparent bug in LLVM 2.7.
- * We should be able to set ilevel0 = const(0) but that causes
- * bad x86 code to be emitted.
- */
- assert(lod_ipart);
- lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0);
- }
- else {
- first_level = bld->dynamic_state->first_level(bld->dynamic_state,
- bld->gallivm, unit);
- ilevel0 = first_level;
- }
- break;
- case PIPE_TEX_MIPFILTER_NEAREST:
- assert(lod_ipart);
- lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0);
- break;
- case PIPE_TEX_MIPFILTER_LINEAR:
- assert(lod_ipart);
- assert(lod_fpart);
- lp_build_linear_mip_levels(bld, unit,
- lod_ipart, &lod_fpart,
- &ilevel0, &ilevel1);
- break;
- }
+ lp_build_context_init(&h16_bld, bld->gallivm, lp_type_ufixed(16, bld->vector_width));
/*
* Get/interpolate texture colors.
@@ -1062,7 +1566,7 @@ lp_build_sample_aos(struct lp_build_sample_context *bld,
packed_hi = lp_build_alloca(bld->gallivm, h16_bld.vec_type, "packed_hi");
if (min_filter == mag_filter) {
- /* no need to distinquish between minification and magnification */
+ /* no need to distinguish between minification and magnification */
lp_build_sample_mipmap(bld,
min_filter, mip_filter,
s, t, r,
@@ -1106,7 +1610,7 @@ lp_build_sample_aos(struct lp_build_sample_context *bld,
* into 'packed'
*/
packed = lp_build_pack2(bld->gallivm,
- h16_bld.type, lp_type_unorm(8),
+ h16_bld.type, lp_type_unorm(8, bld->vector_width),
LLVMBuildLoad(builder, packed_lo, ""),
LLVMBuildLoad(builder, packed_hi, ""));
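
lp_build_pack2 with a unorm8 destination narrows the two 16-bit vectors into one 8-bit vector with unsigned saturation (packuswb on x86). A minimal scalar sketch of the per-lane operation, assuming saturating semantics:

#include <stdint.h>

/* per-lane saturating narrow, i16 -> u8 */
static uint8_t
sat_narrow_u8(int16_t v)
{
   return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}
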
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h
index 5d9ecac4d50..55b3bc1c09a 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h
@@ -46,10 +46,10 @@ lp_build_sample_aos(struct lp_build_sample_context *bld,
LLVMValueRef s,
LLVMValueRef t,
LLVMValueRef r,
- const LLVMValueRef *ddx,
- const LLVMValueRef *ddy,
- LLVMValueRef lod_bias, /* optional */
- LLVMValueRef explicit_lod, /* optional */
+ LLVMValueRef lod_ipart,
+ LLVMValueRef lod_fpart,
+ LLVMValueRef ilevel0,
+ LLVMValueRef ilevel1,
LLVMValueRef texel_out[4]);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index 73dc3e77083..aaef7970635 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -41,6 +41,7 @@
#include "util/u_memory.h"
#include "util/u_math.h"
#include "util/u_format.h"
+#include "util/u_cpu_detect.h"
#include "lp_bld_debug.h"
#include "lp_bld_type.h"
#include "lp_bld_const.h"
@@ -57,6 +58,7 @@
#include "lp_bld_sample_aos.h"
#include "lp_bld_struct.h"
#include "lp_bld_quad.h"
+#include "lp_bld_pack.h"
/**
@@ -221,6 +223,41 @@ lp_build_coord_mirror(struct lp_build_sample_context *bld,
/**
+ * Helper to compute the first coord and the weight for
+ * linear wrap repeat npot textures
+ */
+void
+lp_build_coord_repeat_npot_linear(struct lp_build_sample_context *bld,
+ LLVMValueRef coord_f,
+ LLVMValueRef length_i,
+ LLVMValueRef length_f,
+ LLVMValueRef *coord0_i,
+ LLVMValueRef *weight_f)
+{
+ struct lp_build_context *coord_bld = &bld->coord_bld;
+ struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
+ LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
+ LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length_i,
+ int_coord_bld->one);
+ LLVMValueRef mask;
+ /* wrap with normalized floats is just fract */
+ coord_f = lp_build_fract(coord_bld, coord_f);
+ /* mul by size and subtract 0.5 */
+ coord_f = lp_build_mul(coord_bld, coord_f, length_f);
+ coord_f = lp_build_sub(coord_bld, coord_f, half);
+ /*
+ * we avoided the 0.5/length division before the repeat wrap,
+    * so we now need to fix up the edge cases with selects
+ */
+ /* convert to int, compute lerp weight */
+ lp_build_ifloor_fract(coord_bld, coord_f, coord0_i, weight_f);
+ mask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
+ PIPE_FUNC_LESS, *coord0_i, int_coord_bld->zero);
+ *coord0_i = lp_build_select(int_coord_bld, mask, length_minus_one, *coord0_i);
+}
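
A minimal scalar sketch of the helper above, assuming a normalized coordinate s and texture size n; note only the coord0 == -1 edge case needs wrapping, since fract keeps the scaled coordinate below n - 0.5:

#include <math.h>

static void
repeat_npot_linear(float s, int n, int *coord0, float *weight)
{
   float u = (s - floorf(s)) * (float)n - 0.5f;  /* fract(s) * size - 0.5 */
   *coord0 = (int)floorf(u);
   *weight = u - floorf(u);
   if (*coord0 < 0)
      *coord0 = n - 1;   /* wrap the -1 edge case to the last texel */
}
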
+
+
+/**
* Build LLVM code for texture wrap mode for linear filtering.
* \param x0_out returns first integer texcoord
* \param x1_out returns second integer texcoord
@@ -246,28 +283,27 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
switch(wrap_mode) {
case PIPE_TEX_WRAP_REPEAT:
- /* mul by size and subtract 0.5 */
- coord = lp_build_mul(coord_bld, coord, length_f);
- coord = lp_build_sub(coord_bld, coord, half);
- /* convert to int, compute lerp weight */
- lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
- /* repeat wrap */
if (is_pot) {
+ /* mul by size and subtract 0.5 */
+ coord = lp_build_mul(coord_bld, coord, length_f);
+ coord = lp_build_sub(coord_bld, coord, half);
+ /* convert to int, compute lerp weight */
+ lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
+ /* repeat wrap */
coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, "");
coord1 = LLVMBuildAnd(builder, coord1, length_minus_one, "");
}
else {
- /* Add a bias to the texcoord to handle negative coords */
- LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024);
LLVMValueRef mask;
- coord0 = LLVMBuildAdd(builder, coord0, bias, "");
- coord0 = LLVMBuildURem(builder, coord0, length, "");
- mask = lp_build_compare(bld->gallivm, int_coord_bld->type,
+ lp_build_coord_repeat_npot_linear(bld, coord,
+ length, length_f,
+ &coord0, &weight);
+ mask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);
coord1 = LLVMBuildAnd(builder,
- lp_build_add(int_coord_bld, coord0, int_coord_bld->one),
- mask, "");
+ lp_build_add(int_coord_bld, coord0, int_coord_bld->one),
+ mask, "");
}
break;
@@ -444,15 +480,16 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
switch(wrap_mode) {
case PIPE_TEX_WRAP_REPEAT:
- coord = lp_build_mul(coord_bld, coord, length_f);
- icoord = lp_build_ifloor(coord_bld, coord);
- if (is_pot)
+ if (is_pot) {
+ coord = lp_build_mul(coord_bld, coord, length_f);
+ icoord = lp_build_ifloor(coord_bld, coord);
icoord = LLVMBuildAnd(builder, icoord, length_minus_one, "");
+ }
else {
- /* Add a bias to the texcoord to handle negative coords */
- LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024);
- icoord = LLVMBuildAdd(builder, icoord, bias, "");
- icoord = LLVMBuildURem(builder, icoord, length, "");
+ /* take fraction, unnormalize */
+ coord = lp_build_fract_safe(coord_bld, coord);
+ coord = lp_build_mul(coord_bld, coord, length_f);
+ icoord = lp_build_itrunc(coord_bld, coord);
}
break;
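
A scalar sketch of the NPOT nearest path above; it assumes lp_build_fract_safe clamps the fraction strictly below 1.0 so the truncation cannot produce n:

#include <math.h>

static int
repeat_npot_nearest(float s, int n)
{
   float u = s - floorf(s);       /* assumed clamped so u < 1.0 */
   return (int)(u * (float)n);    /* u >= 0, so trunc equals floor */
}
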
@@ -473,7 +510,7 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
break;
case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
- /* Note: this is the same as CLAMP_TO_EDGE, except min = -min */
+ /* Note: this is the same as CLAMP_TO_EDGE, except min = -1 */
{
LLVMValueRef min, max;
@@ -873,12 +910,32 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
struct lp_build_if_state if_ctx;
LLVMValueRef need_lerp;
+ unsigned num_quads = bld->coord_bld.type.length / 4;
/* need_lerp = lod_fpart > 0 */
- need_lerp = LLVMBuildFCmp(builder, LLVMRealUGT,
- lod_fpart,
- bld->float_bld.zero,
- "need_lerp");
+ if (num_quads == 1) {
+ need_lerp = LLVMBuildFCmp(builder, LLVMRealUGT,
+ lod_fpart, bld->perquadf_bld.zero,
+ "need_lerp");
+ }
+ else {
+ /*
+ * We'll do mip filtering if any of the quads need it.
+ * It might be better to split the vectors here and only fetch/filter
+ * quads which need it.
+ */
+ /*
+ * We unfortunately need to clamp lod_fpart here since we can get
+          * negative values, which would screw up filtering if not all
+          * lod_fpart values have the same sign.
+ */
+ lod_fpart = lp_build_max(&bld->perquadf_bld, lod_fpart,
+ bld->perquadf_bld.zero);
+ need_lerp = lp_build_compare(bld->gallivm, bld->perquadf_bld.type,
+ PIPE_FUNC_GREATER,
+ lod_fpart, bld->perquadf_bld.zero);
+ need_lerp = lp_build_any_true_range(&bld->perquadi_bld, num_quads, need_lerp);
+ }
lp_build_if(&if_ctx, bld->gallivm, need_lerp);
{
@@ -904,7 +961,10 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
/* interpolate samples from the two mipmap levels */
- lod_fpart = lp_build_broadcast_scalar(&bld->texel_bld, lod_fpart);
+ lod_fpart = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
+ bld->perquadf_bld.type,
+ bld->texel_bld.type,
+ lod_fpart);
for (chan = 0; chan < 4; chan++) {
colors0[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart,
@@ -916,37 +976,28 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
}
}
-
-
/**
- * General texture sampling codegen.
- * This function handles texture sampling for all texture targets (1D,
- * 2D, 3D, cube) and all filtering modes.
+ * Calculate cube face, lod, mip levels.
*/
static void
-lp_build_sample_general(struct lp_build_sample_context *bld,
- unsigned unit,
- LLVMValueRef s,
- LLVMValueRef t,
- LLVMValueRef r,
- const LLVMValueRef *ddx,
- const LLVMValueRef *ddy,
- LLVMValueRef lod_bias, /* optional */
- LLVMValueRef explicit_lod, /* optional */
- LLVMValueRef *colors_out)
+lp_build_sample_common(struct lp_build_sample_context *bld,
+ unsigned unit,
+ LLVMValueRef *s,
+ LLVMValueRef *t,
+ LLVMValueRef *r,
+ const struct lp_derivatives *derivs,
+ LLVMValueRef lod_bias, /* optional */
+ LLVMValueRef explicit_lod, /* optional */
+ LLVMValueRef *lod_ipart,
+ LLVMValueRef *lod_fpart,
+ LLVMValueRef *ilevel0,
+ LLVMValueRef *ilevel1)
{
- struct lp_build_context *int_bld = &bld->int_bld;
- LLVMBuilderRef builder = bld->gallivm->builder;
const unsigned mip_filter = bld->static_state->min_mip_filter;
const unsigned min_filter = bld->static_state->min_img_filter;
const unsigned mag_filter = bld->static_state->mag_img_filter;
- LLVMValueRef lod_ipart = NULL, lod_fpart = NULL;
- LLVMValueRef ilevel0, ilevel1 = NULL;
- LLVMValueRef face_ddx[4], face_ddy[4];
- LLVMValueRef texels[4];
LLVMValueRef first_level;
- LLVMValueRef i32t_zero = lp_build_const_int32(bld->gallivm, 0);
- unsigned chan;
+ struct lp_derivatives face_derivs;
/*
printf("%s mip %d min %d mag %d\n", __FUNCTION__,
@@ -958,23 +1009,16 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
*/
if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
LLVMValueRef face, face_s, face_t;
- lp_build_cube_lookup(bld, s, t, r, &face, &face_s, &face_t);
- s = face_s; /* vec */
- t = face_t; /* vec */
+ lp_build_cube_lookup(bld, *s, *t, *r, &face, &face_s, &face_t);
+ *s = face_s; /* vec */
+ *t = face_t; /* vec */
/* use 'r' to indicate cube face */
- r = lp_build_broadcast_scalar(&bld->int_coord_bld, face); /* vec */
+ *r = face; /* vec */
/* recompute ddx, ddy using the new (s,t) face texcoords */
- face_ddx[0] = lp_build_scalar_ddx(&bld->coord_bld, s);
- face_ddx[1] = lp_build_scalar_ddx(&bld->coord_bld, t);
- face_ddx[2] = NULL;
- face_ddx[3] = NULL;
- face_ddy[0] = lp_build_scalar_ddy(&bld->coord_bld, s);
- face_ddy[1] = lp_build_scalar_ddy(&bld->coord_bld, t);
- face_ddy[2] = NULL;
- face_ddy[3] = NULL;
- ddx = face_ddx;
- ddy = face_ddy;
+ face_derivs.ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(&bld->coord_bld, *s, *t);
+ face_derivs.ddx_ddy[1] = NULL;
+ derivs = &face_derivs;
}
/*
@@ -985,12 +1029,12 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
/* Need to compute lod either to choose mipmap levels or to
* distinguish between minification/magnification with one mipmap level.
*/
- lp_build_lod_selector(bld, unit, ddx, ddy,
+ lp_build_lod_selector(bld, unit, derivs,
lod_bias, explicit_lod,
mip_filter,
- &lod_ipart, &lod_fpart);
+ lod_ipart, lod_fpart);
} else {
- lod_ipart = i32t_zero;
+ *lod_ipart = bld->perquadi_bld.zero;
}
/*
@@ -1006,28 +1050,56 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
/* XXX this is a work-around for an apparent bug in LLVM 2.7.
* We should be able to set ilevel0 = const(0) but that causes
* bad x86 code to be emitted.
+          * XXX we should probably disable this workaround on other LLVM versions.
*/
- assert(lod_ipart);
- lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0);
+ assert(*lod_ipart);
+ lp_build_nearest_mip_level(bld, unit, *lod_ipart, ilevel0);
}
else {
first_level = bld->dynamic_state->first_level(bld->dynamic_state,
bld->gallivm, unit);
- ilevel0 = first_level;
+ first_level = lp_build_broadcast_scalar(&bld->perquadi_bld, first_level);
+ *ilevel0 = first_level;
}
break;
case PIPE_TEX_MIPFILTER_NEAREST:
- assert(lod_ipart);
- lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0);
+ assert(*lod_ipart);
+ lp_build_nearest_mip_level(bld, unit, *lod_ipart, ilevel0);
break;
case PIPE_TEX_MIPFILTER_LINEAR:
- assert(lod_ipart);
- assert(lod_fpart);
+ assert(*lod_ipart);
+ assert(*lod_fpart);
lp_build_linear_mip_levels(bld, unit,
- lod_ipart, &lod_fpart,
- &ilevel0, &ilevel1);
+ *lod_ipart, lod_fpart,
+ ilevel0, ilevel1);
break;
}
+}
+
+/**
+ * General texture sampling codegen.
+ * This function handles texture sampling for all texture targets (1D,
+ * 2D, 3D, cube) and all filtering modes.
+ */
+static void
+lp_build_sample_general(struct lp_build_sample_context *bld,
+ unsigned unit,
+ LLVMValueRef s,
+ LLVMValueRef t,
+ LLVMValueRef r,
+ LLVMValueRef lod_ipart,
+ LLVMValueRef lod_fpart,
+ LLVMValueRef ilevel0,
+ LLVMValueRef ilevel1,
+ LLVMValueRef *colors_out)
+{
+ struct lp_build_context *int_bld = &bld->int_bld;
+ LLVMBuilderRef builder = bld->gallivm->builder;
+ const unsigned mip_filter = bld->static_state->min_mip_filter;
+ const unsigned min_filter = bld->static_state->min_img_filter;
+ const unsigned mag_filter = bld->static_state->mag_img_filter;
+ LLVMValueRef texels[4];
+ unsigned chan;
/*
* Get/interpolate texture colors.
@@ -1039,7 +1111,7 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
}
if (min_filter == mag_filter) {
- /* no need to distinquish between minification and magnification */
+ /* no need to distinguish between minification and magnification */
lp_build_sample_mipmap(bld, unit,
min_filter, mip_filter,
s, t, r,
@@ -1135,7 +1207,10 @@ lp_build_sample_compare(struct lp_build_sample_context *bld,
* For debugging.
*/
void
-lp_build_sample_nop(struct gallivm_state *gallivm, struct lp_type type,
+lp_build_sample_nop(struct gallivm_state *gallivm,
+ struct lp_type type,
+ unsigned num_coords,
+ const LLVMValueRef *coords,
LLVMValueRef texel_out[4])
{
LLVMValueRef one = lp_build_one(gallivm, type);
@@ -1152,8 +1227,7 @@ lp_build_sample_nop(struct gallivm_state *gallivm, struct lp_type type,
* 'texel' will return a vector of four LLVMValueRefs corresponding to
* R, G, B, A.
* \param type vector float type to use for coords, etc.
- * \param ddx partial derivatives of (s,t,r,q) with respect to x
- * \param ddy partial derivatives of (s,t,r,q) with respect to y
+ * \param derivs partial derivatives of (s,t,r,q) with respect to x and y
*/
void
lp_build_sample_soa(struct gallivm_state *gallivm,
@@ -1163,8 +1237,7 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
unsigned unit,
unsigned num_coords,
const LLVMValueRef *coords,
- const LLVMValueRef ddx[4],
- const LLVMValueRef ddy[4],
+ const struct lp_derivatives *derivs,
LLVMValueRef lod_bias, /* optional */
LLVMValueRef explicit_lod, /* optional */
LLVMValueRef texel_out[4])
@@ -1173,10 +1246,10 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
struct lp_build_sample_context bld;
LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef tex_width, tex_height, tex_depth;
LLVMValueRef s;
LLVMValueRef t;
LLVMValueRef r;
- struct lp_type float_vec_type;
if (0) {
enum pipe_format fmt = static_state->format;
@@ -1193,6 +1266,8 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
bld.format_desc = util_format_description(static_state->format);
bld.dims = dims;
+ bld.vector_width = lp_type_width(type);
+
bld.float_type = lp_type_float(32);
bld.int_type = lp_type_int(32);
bld.coord_type = type;
@@ -1201,22 +1276,26 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
bld.float_size_type.length = dims > 1 ? 4 : 1;
bld.int_size_type = lp_int_type(bld.float_size_type);
bld.texel_type = type;
-
- float_vec_type = lp_type_float_vec(32);
+ bld.perquadf_type = type;
+ /* we want native vector size to be able to use our intrinsics */
+ bld.perquadf_type.length = type.length > 4 ? ((type.length + 15) / 16) * 4 : 1;
+ bld.perquadi_type = lp_int_type(bld.perquadf_type);
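
A few worked values of the padding rule just computed (hypothetical helper name):

/* perquad_length(4)  == 1 : one quad, scalar per-quad value
 * perquad_length(8)  == 4 : two quads, padded to a 4-wide native vector
 * perquad_length(16) == 4 : four quads fill a 4-wide vector exactly */
static unsigned
perquad_length(unsigned length)
{
   return length > 4 ? ((length + 15) / 16) * 4 : 1;
}
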
lp_build_context_init(&bld.float_bld, gallivm, bld.float_type);
- lp_build_context_init(&bld.float_vec_bld, gallivm, float_vec_type);
+ lp_build_context_init(&bld.float_vec_bld, gallivm, type);
lp_build_context_init(&bld.int_bld, gallivm, bld.int_type);
lp_build_context_init(&bld.coord_bld, gallivm, bld.coord_type);
lp_build_context_init(&bld.int_coord_bld, gallivm, bld.int_coord_type);
lp_build_context_init(&bld.int_size_bld, gallivm, bld.int_size_type);
lp_build_context_init(&bld.float_size_bld, gallivm, bld.float_size_type);
lp_build_context_init(&bld.texel_bld, gallivm, bld.texel_type);
+ lp_build_context_init(&bld.perquadf_bld, gallivm, bld.perquadf_type);
+ lp_build_context_init(&bld.perquadi_bld, gallivm, bld.perquadi_type);
/* Get the dynamic state */
- bld.width = dynamic_state->width(dynamic_state, gallivm, unit);
- bld.height = dynamic_state->height(dynamic_state, gallivm, unit);
- bld.depth = dynamic_state->depth(dynamic_state, gallivm, unit);
+ tex_width = dynamic_state->width(dynamic_state, gallivm, unit);
+ tex_height = dynamic_state->height(dynamic_state, gallivm, unit);
+ tex_depth = dynamic_state->depth(dynamic_state, gallivm, unit);
bld.row_stride_array = dynamic_state->row_stride(dynamic_state, gallivm, unit);
bld.img_stride_array = dynamic_state->img_stride(dynamic_state, gallivm, unit);
bld.data_array = dynamic_state->data_ptr(dynamic_state, gallivm, unit);
@@ -1228,37 +1307,40 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
/* width, height, depth as single int vector */
if (dims <= 1) {
- bld.int_size = bld.width;
+ bld.int_size = tex_width;
}
else {
bld.int_size = LLVMBuildInsertElement(builder, bld.int_size_bld.undef,
- bld.width, LLVMConstInt(i32t, 0, 0), "");
+ tex_width, LLVMConstInt(i32t, 0, 0), "");
if (dims >= 2) {
bld.int_size = LLVMBuildInsertElement(builder, bld.int_size,
- bld.height, LLVMConstInt(i32t, 1, 0), "");
+ tex_height, LLVMConstInt(i32t, 1, 0), "");
if (dims >= 3) {
bld.int_size = LLVMBuildInsertElement(builder, bld.int_size,
- bld.depth, LLVMConstInt(i32t, 2, 0), "");
+ tex_depth, LLVMConstInt(i32t, 2, 0), "");
}
}
}
if (0) {
/* For debug: no-op texture sampling */
- lp_build_sample_nop(gallivm, bld.texel_type, texel_out);
- }
- else if (util_format_fits_8unorm(bld.format_desc) &&
- lp_is_simple_wrap_mode(static_state->wrap_s) &&
- lp_is_simple_wrap_mode(static_state->wrap_t)) {
- /* do sampling/filtering with fixed pt arithmetic */
- lp_build_sample_aos(&bld, unit, s, t, r, ddx, ddy,
- lod_bias, explicit_lod,
+ lp_build_sample_nop(gallivm,
+ bld.texel_type,
+ num_coords,
+ coords,
texel_out);
}
-
else {
+ LLVMValueRef lod_ipart = NULL, lod_fpart = NULL;
+ LLVMValueRef ilevel0 = NULL, ilevel1 = NULL;
+ unsigned num_quads = type.length / 4;
+ const unsigned mip_filter = bld.static_state->min_mip_filter;
+ boolean use_aos = util_format_fits_8unorm(bld.format_desc) &&
+ lp_is_simple_wrap_mode(static_state->wrap_s) &&
+ lp_is_simple_wrap_mode(static_state->wrap_t);
+
if ((gallivm_debug & GALLIVM_DEBUG_PERF) &&
- util_format_fits_8unorm(bld.format_desc)) {
+ !use_aos && util_format_fits_8unorm(bld.format_desc)) {
debug_printf("%s: using floating point linear filtering for %s\n",
__FUNCTION__, bld.format_desc->short_name);
debug_printf(" min_img %d mag_img %d mip %d wraps %d wrapt %d\n",
@@ -1269,9 +1351,203 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
static_state->wrap_t);
}
- lp_build_sample_general(&bld, unit, s, t, r, ddx, ddy,
- lod_bias, explicit_lod,
- texel_out);
+ lp_build_sample_common(&bld, unit,
+ &s, &t, &r,
+ derivs, lod_bias, explicit_lod,
+ &lod_ipart, &lod_fpart,
+ &ilevel0, &ilevel1);
+
+ /*
+       * we only try 8-wide sampling with SoA, as it appears to
+       * be a loss with AoS on AVX.
+ */
+ if (num_quads == 1 || (mip_filter == PIPE_TEX_MIPFILTER_NONE &&
+ !use_aos)) {
+
+ if (num_quads > 1) {
+ LLVMValueRef index0 = lp_build_const_int32(gallivm, 0);
+ /* These parameters are the same for all quads */
+ lod_ipart = LLVMBuildExtractElement(builder, lod_ipart, index0, "");
+ ilevel0 = LLVMBuildExtractElement(builder, ilevel0, index0, "");
+ }
+ if (use_aos) {
+ /* do sampling/filtering with fixed pt arithmetic */
+ lp_build_sample_aos(&bld, unit,
+ s, t, r,
+ lod_ipart, lod_fpart,
+ ilevel0, ilevel1,
+ texel_out);
+ }
+
+ else {
+ lp_build_sample_general(&bld, unit,
+ s, t, r,
+ lod_ipart, lod_fpart,
+ ilevel0, ilevel1,
+ texel_out);
+ }
+ }
+ else {
+ struct lp_build_if_state if_ctx;
+ LLVMValueRef notsame_levels, notsame;
+ LLVMValueRef index0 = lp_build_const_int32(gallivm, 0);
+ LLVMValueRef texels[4];
+ LLVMValueRef texelout[4];
+ unsigned j;
+
+ texels[0] = lp_build_alloca(gallivm, bld.texel_bld.vec_type, "texr");
+ texels[1] = lp_build_alloca(gallivm, bld.texel_bld.vec_type, "texg");
+ texels[2] = lp_build_alloca(gallivm, bld.texel_bld.vec_type, "texb");
+ texels[3] = lp_build_alloca(gallivm, bld.texel_bld.vec_type, "texa");
+
+         /* only build the if-statement when we MAY split; otherwise always split */
+ if (!use_aos) {
+ notsame = lp_build_extract_broadcast(gallivm,
+ bld.perquadi_bld.type,
+ bld.perquadi_bld.type,
+ ilevel0, index0);
+ notsame = lp_build_sub(&bld.perquadi_bld, ilevel0, notsame);
+ notsame_levels = lp_build_any_true_range(&bld.perquadi_bld, num_quads,
+ notsame);
+ if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
+ notsame = lp_build_extract_broadcast(gallivm,
+ bld.perquadi_bld.type,
+ bld.perquadi_bld.type,
+ ilevel1, index0);
+ notsame = lp_build_sub(&bld.perquadi_bld, ilevel1, notsame);
+ notsame = lp_build_any_true_range(&bld.perquadi_bld, num_quads, notsame);
+ notsame_levels = LLVMBuildOr(builder, notsame_levels, notsame, "");
+ }
+ lp_build_if(&if_ctx, gallivm, notsame_levels);
+ }
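
Scalar sketch of the split test above: subtracting quad 0's level and OR-reducing is equivalent to asking whether any quad disagrees (hypothetical helper):

static int
quads_use_different_levels(const int ilevel0[], const int ilevel1[],
                           unsigned num_quads, int mip_linear)
{
   unsigned i;
   int notsame = 0;
   for (i = 1; i < num_quads; i++) {
      notsame |= (ilevel0[i] != ilevel0[0]);
      if (mip_linear)
         notsame |= (ilevel1[i] != ilevel1[0]);
   }
   return notsame;
}
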
+
+ {
+ struct lp_build_sample_context bld4;
+ struct lp_type type4 = type;
+ unsigned i;
+ LLVMValueRef texelout4[4];
+ LLVMValueRef texelouttmp[4][LP_MAX_VECTOR_LENGTH/16];
+
+ type4.length = 4;
+
+ /* Setup our build context */
+ memset(&bld4, 0, sizeof bld4);
+ bld4.gallivm = bld.gallivm;
+ bld4.static_state = bld.static_state;
+ bld4.dynamic_state = bld.dynamic_state;
+ bld4.format_desc = bld.format_desc;
+ bld4.dims = bld.dims;
+ bld4.row_stride_array = bld.row_stride_array;
+ bld4.img_stride_array = bld.img_stride_array;
+ bld4.data_array = bld.data_array;
+ bld4.int_size = bld.int_size;
+
+ bld4.vector_width = lp_type_width(type4);
+
+ bld4.float_type = lp_type_float(32);
+ bld4.int_type = lp_type_int(32);
+ bld4.coord_type = type4;
+ bld4.int_coord_type = lp_int_type(type4);
+ bld4.float_size_type = lp_type_float(32);
+ bld4.float_size_type.length = dims > 1 ? 4 : 1;
+ bld4.int_size_type = lp_int_type(bld4.float_size_type);
+ bld4.texel_type = type4;
+ bld4.perquadf_type = type4;
+ /* we want native vector size to be able to use our intrinsics */
+ bld4.perquadf_type.length = 1;
+ bld4.perquadi_type = lp_int_type(bld4.perquadf_type);
+
+ lp_build_context_init(&bld4.float_bld, gallivm, bld4.float_type);
+ lp_build_context_init(&bld4.float_vec_bld, gallivm, type4);
+ lp_build_context_init(&bld4.int_bld, gallivm, bld4.int_type);
+ lp_build_context_init(&bld4.coord_bld, gallivm, bld4.coord_type);
+ lp_build_context_init(&bld4.int_coord_bld, gallivm, bld4.int_coord_type);
+ lp_build_context_init(&bld4.int_size_bld, gallivm, bld4.int_size_type);
+ lp_build_context_init(&bld4.float_size_bld, gallivm, bld4.float_size_type);
+ lp_build_context_init(&bld4.texel_bld, gallivm, bld4.texel_type);
+ lp_build_context_init(&bld4.perquadf_bld, gallivm, bld4.perquadf_type);
+ lp_build_context_init(&bld4.perquadi_bld, gallivm, bld4.perquadi_type);
+
+ for (i = 0; i < num_quads; i++) {
+ LLVMValueRef s4, t4, r4;
+ LLVMValueRef lod_iparts, lod_fparts = NULL;
+ LLVMValueRef ilevel0s, ilevel1s = NULL;
+ LLVMValueRef indexi = lp_build_const_int32(gallivm, i);
+
+ s4 = lp_build_extract_range(gallivm, s, 4*i, 4);
+ t4 = lp_build_extract_range(gallivm, t, 4*i, 4);
+ r4 = lp_build_extract_range(gallivm, r, 4*i, 4);
+ lod_iparts = LLVMBuildExtractElement(builder, lod_ipart, indexi, "");
+ ilevel0s = LLVMBuildExtractElement(builder, ilevel0, indexi, "");
+ if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
+ ilevel1s = LLVMBuildExtractElement(builder, ilevel1, indexi, "");
+ lod_fparts = LLVMBuildExtractElement(builder, lod_fpart, indexi, "");
+ }
+
+ if (use_aos) {
+ /* do sampling/filtering with fixed pt arithmetic */
+ lp_build_sample_aos(&bld4, unit,
+ s4, t4, r4,
+ lod_iparts, lod_fparts,
+ ilevel0s, ilevel1s,
+ texelout4);
+ }
+
+ else {
+ lp_build_sample_general(&bld4, unit,
+ s4, t4, r4,
+ lod_iparts, lod_fparts,
+ ilevel0s, ilevel1s,
+ texelout4);
+ }
+ for (j = 0; j < 4; j++) {
+ texelouttmp[j][i] = texelout4[j];
+ }
+ }
+ for (j = 0; j < 4; j++) {
+ texelout[j] = lp_build_concat(gallivm, texelouttmp[j], type4, num_quads);
+ LLVMBuildStore(builder, texelout[j], texels[j]);
+ }
+ }
+ if (!use_aos) {
+ LLVMValueRef ilevel0s, lod_iparts, ilevel1s = NULL;
+
+ lp_build_else(&if_ctx);
+
+ /* These parameters are the same for all quads */
+ lod_iparts = LLVMBuildExtractElement(builder, lod_ipart, index0, "");
+ ilevel0s = LLVMBuildExtractElement(builder, ilevel0, index0, "");
+ if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
+ ilevel1s = LLVMBuildExtractElement(builder, ilevel1, index0, "");
+ }
+
+ if (use_aos) {
+ /* do sampling/filtering with fixed pt arithmetic */
+ lp_build_sample_aos(&bld, unit,
+ s, t, r,
+ lod_iparts, lod_fpart,
+ ilevel0s, ilevel1s,
+ texelout);
+ }
+
+ else {
+ lp_build_sample_general(&bld, unit,
+ s, t, r,
+ lod_iparts, lod_fpart,
+ ilevel0s, ilevel1s,
+ texelout);
+ }
+ for (j = 0; j < 4; j++) {
+ LLVMBuildStore(builder, texelout[j], texels[j]);
+ }
+
+ lp_build_endif(&if_ctx);
+ }
+
+ for (j = 0; j < 4; j++) {
+ texel_out[j] = LLVMBuildLoad(builder, texels[j], "");
+ }
+ }
}
lp_build_sample_compare(&bld, r, texel_out);
@@ -1283,6 +1559,7 @@ void
lp_build_size_query_soa(struct gallivm_state *gallivm,
const struct lp_sampler_static_state *static_state,
struct lp_sampler_dynamic_state *dynamic_state,
+ struct lp_type int_type,
unsigned unit,
LLVMValueRef explicit_lod,
LLVMValueRef *sizes_out)
@@ -1311,7 +1588,9 @@ lp_build_size_query_soa(struct gallivm_state *gallivm,
return;
}
- lp_build_context_init(&bld_int_vec, gallivm, lp_type_int_vec(32));
+ assert(!int_type.floating);
+
+ lp_build_context_init(&bld_int_vec, gallivm, lp_type_int_vec(32, 128));
if (explicit_lod) {
LLVMValueRef first_level;
@@ -1345,7 +1624,7 @@ lp_build_size_query_soa(struct gallivm_state *gallivm,
size = lp_build_minify(&bld_int_vec, size, lod);
for (i=0; i < dims; i++) {
- sizes_out[i] = lp_build_extract_broadcast(gallivm, bld_int_vec.type, bld_int_vec.type,
+ sizes_out[i] = lp_build_extract_broadcast(gallivm, bld_int_vec.type, int_type,
size,
lp_build_const_int32(gallivm, i));
}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
index 5d4406812c7..641c960431d 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
@@ -40,6 +40,7 @@
#include "lp_bld_init.h"
#include "lp_bld_logic.h"
#include "lp_bld_swizzle.h"
+#include "lp_bld_pack.h"
LLVMValueRef
@@ -95,7 +96,7 @@ lp_build_broadcast_scalar(struct lp_build_context *bld,
/**
- * Combined extract and broadcast (or a mere shuffle when the two types match)
+ * Combined extract and broadcast (a mere shuffle in most cases)
*/
LLVMValueRef
lp_build_extract_broadcast(struct gallivm_state *gallivm,
@@ -132,9 +133,9 @@ lp_build_extract_broadcast(struct gallivm_state *gallivm,
}
}
else {
- if (dst_type.length == src_type.length) {
+ if (dst_type.length > 1) {
/*
- * Special shuffle of the same size.
+          * Shuffle; the result may have a different length than the source.
*/
LLVMValueRef shuffle;
@@ -142,28 +143,14 @@ lp_build_extract_broadcast(struct gallivm_state *gallivm,
LLVMVectorType(i32t, dst_type.length),
index);
res = LLVMBuildShuffleVector(gallivm->builder, vector,
- LLVMGetUndef(lp_build_vec_type(gallivm, dst_type)),
+ LLVMGetUndef(lp_build_vec_type(gallivm, src_type)),
shuffle, "");
}
else {
- LLVMValueRef scalar;
- scalar = LLVMBuildExtractElement(gallivm->builder, vector, index, "");
- if (dst_type.length == 1) {
- /*
- * Trivial extract scalar from vector.
- */
-
- res = scalar;
- }
- else {
- /*
- * General case of different sized vectors.
- */
-
- res = lp_build_broadcast(gallivm,
- lp_build_vec_type(gallivm, dst_type),
- vector);
- }
+ /*
+ * Trivial extract scalar from vector.
+ */
+ res = LLVMBuildExtractElement(gallivm->builder, vector, index, "");
}
}
@@ -290,6 +277,8 @@ lp_build_swizzle_aos(struct lp_build_context *bld,
return bld->zero;
case PIPE_SWIZZLE_ONE:
return bld->one;
+ case LP_BLD_SWIZZLE_DONTCARE:
+ return bld->undef;
default:
assert(0);
return bld->undef;
@@ -319,21 +308,26 @@ lp_build_swizzle_aos(struct lp_build_context *bld,
case PIPE_SWIZZLE_BLUE:
case PIPE_SWIZZLE_ALPHA:
shuffle = j + swizzles[i];
+ shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0);
break;
case PIPE_SWIZZLE_ZERO:
shuffle = type.length + 0;
+ shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0);
if (!aux[0]) {
aux[0] = lp_build_const_elem(bld->gallivm, type, 0.0);
}
break;
case PIPE_SWIZZLE_ONE:
shuffle = type.length + 1;
+ shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0);
if (!aux[1]) {
aux[1] = lp_build_const_elem(bld->gallivm, type, 1.0);
}
break;
+ case LP_BLD_SWIZZLE_DONTCARE:
+ shuffles[j + i] = LLVMGetUndef(i32t);
+ break;
}
- shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0);
}
}
@@ -508,3 +502,127 @@ lp_build_swizzle_soa_inplace(struct lp_build_context *bld,
lp_build_swizzle_soa(bld, unswizzled, swizzles, values);
}
+
+
+/**
+ * Transpose between AoS and SoA layouts.
+ *
+ * @param single_type_lp type of the pixel elements
+ * @param src the 4 * n pixel input vectors
+ * @param dst the 4 * n pixel output vectors
+ */
+void
+lp_build_transpose_aos(struct gallivm_state *gallivm,
+ struct lp_type single_type_lp,
+ const LLVMValueRef src[4],
+ LLVMValueRef dst[4])
+{
+ struct lp_type double_type_lp = single_type_lp;
+ LLVMTypeRef single_type;
+ LLVMTypeRef double_type;
+ LLVMValueRef t0, t1, t2, t3;
+
+ double_type_lp.length >>= 1;
+ double_type_lp.width <<= 1;
+
+ double_type = lp_build_vec_type(gallivm, double_type_lp);
+ single_type = lp_build_vec_type(gallivm, single_type_lp);
+
+ /* Interleave x, y, z, w -> xy and zw */
+ t0 = lp_build_interleave2_half(gallivm, single_type_lp, src[0], src[1], 0);
+ t1 = lp_build_interleave2_half(gallivm, single_type_lp, src[2], src[3], 0);
+ t2 = lp_build_interleave2_half(gallivm, single_type_lp, src[0], src[1], 1);
+ t3 = lp_build_interleave2_half(gallivm, single_type_lp, src[2], src[3], 1);
+
+ /* Cast to double width type for second interleave */
+ t0 = LLVMBuildBitCast(gallivm->builder, t0, double_type, "t0");
+ t1 = LLVMBuildBitCast(gallivm->builder, t1, double_type, "t1");
+ t2 = LLVMBuildBitCast(gallivm->builder, t2, double_type, "t2");
+ t3 = LLVMBuildBitCast(gallivm->builder, t3, double_type, "t3");
+
+ /* Interleave xy, zw -> xyzw */
+ dst[0] = lp_build_interleave2_half(gallivm, double_type_lp, t0, t1, 0);
+ dst[1] = lp_build_interleave2_half(gallivm, double_type_lp, t0, t1, 1);
+ dst[2] = lp_build_interleave2_half(gallivm, double_type_lp, t2, t3, 0);
+ dst[3] = lp_build_interleave2_half(gallivm, double_type_lp, t2, t3, 1);
+
+ /* Cast back to original single width type */
+ dst[0] = LLVMBuildBitCast(gallivm->builder, dst[0], single_type, "dst0");
+ dst[1] = LLVMBuildBitCast(gallivm->builder, dst[1], single_type, "dst1");
+ dst[2] = LLVMBuildBitCast(gallivm->builder, dst[2], single_type, "dst2");
+ dst[3] = LLVMBuildBitCast(gallivm->builder, dst[3], single_type, "dst3");
+}
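
For 4-wide vectors the two interleave stages reduce to a plain 4x4 transpose; a scalar reference of the end result (a sketch, independent of the actual shuffle sequence):

static void
transpose4x4(const float src[4][4], float dst[4][4])
{
   unsigned i, j;
   for (i = 0; i < 4; i++)
      for (j = 0; j < 4; j++)
         dst[i][j] = src[j][i];
}
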
+
+
+/**
+ * Pack the first element of each aos quad and
+ * pad out to the destination size,
+ * i.e. x1 _ _ _ x2 _ _ _ becomes x1 x2 _ _
+ */
+LLVMValueRef
+lp_build_pack_aos_scalars(struct gallivm_state *gallivm,
+ struct lp_type src_type,
+ struct lp_type dst_type,
+ const LLVMValueRef src)
+{
+ LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
+ LLVMValueRef undef = LLVMGetUndef(i32t);
+ LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
+ unsigned num_src = src_type.length / 4;
+ unsigned num_dst = dst_type.length;
+ unsigned i;
+
+ assert(num_src <= num_dst);
+
+ for (i = 0; i < num_src; i++) {
+ shuffles[i] = LLVMConstInt(i32t, i * 4, 0);
+ }
+ for (i = num_src; i < num_dst; i++) {
+ shuffles[i] = undef;
+ }
+
+ if (num_dst == 1) {
+ return LLVMBuildExtractElement(gallivm->builder, src, shuffles[0], "");
+ }
+ else {
+ return LLVMBuildShuffleVector(gallivm->builder, src, src,
+ LLVMConstVector(shuffles, num_dst), "");
+ }
+}
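
The shuffle picks lane 0 of each quad; e.g. for a 16 x f32 source and 4 x f32 destination the indices are {0, 4, 8, 12}. A sketch of the index pattern (hypothetical helper):

static void
pack_aos_indices(unsigned num_src, unsigned indices[])
{
   unsigned i;
   for (i = 0; i < num_src; i++)
      indices[i] = i * 4;   /* lane 0 of quad i */
}
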
+
+
+/**
+ * Unpack and broadcast packed aos values consisting of only the
+ * first value per quad, i.e. x1 x2 _ _ becomes x1 x1 x1 x1 x2 x2 x2 x2
+ */
+LLVMValueRef
+lp_build_unpack_broadcast_aos_scalars(struct gallivm_state *gallivm,
+ struct lp_type src_type,
+ struct lp_type dst_type,
+ const LLVMValueRef src)
+{
+ LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
+ LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
+ unsigned num_dst = dst_type.length;
+ unsigned num_src = dst_type.length / 4;
+ unsigned i;
+
+ assert(num_dst / 4 <= src_type.length);
+
+ for (i = 0; i < num_src; i++) {
+ shuffles[i*4] = LLVMConstInt(i32t, i, 0);
+ shuffles[i*4+1] = LLVMConstInt(i32t, i, 0);
+ shuffles[i*4+2] = LLVMConstInt(i32t, i, 0);
+ shuffles[i*4+3] = LLVMConstInt(i32t, i, 0);
+ }
+
+ if (num_src == 1) {
+ return lp_build_extract_broadcast(gallivm, src_type, dst_type,
+ src, shuffles[0]);
+ }
+ else {
+ return LLVMBuildShuffleVector(gallivm->builder, src, src,
+ LLVMConstVector(shuffles, num_dst), "");
+ }
+}
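
Here the index pattern runs the other way: each packed value is replicated across its quad, e.g. num_src == 2 yields {0, 0, 0, 0, 1, 1, 1, 1}. A sketch (hypothetical helper):

static void
unpack_broadcast_indices(unsigned num_src, unsigned indices[])
{
   unsigned i, j;
   for (i = 0; i < num_src; i++)
      for (j = 0; j < 4; j++)
         indices[i * 4 + j] = i;   /* broadcast value i across its quad */
}
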
+
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h
index c366a65103e..0bf4ce988a2 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h
@@ -44,6 +44,9 @@ struct lp_type;
struct lp_build_context;
+#define LP_BLD_SWIZZLE_DONTCARE 0xFF
+
+
LLVMValueRef
lp_build_broadcast(struct gallivm_state *gallivm,
LLVMTypeRef vec_type,
@@ -103,4 +106,25 @@ lp_build_swizzle_soa_inplace(struct lp_build_context *bld,
const unsigned char swizzles[4]);
+void
+lp_build_transpose_aos(struct gallivm_state *gallivm,
+ struct lp_type type,
+ const LLVMValueRef src[4],
+ LLVMValueRef dst[4]);
+
+
+LLVMValueRef
+lp_build_pack_aos_scalars(struct gallivm_state *gallivm,
+ struct lp_type src_type,
+ struct lp_type dst_type,
+ const LLVMValueRef src);
+
+
+LLVMValueRef
+lp_build_unpack_broadcast_aos_scalars(struct gallivm_state *gallivm,
+ struct lp_type src_type,
+ struct lp_type dst_type,
+ const LLVMValueRef src);
+
+
#endif /* !LP_BLD_SWIZZLE_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
index 4423bc5dedd..e292420a61a 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
@@ -60,6 +60,7 @@ struct tgsi_token;
struct tgsi_shader_info;
struct lp_build_mask_context;
struct gallivm_state;
+struct lp_derivatives;
enum lp_build_tex_modifier {
@@ -174,8 +175,7 @@ struct lp_build_sampler_soa
unsigned unit,
unsigned num_coords,
const LLVMValueRef *coords,
- const LLVMValueRef *ddx,
- const LLVMValueRef *ddy,
+ const struct lp_derivatives *derivs,
LLVMValueRef lod_bias, /* optional */
LLVMValueRef explicit_lod, /* optional */
LLVMValueRef *texel);
@@ -183,6 +183,7 @@ struct lp_build_sampler_soa
void
(*emit_size_query)( const struct lp_build_sampler_soa *sampler,
struct gallivm_state *gallivm,
+ struct lp_type type,
unsigned unit,
LLVMValueRef explicit_lod, /* optional */
LLVMValueRef *sizes_out);
@@ -197,8 +198,7 @@ struct lp_build_sampler_aos
unsigned target, /* TGSI_TEXTURE_* */
unsigned unit,
LLVMValueRef coords,
- LLVMValueRef ddx,
- LLVMValueRef ddy,
+ const struct lp_derivatives derivs,
enum lp_build_tex_modifier modifier);
};
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
index 24bc13a9be8..0666bba7fbd 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
@@ -56,6 +56,7 @@
#include "lp_bld_quad.h"
#include "lp_bld_tgsi.h"
#include "lp_bld_debug.h"
+#include "lp_bld_sample.h"
/**
@@ -363,6 +364,7 @@ emit_tex(struct lp_build_tgsi_aos_context *bld,
LLVMValueRef coords;
LLVMValueRef ddx;
LLVMValueRef ddy;
+ struct lp_derivatives derivs;
if (!bld->sampler) {
_debug_printf("warning: found texture instruction but no sampler generator supplied\n");
@@ -373,7 +375,7 @@ emit_tex(struct lp_build_tgsi_aos_context *bld,
coords = lp_build_emit_fetch( &bld->bld_base, inst, 0 , LP_CHAN_ALL);
- if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV) {
+ if (0 && modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV) {
ddx = lp_build_emit_fetch( &bld->bld_base, inst, 1 , LP_CHAN_ALL);
ddy = lp_build_emit_fetch( &bld->bld_base, inst, 2 , LP_CHAN_ALL);
unit = inst->Src[3].Register.Index;
@@ -383,8 +385,8 @@ emit_tex(struct lp_build_tgsi_aos_context *bld,
ddy = lp_build_ddy( &bld->bld_base.base, coords );
#else
/* TODO */
- ddx = bld->bld_base.base.one;
- ddy = bld->bld_base.base.one;
+ derivs.ddx_ddy[0] = bld->bld_base.base.one;
+ derivs.ddx_ddy[1] = bld->bld_base.base.one;
#endif
unit = inst->Src[1].Register.Index;
}
@@ -392,7 +394,7 @@ emit_tex(struct lp_build_tgsi_aos_context *bld,
return bld->sampler->emit_fetch_texel(bld->sampler,
&bld->bld_base.base,
target, unit,
- coords, ddx, ddy,
+ coords, derivs,
modifier);
}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index d9faaf20273..85a4401b534 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -62,6 +62,7 @@
#include "lp_bld_limits.h"
#include "lp_bld_debug.h"
#include "lp_bld_printf.h"
+#include "lp_bld_sample.h"
static void lp_exec_mask_init(struct lp_exec_mask *mask, struct lp_build_context *bld)
@@ -763,7 +764,7 @@ emit_fetch_temporary(
else {
LLVMValueRef temp_ptr;
if (stype != TGSI_TYPE_FLOAT && stype != TGSI_TYPE_UNTYPED) {
- LLVMTypeRef itype = LLVMPointerType(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4), 0);
+ LLVMTypeRef itype = LLVMPointerType(bld->bld_base.int_bld.vec_type, 0);
LLVMValueRef tint_ptr = lp_get_temp_ptr_soa(bld, reg->Register.Index,
swizzle);
temp_ptr = LLVMBuildBitCast(builder, tint_ptr, itype, "");
@@ -1068,7 +1069,7 @@ emit_store_chan(
switch (dtype) {
case TGSI_TYPE_UNSIGNED:
case TGSI_TYPE_SIGNED: {
- LLVMTypeRef itype = LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4);
+ LLVMTypeRef itype = bld_base->int_bld.vec_type;
LLVMTypeRef ivtype = LLVMPointerType(itype, 0);
LLVMValueRef tint_ptr = lp_get_temp_ptr_soa(bld, reg->Register.Index,
chan_index);
@@ -1141,13 +1142,14 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
LLVMValueRef *texel)
{
LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
+ struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
unsigned unit;
LLVMValueRef lod_bias, explicit_lod;
LLVMValueRef oow = NULL;
LLVMValueRef coords[3];
- LLVMValueRef ddx[3];
- LLVMValueRef ddy[3];
+ struct lp_derivatives derivs;
unsigned num_coords;
+ unsigned dims;
unsigned i;
if (!bld->sampler) {
@@ -1158,26 +1160,42 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
return;
}
+ derivs.ddx_ddy[0] = bld->bld_base.base.undef;
+ derivs.ddx_ddy[1] = bld->bld_base.base.undef;
+
switch (inst->Texture.Texture) {
case TGSI_TEXTURE_1D:
num_coords = 1;
+ dims = 1;
break;
case TGSI_TEXTURE_1D_ARRAY:
+ num_coords = 2;
+ dims = 1;
+ break;
case TGSI_TEXTURE_2D:
case TGSI_TEXTURE_RECT:
num_coords = 2;
+ dims = 2;
break;
case TGSI_TEXTURE_SHADOW1D:
case TGSI_TEXTURE_SHADOW1D_ARRAY:
+ num_coords = 3;
+ dims = 1;
+ break;
case TGSI_TEXTURE_SHADOW2D:
case TGSI_TEXTURE_SHADOWRECT:
case TGSI_TEXTURE_2D_ARRAY:
- case TGSI_TEXTURE_3D:
case TGSI_TEXTURE_CUBE:
num_coords = 3;
+ dims = 2;
+ break;
+ case TGSI_TEXTURE_3D:
+ num_coords = 3;
+ dims = 3;
break;
case TGSI_TEXTURE_SHADOW2D_ARRAY:
num_coords = 4;
+ dims = 2;
break;
default:
assert(0);
@@ -1212,31 +1230,66 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
}
if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV) {
- LLVMValueRef index0 = lp_build_const_int32(bld->bld_base.base.gallivm, 0);
- for (i = 0; i < num_coords; i++) {
- LLVMValueRef src1 = lp_build_emit_fetch( &bld->bld_base, inst, 1, i );
- LLVMValueRef src2 = lp_build_emit_fetch( &bld->bld_base, inst, 2, i );
- ddx[i] = LLVMBuildExtractElement(builder, src1, index0, "");
- ddy[i] = LLVMBuildExtractElement(builder, src2, index0, "");
+ LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
+ LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
+ LLVMValueRef ddxdyonec[3];
+ unsigned length = bld->bld_base.base.type.length;
+ unsigned num_quads = length / 4;
+ unsigned dim;
+ unsigned quad;
+
+ for (dim = 0; dim < dims; ++dim) {
+ LLVMValueRef srcx = lp_build_emit_fetch( &bld->bld_base, inst, 1, dim );
+ LLVMValueRef srcy = lp_build_emit_fetch( &bld->bld_base, inst, 2, dim );
+ for (quad = 0; quad < num_quads; ++quad) {
+ unsigned s1 = 4*quad;
+ unsigned s2 = 4*quad + length;
+ shuffles[4*quad + 0] = lp_build_const_int32(gallivm, s1);
+ shuffles[4*quad + 1] = lp_build_const_int32(gallivm, s2);
+ shuffles[4*quad + 2] = i32undef;
+ shuffles[4*quad + 3] = i32undef;
+ }
+ ddxdyonec[dim] = LLVMBuildShuffleVector(builder, srcx, srcy,
+ LLVMConstVector(shuffles, length), "");
+ }
+ if (dims == 1) {
+ derivs.ddx_ddy[0] = ddxdyonec[0];
+ }
+ else if (dims >= 2) {
+ for (quad = 0; quad < num_quads; ++quad) {
+ unsigned s1 = 4*quad;
+ unsigned s2 = 4*quad + length;
+ shuffles[4*quad + 0] = lp_build_const_int32(gallivm, s1);
+ shuffles[4*quad + 1] = lp_build_const_int32(gallivm, s1 + 1);
+ shuffles[4*quad + 2] = lp_build_const_int32(gallivm, s2);
+ shuffles[4*quad + 3] = lp_build_const_int32(gallivm, s2 + 1);
+ }
+ derivs.ddx_ddy[0] = LLVMBuildShuffleVector(builder, ddxdyonec[0], ddxdyonec[1],
+ LLVMConstVector(shuffles, length), "");
+ if (dims == 3) {
+ derivs.ddx_ddy[1] = ddxdyonec[2];
+ }
}
unit = inst->Src[3].Register.Index;
} else {
- for (i = 0; i < num_coords; i++) {
- ddx[i] = lp_build_scalar_ddx( &bld->bld_base.base, coords[i] );
- ddy[i] = lp_build_scalar_ddy( &bld->bld_base.base, coords[i] );
+ if (dims == 1) {
+ derivs.ddx_ddy[0] = lp_build_packed_ddx_ddy_onecoord(&bld->bld_base.base, coords[0]);
+ }
+ else if (dims >= 2) {
+ derivs.ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(&bld->bld_base.base,
+ coords[0], coords[1]);
+ if (dims == 3) {
+ derivs.ddx_ddy[1] = lp_build_packed_ddx_ddy_onecoord(&bld->bld_base.base, coords[2]);
+ }
}
unit = inst->Src[1].Register.Index;
}
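
Reading the shuffles above, and assuming little-endian lane order, each quad ends up with its packed derivatives laid out as sketched below; this is an illustration of the explicit-derivative path, presumably matching what lp_build_packed_ddx_ddy_twocoord produces for the implicit one.

/* per-quad lane layout (sketch):
 * ddx_ddy[0]: [ ds/dx | ds/dy | dt/dx | dt/dy ] per quad
 * ddx_ddy[1]: [ dr/dx | dr/dy |   _   |   _   ] per quad (3D only)
 */
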
- for (i = num_coords; i < 3; i++) {
- ddx[i] = LLVMGetUndef(bld->bld_base.base.elem_type);
- ddy[i] = LLVMGetUndef(bld->bld_base.base.elem_type);
- }
bld->sampler->emit_fetch_texel(bld->sampler,
bld->bld_base.base.gallivm,
bld->bld_base.base.type,
unit, num_coords, coords,
- ddx, ddy,
+ &derivs,
lod_bias, explicit_lod,
texel);
}
@@ -1310,6 +1363,7 @@ emit_txq( struct lp_build_tgsi_soa_context *bld,
bld->sampler->emit_size_query(bld->sampler,
bld->bld_base.base.gallivm,
+ bld->bld_base.int_bld.type,
inst->Src[1].Register.Index,
explicit_lod,
sizes_out);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_type.c b/src/gallium/auxiliary/gallivm/lp_bld_type.c
index 413e69bedac..6c3aa38bfb1 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_type.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_type.c
@@ -38,6 +38,9 @@ lp_build_elem_type(struct gallivm_state *gallivm, struct lp_type type)
{
if (type.floating) {
switch(type.width) {
+ case 16:
+ return LLVMIntTypeInContext(gallivm->context, 16);
+ break;
case 32:
return LLVMFloatTypeInContext(gallivm->context);
break;
@@ -85,6 +88,10 @@ lp_check_elem_type(struct lp_type type, LLVMTypeRef elem_type)
if (type.floating) {
switch(type.width) {
+ case 16:
+ if(elem_kind != LLVMIntegerTypeKind)
+ return FALSE;
+ break;
case 32:
if(elem_kind != LLVMFloatTypeKind)
return FALSE;
@@ -168,27 +175,6 @@ lp_build_int_vec_type(struct gallivm_state *gallivm, struct lp_type type)
/**
- * Build int32[4] vector type
- */
-LLVMTypeRef
-lp_build_int32_vec4_type(struct gallivm_state *gallivm)
-{
- struct lp_type t;
- LLVMTypeRef type;
-
- memset(&t, 0, sizeof(t));
- t.floating = FALSE; /* floating point values */
- t.sign = TRUE; /* values are signed */
- t.norm = FALSE; /* values are not limited to [0,1] or [-1,1] */
- t.width = 32; /* 32-bit int */
- t.length = 4; /* 4 elements per vector */
-
- type = lp_build_int_elem_type(gallivm, t);
- return LLVMVectorType(type, t.length);
-}
-
-
-/**
* Create element of vector type
*/
struct lp_type
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_type.h b/src/gallium/auxiliary/gallivm/lp_bld_type.h
index f11a190e7cc..75310e05f3e 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_type.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_type.h
@@ -40,21 +40,35 @@
#include "pipe/p_compiler.h"
#include "gallivm/lp_bld.h"
+/**
+ * Native SIMD architecture width available at runtime.
+ *
+ * Using this width should give the best performance,
+ * and it determines the necessary alignment of vector variables.
+ */
+extern unsigned lp_native_vector_width;
+/**
+ * Maximum supported vector width (not necessarily available at run-time).
+ *
+ * Should only be used when lp_native_vector_width isn't available,
+ * i.e. sizing/alignment of non-malloced variables.
+ */
+#define LP_MAX_VECTOR_WIDTH 256
/**
- * Native SIMD register width.
+ * Minimum vector alignment for static variable alignment
*
- * 128 for all architectures we care about.
+ * It should always be a constant equal to LP_MAX_VECTOR_WIDTH/8;
+ * using an expression here is non-portable.
*/
-#define LP_NATIVE_VECTOR_WIDTH 128
+#define LP_MIN_VECTOR_ALIGN 32
/**
* Several functions can only cope with vectors of length up to this value.
* You may need to increase that value if you want to represent bigger vectors.
*/
-#define LP_MAX_VECTOR_LENGTH 16
-
+#define LP_MAX_VECTOR_LENGTH (LP_MAX_VECTOR_WIDTH/8)
/**
* The LLVM type system can't conveniently express all the things we care about
@@ -151,6 +165,13 @@ struct lp_build_context
};
+static INLINE unsigned
+lp_type_width(struct lp_type type)
+{
+ return type.width * type.length;
+}
+
+
/** Create scalar float type */
static INLINE struct lp_type
lp_type_float(unsigned width)
@@ -169,7 +190,7 @@ lp_type_float(unsigned width)
/** Create vector of float type */
static INLINE struct lp_type
-lp_type_float_vec(unsigned width)
+lp_type_float_vec(unsigned width, unsigned total_width)
{
struct lp_type res_type;
@@ -177,7 +198,7 @@ lp_type_float_vec(unsigned width)
res_type.floating = TRUE;
res_type.sign = TRUE;
res_type.width = width;
- res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+ res_type.length = total_width / width;
return res_type;
}
@@ -200,14 +221,14 @@ lp_type_int(unsigned width)
/** Create vector int type */
static INLINE struct lp_type
-lp_type_int_vec(unsigned width)
+lp_type_int_vec(unsigned width, unsigned total_width)
{
struct lp_type res_type;
memset(&res_type, 0, sizeof res_type);
res_type.sign = TRUE;
res_type.width = width;
- res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+ res_type.length = total_width / width;
return res_type;
}
@@ -229,34 +250,34 @@ lp_type_uint(unsigned width)
/** Create vector uint type */
static INLINE struct lp_type
-lp_type_uint_vec(unsigned width)
+lp_type_uint_vec(unsigned width, unsigned total_width)
{
struct lp_type res_type;
memset(&res_type, 0, sizeof res_type);
res_type.width = width;
- res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+ res_type.length = total_width / width;
return res_type;
}
static INLINE struct lp_type
-lp_type_unorm(unsigned width)
+lp_type_unorm(unsigned width, unsigned total_width)
{
struct lp_type res_type;
memset(&res_type, 0, sizeof res_type);
res_type.norm = TRUE;
res_type.width = width;
- res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+ res_type.length = total_width / width;
return res_type;
}
static INLINE struct lp_type
-lp_type_fixed(unsigned width)
+lp_type_fixed(unsigned width, unsigned total_width)
{
struct lp_type res_type;
@@ -264,21 +285,21 @@ lp_type_fixed(unsigned width)
res_type.sign = TRUE;
res_type.fixed = TRUE;
res_type.width = width;
- res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+ res_type.length = total_width / width;
return res_type;
}
static INLINE struct lp_type
-lp_type_ufixed(unsigned width)
+lp_type_ufixed(unsigned width, unsigned total_width)
{
struct lp_type res_type;
memset(&res_type, 0, sizeof res_type);
res_type.fixed = TRUE;
res_type.width = width;
- res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+ res_type.length = total_width / width;
return res_type;
}
@@ -312,10 +333,6 @@ LLVMTypeRef
lp_build_int_vec_type(struct gallivm_state *gallivm, struct lp_type type);
-LLVMTypeRef
-lp_build_int32_vec4_type(struct gallivm_state *gallivm);
-
-
static INLINE struct lp_type
lp_float32_vec4_type(void)
{