Diffstat (limited to 'src/gallium/auxiliary/gallivm/lp_bld_arit.c')
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_arit.c | 545
1 file changed, 446 insertions(+), 99 deletions(-)
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index 9fc57629822..d226dab5b81 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -75,9 +75,9 @@ lp_build_min_simple(struct lp_build_context *bld,
LLVMValueRef a,
LLVMValueRef b)
{
- LLVMBuilderRef builder = bld->gallivm->builder;
const struct lp_type type = bld->type;
const char *intrinsic = NULL;
+ unsigned intr_size;
LLVMValueRef cond;
assert(lp_check_value(type, a));
@@ -85,31 +85,71 @@ lp_build_min_simple(struct lp_build_context *bld,
/* TODO: optimize the constant case */
- if(type.width * type.length == 128) {
- if(type.floating) {
- if(type.width == 32 && util_cpu_caps.has_sse)
+ if (type.floating && util_cpu_caps.has_sse) {
+ if (type.width == 32) {
+ if (type.length == 1) {
+ intrinsic = "llvm.x86.sse.min.ss";
+ intr_size = 128;
+ }
+ else if (type.length <= 4 || !util_cpu_caps.has_avx) {
intrinsic = "llvm.x86.sse.min.ps";
- if(type.width == 64 && util_cpu_caps.has_sse2)
+ intr_size = 128;
+ }
+ else {
+ intrinsic = "llvm.x86.avx.min.ps.256";
+ intr_size = 256;
+ }
+ }
+ if (type.width == 64 && util_cpu_caps.has_sse2) {
+ if (type.length == 1) {
+ intrinsic = "llvm.x86.sse2.min.sd";
+ intr_size = 128;
+ }
+ else if (type.length == 2 || !util_cpu_caps.has_avx) {
intrinsic = "llvm.x86.sse2.min.pd";
+ intr_size = 128;
+ }
+ else {
+ intrinsic = "llvm.x86.avx.min.pd.256";
+ intr_size = 256;
+ }
}
- else {
- if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
- intrinsic = "llvm.x86.sse2.pminu.b";
- if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
+ }
+ else if (util_cpu_caps.has_sse2 && type.length >= 2) {
+ intr_size = 128;
+ if ((type.width == 8 || type.width == 16) &&
+ (type.width * type.length <= 64) &&
+ (gallivm_debug & GALLIVM_DEBUG_PERF)) {
+ debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
+ __FUNCTION__);
+ }
+ if (type.width == 8 && !type.sign) {
+ intrinsic = "llvm.x86.sse2.pminu.b";
+ }
+ else if (type.width == 16 && type.sign) {
+ intrinsic = "llvm.x86.sse2.pmins.w";
+ }
+ if (util_cpu_caps.has_sse4_1) {
+ if (type.width == 8 && type.sign) {
intrinsic = "llvm.x86.sse41.pminsb";
- if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
+ }
+ if (type.width == 16 && !type.sign) {
intrinsic = "llvm.x86.sse41.pminuw";
- if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
- intrinsic = "llvm.x86.sse2.pmins.w";
- if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
+ }
+ if (type.width == 32 && !type.sign) {
intrinsic = "llvm.x86.sse41.pminud";
- if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
+ }
+ if (type.width == 32 && type.sign) {
intrinsic = "llvm.x86.sse41.pminsd";
+ }
}
}
- if(intrinsic)
- return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
+ if(intrinsic) {
+ return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
+ type,
+ intr_size, a, b);
+ }
cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
return lp_build_select(bld, cond, a, b);
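When no matching intrinsic is found, the tail of lp_build_min_simple falls back to a compare plus per-lane select. A minimal scalar sketch of that fallback (illustrative C, not gallivm code; the helper name is made up):

   /* Lane-wise equivalent of the cmp/select fallback:
    * lp_build_cmp(PIPE_FUNC_LESS) yields a per-lane mask and
    * lp_build_select picks a or b for each lane. */
   static float min_fallback_sketch(float a, float b)
   {
      return a < b ? a : b;
   }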
@@ -125,9 +165,9 @@ lp_build_max_simple(struct lp_build_context *bld,
LLVMValueRef a,
LLVMValueRef b)
{
- LLVMBuilderRef builder = bld->gallivm->builder;
const struct lp_type type = bld->type;
const char *intrinsic = NULL;
+ unsigned intr_size;
LLVMValueRef cond;
assert(lp_check_value(type, a));
@@ -135,31 +175,72 @@ lp_build_max_simple(struct lp_build_context *bld,
/* TODO: optimize the constant case */
- if(type.width * type.length == 128) {
- if(type.floating) {
- if(type.width == 32 && util_cpu_caps.has_sse)
+ if (type.floating && util_cpu_caps.has_sse) {
+ if (type.width == 32) {
+ if (type.length == 1) {
+ intrinsic = "llvm.x86.sse.max.ss";
+ intr_size = 128;
+ }
+ else if (type.length <= 4 || !util_cpu_caps.has_avx) {
intrinsic = "llvm.x86.sse.max.ps";
- if(type.width == 64 && util_cpu_caps.has_sse2)
+ intr_size = 128;
+ }
+ else {
+ intrinsic = "llvm.x86.avx.max.ps.256";
+ intr_size = 256;
+ }
+ }
+ if (type.width == 64 && util_cpu_caps.has_sse2) {
+ if (type.length == 1) {
+ intrinsic = "llvm.x86.sse2.max.sd";
+ intr_size = 128;
+ }
+ else if (type.length == 2 || !util_cpu_caps.has_avx) {
intrinsic = "llvm.x86.sse2.max.pd";
+ intr_size = 128;
+ }
+ else {
+ intrinsic = "llvm.x86.avx.max.pd.256";
+ intr_size = 256;
+ }
}
- else {
- if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
- intrinsic = "llvm.x86.sse2.pmaxu.b";
- if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
+ }
+ else if (util_cpu_caps.has_sse2 && type.length >= 2) {
+ intr_size = 128;
+ if ((type.width == 8 || type.width == 16) &&
+ (type.width * type.length <= 64) &&
+ (gallivm_debug & GALLIVM_DEBUG_PERF)) {
+ debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
+ __FUNCTION__);
+ }
+ if (type.width == 8 && !type.sign) {
+ intrinsic = "llvm.x86.sse2.pmaxu.b";
+ intr_size = 128;
+ }
+ else if (type.width == 16 && type.sign) {
+ intrinsic = "llvm.x86.sse2.pmaxs.w";
+ }
+ if (util_cpu_caps.has_sse4_1) {
+ if (type.width == 8 && type.sign) {
intrinsic = "llvm.x86.sse41.pmaxsb";
- if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
+ }
+ if (type.width == 16 && !type.sign) {
intrinsic = "llvm.x86.sse41.pmaxuw";
- if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
- intrinsic = "llvm.x86.sse2.pmaxs.w";
- if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
+ }
+ if (type.width == 32 && !type.sign) {
intrinsic = "llvm.x86.sse41.pmaxud";
- if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
+ }
+ if (type.width == 32 && type.sign) {
intrinsic = "llvm.x86.sse41.pmaxsd";
+ }
}
}
- if(intrinsic)
- return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
+ if(intrinsic) {
+ return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
+ type,
+ intr_size, a, b);
+ }
cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
return lp_build_select(bld, cond, a, b);
@@ -265,15 +346,20 @@ lp_build_add(struct lp_build_context *bld,
}
-/** Return the scalar sum of the elements of a */
+/** Return the scalar sum of the elements of a.
+ * Callers should avoid this operation whenever possible.
+ */
LLVMValueRef
-lp_build_sum_vector(struct lp_build_context *bld,
- LLVMValueRef a)
+lp_build_horizontal_add(struct lp_build_context *bld,
+ LLVMValueRef a)
{
LLVMBuilderRef builder = bld->gallivm->builder;
const struct lp_type type = bld->type;
LLVMValueRef index, res;
- unsigned i;
+ unsigned i, length;
+ LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
+ LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
+ LLVMValueRef vecres, elem2;
assert(lp_check_value(type, a));
@@ -283,26 +369,191 @@ lp_build_sum_vector(struct lp_build_context *bld,
assert(!bld->type.norm);
- index = lp_build_const_int32(bld->gallivm, 0);
- res = LLVMBuildExtractElement(builder, a, index, "");
+ /*
+ * For byte vectors we could do much better with psadbw;
+ * we use repeated shuffles/adds here instead. Note that with
+ * multiple vectors this can be done more efficiently, as outlined
+ * in the Intel optimization manual.
+ * Note: could cause data rearrangement if used with smaller element
+ * sizes.
+ */
- for (i = 1; i < type.length; i++) {
- index = lp_build_const_int32(bld->gallivm, i);
- if (type.floating)
- res = LLVMBuildFAdd(builder, res,
- LLVMBuildExtractElement(builder,
- a, index, ""),
- "");
- else
- res = LLVMBuildAdd(builder, res,
- LLVMBuildExtractElement(builder,
- a, index, ""),
- "");
+ vecres = a;
+ length = type.length / 2;
+ while (length > 1) {
+ LLVMValueRef vec1, vec2;
+ for (i = 0; i < length; i++) {
+ shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
+ shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
+ }
+ vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
+ LLVMConstVector(shuffles1, length), "");
+ vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
+ LLVMConstVector(shuffles2, length), "");
+ if (type.floating) {
+ vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
+ }
+ else {
+ vecres = LLVMBuildAdd(builder, vec1, vec2, "");
+ }
+ length = length >> 1;
}
+ /* always have vector of size 2 here */
+ assert(length == 1);
+
+ index = lp_build_const_int32(bld->gallivm, 0);
+ res = LLVMBuildExtractElement(builder, vecres, index, "");
+ index = lp_build_const_int32(bld->gallivm, 1);
+ elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
+
+ if (type.floating)
+ res = LLVMBuildFAdd(builder, res, elem2, "");
+ else
+ res = LLVMBuildAdd(builder, res, elem2, "");
+
return res;
}
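The loop above is a log2(N) tree reduction: each pass splits the running vector in half with two shuffles and adds the halves. A minimal scalar sketch of the same dataflow (plain C, assuming a power-of-two length; the helper name is made up):

   /* Each step adds the upper half onto the lower half, halving the
    * active length; the final add of the two remaining elements
    * corresponds to the extract/add tail of the function above. */
   static float horizontal_add_sketch(float v[], unsigned length)
   {
      unsigned half, i;
      for (half = length / 2; half >= 1; half /= 2) {
         for (i = 0; i < half; i++)
            v[i] += v[i + half];
      }
      return v[0];
   }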
+/**
+ * Return the horizontal sums of 4 float vectors as a float4 vector.
+ * This uses the technique outlined in the Intel Optimization Manual.
+ */
+static LLVMValueRef
+lp_build_horizontal_add4x4f(struct lp_build_context *bld,
+ LLVMValueRef src[4])
+{
+ struct gallivm_state *gallivm = bld->gallivm;
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef shuffles[4];
+ LLVMValueRef tmp[4];
+ LLVMValueRef sumtmp[2], shuftmp[2];
+
+ /* lower half of regs */
+ shuffles[0] = lp_build_const_int32(gallivm, 0);
+ shuffles[1] = lp_build_const_int32(gallivm, 1);
+ shuffles[2] = lp_build_const_int32(gallivm, 4);
+ shuffles[3] = lp_build_const_int32(gallivm, 5);
+ tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
+ LLVMConstVector(shuffles, 4), "");
+ tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
+ LLVMConstVector(shuffles, 4), "");
+
+ /* upper half of regs */
+ shuffles[0] = lp_build_const_int32(gallivm, 2);
+ shuffles[1] = lp_build_const_int32(gallivm, 3);
+ shuffles[2] = lp_build_const_int32(gallivm, 6);
+ shuffles[3] = lp_build_const_int32(gallivm, 7);
+ tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
+ LLVMConstVector(shuffles, 4), "");
+ tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
+ LLVMConstVector(shuffles, 4), "");
+
+ sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
+ sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
+
+ shuffles[0] = lp_build_const_int32(gallivm, 0);
+ shuffles[1] = lp_build_const_int32(gallivm, 2);
+ shuffles[2] = lp_build_const_int32(gallivm, 4);
+ shuffles[3] = lp_build_const_int32(gallivm, 6);
+ shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
+ LLVMConstVector(shuffles, 4), "");
+
+ shuffles[0] = lp_build_const_int32(gallivm, 1);
+ shuffles[1] = lp_build_const_int32(gallivm, 3);
+ shuffles[2] = lp_build_const_int32(gallivm, 5);
+ shuffles[3] = lp_build_const_int32(gallivm, 7);
+ shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
+ LLVMConstVector(shuffles, 4), "");
+
+ return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
+}
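To make the shuffle pattern concrete, here is the element flow for inputs a, b, c, d (src[0..3], lanes a0..a3 and so on), traced from the code above:

   /* tmp[0] = { a0, a1, b0, b1 }      tmp[1] = { a2, a3, b2, b3 }
    * tmp[2] = { c0, c1, d0, d1 }      tmp[3] = { c2, c3, d2, d3 }
    * sumtmp[0] = { a0+a2, a1+a3, b0+b2, b1+b3 }
    * sumtmp[1] = { c0+c2, c1+c3, d0+d2, d1+d3 }
    * shuftmp[0] = { a0+a2, b0+b2, c0+c2, d0+d2 }
    * shuftmp[1] = { a1+a3, b1+b3, c1+c3, d1+d3 }
    * result    = { sum(a), sum(b), sum(c), sum(d) }
    */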
+
+
+/*
+ * Partially horizontally add 2-4 float vectors with length nx4,
+ * i.e. only four adjacent values in each vector will be added,
+ * assuming values are really grouped in 4 which also determines
+ * output order.
+ *
+ * Return a vector of the same length as the initial vectors,
+ * with the excess elements (if any) being undefined.
+ * The element order is independent of number of input vectors.
+ * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
+ * the output order thus will be
+ * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
+ */
+LLVMValueRef
+lp_build_hadd_partial4(struct lp_build_context *bld,
+ LLVMValueRef vectors[],
+ unsigned num_vecs)
+{
+ struct gallivm_state *gallivm = bld->gallivm;
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef ret_vec;
+ LLVMValueRef tmp[4];
+ const char *intrinsic = NULL;
+
+ assert(num_vecs >= 2 && num_vecs <= 4);
+ assert(bld->type.floating);
+
+ /* only use this with at least 2 vectors, as it is sort of expensive
+ * (depending on cpu) and we always need two horizontal adds anyway,
+ * so a shuffle/add approach might be better.
+ */
+
+ tmp[0] = vectors[0];
+ tmp[1] = vectors[1];
+
+ tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
+ tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
+
+ if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
+ bld->type.length == 4) {
+ intrinsic = "llvm.x86.sse3.hadd.ps";
+ }
+ else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
+ bld->type.length == 8) {
+ intrinsic = "llvm.x86.avx.hadd.ps.256";
+ }
+ if (intrinsic) {
+ tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
+ lp_build_vec_type(gallivm, bld->type),
+ tmp[0], tmp[1]);
+ if (num_vecs > 2) {
+ tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
+ lp_build_vec_type(gallivm, bld->type),
+ tmp[2], tmp[3]);
+ }
+ else {
+ tmp[1] = tmp[0];
+ }
+ return lp_build_intrinsic_binary(builder, intrinsic,
+ lp_build_vec_type(gallivm, bld->type),
+ tmp[0], tmp[1]);
+ }
+
+ if (bld->type.length == 4) {
+ ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
+ }
+ else {
+ LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
+ unsigned j;
+ unsigned num_iter = bld->type.length / 4;
+ struct lp_type parttype = bld->type;
+ parttype.length = 4;
+ for (j = 0; j < num_iter; j++) {
+ LLVMValueRef partsrc[4];
+ unsigned i;
+ for (i = 0; i < 4; i++) {
+ partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
+ }
+ partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
+ }
+ ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
+ }
+ return ret_vec;
+}
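A hedged caller-side sketch (x, y, z are assumed to be float4 LLVMValueRefs and bld a matching lp_build_context; none of these names come from the patch):

   /* Sum three float4 vectors in one call; per the comment above,
    * the result lanes are { sum(x), sum(y), sum(z), undef }. */
   LLVMValueRef vecs[3] = { x, y, z };
   LLVMValueRef sums = lp_build_hadd_partial4(&bld, vecs, 3);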
/**
* Generate a - b
@@ -553,7 +804,7 @@ lp_build_mul_imm(struct lp_build_context *bld,
if(bld->type.floating) {
#if 0
/*
- * Power of two multiplication by directly manipulating the mantissa.
+ * Power of two multiplication by directly manipulating the exponent.
*
* XXX: This might not be always faster, it will introduce a small error
* for multiplication by zero, and it will produce wrong results
@@ -612,7 +863,8 @@ lp_build_div(struct lp_build_context *bld,
return LLVMConstUDiv(a, b);
}
- if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4 &&
+ if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
+ (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
type.floating)
return lp_build_mul(bld, a, lp_build_rcp(bld, b));
@@ -871,6 +1123,12 @@ lp_build_abs(struct lp_build_context *bld,
return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
}
}
+ else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
+ (gallivm_debug & GALLIVM_DEBUG_PERF) &&
+ (type.width == 8 || type.width == 16 || type.width == 32)) {
+ debug_printf("%s: inefficient code, should split vectors manually\n",
+ __FUNCTION__);
+ }
return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
}
@@ -934,6 +1192,7 @@ lp_build_sgn(struct lp_build_context *bld,
else
{
/* signed int/norm/fixed point */
+ /* could use psign with ssse3 and appropriate vectors here */
LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
res = lp_build_select(bld, cond, bld->one, minus_one);
@@ -1000,7 +1259,16 @@ lp_build_int_to_float(struct lp_build_context *bld,
return LLVMBuildSIToFP(builder, a, vec_type, "");
}
+static boolean
+sse41_rounding_available(const struct lp_type type)
+{
+ if ((util_cpu_caps.has_sse4_1 &&
+ (type.length == 1 || type.width*type.length == 128)) ||
+ (util_cpu_caps.has_avx && type.width*type.length == 256))
+ return TRUE;
+ return FALSE;
+}
enum lp_build_round_sse41_mode
{
@@ -1065,18 +1333,34 @@ lp_build_round_sse41(struct lp_build_context *bld,
res = LLVMBuildExtractElement(builder, res, index0, "");
}
else {
- assert(type.width*type.length == 128);
-
- switch(type.width) {
- case 32:
- intrinsic = "llvm.x86.sse41.round.ps";
- break;
- case 64:
- intrinsic = "llvm.x86.sse41.round.pd";
- break;
- default:
- assert(0);
- return bld->undef;
+ if (type.width * type.length == 128) {
+ switch(type.width) {
+ case 32:
+ intrinsic = "llvm.x86.sse41.round.ps";
+ break;
+ case 64:
+ intrinsic = "llvm.x86.sse41.round.pd";
+ break;
+ default:
+ assert(0);
+ return bld->undef;
+ }
+ }
+ else {
+ assert(type.width * type.length == 256);
+ assert(util_cpu_caps.has_avx);
+
+ switch(type.width) {
+ case 32:
+ intrinsic = "llvm.x86.avx.round.ps.256";
+ break;
+ case 64:
+ intrinsic = "llvm.x86.avx.round.pd.256";
+ break;
+ default:
+ assert(0);
+ return bld->undef;
+ }
}
res = lp_build_intrinsic_binary(builder, intrinsic,
@@ -1125,10 +1409,15 @@ lp_build_iround_nearest_sse2(struct lp_build_context *bld,
ret_type, arg);
}
else {
- assert(type.width*type.length == 128);
-
- intrinsic = "llvm.x86.sse2.cvtps2dq";
+ if (type.width * type.length == 128) {
+ intrinsic = "llvm.x86.sse2.cvtps2dq";
+ }
+ else {
+ assert(type.width*type.length == 256);
+ assert(util_cpu_caps.has_avx);
+ intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
+ }
res = lp_build_intrinsic_unary(builder, intrinsic,
ret_type, a);
}
@@ -1152,8 +1441,7 @@ lp_build_trunc(struct lp_build_context *bld,
assert(type.floating);
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
+ if (sse41_rounding_available(type)) {
return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
}
else {
@@ -1183,8 +1471,7 @@ lp_build_round(struct lp_build_context *bld,
assert(type.floating);
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
+ if (sse41_rounding_available(type)) {
return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
}
else {
@@ -1212,8 +1499,7 @@ lp_build_floor(struct lp_build_context *bld,
assert(type.floating);
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
+ if (sse41_rounding_available(type)) {
return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
}
else {
@@ -1241,8 +1527,7 @@ lp_build_ceil(struct lp_build_context *bld,
assert(type.floating);
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
+ if (sse41_rounding_available(type)) {
return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
}
else {
@@ -1269,6 +1554,34 @@ lp_build_fract(struct lp_build_context *bld,
/**
+ * Prevent returning a fractional part of 1.0 for very small negative values of
+ * 'a' by clamping against 0.99999(9).
+ */
+static inline LLVMValueRef
+clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
+{
+ LLVMValueRef max;
+
+ /* this is the largest number smaller than 1.0 representable as float */
+ max = lp_build_const_vec(bld->gallivm, bld->type,
+ 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
+ return lp_build_min(bld, fract, max);
+}
+
+
+/**
+ * Same as lp_build_fract, but guarantees that the result is always smaller
+ * than one.
+ */
+LLVMValueRef
+lp_build_fract_safe(struct lp_build_context *bld,
+ LLVMValueRef a)
+{
+ return clamp_fract(bld, lp_build_fract(bld, a));
+}
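As a sanity check of the clamp constant (my arithmetic, not from the patch): for 32-bit floats lp_mantissa returns 23, so the constant is 1.0 - 1.0/2^24, which is exactly the largest float below 1.0. A minimal standalone check:

   #include <assert.h>
   #include <math.h>

   /* 1.0f - 2^-24 is exactly representable and equals
    * nextafterf(1.0f, 0.0f), the largest float below 1.0. */
   static void check_clamp_fract_const(void)
   {
      float max = 1.0f - 1.0f / (1 << 24);
      assert(max < 1.0f);
      assert(max == nextafterf(1.0f, 0.0f));
   }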
+
+
+/**
* Return the integer part of a float (vector) value (== round toward zero).
* The returned value is an integer (vector).
* Ex: itrunc(-1.5) = -1
@@ -1307,12 +1620,12 @@ lp_build_iround(struct lp_build_context *bld,
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse2 &&
- ((type.width == 32) && (type.length == 1 || type.length == 4))) {
+ if ((util_cpu_caps.has_sse2 &&
+ ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
+ (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
return lp_build_iround_nearest_sse2(bld, a);
}
- else if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
+ if (sse41_rounding_available(type)) {
res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
}
else {
@@ -1362,14 +1675,12 @@ lp_build_ifloor(struct lp_build_context *bld,
assert(type.floating);
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
- res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
- }
- else {
- res = a;
-
- if (type.sign) {
+ res = a;
+ if (type.sign) {
+ if (sse41_rounding_available(type)) {
+ res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
+ }
+ else {
/* Take the sign bit and add it to 1 constant */
LLVMTypeRef vec_type = bld->vec_type;
unsigned mantissa = lp_mantissa(type);
@@ -1423,8 +1734,7 @@ lp_build_iceil(struct lp_build_context *bld,
assert(type.floating);
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
+ if (sse41_rounding_available(type)) {
res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
}
else {
@@ -1470,7 +1780,7 @@ lp_build_iceil(struct lp_build_context *bld,
* Combined ifloor() & fract().
*
* Preferred to calling the functions separately, as it will ensure that the
- * stratergy (floor() vs ifloor()) that results in less redundant work is used.
+ * strategy (floor() vs ifloor()) that results in less redundant work is used.
*/
void
lp_build_ifloor_fract(struct lp_build_context *bld,
@@ -1485,8 +1795,7 @@ lp_build_ifloor_fract(struct lp_build_context *bld,
assert(type.floating);
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
+ if (sse41_rounding_available(type)) {
/*
* floor() is easier.
*/
@@ -1507,6 +1816,21 @@ lp_build_ifloor_fract(struct lp_build_context *bld,
}
+/**
+ * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
+ * always smaller than one.
+ */
+void
+lp_build_ifloor_fract_safe(struct lp_build_context *bld,
+ LLVMValueRef a,
+ LLVMValueRef *out_ipart,
+ LLVMValueRef *out_fpart)
+{
+ lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
+ *out_fpart = clamp_fract(bld, *out_fpart);
+}
+
+
LLVMValueRef
lp_build_sqrt(struct lp_build_context *bld,
LLVMValueRef a)
@@ -1519,10 +1843,14 @@ lp_build_sqrt(struct lp_build_context *bld,
assert(lp_check_value(type, a));
/* TODO: optimize the constant case */
- /* TODO: optimize the constant case */
assert(type.floating);
- util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
+ if (type.length == 1) {
+ util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.f%u", type.width);
+ }
+ else {
+ util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
+ }
return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
}
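For reference, the snprintf calls above produce LLVM's overloaded intrinsic names; for example, with type.width == 32:

   /* type.length == 1  ->  "llvm.sqrt.f32"
    * type.length == 4  ->  "llvm.sqrt.v4f32"
    * type.length == 8  ->  "llvm.sqrt.v8f32"
    */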
@@ -1586,19 +1914,28 @@ lp_build_rcp(struct lp_build_context *bld,
* - it doesn't even get the reciprocate of 1.0 exactly
* - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf
* - for recent processors the benefit over DIVPS is marginal, a case
- * depedent
+ * dependent
*
* We could still use it on certain processors if benchmarks show that the
* RCPPS plus necessary workarounds are still preferable to DIVPS; or for
* particular uses that require less workarounds.
*/
- if (FALSE && util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
+ if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
+ (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))) {
const unsigned num_iterations = 0;
LLVMValueRef res;
unsigned i;
+ const char *intrinsic = NULL;
- res = lp_build_intrinsic_unary(builder, "llvm.x86.sse.rcp.ps", bld->vec_type, a);
+ if (type.length == 4) {
+ intrinsic = "llvm.x86.sse.rcp.ps";
+ }
+ else {
+ intrinsic = "llvm.x86.avx.rcp.ps.256";
+ }
+
+ res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
for (i = 0; i < num_iterations; ++i) {
res = lp_build_rcp_refine(bld, a, res);
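For context, lp_build_rcp_refine (defined elsewhere in this file) performs a Newton-Raphson step for the reciprocal; a scalar sketch of one iteration, assuming the standard formula:

   /* Given an estimate x ~= 1/a, one Newton-Raphson step
    * x' = x * (2 - a * x) roughly doubles the correct bits. */
   static float rcp_refine_sketch(float a, float x)
   {
      return x * (2.0f - a * x);
   }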
@@ -1653,12 +1990,22 @@ lp_build_rsqrt(struct lp_build_context *bld,
assert(type.floating);
- if (util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
+ if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
+ (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
const unsigned num_iterations = 1;
LLVMValueRef res;
unsigned i;
+ const char *intrinsic = NULL;
+
+ if (type.length == 4) {
+ intrinsic = "llvm.x86.sse.rsqrt.ps";
+ }
+ else {
+ intrinsic = "llvm.x86.avx.rsqrt.ps.256";
+ }
+
+ res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
- res = lp_build_intrinsic_unary(builder, "llvm.x86.sse.rsqrt.ps", bld->vec_type, a);
for (i = 0; i < num_iterations; ++i) {
res = lp_build_rsqrt_refine(bld, a, res);
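Similarly, lp_build_rsqrt_refine applies Newton-Raphson for the reciprocal square root; a scalar sketch of one step, again assuming the standard formula:

   /* Given x ~= 1/sqrt(a), one step is
    * x' = x * (1.5 - 0.5 * a * x * x). */
   static float rsqrt_refine_sketch(float a, float x)
   {
      return x * (1.5f - 0.5f * a * x * x);
   }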