summaryrefslogtreecommitdiff
path: root/src/gallium/auxiliary
diff options
context:
space:
mode:
Diffstat (limited to 'src/gallium/auxiliary')
-rw-r--r--src/gallium/auxiliary/Makefile.sources2
-rw-r--r--src/gallium/auxiliary/draw/draw_context.c31
-rw-r--r--src/gallium/auxiliary/draw/draw_context.h4
-rw-r--r--src/gallium/auxiliary/draw/draw_llvm.c759
-rw-r--r--src/gallium/auxiliary/draw/draw_llvm.h28
-rw-r--r--src/gallium/auxiliary/draw/draw_llvm_sample.c7
-rw-r--r--src/gallium/auxiliary/draw/draw_llvm_translate.c506
-rw-r--r--src/gallium/auxiliary/draw/draw_private.h3
-rw-r--r--src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c4
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_arit.c545
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_arit.h19
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_const.c39
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_conv.c149
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_conv.h4
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_debug.cpp22
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_flow.c9
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_format.h7
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_format_aos.c6
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c102
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_format_soa.c3
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c4
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_init.c488
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_init.h33
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_intr.c91
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_intr.h9
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_logic.c60
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_logic.h5
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_misc.cpp111
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_misc.h70
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_pack.c339
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_pack.h23
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_quad.c87
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_quad.h14
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_sample.c527
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_sample.h51
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c1344
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h8
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c493
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_swizzle.c164
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_swizzle.h24
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_tgsi.h8
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c10
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c92
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_type.c28
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_type.h59
-rw-r--r--src/gallium/auxiliary/util/u_cpu_detect.h12
46 files changed, 4156 insertions, 2247 deletions
diff --git a/src/gallium/auxiliary/Makefile.sources b/src/gallium/auxiliary/Makefile.sources
index 28a176d68fa..2807c780d2d 100644
--- a/src/gallium/auxiliary/Makefile.sources
+++ b/src/gallium/auxiliary/Makefile.sources
@@ -165,6 +165,7 @@ GALLIVM_SOURCES := \
gallivm/lp_bld_conv.c \
gallivm/lp_bld_flow.c \
gallivm/lp_bld_format_aos.c \
+ gallivm/lp_bld_format_aos_array.c \
gallivm/lp_bld_format_soa.c \
gallivm/lp_bld_format_yuv.c \
gallivm/lp_bld_gather.c \
@@ -187,7 +188,6 @@ GALLIVM_SOURCES := \
gallivm/lp_bld_type.c \
draw/draw_llvm.c \
draw/draw_llvm_sample.c \
- draw/draw_llvm_translate.c \
draw/draw_vs_llvm.c \
draw/draw_pt_fetch_shade_pipeline_llvm.c
diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c
index 20260c1abbf..be30b7db245 100644
--- a/src/gallium/auxiliary/draw/draw_context.c
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -70,8 +70,7 @@ draw_get_option_use_llvm(void)
* Create new draw module context with gallivm state for LLVM JIT.
*/
static struct draw_context *
-draw_create_context(struct pipe_context *pipe, boolean try_llvm,
- struct gallivm_state *gallivm)
+draw_create_context(struct pipe_context *pipe, boolean try_llvm)
{
struct draw_context *draw = CALLOC_STRUCT( draw_context );
if (draw == NULL)
@@ -79,16 +78,7 @@ draw_create_context(struct pipe_context *pipe, boolean try_llvm,
#if HAVE_LLVM
if (try_llvm && draw_get_option_use_llvm()) {
- if (!gallivm) {
- gallivm = gallivm_create();
- draw->own_gallivm = gallivm;
- }
-
- if (!gallivm)
- goto err_destroy;
-
- draw->llvm = draw_llvm_create(draw, gallivm);
-
+ draw->llvm = draw_llvm_create(draw);
if (!draw->llvm)
goto err_destroy;
}
@@ -114,7 +104,7 @@ err_out:
struct draw_context *
draw_create(struct pipe_context *pipe)
{
- return draw_create_context(pipe, TRUE, NULL);
+ return draw_create_context(pipe, TRUE);
}
@@ -124,17 +114,7 @@ draw_create(struct pipe_context *pipe)
struct draw_context *
draw_create_no_llvm(struct pipe_context *pipe)
{
- return draw_create_context(pipe, FALSE, NULL);
-}
-
-
-/**
- * Create new draw module context with gallivm state for LLVM JIT.
- */
-struct draw_context *
-draw_create_gallivm(struct pipe_context *pipe, struct gallivm_state *gallivm)
-{
- return draw_create_context(pipe, TRUE, gallivm);
+ return draw_create_context(pipe, FALSE);
}
@@ -213,9 +193,6 @@ void draw_destroy( struct draw_context *draw )
#ifdef HAVE_LLVM
if (draw->llvm)
draw_llvm_destroy( draw->llvm );
-
- if (draw->own_gallivm)
- gallivm_destroy(draw->own_gallivm);
#endif
FREE( draw );
diff --git a/src/gallium/auxiliary/draw/draw_context.h b/src/gallium/auxiliary/draw/draw_context.h
index 852cbc3da13..cc95600c530 100644
--- a/src/gallium/auxiliary/draw/draw_context.h
+++ b/src/gallium/auxiliary/draw/draw_context.h
@@ -48,7 +48,6 @@ struct draw_vertex_shader;
struct draw_geometry_shader;
struct draw_fragment_shader;
struct tgsi_sampler;
-struct gallivm_state;
/*
* structure to contain driver internal information
@@ -67,9 +66,6 @@ struct draw_context *draw_create( struct pipe_context *pipe );
struct draw_context *draw_create_no_llvm(struct pipe_context *pipe);
-struct draw_context *
-draw_create_gallivm(struct pipe_context *pipe, struct gallivm_state *gallivm);
-
void draw_destroy( struct draw_context *draw );
void draw_flush(struct draw_context *draw);
diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c
index e08221eb392..8d9b5309aff 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -43,6 +43,8 @@
#include "gallivm/lp_bld_intr.h"
#include "gallivm/lp_bld_init.h"
#include "gallivm/lp_bld_type.h"
+#include "gallivm/lp_bld_pack.h"
+#include "gallivm/lp_bld_format.h"
#include "tgsi/tgsi_exec.h"
#include "tgsi/tgsi_dump.h"
@@ -56,40 +58,6 @@
#define DEBUG_STORE 0
-/**
- * This function is called by the gallivm "garbage collector" when
- * the LLVM global data structures are freed. We must free all LLVM-related
- * data. Specifically, all JIT'd shader variants.
- */
-static void
-draw_llvm_garbage_collect_callback(void *cb_data)
-{
- struct draw_llvm *llvm = (struct draw_llvm *) cb_data;
- struct draw_context *draw = llvm->draw;
- struct draw_llvm_variant_list_item *li;
-
- /* Ensure prepare will be run and shaders recompiled */
- assert(!draw->suspend_flushing);
- draw_do_flush(draw, DRAW_FLUSH_STATE_CHANGE);
-
- /* free all shader variants */
- li = first_elem(&llvm->vs_variants_list);
- while (!at_end(&llvm->vs_variants_list, li)) {
- struct draw_llvm_variant_list_item *next = next_elem(li);
- draw_llvm_destroy_variant(li->base);
- li = next;
- }
-
- /* Null-out these pointers so they get remade next time they're needed.
- * See the accessor functions below.
- */
- llvm->context_ptr_type = NULL;
- llvm->buffer_ptr_type = NULL;
- llvm->vb_ptr_type = NULL;
- llvm->vertex_header_ptr_type = NULL;
-}
-
-
static void
draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *var,
boolean elts);
@@ -316,56 +284,56 @@ create_jit_vertex_header(struct gallivm_state *gallivm, int data_elems)
* Create LLVM types for various structures.
*/
static void
-create_jit_types(struct draw_llvm *llvm)
+create_jit_types(struct draw_llvm_variant *variant)
{
- struct gallivm_state *gallivm = llvm->gallivm;
+ struct gallivm_state *gallivm = variant->gallivm;
LLVMTypeRef texture_type, context_type, buffer_type, vb_type;
texture_type = create_jit_texture_type(gallivm, "texture");
context_type = create_jit_context_type(gallivm, texture_type, "draw_jit_context");
- llvm->context_ptr_type = LLVMPointerType(context_type, 0);
+ variant->context_ptr_type = LLVMPointerType(context_type, 0);
buffer_type = LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 8), 0);
- llvm->buffer_ptr_type = LLVMPointerType(buffer_type, 0);
+ variant->buffer_ptr_type = LLVMPointerType(buffer_type, 0);
vb_type = create_jit_vertex_buffer_type(gallivm, "pipe_vertex_buffer");
- llvm->vb_ptr_type = LLVMPointerType(vb_type, 0);
+ variant->vb_ptr_type = LLVMPointerType(vb_type, 0);
}
static LLVMTypeRef
-get_context_ptr_type(struct draw_llvm *llvm)
+get_context_ptr_type(struct draw_llvm_variant *variant)
{
- if (!llvm->context_ptr_type)
- create_jit_types(llvm);
- return llvm->context_ptr_type;
+ if (!variant->context_ptr_type)
+ create_jit_types(variant);
+ return variant->context_ptr_type;
}
static LLVMTypeRef
-get_buffer_ptr_type(struct draw_llvm *llvm)
+get_buffer_ptr_type(struct draw_llvm_variant *variant)
{
- if (!llvm->buffer_ptr_type)
- create_jit_types(llvm);
- return llvm->buffer_ptr_type;
+ if (!variant->buffer_ptr_type)
+ create_jit_types(variant);
+ return variant->buffer_ptr_type;
}
static LLVMTypeRef
-get_vb_ptr_type(struct draw_llvm *llvm)
+get_vb_ptr_type(struct draw_llvm_variant *variant)
{
- if (!llvm->vb_ptr_type)
- create_jit_types(llvm);
- return llvm->vb_ptr_type;
+ if (!variant->vb_ptr_type)
+ create_jit_types(variant);
+ return variant->vb_ptr_type;
}
static LLVMTypeRef
-get_vertex_header_ptr_type(struct draw_llvm *llvm)
+get_vertex_header_ptr_type(struct draw_llvm_variant *variant)
{
- if (!llvm->vertex_header_ptr_type)
- create_jit_types(llvm);
- return llvm->vertex_header_ptr_type;
+ if (!variant->vertex_header_ptr_type)
+ create_jit_types(variant);
+ return variant->vertex_header_ptr_type;
}
@@ -373,7 +341,7 @@ get_vertex_header_ptr_type(struct draw_llvm *llvm)
* Create per-context LLVM info.
*/
struct draw_llvm *
-draw_llvm_create(struct draw_context *draw, struct gallivm_state *gallivm)
+draw_llvm_create(struct draw_context *draw)
{
struct draw_llvm *llvm;
@@ -384,18 +352,10 @@ draw_llvm_create(struct draw_context *draw, struct gallivm_state *gallivm)
lp_build_init();
llvm->draw = draw;
- llvm->gallivm = gallivm;
-
- if (gallivm_debug & GALLIVM_DEBUG_IR) {
- LLVMDumpModule(llvm->gallivm->module);
- }
llvm->nr_variants = 0;
make_empty_list(&llvm->vs_variants_list);
- gallivm_register_garbage_collector_callback(
- draw_llvm_garbage_collect_callback, llvm);
-
return llvm;
}
@@ -406,9 +366,6 @@ draw_llvm_create(struct draw_context *draw, struct gallivm_state *gallivm)
void
draw_llvm_destroy(struct draw_llvm *llvm)
{
- gallivm_remove_garbage_collector_callback(
- draw_llvm_garbage_collect_callback, llvm);
-
/* XXX free other draw_llvm data? */
FREE(llvm);
}
@@ -435,15 +392,27 @@ draw_llvm_create_variant(struct draw_llvm *llvm,
variant->llvm = llvm;
+ variant->gallivm = gallivm_create();
+
+ create_jit_types(variant);
+
memcpy(&variant->key, key, shader->variant_key_size);
- vertex_header = create_jit_vertex_header(llvm->gallivm, num_inputs);
+ vertex_header = create_jit_vertex_header(variant->gallivm, num_inputs);
- llvm->vertex_header_ptr_type = LLVMPointerType(vertex_header, 0);
+ variant->vertex_header_ptr_type = LLVMPointerType(vertex_header, 0);
draw_llvm_generate(llvm, variant, FALSE); /* linear */
draw_llvm_generate(llvm, variant, TRUE); /* elts */
+ gallivm_compile_module(variant->gallivm);
+
+ variant->jit_func = (draw_jit_vert_func)
+ gallivm_jit_function(variant->gallivm, variant->function);
+
+ variant->jit_func_elts = (draw_jit_vert_func_elts)
+ gallivm_jit_function(variant->gallivm, variant->function_elts);
+
variant->shader = shader;
variant->list_item_global.base = variant;
variant->list_item_local.base = variant;
@@ -455,8 +424,9 @@ draw_llvm_create_variant(struct draw_llvm *llvm,
static void
-generate_vs(struct draw_llvm *llvm,
+generate_vs(struct draw_llvm_variant *variant,
LLVMBuilderRef builder,
+ struct lp_type vs_type,
LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS],
const LLVMValueRef (*inputs)[TGSI_NUM_CHANNELS],
const struct lp_bld_tgsi_system_values *system_values,
@@ -464,21 +434,11 @@ generate_vs(struct draw_llvm *llvm,
struct lp_build_sampler_soa *draw_sampler,
boolean clamp_vertex_color)
{
+ struct draw_llvm *llvm = variant->llvm;
const struct tgsi_token *tokens = llvm->draw->vs.vertex_shader->state.tokens;
- struct lp_type vs_type;
- LLVMValueRef consts_ptr = draw_jit_context_vs_constants(llvm->gallivm, context_ptr);
+ LLVMValueRef consts_ptr = draw_jit_context_vs_constants(variant->gallivm, context_ptr);
struct lp_build_sampler_soa *sampler = 0;
- memset(&vs_type, 0, sizeof vs_type);
- vs_type.floating = TRUE; /* floating point values */
- vs_type.sign = TRUE; /* values are signed */
- vs_type.norm = FALSE; /* values are not limited to [0,1] or [-1,1] */
- vs_type.width = 32; /* 32-bit float */
- vs_type.length = 4; /* 4 elements per vector */
-#if 0
- num_vs = 4; /* number of vertices per block */
-#endif
-
if (gallivm_debug & GALLIVM_DEBUG_IR) {
tgsi_dump(tokens, 0);
}
@@ -486,7 +446,7 @@ generate_vs(struct draw_llvm *llvm,
if (llvm->draw->num_sampler_views && llvm->draw->num_samplers)
sampler = draw_sampler;
- lp_build_tgsi_soa(llvm->gallivm,
+ lp_build_tgsi_soa(variant->gallivm,
tokens,
vs_type,
NULL /*struct lp_build_mask_context *mask*/,
@@ -503,7 +463,7 @@ generate_vs(struct draw_llvm *llvm,
unsigned chan, attrib;
struct lp_build_context bld;
struct tgsi_shader_info* info = &llvm->draw->vs.vertex_shader->info;
- lp_build_context_init(&bld, llvm->gallivm, vs_type);
+ lp_build_context_init(&bld, variant->gallivm, vs_type);
for (attrib = 0; attrib < info->num_outputs; ++attrib) {
for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
@@ -531,25 +491,6 @@ generate_vs(struct draw_llvm *llvm,
}
-#if DEBUG_STORE
-static void print_vectorf(LLVMBuilderRef builder,
- LLVMValueRef vec)
-{
- LLVMValueRef val[4];
- val[0] = LLVMBuildExtractElement(builder, vec,
- lp_build_const_int32(gallivm, 0), "");
- val[1] = LLVMBuildExtractElement(builder, vec,
- lp_build_const_int32(gallivm, 1), "");
- val[2] = LLVMBuildExtractElement(builder, vec,
- lp_build_const_int32(gallivm, 2), "");
- val[3] = LLVMBuildExtractElement(builder, vec,
- lp_build_const_int32(gallivm, 3), "");
- lp_build_printf(builder, "vector = [%f, %f, %f, %f]\n",
- val[0], val[1], val[2], val[3]);
-}
-#endif
-
-
static void
generate_fetch(struct gallivm_state *gallivm,
LLVMValueRef vbuffers_ptr,
@@ -559,6 +500,8 @@ generate_fetch(struct gallivm_state *gallivm,
LLVMValueRef index,
LLVMValueRef instance_id)
{
+ const struct util_format_description *format_desc = util_format_description(velem->src_format);
+ LLVMValueRef zero = LLVMConstNull(LLVMInt32TypeInContext(gallivm->context));
LLVMBuilderRef builder = gallivm->builder;
LLVMValueRef indices =
LLVMConstInt(LLVMInt64TypeInContext(gallivm->context),
@@ -587,118 +530,47 @@ generate_fetch(struct gallivm_state *gallivm,
lp_build_const_int32(gallivm, velem->src_offset),
"");
- /*lp_build_printf(builder, "vbuf index = %d, stride is %d\n", indices, stride);*/
+/* lp_build_printf(gallivm, "vbuf index = %d, stride is %d\n", indices, stride);*/
vbuffer_ptr = LLVMBuildGEP(builder, vbuffer_ptr, &stride, 1, "");
- *res = draw_llvm_translate_from(gallivm, vbuffer_ptr, velem->src_format);
-}
-
-
-static LLVMValueRef
-aos_to_soa(struct gallivm_state *gallivm,
- LLVMValueRef val0,
- LLVMValueRef val1,
- LLVMValueRef val2,
- LLVMValueRef val3,
- LLVMValueRef channel)
-{
- LLVMBuilderRef builder = gallivm->builder;
- LLVMValueRef ex, res;
-
- ex = LLVMBuildExtractElement(builder, val0,
- channel, "");
- res = LLVMBuildInsertElement(builder,
- LLVMConstNull(LLVMTypeOf(val0)),
- ex,
- lp_build_const_int32(gallivm, 0),
- "");
-
- ex = LLVMBuildExtractElement(builder, val1,
- channel, "");
- res = LLVMBuildInsertElement(builder,
- res, ex,
- lp_build_const_int32(gallivm, 1),
- "");
-
- ex = LLVMBuildExtractElement(builder, val2,
- channel, "");
- res = LLVMBuildInsertElement(builder,
- res, ex,
- lp_build_const_int32(gallivm, 2),
- "");
-
- ex = LLVMBuildExtractElement(builder, val3,
- channel, "");
- res = LLVMBuildInsertElement(builder,
- res, ex,
- lp_build_const_int32(gallivm, 3),
- "");
-
- return res;
+ *res = lp_build_fetch_rgba_aos(gallivm,
+ format_desc,
+ lp_float32_vec4_type(),
+ vbuffer_ptr,
+ zero, zero, zero);
}
-
static void
-soa_to_aos(struct gallivm_state *gallivm,
- LLVMValueRef soa[TGSI_NUM_CHANNELS],
- LLVMValueRef aos[TGSI_NUM_CHANNELS])
+convert_to_soa(struct gallivm_state *gallivm,
+ LLVMValueRef (*src_aos)[LP_MAX_VECTOR_WIDTH / 32],
+ LLVMValueRef (*dst_soa)[TGSI_NUM_CHANNELS],
+ unsigned num_attribs, const struct lp_type soa_type)
{
- LLVMBuilderRef builder = gallivm->builder;
- LLVMValueRef comp;
- int i = 0;
+ unsigned i, j, k;
+ struct lp_type aos_channel_type = soa_type;
debug_assert(TGSI_NUM_CHANNELS == 4);
+ debug_assert((soa_type.length % TGSI_NUM_CHANNELS) == 0);
- aos[0] = LLVMConstNull(LLVMTypeOf(soa[0]));
- aos[1] = aos[2] = aos[3] = aos[0];
-
- for (i = 0; i < TGSI_NUM_CHANNELS; ++i) {
- LLVMValueRef channel = lp_build_const_int32(gallivm, i);
-
- comp = LLVMBuildExtractElement(builder, soa[i],
- lp_build_const_int32(gallivm, 0), "");
- aos[0] = LLVMBuildInsertElement(builder, aos[0], comp, channel, "");
-
- comp = LLVMBuildExtractElement(builder, soa[i],
- lp_build_const_int32(gallivm, 1), "");
- aos[1] = LLVMBuildInsertElement(builder, aos[1], comp, channel, "");
+ aos_channel_type.length >>= 1;
- comp = LLVMBuildExtractElement(builder, soa[i],
- lp_build_const_int32(gallivm, 2), "");
- aos[2] = LLVMBuildInsertElement(builder, aos[2], comp, channel, "");
-
- comp = LLVMBuildExtractElement(builder, soa[i],
- lp_build_const_int32(gallivm, 3), "");
- aos[3] = LLVMBuildInsertElement(builder, aos[3], comp, channel, "");
+ for (i = 0; i < num_attribs; ++i) {
+ LLVMValueRef aos_channels[TGSI_NUM_CHANNELS];
+ unsigned pixels_per_channel = soa_type.length / TGSI_NUM_CHANNELS;
- }
-}
+ for (j = 0; j < TGSI_NUM_CHANNELS; ++j) {
+ LLVMValueRef channel[LP_MAX_VECTOR_LENGTH];
+ assert(pixels_per_channel <= LP_MAX_VECTOR_LENGTH);
-static void
-convert_to_soa(struct gallivm_state *gallivm,
- LLVMValueRef (*aos)[TGSI_NUM_CHANNELS],
- LLVMValueRef (*soa)[TGSI_NUM_CHANNELS],
- int num_attribs)
-{
- int i;
+ for (k = 0; k < pixels_per_channel; ++k) {
+ channel[k] = src_aos[i][j + TGSI_NUM_CHANNELS * k];
+ }
- debug_assert(TGSI_NUM_CHANNELS == 4);
+ aos_channels[j] = lp_build_concat(gallivm, channel, aos_channel_type, pixels_per_channel);
+ }
- for (i = 0; i < num_attribs; ++i) {
- LLVMValueRef val0 = aos[i][0];
- LLVMValueRef val1 = aos[i][1];
- LLVMValueRef val2 = aos[i][2];
- LLVMValueRef val3 = aos[i][3];
-
- soa[i][0] = aos_to_soa(gallivm, val0, val1, val2, val3,
- lp_build_const_int32(gallivm, 0));
- soa[i][1] = aos_to_soa(gallivm, val0, val1, val2, val3,
- lp_build_const_int32(gallivm, 1));
- soa[i][2] = aos_to_soa(gallivm, val0, val1, val2, val3,
- lp_build_const_int32(gallivm, 2));
- soa[i][3] = aos_to_soa(gallivm, val0, val1, val2, val3,
- lp_build_const_int32(gallivm, 3));
+ lp_build_transpose_aos(gallivm, soa_type, aos_channels, dst_soa[i]);
}
}
@@ -707,89 +579,34 @@ static void
store_aos(struct gallivm_state *gallivm,
LLVMValueRef io_ptr,
LLVMValueRef index,
- LLVMValueRef value,
- LLVMValueRef clipmask, boolean have_clipdist)
+ LLVMValueRef value)
{
+ LLVMTypeRef data_ptr_type = LLVMPointerType(lp_build_vec_type(gallivm, lp_float32_vec4_type()), 0);
LLVMBuilderRef builder = gallivm->builder;
- LLVMValueRef id_ptr = draw_jit_header_id(gallivm, io_ptr);
LLVMValueRef data_ptr = draw_jit_header_data(gallivm, io_ptr);
LLVMValueRef indices[3];
- LLVMValueRef val;
- int vertex_id_pad_edgeflag;
indices[0] = lp_build_const_int32(gallivm, 0);
indices[1] = index;
indices[2] = lp_build_const_int32(gallivm, 0);
- /* If this assertion fails, it means we need to update the bit twidding
- * code here. See struct vertex_header in draw_private.h.
- */
- assert(DRAW_TOTAL_CLIP_PLANES==14);
- /* initialize vertex id:16 = 0xffff, have_clipdist:1 = 0, edgeflag:1 = 1 */
- vertex_id_pad_edgeflag = (0xffff << 16) | (1 << DRAW_TOTAL_CLIP_PLANES);
- if (have_clipdist)
- vertex_id_pad_edgeflag |= 1 << (DRAW_TOTAL_CLIP_PLANES+1);
- val = lp_build_const_int32(gallivm, vertex_id_pad_edgeflag);
- /* OR with the clipmask */
- val = LLVMBuildOr(builder, val, clipmask, "");
-
- /* store vertex header */
- LLVMBuildStore(builder, val, id_ptr);
-
-
#if DEBUG_STORE
- lp_build_printf(builder, " ---- %p storing attribute %d (io = %p)\n", data_ptr, index, io_ptr);
-#endif
-#if 0
- /*lp_build_printf(builder, " ---- %p storing at %d (%p) ", io_ptr, index, data_ptr);
- print_vectorf(builder, value);*/
- data_ptr = LLVMBuildBitCast(builder, data_ptr,
- LLVMPointerType(LLVMArrayType(LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4), 0), 0),
- "datavec");
- data_ptr = LLVMBuildGEP(builder, data_ptr, indices, 2, "");
-
- LLVMBuildStore(builder, value, data_ptr);
-#else
- {
- LLVMValueRef x, y, z, w;
- LLVMValueRef idx0, idx1, idx2, idx3;
- LLVMValueRef gep0, gep1, gep2, gep3;
- data_ptr = LLVMBuildGEP(builder, data_ptr, indices, 3, "");
-
- idx0 = lp_build_const_int32(gallivm, 0);
- idx1 = lp_build_const_int32(gallivm, 1);
- idx2 = lp_build_const_int32(gallivm, 2);
- idx3 = lp_build_const_int32(gallivm, 3);
-
- x = LLVMBuildExtractElement(builder, value,
- idx0, "");
- y = LLVMBuildExtractElement(builder, value,
- idx1, "");
- z = LLVMBuildExtractElement(builder, value,
- idx2, "");
- w = LLVMBuildExtractElement(builder, value,
- idx3, "");
-
- gep0 = LLVMBuildGEP(builder, data_ptr, &idx0, 1, "");
- gep1 = LLVMBuildGEP(builder, data_ptr, &idx1, 1, "");
- gep2 = LLVMBuildGEP(builder, data_ptr, &idx2, 1, "");
- gep3 = LLVMBuildGEP(builder, data_ptr, &idx3, 1, "");
-
- /*lp_build_printf(builder, "##### x = %f (%p), y = %f (%p), z = %f (%p), w = %f (%p)\n",
- x, gep0, y, gep1, z, gep2, w, gep3);*/
- LLVMBuildStore(builder, x, gep0);
- LLVMBuildStore(builder, y, gep1);
- LLVMBuildStore(builder, z, gep2);
- LLVMBuildStore(builder, w, gep3);
- }
+ lp_build_printf(gallivm, " ---- %p storing attribute %d (io = %p)\n", data_ptr, index, io_ptr);
#endif
+
+ data_ptr = LLVMBuildGEP(builder, data_ptr, indices, 3, "");
+ data_ptr = LLVMBuildPointerCast(builder, data_ptr, data_ptr_type, "");
+
+ /* Unaligned store due to the vertex header */
+ lp_set_store_alignment(LLVMBuildStore(builder, value, data_ptr), sizeof(float));
}
static void
store_aos_array(struct gallivm_state *gallivm,
+ struct lp_type soa_type,
LLVMValueRef io_ptr,
- LLVMValueRef aos[TGSI_NUM_CHANNELS],
+ LLVMValueRef* aos,
int attrib,
int num_outputs,
LLVMValueRef clipmask,
@@ -797,42 +614,49 @@ store_aos_array(struct gallivm_state *gallivm,
{
LLVMBuilderRef builder = gallivm->builder;
LLVMValueRef attr_index = lp_build_const_int32(gallivm, attrib);
- LLVMValueRef ind0 = lp_build_const_int32(gallivm, 0);
- LLVMValueRef ind1 = lp_build_const_int32(gallivm, 1);
- LLVMValueRef ind2 = lp_build_const_int32(gallivm, 2);
- LLVMValueRef ind3 = lp_build_const_int32(gallivm, 3);
- LLVMValueRef io0_ptr, io1_ptr, io2_ptr, io3_ptr;
- LLVMValueRef clipmask0, clipmask1, clipmask2, clipmask3;
+ LLVMValueRef inds[LP_MAX_VECTOR_WIDTH / 32];
+ LLVMValueRef io_ptrs[LP_MAX_VECTOR_WIDTH / 32];
+ int vector_length = soa_type.length;
+ int i;
debug_assert(TGSI_NUM_CHANNELS == 4);
- io0_ptr = LLVMBuildGEP(builder, io_ptr,
- &ind0, 1, "");
- io1_ptr = LLVMBuildGEP(builder, io_ptr,
- &ind1, 1, "");
- io2_ptr = LLVMBuildGEP(builder, io_ptr,
- &ind2, 1, "");
- io3_ptr = LLVMBuildGEP(builder, io_ptr,
- &ind3, 1, "");
-
- clipmask0 = LLVMBuildExtractElement(builder, clipmask,
- ind0, "");
- clipmask1 = LLVMBuildExtractElement(builder, clipmask,
- ind1, "");
- clipmask2 = LLVMBuildExtractElement(builder, clipmask,
- ind2, "");
- clipmask3 = LLVMBuildExtractElement(builder, clipmask,
- ind3, "");
+ for (i = 0; i < vector_length; i++) {
+ inds[i] = lp_build_const_int32(gallivm, i);
+ io_ptrs[i] = LLVMBuildGEP(builder, io_ptr, &inds[i], 1, "");
+ }
+ if (attrib == 0) {
+ /* store vertex header for each of the n vertices */
+ LLVMValueRef val, cliptmp;
+ int vertex_id_pad_edgeflag;
+
+ /* If this assertion fails, it means we need to update the bit twidding
+ * code here. See struct vertex_header in draw_private.h.
+ */
+ assert(DRAW_TOTAL_CLIP_PLANES==14);
+ /* initialize vertex id:16 = 0xffff, have_clipdist:1 = 0, edgeflag:1 = 1 */
+ vertex_id_pad_edgeflag = (0xffff << 16) | (1 << DRAW_TOTAL_CLIP_PLANES);
+ if (have_clipdist)
+ vertex_id_pad_edgeflag |= 1 << (DRAW_TOTAL_CLIP_PLANES+1);
+ val = lp_build_const_int_vec(gallivm, lp_int_type(soa_type), vertex_id_pad_edgeflag);
+ /* OR with the clipmask */
+ cliptmp = LLVMBuildOr(builder, val, clipmask, "");
+ for (i = 0; i < vector_length; i++) {
+ LLVMValueRef id_ptr = draw_jit_header_id(gallivm, io_ptrs[i]);
+ val = LLVMBuildExtractElement(builder, cliptmp, inds[i], "");
+ LLVMBuildStore(builder, val, id_ptr);
#if DEBUG_STORE
- lp_build_printf(builder, "io = %p, indexes[%d, %d, %d, %d]\n, clipmask0 = %x, clipmask1 = %x, clipmask2 = %x, clipmask3 = %x\n",
- io_ptr, ind0, ind1, ind2, ind3, clipmask0, clipmask1, clipmask2, clipmask3);
+ lp_build_printf(gallivm, "io = %p, index %d\n, clipmask = %x\n",
+ io_ptrs[i], inds[i], val);
#endif
- /* store for each of the 4 vertices */
- store_aos(gallivm, io0_ptr, attr_index, aos[0], clipmask0, have_clipdist);
- store_aos(gallivm, io1_ptr, attr_index, aos[1], clipmask1, have_clipdist);
- store_aos(gallivm, io2_ptr, attr_index, aos[2], clipmask2, have_clipdist);
- store_aos(gallivm, io3_ptr, attr_index, aos[3], clipmask3, have_clipdist);
+ }
+ }
+
+ /* store for each of the n vertices */
+ for (i = 0; i < vector_length; i++) {
+ store_aos(gallivm, io_ptrs[i], attr_index, aos[i]);
+ }
}
@@ -842,33 +666,53 @@ convert_to_aos(struct gallivm_state *gallivm,
LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS],
LLVMValueRef clipmask,
int num_outputs,
- int max_vertices, boolean have_clipdist)
+ struct lp_type soa_type,
+ boolean have_clipdist)
{
LLVMBuilderRef builder = gallivm->builder;
- unsigned chan, attrib;
+ unsigned chan, attrib, i;
#if DEBUG_STORE
- lp_build_printf(builder, " # storing begin\n");
+ lp_build_printf(gallivm, " # storing begin\n");
#endif
for (attrib = 0; attrib < num_outputs; ++attrib) {
- LLVMValueRef soa[4];
- LLVMValueRef aos[4];
+ LLVMValueRef soa[TGSI_NUM_CHANNELS];
+ LLVMValueRef aos[LP_MAX_VECTOR_WIDTH / 32];
for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
if (outputs[attrib][chan]) {
LLVMValueRef out = LLVMBuildLoad(builder, outputs[attrib][chan], "");
lp_build_name(out, "output%u.%c", attrib, "xyzw"[chan]);
- /*lp_build_printf(builder, "output %d : %d ",
- LLVMConstInt(LLVMInt32Type(), attrib, 0),
- LLVMConstInt(LLVMInt32Type(), chan, 0));
- print_vectorf(builder, out);*/
+#if DEBUG_STORE
+ lp_build_printf(gallivm, "output %d : %d ",
+ LLVMConstInt(LLVMInt32TypeInContext(gallivm->context),
+ attrib, 0),
+ LLVMConstInt(LLVMInt32TypeInContext(gallivm->context),
+ chan, 0));
+ lp_build_print_value(gallivm, "val = ", out);
+#endif
soa[chan] = out;
}
else {
soa[chan] = 0;
}
}
- soa_to_aos(gallivm, soa, aos);
+
+
+ if (soa_type.length == TGSI_NUM_CHANNELS) {
+ lp_build_transpose_aos(gallivm, soa_type, soa, aos);
+ } else {
+ lp_build_transpose_aos(gallivm, soa_type, soa, soa);
+
+ for (i = 0; i < soa_type.length; ++i) {
+ aos[i] = lp_build_extract_range(gallivm,
+ soa[i % TGSI_NUM_CHANNELS],
+ (i / TGSI_NUM_CHANNELS) * TGSI_NUM_CHANNELS,
+ TGSI_NUM_CHANNELS);
+ }
+ }
+
store_aos_array(gallivm,
+ soa_type,
io,
aos,
attrib,
@@ -876,104 +720,71 @@ convert_to_aos(struct gallivm_state *gallivm,
clipmask, have_clipdist);
}
#if DEBUG_STORE
- lp_build_printf(builder, " # storing end\n");
+ lp_build_printf(gallivm, " # storing end\n");
#endif
}
/**
* Stores original vertex positions in clip coordinates
- * There is probably a more efficient way to do this, 4 floats at once
- * rather than extracting each element one by one.
- * idx is the output to store things too, if pre_clip_pos is set
- * we store the pos to the idx, if not we store the clipvertex to it.
*/
static void
store_clip(struct gallivm_state *gallivm,
+ const struct lp_type vs_type,
LLVMValueRef io_ptr,
LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS],
boolean pre_clip_pos, int idx)
{
LLVMBuilderRef builder = gallivm->builder;
- LLVMValueRef out[4];
+ LLVMValueRef soa[4];
+ LLVMValueRef aos[LP_MAX_VECTOR_LENGTH];
LLVMValueRef indices[2];
- LLVMValueRef io0_ptr, io1_ptr, io2_ptr, io3_ptr;
- LLVMValueRef clip_ptr0, clip_ptr1, clip_ptr2, clip_ptr3;
- LLVMValueRef clip0_ptr, clip1_ptr, clip2_ptr, clip3_ptr;
- LLVMValueRef out0elem, out1elem, out2elem, out3elem;
- int i;
+ LLVMValueRef io_ptrs[LP_MAX_VECTOR_WIDTH / 32];
+ LLVMValueRef inds[LP_MAX_VECTOR_WIDTH / 32];
+ LLVMValueRef clip_ptrs[LP_MAX_VECTOR_WIDTH / 32];
+ int i, j;
- LLVMValueRef ind0 = lp_build_const_int32(gallivm, 0);
- LLVMValueRef ind1 = lp_build_const_int32(gallivm, 1);
- LLVMValueRef ind2 = lp_build_const_int32(gallivm, 2);
- LLVMValueRef ind3 = lp_build_const_int32(gallivm, 3);
-
indices[0] =
indices[1] = lp_build_const_int32(gallivm, 0);
- out[0] = LLVMBuildLoad(builder, outputs[idx][0], ""); /*x0 x1 x2 x3*/
- out[1] = LLVMBuildLoad(builder, outputs[idx][1], ""); /*y0 y1 y2 y3*/
- out[2] = LLVMBuildLoad(builder, outputs[idx][2], ""); /*z0 z1 z2 z3*/
- out[3] = LLVMBuildLoad(builder, outputs[idx][3], ""); /*w0 w1 w2 w3*/
+ for (i = 0; i < vs_type.length; i++) {
+ inds[i] = lp_build_const_int32(gallivm, i);
+ io_ptrs[i] = LLVMBuildGEP(builder, io_ptr, &inds[i], 1, "");
+ }
- io0_ptr = LLVMBuildGEP(builder, io_ptr, &ind0, 1, "");
- io1_ptr = LLVMBuildGEP(builder, io_ptr, &ind1, 1, "");
- io2_ptr = LLVMBuildGEP(builder, io_ptr, &ind2, 1, "");
- io3_ptr = LLVMBuildGEP(builder, io_ptr, &ind3, 1, "");
+ soa[0] = LLVMBuildLoad(builder, outputs[idx][0], ""); /*x0 x1 .. xn*/
+ soa[1] = LLVMBuildLoad(builder, outputs[idx][1], ""); /*y0 y1 .. yn*/
+ soa[2] = LLVMBuildLoad(builder, outputs[idx][2], ""); /*z0 z1 .. zn*/
+ soa[3] = LLVMBuildLoad(builder, outputs[idx][3], ""); /*w0 w1 .. wn*/
if (!pre_clip_pos) {
- clip_ptr0 = draw_jit_header_clip(gallivm, io0_ptr);
- clip_ptr1 = draw_jit_header_clip(gallivm, io1_ptr);
- clip_ptr2 = draw_jit_header_clip(gallivm, io2_ptr);
- clip_ptr3 = draw_jit_header_clip(gallivm, io3_ptr);
+ for (i = 0; i < vs_type.length; i++) {
+ clip_ptrs[i] = draw_jit_header_clip(gallivm, io_ptrs[i]);
+ }
} else {
- clip_ptr0 = draw_jit_header_pre_clip_pos(gallivm, io0_ptr);
- clip_ptr1 = draw_jit_header_pre_clip_pos(gallivm, io1_ptr);
- clip_ptr2 = draw_jit_header_pre_clip_pos(gallivm, io2_ptr);
- clip_ptr3 = draw_jit_header_pre_clip_pos(gallivm, io3_ptr);
+ for (i = 0; i < vs_type.length; i++) {
+ clip_ptrs[i] = draw_jit_header_pre_clip_pos(gallivm, io_ptrs[i]);
+ }
}
- for (i = 0; i<4; i++) {
- clip0_ptr = LLVMBuildGEP(builder, clip_ptr0, indices, 2, ""); /* x0 */
- clip1_ptr = LLVMBuildGEP(builder, clip_ptr1, indices, 2, ""); /* x1 */
- clip2_ptr = LLVMBuildGEP(builder, clip_ptr2, indices, 2, ""); /* x2 */
- clip3_ptr = LLVMBuildGEP(builder, clip_ptr3, indices, 2, ""); /* x3 */
-
- out0elem = LLVMBuildExtractElement(builder, out[i], ind0, ""); /* x0 */
- out1elem = LLVMBuildExtractElement(builder, out[i], ind1, ""); /* x1 */
- out2elem = LLVMBuildExtractElement(builder, out[i], ind2, ""); /* x2 */
- out3elem = LLVMBuildExtractElement(builder, out[i], ind3, ""); /* x3 */
-
- LLVMBuildStore(builder, out0elem, clip0_ptr);
- LLVMBuildStore(builder, out1elem, clip1_ptr);
- LLVMBuildStore(builder, out2elem, clip2_ptr);
- LLVMBuildStore(builder, out3elem, clip3_ptr);
-
- indices[1]= LLVMBuildAdd(builder, indices[1], ind1, "");
+ lp_build_transpose_aos(gallivm, vs_type, soa, soa);
+ for (i = 0; i < vs_type.length; ++i) {
+ aos[i] = lp_build_extract_range(gallivm,
+ soa[i % TGSI_NUM_CHANNELS],
+ (i / TGSI_NUM_CHANNELS) * TGSI_NUM_CHANNELS,
+ TGSI_NUM_CHANNELS);
}
-}
-
+ for (j = 0; j < vs_type.length; j++) {
+ LLVMTypeRef clip_ptr_type = LLVMPointerType(LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4), 0);
+ LLVMValueRef clip_ptr;
-/**
- * Equivalent of _mm_set1_ps(a)
- */
-static LLVMValueRef
-vec4f_from_scalar(struct gallivm_state *gallivm,
- LLVMValueRef a,
- const char *name)
-{
- LLVMTypeRef float_type = LLVMFloatTypeInContext(gallivm->context);
- LLVMValueRef res = LLVMGetUndef(LLVMVectorType(float_type, 4));
- int i;
+ clip_ptr = LLVMBuildGEP(builder, clip_ptrs[j], indices, 2, "clipo");
+ clip_ptr = LLVMBuildPointerCast(builder, clip_ptr, clip_ptr_type, "");
- for (i = 0; i < 4; ++i) {
- LLVMValueRef index = lp_build_const_int32(gallivm, i);
- res = LLVMBuildInsertElement(gallivm->builder, res, a,
- index, i == 3 ? name : "");
+ /* Unaligned store */
+ lp_set_store_alignment(LLVMBuildStore(builder, aos[j], clip_ptr), sizeof(float));
}
-
- return res;
}
@@ -981,15 +792,17 @@ vec4f_from_scalar(struct gallivm_state *gallivm,
* Transforms the outputs for viewport mapping
*/
static void
-generate_viewport(struct draw_llvm *llvm,
+generate_viewport(struct draw_llvm_variant *variant,
LLVMBuilderRef builder,
+ struct lp_type vs_type,
LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS],
LLVMValueRef context_ptr)
{
int i;
- struct gallivm_state *gallivm = llvm->gallivm;
- struct lp_type f32_type = lp_type_float_vec(32);
- LLVMValueRef out3 = LLVMBuildLoad(builder, outputs[0][3], ""); /*w0 w1 w2 w3*/
+ struct gallivm_state *gallivm = variant->gallivm;
+ struct lp_type f32_type = vs_type;
+ LLVMTypeRef vs_type_llvm = lp_build_vec_type(gallivm, vs_type);
+ LLVMValueRef out3 = LLVMBuildLoad(builder, outputs[0][3], ""); /*w0 w1 .. wn*/
LLVMValueRef const1 = lp_build_const_vec(gallivm, f32_type, 1.0); /*1.0 1.0 1.0 1.0*/
LLVMValueRef vp_ptr = draw_jit_context_viewport(gallivm, context_ptr);
@@ -999,7 +812,7 @@ generate_viewport(struct draw_llvm *llvm,
/* Viewport Mapping */
for (i=0; i<3; i++) {
- LLVMValueRef out = LLVMBuildLoad(builder, outputs[0][i], ""); /*x0 x1 x2 x3*/
+ LLVMValueRef out = LLVMBuildLoad(builder, outputs[0][i], ""); /*x0 x1 .. xn*/
LLVMValueRef scale;
LLVMValueRef trans;
LLVMValueRef scale_i;
@@ -1012,8 +825,10 @@ generate_viewport(struct draw_llvm *llvm,
index = lp_build_const_int32(gallivm, i+4);
trans_i = LLVMBuildGEP(builder, vp_ptr, &index, 1, "");
- scale = vec4f_from_scalar(gallivm, LLVMBuildLoad(builder, scale_i, ""), "scale");
- trans = vec4f_from_scalar(gallivm, LLVMBuildLoad(builder, trans_i, ""), "trans");
+ scale = lp_build_broadcast(gallivm, vs_type_llvm,
+ LLVMBuildLoad(builder, scale_i, "scale"));
+ trans = lp_build_broadcast(gallivm, vs_type_llvm,
+ LLVMBuildLoad(builder, trans_i, "trans"));
/* divide by w */
out = LLVMBuildFMul(builder, out, out3, "");
@@ -1030,10 +845,12 @@ generate_viewport(struct draw_llvm *llvm,
/**
- * Returns clipmask as 4xi32 bitmask for the 4 vertices
+ * Returns clipmask as nxi32 bitmask for the n vertices
*/
static LLVMValueRef
generate_clipmask(struct draw_llvm *llvm,
+ struct gallivm_state *gallivm,
+ struct lp_type vs_type,
LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS],
boolean clip_xy,
boolean clip_z,
@@ -1043,15 +860,15 @@ generate_clipmask(struct draw_llvm *llvm,
LLVMValueRef context_ptr,
boolean *have_clipdist)
{
- struct gallivm_state *gallivm = llvm->gallivm;
LLVMBuilderRef builder = gallivm->builder;
- LLVMValueRef mask; /* stores the <4xi32> clipmasks */
+ LLVMValueRef mask; /* stores the <nxi32> clipmasks */
LLVMValueRef test, temp;
LLVMValueRef zero, shift;
LLVMValueRef pos_x, pos_y, pos_z, pos_w;
LLVMValueRef cv_x, cv_y, cv_z, cv_w;
LLVMValueRef plane1, planes, plane_ptr, sum;
- struct lp_type f32_type = lp_type_float_vec(32);
+ struct lp_type f32_type = vs_type;
+ struct lp_type i32_type = lp_int_type(vs_type);
const unsigned pos = draw_current_shader_position_output(llvm->draw);
const unsigned cv = draw_current_shader_clipvertex_output(llvm->draw);
int num_written_clipdistance = llvm->draw->vs.vertex_shader->info.num_written_clipdistance;
@@ -1064,25 +881,25 @@ generate_clipmask(struct draw_llvm *llvm,
if (cd[0] != pos || cd[1] != pos)
have_cd = true;
- mask = lp_build_const_int_vec(gallivm, lp_type_int_vec(32), 0);
- temp = lp_build_const_int_vec(gallivm, lp_type_int_vec(32), 0);
- zero = lp_build_const_vec(gallivm, f32_type, 0); /* 0.0f 0.0f 0.0f 0.0f */
- shift = lp_build_const_int_vec(gallivm, lp_type_int_vec(32), 1); /* 1 1 1 1 */
+ mask = lp_build_const_int_vec(gallivm, i32_type, 0);
+ temp = lp_build_const_int_vec(gallivm, i32_type, 0);
+ zero = lp_build_const_vec(gallivm, f32_type, 0); /* 0.0f 0.0f 0.0f 0.0f */
+ shift = lp_build_const_int_vec(gallivm, i32_type, 1); /* 1 1 1 1 */
/*
* load clipvertex and position from correct locations.
* if they are the same just load them once.
*/
- pos_x = LLVMBuildLoad(builder, outputs[pos][0], ""); /*x0 x1 x2 x3*/
- pos_y = LLVMBuildLoad(builder, outputs[pos][1], ""); /*y0 y1 y2 y3*/
- pos_z = LLVMBuildLoad(builder, outputs[pos][2], ""); /*z0 z1 z2 z3*/
- pos_w = LLVMBuildLoad(builder, outputs[pos][3], ""); /*w0 w1 w2 w3*/
+ pos_x = LLVMBuildLoad(builder, outputs[pos][0], ""); /*x0 x1 .. xn */
+ pos_y = LLVMBuildLoad(builder, outputs[pos][1], ""); /*y0 y1 .. yn */
+ pos_z = LLVMBuildLoad(builder, outputs[pos][2], ""); /*z0 z1 .. zn */
+ pos_w = LLVMBuildLoad(builder, outputs[pos][3], ""); /*w0 w1 .. wn */
if (clip_user && cv != pos) {
- cv_x = LLVMBuildLoad(builder, outputs[cv][0], ""); /*x0 x1 x2 x3*/
- cv_y = LLVMBuildLoad(builder, outputs[cv][1], ""); /*y0 y1 y2 y3*/
- cv_z = LLVMBuildLoad(builder, outputs[cv][2], ""); /*z0 z1 z2 z3*/
- cv_w = LLVMBuildLoad(builder, outputs[cv][3], ""); /*w0 w1 w2 w3*/
+ cv_x = LLVMBuildLoad(builder, outputs[cv][0], ""); /*x0 x1 .. xn */
+ cv_y = LLVMBuildLoad(builder, outputs[cv][1], ""); /*y0 y1 .. yn */
+ cv_z = LLVMBuildLoad(builder, outputs[cv][2], ""); /*z0 z1 .. zn */
+ cv_w = LLVMBuildLoad(builder, outputs[cv][3], ""); /*w0 w1 .. wn */
} else {
cv_x = pos_x;
cv_y = pos_y;
@@ -1120,7 +937,7 @@ generate_clipmask(struct draw_llvm *llvm,
}
if (clip_z) {
- temp = lp_build_const_int_vec(gallivm, lp_type_int_vec(32), 16);
+ temp = lp_build_const_int_vec(gallivm, i32_type, 16);
if (clip_halfz) {
/* plane 5 */
test = lp_build_compare(gallivm, f32_type, PIPE_FUNC_GREATER, zero, pos_z);
@@ -1163,42 +980,43 @@ generate_clipmask(struct draw_llvm *llvm,
clipdist = LLVMBuildLoad(builder, outputs[cd[1]][i-4], "");
}
test = lp_build_compare(gallivm, f32_type, PIPE_FUNC_GREATER, zero, clipdist);
- temp = lp_build_const_int_vec(gallivm, lp_type_int_vec(32), 1 << plane_idx);
+ temp = lp_build_const_int_vec(gallivm, i32_type, 1 << plane_idx);
test = LLVMBuildAnd(builder, test, temp, "");
mask = LLVMBuildOr(builder, mask, test, "");
} else {
+ LLVMTypeRef vs_type_llvm = lp_build_vec_type(gallivm, vs_type);
indices[0] = lp_build_const_int32(gallivm, 0);
indices[1] = lp_build_const_int32(gallivm, plane_idx);
indices[2] = lp_build_const_int32(gallivm, 0);
plane_ptr = LLVMBuildGEP(builder, planes_ptr, indices, 3, "");
plane1 = LLVMBuildLoad(builder, plane_ptr, "plane_x");
- planes = vec4f_from_scalar(gallivm, plane1, "plane4_x");
+ planes = lp_build_broadcast(gallivm, vs_type_llvm, plane1);
sum = LLVMBuildFMul(builder, planes, cv_x, "");
indices[2] = lp_build_const_int32(gallivm, 1);
plane_ptr = LLVMBuildGEP(builder, planes_ptr, indices, 3, "");
plane1 = LLVMBuildLoad(builder, plane_ptr, "plane_y");
- planes = vec4f_from_scalar(gallivm, plane1, "plane4_y");
+ planes = lp_build_broadcast(gallivm, vs_type_llvm, plane1);
test = LLVMBuildFMul(builder, planes, cv_y, "");
sum = LLVMBuildFAdd(builder, sum, test, "");
indices[2] = lp_build_const_int32(gallivm, 2);
plane_ptr = LLVMBuildGEP(builder, planes_ptr, indices, 3, "");
plane1 = LLVMBuildLoad(builder, plane_ptr, "plane_z");
- planes = vec4f_from_scalar(gallivm, plane1, "plane4_z");
+ planes = lp_build_broadcast(gallivm, vs_type_llvm, plane1);
test = LLVMBuildFMul(builder, planes, cv_z, "");
sum = LLVMBuildFAdd(builder, sum, test, "");
indices[2] = lp_build_const_int32(gallivm, 3);
plane_ptr = LLVMBuildGEP(builder, planes_ptr, indices, 3, "");
plane1 = LLVMBuildLoad(builder, plane_ptr, "plane_w");
- planes = vec4f_from_scalar(gallivm, plane1, "plane4_w");
+ planes = lp_build_broadcast(gallivm, vs_type_llvm, plane1);
test = LLVMBuildFMul(builder, planes, cv_w, "");
sum = LLVMBuildFAdd(builder, sum, test, "");
test = lp_build_compare(gallivm, f32_type, PIPE_FUNC_GREATER, zero, sum);
- temp = lp_build_const_int_vec(gallivm, lp_type_int_vec(32), 1 << plane_idx);
+ temp = lp_build_const_int_vec(gallivm, i32_type, 1 << plane_idx);
test = LLVMBuildAnd(builder, test, temp, "");
mask = LLVMBuildOr(builder, mask, test, "");
}
@@ -1212,23 +1030,28 @@ generate_clipmask(struct draw_llvm *llvm,
* Returns boolean if any clipping has occurred
* Used zero/non-zero i32 value to represent boolean
*/
-static void
-clipmask_bool(struct gallivm_state *gallivm,
- LLVMValueRef clipmask,
- LLVMValueRef ret_ptr)
+static LLVMValueRef
+clipmask_booli32(struct gallivm_state *gallivm,
+ const struct lp_type vs_type,
+ LLVMValueRef clipmask_bool_ptr)
{
LLVMBuilderRef builder = gallivm->builder;
- LLVMValueRef ret = LLVMBuildLoad(builder, ret_ptr, "");
+ LLVMTypeRef int32_type = LLVMInt32TypeInContext(gallivm->context);
+ LLVMValueRef clipmask_bool = LLVMBuildLoad(builder, clipmask_bool_ptr, "");
+ LLVMValueRef ret = LLVMConstNull(int32_type);
LLVMValueRef temp;
int i;
- for (i=0; i<4; i++) {
- temp = LLVMBuildExtractElement(builder, clipmask,
+ /*
+ * Can do this with log2(vector length) pack instructions and one extract
+ * (as we don't actually need a or) with sse2 which would be way better.
+ */
+ for (i=0; i < vs_type.length; i++) {
+ temp = LLVMBuildExtractElement(builder, clipmask_bool,
lp_build_const_int32(gallivm, i) , "");
ret = LLVMBuildOr(builder, ret, temp, "");
}
-
- LLVMBuildStore(builder, ret, ret_ptr);
+ return ret;
}
@@ -1236,7 +1059,7 @@ static void
draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
boolean elts)
{
- struct gallivm_state *gallivm = llvm->gallivm;
+ struct gallivm_state *gallivm = variant->gallivm;
LLVMContextRef context = gallivm->context;
LLVMTypeRef int32_type = LLVMInt32TypeInContext(context);
LLVMTypeRef arg_types[8];
@@ -1244,6 +1067,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
LLVMValueRef context_ptr;
LLVMBasicBlockRef block;
LLVMBuilderRef builder;
+ struct lp_type vs_type;
LLVMValueRef end, start;
LLVMValueRef count, fetch_elts, fetch_count;
LLVMValueRef stride, step, io_itr;
@@ -1255,12 +1079,11 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
unsigned i, j;
struct lp_build_context bld;
struct lp_build_loop_state lp_loop;
- const int max_vertices = 4;
+ const int vector_length = lp_native_vector_width / 32;
LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][TGSI_NUM_CHANNELS];
LLVMValueRef fetch_max;
- void *code;
struct lp_build_sampler_soa *sampler = 0;
- LLVMValueRef ret, ret_ptr;
+ LLVMValueRef ret, clipmask_bool_ptr;
const boolean bypass_viewport = variant->key.bypass_viewport;
const boolean enable_cliptest = variant->key.clip_xy ||
variant->key.clip_z ||
@@ -1273,16 +1096,16 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
memset(&system_values, 0, sizeof(system_values));
- arg_types[0] = get_context_ptr_type(llvm); /* context */
- arg_types[1] = get_vertex_header_ptr_type(llvm); /* vertex_header */
- arg_types[2] = get_buffer_ptr_type(llvm); /* vbuffers */
+ arg_types[0] = get_context_ptr_type(variant); /* context */
+ arg_types[1] = get_vertex_header_ptr_type(variant); /* vertex_header */
+ arg_types[2] = get_buffer_ptr_type(variant); /* vbuffers */
if (elts)
arg_types[3] = LLVMPointerType(int32_type, 0);/* fetch_elts * */
else
arg_types[3] = int32_type; /* start */
arg_types[4] = int32_type; /* fetch_count / count */
arg_types[5] = int32_type; /* stride */
- arg_types[6] = get_vb_ptr_type(llvm); /* pipe_vertex_buffer's */
+ arg_types[6] = get_vb_ptr_type(variant); /* pipe_vertex_buffer's */
arg_types[7] = int32_type; /* instance_id */
func_type = LLVMFunctionType(int32_type, arg_types, Elements(arg_types), 0);
@@ -1341,9 +1164,16 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
lp_build_context_init(&bld, gallivm, lp_type_int(32));
- /* function will return non-zero i32 value if any clipped vertices */
- ret_ptr = lp_build_alloca(gallivm, int32_type, "");
- LLVMBuildStore(builder, zero, ret_ptr);
+ memset(&vs_type, 0, sizeof vs_type);
+ vs_type.floating = TRUE; /* floating point values */
+ vs_type.sign = TRUE; /* values are signed */
+ vs_type.norm = FALSE; /* values are not limited to [0,1] or [-1,1] */
+ vs_type.width = 32; /* 32-bit float */
+ vs_type.length = vector_length;
+
+ /* hold temporary "bool" clipmask */
+ clipmask_bool_ptr = lp_build_alloca(gallivm, lp_build_int_vec_type(gallivm, vs_type), "");
+ LLVMBuildStore(builder, lp_build_zero(gallivm, lp_int_type(vs_type)), clipmask_bool_ptr);
/* code generated texture sampling */
sampler = draw_llvm_sampler_soa_create(
@@ -1358,14 +1188,14 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
end = lp_build_add(&bld, start, count);
}
- step = lp_build_const_int32(gallivm, max_vertices);
+ step = lp_build_const_int32(gallivm, vector_length);
fetch_max = LLVMBuildSub(builder, end, one, "fetch_max");
lp_build_loop_begin(&lp_loop, gallivm, start);
{
LLVMValueRef inputs[PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS];
- LLVMValueRef aos_attribs[PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS] = { { 0 } };
+ LLVMValueRef aos_attribs[PIPE_MAX_SHADER_INPUTS][LP_MAX_VECTOR_WIDTH / 32] = { { 0 } };
LLVMValueRef io;
LLVMValueRef clipmask; /* holds the clipmask value */
const LLVMValueRef (*ptr_aos)[TGSI_NUM_CHANNELS];
@@ -1377,11 +1207,11 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
io = LLVMBuildGEP(builder, io_ptr, &io_itr, 1, "");
#if DEBUG_STORE
- lp_build_printf(builder, " --- io %d = %p, loop counter %d\n",
+ lp_build_printf(gallivm, " --- io %d = %p, loop counter %d\n",
io_itr, io, lp_loop.counter);
#endif
- system_values.vertex_id = lp_build_zero(gallivm, lp_type_uint_vec(32));
- for (i = 0; i < TGSI_NUM_CHANNELS; ++i) {
+ system_values.vertex_id = lp_build_zero(gallivm, lp_type_uint_vec(32, 32*vector_length));
+ for (i = 0; i < vector_length; ++i) {
LLVMValueRef true_index =
LLVMBuildAdd(builder,
lp_loop.counter,
@@ -1413,11 +1243,12 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
}
}
convert_to_soa(gallivm, aos_attribs, inputs,
- draw->pt.nr_vertex_elements);
+ draw->pt.nr_vertex_elements, vs_type);
ptr_aos = (const LLVMValueRef (*)[TGSI_NUM_CHANNELS]) inputs;
- generate_vs(llvm,
+ generate_vs(variant,
builder,
+ vs_type,
outputs,
ptr_aos,
&system_values,
@@ -1426,29 +1257,34 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
variant->key.clamp_vertex_color);
/* store original positions in clip before further manipulation */
- store_clip(gallivm, io, outputs, 0, cv);
- store_clip(gallivm, io, outputs, 1, pos);
+ store_clip(gallivm, vs_type, io, outputs, 0, cv);
+ store_clip(gallivm, vs_type, io, outputs, 1, pos);
/* do cliptest */
if (enable_cliptest) {
+ LLVMValueRef temp = LLVMBuildLoad(builder, clipmask_bool_ptr, "");
/* allocate clipmask, assign it integer type */
- clipmask = generate_clipmask(llvm, outputs,
+ clipmask = generate_clipmask(llvm,
+ gallivm,
+ vs_type,
+ outputs,
variant->key.clip_xy,
variant->key.clip_z,
variant->key.clip_user,
variant->key.clip_halfz,
variant->key.ucp_enable,
context_ptr, &have_clipdist);
- /* return clipping boolean value for function */
- clipmask_bool(gallivm, clipmask, ret_ptr);
+ temp = LLVMBuildOr(builder, clipmask, temp, "");
+ /* store temporary clipping boolean value */
+ LLVMBuildStore(builder, temp, clipmask_bool_ptr);
}
else {
- clipmask = lp_build_const_int_vec(gallivm, lp_type_int_vec(32), 0);
+ clipmask = lp_build_const_int_vec(gallivm, lp_int_type(vs_type), 0);
}
/* do viewport mapping */
if (!bypass_viewport) {
- generate_viewport(llvm, builder, outputs, context_ptr);
+ generate_viewport(variant, builder, vs_type, outputs, context_ptr);
}
/* store clipmask in vertex header,
@@ -1456,43 +1292,20 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
* and transformed positions in data
*/
convert_to_aos(gallivm, io, outputs, clipmask,
- vs_info->num_outputs, max_vertices, have_clipdist);
+ vs_info->num_outputs, vs_type,
+ have_clipdist);
}
lp_build_loop_end_cond(&lp_loop, end, step, LLVMIntUGE);
sampler->destroy(sampler);
- ret = LLVMBuildLoad(builder, ret_ptr, "");
- LLVMBuildRet(builder, ret);
-
- /*
- * Translate the LLVM IR into machine code.
- */
-#ifdef DEBUG
- if (LLVMVerifyFunction(variant_func, LLVMPrintMessageAction)) {
- lp_debug_dump_value(variant_func);
- assert(0);
- }
-#endif
-
- LLVMRunFunctionPassManager(gallivm->passmgr, variant_func);
+ /* return clipping boolean value for function */
+ ret = clipmask_booli32(gallivm, vs_type, clipmask_bool_ptr);
- if (gallivm_debug & GALLIVM_DEBUG_IR) {
- lp_debug_dump_value(variant_func);
- debug_printf("\n");
- }
-
- code = LLVMGetPointerToGlobal(gallivm->engine, variant_func);
- if (elts)
- variant->jit_func_elts = (draw_jit_vert_func_elts) pointer_to_func(code);
- else
- variant->jit_func = (draw_jit_vert_func) pointer_to_func(code);
+ LLVMBuildRet(builder, ret);
- if (gallivm_debug & GALLIVM_DEBUG_ASM) {
- lp_disassemble(code);
- }
- lp_func_delete_body(variant_func);
+ gallivm_verify_function(gallivm, variant_func);
}
@@ -1600,17 +1413,17 @@ draw_llvm_destroy_variant(struct draw_llvm_variant *variant)
struct draw_llvm *llvm = variant->llvm;
if (variant->function_elts) {
- LLVMFreeMachineCodeForFunction(llvm->gallivm->engine,
- variant->function_elts);
- LLVMDeleteFunction(variant->function_elts);
+ gallivm_free_function(variant->gallivm,
+ variant->function_elts, variant->jit_func_elts);
}
if (variant->function) {
- LLVMFreeMachineCodeForFunction(llvm->gallivm->engine,
- variant->function);
- LLVMDeleteFunction(variant->function);
+ gallivm_free_function(variant->gallivm,
+ variant->function, variant->jit_func);
}
+ gallivm_destroy(variant->gallivm);
+
remove_from_list(&variant->list_item_local);
variant->shader->variants_cached--;
remove_from_list(&variant->list_item_global);
diff --git a/src/gallium/auxiliary/draw/draw_llvm.h b/src/gallium/auxiliary/draw/draw_llvm.h
index 31fc2db05bd..39d83cfe99f 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.h
+++ b/src/gallium/auxiliary/draw/draw_llvm.h
@@ -36,11 +36,6 @@
#include "pipe/p_context.h"
#include "util/u_simple_list.h"
-#include <llvm-c/Core.h>
-#include <llvm-c/Analysis.h>
-#include <llvm-c/Target.h>
-#include <llvm-c/ExecutionEngine.h>
-
struct draw_llvm;
struct llvm_vertex_shader;
@@ -220,6 +215,14 @@ struct draw_llvm_variant_list_item
struct draw_llvm_variant
{
+ struct gallivm_state *gallivm;
+
+ /* LLVM JIT builder types */
+ LLVMTypeRef context_ptr_type;
+ LLVMTypeRef buffer_ptr_type;
+ LLVMTypeRef vb_ptr_type;
+ LLVMTypeRef vertex_header_ptr_type;
+
LLVMValueRef function;
LLVMValueRef function_elts;
draw_jit_vert_func jit_func;
@@ -249,16 +252,8 @@ struct draw_llvm {
struct draw_jit_context jit_context;
- struct gallivm_state *gallivm;
-
struct draw_llvm_variant_list_item vs_variants_list;
int nr_variants;
-
- /* LLVM JIT builder types */
- LLVMTypeRef context_ptr_type;
- LLVMTypeRef buffer_ptr_type;
- LLVMTypeRef vb_ptr_type;
- LLVMTypeRef vertex_header_ptr_type;
};
@@ -270,7 +265,7 @@ llvm_vertex_shader(struct draw_vertex_shader *vs)
struct draw_llvm *
-draw_llvm_create(struct draw_context *draw, struct gallivm_state *gallivm);
+draw_llvm_create(struct draw_context *draw);
void
draw_llvm_destroy(struct draw_llvm *llvm);
@@ -286,11 +281,6 @@ draw_llvm_destroy_variant(struct draw_llvm_variant *variant);
struct draw_llvm_variant_key *
draw_llvm_make_variant_key(struct draw_llvm *llvm, char *store);
-LLVMValueRef
-draw_llvm_translate_from(struct gallivm_state *gallivm,
- LLVMValueRef vbuffer,
- enum pipe_format from_format);
-
struct lp_build_sampler_soa *
draw_llvm_sampler_soa_create(const struct lp_sampler_static_state *static_state,
LLVMValueRef context_ptr);
diff --git a/src/gallium/auxiliary/draw/draw_llvm_sample.c b/src/gallium/auxiliary/draw/draw_llvm_sample.c
index 0a8b3bc535f..1dbe5f5bd19 100644
--- a/src/gallium/auxiliary/draw/draw_llvm_sample.c
+++ b/src/gallium/auxiliary/draw/draw_llvm_sample.c
@@ -173,8 +173,7 @@ draw_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base,
unsigned unit,
unsigned num_coords,
const LLVMValueRef *coords,
- const LLVMValueRef *ddx,
- const LLVMValueRef *ddy,
+ const struct lp_derivatives *derivs,
LLVMValueRef lod_bias, /* optional */
LLVMValueRef explicit_lod, /* optional */
LLVMValueRef *texel)
@@ -189,7 +188,7 @@ draw_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base,
type,
unit,
num_coords, coords,
- ddx, ddy,
+ derivs,
lod_bias, explicit_lod,
texel);
}
@@ -201,6 +200,7 @@ draw_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base,
static void
draw_llvm_sampler_soa_emit_size_query(const struct lp_build_sampler_soa *base,
struct gallivm_state *gallivm,
+ struct lp_type type,
unsigned unit,
LLVMValueRef explicit_lod, /* optional */
LLVMValueRef *sizes_out)
@@ -212,6 +212,7 @@ draw_llvm_sampler_soa_emit_size_query(const struct lp_build_sampler_soa *base,
lp_build_size_query_soa(gallivm,
&sampler->dynamic_state.static_state[unit],
&sampler->dynamic_state.base,
+ type,
unit,
explicit_lod,
sizes_out);
diff --git a/src/gallium/auxiliary/draw/draw_llvm_translate.c b/src/gallium/auxiliary/draw/draw_llvm_translate.c
deleted file mode 100644
index 77d0af74733..00000000000
--- a/src/gallium/auxiliary/draw/draw_llvm_translate.c
+++ /dev/null
@@ -1,506 +0,0 @@
-#include "draw_private.h"
-#include "draw_context.h"
-
-#include "draw_llvm.h"
-
-#include "gallivm/lp_bld_const.h"
-#include "gallivm/lp_bld_struct.h"
-#include "gallivm/lp_bld_format.h"
-#include "gallivm/lp_bld_debug.h"
-#include "gallivm/lp_bld_type.h"
-
-#include "util/u_memory.h"
-#include "util/u_format.h"
-#include "pipe/p_state.h"
-
-
-#define DRAW_DBG 0
-
-static LLVMValueRef
-from_64_float(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val,
- LLVMPointerType(LLVMDoubleTypeInContext(gallivm->context), 0) , "");
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, "");
- return LLVMBuildFPTrunc(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
-}
-
-static LLVMValueRef
-from_32_float(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val,
- LLVMPointerType(LLVMFloatTypeInContext(gallivm->context), 0) , "");
- return LLVMBuildLoad(gallivm->builder, bc, "");
-}
-
-static INLINE LLVMValueRef
-from_8_uscaled(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, val, "");
- return LLVMBuildUIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
-}
-
-static INLINE LLVMValueRef
-from_16_uscaled(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val,
- LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 16), 0) , "");
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, "");
- return LLVMBuildUIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
-}
-
-static INLINE LLVMValueRef
-from_32_uscaled(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val,
- LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 32), 0) , "");
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, "");
- return LLVMBuildUIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
-}
-
-static INLINE LLVMValueRef
-from_8_sscaled(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, val, "");
- return LLVMBuildSIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
-}
-
-static INLINE LLVMValueRef
-from_16_sscaled(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val,
- LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 16), 0) , "");
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, "");
- return LLVMBuildSIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
-}
-
-static INLINE LLVMValueRef
-from_32_sscaled(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val,
- LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 32), 0) , "");
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, "");
- return LLVMBuildSIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
-}
-
-
-static INLINE LLVMValueRef
-from_8_unorm(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, val, "");
- LLVMValueRef uscaled = LLVMBuildUIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
- return LLVMBuildFDiv(gallivm->builder, uscaled,
- lp_build_const_float(gallivm, 255.), "");
-}
-
-static INLINE LLVMValueRef
-from_16_unorm(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val,
- LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 16), 0) , "");
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, "");
- LLVMValueRef uscaled = LLVMBuildUIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
- return LLVMBuildFDiv(gallivm->builder, uscaled,
- lp_build_const_float(gallivm, 65535.), "");
-}
-
-static INLINE LLVMValueRef
-from_32_unorm(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val,
- LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 32), 0) , "");
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, "");
- LLVMValueRef uscaled = LLVMBuildUIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
-
- return LLVMBuildFDiv(gallivm->builder, uscaled,
- lp_build_const_float(gallivm, 4294967295.), "");
-}
-
-static INLINE LLVMValueRef
-from_8_snorm(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, val, "");
- LLVMValueRef uscaled = LLVMBuildSIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
- return LLVMBuildFDiv(gallivm->builder, uscaled,
- lp_build_const_float(gallivm, 127.0), "");
-}
-
-static INLINE LLVMValueRef
-from_16_snorm(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val,
- LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 16), 0) , "");
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, "");
- LLVMValueRef uscaled = LLVMBuildSIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
- return LLVMBuildFDiv(gallivm->builder, uscaled,
- lp_build_const_float(gallivm, 32767.0f), "");
-}
-
-static INLINE LLVMValueRef
-from_32_snorm(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val,
- LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 32), 0) , "");
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, "");
- LLVMValueRef uscaled = LLVMBuildSIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
-
- return LLVMBuildFDiv(gallivm->builder, uscaled,
- lp_build_const_float(gallivm, 2147483647.0), "");
-}
-
-static INLINE LLVMValueRef
-from_32_fixed(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val,
- LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 32), 0) , "");
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, "");
- LLVMValueRef uscaled = LLVMBuildSIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
-
- return LLVMBuildFDiv(gallivm->builder, uscaled,
- lp_build_const_float(gallivm, 65536.0), "");
-}
-
-static LLVMValueRef
-to_64_float(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
- return LLVMBuildFPExt(gallivm->builder, l, LLVMDoubleTypeInContext(gallivm->context), "");
-}
-
-static LLVMValueRef
-to_32_float(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
- return LLVMBuildLoad(gallivm->builder, fp, "");
-}
-
-static INLINE LLVMValueRef
-to_8_uscaled(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
- return LLVMBuildFPToUI(gallivm->builder, l, LLVMIntTypeInContext(gallivm->context, 8), "");
-}
-
-static INLINE LLVMValueRef
-to_16_uscaled(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
- return LLVMBuildFPToUI(gallivm->builder, l, LLVMIntTypeInContext(gallivm->context, 16), "");
-}
-
-static INLINE LLVMValueRef
-to_32_uscaled(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
- return LLVMBuildFPToUI(gallivm->builder, l, LLVMIntTypeInContext(gallivm->context, 32), "");
-}
-
-static INLINE LLVMValueRef
-to_8_sscaled(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
- return LLVMBuildFPToSI(gallivm->builder, l, LLVMIntTypeInContext(gallivm->context, 8), "");
-}
-
-static INLINE LLVMValueRef
-to_16_sscaled(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
- return LLVMBuildFPToSI(gallivm->builder, l, LLVMIntTypeInContext(gallivm->context, 16), "");
-}
-
-static INLINE LLVMValueRef
-to_32_sscaled(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
- return LLVMBuildFPToSI(gallivm->builder, l, LLVMIntTypeInContext(gallivm->context, 32), "");
-}
-
-static INLINE LLVMValueRef
-to_8_unorm(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
- LLVMValueRef uscaled = LLVMBuildFPToUI(gallivm->builder, l,
- LLVMIntTypeInContext(gallivm->context, 8), "");
- return LLVMBuildFMul(gallivm->builder, uscaled,
- lp_build_const_float(gallivm, 255.), "");
-}
-
-static INLINE LLVMValueRef
-to_16_unorm(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
- LLVMValueRef uscaled = LLVMBuildFPToUI(gallivm->builder, l,
- LLVMIntTypeInContext(gallivm->context, 32), "");
- return LLVMBuildFMul(gallivm->builder, uscaled,
- lp_build_const_float(gallivm, 65535.), "");
-}
-
-static INLINE LLVMValueRef
-to_32_unorm(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
- LLVMValueRef uscaled = LLVMBuildFPToUI(gallivm->builder, l,
- LLVMIntTypeInContext(gallivm->context, 32), "");
-
- return LLVMBuildFMul(gallivm->builder, uscaled,
- lp_build_const_float(gallivm, 4294967295.), "");
-}
-
-static INLINE LLVMValueRef
-to_8_snorm(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, val, "");
- LLVMValueRef uscaled = LLVMBuildFPToSI(gallivm->builder, l,
- LLVMIntTypeInContext(gallivm->context, 8), "");
- return LLVMBuildFMul(gallivm->builder, uscaled,
- lp_build_const_float(gallivm, 127.0), "");
-}
-
-static INLINE LLVMValueRef
-to_16_snorm(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
- LLVMValueRef uscaled = LLVMBuildFPToSI(gallivm->builder, l,
- LLVMIntTypeInContext(gallivm->context, 16), "");
- return LLVMBuildFMul(gallivm->builder, uscaled,
- lp_build_const_float(gallivm, 32767.0f), "");
-}
-
-static INLINE LLVMValueRef
-to_32_snorm(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
- LLVMValueRef uscaled = LLVMBuildFPToSI(gallivm->builder, l,
- LLVMIntTypeInContext(gallivm->context, 32), "");
-
- return LLVMBuildFMul(gallivm->builder, uscaled,
- lp_build_const_float(gallivm, 2147483647.0), "");
-}
-
-static INLINE LLVMValueRef
-to_32_fixed(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
- LLVMValueRef uscaled = LLVMBuildFPToSI(gallivm->builder, l,
- LLVMIntTypeInContext(gallivm->context, 32), "");
-
- return LLVMBuildFMul(gallivm->builder, uscaled,
- lp_build_const_float(gallivm, 65536.0), "");
-}
-
-typedef LLVMValueRef (*from_func)(struct gallivm_state *, LLVMValueRef);
-typedef LLVMValueRef (*to_func)(struct gallivm_state *, LLVMValueRef);
-
-/* so that underneath can avoid function calls which are prohibited
- * for static initialization we need this conversion */
-enum ll_type {
- LL_Double,
- LL_Float,
- LL_Int32,
- LL_Int16,
- LL_Int8
-};
-
-static INLINE LLVMTypeRef
-ll_type_to_llvm(struct gallivm_state *gallivm, enum ll_type type)
-{
- switch (type) {
- case LL_Double:
- return LLVMDoubleTypeInContext(gallivm->context);
- case LL_Float:
- return LLVMFloatTypeInContext(gallivm->context);
- case LL_Int32:
- return LLVMInt32TypeInContext(gallivm->context);
- case LL_Int16:
- return LLVMIntTypeInContext(gallivm->context, 16);
- case LL_Int8:
- return LLVMIntTypeInContext(gallivm->context, 8);
- }
- return LLVMIntTypeInContext(gallivm->context, 8);
-}
-
-static INLINE int
-ll_type_size(enum ll_type type)
-{
- switch (type) {
- case LL_Double:
- return 8;
- case LL_Float:
- return 4;
- case LL_Int32:
- return 4;
- case LL_Int16:
- return 2;
- case LL_Int8:
- return 1;
- }
- return 1;
-}
-
-struct draw_llvm_translate {
- int format;
- from_func from;
- to_func to;
- enum ll_type type;
- int num_components;
-} translates[] =
-{
- {PIPE_FORMAT_R64_FLOAT, from_64_float, to_64_float, LL_Double, 1},
- {PIPE_FORMAT_R64G64_FLOAT, from_64_float, to_64_float, LL_Double, 2},
- {PIPE_FORMAT_R64G64B64_FLOAT, from_64_float, to_64_float, LL_Double, 3},
- {PIPE_FORMAT_R64G64B64A64_FLOAT, from_64_float, to_64_float, LL_Double, 4},
- {PIPE_FORMAT_R32_FLOAT, from_32_float, to_32_float, LL_Float, 1},
- {PIPE_FORMAT_R32G32_FLOAT, from_32_float, to_32_float, LL_Float, 2},
- {PIPE_FORMAT_R32G32B32_FLOAT, from_32_float, to_32_float, LL_Float, 3},
- {PIPE_FORMAT_R32G32B32A32_FLOAT, from_32_float, to_32_float, LL_Float, 4},
-
- {PIPE_FORMAT_R32_UNORM, from_32_unorm, to_32_unorm, LL_Int32, 1},
- {PIPE_FORMAT_R32G32_UNORM, from_32_unorm, to_32_unorm, LL_Int32, 2},
- {PIPE_FORMAT_R32G32B32_UNORM, from_32_unorm, to_32_unorm, LL_Int32, 3},
- {PIPE_FORMAT_R32G32B32A32_UNORM, from_32_unorm, to_32_unorm, LL_Int32, 4},
-
- {PIPE_FORMAT_R32_USCALED, from_32_uscaled, to_32_uscaled, LL_Int32, 1},
- {PIPE_FORMAT_R32G32_USCALED, from_32_uscaled, to_32_uscaled, LL_Int32, 2},
- {PIPE_FORMAT_R32G32B32_USCALED, from_32_uscaled, to_32_uscaled, LL_Int32, 3},
- {PIPE_FORMAT_R32G32B32A32_USCALED, from_32_uscaled, to_32_uscaled, LL_Int32, 4},
-
- {PIPE_FORMAT_R32_SNORM, from_32_snorm, to_32_snorm, LL_Int32, 1},
- {PIPE_FORMAT_R32G32_SNORM, from_32_snorm, to_32_snorm, LL_Int32, 2},
- {PIPE_FORMAT_R32G32B32_SNORM, from_32_snorm, to_32_snorm, LL_Int32, 3},
- {PIPE_FORMAT_R32G32B32A32_SNORM, from_32_snorm, to_32_snorm, LL_Int32, 4},
-
- {PIPE_FORMAT_R32_SSCALED, from_32_sscaled, to_32_sscaled, LL_Int32, 1},
- {PIPE_FORMAT_R32G32_SSCALED, from_32_sscaled, to_32_sscaled, LL_Int32, 2},
- {PIPE_FORMAT_R32G32B32_SSCALED, from_32_sscaled, to_32_sscaled, LL_Int32, 3},
- {PIPE_FORMAT_R32G32B32A32_SSCALED, from_32_sscaled, to_32_sscaled, LL_Int32, 4},
-
- {PIPE_FORMAT_R16_UNORM, from_16_unorm, to_16_unorm, LL_Int16, 1},
- {PIPE_FORMAT_R16G16_UNORM, from_16_unorm, to_16_unorm, LL_Int16, 2},
- {PIPE_FORMAT_R16G16B16_UNORM, from_16_unorm, to_16_unorm, LL_Int16, 3},
- {PIPE_FORMAT_R16G16B16A16_UNORM, from_16_unorm, to_16_unorm, LL_Int16, 4},
-
- {PIPE_FORMAT_R16_USCALED, from_16_uscaled, to_16_uscaled, LL_Int16, 1},
- {PIPE_FORMAT_R16G16_USCALED, from_16_uscaled, to_16_uscaled, LL_Int16, 2},
- {PIPE_FORMAT_R16G16B16_USCALED, from_16_uscaled, to_16_uscaled, LL_Int16, 3},
- {PIPE_FORMAT_R16G16B16A16_USCALED, from_16_uscaled, to_16_uscaled, LL_Int16, 4},
-
- {PIPE_FORMAT_R16_SNORM, from_16_snorm, to_16_snorm, LL_Int16, 1},
- {PIPE_FORMAT_R16G16_SNORM, from_16_snorm, to_16_snorm, LL_Int16, 2},
- {PIPE_FORMAT_R16G16B16_SNORM, from_16_snorm, to_16_snorm, LL_Int16, 3},
- {PIPE_FORMAT_R16G16B16A16_SNORM, from_16_snorm, to_16_snorm, LL_Int16, 4},
-
- {PIPE_FORMAT_R16_SSCALED, from_16_sscaled, to_16_sscaled, LL_Int16, 1},
- {PIPE_FORMAT_R16G16_SSCALED, from_16_sscaled, to_16_sscaled, LL_Int16, 2},
- {PIPE_FORMAT_R16G16B16_SSCALED, from_16_sscaled, to_16_sscaled, LL_Int16, 3},
- {PIPE_FORMAT_R16G16B16A16_SSCALED, from_16_sscaled, to_16_sscaled, LL_Int16, 4},
-
- {PIPE_FORMAT_R8_UNORM, from_8_unorm, to_8_unorm, LL_Int8, 1},
- {PIPE_FORMAT_R8G8_UNORM, from_8_unorm, to_8_unorm, LL_Int8, 2},
- {PIPE_FORMAT_R8G8B8_UNORM, from_8_unorm, to_8_unorm, LL_Int8, 3},
- {PIPE_FORMAT_R8G8B8A8_UNORM, from_8_unorm, to_8_unorm, LL_Int8, 4},
-
- {PIPE_FORMAT_R8_USCALED, from_8_uscaled, to_8_uscaled, LL_Int8, 1},
- {PIPE_FORMAT_R8G8_USCALED, from_8_uscaled, to_8_uscaled, LL_Int8, 2},
- {PIPE_FORMAT_R8G8B8_USCALED, from_8_uscaled, to_8_uscaled, LL_Int8, 3},
- {PIPE_FORMAT_R8G8B8A8_USCALED, from_8_uscaled, to_8_uscaled, LL_Int8, 4},
-
- {PIPE_FORMAT_R8_SNORM, from_8_snorm, to_8_snorm, LL_Int8, 1},
- {PIPE_FORMAT_R8G8_SNORM, from_8_snorm, to_8_snorm, LL_Int8, 2},
- {PIPE_FORMAT_R8G8B8_SNORM, from_8_snorm, to_8_snorm, LL_Int8, 3},
- {PIPE_FORMAT_R8G8B8A8_SNORM, from_8_snorm, to_8_snorm, LL_Int8, 4},
-
- {PIPE_FORMAT_R8_SSCALED, from_8_sscaled, to_8_sscaled, LL_Int8, 1},
- {PIPE_FORMAT_R8G8_SSCALED, from_8_sscaled, to_8_sscaled, LL_Int8, 2},
- {PIPE_FORMAT_R8G8B8_SSCALED, from_8_sscaled, to_8_sscaled, LL_Int8, 3},
- {PIPE_FORMAT_R8G8B8A8_SSCALED, from_8_sscaled, to_8_sscaled, LL_Int8, 4},
-
- {PIPE_FORMAT_R32_FIXED, from_32_fixed, to_32_fixed, LL_Int32, 1},
- {PIPE_FORMAT_R32G32_FIXED, from_32_fixed, to_32_fixed, LL_Int32, 2},
- {PIPE_FORMAT_R32G32B32_FIXED, from_32_fixed, to_32_fixed, LL_Int32, 3},
- {PIPE_FORMAT_R32G32B32A32_FIXED, from_32_fixed, to_32_fixed, LL_Int32, 4},
-};
-
-
-static LLVMValueRef
-fetch(struct gallivm_state *gallivm,
- LLVMValueRef ptr, int val_size, int nr_components,
- from_func func)
-{
- int i;
- int offset = 0;
- LLVMValueRef res =
- LLVMConstNull(LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4));
- LLVMValueRef defaults[4];
-
- defaults[0] =
- defaults[1] =
- defaults[2] = lp_build_const_float(gallivm, 0.0);
- defaults[3] = lp_build_const_float(gallivm, 1.0);
-
- for (i = 0; i < nr_components; ++i) {
- LLVMValueRef src_index = lp_build_const_int32(gallivm, offset);
- LLVMValueRef dst_index = lp_build_const_int32(gallivm, i);
- LLVMValueRef src_tmp;
- LLVMValueRef component;
-
- src_tmp = LLVMBuildGEP(gallivm->builder, ptr, &src_index, 1, "src_tmp");
-
- /* convert src_tmp to float */
- component = func(gallivm, src_tmp);
-
- /* vec.comp = component */
- res = LLVMBuildInsertElement(gallivm->builder,
- res,
- component,
- dst_index, "");
- offset += val_size;
- }
- for (; i < 4; ++i) {
- LLVMValueRef dst_index = lp_build_const_int32(gallivm, i);
- res = LLVMBuildInsertElement(gallivm->builder,
- res,
- defaults[i],
- dst_index, "");
- }
- return res;
-}
-
-
-LLVMValueRef
-draw_llvm_translate_from(struct gallivm_state *gallivm,
- LLVMValueRef vbuffer,
- enum pipe_format from_format)
-{
- const struct util_format_description *format_desc;
- LLVMValueRef zero;
- int i;
- struct lp_type type = lp_float32_vec4_type();
-
- /*
- * The above can only cope with straight arrays: no bitfields,
- * swizzles, or half floats.
- */
-
- for (i = 0; i < Elements(translates); ++i) {
- if (translates[i].format == from_format) {
- /*LLVMTypeRef type = ll_type_to_llvm(translates[i].type);*/
- return fetch(gallivm,
- vbuffer,
- ll_type_size(translates[i].type),
- translates[i].num_components,
- translates[i].from);
- }
- }
-
-
- /*
- * This doesn't handle anything bigger than 32bits, or half floats
- * yet.
- *
- * TODO: unify all this code into lp_build_fetch_rgba_aos().
- */
-
- format_desc = util_format_description(from_format);
- zero = LLVMConstNull(LLVMInt32TypeInContext(gallivm->context));
- return lp_build_fetch_rgba_aos(gallivm, format_desc, type, vbuffer, zero, zero, zero);
-}
diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h
index d85deeea7f5..9cede2108db 100644
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -47,8 +47,8 @@
#include "tgsi/tgsi_scan.h"
#ifdef HAVE_LLVM
-#include <llvm-c/ExecutionEngine.h>
struct draw_llvm;
+struct gallivm_state;
#endif
@@ -301,7 +301,6 @@ struct draw_context
#ifdef HAVE_LLVM
struct draw_llvm *llvm;
- struct gallivm_state *own_gallivm;
#endif
struct pipe_sampler_view *sampler_views[PIPE_MAX_VERTEX_SAMPLERS];
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
index 1e17f808408..04b286f0f5b 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
@@ -230,7 +230,7 @@ llvm_pipeline_generic( struct draw_pt_middle_end *middle,
llvm_vert_info.stride = fpme->vertex_size;
llvm_vert_info.verts =
(struct vertex_header *)MALLOC(fpme->vertex_size *
- align(fetch_info->count, 4));
+ align(fetch_info->count, lp_native_vector_width / 32));
if (!llvm_vert_info.verts) {
assert(0);
return;
@@ -423,7 +423,7 @@ draw_pt_fetch_pipeline_or_emit_llvm(struct draw_context *draw)
{
struct llvm_middle_end *fpme = 0;
- if (!draw->llvm || !draw->llvm->gallivm->engine)
+ if (!draw->llvm)
return NULL;
fpme = CALLOC_STRUCT( llvm_middle_end );
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index 9fc57629822..d226dab5b81 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -75,9 +75,9 @@ lp_build_min_simple(struct lp_build_context *bld,
LLVMValueRef a,
LLVMValueRef b)
{
- LLVMBuilderRef builder = bld->gallivm->builder;
const struct lp_type type = bld->type;
const char *intrinsic = NULL;
+ unsigned intr_size;
LLVMValueRef cond;
assert(lp_check_value(type, a));
@@ -85,31 +85,71 @@ lp_build_min_simple(struct lp_build_context *bld,
/* TODO: optimize the constant case */
- if(type.width * type.length == 128) {
- if(type.floating) {
- if(type.width == 32 && util_cpu_caps.has_sse)
+ if (type.floating && util_cpu_caps.has_sse) {
+ if (type.width == 32) {
+ if (type.length == 1) {
+ intrinsic = "llvm.x86.sse.min.ss";
+ intr_size = 128;
+ }
+ else if (type.length <= 4 || !util_cpu_caps.has_avx) {
intrinsic = "llvm.x86.sse.min.ps";
- if(type.width == 64 && util_cpu_caps.has_sse2)
+ intr_size = 128;
+ }
+ else {
+ intrinsic = "llvm.x86.avx.min.ps.256";
+ intr_size = 256;
+ }
+ }
+ if (type.width == 64 && util_cpu_caps.has_sse2) {
+ if (type.length == 1) {
+ intrinsic = "llvm.x86.sse2.min.sd";
+ intr_size = 128;
+ }
+ else if (type.length == 2 || !util_cpu_caps.has_avx) {
intrinsic = "llvm.x86.sse2.min.pd";
+ intr_size = 128;
+ }
+ else {
+ intrinsic = "llvm.x86.avx.min.pd.256";
+ intr_size = 256;
+ }
}
- else {
- if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
- intrinsic = "llvm.x86.sse2.pminu.b";
- if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
+ }
+ else if (util_cpu_caps.has_sse2 && type.length >= 2) {
+ intr_size = 128;
+ if ((type.width == 8 || type.width == 16) &&
+ (type.width * type.length <= 64) &&
+ (gallivm_debug & GALLIVM_DEBUG_PERF)) {
+ debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
+ __FUNCTION__);
+ }
+ if (type.width == 8 && !type.sign) {
+ intrinsic = "llvm.x86.sse2.pminu.b";
+ }
+ else if (type.width == 16 && type.sign) {
+ intrinsic = "llvm.x86.sse2.pmins.w";
+ }
+ if (util_cpu_caps.has_sse4_1) {
+ if (type.width == 8 && type.sign) {
intrinsic = "llvm.x86.sse41.pminsb";
- if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
+ }
+ if (type.width == 16 && !type.sign) {
intrinsic = "llvm.x86.sse41.pminuw";
- if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
- intrinsic = "llvm.x86.sse2.pmins.w";
- if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
+ }
+ if (type.width == 32 && !type.sign) {
intrinsic = "llvm.x86.sse41.pminud";
- if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
+ }
+ if (type.width == 32 && type.sign) {
intrinsic = "llvm.x86.sse41.pminsd";
+ }
}
}
- if(intrinsic)
- return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
+ if(intrinsic) {
+ return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
+ type,
+ intr_size, a, b);
+ }
cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
return lp_build_select(bld, cond, a, b);
@@ -125,9 +165,9 @@ lp_build_max_simple(struct lp_build_context *bld,
LLVMValueRef a,
LLVMValueRef b)
{
- LLVMBuilderRef builder = bld->gallivm->builder;
const struct lp_type type = bld->type;
const char *intrinsic = NULL;
+ unsigned intr_size;
LLVMValueRef cond;
assert(lp_check_value(type, a));
@@ -135,31 +175,72 @@ lp_build_max_simple(struct lp_build_context *bld,
/* TODO: optimize the constant case */
- if(type.width * type.length == 128) {
- if(type.floating) {
- if(type.width == 32 && util_cpu_caps.has_sse)
+ if (type.floating && util_cpu_caps.has_sse) {
+ if (type.width == 32) {
+ if (type.length == 1) {
+ intrinsic = "llvm.x86.sse.max.ss";
+ intr_size = 128;
+ }
+ else if (type.length <= 4 || !util_cpu_caps.has_avx) {
intrinsic = "llvm.x86.sse.max.ps";
- if(type.width == 64 && util_cpu_caps.has_sse2)
+ intr_size = 128;
+ }
+ else {
+ intrinsic = "llvm.x86.avx.max.ps.256";
+ intr_size = 256;
+ }
+ }
+ if (type.width == 64 && util_cpu_caps.has_sse2) {
+ if (type.length == 1) {
+ intrinsic = "llvm.x86.sse2.max.sd";
+ intr_size = 128;
+ }
+ else if (type.length == 2 || !util_cpu_caps.has_avx) {
intrinsic = "llvm.x86.sse2.max.pd";
+ intr_size = 128;
+ }
+ else {
+ intrinsic = "llvm.x86.avx.max.pd.256";
+ intr_size = 256;
+ }
}
- else {
- if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
- intrinsic = "llvm.x86.sse2.pmaxu.b";
- if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
+ }
+ else if (util_cpu_caps.has_sse2 && type.length >= 2) {
+ intr_size = 128;
+ if ((type.width == 8 || type.width == 16) &&
+ (type.width * type.length <= 64) &&
+ (gallivm_debug & GALLIVM_DEBUG_PERF)) {
+ debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
+ __FUNCTION__);
+ }
+ if (type.width == 8 && !type.sign) {
+ intrinsic = "llvm.x86.sse2.pmaxu.b";
+ intr_size = 128;
+ }
+ else if (type.width == 16 && type.sign) {
+ intrinsic = "llvm.x86.sse2.pmaxs.w";
+ }
+ if (util_cpu_caps.has_sse4_1) {
+ if (type.width == 8 && type.sign) {
intrinsic = "llvm.x86.sse41.pmaxsb";
- if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
+ }
+ if (type.width == 16 && !type.sign) {
intrinsic = "llvm.x86.sse41.pmaxuw";
- if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
- intrinsic = "llvm.x86.sse2.pmaxs.w";
- if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
+ }
+ if (type.width == 32 && !type.sign) {
intrinsic = "llvm.x86.sse41.pmaxud";
- if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
+ }
+ if (type.width == 32 && type.sign) {
intrinsic = "llvm.x86.sse41.pmaxsd";
+ }
}
}
- if(intrinsic)
- return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
+ if(intrinsic) {
+ return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
+ type,
+ intr_size, a, b);
+ }
cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
return lp_build_select(bld, cond, a, b);
@@ -265,15 +346,20 @@ lp_build_add(struct lp_build_context *bld,
}
-/** Return the scalar sum of the elements of a */
+/** Return the scalar sum of the elements of a.
+ * Should avoid this operation whenever possible.
+ */
LLVMValueRef
-lp_build_sum_vector(struct lp_build_context *bld,
- LLVMValueRef a)
+lp_build_horizontal_add(struct lp_build_context *bld,
+ LLVMValueRef a)
{
LLVMBuilderRef builder = bld->gallivm->builder;
const struct lp_type type = bld->type;
LLVMValueRef index, res;
- unsigned i;
+ unsigned i, length;
+ LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
+ LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
+ LLVMValueRef vecres, elem2;
assert(lp_check_value(type, a));
@@ -283,26 +369,191 @@ lp_build_sum_vector(struct lp_build_context *bld,
assert(!bld->type.norm);
- index = lp_build_const_int32(bld->gallivm, 0);
- res = LLVMBuildExtractElement(builder, a, index, "");
+ /*
+ * for byte vectors can do much better with psadbw.
+ * Using repeated shuffle/adds here. Note with multiple vectors
+ * this can be done more efficiently as outlined in the intel
+ * optimization manual.
+ * Note: could cause data rearrangement if used with smaller element
+ * sizes.
+ */
- for (i = 1; i < type.length; i++) {
- index = lp_build_const_int32(bld->gallivm, i);
- if (type.floating)
- res = LLVMBuildFAdd(builder, res,
- LLVMBuildExtractElement(builder,
- a, index, ""),
- "");
- else
- res = LLVMBuildAdd(builder, res,
- LLVMBuildExtractElement(builder,
- a, index, ""),
- "");
+ vecres = a;
+ length = type.length / 2;
+ while (length > 1) {
+ LLVMValueRef vec1, vec2;
+ for (i = 0; i < length; i++) {
+ shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
+ shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
+ }
+ vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
+ LLVMConstVector(shuffles1, length), "");
+ vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
+ LLVMConstVector(shuffles2, length), "");
+ if (type.floating) {
+ vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
+ }
+ else {
+ vecres = LLVMBuildAdd(builder, vec1, vec2, "");
+ }
+ length = length >> 1;
}
+ /* always have vector of size 2 here */
+ assert(length == 1);
+
+ index = lp_build_const_int32(bld->gallivm, 0);
+ res = LLVMBuildExtractElement(builder, vecres, index, "");
+ index = lp_build_const_int32(bld->gallivm, 1);
+ elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
+
+ if (type.floating)
+ res = LLVMBuildFAdd(builder, res, elem2, "");
+ else
+ res = LLVMBuildAdd(builder, res, elem2, "");
+
return res;
}
+/**
+ * Return the horizontal sums of 4 float vectors as a float4 vector.
+ * This uses the technique as outlined in Intel Optimization Manual.
+ */
+static LLVMValueRef
+lp_build_horizontal_add4x4f(struct lp_build_context *bld,
+ LLVMValueRef src[4])
+{
+ struct gallivm_state *gallivm = bld->gallivm;
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef shuffles[4];
+ LLVMValueRef tmp[4];
+ LLVMValueRef sumtmp[2], shuftmp[2];
+
+ /* lower half of regs */
+ shuffles[0] = lp_build_const_int32(gallivm, 0);
+ shuffles[1] = lp_build_const_int32(gallivm, 1);
+ shuffles[2] = lp_build_const_int32(gallivm, 4);
+ shuffles[3] = lp_build_const_int32(gallivm, 5);
+ tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
+ LLVMConstVector(shuffles, 4), "");
+ tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
+ LLVMConstVector(shuffles, 4), "");
+
+ /* upper half of regs */
+ shuffles[0] = lp_build_const_int32(gallivm, 2);
+ shuffles[1] = lp_build_const_int32(gallivm, 3);
+ shuffles[2] = lp_build_const_int32(gallivm, 6);
+ shuffles[3] = lp_build_const_int32(gallivm, 7);
+ tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
+ LLVMConstVector(shuffles, 4), "");
+ tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
+ LLVMConstVector(shuffles, 4), "");
+
+ sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
+ sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
+
+ shuffles[0] = lp_build_const_int32(gallivm, 0);
+ shuffles[1] = lp_build_const_int32(gallivm, 2);
+ shuffles[2] = lp_build_const_int32(gallivm, 4);
+ shuffles[3] = lp_build_const_int32(gallivm, 6);
+ shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
+ LLVMConstVector(shuffles, 4), "");
+
+ shuffles[0] = lp_build_const_int32(gallivm, 1);
+ shuffles[1] = lp_build_const_int32(gallivm, 3);
+ shuffles[2] = lp_build_const_int32(gallivm, 5);
+ shuffles[3] = lp_build_const_int32(gallivm, 7);
+ shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
+ LLVMConstVector(shuffles, 4), "");
+
+ return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
+}
+
+
+/*
+ * partially horizontally add 2-4 float vectors with length nx4,
+ * i.e. only four adjacent values in each vector will be added,
+ * assuming values are really grouped in 4 which also determines
+ * output order.
+ *
+ * Return a vector of the same length as the initial vectors,
+ * with the excess elements (if any) being undefined.
+ * The element order is independent of number of input vectors.
+ * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
+ * the output order thus will be
+ * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4z7,undef
+ */
+LLVMValueRef
+lp_build_hadd_partial4(struct lp_build_context *bld,
+ LLVMValueRef vectors[],
+ unsigned num_vecs)
+{
+ struct gallivm_state *gallivm = bld->gallivm;
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef ret_vec;
+ LLVMValueRef tmp[4];
+ const char *intrinsic = NULL;
+
+ assert(num_vecs >= 2 && num_vecs <= 4);
+ assert(bld->type.floating);
+
+ /* only use this with at least 2 vectors, as it is sort of expensive
+ * (depending on cpu) and we always need two horizontal adds anyway,
+ * so a shuffle/add approach might be better.
+ */
+
+ tmp[0] = vectors[0];
+ tmp[1] = vectors[1];
+
+ tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
+ tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
+
+ if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
+ bld->type.length == 4) {
+ intrinsic = "llvm.x86.sse3.hadd.ps";
+ }
+ else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
+ bld->type.length == 8) {
+ intrinsic = "llvm.x86.avx.hadd.ps.256";
+ }
+ if (intrinsic) {
+ tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
+ lp_build_vec_type(gallivm, bld->type),
+ tmp[0], tmp[1]);
+ if (num_vecs > 2) {
+ tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
+ lp_build_vec_type(gallivm, bld->type),
+ tmp[2], tmp[3]);
+ }
+ else {
+ tmp[1] = tmp[0];
+ }
+ return lp_build_intrinsic_binary(builder, intrinsic,
+ lp_build_vec_type(gallivm, bld->type),
+ tmp[0], tmp[1]);
+ }
+
+ if (bld->type.length == 4) {
+ ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
+ }
+ else {
+ LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
+ unsigned j;
+ unsigned num_iter = bld->type.length / 4;
+ struct lp_type parttype = bld->type;
+ parttype.length = 4;
+ for (j = 0; j < num_iter; j++) {
+ LLVMValueRef partsrc[4];
+ unsigned i;
+ for (i = 0; i < 4; i++) {
+ partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
+ }
+ partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
+ }
+ ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
+ }
+ return ret_vec;
+}
/**
* Generate a - b
@@ -553,7 +804,7 @@ lp_build_mul_imm(struct lp_build_context *bld,
if(bld->type.floating) {
#if 0
/*
- * Power of two multiplication by directly manipulating the mantissa.
+ * Power of two multiplication by directly manipulating the exponent.
*
* XXX: This might not be always faster, it will introduce a small error
* for multiplication by zero, and it will produce wrong results
@@ -612,7 +863,8 @@ lp_build_div(struct lp_build_context *bld,
return LLVMConstUDiv(a, b);
}
- if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4 &&
+ if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
+ (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
type.floating)
return lp_build_mul(bld, a, lp_build_rcp(bld, b));
@@ -871,6 +1123,12 @@ lp_build_abs(struct lp_build_context *bld,
return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
}
}
+ else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
+ (gallivm_debug & GALLIVM_DEBUG_PERF) &&
+ (type.width == 8 || type.width == 16 || type.width == 32)) {
+ debug_printf("%s: inefficient code, should split vectors manually\n",
+ __FUNCTION__);
+ }
return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
}
@@ -934,6 +1192,7 @@ lp_build_sgn(struct lp_build_context *bld,
else
{
/* signed int/norm/fixed point */
+ /* could use psign with sse3 and appropriate vectors here */
LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
res = lp_build_select(bld, cond, bld->one, minus_one);
@@ -1000,7 +1259,16 @@ lp_build_int_to_float(struct lp_build_context *bld,
return LLVMBuildSIToFP(builder, a, vec_type, "");
}
+static boolean
+sse41_rounding_available(const struct lp_type type)
+{
+ if ((util_cpu_caps.has_sse4_1 &&
+ (type.length == 1 || type.width*type.length == 128)) ||
+ (util_cpu_caps.has_avx && type.width*type.length == 256))
+ return TRUE;
+ return FALSE;
+}
enum lp_build_round_sse41_mode
{
@@ -1065,18 +1333,34 @@ lp_build_round_sse41(struct lp_build_context *bld,
res = LLVMBuildExtractElement(builder, res, index0, "");
}
else {
- assert(type.width*type.length == 128);
-
- switch(type.width) {
- case 32:
- intrinsic = "llvm.x86.sse41.round.ps";
- break;
- case 64:
- intrinsic = "llvm.x86.sse41.round.pd";
- break;
- default:
- assert(0);
- return bld->undef;
+ if (type.width * type.length == 128) {
+ switch(type.width) {
+ case 32:
+ intrinsic = "llvm.x86.sse41.round.ps";
+ break;
+ case 64:
+ intrinsic = "llvm.x86.sse41.round.pd";
+ break;
+ default:
+ assert(0);
+ return bld->undef;
+ }
+ }
+ else {
+ assert(type.width * type.length == 256);
+ assert(util_cpu_caps.has_avx);
+
+ switch(type.width) {
+ case 32:
+ intrinsic = "llvm.x86.avx.round.ps.256";
+ break;
+ case 64:
+ intrinsic = "llvm.x86.avx.round.pd.256";
+ break;
+ default:
+ assert(0);
+ return bld->undef;
+ }
}
res = lp_build_intrinsic_binary(builder, intrinsic,
@@ -1125,10 +1409,15 @@ lp_build_iround_nearest_sse2(struct lp_build_context *bld,
ret_type, arg);
}
else {
- assert(type.width*type.length == 128);
-
- intrinsic = "llvm.x86.sse2.cvtps2dq";
+ if (type.width* type.length == 128) {
+ intrinsic = "llvm.x86.sse2.cvtps2dq";
+ }
+ else {
+ assert(type.width*type.length == 256);
+ assert(util_cpu_caps.has_avx);
+ intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
+ }
res = lp_build_intrinsic_unary(builder, intrinsic,
ret_type, a);
}
@@ -1152,8 +1441,7 @@ lp_build_trunc(struct lp_build_context *bld,
assert(type.floating);
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
+ if (sse41_rounding_available(type)) {
return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
}
else {
@@ -1183,8 +1471,7 @@ lp_build_round(struct lp_build_context *bld,
assert(type.floating);
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
+ if (sse41_rounding_available(type)) {
return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
}
else {
@@ -1212,8 +1499,7 @@ lp_build_floor(struct lp_build_context *bld,
assert(type.floating);
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
+ if (sse41_rounding_available(type)) {
return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
}
else {
@@ -1241,8 +1527,7 @@ lp_build_ceil(struct lp_build_context *bld,
assert(type.floating);
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
+ if (sse41_rounding_available(type)) {
return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
}
else {
@@ -1269,6 +1554,34 @@ lp_build_fract(struct lp_build_context *bld,
/**
+ * Prevent returning a fractional part of 1.0 for very small negative values of
+ * 'a' by clamping against 0.99999(9).
+ */
+static inline LLVMValueRef
+clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
+{
+ LLVMValueRef max;
+
+ /* this is the largest number smaller than 1.0 representable as float */
+ max = lp_build_const_vec(bld->gallivm, bld->type,
+ 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
+ return lp_build_min(bld, fract, max);
+}
+
+
+/**
+ * Same as lp_build_fract, but guarantees that the result is always smaller
+ * than one.
+ */
+LLVMValueRef
+lp_build_fract_safe(struct lp_build_context *bld,
+ LLVMValueRef a)
+{
+ return clamp_fract(bld, lp_build_fract(bld, a));
+}
+
+
+/**
* Return the integer part of a float (vector) value (== round toward zero).
* The returned value is an integer (vector).
* Ex: itrunc(-1.5) = -1
@@ -1307,12 +1620,12 @@ lp_build_iround(struct lp_build_context *bld,
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse2 &&
- ((type.width == 32) && (type.length == 1 || type.length == 4))) {
+ if ((util_cpu_caps.has_sse2 &&
+ ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
+ (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
return lp_build_iround_nearest_sse2(bld, a);
}
- else if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
+ if (sse41_rounding_available(type)) {
res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
}
else {
@@ -1362,14 +1675,12 @@ lp_build_ifloor(struct lp_build_context *bld,
assert(type.floating);
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
- res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
- }
- else {
- res = a;
-
- if (type.sign) {
+ res = a;
+ if (type.sign) {
+ if (sse41_rounding_available(type)) {
+ res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
+ }
+ else {
/* Take the sign bit and add it to 1 constant */
LLVMTypeRef vec_type = bld->vec_type;
unsigned mantissa = lp_mantissa(type);
@@ -1423,8 +1734,7 @@ lp_build_iceil(struct lp_build_context *bld,
assert(type.floating);
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
+ if (sse41_rounding_available(type)) {
res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
}
else {
@@ -1470,7 +1780,7 @@ lp_build_iceil(struct lp_build_context *bld,
* Combined ifloor() & fract().
*
* Preferred to calling the functions separately, as it will ensure that the
- * stratergy (floor() vs ifloor()) that results in less redundant work is used.
+ * strategy (floor() vs ifloor()) that results in less redundant work is used.
*/
void
lp_build_ifloor_fract(struct lp_build_context *bld,
@@ -1485,8 +1795,7 @@ lp_build_ifloor_fract(struct lp_build_context *bld,
assert(type.floating);
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
+ if (sse41_rounding_available(type)) {
/*
* floor() is easier.
*/
@@ -1507,6 +1816,21 @@ lp_build_ifloor_fract(struct lp_build_context *bld,
}
+/**
+ * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
+ * always smaller than one.
+ */
+void
+lp_build_ifloor_fract_safe(struct lp_build_context *bld,
+ LLVMValueRef a,
+ LLVMValueRef *out_ipart,
+ LLVMValueRef *out_fpart)
+{
+ lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
+ *out_fpart = clamp_fract(bld, *out_fpart);
+}
+
+
LLVMValueRef
lp_build_sqrt(struct lp_build_context *bld,
LLVMValueRef a)
@@ -1519,10 +1843,14 @@ lp_build_sqrt(struct lp_build_context *bld,
assert(lp_check_value(type, a));
/* TODO: optimize the constant case */
- /* TODO: optimize the constant case */
assert(type.floating);
- util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
+ if (type.length == 1) {
+ util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.f%u", type.width);
+ }
+ else {
+ util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
+ }
return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
}
@@ -1586,19 +1914,28 @@ lp_build_rcp(struct lp_build_context *bld,
* - it doesn't even get the reciprocate of 1.0 exactly
* - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf
* - for recent processors the benefit over DIVPS is marginal, a case
- * depedent
+ * dependent
*
* We could still use it on certain processors if benchmarks show that the
* RCPPS plus necessary workarounds are still preferrable to DIVPS; or for
* particular uses that require less workarounds.
*/
- if (FALSE && util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
+ if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
+ (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
const unsigned num_iterations = 0;
LLVMValueRef res;
unsigned i;
+ const char *intrinsic = NULL;
- res = lp_build_intrinsic_unary(builder, "llvm.x86.sse.rcp.ps", bld->vec_type, a);
+ if (type.length == 4) {
+ intrinsic = "llvm.x86.sse.rcp.ps";
+ }
+ else {
+ intrinsic = "llvm.x86.avx.rcp.ps.256";
+ }
+
+ res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
for (i = 0; i < num_iterations; ++i) {
res = lp_build_rcp_refine(bld, a, res);
@@ -1653,12 +1990,22 @@ lp_build_rsqrt(struct lp_build_context *bld,
assert(type.floating);
- if (util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
+ if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
+ (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
const unsigned num_iterations = 1;
LLVMValueRef res;
unsigned i;
+ const char *intrinsic = NULL;
+
+ if (type.length == 4) {
+ intrinsic = "llvm.x86.sse.rsqrt.ps";
+ }
+ else {
+ intrinsic = "llvm.x86.avx.rsqrt.ps.256";
+ }
+
+ res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
- res = lp_build_intrinsic_unary(builder, "llvm.x86.sse.rsqrt.ps", bld->vec_type, a);
for (i = 0; i < num_iterations; ++i) {
res = lp_build_rsqrt_refine(bld, a, res);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.h b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
index aeb987ff352..60b9907e60f 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
@@ -57,8 +57,13 @@ lp_build_add(struct lp_build_context *bld,
LLVMValueRef b);
LLVMValueRef
-lp_build_sum_vector(struct lp_build_context *bld,
- LLVMValueRef a);
+lp_build_horizontal_add(struct lp_build_context *bld,
+ LLVMValueRef a);
+
+LLVMValueRef
+lp_build_hadd_partial4(struct lp_build_context *bld,
+ LLVMValueRef vectors[],
+ unsigned num_vecs);
LLVMValueRef
lp_build_sub(struct lp_build_context *bld,
@@ -157,6 +162,10 @@ lp_build_fract(struct lp_build_context *bld,
LLVMValueRef a);
LLVMValueRef
+lp_build_fract_safe(struct lp_build_context *bld,
+ LLVMValueRef a);
+
+LLVMValueRef
lp_build_ifloor(struct lp_build_context *bld,
LLVMValueRef a);
LLVMValueRef
@@ -177,6 +186,12 @@ lp_build_ifloor_fract(struct lp_build_context *bld,
LLVMValueRef *out_ipart,
LLVMValueRef *out_fpart);
+void
+lp_build_ifloor_fract_safe(struct lp_build_context *bld,
+ LLVMValueRef a,
+ LLVMValueRef *out_ipart,
+ LLVMValueRef *out_fpart);
+
LLVMValueRef
lp_build_sqrt(struct lp_build_context *bld,
LLVMValueRef a);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_const.c b/src/gallium/auxiliary/gallivm/lp_bld_const.c
index 59e8fb2ed6e..35799a1ef8e 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_const.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_const.c
@@ -37,6 +37,7 @@
#include "util/u_debug.h"
#include "util/u_math.h"
+#include "util/u_half.h"
#include "lp_bld_type.h"
#include "lp_bld_const.h"
@@ -50,10 +51,12 @@ lp_mantissa(struct lp_type type)
if(type.floating) {
switch(type.width) {
+ case 16:
+ return 10;
case 32:
return 23;
case 64:
- return 53;
+ return 52;
default:
assert(0);
return 0;
@@ -136,6 +139,8 @@ lp_const_min(struct lp_type type)
if (type.floating) {
switch(type.width) {
+ case 16:
+ return -65504;
case 32:
return -FLT_MAX;
case 64:
@@ -169,6 +174,8 @@ lp_const_max(struct lp_type type)
if (type.floating) {
switch(type.width) {
+ case 16:
+ return 65504;
case 32:
return FLT_MAX;
case 64:
@@ -196,6 +203,8 @@ lp_const_eps(struct lp_type type)
{
if (type.floating) {
switch(type.width) {
+ case 16:
+ return 2E-10;
case 32:
return FLT_EPSILON;
case 64:
@@ -247,7 +256,9 @@ lp_build_one(struct gallivm_state *gallivm, struct lp_type type)
elem_type = lp_build_elem_type(gallivm, type);
- if(type.floating)
+ if(type.floating && type.width == 16)
+ elems[0] = LLVMConstInt(elem_type, util_float_to_half(1.0f), 0);
+ else if(type.floating)
elems[0] = LLVMConstReal(elem_type, 1.0);
else if(type.fixed)
elems[0] = LLVMConstInt(elem_type, 1LL << (type.width/2), 0);
@@ -292,7 +303,9 @@ lp_build_const_elem(struct gallivm_state *gallivm,
LLVMTypeRef elem_type = lp_build_elem_type(gallivm, type);
LLVMValueRef elem;
- if(type.floating) {
+ if(type.floating && type.width == 16) {
+ elem = LLVMConstInt(elem_type, util_float_to_half((float)val), 0);
+ } else if(type.floating) {
elem = LLVMConstReal(elem_type, val);
}
else {
@@ -364,20 +377,10 @@ lp_build_const_aos(struct gallivm_state *gallivm,
if(swizzle == NULL)
swizzle = default_swizzle;
- if(type.floating) {
- elems[swizzle[0]] = LLVMConstReal(elem_type, r);
- elems[swizzle[1]] = LLVMConstReal(elem_type, g);
- elems[swizzle[2]] = LLVMConstReal(elem_type, b);
- elems[swizzle[3]] = LLVMConstReal(elem_type, a);
- }
- else {
- double dscale = lp_const_scale(type);
-
- elems[swizzle[0]] = LLVMConstInt(elem_type, round(r*dscale), 0);
- elems[swizzle[1]] = LLVMConstInt(elem_type, round(g*dscale), 0);
- elems[swizzle[2]] = LLVMConstInt(elem_type, round(b*dscale), 0);
- elems[swizzle[3]] = LLVMConstInt(elem_type, round(a*dscale), 0);
- }
+ elems[swizzle[0]] = lp_build_const_elem(gallivm, type, r);
+ elems[swizzle[1]] = lp_build_const_elem(gallivm, type, g);
+ elems[swizzle[2]] = lp_build_const_elem(gallivm, type, b);
+ elems[swizzle[3]] = lp_build_const_elem(gallivm, type, a);
for(i = 4; i < type.length; ++i)
elems[i] = elems[i % 4];
@@ -452,7 +455,7 @@ lp_build_const_string(struct gallivm_state *gallivm,
/**
* Build a callable function pointer.
*
- * We this casts instead of LLVMAddGlobalMapping()
+ * We use function pointer constants instead of LLVMAddGlobalMapping()
* to work around a bug in LLVM 2.6, and for efficiency/simplicity.
*/
LLVMValueRef
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
index 0973e1f16f3..0399709faad 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -70,6 +70,66 @@
#include "lp_bld_arit.h"
#include "lp_bld_pack.h"
#include "lp_bld_conv.h"
+#include "lp_bld_logic.h"
+
+
+/**
+ * Converts int16 half-float to float32
+ * Note this can be performed in 1 instruction if vcvtph2ps exists (sse5 i think?)
+ * [llvm.x86.vcvtph2ps / _mm_cvtph_ps]
+ *
+ * @param src_type <vector> type of int16
+ * @param src value to convert
+ *
+ * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
+ */
+LLVMValueRef
+lp_build_half_to_float(struct gallivm_state *gallivm,
+ struct lp_type src_type,
+ LLVMValueRef src)
+{
+ struct lp_type f32_type = lp_type_float_vec(32, 32 * src_type.length);
+ struct lp_type i32_type = lp_type_int_vec(32, 32 * src_type.length);
+
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type);
+ LLVMTypeRef float_vec_type = lp_build_vec_type(gallivm, f32_type);
+
+ /* Constants */
+ LLVMValueRef i32_13 = lp_build_const_int_vec(gallivm, i32_type, 13);
+ LLVMValueRef i32_16 = lp_build_const_int_vec(gallivm, i32_type, 16);
+ LLVMValueRef i32_mask_nosign = lp_build_const_int_vec(gallivm, i32_type, 0x7fff);
+ LLVMValueRef i32_was_infnan = lp_build_const_int_vec(gallivm, i32_type, 0x7bff);
+ LLVMValueRef i32_exp_infnan = lp_build_const_int_vec(gallivm, i32_type, 0xff << 23);
+ LLVMValueRef f32_magic = LLVMBuildBitCast(builder,
+ lp_build_const_int_vec(gallivm, i32_type, (254 - 15) << 23),
+ float_vec_type, "");
+
+ /* Convert int16 vector to int32 vector by zero ext */
+ LLVMValueRef h = LLVMBuildZExt(builder, src, int_vec_type, "");
+
+ /* Exponent / mantissa bits */
+ LLVMValueRef expmant = LLVMBuildAnd(builder, i32_mask_nosign, h, "");
+ LLVMValueRef shifted = LLVMBuildBitCast(builder, LLVMBuildShl(builder, expmant, i32_13, ""), float_vec_type, "");
+
+ /* Exponent adjust */
+ LLVMValueRef scaled = LLVMBuildBitCast(builder, LLVMBuildFMul(builder, shifted, f32_magic, ""), int_vec_type, "");
+
+ /* Make sure Inf/NaN survive */
+ LLVMValueRef b_wasinfnan = lp_build_compare(gallivm, i32_type, PIPE_FUNC_GREATER, expmant, i32_was_infnan);
+ LLVMValueRef infnanexp = LLVMBuildAnd(builder, b_wasinfnan, i32_exp_infnan, "");
+
+ /* Sign bit */
+ LLVMValueRef justsign = LLVMBuildXor(builder, h, expmant, "");
+ LLVMValueRef sign = LLVMBuildShl(builder, justsign, i32_16, "");
+
+ /* Combine result */
+ LLVMValueRef sign_inf = LLVMBuildOr(builder, sign, infnanexp, "");
+ LLVMValueRef final = LLVMBuildOr(builder, scaled, sign_inf, "");
+
+ /* Cast from int32 vector to float32 vector */
+ return LLVMBuildBitCast(builder, final, float_vec_type, "");
+}
/**
@@ -334,6 +394,8 @@ lp_build_conv(struct gallivm_state *gallivm,
dst_type.width == 8 &&
dst_type.length == 16 &&
+ 4 * num_dsts == num_srcs &&
+
util_cpu_caps.has_sse2)
{
struct lp_build_context bld;
@@ -371,6 +433,76 @@ lp_build_conv(struct gallivm_state *gallivm,
return;
}
+ /* Special case 2x8f --> 1x16ub
+ */
+ else if (src_type.floating == 1 &&
+ src_type.fixed == 0 &&
+ src_type.sign == 1 &&
+ src_type.norm == 0 &&
+ src_type.width == 32 &&
+ src_type.length == 8 &&
+
+ dst_type.floating == 0 &&
+ dst_type.fixed == 0 &&
+ dst_type.sign == 0 &&
+ dst_type.norm == 1 &&
+ dst_type.width == 8 &&
+ dst_type.length == 16 &&
+
+ 2 * num_dsts == num_srcs &&
+
+ util_cpu_caps.has_avx) {
+
+ struct lp_build_context bld;
+ struct lp_type int16_type = dst_type;
+ struct lp_type int32_type = dst_type;
+ LLVMValueRef const_255f;
+ unsigned i;
+
+ lp_build_context_init(&bld, gallivm, src_type);
+
+ int16_type.width *= 2;
+ int16_type.length /= 2;
+ int16_type.sign = 1;
+
+ int32_type.width *= 4;
+ int32_type.length /= 4;
+ int32_type.sign = 1;
+
+ const_255f = lp_build_const_vec(gallivm, src_type, 255.0f);
+
+ for (i = 0; i < num_dsts; ++i, src += 2) {
+ LLVMValueRef lo, hi, a, b;
+
+ a = LLVMBuildFMul(builder, src[0], const_255f, "");
+ b = LLVMBuildFMul(builder, src[1], const_255f, "");
+
+ a = lp_build_iround(&bld, a);
+ b = lp_build_iround(&bld, b);
+
+ tmp[0] = lp_build_extract_range(gallivm, a, 0, 4);
+ tmp[1] = lp_build_extract_range(gallivm, a, 4, 4);
+ tmp[2] = lp_build_extract_range(gallivm, b, 0, 4);
+ tmp[3] = lp_build_extract_range(gallivm, b, 4, 4);
+
+ /* relying on clamping behavior of sse2 intrinsics here */
+ lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
+ hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
+ dst[i] = lp_build_pack2(gallivm, int16_type, dst_type, lo, hi);
+ }
+ return;
+ }
+
+ /* Pre convert half-floats to floats
+ */
+ else if (src_type.floating && src_type.width == 16)
+ {
+ for(i = 0; i < num_tmps; ++i)
+ tmp[i] = lp_build_half_to_float(gallivm, src_type, tmp[i]);
+
+ tmp_type.width = 32;
+ }
+
/*
* Clamp if necessary
*/
@@ -580,7 +712,7 @@ lp_build_conv(struct gallivm_state *gallivm,
* This will convert the integer masks that match the given types.
*
* The mask values should 0 or -1, i.e., all bits either set to zero or one.
- * Any other value will likely cause in unpredictable results.
+ * Any other value will likely cause unpredictable results.
*
* This is basically a very trimmed down version of lp_build_conv.
*/
@@ -591,8 +723,6 @@ lp_build_conv_mask(struct gallivm_state *gallivm,
const LLVMValueRef *src, unsigned num_srcs,
LLVMValueRef *dst, unsigned num_dsts)
{
- /* Register width must remain constant */
- assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
/* We must not loose or gain channels. Only precision */
assert(src_type.length * num_srcs == dst_type.length * num_dsts);
@@ -617,16 +747,5 @@ lp_build_conv_mask(struct gallivm_state *gallivm,
* Truncate or expand bit width
*/
- if(src_type.width > dst_type.width) {
- assert(num_dsts == 1);
- dst[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, src, num_srcs);
- }
- else if(src_type.width < dst_type.width) {
- assert(num_srcs == 1);
- lp_build_unpack(gallivm, src_type, dst_type, src[0], dst, num_dsts);
- }
- else {
- assert(num_srcs == num_dsts);
- memcpy(dst, src, num_dsts * sizeof *dst);
- }
+ lp_build_resize(gallivm, src_type, dst_type, src, num_srcs, dst, num_dsts);
}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.h b/src/gallium/auxiliary/gallivm/lp_bld_conv.h
index cec655980fa..c830fbef5f2 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.h
@@ -42,6 +42,10 @@
struct lp_type;
+LLVMValueRef
+lp_build_half_to_float(struct gallivm_state *gallivm,
+ struct lp_type src_type,
+ LLVMValueRef src);
LLVMValueRef
lp_build_clamped_float_to_unsigned_norm(struct gallivm_state *gallivm,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp b/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp
index 444b70a678c..93505f3da45 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp
+++ b/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp
@@ -35,10 +35,8 @@
#if HAVE_LLVM >= 0x0300
#include <llvm/Support/TargetRegistry.h>
-#include <llvm/Support/TargetSelect.h>
#else /* HAVE_LLVM < 0x0300 */
#include <llvm/Target/TargetRegistry.h>
-#include <llvm/Target/TargetSelect.h>
#endif /* HAVE_LLVM < 0x0300 */
#if HAVE_LLVM >= 0x0209
@@ -183,7 +181,7 @@ lp_disassemble(const void* func)
/*
* Limit disassembly to this extent
*/
- const uint64_t extent = 0x10000;
+ const uint64_t extent = 96 * 1024;
uint64_t max_pc = 0;
@@ -200,24 +198,6 @@ lp_disassemble(const void* func)
std::string Error;
const Target *T = TargetRegistry::lookupTarget(Triple, Error);
-#if HAVE_LLVM >= 0x0208
- InitializeNativeTargetAsmPrinter();
-#elif defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
- LLVMInitializeX86AsmPrinter();
-#elif defined(PIPE_ARCH_ARM)
- LLVMInitializeARMAsmPrinter();
-#elif defined(PIPE_ARCH_PPC)
- LLVMInitializePowerPCAsmPrinter();
-#endif
-
-#if HAVE_LLVM >= 0x0301
- InitializeNativeTargetDisassembler();
-#elif defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
- LLVMInitializeX86Disassembler();
-#elif defined(PIPE_ARCH_ARM)
- LLVMInitializeARMDisassembler();
-#endif
-
#if HAVE_LLVM >= 0x0300
OwningPtr<const MCAsmInfo> AsmInfo(T->createMCAsmInfo(Triple));
#else
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_flow.c b/src/gallium/auxiliary/gallivm/lp_bld_flow.c
index d2b3713ed2d..30da44e5b9c 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_flow.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_flow.c
@@ -131,6 +131,15 @@ lp_build_mask_check(struct lp_build_mask_context *mask)
value = lp_build_mask_value(mask);
+ /*
+ * XXX this doesn't quite generate the most efficient code possible, if
+ * the masks are vectors which have all bits set to the same value
+ * in each element.
+ * movmskps/pmovmskb would be more efficient to get the required value
+ * into ordinary reg (certainly with 8 floats).
+ * Not sure if llvm could figure that out on its own.
+ */
+
/* cond = (mask == 0) */
cond = LLVMBuildICmp(builder,
LLVMIntEQ,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format.h b/src/gallium/auxiliary/gallivm/lp_bld_format.h
index 04142d905b1..3608a68202f 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format.h
@@ -67,6 +67,13 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
LLVMValueRef i,
LLVMValueRef j);
+LLVMValueRef
+lp_build_fetch_rgba_aos_array(struct gallivm_state *gallivm,
+ const struct util_format_description *format_desc,
+ struct lp_type type,
+ LLVMValueRef base_ptr,
+ LLVMValueRef offset);
+
/*
* SoA
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
index e4b8da6bcfd..9591bcfb2c7 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
@@ -470,6 +470,11 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
return lp_build_format_swizzle_aos(format_desc, &bld, res);
}
+ /* If all channels are of same type and we are not using half-floats */
+ if (util_format_is_array(format_desc)) {
+ return lp_build_fetch_rgba_aos_array(gallivm, format_desc, type, base_ptr, offset);
+ }
+
/*
* YUV / subsampled formats
*/
@@ -601,7 +606,6 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
return res;
}
-
/*
* Fallback to util_format_description::fetch_rgba_float().
*/
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c
new file mode 100644
index 00000000000..b8ec379d76f
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c
@@ -0,0 +1,102 @@
+/**************************************************************************
+ *
+ * Copyright 2012 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "lp_bld_const.h"
+#include "lp_bld_struct.h"
+#include "lp_bld_format.h"
+#include "lp_bld_debug.h"
+#include "lp_bld_type.h"
+#include "lp_bld_conv.h"
+#include "lp_bld_pack.h"
+
+#include "util/u_memory.h"
+#include "util/u_format.h"
+#include "pipe/p_state.h"
+
+/**
+ * @brief lp_build_fetch_rgba_aos_array
+ *
+ * \param format_desc describes format of the image we're fetching from
+ * \param dst_type output type
+ * \param base_ptr address of the pixel block (or the texel if uncompressed)
+ * \param offset ptr offset
+ */
+LLVMValueRef
+lp_build_fetch_rgba_aos_array(struct gallivm_state *gallivm,
+ const struct util_format_description *format_desc,
+ struct lp_type dst_type,
+ LLVMValueRef base_ptr,
+ LLVMValueRef offset)
+{
+ struct lp_build_context bld;
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMTypeRef src_elem_type, src_vec_type;
+ LLVMValueRef ptr, res = NULL;
+ struct lp_type src_type;
+
+ memset(&src_type, 0, sizeof src_type);
+ src_type.floating = format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT;
+ src_type.fixed = format_desc->channel[0].type == UTIL_FORMAT_TYPE_FIXED;
+ src_type.sign = format_desc->channel[0].type != UTIL_FORMAT_TYPE_UNSIGNED;
+ src_type.norm = format_desc->channel[0].normalized;
+ src_type.width = format_desc->channel[0].size;
+ src_type.length = format_desc->nr_channels;
+
+ assert(src_type.length <= dst_type.length);
+
+ src_elem_type = lp_build_elem_type(gallivm, src_type);
+ src_vec_type = lp_build_vec_type(gallivm, src_type);
+
+ /* Read whole vector from memory, unaligned */
+ if (!res) {
+ ptr = LLVMBuildGEP(builder, base_ptr, &offset, 1, "");
+ ptr = LLVMBuildPointerCast(builder, ptr, LLVMPointerType(src_vec_type, 0), "");
+ res = LLVMBuildLoad(builder, ptr, "");
+ lp_set_load_alignment(res, src_type.width / 8);
+ }
+
+ /* Truncate doubles to float */
+ if (src_type.floating && src_type.width == 64) {
+ src_type.width = 32;
+ src_vec_type = lp_build_vec_type(gallivm, src_type);
+
+ res = LLVMBuildFPTrunc(builder, res, src_vec_type, "");
+ }
+
+ /* Expand to correct length */
+ if (src_type.length < dst_type.length) {
+ res = lp_build_pad_vector(gallivm, res, src_type, dst_type.length);
+ src_type.length = dst_type.length;
+ }
+
+ /* Convert to correct format */
+ lp_build_conv(gallivm, src_type, dst_type, &res, 1, &res, 1);
+
+ /* Swizzle it */
+ lp_build_context_init(&bld, gallivm, dst_type);
+ return lp_build_format_swizzle_aos(format_desc, &bld, res);
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
index 0a57b3ce794..afeb34079bf 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
@@ -359,7 +359,8 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
*/
if (util_format_fits_8unorm(format_desc) &&
- type.floating && type.width == 32 && type.length == 4) {
+ type.floating && type.width == 32 &&
+ (type.length == 1 || (type.length % 4 == 0))) {
struct lp_type tmp_type;
LLVMValueRef tmp;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
index ccc83207004..f77eb1212b1 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
@@ -84,7 +84,7 @@ uyvy_to_yuv_soa(struct gallivm_state *gallivm,
* per element. Didn't measure performance but cuts shader size
* by quite a bit (less difference if cpu has no sse4.1 support).
*/
- if (util_cpu_caps.has_sse2 && n == 4) {
+ if (util_cpu_caps.has_sse2 && n > 1) {
LLVMValueRef sel, tmp, tmp2;
struct lp_build_context bld32;
@@ -152,7 +152,7 @@ yuyv_to_yuv_soa(struct gallivm_state *gallivm,
* per element. Didn't measure performance but cuts shader size
* by quite a bit (less difference if cpu has no sse4.1 support).
*/
- if (util_cpu_caps.has_sse2 && n == 4) {
+ if (util_cpu_caps.has_sse2 && n > 1) {
LLVMValueRef sel, tmp;
struct lp_build_context bld32;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c b/src/gallium/auxiliary/gallivm/lp_bld_init.c
index 768d935dae5..5bf4bcfab3b 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c
@@ -26,15 +26,44 @@
**************************************************************************/
+#include "pipe/p_config.h"
#include "pipe/p_compiler.h"
#include "util/u_cpu_detect.h"
#include "util/u_debug.h"
#include "util/u_memory.h"
#include "util/u_simple_list.h"
+#include "lp_bld.h"
#include "lp_bld_debug.h"
+#include "lp_bld_misc.h"
#include "lp_bld_init.h"
+#include <llvm-c/Analysis.h>
#include <llvm-c/Transforms/Scalar.h>
+#include <llvm-c/BitWriter.h>
+
+
+/**
+ * AVX is supported in:
+ * - standard JIT from LLVM 3.2 onwards
+ * - MC-JIT from LLVM 3.1
+ * - MC-JIT supports limited OSes (MacOSX and Linux)
+ * - standard JIT in LLVM 3.1, with backports
+ */
+#if HAVE_LLVM >= 0x0301 && (defined(PIPE_OS_LINUX) || defined(PIPE_OS_APPLE))
+# define USE_MCJIT 1
+# define HAVE_AVX 1
+#elif HAVE_LLVM >= 0x0302 || (HAVE_LLVM == 0x0301 && defined(HAVE_JIT_AVX_SUPPORT))
+# define USE_MCJIT 0
+# define HAVE_AVX 1
+#else
+# define USE_MCJIT 0
+# define HAVE_AVX 0
+#endif
+
+
+#if USE_MCJIT
+void LLVMLinkInMCJIT();
+#endif
#ifdef DEBUG
@@ -57,6 +86,8 @@ DEBUG_GET_ONCE_FLAGS_OPTION(gallivm_debug, "GALLIVM_DEBUG", lp_bld_debug_flags,
static boolean gallivm_initialized = FALSE;
+unsigned lp_native_vector_width;
+
/*
* Optimization values are:
@@ -81,25 +112,13 @@ enum LLVM_CodeGenOpt_Level {
};
+#if HAVE_LLVM <= 0x0206
/**
- * LLVM 2.6 permits only one ExecutionEngine to be created. This is it.
- */
-static LLVMExecutionEngineRef GlobalEngine = NULL;
-
-/**
- * Same gallivm state shared by all contexts.
+ * LLVM 2.6 permits only one ExecutionEngine to be created. So use the
+ * same gallivm state everywhere.
*/
static struct gallivm_state *GlobalGallivm = NULL;
-
-
-
-
-extern void
-lp_register_oprofile_jit_event_listener(LLVMExecutionEngineRef EE);
-
-extern void
-lp_set_target_options(void);
-
+#endif
/**
@@ -111,6 +130,7 @@ static boolean
create_pass_manager(struct gallivm_state *gallivm)
{
assert(!gallivm->passmgr);
+ assert(gallivm->target);
gallivm->passmgr = LLVMCreateFunctionPassManager(gallivm->provider);
if (!gallivm->passmgr)
@@ -174,33 +194,37 @@ free_gallivm_state(struct gallivm_state *gallivm)
&mod, &error);
#endif
+ if (gallivm->passmgr) {
+ LLVMDisposePassManager(gallivm->passmgr);
+ }
+
#if 0
/* XXX this seems to crash with all versions of LLVM */
if (gallivm->provider)
LLVMDisposeModuleProvider(gallivm->provider);
#endif
- if (gallivm->passmgr)
- LLVMDisposePassManager(gallivm->passmgr);
-
-#if HAVE_LLVM >= 0x207
- if (gallivm->module)
- LLVMDisposeModule(gallivm->module);
-#endif
-
-#if 0
- /* Don't free the exec engine, it's a global/singleton */
- if (gallivm->engine)
+ if (HAVE_LLVM >= 0x207 && gallivm->engine) {
+ /* This will already destroy any associated module */
LLVMDisposeExecutionEngine(gallivm->engine);
-#endif
+ } else {
+ LLVMDisposeModule(gallivm->module);
+ }
-#if 0
+#if !USE_MCJIT
/* Don't free the TargetData, it's owned by the exec engine */
- LLVMDisposeTargetData(gallivm->target);
+#else
+ if (gallivm->target) {
+ LLVMDisposeTargetData(gallivm->target);
+ }
#endif
+ /* Never free the LLVM context.
+ */
+#if 0
if (gallivm->context)
LLVMContextDispose(gallivm->context);
+#endif
if (gallivm->builder)
LLVMDisposeBuilder(gallivm->builder);
@@ -215,37 +239,14 @@ free_gallivm_state(struct gallivm_state *gallivm)
}
-/**
- * Allocate gallivm LLVM objects.
- * \return TRUE for success, FALSE for failure
- */
static boolean
-init_gallivm_state(struct gallivm_state *gallivm)
+init_gallivm_engine(struct gallivm_state *gallivm)
{
- assert(!gallivm->context);
- assert(!gallivm->module);
- assert(!gallivm->provider);
-
- lp_build_init();
-
- gallivm->context = LLVMContextCreate();
- if (!gallivm->context)
- goto fail;
-
- gallivm->module = LLVMModuleCreateWithNameInContext("gallivm",
- gallivm->context);
- if (!gallivm->module)
- goto fail;
-
- gallivm->provider =
- LLVMCreateModuleProviderForExistingModule(gallivm->module);
- if (!gallivm->provider)
- goto fail;
-
- if (!GlobalEngine) {
+ if (1) {
/* We can only create one LLVMExecutionEngine (w/ LLVM 2.6 anyway) */
enum LLVM_CodeGenOpt_Level optlevel;
char *error = NULL;
+ int ret;
if (gallivm_debug & GALLIVM_DEBUG_NO_OPT) {
optlevel = None;
@@ -254,135 +255,162 @@ init_gallivm_state(struct gallivm_state *gallivm)
optlevel = Default;
}
- if (LLVMCreateJITCompiler(&GlobalEngine, gallivm->provider,
- (unsigned) optlevel, &error)) {
+#if USE_MCJIT
+ ret = lp_build_create_mcjit_compiler_for_module(&gallivm->engine,
+ gallivm->module,
+ (unsigned) optlevel,
+ &error);
+#else
+ ret = LLVMCreateJITCompiler(&gallivm->engine, gallivm->provider,
+ (unsigned) optlevel, &error);
+#endif
+ if (ret) {
_debug_printf("%s\n", error);
LLVMDisposeMessage(error);
goto fail;
}
#if defined(DEBUG) || defined(PROFILE)
- lp_register_oprofile_jit_event_listener(GlobalEngine);
+ lp_register_oprofile_jit_event_listener(gallivm->engine);
#endif
}
- gallivm->engine = GlobalEngine;
-
LLVMAddModuleProvider(gallivm->engine, gallivm->provider);//new
+#if !USE_MCJIT
gallivm->target = LLVMGetExecutionEngineTargetData(gallivm->engine);
if (!gallivm->target)
goto fail;
+#else
+ if (0) {
+ /*
+ * Dump the data layout strings.
+ */
- if (!create_pass_manager(gallivm))
- goto fail;
+ LLVMTargetDataRef target = LLVMGetExecutionEngineTargetData(gallivm->engine);
+ char *data_layout;
+ char *engine_data_layout;
- gallivm->builder = LLVMCreateBuilderInContext(gallivm->context);
- if (!gallivm->builder)
- goto fail;
+ data_layout = LLVMCopyStringRepOfTargetData(gallivm->target);
+ engine_data_layout = LLVMCopyStringRepOfTargetData(target);
+
+ if (1) {
+ debug_printf("module target data = %s\n", data_layout);
+ debug_printf("engine target data = %s\n", engine_data_layout);
+ }
+
+ free(data_layout);
+ free(engine_data_layout);
+ }
+#endif
return TRUE;
fail:
- free_gallivm_state(gallivm);
return FALSE;
}
-struct callback
-{
- garbage_collect_callback_func func;
- void *cb_data;
- struct callback *prev, *next;
-};
-
-
-/** list of all garbage collector callbacks */
-static struct callback callback_list = {NULL, NULL, NULL, NULL};
+/**
+ * Singleton
+ *
+ * We must never free LLVM contexts, because LLVM has several global caches
+ * which point to or are derived from objects owned by the context, causing false
+ * memory leaks and false cache hits when these objects are destroyed.
+ *
+ * TODO: For thread safety on multi-threaded OpenGL we should use one LLVM
+ * context per thread, and put them in a pool when threads are destroyed.
+ */
+static LLVMContextRef gallivm_context = NULL;
/**
- * Register a function with gallivm which will be called when we
- * do garbage collection.
+ * Allocate gallivm LLVM objects.
+ * \return TRUE for success, FALSE for failure
*/
-void
-gallivm_register_garbage_collector_callback(garbage_collect_callback_func func,
- void *cb_data)
+static boolean
+init_gallivm_state(struct gallivm_state *gallivm)
{
- struct callback *cb;
-
- if (!callback_list.prev) {
- make_empty_list(&callback_list);
- }
+ assert(!gallivm->context);
+ assert(!gallivm->module);
+ assert(!gallivm->provider);
- /* see if already in list */
- foreach(cb, &callback_list) {
- if (cb->func == func && cb->cb_data == cb_data)
- return;
- }
+ lp_build_init();
- /* add to list */
- cb = CALLOC_STRUCT(callback);
- if (cb) {
- cb->func = func;
- cb->cb_data = cb_data;
- insert_at_head(&callback_list, cb);
+ if (!gallivm_context) {
+ gallivm_context = LLVMContextCreate();
}
-}
+ gallivm->context = gallivm_context;
+ if (!gallivm->context)
+ goto fail;
+ gallivm->module = LLVMModuleCreateWithNameInContext("gallivm",
+ gallivm->context);
+ if (!gallivm->module)
+ goto fail;
-/**
- * Remove a callback.
- */
-void
-gallivm_remove_garbage_collector_callback(garbage_collect_callback_func func,
- void *cb_data)
-{
- struct callback *cb;
-
- /* search list */
- foreach(cb, &callback_list) {
- if (cb->func == func && cb->cb_data == cb_data) {
- /* found, remove it */
- remove_from_list(cb);
- FREE(cb);
- return;
- }
- }
-}
+ gallivm->provider =
+ LLVMCreateModuleProviderForExistingModule(gallivm->module);
+ if (!gallivm->provider)
+ goto fail;
+ gallivm->builder = LLVMCreateBuilderInContext(gallivm->context);
+ if (!gallivm->builder)
+ goto fail;
-/**
- * Call the callback functions (which are typically in the
- * draw module and llvmpipe driver.
- */
-static void
-call_garbage_collector_callbacks(void)
-{
- struct callback *cb;
- foreach(cb, &callback_list) {
- cb->func(cb->cb_data);
+ /* FIXME: MC-JIT only allows compiling one module at a time, and it must be
+ * complete when MC-JIT is created. So defer the MC-JIT engine creation for
+ * now.
+ */
+#if !USE_MCJIT
+ if (!init_gallivm_engine(gallivm)) {
+ goto fail;
}
-}
+#else
+ /*
+ * MC-JIT engine compiles the module immediately on creation, so we can't
+ * obtain the target data from it. Instead we create a target data layout
+ * from a string.
+ *
+ * The produced layout strings are not precisely the same, but should make
+ * no difference for the kind of optimization passes we run.
+ *
+ * For reference this is the layout string on x64:
+ *
+ * e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-f128:128:128-n8:16:32:64
+ *
+ * See also:
+ * - http://llvm.org/docs/LangRef.html#datalayout
+ */
+
+ {
+ const unsigned pointer_size = 8 * sizeof(void *);
+ char layout[512];
+ util_snprintf(layout, sizeof layout, "%c-p:%u:%u:%u-i64:64:64-a0:0:%u-s0:%u:%u",
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+ 'e', // little endian
+#else
+ 'E', // big endian
+#endif
+ pointer_size, pointer_size, pointer_size, // pointer size, abi alignment, preferred alignment
+ pointer_size, // aggregate preferred alignment
+ pointer_size, pointer_size); // stack objects abi alignment, preferred alignment
+ gallivm->target = LLVMCreateTargetData(layout);
+ if (!gallivm->target) {
+ return FALSE;
+ }
+ }
+#endif
+ if (!create_pass_manager(gallivm))
+ goto fail;
-/**
- * Other gallium components using gallivm should call this periodically
- * to let us do garbage collection (or at least try to free memory
- * accumulated by the LLVM libraries).
- */
-void
-gallivm_garbage_collect(struct gallivm_state *gallivm)
-{
- if (gallivm->context) {
- if (gallivm_debug & GALLIVM_DEBUG_GC)
- debug_printf("***** Doing LLVM garbage collection\n");
+ return TRUE;
- call_garbage_collector_callbacks();
- free_gallivm_state(gallivm);
- init_gallivm_state(gallivm);
- }
+fail:
+ free_gallivm_state(gallivm);
+ return FALSE;
}
@@ -398,12 +426,27 @@ lp_build_init(void)
lp_set_target_options();
- LLVMInitializeNativeTarget();
-
+#if USE_MCJIT
+ LLVMLinkInMCJIT();
+#else
LLVMLinkInJIT();
+#endif
util_cpu_detect();
+
+ if (HAVE_AVX &&
+ util_cpu_caps.has_avx) {
+ lp_native_vector_width = 256;
+ } else {
+ /* Leave it at 128, even when no SIMD extensions are available.
+ * Really needs to be a multiple of 128 so can fit 4 floats.
+ */
+ lp_native_vector_width = 128;
+ }
+ lp_native_vector_width = debug_get_num_option("LP_NATIVE_VECTOR_WIDTH",
+ lp_native_vector_width);
+
gallivm_initialized = TRUE;
#if 0
@@ -423,16 +466,27 @@ lp_build_init(void)
struct gallivm_state *
gallivm_create(void)
{
- if (!GlobalGallivm) {
- GlobalGallivm = CALLOC_STRUCT(gallivm_state);
- if (GlobalGallivm) {
- if (!init_gallivm_state(GlobalGallivm)) {
- FREE(GlobalGallivm);
- GlobalGallivm = NULL;
- }
+ struct gallivm_state *gallivm;
+
+#if HAVE_LLVM <= 0x206
+ if (GlobalGallivm) {
+ return GlobalGallivm;
+ }
+#endif
+
+ gallivm = CALLOC_STRUCT(gallivm_state);
+ if (gallivm) {
+ if (!init_gallivm_state(gallivm)) {
+ FREE(gallivm);
+ gallivm = NULL;
}
}
- return GlobalGallivm;
+
+#if HAVE_LLVM <= 0x206
+ GlobalGallivm = gallivm;
+#endif
+
+ return gallivm;
}
@@ -442,6 +496,132 @@ gallivm_create(void)
void
gallivm_destroy(struct gallivm_state *gallivm)
{
+#if HAVE_LLVM <= 0x0206
/* No-op: don't destroy the singleton */
(void) gallivm;
+#else
+ free_gallivm_state(gallivm);
+ FREE(gallivm);
+#endif
+}
+
+
+/**
+ * Validate and optimize a function.
+ */
+static void
+gallivm_optimize_function(struct gallivm_state *gallivm,
+ LLVMValueRef func)
+{
+ if (0) {
+ debug_printf("optimizing %s...\n", LLVMGetValueName(func));
+ }
+
+ assert(gallivm->passmgr);
+
+ /* Apply optimizations to LLVM IR */
+ LLVMRunFunctionPassManager(gallivm->passmgr, func);
+
+ if (0) {
+ if (gallivm_debug & GALLIVM_DEBUG_IR) {
+ /* Print the LLVM IR to stderr */
+ lp_debug_dump_value(func);
+ debug_printf("\n");
+ }
+ }
+}
+
+
+/**
+ * Validate a function.
+ */
+void
+gallivm_verify_function(struct gallivm_state *gallivm,
+ LLVMValueRef func)
+{
+ /* Verify the LLVM IR. If invalid, dump and abort */
+#ifdef DEBUG
+ if (LLVMVerifyFunction(func, LLVMPrintMessageAction)) {
+ lp_debug_dump_value(func);
+ assert(0);
+ return;
+ }
+#endif
+
+ gallivm_optimize_function(gallivm, func);
+
+ if (gallivm_debug & GALLIVM_DEBUG_IR) {
+ /* Print the LLVM IR to stderr */
+ lp_debug_dump_value(func);
+ debug_printf("\n");
+ }
+}
+
+
+void
+gallivm_compile_module(struct gallivm_state *gallivm)
+{
+#if HAVE_LLVM > 0x206
+ assert(!gallivm->compiled);
+#endif
+
+ /* Dump byte code to a file */
+ if (0) {
+ LLVMWriteBitcodeToFile(gallivm->module, "llvmpipe.bc");
+ debug_printf("llvmpipe.bc written\n");
+ debug_printf("Invoke as \"llc -o - llvmpipe.bc\"\n");
+ }
+
+#if USE_MCJIT
+ assert(!gallivm->engine);
+ if (!init_gallivm_engine(gallivm)) {
+ assert(0);
+ }
+#endif
+ assert(gallivm->engine);
+
+ ++gallivm->compiled;
+}
+
+
+func_pointer
+gallivm_jit_function(struct gallivm_state *gallivm,
+ LLVMValueRef func)
+{
+ void *code;
+ func_pointer jit_func;
+
+ assert(gallivm->compiled);
+ assert(gallivm->engine);
+
+ code = LLVMGetPointerToGlobal(gallivm->engine, func);
+ assert(code);
+ jit_func = pointer_to_func(code);
+
+ if (gallivm_debug & GALLIVM_DEBUG_ASM) {
+ lp_disassemble(code);
+ }
+
+ /* Free the function body to save memory */
+ lp_func_delete_body(func);
+
+ return jit_func;
+}
+
+
+/**
+ * Free the function (and its machine code).
+ */
+void
+gallivm_free_function(struct gallivm_state *gallivm,
+ LLVMValueRef func,
+ const void *code)
+{
+#if !USE_MCJIT
+ if (code) {
+ LLVMFreeMachineCodeForFunction(gallivm->engine, func);
+ }
+
+ LLVMDeleteFunction(func);
+#endif
}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.h b/src/gallium/auxiliary/gallivm/lp_bld_init.h
index 5fc0f996c64..7edea616c4e 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.h
@@ -31,6 +31,7 @@
#include "pipe/p_compiler.h"
+#include "util/u_pointer.h" // for func_pointer
#include "lp_bld.h"
#include <llvm-c/ExecutionEngine.h>
@@ -44,6 +45,7 @@ struct gallivm_state
LLVMPassManagerRef passmgr;
LLVMContextRef context;
LLVMBuilderRef builder;
+ unsigned compiled;
};
@@ -51,35 +53,28 @@ void
lp_build_init(void);
-extern void
-lp_func_delete_body(LLVMValueRef func);
-
+struct gallivm_state *
+gallivm_create(void);
void
-gallivm_garbage_collect(struct gallivm_state *gallivm);
-
+gallivm_destroy(struct gallivm_state *gallivm);
-typedef void (*garbage_collect_callback_func)(void *cb_data);
void
-gallivm_register_garbage_collector_callback(garbage_collect_callback_func func,
- void *cb_data);
+gallivm_verify_function(struct gallivm_state *gallivm,
+ LLVMValueRef func);
void
-gallivm_remove_garbage_collector_callback(garbage_collect_callback_func func,
- void *cb_data);
+gallivm_compile_module(struct gallivm_state *gallivm);
-
-struct gallivm_state *
-gallivm_create(void);
+func_pointer
+gallivm_jit_function(struct gallivm_state *gallivm,
+ LLVMValueRef func);
void
-gallivm_destroy(struct gallivm_state *gallivm);
-
-
-extern LLVMValueRef
-lp_build_load_volatile(LLVMBuilderRef B, LLVMValueRef PointerVal,
- const char *Name);
+gallivm_free_function(struct gallivm_state *gallivm,
+ LLVMValueRef func,
+ const void * code);
void
lp_set_load_alignment(LLVMValueRef Inst,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_intr.c b/src/gallium/auxiliary/gallivm/lp_bld_intr.c
index 2323f124ae4..2bf1211bcd7 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_intr.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_intr.c
@@ -48,6 +48,8 @@
#include "lp_bld_const.h"
#include "lp_bld_intr.h"
+#include "lp_bld_type.h"
+#include "lp_bld_pack.h"
LLVMValueRef
@@ -129,6 +131,95 @@ lp_build_intrinsic_binary(LLVMBuilderRef builder,
}
+/**
+ * Call intrinsic with arguments adapted to intrinsic vector length.
+ *
+ * Split vectors which are too large for the hw, or expand them if they
+ * are too small, so a caller calling a function which might use intrinsics
+ * doesn't need to do splitting/expansion on its own.
+ * This only supports intrinsics where src and dst types match.
+ */
+LLVMValueRef
+lp_build_intrinsic_binary_anylength(struct gallivm_state *gallivm,
+ const char *name,
+ struct lp_type src_type,
+ unsigned intr_size,
+ LLVMValueRef a,
+ LLVMValueRef b)
+{
+ unsigned i;
+ struct lp_type intrin_type = src_type;
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
+ LLVMValueRef anative, bnative;
+ unsigned intrin_length = intr_size / src_type.width;
+
+ intrin_type.length = intrin_length;
+
+ if (intrin_length > src_type.length) {
+ LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
+ LLVMValueRef constvec, tmp;
+
+ for (i = 0; i < src_type.length; i++) {
+ elems[i] = lp_build_const_int32(gallivm, i);
+ }
+ for (; i < intrin_length; i++) {
+ elems[i] = i32undef;
+ }
+ if (src_type.length == 1) {
+ LLVMTypeRef elem_type = lp_build_elem_type(gallivm, intrin_type);
+ a = LLVMBuildBitCast(builder, a, LLVMVectorType(elem_type, 1), "");
+ b = LLVMBuildBitCast(builder, b, LLVMVectorType(elem_type, 1), "");
+ }
+ constvec = LLVMConstVector(elems, intrin_length);
+ anative = LLVMBuildShuffleVector(builder, a, a, constvec, "");
+ bnative = LLVMBuildShuffleVector(builder, b, b, constvec, "");
+ tmp = lp_build_intrinsic_binary(builder, name,
+ lp_build_vec_type(gallivm, intrin_type),
+ anative, bnative);
+ if (src_type.length > 1) {
+ constvec = LLVMConstVector(elems, src_type.length);
+ return LLVMBuildShuffleVector(builder, tmp, tmp, constvec, "");
+ }
+ else {
+ return LLVMBuildExtractElement(builder, tmp, elems[0], "");
+ }
+ }
+ else if (intrin_length < src_type.length) {
+ unsigned num_vec = src_type.length / intrin_length;
+ LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
+
+ /* don't support arbitrary size here as this is so yuck */
+ if (src_type.length % intrin_length) {
+ /* FIXME: This is something which should be supported
+ * but there doesn't seem to be any need for it currently
+ * so crash and burn.
+ */
+ debug_printf("%s: should handle arbitrary vector size\n",
+ __FUNCTION__);
+ assert(0);
+ return NULL;
+ }
+
+ for (i = 0; i < num_vec; i++) {
+ anative = lp_build_extract_range(gallivm, a, i*intrin_length,
+ intrin_length);
+ bnative = lp_build_extract_range(gallivm, b, i*intrin_length,
+ intrin_length);
+ tmp[i] = lp_build_intrinsic_binary(builder, name,
+ lp_build_vec_type(gallivm, intrin_type),
+ anative, bnative);
+ }
+ return lp_build_concat(gallivm, tmp, intrin_type, num_vec);
+ }
+ else {
+ return lp_build_intrinsic_binary(builder, name,
+ lp_build_vec_type(gallivm, src_type),
+ a, b);
+ }
+}
+
+
LLVMValueRef
lp_build_intrinsic_map(struct gallivm_state *gallivm,
const char *name,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_intr.h b/src/gallium/auxiliary/gallivm/lp_bld_intr.h
index b73dd700362..38c5c29c980 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_intr.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_intr.h
@@ -78,6 +78,15 @@ lp_build_intrinsic_binary(LLVMBuilderRef builder,
LLVMValueRef
+lp_build_intrinsic_binary_anylength(struct gallivm_state *gallivm,
+ const char *name,
+ struct lp_type src_type,
+ unsigned intr_size,
+ LLVMValueRef a,
+ LLVMValueRef b);
+
+
+LLVMValueRef
lp_build_intrinsic_map(struct gallivm_state *gallivm,
const char *name,
LLVMTypeRef ret_type,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.c b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
index 69796149aaa..7a4a5bb11d3 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_logic.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
@@ -52,8 +52,8 @@
*
* select <4 x i1> %C, %A, %B
*
- * is valid IR (e.g. llvm/test/Assembler/vector-select.ll), but it is not
- * supported on any backend.
+ * is valid IR (e.g. llvm/test/Assembler/vector-select.ll), but it is only
+ * supported on some backends (x86) starting with llvm 3.1.
*
* Expanding the boolean vector to full SIMD register width, as in
*
@@ -485,8 +485,10 @@ lp_build_select(struct lp_build_context *bld,
}
res = LLVMBuildSelect(builder, mask, a, b, "");
}
- else if (util_cpu_caps.has_sse4_1 &&
- type.width * type.length == 128 &&
+ else if (((util_cpu_caps.has_sse4_1 &&
+ type.width * type.length == 128) ||
+ (util_cpu_caps.has_avx &&
+ type.width * type.length == 256 && type.width >= 32)) &&
!LLVMIsConstant(a) &&
!LLVMIsConstant(b) &&
!LLVMIsConstant(mask)) {
@@ -494,8 +496,22 @@ lp_build_select(struct lp_build_context *bld,
LLVMTypeRef arg_type;
LLVMValueRef args[3];
- if (type.floating &&
- type.width == 64) {
+ /*
+ * There's only float blend in AVX but can just cast i32/i64
+ * to float.
+ */
+ if (type.width * type.length == 256) {
+ if (type.width == 64) {
+ intrinsic = "llvm.x86.avx.blendv.pd.256";
+ arg_type = LLVMVectorType(LLVMDoubleTypeInContext(lc), 4);
+ }
+ else {
+ intrinsic = "llvm.x86.avx.blendv.ps.256";
+ arg_type = LLVMVectorType(LLVMFloatTypeInContext(lc), 8);
+ }
+ }
+ else if (type.floating &&
+ type.width == 64) {
intrinsic = "llvm.x86.sse41.blendvpd";
arg_type = LLVMVectorType(LLVMDoubleTypeInContext(lc), 2);
} else if (type.floating &&
@@ -591,3 +607,35 @@ lp_build_select_aos(struct lp_build_context *bld,
return lp_build_select(bld, mask_vec, a, b);
}
}
+
+
+/**
+ * Return (scalar-cast)val ? true : false;
+ */
+LLVMValueRef
+lp_build_any_true_range(struct lp_build_context *bld,
+ unsigned real_length,
+ LLVMValueRef val)
+{
+ LLVMBuilderRef builder = bld->gallivm->builder;
+ LLVMTypeRef scalar_type;
+ LLVMTypeRef true_type;
+
+ assert(real_length <= bld->type.length);
+
+ true_type = LLVMIntTypeInContext(bld->gallivm->context,
+ bld->type.width * real_length);
+ scalar_type = LLVMIntTypeInContext(bld->gallivm->context,
+ bld->type.width * bld->type.length);
+ val = LLVMBuildBitCast(builder, val, scalar_type, "");
+ /*
+ * We're always using native types so we can use intrinsics.
+ * However, if we don't do per-element calculations, we must ensure
+ * the excess elements aren't used since they may contain garbage.
+ */
+ if (real_length < bld->type.length) {
+ val = LLVMBuildTrunc(builder, val, true_type, "");
+ }
+ return LLVMBuildICmp(builder, LLVMIntNE,
+ val, LLVMConstNull(true_type), "");
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.h b/src/gallium/auxiliary/gallivm/lp_bld_logic.h
index ef33a653682..64c0a1f5946 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_logic.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.h
@@ -82,4 +82,9 @@ lp_build_select_aos(struct lp_build_context *bld,
LLVMValueRef b);
+LLVMValueRef
+lp_build_any_true_range(struct lp_build_context *bld,
+ unsigned real_length,
+ LLVMValueRef val);
+
#endif /* !LP_BLD_LOGIC_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
index 6c4586c4212..dd2c6120afb 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
+++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
@@ -26,6 +26,12 @@
**************************************************************************/
+/**
+ * The purpose of this module is to expose LLVM functionality not available
+ * through the C++ bindings.
+ */
+
+
#ifndef __STDC_LIMIT_MACROS
#define __STDC_LIMIT_MACROS
#endif
@@ -41,11 +47,24 @@
#include <llvm/Target/TargetOptions.h>
#include <llvm/ExecutionEngine/ExecutionEngine.h>
#include <llvm/ExecutionEngine/JITEventListener.h>
+#if HAVE_LLVM >= 0x0301
+#include <llvm/ADT/Triple.h>
+#include <llvm/ExecutionEngine/JITMemoryManager.h>
+#endif
#include <llvm/Support/CommandLine.h>
#include <llvm/Support/PrettyStackTrace.h>
+#if HAVE_LLVM >= 0x0300
+#include <llvm/Support/TargetSelect.h>
+#else /* HAVE_LLVM < 0x0300 */
+#include <llvm/Target/TargetSelect.h>
+#endif /* HAVE_LLVM < 0x0300 */
+
#include "pipe/p_config.h"
#include "util/u_debug.h"
+#include "util/u_cpu_detect.h"
+
+#include "lp_bld_misc.h"
/**
@@ -99,6 +118,9 @@ lp_set_target_options(void)
#if defined(DEBUG) || defined(PROFILE)
llvm::NoFramePointerElim = true;
+#if HAVE_LLVM >= 0x0208
+ llvm::NoFramePointerElimNonLeaf = true;
+#endif
#endif
llvm::NoExcessFPPrecision = false;
@@ -146,6 +168,30 @@ lp_set_target_options(void)
* shared object where the gallium driver resides.
*/
llvm::DisablePrettyStackTrace = true;
+
+ // If we have a native target, initialize it to ensure it is linked in and
+ // usable by the JIT.
+ llvm::InitializeNativeTarget();
+
+#if HAVE_LLVM >= 0x0208
+ llvm::InitializeNativeTargetAsmPrinter();
+#elif defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+ LLVMInitializeX86AsmPrinter();
+#elif defined(PIPE_ARCH_ARM)
+ LLVMInitializeARMAsmPrinter();
+#elif defined(PIPE_ARCH_PPC)
+ LLVMInitializePowerPCAsmPrinter();
+#endif
+
+#if HAVE_LLVM >= 0x0207
+# if HAVE_LLVM >= 0x0301
+ llvm::InitializeNativeTargetDisassembler();
+# elif defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+ LLVMInitializeX86Disassembler();
+# elif defined(PIPE_ARCH_ARM)
+ LLVMInitializeARMDisassembler();
+# endif
+#endif
}
@@ -165,6 +211,7 @@ lp_build_load_volatile(LLVMBuilderRef B, LLVMValueRef PointerVal,
return llvm::wrap(llvm::unwrap(B)->CreateLoad(llvm::unwrap(PointerVal), true, Name));
}
+
extern "C"
void
lp_set_load_alignment(LLVMValueRef Inst,
@@ -180,3 +227,67 @@ lp_set_store_alignment(LLVMValueRef Inst,
{
llvm::unwrap<llvm::StoreInst>(Inst)->setAlignment(Align);
}
+
+
+#if HAVE_LLVM >= 0x301
+
+/**
+ * Same as LLVMCreateJITCompilerForModule, but using MCJIT and enabling AVX
+ * feature where available.
+ *
+ * See also:
+ * - llvm/lib/ExecutionEngine/ExecutionEngineBindings.cpp
+ * - llvm/tools/lli/lli.cpp
+ * - http://markmail.org/message/ttkuhvgj4cxxy2on#query:+page:1+mid:aju2dggerju3ivd3+state:results
+ */
+extern "C"
+LLVMBool
+lp_build_create_mcjit_compiler_for_module(LLVMExecutionEngineRef *OutJIT,
+ LLVMModuleRef M,
+ unsigned OptLevel,
+ char **OutError)
+{
+ using namespace llvm;
+
+ std::string Error;
+ EngineBuilder builder(unwrap(M));
+ builder.setEngineKind(EngineKind::JIT)
+ .setErrorStr(&Error)
+ .setOptLevel((CodeGenOpt::Level)OptLevel);
+
+ builder.setUseMCJIT(true);
+
+ llvm::SmallVector<std::string, 1> MAttrs;
+ if (util_cpu_caps.has_avx) {
+ /*
+ * AVX feature is not automatically detected from CPUID by the X86 target
+ * yet, because the old (yet default) JIT engine is not capable of
+ * emitting the opcodes. But as we're using MCJIT here, it is safe to
+ * add this attribute.
+ */
+ MAttrs.push_back("+avx");
+ builder.setMAttrs(MAttrs);
+ }
+ builder.setJITMemoryManager(JITMemoryManager::CreateDefaultMemManager());
+
+ ExecutionEngine *JIT;
+#if 0
+ JIT = builder.create();
+#else
+ /*
+ * Workaround http://llvm.org/bugs/show_bug.cgi?id=12833
+ */
+ StringRef MArch = "";
+ StringRef MCPU = "";
+ Triple TT(unwrap(M)->getTargetTriple());
+ JIT = builder.create(builder.selectTarget(TT, MArch, MCPU, MAttrs));
+#endif
+ if (JIT) {
+ *OutJIT = wrap(JIT);
+ return 0;
+ }
+ *OutError = strdup(Error.c_str());
+ return 1;
+}
+
+#endif /* HAVE_LLVM >= 0x301 */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_misc.h b/src/gallium/auxiliary/gallivm/lp_bld_misc.h
new file mode 100644
index 00000000000..4f80b38280c
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.h
@@ -0,0 +1,70 @@
+/**************************************************************************
+ *
+ * Copyright 2012 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#ifndef LP_BLD_MISC_H
+#define LP_BLD_MISC_H
+
+
+#include "lp_bld.h"
+#include <llvm-c/ExecutionEngine.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+extern void
+lp_register_oprofile_jit_event_listener(LLVMExecutionEngineRef EE);
+
+extern void
+lp_set_target_options(void);
+
+
+extern void
+lp_func_delete_body(LLVMValueRef func);
+
+
+extern LLVMValueRef
+lp_build_load_volatile(LLVMBuilderRef B, LLVMValueRef PointerVal,
+ const char *Name);
+
+extern int
+lp_build_create_mcjit_compiler_for_module(LLVMExecutionEngineRef *OutJIT,
+ LLVMModuleRef M,
+ unsigned OptLevel,
+ char **OutError);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif /* !LP_BLD_MISC_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.c b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
index fde6bb594f1..b18f7841ccb 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
@@ -69,6 +69,7 @@
#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_cpu_detect.h"
+#include "util/u_memory.h"
#include "lp_bld_type.h"
#include "lp_bld_const.h"
@@ -76,6 +77,7 @@
#include "lp_bld_intr.h"
#include "lp_bld_arit.h"
#include "lp_bld_pack.h"
+#include "lp_bld_swizzle.h"
/**
@@ -101,6 +103,30 @@ lp_build_const_unpack_shuffle(struct gallivm_state *gallivm,
return LLVMConstVector(elems, n);
}
+/**
+ * Similar to lp_build_const_unpack_shuffle but for special AVX 256bit unpack.
+ * See comment above lp_build_interleave2_half for more details.
+ */
+static LLVMValueRef
+lp_build_const_unpack_shuffle_half(struct gallivm_state *gallivm,
+ unsigned n, unsigned lo_hi)
+{
+ LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
+ unsigned i, j;
+
+ assert(n <= LP_MAX_VECTOR_LENGTH);
+ assert(lo_hi < 2);
+
+ for (i = 0, j = lo_hi*(n/4); i < n; i += 2, ++j) {
+ if (i == (n / 2))
+ j += n / 4;
+
+ elems[i + 0] = lp_build_const_int32(gallivm, 0 + j);
+ elems[i + 1] = lp_build_const_int32(gallivm, n + j);
+ }
+
+ return LLVMConstVector(elems, n);
+}
/**
* Build shuffle vectors that match PACKxx instructions.
@@ -119,6 +145,71 @@ lp_build_const_pack_shuffle(struct gallivm_state *gallivm, unsigned n)
return LLVMConstVector(elems, n);
}
+/**
+ * Return a vector with elements src[start:start+size]
+ * Most useful for getting half the values out of a 256bit sized vector,
+ * otherwise may cause data rearrangement to happen.
+ */
+LLVMValueRef
+lp_build_extract_range(struct gallivm_state *gallivm,
+ LLVMValueRef src,
+ unsigned start,
+ unsigned size)
+{
+ LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
+ unsigned i;
+
+ assert(size <= Elements(elems));
+
+ for (i = 0; i < size; ++i)
+ elems[i] = lp_build_const_int32(gallivm, i + start);
+
+ if (size == 1) {
+ return LLVMBuildExtractElement(gallivm->builder, src, elems[0], "");
+ }
+ else {
+ return LLVMBuildShuffleVector(gallivm->builder, src, src,
+ LLVMConstVector(elems, size), "");
+ }
+}
+
+/**
+ * Concatenates several (must be a power of 2) vectors (of same type)
+ * into a larger one.
+ * Most useful for building up a 256bit sized vector out of two 128bit ones.
+ */
+LLVMValueRef
+lp_build_concat(struct gallivm_state *gallivm,
+ LLVMValueRef src[],
+ struct lp_type src_type,
+ unsigned num_vectors)
+{
+ unsigned new_length, i;
+ LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH/2];
+ LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
+
+ assert(src_type.length * num_vectors <= Elements(shuffles));
+ assert(util_is_power_of_two(num_vectors));
+
+ new_length = src_type.length;
+
+ for (i = 0; i < num_vectors; i++)
+ tmp[i] = src[i];
+
+ while (num_vectors > 1) {
+ num_vectors >>= 1;
+ new_length <<= 1;
+ for (i = 0; i < new_length; i++) {
+ shuffles[i] = lp_build_const_int32(gallivm, i);
+ }
+ for (i = 0; i < num_vectors; i++) {
+ tmp[i] = LLVMBuildShuffleVector(gallivm->builder, tmp[i*2], tmp[i*2 + 1],
+ LLVMConstVector(shuffles, new_length), "");
+ }
+ }
+
+ return tmp[0];
+}
/**
* Interleave vector elements.
@@ -139,6 +230,40 @@ lp_build_interleave2(struct gallivm_state *gallivm,
return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
}
+/**
+ * Interleave vector elements but with 256 bit,
+ * treats it as interleave with 2 concatenated 128 bit vectors.
+ *
+ * This differs from lp_build_interleave2, as that function would do the following (for lo):
+ * a0 b0 a1 b1 a2 b2 a3 b3, and this does not compile into an AVX unpack instruction.
+ *
+ *
+ * An example interleave 8x float with 8x float on AVX 256bit unpack:
+ * a0 a1 a2 a3 a4 a5 a6 a7 <-> b0 b1 b2 b3 b4 b5 b6 b7
+ *
+ * Equivalent to interleaving 2x 128 bit vectors
+ * a0 a1 a2 a3 <-> b0 b1 b2 b3 concatenated with a4 a5 a6 a7 <-> b4 b5 b6 b7
+ *
+ * So interleave-lo would result in:
+ * a0 b0 a1 b1 a4 b4 a5 b5
+ *
+ * And interleave-hi would result in:
+ * a2 b2 a3 b3 a6 b6 a7 b7
+ */
+LLVMValueRef
+lp_build_interleave2_half(struct gallivm_state *gallivm,
+ struct lp_type type,
+ LLVMValueRef a,
+ LLVMValueRef b,
+ unsigned lo_hi)
+{
+ if (type.length * type.width == 256) {
+ LLVMValueRef shuffle = lp_build_const_unpack_shuffle_half(gallivm, type.length, lo_hi);
+ return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
+ } else {
+ return lp_build_interleave2(gallivm, type, a, b, lo_hi);
+ }
+}
/**
* Double the bit width.
@@ -237,9 +362,9 @@ lp_build_unpack(struct gallivm_state *gallivm,
* Non-interleaved pack.
*
* This will move values as
- *
- * lo = __ l0 __ l1 __ l2 __.. __ ln
- * hi = __ h0 __ h1 __ h2 __.. __ hn
+ * (LSB) (MSB)
+ * lo = l0 __ l1 __ l2 __.. __ ln __
+ * hi = h0 __ h1 __ h2 __.. __ hn __
* res = l0 l1 l2 .. ln h0 h1 h2 .. hn
*
* This will only change the number of bits the values are represented, not the
@@ -257,12 +382,14 @@ lp_build_pack2(struct gallivm_state *gallivm,
LLVMValueRef hi)
{
LLVMBuilderRef builder = gallivm->builder;
-#if HAVE_LLVM < 0x0207
- LLVMTypeRef src_vec_type = lp_build_vec_type(gallivm, src_type);
-#endif
LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, dst_type);
LLVMValueRef shuffle;
LLVMValueRef res = NULL;
+ struct lp_type intr_type = dst_type;
+
+#if HAVE_LLVM < 0x0207
+ intr_type = src_type;
+#endif
assert(!src_type.floating);
assert(!dst_type.floating);
@@ -270,50 +397,81 @@ lp_build_pack2(struct gallivm_state *gallivm,
assert(src_type.length * 2 == dst_type.length);
/* Check for special cases first */
- if(util_cpu_caps.has_sse2 && src_type.width * src_type.length == 128) {
+ if(util_cpu_caps.has_sse2 && src_type.width * src_type.length >= 128) {
+ const char *intrinsic = NULL;
+
switch(src_type.width) {
case 32:
if(dst_type.sign) {
-#if HAVE_LLVM >= 0x0207
- res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packssdw.128", dst_vec_type, lo, hi);
-#else
- res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packssdw.128", src_vec_type, lo, hi);
-#endif
+ intrinsic = "llvm.x86.sse2.packssdw.128";
}
else {
if (util_cpu_caps.has_sse4_1) {
- return lp_build_intrinsic_binary(builder, "llvm.x86.sse41.packusdw", dst_vec_type, lo, hi);
- }
- else {
- /* use generic shuffle below */
- res = NULL;
+ intrinsic = "llvm.x86.sse41.packusdw";
+#if HAVE_LLVM < 0x0207
+ /* llvm < 2.7 has inconsistent signatures except for packusdw */
+ intr_type = dst_type;
+#endif
}
}
break;
-
case 16:
- if(dst_type.sign)
-#if HAVE_LLVM >= 0x0207
- res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packsswb.128", dst_vec_type, lo, hi);
-#else
- res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packsswb.128", src_vec_type, lo, hi);
-#endif
- else
-#if HAVE_LLVM >= 0x0207
- res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packuswb.128", dst_vec_type, lo, hi);
-#else
- res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packuswb.128", src_vec_type, lo, hi);
-#endif
- break;
-
- default:
- assert(0);
- return LLVMGetUndef(dst_vec_type);
+ if (dst_type.sign) {
+ intrinsic = "llvm.x86.sse2.packsswb.128";
+ }
+ else {
+ intrinsic = "llvm.x86.sse2.packuswb.128";
+ }
break;
+ /* default uses generic shuffle below */
}
-
- if (res) {
- res = LLVMBuildBitCast(builder, res, dst_vec_type, "");
+ if (intrinsic) {
+ if (src_type.width * src_type.length == 128) {
+ LLVMTypeRef intr_vec_type = lp_build_vec_type(gallivm, intr_type);
+ res = lp_build_intrinsic_binary(builder, intrinsic, intr_vec_type, lo, hi);
+ if (dst_vec_type != intr_vec_type) {
+ res = LLVMBuildBitCast(builder, res, dst_vec_type, "");
+ }
+ }
+ else {
+ int num_split = src_type.width * src_type.length / 128;
+ int i;
+ int nlen = 128 / src_type.width;
+ struct lp_type ndst_type = lp_type_unorm(dst_type.width, 128);
+ struct lp_type nintr_type = lp_type_unorm(intr_type.width, 128);
+ LLVMValueRef tmpres[LP_MAX_VECTOR_WIDTH / 128];
+ LLVMValueRef tmplo, tmphi;
+ LLVMTypeRef ndst_vec_type = lp_build_vec_type(gallivm, ndst_type);
+ LLVMTypeRef nintr_vec_type = lp_build_vec_type(gallivm, nintr_type);
+
+ assert(num_split <= LP_MAX_VECTOR_WIDTH / 128);
+
+ for (i = 0; i < num_split / 2; i++) {
+ tmplo = lp_build_extract_range(gallivm,
+ lo, i*nlen*2, nlen);
+ tmphi = lp_build_extract_range(gallivm,
+ lo, i*nlen*2 + nlen, nlen);
+ tmpres[i] = lp_build_intrinsic_binary(builder, intrinsic,
+ nintr_vec_type, tmplo, tmphi);
+ if (ndst_vec_type != nintr_vec_type) {
+ tmpres[i] = LLVMBuildBitCast(builder, tmpres[i], ndst_vec_type, "");
+ }
+ }
+ for (i = 0; i < num_split / 2; i++) {
+ tmplo = lp_build_extract_range(gallivm,
+ hi, i*nlen*2, nlen);
+ tmphi = lp_build_extract_range(gallivm,
+ hi, i*nlen*2 + nlen, nlen);
+ tmpres[i+num_split/2] = lp_build_intrinsic_binary(builder, intrinsic,
+ nintr_vec_type,
+ tmplo, tmphi);
+ if (ndst_vec_type != nintr_vec_type) {
+ tmpres[i+num_split/2] = LLVMBuildBitCast(builder, tmpres[i+num_split/2],
+ ndst_vec_type, "");
+ }
+ }
+ res = lp_build_concat(gallivm, tmpres, ndst_type, num_split);
+ }
return res;
}
}
@@ -357,8 +515,9 @@ lp_build_packs2(struct gallivm_state *gallivm,
/* All X86 SSE non-interleaved pack instructions take signed inputs and
* saturate them, so no need to clamp for those cases. */
if(util_cpu_caps.has_sse2 &&
- src_type.width * src_type.length == 128 &&
- src_type.sign)
+ src_type.width * src_type.length >= 128 &&
+ src_type.sign &&
+ (src_type.width == 32 || src_type.width == 16))
clamp = FALSE;
if(clamp) {
@@ -395,7 +554,6 @@ lp_build_pack(struct gallivm_state *gallivm,
LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
unsigned i;
-
/* Register width must remain constant */
assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
@@ -487,21 +645,44 @@ lp_build_resize(struct gallivm_state *gallivm,
/*
* Register width remains constant -- use vector packing intrinsics
*/
-
tmp[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, src, num_srcs);
}
else {
- /*
- * Do it element-wise.
- */
-
- assert(src_type.length == dst_type.length);
- tmp[0] = lp_build_undef(gallivm, dst_type);
- for (i = 0; i < dst_type.length; ++i) {
- LLVMValueRef index = lp_build_const_int32(gallivm, i);
- LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], index, "");
- val = LLVMBuildTrunc(builder, val, lp_build_elem_type(gallivm, dst_type), "");
- tmp[0] = LLVMBuildInsertElement(builder, tmp[0], val, index, "");
+ if (src_type.width / dst_type.width > num_srcs) {
+ /*
+ * First change src vectors size (with shuffle) so they have the
+ * same size as the destination vector, then pack normally.
+ * Note: cannot use cast/extract because llvm generates atrocious code.
+ */
+ unsigned size_ratio = (src_type.width * src_type.length) /
+ (dst_type.length * dst_type.width);
+ unsigned new_length = src_type.length / size_ratio;
+
+ for (i = 0; i < size_ratio * num_srcs; i++) {
+ unsigned start_index = (i % size_ratio) * new_length;
+ tmp[i] = lp_build_extract_range(gallivm, src[i / size_ratio],
+ start_index, new_length);
+ }
+ num_srcs *= size_ratio;
+ src_type.length = new_length;
+ tmp[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, tmp, num_srcs);
+ }
+ else {
+ /*
+ * Truncate bit width but expand vector size - first pack
+ * then expand simply because this should be more AVX-friendly
+ * for the cases we probably hit.
+ */
+ unsigned size_ratio = (dst_type.width * dst_type.length) /
+ (src_type.length * src_type.width);
+ unsigned num_pack_srcs = num_srcs / size_ratio;
+ dst_type.length = dst_type.length / size_ratio;
+
+ for (i = 0; i < size_ratio; i++) {
+ tmp[i] = lp_build_pack(gallivm, src_type, dst_type, TRUE,
+ &src[i*num_pack_srcs], num_pack_srcs);
+ }
+ tmp[0] = lp_build_concat(gallivm, tmp, dst_type, size_ratio);
}
}
}
@@ -522,19 +703,24 @@ lp_build_resize(struct gallivm_state *gallivm,
/*
* Do it element-wise.
*/
+ assert(src_type.length * num_srcs == dst_type.length * num_dsts);
+
+ for (i = 0; i < num_dsts; i++) {
+ tmp[i] = lp_build_undef(gallivm, dst_type);
+ }
- assert(src_type.length == dst_type.length);
- tmp[0] = lp_build_undef(gallivm, dst_type);
- for (i = 0; i < dst_type.length; ++i) {
- LLVMValueRef index = lp_build_const_int32(gallivm, i);
- LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], index, "");
+ for (i = 0; i < src_type.length; ++i) {
+ unsigned j = i / dst_type.length;
+ LLVMValueRef srcindex = lp_build_const_int32(gallivm, i);
+ LLVMValueRef dstindex = lp_build_const_int32(gallivm, i % dst_type.length);
+ LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], srcindex, "");
if (src_type.sign && dst_type.sign) {
val = LLVMBuildSExt(builder, val, lp_build_elem_type(gallivm, dst_type), "");
} else {
val = LLVMBuildZExt(builder, val, lp_build_elem_type(gallivm, dst_type), "");
}
- tmp[0] = LLVMBuildInsertElement(builder, tmp[0], val, index, "");
+ tmp[j] = LLVMBuildInsertElement(builder, tmp[j], val, dstindex, "");
}
}
}
@@ -554,3 +740,38 @@ lp_build_resize(struct gallivm_state *gallivm,
}
+/**
+ * Expands src vector from src.length to dst_length.
+ * New elements past src.length are undef (only use this when the extra
+ * elements will be overwritten or ignored by the caller).
+ */
+LLVMValueRef
+lp_build_pad_vector(struct gallivm_state *gallivm,
+                    LLVMValueRef src,
+                    struct lp_type src_type,
+                    unsigned dst_length)
+{
+   LLVMValueRef undef = LLVMGetUndef(lp_build_vec_type(gallivm, src_type));
+   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
+   unsigned i;
+
+   assert(dst_length <= Elements(elems));
+   /* >= (not >) so the equal-length no-op path below doesn't trip in
+    * debug builds */
+   assert(dst_length >= src_type.length);
+
+   if (src_type.length == dst_length)
+      return src;
+
+   /* If it's a single scalar type, no need to reinvent the wheel */
+   if (src_type.length == 1) {
+      return lp_build_broadcast(gallivm, LLVMVectorType(lp_build_elem_type(gallivm, src_type), dst_length), src);
+   }
+
+   /* All elements from src vector */
+   for (i = 0; i < src_type.length; ++i)
+      elems[i] = lp_build_const_int32(gallivm, i);
+
+   /* Undef fill remaining space */
+   for (i = src_type.length; i < dst_length; ++i)
+      elems[i] = lp_build_const_int32(gallivm, src_type.length);
+
+   /* Combine the two vectors */
+   return LLVMBuildShuffleVector(gallivm->builder, src, undef, LLVMConstVector(elems, dst_length), "");
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.h b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
index d58da4f01b3..73f299cca11 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
@@ -44,6 +44,12 @@
struct lp_type;
+LLVMValueRef
+lp_build_interleave2_half(struct gallivm_state *gallivm,
+ struct lp_type type,
+ LLVMValueRef a,
+ LLVMValueRef b,
+ unsigned lo_hi);
LLVMValueRef
lp_build_interleave2(struct gallivm_state *gallivm,
@@ -69,6 +75,17 @@ lp_build_unpack(struct gallivm_state *gallivm,
LLVMValueRef src,
LLVMValueRef *dst, unsigned num_dsts);
+LLVMValueRef
+lp_build_extract_range(struct gallivm_state *gallivm,
+ LLVMValueRef src,
+ unsigned start,
+ unsigned size);
+
+LLVMValueRef
+lp_build_concat(struct gallivm_state *gallivm,
+ LLVMValueRef src[],
+ struct lp_type src_type,
+ unsigned num_vectors);
LLVMValueRef
lp_build_packs2(struct gallivm_state *gallivm,
@@ -102,4 +119,10 @@ lp_build_resize(struct gallivm_state *gallivm,
LLVMValueRef *dst, unsigned num_dsts);
+LLVMValueRef
+lp_build_pad_vector(struct gallivm_state *gallivm,
+ LLVMValueRef src,
+ struct lp_type src_type,
+ unsigned dst_length);
+
#endif /* !LP_BLD_PACK_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_quad.c b/src/gallium/auxiliary/gallivm/lp_bld_quad.c
index b0a5bc0267f..b1ba7c72655 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_quad.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_quad.c
@@ -26,6 +26,7 @@
**************************************************************************/
+#include "u_cpu_detect.h"
#include "lp_bld_type.h"
#include "lp_bld_arit.h"
#include "lp_bld_const.h"
@@ -77,34 +78,82 @@ lp_build_ddy(struct lp_build_context *bld,
return lp_build_sub(bld, a_bottom, a_top);
}
-
+/*
+ * To be able to handle multiple quads at once in texture sampling and
+ * do lod calculations per quad, it is necessary to get the per-quad
+ * derivatives into the lp_build_rho function.
+ * For 8-wide vectors the packed derivative values for 3 coords would
+ * look like this (this scales to an arbitrary, multiple-of-4, vector size):
+ * ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy
+ * dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____
+ * The second vector will be unused for 1d and 2d textures.
+ */
LLVMValueRef
-lp_build_scalar_ddx(struct lp_build_context *bld,
- LLVMValueRef a)
+lp_build_packed_ddx_ddy_onecoord(struct lp_build_context *bld,
+ LLVMValueRef a)
{
- LLVMBuilderRef builder = bld->gallivm->builder;
- LLVMValueRef idx_left = lp_build_const_int32(bld->gallivm, LP_BLD_QUAD_TOP_LEFT);
- LLVMValueRef idx_right = lp_build_const_int32(bld->gallivm, LP_BLD_QUAD_TOP_RIGHT);
- LLVMValueRef a_left = LLVMBuildExtractElement(builder, a, idx_left, "left");
- LLVMValueRef a_right = LLVMBuildExtractElement(builder, a, idx_right, "right");
+ struct gallivm_state *gallivm = bld->gallivm;
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef vec1, vec2;
+
+ /* same packing as _twocoord, but can use aos swizzle helper */
+
+ /*
+ * XXX could make swizzle1 a noop swizzle by using right top/bottom
+ * pair for ddy
+ */
+ static const unsigned char swizzle1[] = {
+ LP_BLD_QUAD_TOP_LEFT, LP_BLD_QUAD_TOP_LEFT,
+ LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+ };
+ static const unsigned char swizzle2[] = {
+ LP_BLD_QUAD_TOP_RIGHT, LP_BLD_QUAD_BOTTOM_LEFT,
+ LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+ };
+
+ vec1 = lp_build_swizzle_aos(bld, a, swizzle1);
+ vec2 = lp_build_swizzle_aos(bld, a, swizzle2);
+
if (bld->type.floating)
- return LLVMBuildFSub(builder, a_right, a_left, "ddx");
+ return LLVMBuildFSub(builder, vec2, vec1, "ddxddy");
else
- return LLVMBuildSub(builder, a_right, a_left, "ddx");
+ return LLVMBuildSub(builder, vec2, vec1, "ddxddy");
}
LLVMValueRef
-lp_build_scalar_ddy(struct lp_build_context *bld,
- LLVMValueRef a)
+lp_build_packed_ddx_ddy_twocoord(struct lp_build_context *bld,
+ LLVMValueRef a, LLVMValueRef b)
{
- LLVMBuilderRef builder = bld->gallivm->builder;
- LLVMValueRef idx_top = lp_build_const_int32(bld->gallivm, LP_BLD_QUAD_TOP_LEFT);
- LLVMValueRef idx_bottom = lp_build_const_int32(bld->gallivm, LP_BLD_QUAD_BOTTOM_LEFT);
- LLVMValueRef a_top = LLVMBuildExtractElement(builder, a, idx_top, "top");
- LLVMValueRef a_bottom = LLVMBuildExtractElement(builder, a, idx_bottom, "bottom");
+ struct gallivm_state *gallivm = bld->gallivm;
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH/4];
+ LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH/4];
+ LLVMValueRef vec1, vec2;
+ unsigned length, num_quads, i;
+
+ /* XXX: do hsub version */
+ length = bld->type.length;
+ num_quads = length / 4;
+ for (i = 0; i < num_quads; i++) {
+ unsigned s1 = 4 * i;
+ unsigned s2 = 4 * i + length;
+ shuffles1[4*i + 0] = lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_LEFT + s1);
+ shuffles1[4*i + 1] = lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_LEFT + s1);
+ shuffles1[4*i + 2] = lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_LEFT + s2);
+ shuffles1[4*i + 3] = lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_LEFT + s2);
+ shuffles2[4*i + 0] = lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_RIGHT + s1);
+ shuffles2[4*i + 1] = lp_build_const_int32(gallivm, LP_BLD_QUAD_BOTTOM_LEFT + s1);
+ shuffles2[4*i + 2] = lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_RIGHT + s2);
+ shuffles2[4*i + 3] = lp_build_const_int32(gallivm, LP_BLD_QUAD_BOTTOM_LEFT + s2);
+ }
+ vec1 = LLVMBuildShuffleVector(builder, a, b,
+ LLVMConstVector(shuffles1, length), "");
+ vec2 = LLVMBuildShuffleVector(builder, a, b,
+ LLVMConstVector(shuffles2, length), "");
if (bld->type.floating)
- return LLVMBuildFSub(builder, a_bottom, a_top, "ddy");
+ return LLVMBuildFSub(builder, vec2, vec1, "ddxddyddxddy");
else
- return LLVMBuildSub(builder, a_bottom, a_top, "ddy");
+ return LLVMBuildSub(builder, vec2, vec1, "ddxddyddxddy");
}
+
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_quad.h b/src/gallium/auxiliary/gallivm/lp_bld_quad.h
index b7992912927..be6a1efc396 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_quad.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_quad.h
@@ -78,19 +78,15 @@ lp_build_ddy(struct lp_build_context *bld,
/*
- * Scalar derivatives.
- *
- * Same as getting the first value of above.
+ * Packed derivatives (one derivative for each direction per quad)
*/
-
LLVMValueRef
-lp_build_scalar_ddx(struct lp_build_context *bld,
- LLVMValueRef a);
-
+lp_build_packed_ddx_ddy_twocoord(struct lp_build_context *bld,
+ LLVMValueRef a, LLVMValueRef b);
LLVMValueRef
-lp_build_scalar_ddy(struct lp_build_context *bld,
- LLVMValueRef a);
+lp_build_packed_ddx_ddy_onecoord(struct lp_build_context *bld,
+ LLVMValueRef a);
#endif /* LP_BLD_QUAD_H_ */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index d966788d74e..85211161f3c 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -44,6 +44,8 @@
#include "lp_bld_sample.h"
#include "lp_bld_swizzle.h"
#include "lp_bld_type.h"
+#include "lp_bld_logic.h"
+#include "lp_bld_pack.h"
/*
@@ -175,67 +177,89 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
/**
* Generate code to compute coordinate gradient (rho).
- * \param ddx partial derivatives of (s, t, r, q) with respect to X
- * \param ddy partial derivatives of (s, t, r, q) with respect to Y
+ * \param derivs partial derivatives of (s, t, r, q) with respect to X and Y
*
- * XXX: The resulting rho is scalar, so we ignore all but the first element of
- * derivatives that are passed by the shader.
+ * The resulting rho is scalar per quad.
*/
static LLVMValueRef
lp_build_rho(struct lp_build_sample_context *bld,
unsigned unit,
- const LLVMValueRef ddx[4],
- const LLVMValueRef ddy[4])
+ const struct lp_derivatives *derivs)
{
+ struct gallivm_state *gallivm = bld->gallivm;
struct lp_build_context *int_size_bld = &bld->int_size_bld;
struct lp_build_context *float_size_bld = &bld->float_size_bld;
struct lp_build_context *float_bld = &bld->float_bld;
+ struct lp_build_context *coord_bld = &bld->coord_bld;
+ struct lp_build_context *perquadf_bld = &bld->perquadf_bld;
+ const LLVMValueRef *ddx_ddy = derivs->ddx_ddy;
const unsigned dims = bld->dims;
LLVMBuilderRef builder = bld->gallivm->builder;
LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
LLVMValueRef index1 = LLVMConstInt(i32t, 1, 0);
LLVMValueRef index2 = LLVMConstInt(i32t, 2, 0);
- LLVMValueRef dsdx, dsdy, dtdx, dtdy, drdx, drdy;
- LLVMValueRef rho_x, rho_y;
LLVMValueRef rho_vec;
LLVMValueRef int_size, float_size;
LLVMValueRef rho;
LLVMValueRef first_level, first_level_vec;
+ LLVMValueRef abs_ddx_ddy[2];
+ unsigned length = coord_bld->type.length;
+ unsigned num_quads = length / 4;
+ unsigned i;
+ LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
+ LLVMValueRef rho_xvec, rho_yvec;
+
+ abs_ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]);
+ if (dims > 2) {
+ abs_ddx_ddy[1] = lp_build_abs(coord_bld, ddx_ddy[1]);
+ }
- dsdx = ddx[0];
- dsdy = ddy[0];
-
- if (dims <= 1) {
- rho_x = dsdx;
- rho_y = dsdy;
+ if (dims == 1) {
+ static const unsigned char swizzle1[] = {
+ 0, LP_BLD_SWIZZLE_DONTCARE,
+ LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+ };
+ static const unsigned char swizzle2[] = {
+ 1, LP_BLD_SWIZZLE_DONTCARE,
+ LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+ };
+ rho_xvec = lp_build_swizzle_aos(coord_bld, abs_ddx_ddy[0], swizzle1);
+ rho_yvec = lp_build_swizzle_aos(coord_bld, abs_ddx_ddy[0], swizzle2);
+ }
+ else if (dims == 2) {
+ static const unsigned char swizzle1[] = {
+ 0, 2,
+ LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+ };
+ static const unsigned char swizzle2[] = {
+ 1, 3,
+ LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+ };
+ rho_xvec = lp_build_swizzle_aos(coord_bld, abs_ddx_ddy[0], swizzle1);
+ rho_yvec = lp_build_swizzle_aos(coord_bld, abs_ddx_ddy[0], swizzle2);
}
else {
- rho_x = float_size_bld->undef;
- rho_y = float_size_bld->undef;
-
- rho_x = LLVMBuildInsertElement(builder, rho_x, dsdx, index0, "");
- rho_y = LLVMBuildInsertElement(builder, rho_y, dsdy, index0, "");
-
- dtdx = ddx[1];
- dtdy = ddy[1];
-
- rho_x = LLVMBuildInsertElement(builder, rho_x, dtdx, index1, "");
- rho_y = LLVMBuildInsertElement(builder, rho_y, dtdy, index1, "");
-
- if (dims >= 3) {
- drdx = ddx[2];
- drdy = ddy[2];
-
- rho_x = LLVMBuildInsertElement(builder, rho_x, drdx, index2, "");
- rho_y = LLVMBuildInsertElement(builder, rho_y, drdy, index2, "");
+ LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH];
+ LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH];
+ assert(dims == 3);
+ for (i = 0; i < num_quads; i++) {
+ shuffles1[4*i + 0] = lp_build_const_int32(gallivm, 4*i);
+ shuffles1[4*i + 1] = lp_build_const_int32(gallivm, 4*i + 2);
+ shuffles1[4*i + 2] = lp_build_const_int32(gallivm, length + 4*i);
+ shuffles1[4*i + 3] = i32undef;
+ shuffles2[4*i + 0] = lp_build_const_int32(gallivm, 4*i + 1);
+ shuffles2[4*i + 1] = lp_build_const_int32(gallivm, 4*i + 3);
+ shuffles2[4*i + 2] = lp_build_const_int32(gallivm, length + 4*i + 1);
+ shuffles2[4*i + 3] = i32undef;
}
+ rho_xvec = LLVMBuildShuffleVector(builder, abs_ddx_ddy[0], abs_ddx_ddy[1],
+ LLVMConstVector(shuffles1, length), "");
+ rho_yvec = LLVMBuildShuffleVector(builder, abs_ddx_ddy[0], abs_ddx_ddy[1],
+ LLVMConstVector(shuffles2, length), "");
}
- rho_x = lp_build_abs(float_size_bld, rho_x);
- rho_y = lp_build_abs(float_size_bld, rho_y);
-
- rho_vec = lp_build_max(float_size_bld, rho_x, rho_y);
+ rho_vec = lp_build_max(coord_bld, rho_xvec, rho_yvec);
first_level = bld->dynamic_state->first_level(bld->dynamic_state,
bld->gallivm, unit);
@@ -243,22 +267,77 @@ lp_build_rho(struct lp_build_sample_context *bld,
int_size = lp_build_minify(int_size_bld, bld->int_size, first_level_vec);
float_size = lp_build_int_to_float(float_size_bld, int_size);
- rho_vec = lp_build_mul(float_size_bld, rho_vec, float_size);
+ if (bld->coord_type.length > 4) {
+ /* expand size to each quad */
+ if (dims > 1) {
+ /* could use some broadcast_vector helper for this? */
+ int num_quads = bld->coord_type.length / 4;
+ LLVMValueRef src[LP_MAX_VECTOR_LENGTH/4];
+ for (i = 0; i < num_quads; i++) {
+ src[i] = float_size;
+ }
+ float_size = lp_build_concat(bld->gallivm, src, float_size_bld->type, num_quads);
+ }
+ else {
+ float_size = lp_build_broadcast_scalar(coord_bld, float_size);
+ }
+ rho_vec = lp_build_mul(coord_bld, rho_vec, float_size);
- if (dims <= 1) {
- rho = rho_vec;
+ if (dims <= 1) {
+ rho = rho_vec;
+ }
+ else {
+ if (dims >= 2) {
+ static const unsigned char swizzle1[] = {
+ 0, LP_BLD_SWIZZLE_DONTCARE,
+ LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+ };
+ static const unsigned char swizzle2[] = {
+ 1, LP_BLD_SWIZZLE_DONTCARE,
+ LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+ };
+ LLVMValueRef rho_s, rho_t, rho_r;
+
+ rho_s = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);
+ rho_t = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle2);
+
+ rho = lp_build_max(coord_bld, rho_s, rho_t);
+
+ if (dims >= 3) {
+ static const unsigned char swizzle3[] = {
+ 2, LP_BLD_SWIZZLE_DONTCARE,
+ LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+ };
+ rho_r = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle3);
+ rho = lp_build_max(coord_bld, rho, rho_r);
+ }
+ }
+ }
+ rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
+ perquadf_bld->type, rho);
}
else {
- if (dims >= 2) {
- LLVMValueRef rho_s, rho_t, rho_r;
+ if (dims <= 1) {
+ rho_vec = LLVMBuildExtractElement(builder, rho_vec, index0, "");
+ }
+ rho_vec = lp_build_mul(float_size_bld, rho_vec, float_size);
- rho_s = LLVMBuildExtractElement(builder, rho_vec, index0, "");
- rho_t = LLVMBuildExtractElement(builder, rho_vec, index1, "");
+ if (dims <= 1) {
+ rho = rho_vec;
+ }
+ else {
+ if (dims >= 2) {
+ LLVMValueRef rho_s, rho_t, rho_r;
+
+ rho_s = LLVMBuildExtractElement(builder, rho_vec, index0, "");
+ rho_t = LLVMBuildExtractElement(builder, rho_vec, index1, "");
- rho = lp_build_max(float_bld, rho_s, rho_t);
- if (dims >= 3) {
- rho_r = LLVMBuildExtractElement(builder, rho_vec, index2, "");
- rho = lp_build_max(float_bld, rho, rho_r);
+ rho = lp_build_max(float_bld, rho_s, rho_t);
+
+ if (dims >= 3) {
+ rho_r = LLVMBuildExtractElement(builder, rho_vec, index2, "");
+ rho = lp_build_max(float_bld, rho, rho_r);
+ }
}
}
}
@@ -396,22 +475,20 @@ lp_build_brilinear_rho(struct lp_build_context *bld,
/**
* Generate code to compute texture level of detail (lambda).
- * \param ddx partial derivatives of (s, t, r, q) with respect to X
- * \param ddy partial derivatives of (s, t, r, q) with respect to Y
+ * \param derivs partial derivatives of (s, t, r, q) with respect to X and Y
* \param lod_bias optional float vector with the shader lod bias
* \param explicit_lod optional float vector with the explicit lod
* \param width scalar int texture width
* \param height scalar int texture height
* \param depth scalar int texture depth
*
- * XXX: The resulting lod is scalar, so ignore all but the first element of
- * derivatives, lod_bias, etc that are passed by the shader.
+ * The resulting lod is scalar per quad, so only the first value per quad
+ * passed in from lod_bias, explicit_lod is used.
*/
void
lp_build_lod_selector(struct lp_build_sample_context *bld,
unsigned unit,
- const LLVMValueRef ddx[4],
- const LLVMValueRef ddy[4],
+ const struct lp_derivatives *derivs,
LLVMValueRef lod_bias, /* optional */
LLVMValueRef explicit_lod, /* optional */
unsigned mip_filter,
@@ -420,11 +497,11 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
{
LLVMBuilderRef builder = bld->gallivm->builder;
- struct lp_build_context *float_bld = &bld->float_bld;
+ struct lp_build_context *perquadf_bld = &bld->perquadf_bld;
LLVMValueRef lod;
- *out_lod_ipart = bld->int_bld.zero;
- *out_lod_fpart = bld->float_bld.zero;
+ *out_lod_ipart = bld->perquadi_bld.zero;
+ *out_lod_fpart = perquadf_bld->zero;
if (bld->static_state->min_max_lod_equal) {
/* User is forcing sampling from a particular mipmap level.
@@ -433,21 +510,17 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
LLVMValueRef min_lod =
bld->dynamic_state->min_lod(bld->dynamic_state, bld->gallivm, unit);
- lod = min_lod;
+ lod = lp_build_broadcast_scalar(perquadf_bld, min_lod);
}
else {
- LLVMValueRef sampler_lod_bias =
- bld->dynamic_state->lod_bias(bld->dynamic_state, bld->gallivm, unit);
- LLVMValueRef index0 = lp_build_const_int32(bld->gallivm, 0);
-
if (explicit_lod) {
- lod = LLVMBuildExtractElement(builder, explicit_lod,
- index0, "");
+ lod = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
+ perquadf_bld->type, explicit_lod);
}
else {
LLVMValueRef rho;
- rho = lp_build_rho(bld, unit, ddx, ddy);
+ rho = lp_build_rho(bld, unit, derivs);
/*
* Compute lod = log2(rho)
@@ -465,66 +538,72 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
if (mip_filter == PIPE_TEX_MIPFILTER_NONE ||
mip_filter == PIPE_TEX_MIPFILTER_NEAREST) {
- *out_lod_ipart = lp_build_ilog2(float_bld, rho);
- *out_lod_fpart = bld->float_bld.zero;
+ *out_lod_ipart = lp_build_ilog2(perquadf_bld, rho);
+ *out_lod_fpart = perquadf_bld->zero;
return;
}
if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR &&
!(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) {
- lp_build_brilinear_rho(float_bld, rho, BRILINEAR_FACTOR,
+ lp_build_brilinear_rho(perquadf_bld, rho, BRILINEAR_FACTOR,
out_lod_ipart, out_lod_fpart);
return;
}
}
if (0) {
- lod = lp_build_log2(float_bld, rho);
+ lod = lp_build_log2(perquadf_bld, rho);
}
else {
- lod = lp_build_fast_log2(float_bld, rho);
+ lod = lp_build_fast_log2(perquadf_bld, rho);
}
/* add shader lod bias */
if (lod_bias) {
- lod_bias = LLVMBuildExtractElement(builder, lod_bias,
- index0, "");
+ lod_bias = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
+ perquadf_bld->type, lod_bias);
lod = LLVMBuildFAdd(builder, lod, lod_bias, "shader_lod_bias");
}
}
/* add sampler lod bias */
- if (bld->static_state->lod_bias_non_zero)
+ if (bld->static_state->lod_bias_non_zero) {
+ LLVMValueRef sampler_lod_bias =
+ bld->dynamic_state->lod_bias(bld->dynamic_state, bld->gallivm, unit);
+ sampler_lod_bias = lp_build_broadcast_scalar(perquadf_bld,
+ sampler_lod_bias);
lod = LLVMBuildFAdd(builder, lod, sampler_lod_bias, "sampler_lod_bias");
-
+ }
/* clamp lod */
if (bld->static_state->apply_max_lod) {
LLVMValueRef max_lod =
bld->dynamic_state->max_lod(bld->dynamic_state, bld->gallivm, unit);
+ max_lod = lp_build_broadcast_scalar(perquadf_bld, max_lod);
- lod = lp_build_min(float_bld, lod, max_lod);
+ lod = lp_build_min(perquadf_bld, lod, max_lod);
}
if (bld->static_state->apply_min_lod) {
LLVMValueRef min_lod =
bld->dynamic_state->min_lod(bld->dynamic_state, bld->gallivm, unit);
+ min_lod = lp_build_broadcast_scalar(perquadf_bld, min_lod);
- lod = lp_build_max(float_bld, lod, min_lod);
+ lod = lp_build_max(perquadf_bld, lod, min_lod);
}
}
if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
if (!(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) {
- lp_build_brilinear_lod(float_bld, lod, BRILINEAR_FACTOR,
+ lp_build_brilinear_lod(perquadf_bld, lod, BRILINEAR_FACTOR,
out_lod_ipart, out_lod_fpart);
}
else {
- lp_build_ifloor_fract(float_bld, lod, out_lod_ipart, out_lod_fpart);
+ lp_build_ifloor_fract(perquadf_bld, lod, out_lod_ipart, out_lod_fpart);
}
lp_build_name(*out_lod_fpart, "lod_fpart");
}
else {
- *out_lod_ipart = lp_build_iround(float_bld, lod);
+ *out_lod_ipart = lp_build_iround(perquadf_bld, lod);
}
lp_build_name(*out_lod_ipart, "lod_ipart");
@@ -536,8 +615,8 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
/**
* For PIPE_TEX_MIPFILTER_NEAREST, convert float LOD to integer
* mipmap level index.
- * Note: this is all scalar code.
- * \param lod scalar float texture level of detail
+ * Note: this is all scalar per quad code.
+ * \param lod_ipart int texture level of detail
* \param level_out returns integer
*/
void
@@ -546,26 +625,27 @@ lp_build_nearest_mip_level(struct lp_build_sample_context *bld,
LLVMValueRef lod_ipart,
LLVMValueRef *level_out)
{
- struct lp_build_context *int_bld = &bld->int_bld;
+ struct lp_build_context *perquadi_bld = &bld->perquadi_bld;
LLVMValueRef first_level, last_level, level;
first_level = bld->dynamic_state->first_level(bld->dynamic_state,
bld->gallivm, unit);
last_level = bld->dynamic_state->last_level(bld->dynamic_state,
bld->gallivm, unit);
+ first_level = lp_build_broadcast_scalar(perquadi_bld, first_level);
+ last_level = lp_build_broadcast_scalar(perquadi_bld, last_level);
- /* convert float lod to integer */
- level = lp_build_add(int_bld, lod_ipart, first_level);
+ level = lp_build_add(perquadi_bld, lod_ipart, first_level);
/* clamp level to legal range of levels */
- *level_out = lp_build_clamp(int_bld, level, first_level, last_level);
+ *level_out = lp_build_clamp(perquadi_bld, level, first_level, last_level);
}
/**
- * For PIPE_TEX_MIPFILTER_LINEAR, convert float LOD to integer to
- * two (adjacent) mipmap level indexes. Later, we'll sample from those
- * two mipmap levels and interpolate between them.
+ * For PIPE_TEX_MIPFILTER_LINEAR, convert per-quad int LOD(s) to two (per-quad)
+ * (adjacent) mipmap level indexes, and fix up float lod part accordingly.
+ * Later, we'll sample from those two mipmap levels and interpolate between them.
*/
void
lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
@@ -576,20 +656,21 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
LLVMValueRef *level1_out)
{
LLVMBuilderRef builder = bld->gallivm->builder;
- struct lp_build_context *int_bld = &bld->int_bld;
- struct lp_build_context *float_bld = &bld->float_bld;
+ struct lp_build_context *perquadi_bld = &bld->perquadi_bld;
+ struct lp_build_context *perquadf_bld = &bld->perquadf_bld;
LLVMValueRef first_level, last_level;
LLVMValueRef clamp_min;
LLVMValueRef clamp_max;
first_level = bld->dynamic_state->first_level(bld->dynamic_state,
bld->gallivm, unit);
-
- *level0_out = lp_build_add(int_bld, lod_ipart, first_level);
- *level1_out = lp_build_add(int_bld, *level0_out, int_bld->one);
-
last_level = bld->dynamic_state->last_level(bld->dynamic_state,
bld->gallivm, unit);
+ first_level = lp_build_broadcast_scalar(perquadi_bld, first_level);
+ last_level = lp_build_broadcast_scalar(perquadi_bld, last_level);
+
+ *level0_out = lp_build_add(perquadi_bld, lod_ipart, first_level);
+ *level1_out = lp_build_add(perquadi_bld, *level0_out, perquadi_bld->one);
/*
* Clamp both *level0_out and *level1_out to [first_level, last_level], with
@@ -597,6 +678,15 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
* ends in the process.
*/
+ /*
+ * This code (vector select in particular) only works with llvm 3.1
+ * (if there's more than one quad, with x86 backend). Might consider
+ * converting to our lp_bld_logic helpers.
+ */
+#if HAVE_LLVM < 0x0301
+ assert(perquadi_bld->type.length == 1);
+#endif
+
/* *level0_out < first_level */
clamp_min = LLVMBuildICmp(builder, LLVMIntSLT,
*level0_out, first_level,
@@ -609,7 +699,7 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
first_level, *level1_out, "");
*lod_fpart_inout = LLVMBuildSelect(builder, clamp_min,
- float_bld->zero, *lod_fpart_inout, "");
+ perquadf_bld->zero, *lod_fpart_inout, "");
/* *level0_out >= last_level */
clamp_max = LLVMBuildICmp(builder, LLVMIntSGE,
@@ -623,7 +713,7 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
last_level, *level1_out, "");
*lod_fpart_inout = LLVMBuildSelect(builder, clamp_max,
- float_bld->zero, *lod_fpart_inout, "");
+ perquadf_bld->zero, *lod_fpart_inout, "");
lp_build_name(*level0_out, "sampler%u_miplevel0", unit);
lp_build_name(*level1_out, "sampler%u_miplevel1", unit);
@@ -651,15 +741,6 @@ lp_build_get_mipmap_level(struct lp_build_sample_context *bld,
}
-LLVMValueRef
-lp_build_get_const_mipmap_level(struct lp_build_sample_context *bld,
- int level)
-{
- LLVMValueRef lvl = lp_build_const_int32(bld->gallivm, level);
- return lp_build_get_mipmap_level(bld, lvl);
-}
-
-
/**
* Codegen equivalent for u_minify().
* Return max(1, base_size >> level);
@@ -748,8 +829,7 @@ lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
* bld->int_size_type or bld->float_size_type)
* @param coord_type type of the texture size vector (either
* bld->int_coord_type or bld->coord_type)
- * @param int_size vector with the integer texture size (width, height,
- * depth)
+ * @param size vector with the texture size (width, height, depth)
*/
void
lp_build_extract_image_sizes(struct lp_build_sample_context *bld,
@@ -788,7 +868,7 @@ lp_build_extract_image_sizes(struct lp_build_sample_context *bld,
/**
* Unnormalize coords.
*
- * @param int_size vector with the integer texture size (width, height, depth)
+ * @param flt_size vector with the float texture size (width, height, depth)
*/
void
lp_build_unnormalized_coords(struct lp_build_sample_context *bld,
@@ -823,7 +903,18 @@ lp_build_unnormalized_coords(struct lp_build_sample_context *bld,
/** Helper used by lp_build_cube_lookup() */
static LLVMValueRef
-lp_build_cube_ima(struct lp_build_context *coord_bld, LLVMValueRef coord)
+lp_build_cube_imapos(struct lp_build_context *coord_bld, LLVMValueRef coord)
+{
+ /* ima = +0.5 / abs(coord); */
+ LLVMValueRef posHalf = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, 0.5);
+ LLVMValueRef absCoord = lp_build_abs(coord_bld, coord);
+ LLVMValueRef ima = lp_build_div(coord_bld, posHalf, absCoord);
+ return ima;
+}
+
+/** Helper used by lp_build_cube_lookup() */
+static LLVMValueRef
+lp_build_cube_imaneg(struct lp_build_context *coord_bld, LLVMValueRef coord)
{
/* ima = -0.5 / abs(coord); */
LLVMValueRef negHalf = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, -0.5);
@@ -832,9 +923,12 @@ lp_build_cube_ima(struct lp_build_context *coord_bld, LLVMValueRef coord)
return ima;
}
-
/**
* Helper used by lp_build_cube_lookup()
+ * FIXME: the sign here can also be 0.
+ * Arithmetically this could definitely make a difference. Either
+ * fix the comment or use other (simpler) sign function, not sure
+ * which one it should be.
* \param sign scalar +1 or -1
* \param coord float vector
* \param ima float vector
@@ -898,58 +992,186 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
LLVMValueRef *face_s,
LLVMValueRef *face_t)
{
- struct lp_build_context *float_bld = &bld->float_bld;
struct lp_build_context *coord_bld = &bld->coord_bld;
LLVMBuilderRef builder = bld->gallivm->builder;
+ struct gallivm_state *gallivm = bld->gallivm;
LLVMValueRef rx, ry, rz;
- LLVMValueRef arx, ary, arz;
- LLVMValueRef c25 = lp_build_const_float(bld->gallivm, 0.25);
- LLVMValueRef arx_ge_ary, arx_ge_arz;
- LLVMValueRef ary_ge_arx, ary_ge_arz;
- LLVMValueRef arx_ge_ary_arz, ary_ge_arx_arz;
-
- assert(bld->coord_bld.type.length == 4);
+ LLVMValueRef tmp[4], rxyz, arxyz;
/*
* Use the average of the four pixel's texcoords to choose the face.
+ * Slight simplification just calculate the sum, skip scaling.
*/
- rx = lp_build_mul(float_bld, c25,
- lp_build_sum_vector(&bld->coord_bld, s));
- ry = lp_build_mul(float_bld, c25,
- lp_build_sum_vector(&bld->coord_bld, t));
- rz = lp_build_mul(float_bld, c25,
- lp_build_sum_vector(&bld->coord_bld, r));
+ tmp[0] = s;
+ tmp[1] = t;
+ tmp[2] = r;
+ rxyz = lp_build_hadd_partial4(&bld->coord_bld, tmp, 3);
+ arxyz = lp_build_abs(&bld->coord_bld, rxyz);
+
+ if (coord_bld->type.length > 4) {
+ struct lp_build_context *cint_bld = &bld->int_coord_bld;
+ struct lp_type intctype = cint_bld->type;
+ LLVMValueRef signrxs, signrys, signrzs, signrxyz, sign;
+ LLVMValueRef arxs, arys, arzs;
+ LLVMValueRef arx_ge_ary, maxarxsarys, arz_ge_arx_ary;
+ LLVMValueRef snewx, tnewx, snewy, tnewy, snewz, tnewz;
+ LLVMValueRef ryneg, rzneg;
+ LLVMValueRef ma, ima;
+ LLVMValueRef posHalf = lp_build_const_vec(gallivm, coord_bld->type, 0.5);
+ LLVMValueRef signmask = lp_build_const_int_vec(gallivm, intctype,
+ 1 << (intctype.width - 1));
+ LLVMValueRef signshift = lp_build_const_int_vec(gallivm, intctype,
+ intctype.width -1);
+ LLVMValueRef facex = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_X);
+ LLVMValueRef facey = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Y);
+ LLVMValueRef facez = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Z);
+
+ assert(PIPE_TEX_FACE_NEG_X == PIPE_TEX_FACE_POS_X + 1);
+ assert(PIPE_TEX_FACE_NEG_Y == PIPE_TEX_FACE_POS_Y + 1);
+ assert(PIPE_TEX_FACE_NEG_Z == PIPE_TEX_FACE_POS_Z + 1);
+
+ rx = LLVMBuildBitCast(builder, s, lp_build_vec_type(gallivm, intctype), "");
+ ry = LLVMBuildBitCast(builder, t, lp_build_vec_type(gallivm, intctype), "");
+ rz = LLVMBuildBitCast(builder, r, lp_build_vec_type(gallivm, intctype), "");
+ ryneg = LLVMBuildXor(builder, ry, signmask, "");
+ rzneg = LLVMBuildXor(builder, rz, signmask, "");
+
+ /* the sign bit comes from the averaged vector (per quad),
+ * as does the decision which face to use */
+ signrxyz = LLVMBuildBitCast(builder, rxyz, lp_build_vec_type(gallivm, intctype), "");
+ signrxyz = LLVMBuildAnd(builder, signrxyz, signmask, "");
+
+ arxs = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 0);
+ arys = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 1);
+ arzs = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 2);
- arx = lp_build_abs(float_bld, rx);
- ary = lp_build_abs(float_bld, ry);
- arz = lp_build_abs(float_bld, rz);
+ /*
+ * select x if x >= y else select y
+ * select previous result if y >= max(x,y) else select z
+ */
+ arx_ge_ary = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, arxs, arys);
+ maxarxsarys = lp_build_max(coord_bld, arxs, arys);
+ arz_ge_arx_ary = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, maxarxsarys, arzs);
- /*
- * Compare sign/magnitude of rx,ry,rz to determine face
- */
- arx_ge_ary = LLVMBuildFCmp(builder, LLVMRealUGE, arx, ary, "");
- arx_ge_arz = LLVMBuildFCmp(builder, LLVMRealUGE, arx, arz, "");
- ary_ge_arx = LLVMBuildFCmp(builder, LLVMRealUGE, ary, arx, "");
- ary_ge_arz = LLVMBuildFCmp(builder, LLVMRealUGE, ary, arz, "");
+ /*
+ * compute all possible new s/t coords
+ * snewx = signrx * -rz;
+ * tnewx = -ry;
+ * snewy = rx;
+ * tnewy = signry * rz;
+ * snewz = signrz * rx;
+ * tnewz = -ry;
+ */
+ signrxs = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 0);
+ snewx = LLVMBuildXor(builder, signrxs, rzneg, "");
+ tnewx = ryneg;
+
+ signrys = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 1);
+ snewy = rx;
+ tnewy = LLVMBuildXor(builder, signrys, rz, "");
+
+ signrzs = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 2);
+ snewz = LLVMBuildXor(builder, signrzs, rx, "");
+ tnewz = ryneg;
+
+ /* XXX on x86 unclear if we should cast the values back to float
+ * or not - on some cpus (nehalem) pblendvb has twice the throughput
+ * of blendvps though on others there just might be domain
+ * transition penalties when using it (this depends on what llvm
+ * will chose for the bit ops above so there appears no "right way",
+ * but given the boatload of selects let's just use the int type).
+ *
+ * Unfortunately we also need the sign bit of the summed coords.
+ */
+ *face_s = lp_build_select(cint_bld, arx_ge_ary, snewx, snewy);
+ *face_t = lp_build_select(cint_bld, arx_ge_ary, tnewx, tnewy);
+ ma = lp_build_select(coord_bld, arx_ge_ary, s, t);
+ *face = lp_build_select(cint_bld, arx_ge_ary, facex, facey);
+ sign = lp_build_select(cint_bld, arx_ge_ary, signrxs, signrys);
+
+ *face_s = lp_build_select(cint_bld, arz_ge_arx_ary, *face_s, snewz);
+ *face_t = lp_build_select(cint_bld, arz_ge_arx_ary, *face_t, tnewz);
+ ma = lp_build_select(coord_bld, arz_ge_arx_ary, ma, r);
+ *face = lp_build_select(cint_bld, arz_ge_arx_ary, *face, facez);
+ sign = lp_build_select(cint_bld, arz_ge_arx_ary, sign, signrzs);
+
+ *face_s = LLVMBuildBitCast(builder, *face_s,
+ lp_build_vec_type(gallivm, coord_bld->type), "");
+ *face_t = LLVMBuildBitCast(builder, *face_t,
+ lp_build_vec_type(gallivm, coord_bld->type), "");
+
+ /* add +1 for neg face */
+ /* XXX with AVX probably want to use another select here -
+ * as long as we ensure vblendvps gets used we can actually
+ * skip the comparison and just use sign as a "mask" directly.
+ */
+ sign = LLVMBuildLShr(builder, sign, signshift, "");
+ *face = LLVMBuildOr(builder, *face, sign, "face");
- arx_ge_ary_arz = LLVMBuildAnd(builder, arx_ge_ary, arx_ge_arz, "");
- ary_ge_arx_arz = LLVMBuildAnd(builder, ary_ge_arx, ary_ge_arz, "");
+ ima = lp_build_cube_imapos(coord_bld, ma);
+
+ *face_s = lp_build_mul(coord_bld, *face_s, ima);
+ *face_s = lp_build_add(coord_bld, *face_s, posHalf);
+ *face_t = lp_build_mul(coord_bld, *face_t, ima);
+ *face_t = lp_build_add(coord_bld, *face_t, posHalf);
+ }
- {
+ else {
struct lp_build_if_state if_ctx;
LLVMValueRef face_s_var;
LLVMValueRef face_t_var;
LLVMValueRef face_var;
-
- face_s_var = lp_build_alloca(bld->gallivm, bld->coord_bld.vec_type, "face_s_var");
- face_t_var = lp_build_alloca(bld->gallivm, bld->coord_bld.vec_type, "face_t_var");
- face_var = lp_build_alloca(bld->gallivm, bld->int_bld.vec_type, "face_var");
-
- lp_build_if(&if_ctx, bld->gallivm, arx_ge_ary_arz);
+ LLVMValueRef arx_ge_ary_arz, ary_ge_arx_arz;
+ LLVMValueRef shuffles[4];
+ LLVMValueRef arxy_ge_aryx, arxy_ge_arzz, arxy_ge_arxy_arzz;
+ LLVMValueRef arxyxy, aryxzz, arxyxy_ge_aryxzz;
+ struct lp_build_context *float_bld = &bld->float_bld;
+
+ assert(bld->coord_bld.type.length == 4);
+
+ shuffles[0] = lp_build_const_int32(gallivm, 0);
+ shuffles[1] = lp_build_const_int32(gallivm, 1);
+ shuffles[2] = lp_build_const_int32(gallivm, 0);
+ shuffles[3] = lp_build_const_int32(gallivm, 1);
+ arxyxy = LLVMBuildShuffleVector(builder, arxyz, arxyz, LLVMConstVector(shuffles, 4), "");
+ shuffles[0] = lp_build_const_int32(gallivm, 1);
+ shuffles[1] = lp_build_const_int32(gallivm, 0);
+ shuffles[2] = lp_build_const_int32(gallivm, 2);
+ shuffles[3] = lp_build_const_int32(gallivm, 2);
+ aryxzz = LLVMBuildShuffleVector(builder, arxyz, arxyz, LLVMConstVector(shuffles, 4), "");
+ arxyxy_ge_aryxzz = lp_build_cmp(&bld->coord_bld, PIPE_FUNC_GEQUAL, arxyxy, aryxzz);
+
+ shuffles[0] = lp_build_const_int32(gallivm, 0);
+ shuffles[1] = lp_build_const_int32(gallivm, 1);
+ arxy_ge_aryx = LLVMBuildShuffleVector(builder, arxyxy_ge_aryxzz, arxyxy_ge_aryxzz,
+ LLVMConstVector(shuffles, 2), "");
+ shuffles[0] = lp_build_const_int32(gallivm, 2);
+ shuffles[1] = lp_build_const_int32(gallivm, 3);
+ arxy_ge_arzz = LLVMBuildShuffleVector(builder, arxyxy_ge_aryxzz, arxyxy_ge_aryxzz,
+ LLVMConstVector(shuffles, 2), "");
+ arxy_ge_arxy_arzz = LLVMBuildAnd(builder, arxy_ge_aryx, arxy_ge_arzz, "");
+
+ arx_ge_ary_arz = LLVMBuildExtractElement(builder, arxy_ge_arxy_arzz,
+ lp_build_const_int32(gallivm, 0), "");
+ arx_ge_ary_arz = LLVMBuildICmp(builder, LLVMIntNE, arx_ge_ary_arz,
+ lp_build_const_int32(gallivm, 0), "");
+ ary_ge_arx_arz = LLVMBuildExtractElement(builder, arxy_ge_arxy_arzz,
+ lp_build_const_int32(gallivm, 1), "");
+ ary_ge_arx_arz = LLVMBuildICmp(builder, LLVMIntNE, ary_ge_arx_arz,
+ lp_build_const_int32(gallivm, 0), "");
+ face_s_var = lp_build_alloca(gallivm, bld->coord_bld.vec_type, "face_s_var");
+ face_t_var = lp_build_alloca(gallivm, bld->coord_bld.vec_type, "face_t_var");
+ face_var = lp_build_alloca(gallivm, bld->int_bld.vec_type, "face_var");
+
+ lp_build_if(&if_ctx, gallivm, arx_ge_ary_arz);
{
/* +/- X face */
- LLVMValueRef sign = lp_build_sgn(float_bld, rx);
- LLVMValueRef ima = lp_build_cube_ima(coord_bld, s);
+ LLVMValueRef sign, ima;
+ rx = LLVMBuildExtractElement(builder, rxyz,
+ lp_build_const_int32(gallivm, 0), "");
+ /* +/- X face */
+ sign = lp_build_sgn(float_bld, rx);
+ ima = lp_build_cube_imaneg(coord_bld, s);
*face_s = lp_build_cube_coord(coord_bld, sign, +1, r, ima);
*face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
*face = lp_build_cube_face(bld, rx,
@@ -963,11 +1185,14 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
{
struct lp_build_if_state if_ctx2;
- lp_build_if(&if_ctx2, bld->gallivm, ary_ge_arx_arz);
+ lp_build_if(&if_ctx2, gallivm, ary_ge_arx_arz);
{
+ LLVMValueRef sign, ima;
/* +/- Y face */
- LLVMValueRef sign = lp_build_sgn(float_bld, ry);
- LLVMValueRef ima = lp_build_cube_ima(coord_bld, t);
+ ry = LLVMBuildExtractElement(builder, rxyz,
+ lp_build_const_int32(gallivm, 1), "");
+ sign = lp_build_sgn(float_bld, ry);
+ ima = lp_build_cube_imaneg(coord_bld, t);
*face_s = lp_build_cube_coord(coord_bld, NULL, -1, s, ima);
*face_t = lp_build_cube_coord(coord_bld, sign, -1, r, ima);
*face = lp_build_cube_face(bld, ry,
@@ -980,8 +1205,11 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
lp_build_else(&if_ctx2);
{
/* +/- Z face */
- LLVMValueRef sign = lp_build_sgn(float_bld, rz);
- LLVMValueRef ima = lp_build_cube_ima(coord_bld, r);
+ LLVMValueRef sign, ima;
+ rz = LLVMBuildExtractElement(builder, rxyz,
+ lp_build_const_int32(gallivm, 2), "");
+ sign = lp_build_sgn(float_bld, rz);
+ ima = lp_build_cube_imaneg(coord_bld, r);
*face_s = lp_build_cube_coord(coord_bld, sign, -1, s, ima);
*face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
*face = lp_build_cube_face(bld, rz,
@@ -999,6 +1227,7 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
*face_s = LLVMBuildLoad(builder, face_s_var, "face_s");
*face_t = LLVMBuildLoad(builder, face_t_var, "face_t");
*face = LLVMBuildLoad(builder, face_var, "face");
+ *face = lp_build_broadcast_scalar(&bld->int_coord_bld, *face);
}
}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
index dad138abee0..0f3d8ae6cb5 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
@@ -52,6 +52,15 @@ struct lp_build_context;
/**
+ * Helper struct holding all derivatives needed for sampling
+ */
+struct lp_derivatives
+{
+ LLVMValueRef ddx_ddy[2];
+};
+
+
+/**
* Sampler static state.
*
* These are the bits of state from pipe_resource and pipe_sampler_state that
@@ -192,6 +201,9 @@ struct lp_build_sample_context
/* See texture_dims() */
unsigned dims;
+ /** SIMD vector width */
+ unsigned vector_width;
+
/** regular scalar float type */
struct lp_type float_type;
struct lp_build_context float_bld;
@@ -199,7 +211,7 @@ struct lp_build_sample_context
/** float vector type */
struct lp_build_context float_vec_bld;
- /** regular scalar float type */
+ /** regular scalar int type */
struct lp_type int_type;
struct lp_build_context int_bld;
@@ -223,10 +235,15 @@ struct lp_build_sample_context
struct lp_type texel_type;
struct lp_build_context texel_bld;
+ /** Float per-quad type */
+ struct lp_type perquadf_type;
+ struct lp_build_context perquadf_bld;
+
+ /** Int per-quad type */
+ struct lp_type perquadi_type;
+ struct lp_build_context perquadi_bld;
+
/* Common dynamic state values */
- LLVMValueRef width;
- LLVMValueRef height;
- LLVMValueRef depth;
LLVMValueRef row_stride_array;
LLVMValueRef img_stride_array;
LLVMValueRef data_array;
@@ -305,8 +322,7 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
void
lp_build_lod_selector(struct lp_build_sample_context *bld,
unsigned unit,
- const LLVMValueRef ddx[4],
- const LLVMValueRef ddy[4],
+ const struct lp_derivatives *derivs,
LLVMValueRef lod_bias, /* optional */
LLVMValueRef explicit_lod, /* optional */
unsigned mip_filter,
@@ -331,10 +347,6 @@ LLVMValueRef
lp_build_get_mipmap_level(struct lp_build_sample_context *bld,
LLVMValueRef level);
-LLVMValueRef
-lp_build_get_const_mipmap_level(struct lp_build_sample_context *bld,
- int level);
-
void
lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
@@ -402,22 +414,35 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
unsigned unit,
unsigned num_coords,
const LLVMValueRef *coords,
- const LLVMValueRef *ddx,
- const LLVMValueRef *ddy,
+ const struct lp_derivatives *derivs,
LLVMValueRef lod_bias,
LLVMValueRef explicit_lod,
LLVMValueRef texel_out[4]);
+
+void
+lp_build_coord_repeat_npot_linear(struct lp_build_sample_context *bld,
+ LLVMValueRef coord_f,
+ LLVMValueRef length_i,
+ LLVMValueRef length_f,
+ LLVMValueRef *coord0_i,
+ LLVMValueRef *weight_f);
+
+
void
lp_build_size_query_soa(struct gallivm_state *gallivm,
const struct lp_sampler_static_state *static_state,
struct lp_sampler_dynamic_state *dynamic_state,
+ struct lp_type int_type,
unsigned unit,
LLVMValueRef explicit_lod,
LLVMValueRef *sizes_out);
void
-lp_build_sample_nop(struct gallivm_state *gallivm, struct lp_type type,
+lp_build_sample_nop(struct gallivm_state *gallivm,
+ struct lp_type type,
+ unsigned num_coords,
+ const LLVMValueRef *coords,
LLVMValueRef texel_out[4]);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
index 74858bc9718..ad1b29cf096 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
@@ -27,7 +27,7 @@
/**
* @file
- * Texture sampling -- SoA.
+ * Texture sampling -- AoS.
*
* @author Jose Fonseca <jfonseca@vmware.com>
* @author Brian Paul <brianp@vmware.com>
@@ -40,6 +40,7 @@
#include "util/u_memory.h"
#include "util/u_math.h"
#include "util/u_format.h"
+#include "util/u_cpu_detect.h"
#include "lp_bld_debug.h"
#include "lp_bld_type.h"
#include "lp_bld_const.h"
@@ -75,6 +76,7 @@ static void
lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld,
unsigned block_length,
LLVMValueRef coord,
+ LLVMValueRef coord_f,
LLVMValueRef length,
LLVMValueRef stride,
boolean is_pot,
@@ -93,10 +95,11 @@ lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld,
if(is_pot)
coord = LLVMBuildAnd(builder, coord, length_minus_one, "");
else {
- /* Add a bias to the texcoord to handle negative coords */
- LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024);
- coord = LLVMBuildAdd(builder, coord, bias, "");
- coord = LLVMBuildURem(builder, coord, length, "");
+ struct lp_build_context *coord_bld = &bld->coord_bld;
+ LLVMValueRef length_f = lp_build_int_to_float(coord_bld, length);
+ coord = lp_build_fract_safe(coord_bld, coord_f);
+ coord = lp_build_mul(coord_bld, coord, length_f);
+ coord = lp_build_itrunc(coord_bld, coord);
}
break;
@@ -121,6 +124,56 @@ lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld,
/**
+ * Build LLVM code for texture coord wrapping, for nearest filtering,
+ * for float texcoords.
+ * \param coord the incoming texcoord (s,t,r or q)
+ * \param length the texture size along one dimension
+ * \param is_pot if TRUE, length is a power of two
+ * \param wrap_mode one of PIPE_TEX_WRAP_x
+ * \param icoord the texcoord after wrapping, as int
+ */
+static void
+lp_build_sample_wrap_nearest_float(struct lp_build_sample_context *bld,
+ LLVMValueRef coord,
+ LLVMValueRef length,
+ boolean is_pot,
+ unsigned wrap_mode,
+ LLVMValueRef *icoord)
+{
+ struct lp_build_context *coord_bld = &bld->coord_bld;
+ LLVMValueRef length_minus_one;
+
+ switch(wrap_mode) {
+ case PIPE_TEX_WRAP_REPEAT:
+ /* take fraction, unnormalize */
+ coord = lp_build_fract_safe(coord_bld, coord);
+ coord = lp_build_mul(coord_bld, coord, length);
+ *icoord = lp_build_itrunc(coord_bld, coord);
+ break;
+ case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+ length_minus_one = lp_build_sub(coord_bld, length, coord_bld->one);
+ if (bld->static_state->normalized_coords) {
+ /* scale coord to length */
+ coord = lp_build_mul(coord_bld, coord, length);
+ }
+ coord = lp_build_clamp(coord_bld, coord, coord_bld->zero,
+ length_minus_one);
+ *icoord = lp_build_itrunc(coord_bld, coord);
+ break;
+
+ case PIPE_TEX_WRAP_CLAMP:
+ case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+ case PIPE_TEX_WRAP_MIRROR_REPEAT:
+ case PIPE_TEX_WRAP_MIRROR_CLAMP:
+ case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+ case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+ default:
+ assert(0);
+ }
+}
+
+
+/**
* Build LLVM code for texture coord wrapping, for linear filtering,
* for scaled integer texcoords.
* \param block_length is the length of the pixel block along the
@@ -139,6 +192,8 @@ static void
lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
unsigned block_length,
LLVMValueRef coord0,
+ LLVMValueRef *weight_i,
+ LLVMValueRef coord_f,
LLVMValueRef length,
LLVMValueRef stride,
boolean is_pot,
@@ -153,58 +208,85 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
LLVMValueRef length_minus_one;
LLVMValueRef lmask, umask, mask;
- if (block_length != 1) {
- /*
- * If the pixel block covers more than one pixel then there is no easy
- * way to calculate offset1 relative to offset0. Instead, compute them
- * independently.
- */
-
- LLVMValueRef coord1;
-
- lp_build_sample_wrap_nearest_int(bld,
- block_length,
- coord0,
- length,
- stride,
- is_pot,
- wrap_mode,
- offset0, i0);
-
- coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
+ /*
+ * If the pixel block covers more than one pixel then there is no easy
+ * way to calculate offset1 relative to offset0. Instead, compute them
+ * independently. Otherwise, try to compute offset0 and offset1 with
+ * a single stride multiplication.
+ */
- lp_build_sample_wrap_nearest_int(bld,
- block_length,
- coord1,
- length,
- stride,
- is_pot,
- wrap_mode,
- offset1, i1);
+ length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
+ if (block_length != 1) {
+ LLVMValueRef coord1;
+ switch(wrap_mode) {
+ case PIPE_TEX_WRAP_REPEAT:
+ if (is_pot) {
+ coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
+ coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, "");
+ coord1 = LLVMBuildAnd(builder, coord1, length_minus_one, "");
+ }
+ else {
+ LLVMValueRef mask;
+ LLVMValueRef weight;
+ LLVMValueRef length_f = lp_build_int_to_float(&bld->coord_bld, length);
+ lp_build_coord_repeat_npot_linear(bld, coord_f,
+ length, length_f,
+ &coord0, &weight);
+ mask = lp_build_compare(bld->gallivm, int_coord_bld->type,
+ PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);
+ coord1 = LLVMBuildAnd(builder,
+ lp_build_add(int_coord_bld, coord0,
+ int_coord_bld->one),
+ mask, "");
+ weight = lp_build_mul_imm(&bld->coord_bld, weight, 256);
+ *weight_i = lp_build_itrunc(&bld->coord_bld, weight);
+ }
+ break;
+
+ case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+ coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
+ coord0 = lp_build_clamp(int_coord_bld, coord0, int_coord_bld->zero,
+ length_minus_one);
+ coord1 = lp_build_clamp(int_coord_bld, coord1, int_coord_bld->zero,
+ length_minus_one);
+ break;
+
+ case PIPE_TEX_WRAP_CLAMP:
+ case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+ case PIPE_TEX_WRAP_MIRROR_REPEAT:
+ case PIPE_TEX_WRAP_MIRROR_CLAMP:
+ case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+ case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+ default:
+ assert(0);
+ coord0 = int_coord_bld->zero;
+ coord1 = int_coord_bld->zero;
+ break;
+ }
+ lp_build_sample_partial_offset(int_coord_bld, block_length, coord0, stride,
+ offset0, i0);
+ lp_build_sample_partial_offset(int_coord_bld, block_length, coord1, stride,
+ offset1, i1);
return;
}
- /*
- * Scalar pixels -- try to compute offset0 and offset1 with a single stride
- * multiplication.
- */
-
*i0 = int_coord_bld->zero;
*i1 = int_coord_bld->zero;
- length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
-
switch(wrap_mode) {
case PIPE_TEX_WRAP_REPEAT:
if (is_pot) {
coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, "");
}
else {
- /* Add a bias to the texcoord to handle negative coords */
- LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024);
- coord0 = LLVMBuildAdd(builder, coord0, bias, "");
- coord0 = LLVMBuildURem(builder, coord0, length, "");
+ LLVMValueRef weight;
+ LLVMValueRef length_f = lp_build_int_to_float(&bld->coord_bld, length);
+ lp_build_coord_repeat_npot_linear(bld, coord_f,
+ length, length_f,
+ &coord0, &weight);
+ weight = lp_build_mul_imm(&bld->coord_bld, weight, 256);
+ *weight_i = lp_build_itrunc(&bld->coord_bld, weight);
}
mask = lp_build_compare(bld->gallivm, int_coord_bld->type,
@@ -217,6 +299,11 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
break;
case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+ /* XXX this might be slower than the separate path
+ * on some newer cpus. With sse41 this is 8 instructions vs. 7
+ * - at least on SNB this is almost certainly slower since
+ * min/max are cheaper than selects, and the muls aren't bad.
+ */
lmask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
PIPE_FUNC_GEQUAL, coord0, int_coord_bld->zero);
umask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
@@ -249,6 +336,176 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
/**
+ * Build LLVM code for texture coord wrapping, for linear filtering,
+ * for float texcoords.
+ * \param block_length is the length of the pixel block along the
+ * coordinate axis
+ * \param coord the incoming texcoord (s,t,r or q)
+ * \param length the texture size along one dimension
+ * \param is_pot if TRUE, length is a power of two
+ * \param wrap_mode one of PIPE_TEX_WRAP_x
+ * \param coord0 the first texcoord after wrapping, as int
+ * \param coord1 the second texcoord after wrapping, as int
+ * \param weight the filter weight as int (0-255)
+ * \param force_nearest if this coord actually uses nearest filtering
+ */
+static void
+lp_build_sample_wrap_linear_float(struct lp_build_sample_context *bld,
+ unsigned block_length,
+ LLVMValueRef coord,
+ LLVMValueRef length,
+ boolean is_pot,
+ unsigned wrap_mode,
+ LLVMValueRef *coord0,
+ LLVMValueRef *coord1,
+ LLVMValueRef *weight,
+ unsigned force_nearest)
+{
+ struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
+ struct lp_build_context *coord_bld = &bld->coord_bld;
+ LLVMBuilderRef builder = bld->gallivm->builder;
+ LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
+ LLVMValueRef length_minus_one = lp_build_sub(coord_bld, length, coord_bld->one);
+
+ switch(wrap_mode) {
+ case PIPE_TEX_WRAP_REPEAT:
+ if (is_pot) {
+ /* mul by size and subtract 0.5 */
+ coord = lp_build_mul(coord_bld, coord, length);
+ if (!force_nearest)
+ coord = lp_build_sub(coord_bld, coord, half);
+ *coord1 = lp_build_add(coord_bld, coord, coord_bld->one);
+ /* convert to int, compute lerp weight */
+ lp_build_ifloor_fract(coord_bld, coord, coord0, weight);
+ *coord1 = lp_build_ifloor(coord_bld, *coord1);
+ /* repeat wrap */
+ length_minus_one = lp_build_itrunc(coord_bld, length_minus_one);
+ *coord0 = LLVMBuildAnd(builder, *coord0, length_minus_one, "");
+ *coord1 = LLVMBuildAnd(builder, *coord1, length_minus_one, "");
+ }
+ else {
+ LLVMValueRef mask;
+ /* wrap with normalized floats is just fract */
+ coord = lp_build_fract(coord_bld, coord);
+ /* unnormalize */
+ coord = lp_build_mul(coord_bld, coord, length);
+ /*
+ * we avoided the 0.5/length division, have to fix up wrong
+ * edge cases with selects
+ */
+ *coord1 = lp_build_add(coord_bld, coord, half);
+ coord = lp_build_sub(coord_bld, coord, half);
+ *weight = lp_build_fract(coord_bld, coord);
+ mask = lp_build_compare(coord_bld->gallivm, coord_bld->type,
+ PIPE_FUNC_LESS, coord, coord_bld->zero);
+ *coord0 = lp_build_select(coord_bld, mask, length_minus_one, coord);
+ *coord0 = lp_build_itrunc(coord_bld, *coord0);
+ mask = lp_build_compare(coord_bld->gallivm, coord_bld->type,
+ PIPE_FUNC_LESS, *coord1, length);
+ *coord1 = lp_build_select(coord_bld, mask, *coord1, coord_bld->zero);
+ *coord1 = lp_build_itrunc(coord_bld, *coord1);
+ }
+ break;
+ case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+ if (bld->static_state->normalized_coords) {
+ /* mul by tex size */
+ coord = lp_build_mul(coord_bld, coord, length);
+ }
+ /* subtract 0.5 */
+ if (!force_nearest) {
+ coord = lp_build_sub(coord_bld, coord, half);
+ }
+ /* clamp to [0, length - 1] */
+ coord = lp_build_min(coord_bld, coord, length_minus_one);
+ coord = lp_build_max(coord_bld, coord, coord_bld->zero);
+ *coord1 = lp_build_add(coord_bld, coord, coord_bld->one);
+ /* convert to int, compute lerp weight */
+ lp_build_ifloor_fract(coord_bld, coord, coord0, weight);
+ /* coord1 = min(coord1, length-1) */
+ *coord1 = lp_build_min(coord_bld, *coord1, length_minus_one);
+ *coord1 = lp_build_itrunc(coord_bld, *coord1);
+ break;
+ default:
+ assert(0);
+ *coord0 = int_coord_bld->zero;
+ *coord1 = int_coord_bld->zero;
+ *weight = coord_bld->zero;
+ break;
+ }
+ *weight = lp_build_mul_imm(coord_bld, *weight, 256);
+ *weight = lp_build_itrunc(coord_bld, *weight);
+ return;
+}
+
+
+/**
+ * Fetch texels for image with nearest sampling.
+ * Return filtered color as two vectors of 16-bit fixed point values.
+ */
+static void
+lp_build_sample_fetch_image_nearest(struct lp_build_sample_context *bld,
+ LLVMValueRef data_ptr,
+ LLVMValueRef offset,
+ LLVMValueRef x_subcoord,
+ LLVMValueRef y_subcoord,
+ LLVMValueRef *colors_lo,
+ LLVMValueRef *colors_hi)
+{
+ /*
+ * Fetch the pixels as 4 x 32bit (rgba order might differ):
+ *
+ * rgba0 rgba1 rgba2 rgba3
+ *
+ * bit cast them into 16 x u8
+ *
+ * r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3
+ *
+ * unpack them into two 8 x i16:
+ *
+ * r0 g0 b0 a0 r1 g1 b1 a1
+ * r2 g2 b2 a2 r3 g3 b3 a3
+ *
+ * The higher 8 bits of the resulting elements will be zero.
+ */
+ LLVMBuilderRef builder = bld->gallivm->builder;
+ LLVMValueRef rgba8;
+ struct lp_build_context h16, u8n;
+ LLVMTypeRef u8n_vec_type;
+
+ lp_build_context_init(&h16, bld->gallivm, lp_type_ufixed(16, bld->vector_width));
+ lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8, bld->vector_width));
+ u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type);
+
+ if (util_format_is_rgba8_variant(bld->format_desc)) {
+ /*
+ * Given the format is a rgba8, just read the pixels as is,
+ * without any swizzling. Swizzling will be done later.
+ */
+ rgba8 = lp_build_gather(bld->gallivm,
+ bld->texel_type.length,
+ bld->format_desc->block.bits,
+ bld->texel_type.width,
+ data_ptr, offset);
+
+ rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
+ }
+ else {
+ rgba8 = lp_build_fetch_rgba_aos(bld->gallivm,
+ bld->format_desc,
+ u8n.type,
+ data_ptr, offset,
+ x_subcoord,
+ y_subcoord);
+ }
+
+ /* Expand one 4*rgba8 to two 2*rgba16 */
+ lp_build_unpack2(bld->gallivm, u8n.type, h16.type,
+ rgba8,
+ colors_lo, colors_hi);
+}
+
+
+/**
* Sample a single texture image with nearest sampling.
* If sampling a cube texture, r = cube face in [0,5].
* Return filtered color as two vectors of 16-bit fixed point values.
@@ -267,21 +524,19 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
{
const unsigned dims = bld->dims;
LLVMBuilderRef builder = bld->gallivm->builder;
- struct lp_build_context i32, h16, u8n;
- LLVMTypeRef i32_vec_type, u8n_vec_type;
+ struct lp_build_context i32;
+ LLVMTypeRef i32_vec_type;
LLVMValueRef i32_c8;
LLVMValueRef width_vec, height_vec, depth_vec;
LLVMValueRef s_ipart, t_ipart = NULL, r_ipart = NULL;
+ LLVMValueRef s_float, t_float = NULL, r_float = NULL;
LLVMValueRef x_stride;
LLVMValueRef x_offset, offset;
LLVMValueRef x_subcoord, y_subcoord, z_subcoord;
- lp_build_context_init(&i32, bld->gallivm, lp_type_int_vec(32));
- lp_build_context_init(&h16, bld->gallivm, lp_type_ufixed(16));
- lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8));
+ lp_build_context_init(&i32, bld->gallivm, lp_type_int_vec(32, bld->vector_width));
i32_vec_type = lp_build_vec_type(bld->gallivm, i32.type);
- u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type);
lp_build_extract_image_sizes(bld,
bld->int_size_type,
@@ -291,6 +546,8 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
&height_vec,
&depth_vec);
+ s_float = s; t_float = t; r_float = r;
+
if (bld->static_state->normalized_coords) {
LLVMValueRef scaled_size;
LLVMValueRef flt_size;
@@ -334,7 +591,8 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
/* Do texcoord wrapping, compute texel offset */
lp_build_sample_wrap_nearest_int(bld,
bld->format_desc->block.width,
- s_ipart, width_vec, x_stride,
+ s_ipart, s_float,
+ width_vec, x_stride,
bld->static_state->pot_width,
bld->static_state->wrap_s,
&x_offset, &x_subcoord);
@@ -343,7 +601,8 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
LLVMValueRef y_offset;
lp_build_sample_wrap_nearest_int(bld,
bld->format_desc->block.height,
- t_ipart, height_vec, row_stride_vec,
+ t_ipart, t_float,
+ height_vec, row_stride_vec,
bld->static_state->pot_height,
bld->static_state->wrap_t,
&y_offset, &y_subcoord);
@@ -352,7 +611,8 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
LLVMValueRef z_offset;
lp_build_sample_wrap_nearest_int(bld,
1, /* block length (depth) */
- r_ipart, depth_vec, img_stride_vec,
+ r_ipart, r_float,
+ depth_vec, img_stride_vec,
bld->static_state->pot_depth,
bld->static_state->wrap_r,
&z_offset, &z_subcoord);
@@ -366,6 +626,196 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
}
}
+ lp_build_sample_fetch_image_nearest(bld, data_ptr, offset,
+ x_subcoord, y_subcoord,
+ colors_lo, colors_hi);
+}
+
+
+/**
+ * Sample a single texture image with nearest sampling.
+ * If sampling a cube texture, r = cube face in [0,5].
+ * Return filtered color as two vectors of 16-bit fixed point values.
+ * Does address calcs (except offsets) with floats.
+ * Useful for AVX which has support for 8x32 floats but not 8x32 ints.
+ */
+static void
+lp_build_sample_image_nearest_afloat(struct lp_build_sample_context *bld,
+ LLVMValueRef int_size,
+ LLVMValueRef row_stride_vec,
+ LLVMValueRef img_stride_vec,
+ LLVMValueRef data_ptr,
+ LLVMValueRef s,
+ LLVMValueRef t,
+ LLVMValueRef r,
+ LLVMValueRef *colors_lo,
+ LLVMValueRef *colors_hi)
+ {
+ const unsigned dims = bld->dims;
+ LLVMValueRef width_vec, height_vec, depth_vec;
+ LLVMValueRef offset;
+ LLVMValueRef x_subcoord, y_subcoord;
+ LLVMValueRef x_icoord, y_icoord, z_icoord;
+ LLVMValueRef flt_size;
+
+ flt_size = lp_build_int_to_float(&bld->float_size_bld, int_size);
+
+ lp_build_extract_image_sizes(bld,
+ bld->float_size_type,
+ bld->coord_type,
+ flt_size,
+ &width_vec,
+ &height_vec,
+ &depth_vec);
+
+ /* Do texcoord wrapping */
+ lp_build_sample_wrap_nearest_float(bld,
+ s, width_vec,
+ bld->static_state->pot_width,
+ bld->static_state->wrap_s,
+ &x_icoord);
+
+ if (dims >= 2) {
+ lp_build_sample_wrap_nearest_float(bld,
+ t, height_vec,
+ bld->static_state->pot_height,
+ bld->static_state->wrap_t,
+ &y_icoord);
+
+ if (dims >= 3) {
+ lp_build_sample_wrap_nearest_float(bld,
+ r, depth_vec,
+ bld->static_state->pot_depth,
+ bld->static_state->wrap_r,
+ &z_icoord);
+ }
+ else if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
+ z_icoord = r;
+ }
+ }
+
+ /*
+ * From here on we deal with ints, and we should split up the 256bit
+ * vectors manually for better generated code.
+ */
+
+ /*
+ * compute texel offsets -
+ * cannot do offset calc with floats, difficult for block-based formats,
+ * and not enough precision anyway.
+ */
+ lp_build_sample_offset(&bld->int_coord_bld,
+ bld->format_desc,
+ x_icoord, y_icoord,
+ z_icoord,
+ row_stride_vec, img_stride_vec,
+ &offset,
+ &x_subcoord, &y_subcoord);
+
+ lp_build_sample_fetch_image_nearest(bld, data_ptr, offset,
+ x_subcoord, y_subcoord,
+ colors_lo, colors_hi);
+}
+
+
+/**
+ * Fetch texels for image with linear sampling.
+ * Return filtered color as two vectors of 16-bit fixed point values.
+ */
+static void
+lp_build_sample_fetch_image_linear(struct lp_build_sample_context *bld,
+ LLVMValueRef data_ptr,
+ LLVMValueRef offset[2][2][2],
+ LLVMValueRef x_subcoord[2],
+ LLVMValueRef y_subcoord[2],
+ LLVMValueRef s_fpart,
+ LLVMValueRef t_fpart,
+ LLVMValueRef r_fpart,
+ LLVMValueRef *colors_lo,
+ LLVMValueRef *colors_hi)
+{
+ const unsigned dims = bld->dims;
+ LLVMBuilderRef builder = bld->gallivm->builder;
+ struct lp_build_context h16, u8n;
+ LLVMTypeRef h16_vec_type, u8n_vec_type;
+ LLVMTypeRef elem_type = LLVMInt32TypeInContext(bld->gallivm->context);
+ LLVMValueRef shuffles_lo[LP_MAX_VECTOR_LENGTH];
+ LLVMValueRef shuffles_hi[LP_MAX_VECTOR_LENGTH];
+ LLVMValueRef shuffle_lo, shuffle_hi;
+ LLVMValueRef s_fpart_lo, s_fpart_hi;
+ LLVMValueRef t_fpart_lo = NULL, t_fpart_hi = NULL;
+ LLVMValueRef r_fpart_lo = NULL, r_fpart_hi = NULL;
+ LLVMValueRef neighbors_lo[2][2][2]; /* [z][y][x] */
+ LLVMValueRef neighbors_hi[2][2][2]; /* [z][y][x] */
+ LLVMValueRef packed_lo, packed_hi;
+ unsigned i, j, k;
+ unsigned numj, numk;
+
+ lp_build_context_init(&h16, bld->gallivm, lp_type_ufixed(16, bld->vector_width));
+ lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8, bld->vector_width));
+ h16_vec_type = lp_build_vec_type(bld->gallivm, h16.type);
+ u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type);
+
+ /*
+ * Transform 4 x i32 in
+ *
+ * s_fpart = {s0, s1, s2, s3}
+ *
+ * into 8 x i16
+ *
+ * s_fpart = {00, s0, 00, s1, 00, s2, 00, s3}
+ *
+ * into two 8 x i16
+ *
+ * s_fpart_lo = {s0, s0, s0, s0, s1, s1, s1, s1}
+ * s_fpart_hi = {s2, s2, s2, s2, s3, s3, s3, s3}
+ *
+ * and likewise for t_fpart. There is no risk of loosing precision here
+ * since the fractional parts only use the lower 8bits.
+ */
+ s_fpart = LLVMBuildBitCast(builder, s_fpart, h16_vec_type, "");
+ if (dims >= 2)
+ t_fpart = LLVMBuildBitCast(builder, t_fpart, h16_vec_type, "");
+ if (dims >= 3)
+ r_fpart = LLVMBuildBitCast(builder, r_fpart, h16_vec_type, "");
+
+ for (j = 0; j < h16.type.length; j += 4) {
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+ unsigned subindex = 0;
+#else
+ unsigned subindex = 1;
+#endif
+ LLVMValueRef index;
+
+ index = LLVMConstInt(elem_type, j/2 + subindex, 0);
+ for (i = 0; i < 4; ++i)
+ shuffles_lo[j + i] = index;
+
+ index = LLVMConstInt(elem_type, h16.type.length/2 + j/2 + subindex, 0);
+ for (i = 0; i < 4; ++i)
+ shuffles_hi[j + i] = index;
+ }
+
+ shuffle_lo = LLVMConstVector(shuffles_lo, h16.type.length);
+ shuffle_hi = LLVMConstVector(shuffles_hi, h16.type.length);
+
+ s_fpart_lo = LLVMBuildShuffleVector(builder, s_fpart, h16.undef,
+ shuffle_lo, "");
+ s_fpart_hi = LLVMBuildShuffleVector(builder, s_fpart, h16.undef,
+ shuffle_hi, "");
+ if (dims >= 2) {
+ t_fpart_lo = LLVMBuildShuffleVector(builder, t_fpart, h16.undef,
+ shuffle_lo, "");
+ t_fpart_hi = LLVMBuildShuffleVector(builder, t_fpart, h16.undef,
+ shuffle_hi, "");
+ }
+ if (dims >= 3) {
+ r_fpart_lo = LLVMBuildShuffleVector(builder, r_fpart, h16.undef,
+ shuffle_lo, "");
+ r_fpart_hi = LLVMBuildShuffleVector(builder, r_fpart, h16.undef,
+ shuffle_hi, "");
+ }
+
/*
* Fetch the pixels as 4 x 32bit (rgba order might differ):
*
@@ -382,38 +832,129 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
*
* The higher 8 bits of the resulting elements will be zero.
*/
- {
- LLVMValueRef rgba8;
+ numj = 1 + (dims >= 2);
+ numk = 1 + (dims >= 3);
- if (util_format_is_rgba8_variant(bld->format_desc)) {
- /*
- * Given the format is a rgba8, just read the pixels as is,
- * without any swizzling. Swizzling will be done later.
- */
- rgba8 = lp_build_gather(bld->gallivm,
- bld->texel_type.length,
- bld->format_desc->block.bits,
- bld->texel_type.width,
- data_ptr, offset);
+ for (k = 0; k < numk; k++) {
+ for (j = 0; j < numj; j++) {
+ for (i = 0; i < 2; i++) {
+ LLVMValueRef rgba8;
+
+ if (util_format_is_rgba8_variant(bld->format_desc)) {
+ /*
+ * Given the format is a rgba8, just read the pixels as is,
+ * without any swizzling. Swizzling will be done later.
+ */
+ rgba8 = lp_build_gather(bld->gallivm,
+ bld->texel_type.length,
+ bld->format_desc->block.bits,
+ bld->texel_type.width,
+ data_ptr, offset[k][j][i]);
+
+ rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
+ }
+ else {
+ rgba8 = lp_build_fetch_rgba_aos(bld->gallivm,
+ bld->format_desc,
+ u8n.type,
+ data_ptr, offset[k][j][i],
+ x_subcoord[i],
+ y_subcoord[j]);
+ }
- rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
+ /* Expand one 4*rgba8 to two 2*rgba16 */
+ lp_build_unpack2(bld->gallivm, u8n.type, h16.type,
+ rgba8,
+ &neighbors_lo[k][j][i], &neighbors_hi[k][j][i]);
+ }
}
- else {
- rgba8 = lp_build_fetch_rgba_aos(bld->gallivm,
- bld->format_desc,
- u8n.type,
- data_ptr, offset,
- x_subcoord,
- y_subcoord);
+ }
+
+ /*
+ * Linear interpolation with 8.8 fixed point.
+ */
+ if (bld->static_state->force_nearest_s) {
+ /* special case 1-D lerp */
+ packed_lo = lp_build_lerp(&h16,
+ t_fpart_lo,
+ neighbors_lo[0][0][0],
+ neighbors_lo[0][0][1]);
+
+ packed_hi = lp_build_lerp(&h16,
+ t_fpart_hi,
+ neighbors_hi[0][1][0],
+ neighbors_hi[0][1][0]);
+ }
+ else if (bld->static_state->force_nearest_t) {
+ /* special case 1-D lerp */
+ packed_lo = lp_build_lerp(&h16,
+ s_fpart_lo,
+ neighbors_lo[0][0][0],
+ neighbors_lo[0][0][1]);
+
+ packed_hi = lp_build_lerp(&h16,
+ s_fpart_hi,
+ neighbors_hi[0][0][0],
+ neighbors_hi[0][0][1]);
+ }
+ else {
+ /* general 1/2/3-D lerping */
+ if (dims == 1) {
+ packed_lo = lp_build_lerp(&h16,
+ s_fpart_lo,
+ neighbors_lo[0][0][0],
+ neighbors_lo[0][0][1]);
+
+ packed_hi = lp_build_lerp(&h16,
+ s_fpart_hi,
+ neighbors_hi[0][0][0],
+ neighbors_hi[0][0][1]);
}
+ else {
+ /* 2-D lerp */
+ packed_lo = lp_build_lerp_2d(&h16,
+ s_fpart_lo, t_fpart_lo,
+ neighbors_lo[0][0][0],
+ neighbors_lo[0][0][1],
+ neighbors_lo[0][1][0],
+ neighbors_lo[0][1][1]);
+
+ packed_hi = lp_build_lerp_2d(&h16,
+ s_fpart_hi, t_fpart_hi,
+ neighbors_hi[0][0][0],
+ neighbors_hi[0][0][1],
+ neighbors_hi[0][1][0],
+ neighbors_hi[0][1][1]);
+
+ if (dims >= 3) {
+ LLVMValueRef packed_lo2, packed_hi2;
+
+ /* lerp in the second z slice */
+ packed_lo2 = lp_build_lerp_2d(&h16,
+ s_fpart_lo, t_fpart_lo,
+ neighbors_lo[1][0][0],
+ neighbors_lo[1][0][1],
+ neighbors_lo[1][1][0],
+ neighbors_lo[1][1][1]);
- /* Expand one 4*rgba8 to two 2*rgba16 */
- lp_build_unpack2(bld->gallivm, u8n.type, h16.type,
- rgba8,
- colors_lo, colors_hi);
+ packed_hi2 = lp_build_lerp_2d(&h16,
+ s_fpart_hi, t_fpart_hi,
+ neighbors_hi[1][0][0],
+ neighbors_hi[1][0][1],
+ neighbors_hi[1][1][0],
+ neighbors_hi[1][1][1]);
+ /* interp between two z slices */
+ packed_lo = lp_build_lerp(&h16, r_fpart_lo,
+ packed_lo, packed_lo2);
+ packed_hi = lp_build_lerp(&h16, r_fpart_hi,
+ packed_hi, packed_hi2);
+ }
+ }
}
-}
+ *colors_lo = packed_lo;
+ *colors_hi = packed_hi;
+}
/**
* Sample a single texture image with (bi-)(tri-)linear sampling.
@@ -433,33 +974,24 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
{
const unsigned dims = bld->dims;
LLVMBuilderRef builder = bld->gallivm->builder;
- struct lp_build_context i32, h16, u8n;
- LLVMTypeRef i32_vec_type, h16_vec_type, u8n_vec_type;
+ struct lp_build_context i32;
+ LLVMTypeRef i32_vec_type;
LLVMValueRef i32_c8, i32_c128, i32_c255;
LLVMValueRef width_vec, height_vec, depth_vec;
- LLVMValueRef s_ipart, s_fpart, s_fpart_lo, s_fpart_hi;
- LLVMValueRef t_ipart = NULL, t_fpart = NULL, t_fpart_lo = NULL, t_fpart_hi = NULL;
- LLVMValueRef r_ipart = NULL, r_fpart = NULL, r_fpart_lo = NULL, r_fpart_hi = NULL;
+ LLVMValueRef s_ipart, s_fpart, s_float;
+ LLVMValueRef t_ipart = NULL, t_fpart = NULL, t_float = NULL;
+ LLVMValueRef r_ipart = NULL, r_fpart = NULL, r_float = NULL;
LLVMValueRef x_stride, y_stride, z_stride;
LLVMValueRef x_offset0, x_offset1;
LLVMValueRef y_offset0, y_offset1;
LLVMValueRef z_offset0, z_offset1;
LLVMValueRef offset[2][2][2]; /* [z][y][x] */
LLVMValueRef x_subcoord[2], y_subcoord[2], z_subcoord[2];
- LLVMValueRef neighbors_lo[2][2][2]; /* [z][y][x] */
- LLVMValueRef neighbors_hi[2][2][2]; /* [z][y][x] */
- LLVMValueRef packed_lo, packed_hi;
unsigned x, y, z;
- unsigned i, j, k;
- unsigned numj, numk;
- lp_build_context_init(&i32, bld->gallivm, lp_type_int_vec(32));
- lp_build_context_init(&h16, bld->gallivm, lp_type_ufixed(16));
- lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8));
+ lp_build_context_init(&i32, bld->gallivm, lp_type_int_vec(32, bld->vector_width));
i32_vec_type = lp_build_vec_type(bld->gallivm, i32.type);
- h16_vec_type = lp_build_vec_type(bld->gallivm, h16.type);
- u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type);
lp_build_extract_image_sizes(bld,
bld->int_size_type,
@@ -469,6 +1001,8 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
&height_vec,
&depth_vec);
+ s_float = s; t_float = t; r_float = r;
+
if (bld->static_state->normalized_coords) {
LLVMValueRef scaled_size;
LLVMValueRef flt_size;
@@ -533,7 +1067,8 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
/* do texcoord wrapping and compute texel offsets */
lp_build_sample_wrap_linear_int(bld,
bld->format_desc->block.width,
- s_ipart, width_vec, x_stride,
+ s_ipart, &s_fpart, s_float,
+ width_vec, x_stride,
bld->static_state->pot_width,
bld->static_state->wrap_s,
&x_offset0, &x_offset1,
@@ -548,7 +1083,8 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
if (dims >= 2) {
lp_build_sample_wrap_linear_int(bld,
bld->format_desc->block.height,
- t_ipart, height_vec, y_stride,
+ t_ipart, &t_fpart, t_float,
+ height_vec, y_stride,
bld->static_state->pot_height,
bld->static_state->wrap_t,
&y_offset0, &y_offset1,
@@ -567,7 +1103,8 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
if (dims >= 3) {
lp_build_sample_wrap_linear_int(bld,
bld->format_desc->block.height,
- r_ipart, depth_vec, z_stride,
+ r_ipart, &r_fpart, r_float,
+ depth_vec, z_stride,
bld->static_state->pot_depth,
bld->static_state->wrap_r,
&z_offset0, &z_offset1,
@@ -593,212 +1130,175 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
}
}
- /*
- * Transform 4 x i32 in
- *
- * s_fpart = {s0, s1, s2, s3}
- *
- * into 8 x i16
- *
- * s_fpart = {00, s0, 00, s1, 00, s2, 00, s3}
- *
- * into two 8 x i16
- *
- * s_fpart_lo = {s0, s0, s0, s0, s1, s1, s1, s1}
- * s_fpart_hi = {s2, s2, s2, s2, s3, s3, s3, s3}
- *
- * and likewise for t_fpart. There is no risk of loosing precision here
- * since the fractional parts only use the lower 8bits.
- */
- s_fpart = LLVMBuildBitCast(builder, s_fpart, h16_vec_type, "");
- if (dims >= 2)
- t_fpart = LLVMBuildBitCast(builder, t_fpart, h16_vec_type, "");
- if (dims >= 3)
- r_fpart = LLVMBuildBitCast(builder, r_fpart, h16_vec_type, "");
+ lp_build_sample_fetch_image_linear(bld, data_ptr, offset,
+ x_subcoord, y_subcoord,
+ s_fpart, t_fpart, r_fpart,
+ colors_lo, colors_hi);
+}
- {
- LLVMTypeRef elem_type = LLVMInt32TypeInContext(bld->gallivm->context);
- LLVMValueRef shuffles_lo[LP_MAX_VECTOR_LENGTH];
- LLVMValueRef shuffles_hi[LP_MAX_VECTOR_LENGTH];
- LLVMValueRef shuffle_lo;
- LLVMValueRef shuffle_hi;
- for (j = 0; j < h16.type.length; j += 4) {
-#ifdef PIPE_ARCH_LITTLE_ENDIAN
- unsigned subindex = 0;
-#else
- unsigned subindex = 1;
-#endif
- LLVMValueRef index;
+/**
+ * Sample a single texture image with (bi-)(tri-)linear sampling.
+ * Return filtered color as two vectors of 16-bit fixed point values.
+ * Does address calcs (except offsets) with floats.
+ * Useful for AVX which has support for 8x32 floats but not 8x32 ints.
+ */
+static void
+lp_build_sample_image_linear_afloat(struct lp_build_sample_context *bld,
+ LLVMValueRef int_size,
+ LLVMValueRef row_stride_vec,
+ LLVMValueRef img_stride_vec,
+ LLVMValueRef data_ptr,
+ LLVMValueRef s,
+ LLVMValueRef t,
+ LLVMValueRef r,
+ LLVMValueRef *colors_lo,
+ LLVMValueRef *colors_hi)
+{
+ const unsigned dims = bld->dims;
+ LLVMValueRef width_vec, height_vec, depth_vec;
+ LLVMValueRef s_fpart;
+ LLVMValueRef t_fpart = NULL;
+ LLVMValueRef r_fpart = NULL;
+ LLVMValueRef x_stride, y_stride, z_stride;
+ LLVMValueRef x_offset0, x_offset1;
+ LLVMValueRef y_offset0, y_offset1;
+ LLVMValueRef z_offset0, z_offset1;
+ LLVMValueRef offset[2][2][2]; /* [z][y][x] */
+ LLVMValueRef x_subcoord[2], y_subcoord[2];
+ LLVMValueRef flt_size;
+ LLVMValueRef x_icoord0, x_icoord1;
+ LLVMValueRef y_icoord0, y_icoord1;
+ LLVMValueRef z_icoord0, z_icoord1;
+ unsigned x, y, z;
- index = LLVMConstInt(elem_type, j/2 + subindex, 0);
- for (i = 0; i < 4; ++i)
- shuffles_lo[j + i] = index;
+ flt_size = lp_build_int_to_float(&bld->float_size_bld, int_size);
- index = LLVMConstInt(elem_type, h16.type.length/2 + j/2 + subindex, 0);
- for (i = 0; i < 4; ++i)
- shuffles_hi[j + i] = index;
- }
+ lp_build_extract_image_sizes(bld,
+ bld->float_size_type,
+ bld->coord_type,
+ flt_size,
+ &width_vec,
+ &height_vec,
+ &depth_vec);
- shuffle_lo = LLVMConstVector(shuffles_lo, h16.type.length);
- shuffle_hi = LLVMConstVector(shuffles_hi, h16.type.length);
+ /* do texcoord wrapping and compute texel offsets */
+ lp_build_sample_wrap_linear_float(bld,
+ bld->format_desc->block.width,
+ s, width_vec,
+ bld->static_state->pot_width,
+ bld->static_state->wrap_s,
+ &x_icoord0, &x_icoord1,
+ &s_fpart,
+ bld->static_state->force_nearest_s);
+
+ if (dims >= 2) {
+ lp_build_sample_wrap_linear_float(bld,
+ bld->format_desc->block.height,
+ t, height_vec,
+ bld->static_state->pot_height,
+ bld->static_state->wrap_t,
+ &y_icoord0, &y_icoord1,
+ &t_fpart,
+ bld->static_state->force_nearest_t);
- s_fpart_lo = LLVMBuildShuffleVector(builder, s_fpart, h16.undef,
- shuffle_lo, "");
- s_fpart_hi = LLVMBuildShuffleVector(builder, s_fpart, h16.undef,
- shuffle_hi, "");
- if (dims >= 2) {
- t_fpart_lo = LLVMBuildShuffleVector(builder, t_fpart, h16.undef,
- shuffle_lo, "");
- t_fpart_hi = LLVMBuildShuffleVector(builder, t_fpart, h16.undef,
- shuffle_hi, "");
- }
if (dims >= 3) {
- r_fpart_lo = LLVMBuildShuffleVector(builder, r_fpart, h16.undef,
- shuffle_lo, "");
- r_fpart_hi = LLVMBuildShuffleVector(builder, r_fpart, h16.undef,
- shuffle_hi, "");
+ lp_build_sample_wrap_linear_float(bld,
+ bld->format_desc->block.height,
+ r, depth_vec,
+ bld->static_state->pot_depth,
+ bld->static_state->wrap_r,
+ &z_icoord0, &z_icoord1,
+ &r_fpart, 0);
}
}
/*
- * Fetch the pixels as 4 x 32bit (rgba order might differ):
- *
- * rgba0 rgba1 rgba2 rgba3
- *
- * bit cast them into 16 x u8
- *
- * r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3
- *
- * unpack them into two 8 x i16:
- *
- * r0 g0 b0 a0 r1 g1 b1 a1
- * r2 g2 b2 a2 r3 g3 b3 a3
- *
- * The higher 8 bits of the resulting elements will be zero.
+ * From here on we deal with ints, and we should split up the 256bit
+ * vectors manually for better generated code.
*/
- numj = 1 + (dims >= 2);
- numk = 1 + (dims >= 3);
- for (k = 0; k < numk; k++) {
- for (j = 0; j < numj; j++) {
- for (i = 0; i < 2; i++) {
- LLVMValueRef rgba8;
-
- if (util_format_is_rgba8_variant(bld->format_desc)) {
- /*
- * Given the format is a rgba8, just read the pixels as is,
- * without any swizzling. Swizzling will be done later.
- */
- rgba8 = lp_build_gather(bld->gallivm,
- bld->texel_type.length,
- bld->format_desc->block.bits,
- bld->texel_type.width,
- data_ptr, offset[k][j][i]);
-
- rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
- }
- else {
- rgba8 = lp_build_fetch_rgba_aos(bld->gallivm,
- bld->format_desc,
- u8n.type,
- data_ptr, offset[k][j][i],
- x_subcoord[i],
- y_subcoord[j]);
- }
-
- /* Expand one 4*rgba8 to two 2*rgba16 */
- lp_build_unpack2(bld->gallivm, u8n.type, h16.type,
- rgba8,
- &neighbors_lo[k][j][i], &neighbors_hi[k][j][i]);
- }
- }
- }
+ /* get pixel, row and image strides */
+ x_stride = lp_build_const_vec(bld->gallivm,
+ bld->int_coord_bld.type,
+ bld->format_desc->block.bits/8);
+ y_stride = row_stride_vec;
+ z_stride = img_stride_vec;
/*
- * Linear interpolation with 8.8 fixed point.
+ * compute texel offset -
+ * cannot do offset calc with floats, difficult for block-based formats,
+ * and not enough precision anyway.
*/
- if (bld->static_state->force_nearest_s) {
- /* special case 1-D lerp */
- packed_lo = lp_build_lerp(&h16,
- t_fpart_lo,
- neighbors_lo[0][0][0],
- neighbors_lo[0][0][1]);
-
- packed_hi = lp_build_lerp(&h16,
- t_fpart_hi,
- neighbors_hi[0][1][0],
- neighbors_hi[0][1][0]);
+ lp_build_sample_partial_offset(&bld->int_coord_bld,
+ bld->format_desc->block.width,
+ x_icoord0, x_stride,
+ &x_offset0, &x_subcoord[0]);
+ lp_build_sample_partial_offset(&bld->int_coord_bld,
+ bld->format_desc->block.width,
+ x_icoord1, x_stride,
+ &x_offset1, &x_subcoord[1]);
+ for (z = 0; z < 2; z++) {
+ for (y = 0; y < 2; y++) {
+ offset[z][y][0] = x_offset0;
+ offset[z][y][1] = x_offset1;
+ }
}
- else if (bld->static_state->force_nearest_t) {
- /* special case 1-D lerp */
- packed_lo = lp_build_lerp(&h16,
- s_fpart_lo,
- neighbors_lo[0][0][0],
- neighbors_lo[0][0][1]);
- packed_hi = lp_build_lerp(&h16,
- s_fpart_hi,
- neighbors_hi[0][0][0],
- neighbors_hi[0][0][1]);
+ if (dims >= 2) {
+ lp_build_sample_partial_offset(&bld->int_coord_bld,
+ bld->format_desc->block.height,
+ y_icoord0, y_stride,
+ &y_offset0, &y_subcoord[0]);
+ lp_build_sample_partial_offset(&bld->int_coord_bld,
+ bld->format_desc->block.height,
+ y_icoord1, y_stride,
+ &y_offset1, &y_subcoord[1]);
+ for (z = 0; z < 2; z++) {
+ for (x = 0; x < 2; x++) {
+ offset[z][0][x] = lp_build_add(&bld->int_coord_bld,
+ offset[z][0][x], y_offset0);
+ offset[z][1][x] = lp_build_add(&bld->int_coord_bld,
+ offset[z][1][x], y_offset1);
+ }
+ }
}
- else {
- /* general 1/2/3-D lerping */
- if (dims == 1) {
- packed_lo = lp_build_lerp(&h16,
- s_fpart_lo,
- neighbors_lo[0][0][0],
- neighbors_lo[0][0][1]);
- packed_hi = lp_build_lerp(&h16,
- s_fpart_hi,
- neighbors_hi[0][0][0],
- neighbors_hi[0][0][1]);
+ if (dims >= 3) {
+ LLVMValueRef z_subcoord[2];
+ lp_build_sample_partial_offset(&bld->int_coord_bld,
+ 1,
+ z_icoord0, z_stride,
+ &z_offset0, &z_subcoord[0]);
+ lp_build_sample_partial_offset(&bld->int_coord_bld,
+ 1,
+ z_icoord1, z_stride,
+ &z_offset1, &z_subcoord[1]);
+ for (y = 0; y < 2; y++) {
+ for (x = 0; x < 2; x++) {
+ offset[0][y][x] = lp_build_add(&bld->int_coord_bld,
+ offset[0][y][x], z_offset0);
+ offset[1][y][x] = lp_build_add(&bld->int_coord_bld,
+ offset[1][y][x], z_offset1);
+ }
}
- else {
- /* 2-D lerp */
- packed_lo = lp_build_lerp_2d(&h16,
- s_fpart_lo, t_fpart_lo,
- neighbors_lo[0][0][0],
- neighbors_lo[0][0][1],
- neighbors_lo[0][1][0],
- neighbors_lo[0][1][1]);
-
- packed_hi = lp_build_lerp_2d(&h16,
- s_fpart_hi, t_fpart_hi,
- neighbors_hi[0][0][0],
- neighbors_hi[0][0][1],
- neighbors_hi[0][1][0],
- neighbors_hi[0][1][1]);
-
- if (dims >= 3) {
- LLVMValueRef packed_lo2, packed_hi2;
-
- /* lerp in the second z slice */
- packed_lo2 = lp_build_lerp_2d(&h16,
- s_fpart_lo, t_fpart_lo,
- neighbors_lo[1][0][0],
- neighbors_lo[1][0][1],
- neighbors_lo[1][1][0],
- neighbors_lo[1][1][1]);
-
- packed_hi2 = lp_build_lerp_2d(&h16,
- s_fpart_hi, t_fpart_hi,
- neighbors_hi[1][0][0],
- neighbors_hi[1][0][1],
- neighbors_hi[1][1][0],
- neighbors_hi[1][1][1]);
- /* interp between two z slices */
- packed_lo = lp_build_lerp(&h16, r_fpart_lo,
- packed_lo, packed_lo2);
- packed_hi = lp_build_lerp(&h16, r_fpart_hi,
- packed_hi, packed_hi2);
+ }
+ else if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
+ LLVMValueRef z_offset;
+ z_offset = lp_build_mul(&bld->int_coord_bld, r, img_stride_vec);
+ for (y = 0; y < 2; y++) {
+ for (x = 0; x < 2; x++) {
+ /* The r coord is the cube face in [0,5] */
+ offset[0][y][x] = lp_build_add(&bld->int_coord_bld,
+ offset[0][y][x], z_offset);
}
}
}
- *colors_lo = packed_lo;
- *colors_hi = packed_hi;
+ lp_build_sample_fetch_image_linear(bld, data_ptr, offset,
+ x_subcoord, y_subcoord,
+ s_fpart, t_fpart, r_fpart,
+ colors_lo, colors_hi);
}
@@ -824,10 +1324,10 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
LLVMBuilderRef builder = bld->gallivm->builder;
LLVMValueRef size0;
LLVMValueRef size1;
- LLVMValueRef row_stride0_vec;
- LLVMValueRef row_stride1_vec;
- LLVMValueRef img_stride0_vec;
- LLVMValueRef img_stride1_vec;
+ LLVMValueRef row_stride0_vec = NULL;
+ LLVMValueRef row_stride1_vec = NULL;
+ LLVMValueRef img_stride0_vec = NULL;
+ LLVMValueRef img_stride1_vec = NULL;
LLVMValueRef data_ptr0;
LLVMValueRef data_ptr1;
LLVMValueRef colors0_lo, colors0_hi;
@@ -838,20 +1338,39 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
&size0,
&row_stride0_vec, &img_stride0_vec);
data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0);
- if (img_filter == PIPE_TEX_FILTER_NEAREST) {
- lp_build_sample_image_nearest(bld,
- size0,
- row_stride0_vec, img_stride0_vec,
- data_ptr0, s, t, r,
- &colors0_lo, &colors0_hi);
+ if (util_cpu_caps.has_avx && bld->coord_type.length > 4) {
+ if (img_filter == PIPE_TEX_FILTER_NEAREST) {
+ lp_build_sample_image_nearest_afloat(bld,
+ size0,
+ row_stride0_vec, img_stride0_vec,
+ data_ptr0, s, t, r,
+ &colors0_lo, &colors0_hi);
+ }
+ else {
+ assert(img_filter == PIPE_TEX_FILTER_LINEAR);
+ lp_build_sample_image_linear_afloat(bld,
+ size0,
+ row_stride0_vec, img_stride0_vec,
+ data_ptr0, s, t, r,
+ &colors0_lo, &colors0_hi);
+ }
}
else {
- assert(img_filter == PIPE_TEX_FILTER_LINEAR);
- lp_build_sample_image_linear(bld,
- size0,
- row_stride0_vec, img_stride0_vec,
- data_ptr0, s, t, r,
- &colors0_lo, &colors0_hi);
+ if (img_filter == PIPE_TEX_FILTER_NEAREST) {
+ lp_build_sample_image_nearest(bld,
+ size0,
+ row_stride0_vec, img_stride0_vec,
+ data_ptr0, s, t, r,
+ &colors0_lo, &colors0_hi);
+ }
+ else {
+ assert(img_filter == PIPE_TEX_FILTER_LINEAR);
+ lp_build_sample_image_linear(bld,
+ size0,
+ row_stride0_vec, img_stride0_vec,
+ data_ptr0, s, t, r,
+ &colors0_lo, &colors0_hi);
+ }
}
/* Store the first level's colors in the output variables */
@@ -859,74 +1378,138 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
LLVMBuildStore(builder, colors0_hi, colors_hi_var);
if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
- LLVMValueRef h16_scale = lp_build_const_float(bld->gallivm, 256.0);
- LLVMTypeRef i32_type = LLVMIntTypeInContext(bld->gallivm->context, 32);
+ LLVMValueRef h16vec_scale = lp_build_const_vec(bld->gallivm,
+ bld->perquadf_bld.type, 256.0);
+ LLVMTypeRef i32vec_type = lp_build_vec_type(bld->gallivm, bld->perquadi_bld.type);
struct lp_build_if_state if_ctx;
LLVMValueRef need_lerp;
+ unsigned num_quads = bld->coord_bld.type.length / 4;
+ unsigned i;
- lod_fpart = LLVMBuildFMul(builder, lod_fpart, h16_scale, "");
- lod_fpart = LLVMBuildFPToSI(builder, lod_fpart, i32_type, "lod_fpart.fixed16");
+ lod_fpart = LLVMBuildFMul(builder, lod_fpart, h16vec_scale, "");
+ lod_fpart = LLVMBuildFPToSI(builder, lod_fpart, i32vec_type, "lod_fpart.fixed16");
/* need_lerp = lod_fpart > 0 */
- need_lerp = LLVMBuildICmp(builder, LLVMIntSGT,
- lod_fpart, LLVMConstNull(i32_type),
- "need_lerp");
+ if (num_quads == 1) {
+ need_lerp = LLVMBuildICmp(builder, LLVMIntSGT,
+ lod_fpart, bld->perquadi_bld.zero,
+ "need_lerp");
+ }
+ else {
+ /*
+ * We'll do mip filtering if any of the quads need it.
+ * It might be better to split the vectors here and only fetch/filter
+ * quads which need it.
+ */
+ /*
+ * We need to clamp lod_fpart here since we can get negative
+ * values which would screw up filtering if not all
+ * lod_fpart values have same sign.
+ * We can however then skip the greater than comparison.
+ */
+ lod_fpart = lp_build_max(&bld->perquadi_bld, lod_fpart,
+ bld->perquadi_bld.zero);
+ need_lerp = lp_build_any_true_range(&bld->perquadi_bld, num_quads, lod_fpart);
+ }
lp_build_if(&if_ctx, bld->gallivm, need_lerp);
{
struct lp_build_context h16_bld;
- lp_build_context_init(&h16_bld, bld->gallivm, lp_type_ufixed(16));
+ lp_build_context_init(&h16_bld, bld->gallivm, lp_type_ufixed(16, bld->vector_width));
/* sample the second mipmap level */
lp_build_mipmap_level_sizes(bld, ilevel1,
&size1,
&row_stride1_vec, &img_stride1_vec);
data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1);
- if (img_filter == PIPE_TEX_FILTER_NEAREST) {
- lp_build_sample_image_nearest(bld,
- size1,
- row_stride1_vec, img_stride1_vec,
- data_ptr1, s, t, r,
- &colors1_lo, &colors1_hi);
+
+ if (util_cpu_caps.has_avx && bld->coord_type.length > 4) {
+ if (img_filter == PIPE_TEX_FILTER_NEAREST) {
+ lp_build_sample_image_nearest_afloat(bld,
+ size1,
+ row_stride1_vec, img_stride1_vec,
+ data_ptr1, s, t, r,
+ &colors1_lo, &colors1_hi);
+ }
+ else {
+ lp_build_sample_image_linear_afloat(bld,
+ size1,
+ row_stride1_vec, img_stride1_vec,
+ data_ptr1, s, t, r,
+ &colors1_lo, &colors1_hi);
+ }
}
else {
- lp_build_sample_image_linear(bld,
- size1,
- row_stride1_vec, img_stride1_vec,
- data_ptr1, s, t, r,
- &colors1_lo, &colors1_hi);
+ if (img_filter == PIPE_TEX_FILTER_NEAREST) {
+ lp_build_sample_image_nearest(bld,
+ size1,
+ row_stride1_vec, img_stride1_vec,
+ data_ptr1, s, t, r,
+ &colors1_lo, &colors1_hi);
+ }
+ else {
+ lp_build_sample_image_linear(bld,
+ size1,
+ row_stride1_vec, img_stride1_vec,
+ data_ptr1, s, t, r,
+ &colors1_lo, &colors1_hi);
+ }
}
/* interpolate samples from the two mipmap levels */
- lod_fpart = LLVMBuildTrunc(builder, lod_fpart, h16_bld.elem_type, "");
- lod_fpart = lp_build_broadcast_scalar(&h16_bld, lod_fpart);
+ if (num_quads == 1) {
+ lod_fpart = LLVMBuildTrunc(builder, lod_fpart, h16_bld.elem_type, "");
+ lod_fpart = lp_build_broadcast_scalar(&h16_bld, lod_fpart);
#if HAVE_LLVM == 0x208
- /* This is a work-around for a bug in LLVM 2.8.
- * Evidently, something goes wrong in the construction of the
- * lod_fpart short[8] vector. Adding this no-effect shuffle seems
- * to force the vector to be properly constructed.
- * Tested with mesa-demos/src/tests/mipmap_limits.c (press t, f).
- */
- {
- LLVMValueRef shuffles[8], shuffle;
- int i;
- assert(h16_bld.type.length <= Elements(shuffles));
- for (i = 0; i < h16_bld.type.length; i++)
- shuffles[i] = lp_build_const_int32(bld->gallivm, 2 * (i & 1));
- shuffle = LLVMConstVector(shuffles, h16_bld.type.length);
- lod_fpart = LLVMBuildShuffleVector(builder,
- lod_fpart, lod_fpart,
- shuffle, "");
- }
+ /* This is a work-around for a bug in LLVM 2.8.
+ * Evidently, something goes wrong in the construction of the
+ * lod_fpart short[8] vector. Adding this no-effect shuffle seems
+ * to force the vector to be properly constructed.
+ * Tested with mesa-demos/src/tests/mipmap_limits.c (press t, f).
+ */
+ {
+ LLVMValueRef shuffles[8], shuffle;
+ assert(h16_bld.type.length <= Elements(shuffles));
+ for (i = 0; i < h16_bld.type.length; i++)
+ shuffles[i] = lp_build_const_int32(bld->gallivm, 2 * (i & 1));
+ shuffle = LLVMConstVector(shuffles, h16_bld.type.length);
+ lod_fpart = LLVMBuildShuffleVector(builder,
+ lod_fpart, lod_fpart,
+ shuffle, "");
+ }
#endif
- colors0_lo = lp_build_lerp(&h16_bld, lod_fpart,
- colors0_lo, colors1_lo);
- colors0_hi = lp_build_lerp(&h16_bld, lod_fpart,
- colors0_hi, colors1_hi);
+ colors0_lo = lp_build_lerp(&h16_bld, lod_fpart,
+ colors0_lo, colors1_lo);
+ colors0_hi = lp_build_lerp(&h16_bld, lod_fpart,
+ colors0_hi, colors1_hi);
+ }
+ else {
+ LLVMValueRef lod_parts[LP_MAX_VECTOR_LENGTH/16];
+ struct lp_type perquadi16_type = bld->perquadi_bld.type;
+ perquadi16_type.width /= 2;
+ perquadi16_type.length *= 2;
+ lod_fpart = LLVMBuildBitCast(builder, lod_fpart,
+ lp_build_vec_type(bld->gallivm,
+ perquadi16_type), "");
+ /* XXX this only works for exactly 2 quads. More quads need shuffle */
+ assert(num_quads == 2);
+ for (i = 0; i < num_quads; i++) {
+ LLVMValueRef indexi2 = lp_build_const_int32(bld->gallivm, i*2);
+ lod_parts[i] = lp_build_extract_broadcast(bld->gallivm,
+ perquadi16_type,
+ h16_bld.type,
+ lod_fpart,
+ indexi2);
+ }
+ colors0_lo = lp_build_lerp(&h16_bld, lod_parts[0],
+ colors0_lo, colors1_lo);
+ colors0_hi = lp_build_lerp(&h16_bld, lod_parts[1],
+ colors0_hi, colors1_hi);
+ }
LLVMBuildStore(builder, colors0_lo, colors_lo_var);
LLVMBuildStore(builder, colors0_hi, colors_hi_var);
@@ -948,10 +1531,10 @@ lp_build_sample_aos(struct lp_build_sample_context *bld,
LLVMValueRef s,
LLVMValueRef t,
LLVMValueRef r,
- const LLVMValueRef *ddx,
- const LLVMValueRef *ddy,
- LLVMValueRef lod_bias, /* optional */
- LLVMValueRef explicit_lod, /* optional */
+ LLVMValueRef lod_ipart,
+ LLVMValueRef lod_fpart,
+ LLVMValueRef ilevel0,
+ LLVMValueRef ilevel1,
LLVMValueRef texel_out[4])
{
struct lp_build_context *int_bld = &bld->int_bld;
@@ -960,14 +1543,9 @@ lp_build_sample_aos(struct lp_build_sample_context *bld,
const unsigned min_filter = bld->static_state->min_img_filter;
const unsigned mag_filter = bld->static_state->mag_img_filter;
const unsigned dims = bld->dims;
- LLVMValueRef lod_ipart = NULL, lod_fpart = NULL;
- LLVMValueRef ilevel0, ilevel1 = NULL;
LLVMValueRef packed, packed_lo, packed_hi;
LLVMValueRef unswizzled[4];
- LLVMValueRef face_ddx[4], face_ddy[4];
struct lp_build_context h16_bld;
- LLVMValueRef first_level;
- LLVMValueRef i32t_zero = lp_build_const_int32(bld->gallivm, 0);
/* we only support the common/simple wrap modes at this time */
assert(lp_is_simple_wrap_mode(bld->static_state->wrap_s));
@@ -978,81 +1556,7 @@ lp_build_sample_aos(struct lp_build_sample_context *bld,
/* make 16-bit fixed-pt builder context */
- lp_build_context_init(&h16_bld, bld->gallivm, lp_type_ufixed(16));
-
- /* cube face selection, compute pre-face coords, etc. */
- if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
- LLVMValueRef face, face_s, face_t;
- lp_build_cube_lookup(bld, s, t, r, &face, &face_s, &face_t);
- s = face_s; /* vec */
- t = face_t; /* vec */
- /* use 'r' to indicate cube face */
- r = lp_build_broadcast_scalar(&bld->int_coord_bld, face); /* vec */
-
- /* recompute ddx, ddy using the new (s,t) face texcoords */
- face_ddx[0] = lp_build_scalar_ddx(&bld->coord_bld, s);
- face_ddx[1] = lp_build_scalar_ddx(&bld->coord_bld, t);
- face_ddx[2] = NULL;
- face_ddx[3] = NULL;
- face_ddy[0] = lp_build_scalar_ddy(&bld->coord_bld, s);
- face_ddy[1] = lp_build_scalar_ddy(&bld->coord_bld, t);
- face_ddy[2] = NULL;
- face_ddy[3] = NULL;
- ddx = face_ddx;
- ddy = face_ddy;
- }
-
- /*
- * Compute the level of detail (float).
- */
- if (min_filter != mag_filter ||
- mip_filter != PIPE_TEX_MIPFILTER_NONE) {
- /* Need to compute lod either to choose mipmap levels or to
- * distinguish between minification/magnification with one mipmap level.
- */
- lp_build_lod_selector(bld, unit, ddx, ddy,
- lod_bias, explicit_lod,
- mip_filter,
- &lod_ipart, &lod_fpart);
- } else {
- lod_ipart = i32t_zero;
- }
-
- /*
- * Compute integer mipmap level(s) to fetch texels from: ilevel0, ilevel1
- */
- switch (mip_filter) {
- default:
- assert(0 && "bad mip_filter value in lp_build_sample_aos()");
- /* fall-through */
- case PIPE_TEX_MIPFILTER_NONE:
- /* always use mip level 0 */
- if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
- /* XXX this is a work-around for an apparent bug in LLVM 2.7.
- * We should be able to set ilevel0 = const(0) but that causes
- * bad x86 code to be emitted.
- */
- assert(lod_ipart);
- lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0);
- }
- else {
- first_level = bld->dynamic_state->first_level(bld->dynamic_state,
- bld->gallivm, unit);
- ilevel0 = first_level;
- }
- break;
- case PIPE_TEX_MIPFILTER_NEAREST:
- assert(lod_ipart);
- lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0);
- break;
- case PIPE_TEX_MIPFILTER_LINEAR:
- assert(lod_ipart);
- assert(lod_fpart);
- lp_build_linear_mip_levels(bld, unit,
- lod_ipart, &lod_fpart,
- &ilevel0, &ilevel1);
- break;
- }
+ lp_build_context_init(&h16_bld, bld->gallivm, lp_type_ufixed(16, bld->vector_width));
/*
* Get/interpolate texture colors.
@@ -1062,7 +1566,7 @@ lp_build_sample_aos(struct lp_build_sample_context *bld,
packed_hi = lp_build_alloca(bld->gallivm, h16_bld.vec_type, "packed_hi");
if (min_filter == mag_filter) {
- /* no need to distinquish between minification and magnification */
+ /* no need to distinguish between minification and magnification */
lp_build_sample_mipmap(bld,
min_filter, mip_filter,
s, t, r,
@@ -1106,7 +1610,7 @@ lp_build_sample_aos(struct lp_build_sample_context *bld,
* into 'packed'
*/
packed = lp_build_pack2(bld->gallivm,
- h16_bld.type, lp_type_unorm(8),
+ h16_bld.type, lp_type_unorm(8, bld->vector_width),
LLVMBuildLoad(builder, packed_lo, ""),
LLVMBuildLoad(builder, packed_hi, ""));
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h
index 5d9ecac4d50..55b3bc1c09a 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h
@@ -46,10 +46,10 @@ lp_build_sample_aos(struct lp_build_sample_context *bld,
LLVMValueRef s,
LLVMValueRef t,
LLVMValueRef r,
- const LLVMValueRef *ddx,
- const LLVMValueRef *ddy,
- LLVMValueRef lod_bias, /* optional */
- LLVMValueRef explicit_lod, /* optional */
+ LLVMValueRef lod_ipart,
+ LLVMValueRef lod_fpart,
+ LLVMValueRef ilevel0,
+ LLVMValueRef ilevel1,
LLVMValueRef texel_out[4]);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index 73dc3e77083..aaef7970635 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -41,6 +41,7 @@
#include "util/u_memory.h"
#include "util/u_math.h"
#include "util/u_format.h"
+#include "util/u_cpu_detect.h"
#include "lp_bld_debug.h"
#include "lp_bld_type.h"
#include "lp_bld_const.h"
@@ -57,6 +58,7 @@
#include "lp_bld_sample_aos.h"
#include "lp_bld_struct.h"
#include "lp_bld_quad.h"
+#include "lp_bld_pack.h"
/**
@@ -221,6 +223,41 @@ lp_build_coord_mirror(struct lp_build_sample_context *bld,
/**
+ * Helper to compute the first coord and the weight for
+ * linear wrap repeat npot textures
+ */
+void
+lp_build_coord_repeat_npot_linear(struct lp_build_sample_context *bld,
+ LLVMValueRef coord_f,
+ LLVMValueRef length_i,
+ LLVMValueRef length_f,
+ LLVMValueRef *coord0_i,
+ LLVMValueRef *weight_f)
+{
+ struct lp_build_context *coord_bld = &bld->coord_bld;
+ struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
+ LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
+ LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length_i,
+ int_coord_bld->one);
+ LLVMValueRef mask;
+ /* wrap with normalized floats is just fract */
+ coord_f = lp_build_fract(coord_bld, coord_f);
+ /* mul by size and subtract 0.5 */
+ coord_f = lp_build_mul(coord_bld, coord_f, length_f);
+ coord_f = lp_build_sub(coord_bld, coord_f, half);
+ /*
+ * we avoided the 0.5/length division before the repeat wrap,
+ * now need to fix up edge cases with selects
+ */
+ /* convert to int, compute lerp weight */
+ lp_build_ifloor_fract(coord_bld, coord_f, coord0_i, weight_f);
+ mask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
+ PIPE_FUNC_LESS, *coord0_i, int_coord_bld->zero);
+ *coord0_i = lp_build_select(int_coord_bld, mask, length_minus_one, *coord0_i);
+}
+
+
+/**
* Build LLVM code for texture wrap mode for linear filtering.
* \param x0_out returns first integer texcoord
* \param x1_out returns second integer texcoord
@@ -246,28 +283,27 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
switch(wrap_mode) {
case PIPE_TEX_WRAP_REPEAT:
- /* mul by size and subtract 0.5 */
- coord = lp_build_mul(coord_bld, coord, length_f);
- coord = lp_build_sub(coord_bld, coord, half);
- /* convert to int, compute lerp weight */
- lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
- /* repeat wrap */
if (is_pot) {
+ /* mul by size and subtract 0.5 */
+ coord = lp_build_mul(coord_bld, coord, length_f);
+ coord = lp_build_sub(coord_bld, coord, half);
+ /* convert to int, compute lerp weight */
+ lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
+ /* repeat wrap */
coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, "");
coord1 = LLVMBuildAnd(builder, coord1, length_minus_one, "");
}
else {
- /* Add a bias to the texcoord to handle negative coords */
- LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024);
LLVMValueRef mask;
- coord0 = LLVMBuildAdd(builder, coord0, bias, "");
- coord0 = LLVMBuildURem(builder, coord0, length, "");
- mask = lp_build_compare(bld->gallivm, int_coord_bld->type,
+ lp_build_coord_repeat_npot_linear(bld, coord,
+ length, length_f,
+ &coord0, &weight);
+ mask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);
coord1 = LLVMBuildAnd(builder,
- lp_build_add(int_coord_bld, coord0, int_coord_bld->one),
- mask, "");
+ lp_build_add(int_coord_bld, coord0, int_coord_bld->one),
+ mask, "");
}
break;
@@ -444,15 +480,16 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
switch(wrap_mode) {
case PIPE_TEX_WRAP_REPEAT:
- coord = lp_build_mul(coord_bld, coord, length_f);
- icoord = lp_build_ifloor(coord_bld, coord);
- if (is_pot)
+ if (is_pot) {
+ coord = lp_build_mul(coord_bld, coord, length_f);
+ icoord = lp_build_ifloor(coord_bld, coord);
icoord = LLVMBuildAnd(builder, icoord, length_minus_one, "");
+ }
else {
- /* Add a bias to the texcoord to handle negative coords */
- LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024);
- icoord = LLVMBuildAdd(builder, icoord, bias, "");
- icoord = LLVMBuildURem(builder, icoord, length, "");
+ /* take fraction, unnormalize */
+ coord = lp_build_fract_safe(coord_bld, coord);
+ coord = lp_build_mul(coord_bld, coord, length_f);
+ icoord = lp_build_itrunc(coord_bld, coord);
}
break;
@@ -473,7 +510,7 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
break;
case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
- /* Note: this is the same as CLAMP_TO_EDGE, except min = -min */
+ /* Note: this is the same as CLAMP_TO_EDGE, except min = -1 */
{
LLVMValueRef min, max;
@@ -873,12 +910,32 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
struct lp_build_if_state if_ctx;
LLVMValueRef need_lerp;
+ unsigned num_quads = bld->coord_bld.type.length / 4;
/* need_lerp = lod_fpart > 0 */
- need_lerp = LLVMBuildFCmp(builder, LLVMRealUGT,
- lod_fpart,
- bld->float_bld.zero,
- "need_lerp");
+ if (num_quads == 1) {
+ need_lerp = LLVMBuildFCmp(builder, LLVMRealUGT,
+ lod_fpart, bld->perquadf_bld.zero,
+ "need_lerp");
+ }
+ else {
+ /*
+ * We'll do mip filtering if any of the quads need it.
+ * It might be better to split the vectors here and only fetch/filter
+ * quads which need it.
+ */
+ /*
+ * We unfortunately need to clamp lod_fpart here since we can get
+ * negative values which would screw up filtering if not all
+ * lod_fpart values have same sign.
+ */
+ lod_fpart = lp_build_max(&bld->perquadf_bld, lod_fpart,
+ bld->perquadf_bld.zero);
+ need_lerp = lp_build_compare(bld->gallivm, bld->perquadf_bld.type,
+ PIPE_FUNC_GREATER,
+ lod_fpart, bld->perquadf_bld.zero);
+ need_lerp = lp_build_any_true_range(&bld->perquadi_bld, num_quads, need_lerp);
+ }
lp_build_if(&if_ctx, bld->gallivm, need_lerp);
{
@@ -904,7 +961,10 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
/* interpolate samples from the two mipmap levels */
- lod_fpart = lp_build_broadcast_scalar(&bld->texel_bld, lod_fpart);
+ lod_fpart = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
+ bld->perquadf_bld.type,
+ bld->texel_bld.type,
+ lod_fpart);
for (chan = 0; chan < 4; chan++) {
colors0[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart,
@@ -916,37 +976,28 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
}
}
-
-
/**
- * General texture sampling codegen.
- * This function handles texture sampling for all texture targets (1D,
- * 2D, 3D, cube) and all filtering modes.
+ * Calculate cube face, lod, mip levels.
*/
static void
-lp_build_sample_general(struct lp_build_sample_context *bld,
- unsigned unit,
- LLVMValueRef s,
- LLVMValueRef t,
- LLVMValueRef r,
- const LLVMValueRef *ddx,
- const LLVMValueRef *ddy,
- LLVMValueRef lod_bias, /* optional */
- LLVMValueRef explicit_lod, /* optional */
- LLVMValueRef *colors_out)
+lp_build_sample_common(struct lp_build_sample_context *bld,
+ unsigned unit,
+ LLVMValueRef *s,
+ LLVMValueRef *t,
+ LLVMValueRef *r,
+ const struct lp_derivatives *derivs,
+ LLVMValueRef lod_bias, /* optional */
+ LLVMValueRef explicit_lod, /* optional */
+ LLVMValueRef *lod_ipart,
+ LLVMValueRef *lod_fpart,
+ LLVMValueRef *ilevel0,
+ LLVMValueRef *ilevel1)
{
- struct lp_build_context *int_bld = &bld->int_bld;
- LLVMBuilderRef builder = bld->gallivm->builder;
const unsigned mip_filter = bld->static_state->min_mip_filter;
const unsigned min_filter = bld->static_state->min_img_filter;
const unsigned mag_filter = bld->static_state->mag_img_filter;
- LLVMValueRef lod_ipart = NULL, lod_fpart = NULL;
- LLVMValueRef ilevel0, ilevel1 = NULL;
- LLVMValueRef face_ddx[4], face_ddy[4];
- LLVMValueRef texels[4];
LLVMValueRef first_level;
- LLVMValueRef i32t_zero = lp_build_const_int32(bld->gallivm, 0);
- unsigned chan;
+ struct lp_derivatives face_derivs;
/*
printf("%s mip %d min %d mag %d\n", __FUNCTION__,
@@ -958,23 +1009,16 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
*/
if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
LLVMValueRef face, face_s, face_t;
- lp_build_cube_lookup(bld, s, t, r, &face, &face_s, &face_t);
- s = face_s; /* vec */
- t = face_t; /* vec */
+ lp_build_cube_lookup(bld, *s, *t, *r, &face, &face_s, &face_t);
+ *s = face_s; /* vec */
+ *t = face_t; /* vec */
/* use 'r' to indicate cube face */
- r = lp_build_broadcast_scalar(&bld->int_coord_bld, face); /* vec */
+ *r = face; /* vec */
/* recompute ddx, ddy using the new (s,t) face texcoords */
- face_ddx[0] = lp_build_scalar_ddx(&bld->coord_bld, s);
- face_ddx[1] = lp_build_scalar_ddx(&bld->coord_bld, t);
- face_ddx[2] = NULL;
- face_ddx[3] = NULL;
- face_ddy[0] = lp_build_scalar_ddy(&bld->coord_bld, s);
- face_ddy[1] = lp_build_scalar_ddy(&bld->coord_bld, t);
- face_ddy[2] = NULL;
- face_ddy[3] = NULL;
- ddx = face_ddx;
- ddy = face_ddy;
+ face_derivs.ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(&bld->coord_bld, *s, *t);
+ face_derivs.ddx_ddy[1] = NULL;
+ derivs = &face_derivs;
}
/*
@@ -985,12 +1029,12 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
/* Need to compute lod either to choose mipmap levels or to
* distinguish between minification/magnification with one mipmap level.
*/
- lp_build_lod_selector(bld, unit, ddx, ddy,
+ lp_build_lod_selector(bld, unit, derivs,
lod_bias, explicit_lod,
mip_filter,
- &lod_ipart, &lod_fpart);
+ lod_ipart, lod_fpart);
} else {
- lod_ipart = i32t_zero;
+ *lod_ipart = bld->perquadi_bld.zero;
}
/*
@@ -1006,28 +1050,56 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
/* XXX this is a work-around for an apparent bug in LLVM 2.7.
* We should be able to set ilevel0 = const(0) but that causes
* bad x86 code to be emitted.
+ * XXX should probably disable that on other llvm versions.
*/
- assert(lod_ipart);
- lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0);
+ assert(*lod_ipart);
+ lp_build_nearest_mip_level(bld, unit, *lod_ipart, ilevel0);
}
else {
first_level = bld->dynamic_state->first_level(bld->dynamic_state,
bld->gallivm, unit);
- ilevel0 = first_level;
+ first_level = lp_build_broadcast_scalar(&bld->perquadi_bld, first_level);
+ *ilevel0 = first_level;
}
break;
case PIPE_TEX_MIPFILTER_NEAREST:
- assert(lod_ipart);
- lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0);
+ assert(*lod_ipart);
+ lp_build_nearest_mip_level(bld, unit, *lod_ipart, ilevel0);
break;
case PIPE_TEX_MIPFILTER_LINEAR:
- assert(lod_ipart);
- assert(lod_fpart);
+ assert(*lod_ipart);
+ assert(*lod_fpart);
lp_build_linear_mip_levels(bld, unit,
- lod_ipart, &lod_fpart,
- &ilevel0, &ilevel1);
+ *lod_ipart, lod_fpart,
+ ilevel0, ilevel1);
break;
}
+}
+
+/**
+ * General texture sampling codegen.
+ * This function handles texture sampling for all texture targets (1D,
+ * 2D, 3D, cube) and all filtering modes.
+ */
+static void
+lp_build_sample_general(struct lp_build_sample_context *bld,
+ unsigned unit,
+ LLVMValueRef s,
+ LLVMValueRef t,
+ LLVMValueRef r,
+ LLVMValueRef lod_ipart,
+ LLVMValueRef lod_fpart,
+ LLVMValueRef ilevel0,
+ LLVMValueRef ilevel1,
+ LLVMValueRef *colors_out)
+{
+ struct lp_build_context *int_bld = &bld->int_bld;
+ LLVMBuilderRef builder = bld->gallivm->builder;
+ const unsigned mip_filter = bld->static_state->min_mip_filter;
+ const unsigned min_filter = bld->static_state->min_img_filter;
+ const unsigned mag_filter = bld->static_state->mag_img_filter;
+ LLVMValueRef texels[4];
+ unsigned chan;
/*
* Get/interpolate texture colors.
@@ -1039,7 +1111,7 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
}
if (min_filter == mag_filter) {
- /* no need to distinquish between minification and magnification */
+ /* no need to distinguish between minification and magnification */
lp_build_sample_mipmap(bld, unit,
min_filter, mip_filter,
s, t, r,
@@ -1135,7 +1207,10 @@ lp_build_sample_compare(struct lp_build_sample_context *bld,
* For debugging.
*/
void
-lp_build_sample_nop(struct gallivm_state *gallivm, struct lp_type type,
+lp_build_sample_nop(struct gallivm_state *gallivm,
+ struct lp_type type,
+ unsigned num_coords,
+ const LLVMValueRef *coords,
LLVMValueRef texel_out[4])
{
LLVMValueRef one = lp_build_one(gallivm, type);
@@ -1152,8 +1227,7 @@ lp_build_sample_nop(struct gallivm_state *gallivm, struct lp_type type,
* 'texel' will return a vector of four LLVMValueRefs corresponding to
* R, G, B, A.
* \param type vector float type to use for coords, etc.
- * \param ddx partial derivatives of (s,t,r,q) with respect to x
- * \param ddy partial derivatives of (s,t,r,q) with respect to y
+ * \param derivs partial derivatives of (s,t,r,q) with respect to x and y
*/
void
lp_build_sample_soa(struct gallivm_state *gallivm,
@@ -1163,8 +1237,7 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
unsigned unit,
unsigned num_coords,
const LLVMValueRef *coords,
- const LLVMValueRef ddx[4],
- const LLVMValueRef ddy[4],
+ const struct lp_derivatives *derivs,
LLVMValueRef lod_bias, /* optional */
LLVMValueRef explicit_lod, /* optional */
LLVMValueRef texel_out[4])
@@ -1173,10 +1246,10 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
struct lp_build_sample_context bld;
LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef tex_width, tex_height, tex_depth;
LLVMValueRef s;
LLVMValueRef t;
LLVMValueRef r;
- struct lp_type float_vec_type;
if (0) {
enum pipe_format fmt = static_state->format;
@@ -1193,6 +1266,8 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
bld.format_desc = util_format_description(static_state->format);
bld.dims = dims;
+ bld.vector_width = lp_type_width(type);
+
bld.float_type = lp_type_float(32);
bld.int_type = lp_type_int(32);
bld.coord_type = type;
@@ -1201,22 +1276,26 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
bld.float_size_type.length = dims > 1 ? 4 : 1;
bld.int_size_type = lp_int_type(bld.float_size_type);
bld.texel_type = type;
-
- float_vec_type = lp_type_float_vec(32);
+ bld.perquadf_type = type;
+ /* we want native vector size to be able to use our intrinsics */
+ bld.perquadf_type.length = type.length > 4 ? ((type.length + 15) / 16) * 4 : 1;
+ bld.perquadi_type = lp_int_type(bld.perquadf_type);
lp_build_context_init(&bld.float_bld, gallivm, bld.float_type);
- lp_build_context_init(&bld.float_vec_bld, gallivm, float_vec_type);
+ lp_build_context_init(&bld.float_vec_bld, gallivm, type);
lp_build_context_init(&bld.int_bld, gallivm, bld.int_type);
lp_build_context_init(&bld.coord_bld, gallivm, bld.coord_type);
lp_build_context_init(&bld.int_coord_bld, gallivm, bld.int_coord_type);
lp_build_context_init(&bld.int_size_bld, gallivm, bld.int_size_type);
lp_build_context_init(&bld.float_size_bld, gallivm, bld.float_size_type);
lp_build_context_init(&bld.texel_bld, gallivm, bld.texel_type);
+ lp_build_context_init(&bld.perquadf_bld, gallivm, bld.perquadf_type);
+ lp_build_context_init(&bld.perquadi_bld, gallivm, bld.perquadi_type);
/* Get the dynamic state */
- bld.width = dynamic_state->width(dynamic_state, gallivm, unit);
- bld.height = dynamic_state->height(dynamic_state, gallivm, unit);
- bld.depth = dynamic_state->depth(dynamic_state, gallivm, unit);
+ tex_width = dynamic_state->width(dynamic_state, gallivm, unit);
+ tex_height = dynamic_state->height(dynamic_state, gallivm, unit);
+ tex_depth = dynamic_state->depth(dynamic_state, gallivm, unit);
bld.row_stride_array = dynamic_state->row_stride(dynamic_state, gallivm, unit);
bld.img_stride_array = dynamic_state->img_stride(dynamic_state, gallivm, unit);
bld.data_array = dynamic_state->data_ptr(dynamic_state, gallivm, unit);
@@ -1228,37 +1307,40 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
/* width, height, depth as single int vector */
if (dims <= 1) {
- bld.int_size = bld.width;
+ bld.int_size = tex_width;
}
else {
bld.int_size = LLVMBuildInsertElement(builder, bld.int_size_bld.undef,
- bld.width, LLVMConstInt(i32t, 0, 0), "");
+ tex_width, LLVMConstInt(i32t, 0, 0), "");
if (dims >= 2) {
bld.int_size = LLVMBuildInsertElement(builder, bld.int_size,
- bld.height, LLVMConstInt(i32t, 1, 0), "");
+ tex_height, LLVMConstInt(i32t, 1, 0), "");
if (dims >= 3) {
bld.int_size = LLVMBuildInsertElement(builder, bld.int_size,
- bld.depth, LLVMConstInt(i32t, 2, 0), "");
+ tex_depth, LLVMConstInt(i32t, 2, 0), "");
}
}
}
if (0) {
/* For debug: no-op texture sampling */
- lp_build_sample_nop(gallivm, bld.texel_type, texel_out);
- }
- else if (util_format_fits_8unorm(bld.format_desc) &&
- lp_is_simple_wrap_mode(static_state->wrap_s) &&
- lp_is_simple_wrap_mode(static_state->wrap_t)) {
- /* do sampling/filtering with fixed pt arithmetic */
- lp_build_sample_aos(&bld, unit, s, t, r, ddx, ddy,
- lod_bias, explicit_lod,
+ lp_build_sample_nop(gallivm,
+ bld.texel_type,
+ num_coords,
+ coords,
texel_out);
}
-
else {
+ LLVMValueRef lod_ipart = NULL, lod_fpart = NULL;
+ LLVMValueRef ilevel0 = NULL, ilevel1 = NULL;
+ unsigned num_quads = type.length / 4;
+ const unsigned mip_filter = bld.static_state->min_mip_filter;
+ boolean use_aos = util_format_fits_8unorm(bld.format_desc) &&
+ lp_is_simple_wrap_mode(static_state->wrap_s) &&
+ lp_is_simple_wrap_mode(static_state->wrap_t);
+
if ((gallivm_debug & GALLIVM_DEBUG_PERF) &&
- util_format_fits_8unorm(bld.format_desc)) {
+ !use_aos && util_format_fits_8unorm(bld.format_desc)) {
debug_printf("%s: using floating point linear filtering for %s\n",
__FUNCTION__, bld.format_desc->short_name);
debug_printf(" min_img %d mag_img %d mip %d wraps %d wrapt %d\n",
@@ -1269,9 +1351,203 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
static_state->wrap_t);
}
- lp_build_sample_general(&bld, unit, s, t, r, ddx, ddy,
- lod_bias, explicit_lod,
- texel_out);
+ lp_build_sample_common(&bld, unit,
+ &s, &t, &r,
+ derivs, lod_bias, explicit_lod,
+ &lod_ipart, &lod_fpart,
+ &ilevel0, &ilevel1);
+
+ /*
+ * we only try 8-wide sampling with soa as it appears to
+ * be a loss with aos with AVX.
+ */
+ if (num_quads == 1 || (mip_filter == PIPE_TEX_MIPFILTER_NONE &&
+ !use_aos)) {
+
+ if (num_quads > 1) {
+ LLVMValueRef index0 = lp_build_const_int32(gallivm, 0);
+ /* These parameters are the same for all quads */
+ lod_ipart = LLVMBuildExtractElement(builder, lod_ipart, index0, "");
+ ilevel0 = LLVMBuildExtractElement(builder, ilevel0, index0, "");
+ }
+ if (use_aos) {
+ /* do sampling/filtering with fixed pt arithmetic */
+ lp_build_sample_aos(&bld, unit,
+ s, t, r,
+ lod_ipart, lod_fpart,
+ ilevel0, ilevel1,
+ texel_out);
+ }
+
+ else {
+ lp_build_sample_general(&bld, unit,
+ s, t, r,
+ lod_ipart, lod_fpart,
+ ilevel0, ilevel1,
+ texel_out);
+ }
+ }
+ else {
+ struct lp_build_if_state if_ctx;
+ LLVMValueRef notsame_levels, notsame;
+ LLVMValueRef index0 = lp_build_const_int32(gallivm, 0);
+ LLVMValueRef texels[4];
+ LLVMValueRef texelout[4];
+ unsigned j;
+
+ texels[0] = lp_build_alloca(gallivm, bld.texel_bld.vec_type, "texr");
+ texels[1] = lp_build_alloca(gallivm, bld.texel_bld.vec_type, "texg");
+ texels[2] = lp_build_alloca(gallivm, bld.texel_bld.vec_type, "texb");
+ texels[3] = lp_build_alloca(gallivm, bld.texel_bld.vec_type, "texa");
+
+ /* only build the if-statement if we MAY split, otherwise always split */
+ if (!use_aos) {
+ notsame = lp_build_extract_broadcast(gallivm,
+ bld.perquadi_bld.type,
+ bld.perquadi_bld.type,
+ ilevel0, index0);
+ notsame = lp_build_sub(&bld.perquadi_bld, ilevel0, notsame);
+ notsame_levels = lp_build_any_true_range(&bld.perquadi_bld, num_quads,
+ notsame);
+ if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
+ notsame = lp_build_extract_broadcast(gallivm,
+ bld.perquadi_bld.type,
+ bld.perquadi_bld.type,
+ ilevel1, index0);
+ notsame = lp_build_sub(&bld.perquadi_bld, ilevel1, notsame);
+ notsame = lp_build_any_true_range(&bld.perquadi_bld, num_quads, notsame);
+ notsame_levels = LLVMBuildOr(builder, notsame_levels, notsame, "");
+ }
+ lp_build_if(&if_ctx, gallivm, notsame_levels);
+ }
+
+ {
+ struct lp_build_sample_context bld4;
+ struct lp_type type4 = type;
+ unsigned i;
+ LLVMValueRef texelout4[4];
+ LLVMValueRef texelouttmp[4][LP_MAX_VECTOR_LENGTH/16];
+
+ type4.length = 4;
+
+ /* Setup our build context */
+ memset(&bld4, 0, sizeof bld4);
+ bld4.gallivm = bld.gallivm;
+ bld4.static_state = bld.static_state;
+ bld4.dynamic_state = bld.dynamic_state;
+ bld4.format_desc = bld.format_desc;
+ bld4.dims = bld.dims;
+ bld4.row_stride_array = bld.row_stride_array;
+ bld4.img_stride_array = bld.img_stride_array;
+ bld4.data_array = bld.data_array;
+ bld4.int_size = bld.int_size;
+
+ bld4.vector_width = lp_type_width(type4);
+
+ bld4.float_type = lp_type_float(32);
+ bld4.int_type = lp_type_int(32);
+ bld4.coord_type = type4;
+ bld4.int_coord_type = lp_int_type(type4);
+ bld4.float_size_type = lp_type_float(32);
+ bld4.float_size_type.length = dims > 1 ? 4 : 1;
+ bld4.int_size_type = lp_int_type(bld4.float_size_type);
+ bld4.texel_type = type4;
+ bld4.perquadf_type = type4;
+ /* we want native vector size to be able to use our intrinsics */
+ bld4.perquadf_type.length = 1;
+ bld4.perquadi_type = lp_int_type(bld4.perquadf_type);
+
+ lp_build_context_init(&bld4.float_bld, gallivm, bld4.float_type);
+ lp_build_context_init(&bld4.float_vec_bld, gallivm, type4);
+ lp_build_context_init(&bld4.int_bld, gallivm, bld4.int_type);
+ lp_build_context_init(&bld4.coord_bld, gallivm, bld4.coord_type);
+ lp_build_context_init(&bld4.int_coord_bld, gallivm, bld4.int_coord_type);
+ lp_build_context_init(&bld4.int_size_bld, gallivm, bld4.int_size_type);
+ lp_build_context_init(&bld4.float_size_bld, gallivm, bld4.float_size_type);
+ lp_build_context_init(&bld4.texel_bld, gallivm, bld4.texel_type);
+ lp_build_context_init(&bld4.perquadf_bld, gallivm, bld4.perquadf_type);
+ lp_build_context_init(&bld4.perquadi_bld, gallivm, bld4.perquadi_type);
+
+ for (i = 0; i < num_quads; i++) {
+ LLVMValueRef s4, t4, r4;
+ LLVMValueRef lod_iparts, lod_fparts = NULL;
+ LLVMValueRef ilevel0s, ilevel1s = NULL;
+ LLVMValueRef indexi = lp_build_const_int32(gallivm, i);
+
+ s4 = lp_build_extract_range(gallivm, s, 4*i, 4);
+ t4 = lp_build_extract_range(gallivm, t, 4*i, 4);
+ r4 = lp_build_extract_range(gallivm, r, 4*i, 4);
+ lod_iparts = LLVMBuildExtractElement(builder, lod_ipart, indexi, "");
+ ilevel0s = LLVMBuildExtractElement(builder, ilevel0, indexi, "");
+ if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
+ ilevel1s = LLVMBuildExtractElement(builder, ilevel1, indexi, "");
+ lod_fparts = LLVMBuildExtractElement(builder, lod_fpart, indexi, "");
+ }
+
+ if (use_aos) {
+ /* do sampling/filtering with fixed pt arithmetic */
+ lp_build_sample_aos(&bld4, unit,
+ s4, t4, r4,
+ lod_iparts, lod_fparts,
+ ilevel0s, ilevel1s,
+ texelout4);
+ }
+
+ else {
+ lp_build_sample_general(&bld4, unit,
+ s4, t4, r4,
+ lod_iparts, lod_fparts,
+ ilevel0s, ilevel1s,
+ texelout4);
+ }
+ for (j = 0; j < 4; j++) {
+ texelouttmp[j][i] = texelout4[j];
+ }
+ }
+ for (j = 0; j < 4; j++) {
+ texelout[j] = lp_build_concat(gallivm, texelouttmp[j], type4, num_quads);
+ LLVMBuildStore(builder, texelout[j], texels[j]);
+ }
+ }
+ if (!use_aos) {
+ LLVMValueRef ilevel0s, lod_iparts, ilevel1s = NULL;
+
+ lp_build_else(&if_ctx);
+
+ /* These parameters are the same for all quads */
+ lod_iparts = LLVMBuildExtractElement(builder, lod_ipart, index0, "");
+ ilevel0s = LLVMBuildExtractElement(builder, ilevel0, index0, "");
+ if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
+ ilevel1s = LLVMBuildExtractElement(builder, ilevel1, index0, "");
+ }
+
+ if (use_aos) {
+ /* do sampling/filtering with fixed pt arithmetic */
+ lp_build_sample_aos(&bld, unit,
+ s, t, r,
+ lod_iparts, lod_fpart,
+ ilevel0s, ilevel1s,
+ texelout);
+ }
+
+ else {
+ lp_build_sample_general(&bld, unit,
+ s, t, r,
+ lod_iparts, lod_fpart,
+ ilevel0s, ilevel1s,
+ texelout);
+ }
+ for (j = 0; j < 4; j++) {
+ LLVMBuildStore(builder, texelout[j], texels[j]);
+ }
+
+ lp_build_endif(&if_ctx);
+ }
+
+ for (j = 0; j < 4; j++) {
+ texel_out[j] = LLVMBuildLoad(builder, texels[j], "");
+ }
+ }
}
lp_build_sample_compare(&bld, r, texel_out);
@@ -1283,6 +1559,7 @@ void
lp_build_size_query_soa(struct gallivm_state *gallivm,
const struct lp_sampler_static_state *static_state,
struct lp_sampler_dynamic_state *dynamic_state,
+ struct lp_type int_type,
unsigned unit,
LLVMValueRef explicit_lod,
LLVMValueRef *sizes_out)
@@ -1311,7 +1588,9 @@ lp_build_size_query_soa(struct gallivm_state *gallivm,
return;
}
- lp_build_context_init(&bld_int_vec, gallivm, lp_type_int_vec(32));
+ assert(!int_type.floating);
+
+ lp_build_context_init(&bld_int_vec, gallivm, lp_type_int_vec(32, 128));
if (explicit_lod) {
LLVMValueRef first_level;
@@ -1345,7 +1624,7 @@ lp_build_size_query_soa(struct gallivm_state *gallivm,
size = lp_build_minify(&bld_int_vec, size, lod);
for (i=0; i < dims; i++) {
- sizes_out[i] = lp_build_extract_broadcast(gallivm, bld_int_vec.type, bld_int_vec.type,
+ sizes_out[i] = lp_build_extract_broadcast(gallivm, bld_int_vec.type, int_type,
size,
lp_build_const_int32(gallivm, i));
}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
index 5d4406812c7..641c960431d 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
@@ -40,6 +40,7 @@
#include "lp_bld_init.h"
#include "lp_bld_logic.h"
#include "lp_bld_swizzle.h"
+#include "lp_bld_pack.h"
LLVMValueRef
@@ -95,7 +96,7 @@ lp_build_broadcast_scalar(struct lp_build_context *bld,
/**
- * Combined extract and broadcast (or a mere shuffle when the two types match)
+ * Combined extract and broadcast (mere shuffle in most cases)
*/
LLVMValueRef
lp_build_extract_broadcast(struct gallivm_state *gallivm,
@@ -132,9 +133,9 @@ lp_build_extract_broadcast(struct gallivm_state *gallivm,
}
}
else {
- if (dst_type.length == src_type.length) {
+ if (dst_type.length > 1) {
/*
- * Special shuffle of the same size.
+ * shuffle - result can be of different length.
*/
LLVMValueRef shuffle;
@@ -142,28 +143,14 @@ lp_build_extract_broadcast(struct gallivm_state *gallivm,
LLVMVectorType(i32t, dst_type.length),
index);
res = LLVMBuildShuffleVector(gallivm->builder, vector,
- LLVMGetUndef(lp_build_vec_type(gallivm, dst_type)),
+ LLVMGetUndef(lp_build_vec_type(gallivm, src_type)),
shuffle, "");
}
else {
- LLVMValueRef scalar;
- scalar = LLVMBuildExtractElement(gallivm->builder, vector, index, "");
- if (dst_type.length == 1) {
- /*
- * Trivial extract scalar from vector.
- */
-
- res = scalar;
- }
- else {
- /*
- * General case of different sized vectors.
- */
-
- res = lp_build_broadcast(gallivm,
- lp_build_vec_type(gallivm, dst_type),
- vector);
- }
+ /*
+ * Trivial extract scalar from vector.
+ */
+ res = LLVMBuildExtractElement(gallivm->builder, vector, index, "");
}
}
@@ -290,6 +277,8 @@ lp_build_swizzle_aos(struct lp_build_context *bld,
return bld->zero;
case PIPE_SWIZZLE_ONE:
return bld->one;
+ case LP_BLD_SWIZZLE_DONTCARE:
+ return bld->undef;
default:
assert(0);
return bld->undef;
@@ -319,21 +308,26 @@ lp_build_swizzle_aos(struct lp_build_context *bld,
case PIPE_SWIZZLE_BLUE:
case PIPE_SWIZZLE_ALPHA:
shuffle = j + swizzles[i];
+ shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0);
break;
case PIPE_SWIZZLE_ZERO:
shuffle = type.length + 0;
+ shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0);
if (!aux[0]) {
aux[0] = lp_build_const_elem(bld->gallivm, type, 0.0);
}
break;
case PIPE_SWIZZLE_ONE:
shuffle = type.length + 1;
+ shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0);
if (!aux[1]) {
aux[1] = lp_build_const_elem(bld->gallivm, type, 1.0);
}
break;
+ case LP_BLD_SWIZZLE_DONTCARE:
+ shuffles[j + i] = LLVMGetUndef(i32t);
+ break;
}
- shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0);
}
}
@@ -508,3 +502,127 @@ lp_build_swizzle_soa_inplace(struct lp_build_context *bld,
lp_build_swizzle_soa(bld, unswizzled, swizzles, values);
}
+
+
+/**
+ * Transpose from AOS <-> SOA
+ *
+ * @param single_type_lp type of pixels
+ * @param src the 4 * n pixel input
+ * @param dst the 4 * n pixel output
+ */
+void
+lp_build_transpose_aos(struct gallivm_state *gallivm,
+ struct lp_type single_type_lp,
+ const LLVMValueRef src[4],
+ LLVMValueRef dst[4])
+{
+ struct lp_type double_type_lp = single_type_lp;
+ LLVMTypeRef single_type;
+ LLVMTypeRef double_type;
+ LLVMValueRef t0, t1, t2, t3;
+
+ double_type_lp.length >>= 1;
+ double_type_lp.width <<= 1;
+
+ double_type = lp_build_vec_type(gallivm, double_type_lp);
+ single_type = lp_build_vec_type(gallivm, single_type_lp);
+
+ /* Interleave x, y, z, w -> xy and zw */
+ t0 = lp_build_interleave2_half(gallivm, single_type_lp, src[0], src[1], 0);
+ t1 = lp_build_interleave2_half(gallivm, single_type_lp, src[2], src[3], 0);
+ t2 = lp_build_interleave2_half(gallivm, single_type_lp, src[0], src[1], 1);
+ t3 = lp_build_interleave2_half(gallivm, single_type_lp, src[2], src[3], 1);
+
+ /* Cast to double width type for second interleave */
+ t0 = LLVMBuildBitCast(gallivm->builder, t0, double_type, "t0");
+ t1 = LLVMBuildBitCast(gallivm->builder, t1, double_type, "t1");
+ t2 = LLVMBuildBitCast(gallivm->builder, t2, double_type, "t2");
+ t3 = LLVMBuildBitCast(gallivm->builder, t3, double_type, "t3");
+
+ /* Interleave xy, zw -> xyzw */
+ dst[0] = lp_build_interleave2_half(gallivm, double_type_lp, t0, t1, 0);
+ dst[1] = lp_build_interleave2_half(gallivm, double_type_lp, t0, t1, 1);
+ dst[2] = lp_build_interleave2_half(gallivm, double_type_lp, t2, t3, 0);
+ dst[3] = lp_build_interleave2_half(gallivm, double_type_lp, t2, t3, 1);
+
+ /* Cast back to original single width type */
+ dst[0] = LLVMBuildBitCast(gallivm->builder, dst[0], single_type, "dst0");
+ dst[1] = LLVMBuildBitCast(gallivm->builder, dst[1], single_type, "dst1");
+ dst[2] = LLVMBuildBitCast(gallivm->builder, dst[2], single_type, "dst2");
+ dst[3] = LLVMBuildBitCast(gallivm->builder, dst[3], single_type, "dst3");
+}
+
+
+/**
+ * Pack first element of aos values,
+ * pad out to destination size.
+ * i.e. x1 _ _ _ x2 _ _ _ will become x1 x2 _ _
+ */
+LLVMValueRef
+lp_build_pack_aos_scalars(struct gallivm_state *gallivm,
+ struct lp_type src_type,
+ struct lp_type dst_type,
+ const LLVMValueRef src)
+{
+ LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
+ LLVMValueRef undef = LLVMGetUndef(i32t);
+ LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
+ unsigned num_src = src_type.length / 4;
+ unsigned num_dst = dst_type.length;
+ unsigned i;
+
+ assert(num_src <= num_dst);
+
+ for (i = 0; i < num_src; i++) {
+ shuffles[i] = LLVMConstInt(i32t, i * 4, 0);
+ }
+ for (i = num_src; i < num_dst; i++) {
+ shuffles[i] = undef;
+ }
+
+ if (num_dst == 1) {
+ return LLVMBuildExtractElement(gallivm->builder, src, shuffles[0], "");
+ }
+ else {
+ return LLVMBuildShuffleVector(gallivm->builder, src, src,
+ LLVMConstVector(shuffles, num_dst), "");
+ }
+}
+
+
+/**
+ * Unpack and broadcast packed aos values consisting of only the
+ * first value, i.e. x1 x2 _ _ will become x1 x1 x1 x1 x2 x2 x2 x2
+ */
+LLVMValueRef
+lp_build_unpack_broadcast_aos_scalars(struct gallivm_state *gallivm,
+ struct lp_type src_type,
+ struct lp_type dst_type,
+ const LLVMValueRef src)
+{
+ LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
+ LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
+ unsigned num_dst = dst_type.length;
+ unsigned num_src = dst_type.length / 4;
+ unsigned i;
+
+ assert(num_dst / 4 <= src_type.length);
+
+ for (i = 0; i < num_src; i++) {
+ shuffles[i*4] = LLVMConstInt(i32t, i, 0);
+ shuffles[i*4+1] = LLVMConstInt(i32t, i, 0);
+ shuffles[i*4+2] = LLVMConstInt(i32t, i, 0);
+ shuffles[i*4+3] = LLVMConstInt(i32t, i, 0);
+ }
+
+ if (num_src == 1) {
+ return lp_build_extract_broadcast(gallivm, src_type, dst_type,
+ src, shuffles[0]);
+ }
+ else {
+ return LLVMBuildShuffleVector(gallivm->builder, src, src,
+ LLVMConstVector(shuffles, num_dst), "");
+ }
+}
+
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h
index c366a65103e..0bf4ce988a2 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h
@@ -44,6 +44,9 @@ struct lp_type;
struct lp_build_context;
+#define LP_BLD_SWIZZLE_DONTCARE 0xFF
+
+
LLVMValueRef
lp_build_broadcast(struct gallivm_state *gallivm,
LLVMTypeRef vec_type,
@@ -103,4 +106,25 @@ lp_build_swizzle_soa_inplace(struct lp_build_context *bld,
const unsigned char swizzles[4]);
+void
+lp_build_transpose_aos(struct gallivm_state *gallivm,
+ struct lp_type type,
+ const LLVMValueRef src[4],
+ LLVMValueRef dst[4]);
+
+
+LLVMValueRef
+lp_build_pack_aos_scalars(struct gallivm_state *gallivm,
+ struct lp_type src_type,
+ struct lp_type dst_type,
+ const LLVMValueRef src);
+
+
+LLVMValueRef
+lp_build_unpack_broadcast_aos_scalars(struct gallivm_state *gallivm,
+ struct lp_type src_type,
+ struct lp_type dst_type,
+ const LLVMValueRef src);
+
+
#endif /* !LP_BLD_SWIZZLE_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
index 4423bc5dedd..e292420a61a 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
@@ -60,6 +60,7 @@ struct tgsi_token;
struct tgsi_shader_info;
struct lp_build_mask_context;
struct gallivm_state;
+struct lp_derivatives;
enum lp_build_tex_modifier {
@@ -174,8 +175,7 @@ struct lp_build_sampler_soa
unsigned unit,
unsigned num_coords,
const LLVMValueRef *coords,
- const LLVMValueRef *ddx,
- const LLVMValueRef *ddy,
+ const struct lp_derivatives *derivs,
LLVMValueRef lod_bias, /* optional */
LLVMValueRef explicit_lod, /* optional */
LLVMValueRef *texel);
@@ -183,6 +183,7 @@ struct lp_build_sampler_soa
void
(*emit_size_query)( const struct lp_build_sampler_soa *sampler,
struct gallivm_state *gallivm,
+ struct lp_type type,
unsigned unit,
LLVMValueRef explicit_lod, /* optional */
LLVMValueRef *sizes_out);
@@ -197,8 +198,7 @@ struct lp_build_sampler_aos
unsigned target, /* TGSI_TEXTURE_* */
unsigned unit,
LLVMValueRef coords,
- LLVMValueRef ddx,
- LLVMValueRef ddy,
+ const struct lp_derivatives derivs,
enum lp_build_tex_modifier modifier);
};
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
index 24bc13a9be8..0666bba7fbd 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
@@ -56,6 +56,7 @@
#include "lp_bld_quad.h"
#include "lp_bld_tgsi.h"
#include "lp_bld_debug.h"
+#include "lp_bld_sample.h"
/**
@@ -363,6 +364,7 @@ emit_tex(struct lp_build_tgsi_aos_context *bld,
LLVMValueRef coords;
LLVMValueRef ddx;
LLVMValueRef ddy;
+ struct lp_derivatives derivs;
if (!bld->sampler) {
_debug_printf("warning: found texture instruction but no sampler generator supplied\n");
@@ -373,7 +375,7 @@ emit_tex(struct lp_build_tgsi_aos_context *bld,
coords = lp_build_emit_fetch( &bld->bld_base, inst, 0 , LP_CHAN_ALL);
- if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV) {
+ if (0 && modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV) {
ddx = lp_build_emit_fetch( &bld->bld_base, inst, 1 , LP_CHAN_ALL);
ddy = lp_build_emit_fetch( &bld->bld_base, inst, 2 , LP_CHAN_ALL);
unit = inst->Src[3].Register.Index;
@@ -383,8 +385,8 @@ emit_tex(struct lp_build_tgsi_aos_context *bld,
ddy = lp_build_ddy( &bld->bld_base.base, coords );
#else
/* TODO */
- ddx = bld->bld_base.base.one;
- ddy = bld->bld_base.base.one;
+ derivs.ddx_ddy[0] = bld->bld_base.base.one;
+ derivs.ddx_ddy[1] = bld->bld_base.base.one;
#endif
unit = inst->Src[1].Register.Index;
}
@@ -392,7 +394,7 @@ emit_tex(struct lp_build_tgsi_aos_context *bld,
return bld->sampler->emit_fetch_texel(bld->sampler,
&bld->bld_base.base,
target, unit,
- coords, ddx, ddy,
+ coords, derivs,
modifier);
}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index d9faaf20273..85a4401b534 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -62,6 +62,7 @@
#include "lp_bld_limits.h"
#include "lp_bld_debug.h"
#include "lp_bld_printf.h"
+#include "lp_bld_sample.h"
static void lp_exec_mask_init(struct lp_exec_mask *mask, struct lp_build_context *bld)
@@ -763,7 +764,7 @@ emit_fetch_temporary(
else {
LLVMValueRef temp_ptr;
if (stype != TGSI_TYPE_FLOAT && stype != TGSI_TYPE_UNTYPED) {
- LLVMTypeRef itype = LLVMPointerType(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4), 0);
+ LLVMTypeRef itype = LLVMPointerType(bld->bld_base.int_bld.vec_type, 0);
LLVMValueRef tint_ptr = lp_get_temp_ptr_soa(bld, reg->Register.Index,
swizzle);
temp_ptr = LLVMBuildBitCast(builder, tint_ptr, itype, "");
@@ -1068,7 +1069,7 @@ emit_store_chan(
switch (dtype) {
case TGSI_TYPE_UNSIGNED:
case TGSI_TYPE_SIGNED: {
- LLVMTypeRef itype = LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4);
+ LLVMTypeRef itype = bld_base->int_bld.vec_type;
LLVMTypeRef ivtype = LLVMPointerType(itype, 0);
LLVMValueRef tint_ptr = lp_get_temp_ptr_soa(bld, reg->Register.Index,
chan_index);
@@ -1141,13 +1142,14 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
LLVMValueRef *texel)
{
LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
+ struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
unsigned unit;
LLVMValueRef lod_bias, explicit_lod;
LLVMValueRef oow = NULL;
LLVMValueRef coords[3];
- LLVMValueRef ddx[3];
- LLVMValueRef ddy[3];
+ struct lp_derivatives derivs;
unsigned num_coords;
+ unsigned dims;
unsigned i;
if (!bld->sampler) {
@@ -1158,26 +1160,42 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
return;
}
+ derivs.ddx_ddy[0] = bld->bld_base.base.undef;
+ derivs.ddx_ddy[1] = bld->bld_base.base.undef;
+
switch (inst->Texture.Texture) {
case TGSI_TEXTURE_1D:
num_coords = 1;
+ dims = 1;
break;
case TGSI_TEXTURE_1D_ARRAY:
+ num_coords = 2;
+ dims = 1;
+ break;
case TGSI_TEXTURE_2D:
case TGSI_TEXTURE_RECT:
num_coords = 2;
+ dims = 2;
break;
case TGSI_TEXTURE_SHADOW1D:
case TGSI_TEXTURE_SHADOW1D_ARRAY:
+ num_coords = 3;
+ dims = 1;
+ break;
case TGSI_TEXTURE_SHADOW2D:
case TGSI_TEXTURE_SHADOWRECT:
case TGSI_TEXTURE_2D_ARRAY:
- case TGSI_TEXTURE_3D:
case TGSI_TEXTURE_CUBE:
num_coords = 3;
+ dims = 2;
+ break;
+ case TGSI_TEXTURE_3D:
+ num_coords = 3;
+ dims = 3;
break;
case TGSI_TEXTURE_SHADOW2D_ARRAY:
num_coords = 4;
+ dims = 2;
break;
default:
assert(0);
@@ -1212,31 +1230,66 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
}
if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV) {
- LLVMValueRef index0 = lp_build_const_int32(bld->bld_base.base.gallivm, 0);
- for (i = 0; i < num_coords; i++) {
- LLVMValueRef src1 = lp_build_emit_fetch( &bld->bld_base, inst, 1, i );
- LLVMValueRef src2 = lp_build_emit_fetch( &bld->bld_base, inst, 2, i );
- ddx[i] = LLVMBuildExtractElement(builder, src1, index0, "");
- ddy[i] = LLVMBuildExtractElement(builder, src2, index0, "");
+ LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
+ LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
+ LLVMValueRef ddxdyonec[3];
+ unsigned length = bld->bld_base.base.type.length;
+ unsigned num_quads = length / 4;
+ unsigned dim;
+ unsigned quad;
+
+ for (dim = 0; dim < dims; ++dim) {
+ LLVMValueRef srcx = lp_build_emit_fetch( &bld->bld_base, inst, 1, dim );
+ LLVMValueRef srcy = lp_build_emit_fetch( &bld->bld_base, inst, 2, dim );
+ for (quad = 0; quad < num_quads; ++quad) {
+ unsigned s1 = 4*quad;
+ unsigned s2 = 4*quad + length;
+ shuffles[4*quad + 0] = lp_build_const_int32(gallivm, s1);
+ shuffles[4*quad + 1] = lp_build_const_int32(gallivm, s2);
+ shuffles[4*quad + 2] = i32undef;
+ shuffles[4*quad + 3] = i32undef;
+ }
+ ddxdyonec[dim] = LLVMBuildShuffleVector(builder, srcx, srcy,
+ LLVMConstVector(shuffles, length), "");
+ }
+ if (dims == 1) {
+ derivs.ddx_ddy[0] = ddxdyonec[0];
+ }
+ else if (dims >= 2) {
+ for (quad = 0; quad < num_quads; ++quad) {
+ unsigned s1 = 4*quad;
+ unsigned s2 = 4*quad + length;
+ shuffles[4*quad + 0] = lp_build_const_int32(gallivm, s1);
+ shuffles[4*quad + 1] = lp_build_const_int32(gallivm, s1 + 1);
+ shuffles[4*quad + 2] = lp_build_const_int32(gallivm, s2);
+ shuffles[4*quad + 3] = lp_build_const_int32(gallivm, s2 + 1);
+ }
+ derivs.ddx_ddy[0] = LLVMBuildShuffleVector(builder, ddxdyonec[0], ddxdyonec[1],
+ LLVMConstVector(shuffles, length), "");
+ if (dims == 3) {
+ derivs.ddx_ddy[1] = ddxdyonec[2];
+ }
}
unit = inst->Src[3].Register.Index;
} else {
- for (i = 0; i < num_coords; i++) {
- ddx[i] = lp_build_scalar_ddx( &bld->bld_base.base, coords[i] );
- ddy[i] = lp_build_scalar_ddy( &bld->bld_base.base, coords[i] );
+ if (dims == 1) {
+ derivs.ddx_ddy[0] = lp_build_packed_ddx_ddy_onecoord(&bld->bld_base.base, coords[0]);
+ }
+ else if (dims >= 2) {
+ derivs.ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(&bld->bld_base.base,
+ coords[0], coords[1]);
+ if (dims == 3) {
+ derivs.ddx_ddy[1] = lp_build_packed_ddx_ddy_onecoord(&bld->bld_base.base, coords[2]);
+ }
}
unit = inst->Src[1].Register.Index;
}
- for (i = num_coords; i < 3; i++) {
- ddx[i] = LLVMGetUndef(bld->bld_base.base.elem_type);
- ddy[i] = LLVMGetUndef(bld->bld_base.base.elem_type);
- }
bld->sampler->emit_fetch_texel(bld->sampler,
bld->bld_base.base.gallivm,
bld->bld_base.base.type,
unit, num_coords, coords,
- ddx, ddy,
+ &derivs,
lod_bias, explicit_lod,
texel);
}
@@ -1310,6 +1363,7 @@ emit_txq( struct lp_build_tgsi_soa_context *bld,
bld->sampler->emit_size_query(bld->sampler,
bld->bld_base.base.gallivm,
+ bld->bld_base.int_bld.type,
inst->Src[1].Register.Index,
explicit_lod,
sizes_out);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_type.c b/src/gallium/auxiliary/gallivm/lp_bld_type.c
index 413e69bedac..6c3aa38bfb1 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_type.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_type.c
@@ -38,6 +38,9 @@ lp_build_elem_type(struct gallivm_state *gallivm, struct lp_type type)
{
if (type.floating) {
switch(type.width) {
+ case 16:
+ return LLVMIntTypeInContext(gallivm->context, 16);
+ break;
case 32:
return LLVMFloatTypeInContext(gallivm->context);
break;
@@ -85,6 +88,10 @@ lp_check_elem_type(struct lp_type type, LLVMTypeRef elem_type)
if (type.floating) {
switch(type.width) {
+ case 16:
+ if(elem_kind != LLVMIntegerTypeKind)
+ return FALSE;
+ break;
case 32:
if(elem_kind != LLVMFloatTypeKind)
return FALSE;
@@ -168,27 +175,6 @@ lp_build_int_vec_type(struct gallivm_state *gallivm, struct lp_type type)
/**
- * Build int32[4] vector type
- */
-LLVMTypeRef
-lp_build_int32_vec4_type(struct gallivm_state *gallivm)
-{
- struct lp_type t;
- LLVMTypeRef type;
-
- memset(&t, 0, sizeof(t));
- t.floating = FALSE; /* floating point values */
- t.sign = TRUE; /* values are signed */
- t.norm = FALSE; /* values are not limited to [0,1] or [-1,1] */
- t.width = 32; /* 32-bit int */
- t.length = 4; /* 4 elements per vector */
-
- type = lp_build_int_elem_type(gallivm, t);
- return LLVMVectorType(type, t.length);
-}
-
-
-/**
* Create element of vector type
*/
struct lp_type
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_type.h b/src/gallium/auxiliary/gallivm/lp_bld_type.h
index f11a190e7cc..75310e05f3e 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_type.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_type.h
@@ -40,21 +40,35 @@
#include "pipe/p_compiler.h"
#include "gallivm/lp_bld.h"
+/**
+ * Native SIMD architecture width available at runtime.
+ *
+ * Using this width should give the best performance,
+ * and it determines the necessary alignment of vector variables.
+ */
+extern unsigned lp_native_vector_width;
+/**
+ * Maximum supported vector width (not necessarily supported at run-time).
+ *
+ * Should only be used when lp_native_vector_width isn't available,
+ * i.e. sizing/alignment of non-malloced variables.
+ */
+#define LP_MAX_VECTOR_WIDTH 256
/**
- * Native SIMD register width.
+ * Minimum vector alignment for static variable alignment
*
- * 128 for all architectures we care about.
+ * It should always be a constant equal to LP_MAX_VECTOR_WIDTH/8. An
+ * expression is non-portable.
*/
-#define LP_NATIVE_VECTOR_WIDTH 128
+#define LP_MIN_VECTOR_ALIGN 32
/**
* Several functions can only cope with vectors of length up to this value.
* You may need to increase that value if you want to represent bigger vectors.
*/
-#define LP_MAX_VECTOR_LENGTH 16
-
+#define LP_MAX_VECTOR_LENGTH (LP_MAX_VECTOR_WIDTH/8)
/**
* The LLVM type system can't conveniently express all the things we care about
@@ -151,6 +165,13 @@ struct lp_build_context
};
+static INLINE unsigned
+lp_type_width(struct lp_type type)
+{
+ return type.width * type.length;
+}
+
+
/** Create scalar float type */
static INLINE struct lp_type
lp_type_float(unsigned width)
@@ -169,7 +190,7 @@ lp_type_float(unsigned width)
/** Create vector of float type */
static INLINE struct lp_type
-lp_type_float_vec(unsigned width)
+lp_type_float_vec(unsigned width, unsigned total_width)
{
struct lp_type res_type;
@@ -177,7 +198,7 @@ lp_type_float_vec(unsigned width)
res_type.floating = TRUE;
res_type.sign = TRUE;
res_type.width = width;
- res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+ res_type.length = total_width / width;
return res_type;
}
@@ -200,14 +221,14 @@ lp_type_int(unsigned width)
/** Create vector int type */
static INLINE struct lp_type
-lp_type_int_vec(unsigned width)
+lp_type_int_vec(unsigned width, unsigned total_width)
{
struct lp_type res_type;
memset(&res_type, 0, sizeof res_type);
res_type.sign = TRUE;
res_type.width = width;
- res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+ res_type.length = total_width / width;
return res_type;
}
@@ -229,34 +250,34 @@ lp_type_uint(unsigned width)
/** Create vector uint type */
static INLINE struct lp_type
-lp_type_uint_vec(unsigned width)
+lp_type_uint_vec(unsigned width, unsigned total_width)
{
struct lp_type res_type;
memset(&res_type, 0, sizeof res_type);
res_type.width = width;
- res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+ res_type.length = total_width / width;
return res_type;
}
static INLINE struct lp_type
-lp_type_unorm(unsigned width)
+lp_type_unorm(unsigned width, unsigned total_width)
{
struct lp_type res_type;
memset(&res_type, 0, sizeof res_type);
res_type.norm = TRUE;
res_type.width = width;
- res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+ res_type.length = total_width / width;
return res_type;
}
static INLINE struct lp_type
-lp_type_fixed(unsigned width)
+lp_type_fixed(unsigned width, unsigned total_width)
{
struct lp_type res_type;
@@ -264,21 +285,21 @@ lp_type_fixed(unsigned width)
res_type.sign = TRUE;
res_type.fixed = TRUE;
res_type.width = width;
- res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+ res_type.length = total_width / width;
return res_type;
}
static INLINE struct lp_type
-lp_type_ufixed(unsigned width)
+lp_type_ufixed(unsigned width, unsigned total_width)
{
struct lp_type res_type;
memset(&res_type, 0, sizeof res_type);
res_type.fixed = TRUE;
res_type.width = width;
- res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+ res_type.length = total_width / width;
return res_type;
}
@@ -312,10 +333,6 @@ LLVMTypeRef
lp_build_int_vec_type(struct gallivm_state *gallivm, struct lp_type type);
-LLVMTypeRef
-lp_build_int32_vec4_type(struct gallivm_state *gallivm);
-
-
static INLINE struct lp_type
lp_float32_vec4_type(void)
{
diff --git a/src/gallium/auxiliary/util/u_cpu_detect.h b/src/gallium/auxiliary/util/u_cpu_detect.h
index 856e8d7a0ef..b44d9d9a0fe 100644
--- a/src/gallium/auxiliary/util/u_cpu_detect.h
+++ b/src/gallium/auxiliary/util/u_cpu_detect.h
@@ -35,9 +35,16 @@
#ifndef _UTIL_CPU_DETECT_H
#define _UTIL_CPU_DETECT_H
+
#include "pipe/p_compiler.h"
#include "pipe/p_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
struct util_cpu_caps {
unsigned nr_cpus;
@@ -66,4 +73,9 @@ util_cpu_caps;
void util_cpu_detect(void);
+#ifdef __cplusplus
+}
+#endif
+
+
#endif /* _UTIL_CPU_DETECT_H */