-rw-r--r--  scons/llvm.py  7
-rw-r--r--  src/gallium/auxiliary/Makefile.sources  2
-rw-r--r--  src/gallium/auxiliary/draw/draw_context.c  31
-rw-r--r--  src/gallium/auxiliary/draw/draw_context.h  4
-rw-r--r--  src/gallium/auxiliary/draw/draw_llvm.c  759
-rw-r--r--  src/gallium/auxiliary/draw/draw_llvm.h  28
-rw-r--r--  src/gallium/auxiliary/draw/draw_llvm_sample.c  7
-rw-r--r--  src/gallium/auxiliary/draw/draw_llvm_translate.c  506
-rw-r--r--  src/gallium/auxiliary/draw/draw_private.h  3
-rw-r--r--  src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c  4
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_arit.c  545
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_arit.h  19
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_const.c  39
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_conv.c  149
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_conv.h  4
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_debug.cpp  22
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_flow.c  9
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_format.h  7
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_format_aos.c  6
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c  102
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_format_soa.c  3
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c  4
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_init.c  488
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_init.h  33
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_intr.c  91
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_intr.h  9
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_logic.c  60
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_logic.h  5
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_misc.cpp  111
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_misc.h  70
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_pack.c  339
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_pack.h  23
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_quad.c  87
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_quad.h  14
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_sample.c  527
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_sample.h  51
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c  1344
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h  8
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c  493
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_swizzle.c  164
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_swizzle.h  24
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_tgsi.h  8
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c  10
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c  92
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_type.c  28
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_type.h  59
-rw-r--r--  src/gallium/auxiliary/util/u_cpu_detect.h  12
-rw-r--r--  src/gallium/drivers/llvmpipe/.gitignore  1
-rw-r--r--  src/gallium/drivers/llvmpipe/Makefile  3
-rw-r--r--  src/gallium/drivers/llvmpipe/SConscript  1
-rw-r--r--  src/gallium/drivers/llvmpipe/lp_bld_depth.c  115
-rw-r--r--  src/gallium/drivers/llvmpipe/lp_bld_interp.c  429
-rw-r--r--  src/gallium/drivers/llvmpipe/lp_bld_interp.h  10
-rw-r--r--  src/gallium/drivers/llvmpipe/lp_context.c  48
-rw-r--r--  src/gallium/drivers/llvmpipe/lp_context.h  4
-rw-r--r--  src/gallium/drivers/llvmpipe/lp_flush.c  7
-rw-r--r--  src/gallium/drivers/llvmpipe/lp_jit.c  8
-rw-r--r--  src/gallium/drivers/llvmpipe/lp_jit.h  5
-rw-r--r--  src/gallium/drivers/llvmpipe/lp_memory.c  6
-rw-r--r--  src/gallium/drivers/llvmpipe/lp_memory.h  7
-rw-r--r--  src/gallium/drivers/llvmpipe/lp_rast.c  1
-rw-r--r--  src/gallium/drivers/llvmpipe/lp_state_fs.c  162
-rw-r--r--  src/gallium/drivers/llvmpipe/lp_state_fs.h  6
-rw-r--r--  src/gallium/drivers/llvmpipe/lp_state_setup.c  487
-rw-r--r--  src/gallium/drivers/llvmpipe/lp_state_setup.h  11
-rw-r--r--  src/gallium/drivers/llvmpipe/lp_test.h  11
-rw-r--r--  src/gallium/drivers/llvmpipe/lp_test_arit.c  166
-rw-r--r--  src/gallium/drivers/llvmpipe/lp_test_blend.c  102
-rw-r--r--  src/gallium/drivers/llvmpipe/lp_test_conv.c  87
-rw-r--r--  src/gallium/drivers/llvmpipe/lp_test_format.c  95
-rw-r--r--  src/gallium/drivers/llvmpipe/lp_test_main.c  17
-rw-r--r--  src/gallium/drivers/llvmpipe/lp_test_printf.c  37
-rw-r--r--  src/gallium/drivers/llvmpipe/lp_test_round.c  242
-rw-r--r--  src/gallium/drivers/llvmpipe/lp_tex_sample.c  9
74 files changed, 5220 insertions, 3267 deletions
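
Note on the draw_llvm.c changes below: the hand-rolled aos_to_soa()/soa_to_aos() per-element shuffles are replaced by lp_build_transpose_aos() over vectors that are lp_native_vector_width / 32 elements wide, so the vertex path is no longer hard-wired to 4 vertices per iteration. As a plain-C illustration of the data-layout transform the generated code performs (this is not Mesa code; the names and the 8-wide example size are made up for illustration), an AoS-to-SoA transpose of 4-component vertex attributes looks like this:

    #include <stdio.h>

    #define NUM_CHANNELS 4   /* x, y, z, w (illustrative) */
    #define NUM_VERTS    8   /* e.g. 256-bit / 32-bit = 8 lanes (illustrative) */

    /* AoS: v0.x v0.y v0.z v0.w v1.x ...  ->  SoA: x0..x7, y0..y7, z0..z7, w0..w7 */
    static void
    aos_to_soa_example(const float aos[NUM_VERTS][NUM_CHANNELS],
                       float soa[NUM_CHANNELS][NUM_VERTS])
    {
       for (int chan = 0; chan < NUM_CHANNELS; ++chan)
          for (int vert = 0; vert < NUM_VERTS; ++vert)
             soa[chan][vert] = aos[vert][chan];
    }

    int
    main(void)
    {
       float aos[NUM_VERTS][NUM_CHANNELS];
       float soa[NUM_CHANNELS][NUM_VERTS];

       for (int v = 0; v < NUM_VERTS; ++v)
          for (int c = 0; c < NUM_CHANNELS; ++c)
             aos[v][c] = v + c * 0.1f;     /* fill with recognizable values */

       aos_to_soa_example(aos, soa);

       /* soa[1] now holds the y components of all eight vertices */
       for (int v = 0; v < NUM_VERTS; ++v)
          printf("y%d = %.1f\n", v, soa[1][v]);
       return 0;
    }

The SoA layout is what lp_build_tgsi_soa() consumes, which is why convert_to_soa() and convert_to_aos() in the patch transpose at the fetch and store boundaries rather than anywhere inside the shader body.
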
diff --git a/scons/llvm.py b/scons/llvm.py
index 8222c5b45b1..f87766af190 100644
--- a/scons/llvm.py
+++ b/scons/llvm.py
@@ -178,7 +178,12 @@ def generate(env):
pass
env.MergeFlags(cppflags)
- env.ParseConfig('llvm-config --libs engine bitwriter')
+ components = ['engine', 'bitwriter', 'x86asmprinter']
+
+ if llvm_version >= distutils.version.LooseVersion('3.1'):
+ components.append('mcjit')
+
+ env.ParseConfig('llvm-config --libs ' + ' '.join(components))
env.ParseConfig('llvm-config --ldflags')
except OSError:
print 'scons: llvm-config version %s failed' % llvm_version
diff --git a/src/gallium/auxiliary/Makefile.sources b/src/gallium/auxiliary/Makefile.sources
index 28a176d68fa..2807c780d2d 100644
--- a/src/gallium/auxiliary/Makefile.sources
+++ b/src/gallium/auxiliary/Makefile.sources
@@ -165,6 +165,7 @@ GALLIVM_SOURCES := \
gallivm/lp_bld_conv.c \
gallivm/lp_bld_flow.c \
gallivm/lp_bld_format_aos.c \
+ gallivm/lp_bld_format_aos_array.c \
gallivm/lp_bld_format_soa.c \
gallivm/lp_bld_format_yuv.c \
gallivm/lp_bld_gather.c \
@@ -187,7 +188,6 @@ GALLIVM_SOURCES := \
gallivm/lp_bld_type.c \
draw/draw_llvm.c \
draw/draw_llvm_sample.c \
- draw/draw_llvm_translate.c \
draw/draw_vs_llvm.c \
draw/draw_pt_fetch_shade_pipeline_llvm.c
diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c
index 20260c1abbf..be30b7db245 100644
--- a/src/gallium/auxiliary/draw/draw_context.c
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -70,8 +70,7 @@ draw_get_option_use_llvm(void)
* Create new draw module context with gallivm state for LLVM JIT.
*/
static struct draw_context *
-draw_create_context(struct pipe_context *pipe, boolean try_llvm,
- struct gallivm_state *gallivm)
+draw_create_context(struct pipe_context *pipe, boolean try_llvm)
{
struct draw_context *draw = CALLOC_STRUCT( draw_context );
if (draw == NULL)
@@ -79,16 +78,7 @@ draw_create_context(struct pipe_context *pipe, boolean try_llvm,
#if HAVE_LLVM
if (try_llvm && draw_get_option_use_llvm()) {
- if (!gallivm) {
- gallivm = gallivm_create();
- draw->own_gallivm = gallivm;
- }
-
- if (!gallivm)
- goto err_destroy;
-
- draw->llvm = draw_llvm_create(draw, gallivm);
-
+ draw->llvm = draw_llvm_create(draw);
if (!draw->llvm)
goto err_destroy;
}
@@ -114,7 +104,7 @@ err_out:
struct draw_context *
draw_create(struct pipe_context *pipe)
{
- return draw_create_context(pipe, TRUE, NULL);
+ return draw_create_context(pipe, TRUE);
}
@@ -124,17 +114,7 @@ draw_create(struct pipe_context *pipe)
struct draw_context *
draw_create_no_llvm(struct pipe_context *pipe)
{
- return draw_create_context(pipe, FALSE, NULL);
-}
-
-
-/**
- * Create new draw module context with gallivm state for LLVM JIT.
- */
-struct draw_context *
-draw_create_gallivm(struct pipe_context *pipe, struct gallivm_state *gallivm)
-{
- return draw_create_context(pipe, TRUE, gallivm);
+ return draw_create_context(pipe, FALSE);
}
@@ -213,9 +193,6 @@ void draw_destroy( struct draw_context *draw )
#ifdef HAVE_LLVM
if (draw->llvm)
draw_llvm_destroy( draw->llvm );
-
- if (draw->own_gallivm)
- gallivm_destroy(draw->own_gallivm);
#endif
FREE( draw );
diff --git a/src/gallium/auxiliary/draw/draw_context.h b/src/gallium/auxiliary/draw/draw_context.h
index 852cbc3da13..cc95600c530 100644
--- a/src/gallium/auxiliary/draw/draw_context.h
+++ b/src/gallium/auxiliary/draw/draw_context.h
@@ -48,7 +48,6 @@ struct draw_vertex_shader;
struct draw_geometry_shader;
struct draw_fragment_shader;
struct tgsi_sampler;
-struct gallivm_state;
/*
* structure to contain driver internal information
@@ -67,9 +66,6 @@ struct draw_context *draw_create( struct pipe_context *pipe );
struct draw_context *draw_create_no_llvm(struct pipe_context *pipe);
-struct draw_context *
-draw_create_gallivm(struct pipe_context *pipe, struct gallivm_state *gallivm);
-
void draw_destroy( struct draw_context *draw );
void draw_flush(struct draw_context *draw);
diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c
index e08221eb392..8d9b5309aff 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -43,6 +43,8 @@
#include "gallivm/lp_bld_intr.h"
#include "gallivm/lp_bld_init.h"
#include "gallivm/lp_bld_type.h"
+#include "gallivm/lp_bld_pack.h"
+#include "gallivm/lp_bld_format.h"
#include "tgsi/tgsi_exec.h"
#include "tgsi/tgsi_dump.h"
@@ -56,40 +58,6 @@
#define DEBUG_STORE 0
-/**
- * This function is called by the gallivm "garbage collector" when
- * the LLVM global data structures are freed. We must free all LLVM-related
- * data. Specifically, all JIT'd shader variants.
- */
-static void
-draw_llvm_garbage_collect_callback(void *cb_data)
-{
- struct draw_llvm *llvm = (struct draw_llvm *) cb_data;
- struct draw_context *draw = llvm->draw;
- struct draw_llvm_variant_list_item *li;
-
- /* Ensure prepare will be run and shaders recompiled */
- assert(!draw->suspend_flushing);
- draw_do_flush(draw, DRAW_FLUSH_STATE_CHANGE);
-
- /* free all shader variants */
- li = first_elem(&llvm->vs_variants_list);
- while (!at_end(&llvm->vs_variants_list, li)) {
- struct draw_llvm_variant_list_item *next = next_elem(li);
- draw_llvm_destroy_variant(li->base);
- li = next;
- }
-
- /* Null-out these pointers so they get remade next time they're needed.
- * See the accessor functions below.
- */
- llvm->context_ptr_type = NULL;
- llvm->buffer_ptr_type = NULL;
- llvm->vb_ptr_type = NULL;
- llvm->vertex_header_ptr_type = NULL;
-}
-
-
static void
draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *var,
boolean elts);
@@ -316,56 +284,56 @@ create_jit_vertex_header(struct gallivm_state *gallivm, int data_elems)
* Create LLVM types for various structures.
*/
static void
-create_jit_types(struct draw_llvm *llvm)
+create_jit_types(struct draw_llvm_variant *variant)
{
- struct gallivm_state *gallivm = llvm->gallivm;
+ struct gallivm_state *gallivm = variant->gallivm;
LLVMTypeRef texture_type, context_type, buffer_type, vb_type;
texture_type = create_jit_texture_type(gallivm, "texture");
context_type = create_jit_context_type(gallivm, texture_type, "draw_jit_context");
- llvm->context_ptr_type = LLVMPointerType(context_type, 0);
+ variant->context_ptr_type = LLVMPointerType(context_type, 0);
buffer_type = LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 8), 0);
- llvm->buffer_ptr_type = LLVMPointerType(buffer_type, 0);
+ variant->buffer_ptr_type = LLVMPointerType(buffer_type, 0);
vb_type = create_jit_vertex_buffer_type(gallivm, "pipe_vertex_buffer");
- llvm->vb_ptr_type = LLVMPointerType(vb_type, 0);
+ variant->vb_ptr_type = LLVMPointerType(vb_type, 0);
}
static LLVMTypeRef
-get_context_ptr_type(struct draw_llvm *llvm)
+get_context_ptr_type(struct draw_llvm_variant *variant)
{
- if (!llvm->context_ptr_type)
- create_jit_types(llvm);
- return llvm->context_ptr_type;
+ if (!variant->context_ptr_type)
+ create_jit_types(variant);
+ return variant->context_ptr_type;
}
static LLVMTypeRef
-get_buffer_ptr_type(struct draw_llvm *llvm)
+get_buffer_ptr_type(struct draw_llvm_variant *variant)
{
- if (!llvm->buffer_ptr_type)
- create_jit_types(llvm);
- return llvm->buffer_ptr_type;
+ if (!variant->buffer_ptr_type)
+ create_jit_types(variant);
+ return variant->buffer_ptr_type;
}
static LLVMTypeRef
-get_vb_ptr_type(struct draw_llvm *llvm)
+get_vb_ptr_type(struct draw_llvm_variant *variant)
{
- if (!llvm->vb_ptr_type)
- create_jit_types(llvm);
- return llvm->vb_ptr_type;
+ if (!variant->vb_ptr_type)
+ create_jit_types(variant);
+ return variant->vb_ptr_type;
}
static LLVMTypeRef
-get_vertex_header_ptr_type(struct draw_llvm *llvm)
+get_vertex_header_ptr_type(struct draw_llvm_variant *variant)
{
- if (!llvm->vertex_header_ptr_type)
- create_jit_types(llvm);
- return llvm->vertex_header_ptr_type;
+ if (!variant->vertex_header_ptr_type)
+ create_jit_types(variant);
+ return variant->vertex_header_ptr_type;
}
@@ -373,7 +341,7 @@ get_vertex_header_ptr_type(struct draw_llvm *llvm)
* Create per-context LLVM info.
*/
struct draw_llvm *
-draw_llvm_create(struct draw_context *draw, struct gallivm_state *gallivm)
+draw_llvm_create(struct draw_context *draw)
{
struct draw_llvm *llvm;
@@ -384,18 +352,10 @@ draw_llvm_create(struct draw_context *draw, struct gallivm_state *gallivm)
lp_build_init();
llvm->draw = draw;
- llvm->gallivm = gallivm;
-
- if (gallivm_debug & GALLIVM_DEBUG_IR) {
- LLVMDumpModule(llvm->gallivm->module);
- }
llvm->nr_variants = 0;
make_empty_list(&llvm->vs_variants_list);
- gallivm_register_garbage_collector_callback(
- draw_llvm_garbage_collect_callback, llvm);
-
return llvm;
}
@@ -406,9 +366,6 @@ draw_llvm_create(struct draw_context *draw, struct gallivm_state *gallivm)
void
draw_llvm_destroy(struct draw_llvm *llvm)
{
- gallivm_remove_garbage_collector_callback(
- draw_llvm_garbage_collect_callback, llvm);
-
/* XXX free other draw_llvm data? */
FREE(llvm);
}
@@ -435,15 +392,27 @@ draw_llvm_create_variant(struct draw_llvm *llvm,
variant->llvm = llvm;
+ variant->gallivm = gallivm_create();
+
+ create_jit_types(variant);
+
memcpy(&variant->key, key, shader->variant_key_size);
- vertex_header = create_jit_vertex_header(llvm->gallivm, num_inputs);
+ vertex_header = create_jit_vertex_header(variant->gallivm, num_inputs);
- llvm->vertex_header_ptr_type = LLVMPointerType(vertex_header, 0);
+ variant->vertex_header_ptr_type = LLVMPointerType(vertex_header, 0);
draw_llvm_generate(llvm, variant, FALSE); /* linear */
draw_llvm_generate(llvm, variant, TRUE); /* elts */
+ gallivm_compile_module(variant->gallivm);
+
+ variant->jit_func = (draw_jit_vert_func)
+ gallivm_jit_function(variant->gallivm, variant->function);
+
+ variant->jit_func_elts = (draw_jit_vert_func_elts)
+ gallivm_jit_function(variant->gallivm, variant->function_elts);
+
variant->shader = shader;
variant->list_item_global.base = variant;
variant->list_item_local.base = variant;
@@ -455,8 +424,9 @@ draw_llvm_create_variant(struct draw_llvm *llvm,
static void
-generate_vs(struct draw_llvm *llvm,
+generate_vs(struct draw_llvm_variant *variant,
LLVMBuilderRef builder,
+ struct lp_type vs_type,
LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS],
const LLVMValueRef (*inputs)[TGSI_NUM_CHANNELS],
const struct lp_bld_tgsi_system_values *system_values,
@@ -464,21 +434,11 @@ generate_vs(struct draw_llvm *llvm,
struct lp_build_sampler_soa *draw_sampler,
boolean clamp_vertex_color)
{
+ struct draw_llvm *llvm = variant->llvm;
const struct tgsi_token *tokens = llvm->draw->vs.vertex_shader->state.tokens;
- struct lp_type vs_type;
- LLVMValueRef consts_ptr = draw_jit_context_vs_constants(llvm->gallivm, context_ptr);
+ LLVMValueRef consts_ptr = draw_jit_context_vs_constants(variant->gallivm, context_ptr);
struct lp_build_sampler_soa *sampler = 0;
- memset(&vs_type, 0, sizeof vs_type);
- vs_type.floating = TRUE; /* floating point values */
- vs_type.sign = TRUE; /* values are signed */
- vs_type.norm = FALSE; /* values are not limited to [0,1] or [-1,1] */
- vs_type.width = 32; /* 32-bit float */
- vs_type.length = 4; /* 4 elements per vector */
-#if 0
- num_vs = 4; /* number of vertices per block */
-#endif
-
if (gallivm_debug & GALLIVM_DEBUG_IR) {
tgsi_dump(tokens, 0);
}
@@ -486,7 +446,7 @@ generate_vs(struct draw_llvm *llvm,
if (llvm->draw->num_sampler_views && llvm->draw->num_samplers)
sampler = draw_sampler;
- lp_build_tgsi_soa(llvm->gallivm,
+ lp_build_tgsi_soa(variant->gallivm,
tokens,
vs_type,
NULL /*struct lp_build_mask_context *mask*/,
@@ -503,7 +463,7 @@ generate_vs(struct draw_llvm *llvm,
unsigned chan, attrib;
struct lp_build_context bld;
struct tgsi_shader_info* info = &llvm->draw->vs.vertex_shader->info;
- lp_build_context_init(&bld, llvm->gallivm, vs_type);
+ lp_build_context_init(&bld, variant->gallivm, vs_type);
for (attrib = 0; attrib < info->num_outputs; ++attrib) {
for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
@@ -531,25 +491,6 @@ generate_vs(struct draw_llvm *llvm,
}
-#if DEBUG_STORE
-static void print_vectorf(LLVMBuilderRef builder,
- LLVMValueRef vec)
-{
- LLVMValueRef val[4];
- val[0] = LLVMBuildExtractElement(builder, vec,
- lp_build_const_int32(gallivm, 0), "");
- val[1] = LLVMBuildExtractElement(builder, vec,
- lp_build_const_int32(gallivm, 1), "");
- val[2] = LLVMBuildExtractElement(builder, vec,
- lp_build_const_int32(gallivm, 2), "");
- val[3] = LLVMBuildExtractElement(builder, vec,
- lp_build_const_int32(gallivm, 3), "");
- lp_build_printf(builder, "vector = [%f, %f, %f, %f]\n",
- val[0], val[1], val[2], val[3]);
-}
-#endif
-
-
static void
generate_fetch(struct gallivm_state *gallivm,
LLVMValueRef vbuffers_ptr,
@@ -559,6 +500,8 @@ generate_fetch(struct gallivm_state *gallivm,
LLVMValueRef index,
LLVMValueRef instance_id)
{
+ const struct util_format_description *format_desc = util_format_description(velem->src_format);
+ LLVMValueRef zero = LLVMConstNull(LLVMInt32TypeInContext(gallivm->context));
LLVMBuilderRef builder = gallivm->builder;
LLVMValueRef indices =
LLVMConstInt(LLVMInt64TypeInContext(gallivm->context),
@@ -587,118 +530,47 @@ generate_fetch(struct gallivm_state *gallivm,
lp_build_const_int32(gallivm, velem->src_offset),
"");
- /*lp_build_printf(builder, "vbuf index = %d, stride is %d\n", indices, stride);*/
+/* lp_build_printf(gallivm, "vbuf index = %d, stride is %d\n", indices, stride);*/
vbuffer_ptr = LLVMBuildGEP(builder, vbuffer_ptr, &stride, 1, "");
- *res = draw_llvm_translate_from(gallivm, vbuffer_ptr, velem->src_format);
-}
-
-
-static LLVMValueRef
-aos_to_soa(struct gallivm_state *gallivm,
- LLVMValueRef val0,
- LLVMValueRef val1,
- LLVMValueRef val2,
- LLVMValueRef val3,
- LLVMValueRef channel)
-{
- LLVMBuilderRef builder = gallivm->builder;
- LLVMValueRef ex, res;
-
- ex = LLVMBuildExtractElement(builder, val0,
- channel, "");
- res = LLVMBuildInsertElement(builder,
- LLVMConstNull(LLVMTypeOf(val0)),
- ex,
- lp_build_const_int32(gallivm, 0),
- "");
-
- ex = LLVMBuildExtractElement(builder, val1,
- channel, "");
- res = LLVMBuildInsertElement(builder,
- res, ex,
- lp_build_const_int32(gallivm, 1),
- "");
-
- ex = LLVMBuildExtractElement(builder, val2,
- channel, "");
- res = LLVMBuildInsertElement(builder,
- res, ex,
- lp_build_const_int32(gallivm, 2),
- "");
-
- ex = LLVMBuildExtractElement(builder, val3,
- channel, "");
- res = LLVMBuildInsertElement(builder,
- res, ex,
- lp_build_const_int32(gallivm, 3),
- "");
-
- return res;
+ *res = lp_build_fetch_rgba_aos(gallivm,
+ format_desc,
+ lp_float32_vec4_type(),
+ vbuffer_ptr,
+ zero, zero, zero);
}
-
static void
-soa_to_aos(struct gallivm_state *gallivm,
- LLVMValueRef soa[TGSI_NUM_CHANNELS],
- LLVMValueRef aos[TGSI_NUM_CHANNELS])
+convert_to_soa(struct gallivm_state *gallivm,
+ LLVMValueRef (*src_aos)[LP_MAX_VECTOR_WIDTH / 32],
+ LLVMValueRef (*dst_soa)[TGSI_NUM_CHANNELS],
+ unsigned num_attribs, const struct lp_type soa_type)
{
- LLVMBuilderRef builder = gallivm->builder;
- LLVMValueRef comp;
- int i = 0;
+ unsigned i, j, k;
+ struct lp_type aos_channel_type = soa_type;
debug_assert(TGSI_NUM_CHANNELS == 4);
+ debug_assert((soa_type.length % TGSI_NUM_CHANNELS) == 0);
- aos[0] = LLVMConstNull(LLVMTypeOf(soa[0]));
- aos[1] = aos[2] = aos[3] = aos[0];
-
- for (i = 0; i < TGSI_NUM_CHANNELS; ++i) {
- LLVMValueRef channel = lp_build_const_int32(gallivm, i);
-
- comp = LLVMBuildExtractElement(builder, soa[i],
- lp_build_const_int32(gallivm, 0), "");
- aos[0] = LLVMBuildInsertElement(builder, aos[0], comp, channel, "");
-
- comp = LLVMBuildExtractElement(builder, soa[i],
- lp_build_const_int32(gallivm, 1), "");
- aos[1] = LLVMBuildInsertElement(builder, aos[1], comp, channel, "");
+ aos_channel_type.length >>= 1;
- comp = LLVMBuildExtractElement(builder, soa[i],
- lp_build_const_int32(gallivm, 2), "");
- aos[2] = LLVMBuildInsertElement(builder, aos[2], comp, channel, "");
-
- comp = LLVMBuildExtractElement(builder, soa[i],
- lp_build_const_int32(gallivm, 3), "");
- aos[3] = LLVMBuildInsertElement(builder, aos[3], comp, channel, "");
+ for (i = 0; i < num_attribs; ++i) {
+ LLVMValueRef aos_channels[TGSI_NUM_CHANNELS];
+ unsigned pixels_per_channel = soa_type.length / TGSI_NUM_CHANNELS;
- }
-}
+ for (j = 0; j < TGSI_NUM_CHANNELS; ++j) {
+ LLVMValueRef channel[LP_MAX_VECTOR_LENGTH];
+ assert(pixels_per_channel <= LP_MAX_VECTOR_LENGTH);
-static void
-convert_to_soa(struct gallivm_state *gallivm,
- LLVMValueRef (*aos)[TGSI_NUM_CHANNELS],
- LLVMValueRef (*soa)[TGSI_NUM_CHANNELS],
- int num_attribs)
-{
- int i;
+ for (k = 0; k < pixels_per_channel; ++k) {
+ channel[k] = src_aos[i][j + TGSI_NUM_CHANNELS * k];
+ }
- debug_assert(TGSI_NUM_CHANNELS == 4);
+ aos_channels[j] = lp_build_concat(gallivm, channel, aos_channel_type, pixels_per_channel);
+ }
- for (i = 0; i < num_attribs; ++i) {
- LLVMValueRef val0 = aos[i][0];
- LLVMValueRef val1 = aos[i][1];
- LLVMValueRef val2 = aos[i][2];
- LLVMValueRef val3 = aos[i][3];
-
- soa[i][0] = aos_to_soa(gallivm, val0, val1, val2, val3,
- lp_build_const_int32(gallivm, 0));
- soa[i][1] = aos_to_soa(gallivm, val0, val1, val2, val3,
- lp_build_const_int32(gallivm, 1));
- soa[i][2] = aos_to_soa(gallivm, val0, val1, val2, val3,
- lp_build_const_int32(gallivm, 2));
- soa[i][3] = aos_to_soa(gallivm, val0, val1, val2, val3,
- lp_build_const_int32(gallivm, 3));
+ lp_build_transpose_aos(gallivm, soa_type, aos_channels, dst_soa[i]);
}
}
@@ -707,89 +579,34 @@ static void
store_aos(struct gallivm_state *gallivm,
LLVMValueRef io_ptr,
LLVMValueRef index,
- LLVMValueRef value,
- LLVMValueRef clipmask, boolean have_clipdist)
+ LLVMValueRef value)
{
+ LLVMTypeRef data_ptr_type = LLVMPointerType(lp_build_vec_type(gallivm, lp_float32_vec4_type()), 0);
LLVMBuilderRef builder = gallivm->builder;
- LLVMValueRef id_ptr = draw_jit_header_id(gallivm, io_ptr);
LLVMValueRef data_ptr = draw_jit_header_data(gallivm, io_ptr);
LLVMValueRef indices[3];
- LLVMValueRef val;
- int vertex_id_pad_edgeflag;
indices[0] = lp_build_const_int32(gallivm, 0);
indices[1] = index;
indices[2] = lp_build_const_int32(gallivm, 0);
- /* If this assertion fails, it means we need to update the bit twidding
- * code here. See struct vertex_header in draw_private.h.
- */
- assert(DRAW_TOTAL_CLIP_PLANES==14);
- /* initialize vertex id:16 = 0xffff, have_clipdist:1 = 0, edgeflag:1 = 1 */
- vertex_id_pad_edgeflag = (0xffff << 16) | (1 << DRAW_TOTAL_CLIP_PLANES);
- if (have_clipdist)
- vertex_id_pad_edgeflag |= 1 << (DRAW_TOTAL_CLIP_PLANES+1);
- val = lp_build_const_int32(gallivm, vertex_id_pad_edgeflag);
- /* OR with the clipmask */
- val = LLVMBuildOr(builder, val, clipmask, "");
-
- /* store vertex header */
- LLVMBuildStore(builder, val, id_ptr);
-
-
#if DEBUG_STORE
- lp_build_printf(builder, " ---- %p storing attribute %d (io = %p)\n", data_ptr, index, io_ptr);
-#endif
-#if 0
- /*lp_build_printf(builder, " ---- %p storing at %d (%p) ", io_ptr, index, data_ptr);
- print_vectorf(builder, value);*/
- data_ptr = LLVMBuildBitCast(builder, data_ptr,
- LLVMPointerType(LLVMArrayType(LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4), 0), 0),
- "datavec");
- data_ptr = LLVMBuildGEP(builder, data_ptr, indices, 2, "");
-
- LLVMBuildStore(builder, value, data_ptr);
-#else
- {
- LLVMValueRef x, y, z, w;
- LLVMValueRef idx0, idx1, idx2, idx3;
- LLVMValueRef gep0, gep1, gep2, gep3;
- data_ptr = LLVMBuildGEP(builder, data_ptr, indices, 3, "");
-
- idx0 = lp_build_const_int32(gallivm, 0);
- idx1 = lp_build_const_int32(gallivm, 1);
- idx2 = lp_build_const_int32(gallivm, 2);
- idx3 = lp_build_const_int32(gallivm, 3);
-
- x = LLVMBuildExtractElement(builder, value,
- idx0, "");
- y = LLVMBuildExtractElement(builder, value,
- idx1, "");
- z = LLVMBuildExtractElement(builder, value,
- idx2, "");
- w = LLVMBuildExtractElement(builder, value,
- idx3, "");
-
- gep0 = LLVMBuildGEP(builder, data_ptr, &idx0, 1, "");
- gep1 = LLVMBuildGEP(builder, data_ptr, &idx1, 1, "");
- gep2 = LLVMBuildGEP(builder, data_ptr, &idx2, 1, "");
- gep3 = LLVMBuildGEP(builder, data_ptr, &idx3, 1, "");
-
- /*lp_build_printf(builder, "##### x = %f (%p), y = %f (%p), z = %f (%p), w = %f (%p)\n",
- x, gep0, y, gep1, z, gep2, w, gep3);*/
- LLVMBuildStore(builder, x, gep0);
- LLVMBuildStore(builder, y, gep1);
- LLVMBuildStore(builder, z, gep2);
- LLVMBuildStore(builder, w, gep3);
- }
+ lp_build_printf(gallivm, " ---- %p storing attribute %d (io = %p)\n", data_ptr, index, io_ptr);
#endif
+
+ data_ptr = LLVMBuildGEP(builder, data_ptr, indices, 3, "");
+ data_ptr = LLVMBuildPointerCast(builder, data_ptr, data_ptr_type, "");
+
+ /* Unaligned store due to the vertex header */
+ lp_set_store_alignment(LLVMBuildStore(builder, value, data_ptr), sizeof(float));
}
static void
store_aos_array(struct gallivm_state *gallivm,
+ struct lp_type soa_type,
LLVMValueRef io_ptr,
- LLVMValueRef aos[TGSI_NUM_CHANNELS],
+ LLVMValueRef* aos,
int attrib,
int num_outputs,
LLVMValueRef clipmask,
@@ -797,42 +614,49 @@ store_aos_array(struct gallivm_state *gallivm,
{
LLVMBuilderRef builder = gallivm->builder;
LLVMValueRef attr_index = lp_build_const_int32(gallivm, attrib);
- LLVMValueRef ind0 = lp_build_const_int32(gallivm, 0);
- LLVMValueRef ind1 = lp_build_const_int32(gallivm, 1);
- LLVMValueRef ind2 = lp_build_const_int32(gallivm, 2);
- LLVMValueRef ind3 = lp_build_const_int32(gallivm, 3);
- LLVMValueRef io0_ptr, io1_ptr, io2_ptr, io3_ptr;
- LLVMValueRef clipmask0, clipmask1, clipmask2, clipmask3;
+ LLVMValueRef inds[LP_MAX_VECTOR_WIDTH / 32];
+ LLVMValueRef io_ptrs[LP_MAX_VECTOR_WIDTH / 32];
+ int vector_length = soa_type.length;
+ int i;
debug_assert(TGSI_NUM_CHANNELS == 4);
- io0_ptr = LLVMBuildGEP(builder, io_ptr,
- &ind0, 1, "");
- io1_ptr = LLVMBuildGEP(builder, io_ptr,
- &ind1, 1, "");
- io2_ptr = LLVMBuildGEP(builder, io_ptr,
- &ind2, 1, "");
- io3_ptr = LLVMBuildGEP(builder, io_ptr,
- &ind3, 1, "");
-
- clipmask0 = LLVMBuildExtractElement(builder, clipmask,
- ind0, "");
- clipmask1 = LLVMBuildExtractElement(builder, clipmask,
- ind1, "");
- clipmask2 = LLVMBuildExtractElement(builder, clipmask,
- ind2, "");
- clipmask3 = LLVMBuildExtractElement(builder, clipmask,
- ind3, "");
+ for (i = 0; i < vector_length; i++) {
+ inds[i] = lp_build_const_int32(gallivm, i);
+ io_ptrs[i] = LLVMBuildGEP(builder, io_ptr, &inds[i], 1, "");
+ }
+ if (attrib == 0) {
+ /* store vertex header for each of the n vertices */
+ LLVMValueRef val, cliptmp;
+ int vertex_id_pad_edgeflag;
+
+ /* If this assertion fails, it means we need to update the bit twidding
+ * code here. See struct vertex_header in draw_private.h.
+ */
+ assert(DRAW_TOTAL_CLIP_PLANES==14);
+ /* initialize vertex id:16 = 0xffff, have_clipdist:1 = 0, edgeflag:1 = 1 */
+ vertex_id_pad_edgeflag = (0xffff << 16) | (1 << DRAW_TOTAL_CLIP_PLANES);
+ if (have_clipdist)
+ vertex_id_pad_edgeflag |= 1 << (DRAW_TOTAL_CLIP_PLANES+1);
+ val = lp_build_const_int_vec(gallivm, lp_int_type(soa_type), vertex_id_pad_edgeflag);
+ /* OR with the clipmask */
+ cliptmp = LLVMBuildOr(builder, val, clipmask, "");
+ for (i = 0; i < vector_length; i++) {
+ LLVMValueRef id_ptr = draw_jit_header_id(gallivm, io_ptrs[i]);
+ val = LLVMBuildExtractElement(builder, cliptmp, inds[i], "");
+ LLVMBuildStore(builder, val, id_ptr);
#if DEBUG_STORE
- lp_build_printf(builder, "io = %p, indexes[%d, %d, %d, %d]\n, clipmask0 = %x, clipmask1 = %x, clipmask2 = %x, clipmask3 = %x\n",
- io_ptr, ind0, ind1, ind2, ind3, clipmask0, clipmask1, clipmask2, clipmask3);
+ lp_build_printf(gallivm, "io = %p, index %d\n, clipmask = %x\n",
+ io_ptrs[i], inds[i], val);
#endif
- /* store for each of the 4 vertices */
- store_aos(gallivm, io0_ptr, attr_index, aos[0], clipmask0, have_clipdist);
- store_aos(gallivm, io1_ptr, attr_index, aos[1], clipmask1, have_clipdist);
- store_aos(gallivm, io2_ptr, attr_index, aos[2], clipmask2, have_clipdist);
- store_aos(gallivm, io3_ptr, attr_index, aos[3], clipmask3, have_clipdist);
+ }
+ }
+
+ /* store for each of the n vertices */
+ for (i = 0; i < vector_length; i++) {
+ store_aos(gallivm, io_ptrs[i], attr_index, aos[i]);
+ }
}
@@ -842,33 +666,53 @@ convert_to_aos(struct gallivm_state *gallivm,
LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS],
LLVMValueRef clipmask,
int num_outputs,
- int max_vertices, boolean have_clipdist)
+ struct lp_type soa_type,
+ boolean have_clipdist)
{
LLVMBuilderRef builder = gallivm->builder;
- unsigned chan, attrib;
+ unsigned chan, attrib, i;
#if DEBUG_STORE
- lp_build_printf(builder, " # storing begin\n");
+ lp_build_printf(gallivm, " # storing begin\n");
#endif
for (attrib = 0; attrib < num_outputs; ++attrib) {
- LLVMValueRef soa[4];
- LLVMValueRef aos[4];
+ LLVMValueRef soa[TGSI_NUM_CHANNELS];
+ LLVMValueRef aos[LP_MAX_VECTOR_WIDTH / 32];
for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
if (outputs[attrib][chan]) {
LLVMValueRef out = LLVMBuildLoad(builder, outputs[attrib][chan], "");
lp_build_name(out, "output%u.%c", attrib, "xyzw"[chan]);
- /*lp_build_printf(builder, "output %d : %d ",
- LLVMConstInt(LLVMInt32Type(), attrib, 0),
- LLVMConstInt(LLVMInt32Type(), chan, 0));
- print_vectorf(builder, out);*/
+#if DEBUG_STORE
+ lp_build_printf(gallivm, "output %d : %d ",
+ LLVMConstInt(LLVMInt32TypeInContext(gallivm->context),
+ attrib, 0),
+ LLVMConstInt(LLVMInt32TypeInContext(gallivm->context),
+ chan, 0));
+ lp_build_print_value(gallivm, "val = ", out);
+#endif
soa[chan] = out;
}
else {
soa[chan] = 0;
}
}
- soa_to_aos(gallivm, soa, aos);
+
+
+ if (soa_type.length == TGSI_NUM_CHANNELS) {
+ lp_build_transpose_aos(gallivm, soa_type, soa, aos);
+ } else {
+ lp_build_transpose_aos(gallivm, soa_type, soa, soa);
+
+ for (i = 0; i < soa_type.length; ++i) {
+ aos[i] = lp_build_extract_range(gallivm,
+ soa[i % TGSI_NUM_CHANNELS],
+ (i / TGSI_NUM_CHANNELS) * TGSI_NUM_CHANNELS,
+ TGSI_NUM_CHANNELS);
+ }
+ }
+
store_aos_array(gallivm,
+ soa_type,
io,
aos,
attrib,
@@ -876,104 +720,71 @@ convert_to_aos(struct gallivm_state *gallivm,
clipmask, have_clipdist);
}
#if DEBUG_STORE
- lp_build_printf(builder, " # storing end\n");
+ lp_build_printf(gallivm, " # storing end\n");
#endif
}
/**
* Stores original vertex positions in clip coordinates
- * There is probably a more efficient way to do this, 4 floats at once
- * rather than extracting each element one by one.
- * idx is the output to store things too, if pre_clip_pos is set
- * we store the pos to the idx, if not we store the clipvertex to it.
*/
static void
store_clip(struct gallivm_state *gallivm,
+ const struct lp_type vs_type,
LLVMValueRef io_ptr,
LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS],
boolean pre_clip_pos, int idx)
{
LLVMBuilderRef builder = gallivm->builder;
- LLVMValueRef out[4];
+ LLVMValueRef soa[4];
+ LLVMValueRef aos[LP_MAX_VECTOR_LENGTH];
LLVMValueRef indices[2];
- LLVMValueRef io0_ptr, io1_ptr, io2_ptr, io3_ptr;
- LLVMValueRef clip_ptr0, clip_ptr1, clip_ptr2, clip_ptr3;
- LLVMValueRef clip0_ptr, clip1_ptr, clip2_ptr, clip3_ptr;
- LLVMValueRef out0elem, out1elem, out2elem, out3elem;
- int i;
+ LLVMValueRef io_ptrs[LP_MAX_VECTOR_WIDTH / 32];
+ LLVMValueRef inds[LP_MAX_VECTOR_WIDTH / 32];
+ LLVMValueRef clip_ptrs[LP_MAX_VECTOR_WIDTH / 32];
+ int i, j;
- LLVMValueRef ind0 = lp_build_const_int32(gallivm, 0);
- LLVMValueRef ind1 = lp_build_const_int32(gallivm, 1);
- LLVMValueRef ind2 = lp_build_const_int32(gallivm, 2);
- LLVMValueRef ind3 = lp_build_const_int32(gallivm, 3);
-
indices[0] =
indices[1] = lp_build_const_int32(gallivm, 0);
- out[0] = LLVMBuildLoad(builder, outputs[idx][0], ""); /*x0 x1 x2 x3*/
- out[1] = LLVMBuildLoad(builder, outputs[idx][1], ""); /*y0 y1 y2 y3*/
- out[2] = LLVMBuildLoad(builder, outputs[idx][2], ""); /*z0 z1 z2 z3*/
- out[3] = LLVMBuildLoad(builder, outputs[idx][3], ""); /*w0 w1 w2 w3*/
+ for (i = 0; i < vs_type.length; i++) {
+ inds[i] = lp_build_const_int32(gallivm, i);
+ io_ptrs[i] = LLVMBuildGEP(builder, io_ptr, &inds[i], 1, "");
+ }
- io0_ptr = LLVMBuildGEP(builder, io_ptr, &ind0, 1, "");
- io1_ptr = LLVMBuildGEP(builder, io_ptr, &ind1, 1, "");
- io2_ptr = LLVMBuildGEP(builder, io_ptr, &ind2, 1, "");
- io3_ptr = LLVMBuildGEP(builder, io_ptr, &ind3, 1, "");
+ soa[0] = LLVMBuildLoad(builder, outputs[idx][0], ""); /*x0 x1 .. xn*/
+ soa[1] = LLVMBuildLoad(builder, outputs[idx][1], ""); /*y0 y1 .. yn*/
+ soa[2] = LLVMBuildLoad(builder, outputs[idx][2], ""); /*z0 z1 .. zn*/
+ soa[3] = LLVMBuildLoad(builder, outputs[idx][3], ""); /*w0 w1 .. wn*/
if (!pre_clip_pos) {
- clip_ptr0 = draw_jit_header_clip(gallivm, io0_ptr);
- clip_ptr1 = draw_jit_header_clip(gallivm, io1_ptr);
- clip_ptr2 = draw_jit_header_clip(gallivm, io2_ptr);
- clip_ptr3 = draw_jit_header_clip(gallivm, io3_ptr);
+ for (i = 0; i < vs_type.length; i++) {
+ clip_ptrs[i] = draw_jit_header_clip(gallivm, io_ptrs[i]);
+ }
} else {
- clip_ptr0 = draw_jit_header_pre_clip_pos(gallivm, io0_ptr);
- clip_ptr1 = draw_jit_header_pre_clip_pos(gallivm, io1_ptr);
- clip_ptr2 = draw_jit_header_pre_clip_pos(gallivm, io2_ptr);
- clip_ptr3 = draw_jit_header_pre_clip_pos(gallivm, io3_ptr);
+ for (i = 0; i < vs_type.length; i++) {
+ clip_ptrs[i] = draw_jit_header_pre_clip_pos(gallivm, io_ptrs[i]);
+ }
}
- for (i = 0; i<4; i++) {
- clip0_ptr = LLVMBuildGEP(builder, clip_ptr0, indices, 2, ""); /* x0 */
- clip1_ptr = LLVMBuildGEP(builder, clip_ptr1, indices, 2, ""); /* x1 */
- clip2_ptr = LLVMBuildGEP(builder, clip_ptr2, indices, 2, ""); /* x2 */
- clip3_ptr = LLVMBuildGEP(builder, clip_ptr3, indices, 2, ""); /* x3 */
-
- out0elem = LLVMBuildExtractElement(builder, out[i], ind0, ""); /* x0 */
- out1elem = LLVMBuildExtractElement(builder, out[i], ind1, ""); /* x1 */
- out2elem = LLVMBuildExtractElement(builder, out[i], ind2, ""); /* x2 */
- out3elem = LLVMBuildExtractElement(builder, out[i], ind3, ""); /* x3 */
-
- LLVMBuildStore(builder, out0elem, clip0_ptr);
- LLVMBuildStore(builder, out1elem, clip1_ptr);
- LLVMBuildStore(builder, out2elem, clip2_ptr);
- LLVMBuildStore(builder, out3elem, clip3_ptr);
-
- indices[1]= LLVMBuildAdd(builder, indices[1], ind1, "");
+ lp_build_transpose_aos(gallivm, vs_type, soa, soa);
+ for (i = 0; i < vs_type.length; ++i) {
+ aos[i] = lp_build_extract_range(gallivm,
+ soa[i % TGSI_NUM_CHANNELS],
+ (i / TGSI_NUM_CHANNELS) * TGSI_NUM_CHANNELS,
+ TGSI_NUM_CHANNELS);
}
-}
-
+ for (j = 0; j < vs_type.length; j++) {
+ LLVMTypeRef clip_ptr_type = LLVMPointerType(LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4), 0);
+ LLVMValueRef clip_ptr;
-/**
- * Equivalent of _mm_set1_ps(a)
- */
-static LLVMValueRef
-vec4f_from_scalar(struct gallivm_state *gallivm,
- LLVMValueRef a,
- const char *name)
-{
- LLVMTypeRef float_type = LLVMFloatTypeInContext(gallivm->context);
- LLVMValueRef res = LLVMGetUndef(LLVMVectorType(float_type, 4));
- int i;
+ clip_ptr = LLVMBuildGEP(builder, clip_ptrs[j], indices, 2, "clipo");
+ clip_ptr = LLVMBuildPointerCast(builder, clip_ptr, clip_ptr_type, "");
- for (i = 0; i < 4; ++i) {
- LLVMValueRef index = lp_build_const_int32(gallivm, i);
- res = LLVMBuildInsertElement(gallivm->builder, res, a,
- index, i == 3 ? name : "");
+ /* Unaligned store */
+ lp_set_store_alignment(LLVMBuildStore(builder, aos[j], clip_ptr), sizeof(float));
}
-
- return res;
}
@@ -981,15 +792,17 @@ vec4f_from_scalar(struct gallivm_state *gallivm,
* Transforms the outputs for viewport mapping
*/
static void
-generate_viewport(struct draw_llvm *llvm,
+generate_viewport(struct draw_llvm_variant *variant,
LLVMBuilderRef builder,
+ struct lp_type vs_type,
LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS],
LLVMValueRef context_ptr)
{
int i;
- struct gallivm_state *gallivm = llvm->gallivm;
- struct lp_type f32_type = lp_type_float_vec(32);
- LLVMValueRef out3 = LLVMBuildLoad(builder, outputs[0][3], ""); /*w0 w1 w2 w3*/
+ struct gallivm_state *gallivm = variant->gallivm;
+ struct lp_type f32_type = vs_type;
+ LLVMTypeRef vs_type_llvm = lp_build_vec_type(gallivm, vs_type);
+ LLVMValueRef out3 = LLVMBuildLoad(builder, outputs[0][3], ""); /*w0 w1 .. wn*/
LLVMValueRef const1 = lp_build_const_vec(gallivm, f32_type, 1.0); /*1.0 1.0 1.0 1.0*/
LLVMValueRef vp_ptr = draw_jit_context_viewport(gallivm, context_ptr);
@@ -999,7 +812,7 @@ generate_viewport(struct draw_llvm *llvm,
/* Viewport Mapping */
for (i=0; i<3; i++) {
- LLVMValueRef out = LLVMBuildLoad(builder, outputs[0][i], ""); /*x0 x1 x2 x3*/
+ LLVMValueRef out = LLVMBuildLoad(builder, outputs[0][i], ""); /*x0 x1 .. xn*/
LLVMValueRef scale;
LLVMValueRef trans;
LLVMValueRef scale_i;
@@ -1012,8 +825,10 @@ generate_viewport(struct draw_llvm *llvm,
index = lp_build_const_int32(gallivm, i+4);
trans_i = LLVMBuildGEP(builder, vp_ptr, &index, 1, "");
- scale = vec4f_from_scalar(gallivm, LLVMBuildLoad(builder, scale_i, ""), "scale");
- trans = vec4f_from_scalar(gallivm, LLVMBuildLoad(builder, trans_i, ""), "trans");
+ scale = lp_build_broadcast(gallivm, vs_type_llvm,
+ LLVMBuildLoad(builder, scale_i, "scale"));
+ trans = lp_build_broadcast(gallivm, vs_type_llvm,
+ LLVMBuildLoad(builder, trans_i, "trans"));
/* divide by w */
out = LLVMBuildFMul(builder, out, out3, "");
@@ -1030,10 +845,12 @@ generate_viewport(struct draw_llvm *llvm,
/**
- * Returns clipmask as 4xi32 bitmask for the 4 vertices
+ * Returns clipmask as nxi32 bitmask for the n vertices
*/
static LLVMValueRef
generate_clipmask(struct draw_llvm *llvm,
+ struct gallivm_state *gallivm,
+ struct lp_type vs_type,
LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS],
boolean clip_xy,
boolean clip_z,
@@ -1043,15 +860,15 @@ generate_clipmask(struct draw_llvm *llvm,
LLVMValueRef context_ptr,
boolean *have_clipdist)
{
- struct gallivm_state *gallivm = llvm->gallivm;
LLVMBuilderRef builder = gallivm->builder;
- LLVMValueRef mask; /* stores the <4xi32> clipmasks */
+ LLVMValueRef mask; /* stores the <nxi32> clipmasks */
LLVMValueRef test, temp;
LLVMValueRef zero, shift;
LLVMValueRef pos_x, pos_y, pos_z, pos_w;
LLVMValueRef cv_x, cv_y, cv_z, cv_w;
LLVMValueRef plane1, planes, plane_ptr, sum;
- struct lp_type f32_type = lp_type_float_vec(32);
+ struct lp_type f32_type = vs_type;
+ struct lp_type i32_type = lp_int_type(vs_type);
const unsigned pos = draw_current_shader_position_output(llvm->draw);
const unsigned cv = draw_current_shader_clipvertex_output(llvm->draw);
int num_written_clipdistance = llvm->draw->vs.vertex_shader->info.num_written_clipdistance;
@@ -1064,25 +881,25 @@ generate_clipmask(struct draw_llvm *llvm,
if (cd[0] != pos || cd[1] != pos)
have_cd = true;
- mask = lp_build_const_int_vec(gallivm, lp_type_int_vec(32), 0);
- temp = lp_build_const_int_vec(gallivm, lp_type_int_vec(32), 0);
- zero = lp_build_const_vec(gallivm, f32_type, 0); /* 0.0f 0.0f 0.0f 0.0f */
- shift = lp_build_const_int_vec(gallivm, lp_type_int_vec(32), 1); /* 1 1 1 1 */
+ mask = lp_build_const_int_vec(gallivm, i32_type, 0);
+ temp = lp_build_const_int_vec(gallivm, i32_type, 0);
+ zero = lp_build_const_vec(gallivm, f32_type, 0); /* 0.0f 0.0f 0.0f 0.0f */
+ shift = lp_build_const_int_vec(gallivm, i32_type, 1); /* 1 1 1 1 */
/*
* load clipvertex and position from correct locations.
* if they are the same just load them once.
*/
- pos_x = LLVMBuildLoad(builder, outputs[pos][0], ""); /*x0 x1 x2 x3*/
- pos_y = LLVMBuildLoad(builder, outputs[pos][1], ""); /*y0 y1 y2 y3*/
- pos_z = LLVMBuildLoad(builder, outputs[pos][2], ""); /*z0 z1 z2 z3*/
- pos_w = LLVMBuildLoad(builder, outputs[pos][3], ""); /*w0 w1 w2 w3*/
+ pos_x = LLVMBuildLoad(builder, outputs[pos][0], ""); /*x0 x1 .. xn */
+ pos_y = LLVMBuildLoad(builder, outputs[pos][1], ""); /*y0 y1 .. yn */
+ pos_z = LLVMBuildLoad(builder, outputs[pos][2], ""); /*z0 z1 .. zn */
+ pos_w = LLVMBuildLoad(builder, outputs[pos][3], ""); /*w0 w1 .. wn */
if (clip_user && cv != pos) {
- cv_x = LLVMBuildLoad(builder, outputs[cv][0], ""); /*x0 x1 x2 x3*/
- cv_y = LLVMBuildLoad(builder, outputs[cv][1], ""); /*y0 y1 y2 y3*/
- cv_z = LLVMBuildLoad(builder, outputs[cv][2], ""); /*z0 z1 z2 z3*/
- cv_w = LLVMBuildLoad(builder, outputs[cv][3], ""); /*w0 w1 w2 w3*/
+ cv_x = LLVMBuildLoad(builder, outputs[cv][0], ""); /*x0 x1 .. xn */
+ cv_y = LLVMBuildLoad(builder, outputs[cv][1], ""); /*y0 y1 .. yn */
+ cv_z = LLVMBuildLoad(builder, outputs[cv][2], ""); /*z0 z1 .. zn */
+ cv_w = LLVMBuildLoad(builder, outputs[cv][3], ""); /*w0 w1 .. wn */
} else {
cv_x = pos_x;
cv_y = pos_y;
@@ -1120,7 +937,7 @@ generate_clipmask(struct draw_llvm *llvm,
}
if (clip_z) {
- temp = lp_build_const_int_vec(gallivm, lp_type_int_vec(32), 16);
+ temp = lp_build_const_int_vec(gallivm, i32_type, 16);
if (clip_halfz) {
/* plane 5 */
test = lp_build_compare(gallivm, f32_type, PIPE_FUNC_GREATER, zero, pos_z);
@@ -1163,42 +980,43 @@ generate_clipmask(struct draw_llvm *llvm,
clipdist = LLVMBuildLoad(builder, outputs[cd[1]][i-4], "");
}
test = lp_build_compare(gallivm, f32_type, PIPE_FUNC_GREATER, zero, clipdist);
- temp = lp_build_const_int_vec(gallivm, lp_type_int_vec(32), 1 << plane_idx);
+ temp = lp_build_const_int_vec(gallivm, i32_type, 1 << plane_idx);
test = LLVMBuildAnd(builder, test, temp, "");
mask = LLVMBuildOr(builder, mask, test, "");
} else {
+ LLVMTypeRef vs_type_llvm = lp_build_vec_type(gallivm, vs_type);
indices[0] = lp_build_const_int32(gallivm, 0);
indices[1] = lp_build_const_int32(gallivm, plane_idx);
indices[2] = lp_build_const_int32(gallivm, 0);
plane_ptr = LLVMBuildGEP(builder, planes_ptr, indices, 3, "");
plane1 = LLVMBuildLoad(builder, plane_ptr, "plane_x");
- planes = vec4f_from_scalar(gallivm, plane1, "plane4_x");
+ planes = lp_build_broadcast(gallivm, vs_type_llvm, plane1);
sum = LLVMBuildFMul(builder, planes, cv_x, "");
indices[2] = lp_build_const_int32(gallivm, 1);
plane_ptr = LLVMBuildGEP(builder, planes_ptr, indices, 3, "");
plane1 = LLVMBuildLoad(builder, plane_ptr, "plane_y");
- planes = vec4f_from_scalar(gallivm, plane1, "plane4_y");
+ planes = lp_build_broadcast(gallivm, vs_type_llvm, plane1);
test = LLVMBuildFMul(builder, planes, cv_y, "");
sum = LLVMBuildFAdd(builder, sum, test, "");
indices[2] = lp_build_const_int32(gallivm, 2);
plane_ptr = LLVMBuildGEP(builder, planes_ptr, indices, 3, "");
plane1 = LLVMBuildLoad(builder, plane_ptr, "plane_z");
- planes = vec4f_from_scalar(gallivm, plane1, "plane4_z");
+ planes = lp_build_broadcast(gallivm, vs_type_llvm, plane1);
test = LLVMBuildFMul(builder, planes, cv_z, "");
sum = LLVMBuildFAdd(builder, sum, test, "");
indices[2] = lp_build_const_int32(gallivm, 3);
plane_ptr = LLVMBuildGEP(builder, planes_ptr, indices, 3, "");
plane1 = LLVMBuildLoad(builder, plane_ptr, "plane_w");
- planes = vec4f_from_scalar(gallivm, plane1, "plane4_w");
+ planes = lp_build_broadcast(gallivm, vs_type_llvm, plane1);
test = LLVMBuildFMul(builder, planes, cv_w, "");
sum = LLVMBuildFAdd(builder, sum, test, "");
test = lp_build_compare(gallivm, f32_type, PIPE_FUNC_GREATER, zero, sum);
- temp = lp_build_const_int_vec(gallivm, lp_type_int_vec(32), 1 << plane_idx);
+ temp = lp_build_const_int_vec(gallivm, i32_type, 1 << plane_idx);
test = LLVMBuildAnd(builder, test, temp, "");
mask = LLVMBuildOr(builder, mask, test, "");
}
@@ -1212,23 +1030,28 @@ generate_clipmask(struct draw_llvm *llvm,
* Returns boolean if any clipping has occurred
* Used zero/non-zero i32 value to represent boolean
*/
-static void
-clipmask_bool(struct gallivm_state *gallivm,
- LLVMValueRef clipmask,
- LLVMValueRef ret_ptr)
+static LLVMValueRef
+clipmask_booli32(struct gallivm_state *gallivm,
+ const struct lp_type vs_type,
+ LLVMValueRef clipmask_bool_ptr)
{
LLVMBuilderRef builder = gallivm->builder;
- LLVMValueRef ret = LLVMBuildLoad(builder, ret_ptr, "");
+ LLVMTypeRef int32_type = LLVMInt32TypeInContext(gallivm->context);
+ LLVMValueRef clipmask_bool = LLVMBuildLoad(builder, clipmask_bool_ptr, "");
+ LLVMValueRef ret = LLVMConstNull(int32_type);
LLVMValueRef temp;
int i;
- for (i=0; i<4; i++) {
- temp = LLVMBuildExtractElement(builder, clipmask,
+ /*
+ * Can do this with log2(vector length) pack instructions and one extract
+ * (as we don't actually need a or) with sse2 which would be way better.
+ */
+ for (i=0; i < vs_type.length; i++) {
+ temp = LLVMBuildExtractElement(builder, clipmask_bool,
lp_build_const_int32(gallivm, i) , "");
ret = LLVMBuildOr(builder, ret, temp, "");
}
-
- LLVMBuildStore(builder, ret, ret_ptr);
+ return ret;
}
@@ -1236,7 +1059,7 @@ static void
draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
boolean elts)
{
- struct gallivm_state *gallivm = llvm->gallivm;
+ struct gallivm_state *gallivm = variant->gallivm;
LLVMContextRef context = gallivm->context;
LLVMTypeRef int32_type = LLVMInt32TypeInContext(context);
LLVMTypeRef arg_types[8];
@@ -1244,6 +1067,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
LLVMValueRef context_ptr;
LLVMBasicBlockRef block;
LLVMBuilderRef builder;
+ struct lp_type vs_type;
LLVMValueRef end, start;
LLVMValueRef count, fetch_elts, fetch_count;
LLVMValueRef stride, step, io_itr;
@@ -1255,12 +1079,11 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
unsigned i, j;
struct lp_build_context bld;
struct lp_build_loop_state lp_loop;
- const int max_vertices = 4;
+ const int vector_length = lp_native_vector_width / 32;
LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][TGSI_NUM_CHANNELS];
LLVMValueRef fetch_max;
- void *code;
struct lp_build_sampler_soa *sampler = 0;
- LLVMValueRef ret, ret_ptr;
+ LLVMValueRef ret, clipmask_bool_ptr;
const boolean bypass_viewport = variant->key.bypass_viewport;
const boolean enable_cliptest = variant->key.clip_xy ||
variant->key.clip_z ||
@@ -1273,16 +1096,16 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
memset(&system_values, 0, sizeof(system_values));
- arg_types[0] = get_context_ptr_type(llvm); /* context */
- arg_types[1] = get_vertex_header_ptr_type(llvm); /* vertex_header */
- arg_types[2] = get_buffer_ptr_type(llvm); /* vbuffers */
+ arg_types[0] = get_context_ptr_type(variant); /* context */
+ arg_types[1] = get_vertex_header_ptr_type(variant); /* vertex_header */
+ arg_types[2] = get_buffer_ptr_type(variant); /* vbuffers */
if (elts)
arg_types[3] = LLVMPointerType(int32_type, 0);/* fetch_elts * */
else
arg_types[3] = int32_type; /* start */
arg_types[4] = int32_type; /* fetch_count / count */
arg_types[5] = int32_type; /* stride */
- arg_types[6] = get_vb_ptr_type(llvm); /* pipe_vertex_buffer's */
+ arg_types[6] = get_vb_ptr_type(variant); /* pipe_vertex_buffer's */
arg_types[7] = int32_type; /* instance_id */
func_type = LLVMFunctionType(int32_type, arg_types, Elements(arg_types), 0);
@@ -1341,9 +1164,16 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
lp_build_context_init(&bld, gallivm, lp_type_int(32));
- /* function will return non-zero i32 value if any clipped vertices */
- ret_ptr = lp_build_alloca(gallivm, int32_type, "");
- LLVMBuildStore(builder, zero, ret_ptr);
+ memset(&vs_type, 0, sizeof vs_type);
+ vs_type.floating = TRUE; /* floating point values */
+ vs_type.sign = TRUE; /* values are signed */
+ vs_type.norm = FALSE; /* values are not limited to [0,1] or [-1,1] */
+ vs_type.width = 32; /* 32-bit float */
+ vs_type.length = vector_length;
+
+ /* hold temporary "bool" clipmask */
+ clipmask_bool_ptr = lp_build_alloca(gallivm, lp_build_int_vec_type(gallivm, vs_type), "");
+ LLVMBuildStore(builder, lp_build_zero(gallivm, lp_int_type(vs_type)), clipmask_bool_ptr);
/* code generated texture sampling */
sampler = draw_llvm_sampler_soa_create(
@@ -1358,14 +1188,14 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
end = lp_build_add(&bld, start, count);
}
- step = lp_build_const_int32(gallivm, max_vertices);
+ step = lp_build_const_int32(gallivm, vector_length);
fetch_max = LLVMBuildSub(builder, end, one, "fetch_max");
lp_build_loop_begin(&lp_loop, gallivm, start);
{
LLVMValueRef inputs[PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS];
- LLVMValueRef aos_attribs[PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS] = { { 0 } };
+ LLVMValueRef aos_attribs[PIPE_MAX_SHADER_INPUTS][LP_MAX_VECTOR_WIDTH / 32] = { { 0 } };
LLVMValueRef io;
LLVMValueRef clipmask; /* holds the clipmask value */
const LLVMValueRef (*ptr_aos)[TGSI_NUM_CHANNELS];
@@ -1377,11 +1207,11 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
io = LLVMBuildGEP(builder, io_ptr, &io_itr, 1, "");
#if DEBUG_STORE
- lp_build_printf(builder, " --- io %d = %p, loop counter %d\n",
+ lp_build_printf(gallivm, " --- io %d = %p, loop counter %d\n",
io_itr, io, lp_loop.counter);
#endif
- system_values.vertex_id = lp_build_zero(gallivm, lp_type_uint_vec(32));
- for (i = 0; i < TGSI_NUM_CHANNELS; ++i) {
+ system_values.vertex_id = lp_build_zero(gallivm, lp_type_uint_vec(32, 32*vector_length));
+ for (i = 0; i < vector_length; ++i) {
LLVMValueRef true_index =
LLVMBuildAdd(builder,
lp_loop.counter,
@@ -1413,11 +1243,12 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
}
}
convert_to_soa(gallivm, aos_attribs, inputs,
- draw->pt.nr_vertex_elements);
+ draw->pt.nr_vertex_elements, vs_type);
ptr_aos = (const LLVMValueRef (*)[TGSI_NUM_CHANNELS]) inputs;
- generate_vs(llvm,
+ generate_vs(variant,
builder,
+ vs_type,
outputs,
ptr_aos,
&system_values,
@@ -1426,29 +1257,34 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
variant->key.clamp_vertex_color);
/* store original positions in clip before further manipulation */
- store_clip(gallivm, io, outputs, 0, cv);
- store_clip(gallivm, io, outputs, 1, pos);
+ store_clip(gallivm, vs_type, io, outputs, 0, cv);
+ store_clip(gallivm, vs_type, io, outputs, 1, pos);
/* do cliptest */
if (enable_cliptest) {
+ LLVMValueRef temp = LLVMBuildLoad(builder, clipmask_bool_ptr, "");
/* allocate clipmask, assign it integer type */
- clipmask = generate_clipmask(llvm, outputs,
+ clipmask = generate_clipmask(llvm,
+ gallivm,
+ vs_type,
+ outputs,
variant->key.clip_xy,
variant->key.clip_z,
variant->key.clip_user,
variant->key.clip_halfz,
variant->key.ucp_enable,
context_ptr, &have_clipdist);
- /* return clipping boolean value for function */
- clipmask_bool(gallivm, clipmask, ret_ptr);
+ temp = LLVMBuildOr(builder, clipmask, temp, "");
+ /* store temporary clipping boolean value */
+ LLVMBuildStore(builder, temp, clipmask_bool_ptr);
}
else {
- clipmask = lp_build_const_int_vec(gallivm, lp_type_int_vec(32), 0);
+ clipmask = lp_build_const_int_vec(gallivm, lp_int_type(vs_type), 0);
}
/* do viewport mapping */
if (!bypass_viewport) {
- generate_viewport(llvm, builder, outputs, context_ptr);
+ generate_viewport(variant, builder, vs_type, outputs, context_ptr);
}
/* store clipmask in vertex header,
@@ -1456,43 +1292,20 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
* and transformed positions in data
*/
convert_to_aos(gallivm, io, outputs, clipmask,
- vs_info->num_outputs, max_vertices, have_clipdist);
+ vs_info->num_outputs, vs_type,
+ have_clipdist);
}
lp_build_loop_end_cond(&lp_loop, end, step, LLVMIntUGE);
sampler->destroy(sampler);
- ret = LLVMBuildLoad(builder, ret_ptr, "");
- LLVMBuildRet(builder, ret);
-
- /*
- * Translate the LLVM IR into machine code.
- */
-#ifdef DEBUG
- if (LLVMVerifyFunction(variant_func, LLVMPrintMessageAction)) {
- lp_debug_dump_value(variant_func);
- assert(0);
- }
-#endif
-
- LLVMRunFunctionPassManager(gallivm->passmgr, variant_func);
+ /* return clipping boolean value for function */
+ ret = clipmask_booli32(gallivm, vs_type, clipmask_bool_ptr);
- if (gallivm_debug & GALLIVM_DEBUG_IR) {
- lp_debug_dump_value(variant_func);
- debug_printf("\n");
- }
-
- code = LLVMGetPointerToGlobal(gallivm->engine, variant_func);
- if (elts)
- variant->jit_func_elts = (draw_jit_vert_func_elts) pointer_to_func(code);
- else
- variant->jit_func = (draw_jit_vert_func) pointer_to_func(code);
+ LLVMBuildRet(builder, ret);
- if (gallivm_debug & GALLIVM_DEBUG_ASM) {
- lp_disassemble(code);
- }
- lp_func_delete_body(variant_func);
+ gallivm_verify_function(gallivm, variant_func);
}
@@ -1600,17 +1413,17 @@ draw_llvm_destroy_variant(struct draw_llvm_variant *variant)
struct draw_llvm *llvm = variant->llvm;
if (variant->function_elts) {
- LLVMFreeMachineCodeForFunction(llvm->gallivm->engine,
- variant->function_elts);
- LLVMDeleteFunction(variant->function_elts);
+ gallivm_free_function(variant->gallivm,
+ variant->function_elts, variant->jit_func_elts);
}
if (variant->function) {
- LLVMFreeMachineCodeForFunction(llvm->gallivm->engine,
- variant->function);
- LLVMDeleteFunction(variant->function);
+ gallivm_free_function(variant->gallivm,
+ variant->function, variant->jit_func);
}
+ gallivm_destroy(variant->gallivm);
+
remove_from_list(&variant->list_item_local);
variant->shader->variants_cached--;
remove_from_list(&variant->list_item_global);
diff --git a/src/gallium/auxiliary/draw/draw_llvm.h b/src/gallium/auxiliary/draw/draw_llvm.h
index 31fc2db05bd..39d83cfe99f 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.h
+++ b/src/gallium/auxiliary/draw/draw_llvm.h
@@ -36,11 +36,6 @@
#include "pipe/p_context.h"
#include "util/u_simple_list.h"
-#include <llvm-c/Core.h>
-#include <llvm-c/Analysis.h>
-#include <llvm-c/Target.h>
-#include <llvm-c/ExecutionEngine.h>
-
struct draw_llvm;
struct llvm_vertex_shader;
@@ -220,6 +215,14 @@ struct draw_llvm_variant_list_item
struct draw_llvm_variant
{
+ struct gallivm_state *gallivm;
+
+ /* LLVM JIT builder types */
+ LLVMTypeRef context_ptr_type;
+ LLVMTypeRef buffer_ptr_type;
+ LLVMTypeRef vb_ptr_type;
+ LLVMTypeRef vertex_header_ptr_type;
+
LLVMValueRef function;
LLVMValueRef function_elts;
draw_jit_vert_func jit_func;
@@ -249,16 +252,8 @@ struct draw_llvm {
struct draw_jit_context jit_context;
- struct gallivm_state *gallivm;
-
struct draw_llvm_variant_list_item vs_variants_list;
int nr_variants;
-
- /* LLVM JIT builder types */
- LLVMTypeRef context_ptr_type;
- LLVMTypeRef buffer_ptr_type;
- LLVMTypeRef vb_ptr_type;
- LLVMTypeRef vertex_header_ptr_type;
};
@@ -270,7 +265,7 @@ llvm_vertex_shader(struct draw_vertex_shader *vs)
struct draw_llvm *
-draw_llvm_create(struct draw_context *draw, struct gallivm_state *gallivm);
+draw_llvm_create(struct draw_context *draw);
void
draw_llvm_destroy(struct draw_llvm *llvm);
@@ -286,11 +281,6 @@ draw_llvm_destroy_variant(struct draw_llvm_variant *variant);
struct draw_llvm_variant_key *
draw_llvm_make_variant_key(struct draw_llvm *llvm, char *store);
-LLVMValueRef
-draw_llvm_translate_from(struct gallivm_state *gallivm,
- LLVMValueRef vbuffer,
- enum pipe_format from_format);
-
struct lp_build_sampler_soa *
draw_llvm_sampler_soa_create(const struct lp_sampler_static_state *static_state,
LLVMValueRef context_ptr);
diff --git a/src/gallium/auxiliary/draw/draw_llvm_sample.c b/src/gallium/auxiliary/draw/draw_llvm_sample.c
index 0a8b3bc535f..1dbe5f5bd19 100644
--- a/src/gallium/auxiliary/draw/draw_llvm_sample.c
+++ b/src/gallium/auxiliary/draw/draw_llvm_sample.c
@@ -173,8 +173,7 @@ draw_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base,
unsigned unit,
unsigned num_coords,
const LLVMValueRef *coords,
- const LLVMValueRef *ddx,
- const LLVMValueRef *ddy,
+ const struct lp_derivatives *derivs,
LLVMValueRef lod_bias, /* optional */
LLVMValueRef explicit_lod, /* optional */
LLVMValueRef *texel)
@@ -189,7 +188,7 @@ draw_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base,
type,
unit,
num_coords, coords,
- ddx, ddy,
+ derivs,
lod_bias, explicit_lod,
texel);
}
@@ -201,6 +200,7 @@ draw_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base,
static void
draw_llvm_sampler_soa_emit_size_query(const struct lp_build_sampler_soa *base,
struct gallivm_state *gallivm,
+ struct lp_type type,
unsigned unit,
LLVMValueRef explicit_lod, /* optional */
LLVMValueRef *sizes_out)
@@ -212,6 +212,7 @@ draw_llvm_sampler_soa_emit_size_query(const struct lp_build_sampler_soa *base,
lp_build_size_query_soa(gallivm,
&sampler->dynamic_state.static_state[unit],
&sampler->dynamic_state.base,
+ type,
unit,
explicit_lod,
sizes_out);
diff --git a/src/gallium/auxiliary/draw/draw_llvm_translate.c b/src/gallium/auxiliary/draw/draw_llvm_translate.c
deleted file mode 100644
index 77d0af74733..00000000000
--- a/src/gallium/auxiliary/draw/draw_llvm_translate.c
+++ /dev/null
@@ -1,506 +0,0 @@
-#include "draw_private.h"
-#include "draw_context.h"
-
-#include "draw_llvm.h"
-
-#include "gallivm/lp_bld_const.h"
-#include "gallivm/lp_bld_struct.h"
-#include "gallivm/lp_bld_format.h"
-#include "gallivm/lp_bld_debug.h"
-#include "gallivm/lp_bld_type.h"
-
-#include "util/u_memory.h"
-#include "util/u_format.h"
-#include "pipe/p_state.h"
-
-
-#define DRAW_DBG 0
-
-static LLVMValueRef
-from_64_float(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val,
- LLVMPointerType(LLVMDoubleTypeInContext(gallivm->context), 0) , "");
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, "");
- return LLVMBuildFPTrunc(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
-}
-
-static LLVMValueRef
-from_32_float(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val,
- LLVMPointerType(LLVMFloatTypeInContext(gallivm->context), 0) , "");
- return LLVMBuildLoad(gallivm->builder, bc, "");
-}
-
-static INLINE LLVMValueRef
-from_8_uscaled(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, val, "");
- return LLVMBuildUIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
-}
-
-static INLINE LLVMValueRef
-from_16_uscaled(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val,
- LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 16), 0) , "");
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, "");
- return LLVMBuildUIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
-}
-
-static INLINE LLVMValueRef
-from_32_uscaled(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val,
- LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 32), 0) , "");
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, "");
- return LLVMBuildUIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
-}
-
-static INLINE LLVMValueRef
-from_8_sscaled(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, val, "");
- return LLVMBuildSIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
-}
-
-static INLINE LLVMValueRef
-from_16_sscaled(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val,
- LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 16), 0) , "");
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, "");
- return LLVMBuildSIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
-}
-
-static INLINE LLVMValueRef
-from_32_sscaled(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val,
- LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 32), 0) , "");
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, "");
- return LLVMBuildSIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
-}
-
-
-static INLINE LLVMValueRef
-from_8_unorm(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, val, "");
- LLVMValueRef uscaled = LLVMBuildUIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
- return LLVMBuildFDiv(gallivm->builder, uscaled,
- lp_build_const_float(gallivm, 255.), "");
-}
-
-static INLINE LLVMValueRef
-from_16_unorm(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val,
- LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 16), 0) , "");
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, "");
- LLVMValueRef uscaled = LLVMBuildUIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
- return LLVMBuildFDiv(gallivm->builder, uscaled,
- lp_build_const_float(gallivm, 65535.), "");
-}
-
-static INLINE LLVMValueRef
-from_32_unorm(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val,
- LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 32), 0) , "");
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, "");
- LLVMValueRef uscaled = LLVMBuildUIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
-
- return LLVMBuildFDiv(gallivm->builder, uscaled,
- lp_build_const_float(gallivm, 4294967295.), "");
-}
-
-static INLINE LLVMValueRef
-from_8_snorm(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, val, "");
- LLVMValueRef uscaled = LLVMBuildSIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
- return LLVMBuildFDiv(gallivm->builder, uscaled,
- lp_build_const_float(gallivm, 127.0), "");
-}
-
-static INLINE LLVMValueRef
-from_16_snorm(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val,
- LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 16), 0) , "");
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, "");
- LLVMValueRef uscaled = LLVMBuildSIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
- return LLVMBuildFDiv(gallivm->builder, uscaled,
- lp_build_const_float(gallivm, 32767.0f), "");
-}
-
-static INLINE LLVMValueRef
-from_32_snorm(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val,
- LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 32), 0) , "");
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, "");
- LLVMValueRef uscaled = LLVMBuildSIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
-
- return LLVMBuildFDiv(gallivm->builder, uscaled,
- lp_build_const_float(gallivm, 2147483647.0), "");
-}
-
-static INLINE LLVMValueRef
-from_32_fixed(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val,
- LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 32), 0) , "");
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, "");
- LLVMValueRef uscaled = LLVMBuildSIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
-
- return LLVMBuildFDiv(gallivm->builder, uscaled,
- lp_build_const_float(gallivm, 65536.0), "");
-}
-
-static LLVMValueRef
-to_64_float(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
- return LLVMBuildFPExt(gallivm->builder, l, LLVMDoubleTypeInContext(gallivm->context), "");
-}
-
-static LLVMValueRef
-to_32_float(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
- return LLVMBuildLoad(gallivm->builder, fp, "");
-}
-
-static INLINE LLVMValueRef
-to_8_uscaled(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
- return LLVMBuildFPToUI(gallivm->builder, l, LLVMIntTypeInContext(gallivm->context, 8), "");
-}
-
-static INLINE LLVMValueRef
-to_16_uscaled(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
- return LLVMBuildFPToUI(gallivm->builder, l, LLVMIntTypeInContext(gallivm->context, 16), "");
-}
-
-static INLINE LLVMValueRef
-to_32_uscaled(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
- return LLVMBuildFPToUI(gallivm->builder, l, LLVMIntTypeInContext(gallivm->context, 32), "");
-}
-
-static INLINE LLVMValueRef
-to_8_sscaled(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
- return LLVMBuildFPToSI(gallivm->builder, l, LLVMIntTypeInContext(gallivm->context, 8), "");
-}
-
-static INLINE LLVMValueRef
-to_16_sscaled(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
- return LLVMBuildFPToSI(gallivm->builder, l, LLVMIntTypeInContext(gallivm->context, 16), "");
-}
-
-static INLINE LLVMValueRef
-to_32_sscaled(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
- return LLVMBuildFPToSI(gallivm->builder, l, LLVMIntTypeInContext(gallivm->context, 32), "");
-}
-
-static INLINE LLVMValueRef
-to_8_unorm(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
- LLVMValueRef uscaled = LLVMBuildFPToUI(gallivm->builder, l,
- LLVMIntTypeInContext(gallivm->context, 8), "");
- return LLVMBuildFMul(gallivm->builder, uscaled,
- lp_build_const_float(gallivm, 255.), "");
-}
-
-static INLINE LLVMValueRef
-to_16_unorm(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
- LLVMValueRef uscaled = LLVMBuildFPToUI(gallivm->builder, l,
- LLVMIntTypeInContext(gallivm->context, 32), "");
- return LLVMBuildFMul(gallivm->builder, uscaled,
- lp_build_const_float(gallivm, 65535.), "");
-}
-
-static INLINE LLVMValueRef
-to_32_unorm(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
- LLVMValueRef uscaled = LLVMBuildFPToUI(gallivm->builder, l,
- LLVMIntTypeInContext(gallivm->context, 32), "");
-
- return LLVMBuildFMul(gallivm->builder, uscaled,
- lp_build_const_float(gallivm, 4294967295.), "");
-}
-
-static INLINE LLVMValueRef
-to_8_snorm(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, val, "");
- LLVMValueRef uscaled = LLVMBuildFPToSI(gallivm->builder, l,
- LLVMIntTypeInContext(gallivm->context, 8), "");
- return LLVMBuildFMul(gallivm->builder, uscaled,
- lp_build_const_float(gallivm, 127.0), "");
-}
-
-static INLINE LLVMValueRef
-to_16_snorm(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
- LLVMValueRef uscaled = LLVMBuildFPToSI(gallivm->builder, l,
- LLVMIntTypeInContext(gallivm->context, 16), "");
- return LLVMBuildFMul(gallivm->builder, uscaled,
- lp_build_const_float(gallivm, 32767.0f), "");
-}
-
-static INLINE LLVMValueRef
-to_32_snorm(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
- LLVMValueRef uscaled = LLVMBuildFPToSI(gallivm->builder, l,
- LLVMIntTypeInContext(gallivm->context, 32), "");
-
- return LLVMBuildFMul(gallivm->builder, uscaled,
- lp_build_const_float(gallivm, 2147483647.0), "");
-}
-
-static INLINE LLVMValueRef
-to_32_fixed(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
- LLVMValueRef uscaled = LLVMBuildFPToSI(gallivm->builder, l,
- LLVMIntTypeInContext(gallivm->context, 32), "");
-
- return LLVMBuildFMul(gallivm->builder, uscaled,
- lp_build_const_float(gallivm, 65536.0), "");
-}
-
-typedef LLVMValueRef (*from_func)(struct gallivm_state *, LLVMValueRef);
-typedef LLVMValueRef (*to_func)(struct gallivm_state *, LLVMValueRef);
-
-/* so that underneath can avoid function calls which are prohibited
- * for static initialization we need this conversion */
-enum ll_type {
- LL_Double,
- LL_Float,
- LL_Int32,
- LL_Int16,
- LL_Int8
-};
-
-static INLINE LLVMTypeRef
-ll_type_to_llvm(struct gallivm_state *gallivm, enum ll_type type)
-{
- switch (type) {
- case LL_Double:
- return LLVMDoubleTypeInContext(gallivm->context);
- case LL_Float:
- return LLVMFloatTypeInContext(gallivm->context);
- case LL_Int32:
- return LLVMInt32TypeInContext(gallivm->context);
- case LL_Int16:
- return LLVMIntTypeInContext(gallivm->context, 16);
- case LL_Int8:
- return LLVMIntTypeInContext(gallivm->context, 8);
- }
- return LLVMIntTypeInContext(gallivm->context, 8);
-}
-
-static INLINE int
-ll_type_size(enum ll_type type)
-{
- switch (type) {
- case LL_Double:
- return 8;
- case LL_Float:
- return 4;
- case LL_Int32:
- return 4;
- case LL_Int16:
- return 2;
- case LL_Int8:
- return 1;
- }
- return 1;
-}
-
-struct draw_llvm_translate {
- int format;
- from_func from;
- to_func to;
- enum ll_type type;
- int num_components;
-} translates[] =
-{
- {PIPE_FORMAT_R64_FLOAT, from_64_float, to_64_float, LL_Double, 1},
- {PIPE_FORMAT_R64G64_FLOAT, from_64_float, to_64_float, LL_Double, 2},
- {PIPE_FORMAT_R64G64B64_FLOAT, from_64_float, to_64_float, LL_Double, 3},
- {PIPE_FORMAT_R64G64B64A64_FLOAT, from_64_float, to_64_float, LL_Double, 4},
- {PIPE_FORMAT_R32_FLOAT, from_32_float, to_32_float, LL_Float, 1},
- {PIPE_FORMAT_R32G32_FLOAT, from_32_float, to_32_float, LL_Float, 2},
- {PIPE_FORMAT_R32G32B32_FLOAT, from_32_float, to_32_float, LL_Float, 3},
- {PIPE_FORMAT_R32G32B32A32_FLOAT, from_32_float, to_32_float, LL_Float, 4},
-
- {PIPE_FORMAT_R32_UNORM, from_32_unorm, to_32_unorm, LL_Int32, 1},
- {PIPE_FORMAT_R32G32_UNORM, from_32_unorm, to_32_unorm, LL_Int32, 2},
- {PIPE_FORMAT_R32G32B32_UNORM, from_32_unorm, to_32_unorm, LL_Int32, 3},
- {PIPE_FORMAT_R32G32B32A32_UNORM, from_32_unorm, to_32_unorm, LL_Int32, 4},
-
- {PIPE_FORMAT_R32_USCALED, from_32_uscaled, to_32_uscaled, LL_Int32, 1},
- {PIPE_FORMAT_R32G32_USCALED, from_32_uscaled, to_32_uscaled, LL_Int32, 2},
- {PIPE_FORMAT_R32G32B32_USCALED, from_32_uscaled, to_32_uscaled, LL_Int32, 3},
- {PIPE_FORMAT_R32G32B32A32_USCALED, from_32_uscaled, to_32_uscaled, LL_Int32, 4},
-
- {PIPE_FORMAT_R32_SNORM, from_32_snorm, to_32_snorm, LL_Int32, 1},
- {PIPE_FORMAT_R32G32_SNORM, from_32_snorm, to_32_snorm, LL_Int32, 2},
- {PIPE_FORMAT_R32G32B32_SNORM, from_32_snorm, to_32_snorm, LL_Int32, 3},
- {PIPE_FORMAT_R32G32B32A32_SNORM, from_32_snorm, to_32_snorm, LL_Int32, 4},
-
- {PIPE_FORMAT_R32_SSCALED, from_32_sscaled, to_32_sscaled, LL_Int32, 1},
- {PIPE_FORMAT_R32G32_SSCALED, from_32_sscaled, to_32_sscaled, LL_Int32, 2},
- {PIPE_FORMAT_R32G32B32_SSCALED, from_32_sscaled, to_32_sscaled, LL_Int32, 3},
- {PIPE_FORMAT_R32G32B32A32_SSCALED, from_32_sscaled, to_32_sscaled, LL_Int32, 4},
-
- {PIPE_FORMAT_R16_UNORM, from_16_unorm, to_16_unorm, LL_Int16, 1},
- {PIPE_FORMAT_R16G16_UNORM, from_16_unorm, to_16_unorm, LL_Int16, 2},
- {PIPE_FORMAT_R16G16B16_UNORM, from_16_unorm, to_16_unorm, LL_Int16, 3},
- {PIPE_FORMAT_R16G16B16A16_UNORM, from_16_unorm, to_16_unorm, LL_Int16, 4},
-
- {PIPE_FORMAT_R16_USCALED, from_16_uscaled, to_16_uscaled, LL_Int16, 1},
- {PIPE_FORMAT_R16G16_USCALED, from_16_uscaled, to_16_uscaled, LL_Int16, 2},
- {PIPE_FORMAT_R16G16B16_USCALED, from_16_uscaled, to_16_uscaled, LL_Int16, 3},
- {PIPE_FORMAT_R16G16B16A16_USCALED, from_16_uscaled, to_16_uscaled, LL_Int16, 4},
-
- {PIPE_FORMAT_R16_SNORM, from_16_snorm, to_16_snorm, LL_Int16, 1},
- {PIPE_FORMAT_R16G16_SNORM, from_16_snorm, to_16_snorm, LL_Int16, 2},
- {PIPE_FORMAT_R16G16B16_SNORM, from_16_snorm, to_16_snorm, LL_Int16, 3},
- {PIPE_FORMAT_R16G16B16A16_SNORM, from_16_snorm, to_16_snorm, LL_Int16, 4},
-
- {PIPE_FORMAT_R16_SSCALED, from_16_sscaled, to_16_sscaled, LL_Int16, 1},
- {PIPE_FORMAT_R16G16_SSCALED, from_16_sscaled, to_16_sscaled, LL_Int16, 2},
- {PIPE_FORMAT_R16G16B16_SSCALED, from_16_sscaled, to_16_sscaled, LL_Int16, 3},
- {PIPE_FORMAT_R16G16B16A16_SSCALED, from_16_sscaled, to_16_sscaled, LL_Int16, 4},
-
- {PIPE_FORMAT_R8_UNORM, from_8_unorm, to_8_unorm, LL_Int8, 1},
- {PIPE_FORMAT_R8G8_UNORM, from_8_unorm, to_8_unorm, LL_Int8, 2},
- {PIPE_FORMAT_R8G8B8_UNORM, from_8_unorm, to_8_unorm, LL_Int8, 3},
- {PIPE_FORMAT_R8G8B8A8_UNORM, from_8_unorm, to_8_unorm, LL_Int8, 4},
-
- {PIPE_FORMAT_R8_USCALED, from_8_uscaled, to_8_uscaled, LL_Int8, 1},
- {PIPE_FORMAT_R8G8_USCALED, from_8_uscaled, to_8_uscaled, LL_Int8, 2},
- {PIPE_FORMAT_R8G8B8_USCALED, from_8_uscaled, to_8_uscaled, LL_Int8, 3},
- {PIPE_FORMAT_R8G8B8A8_USCALED, from_8_uscaled, to_8_uscaled, LL_Int8, 4},
-
- {PIPE_FORMAT_R8_SNORM, from_8_snorm, to_8_snorm, LL_Int8, 1},
- {PIPE_FORMAT_R8G8_SNORM, from_8_snorm, to_8_snorm, LL_Int8, 2},
- {PIPE_FORMAT_R8G8B8_SNORM, from_8_snorm, to_8_snorm, LL_Int8, 3},
- {PIPE_FORMAT_R8G8B8A8_SNORM, from_8_snorm, to_8_snorm, LL_Int8, 4},
-
- {PIPE_FORMAT_R8_SSCALED, from_8_sscaled, to_8_sscaled, LL_Int8, 1},
- {PIPE_FORMAT_R8G8_SSCALED, from_8_sscaled, to_8_sscaled, LL_Int8, 2},
- {PIPE_FORMAT_R8G8B8_SSCALED, from_8_sscaled, to_8_sscaled, LL_Int8, 3},
- {PIPE_FORMAT_R8G8B8A8_SSCALED, from_8_sscaled, to_8_sscaled, LL_Int8, 4},
-
- {PIPE_FORMAT_R32_FIXED, from_32_fixed, to_32_fixed, LL_Int32, 1},
- {PIPE_FORMAT_R32G32_FIXED, from_32_fixed, to_32_fixed, LL_Int32, 2},
- {PIPE_FORMAT_R32G32B32_FIXED, from_32_fixed, to_32_fixed, LL_Int32, 3},
- {PIPE_FORMAT_R32G32B32A32_FIXED, from_32_fixed, to_32_fixed, LL_Int32, 4},
-};
-
-
-static LLVMValueRef
-fetch(struct gallivm_state *gallivm,
- LLVMValueRef ptr, int val_size, int nr_components,
- from_func func)
-{
- int i;
- int offset = 0;
- LLVMValueRef res =
- LLVMConstNull(LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4));
- LLVMValueRef defaults[4];
-
- defaults[0] =
- defaults[1] =
- defaults[2] = lp_build_const_float(gallivm, 0.0);
- defaults[3] = lp_build_const_float(gallivm, 1.0);
-
- for (i = 0; i < nr_components; ++i) {
- LLVMValueRef src_index = lp_build_const_int32(gallivm, offset);
- LLVMValueRef dst_index = lp_build_const_int32(gallivm, i);
- LLVMValueRef src_tmp;
- LLVMValueRef component;
-
- src_tmp = LLVMBuildGEP(gallivm->builder, ptr, &src_index, 1, "src_tmp");
-
- /* convert src_tmp to float */
- component = func(gallivm, src_tmp);
-
- /* vec.comp = component */
- res = LLVMBuildInsertElement(gallivm->builder,
- res,
- component,
- dst_index, "");
- offset += val_size;
- }
- for (; i < 4; ++i) {
- LLVMValueRef dst_index = lp_build_const_int32(gallivm, i);
- res = LLVMBuildInsertElement(gallivm->builder,
- res,
- defaults[i],
- dst_index, "");
- }
- return res;
-}
-
-
-LLVMValueRef
-draw_llvm_translate_from(struct gallivm_state *gallivm,
- LLVMValueRef vbuffer,
- enum pipe_format from_format)
-{
- const struct util_format_description *format_desc;
- LLVMValueRef zero;
- int i;
- struct lp_type type = lp_float32_vec4_type();
-
- /*
- * The above can only cope with straight arrays: no bitfields,
- * swizzles, or half floats.
- */
-
- for (i = 0; i < Elements(translates); ++i) {
- if (translates[i].format == from_format) {
- /*LLVMTypeRef type = ll_type_to_llvm(translates[i].type);*/
- return fetch(gallivm,
- vbuffer,
- ll_type_size(translates[i].type),
- translates[i].num_components,
- translates[i].from);
- }
- }
-
-
- /*
- * This doesn't handle anything bigger than 32bits, or half floats
- * yet.
- *
- * TODO: unify all this code into lp_build_fetch_rgba_aos().
- */
-
- format_desc = util_format_description(from_format);
- zero = LLVMConstNull(LLVMInt32TypeInContext(gallivm->context));
- return lp_build_fetch_rgba_aos(gallivm, format_desc, type, vbuffer, zero, zero, zero);
-}
diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h
index d85deeea7f5..9cede2108db 100644
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -47,8 +47,8 @@
#include "tgsi/tgsi_scan.h"
#ifdef HAVE_LLVM
-#include <llvm-c/ExecutionEngine.h>
struct draw_llvm;
+struct gallivm_state;
#endif
@@ -301,7 +301,6 @@ struct draw_context
#ifdef HAVE_LLVM
struct draw_llvm *llvm;
- struct gallivm_state *own_gallivm;
#endif
struct pipe_sampler_view *sampler_views[PIPE_MAX_VERTEX_SAMPLERS];
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
index 1e17f808408..04b286f0f5b 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
@@ -230,7 +230,7 @@ llvm_pipeline_generic( struct draw_pt_middle_end *middle,
llvm_vert_info.stride = fpme->vertex_size;
llvm_vert_info.verts =
(struct vertex_header *)MALLOC(fpme->vertex_size *
- align(fetch_info->count, 4));
+ align(fetch_info->count, lp_native_vector_width / 32));
if (!llvm_vert_info.verts) {
assert(0);
return;
@@ -423,7 +423,7 @@ draw_pt_fetch_pipeline_or_emit_llvm(struct draw_context *draw)
{
struct llvm_middle_end *fpme = 0;
- if (!draw->llvm || !draw->llvm->gallivm->engine)
+ if (!draw->llvm)
return NULL;
fpme = CALLOC_STRUCT( llvm_middle_end );
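The allocation hunk above pads the vertex count to the native SIMD width in 32-bit lanes instead of a hard-coded 4. A quick model of the arithmetic (illustrative only; the 128/256-bit values for lp_native_vector_width are an assumption based on the SSE/AVX paths elsewhere in this series):

   /* Sketch: how the padded vertex count scales with the native vector width. */
   unsigned lanes  = lp_native_vector_width / 32;      /* 4 with 128-bit SSE, 8 with 256-bit AVX */
   unsigned padded = align(fetch_info->count, lanes);  /* e.g. count 10 -> 12 (SSE) or 16 (AVX) */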
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index 9fc57629822..d226dab5b81 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -75,9 +75,9 @@ lp_build_min_simple(struct lp_build_context *bld,
LLVMValueRef a,
LLVMValueRef b)
{
- LLVMBuilderRef builder = bld->gallivm->builder;
const struct lp_type type = bld->type;
const char *intrinsic = NULL;
+ unsigned intr_size;
LLVMValueRef cond;
assert(lp_check_value(type, a));
@@ -85,31 +85,71 @@ lp_build_min_simple(struct lp_build_context *bld,
/* TODO: optimize the constant case */
- if(type.width * type.length == 128) {
- if(type.floating) {
- if(type.width == 32 && util_cpu_caps.has_sse)
+ if (type.floating && util_cpu_caps.has_sse) {
+ if (type.width == 32) {
+ if (type.length == 1) {
+ intrinsic = "llvm.x86.sse.min.ss";
+ intr_size = 128;
+ }
+ else if (type.length <= 4 || !util_cpu_caps.has_avx) {
intrinsic = "llvm.x86.sse.min.ps";
- if(type.width == 64 && util_cpu_caps.has_sse2)
+ intr_size = 128;
+ }
+ else {
+ intrinsic = "llvm.x86.avx.min.ps.256";
+ intr_size = 256;
+ }
+ }
+ if (type.width == 64 && util_cpu_caps.has_sse2) {
+ if (type.length == 1) {
+ intrinsic = "llvm.x86.sse2.min.sd";
+ intr_size = 128;
+ }
+ else if (type.length == 2 || !util_cpu_caps.has_avx) {
intrinsic = "llvm.x86.sse2.min.pd";
+ intr_size = 128;
+ }
+ else {
+ intrinsic = "llvm.x86.avx.min.pd.256";
+ intr_size = 256;
+ }
}
- else {
- if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
- intrinsic = "llvm.x86.sse2.pminu.b";
- if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
+ }
+ else if (util_cpu_caps.has_sse2 && type.length >= 2) {
+ intr_size = 128;
+ if ((type.width == 8 || type.width == 16) &&
+ (type.width * type.length <= 64) &&
+ (gallivm_debug & GALLIVM_DEBUG_PERF)) {
+ debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
+ __FUNCTION__);
+ }
+ if (type.width == 8 && !type.sign) {
+ intrinsic = "llvm.x86.sse2.pminu.b";
+ }
+ else if (type.width == 16 && type.sign) {
+ intrinsic = "llvm.x86.sse2.pmins.w";
+ }
+ if (util_cpu_caps.has_sse4_1) {
+ if (type.width == 8 && type.sign) {
intrinsic = "llvm.x86.sse41.pminsb";
- if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
+ }
+ if (type.width == 16 && !type.sign) {
intrinsic = "llvm.x86.sse41.pminuw";
- if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
- intrinsic = "llvm.x86.sse2.pmins.w";
- if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
+ }
+ if (type.width == 32 && !type.sign) {
intrinsic = "llvm.x86.sse41.pminud";
- if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
+ }
+ if (type.width == 32 && type.sign) {
intrinsic = "llvm.x86.sse41.pminsd";
+ }
}
}
- if(intrinsic)
- return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
+ if(intrinsic) {
+ return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
+ type,
+ intr_size, a, b);
+ }
cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
return lp_build_select(bld, cond, a, b);
@@ -125,9 +165,9 @@ lp_build_max_simple(struct lp_build_context *bld,
LLVMValueRef a,
LLVMValueRef b)
{
- LLVMBuilderRef builder = bld->gallivm->builder;
const struct lp_type type = bld->type;
const char *intrinsic = NULL;
+ unsigned intr_size;
LLVMValueRef cond;
assert(lp_check_value(type, a));
@@ -135,31 +175,72 @@ lp_build_max_simple(struct lp_build_context *bld,
/* TODO: optimize the constant case */
- if(type.width * type.length == 128) {
- if(type.floating) {
- if(type.width == 32 && util_cpu_caps.has_sse)
+ if (type.floating && util_cpu_caps.has_sse) {
+ if (type.width == 32) {
+ if (type.length == 1) {
+ intrinsic = "llvm.x86.sse.max.ss";
+ intr_size = 128;
+ }
+ else if (type.length <= 4 || !util_cpu_caps.has_avx) {
intrinsic = "llvm.x86.sse.max.ps";
- if(type.width == 64 && util_cpu_caps.has_sse2)
+ intr_size = 128;
+ }
+ else {
+ intrinsic = "llvm.x86.avx.max.ps.256";
+ intr_size = 256;
+ }
+ }
+ if (type.width == 64 && util_cpu_caps.has_sse2) {
+ if (type.length == 1) {
+ intrinsic = "llvm.x86.sse2.max.sd";
+ intr_size = 128;
+ }
+ else if (type.length == 2 || !util_cpu_caps.has_avx) {
intrinsic = "llvm.x86.sse2.max.pd";
+ intr_size = 128;
+ }
+ else {
+ intrinsic = "llvm.x86.avx.max.pd.256";
+ intr_size = 256;
+ }
}
- else {
- if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
- intrinsic = "llvm.x86.sse2.pmaxu.b";
- if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
+ }
+ else if (util_cpu_caps.has_sse2 && type.length >= 2) {
+ intr_size = 128;
+ if ((type.width == 8 || type.width == 16) &&
+ (type.width * type.length <= 64) &&
+ (gallivm_debug & GALLIVM_DEBUG_PERF)) {
+ debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
+ __FUNCTION__);
+ }
+ if (type.width == 8 && !type.sign) {
+ intrinsic = "llvm.x86.sse2.pmaxu.b";
+ intr_size = 128;
+ }
+ else if (type.width == 16 && type.sign) {
+ intrinsic = "llvm.x86.sse2.pmaxs.w";
+ }
+ if (util_cpu_caps.has_sse4_1) {
+ if (type.width == 8 && type.sign) {
intrinsic = "llvm.x86.sse41.pmaxsb";
- if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
+ }
+ if (type.width == 16 && !type.sign) {
intrinsic = "llvm.x86.sse41.pmaxuw";
- if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
- intrinsic = "llvm.x86.sse2.pmaxs.w";
- if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
+ }
+ if (type.width == 32 && !type.sign) {
intrinsic = "llvm.x86.sse41.pmaxud";
- if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
+ }
+ if (type.width == 32 && type.sign) {
intrinsic = "llvm.x86.sse41.pmaxsd";
+ }
}
}
- if(intrinsic)
- return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
+ if(intrinsic) {
+ return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
+ type,
+ intr_size, a, b);
+ }
cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
return lp_build_select(bld, cond, a, b);
@@ -265,15 +346,20 @@ lp_build_add(struct lp_build_context *bld,
}
-/** Return the scalar sum of the elements of a */
+/** Return the scalar sum of the elements of a.
+ * Should avoid this operation whenever possible.
+ */
LLVMValueRef
-lp_build_sum_vector(struct lp_build_context *bld,
- LLVMValueRef a)
+lp_build_horizontal_add(struct lp_build_context *bld,
+ LLVMValueRef a)
{
LLVMBuilderRef builder = bld->gallivm->builder;
const struct lp_type type = bld->type;
LLVMValueRef index, res;
- unsigned i;
+ unsigned i, length;
+ LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
+ LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
+ LLVMValueRef vecres, elem2;
assert(lp_check_value(type, a));
@@ -283,26 +369,191 @@ lp_build_sum_vector(struct lp_build_context *bld,
assert(!bld->type.norm);
- index = lp_build_const_int32(bld->gallivm, 0);
- res = LLVMBuildExtractElement(builder, a, index, "");
+ /*
+    * For byte vectors this could be done much more efficiently with psadbw.
+    * Repeated shuffle/adds are used here instead. Note that with multiple
+    * vectors this can be done more efficiently, as outlined in the Intel
+    * optimization manual.
+ * Note: could cause data rearrangement if used with smaller element
+ * sizes.
+ */
- for (i = 1; i < type.length; i++) {
- index = lp_build_const_int32(bld->gallivm, i);
- if (type.floating)
- res = LLVMBuildFAdd(builder, res,
- LLVMBuildExtractElement(builder,
- a, index, ""),
- "");
- else
- res = LLVMBuildAdd(builder, res,
- LLVMBuildExtractElement(builder,
- a, index, ""),
- "");
+ vecres = a;
+ length = type.length / 2;
+ while (length > 1) {
+ LLVMValueRef vec1, vec2;
+ for (i = 0; i < length; i++) {
+ shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
+ shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
+ }
+ vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
+ LLVMConstVector(shuffles1, length), "");
+ vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
+ LLVMConstVector(shuffles2, length), "");
+ if (type.floating) {
+ vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
+ }
+ else {
+ vecres = LLVMBuildAdd(builder, vec1, vec2, "");
+ }
+ length = length >> 1;
}
+ /* always have vector of size 2 here */
+ assert(length == 1);
+
+ index = lp_build_const_int32(bld->gallivm, 0);
+ res = LLVMBuildExtractElement(builder, vecres, index, "");
+ index = lp_build_const_int32(bld->gallivm, 1);
+ elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
+
+ if (type.floating)
+ res = LLVMBuildFAdd(builder, res, elem2, "");
+ else
+ res = LLVMBuildAdd(builder, res, elem2, "");
+
return res;
}
+/**
+ * Return the horizontal sums of 4 float vectors as a float4 vector.
+ * This uses the technique as outlined in Intel Optimization Manual.
+ */
+static LLVMValueRef
+lp_build_horizontal_add4x4f(struct lp_build_context *bld,
+ LLVMValueRef src[4])
+{
+ struct gallivm_state *gallivm = bld->gallivm;
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef shuffles[4];
+ LLVMValueRef tmp[4];
+ LLVMValueRef sumtmp[2], shuftmp[2];
+
+ /* lower half of regs */
+ shuffles[0] = lp_build_const_int32(gallivm, 0);
+ shuffles[1] = lp_build_const_int32(gallivm, 1);
+ shuffles[2] = lp_build_const_int32(gallivm, 4);
+ shuffles[3] = lp_build_const_int32(gallivm, 5);
+ tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
+ LLVMConstVector(shuffles, 4), "");
+ tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
+ LLVMConstVector(shuffles, 4), "");
+
+ /* upper half of regs */
+ shuffles[0] = lp_build_const_int32(gallivm, 2);
+ shuffles[1] = lp_build_const_int32(gallivm, 3);
+ shuffles[2] = lp_build_const_int32(gallivm, 6);
+ shuffles[3] = lp_build_const_int32(gallivm, 7);
+ tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
+ LLVMConstVector(shuffles, 4), "");
+ tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
+ LLVMConstVector(shuffles, 4), "");
+
+ sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
+ sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
+
+ shuffles[0] = lp_build_const_int32(gallivm, 0);
+ shuffles[1] = lp_build_const_int32(gallivm, 2);
+ shuffles[2] = lp_build_const_int32(gallivm, 4);
+ shuffles[3] = lp_build_const_int32(gallivm, 6);
+ shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
+ LLVMConstVector(shuffles, 4), "");
+
+ shuffles[0] = lp_build_const_int32(gallivm, 1);
+ shuffles[1] = lp_build_const_int32(gallivm, 3);
+ shuffles[2] = lp_build_const_int32(gallivm, 5);
+ shuffles[3] = lp_build_const_int32(gallivm, 7);
+ shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
+ LLVMConstVector(shuffles, 4), "");
+
+ return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
+}
+
+
+/*
+ * partially horizontally add 2-4 float vectors with length nx4,
+ * i.e. only four adjacent values in each vector will be added,
+ * assuming values are really grouped in 4 which also determines
+ * output order.
+ *
+ * Return a vector of the same length as the initial vectors,
+ * with the excess elements (if any) being undefined.
+ * The element order is independent of number of input vectors.
+ * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
+ * the output order thus will be
+ * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
+ */
+LLVMValueRef
+lp_build_hadd_partial4(struct lp_build_context *bld,
+ LLVMValueRef vectors[],
+ unsigned num_vecs)
+{
+ struct gallivm_state *gallivm = bld->gallivm;
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef ret_vec;
+ LLVMValueRef tmp[4];
+ const char *intrinsic = NULL;
+
+ assert(num_vecs >= 2 && num_vecs <= 4);
+ assert(bld->type.floating);
+
+ /* only use this with at least 2 vectors, as it is sort of expensive
+ * (depending on cpu) and we always need two horizontal adds anyway,
+ * so a shuffle/add approach might be better.
+ */
+
+ tmp[0] = vectors[0];
+ tmp[1] = vectors[1];
+
+ tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
+ tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
+
+ if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
+ bld->type.length == 4) {
+ intrinsic = "llvm.x86.sse3.hadd.ps";
+ }
+ else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
+ bld->type.length == 8) {
+ intrinsic = "llvm.x86.avx.hadd.ps.256";
+ }
+ if (intrinsic) {
+ tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
+ lp_build_vec_type(gallivm, bld->type),
+ tmp[0], tmp[1]);
+ if (num_vecs > 2) {
+ tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
+ lp_build_vec_type(gallivm, bld->type),
+ tmp[2], tmp[3]);
+ }
+ else {
+ tmp[1] = tmp[0];
+ }
+ return lp_build_intrinsic_binary(builder, intrinsic,
+ lp_build_vec_type(gallivm, bld->type),
+ tmp[0], tmp[1]);
+ }
+
+ if (bld->type.length == 4) {
+ ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
+ }
+ else {
+ LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
+ unsigned j;
+ unsigned num_iter = bld->type.length / 4;
+ struct lp_type parttype = bld->type;
+ parttype.length = 4;
+ for (j = 0; j < num_iter; j++) {
+ LLVMValueRef partsrc[4];
+ unsigned i;
+ for (i = 0; i < 4; i++) {
+ partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
+ }
+ partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
+ }
+ ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
+ }
+ return ret_vec;
+}
/**
* Generate a - b
@@ -553,7 +804,7 @@ lp_build_mul_imm(struct lp_build_context *bld,
if(bld->type.floating) {
#if 0
/*
- * Power of two multiplication by directly manipulating the mantissa.
+ * Power of two multiplication by directly manipulating the exponent.
*
* XXX: This might not be always faster, it will introduce a small error
* for multiplication by zero, and it will produce wrong results
@@ -612,7 +863,8 @@ lp_build_div(struct lp_build_context *bld,
return LLVMConstUDiv(a, b);
}
- if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4 &&
+ if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
+ (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
type.floating)
return lp_build_mul(bld, a, lp_build_rcp(bld, b));
@@ -871,6 +1123,12 @@ lp_build_abs(struct lp_build_context *bld,
return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
}
}
+ else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
+ (gallivm_debug & GALLIVM_DEBUG_PERF) &&
+ (type.width == 8 || type.width == 16 || type.width == 32)) {
+ debug_printf("%s: inefficient code, should split vectors manually\n",
+ __FUNCTION__);
+ }
return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
}
@@ -934,6 +1192,7 @@ lp_build_sgn(struct lp_build_context *bld,
else
{
/* signed int/norm/fixed point */
+ /* could use psign with sse3 and appropriate vectors here */
LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
res = lp_build_select(bld, cond, bld->one, minus_one);
@@ -1000,7 +1259,16 @@ lp_build_int_to_float(struct lp_build_context *bld,
return LLVMBuildSIToFP(builder, a, vec_type, "");
}
+static boolean
+sse41_rounding_available(const struct lp_type type)
+{
+ if ((util_cpu_caps.has_sse4_1 &&
+ (type.length == 1 || type.width*type.length == 128)) ||
+ (util_cpu_caps.has_avx && type.width*type.length == 256))
+ return TRUE;
+ return FALSE;
+}
enum lp_build_round_sse41_mode
{
@@ -1065,18 +1333,34 @@ lp_build_round_sse41(struct lp_build_context *bld,
res = LLVMBuildExtractElement(builder, res, index0, "");
}
else {
- assert(type.width*type.length == 128);
-
- switch(type.width) {
- case 32:
- intrinsic = "llvm.x86.sse41.round.ps";
- break;
- case 64:
- intrinsic = "llvm.x86.sse41.round.pd";
- break;
- default:
- assert(0);
- return bld->undef;
+ if (type.width * type.length == 128) {
+ switch(type.width) {
+ case 32:
+ intrinsic = "llvm.x86.sse41.round.ps";
+ break;
+ case 64:
+ intrinsic = "llvm.x86.sse41.round.pd";
+ break;
+ default:
+ assert(0);
+ return bld->undef;
+ }
+ }
+ else {
+ assert(type.width * type.length == 256);
+ assert(util_cpu_caps.has_avx);
+
+ switch(type.width) {
+ case 32:
+ intrinsic = "llvm.x86.avx.round.ps.256";
+ break;
+ case 64:
+ intrinsic = "llvm.x86.avx.round.pd.256";
+ break;
+ default:
+ assert(0);
+ return bld->undef;
+ }
}
res = lp_build_intrinsic_binary(builder, intrinsic,
@@ -1125,10 +1409,15 @@ lp_build_iround_nearest_sse2(struct lp_build_context *bld,
ret_type, arg);
}
else {
- assert(type.width*type.length == 128);
-
- intrinsic = "llvm.x86.sse2.cvtps2dq";
+ if (type.width* type.length == 128) {
+ intrinsic = "llvm.x86.sse2.cvtps2dq";
+ }
+ else {
+ assert(type.width*type.length == 256);
+ assert(util_cpu_caps.has_avx);
+ intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
+ }
res = lp_build_intrinsic_unary(builder, intrinsic,
ret_type, a);
}
@@ -1152,8 +1441,7 @@ lp_build_trunc(struct lp_build_context *bld,
assert(type.floating);
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
+ if (sse41_rounding_available(type)) {
return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
}
else {
@@ -1183,8 +1471,7 @@ lp_build_round(struct lp_build_context *bld,
assert(type.floating);
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
+ if (sse41_rounding_available(type)) {
return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
}
else {
@@ -1212,8 +1499,7 @@ lp_build_floor(struct lp_build_context *bld,
assert(type.floating);
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
+ if (sse41_rounding_available(type)) {
return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
}
else {
@@ -1241,8 +1527,7 @@ lp_build_ceil(struct lp_build_context *bld,
assert(type.floating);
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
+ if (sse41_rounding_available(type)) {
return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
}
else {
@@ -1269,6 +1554,34 @@ lp_build_fract(struct lp_build_context *bld,
/**
+ * Prevent returning a fractional part of 1.0 for very small negative values of
+ * 'a' by clamping against 0.99999(9).
+ */
+static inline LLVMValueRef
+clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
+{
+ LLVMValueRef max;
+
+ /* this is the largest number smaller than 1.0 representable as float */
+ max = lp_build_const_vec(bld->gallivm, bld->type,
+ 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
+ return lp_build_min(bld, fract, max);
+}
+
+
+/**
+ * Same as lp_build_fract, but guarantees that the result is always smaller
+ * than one.
+ */
+LLVMValueRef
+lp_build_fract_safe(struct lp_build_context *bld,
+ LLVMValueRef a)
+{
+ return clamp_fract(bld, lp_build_fract(bld, a));
+}
+
+
+/**
* Return the integer part of a float (vector) value (== round toward zero).
* The returned value is an integer (vector).
* Ex: itrunc(-1.5) = -1
@@ -1307,12 +1620,12 @@ lp_build_iround(struct lp_build_context *bld,
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse2 &&
- ((type.width == 32) && (type.length == 1 || type.length == 4))) {
+ if ((util_cpu_caps.has_sse2 &&
+ ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
+ (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
return lp_build_iround_nearest_sse2(bld, a);
}
- else if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
+ if (sse41_rounding_available(type)) {
res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
}
else {
@@ -1362,14 +1675,12 @@ lp_build_ifloor(struct lp_build_context *bld,
assert(type.floating);
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
- res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
- }
- else {
- res = a;
-
- if (type.sign) {
+ res = a;
+ if (type.sign) {
+ if (sse41_rounding_available(type)) {
+ res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
+ }
+ else {
/* Take the sign bit and add it to 1 constant */
LLVMTypeRef vec_type = bld->vec_type;
unsigned mantissa = lp_mantissa(type);
@@ -1423,8 +1734,7 @@ lp_build_iceil(struct lp_build_context *bld,
assert(type.floating);
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
+ if (sse41_rounding_available(type)) {
res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
}
else {
@@ -1470,7 +1780,7 @@ lp_build_iceil(struct lp_build_context *bld,
* Combined ifloor() & fract().
*
* Preferred to calling the functions separately, as it will ensure that the
- * stratergy (floor() vs ifloor()) that results in less redundant work is used.
+ * strategy (floor() vs ifloor()) that results in less redundant work is used.
*/
void
lp_build_ifloor_fract(struct lp_build_context *bld,
@@ -1485,8 +1795,7 @@ lp_build_ifloor_fract(struct lp_build_context *bld,
assert(type.floating);
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
+ if (sse41_rounding_available(type)) {
/*
* floor() is easier.
*/
@@ -1507,6 +1816,21 @@ lp_build_ifloor_fract(struct lp_build_context *bld,
}
+/**
+ * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
+ * always smaller than one.
+ */
+void
+lp_build_ifloor_fract_safe(struct lp_build_context *bld,
+ LLVMValueRef a,
+ LLVMValueRef *out_ipart,
+ LLVMValueRef *out_fpart)
+{
+ lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
+ *out_fpart = clamp_fract(bld, *out_fpart);
+}
+
+
LLVMValueRef
lp_build_sqrt(struct lp_build_context *bld,
LLVMValueRef a)
@@ -1519,10 +1843,14 @@ lp_build_sqrt(struct lp_build_context *bld,
assert(lp_check_value(type, a));
/* TODO: optimize the constant case */
- /* TODO: optimize the constant case */
assert(type.floating);
- util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
+ if (type.length == 1) {
+ util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.f%u", type.width);
+ }
+ else {
+ util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
+ }
return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
}
@@ -1586,19 +1914,28 @@ lp_build_rcp(struct lp_build_context *bld,
* - it doesn't even get the reciprocate of 1.0 exactly
* - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf
* - for recent processors the benefit over DIVPS is marginal, a case
- * depedent
+ * dependent
*
* We could still use it on certain processors if benchmarks show that the
* RCPPS plus necessary workarounds are still preferrable to DIVPS; or for
* particular uses that require less workarounds.
*/
- if (FALSE && util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
+ if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
+ (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
const unsigned num_iterations = 0;
LLVMValueRef res;
unsigned i;
+ const char *intrinsic = NULL;
- res = lp_build_intrinsic_unary(builder, "llvm.x86.sse.rcp.ps", bld->vec_type, a);
+ if (type.length == 4) {
+ intrinsic = "llvm.x86.sse.rcp.ps";
+ }
+ else {
+ intrinsic = "llvm.x86.avx.rcp.ps.256";
+ }
+
+ res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
for (i = 0; i < num_iterations; ++i) {
res = lp_build_rcp_refine(bld, a, res);
@@ -1653,12 +1990,22 @@ lp_build_rsqrt(struct lp_build_context *bld,
assert(type.floating);
- if (util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
+ if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
+ (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
const unsigned num_iterations = 1;
LLVMValueRef res;
unsigned i;
+ const char *intrinsic = NULL;
+
+ if (type.length == 4) {
+ intrinsic = "llvm.x86.sse.rsqrt.ps";
+ }
+ else {
+ intrinsic = "llvm.x86.avx.rsqrt.ps.256";
+ }
+
+ res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
- res = lp_build_intrinsic_unary(builder, "llvm.x86.sse.rsqrt.ps", bld->vec_type, a);
for (i = 0; i < num_iterations; ++i) {
res = lp_build_rsqrt_refine(bld, a, res);
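For readers following the new lp_build_horizontal_add loop above: it splits the vector in half with two shuffles, adds the halves, and repeats until two elements remain, which are then extracted and added. A scalar model of that reduction (a sketch only, assuming a power-of-two length; not the generated IR):

   #include <string.h>

   /* Pairwise halving reduction mirroring the shuffle/add loop. */
   static float
   horizontal_add_model(const float *v, unsigned length)  /* length: power of two, >= 2 */
   {
      float tmp[32];   /* LP_MAX_VECTOR_LENGTH */
      unsigned i;
      memcpy(tmp, v, length * sizeof *tmp);
      while (length > 2) {
         length /= 2;
         for (i = 0; i < length; i++)
            tmp[i] = tmp[i] + tmp[i + length];   /* vec1 + vec2 in the IR version */
      }
      return tmp[0] + tmp[1];                    /* final extract / extract / add */
   }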
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.h b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
index aeb987ff352..60b9907e60f 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
@@ -57,8 +57,13 @@ lp_build_add(struct lp_build_context *bld,
LLVMValueRef b);
LLVMValueRef
-lp_build_sum_vector(struct lp_build_context *bld,
- LLVMValueRef a);
+lp_build_horizontal_add(struct lp_build_context *bld,
+ LLVMValueRef a);
+
+LLVMValueRef
+lp_build_hadd_partial4(struct lp_build_context *bld,
+ LLVMValueRef vectors[],
+ unsigned num_vecs);
LLVMValueRef
lp_build_sub(struct lp_build_context *bld,
@@ -157,6 +162,10 @@ lp_build_fract(struct lp_build_context *bld,
LLVMValueRef a);
LLVMValueRef
+lp_build_fract_safe(struct lp_build_context *bld,
+ LLVMValueRef a);
+
+LLVMValueRef
lp_build_ifloor(struct lp_build_context *bld,
LLVMValueRef a);
LLVMValueRef
@@ -177,6 +186,12 @@ lp_build_ifloor_fract(struct lp_build_context *bld,
LLVMValueRef *out_ipart,
LLVMValueRef *out_fpart);
+void
+lp_build_ifloor_fract_safe(struct lp_build_context *bld,
+ LLVMValueRef a,
+ LLVMValueRef *out_ipart,
+ LLVMValueRef *out_fpart);
+
LLVMValueRef
lp_build_sqrt(struct lp_build_context *bld,
LLVMValueRef a);
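The *_safe variants declared above clamp the fractional part against the largest float strictly below 1.0. For standard binary32 (23-bit mantissa) the bound used in clamp_fract works out to:

   max = 1.0 - 1.0 / 2^(mantissa + 1) = 1 - 2^-24 ≈ 0.99999994

which is exactly the predecessor of 1.0f (bit pattern 0x3f7fffff), so lp_build_fract_safe can never return 1.0 even for tiny negative inputs.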
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_const.c b/src/gallium/auxiliary/gallivm/lp_bld_const.c
index 59e8fb2ed6e..35799a1ef8e 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_const.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_const.c
@@ -37,6 +37,7 @@
#include "util/u_debug.h"
#include "util/u_math.h"
+#include "util/u_half.h"
#include "lp_bld_type.h"
#include "lp_bld_const.h"
@@ -50,10 +51,12 @@ lp_mantissa(struct lp_type type)
if(type.floating) {
switch(type.width) {
+ case 16:
+ return 10;
case 32:
return 23;
case 64:
- return 53;
+ return 52;
default:
assert(0);
return 0;
@@ -136,6 +139,8 @@ lp_const_min(struct lp_type type)
if (type.floating) {
switch(type.width) {
+ case 16:
+ return -65504;
case 32:
return -FLT_MAX;
case 64:
@@ -169,6 +174,8 @@ lp_const_max(struct lp_type type)
if (type.floating) {
switch(type.width) {
+ case 16:
+ return 65504;
case 32:
return FLT_MAX;
case 64:
@@ -196,6 +203,8 @@ lp_const_eps(struct lp_type type)
{
if (type.floating) {
switch(type.width) {
+ case 16:
+ return 2E-10;
case 32:
return FLT_EPSILON;
case 64:
@@ -247,7 +256,9 @@ lp_build_one(struct gallivm_state *gallivm, struct lp_type type)
elem_type = lp_build_elem_type(gallivm, type);
- if(type.floating)
+ if(type.floating && type.width == 16)
+ elems[0] = LLVMConstInt(elem_type, util_float_to_half(1.0f), 0);
+ else if(type.floating)
elems[0] = LLVMConstReal(elem_type, 1.0);
else if(type.fixed)
elems[0] = LLVMConstInt(elem_type, 1LL << (type.width/2), 0);
@@ -292,7 +303,9 @@ lp_build_const_elem(struct gallivm_state *gallivm,
LLVMTypeRef elem_type = lp_build_elem_type(gallivm, type);
LLVMValueRef elem;
- if(type.floating) {
+ if(type.floating && type.width == 16) {
+ elem = LLVMConstInt(elem_type, util_float_to_half((float)val), 0);
+ } else if(type.floating) {
elem = LLVMConstReal(elem_type, val);
}
else {
@@ -364,20 +377,10 @@ lp_build_const_aos(struct gallivm_state *gallivm,
if(swizzle == NULL)
swizzle = default_swizzle;
- if(type.floating) {
- elems[swizzle[0]] = LLVMConstReal(elem_type, r);
- elems[swizzle[1]] = LLVMConstReal(elem_type, g);
- elems[swizzle[2]] = LLVMConstReal(elem_type, b);
- elems[swizzle[3]] = LLVMConstReal(elem_type, a);
- }
- else {
- double dscale = lp_const_scale(type);
-
- elems[swizzle[0]] = LLVMConstInt(elem_type, round(r*dscale), 0);
- elems[swizzle[1]] = LLVMConstInt(elem_type, round(g*dscale), 0);
- elems[swizzle[2]] = LLVMConstInt(elem_type, round(b*dscale), 0);
- elems[swizzle[3]] = LLVMConstInt(elem_type, round(a*dscale), 0);
- }
+ elems[swizzle[0]] = lp_build_const_elem(gallivm, type, r);
+ elems[swizzle[1]] = lp_build_const_elem(gallivm, type, g);
+ elems[swizzle[2]] = lp_build_const_elem(gallivm, type, b);
+ elems[swizzle[3]] = lp_build_const_elem(gallivm, type, a);
for(i = 4; i < type.length; ++i)
elems[i] = elems[i % 4];
@@ -452,7 +455,7 @@ lp_build_const_string(struct gallivm_state *gallivm,
/**
* Build a callable function pointer.
*
- * We this casts instead of LLVMAddGlobalMapping()
+ * We use function pointer constants instead of LLVMAddGlobalMapping()
* to work around a bug in LLVM 2.6, and for efficiency/simplicity.
*/
LLVMValueRef
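The 16-bit cases added above assume the IEEE half layout (1 sign, 5 exponent, 10 mantissa bits), hence the 10-bit mantissa, the ±65504 range, and constants built via util_float_to_half(). A couple of sanity checks, as a hedged sketch using the u_half.h helpers the patch already includes:

   #include "util/u_half.h"
   #include <assert.h>

   static void
   check_half_constants(void)
   {
      assert(util_float_to_half(1.0f) == 0x3c00);      /* bias-15 exponent, empty mantissa */
      assert(util_half_to_float(0x7bff) == 65504.0f);  /* largest finite half value */
   }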
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
index 0973e1f16f3..0399709faad 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -70,6 +70,66 @@
#include "lp_bld_arit.h"
#include "lp_bld_pack.h"
#include "lp_bld_conv.h"
+#include "lp_bld_logic.h"
+
+
+/**
+ * Converts int16 half-float to float32
+ * Note this can be performed in a single instruction if vcvtph2ps is available
+ * (the F16C extension, originally proposed as part of SSE5)
+ * [llvm.x86.vcvtph2ps / _mm_cvtph_ps]
+ *
+ * @param src_type <vector> type of int16
+ * @param src value to convert
+ *
+ * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
+ */
+LLVMValueRef
+lp_build_half_to_float(struct gallivm_state *gallivm,
+ struct lp_type src_type,
+ LLVMValueRef src)
+{
+ struct lp_type f32_type = lp_type_float_vec(32, 32 * src_type.length);
+ struct lp_type i32_type = lp_type_int_vec(32, 32 * src_type.length);
+
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type);
+ LLVMTypeRef float_vec_type = lp_build_vec_type(gallivm, f32_type);
+
+ /* Constants */
+ LLVMValueRef i32_13 = lp_build_const_int_vec(gallivm, i32_type, 13);
+ LLVMValueRef i32_16 = lp_build_const_int_vec(gallivm, i32_type, 16);
+ LLVMValueRef i32_mask_nosign = lp_build_const_int_vec(gallivm, i32_type, 0x7fff);
+ LLVMValueRef i32_was_infnan = lp_build_const_int_vec(gallivm, i32_type, 0x7bff);
+ LLVMValueRef i32_exp_infnan = lp_build_const_int_vec(gallivm, i32_type, 0xff << 23);
+ LLVMValueRef f32_magic = LLVMBuildBitCast(builder,
+ lp_build_const_int_vec(gallivm, i32_type, (254 - 15) << 23),
+ float_vec_type, "");
+
+ /* Convert int16 vector to int32 vector by zero ext */
+ LLVMValueRef h = LLVMBuildZExt(builder, src, int_vec_type, "");
+
+ /* Exponent / mantissa bits */
+ LLVMValueRef expmant = LLVMBuildAnd(builder, i32_mask_nosign, h, "");
+ LLVMValueRef shifted = LLVMBuildBitCast(builder, LLVMBuildShl(builder, expmant, i32_13, ""), float_vec_type, "");
+
+ /* Exponent adjust */
+ LLVMValueRef scaled = LLVMBuildBitCast(builder, LLVMBuildFMul(builder, shifted, f32_magic, ""), int_vec_type, "");
+
+ /* Make sure Inf/NaN survive */
+ LLVMValueRef b_wasinfnan = lp_build_compare(gallivm, i32_type, PIPE_FUNC_GREATER, expmant, i32_was_infnan);
+ LLVMValueRef infnanexp = LLVMBuildAnd(builder, b_wasinfnan, i32_exp_infnan, "");
+
+ /* Sign bit */
+ LLVMValueRef justsign = LLVMBuildXor(builder, h, expmant, "");
+ LLVMValueRef sign = LLVMBuildShl(builder, justsign, i32_16, "");
+
+ /* Combine result */
+ LLVMValueRef sign_inf = LLVMBuildOr(builder, sign, infnanexp, "");
+ LLVMValueRef final = LLVMBuildOr(builder, scaled, sign_inf, "");
+
+ /* Cast from int32 vector to float32 vector */
+ return LLVMBuildBitCast(builder, final, float_vec_type, "");
+}
/**
@@ -334,6 +394,8 @@ lp_build_conv(struct gallivm_state *gallivm,
dst_type.width == 8 &&
dst_type.length == 16 &&
+ 4 * num_dsts == num_srcs &&
+
util_cpu_caps.has_sse2)
{
struct lp_build_context bld;
@@ -371,6 +433,76 @@ lp_build_conv(struct gallivm_state *gallivm,
return;
}
+ /* Special case 2x8f --> 1x16ub
+ */
+ else if (src_type.floating == 1 &&
+ src_type.fixed == 0 &&
+ src_type.sign == 1 &&
+ src_type.norm == 0 &&
+ src_type.width == 32 &&
+ src_type.length == 8 &&
+
+ dst_type.floating == 0 &&
+ dst_type.fixed == 0 &&
+ dst_type.sign == 0 &&
+ dst_type.norm == 1 &&
+ dst_type.width == 8 &&
+ dst_type.length == 16 &&
+
+ 2 * num_dsts == num_srcs &&
+
+ util_cpu_caps.has_avx) {
+
+ struct lp_build_context bld;
+ struct lp_type int16_type = dst_type;
+ struct lp_type int32_type = dst_type;
+ LLVMValueRef const_255f;
+ unsigned i;
+
+ lp_build_context_init(&bld, gallivm, src_type);
+
+ int16_type.width *= 2;
+ int16_type.length /= 2;
+ int16_type.sign = 1;
+
+ int32_type.width *= 4;
+ int32_type.length /= 4;
+ int32_type.sign = 1;
+
+ const_255f = lp_build_const_vec(gallivm, src_type, 255.0f);
+
+ for (i = 0; i < num_dsts; ++i, src += 2) {
+ LLVMValueRef lo, hi, a, b;
+
+ a = LLVMBuildFMul(builder, src[0], const_255f, "");
+ b = LLVMBuildFMul(builder, src[1], const_255f, "");
+
+ a = lp_build_iround(&bld, a);
+ b = lp_build_iround(&bld, b);
+
+ tmp[0] = lp_build_extract_range(gallivm, a, 0, 4);
+ tmp[1] = lp_build_extract_range(gallivm, a, 4, 4);
+ tmp[2] = lp_build_extract_range(gallivm, b, 0, 4);
+ tmp[3] = lp_build_extract_range(gallivm, b, 4, 4);
+
+ /* relying on clamping behavior of sse2 intrinsics here */
+ lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
+ hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
+ dst[i] = lp_build_pack2(gallivm, int16_type, dst_type, lo, hi);
+ }
+ return;
+ }
+
+ /* Pre convert half-floats to floats
+ */
+ else if (src_type.floating && src_type.width == 16)
+ {
+ for(i = 0; i < num_tmps; ++i)
+ tmp[i] = lp_build_half_to_float(gallivm, src_type, tmp[i]);
+
+ tmp_type.width = 32;
+ }
+
/*
* Clamp if necessary
*/
@@ -580,7 +712,7 @@ lp_build_conv(struct gallivm_state *gallivm,
* This will convert the integer masks that match the given types.
*
* The mask values should 0 or -1, i.e., all bits either set to zero or one.
- * Any other value will likely cause in unpredictable results.
+ * Any other value will likely cause unpredictable results.
*
* This is basically a very trimmed down version of lp_build_conv.
*/
@@ -591,8 +723,6 @@ lp_build_conv_mask(struct gallivm_state *gallivm,
const LLVMValueRef *src, unsigned num_srcs,
LLVMValueRef *dst, unsigned num_dsts)
{
- /* Register width must remain constant */
- assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
/* We must not loose or gain channels. Only precision */
assert(src_type.length * num_srcs == dst_type.length * num_dsts);
@@ -617,16 +747,5 @@ lp_build_conv_mask(struct gallivm_state *gallivm,
* Truncate or expand bit width
*/
- if(src_type.width > dst_type.width) {
- assert(num_dsts == 1);
- dst[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, src, num_srcs);
- }
- else if(src_type.width < dst_type.width) {
- assert(num_srcs == 1);
- lp_build_unpack(gallivm, src_type, dst_type, src[0], dst, num_dsts);
- }
- else {
- assert(num_srcs == num_dsts);
- memcpy(dst, src, num_dsts * sizeof *dst);
- }
+ lp_build_resize(gallivm, src_type, dst_type, src, num_srcs, dst, num_dsts);
}
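The new lp_build_half_to_float above vectorizes the bit trick from the referenced article: shift the half exponent/mantissa into float32 position, rebias the exponent with one multiply, then patch Inf/NaN and the sign bit back in. A scalar C sketch of the same steps (illustrative only, not part of the patch):

   #include <stdint.h>
   #include <string.h>

   static float
   half_to_float_model(uint16_t h)
   {
      const uint32_t magic_bits = (uint32_t)(254 - 15) << 23;  /* 2^112: rebias exponent 15 -> 127 */
      uint32_t expmant = h & 0x7fff;                 /* exponent + mantissa bits */
      uint32_t shifted = expmant << 13;              /* align with the float32 fields */
      uint32_t sign = (uint32_t)(h & 0x8000) << 16;  /* sign bit moved to bit 31 */
      uint32_t bits;
      float f, magic;

      memcpy(&magic, &magic_bits, sizeof magic);
      memcpy(&f, &shifted, sizeof f);
      f *= magic;                                    /* exponent adjust; also fixes up denormals */
      memcpy(&bits, &f, sizeof bits);
      if (expmant > 0x7bff)                          /* source was Inf or NaN */
         bits |= 0xffu << 23;                        /* force an all-ones exponent */
      bits |= sign;
      memcpy(&f, &bits, sizeof f);
      return f;
   }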
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.h b/src/gallium/auxiliary/gallivm/lp_bld_conv.h
index cec655980fa..c830fbef5f2 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.h
@@ -42,6 +42,10 @@
struct lp_type;
+LLVMValueRef
+lp_build_half_to_float(struct gallivm_state *gallivm,
+ struct lp_type src_type,
+ LLVMValueRef src);
LLVMValueRef
lp_build_clamped_float_to_unsigned_norm(struct gallivm_state *gallivm,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp b/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp
index 444b70a678c..93505f3da45 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp
+++ b/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp
@@ -35,10 +35,8 @@
#if HAVE_LLVM >= 0x0300
#include <llvm/Support/TargetRegistry.h>
-#include <llvm/Support/TargetSelect.h>
#else /* HAVE_LLVM < 0x0300 */
#include <llvm/Target/TargetRegistry.h>
-#include <llvm/Target/TargetSelect.h>
#endif /* HAVE_LLVM < 0x0300 */
#if HAVE_LLVM >= 0x0209
@@ -183,7 +181,7 @@ lp_disassemble(const void* func)
/*
* Limit disassembly to this extent
*/
- const uint64_t extent = 0x10000;
+ const uint64_t extent = 96 * 1024;
uint64_t max_pc = 0;
@@ -200,24 +198,6 @@ lp_disassemble(const void* func)
std::string Error;
const Target *T = TargetRegistry::lookupTarget(Triple, Error);
-#if HAVE_LLVM >= 0x0208
- InitializeNativeTargetAsmPrinter();
-#elif defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
- LLVMInitializeX86AsmPrinter();
-#elif defined(PIPE_ARCH_ARM)
- LLVMInitializeARMAsmPrinter();
-#elif defined(PIPE_ARCH_PPC)
- LLVMInitializePowerPCAsmPrinter();
-#endif
-
-#if HAVE_LLVM >= 0x0301
- InitializeNativeTargetDisassembler();
-#elif defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
- LLVMInitializeX86Disassembler();
-#elif defined(PIPE_ARCH_ARM)
- LLVMInitializeARMDisassembler();
-#endif
-
#if HAVE_LLVM >= 0x0300
OwningPtr<const MCAsmInfo> AsmInfo(T->createMCAsmInfo(Triple));
#else
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_flow.c b/src/gallium/auxiliary/gallivm/lp_bld_flow.c
index d2b3713ed2d..30da44e5b9c 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_flow.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_flow.c
@@ -131,6 +131,15 @@ lp_build_mask_check(struct lp_build_mask_context *mask)
value = lp_build_mask_value(mask);
+ /*
+ * XXX this doesn't quite generate the most efficient code possible when
+ * the masks are vectors whose elements each have all bits set to the
+ * same value.
+ * movmskps/pmovmskb would be more efficient for getting the required
+ * value into an ordinary register (certainly with 8 floats).
+ * Not sure if llvm could figure that out on its own.
+ */
+
/* cond = (mask == 0) */
cond = LLVMBuildICmp(builder,
LLVMIntEQ,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format.h b/src/gallium/auxiliary/gallivm/lp_bld_format.h
index 04142d905b1..3608a68202f 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format.h
@@ -67,6 +67,13 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
LLVMValueRef i,
LLVMValueRef j);
+LLVMValueRef
+lp_build_fetch_rgba_aos_array(struct gallivm_state *gallivm,
+ const struct util_format_description *format_desc,
+ struct lp_type type,
+ LLVMValueRef base_ptr,
+ LLVMValueRef offset);
+
/*
* SoA
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
index e4b8da6bcfd..9591bcfb2c7 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
@@ -470,6 +470,11 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
return lp_build_format_swizzle_aos(format_desc, &bld, res);
}
+ /* If all channels are of the same type and we are not using half-floats */
+ if (util_format_is_array(format_desc)) {
+ return lp_build_fetch_rgba_aos_array(gallivm, format_desc, type, base_ptr, offset);
+ }
+
/*
* YUV / subsampled formats
*/
@@ -601,7 +606,6 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
return res;
}
-
/*
* Fallback to util_format_description::fetch_rgba_float().
*/
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c
new file mode 100644
index 00000000000..b8ec379d76f
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c
@@ -0,0 +1,102 @@
+/**************************************************************************
+ *
+ * Copyright 2012 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "lp_bld_const.h"
+#include "lp_bld_struct.h"
+#include "lp_bld_format.h"
+#include "lp_bld_debug.h"
+#include "lp_bld_type.h"
+#include "lp_bld_conv.h"
+#include "lp_bld_pack.h"
+
+#include "util/u_memory.h"
+#include "util/u_format.h"
+#include "pipe/p_state.h"
+
+/**
+ * @brief lp_build_fetch_rgba_aos_array
+ *
+ * \param format_desc describes the format of the image we're fetching from
+ * \param dst_type output type
+ * \param base_ptr address of the pixel block (or the texel if uncompressed)
+ * \param offset pointer offset from base_ptr to the texel to fetch
+ */
+LLVMValueRef
+lp_build_fetch_rgba_aos_array(struct gallivm_state *gallivm,
+ const struct util_format_description *format_desc,
+ struct lp_type dst_type,
+ LLVMValueRef base_ptr,
+ LLVMValueRef offset)
+{
+ struct lp_build_context bld;
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMTypeRef src_elem_type, src_vec_type;
+ LLVMValueRef ptr, res = NULL;
+ struct lp_type src_type;
+
+ memset(&src_type, 0, sizeof src_type);
+ src_type.floating = format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT;
+ src_type.fixed = format_desc->channel[0].type == UTIL_FORMAT_TYPE_FIXED;
+ src_type.sign = format_desc->channel[0].type != UTIL_FORMAT_TYPE_UNSIGNED;
+ src_type.norm = format_desc->channel[0].normalized;
+ src_type.width = format_desc->channel[0].size;
+ src_type.length = format_desc->nr_channels;
+
+ assert(src_type.length <= dst_type.length);
+
+ src_elem_type = lp_build_elem_type(gallivm, src_type);
+ src_vec_type = lp_build_vec_type(gallivm, src_type);
+
+ /* Read whole vector from memory, unaligned */
+ if (!res) {
+ ptr = LLVMBuildGEP(builder, base_ptr, &offset, 1, "");
+ ptr = LLVMBuildPointerCast(builder, ptr, LLVMPointerType(src_vec_type, 0), "");
+ res = LLVMBuildLoad(builder, ptr, "");
+ lp_set_load_alignment(res, src_type.width / 8);
+ }
+
+ /* Truncate doubles to float */
+ if (src_type.floating && src_type.width == 64) {
+ src_type.width = 32;
+ src_vec_type = lp_build_vec_type(gallivm, src_type);
+
+ res = LLVMBuildFPTrunc(builder, res, src_vec_type, "");
+ }
+
+ /* Expand to correct length */
+ if (src_type.length < dst_type.length) {
+ res = lp_build_pad_vector(gallivm, res, src_type, dst_type.length);
+ src_type.length = dst_type.length;
+ }
+
+ /* Convert to correct format */
+ lp_build_conv(gallivm, src_type, dst_type, &res, 1, &res, 1);
+
+ /* Swizzle it */
+ lp_build_context_init(&bld, gallivm, dst_type);
+ return lp_build_format_swizzle_aos(format_desc, &bld, res);
+}
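
As a rough illustration of how this new fast path might be invoked, here is a hedged sketch (gallivm, base_ptr and offset are assumed to come from the surrounding sampler/fetch code; the format choice is arbitrary):

   /* Sketch only: fetch one RGBA32F texel into a 4 x float vector. */
   const struct util_format_description *desc =
      util_format_description(PIPE_FORMAT_R32G32B32A32_FLOAT);
   struct lp_type dst_type;
   LLVMValueRef rgba;

   memset(&dst_type, 0, sizeof dst_type);
   dst_type.floating = TRUE;
   dst_type.sign = TRUE;
   dst_type.width = 32;
   dst_type.length = 4;

   rgba = lp_build_fetch_rgba_aos_array(gallivm, desc, dst_type,
                                        base_ptr, offset);
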
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
index 0a57b3ce794..afeb34079bf 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
@@ -359,7 +359,8 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
*/
if (util_format_fits_8unorm(format_desc) &&
- type.floating && type.width == 32 && type.length == 4) {
+ type.floating && type.width == 32 &&
+ (type.length == 1 || (type.length % 4 == 0))) {
struct lp_type tmp_type;
LLVMValueRef tmp;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
index ccc83207004..f77eb1212b1 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
@@ -84,7 +84,7 @@ uyvy_to_yuv_soa(struct gallivm_state *gallivm,
* per element. Didn't measure performance but cuts shader size
* by quite a bit (less difference if cpu has no sse4.1 support).
*/
- if (util_cpu_caps.has_sse2 && n == 4) {
+ if (util_cpu_caps.has_sse2 && n > 1) {
LLVMValueRef sel, tmp, tmp2;
struct lp_build_context bld32;
@@ -152,7 +152,7 @@ yuyv_to_yuv_soa(struct gallivm_state *gallivm,
* per element. Didn't measure performance but cuts shader size
* by quite a bit (less difference if cpu has no sse4.1 support).
*/
- if (util_cpu_caps.has_sse2 && n == 4) {
+ if (util_cpu_caps.has_sse2 && n > 1) {
LLVMValueRef sel, tmp;
struct lp_build_context bld32;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c b/src/gallium/auxiliary/gallivm/lp_bld_init.c
index 768d935dae5..5bf4bcfab3b 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c
@@ -26,15 +26,44 @@
**************************************************************************/
+#include "pipe/p_config.h"
#include "pipe/p_compiler.h"
#include "util/u_cpu_detect.h"
#include "util/u_debug.h"
#include "util/u_memory.h"
#include "util/u_simple_list.h"
+#include "lp_bld.h"
#include "lp_bld_debug.h"
+#include "lp_bld_misc.h"
#include "lp_bld_init.h"
+#include <llvm-c/Analysis.h>
#include <llvm-c/Transforms/Scalar.h>
+#include <llvm-c/BitWriter.h>
+
+
+/**
+ * AVX is supported in:
+ * - the standard JIT from LLVM 3.2 onwards
+ * - MC-JIT from LLVM 3.1 (but MC-JIT only supports a limited set of OSes,
+ *   currently MacOSX and Linux)
+ * - the standard JIT in LLVM 3.1, with backports
+ */
+#if HAVE_LLVM >= 0x0301 && (defined(PIPE_OS_LINUX) || defined(PIPE_OS_APPLE))
+# define USE_MCJIT 1
+# define HAVE_AVX 1
+#elif HAVE_LLVM >= 0x0302 || (HAVE_LLVM == 0x0301 && defined(HAVE_JIT_AVX_SUPPORT))
+# define USE_MCJIT 0
+# define HAVE_AVX 1
+#else
+# define USE_MCJIT 0
+# define HAVE_AVX 0
+#endif
+
+
+#if USE_MCJIT
+void LLVMLinkInMCJIT();
+#endif
#ifdef DEBUG
@@ -57,6 +86,8 @@ DEBUG_GET_ONCE_FLAGS_OPTION(gallivm_debug, "GALLIVM_DEBUG", lp_bld_debug_flags,
static boolean gallivm_initialized = FALSE;
+unsigned lp_native_vector_width;
+
/*
* Optimization values are:
@@ -81,25 +112,13 @@ enum LLVM_CodeGenOpt_Level {
};
+#if HAVE_LLVM <= 0x0206
/**
- * LLVM 2.6 permits only one ExecutionEngine to be created. This is it.
- */
-static LLVMExecutionEngineRef GlobalEngine = NULL;
-
-/**
- * Same gallivm state shared by all contexts.
+ * LLVM 2.6 permits only one ExecutionEngine to be created. So use the
+ * same gallivm state everywhere.
*/
static struct gallivm_state *GlobalGallivm = NULL;
-
-
-
-
-extern void
-lp_register_oprofile_jit_event_listener(LLVMExecutionEngineRef EE);
-
-extern void
-lp_set_target_options(void);
-
+#endif
/**
@@ -111,6 +130,7 @@ static boolean
create_pass_manager(struct gallivm_state *gallivm)
{
assert(!gallivm->passmgr);
+ assert(gallivm->target);
gallivm->passmgr = LLVMCreateFunctionPassManager(gallivm->provider);
if (!gallivm->passmgr)
@@ -174,33 +194,37 @@ free_gallivm_state(struct gallivm_state *gallivm)
&mod, &error);
#endif
+ if (gallivm->passmgr) {
+ LLVMDisposePassManager(gallivm->passmgr);
+ }
+
#if 0
/* XXX this seems to crash with all versions of LLVM */
if (gallivm->provider)
LLVMDisposeModuleProvider(gallivm->provider);
#endif
- if (gallivm->passmgr)
- LLVMDisposePassManager(gallivm->passmgr);
-
-#if HAVE_LLVM >= 0x207
- if (gallivm->module)
- LLVMDisposeModule(gallivm->module);
-#endif
-
-#if 0
- /* Don't free the exec engine, it's a global/singleton */
- if (gallivm->engine)
+ if (HAVE_LLVM >= 0x207 && gallivm->engine) {
+ /* This will already destroy any associated module */
LLVMDisposeExecutionEngine(gallivm->engine);
-#endif
+ } else {
+ LLVMDisposeModule(gallivm->module);
+ }
-#if 0
+#if !USE_MCJIT
/* Don't free the TargetData, it's owned by the exec engine */
- LLVMDisposeTargetData(gallivm->target);
+#else
+ if (gallivm->target) {
+ LLVMDisposeTargetData(gallivm->target);
+ }
#endif
+ /* Never free the LLVM context.
+ */
+#if 0
if (gallivm->context)
LLVMContextDispose(gallivm->context);
+#endif
if (gallivm->builder)
LLVMDisposeBuilder(gallivm->builder);
@@ -215,37 +239,14 @@ free_gallivm_state(struct gallivm_state *gallivm)
}
-/**
- * Allocate gallivm LLVM objects.
- * \return TRUE for success, FALSE for failure
- */
static boolean
-init_gallivm_state(struct gallivm_state *gallivm)
+init_gallivm_engine(struct gallivm_state *gallivm)
{
- assert(!gallivm->context);
- assert(!gallivm->module);
- assert(!gallivm->provider);
-
- lp_build_init();
-
- gallivm->context = LLVMContextCreate();
- if (!gallivm->context)
- goto fail;
-
- gallivm->module = LLVMModuleCreateWithNameInContext("gallivm",
- gallivm->context);
- if (!gallivm->module)
- goto fail;
-
- gallivm->provider =
- LLVMCreateModuleProviderForExistingModule(gallivm->module);
- if (!gallivm->provider)
- goto fail;
-
- if (!GlobalEngine) {
+ if (1) {
/* We can only create one LLVMExecutionEngine (w/ LLVM 2.6 anyway) */
enum LLVM_CodeGenOpt_Level optlevel;
char *error = NULL;
+ int ret;
if (gallivm_debug & GALLIVM_DEBUG_NO_OPT) {
optlevel = None;
@@ -254,135 +255,162 @@ init_gallivm_state(struct gallivm_state *gallivm)
optlevel = Default;
}
- if (LLVMCreateJITCompiler(&GlobalEngine, gallivm->provider,
- (unsigned) optlevel, &error)) {
+#if USE_MCJIT
+ ret = lp_build_create_mcjit_compiler_for_module(&gallivm->engine,
+ gallivm->module,
+ (unsigned) optlevel,
+ &error);
+#else
+ ret = LLVMCreateJITCompiler(&gallivm->engine, gallivm->provider,
+ (unsigned) optlevel, &error);
+#endif
+ if (ret) {
_debug_printf("%s\n", error);
LLVMDisposeMessage(error);
goto fail;
}
#if defined(DEBUG) || defined(PROFILE)
- lp_register_oprofile_jit_event_listener(GlobalEngine);
+ lp_register_oprofile_jit_event_listener(gallivm->engine);
#endif
}
- gallivm->engine = GlobalEngine;
-
LLVMAddModuleProvider(gallivm->engine, gallivm->provider);//new
+#if !USE_MCJIT
gallivm->target = LLVMGetExecutionEngineTargetData(gallivm->engine);
if (!gallivm->target)
goto fail;
+#else
+ if (0) {
+ /*
+ * Dump the data layout strings.
+ */
- if (!create_pass_manager(gallivm))
- goto fail;
+ LLVMTargetDataRef target = LLVMGetExecutionEngineTargetData(gallivm->engine);
+ char *data_layout;
+ char *engine_data_layout;
- gallivm->builder = LLVMCreateBuilderInContext(gallivm->context);
- if (!gallivm->builder)
- goto fail;
+ data_layout = LLVMCopyStringRepOfTargetData(gallivm->target);
+ engine_data_layout = LLVMCopyStringRepOfTargetData(target);
+
+ if (1) {
+ debug_printf("module target data = %s\n", data_layout);
+ debug_printf("engine target data = %s\n", engine_data_layout);
+ }
+
+ free(data_layout);
+ free(engine_data_layout);
+ }
+#endif
return TRUE;
fail:
- free_gallivm_state(gallivm);
return FALSE;
}
-struct callback
-{
- garbage_collect_callback_func func;
- void *cb_data;
- struct callback *prev, *next;
-};
-
-
-/** list of all garbage collector callbacks */
-static struct callback callback_list = {NULL, NULL, NULL, NULL};
+/**
+ * Singleton
+ *
+ * We must never free LLVM contexts, because LLVM has several global caches
+ * which point to, or are derived from, objects owned by the context, causing
+ * false memory leaks and false cache hits when these objects are destroyed.
+ *
+ * TODO: For thread safety on multi-threaded OpenGL we should use one LLVM
+ * context per thread, and put them in a pool when threads are destroyed.
+ */
+static LLVMContextRef gallivm_context = NULL;
/**
- * Register a function with gallivm which will be called when we
- * do garbage collection.
+ * Allocate gallivm LLVM objects.
+ * \return TRUE for success, FALSE for failure
*/
-void
-gallivm_register_garbage_collector_callback(garbage_collect_callback_func func,
- void *cb_data)
+static boolean
+init_gallivm_state(struct gallivm_state *gallivm)
{
- struct callback *cb;
-
- if (!callback_list.prev) {
- make_empty_list(&callback_list);
- }
+ assert(!gallivm->context);
+ assert(!gallivm->module);
+ assert(!gallivm->provider);
- /* see if already in list */
- foreach(cb, &callback_list) {
- if (cb->func == func && cb->cb_data == cb_data)
- return;
- }
+ lp_build_init();
- /* add to list */
- cb = CALLOC_STRUCT(callback);
- if (cb) {
- cb->func = func;
- cb->cb_data = cb_data;
- insert_at_head(&callback_list, cb);
+ if (!gallivm_context) {
+ gallivm_context = LLVMContextCreate();
}
-}
+ gallivm->context = gallivm_context;
+ if (!gallivm->context)
+ goto fail;
+ gallivm->module = LLVMModuleCreateWithNameInContext("gallivm",
+ gallivm->context);
+ if (!gallivm->module)
+ goto fail;
-/**
- * Remove a callback.
- */
-void
-gallivm_remove_garbage_collector_callback(garbage_collect_callback_func func,
- void *cb_data)
-{
- struct callback *cb;
-
- /* search list */
- foreach(cb, &callback_list) {
- if (cb->func == func && cb->cb_data == cb_data) {
- /* found, remove it */
- remove_from_list(cb);
- FREE(cb);
- return;
- }
- }
-}
+ gallivm->provider =
+ LLVMCreateModuleProviderForExistingModule(gallivm->module);
+ if (!gallivm->provider)
+ goto fail;
+ gallivm->builder = LLVMCreateBuilderInContext(gallivm->context);
+ if (!gallivm->builder)
+ goto fail;
-/**
- * Call the callback functions (which are typically in the
- * draw module and llvmpipe driver.
- */
-static void
-call_garbage_collector_callbacks(void)
-{
- struct callback *cb;
- foreach(cb, &callback_list) {
- cb->func(cb->cb_data);
+ /* FIXME: MC-JIT only allows compiling one module at a time, and it must be
+ * complete when MC-JIT is created. So defer the MC-JIT engine creation for
+ * now.
+ */
+#if !USE_MCJIT
+ if (!init_gallivm_engine(gallivm)) {
+ goto fail;
}
-}
+#else
+ /*
+ * MC-JIT engine compiles the module immediately on creation, so we can't
+ * obtain the target data from it. Instead we create a target data layout
+ * from a string.
+ *
+ * The produced layout strings are not precisely the same, but should make
+ * no difference for the kind of optimization passes we run.
+ *
+ * For reference this is the layout string on x64:
+ *
+ * e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-f128:128:128-n8:16:32:64
+ *
+ * See also:
+ * - http://llvm.org/docs/LangRef.html#datalayout
+ */
+
+ {
+ const unsigned pointer_size = 8 * sizeof(void *);
+ char layout[512];
+ util_snprintf(layout, sizeof layout, "%c-p:%u:%u:%u-i64:64:64-a0:0:%u-s0:%u:%u",
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+ 'e', // little endian
+#else
+ 'E', // big endian
+#endif
+ pointer_size, pointer_size, pointer_size, // pointer size, abi alignment, preferred alignment
+ pointer_size, // aggregate preferred alignment
+ pointer_size, pointer_size); // stack objects abi alignment, preferred alignment
+ gallivm->target = LLVMCreateTargetData(layout);
+ if (!gallivm->target) {
+ return FALSE;
+ }
+ }
+#endif
+ if (!create_pass_manager(gallivm))
+ goto fail;
-/**
- * Other gallium components using gallivm should call this periodically
- * to let us do garbage collection (or at least try to free memory
- * accumulated by the LLVM libraries).
- */
-void
-gallivm_garbage_collect(struct gallivm_state *gallivm)
-{
- if (gallivm->context) {
- if (gallivm_debug & GALLIVM_DEBUG_GC)
- debug_printf("***** Doing LLVM garbage collection\n");
+ return TRUE;
- call_garbage_collector_callbacks();
- free_gallivm_state(gallivm);
- init_gallivm_state(gallivm);
- }
+fail:
+ free_gallivm_state(gallivm);
+ return FALSE;
}
@@ -398,12 +426,27 @@ lp_build_init(void)
lp_set_target_options();
- LLVMInitializeNativeTarget();
-
+#if USE_MCJIT
+ LLVMLinkInMCJIT();
+#else
LLVMLinkInJIT();
+#endif
util_cpu_detect();
+
+ if (HAVE_AVX &&
+ util_cpu_caps.has_avx) {
+ lp_native_vector_width = 256;
+ } else {
+ /* Leave it at 128, even when no SIMD extensions are available.
+ * Really needs to be a multiple of 128 so it can fit 4 floats.
+ */
+ lp_native_vector_width = 128;
+ }
+ lp_native_vector_width = debug_get_num_option("LP_NATIVE_VECTOR_WIDTH",
+ lp_native_vector_width);
+
gallivm_initialized = TRUE;
#if 0
@@ -423,16 +466,27 @@ lp_build_init(void)
struct gallivm_state *
gallivm_create(void)
{
- if (!GlobalGallivm) {
- GlobalGallivm = CALLOC_STRUCT(gallivm_state);
- if (GlobalGallivm) {
- if (!init_gallivm_state(GlobalGallivm)) {
- FREE(GlobalGallivm);
- GlobalGallivm = NULL;
- }
+ struct gallivm_state *gallivm;
+
+#if HAVE_LLVM <= 0x206
+ if (GlobalGallivm) {
+ return GlobalGallivm;
+ }
+#endif
+
+ gallivm = CALLOC_STRUCT(gallivm_state);
+ if (gallivm) {
+ if (!init_gallivm_state(gallivm)) {
+ FREE(gallivm);
+ gallivm = NULL;
}
}
- return GlobalGallivm;
+
+#if HAVE_LLVM <= 0x206
+ GlobalGallivm = gallivm;
+#endif
+
+ return gallivm;
}
@@ -442,6 +496,132 @@ gallivm_create(void)
void
gallivm_destroy(struct gallivm_state *gallivm)
{
+#if HAVE_LLVM <= 0x0206
/* No-op: don't destroy the singleton */
(void) gallivm;
+#else
+ free_gallivm_state(gallivm);
+ FREE(gallivm);
+#endif
+}
+
+
+/**
+ * Optimize a function (run it through the function pass manager).
+ */
+static void
+gallivm_optimize_function(struct gallivm_state *gallivm,
+ LLVMValueRef func)
+{
+ if (0) {
+ debug_printf("optimizing %s...\n", LLVMGetValueName(func));
+ }
+
+ assert(gallivm->passmgr);
+
+ /* Apply optimizations to LLVM IR */
+ LLVMRunFunctionPassManager(gallivm->passmgr, func);
+
+ if (0) {
+ if (gallivm_debug & GALLIVM_DEBUG_IR) {
+ /* Print the LLVM IR to stderr */
+ lp_debug_dump_value(func);
+ debug_printf("\n");
+ }
+ }
+}
+
+
+/**
+ * Verify a function's IR (in debug builds) and run the optimization passes on it.
+ */
+void
+gallivm_verify_function(struct gallivm_state *gallivm,
+ LLVMValueRef func)
+{
+ /* Verify the LLVM IR. If invalid, dump and abort */
+#ifdef DEBUG
+ if (LLVMVerifyFunction(func, LLVMPrintMessageAction)) {
+ lp_debug_dump_value(func);
+ assert(0);
+ return;
+ }
+#endif
+
+ gallivm_optimize_function(gallivm, func);
+
+ if (gallivm_debug & GALLIVM_DEBUG_IR) {
+ /* Print the LLVM IR to stderr */
+ lp_debug_dump_value(func);
+ debug_printf("\n");
+ }
+}
+
+
+void
+gallivm_compile_module(struct gallivm_state *gallivm)
+{
+#if HAVE_LLVM > 0x206
+ assert(!gallivm->compiled);
+#endif
+
+ /* Dump byte code to a file */
+ if (0) {
+ LLVMWriteBitcodeToFile(gallivm->module, "llvmpipe.bc");
+ debug_printf("llvmpipe.bc written\n");
+ debug_printf("Invoke as \"llc -o - llvmpipe.bc\"\n");
+ }
+
+#if USE_MCJIT
+ assert(!gallivm->engine);
+ if (!init_gallivm_engine(gallivm)) {
+ assert(0);
+ }
+#endif
+ assert(gallivm->engine);
+
+ ++gallivm->compiled;
+}
+
+
+func_pointer
+gallivm_jit_function(struct gallivm_state *gallivm,
+ LLVMValueRef func)
+{
+ void *code;
+ func_pointer jit_func;
+
+ assert(gallivm->compiled);
+ assert(gallivm->engine);
+
+ code = LLVMGetPointerToGlobal(gallivm->engine, func);
+ assert(code);
+ jit_func = pointer_to_func(code);
+
+ if (gallivm_debug & GALLIVM_DEBUG_ASM) {
+ lp_disassemble(code);
+ }
+
+ /* Free the function body to save memory */
+ lp_func_delete_body(func);
+
+ return jit_func;
+}
+
+
+/**
+ * Free the function (and its machine code).
+ */
+void
+gallivm_free_function(struct gallivm_state *gallivm,
+ LLVMValueRef func,
+ const void *code)
+{
+#if !USE_MCJIT
+ if (code) {
+ LLVMFreeMachineCodeForFunction(gallivm->engine, func);
+ }
+
+ LLVMDeleteFunction(func);
+#endif
}
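
For context, a minimal sketch of the lifecycle the reworked init code expects (build_some_ir() is a hypothetical placeholder that emits a function into gallivm->module):

   struct gallivm_state *gallivm = gallivm_create();
   LLVMValueRef func = build_some_ir(gallivm);   /* hypothetical IR builder */
   func_pointer jit;

   gallivm_verify_function(gallivm, func);    /* verify (debug builds) and optimize */
   gallivm_compile_module(gallivm);           /* with MC-JIT the engine is created here */
   jit = gallivm_jit_function(gallivm, func); /* machine-code entry point */

   /* ... use jit; eventually call gallivm_free_function() and gallivm_destroy() ... */
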
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.h b/src/gallium/auxiliary/gallivm/lp_bld_init.h
index 5fc0f996c64..7edea616c4e 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.h
@@ -31,6 +31,7 @@
#include "pipe/p_compiler.h"
+#include "util/u_pointer.h" // for func_pointer
#include "lp_bld.h"
#include <llvm-c/ExecutionEngine.h>
@@ -44,6 +45,7 @@ struct gallivm_state
LLVMPassManagerRef passmgr;
LLVMContextRef context;
LLVMBuilderRef builder;
+ unsigned compiled;
};
@@ -51,35 +53,28 @@ void
lp_build_init(void);
-extern void
-lp_func_delete_body(LLVMValueRef func);
-
+struct gallivm_state *
+gallivm_create(void);
void
-gallivm_garbage_collect(struct gallivm_state *gallivm);
-
+gallivm_destroy(struct gallivm_state *gallivm);
-typedef void (*garbage_collect_callback_func)(void *cb_data);
void
-gallivm_register_garbage_collector_callback(garbage_collect_callback_func func,
- void *cb_data);
+gallivm_verify_function(struct gallivm_state *gallivm,
+ LLVMValueRef func);
void
-gallivm_remove_garbage_collector_callback(garbage_collect_callback_func func,
- void *cb_data);
+gallivm_compile_module(struct gallivm_state *gallivm);
-
-struct gallivm_state *
-gallivm_create(void);
+func_pointer
+gallivm_jit_function(struct gallivm_state *gallivm,
+ LLVMValueRef func);
void
-gallivm_destroy(struct gallivm_state *gallivm);
-
-
-extern LLVMValueRef
-lp_build_load_volatile(LLVMBuilderRef B, LLVMValueRef PointerVal,
- const char *Name);
+gallivm_free_function(struct gallivm_state *gallivm,
+ LLVMValueRef func,
+ const void * code);
void
lp_set_load_alignment(LLVMValueRef Inst,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_intr.c b/src/gallium/auxiliary/gallivm/lp_bld_intr.c
index 2323f124ae4..2bf1211bcd7 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_intr.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_intr.c
@@ -48,6 +48,8 @@
#include "lp_bld_const.h"
#include "lp_bld_intr.h"
+#include "lp_bld_type.h"
+#include "lp_bld_pack.h"
LLVMValueRef
@@ -129,6 +131,95 @@ lp_build_intrinsic_binary(LLVMBuilderRef builder,
}
+/**
+ * Call intrinsic with arguments adapted to intrinsic vector length.
+ *
+ * Split vectors which are too large for the hw, or expand them if they
+ * are too small, so that callers of functions which might use intrinsics
+ * don't need to do the splitting/expansion themselves.
+ * This only supports intrinsics where src and dst types match.
+ */
+LLVMValueRef
+lp_build_intrinsic_binary_anylength(struct gallivm_state *gallivm,
+ const char *name,
+ struct lp_type src_type,
+ unsigned intr_size,
+ LLVMValueRef a,
+ LLVMValueRef b)
+{
+ unsigned i;
+ struct lp_type intrin_type = src_type;
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
+ LLVMValueRef anative, bnative;
+ unsigned intrin_length = intr_size / src_type.width;
+
+ intrin_type.length = intrin_length;
+
+ if (intrin_length > src_type.length) {
+ LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
+ LLVMValueRef constvec, tmp;
+
+ for (i = 0; i < src_type.length; i++) {
+ elems[i] = lp_build_const_int32(gallivm, i);
+ }
+ for (; i < intrin_length; i++) {
+ elems[i] = i32undef;
+ }
+ if (src_type.length == 1) {
+ LLVMTypeRef elem_type = lp_build_elem_type(gallivm, intrin_type);
+ a = LLVMBuildBitCast(builder, a, LLVMVectorType(elem_type, 1), "");
+ b = LLVMBuildBitCast(builder, b, LLVMVectorType(elem_type, 1), "");
+ }
+ constvec = LLVMConstVector(elems, intrin_length);
+ anative = LLVMBuildShuffleVector(builder, a, a, constvec, "");
+ bnative = LLVMBuildShuffleVector(builder, b, b, constvec, "");
+ tmp = lp_build_intrinsic_binary(builder, name,
+ lp_build_vec_type(gallivm, intrin_type),
+ anative, bnative);
+ if (src_type.length > 1) {
+ constvec = LLVMConstVector(elems, src_type.length);
+ return LLVMBuildShuffleVector(builder, tmp, tmp, constvec, "");
+ }
+ else {
+ return LLVMBuildExtractElement(builder, tmp, elems[0], "");
+ }
+ }
+ else if (intrin_length < src_type.length) {
+ unsigned num_vec = src_type.length / intrin_length;
+ LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
+
+ /* don't support arbitrary size here as this is so yuck */
+ if (src_type.length % intrin_length) {
+ /* FIXME: This is something which should be supported
+ * but there doesn't seem to be any need for it currently
+ * so crash and burn.
+ */
+ debug_printf("%s: should handle arbitrary vector size\n",
+ __FUNCTION__);
+ assert(0);
+ return NULL;
+ }
+
+ for (i = 0; i < num_vec; i++) {
+ anative = lp_build_extract_range(gallivm, a, i*intrin_length,
+ intrin_length);
+ bnative = lp_build_extract_range(gallivm, b, i*intrin_length,
+ intrin_length);
+ tmp[i] = lp_build_intrinsic_binary(builder, name,
+ lp_build_vec_type(gallivm, intrin_type),
+ anative, bnative);
+ }
+ return lp_build_concat(gallivm, tmp, intrin_type, num_vec);
+ }
+ else {
+ return lp_build_intrinsic_binary(builder, name,
+ lp_build_vec_type(gallivm, src_type),
+ a, b);
+ }
+}
+
+
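
A hedged usage sketch for the helper above (the lp_type value and the a/b operands are assumptions for illustration; the intrinsic name is the standard 128-bit SSE2 saturating add):

   /* 16 x i16 input (256 bits), but paddsw only handles 128 bits: the helper
    * splits the operands, issues the intrinsic twice and concatenates. */
   struct lp_type type;
   LLVMValueRef res;

   memset(&type, 0, sizeof type);
   type.width = 16;
   type.length = 16;
   type.sign = TRUE;

   res = lp_build_intrinsic_binary_anylength(gallivm, "llvm.x86.sse2.padds.w",
                                             type, 128, a, b);
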
LLVMValueRef
lp_build_intrinsic_map(struct gallivm_state *gallivm,
const char *name,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_intr.h b/src/gallium/auxiliary/gallivm/lp_bld_intr.h
index b73dd700362..38c5c29c980 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_intr.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_intr.h
@@ -78,6 +78,15 @@ lp_build_intrinsic_binary(LLVMBuilderRef builder,
LLVMValueRef
+lp_build_intrinsic_binary_anylength(struct gallivm_state *gallivm,
+ const char *name,
+ struct lp_type src_type,
+ unsigned intr_size,
+ LLVMValueRef a,
+ LLVMValueRef b);
+
+
+LLVMValueRef
lp_build_intrinsic_map(struct gallivm_state *gallivm,
const char *name,
LLVMTypeRef ret_type,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.c b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
index 69796149aaa..7a4a5bb11d3 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_logic.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
@@ -52,8 +52,8 @@
*
* select <4 x i1> %C, %A, %B
*
- * is valid IR (e.g. llvm/test/Assembler/vector-select.ll), but it is not
- * supported on any backend.
+ * is valid IR (e.g. llvm/test/Assembler/vector-select.ll), but it is only
+ * supported on some backends (x86) starting with llvm 3.1.
*
* Expanding the boolean vector to full SIMD register width, as in
*
@@ -485,8 +485,10 @@ lp_build_select(struct lp_build_context *bld,
}
res = LLVMBuildSelect(builder, mask, a, b, "");
}
- else if (util_cpu_caps.has_sse4_1 &&
- type.width * type.length == 128 &&
+ else if (((util_cpu_caps.has_sse4_1 &&
+ type.width * type.length == 128) ||
+ (util_cpu_caps.has_avx &&
+ type.width * type.length == 256 && type.width >= 32)) &&
!LLVMIsConstant(a) &&
!LLVMIsConstant(b) &&
!LLVMIsConstant(mask)) {
@@ -494,8 +496,22 @@ lp_build_select(struct lp_build_context *bld,
LLVMTypeRef arg_type;
LLVMValueRef args[3];
- if (type.floating &&
- type.width == 64) {
+ /*
+ * There's only a float blend in AVX, but we can just cast i32/i64
+ * to float.
+ */
+ if (type.width * type.length == 256) {
+ if (type.width == 64) {
+ intrinsic = "llvm.x86.avx.blendv.pd.256";
+ arg_type = LLVMVectorType(LLVMDoubleTypeInContext(lc), 4);
+ }
+ else {
+ intrinsic = "llvm.x86.avx.blendv.ps.256";
+ arg_type = LLVMVectorType(LLVMFloatTypeInContext(lc), 8);
+ }
+ }
+ else if (type.floating &&
+ type.width == 64) {
intrinsic = "llvm.x86.sse41.blendvpd";
arg_type = LLVMVectorType(LLVMDoubleTypeInContext(lc), 2);
} else if (type.floating &&
@@ -591,3 +607,35 @@ lp_build_select_aos(struct lp_build_context *bld,
return lp_build_select(bld, mask_vec, a, b);
}
}
+
+
+/**
+ * Return whether any of the first real_length elements of val are true,
+ * i.e. (scalar-cast)val ? true : false;
+ */
+LLVMValueRef
+lp_build_any_true_range(struct lp_build_context *bld,
+ unsigned real_length,
+ LLVMValueRef val)
+{
+ LLVMBuilderRef builder = bld->gallivm->builder;
+ LLVMTypeRef scalar_type;
+ LLVMTypeRef true_type;
+
+ assert(real_length <= bld->type.length);
+
+ true_type = LLVMIntTypeInContext(bld->gallivm->context,
+ bld->type.width * real_length);
+ scalar_type = LLVMIntTypeInContext(bld->gallivm->context,
+ bld->type.width * bld->type.length);
+ val = LLVMBuildBitCast(builder, val, scalar_type, "");
+ /*
+ * We're always using native types so we can use intrinsics.
+ * However, if we don't do per-element calculations, we must ensure
+ * the excess elements aren't used since they may contain garbage.
+ */
+ if (real_length < bld->type.length) {
+ val = LLVMBuildTrunc(builder, val, true_type, "");
+ }
+ return LLVMBuildICmp(builder, LLVMIntNE,
+ val, LLVMConstNull(true_type), "");
+}
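
For example, a sketch (here `bld` is assumed to operate on a 32 x i8 type and `mask` is a value of which only the first 16 bytes carry valid data):

   /* True iff any of the first 16 mask bytes is non-zero; the upper 16
    * elements may contain garbage and are truncated away first. */
   LLVMValueRef any = lp_build_any_true_range(bld, 16, mask);
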
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.h b/src/gallium/auxiliary/gallivm/lp_bld_logic.h
index ef33a653682..64c0a1f5946 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_logic.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.h
@@ -82,4 +82,9 @@ lp_build_select_aos(struct lp_build_context *bld,
LLVMValueRef b);
+LLVMValueRef
+lp_build_any_true_range(struct lp_build_context *bld,
+ unsigned real_length,
+ LLVMValueRef val);
+
#endif /* !LP_BLD_LOGIC_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
index 6c4586c4212..dd2c6120afb 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
+++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
@@ -26,6 +26,12 @@
**************************************************************************/
+/**
+ * The purpose of this module is to expose LLVM functionality that is not
+ * available through the LLVM C bindings (i.e. only through the C++ API).
+ */
+
+
#ifndef __STDC_LIMIT_MACROS
#define __STDC_LIMIT_MACROS
#endif
@@ -41,11 +47,24 @@
#include <llvm/Target/TargetOptions.h>
#include <llvm/ExecutionEngine/ExecutionEngine.h>
#include <llvm/ExecutionEngine/JITEventListener.h>
+#if HAVE_LLVM >= 0x0301
+#include <llvm/ADT/Triple.h>
+#include <llvm/ExecutionEngine/JITMemoryManager.h>
+#endif
#include <llvm/Support/CommandLine.h>
#include <llvm/Support/PrettyStackTrace.h>
+#if HAVE_LLVM >= 0x0300
+#include <llvm/Support/TargetSelect.h>
+#else /* HAVE_LLVM < 0x0300 */
+#include <llvm/Target/TargetSelect.h>
+#endif /* HAVE_LLVM < 0x0300 */
+
#include "pipe/p_config.h"
#include "util/u_debug.h"
+#include "util/u_cpu_detect.h"
+
+#include "lp_bld_misc.h"
/**
@@ -99,6 +118,9 @@ lp_set_target_options(void)
#if defined(DEBUG) || defined(PROFILE)
llvm::NoFramePointerElim = true;
+#if HAVE_LLVM >= 0x0208
+ llvm::NoFramePointerElimNonLeaf = true;
+#endif
#endif
llvm::NoExcessFPPrecision = false;
@@ -146,6 +168,30 @@ lp_set_target_options(void)
* shared object where the gallium driver resides.
*/
llvm::DisablePrettyStackTrace = true;
+
+ // If we have a native target, initialize it to ensure it is linked in and
+ // usable by the JIT.
+ llvm::InitializeNativeTarget();
+
+#if HAVE_LLVM >= 0x0208
+ llvm::InitializeNativeTargetAsmPrinter();
+#elif defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+ LLVMInitializeX86AsmPrinter();
+#elif defined(PIPE_ARCH_ARM)
+ LLVMInitializeARMAsmPrinter();
+#elif defined(PIPE_ARCH_PPC)
+ LLVMInitializePowerPCAsmPrinter();
+#endif
+
+#if HAVE_LLVM >= 0x0207
+# if HAVE_LLVM >= 0x0301
+ llvm::InitializeNativeTargetDisassembler();
+# elif defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+ LLVMInitializeX86Disassembler();
+# elif defined(PIPE_ARCH_ARM)
+ LLVMInitializeARMDisassembler();
+# endif
+#endif
}
@@ -165,6 +211,7 @@ lp_build_load_volatile(LLVMBuilderRef B, LLVMValueRef PointerVal,
return llvm::wrap(llvm::unwrap(B)->CreateLoad(llvm::unwrap(PointerVal), true, Name));
}
+
extern "C"
void
lp_set_load_alignment(LLVMValueRef Inst,
@@ -180,3 +227,67 @@ lp_set_store_alignment(LLVMValueRef Inst,
{
llvm::unwrap<llvm::StoreInst>(Inst)->setAlignment(Align);
}
+
+
+#if HAVE_LLVM >= 0x301
+
+/**
+ * Same as LLVMCreateJITCompilerForModule, but using MC-JIT and enabling the
+ * AVX feature where available.
+ *
+ * See also:
+ * - llvm/lib/ExecutionEngine/ExecutionEngineBindings.cpp
+ * - llvm/tools/lli/lli.cpp
+ * - http://markmail.org/message/ttkuhvgj4cxxy2on#query:+page:1+mid:aju2dggerju3ivd3+state:results
+ */
+extern "C"
+LLVMBool
+lp_build_create_mcjit_compiler_for_module(LLVMExecutionEngineRef *OutJIT,
+ LLVMModuleRef M,
+ unsigned OptLevel,
+ char **OutError)
+{
+ using namespace llvm;
+
+ std::string Error;
+ EngineBuilder builder(unwrap(M));
+ builder.setEngineKind(EngineKind::JIT)
+ .setErrorStr(&Error)
+ .setOptLevel((CodeGenOpt::Level)OptLevel);
+
+ builder.setUseMCJIT(true);
+
+ llvm::SmallVector<std::string, 1> MAttrs;
+ if (util_cpu_caps.has_avx) {
+ /*
+ * The AVX feature is not automatically detected from CPUID by the X86 target
+ * yet, because the old (but still default) JIT engine is not capable of
+ * emitting the opcodes. But as we're using MC-JIT here, it is safe to
+ * set this attribute.
+ */
+ MAttrs.push_back("+avx");
+ builder.setMAttrs(MAttrs);
+ }
+ builder.setJITMemoryManager(JITMemoryManager::CreateDefaultMemManager());
+
+ ExecutionEngine *JIT;
+#if 0
+ JIT = builder.create();
+#else
+ /*
+ * Workaround http://llvm.org/bugs/show_bug.cgi?id=12833
+ */
+ StringRef MArch = "";
+ StringRef MCPU = "";
+ Triple TT(unwrap(M)->getTargetTriple());
+ JIT = builder.create(builder.selectTarget(TT, MArch, MCPU, MAttrs));
+#endif
+ if (JIT) {
+ *OutJIT = wrap(JIT);
+ return 0;
+ }
+ *OutError = strdup(Error.c_str());
+ return 1;
+}
+
+#endif /* HAVE_LLVM >= 0x301 */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_misc.h b/src/gallium/auxiliary/gallivm/lp_bld_misc.h
new file mode 100644
index 00000000000..4f80b38280c
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.h
@@ -0,0 +1,70 @@
+/**************************************************************************
+ *
+ * Copyright 2012 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#ifndef LP_BLD_MISC_H
+#define LP_BLD_MISC_H
+
+
+#include "lp_bld.h"
+#include <llvm-c/ExecutionEngine.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+extern void
+lp_register_oprofile_jit_event_listener(LLVMExecutionEngineRef EE);
+
+extern void
+lp_set_target_options(void);
+
+
+extern void
+lp_func_delete_body(LLVMValueRef func);
+
+
+extern LLVMValueRef
+lp_build_load_volatile(LLVMBuilderRef B, LLVMValueRef PointerVal,
+ const char *Name);
+
+extern int
+lp_build_create_mcjit_compiler_for_module(LLVMExecutionEngineRef *OutJIT,
+ LLVMModuleRef M,
+ unsigned OptLevel,
+ char **OutError);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif /* !LP_BLD_MISC_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.c b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
index fde6bb594f1..b18f7841ccb 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
@@ -69,6 +69,7 @@
#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_cpu_detect.h"
+#include "util/u_memory.h"
#include "lp_bld_type.h"
#include "lp_bld_const.h"
@@ -76,6 +77,7 @@
#include "lp_bld_intr.h"
#include "lp_bld_arit.h"
#include "lp_bld_pack.h"
+#include "lp_bld_swizzle.h"
/**
@@ -101,6 +103,30 @@ lp_build_const_unpack_shuffle(struct gallivm_state *gallivm,
return LLVMConstVector(elems, n);
}
+/**
+ * Similar to lp_build_const_unpack_shuffle but for special AVX 256bit unpack.
+ * See comment above lp_build_interleave2_half for more details.
+ */
+static LLVMValueRef
+lp_build_const_unpack_shuffle_half(struct gallivm_state *gallivm,
+ unsigned n, unsigned lo_hi)
+{
+ LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
+ unsigned i, j;
+
+ assert(n <= LP_MAX_VECTOR_LENGTH);
+ assert(lo_hi < 2);
+
+ for (i = 0, j = lo_hi*(n/4); i < n; i += 2, ++j) {
+ if (i == (n / 2))
+ j += n / 4;
+
+ elems[i + 0] = lp_build_const_int32(gallivm, 0 + j);
+ elems[i + 1] = lp_build_const_int32(gallivm, n + j);
+ }
+
+ return LLVMConstVector(elems, n);
+}
/**
* Build shuffle vectors that match PACKxx instructions.
@@ -119,6 +145,71 @@ lp_build_const_pack_shuffle(struct gallivm_state *gallivm, unsigned n)
return LLVMConstVector(elems, n);
}
+/**
+ * Return a vector with elements src[start:start+size].
+ * Most useful for getting half the values out of a 256-bit vector;
+ * otherwise it may cause data rearrangement to happen.
+ */
+LLVMValueRef
+lp_build_extract_range(struct gallivm_state *gallivm,
+ LLVMValueRef src,
+ unsigned start,
+ unsigned size)
+{
+ LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
+ unsigned i;
+
+ assert(size <= Elements(elems));
+
+ for (i = 0; i < size; ++i)
+ elems[i] = lp_build_const_int32(gallivm, i + start);
+
+ if (size == 1) {
+ return LLVMBuildExtractElement(gallivm->builder, src, elems[0], "");
+ }
+ else {
+ return LLVMBuildShuffleVector(gallivm->builder, src, src,
+ LLVMConstVector(elems, size), "");
+ }
+}
+
+/**
+ * Concatenate several vectors (of the same type; their number must be a
+ * power of two) into a larger one.
+ * Most useful for building a 256-bit vector out of two 128-bit ones.
+ */
+LLVMValueRef
+lp_build_concat(struct gallivm_state *gallivm,
+ LLVMValueRef src[],
+ struct lp_type src_type,
+ unsigned num_vectors)
+{
+ unsigned new_length, i;
+ LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH/2];
+ LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
+
+ assert(src_type.length * num_vectors <= Elements(shuffles));
+ assert(util_is_power_of_two(num_vectors));
+
+ new_length = src_type.length;
+
+ for (i = 0; i < num_vectors; i++)
+ tmp[i] = src[i];
+
+ while (num_vectors > 1) {
+ num_vectors >>= 1;
+ new_length <<= 1;
+ for (i = 0; i < new_length; i++) {
+ shuffles[i] = lp_build_const_int32(gallivm, i);
+ }
+ for (i = 0; i < num_vectors; i++) {
+ tmp[i] = LLVMBuildShuffleVector(gallivm->builder, tmp[i*2], tmp[i*2 + 1],
+ LLVMConstVector(shuffles, new_length), "");
+ }
+ }
+
+ return tmp[0];
+}
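
A small sketch of the intended split/merge pattern (assuming `src_type` describes an 8 x float, i.e. 256-bit, vector and `v` is a value of that type):

   /* Split into the two 128-bit halves, work on them, then merge back. */
   struct lp_type half_type = src_type;
   LLVMValueRef halves[2], merged;

   half_type.length = 4;
   halves[0] = lp_build_extract_range(gallivm, v, 0, 4);
   halves[1] = lp_build_extract_range(gallivm, v, 4, 4);
   /* ... 128-bit processing of halves[0] and halves[1] goes here ... */
   merged = lp_build_concat(gallivm, halves, half_type, 2);
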
/**
* Interleave vector elements.
@@ -139,6 +230,40 @@ lp_build_interleave2(struct gallivm_state *gallivm,
return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
}
+/**
+ * Interleave vector elements, but for 256-bit vectors treat the operation
+ * as an interleave of two concatenated 128-bit vectors.
+ *
+ * This differs from lp_build_interleave2, which (for lo) would produce
+ * a0 b0 a1 b1 a2 b2 a3 b3 -- a pattern that does not compile into an AVX
+ * unpack instruction.
+ *
+ * An example: interleaving 8 x float with 8 x float using the AVX 256-bit unpack:
+ * a0 a1 a2 a3 a4 a5 a6 a7 <-> b0 b1 b2 b3 b4 b5 b6 b7
+ *
+ * This is equivalent to interleaving 2 x 128-bit vectors:
+ * a0 a1 a2 a3 <-> b0 b1 b2 b3 concatenated with a4 a5 a6 a7 <-> b4 b5 b6 b7
+ *
+ * So interleave-lo would result in:
+ * a0 b0 a1 b1 a4 b4 a5 b5
+ *
+ * And interleave-hi would result in:
+ * a2 b2 a3 b3 a6 b6 a7 b7
+ */
+LLVMValueRef
+lp_build_interleave2_half(struct gallivm_state *gallivm,
+ struct lp_type type,
+ LLVMValueRef a,
+ LLVMValueRef b,
+ unsigned lo_hi)
+{
+ if (type.length * type.width == 256) {
+ LLVMValueRef shuffle = lp_build_const_unpack_shuffle_half(gallivm, type.length, lo_hi);
+ return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
+ } else {
+ return lp_build_interleave2(gallivm, type, a, b, lo_hi);
+ }
+}
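
To make the shuffle concrete, the indices produced by lp_build_const_unpack_shuffle_half for an 8-wide type (b's lanes are numbered 8..15 in the shuffle) work out to:

   /*
    * lo_hi = 0:  { 0, 8, 1, 9, 4, 12, 5, 13 }   ->  a0 b0 a1 b1 a4 b4 a5 b5
    * lo_hi = 1:  { 2, 10, 3, 11, 6, 14, 7, 15 } ->  a2 b2 a3 b3 a6 b6 a7 b7
    *
    * i.e. exactly the per-128-bit-lane pattern of the AVX vunpcklps and
    * vunpckhps instructions.
    */
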
/**
* Double the bit width.
@@ -237,9 +362,9 @@ lp_build_unpack(struct gallivm_state *gallivm,
* Non-interleaved pack.
*
* This will move values as
- *
- * lo = __ l0 __ l1 __ l2 __.. __ ln
- * hi = __ h0 __ h1 __ h2 __.. __ hn
+ * (LSB) (MSB)
+ * lo = l0 __ l1 __ l2 __.. __ ln __
+ * hi = h0 __ h1 __ h2 __.. __ hn __
* res = l0 l1 l2 .. ln h0 h1 h2 .. hn
*
* This will only change the number of bits the values are represented, not the
@@ -257,12 +382,14 @@ lp_build_pack2(struct gallivm_state *gallivm,
LLVMValueRef hi)
{
LLVMBuilderRef builder = gallivm->builder;
-#if HAVE_LLVM < 0x0207
- LLVMTypeRef src_vec_type = lp_build_vec_type(gallivm, src_type);
-#endif
LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, dst_type);
LLVMValueRef shuffle;
LLVMValueRef res = NULL;
+ struct lp_type intr_type = dst_type;
+
+#if HAVE_LLVM < 0x0207
+ intr_type = src_type;
+#endif
assert(!src_type.floating);
assert(!dst_type.floating);
@@ -270,50 +397,81 @@ lp_build_pack2(struct gallivm_state *gallivm,
assert(src_type.length * 2 == dst_type.length);
/* Check for special cases first */
- if(util_cpu_caps.has_sse2 && src_type.width * src_type.length == 128) {
+ if(util_cpu_caps.has_sse2 && src_type.width * src_type.length >= 128) {
+ const char *intrinsic = NULL;
+
switch(src_type.width) {
case 32:
if(dst_type.sign) {
-#if HAVE_LLVM >= 0x0207
- res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packssdw.128", dst_vec_type, lo, hi);
-#else
- res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packssdw.128", src_vec_type, lo, hi);
-#endif
+ intrinsic = "llvm.x86.sse2.packssdw.128";
}
else {
if (util_cpu_caps.has_sse4_1) {
- return lp_build_intrinsic_binary(builder, "llvm.x86.sse41.packusdw", dst_vec_type, lo, hi);
- }
- else {
- /* use generic shuffle below */
- res = NULL;
+ intrinsic = "llvm.x86.sse41.packusdw";
+#if HAVE_LLVM < 0x0207
+ /* llvm < 2.7 has inconsistent signatures except for packusdw */
+ intr_type = dst_type;
+#endif
}
}
break;
-
case 16:
- if(dst_type.sign)
-#if HAVE_LLVM >= 0x0207
- res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packsswb.128", dst_vec_type, lo, hi);
-#else
- res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packsswb.128", src_vec_type, lo, hi);
-#endif
- else
-#if HAVE_LLVM >= 0x0207
- res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packuswb.128", dst_vec_type, lo, hi);
-#else
- res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packuswb.128", src_vec_type, lo, hi);
-#endif
- break;
-
- default:
- assert(0);
- return LLVMGetUndef(dst_vec_type);
+ if (dst_type.sign) {
+ intrinsic = "llvm.x86.sse2.packsswb.128";
+ }
+ else {
+ intrinsic = "llvm.x86.sse2.packuswb.128";
+ }
break;
+ /* default uses generic shuffle below */
}
-
- if (res) {
- res = LLVMBuildBitCast(builder, res, dst_vec_type, "");
+ if (intrinsic) {
+ if (src_type.width * src_type.length == 128) {
+ LLVMTypeRef intr_vec_type = lp_build_vec_type(gallivm, intr_type);
+ res = lp_build_intrinsic_binary(builder, intrinsic, intr_vec_type, lo, hi);
+ if (dst_vec_type != intr_vec_type) {
+ res = LLVMBuildBitCast(builder, res, dst_vec_type, "");
+ }
+ }
+ else {
+ int num_split = src_type.width * src_type.length / 128;
+ int i;
+ int nlen = 128 / src_type.width;
+ struct lp_type ndst_type = lp_type_unorm(dst_type.width, 128);
+ struct lp_type nintr_type = lp_type_unorm(intr_type.width, 128);
+ LLVMValueRef tmpres[LP_MAX_VECTOR_WIDTH / 128];
+ LLVMValueRef tmplo, tmphi;
+ LLVMTypeRef ndst_vec_type = lp_build_vec_type(gallivm, ndst_type);
+ LLVMTypeRef nintr_vec_type = lp_build_vec_type(gallivm, nintr_type);
+
+ assert(num_split <= LP_MAX_VECTOR_WIDTH / 128);
+
+ for (i = 0; i < num_split / 2; i++) {
+ tmplo = lp_build_extract_range(gallivm,
+ lo, i*nlen*2, nlen);
+ tmphi = lp_build_extract_range(gallivm,
+ lo, i*nlen*2 + nlen, nlen);
+ tmpres[i] = lp_build_intrinsic_binary(builder, intrinsic,
+ nintr_vec_type, tmplo, tmphi);
+ if (ndst_vec_type != nintr_vec_type) {
+ tmpres[i] = LLVMBuildBitCast(builder, tmpres[i], ndst_vec_type, "");
+ }
+ }
+ for (i = 0; i < num_split / 2; i++) {
+ tmplo = lp_build_extract_range(gallivm,
+ hi, i*nlen*2, nlen);
+ tmphi = lp_build_extract_range(gallivm,
+ hi, i*nlen*2 + nlen, nlen);
+ tmpres[i+num_split/2] = lp_build_intrinsic_binary(builder, intrinsic,
+ nintr_vec_type,
+ tmplo, tmphi);
+ if (ndst_vec_type != nintr_vec_type) {
+ tmpres[i+num_split/2] = LLVMBuildBitCast(builder, tmpres[i+num_split/2],
+ ndst_vec_type, "");
+ }
+ }
+ res = lp_build_concat(gallivm, tmpres, ndst_type, num_split);
+ }
return res;
}
}
@@ -357,8 +515,9 @@ lp_build_packs2(struct gallivm_state *gallivm,
/* All X86 SSE non-interleaved pack instructions take signed inputs and
* saturate them, so no need to clamp for those cases. */
if(util_cpu_caps.has_sse2 &&
- src_type.width * src_type.length == 128 &&
- src_type.sign)
+ src_type.width * src_type.length >= 128 &&
+ src_type.sign &&
+ (src_type.width == 32 || src_type.width == 16))
clamp = FALSE;
if(clamp) {
@@ -395,7 +554,6 @@ lp_build_pack(struct gallivm_state *gallivm,
LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
unsigned i;
-
/* Register width must remain constant */
assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
@@ -487,21 +645,44 @@ lp_build_resize(struct gallivm_state *gallivm,
/*
* Register width remains constant -- use vector packing intrinsics
*/
-
tmp[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, src, num_srcs);
}
else {
- /*
- * Do it element-wise.
- */
-
- assert(src_type.length == dst_type.length);
- tmp[0] = lp_build_undef(gallivm, dst_type);
- for (i = 0; i < dst_type.length; ++i) {
- LLVMValueRef index = lp_build_const_int32(gallivm, i);
- LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], index, "");
- val = LLVMBuildTrunc(builder, val, lp_build_elem_type(gallivm, dst_type), "");
- tmp[0] = LLVMBuildInsertElement(builder, tmp[0], val, index, "");
+ if (src_type.width / dst_type.width > num_srcs) {
+ /*
+ * First change the src vectors' size (with a shuffle) so they have the
+ * same size as the destination vector, then pack normally.
+ * Note: cannot use cast/extract because llvm generates atrocious code.
+ */
+ unsigned size_ratio = (src_type.width * src_type.length) /
+ (dst_type.length * dst_type.width);
+ unsigned new_length = src_type.length / size_ratio;
+
+ for (i = 0; i < size_ratio * num_srcs; i++) {
+ unsigned start_index = (i % size_ratio) * new_length;
+ tmp[i] = lp_build_extract_range(gallivm, src[i / size_ratio],
+ start_index, new_length);
+ }
+ num_srcs *= size_ratio;
+ src_type.length = new_length;
+ tmp[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, tmp, num_srcs);
+ }
+ else {
+ /*
+ * Truncate bit width but expand vector size - first pack
+ * then expand simply because this should be more AVX-friendly
+ * for the cases we probably hit.
+ */
+ unsigned size_ratio = (dst_type.width * dst_type.length) /
+ (src_type.length * src_type.width);
+ unsigned num_pack_srcs = num_srcs / size_ratio;
+ dst_type.length = dst_type.length / size_ratio;
+
+ for (i = 0; i < size_ratio; i++) {
+ tmp[i] = lp_build_pack(gallivm, src_type, dst_type, TRUE,
+ &src[i*num_pack_srcs], num_pack_srcs);
+ }
+ tmp[0] = lp_build_concat(gallivm, tmp, dst_type, size_ratio);
}
}
}
@@ -522,19 +703,24 @@ lp_build_resize(struct gallivm_state *gallivm,
/*
* Do it element-wise.
*/
+ assert(src_type.length * num_srcs == dst_type.length * num_dsts);
+
+ for (i = 0; i < num_dsts; i++) {
+ tmp[i] = lp_build_undef(gallivm, dst_type);
+ }
- assert(src_type.length == dst_type.length);
- tmp[0] = lp_build_undef(gallivm, dst_type);
- for (i = 0; i < dst_type.length; ++i) {
- LLVMValueRef index = lp_build_const_int32(gallivm, i);
- LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], index, "");
+ for (i = 0; i < src_type.length; ++i) {
+ unsigned j = i / dst_type.length;
+ LLVMValueRef srcindex = lp_build_const_int32(gallivm, i);
+ LLVMValueRef dstindex = lp_build_const_int32(gallivm, i % dst_type.length);
+ LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], srcindex, "");
if (src_type.sign && dst_type.sign) {
val = LLVMBuildSExt(builder, val, lp_build_elem_type(gallivm, dst_type), "");
} else {
val = LLVMBuildZExt(builder, val, lp_build_elem_type(gallivm, dst_type), "");
}
- tmp[0] = LLVMBuildInsertElement(builder, tmp[0], val, index, "");
+ tmp[j] = LLVMBuildInsertElement(builder, tmp[j], val, dstindex, "");
}
}
}
@@ -554,3 +740,38 @@ lp_build_resize(struct gallivm_state *gallivm,
}
+/**
+ * Expand the src vector from src_type.length to dst_length elements.
+ */
+LLVMValueRef
+lp_build_pad_vector(struct gallivm_state *gallivm,
+ LLVMValueRef src,
+ struct lp_type src_type,
+ unsigned dst_length)
+{
+ LLVMValueRef undef = LLVMGetUndef(lp_build_vec_type(gallivm, src_type));
+ LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
+ unsigned i;
+
+ assert(dst_length <= Elements(elems));
+ assert(dst_length > src_type.length);
+
+ if (src_type.length == dst_length)
+ return src;
+
+ /* If it's a single scalar, no need to reinvent the wheel */
+ if (src_type.length == 1) {
+ return lp_build_broadcast(gallivm, LLVMVectorType(lp_build_elem_type(gallivm, src_type), dst_length), src);
+ }
+
+ /* All elements from src vector */
+ for (i = 0; i < src_type.length; ++i)
+ elems[i] = lp_build_const_int32(gallivm, i);
+
+ /* Undef fill remaining space */
+ for (i = src_type.length; i < dst_length; ++i)
+ elems[i] = lp_build_const_int32(gallivm, src_type.length);
+
+ /* Combine the two vectors */
+ return LLVMBuildShuffleVector(gallivm->builder, src, undef, LLVMConstVector(elems, dst_length), "");
+}
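
For instance, a hedged sketch of widening a two-channel fetch result (here `rg` is assumed to be a 2 x float value):

   /* Pad 2 x float to 4 x float; the two extra lanes end up undefined. */
   struct lp_type rg_type;
   LLVMValueRef padded;

   memset(&rg_type, 0, sizeof rg_type);
   rg_type.floating = TRUE;
   rg_type.sign = TRUE;
   rg_type.width = 32;
   rg_type.length = 2;

   padded = lp_build_pad_vector(gallivm, rg, rg_type, 4);
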
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.h b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
index d58da4f01b3..73f299cca11 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
@@ -44,6 +44,12 @@
struct lp_type;
+LLVMValueRef
+lp_build_interleave2_half(struct gallivm_state *gallivm,
+ struct lp_type type,
+ LLVMValueRef a,
+ LLVMValueRef b,
+ unsigned lo_hi);
LLVMValueRef
lp_build_interleave2(struct gallivm_state *gallivm,
@@ -69,6 +75,17 @@ lp_build_unpack(struct gallivm_state *gallivm,
LLVMValueRef src,
LLVMValueRef *dst, unsigned num_dsts);
+LLVMValueRef
+lp_build_extract_range(struct gallivm_state *gallivm,
+ LLVMValueRef src,
+ unsigned start,
+ unsigned size);
+
+LLVMValueRef
+lp_build_concat(struct gallivm_state *gallivm,
+ LLVMValueRef src[],
+ struct lp_type src_type,
+ unsigned num_vectors);
LLVMValueRef
lp_build_packs2(struct gallivm_state *gallivm,
@@ -102,4 +119,10 @@ lp_build_resize(struct gallivm_state *gallivm,
LLVMValueRef *dst, unsigned num_dsts);
+LLVMValueRef
+lp_build_pad_vector(struct gallivm_state *gallivm,
+ LLVMValueRef src,
+ struct lp_type src_type,
+ unsigned dst_length);
+
#endif /* !LP_BLD_PACK_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_quad.c b/src/gallium/auxiliary/gallivm/lp_bld_quad.c
index b0a5bc0267f..b1ba7c72655 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_quad.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_quad.c
@@ -26,6 +26,7 @@
**************************************************************************/
+#include "u_cpu_detect.h"
#include "lp_bld_type.h"
#include "lp_bld_arit.h"
#include "lp_bld_const.h"
@@ -77,34 +78,82 @@ lp_build_ddy(struct lp_build_context *bld,
return lp_build_sub(bld, a_bottom, a_top);
}
-
+/*
+ * To be able to handle multiple quads at once in texture sampling and
+ * do lod calculations per quad, it is necessary to get the per-quad
+ * derivatives into the lp_build_rho function.
+ * For 8-wide vectors the packed derivative values for 3 coords would
+ * look like this (this scales to an arbitrary, multiple-of-4, vector size):
+ * ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy
+ * dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____
+ * The second vector will be unused for 1d and 2d textures.
+ */
LLVMValueRef
-lp_build_scalar_ddx(struct lp_build_context *bld,
- LLVMValueRef a)
+lp_build_packed_ddx_ddy_onecoord(struct lp_build_context *bld,
+ LLVMValueRef a)
{
- LLVMBuilderRef builder = bld->gallivm->builder;
- LLVMValueRef idx_left = lp_build_const_int32(bld->gallivm, LP_BLD_QUAD_TOP_LEFT);
- LLVMValueRef idx_right = lp_build_const_int32(bld->gallivm, LP_BLD_QUAD_TOP_RIGHT);
- LLVMValueRef a_left = LLVMBuildExtractElement(builder, a, idx_left, "left");
- LLVMValueRef a_right = LLVMBuildExtractElement(builder, a, idx_right, "right");
+ struct gallivm_state *gallivm = bld->gallivm;
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef vec1, vec2;
+
+ /* same packing as _twocoord, but can use aos swizzle helper */
+
+ /*
+ * XXX could make swizzle1 a no-op swizzle by using the right top/bottom
+ * pair for ddy
+ */
+ static const unsigned char swizzle1[] = {
+ LP_BLD_QUAD_TOP_LEFT, LP_BLD_QUAD_TOP_LEFT,
+ LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+ };
+ static const unsigned char swizzle2[] = {
+ LP_BLD_QUAD_TOP_RIGHT, LP_BLD_QUAD_BOTTOM_LEFT,
+ LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+ };
+
+ vec1 = lp_build_swizzle_aos(bld, a, swizzle1);
+ vec2 = lp_build_swizzle_aos(bld, a, swizzle2);
+
if (bld->type.floating)
- return LLVMBuildFSub(builder, a_right, a_left, "ddx");
+ return LLVMBuildFSub(builder, vec2, vec1, "ddxddy");
else
- return LLVMBuildSub(builder, a_right, a_left, "ddx");
+ return LLVMBuildSub(builder, vec2, vec1, "ddxddy");
}
LLVMValueRef
-lp_build_scalar_ddy(struct lp_build_context *bld,
- LLVMValueRef a)
+lp_build_packed_ddx_ddy_twocoord(struct lp_build_context *bld,
+ LLVMValueRef a, LLVMValueRef b)
{
- LLVMBuilderRef builder = bld->gallivm->builder;
- LLVMValueRef idx_top = lp_build_const_int32(bld->gallivm, LP_BLD_QUAD_TOP_LEFT);
- LLVMValueRef idx_bottom = lp_build_const_int32(bld->gallivm, LP_BLD_QUAD_BOTTOM_LEFT);
- LLVMValueRef a_top = LLVMBuildExtractElement(builder, a, idx_top, "top");
- LLVMValueRef a_bottom = LLVMBuildExtractElement(builder, a, idx_bottom, "bottom");
+ struct gallivm_state *gallivm = bld->gallivm;
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH/4];
+ LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH/4];
+ LLVMValueRef vec1, vec2;
+ unsigned length, num_quads, i;
+
+ /* XXX: do hsub version */
+ length = bld->type.length;
+ num_quads = length / 4;
+ for (i = 0; i < num_quads; i++) {
+ unsigned s1 = 4 * i;
+ unsigned s2 = 4 * i + length;
+ shuffles1[4*i + 0] = lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_LEFT + s1);
+ shuffles1[4*i + 1] = lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_LEFT + s1);
+ shuffles1[4*i + 2] = lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_LEFT + s2);
+ shuffles1[4*i + 3] = lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_LEFT + s2);
+ shuffles2[4*i + 0] = lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_RIGHT + s1);
+ shuffles2[4*i + 1] = lp_build_const_int32(gallivm, LP_BLD_QUAD_BOTTOM_LEFT + s1);
+ shuffles2[4*i + 2] = lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_RIGHT + s2);
+ shuffles2[4*i + 3] = lp_build_const_int32(gallivm, LP_BLD_QUAD_BOTTOM_LEFT + s2);
+ }
+ vec1 = LLVMBuildShuffleVector(builder, a, b,
+ LLVMConstVector(shuffles1, length), "");
+ vec2 = LLVMBuildShuffleVector(builder, a, b,
+ LLVMConstVector(shuffles2, length), "");
if (bld->type.floating)
- return LLVMBuildFSub(builder, a_bottom, a_top, "ddy");
+ return LLVMBuildFSub(builder, vec2, vec1, "ddxddyddxddy");
else
- return LLVMBuildSub(builder, a_bottom, a_top, "ddy");
+ return LLVMBuildSub(builder, vec2, vec1, "ddxddyddxddy");
}
+
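
For reference, a minimal scalar sketch (not part of the patch) of what one quad's packed derivatives contain, assuming the LP_BLD_QUAD_* ordering top-left=0, top-right=1, bottom-left=2:

    /* Scalar sketch of the per-quad finite differences the packed helpers
     * compute; q[] holds one coordinate's values for a 2x2 quad in
     * top-left, top-right, bottom-left, bottom-right order. */
    #include <stdio.h>

    static void quad_ddx_ddy(const float q[4], float *ddx, float *ddy)
    {
       *ddx = q[1] - q[0];   /* top-right   - top-left */
       *ddy = q[2] - q[0];   /* bottom-left - top-left */
    }

    int main(void)
    {
       const float s[4] = { 0.10f, 0.20f, 0.10f, 0.20f };
       float dsdx, dsdy;
       quad_ddx_ddy(s, &dsdx, &dsdy);
       printf("dsdx=%.2f dsdy=%.2f\n", dsdx, dsdy);   /* 0.10, 0.00 */
       return 0;
    }

The _twocoord variant packs { dsdx, dsdy, dtdx, dtdy } per quad into one vector, so both coordinates' derivatives fall out of a single vector subtraction.
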
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_quad.h b/src/gallium/auxiliary/gallivm/lp_bld_quad.h
index b7992912927..be6a1efc396 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_quad.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_quad.h
@@ -78,19 +78,15 @@ lp_build_ddy(struct lp_build_context *bld,
/*
- * Scalar derivatives.
- *
- * Same as getting the first value of above.
+ * Packed derivatives (one derivative for each direction per quad)
*/
-
LLVMValueRef
-lp_build_scalar_ddx(struct lp_build_context *bld,
- LLVMValueRef a);
-
+lp_build_packed_ddx_ddy_twocoord(struct lp_build_context *bld,
+ LLVMValueRef a, LLVMValueRef b);
LLVMValueRef
-lp_build_scalar_ddy(struct lp_build_context *bld,
- LLVMValueRef a);
+lp_build_packed_ddx_ddy_onecoord(struct lp_build_context *bld,
+ LLVMValueRef a);
#endif /* LP_BLD_QUAD_H_ */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index d966788d74e..85211161f3c 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -44,6 +44,8 @@
#include "lp_bld_sample.h"
#include "lp_bld_swizzle.h"
#include "lp_bld_type.h"
+#include "lp_bld_logic.h"
+#include "lp_bld_pack.h"
/*
@@ -175,67 +177,89 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
/**
* Generate code to compute coordinate gradient (rho).
- * \param ddx partial derivatives of (s, t, r, q) with respect to X
- * \param ddy partial derivatives of (s, t, r, q) with respect to Y
+ * \param derivs partial derivatives of (s, t, r, q) with respect to X and Y
*
- * XXX: The resulting rho is scalar, so we ignore all but the first element of
- * derivatives that are passed by the shader.
+ * The resulting rho is scalar per quad.
*/
static LLVMValueRef
lp_build_rho(struct lp_build_sample_context *bld,
unsigned unit,
- const LLVMValueRef ddx[4],
- const LLVMValueRef ddy[4])
+ const struct lp_derivatives *derivs)
{
+ struct gallivm_state *gallivm = bld->gallivm;
struct lp_build_context *int_size_bld = &bld->int_size_bld;
struct lp_build_context *float_size_bld = &bld->float_size_bld;
struct lp_build_context *float_bld = &bld->float_bld;
+ struct lp_build_context *coord_bld = &bld->coord_bld;
+ struct lp_build_context *perquadf_bld = &bld->perquadf_bld;
+ const LLVMValueRef *ddx_ddy = derivs->ddx_ddy;
const unsigned dims = bld->dims;
LLVMBuilderRef builder = bld->gallivm->builder;
LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
LLVMValueRef index1 = LLVMConstInt(i32t, 1, 0);
LLVMValueRef index2 = LLVMConstInt(i32t, 2, 0);
- LLVMValueRef dsdx, dsdy, dtdx, dtdy, drdx, drdy;
- LLVMValueRef rho_x, rho_y;
LLVMValueRef rho_vec;
LLVMValueRef int_size, float_size;
LLVMValueRef rho;
LLVMValueRef first_level, first_level_vec;
+ LLVMValueRef abs_ddx_ddy[2];
+ unsigned length = coord_bld->type.length;
+ unsigned num_quads = length / 4;
+ unsigned i;
+ LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
+ LLVMValueRef rho_xvec, rho_yvec;
+
+ abs_ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]);
+ if (dims > 2) {
+ abs_ddx_ddy[1] = lp_build_abs(coord_bld, ddx_ddy[1]);
+ }
- dsdx = ddx[0];
- dsdy = ddy[0];
-
- if (dims <= 1) {
- rho_x = dsdx;
- rho_y = dsdy;
+ if (dims == 1) {
+ static const unsigned char swizzle1[] = {
+ 0, LP_BLD_SWIZZLE_DONTCARE,
+ LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+ };
+ static const unsigned char swizzle2[] = {
+ 1, LP_BLD_SWIZZLE_DONTCARE,
+ LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+ };
+ rho_xvec = lp_build_swizzle_aos(coord_bld, abs_ddx_ddy[0], swizzle1);
+ rho_yvec = lp_build_swizzle_aos(coord_bld, abs_ddx_ddy[0], swizzle2);
+ }
+ else if (dims == 2) {
+ static const unsigned char swizzle1[] = {
+ 0, 2,
+ LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+ };
+ static const unsigned char swizzle2[] = {
+ 1, 3,
+ LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+ };
+ rho_xvec = lp_build_swizzle_aos(coord_bld, abs_ddx_ddy[0], swizzle1);
+ rho_yvec = lp_build_swizzle_aos(coord_bld, abs_ddx_ddy[0], swizzle2);
}
else {
- rho_x = float_size_bld->undef;
- rho_y = float_size_bld->undef;
-
- rho_x = LLVMBuildInsertElement(builder, rho_x, dsdx, index0, "");
- rho_y = LLVMBuildInsertElement(builder, rho_y, dsdy, index0, "");
-
- dtdx = ddx[1];
- dtdy = ddy[1];
-
- rho_x = LLVMBuildInsertElement(builder, rho_x, dtdx, index1, "");
- rho_y = LLVMBuildInsertElement(builder, rho_y, dtdy, index1, "");
-
- if (dims >= 3) {
- drdx = ddx[2];
- drdy = ddy[2];
-
- rho_x = LLVMBuildInsertElement(builder, rho_x, drdx, index2, "");
- rho_y = LLVMBuildInsertElement(builder, rho_y, drdy, index2, "");
+ LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH];
+ LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH];
+ assert(dims == 3);
+ for (i = 0; i < num_quads; i++) {
+ shuffles1[4*i + 0] = lp_build_const_int32(gallivm, 4*i);
+ shuffles1[4*i + 1] = lp_build_const_int32(gallivm, 4*i + 2);
+ shuffles1[4*i + 2] = lp_build_const_int32(gallivm, length + 4*i);
+ shuffles1[4*i + 3] = i32undef;
+ shuffles2[4*i + 0] = lp_build_const_int32(gallivm, 4*i + 1);
+ shuffles2[4*i + 1] = lp_build_const_int32(gallivm, 4*i + 3);
+ shuffles2[4*i + 2] = lp_build_const_int32(gallivm, length + 4*i + 1);
+ shuffles2[4*i + 3] = i32undef;
}
+ rho_xvec = LLVMBuildShuffleVector(builder, abs_ddx_ddy[0], abs_ddx_ddy[1],
+ LLVMConstVector(shuffles1, length), "");
+ rho_yvec = LLVMBuildShuffleVector(builder, abs_ddx_ddy[0], abs_ddx_ddy[1],
+ LLVMConstVector(shuffles2, length), "");
}
- rho_x = lp_build_abs(float_size_bld, rho_x);
- rho_y = lp_build_abs(float_size_bld, rho_y);
-
- rho_vec = lp_build_max(float_size_bld, rho_x, rho_y);
+ rho_vec = lp_build_max(coord_bld, rho_xvec, rho_yvec);
first_level = bld->dynamic_state->first_level(bld->dynamic_state,
bld->gallivm, unit);
@@ -243,22 +267,77 @@ lp_build_rho(struct lp_build_sample_context *bld,
int_size = lp_build_minify(int_size_bld, bld->int_size, first_level_vec);
float_size = lp_build_int_to_float(float_size_bld, int_size);
- rho_vec = lp_build_mul(float_size_bld, rho_vec, float_size);
+ if (bld->coord_type.length > 4) {
+ /* expand size to each quad */
+ if (dims > 1) {
+ /* could use some broadcast_vector helper for this? */
+ int num_quads = bld->coord_type.length / 4;
+ LLVMValueRef src[LP_MAX_VECTOR_LENGTH/4];
+ for (i = 0; i < num_quads; i++) {
+ src[i] = float_size;
+ }
+ float_size = lp_build_concat(bld->gallivm, src, float_size_bld->type, num_quads);
+ }
+ else {
+ float_size = lp_build_broadcast_scalar(coord_bld, float_size);
+ }
+ rho_vec = lp_build_mul(coord_bld, rho_vec, float_size);
- if (dims <= 1) {
- rho = rho_vec;
+ if (dims <= 1) {
+ rho = rho_vec;
+ }
+ else {
+ if (dims >= 2) {
+ static const unsigned char swizzle1[] = {
+ 0, LP_BLD_SWIZZLE_DONTCARE,
+ LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+ };
+ static const unsigned char swizzle2[] = {
+ 1, LP_BLD_SWIZZLE_DONTCARE,
+ LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+ };
+ LLVMValueRef rho_s, rho_t, rho_r;
+
+ rho_s = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);
+ rho_t = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle2);
+
+ rho = lp_build_max(coord_bld, rho_s, rho_t);
+
+ if (dims >= 3) {
+ static const unsigned char swizzle3[] = {
+ 2, LP_BLD_SWIZZLE_DONTCARE,
+ LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+ };
+ rho_r = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle3);
+ rho = lp_build_max(coord_bld, rho, rho_r);
+ }
+ }
+ }
+ rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
+ perquadf_bld->type, rho);
}
else {
- if (dims >= 2) {
- LLVMValueRef rho_s, rho_t, rho_r;
+ if (dims <= 1) {
+ rho_vec = LLVMBuildExtractElement(builder, rho_vec, index0, "");
+ }
+ rho_vec = lp_build_mul(float_size_bld, rho_vec, float_size);
- rho_s = LLVMBuildExtractElement(builder, rho_vec, index0, "");
- rho_t = LLVMBuildExtractElement(builder, rho_vec, index1, "");
+ if (dims <= 1) {
+ rho = rho_vec;
+ }
+ else {
+ if (dims >= 2) {
+ LLVMValueRef rho_s, rho_t, rho_r;
+
+ rho_s = LLVMBuildExtractElement(builder, rho_vec, index0, "");
+ rho_t = LLVMBuildExtractElement(builder, rho_vec, index1, "");
- rho = lp_build_max(float_bld, rho_s, rho_t);
- if (dims >= 3) {
- rho_r = LLVMBuildExtractElement(builder, rho_vec, index2, "");
- rho = lp_build_max(float_bld, rho, rho_r);
+ rho = lp_build_max(float_bld, rho_s, rho_t);
+
+ if (dims >= 3) {
+ rho_r = LLVMBuildExtractElement(builder, rho_vec, index2, "");
+ rho = lp_build_max(float_bld, rho, rho_r);
+ }
}
}
}
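
Conceptually, the vectorized code above reduces per quad to the following scalar computation (a sketch assuming a 2D texture, not the generated IR):

    #include <math.h>

    /* Per-quad rho for a 2D texture: the larger of |ds/dx| and |ds/dy|
     * scaled by the level-0 width, the larger of |dt/dx| and |dt/dy|
     * scaled by the height, then the maximum of the two. */
    static float rho_2d(float dsdx, float dsdy, float dtdx, float dtdy,
                        float width, float height)
    {
       float rho_s = fmaxf(fabsf(dsdx), fabsf(dsdy)) * width;
       float rho_t = fmaxf(fabsf(dtdx), fabsf(dtdy)) * height;
       return fmaxf(rho_s, rho_t);
    }

The >4-wide path keeps one such value per quad (packed via lp_build_pack_aos_scalars) instead of a single scalar for the whole vector.
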
@@ -396,22 +475,20 @@ lp_build_brilinear_rho(struct lp_build_context *bld,
/**
* Generate code to compute texture level of detail (lambda).
- * \param ddx partial derivatives of (s, t, r, q) with respect to X
- * \param ddy partial derivatives of (s, t, r, q) with respect to Y
+ * \param derivs partial derivatives of (s, t, r, q) with respect to X and Y
* \param lod_bias optional float vector with the shader lod bias
* \param explicit_lod optional float vector with the explicit lod
* \param width scalar int texture width
* \param height scalar int texture height
* \param depth scalar int texture depth
*
- * XXX: The resulting lod is scalar, so ignore all but the first element of
- * derivatives, lod_bias, etc that are passed by the shader.
+ * The resulting lod is scalar per quad, so only the first value per quad
+ * passed in via lod_bias or explicit_lod is used.
*/
void
lp_build_lod_selector(struct lp_build_sample_context *bld,
unsigned unit,
- const LLVMValueRef ddx[4],
- const LLVMValueRef ddy[4],
+ const struct lp_derivatives *derivs,
LLVMValueRef lod_bias, /* optional */
LLVMValueRef explicit_lod, /* optional */
unsigned mip_filter,
@@ -420,11 +497,11 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
{
LLVMBuilderRef builder = bld->gallivm->builder;
- struct lp_build_context *float_bld = &bld->float_bld;
+ struct lp_build_context *perquadf_bld = &bld->perquadf_bld;
LLVMValueRef lod;
- *out_lod_ipart = bld->int_bld.zero;
- *out_lod_fpart = bld->float_bld.zero;
+ *out_lod_ipart = bld->perquadi_bld.zero;
+ *out_lod_fpart = perquadf_bld->zero;
if (bld->static_state->min_max_lod_equal) {
/* User is forcing sampling from a particular mipmap level.
@@ -433,21 +510,17 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
LLVMValueRef min_lod =
bld->dynamic_state->min_lod(bld->dynamic_state, bld->gallivm, unit);
- lod = min_lod;
+ lod = lp_build_broadcast_scalar(perquadf_bld, min_lod);
}
else {
- LLVMValueRef sampler_lod_bias =
- bld->dynamic_state->lod_bias(bld->dynamic_state, bld->gallivm, unit);
- LLVMValueRef index0 = lp_build_const_int32(bld->gallivm, 0);
-
if (explicit_lod) {
- lod = LLVMBuildExtractElement(builder, explicit_lod,
- index0, "");
+ lod = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
+ perquadf_bld->type, explicit_lod);
}
else {
LLVMValueRef rho;
- rho = lp_build_rho(bld, unit, ddx, ddy);
+ rho = lp_build_rho(bld, unit, derivs);
/*
* Compute lod = log2(rho)
@@ -465,66 +538,72 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
if (mip_filter == PIPE_TEX_MIPFILTER_NONE ||
mip_filter == PIPE_TEX_MIPFILTER_NEAREST) {
- *out_lod_ipart = lp_build_ilog2(float_bld, rho);
- *out_lod_fpart = bld->float_bld.zero;
+ *out_lod_ipart = lp_build_ilog2(perquadf_bld, rho);
+ *out_lod_fpart = perquadf_bld->zero;
return;
}
if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR &&
!(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) {
- lp_build_brilinear_rho(float_bld, rho, BRILINEAR_FACTOR,
+ lp_build_brilinear_rho(perquadf_bld, rho, BRILINEAR_FACTOR,
out_lod_ipart, out_lod_fpart);
return;
}
}
if (0) {
- lod = lp_build_log2(float_bld, rho);
+ lod = lp_build_log2(perquadf_bld, rho);
}
else {
- lod = lp_build_fast_log2(float_bld, rho);
+ lod = lp_build_fast_log2(perquadf_bld, rho);
}
/* add shader lod bias */
if (lod_bias) {
- lod_bias = LLVMBuildExtractElement(builder, lod_bias,
- index0, "");
+ lod_bias = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
+ perquadf_bld->type, lod_bias);
lod = LLVMBuildFAdd(builder, lod, lod_bias, "shader_lod_bias");
}
}
/* add sampler lod bias */
- if (bld->static_state->lod_bias_non_zero)
+ if (bld->static_state->lod_bias_non_zero) {
+ LLVMValueRef sampler_lod_bias =
+ bld->dynamic_state->lod_bias(bld->dynamic_state, bld->gallivm, unit);
+ sampler_lod_bias = lp_build_broadcast_scalar(perquadf_bld,
+ sampler_lod_bias);
lod = LLVMBuildFAdd(builder, lod, sampler_lod_bias, "sampler_lod_bias");
-
+ }
/* clamp lod */
if (bld->static_state->apply_max_lod) {
LLVMValueRef max_lod =
bld->dynamic_state->max_lod(bld->dynamic_state, bld->gallivm, unit);
+ max_lod = lp_build_broadcast_scalar(perquadf_bld, max_lod);
- lod = lp_build_min(float_bld, lod, max_lod);
+ lod = lp_build_min(perquadf_bld, lod, max_lod);
}
if (bld->static_state->apply_min_lod) {
LLVMValueRef min_lod =
bld->dynamic_state->min_lod(bld->dynamic_state, bld->gallivm, unit);
+ min_lod = lp_build_broadcast_scalar(perquadf_bld, min_lod);
- lod = lp_build_max(float_bld, lod, min_lod);
+ lod = lp_build_max(perquadf_bld, lod, min_lod);
}
}
if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
if (!(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) {
- lp_build_brilinear_lod(float_bld, lod, BRILINEAR_FACTOR,
+ lp_build_brilinear_lod(perquadf_bld, lod, BRILINEAR_FACTOR,
out_lod_ipart, out_lod_fpart);
}
else {
- lp_build_ifloor_fract(float_bld, lod, out_lod_ipart, out_lod_fpart);
+ lp_build_ifloor_fract(perquadf_bld, lod, out_lod_ipart, out_lod_fpart);
}
lp_build_name(*out_lod_fpart, "lod_fpart");
}
else {
- *out_lod_ipart = lp_build_iround(float_bld, lod);
+ *out_lod_ipart = lp_build_iround(perquadf_bld, lod);
}
lp_build_name(*out_lod_ipart, "lod_ipart");
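
Ignoring the brilinear optimization and the explicit-lod path, the per-quad LOD selection above amounts to this scalar sketch (hypothetical helper, not the generated IR):

    #include <math.h>

    static void lod_select(float rho, float shader_bias, float sampler_bias,
                           float min_lod, float max_lod,
                           int *lod_ipart, float *lod_fpart)
    {
       float lod = log2f(rho) + shader_bias + sampler_bias;
       /* clamp lod to [min_lod, max_lod] */
       if (lod > max_lod) lod = max_lod;
       if (lod < min_lod) lod = min_lod;
       /* split into integer and fractional parts for mip interpolation */
       *lod_ipart = (int)floorf(lod);
       *lod_fpart = lod - floorf(lod);
    }

With the per-quad types this now runs on perquadf_bld/perquadi_bld vectors, i.e. one lane per quad rather than one scalar for all quads; for PIPE_TEX_MIPFILTER_NEAREST the code rounds the lod instead and ignores the fraction.
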
@@ -536,8 +615,8 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
/**
* For PIPE_TEX_MIPFILTER_NEAREST, convert float LOD to integer
* mipmap level index.
- * Note: this is all scalar code.
- * \param lod scalar float texture level of detail
+ * Note: this is all per-quad scalar code.
+ * \param lod_ipart int texture level of detail
* \param level_out returns integer
*/
void
@@ -546,26 +625,27 @@ lp_build_nearest_mip_level(struct lp_build_sample_context *bld,
LLVMValueRef lod_ipart,
LLVMValueRef *level_out)
{
- struct lp_build_context *int_bld = &bld->int_bld;
+ struct lp_build_context *perquadi_bld = &bld->perquadi_bld;
LLVMValueRef first_level, last_level, level;
first_level = bld->dynamic_state->first_level(bld->dynamic_state,
bld->gallivm, unit);
last_level = bld->dynamic_state->last_level(bld->dynamic_state,
bld->gallivm, unit);
+ first_level = lp_build_broadcast_scalar(perquadi_bld, first_level);
+ last_level = lp_build_broadcast_scalar(perquadi_bld, last_level);
- /* convert float lod to integer */
- level = lp_build_add(int_bld, lod_ipart, first_level);
+ level = lp_build_add(perquadi_bld, lod_ipart, first_level);
/* clamp level to legal range of levels */
- *level_out = lp_build_clamp(int_bld, level, first_level, last_level);
+ *level_out = lp_build_clamp(perquadi_bld, level, first_level, last_level);
}
/**
- * For PIPE_TEX_MIPFILTER_LINEAR, convert float LOD to integer to
- * two (adjacent) mipmap level indexes. Later, we'll sample from those
- * two mipmap levels and interpolate between them.
+ * For PIPE_TEX_MIPFILTER_LINEAR, convert per-quad int LOD(s) to two (per-quad)
+ * (adjacent) mipmap level indexes, and fix up float lod part accordingly.
+ * Later, we'll sample from those two mipmap levels and interpolate between them.
*/
void
lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
@@ -576,20 +656,21 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
LLVMValueRef *level1_out)
{
LLVMBuilderRef builder = bld->gallivm->builder;
- struct lp_build_context *int_bld = &bld->int_bld;
- struct lp_build_context *float_bld = &bld->float_bld;
+ struct lp_build_context *perquadi_bld = &bld->perquadi_bld;
+ struct lp_build_context *perquadf_bld = &bld->perquadf_bld;
LLVMValueRef first_level, last_level;
LLVMValueRef clamp_min;
LLVMValueRef clamp_max;
first_level = bld->dynamic_state->first_level(bld->dynamic_state,
bld->gallivm, unit);
-
- *level0_out = lp_build_add(int_bld, lod_ipart, first_level);
- *level1_out = lp_build_add(int_bld, *level0_out, int_bld->one);
-
last_level = bld->dynamic_state->last_level(bld->dynamic_state,
bld->gallivm, unit);
+ first_level = lp_build_broadcast_scalar(perquadi_bld, first_level);
+ last_level = lp_build_broadcast_scalar(perquadi_bld, last_level);
+
+ *level0_out = lp_build_add(perquadi_bld, lod_ipart, first_level);
+ *level1_out = lp_build_add(perquadi_bld, *level0_out, perquadi_bld->one);
/*
* Clamp both *level0_out and *level1_out to [first_level, last_level], with
@@ -597,6 +678,15 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
* ends in the process.
*/
+ /*
+    * This code (the vector select in particular) only works with llvm 3.1
+    * or later if there's more than one quad (with the x86 backend). Might
+    * consider converting to our lp_bld_logic helpers.
+ */
+#if HAVE_LLVM < 0x0301
+ assert(perquadi_bld->type.length == 1);
+#endif
+
/* *level0_out < first_level */
clamp_min = LLVMBuildICmp(builder, LLVMIntSLT,
*level0_out, first_level,
@@ -609,7 +699,7 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
first_level, *level1_out, "");
*lod_fpart_inout = LLVMBuildSelect(builder, clamp_min,
- float_bld->zero, *lod_fpart_inout, "");
+ perquadf_bld->zero, *lod_fpart_inout, "");
/* *level0_out >= last_level */
clamp_max = LLVMBuildICmp(builder, LLVMIntSGE,
@@ -623,7 +713,7 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
last_level, *level1_out, "");
*lod_fpart_inout = LLVMBuildSelect(builder, clamp_max,
- float_bld->zero, *lod_fpart_inout, "");
+ perquadf_bld->zero, *lod_fpart_inout, "");
lp_build_name(*level0_out, "sampler%u_miplevel0", unit);
lp_build_name(*level1_out, "sampler%u_miplevel1", unit);
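
The clamping with selects above follows this scalar pattern (a sketch with hypothetical names): when the unclamped level falls outside [first_level, last_level], both levels collapse to the same mipmap and the fractional weight is zeroed so the later lerp becomes a no-op.

    static void clamp_mip_levels(int lod_ipart, int first, int last,
                                 int *level0, int *level1, float *lod_fpart)
    {
       *level0 = lod_ipart + first;
       *level1 = *level0 + 1;

       if (*level0 < first) {          /* below the base level */
          *level0 = first;
          *level1 = first;
          *lod_fpart = 0.0f;
       }
       if (*level0 >= last) {          /* at or beyond the last level */
          *level0 = last;
          *level1 = last;
          *lod_fpart = 0.0f;
       }
    }
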
@@ -651,15 +741,6 @@ lp_build_get_mipmap_level(struct lp_build_sample_context *bld,
}
-LLVMValueRef
-lp_build_get_const_mipmap_level(struct lp_build_sample_context *bld,
- int level)
-{
- LLVMValueRef lvl = lp_build_const_int32(bld->gallivm, level);
- return lp_build_get_mipmap_level(bld, lvl);
-}
-
-
/**
* Codegen equivalent for u_minify().
* Return max(1, base_size >> level);
@@ -748,8 +829,7 @@ lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
* bld->int_size_type or bld->float_size_type)
* @param coord_type type of the texture size vector (either
* bld->int_coord_type or bld->coord_type)
- * @param int_size vector with the integer texture size (width, height,
- * depth)
+ * @param size vector with the texture size (width, height, depth)
*/
void
lp_build_extract_image_sizes(struct lp_build_sample_context *bld,
@@ -788,7 +868,7 @@ lp_build_extract_image_sizes(struct lp_build_sample_context *bld,
/**
* Unnormalize coords.
*
- * @param int_size vector with the integer texture size (width, height, depth)
+ * @param flt_size vector with the float texture size (width, height, depth)
*/
void
lp_build_unnormalized_coords(struct lp_build_sample_context *bld,
@@ -823,7 +903,18 @@ lp_build_unnormalized_coords(struct lp_build_sample_context *bld,
/** Helper used by lp_build_cube_lookup() */
static LLVMValueRef
-lp_build_cube_ima(struct lp_build_context *coord_bld, LLVMValueRef coord)
+lp_build_cube_imapos(struct lp_build_context *coord_bld, LLVMValueRef coord)
+{
+ /* ima = +0.5 / abs(coord); */
+ LLVMValueRef posHalf = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, 0.5);
+ LLVMValueRef absCoord = lp_build_abs(coord_bld, coord);
+ LLVMValueRef ima = lp_build_div(coord_bld, posHalf, absCoord);
+ return ima;
+}
+
+/** Helper used by lp_build_cube_lookup() */
+static LLVMValueRef
+lp_build_cube_imaneg(struct lp_build_context *coord_bld, LLVMValueRef coord)
{
/* ima = -0.5 / abs(coord); */
LLVMValueRef negHalf = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, -0.5);
@@ -832,9 +923,12 @@ lp_build_cube_ima(struct lp_build_context *coord_bld, LLVMValueRef coord)
return ima;
}
-
/**
* Helper used by lp_build_cube_lookup()
+ * FIXME: the sign here can also be 0.
+ * Arithmetically this could definitely make a difference. Either
+ * fix the comment or use another (simpler) sign function; not sure
+ * which one it should be.
* \param sign scalar +1 or -1
* \param coord float vector
* \param ima float vector
@@ -898,58 +992,186 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
LLVMValueRef *face_s,
LLVMValueRef *face_t)
{
- struct lp_build_context *float_bld = &bld->float_bld;
struct lp_build_context *coord_bld = &bld->coord_bld;
LLVMBuilderRef builder = bld->gallivm->builder;
+ struct gallivm_state *gallivm = bld->gallivm;
LLVMValueRef rx, ry, rz;
- LLVMValueRef arx, ary, arz;
- LLVMValueRef c25 = lp_build_const_float(bld->gallivm, 0.25);
- LLVMValueRef arx_ge_ary, arx_ge_arz;
- LLVMValueRef ary_ge_arx, ary_ge_arz;
- LLVMValueRef arx_ge_ary_arz, ary_ge_arx_arz;
-
- assert(bld->coord_bld.type.length == 4);
+ LLVMValueRef tmp[4], rxyz, arxyz;
/*
* Use the average of the four pixel's texcoords to choose the face.
+ * Slight simplification: just calculate the sum and skip the scaling.
*/
- rx = lp_build_mul(float_bld, c25,
- lp_build_sum_vector(&bld->coord_bld, s));
- ry = lp_build_mul(float_bld, c25,
- lp_build_sum_vector(&bld->coord_bld, t));
- rz = lp_build_mul(float_bld, c25,
- lp_build_sum_vector(&bld->coord_bld, r));
+ tmp[0] = s;
+ tmp[1] = t;
+ tmp[2] = r;
+ rxyz = lp_build_hadd_partial4(&bld->coord_bld, tmp, 3);
+ arxyz = lp_build_abs(&bld->coord_bld, rxyz);
+
+ if (coord_bld->type.length > 4) {
+ struct lp_build_context *cint_bld = &bld->int_coord_bld;
+ struct lp_type intctype = cint_bld->type;
+ LLVMValueRef signrxs, signrys, signrzs, signrxyz, sign;
+ LLVMValueRef arxs, arys, arzs;
+ LLVMValueRef arx_ge_ary, maxarxsarys, arz_ge_arx_ary;
+ LLVMValueRef snewx, tnewx, snewy, tnewy, snewz, tnewz;
+ LLVMValueRef ryneg, rzneg;
+ LLVMValueRef ma, ima;
+ LLVMValueRef posHalf = lp_build_const_vec(gallivm, coord_bld->type, 0.5);
+ LLVMValueRef signmask = lp_build_const_int_vec(gallivm, intctype,
+ 1 << (intctype.width - 1));
+ LLVMValueRef signshift = lp_build_const_int_vec(gallivm, intctype,
+ intctype.width -1);
+ LLVMValueRef facex = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_X);
+ LLVMValueRef facey = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Y);
+ LLVMValueRef facez = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Z);
+
+ assert(PIPE_TEX_FACE_NEG_X == PIPE_TEX_FACE_POS_X + 1);
+ assert(PIPE_TEX_FACE_NEG_Y == PIPE_TEX_FACE_POS_Y + 1);
+ assert(PIPE_TEX_FACE_NEG_Z == PIPE_TEX_FACE_POS_Z + 1);
+
+ rx = LLVMBuildBitCast(builder, s, lp_build_vec_type(gallivm, intctype), "");
+ ry = LLVMBuildBitCast(builder, t, lp_build_vec_type(gallivm, intctype), "");
+ rz = LLVMBuildBitCast(builder, r, lp_build_vec_type(gallivm, intctype), "");
+ ryneg = LLVMBuildXor(builder, ry, signmask, "");
+ rzneg = LLVMBuildXor(builder, rz, signmask, "");
+
+      /* the sign bit comes from the averaged vector (per quad),
+       * as does the decision of which face to use */
+ signrxyz = LLVMBuildBitCast(builder, rxyz, lp_build_vec_type(gallivm, intctype), "");
+ signrxyz = LLVMBuildAnd(builder, signrxyz, signmask, "");
+
+ arxs = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 0);
+ arys = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 1);
+ arzs = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 2);
- arx = lp_build_abs(float_bld, rx);
- ary = lp_build_abs(float_bld, ry);
- arz = lp_build_abs(float_bld, rz);
+ /*
+       * select x if x >= y else select y
+       * select previous result if max(x,y) >= z else select z
+ */
+ arx_ge_ary = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, arxs, arys);
+ maxarxsarys = lp_build_max(coord_bld, arxs, arys);
+ arz_ge_arx_ary = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, maxarxsarys, arzs);
- /*
- * Compare sign/magnitude of rx,ry,rz to determine face
- */
- arx_ge_ary = LLVMBuildFCmp(builder, LLVMRealUGE, arx, ary, "");
- arx_ge_arz = LLVMBuildFCmp(builder, LLVMRealUGE, arx, arz, "");
- ary_ge_arx = LLVMBuildFCmp(builder, LLVMRealUGE, ary, arx, "");
- ary_ge_arz = LLVMBuildFCmp(builder, LLVMRealUGE, ary, arz, "");
+ /*
+ * compute all possible new s/t coords
+ * snewx = signrx * -rz;
+ * tnewx = -ry;
+ * snewy = rx;
+ * tnewy = signry * rz;
+ * snewz = signrz * rx;
+ * tnewz = -ry;
+ */
+ signrxs = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 0);
+ snewx = LLVMBuildXor(builder, signrxs, rzneg, "");
+ tnewx = ryneg;
+
+ signrys = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 1);
+ snewy = rx;
+ tnewy = LLVMBuildXor(builder, signrys, rz, "");
+
+ signrzs = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 2);
+ snewz = LLVMBuildXor(builder, signrzs, rx, "");
+ tnewz = ryneg;
+
+ /* XXX on x86 unclear if we should cast the values back to float
+ * or not - on some cpus (nehalem) pblendvb has twice the throughput
+ * of blendvps though on others there just might be domain
+ * transition penalties when using it (this depends on what llvm
+       * will choose for the bit ops above, so there appears to be no "right way",
+ * but given the boatload of selects let's just use the int type).
+ *
+ * Unfortunately we also need the sign bit of the summed coords.
+ */
+ *face_s = lp_build_select(cint_bld, arx_ge_ary, snewx, snewy);
+ *face_t = lp_build_select(cint_bld, arx_ge_ary, tnewx, tnewy);
+ ma = lp_build_select(coord_bld, arx_ge_ary, s, t);
+ *face = lp_build_select(cint_bld, arx_ge_ary, facex, facey);
+ sign = lp_build_select(cint_bld, arx_ge_ary, signrxs, signrys);
+
+ *face_s = lp_build_select(cint_bld, arz_ge_arx_ary, *face_s, snewz);
+ *face_t = lp_build_select(cint_bld, arz_ge_arx_ary, *face_t, tnewz);
+ ma = lp_build_select(coord_bld, arz_ge_arx_ary, ma, r);
+ *face = lp_build_select(cint_bld, arz_ge_arx_ary, *face, facez);
+ sign = lp_build_select(cint_bld, arz_ge_arx_ary, sign, signrzs);
+
+ *face_s = LLVMBuildBitCast(builder, *face_s,
+ lp_build_vec_type(gallivm, coord_bld->type), "");
+ *face_t = LLVMBuildBitCast(builder, *face_t,
+ lp_build_vec_type(gallivm, coord_bld->type), "");
+
+ /* add +1 for neg face */
+ /* XXX with AVX probably want to use another select here -
+ * as long as we ensure vblendvps gets used we can actually
+ * skip the comparison and just use sign as a "mask" directly.
+ */
+ sign = LLVMBuildLShr(builder, sign, signshift, "");
+ *face = LLVMBuildOr(builder, *face, sign, "face");
- arx_ge_ary_arz = LLVMBuildAnd(builder, arx_ge_ary, arx_ge_arz, "");
- ary_ge_arx_arz = LLVMBuildAnd(builder, ary_ge_arx, ary_ge_arz, "");
+ ima = lp_build_cube_imapos(coord_bld, ma);
+
+ *face_s = lp_build_mul(coord_bld, *face_s, ima);
+ *face_s = lp_build_add(coord_bld, *face_s, posHalf);
+ *face_t = lp_build_mul(coord_bld, *face_t, ima);
+ *face_t = lp_build_add(coord_bld, *face_t, posHalf);
+ }
- {
+ else {
struct lp_build_if_state if_ctx;
LLVMValueRef face_s_var;
LLVMValueRef face_t_var;
LLVMValueRef face_var;
-
- face_s_var = lp_build_alloca(bld->gallivm, bld->coord_bld.vec_type, "face_s_var");
- face_t_var = lp_build_alloca(bld->gallivm, bld->coord_bld.vec_type, "face_t_var");
- face_var = lp_build_alloca(bld->gallivm, bld->int_bld.vec_type, "face_var");
-
- lp_build_if(&if_ctx, bld->gallivm, arx_ge_ary_arz);
+ LLVMValueRef arx_ge_ary_arz, ary_ge_arx_arz;
+ LLVMValueRef shuffles[4];
+ LLVMValueRef arxy_ge_aryx, arxy_ge_arzz, arxy_ge_arxy_arzz;
+ LLVMValueRef arxyxy, aryxzz, arxyxy_ge_aryxzz;
+ struct lp_build_context *float_bld = &bld->float_bld;
+
+ assert(bld->coord_bld.type.length == 4);
+
+ shuffles[0] = lp_build_const_int32(gallivm, 0);
+ shuffles[1] = lp_build_const_int32(gallivm, 1);
+ shuffles[2] = lp_build_const_int32(gallivm, 0);
+ shuffles[3] = lp_build_const_int32(gallivm, 1);
+ arxyxy = LLVMBuildShuffleVector(builder, arxyz, arxyz, LLVMConstVector(shuffles, 4), "");
+ shuffles[0] = lp_build_const_int32(gallivm, 1);
+ shuffles[1] = lp_build_const_int32(gallivm, 0);
+ shuffles[2] = lp_build_const_int32(gallivm, 2);
+ shuffles[3] = lp_build_const_int32(gallivm, 2);
+ aryxzz = LLVMBuildShuffleVector(builder, arxyz, arxyz, LLVMConstVector(shuffles, 4), "");
+ arxyxy_ge_aryxzz = lp_build_cmp(&bld->coord_bld, PIPE_FUNC_GEQUAL, arxyxy, aryxzz);
+
+ shuffles[0] = lp_build_const_int32(gallivm, 0);
+ shuffles[1] = lp_build_const_int32(gallivm, 1);
+ arxy_ge_aryx = LLVMBuildShuffleVector(builder, arxyxy_ge_aryxzz, arxyxy_ge_aryxzz,
+ LLVMConstVector(shuffles, 2), "");
+ shuffles[0] = lp_build_const_int32(gallivm, 2);
+ shuffles[1] = lp_build_const_int32(gallivm, 3);
+ arxy_ge_arzz = LLVMBuildShuffleVector(builder, arxyxy_ge_aryxzz, arxyxy_ge_aryxzz,
+ LLVMConstVector(shuffles, 2), "");
+ arxy_ge_arxy_arzz = LLVMBuildAnd(builder, arxy_ge_aryx, arxy_ge_arzz, "");
+
+ arx_ge_ary_arz = LLVMBuildExtractElement(builder, arxy_ge_arxy_arzz,
+ lp_build_const_int32(gallivm, 0), "");
+ arx_ge_ary_arz = LLVMBuildICmp(builder, LLVMIntNE, arx_ge_ary_arz,
+ lp_build_const_int32(gallivm, 0), "");
+ ary_ge_arx_arz = LLVMBuildExtractElement(builder, arxy_ge_arxy_arzz,
+ lp_build_const_int32(gallivm, 1), "");
+ ary_ge_arx_arz = LLVMBuildICmp(builder, LLVMIntNE, ary_ge_arx_arz,
+ lp_build_const_int32(gallivm, 0), "");
+ face_s_var = lp_build_alloca(gallivm, bld->coord_bld.vec_type, "face_s_var");
+ face_t_var = lp_build_alloca(gallivm, bld->coord_bld.vec_type, "face_t_var");
+ face_var = lp_build_alloca(gallivm, bld->int_bld.vec_type, "face_var");
+
+ lp_build_if(&if_ctx, gallivm, arx_ge_ary_arz);
{
/* +/- X face */
- LLVMValueRef sign = lp_build_sgn(float_bld, rx);
- LLVMValueRef ima = lp_build_cube_ima(coord_bld, s);
+ LLVMValueRef sign, ima;
+ rx = LLVMBuildExtractElement(builder, rxyz,
+ lp_build_const_int32(gallivm, 0), "");
+ /* +/- X face */
+ sign = lp_build_sgn(float_bld, rx);
+ ima = lp_build_cube_imaneg(coord_bld, s);
*face_s = lp_build_cube_coord(coord_bld, sign, +1, r, ima);
*face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
*face = lp_build_cube_face(bld, rx,
@@ -963,11 +1185,14 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
{
struct lp_build_if_state if_ctx2;
- lp_build_if(&if_ctx2, bld->gallivm, ary_ge_arx_arz);
+ lp_build_if(&if_ctx2, gallivm, ary_ge_arx_arz);
{
+ LLVMValueRef sign, ima;
/* +/- Y face */
- LLVMValueRef sign = lp_build_sgn(float_bld, ry);
- LLVMValueRef ima = lp_build_cube_ima(coord_bld, t);
+ ry = LLVMBuildExtractElement(builder, rxyz,
+ lp_build_const_int32(gallivm, 1), "");
+ sign = lp_build_sgn(float_bld, ry);
+ ima = lp_build_cube_imaneg(coord_bld, t);
*face_s = lp_build_cube_coord(coord_bld, NULL, -1, s, ima);
*face_t = lp_build_cube_coord(coord_bld, sign, -1, r, ima);
*face = lp_build_cube_face(bld, ry,
@@ -980,8 +1205,11 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
lp_build_else(&if_ctx2);
{
/* +/- Z face */
- LLVMValueRef sign = lp_build_sgn(float_bld, rz);
- LLVMValueRef ima = lp_build_cube_ima(coord_bld, r);
+ LLVMValueRef sign, ima;
+ rz = LLVMBuildExtractElement(builder, rxyz,
+ lp_build_const_int32(gallivm, 2), "");
+ sign = lp_build_sgn(float_bld, rz);
+ ima = lp_build_cube_imaneg(coord_bld, r);
*face_s = lp_build_cube_coord(coord_bld, sign, -1, s, ima);
*face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
*face = lp_build_cube_face(bld, rz,
@@ -999,6 +1227,7 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
*face_s = LLVMBuildLoad(builder, face_s_var, "face_s");
*face_t = LLVMBuildLoad(builder, face_t_var, "face_t");
*face = LLVMBuildLoad(builder, face_var, "face");
+ *face = lp_build_broadcast_scalar(&bld->int_coord_bld, *face);
}
}
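
The branchless path above implements the classic cube-map face selection per quad with vector compares/selects and sign-bit tricks. As a reference, a scalar version of the same selection (illustrative only, face numbering per PIPE_TEX_FACE_POS_X=0 .. NEG_Z=5):

    #include <math.h>

    static int cube_select(float rx, float ry, float rz,
                           float *face_s, float *face_t)
    {
       float arx = fabsf(rx), ary = fabsf(ry), arz = fabsf(rz);
       float sc, tc, ma;
       int face;

       if (arx >= ary && arx >= arz) {          /* major axis X */
          ma = rx;
          sc = (rx >= 0.0f) ? -rz : rz;
          tc = -ry;
          face = (rx >= 0.0f) ? 0 : 1;          /* POS_X / NEG_X */
       }
       else if (ary >= arx && ary >= arz) {     /* major axis Y */
          ma = ry;
          sc = rx;
          tc = (ry >= 0.0f) ? rz : -rz;
          face = (ry >= 0.0f) ? 2 : 3;          /* POS_Y / NEG_Y */
       }
       else {                                   /* major axis Z */
          ma = rz;
          sc = (rz >= 0.0f) ? rx : -rx;
          tc = -ry;
          face = (rz >= 0.0f) ? 4 : 5;          /* POS_Z / NEG_Z */
       }
       /* ima = +0.5 / |ma|, then remap coords into [0, 1] */
       *face_s = sc * (0.5f / fabsf(ma)) + 0.5f;
       *face_t = tc * (0.5f / fabsf(ma)) + 0.5f;
       return face;
    }

In the vector path the face decision and the signs come from the per-quad summed direction, the ?: selects become lp_build_select on int vectors, and the +1 for the negative face is just the sign bit shifted down and OR'd in.
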
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
index dad138abee0..0f3d8ae6cb5 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
@@ -52,6 +52,15 @@ struct lp_build_context;
/**
+ * Helper struct holding all derivatives needed for sampling
+ */
+struct lp_derivatives
+{
+ LLVMValueRef ddx_ddy[2];
+};
+
+
+/**
* Sampler static state.
*
* These are the bits of state from pipe_resource and pipe_sampler_state that
@@ -192,6 +201,9 @@ struct lp_build_sample_context
/* See texture_dims() */
unsigned dims;
+ /** SIMD vector width */
+ unsigned vector_width;
+
/** regular scalar float type */
struct lp_type float_type;
struct lp_build_context float_bld;
@@ -199,7 +211,7 @@ struct lp_build_sample_context
/** float vector type */
struct lp_build_context float_vec_bld;
- /** regular scalar float type */
+ /** regular scalar int type */
struct lp_type int_type;
struct lp_build_context int_bld;
@@ -223,10 +235,15 @@ struct lp_build_sample_context
struct lp_type texel_type;
struct lp_build_context texel_bld;
+ /** Float per-quad type */
+ struct lp_type perquadf_type;
+ struct lp_build_context perquadf_bld;
+
+ /** Int per-quad type */
+ struct lp_type perquadi_type;
+ struct lp_build_context perquadi_bld;
+
/* Common dynamic state values */
- LLVMValueRef width;
- LLVMValueRef height;
- LLVMValueRef depth;
LLVMValueRef row_stride_array;
LLVMValueRef img_stride_array;
LLVMValueRef data_array;
@@ -305,8 +322,7 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
void
lp_build_lod_selector(struct lp_build_sample_context *bld,
unsigned unit,
- const LLVMValueRef ddx[4],
- const LLVMValueRef ddy[4],
+ const struct lp_derivatives *derivs,
LLVMValueRef lod_bias, /* optional */
LLVMValueRef explicit_lod, /* optional */
unsigned mip_filter,
@@ -331,10 +347,6 @@ LLVMValueRef
lp_build_get_mipmap_level(struct lp_build_sample_context *bld,
LLVMValueRef level);
-LLVMValueRef
-lp_build_get_const_mipmap_level(struct lp_build_sample_context *bld,
- int level);
-
void
lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
@@ -402,22 +414,35 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
unsigned unit,
unsigned num_coords,
const LLVMValueRef *coords,
- const LLVMValueRef *ddx,
- const LLVMValueRef *ddy,
+ const struct lp_derivatives *derivs,
LLVMValueRef lod_bias,
LLVMValueRef explicit_lod,
LLVMValueRef texel_out[4]);
+
+void
+lp_build_coord_repeat_npot_linear(struct lp_build_sample_context *bld,
+ LLVMValueRef coord_f,
+ LLVMValueRef length_i,
+ LLVMValueRef length_f,
+ LLVMValueRef *coord0_i,
+ LLVMValueRef *weight_f);
+
+
void
lp_build_size_query_soa(struct gallivm_state *gallivm,
const struct lp_sampler_static_state *static_state,
struct lp_sampler_dynamic_state *dynamic_state,
+ struct lp_type int_type,
unsigned unit,
LLVMValueRef explicit_lod,
LLVMValueRef *sizes_out);
void
-lp_build_sample_nop(struct gallivm_state *gallivm, struct lp_type type,
+lp_build_sample_nop(struct gallivm_state *gallivm,
+ struct lp_type type,
+ unsigned num_coords,
+ const LLVMValueRef *coords,
LLVMValueRef texel_out[4]);
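
Callers now hand a struct lp_derivatives to the sampling code instead of separate ddx[4]/ddy[4] arrays. A small plain-C sketch (hypothetical names) of the per-quad layout ddx_ddy[0] carries for a 2D texture:

    #include <stdio.h>

    /* For 2D, each quad's four lanes of ddx_ddy[0] hold
     * { ds/dx, ds/dy, dt/dx, dt/dy }; ddx_ddy[1] only comes into play
     * for 3D textures (r derivatives). */
    struct quad_derivs {
       float ddx_ddy0[4];
    };

    static struct quad_derivs pack_derivs_2d(float dsdx, float dsdy,
                                             float dtdx, float dtdy)
    {
       struct quad_derivs d = {{ dsdx, dsdy, dtdx, dtdy }};
       return d;
    }

    int main(void)
    {
       struct quad_derivs d = pack_derivs_2d(0.1f, 0.0f, 0.0f, 0.1f);
       printf("%g %g %g %g\n", d.ddx_ddy0[0], d.ddx_ddy0[1],
              d.ddx_ddy0[2], d.ddx_ddy0[3]);
       return 0;
    }
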
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
index 74858bc9718..ad1b29cf096 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
@@ -27,7 +27,7 @@
/**
* @file
- * Texture sampling -- SoA.
+ * Texture sampling -- AoS.
*
* @author Jose Fonseca <jfonseca@vmware.com>
* @author Brian Paul <brianp@vmware.com>
@@ -40,6 +40,7 @@
#include "util/u_memory.h"
#include "util/u_math.h"
#include "util/u_format.h"
+#include "util/u_cpu_detect.h"
#include "lp_bld_debug.h"
#include "lp_bld_type.h"
#include "lp_bld_const.h"
@@ -75,6 +76,7 @@ static void
lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld,
unsigned block_length,
LLVMValueRef coord,
+ LLVMValueRef coord_f,
LLVMValueRef length,
LLVMValueRef stride,
boolean is_pot,
@@ -93,10 +95,11 @@ lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld,
if(is_pot)
coord = LLVMBuildAnd(builder, coord, length_minus_one, "");
else {
- /* Add a bias to the texcoord to handle negative coords */
- LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024);
- coord = LLVMBuildAdd(builder, coord, bias, "");
- coord = LLVMBuildURem(builder, coord, length, "");
+ struct lp_build_context *coord_bld = &bld->coord_bld;
+ LLVMValueRef length_f = lp_build_int_to_float(coord_bld, length);
+ coord = lp_build_fract_safe(coord_bld, coord_f);
+ coord = lp_build_mul(coord_bld, coord, length_f);
+ coord = lp_build_itrunc(coord_bld, coord);
}
break;
@@ -121,6 +124,56 @@ lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld,
/**
+ * Build LLVM code for texture coord wrapping, for nearest filtering,
+ * for float texcoords.
+ * \param coord the incoming texcoord (s,t,r or q)
+ * \param length the texture size along one dimension
+ * \param is_pot if TRUE, length is a power of two
+ * \param wrap_mode one of PIPE_TEX_WRAP_x
+ * \param icoord the texcoord after wrapping, as int
+ */
+static void
+lp_build_sample_wrap_nearest_float(struct lp_build_sample_context *bld,
+ LLVMValueRef coord,
+ LLVMValueRef length,
+ boolean is_pot,
+ unsigned wrap_mode,
+ LLVMValueRef *icoord)
+{
+ struct lp_build_context *coord_bld = &bld->coord_bld;
+ LLVMValueRef length_minus_one;
+
+ switch(wrap_mode) {
+ case PIPE_TEX_WRAP_REPEAT:
+ /* take fraction, unnormalize */
+ coord = lp_build_fract_safe(coord_bld, coord);
+ coord = lp_build_mul(coord_bld, coord, length);
+ *icoord = lp_build_itrunc(coord_bld, coord);
+ break;
+ case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+ length_minus_one = lp_build_sub(coord_bld, length, coord_bld->one);
+ if (bld->static_state->normalized_coords) {
+ /* scale coord to length */
+ coord = lp_build_mul(coord_bld, coord, length);
+ }
+ coord = lp_build_clamp(coord_bld, coord, coord_bld->zero,
+ length_minus_one);
+ *icoord = lp_build_itrunc(coord_bld, coord);
+ break;
+
+ case PIPE_TEX_WRAP_CLAMP:
+ case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+ case PIPE_TEX_WRAP_MIRROR_REPEAT:
+ case PIPE_TEX_WRAP_MIRROR_CLAMP:
+ case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+ case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+ default:
+ assert(0);
+ }
+}
+
+
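
A scalar sketch of the two wrap modes handled by the float nearest path (the real code uses lp_build_fract_safe, which additionally guards against fract() returning exactly 1.0):

    #include <math.h>

    /* PIPE_TEX_WRAP_REPEAT: take the fraction, scale to texels, truncate. */
    static int wrap_nearest_repeat(float s, float size)
    {
       float f = s - floorf(s);          /* fract(s), in [0, 1) */
       return (int)(f * size);
    }

    /* PIPE_TEX_WRAP_CLAMP_TO_EDGE: scale (normalized coords assumed)
     * and clamp to [0, size - 1]. */
    static int wrap_nearest_clamp_to_edge(float s, float size)
    {
       float c = s * size;
       if (c < 0.0f)        c = 0.0f;
       if (c > size - 1.0f) c = size - 1.0f;
       return (int)c;
    }
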
+/**
* Build LLVM code for texture coord wrapping, for linear filtering,
* for scaled integer texcoords.
* \param block_length is the length of the pixel block along the
@@ -139,6 +192,8 @@ static void
lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
unsigned block_length,
LLVMValueRef coord0,
+ LLVMValueRef *weight_i,
+ LLVMValueRef coord_f,
LLVMValueRef length,
LLVMValueRef stride,
boolean is_pot,
@@ -153,58 +208,85 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
LLVMValueRef length_minus_one;
LLVMValueRef lmask, umask, mask;
- if (block_length != 1) {
- /*
- * If the pixel block covers more than one pixel then there is no easy
- * way to calculate offset1 relative to offset0. Instead, compute them
- * independently.
- */
-
- LLVMValueRef coord1;
-
- lp_build_sample_wrap_nearest_int(bld,
- block_length,
- coord0,
- length,
- stride,
- is_pot,
- wrap_mode,
- offset0, i0);
-
- coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
+ /*
+ * If the pixel block covers more than one pixel then there is no easy
+ * way to calculate offset1 relative to offset0. Instead, compute them
+ * independently. Otherwise, try to compute offset0 and offset1 with
+ * a single stride multiplication.
+ */
- lp_build_sample_wrap_nearest_int(bld,
- block_length,
- coord1,
- length,
- stride,
- is_pot,
- wrap_mode,
- offset1, i1);
+ length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
+ if (block_length != 1) {
+ LLVMValueRef coord1;
+ switch(wrap_mode) {
+ case PIPE_TEX_WRAP_REPEAT:
+ if (is_pot) {
+ coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
+ coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, "");
+ coord1 = LLVMBuildAnd(builder, coord1, length_minus_one, "");
+ }
+ else {
+ LLVMValueRef mask;
+ LLVMValueRef weight;
+ LLVMValueRef length_f = lp_build_int_to_float(&bld->coord_bld, length);
+ lp_build_coord_repeat_npot_linear(bld, coord_f,
+ length, length_f,
+ &coord0, &weight);
+ mask = lp_build_compare(bld->gallivm, int_coord_bld->type,
+ PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);
+ coord1 = LLVMBuildAnd(builder,
+ lp_build_add(int_coord_bld, coord0,
+ int_coord_bld->one),
+ mask, "");
+ weight = lp_build_mul_imm(&bld->coord_bld, weight, 256);
+ *weight_i = lp_build_itrunc(&bld->coord_bld, weight);
+ }
+ break;
+
+ case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+ coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
+ coord0 = lp_build_clamp(int_coord_bld, coord0, int_coord_bld->zero,
+ length_minus_one);
+ coord1 = lp_build_clamp(int_coord_bld, coord1, int_coord_bld->zero,
+ length_minus_one);
+ break;
+
+ case PIPE_TEX_WRAP_CLAMP:
+ case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+ case PIPE_TEX_WRAP_MIRROR_REPEAT:
+ case PIPE_TEX_WRAP_MIRROR_CLAMP:
+ case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+ case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+ default:
+ assert(0);
+ coord0 = int_coord_bld->zero;
+ coord1 = int_coord_bld->zero;
+ break;
+ }
+ lp_build_sample_partial_offset(int_coord_bld, block_length, coord0, stride,
+ offset0, i0);
+ lp_build_sample_partial_offset(int_coord_bld, block_length, coord1, stride,
+ offset1, i1);
return;
}
- /*
- * Scalar pixels -- try to compute offset0 and offset1 with a single stride
- * multiplication.
- */
-
*i0 = int_coord_bld->zero;
*i1 = int_coord_bld->zero;
- length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
-
switch(wrap_mode) {
case PIPE_TEX_WRAP_REPEAT:
if (is_pot) {
coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, "");
}
else {
- /* Add a bias to the texcoord to handle negative coords */
- LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024);
- coord0 = LLVMBuildAdd(builder, coord0, bias, "");
- coord0 = LLVMBuildURem(builder, coord0, length, "");
+ LLVMValueRef weight;
+ LLVMValueRef length_f = lp_build_int_to_float(&bld->coord_bld, length);
+ lp_build_coord_repeat_npot_linear(bld, coord_f,
+ length, length_f,
+ &coord0, &weight);
+ weight = lp_build_mul_imm(&bld->coord_bld, weight, 256);
+ *weight_i = lp_build_itrunc(&bld->coord_bld, weight);
}
mask = lp_build_compare(bld->gallivm, int_coord_bld->type,
@@ -217,6 +299,11 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
break;
case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+ /* XXX this might be slower than the separate path
+ * on some newer cpus. With sse41 this is 8 instructions vs. 7
+ * - at least on SNB this is almost certainly slower since
+ * min/max are cheaper than selects, and the muls aren't bad.
+ */
lmask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
PIPE_FUNC_GEQUAL, coord0, int_coord_bld->zero);
umask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
@@ -249,6 +336,176 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
/**
+ * Build LLVM code for texture coord wrapping, for linear filtering,
+ * for float texcoords.
+ * \param block_length is the length of the pixel block along the
+ * coordinate axis
+ * \param coord the incoming texcoord (s,t,r or q)
+ * \param length the texture size along one dimension
+ * \param is_pot if TRUE, length is a power of two
+ * \param wrap_mode one of PIPE_TEX_WRAP_x
+ * \param coord0 the first texcoord after wrapping, as int
+ * \param coord1 the second texcoord after wrapping, as int
+ * \param weight the filter weight as int (0-255)
+ * \param force_nearest if this coord actually uses nearest filtering
+ */
+static void
+lp_build_sample_wrap_linear_float(struct lp_build_sample_context *bld,
+ unsigned block_length,
+ LLVMValueRef coord,
+ LLVMValueRef length,
+ boolean is_pot,
+ unsigned wrap_mode,
+ LLVMValueRef *coord0,
+ LLVMValueRef *coord1,
+ LLVMValueRef *weight,
+ unsigned force_nearest)
+{
+ struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
+ struct lp_build_context *coord_bld = &bld->coord_bld;
+ LLVMBuilderRef builder = bld->gallivm->builder;
+ LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
+ LLVMValueRef length_minus_one = lp_build_sub(coord_bld, length, coord_bld->one);
+
+ switch(wrap_mode) {
+ case PIPE_TEX_WRAP_REPEAT:
+ if (is_pot) {
+ /* mul by size and subtract 0.5 */
+ coord = lp_build_mul(coord_bld, coord, length);
+ if (!force_nearest)
+ coord = lp_build_sub(coord_bld, coord, half);
+ *coord1 = lp_build_add(coord_bld, coord, coord_bld->one);
+ /* convert to int, compute lerp weight */
+ lp_build_ifloor_fract(coord_bld, coord, coord0, weight);
+ *coord1 = lp_build_ifloor(coord_bld, *coord1);
+ /* repeat wrap */
+ length_minus_one = lp_build_itrunc(coord_bld, length_minus_one);
+ *coord0 = LLVMBuildAnd(builder, *coord0, length_minus_one, "");
+ *coord1 = LLVMBuildAnd(builder, *coord1, length_minus_one, "");
+ }
+ else {
+ LLVMValueRef mask;
+ /* wrap with normalized floats is just fract */
+ coord = lp_build_fract(coord_bld, coord);
+ /* unnormalize */
+ coord = lp_build_mul(coord_bld, coord, length);
+ /*
+ * we avoided the 0.5/length division, have to fix up wrong
+ * edge cases with selects
+ */
+ *coord1 = lp_build_add(coord_bld, coord, half);
+ coord = lp_build_sub(coord_bld, coord, half);
+ *weight = lp_build_fract(coord_bld, coord);
+ mask = lp_build_compare(coord_bld->gallivm, coord_bld->type,
+ PIPE_FUNC_LESS, coord, coord_bld->zero);
+ *coord0 = lp_build_select(coord_bld, mask, length_minus_one, coord);
+ *coord0 = lp_build_itrunc(coord_bld, *coord0);
+ mask = lp_build_compare(coord_bld->gallivm, coord_bld->type,
+ PIPE_FUNC_LESS, *coord1, length);
+ *coord1 = lp_build_select(coord_bld, mask, *coord1, coord_bld->zero);
+ *coord1 = lp_build_itrunc(coord_bld, *coord1);
+ }
+ break;
+ case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+ if (bld->static_state->normalized_coords) {
+ /* mul by tex size */
+ coord = lp_build_mul(coord_bld, coord, length);
+ }
+ /* subtract 0.5 */
+ if (!force_nearest) {
+ coord = lp_build_sub(coord_bld, coord, half);
+ }
+ /* clamp to [0, length - 1] */
+ coord = lp_build_min(coord_bld, coord, length_minus_one);
+ coord = lp_build_max(coord_bld, coord, coord_bld->zero);
+ *coord1 = lp_build_add(coord_bld, coord, coord_bld->one);
+ /* convert to int, compute lerp weight */
+ lp_build_ifloor_fract(coord_bld, coord, coord0, weight);
+ /* coord1 = min(coord1, length-1) */
+ *coord1 = lp_build_min(coord_bld, *coord1, length_minus_one);
+ *coord1 = lp_build_itrunc(coord_bld, *coord1);
+ break;
+ default:
+ assert(0);
+ *coord0 = int_coord_bld->zero;
+ *coord1 = int_coord_bld->zero;
+ *weight = coord_bld->zero;
+ break;
+ }
+ *weight = lp_build_mul_imm(coord_bld, *weight, 256);
+ *weight = lp_build_itrunc(coord_bld, *weight);
+ return;
+}
+
+
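
For linear filtering the float path produces two integer texel coordinates plus an 8-bit fixed-point lerp weight (weight * 256, truncated). A scalar sketch of the CLAMP_TO_EDGE case, assuming normalized coords and no force_nearest:

    #include <math.h>

    static void wrap_linear_clamp_to_edge(float s, float size,
                                          int *coord0, int *coord1, int *weight)
    {
       float lim = size - 1.0f;
       float c = s * size - 0.5f;        /* center on texel centers */
       float w;

       if (c < 0.0f) c = 0.0f;           /* clamp to [0, size - 1] */
       if (c > lim)  c = lim;

       *coord0 = (int)floorf(c);         /* lower texel */
       w = c - floorf(c);                /* lerp weight in [0, 1) */
       *coord1 = *coord0 + 1;
       if (*coord1 > (int)lim)           /* coord1 = min(coord1, size - 1) */
          *coord1 = (int)lim;

       *weight = (int)(w * 256.0f);      /* 8-bit fixed-point weight, 0..255 */
    }
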
+/**
+ * Fetch texels for image with nearest sampling.
+ * Return filtered color as two vectors of 16-bit fixed point values.
+ */
+static void
+lp_build_sample_fetch_image_nearest(struct lp_build_sample_context *bld,
+ LLVMValueRef data_ptr,
+ LLVMValueRef offset,
+ LLVMValueRef x_subcoord,
+ LLVMValueRef y_subcoord,
+ LLVMValueRef *colors_lo,
+ LLVMValueRef *colors_hi)
+{
+ /*
+ * Fetch the pixels as 4 x 32bit (rgba order might differ):
+ *
+ * rgba0 rgba1 rgba2 rgba3
+ *
+ * bit cast them into 16 x u8
+ *
+ * r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3
+ *
+ * unpack them into two 8 x i16:
+ *
+ * r0 g0 b0 a0 r1 g1 b1 a1
+ * r2 g2 b2 a2 r3 g3 b3 a3
+ *
+ * The higher 8 bits of the resulting elements will be zero.
+ */
+ LLVMBuilderRef builder = bld->gallivm->builder;
+ LLVMValueRef rgba8;
+ struct lp_build_context h16, u8n;
+ LLVMTypeRef u8n_vec_type;
+
+ lp_build_context_init(&h16, bld->gallivm, lp_type_ufixed(16, bld->vector_width));
+ lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8, bld->vector_width));
+ u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type);
+
+ if (util_format_is_rgba8_variant(bld->format_desc)) {
+ /*
+ * Given the format is a rgba8, just read the pixels as is,
+ * without any swizzling. Swizzling will be done later.
+ */
+ rgba8 = lp_build_gather(bld->gallivm,
+ bld->texel_type.length,
+ bld->format_desc->block.bits,
+ bld->texel_type.width,
+ data_ptr, offset);
+
+ rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
+ }
+ else {
+ rgba8 = lp_build_fetch_rgba_aos(bld->gallivm,
+ bld->format_desc,
+ u8n.type,
+ data_ptr, offset,
+ x_subcoord,
+ y_subcoord);
+ }
+
+ /* Expand one 4*rgba8 to two 2*rgba16 */
+ lp_build_unpack2(bld->gallivm, u8n.type, h16.type,
+ rgba8,
+ colors_lo, colors_hi);
+}
+
+
+/**
* Sample a single texture image with nearest sampling.
* If sampling a cube texture, r = cube face in [0,5].
* Return filtered color as two vectors of 16-bit fixed point values.
@@ -267,21 +524,19 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
{
const unsigned dims = bld->dims;
LLVMBuilderRef builder = bld->gallivm->builder;
- struct lp_build_context i32, h16, u8n;
- LLVMTypeRef i32_vec_type, u8n_vec_type;
+ struct lp_build_context i32;
+ LLVMTypeRef i32_vec_type;
LLVMValueRef i32_c8;
LLVMValueRef width_vec, height_vec, depth_vec;
LLVMValueRef s_ipart, t_ipart = NULL, r_ipart = NULL;
+ LLVMValueRef s_float, t_float = NULL, r_float = NULL;
LLVMValueRef x_stride;
LLVMValueRef x_offset, offset;
LLVMValueRef x_subcoord, y_subcoord, z_subcoord;
- lp_build_context_init(&i32, bld->gallivm, lp_type_int_vec(32));
- lp_build_context_init(&h16, bld->gallivm, lp_type_ufixed(16));
- lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8));
+ lp_build_context_init(&i32, bld->gallivm, lp_type_int_vec(32, bld->vector_width));
i32_vec_type = lp_build_vec_type(bld->gallivm, i32.type);
- u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type);
lp_build_extract_image_sizes(bld,
bld->int_size_type,
@@ -291,6 +546,8 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
&height_vec,
&depth_vec);
+ s_float = s; t_float = t; r_float = r;
+
if (bld->static_state->normalized_coords) {
LLVMValueRef scaled_size;
LLVMValueRef flt_size;
@@ -334,7 +591,8 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
/* Do texcoord wrapping, compute texel offset */
lp_build_sample_wrap_nearest_int(bld,
bld->format_desc->block.width,
- s_ipart, width_vec, x_stride,
+ s_ipart, s_float,
+ width_vec, x_stride,
bld->static_state->pot_width,
bld->static_state->wrap_s,
&x_offset, &x_subcoord);
@@ -343,7 +601,8 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
LLVMValueRef y_offset;
lp_build_sample_wrap_nearest_int(bld,
bld->format_desc->block.height,
- t_ipart, height_vec, row_stride_vec,
+ t_ipart, t_float,
+ height_vec, row_stride_vec,
bld->static_state->pot_height,
bld->static_state->wrap_t,
&y_offset, &y_subcoord);
@@ -352,7 +611,8 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
LLVMValueRef z_offset;
lp_build_sample_wrap_nearest_int(bld,
1, /* block length (depth) */
- r_ipart, depth_vec, img_stride_vec,
+ r_ipart, r_float,
+ depth_vec, img_stride_vec,
bld->static_state->pot_depth,
bld->static_state->wrap_r,
&z_offset, &z_subcoord);
@@ -366,6 +626,196 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
}
}
+ lp_build_sample_fetch_image_nearest(bld, data_ptr, offset,
+ x_subcoord, y_subcoord,
+ colors_lo, colors_hi);
+}
+
+
+/**
+ * Sample a single texture image with nearest sampling.
+ * If sampling a cube texture, r = cube face in [0,5].
+ * Return filtered color as two vectors of 16-bit fixed point values.
+ * Does the address calcs (except the final offset computation) with floats.
+ * Useful for AVX, which has support for 8x32 floats but not 8x32 ints.
+ */
+static void
+lp_build_sample_image_nearest_afloat(struct lp_build_sample_context *bld,
+ LLVMValueRef int_size,
+ LLVMValueRef row_stride_vec,
+ LLVMValueRef img_stride_vec,
+ LLVMValueRef data_ptr,
+ LLVMValueRef s,
+ LLVMValueRef t,
+ LLVMValueRef r,
+ LLVMValueRef *colors_lo,
+ LLVMValueRef *colors_hi)
+ {
+ const unsigned dims = bld->dims;
+ LLVMValueRef width_vec, height_vec, depth_vec;
+ LLVMValueRef offset;
+ LLVMValueRef x_subcoord, y_subcoord;
+ LLVMValueRef x_icoord, y_icoord, z_icoord;
+ LLVMValueRef flt_size;
+
+ flt_size = lp_build_int_to_float(&bld->float_size_bld, int_size);
+
+ lp_build_extract_image_sizes(bld,
+ bld->float_size_type,
+ bld->coord_type,
+ flt_size,
+ &width_vec,
+ &height_vec,
+ &depth_vec);
+
+ /* Do texcoord wrapping */
+ lp_build_sample_wrap_nearest_float(bld,
+ s, width_vec,
+ bld->static_state->pot_width,
+ bld->static_state->wrap_s,
+ &x_icoord);
+
+ if (dims >= 2) {
+ lp_build_sample_wrap_nearest_float(bld,
+ t, height_vec,
+ bld->static_state->pot_height,
+ bld->static_state->wrap_t,
+ &y_icoord);
+
+ if (dims >= 3) {
+ lp_build_sample_wrap_nearest_float(bld,
+ r, depth_vec,
+ bld->static_state->pot_depth,
+ bld->static_state->wrap_r,
+ &z_icoord);
+ }
+ else if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
+ z_icoord = r;
+ }
+ }
+
+ /*
+ * From here on we deal with ints, and we should split up the 256bit
+ * vectors manually for better generated code.
+ */
+
+ /*
+ * compute texel offsets -
+ * cannot do offset calc with floats, difficult for block-based formats,
+ * and not enough precision anyway.
+ */
+ lp_build_sample_offset(&bld->int_coord_bld,
+ bld->format_desc,
+ x_icoord, y_icoord,
+ z_icoord,
+ row_stride_vec, img_stride_vec,
+ &offset,
+ &x_subcoord, &y_subcoord);
+
+ lp_build_sample_fetch_image_nearest(bld, data_ptr, offset,
+ x_subcoord, y_subcoord,
+ colors_lo, colors_hi);
+}
+
+
+/**
+ * Fetch texels for image with linear sampling.
+ * Return filtered color as two vectors of 16-bit fixed point values.
+ */
+static void
+lp_build_sample_fetch_image_linear(struct lp_build_sample_context *bld,
+ LLVMValueRef data_ptr,
+ LLVMValueRef offset[2][2][2],
+ LLVMValueRef x_subcoord[2],
+ LLVMValueRef y_subcoord[2],
+ LLVMValueRef s_fpart,
+ LLVMValueRef t_fpart,
+ LLVMValueRef r_fpart,
+ LLVMValueRef *colors_lo,
+ LLVMValueRef *colors_hi)
+{
+ const unsigned dims = bld->dims;
+ LLVMBuilderRef builder = bld->gallivm->builder;
+ struct lp_build_context h16, u8n;
+ LLVMTypeRef h16_vec_type, u8n_vec_type;
+ LLVMTypeRef elem_type = LLVMInt32TypeInContext(bld->gallivm->context);
+ LLVMValueRef shuffles_lo[LP_MAX_VECTOR_LENGTH];
+ LLVMValueRef shuffles_hi[LP_MAX_VECTOR_LENGTH];
+ LLVMValueRef shuffle_lo, shuffle_hi;
+ LLVMValueRef s_fpart_lo, s_fpart_hi;
+ LLVMValueRef t_fpart_lo = NULL, t_fpart_hi = NULL;
+ LLVMValueRef r_fpart_lo = NULL, r_fpart_hi = NULL;
+ LLVMValueRef neighbors_lo[2][2][2]; /* [z][y][x] */
+ LLVMValueRef neighbors_hi[2][2][2]; /* [z][y][x] */
+ LLVMValueRef packed_lo, packed_hi;
+ unsigned i, j, k;
+ unsigned numj, numk;
+
+ lp_build_context_init(&h16, bld->gallivm, lp_type_ufixed(16, bld->vector_width));
+ lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8, bld->vector_width));
+ h16_vec_type = lp_build_vec_type(bld->gallivm, h16.type);
+ u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type);
+
+ /*
+ * Transform 4 x i32 in
+ *
+ * s_fpart = {s0, s1, s2, s3}
+ *
+ * into 8 x i16
+ *
+ * s_fpart = {00, s0, 00, s1, 00, s2, 00, s3}
+ *
+ * into two 8 x i16
+ *
+ * s_fpart_lo = {s0, s0, s0, s0, s1, s1, s1, s1}
+ * s_fpart_hi = {s2, s2, s2, s2, s3, s3, s3, s3}
+ *
+    * and likewise for t_fpart. There is no risk of losing precision here
+ * since the fractional parts only use the lower 8bits.
+ */
+ s_fpart = LLVMBuildBitCast(builder, s_fpart, h16_vec_type, "");
+ if (dims >= 2)
+ t_fpart = LLVMBuildBitCast(builder, t_fpart, h16_vec_type, "");
+ if (dims >= 3)
+ r_fpart = LLVMBuildBitCast(builder, r_fpart, h16_vec_type, "");
+
+ for (j = 0; j < h16.type.length; j += 4) {
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+ unsigned subindex = 0;
+#else
+ unsigned subindex = 1;
+#endif
+ LLVMValueRef index;
+
+ index = LLVMConstInt(elem_type, j/2 + subindex, 0);
+ for (i = 0; i < 4; ++i)
+ shuffles_lo[j + i] = index;
+
+ index = LLVMConstInt(elem_type, h16.type.length/2 + j/2 + subindex, 0);
+ for (i = 0; i < 4; ++i)
+ shuffles_hi[j + i] = index;
+ }
+
+ shuffle_lo = LLVMConstVector(shuffles_lo, h16.type.length);
+ shuffle_hi = LLVMConstVector(shuffles_hi, h16.type.length);
+
+ s_fpart_lo = LLVMBuildShuffleVector(builder, s_fpart, h16.undef,
+ shuffle_lo, "");
+ s_fpart_hi = LLVMBuildShuffleVector(builder, s_fpart, h16.undef,
+ shuffle_hi, "");
+ if (dims >= 2) {
+ t_fpart_lo = LLVMBuildShuffleVector(builder, t_fpart, h16.undef,
+ shuffle_lo, "");
+ t_fpart_hi = LLVMBuildShuffleVector(builder, t_fpart, h16.undef,
+ shuffle_hi, "");
+ }
+ if (dims >= 3) {
+ r_fpart_lo = LLVMBuildShuffleVector(builder, r_fpart, h16.undef,
+ shuffle_lo, "");
+ r_fpart_hi = LLVMBuildShuffleVector(builder, r_fpart, h16.undef,
+ shuffle_hi, "");
+ }
+
/*
* Fetch the pixels as 4 x 32bit (rgba order might differ):
*
@@ -382,38 +832,129 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
*
* The higher 8 bits of the resulting elements will be zero.
*/
- {
- LLVMValueRef rgba8;
+ numj = 1 + (dims >= 2);
+ numk = 1 + (dims >= 3);
- if (util_format_is_rgba8_variant(bld->format_desc)) {
- /*
- * Given the format is a rgba8, just read the pixels as is,
- * without any swizzling. Swizzling will be done later.
- */
- rgba8 = lp_build_gather(bld->gallivm,
- bld->texel_type.length,
- bld->format_desc->block.bits,
- bld->texel_type.width,
- data_ptr, offset);
+ for (k = 0; k < numk; k++) {
+ for (j = 0; j < numj; j++) {
+ for (i = 0; i < 2; i++) {
+ LLVMValueRef rgba8;
+
+ if (util_format_is_rgba8_variant(bld->format_desc)) {
+ /*
+ * Given the format is a rgba8, just read the pixels as is,
+ * without any swizzling. Swizzling will be done later.
+ */
+ rgba8 = lp_build_gather(bld->gallivm,
+ bld->texel_type.length,
+ bld->format_desc->block.bits,
+ bld->texel_type.width,
+ data_ptr, offset[k][j][i]);
+
+ rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
+ }
+ else {
+ rgba8 = lp_build_fetch_rgba_aos(bld->gallivm,
+ bld->format_desc,
+ u8n.type,
+ data_ptr, offset[k][j][i],
+ x_subcoord[i],
+ y_subcoord[j]);
+ }
- rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
+ /* Expand one 4*rgba8 to two 2*rgba16 */
+ lp_build_unpack2(bld->gallivm, u8n.type, h16.type,
+ rgba8,
+ &neighbors_lo[k][j][i], &neighbors_hi[k][j][i]);
+ }
}
- else {
- rgba8 = lp_build_fetch_rgba_aos(bld->gallivm,
- bld->format_desc,
- u8n.type,
- data_ptr, offset,
- x_subcoord,
- y_subcoord);
+ }
+
+ /*
+ * Linear interpolation with 8.8 fixed point.
+ */
+ if (bld->static_state->force_nearest_s) {
+ /* special case 1-D lerp */
+ packed_lo = lp_build_lerp(&h16,
+ t_fpart_lo,
+ neighbors_lo[0][0][0],
+ neighbors_lo[0][0][1]);
+
+ packed_hi = lp_build_lerp(&h16,
+ t_fpart_hi,
+ neighbors_hi[0][1][0],
+ neighbors_hi[0][1][0]);
+ }
+ else if (bld->static_state->force_nearest_t) {
+ /* special case 1-D lerp */
+ packed_lo = lp_build_lerp(&h16,
+ s_fpart_lo,
+ neighbors_lo[0][0][0],
+ neighbors_lo[0][0][1]);
+
+ packed_hi = lp_build_lerp(&h16,
+ s_fpart_hi,
+ neighbors_hi[0][0][0],
+ neighbors_hi[0][0][1]);
+ }
+ else {
+ /* general 1/2/3-D lerping */
+ if (dims == 1) {
+ packed_lo = lp_build_lerp(&h16,
+ s_fpart_lo,
+ neighbors_lo[0][0][0],
+ neighbors_lo[0][0][1]);
+
+ packed_hi = lp_build_lerp(&h16,
+ s_fpart_hi,
+ neighbors_hi[0][0][0],
+ neighbors_hi[0][0][1]);
}
+ else {
+ /* 2-D lerp */
+ packed_lo = lp_build_lerp_2d(&h16,
+ s_fpart_lo, t_fpart_lo,
+ neighbors_lo[0][0][0],
+ neighbors_lo[0][0][1],
+ neighbors_lo[0][1][0],
+ neighbors_lo[0][1][1]);
+
+ packed_hi = lp_build_lerp_2d(&h16,
+ s_fpart_hi, t_fpart_hi,
+ neighbors_hi[0][0][0],
+ neighbors_hi[0][0][1],
+ neighbors_hi[0][1][0],
+ neighbors_hi[0][1][1]);
+
+ if (dims >= 3) {
+ LLVMValueRef packed_lo2, packed_hi2;
+
+ /* lerp in the second z slice */
+ packed_lo2 = lp_build_lerp_2d(&h16,
+ s_fpart_lo, t_fpart_lo,
+ neighbors_lo[1][0][0],
+ neighbors_lo[1][0][1],
+ neighbors_lo[1][1][0],
+ neighbors_lo[1][1][1]);
- /* Expand one 4*rgba8 to two 2*rgba16 */
- lp_build_unpack2(bld->gallivm, u8n.type, h16.type,
- rgba8,
- colors_lo, colors_hi);
+ packed_hi2 = lp_build_lerp_2d(&h16,
+ s_fpart_hi, t_fpart_hi,
+ neighbors_hi[1][0][0],
+ neighbors_hi[1][0][1],
+ neighbors_hi[1][1][0],
+ neighbors_hi[1][1][1]);
+ /* interp between two z slices */
+ packed_lo = lp_build_lerp(&h16, r_fpart_lo,
+ packed_lo, packed_lo2);
+ packed_hi = lp_build_lerp(&h16, r_fpart_hi,
+ packed_hi, packed_hi2);
+ }
+ }
}
-}
+ *colors_lo = packed_lo;
+ *colors_hi = packed_hi;
+}
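The interpolation above works on 8-bit channels zero-extended to 16 bits, with 8.8 fixed-point weights. A minimal scalar sketch of what lp_build_lerp/lp_build_lerp_2d compute on such h16 values (hypothetical helper names; the exact rounding of the vector code is not modeled):

   #include <stdint.h>

   /* a + (b - a) * w, with w an 8.8 fixed-point weight */
   static uint16_t
   lerp_88(uint16_t a, uint16_t b, uint16_t w)
   {
      return (uint16_t)(a + ((((int32_t)b - (int32_t)a) * (int32_t)w) >> 8));
   }

   /* bilinear: lerp along s in both rows, then along t between the rows */
   static uint16_t
   lerp_2d_88(uint16_t x00, uint16_t x01, uint16_t x10, uint16_t x11,
              uint16_t s_w, uint16_t t_w)
   {
      return lerp_88(lerp_88(x00, x01, s_w),
                     lerp_88(x10, x11, s_w), t_w);
   }

The trilinear case in the code is then just one more lerp_88 between the results of the two z slices, using the r weight.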
/**
* Sample a single texture image with (bi-)(tri-)linear sampling.
@@ -433,33 +974,24 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
{
const unsigned dims = bld->dims;
LLVMBuilderRef builder = bld->gallivm->builder;
- struct lp_build_context i32, h16, u8n;
- LLVMTypeRef i32_vec_type, h16_vec_type, u8n_vec_type;
+ struct lp_build_context i32;
+ LLVMTypeRef i32_vec_type;
LLVMValueRef i32_c8, i32_c128, i32_c255;
LLVMValueRef width_vec, height_vec, depth_vec;
- LLVMValueRef s_ipart, s_fpart, s_fpart_lo, s_fpart_hi;
- LLVMValueRef t_ipart = NULL, t_fpart = NULL, t_fpart_lo = NULL, t_fpart_hi = NULL;
- LLVMValueRef r_ipart = NULL, r_fpart = NULL, r_fpart_lo = NULL, r_fpart_hi = NULL;
+ LLVMValueRef s_ipart, s_fpart, s_float;
+ LLVMValueRef t_ipart = NULL, t_fpart = NULL, t_float = NULL;
+ LLVMValueRef r_ipart = NULL, r_fpart = NULL, r_float = NULL;
LLVMValueRef x_stride, y_stride, z_stride;
LLVMValueRef x_offset0, x_offset1;
LLVMValueRef y_offset0, y_offset1;
LLVMValueRef z_offset0, z_offset1;
LLVMValueRef offset[2][2][2]; /* [z][y][x] */
LLVMValueRef x_subcoord[2], y_subcoord[2], z_subcoord[2];
- LLVMValueRef neighbors_lo[2][2][2]; /* [z][y][x] */
- LLVMValueRef neighbors_hi[2][2][2]; /* [z][y][x] */
- LLVMValueRef packed_lo, packed_hi;
unsigned x, y, z;
- unsigned i, j, k;
- unsigned numj, numk;
- lp_build_context_init(&i32, bld->gallivm, lp_type_int_vec(32));
- lp_build_context_init(&h16, bld->gallivm, lp_type_ufixed(16));
- lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8));
+ lp_build_context_init(&i32, bld->gallivm, lp_type_int_vec(32, bld->vector_width));
i32_vec_type = lp_build_vec_type(bld->gallivm, i32.type);
- h16_vec_type = lp_build_vec_type(bld->gallivm, h16.type);
- u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type);
lp_build_extract_image_sizes(bld,
bld->int_size_type,
@@ -469,6 +1001,8 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
&height_vec,
&depth_vec);
+ s_float = s; t_float = t; r_float = r;
+
if (bld->static_state->normalized_coords) {
LLVMValueRef scaled_size;
LLVMValueRef flt_size;
@@ -533,7 +1067,8 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
/* do texcoord wrapping and compute texel offsets */
lp_build_sample_wrap_linear_int(bld,
bld->format_desc->block.width,
- s_ipart, width_vec, x_stride,
+ s_ipart, &s_fpart, s_float,
+ width_vec, x_stride,
bld->static_state->pot_width,
bld->static_state->wrap_s,
&x_offset0, &x_offset1,
@@ -548,7 +1083,8 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
if (dims >= 2) {
lp_build_sample_wrap_linear_int(bld,
bld->format_desc->block.height,
- t_ipart, height_vec, y_stride,
+ t_ipart, &t_fpart, t_float,
+ height_vec, y_stride,
bld->static_state->pot_height,
bld->static_state->wrap_t,
&y_offset0, &y_offset1,
@@ -567,7 +1103,8 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
if (dims >= 3) {
lp_build_sample_wrap_linear_int(bld,
bld->format_desc->block.height,
- r_ipart, depth_vec, z_stride,
+ r_ipart, &r_fpart, r_float,
+ depth_vec, z_stride,
bld->static_state->pot_depth,
bld->static_state->wrap_r,
&z_offset0, &z_offset1,
@@ -593,212 +1130,175 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
}
}
- /*
- * Transform 4 x i32 in
- *
- * s_fpart = {s0, s1, s2, s3}
- *
- * into 8 x i16
- *
- * s_fpart = {00, s0, 00, s1, 00, s2, 00, s3}
- *
- * into two 8 x i16
- *
- * s_fpart_lo = {s0, s0, s0, s0, s1, s1, s1, s1}
- * s_fpart_hi = {s2, s2, s2, s2, s3, s3, s3, s3}
- *
- * and likewise for t_fpart. There is no risk of loosing precision here
- * since the fractional parts only use the lower 8bits.
- */
- s_fpart = LLVMBuildBitCast(builder, s_fpart, h16_vec_type, "");
- if (dims >= 2)
- t_fpart = LLVMBuildBitCast(builder, t_fpart, h16_vec_type, "");
- if (dims >= 3)
- r_fpart = LLVMBuildBitCast(builder, r_fpart, h16_vec_type, "");
+ lp_build_sample_fetch_image_linear(bld, data_ptr, offset,
+ x_subcoord, y_subcoord,
+ s_fpart, t_fpart, r_fpart,
+ colors_lo, colors_hi);
+}
- {
- LLVMTypeRef elem_type = LLVMInt32TypeInContext(bld->gallivm->context);
- LLVMValueRef shuffles_lo[LP_MAX_VECTOR_LENGTH];
- LLVMValueRef shuffles_hi[LP_MAX_VECTOR_LENGTH];
- LLVMValueRef shuffle_lo;
- LLVMValueRef shuffle_hi;
- for (j = 0; j < h16.type.length; j += 4) {
-#ifdef PIPE_ARCH_LITTLE_ENDIAN
- unsigned subindex = 0;
-#else
- unsigned subindex = 1;
-#endif
- LLVMValueRef index;
+/**
+ * Sample a single texture image with (bi-)(tri-)linear sampling.
+ * Return filtered color as two vectors of 16-bit fixed point values.
+ * Does address calcs (except offsets) with floats.
+ * Useful for AVX which has support for 8x32 floats but not 8x32 ints.
+ */
+static void
+lp_build_sample_image_linear_afloat(struct lp_build_sample_context *bld,
+ LLVMValueRef int_size,
+ LLVMValueRef row_stride_vec,
+ LLVMValueRef img_stride_vec,
+ LLVMValueRef data_ptr,
+ LLVMValueRef s,
+ LLVMValueRef t,
+ LLVMValueRef r,
+ LLVMValueRef *colors_lo,
+ LLVMValueRef *colors_hi)
+{
+ const unsigned dims = bld->dims;
+ LLVMValueRef width_vec, height_vec, depth_vec;
+ LLVMValueRef s_fpart;
+ LLVMValueRef t_fpart = NULL;
+ LLVMValueRef r_fpart = NULL;
+ LLVMValueRef x_stride, y_stride, z_stride;
+ LLVMValueRef x_offset0, x_offset1;
+ LLVMValueRef y_offset0, y_offset1;
+ LLVMValueRef z_offset0, z_offset1;
+ LLVMValueRef offset[2][2][2]; /* [z][y][x] */
+ LLVMValueRef x_subcoord[2], y_subcoord[2];
+ LLVMValueRef flt_size;
+ LLVMValueRef x_icoord0, x_icoord1;
+ LLVMValueRef y_icoord0, y_icoord1;
+ LLVMValueRef z_icoord0, z_icoord1;
+ unsigned x, y, z;
- index = LLVMConstInt(elem_type, j/2 + subindex, 0);
- for (i = 0; i < 4; ++i)
- shuffles_lo[j + i] = index;
+ flt_size = lp_build_int_to_float(&bld->float_size_bld, int_size);
- index = LLVMConstInt(elem_type, h16.type.length/2 + j/2 + subindex, 0);
- for (i = 0; i < 4; ++i)
- shuffles_hi[j + i] = index;
- }
+ lp_build_extract_image_sizes(bld,
+ bld->float_size_type,
+ bld->coord_type,
+ flt_size,
+ &width_vec,
+ &height_vec,
+ &depth_vec);
- shuffle_lo = LLVMConstVector(shuffles_lo, h16.type.length);
- shuffle_hi = LLVMConstVector(shuffles_hi, h16.type.length);
+ /* do texcoord wrapping and compute texel offsets */
+ lp_build_sample_wrap_linear_float(bld,
+ bld->format_desc->block.width,
+ s, width_vec,
+ bld->static_state->pot_width,
+ bld->static_state->wrap_s,
+ &x_icoord0, &x_icoord1,
+ &s_fpart,
+ bld->static_state->force_nearest_s);
+
+ if (dims >= 2) {
+ lp_build_sample_wrap_linear_float(bld,
+ bld->format_desc->block.height,
+ t, height_vec,
+ bld->static_state->pot_height,
+ bld->static_state->wrap_t,
+ &y_icoord0, &y_icoord1,
+ &t_fpart,
+ bld->static_state->force_nearest_t);
- s_fpart_lo = LLVMBuildShuffleVector(builder, s_fpart, h16.undef,
- shuffle_lo, "");
- s_fpart_hi = LLVMBuildShuffleVector(builder, s_fpart, h16.undef,
- shuffle_hi, "");
- if (dims >= 2) {
- t_fpart_lo = LLVMBuildShuffleVector(builder, t_fpart, h16.undef,
- shuffle_lo, "");
- t_fpart_hi = LLVMBuildShuffleVector(builder, t_fpart, h16.undef,
- shuffle_hi, "");
- }
if (dims >= 3) {
- r_fpart_lo = LLVMBuildShuffleVector(builder, r_fpart, h16.undef,
- shuffle_lo, "");
- r_fpart_hi = LLVMBuildShuffleVector(builder, r_fpart, h16.undef,
- shuffle_hi, "");
+ lp_build_sample_wrap_linear_float(bld,
+ bld->format_desc->block.height,
+ r, depth_vec,
+ bld->static_state->pot_depth,
+ bld->static_state->wrap_r,
+ &z_icoord0, &z_icoord1,
+ &r_fpart, 0);
}
}
/*
- * Fetch the pixels as 4 x 32bit (rgba order might differ):
- *
- * rgba0 rgba1 rgba2 rgba3
- *
- * bit cast them into 16 x u8
- *
- * r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3
- *
- * unpack them into two 8 x i16:
- *
- * r0 g0 b0 a0 r1 g1 b1 a1
- * r2 g2 b2 a2 r3 g3 b3 a3
- *
- * The higher 8 bits of the resulting elements will be zero.
+ * From here on we deal with ints, and we should split up the 256-bit
+ * vectors manually for better generated code.
*/
- numj = 1 + (dims >= 2);
- numk = 1 + (dims >= 3);
- for (k = 0; k < numk; k++) {
- for (j = 0; j < numj; j++) {
- for (i = 0; i < 2; i++) {
- LLVMValueRef rgba8;
-
- if (util_format_is_rgba8_variant(bld->format_desc)) {
- /*
- * Given the format is a rgba8, just read the pixels as is,
- * without any swizzling. Swizzling will be done later.
- */
- rgba8 = lp_build_gather(bld->gallivm,
- bld->texel_type.length,
- bld->format_desc->block.bits,
- bld->texel_type.width,
- data_ptr, offset[k][j][i]);
-
- rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
- }
- else {
- rgba8 = lp_build_fetch_rgba_aos(bld->gallivm,
- bld->format_desc,
- u8n.type,
- data_ptr, offset[k][j][i],
- x_subcoord[i],
- y_subcoord[j]);
- }
-
- /* Expand one 4*rgba8 to two 2*rgba16 */
- lp_build_unpack2(bld->gallivm, u8n.type, h16.type,
- rgba8,
- &neighbors_lo[k][j][i], &neighbors_hi[k][j][i]);
- }
- }
- }
+ /* get pixel, row and image strides */
+ x_stride = lp_build_const_vec(bld->gallivm,
+ bld->int_coord_bld.type,
+ bld->format_desc->block.bits/8);
+ y_stride = row_stride_vec;
+ z_stride = img_stride_vec;
/*
- * Linear interpolation with 8.8 fixed point.
+ * compute texel offset -
+ * cannot do offset calc with floats, difficult for block-based formats,
+ * and not enough precision anyway.
*/
- if (bld->static_state->force_nearest_s) {
- /* special case 1-D lerp */
- packed_lo = lp_build_lerp(&h16,
- t_fpart_lo,
- neighbors_lo[0][0][0],
- neighbors_lo[0][0][1]);
-
- packed_hi = lp_build_lerp(&h16,
- t_fpart_hi,
- neighbors_hi[0][1][0],
- neighbors_hi[0][1][0]);
+ lp_build_sample_partial_offset(&bld->int_coord_bld,
+ bld->format_desc->block.width,
+ x_icoord0, x_stride,
+ &x_offset0, &x_subcoord[0]);
+ lp_build_sample_partial_offset(&bld->int_coord_bld,
+ bld->format_desc->block.width,
+ x_icoord1, x_stride,
+ &x_offset1, &x_subcoord[1]);
+ for (z = 0; z < 2; z++) {
+ for (y = 0; y < 2; y++) {
+ offset[z][y][0] = x_offset0;
+ offset[z][y][1] = x_offset1;
+ }
}
- else if (bld->static_state->force_nearest_t) {
- /* special case 1-D lerp */
- packed_lo = lp_build_lerp(&h16,
- s_fpart_lo,
- neighbors_lo[0][0][0],
- neighbors_lo[0][0][1]);
- packed_hi = lp_build_lerp(&h16,
- s_fpart_hi,
- neighbors_hi[0][0][0],
- neighbors_hi[0][0][1]);
+ if (dims >= 2) {
+ lp_build_sample_partial_offset(&bld->int_coord_bld,
+ bld->format_desc->block.height,
+ y_icoord0, y_stride,
+ &y_offset0, &y_subcoord[0]);
+ lp_build_sample_partial_offset(&bld->int_coord_bld,
+ bld->format_desc->block.height,
+ y_icoord1, y_stride,
+ &y_offset1, &y_subcoord[1]);
+ for (z = 0; z < 2; z++) {
+ for (x = 0; x < 2; x++) {
+ offset[z][0][x] = lp_build_add(&bld->int_coord_bld,
+ offset[z][0][x], y_offset0);
+ offset[z][1][x] = lp_build_add(&bld->int_coord_bld,
+ offset[z][1][x], y_offset1);
+ }
+ }
}
- else {
- /* general 1/2/3-D lerping */
- if (dims == 1) {
- packed_lo = lp_build_lerp(&h16,
- s_fpart_lo,
- neighbors_lo[0][0][0],
- neighbors_lo[0][0][1]);
- packed_hi = lp_build_lerp(&h16,
- s_fpart_hi,
- neighbors_hi[0][0][0],
- neighbors_hi[0][0][1]);
+ if (dims >= 3) {
+ LLVMValueRef z_subcoord[2];
+ lp_build_sample_partial_offset(&bld->int_coord_bld,
+ 1,
+ z_icoord0, z_stride,
+ &z_offset0, &z_subcoord[0]);
+ lp_build_sample_partial_offset(&bld->int_coord_bld,
+ 1,
+ z_icoord1, z_stride,
+ &z_offset1, &z_subcoord[1]);
+ for (y = 0; y < 2; y++) {
+ for (x = 0; x < 2; x++) {
+ offset[0][y][x] = lp_build_add(&bld->int_coord_bld,
+ offset[0][y][x], z_offset0);
+ offset[1][y][x] = lp_build_add(&bld->int_coord_bld,
+ offset[1][y][x], z_offset1);
+ }
}
- else {
- /* 2-D lerp */
- packed_lo = lp_build_lerp_2d(&h16,
- s_fpart_lo, t_fpart_lo,
- neighbors_lo[0][0][0],
- neighbors_lo[0][0][1],
- neighbors_lo[0][1][0],
- neighbors_lo[0][1][1]);
-
- packed_hi = lp_build_lerp_2d(&h16,
- s_fpart_hi, t_fpart_hi,
- neighbors_hi[0][0][0],
- neighbors_hi[0][0][1],
- neighbors_hi[0][1][0],
- neighbors_hi[0][1][1]);
-
- if (dims >= 3) {
- LLVMValueRef packed_lo2, packed_hi2;
-
- /* lerp in the second z slice */
- packed_lo2 = lp_build_lerp_2d(&h16,
- s_fpart_lo, t_fpart_lo,
- neighbors_lo[1][0][0],
- neighbors_lo[1][0][1],
- neighbors_lo[1][1][0],
- neighbors_lo[1][1][1]);
-
- packed_hi2 = lp_build_lerp_2d(&h16,
- s_fpart_hi, t_fpart_hi,
- neighbors_hi[1][0][0],
- neighbors_hi[1][0][1],
- neighbors_hi[1][1][0],
- neighbors_hi[1][1][1]);
- /* interp between two z slices */
- packed_lo = lp_build_lerp(&h16, r_fpart_lo,
- packed_lo, packed_lo2);
- packed_hi = lp_build_lerp(&h16, r_fpart_hi,
- packed_hi, packed_hi2);
+ }
+ else if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
+ LLVMValueRef z_offset;
+ z_offset = lp_build_mul(&bld->int_coord_bld, r, img_stride_vec);
+ for (y = 0; y < 2; y++) {
+ for (x = 0; x < 2; x++) {
+ /* The r coord is the cube face in [0,5] */
+ offset[0][y][x] = lp_build_add(&bld->int_coord_bld,
+ offset[0][y][x], z_offset);
}
}
}
- *colors_lo = packed_lo;
- *colors_hi = packed_hi;
+ lp_build_sample_fetch_image_linear(bld, data_ptr, offset,
+ x_subcoord, y_subcoord,
+ s_fpart, t_fpart, r_fpart,
+ colors_lo, colors_hi);
}
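Because the addressing is separable, the eight corner offsets used by the fetch are simply sums of the per-axis partial offsets computed above. A hypothetical scalar sketch of that composition (for the 1-D/2-D cases the unused axes just contribute zero):

   static void
   build_corner_offsets(const int x_off[2], const int y_off[2], const int z_off[2],
                        int offset[2][2][2])
   {
      int x, y, z;
      for (z = 0; z < 2; z++)
         for (y = 0; y < 2; y++)
            for (x = 0; x < 2; x++)
               offset[z][y][x] = x_off[x] + y_off[y] + z_off[z];
   }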
@@ -824,10 +1324,10 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
LLVMBuilderRef builder = bld->gallivm->builder;
LLVMValueRef size0;
LLVMValueRef size1;
- LLVMValueRef row_stride0_vec;
- LLVMValueRef row_stride1_vec;
- LLVMValueRef img_stride0_vec;
- LLVMValueRef img_stride1_vec;
+ LLVMValueRef row_stride0_vec = NULL;
+ LLVMValueRef row_stride1_vec = NULL;
+ LLVMValueRef img_stride0_vec = NULL;
+ LLVMValueRef img_stride1_vec = NULL;
LLVMValueRef data_ptr0;
LLVMValueRef data_ptr1;
LLVMValueRef colors0_lo, colors0_hi;
@@ -838,20 +1338,39 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
&size0,
&row_stride0_vec, &img_stride0_vec);
data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0);
- if (img_filter == PIPE_TEX_FILTER_NEAREST) {
- lp_build_sample_image_nearest(bld,
- size0,
- row_stride0_vec, img_stride0_vec,
- data_ptr0, s, t, r,
- &colors0_lo, &colors0_hi);
+ if (util_cpu_caps.has_avx && bld->coord_type.length > 4) {
+ if (img_filter == PIPE_TEX_FILTER_NEAREST) {
+ lp_build_sample_image_nearest_afloat(bld,
+ size0,
+ row_stride0_vec, img_stride0_vec,
+ data_ptr0, s, t, r,
+ &colors0_lo, &colors0_hi);
+ }
+ else {
+ assert(img_filter == PIPE_TEX_FILTER_LINEAR);
+ lp_build_sample_image_linear_afloat(bld,
+ size0,
+ row_stride0_vec, img_stride0_vec,
+ data_ptr0, s, t, r,
+ &colors0_lo, &colors0_hi);
+ }
}
else {
- assert(img_filter == PIPE_TEX_FILTER_LINEAR);
- lp_build_sample_image_linear(bld,
- size0,
- row_stride0_vec, img_stride0_vec,
- data_ptr0, s, t, r,
- &colors0_lo, &colors0_hi);
+ if (img_filter == PIPE_TEX_FILTER_NEAREST) {
+ lp_build_sample_image_nearest(bld,
+ size0,
+ row_stride0_vec, img_stride0_vec,
+ data_ptr0, s, t, r,
+ &colors0_lo, &colors0_hi);
+ }
+ else {
+ assert(img_filter == PIPE_TEX_FILTER_LINEAR);
+ lp_build_sample_image_linear(bld,
+ size0,
+ row_stride0_vec, img_stride0_vec,
+ data_ptr0, s, t, r,
+ &colors0_lo, &colors0_hi);
+ }
}
/* Store the first level's colors in the output variables */
@@ -859,74 +1378,138 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
LLVMBuildStore(builder, colors0_hi, colors_hi_var);
if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
- LLVMValueRef h16_scale = lp_build_const_float(bld->gallivm, 256.0);
- LLVMTypeRef i32_type = LLVMIntTypeInContext(bld->gallivm->context, 32);
+ LLVMValueRef h16vec_scale = lp_build_const_vec(bld->gallivm,
+ bld->perquadf_bld.type, 256.0);
+ LLVMTypeRef i32vec_type = lp_build_vec_type(bld->gallivm, bld->perquadi_bld.type);
struct lp_build_if_state if_ctx;
LLVMValueRef need_lerp;
+ unsigned num_quads = bld->coord_bld.type.length / 4;
+ unsigned i;
- lod_fpart = LLVMBuildFMul(builder, lod_fpart, h16_scale, "");
- lod_fpart = LLVMBuildFPToSI(builder, lod_fpart, i32_type, "lod_fpart.fixed16");
+ lod_fpart = LLVMBuildFMul(builder, lod_fpart, h16vec_scale, "");
+ lod_fpart = LLVMBuildFPToSI(builder, lod_fpart, i32vec_type, "lod_fpart.fixed16");
/* need_lerp = lod_fpart > 0 */
- need_lerp = LLVMBuildICmp(builder, LLVMIntSGT,
- lod_fpart, LLVMConstNull(i32_type),
- "need_lerp");
+ if (num_quads == 1) {
+ need_lerp = LLVMBuildICmp(builder, LLVMIntSGT,
+ lod_fpart, bld->perquadi_bld.zero,
+ "need_lerp");
+ }
+ else {
+ /*
+ * We'll do mip filtering if any of the quads need it.
+ * It might be better to split the vectors here and only fetch/filter
+ * quads which need it.
+ */
+ /*
+ * We need to clamp lod_fpart here since we can get negative
+ * values, which would screw up filtering if not all
+ * lod_fpart values have the same sign.
+ * We can, however, then skip the greater-than comparison.
+ */
+ lod_fpart = lp_build_max(&bld->perquadi_bld, lod_fpart,
+ bld->perquadi_bld.zero);
+ need_lerp = lp_build_any_true_range(&bld->perquadi_bld, num_quads, lod_fpart);
+ }
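The mip weight is converted from float to 8.8 fixed point and clamped so that a negative weight in one quad cannot flip the sign of the blend for the others. A scalar sketch of the per-element conversion (hypothetical helper; the vector code additionally ORs the per-quad results via lp_build_any_true_range to decide whether any lerp is needed at all):

   static int
   lod_fpart_to_fixed88(float lod_fpart)
   {
      int w = (int)(lod_fpart * 256.0f);  /* float -> 8.8 fixed point */
      return w > 0 ? w : 0;               /* clamp negatives, as lp_build_max does */
   }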
lp_build_if(&if_ctx, bld->gallivm, need_lerp);
{
struct lp_build_context h16_bld;
- lp_build_context_init(&h16_bld, bld->gallivm, lp_type_ufixed(16));
+ lp_build_context_init(&h16_bld, bld->gallivm, lp_type_ufixed(16, bld->vector_width));
/* sample the second mipmap level */
lp_build_mipmap_level_sizes(bld, ilevel1,
&size1,
&row_stride1_vec, &img_stride1_vec);
data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1);
- if (img_filter == PIPE_TEX_FILTER_NEAREST) {
- lp_build_sample_image_nearest(bld,
- size1,
- row_stride1_vec, img_stride1_vec,
- data_ptr1, s, t, r,
- &colors1_lo, &colors1_hi);
+
+ if (util_cpu_caps.has_avx && bld->coord_type.length > 4) {
+ if (img_filter == PIPE_TEX_FILTER_NEAREST) {
+ lp_build_sample_image_nearest_afloat(bld,
+ size1,
+ row_stride1_vec, img_stride1_vec,
+ data_ptr1, s, t, r,
+ &colors1_lo, &colors1_hi);
+ }
+ else {
+ lp_build_sample_image_linear_afloat(bld,
+ size1,
+ row_stride1_vec, img_stride1_vec,
+ data_ptr1, s, t, r,
+ &colors1_lo, &colors1_hi);
+ }
}
else {
- lp_build_sample_image_linear(bld,
- size1,
- row_stride1_vec, img_stride1_vec,
- data_ptr1, s, t, r,
- &colors1_lo, &colors1_hi);
+ if (img_filter == PIPE_TEX_FILTER_NEAREST) {
+ lp_build_sample_image_nearest(bld,
+ size1,
+ row_stride1_vec, img_stride1_vec,
+ data_ptr1, s, t, r,
+ &colors1_lo, &colors1_hi);
+ }
+ else {
+ lp_build_sample_image_linear(bld,
+ size1,
+ row_stride1_vec, img_stride1_vec,
+ data_ptr1, s, t, r,
+ &colors1_lo, &colors1_hi);
+ }
}
/* interpolate samples from the two mipmap levels */
- lod_fpart = LLVMBuildTrunc(builder, lod_fpart, h16_bld.elem_type, "");
- lod_fpart = lp_build_broadcast_scalar(&h16_bld, lod_fpart);
+ if (num_quads == 1) {
+ lod_fpart = LLVMBuildTrunc(builder, lod_fpart, h16_bld.elem_type, "");
+ lod_fpart = lp_build_broadcast_scalar(&h16_bld, lod_fpart);
#if HAVE_LLVM == 0x208
- /* This is a work-around for a bug in LLVM 2.8.
- * Evidently, something goes wrong in the construction of the
- * lod_fpart short[8] vector. Adding this no-effect shuffle seems
- * to force the vector to be properly constructed.
- * Tested with mesa-demos/src/tests/mipmap_limits.c (press t, f).
- */
- {
- LLVMValueRef shuffles[8], shuffle;
- int i;
- assert(h16_bld.type.length <= Elements(shuffles));
- for (i = 0; i < h16_bld.type.length; i++)
- shuffles[i] = lp_build_const_int32(bld->gallivm, 2 * (i & 1));
- shuffle = LLVMConstVector(shuffles, h16_bld.type.length);
- lod_fpart = LLVMBuildShuffleVector(builder,
- lod_fpart, lod_fpart,
- shuffle, "");
- }
+ /* This is a work-around for a bug in LLVM 2.8.
+ * Evidently, something goes wrong in the construction of the
+ * lod_fpart short[8] vector. Adding this no-effect shuffle seems
+ * to force the vector to be properly constructed.
+ * Tested with mesa-demos/src/tests/mipmap_limits.c (press t, f).
+ */
+ {
+ LLVMValueRef shuffles[8], shuffle;
+ assert(h16_bld.type.length <= Elements(shuffles));
+ for (i = 0; i < h16_bld.type.length; i++)
+ shuffles[i] = lp_build_const_int32(bld->gallivm, 2 * (i & 1));
+ shuffle = LLVMConstVector(shuffles, h16_bld.type.length);
+ lod_fpart = LLVMBuildShuffleVector(builder,
+ lod_fpart, lod_fpart,
+ shuffle, "");
+ }
#endif
- colors0_lo = lp_build_lerp(&h16_bld, lod_fpart,
- colors0_lo, colors1_lo);
- colors0_hi = lp_build_lerp(&h16_bld, lod_fpart,
- colors0_hi, colors1_hi);
+ colors0_lo = lp_build_lerp(&h16_bld, lod_fpart,
+ colors0_lo, colors1_lo);
+ colors0_hi = lp_build_lerp(&h16_bld, lod_fpart,
+ colors0_hi, colors1_hi);
+ }
+ else {
+ LLVMValueRef lod_parts[LP_MAX_VECTOR_LENGTH/16];
+ struct lp_type perquadi16_type = bld->perquadi_bld.type;
+ perquadi16_type.width /= 2;
+ perquadi16_type.length *= 2;
+ lod_fpart = LLVMBuildBitCast(builder, lod_fpart,
+ lp_build_vec_type(bld->gallivm,
+ perquadi16_type), "");
+ /* XXX this only works for exactly 2 quads. More quads need shuffle */
+ assert(num_quads == 2);
+ for (i = 0; i < num_quads; i++) {
+ LLVMValueRef indexi2 = lp_build_const_int32(bld->gallivm, i*2);
+ lod_parts[i] = lp_build_extract_broadcast(bld->gallivm,
+ perquadi16_type,
+ h16_bld.type,
+ lod_fpart,
+ indexi2);
+ }
+ colors0_lo = lp_build_lerp(&h16_bld, lod_parts[0],
+ colors0_lo, colors1_lo);
+ colors0_hi = lp_build_lerp(&h16_bld, lod_parts[1],
+ colors0_hi, colors1_hi);
+ }
LLVMBuildStore(builder, colors0_lo, colors_lo_var);
LLVMBuildStore(builder, colors0_hi, colors_hi_var);
@@ -948,10 +1531,10 @@ lp_build_sample_aos(struct lp_build_sample_context *bld,
LLVMValueRef s,
LLVMValueRef t,
LLVMValueRef r,
- const LLVMValueRef *ddx,
- const LLVMValueRef *ddy,
- LLVMValueRef lod_bias, /* optional */
- LLVMValueRef explicit_lod, /* optional */
+ LLVMValueRef lod_ipart,
+ LLVMValueRef lod_fpart,
+ LLVMValueRef ilevel0,
+ LLVMValueRef ilevel1,
LLVMValueRef texel_out[4])
{
struct lp_build_context *int_bld = &bld->int_bld;
@@ -960,14 +1543,9 @@ lp_build_sample_aos(struct lp_build_sample_context *bld,
const unsigned min_filter = bld->static_state->min_img_filter;
const unsigned mag_filter = bld->static_state->mag_img_filter;
const unsigned dims = bld->dims;
- LLVMValueRef lod_ipart = NULL, lod_fpart = NULL;
- LLVMValueRef ilevel0, ilevel1 = NULL;
LLVMValueRef packed, packed_lo, packed_hi;
LLVMValueRef unswizzled[4];
- LLVMValueRef face_ddx[4], face_ddy[4];
struct lp_build_context h16_bld;
- LLVMValueRef first_level;
- LLVMValueRef i32t_zero = lp_build_const_int32(bld->gallivm, 0);
/* we only support the common/simple wrap modes at this time */
assert(lp_is_simple_wrap_mode(bld->static_state->wrap_s));
@@ -978,81 +1556,7 @@ lp_build_sample_aos(struct lp_build_sample_context *bld,
/* make 16-bit fixed-pt builder context */
- lp_build_context_init(&h16_bld, bld->gallivm, lp_type_ufixed(16));
-
- /* cube face selection, compute pre-face coords, etc. */
- if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
- LLVMValueRef face, face_s, face_t;
- lp_build_cube_lookup(bld, s, t, r, &face, &face_s, &face_t);
- s = face_s; /* vec */
- t = face_t; /* vec */
- /* use 'r' to indicate cube face */
- r = lp_build_broadcast_scalar(&bld->int_coord_bld, face); /* vec */
-
- /* recompute ddx, ddy using the new (s,t) face texcoords */
- face_ddx[0] = lp_build_scalar_ddx(&bld->coord_bld, s);
- face_ddx[1] = lp_build_scalar_ddx(&bld->coord_bld, t);
- face_ddx[2] = NULL;
- face_ddx[3] = NULL;
- face_ddy[0] = lp_build_scalar_ddy(&bld->coord_bld, s);
- face_ddy[1] = lp_build_scalar_ddy(&bld->coord_bld, t);
- face_ddy[2] = NULL;
- face_ddy[3] = NULL;
- ddx = face_ddx;
- ddy = face_ddy;
- }
-
- /*
- * Compute the level of detail (float).
- */
- if (min_filter != mag_filter ||
- mip_filter != PIPE_TEX_MIPFILTER_NONE) {
- /* Need to compute lod either to choose mipmap levels or to
- * distinguish between minification/magnification with one mipmap level.
- */
- lp_build_lod_selector(bld, unit, ddx, ddy,
- lod_bias, explicit_lod,
- mip_filter,
- &lod_ipart, &lod_fpart);
- } else {
- lod_ipart = i32t_zero;
- }
-
- /*
- * Compute integer mipmap level(s) to fetch texels from: ilevel0, ilevel1
- */
- switch (mip_filter) {
- default:
- assert(0 && "bad mip_filter value in lp_build_sample_aos()");
- /* fall-through */
- case PIPE_TEX_MIPFILTER_NONE:
- /* always use mip level 0 */
- if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
- /* XXX this is a work-around for an apparent bug in LLVM 2.7.
- * We should be able to set ilevel0 = const(0) but that causes
- * bad x86 code to be emitted.
- */
- assert(lod_ipart);
- lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0);
- }
- else {
- first_level = bld->dynamic_state->first_level(bld->dynamic_state,
- bld->gallivm, unit);
- ilevel0 = first_level;
- }
- break;
- case PIPE_TEX_MIPFILTER_NEAREST:
- assert(lod_ipart);
- lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0);
- break;
- case PIPE_TEX_MIPFILTER_LINEAR:
- assert(lod_ipart);
- assert(lod_fpart);
- lp_build_linear_mip_levels(bld, unit,
- lod_ipart, &lod_fpart,
- &ilevel0, &ilevel1);
- break;
- }
+ lp_build_context_init(&h16_bld, bld->gallivm, lp_type_ufixed(16, bld->vector_width));
/*
* Get/interpolate texture colors.
@@ -1062,7 +1566,7 @@ lp_build_sample_aos(struct lp_build_sample_context *bld,
packed_hi = lp_build_alloca(bld->gallivm, h16_bld.vec_type, "packed_hi");
if (min_filter == mag_filter) {
- /* no need to distinquish between minification and magnification */
+ /* no need to distinguish between minification and magnification */
lp_build_sample_mipmap(bld,
min_filter, mip_filter,
s, t, r,
@@ -1106,7 +1610,7 @@ lp_build_sample_aos(struct lp_build_sample_context *bld,
* into 'packed'
*/
packed = lp_build_pack2(bld->gallivm,
- h16_bld.type, lp_type_unorm(8),
+ h16_bld.type, lp_type_unorm(8, bld->vector_width),
LLVMBuildLoad(builder, packed_lo, ""),
LLVMBuildLoad(builder, packed_hi, ""));
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h
index 5d9ecac4d50..55b3bc1c09a 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h
@@ -46,10 +46,10 @@ lp_build_sample_aos(struct lp_build_sample_context *bld,
LLVMValueRef s,
LLVMValueRef t,
LLVMValueRef r,
- const LLVMValueRef *ddx,
- const LLVMValueRef *ddy,
- LLVMValueRef lod_bias, /* optional */
- LLVMValueRef explicit_lod, /* optional */
+ LLVMValueRef lod_ipart,
+ LLVMValueRef lod_fpart,
+ LLVMValueRef ilevel0,
+ LLVMValueRef ilevel1,
LLVMValueRef texel_out[4]);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index 73dc3e77083..aaef7970635 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -41,6 +41,7 @@
#include "util/u_memory.h"
#include "util/u_math.h"
#include "util/u_format.h"
+#include "util/u_cpu_detect.h"
#include "lp_bld_debug.h"
#include "lp_bld_type.h"
#include "lp_bld_const.h"
@@ -57,6 +58,7 @@
#include "lp_bld_sample_aos.h"
#include "lp_bld_struct.h"
#include "lp_bld_quad.h"
+#include "lp_bld_pack.h"
/**
@@ -221,6 +223,41 @@ lp_build_coord_mirror(struct lp_build_sample_context *bld,
/**
+ * Helper to compute the first coord and the weight for
+ * linear wrap repeat npot textures
+ */
+void
+lp_build_coord_repeat_npot_linear(struct lp_build_sample_context *bld,
+ LLVMValueRef coord_f,
+ LLVMValueRef length_i,
+ LLVMValueRef length_f,
+ LLVMValueRef *coord0_i,
+ LLVMValueRef *weight_f)
+{
+ struct lp_build_context *coord_bld = &bld->coord_bld;
+ struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
+ LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
+ LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length_i,
+ int_coord_bld->one);
+ LLVMValueRef mask;
+ /* wrap with normalized floats is just fract */
+ coord_f = lp_build_fract(coord_bld, coord_f);
+ /* mul by size and subtract 0.5 */
+ coord_f = lp_build_mul(coord_bld, coord_f, length_f);
+ coord_f = lp_build_sub(coord_bld, coord_f, half);
+ /*
+ * we avoided the 0.5/length division before the repeat wrap,
+ * so we now need to fix up the edge cases with selects
+ */
+ /* convert to int, compute lerp weight */
+ lp_build_ifloor_fract(coord_bld, coord_f, coord0_i, weight_f);
+ mask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
+ PIPE_FUNC_LESS, *coord0_i, int_coord_bld->zero);
+ *coord0_i = lp_build_select(int_coord_bld, mask, length_minus_one, *coord0_i);
+}
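A scalar model of the NPOT repeat-linear wrap implemented by this helper, under the assumed semantics of fract/ifloor_fract (hypothetical function name, illustration only):

   #include <math.h>

   static void
   repeat_npot_linear(float s, int size, int *coord0, float *weight)
   {
      float u = s - floorf(s);            /* repeat wrap: fract */
      u = u * (float)size - 0.5f;         /* unnormalize, center on texel */
      *coord0 = (int)floorf(u);
      *weight = u - floorf(u);            /* lerp weight */
      if (*coord0 < 0)
         *coord0 = size - 1;              /* wrap the lone negative edge case */
   }

The second texel is then coord0 + 1, masked back to 0 by the caller when coord0 has hit size - 1.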
+
+
+/**
* Build LLVM code for texture wrap mode for linear filtering.
* \param x0_out returns first integer texcoord
* \param x1_out returns second integer texcoord
@@ -246,28 +283,27 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
switch(wrap_mode) {
case PIPE_TEX_WRAP_REPEAT:
- /* mul by size and subtract 0.5 */
- coord = lp_build_mul(coord_bld, coord, length_f);
- coord = lp_build_sub(coord_bld, coord, half);
- /* convert to int, compute lerp weight */
- lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
- /* repeat wrap */
if (is_pot) {
+ /* mul by size and subtract 0.5 */
+ coord = lp_build_mul(coord_bld, coord, length_f);
+ coord = lp_build_sub(coord_bld, coord, half);
+ /* convert to int, compute lerp weight */
+ lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
+ /* repeat wrap */
coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, "");
coord1 = LLVMBuildAnd(builder, coord1, length_minus_one, "");
}
else {
- /* Add a bias to the texcoord to handle negative coords */
- LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024);
LLVMValueRef mask;
- coord0 = LLVMBuildAdd(builder, coord0, bias, "");
- coord0 = LLVMBuildURem(builder, coord0, length, "");
- mask = lp_build_compare(bld->gallivm, int_coord_bld->type,
+ lp_build_coord_repeat_npot_linear(bld, coord,
+ length, length_f,
+ &coord0, &weight);
+ mask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);
coord1 = LLVMBuildAnd(builder,
- lp_build_add(int_coord_bld, coord0, int_coord_bld->one),
- mask, "");
+ lp_build_add(int_coord_bld, coord0, int_coord_bld->one),
+ mask, "");
}
break;
@@ -444,15 +480,16 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
switch(wrap_mode) {
case PIPE_TEX_WRAP_REPEAT:
- coord = lp_build_mul(coord_bld, coord, length_f);
- icoord = lp_build_ifloor(coord_bld, coord);
- if (is_pot)
+ if (is_pot) {
+ coord = lp_build_mul(coord_bld, coord, length_f);
+ icoord = lp_build_ifloor(coord_bld, coord);
icoord = LLVMBuildAnd(builder, icoord, length_minus_one, "");
+ }
else {
- /* Add a bias to the texcoord to handle negative coords */
- LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024);
- icoord = LLVMBuildAdd(builder, icoord, bias, "");
- icoord = LLVMBuildURem(builder, icoord, length, "");
+ /* take fraction, unnormalize */
+ coord = lp_build_fract_safe(coord_bld, coord);
+ coord = lp_build_mul(coord_bld, coord, length_f);
+ icoord = lp_build_itrunc(coord_bld, coord);
}
break;
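The nearest NPOT repeat path now takes the fraction before unnormalizing; lp_build_fract_safe is presumably used so the fraction stays strictly below 1.0 and the truncated texel index cannot reach size. A scalar sketch under that assumption (hypothetical helper):

   #include <math.h>

   static int
   repeat_npot_nearest(float s, int size)
   {
      float u = s - floorf(s);            /* assumed: fract_safe also keeps u < 1.0 */
      return (int)(u * (float)size);      /* truncation stays within [0, size-1] */
   }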
@@ -473,7 +510,7 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
break;
case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
- /* Note: this is the same as CLAMP_TO_EDGE, except min = -min */
+ /* Note: this is the same as CLAMP_TO_EDGE, except min = -1 */
{
LLVMValueRef min, max;
@@ -873,12 +910,32 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
struct lp_build_if_state if_ctx;
LLVMValueRef need_lerp;
+ unsigned num_quads = bld->coord_bld.type.length / 4;
/* need_lerp = lod_fpart > 0 */
- need_lerp = LLVMBuildFCmp(builder, LLVMRealUGT,
- lod_fpart,
- bld->float_bld.zero,
- "need_lerp");
+ if (num_quads == 1) {
+ need_lerp = LLVMBuildFCmp(builder, LLVMRealUGT,
+ lod_fpart, bld->perquadf_bld.zero,
+ "need_lerp");
+ }
+ else {
+ /*
+ * We'll do mip filtering if any of the quads need it.
+ * It might be better to split the vectors here and only fetch/filter
+ * quads which need it.
+ */
+ /*
+ * We unfortunately need to clamp lod_fpart here since we can get
+ * negative values, which would screw up filtering if not all
+ * lod_fpart values have the same sign.
+ */
+ lod_fpart = lp_build_max(&bld->perquadf_bld, lod_fpart,
+ bld->perquadf_bld.zero);
+ need_lerp = lp_build_compare(bld->gallivm, bld->perquadf_bld.type,
+ PIPE_FUNC_GREATER,
+ lod_fpart, bld->perquadf_bld.zero);
+ need_lerp = lp_build_any_true_range(&bld->perquadi_bld, num_quads, need_lerp);
+ }
lp_build_if(&if_ctx, bld->gallivm, need_lerp);
{
@@ -904,7 +961,10 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
/* interpolate samples from the two mipmap levels */
- lod_fpart = lp_build_broadcast_scalar(&bld->texel_bld, lod_fpart);
+ lod_fpart = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
+ bld->perquadf_bld.type,
+ bld->texel_bld.type,
+ lod_fpart);
for (chan = 0; chan < 4; chan++) {
colors0[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart,
@@ -916,37 +976,28 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
}
}
-
-
/**
- * General texture sampling codegen.
- * This function handles texture sampling for all texture targets (1D,
- * 2D, 3D, cube) and all filtering modes.
+ * Calculate cube face, lod, mip levels.
*/
static void
-lp_build_sample_general(struct lp_build_sample_context *bld,
- unsigned unit,
- LLVMValueRef s,
- LLVMValueRef t,
- LLVMValueRef r,
- const LLVMValueRef *ddx,
- const LLVMValueRef *ddy,
- LLVMValueRef lod_bias, /* optional */
- LLVMValueRef explicit_lod, /* optional */
- LLVMValueRef *colors_out)
+lp_build_sample_common(struct lp_build_sample_context *bld,
+ unsigned unit,
+ LLVMValueRef *s,
+ LLVMValueRef *t,
+ LLVMValueRef *r,
+ const struct lp_derivatives *derivs,
+ LLVMValueRef lod_bias, /* optional */
+ LLVMValueRef explicit_lod, /* optional */
+ LLVMValueRef *lod_ipart,
+ LLVMValueRef *lod_fpart,
+ LLVMValueRef *ilevel0,
+ LLVMValueRef *ilevel1)
{
- struct lp_build_context *int_bld = &bld->int_bld;
- LLVMBuilderRef builder = bld->gallivm->builder;
const unsigned mip_filter = bld->static_state->min_mip_filter;
const unsigned min_filter = bld->static_state->min_img_filter;
const unsigned mag_filter = bld->static_state->mag_img_filter;
- LLVMValueRef lod_ipart = NULL, lod_fpart = NULL;
- LLVMValueRef ilevel0, ilevel1 = NULL;
- LLVMValueRef face_ddx[4], face_ddy[4];
- LLVMValueRef texels[4];
LLVMValueRef first_level;
- LLVMValueRef i32t_zero = lp_build_const_int32(bld->gallivm, 0);
- unsigned chan;
+ struct lp_derivatives face_derivs;
/*
printf("%s mip %d min %d mag %d\n", __FUNCTION__,
@@ -958,23 +1009,16 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
*/
if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
LLVMValueRef face, face_s, face_t;
- lp_build_cube_lookup(bld, s, t, r, &face, &face_s, &face_t);
- s = face_s; /* vec */
- t = face_t; /* vec */
+ lp_build_cube_lookup(bld, *s, *t, *r, &face, &face_s, &face_t);
+ *s = face_s; /* vec */
+ *t = face_t; /* vec */
/* use 'r' to indicate cube face */
- r = lp_build_broadcast_scalar(&bld->int_coord_bld, face); /* vec */
+ *r = face; /* vec */
/* recompute ddx, ddy using the new (s,t) face texcoords */
- face_ddx[0] = lp_build_scalar_ddx(&bld->coord_bld, s);
- face_ddx[1] = lp_build_scalar_ddx(&bld->coord_bld, t);
- face_ddx[2] = NULL;
- face_ddx[3] = NULL;
- face_ddy[0] = lp_build_scalar_ddy(&bld->coord_bld, s);
- face_ddy[1] = lp_build_scalar_ddy(&bld->coord_bld, t);
- face_ddy[2] = NULL;
- face_ddy[3] = NULL;
- ddx = face_ddx;
- ddy = face_ddy;
+ face_derivs.ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(&bld->coord_bld, *s, *t);
+ face_derivs.ddx_ddy[1] = NULL;
+ derivs = &face_derivs;
}
/*
@@ -985,12 +1029,12 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
/* Need to compute lod either to choose mipmap levels or to
* distinguish between minification/magnification with one mipmap level.
*/
- lp_build_lod_selector(bld, unit, ddx, ddy,
+ lp_build_lod_selector(bld, unit, derivs,
lod_bias, explicit_lod,
mip_filter,
- &lod_ipart, &lod_fpart);
+ lod_ipart, lod_fpart);
} else {
- lod_ipart = i32t_zero;
+ *lod_ipart = bld->perquadi_bld.zero;
}
/*
@@ -1006,28 +1050,56 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
/* XXX this is a work-around for an apparent bug in LLVM 2.7.
* We should be able to set ilevel0 = const(0) but that causes
* bad x86 code to be emitted.
+ * XXX should probably disable that on other llvm versions.
*/
- assert(lod_ipart);
- lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0);
+ assert(*lod_ipart);
+ lp_build_nearest_mip_level(bld, unit, *lod_ipart, ilevel0);
}
else {
first_level = bld->dynamic_state->first_level(bld->dynamic_state,
bld->gallivm, unit);
- ilevel0 = first_level;
+ first_level = lp_build_broadcast_scalar(&bld->perquadi_bld, first_level);
+ *ilevel0 = first_level;
}
break;
case PIPE_TEX_MIPFILTER_NEAREST:
- assert(lod_ipart);
- lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0);
+ assert(*lod_ipart);
+ lp_build_nearest_mip_level(bld, unit, *lod_ipart, ilevel0);
break;
case PIPE_TEX_MIPFILTER_LINEAR:
- assert(lod_ipart);
- assert(lod_fpart);
+ assert(*lod_ipart);
+ assert(*lod_fpart);
lp_build_linear_mip_levels(bld, unit,
- lod_ipart, &lod_fpart,
- &ilevel0, &ilevel1);
+ *lod_ipart, lod_fpart,
+ ilevel0, ilevel1);
break;
}
+}
+
+/**
+ * General texture sampling codegen.
+ * This function handles texture sampling for all texture targets (1D,
+ * 2D, 3D, cube) and all filtering modes.
+ */
+static void
+lp_build_sample_general(struct lp_build_sample_context *bld,
+ unsigned unit,
+ LLVMValueRef s,
+ LLVMValueRef t,
+ LLVMValueRef r,
+ LLVMValueRef lod_ipart,
+ LLVMValueRef lod_fpart,
+ LLVMValueRef ilevel0,
+ LLVMValueRef ilevel1,
+ LLVMValueRef *colors_out)
+{
+ struct lp_build_context *int_bld = &bld->int_bld;
+ LLVMBuilderRef builder = bld->gallivm->builder;
+ const unsigned mip_filter = bld->static_state->min_mip_filter;
+ const unsigned min_filter = bld->static_state->min_img_filter;
+ const unsigned mag_filter = bld->static_state->mag_img_filter;
+ LLVMValueRef texels[4];
+ unsigned chan;
/*
* Get/interpolate texture colors.
@@ -1039,7 +1111,7 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
}
if (min_filter == mag_filter) {
- /* no need to distinquish between minification and magnification */
+ /* no need to distinguish between minification and magnification */
lp_build_sample_mipmap(bld, unit,
min_filter, mip_filter,
s, t, r,
@@ -1135,7 +1207,10 @@ lp_build_sample_compare(struct lp_build_sample_context *bld,
* For debugging.
*/
void
-lp_build_sample_nop(struct gallivm_state *gallivm, struct lp_type type,
+lp_build_sample_nop(struct gallivm_state *gallivm,
+ struct lp_type type,
+ unsigned num_coords,
+ const LLVMValueRef *coords,
LLVMValueRef texel_out[4])
{
LLVMValueRef one = lp_build_one(gallivm, type);
@@ -1152,8 +1227,7 @@ lp_build_sample_nop(struct gallivm_state *gallivm, struct lp_type type,
* 'texel' will return a vector of four LLVMValueRefs corresponding to
* R, G, B, A.
* \param type vector float type to use for coords, etc.
- * \param ddx partial derivatives of (s,t,r,q) with respect to x
- * \param ddy partial derivatives of (s,t,r,q) with respect to y
+ * \param derivs partial derivatives of (s,t,r,q) with respect to x and y
*/
void
lp_build_sample_soa(struct gallivm_state *gallivm,
@@ -1163,8 +1237,7 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
unsigned unit,
unsigned num_coords,
const LLVMValueRef *coords,
- const LLVMValueRef ddx[4],
- const LLVMValueRef ddy[4],
+ const struct lp_derivatives *derivs,
LLVMValueRef lod_bias, /* optional */
LLVMValueRef explicit_lod, /* optional */
LLVMValueRef texel_out[4])
@@ -1173,10 +1246,10 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
struct lp_build_sample_context bld;
LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef tex_width, tex_height, tex_depth;
LLVMValueRef s;
LLVMValueRef t;
LLVMValueRef r;
- struct lp_type float_vec_type;
if (0) {
enum pipe_format fmt = static_state->format;
@@ -1193,6 +1266,8 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
bld.format_desc = util_format_description(static_state->format);
bld.dims = dims;
+ bld.vector_width = lp_type_width(type);
+
bld.float_type = lp_type_float(32);
bld.int_type = lp_type_int(32);
bld.coord_type = type;
@@ -1201,22 +1276,26 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
bld.float_size_type.length = dims > 1 ? 4 : 1;
bld.int_size_type = lp_int_type(bld.float_size_type);
bld.texel_type = type;
-
- float_vec_type = lp_type_float_vec(32);
+ bld.perquadf_type = type;
+ /* we want native vector size to be able to use our intrinsics */
+ bld.perquadf_type.length = type.length > 4 ? ((type.length + 15) / 16) * 4 : 1;
+ bld.perquadi_type = lp_int_type(bld.perquadf_type);
lp_build_context_init(&bld.float_bld, gallivm, bld.float_type);
- lp_build_context_init(&bld.float_vec_bld, gallivm, float_vec_type);
+ lp_build_context_init(&bld.float_vec_bld, gallivm, type);
lp_build_context_init(&bld.int_bld, gallivm, bld.int_type);
lp_build_context_init(&bld.coord_bld, gallivm, bld.coord_type);
lp_build_context_init(&bld.int_coord_bld, gallivm, bld.int_coord_type);
lp_build_context_init(&bld.int_size_bld, gallivm, bld.int_size_type);
lp_build_context_init(&bld.float_size_bld, gallivm, bld.float_size_type);
lp_build_context_init(&bld.texel_bld, gallivm, bld.texel_type);
+ lp_build_context_init(&bld.perquadf_bld, gallivm, bld.perquadf_type);
+ lp_build_context_init(&bld.perquadi_bld, gallivm, bld.perquadi_type);
/* Get the dynamic state */
- bld.width = dynamic_state->width(dynamic_state, gallivm, unit);
- bld.height = dynamic_state->height(dynamic_state, gallivm, unit);
- bld.depth = dynamic_state->depth(dynamic_state, gallivm, unit);
+ tex_width = dynamic_state->width(dynamic_state, gallivm, unit);
+ tex_height = dynamic_state->height(dynamic_state, gallivm, unit);
+ tex_depth = dynamic_state->depth(dynamic_state, gallivm, unit);
bld.row_stride_array = dynamic_state->row_stride(dynamic_state, gallivm, unit);
bld.img_stride_array = dynamic_state->img_stride(dynamic_state, gallivm, unit);
bld.data_array = dynamic_state->data_ptr(dynamic_state, gallivm, unit);
@@ -1228,37 +1307,40 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
/* width, height, depth as single int vector */
if (dims <= 1) {
- bld.int_size = bld.width;
+ bld.int_size = tex_width;
}
else {
bld.int_size = LLVMBuildInsertElement(builder, bld.int_size_bld.undef,
- bld.width, LLVMConstInt(i32t, 0, 0), "");
+ tex_width, LLVMConstInt(i32t, 0, 0), "");
if (dims >= 2) {
bld.int_size = LLVMBuildInsertElement(builder, bld.int_size,
- bld.height, LLVMConstInt(i32t, 1, 0), "");
+ tex_height, LLVMConstInt(i32t, 1, 0), "");
if (dims >= 3) {
bld.int_size = LLVMBuildInsertElement(builder, bld.int_size,
- bld.depth, LLVMConstInt(i32t, 2, 0), "");
+ tex_depth, LLVMConstInt(i32t, 2, 0), "");
}
}
}
if (0) {
/* For debug: no-op texture sampling */
- lp_build_sample_nop(gallivm, bld.texel_type, texel_out);
- }
- else if (util_format_fits_8unorm(bld.format_desc) &&
- lp_is_simple_wrap_mode(static_state->wrap_s) &&
- lp_is_simple_wrap_mode(static_state->wrap_t)) {
- /* do sampling/filtering with fixed pt arithmetic */
- lp_build_sample_aos(&bld, unit, s, t, r, ddx, ddy,
- lod_bias, explicit_lod,
+ lp_build_sample_nop(gallivm,
+ bld.texel_type,
+ num_coords,
+ coords,
texel_out);
}
-
else {
+ LLVMValueRef lod_ipart = NULL, lod_fpart = NULL;
+ LLVMValueRef ilevel0 = NULL, ilevel1 = NULL;
+ unsigned num_quads = type.length / 4;
+ const unsigned mip_filter = bld.static_state->min_mip_filter;
+ boolean use_aos = util_format_fits_8unorm(bld.format_desc) &&
+ lp_is_simple_wrap_mode(static_state->wrap_s) &&
+ lp_is_simple_wrap_mode(static_state->wrap_t);
+
if ((gallivm_debug & GALLIVM_DEBUG_PERF) &&
- util_format_fits_8unorm(bld.format_desc)) {
+ !use_aos && util_format_fits_8unorm(bld.format_desc)) {
debug_printf("%s: using floating point linear filtering for %s\n",
__FUNCTION__, bld.format_desc->short_name);
debug_printf(" min_img %d mag_img %d mip %d wraps %d wrapt %d\n",
@@ -1269,9 +1351,203 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
static_state->wrap_t);
}
- lp_build_sample_general(&bld, unit, s, t, r, ddx, ddy,
- lod_bias, explicit_lod,
- texel_out);
+ lp_build_sample_common(&bld, unit,
+ &s, &t, &r,
+ derivs, lod_bias, explicit_lod,
+ &lod_ipart, &lod_fpart,
+ &ilevel0, &ilevel1);
+
+ /*
+ * we only try 8-wide sampling with soa as it appears to
+ * be a loss with aos with AVX.
+ */
+ if (num_quads == 1 || (mip_filter == PIPE_TEX_MIPFILTER_NONE &&
+ !use_aos)) {
+
+ if (num_quads > 1) {
+ LLVMValueRef index0 = lp_build_const_int32(gallivm, 0);
+ /* These parameters are the same for all quads */
+ lod_ipart = LLVMBuildExtractElement(builder, lod_ipart, index0, "");
+ ilevel0 = LLVMBuildExtractElement(builder, ilevel0, index0, "");
+ }
+ if (use_aos) {
+ /* do sampling/filtering with fixed pt arithmetic */
+ lp_build_sample_aos(&bld, unit,
+ s, t, r,
+ lod_ipart, lod_fpart,
+ ilevel0, ilevel1,
+ texel_out);
+ }
+
+ else {
+ lp_build_sample_general(&bld, unit,
+ s, t, r,
+ lod_ipart, lod_fpart,
+ ilevel0, ilevel1,
+ texel_out);
+ }
+ }
+ else {
+ struct lp_build_if_state if_ctx;
+ LLVMValueRef notsame_levels, notsame;
+ LLVMValueRef index0 = lp_build_const_int32(gallivm, 0);
+ LLVMValueRef texels[4];
+ LLVMValueRef texelout[4];
+ unsigned j;
+
+ texels[0] = lp_build_alloca(gallivm, bld.texel_bld.vec_type, "texr");
+ texels[1] = lp_build_alloca(gallivm, bld.texel_bld.vec_type, "texg");
+ texels[2] = lp_build_alloca(gallivm, bld.texel_bld.vec_type, "texb");
+ texels[3] = lp_build_alloca(gallivm, bld.texel_bld.vec_type, "texa");
+
+ /* only build the if if we MAY split, otherwise always split */
+ if (!use_aos) {
+ notsame = lp_build_extract_broadcast(gallivm,
+ bld.perquadi_bld.type,
+ bld.perquadi_bld.type,
+ ilevel0, index0);
+ notsame = lp_build_sub(&bld.perquadi_bld, ilevel0, notsame);
+ notsame_levels = lp_build_any_true_range(&bld.perquadi_bld, num_quads,
+ notsame);
+ if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
+ notsame = lp_build_extract_broadcast(gallivm,
+ bld.perquadi_bld.type,
+ bld.perquadi_bld.type,
+ ilevel1, index0);
+ notsame = lp_build_sub(&bld.perquadi_bld, ilevel1, notsame);
+ notsame = lp_build_any_true_range(&bld.perquadi_bld, num_quads, notsame);
+ notsame_levels = LLVMBuildOr(builder, notsame_levels, notsame, "");
+ }
+ lp_build_if(&if_ctx, gallivm, notsame_levels);
+ }
+
+ {
+ struct lp_build_sample_context bld4;
+ struct lp_type type4 = type;
+ unsigned i;
+ LLVMValueRef texelout4[4];
+ LLVMValueRef texelouttmp[4][LP_MAX_VECTOR_LENGTH/16];
+
+ type4.length = 4;
+
+ /* Setup our build context */
+ memset(&bld4, 0, sizeof bld4);
+ bld4.gallivm = bld.gallivm;
+ bld4.static_state = bld.static_state;
+ bld4.dynamic_state = bld.dynamic_state;
+ bld4.format_desc = bld.format_desc;
+ bld4.dims = bld.dims;
+ bld4.row_stride_array = bld.row_stride_array;
+ bld4.img_stride_array = bld.img_stride_array;
+ bld4.data_array = bld.data_array;
+ bld4.int_size = bld.int_size;
+
+ bld4.vector_width = lp_type_width(type4);
+
+ bld4.float_type = lp_type_float(32);
+ bld4.int_type = lp_type_int(32);
+ bld4.coord_type = type4;
+ bld4.int_coord_type = lp_int_type(type4);
+ bld4.float_size_type = lp_type_float(32);
+ bld4.float_size_type.length = dims > 1 ? 4 : 1;
+ bld4.int_size_type = lp_int_type(bld4.float_size_type);
+ bld4.texel_type = type4;
+ bld4.perquadf_type = type4;
+ /* we want native vector size to be able to use our intrinsics */
+ bld4.perquadf_type.length = 1;
+ bld4.perquadi_type = lp_int_type(bld4.perquadf_type);
+
+ lp_build_context_init(&bld4.float_bld, gallivm, bld4.float_type);
+ lp_build_context_init(&bld4.float_vec_bld, gallivm, type4);
+ lp_build_context_init(&bld4.int_bld, gallivm, bld4.int_type);
+ lp_build_context_init(&bld4.coord_bld, gallivm, bld4.coord_type);
+ lp_build_context_init(&bld4.int_coord_bld, gallivm, bld4.int_coord_type);
+ lp_build_context_init(&bld4.int_size_bld, gallivm, bld4.int_size_type);
+ lp_build_context_init(&bld4.float_size_bld, gallivm, bld4.float_size_type);
+ lp_build_context_init(&bld4.texel_bld, gallivm, bld4.texel_type);
+ lp_build_context_init(&bld4.perquadf_bld, gallivm, bld4.perquadf_type);
+ lp_build_context_init(&bld4.perquadi_bld, gallivm, bld4.perquadi_type);
+
+ for (i = 0; i < num_quads; i++) {
+ LLVMValueRef s4, t4, r4;
+ LLVMValueRef lod_iparts, lod_fparts = NULL;
+ LLVMValueRef ilevel0s, ilevel1s = NULL;
+ LLVMValueRef indexi = lp_build_const_int32(gallivm, i);
+
+ s4 = lp_build_extract_range(gallivm, s, 4*i, 4);
+ t4 = lp_build_extract_range(gallivm, t, 4*i, 4);
+ r4 = lp_build_extract_range(gallivm, r, 4*i, 4);
+ lod_iparts = LLVMBuildExtractElement(builder, lod_ipart, indexi, "");
+ ilevel0s = LLVMBuildExtractElement(builder, ilevel0, indexi, "");
+ if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
+ ilevel1s = LLVMBuildExtractElement(builder, ilevel1, indexi, "");
+ lod_fparts = LLVMBuildExtractElement(builder, lod_fpart, indexi, "");
+ }
+
+ if (use_aos) {
+ /* do sampling/filtering with fixed pt arithmetic */
+ lp_build_sample_aos(&bld4, unit,
+ s4, t4, r4,
+ lod_iparts, lod_fparts,
+ ilevel0s, ilevel1s,
+ texelout4);
+ }
+
+ else {
+ lp_build_sample_general(&bld4, unit,
+ s4, t4, r4,
+ lod_iparts, lod_fparts,
+ ilevel0s, ilevel1s,
+ texelout4);
+ }
+ for (j = 0; j < 4; j++) {
+ texelouttmp[j][i] = texelout4[j];
+ }
+ }
+ for (j = 0; j < 4; j++) {
+ texelout[j] = lp_build_concat(gallivm, texelouttmp[j], type4, num_quads);
+ LLVMBuildStore(builder, texelout[j], texels[j]);
+ }
+ }
+ if (!use_aos) {
+ LLVMValueRef ilevel0s, lod_iparts, ilevel1s = NULL;
+
+ lp_build_else(&if_ctx);
+
+ /* These parameters are the same for all quads */
+ lod_iparts = LLVMBuildExtractElement(builder, lod_ipart, index0, "");
+ ilevel0s = LLVMBuildExtractElement(builder, ilevel0, index0, "");
+ if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
+ ilevel1s = LLVMBuildExtractElement(builder, ilevel1, index0, "");
+ }
+
+ if (use_aos) {
+ /* do sampling/filtering with fixed pt arithmetic */
+ lp_build_sample_aos(&bld, unit,
+ s, t, r,
+ lod_iparts, lod_fpart,
+ ilevel0s, ilevel1s,
+ texelout);
+ }
+
+ else {
+ lp_build_sample_general(&bld, unit,
+ s, t, r,
+ lod_iparts, lod_fpart,
+ ilevel0s, ilevel1s,
+ texelout);
+ }
+ for (j = 0; j < 4; j++) {
+ LLVMBuildStore(builder, texelout[j], texels[j]);
+ }
+
+ lp_build_endif(&if_ctx);
+ }
+
+ for (j = 0; j < 4; j++) {
+ texel_out[j] = LLVMBuildLoad(builder, texels[j], "");
+ }
+ }
}
lp_build_sample_compare(&bld, r, texel_out);
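When the per-quad mip levels may differ, the 8-wide path above falls back to sampling each 4-wide quad on its own and concatenating the results, which is what the lp_build_extract_range/lp_build_concat calls express. A hypothetical scalar picture of that split (the function pointer stands in for the per-quad sampler):

   typedef void (*sample_quad_fn)(const float *s, const float *t,
                                  int level, float *out);

   static void
   sample_two_quads(const float s[8], const float t[8], const int level[2],
                    sample_quad_fn sample_quad, float out[8])
   {
      int q;
      for (q = 0; q < 2; q++)
         sample_quad(s + 4 * q, t + 4 * q, level[q], out + 4 * q);
   }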
@@ -1283,6 +1559,7 @@ void
lp_build_size_query_soa(struct gallivm_state *gallivm,
const struct lp_sampler_static_state *static_state,
struct lp_sampler_dynamic_state *dynamic_state,
+ struct lp_type int_type,
unsigned unit,
LLVMValueRef explicit_lod,
LLVMValueRef *sizes_out)
@@ -1311,7 +1588,9 @@ lp_build_size_query_soa(struct gallivm_state *gallivm,
return;
}
- lp_build_context_init(&bld_int_vec, gallivm, lp_type_int_vec(32));
+ assert(!int_type.floating);
+
+ lp_build_context_init(&bld_int_vec, gallivm, lp_type_int_vec(32, 128));
if (explicit_lod) {
LLVMValueRef first_level;
@@ -1345,7 +1624,7 @@ lp_build_size_query_soa(struct gallivm_state *gallivm,
size = lp_build_minify(&bld_int_vec, size, lod);
for (i=0; i < dims; i++) {
- sizes_out[i] = lp_build_extract_broadcast(gallivm, bld_int_vec.type, bld_int_vec.type,
+ sizes_out[i] = lp_build_extract_broadcast(gallivm, bld_int_vec.type, int_type,
size,
lp_build_const_int32(gallivm, i));
}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
index 5d4406812c7..641c960431d 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
@@ -40,6 +40,7 @@
#include "lp_bld_init.h"
#include "lp_bld_logic.h"
#include "lp_bld_swizzle.h"
+#include "lp_bld_pack.h"
LLVMValueRef
@@ -95,7 +96,7 @@ lp_build_broadcast_scalar(struct lp_build_context *bld,
/**
- * Combined extract and broadcast (or a mere shuffle when the two types match)
+ * Combined extract and broadcast (mere shuffle in most cases)
*/
LLVMValueRef
lp_build_extract_broadcast(struct gallivm_state *gallivm,
@@ -132,9 +133,9 @@ lp_build_extract_broadcast(struct gallivm_state *gallivm,
}
}
else {
- if (dst_type.length == src_type.length) {
+ if (dst_type.length > 1) {
/*
- * Special shuffle of the same size.
+ * shuffle - result can be of different length.
*/
LLVMValueRef shuffle;
@@ -142,28 +143,14 @@ lp_build_extract_broadcast(struct gallivm_state *gallivm,
LLVMVectorType(i32t, dst_type.length),
index);
res = LLVMBuildShuffleVector(gallivm->builder, vector,
- LLVMGetUndef(lp_build_vec_type(gallivm, dst_type)),
+ LLVMGetUndef(lp_build_vec_type(gallivm, src_type)),
shuffle, "");
}
else {
- LLVMValueRef scalar;
- scalar = LLVMBuildExtractElement(gallivm->builder, vector, index, "");
- if (dst_type.length == 1) {
- /*
- * Trivial extract scalar from vector.
- */
-
- res = scalar;
- }
- else {
- /*
- * General case of different sized vectors.
- */
-
- res = lp_build_broadcast(gallivm,
- lp_build_vec_type(gallivm, dst_type),
- vector);
- }
+ /*
+ * Trivial extract scalar from vector.
+ */
+ res = LLVMBuildExtractElement(gallivm->builder, vector, index, "");
}
}
@@ -290,6 +277,8 @@ lp_build_swizzle_aos(struct lp_build_context *bld,
return bld->zero;
case PIPE_SWIZZLE_ONE:
return bld->one;
+ case LP_BLD_SWIZZLE_DONTCARE:
+ return bld->undef;
default:
assert(0);
return bld->undef;
@@ -319,21 +308,26 @@ lp_build_swizzle_aos(struct lp_build_context *bld,
case PIPE_SWIZZLE_BLUE:
case PIPE_SWIZZLE_ALPHA:
shuffle = j + swizzles[i];
+ shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0);
break;
case PIPE_SWIZZLE_ZERO:
shuffle = type.length + 0;
+ shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0);
if (!aux[0]) {
aux[0] = lp_build_const_elem(bld->gallivm, type, 0.0);
}
break;
case PIPE_SWIZZLE_ONE:
shuffle = type.length + 1;
+ shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0);
if (!aux[1]) {
aux[1] = lp_build_const_elem(bld->gallivm, type, 1.0);
}
break;
+ case LP_BLD_SWIZZLE_DONTCARE:
+ shuffles[j + i] = LLVMGetUndef(i32t);
+ break;
}
- shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0);
}
}
@@ -508,3 +502,127 @@ lp_build_swizzle_soa_inplace(struct lp_build_context *bld,
lp_build_swizzle_soa(bld, unswizzled, swizzles, values);
}
+
+
+/**
+ * Transpose from AOS <-> SOA
+ *
+ * @param single_type_lp type of pixels
+ * @param src the 4 * n pixel input
+ * @param dst the 4 * n pixel output
+ */
+void
+lp_build_transpose_aos(struct gallivm_state *gallivm,
+ struct lp_type single_type_lp,
+ const LLVMValueRef src[4],
+ LLVMValueRef dst[4])
+{
+ struct lp_type double_type_lp = single_type_lp;
+ LLVMTypeRef single_type;
+ LLVMTypeRef double_type;
+ LLVMValueRef t0, t1, t2, t3;
+
+ double_type_lp.length >>= 1;
+ double_type_lp.width <<= 1;
+
+ double_type = lp_build_vec_type(gallivm, double_type_lp);
+ single_type = lp_build_vec_type(gallivm, single_type_lp);
+
+ /* Interleave x, y, z, w -> xy and zw */
+ t0 = lp_build_interleave2_half(gallivm, single_type_lp, src[0], src[1], 0);
+ t1 = lp_build_interleave2_half(gallivm, single_type_lp, src[2], src[3], 0);
+ t2 = lp_build_interleave2_half(gallivm, single_type_lp, src[0], src[1], 1);
+ t3 = lp_build_interleave2_half(gallivm, single_type_lp, src[2], src[3], 1);
+
+ /* Cast to double width type for second interleave */
+ t0 = LLVMBuildBitCast(gallivm->builder, t0, double_type, "t0");
+ t1 = LLVMBuildBitCast(gallivm->builder, t1, double_type, "t1");
+ t2 = LLVMBuildBitCast(gallivm->builder, t2, double_type, "t2");
+ t3 = LLVMBuildBitCast(gallivm->builder, t3, double_type, "t3");
+
+ /* Interleave xy, zw -> xyzw */
+ dst[0] = lp_build_interleave2_half(gallivm, double_type_lp, t0, t1, 0);
+ dst[1] = lp_build_interleave2_half(gallivm, double_type_lp, t0, t1, 1);
+ dst[2] = lp_build_interleave2_half(gallivm, double_type_lp, t2, t3, 0);
+ dst[3] = lp_build_interleave2_half(gallivm, double_type_lp, t2, t3, 1);
+
+ /* Cast back to original single width type */
+ dst[0] = LLVMBuildBitCast(gallivm->builder, dst[0], single_type, "dst0");
+ dst[1] = LLVMBuildBitCast(gallivm->builder, dst[1], single_type, "dst1");
+ dst[2] = LLVMBuildBitCast(gallivm->builder, dst[2], single_type, "dst2");
+ dst[3] = LLVMBuildBitCast(gallivm->builder, dst[3], single_type, "dst3");
+}
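For reference, a minimal scalar sketch of what the interleave sequence above computes, assuming 4-wide vectors (the helper name and plain-array types are hypothetical, for illustration only):

static void
transpose4x4_ref(const float src[4][4], float dst[4][4])
{
   unsigned i, j;
   /* AOS <-> SOA is just a 4x4 transpose of the pixel/channel matrix */
   for (j = 0; j < 4; j++)
      for (i = 0; i < 4; i++)
         dst[j][i] = src[i][j];
}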
+
+
+/**
+ * Pack first element of aos values,
+ * pad out to destination size.
+ * i.e. x1 _ _ _ x2 _ _ _ will become x1 x2 _ _
+ */
+LLVMValueRef
+lp_build_pack_aos_scalars(struct gallivm_state *gallivm,
+ struct lp_type src_type,
+ struct lp_type dst_type,
+ const LLVMValueRef src)
+{
+ LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
+ LLVMValueRef undef = LLVMGetUndef(i32t);
+ LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
+ unsigned num_src = src_type.length / 4;
+ unsigned num_dst = dst_type.length;
+ unsigned i;
+
+ assert(num_src <= num_dst);
+
+ for (i = 0; i < num_src; i++) {
+ shuffles[i] = LLVMConstInt(i32t, i * 4, 0);
+ }
+ for (i = num_src; i < num_dst; i++) {
+ shuffles[i] = undef;
+ }
+
+ if (num_dst == 1) {
+ return LLVMBuildExtractElement(gallivm->builder, src, shuffles[0], "");
+ }
+ else {
+ return LLVMBuildShuffleVector(gallivm->builder, src, src,
+ LLVMConstVector(shuffles, num_dst), "");
+ }
+}
+
+
+/**
+ * Unpack and broadcast packed aos values which contain only the
+ * first value of each pixel, i.e. x1 x2 _ _ will become x1 x1 x1 x1 x2 x2 x2 x2
+ */
+LLVMValueRef
+lp_build_unpack_broadcast_aos_scalars(struct gallivm_state *gallivm,
+ struct lp_type src_type,
+ struct lp_type dst_type,
+ const LLVMValueRef src)
+{
+ LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
+ LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
+ unsigned num_dst = dst_type.length;
+ unsigned num_src = dst_type.length / 4;
+ unsigned i;
+
+ assert(num_dst / 4 <= src_type.length);
+
+ for (i = 0; i < num_src; i++) {
+ shuffles[i*4] = LLVMConstInt(i32t, i, 0);
+ shuffles[i*4+1] = LLVMConstInt(i32t, i, 0);
+ shuffles[i*4+2] = LLVMConstInt(i32t, i, 0);
+ shuffles[i*4+3] = LLVMConstInt(i32t, i, 0);
+ }
+
+ if (num_src == 1) {
+ return lp_build_extract_broadcast(gallivm, src_type, dst_type,
+ src, shuffles[0]);
+ }
+ else {
+ return LLVMBuildShuffleVector(gallivm->builder, src, src,
+ LLVMConstVector(shuffles, num_dst), "");
+ }
+}
+
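A rough scalar model of the two shuffle helpers above (hypothetical reference functions, assuming float elements; padding lanes are shown as zero although the real code leaves them undefined):

static void
pack_aos_scalars_ref(const float *src, unsigned src_len,
                     float *dst, unsigned dst_len)
{
   unsigned i;
   for (i = 0; i < src_len / 4; i++)
      dst[i] = src[4 * i];      /* first (x) element of each aos pixel */
   for (; i < dst_len; i++)
      dst[i] = 0.0f;            /* pad lanes are don't-care in the real code */
}

static void
unpack_broadcast_aos_scalars_ref(const float *src, float *dst,
                                 unsigned dst_len)
{
   unsigned i;
   for (i = 0; i < dst_len; i++)
      dst[i] = src[i / 4];      /* broadcast each scalar to a whole quad */
}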
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h
index c366a65103e..0bf4ce988a2 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h
@@ -44,6 +44,9 @@ struct lp_type;
struct lp_build_context;
+#define LP_BLD_SWIZZLE_DONTCARE 0xFF
+
+
LLVMValueRef
lp_build_broadcast(struct gallivm_state *gallivm,
LLVMTypeRef vec_type,
@@ -103,4 +106,25 @@ lp_build_swizzle_soa_inplace(struct lp_build_context *bld,
const unsigned char swizzles[4]);
+void
+lp_build_transpose_aos(struct gallivm_state *gallivm,
+ struct lp_type type,
+ const LLVMValueRef src[4],
+ LLVMValueRef dst[4]);
+
+
+LLVMValueRef
+lp_build_pack_aos_scalars(struct gallivm_state *gallivm,
+ struct lp_type src_type,
+ struct lp_type dst_type,
+ const LLVMValueRef src);
+
+
+LLVMValueRef
+lp_build_unpack_broadcast_aos_scalars(struct gallivm_state *gallivm,
+ struct lp_type src_type,
+ struct lp_type dst_type,
+ const LLVMValueRef src);
+
+
#endif /* !LP_BLD_SWIZZLE_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
index 4423bc5dedd..e292420a61a 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
@@ -60,6 +60,7 @@ struct tgsi_token;
struct tgsi_shader_info;
struct lp_build_mask_context;
struct gallivm_state;
+struct lp_derivatives;
enum lp_build_tex_modifier {
@@ -174,8 +175,7 @@ struct lp_build_sampler_soa
unsigned unit,
unsigned num_coords,
const LLVMValueRef *coords,
- const LLVMValueRef *ddx,
- const LLVMValueRef *ddy,
+ const struct lp_derivatives *derivs,
LLVMValueRef lod_bias, /* optional */
LLVMValueRef explicit_lod, /* optional */
LLVMValueRef *texel);
@@ -183,6 +183,7 @@ struct lp_build_sampler_soa
void
(*emit_size_query)( const struct lp_build_sampler_soa *sampler,
struct gallivm_state *gallivm,
+ struct lp_type type,
unsigned unit,
LLVMValueRef explicit_lod, /* optional */
LLVMValueRef *sizes_out);
@@ -197,8 +198,7 @@ struct lp_build_sampler_aos
unsigned target, /* TGSI_TEXTURE_* */
unsigned unit,
LLVMValueRef coords,
- LLVMValueRef ddx,
- LLVMValueRef ddy,
+ const struct lp_derivatives derivs,
enum lp_build_tex_modifier modifier);
};
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
index 24bc13a9be8..0666bba7fbd 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
@@ -56,6 +56,7 @@
#include "lp_bld_quad.h"
#include "lp_bld_tgsi.h"
#include "lp_bld_debug.h"
+#include "lp_bld_sample.h"
/**
@@ -363,6 +364,7 @@ emit_tex(struct lp_build_tgsi_aos_context *bld,
LLVMValueRef coords;
LLVMValueRef ddx;
LLVMValueRef ddy;
+ struct lp_derivatives derivs;
if (!bld->sampler) {
_debug_printf("warning: found texture instruction but no sampler generator supplied\n");
@@ -373,7 +375,7 @@ emit_tex(struct lp_build_tgsi_aos_context *bld,
coords = lp_build_emit_fetch( &bld->bld_base, inst, 0 , LP_CHAN_ALL);
- if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV) {
+ if (0 && modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV) {
ddx = lp_build_emit_fetch( &bld->bld_base, inst, 1 , LP_CHAN_ALL);
ddy = lp_build_emit_fetch( &bld->bld_base, inst, 2 , LP_CHAN_ALL);
unit = inst->Src[3].Register.Index;
@@ -383,8 +385,8 @@ emit_tex(struct lp_build_tgsi_aos_context *bld,
ddy = lp_build_ddy( &bld->bld_base.base, coords );
#else
/* TODO */
- ddx = bld->bld_base.base.one;
- ddy = bld->bld_base.base.one;
+ derivs.ddx_ddy[0] = bld->bld_base.base.one;
+ derivs.ddx_ddy[1] = bld->bld_base.base.one;
#endif
unit = inst->Src[1].Register.Index;
}
@@ -392,7 +394,7 @@ emit_tex(struct lp_build_tgsi_aos_context *bld,
return bld->sampler->emit_fetch_texel(bld->sampler,
&bld->bld_base.base,
target, unit,
- coords, ddx, ddy,
+ coords, derivs,
modifier);
}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index d9faaf20273..85a4401b534 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -62,6 +62,7 @@
#include "lp_bld_limits.h"
#include "lp_bld_debug.h"
#include "lp_bld_printf.h"
+#include "lp_bld_sample.h"
static void lp_exec_mask_init(struct lp_exec_mask *mask, struct lp_build_context *bld)
@@ -763,7 +764,7 @@ emit_fetch_temporary(
else {
LLVMValueRef temp_ptr;
if (stype != TGSI_TYPE_FLOAT && stype != TGSI_TYPE_UNTYPED) {
- LLVMTypeRef itype = LLVMPointerType(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4), 0);
+ LLVMTypeRef itype = LLVMPointerType(bld->bld_base.int_bld.vec_type, 0);
LLVMValueRef tint_ptr = lp_get_temp_ptr_soa(bld, reg->Register.Index,
swizzle);
temp_ptr = LLVMBuildBitCast(builder, tint_ptr, itype, "");
@@ -1068,7 +1069,7 @@ emit_store_chan(
switch (dtype) {
case TGSI_TYPE_UNSIGNED:
case TGSI_TYPE_SIGNED: {
- LLVMTypeRef itype = LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4);
+ LLVMTypeRef itype = bld_base->int_bld.vec_type;
LLVMTypeRef ivtype = LLVMPointerType(itype, 0);
LLVMValueRef tint_ptr = lp_get_temp_ptr_soa(bld, reg->Register.Index,
chan_index);
@@ -1141,13 +1142,14 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
LLVMValueRef *texel)
{
LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
+ struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
unsigned unit;
LLVMValueRef lod_bias, explicit_lod;
LLVMValueRef oow = NULL;
LLVMValueRef coords[3];
- LLVMValueRef ddx[3];
- LLVMValueRef ddy[3];
+ struct lp_derivatives derivs;
unsigned num_coords;
+ unsigned dims;
unsigned i;
if (!bld->sampler) {
@@ -1158,26 +1160,42 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
return;
}
+ derivs.ddx_ddy[0] = bld->bld_base.base.undef;
+ derivs.ddx_ddy[1] = bld->bld_base.base.undef;
+
switch (inst->Texture.Texture) {
case TGSI_TEXTURE_1D:
num_coords = 1;
+ dims = 1;
break;
case TGSI_TEXTURE_1D_ARRAY:
+ num_coords = 2;
+ dims = 1;
+ break;
case TGSI_TEXTURE_2D:
case TGSI_TEXTURE_RECT:
num_coords = 2;
+ dims = 2;
break;
case TGSI_TEXTURE_SHADOW1D:
case TGSI_TEXTURE_SHADOW1D_ARRAY:
+ num_coords = 3;
+ dims = 1;
+ break;
case TGSI_TEXTURE_SHADOW2D:
case TGSI_TEXTURE_SHADOWRECT:
case TGSI_TEXTURE_2D_ARRAY:
- case TGSI_TEXTURE_3D:
case TGSI_TEXTURE_CUBE:
num_coords = 3;
+ dims = 2;
+ break;
+ case TGSI_TEXTURE_3D:
+ num_coords = 3;
+ dims = 3;
break;
case TGSI_TEXTURE_SHADOW2D_ARRAY:
num_coords = 4;
+ dims = 2;
break;
default:
assert(0);
@@ -1212,31 +1230,66 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
}
if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV) {
- LLVMValueRef index0 = lp_build_const_int32(bld->bld_base.base.gallivm, 0);
- for (i = 0; i < num_coords; i++) {
- LLVMValueRef src1 = lp_build_emit_fetch( &bld->bld_base, inst, 1, i );
- LLVMValueRef src2 = lp_build_emit_fetch( &bld->bld_base, inst, 2, i );
- ddx[i] = LLVMBuildExtractElement(builder, src1, index0, "");
- ddy[i] = LLVMBuildExtractElement(builder, src2, index0, "");
+ LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
+ LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
+ LLVMValueRef ddxdyonec[3];
+ unsigned length = bld->bld_base.base.type.length;
+ unsigned num_quads = length / 4;
+ unsigned dim;
+ unsigned quad;
+
+ for (dim = 0; dim < dims; ++dim) {
+ LLVMValueRef srcx = lp_build_emit_fetch( &bld->bld_base, inst, 1, dim );
+ LLVMValueRef srcy = lp_build_emit_fetch( &bld->bld_base, inst, 2, dim );
+ for (quad = 0; quad < num_quads; ++quad) {
+ unsigned s1 = 4*quad;
+ unsigned s2 = 4*quad + length;
+ shuffles[4*quad + 0] = lp_build_const_int32(gallivm, s1);
+ shuffles[4*quad + 1] = lp_build_const_int32(gallivm, s2);
+ shuffles[4*quad + 2] = i32undef;
+ shuffles[4*quad + 3] = i32undef;
+ }
+ ddxdyonec[dim] = LLVMBuildShuffleVector(builder, srcx, srcy,
+ LLVMConstVector(shuffles, length), "");
+ }
+ if (dims == 1) {
+ derivs.ddx_ddy[0] = ddxdyonec[0];
+ }
+ else if (dims >= 2) {
+ for (quad = 0; quad < num_quads; ++quad) {
+ unsigned s1 = 4*quad;
+ unsigned s2 = 4*quad + length;
+ shuffles[4*quad + 0] = lp_build_const_int32(gallivm, s1);
+ shuffles[4*quad + 1] = lp_build_const_int32(gallivm, s1 + 1);
+ shuffles[4*quad + 2] = lp_build_const_int32(gallivm, s2);
+ shuffles[4*quad + 3] = lp_build_const_int32(gallivm, s2 + 1);
+ }
+ derivs.ddx_ddy[0] = LLVMBuildShuffleVector(builder, ddxdyonec[0], ddxdyonec[1],
+ LLVMConstVector(shuffles, length), "");
+ if (dims == 3) {
+ derivs.ddx_ddy[1] = ddxdyonec[2];
+ }
}
unit = inst->Src[3].Register.Index;
} else {
- for (i = 0; i < num_coords; i++) {
- ddx[i] = lp_build_scalar_ddx( &bld->bld_base.base, coords[i] );
- ddy[i] = lp_build_scalar_ddy( &bld->bld_base.base, coords[i] );
+ if (dims == 1) {
+ derivs.ddx_ddy[0] = lp_build_packed_ddx_ddy_onecoord(&bld->bld_base.base, coords[0]);
+ }
+ else if (dims >= 2) {
+ derivs.ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(&bld->bld_base.base,
+ coords[0], coords[1]);
+ if (dims == 3) {
+ derivs.ddx_ddy[1] = lp_build_packed_ddx_ddy_onecoord(&bld->bld_base.base, coords[2]);
+ }
}
unit = inst->Src[1].Register.Index;
}
- for (i = num_coords; i < 3; i++) {
- ddx[i] = LLVMGetUndef(bld->bld_base.base.elem_type);
- ddy[i] = LLVMGetUndef(bld->bld_base.base.elem_type);
- }
bld->sampler->emit_fetch_texel(bld->sampler,
bld->bld_base.base.gallivm,
bld->bld_base.base.type,
unit, num_coords, coords,
- ddx, ddy,
+ &derivs,
lod_bias, explicit_lod,
texel);
}
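As far as I can tell from the shuffle indices above, the per-quad layout of the packed derivatives ends up as sketched below; this is an inference, not taken verbatim from the patch, and the helper name is purely illustrative:

static void
pack_quad_derivs_ref(float dsdx, float dsdy, float dtdx, float dtdy,
                     float ddx_ddy0[4])
{
   /* assumed layout of derivs.ddx_ddy[0] for one quad, 2D case */
   ddx_ddy0[0] = dsdx;   /* srcx, coord 0, first lane of the quad */
   ddx_ddy0[1] = dsdy;   /* srcy, coord 0 */
   ddx_ddy0[2] = dtdx;   /* srcx, coord 1 */
   ddx_ddy0[3] = dtdy;   /* srcy, coord 1 */
}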
@@ -1310,6 +1363,7 @@ emit_txq( struct lp_build_tgsi_soa_context *bld,
bld->sampler->emit_size_query(bld->sampler,
bld->bld_base.base.gallivm,
+ bld->bld_base.int_bld.type,
inst->Src[1].Register.Index,
explicit_lod,
sizes_out);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_type.c b/src/gallium/auxiliary/gallivm/lp_bld_type.c
index 413e69bedac..6c3aa38bfb1 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_type.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_type.c
@@ -38,6 +38,9 @@ lp_build_elem_type(struct gallivm_state *gallivm, struct lp_type type)
{
if (type.floating) {
switch(type.width) {
+ case 16:
+ return LLVMIntTypeInContext(gallivm->context, 16);
+ break;
case 32:
return LLVMFloatTypeInContext(gallivm->context);
break;
@@ -85,6 +88,10 @@ lp_check_elem_type(struct lp_type type, LLVMTypeRef elem_type)
if (type.floating) {
switch(type.width) {
+ case 16:
+ if(elem_kind != LLVMIntegerTypeKind)
+ return FALSE;
+ break;
case 32:
if(elem_kind != LLVMFloatTypeKind)
return FALSE;
@@ -168,27 +175,6 @@ lp_build_int_vec_type(struct gallivm_state *gallivm, struct lp_type type)
/**
- * Build int32[4] vector type
- */
-LLVMTypeRef
-lp_build_int32_vec4_type(struct gallivm_state *gallivm)
-{
- struct lp_type t;
- LLVMTypeRef type;
-
- memset(&t, 0, sizeof(t));
- t.floating = FALSE; /* floating point values */
- t.sign = TRUE; /* values are signed */
- t.norm = FALSE; /* values are not limited to [0,1] or [-1,1] */
- t.width = 32; /* 32-bit int */
- t.length = 4; /* 4 elements per vector */
-
- type = lp_build_int_elem_type(gallivm, t);
- return LLVMVectorType(type, t.length);
-}
-
-
-/**
* Create element of vector type
*/
struct lp_type
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_type.h b/src/gallium/auxiliary/gallivm/lp_bld_type.h
index f11a190e7cc..75310e05f3e 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_type.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_type.h
@@ -40,21 +40,35 @@
#include "pipe/p_compiler.h"
#include "gallivm/lp_bld.h"
+/**
+ * Native SIMD architecture width available at runtime.
+ *
+ * Using this width should give the best performance,
+ * and it determines the necessary alignment of vector variables.
+ */
+extern unsigned lp_native_vector_width;
+/**
+ * Maximum supported vector width (not necessarily supported at run-time).
+ *
+ * Should only be used when lp_native_vector_width isn't available,
+ * i.e. sizing/alignment of non-malloced variables.
+ */
+#define LP_MAX_VECTOR_WIDTH 256
/**
- * Native SIMD register width.
+ * Minimum alignment to use for statically declared vector variables.
*
- * 128 for all architectures we care about.
+ * It should always equal LP_MAX_VECTOR_WIDTH/8, but has to be spelled
+ * as a literal constant since using an expression here is not portable.
*/
-#define LP_NATIVE_VECTOR_WIDTH 128
+#define LP_MIN_VECTOR_ALIGN 32
/**
* Several functions can only cope with vectors of length up to this value.
* You may need to increase that value if you want to represent bigger vectors.
*/
-#define LP_MAX_VECTOR_LENGTH 16
-
+#define LP_MAX_VECTOR_LENGTH (LP_MAX_VECTOR_WIDTH/8)
/**
* The LLVM type system can't conveniently express all the things we care about
@@ -151,6 +165,13 @@ struct lp_build_context
};
+static INLINE unsigned
+lp_type_width(struct lp_type type)
+{
+ return type.width * type.length;
+}
+
+
/** Create scalar float type */
static INLINE struct lp_type
lp_type_float(unsigned width)
@@ -169,7 +190,7 @@ lp_type_float(unsigned width)
/** Create vector of float type */
static INLINE struct lp_type
-lp_type_float_vec(unsigned width)
+lp_type_float_vec(unsigned width, unsigned total_width)
{
struct lp_type res_type;
@@ -177,7 +198,7 @@ lp_type_float_vec(unsigned width)
res_type.floating = TRUE;
res_type.sign = TRUE;
res_type.width = width;
- res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+ res_type.length = total_width / width;
return res_type;
}
@@ -200,14 +221,14 @@ lp_type_int(unsigned width)
/** Create vector int type */
static INLINE struct lp_type
-lp_type_int_vec(unsigned width)
+lp_type_int_vec(unsigned width, unsigned total_width)
{
struct lp_type res_type;
memset(&res_type, 0, sizeof res_type);
res_type.sign = TRUE;
res_type.width = width;
- res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+ res_type.length = total_width / width;
return res_type;
}
@@ -229,34 +250,34 @@ lp_type_uint(unsigned width)
/** Create vector uint type */
static INLINE struct lp_type
-lp_type_uint_vec(unsigned width)
+lp_type_uint_vec(unsigned width, unsigned total_width)
{
struct lp_type res_type;
memset(&res_type, 0, sizeof res_type);
res_type.width = width;
- res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+ res_type.length = total_width / width;
return res_type;
}
static INLINE struct lp_type
-lp_type_unorm(unsigned width)
+lp_type_unorm(unsigned width, unsigned total_width)
{
struct lp_type res_type;
memset(&res_type, 0, sizeof res_type);
res_type.norm = TRUE;
res_type.width = width;
- res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+ res_type.length = total_width / width;
return res_type;
}
static INLINE struct lp_type
-lp_type_fixed(unsigned width)
+lp_type_fixed(unsigned width, unsigned total_width)
{
struct lp_type res_type;
@@ -264,21 +285,21 @@ lp_type_fixed(unsigned width)
res_type.sign = TRUE;
res_type.fixed = TRUE;
res_type.width = width;
- res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+ res_type.length = total_width / width;
return res_type;
}
static INLINE struct lp_type
-lp_type_ufixed(unsigned width)
+lp_type_ufixed(unsigned width, unsigned total_width)
{
struct lp_type res_type;
memset(&res_type, 0, sizeof res_type);
res_type.fixed = TRUE;
res_type.width = width;
- res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+ res_type.length = total_width / width;
return res_type;
}
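A small usage sketch of the changed constructors (illustrative only; it assumes lp_native_vector_width has already been detected, e.g. 128 on SSE2 and 256 on AVX):

/* element width stays fixed, the lane count now follows the SIMD width */
struct lp_type f32_type = lp_type_float_vec(32, lp_native_vector_width);
/* 4 x f32 on SSE2, 8 x f32 on AVX */
struct lp_type u8_type = lp_type_uint_vec(8, lp_native_vector_width);
/* 16 x u8 on SSE2, 32 x u8 on AVX */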
@@ -312,10 +333,6 @@ LLVMTypeRef
lp_build_int_vec_type(struct gallivm_state *gallivm, struct lp_type type);
-LLVMTypeRef
-lp_build_int32_vec4_type(struct gallivm_state *gallivm);
-
-
static INLINE struct lp_type
lp_float32_vec4_type(void)
{
diff --git a/src/gallium/auxiliary/util/u_cpu_detect.h b/src/gallium/auxiliary/util/u_cpu_detect.h
index 856e8d7a0ef..b44d9d9a0fe 100644
--- a/src/gallium/auxiliary/util/u_cpu_detect.h
+++ b/src/gallium/auxiliary/util/u_cpu_detect.h
@@ -35,9 +35,16 @@
#ifndef _UTIL_CPU_DETECT_H
#define _UTIL_CPU_DETECT_H
+
#include "pipe/p_compiler.h"
#include "pipe/p_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
struct util_cpu_caps {
unsigned nr_cpus;
@@ -66,4 +73,9 @@ util_cpu_caps;
void util_cpu_detect(void);
+#ifdef __cplusplus
+}
+#endif
+
+
#endif /* _UTIL_CPU_DETECT_H */
diff --git a/src/gallium/drivers/llvmpipe/.gitignore b/src/gallium/drivers/llvmpipe/.gitignore
index f6973b54d2c..21cd3cf2ed2 100644
--- a/src/gallium/drivers/llvmpipe/.gitignore
+++ b/src/gallium/drivers/llvmpipe/.gitignore
@@ -4,4 +4,3 @@ lp_test_blend
lp_test_conv
lp_test_format
lp_test_printf
-lp_test_round
diff --git a/src/gallium/drivers/llvmpipe/Makefile b/src/gallium/drivers/llvmpipe/Makefile
index 26fbde9a169..ef16fc7d882 100644
--- a/src/gallium/drivers/llvmpipe/Makefile
+++ b/src/gallium/drivers/llvmpipe/Makefile
@@ -55,8 +55,7 @@ PROGS := lp_test_format \
lp_test_arit \
lp_test_blend \
lp_test_conv \
- lp_test_printf \
- lp_test_round
+ lp_test_printf
# Need this for the lp_test_*.o files
CLEAN_EXTRA = *.o
diff --git a/src/gallium/drivers/llvmpipe/SConscript b/src/gallium/drivers/llvmpipe/SConscript
index 85560a1c716..cea44a78679 100644
--- a/src/gallium/drivers/llvmpipe/SConscript
+++ b/src/gallium/drivers/llvmpipe/SConscript
@@ -94,7 +94,6 @@ if not env['embedded']:
if not env['msvc']:
tests.append('arit')
- tests.append('round')
for test in tests:
testname = 'lp_test_' + test
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
index 87a6a2751d4..8efa75c01d3 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
@@ -59,6 +59,7 @@
#include "pipe/p_state.h"
#include "util/u_format.h"
+#include "util/u_cpu_detect.h"
#include "gallivm/lp_bld_type.h"
#include "gallivm/lp_bld_arit.h"
@@ -102,7 +103,16 @@ lp_build_stencil_test_single(struct lp_build_context *bld,
struct lp_type type = bld->type;
LLVMValueRef res;
- assert(type.sign);
+ /*
+ * SSE2 has intrinsics for signed comparisons, but not unsigned ones.
+ * Stencil values are in the 0..255 range, so 8-bit elements must be
+ * unsigned, while wider elements can stay signed and still generate
+ * the fastest comparisons.
+ */
+ if (type.width <= 8) {
+ assert(!type.sign);
+ } else {
+ assert(type.sign);
+ }
assert(stencil->enabled);
@@ -424,29 +434,86 @@ lp_build_occlusion_count(struct gallivm_state *gallivm,
LLVMBuilderRef builder = gallivm->builder;
LLVMContextRef context = gallivm->context;
LLVMValueRef countmask = lp_build_const_int_vec(gallivm, type, 1);
- LLVMValueRef countv = LLVMBuildAnd(builder, maskvalue, countmask, "countv");
- LLVMTypeRef i8v16 = LLVMVectorType(LLVMInt8TypeInContext(context), 16);
- LLVMValueRef counti = LLVMBuildBitCast(builder, countv, i8v16, "counti");
- LLVMValueRef maskarray[4] = {
- lp_build_const_int32(gallivm, 0),
- lp_build_const_int32(gallivm, 4),
- lp_build_const_int32(gallivm, 8),
- lp_build_const_int32(gallivm, 12)
- };
- LLVMValueRef shufflemask = LLVMConstVector(maskarray, 4);
- LLVMValueRef shufflev = LLVMBuildShuffleVector(builder, counti, LLVMGetUndef(i8v16), shufflemask, "shufflev");
- LLVMValueRef shuffle = LLVMBuildBitCast(builder, shufflev, LLVMInt32TypeInContext(context), "shuffle");
- LLVMValueRef count = lp_build_intrinsic_unary(builder, "llvm.ctpop.i32", LLVMInt32TypeInContext(context), shuffle);
- LLVMValueRef orig = LLVMBuildLoad(builder, counter, "orig");
- LLVMValueRef incr = LLVMBuildAdd(builder, orig, count, "incr");
- LLVMBuildStore(builder, incr, counter);
+ LLVMValueRef count, newcount;
+
+ assert(type.length <= 16);
+ assert(type.floating);
+
+ if(util_cpu_caps.has_sse && type.length == 4) {
+ const char *movmskintr = "llvm.x86.sse.movmsk.ps";
+ const char *popcntintr = "llvm.ctpop.i32";
+ LLVMValueRef bits = LLVMBuildBitCast(builder, maskvalue,
+ lp_build_vec_type(gallivm, type), "");
+ bits = lp_build_intrinsic_unary(builder, movmskintr,
+ LLVMInt32TypeInContext(context), bits);
+ count = lp_build_intrinsic_unary(builder, popcntintr,
+ LLVMInt32TypeInContext(context), bits);
+ }
+ else if(util_cpu_caps.has_avx && type.length == 8) {
+ const char *movmskintr = "llvm.x86.avx.movmsk.ps.256";
+ const char *popcntintr = "llvm.ctpop.i32";
+ LLVMValueRef bits = LLVMBuildBitCast(builder, maskvalue,
+ lp_build_vec_type(gallivm, type), "");
+ bits = lp_build_intrinsic_unary(builder, movmskintr,
+ LLVMInt32TypeInContext(context), bits);
+ count = lp_build_intrinsic_unary(builder, popcntintr,
+ LLVMInt32TypeInContext(context), bits);
+ }
+ else {
+ unsigned i;
+ LLVMValueRef countv = LLVMBuildAnd(builder, maskvalue, countmask, "countv");
+ LLVMTypeRef counttype = LLVMIntTypeInContext(context, type.length * 8);
+ LLVMTypeRef i8vntype = LLVMVectorType(LLVMInt8TypeInContext(context), type.length * 4);
+ LLVMValueRef shufflev, countd;
+ LLVMValueRef shuffles[16];
+ const char *popcntintr = NULL;
+
+ countv = LLVMBuildBitCast(builder, countv, i8vntype, "");
+
+ for (i = 0; i < type.length; i++) {
+ shuffles[i] = lp_build_const_int32(gallivm, 4*i);
+ }
+
+ shufflev = LLVMConstVector(shuffles, type.length);
+ countd = LLVMBuildShuffleVector(builder, countv, LLVMGetUndef(i8vntype), shufflev, "");
+ countd = LLVMBuildBitCast(builder, countd, counttype, "countd");
+
+ /*
+ * XXX FIXME
+ * this is bad on cpus without popcount (on x86 supported by intel
+ * nehalem, amd barcelona, and up - not tied to sse42).
+ * Would be much faster to just sum the 4 elements of the vector with
+ * some horizontal add (shuffle/add/shuffle/add after the initial and).
+ */
+ switch (type.length) {
+ case 4:
+ popcntintr = "llvm.ctpop.i32";
+ break;
+ case 8:
+ popcntintr = "llvm.ctpop.i64";
+ break;
+ case 16:
+ popcntintr = "llvm.ctpop.i128";
+ break;
+ default:
+ assert(0);
+ }
+ count = lp_build_intrinsic_unary(builder, popcntintr, counttype, countd);
+
+ if (type.length > 4) {
+ count = LLVMBuildTrunc(builder, count, LLVMIntTypeInContext(context, 32), "");
+ }
+ }
+ newcount = LLVMBuildLoad(builder, counter, "origcount");
+ newcount = LLVMBuildAdd(builder, newcount, count, "newcount");
+ LLVMBuildStore(builder, newcount, counter);
}
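A scalar sketch of what both paths above compute: collect one bit per live lane and popcount it (illustrative only; the function name is hypothetical and __builtin_popcount is the GCC/Clang builtin standing in for llvm.ctpop):

#include <stdint.h>

static unsigned
count_live_pixels_ref(const uint32_t *mask, unsigned length)
{
   unsigned bits = 0, i;
   for (i = 0; i < length; i++)
      bits |= (mask[i] >> 31) << i;    /* movmskps: one sign bit per lane */
   return __builtin_popcount(bits);    /* llvm.ctpop */
}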
/**
* Generate code for performing depth and/or stencil tests.
- * We operate on a vector of values (typically a 2x2 quad).
+ * We operate on a vector of values (typically n 2x2 quads).
*
* \param depth the depth test state
* \param stencil the front/back stencil state
@@ -454,9 +521,9 @@ lp_build_occlusion_count(struct gallivm_state *gallivm,
* \param format_desc description of the depth/stencil surface
* \param mask the alive/dead pixel mask for the quad (vector)
* \param stencil_refs the front/back stencil ref values (scalar)
- * \param z_src the incoming depth/stencil values (a 2x2 quad, float32)
+ * \param z_src the incoming depth/stencil values (n 2x2 quad values, float32)
* \param zs_dst_ptr pointer to depth/stencil values in framebuffer
- * \param facing contains boolean value indicating front/back facing polygon
+ * \param face contains boolean value indicating front/back facing polygon
*/
void
lp_build_depth_stencil_test(struct gallivm_state *gallivm,
@@ -507,6 +574,12 @@ lp_build_depth_stencil_test(struct gallivm_state *gallivm,
assert(z_type.width == z_src_type.width);
assert(z_type.length == z_src_type.length);
+ /* FIXME: for non-float depth/stencil might generate better code
+ * if we'd always split it up to use 128bit operations.
+ * For stencil we'd almost certainly want to pack to 8xi16 values,
+ * for z just run twice.
+ */
+
/* Sanity checking */
{
const unsigned z_swizzle = format_desc->swizzle[0];
@@ -548,7 +621,7 @@ lp_build_depth_stencil_test(struct gallivm_state *gallivm,
lp_build_context_init(&z_bld, gallivm, z_type);
/* Setup build context for stencil vals */
- s_type = lp_type_int_vec(z_type.width);
+ s_type = lp_int_type(z_type);
lp_build_context_init(&s_bld, gallivm, s_type);
/* Load current z/stencil value from z/stencil buffer */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.c b/src/gallium/drivers/llvmpipe/lp_bld_interp.c
index 0d51ccb0349..d108f35f719 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_interp.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.c
@@ -61,6 +61,9 @@
* # | # | #
* #################
*
+ * If we iterate over multiple quads at once, quads 01 and 23 are processed
+ * together.
+ *
* Within each quad, we have four pixels which are represented in SOA
* order:
*
@@ -72,6 +75,10 @@
*
* So the green channel (for example) of the four pixels is stored in
* a single vector register: {g0, g1, g2, g3}.
+ * The order stays the same even with multiple quads:
+ * 0 1 4 5
+ * 2 3 6 7
+ * is stored as g0..g7
*/
@@ -102,8 +109,8 @@
#define PERSPECTIVE_DIVIDE_PER_QUAD 0
-static const unsigned char quad_offset_x[4] = {0, 1, 0, 1};
-static const unsigned char quad_offset_y[4] = {0, 0, 1, 1};
+static const unsigned char quad_offset_x[16] = {0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3};
+static const unsigned char quad_offset_y[16] = {0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3};
static void
@@ -115,132 +122,353 @@ attrib_name(LLVMValueRef val, unsigned attrib, unsigned chan, const char *suffix
lp_build_name(val, "input%u.%c%s", attrib - 1, "xyzw"[chan], suffix);
}
-
-/**
- * Initialize the bld->a0, dadx, dady fields. This involves fetching
- * those values from the arrays which are passed into the JIT function.
+/* Much easier, and significantly fewer instructions in the per-stamp
+ * part (less than half), but more instructions overall, so a loss if
+ * most quads are active. Might be a win though with larger vectors.
+ * No ability to do per-quad divide (doable but not implemented).
+ * Could be made to work with passed in pixel offsets (i.e. active quad merging).
*/
static void
-coeffs_init(struct lp_build_interp_soa_context *bld,
- LLVMValueRef a0_ptr,
- LLVMValueRef dadx_ptr,
- LLVMValueRef dady_ptr)
+coeffs_init_simple(struct lp_build_interp_soa_context *bld,
+ LLVMValueRef a0_ptr,
+ LLVMValueRef dadx_ptr,
+ LLVMValueRef dady_ptr)
{
struct lp_build_context *coeff_bld = &bld->coeff_bld;
+ struct lp_build_context *setup_bld = &bld->setup_bld;
struct gallivm_state *gallivm = coeff_bld->gallivm;
LLVMBuilderRef builder = gallivm->builder;
- LLVMValueRef zero = LLVMConstNull(coeff_bld->elem_type);
- LLVMValueRef one = LLVMConstReal(coeff_bld->elem_type, 1.0);
- LLVMValueRef i0 = lp_build_const_int32(gallivm, 0);
- LLVMValueRef i1 = lp_build_const_int32(gallivm, 1);
- LLVMValueRef i2 = lp_build_const_int32(gallivm, 2);
- LLVMValueRef i3 = lp_build_const_int32(gallivm, 3);
unsigned attrib;
- unsigned chan;
-
- /* TODO: Use more vector operations */
for (attrib = 0; attrib < bld->num_attribs; ++attrib) {
+ /*
+ * always fetch all 4 values for performance/simplicity
+ * Note: we do that here because it seems to generate better
+ * code. It generates a lot of moves initially but fewer
+ * moves later. As far as I can tell this looks like a
+ * llvm issue: instead of simply reloading the values from
+ * the passed in pointers when it runs out of registers,
+ * it spills/reloads them. Maybe some optimization passes
+ * would help.
+ * Might want to investigate this again later.
+ */
+ const unsigned interp = bld->interp[attrib];
+ LLVMValueRef index = lp_build_const_int32(gallivm,
+ attrib * TGSI_NUM_CHANNELS);
+ LLVMValueRef ptr;
+ LLVMValueRef dadxaos = setup_bld->zero;
+ LLVMValueRef dadyaos = setup_bld->zero;
+ LLVMValueRef a0aos = setup_bld->zero;
+
+ switch (interp) {
+ case LP_INTERP_PERSPECTIVE:
+ /* fall-through */
+
+ case LP_INTERP_LINEAR:
+ ptr = LLVMBuildGEP(builder, dadx_ptr, &index, 1, "");
+ ptr = LLVMBuildBitCast(builder, ptr,
+ LLVMPointerType(setup_bld->vec_type, 0), "");
+ dadxaos = LLVMBuildLoad(builder, ptr, "");
+
+ ptr = LLVMBuildGEP(builder, dady_ptr, &index, 1, "");
+ ptr = LLVMBuildBitCast(builder, ptr,
+ LLVMPointerType(setup_bld->vec_type, 0), "");
+ dadyaos = LLVMBuildLoad(builder, ptr, "");
+
+ attrib_name(dadxaos, attrib, 0, ".dadxaos");
+ attrib_name(dadyaos, attrib, 0, ".dadyaos");
+ /* fall-through */
+
+ case LP_INTERP_CONSTANT:
+ case LP_INTERP_FACING:
+ ptr = LLVMBuildGEP(builder, a0_ptr, &index, 1, "");
+ ptr = LLVMBuildBitCast(builder, ptr,
+ LLVMPointerType(setup_bld->vec_type, 0), "");
+ a0aos = LLVMBuildLoad(builder, ptr, "");
+ attrib_name(a0aos, attrib, 0, ".a0aos");
+ break;
+
+ case LP_INTERP_POSITION:
+ /* Nothing to do as the position coeffs are already setup in slot 0 */
+ continue;
+
+ default:
+ assert(0);
+ break;
+ }
+ bld->a0aos[attrib] = a0aos;
+ bld->dadxaos[attrib] = dadxaos;
+ bld->dadyaos[attrib] = dadyaos;
+ }
+}
+
+/**
+ * Interpolate the shader input attribute values.
+ * This is called for each (group of) quad(s).
+ */
+static void
+attribs_update_simple(struct lp_build_interp_soa_context *bld,
+ struct gallivm_state *gallivm,
+ int quad_start_index,
+ int start,
+ int end)
+{
+ LLVMBuilderRef builder = gallivm->builder;
+ struct lp_build_context *coeff_bld = &bld->coeff_bld;
+ struct lp_build_context *setup_bld = &bld->setup_bld;
+ LLVMValueRef oow = NULL;
+ unsigned attrib, i;
+ LLVMValueRef pixoffx;
+ LLVMValueRef pixoffy;
+ unsigned num_pix = coeff_bld->type.length;
+
+ /* could do this with code-generated passed in pixel offsets */
+ pixoffx = coeff_bld->undef;
+ pixoffy = coeff_bld->undef;
+ for (i = 0; i < coeff_bld->type.length; i++) {
+ LLVMValueRef nr = lp_build_const_int32(gallivm, i);
+ LLVMValueRef pixxf = lp_build_const_float(gallivm, quad_offset_x[i % num_pix] +
+ (quad_start_index & 1) * 2);
+ LLVMValueRef pixyf = lp_build_const_float(gallivm, quad_offset_y[i % num_pix] +
+ (quad_start_index & 2));
+ pixoffx = LLVMBuildInsertElement(builder, pixoffx, pixxf, nr, "");
+ pixoffy = LLVMBuildInsertElement(builder, pixoffy, pixyf, nr, "");
+ }
+
+ pixoffx = LLVMBuildFAdd(builder, pixoffx,
+ lp_build_broadcast_scalar(coeff_bld, bld->x), "");
+ pixoffy = LLVMBuildFAdd(builder, pixoffy,
+ lp_build_broadcast_scalar(coeff_bld, bld->y), "");
+
+ for (attrib = start; attrib < end; attrib++) {
const unsigned mask = bld->mask[attrib];
const unsigned interp = bld->interp[attrib];
- for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
+ unsigned chan;
+
+ for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
if (mask & (1 << chan)) {
- LLVMValueRef index = lp_build_const_int32(gallivm,
- attrib * TGSI_NUM_CHANNELS + chan);
- LLVMValueRef a0 = zero;
- LLVMValueRef dadx = zero;
- LLVMValueRef dady = zero;
- LLVMValueRef dadxy = zero;
- LLVMValueRef dadq;
- LLVMValueRef dadq2;
- LLVMValueRef a;
+ LLVMValueRef index;
+ LLVMValueRef dadx = coeff_bld->zero;
+ LLVMValueRef dady = coeff_bld->zero;
+ LLVMValueRef a = coeff_bld->zero;
+ index = lp_build_const_int32(gallivm, chan);
switch (interp) {
case LP_INTERP_PERSPECTIVE:
/* fall-through */
case LP_INTERP_LINEAR:
if (attrib == 0 && chan == 0) {
- dadxy = dadx = one;
+ dadx = coeff_bld->one;
}
else if (attrib == 0 && chan == 1) {
- dadxy = dady = one;
+ dady = coeff_bld->one;
}
else {
- dadx = LLVMBuildLoad(builder, LLVMBuildGEP(builder, dadx_ptr, &index, 1, ""), "");
- dady = LLVMBuildLoad(builder, LLVMBuildGEP(builder, dady_ptr, &index, 1, ""), "");
- dadxy = LLVMBuildFAdd(builder, dadx, dady, "");
- attrib_name(dadx, attrib, chan, ".dadx");
- attrib_name(dady, attrib, chan, ".dady");
- attrib_name(dadxy, attrib, chan, ".dadxy");
+ dadx = lp_build_extract_broadcast(gallivm, setup_bld->type,
+ coeff_bld->type, bld->dadxaos[attrib],
+ index);
+ dady = lp_build_extract_broadcast(gallivm, setup_bld->type,
+ coeff_bld->type, bld->dadyaos[attrib],
+ index);
+ a = lp_build_extract_broadcast(gallivm, setup_bld->type,
+ coeff_bld->type, bld->a0aos[attrib],
+ index);
}
- /* fall-through */
+ /*
+ * a = a0 + (x * dadx + y * dady)
+ */
+ dadx = LLVMBuildFMul(builder, dadx, pixoffx, "");
+ dady = LLVMBuildFMul(builder, dady, pixoffy, "");
+ a = LLVMBuildFAdd(builder, a, dadx, "");
+ a = LLVMBuildFAdd(builder, a, dady, "");
+
+ if (interp == LP_INTERP_PERSPECTIVE) {
+ if (oow == NULL) {
+ LLVMValueRef w = bld->attribs[0][3];
+ assert(attrib != 0);
+ assert(bld->mask[0] & TGSI_WRITEMASK_W);
+ oow = lp_build_rcp(coeff_bld, w);
+ }
+ a = lp_build_mul(coeff_bld, a, oow);
+ }
+ break;
case LP_INTERP_CONSTANT:
case LP_INTERP_FACING:
- a0 = LLVMBuildLoad(builder, LLVMBuildGEP(builder, a0_ptr, &index, 1, ""), "");
- attrib_name(a0, attrib, chan, ".a0");
+ a = lp_build_extract_broadcast(gallivm, setup_bld->type,
+ coeff_bld->type, bld->a0aos[attrib],
+ index);
break;
case LP_INTERP_POSITION:
- /* Nothing to do as the position coeffs are already setup in slot 0 */
- continue;
+ assert(attrib > 0);
+ a = bld->attribs[0][chan];
+ break;
default:
assert(0);
break;
}
- /*
- * dadq = {0, dadx, dady, dadx + dady}
- */
+ if ((attrib == 0) && (chan == 2)){
+ /* FIXME: Depth values can exceed 1.0, due to the fact that
+ * setup interpolation coefficients refer to (0,0) which causes
+ * precision loss. So we must clamp to 1.0 here to avoid artifacts
+ */
+ a = lp_build_min(coeff_bld, a, coeff_bld->one);
+ }
+ bld->attribs[attrib][chan] = a;
+ }
+ }
+ }
+}
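The per-pixel arithmetic done in the linear/perspective case above, as a plain-C sketch (illustrative; the perspective divide by w is applied afterwards, as in the code):

static float
eval_attrib_ref(float a0, float dadx, float dady,
                float pixoffx, float pixoffy)
{
   /* a = a0 + x * dadx + y * dady, evaluated per pixel */
   return a0 + dadx * pixoffx + dady * pixoffy;
}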
- dadq = coeff_bld->undef;
- dadq = LLVMBuildInsertElement(builder, dadq, zero, i0, "");
- dadq = LLVMBuildInsertElement(builder, dadq, dadx, i1, "");
- dadq = LLVMBuildInsertElement(builder, dadq, dady, i2, "");
- dadq = LLVMBuildInsertElement(builder, dadq, dadxy, i3, "");
+/**
+ * Initialize the bld->a, dadq fields. This involves fetching
+ * those values from the arrays which are passed into the JIT function.
+ */
+static void
+coeffs_init(struct lp_build_interp_soa_context *bld,
+ LLVMValueRef a0_ptr,
+ LLVMValueRef dadx_ptr,
+ LLVMValueRef dady_ptr)
+{
+ struct lp_build_context *coeff_bld = &bld->coeff_bld;
+ struct lp_build_context *setup_bld = &bld->setup_bld;
+ struct gallivm_state *gallivm = coeff_bld->gallivm;
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef pixoffx, pixoffy;
+ unsigned attrib;
+ unsigned chan;
+ unsigned i;
+
+ pixoffx = coeff_bld->undef;
+ pixoffy = coeff_bld->undef;
+ for (i = 0; i < coeff_bld->type.length; i++) {
+ LLVMValueRef nr = lp_build_const_int32(gallivm, i);
+ LLVMValueRef pixxf = lp_build_const_float(gallivm, quad_offset_x[i]);
+ LLVMValueRef pixyf = lp_build_const_float(gallivm, quad_offset_y[i]);
+ pixoffx = LLVMBuildInsertElement(builder, pixoffx, pixxf, nr, "");
+ pixoffy = LLVMBuildInsertElement(builder, pixoffy, pixyf, nr, "");
+ }
- /*
- * dadq2 = 2 * dq
- */
- dadq2 = LLVMBuildFAdd(builder, dadq, dadq, "");
+ for (attrib = 0; attrib < bld->num_attribs; ++attrib) {
+ const unsigned mask = bld->mask[attrib];
+ const unsigned interp = bld->interp[attrib];
+ LLVMValueRef index = lp_build_const_int32(gallivm,
+ attrib * TGSI_NUM_CHANNELS);
+ LLVMValueRef ptr;
+ LLVMValueRef dadxaos = setup_bld->zero;
+ LLVMValueRef dadyaos = setup_bld->zero;
+ LLVMValueRef a0aos = setup_bld->zero;
+
+ /* always fetch all 4 values for performance/simplicity */
+ switch (interp) {
+ case LP_INTERP_PERSPECTIVE:
+ /* fall-through */
+
+ case LP_INTERP_LINEAR:
+ ptr = LLVMBuildGEP(builder, dadx_ptr, &index, 1, "");
+ ptr = LLVMBuildBitCast(builder, ptr,
+ LLVMPointerType(setup_bld->vec_type, 0), "");
+ dadxaos = LLVMBuildLoad(builder, ptr, "");
+
+ ptr = LLVMBuildGEP(builder, dady_ptr, &index, 1, "");
+ ptr = LLVMBuildBitCast(builder, ptr,
+ LLVMPointerType(setup_bld->vec_type, 0), "");
+ dadyaos = LLVMBuildLoad(builder, ptr, "");
+
+ attrib_name(dadxaos, attrib, 0, ".dadxaos");
+ attrib_name(dadyaos, attrib, 0, ".dadyaos");
+ /* fall-through */
+
+ case LP_INTERP_CONSTANT:
+ case LP_INTERP_FACING:
+ ptr = LLVMBuildGEP(builder, a0_ptr, &index, 1, "");
+ ptr = LLVMBuildBitCast(builder, ptr,
+ LLVMPointerType(setup_bld->vec_type, 0), "");
+ a0aos = LLVMBuildLoad(builder, ptr, "");
+ attrib_name(a0aos, attrib, 0, ".a0aos");
+ break;
+
+ case LP_INTERP_POSITION:
+ /* Nothing to do as the position coeffs are already setup in slot 0 */
+ continue;
+
+ default:
+ assert(0);
+ break;
+ }
- /*
- * a = a0 + (x * dadx + y * dady)
- */
+ /*
+ * a = a0 + (x * dadx + y * dady)
+ * a0aos is the attrib value at top left corner of stamp
+ */
+ if (interp != LP_INTERP_CONSTANT &&
+ interp != LP_INTERP_FACING) {
+ LLVMValueRef axaos, ayaos;
+ axaos = LLVMBuildFMul(builder, lp_build_broadcast_scalar(setup_bld, bld->x),
+ dadxaos, "");
+ ayaos = LLVMBuildFMul(builder, lp_build_broadcast_scalar(setup_bld, bld->y),
+ dadyaos, "");
+ a0aos = LLVMBuildFAdd(builder, a0aos, ayaos, "");
+ a0aos = LLVMBuildFAdd(builder, a0aos, axaos, "");
+ }
+
+ /*
+ * dadq = {0, dadx, dady, dadx + dady}
+ * for two quads (side by side) this is:
+ * {0, dadx, dady, dadx+dady, 2*dadx, 3*dadx, 2*dadx+dady, 3*dadx+dady}
+ */
+ for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
+ /* this generates a CRAPLOAD of shuffles... */
+ if (mask & (1 << chan)) {
+ LLVMValueRef dadx, dady;
+ LLVMValueRef dadq, dadq2;
+ LLVMValueRef a;
+ LLVMValueRef chan_index = lp_build_const_int32(gallivm, chan);
if (attrib == 0 && chan == 0) {
- a = bld->x;
+ a = lp_build_broadcast_scalar(coeff_bld, bld->x);
+ dadx = coeff_bld->one;
+ dady = coeff_bld->zero;
}
else if (attrib == 0 && chan == 1) {
- a = bld->y;
+ a = lp_build_broadcast_scalar(coeff_bld, bld->y);
+ dady = coeff_bld->one;
+ dadx = coeff_bld->zero;
}
else {
- a = a0;
- if (interp != LP_INTERP_CONSTANT &&
- interp != LP_INTERP_FACING) {
- LLVMValueRef ax, ay, axy;
- ax = LLVMBuildFMul(builder, bld->x, dadx, "");
- ay = LLVMBuildFMul(builder, bld->y, dady, "");
- axy = LLVMBuildFAdd(builder, ax, ay, "");
- a = LLVMBuildFAdd(builder, a, axy, "");
- }
- }
+ dadx = lp_build_extract_broadcast(gallivm, setup_bld->type,
+ coeff_bld->type, dadxaos, chan_index);
+ dady = lp_build_extract_broadcast(gallivm, setup_bld->type,
+ coeff_bld->type, dadyaos, chan_index);
- /*
- * a = {a, a, a, a}
- */
+ /*
+ * a = {a, a, a, a}
+ */
+ a = lp_build_extract_broadcast(gallivm, setup_bld->type,
+ coeff_bld->type, a0aos, chan_index);
+ }
- a = lp_build_broadcast(gallivm, coeff_bld->vec_type, a);
+ dadx = LLVMBuildFMul(builder, dadx, pixoffx, "");
+ dady = LLVMBuildFMul(builder, dady, pixoffy, "");
+ dadq = LLVMBuildFAdd(builder, dadx, dady, "");
/*
- * Compute the attrib values on the upper-left corner of each quad.
+ * Compute the attrib values on the upper-left corner of each
+ * group of quads.
+ * Note that if we process 2 quads at once this doesn't
+ * really do exactly what we want, since we'd need to
+ * access elements 0 and 2 respectively later on.
*/
if (interp != LP_INTERP_CONSTANT &&
interp != LP_INTERP_FACING) {
+ dadq2 = LLVMBuildFAdd(builder, dadq, dadq, "");
a = LLVMBuildFAdd(builder, a, dadq2, "");
}
@@ -249,6 +477,12 @@ coeffs_init(struct lp_build_interp_soa_context *bld,
* a *= 1 / w
*/
+ /*
+ * XXX since we're only going to access elements 0,2 out of 8
+ * if we have 8-wide vectors we should do the division only 4-wide.
+ * a really only holds 2 elements of a 4-wide vector disguised as
+ * 8-wide in this case.
+ */
if (interp == LP_INTERP_PERSPECTIVE) {
LLVMValueRef w = bld->a[0][3];
assert(attrib != 0);
@@ -279,18 +513,18 @@ coeffs_init(struct lp_build_interp_soa_context *bld,
static void
attribs_update(struct lp_build_interp_soa_context *bld,
struct gallivm_state *gallivm,
- int quad_index,
+ int quad_start_index,
int start,
int end)
{
LLVMBuilderRef builder = gallivm->builder;
struct lp_build_context *coeff_bld = &bld->coeff_bld;
- LLVMValueRef shuffle = lp_build_const_int_vec(gallivm, coeff_bld->type, quad_index);
+ LLVMValueRef shuffle = lp_build_const_int_vec(gallivm, coeff_bld->type, quad_start_index);
LLVMValueRef oow = NULL;
unsigned attrib;
unsigned chan;
- assert(quad_index < 4);
+ assert(quad_start_index < 4);
for(attrib = start; attrib < end; ++attrib) {
const unsigned mask = bld->mask[attrib];
@@ -412,6 +646,7 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
LLVMValueRef y0)
{
struct lp_type coeff_type;
+ struct lp_type setup_type;
unsigned attrib;
unsigned chan;
@@ -421,19 +656,26 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
coeff_type.floating = TRUE;
coeff_type.sign = TRUE;
coeff_type.width = 32;
- coeff_type.length = TGSI_QUAD_SIZE;
+ coeff_type.length = type.length;
+
+ memset(&setup_type, 0, sizeof setup_type);
+ setup_type.floating = TRUE;
+ setup_type.sign = TRUE;
+ setup_type.width = 32;
+ setup_type.length = TGSI_NUM_CHANNELS;
+
/* XXX: we don't support interpolating into any other types */
assert(memcmp(&coeff_type, &type, sizeof coeff_type) == 0);
lp_build_context_init(&bld->coeff_bld, gallivm, coeff_type);
+ lp_build_context_init(&bld->setup_bld, gallivm, setup_type);
/* For convenience */
bld->pos = bld->attribs[0];
bld->inputs = (const LLVMValueRef (*)[TGSI_NUM_CHANNELS]) bld->attribs[1];
/* Position */
- bld->num_attribs = 1;
bld->mask[0] = TGSI_WRITEMASK_XYZW;
bld->interp[0] = LP_INTERP_LINEAR;
@@ -453,7 +695,12 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
pos_init(bld, x0, y0);
- coeffs_init(bld, a0_ptr, dadx_ptr, dady_ptr);
+ if (coeff_type.length > 4) {
+ coeffs_init_simple(bld, a0_ptr, dadx_ptr, dady_ptr);
+ }
+ else {
+ coeffs_init(bld, a0_ptr, dadx_ptr, dady_ptr);
+ }
}
@@ -463,20 +710,30 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
void
lp_build_interp_soa_update_inputs(struct lp_build_interp_soa_context *bld,
struct gallivm_state *gallivm,
- int quad_index)
+ int quad_start_index)
{
- assert(quad_index < 4);
+ assert(quad_start_index < 4);
- attribs_update(bld, gallivm, quad_index, 1, bld->num_attribs);
+ if (bld->coeff_bld.type.length > 4) {
+ attribs_update_simple(bld, gallivm, quad_start_index, 1, bld->num_attribs);
+ }
+ else {
+ attribs_update(bld, gallivm, quad_start_index, 1, bld->num_attribs);
+ }
}
void
lp_build_interp_soa_update_pos(struct lp_build_interp_soa_context *bld,
struct gallivm_state *gallivm,
- int quad_index)
+ int quad_start_index)
{
- assert(quad_index < 4);
+ assert(quad_start_index < 4);
- attribs_update(bld, gallivm, quad_index, 0, 1);
+ if (bld->coeff_bld.type.length > 4) {
+ attribs_update_simple(bld, gallivm, quad_start_index, 0, 1);
+ }
+ else {
+ attribs_update(bld, gallivm, quad_start_index, 0, 1);
+ }
}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.h b/src/gallium/drivers/llvmpipe/lp_bld_interp.h
index 6970a9b8c2c..f293b582318 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_interp.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.h
@@ -79,6 +79,7 @@ struct lp_build_interp_soa_context
{
/* TGSI_QUAD_SIZE x float */
struct lp_build_context coeff_bld;
+ struct lp_build_context setup_bld;
unsigned num_attribs;
unsigned mask[1 + PIPE_MAX_SHADER_INPUTS]; /**< TGSI_WRITE_MASK_x */
@@ -87,8 +88,11 @@ struct lp_build_interp_soa_context
LLVMValueRef x;
LLVMValueRef y;
- LLVMValueRef a [1 + PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS];
+ LLVMValueRef a[1 + PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS];
LLVMValueRef dadq[1 + PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS];
+ LLVMValueRef a0aos[1 + PIPE_MAX_SHADER_INPUTS];
+ LLVMValueRef dadxaos[1 + PIPE_MAX_SHADER_INPUTS];
+ LLVMValueRef dadyaos[1 + PIPE_MAX_SHADER_INPUTS];
LLVMValueRef oow;
@@ -118,12 +122,12 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
void
lp_build_interp_soa_update_inputs(struct lp_build_interp_soa_context *bld,
struct gallivm_state *gallivm,
- int quad_index);
+ int quad_start_index);
void
lp_build_interp_soa_update_pos(struct lp_build_interp_soa_context *bld,
struct gallivm_state *gallivm,
- int quad_index);
+ int quad_start_index);
#endif /* LP_BLD_INTERP_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_context.c b/src/gallium/drivers/llvmpipe/lp_context.c
index 9e4c7d6734e..07cea9158c3 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.c
+++ b/src/gallium/drivers/llvmpipe/lp_context.c
@@ -51,42 +51,6 @@
unsigned llvmpipe_variant_count;
-/**
- * This function is called by the gallivm "garbage collector" when
- * the LLVM global data structures are freed. We must free all LLVM-related
- * data. Specifically, all JIT'd shader variants.
- */
-static void
-garbage_collect_callback(void *cb_data)
-{
- struct llvmpipe_context *lp = (struct llvmpipe_context *) cb_data;
- struct lp_fs_variant_list_item *li;
-
- /* Free all the context's shader variants */
- li = first_elem(&lp->fs_variants_list);
- while (!at_end(&lp->fs_variants_list, li)) {
- struct lp_fs_variant_list_item *next = next_elem(li);
- llvmpipe_remove_shader_variant(lp, li->base);
- li = next;
- }
-
- /* Free all the context's primitive setup variants */
- lp_delete_setup_variants(lp);
-
- /* release references to setup variants, shaders */
- lp_setup_set_setup_variant(lp->setup, NULL);
- lp_setup_set_fs_variant(lp->setup, NULL);
- lp_setup_reset(lp->setup);
-
- /* This type will be recreated upon demand */
- lp->jit_context_ptr_type = NULL;
-
- /* mark all state as dirty to ensure new shaders are jit'd, etc. */
- lp->dirty = ~0;
-}
-
-
-
static void llvmpipe_destroy( struct pipe_context *pipe )
{
struct llvmpipe_context *llvmpipe = llvmpipe_context( pipe );
@@ -94,9 +58,6 @@ static void llvmpipe_destroy( struct pipe_context *pipe )
lp_print_counters();
- gallivm_remove_garbage_collector_callback(garbage_collect_callback,
- llvmpipe);
-
/* This will also destroy llvmpipe->setup:
*/
if (llvmpipe->draw)
@@ -128,8 +89,6 @@ static void llvmpipe_destroy( struct pipe_context *pipe )
lp_delete_setup_variants(llvmpipe);
- gallivm_destroy(llvmpipe->gallivm);
-
align_free( llvmpipe );
}
@@ -195,12 +154,10 @@ llvmpipe_create_context( struct pipe_screen *screen, void *priv )
llvmpipe_init_context_resource_funcs( &llvmpipe->pipe );
llvmpipe_init_surface_functions(llvmpipe);
- llvmpipe->gallivm = gallivm_create();
-
/*
* Create drawing context and plug our rendering stage into it.
*/
- llvmpipe->draw = draw_create_gallivm(&llvmpipe->pipe, llvmpipe->gallivm);
+ llvmpipe->draw = draw_create(&llvmpipe->pipe);
if (!llvmpipe->draw)
goto fail;
@@ -226,9 +183,6 @@ llvmpipe_create_context( struct pipe_screen *screen, void *priv )
lp_reset_counters();
- gallivm_register_garbage_collector_callback(garbage_collect_callback,
- llvmpipe);
-
return &llvmpipe->pipe;
fail:
diff --git a/src/gallium/drivers/llvmpipe/lp_context.h b/src/gallium/drivers/llvmpipe/lp_context.h
index d4750705b43..d0220e188cf 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.h
+++ b/src/gallium/drivers/llvmpipe/lp_context.h
@@ -131,10 +131,6 @@ struct llvmpipe_context {
unsigned nr_fs_variants;
unsigned nr_fs_instrs;
- /** JIT code generation */
- struct gallivm_state *gallivm;
- LLVMTypeRef jit_context_ptr_type;
-
struct lp_setup_variant_list_item setup_variants_list;
unsigned nr_setup_variants;
diff --git a/src/gallium/drivers/llvmpipe/lp_flush.c b/src/gallium/drivers/llvmpipe/lp_flush.c
index 42430550ea6..964b792b739 100644
--- a/src/gallium/drivers/llvmpipe/lp_flush.c
+++ b/src/gallium/drivers/llvmpipe/lp_flush.c
@@ -54,13 +54,6 @@ llvmpipe_flush( struct pipe_context *pipe,
/* ask the setup module to flush */
lp_setup_flush(llvmpipe->setup, fence, reason);
-
- if (llvmpipe_variant_count > 1000) {
- /* time to do a garbage collection */
- gallivm_garbage_collect(llvmpipe->gallivm);
- llvmpipe_variant_count = 0;
- }
-
/* Enable to dump BMPs of the color/depth buffers each frame */
if (0) {
static unsigned frame_no = 1;
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.c b/src/gallium/drivers/llvmpipe/lp_jit.c
index eb1db84e4b8..7a85eab41a0 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.c
+++ b/src/gallium/drivers/llvmpipe/lp_jit.c
@@ -41,7 +41,7 @@
static void
-lp_jit_create_types(struct llvmpipe_context *lp)
+lp_jit_create_types(struct lp_fragment_shader_variant *lp)
{
struct gallivm_state *gallivm = lp->gallivm;
LLVMContextRef lc = gallivm->context;
@@ -183,11 +183,9 @@ lp_jit_screen_init(struct llvmpipe_screen *screen)
}
-LLVMTypeRef
-lp_jit_get_context_type(struct llvmpipe_context *lp)
+void
+lp_jit_init_types(struct lp_fragment_shader_variant *lp)
{
if (!lp->jit_context_ptr_type)
lp_jit_create_types(lp);
-
- return lp->jit_context_ptr_type;
}
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.h b/src/gallium/drivers/llvmpipe/lp_jit.h
index 04e8dd5267b..584d2c8fd81 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.h
+++ b/src/gallium/drivers/llvmpipe/lp_jit.h
@@ -42,6 +42,7 @@
#include "lp_texture.h"
+struct lp_fragment_shader_variant;
struct llvmpipe_screen;
@@ -164,8 +165,8 @@ void
lp_jit_screen_init(struct llvmpipe_screen *screen);
-LLVMTypeRef
-lp_jit_get_context_type(struct llvmpipe_context *lp);
+void
+lp_jit_init_types(struct lp_fragment_shader_variant *lp);
#endif /* LP_JIT_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_memory.c b/src/gallium/drivers/llvmpipe/lp_memory.c
index 0f55d4a80ae..85f73e54ac4 100644
--- a/src/gallium/drivers/llvmpipe/lp_memory.c
+++ b/src/gallium/drivers/llvmpipe/lp_memory.c
@@ -36,10 +36,12 @@
* number of threads or using a smaller tilesize when multiple
* colorbuffers are bound.
*/
-PIPE_ALIGN_VAR(16) uint8_t lp_swizzled_cbuf[LP_MAX_THREADS][PIPE_MAX_COLOR_BUFS][TILE_SIZE * TILE_SIZE * 4];
+PIPE_ALIGN_VAR(LP_MIN_VECTOR_ALIGN)
+uint8_t lp_swizzled_cbuf[LP_MAX_THREADS][PIPE_MAX_COLOR_BUFS][TILE_SIZE * TILE_SIZE * 4];
/* A single dummy tile used in a couple of out-of-memory situations.
*/
-PIPE_ALIGN_VAR(16) uint8_t lp_dummy_tile[TILE_SIZE * TILE_SIZE * 4];
+PIPE_ALIGN_VAR(LP_MIN_VECTOR_ALIGN)
+uint8_t lp_dummy_tile[TILE_SIZE * TILE_SIZE * 4];
diff --git a/src/gallium/drivers/llvmpipe/lp_memory.h b/src/gallium/drivers/llvmpipe/lp_memory.h
index f7418f5e087..5552c2908e1 100644
--- a/src/gallium/drivers/llvmpipe/lp_memory.h
+++ b/src/gallium/drivers/llvmpipe/lp_memory.h
@@ -32,9 +32,12 @@
#include "pipe/p_compiler.h"
#include "pipe/p_state.h"
#include "lp_limits.h"
+#include "gallivm/lp_bld_type.h"
-extern PIPE_ALIGN_VAR(16) uint8_t lp_swizzled_cbuf[LP_MAX_THREADS][PIPE_MAX_COLOR_BUFS][TILE_SIZE * TILE_SIZE * 4];
+extern PIPE_ALIGN_VAR(LP_MIN_VECTOR_ALIGN)
+uint8_t lp_swizzled_cbuf[LP_MAX_THREADS][PIPE_MAX_COLOR_BUFS][TILE_SIZE * TILE_SIZE * 4];
-extern PIPE_ALIGN_VAR(16) uint8_t lp_dummy_tile[TILE_SIZE * TILE_SIZE * 4];
+extern PIPE_ALIGN_VAR(LP_MIN_VECTOR_ALIGN)
+uint8_t lp_dummy_tile[TILE_SIZE * TILE_SIZE * 4];
#endif /* LP_MEMORY_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c b/src/gallium/drivers/llvmpipe/lp_rast.c
index 09af0274d7a..d743d7689ae 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast.c
@@ -42,6 +42,7 @@
#include "lp_tile_soa.h"
#include "gallivm/lp_bld_debug.h"
#include "lp_scene.h"
+#include "lp_tex_sample.h"
#ifdef DEBUG
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index 03d15f6e2b0..54f45357fdc 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -97,56 +97,56 @@
#include "lp_state_fs.h"
-#include <llvm-c/Analysis.h>
-#include <llvm-c/BitWriter.h>
-
-
/** Fragment shader number (for debugging) */
static unsigned fs_no = 0;
/**
- * Expand the relevent bits of mask_input to a 4-dword mask for the
- * four pixels in a 2x2 quad. This will set the four elements of the
+ * Expand the relevant bits of mask_input to an n*4-dword mask for the
+ * n*4 pixels in n 2x2 quads. This will set the n*4 elements of the
* quad mask vector to 0 or ~0.
+ * Grouping is 01, 23 in 2-quad mode, hence only 0 and 2 are valid
+ * first_quad arguments with fs length 8.
*
- * \param quad which quad of the quad group to test, in [0,3]
+ * \param first_quad which quad(s) of the quad group to test, in [0,3]
* \param mask_input bitwise mask for the whole 4x4 stamp
*/
static LLVMValueRef
generate_quad_mask(struct gallivm_state *gallivm,
struct lp_type fs_type,
- unsigned quad,
+ unsigned first_quad,
LLVMValueRef mask_input) /* int32 */
{
LLVMBuilderRef builder = gallivm->builder;
struct lp_type mask_type;
LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
- LLVMValueRef bits[4];
+ LLVMValueRef bits[16];
LLVMValueRef mask;
- int shift;
+ int shift, i;
/*
* XXX: We'll need a different path for 16 x u8
*/
assert(fs_type.width == 32);
- assert(fs_type.length == 4);
+ assert(fs_type.length <= Elements(bits));
mask_type = lp_int_type(fs_type);
/*
* mask_input >>= (quad * 4)
*/
- switch (quad) {
+ switch (first_quad) {
case 0:
shift = 0;
break;
case 1:
+ assert(fs_type.length == 4);
shift = 2;
break;
case 2:
shift = 8;
break;
case 3:
+ assert(fs_type.length == 4);
shift = 10;
break;
default:
@@ -166,12 +166,14 @@ generate_quad_mask(struct gallivm_state *gallivm,
lp_build_vec_type(gallivm, mask_type),
mask_input);
- bits[0] = LLVMConstInt(i32t, 1 << 0, 0);
- bits[1] = LLVMConstInt(i32t, 1 << 1, 0);
- bits[2] = LLVMConstInt(i32t, 1 << 4, 0);
- bits[3] = LLVMConstInt(i32t, 1 << 5, 0);
-
- mask = LLVMBuildAnd(builder, mask, LLVMConstVector(bits, 4), "");
+ for (i = 0; i < fs_type.length / 4; i++) {
+ unsigned j = 2 * (i % 2) + (i / 2) * 8;
+ bits[4*i + 0] = LLVMConstInt(i32t, 1 << (j + 0), 0);
+ bits[4*i + 1] = LLVMConstInt(i32t, 1 << (j + 1), 0);
+ bits[4*i + 2] = LLVMConstInt(i32t, 1 << (j + 4), 0);
+ bits[4*i + 3] = LLVMConstInt(i32t, 1 << (j + 5), 0);
+ }
+ mask = LLVMBuildAnd(builder, mask, LLVMConstVector(bits, fs_type.length), "");
/*
* mask = mask != 0 ? ~0 : 0
@@ -300,7 +302,7 @@ generate_fs(struct gallivm_state *gallivm,
/* do triangle edge testing */
if (partial_mask) {
*pmask = generate_quad_mask(gallivm, type,
- i, mask_input);
+ i*type.length/4, mask_input);
}
else {
*pmask = lp_build_const_int_vec(gallivm, type, ~0);
@@ -312,7 +314,7 @@ generate_fs(struct gallivm_state *gallivm,
if (!(depth_mode & EARLY_DEPTH_TEST) && !simple_shader)
lp_build_mask_check(&mask);
- lp_build_interp_soa_update_pos(interp, gallivm, i);
+ lp_build_interp_soa_update_pos(interp, gallivm, i*type.length/4);
z = interp->pos[2];
if (depth_mode & EARLY_DEPTH_TEST) {
@@ -333,7 +335,7 @@ generate_fs(struct gallivm_state *gallivm,
}
}
- lp_build_interp_soa_update_inputs(interp, gallivm, i);
+ lp_build_interp_soa_update_inputs(interp, gallivm, i*type.length/4);
/* Build the actual shader */
lp_build_tgsi_soa(gallivm, tokens, type, &mask,
@@ -515,7 +517,7 @@ generate_fragment(struct llvmpipe_context *lp,
struct lp_fragment_shader_variant *variant,
unsigned partial_mask)
{
- struct gallivm_state *gallivm = lp->gallivm;
+ struct gallivm_state *gallivm = variant->gallivm;
const struct lp_fragment_shader_variant_key *key = &variant->key;
struct lp_shader_input inputs[PIPE_MAX_SHADER_INPUTS];
char func_name[256];
@@ -541,8 +543,8 @@ generate_fragment(struct llvmpipe_context *lp,
LLVMBuilderRef builder;
struct lp_build_sampler_soa *sampler;
struct lp_build_interp_soa_context interp;
- LLVMValueRef fs_mask[LP_MAX_VECTOR_LENGTH];
- LLVMValueRef fs_out_color[PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS][LP_MAX_VECTOR_LENGTH];
+ LLVMValueRef fs_mask[16 / 4];
+ LLVMValueRef fs_out_color[PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS][16 / 4];
LLVMValueRef blend_mask;
LLVMValueRef function;
LLVMValueRef facing;
@@ -553,6 +555,8 @@ generate_fragment(struct llvmpipe_context *lp,
unsigned cbuf;
boolean cbuf0_write_all;
+ assert(lp_native_vector_width / 32 >= 4);
+
/* Adjust color input interpolation according to flatshade state:
*/
memcpy(inputs, shader->inputs, shader->info.base.num_inputs * sizeof inputs[0]);
@@ -579,12 +583,12 @@ generate_fragment(struct llvmpipe_context *lp,
* characteristics. */
memset(&fs_type, 0, sizeof fs_type);
- fs_type.floating = TRUE; /* floating point values */
- fs_type.sign = TRUE; /* values are signed */
- fs_type.norm = FALSE; /* values are not limited to [0,1] or [-1,1] */
- fs_type.width = 32; /* 32-bit float */
- fs_type.length = 4; /* 4 elements per vector */
- num_fs = 4; /* number of quads per block */
+ fs_type.floating = TRUE; /* floating point values */
+ fs_type.sign = TRUE; /* values are signed */
+ fs_type.norm = FALSE; /* values are not limited to [0,1] or [-1,1] */
+ fs_type.width = 32; /* 32-bit float */
+ fs_type.length = MIN2(lp_native_vector_width / 32, 16); /* n*4 elements per vector */
+ num_fs = 16 / fs_type.length; /* number of loops per 4x4 stamp */
memset(&blend_type, 0, sizeof blend_type);
blend_type.floating = FALSE; /* values are integers */
@@ -605,7 +609,7 @@ generate_fragment(struct llvmpipe_context *lp,
util_snprintf(func_name, sizeof(func_name), "fs%u_variant%u_%s",
shader->no, variant->no, partial_mask ? "partial" : "whole");
- arg_types[0] = lp_jit_get_context_type(lp); /* context */
+ arg_types[0] = variant->jit_context_ptr_type; /* context */
arg_types[1] = int32_type; /* x */
arg_types[2] = int32_type; /* y */
arg_types[3] = int32_type; /* facing */
@@ -738,20 +742,20 @@ generate_fragment(struct llvmpipe_context *lp,
LLVMBuildLoad(builder, fs_out_color[cbuf][chan][i], "fs_color_vals");
}
- lp_build_conv(gallivm, fs_type, blend_type,
+ lp_build_conv(gallivm, fs_type, blend_type,
fs_color_vals,
num_fs,
- &blend_in_color[chan], 1);
+ &blend_in_color[chan], 1);
- lp_build_name(blend_in_color[chan], "color%d.%c", cbuf, "rgba"[chan]);
+ lp_build_name(blend_in_color[chan], "color%d.%c", cbuf, "rgba"[chan]);
}
if (partial_mask || !variant->opaque) {
- lp_build_conv_mask(lp->gallivm, fs_type, blend_type,
+ lp_build_conv_mask(variant->gallivm, fs_type, blend_type,
fs_mask, num_fs,
&blend_mask, 1);
} else {
- blend_mask = lp_build_const_int_vec(lp->gallivm, blend_type, ~0);
+ blend_mask = lp_build_const_int_vec(variant->gallivm, blend_type, ~0);
}
color_ptr = LLVMBuildLoad(builder,
@@ -772,7 +776,7 @@ generate_fragment(struct llvmpipe_context *lp,
!key->alpha.enabled &&
!shader->info.base.uses_kill);
- generate_blend(lp->gallivm,
+ generate_blend(variant->gallivm,
&key->blend,
rt,
builder,
@@ -787,43 +791,9 @@ generate_fragment(struct llvmpipe_context *lp,
LLVMBuildRetVoid(builder);
- /* Verify the LLVM IR. If invalid, dump and abort */
-#ifdef DEBUG
- if(LLVMVerifyFunction(function, LLVMPrintMessageAction)) {
- if (1)
- lp_debug_dump_value(function);
- abort();
- }
-#endif
-
- /* Apply optimizations to LLVM IR */
- LLVMRunFunctionPassManager(gallivm->passmgr, function);
-
- if ((gallivm_debug & GALLIVM_DEBUG_IR) || (LP_DEBUG & DEBUG_FS)) {
- /* Print the LLVM IR to stderr */
- lp_debug_dump_value(function);
- debug_printf("\n");
- }
-
- /* Dump byte code to a file */
- if (0) {
- LLVMWriteBitcodeToFile(gallivm->module, "llvmpipe.bc");
- }
+ gallivm_verify_function(gallivm, function);
variant->nr_instrs += lp_build_count_instructions(function);
- /*
- * Translate the LLVM IR into machine code.
- */
- {
- void *f = LLVMGetPointerToGlobal(gallivm->engine, function);
-
- variant->jit_function[partial_mask] = (lp_jit_frag_func)pointer_to_func(f);
-
- if ((gallivm_debug & GALLIVM_DEBUG_ASM) || (LP_DEBUG & DEBUG_FS)) {
- lp_disassemble(f);
- }
- lp_func_delete_body(function);
- }
}
@@ -937,6 +907,12 @@ generate_variant(struct llvmpipe_context *lp,
if(!variant)
return NULL;
+ variant->gallivm = gallivm_create();
+ if (!variant->gallivm) {
+ FREE(variant);
+ return NULL;
+ }
+
variant->shader = shader;
variant->list_item_global.base = variant;
variant->list_item_local.base = variant;
@@ -968,12 +944,35 @@ generate_variant(struct llvmpipe_context *lp,
lp_debug_fs_variant(variant);
}
- generate_fragment(lp, shader, variant, RAST_EDGE_TEST);
+ lp_jit_init_types(variant);
+
+ if (variant->jit_function[RAST_EDGE_TEST] == NULL)
+ generate_fragment(lp, shader, variant, RAST_EDGE_TEST);
+
+ if (variant->jit_function[RAST_WHOLE] == NULL) {
+ if (variant->opaque) {
+ /* Specialized shader, which doesn't need to read the color buffer. */
+ generate_fragment(lp, shader, variant, RAST_WHOLE);
+ }
+ }
+
+ /*
+ * Compile everything
+ */
+
+ gallivm_compile_module(variant->gallivm);
+
+ if (variant->function[RAST_EDGE_TEST]) {
+ variant->jit_function[RAST_EDGE_TEST] = (lp_jit_frag_func)
+ gallivm_jit_function(variant->gallivm,
+ variant->function[RAST_EDGE_TEST]);
+ }
- if (variant->opaque) {
- /* Specialized shader, which doesn't need to read the color buffer. */
- generate_fragment(lp, shader, variant, RAST_WHOLE);
- } else {
+ if (variant->function[RAST_WHOLE]) {
+ variant->jit_function[RAST_WHOLE] = (lp_jit_frag_func)
+ gallivm_jit_function(variant->gallivm,
+ variant->function[RAST_WHOLE]);
+ } else if (!variant->jit_function[RAST_WHOLE]) {
variant->jit_function[RAST_WHOLE] = variant->jit_function[RAST_EDGE_TEST];
}
@@ -1116,13 +1115,14 @@ llvmpipe_remove_shader_variant(struct llvmpipe_context *lp,
/* free all the variant's JIT'd functions */
for (i = 0; i < Elements(variant->function); i++) {
if (variant->function[i]) {
- if (variant->jit_function[i])
- LLVMFreeMachineCodeForFunction(lp->gallivm->engine,
- variant->function[i]);
- LLVMDeleteFunction(variant->function[i]);
+ gallivm_free_function(variant->gallivm,
+ variant->function[i],
+ variant->jit_function[i]);
}
}
+ gallivm_destroy(variant->gallivm);
+
/* remove from shader's list */
remove_from_list(&variant->list_item_local);
variant->shader->variants_cached--;
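Illustrative sketch (not part of the patch): the bit selection in generate_quad_mask() above maps each 2x2 quad onto the 4x4 stamp mask, whose bits are laid out one per pixel with a row stride of 4. The standalone C program below reproduces the same indices from the shift table (0, 2, 8, 10) and the j = 2*(i%2) + (i/2)*8 offsets used in the hunk; the helper name and the printing are illustrative only.

#include <stdio.h>

/* A 2x2 quad whose top-left pixel sits at stamp bit b covers bits
 * b+0, b+1 (top row) and b+4, b+5 (bottom row). */
static void
print_quad_bits(unsigned first_quad, unsigned fs_length)
{
   static const unsigned shift[4] = { 0, 2, 8, 10 };
   unsigned i;

   for (i = 0; i < fs_length / 4; i++) {
      unsigned j = 2 * (i % 2) + (i / 2) * 8;
      unsigned b = shift[first_quad] + j;
      printf("quad %u reads stamp bits %u,%u,%u,%u\n",
             first_quad + i, b + 0, b + 1, b + 4, b + 5);
   }
}

int main(void)
{
   print_quad_bits(0, 8);  /* 8-wide fs, quads 0,1: bits 0,1,4,5 and 2,3,6,7 */
   print_quad_bits(2, 8);  /* 8-wide fs, quads 2,3: bits 8,9,12,13 and 10,11,14,15 */
   print_quad_bits(1, 4);  /* 4-wide fs, quad 1:    bits 2,3,6,7 */
   return 0;
}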
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.h b/src/gallium/drivers/llvmpipe/lp_state_fs.h
index 273d241d8fc..306f5f9669a 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.h
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.h
@@ -84,6 +84,12 @@ struct lp_fragment_shader_variant
boolean opaque;
+ struct gallivm_state *gallivm;
+
+ LLVMTypeRef jit_context_ptr_type;
+ LLVMTypeRef jit_thread_data_ptr_type;
+ LLVMTypeRef jit_linear_context_ptr_type;
+
LLVMValueRef function[2];
lp_jit_frag_func jit_function[2];
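A minimal sketch of the per-variant compile/teardown flow these new fields support, using only functions named in this diff (gallivm_create, gallivm_verify_function, gallivm_compile_module, gallivm_jit_function, gallivm_free_function, gallivm_destroy). The emit_ir_for_variant() helper and the header paths are assumptions; error handling and the second RAST_WHOLE function are elided, so treat this as an outline rather than buildable code.

#include "lp_state_fs.h"             /* assumed header locations */
#include "gallivm/lp_bld_init.h"

static boolean
compile_variant_sketch(struct lp_fragment_shader_variant *variant)
{
   variant->gallivm = gallivm_create();       /* per-variant module + context */
   if (!variant->gallivm)
      return FALSE;

   emit_ir_for_variant(variant);              /* hypothetical: builds the IR as
                                               * generate_fragment() does */
   gallivm_verify_function(variant->gallivm, variant->function[RAST_EDGE_TEST]);
   gallivm_compile_module(variant->gallivm);  /* optimize + generate code */

   variant->jit_function[RAST_EDGE_TEST] = (lp_jit_frag_func)
      gallivm_jit_function(variant->gallivm, variant->function[RAST_EDGE_TEST]);
   return TRUE;
}

static void
destroy_variant_sketch(struct lp_fragment_shader_variant *variant)
{
   /* as in llvmpipe_remove_shader_variant(): free the code, then the state */
   gallivm_free_function(variant->gallivm,
                         variant->function[RAST_EDGE_TEST],
                         variant->jit_function[RAST_EDGE_TEST]);
   gallivm_destroy(variant->gallivm);
}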
diff --git a/src/gallium/drivers/llvmpipe/lp_state_setup.c b/src/gallium/drivers/llvmpipe/lp_state_setup.c
index 299c1ef85dc..1d5e50be9b7 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_setup.c
@@ -38,7 +38,6 @@
#include "gallivm/lp_bld_intr.h"
#include "gallivm/lp_bld_flow.h"
#include "gallivm/lp_bld_type.h"
-#include <llvm-c/Analysis.h> /* for LLVMVerifyFunction */
#include "lp_perf.h"
#include "lp_debug.h"
@@ -77,12 +76,6 @@ struct lp_setup_args
LLVMValueRef dy01_ooa;
LLVMValueRef dx20_ooa;
LLVMValueRef dx01_ooa;
-
- /* Temporary, per-attribute:
- */
- LLVMValueRef v0a;
- LLVMValueRef v1a;
- LLVMValueRef v2a;
};
@@ -146,7 +139,7 @@ store_coef(struct gallivm_state *gallivm,
{
LLVMBuilderRef builder = gallivm->builder;
LLVMValueRef idx = lp_build_const_int32(gallivm, slot);
-
+
LLVMBuildStore(builder,
a0,
LLVMBuildGEP(builder, args->a0, &idx, 1, ""));
@@ -210,27 +203,13 @@ vert_attrib(struct gallivm_state *gallivm,
return LLVMBuildLoad(b, LLVMBuildGEP(b, vert, idx, 2, ""), name);
}
-static LLVMValueRef
-vert_clamp(LLVMBuilderRef b,
- LLVMValueRef x,
- LLVMValueRef min,
- LLVMValueRef max)
-{
- LLVMValueRef min_result = LLVMBuildFCmp(b, LLVMRealUGT, min, x, "");
- LLVMValueRef max_result = LLVMBuildFCmp(b, LLVMRealUGT, x, max, "");
- LLVMValueRef clamp_value;
-
- clamp_value = LLVMBuildSelect(b, min_result, min, x, "");
- clamp_value = LLVMBuildSelect(b, max_result, max, x, "");
-
- return clamp_value;
-}
static void
lp_twoside(struct gallivm_state *gallivm,
struct lp_setup_args *args,
const struct lp_setup_variant_key *key,
- int bcolor_slot)
+ int bcolor_slot,
+ LLVMValueRef attribv[3])
{
LLVMBuilderRef b = gallivm->builder;
LLVMValueRef a0_back, a1_back, a2_back;
@@ -248,67 +227,66 @@ lp_twoside(struct gallivm_state *gallivm,
* Prefer select to if so we don't have to worry about phis or
* allocas.
*/
- args->v0a = LLVMBuildSelect(b, front_facing, a0_back, args->v0a, "");
- args->v1a = LLVMBuildSelect(b, front_facing, a1_back, args->v1a, "");
- args->v2a = LLVMBuildSelect(b, front_facing, a2_back, args->v2a, "");
+ attribv[0] = LLVMBuildSelect(b, front_facing, a0_back, attribv[0], "");
+ attribv[1] = LLVMBuildSelect(b, front_facing, a1_back, attribv[1], "");
+ attribv[2] = LLVMBuildSelect(b, front_facing, a2_back, attribv[2], "");
}
static void
lp_do_offset_tri(struct gallivm_state *gallivm,
struct lp_setup_args *args,
- const struct lp_setup_variant_key *key)
+ const struct lp_setup_variant_key *key,
+ LLVMValueRef inv_det,
+ LLVMValueRef dxyz01,
+ LLVMValueRef dxyz20,
+ LLVMValueRef attribv[3])
{
LLVMBuilderRef b = gallivm->builder;
struct lp_build_context bld;
LLVMValueRef zoffset, mult;
LLVMValueRef z0_new, z1_new, z2_new;
- LLVMValueRef dzdx0, dzdx, dzdy0, dzdy;
- LLVMValueRef max, max_value;
-
- LLVMValueRef one = lp_build_const_float(gallivm, 1.0);
- LLVMValueRef zero = lp_build_const_float(gallivm, 0.0);
- LLVMValueRef two = lp_build_const_int32(gallivm, 2);
-
- /* edge vectors: e = v0 - v2, f = v1 - v2 */
- LLVMValueRef v0_x = vert_attrib(gallivm, args->v0, 0, 0, "v0_x");
- LLVMValueRef v1_x = vert_attrib(gallivm, args->v1, 0, 0, "v1_x");
- LLVMValueRef v2_x = vert_attrib(gallivm, args->v2, 0, 0, "v2_x");
- LLVMValueRef v0_y = vert_attrib(gallivm, args->v0, 0, 1, "v0_y");
- LLVMValueRef v1_y = vert_attrib(gallivm, args->v1, 0, 1, "v1_y");
- LLVMValueRef v2_y = vert_attrib(gallivm, args->v2, 0, 1, "v2_y");
- LLVMValueRef v0_z = vert_attrib(gallivm, args->v0, 0, 2, "v0_z");
- LLVMValueRef v1_z = vert_attrib(gallivm, args->v1, 0, 2, "v1_z");
- LLVMValueRef v2_z = vert_attrib(gallivm, args->v2, 0, 2, "v2_z");
-
- /* edge vectors: e = v0 - v2, f = v1 - v2 */
- LLVMValueRef dx02 = LLVMBuildFSub(b, v0_x, v2_x, "dx02");
- LLVMValueRef dy02 = LLVMBuildFSub(b, v0_y, v2_y, "dy02");
- LLVMValueRef dz02 = LLVMBuildFSub(b, v0_z, v2_z, "dz02");
- LLVMValueRef dx12 = LLVMBuildFSub(b, v1_x, v2_x, "dx12");
- LLVMValueRef dy12 = LLVMBuildFSub(b, v1_y, v2_y, "dy12");
- LLVMValueRef dz12 = LLVMBuildFSub(b, v1_z, v2_z, "dz12");
-
- /* det = cross(e,f).z */
- LLVMValueRef dx02_dy12 = LLVMBuildFMul(b, dx02, dy12, "dx02_dy12");
- LLVMValueRef dy02_dx12 = LLVMBuildFMul(b, dy02, dx12, "dy02_dx12");
- LLVMValueRef det = LLVMBuildFSub(b, dx02_dy12, dy02_dx12, "det");
- LLVMValueRef inv_det = LLVMBuildFDiv(b, one, det, "inv_det");
-
- /* (res1,res2) = cross(e,f).xy */
- LLVMValueRef dy02_dz12 = LLVMBuildFMul(b, dy02, dz12, "dy02_dz12");
- LLVMValueRef dz02_dy12 = LLVMBuildFMul(b, dz02, dy12, "dz02_dy12");
- LLVMValueRef dz02_dx12 = LLVMBuildFMul(b, dz02, dx12, "dz02_dx12");
- LLVMValueRef dx02_dz12 = LLVMBuildFMul(b, dx02, dz12, "dx02_dz12");
- LLVMValueRef res1 = LLVMBuildFSub(b, dy02_dz12, dz02_dy12, "res1");
- LLVMValueRef res2 = LLVMBuildFSub(b, dz02_dx12, dx02_dz12, "res2");
+ LLVMValueRef dzdxdzdy, dzdx, dzdy, dzxyz20, dyzzx01, dyzzx01_dzxyz20, dzx01_dyz20;
+ LLVMValueRef z0z1, z0z1z2;
+ LLVMValueRef max, max_value, res12;
+ LLVMValueRef shuffles[4];
+ LLVMTypeRef shuf_type = LLVMInt32TypeInContext(gallivm->context);
+ LLVMValueRef onei = lp_build_const_int32(gallivm, 1);
+ LLVMValueRef zeroi = lp_build_const_int32(gallivm, 0);
+ LLVMValueRef twoi = lp_build_const_int32(gallivm, 2);
+ LLVMValueRef threei = lp_build_const_int32(gallivm, 3);
+
+ /* (res12) = cross(e,f).xy */
+ shuffles[0] = twoi;
+ shuffles[1] = zeroi;
+ shuffles[2] = onei;
+ shuffles[3] = twoi;
+ dzxyz20 = LLVMBuildShuffleVector(b, dxyz20, dxyz20, LLVMConstVector(shuffles, 4), "");
+
+ shuffles[0] = onei;
+ shuffles[1] = twoi;
+ shuffles[2] = twoi;
+ shuffles[3] = zeroi;
+ dyzzx01 = LLVMBuildShuffleVector(b, dxyz01, dxyz01, LLVMConstVector(shuffles, 4), "");
+
+ dyzzx01_dzxyz20 = LLVMBuildFMul(b, dzxyz20, dyzzx01, "dyzzx01_dzxyz20");
+
+ shuffles[0] = twoi;
+ shuffles[1] = threei;
+ shuffles[2] = LLVMGetUndef(shuf_type);
+ shuffles[3] = LLVMGetUndef(shuf_type);
+ dzx01_dyz20 = LLVMBuildShuffleVector(b, dyzzx01_dzxyz20, dyzzx01_dzxyz20,
+ LLVMConstVector(shuffles, 4), "");
+
+ res12 = LLVMBuildFSub(b, dyzzx01_dzxyz20, dzx01_dyz20, "res12");
/* dzdx = fabsf(res1 * inv_det), dydx = fabsf(res2 * inv_det)*/
- lp_build_context_init(&bld, gallivm, lp_type_float(32));
- dzdx0 = LLVMBuildFMul(b, res1, inv_det, "dzdx");
- dzdx = lp_build_abs(&bld, dzdx0);
- dzdy0 = LLVMBuildFMul(b, res2, inv_det, "dzdy");
- dzdy = lp_build_abs(&bld, dzdy0);
+ lp_build_context_init(&bld, gallivm, lp_type_float_vec(32, 128));
+ dzdxdzdy = LLVMBuildFMul(b, res12, inv_det, "dzdxdzdy");
+ dzdxdzdy = lp_build_abs(&bld, dzdxdzdy);
+
+ dzdx = LLVMBuildExtractElement(b, dzdxdzdy, zeroi, "");
+ dzdy = LLVMBuildExtractElement(b, dzdxdzdy, onei, "");
/* zoffset = offset->units + MAX2(dzdx, dzdy) * offset->scale */
max = LLVMBuildFCmp(b, LLVMRealUGT, dzdx, dzdy, "");
@@ -317,45 +295,56 @@ lp_do_offset_tri(struct gallivm_state *gallivm,
mult = LLVMBuildFMul(b, max_value, lp_build_const_float(gallivm, key->scale), "");
zoffset = LLVMBuildFAdd(b, lp_build_const_float(gallivm, key->units), mult, "zoffset");
+ /* yuck */
+ shuffles[0] = twoi;
+ shuffles[1] = lp_build_const_int32(gallivm, 6);
+ shuffles[2] = LLVMGetUndef(shuf_type);
+ shuffles[3] = LLVMGetUndef(shuf_type);
+ z0z1 = LLVMBuildShuffleVector(b, attribv[0], attribv[1], LLVMConstVector(shuffles, 4), "");
+ shuffles[0] = zeroi;
+ shuffles[1] = onei;
+ shuffles[2] = lp_build_const_int32(gallivm, 6);
+ shuffles[3] = LLVMGetUndef(shuf_type);
+ z0z1z2 = LLVMBuildShuffleVector(b, z0z1, attribv[2], LLVMConstVector(shuffles, 4), "");
+ zoffset = vec4f_from_scalar(gallivm, zoffset, "");
+
/* clamp and do offset */
- z0_new = vert_clamp(b, LLVMBuildFAdd(b, v0_z, zoffset, ""), zero, one);
- z1_new = vert_clamp(b, LLVMBuildFAdd(b, v1_z, zoffset, ""), zero, one);
- z2_new = vert_clamp(b, LLVMBuildFAdd(b, v2_z, zoffset, ""), zero, one);
+ z0z1z2 = lp_build_clamp(&bld, LLVMBuildFAdd(b, z0z1z2, zoffset, ""), bld.zero, bld.one);
/* insert into args->a0.z, a1.z, a2.z:
- */
- args->v0a = LLVMBuildInsertElement(b, args->v0a, z0_new, two, "");
- args->v1a = LLVMBuildInsertElement(b, args->v1a, z1_new, two, "");
- args->v2a = LLVMBuildInsertElement(b, args->v2a, z2_new, two, "");
+ */
+ z0_new = LLVMBuildExtractElement(b, z0z1z2, zeroi, "");
+ z1_new = LLVMBuildExtractElement(b, z0z1z2, onei, "");
+ z2_new = LLVMBuildExtractElement(b, z0z1z2, twoi, "");
+ attribv[0] = LLVMBuildInsertElement(b, attribv[0], z0_new, twoi, "");
+ attribv[1] = LLVMBuildInsertElement(b, attribv[1], z1_new, twoi, "");
+ attribv[2] = LLVMBuildInsertElement(b, attribv[2], z2_new, twoi, "");
}
static void
load_attribute(struct gallivm_state *gallivm,
struct lp_setup_args *args,
const struct lp_setup_variant_key *key,
- unsigned vert_attr)
+ unsigned vert_attr,
+ LLVMValueRef attribv[3])
{
LLVMBuilderRef b = gallivm->builder;
LLVMValueRef idx = lp_build_const_int32(gallivm, vert_attr);
/* Load the vertex data
*/
- args->v0a = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v0, &idx, 1, ""), "v0a");
- args->v1a = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v1, &idx, 1, ""), "v1a");
- args->v2a = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v2, &idx, 1, ""), "v2a");
+ attribv[0] = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v0, &idx, 1, ""), "v0a");
+ attribv[1] = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v1, &idx, 1, ""), "v1a");
+ attribv[2] = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v2, &idx, 1, ""), "v2a");
- /* Potentially modify it according to twoside, offset, etc:
+ /* Potentially modify it according to twoside, etc:
*/
- if (vert_attr == 0 && (key->scale != 0.0f || key->units != 0.0f)) {
- lp_do_offset_tri(gallivm, args, key);
- }
-
if (key->twoside) {
if (vert_attr == key->color_slot && key->bcolor_slot >= 0)
- lp_twoside(gallivm, args, key, key->bcolor_slot);
+ lp_twoside(gallivm, args, key, key->bcolor_slot, attribv);
else if (vert_attr == key->spec_slot && key->bspec_slot >= 0)
- lp_twoside(gallivm, args, key, key->bspec_slot);
+ lp_twoside(gallivm, args, key, key->bspec_slot, attribv);
}
}
@@ -375,8 +364,6 @@ emit_coef4( struct gallivm_state *gallivm,
LLVMValueRef x0_center = args->x0_center;
LLVMValueRef y0_center = args->y0_center;
- /* XXX: using fsub, fmul on vector types -- does this work??
- */
LLVMValueRef da01 = LLVMBuildFSub(b, a0, a1, "da01");
LLVMValueRef da20 = LLVMBuildFSub(b, a2, a0, "da20");
@@ -406,14 +393,15 @@ emit_coef4( struct gallivm_state *gallivm,
static void
emit_linear_coef( struct gallivm_state *gallivm,
struct lp_setup_args *args,
- unsigned slot)
+ unsigned slot,
+ LLVMValueRef attribv[3])
{
/* nothing to do anymore */
emit_coef4(gallivm,
args, slot,
- args->v0a,
- args->v1a,
- args->v2a);
+ attribv[0],
+ attribv[1],
+ attribv[2]);
}
@@ -426,9 +414,10 @@ emit_linear_coef( struct gallivm_state *gallivm,
* divide the interpolated value by the interpolated W at that fragment.
*/
static void
-emit_perspective_coef( struct gallivm_state *gallivm,
- struct lp_setup_args *args,
- unsigned slot)
+apply_perspective_corr( struct gallivm_state *gallivm,
+ struct lp_setup_args *args,
+ unsigned slot,
+ LLVMValueRef attribv[3])
{
LLVMBuilderRef b = gallivm->builder;
@@ -438,20 +427,19 @@ emit_perspective_coef( struct gallivm_state *gallivm,
LLVMValueRef v1_oow = vec4f_from_scalar(gallivm, vert_attrib(gallivm, args->v1, 0, 3, ""), "v1_oow");
LLVMValueRef v2_oow = vec4f_from_scalar(gallivm, vert_attrib(gallivm, args->v2, 0, 3, ""), "v2_oow");
- LLVMValueRef v0_oow_v0a = LLVMBuildFMul(b, args->v0a, v0_oow, "v0_oow_v0a");
- LLVMValueRef v1_oow_v1a = LLVMBuildFMul(b, args->v1a, v1_oow, "v1_oow_v1a");
- LLVMValueRef v2_oow_v2a = LLVMBuildFMul(b, args->v2a, v2_oow, "v2_oow_v2a");
-
- emit_coef4(gallivm, args, slot, v0_oow_v0a, v1_oow_v1a, v2_oow_v2a);
+ attribv[0] = LLVMBuildFMul(b, attribv[0], v0_oow, "v0_oow_v0a");
+ attribv[1] = LLVMBuildFMul(b, attribv[1], v1_oow, "v1_oow_v1a");
+ attribv[2] = LLVMBuildFMul(b, attribv[2], v2_oow, "v2_oow_v2a");
}
static void
emit_position_coef( struct gallivm_state *gallivm,
struct lp_setup_args *args,
- int slot )
+ int slot,
+ LLVMValueRef attribv[3])
{
- emit_linear_coef(gallivm, args, slot);
+ emit_linear_coef(gallivm, args, slot, attribv);
}
@@ -464,7 +452,9 @@ emit_position_coef( struct gallivm_state *gallivm,
static void
emit_apply_cyl_wrap(struct gallivm_state *gallivm,
struct lp_setup_args *args,
- uint cyl_wrap)
+ uint cyl_wrap,
+ LLVMValueRef attribv[3])
+
{
LLVMBuilderRef builder = gallivm->builder;
struct lp_type type = lp_float32_vec4_type();
@@ -489,43 +479,43 @@ emit_apply_cyl_wrap(struct gallivm_state *gallivm,
one = LLVMBuildAnd(builder, one, cyl_mask, "");
/* Edge v0 -> v1 */
- delta = LLVMBuildFSub(builder, args->v1a, args->v0a, "");
+ delta = LLVMBuildFSub(builder, attribv[1], attribv[0], "");
- offset = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER, delta, pos_half);
- offset = LLVMBuildAnd(builder, offset, one, "");
- offset = LLVMBuildBitCast(builder, offset, float_vec_type, "");
- args->v0a = LLVMBuildFAdd(builder, args->v0a, offset, "");
+ offset = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER, delta, pos_half);
+ offset = LLVMBuildAnd(builder, offset, one, "");
+ offset = LLVMBuildBitCast(builder, offset, float_vec_type, "");
+ attribv[0] = LLVMBuildFAdd(builder, attribv[0], offset, "");
- offset = lp_build_compare(gallivm, type, PIPE_FUNC_LESS, delta, neg_half);
- offset = LLVMBuildAnd(builder, offset, one, "");
- offset = LLVMBuildBitCast(builder, offset, float_vec_type, "");
- args->v1a = LLVMBuildFAdd(builder, args->v1a, offset, "");
+ offset = lp_build_compare(gallivm, type, PIPE_FUNC_LESS, delta, neg_half);
+ offset = LLVMBuildAnd(builder, offset, one, "");
+ offset = LLVMBuildBitCast(builder, offset, float_vec_type, "");
+ attribv[1] = LLVMBuildFAdd(builder, attribv[1], offset, "");
/* Edge v1 -> v2 */
- delta = LLVMBuildFSub(builder, args->v2a, args->v1a, "");
+ delta = LLVMBuildFSub(builder, attribv[2], attribv[1], "");
- offset = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER, delta, pos_half);
- offset = LLVMBuildAnd(builder, offset, one, "");
- offset = LLVMBuildBitCast(builder, offset, float_vec_type, "");
- args->v1a = LLVMBuildFAdd(builder, args->v1a, offset, "");
+ offset = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER, delta, pos_half);
+ offset = LLVMBuildAnd(builder, offset, one, "");
+ offset = LLVMBuildBitCast(builder, offset, float_vec_type, "");
+ attribv[1] = LLVMBuildFAdd(builder, attribv[1], offset, "");
- offset = lp_build_compare(gallivm, type, PIPE_FUNC_LESS, delta, neg_half);
- offset = LLVMBuildAnd(builder, offset, one, "");
- offset = LLVMBuildBitCast(builder, offset, float_vec_type, "");
- args->v2a = LLVMBuildFAdd(builder, args->v2a, offset, "");
+ offset = lp_build_compare(gallivm, type, PIPE_FUNC_LESS, delta, neg_half);
+ offset = LLVMBuildAnd(builder, offset, one, "");
+ offset = LLVMBuildBitCast(builder, offset, float_vec_type, "");
+ attribv[2] = LLVMBuildFAdd(builder, attribv[2], offset, "");
/* Edge v2 -> v0 */
- delta = LLVMBuildFSub(builder, args->v0a, args->v2a, "");
+ delta = LLVMBuildFSub(builder, attribv[0], attribv[2], "");
- offset = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER, delta, pos_half);
- offset = LLVMBuildAnd(builder, offset, one, "");
- offset = LLVMBuildBitCast(builder, offset, float_vec_type, "");
- args->v2a = LLVMBuildFAdd(builder, args->v2a, offset, "");
+ offset = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER, delta, pos_half);
+ offset = LLVMBuildAnd(builder, offset, one, "");
+ offset = LLVMBuildBitCast(builder, offset, float_vec_type, "");
+ attribv[2] = LLVMBuildFAdd(builder, attribv[2], offset, "");
- offset = lp_build_compare(gallivm, type, PIPE_FUNC_LESS, delta, neg_half);
- offset = LLVMBuildAnd(builder, offset, one, "");
- offset = LLVMBuildBitCast(builder, offset, float_vec_type, "");
- args->v0a = LLVMBuildFAdd(builder, args->v0a, offset, "");
+ offset = lp_build_compare(gallivm, type, PIPE_FUNC_LESS, delta, neg_half);
+ offset = LLVMBuildAnd(builder, offset, one, "");
+ offset = LLVMBuildBitCast(builder, offset, float_vec_type, "");
+ attribv[0] = LLVMBuildFAdd(builder, attribv[0], offset, "");
}
@@ -534,43 +524,38 @@ emit_apply_cyl_wrap(struct gallivm_state *gallivm,
*/
static void
emit_tri_coef( struct gallivm_state *gallivm,
- const struct lp_setup_variant_key *key,
- struct lp_setup_args *args )
+ const struct lp_setup_variant_key *key,
+ struct lp_setup_args *args)
{
unsigned slot;
- /* The internal position input is in slot zero:
- */
- load_attribute(gallivm, args, key, 0);
- emit_position_coef(gallivm, args, 0);
+ LLVMValueRef attribs[3];
- /* setup interpolation for all the remaining attributes:
+ /* setup interpolation for all the remaining attributes:
*/
for (slot = 0; slot < key->num_inputs; slot++) {
-
- if (key->inputs[slot].interp == LP_INTERP_CONSTANT ||
- key->inputs[slot].interp == LP_INTERP_LINEAR ||
- key->inputs[slot].interp == LP_INTERP_PERSPECTIVE)
- load_attribute(gallivm, args, key, key->inputs[slot].src_index);
-
switch (key->inputs[slot].interp) {
case LP_INTERP_CONSTANT:
- if (key->flatshade_first) {
- emit_constant_coef4(gallivm, args, slot+1, args->v0a);
- }
- else {
- emit_constant_coef4(gallivm, args, slot+1, args->v2a);
- }
- break;
+ load_attribute(gallivm, args, key, key->inputs[slot].src_index, attribs);
+ if (key->flatshade_first) {
+ emit_constant_coef4(gallivm, args, slot+1, attribs[0]);
+ }
+ else {
+ emit_constant_coef4(gallivm, args, slot+1, attribs[2]);
+ }
+ break;
case LP_INTERP_LINEAR:
- emit_apply_cyl_wrap(gallivm, args, key->inputs[slot].cyl_wrap);
- emit_linear_coef(gallivm, args, slot+1);
+ load_attribute(gallivm, args, key, key->inputs[slot].src_index, attribs);
+ emit_apply_cyl_wrap(gallivm, args, key->inputs[slot].cyl_wrap, attribs);
+ emit_linear_coef(gallivm, args, slot+1, attribs);
break;
case LP_INTERP_PERSPECTIVE:
- emit_apply_cyl_wrap(gallivm, args, key->inputs[slot].cyl_wrap);
- emit_perspective_coef(gallivm, args, slot+1);
+ load_attribute(gallivm, args, key, key->inputs[slot].src_index, attribs);
+ emit_apply_cyl_wrap(gallivm, args, key->inputs[slot].cyl_wrap, attribs);
+ apply_perspective_corr(gallivm, args, slot+1, attribs);
+ emit_linear_coef(gallivm, args, slot+1, attribs);
break;
case LP_INTERP_POSITION:
@@ -591,62 +576,6 @@ emit_tri_coef( struct gallivm_state *gallivm,
}
-/* XXX: This is generic code, share with fs/vs codegen:
- */
-static lp_jit_setup_triangle
-finalize_function(struct gallivm_state *gallivm,
- LLVMBuilderRef builder,
- LLVMValueRef function)
-{
- void *f;
-
- /* Verify the LLVM IR. If invalid, dump and abort */
-#ifdef DEBUG
- if (LLVMVerifyFunction(function, LLVMPrintMessageAction)) {
- if (1)
- lp_debug_dump_value(function);
- abort();
- }
-#endif
-
- /* Apply optimizations to LLVM IR */
- LLVMRunFunctionPassManager(gallivm->passmgr, function);
-
- if (gallivm_debug & GALLIVM_DEBUG_IR)
- {
- /* Print the LLVM IR to stderr */
- lp_debug_dump_value(function);
- debug_printf("\n");
- }
-
- /*
- * Translate the LLVM IR into machine code.
- */
- f = LLVMGetPointerToGlobal(gallivm->engine, function);
-
- if (gallivm_debug & GALLIVM_DEBUG_ASM)
- {
- lp_disassemble(f);
- }
-
- lp_func_delete_body(function);
-
- return (lp_jit_setup_triangle) pointer_to_func(f);
-}
-
-/* XXX: Generic code:
- */
-static void
-lp_emit_emms(struct gallivm_state *gallivm)
-{
-#ifdef PIPE_ARCH_X86
- /* Avoid corrupting the FPU stack on 32bit OSes. */
- lp_build_intrinsic(gallivm->builder, "llvm.x86.mmx.emms",
- LLVMVoidTypeInContext(gallivm->context), NULL, 0);
-#endif
-}
-
-
/* XXX: generic code:
*/
static void
@@ -664,49 +593,70 @@ set_noalias(LLVMBuilderRef builder,
static void
init_args(struct gallivm_state *gallivm,
- struct lp_setup_args *args,
- const struct lp_setup_variant *variant)
+ const struct lp_setup_variant_key *key,
+ struct lp_setup_args *args)
{
LLVMBuilderRef b = gallivm->builder;
+ LLVMTypeRef shuf_type = LLVMInt32TypeInContext(gallivm->context);
+ LLVMValueRef onef = lp_build_const_float(gallivm, 1.0);
+ LLVMValueRef onei = lp_build_const_int32(gallivm, 1);
+ LLVMValueRef zeroi = lp_build_const_int32(gallivm, 0);
+ LLVMValueRef pixel_center, xy0_center, dxy01, dxy20, dyx20;
+ LLVMValueRef e, f, ef, ooa;
+ LLVMValueRef shuffles[4];
+ LLVMValueRef attr_pos[3];
+ struct lp_type typef4 = lp_type_float_vec(32, 128);
- LLVMValueRef v0_x = vert_attrib(gallivm, args->v0, 0, 0, "v0_x");
- LLVMValueRef v0_y = vert_attrib(gallivm, args->v0, 0, 1, "v0_y");
+ /* The internal position input is in slot zero:
+ */
+ load_attribute(gallivm, args, key, 0, attr_pos);
- LLVMValueRef v1_x = vert_attrib(gallivm, args->v1, 0, 0, "v1_x");
- LLVMValueRef v1_y = vert_attrib(gallivm, args->v1, 0, 1, "v1_y");
+ pixel_center = lp_build_const_vec(gallivm, typef4,
+ key->pixel_center_half ? 0.5 : 0.0);
- LLVMValueRef v2_x = vert_attrib(gallivm, args->v2, 0, 0, "v2_x");
- LLVMValueRef v2_y = vert_attrib(gallivm, args->v2, 0, 1, "v2_y");
+ /*
+ * x/y are the first two elements of v0a/v1a/v2a, but just use vec4 arithmetic;
+ * offset_tri actually uses x/y/z from them.
+ */
+ xy0_center = LLVMBuildFSub(b, attr_pos[0], pixel_center, "xy0_center" );
- LLVMValueRef pixel_center = lp_build_const_float(gallivm,
- variant->key.pixel_center_half ? 0.5 : 0);
+ dxy01 = LLVMBuildFSub(b, attr_pos[0], attr_pos[1], "dxy01");
+ dxy20 = LLVMBuildFSub(b, attr_pos[2], attr_pos[0], "dxy20");
- LLVMValueRef x0_center = LLVMBuildFSub(b, v0_x, pixel_center, "x0_center" );
- LLVMValueRef y0_center = LLVMBuildFSub(b, v0_y, pixel_center, "y0_center" );
-
- LLVMValueRef dx01 = LLVMBuildFSub(b, v0_x, v1_x, "dx01");
- LLVMValueRef dy01 = LLVMBuildFSub(b, v0_y, v1_y, "dy01");
- LLVMValueRef dx20 = LLVMBuildFSub(b, v2_x, v0_x, "dx20");
- LLVMValueRef dy20 = LLVMBuildFSub(b, v2_y, v0_y, "dy20");
+ shuffles[0] = onei;
+ shuffles[1] = zeroi;
+ shuffles[2] = LLVMGetUndef(shuf_type);
+ shuffles[3] = LLVMGetUndef(shuf_type);
+
+ dyx20 = LLVMBuildShuffleVector(b, dxy20, dxy20, LLVMConstVector(shuffles, 4), "");
+
+ ef = LLVMBuildFMul(b, dxy01, dyx20, "ef");
+ e = LLVMBuildExtractElement(b, ef, zeroi, "");
+ f = LLVMBuildExtractElement(b, ef, onei, "");
- LLVMValueRef one = lp_build_const_float(gallivm, 1.0);
- LLVMValueRef e = LLVMBuildFMul(b, dx01, dy20, "e");
- LLVMValueRef f = LLVMBuildFMul(b, dx20, dy01, "f");
- LLVMValueRef ooa = LLVMBuildFDiv(b, one, LLVMBuildFSub(b, e, f, ""), "ooa");
+ ooa = LLVMBuildFDiv(b, onef, LLVMBuildFSub(b, e, f, ""), "ooa");
- LLVMValueRef dy20_ooa = LLVMBuildFMul(b, dy20, ooa, "dy20_ooa");
- LLVMValueRef dy01_ooa = LLVMBuildFMul(b, dy01, ooa, "dy01_ooa");
- LLVMValueRef dx20_ooa = LLVMBuildFMul(b, dx20, ooa, "dx20_ooa");
- LLVMValueRef dx01_ooa = LLVMBuildFMul(b, dx01, ooa, "dx01_ooa");
+ ooa = vec4f_from_scalar(gallivm, ooa, "");
+
+ /* tri offset calc shares a lot of arithmetic, do it here */
+ if (key->scale != 0.0f || key->units != 0.0f) {
+ lp_do_offset_tri(gallivm, args, key, ooa, dxy01, dxy20, attr_pos);
+ }
- args->dy20_ooa = vec4f_from_scalar(gallivm, dy20_ooa, "dy20_ooa_4f");
- args->dy01_ooa = vec4f_from_scalar(gallivm, dy01_ooa, "dy01_ooa_4f");
+ dxy20 = LLVMBuildFMul(b, dxy20, ooa, "");
+ dxy01 = LLVMBuildFMul(b, dxy01, ooa, "");
- args->dx20_ooa = vec4f_from_scalar(gallivm, dx20_ooa, "dx20_ooa_4f");
- args->dx01_ooa = vec4f_from_scalar(gallivm, dx01_ooa, "dx01_ooa_4f");
+ args->dy20_ooa = lp_build_extract_broadcast(gallivm, typef4, typef4, dxy20, onei);
+ args->dy01_ooa = lp_build_extract_broadcast(gallivm, typef4, typef4, dxy01, onei);
- args->x0_center = vec4f_from_scalar(gallivm, x0_center, "x0_center_4f");
- args->y0_center = vec4f_from_scalar(gallivm, y0_center, "y0_center_4f");
+ args->dx20_ooa = lp_build_extract_broadcast(gallivm, typef4, typef4, dxy20, zeroi);
+ args->dx01_ooa = lp_build_extract_broadcast(gallivm, typef4, typef4, dxy01, zeroi);
+
+ args->x0_center = lp_build_extract_broadcast(gallivm, typef4, typef4, xy0_center, zeroi);
+ args->y0_center = lp_build_extract_broadcast(gallivm, typef4, typef4, xy0_center, onei);
+
+ /* might want to merge that with other coef emit in the future */
+ emit_position_coef(gallivm, args, 0, attr_pos);
}
/**
@@ -714,18 +664,18 @@ init_args(struct gallivm_state *gallivm,
*
*/
static struct lp_setup_variant *
-generate_setup_variant(struct gallivm_state *gallivm,
- struct lp_setup_variant_key *key,
+generate_setup_variant(struct lp_setup_variant_key *key,
struct llvmpipe_context *lp)
{
struct lp_setup_variant *variant = NULL;
+ struct gallivm_state *gallivm;
struct lp_setup_args args;
char func_name[256];
LLVMTypeRef vec4f_type;
LLVMTypeRef func_type;
LLVMTypeRef arg_types[7];
LLVMBasicBlockRef block;
- LLVMBuilderRef builder = gallivm->builder;
+ LLVMBuilderRef builder;
int64_t t0 = 0, t1;
if (0)
@@ -735,6 +685,13 @@ generate_setup_variant(struct gallivm_state *gallivm,
if (variant == NULL)
goto fail;
+ variant->gallivm = gallivm = gallivm_create();
+ if (!variant->gallivm) {
+ goto fail;
+ }
+
+ builder = gallivm->builder;
+
if (LP_DEBUG & DEBUG_COUNTERS) {
t0 = os_time_get();
}
@@ -793,14 +750,17 @@ generate_setup_variant(struct gallivm_state *gallivm,
LLVMPositionBuilderAtEnd(builder, block);
set_noalias(builder, variant->function, arg_types, Elements(arg_types));
- init_args(gallivm, &args, variant);
+ init_args(gallivm, &variant->key, &args);
emit_tri_coef(gallivm, &variant->key, &args);
- lp_emit_emms(gallivm);
LLVMBuildRetVoid(builder);
- variant->jit_function = finalize_function(gallivm, builder,
- variant->function);
+ gallivm_verify_function(gallivm, variant->function);
+
+ gallivm_compile_module(gallivm);
+
+ variant->jit_function = (lp_jit_setup_triangle)
+ gallivm_jit_function(gallivm, variant->function);
if (!variant->jit_function)
goto fail;
@@ -818,10 +778,12 @@ generate_setup_variant(struct gallivm_state *gallivm,
fail:
if (variant) {
if (variant->function) {
- if (variant->jit_function)
- LLVMFreeMachineCodeForFunction(gallivm->engine,
- variant->function);
- LLVMDeleteFunction(variant->function);
+ gallivm_free_function(gallivm,
+ variant->function,
+ variant->jit_function);
+ }
+ if (variant->gallivm) {
+ gallivm_destroy(variant->gallivm);
}
FREE(variant);
}
@@ -882,10 +844,13 @@ remove_setup_variant(struct llvmpipe_context *lp,
}
if (variant->function) {
- if (variant->jit_function)
- LLVMFreeMachineCodeForFunction(lp->gallivm->engine,
- variant->function);
- LLVMDeleteFunction(variant->function);
+ gallivm_free_function(variant->gallivm,
+ variant->function,
+ variant->jit_function);
+ }
+
+ if (variant->gallivm) {
+ gallivm_destroy(variant->gallivm);
}
remove_from_list(&variant->list_item_global);
@@ -954,7 +919,7 @@ llvmpipe_update_setup(struct llvmpipe_context *lp)
cull_setup_variants(lp);
}
- variant = generate_setup_variant(lp->gallivm, key, lp);
+ variant = generate_setup_variant(key, lp);
if (variant) {
insert_at_head(&lp->setup_variants_list, &variant->list_item_global);
lp->nr_setup_variants++;
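For reference, a scalar sketch of the math lp_do_offset_tri() now performs with shuffled vec4s (illustrative only; fabsf and the ternaries stand in for lp_build_abs and the compare/select pairs). res1/res2 are cross(e,f).x/.y and inv_det is 1/cross(e,f).z, the same quantities the vector code builds:

#include <math.h>

static void
offset_tri_scalar(float res1, float res2, float inv_det,
                  float units, float scale, float z[3])
{
   float dzdx = fabsf(res1 * inv_det);
   float dzdy = fabsf(res2 * inv_det);
   float zoffset = units + (dzdx > dzdy ? dzdx : dzdy) * scale;
   int i;

   for (i = 0; i < 3; i++) {
      float zi = z[i] + zoffset;
      /* clamp to [0,1] like lp_build_clamp(&bld, ..., bld.zero, bld.one) */
      z[i] = zi < 0.0f ? 0.0f : (zi > 1.0f ? 1.0f : zi);
   }
}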
diff --git a/src/gallium/drivers/llvmpipe/lp_state_setup.h b/src/gallium/drivers/llvmpipe/lp_state_setup.h
index 609c4f62511..e0abe467a6d 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_setup.h
+++ b/src/gallium/drivers/llvmpipe/lp_state_setup.h
@@ -55,6 +55,8 @@ struct lp_setup_variant {
struct lp_setup_variant_list_item list_item_global;
+ struct gallivm_state *gallivm;
+
/* XXX: this is a pointer to the LLVM IR. Once jit_function is
* generated, we never need to use the IR again - need to find a
* way to release this data without destroying the generated
@@ -69,15 +71,6 @@ struct lp_setup_variant {
unsigned no;
};
-void lp_setup_tri_fallback( const float (*v0)[4],
- const float (*v1)[4],
- const float (*v2)[4],
- boolean front_facing,
- float (*a0)[4],
- float (*dadx)[4],
- float (*dady)[4],
- const struct lp_setup_variant_key *key );
-
void lp_delete_setup_variants(struct llvmpipe_context *lp);
void
diff --git a/src/gallium/drivers/llvmpipe/lp_test.h b/src/gallium/drivers/llvmpipe/lp_test.h
index c64f3e149fd..4b6c8a7a6a5 100644
--- a/src/gallium/drivers/llvmpipe/lp_test.h
+++ b/src/gallium/drivers/llvmpipe/lp_test.h
@@ -42,11 +42,6 @@
#include <float.h>
#include "gallivm/lp_bld.h"
-#include <llvm-c/Analysis.h>
-#include <llvm-c/ExecutionEngine.h>
-#include <llvm-c/Target.h>
-#include <llvm-c/BitWriter.h>
-#include <llvm-c/Transforms/Scalar.h>
#include "pipe/p_state.h"
#include "util/u_format.h"
@@ -64,14 +59,14 @@ write_tsv_header(FILE *fp);
boolean
-test_some(struct gallivm_state *gallivm,unsigned verbose, FILE *fp,
+test_some(unsigned verbose, FILE *fp,
unsigned long n);
boolean
-test_single(struct gallivm_state *gallivm, unsigned verbose, FILE *fp);
+test_single(unsigned verbose, FILE *fp);
boolean
-test_all(struct gallivm_state *gallivm, unsigned verbose, FILE *fp);
+test_all(unsigned verbose, FILE *fp);
#if defined(PIPE_CC_MSVC)
diff --git a/src/gallium/drivers/llvmpipe/lp_test_arit.c b/src/gallium/drivers/llvmpipe/lp_test_arit.c
index 45ca32f5866..6e09f7e67b0 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_arit.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_arit.c
@@ -53,7 +53,7 @@ write_tsv_header(FILE *fp)
}
-typedef float (*unary_func_t)(float);
+typedef void (*unary_func_t)(float *out, const float *in);
/**
@@ -180,6 +180,45 @@ const float sincos_values[] = {
5*M_PI/4,
};
+const float round_values[] = {
+ -10.0, -1, 0.0, 12.0,
+ -1.49, -0.25, 1.25, 2.51,
+ -0.99, -0.01, 0.01, 0.99,
+};
+
+static float fractf(float x)
+{
+ x -= floorf(x);
+ if (x >= 1.0f) {
+ // clamp to the largest number smaller than one
+ x = 1.0f - 0.5f*FLT_EPSILON;
+ }
+ return x;
+}
+
+
+const float fract_values[] = {
+ // http://en.wikipedia.org/wiki/IEEE_754-1985#Examples
+ 0.0f,
+ -0.0f,
+ 1.0f,
+ -1.0f,
+ 0.5f,
+ -0.5f,
+ 1.401298464324817e-45f, // smallest denormal
+ -1.401298464324817e-45f,
+ 5.88e-39f, // middle denormal
+ 1.18e-38f, // largest denormal
+ -1.18e-38f,
+ -1.62981451e-08f,
+ FLT_EPSILON,
+ -FLT_EPSILON,
+ 1.0f - 0.5f*FLT_EPSILON,
+ -1.0f + FLT_EPSILON,
+ FLT_MAX,
+ -FLT_MAX
+};
+
/*
* Unary test cases.
@@ -196,6 +235,11 @@ unary_tests[] = {
{"sin", &lp_build_sin, &sinf, sincos_values, Elements(sincos_values), 20.0 },
{"cos", &lp_build_cos, &cosf, sincos_values, Elements(sincos_values), 20.0 },
{"sgn", &lp_build_sgn, &sgnf, exp2_values, Elements(exp2_values), 20.0 },
+ {"round", &lp_build_round, &roundf, round_values, Elements(round_values), 24.0 },
+ {"trunc", &lp_build_trunc, &truncf, round_values, Elements(round_values), 24.0 },
+ {"floor", &lp_build_floor, &floorf, round_values, Elements(round_values), 24.0 },
+ {"ceil", &lp_build_ceil, &ceilf, round_values, Elements(round_values), 24.0 },
+ {"fract", &lp_build_fract_safe, &fractf, fract_values, Elements(fract_values), 24.0 },
};
@@ -204,39 +248,40 @@ unary_tests[] = {
*/
static LLVMValueRef
build_unary_test_func(struct gallivm_state *gallivm,
- LLVMModuleRef module,
- LLVMContextRef context,
const struct unary_test_t *test)
{
- struct lp_type type = lp_type_float_vec(32);
- LLVMTypeRef i32t = LLVMInt32TypeInContext(context);
- LLVMTypeRef f32t = LLVMFloatTypeInContext(context);
+ struct lp_type type = lp_type_float_vec(32, lp_native_vector_width);
+ LLVMContextRef context = gallivm->context;
+ LLVMModuleRef module = gallivm->module;
LLVMTypeRef vf32t = lp_build_vec_type(gallivm, type);
- LLVMTypeRef args[1] = { f32t };
- LLVMValueRef func = LLVMAddFunction(module, test->name, LLVMFunctionType(f32t, args, Elements(args), 0));
- LLVMValueRef arg1 = LLVMGetParam(func, 0);
+ LLVMTypeRef args[2] = { LLVMPointerType(vf32t, 0), LLVMPointerType(vf32t, 0) };
+ LLVMValueRef func = LLVMAddFunction(module, test->name,
+ LLVMFunctionType(LLVMVoidTypeInContext(context),
+ args, Elements(args), 0));
+ LLVMValueRef arg0 = LLVMGetParam(func, 0);
+ LLVMValueRef arg1 = LLVMGetParam(func, 1);
LLVMBuilderRef builder = gallivm->builder;
LLVMBasicBlockRef block = LLVMAppendBasicBlockInContext(context, func, "entry");
- LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
LLVMValueRef ret;
struct lp_build_context bld;
- lp_build_context_init(&bld, gallivm, lp_type_float_vec(32));
+ lp_build_context_init(&bld, gallivm, type);
LLVMSetFunctionCallConv(func, LLVMCCallConv);
LLVMPositionBuilderAtEnd(builder, block);
- /* scalar to vector */
- arg1 = LLVMBuildInsertElement(builder, LLVMGetUndef(vf32t), arg1, index0, "");
+ arg1 = LLVMBuildLoad(builder, arg1, "");
ret = test->builder(&bld, arg1);
- /* vector to scalar */
- ret = LLVMBuildExtractElement(builder, ret, index0, "");
+ LLVMBuildStore(builder, ret, arg0);
+
+ LLVMBuildRetVoid(builder);
+
+ gallivm_verify_function(gallivm, func);
- LLVMBuildRet(builder, ret);
return func;
}
@@ -245,67 +290,86 @@ build_unary_test_func(struct gallivm_state *gallivm,
* Test one LLVM unary arithmetic builder function.
*/
static boolean
-test_unary(struct gallivm_state *gallivm, unsigned verbose, FILE *fp, const struct unary_test_t *test)
+test_unary(unsigned verbose, FILE *fp, const struct unary_test_t *test)
{
- LLVMModuleRef module = gallivm->module;
+ struct gallivm_state *gallivm;
LLVMValueRef test_func;
- LLVMExecutionEngineRef engine = gallivm->engine;
- LLVMContextRef context = gallivm->context;
- char *error = NULL;
unary_func_t test_func_jit;
boolean success = TRUE;
- int i;
+ int i, j;
+ int length = lp_native_vector_width / 32;
+ float *in, *out;
- test_func = build_unary_test_func(gallivm, module, context, test);
+ in = align_malloc(length * 4, length * 4);
+ out = align_malloc(length * 4, length * 4);
- if (LLVMVerifyModule(module, LLVMPrintMessageAction, &error)) {
- printf("LLVMVerifyModule: %s\n", error);
- LLVMDumpModule(module);
- abort();
+ /* random NaNs or 0s could wreak havoc */
+ for (i = 0; i < length; i++) {
+ in[i] = 1.0;
}
- LLVMDisposeMessage(error);
- test_func_jit = (unary_func_t) pointer_to_func(LLVMGetPointerToGlobal(engine, test_func));
+ gallivm = gallivm_create();
- for (i = 0; i < test->num_values; ++i) {
- float value = test->values[i];
- float ref = test->ref(value);
- float src = test_func_jit(value);
+ test_func = build_unary_test_func(gallivm, test);
- double error = fabs(src - ref);
- double precision = error ? -log2(error/fabs(ref)) : FLT_MANT_DIG;
+ gallivm_compile_module(gallivm);
- bool pass = precision >= test->precision;
+ test_func_jit = (unary_func_t) gallivm_jit_function(gallivm, test_func);
- if (isnan(ref)) {
- continue;
- }
+ for (j = 0; j < (test->num_values + length - 1) / length; j++) {
+ int num_vals = ((j + 1) * length <= test->num_values) ? length :
+ test->num_values % length;
- if (!pass || verbose) {
- printf("%s(%.9g): ref = %.9g, src = %.9g, precision = %f bits, %s\n",
- test->name, value, ref, src, precision,
- pass ? "PASS" : "FAIL");
+ for (i = 0; i < num_vals; ++i) {
+ in[i] = test->values[i+j*length];
}
- if (!pass) {
- success = FALSE;
+ test_func_jit(out, in);
+ for (i = 0; i < num_vals; ++i) {
+ float ref = test->ref(in[i]);
+ double error, precision;
+ bool pass;
+
+ error = fabs(out[i] - ref);
+ precision = error ? -log2(error/fabs(ref)) : FLT_MANT_DIG;
+
+ pass = precision >= test->precision;
+
+ if (isnan(ref)) {
+ continue;
+ }
+
+ if (!pass || verbose) {
+ printf("%s(%.9g): ref = %.9g, out = %.9g, precision = %f bits, %s\n",
+ test->name, in[i], ref, out[i], precision,
+ pass ? "PASS" : "FAIL");
+ }
+
+ if (!pass) {
+ success = FALSE;
+ }
}
}
- LLVMFreeMachineCodeForFunction(engine, test_func);
+ gallivm_free_function(gallivm, test_func, test_func_jit);
+
+ gallivm_destroy(gallivm);
+
+ align_free(in);
+ align_free(out);
return success;
}
boolean
-test_all(struct gallivm_state *gallivm, unsigned verbose, FILE *fp)
+test_all(unsigned verbose, FILE *fp)
{
boolean success = TRUE;
int i;
for (i = 0; i < Elements(unary_tests); ++i) {
- if (!test_unary(gallivm, verbose, fp, &unary_tests[i])) {
+ if (!test_unary(verbose, fp, &unary_tests[i])) {
success = FALSE;
}
}
@@ -315,19 +379,19 @@ test_all(struct gallivm_state *gallivm, unsigned verbose, FILE *fp)
boolean
-test_some(struct gallivm_state *gallivm, unsigned verbose, FILE *fp,
+test_some(unsigned verbose, FILE *fp,
unsigned long n)
{
/*
* Not randomly generated test cases, so test all.
*/
- return test_all(gallivm, verbose, fp);
+ return test_all(verbose, fp);
}
boolean
-test_single(struct gallivm_state *gallivm, unsigned verbose, FILE *fp)
+test_single(unsigned verbose, FILE *fp)
{
return TRUE;
}
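A small standalone illustration of why the new fractf() reference clamps its result (the input is one of the fract_values[] entries above): the exact fractional part of a tiny negative number is just below 1.0, but single-precision subtraction rounds it up to exactly 1.0f, outside the required [0,1) range.

#include <math.h>
#include <float.h>
#include <stdio.h>

int main(void)
{
   float x = -1.62981451e-08f;
   float naive = x - floorf(x);                /* rounds up to exactly 1.0f */
   float clamped = (naive >= 1.0f) ? 1.0f - 0.5f * FLT_EPSILON : naive;

   printf("naive = %.9g, clamped = %.9g\n", naive, clamped);
   return 0;
}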
diff --git a/src/gallium/drivers/llvmpipe/lp_test_blend.c b/src/gallium/drivers/llvmpipe/lp_test_blend.c
index 51324cbb6a3..37b37fda40e 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_blend.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_blend.c
@@ -36,6 +36,7 @@
* @author Brian Paul <brian@vmware.com>
*/
+#include "util/u_memory.h"
#include "gallivm/lp_bld_init.h"
#include "gallivm/lp_bld_type.h"
@@ -53,19 +54,6 @@ enum vector_mode
typedef void (*blend_test_ptr_t)(const void *src, const void *dst, const void *con, void *res);
-/** cast wrapper */
-static blend_test_ptr_t
-voidptr_to_blend_test_ptr_t(void *p)
-{
- union {
- void *v;
- blend_test_ptr_t f;
- } u;
- u.v = p;
- return u.f;
-}
-
-
void
write_tsv_header(FILE *fp)
@@ -468,50 +456,43 @@ compute_blend_ref(const struct pipe_blend_state *blend,
PIPE_ALIGN_STACK
static boolean
-test_one(struct gallivm_state *gallivm,
- unsigned verbose,
+test_one(unsigned verbose,
FILE *fp,
const struct pipe_blend_state *blend,
enum vector_mode mode,
struct lp_type type)
{
- LLVMModuleRef module = gallivm->module;
+ struct gallivm_state *gallivm;
LLVMValueRef func = NULL;
- LLVMExecutionEngineRef engine = gallivm->engine;
- char *error = NULL;
blend_test_ptr_t blend_test_ptr;
boolean success;
const unsigned n = LP_TEST_NUM_SAMPLES;
int64_t cycles[LP_TEST_NUM_SAMPLES];
double cycles_avg = 0.0;
unsigned i, j;
- void *code;
+ const unsigned stride = lp_type_width(type)/8;
if(verbose >= 1)
dump_blend_type(stdout, blend, mode, type);
- func = add_blend_test(gallivm, blend, mode, type);
+ gallivm = gallivm_create();
- if(LLVMVerifyModule(module, LLVMPrintMessageAction, &error)) {
- LLVMDumpModule(module);
- abort();
- }
- LLVMDisposeMessage(error);
+ func = add_blend_test(gallivm, blend, mode, type);
- code = LLVMGetPointerToGlobal(engine, func);
- blend_test_ptr = voidptr_to_blend_test_ptr_t(code);
+ gallivm_compile_module(gallivm);
- if(verbose >= 2)
- lp_disassemble(code);
+ blend_test_ptr = (blend_test_ptr_t)gallivm_jit_function(gallivm, func);
success = TRUE;
- for(i = 0; i < n && success; ++i) {
- if(mode == AoS) {
- PIPE_ALIGN_VAR(16) uint8_t src[LP_NATIVE_VECTOR_WIDTH/8];
- PIPE_ALIGN_VAR(16) uint8_t dst[LP_NATIVE_VECTOR_WIDTH/8];
- PIPE_ALIGN_VAR(16) uint8_t con[LP_NATIVE_VECTOR_WIDTH/8];
- PIPE_ALIGN_VAR(16) uint8_t res[LP_NATIVE_VECTOR_WIDTH/8];
- PIPE_ALIGN_VAR(16) uint8_t ref[LP_NATIVE_VECTOR_WIDTH/8];
+ if(mode == AoS) {
+ uint8_t *src, *dst, *con, *res, *ref;
+ src = align_malloc(stride, stride);
+ dst = align_malloc(stride, stride);
+ con = align_malloc(stride, stride);
+ res = align_malloc(stride, stride);
+ ref = align_malloc(stride, stride);
+
+ for(i = 0; i < n && success; ++i) {
int64_t start_counter = 0;
int64_t end_counter = 0;
@@ -569,14 +550,21 @@ test_one(struct gallivm_state *gallivm,
fprintf(stderr, "\n");
}
}
-
- if(mode == SoA) {
- const unsigned stride = type.length*type.width/8;
- PIPE_ALIGN_VAR(16) uint8_t src[4*LP_NATIVE_VECTOR_WIDTH/8];
- PIPE_ALIGN_VAR(16) uint8_t dst[4*LP_NATIVE_VECTOR_WIDTH/8];
- PIPE_ALIGN_VAR(16) uint8_t con[4*LP_NATIVE_VECTOR_WIDTH/8];
- PIPE_ALIGN_VAR(16) uint8_t res[4*LP_NATIVE_VECTOR_WIDTH/8];
- PIPE_ALIGN_VAR(16) uint8_t ref[4*LP_NATIVE_VECTOR_WIDTH/8];
+ align_free(src);
+ align_free(dst);
+ align_free(con);
+ align_free(res);
+ align_free(ref);
+ }
+ else if(mode == SoA) {
+ uint8_t *src, *dst, *con, *res, *ref;
+ src = align_malloc(4*stride, stride);
+ dst = align_malloc(4*stride, stride);
+ con = align_malloc(4*stride, stride);
+ res = align_malloc(4*stride, stride);
+ ref = align_malloc(4*stride, stride);
+
+ for(i = 0; i < n && success; ++i) {
int64_t start_counter = 0;
int64_t end_counter = 0;
boolean mismatch;
@@ -651,6 +639,11 @@ test_one(struct gallivm_state *gallivm,
}
}
}
+ align_free(src);
+ align_free(dst);
+ align_free(con);
+ align_free(res);
+ align_free(ref);
}
/*
@@ -687,16 +680,9 @@ test_one(struct gallivm_state *gallivm,
if(fp)
write_tsv_row(fp, blend, mode, type, cycles_avg, success);
- if (!success) {
- if(verbose < 2)
- LLVMDumpModule(module);
- LLVMWriteBitcodeToFile(module, "blend.bc");
- fprintf(stderr, "blend.bc written\n");
- fprintf(stderr, "Invoke as \"llc -o - blend.bc\"\n");
- abort();
- }
+ gallivm_free_function(gallivm, func, blend_test_ptr);
- LLVMFreeMachineCodeForFunction(engine, func);
+ gallivm_destroy(gallivm);
return success;
}
@@ -753,7 +739,7 @@ const unsigned num_types = sizeof(blend_types)/sizeof(blend_types[0]);
boolean
-test_all(struct gallivm_state *gallivm, unsigned verbose, FILE *fp)
+test_all(unsigned verbose, FILE *fp)
{
const unsigned *rgb_func;
const unsigned *rgb_src_factor;
@@ -789,7 +775,7 @@ test_all(struct gallivm_state *gallivm, unsigned verbose, FILE *fp)
blend.rt[0].alpha_dst_factor = *alpha_dst_factor;
blend.rt[0].colormask = PIPE_MASK_RGBA;
- if(!test_one(gallivm, verbose, fp, &blend, mode, *type))
+ if(!test_one(verbose, fp, &blend, mode, *type))
success = FALSE;
}
@@ -806,7 +792,7 @@ test_all(struct gallivm_state *gallivm, unsigned verbose, FILE *fp)
boolean
-test_some(struct gallivm_state *gallivm, unsigned verbose, FILE *fp,
+test_some(unsigned verbose, FILE *fp,
unsigned long n)
{
const unsigned *rgb_func;
@@ -849,7 +835,7 @@ test_some(struct gallivm_state *gallivm, unsigned verbose, FILE *fp,
blend.rt[0].alpha_dst_factor = *alpha_dst_factor;
blend.rt[0].colormask = PIPE_MASK_RGBA;
- if(!test_one(gallivm, verbose, fp, &blend, mode, *type))
+ if(!test_one(verbose, fp, &blend, mode, *type))
success = FALSE;
}
@@ -858,7 +844,7 @@ test_some(struct gallivm_state *gallivm, unsigned verbose, FILE *fp,
boolean
-test_single(struct gallivm_state *gallivm, unsigned verbose, FILE *fp)
+test_single(unsigned verbose, FILE *fp)
{
printf("no test_single()");
return TRUE;
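A minimal sketch of the heap-allocated, vector-aligned test buffers the blend test switches to, assuming lp_type_width() returns the total vector width in bits (as its use for stride above implies); the helper itself is illustrative only.

#include <stdint.h>
#include "util/u_memory.h"
#include "gallivm/lp_bld_type.h"

/* AoS mode needs one vector per buffer, SoA one vector per R/G/B/A channel. */
static void
alloc_blend_buffers(struct lp_type type, uint8_t **aos_buf, uint8_t **soa_buf)
{
   const unsigned stride = lp_type_width(type) / 8;   /* bytes per vector */

   *aos_buf = align_malloc(stride, stride);
   *soa_buf = align_malloc(4 * stride, stride);
}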
diff --git a/src/gallium/drivers/llvmpipe/lp_test_conv.c b/src/gallium/drivers/llvmpipe/lp_test_conv.c
index 0dcb5422887..71d45bd5ce7 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_conv.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_conv.c
@@ -142,21 +142,21 @@ add_conv_test(struct gallivm_state *gallivm,
LLVMBuildRetVoid(builder);;
+ gallivm_verify_function(gallivm, func);
+
return func;
}
PIPE_ALIGN_STACK
static boolean
-test_one(struct gallivm_state *gallivm, unsigned verbose,
+test_one(unsigned verbose,
FILE *fp,
struct lp_type src_type,
struct lp_type dst_type)
{
- LLVMModuleRef module = gallivm->module;
- LLVMExecutionEngineRef engine = gallivm->engine;
+ struct gallivm_state *gallivm;
LLVMValueRef func = NULL;
- char *error = NULL;
conv_test_ptr_t conv_test_ptr;
boolean success;
const unsigned n = LP_TEST_NUM_SAMPLES;
@@ -166,10 +166,18 @@ test_one(struct gallivm_state *gallivm, unsigned verbose,
unsigned num_dsts;
double eps;
unsigned i, j;
- void *code;
- if (src_type.width * src_type.length != dst_type.width * dst_type.length &&
- src_type.length != dst_type.length) {
+ if ((src_type.width >= dst_type.width && src_type.length > dst_type.length) ||
+ (src_type.width <= dst_type.width && src_type.length < dst_type.length)) {
+ return TRUE;
+ }
+
+ /* Known failures
+ * - fixed point 32 -> float 32
+ * - float 32 -> signed normalised integer 32
+ */
+ if ((src_type.floating && !dst_type.floating && dst_type.sign && dst_type.norm && src_type.width == dst_type.width) ||
+ (!src_type.floating && dst_type.floating && src_type.fixed && src_type.width == dst_type.width)) {
return TRUE;
}
@@ -183,7 +191,7 @@ test_one(struct gallivm_state *gallivm, unsigned verbose,
}
if(verbose >= 1)
- dump_conv_types(stdout, src_type, dst_type);
+ dump_conv_types(stderr, src_type, dst_type);
if (src_type.length > dst_type.length) {
num_srcs = 1;
@@ -203,29 +211,20 @@ test_one(struct gallivm_state *gallivm, unsigned verbose,
eps = MAX2(lp_const_eps(src_type), lp_const_eps(dst_type));
- func = add_conv_test(gallivm, src_type, num_srcs, dst_type, num_dsts);
+ gallivm = gallivm_create();
- if(LLVMVerifyModule(module, LLVMPrintMessageAction, &error)) {
- LLVMDumpModule(module);
- abort();
- }
- LLVMDisposeMessage(error);
-
- if(verbose >= 2)
- LLVMDumpModule(module);
+ func = add_conv_test(gallivm, src_type, num_srcs, dst_type, num_dsts);
- code = LLVMGetPointerToGlobal(engine, func);
- conv_test_ptr = (conv_test_ptr_t)pointer_to_func(code);
+ gallivm_compile_module(gallivm);
- if(verbose >= 2)
- lp_disassemble(code);
+ conv_test_ptr = (conv_test_ptr_t)gallivm_jit_function(gallivm, func);
success = TRUE;
for(i = 0; i < n && success; ++i) {
unsigned src_stride = src_type.length*src_type.width/8;
unsigned dst_stride = dst_type.length*dst_type.width/8;
- PIPE_ALIGN_VAR(16) uint8_t src[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
- PIPE_ALIGN_VAR(16) uint8_t dst[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
+ PIPE_ALIGN_VAR(LP_MIN_VECTOR_ALIGN) uint8_t src[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
+ PIPE_ALIGN_VAR(LP_MIN_VECTOR_ALIGN) uint8_t dst[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
double fref[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
uint8_t ref[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
int64_t start_counter = 0;
@@ -320,20 +319,9 @@ test_one(struct gallivm_state *gallivm, unsigned verbose,
if(fp)
write_tsv_row(fp, src_type, dst_type, cycles_avg, success);
- if (!success) {
- static boolean firsttime = TRUE;
- if(firsttime) {
- if(verbose < 2)
- LLVMDumpModule(module);
- LLVMWriteBitcodeToFile(module, "conv.bc");
- fprintf(stderr, "conv.bc written\n");
- fprintf(stderr, "Invoke as \"llc -o - conv.bc\"\n");
- firsttime = FALSE;
- /* abort(); */
- }
- }
+ gallivm_free_function(gallivm, func, conv_test_ptr);
- LLVMFreeMachineCodeForFunction(engine, func);
+ gallivm_destroy(gallivm);
return success;
}
@@ -348,18 +336,33 @@ const struct lp_type conv_types[] = {
{ TRUE, FALSE, FALSE, TRUE, 32, 4 },
{ TRUE, FALSE, FALSE, FALSE, 32, 4 },
+ { TRUE, FALSE, TRUE, TRUE, 32, 8 },
+ { TRUE, FALSE, TRUE, FALSE, 32, 8 },
+ { TRUE, FALSE, FALSE, TRUE, 32, 8 },
+ { TRUE, FALSE, FALSE, FALSE, 32, 8 },
+
/* Fixed */
{ FALSE, TRUE, TRUE, TRUE, 32, 4 },
{ FALSE, TRUE, TRUE, FALSE, 32, 4 },
{ FALSE, TRUE, FALSE, TRUE, 32, 4 },
{ FALSE, TRUE, FALSE, FALSE, 32, 4 },
+ { FALSE, TRUE, TRUE, TRUE, 32, 8 },
+ { FALSE, TRUE, TRUE, FALSE, 32, 8 },
+ { FALSE, TRUE, FALSE, TRUE, 32, 8 },
+ { FALSE, TRUE, FALSE, FALSE, 32, 8 },
+
/* Integer */
{ FALSE, FALSE, TRUE, TRUE, 32, 4 },
{ FALSE, FALSE, TRUE, FALSE, 32, 4 },
{ FALSE, FALSE, FALSE, TRUE, 32, 4 },
{ FALSE, FALSE, FALSE, FALSE, 32, 4 },
+ { FALSE, FALSE, TRUE, TRUE, 32, 8 },
+ { FALSE, FALSE, TRUE, FALSE, 32, 8 },
+ { FALSE, FALSE, FALSE, TRUE, 32, 8 },
+ { FALSE, FALSE, FALSE, FALSE, 32, 8 },
+
{ FALSE, FALSE, TRUE, TRUE, 16, 8 },
{ FALSE, FALSE, TRUE, FALSE, 16, 8 },
{ FALSE, FALSE, FALSE, TRUE, 16, 8 },
@@ -381,7 +384,7 @@ const unsigned num_types = sizeof(conv_types)/sizeof(conv_types[0]);
boolean
-test_all(struct gallivm_state *gallivm, unsigned verbose, FILE *fp)
+test_all(unsigned verbose, FILE *fp)
{
const struct lp_type *src_type;
const struct lp_type *dst_type;
@@ -394,7 +397,7 @@ test_all(struct gallivm_state *gallivm, unsigned verbose, FILE *fp)
if(src_type == dst_type)
continue;
- if(!test_one(gallivm, verbose, fp, *src_type, *dst_type)){
+ if(!test_one(verbose, fp, *src_type, *dst_type)){
success = FALSE;
++error_count;
}
@@ -408,7 +411,7 @@ test_all(struct gallivm_state *gallivm, unsigned verbose, FILE *fp)
boolean
-test_some(struct gallivm_state *gallivm, unsigned verbose, FILE *fp,
+test_some(unsigned verbose, FILE *fp,
unsigned long n)
{
const struct lp_type *src_type;
@@ -423,7 +426,7 @@ test_some(struct gallivm_state *gallivm, unsigned verbose, FILE *fp,
dst_type = &conv_types[rand() % num_types];
} while (src_type == dst_type || src_type->norm != dst_type->norm);
- if(!test_one(gallivm, verbose, fp, *src_type, *dst_type))
+ if(!test_one(verbose, fp, *src_type, *dst_type))
success = FALSE;
}
@@ -432,7 +435,7 @@ test_some(struct gallivm_state *gallivm, unsigned verbose, FILE *fp,
boolean
-test_single(struct gallivm_state *gallivm, unsigned verbose, FILE *fp)
+test_single(unsigned verbose, FILE *fp)
{
/* float, fixed, sign, norm, width, len */
struct lp_type f32x4_type =
@@ -442,7 +445,7 @@ test_single(struct gallivm_state *gallivm, unsigned verbose, FILE *fp)
boolean success;
- success = test_one(gallivm, verbose, fp, f32x4_type, ub8x4_type);
+ success = test_one(verbose, fp, f32x4_type, ub8x4_type);
return success;
}
diff --git a/src/gallium/drivers/llvmpipe/lp_test_format.c b/src/gallium/drivers/llvmpipe/lp_test_format.c
index daf6ded29c7..34cbdbdd630 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_format.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_format.c
@@ -83,7 +83,6 @@ add_fetch_rgba_test(struct gallivm_state *gallivm, unsigned verbose,
LLVMContextRef context = gallivm->context;
LLVMModuleRef module = gallivm->module;
LLVMBuilderRef builder = gallivm->builder;
- LLVMPassManagerRef passmgr = gallivm->passmgr;
LLVMTypeRef args[4];
LLVMValueRef func;
LLVMValueRef packed_ptr;
@@ -120,16 +119,7 @@ add_fetch_rgba_test(struct gallivm_state *gallivm, unsigned verbose,
LLVMBuildRetVoid(builder);
- if (LLVMVerifyFunction(func, LLVMPrintMessageAction)) {
- LLVMDumpValue(func);
- abort();
- }
-
- LLVMRunFunctionPassManager(passmgr, func);
-
- if (verbose >= 1) {
- LLVMDumpValue(func);
- }
+ gallivm_verify_function(gallivm, func);
return func;
}
@@ -137,26 +127,24 @@ add_fetch_rgba_test(struct gallivm_state *gallivm, unsigned verbose,
PIPE_ALIGN_STACK
static boolean
-test_format_float(struct gallivm_state *gallivm, unsigned verbose, FILE *fp,
+test_format_float(unsigned verbose, FILE *fp,
const struct util_format_description *desc)
{
+ struct gallivm_state *gallivm;
LLVMValueRef fetch = NULL;
- LLVMExecutionEngineRef engine = gallivm->engine;
fetch_ptr_t fetch_ptr;
PIPE_ALIGN_VAR(16) float unpacked[4];
boolean first = TRUE;
boolean success = TRUE;
unsigned i, j, k, l;
- void *f;
+
+ gallivm = gallivm_create();
fetch = add_fetch_rgba_test(gallivm, verbose, desc, lp_float32_vec4_type());
- f = LLVMGetPointerToGlobal(engine, fetch);
- fetch_ptr = (fetch_ptr_t) pointer_to_func(f);
+ gallivm_compile_module(gallivm);
- if (verbose >= 2) {
- lp_disassemble(f);
- }
+ fetch_ptr = (fetch_ptr_t) gallivm_jit_function(gallivm, fetch);
for (l = 0; l < util_format_nr_test_cases; ++l) {
const struct util_format_test_case *test = &util_format_test_cases[l];
@@ -171,25 +159,35 @@ test_format_float(struct gallivm_state *gallivm, unsigned verbose, FILE *fp,
for (i = 0; i < desc->block.height; ++i) {
for (j = 0; j < desc->block.width; ++j) {
- boolean match;
+ boolean match = TRUE;
memset(unpacked, 0, sizeof unpacked);
fetch_ptr(unpacked, test->packed, j, i);
- match = TRUE;
- for(k = 0; k < 4; ++k)
- if (fabs((float)test->unpacked[i][j][k] - unpacked[k]) > FLT_EPSILON)
+ for(k = 0; k < 4; ++k) {
+ if (util_double_inf_sign(test->unpacked[i][j][k]) != util_inf_sign(unpacked[k])) {
match = FALSE;
+ }
+
+ if (util_is_double_nan(test->unpacked[i][j][k]) != util_is_nan(unpacked[k])) {
+ match = FALSE;
+ }
+
+ if (!util_is_double_inf_or_nan(test->unpacked[i][j][k]) &&
+ fabs((float)test->unpacked[i][j][k] - unpacked[k]) > FLT_EPSILON) {
+ match = FALSE;
+ }
+ }
if (!match) {
printf("FAILED\n");
printf(" Packed: %02x %02x %02x %02x\n",
test->packed[0], test->packed[1], test->packed[2], test->packed[3]);
- printf(" Unpacked (%u,%u): %f %f %f %f obtained\n",
+ printf(" Unpacked (%u,%u): %.9g %.9g %.9g %.9g obtained\n",
j, i,
unpacked[0], unpacked[1], unpacked[2], unpacked[3]);
- printf(" %f %f %f %f expected\n",
+ printf(" %.9g %.9g %.9g %.9g expected\n",
test->unpacked[i][j][0],
test->unpacked[i][j][1],
test->unpacked[i][j][2],
@@ -201,14 +199,9 @@ test_format_float(struct gallivm_state *gallivm, unsigned verbose, FILE *fp,
}
}
- if (!success) {
- if (verbose < 1) {
- LLVMDumpValue(fetch);
- }
- }
+ gallivm_free_function(gallivm, fetch, fetch_ptr);
- LLVMFreeMachineCodeForFunction(engine, fetch);
- LLVMDeleteFunction(fetch);
+ gallivm_destroy(gallivm);
if(fp)
write_tsv_row(fp, desc, success);
@@ -219,26 +212,24 @@ test_format_float(struct gallivm_state *gallivm, unsigned verbose, FILE *fp,
PIPE_ALIGN_STACK
static boolean
-test_format_unorm8(struct gallivm_state *gallivm,
- unsigned verbose, FILE *fp,
+test_format_unorm8(unsigned verbose, FILE *fp,
const struct util_format_description *desc)
{
+ struct gallivm_state *gallivm;
LLVMValueRef fetch = NULL;
fetch_ptr_t fetch_ptr;
uint8_t unpacked[4];
boolean first = TRUE;
boolean success = TRUE;
unsigned i, j, k, l;
- void *f;
+
+ gallivm = gallivm_create();
fetch = add_fetch_rgba_test(gallivm, verbose, desc, lp_unorm8_vec4_type());
- f = LLVMGetPointerToGlobal(gallivm->engine, fetch);
- fetch_ptr = (fetch_ptr_t) pointer_to_func(f);
+ gallivm_compile_module(gallivm);
- if (verbose >= 2) {
- lp_disassemble(f);
- }
+ fetch_ptr = (fetch_ptr_t) gallivm_jit_function(gallivm, fetch);
for (l = 0; l < util_format_nr_test_cases; ++l) {
const struct util_format_test_case *test = &util_format_test_cases[l];
@@ -285,6 +276,7 @@ test_format_unorm8(struct gallivm_state *gallivm,
float_to_ubyte(test->unpacked[i][j][1]),
float_to_ubyte(test->unpacked[i][j][2]),
float_to_ubyte(test->unpacked[i][j][3]));
+
success = FALSE;
}
}
@@ -292,11 +284,9 @@ test_format_unorm8(struct gallivm_state *gallivm,
}
}
- if (!success)
- LLVMDumpValue(fetch);
+ gallivm_free_function(gallivm, fetch, fetch_ptr);
- LLVMFreeMachineCodeForFunction(gallivm->engine, fetch);
- LLVMDeleteFunction(fetch);
+ gallivm_destroy(gallivm);
if(fp)
write_tsv_row(fp, desc, success);
@@ -308,17 +298,16 @@ test_format_unorm8(struct gallivm_state *gallivm,
static boolean
-test_one(struct gallivm_state *gallivm,
- unsigned verbose, FILE *fp,
+test_one(unsigned verbose, FILE *fp,
const struct util_format_description *format_desc)
{
boolean success = TRUE;
- if (!test_format_float(gallivm, verbose, fp, format_desc)) {
+ if (!test_format_float(verbose, fp, format_desc)) {
success = FALSE;
}
- if (!test_format_unorm8(gallivm, verbose, fp, format_desc)) {
+ if (!test_format_unorm8(verbose, fp, format_desc)) {
success = FALSE;
}
@@ -327,7 +316,7 @@ test_one(struct gallivm_state *gallivm,
boolean
-test_all(struct gallivm_state *gallivm, unsigned verbose, FILE *fp)
+test_all(unsigned verbose, FILE *fp)
{
enum pipe_format format;
boolean success = TRUE;
@@ -359,7 +348,7 @@ test_all(struct gallivm_state *gallivm, unsigned verbose, FILE *fp)
continue;
}
- if (!test_one(gallivm, verbose, fp, format_desc)) {
+ if (!test_one(verbose, fp, format_desc)) {
success = FALSE;
}
}
@@ -369,15 +358,15 @@ test_all(struct gallivm_state *gallivm, unsigned verbose, FILE *fp)
boolean
-test_some(struct gallivm_state *gallivm, unsigned verbose, FILE *fp,
+test_some(unsigned verbose, FILE *fp,
unsigned long n)
{
- return test_all(gallivm, verbose, fp);
+ return test_all(verbose, fp);
}
boolean
-test_single(struct gallivm_state *gallivm, unsigned verbose, FILE *fp)
+test_single(unsigned verbose, FILE *fp)
{
printf("no test_single()");
return TRUE;
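
The rewritten match loop in test_format_float above no longer relies on a plain epsilon test: signed infinities must agree, NaN-ness must agree, and only finite expected values are compared against FLT_EPSILON. The sketch below restates that rule with standard isinf()/isnan(); the diff itself uses Mesa's util_double_inf_sign()/util_inf_sign() and util_is_double_nan()/util_is_nan() helpers, so this is an approximation for illustration only.

    #include <math.h>
    #include <float.h>
    #include <stdbool.h>

    /* Infinities must match in sign, NaNs must match as NaNs, and only
     * finite values are compared against FLT_EPSILON. */
    static bool
    values_match(double expected, float obtained)
    {
       if (isinf(expected) || isinf(obtained))
          return isinf(expected) && isinf(obtained) &&
                 (expected > 0) == (obtained > 0);   /* same signed infinity */

       if (isnan(expected) || isnan(obtained))
          return isnan(expected) && isnan(obtained); /* both NaN */

       return fabs(expected - (double) obtained) <= FLT_EPSILON;
    }
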
diff --git a/src/gallium/drivers/llvmpipe/lp_test_main.c b/src/gallium/drivers/llvmpipe/lp_test_main.c
index d229c620310..4c610923146 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_main.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_main.c
@@ -39,6 +39,7 @@
#include "gallivm/lp_bld_const.h"
#include "gallivm/lp_bld_init.h"
+#include "gallivm/lp_bld_debug.h"
#include "lp_test.h"
@@ -369,7 +370,6 @@ int main(int argc, char **argv)
unsigned i;
boolean success;
boolean single = FALSE;
- struct gallivm_state *gallivm;
for(i = 1; i < argc; ++i) {
if(strcmp(argv[i], "-v") == 0)
@@ -384,23 +384,28 @@ int main(int argc, char **argv)
lp_build_init();
- gallivm = gallivm_create();
+#ifdef DEBUG
+ if (verbose >= 2) {
+ gallivm_debug |= GALLIVM_DEBUG_IR;
+ gallivm_debug |= GALLIVM_DEBUG_ASM;
+ }
+#endif
util_cpu_detect();
if(fp) {
/* Warm up the caches */
- test_some(gallivm, 0, NULL, 100);
+ test_some(0, NULL, 100);
write_tsv_header(fp);
}
if (single)
- success = test_single(gallivm, verbose, fp);
+ success = test_single(verbose, fp);
else if (n)
- success = test_some(gallivm, verbose, fp, n);
+ success = test_some(verbose, fp, n);
else
- success = test_all(gallivm, verbose, fp);
+ success = test_all(verbose, fp);
if(fp)
fclose(fp);
diff --git a/src/gallium/drivers/llvmpipe/lp_test_printf.c b/src/gallium/drivers/llvmpipe/lp_test_printf.c
index 620cdb57c13..c483de94d40 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_printf.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_printf.c
@@ -78,66 +78,61 @@ add_printf_test(struct gallivm_state *gallivm)
LLVMBuildRetVoid(builder);
+ gallivm_verify_function(gallivm, func);
+
return func;
}
PIPE_ALIGN_STACK
static boolean
-test_printf(struct gallivm_state *gallivm,
- unsigned verbose, FILE *fp,
+test_printf(unsigned verbose, FILE *fp,
const struct printf_test_case *testcase)
{
- LLVMExecutionEngineRef engine = gallivm->engine;
- LLVMModuleRef module = gallivm->module;
+ struct gallivm_state *gallivm;
LLVMValueRef test;
- char *error = NULL;
test_printf_t test_printf_func;
boolean success = TRUE;
- void *code;
- test = add_printf_test(gallivm);
+ gallivm = gallivm_create();
- if(LLVMVerifyModule(module, LLVMPrintMessageAction, &error)) {
- LLVMDumpModule(module);
- abort();
- }
- LLVMDisposeMessage(error);
+ test = add_printf_test(gallivm);
- code = LLVMGetPointerToGlobal(engine, test);
- test_printf_func = (test_printf_t) pointer_to_func(code);
+ gallivm_compile_module(gallivm);
- // LLVMDumpModule(module);
+ test_printf_func = (test_printf_t) gallivm_jit_function(gallivm, test);
test_printf_func(0);
- LLVMFreeMachineCodeForFunction(engine, test);
+ gallivm_free_function(gallivm, test, test_printf_func);
+
+ gallivm_destroy(gallivm);
return success;
}
boolean
-test_all(struct gallivm_state *gallivm, unsigned verbose, FILE *fp)
+test_all(unsigned verbose, FILE *fp)
{
boolean success = TRUE;
- test_printf(gallivm, verbose, fp, NULL);
+ test_printf(verbose, fp, NULL);
return success;
}
boolean
-test_some(struct gallivm_state *gallivm, unsigned verbose, FILE *fp,
+test_some(unsigned verbose, FILE *fp,
unsigned long n)
{
- return test_all(gallivm, verbose, fp);
+ return test_all(verbose, fp);
}
boolean
-test_single(struct gallivm_state *gallivm, unsigned verbose, FILE *fp)
+test_single(unsigned verbose, FILE *fp)
{
printf("no test_single()");
return TRUE;
diff --git a/src/gallium/drivers/llvmpipe/lp_test_round.c b/src/gallium/drivers/llvmpipe/lp_test_round.c
deleted file mode 100644
index fc3edf372d5..00000000000
--- a/src/gallium/drivers/llvmpipe/lp_test_round.c
+++ /dev/null
@@ -1,242 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2010 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-#include <stdlib.h>
-#include <stdio.h>
-
-#include "util/u_pointer.h"
-#include "gallivm/lp_bld.h"
-#include "gallivm/lp_bld_init.h"
-#include "gallivm/lp_bld_arit.h"
-
-#include "lp_test.h"
-
-
-void
-write_tsv_header(FILE *fp)
-{
- fprintf(fp,
- "result\t"
- "format\n");
-
- fflush(fp);
-}
-
-
-#ifdef PIPE_ARCH_SSE
-
-# include <emmintrin.h>
-
-typedef __m128 (*test_round_t)(__m128);
-
-typedef LLVMValueRef (*lp_func_t)(struct lp_build_context *, LLVMValueRef);
-
-
-static LLVMValueRef
-add_test(struct gallivm_state *gallivm, const char *name, lp_func_t lp_func)
-{
- LLVMModuleRef module = gallivm->module;
- LLVMContextRef context = gallivm->context;
- LLVMBuilderRef builder = gallivm->builder;
-
- LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatTypeInContext(context), 4);
- LLVMTypeRef args[1] = { v4sf };
- LLVMValueRef func = LLVMAddFunction(module, name, LLVMFunctionType(v4sf, args, 1, 0));
- LLVMValueRef arg1 = LLVMGetParam(func, 0);
- LLVMBasicBlockRef block = LLVMAppendBasicBlockInContext(context, func, "entry");
- LLVMValueRef ret;
- struct lp_build_context bld;
-
- lp_build_context_init(&bld, gallivm, lp_float32_vec4_type());
-
- LLVMSetFunctionCallConv(func, LLVMCCallConv);
-
- LLVMPositionBuilderAtEnd(builder, block);
-
- ret = lp_func(&bld, arg1);
-
- LLVMBuildRet(builder, ret);
-
- return func;
-}
-
-static void
-printv(char* string, __m128 value)
-{
- __m128 v = value;
- float *f = (float *)&v;
- printf("%s: %10f %10f %10f %10f\n", string,
- f[0], f[1], f[2], f[3]);
-}
-
-static boolean
-compare(__m128 x, __m128 y)
-{
- boolean success = TRUE;
- float *xp = (float *) &x;
- float *yp = (float *) &y;
- if (xp[0] != yp[0] ||
- xp[1] != yp[1] ||
- xp[2] != yp[2] ||
- xp[3] != yp[3]) {
- printf(" Incorrect result! ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ \n");
- success = FALSE;
- }
- return success;
-}
-
-
-
-PIPE_ALIGN_STACK
-static boolean
-test_round(struct gallivm_state *gallivm, unsigned verbose, FILE *fp)
-{
- LLVMModuleRef module = gallivm->module;
- LLVMValueRef test_round = NULL, test_trunc, test_floor, test_ceil;
- LLVMExecutionEngineRef engine = gallivm->engine;
- char *error = NULL;
- test_round_t round_func, trunc_func, floor_func, ceil_func;
- float unpacked[4];
- boolean success = TRUE;
- int i;
-
- test_round = add_test(gallivm, "round", lp_build_round);
- test_trunc = add_test(gallivm, "trunc", lp_build_trunc);
- test_floor = add_test(gallivm, "floor", lp_build_floor);
- test_ceil = add_test(gallivm, "ceil", lp_build_ceil);
-
- if(LLVMVerifyModule(module, LLVMPrintMessageAction, &error)) {
- printf("LLVMVerifyModule: %s\n", error);
- LLVMDumpModule(module);
- abort();
- }
- LLVMDisposeMessage(error);
-
- round_func = (test_round_t) pointer_to_func(LLVMGetPointerToGlobal(engine, test_round));
- trunc_func = (test_round_t) pointer_to_func(LLVMGetPointerToGlobal(engine, test_trunc));
- floor_func = (test_round_t) pointer_to_func(LLVMGetPointerToGlobal(engine, test_floor));
- ceil_func = (test_round_t) pointer_to_func(LLVMGetPointerToGlobal(engine, test_ceil));
-
- memset(unpacked, 0, sizeof unpacked);
-
- if (0)
- LLVMDumpModule(module);
-
- for (i = 0; i < 3; i++) {
- /* NOTE: There are several acceptable rules for x.5 rounding: ceiling,
- * nearest even, etc. So we avoid testing such corner cases here.
- */
- __m128 xvals[3] = {
- {-10.0, -1, 0, 12.0},
- {-1.49, -0.25, 1.25, 2.51},
- {-0.99, -0.01, 0.01, 0.99}
- };
- __m128 x = xvals[i];
- __m128 y, ref;
- float *xp = (float *) &x;
- float *refp = (float *) &ref;
-
- printf("\n");
- printv("x ", x);
-
- refp[0] = round(xp[0]);
- refp[1] = round(xp[1]);
- refp[2] = round(xp[2]);
- refp[3] = round(xp[3]);
- y = round_func(x);
- printv("C round(x) ", ref);
- printv("LLVM round(x)", y);
- success = success && compare(ref, y);
-
- refp[0] = trunc(xp[0]);
- refp[1] = trunc(xp[1]);
- refp[2] = trunc(xp[2]);
- refp[3] = trunc(xp[3]);
- y = trunc_func(x);
- printv("C trunc(x) ", ref);
- printv("LLVM trunc(x)", y);
- success = success && compare(ref, y);
-
- refp[0] = floor(xp[0]);
- refp[1] = floor(xp[1]);
- refp[2] = floor(xp[2]);
- refp[3] = floor(xp[3]);
- y = floor_func(x);
- printv("C floor(x) ", ref);
- printv("LLVM floor(x)", y);
- success = success && compare(ref, y);
-
- refp[0] = ceil(xp[0]);
- refp[1] = ceil(xp[1]);
- refp[2] = ceil(xp[2]);
- refp[3] = ceil(xp[3]);
- y = ceil_func(x);
- printv("C ceil(x) ", ref);
- printv("LLVM ceil(x) ", y);
- success = success && compare(ref, y);
- }
-
- LLVMFreeMachineCodeForFunction(engine, test_round);
- LLVMFreeMachineCodeForFunction(engine, test_trunc);
- LLVMFreeMachineCodeForFunction(engine, test_floor);
- LLVMFreeMachineCodeForFunction(engine, test_ceil);
-
- return success;
-}
-
-#else /* !PIPE_ARCH_SSE */
-
-static boolean
-test_round(struct gallivm_state *gallivm, unsigned verbose, FILE *fp)
-{
- return TRUE;
-}
-
-#endif /* !PIPE_ARCH_SSE */
-
-
-boolean
-test_all(struct gallivm_state *gallivm, unsigned verbose, FILE *fp)
-{
- return test_round(gallivm, verbose, fp);
-}
-
-
-boolean
-test_some(struct gallivm_state *gallivm, unsigned verbose, FILE *fp,
- unsigned long n)
-{
- return test_all(gallivm, verbose, fp);
-}
-
-boolean
-test_single(struct gallivm_state *gallivm, unsigned verbose, FILE *fp)
-{
- printf("no test_single()");
- return TRUE;
-}
diff --git a/src/gallium/drivers/llvmpipe/lp_tex_sample.c b/src/gallium/drivers/llvmpipe/lp_tex_sample.c
index daa96f20c7e..9151e427ba7 100644
--- a/src/gallium/drivers/llvmpipe/lp_tex_sample.c
+++ b/src/gallium/drivers/llvmpipe/lp_tex_sample.c
@@ -178,8 +178,7 @@ lp_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base,
unsigned unit,
unsigned num_coords,
const LLVMValueRef *coords,
- const LLVMValueRef *ddx,
- const LLVMValueRef *ddy,
+ const struct lp_derivatives *derivs,
LLVMValueRef lod_bias, /* optional */
LLVMValueRef explicit_lod, /* optional */
LLVMValueRef *texel)
@@ -189,7 +188,7 @@ lp_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base,
assert(unit < PIPE_MAX_SAMPLERS);
if (LP_PERF & PERF_NO_TEX) {
- lp_build_sample_nop(gallivm, type, texel);
+ lp_build_sample_nop(gallivm, type, num_coords, coords, texel);
return;
}
@@ -199,7 +198,7 @@ lp_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base,
type,
unit,
num_coords, coords,
- ddx, ddy,
+ derivs,
lod_bias, explicit_lod,
texel);
}
@@ -210,6 +209,7 @@ lp_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base,
static void
lp_llvm_sampler_soa_emit_size_query(const struct lp_build_sampler_soa *base,
struct gallivm_state *gallivm,
+ struct lp_type type,
unsigned unit,
LLVMValueRef explicit_lod, /* optional */
LLVMValueRef *sizes_out)
@@ -221,6 +221,7 @@ lp_llvm_sampler_soa_emit_size_query(const struct lp_build_sampler_soa *base,
lp_build_size_query_soa(gallivm,
&sampler->dynamic_state.static_state[unit],
&sampler->dynamic_state.base,
+ type,
unit,
explicit_lod,
sizes_out);
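
The lp_tex_sample.c hunks replace the separate ddx/ddy argument arrays of the fetch-texel hook with a single const struct lp_derivatives pointer, and thread an lp_type through the size query. The struct definition is not shown in this part of the diff; a plausible shape, assuming it simply bundles the per-coordinate partial derivatives that previously travelled as two LLVMValueRef arrays, would be:

    /* Assumed layout only -- the real definition lives in
     * gallivm/lp_bld_sample.h and is not part of the hunks above. */
    struct lp_derivatives
    {
       LLVMValueRef ddx[3];   /* d/dx per texture coordinate */
       LLVMValueRef ddy[3];   /* d/dy per texture coordinate */
    };

Passing one pointer keeps the sampler callback signature stable if further derivative-related fields are added later, instead of growing the argument list again.
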