#define _GNU_SOURCE #include #include #include #include #include "simplex86.h" #include "crc32.h" #include "regnaive.h" #define N_ELEMENTS(a) \ (sizeof (a) / sizeof (a[0])) typedef struct { pixman_op_t op; pixman_image_t * src_image; pixman_image_t * mask_image; pixman_image_t * dest_image; int32_t src_x; int32_t src_y; int32_t mask_x; int32_t mask_y; int32_t dest_x; int32_t dest_y; int32_t width; int32_t height; uint32_t src_flags; uint32_t mask_flags; uint32_t dest_flags; } pixman_composite_info_t; typedef struct image_common image_common_t; typedef struct solid_fill solid_fill_t; typedef struct gradient gradient_t; typedef struct linear_gradient linear_gradient_t; typedef struct horizontal_gradient horizontal_gradient_t; typedef struct vertical_gradient vertical_gradient_t; typedef struct conical_gradient conical_gradient_t; typedef struct radial_gradient radial_gradient_t; typedef struct bits_image bits_image_t; typedef struct circle circle_t; typedef struct argb_t argb_t; struct argb_t { float a; float r; float g; float b; }; typedef void (*fetch_scanline_t) (bits_image_t *image, int x, int y, int width, uint32_t *buffer, const uint32_t *mask); typedef void (*store_scanline_t) (bits_image_t * image, int x, int y, int width, const uint32_t *values); typedef enum { BITS, LINEAR, CONICAL, RADIAL, SOLID } image_type_t; typedef void (*property_changed_func_t) (pixman_image_t *image); struct image_common { image_type_t type; int32_t ref_count; pixman_region32_t clip_region; int32_t alpha_count; /* How many times this image is being used as an alpha map */ pixman_bool_t have_clip_region; /* FALSE if there is no clip */ pixman_bool_t client_clip; /* Whether the source clip was set by a client */ pixman_bool_t clip_sources; /* Whether the clip applies when * the image is used as a source */ pixman_bool_t dirty; pixman_transform_t * transform; pixman_repeat_t repeat; pixman_filter_t filter; pixman_fixed_t * filter_params; int n_filter_params; bits_image_t * alpha_map; int alpha_origin_x; int alpha_origin_y; pixman_bool_t component_alpha; property_changed_func_t property_changed; pixman_image_destroy_func_t destroy_func; void * destroy_data; uint32_t flags; pixman_format_code_t extended_format_code; }; struct solid_fill { image_common_t common; pixman_color_t color; uint32_t color_32; argb_t color_float; }; struct gradient { image_common_t common; int n_stops; pixman_gradient_stop_t *stops; }; struct linear_gradient { gradient_t common; pixman_point_fixed_t p1; pixman_point_fixed_t p2; }; struct circle { pixman_fixed_t x; pixman_fixed_t y; pixman_fixed_t radius; }; struct radial_gradient { gradient_t common; circle_t c1; circle_t c2; circle_t delta; double a; double inva; double mindr; }; struct conical_gradient { gradient_t common; pixman_point_fixed_t center; double angle; }; struct bits_image { image_common_t common; pixman_format_code_t format; const pixman_indexed_t * indexed; int width; int height; uint32_t * bits; uint32_t * free_me; int rowstride; /* in number of uint32_t's */ fetch_scanline_t fetch_scanline_32; store_scanline_t store_scanline_32; fetch_scanline_t fetch_scanline_float; store_scanline_t store_scanline_float; /* Used for indirect access to the bits */ pixman_read_memory_func_t read_func; pixman_write_memory_func_t write_func; }; union pixman_image { image_type_t type; image_common_t common; bits_image_t bits; gradient_t gradient; linear_gradient_t linear; conical_gradient_t conical; radial_gradient_t radial; solid_fill_t solid; }; typedef struct jit_dest_iter_t jit_dest_iter_t; typedef struct jit_src_iter_t jit_src_iter_t; typedef struct jit_combiner_t jit_combiner_t; typedef struct jit_t jit_t; struct jit_t { assembler_t *assembler; fragment_t *fragment; reg_alloc_t gp_allocator; reg_alloc_t xmm_allocator; }; /* * The structure of a composite loop: * * [prologue] * outer: * [pre-line] * * inner_1: * [mask.load_pixels] * [mask.advance_pixels] * * [src.load_pixels] * [src.advance_pixels] * * [dest.load_pixels] * * [combine] * * [dest.store_pixels] * [dest.advance_pixels] * jcc inner1 * * ... * * inner_n: * [src.load_pixels] * [src.advance_pixels] * * [dest.load_pixels] * * [combine] * * [dest.store_pixels] * [dest.advance_pixels] * jcc inner_n * * [post-line] * * jcc outer * * The methods that can be called more than once per line need to have * "first" and "last" arguments so that they can allocate and free as * required. It's not good enough to do this in pre-line and post-line * because that will potentially lead to too much spilling/restoring. */ struct jit_src_iter_t { void (* begin) (jit_src_iter_t *src, jit_t *jit, reg_t info); void (* begin_line) (jit_src_iter_t *src, jit_t *jit); reg_t (* load_pixels) (jit_src_iter_t *src, jit_t *jit, int n_pixels); void (* advance_pixels) (jit_src_iter_t *src, jit_t *jit, int n_pixels); void (* end_line) (jit_src_iter_t *src, jit_t *jit); void (* end) (jit_src_iter_t *src, jit_t *jit); reg_t line; int *line_save; reg_t stride; int *stride_save; reg_t s; int *s_save; }; struct jit_combiner_t { void (* combine) (jit_combiner_t *combiner, jit_t *jit, jit_src_iter_t *src, jit_src_iter_t *mask, jit_dest_iter_t *dest, int n_pixels); }; struct jit_dest_iter_t { void (* begin) (jit_dest_iter_t *dest, jit_t *jit, reg_t info); void (* process_line) (jit_dest_iter_t *dest_iter, jit_t *jit, jit_src_iter_t *src_iter, jit_src_iter_t *mask_iter, jit_combiner_t *combiner); reg_t (* load_pixels) (jit_dest_iter_t *dest, jit_t *jit, int n_pixels); void (* store_pixels) (jit_dest_iter_t *dest, jit_t *jit, int n_pixels, reg_t reg); void (* advance_pixels) (jit_dest_iter_t *dest, jit_t *jit, int n_pixels); void (* end) (jit_dest_iter_t *dest, jit_t *jit); reg_t line; reg_t stride; reg_t d; reg_t w; reg_t width; }; #define MEMBER(variable, type, member) \ BASE((variable), offsetof (type, member)) static const reg_pool_t xmm_pool = { 16, { xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, }, }; static const reg_pool_t gp64_pool = { 14, { rsi, rdi, r8, r9, r10, r11, rcx, rdx, rax, rbx, r12, r13, r14, r15, } }; jit_t * jit_new (void) { jit_t *jit = malloc (sizeof (jit_t)); jit->assembler = assembler_new ("pixman"); jit->fragment = fragment_new (jit->assembler); reg_alloc_init (&jit->xmm_allocator, &xmm_pool); reg_alloc_init (&jit->gp_allocator, &gp64_pool); return jit; } reg_t jit_alloc_gp (jit_t *jit) { return reg_alloc_alloc (&jit->gp_allocator); } reg_t jit_alloc_xmm (jit_t *jit) { return reg_alloc_alloc (&jit->xmm_allocator); } reg_t jit_preserve_gp (jit_t *jit, reg_t reg) { return reg_alloc_alloc_preserve (&jit->gp_allocator, reg); } reg_t jit_preserve_xmm (jit_t *jit, reg_t reg) { return reg_alloc_alloc_preserve (&jit->xmm_allocator, reg); } void jit_free_gp (jit_t *jit, reg_t reg) { reg_alloc_free (&jit->gp_allocator, reg); } void jit_free_xmm (jit_t *jit, reg_t reg) { reg_alloc_free (&jit->xmm_allocator, reg); } static void src_a8r8g8b8_begin (jit_src_iter_t *src, jit_t *jit, reg_t info) { reg_t image, tmp; jit_preserve_gp (jit, info); src->line = jit_alloc_gp (jit); src->stride = jit_alloc_gp (jit); image = jit_alloc_gp (jit); tmp = jit_alloc_gp (jit); BEGIN_ASM (jit->fragment) I_mov, image, MEMBER (info, pixman_composite_info_t, src_image), /* Stride */ I_mov, src->stride, MEMBER (image, pixman_image_t, bits.rowstride), I_shl, src->stride, IMM (2), /* Line */ I_mov, tmp, src->stride, I_imul2, tmp, MEMBER (info, pixman_composite_info_t, src_y), I_mov, src->line, MEMBER (info, pixman_composite_info_t, src_x), I_lea, src->line, INDEX(tmp, 0, src->line, 2), I_add, src->line, MEMBER(image, pixman_image_t, bits.bits), END_ASM (); jit_free_gp (jit, info); jit_free_gp (jit, image); jit_free_gp (jit, tmp); } static void src_a8r8g8b8_begin_line (jit_src_iter_t *src, jit_t *jit) { src->s = jit_alloc_gp (jit); BEGIN_ASM (jit->fragment) I_mov, src->s, src->line, I_add, src->line, src->stride, END_ASM (); } static reg_t src_a8r8g8b8_load_pixels (jit_src_iter_t *src, jit_t *jit, int n_pixels) { instruction_t move; reg_t r; int size; r = jit_alloc_xmm (jit); switch (n_pixels) { case 1: move = I_movd; size = DWORD_PTR; break; case 2: move = I_movq; size = QWORD_PTR; break; case 4: move = I_movdqu; size = 0; break; } BEGIN_ASM (jit->fragment) move, r, size + PTR (src->s), END_ASM (); jit_free_xmm (jit, r); return r; } static void src_a8r8g8b8_advance_pixels (jit_src_iter_t *src, jit_t *jit, int n_pixels) { int n_bytes = n_pixels * 4; BEGIN_ASM (jit->fragment) I_add, src->s, IMM (n_bytes), END_ASM (); } static void src_a8r8g8b8_end_line (jit_src_iter_t *src, jit_t *jit) { jit_free_gp (jit, src->s); } static void src_a8r8g8b8_end (jit_src_iter_t *src, jit_t *jit) { jit_free_gp (jit, src->stride); jit_free_gp (jit, src->line); } jit_src_iter_t * src_iter_create_a8r8g8b8 (void) { jit_src_iter_t *iter = malloc (sizeof *iter); /* FIXME OOM */ iter->begin = src_a8r8g8b8_begin; iter->begin_line = src_a8r8g8b8_begin_line; iter->load_pixels = src_a8r8g8b8_load_pixels; iter->advance_pixels = src_a8r8g8b8_advance_pixels; iter->end_line = src_a8r8g8b8_end_line; iter->end = src_a8r8g8b8_end; return iter; } /* Dest iter */ static void dest_a8r8g8b8_begin (jit_dest_iter_t * dest, jit_t * jit, reg_t info) { reg_t image, tmp; info = jit_preserve_gp (jit, info); dest->line = jit_alloc_gp (jit); dest->stride = jit_alloc_gp (jit); image = jit_alloc_gp (jit); tmp = jit_alloc_gp (jit); BEGIN_ASM (jit->fragment) I_mov, image, MEMBER (info, pixman_composite_info_t, dest_image), /* Stride */ I_mov, dest->stride, MEMBER (image, pixman_image_t, bits.rowstride), I_shl, dest->stride, IMM (2), /* Line */ I_mov, tmp, dest->stride, I_imul2, tmp, MEMBER (info, pixman_composite_info_t, dest_y), I_mov, dest->line, MEMBER (info, pixman_composite_info_t, dest_x), I_lea, dest->line, INDEX(tmp, 0, dest->line, 2), I_add, dest->line, MEMBER(image, pixman_image_t, bits.bits), END_ASM (); jit_free_gp (jit, info); jit_free_gp (jit, image); jit_free_gp (jit, tmp); dest->width = jit_alloc_gp (jit); BEGIN_ASM (jit->fragment) I_mov, dest->width, MEMBER (info, pixman_composite_info_t, width), END_ASM (); } static void dest_a8r8g8b8_process_line (jit_dest_iter_t *dest, jit_t *jit, jit_src_iter_t *src, jit_src_iter_t *mask, jit_combiner_t *combiner) { int n_pixels[] = { 1, 2, 4, 2, 1 }; int i; src->begin_line (src, jit); if (mask) mask->begin_line (mask, jit); dest->d = jit_alloc_gp (jit); dest->w = jit_alloc_gp (jit); BEGIN_ASM (jit->fragment) I_mov, dest->d, dest->line, I_mov, dest->w, dest->width, I_add, dest->line, dest->stride, END_ASM (); for (i = 0; i < sizeof (n_pixels) / sizeof (n_pixels[0]); ++i) { char loop[32] = { 0 }; char test[32] = { 0 }; char done[32] = { 0 }; snprintf (loop, sizeof (loop), "horz_%d_loop", i); snprintf (test, sizeof (test), "horz_%d_test", i); snprintf (done, sizeof (done), "horz_%d_done", i); BEGIN_ASM (jit->fragment) I_jmp, LABEL (test), END_ASM(); if (i == 2) { /* Cache-line align the main loop */ BEGIN_ASM (jit->fragment) I_align, IMM (64), END_ASM (); } BEGIN_ASM (jit->fragment) DEFINE_LABEL (loop), I_sub, dest->w, IMM (n_pixels[i]), END_ASM (); combiner->combine (combiner, jit, src, mask, dest, n_pixels[i]); BEGIN_ASM (jit->fragment) DEFINE_LABEL (test), END_ASM (); if (i < 2) { BEGIN_ASM (jit->fragment) /* If aligned properly, skip to next block */ I_test, dest->d, IMM (n_pixels[i] * 4 * 2 - 1), I_jz, LABEL (done), END_ASM (); } BEGIN_ASM (jit->fragment) I_cmp, dest->w, IMM (n_pixels[i]), I_jge, LABEL (loop), DEFINE_LABEL (done), END_ASM (); } jit_free_gp (jit, dest->d); jit_free_gp (jit, dest->w); } static reg_t dest_a8r8g8b8_load_pixels (jit_dest_iter_t *dest, jit_t *jit, int n_pixels) { instruction_t move; int size; reg_t r; r = jit_alloc_xmm (jit); switch (n_pixels) { case 1: move = I_movd; size = DWORD_PTR; break; case 2: move = I_movq; size = QWORD_PTR; break; case 4: move = I_movdqa; size = 0; break; } BEGIN_ASM (jit->fragment) move, r, size + PTR (dest->d), END_ASM (); jit_free_xmm (jit, r); return r; } static void dest_a8r8g8b8_store_pixels (jit_dest_iter_t *dest, jit_t *jit, int n_pixels, reg_t reg) { instruction_t move; int size; switch (n_pixels) { case 1: move = I_movd; size = DWORD_PTR; break; case 2: move = I_movq; size = QWORD_PTR; break; case 4: move = I_movdqa; size = 0; break; } BEGIN_ASM (jit->fragment) move, reg, size + PTR (dest->d), END_ASM(); } static void dest_a8r8g8b8_advance_pixels (jit_dest_iter_t *dest, jit_t *jit, int n_pixels) { int n_bytes = n_pixels * 4; BEGIN_ASM (jit->fragment) I_add, dest->d, IMM (n_bytes), END_ASM (); } static void dest_a8r8g8b8_end (jit_dest_iter_t *dest, jit_t *jit) { } jit_dest_iter_t * dest_iter_create_a8r8g8b8 (void) { jit_dest_iter_t *iter = malloc (sizeof *iter); /* FIXME OOM */ iter->begin = dest_a8r8g8b8_begin; iter->process_line = dest_a8r8g8b8_process_line; iter->load_pixels = dest_a8r8g8b8_load_pixels; iter->store_pixels = dest_a8r8g8b8_store_pixels; iter->advance_pixels = dest_a8r8g8b8_advance_pixels; iter->end = dest_a8r8g8b8_end; return iter; } /* combiner */ static void combine_over (jit_combiner_t *combiner, jit_t *jit, jit_src_iter_t *src, jit_src_iter_t *mask, jit_dest_iter_t *dest, int n_pixels) { reg_t s, d, m_hi, m_lo, d_hi, d_lo, zero; reg_t m00ff, m0101, m0080; s = src->load_pixels (src, jit, n_pixels); s = jit_preserve_xmm (jit, s); src->advance_pixels (src, jit, n_pixels); m00ff = zero = jit_alloc_xmm (jit); m_hi = jit_alloc_xmm (jit); m_lo = jit_alloc_xmm (jit); BEGIN_ASM (jit->fragment) /* Generate zero */ I_pxor, zero, zero, /* Expand alpha */ I_movdqa, m_hi, s, I_movdqa, m_lo, s, I_punpckhbw, m_hi, zero, I_punpcklbw, m_lo, zero, I_pshuflw, m_hi, m_hi, UIMM (0xff), I_pshuflw, m_lo, m_lo, UIMM (0xff), /* Negate mask */ I_pcmpeqw, m00ff, m00ff, I_psrlw, m00ff, IMM (8), I_pxor, m_lo, m00ff, I_pxor, m_hi, m00ff, END_ASM (); jit_free_xmm (jit, zero); d = dest->load_pixels (dest, jit, n_pixels); d = jit_preserve_xmm (jit, d); m0080 = zero = jit_alloc_xmm (jit); m0101 = jit_alloc_xmm (jit); d_hi = jit_alloc_xmm (jit); d_lo = d; BEGIN_ASM (jit->fragment) /* Unpack dest */ I_pxor, zero, zero, I_movdqa, d_hi, d, I_punpckhbw, d_hi, zero, I_punpcklbw, d_lo, zero, /* Generate 0101 */ I_pcmpeqw, m0101, m0101, I_psrlw, m0101, IMM (15), I_packuswb, m0101, m0101, /* Generate 0080 */ I_pcmpeqw, m0080, m0080, I_psrlw, m0080, IMM (15), I_psllw, m0080, IMM (7), /* Multiply */ I_pmullw, d_hi, m_hi, I_paddusw, d_hi, m0080, I_pmulhw, d_hi, m0101, I_pmullw, d_lo, m_lo, I_paddusw, d_lo, m0080, I_pmulhw, d_lo, m0101, /* Pack */ I_packuswb, d_lo, d_hi, /* Add */ I_paddusb, d, s, END_ASM(); jit_free_xmm (jit, m0080); jit_free_xmm (jit, m0101); jit_free_xmm (jit, m_hi); jit_free_xmm (jit, m_lo); jit_free_xmm (jit, d_lo); jit_free_xmm (jit, d_hi); jit_free_xmm (jit, s); dest->store_pixels (dest, jit, n_pixels, d); dest->advance_pixels (dest, jit, n_pixels); } jit_combiner_t * combiner_create_over (void) { jit_combiner_t *combiner = malloc (sizeof *combiner); /* FIXME OOM */ combiner->combine = combine_over; return combiner; } uint8_t * generate_kernel (jit_t *jit, jit_src_iter_t *src, jit_src_iter_t *mask, jit_dest_iter_t *dest, jit_combiner_t *combiner) { static const reg_t callee_save[] = { rbx, r12, r13, r14, r15 }; fragment_t *prologue, *epilogue; reg_t h, composite_info; int i; composite_info = rsi; composite_info = jit_preserve_gp (jit, composite_info); h = jit_alloc_gp (jit); BEGIN_ASM (jit->fragment) I_mov, h, MEMBER (composite_info, pixman_composite_info_t, height), I_test, h, h, I_jz, LABEL ("done"), END_ASM (); jit_free_gp (jit, composite_info); /* begin */ src->begin (src, jit, composite_info); if (mask) mask->begin (mask, jit, composite_info); dest->begin (dest, jit, composite_info); /* loop */ BEGIN_ASM (jit->fragment) DEFINE_LABEL ("vertical_loop"), END_ASM (); dest->process_line ( dest, jit, src, mask, combiner); BEGIN_ASM (jit->fragment) I_sub, h, IMM (1), I_jnz, LABEL ("vertical_loop"), END_ASM (); jit_free_gp (jit, h); /* end */ dest->end (dest, jit); if (mask) mask->end (mask, jit); src->end (src, jit); BEGIN_ASM (jit->fragment) DEFINE_LABEL ("done"), END_ASM(); /* Prologue */ prologue = fragment_new (jit->assembler); BEGIN_ASM (prologue) I_push, rbp, I_mov, rbp, rsp, END_ASM (); for (i = 0; i < N_ELEMENTS (callee_save); ++i) { reg_t reg = callee_save[i]; if (reg_alloc_clobbered (&jit->gp_allocator, reg)) { BEGIN_ASM (prologue) I_push, reg, END_ASM (); } } /* Epilogue */ epilogue = fragment_new (jit->assembler); for (i = N_ELEMENTS (callee_save) - 1; i >= 0; --i) { reg_t reg = callee_save[i]; if (reg_alloc_clobbered (&jit->gp_allocator, reg)) { BEGIN_ASM (epilogue) I_pop, reg, END_ASM (); } } BEGIN_ASM (epilogue) I_pop, rbp, I_ret, END_ASM (); return assembler_link (jit->assembler, prologue, jit->fragment, epilogue, NULL); } int main () { jit_dest_iter_t *dest = dest_iter_create_a8r8g8b8 (); jit_combiner_t *combiner = combiner_create_over (); jit_src_iter_t *src = src_iter_create_a8r8g8b8 (); jit_t *jit = jit_new (); uint8_t *code; /* n_8_8888() */ printf ("iter jit\n"); code = generate_kernel (jit, src, NULL, dest, combiner); return 0; }