diff options
author | Lionel Landwerlin <lionel.g.landwerlin@intel.com> | 2020-10-29 15:10:59 +0200 |
---|---|---|
committer | Marge Bot <eric+marge@anholt.net> | 2021-05-02 20:20:06 +0000 |
commit | 6d4070f3ddb5a5aafaf7f7f51e2f503b78fd0868 (patch) | |
tree | f7140f435c7f5d4968ff6869afd616f705b163df | |
parent | a297061524c7b7caaaaebfc589f550bb9d6a353a (diff) |
intel/compiler: add support for fragment coordinate with coarse pixels
v2: Drop new internal opcodes (Jason)
Simplify code (Jason)
v3: Add Z computation for coarse pixels
v4: Document things a little
Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7455>
-rw-r--r-- | src/intel/compiler/brw_compiler.h | 1 | ||||
-rw-r--r-- | src/intel/compiler/brw_fs.cpp | 27 | ||||
-rw-r--r-- | src/intel/compiler/brw_fs.h | 2 | ||||
-rw-r--r-- | src/intel/compiler/brw_fs_generator.cpp | 18 | ||||
-rw-r--r-- | src/intel/compiler/brw_fs_visitor.cpp | 141 |
5 files changed, 172 insertions, 17 deletions
diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h index 03eb7e1a7d7..805537488e4 100644 --- a/src/intel/compiler/brw_compiler.h +++ b/src/intel/compiler/brw_compiler.h @@ -845,6 +845,7 @@ struct brw_wm_prog_data { bool uses_kill; bool uses_src_depth; bool uses_src_w; + bool uses_depth_w_coefficients; bool uses_sample_mask; bool has_render_target_reads; bool has_side_effects; diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 957f5a2838f..46a9e7cefc4 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -754,8 +754,11 @@ fs_inst::components_read(unsigned i) const case FS_OPCODE_PIXEL_X: case FS_OPCODE_PIXEL_Y: - assert(i == 0); - return 2; + assert(i < 2); + if (i == 0) + return 2; + else + return 1; case FS_OPCODE_FB_WRITE_LOGICAL: assert(src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM); @@ -1241,7 +1244,7 @@ fs_visitor::emit_fragcoord_interpolation(fs_reg wpos) /* gl_FragCoord.z */ if (devinfo->ver >= 6) { - bld.MOV(wpos, fetch_payload_reg(bld, payload.source_depth_reg)); + bld.MOV(wpos, this->pixel_z); } else { bld.emit(FS_OPCODE_LINTERP, wpos, this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL], @@ -7829,6 +7832,12 @@ fs_visitor::setup_fs_payload_gfx6() payload.sample_mask_in_reg[j] = payload.num_regs; payload.num_regs += payload_width / 8; } + + /* R66: Source Depth and/or W Attribute Vertex Deltas */ + if (prog_data->uses_depth_w_coefficients) { + payload.depth_w_coef_reg[j] = payload.num_regs; + payload.num_regs++; + } } if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) { @@ -9099,9 +9108,6 @@ brw_nir_populate_wm_prog_data(const nir_shader *shader, const struct brw_wm_prog_key *key, struct brw_wm_prog_data *prog_data) { - prog_data->uses_src_depth = prog_data->uses_src_w = - BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD); - /* key->alpha_test_func means simulating alpha testing via discards, * so the shader definitely kills pixels. */ @@ -9154,6 +9160,15 @@ brw_nir_populate_wm_prog_data(const nir_shader *shader, (prog_data->computed_depth_mode == BRW_PSCDEPTH_OFF) && !prog_data->computed_stencil; + prog_data->uses_src_w = + BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD); + prog_data->uses_src_depth = + BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) && + !prog_data->per_coarse_pixel_dispatch; + prog_data->uses_depth_w_coefficients = + BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) && + prog_data->per_coarse_pixel_dispatch; + calculate_urb_setup(devinfo, key, prog_data, shader); brw_compute_flat_inputs(prog_data, shader); } diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h index 3b13e557568..8e4212d2273 100644 --- a/src/intel/compiler/brw_fs.h +++ b/src/intel/compiler/brw_fs.h @@ -397,6 +397,7 @@ public: uint8_t dest_depth_reg[2]; uint8_t sample_pos_reg[2]; uint8_t sample_mask_in_reg[2]; + uint8_t depth_w_coef_reg[2]; uint8_t barycentric_coord_reg[BRW_BARYCENTRIC_MODE_COUNT][2]; uint8_t local_invocation_id_reg[2]; @@ -409,6 +410,7 @@ public: fs_reg pixel_x; fs_reg pixel_y; + fs_reg pixel_z; fs_reg wpos_w; fs_reg pixel_w; fs_reg delta_xy[BRW_BARYCENTRIC_MODE_COUNT]; diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp index cdd361e138c..ad7686e179e 100644 --- a/src/intel/compiler/brw_fs_generator.cpp +++ b/src/intel/compiler/brw_fs_generator.cpp @@ -2289,13 +2289,27 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width, break; case FS_OPCODE_PIXEL_X: assert(src[0].type == BRW_REGISTER_TYPE_UW); + assert(src[1].type == BRW_REGISTER_TYPE_UW); src[0].subnr = 0 * type_sz(src[0].type); - brw_MOV(p, dst, stride(src[0], 8, 4, 1)); + if (src[1].file == BRW_IMMEDIATE_VALUE) { + assert(src[1].ud == 0); + brw_MOV(p, dst, stride(src[0], 8, 4, 1)); + } else { + /* Coarse pixel case */ + brw_ADD(p, dst, stride(src[0], 8, 4, 1), src[1]); + } break; case FS_OPCODE_PIXEL_Y: assert(src[0].type == BRW_REGISTER_TYPE_UW); + assert(src[1].type == BRW_REGISTER_TYPE_UW); src[0].subnr = 4 * type_sz(src[0].type); - brw_MOV(p, dst, stride(src[0], 8, 4, 1)); + if (src[1].file == BRW_IMMEDIATE_VALUE) { + assert(src[1].ud == 0); + brw_MOV(p, dst, stride(src[0], 8, 4, 1)); + } else { + /* Coarse pixel case */ + brw_ADD(p, dst, stride(src[0], 8, 4, 1), src[1]); + } break; case SHADER_OPCODE_SEND: diff --git a/src/intel/compiler/brw_fs_visitor.cpp b/src/intel/compiler/brw_fs_visitor.cpp index 2dc5d8fb3d9..081023ec16d 100644 --- a/src/intel/compiler/brw_fs_visitor.cpp +++ b/src/intel/compiler/brw_fs_visitor.cpp @@ -189,6 +189,8 @@ fs_visitor::emit_interpolation_setup_gfx4() abld.ADD(offset(delta_xy, abld, 1), this->pixel_y, ystart); } + this->pixel_z = fetch_payload_reg(bld, payload.source_depth_reg); + abld = bld.annotate("compute pos.w and 1/pos.w"); /* Compute wpos.w. It's always in our setup, since it's needed to * interpolate the other attributes. @@ -274,6 +276,76 @@ fs_visitor::emit_interpolation_setup_gfx6() this->pixel_x = vgrf(glsl_type::float_type); this->pixel_y = vgrf(glsl_type::float_type); + struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(prog_data); + + fs_reg int_pixel_offset_xy, half_int_pixel_offset_x, half_int_pixel_offset_y; + if (!wm_prog_data->per_coarse_pixel_dispatch) { + /* The thread payload only delivers subspan locations (ss0, ss1, + * ss2, ...). Since subspans covers 2x2 pixels blocks, we need to + * generate 4 pixel coordinates out of each subspan location. We do this + * by replicating a subspan coordinate 4 times and adding an offset of 1 + * in each direction from the initial top left (tl) location to generate + * top right (tr = +1 in x), bottom left (bl = +1 in y) and bottom right + * (br = +1 in x, +1 in y). + * + * The locations we build look like this in SIMD8 : + * + * ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br + * + * The value 0x11001010 is a vector of 8 half byte vector. It adds + * following to generate the 4 pixels coordinates out of the subspan0: + * + * 0x + * 1 : ss0.y + 1 -> ss0.br.y + * 1 : ss0.y + 1 -> ss0.bl.y + * 0 : ss0.y + 0 -> ss0.tr.y + * 0 : ss0.y + 0 -> ss0.tl.y + * 1 : ss0.x + 1 -> ss0.br.x + * 0 : ss0.x + 0 -> ss0.bl.x + * 1 : ss0.x + 1 -> ss0.tr.x + * 0 : ss0.x + 0 -> ss0.tl.x + * + * By doing a SIMD16 add in a SIMD8 shader, we can generate the 8 pixels + * coordinates out of 2 subspans coordinates in a single ADD instruction + * (twice the operation above). + */ + int_pixel_offset_xy = fs_reg(brw_imm_v(0x11001010)); + half_int_pixel_offset_x = fs_reg(brw_imm_uw(0)); + half_int_pixel_offset_y = fs_reg(brw_imm_uw(0)); + } else { + /* In coarse pixel dispatch we have to do the same ADD instruction that + * we do in normal per pixel dispatch, except this time we're not adding + * 1 in each direction, but instead the coarse pixel size. + * + * The coarse pixel size is delivered as 2 u8 in r1.0 + */ + struct brw_reg r1_0 = retype(brw_vec1_reg(BRW_GENERAL_REGISTER_FILE, 1, 0), BRW_REGISTER_TYPE_UB); + + const fs_builder dbld = + abld.exec_all().group(MIN2(16, dispatch_width) * 2, 0); + + /* To build the array of half bytes we do and AND operation with the + * right mask in X. + */ + fs_reg int_pixel_offset_x = dbld.vgrf(BRW_REGISTER_TYPE_UW); + dbld.AND(int_pixel_offset_x, byte_offset(r1_0, 0), brw_imm_v(0x0000f0f0)); + + /* And the right mask in Y. */ + fs_reg int_pixel_offset_y = dbld.vgrf(BRW_REGISTER_TYPE_UW); + dbld.AND(int_pixel_offset_y, byte_offset(r1_0, 1), brw_imm_v(0xff000000)); + + /* Finally OR the 2 registers. */ + int_pixel_offset_xy = dbld.vgrf(BRW_REGISTER_TYPE_UW); + dbld.OR(int_pixel_offset_xy, int_pixel_offset_x, int_pixel_offset_y); + + /* Also compute the half pixel size used to center pixels. */ + half_int_pixel_offset_x = bld.vgrf(BRW_REGISTER_TYPE_UW); + half_int_pixel_offset_y = bld.vgrf(BRW_REGISTER_TYPE_UW); + + bld.SHR(half_int_pixel_offset_x, suboffset(r1_0, 0), brw_imm_ud(1)); + bld.SHR(half_int_pixel_offset_y, suboffset(r1_0, 1), brw_imm_ud(1)); + } + for (unsigned i = 0; i < DIV_ROUND_UP(dispatch_width, 16); i++) { const fs_builder hbld = abld.group(MIN2(16, dispatch_width), i); struct brw_reg gi_uw = retype(brw_vec1_grf(1 + i, 0), BRW_REGISTER_TYPE_UW); @@ -311,10 +383,12 @@ fs_visitor::emit_interpolation_setup_gfx6() dbld.ADD(int_pixel_xy, fs_reg(stride(suboffset(gi_uw, 4), 1, 4, 0)), - fs_reg(brw_imm_v(0x11001010))); + int_pixel_offset_xy); - hbld.emit(FS_OPCODE_PIXEL_X, offset(pixel_x, hbld, i), int_pixel_xy); - hbld.emit(FS_OPCODE_PIXEL_Y, offset(pixel_y, hbld, i), int_pixel_xy); + hbld.emit(FS_OPCODE_PIXEL_X, offset(pixel_x, hbld, i), int_pixel_xy, + horiz_stride(half_int_pixel_offset_x, 0)); + hbld.emit(FS_OPCODE_PIXEL_Y, offset(pixel_y, hbld, i), int_pixel_xy, + horiz_stride(half_int_pixel_offset_y, 0)); } else { /* The "Register Region Restrictions" page says for SNB, IVB, HSW: * @@ -343,12 +417,61 @@ fs_visitor::emit_interpolation_setup_gfx6() } } - abld = bld.annotate("compute pos.w"); - this->pixel_w = fetch_payload_reg(abld, payload.source_w_reg); - this->wpos_w = vgrf(glsl_type::float_type); - abld.emit(SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w); + abld = bld.annotate("compute pos.z"); + if (wm_prog_data->uses_depth_w_coefficients) { + assert(!wm_prog_data->uses_src_depth); + /* In coarse pixel mode, the HW doesn't interpolate Z coordinate + * properly. In the same way we have to add the coarse pixel size to + * pixels locations, here we recompute the Z value with 2 coefficients + * in X & Y axis. + */ + fs_reg coef_payload = fetch_payload_reg(abld, payload.depth_w_coef_reg, BRW_REGISTER_TYPE_F); + const fs_reg x_start = brw_vec1_grf(coef_payload.nr, 2); + const fs_reg y_start = brw_vec1_grf(coef_payload.nr, 6); + const fs_reg z_cx = brw_vec1_grf(coef_payload.nr, 1); + const fs_reg z_cy = brw_vec1_grf(coef_payload.nr, 0); + const fs_reg z_c0 = brw_vec1_grf(coef_payload.nr, 3); + + const fs_reg float_pixel_x = abld.vgrf(BRW_REGISTER_TYPE_F); + const fs_reg float_pixel_y = abld.vgrf(BRW_REGISTER_TYPE_F); + + abld.ADD(float_pixel_x, this->pixel_x, negate(x_start)); + abld.ADD(float_pixel_y, this->pixel_y, negate(y_start)); + + /* r1.0 - 0:7 ActualCoarsePixelShadingSize.X */ + const fs_reg u8_cps_width = fs_reg(retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UB)); + /* r1.0 - 15:8 ActualCoarsePixelShadingSize.Y */ + const fs_reg u8_cps_height = byte_offset(u8_cps_width, 1); + const fs_reg u32_cps_width = abld.vgrf(BRW_REGISTER_TYPE_UD); + const fs_reg u32_cps_height = abld.vgrf(BRW_REGISTER_TYPE_UD); + abld.MOV(u32_cps_width, u8_cps_width); + abld.MOV(u32_cps_height, u8_cps_height); + + const fs_reg f_cps_width = abld.vgrf(BRW_REGISTER_TYPE_F); + const fs_reg f_cps_height = abld.vgrf(BRW_REGISTER_TYPE_F); + abld.MOV(f_cps_width, u32_cps_width); + abld.MOV(f_cps_height, u32_cps_height); + + /* Center in the middle of the coarse pixel. */ + abld.MAD(float_pixel_x, float_pixel_x, brw_imm_f(0.5f), f_cps_width); + abld.MAD(float_pixel_y, float_pixel_y, brw_imm_f(0.5f), f_cps_height); + + this->pixel_z = abld.vgrf(BRW_REGISTER_TYPE_F); + abld.MAD(this->pixel_z, z_c0, z_cx, float_pixel_x); + abld.MAD(this->pixel_z, this->pixel_z, z_cy, float_pixel_y); + } - struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(prog_data); + if (wm_prog_data->uses_src_depth) { + assert(!wm_prog_data->uses_depth_w_coefficients); + this->pixel_z = fetch_payload_reg(bld, payload.source_depth_reg); + } + + if (wm_prog_data->uses_src_w) { + abld = bld.annotate("compute pos.w"); + this->pixel_w = fetch_payload_reg(abld, payload.source_w_reg); + this->wpos_w = vgrf(glsl_type::float_type); + abld.emit(SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w); + } for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) { this->delta_xy[i] = fetch_barycentric_reg( @@ -462,7 +585,7 @@ fs_visitor::emit_single_fb_write(const fs_builder &bld, if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) src_depth = frag_depth; else - src_depth = fetch_payload_reg(bld, payload.source_depth_reg); + src_depth = this->pixel_z; } if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) |