summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Ian Romanick <ian.d.romanick@intel.com> 2023-08-02 13:36:33 -0700
committer: Ian Romanick <ian.d.romanick@intel.com> 2023-12-29 20:28:54 -0800
commit: c6d44284aa633569a58200d00015b3e6d80a465a (patch)
tree: 731a9099f5d15f6ef797ae022800c25f4b38fa84
parent: 8ea032b78ee3257fd9398db8b79cdf9ca5ff4a36 (diff)
intel/dev: Enable VK_KHR_cooperative_matrix on all Gfx9+ GPUs
Gfx12.5 (DG2) will use DPAS instructions to accelerate the implementation. Earlier platforms will use equivalent discrete instructions (basically subgroup operations).

Gfx12 (Tiger Lake) will use DP4A for 8-bit integer matrix multiplication. Older platforms, which lack DP4A, will use a suboptimal instruction sequence. There is plenty of room for improvement here.

DG2 (Gfx12.5) gets the following results from the CTS:

Test run totals:
  Passed:        1642/13982 (11.7%)
  Failed:        0/13982 (0.0%)
  Not supported: 12340/13982 (88.3%)
  Warnings:      0/13982 (0.0%)
  Waived:        0/13982 (0.0%)

DG2 (Gfx12.5) with forced lowering, Raptor Lake (Gfx12), and Ice Lake (Gfx11) get the following results:

Test run totals:
  Passed:        1662/13982 (11.9%)
  Failed:        0/13982 (0.0%)
  Not supported: 12320/13982 (88.1%)
  Warnings:      0/13982 (0.0%)
  Waived:        0/13982 (0.0%)

The difference in the number of tests run is due to saturatingAccumulation not being set on DG2 when DPAS is used. There is a comment in "intel/dev: Advertise integer configs with saturatingAccumulation too" that explains how this could be added should the need arise.

v2: Prefix type names with INTEL_CMAT_. Suggested by Lionel.

Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25994>
-rw-r--r-- src/intel/dev/intel_device_info.c 30
1 files changed, 27 insertions, 3 deletions
diff --git a/src/intel/dev/intel_device_info.c b/src/intel/dev/intel_device_info.c
index 2da815e46d1..1ed313d71a2 100644
--- a/src/intel/dev/intel_device_info.c
+++ b/src/intel/dev/intel_device_info.c
@@ -614,7 +614,13 @@ static const struct intel_device_info intel_device_info_chv = {
GFX8_FEATURES, \
GFX9_HW_INFO, \
.has_sample_with_hiz = true, \
- .has_illegal_ccs_values = true
+ .has_illegal_ccs_values = true, \
+ .cooperative_matrix_configurations = { \
+ { SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16 }, \
+ { SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT32, INTEL_CMAT_FLOAT32 }, \
+ { SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_SINT8, INTEL_CMAT_SINT8, INTEL_CMAT_SINT32, INTEL_CMAT_SINT32 }, \
+ { SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_UINT8, INTEL_CMAT_UINT8, INTEL_CMAT_UINT32, INTEL_CMAT_UINT32 }, \
+ }
static const struct intel_device_info intel_device_info_skl_gt1 = {
GFX9_FEATURES, .gt = 1,
@@ -840,7 +846,13 @@ static const struct intel_device_info intel_device_info_cfl_gt3 = {
.has_illegal_ccs_values = true, \
.gt = _gt, .num_slices = _slices, .l3_banks = _l3, \
.num_subslices = _subslices, \
- .max_eus_per_subslice = 8
+ .max_eus_per_subslice = 8, \
+ .cooperative_matrix_configurations = { \
+ { SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16 }, \
+ { SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT32, INTEL_CMAT_FLOAT32 }, \
+ { SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_SINT8, INTEL_CMAT_SINT8, INTEL_CMAT_SINT32, INTEL_CMAT_SINT32 }, \
+ { SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_UINT8, INTEL_CMAT_UINT8, INTEL_CMAT_UINT32, INTEL_CMAT_UINT32 }, \
+ }
#define GFX11_URB_MIN_MAX_ENTRIES \
.min_entries = { \
@@ -967,6 +979,12 @@ static const struct intel_device_info intel_device_info_ehl_2x4 = {
.scanout = PAT_ENTRY(1, WC, NONE), \
.writeback_incoherent = PAT_ENTRY(0, WB, 2WAY), \
.writecombining = PAT_ENTRY(1, WC, NONE), \
+ }, \
+ .cooperative_matrix_configurations = { \
+ { SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16 }, \
+ { SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT32, INTEL_CMAT_FLOAT32 }, \
+ { SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_SINT8, INTEL_CMAT_SINT8, INTEL_CMAT_SINT32, INTEL_CMAT_SINT32 }, \
+ { SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_UINT8, INTEL_CMAT_UINT8, INTEL_CMAT_UINT32, INTEL_CMAT_UINT32 }, \
}
#define dual_subslices(args...) { args, }
@@ -1099,7 +1117,13 @@ static const struct intel_device_info intel_device_info_sg1 = {
.has_lsc = true, \
.has_local_mem = true, \
.has_aux_map = false, \
- .simulator_id = 29
+ .simulator_id = 29, \
+ .cooperative_matrix_configurations = { \
+ { SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16 }, \
+ { SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT32, INTEL_CMAT_FLOAT32 }, \
+ { SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_SINT8, INTEL_CMAT_SINT8, INTEL_CMAT_SINT32, INTEL_CMAT_SINT32 }, \
+ { SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_UINT8, INTEL_CMAT_UINT8, INTEL_CMAT_UINT32, INTEL_CMAT_UINT32 }, \
+ }
#define DG2_FEATURES \
/* (Sub)slice info comes from the kernel topology info */ \