intel/dev: Enable VK_KHR_cooperative_matrix on all Gfx9+ GPUs

Gfx12.5 (DG2) will use DPAS instructions to accelerate the implementation. Earlier platforms will use equivalent discrete instructions (basically subgroup operations). Gfx12 (Tigerlake) will use DP4A for 8-bit integer matrix multiplication. Older platforms, which lack DP4A, will use a suboptimal instruction sequence. There is plenty of room for improvement here. On DG2 (Gfx12.5) gets the following results from the CTS: Test run totals: Passed: 1642/13982 (11.7%) Failed: 0/13982 (0.0%) Not supported: 12340/13982 (88.3%) Warnings: 0/13982 (0.0%) Waived: 0/13982 (0.0%) On DG2 (Gfx12.5) with forced lowering, Raptor Lake (Gfx12) and Ice Lake (Gfx11): Test run totals: Passed: 1662/13982 (11.9%) Failed: 0/13982 (0.0%) Not supported: 12320/13982 (88.1%) Warnings: 0/13982 (0.0%) Waived: 0/13982 (0.0%) The difference in the number of tests run is due to saturatingAccumulation not being set on DG2 when DPAS is used. There is a comment in "intel/dev: Advertise integer configs with saturatingAccumulation too" that explains how this could be added should the need arise. v2: Prefix type names with INTEL_CMAT_. Suggested by Lionel. Reviewed-by: Caio Oliveira <caio.oliveira@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25994>
author: Ian Romanick <ian.d.romanick@intel.com> 2023-08-02 13:36:33 -0700
committer: Ian Romanick <ian.d.romanick@intel.com> 2023-12-29 20:28:54 -0800
commit: c6d44284aa633569a58200d00015b3e6d80a465a (patch)
tree: 731a9099f5d15f6ef797ae022800c25f4b38fa84
parent: 8ea032b78ee3257fd9398db8b79cdf9ca5ff4a36 (diff)
1 files changed, 27 insertions, 3 deletions
diff --git a/src/intel/dev/intel_device_info.c b/src/intel/dev/intel_device_info.c
index 2da815e46d1..1ed313d71a2 100644
--- a/src/intel/dev/intel_device_info.c
+++ b/src/intel/dev/intel_device_info.c
@@ -614,7 +614,13 @@ static const struct intel_device_info intel_device_info_chv = {
    GFX8_FEATURES,                                   \
    GFX9_HW_INFO,                                    \
    .has_sample_with_hiz = true,                     \
-   .has_illegal_ccs_values = true
+   .has_illegal_ccs_values = true,                                    \
+   .cooperative_matrix_configurations = {                             \
+    { SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16 }, \
+    { SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT32, INTEL_CMAT_FLOAT32 }, \
+    { SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_SINT8, INTEL_CMAT_SINT8, INTEL_CMAT_SINT32, INTEL_CMAT_SINT32 },       \
+    { SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_UINT8, INTEL_CMAT_UINT8, INTEL_CMAT_UINT32, INTEL_CMAT_UINT32 },       \
+   }
 
 static const struct intel_device_info intel_device_info_skl_gt1 = {
    GFX9_FEATURES, .gt = 1,
@@ -840,7 +846,13 @@ static const struct intel_device_info intel_device_info_cfl_gt3 = {
    .has_illegal_ccs_values = true,                    \
    .gt = _gt, .num_slices = _slices, .l3_banks = _l3, \
    .num_subslices = _subslices,                       \
-   .max_eus_per_subslice = 8
+   .max_eus_per_subslice = 8,                                         \
+   .cooperative_matrix_configurations = {                             \
+    { SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16 }, \
+    { SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT32, INTEL_CMAT_FLOAT32 }, \
+    { SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_SINT8, INTEL_CMAT_SINT8, INTEL_CMAT_SINT32, INTEL_CMAT_SINT32 },       \
+    { SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_UINT8, INTEL_CMAT_UINT8, INTEL_CMAT_UINT32, INTEL_CMAT_UINT32 },       \
+   }
 
 #define GFX11_URB_MIN_MAX_ENTRIES                     \
    .min_entries = {                                   \
@@ -967,6 +979,12 @@ static const struct intel_device_info intel_device_info_ehl_2x4 = {
          .scanout = PAT_ENTRY(1, WC, NONE),                     \
          .writeback_incoherent = PAT_ENTRY(0, WB, 2WAY),        \
          .writecombining = PAT_ENTRY(1, WC, NONE),              \
+   },                                                           \
+   .cooperative_matrix_configurations = {                       \
+    { SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16 }, \
+    { SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT32, INTEL_CMAT_FLOAT32 }, \
+    { SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_SINT8, INTEL_CMAT_SINT8, INTEL_CMAT_SINT32, INTEL_CMAT_SINT32 },       \
+    { SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_UINT8, INTEL_CMAT_UINT8, INTEL_CMAT_UINT32, INTEL_CMAT_UINT32 },       \
    }
 
 #define dual_subslices(args...) { args, }
@@ -1099,7 +1117,13 @@ static const struct intel_device_info intel_device_info_sg1 = {
    .has_lsc = true,                                             \
    .has_local_mem = true,                                       \
    .has_aux_map = false,                                        \
-   .simulator_id = 29
+   .simulator_id = 29,                                          \
+   .cooperative_matrix_configurations = {                       \
+    { SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16 }, \
+    { SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT32, INTEL_CMAT_FLOAT32 }, \
+    { SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_SINT8, INTEL_CMAT_SINT8, INTEL_CMAT_SINT32, INTEL_CMAT_SINT32 },       \
+    { SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_UINT8, INTEL_CMAT_UINT8, INTEL_CMAT_UINT32, INTEL_CMAT_UINT32 },       \
+   }
 
 #define DG2_FEATURES                                            \
    /* (Sub)slice info comes from the kernel topology info */    \
author	Ian Romanick <ian.d.romanick@intel.com>	2023-08-02 13:36:33 -0700
committer	Ian Romanick <ian.d.romanick@intel.com>	2023-12-29 20:28:54 -0800
commit	c6d44284aa633569a58200d00015b3e6d80a465a (patch)
tree	731a9099f5d15f6ef797ae022800c25f4b38fa84
parent	8ea032b78ee3257fd9398db8b79cdf9ca5ff4a36 (diff)