summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdkfd
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd')
-rw-r--r--drivers/gpu/drm/amd/amdkfd/Makefile3
-rw-r--r--drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h782
-rw-r--r--drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm1124
-rw-r--r--drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm13
-rw-r--r--drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm63
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_chardev.c83
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_crat.c17
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_crat.h3
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c36
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_device.c105
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c664
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h18
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c2
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v10.c88
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c1
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c2
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_events.c2
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c4
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_iommu.c10
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c25
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h1
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v10.c348
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c6
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c4
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_module.c6
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c90
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h24
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c134
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c498
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c155
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c143
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c13
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h16
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h7
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_priv.h71
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_process.c101
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c71
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_topology.c30
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_topology.h3
39 files changed, 3800 insertions, 966 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile
index 69ec96998bb9..48155060a57c 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -36,16 +36,19 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \
$(AMDKFD_PATH)/kfd_mqd_manager_cik.o \
$(AMDKFD_PATH)/kfd_mqd_manager_vi.o \
$(AMDKFD_PATH)/kfd_mqd_manager_v9.o \
+ $(AMDKFD_PATH)/kfd_mqd_manager_v10.o \
$(AMDKFD_PATH)/kfd_kernel_queue.o \
$(AMDKFD_PATH)/kfd_kernel_queue_cik.o \
$(AMDKFD_PATH)/kfd_kernel_queue_vi.o \
$(AMDKFD_PATH)/kfd_kernel_queue_v9.o \
+ $(AMDKFD_PATH)/kfd_kernel_queue_v10.o \
$(AMDKFD_PATH)/kfd_packet_manager.o \
$(AMDKFD_PATH)/kfd_process_queue_manager.o \
$(AMDKFD_PATH)/kfd_device_queue_manager.o \
$(AMDKFD_PATH)/kfd_device_queue_manager_cik.o \
$(AMDKFD_PATH)/kfd_device_queue_manager_vi.o \
$(AMDKFD_PATH)/kfd_device_queue_manager_v9.o \
+ $(AMDKFD_PATH)/kfd_device_queue_manager_v10.o \
$(AMDKFD_PATH)/kfd_interrupt.o \
$(AMDKFD_PATH)/kfd_events.o \
$(AMDKFD_PATH)/cik_event_interrupt.o \
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
index 3621efbd5759..826913c70766 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
@@ -21,7 +21,7 @@
*/
static const uint32_t cwsr_trap_gfx8_hex[] = {
- 0xbf820001, 0xbf82012b,
+ 0xbf820001, 0xbf820121,
0xb8f4f802, 0x89748674,
0xb8f5f803, 0x8675ff75,
0x00000400, 0xbf850017,
@@ -36,12 +36,7 @@ static const uint32_t cwsr_trap_gfx8_hex[] = {
0x8671ff71, 0x0000ffff,
0x8f728374, 0xb972e0c2,
0xbf800002, 0xb9740002,
- 0xbe801f70, 0xb8f5f803,
- 0x8675ff75, 0x00000100,
- 0xbf840006, 0xbefa0080,
- 0xb97a0203, 0x8671ff71,
- 0x0000ffff, 0x80f08870,
- 0x82f18071, 0xbefa0080,
+ 0xbe801f70, 0xbefa0080,
0xb97a0283, 0xbef60068,
0xbef70069, 0xb8fa1c07,
0x8e7a9c7a, 0x87717a71,
@@ -279,15 +274,17 @@ static const uint32_t cwsr_trap_gfx8_hex[] = {
static const uint32_t cwsr_trap_gfx9_hex[] = {
- 0xbf820001, 0xbf82015d,
+ 0xbf820001, 0xbf82015e,
0xb8f8f802, 0x89788678,
- 0xb8f1f803, 0x866eff71,
- 0x00000400, 0xbf850037,
- 0x866eff71, 0x00000800,
- 0xbf850003, 0x866eff71,
- 0x00000100, 0xbf840008,
+ 0xb8fbf803, 0x866eff7b,
+ 0x00000400, 0xbf85003b,
+ 0x866eff7b, 0x00000800,
+ 0xbf850003, 0x866eff7b,
+ 0x00000100, 0xbf84000c,
0x866eff78, 0x00002000,
- 0xbf840001, 0xbf810000,
+ 0xbf840005, 0xbf8e0010,
+ 0xb8eef803, 0x866eff6e,
+ 0x00000400, 0xbf84fffb,
0x8778ff78, 0x00002000,
0x80ec886c, 0x82ed806d,
0xb8eef807, 0x866fff6e,
@@ -295,13 +292,13 @@ static const uint32_t cwsr_trap_gfx9_hex[] = {
0x8977ff77, 0xfc000000,
0x87776f77, 0x896eff6e,
0x001f8000, 0xb96ef807,
- 0xb8f0f812, 0xb8f1f813,
- 0x8ef08870, 0xc0071bb8,
+ 0xb8faf812, 0xb8fbf813,
+ 0x8efa887a, 0xc0071bbd,
0x00000000, 0xbf8cc07f,
- 0xc0071c38, 0x00000008,
+ 0xc0071ebd, 0x00000008,
0xbf8cc07f, 0x86ee6e6e,
0xbf840001, 0xbe801d6e,
- 0xb8f1f803, 0x8671ff71,
+ 0xb8fbf803, 0x867bff7b,
0x000001ff, 0xbf850002,
0x806c846c, 0x826d806d,
0x866dff6d, 0x0000ffff,
@@ -311,258 +308,555 @@ static const uint32_t cwsr_trap_gfx9_hex[] = {
0x8f6e8378, 0xb96ee0c2,
0xbf800002, 0xb9780002,
0xbe801f6c, 0x866dff6d,
- 0x0000ffff, 0xbef00080,
- 0xb9700283, 0xb8f02407,
- 0x8e709c70, 0x876d706d,
- 0xb8f003c7, 0x8e709b70,
- 0x876d706d, 0xb8f0f807,
- 0x8670ff70, 0x00007fff,
- 0xb970f807, 0xbeee007e,
+ 0x0000ffff, 0xbefa0080,
+ 0xb97a0283, 0xb8fa2407,
+ 0x8e7a9b7a, 0x876d7a6d,
+ 0xb8fa03c7, 0x8e7a9a7a,
+ 0x876d7a6d, 0xb8faf807,
+ 0x867aff7a, 0x00007fff,
+ 0xb97af807, 0xbeee007e,
0xbeef007f, 0xbefe0180,
- 0xbf900004, 0x87708478,
- 0xb970f802, 0xbf8e0002,
- 0xbf88fffe, 0xb8f02a05,
+ 0xbf900004, 0x877a8478,
+ 0xb97af802, 0xbf8e0002,
+ 0xbf88fffe, 0xb8fa2a05,
+ 0x807a817a, 0x8e7a8a7a,
+ 0xb8fb1605, 0x807b817b,
+ 0x8e7b867b, 0x807a7b7a,
+ 0x807a7e7a, 0x827b807f,
+ 0x867bff7b, 0x0000ffff,
+ 0xc04b1c3d, 0x00000050,
+ 0xbf8cc07f, 0xc04b1d3d,
+ 0x00000060, 0xbf8cc07f,
+ 0xc0431e7d, 0x00000074,
+ 0xbf8cc07f, 0xbef4007e,
+ 0x8675ff7f, 0x0000ffff,
+ 0x8775ff75, 0x00040000,
+ 0xbef60080, 0xbef700ff,
+ 0x00807fac, 0x867aff7f,
+ 0x08000000, 0x8f7a837a,
+ 0x87777a77, 0x867aff7f,
+ 0x70000000, 0x8f7a817a,
+ 0x87777a77, 0xbef1007c,
+ 0xbef00080, 0xb8f02a05,
0x80708170, 0x8e708a70,
- 0xb8f11605, 0x80718171,
- 0x8e718671, 0x80707170,
- 0x80707e70, 0x8271807f,
- 0x8671ff71, 0x0000ffff,
- 0xc0471cb8, 0x00000040,
- 0xbf8cc07f, 0xc04b1d38,
- 0x00000048, 0xbf8cc07f,
- 0xc0431e78, 0x00000058,
- 0xbf8cc07f, 0xc0471eb8,
- 0x0000005c, 0xbf8cc07f,
- 0xbef4007e, 0x8675ff7f,
- 0x0000ffff, 0x8775ff75,
- 0x00040000, 0xbef60080,
- 0xbef700ff, 0x00807fac,
- 0x8670ff7f, 0x08000000,
- 0x8f708370, 0x87777077,
- 0x8670ff7f, 0x70000000,
- 0x8f708170, 0x87777077,
- 0xbefb007c, 0xbefa0080,
- 0xb8fa2a05, 0x807a817a,
- 0x8e7a8a7a, 0xb8f01605,
- 0x80708170, 0x8e708670,
- 0x807a707a, 0xbef60084,
- 0xbef600ff, 0x01000000,
- 0xbefe007c, 0xbefc007a,
- 0xc0611efa, 0x0000007c,
- 0xbf8cc07f, 0x807a847a,
- 0xbefc007e, 0xbefe007c,
- 0xbefc007a, 0xc0611b3a,
+ 0xb8fa1605, 0x807a817a,
+ 0x8e7a867a, 0x80707a70,
+ 0xbef60084, 0xbef600ff,
+ 0x01000000, 0xbefe007c,
+ 0xbefc0070, 0xc0611c7a,
0x0000007c, 0xbf8cc07f,
- 0x807a847a, 0xbefc007e,
- 0xbefe007c, 0xbefc007a,
- 0xc0611b7a, 0x0000007c,
- 0xbf8cc07f, 0x807a847a,
+ 0x80708470, 0xbefc007e,
+ 0xbefe007c, 0xbefc0070,
+ 0xc0611b3a, 0x0000007c,
+ 0xbf8cc07f, 0x80708470,
0xbefc007e, 0xbefe007c,
- 0xbefc007a, 0xc0611bba,
+ 0xbefc0070, 0xc0611b7a,
0x0000007c, 0xbf8cc07f,
- 0x807a847a, 0xbefc007e,
- 0xbefe007c, 0xbefc007a,
- 0xc0611bfa, 0x0000007c,
- 0xbf8cc07f, 0x807a847a,
+ 0x80708470, 0xbefc007e,
+ 0xbefe007c, 0xbefc0070,
+ 0xc0611bba, 0x0000007c,
+ 0xbf8cc07f, 0x80708470,
0xbefc007e, 0xbefe007c,
- 0xbefc007a, 0xc0611e3a,
+ 0xbefc0070, 0xc0611bfa,
0x0000007c, 0xbf8cc07f,
- 0x807a847a, 0xbefc007e,
- 0xb8f1f803, 0xbefe007c,
- 0xbefc007a, 0xc0611c7a,
- 0x0000007c, 0xbf8cc07f,
- 0x807a847a, 0xbefc007e,
- 0xbefe007c, 0xbefc007a,
- 0xc0611a3a, 0x0000007c,
- 0xbf8cc07f, 0x807a847a,
+ 0x80708470, 0xbefc007e,
+ 0xbefe007c, 0xbefc0070,
+ 0xc0611e3a, 0x0000007c,
+ 0xbf8cc07f, 0x80708470,
+ 0xbefc007e, 0xb8fbf803,
+ 0xbefe007c, 0xbefc0070,
+ 0xc0611efa, 0x0000007c,
+ 0xbf8cc07f, 0x80708470,
0xbefc007e, 0xbefe007c,
- 0xbefc007a, 0xc0611a7a,
+ 0xbefc0070, 0xc0611a3a,
0x0000007c, 0xbf8cc07f,
- 0x807a847a, 0xbefc007e,
- 0xb8fbf801, 0xbefe007c,
- 0xbefc007a, 0xc0611efa,
- 0x0000007c, 0xbf8cc07f,
- 0x807a847a, 0xbefc007e,
- 0x8670ff7f, 0x04000000,
- 0xbeef0080, 0x876f6f70,
- 0xb8fa2a05, 0x807a817a,
- 0x8e7a8a7a, 0xb8f11605,
- 0x80718171, 0x8e718471,
- 0x8e768271, 0xbef600ff,
- 0x01000000, 0xbef20174,
- 0x80747a74, 0x82758075,
- 0xbefc0080, 0xbf800000,
- 0xbe802b00, 0xbe822b02,
- 0xbe842b04, 0xbe862b06,
- 0xbe882b08, 0xbe8a2b0a,
- 0xbe8c2b0c, 0xbe8e2b0e,
- 0xc06b003a, 0x00000000,
- 0xbf8cc07f, 0xc06b013a,
- 0x00000010, 0xbf8cc07f,
- 0xc06b023a, 0x00000020,
- 0xbf8cc07f, 0xc06b033a,
- 0x00000030, 0xbf8cc07f,
- 0x8074c074, 0x82758075,
- 0x807c907c, 0xbf0a717c,
- 0xbf85ffe7, 0xbef40172,
- 0xbefa0080, 0xbefe00c1,
- 0xbeff00c1, 0xbee80080,
- 0xbee90080, 0xbef600ff,
- 0x01000000, 0xe0724000,
- 0x7a1d0000, 0xe0724100,
- 0x7a1d0100, 0xe0724200,
- 0x7a1d0200, 0xe0724300,
- 0x7a1d0300, 0xbefe00c1,
- 0xbeff00c1, 0xb8f14306,
- 0x8671c171, 0xbf84002c,
- 0xbf8a0000, 0x8670ff6f,
- 0x04000000, 0xbf840028,
- 0x8e718671, 0x8e718271,
- 0xbef60071, 0xb8fa2a05,
- 0x807a817a, 0x8e7a8a7a,
- 0xb8f01605, 0x80708170,
- 0x8e708670, 0x807a707a,
- 0x807aff7a, 0x00000080,
+ 0x80708470, 0xbefc007e,
+ 0xbefe007c, 0xbefc0070,
+ 0xc0611a7a, 0x0000007c,
+ 0xbf8cc07f, 0x80708470,
+ 0xbefc007e, 0xb8f1f801,
+ 0xbefe007c, 0xbefc0070,
+ 0xc0611c7a, 0x0000007c,
+ 0xbf8cc07f, 0x80708470,
+ 0xbefc007e, 0x867aff7f,
+ 0x04000000, 0xbeef0080,
+ 0x876f6f7a, 0xb8f02a05,
+ 0x80708170, 0x8e708a70,
+ 0xb8fb1605, 0x807b817b,
+ 0x8e7b847b, 0x8e76827b,
0xbef600ff, 0x01000000,
- 0xbefc0080, 0xd28c0002,
- 0x000100c1, 0xd28d0003,
- 0x000204c1, 0xd1060002,
- 0x00011103, 0x7e0602ff,
- 0x00000200, 0xbefc00ff,
- 0x00010000, 0xbe800077,
- 0x8677ff77, 0xff7fffff,
- 0x8777ff77, 0x00058000,
- 0xd8ec0000, 0x00000002,
- 0xbf8cc07f, 0xe0765000,
- 0x7a1d0002, 0x68040702,
- 0xd0c9006a, 0x0000e302,
- 0xbf87fff7, 0xbef70000,
- 0xbefa00ff, 0x00000400,
+ 0xbef20174, 0x80747074,
+ 0x82758075, 0xbefc0080,
+ 0xbf800000, 0xbe802b00,
+ 0xbe822b02, 0xbe842b04,
+ 0xbe862b06, 0xbe882b08,
+ 0xbe8a2b0a, 0xbe8c2b0c,
+ 0xbe8e2b0e, 0xc06b003a,
+ 0x00000000, 0xbf8cc07f,
+ 0xc06b013a, 0x00000010,
+ 0xbf8cc07f, 0xc06b023a,
+ 0x00000020, 0xbf8cc07f,
+ 0xc06b033a, 0x00000030,
+ 0xbf8cc07f, 0x8074c074,
+ 0x82758075, 0x807c907c,
+ 0xbf0a7b7c, 0xbf85ffe7,
+ 0xbef40172, 0xbef00080,
0xbefe00c1, 0xbeff00c1,
- 0xb8f12a05, 0x80718171,
- 0x8e718271, 0x8e768871,
+ 0xbee80080, 0xbee90080,
0xbef600ff, 0x01000000,
- 0xbefc0084, 0xbf0a717c,
- 0xbf840015, 0xbf11017c,
- 0x8071ff71, 0x00001000,
- 0x7e000300, 0x7e020301,
- 0x7e040302, 0x7e060303,
- 0xe0724000, 0x7a1d0000,
- 0xe0724100, 0x7a1d0100,
- 0xe0724200, 0x7a1d0200,
- 0xe0724300, 0x7a1d0300,
- 0x807c847c, 0x807aff7a,
- 0x00000400, 0xbf0a717c,
- 0xbf85ffef, 0xbf9c0000,
- 0xbf8200dc, 0xbef4007e,
- 0x8675ff7f, 0x0000ffff,
- 0x8775ff75, 0x00040000,
- 0xbef60080, 0xbef700ff,
- 0x00807fac, 0x866eff7f,
- 0x08000000, 0x8f6e836e,
- 0x87776e77, 0x866eff7f,
- 0x70000000, 0x8f6e816e,
- 0x87776e77, 0x866eff7f,
- 0x04000000, 0xbf84001e,
+ 0xe0724000, 0x701d0000,
+ 0xe0724100, 0x701d0100,
+ 0xe0724200, 0x701d0200,
+ 0xe0724300, 0x701d0300,
0xbefe00c1, 0xbeff00c1,
- 0xb8ef4306, 0x866fc16f,
- 0xbf840019, 0x8e6f866f,
- 0x8e6f826f, 0xbef6006f,
- 0xb8f82a05, 0x80788178,
- 0x8e788a78, 0xb8ee1605,
- 0x806e816e, 0x8e6e866e,
- 0x80786e78, 0x8078ff78,
+ 0xb8fb4306, 0x867bc17b,
+ 0xbf84002c, 0xbf8a0000,
+ 0x867aff6f, 0x04000000,
+ 0xbf840028, 0x8e7b867b,
+ 0x8e7b827b, 0xbef6007b,
+ 0xb8f02a05, 0x80708170,
+ 0x8e708a70, 0xb8fa1605,
+ 0x807a817a, 0x8e7a867a,
+ 0x80707a70, 0x8070ff70,
0x00000080, 0xbef600ff,
0x01000000, 0xbefc0080,
- 0xe0510000, 0x781d0000,
- 0xe0510100, 0x781d0000,
- 0x807cff7c, 0x00000200,
- 0x8078ff78, 0x00000200,
- 0xbf0a6f7c, 0xbf85fff6,
- 0xbef80080, 0xbefe00c1,
- 0xbeff00c1, 0xb8ef2a05,
- 0x806f816f, 0x8e6f826f,
- 0x8e76886f, 0xbef600ff,
- 0x01000000, 0xbeee0078,
- 0x8078ff78, 0x00000400,
- 0xbefc0084, 0xbf11087c,
- 0x806fff6f, 0x00008000,
- 0xe0524000, 0x781d0000,
- 0xe0524100, 0x781d0100,
- 0xe0524200, 0x781d0200,
- 0xe0524300, 0x781d0300,
- 0xbf8c0f70, 0x7e000300,
+ 0xd28c0002, 0x000100c1,
+ 0xd28d0003, 0x000204c1,
+ 0xd1060002, 0x00011103,
+ 0x7e0602ff, 0x00000200,
+ 0xbefc00ff, 0x00010000,
+ 0xbe800077, 0x8677ff77,
+ 0xff7fffff, 0x8777ff77,
+ 0x00058000, 0xd8ec0000,
+ 0x00000002, 0xbf8cc07f,
+ 0xe0765000, 0x701d0002,
+ 0x68040702, 0xd0c9006a,
+ 0x0000f702, 0xbf87fff7,
+ 0xbef70000, 0xbef000ff,
+ 0x00000400, 0xbefe00c1,
+ 0xbeff00c1, 0xb8fb2a05,
+ 0x807b817b, 0x8e7b827b,
+ 0x8e76887b, 0xbef600ff,
+ 0x01000000, 0xbefc0084,
+ 0xbf0a7b7c, 0xbf840015,
+ 0xbf11017c, 0x807bff7b,
+ 0x00001000, 0x7e000300,
0x7e020301, 0x7e040302,
- 0x7e060303, 0x807c847c,
- 0x8078ff78, 0x00000400,
- 0xbf0a6f7c, 0xbf85ffee,
- 0xbf9c0000, 0xe0524000,
- 0x6e1d0000, 0xe0524100,
- 0x6e1d0100, 0xe0524200,
- 0x6e1d0200, 0xe0524300,
- 0x6e1d0300, 0xb8f82a05,
+ 0x7e060303, 0xe0724000,
+ 0x701d0000, 0xe0724100,
+ 0x701d0100, 0xe0724200,
+ 0x701d0200, 0xe0724300,
+ 0x701d0300, 0x807c847c,
+ 0x8070ff70, 0x00000400,
+ 0xbf0a7b7c, 0xbf85ffef,
+ 0xbf9c0000, 0xbf8200da,
+ 0xbef4007e, 0x8675ff7f,
+ 0x0000ffff, 0x8775ff75,
+ 0x00040000, 0xbef60080,
+ 0xbef700ff, 0x00807fac,
+ 0x866eff7f, 0x08000000,
+ 0x8f6e836e, 0x87776e77,
+ 0x866eff7f, 0x70000000,
+ 0x8f6e816e, 0x87776e77,
+ 0x866eff7f, 0x04000000,
+ 0xbf84001e, 0xbefe00c1,
+ 0xbeff00c1, 0xb8ef4306,
+ 0x866fc16f, 0xbf840019,
+ 0x8e6f866f, 0x8e6f826f,
+ 0xbef6006f, 0xb8f82a05,
0x80788178, 0x8e788a78,
0xb8ee1605, 0x806e816e,
0x8e6e866e, 0x80786e78,
- 0x80f8c078, 0xb8ef1605,
- 0x806f816f, 0x8e6f846f,
- 0x8e76826f, 0xbef600ff,
- 0x01000000, 0xbefc006f,
- 0xc031003a, 0x00000078,
- 0x80f8c078, 0xbf8cc07f,
- 0x80fc907c, 0xbf800000,
- 0xbe802d00, 0xbe822d02,
- 0xbe842d04, 0xbe862d06,
- 0xbe882d08, 0xbe8a2d0a,
- 0xbe8c2d0c, 0xbe8e2d0e,
- 0xbf06807c, 0xbf84fff0,
+ 0x8078ff78, 0x00000080,
+ 0xbef600ff, 0x01000000,
+ 0xbefc0080, 0xe0510000,
+ 0x781d0000, 0xe0510100,
+ 0x781d0000, 0x807cff7c,
+ 0x00000200, 0x8078ff78,
+ 0x00000200, 0xbf0a6f7c,
+ 0xbf85fff6, 0xbef80080,
+ 0xbefe00c1, 0xbeff00c1,
+ 0xb8ef2a05, 0x806f816f,
+ 0x8e6f826f, 0x8e76886f,
+ 0xbef600ff, 0x01000000,
+ 0xbeee0078, 0x8078ff78,
+ 0x00000400, 0xbefc0084,
+ 0xbf11087c, 0x806fff6f,
+ 0x00008000, 0xe0524000,
+ 0x781d0000, 0xe0524100,
+ 0x781d0100, 0xe0524200,
+ 0x781d0200, 0xe0524300,
+ 0x781d0300, 0xbf8c0f70,
+ 0x7e000300, 0x7e020301,
+ 0x7e040302, 0x7e060303,
+ 0x807c847c, 0x8078ff78,
+ 0x00000400, 0xbf0a6f7c,
+ 0xbf85ffee, 0xbf9c0000,
+ 0xe0524000, 0x6e1d0000,
+ 0xe0524100, 0x6e1d0100,
+ 0xe0524200, 0x6e1d0200,
+ 0xe0524300, 0x6e1d0300,
0xb8f82a05, 0x80788178,
0x8e788a78, 0xb8ee1605,
0x806e816e, 0x8e6e866e,
- 0x80786e78, 0xbef60084,
+ 0x80786e78, 0x80f8c078,
+ 0xb8ef1605, 0x806f816f,
+ 0x8e6f846f, 0x8e76826f,
0xbef600ff, 0x01000000,
- 0xc0211bfa, 0x00000078,
- 0x80788478, 0xc0211b3a,
+ 0xbefc006f, 0xc031003a,
+ 0x00000078, 0x80f8c078,
+ 0xbf8cc07f, 0x80fc907c,
+ 0xbf800000, 0xbe802d00,
+ 0xbe822d02, 0xbe842d04,
+ 0xbe862d06, 0xbe882d08,
+ 0xbe8a2d0a, 0xbe8c2d0c,
+ 0xbe8e2d0e, 0xbf06807c,
+ 0xbf84fff0, 0xb8f82a05,
+ 0x80788178, 0x8e788a78,
+ 0xb8ee1605, 0x806e816e,
+ 0x8e6e866e, 0x80786e78,
+ 0xbef60084, 0xbef600ff,
+ 0x01000000, 0xc0211bfa,
0x00000078, 0x80788478,
- 0xc0211b7a, 0x00000078,
- 0x80788478, 0xc0211eba,
+ 0xc0211b3a, 0x00000078,
+ 0x80788478, 0xc0211b7a,
0x00000078, 0x80788478,
- 0xc0211efa, 0x00000078,
- 0x80788478, 0xc0211c3a,
+ 0xc0211c3a, 0x00000078,
+ 0x80788478, 0xc0211c7a,
0x00000078, 0x80788478,
- 0xc0211c7a, 0x00000078,
- 0x80788478, 0xc0211a3a,
+ 0xc0211eba, 0x00000078,
+ 0x80788478, 0xc0211efa,
0x00000078, 0x80788478,
- 0xc0211a7a, 0x00000078,
- 0x80788478, 0xc0211cfa,
+ 0xc0211a3a, 0x00000078,
+ 0x80788478, 0xc0211a7a,
0x00000078, 0x80788478,
- 0xbf8cc07f, 0xbefc006f,
- 0xbefe007a, 0xbeff007b,
- 0x866f71ff, 0x000003ff,
- 0xb96f4803, 0x866f71ff,
- 0xfffff800, 0x8f6f8b6f,
- 0xb96fa2c3, 0xb973f801,
- 0xb8ee2a05, 0x806e816e,
- 0x8e6e8a6e, 0xb8ef1605,
- 0x806f816f, 0x8e6f866f,
- 0x806e6f6e, 0x806e746e,
- 0x826f8075, 0x866fff6f,
- 0x0000ffff, 0xc0071cb7,
- 0x00000040, 0xc00b1d37,
- 0x00000048, 0xc0031e77,
- 0x00000058, 0xc0071eb7,
- 0x0000005c, 0xbf8cc07f,
- 0x866fff6d, 0xf0000000,
- 0x8f6f9c6f, 0x8e6f906f,
- 0xbeee0080, 0x876e6f6e,
- 0x866fff6d, 0x08000000,
- 0x8f6f9b6f, 0x8e6f8f6f,
- 0x876e6f6e, 0x866fff70,
- 0x00800000, 0x8f6f976f,
- 0xb96ef807, 0x866dff6d,
- 0x0000ffff, 0x86fe7e7e,
- 0x86ea6a6a, 0x8f6e8370,
- 0xb96ee0c2, 0xbf800002,
- 0xb9700002, 0xbf8a0000,
- 0x95806f6c, 0xbf810000,
+ 0xc0211cfa, 0x00000078,
+ 0x80788478, 0xbf8cc07f,
+ 0xbefc006f, 0xbefe0070,
+ 0xbeff0071, 0x866f7bff,
+ 0x000003ff, 0xb96f4803,
+ 0x866f7bff, 0xfffff800,
+ 0x8f6f8b6f, 0xb96fa2c3,
+ 0xb973f801, 0xb8ee2a05,
+ 0x806e816e, 0x8e6e8a6e,
+ 0xb8ef1605, 0x806f816f,
+ 0x8e6f866f, 0x806e6f6e,
+ 0x806e746e, 0x826f8075,
+ 0x866fff6f, 0x0000ffff,
+ 0xc00b1c37, 0x00000050,
+ 0xc00b1d37, 0x00000060,
+ 0xc0031e77, 0x00000074,
+ 0xbf8cc07f, 0x866fff6d,
+ 0xf8000000, 0x8f6f9b6f,
+ 0x8e6f906f, 0xbeee0080,
+ 0x876e6f6e, 0x866fff6d,
+ 0x04000000, 0x8f6f9a6f,
+ 0x8e6f8f6f, 0x876e6f6e,
+ 0x866fff7a, 0x00800000,
+ 0x8f6f976f, 0xb96ef807,
+ 0x866dff6d, 0x0000ffff,
+ 0x86fe7e7e, 0x86ea6a6a,
+ 0x8f6e837a, 0xb96ee0c2,
+ 0xbf800002, 0xb97a0002,
+ 0xbf8a0000, 0x95806f6c,
+ 0xbf810000, 0x00000000,
+};
+
+static const uint32_t cwsr_trap_gfx10_hex[] = {
+ 0xbf820001, 0xbf82012e,
+ 0xb0804004, 0xb970f802,
+ 0x8a708670, 0xb971f803,
+ 0x8771ff71, 0x00000400,
+ 0xbf850008, 0xb971f803,
+ 0x8771ff71, 0x000001ff,
+ 0xbf850001, 0x806c846c,
+ 0x876dff6d, 0x0000ffff,
+ 0xbe80226c, 0xb971f803,
+ 0x8771ff71, 0x00000100,
+ 0xbf840006, 0xbef60380,
+ 0xb9f60203, 0x876dff6d,
+ 0x0000ffff, 0x80ec886c,
+ 0x82ed806d, 0xbef60380,
+ 0xb9f60283, 0xb973f816,
+ 0xb9762c07, 0x8f769c76,
+ 0x886d766d, 0xb97603c7,
+ 0x8f769b76, 0x886d766d,
+ 0xb976f807, 0x8776ff76,
+ 0x00007fff, 0xb9f6f807,
+ 0xbeee037e, 0xbeef037f,
+ 0xbefe0480, 0xbf900004,
+ 0xbf8e0002, 0xbf88fffe,
+ 0xbef4037e, 0x8775ff7f,
+ 0x0000ffff, 0x8875ff75,
+ 0x00040000, 0xbef60380,
+ 0xbef703ff, 0x00807fac,
+ 0x8776ff7f, 0x08000000,
+ 0x90768376, 0x88777677,
+ 0x8776ff7f, 0x70000000,
+ 0x90768176, 0x88777677,
+ 0xbefb037c, 0xbefa0380,
+ 0xb97202dc, 0x8872727f,
+ 0xbefe03c1, 0x877c8172,
+ 0xbf06817c, 0xbf850002,
+ 0xbeff0380, 0xbf820001,
+ 0xbeff03c1, 0xb9712a05,
+ 0x80718171, 0x8f718271,
+ 0x877c8172, 0xbf06817c,
+ 0xbf85000d, 0x8f768771,
+ 0xbef603ff, 0x01000000,
+ 0xbefc0380, 0x7e008700,
+ 0xe0704000, 0x7a5d0000,
+ 0x807c817c, 0x807aff7a,
+ 0x00000080, 0xbf0a717c,
+ 0xbf85fff8, 0xbf82001b,
+ 0x8f768871, 0xbef603ff,
+ 0x01000000, 0xbefc0380,
+ 0x7e008700, 0xe0704000,
+ 0x7a5d0000, 0x807c817c,
+ 0x807aff7a, 0x00000100,
+ 0xbf0a717c, 0xbf85fff8,
+ 0xb9711e06, 0x8771c171,
+ 0xbf84000c, 0x8f718371,
+ 0x80717c71, 0xbefe03c1,
+ 0xbeff0380, 0x7e008700,
+ 0xe0704000, 0x7a5d0000,
+ 0x807c817c, 0x807aff7a,
+ 0x00000080, 0xbf0a717c,
+ 0xbf85fff8, 0xbf8a0000,
+ 0x8776ff72, 0x04000000,
+ 0xbf84002b, 0xbefe03c1,
+ 0x877c8172, 0xbf06817c,
+ 0xbf850002, 0xbeff0380,
+ 0xbf820001, 0xbeff03c1,
+ 0xb9714306, 0x8771c171,
+ 0xbf840021, 0x8f718671,
+ 0x8f718271, 0xbef60371,
+ 0xbef603ff, 0x01000000,
+ 0xd7650000, 0x000100c1,
+ 0xd7660000, 0x000200c1,
+ 0x16000084, 0x877c8172,
+ 0xbf06817c, 0xbefc0380,
+ 0xbf85000a, 0x807cff7c,
+ 0x00000080, 0x807aff7a,
+ 0x00000080, 0xd5250000,
+ 0x0001ff00, 0x00000080,
+ 0xbf0a717c, 0xbf85fff7,
+ 0xbf820009, 0x807cff7c,
+ 0x00000100, 0x807aff7a,
+ 0x00000100, 0xd5250000,
+ 0x0001ff00, 0x00000100,
+ 0xbf0a717c, 0xbf85fff7,
+ 0x877c8172, 0xbf06817c,
+ 0xbf850003, 0x8f7687ff,
+ 0x0000006a, 0xbf820002,
+ 0x8f7688ff, 0x0000006a,
+ 0xbef603ff, 0x01000000,
+ 0x877c8172, 0xbf06817c,
+ 0xbefc0380, 0xbf800000,
+ 0xbf85000b, 0xbe802e00,
+ 0x7e000200, 0xe0704000,
+ 0x7a5d0000, 0x807aff7a,
+ 0x00000080, 0x807c817c,
+ 0xbf0aff7c, 0x0000006a,
+ 0xbf85fff6, 0xbf82000a,
+ 0xbe802e00, 0x7e000200,
+ 0xe0704000, 0x7a5d0000,
+ 0x807aff7a, 0x00000100,
+ 0x807c817c, 0xbf0aff7c,
+ 0x0000006a, 0xbf85fff6,
+ 0xbef60384, 0xbef603ff,
+ 0x01000000, 0x877c8172,
+ 0xbf06817c, 0xbf850030,
+ 0x7e00027b, 0xe0704000,
+ 0x7a5d0000, 0x807aff7a,
+ 0x00000080, 0x7e00026c,
+ 0xe0704000, 0x7a5d0000,
+ 0x807aff7a, 0x00000080,
+ 0x7e00026d, 0xe0704000,
+ 0x7a5d0000, 0x807aff7a,
+ 0x00000080, 0x7e00026e,
+ 0xe0704000, 0x7a5d0000,
+ 0x807aff7a, 0x00000080,
+ 0x7e00026f, 0xe0704000,
+ 0x7a5d0000, 0x807aff7a,
+ 0x00000080, 0x7e000270,
+ 0xe0704000, 0x7a5d0000,
+ 0x807aff7a, 0x00000080,
+ 0xb971f803, 0x7e000271,
+ 0xe0704000, 0x7a5d0000,
+ 0x807aff7a, 0x00000080,
+ 0x7e000273, 0xe0704000,
+ 0x7a5d0000, 0x807aff7a,
+ 0x00000080, 0xb97bf801,
+ 0x7e00027b, 0xe0704000,
+ 0x7a5d0000, 0x807aff7a,
+ 0x00000080, 0xbf82002f,
+ 0x7e00027b, 0xe0704000,
+ 0x7a5d0000, 0x807aff7a,
+ 0x00000100, 0x7e00026c,
+ 0xe0704000, 0x7a5d0000,
+ 0x807aff7a, 0x00000100,
+ 0x7e00026d, 0xe0704000,
+ 0x7a5d0000, 0x807aff7a,
+ 0x00000100, 0x7e00026e,
+ 0xe0704000, 0x7a5d0000,
+ 0x807aff7a, 0x00000100,
+ 0x7e00026f, 0xe0704000,
+ 0x7a5d0000, 0x807aff7a,
+ 0x00000100, 0x7e000270,
+ 0xe0704000, 0x7a5d0000,
+ 0x807aff7a, 0x00000100,
+ 0xb971f803, 0x7e000271,
+ 0xe0704000, 0x7a5d0000,
+ 0x807aff7a, 0x00000100,
+ 0x7e000273, 0xe0704000,
+ 0x7a5d0000, 0x807aff7a,
+ 0x00000100, 0xb97bf801,
+ 0x7e00027b, 0xe0704000,
+ 0x7a5d0000, 0x807aff7a,
+ 0x00000100, 0xbf820119,
+ 0xbef4037e, 0x8775ff7f,
+ 0x0000ffff, 0x8875ff75,
+ 0x00040000, 0xbef60380,
+ 0xbef703ff, 0x00807fac,
+ 0x8772ff7f, 0x08000000,
+ 0x90728372, 0x88777277,
+ 0x8772ff7f, 0x70000000,
+ 0x90728172, 0x88777277,
+ 0xb97902dc, 0x8879797f,
+ 0xbef80380, 0xbefe03c1,
+ 0x877c8179, 0xbf06817c,
+ 0xbf850002, 0xbeff0380,
+ 0xbf820001, 0xbeff03c1,
+ 0xb96f2a05, 0x806f816f,
+ 0x8f6f826f, 0x877c8179,
+ 0xbf06817c, 0xbf850013,
+ 0x8f76876f, 0xbef603ff,
+ 0x01000000, 0xbef20378,
+ 0x8078ff78, 0x00000080,
+ 0xbefc0381, 0xe0304000,
+ 0x785d0000, 0xbf8c3f70,
+ 0x7e008500, 0x807c817c,
+ 0x8078ff78, 0x00000080,
+ 0xbf0a6f7c, 0xbf85fff7,
+ 0xe0304000, 0x725d0000,
+ 0xbf820023, 0x8f76886f,
+ 0xbef603ff, 0x01000000,
+ 0xbef20378, 0x8078ff78,
+ 0x00000100, 0xbefc0381,
+ 0xe0304000, 0x785d0000,
+ 0xbf8c3f70, 0x7e008500,
+ 0x807c817c, 0x8078ff78,
+ 0x00000100, 0xbf0a6f7c,
+ 0xbf85fff7, 0xb96f1e06,
+ 0x876fc16f, 0xbf84000e,
+ 0x8f6f836f, 0x806f7c6f,
+ 0xbefe03c1, 0xbeff0380,
+ 0xe0304000, 0x785d0000,
+ 0xbf8c3f70, 0x7e008500,
+ 0x807c817c, 0x8078ff78,
+ 0x00000080, 0xbf0a6f7c,
+ 0xbf85fff7, 0xbeff03c1,
+ 0xe0304000, 0x725d0000,
+ 0x8772ff79, 0x04000000,
+ 0xbf840020, 0xbefe03c1,
+ 0x877c8179, 0xbf06817c,
+ 0xbf850002, 0xbeff0380,
+ 0xbf820001, 0xbeff03c1,
+ 0xb96f4306, 0x876fc16f,
+ 0xbf840016, 0x8f6f866f,
+ 0x8f6f826f, 0xbef6036f,
+ 0xbef603ff, 0x01000000,
+ 0x877c8172, 0xbf06817c,
+ 0xbefc0380, 0xbf850007,
+ 0x807cff7c, 0x00000080,
+ 0x8078ff78, 0x00000080,
+ 0xbf0a6f7c, 0xbf85fffa,
+ 0xbf820006, 0x807cff7c,
+ 0x00000100, 0x8078ff78,
+ 0x00000100, 0xbf0a6f7c,
+ 0xbf85fffa, 0x877c8179,
+ 0xbf06817c, 0xbf850003,
+ 0x8f7687ff, 0x0000006a,
+ 0xbf820002, 0x8f7688ff,
+ 0x0000006a, 0xbef603ff,
+ 0x01000000, 0x877c8179,
+ 0xbf06817c, 0xbf850012,
+ 0xf4211cba, 0xf0000000,
+ 0x8078ff78, 0x00000080,
+ 0xbefc0381, 0xf421003a,
+ 0xf0000000, 0x8078ff78,
+ 0x00000080, 0xbf8cc07f,
+ 0xbe803000, 0xbf800000,
+ 0x807c817c, 0xbf0aff7c,
+ 0x0000006a, 0xbf85fff5,
+ 0xbe800372, 0xbf820011,
+ 0xf4211cba, 0xf0000000,
+ 0x8078ff78, 0x00000100,
+ 0xbefc0381, 0xf421003a,
+ 0xf0000000, 0x8078ff78,
+ 0x00000100, 0xbf8cc07f,
+ 0xbe803000, 0xbf800000,
+ 0x807c817c, 0xbf0aff7c,
+ 0x0000006a, 0xbf85fff5,
+ 0xbe800372, 0xbef60384,
+ 0xbef603ff, 0x01000000,
+ 0x877c8179, 0xbf06817c,
+ 0xbf850025, 0xf4211bfa,
+ 0xf0000000, 0x8078ff78,
+ 0x00000080, 0xf4211b3a,
+ 0xf0000000, 0x8078ff78,
+ 0x00000080, 0xf4211b7a,
+ 0xf0000000, 0x8078ff78,
+ 0x00000080, 0xf4211eba,
+ 0xf0000000, 0x8078ff78,
+ 0x00000080, 0xf4211efa,
+ 0xf0000000, 0x8078ff78,
+ 0x00000080, 0xf4211c3a,
+ 0xf0000000, 0x8078ff78,
+ 0x00000080, 0xf4211c7a,
+ 0xf0000000, 0x8078ff78,
+ 0x00000080, 0xf4211cfa,
+ 0xf0000000, 0x8078ff78,
+ 0x00000080, 0xf4211e7a,
+ 0xf0000000, 0x8078ff78,
+ 0x00000080, 0xbf820024,
+ 0xf4211bfa, 0xf0000000,
+ 0x8078ff78, 0x00000100,
+ 0xf4211b3a, 0xf0000000,
+ 0x8078ff78, 0x00000100,
+ 0xf4211b7a, 0xf0000000,
+ 0x8078ff78, 0x00000100,
+ 0xf4211eba, 0xf0000000,
+ 0x8078ff78, 0x00000100,
+ 0xf4211efa, 0xf0000000,
+ 0x8078ff78, 0x00000100,
+ 0xf4211c3a, 0xf0000000,
+ 0x8078ff78, 0x00000100,
+ 0xf4211c7a, 0xf0000000,
+ 0x8078ff78, 0x00000100,
+ 0xf4211cfa, 0xf0000000,
+ 0x8078ff78, 0x00000100,
+ 0xf4211e7a, 0xf0000000,
+ 0x8078ff78, 0x00000100,
+ 0xbf8cc07f, 0x876dff6d,
+ 0x0000ffff, 0xbefc036f,
+ 0xbefe037a, 0xbeff037b,
+ 0x876f71ff, 0x000003ff,
+ 0xb9ef4803, 0xb9f3f816,
+ 0x876f71ff, 0xfffff800,
+ 0x906f8b6f, 0xb9efa2c3,
+ 0xb9f9f801, 0x876fff6d,
+ 0xf0000000, 0x906f9c6f,
+ 0x8f6f906f, 0xbef20380,
+ 0x88726f72, 0x876fff6d,
+ 0x08000000, 0x906f9b6f,
+ 0x8f6f8f6f, 0x88726f72,
+ 0x876fff70, 0x00800000,
+ 0x906f976f, 0xb9f2f807,
+ 0xb9f0f802, 0xbf8a0000,
+ 0xbe80226c, 0xbf810000,
+ 0xbf9f0000, 0xbf9f0000,
+ 0xbf9f0000, 0xbf9f0000,
+ 0xbf9f0000, 0x00000000,
};
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
new file mode 100644
index 000000000000..f20e463e748b
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
@@ -0,0 +1,1124 @@
+/*
+ * Copyright 2018 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+
+shader main
+
+asic(DEFAULT)
+
+type(CS)
+
+wave_size(32)
+/*************************************************************************/
+/* control on how to run the shader */
+/*************************************************************************/
+//any hack that needs to be made to run this code in EMU (either becasue various EMU code are not ready or no compute save & restore in EMU run)
+var EMU_RUN_HACK = 0
+var EMU_RUN_HACK_RESTORE_NORMAL = 0
+var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0
+var EMU_RUN_HACK_SAVE_SINGLE_WAVE = 0
+var EMU_RUN_HACK_SAVE_FIRST_TIME = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK
+var SAVE_LDS = 0
+var WG_BASE_ADDR_LO = 0x9000a000
+var WG_BASE_ADDR_HI = 0x0
+var WAVE_SPACE = 0x9000 //memory size that each wave occupies in workgroup state mem, increase from 5000 to 9000 for more SGPR need to be saved
+var CTX_SAVE_CONTROL = 0x0
+var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL
+var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either becasue various RTL code are not ready or no compute save & restore in RTL run)
+var SGPR_SAVE_USE_SQC = 0 //use SQC D$ to do the write
+var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //need to change BUF_DATA_FORMAT in S_SAVE_BUF_RSRC_WORD3_MISC from 0 to BUF_DATA_FORMAT_32 if set to 1 (i.e. 0x00827FAC)
+var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing
+var SAVE_RESTORE_HWID_DDID = 0
+var RESTORE_DDID_IN_SGPR18 = 0
+/**************************************************************************/
+/* variables */
+/**************************************************************************/
+var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23
+var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000
+var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006
+
+var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12
+var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9
+var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8
+var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6
+var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24
+var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 4 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits
+var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT = 24
+var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE = 4
+var SQ_WAVE_IB_STS2_WAVE64_SHIFT = 11
+var SQ_WAVE_IB_STS2_WAVE64_SIZE = 1
+
+var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400
+var SQ_WAVE_TRAPSTS_EXCE_MASK = 0x1FF // Exception mask
+var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10
+var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100
+var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8
+var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF
+var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0
+var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10
+var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800
+var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11
+var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21
+
+var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME
+var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME
+var SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE = 1 //FIXME
+var SQ_WAVE_IB_STS_RCNT_SIZE = 6 //FIXME
+var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF //FIXME
+
+var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24
+var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27
+
+
+/* Save */
+var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 //stride is 4 bytes
+var S_SAVE_BUF_RSRC_WORD3_MISC = 0x00807FAC //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE
+
+var S_SAVE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit
+var S_SAVE_SPI_INIT_ATC_SHIFT = 27
+var S_SAVE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype
+var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28
+var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG
+var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26
+
+var S_SAVE_PC_HI_RCNT_SHIFT = 28 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used
+var S_SAVE_PC_HI_RCNT_MASK = 0xF0000000 //FIXME
+var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 27 //FIXME
+var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME
+
+var s_save_spi_init_lo = exec_lo
+var s_save_spi_init_hi = exec_hi
+
+var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3¡¯h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]}
+var s_save_pc_hi = ttmp1
+var s_save_exec_lo = ttmp2
+var s_save_exec_hi = ttmp3
+var s_save_status = ttmp4
+var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine
+var s_wave_size = ttmp6 //ttmp6 is not needed now, since it's only 32bit xnack mask, now use it to determine wave32 or wave64 in EMU_HACK
+var s_save_xnack_mask = ttmp7
+var s_save_buf_rsrc0 = ttmp8
+var s_save_buf_rsrc1 = ttmp9
+var s_save_buf_rsrc2 = ttmp10
+var s_save_buf_rsrc3 = ttmp11
+
+var s_save_mem_offset = ttmp14
+var s_sgpr_save_num = 106 //in gfx10, all sgpr must be saved
+var s_save_alloc_size = s_save_trapsts //conflict
+var s_save_tmp = s_save_buf_rsrc2 //shared with s_save_buf_rsrc2 (conflict: should not use mem access with s_save_tmp at the same time)
+var s_save_m0 = ttmp15
+
+/* Restore */
+var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE
+var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC
+
+var S_RESTORE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit
+var S_RESTORE_SPI_INIT_ATC_SHIFT = 27
+var S_RESTORE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype
+var S_RESTORE_SPI_INIT_MTYPE_SHIFT = 28
+var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG
+var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26
+
+var S_RESTORE_PC_HI_RCNT_SHIFT = S_SAVE_PC_HI_RCNT_SHIFT
+var S_RESTORE_PC_HI_RCNT_MASK = S_SAVE_PC_HI_RCNT_MASK
+var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
+var S_RESTORE_PC_HI_FIRST_REPLAY_MASK = S_SAVE_PC_HI_FIRST_REPLAY_MASK
+
+var s_restore_spi_init_lo = exec_lo
+var s_restore_spi_init_hi = exec_hi
+
+var s_restore_mem_offset = ttmp12
+var s_restore_alloc_size = ttmp3
+var s_restore_tmp = ttmp6
+var s_restore_mem_offset_save = s_restore_tmp //no conflict
+
+var s_restore_m0 = s_restore_alloc_size //no conflict
+
+var s_restore_mode = ttmp13
+var s_restore_hwid1 = ttmp2
+var s_restore_ddid = s_restore_hwid1
+var s_restore_pc_lo = ttmp0
+var s_restore_pc_hi = ttmp1
+var s_restore_exec_lo = ttmp14
+var s_restore_exec_hi = ttmp15
+var s_restore_status = ttmp4
+var s_restore_trapsts = ttmp5
+//var s_restore_xnack_mask_lo = xnack_mask_lo
+//var s_restore_xnack_mask_hi = xnack_mask_hi
+var s_restore_xnack_mask = ttmp7
+var s_restore_buf_rsrc0 = ttmp8
+var s_restore_buf_rsrc1 = ttmp9
+var s_restore_buf_rsrc2 = ttmp10
+var s_restore_buf_rsrc3 = ttmp11
+var s_restore_size = ttmp13 //ttmp13 has no conflict
+
+/**************************************************************************/
+/* trap handler entry points */
+/**************************************************************************/
+ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) //hack to use trap_id for determining save/restore
+ //FIXME VCCZ un-init assertion s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC
+ s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000 //change SCC
+ s_cmp_eq_u32 s_save_tmp, 0x007e0000 //Save: trap_id = 0x7e. Restore: trap_id = 0x7f.
+ s_cbranch_scc0 L_JUMP_TO_RESTORE //do not need to recover STATUS here since we are going to RESTORE
+ //FIXME s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //need to recover STATUS since we are going to SAVE
+ s_branch L_SKIP_RESTORE //NOT restore, SAVE actually
+ else
+ s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save
+ end
+
+L_JUMP_TO_RESTORE:
+ s_branch L_RESTORE //restore
+
+L_SKIP_RESTORE:
+
+ s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC
+ s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK //check whether this is for save
+ s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
+ s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save
+ s_cbranch_scc1 L_SAVE //this is the operation for save
+
+ // ********* Handle non-CWSR traps *******************
+ if (!EMU_RUN_HACK)
+ s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
+ s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK // Check whether it is an exception
+ s_cbranch_scc1 L_EXCP_CASE // Exception, jump back to the shader program directly.
+ s_add_u32 ttmp0, ttmp0, 4 // S_TRAP case, add 4 to ttmp0
+
+ L_EXCP_CASE:
+ s_and_b32 ttmp1, ttmp1, 0xFFFF
+ s_rfe_b64 [ttmp0, ttmp1]
+ end
+ // ********* End handling of non-CWSR traps *******************
+
+/**************************************************************************/
+/* save routine */
+/**************************************************************************/
+
+L_SAVE:
+
+ //check whether there is mem_viol
+ s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
+ s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK
+ s_cbranch_scc0 L_NO_PC_REWIND
+
+ //if so, need rewind PC assuming GDS operation gets NACKed
+ s_mov_b32 s_save_tmp, 0 //clear mem_viol bit
+ s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp //clear mem_viol bit
+ s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32]
+ s_sub_u32 s_save_pc_lo, s_save_pc_lo, 8 //pc[31:0]-8
+ s_subb_u32 s_save_pc_hi, s_save_pc_hi, 0x0 // -scc
+
+L_NO_PC_REWIND:
+ s_mov_b32 s_save_tmp, 0 //clear saveCtx bit
+ s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit
+
+ //s_mov_b32 s_save_xnack_mask_lo, xnack_mask_lo //save XNACK_MASK
+ //s_mov_b32 s_save_xnack_mask_hi, xnack_mask_hi
+ s_getreg_b32 s_save_xnack_mask, hwreg(HW_REG_SHADER_XNACK_MASK)
+ s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) //save RCNT
+ s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT
+ s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
+ s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE) //save FIRST_REPLAY
+ s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
+ s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
+ s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS) //clear RCNT and FIRST_REPLAY in IB_STS
+ s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG
+
+ s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp
+
+ /* inform SPI the readiness and wait for SPI's go signal */
+ s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI
+ s_mov_b32 s_save_exec_hi, exec_hi
+ s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive
+ if (EMU_RUN_HACK)
+
+ else
+ s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC
+ end
+
+ L_SLEEP:
+ s_sleep 0x2
+
+ if (EMU_RUN_HACK)
+
+ else
+ s_cbranch_execz L_SLEEP
+ end
+
+
+ /* setup Resource Contants */
+ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE))
+ //calculate wd_addr using absolute thread id
+ v_readlane_b32 s_save_tmp, v9, 0
+ //determine it is wave32 or wave64
+ s_getreg_b32 s_wave_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE)
+ s_cmp_eq_u32 s_wave_size, 0
+ s_cbranch_scc1 L_SAVE_WAVE32
+ s_lshr_b32 s_save_tmp, s_save_tmp, 6 //SAVE WAVE64
+ s_branch L_SAVE_CON
+ L_SAVE_WAVE32:
+ s_lshr_b32 s_save_tmp, s_save_tmp, 5 //SAVE WAVE32
+ L_SAVE_CON:
+ s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE
+ s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
+ s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
+ s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
+ else
+ end
+ if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE))
+ s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
+ s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
+ s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
+ else
+ end
+
+
+ s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo
+ s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi
+ s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE
+ s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) although not neccessarily inited
+ s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC
+ s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK
+ s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position
+ s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or ATC
+ s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK
+ s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position
+ s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or MTYPE
+
+ s_mov_b32 s_save_m0, m0 //save M0
+
+ /* global mem offset */
+ s_mov_b32 s_save_mem_offset, 0x0 //mem offset initial value = 0
+ s_getreg_b32 s_wave_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE) //get wave_save_size
+ s_or_b32 s_wave_size, s_save_spi_init_hi, s_wave_size //share s_wave_size with exec_hi
+
+ /* save VGPRs */
+ //////////////////////////////
+ L_SAVE_VGPR:
+
+ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
+ s_and_b32 m0, s_wave_size, 1
+ s_cmp_eq_u32 m0, 1
+ s_cbranch_scc1 L_ENABLE_SAVE_VGPR_EXEC_HI
+ s_mov_b32 exec_hi, 0x00000000
+ s_branch L_SAVE_VGPR_NORMAL
+ L_ENABLE_SAVE_VGPR_EXEC_HI:
+ s_mov_b32 exec_hi, 0xFFFFFFFF
+ L_SAVE_VGPR_NORMAL:
+ s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size
+ //for wave32 and wave64, the num of vgpr function is the same?
+ s_add_u32 s_save_alloc_size, s_save_alloc_size, 1
+ s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) //FIXME for GFX, zero is possible
+ //determine it is wave32 or wave64
+ s_and_b32 m0, s_wave_size, 1
+ s_cmp_eq_u32 m0, 1
+ s_cbranch_scc1 L_SAVE_VGPR_WAVE64
+
+ //zhenxu added it for save vgpr for wave32
+ s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 7 //NUM_RECORDS in bytes (32 threads*4)
+ if (SWIZZLE_EN)
+ s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+ else
+ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+ end
+
+ s_mov_b32 m0, 0x0 //VGPR initial index value =0
+ //s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1
+ //s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later, doesn't need this in gfx10
+
+ L_SAVE_VGPR_WAVE32_LOOP:
+ v_movrels_b32 v0, v0 //v0 = v[0+m0]
+
+ if(USE_MTBUF_INSTEAD_OF_MUBUF)
+ tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
+ else
+ buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+ end
+
+ s_add_u32 m0, m0, 1 //next vgpr index
+ s_add_u32 s_save_mem_offset, s_save_mem_offset, 128 //every buffer_store_dword does 128 bytes
+ s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
+ s_cbranch_scc1 L_SAVE_VGPR_WAVE32_LOOP //VGPR save is complete?
+ s_branch L_SAVE_LDS
+ //save vgpr for wave32 ends
+
+ L_SAVE_VGPR_WAVE64:
+ s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4)
+ if (SWIZZLE_EN)
+ s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+ else
+ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+ end
+
+ s_mov_b32 m0, 0x0 //VGPR initial index value =0
+ //s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1
+ //s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later, doesn't need this in gfx10
+
+ L_SAVE_VGPR_WAVE64_LOOP:
+ v_movrels_b32 v0, v0 //v0 = v[0+m0]
+
+ if(USE_MTBUF_INSTEAD_OF_MUBUF)
+ tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
+ else
+ buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+ end
+
+ s_add_u32 m0, m0, 1 //next vgpr index
+ s_add_u32 s_save_mem_offset, s_save_mem_offset, 256 //every buffer_store_dword does 256 bytes
+ s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
+ s_cbranch_scc1 L_SAVE_VGPR_WAVE64_LOOP //VGPR save is complete?
+ //s_set_gpr_idx_off
+ //
+ //Below part will be the save shared vgpr part (new for gfx10)
+ s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE) //shared_vgpr_size
+ s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //shared_vgpr_size is zero?
+ s_cbranch_scc0 L_SAVE_LDS //no shared_vgpr used? jump to L_SAVE_LDS
+ s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 3 //Number of SHARED_VGPRs = shared_vgpr_size * 8 (non-zero value)
+ //m0 now has the value of normal vgpr count, just add the m0 with shared_vgpr count to get the total count.
+ //save shared_vgpr will start from the index of m0
+ s_add_u32 s_save_alloc_size, s_save_alloc_size, m0
+ s_mov_b32 exec_lo, 0xFFFFFFFF
+ s_mov_b32 exec_hi, 0x00000000
+ L_SAVE_SHARED_VGPR_WAVE64_LOOP:
+ v_movrels_b32 v0, v0 //v0 = v[0+m0]
+ buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+ s_add_u32 m0, m0, 1 //next vgpr index
+ s_add_u32 s_save_mem_offset, s_save_mem_offset, 128 //every buffer_store_dword does 256 bytes
+ s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
+ s_cbranch_scc1 L_SAVE_SHARED_VGPR_WAVE64_LOOP //SHARED_VGPR save is complete?
+
+ /* save LDS */
+ //////////////////////////////
+ L_SAVE_LDS:
+
+ //Only check the first wave need LDS
+ /* the first wave in the threadgroup */
+ s_barrier //FIXME not performance-optimal "LDS is used? wait for other waves in the same TG"
+ s_and_b32 s_save_tmp, s_wave_size, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here
+ s_cbranch_scc0 L_SAVE_SGPR
+
+ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
+ s_and_b32 m0, s_wave_size, 1
+ s_cmp_eq_u32 m0, 1
+ s_cbranch_scc1 L_ENABLE_SAVE_LDS_EXEC_HI
+ s_mov_b32 exec_hi, 0x00000000
+ s_branch L_SAVE_LDS_NORMAL
+ L_ENABLE_SAVE_LDS_EXEC_HI:
+ s_mov_b32 exec_hi, 0xFFFFFFFF
+ L_SAVE_LDS_NORMAL:
+ s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size
+ s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero?
+ s_cbranch_scc0 L_SAVE_SGPR //no lds used? jump to L_SAVE_VGPR
+ s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw
+ s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes
+ s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes
+ if (SWIZZLE_EN)
+ s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+ else
+ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+ end
+
+ //load 0~63*4(byte address) to vgpr v15
+ v_mbcnt_lo_u32_b32 v0, -1, 0
+ v_mbcnt_hi_u32_b32 v0, -1, v0
+ v_mul_u32_u24 v0, 4, v0
+
+ s_and_b32 m0, s_wave_size, 1
+ s_cmp_eq_u32 m0, 1
+ s_mov_b32 m0, 0x0
+ s_cbranch_scc1 L_SAVE_LDS_LOOP_W64
+
+ L_SAVE_LDS_LOOP_W32:
+ if (SAVE_LDS)
+ ds_read_b32 v1, v0
+ s_waitcnt 0 //ensure data ready
+ buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+ //buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 //save lds to memory doesn't exist in 10
+ end
+ s_add_u32 m0, m0, 128 //every buffer_store_lds does 128 bytes
+ s_add_u32 s_save_mem_offset, s_save_mem_offset, 128 //mem offset increased by 128 bytes
+ v_add_nc_u32 v0, v0, 128
+ s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0
+ s_cbranch_scc1 L_SAVE_LDS_LOOP_W32 //LDS save is complete?
+ s_branch L_SAVE_SGPR
+
+ L_SAVE_LDS_LOOP_W64:
+ if (SAVE_LDS)
+ ds_read_b32 v1, v0
+ s_waitcnt 0 //ensure data ready
+ buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+ //buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 //save lds to memory doesn't exist in 10
+ end
+ s_add_u32 m0, m0, 256 //every buffer_store_lds does 256 bytes
+ s_add_u32 s_save_mem_offset, s_save_mem_offset, 256 //mem offset increased by 256 bytes
+ v_add_nc_u32 v0, v0, 256
+ s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0
+ s_cbranch_scc1 L_SAVE_LDS_LOOP_W64 //LDS save is complete?
+
+
+ /* save SGPRs */
+ //////////////////////////////
+ //s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size
+ //s_add_u32 s_save_alloc_size, s_save_alloc_size, 1
+ //s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value)
+ //s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 3 //In gfx10, Number of SGPRs = (sgpr_size + 1) * 8 (non-zero value)
+ L_SAVE_SGPR:
+ //need to look at it is wave32 or wave64
+ s_and_b32 m0, s_wave_size, 1
+ s_cmp_eq_u32 m0, 1
+ s_cbranch_scc1 L_SAVE_SGPR_VMEM_WAVE64
+ if (SGPR_SAVE_USE_SQC)
+ s_lshl_b32 s_save_buf_rsrc2, s_sgpr_save_num, 2 //NUM_RECORDS in bytes
+ else
+ s_lshl_b32 s_save_buf_rsrc2, s_sgpr_save_num, 7 //NUM_RECORDS in bytes (32 threads)
+ end
+ s_branch L_SAVE_SGPR_CONT
+ L_SAVE_SGPR_VMEM_WAVE64:
+ if (SGPR_SAVE_USE_SQC)
+ s_lshl_b32 s_save_buf_rsrc2, s_sgpr_save_num, 2 //NUM_RECORDS in bytes
+ else
+ s_lshl_b32 s_save_buf_rsrc2, s_sgpr_save_num, 8 //NUM_RECORDS in bytes (64 threads)
+ end
+ L_SAVE_SGPR_CONT:
+ if (SWIZZLE_EN)
+ s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+ else
+ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+ end
+
+ //s_mov_b32 m0, 0x0 //SGPR initial index value =0
+ //s_nop 0x0 //Manually inserted wait states
+
+ s_and_b32 m0, s_wave_size, 1
+ s_cmp_eq_u32 m0, 1
+
+ s_mov_b32 m0, 0x0 //SGPR initial index value =0
+ s_nop 0x0 //Manually inserted wait states
+
+ s_cbranch_scc1 L_SAVE_SGPR_LOOP_WAVE64
+
+ L_SAVE_SGPR_LOOP_WAVE32:
+ s_movrels_b32 s0, s0 //s0 = s[0+m0]
+ //zhenxu, adding one more argument to save sgpr function, this is only for vmem, using sqc is not change
+ write_sgpr_to_mem_wave32(s0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //PV: the best performance should be using s_buffer_store_dwordx4
+ s_add_u32 m0, m0, 1 //next sgpr index
+ s_cmp_lt_u32 m0, s_sgpr_save_num //scc = (m0 < s_sgpr_save_num) ? 1 : 0
+ s_cbranch_scc1 L_SAVE_SGPR_LOOP_WAVE32 //SGPR save is complete?
+ s_branch L_SAVE_HWREG
+
+ L_SAVE_SGPR_LOOP_WAVE64:
+ s_movrels_b32 s0, s0 //s0 = s[0+m0]
+ //zhenxu, adding one more argument to save sgpr function, this is only for vmem, using sqc is not change
+ write_sgpr_to_mem_wave64(s0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //PV: the best performance should be using s_buffer_store_dwordx4
+ s_add_u32 m0, m0, 1 //next sgpr index
+ s_cmp_lt_u32 m0, s_sgpr_save_num //scc = (m0 < s_sgpr_save_num) ? 1 : 0
+ s_cbranch_scc1 L_SAVE_SGPR_LOOP_WAVE64 //SGPR save is complete?
+
+
+ /* save HW registers */
+ //////////////////////////////
+ L_SAVE_HWREG:
+ s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes
+ if (SWIZZLE_EN)
+ s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+ else
+ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+ end
+
+ s_and_b32 m0, s_wave_size, 1
+ s_cmp_eq_u32 m0, 1
+ s_cbranch_scc1 L_SAVE_HWREG_WAVE64
+
+ write_sgpr_to_mem_wave32(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //M0
+
+ if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME))
+ s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4
+ s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over
+ end
+
+ write_sgpr_to_mem_wave32(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //PC
+ write_sgpr_to_mem_wave32(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
+ write_sgpr_to_mem_wave32(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //EXEC
+ write_sgpr_to_mem_wave32(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
+ write_sgpr_to_mem_wave32(s_save_status, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //STATUS
+
+ //s_save_trapsts conflicts with s_save_alloc_size
+ s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
+ write_sgpr_to_mem_wave32(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //TRAPSTS
+
+ //write_sgpr_to_mem_wave32(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //XNACK_MASK_LO
+ write_sgpr_to_mem_wave32(s_save_xnack_mask, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //XNACK_MASK_HI
+
+ //use s_save_tmp would introduce conflict here between s_save_tmp and s_save_buf_rsrc2
+ s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE
+ write_sgpr_to_mem_wave32(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
+ if(SAVE_RESTORE_HWID_DDID)
+ s_getreg_b32 s_save_m0, hwreg(HW_REG_HW_ID1) //HW_ID1, handler records the SE/SA/WGP/SIMD/wave of the original wave
+ write_sgpr_to_mem_wave32(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
+ end
+ s_branch L_S_PGM_END_SAVED
+
+ L_SAVE_HWREG_WAVE64:
+ write_sgpr_to_mem_wave64(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //M0
+
+ if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME))
+ s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4
+ s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over
+ end
+
+ write_sgpr_to_mem_wave64(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //PC
+ write_sgpr_to_mem_wave64(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
+ write_sgpr_to_mem_wave64(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //EXEC
+ write_sgpr_to_mem_wave64(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
+ write_sgpr_to_mem_wave64(s_save_status, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //STATUS
+
+ //s_save_trapsts conflicts with s_save_alloc_size
+ s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
+ write_sgpr_to_mem_wave64(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //TRAPSTS
+
+ //write_sgpr_to_mem_wave64(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //XNACK_MASK_LO
+ write_sgpr_to_mem_wave64(s_save_xnack_mask, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //XNACK_MASK_HI
+
+ //use s_save_tmp would introduce conflict here between s_save_tmp and s_save_buf_rsrc2
+ s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE
+ write_sgpr_to_mem_wave64(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
+
+
+ if(SAVE_RESTORE_HWID_DDID)
+ s_getreg_b32 s_save_m0, hwreg(HW_REG_HW_ID1) //HW_ID1, handler records the SE/SA/WGP/SIMD/wave of the original wave
+ write_sgpr_to_mem_wave64(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
+
+ /* save DDID */
+ //////////////////////////////
+ L_SAVE_DDID:
+ //EXEC has been saved, no vector inst following
+ s_mov_b32 exec_lo, 0x80000000 //Set MSB to 1. Cleared when draw index is returned
+ s_sendmsg sendmsg(MSG_GET_DDID)
+
+ L_WAIT_DDID_LOOP:
+ s_nop 7 // sleep a bit
+ s_bitcmp0_b32 exec_lo, 31 // test to see if MSB is cleared, meaning done
+ s_cbranch_scc0 L_WAIT_DDID_LOOP
+
+ s_mov_b32 s_save_m0, exec_lo
+
+
+ s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes
+ if (SWIZZLE_EN)
+ s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+ else
+ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+ end
+ s_and_b32 m0, s_wave_size, 1
+ s_cmp_eq_u32 m0, 1
+ s_cbranch_scc1 L_SAVE_DDID_WAVE64
+
+ write_sgpr_to_mem_wave32(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
+
+ L_SAVE_DDID_WAVE64:
+ write_sgpr_to_mem_wave64(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
+
+ end
+
+ L_S_PGM_END_SAVED:
+ /* S_PGM_END_SAVED */ //FIXME graphics ONLY
+ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT))
+ s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32]
+ s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4
+ s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over
+ s_rfe_b64 s_save_pc_lo //Return to the main shader program
+ else
+ end
+
+
+ s_branch L_END_PGM
+
+
+
+/**************************************************************************/
+/* restore routine */
+/**************************************************************************/
+
+L_RESTORE:
+ /* Setup Resource Contants */
+ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
+ //calculate wd_addr using absolute thread id
+ v_readlane_b32 s_restore_tmp, v9, 0
+ //determine it is wave32 or wave64
+ s_getreg_b32 s_restore_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE) //change to ttmp13
+ s_cmp_eq_u32 s_restore_size, 0
+ s_cbranch_scc1 L_RESTORE_WAVE32
+ s_lshr_b32 s_restore_tmp, s_restore_tmp, 6 //SAVE WAVE64
+ s_branch L_RESTORE_CON
+ L_RESTORE_WAVE32:
+ s_lshr_b32 s_restore_tmp, s_restore_tmp, 5 //SAVE WAVE32
+ L_RESTORE_CON:
+ s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE
+ s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO
+ s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI
+ s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL
+ else
+ end
+
+ s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo
+ s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi
+ s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE
+ s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes)
+ s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC
+ s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK
+ s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position
+ s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or ATC
+ s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK
+ s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position
+ s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE
+ //determine it is wave32 or wave64
+ s_getreg_b32 s_restore_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE)
+ s_or_b32 s_restore_size, s_restore_spi_init_hi, s_restore_size //share s_wave_size with exec_hi
+
+ /* global mem offset */
+ s_mov_b32 s_restore_mem_offset, 0x0 //mem offset initial value = 0
+
+ /* restore VGPRs */
+ //////////////////////////////
+ L_RESTORE_VGPR:
+
+ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead
+ s_and_b32 m0, s_restore_size, 1
+ s_cmp_eq_u32 m0, 1
+ s_cbranch_scc1 L_ENABLE_RESTORE_VGPR_EXEC_HI
+ s_mov_b32 exec_hi, 0x00000000
+ s_branch L_RESTORE_VGPR_NORMAL
+ L_ENABLE_RESTORE_VGPR_EXEC_HI:
+ s_mov_b32 exec_hi, 0xFFFFFFFF
+ L_RESTORE_VGPR_NORMAL:
+ s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size
+ s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1
+ s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value)
+ //determine it is wave32 or wave64
+ s_and_b32 m0, s_restore_size, 1
+ s_cmp_eq_u32 m0, 1
+ s_cbranch_scc1 L_RESTORE_VGPR_WAVE64
+
+ s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 7 //NUM_RECORDS in bytes (32 threads*4)
+ if (SWIZZLE_EN)
+ s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+ else
+ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+ end
+
+ s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last
+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128
+ s_mov_b32 m0, 1 //VGPR initial index value = 1
+ //s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8
+ //s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later, might not need this in gfx10
+
+ L_RESTORE_VGPR_WAVE32_LOOP:
+ if(USE_MTBUF_INSTEAD_OF_MUBUF)
+ tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
+ else
+ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
+ end
+ s_waitcnt vmcnt(0) //ensure data ready
+ v_movreld_b32 v0, v0 //v[0+m0] = v0
+ s_add_u32 m0, m0, 1 //next vgpr index
+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128 //every buffer_load_dword does 128 bytes
+ s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
+ s_cbranch_scc1 L_RESTORE_VGPR_WAVE32_LOOP //VGPR restore (except v0) is complete?
+ //s_set_gpr_idx_off
+ /* VGPR restore on v0 */
+ if(USE_MTBUF_INSTEAD_OF_MUBUF)
+ tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
+ else
+ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1
+ end
+
+ s_branch L_RESTORE_LDS
+
+ L_RESTORE_VGPR_WAVE64:
+ s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4)
+ if (SWIZZLE_EN)
+ s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+ else
+ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+ end
+
+ s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last
+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256
+ s_mov_b32 m0, 1 //VGPR initial index value = 1
+ L_RESTORE_VGPR_WAVE64_LOOP:
+ if(USE_MTBUF_INSTEAD_OF_MUBUF)
+ tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
+ else
+ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
+ end
+ s_waitcnt vmcnt(0) //ensure data ready
+ v_movreld_b32 v0, v0 //v[0+m0] = v0
+ s_add_u32 m0, m0, 1 //next vgpr index
+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256 //every buffer_load_dword does 256 bytes
+ s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
+ s_cbranch_scc1 L_RESTORE_VGPR_WAVE64_LOOP //VGPR restore (except v0) is complete?
+ //s_set_gpr_idx_off
+ //
+ //Below part will be the restore shared vgpr part (new for gfx10)
+ s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE) //shared_vgpr_size
+ s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //shared_vgpr_size is zero?
+ s_cbranch_scc0 L_RESTORE_V0 //no shared_vgpr used? jump to L_SAVE_LDS
+ s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 3 //Number of SHARED_VGPRs = shared_vgpr_size * 8 (non-zero value)
+ //m0 now has the value of normal vgpr count, just add the m0 with shared_vgpr count to get the total count.
+ //restore shared_vgpr will start from the index of m0
+ s_add_u32 s_restore_alloc_size, s_restore_alloc_size, m0
+ s_mov_b32 exec_lo, 0xFFFFFFFF
+ s_mov_b32 exec_hi, 0x00000000
+ L_RESTORE_SHARED_VGPR_WAVE64_LOOP:
+ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
+ s_waitcnt vmcnt(0) //ensure data ready
+ v_movreld_b32 v0, v0 //v[0+m0] = v0
+ s_add_u32 m0, m0, 1 //next vgpr index
+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128 //every buffer_load_dword does 256 bytes
+ s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
+ s_cbranch_scc1 L_RESTORE_SHARED_VGPR_WAVE64_LOOP //VGPR restore (except v0) is complete?
+
+ s_mov_b32 exec_hi, 0xFFFFFFFF //restore back exec_hi before restoring V0!!
+
+ /* VGPR restore on v0 */
+ L_RESTORE_V0:
+ if(USE_MTBUF_INSTEAD_OF_MUBUF)
+ tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
+ else
+ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1
+ end
+
+
+ /* restore LDS */
+ //////////////////////////////
+ L_RESTORE_LDS:
+
+ //Only need to check the first wave
+ /* the first wave in the threadgroup */
+ s_and_b32 s_restore_tmp, s_restore_size, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
+ s_cbranch_scc0 L_RESTORE_SGPR
+
+ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead
+ s_and_b32 m0, s_restore_size, 1
+ s_cmp_eq_u32 m0, 1
+ s_cbranch_scc1 L_ENABLE_RESTORE_LDS_EXEC_HI
+ s_mov_b32 exec_hi, 0x00000000
+ s_branch L_RESTORE_LDS_NORMAL
+ L_ENABLE_RESTORE_LDS_EXEC_HI:
+ s_mov_b32 exec_hi, 0xFFFFFFFF
+ L_RESTORE_LDS_NORMAL:
+ s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size
+ s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero?
+ s_cbranch_scc0 L_RESTORE_SGPR //no lds used? jump to L_RESTORE_VGPR
+ s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw
+ s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes
+ s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes
+ if (SWIZZLE_EN)
+ s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+ else
+ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+ end
+
+ s_and_b32 m0, s_wave_size, 1
+ s_cmp_eq_u32 m0, 1
+ s_mov_b32 m0, 0x0
+ s_cbranch_scc1 L_RESTORE_LDS_LOOP_W64
+
+ L_RESTORE_LDS_LOOP_W32:
+ if (SAVE_LDS)
+ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1
+ s_waitcnt 0
+ end
+ s_add_u32 m0, m0, 128 //every buffer_load_dword does 256 bytes
+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128 //mem offset increased by 256 bytes
+ s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0
+ s_cbranch_scc1 L_RESTORE_LDS_LOOP_W32 //LDS restore is complete?
+ s_branch L_RESTORE_SGPR
+
+ L_RESTORE_LDS_LOOP_W64:
+ if (SAVE_LDS)
+ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1
+ s_waitcnt 0
+ end
+ s_add_u32 m0, m0, 256 //every buffer_load_dword does 256 bytes
+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256 //mem offset increased by 256 bytes
+ s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0
+ s_cbranch_scc1 L_RESTORE_LDS_LOOP_W64 //LDS restore is complete?
+
+
+ /* restore SGPRs */
+ //////////////////////////////
+ //s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size
+ //s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1
+ //s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value)
+ //s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 3 //Number of SGPRs = (sgpr_size + 1) * 8 (non-zero value)
+ L_RESTORE_SGPR:
+ //need to look at it is wave32 or wave64
+ s_and_b32 m0, s_restore_size, 1
+ s_cmp_eq_u32 m0, 1
+ s_cbranch_scc1 L_RESTORE_SGPR_VMEM_WAVE64
+ if (SGPR_SAVE_USE_SQC)
+ s_lshl_b32 s_restore_buf_rsrc2, s_sgpr_save_num, 2 //NUM_RECORDS in bytes
+ else
+ s_lshl_b32 s_restore_buf_rsrc2, s_sgpr_save_num, 7 //NUM_RECORDS in bytes (32 threads)
+ end
+ s_branch L_RESTORE_SGPR_CONT
+ L_RESTORE_SGPR_VMEM_WAVE64:
+ if (SGPR_SAVE_USE_SQC)
+ s_lshl_b32 s_restore_buf_rsrc2, s_sgpr_save_num, 2 //NUM_RECORDS in bytes
+ else
+ s_lshl_b32 s_restore_buf_rsrc2, s_sgpr_save_num, 8 //NUM_RECORDS in bytes (64 threads)
+ end
+
+ L_RESTORE_SGPR_CONT:
+ if (SWIZZLE_EN)
+ s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+ else
+ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+ end
+
+ s_and_b32 m0, s_restore_size, 1
+ s_cmp_eq_u32 m0, 1
+ s_cbranch_scc1 L_RESTORE_SGPR_WAVE64
+
+ read_sgpr_from_mem_wave32(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //save s0 to s_restore_tmp
+ s_mov_b32 m0, 0x1
+
+ L_RESTORE_SGPR_LOOP_WAVE32:
+ read_sgpr_from_mem_wave32(s0, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //PV: further performance improvement can be made
+ s_waitcnt lgkmcnt(0) //ensure data ready
+ s_movreld_b32 s0, s0 //s[0+m0] = s0
+ s_nop 0 // hazard SALU M0=> S_MOVREL
+ s_add_u32 m0, m0, 1 //next sgpr index
+ s_cmp_lt_u32 m0, s_sgpr_save_num //scc = (m0 < s_restore_alloc_size) ? 1 : 0
+ s_cbranch_scc1 L_RESTORE_SGPR_LOOP_WAVE32 //SGPR restore (except s0) is complete?
+ s_mov_b32 s0, s_restore_tmp /* SGPR restore on s0 */
+ s_branch L_RESTORE_HWREG
+
+ L_RESTORE_SGPR_WAVE64:
+ read_sgpr_from_mem_wave64(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //save s0 to s_restore_tmp
+ s_mov_b32 m0, 0x1 //SGPR initial index value =1 //go on with with s1
+
+ L_RESTORE_SGPR_LOOP_WAVE64:
+ read_sgpr_from_mem_wave64(s0, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //PV: further performance improvement can be made
+ s_waitcnt lgkmcnt(0) //ensure data ready
+ s_movreld_b32 s0, s0 //s[0+m0] = s0
+ s_nop 0 // hazard SALU M0=> S_MOVREL
+ s_add_u32 m0, m0, 1 //next sgpr index
+ s_cmp_lt_u32 m0, s_sgpr_save_num //scc = (m0 < s_restore_alloc_size) ? 1 : 0
+ s_cbranch_scc1 L_RESTORE_SGPR_LOOP_WAVE64 //SGPR restore (except s0) is complete?
+ s_mov_b32 s0, s_restore_tmp /* SGPR restore on s0 */
+
+
+ /* restore HW registers */
+ //////////////////////////////
+ L_RESTORE_HWREG:
+ s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes
+ if (SWIZZLE_EN)
+ s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+ else
+ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+ end
+
+ s_and_b32 m0, s_restore_size, 1
+ s_cmp_eq_u32 m0, 1
+ s_cbranch_scc1 L_RESTORE_HWREG_WAVE64
+
+ read_sgpr_from_mem_wave32(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //M0
+ read_sgpr_from_mem_wave32(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //PC
+ read_sgpr_from_mem_wave32(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)
+ read_sgpr_from_mem_wave32(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //EXEC
+ read_sgpr_from_mem_wave32(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)
+ read_sgpr_from_mem_wave32(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //STATUS
+ read_sgpr_from_mem_wave32(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //TRAPSTS
+ //read_sgpr_from_mem_wave32(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //XNACK_MASK_LO
+ //read_sgpr_from_mem_wave32(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //XNACK_MASK_HI
+ read_sgpr_from_mem_wave32(s_restore_xnack_mask, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //XNACK_MASK
+ read_sgpr_from_mem_wave32(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //MODE
+ if(SAVE_RESTORE_HWID_DDID)
+ read_sgpr_from_mem_wave32(s_restore_hwid1, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //HW_ID1
+ end
+ s_branch L_RESTORE_HWREG_FINISH
+
+ L_RESTORE_HWREG_WAVE64:
+ read_sgpr_from_mem_wave64(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //M0
+ read_sgpr_from_mem_wave64(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //PC
+ read_sgpr_from_mem_wave64(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)
+ read_sgpr_from_mem_wave64(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //EXEC
+ read_sgpr_from_mem_wave64(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)
+ read_sgpr_from_mem_wave64(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //STATUS
+ read_sgpr_from_mem_wave64(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //TRAPSTS
+ //read_sgpr_from_mem_wave64(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //XNACK_MASK_LO
+ //read_sgpr_from_mem_wave64(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //XNACK_MASK_HI
+ read_sgpr_from_mem_wave64(s_restore_xnack_mask, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //XNACK_MASK
+ read_sgpr_from_mem_wave64(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //MODE
+ if(SAVE_RESTORE_HWID_DDID)
+ read_sgpr_from_mem_wave64(s_restore_hwid1, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //HW_ID1
+ end
+ L_RESTORE_HWREG_FINISH:
+ s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS
+
+
+
+ if(SAVE_RESTORE_HWID_DDID)
+ L_RESTORE_DDID:
+ s_mov_b32 m0, s_restore_hwid1 //virture ttrace support: The save-context handler records the SE/SA/WGP/SIMD/wave of the original wave
+ s_ttracedata //and then can output it as SHADER_DATA to ttrace on restore to provide a correlation across the save-restore
+
+ s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes
+ if (SWIZZLE_EN)
+ s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+ else
+ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+ end
+
+ s_and_b32 m0, s_restore_size, 1
+ s_cmp_eq_u32 m0, 1
+ s_cbranch_scc1 L_RESTORE_DDID_WAVE64
+
+ read_sgpr_from_mem_wave32(s_restore_ddid, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)
+ s_branch L_RESTORE_DDID_FINISH
+ L_RESTORE_DDID_WAVE64:
+ read_sgpr_from_mem_wave64(s_restore_ddid, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)
+
+ L_RESTORE_DDID_FINISH:
+ s_waitcnt lgkmcnt(0)
+ //s_mov_b32 m0, s_restore_ddid
+ //s_ttracedata
+ if (RESTORE_DDID_IN_SGPR18)
+ s_mov_b32 s18, s_restore_ddid
+ end
+
+ end
+
+ s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS
+
+ //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise:
+ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
+ s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore)
+ s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over
+ end
+ if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL))
+ s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4 //pc[31:0]+4 // save is hack through s_trap but restore is normal
+ s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over
+ end
+
+ s_mov_b32 m0, s_restore_m0
+ s_mov_b32 exec_lo, s_restore_exec_lo
+ s_mov_b32 exec_hi, s_restore_exec_hi
+
+ s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts
+ s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0
+ s_setreg_b32 hwreg(HW_REG_SHADER_XNACK_MASK), s_restore_xnack_mask //restore xnack_mask
+ s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts
+ s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT
+ s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0
+ //s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore
+ s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode
+ //reuse s_restore_m0 as a temp register
+ s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK
+ s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT
+ s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT
+ s_mov_b32 s_restore_tmp, 0x0 //IB_STS is zero
+ s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0
+ s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK
+ s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
+ s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT
+ s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0
+ s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK
+ s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT
+ s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp
+ s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status
+
+ s_barrier //barrier to ensure the readiness of LDS before access attemps from any other wave in the same TG //FIXME not performance-optimal at this time
+
+
+// s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution
+ s_rfe_b64 s_restore_pc_lo // s_restore_m0[0] is used to set STATUS.inst_atc
+
+
+/**************************************************************************/
+/* the END */
+/**************************************************************************/
+L_END_PGM:
+ s_endpgm
+
+end
+
+
+/**************************************************************************/
+/* the helper functions */
+/**************************************************************************/
+function write_sgpr_to_mem_wave32(s, s_rsrc, s_mem_offset, use_sqc, use_mtbuf)
+ if (use_sqc)
+ s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on
+ s_mov_b32 m0, s_mem_offset
+ s_buffer_store_dword s, s_rsrc, m0 glc:1
+ s_add_u32 s_mem_offset, s_mem_offset, 4
+ s_mov_b32 m0, exec_lo
+ elsif (use_mtbuf)
+ v_mov_b32 v0, s
+ tbuffer_store_format_x v0, v0, s_rsrc, s_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
+ s_add_u32 s_mem_offset, s_mem_offset, 128
+ else
+ v_mov_b32 v0, s
+ buffer_store_dword v0, v0, s_rsrc, s_mem_offset slc:1 glc:1
+ s_add_u32 s_mem_offset, s_mem_offset, 128
+ end
+end
+
+function write_sgpr_to_mem_wave64(s, s_rsrc, s_mem_offset, use_sqc, use_mtbuf)
+ if (use_sqc)
+ s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on
+ s_mov_b32 m0, s_mem_offset
+ s_buffer_store_dword s, s_rsrc, m0 glc:1
+ s_add_u32 s_mem_offset, s_mem_offset, 4
+ s_mov_b32 m0, exec_lo
+ elsif (use_mtbuf)
+ v_mov_b32 v0, s
+ tbuffer_store_format_x v0, v0, s_rsrc, s_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
+ s_add_u32 s_mem_offset, s_mem_offset, 256
+ else
+ v_mov_b32 v0, s
+ buffer_store_dword v0, v0, s_rsrc, s_mem_offset slc:1 glc:1
+ s_add_u32 s_mem_offset, s_mem_offset, 256
+ end
+end
+
+function read_sgpr_from_mem_wave32(s, s_rsrc, s_mem_offset, use_sqc)
+ s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1
+ if (use_sqc)
+ s_add_u32 s_mem_offset, s_mem_offset, 4
+ else
+ s_add_u32 s_mem_offset, s_mem_offset, 128
+ end
+end
+
+function read_sgpr_from_mem_wave64(s, s_rsrc, s_mem_offset, use_sqc)
+ s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1
+ if (use_sqc)
+ s_add_u32 s_mem_offset, s_mem_offset, 4
+ else
+ s_add_u32 s_mem_offset, s_mem_offset, 256
+ end
+end
+
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm
index abe1a5da29fb..a47f5b933120 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm
@@ -282,19 +282,6 @@ if G8SR_DEBUG_TIMESTAMP
s_waitcnt lgkmcnt(0) //FIXME, will cause xnack??
end
- //check whether there is mem_viol
- s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
- s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK
- s_cbranch_scc0 L_NO_PC_REWIND
-
- //if so, need rewind PC assuming GDS operation gets NACKed
- s_mov_b32 s_save_tmp, 0 //clear mem_viol bit
- s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp //clear mem_viol bit
- s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32]
- s_sub_u32 s_save_pc_lo, s_save_pc_lo, 8 //pc[31:0]-8
- s_subb_u32 s_save_pc_hi, s_save_pc_hi, 0x0 // -scc
-
-L_NO_PC_REWIND:
s_mov_b32 s_save_tmp, 0 //clear saveCtx bit
s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
index 0bb9c577b3a2..6bae2e022c6e 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
@@ -150,10 +150,10 @@ var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28
var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG
var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26
-var S_SAVE_PC_HI_RCNT_SHIFT = 28 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used
-var S_SAVE_PC_HI_RCNT_MASK = 0xF0000000 //FIXME
-var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 27 //FIXME
-var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME
+var S_SAVE_PC_HI_RCNT_SHIFT = 27 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used
+var S_SAVE_PC_HI_RCNT_MASK = 0xF8000000 //FIXME
+var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 26 //FIXME
+var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x04000000 //FIXME
var s_save_spi_init_lo = exec_lo
var s_save_spi_init_hi = exec_hi
@@ -162,8 +162,8 @@ var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3'h0,pc_rewind[3:0], HT[0],tra
var s_save_pc_hi = ttmp1
var s_save_exec_lo = ttmp2
var s_save_exec_hi = ttmp3
-var s_save_tmp = ttmp4
-var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine
+var s_save_tmp = ttmp14
+var s_save_trapsts = ttmp15 //not really used until the end of the SAVE routine
var s_save_xnack_mask_lo = ttmp6
var s_save_xnack_mask_hi = ttmp7
var s_save_buf_rsrc0 = ttmp8
@@ -171,9 +171,9 @@ var s_save_buf_rsrc1 = ttmp9
var s_save_buf_rsrc2 = ttmp10
var s_save_buf_rsrc3 = ttmp11
var s_save_status = ttmp12
-var s_save_mem_offset = ttmp14
+var s_save_mem_offset = ttmp4
var s_save_alloc_size = s_save_trapsts //conflict
-var s_save_m0 = ttmp15
+var s_save_m0 = ttmp5
var s_save_ttmps_lo = s_save_tmp //no conflict
var s_save_ttmps_hi = s_save_trapsts //no conflict
@@ -207,10 +207,10 @@ var s_restore_mode = ttmp7
var s_restore_pc_lo = ttmp0
var s_restore_pc_hi = ttmp1
-var s_restore_exec_lo = ttmp14
-var s_restore_exec_hi = ttmp15
-var s_restore_status = ttmp4
-var s_restore_trapsts = ttmp5
+var s_restore_exec_lo = ttmp4
+var s_restore_exec_hi = ttmp5
+var s_restore_status = ttmp14
+var s_restore_trapsts = ttmp15
var s_restore_xnack_mask_lo = xnack_mask_lo
var s_restore_xnack_mask_hi = xnack_mask_hi
var s_restore_buf_rsrc0 = ttmp8
@@ -266,10 +266,16 @@ if (!EMU_RUN_HACK)
L_HALT_WAVE:
// If STATUS.HALT is set then this fault must come from SQC instruction fetch.
- // We cannot prevent further faults so just terminate the wavefront.
+ // We cannot prevent further faults. Spin wait until context saved.
s_and_b32 ttmp2, s_save_status, SQ_WAVE_STATUS_HALT_MASK
s_cbranch_scc0 L_NOT_ALREADY_HALTED
- s_endpgm
+
+L_WAIT_CTX_SAVE:
+ s_sleep 0x10
+ s_getreg_b32 ttmp2, hwreg(HW_REG_TRAPSTS)
+ s_and_b32 ttmp2, ttmp2, SQ_WAVE_TRAPSTS_SAVECTX_MASK
+ s_cbranch_scc0 L_WAIT_CTX_SAVE
+
L_NOT_ALREADY_HALTED:
s_or_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_HALT_MASK
@@ -293,12 +299,12 @@ L_FETCH_2ND_TRAP:
// Read second-level TBA/TMA from first-level TMA and jump if available.
// ttmp[2:5] and ttmp12 can be used (others hold SPI-initialized debug data)
// ttmp12 holds SQ_WAVE_STATUS
- s_getreg_b32 ttmp4, hwreg(HW_REG_SQ_SHADER_TMA_LO)
- s_getreg_b32 ttmp5, hwreg(HW_REG_SQ_SHADER_TMA_HI)
- s_lshl_b64 [ttmp4, ttmp5], [ttmp4, ttmp5], 0x8
- s_load_dwordx2 [ttmp2, ttmp3], [ttmp4, ttmp5], 0x0 glc:1 // second-level TBA
+ s_getreg_b32 ttmp14, hwreg(HW_REG_SQ_SHADER_TMA_LO)
+ s_getreg_b32 ttmp15, hwreg(HW_REG_SQ_SHADER_TMA_HI)
+ s_lshl_b64 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8
+ s_load_dwordx2 [ttmp2, ttmp3], [ttmp14, ttmp15], 0x0 glc:1 // second-level TBA
s_waitcnt lgkmcnt(0)
- s_load_dwordx2 [ttmp4, ttmp5], [ttmp4, ttmp5], 0x8 glc:1 // second-level TMA
+ s_load_dwordx2 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 glc:1 // second-level TMA
s_waitcnt lgkmcnt(0)
s_and_b64 [ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3]
s_cbranch_scc0 L_NO_NEXT_TRAP // second-level trap handler not been set
@@ -405,7 +411,7 @@ end
else
end
- // Save trap temporaries 6-11, 13-15 initialized by SPI debug dispatch logic
+ // Save trap temporaries 4-11, 13 initialized by SPI debug dispatch logic
// ttmp SR memory offset : size(VGPR)+size(SGPR)+0x40
get_vgpr_size_bytes(s_save_ttmps_lo)
get_sgpr_size_bytes(s_save_ttmps_hi)
@@ -413,13 +419,11 @@ end
s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_spi_init_lo
s_addc_u32 s_save_ttmps_hi, s_save_spi_init_hi, 0x0
s_and_b32 s_save_ttmps_hi, s_save_ttmps_hi, 0xFFFF
- s_store_dwordx2 [ttmp6, ttmp7], [s_save_ttmps_lo, s_save_ttmps_hi], 0x40 glc:1
- ack_sqc_store_workaround()
- s_store_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_save_ttmps_lo, s_save_ttmps_hi], 0x48 glc:1
+ s_store_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_save_ttmps_lo, s_save_ttmps_hi], 0x50 glc:1
ack_sqc_store_workaround()
- s_store_dword ttmp13, [s_save_ttmps_lo, s_save_ttmps_hi], 0x58 glc:1
+ s_store_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_save_ttmps_lo, s_save_ttmps_hi], 0x60 glc:1
ack_sqc_store_workaround()
- s_store_dwordx2 [ttmp14, ttmp15], [s_save_ttmps_lo, s_save_ttmps_hi], 0x5C glc:1
+ s_store_dword ttmp13, [s_save_ttmps_lo, s_save_ttmps_hi], 0x74 glc:1
ack_sqc_store_workaround()
/* setup Resource Contants */
@@ -1093,7 +1097,7 @@ end
//s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore
s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode
- // Restore trap temporaries 6-11, 13-15 initialized by SPI debug dispatch logic
+ // Restore trap temporaries 4-11, 13 initialized by SPI debug dispatch logic
// ttmp SR memory offset : size(VGPR)+size(SGPR)+0x40
get_vgpr_size_bytes(s_restore_ttmps_lo)
get_sgpr_size_bytes(s_restore_ttmps_hi)
@@ -1101,10 +1105,9 @@ end
s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_buf_rsrc0
s_addc_u32 s_restore_ttmps_hi, s_restore_buf_rsrc1, 0x0
s_and_b32 s_restore_ttmps_hi, s_restore_ttmps_hi, 0xFFFF
- s_load_dwordx2 [ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x40 glc:1
- s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x48 glc:1
- s_load_dword ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x58 glc:1
- s_load_dwordx2 [ttmp14, ttmp15], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x5C glc:1
+ s_load_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x50 glc:1
+ s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x60 glc:1
+ s_load_dword ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x74 glc:1
s_waitcnt lgkmcnt(0)
//reuse s_restore_m0 as a temp register
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index dd6b4b0b5f30..26b15cc56c31 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -213,6 +213,8 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties,
q_properties->type = KFD_QUEUE_TYPE_COMPUTE;
else if (args->queue_type == KFD_IOC_QUEUE_TYPE_SDMA)
q_properties->type = KFD_QUEUE_TYPE_SDMA;
+ else if (args->queue_type == KFD_IOC_QUEUE_TYPE_SDMA_XGMI)
+ q_properties->type = KFD_QUEUE_TYPE_SDMA_XGMI;
else
return -ENOTSUPP;
@@ -522,7 +524,7 @@ static int kfd_ioctl_set_trap_handler(struct file *filep,
struct kfd_process_device *pdd;
dev = kfd_device_by_id(args->gpu_id);
- if (dev == NULL)
+ if (!dev)
return -EINVAL;
mutex_lock(&p->mutex);
@@ -1272,6 +1274,12 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep,
if (args->size != kfd_doorbell_process_slice(dev))
return -EINVAL;
offset = kfd_get_process_doorbells(dev, p);
+ } else if (flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP) {
+ if (args->size != PAGE_SIZE)
+ return -EINVAL;
+ offset = amdgpu_amdkfd_get_mmio_remap_phys_addr(dev->kgd);
+ if (!offset)
+ return -ENOMEM;
}
mutex_lock(&p->mutex);
@@ -1301,6 +1309,14 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep,
args->handle = MAKE_HANDLE(args->gpu_id, idr_handle);
args->mmap_offset = offset;
+ /* MMIO is mapped through kfd device
+ * Generate a kfd mmap offset
+ */
+ if (flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP) {
+ args->mmap_offset = KFD_MMAP_TYPE_MMIO | KFD_MMAP_GPU_ID(args->gpu_id);
+ args->mmap_offset <<= PAGE_SHIFT;
+ }
+
return 0;
err_free:
@@ -1551,6 +1567,32 @@ copy_from_user_failed:
return err;
}
+static int kfd_ioctl_alloc_queue_gws(struct file *filep,
+ struct kfd_process *p, void *data)
+{
+ int retval;
+ struct kfd_ioctl_alloc_queue_gws_args *args = data;
+ struct kfd_dev *dev;
+
+ if (!hws_gws_support)
+ return -ENODEV;
+
+ dev = kfd_device_by_id(args->gpu_id);
+ if (!dev) {
+ pr_debug("Could not find gpu id 0x%x\n", args->gpu_id);
+ return -ENODEV;
+ }
+ if (dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS)
+ return -ENODEV;
+
+ mutex_lock(&p->mutex);
+ retval = pqm_set_gws(&p->pqm, args->queue_id, args->num_gws ? dev->gws : NULL);
+ mutex_unlock(&p->mutex);
+
+ args->first_gws = 0;
+ return retval;
+}
+
static int kfd_ioctl_get_dmabuf_info(struct file *filep,
struct kfd_process *p, void *data)
{
@@ -1753,6 +1795,8 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
AMDKFD_IOCTL_DEF(AMDKFD_IOC_IMPORT_DMABUF,
kfd_ioctl_import_dmabuf, 0),
+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS,
+ kfd_ioctl_alloc_queue_gws, 0),
};
#define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls)
@@ -1845,6 +1889,39 @@ err_i1:
return retcode;
}
+static int kfd_mmio_mmap(struct kfd_dev *dev, struct kfd_process *process,
+ struct vm_area_struct *vma)
+{
+ phys_addr_t address;
+ int ret;
+
+ if (vma->vm_end - vma->vm_start != PAGE_SIZE)
+ return -EINVAL;
+
+ address = amdgpu_amdkfd_get_mmio_remap_phys_addr(dev->kgd);
+
+ vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE |
+ VM_DONTDUMP | VM_PFNMAP;
+
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+ pr_debug("Process %d mapping mmio page\n"
+ " target user address == 0x%08llX\n"
+ " physical address == 0x%08llX\n"
+ " vm_flags == 0x%04lX\n"
+ " size == 0x%04lX\n",
+ process->pasid, (unsigned long long) vma->vm_start,
+ address, vma->vm_flags, PAGE_SIZE);
+
+ ret = io_remap_pfn_range(vma,
+ vma->vm_start,
+ address >> PAGE_SHIFT,
+ PAGE_SIZE,
+ vma->vm_page_prot);
+ return ret;
+}
+
+
static int kfd_mmap(struct file *filp, struct vm_area_struct *vma)
{
struct kfd_process *process;
@@ -1875,6 +1952,10 @@ static int kfd_mmap(struct file *filp, struct vm_area_struct *vma)
if (!dev)
return -ENODEV;
return kfd_reserved_mem_mmap(dev, process, vma);
+ case KFD_MMAP_TYPE_MMIO:
+ if (!dev)
+ return -ENODEV;
+ return kfd_mmio_mmap(dev, process, vma);
}
return -EFAULT;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
index 2e7c44955f43..792371442195 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
@@ -134,9 +134,12 @@ static struct kfd_gpu_cache_info carrizo_cache_info[] = {
#define polaris10_cache_info carrizo_cache_info
#define polaris11_cache_info carrizo_cache_info
#define polaris12_cache_info carrizo_cache_info
+#define vegam_cache_info carrizo_cache_info
/* TODO - check & update Vega10 cache details */
#define vega10_cache_info carrizo_cache_info
#define raven_cache_info carrizo_cache_info
+/* TODO - check & update Navi10 cache details */
+#define navi10_cache_info carrizo_cache_info
static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev,
struct crat_subtype_computeunit *cu)
@@ -372,7 +375,7 @@ static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink,
if (props->iolink_type == CRAT_IOLINK_TYPE_PCIEXPRESS)
props->weight = 20;
else if (props->iolink_type == CRAT_IOLINK_TYPE_XGMI)
- props->weight = 15;
+ props->weight = 15 * iolink->num_hops_xgmi;
else
props->weight = node_distance(id_from, id_to);
@@ -652,6 +655,10 @@ static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev,
pcache_info = polaris12_cache_info;
num_of_cache_types = ARRAY_SIZE(polaris12_cache_info);
break;
+ case CHIP_VEGAM:
+ pcache_info = vegam_cache_info;
+ num_of_cache_types = ARRAY_SIZE(vegam_cache_info);
+ break;
case CHIP_VEGA10:
case CHIP_VEGA12:
case CHIP_VEGA20:
@@ -661,6 +668,9 @@ static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev,
case CHIP_RAVEN:
pcache_info = raven_cache_info;
num_of_cache_types = ARRAY_SIZE(raven_cache_info);
+ case CHIP_NAVI10:
+ pcache_info = navi10_cache_info;
+ num_of_cache_types = ARRAY_SIZE(navi10_cache_info);
break;
default:
return -EINVAL;
@@ -1092,6 +1102,7 @@ static int kfd_fill_gpu_direct_io_link_to_cpu(int *avail_size,
static int kfd_fill_gpu_xgmi_link_to_gpu(int *avail_size,
struct kfd_dev *kdev,
+ struct kfd_dev *peer_kdev,
struct crat_subtype_iolink *sub_type_hdr,
uint32_t proximity_domain_from,
uint32_t proximity_domain_to)
@@ -1110,6 +1121,8 @@ static int kfd_fill_gpu_xgmi_link_to_gpu(int *avail_size,
sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_XGMI;
sub_type_hdr->proximity_domain_from = proximity_domain_from;
sub_type_hdr->proximity_domain_to = proximity_domain_to;
+ sub_type_hdr->num_hops_xgmi =
+ amdgpu_amdkfd_get_xgmi_hops_count(kdev->kgd, peer_kdev->kgd);
return 0;
}
@@ -1287,7 +1300,7 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
(char *)sub_type_hdr +
sizeof(struct crat_subtype_iolink));
ret = kfd_fill_gpu_xgmi_link_to_gpu(
- &avail_size, kdev,
+ &avail_size, kdev, peer_dev->gpu,
(struct crat_subtype_iolink *)sub_type_hdr,
proximity_domain, nid);
if (ret < 0)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
index 7c3f192fe25f..d54ceebd346b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
@@ -274,7 +274,8 @@ struct crat_subtype_iolink {
uint32_t minimum_bandwidth_mbs;
uint32_t maximum_bandwidth_mbs;
uint32_t recommended_transfer_size;
- uint8_t reserved2[CRAT_IOLINK_RESERVED_LENGTH];
+ uint8_t reserved2[CRAT_IOLINK_RESERVED_LENGTH - 1];
+ uint8_t num_hops_xgmi;
};
/*
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
index ab37d36d9cd6..15c523027285 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
@@ -85,36 +85,16 @@ static const struct file_operations kfd_debugfs_hang_hws_fops = {
void kfd_debugfs_init(void)
{
- struct dentry *ent;
-
debugfs_root = debugfs_create_dir("kfd", NULL);
- if (!debugfs_root || debugfs_root == ERR_PTR(-ENODEV)) {
- pr_warn("Failed to create kfd debugfs dir\n");
- return;
- }
-
- ent = debugfs_create_file("mqds", S_IFREG | 0444, debugfs_root,
- kfd_debugfs_mqds_by_process,
- &kfd_debugfs_fops);
- if (!ent)
- pr_warn("Failed to create mqds in kfd debugfs\n");
-
- ent = debugfs_create_file("hqds", S_IFREG | 0444, debugfs_root,
- kfd_debugfs_hqds_by_device,
- &kfd_debugfs_fops);
- if (!ent)
- pr_warn("Failed to create hqds in kfd debugfs\n");
-
- ent = debugfs_create_file("rls", S_IFREG | 0444, debugfs_root,
- kfd_debugfs_rls_by_device,
- &kfd_debugfs_fops);
-
- ent = debugfs_create_file("hang_hws", S_IFREG | 0644, debugfs_root,
- NULL,
- &kfd_debugfs_hang_hws_fops);
- if (!ent)
- pr_warn("Failed to create rls in kfd debugfs\n");
+ debugfs_create_file("mqds", S_IFREG | 0444, debugfs_root,
+ kfd_debugfs_mqds_by_process, &kfd_debugfs_fops);
+ debugfs_create_file("hqds", S_IFREG | 0444, debugfs_root,
+ kfd_debugfs_hqds_by_device, &kfd_debugfs_fops);
+ debugfs_create_file("rls", S_IFREG | 0444, debugfs_root,
+ kfd_debugfs_rls_by_device, &kfd_debugfs_fops);
+ debugfs_create_file("hang_hws", S_IFREG | 0644, debugfs_root,
+ NULL, &kfd_debugfs_hang_hws_fops);
}
void kfd_debugfs_fini(void)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 765b58a17dc7..3322a443dfb2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -54,6 +54,7 @@ static const struct kfd_device_info kaveri_device_info = {
.needs_iommu_device = true,
.needs_pci_atomics = false,
.num_sdma_engines = 2,
+ .num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine = 2,
};
@@ -71,6 +72,7 @@ static const struct kfd_device_info carrizo_device_info = {
.needs_iommu_device = true,
.needs_pci_atomics = false,
.num_sdma_engines = 2,
+ .num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine = 2,
};
@@ -87,6 +89,7 @@ static const struct kfd_device_info raven_device_info = {
.needs_iommu_device = true,
.needs_pci_atomics = true,
.num_sdma_engines = 1,
+ .num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine = 2,
};
#endif
@@ -105,6 +108,7 @@ static const struct kfd_device_info hawaii_device_info = {
.needs_iommu_device = false,
.needs_pci_atomics = false,
.num_sdma_engines = 2,
+ .num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine = 2,
};
@@ -121,6 +125,7 @@ static const struct kfd_device_info tonga_device_info = {
.needs_iommu_device = false,
.needs_pci_atomics = true,
.num_sdma_engines = 2,
+ .num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine = 2,
};
@@ -137,6 +142,7 @@ static const struct kfd_device_info fiji_device_info = {
.needs_iommu_device = false,
.needs_pci_atomics = true,
.num_sdma_engines = 2,
+ .num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine = 2,
};
@@ -153,6 +159,7 @@ static const struct kfd_device_info fiji_vf_device_info = {
.needs_iommu_device = false,
.needs_pci_atomics = false,
.num_sdma_engines = 2,
+ .num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine = 2,
};
@@ -170,6 +177,7 @@ static const struct kfd_device_info polaris10_device_info = {
.needs_iommu_device = false,
.needs_pci_atomics = true,
.num_sdma_engines = 2,
+ .num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine = 2,
};
@@ -186,6 +194,7 @@ static const struct kfd_device_info polaris10_vf_device_info = {
.needs_iommu_device = false,
.needs_pci_atomics = false,
.num_sdma_engines = 2,
+ .num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine = 2,
};
@@ -202,6 +211,7 @@ static const struct kfd_device_info polaris11_device_info = {
.needs_iommu_device = false,
.needs_pci_atomics = true,
.num_sdma_engines = 2,
+ .num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine = 2,
};
@@ -218,6 +228,24 @@ static const struct kfd_device_info polaris12_device_info = {
.needs_iommu_device = false,
.needs_pci_atomics = true,
.num_sdma_engines = 2,
+ .num_xgmi_sdma_engines = 0,
+ .num_sdma_queues_per_engine = 2,
+};
+
+static const struct kfd_device_info vegam_device_info = {
+ .asic_family = CHIP_VEGAM,
+ .max_pasid_bits = 16,
+ .max_no_of_hqd = 24,
+ .doorbell_size = 4,
+ .ih_ring_entry_size = 4 * sizeof(uint32_t),
+ .event_interrupt_class = &event_interrupt_class_cik,
+ .num_of_watch_points = 4,
+ .mqd_size_aligned = MQD_SIZE_ALIGNED,
+ .supports_cwsr = true,
+ .needs_iommu_device = false,
+ .needs_pci_atomics = true,
+ .num_sdma_engines = 2,
+ .num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine = 2,
};
@@ -234,6 +262,7 @@ static const struct kfd_device_info vega10_device_info = {
.needs_iommu_device = false,
.needs_pci_atomics = false,
.num_sdma_engines = 2,
+ .num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine = 2,
};
@@ -250,6 +279,7 @@ static const struct kfd_device_info vega10_vf_device_info = {
.needs_iommu_device = false,
.needs_pci_atomics = false,
.num_sdma_engines = 2,
+ .num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine = 2,
};
@@ -266,6 +296,7 @@ static const struct kfd_device_info vega12_device_info = {
.needs_iommu_device = false,
.needs_pci_atomics = false,
.num_sdma_engines = 2,
+ .num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine = 2,
};
@@ -282,6 +313,24 @@ static const struct kfd_device_info vega20_device_info = {
.needs_iommu_device = false,
.needs_pci_atomics = false,
.num_sdma_engines = 2,
+ .num_xgmi_sdma_engines = 0,
+ .num_sdma_queues_per_engine = 8,
+};
+
+static const struct kfd_device_info navi10_device_info = {
+ .asic_family = CHIP_NAVI10,
+ .max_pasid_bits = 16,
+ .max_no_of_hqd = 24,
+ .doorbell_size = 8,
+ .ih_ring_entry_size = 8 * sizeof(uint32_t),
+ .event_interrupt_class = &event_interrupt_class_v9,
+ .num_of_watch_points = 4,
+ .mqd_size_aligned = MQD_SIZE_ALIGNED,
+ .needs_iommu_device = false,
+ .supports_cwsr = true,
+ .needs_pci_atomics = false,
+ .num_sdma_engines = 2,
+ .num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine = 8,
};
@@ -373,6 +422,9 @@ static const struct kfd_deviceid supported_devices[] = {
{ 0x6995, &polaris12_device_info }, /* Polaris12 */
{ 0x6997, &polaris12_device_info }, /* Polaris12 */
{ 0x699F, &polaris12_device_info }, /* Polaris12 */
+ { 0x694C, &vegam_device_info }, /* VegaM */
+ { 0x694E, &vegam_device_info }, /* VegaM */
+ { 0x694F, &vegam_device_info }, /* VegaM */
{ 0x6860, &vega10_device_info }, /* Vega10 */
{ 0x6861, &vega10_device_info }, /* Vega10 */
{ 0x6862, &vega10_device_info }, /* Vega10 */
@@ -399,7 +451,13 @@ static const struct kfd_deviceid supported_devices[] = {
{ 0x66a3, &vega20_device_info }, /* Vega20 */
{ 0x66a4, &vega20_device_info }, /* Vega20 */
{ 0x66a7, &vega20_device_info }, /* Vega20 */
- { 0x66af, &vega20_device_info } /* Vega20 */
+ { 0x66af, &vega20_device_info }, /* Vega20 */
+ /* Navi10 */
+ { 0x7310, &navi10_device_info }, /* Navi10 */
+ { 0x7312, &navi10_device_info }, /* Navi10 */
+ { 0x7318, &navi10_device_info }, /* Navi10 */
+ { 0x731a, &navi10_device_info }, /* Navi10 */
+ { 0x731f, &navi10_device_info }, /* Navi10 */
};
static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size,
@@ -429,7 +487,6 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
struct pci_dev *pdev, const struct kfd2kgd_calls *f2g)
{
struct kfd_dev *kfd;
- int ret;
const struct kfd_device_info *device_info =
lookup_device_info(pdev->device);
@@ -446,17 +503,15 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
* 32 and 64-bit requests are possible and must be
* supported.
*/
- ret = pci_enable_atomic_ops_to_root(pdev,
- PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
- PCI_EXP_DEVCAP2_ATOMIC_COMP64);
- if (device_info->needs_pci_atomics && ret < 0) {
+ kfd->pci_atomic_requested = amdgpu_amdkfd_have_atomics_support(kgd);
+ if (device_info->needs_pci_atomics &&
+ !kfd->pci_atomic_requested) {
dev_info(kfd_device,
"skipped device %x:%x, PCI rejects atomics\n",
pdev->vendor, pdev->device);
kfree(kfd);
return NULL;
- } else if (!ret)
- kfd->pci_atomic_requested = true;
+ }
kfd->kgd = kgd;
kfd->device_info = device_info;
@@ -481,10 +536,14 @@ static void kfd_cwsr_init(struct kfd_dev *kfd)
BUILD_BUG_ON(sizeof(cwsr_trap_gfx8_hex) > PAGE_SIZE);
kfd->cwsr_isa = cwsr_trap_gfx8_hex;
kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx8_hex);
- } else {
+ } else if (kfd->device_info->asic_family < CHIP_NAVI10) {
BUILD_BUG_ON(sizeof(cwsr_trap_gfx9_hex) > PAGE_SIZE);
kfd->cwsr_isa = cwsr_trap_gfx9_hex;
kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx9_hex);
+ } else {
+ BUILD_BUG_ON(sizeof(cwsr_trap_gfx10_hex) > PAGE_SIZE);
+ kfd->cwsr_isa = cwsr_trap_gfx10_hex;
+ kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx10_hex);
}
kfd->cwsr_enabled = true;
@@ -518,6 +577,13 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
} else
kfd->max_proc_per_quantum = hws_max_conc_proc;
+ /* Allocate global GWS that is shared by all KFD processes */
+ if (hws_gws_support && amdgpu_amdkfd_alloc_gws(kfd->kgd,
+ amdgpu_amdkfd_get_num_gws(kfd->kgd), &kfd->gws)) {
+ dev_err(kfd_device, "Could not allocate %d gws\n",
+ amdgpu_amdkfd_get_num_gws(kfd->kgd));
+ goto out;
+ }
/* calculate max size of mqds needed for queues */
size = max_num_of_queues_per_device *
kfd->device_info->mqd_size_aligned;
@@ -541,7 +607,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
&kfd->gtt_start_gpu_addr, &kfd->gtt_start_cpu_ptr,
false)) {
dev_err(kfd_device, "Could not allocate %d bytes\n", size);
- goto out;
+ goto alloc_gtt_mem_failure;
}
dev_info(kfd_device, "Allocated %d bytes on gart\n", size);
@@ -561,11 +627,6 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
if (kfd->kfd2kgd->get_hive_id)
kfd->hive_id = kfd->kfd2kgd->get_hive_id(kfd->kgd);
- if (kfd_topology_add_device(kfd)) {
- dev_err(kfd_device, "Error adding device to topology\n");
- goto kfd_topology_add_device_error;
- }
-
if (kfd_interrupt_init(kfd)) {
dev_err(kfd_device, "Error initializing interrupts\n");
goto kfd_interrupt_error;
@@ -589,6 +650,11 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
kfd->dbgmgr = NULL;
+ if (kfd_topology_add_device(kfd)) {
+ dev_err(kfd_device, "Error adding device to topology\n");
+ goto kfd_topology_add_device_error;
+ }
+
kfd->init_complete = true;
dev_info(kfd_device, "added device %x:%x\n", kfd->pdev->vendor,
kfd->pdev->device);
@@ -598,19 +664,21 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
goto out;
+kfd_topology_add_device_error:
kfd_resume_error:
device_iommu_error:
device_queue_manager_uninit(kfd->dqm);
device_queue_manager_error:
kfd_interrupt_exit(kfd);
kfd_interrupt_error:
- kfd_topology_remove_device(kfd);
-kfd_topology_add_device_error:
kfd_doorbell_fini(kfd);
kfd_doorbell_error:
kfd_gtt_sa_fini(kfd);
kfd_gtt_sa_init_error:
amdgpu_amdkfd_free_gtt_mem(kfd->kgd, kfd->gtt_mem);
+alloc_gtt_mem_failure:
+ if (hws_gws_support)
+ amdgpu_amdkfd_free_gws(kfd->kgd, kfd->gws);
dev_err(kfd_device,
"device %x:%x NOT added due to errors\n",
kfd->pdev->vendor, kfd->pdev->device);
@@ -628,6 +696,8 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd)
kfd_doorbell_fini(kfd);
kfd_gtt_sa_fini(kfd);
amdgpu_amdkfd_free_gtt_mem(kfd->kgd, kfd->gtt_mem);
+ if (hws_gws_support)
+ amdgpu_amdkfd_free_gws(kfd->kgd, kfd->gws);
}
kfree(kfd);
@@ -665,7 +735,6 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
if (ret)
return ret;
count = atomic_dec_return(&kfd_locked);
- WARN_ONCE(count != 0, "KFD reset ref. error");
atomic_set(&kfd->sram_ecc_flag, 0);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index ae381450601c..584748c23f14 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -42,10 +42,6 @@
static int set_pasid_vmid_mapping(struct device_queue_manager *dqm,
unsigned int pasid, unsigned int vmid);
-static int create_compute_queue_nocpsch(struct device_queue_manager *dqm,
- struct queue *q,
- struct qcm_process_device *qpd);
-
static int execute_queues_cpsch(struct device_queue_manager *dqm,
enum kfd_unmap_queues_filter filter,
uint32_t filter_param);
@@ -55,19 +51,20 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
static int map_queues_cpsch(struct device_queue_manager *dqm);
-static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
- struct queue *q,
- struct qcm_process_device *qpd);
-
static void deallocate_sdma_queue(struct device_queue_manager *dqm,
- unsigned int sdma_queue_id);
+ struct queue *q);
+static inline void deallocate_hqd(struct device_queue_manager *dqm,
+ struct queue *q);
+static int allocate_hqd(struct device_queue_manager *dqm, struct queue *q);
+static int allocate_sdma_queue(struct device_queue_manager *dqm,
+ struct queue *q);
static void kfd_process_hw_exception(struct work_struct *work);
static inline
enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type type)
{
- if (type == KFD_QUEUE_TYPE_SDMA)
+ if (type == KFD_QUEUE_TYPE_SDMA || type == KFD_QUEUE_TYPE_SDMA_XGMI)
return KFD_MQD_TYPE_SDMA;
return KFD_MQD_TYPE_CP;
}
@@ -107,12 +104,23 @@ static unsigned int get_num_sdma_engines(struct device_queue_manager *dqm)
return dqm->dev->device_info->num_sdma_engines;
}
+static unsigned int get_num_xgmi_sdma_engines(struct device_queue_manager *dqm)
+{
+ return dqm->dev->device_info->num_xgmi_sdma_engines;
+}
+
unsigned int get_num_sdma_queues(struct device_queue_manager *dqm)
{
return dqm->dev->device_info->num_sdma_engines
* dqm->dev->device_info->num_sdma_queues_per_engine;
}
+unsigned int get_num_xgmi_sdma_queues(struct device_queue_manager *dqm)
+{
+ return dqm->dev->device_info->num_xgmi_sdma_engines
+ * dqm->dev->device_info->num_sdma_queues_per_engine;
+}
+
void program_sh_mem_settings(struct device_queue_manager *dqm,
struct qcm_process_device *qpd)
{
@@ -133,7 +141,8 @@ static int allocate_doorbell(struct qcm_process_device *qpd, struct queue *q)
* preserve the user mode ABI.
*/
q->doorbell_id = q->properties.queue_id;
- } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
+ } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
+ q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
/* For SDMA queues on SOC15 with 8-byte doorbell, use static
* doorbell assignments based on the engine and queue id.
* The doobell index distance between RLC (2*i) and (2*i+1)
@@ -174,7 +183,8 @@ static void deallocate_doorbell(struct qcm_process_device *qpd,
struct kfd_dev *dev = qpd->dqm->dev;
if (!KFD_IS_SOC15(dev->device_info->asic_family) ||
- q->properties.type == KFD_QUEUE_TYPE_SDMA)
+ q->properties.type == KFD_QUEUE_TYPE_SDMA ||
+ q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
return;
old = test_and_clear_bit(q->doorbell_id, qpd->doorbell_bitmap);
@@ -210,6 +220,9 @@ static int allocate_vmid(struct device_queue_manager *dqm,
/* invalidate the VM context after pasid and vmid mapping is set up */
kfd_flush_tlb(qpd_to_pdd(qpd));
+ dqm->dev->kfd2kgd->set_scratch_backing_va(
+ dqm->dev->kgd, qpd->sh_hidden_private_base, qpd->vmid);
+
return 0;
}
@@ -256,6 +269,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
struct queue *q,
struct qcm_process_device *qpd)
{
+ struct mqd_manager *mqd_mgr;
int retval;
print_queue(q);
@@ -276,28 +290,56 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
}
q->properties.vmid = qpd->vmid;
/*
- * Eviction state logic: we only mark active queues as evicted
- * to avoid the overhead of restoring inactive queues later
+ * Eviction state logic: mark all queues as evicted, even ones
+ * not currently active. Restoring inactive queues later only
+ * updates the is_evicted flag but is a no-op otherwise.
*/
- if (qpd->evicted)
- q->properties.is_evicted = (q->properties.queue_size > 0 &&
- q->properties.queue_percent > 0 &&
- q->properties.queue_address != 0);
+ q->properties.is_evicted = !!qpd->evicted;
q->properties.tba_addr = qpd->tba_addr;
q->properties.tma_addr = qpd->tma_addr;
- if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE)
- retval = create_compute_queue_nocpsch(dqm, q, qpd);
- else if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
- retval = create_sdma_queue_nocpsch(dqm, q, qpd);
- else
- retval = -EINVAL;
+ mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
+ q->properties.type)];
+ if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) {
+ retval = allocate_hqd(dqm, q);
+ if (retval)
+ goto deallocate_vmid;
+ pr_debug("Loading mqd to hqd on pipe %d, queue %d\n",
+ q->pipe, q->queue);
+ } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
+ q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
+ retval = allocate_sdma_queue(dqm, q);
+ if (retval)
+ goto deallocate_vmid;
+ dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
+ }
- if (retval) {
- if (list_empty(&qpd->queues_list))
- deallocate_vmid(dqm, qpd, q);
- goto out_unlock;
+ retval = allocate_doorbell(qpd, q);
+ if (retval)
+ goto out_deallocate_hqd;
+
+ /* Temporarily release dqm lock to avoid a circular lock dependency */
+ dqm_unlock(dqm);
+ q->mqd_mem_obj = mqd_mgr->allocate_mqd(mqd_mgr->dev, &q->properties);
+ dqm_lock(dqm);
+
+ if (!q->mqd_mem_obj) {
+ retval = -ENOMEM;
+ goto out_deallocate_doorbell;
+ }
+ mqd_mgr->init_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj,
+ &q->gart_mqd_addr, &q->properties);
+ if (q->properties.is_active) {
+
+ if (WARN(q->process->mm != current->mm,
+ "should only run in user thread"))
+ retval = -EFAULT;
+ else
+ retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe,
+ q->queue, &q->properties, current->mm);
+ if (retval)
+ goto out_free_mqd;
}
list_add(&q->list, &qpd->queues_list);
@@ -307,6 +349,8 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
dqm->sdma_queue_count++;
+ else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
+ dqm->xgmi_sdma_queue_count++;
/*
* Unconditionally increment this counter, regardless of the queue's
@@ -315,7 +359,21 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
dqm->total_queue_count++;
pr_debug("Total of %d queues are accountable so far\n",
dqm->total_queue_count);
+ goto out_unlock;
+out_free_mqd:
+ mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
+out_deallocate_doorbell:
+ deallocate_doorbell(qpd, q);
+out_deallocate_hqd:
+ if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE)
+ deallocate_hqd(dqm, q);
+ else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
+ q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
+ deallocate_sdma_queue(dqm, q);
+deallocate_vmid:
+ if (list_empty(&qpd->queues_list))
+ deallocate_vmid(dqm, qpd, q);
out_unlock:
dqm_unlock(dqm);
return retval;
@@ -361,60 +419,6 @@ static inline void deallocate_hqd(struct device_queue_manager *dqm,
dqm->allocated_queues[q->pipe] |= (1 << q->queue);
}
-static int create_compute_queue_nocpsch(struct device_queue_manager *dqm,
- struct queue *q,
- struct qcm_process_device *qpd)
-{
- struct mqd_manager *mqd_mgr;
- int retval;
-
- mqd_mgr = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_COMPUTE);
- if (!mqd_mgr)
- return -ENOMEM;
-
- retval = allocate_hqd(dqm, q);
- if (retval)
- return retval;
-
- retval = allocate_doorbell(qpd, q);
- if (retval)
- goto out_deallocate_hqd;
-
- retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
- &q->gart_mqd_addr, &q->properties);
- if (retval)
- goto out_deallocate_doorbell;
-
- pr_debug("Loading mqd to hqd on pipe %d, queue %d\n",
- q->pipe, q->queue);
-
- dqm->dev->kfd2kgd->set_scratch_backing_va(
- dqm->dev->kgd, qpd->sh_hidden_private_base, qpd->vmid);
-
- if (!q->properties.is_active)
- return 0;
-
- if (WARN(q->process->mm != current->mm,
- "should only run in user thread"))
- retval = -EFAULT;
- else
- retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe, q->queue,
- &q->properties, current->mm);
- if (retval)
- goto out_uninit_mqd;
-
- return 0;
-
-out_uninit_mqd:
- mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
-out_deallocate_doorbell:
- deallocate_doorbell(qpd, q);
-out_deallocate_hqd:
- deallocate_hqd(dqm, q);
-
- return retval;
-}
-
/* Access to DQM has to be locked before calling destroy_queue_nocpsch_locked
* to avoid asynchronized access
*/
@@ -425,16 +429,17 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
int retval;
struct mqd_manager *mqd_mgr;
- mqd_mgr = dqm->ops.get_mqd_manager(dqm,
- get_mqd_type_from_queue_type(q->properties.type));
- if (!mqd_mgr)
- return -ENOMEM;
+ mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
+ q->properties.type)];
if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) {
deallocate_hqd(dqm, q);
} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
dqm->sdma_queue_count--;
- deallocate_sdma_queue(dqm, q->sdma_id);
+ deallocate_sdma_queue(dqm, q);
+ } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
+ dqm->xgmi_sdma_queue_count--;
+ deallocate_sdma_queue(dqm, q);
} else {
pr_debug("q->properties.type %d is invalid\n",
q->properties.type);
@@ -451,7 +456,7 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
if (retval == -ETIME)
qpd->reset_wavefronts = true;
- mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
+ mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
list_del(&q->list);
if (list_empty(&qpd->queues_list)) {
@@ -490,7 +495,7 @@ static int destroy_queue_nocpsch(struct device_queue_manager *dqm,
static int update_queue(struct device_queue_manager *dqm, struct queue *q)
{
- int retval;
+ int retval = 0;
struct mqd_manager *mqd_mgr;
struct kfd_process_device *pdd;
bool prev_active = false;
@@ -501,20 +506,8 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
retval = -ENODEV;
goto out_unlock;
}
- mqd_mgr = dqm->ops.get_mqd_manager(dqm,
- get_mqd_type_from_queue_type(q->properties.type));
- if (!mqd_mgr) {
- retval = -ENOMEM;
- goto out_unlock;
- }
- /*
- * Eviction state logic: we only mark active queues as evicted
- * to avoid the overhead of restoring inactive queues later
- */
- if (pdd->qpd.evicted)
- q->properties.is_evicted = (q->properties.queue_size > 0 &&
- q->properties.queue_percent > 0 &&
- q->properties.queue_address != 0);
+ mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
+ q->properties.type)];
/* Save previous activity state for counters */
prev_active = q->properties.is_active;
@@ -529,7 +522,8 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
}
} else if (prev_active &&
(q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
- q->properties.type == KFD_QUEUE_TYPE_SDMA)) {
+ q->properties.type == KFD_QUEUE_TYPE_SDMA ||
+ q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN,
KFD_UNMAP_LATENCY_MS, q->pipe, q->queue);
@@ -539,7 +533,7 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
}
}
- retval = mqd_mgr->update_mqd(mqd_mgr, q->mqd, &q->properties);
+ mqd_mgr->update_mqd(mqd_mgr, q->mqd, &q->properties);
/*
* check active state vs. the previous state and modify
@@ -556,7 +550,8 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
retval = map_queues_cpsch(dqm);
else if (q->properties.is_active &&
(q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
- q->properties.type == KFD_QUEUE_TYPE_SDMA)) {
+ q->properties.type == KFD_QUEUE_TYPE_SDMA ||
+ q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
if (WARN(q->process->mm != current->mm,
"should only run in user thread"))
retval = -EFAULT;
@@ -571,34 +566,13 @@ out_unlock:
return retval;
}
-static struct mqd_manager *get_mqd_manager(
- struct device_queue_manager *dqm, enum KFD_MQD_TYPE type)
-{
- struct mqd_manager *mqd_mgr;
-
- if (WARN_ON(type >= KFD_MQD_TYPE_MAX))
- return NULL;
-
- pr_debug("mqd type %d\n", type);
-
- mqd_mgr = dqm->mqd_mgrs[type];
- if (!mqd_mgr) {
- mqd_mgr = mqd_manager_init(type, dqm->dev);
- if (!mqd_mgr)
- pr_err("mqd manager is NULL");
- dqm->mqd_mgrs[type] = mqd_mgr;
- }
-
- return mqd_mgr;
-}
-
static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
struct qcm_process_device *qpd)
{
struct queue *q;
struct mqd_manager *mqd_mgr;
struct kfd_process_device *pdd;
- int retval = 0;
+ int retval, ret = 0;
dqm_lock(dqm);
if (qpd->evicted++ > 0) /* already evicted, do nothing */
@@ -608,30 +582,31 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
pr_info_ratelimited("Evicting PASID %u queues\n",
pdd->process->pasid);
- /* unactivate all active queues on the qpd */
+ /* Mark all queues as evicted. Deactivate all active queues on
+ * the qpd.
+ */
list_for_each_entry(q, &qpd->queues_list, list) {
+ q->properties.is_evicted = true;
if (!q->properties.is_active)
continue;
- mqd_mgr = dqm->ops.get_mqd_manager(dqm,
- get_mqd_type_from_queue_type(q->properties.type));
- if (!mqd_mgr) { /* should not be here */
- pr_err("Cannot evict queue, mqd mgr is NULL\n");
- retval = -ENOMEM;
- goto out;
- }
- q->properties.is_evicted = true;
+
+ mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
+ q->properties.type)];
q->properties.is_active = false;
retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN,
KFD_UNMAP_LATENCY_MS, q->pipe, q->queue);
- if (retval)
- goto out;
+ if (retval && !ret)
+ /* Return the first error, but keep going to
+ * maintain a consistent eviction state
+ */
+ ret = retval;
dqm->queue_count--;
}
out:
dqm_unlock(dqm);
- return retval;
+ return ret;
}
static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
@@ -649,11 +624,14 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
pr_info_ratelimited("Evicting PASID %u queues\n",
pdd->process->pasid);
- /* unactivate all active queues on the qpd */
+ /* Mark all queues as evicted. Deactivate all active queues on
+ * the qpd.
+ */
list_for_each_entry(q, &qpd->queues_list, list) {
+ q->properties.is_evicted = true;
if (!q->properties.is_active)
continue;
- q->properties.is_evicted = true;
+
q->properties.is_active = false;
dqm->queue_count--;
}
@@ -675,7 +653,7 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
struct mqd_manager *mqd_mgr;
struct kfd_process_device *pdd;
uint64_t pd_base;
- int retval = 0;
+ int retval, ret = 0;
pdd = qpd_to_pdd(qpd);
/* Retrieve PD base */
@@ -709,27 +687,28 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
*/
mm = get_task_mm(pdd->process->lead_thread);
if (!mm) {
- retval = -EFAULT;
+ ret = -EFAULT;
goto out;
}
- /* activate all active queues on the qpd */
+ /* Remove the eviction flags. Activate queues that are not
+ * inactive for other reasons.
+ */
list_for_each_entry(q, &qpd->queues_list, list) {
- if (!q->properties.is_evicted)
- continue;
- mqd_mgr = dqm->ops.get_mqd_manager(dqm,
- get_mqd_type_from_queue_type(q->properties.type));
- if (!mqd_mgr) { /* should not be here */
- pr_err("Cannot restore queue, mqd mgr is NULL\n");
- retval = -ENOMEM;
- goto out;
- }
q->properties.is_evicted = false;
+ if (!QUEUE_IS_ACTIVE(q->properties))
+ continue;
+
+ mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
+ q->properties.type)];
q->properties.is_active = true;
retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe,
q->queue, &q->properties, mm);
- if (retval)
- goto out;
+ if (retval && !ret)
+ /* Return the first error, but keep going to
+ * maintain a consistent eviction state
+ */
+ ret = retval;
dqm->queue_count++;
}
qpd->evicted = 0;
@@ -737,7 +716,7 @@ out:
if (mm)
mmput(mm);
dqm_unlock(dqm);
- return retval;
+ return ret;
}
static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
@@ -769,16 +748,16 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
/* activate all active queues on the qpd */
list_for_each_entry(q, &qpd->queues_list, list) {
- if (!q->properties.is_evicted)
- continue;
q->properties.is_evicted = false;
+ if (!QUEUE_IS_ACTIVE(q->properties))
+ continue;
+
q->properties.is_active = true;
dqm->queue_count++;
}
retval = execute_queues_cpsch(dqm,
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
- if (!retval)
- qpd->evicted = 0;
+ qpd->evicted = 0;
out:
dqm_unlock(dqm);
return retval;
@@ -812,10 +791,14 @@ static int register_process(struct device_queue_manager *dqm,
retval = dqm->asic_ops.update_qpd(dqm, qpd);
dqm->processes_count++;
- kfd_inc_compute_active(dqm->dev);
dqm_unlock(dqm);
+ /* Outside the DQM lock because under the DQM lock we can't do
+ * reclaim or take other locks that others hold while reclaiming.
+ */
+ kfd_inc_compute_active(dqm->dev);
+
return retval;
}
@@ -836,7 +819,6 @@ static int unregister_process(struct device_queue_manager *dqm,
list_del(&cur->list);
kfree(cur);
dqm->processes_count--;
- kfd_dec_compute_active(dqm->dev);
goto out;
}
}
@@ -844,6 +826,13 @@ static int unregister_process(struct device_queue_manager *dqm,
retval = 1;
out:
dqm_unlock(dqm);
+
+ /* Outside the DQM lock because under the DQM lock we can't do
+ * reclaim or take other locks that others hold while reclaiming.
+ */
+ if (!retval)
+ kfd_dec_compute_active(dqm->dev);
+
return retval;
}
@@ -879,6 +868,7 @@ static int initialize_nocpsch(struct device_queue_manager *dqm)
INIT_LIST_HEAD(&dqm->queues);
dqm->queue_count = dqm->next_pipe_to_allocate = 0;
dqm->sdma_queue_count = 0;
+ dqm->xgmi_sdma_queue_count = 0;
for (pipe = 0; pipe < get_pipes_per_mec(dqm); pipe++) {
int pipe_offset = pipe * get_queues_per_pipe(dqm);
@@ -890,7 +880,8 @@ static int initialize_nocpsch(struct device_queue_manager *dqm)
}
dqm->vmid_bitmap = (1 << dqm->dev->vm_info.vmid_num_kfd) - 1;
- dqm->sdma_bitmap = (1 << get_num_sdma_queues(dqm)) - 1;
+ dqm->sdma_bitmap = (1ULL << get_num_sdma_queues(dqm)) - 1;
+ dqm->xgmi_sdma_bitmap = (1ULL << get_num_xgmi_sdma_queues(dqm)) - 1;
return 0;
}
@@ -921,75 +912,56 @@ static int stop_nocpsch(struct device_queue_manager *dqm)
}
static int allocate_sdma_queue(struct device_queue_manager *dqm,
- unsigned int *sdma_queue_id)
+ struct queue *q)
{
int bit;
- if (dqm->sdma_bitmap == 0)
- return -ENOMEM;
+ if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
+ if (dqm->sdma_bitmap == 0)
+ return -ENOMEM;
+ bit = __ffs64(dqm->sdma_bitmap);
+ dqm->sdma_bitmap &= ~(1ULL << bit);
+ q->sdma_id = bit;
+ q->properties.sdma_engine_id = q->sdma_id %
+ get_num_sdma_engines(dqm);
+ q->properties.sdma_queue_id = q->sdma_id /
+ get_num_sdma_engines(dqm);
+ } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
+ if (dqm->xgmi_sdma_bitmap == 0)
+ return -ENOMEM;
+ bit = __ffs64(dqm->xgmi_sdma_bitmap);
+ dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
+ q->sdma_id = bit;
+ /* sdma_engine_id is sdma id including
+ * both PCIe-optimized SDMAs and XGMI-
+ * optimized SDMAs. The calculation below
+ * assumes the first N engines are always
+ * PCIe-optimized ones
+ */
+ q->properties.sdma_engine_id = get_num_sdma_engines(dqm) +
+ q->sdma_id % get_num_xgmi_sdma_engines(dqm);
+ q->properties.sdma_queue_id = q->sdma_id /
+ get_num_xgmi_sdma_engines(dqm);
+ }
- bit = ffs(dqm->sdma_bitmap) - 1;
- dqm->sdma_bitmap &= ~(1 << bit);
- *sdma_queue_id = bit;
+ pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id);
+ pr_debug("SDMA queue id: %d\n", q->properties.sdma_queue_id);
return 0;
}
static void deallocate_sdma_queue(struct device_queue_manager *dqm,
- unsigned int sdma_queue_id)
-{
- if (sdma_queue_id >= get_num_sdma_queues(dqm))
- return;
- dqm->sdma_bitmap |= (1 << sdma_queue_id);
-}
-
-static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
- struct queue *q,
- struct qcm_process_device *qpd)
+ struct queue *q)
{
- struct mqd_manager *mqd_mgr;
- int retval;
-
- mqd_mgr = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_SDMA);
- if (!mqd_mgr)
- return -ENOMEM;
-
- retval = allocate_sdma_queue(dqm, &q->sdma_id);
- if (retval)
- return retval;
-
- q->properties.sdma_queue_id = q->sdma_id / get_num_sdma_engines(dqm);
- q->properties.sdma_engine_id = q->sdma_id % get_num_sdma_engines(dqm);
-
- retval = allocate_doorbell(qpd, q);
- if (retval)
- goto out_deallocate_sdma_queue;
-
- pr_debug("SDMA id is: %d\n", q->sdma_id);
- pr_debug("SDMA queue id: %d\n", q->properties.sdma_queue_id);
- pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id);
-
- dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
- retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
- &q->gart_mqd_addr, &q->properties);
- if (retval)
- goto out_deallocate_doorbell;
-
- retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, 0, 0, &q->properties,
- NULL);
- if (retval)
- goto out_uninit_mqd;
-
- return 0;
-
-out_uninit_mqd:
- mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
-out_deallocate_doorbell:
- deallocate_doorbell(qpd, q);
-out_deallocate_sdma_queue:
- deallocate_sdma_queue(dqm, q->sdma_id);
-
- return retval;
+ if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
+ if (q->sdma_id >= get_num_sdma_queues(dqm))
+ return;
+ dqm->sdma_bitmap |= (1ULL << q->sdma_id);
+ } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
+ if (q->sdma_id >= get_num_xgmi_sdma_queues(dqm))
+ return;
+ dqm->xgmi_sdma_bitmap |= (1ULL << q->sdma_id);
+ }
}
/*
@@ -1026,8 +998,8 @@ static int set_sched_resources(struct device_queue_manager *dqm)
res.queue_mask |= (1ull << i);
}
- res.gws_mask = res.oac_mask = res.gds_heap_base =
- res.gds_heap_size = 0;
+ res.gws_mask = ~0ull;
+ res.oac_mask = res.gds_heap_base = res.gds_heap_size = 0;
pr_debug("Scheduling resources:\n"
"vmid mask: 0x%8X\n"
@@ -1045,8 +1017,10 @@ static int initialize_cpsch(struct device_queue_manager *dqm)
INIT_LIST_HEAD(&dqm->queues);
dqm->queue_count = dqm->processes_count = 0;
dqm->sdma_queue_count = 0;
+ dqm->xgmi_sdma_queue_count = 0;
dqm->active_runlist = false;
- dqm->sdma_bitmap = (1 << get_num_sdma_queues(dqm)) - 1;
+ dqm->sdma_bitmap = (1ULL << get_num_sdma_queues(dqm)) - 1;
+ dqm->xgmi_sdma_bitmap = (1ULL << get_num_xgmi_sdma_queues(dqm)) - 1;
INIT_WORK(&dqm->hw_exception_work, kfd_process_hw_exception);
@@ -1161,55 +1135,47 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
int retval;
struct mqd_manager *mqd_mgr;
- retval = 0;
-
- dqm_lock(dqm);
-
if (dqm->total_queue_count >= max_num_of_queues_per_device) {
pr_warn("Can't create new usermode queue because %d queues were already created\n",
dqm->total_queue_count);
retval = -EPERM;
- goto out_unlock;
+ goto out;
}
- if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
- retval = allocate_sdma_queue(dqm, &q->sdma_id);
+ if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
+ q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
+ dqm_lock(dqm);
+ retval = allocate_sdma_queue(dqm, q);
+ dqm_unlock(dqm);
if (retval)
- goto out_unlock;
- q->properties.sdma_queue_id =
- q->sdma_id / get_num_sdma_engines(dqm);
- q->properties.sdma_engine_id =
- q->sdma_id % get_num_sdma_engines(dqm);
+ goto out;
}
retval = allocate_doorbell(qpd, q);
if (retval)
goto out_deallocate_sdma_queue;
- mqd_mgr = dqm->ops.get_mqd_manager(dqm,
- get_mqd_type_from_queue_type(q->properties.type));
-
- if (!mqd_mgr) {
- retval = -ENOMEM;
- goto out_deallocate_doorbell;
- }
+ mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
+ q->properties.type)];
/*
- * Eviction state logic: we only mark active queues as evicted
- * to avoid the overhead of restoring inactive queues later
+ * Eviction state logic: mark all queues as evicted, even ones
+ * not currently active. Restoring inactive queues later only
+ * updates the is_evicted flag but is a no-op otherwise.
*/
- if (qpd->evicted)
- q->properties.is_evicted = (q->properties.queue_size > 0 &&
- q->properties.queue_percent > 0 &&
- q->properties.queue_address != 0);
-
- dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
-
+ q->properties.is_evicted = !!qpd->evicted;
+ if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
+ q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
+ dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
q->properties.tba_addr = qpd->tba_addr;
q->properties.tma_addr = qpd->tma_addr;
- retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
- &q->gart_mqd_addr, &q->properties);
- if (retval)
+ q->mqd_mem_obj = mqd_mgr->allocate_mqd(mqd_mgr->dev, &q->properties);
+ if (!q->mqd_mem_obj) {
+ retval = -ENOMEM;
goto out_deallocate_doorbell;
+ }
+ mqd_mgr->init_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj,
+ &q->gart_mqd_addr, &q->properties);
+ dqm_lock(dqm);
list_add(&q->list, &qpd->queues_list);
qpd->queue_count++;
@@ -1221,6 +1187,8 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
dqm->sdma_queue_count++;
+ else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
+ dqm->xgmi_sdma_queue_count++;
/*
* Unconditionally increment this counter, regardless of the queue's
* type or whether the queue is active.
@@ -1236,11 +1204,13 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
out_deallocate_doorbell:
deallocate_doorbell(qpd, q);
out_deallocate_sdma_queue:
- if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
- deallocate_sdma_queue(dqm, q->sdma_id);
-out_unlock:
- dqm_unlock(dqm);
-
+ if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
+ q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
+ dqm_lock(dqm);
+ deallocate_sdma_queue(dqm, q);
+ dqm_unlock(dqm);
+ }
+out:
return retval;
}
@@ -1268,12 +1238,18 @@ int amdkfd_fence_wait_timeout(unsigned int *fence_addr,
return 0;
}
-static int unmap_sdma_queues(struct device_queue_manager *dqm,
- unsigned int sdma_engine)
+static int unmap_sdma_queues(struct device_queue_manager *dqm)
{
- return pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_SDMA,
- KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, false,
- sdma_engine);
+ int i, retval = 0;
+
+ for (i = 0; i < dqm->dev->device_info->num_sdma_engines +
+ dqm->dev->device_info->num_xgmi_sdma_engines; i++) {
+ retval = pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_SDMA,
+ KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, false, i);
+ if (retval)
+ return retval;
+ }
+ return retval;
}
/* dqm->lock mutex has to be locked before calling this function */
@@ -1288,6 +1264,7 @@ static int map_queues_cpsch(struct device_queue_manager *dqm)
return 0;
retval = pm_send_runlist(&dqm->packets, &dqm->queues);
+ pr_debug("%s sent runlist\n", __func__);
if (retval) {
pr_err("failed to execute runlist\n");
return retval;
@@ -1309,13 +1286,11 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
if (!dqm->active_runlist)
return retval;
- pr_debug("Before destroying queues, sdma queue count is : %u\n",
- dqm->sdma_queue_count);
+ pr_debug("Before destroying queues, sdma queue count is : %u, xgmi sdma queue count is : %u\n",
+ dqm->sdma_queue_count, dqm->xgmi_sdma_queue_count);
- if (dqm->sdma_queue_count > 0) {
- unmap_sdma_queues(dqm, 0);
- unmap_sdma_queues(dqm, 1);
- }
+ if (dqm->sdma_queue_count > 0 || dqm->xgmi_sdma_queue_count)
+ unmap_sdma_queues(dqm);
retval = pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_COMPUTE,
filter, filter_param, false, 0);
@@ -1327,7 +1302,7 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
KFD_FENCE_COMPLETED);
/* should be timed out */
retval = amdkfd_fence_wait_timeout(dqm->fence_addr, KFD_FENCE_COMPLETED,
- QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS);
+ queue_preemption_timeout_ms);
if (retval)
return retval;
@@ -1379,18 +1354,17 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
}
- mqd_mgr = dqm->ops.get_mqd_manager(dqm,
- get_mqd_type_from_queue_type(q->properties.type));
- if (!mqd_mgr) {
- retval = -ENOMEM;
- goto failed;
- }
+ mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
+ q->properties.type)];
deallocate_doorbell(qpd, q);
if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
dqm->sdma_queue_count--;
- deallocate_sdma_queue(dqm, q->sdma_id);
+ deallocate_sdma_queue(dqm, q);
+ } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
+ dqm->xgmi_sdma_queue_count--;
+ deallocate_sdma_queue(dqm, q);
}
list_del(&q->list);
@@ -1403,8 +1377,6 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
qpd->reset_wavefronts = true;
}
- mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
-
/*
* Unconditionally decrement this counter, regardless of the queue's
* type
@@ -1415,9 +1387,11 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
dqm_unlock(dqm);
+ /* Do free_mqd after dqm_unlock(dqm) to avoid circular locking */
+ mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
+
return retval;
-failed:
failed_try_destroy_debugged_queue:
dqm_unlock(dqm);
@@ -1520,6 +1494,7 @@ static int process_termination_nocpsch(struct device_queue_manager *dqm,
struct queue *q, *next;
struct device_process_node *cur, *next_dpn;
int retval = 0;
+ bool found = false;
dqm_lock(dqm);
@@ -1538,12 +1513,19 @@ static int process_termination_nocpsch(struct device_queue_manager *dqm,
list_del(&cur->list);
kfree(cur);
dqm->processes_count--;
- kfd_dec_compute_active(dqm->dev);
+ found = true;
break;
}
}
dqm_unlock(dqm);
+
+ /* Outside the DQM lock because under the DQM lock we can't do
+ * reclaim or take other locks that others hold while reclaiming.
+ */
+ if (found)
+ kfd_dec_compute_active(dqm->dev);
+
return retval;
}
@@ -1564,11 +1546,7 @@ static int get_wave_state(struct device_queue_manager *dqm,
goto dqm_unlock;
}
- mqd_mgr = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_COMPUTE);
- if (!mqd_mgr) {
- r = -ENOMEM;
- goto dqm_unlock;
- }
+ mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_COMPUTE];
if (!mqd_mgr->get_wave_state) {
r = -EINVAL;
@@ -1593,6 +1571,7 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
struct device_process_node *cur, *next_dpn;
enum kfd_unmap_queues_filter filter =
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES;
+ bool found = false;
retval = 0;
@@ -1611,7 +1590,10 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
list_for_each_entry(q, &qpd->queues_list, list) {
if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
dqm->sdma_queue_count--;
- deallocate_sdma_queue(dqm, q->sdma_id);
+ deallocate_sdma_queue(dqm, q);
+ } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
+ dqm->xgmi_sdma_queue_count--;
+ deallocate_sdma_queue(dqm, q);
}
if (q->properties.is_active)
@@ -1626,7 +1608,7 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
list_del(&cur->list);
kfree(cur);
dqm->processes_count--;
- kfd_dec_compute_active(dqm->dev);
+ found = true;
break;
}
}
@@ -1638,21 +1620,68 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
qpd->reset_wavefronts = false;
}
- /* lastly, free mqd resources */
+ dqm_unlock(dqm);
+
+ /* Outside the DQM lock because under the DQM lock we can't do
+ * reclaim or take other locks that others hold while reclaiming.
+ */
+ if (found)
+ kfd_dec_compute_active(dqm->dev);
+
+ /* Lastly, free mqd resources.
+ * Do free_mqd() after dqm_unlock to avoid circular locking.
+ */
list_for_each_entry_safe(q, next, &qpd->queues_list, list) {
- mqd_mgr = dqm->ops.get_mqd_manager(dqm,
- get_mqd_type_from_queue_type(q->properties.type));
- if (!mqd_mgr) {
- retval = -ENOMEM;
- goto out;
- }
+ mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
+ q->properties.type)];
list_del(&q->list);
qpd->queue_count--;
- mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
+ mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
}
-out:
- dqm_unlock(dqm);
+ return retval;
+}
+
+static int init_mqd_managers(struct device_queue_manager *dqm)
+{
+ int i, j;
+ struct mqd_manager *mqd_mgr;
+
+ for (i = 0; i < KFD_MQD_TYPE_MAX; i++) {
+ mqd_mgr = dqm->asic_ops.mqd_manager_init(i, dqm->dev);
+ if (!mqd_mgr) {
+ pr_err("mqd manager [%d] initialization failed\n", i);
+ goto out_free;
+ }
+ dqm->mqd_mgrs[i] = mqd_mgr;
+ }
+
+ return 0;
+
+out_free:
+ for (j = 0; j < i; j++) {
+ kfree(dqm->mqd_mgrs[j]);
+ dqm->mqd_mgrs[j] = NULL;
+ }
+
+ return -ENOMEM;
+}
+
+/* Allocate one hiq mqd (HWS) and all SDMA mqd in a continuous trunk*/
+static int allocate_hiq_sdma_mqd(struct device_queue_manager *dqm)
+{
+ int retval;
+ struct kfd_dev *dev = dqm->dev;
+ struct kfd_mem_obj *mem_obj = &dqm->hiq_sdma_mqd;
+ uint32_t size = dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]->mqd_size *
+ dev->device_info->num_sdma_engines *
+ dev->device_info->num_sdma_queues_per_engine +
+ dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size;
+
+ retval = amdgpu_amdkfd_alloc_gtt_mem(dev->kgd, size,
+ &(mem_obj->gtt_mem), &(mem_obj->gpu_addr),
+ (void *)&(mem_obj->cpu_ptr), true);
+
return retval;
}
@@ -1693,7 +1722,6 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
dqm->ops.stop = stop_cpsch;
dqm->ops.destroy_queue = destroy_queue_cpsch;
dqm->ops.update_queue = update_queue;
- dqm->ops.get_mqd_manager = get_mqd_manager;
dqm->ops.register_process = register_process;
dqm->ops.unregister_process = unregister_process;
dqm->ops.uninitialize = uninitialize;
@@ -1713,7 +1741,6 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
dqm->ops.create_queue = create_queue_nocpsch;
dqm->ops.destroy_queue = destroy_queue_nocpsch;
dqm->ops.update_queue = update_queue;
- dqm->ops.get_mqd_manager = get_mqd_manager;
dqm->ops.register_process = register_process;
dqm->ops.unregister_process = unregister_process;
dqm->ops.initialize = initialize_nocpsch;
@@ -1749,6 +1776,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
case CHIP_POLARIS10:
case CHIP_POLARIS11:
case CHIP_POLARIS12:
+ case CHIP_VEGAM:
device_queue_manager_init_vi_tonga(&dqm->asic_ops);
break;
@@ -1758,12 +1786,23 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
case CHIP_RAVEN:
device_queue_manager_init_v9(&dqm->asic_ops);
break;
+ case CHIP_NAVI10:
+ device_queue_manager_init_v10_navi10(&dqm->asic_ops);
+ break;
default:
WARN(1, "Unexpected ASIC family %u",
dev->device_info->asic_family);
goto out_free;
}
+ if (init_mqd_managers(dqm))
+ goto out_free;
+
+ if (allocate_hiq_sdma_mqd(dqm)) {
+ pr_err("Failed to allocate hiq sdma mqd trunk buffer\n");
+ goto out_free;
+ }
+
if (!dqm->ops.initialize(dqm))
return dqm;
@@ -1772,9 +1811,17 @@ out_free:
return NULL;
}
+void deallocate_hiq_sdma_mqd(struct kfd_dev *dev, struct kfd_mem_obj *mqd)
+{
+ WARN(!mqd, "No hiq sdma mqd trunk to free");
+
+ amdgpu_amdkfd_free_gtt_mem(dev->kgd, mqd->gtt_mem);
+}
+
void device_queue_manager_uninit(struct device_queue_manager *dqm)
{
dqm->ops.uninitialize(dqm);
+ deallocate_hiq_sdma_mqd(dqm->dev, &dqm->hiq_sdma_mqd);
kfree(dqm);
}
@@ -1833,12 +1880,13 @@ int dqm_debugfs_hqds(struct seq_file *m, void *data)
int r = 0;
r = dqm->dev->kfd2kgd->hqd_dump(dqm->dev->kgd,
- KFD_CIK_HIQ_PIPE, KFD_CIK_HIQ_QUEUE, &dump, &n_regs);
+ KFD_CIK_HIQ_PIPE, KFD_CIK_HIQ_QUEUE,
+ &dump, &n_regs);
if (!r) {
seq_printf(m, " HIQ on MEC %d Pipe %d Queue %d\n",
- KFD_CIK_HIQ_PIPE/get_pipes_per_mec(dqm)+1,
- KFD_CIK_HIQ_PIPE%get_pipes_per_mec(dqm),
- KFD_CIK_HIQ_QUEUE);
+ KFD_CIK_HIQ_PIPE/get_pipes_per_mec(dqm)+1,
+ KFD_CIK_HIQ_PIPE%get_pipes_per_mec(dqm),
+ KFD_CIK_HIQ_QUEUE);
seq_reg_dump(m, dump, n_regs);
kfree(dump);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 70e38a2e23b9..90db2c9275f6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -31,8 +31,6 @@
#include "kfd_priv.h"
#include "kfd_mqd_manager.h"
-#define KFD_UNMAP_LATENCY_MS (4000)
-#define QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS (2 * KFD_UNMAP_LATENCY_MS + 1000)
struct device_process_node {
struct qcm_process_device *qpd;
@@ -48,8 +46,6 @@ struct device_process_node {
*
* @update_queue: Queue update routine.
*
- * @get_mqd_manager: Returns the mqd manager according to the mqd type.
- *
* @exeute_queues: Dispatches the queues list to the H/W.
*
* @register_process: This routine associates a specific process with device.
@@ -97,10 +93,6 @@ struct device_queue_manager_ops {
int (*update_queue)(struct device_queue_manager *dqm,
struct queue *q);
- struct mqd_manager * (*get_mqd_manager)
- (struct device_queue_manager *dqm,
- enum KFD_MQD_TYPE type);
-
int (*register_process)(struct device_queue_manager *dqm,
struct qcm_process_device *qpd);
@@ -158,6 +150,8 @@ struct device_queue_manager_asic_ops {
void (*init_sdma_vm)(struct device_queue_manager *dqm,
struct queue *q,
struct qcm_process_device *qpd);
+ struct mqd_manager * (*mqd_manager_init)(enum KFD_MQD_TYPE type,
+ struct kfd_dev *dev);
};
/**
@@ -185,10 +179,12 @@ struct device_queue_manager {
unsigned int processes_count;
unsigned int queue_count;
unsigned int sdma_queue_count;
+ unsigned int xgmi_sdma_queue_count;
unsigned int total_queue_count;
unsigned int next_pipe_to_allocate;
unsigned int *allocated_queues;
- unsigned int sdma_bitmap;
+ uint64_t sdma_bitmap;
+ uint64_t xgmi_sdma_bitmap;
unsigned int vmid_bitmap;
uint64_t pipelines_addr;
struct kfd_mem_obj *pipeline_mem;
@@ -201,6 +197,7 @@ struct device_queue_manager {
/* hw exception */
bool is_hws_hang;
struct work_struct hw_exception_work;
+ struct kfd_mem_obj hiq_sdma_mqd;
};
void device_queue_manager_init_cik(
@@ -213,12 +210,15 @@ void device_queue_manager_init_vi_tonga(
struct device_queue_manager_asic_ops *asic_ops);
void device_queue_manager_init_v9(
struct device_queue_manager_asic_ops *asic_ops);
+void device_queue_manager_init_v10_navi10(
+ struct device_queue_manager_asic_ops *asic_ops);
void program_sh_mem_settings(struct device_queue_manager *dqm,
struct qcm_process_device *qpd);
unsigned int get_queues_num(struct device_queue_manager *dqm);
unsigned int get_queues_per_pipe(struct device_queue_manager *dqm);
unsigned int get_pipes_per_mec(struct device_queue_manager *dqm);
unsigned int get_num_sdma_queues(struct device_queue_manager *dqm);
+unsigned int get_num_xgmi_sdma_queues(struct device_queue_manager *dqm);
static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd)
{
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c
index aed4c21417bf..0d26506798cf 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c
@@ -48,6 +48,7 @@ void device_queue_manager_init_cik(
asic_ops->set_cache_memory_policy = set_cache_memory_policy_cik;
asic_ops->update_qpd = update_qpd_cik;
asic_ops->init_sdma_vm = init_sdma_vm;
+ asic_ops->mqd_manager_init = mqd_manager_init_cik;
}
void device_queue_manager_init_cik_hawaii(
@@ -56,6 +57,7 @@ void device_queue_manager_init_cik_hawaii(
asic_ops->set_cache_memory_policy = set_cache_memory_policy_cik;
asic_ops->update_qpd = update_qpd_cik_hawaii;
asic_ops->init_sdma_vm = init_sdma_vm_hawaii;
+ asic_ops->mqd_manager_init = mqd_manager_init_cik_hawaii;
}
static uint32_t compute_sh_mem_bases_64bit(unsigned int top_address_nybble)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v10.c
new file mode 100644
index 000000000000..72e4d61ac752
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v10.c
@@ -0,0 +1,88 @@
+/*
+ * Copyright 2018 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "kfd_device_queue_manager.h"
+#include "navi10_enum.h"
+#include "gc/gc_10_1_0_offset.h"
+#include "gc/gc_10_1_0_sh_mask.h"
+
+static int update_qpd_v10(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd);
+static void init_sdma_vm_v10(struct device_queue_manager *dqm, struct queue *q,
+ struct qcm_process_device *qpd);
+
+void device_queue_manager_init_v10_navi10(
+ struct device_queue_manager_asic_ops *asic_ops)
+{
+ asic_ops->update_qpd = update_qpd_v10;
+ asic_ops->init_sdma_vm = init_sdma_vm_v10;
+ asic_ops->mqd_manager_init = mqd_manager_init_v10;
+}
+
+static uint32_t compute_sh_mem_bases_64bit(struct kfd_process_device *pdd)
+{
+ uint32_t shared_base = pdd->lds_base >> 48;
+ uint32_t private_base = pdd->scratch_base >> 48;
+
+ return (shared_base << SH_MEM_BASES__SHARED_BASE__SHIFT) |
+ private_base;
+}
+
+static int update_qpd_v10(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd)
+{
+ struct kfd_process_device *pdd;
+
+ pdd = qpd_to_pdd(qpd);
+
+ /* check if sh_mem_config register already configured */
+ if (qpd->sh_mem_config == 0) {
+ qpd->sh_mem_config =
+ SH_MEM_ALIGNMENT_MODE_UNALIGNED <<
+ SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT;
+#if 0
+ /* TODO:
+ * This shouldn't be an issue with Navi10. Verify.
+ */
+ if (vega10_noretry)
+ qpd->sh_mem_config |=
+ 1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT;
+#endif
+
+ qpd->sh_mem_ape1_limit = 0;
+ qpd->sh_mem_ape1_base = 0;
+ }
+
+ qpd->sh_mem_bases = compute_sh_mem_bases_64bit(pdd);
+
+ pr_debug("sh_mem_bases 0x%X\n", qpd->sh_mem_bases);
+
+ return 0;
+}
+
+static void init_sdma_vm_v10(struct device_queue_manager *dqm, struct queue *q,
+ struct qcm_process_device *qpd)
+{
+ /* Not needed on SDMAv4 onwards any more */
+ q->properties.sdma_vm_addr = 0;
+}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c
index 417515332c35..e9fe39382371 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c
@@ -37,6 +37,7 @@ void device_queue_manager_init_v9(
{
asic_ops->update_qpd = update_qpd_v9;
asic_ops->init_sdma_vm = init_sdma_vm_v9;
+ asic_ops->mqd_manager_init = mqd_manager_init_v9;
}
static uint32_t compute_sh_mem_bases_64bit(struct kfd_process_device *pdd)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c
index c3a5dcfe877a..3a7cb2f88366 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c
@@ -54,6 +54,7 @@ void device_queue_manager_init_vi(
asic_ops->set_cache_memory_policy = set_cache_memory_policy_vi;
asic_ops->update_qpd = update_qpd_vi;
asic_ops->init_sdma_vm = init_sdma_vm;
+ asic_ops->mqd_manager_init = mqd_manager_init_vi;
}
void device_queue_manager_init_vi_tonga(
@@ -62,6 +63,7 @@ void device_queue_manager_init_vi_tonga(
asic_ops->set_cache_memory_policy = set_cache_memory_policy_vi_tonga;
asic_ops->update_qpd = update_qpd_vi_tonga;
asic_ops->init_sdma_vm = init_sdma_vm_tonga;
+ asic_ops->mqd_manager_init = mqd_manager_init_vi_tonga;
}
static uint32_t compute_sh_mem_bases_64bit(unsigned int top_address_nybble)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index 6e1d41c5bf86..d674d4b3340f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -983,7 +983,7 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
return; /* Presumably process exited. */
memset(&memory_exception_data, 0, sizeof(memory_exception_data));
memory_exception_data.gpu_id = dev->id;
- memory_exception_data.failure.imprecise = 1;
+ memory_exception_data.failure.imprecise = true;
/* Set failure reason */
if (info) {
memory_exception_data.va = (info->page_addr) << PAGE_SHIFT;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
index 213ea5454d11..60521366dd31 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
@@ -398,12 +398,14 @@ int kfd_init_apertures(struct kfd_process *process)
case CHIP_POLARIS10:
case CHIP_POLARIS11:
case CHIP_POLARIS12:
+ case CHIP_VEGAM:
kfd_init_apertures_vi(pdd, id);
break;
case CHIP_VEGA10:
case CHIP_VEGA12:
case CHIP_VEGA20:
case CHIP_RAVEN:
+ case CHIP_NAVI10:
kfd_init_apertures_v9(pdd, id);
break;
default:
@@ -435,5 +437,3 @@ int kfd_init_apertures(struct kfd_process *process)
return 0;
}
-
-
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c b/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c
index 01494752c36a..5f35df23fb18 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c
@@ -66,16 +66,8 @@ int kfd_iommu_device_init(struct kfd_dev *kfd)
top_dev = kfd_topology_device_by_id(kfd->id);
- /*
- * Overwrite ATS capability according to needs_iommu_device to fix
- * potential missing corresponding bit in CRAT of BIOS.
- */
- if (!kfd->device_info->needs_iommu_device) {
- top_dev->node_props.capability &= ~HSA_CAP_ATS_PRESENT;
+ if (!kfd->device_info->needs_iommu_device)
return 0;
- }
-
- top_dev->node_props.capability |= HSA_CAP_ATS_PRESENT;
iommu_info.flags = 0;
err = amd_iommu_device_info(kfd->pdev, &iommu_info);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
index f1596881f20a..29c0bd2d7a5c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
@@ -58,9 +58,10 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev,
kq->nop_packet = nop.u32all;
switch (type) {
case KFD_QUEUE_TYPE_DIQ:
+ kq->mqd_mgr = dev->dqm->mqd_mgrs[KFD_MQD_TYPE_DIQ];
+ break;
case KFD_QUEUE_TYPE_HIQ:
- kq->mqd_mgr = dev->dqm->ops.get_mqd_manager(dev->dqm,
- KFD_MQD_TYPE_HIQ);
+ kq->mqd_mgr = dev->dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ];
break;
default:
pr_err("Invalid queue type %d\n", type);
@@ -131,13 +132,14 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev,
kq->queue->device = dev;
kq->queue->process = kfd_get_process(current);
- retval = kq->mqd_mgr->init_mqd(kq->mqd_mgr, &kq->queue->mqd,
- &kq->queue->mqd_mem_obj,
+ kq->queue->mqd_mem_obj = kq->mqd_mgr->allocate_mqd(kq->mqd_mgr->dev,
+ &kq->queue->properties);
+ if (!kq->queue->mqd_mem_obj)
+ goto err_allocate_mqd;
+ kq->mqd_mgr->init_mqd(kq->mqd_mgr, &kq->queue->mqd,
+ kq->queue->mqd_mem_obj,
&kq->queue->gart_mqd_addr,
&kq->queue->properties);
- if (retval != 0)
- goto err_init_mqd;
-
/* assign HIQ to HQD */
if (type == KFD_QUEUE_TYPE_HIQ) {
pr_debug("Assigning hiq to hqd\n");
@@ -163,7 +165,8 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev,
return true;
err_alloc_fence:
-err_init_mqd:
+ kq->mqd_mgr->free_mqd(kq->mqd_mgr, kq->queue->mqd, kq->queue->mqd_mem_obj);
+err_allocate_mqd:
uninit_queue(kq->queue);
err_init_queue:
kfd_gtt_sa_free(dev, kq->wptr_mem);
@@ -192,7 +195,7 @@ static void uninitialize(struct kernel_queue *kq)
else if (kq->queue->properties.type == KFD_QUEUE_TYPE_DIQ)
kfd_gtt_sa_free(kq->dev, kq->fence_mem_obj);
- kq->mqd_mgr->uninit_mqd(kq->mqd_mgr, kq->queue->mqd,
+ kq->mqd_mgr->free_mqd(kq->mqd_mgr, kq->queue->mqd,
kq->queue->mqd_mem_obj);
kfd_gtt_sa_free(kq->dev, kq->rptr_mem);
@@ -314,6 +317,7 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
case CHIP_POLARIS10:
case CHIP_POLARIS11:
case CHIP_POLARIS12:
+ case CHIP_VEGAM:
kernel_queue_init_vi(&kq->ops_asic_specific);
break;
@@ -328,6 +332,9 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
case CHIP_RAVEN:
kernel_queue_init_v9(&kq->ops_asic_specific);
break;
+ case CHIP_NAVI10:
+ kernel_queue_init_v10(&kq->ops_asic_specific);
+ break;
default:
WARN(1, "Unexpected ASIC family %u",
dev->device_info->asic_family);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h
index a7116a939029..365fc674fea4 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h
@@ -102,5 +102,6 @@ struct kernel_queue {
void kernel_queue_init_cik(struct kernel_queue_ops *ops);
void kernel_queue_init_vi(struct kernel_queue_ops *ops);
void kernel_queue_init_v9(struct kernel_queue_ops *ops);
+void kernel_queue_init_v10(struct kernel_queue_ops *ops);
#endif /* KFD_KERNEL_QUEUE_H_ */
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v10.c
new file mode 100644
index 000000000000..aed32ab7102e
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v10.c
@@ -0,0 +1,348 @@
+/*
+ * Copyright 2018 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "kfd_kernel_queue.h"
+#include "kfd_device_queue_manager.h"
+#include "kfd_pm4_headers_ai.h"
+#include "kfd_pm4_opcodes.h"
+#include "gc/gc_10_1_0_sh_mask.h"
+
+static bool initialize_v10(struct kernel_queue *kq, struct kfd_dev *dev,
+ enum kfd_queue_type type, unsigned int queue_size);
+static void uninitialize_v10(struct kernel_queue *kq);
+static void submit_packet_v10(struct kernel_queue *kq);
+
+void kernel_queue_init_v10(struct kernel_queue_ops *ops)
+{
+ ops->initialize = initialize_v10;
+ ops->uninitialize = uninitialize_v10;
+ ops->submit_packet = submit_packet_v10;
+}
+
+static bool initialize_v10(struct kernel_queue *kq, struct kfd_dev *dev,
+ enum kfd_queue_type type, unsigned int queue_size)
+{
+ int retval;
+
+ retval = kfd_gtt_sa_allocate(dev, PAGE_SIZE, &kq->eop_mem);
+ if (retval != 0)
+ return false;
+
+ kq->eop_gpu_addr = kq->eop_mem->gpu_addr;
+ kq->eop_kernel_addr = kq->eop_mem->cpu_ptr;
+
+ memset(kq->eop_kernel_addr, 0, PAGE_SIZE);
+
+ return true;
+}
+
+static void uninitialize_v10(struct kernel_queue *kq)
+{
+ kfd_gtt_sa_free(kq->dev, kq->eop_mem);
+}
+
+static void submit_packet_v10(struct kernel_queue *kq)
+{
+ *kq->wptr64_kernel = kq->pending_wptr64;
+ write_kernel_doorbell64(kq->queue->properties.doorbell_ptr,
+ kq->pending_wptr64);
+}
+
+static int pm_map_process_v10(struct packet_manager *pm,
+ uint32_t *buffer, struct qcm_process_device *qpd)
+{
+ struct pm4_mes_map_process *packet;
+ uint64_t vm_page_table_base_addr = qpd->page_table_base;
+
+ packet = (struct pm4_mes_map_process *)buffer;
+ memset(buffer, 0, sizeof(struct pm4_mes_map_process));
+
+ packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS,
+ sizeof(struct pm4_mes_map_process));
+ packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0;
+ packet->bitfields2.process_quantum = 1;
+ packet->bitfields2.pasid = qpd->pqm->process->pasid;
+ packet->bitfields14.gds_size = qpd->gds_size;
+ packet->bitfields14.num_gws = qpd->num_gws;
+ packet->bitfields14.num_oac = qpd->num_oac;
+ packet->bitfields14.sdma_enable = 1;
+
+ packet->bitfields14.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count;
+
+ packet->sh_mem_config = qpd->sh_mem_config;
+ packet->sh_mem_bases = qpd->sh_mem_bases;
+ if (qpd->tba_addr) {
+ packet->sq_shader_tba_lo = lower_32_bits(qpd->tba_addr >> 8);
+ packet->sq_shader_tba_hi = (1 << SQ_SHADER_TBA_HI__TRAP_EN__SHIFT) |
+ upper_32_bits(qpd->tba_addr >> 8);
+ packet->sq_shader_tma_lo = lower_32_bits(qpd->tma_addr >> 8);
+ packet->sq_shader_tma_hi = upper_32_bits(qpd->tma_addr >> 8);
+ }
+
+ packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area);
+ packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area);
+
+ packet->vm_context_page_table_base_addr_lo32 =
+ lower_32_bits(vm_page_table_base_addr);
+ packet->vm_context_page_table_base_addr_hi32 =
+ upper_32_bits(vm_page_table_base_addr);
+
+ return 0;
+}
+
+static int pm_runlist_v10(struct packet_manager *pm, uint32_t *buffer,
+ uint64_t ib, size_t ib_size_in_dwords, bool chain)
+{
+ struct pm4_mes_runlist *packet;
+
+ int concurrent_proc_cnt = 0;
+ struct kfd_dev *kfd = pm->dqm->dev;
+
+ /* Determine the number of processes to map together to HW:
+ * it can not exceed the number of VMIDs available to the
+ * scheduler, and it is determined by the smaller of the number
+ * of processes in the runlist and kfd module parameter
+ * hws_max_conc_proc.
+ * Note: the arbitration between the number of VMIDs and
+ * hws_max_conc_proc has been done in
+ * kgd2kfd_device_init().
+ */
+ concurrent_proc_cnt = min(pm->dqm->processes_count,
+ kfd->max_proc_per_quantum);
+
+
+ packet = (struct pm4_mes_runlist *)buffer;
+
+ memset(buffer, 0, sizeof(struct pm4_mes_runlist));
+ packet->header.u32All = pm_build_pm4_header(IT_RUN_LIST,
+ sizeof(struct pm4_mes_runlist));
+
+ packet->bitfields4.ib_size = ib_size_in_dwords;
+ packet->bitfields4.chain = chain ? 1 : 0;
+ packet->bitfields4.offload_polling = 0;
+ packet->bitfields4.valid = 1;
+ packet->bitfields4.process_cnt = concurrent_proc_cnt;
+ packet->ordinal2 = lower_32_bits(ib);
+ packet->ib_base_hi = upper_32_bits(ib);
+
+ return 0;
+}
+
+static int pm_map_queues_v10(struct packet_manager *pm, uint32_t *buffer,
+ struct queue *q, bool is_static)
+{
+ struct pm4_mes_map_queues *packet;
+ bool use_static = is_static;
+
+ packet = (struct pm4_mes_map_queues *)buffer;
+ memset(buffer, 0, sizeof(struct pm4_mes_map_queues));
+
+ packet->header.u32All = pm_build_pm4_header(IT_MAP_QUEUES,
+ sizeof(struct pm4_mes_map_queues));
+ packet->bitfields2.num_queues = 1;
+ packet->bitfields2.queue_sel =
+ queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi;
+
+ packet->bitfields2.engine_sel =
+ engine_sel__mes_map_queues__compute_vi;
+ packet->bitfields2.queue_type =
+ queue_type__mes_map_queues__normal_compute_vi;
+
+ switch (q->properties.type) {
+ case KFD_QUEUE_TYPE_COMPUTE:
+ if (use_static)
+ packet->bitfields2.queue_type =
+ queue_type__mes_map_queues__normal_latency_static_queue_vi;
+ break;
+ case KFD_QUEUE_TYPE_DIQ:
+ packet->bitfields2.queue_type =
+ queue_type__mes_map_queues__debug_interface_queue_vi;
+ break;
+ case KFD_QUEUE_TYPE_SDMA:
+ case KFD_QUEUE_TYPE_SDMA_XGMI:
+ packet->bitfields2.engine_sel = q->properties.sdma_engine_id +
+ engine_sel__mes_map_queues__sdma0_vi;
+ use_static = false; /* no static queues under SDMA */
+ break;
+ default:
+ WARN(1, "queue type %d\n", q->properties.type);
+ return -EINVAL;
+ }
+ packet->bitfields3.doorbell_offset =
+ q->properties.doorbell_off;
+
+ packet->mqd_addr_lo =
+ lower_32_bits(q->gart_mqd_addr);
+
+ packet->mqd_addr_hi =
+ upper_32_bits(q->gart_mqd_addr);
+
+ packet->wptr_addr_lo =
+ lower_32_bits((uint64_t)q->properties.write_ptr);
+
+ packet->wptr_addr_hi =
+ upper_32_bits((uint64_t)q->properties.write_ptr);
+
+ return 0;
+}
+
+static int pm_unmap_queues_v10(struct packet_manager *pm, uint32_t *buffer,
+ enum kfd_queue_type type,
+ enum kfd_unmap_queues_filter filter,
+ uint32_t filter_param, bool reset,
+ unsigned int sdma_engine)
+{
+ struct pm4_mes_unmap_queues *packet;
+
+ packet = (struct pm4_mes_unmap_queues *)buffer;
+ memset(buffer, 0, sizeof(struct pm4_mes_unmap_queues));
+
+ packet->header.u32All = pm_build_pm4_header(IT_UNMAP_QUEUES,
+ sizeof(struct pm4_mes_unmap_queues));
+ switch (type) {
+ case KFD_QUEUE_TYPE_COMPUTE:
+ case KFD_QUEUE_TYPE_DIQ:
+ packet->bitfields2.engine_sel =
+ engine_sel__mes_unmap_queues__compute;
+ break;
+ case KFD_QUEUE_TYPE_SDMA:
+ case KFD_QUEUE_TYPE_SDMA_XGMI:
+ packet->bitfields2.engine_sel =
+ engine_sel__mes_unmap_queues__sdma0 + sdma_engine;
+ break;
+ default:
+ WARN(1, "queue type %d\n", type);
+ break;
+ }
+
+ if (reset)
+ packet->bitfields2.action =
+ action__mes_unmap_queues__reset_queues;
+ else
+ packet->bitfields2.action =
+ action__mes_unmap_queues__preempt_queues;
+
+ switch (filter) {
+ case KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE:
+ packet->bitfields2.queue_sel =
+ queue_sel__mes_unmap_queues__perform_request_on_specified_queues;
+ packet->bitfields2.num_queues = 1;
+ packet->bitfields3b.doorbell_offset0 = filter_param;
+ break;
+ case KFD_UNMAP_QUEUES_FILTER_BY_PASID:
+ packet->bitfields2.queue_sel =
+ queue_sel__mes_unmap_queues__perform_request_on_pasid_queues;
+ packet->bitfields3a.pasid = filter_param;
+ break;
+ case KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES:
+ packet->bitfields2.queue_sel =
+ queue_sel__mes_unmap_queues__unmap_all_queues;
+ break;
+ case KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES:
+ /* in this case, we do not preempt static queues */
+ packet->bitfields2.queue_sel =
+ queue_sel__mes_unmap_queues__unmap_all_non_static_queues;
+ break;
+ default:
+ WARN(1, "filter %d\n", filter);
+ break;
+ }
+
+ return 0;
+
+}
+
+static int pm_query_status_v10(struct packet_manager *pm, uint32_t *buffer,
+ uint64_t fence_address, uint32_t fence_value)
+{
+ struct pm4_mes_query_status *packet;
+
+ packet = (struct pm4_mes_query_status *)buffer;
+ memset(buffer, 0, sizeof(struct pm4_mes_query_status));
+
+
+ packet->header.u32All = pm_build_pm4_header(IT_QUERY_STATUS,
+ sizeof(struct pm4_mes_query_status));
+
+ packet->bitfields2.context_id = 0;
+ packet->bitfields2.interrupt_sel =
+ interrupt_sel__mes_query_status__completion_status;
+ packet->bitfields2.command =
+ command__mes_query_status__fence_only_after_write_ack;
+
+ packet->addr_hi = upper_32_bits((uint64_t)fence_address);
+ packet->addr_lo = lower_32_bits((uint64_t)fence_address);
+ packet->data_hi = upper_32_bits((uint64_t)fence_value);
+ packet->data_lo = lower_32_bits((uint64_t)fence_value);
+
+ return 0;
+}
+
+
+static int pm_release_mem_v10(uint64_t gpu_addr, uint32_t *buffer)
+{
+ struct pm4_mec_release_mem *packet;
+
+ WARN_ON(!buffer);
+
+ packet = (struct pm4_mec_release_mem *)buffer;
+ memset(buffer, 0, sizeof(struct pm4_mec_release_mem));
+
+ packet->header.u32All = pm_build_pm4_header(IT_RELEASE_MEM,
+ sizeof(struct pm4_mec_release_mem));
+
+ packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT;
+ packet->bitfields2.event_index = event_index__mec_release_mem__end_of_pipe;
+ packet->bitfields2.tcl1_action_ena = 1;
+ packet->bitfields2.tc_action_ena = 1;
+ packet->bitfields2.cache_policy = cache_policy__mec_release_mem__lru;
+
+ packet->bitfields3.data_sel = data_sel__mec_release_mem__send_32_bit_low;
+ packet->bitfields3.int_sel =
+ int_sel__mec_release_mem__send_interrupt_after_write_confirm;
+
+ packet->bitfields4.address_lo_32b = (gpu_addr & 0xffffffff) >> 2;
+ packet->address_hi = upper_32_bits(gpu_addr);
+
+ packet->data_lo = 0;
+
+ return sizeof(struct pm4_mec_release_mem) / sizeof(unsigned int);
+}
+
+const struct packet_manager_funcs kfd_v10_pm_funcs = {
+ .map_process = pm_map_process_v10,
+ .runlist = pm_runlist_v10,
+ .set_resources = pm_set_resources_vi,
+ .map_queues = pm_map_queues_v10,
+ .unmap_queues = pm_unmap_queues_v10,
+ .query_status = pm_query_status_v10,
+ .release_mem = pm_release_mem_v10,
+ .map_process_size = sizeof(struct pm4_mes_map_process),
+ .runlist_size = sizeof(struct pm4_mes_runlist),
+ .set_resources_size = sizeof(struct pm4_mes_set_resources),
+ .map_queues_size = sizeof(struct pm4_mes_map_queues),
+ .unmap_queues_size = sizeof(struct pm4_mes_unmap_queues),
+ .query_status_size = sizeof(struct pm4_mes_query_status),
+ .release_mem_size = sizeof(struct pm4_mec_release_mem)
+};
+
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c
index 33830b1a5a54..2d5ddf199bd0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c
@@ -134,6 +134,7 @@ static int pm_runlist_v9(struct packet_manager *pm, uint32_t *buffer,
packet->bitfields4.ib_size = ib_size_in_dwords;
packet->bitfields4.chain = chain ? 1 : 0;
packet->bitfields4.offload_polling = 0;
+ packet->bitfields4.chained_runlist_idle_disable = chain ? 1 : 0;
packet->bitfields4.valid = 1;
packet->bitfields4.process_cnt = concurrent_proc_cnt;
packet->ordinal2 = lower_32_bits(ib);
@@ -153,14 +154,13 @@ static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer,
packet->header.u32All = pm_build_pm4_header(IT_MAP_QUEUES,
sizeof(struct pm4_mes_map_queues));
- packet->bitfields2.alloc_format =
- alloc_format__mes_map_queues__one_per_pipe_vi;
packet->bitfields2.num_queues = 1;
packet->bitfields2.queue_sel =
queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi;
packet->bitfields2.engine_sel =
engine_sel__mes_map_queues__compute_vi;
+ packet->bitfields2.gws_control_queue = q->gws ? 1 : 0;
packet->bitfields2.queue_type =
queue_type__mes_map_queues__normal_compute_vi;
@@ -175,6 +175,7 @@ static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer,
queue_type__mes_map_queues__debug_interface_queue_vi;
break;
case KFD_QUEUE_TYPE_SDMA:
+ case KFD_QUEUE_TYPE_SDMA_XGMI:
packet->bitfields2.engine_sel = q->properties.sdma_engine_id +
engine_sel__mes_map_queues__sdma0_vi;
use_static = false; /* no static queues under SDMA */
@@ -221,6 +222,7 @@ static int pm_unmap_queues_v9(struct packet_manager *pm, uint32_t *buffer,
engine_sel__mes_unmap_queues__compute;
break;
case KFD_QUEUE_TYPE_SDMA:
+ case KFD_QUEUE_TYPE_SDMA_XGMI:
packet->bitfields2.engine_sel =
engine_sel__mes_unmap_queues__sdma0 + sdma_engine;
break;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c
index bf20c6d32ef3..2adaf40027eb 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c
@@ -190,8 +190,6 @@ static int pm_map_queues_vi(struct packet_manager *pm, uint32_t *buffer,
packet->header.u32All = pm_build_pm4_header(IT_MAP_QUEUES,
sizeof(struct pm4_mes_map_queues));
- packet->bitfields2.alloc_format =
- alloc_format__mes_map_queues__one_per_pipe_vi;
packet->bitfields2.num_queues = 1;
packet->bitfields2.queue_sel =
queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi;
@@ -212,6 +210,7 @@ static int pm_map_queues_vi(struct packet_manager *pm, uint32_t *buffer,
queue_type__mes_map_queues__debug_interface_queue_vi;
break;
case KFD_QUEUE_TYPE_SDMA:
+ case KFD_QUEUE_TYPE_SDMA_XGMI:
packet->bitfields2.engine_sel = q->properties.sdma_engine_id +
engine_sel__mes_map_queues__sdma0_vi;
use_static = false; /* no static queues under SDMA */
@@ -258,6 +257,7 @@ static int pm_unmap_queues_vi(struct packet_manager *pm, uint32_t *buffer,
engine_sel__mes_unmap_queues__compute;
break;
case KFD_QUEUE_TYPE_SDMA:
+ case KFD_QUEUE_TYPE_SDMA_XGMI:
packet->bitfields2.engine_sel =
engine_sel__mes_unmap_queues__sdma0 + sdma_engine;
break;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c
index 932007eb9168..986ff52d5750 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c
@@ -56,6 +56,11 @@ static int kfd_init(void)
if (err < 0)
goto err_create_wq;
+ /* Ignore the return value, so that we can continue
+ * to init the KFD, even if procfs isn't craated
+ */
+ kfd_procfs_init();
+
kfd_debugfs_init();
return 0;
@@ -72,6 +77,7 @@ static void kfd_exit(void)
{
kfd_debugfs_fini();
kfd_process_destroy_wq();
+ kfd_procfs_shutdown();
kfd_topology_shutdown();
kfd_chardev_exit();
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
index aed9b9b82213..d6cf391da591 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
@@ -23,34 +23,74 @@
#include "kfd_mqd_manager.h"
#include "amdgpu_amdkfd.h"
+#include "kfd_device_queue_manager.h"
-struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type,
- struct kfd_dev *dev)
+/* Mapping queue priority to pipe priority, indexed by queue priority */
+int pipe_priority_map[] = {
+ KFD_PIPE_PRIORITY_CS_LOW,
+ KFD_PIPE_PRIORITY_CS_LOW,
+ KFD_PIPE_PRIORITY_CS_LOW,
+ KFD_PIPE_PRIORITY_CS_LOW,
+ KFD_PIPE_PRIORITY_CS_LOW,
+ KFD_PIPE_PRIORITY_CS_LOW,
+ KFD_PIPE_PRIORITY_CS_LOW,
+ KFD_PIPE_PRIORITY_CS_MEDIUM,
+ KFD_PIPE_PRIORITY_CS_MEDIUM,
+ KFD_PIPE_PRIORITY_CS_MEDIUM,
+ KFD_PIPE_PRIORITY_CS_MEDIUM,
+ KFD_PIPE_PRIORITY_CS_HIGH,
+ KFD_PIPE_PRIORITY_CS_HIGH,
+ KFD_PIPE_PRIORITY_CS_HIGH,
+ KFD_PIPE_PRIORITY_CS_HIGH,
+ KFD_PIPE_PRIORITY_CS_HIGH
+};
+
+struct kfd_mem_obj *allocate_hiq_mqd(struct kfd_dev *dev, struct queue_properties *q)
{
- switch (dev->device_info->asic_family) {
- case CHIP_KAVERI:
- return mqd_manager_init_cik(type, dev);
- case CHIP_HAWAII:
- return mqd_manager_init_cik_hawaii(type, dev);
- case CHIP_CARRIZO:
- return mqd_manager_init_vi(type, dev);
- case CHIP_TONGA:
- case CHIP_FIJI:
- case CHIP_POLARIS10:
- case CHIP_POLARIS11:
- case CHIP_POLARIS12:
- return mqd_manager_init_vi_tonga(type, dev);
- case CHIP_VEGA10:
- case CHIP_VEGA12:
- case CHIP_VEGA20:
- case CHIP_RAVEN:
- return mqd_manager_init_v9(type, dev);
- default:
- WARN(1, "Unexpected ASIC family %u",
- dev->device_info->asic_family);
- }
+ struct kfd_mem_obj *mqd_mem_obj = NULL;
+
+ mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL);
+ if (!mqd_mem_obj)
+ return NULL;
+
+ mqd_mem_obj->gtt_mem = dev->dqm->hiq_sdma_mqd.gtt_mem;
+ mqd_mem_obj->gpu_addr = dev->dqm->hiq_sdma_mqd.gpu_addr;
+ mqd_mem_obj->cpu_ptr = dev->dqm->hiq_sdma_mqd.cpu_ptr;
+
+ return mqd_mem_obj;
+}
+
+struct kfd_mem_obj *allocate_sdma_mqd(struct kfd_dev *dev,
+ struct queue_properties *q)
+{
+ struct kfd_mem_obj *mqd_mem_obj = NULL;
+ uint64_t offset;
- return NULL;
+ mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL);
+ if (!mqd_mem_obj)
+ return NULL;
+
+ offset = (q->sdma_engine_id *
+ dev->device_info->num_sdma_queues_per_engine +
+ q->sdma_queue_id) *
+ dev->dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]->mqd_size;
+
+ offset += dev->dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size;
+
+ mqd_mem_obj->gtt_mem = (void *)((uint64_t)dev->dqm->hiq_sdma_mqd.gtt_mem
+ + offset);
+ mqd_mem_obj->gpu_addr = dev->dqm->hiq_sdma_mqd.gpu_addr + offset;
+ mqd_mem_obj->cpu_ptr = (uint32_t *)((uint64_t)
+ dev->dqm->hiq_sdma_mqd.cpu_ptr + offset);
+
+ return mqd_mem_obj;
+}
+
+void free_mqd_hiq_sdma(struct mqd_manager *mm, void *mqd,
+ struct kfd_mem_obj *mqd_mem_obj)
+{
+ WARN_ON(!mqd_mem_obj->gtt_mem);
+ kfree(mqd_mem_obj);
}
void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
index f8261313ae7b..550b61e81015 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
@@ -39,7 +39,7 @@
* @destroy_mqd: Destroys the HQD slot and by that preempt the relevant queue.
* Used only for no cp scheduling.
*
- * @uninit_mqd: Releases the mqd buffer from local gpu memory.
+ * @free_mqd: Releases the mqd buffer from local gpu memory.
*
* @is_occupied: Checks if the relevant HQD slot is occupied.
*
@@ -62,10 +62,13 @@
* per KFD_MQD_TYPE for each device.
*
*/
-
+extern int pipe_priority_map[];
struct mqd_manager {
- int (*init_mqd)(struct mqd_manager *mm, void **mqd,
- struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
+ struct kfd_mem_obj* (*allocate_mqd)(struct kfd_dev *kfd,
+ struct queue_properties *q);
+
+ void (*init_mqd)(struct mqd_manager *mm, void **mqd,
+ struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
struct queue_properties *q);
int (*load_mqd)(struct mqd_manager *mm, void *mqd,
@@ -73,7 +76,7 @@ struct mqd_manager {
struct queue_properties *p,
struct mm_struct *mms);
- int (*update_mqd)(struct mqd_manager *mm, void *mqd,
+ void (*update_mqd)(struct mqd_manager *mm, void *mqd,
struct queue_properties *q);
int (*destroy_mqd)(struct mqd_manager *mm, void *mqd,
@@ -81,7 +84,7 @@ struct mqd_manager {
unsigned int timeout, uint32_t pipe_id,
uint32_t queue_id);
- void (*uninit_mqd)(struct mqd_manager *mm, void *mqd,
+ void (*free_mqd)(struct mqd_manager *mm, void *mqd,
struct kfd_mem_obj *mqd_mem_obj);
bool (*is_occupied)(struct mqd_manager *mm, void *mqd,
@@ -99,8 +102,17 @@ struct mqd_manager {
struct mutex mqd_mutex;
struct kfd_dev *dev;
+ uint32_t mqd_size;
};
+struct kfd_mem_obj *allocate_hiq_mqd(struct kfd_dev *dev,
+ struct queue_properties *q);
+
+struct kfd_mem_obj *allocate_sdma_mqd(struct kfd_dev *dev,
+ struct queue_properties *q);
+void free_mqd_hiq_sdma(struct mqd_manager *mm, void *mqd,
+ struct kfd_mem_obj *mqd_mem_obj);
+
void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm,
const uint32_t *cu_mask, uint32_t cu_mask_count,
uint32_t *se_mask);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
index ae90a99909ef..28876aceb14b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
@@ -66,22 +66,33 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd,
m->compute_static_thread_mgmt_se3);
}
-static int init_mqd(struct mqd_manager *mm, void **mqd,
- struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
+static void set_priority(struct cik_mqd *m, struct queue_properties *q)
+{
+ m->cp_hqd_pipe_priority = pipe_priority_map[q->priority];
+ m->cp_hqd_queue_priority = q->priority;
+}
+
+static struct kfd_mem_obj *allocate_mqd(struct kfd_dev *kfd,
+ struct queue_properties *q)
+{
+ struct kfd_mem_obj *mqd_mem_obj;
+
+ if (kfd_gtt_sa_allocate(kfd, sizeof(struct cik_mqd),
+ &mqd_mem_obj))
+ return NULL;
+
+ return mqd_mem_obj;
+}
+
+static void init_mqd(struct mqd_manager *mm, void **mqd,
+ struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
struct queue_properties *q)
{
uint64_t addr;
struct cik_mqd *m;
- int retval;
-
- retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct cik_mqd),
- mqd_mem_obj);
- if (retval != 0)
- return -ENOMEM;
-
- m = (struct cik_mqd *) (*mqd_mem_obj)->cpu_ptr;
- addr = (*mqd_mem_obj)->gpu_addr;
+ m = (struct cik_mqd *) mqd_mem_obj->cpu_ptr;
+ addr = mqd_mem_obj->gpu_addr;
memset(m, 0, ALIGN(sizeof(struct cik_mqd), 256));
@@ -116,8 +127,7 @@ static int init_mqd(struct mqd_manager *mm, void **mqd,
* 1 = CS_MEDIUM (typically between HP3D and GFX
* 2 = CS_HIGH (typically above HP3D)
*/
- m->cp_hqd_pipe_priority = 1;
- m->cp_hqd_queue_priority = 15;
+ set_priority(m, q);
if (q->format == KFD_QUEUE_FORMAT_AQL)
m->cp_hqd_iq_rptr = AQL_ENABLE;
@@ -125,49 +135,32 @@ static int init_mqd(struct mqd_manager *mm, void **mqd,
*mqd = m;
if (gart_addr)
*gart_addr = addr;
- retval = mm->update_mqd(mm, m, q);
-
- return retval;
+ mm->update_mqd(mm, m, q);
}
-static int init_mqd_sdma(struct mqd_manager *mm, void **mqd,
- struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
+static void init_mqd_sdma(struct mqd_manager *mm, void **mqd,
+ struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
struct queue_properties *q)
{
- int retval;
struct cik_sdma_rlc_registers *m;
- retval = kfd_gtt_sa_allocate(mm->dev,
- sizeof(struct cik_sdma_rlc_registers),
- mqd_mem_obj);
-
- if (retval != 0)
- return -ENOMEM;
-
- m = (struct cik_sdma_rlc_registers *) (*mqd_mem_obj)->cpu_ptr;
+ m = (struct cik_sdma_rlc_registers *) mqd_mem_obj->cpu_ptr;
memset(m, 0, sizeof(struct cik_sdma_rlc_registers));
*mqd = m;
if (gart_addr)
- *gart_addr = (*mqd_mem_obj)->gpu_addr;
+ *gart_addr = mqd_mem_obj->gpu_addr;
- retval = mm->update_mqd(mm, m, q);
-
- return retval;
+ mm->update_mqd(mm, m, q);
}
-static void uninit_mqd(struct mqd_manager *mm, void *mqd,
+static void free_mqd(struct mqd_manager *mm, void *mqd,
struct kfd_mem_obj *mqd_mem_obj)
{
kfd_gtt_sa_free(mm->dev, mqd_mem_obj);
}
-static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd,
- struct kfd_mem_obj *mqd_mem_obj)
-{
- kfd_gtt_sa_free(mm->dev, mqd_mem_obj);
-}
static int load_mqd(struct mqd_manager *mm, void *mqd, uint32_t pipe_id,
uint32_t queue_id, struct queue_properties *p,
@@ -191,7 +184,7 @@ static int load_mqd_sdma(struct mqd_manager *mm, void *mqd,
mms);
}
-static int __update_mqd(struct mqd_manager *mm, void *mqd,
+static void __update_mqd(struct mqd_manager *mm, void *mqd,
struct queue_properties *q, unsigned int atc_bit)
{
struct cik_mqd *m;
@@ -222,28 +215,24 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd,
m->cp_hqd_pq_control |= NO_UPDATE_RPTR;
update_cu_mask(mm, mqd, q);
+ set_priority(m, q);
- q->is_active = (q->queue_size > 0 &&
- q->queue_address != 0 &&
- q->queue_percent > 0 &&
- !q->is_evicted);
-
- return 0;
+ q->is_active = QUEUE_IS_ACTIVE(*q);
}
-static int update_mqd(struct mqd_manager *mm, void *mqd,
+static void update_mqd(struct mqd_manager *mm, void *mqd,
struct queue_properties *q)
{
- return __update_mqd(mm, mqd, q, 1);
+ __update_mqd(mm, mqd, q, 1);
}
-static int update_mqd_hawaii(struct mqd_manager *mm, void *mqd,
+static void update_mqd_hawaii(struct mqd_manager *mm, void *mqd,
struct queue_properties *q)
{
- return __update_mqd(mm, mqd, q, 0);
+ __update_mqd(mm, mqd, q, 0);
}
-static int update_mqd_sdma(struct mqd_manager *mm, void *mqd,
+static void update_mqd_sdma(struct mqd_manager *mm, void *mqd,
struct queue_properties *q)
{
struct cik_sdma_rlc_registers *m;
@@ -267,12 +256,7 @@ static int update_mqd_sdma(struct mqd_manager *mm, void *mqd,
m->sdma_engine_id = q->sdma_engine_id;
m->sdma_queue_id = q->sdma_queue_id;
- q->is_active = (q->queue_size > 0 &&
- q->queue_address != 0 &&
- q->queue_percent > 0 &&
- !q->is_evicted);
-
- return 0;
+ q->is_active = QUEUE_IS_ACTIVE(*q);
}
static int destroy_mqd(struct mqd_manager *mm, void *mqd,
@@ -319,14 +303,14 @@ static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd,
* queues but with different initial values.
*/
-static int init_mqd_hiq(struct mqd_manager *mm, void **mqd,
- struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
+static void init_mqd_hiq(struct mqd_manager *mm, void **mqd,
+ struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
struct queue_properties *q)
{
- return init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q);
+ init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q);
}
-static int update_mqd_hiq(struct mqd_manager *mm, void *mqd,
+static void update_mqd_hiq(struct mqd_manager *mm, void *mqd,
struct queue_properties *q)
{
struct cik_mqd *m;
@@ -350,12 +334,9 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd,
m->cp_hqd_vmid = q->vmid;
- q->is_active = (q->queue_size > 0 &&
- q->queue_address != 0 &&
- q->queue_percent > 0 &&
- !q->is_evicted);
+ q->is_active = QUEUE_IS_ACTIVE(*q);
- return 0;
+ set_priority(m, q);
}
#if defined(CONFIG_DEBUG_FS)
@@ -394,34 +375,53 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type,
switch (type) {
case KFD_MQD_TYPE_CP:
case KFD_MQD_TYPE_COMPUTE:
+ mqd->allocate_mqd = allocate_mqd;
mqd->init_mqd = init_mqd;
- mqd->uninit_mqd = uninit_mqd;
+ mqd->free_mqd = free_mqd;
mqd->load_mqd = load_mqd;
mqd->update_mqd = update_mqd;
mqd->destroy_mqd = destroy_mqd;
mqd->is_occupied = is_occupied;
+ mqd->mqd_size = sizeof(struct cik_mqd);
#if defined(CONFIG_DEBUG_FS)
mqd->debugfs_show_mqd = debugfs_show_mqd;
#endif
break;
case KFD_MQD_TYPE_HIQ:
+ mqd->allocate_mqd = allocate_hiq_mqd;
+ mqd->init_mqd = init_mqd_hiq;
+ mqd->free_mqd = free_mqd_hiq_sdma;
+ mqd->load_mqd = load_mqd;
+ mqd->update_mqd = update_mqd_hiq;
+ mqd->destroy_mqd = destroy_mqd;
+ mqd->is_occupied = is_occupied;
+ mqd->mqd_size = sizeof(struct cik_mqd);
+#if defined(CONFIG_DEBUG_FS)
+ mqd->debugfs_show_mqd = debugfs_show_mqd;
+#endif
+ break;
+ case KFD_MQD_TYPE_DIQ:
+ mqd->allocate_mqd = allocate_hiq_mqd;
mqd->init_mqd = init_mqd_hiq;
- mqd->uninit_mqd = uninit_mqd;
+ mqd->free_mqd = free_mqd;
mqd->load_mqd = load_mqd;
mqd->update_mqd = update_mqd_hiq;
mqd->destroy_mqd = destroy_mqd;
mqd->is_occupied = is_occupied;
+ mqd->mqd_size = sizeof(struct cik_mqd);
#if defined(CONFIG_DEBUG_FS)
mqd->debugfs_show_mqd = debugfs_show_mqd;
#endif
break;
case KFD_MQD_TYPE_SDMA:
+ mqd->allocate_mqd = allocate_sdma_mqd;
mqd->init_mqd = init_mqd_sdma;
- mqd->uninit_mqd = uninit_mqd_sdma;
+ mqd->free_mqd = free_mqd_hiq_sdma;
mqd->load_mqd = load_mqd_sdma;
mqd->update_mqd = update_mqd_sdma;
mqd->destroy_mqd = destroy_mqd_sdma;
mqd->is_occupied = is_occupied_sdma;
+ mqd->mqd_size = sizeof(struct cik_sdma_rlc_registers);
#if defined(CONFIG_DEBUG_FS)
mqd->debugfs_show_mqd = debugfs_show_mqd_sdma;
#endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
new file mode 100644
index 000000000000..4f8a6ffc5775
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
@@ -0,0 +1,498 @@
+/*
+ * Copyright 2018 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include "kfd_priv.h"
+#include "kfd_mqd_manager.h"
+#include "v10_structs.h"
+#include "gc/gc_10_1_0_offset.h"
+#include "gc/gc_10_1_0_sh_mask.h"
+#include "amdgpu_amdkfd.h"
+
+static inline struct v10_compute_mqd *get_mqd(void *mqd)
+{
+ return (struct v10_compute_mqd *)mqd;
+}
+
+static inline struct v10_sdma_mqd *get_sdma_mqd(void *mqd)
+{
+ return (struct v10_sdma_mqd *)mqd;
+}
+
+static void update_cu_mask(struct mqd_manager *mm, void *mqd,
+ struct queue_properties *q)
+{
+ struct v10_compute_mqd *m;
+ uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */
+
+ if (q->cu_mask_count == 0)
+ return;
+
+ mqd_symmetrically_map_cu_mask(mm,
+ q->cu_mask, q->cu_mask_count, se_mask);
+
+ m = get_mqd(mqd);
+ m->compute_static_thread_mgmt_se0 = se_mask[0];
+ m->compute_static_thread_mgmt_se1 = se_mask[1];
+ m->compute_static_thread_mgmt_se2 = se_mask[2];
+ m->compute_static_thread_mgmt_se3 = se_mask[3];
+
+ pr_debug("update cu mask to %#x %#x %#x %#x\n",
+ m->compute_static_thread_mgmt_se0,
+ m->compute_static_thread_mgmt_se1,
+ m->compute_static_thread_mgmt_se2,
+ m->compute_static_thread_mgmt_se3);
+}
+
+static struct kfd_mem_obj *allocate_mqd(struct kfd_dev *kfd,
+ struct queue_properties *q)
+{
+ int retval;
+ struct kfd_mem_obj *mqd_mem_obj = NULL;
+
+ /* From V9, for CWSR, the control stack is located on the next page
+ * boundary after the mqd, we will use the gtt allocation function
+ * instead of sub-allocation function.
+ */
+ if (kfd->cwsr_enabled && (q->type == KFD_QUEUE_TYPE_COMPUTE)) {
+ mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_NOIO);
+ if (!mqd_mem_obj)
+ return NULL;
+ retval = amdgpu_amdkfd_alloc_gtt_mem(kfd->kgd,
+ ALIGN(q->ctl_stack_size, PAGE_SIZE) +
+ ALIGN(sizeof(struct v10_compute_mqd), PAGE_SIZE),
+ &(mqd_mem_obj->gtt_mem),
+ &(mqd_mem_obj->gpu_addr),
+ (void *)&(mqd_mem_obj->cpu_ptr), true);
+ } else {
+ retval = kfd_gtt_sa_allocate(kfd, sizeof(struct v10_compute_mqd),
+ &mqd_mem_obj);
+ }
+
+ if (retval) {
+ kfree(mqd_mem_obj);
+ return NULL;
+ }
+
+ return mqd_mem_obj;
+
+}
+
+static void init_mqd(struct mqd_manager *mm, void **mqd,
+ struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
+ struct queue_properties *q)
+{
+ uint64_t addr;
+ struct v10_compute_mqd *m;
+
+ m = (struct v10_compute_mqd *) mqd_mem_obj->cpu_ptr;
+ addr = mqd_mem_obj->gpu_addr;
+
+ memset(m, 0, sizeof(struct v10_compute_mqd));
+
+ m->header = 0xC0310800;
+ m->compute_pipelinestat_enable = 1;
+ m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF;
+ m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF;
+ m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF;
+ m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF;
+
+ m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK |
+ 0x53 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT;
+
+ m->cp_mqd_control = 1 << CP_MQD_CONTROL__PRIV_STATE__SHIFT;
+
+ m->cp_mqd_base_addr_lo = lower_32_bits(addr);
+ m->cp_mqd_base_addr_hi = upper_32_bits(addr);
+
+ m->cp_hqd_quantum = 1 << CP_HQD_QUANTUM__QUANTUM_EN__SHIFT |
+ 1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT |
+ 10 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT;
+
+ m->cp_hqd_pipe_priority = 1;
+ m->cp_hqd_queue_priority = 15;
+
+ if (q->format == KFD_QUEUE_FORMAT_AQL) {
+ m->cp_hqd_aql_control =
+ 1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT;
+ }
+
+ if (mm->dev->cwsr_enabled) {
+ m->cp_hqd_persistent_state |=
+ (1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT);
+ m->cp_hqd_ctx_save_base_addr_lo =
+ lower_32_bits(q->ctx_save_restore_area_address);
+ m->cp_hqd_ctx_save_base_addr_hi =
+ upper_32_bits(q->ctx_save_restore_area_address);
+ m->cp_hqd_ctx_save_size = q->ctx_save_restore_area_size;
+ m->cp_hqd_cntl_stack_size = q->ctl_stack_size;
+ m->cp_hqd_cntl_stack_offset = q->ctl_stack_size;
+ m->cp_hqd_wg_state_offset = q->ctl_stack_size;
+ }
+
+ *mqd = m;
+ if (gart_addr)
+ *gart_addr = addr;
+ mm->update_mqd(mm, m, q);
+}
+
+static int load_mqd(struct mqd_manager *mm, void *mqd,
+ uint32_t pipe_id, uint32_t queue_id,
+ struct queue_properties *p, struct mm_struct *mms)
+{
+ int r = 0;
+ /* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */
+ uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 4 : 0);
+
+ r = mm->dev->kfd2kgd->hqd_load(mm->dev->kgd, mqd, pipe_id, queue_id,
+ (uint32_t __user *)p->write_ptr,
+ wptr_shift, 0, mms);
+ return r;
+}
+
+static void update_mqd(struct mqd_manager *mm, void *mqd,
+ struct queue_properties *q)
+{
+ struct v10_compute_mqd *m;
+
+ m = get_mqd(mqd);
+
+ m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT;
+ m->cp_hqd_pq_control |=
+ ffs(q->queue_size / sizeof(unsigned int)) - 1 - 1;
+ pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control);
+
+ m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8);
+ m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8);
+
+ m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr);
+ m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr);
+ m->cp_hqd_pq_wptr_poll_addr_lo = lower_32_bits((uint64_t)q->write_ptr);
+ m->cp_hqd_pq_wptr_poll_addr_hi = upper_32_bits((uint64_t)q->write_ptr);
+
+ m->cp_hqd_pq_doorbell_control =
+ q->doorbell_off <<
+ CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT;
+ pr_debug("cp_hqd_pq_doorbell_control 0x%x\n",
+ m->cp_hqd_pq_doorbell_control);
+
+ m->cp_hqd_ib_control = 3 << CP_HQD_IB_CONTROL__MIN_IB_AVAIL_SIZE__SHIFT;
+
+ /*
+ * HW does not clamp this field correctly. Maximum EOP queue size
+ * is constrained by per-SE EOP done signal count, which is 8-bit.
+ * Limit is 0xFF EOP entries (= 0x7F8 dwords). CP will not submit
+ * more than (EOP entry count - 1) so a queue size of 0x800 dwords
+ * is safe, giving a maximum field value of 0xA.
+ */
+ m->cp_hqd_eop_control = min(0xA,
+ ffs(q->eop_ring_buffer_size / sizeof(unsigned int)) - 1 - 1);
+ m->cp_hqd_eop_base_addr_lo =
+ lower_32_bits(q->eop_ring_buffer_address >> 8);
+ m->cp_hqd_eop_base_addr_hi =
+ upper_32_bits(q->eop_ring_buffer_address >> 8);
+
+ m->cp_hqd_iq_timer = 0;
+
+ m->cp_hqd_vmid = q->vmid;
+
+ if (q->format == KFD_QUEUE_FORMAT_AQL) {
+ /* GC 10 removed WPP_CLAMP from PQ Control */
+ m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK |
+ 2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT |
+ 1 << CP_HQD_PQ_CONTROL__QUEUE_FULL_EN__SHIFT ;
+ m->cp_hqd_pq_doorbell_control |=
+ 1 << CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_BIF_DROP__SHIFT;
+ }
+ if (mm->dev->cwsr_enabled)
+ m->cp_hqd_ctx_save_control = 0;
+
+ update_cu_mask(mm, mqd, q);
+
+ q->is_active = (q->queue_size > 0 &&
+ q->queue_address != 0 &&
+ q->queue_percent > 0 &&
+ !q->is_evicted);
+}
+
+static int destroy_mqd(struct mqd_manager *mm, void *mqd,
+ enum kfd_preempt_type type,
+ unsigned int timeout, uint32_t pipe_id,
+ uint32_t queue_id)
+{
+ return mm->dev->kfd2kgd->hqd_destroy
+ (mm->dev->kgd, mqd, type, timeout,
+ pipe_id, queue_id);
+}
+
+static void free_mqd(struct mqd_manager *mm, void *mqd,
+ struct kfd_mem_obj *mqd_mem_obj)
+{
+ struct kfd_dev *kfd = mm->dev;
+
+ if (mqd_mem_obj->gtt_mem) {
+ amdgpu_amdkfd_free_gtt_mem(kfd->kgd, mqd_mem_obj->gtt_mem);
+ kfree(mqd_mem_obj);
+ } else {
+ kfd_gtt_sa_free(mm->dev, mqd_mem_obj);
+ }
+}
+
+static bool is_occupied(struct mqd_manager *mm, void *mqd,
+ uint64_t queue_address, uint32_t pipe_id,
+ uint32_t queue_id)
+{
+ return mm->dev->kfd2kgd->hqd_is_occupied(
+ mm->dev->kgd, queue_address,
+ pipe_id, queue_id);
+}
+
+static int get_wave_state(struct mqd_manager *mm, void *mqd,
+ void __user *ctl_stack,
+ u32 *ctl_stack_used_size,
+ u32 *save_area_used_size)
+{
+ struct v10_compute_mqd *m;
+
+ /* Control stack is located one page after MQD. */
+ void *mqd_ctl_stack = (void *)((uintptr_t)mqd + PAGE_SIZE);
+
+ m = get_mqd(mqd);
+
+ *ctl_stack_used_size = m->cp_hqd_cntl_stack_size -
+ m->cp_hqd_cntl_stack_offset;
+ *save_area_used_size = m->cp_hqd_wg_state_offset -
+ m->cp_hqd_cntl_stack_size;
+
+ if (copy_to_user(ctl_stack, mqd_ctl_stack, m->cp_hqd_cntl_stack_size))
+ return -EFAULT;
+
+ return 0;
+}
+
+static void init_mqd_hiq(struct mqd_manager *mm, void **mqd,
+ struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
+ struct queue_properties *q)
+{
+ struct v10_compute_mqd *m;
+
+ init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q);
+
+ m = get_mqd(*mqd);
+
+ m->cp_hqd_pq_control |= 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT |
+ 1 << CP_HQD_PQ_CONTROL__KMD_QUEUE__SHIFT;
+}
+
+static void update_mqd_hiq(struct mqd_manager *mm, void *mqd,
+ struct queue_properties *q)
+{
+ struct v10_compute_mqd *m;
+
+ update_mqd(mm, mqd, q);
+
+ /* TODO: what's the point? update_mqd already does this. */
+ m = get_mqd(mqd);
+ m->cp_hqd_vmid = q->vmid;
+}
+
+static void init_mqd_sdma(struct mqd_manager *mm, void **mqd,
+ struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
+ struct queue_properties *q)
+{
+ struct v10_sdma_mqd *m;
+
+ m = (struct v10_sdma_mqd *) mqd_mem_obj->cpu_ptr;
+
+ memset(m, 0, sizeof(struct v10_sdma_mqd));
+
+ *mqd = m;
+ if (gart_addr)
+ *gart_addr = mqd_mem_obj->gpu_addr;
+
+ mm->update_mqd(mm, m, q);
+}
+
+static int load_mqd_sdma(struct mqd_manager *mm, void *mqd,
+ uint32_t pipe_id, uint32_t queue_id,
+ struct queue_properties *p, struct mm_struct *mms)
+{
+ return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd,
+ (uint32_t __user *)p->write_ptr,
+ mms);
+}
+
+#define SDMA_RLC_DUMMY_DEFAULT 0xf
+
+static void update_mqd_sdma(struct mqd_manager *mm, void *mqd,
+ struct queue_properties *q)
+{
+ struct v10_sdma_mqd *m;
+
+ m = get_sdma_mqd(mqd);
+ m->sdmax_rlcx_rb_cntl = (ffs(q->queue_size / sizeof(unsigned int)) - 1)
+ << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT |
+ q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT |
+ 1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT |
+ 6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT;
+
+ m->sdmax_rlcx_rb_base = lower_32_bits(q->queue_address >> 8);
+ m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8);
+ m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr);
+ m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr);
+ m->sdmax_rlcx_doorbell_offset =
+ q->doorbell_off << SDMA0_RLC0_DOORBELL_OFFSET__OFFSET__SHIFT;
+
+ m->sdma_engine_id = q->sdma_engine_id;
+ m->sdma_queue_id = q->sdma_queue_id;
+ m->sdmax_rlcx_dummy_reg = SDMA_RLC_DUMMY_DEFAULT;
+
+
+ q->is_active = (q->queue_size > 0 &&
+ q->queue_address != 0 &&
+ q->queue_percent > 0 &&
+ !q->is_evicted);
+}
+
+/*
+ * * preempt type here is ignored because there is only one way
+ * * to preempt sdma queue
+ */
+static int destroy_mqd_sdma(struct mqd_manager *mm, void *mqd,
+ enum kfd_preempt_type type,
+ unsigned int timeout, uint32_t pipe_id,
+ uint32_t queue_id)
+{
+ return mm->dev->kfd2kgd->hqd_sdma_destroy(mm->dev->kgd, mqd, timeout);
+}
+
+static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd,
+ uint64_t queue_address, uint32_t pipe_id,
+ uint32_t queue_id)
+{
+ return mm->dev->kfd2kgd->hqd_sdma_is_occupied(mm->dev->kgd, mqd);
+}
+
+#if defined(CONFIG_DEBUG_FS)
+
+static int debugfs_show_mqd(struct seq_file *m, void *data)
+{
+ seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4,
+ data, sizeof(struct v10_compute_mqd), false);
+ return 0;
+}
+
+static int debugfs_show_mqd_sdma(struct seq_file *m, void *data)
+{
+ seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4,
+ data, sizeof(struct v10_sdma_mqd), false);
+ return 0;
+}
+
+#endif
+
+struct mqd_manager *mqd_manager_init_v10(enum KFD_MQD_TYPE type,
+ struct kfd_dev *dev)
+{
+ struct mqd_manager *mqd;
+
+ if (WARN_ON(type >= KFD_MQD_TYPE_MAX))
+ return NULL;
+
+ mqd = kzalloc(sizeof(*mqd), GFP_NOIO);
+ if (!mqd)
+ return NULL;
+
+ mqd->dev = dev;
+
+ switch (type) {
+ case KFD_MQD_TYPE_CP:
+ pr_debug("%s@%i\n", __func__, __LINE__);
+ case KFD_MQD_TYPE_COMPUTE:
+ pr_debug("%s@%i\n", __func__, __LINE__);
+ mqd->allocate_mqd = allocate_mqd;
+ mqd->init_mqd = init_mqd;
+ mqd->free_mqd = free_mqd;
+ mqd->load_mqd = load_mqd;
+ mqd->update_mqd = update_mqd;
+ mqd->destroy_mqd = destroy_mqd;
+ mqd->is_occupied = is_occupied;
+ mqd->mqd_size = sizeof(struct v10_compute_mqd);
+ mqd->get_wave_state = get_wave_state;
+#if defined(CONFIG_DEBUG_FS)
+ mqd->debugfs_show_mqd = debugfs_show_mqd;
+#endif
+ pr_debug("%s@%i\n", __func__, __LINE__);
+ break;
+ case KFD_MQD_TYPE_HIQ:
+ pr_debug("%s@%i\n", __func__, __LINE__);
+ mqd->allocate_mqd = allocate_hiq_mqd;
+ mqd->init_mqd = init_mqd_hiq;
+ mqd->free_mqd = free_mqd_hiq_sdma;
+ mqd->load_mqd = load_mqd;
+ mqd->update_mqd = update_mqd_hiq;
+ mqd->destroy_mqd = destroy_mqd;
+ mqd->is_occupied = is_occupied;
+ mqd->mqd_size = sizeof(struct v10_compute_mqd);
+#if defined(CONFIG_DEBUG_FS)
+ mqd->debugfs_show_mqd = debugfs_show_mqd;
+#endif
+ pr_debug("%s@%i\n", __func__, __LINE__);
+ break;
+ case KFD_MQD_TYPE_DIQ:
+ mqd->allocate_mqd = allocate_hiq_mqd;
+ mqd->init_mqd = init_mqd_hiq;
+ mqd->free_mqd = free_mqd;
+ mqd->load_mqd = load_mqd;
+ mqd->update_mqd = update_mqd_hiq;
+ mqd->destroy_mqd = destroy_mqd;
+ mqd->is_occupied = is_occupied;
+ mqd->mqd_size = sizeof(struct v10_compute_mqd);
+#if defined(CONFIG_DEBUG_FS)
+ mqd->debugfs_show_mqd = debugfs_show_mqd;
+#endif
+ break;
+ case KFD_MQD_TYPE_SDMA:
+ pr_debug("%s@%i\n", __func__, __LINE__);
+ mqd->allocate_mqd = allocate_sdma_mqd;
+ mqd->init_mqd = init_mqd_sdma;
+ mqd->free_mqd = free_mqd_hiq_sdma;
+ mqd->load_mqd = load_mqd_sdma;
+ mqd->update_mqd = update_mqd_sdma;
+ mqd->destroy_mqd = destroy_mqd_sdma;
+ mqd->is_occupied = is_occupied_sdma;
+ mqd->mqd_size = sizeof(struct v10_sdma_mqd);
+#if defined(CONFIG_DEBUG_FS)
+ mqd->debugfs_show_mqd = debugfs_show_mqd_sdma;
+#endif
+ pr_debug("%s@%i\n", __func__, __LINE__);
+ break;
+ default:
+ kfree(mqd);
+ return NULL;
+ }
+
+ return mqd;
+}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
index 9dbba609450e..0c58f91b3ff3 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
@@ -67,37 +67,55 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd,
m->compute_static_thread_mgmt_se3);
}
-static int init_mqd(struct mqd_manager *mm, void **mqd,
- struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
- struct queue_properties *q)
+static void set_priority(struct v9_mqd *m, struct queue_properties *q)
+{
+ m->cp_hqd_pipe_priority = pipe_priority_map[q->priority];
+ m->cp_hqd_queue_priority = q->priority;
+}
+
+static struct kfd_mem_obj *allocate_mqd(struct kfd_dev *kfd,
+ struct queue_properties *q)
{
int retval;
- uint64_t addr;
- struct v9_mqd *m;
- struct kfd_dev *kfd = mm->dev;
+ struct kfd_mem_obj *mqd_mem_obj = NULL;
/* From V9, for CWSR, the control stack is located on the next page
* boundary after the mqd, we will use the gtt allocation function
* instead of sub-allocation function.
*/
if (kfd->cwsr_enabled && (q->type == KFD_QUEUE_TYPE_COMPUTE)) {
- *mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL);
- if (!*mqd_mem_obj)
- return -ENOMEM;
+ mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_NOIO);
+ if (!mqd_mem_obj)
+ return NULL;
retval = amdgpu_amdkfd_alloc_gtt_mem(kfd->kgd,
ALIGN(q->ctl_stack_size, PAGE_SIZE) +
ALIGN(sizeof(struct v9_mqd), PAGE_SIZE),
- &((*mqd_mem_obj)->gtt_mem),
- &((*mqd_mem_obj)->gpu_addr),
- (void *)&((*mqd_mem_obj)->cpu_ptr), true);
- } else
- retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct v9_mqd),
- mqd_mem_obj);
- if (retval != 0)
- return -ENOMEM;
-
- m = (struct v9_mqd *) (*mqd_mem_obj)->cpu_ptr;
- addr = (*mqd_mem_obj)->gpu_addr;
+ &(mqd_mem_obj->gtt_mem),
+ &(mqd_mem_obj->gpu_addr),
+ (void *)&(mqd_mem_obj->cpu_ptr), true);
+ } else {
+ retval = kfd_gtt_sa_allocate(kfd, sizeof(struct v9_mqd),
+ &mqd_mem_obj);
+ }
+
+ if (retval) {
+ kfree(mqd_mem_obj);
+ return NULL;
+ }
+
+ return mqd_mem_obj;
+
+}
+
+static void init_mqd(struct mqd_manager *mm, void **mqd,
+ struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
+ struct queue_properties *q)
+{
+ uint64_t addr;
+ struct v9_mqd *m;
+
+ m = (struct v9_mqd *) mqd_mem_obj->cpu_ptr;
+ addr = mqd_mem_obj->gpu_addr;
memset(m, 0, sizeof(struct v9_mqd));
@@ -120,9 +138,6 @@ static int init_mqd(struct mqd_manager *mm, void **mqd,
1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT |
10 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT;
- m->cp_hqd_pipe_priority = 1;
- m->cp_hqd_queue_priority = 15;
-
if (q->format == KFD_QUEUE_FORMAT_AQL) {
m->cp_hqd_aql_control =
1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT;
@@ -149,9 +164,7 @@ static int init_mqd(struct mqd_manager *mm, void **mqd,
*mqd = m;
if (gart_addr)
*gart_addr = addr;
- retval = mm->update_mqd(mm, m, q);
-
- return retval;
+ mm->update_mqd(mm, m, q);
}
static int load_mqd(struct mqd_manager *mm, void *mqd,
@@ -166,7 +179,7 @@ static int load_mqd(struct mqd_manager *mm, void *mqd,
wptr_shift, 0, mms);
}
-static int update_mqd(struct mqd_manager *mm, void *mqd,
+static void update_mqd(struct mqd_manager *mm, void *mqd,
struct queue_properties *q)
{
struct v9_mqd *m;
@@ -225,13 +238,9 @@ static int update_mqd(struct mqd_manager *mm, void *mqd,
m->cp_hqd_ctx_save_control = 0;
update_cu_mask(mm, mqd, q);
+ set_priority(m, q);
- q->is_active = (q->queue_size > 0 &&
- q->queue_address != 0 &&
- q->queue_percent > 0 &&
- !q->is_evicted);
-
- return 0;
+ q->is_active = QUEUE_IS_ACTIVE(*q);
}
@@ -245,7 +254,7 @@ static int destroy_mqd(struct mqd_manager *mm, void *mqd,
pipe_id, queue_id);
}
-static void uninit_mqd(struct mqd_manager *mm, void *mqd,
+static void free_mqd(struct mqd_manager *mm, void *mqd,
struct kfd_mem_obj *mqd_mem_obj)
{
struct kfd_dev *kfd = mm->dev;
@@ -289,71 +298,47 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd,
return 0;
}
-static int init_mqd_hiq(struct mqd_manager *mm, void **mqd,
- struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
+static void init_mqd_hiq(struct mqd_manager *mm, void **mqd,
+ struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
struct queue_properties *q)
{
struct v9_mqd *m;
- int retval = init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q);
- if (retval != 0)
- return retval;
+ init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q);
m = get_mqd(*mqd);
m->cp_hqd_pq_control |= 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT |
1 << CP_HQD_PQ_CONTROL__KMD_QUEUE__SHIFT;
-
- return retval;
}
-static int update_mqd_hiq(struct mqd_manager *mm, void *mqd,
+static void update_mqd_hiq(struct mqd_manager *mm, void *mqd,
struct queue_properties *q)
{
struct v9_mqd *m;
- int retval = update_mqd(mm, mqd, q);
- if (retval != 0)
- return retval;
+ update_mqd(mm, mqd, q);
/* TODO: what's the point? update_mqd already does this. */
m = get_mqd(mqd);
m->cp_hqd_vmid = q->vmid;
- return retval;
}
-static int init_mqd_sdma(struct mqd_manager *mm, void **mqd,
- struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
+static void init_mqd_sdma(struct mqd_manager *mm, void **mqd,
+ struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
struct queue_properties *q)
{
- int retval;
struct v9_sdma_mqd *m;
-
- retval = kfd_gtt_sa_allocate(mm->dev,
- sizeof(struct v9_sdma_mqd),
- mqd_mem_obj);
-
- if (retval != 0)
- return -ENOMEM;
-
- m = (struct v9_sdma_mqd *) (*mqd_mem_obj)->cpu_ptr;
+ m = (struct v9_sdma_mqd *) mqd_mem_obj->cpu_ptr;
memset(m, 0, sizeof(struct v9_sdma_mqd));
*mqd = m;
if (gart_addr)
- *gart_addr = (*mqd_mem_obj)->gpu_addr;
-
- retval = mm->update_mqd(mm, m, q);
+ *gart_addr = mqd_mem_obj->gpu_addr;
- return retval;
-}
-
-static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd,
- struct kfd_mem_obj *mqd_mem_obj)
-{
- kfd_gtt_sa_free(mm->dev, mqd_mem_obj);
+ mm->update_mqd(mm, m, q);
}
static int load_mqd_sdma(struct mqd_manager *mm, void *mqd,
@@ -367,7 +352,7 @@ static int load_mqd_sdma(struct mqd_manager *mm, void *mqd,
#define SDMA_RLC_DUMMY_DEFAULT 0xf
-static int update_mqd_sdma(struct mqd_manager *mm, void *mqd,
+static void update_mqd_sdma(struct mqd_manager *mm, void *mqd,
struct queue_properties *q)
{
struct v9_sdma_mqd *m;
@@ -390,12 +375,7 @@ static int update_mqd_sdma(struct mqd_manager *mm, void *mqd,
m->sdma_queue_id = q->sdma_queue_id;
m->sdmax_rlcx_dummy_reg = SDMA_RLC_DUMMY_DEFAULT;
- q->is_active = (q->queue_size > 0 &&
- q->queue_address != 0 &&
- q->queue_percent > 0 &&
- !q->is_evicted);
-
- return 0;
+ q->is_active = QUEUE_IS_ACTIVE(*q);
}
/*
@@ -452,35 +432,54 @@ struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type,
switch (type) {
case KFD_MQD_TYPE_CP:
case KFD_MQD_TYPE_COMPUTE:
+ mqd->allocate_mqd = allocate_mqd;
mqd->init_mqd = init_mqd;
- mqd->uninit_mqd = uninit_mqd;
+ mqd->free_mqd = free_mqd;
mqd->load_mqd = load_mqd;
mqd->update_mqd = update_mqd;
mqd->destroy_mqd = destroy_mqd;
mqd->is_occupied = is_occupied;
mqd->get_wave_state = get_wave_state;
+ mqd->mqd_size = sizeof(struct v9_mqd);
#if defined(CONFIG_DEBUG_FS)
mqd->debugfs_show_mqd = debugfs_show_mqd;
#endif
break;
case KFD_MQD_TYPE_HIQ:
+ mqd->allocate_mqd = allocate_hiq_mqd;
+ mqd->init_mqd = init_mqd_hiq;
+ mqd->free_mqd = free_mqd_hiq_sdma;
+ mqd->load_mqd = load_mqd;
+ mqd->update_mqd = update_mqd_hiq;
+ mqd->destroy_mqd = destroy_mqd;
+ mqd->is_occupied = is_occupied;
+ mqd->mqd_size = sizeof(struct v9_mqd);
+#if defined(CONFIG_DEBUG_FS)
+ mqd->debugfs_show_mqd = debugfs_show_mqd;
+#endif
+ break;
+ case KFD_MQD_TYPE_DIQ:
+ mqd->allocate_mqd = allocate_hiq_mqd;
mqd->init_mqd = init_mqd_hiq;
- mqd->uninit_mqd = uninit_mqd;
+ mqd->free_mqd = free_mqd;
mqd->load_mqd = load_mqd;
mqd->update_mqd = update_mqd_hiq;
mqd->destroy_mqd = destroy_mqd;
mqd->is_occupied = is_occupied;
+ mqd->mqd_size = sizeof(struct v9_mqd);
#if defined(CONFIG_DEBUG_FS)
mqd->debugfs_show_mqd = debugfs_show_mqd;
#endif
break;
case KFD_MQD_TYPE_SDMA:
+ mqd->allocate_mqd = allocate_sdma_mqd;
mqd->init_mqd = init_mqd_sdma;
- mqd->uninit_mqd = uninit_mqd_sdma;
+ mqd->free_mqd = free_mqd_hiq_sdma;
mqd->load_mqd = load_mqd_sdma;
mqd->update_mqd = update_mqd_sdma;
mqd->destroy_mqd = destroy_mqd_sdma;
mqd->is_occupied = is_occupied_sdma;
+ mqd->mqd_size = sizeof(struct v9_sdma_mqd);
#if defined(CONFIG_DEBUG_FS)
mqd->debugfs_show_mqd = debugfs_show_mqd_sdma;
#endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
index 6469b3456f00..7d144f56f421 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
@@ -31,6 +31,7 @@
#include "gca/gfx_8_0_sh_mask.h"
#include "gca/gfx_8_0_enum.h"
#include "oss/oss_3_0_sh_mask.h"
+
#define CP_MQD_CONTROL__PRIV_STATE__SHIFT 0x8
static inline struct vi_mqd *get_mqd(void *mqd)
@@ -68,21 +69,33 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd,
m->compute_static_thread_mgmt_se3);
}
-static int init_mqd(struct mqd_manager *mm, void **mqd,
- struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
+static void set_priority(struct vi_mqd *m, struct queue_properties *q)
+{
+ m->cp_hqd_pipe_priority = pipe_priority_map[q->priority];
+ m->cp_hqd_queue_priority = q->priority;
+}
+
+static struct kfd_mem_obj *allocate_mqd(struct kfd_dev *kfd,
+ struct queue_properties *q)
+{
+ struct kfd_mem_obj *mqd_mem_obj;
+
+ if (kfd_gtt_sa_allocate(kfd, sizeof(struct vi_mqd),
+ &mqd_mem_obj))
+ return NULL;
+
+ return mqd_mem_obj;
+}
+
+static void init_mqd(struct mqd_manager *mm, void **mqd,
+ struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
struct queue_properties *q)
{
- int retval;
uint64_t addr;
struct vi_mqd *m;
- retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct vi_mqd),
- mqd_mem_obj);
- if (retval != 0)
- return -ENOMEM;
-
- m = (struct vi_mqd *) (*mqd_mem_obj)->cpu_ptr;
- addr = (*mqd_mem_obj)->gpu_addr;
+ m = (struct vi_mqd *) mqd_mem_obj->cpu_ptr;
+ addr = mqd_mem_obj->gpu_addr;
memset(m, 0, sizeof(struct vi_mqd));
@@ -106,9 +119,7 @@ static int init_mqd(struct mqd_manager *mm, void **mqd,
1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT |
10 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT;
- m->cp_hqd_pipe_priority = 1;
- m->cp_hqd_queue_priority = 15;
-
+ set_priority(m, q);
m->cp_hqd_eop_rptr = 1 << CP_HQD_EOP_RPTR__INIT_FETCHER__SHIFT;
if (q->format == KFD_QUEUE_FORMAT_AQL)
@@ -139,9 +150,7 @@ static int init_mqd(struct mqd_manager *mm, void **mqd,
*mqd = m;
if (gart_addr)
*gart_addr = addr;
- retval = mm->update_mqd(mm, m, q);
-
- return retval;
+ mm->update_mqd(mm, m, q);
}
static int load_mqd(struct mqd_manager *mm, void *mqd,
@@ -157,7 +166,7 @@ static int load_mqd(struct mqd_manager *mm, void *mqd,
wptr_shift, wptr_mask, mms);
}
-static int __update_mqd(struct mqd_manager *mm, void *mqd,
+static void __update_mqd(struct mqd_manager *mm, void *mqd,
struct queue_properties *q, unsigned int mtype,
unsigned int atc_bit)
{
@@ -222,26 +231,22 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd,
mtype << CP_HQD_CTX_SAVE_CONTROL__MTYPE__SHIFT;
update_cu_mask(mm, mqd, q);
+ set_priority(m, q);
- q->is_active = (q->queue_size > 0 &&
- q->queue_address != 0 &&
- q->queue_percent > 0 &&
- !q->is_evicted);
-
- return 0;
+ q->is_active = QUEUE_IS_ACTIVE(*q);
}
-static int update_mqd(struct mqd_manager *mm, void *mqd,
+static void update_mqd(struct mqd_manager *mm, void *mqd,
struct queue_properties *q)
{
- return __update_mqd(mm, mqd, q, MTYPE_CC, 1);
+ __update_mqd(mm, mqd, q, MTYPE_CC, 1);
}
-static int update_mqd_tonga(struct mqd_manager *mm, void *mqd,
+static void update_mqd_tonga(struct mqd_manager *mm, void *mqd,
struct queue_properties *q)
{
- return __update_mqd(mm, mqd, q, MTYPE_UC, 0);
+ __update_mqd(mm, mqd, q, MTYPE_UC, 0);
}
static int destroy_mqd(struct mqd_manager *mm, void *mqd,
@@ -254,7 +259,7 @@ static int destroy_mqd(struct mqd_manager *mm, void *mqd,
pipe_id, queue_id);
}
-static void uninit_mqd(struct mqd_manager *mm, void *mqd,
+static void free_mqd(struct mqd_manager *mm, void *mqd,
struct kfd_mem_obj *mqd_mem_obj)
{
kfd_gtt_sa_free(mm->dev, mqd_mem_obj);
@@ -291,70 +296,44 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd,
return 0;
}
-static int init_mqd_hiq(struct mqd_manager *mm, void **mqd,
- struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
+static void init_mqd_hiq(struct mqd_manager *mm, void **mqd,
+ struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
struct queue_properties *q)
{
struct vi_mqd *m;
- int retval = init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q);
-
- if (retval != 0)
- return retval;
+ init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q);
m = get_mqd(*mqd);
m->cp_hqd_pq_control |= 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT |
1 << CP_HQD_PQ_CONTROL__KMD_QUEUE__SHIFT;
-
- return retval;
}
-static int update_mqd_hiq(struct mqd_manager *mm, void *mqd,
+static void update_mqd_hiq(struct mqd_manager *mm, void *mqd,
struct queue_properties *q)
{
struct vi_mqd *m;
- int retval = __update_mqd(mm, mqd, q, MTYPE_UC, 0);
-
- if (retval != 0)
- return retval;
+ __update_mqd(mm, mqd, q, MTYPE_UC, 0);
m = get_mqd(mqd);
m->cp_hqd_vmid = q->vmid;
- return retval;
}
-static int init_mqd_sdma(struct mqd_manager *mm, void **mqd,
- struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
+static void init_mqd_sdma(struct mqd_manager *mm, void **mqd,
+ struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
struct queue_properties *q)
{
- int retval;
struct vi_sdma_mqd *m;
-
- retval = kfd_gtt_sa_allocate(mm->dev,
- sizeof(struct vi_sdma_mqd),
- mqd_mem_obj);
-
- if (retval != 0)
- return -ENOMEM;
-
- m = (struct vi_sdma_mqd *) (*mqd_mem_obj)->cpu_ptr;
+ m = (struct vi_sdma_mqd *) mqd_mem_obj->cpu_ptr;
memset(m, 0, sizeof(struct vi_sdma_mqd));
*mqd = m;
- if (gart_addr != NULL)
- *gart_addr = (*mqd_mem_obj)->gpu_addr;
-
- retval = mm->update_mqd(mm, m, q);
-
- return retval;
-}
+ if (gart_addr)
+ *gart_addr = mqd_mem_obj->gpu_addr;
-static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd,
- struct kfd_mem_obj *mqd_mem_obj)
-{
- kfd_gtt_sa_free(mm->dev, mqd_mem_obj);
+ mm->update_mqd(mm, m, q);
}
static int load_mqd_sdma(struct mqd_manager *mm, void *mqd,
@@ -366,7 +345,7 @@ static int load_mqd_sdma(struct mqd_manager *mm, void *mqd,
mms);
}
-static int update_mqd_sdma(struct mqd_manager *mm, void *mqd,
+static void update_mqd_sdma(struct mqd_manager *mm, void *mqd,
struct queue_properties *q)
{
struct vi_sdma_mqd *m;
@@ -390,12 +369,7 @@ static int update_mqd_sdma(struct mqd_manager *mm, void *mqd,
m->sdma_engine_id = q->sdma_engine_id;
m->sdma_queue_id = q->sdma_queue_id;
- q->is_active = (q->queue_size > 0 &&
- q->queue_address != 0 &&
- q->queue_percent > 0 &&
- !q->is_evicted);
-
- return 0;
+ q->is_active = QUEUE_IS_ACTIVE(*q);
}
/*
@@ -452,35 +426,54 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type,
switch (type) {
case KFD_MQD_TYPE_CP:
case KFD_MQD_TYPE_COMPUTE:
+ mqd->allocate_mqd = allocate_mqd;
mqd->init_mqd = init_mqd;
- mqd->uninit_mqd = uninit_mqd;
+ mqd->free_mqd = free_mqd;
mqd->load_mqd = load_mqd;
mqd->update_mqd = update_mqd;
mqd->destroy_mqd = destroy_mqd;
mqd->is_occupied = is_occupied;
mqd->get_wave_state = get_wave_state;
+ mqd->mqd_size = sizeof(struct vi_mqd);
#if defined(CONFIG_DEBUG_FS)
mqd->debugfs_show_mqd = debugfs_show_mqd;
#endif
break;
case KFD_MQD_TYPE_HIQ:
+ mqd->allocate_mqd = allocate_hiq_mqd;
+ mqd->init_mqd = init_mqd_hiq;
+ mqd->free_mqd = free_mqd_hiq_sdma;
+ mqd->load_mqd = load_mqd;
+ mqd->update_mqd = update_mqd_hiq;
+ mqd->destroy_mqd = destroy_mqd;
+ mqd->is_occupied = is_occupied;
+ mqd->mqd_size = sizeof(struct vi_mqd);
+#if defined(CONFIG_DEBUG_FS)
+ mqd->debugfs_show_mqd = debugfs_show_mqd;
+#endif
+ break;
+ case KFD_MQD_TYPE_DIQ:
+ mqd->allocate_mqd = allocate_hiq_mqd;
mqd->init_mqd = init_mqd_hiq;
- mqd->uninit_mqd = uninit_mqd;
+ mqd->free_mqd = free_mqd;
mqd->load_mqd = load_mqd;
mqd->update_mqd = update_mqd_hiq;
mqd->destroy_mqd = destroy_mqd;
mqd->is_occupied = is_occupied;
+ mqd->mqd_size = sizeof(struct vi_mqd);
#if defined(CONFIG_DEBUG_FS)
mqd->debugfs_show_mqd = debugfs_show_mqd;
#endif
break;
case KFD_MQD_TYPE_SDMA:
+ mqd->allocate_mqd = allocate_sdma_mqd;
mqd->init_mqd = init_mqd_sdma;
- mqd->uninit_mqd = uninit_mqd_sdma;
+ mqd->free_mqd = free_mqd_hiq_sdma;
mqd->load_mqd = load_mqd_sdma;
mqd->update_mqd = update_mqd_sdma;
mqd->destroy_mqd = destroy_mqd_sdma;
mqd->is_occupied = is_occupied_sdma;
+ mqd->mqd_size = sizeof(struct vi_sdma_mqd);
#if defined(CONFIG_DEBUG_FS)
mqd->debugfs_show_mqd = debugfs_show_mqd_sdma;
#endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
index 045a229436a0..ccf6b2310316 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
@@ -48,7 +48,8 @@ static void pm_calc_rlib_size(struct packet_manager *pm,
process_count = pm->dqm->processes_count;
queue_count = pm->dqm->queue_count;
- compute_queue_count = queue_count - pm->dqm->sdma_queue_count;
+ compute_queue_count = queue_count - pm->dqm->sdma_queue_count -
+ pm->dqm->xgmi_sdma_queue_count;
/* check if there is over subscription
* Note: the arbitration between the number of VMIDs and
@@ -202,11 +203,15 @@ static int pm_create_runlist_ib(struct packet_manager *pm,
pr_debug("Finished map process and queues to runlist\n");
- if (is_over_subscription)
+ if (is_over_subscription) {
+ if (!pm->is_over_subscription)
+ pr_warn("Runlist is getting oversubscribed. Expect reduced ROCm performance.\n");
retval = pm->pmf->runlist(pm, &rl_buffer[rl_wptr],
*rl_gpu_addr,
alloc_size_bytes / sizeof(uint32_t),
true);
+ }
+ pm->is_over_subscription = is_over_subscription;
for (i = 0; i < alloc_size_bytes / sizeof(uint32_t); i++)
pr_debug("0x%2X ", rl_buffer[i]);
@@ -227,6 +232,7 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm)
case CHIP_POLARIS10:
case CHIP_POLARIS11:
case CHIP_POLARIS12:
+ case CHIP_VEGAM:
pm->pmf = &kfd_vi_pm_funcs;
break;
case CHIP_VEGA10:
@@ -235,6 +241,9 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm)
case CHIP_RAVEN:
pm->pmf = &kfd_v9_pm_funcs;
break;
+ case CHIP_NAVI10:
+ pm->pmf = &kfd_v10_pm_funcs;
+ break;
default:
WARN(1, "Unexpected ASIC family %u",
dqm->dev->device_info->asic_family);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h
index f2bcf5c092ea..e3e21404cfa0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h
@@ -120,7 +120,7 @@ struct pm4_mes_runlist {
uint32_t ib_size:20;
uint32_t chain:1;
uint32_t offload_polling:1;
- uint32_t reserved2:1;
+ uint32_t chained_runlist_idle_disable:1;
uint32_t valid:1;
uint32_t process_cnt:4;
uint32_t reserved3:4;
@@ -176,8 +176,7 @@ struct pm4_mes_map_process {
union {
struct {
- uint32_t num_gws:6;
- uint32_t reserved7:1;
+ uint32_t num_gws:7;
uint32_t sdma_enable:1;
uint32_t num_oac:4;
uint32_t reserved8:4;
@@ -255,11 +254,6 @@ enum mes_map_queues_queue_type_enum {
queue_type__mes_map_queues__low_latency_static_queue_vi = 3
};
-enum mes_map_queues_alloc_format_enum {
- alloc_format__mes_map_queues__one_per_pipe_vi = 0,
-alloc_format__mes_map_queues__all_on_one_pipe_vi = 1
-};
-
enum mes_map_queues_engine_sel_enum {
engine_sel__mes_map_queues__compute_vi = 0,
engine_sel__mes_map_queues__sdma0_vi = 2,
@@ -277,9 +271,11 @@ struct pm4_mes_map_queues {
struct {
uint32_t reserved1:4;
enum mes_map_queues_queue_sel_enum queue_sel:2;
- uint32_t reserved2:15;
+ uint32_t reserved5:6;
+ uint32_t gws_control_queue:1;
+ uint32_t reserved2:8;
enum mes_map_queues_queue_type_enum queue_type:3;
- enum mes_map_queues_alloc_format_enum alloc_format:2;
+ uint32_t reserved3:2;
enum mes_map_queues_engine_sel_enum engine_sel:3;
uint32_t num_queues:3;
} bitfields2;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h
index 7c8d9b357749..5466cfe1c3cc 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h
@@ -216,11 +216,6 @@ enum mes_map_queues_queue_type_vi_enum {
queue_type__mes_map_queues__low_latency_static_queue_vi = 3
};
-enum mes_map_queues_alloc_format_vi_enum {
- alloc_format__mes_map_queues__one_per_pipe_vi = 0,
-alloc_format__mes_map_queues__all_on_one_pipe_vi = 1
-};
-
enum mes_map_queues_engine_sel_vi_enum {
engine_sel__mes_map_queues__compute_vi = 0,
engine_sel__mes_map_queues__sdma0_vi = 2,
@@ -240,7 +235,7 @@ struct pm4_mes_map_queues {
enum mes_map_queues_queue_sel_vi_enum queue_sel:2;
uint32_t reserved2:15;
enum mes_map_queues_queue_type_vi_enum queue_type:3;
- enum mes_map_queues_alloc_format_vi_enum alloc_format:2;
+ uint32_t reserved3:2;
enum mes_map_queues_engine_sel_vi_enum engine_sel:3;
uint32_t num_queues:3;
} bitfields2;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 487d5da337c1..08a0feb9d0a0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -35,6 +35,7 @@
#include <linux/kfifo.h>
#include <linux/seq_file.h>
#include <linux/kref.h>
+#include <linux/sysfs.h>
#include <kgd_kfd_interface.h>
#include "amd_shared.h"
@@ -59,6 +60,7 @@
#define KFD_MMAP_TYPE_DOORBELL (0x3ULL << KFD_MMAP_TYPE_SHIFT)
#define KFD_MMAP_TYPE_EVENTS (0x2ULL << KFD_MMAP_TYPE_SHIFT)
#define KFD_MMAP_TYPE_RESERVED_MEM (0x1ULL << KFD_MMAP_TYPE_SHIFT)
+#define KFD_MMAP_TYPE_MMIO (0x0ULL << KFD_MMAP_TYPE_SHIFT)
#define KFD_MMAP_GPU_ID_SHIFT (46 - PAGE_SHIFT)
#define KFD_MMAP_GPU_ID_MASK (((1ULL << KFD_GPU_ID_HASH_WIDTH) - 1) \
@@ -103,6 +105,8 @@
#define KFD_KERNEL_QUEUE_SIZE 2048
+#define KFD_UNMAP_LATENCY_MS (4000)
+
/*
* 512 = 0x200
* The doorbell index distance between SDMA RLC (2*i) and (2*i+1) in the
@@ -160,11 +164,25 @@ extern int noretry;
*/
extern int halt_if_hws_hang;
+/*
+ * Whether MEC FW support GWS barriers
+ */
+extern bool hws_gws_support;
+
+/*
+ * Queue preemption timeout in ms
+ */
+extern int queue_preemption_timeout_ms;
+
enum cache_policy {
cache_policy_coherent,
cache_policy_noncoherent
};
+#define KFD_IS_VI(chip) ((chip) >= CHIP_CARRIZO && (chip) <= CHIP_POLARIS11)
+#define KFD_IS_DGPU(chip) (((chip) >= CHIP_TONGA && \
+ (chip) <= CHIP_NAVI10) || \
+ (chip) == CHIP_HAWAII)
#define KFD_IS_SOC15(chip) ((chip) >= CHIP_VEGA10)
struct kfd_event_interrupt_class {
@@ -188,6 +206,7 @@ struct kfd_device_info {
bool needs_iommu_device;
bool needs_pci_atomics;
unsigned int num_sdma_engines;
+ unsigned int num_xgmi_sdma_engines;
unsigned int num_sdma_queues_per_engine;
};
@@ -258,7 +277,7 @@ struct kfd_dev {
bool interrupts_active;
/* Debug manager */
- struct kfd_dbgmgr *dbgmgr;
+ struct kfd_dbgmgr *dbgmgr;
/* Firmware versions */
uint16_t mec_fw_version;
@@ -282,6 +301,9 @@ struct kfd_dev {
/* Compute Profile ref. count */
atomic_t compute_profile;
+
+ /* Global GWS resource shared b/t processes*/
+ void *gws;
};
enum kfd_mempool {
@@ -329,7 +351,8 @@ enum kfd_queue_type {
KFD_QUEUE_TYPE_COMPUTE,
KFD_QUEUE_TYPE_SDMA,
KFD_QUEUE_TYPE_HIQ,
- KFD_QUEUE_TYPE_DIQ
+ KFD_QUEUE_TYPE_DIQ,
+ KFD_QUEUE_TYPE_SDMA_XGMI
};
enum kfd_queue_format {
@@ -337,6 +360,11 @@ enum kfd_queue_format {
KFD_QUEUE_FORMAT_AQL
};
+enum KFD_QUEUE_PRIORITY {
+ KFD_QUEUE_PRIORITY_MINIMUM = 0,
+ KFD_QUEUE_PRIORITY_MAXIMUM = 15
+};
+
/**
* struct queue_properties
*
@@ -419,6 +447,11 @@ struct queue_properties {
uint32_t *cu_mask;
};
+#define QUEUE_IS_ACTIVE(q) ((q).queue_size > 0 && \
+ (q).queue_address != 0 && \
+ (q).queue_percent > 0 && \
+ !(q).is_evicted)
+
/**
* struct queue
*
@@ -444,6 +477,9 @@ struct queue_properties {
*
* @device: The kfd device that created this queue.
*
+ * @gws: Pointing to gws kgd_mem if this is a gws control queue; NULL
+ * otherwise.
+ *
* This structure represents user mode compute queues.
* It contains all the necessary data to handle such queues.
*
@@ -465,6 +501,7 @@ struct queue {
struct kfd_process *process;
struct kfd_dev *device;
+ void *gws;
};
/*
@@ -475,9 +512,16 @@ enum KFD_MQD_TYPE {
KFD_MQD_TYPE_HIQ, /* for hiq */
KFD_MQD_TYPE_CP, /* for cp queues and diq */
KFD_MQD_TYPE_SDMA, /* for sdma queues */
+ KFD_MQD_TYPE_DIQ, /* for diq */
KFD_MQD_TYPE_MAX
};
+enum KFD_PIPE_PRIORITY {
+ KFD_PIPE_PRIORITY_CS_LOW = 0,
+ KFD_PIPE_PRIORITY_CS_MEDIUM,
+ KFD_PIPE_PRIORITY_CS_HIGH
+};
+
struct scheduling_resources {
unsigned int vmid_mask;
enum kfd_queue_type type;
@@ -686,6 +730,10 @@ struct kfd_process {
* restored after an eviction
*/
unsigned long last_restore_timestamp;
+
+ /* Kobj for our procfs */
+ struct kobject *kobj;
+ struct attribute attr_pasid;
};
#define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */
@@ -788,6 +836,10 @@ int kfd_gtt_sa_free(struct kfd_dev *kfd, struct kfd_mem_obj *mem_obj);
extern struct device *kfd_device;
+/* KFD's procfs */
+void kfd_procfs_init(void);
+void kfd_procfs_shutdown(void);
+
/* Topology */
int kfd_topology_init(void);
void kfd_topology_shutdown(void);
@@ -819,8 +871,6 @@ void uninit_queue(struct queue *q);
void print_queue_properties(struct queue_properties *q);
void print_queue(struct queue *q);
-struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type,
- struct kfd_dev *dev);
struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type,
struct kfd_dev *dev);
struct mqd_manager *mqd_manager_init_cik_hawaii(enum KFD_MQD_TYPE type,
@@ -831,6 +881,8 @@ struct mqd_manager *mqd_manager_init_vi_tonga(enum KFD_MQD_TYPE type,
struct kfd_dev *dev);
struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type,
struct kfd_dev *dev);
+struct mqd_manager *mqd_manager_init_v10(enum KFD_MQD_TYPE type,
+ struct kfd_dev *dev);
struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev);
void device_queue_manager_uninit(struct device_queue_manager *dqm);
struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
@@ -859,6 +911,8 @@ int pqm_update_queue(struct process_queue_manager *pqm, unsigned int qid,
struct queue_properties *p);
int pqm_set_cu_mask(struct process_queue_manager *pqm, unsigned int qid,
struct queue_properties *p);
+int pqm_set_gws(struct process_queue_manager *pqm, unsigned int qid,
+ void *gws);
struct kernel_queue *pqm_get_kernel_queue(struct process_queue_manager *pqm,
unsigned int qid);
int pqm_get_wave_state(struct process_queue_manager *pqm,
@@ -868,8 +922,8 @@ int pqm_get_wave_state(struct process_queue_manager *pqm,
u32 *save_area_used_size);
int amdkfd_fence_wait_timeout(unsigned int *fence_addr,
- unsigned int fence_value,
- unsigned int timeout_ms);
+ unsigned int fence_value,
+ unsigned int timeout_ms);
/* Packet Manager */
@@ -883,6 +937,7 @@ struct packet_manager {
bool allocated;
struct kfd_mem_obj *ib_buffer_obj;
unsigned int ib_size_bytes;
+ bool is_over_subscription;
const struct packet_manager_funcs *pmf;
};
@@ -918,6 +973,7 @@ struct packet_manager_funcs {
extern const struct packet_manager_funcs kfd_vi_pm_funcs;
extern const struct packet_manager_funcs kfd_v9_pm_funcs;
+extern const struct packet_manager_funcs kfd_v10_pm_funcs;
int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm);
void pm_uninit(struct packet_manager *pm);
@@ -937,7 +993,8 @@ void pm_release_ib(struct packet_manager *pm);
/* Following PM funcs can be shared among VI and AI */
unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size);
int pm_set_resources_vi(struct packet_manager *pm, uint32_t *buffer,
- struct scheduling_resources *res);
+ struct scheduling_resources *res);
+
uint64_t kfd_get_number_elems(struct kfd_dev *kfd);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 4bdae78bab8e..8f1076c0c88a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -68,6 +68,68 @@ static struct kfd_process *create_process(const struct task_struct *thread,
static void evict_process_worker(struct work_struct *work);
static void restore_process_worker(struct work_struct *work);
+struct kfd_procfs_tree {
+ struct kobject *kobj;
+};
+
+static struct kfd_procfs_tree procfs;
+
+static ssize_t kfd_procfs_show(struct kobject *kobj, struct attribute *attr,
+ char *buffer)
+{
+ int val = 0;
+
+ if (strcmp(attr->name, "pasid") == 0) {
+ struct kfd_process *p = container_of(attr, struct kfd_process,
+ attr_pasid);
+ val = p->pasid;
+ } else {
+ pr_err("Invalid attribute");
+ return -EINVAL;
+ }
+
+ return snprintf(buffer, PAGE_SIZE, "%d\n", val);
+}
+
+static void kfd_procfs_kobj_release(struct kobject *kobj)
+{
+ kfree(kobj);
+}
+
+static const struct sysfs_ops kfd_procfs_ops = {
+ .show = kfd_procfs_show,
+};
+
+static struct kobj_type procfs_type = {
+ .release = kfd_procfs_kobj_release,
+ .sysfs_ops = &kfd_procfs_ops,
+};
+
+void kfd_procfs_init(void)
+{
+ int ret = 0;
+
+ procfs.kobj = kfd_alloc_struct(procfs.kobj);
+ if (!procfs.kobj)
+ return;
+
+ ret = kobject_init_and_add(procfs.kobj, &procfs_type,
+ &kfd_device->kobj, "proc");
+ if (ret) {
+ pr_warn("Could not create procfs proc folder");
+ /* If we fail to create the procfs, clean up */
+ kfd_procfs_shutdown();
+ }
+}
+
+void kfd_procfs_shutdown(void)
+{
+ if (procfs.kobj) {
+ kobject_del(procfs.kobj);
+ kobject_put(procfs.kobj);
+ procfs.kobj = NULL;
+ }
+}
int kfd_process_create_wq(void)
{
@@ -206,6 +268,7 @@ struct kfd_process *kfd_create_process(struct file *filep)
{
struct kfd_process *process;
struct task_struct *thread = current;
+ int ret;
if (!thread->mm)
return ERR_PTR(-EINVAL);
@@ -223,11 +286,36 @@ struct kfd_process *kfd_create_process(struct file *filep)
/* A prior open of /dev/kfd could have already created the process. */
process = find_process(thread);
- if (process)
+ if (process) {
pr_debug("Process already found\n");
- else
+ } else {
process = create_process(thread, filep);
+ if (!procfs.kobj)
+ goto out;
+
+ process->kobj = kfd_alloc_struct(process->kobj);
+ if (!process->kobj) {
+ pr_warn("Creating procfs kobject failed");
+ goto out;
+ }
+ ret = kobject_init_and_add(process->kobj, &procfs_type,
+ procfs.kobj, "%d",
+ (int)process->lead_thread->pid);
+ if (ret) {
+ pr_warn("Creating procfs pid directory failed");
+ goto out;
+ }
+
+ process->attr_pasid.name = "pasid";
+ process->attr_pasid.mode = KFD_SYSFS_FILE_MODE;
+ sysfs_attr_init(&process->attr_pasid);
+ ret = sysfs_create_file(process->kobj, &process->attr_pasid);
+ if (ret)
+ pr_warn("Creating pasid for pid %d failed",
+ (int)process->lead_thread->pid);
+ }
+out:
mutex_unlock(&kfd_processes_mutex);
return process;
@@ -355,6 +443,14 @@ static void kfd_process_wq_release(struct work_struct *work)
struct kfd_process *p = container_of(work, struct kfd_process,
release_work);
+ /* Remove the procfs files */
+ if (p->kobj) {
+ sysfs_remove_file(p->kobj, &p->attr_pasid);
+ kobject_del(p->kobj);
+ kobject_put(p->kobj);
+ p->kobj = NULL;
+ }
+
kfd_iommu_unbind_process(p);
kfd_process_free_outstanding_kfd_bos(p);
@@ -1107,3 +1203,4 @@ int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data)
}
#endif
+
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index fcaaf93681ac..da0958625861 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -26,6 +26,7 @@
#include "kfd_device_queue_manager.h"
#include "kfd_priv.h"
#include "kfd_kernel_queue.h"
+#include "amdgpu_amdkfd.h"
static inline struct process_queue_node *get_queue_by_qid(
struct process_queue_manager *pqm, unsigned int qid)
@@ -74,6 +75,55 @@ void kfd_process_dequeue_from_device(struct kfd_process_device *pdd)
pdd->already_dequeued = true;
}
+int pqm_set_gws(struct process_queue_manager *pqm, unsigned int qid,
+ void *gws)
+{
+ struct kfd_dev *dev = NULL;
+ struct process_queue_node *pqn;
+ struct kfd_process_device *pdd;
+ struct kgd_mem *mem = NULL;
+ int ret;
+
+ pqn = get_queue_by_qid(pqm, qid);
+ if (!pqn) {
+ pr_err("Queue id does not match any known queue\n");
+ return -EINVAL;
+ }
+
+ if (pqn->q)
+ dev = pqn->q->device;
+ if (WARN_ON(!dev))
+ return -ENODEV;
+
+ pdd = kfd_get_process_device_data(dev, pqm->process);
+ if (!pdd) {
+ pr_err("Process device data doesn't exist\n");
+ return -EINVAL;
+ }
+
+ /* Only allow one queue per process can have GWS assigned */
+ if (gws && pdd->qpd.num_gws)
+ return -EBUSY;
+
+ if (!gws && pdd->qpd.num_gws == 0)
+ return -EINVAL;
+
+ if (gws)
+ ret = amdgpu_amdkfd_add_gws_to_process(pdd->process->kgd_process_info,
+ gws, &mem);
+ else
+ ret = amdgpu_amdkfd_remove_gws_from_process(pdd->process->kgd_process_info,
+ pqn->q->gws);
+ if (unlikely(ret))
+ return ret;
+
+ pqn->q->gws = mem;
+ pdd->qpd.num_gws = gws ? amdgpu_amdkfd_get_num_gws(dev->kgd) : 0;
+
+ return pqn->q->device->dqm->ops.update_queue(pqn->q->device->dqm,
+ pqn->q);
+}
+
void kfd_process_dequeue_from_all_devices(struct kfd_process *p)
{
struct kfd_process_device *pdd;
@@ -186,8 +236,13 @@ int pqm_create_queue(struct process_queue_manager *pqm,
switch (type) {
case KFD_QUEUE_TYPE_SDMA:
- if (dev->dqm->queue_count >= get_num_sdma_queues(dev->dqm)) {
- pr_err("Over-subscription is not allowed for SDMA.\n");
+ case KFD_QUEUE_TYPE_SDMA_XGMI:
+ if ((type == KFD_QUEUE_TYPE_SDMA && dev->dqm->sdma_queue_count
+ >= get_num_sdma_queues(dev->dqm)) ||
+ (type == KFD_QUEUE_TYPE_SDMA_XGMI &&
+ dev->dqm->xgmi_sdma_queue_count
+ >= get_num_xgmi_sdma_queues(dev->dqm))) {
+ pr_debug("Over-subscription is not allowed for SDMA.\n");
retval = -EPERM;
goto err_create_queue;
}
@@ -325,6 +380,13 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
if (retval != -ETIME)
goto err_destroy_queue;
}
+
+ if (pqn->q->gws) {
+ amdgpu_amdkfd_remove_gws_from_process(pqm->process->kgd_process_info,
+ pqn->q->gws);
+ pdd->qpd.num_gws = 0;
+ }
+
kfree(pqn->q->properties.cu_mask);
pqn->q->properties.cu_mask = NULL;
uninit_queue(pqn->q);
@@ -446,6 +508,7 @@ int pqm_debugfs_mqds(struct seq_file *m, void *data)
q = pqn->q;
switch (q->properties.type) {
case KFD_QUEUE_TYPE_SDMA:
+ case KFD_QUEUE_TYPE_SDMA_XGMI:
seq_printf(m, " SDMA queue on device %x\n",
q->device->id);
mqd_type = KFD_MQD_TYPE_SDMA;
@@ -461,8 +524,7 @@ int pqm_debugfs_mqds(struct seq_file *m, void *data)
q->properties.type, q->device->id);
continue;
}
- mqd_mgr = q->device->dqm->ops.get_mqd_manager(
- q->device->dqm, mqd_type);
+ mqd_mgr = q->device->dqm->mqd_mgrs[mqd_type];
} else if (pqn->kq) {
q = pqn->kq->queue;
mqd_mgr = pqn->kq->mqd_mgr;
@@ -470,7 +532,6 @@ int pqm_debugfs_mqds(struct seq_file *m, void *data)
case KFD_QUEUE_TYPE_DIQ:
seq_printf(m, " DIQ on device %x\n",
pqn->kq->dev->id);
- mqd_type = KFD_MQD_TYPE_HIQ;
break;
default:
seq_printf(m,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index 769dbc7be8cb..c2e6e47abaf2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -454,6 +454,8 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr,
dev->node_props.lds_size_in_kb);
sysfs_show_32bit_prop(buffer, "gds_size_in_kb",
dev->node_props.gds_size_in_kb);
+ sysfs_show_32bit_prop(buffer, "num_gws",
+ dev->node_props.num_gws);
sysfs_show_32bit_prop(buffer, "wave_front_size",
dev->node_props.wave_front_size);
sysfs_show_32bit_prop(buffer, "array_count",
@@ -476,6 +478,10 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr,
dev->node_props.drm_render_minor);
sysfs_show_64bit_prop(buffer, "hive_id",
dev->node_props.hive_id);
+ sysfs_show_32bit_prop(buffer, "num_sdma_engines",
+ dev->node_props.num_sdma_engines);
+ sysfs_show_32bit_prop(buffer, "num_sdma_xgmi_engines",
+ dev->node_props.num_sdma_xgmi_engines);
if (dev->gpu) {
log_max_watch_addr =
@@ -1078,8 +1084,9 @@ static uint32_t kfd_generate_gpu_id(struct kfd_dev *gpu)
local_mem_info.local_mem_size_public;
buf[0] = gpu->pdev->devfn;
- buf[1] = gpu->pdev->subsystem_vendor;
- buf[2] = gpu->pdev->subsystem_device;
+ buf[1] = gpu->pdev->subsystem_vendor |
+ (gpu->pdev->subsystem_device << 16);
+ buf[2] = pci_domain_nr(gpu->pdev->bus);
buf[3] = gpu->pdev->device;
buf[4] = gpu->pdev->bus->number;
buf[5] = lower_32_bits(local_mem_size);
@@ -1281,6 +1288,12 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
gpu->shared_resources.drm_render_minor;
dev->node_props.hive_id = gpu->hive_id;
+ dev->node_props.num_sdma_engines = gpu->device_info->num_sdma_engines;
+ dev->node_props.num_sdma_xgmi_engines =
+ gpu->device_info->num_xgmi_sdma_engines;
+ dev->node_props.num_gws = (hws_gws_support &&
+ dev->gpu->dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS) ?
+ amdgpu_amdkfd_get_num_gws(dev->gpu->kgd) : 0;
kfd_fill_mem_clk_max_info(dev);
kfd_fill_iolink_non_crat_info(dev);
@@ -1298,6 +1311,7 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
case CHIP_POLARIS10:
case CHIP_POLARIS11:
case CHIP_POLARIS12:
+ case CHIP_VEGAM:
pr_debug("Adding doorbell packet type capability\n");
dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_1_0 <<
HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) &
@@ -1307,6 +1321,7 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
case CHIP_VEGA12:
case CHIP_VEGA20:
case CHIP_RAVEN:
+ case CHIP_NAVI10:
dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_2_0 <<
HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) &
HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK);
@@ -1316,17 +1331,24 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
dev->gpu->device_info->asic_family);
}
+ /*
+ * Overwrite ATS capability according to needs_iommu_device to fix
+ * potential missing corresponding bit in CRAT of BIOS.
+ */
+ if (dev->gpu->device_info->needs_iommu_device)
+ dev->node_props.capability |= HSA_CAP_ATS_PRESENT;
+ else
+ dev->node_props.capability &= ~HSA_CAP_ATS_PRESENT;
+
/* Fix errors in CZ CRAT.
* simd_count: Carrizo CRAT reports wrong simd_count, probably
* because it doesn't consider masked out CUs
* max_waves_per_simd: Carrizo reports wrong max_waves_per_simd
- * capability flag: Carrizo CRAT doesn't report IOMMU flags
*/
if (dev->gpu->device_info->asic_family == CHIP_CARRIZO) {
dev->node_props.simd_count =
cu_info.simd_per_cu * cu_info.cu_active_number;
dev->node_props.max_waves_per_simd = 10;
- dev->node_props.capability |= HSA_CAP_ATS_PRESENT;
}
ctx = amdgpu_ras_get_context((struct amdgpu_device *)(dev->gpu->kgd));
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
index 84710cfd23c2..276354aa0fcc 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
@@ -65,6 +65,7 @@ struct kfd_node_properties {
uint32_t max_waves_per_simd;
uint32_t lds_size_in_kb;
uint32_t gds_size_in_kb;
+ uint32_t num_gws;
uint32_t wave_front_size;
uint32_t array_count;
uint32_t simd_arrays_per_engine;
@@ -78,6 +79,8 @@ struct kfd_node_properties {
uint32_t max_engine_clk_fcompute;
uint32_t max_engine_clk_ccompute;
int32_t drm_render_minor;
+ uint32_t num_sdma_engines;
+ uint32_t num_sdma_xgmi_engines;
uint16_t marketing_name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE];
};