diff options
author | Lizhi Hou <lizhi.hou@amd.com> | 2024-11-18 09:29:41 -0800 |
---|---|---|
committer | Jeffrey Hugo <quic_jhugo@quicinc.com> | 2024-11-22 11:44:47 -0700 |
commit | 4fd4ca984b833a41f36bf7b2eaa9025377e310d0 (patch) | |
tree | 8cc115047f720efcc39aa601a8493179201727b8 /drivers/accel/amdxdna/aie2_pci.c | |
parent | bed4c73e59e8e32a3dd68a5ea755601ab000bf7b (diff) |
accel/amdxdna: Add error handling
When there is a hardware error, the NPU firmware notifies the host through
a mailbox message. The message includes details of the error, such as the
tile and column indexes where the error occurred.
The driver starts a thread to handle the NPU error message. The thread
stops the clients which are using the column where error occurred. Then
the driver resets that column.
Co-developed-by: Min Ma <min.ma@amd.com>
Signed-off-by: Min Ma <min.ma@amd.com>
Reviewed-by: Jeffrey Hugo <quic_jhugo@quicinc.com>
Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
Signed-off-by: Jeffrey Hugo <quic_jhugo@quicinc.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20241118172942.2014541-10-lizhi.hou@amd.com
Diffstat (limited to 'drivers/accel/amdxdna/aie2_pci.c')
-rw-r--r-- | drivers/accel/amdxdna/aie2_pci.c | 32 |
1 files changed, 32 insertions, 0 deletions
diff --git a/drivers/accel/amdxdna/aie2_pci.c b/drivers/accel/amdxdna/aie2_pci.c index 803ebb237971..5467aabe7308 100644 --- a/drivers/accel/amdxdna/aie2_pci.c +++ b/drivers/accel/amdxdna/aie2_pci.c @@ -180,6 +180,15 @@ static int aie2_mgmt_fw_init(struct amdxdna_dev_hdl *ndev) return ret; } + if (!ndev->async_events) + return 0; + + ret = aie2_error_async_events_send(ndev); + if (ret) { + XDNA_ERR(ndev->xdna, "Send async events failed"); + return ret; + } + return 0; } @@ -469,9 +478,30 @@ static int aie2_init(struct amdxdna_dev *xdna) goto stop_hw; } + ret = aie2_error_async_events_alloc(ndev); + if (ret) { + XDNA_ERR(xdna, "Allocate async events failed, ret %d", ret); + goto stop_hw; + } + + ret = aie2_error_async_events_send(ndev); + if (ret) { + XDNA_ERR(xdna, "Send async events failed, ret %d", ret); + goto async_event_free; + } + + /* Issue a command to make sure firmware handled async events */ + ret = aie2_query_firmware_version(ndev, &ndev->xdna->fw_ver); + if (ret) { + XDNA_ERR(xdna, "Re-query firmware version failed"); + goto async_event_free; + } + release_firmware(fw); return 0; +async_event_free: + aie2_error_async_events_free(ndev); stop_hw: aie2_hw_stop(xdna); disable_sva: @@ -487,8 +517,10 @@ release_fw: static void aie2_fini(struct amdxdna_dev *xdna) { struct pci_dev *pdev = to_pci_dev(xdna->ddev.dev); + struct amdxdna_dev_hdl *ndev = xdna->dev_handle; aie2_hw_stop(xdna); + aie2_error_async_events_free(ndev); iommu_dev_disable_feature(&pdev->dev, IOMMU_DEV_FEAT_SVA); pci_free_irq_vectors(pdev); } |