summaryrefslogtreecommitdiff
path: root/drivers/accel/amdxdna/aie2_pci.c
diff options
context:
space:
mode:
authorLizhi Hou <lizhi.hou@amd.com>2024-11-18 09:29:41 -0800
committerJeffrey Hugo <quic_jhugo@quicinc.com>2024-11-22 11:44:47 -0700
commit4fd4ca984b833a41f36bf7b2eaa9025377e310d0 (patch)
tree8cc115047f720efcc39aa601a8493179201727b8 /drivers/accel/amdxdna/aie2_pci.c
parentbed4c73e59e8e32a3dd68a5ea755601ab000bf7b (diff)
accel/amdxdna: Add error handling
When there is a hardware error, the NPU firmware notifies the host through a mailbox message. The message includes details of the error, such as the tile and column indexes where the error occurred. The driver starts a thread to handle the NPU error message. The thread stops the clients which are using the column where error occurred. Then the driver resets that column. Co-developed-by: Min Ma <min.ma@amd.com> Signed-off-by: Min Ma <min.ma@amd.com> Reviewed-by: Jeffrey Hugo <quic_jhugo@quicinc.com> Signed-off-by: Lizhi Hou <lizhi.hou@amd.com> Signed-off-by: Jeffrey Hugo <quic_jhugo@quicinc.com> Link: https://patchwork.freedesktop.org/patch/msgid/20241118172942.2014541-10-lizhi.hou@amd.com
Diffstat (limited to 'drivers/accel/amdxdna/aie2_pci.c')
-rw-r--r--drivers/accel/amdxdna/aie2_pci.c32
1 files changed, 32 insertions, 0 deletions
diff --git a/drivers/accel/amdxdna/aie2_pci.c b/drivers/accel/amdxdna/aie2_pci.c
index 803ebb237971..5467aabe7308 100644
--- a/drivers/accel/amdxdna/aie2_pci.c
+++ b/drivers/accel/amdxdna/aie2_pci.c
@@ -180,6 +180,15 @@ static int aie2_mgmt_fw_init(struct amdxdna_dev_hdl *ndev)
return ret;
}
+ if (!ndev->async_events)
+ return 0;
+
+ ret = aie2_error_async_events_send(ndev);
+ if (ret) {
+ XDNA_ERR(ndev->xdna, "Send async events failed");
+ return ret;
+ }
+
return 0;
}
@@ -469,9 +478,30 @@ static int aie2_init(struct amdxdna_dev *xdna)
goto stop_hw;
}
+ ret = aie2_error_async_events_alloc(ndev);
+ if (ret) {
+ XDNA_ERR(xdna, "Allocate async events failed, ret %d", ret);
+ goto stop_hw;
+ }
+
+ ret = aie2_error_async_events_send(ndev);
+ if (ret) {
+ XDNA_ERR(xdna, "Send async events failed, ret %d", ret);
+ goto async_event_free;
+ }
+
+ /* Issue a command to make sure firmware handled async events */
+ ret = aie2_query_firmware_version(ndev, &ndev->xdna->fw_ver);
+ if (ret) {
+ XDNA_ERR(xdna, "Re-query firmware version failed");
+ goto async_event_free;
+ }
+
release_firmware(fw);
return 0;
+async_event_free:
+ aie2_error_async_events_free(ndev);
stop_hw:
aie2_hw_stop(xdna);
disable_sva:
@@ -487,8 +517,10 @@ release_fw:
static void aie2_fini(struct amdxdna_dev *xdna)
{
struct pci_dev *pdev = to_pci_dev(xdna->ddev.dev);
+ struct amdxdna_dev_hdl *ndev = xdna->dev_handle;
aie2_hw_stop(xdna);
+ aie2_error_async_events_free(ndev);
iommu_dev_disable_feature(&pdev->dev, IOMMU_DEV_FEAT_SVA);
pci_free_irq_vectors(pdev);
}