summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdkfd
diff options
context:
space:
mode:
authorRajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>2021-01-11 13:27:50 -0500
committerAlex Deucher <alexander.deucher@amd.com>2022-02-07 17:59:41 -0500
commit011bbb03024f5a22dc04eba370f9296f0cb83502 (patch)
treee5427c42e03acdc91872818eae0a00d5d488032f /drivers/gpu/drm/amd/amdkfd
parent73fa13b6a5112b59d09e58b8075973769a15814c (diff)
drm/amdkfd: CRIU Implement KFD resume ioctl
This adds support for creating userptr BOs on restore and introduces a new ioctl op to restart memory notifiers for the restored userptr BOs. When doing a CRIU restore, MMU notifications can happen any time after we call amdgpu_mn_register. Prevent MMU notifications until we reach stage-4 of the restore process, i.e. until the criu_resume ioctl op is received and the process is ready to be resumed. This ioctl is different from the other KFD CRIU ioctls since it is called by the CRIU master restore process for all the target processes being resumed by CRIU. Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com> Signed-off-by: David Yat Sin <david.yatsin@amd.com> Signed-off-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd')
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_chardev.c41
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_priv.h1
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_process.c35
3 files changed, 67 insertions, 10 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 342fc56b1940..95fc5668195c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1324,7 +1324,7 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep,
err = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
dev->adev, args->va_addr, args->size,
pdd->drm_priv, (struct kgd_mem **) &mem, &offset,
- flags);
+ flags, false);
if (err)
goto err_unlock;
@@ -2116,6 +2116,7 @@ static int criu_restore_bos(struct kfd_process *p,
{
struct kfd_criu_bo_bucket *bo_buckets;
struct kfd_criu_bo_priv_data *bo_privs;
+ const bool criu_resume = true;
bool flush_tlbs = false;
int ret = 0, j = 0;
uint32_t i;
@@ -2123,6 +2124,9 @@ static int criu_restore_bos(struct kfd_process *p,
if (*priv_offset + (args->num_bos * sizeof(*bo_privs)) > max_priv_data_size)
return -EINVAL;
+ /* Prevent MMU notifications until stage-4 IOCTL (CRIU_RESUME) is received */
+ amdgpu_amdkfd_block_mmu_notifications(p->kgd_process_info);
+
bo_buckets = kvmalloc_array(args->num_bos, sizeof(*bo_buckets), GFP_KERNEL);
if (!bo_buckets)
return -ENOMEM;
@@ -2211,7 +2215,6 @@ static int criu_restore_bos(struct kfd_process *p,
} else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
offset = bo_priv->user_addr;
}
-
/* Create the BO */
ret = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(dev->adev,
bo_bucket->addr,
@@ -2219,7 +2222,8 @@ static int criu_restore_bos(struct kfd_process *p,
pdd->drm_priv,
(struct kgd_mem **) &mem,
&offset,
- bo_bucket->alloc_flags);
+ bo_bucket->alloc_flags,
+ criu_resume);
if (ret) {
pr_err("Could not create the BO\n");
ret = -ENOMEM;
@@ -2239,7 +2243,6 @@ static int criu_restore_bos(struct kfd_process *p,
amdgpu_amdkfd_gpuvm_free_memory_of_gpu(dev->adev,
(struct kgd_mem *)mem,
pdd->drm_priv, NULL);
-
ret = -ENOMEM;
goto exit;
}
@@ -2392,7 +2395,35 @@ static int criu_resume(struct file *filep,
struct kfd_process *p,
struct kfd_ioctl_criu_args *args)
{
- return 0;
+ struct kfd_process *target = NULL;
+ struct pid *pid = NULL;
+ int ret = 0;
+
+ pr_debug("Inside %s, target pid for criu restore: %d\n", __func__,
+ args->pid);
+
+ pid = find_get_pid(args->pid);
+ if (!pid) {
+ pr_err("Cannot find pid info for %i\n", args->pid);
+ return -ESRCH;
+ }
+
+ pr_debug("calling kfd_lookup_process_by_pid\n");
+ target = kfd_lookup_process_by_pid(pid);
+
+ put_pid(pid);
+
+ if (!target) {
+ pr_debug("Cannot find process info for %i\n", args->pid);
+ return -ESRCH;
+ }
+
+ mutex_lock(&target->mutex);
+ ret = amdgpu_amdkfd_criu_resume(target->kgd_process_info);
+ mutex_unlock(&target->mutex);
+
+ kfd_unref_process(target);
+ return ret;
}
static int criu_process_info(struct file *filep,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index a4d08b6b2e6b..9b347247055c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -951,6 +951,7 @@ void *kfd_process_device_translate_handle(struct kfd_process_device *p,
int handle);
void kfd_process_device_remove_obj_handle(struct kfd_process_device *pdd,
int handle);
+struct kfd_process *kfd_lookup_process_by_pid(struct pid *pid);
/* PASIDs */
int kfd_pasid_init(void);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 74f162887d3b..b3198e186622 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -64,7 +64,8 @@ static struct workqueue_struct *kfd_process_wq;
*/
static struct workqueue_struct *kfd_restore_wq;
-static struct kfd_process *find_process(const struct task_struct *thread);
+static struct kfd_process *find_process(const struct task_struct *thread,
+ bool ref);
static void kfd_process_ref_release(struct kref *ref);
static struct kfd_process *create_process(const struct task_struct *thread);
static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep);
@@ -715,7 +716,8 @@ static int kfd_process_alloc_gpuvm(struct kfd_process_device *pdd,
int err;
err = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(kdev->adev, gpu_va, size,
- pdd->drm_priv, mem, NULL, flags);
+ pdd->drm_priv, mem, NULL,
+ flags, false);
if (err)
goto err_alloc_mem;
@@ -816,7 +818,7 @@ struct kfd_process *kfd_create_process(struct file *filep)
mutex_lock(&kfd_processes_mutex);
/* A prior open of /dev/kfd could have already created the process. */
- process = find_process(thread);
+ process = find_process(thread, false);
if (process) {
pr_debug("Process already found\n");
} else {
@@ -884,7 +886,7 @@ struct kfd_process *kfd_get_process(const struct task_struct *thread)
if (thread->group_leader->mm != thread->mm)
return ERR_PTR(-EINVAL);
- process = find_process(thread);
+ process = find_process(thread, false);
if (!process)
return ERR_PTR(-EINVAL);
@@ -903,13 +905,16 @@ static struct kfd_process *find_process_by_mm(const struct mm_struct *mm)
return NULL;
}
-static struct kfd_process *find_process(const struct task_struct *thread)
+static struct kfd_process *find_process(const struct task_struct *thread,
+ bool ref)
{
struct kfd_process *p;
int idx;
idx = srcu_read_lock(&kfd_processes_srcu);
p = find_process_by_mm(thread->mm);
+ if (p && ref)
+ kref_get(&p->ref);
srcu_read_unlock(&kfd_processes_srcu, idx);
return p;
@@ -920,6 +925,26 @@ void kfd_unref_process(struct kfd_process *p)
kref_put(&p->ref, kfd_process_ref_release);
}
+/* Look up the kfd_process for the task identified by @pid, or for the
+ * calling task (current) when @pid is NULL, using find_process().
+ *
+ * This increments the process->ref counter of the returned process, so the
+ * caller must drop that reference with kfd_unref_process() when done.
+ * Returns NULL when no task or no matching kfd_process is found. The task
+ * reference held during the lookup is released before returning.
+ */
+struct kfd_process *kfd_lookup_process_by_pid(struct pid *pid)
+{
+ struct task_struct *task = NULL;
+ struct kfd_process *p = NULL;
+
+ if (!pid) {
+ task = current;
+ get_task_struct(task);
+ } else {
+ task = get_pid_task(pid, PIDTYPE_PID);
+ }
+
+ if (task) {
+ p = find_process(task, true);
+ put_task_struct(task);
+ }
+
+ return p;
+}
static void kfd_process_device_free_bos(struct kfd_process_device *pdd)
{