diff options
Diffstat (limited to 'drivers/infiniband/hw/mlx5/odp.c')
-rw-r--r-- | drivers/infiniband/hw/mlx5/odp.c | 106 |
1 file changed, 58 insertions, 48 deletions
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 2bc4d67b3e42..6213fe028cf2 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -257,8 +257,7 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start, * estimate the cost of another UMR vs. the cost of bigger * UMR. */ - if (umem_odp->dma_list[idx] & - (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) { + if (umem_odp->pfns[idx] & ODP_READ_BIT) { if (!in_block) { blk_start_idx = idx; in_block = 1; @@ -580,17 +579,18 @@ static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt, u32 *bytes_mapped, u32 flags) { - int npages = 0, current_seq, page_shift, ret, np; - bool implicit = false; struct ib_umem_odp *odp_mr = to_ib_umem_odp(mr->umem); bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE; bool prefetch = flags & MLX5_PF_FLAGS_PREFETCH; - u64 access_mask = ODP_READ_ALLOWED_BIT; + unsigned long npages = 0, page_shift, np, off; u64 start_idx, page_mask; struct ib_umem_odp *odp; - size_t size; + struct hmm_range range; + bool implicit = false; + size_t size, fault_size; + long ret; - if (!odp_mr->page_list) { + if (!odp_mr->pfns) { odp = implicit_mr_get_data(mr, io_virt, bcnt); if (IS_ERR(odp)) @@ -603,11 +603,30 @@ static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, next_mr: size = min_t(size_t, bcnt, ib_umem_end(&odp->umem) - io_virt); - page_shift = mr->umem->page_shift; page_mask = ~(BIT(page_shift) - 1); + /* + * We need to align io_virt on page size so off is the extra bytes we + * will be faulting and fault_size is the page aligned size we are + * faulting. 
+ */ + io_virt = io_virt & page_mask; + off = (io_virt & (~page_mask)); + fault_size = ALIGN(size + off, 1UL << page_shift); + + if (io_virt < ib_umem_start(&odp->umem)) + return -EINVAL; + start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift; + if (odp_mr->per_mm == NULL || odp_mr->per_mm->mm == NULL) + return -ENOENT; + + ret = hmm_range_register(&range, odp_mr->per_mm->mm, + io_virt, io_virt + fault_size, page_shift); + if (ret) + return ret; + if (prefetch && !downgrade && !mr->umem->writable) { /* prefetch with write-access must * be supported by the MR @@ -616,58 +635,55 @@ next_mr: goto out; } + range.default_flags = ODP_READ_BIT; if (mr->umem->writable && !downgrade) - access_mask |= ODP_WRITE_ALLOWED_BIT; - - current_seq = READ_ONCE(odp->notifiers_seq); - /* - * Ensure the sequence number is valid for some time before we call - * gup. - */ - smp_rmb(); - - ret = ib_umem_odp_map_dma_pages(to_ib_umem_odp(mr->umem), io_virt, size, - access_mask, current_seq); + range.default_flags |= ODP_WRITE_BIT; + ret = ib_umem_odp_map_dma_pages(to_ib_umem_odp(mr->umem), &range); if (ret < 0) - goto out; + goto again; np = ret; mutex_lock(&odp->umem_mutex); - if (!ib_umem_mmu_notifier_retry(to_ib_umem_odp(mr->umem), - current_seq)) { + if (hmm_range_valid(&range)) { /* * No need to check whether the MTTs really belong to - * this MR, since ib_umem_odp_map_dma_pages already + * this MR, since ib_umem_odp_map_dma_pages() already * checks this. */ ret = mlx5_ib_update_xlt(mr, start_idx, np, page_shift, MLX5_IB_UPD_XLT_ATOMIC); - } else { + } else ret = -EAGAIN; - } mutex_unlock(&odp->umem_mutex); if (ret < 0) { - if (ret != -EAGAIN) + if (ret != -EAGAIN) { mlx5_ib_err(dev, "Failed to update mkey page tables\n"); - goto out; + goto out; + } + goto again; } if (bytes_mapped) { - u32 new_mappings = (np << page_shift) - - (io_virt - round_down(io_virt, 1 << page_shift)); + long new_mappings = (np << page_shift) - off; + new_mappings = new_mappings < 0 ? 
0 : new_mappings; *bytes_mapped += min_t(u32, new_mappings, size); } npages += np << (page_shift - PAGE_SHIFT); + hmm_range_unregister(&range); bcnt -= size; - if (unlikely(bcnt)) { + if (unlikely(bcnt > 0)) { struct ib_umem_odp *next; - io_virt += size; + /* + * Next virtual address is after the number of bytes we faulted + * in this step. + */ + io_virt += fault_size; next = odp_next(odp); if (unlikely(!next || next->umem.address != io_virt)) { mlx5_ib_dbg(dev, "next implicit leaf removed at 0x%llx. got %p\n", @@ -681,24 +697,18 @@ next_mr: return npages; -out: - if (ret == -EAGAIN) { - if (implicit || !odp->dying) { - unsigned long timeout = - msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT); - - if (!wait_for_completion_timeout( - &odp->notifier_completion, - timeout)) { - mlx5_ib_warn(dev, "timeout waiting for mmu notifier. seq %d against %d. notifiers_count=%d\n", - current_seq, odp->notifiers_seq, odp->notifiers_count); - } - } else { - /* The MR is being killed, kill the QP as well. */ - ret = -EFAULT; - } - } +again: + if (ret != -EAGAIN) + goto out; + + /* Check if the MR is being killed, kill the QP as well. */ + if (!implicit || odp->dying) + ret = -EFAULT; + else if (!hmm_range_wait_until_valid(&range, MMU_NOTIFIER_TIMEOUT)) + mlx5_ib_warn(dev, "timeout waiting for mmu notifier.\n"); +out: + hmm_range_unregister(&range); return ret; } |