summary | refs | log | tree | commit | diff
path: root/drivers/infiniband/hw/mlx5/odp.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/infiniband/hw/mlx5/odp.c')
-rw-r--r-- drivers/infiniband/hw/mlx5/odp.c 106
1 files changed, 58 insertions, 48 deletions
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 2bc4d67b3e42..6213fe028cf2 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -257,8 +257,7 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
* estimate the cost of another UMR vs. the cost of bigger
* UMR.
*/
- if (umem_odp->dma_list[idx] &
- (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
+ if (umem_odp->pfns[idx] & ODP_READ_BIT) {
if (!in_block) {
blk_start_idx = idx;
in_block = 1;
@@ -580,17 +579,18 @@ static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
u64 io_virt, size_t bcnt, u32 *bytes_mapped,
u32 flags)
{
- int npages = 0, current_seq, page_shift, ret, np;
- bool implicit = false;
struct ib_umem_odp *odp_mr = to_ib_umem_odp(mr->umem);
bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
bool prefetch = flags & MLX5_PF_FLAGS_PREFETCH;
- u64 access_mask = ODP_READ_ALLOWED_BIT;
+ unsigned long npages = 0, page_shift, np, off;
u64 start_idx, page_mask;
struct ib_umem_odp *odp;
- size_t size;
+ struct hmm_range range;
+ bool implicit = false;
+ size_t size, fault_size;
+ long ret;
- if (!odp_mr->page_list) {
+ if (!odp_mr->pfns) {
odp = implicit_mr_get_data(mr, io_virt, bcnt);
if (IS_ERR(odp))
@@ -603,11 +603,30 @@ static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
next_mr:
size = min_t(size_t, bcnt, ib_umem_end(&odp->umem) - io_virt);
-
page_shift = mr->umem->page_shift;
page_mask = ~(BIT(page_shift) - 1);
+ /*
+ * Align io_virt down to the page size. off is the number of extra
+ * bytes, below the requested address, that get faulted as a result,
+ * and fault_size is the page-aligned size of the range we fault.
+ */
+ io_virt = io_virt & page_mask;
+ off = (io_virt & (~page_mask));
+ fault_size = ALIGN(size + off, 1UL << page_shift);
+
+ if (io_virt < ib_umem_start(&odp->umem))
+ return -EINVAL;
+
start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift;
+ if (odp_mr->per_mm == NULL || odp_mr->per_mm->mm == NULL)
+ return -ENOENT;
+
+ ret = hmm_range_register(&range, odp_mr->per_mm->mm,
+ io_virt, io_virt + fault_size, page_shift);
+ if (ret)
+ return ret;
+
if (prefetch && !downgrade && !mr->umem->writable) {
/* prefetch with write-access must
* be supported by the MR
@@ -616,58 +635,55 @@ next_mr:
goto out;
}
+ range.default_flags = ODP_READ_BIT;
if (mr->umem->writable && !downgrade)
- access_mask |= ODP_WRITE_ALLOWED_BIT;
-
- current_seq = READ_ONCE(odp->notifiers_seq);
- /*
- * Ensure the sequence number is valid for some time before we call
- * gup.
- */
- smp_rmb();
-
- ret = ib_umem_odp_map_dma_pages(to_ib_umem_odp(mr->umem), io_virt, size,
- access_mask, current_seq);
+ range.default_flags |= ODP_WRITE_BIT;
+ ret = ib_umem_odp_map_dma_pages(to_ib_umem_odp(mr->umem), &range);
if (ret < 0)
- goto out;
+ goto again;
np = ret;
mutex_lock(&odp->umem_mutex);
- if (!ib_umem_mmu_notifier_retry(to_ib_umem_odp(mr->umem),
- current_seq)) {
+ if (hmm_range_valid(&range)) {
/*
* No need to check whether the MTTs really belong to
- * this MR, since ib_umem_odp_map_dma_pages already
+ * this MR, since ib_umem_odp_map_dma_pages() already
* checks this.
*/
ret = mlx5_ib_update_xlt(mr, start_idx, np,
page_shift, MLX5_IB_UPD_XLT_ATOMIC);
- } else {
+ } else
ret = -EAGAIN;
- }
mutex_unlock(&odp->umem_mutex);
if (ret < 0) {
- if (ret != -EAGAIN)
+ if (ret != -EAGAIN) {
mlx5_ib_err(dev, "Failed to update mkey page tables\n");
- goto out;
+ goto out;
+ }
+ goto again;
}
if (bytes_mapped) {
- u32 new_mappings = (np << page_shift) -
- (io_virt - round_down(io_virt, 1 << page_shift));
+ long new_mappings = (np << page_shift) - off;
+ new_mappings = new_mappings < 0 ? 0 : new_mappings;
*bytes_mapped += min_t(u32, new_mappings, size);
}
npages += np << (page_shift - PAGE_SHIFT);
+ hmm_range_unregister(&range);
bcnt -= size;
- if (unlikely(bcnt)) {
+ if (unlikely(bcnt > 0)) {
struct ib_umem_odp *next;
- io_virt += size;
+ /*
+ * Next virtual address is after the number of bytes we faulted
+ * in this step.
+ */
+ io_virt += fault_size;
next = odp_next(odp);
if (unlikely(!next || next->umem.address != io_virt)) {
mlx5_ib_dbg(dev, "next implicit leaf removed at 0x%llx. got %p\n",
@@ -681,24 +697,18 @@ next_mr:
return npages;
-out:
- if (ret == -EAGAIN) {
- if (implicit || !odp->dying) {
- unsigned long timeout =
- msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT);
-
- if (!wait_for_completion_timeout(
- &odp->notifier_completion,
- timeout)) {
- mlx5_ib_warn(dev, "timeout waiting for mmu notifier. seq %d against %d. notifiers_count=%d\n",
- current_seq, odp->notifiers_seq, odp->notifiers_count);
- }
- } else {
- /* The MR is being killed, kill the QP as well. */
- ret = -EFAULT;
- }
- }
+again:
+ if (ret != -EAGAIN)
+ goto out;
+
+ /* If the MR is being killed, kill the QP as well. */
+ if (!implicit || odp->dying)
+ ret = -EFAULT;
+ else if (!hmm_range_wait_until_valid(&range, MMU_NOTIFIER_TIMEOUT))
+ mlx5_ib_warn(dev, "timeout waiting for mmu notifier.\n");
+out:
+ hmm_range_unregister(&range);
return ret;
}