mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2024-12-22 08:39:31 +07:00
fcdfb968a7
The use of a scatterlist for handling DMA mapping and unmapping was recently introduced in frwr_ops.c in commit4143f34e01
("xprtrdma: Port to new memory registration API"). That commit did not make a similar update to xprtrdma's FMR support because the core ib_map_phys_fmr() and ib_unmap_fmr() APIs have not been changed to take a scatterlist argument. However, FMR still needs to do DMA mapping and unmapping. It appears that RDS, for example, uses a scatterlist for this, then builds the DMA addr array for the ib_map_phys_fmr call separately. I see that SRP also utilizes a scatterlist for DMA mapping. xprtrdma can do something similar. This modernization is used immediately to properly defer DMA unmapping during fmr_unmap_safe (a FIXME). It separates the DMA unmapping coordinates from the rl_segments array. This array, being part of an rpcrdma_req, is always re-used immediately when an RPC exits. A scatterlist is allocated in memory independent of the rl_segments array, so it can be preserved indefinitely (ie, until the MR invalidation and DMA unmapping can actually be done by a worker thread). The FRWR and FMR DMA mapping code are slightly different from each other now, and will diverge further when the "Check for holes" logic can be removed from FRWR (support for SG_GAP MRs). So I chose not to create helpers for the common-looking code. Fixes:ead3f26e35
("xprtrdma: Add ro_unmap_safe memreg method") Suggested-by: Sagi Grimberg <sagi@lightbits.io> Signed-off-by: Chuck Lever <chuck.lever@oracle.com> Tested-by: Steve Wise <swise@opengridcomputing.com> Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
416 lines
9.3 KiB
C
416 lines
9.3 KiB
C
/*
|
|
* Copyright (c) 2015 Oracle. All rights reserved.
|
|
* Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
|
|
*/
|
|
|
|
/* Lightweight memory registration using Fast Memory Regions (FMR).
|
|
* Referred to sometimes as MTHCAFMR mode.
|
|
*
|
|
* FMR uses synchronous memory registration and deregistration.
|
|
* FMR registration is known to be fast, but FMR deregistration
|
|
* can take tens of usecs to complete.
|
|
*/
|
|
|
|
/* Normal operation
|
|
*
|
|
* A Memory Region is prepared for RDMA READ or WRITE using the
|
|
* ib_map_phys_fmr verb (fmr_op_map). When the RDMA operation is
|
|
* finished, the Memory Region is unmapped using the ib_unmap_fmr
|
|
* verb (fmr_op_unmap).
|
|
*/
|
|
|
|
/* Transport recovery
|
|
*
|
|
* After a transport reconnect, fmr_op_map re-uses the MR already
|
|
* allocated for the RPC, but generates a fresh rkey then maps the
|
|
* MR again. This process is synchronous.
|
|
*/
|
|
|
|
#include "xprt_rdma.h"
|
|
|
|
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
|
|
# define RPCDBG_FACILITY RPCDBG_TRANS
|
|
#endif
|
|
|
|
/* Maximum scatter/gather per FMR */
|
|
#define RPCRDMA_MAX_FMR_SGES (64)
|
|
|
|
/* Access mode of externally registered pages */
|
|
enum {
|
|
RPCRDMA_FMR_ACCESS_FLAGS = IB_ACCESS_REMOTE_WRITE |
|
|
IB_ACCESS_REMOTE_READ,
|
|
};
|
|
|
|
static struct workqueue_struct *fmr_recovery_wq;
|
|
|
|
#define FMR_RECOVERY_WQ_FLAGS (WQ_UNBOUND)
|
|
|
|
int
|
|
fmr_alloc_recovery_wq(void)
|
|
{
|
|
fmr_recovery_wq = alloc_workqueue("fmr_recovery", WQ_UNBOUND, 0);
|
|
return !fmr_recovery_wq ? -ENOMEM : 0;
|
|
}
|
|
|
|
void
|
|
fmr_destroy_recovery_wq(void)
|
|
{
|
|
struct workqueue_struct *wq;
|
|
|
|
if (!fmr_recovery_wq)
|
|
return;
|
|
|
|
wq = fmr_recovery_wq;
|
|
fmr_recovery_wq = NULL;
|
|
destroy_workqueue(wq);
|
|
}
|
|
|
|
static int
|
|
__fmr_init(struct rpcrdma_mw *mw, struct ib_pd *pd)
|
|
{
|
|
static struct ib_fmr_attr fmr_attr = {
|
|
.max_pages = RPCRDMA_MAX_FMR_SGES,
|
|
.max_maps = 1,
|
|
.page_shift = PAGE_SHIFT
|
|
};
|
|
|
|
mw->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES,
|
|
sizeof(u64), GFP_KERNEL);
|
|
if (!mw->fmr.fm_physaddrs)
|
|
goto out_free;
|
|
|
|
mw->mw_sg = kcalloc(RPCRDMA_MAX_FMR_SGES,
|
|
sizeof(*mw->mw_sg), GFP_KERNEL);
|
|
if (!mw->mw_sg)
|
|
goto out_free;
|
|
|
|
sg_init_table(mw->mw_sg, RPCRDMA_MAX_FMR_SGES);
|
|
|
|
mw->fmr.fm_mr = ib_alloc_fmr(pd, RPCRDMA_FMR_ACCESS_FLAGS,
|
|
&fmr_attr);
|
|
if (IS_ERR(mw->fmr.fm_mr))
|
|
goto out_fmr_err;
|
|
|
|
return 0;
|
|
|
|
out_fmr_err:
|
|
dprintk("RPC: %s: ib_alloc_fmr returned %ld\n", __func__,
|
|
PTR_ERR(mw->fmr.fm_mr));
|
|
|
|
out_free:
|
|
kfree(mw->mw_sg);
|
|
kfree(mw->fmr.fm_physaddrs);
|
|
return -ENOMEM;
|
|
}
|
|
|
|
static int
|
|
__fmr_unmap(struct rpcrdma_mw *mw)
|
|
{
|
|
LIST_HEAD(l);
|
|
int rc;
|
|
|
|
list_add(&mw->fmr.fm_mr->list, &l);
|
|
rc = ib_unmap_fmr(&l);
|
|
list_del_init(&mw->fmr.fm_mr->list);
|
|
return rc;
|
|
}
|
|
|
|
static void
|
|
__fmr_dma_unmap(struct rpcrdma_mw *mw)
|
|
{
|
|
struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
|
|
|
|
ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
|
|
mw->mw_sg, mw->mw_nents, mw->mw_dir);
|
|
rpcrdma_put_mw(r_xprt, mw);
|
|
}
|
|
|
|
static void
|
|
__fmr_reset_and_unmap(struct rpcrdma_mw *mw)
|
|
{
|
|
int rc;
|
|
|
|
/* ORDER */
|
|
rc = __fmr_unmap(mw);
|
|
if (rc) {
|
|
pr_warn("rpcrdma: ib_unmap_fmr status %d, fmr %p orphaned\n",
|
|
rc, mw);
|
|
return;
|
|
}
|
|
__fmr_dma_unmap(mw);
|
|
}
|
|
|
|
static void
|
|
__fmr_release(struct rpcrdma_mw *r)
|
|
{
|
|
int rc;
|
|
|
|
kfree(r->fmr.fm_physaddrs);
|
|
kfree(r->mw_sg);
|
|
|
|
rc = ib_dealloc_fmr(r->fmr.fm_mr);
|
|
if (rc)
|
|
pr_err("rpcrdma: final ib_dealloc_fmr for %p returned %i\n",
|
|
r, rc);
|
|
}
|
|
|
|
/* Deferred reset of a single FMR. Generate a fresh rkey by
|
|
* replacing the MR. There's no recovery if this fails.
|
|
*/
|
|
static void
|
|
__fmr_recovery_worker(struct work_struct *work)
|
|
{
|
|
struct rpcrdma_mw *mw = container_of(work, struct rpcrdma_mw,
|
|
mw_work);
|
|
|
|
__fmr_reset_and_unmap(mw);
|
|
return;
|
|
}
|
|
|
|
/* A broken MR was discovered in a context that can't sleep.
|
|
* Defer recovery to the recovery worker.
|
|
*/
|
|
static void
|
|
__fmr_queue_recovery(struct rpcrdma_mw *mw)
|
|
{
|
|
INIT_WORK(&mw->mw_work, __fmr_recovery_worker);
|
|
queue_work(fmr_recovery_wq, &mw->mw_work);
|
|
}
|
|
|
|
static int
|
|
fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
|
|
struct rpcrdma_create_data_internal *cdata)
|
|
{
|
|
rpcrdma_set_max_header_sizes(ia, cdata, max_t(unsigned int, 1,
|
|
RPCRDMA_MAX_DATA_SEGS /
|
|
RPCRDMA_MAX_FMR_SGES));
|
|
return 0;
|
|
}
|
|
|
|
/* FMR mode conveys up to 64 pages of payload per chunk segment.
|
|
*/
|
|
static size_t
|
|
fmr_op_maxpages(struct rpcrdma_xprt *r_xprt)
|
|
{
|
|
return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
|
|
RPCRDMA_MAX_HDR_SEGS * RPCRDMA_MAX_FMR_SGES);
|
|
}
|
|
|
|
static int
|
|
fmr_op_init(struct rpcrdma_xprt *r_xprt)
|
|
{
|
|
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
|
|
struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
|
|
struct rpcrdma_mw *r;
|
|
int i, rc;
|
|
|
|
spin_lock_init(&buf->rb_mwlock);
|
|
INIT_LIST_HEAD(&buf->rb_mws);
|
|
INIT_LIST_HEAD(&buf->rb_all);
|
|
|
|
i = max_t(int, RPCRDMA_MAX_DATA_SEGS / RPCRDMA_MAX_FMR_SGES, 1);
|
|
i += 2; /* head + tail */
|
|
i *= buf->rb_max_requests; /* one set for each RPC slot */
|
|
dprintk("RPC: %s: initalizing %d FMRs\n", __func__, i);
|
|
|
|
while (i--) {
|
|
r = kzalloc(sizeof(*r), GFP_KERNEL);
|
|
if (!r)
|
|
return -ENOMEM;
|
|
|
|
rc = __fmr_init(r, pd);
|
|
if (rc) {
|
|
kfree(r);
|
|
return rc;
|
|
}
|
|
|
|
r->mw_xprt = r_xprt;
|
|
list_add(&r->mw_list, &buf->rb_mws);
|
|
list_add(&r->mw_all, &buf->rb_all);
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/* Use the ib_map_phys_fmr() verb to register a memory region
|
|
* for remote access via RDMA READ or RDMA WRITE.
|
|
*/
|
|
static int
|
|
fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
|
|
int nsegs, bool writing)
|
|
{
|
|
struct rpcrdma_mr_seg *seg1 = seg;
|
|
int len, pageoff, i, rc;
|
|
struct rpcrdma_mw *mw;
|
|
u64 *dma_pages;
|
|
|
|
mw = seg1->rl_mw;
|
|
seg1->rl_mw = NULL;
|
|
if (!mw) {
|
|
mw = rpcrdma_get_mw(r_xprt);
|
|
if (!mw)
|
|
return -ENOMEM;
|
|
} else {
|
|
/* this is a retransmit; generate a fresh rkey */
|
|
rc = __fmr_unmap(mw);
|
|
if (rc)
|
|
return rc;
|
|
}
|
|
|
|
pageoff = offset_in_page(seg1->mr_offset);
|
|
seg1->mr_offset -= pageoff; /* start of page */
|
|
seg1->mr_len += pageoff;
|
|
len = -pageoff;
|
|
if (nsegs > RPCRDMA_MAX_FMR_SGES)
|
|
nsegs = RPCRDMA_MAX_FMR_SGES;
|
|
for (i = 0; i < nsegs;) {
|
|
if (seg->mr_page)
|
|
sg_set_page(&mw->mw_sg[i],
|
|
seg->mr_page,
|
|
seg->mr_len,
|
|
offset_in_page(seg->mr_offset));
|
|
else
|
|
sg_set_buf(&mw->mw_sg[i], seg->mr_offset,
|
|
seg->mr_len);
|
|
len += seg->mr_len;
|
|
++seg;
|
|
++i;
|
|
/* Check for holes */
|
|
if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
|
|
offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
|
|
break;
|
|
}
|
|
mw->mw_nents = i;
|
|
mw->mw_dir = rpcrdma_data_dir(writing);
|
|
|
|
if (!ib_dma_map_sg(r_xprt->rx_ia.ri_device,
|
|
mw->mw_sg, mw->mw_nents, mw->mw_dir))
|
|
goto out_dmamap_err;
|
|
|
|
for (i = 0, dma_pages = mw->fmr.fm_physaddrs; i < mw->mw_nents; i++)
|
|
dma_pages[i] = sg_dma_address(&mw->mw_sg[i]);
|
|
rc = ib_map_phys_fmr(mw->fmr.fm_mr, dma_pages, mw->mw_nents,
|
|
dma_pages[0]);
|
|
if (rc)
|
|
goto out_maperr;
|
|
|
|
seg1->rl_mw = mw;
|
|
seg1->mr_rkey = mw->fmr.fm_mr->rkey;
|
|
seg1->mr_base = dma_pages[0] + pageoff;
|
|
seg1->mr_nsegs = mw->mw_nents;
|
|
seg1->mr_len = len;
|
|
return mw->mw_nents;
|
|
|
|
out_dmamap_err:
|
|
pr_err("rpcrdma: failed to dma map sg %p sg_nents %u\n",
|
|
mw->mw_sg, mw->mw_nents);
|
|
return -ENOMEM;
|
|
|
|
out_maperr:
|
|
pr_err("rpcrdma: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n",
|
|
len, (unsigned long long)dma_pages[0],
|
|
pageoff, mw->mw_nents, rc);
|
|
__fmr_dma_unmap(mw);
|
|
return rc;
|
|
}
|
|
|
|
/* Invalidate all memory regions that were registered for "req".
|
|
*
|
|
* Sleeps until it is safe for the host CPU to access the
|
|
* previously mapped memory regions.
|
|
*/
|
|
static void
|
|
fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
|
|
{
|
|
struct rpcrdma_mr_seg *seg;
|
|
unsigned int i, nchunks;
|
|
struct rpcrdma_mw *mw;
|
|
LIST_HEAD(unmap_list);
|
|
int rc;
|
|
|
|
dprintk("RPC: %s: req %p\n", __func__, req);
|
|
|
|
/* ORDER: Invalidate all of the req's MRs first
|
|
*
|
|
* ib_unmap_fmr() is slow, so use a single call instead
|
|
* of one call per mapped MR.
|
|
*/
|
|
for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
|
|
seg = &req->rl_segments[i];
|
|
mw = seg->rl_mw;
|
|
|
|
list_add_tail(&mw->fmr.fm_mr->list, &unmap_list);
|
|
|
|
i += seg->mr_nsegs;
|
|
}
|
|
rc = ib_unmap_fmr(&unmap_list);
|
|
if (rc)
|
|
pr_warn("%s: ib_unmap_fmr failed (%i)\n", __func__, rc);
|
|
|
|
/* ORDER: Now DMA unmap all of the req's MRs, and return
|
|
* them to the free MW list.
|
|
*/
|
|
for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
|
|
seg = &req->rl_segments[i];
|
|
mw = seg->rl_mw;
|
|
|
|
list_del_init(&mw->fmr.fm_mr->list);
|
|
__fmr_dma_unmap(mw);
|
|
|
|
i += seg->mr_nsegs;
|
|
seg->mr_nsegs = 0;
|
|
seg->rl_mw = NULL;
|
|
}
|
|
|
|
req->rl_nchunks = 0;
|
|
}
|
|
|
|
/* Use a slow, safe mechanism to invalidate all memory regions
|
|
* that were registered for "req".
|
|
*/
|
|
static void
|
|
fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
|
|
bool sync)
|
|
{
|
|
struct rpcrdma_mr_seg *seg;
|
|
struct rpcrdma_mw *mw;
|
|
unsigned int i;
|
|
|
|
for (i = 0; req->rl_nchunks; req->rl_nchunks--) {
|
|
seg = &req->rl_segments[i];
|
|
mw = seg->rl_mw;
|
|
|
|
if (sync)
|
|
__fmr_reset_and_unmap(mw);
|
|
else
|
|
__fmr_queue_recovery(mw);
|
|
|
|
i += seg->mr_nsegs;
|
|
seg->mr_nsegs = 0;
|
|
seg->rl_mw = NULL;
|
|
}
|
|
}
|
|
|
|
static void
|
|
fmr_op_destroy(struct rpcrdma_buffer *buf)
|
|
{
|
|
struct rpcrdma_mw *r;
|
|
|
|
while (!list_empty(&buf->rb_all)) {
|
|
r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
|
|
list_del(&r->mw_all);
|
|
__fmr_release(r);
|
|
kfree(r);
|
|
}
|
|
}
|
|
|
|
const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
|
|
.ro_map = fmr_op_map,
|
|
.ro_unmap_sync = fmr_op_unmap_sync,
|
|
.ro_unmap_safe = fmr_op_unmap_safe,
|
|
.ro_open = fmr_op_open,
|
|
.ro_maxpages = fmr_op_maxpages,
|
|
.ro_init = fmr_op_init,
|
|
.ro_destroy = fmr_op_destroy,
|
|
.ro_displayname = "fmr",
|
|
};
|