mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2024-12-12 18:46:46 +07:00
nvme-rdma: don't suppress send completions
The entire completions suppress mechanism is currently broken because the HCA might retry a send operation (due to dropped ack) after the nvme transaction has completed. In order to handle this, we signal all send completions and introduce a separate done handler for async events as they will be handled differently (as they don't include in-capsule data by definition). Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Reviewed-by: Max Gurtovoy <maxg@mellanox.com> Signed-off-by: Christoph Hellwig <hch@lst.de>
This commit is contained in:
parent
6c4ca1e36c
commit
b4b591c87f
@ -77,7 +77,6 @@ enum nvme_rdma_queue_flags {
|
|||||||
|
|
||||||
struct nvme_rdma_queue {
|
struct nvme_rdma_queue {
|
||||||
struct nvme_rdma_qe *rsp_ring;
|
struct nvme_rdma_qe *rsp_ring;
|
||||||
atomic_t sig_count;
|
|
||||||
int queue_size;
|
int queue_size;
|
||||||
size_t cmnd_capsule_len;
|
size_t cmnd_capsule_len;
|
||||||
struct nvme_rdma_ctrl *ctrl;
|
struct nvme_rdma_ctrl *ctrl;
|
||||||
@ -510,7 +509,6 @@ static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl,
|
|||||||
queue->cmnd_capsule_len = sizeof(struct nvme_command);
|
queue->cmnd_capsule_len = sizeof(struct nvme_command);
|
||||||
|
|
||||||
queue->queue_size = queue_size;
|
queue->queue_size = queue_size;
|
||||||
atomic_set(&queue->sig_count, 0);
|
|
||||||
|
|
||||||
queue->cm_id = rdma_create_id(&init_net, nvme_rdma_cm_handler, queue,
|
queue->cm_id = rdma_create_id(&init_net, nvme_rdma_cm_handler, queue,
|
||||||
RDMA_PS_TCP, IB_QPT_RC);
|
RDMA_PS_TCP, IB_QPT_RC);
|
||||||
@ -1204,21 +1202,9 @@ static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
|
|||||||
nvme_rdma_wr_error(cq, wc, "SEND");
|
nvme_rdma_wr_error(cq, wc, "SEND");
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* We want to signal completion at least every queue depth/2. This returns the
|
|
||||||
* largest power of two that is not above half of (queue size + 1) to optimize
|
|
||||||
* (avoid divisions).
|
|
||||||
*/
|
|
||||||
static inline bool nvme_rdma_queue_sig_limit(struct nvme_rdma_queue *queue)
|
|
||||||
{
|
|
||||||
int limit = 1 << ilog2((queue->queue_size + 1) / 2);
|
|
||||||
|
|
||||||
return (atomic_inc_return(&queue->sig_count) & (limit - 1)) == 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
|
static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
|
||||||
struct nvme_rdma_qe *qe, struct ib_sge *sge, u32 num_sge,
|
struct nvme_rdma_qe *qe, struct ib_sge *sge, u32 num_sge,
|
||||||
struct ib_send_wr *first, bool flush)
|
struct ib_send_wr *first)
|
||||||
{
|
{
|
||||||
struct ib_send_wr wr, *bad_wr;
|
struct ib_send_wr wr, *bad_wr;
|
||||||
int ret;
|
int ret;
|
||||||
@ -1227,31 +1213,12 @@ static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
|
|||||||
sge->length = sizeof(struct nvme_command),
|
sge->length = sizeof(struct nvme_command),
|
||||||
sge->lkey = queue->device->pd->local_dma_lkey;
|
sge->lkey = queue->device->pd->local_dma_lkey;
|
||||||
|
|
||||||
qe->cqe.done = nvme_rdma_send_done;
|
|
||||||
|
|
||||||
wr.next = NULL;
|
wr.next = NULL;
|
||||||
wr.wr_cqe = &qe->cqe;
|
wr.wr_cqe = &qe->cqe;
|
||||||
wr.sg_list = sge;
|
wr.sg_list = sge;
|
||||||
wr.num_sge = num_sge;
|
wr.num_sge = num_sge;
|
||||||
wr.opcode = IB_WR_SEND;
|
wr.opcode = IB_WR_SEND;
|
||||||
wr.send_flags = 0;
|
wr.send_flags = IB_SEND_SIGNALED;
|
||||||
|
|
||||||
/*
|
|
||||||
* Unsignalled send completions are another giant desaster in the
|
|
||||||
* IB Verbs spec: If we don't regularly post signalled sends
|
|
||||||
* the send queue will fill up and only a QP reset will rescue us.
|
|
||||||
* Would have been way to obvious to handle this in hardware or
|
|
||||||
* at least the RDMA stack..
|
|
||||||
*
|
|
||||||
* Always signal the flushes. The magic request used for the flush
|
|
||||||
* sequencer is not allocated in our driver's tagset and it's
|
|
||||||
* triggered to be freed by blk_cleanup_queue(). So we need to
|
|
||||||
* always mark it as signaled to ensure that the "wr_cqe", which is
|
|
||||||
* embedded in request's payload, is not freed when __ib_process_cq()
|
|
||||||
* calls wr_cqe->done().
|
|
||||||
*/
|
|
||||||
if (nvme_rdma_queue_sig_limit(queue) || flush)
|
|
||||||
wr.send_flags |= IB_SEND_SIGNALED;
|
|
||||||
|
|
||||||
if (first)
|
if (first)
|
||||||
first->next = ≀
|
first->next = ≀
|
||||||
@ -1301,6 +1268,12 @@ static struct blk_mq_tags *nvme_rdma_tagset(struct nvme_rdma_queue *queue)
|
|||||||
return queue->ctrl->tag_set.tags[queue_idx - 1];
|
return queue->ctrl->tag_set.tags[queue_idx - 1];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void nvme_rdma_async_done(struct ib_cq *cq, struct ib_wc *wc)
|
||||||
|
{
|
||||||
|
if (unlikely(wc->status != IB_WC_SUCCESS))
|
||||||
|
nvme_rdma_wr_error(cq, wc, "ASYNC");
|
||||||
|
}
|
||||||
|
|
||||||
static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg)
|
static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg)
|
||||||
{
|
{
|
||||||
struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(arg);
|
struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(arg);
|
||||||
@ -1319,10 +1292,12 @@ static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg)
|
|||||||
cmd->common.flags |= NVME_CMD_SGL_METABUF;
|
cmd->common.flags |= NVME_CMD_SGL_METABUF;
|
||||||
nvme_rdma_set_sg_null(cmd);
|
nvme_rdma_set_sg_null(cmd);
|
||||||
|
|
||||||
|
sqe->cqe.done = nvme_rdma_async_done;
|
||||||
|
|
||||||
ib_dma_sync_single_for_device(dev, sqe->dma, sizeof(*cmd),
|
ib_dma_sync_single_for_device(dev, sqe->dma, sizeof(*cmd),
|
||||||
DMA_TO_DEVICE);
|
DMA_TO_DEVICE);
|
||||||
|
|
||||||
ret = nvme_rdma_post_send(queue, sqe, &sge, 1, NULL, false);
|
ret = nvme_rdma_post_send(queue, sqe, &sge, 1, NULL);
|
||||||
WARN_ON_ONCE(ret);
|
WARN_ON_ONCE(ret);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1607,7 +1582,6 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
|
|||||||
struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
|
struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
|
||||||
struct nvme_rdma_qe *sqe = &req->sqe;
|
struct nvme_rdma_qe *sqe = &req->sqe;
|
||||||
struct nvme_command *c = sqe->data;
|
struct nvme_command *c = sqe->data;
|
||||||
bool flush = false;
|
|
||||||
struct ib_device *dev;
|
struct ib_device *dev;
|
||||||
blk_status_t ret;
|
blk_status_t ret;
|
||||||
int err;
|
int err;
|
||||||
@ -1636,13 +1610,13 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
|
|||||||
goto err;
|
goto err;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
sqe->cqe.done = nvme_rdma_send_done;
|
||||||
|
|
||||||
ib_dma_sync_single_for_device(dev, sqe->dma,
|
ib_dma_sync_single_for_device(dev, sqe->dma,
|
||||||
sizeof(struct nvme_command), DMA_TO_DEVICE);
|
sizeof(struct nvme_command), DMA_TO_DEVICE);
|
||||||
|
|
||||||
if (req_op(rq) == REQ_OP_FLUSH)
|
|
||||||
flush = true;
|
|
||||||
err = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge,
|
err = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge,
|
||||||
req->mr->need_inval ? &req->reg_wr.wr : NULL, flush);
|
req->mr->need_inval ? &req->reg_wr.wr : NULL);
|
||||||
if (unlikely(err)) {
|
if (unlikely(err)) {
|
||||||
nvme_rdma_unmap_data(queue, rq);
|
nvme_rdma_unmap_data(queue, rq);
|
||||||
goto err;
|
goto err;
|
||||||
|
Loading…
Reference in New Issue
Block a user