nvme fixes for 5.10

 - rdma error handling fixes (Chao Leng)
 - fc error handling and reconnect fixes (James Smart)
 - fix the qid displayed when tracing ioctl commands (Keith Busch)
 - don't use BLK_MQ_REQ_NOWAIT for passthru (Chaitanya Kulkarni)
 - fix MDTS for passthru (Logan Gunthorpe)
 - blacklist Write Same on more devices (Kai-Heng Feng)
 - fix an uninitialized work struct (zhenwei pi)
 -----BEGIN PGP SIGNATURE-----
 
 iQI/BAABCgApFiEEgdbnc3r/njty3Iq9D55TZVIEUYMFAl+StyILHGhjaEBsc3Qu
 ZGUACgkQD55TZVIEUYPkMg//RdykKGgAPLFiFDsWpZCQEUiiZjWKndtEpX4orT25
 o7QE6TunecQFNZPIobow164ZbeeaKP5syCnPvtjgYMVvmIwZMZcHhA+pAZQAu9/j
 tsZ9wquTa1QV4qJbDmY4ptQGIqXODf8rLZybqMhw/e0l+E0iI5ngkn8TCnNQTqP5
 S0YFzdoVDefubDx9uzuLIbCuzBW+zv8bpxRaYWxKDI8AmezDn6PV8ZeFZOgxsmkI
 0nBSHfwbSPvxSWHAw/pzGoQ0g5aOe7zxHtslz/g3gOYVto/6JsGyCVAXICym5ZbD
 59GQ5PPZggfw+GVtdoEA/A21hMusf0aGnk7lTpNnLJ1IXmVNUI61/1AaD4LBAXjM
 Q1VfgP4YrXl+XpYhbGcAYB0OTNsntM8O9xsX1ZQpW2VRnKgjPiohtKq2QAEdWCKH
 mZR9f+XIe/BRnRRl0A37IyUDeo4/EFiQ66wKW+GAL1R/x/rJUFdY7sM4u1j0XGgN
 WZMHdpLyoSlXYcV280I0w8f7wASwAZ78A87w5W7pcuBKH0aCnssreRQ2vOGPHb0x
 7cqNQ+lyrp/Rhb9NHj0jJWEEJ6tHa8xH4/Tz+hsktx0vkK454YmS3Rbrll3D9wtC
 7LZZkluy9r3pg24xJNUo6V0DyVCF1Nl063TZ5YR8/N4lXRlk/6BE4k1IsK/Jcl8g
 4T4=
 =gjYy
 -----END PGP SIGNATURE-----

Merge tag 'nvme-5.10-2020-10-23' of git://git.infradead.org/nvme into block-5.10

Pull NVMe fixes from Christoph:

"nvme fixes for 5.10

 - rdma error handling fixes (Chao Leng)
 - fc error handling and reconnect fixes (James Smart)
 - fix the qid displayed when tracing ioctl commands (Keith Busch)
 - don't use BLK_MQ_REQ_NOWAIT for passthru (Chaitanya Kulkarni)
 - fix MDTS for passthru (Logan Gunthorpe)
 - blacklist Write Same on more devices (Kai-Heng Feng)
 - fix an uninitialized work struct (zhenwei pi)"

* tag 'nvme-5.10-2020-10-23' of git://git.infradead.org/nvme:
  nvme-fc: shorten reconnect delay if possible for FC
  nvme-fc: wait for queues to freeze before calling update_nr_hw_queues
  nvme-fc: fix error loop in create_hw_io_queues
  nvme-fc: fix io timeout to abort I/O
  nvmet: don't use BLK_MQ_REQ_NOWAIT for passthru
  nvmet: cleanup nvmet_passthru_map_sg()
  nvmet: limit passthru MDTS by BIO_MAX_PAGES
  nvmet: fix uninitialized work for zero kato
  nvme-pci: disable Write Zeroes on Sandisk Skyhawk
  nvme: use queuedata for nvme_req_qid
  nvme-rdma: fix crash due to incorrect cqe
  nvme-rdma: fix crash when connect rejected
This commit is contained in:
Jens Axboe 2020-10-23 07:29:08 -06:00
commit ddc62910b4
6 changed files with 137 additions and 76 deletions

View File

@ -26,6 +26,10 @@ enum nvme_fc_queue_flags {
};
#define NVME_FC_DEFAULT_DEV_LOSS_TMO 60 /* seconds */
#define NVME_FC_DEFAULT_RECONNECT_TMO 2 /* delay between reconnects
* when connected and a
* connection failure.
*/
struct nvme_fc_queue {
struct nvme_fc_ctrl *ctrl;
@ -1837,8 +1841,10 @@ __nvme_fc_abort_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_fcp_op *op)
opstate = atomic_xchg(&op->state, FCPOP_STATE_ABORTED);
if (opstate != FCPOP_STATE_ACTIVE)
atomic_set(&op->state, opstate);
else if (test_bit(FCCTRL_TERMIO, &ctrl->flags))
else if (test_bit(FCCTRL_TERMIO, &ctrl->flags)) {
op->flags |= FCOP_FLAGS_TERMIO;
ctrl->iocnt++;
}
spin_unlock_irqrestore(&ctrl->lock, flags);
if (opstate != FCPOP_STATE_ACTIVE)
@ -1874,7 +1880,8 @@ __nvme_fc_fcpop_chk_teardowns(struct nvme_fc_ctrl *ctrl,
if (opstate == FCPOP_STATE_ABORTED) {
spin_lock_irqsave(&ctrl->lock, flags);
if (test_bit(FCCTRL_TERMIO, &ctrl->flags)) {
if (test_bit(FCCTRL_TERMIO, &ctrl->flags) &&
op->flags & FCOP_FLAGS_TERMIO) {
if (!--ctrl->iocnt)
wake_up(&ctrl->ioabort_wait);
}
@ -2314,7 +2321,7 @@ nvme_fc_create_hw_io_queues(struct nvme_fc_ctrl *ctrl, u16 qsize)
return 0;
delete_queues:
for (; i >= 0; i--)
for (; i > 0; i--)
__nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[i], i);
return ret;
}
@ -2433,7 +2440,7 @@ nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg)
return;
dev_warn(ctrl->ctrl.device,
"NVME-FC{%d}: transport association error detected: %s\n",
"NVME-FC{%d}: transport association event: %s\n",
ctrl->cnum, errmsg);
dev_warn(ctrl->ctrl.device,
"NVME-FC{%d}: resetting controller\n", ctrl->cnum);
@ -2446,15 +2453,20 @@ nvme_fc_timeout(struct request *rq, bool reserved)
{
struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq);
struct nvme_fc_ctrl *ctrl = op->ctrl;
struct nvme_fc_cmd_iu *cmdiu = &op->cmd_iu;
struct nvme_command *sqe = &cmdiu->sqe;
/*
* we can't individually ABTS an io without affecting the queue,
* thus killing the queue, and thus the association.
* So resolve by performing a controller reset, which will stop
* the host/io stack, terminate the association on the link,
* and recreate an association on the link.
* Attempt to abort the offending command. Command completion
* will detect the aborted io and will fail the connection.
*/
nvme_fc_error_recovery(ctrl, "io timeout error");
dev_info(ctrl->ctrl.device,
"NVME-FC{%d.%d}: io timeout: opcode %d fctype %d w10/11: "
"x%08x/x%08x\n",
ctrl->cnum, op->queue->qnum, sqe->common.opcode,
sqe->connect.fctype, sqe->common.cdw10, sqe->common.cdw11);
if (__nvme_fc_abort_op(ctrl, op))
nvme_fc_error_recovery(ctrl, "io timeout abort failed");
/*
* the io abort has been initiated. Have the reset timer
@ -2726,6 +2738,7 @@ nvme_fc_complete_rq(struct request *rq)
struct nvme_fc_ctrl *ctrl = op->ctrl;
atomic_set(&op->state, FCPOP_STATE_IDLE);
op->flags &= ~FCOP_FLAGS_TERMIO;
nvme_fc_unmap_data(ctrl, rq, op);
nvme_complete_rq(rq);
@ -2876,11 +2889,14 @@ nvme_fc_recreate_io_queues(struct nvme_fc_ctrl *ctrl)
if (ret)
goto out_delete_hw_queues;
if (prior_ioq_cnt != nr_io_queues)
if (prior_ioq_cnt != nr_io_queues) {
dev_info(ctrl->ctrl.device,
"reconnect: revising io queue count from %d to %d\n",
prior_ioq_cnt, nr_io_queues);
blk_mq_update_nr_hw_queues(&ctrl->tag_set, nr_io_queues);
nvme_wait_freeze(&ctrl->ctrl);
blk_mq_update_nr_hw_queues(&ctrl->tag_set, nr_io_queues);
nvme_unfreeze(&ctrl->ctrl);
}
return 0;
@ -3090,6 +3106,61 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
return ret;
}
/*
* This routine runs through all outstanding commands on the association
* and aborts them. This routine is typically called by the
* delete_association routine. It is also called due to an error during
* reconnect. In that scenario, it is most likely a command that initializes
* the controller, including fabric Connect commands on io queues, that
* may have timed out or failed thus the io must be killed for the connect
* thread to see the error.
*
* ctrl:         controller whose io and admin tag sets are walked.
* start_queues: when true, the io queues are restarted once the
*               outstanding ios on them have been terminated. The admin
*               queue is quiesced by this routine and is not restarted
*               here, regardless of this flag.
*/
static void
__nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues)
{
/*
* If io queues are present, stop them and terminate all outstanding
* ios on them. As FC allocates FC exchange for each io, the
* transport must contact the LLDD to terminate the exchange,
* thus releasing the FC exchange. We use blk_mq_tagset_busy_iter()
* to tell us what io's are busy and invoke a transport routine
* to kill them with the LLDD. After terminating the exchange
* the LLDD will call the transport's normal io done path, but it
* will have an aborted status. The done path will return the
* io requests back to the block layer as part of normal completions
* (but with error status).
*/
if (ctrl->ctrl.queue_count > 1) {
nvme_stop_queues(&ctrl->ctrl);
blk_mq_tagset_busy_iter(&ctrl->tag_set,
nvme_fc_terminate_exchange, &ctrl->ctrl);
blk_mq_tagset_wait_completed_request(&ctrl->tag_set);
/* only the caller knows whether the io queues should run again */
if (start_queues)
nvme_start_queues(&ctrl->ctrl);
}
/*
* Other transports, which don't have link-level contexts bound
* to sqe's, would try to gracefully shutdown the controller by
* writing the registers for shutdown and polling (call
* nvme_shutdown_ctrl()). Given a bunch of i/o was potentially
* just aborted and we will wait on those contexts, and given
* there was no indication of how live the controller is on the
* link, don't send more io to create more contexts for the
* shutdown. Let the controller fail via keepalive failure if
* it's still present.
*/
/*
* clean up the admin queue. Same thing as above: quiesce it, then
* use blk_mq_tagset_busy_iter() and the transport routine to
* terminate the exchanges on the admin tag set.
*/
blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
nvme_fc_terminate_exchange, &ctrl->ctrl);
blk_mq_tagset_wait_completed_request(&ctrl->admin_tag_set);
}
/*
* This routine stops operation of the controller on the host side.
* On the host os stack side: Admin and IO queues are stopped,
@ -3110,46 +3181,7 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
ctrl->iocnt = 0;
spin_unlock_irqrestore(&ctrl->lock, flags);
/*
* If io queues are present, stop them and terminate all outstanding
* ios on them. As FC allocates FC exchange for each io, the
* transport must contact the LLDD to terminate the exchange,
* thus releasing the FC exchange. We use blk_mq_tagset_busy_itr()
* to tell us what io's are busy and invoke a transport routine
* to kill them with the LLDD. After terminating the exchange
* the LLDD will call the transport's normal io done path, but it
* will have an aborted status. The done path will return the
* io requests back to the block layer as part of normal completions
* (but with error status).
*/
if (ctrl->ctrl.queue_count > 1) {
nvme_stop_queues(&ctrl->ctrl);
blk_mq_tagset_busy_iter(&ctrl->tag_set,
nvme_fc_terminate_exchange, &ctrl->ctrl);
blk_mq_tagset_wait_completed_request(&ctrl->tag_set);
}
/*
* Other transports, which don't have link-level contexts bound
* to sqe's, would try to gracefully shutdown the controller by
* writing the registers for shutdown and polling (call
* nvme_shutdown_ctrl()). Given a bunch of i/o was potentially
* just aborted and we will wait on those contexts, and given
* there was no indication of how live the controlelr is on the
* link, don't send more io to create more contexts for the
* shutdown. Let the controller fail via keepalive failure if
* its still present.
*/
/*
* clean up the admin queue. Same thing as above.
* use blk_mq_tagset_busy_itr() and the transport routine to
* terminate the exchanges.
*/
blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
nvme_fc_terminate_exchange, &ctrl->ctrl);
blk_mq_tagset_wait_completed_request(&ctrl->admin_tag_set);
__nvme_fc_abort_outstanding_ios(ctrl, false);
/* kill the aens as they are a separate path */
nvme_fc_abort_aen_ops(ctrl);
@ -3263,22 +3295,27 @@ static void
__nvme_fc_terminate_io(struct nvme_fc_ctrl *ctrl)
{
/*
* if state is connecting - the error occurred as part of a
* reconnect attempt. The create_association error paths will
* clean up any outstanding io.
*
* if it's a different state - ensure all pending io is
* terminated. Given this can delay while waiting for the
* aborted io to return, we recheck adapter state below
* before changing state.
* if state is CONNECTING - the error occurred as part of a
* reconnect attempt. Abort any ios on the association and
* let the create_association error paths resolve things.
*/
if (ctrl->ctrl.state != NVME_CTRL_CONNECTING) {
nvme_stop_keep_alive(&ctrl->ctrl);
/* will block while waiting for io to terminate */
nvme_fc_delete_association(ctrl);
if (ctrl->ctrl.state == NVME_CTRL_CONNECTING) {
__nvme_fc_abort_outstanding_ios(ctrl, true);
return;
}
/*
* For any other state, kill the association. As this routine
* is a common io abort routine for resetting and such, after
* the association is terminated, ensure that the state is set
* to CONNECTING.
*/
nvme_stop_keep_alive(&ctrl->ctrl);
/* will block while waiting for io to terminate */
nvme_fc_delete_association(ctrl);
if (ctrl->ctrl.state != NVME_CTRL_CONNECTING &&
!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING))
dev_err(ctrl->ctrl.device,
@ -3403,7 +3440,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
{
struct nvme_fc_ctrl *ctrl;
unsigned long flags;
int ret, idx;
int ret, idx, ctrl_loss_tmo;
if (!(rport->remoteport.port_role &
(FC_PORT_ROLE_NVME_DISCOVERY | FC_PORT_ROLE_NVME_TARGET))) {
@ -3429,6 +3466,19 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
goto out_free_ctrl;
}
/*
* if ctrl_loss_tmo is being enforced and the default reconnect delay
* is being used, change to a shorter reconnect delay for FC.
*/
if (opts->max_reconnects != -1 &&
opts->reconnect_delay == NVMF_DEF_RECONNECT_DELAY &&
opts->reconnect_delay > NVME_FC_DEFAULT_RECONNECT_TMO) {
ctrl_loss_tmo = opts->max_reconnects * opts->reconnect_delay;
opts->reconnect_delay = NVME_FC_DEFAULT_RECONNECT_TMO;
opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo,
opts->reconnect_delay);
}
ctrl->ctrl.opts = opts;
ctrl->ctrl.nr_reconnects = 0;
if (lport->dev)

View File

@ -176,7 +176,7 @@ static inline struct nvme_request *nvme_req(struct request *req)
static inline u16 nvme_req_qid(struct request *req)
{
if (!req->rq_disk)
if (!req->q->queuedata)
return 0;
return blk_mq_unique_tag_to_hwq(blk_mq_unique_tag(req)) + 1;
}

View File

@ -3185,6 +3185,8 @@ static const struct pci_device_id nvme_id_table[] = {
NVME_QUIRK_IGNORE_DEV_SUBNQN, },
{ PCI_DEVICE(0x1c5c, 0x1504), /* SK Hynix PC400 */
.driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
{ PCI_DEVICE(0x15b7, 0x2001), /* Sandisk Skyhawk */
.driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001),
.driver_data = NVME_QUIRK_SINGLE_VECTOR },
{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) },

View File

@ -1730,10 +1730,11 @@ static void nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
req->result = cqe->result;
if (wc->wc_flags & IB_WC_WITH_INVALIDATE) {
if (unlikely(wc->ex.invalidate_rkey != req->mr->rkey)) {
if (unlikely(!req->mr ||
wc->ex.invalidate_rkey != req->mr->rkey)) {
dev_err(queue->ctrl->ctrl.device,
"Bogus remote invalidation for rkey %#x\n",
req->mr->rkey);
req->mr ? req->mr->rkey : 0);
nvme_rdma_error_recovery(queue->ctrl);
}
} else if (req->mr) {
@ -1926,7 +1927,6 @@ static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
complete(&queue->cm_done);
return 0;
case RDMA_CM_EVENT_REJECTED:
nvme_rdma_destroy_queue_ib(queue);
cm_error = nvme_rdma_conn_rejected(queue, ev);
break;
case RDMA_CM_EVENT_ROUTE_ERROR:

View File

@ -1126,7 +1126,8 @@ static void nvmet_start_ctrl(struct nvmet_ctrl *ctrl)
* in case a host died before it enabled the controller. Hence, simply
* reset the keep alive timer when the controller is enabled.
*/
mod_delayed_work(system_wq, &ctrl->ka_work, ctrl->kato * HZ);
if (ctrl->kato)
mod_delayed_work(system_wq, &ctrl->ka_work, ctrl->kato * HZ);
}
static void nvmet_clear_ctrl(struct nvmet_ctrl *ctrl)

View File

@ -26,7 +26,7 @@ static u16 nvmet_passthru_override_id_ctrl(struct nvmet_req *req)
struct nvme_ctrl *pctrl = ctrl->subsys->passthru_ctrl;
u16 status = NVME_SC_SUCCESS;
struct nvme_id_ctrl *id;
u32 max_hw_sectors;
int max_hw_sectors;
int page_shift;
id = kzalloc(sizeof(*id), GFP_KERNEL);
@ -48,6 +48,13 @@ static u16 nvmet_passthru_override_id_ctrl(struct nvmet_req *req)
max_hw_sectors = min_not_zero(pctrl->max_segments << (PAGE_SHIFT - 9),
pctrl->max_hw_sectors);
/*
* nvmet_passthru_map_sg is limited to using a single bio so limit
* the mdts based on BIO_MAX_PAGES as well
*/
max_hw_sectors = min_not_zero(BIO_MAX_PAGES << (PAGE_SHIFT - 9),
max_hw_sectors);
page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12;
id->mdts = ilog2(max_hw_sectors) + 9 - page_shift;
@ -180,18 +187,20 @@ static void nvmet_passthru_req_done(struct request *rq,
static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq)
{
int sg_cnt = req->sg_cnt;
struct scatterlist *sg;
int op_flags = 0;
struct bio *bio;
int i, ret;
if (req->sg_cnt > BIO_MAX_PAGES)
return -EINVAL;
if (req->cmd->common.opcode == nvme_cmd_flush)
op_flags = REQ_FUA;
else if (nvme_is_write(req->cmd))
op_flags = REQ_SYNC | REQ_IDLE;
bio = bio_alloc(GFP_KERNEL, min(sg_cnt, BIO_MAX_PAGES));
bio = bio_alloc(GFP_KERNEL, req->sg_cnt);
bio->bi_end_io = bio_put;
bio->bi_opf = req_op(rq) | op_flags;
@ -201,7 +210,6 @@ static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq)
bio_put(bio);
return -EINVAL;
}
sg_cnt--;
}
ret = blk_rq_append_bio(rq, &bio);
@ -236,7 +244,7 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req)
q = ns->queue;
}
rq = nvme_alloc_request(q, req->cmd, BLK_MQ_REQ_NOWAIT, NVME_QID_ANY);
rq = nvme_alloc_request(q, req->cmd, 0, NVME_QID_ANY);
if (IS_ERR(rq)) {
status = NVME_SC_INTERNAL;
goto out_put_ns;