IB/srp: Use SRP transport layer error recovery

Enable fast_io_fail_tmo and dev_loss_tmo functionality for the IB SRP
initiator.  Add kernel module parameters that allow specifying default
values for these timeouts.

Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Acked-by: David Dillow <dillowda@ornl.gov>
Signed-off-by: Roland Dreier <roland@purestorage.com>
This commit is contained in:
Bart Van Assche 2013-10-26 14:34:27 +02:00 committed by Roland Dreier
parent 29c1732480
commit ed9b2264fb
2 changed files with 101 additions and 41 deletions

View File

@ -86,6 +86,27 @@ module_param(topspin_workarounds, int, 0444);
MODULE_PARM_DESC(topspin_workarounds,
"Enable workarounds for Topspin/Cisco SRP target bugs if != 0");
static struct kernel_param_ops srp_tmo_ops;
static int srp_fast_io_fail_tmo = 15;
module_param_cb(fast_io_fail_tmo, &srp_tmo_ops, &srp_fast_io_fail_tmo,
S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(fast_io_fail_tmo,
"Number of seconds between the observation of a transport"
" layer error and failing all I/O. \"off\" means that this"
" functionality is disabled.");
static int srp_dev_loss_tmo = 60;
module_param_cb(dev_loss_tmo, &srp_tmo_ops, &srp_dev_loss_tmo,
S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(dev_loss_tmo,
"Maximum number of seconds that the SRP transport should"
" insulate transport layer errors. After this time has been"
" exceeded the SCSI host is removed. Should be"
" between 1 and " __stringify(SCSI_DEVICE_BLOCK_MAX_TIMEOUT)
" if fast_io_fail_tmo has not been set. \"off\" means that"
" this functionality is disabled.");
static void srp_add_one(struct ib_device *device);
static void srp_remove_one(struct ib_device *device);
static void srp_recv_completion(struct ib_cq *cq, void *target_ptr);
@ -102,6 +123,44 @@ static struct ib_client srp_client = {
static struct ib_sa_client srp_sa_client;
static int srp_tmo_get(char *buffer, const struct kernel_param *kp)
{
int tmo = *(int *)kp->arg;
if (tmo >= 0)
return sprintf(buffer, "%d", tmo);
else
return sprintf(buffer, "off");
}
static int srp_tmo_set(const char *val, const struct kernel_param *kp)
{
int tmo, res;
if (strncmp(val, "off", 3) != 0) {
res = kstrtoint(val, 0, &tmo);
if (res)
goto out;
} else {
tmo = -1;
}
if (kp->arg == &srp_fast_io_fail_tmo)
res = srp_tmo_valid(tmo, srp_dev_loss_tmo);
else
res = srp_tmo_valid(srp_fast_io_fail_tmo, tmo);
if (res)
goto out;
*(int *)kp->arg = tmo;
out:
return res;
}
static struct kernel_param_ops srp_tmo_ops = {
.get = srp_tmo_get,
.set = srp_tmo_set,
};
static inline struct srp_target_port *host_to_target(struct Scsi_Host *host)
{
return (struct srp_target_port *) host->hostdata;
@ -688,23 +747,42 @@ static void srp_free_req(struct srp_target_port *target,
spin_unlock_irqrestore(&target->lock, flags);
}
static void srp_reset_req(struct srp_target_port *target, struct srp_request *req)
static void srp_finish_req(struct srp_target_port *target,
struct srp_request *req, int result)
{
struct scsi_cmnd *scmnd = srp_claim_req(target, req, NULL);
if (scmnd) {
srp_free_req(target, req, scmnd, 0);
scmnd->result = DID_RESET << 16;
scmnd->result = result;
scmnd->scsi_done(scmnd);
}
}
static int srp_reconnect_target(struct srp_target_port *target)
static void srp_terminate_io(struct srp_rport *rport)
{
struct Scsi_Host *shost = target->scsi_host;
int i, ret;
struct srp_target_port *target = rport->lld_data;
int i;
scsi_target_block(&shost->shost_gendev);
for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) {
struct srp_request *req = &target->req_ring[i];
srp_finish_req(target, req, DID_TRANSPORT_FAILFAST << 16);
}
}
/*
* It is up to the caller to ensure that srp_rport_reconnect() calls are
* serialized and that no concurrent srp_queuecommand(), srp_abort(),
* srp_reset_device() or srp_reset_host() calls will occur while this function
* is in progress. One way to realize that is not to call this function
* directly but to call srp_reconnect_rport() instead since that last function
* serializes calls of this function via rport->mutex and also blocks
* srp_queuecommand() calls before invoking this function.
*/
static int srp_rport_reconnect(struct srp_rport *rport)
{
struct srp_target_port *target = rport->lld_data;
int i, ret;
srp_disconnect_target(target);
/*
@ -725,8 +803,7 @@ static int srp_reconnect_target(struct srp_target_port *target)
for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) {
struct srp_request *req = &target->req_ring[i];
if (req->scmnd)
srp_reset_req(target, req);
srp_finish_req(target, req, DID_RESET << 16);
}
INIT_LIST_HEAD(&target->free_tx);
@ -736,28 +813,9 @@ static int srp_reconnect_target(struct srp_target_port *target)
if (ret == 0)
ret = srp_connect_target(target);
scsi_target_unblock(&shost->shost_gendev, ret == 0 ? SDEV_RUNNING :
SDEV_TRANSPORT_OFFLINE);
target->transport_offline = !!ret;
if (ret)
goto err;
shost_printk(KERN_INFO, target->scsi_host, PFX "reconnect succeeded\n");
return ret;
err:
shost_printk(KERN_ERR, target->scsi_host,
PFX "reconnect failed (%d), removing target port.\n", ret);
/*
* We couldn't reconnect, so kill our target port off.
* However, we have to defer the real removal because we
* are in the context of the SCSI error handler now, which
* will deadlock if we call scsi_remove_host().
*/
srp_queue_remove_work(target);
if (ret == 0)
shost_printk(KERN_INFO, target->scsi_host,
PFX "reconnect succeeded\n");
return ret;
}
@ -1356,10 +1414,11 @@ static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd)
struct srp_cmd *cmd;
struct ib_device *dev;
unsigned long flags;
int len;
int len, result;
if (unlikely(target->transport_offline)) {
scmnd->result = DID_NO_CONNECT << 16;
result = srp_chkready(target->rport);
if (unlikely(result)) {
scmnd->result = result;
scmnd->scsi_done(scmnd);
return 0;
}
@ -1757,7 +1816,7 @@ static int srp_abort(struct scsi_cmnd *scmnd)
if (srp_send_tsk_mgmt(target, req->index, scmnd->device->lun,
SRP_TSK_ABORT_TASK) == 0)
ret = SUCCESS;
else if (target->transport_offline)
else if (target->rport->state == SRP_RPORT_LOST)
ret = FAST_IO_FAIL;
else
ret = FAILED;
@ -1784,7 +1843,7 @@ static int srp_reset_device(struct scsi_cmnd *scmnd)
for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) {
struct srp_request *req = &target->req_ring[i];
if (req->scmnd && req->scmnd->device == scmnd->device)
srp_reset_req(target, req);
srp_finish_req(target, req, DID_RESET << 16);
}
return SUCCESS;
@ -1793,14 +1852,10 @@ static int srp_reset_device(struct scsi_cmnd *scmnd)
static int srp_reset_host(struct scsi_cmnd *scmnd)
{
struct srp_target_port *target = host_to_target(scmnd->device->host);
int ret = FAILED;
shost_printk(KERN_ERR, target->scsi_host, PFX "SRP reset_host called\n");
if (!srp_reconnect_target(target))
ret = SUCCESS;
return ret;
return srp_reconnect_rport(target->rport) == 0 ? SUCCESS : FAILED;
}
static int srp_slave_configure(struct scsi_device *sdev)
@ -2637,7 +2692,13 @@ static void srp_remove_one(struct ib_device *device)
}
static struct srp_function_template ib_srp_transport_functions = {
.has_rport_state = true,
.reset_timer_if_blocked = true,
.fast_io_fail_tmo = &srp_fast_io_fail_tmo,
.dev_loss_tmo = &srp_dev_loss_tmo,
.reconnect = srp_rport_reconnect,
.rport_delete = srp_rport_delete,
.terminate_rport_io = srp_terminate_io,
};
static int __init srp_init_module(void)

View File

@ -140,7 +140,6 @@ struct srp_target_port {
unsigned int cmd_sg_cnt;
unsigned int indirect_size;
bool allow_ext_sg;
bool transport_offline;
/* Everything above this point is used in the hot path of
* command processing. Try to keep them packed into cachelines.