linux_dsm_epyc7002/drivers/nvme/host/multipath.c
Baegjae Sung 9bd82b1a44 nvme-multipath: fix sysfs dangerously created links
If multipathing is enabled, each NVMe subsystem creates a head
namespace (e.g., nvme0n1) and multiple private namespaces
(e.g., nvme0c0n1 and nvme0c1n1) in sysfs. When creating links for
private namespaces, links of head namespace are used, so the
namespace creation order must be followed (e.g., nvme0n1 ->
nvme0c1n1). If the order is not followed, links of sysfs will be
incomplete or kernel panic will occur.

The kernel panic was:
  kernel BUG at fs/sysfs/symlink.c:27!
  Call Trace:
    nvme_mpath_add_disk_links+0x5d/0x80 [nvme_core]
    nvme_validate_ns+0x5c2/0x850 [nvme_core]
    nvme_scan_work+0x1af/0x2d0 [nvme_core]

Correct order
Context A     Context B
nvme0n1
nvme0c0n1     nvme0c1n1

Incorrect order
Context A     Context B
              nvme0c1n1
nvme0n1
nvme0c0n1

The nvme_mpath_add_disk (for creating head namespace) is called
just before the nvme_mpath_add_disk_links (for creating private
namespaces). In nvme_mpath_add_disk, the first context acquires
the lock of subsystem and creates a head namespace, and other
contexts do nothing by checking GENHD_FL_UP of a head namespace
after waiting to acquire the lock. We verified the code with or
without multipathing using three vendors of dual-port NVMe SSDs.

Signed-off-by: Baegjae Sung <baegjae@gmail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <keith.busch@intel.com>
2018-02-28 02:46:48 -07:00

257 lines
6.7 KiB
C

/*
* Copyright (c) 2017 Christoph Hellwig.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#include <linux/moduleparam.h>
#include "nvme.h"
static bool multipath = true;
module_param(multipath, bool, 0644);
MODULE_PARM_DESC(multipath,
"turn on native support for multiple controllers per subsystem");
void nvme_failover_req(struct request *req)
{
struct nvme_ns *ns = req->q->queuedata;
unsigned long flags;
spin_lock_irqsave(&ns->head->requeue_lock, flags);
blk_steal_bios(&ns->head->requeue_list, req);
spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
blk_mq_end_request(req, 0);
nvme_reset_ctrl(ns->ctrl);
kblockd_schedule_work(&ns->head->requeue_work);
}
bool nvme_req_needs_failover(struct request *req, blk_status_t error)
{
if (!(req->cmd_flags & REQ_NVME_MPATH))
return false;
return blk_path_error(error);
}
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;
mutex_lock(&ctrl->namespaces_mutex);
list_for_each_entry(ns, &ctrl->namespaces, list) {
if (ns->head->disk)
kblockd_schedule_work(&ns->head->requeue_work);
}
mutex_unlock(&ctrl->namespaces_mutex);
}
static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head)
{
struct nvme_ns *ns;
list_for_each_entry_rcu(ns, &head->list, siblings) {
if (ns->ctrl->state == NVME_CTRL_LIVE) {
rcu_assign_pointer(head->current_path, ns);
return ns;
}
}
return NULL;
}
inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
{
struct nvme_ns *ns = srcu_dereference(head->current_path, &head->srcu);
if (unlikely(!ns || ns->ctrl->state != NVME_CTRL_LIVE))
ns = __nvme_find_path(head);
return ns;
}
static blk_qc_t nvme_ns_head_make_request(struct request_queue *q,
struct bio *bio)
{
struct nvme_ns_head *head = q->queuedata;
struct device *dev = disk_to_dev(head->disk);
struct nvme_ns *ns;
blk_qc_t ret = BLK_QC_T_NONE;
int srcu_idx;
srcu_idx = srcu_read_lock(&head->srcu);
ns = nvme_find_path(head);
if (likely(ns)) {
bio->bi_disk = ns->disk;
bio->bi_opf |= REQ_NVME_MPATH;
ret = direct_make_request(bio);
} else if (!list_empty_careful(&head->list)) {
dev_warn_ratelimited(dev, "no path available - requeuing I/O\n");
spin_lock_irq(&head->requeue_lock);
bio_list_add(&head->requeue_list, bio);
spin_unlock_irq(&head->requeue_lock);
} else {
dev_warn_ratelimited(dev, "no path - failing I/O\n");
bio->bi_status = BLK_STS_IOERR;
bio_endio(bio);
}
srcu_read_unlock(&head->srcu, srcu_idx);
return ret;
}
static bool nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc)
{
struct nvme_ns_head *head = q->queuedata;
struct nvme_ns *ns;
bool found = false;
int srcu_idx;
srcu_idx = srcu_read_lock(&head->srcu);
ns = srcu_dereference(head->current_path, &head->srcu);
if (likely(ns && ns->ctrl->state == NVME_CTRL_LIVE))
found = ns->queue->poll_fn(q, qc);
srcu_read_unlock(&head->srcu, srcu_idx);
return found;
}
static void nvme_requeue_work(struct work_struct *work)
{
struct nvme_ns_head *head =
container_of(work, struct nvme_ns_head, requeue_work);
struct bio *bio, *next;
spin_lock_irq(&head->requeue_lock);
next = bio_list_get(&head->requeue_list);
spin_unlock_irq(&head->requeue_lock);
while ((bio = next) != NULL) {
next = bio->bi_next;
bio->bi_next = NULL;
/*
* Reset disk to the mpath node and resubmit to select a new
* path.
*/
bio->bi_disk = head->disk;
generic_make_request(bio);
}
}
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
{
struct request_queue *q;
bool vwc = false;
bio_list_init(&head->requeue_list);
spin_lock_init(&head->requeue_lock);
INIT_WORK(&head->requeue_work, nvme_requeue_work);
/*
* Add a multipath node if the subsystems supports multiple controllers.
* We also do this for private namespaces as the namespace sharing data could
* change after a rescan.
*/
if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath)
return 0;
q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE);
if (!q)
goto out;
q->queuedata = head;
blk_queue_make_request(q, nvme_ns_head_make_request);
q->poll_fn = nvme_ns_head_poll;
queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
/* set to a default value for 512 until disk is validated */
blk_queue_logical_block_size(q, 512);
/* we need to propagate up the VMC settings */
if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
vwc = true;
blk_queue_write_cache(q, vwc, vwc);
head->disk = alloc_disk(0);
if (!head->disk)
goto out_cleanup_queue;
head->disk->fops = &nvme_ns_head_ops;
head->disk->private_data = head;
head->disk->queue = q;
head->disk->flags = GENHD_FL_EXT_DEVT;
sprintf(head->disk->disk_name, "nvme%dn%d",
ctrl->subsys->instance, head->instance);
return 0;
out_cleanup_queue:
blk_cleanup_queue(q);
out:
return -ENOMEM;
}
void nvme_mpath_add_disk(struct nvme_ns_head *head)
{
if (!head->disk)
return;
mutex_lock(&head->subsys->lock);
if (!(head->disk->flags & GENHD_FL_UP)) {
device_add_disk(&head->subsys->dev, head->disk);
if (sysfs_create_group(&disk_to_dev(head->disk)->kobj,
&nvme_ns_id_attr_group))
pr_warn("%s: failed to create sysfs group for identification\n",
head->disk->disk_name);
}
mutex_unlock(&head->subsys->lock);
}
void nvme_mpath_add_disk_links(struct nvme_ns *ns)
{
struct kobject *slave_disk_kobj, *holder_disk_kobj;
if (!ns->head->disk)
return;
slave_disk_kobj = &disk_to_dev(ns->disk)->kobj;
if (sysfs_create_link(ns->head->disk->slave_dir, slave_disk_kobj,
kobject_name(slave_disk_kobj)))
return;
holder_disk_kobj = &disk_to_dev(ns->head->disk)->kobj;
if (sysfs_create_link(ns->disk->part0.holder_dir, holder_disk_kobj,
kobject_name(holder_disk_kobj)))
sysfs_remove_link(ns->head->disk->slave_dir,
kobject_name(slave_disk_kobj));
}
void nvme_mpath_remove_disk(struct nvme_ns_head *head)
{
if (!head->disk)
return;
sysfs_remove_group(&disk_to_dev(head->disk)->kobj,
&nvme_ns_id_attr_group);
del_gendisk(head->disk);
blk_set_queue_dying(head->disk->queue);
/* make sure all pending bios are cleaned up */
kblockd_schedule_work(&head->requeue_work);
flush_work(&head->requeue_work);
blk_cleanup_queue(head->disk->queue);
put_disk(head->disk);
}
void nvme_mpath_remove_disk_links(struct nvme_ns *ns)
{
if (!ns->head->disk)
return;
sysfs_remove_link(ns->disk->part0.holder_dir,
kobject_name(&disk_to_dev(ns->head->disk)->kobj));
sysfs_remove_link(ns->head->disk->slave_dir,
kobject_name(&disk_to_dev(ns->disk)->kobj));
}