Merge branch 'nvme-5.4' of git://git.infradead.org/nvme into for-linus

Pull NVMe changes from Sagi:

"This set consists of various fixes and cleanups:
 - controller removal race fix from Balbir
 - quirk additions from Gabriel and Jian-Hong
 - nvme-pci power state save fix from Mario
 - Add 64bit user commands (for 64bit registers) from Marta
 - nvme-rdma/nvme-tcp fixes from Max, Mark and Me
 - Minor cleanups and nits from James, Dan and John"

* 'nvme-5.4' of git://git.infradead.org/nvme:
  nvme-rdma: fix possible use-after-free in connect timeout
  nvme: Move ctrl sqsize to generic space
  nvme: Add ctrl attributes for queue_count and sqsize
  nvme: allow 64-bit results in passthru commands
  nvme: Add quirk for Kingston NVME SSD running FW E8FK11.T
  nvmet-tcp: remove superflous check on request sgl
  Added QUIRKs for ADATA XPG SX8200 Pro 512GB
  nvme-rdma: Fix max_hw_sectors calculation
  nvme: fix an error code in nvme_init_subsystem()
  nvme-pci: Save PCI state before putting drive into deepest state
  nvme-tcp: fix wrong stop condition in io_work
  nvme-pci: Fix a race in controller removal
  nvmet: change ppl to lpp
This commit is contained in:
Jens Axboe 2019-09-27 13:17:37 -06:00
commit 2d5ba0c712
8 changed files with 177 additions and 51 deletions

View File

@ -102,10 +102,13 @@ static void nvme_set_queue_dying(struct nvme_ns *ns)
*/
if (!ns->disk || test_and_set_bit(NVME_NS_DEAD, &ns->flags))
return;
revalidate_disk(ns->disk);
blk_set_queue_dying(ns->queue);
/* Forcibly unquiesce queues to avoid blocking dispatch */
blk_mq_unquiesce_queue(ns->queue);
/*
* Revalidate after unblocking dispatchers that may be holding bd_butex
*/
revalidate_disk(ns->disk);
}
static void nvme_queue_scan(struct nvme_ctrl *ctrl)
@ -847,7 +850,7 @@ static void *nvme_add_user_metadata(struct bio *bio, void __user *ubuf,
static int nvme_submit_user_cmd(struct request_queue *q,
struct nvme_command *cmd, void __user *ubuffer,
unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
u32 meta_seed, u32 *result, unsigned timeout)
u32 meta_seed, u64 *result, unsigned timeout)
{
bool write = nvme_is_write(cmd);
struct nvme_ns *ns = q->queuedata;
@ -888,7 +891,7 @@ static int nvme_submit_user_cmd(struct request_queue *q,
else
ret = nvme_req(req)->status;
if (result)
*result = le32_to_cpu(nvme_req(req)->result.u32);
*result = le64_to_cpu(nvme_req(req)->result.u64);
if (meta && !ret && !write) {
if (copy_to_user(meta_buffer, meta, meta_len))
ret = -EFAULT;
@ -1335,6 +1338,54 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
struct nvme_command c;
unsigned timeout = 0;
u32 effects;
u64 result;
int status;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
return -EFAULT;
if (cmd.flags)
return -EINVAL;
memset(&c, 0, sizeof(c));
c.common.opcode = cmd.opcode;
c.common.flags = cmd.flags;
c.common.nsid = cpu_to_le32(cmd.nsid);
c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
c.common.cdw10 = cpu_to_le32(cmd.cdw10);
c.common.cdw11 = cpu_to_le32(cmd.cdw11);
c.common.cdw12 = cpu_to_le32(cmd.cdw12);
c.common.cdw13 = cpu_to_le32(cmd.cdw13);
c.common.cdw14 = cpu_to_le32(cmd.cdw14);
c.common.cdw15 = cpu_to_le32(cmd.cdw15);
if (cmd.timeout_ms)
timeout = msecs_to_jiffies(cmd.timeout_ms);
effects = nvme_passthru_start(ctrl, ns, cmd.opcode);
status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
(void __user *)(uintptr_t)cmd.addr, cmd.data_len,
(void __user *)(uintptr_t)cmd.metadata,
cmd.metadata_len, 0, &result, timeout);
nvme_passthru_end(ctrl, effects);
if (status >= 0) {
if (put_user(result, &ucmd->result))
return -EFAULT;
}
return status;
}
static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
struct nvme_passthru_cmd64 __user *ucmd)
{
struct nvme_passthru_cmd64 cmd;
struct nvme_command c;
unsigned timeout = 0;
u32 effects;
int status;
if (!capable(CAP_SYS_ADMIN))
@ -1405,6 +1456,41 @@ static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
srcu_read_unlock(&head->srcu, idx);
}
static bool is_ctrl_ioctl(unsigned int cmd)
{
if (cmd == NVME_IOCTL_ADMIN_CMD || cmd == NVME_IOCTL_ADMIN64_CMD)
return true;
if (is_sed_ioctl(cmd))
return true;
return false;
}
static int nvme_handle_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd,
void __user *argp,
struct nvme_ns_head *head,
int srcu_idx)
{
struct nvme_ctrl *ctrl = ns->ctrl;
int ret;
nvme_get_ctrl(ns->ctrl);
nvme_put_ns_from_disk(head, srcu_idx);
switch (cmd) {
case NVME_IOCTL_ADMIN_CMD:
ret = nvme_user_cmd(ctrl, NULL, argp);
break;
case NVME_IOCTL_ADMIN64_CMD:
ret = nvme_user_cmd64(ctrl, NULL, argp);
break;
default:
ret = sed_ioctl(ctrl->opal_dev, cmd, argp);
break;
}
nvme_put_ctrl(ctrl);
return ret;
}
static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
@ -1422,20 +1508,8 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
* seperately and drop the ns SRCU reference early. This avoids a
* deadlock when deleting namespaces using the passthrough interface.
*/
if (cmd == NVME_IOCTL_ADMIN_CMD || is_sed_ioctl(cmd)) {
struct nvme_ctrl *ctrl = ns->ctrl;
nvme_get_ctrl(ns->ctrl);
nvme_put_ns_from_disk(head, srcu_idx);
if (cmd == NVME_IOCTL_ADMIN_CMD)
ret = nvme_user_cmd(ctrl, NULL, argp);
else
ret = sed_ioctl(ctrl->opal_dev, cmd, argp);
nvme_put_ctrl(ctrl);
return ret;
}
if (is_ctrl_ioctl(cmd))
return nvme_handle_ctrl_ioctl(ns, cmd, argp, head, srcu_idx);
switch (cmd) {
case NVME_IOCTL_ID:
@ -1448,6 +1522,9 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
case NVME_IOCTL_SUBMIT_IO:
ret = nvme_submit_io(ns, argp);
break;
case NVME_IOCTL_IO64_CMD:
ret = nvme_user_cmd64(ns->ctrl, ns, argp);
break;
default:
if (ns->ndev)
ret = nvme_nvm_ioctl(ns, cmd, arg);
@ -2289,6 +2366,16 @@ static const struct nvme_core_quirk_entry core_quirks[] = {
.vid = 0x14a4,
.fr = "22301111",
.quirks = NVME_QUIRK_SIMPLE_SUSPEND,
},
{
/*
* This Kingston E8FK11.T firmware version has no interrupt
* after resume with actions related to suspend to idle
* https://bugzilla.kernel.org/show_bug.cgi?id=204887
*/
.vid = 0x2646,
.fr = "E8FK11.T",
.quirks = NVME_QUIRK_SIMPLE_SUSPEND,
}
};
@ -2540,8 +2627,9 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
list_add_tail(&subsys->entry, &nvme_subsystems);
}
if (sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj,
dev_name(ctrl->device))) {
ret = sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj,
dev_name(ctrl->device));
if (ret) {
dev_err(ctrl->device,
"failed to create sysfs link from subsystem.\n");
goto out_put_subsystem;
@ -2838,6 +2926,8 @@ static long nvme_dev_ioctl(struct file *file, unsigned int cmd,
switch (cmd) {
case NVME_IOCTL_ADMIN_CMD:
return nvme_user_cmd(ctrl, NULL, argp);
case NVME_IOCTL_ADMIN64_CMD:
return nvme_user_cmd64(ctrl, NULL, argp);
case NVME_IOCTL_IO_CMD:
return nvme_dev_user_cmd(ctrl, argp);
case NVME_IOCTL_RESET:
@ -3045,6 +3135,8 @@ static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
nvme_show_int_function(cntlid);
nvme_show_int_function(numa_node);
nvme_show_int_function(queue_count);
nvme_show_int_function(sqsize);
static ssize_t nvme_sysfs_delete(struct device *dev,
struct device_attribute *attr, const char *buf,
@ -3125,6 +3217,8 @@ static struct attribute *nvme_dev_attrs[] = {
&dev_attr_address.attr,
&dev_attr_state.attr,
&dev_attr_numa_node.attr,
&dev_attr_queue_count.attr,
&dev_attr_sqsize.attr,
NULL
};

View File

@ -221,6 +221,7 @@ struct nvme_ctrl {
u16 oacs;
u16 nssa;
u16 nr_streams;
u16 sqsize;
u32 max_namespaces;
atomic_t abort_limit;
u8 vwc;
@ -269,7 +270,6 @@ struct nvme_ctrl {
u16 hmmaxd;
/* Fabrics only */
u16 sqsize;
u32 ioccsz;
u32 iorcsz;
u16 icdoff;

View File

@ -2946,11 +2946,21 @@ static int nvme_suspend(struct device *dev)
if (ret < 0)
goto unfreeze;
/*
* A saved state prevents pci pm from generically controlling the
* device's power. If we're using protocol specific settings, we don't
* want pci interfering.
*/
pci_save_state(pdev);
ret = nvme_set_power_state(ctrl, ctrl->npss);
if (ret < 0)
goto unfreeze;
if (ret) {
/* discard the saved state */
pci_load_saved_state(pdev, NULL);
/*
* Clearing npss forces a controller reset on resume. The
* correct value will be resdicovered then.
@ -2958,14 +2968,7 @@ static int nvme_suspend(struct device *dev)
nvme_dev_disable(ndev, true);
ctrl->npss = 0;
ret = 0;
goto unfreeze;
}
/*
* A saved state prevents pci pm from generically controlling the
* device's power. If we're using protocol specific settings, we don't
* want pci interfering.
*/
pci_save_state(pdev);
unfreeze:
nvme_unfreeze(ctrl);
return ret;
@ -3090,6 +3093,9 @@ static const struct pci_device_id nvme_id_table[] = {
.driver_data = NVME_QUIRK_LIGHTNVM, },
{ PCI_DEVICE(0x10ec, 0x5762), /* ADATA SX6000LNP */
.driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, },
{ PCI_DEVICE(0x1cc1, 0x8201), /* ADATA SX8200PNP 512GB */
.driver_data = NVME_QUIRK_NO_DEEPEST_PS |
NVME_QUIRK_IGNORE_DEV_SUBNQN, },
{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) },
{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) },

View File

@ -427,7 +427,7 @@ static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue)
static int nvme_rdma_get_max_fr_pages(struct ib_device *ibdev)
{
return min_t(u32, NVME_RDMA_MAX_SEGMENTS,
ibdev->attrs.max_fast_reg_page_list_len);
ibdev->attrs.max_fast_reg_page_list_len - 1);
}
static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
@ -437,7 +437,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
const int cq_factor = send_wr_factor + 1; /* + RECV */
int comp_vector, idx = nvme_rdma_queue_idx(queue);
enum ib_poll_context poll_ctx;
int ret;
int ret, pages_per_mr;
queue->device = nvme_rdma_find_get_device(queue->cm_id);
if (!queue->device) {
@ -479,10 +479,16 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
goto out_destroy_qp;
}
/*
* Currently we don't use SG_GAPS MR's so if the first entry is
* misaligned we'll end up using two entries for a single data page,
* so one additional entry is required.
*/
pages_per_mr = nvme_rdma_get_max_fr_pages(ibdev) + 1;
ret = ib_mr_pool_init(queue->qp, &queue->qp->rdma_mrs,
queue->queue_size,
IB_MR_TYPE_MEM_REG,
nvme_rdma_get_max_fr_pages(ibdev), 0);
pages_per_mr, 0);
if (ret) {
dev_err(queue->ctrl->ctrl.device,
"failed to initialize MR pool sized %d for QID %d\n",
@ -614,7 +620,8 @@ static int nvme_rdma_start_queue(struct nvme_rdma_ctrl *ctrl, int idx)
if (!ret) {
set_bit(NVME_RDMA_Q_LIVE, &queue->flags);
} else {
__nvme_rdma_stop_queue(queue);
if (test_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags))
__nvme_rdma_stop_queue(queue);
dev_info(ctrl->ctrl.device,
"failed to connect queue: %d ret=%d\n", idx, ret);
}
@ -820,8 +827,8 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
if (error)
goto out_stop_queue;
ctrl->ctrl.max_hw_sectors =
(ctrl->max_fr_pages - 1) << (ilog2(SZ_4K) - 9);
ctrl->ctrl.max_segments = ctrl->max_fr_pages;
ctrl->ctrl.max_hw_sectors = ctrl->max_fr_pages << (ilog2(SZ_4K) - 9);
blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);

View File

@ -1042,7 +1042,7 @@ static void nvme_tcp_io_work(struct work_struct *w)
{
struct nvme_tcp_queue *queue =
container_of(w, struct nvme_tcp_queue, io_work);
unsigned long start = jiffies + msecs_to_jiffies(1);
unsigned long deadline = jiffies + msecs_to_jiffies(1);
do {
bool pending = false;
@ -1067,7 +1067,7 @@ static void nvme_tcp_io_work(struct work_struct *w)
if (!pending)
return;
} while (time_after(jiffies, start)); /* quota is exhausted */
} while (!time_after(jiffies, deadline)); /* quota is exhausted */
queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
}

View File

@ -11,10 +11,10 @@
void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id)
{
const struct queue_limits *ql = &bdev_get_queue(bdev)->limits;
/* Number of physical blocks per logical block. */
const u32 ppl = ql->physical_block_size / ql->logical_block_size;
/* Physical blocks per logical block, 0's based. */
const __le16 ppl0b = to0based(ppl);
/* Number of logical blocks per physical block. */
const u32 lpp = ql->physical_block_size / ql->logical_block_size;
/* Logical blocks per physical block, 0's based. */
const __le16 lpp0b = to0based(lpp);
/*
* For NVMe 1.2 and later, bit 1 indicates that the fields NAWUN,
@ -25,9 +25,9 @@ void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id)
* field from the identify controller data structure should be used.
*/
id->nsfeat |= 1 << 1;
id->nawun = ppl0b;
id->nawupf = ppl0b;
id->nacwu = ppl0b;
id->nawun = lpp0b;
id->nawupf = lpp0b;
id->nacwu = lpp0b;
/*
* Bit 4 indicates that the fields NPWG, NPWA, NPDG, NPDA, and
@ -36,7 +36,7 @@ void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id)
*/
id->nsfeat |= 1 << 4;
/* NPWG = Namespace Preferred Write Granularity. 0's based */
id->npwg = ppl0b;
id->npwg = lpp0b;
/* NPWA = Namespace Preferred Write Alignment. 0's based */
id->npwa = id->npwg;
/* NPDG = Namespace Preferred Deallocate Granularity. 0's based */

View File

@ -348,8 +348,7 @@ static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd)
return 0;
err:
if (cmd->req.sg_cnt)
sgl_free(cmd->req.sg);
sgl_free(cmd->req.sg);
return NVME_SC_INTERNAL;
}
@ -554,8 +553,7 @@ static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd)
if (queue->nvme_sq.sqhd_disabled) {
kfree(cmd->iov);
if (cmd->req.sg_cnt)
sgl_free(cmd->req.sg);
sgl_free(cmd->req.sg);
}
return 1;
@ -586,8 +584,7 @@ static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd,
return -EAGAIN;
kfree(cmd->iov);
if (cmd->req.sg_cnt)
sgl_free(cmd->req.sg);
sgl_free(cmd->req.sg);
cmd->queue->snd_cmd = NULL;
nvmet_tcp_put_cmd(cmd);
return 1;
@ -1310,8 +1307,7 @@ static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd)
nvmet_req_uninit(&cmd->req);
nvmet_tcp_unmap_pdu_iovec(cmd);
kfree(cmd->iov);
if (cmd->req.sg_cnt)
sgl_free(cmd->req.sg);
sgl_free(cmd->req.sg);
}
static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue)

View File

@ -45,6 +45,27 @@ struct nvme_passthru_cmd {
__u32 result;
};
struct nvme_passthru_cmd64 {
__u8 opcode;
__u8 flags;
__u16 rsvd1;
__u32 nsid;
__u32 cdw2;
__u32 cdw3;
__u64 metadata;
__u64 addr;
__u32 metadata_len;
__u32 data_len;
__u32 cdw10;
__u32 cdw11;
__u32 cdw12;
__u32 cdw13;
__u32 cdw14;
__u32 cdw15;
__u32 timeout_ms;
__u64 result;
};
#define nvme_admin_cmd nvme_passthru_cmd
#define NVME_IOCTL_ID _IO('N', 0x40)
@ -54,5 +75,7 @@ struct nvme_passthru_cmd {
#define NVME_IOCTL_RESET _IO('N', 0x44)
#define NVME_IOCTL_SUBSYS_RESET _IO('N', 0x45)
#define NVME_IOCTL_RESCAN _IO('N', 0x46)
#define NVME_IOCTL_ADMIN64_CMD _IOWR('N', 0x47, struct nvme_passthru_cmd64)
#define NVME_IOCTL_IO64_CMD _IOWR('N', 0x48, struct nvme_passthru_cmd64)
#endif /* _UAPI_LINUX_NVME_IOCTL_H */