2011-01-21 00:50:14 +07:00
|
|
|
/*
|
|
|
|
* NVM Express device driver
|
2014-03-24 21:11:22 +07:00
|
|
|
* Copyright (c) 2011-2014, Intel Corporation.
|
2011-01-21 00:50:14 +07:00
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or modify it
|
|
|
|
* under the terms and conditions of the GNU General Public License,
|
|
|
|
* version 2, as published by the Free Software Foundation.
|
|
|
|
*
|
|
|
|
* This program is distributed in the hope it will be useful, but WITHOUT
|
|
|
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
|
|
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
|
|
|
* more details.
|
|
|
|
*/
|
|
|
|
|
2011-05-13 00:50:28 +07:00
|
|
|
#include <linux/bitops.h>
|
2011-01-21 00:50:14 +07:00
|
|
|
#include <linux/blkdev.h>
|
2014-11-04 22:20:14 +07:00
|
|
|
#include <linux/blk-mq.h>
|
2014-03-24 23:46:25 +07:00
|
|
|
#include <linux/cpu.h>
|
2011-05-06 19:37:54 +07:00
|
|
|
#include <linux/delay.h>
|
2011-01-21 00:50:14 +07:00
|
|
|
#include <linux/errno.h>
|
|
|
|
#include <linux/fs.h>
|
|
|
|
#include <linux/genhd.h>
|
2014-04-03 04:45:37 +07:00
|
|
|
#include <linux/hdreg.h>
|
2011-05-06 19:45:47 +07:00
|
|
|
#include <linux/idr.h>
|
2011-01-21 00:50:14 +07:00
|
|
|
#include <linux/init.h>
|
|
|
|
#include <linux/interrupt.h>
|
|
|
|
#include <linux/io.h>
|
|
|
|
#include <linux/kdev_t.h>
|
2011-03-03 06:37:18 +07:00
|
|
|
#include <linux/kthread.h>
|
2011-01-21 00:50:14 +07:00
|
|
|
#include <linux/kernel.h>
|
2015-06-02 03:28:14 +07:00
|
|
|
#include <linux/list_sort.h>
|
2011-01-21 00:50:14 +07:00
|
|
|
#include <linux/mm.h>
|
|
|
|
#include <linux/module.h>
|
|
|
|
#include <linux/moduleparam.h>
|
|
|
|
#include <linux/pci.h>
|
2011-02-06 19:53:23 +07:00
|
|
|
#include <linux/poison.h>
|
2013-07-09 04:26:25 +07:00
|
|
|
#include <linux/ptrace.h>
|
2011-01-21 00:50:14 +07:00
|
|
|
#include <linux/sched.h>
|
|
|
|
#include <linux/slab.h>
|
NVMe: Metadata format support
Adds support for NVMe metadata formats and exposes block devices for
all namespaces regardless of their format. Namespace formats that are
unusable will have disk capacity set to 0, but a handle to the block
device is created to simplify device management. A namespace is not
usable when the format requires host interleave block and metadata in
single buffer, has no provisioned storage, or has better data but failed
to register with blk integrity.
The namespace has to be scanned in two phases to support separate
metadata formats. The first establishes the sector size and capacity
prior to invoking add_disk. If metadata is required, the capacity will
be temporarily set to 0 until it can be revalidated and registered with
the integrity extensions after add_disk completes.
The driver relies on the integrity extensions to provide the metadata
buffer. NVMe requires this be a single physically contiguous region,
so only one integrity segment is allowed per command. If the metadata
is used for T10 PI, the driver provides mappings to save and restore
the reftag physical block translation. The driver provides no-op
functions for generate and verify if metadata is not used for protection
information. This way the setup is always provided by the block layer.
If a request does not supply a required metadata buffer, the command
is failed with bad address. This could only happen if a user manually
disables verify/generate on such a disk. The only exception to where
this is okay is if the controller is capable of stripping/generating
the metadata, which is possible on some types of formats.
The metadata scatter gather list now occupies the spot in the nvme_iod
that used to be used to link retryable IOD's, but we don't do that
anymore, so the field was unused.
Signed-off-by: Keith Busch <keith.busch@intel.com>
2015-02-20 03:39:03 +07:00
|
|
|
#include <linux/t10-pi.h>
|
2011-01-21 00:50:14 +07:00
|
|
|
#include <linux/types.h>
|
2015-10-15 19:10:52 +07:00
|
|
|
#include <linux/pr.h>
|
2013-03-05 08:40:58 +07:00
|
|
|
#include <scsi/sg.h>
|
2015-08-28 14:27:14 +07:00
|
|
|
#include <linux/io-64-nonatomic-lo-hi.h>
|
2015-10-15 19:10:52 +07:00
|
|
|
#include <asm/unaligned.h>
|
2012-02-07 09:45:33 +07:00
|
|
|
|
2015-10-02 20:25:49 +07:00
|
|
|
#include <uapi/linux/nvme_ioctl.h>
|
2015-10-03 20:46:41 +07:00
|
|
|
#include "nvme.h"
|
|
|
|
|
2015-02-04 01:21:42 +07:00
|
|
|
#define NVME_MINORS		(1U << MINORBITS)
#define NVME_Q_DEPTH		1024
#define NVME_AQ_DEPTH		256
/*
 * Size in bytes of a queue holding @depth entries.  The argument is
 * parenthesized so that an expression such as SQ_SIZE(a + b) is sized as
 * a whole instead of only multiplying its last operand.
 */
#define SQ_SIZE(depth)		((depth) * sizeof(struct nvme_command))
#define CQ_SIZE(depth)		((depth) * sizeof(struct nvme_completion))
/* shutdown_timeout is a module parameter expressed in seconds. */
#define SHUTDOWN_TIMEOUT	(shutdown_timeout * HZ)
|
2014-05-14 00:42:02 +07:00
|
|
|
|
2015-11-26 15:08:36 +07:00
|
|
|
unsigned char admin_timeout = 60;
|
2014-05-14 00:42:02 +07:00
|
|
|
module_param(admin_timeout, byte, 0644);
|
|
|
|
MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
|
2011-01-21 00:50:14 +07:00
|
|
|
|
2014-06-04 10:04:30 +07:00
|
|
|
unsigned char nvme_io_timeout = 30;
|
|
|
|
module_param_named(io_timeout, nvme_io_timeout, byte, 0644);
|
2014-04-05 00:43:36 +07:00
|
|
|
MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");
|
2011-01-21 00:50:14 +07:00
|
|
|
|
2014-07-01 22:33:32 +07:00
|
|
|
static unsigned char shutdown_timeout = 5;
|
|
|
|
module_param(shutdown_timeout, byte, 0644);
|
|
|
|
MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");
|
|
|
|
|
2011-01-21 00:50:14 +07:00
|
|
|
static int nvme_major;
|
|
|
|
module_param(nvme_major, int, 0);
|
|
|
|
|
2015-02-04 01:21:42 +07:00
|
|
|
static int nvme_char_major;
|
|
|
|
module_param(nvme_char_major, int, 0);
|
|
|
|
|
2011-02-06 19:28:06 +07:00
|
|
|
static int use_threaded_interrupts;
|
|
|
|
module_param(use_threaded_interrupts, int, 0);
|
|
|
|
|
2015-07-20 23:14:09 +07:00
|
|
|
static bool use_cmb_sqes = true;
|
|
|
|
module_param(use_cmb_sqes, bool, 0644);
|
|
|
|
MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes");
|
|
|
|
|
2011-03-03 06:37:18 +07:00
|
|
|
static DEFINE_SPINLOCK(dev_list_lock);
|
|
|
|
static LIST_HEAD(dev_list);
|
|
|
|
static struct task_struct *nvme_thread;
|
2013-12-11 03:10:36 +07:00
|
|
|
static struct workqueue_struct *nvme_workq;
|
2014-04-08 06:10:11 +07:00
|
|
|
static wait_queue_head_t nvme_kthread_wait;
|
2011-03-03 06:37:18 +07:00
|
|
|
|
2015-02-04 01:21:42 +07:00
|
|
|
static struct class *nvme_class;
|
|
|
|
|
2015-11-26 16:06:56 +07:00
|
|
|
struct nvme_dev;
|
|
|
|
struct nvme_queue;
|
|
|
|
|
2015-10-02 23:49:23 +07:00
|
|
|
static int __nvme_reset(struct nvme_dev *dev);
|
2015-06-05 23:30:08 +07:00
|
|
|
static int nvme_reset(struct nvme_dev *dev);
|
2015-11-04 10:37:26 +07:00
|
|
|
static void nvme_process_cq(struct nvme_queue *nvmeq);
|
2015-10-03 14:49:23 +07:00
|
|
|
static void nvme_dead_ctrl(struct nvme_dev *dev);
|
2013-12-11 03:10:37 +07:00
|
|
|
|
2013-12-11 03:10:40 +07:00
|
|
|
struct async_cmd_info {
|
|
|
|
struct kthread_work work;
|
|
|
|
struct kthread_worker *worker;
|
2014-11-04 22:20:14 +07:00
|
|
|
struct request *req;
|
2013-12-11 03:10:40 +07:00
|
|
|
u32 result;
|
|
|
|
int status;
|
|
|
|
void *ctx;
|
|
|
|
};
|
2011-03-03 06:37:18 +07:00
|
|
|
|
2015-11-26 16:06:56 +07:00
|
|
|
/*
|
|
|
|
* Represents an NVM Express device. Each nvme_dev is a PCI function.
|
|
|
|
*/
|
|
|
|
struct nvme_dev {
|
|
|
|
struct list_head node;
|
|
|
|
struct nvme_queue **queues;
|
|
|
|
struct blk_mq_tag_set tagset;
|
|
|
|
struct blk_mq_tag_set admin_tagset;
|
|
|
|
u32 __iomem *dbs;
|
|
|
|
struct device *dev;
|
|
|
|
struct dma_pool *prp_page_pool;
|
|
|
|
struct dma_pool *prp_small_pool;
|
|
|
|
unsigned queue_count;
|
|
|
|
unsigned online_queues;
|
|
|
|
unsigned max_qid;
|
|
|
|
int q_depth;
|
|
|
|
u32 db_stride;
|
|
|
|
u32 ctrl_config;
|
|
|
|
struct msix_entry *entry;
|
|
|
|
void __iomem *bar;
|
|
|
|
struct list_head namespaces;
|
|
|
|
struct kref kref;
|
|
|
|
struct device *device;
|
|
|
|
struct work_struct reset_work;
|
|
|
|
struct work_struct probe_work;
|
|
|
|
struct work_struct scan_work;
|
|
|
|
bool subsystem;
|
|
|
|
u32 max_hw_sectors;
|
|
|
|
u32 stripe_size;
|
|
|
|
u32 page_size;
|
|
|
|
void __iomem *cmb;
|
|
|
|
dma_addr_t cmb_dma_addr;
|
|
|
|
u64 cmb_size;
|
|
|
|
u32 cmbsz;
|
|
|
|
|
|
|
|
struct nvme_ctrl ctrl;
|
|
|
|
};
|
|
|
|
|
|
|
|
static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
|
|
|
|
{
|
|
|
|
return container_of(ctrl, struct nvme_dev, ctrl);
|
|
|
|
}
|
|
|
|
|
2011-01-21 00:50:14 +07:00
|
|
|
/*
|
|
|
|
* An NVM Express queue. Each device has at least two (one for admin
|
|
|
|
* commands and one for I/O commands).
|
|
|
|
*/
|
|
|
|
struct nvme_queue {
|
|
|
|
struct device *q_dmadev;
|
2011-02-10 21:56:01 +07:00
|
|
|
struct nvme_dev *dev;
|
2014-01-28 03:57:22 +07:00
|
|
|
char irqname[24]; /* nvme4294967295-65535\0 */
|
2011-01-21 00:50:14 +07:00
|
|
|
spinlock_t q_lock;
|
|
|
|
struct nvme_command *sq_cmds;
|
2015-07-20 23:14:09 +07:00
|
|
|
struct nvme_command __iomem *sq_cmds_io;
|
2011-01-21 00:50:14 +07:00
|
|
|
volatile struct nvme_completion *cqes;
|
2015-06-01 22:29:54 +07:00
|
|
|
struct blk_mq_tags **tags;
|
2011-01-21 00:50:14 +07:00
|
|
|
dma_addr_t sq_dma_addr;
|
|
|
|
dma_addr_t cq_dma_addr;
|
|
|
|
u32 __iomem *q_db;
|
|
|
|
u16 q_depth;
|
2015-01-16 05:19:10 +07:00
|
|
|
s16 cq_vector;
|
2011-01-21 00:50:14 +07:00
|
|
|
u16 sq_head;
|
|
|
|
u16 sq_tail;
|
|
|
|
u16 cq_head;
|
2013-12-11 03:10:38 +07:00
|
|
|
u16 qid;
|
2013-06-24 22:47:34 +07:00
|
|
|
u8 cq_phase;
|
|
|
|
u8 cqe_seen;
|
2013-12-11 03:10:40 +07:00
|
|
|
struct async_cmd_info cmdinfo;
|
2011-01-21 00:50:14 +07:00
|
|
|
};
|
|
|
|
|
2015-10-16 12:58:32 +07:00
|
|
|
/*
|
|
|
|
* The nvme_iod describes the data in an I/O, including the list of PRP
|
|
|
|
* entries. You can't see it in this data structure because C doesn't let
|
|
|
|
* me express that. Use nvme_alloc_iod to ensure there's enough space
|
|
|
|
* allocated to store the PRP list.
|
|
|
|
*/
|
|
|
|
struct nvme_iod {
|
|
|
|
unsigned long private; /* For the use of the submitter of the I/O */
|
|
|
|
int npages; /* In the PRP list. 0 means small pool in use */
|
|
|
|
int offset; /* Of PRP list */
|
|
|
|
int nents; /* Used in scatterlist */
|
|
|
|
int length; /* Of data, in bytes */
|
|
|
|
dma_addr_t first_dma;
|
|
|
|
struct scatterlist meta_sg[1]; /* metadata requires single contiguous buffer */
|
|
|
|
struct scatterlist sg[0];
|
|
|
|
};
|
|
|
|
|
2011-01-21 00:50:14 +07:00
|
|
|
/*
|
|
|
|
* Check we didin't inadvertently grow the command struct
|
|
|
|
*/
|
|
|
|
static inline void _nvme_check_size(void)
|
|
|
|
{
|
|
|
|
BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
|
|
|
|
BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
|
|
|
|
BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
|
|
|
|
BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
|
|
|
|
BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
|
2013-03-27 18:13:41 +07:00
|
|
|
BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
|
2013-12-11 03:10:38 +07:00
|
|
|
BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
|
2011-01-21 00:50:14 +07:00
|
|
|
BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
|
|
|
|
BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096);
|
|
|
|
BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
|
|
|
|
BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
|
2012-09-27 01:49:27 +07:00
|
|
|
BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
|
2011-01-21 00:50:14 +07:00
|
|
|
}
|
|
|
|
|
2014-04-04 05:45:23 +07:00
|
|
|
/*
 * Per-command completion callback: receives the queue, the context pointer
 * stored with the command, and the completion entry.
 */
typedef void (*nvme_completion_fn)(struct nvme_queue *, void *,
				   struct nvme_completion *);
|
|
|
|
|
2011-02-07 06:30:16 +07:00
|
|
|
struct nvme_cmd_info {
|
2011-10-15 18:33:46 +07:00
|
|
|
nvme_completion_fn fn;
|
|
|
|
void *ctx;
|
2013-12-11 03:10:38 +07:00
|
|
|
int aborted;
|
2014-11-04 22:20:14 +07:00
|
|
|
struct nvme_queue *nvmeq;
|
2015-01-23 02:07:58 +07:00
|
|
|
struct nvme_iod iod[0];
|
2011-02-07 06:30:16 +07:00
|
|
|
};
|
|
|
|
|
2015-01-23 02:07:58 +07:00
|
|
|
/*
 * Max size of iod being embedded in the request payload.
 *
 * NVME_INT_MASK is the bit set in nvme_iod::private to mark an iod as
 * embedded (see iod_should_kfree()).
 */
#define NVME_INT_PAGES		2
#define NVME_INT_BYTES(dev)	(NVME_INT_PAGES * (dev)->page_size)
#define NVME_INT_MASK		0x01
|
2015-01-23 02:07:58 +07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Will slightly overestimate the number of pages needed. This is OK
|
|
|
|
* as it only leads to a small amount of wasted memory for the lifetime of
|
|
|
|
* the I/O.
|
|
|
|
*/
|
|
|
|
static int nvme_npages(unsigned size, struct nvme_dev *dev)
|
|
|
|
{
|
|
|
|
unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size);
|
|
|
|
return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
|
|
|
|
}
|
|
|
|
|
|
|
|
static unsigned int nvme_cmd_size(struct nvme_dev *dev)
|
|
|
|
{
|
|
|
|
unsigned int ret = sizeof(struct nvme_cmd_info);
|
|
|
|
|
|
|
|
ret += sizeof(struct nvme_iod);
|
|
|
|
ret += sizeof(__le64 *) * nvme_npages(NVME_INT_BYTES(dev), dev);
|
|
|
|
ret += sizeof(struct scatterlist) * NVME_INT_PAGES;
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2014-11-04 22:20:14 +07:00
|
|
|
static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
|
|
|
|
unsigned int hctx_idx)
|
2011-02-07 06:30:16 +07:00
|
|
|
{
|
2014-11-04 22:20:14 +07:00
|
|
|
struct nvme_dev *dev = data;
|
|
|
|
struct nvme_queue *nvmeq = dev->queues[0];
|
|
|
|
|
2015-06-01 22:29:54 +07:00
|
|
|
WARN_ON(hctx_idx != 0);
|
|
|
|
WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
|
|
|
|
WARN_ON(nvmeq->tags);
|
|
|
|
|
2014-11-04 22:20:14 +07:00
|
|
|
hctx->driver_data = nvmeq;
|
2015-06-01 22:29:54 +07:00
|
|
|
nvmeq->tags = &dev->admin_tagset.tags[0];
|
2014-11-04 22:20:14 +07:00
|
|
|
return 0;
|
2011-02-07 06:30:16 +07:00
|
|
|
}
|
|
|
|
|
2015-06-08 23:08:13 +07:00
|
|
|
static void nvme_admin_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
|
|
|
|
{
|
|
|
|
struct nvme_queue *nvmeq = hctx->driver_data;
|
|
|
|
|
|
|
|
nvmeq->tags = NULL;
|
|
|
|
}
|
|
|
|
|
2014-11-04 22:20:14 +07:00
|
|
|
static int nvme_admin_init_request(void *data, struct request *req,
|
|
|
|
unsigned int hctx_idx, unsigned int rq_idx,
|
|
|
|
unsigned int numa_node)
|
2013-07-16 04:02:20 +07:00
|
|
|
{
|
2014-11-04 22:20:14 +07:00
|
|
|
struct nvme_dev *dev = data;
|
|
|
|
struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
|
|
|
|
struct nvme_queue *nvmeq = dev->queues[0];
|
|
|
|
|
|
|
|
BUG_ON(!nvmeq);
|
|
|
|
cmd->nvmeq = nvmeq;
|
|
|
|
return 0;
|
2013-07-16 04:02:20 +07:00
|
|
|
}
|
|
|
|
|
2014-11-04 22:20:14 +07:00
|
|
|
static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
|
|
|
|
unsigned int hctx_idx)
|
2011-01-21 00:50:14 +07:00
|
|
|
{
|
2014-11-04 22:20:14 +07:00
|
|
|
struct nvme_dev *dev = data;
|
2015-06-01 22:29:54 +07:00
|
|
|
struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];
|
2014-11-04 22:20:14 +07:00
|
|
|
|
2015-06-01 22:29:54 +07:00
|
|
|
if (!nvmeq->tags)
|
|
|
|
nvmeq->tags = &dev->tagset.tags[hctx_idx];
|
2011-01-21 00:50:14 +07:00
|
|
|
|
2015-06-01 22:29:54 +07:00
|
|
|
WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags);
|
2014-11-04 22:20:14 +07:00
|
|
|
hctx->driver_data = nvmeq;
|
|
|
|
return 0;
|
2011-01-21 00:50:14 +07:00
|
|
|
}
|
|
|
|
|
2014-11-04 22:20:14 +07:00
|
|
|
static int nvme_init_request(void *data, struct request *req,
|
|
|
|
unsigned int hctx_idx, unsigned int rq_idx,
|
|
|
|
unsigned int numa_node)
|
2011-01-21 00:50:14 +07:00
|
|
|
{
|
2014-11-04 22:20:14 +07:00
|
|
|
struct nvme_dev *dev = data;
|
|
|
|
struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
|
|
|
|
struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];
|
|
|
|
|
|
|
|
BUG_ON(!nvmeq);
|
|
|
|
cmd->nvmeq = nvmeq;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx,
|
|
|
|
nvme_completion_fn handler)
|
|
|
|
{
|
|
|
|
cmd->fn = handler;
|
|
|
|
cmd->ctx = ctx;
|
|
|
|
cmd->aborted = 0;
|
2015-01-08 08:55:48 +07:00
|
|
|
blk_mq_start_request(blk_mq_rq_from_pdu(cmd));
|
2011-01-21 00:50:14 +07:00
|
|
|
}
|
|
|
|
|
2015-01-23 02:07:58 +07:00
|
|
|
static void *iod_get_private(struct nvme_iod *iod)
|
|
|
|
{
|
|
|
|
return (void *) (iod->private & ~0x1UL);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If bit 0 is set, the iod is embedded in the request payload.
|
|
|
|
*/
|
|
|
|
static bool iod_should_kfree(struct nvme_iod *iod)
|
|
|
|
{
|
2015-03-27 08:21:32 +07:00
|
|
|
return (iod->private & NVME_INT_MASK) == 0;
|
2015-01-23 02:07:58 +07:00
|
|
|
}
|
|
|
|
|
2011-10-15 18:33:46 +07:00
|
|
|
/*
 * Sentinel context pointers recognised by special_completion().
 * Special values must be less than 0x1000.
 */
#define CMD_CTX_BASE		((void *)POISON_POINTER_DELTA)
#define CMD_CTX_CANCELLED	(0x30C + CMD_CTX_BASE)
#define CMD_CTX_COMPLETED	(0x310 + CMD_CTX_BASE)
#define CMD_CTX_INVALID		(0x314 + CMD_CTX_BASE)
|
2011-02-06 19:53:23 +07:00
|
|
|
|
2014-04-04 05:45:23 +07:00
|
|
|
static void special_completion(struct nvme_queue *nvmeq, void *ctx,
|
2011-10-15 18:33:46 +07:00
|
|
|
struct nvme_completion *cqe)
|
|
|
|
{
|
|
|
|
if (ctx == CMD_CTX_CANCELLED)
|
|
|
|
return;
|
|
|
|
if (ctx == CMD_CTX_COMPLETED) {
|
2014-04-04 05:45:23 +07:00
|
|
|
dev_warn(nvmeq->q_dmadev,
|
2011-10-15 18:33:46 +07:00
|
|
|
"completed id %d twice on queue %d\n",
|
|
|
|
cqe->command_id, le16_to_cpup(&cqe->sq_id));
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
if (ctx == CMD_CTX_INVALID) {
|
2014-04-04 05:45:23 +07:00
|
|
|
dev_warn(nvmeq->q_dmadev,
|
2011-10-15 18:33:46 +07:00
|
|
|
"invalid id %d completed on queue %d\n",
|
|
|
|
cqe->command_id, le16_to_cpup(&cqe->sq_id));
|
|
|
|
return;
|
|
|
|
}
|
2014-04-04 05:45:23 +07:00
|
|
|
dev_warn(nvmeq->q_dmadev, "Unknown special completion %p\n", ctx);
|
2011-10-15 18:33:46 +07:00
|
|
|
}
|
|
|
|
|
2014-11-04 22:20:14 +07:00
|
|
|
/*
 * Mark a command cancelled.  The previous handler is stored in *@fn when
 * @fn is non-NULL and the previous context is returned; the command is
 * then redirected to special_completion() with CMD_CTX_CANCELLED.
 */
static void *cancel_cmd_info(struct nvme_cmd_info *cmd, nvme_completion_fn *fn)
{
	void *prev_ctx = cmd->ctx;

	if (fn != NULL)
		*fn = cmd->fn;

	cmd->ctx = CMD_CTX_CANCELLED;
	cmd->fn = special_completion;
	return prev_ctx;
}
|
|
|
|
|
2014-11-04 22:20:14 +07:00
|
|
|
static void async_req_completion(struct nvme_queue *nvmeq, void *ctx,
|
|
|
|
struct nvme_completion *cqe)
|
2011-02-05 04:03:56 +07:00
|
|
|
{
|
2014-11-04 22:20:14 +07:00
|
|
|
u32 result = le32_to_cpup(&cqe->result);
|
|
|
|
u16 status = le16_to_cpup(&cqe->status) >> 1;
|
|
|
|
|
|
|
|
if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ)
|
2015-11-26 16:06:56 +07:00
|
|
|
++nvmeq->dev->ctrl.event_limit;
|
2015-06-02 03:28:14 +07:00
|
|
|
if (status != NVME_SC_SUCCESS)
|
|
|
|
return;
|
|
|
|
|
|
|
|
switch (result & 0xff07) {
|
|
|
|
case NVME_AER_NOTICE_NS_CHANGED:
|
|
|
|
dev_info(nvmeq->q_dmadev, "rescanning\n");
|
|
|
|
schedule_work(&nvmeq->dev->scan_work);
|
|
|
|
default:
|
|
|
|
dev_warn(nvmeq->q_dmadev, "async event result %08x\n", result);
|
|
|
|
}
|
2011-01-21 00:50:14 +07:00
|
|
|
}
|
|
|
|
|
2014-11-04 22:20:14 +07:00
|
|
|
static void abort_completion(struct nvme_queue *nvmeq, void *ctx,
|
|
|
|
struct nvme_completion *cqe)
|
2014-02-22 04:13:44 +07:00
|
|
|
{
|
2014-11-04 22:20:14 +07:00
|
|
|
struct request *req = ctx;
|
|
|
|
|
|
|
|
u16 status = le16_to_cpup(&cqe->status) >> 1;
|
|
|
|
u32 result = le32_to_cpup(&cqe->result);
|
2014-05-13 23:32:46 +07:00
|
|
|
|
2015-06-01 22:29:54 +07:00
|
|
|
blk_mq_free_request(req);
|
2014-05-13 23:32:46 +07:00
|
|
|
|
2014-11-04 22:20:14 +07:00
|
|
|
dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result);
|
2015-11-26 16:06:56 +07:00
|
|
|
++nvmeq->dev->ctrl.abort_limit;
|
2014-02-22 04:13:44 +07:00
|
|
|
}
|
|
|
|
|
2014-11-04 22:20:14 +07:00
|
|
|
static void async_completion(struct nvme_queue *nvmeq, void *ctx,
|
|
|
|
struct nvme_completion *cqe)
|
2011-01-21 00:50:14 +07:00
|
|
|
{
|
2014-11-04 22:20:14 +07:00
|
|
|
struct async_cmd_info *cmdinfo = ctx;
|
|
|
|
cmdinfo->result = le32_to_cpup(&cqe->result);
|
|
|
|
cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
|
|
|
|
queue_kthread_work(cmdinfo->worker, &cmdinfo->work);
|
2015-06-01 22:29:54 +07:00
|
|
|
blk_mq_free_request(cmdinfo->req);
|
2011-01-21 00:50:14 +07:00
|
|
|
}
|
|
|
|
|
2014-11-04 22:20:14 +07:00
|
|
|
static inline struct nvme_cmd_info *get_cmd_from_tag(struct nvme_queue *nvmeq,
|
|
|
|
unsigned int tag)
|
2011-01-21 00:50:14 +07:00
|
|
|
{
|
2015-06-01 22:29:54 +07:00
|
|
|
struct request *req = blk_mq_tag_to_rq(*nvmeq->tags, tag);
|
2014-05-13 23:32:46 +07:00
|
|
|
|
2014-11-04 22:20:14 +07:00
|
|
|
return blk_mq_rq_to_pdu(req);
|
2014-03-04 06:39:13 +07:00
|
|
|
}
|
|
|
|
|
2014-11-04 22:20:14 +07:00
|
|
|
/*
|
|
|
|
* Called with local interrupts disabled and the q_lock held. May not sleep.
|
|
|
|
*/
|
|
|
|
static void *nvme_finish_cmd(struct nvme_queue *nvmeq, int tag,
|
|
|
|
nvme_completion_fn *fn)
|
2014-03-04 06:39:13 +07:00
|
|
|
{
|
2014-11-04 22:20:14 +07:00
|
|
|
struct nvme_cmd_info *cmd = get_cmd_from_tag(nvmeq, tag);
|
|
|
|
void *ctx;
|
|
|
|
if (tag >= nvmeq->q_depth) {
|
|
|
|
*fn = special_completion;
|
|
|
|
return CMD_CTX_INVALID;
|
|
|
|
}
|
|
|
|
if (fn)
|
|
|
|
*fn = cmd->fn;
|
|
|
|
ctx = cmd->ctx;
|
|
|
|
cmd->fn = special_completion;
|
|
|
|
cmd->ctx = CMD_CTX_COMPLETED;
|
|
|
|
return ctx;
|
2011-01-21 00:50:14 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
2011-03-17 03:28:24 +07:00
|
|
|
* nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
|
2011-01-21 00:50:14 +07:00
|
|
|
* @nvmeq: The queue to use
|
|
|
|
* @cmd: The command to send
|
|
|
|
*
|
|
|
|
* Safe to use from interrupt context
|
|
|
|
*/
|
2015-07-31 20:26:58 +07:00
|
|
|
static void __nvme_submit_cmd(struct nvme_queue *nvmeq,
|
|
|
|
struct nvme_command *cmd)
|
2011-01-21 00:50:14 +07:00
|
|
|
{
|
2014-11-04 22:20:14 +07:00
|
|
|
u16 tail = nvmeq->sq_tail;
|
|
|
|
|
2015-07-20 23:14:09 +07:00
|
|
|
if (nvmeq->sq_cmds_io)
|
|
|
|
memcpy_toio(&nvmeq->sq_cmds_io[tail], cmd, sizeof(*cmd));
|
|
|
|
else
|
|
|
|
memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
|
|
|
|
|
2011-01-21 00:50:14 +07:00
|
|
|
if (++tail == nvmeq->q_depth)
|
|
|
|
tail = 0;
|
2011-02-16 21:59:59 +07:00
|
|
|
writel(tail, nvmeq->q_db);
|
2011-01-21 00:50:14 +07:00
|
|
|
nvmeq->sq_tail = tail;
|
|
|
|
}
|
|
|
|
|
2015-07-31 20:26:58 +07:00
|
|
|
static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
|
2014-11-04 22:20:14 +07:00
|
|
|
{
|
|
|
|
unsigned long flags;
|
|
|
|
spin_lock_irqsave(&nvmeq->q_lock, flags);
|
2015-07-31 20:26:58 +07:00
|
|
|
__nvme_submit_cmd(nvmeq, cmd);
|
2014-11-04 22:20:14 +07:00
|
|
|
spin_unlock_irqrestore(&nvmeq->q_lock, flags);
|
|
|
|
}
|
|
|
|
|
2011-12-21 01:34:52 +07:00
|
|
|
static __le64 **iod_list(struct nvme_iod *iod)
|
2011-02-10 20:51:24 +07:00
|
|
|
{
|
2011-12-21 01:34:52 +07:00
|
|
|
return ((void *)iod) + iod->offset;
|
2011-02-10 20:51:24 +07:00
|
|
|
}
|
|
|
|
|
2015-01-23 02:07:58 +07:00
|
|
|
static inline void iod_init(struct nvme_iod *iod, unsigned nbytes,
|
|
|
|
unsigned nseg, unsigned long private)
|
2011-12-21 01:34:52 +07:00
|
|
|
{
|
2015-01-23 02:07:58 +07:00
|
|
|
iod->private = private;
|
|
|
|
iod->offset = offsetof(struct nvme_iod, sg[nseg]);
|
|
|
|
iod->npages = -1;
|
|
|
|
iod->length = nbytes;
|
|
|
|
iod->nents = 0;
|
2011-12-21 01:34:52 +07:00
|
|
|
}
|
2011-01-21 00:50:14 +07:00
|
|
|
|
2011-12-21 01:34:52 +07:00
|
|
|
static struct nvme_iod *
|
2015-01-23 02:07:58 +07:00
|
|
|
__nvme_alloc_iod(unsigned nseg, unsigned bytes, struct nvme_dev *dev,
|
|
|
|
unsigned long priv, gfp_t gfp)
|
2011-01-21 00:50:14 +07:00
|
|
|
{
|
2011-12-21 01:34:52 +07:00
|
|
|
struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) +
|
2015-01-23 02:07:58 +07:00
|
|
|
sizeof(__le64 *) * nvme_npages(bytes, dev) +
|
2011-12-21 01:34:52 +07:00
|
|
|
sizeof(struct scatterlist) * nseg, gfp);
|
|
|
|
|
2015-01-23 02:07:58 +07:00
|
|
|
if (iod)
|
|
|
|
iod_init(iod, bytes, nseg, priv);
|
2011-12-21 01:34:52 +07:00
|
|
|
|
|
|
|
return iod;
|
2011-01-21 00:50:14 +07:00
|
|
|
}
|
|
|
|
|
2015-01-23 02:07:58 +07:00
|
|
|
static struct nvme_iod *nvme_alloc_iod(struct request *rq, struct nvme_dev *dev,
|
|
|
|
gfp_t gfp)
|
|
|
|
{
|
|
|
|
unsigned size = !(rq->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(rq) :
|
|
|
|
sizeof(struct nvme_dsm_range);
|
|
|
|
struct nvme_iod *iod;
|
|
|
|
|
|
|
|
if (rq->nr_phys_segments <= NVME_INT_PAGES &&
|
|
|
|
size <= NVME_INT_BYTES(dev)) {
|
|
|
|
struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(rq);
|
|
|
|
|
|
|
|
iod = cmd->iod;
|
|
|
|
iod_init(iod, size, rq->nr_phys_segments,
|
2015-03-27 08:21:32 +07:00
|
|
|
(unsigned long) rq | NVME_INT_MASK);
|
2015-01-23 02:07:58 +07:00
|
|
|
return iod;
|
|
|
|
}
|
|
|
|
|
|
|
|
return __nvme_alloc_iod(rq->nr_phys_segments, size, dev,
|
|
|
|
(unsigned long) rq, gfp);
|
|
|
|
}
|
|
|
|
|
2015-05-22 16:12:46 +07:00
|
|
|
static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
|
2011-01-21 00:50:14 +07:00
|
|
|
{
|
2014-06-24 00:34:01 +07:00
|
|
|
const int last_prp = dev->page_size / 8 - 1;
|
2011-12-21 01:34:52 +07:00
|
|
|
int i;
|
|
|
|
__le64 **list = iod_list(iod);
|
|
|
|
dma_addr_t prp_dma = iod->first_dma;
|
|
|
|
|
|
|
|
if (iod->npages == 0)
|
|
|
|
dma_pool_free(dev->prp_small_pool, list[0], prp_dma);
|
|
|
|
for (i = 0; i < iod->npages; i++) {
|
|
|
|
__le64 *prp_list = list[i];
|
|
|
|
dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]);
|
|
|
|
dma_pool_free(dev->prp_page_pool, prp_list, prp_dma);
|
|
|
|
prp_dma = next_prp_dma;
|
|
|
|
}
|
2015-01-23 02:07:58 +07:00
|
|
|
|
|
|
|
if (iod_should_kfree(iod))
|
|
|
|
kfree(iod);
|
2011-01-21 00:50:14 +07:00
|
|
|
}
|
|
|
|
|
2014-08-29 22:06:12 +07:00
|
|
|
/*
 * Translate an NVMe status value (low 11 bits) into an errno:
 * success -> 0, capacity exceeded -> -ENOSPC, anything else -> -EIO.
 */
static int nvme_error_status(u16 status)
{
	u16 code = status & 0x7ff;

	if (code == NVME_SC_SUCCESS)
		return 0;
	if (code == NVME_SC_CAP_EXCEEDED)
		return -ENOSPC;
	return -EIO;
}
|
|
|
|
|
2015-02-23 23:16:21 +07:00
|
|
|
#ifdef CONFIG_BLK_DEV_INTEGRITY
|
NVMe: Metadata format support
Adds support for NVMe metadata formats and exposes block devices for
all namespaces regardless of their format. Namespace formats that are
unusable will have disk capacity set to 0, but a handle to the block
device is created to simplify device management. A namespace is not
usable when the format requires host interleave block and metadata in
single buffer, has no provisioned storage, or has better data but failed
to register with blk integrity.
The namespace has to be scanned in two phases to support separate
metadata formats. The first establishes the sector size and capacity
prior to invoking add_disk. If metadata is required, the capacity will
be temporarily set to 0 until it can be revalidated and registered with
the integrity extensions after add_disk completes.
The driver relies on the integrity extensions to provide the metadata
buffer. NVMe requires this be a single physically contiguous region,
so only one integrity segment is allowed per command. If the metadata
is used for T10 PI, the driver provides mappings to save and restore
the reftag physical block translation. The driver provides no-op
functions for generate and verify if metadata is not used for protection
information. This way the setup is always provided by the block layer.
If a request does not supply a required metadata buffer, the command
is failed with bad address. This could only happen if a user manually
disables verify/generate on such a disk. The only exception to where
this is okay is if the controller is capable of stripping/generating
the metadata, which is possible on some types of formats.
The metadata scatter gather list now occupies the spot in the nvme_iod
that used to be used to link retryable IOD's, but we don't do that
anymore, so the field was unused.
Signed-off-by: Keith Busch <keith.busch@intel.com>
2015-02-20 03:39:03 +07:00
|
|
|
/*
 * Rewrite a tuple's reference tag from the virtual value @v to the
 * physical value @p; tuples whose tag is not @v are left untouched.
 */
static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi)
{
	if (be32_to_cpu(pi->ref_tag) != v)
		return;

	pi->ref_tag = cpu_to_be32(p);
}
|
|
|
|
|
|
|
|
/*
 * Inverse of nvme_dif_prep(): rewrite a tuple's reference tag from the
 * physical value @p back to the virtual value @v; other tags are left
 * untouched.
 */
static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi)
{
	if (be32_to_cpu(pi->ref_tag) != p)
		return;

	pi->ref_tag = cpu_to_be32(v);
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* nvme_dif_remap - remaps ref tags to bip seed and physical lba
|
|
|
|
*
|
|
|
|
* The virtual start sector is the one that was originally submitted by the
|
|
|
|
* block layer. Due to partitioning, MD/DM cloning, etc. the actual physical
|
|
|
|
* start sector may be different. Remap protection information to match the
|
|
|
|
* physical LBA on writes, and back to the original seed on reads.
|
|
|
|
*
|
|
|
|
* Type 0 and 3 do not have a ref tag, so no remapping required.
|
|
|
|
*/
|
|
|
|
static void nvme_dif_remap(struct request *req,
|
|
|
|
void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi))
|
|
|
|
{
|
|
|
|
struct nvme_ns *ns = req->rq_disk->private_data;
|
|
|
|
struct bio_integrity_payload *bip;
|
|
|
|
struct t10_pi_tuple *pi;
|
|
|
|
void *p, *pmap;
|
|
|
|
u32 i, nlb, ts, phys, virt;
|
|
|
|
|
|
|
|
if (!ns->pi_type || ns->pi_type == NVME_NS_DPS_PI_TYPE3)
|
|
|
|
return;
|
|
|
|
|
|
|
|
bip = bio_integrity(req->bio);
|
|
|
|
if (!bip)
|
|
|
|
return;
|
|
|
|
|
|
|
|
pmap = kmap_atomic(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset;
|
|
|
|
|
|
|
|
p = pmap;
|
|
|
|
virt = bip_get_seed(bip);
|
|
|
|
phys = nvme_block_nr(ns, blk_rq_pos(req));
|
|
|
|
nlb = (blk_rq_bytes(req) >> ns->lba_shift);
|
2015-10-22 00:20:18 +07:00
|
|
|
ts = ns->disk->queue->integrity.tuple_size;
|
NVMe: Metadata format support
Adds support for NVMe metadata formats and exposes block devices for
all namespaces regardless of their format. Namespace formats that are
unusable will have disk capacity set to 0, but a handle to the block
device is created to simplify device management. A namespace is not
usable when the format requires host interleave block and metadata in
single buffer, has no provisioned storage, or has better data but failed
to register with blk integrity.
The namespace has to be scanned in two phases to support separate
metadata formats. The first establishes the sector size and capacity
prior to invoking add_disk. If metadata is required, the capacity will
be temporarilly set to 0 until it can be revalidated and registered with
the integrity extenstions after add_disk completes.
The driver relies on the integrity extensions to provide the metadata
buffer. NVMe requires this be a single physically contiguous region,
so only one integrity segment is allowed per command. If the metadata
is used for T10 PI, the driver provides mappings to save and restore
the reftag physical block translation. The driver provides no-op
functions for generate and verify if metadata is not used for protection
information. This way the setup is always provided by the block layer.
If a request does not supply a required metadata buffer, the command
is failed with bad address. This could only happen if a user manually
disables verify/generate on such a disk. The only exception to where
this is okay is if the controller is capable of stripping/generating
the metadata, which is possible on some types of formats.
The metadata scatter gather list now occupies the spot in the nvme_iod
that used to be used to link retryable IOD's, but we don't do that
anymore, so the field was unused.
Signed-off-by: Keith Busch <keith.busch@intel.com>
2015-02-20 03:39:03 +07:00
|
|
|
|
|
|
|
for (i = 0; i < nlb; i++, virt++, phys++) {
|
|
|
|
pi = (struct t10_pi_tuple *)p;
|
|
|
|
dif_swap(phys, virt, pi);
|
|
|
|
p += ts;
|
|
|
|
}
|
|
|
|
kunmap_atomic(pmap);
|
|
|
|
}
|
|
|
|
|
2015-02-23 23:16:21 +07:00
|
|
|
static void nvme_init_integrity(struct nvme_ns *ns)
|
|
|
|
{
|
|
|
|
struct blk_integrity integrity;
|
|
|
|
|
|
|
|
switch (ns->pi_type) {
|
|
|
|
case NVME_NS_DPS_PI_TYPE3:
|
2015-10-22 00:19:33 +07:00
|
|
|
integrity.profile = &t10_pi_type3_crc;
|
2015-02-23 23:16:21 +07:00
|
|
|
break;
|
|
|
|
case NVME_NS_DPS_PI_TYPE1:
|
|
|
|
case NVME_NS_DPS_PI_TYPE2:
|
2015-10-22 00:19:33 +07:00
|
|
|
integrity.profile = &t10_pi_type1_crc;
|
2015-02-23 23:16:21 +07:00
|
|
|
break;
|
|
|
|
default:
|
2015-10-22 00:20:29 +07:00
|
|
|
integrity.profile = NULL;
|
2015-02-23 23:16:21 +07:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
integrity.tuple_size = ns->ms;
|
|
|
|
blk_integrity_register(ns->disk, &integrity);
|
|
|
|
blk_queue_max_integrity_segments(ns->queue, 1);
|
|
|
|
}
|
|
|
|
#else /* CONFIG_BLK_DEV_INTEGRITY */
|
|
|
|
static void nvme_dif_remap(struct request *req,
|
|
|
|
void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi))
|
|
|
|
{
|
|
|
|
}
|
|
|
|
/* Stub used when CONFIG_BLK_DEV_INTEGRITY is not set; does nothing. */
static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi)
{
}
|
|
|
|
/* Stub used when CONFIG_BLK_DEV_INTEGRITY is not set; does nothing. */
static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi)
{
}
|
|
|
|
/*
 * Stub used when CONFIG_BLK_DEV_INTEGRITY is not set: no integrity template
 * is registered for the namespace.
 */
static void nvme_init_integrity(struct nvme_ns *ns)
{
}
|
|
|
|
#endif
|
|
|
|
|
2014-11-04 22:20:14 +07:00
|
|
|
static void req_completion(struct nvme_queue *nvmeq, void *ctx,
|
2011-01-21 00:50:14 +07:00
|
|
|
struct nvme_completion *cqe)
|
|
|
|
{
|
2011-12-21 01:34:52 +07:00
|
|
|
struct nvme_iod *iod = ctx;
|
2015-01-23 02:07:58 +07:00
|
|
|
struct request *req = iod_get_private(iod);
|
2014-11-04 22:20:14 +07:00
|
|
|
struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
|
2011-01-21 00:50:14 +07:00
|
|
|
u16 status = le16_to_cpup(&cqe->status) >> 1;
|
2015-10-16 02:38:48 +07:00
|
|
|
bool requeue = false;
|
2015-10-13 02:23:39 +07:00
|
|
|
int error = 0;
|
2011-01-21 00:50:14 +07:00
|
|
|
|
2014-04-04 05:45:23 +07:00
|
|
|
if (unlikely(status)) {
|
2014-11-04 22:20:14 +07:00
|
|
|
if (!(status & NVME_SC_DNR || blk_noretry_request(req))
|
|
|
|
&& (jiffies - req->start_time) < req->timeout) {
|
2015-01-08 08:55:52 +07:00
|
|
|
unsigned long flags;
|
|
|
|
|
2015-10-16 02:38:48 +07:00
|
|
|
requeue = true;
|
2014-11-04 22:20:14 +07:00
|
|
|
blk_mq_requeue_request(req);
|
2015-01-08 08:55:52 +07:00
|
|
|
spin_lock_irqsave(req->q->queue_lock, flags);
|
|
|
|
if (!blk_queue_stopped(req->q))
|
|
|
|
blk_mq_kick_requeue_list(req->q);
|
|
|
|
spin_unlock_irqrestore(req->q->queue_lock, flags);
|
2015-10-16 02:38:48 +07:00
|
|
|
goto release_iod;
|
2014-04-04 05:45:23 +07:00
|
|
|
}
|
2015-09-28 02:01:50 +07:00
|
|
|
|
2015-05-22 16:12:46 +07:00
|
|
|
if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
|
2015-06-08 23:08:14 +07:00
|
|
|
if (cmd_rq->ctx == CMD_CTX_CANCELLED)
|
2015-10-13 02:23:39 +07:00
|
|
|
error = -EINTR;
|
|
|
|
else
|
|
|
|
error = status;
|
2015-05-22 16:12:46 +07:00
|
|
|
} else {
|
2015-10-13 02:23:39 +07:00
|
|
|
error = nvme_error_status(status);
|
2015-05-22 16:12:46 +07:00
|
|
|
}
|
2015-09-28 02:01:50 +07:00
|
|
|
}
|
|
|
|
|
2015-05-23 01:28:31 +07:00
|
|
|
if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
|
|
|
|
u32 result = le32_to_cpup(&cqe->result);
|
|
|
|
req->special = (void *)(uintptr_t)result;
|
|
|
|
}
|
2014-11-04 22:20:14 +07:00
|
|
|
|
|
|
|
if (cmd_rq->aborted)
|
2015-05-22 16:12:39 +07:00
|
|
|
dev_warn(nvmeq->dev->dev,
|
2014-11-04 22:20:14 +07:00
|
|
|
"completing aborted command with status:%04x\n",
|
2015-10-13 02:23:39 +07:00
|
|
|
error);
|
2014-11-04 22:20:14 +07:00
|
|
|
|
2015-10-16 02:38:48 +07:00
|
|
|
release_iod:
|
NVMe: Metadata format support
Adds support for NVMe metadata formats and exposes block devices for
all namespaces regardless of their format. Namespace formats that are
unusable will have disk capacity set to 0, but a handle to the block
device is created to simplify device management. A namespace is not
usable when the format requires host interleave block and metadata in
single buffer, has no provisioned storage, or has better data but failed
to register with blk integrity.
The namespace has to be scanned in two phases to support separate
metadata formats. The first establishes the sector size and capacity
prior to invoking add_disk. If metadata is required, the capacity will
be temporarilly set to 0 until it can be revalidated and registered with
the integrity extenstions after add_disk completes.
The driver relies on the integrity extensions to provide the metadata
buffer. NVMe requires this be a single physically contiguous region,
so only one integrity segment is allowed per command. If the metadata
is used for T10 PI, the driver provides mappings to save and restore
the reftag physical block translation. The driver provides no-op
functions for generate and verify if metadata is not used for protection
information. This way the setup is always provided by the block layer.
If a request does not supply a required metadata buffer, the command
is failed with bad address. This could only happen if a user manually
disables verify/generate on such a disk. The only exception to where
this is okay is if the controller is capable of stripping/generating
the metadata, which is possible on some types of formats.
The metadata scatter gather list now occupies the spot in the nvme_iod
that used to be used to link retryable IOD's, but we don't do that
anymore, so the field was unused.
Signed-off-by: Keith Busch <keith.busch@intel.com>
2015-02-20 03:39:03 +07:00
|
|
|
if (iod->nents) {
|
2015-05-22 16:12:39 +07:00
|
|
|
dma_unmap_sg(nvmeq->dev->dev, iod->sg, iod->nents,
|
2014-11-04 22:20:14 +07:00
|
|
|
rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
|
NVMe: Metadata format support
Adds support for NVMe metadata formats and exposes block devices for
all namespaces regardless of their format. Namespace formats that are
unusable will have disk capacity set to 0, but a handle to the block
device is created to simplify device management. A namespace is not
usable when the format requires host interleave block and metadata in
single buffer, has no provisioned storage, or has better data but failed
to register with blk integrity.
The namespace has to be scanned in two phases to support separate
metadata formats. The first establishes the sector size and capacity
prior to invoking add_disk. If metadata is required, the capacity will
be temporarilly set to 0 until it can be revalidated and registered with
the integrity extenstions after add_disk completes.
The driver relies on the integrity extensions to provide the metadata
buffer. NVMe requires this be a single physically contiguous region,
so only one integrity segment is allowed per command. If the metadata
is used for T10 PI, the driver provides mappings to save and restore
the reftag physical block translation. The driver provides no-op
functions for generate and verify if metadata is not used for protection
information. This way the setup is always provided by the block layer.
If a request does not supply a required metadata buffer, the command
is failed with bad address. This could only happen if a user manually
disables verify/generate on such a disk. The only exception to where
this is okay is if the controller is capable of stripping/generating
the metadata, which is possible on some types of formats.
The metadata scatter gather list now occupies the spot in the nvme_iod
that used to be used to link retryable IOD's, but we don't do that
anymore, so the field was unused.
Signed-off-by: Keith Busch <keith.busch@intel.com>
2015-02-20 03:39:03 +07:00
|
|
|
if (blk_integrity_rq(req)) {
|
|
|
|
if (!rq_data_dir(req))
|
|
|
|
nvme_dif_remap(req, nvme_dif_complete);
|
2015-05-22 16:12:39 +07:00
|
|
|
dma_unmap_sg(nvmeq->dev->dev, iod->meta_sg, 1,
|
NVMe: Metadata format support
Adds support for NVMe metadata formats and exposes block devices for
all namespaces regardless of their format. Namespace formats that are
unusable will have disk capacity set to 0, but a handle to the block
device is created to simplify device management. A namespace is not
usable when the format requires host interleave block and metadata in
single buffer, has no provisioned storage, or has better data but failed
to register with blk integrity.
The namespace has to be scanned in two phases to support separate
metadata formats. The first establishes the sector size and capacity
prior to invoking add_disk. If metadata is required, the capacity will
be temporarilly set to 0 until it can be revalidated and registered with
the integrity extenstions after add_disk completes.
The driver relies on the integrity extensions to provide the metadata
buffer. NVMe requires this be a single physically contiguous region,
so only one integrity segment is allowed per command. If the metadata
is used for T10 PI, the driver provides mappings to save and restore
the reftag physical block translation. The driver provides no-op
functions for generate and verify if metadata is not used for protection
information. This way the setup is always provided by the block layer.
If a request does not supply a required metadata buffer, the command
is failed with bad address. This could only happen if a user manually
disables verify/generate on such a disk. The only exception to where
this is okay is if the controller is capable of stripping/generating
the metadata, which is possible on some types of formats.
The metadata scatter gather list now occupies the spot in the nvme_iod
that used to be used to link retryable IOD's, but we don't do that
anymore, so the field was unused.
Signed-off-by: Keith Busch <keith.busch@intel.com>
2015-02-20 03:39:03 +07:00
|
|
|
rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
|
|
|
|
}
|
|
|
|
}
|
2014-04-04 05:45:23 +07:00
|
|
|
nvme_free_iod(nvmeq->dev, iod);
|
2014-04-29 01:30:52 +07:00
|
|
|
|
2015-10-16 02:38:48 +07:00
|
|
|
if (likely(!requeue))
|
|
|
|
blk_mq_complete_request(req, error);
|
2011-01-21 00:50:14 +07:00
|
|
|
}
|
|
|
|
|
2015-10-16 12:58:37 +07:00
|
|
|
static bool nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod,
|
|
|
|
int total_len)
|
2011-01-26 22:02:29 +07:00
|
|
|
{
|
2011-02-10 22:30:34 +07:00
|
|
|
struct dma_pool *pool;
|
2011-12-21 01:34:52 +07:00
|
|
|
int length = total_len;
|
|
|
|
struct scatterlist *sg = iod->sg;
|
2011-01-26 22:02:29 +07:00
|
|
|
int dma_len = sg_dma_len(sg);
|
|
|
|
u64 dma_addr = sg_dma_address(sg);
|
2015-03-26 23:07:51 +07:00
|
|
|
u32 page_size = dev->page_size;
|
|
|
|
int offset = dma_addr & (page_size - 1);
|
2011-02-10 20:51:24 +07:00
|
|
|
__le64 *prp_list;
|
2011-12-21 01:34:52 +07:00
|
|
|
__le64 **list = iod_list(iod);
|
2011-02-10 20:51:24 +07:00
|
|
|
dma_addr_t prp_dma;
|
2011-12-21 01:34:52 +07:00
|
|
|
int nprps, i;
|
2011-01-26 22:02:29 +07:00
|
|
|
|
2014-06-24 00:34:01 +07:00
|
|
|
length -= (page_size - offset);
|
2011-01-26 22:02:29 +07:00
|
|
|
if (length <= 0)
|
2015-10-16 12:58:37 +07:00
|
|
|
return true;
|
2011-01-26 22:02:29 +07:00
|
|
|
|
2014-06-24 00:34:01 +07:00
|
|
|
dma_len -= (page_size - offset);
|
2011-01-26 22:02:29 +07:00
|
|
|
if (dma_len) {
|
2014-06-24 00:34:01 +07:00
|
|
|
dma_addr += (page_size - offset);
|
2011-01-26 22:02:29 +07:00
|
|
|
} else {
|
|
|
|
sg = sg_next(sg);
|
|
|
|
dma_addr = sg_dma_address(sg);
|
|
|
|
dma_len = sg_dma_len(sg);
|
|
|
|
}
|
|
|
|
|
2014-06-24 00:34:01 +07:00
|
|
|
if (length <= page_size) {
|
2014-04-04 05:45:23 +07:00
|
|
|
iod->first_dma = dma_addr;
|
2015-10-16 12:58:37 +07:00
|
|
|
return true;
|
2011-02-10 20:51:24 +07:00
|
|
|
}
|
|
|
|
|
2014-06-24 00:34:01 +07:00
|
|
|
nprps = DIV_ROUND_UP(length, page_size);
|
2011-02-10 22:30:34 +07:00
|
|
|
if (nprps <= (256 / 8)) {
|
|
|
|
pool = dev->prp_small_pool;
|
2011-12-21 01:34:52 +07:00
|
|
|
iod->npages = 0;
|
2011-02-10 22:30:34 +07:00
|
|
|
} else {
|
|
|
|
pool = dev->prp_page_pool;
|
2011-12-21 01:34:52 +07:00
|
|
|
iod->npages = 1;
|
2011-02-10 22:30:34 +07:00
|
|
|
}
|
|
|
|
|
2015-10-16 12:58:37 +07:00
|
|
|
prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
|
2011-05-13 00:51:41 +07:00
|
|
|
if (!prp_list) {
|
2014-04-04 05:45:23 +07:00
|
|
|
iod->first_dma = dma_addr;
|
2011-12-21 01:34:52 +07:00
|
|
|
iod->npages = -1;
|
2015-10-16 12:58:37 +07:00
|
|
|
return false;
|
2011-05-13 00:51:41 +07:00
|
|
|
}
|
2011-12-21 01:34:52 +07:00
|
|
|
list[0] = prp_list;
|
|
|
|
iod->first_dma = prp_dma;
|
2011-02-10 20:51:24 +07:00
|
|
|
i = 0;
|
|
|
|
for (;;) {
|
2014-06-24 00:34:01 +07:00
|
|
|
if (i == page_size >> 3) {
|
2011-02-10 20:51:24 +07:00
|
|
|
__le64 *old_prp_list = prp_list;
|
2015-10-16 12:58:37 +07:00
|
|
|
prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
|
2011-12-21 01:34:52 +07:00
|
|
|
if (!prp_list)
|
2015-10-16 12:58:37 +07:00
|
|
|
return false;
|
2011-12-21 01:34:52 +07:00
|
|
|
list[iod->npages++] = prp_list;
|
2011-03-17 03:43:40 +07:00
|
|
|
prp_list[0] = old_prp_list[i - 1];
|
|
|
|
old_prp_list[i - 1] = cpu_to_le64(prp_dma);
|
|
|
|
i = 1;
|
2011-02-10 20:51:24 +07:00
|
|
|
}
|
|
|
|
prp_list[i++] = cpu_to_le64(dma_addr);
|
2014-06-24 00:34:01 +07:00
|
|
|
dma_len -= page_size;
|
|
|
|
dma_addr += page_size;
|
|
|
|
length -= page_size;
|
2011-02-10 20:51:24 +07:00
|
|
|
if (length <= 0)
|
|
|
|
break;
|
|
|
|
if (dma_len > 0)
|
|
|
|
continue;
|
|
|
|
BUG_ON(dma_len < 0);
|
|
|
|
sg = sg_next(sg);
|
|
|
|
dma_addr = sg_dma_address(sg);
|
|
|
|
dma_len = sg_dma_len(sg);
|
2011-01-26 22:02:29 +07:00
|
|
|
}
|
|
|
|
|
2015-10-16 12:58:37 +07:00
|
|
|
return true;
|
2011-01-26 22:02:29 +07:00
|
|
|
}
|
|
|
|
|
2015-05-22 16:12:46 +07:00
|
|
|
static void nvme_submit_priv(struct nvme_queue *nvmeq, struct request *req,
|
|
|
|
struct nvme_iod *iod)
|
|
|
|
{
|
2015-07-20 23:14:08 +07:00
|
|
|
struct nvme_command cmnd;
|
2015-05-22 16:12:46 +07:00
|
|
|
|
2015-07-20 23:14:08 +07:00
|
|
|
memcpy(&cmnd, req->cmd, sizeof(cmnd));
|
|
|
|
cmnd.rw.command_id = req->tag;
|
2015-05-22 16:12:46 +07:00
|
|
|
if (req->nr_phys_segments) {
|
2015-07-20 23:14:08 +07:00
|
|
|
cmnd.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
|
|
|
|
cmnd.rw.prp2 = cpu_to_le64(iod->first_dma);
|
2015-05-22 16:12:46 +07:00
|
|
|
}
|
|
|
|
|
2015-07-20 23:14:08 +07:00
|
|
|
__nvme_submit_cmd(nvmeq, &cmnd);
|
2015-05-22 16:12:46 +07:00
|
|
|
}
|
|
|
|
|
2014-11-04 22:20:14 +07:00
|
|
|
/*
|
|
|
|
* We reuse the small pool to allocate the 16-byte range here as it is not
|
|
|
|
* worth having a special pool for these or additional cases to handle freeing
|
|
|
|
* the iod.
|
|
|
|
*/
|
|
|
|
static void nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns,
|
|
|
|
struct request *req, struct nvme_iod *iod)
|
2012-11-10 06:33:05 +07:00
|
|
|
{
|
2014-04-04 05:45:23 +07:00
|
|
|
struct nvme_dsm_range *range =
|
|
|
|
(struct nvme_dsm_range *)iod_list(iod)[0];
|
2015-07-20 23:14:08 +07:00
|
|
|
struct nvme_command cmnd;
|
2012-11-10 06:33:05 +07:00
|
|
|
|
|
|
|
range->cattr = cpu_to_le32(0);
|
2014-11-04 22:20:14 +07:00
|
|
|
range->nlb = cpu_to_le32(blk_rq_bytes(req) >> ns->lba_shift);
|
|
|
|
range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
|
2012-11-10 06:33:05 +07:00
|
|
|
|
2015-07-20 23:14:08 +07:00
|
|
|
memset(&cmnd, 0, sizeof(cmnd));
|
|
|
|
cmnd.dsm.opcode = nvme_cmd_dsm;
|
|
|
|
cmnd.dsm.command_id = req->tag;
|
|
|
|
cmnd.dsm.nsid = cpu_to_le32(ns->ns_id);
|
|
|
|
cmnd.dsm.prp1 = cpu_to_le64(iod->first_dma);
|
|
|
|
cmnd.dsm.nr = 0;
|
|
|
|
cmnd.dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
|
2012-11-10 06:33:05 +07:00
|
|
|
|
2015-07-20 23:14:08 +07:00
|
|
|
__nvme_submit_cmd(nvmeq, &cmnd);
|
2012-11-10 06:33:05 +07:00
|
|
|
}
|
|
|
|
|
2014-11-04 22:20:14 +07:00
|
|
|
static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
|
2011-02-23 02:18:30 +07:00
|
|
|
int cmdid)
|
|
|
|
{
|
2015-07-20 23:14:08 +07:00
|
|
|
struct nvme_command cmnd;
|
2011-02-23 02:18:30 +07:00
|
|
|
|
2015-07-20 23:14:08 +07:00
|
|
|
memset(&cmnd, 0, sizeof(cmnd));
|
|
|
|
cmnd.common.opcode = nvme_cmd_flush;
|
|
|
|
cmnd.common.command_id = cmdid;
|
|
|
|
cmnd.common.nsid = cpu_to_le32(ns->ns_id);
|
2011-02-23 02:18:30 +07:00
|
|
|
|
2015-07-20 23:14:08 +07:00
|
|
|
__nvme_submit_cmd(nvmeq, &cmnd);
|
2011-02-23 02:18:30 +07:00
|
|
|
}
|
|
|
|
|
2014-11-04 22:20:14 +07:00
|
|
|
static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod,
|
|
|
|
struct nvme_ns *ns)
|
2011-01-21 00:50:14 +07:00
|
|
|
{
|
2015-01-23 02:07:58 +07:00
|
|
|
struct request *req = iod_get_private(iod);
|
2015-07-20 23:14:08 +07:00
|
|
|
struct nvme_command cmnd;
|
2014-11-04 22:20:14 +07:00
|
|
|
u16 control = 0;
|
|
|
|
u32 dsmgmt = 0;
|
2011-02-23 02:18:30 +07:00
|
|
|
|
2014-11-04 22:20:14 +07:00
|
|
|
if (req->cmd_flags & REQ_FUA)
|
2011-01-21 00:50:14 +07:00
|
|
|
control |= NVME_RW_FUA;
|
2014-11-04 22:20:14 +07:00
|
|
|
if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD))
|
2011-01-21 00:50:14 +07:00
|
|
|
control |= NVME_RW_LR;
|
|
|
|
|
2014-11-04 22:20:14 +07:00
|
|
|
if (req->cmd_flags & REQ_RAHEAD)
|
2011-01-21 00:50:14 +07:00
|
|
|
dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
|
|
|
|
|
2015-07-20 23:14:08 +07:00
|
|
|
memset(&cmnd, 0, sizeof(cmnd));
|
|
|
|
cmnd.rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
|
|
|
|
cmnd.rw.command_id = req->tag;
|
|
|
|
cmnd.rw.nsid = cpu_to_le32(ns->ns_id);
|
|
|
|
cmnd.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
|
|
|
|
cmnd.rw.prp2 = cpu_to_le64(iod->first_dma);
|
|
|
|
cmnd.rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
|
|
|
|
cmnd.rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
|
2011-01-21 00:50:14 +07:00
|
|
|
|
2015-08-26 21:56:14 +07:00
|
|
|
if (ns->ms) {
|
NVMe: Metadata format support
Adds support for NVMe metadata formats and exposes block devices for
all namespaces regardless of their format. Namespace formats that are
unusable will have disk capacity set to 0, but a handle to the block
device is created to simplify device management. A namespace is not
usable when the format requires host interleave block and metadata in
single buffer, has no provisioned storage, or has better data but failed
to register with blk integrity.
The namespace has to be scanned in two phases to support separate
metadata formats. The first establishes the sector size and capacity
prior to invoking add_disk. If metadata is required, the capacity will
be temporarilly set to 0 until it can be revalidated and registered with
the integrity extenstions after add_disk completes.
The driver relies on the integrity extensions to provide the metadata
buffer. NVMe requires this be a single physically contiguous region,
so only one integrity segment is allowed per command. If the metadata
is used for T10 PI, the driver provides mappings to save and restore
the reftag physical block translation. The driver provides no-op
functions for generate and verify if metadata is not used for protection
information. This way the setup is always provided by the block layer.
If a request does not supply a required metadata buffer, the command
is failed with bad address. This could only happen if a user manually
disables verify/generate on such a disk. The only exception to where
this is okay is if the controller is capable of stripping/generating
the metadata, which is possible on some types of formats.
The metadata scatter gather list now occupies the spot in the nvme_iod
that used to be used to link retryable IOD's, but we don't do that
anymore, so the field was unused.
Signed-off-by: Keith Busch <keith.busch@intel.com>
2015-02-20 03:39:03 +07:00
|
|
|
switch (ns->pi_type) {
|
|
|
|
case NVME_NS_DPS_PI_TYPE3:
|
|
|
|
control |= NVME_RW_PRINFO_PRCHK_GUARD;
|
|
|
|
break;
|
|
|
|
case NVME_NS_DPS_PI_TYPE1:
|
|
|
|
case NVME_NS_DPS_PI_TYPE2:
|
|
|
|
control |= NVME_RW_PRINFO_PRCHK_GUARD |
|
|
|
|
NVME_RW_PRINFO_PRCHK_REF;
|
2015-07-20 23:14:08 +07:00
|
|
|
cmnd.rw.reftag = cpu_to_le32(
|
NVMe: Metadata format support
Adds support for NVMe metadata formats and exposes block devices for
all namespaces regardless of their format. Namespace formats that are
unusable will have disk capacity set to 0, but a handle to the block
device is created to simplify device management. A namespace is not
usable when the format requires host interleave block and metadata in
single buffer, has no provisioned storage, or has better data but failed
to register with blk integrity.
The namespace has to be scanned in two phases to support separate
metadata formats. The first establishes the sector size and capacity
prior to invoking add_disk. If metadata is required, the capacity will
be temporarilly set to 0 until it can be revalidated and registered with
the integrity extenstions after add_disk completes.
The driver relies on the integrity extensions to provide the metadata
buffer. NVMe requires this be a single physically contiguous region,
so only one integrity segment is allowed per command. If the metadata
is used for T10 PI, the driver provides mappings to save and restore
the reftag physical block translation. The driver provides no-op
functions for generate and verify if metadata is not used for protection
information. This way the setup is always provided by the block layer.
If a request does not supply a required metadata buffer, the command
is failed with bad address. This could only happen if a user manually
disables verify/generate on such a disk. The only exception to where
this is okay is if the controller is capable of stripping/generating
the metadata, which is possible on some types of formats.
The metadata scatter gather list now occupies the spot in the nvme_iod
that used to be used to link retryable IOD's, but we don't do that
anymore, so the field was unused.
Signed-off-by: Keith Busch <keith.busch@intel.com>
2015-02-20 03:39:03 +07:00
|
|
|
nvme_block_nr(ns, blk_rq_pos(req)));
|
|
|
|
break;
|
|
|
|
}
|
2015-08-26 21:56:14 +07:00
|
|
|
if (blk_integrity_rq(req))
|
|
|
|
cmnd.rw.metadata =
|
|
|
|
cpu_to_le64(sg_dma_address(iod->meta_sg));
|
|
|
|
else
|
|
|
|
control |= NVME_RW_PRINFO_PRACT;
|
|
|
|
}
|
NVMe: Metadata format support
Adds support for NVMe metadata formats and exposes block devices for
all namespaces regardless of their format. Namespace formats that are
unusable will have disk capacity set to 0, but a handle to the block
device is created to simplify device management. A namespace is not
usable when the format requires host interleave block and metadata in
single buffer, has no provisioned storage, or has better data but failed
to register with blk integrity.
The namespace has to be scanned in two phases to support separate
metadata formats. The first establishes the sector size and capacity
prior to invoking add_disk. If metadata is required, the capacity will
be temporarilly set to 0 until it can be revalidated and registered with
the integrity extenstions after add_disk completes.
The driver relies on the integrity extensions to provide the metadata
buffer. NVMe requires this be a single physically contiguous region,
so only one integrity segment is allowed per command. If the metadata
is used for T10 PI, the driver provides mappings to save and restore
the reftag physical block translation. The driver provides no-op
functions for generate and verify if metadata is not used for protection
information. This way the setup is always provided by the block layer.
If a request does not supply a required metadata buffer, the command
is failed with bad address. This could only happen if a user manually
disables verify/generate on such a disk. The only exception to where
this is okay is if the controller is capable of stripping/generating
the metadata, which is possible on some types of formats.
The metadata scatter gather list now occupies the spot in the nvme_iod
that used to be used to link retryable IOD's, but we don't do that
anymore, so the field was unused.
Signed-off-by: Keith Busch <keith.busch@intel.com>
2015-02-20 03:39:03 +07:00
|
|
|
|
2015-07-20 23:14:08 +07:00
|
|
|
cmnd.rw.control = cpu_to_le16(control);
|
|
|
|
cmnd.rw.dsmgmt = cpu_to_le32(dsmgmt);
|
2011-01-21 00:50:14 +07:00
|
|
|
|
2015-07-20 23:14:08 +07:00
|
|
|
__nvme_submit_cmd(nvmeq, &cmnd);
|
2011-01-21 00:50:14 +07:00
|
|
|
|
2011-02-11 00:01:09 +07:00
|
|
|
return 0;
|
2014-04-04 05:45:23 +07:00
|
|
|
}
|
|
|
|
|
2015-05-22 16:12:46 +07:00
|
|
|
/*
|
|
|
|
* NOTE: ns is NULL when called on the admin queue.
|
|
|
|
*/
|
2014-11-04 22:20:14 +07:00
|
|
|
static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
|
|
|
|
const struct blk_mq_queue_data *bd)
|
2014-04-04 05:45:23 +07:00
|
|
|
{
|
2014-11-04 22:20:14 +07:00
|
|
|
struct nvme_ns *ns = hctx->queue->queuedata;
|
|
|
|
struct nvme_queue *nvmeq = hctx->driver_data;
|
2015-05-22 16:12:46 +07:00
|
|
|
struct nvme_dev *dev = nvmeq->dev;
|
2014-11-04 22:20:14 +07:00
|
|
|
struct request *req = bd->rq;
|
|
|
|
struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
|
2014-04-04 05:45:23 +07:00
|
|
|
struct nvme_iod *iod;
|
2014-11-04 22:20:14 +07:00
|
|
|
enum dma_data_direction dma_dir;
|
2014-04-04 05:45:23 +07:00
|
|
|
|
NVMe: Metadata format support
Adds support for NVMe metadata formats and exposes block devices for
all namespaces regardless of their format. Namespace formats that are
unusable will have disk capacity set to 0, but a handle to the block
device is created to simplify device management. A namespace is not
usable when the format requires the host to interleave block data and
metadata in a single buffer, has no provisioned storage, or has metadata
but failed to register with blk integrity.
The namespace has to be scanned in two phases to support separate
metadata formats. The first establishes the sector size and capacity
prior to invoking add_disk. If metadata is required, the capacity will
be temporarily set to 0 until it can be revalidated and registered with
the integrity extensions after add_disk completes.
The driver relies on the integrity extensions to provide the metadata
buffer. NVMe requires this be a single physically contiguous region,
so only one integrity segment is allowed per command. If the metadata
is used for T10 PI, the driver provides mappings to save and restore
the reftag physical block translation. The driver provides no-op
functions for generate and verify if metadata is not used for protection
information. This way the setup is always provided by the block layer.
If a request does not supply a required metadata buffer, the command
is failed with bad address. This could only happen if a user manually
disables verify/generate on such a disk. The only exception to where
this is okay is if the controller is capable of stripping/generating
the metadata, which is possible on some types of formats.
The metadata scatter gather list now occupies the spot in the nvme_iod
that used to be used to link retryable IOD's, but we don't do that
anymore, so the field was unused.
Signed-off-by: Keith Busch <keith.busch@intel.com>
2015-02-20 03:39:03 +07:00
|
|
|
/*
|
|
|
|
* If formatted with metadata, require the block layer provide a buffer
|
|
|
|
* unless this namespace is formatted such that the metadata can be
|
|
|
|
* stripped/generated by the controller with PRACT=1.
|
|
|
|
*/
|
2015-05-22 16:12:46 +07:00
|
|
|
if (ns && ns->ms && !blk_integrity_rq(req)) {
|
2015-06-20 00:07:30 +07:00
|
|
|
if (!(ns->pi_type && ns->ms == 8) &&
|
|
|
|
req->cmd_type != REQ_TYPE_DRV_PRIV) {
|
2015-09-28 02:01:50 +07:00
|
|
|
blk_mq_complete_request(req, -EFAULT);
|
NVMe: Metadata format support
Adds support for NVMe metadata formats and exposes block devices for
all namespaces regardless of their format. Namespace formats that are
unusable will have disk capacity set to 0, but a handle to the block
device is created to simplify device management. A namespace is not
usable when the format requires the host to interleave block data and
metadata in a single buffer, has no provisioned storage, or has metadata
but failed to register with blk integrity.
The namespace has to be scanned in two phases to support separate
metadata formats. The first establishes the sector size and capacity
prior to invoking add_disk. If metadata is required, the capacity will
be temporarily set to 0 until it can be revalidated and registered with
the integrity extensions after add_disk completes.
The driver relies on the integrity extensions to provide the metadata
buffer. NVMe requires this be a single physically contiguous region,
so only one integrity segment is allowed per command. If the metadata
is used for T10 PI, the driver provides mappings to save and restore
the reftag physical block translation. The driver provides no-op
functions for generate and verify if metadata is not used for protection
information. This way the setup is always provided by the block layer.
If a request does not supply a required metadata buffer, the command
is failed with bad address. This could only happen if a user manually
disables verify/generate on such a disk. The only exception to where
this is okay is if the controller is capable of stripping/generating
the metadata, which is possible on some types of formats.
The metadata scatter gather list now occupies the spot in the nvme_iod
that used to be used to link retryable IOD's, but we don't do that
anymore, so the field was unused.
Signed-off-by: Keith Busch <keith.busch@intel.com>
2015-02-20 03:39:03 +07:00
|
|
|
return BLK_MQ_RQ_QUEUE_OK;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-05-22 16:12:46 +07:00
|
|
|
iod = nvme_alloc_iod(req, dev, GFP_ATOMIC);
|
2014-04-04 05:45:23 +07:00
|
|
|
if (!iod)
|
2014-12-12 03:58:39 +07:00
|
|
|
return BLK_MQ_RQ_QUEUE_BUSY;
|
2014-11-04 22:20:14 +07:00
|
|
|
|
|
|
|
if (req->cmd_flags & REQ_DISCARD) {
|
2014-04-04 05:45:23 +07:00
|
|
|
void *range;
|
|
|
|
/*
|
|
|
|
* We reuse the small pool to allocate the 16-byte range here
|
|
|
|
* as it is not worth having a special pool for these or
|
|
|
|
* additional cases to handle freeing the iod.
|
|
|
|
*/
|
2015-05-22 16:12:46 +07:00
|
|
|
range = dma_pool_alloc(dev->prp_small_pool, GFP_ATOMIC,
|
2014-04-04 05:45:23 +07:00
|
|
|
&iod->first_dma);
|
2014-11-04 22:20:14 +07:00
|
|
|
if (!range)
|
2014-12-12 03:58:39 +07:00
|
|
|
goto retry_cmd;
|
2014-04-04 05:45:23 +07:00
|
|
|
iod_list(iod)[0] = (__le64 *)range;
|
|
|
|
iod->npages = 0;
|
2015-01-23 02:07:58 +07:00
|
|
|
} else if (req->nr_phys_segments) {
|
2014-11-04 22:20:14 +07:00
|
|
|
dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
|
|
|
|
|
2015-01-23 02:07:58 +07:00
|
|
|
sg_init_table(iod->sg, req->nr_phys_segments);
|
2014-11-04 22:20:14 +07:00
|
|
|
iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
|
2014-12-12 03:58:39 +07:00
|
|
|
if (!iod->nents)
|
|
|
|
goto error_cmd;
|
2014-11-04 22:20:14 +07:00
|
|
|
|
|
|
|
if (!dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir))
|
2014-12-12 03:58:39 +07:00
|
|
|
goto retry_cmd;
|
2014-11-04 22:20:14 +07:00
|
|
|
|
2015-10-16 12:58:37 +07:00
|
|
|
if (!nvme_setup_prps(dev, iod, blk_rq_bytes(req))) {
|
2015-05-22 16:12:46 +07:00
|
|
|
dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);
|
2014-12-12 03:58:39 +07:00
|
|
|
goto retry_cmd;
|
|
|
|
}
|
NVMe: Metadata format support
Adds support for NVMe metadata formats and exposes block devices for
all namespaces regardless of their format. Namespace formats that are
unusable will have disk capacity set to 0, but a handle to the block
device is created to simplify device management. A namespace is not
usable when the format requires host interleave block and metadata in
single buffer, has no provisioned storage, or has metadata but failed
to register with blk integrity.
The namespace has to be scanned in two phases to support separate
metadata formats. The first establishes the sector size and capacity
prior to invoking add_disk. If metadata is required, the capacity will
be temporarily set to 0 until it can be revalidated and registered with
the integrity extensions after add_disk completes.
The driver relies on the integrity extensions to provide the metadata
buffer. NVMe requires this be a single physically contiguous region,
so only one integrity segment is allowed per command. If the metadata
is used for T10 PI, the driver provides mappings to save and restore
the reftag physical block translation. The driver provides no-op
functions for generate and verify if metadata is not used for protection
information. This way the setup is always provided by the block layer.
If a request does not supply a required metadata buffer, the command
is failed with bad address. This could only happen if a user manually
disables verify/generate on such a disk. The only exception to where
this is okay is if the controller is capable of stripping/generating
the metadata, which is possible on some types of formats.
The metadata scatter gather list now occupies the spot in the nvme_iod
that used to be used to link retryable IOD's, but we don't do that
anymore, so the field was unused.
Signed-off-by: Keith Busch <keith.busch@intel.com>
2015-02-20 03:39:03 +07:00
|
|
|
if (blk_integrity_rq(req)) {
|
2015-10-16 12:58:31 +07:00
|
|
|
if (blk_rq_count_integrity_sg(req->q, req->bio) != 1) {
|
|
|
|
dma_unmap_sg(dev->dev, iod->sg, iod->nents,
|
|
|
|
dma_dir);
|
NVMe: Metadata format support
Adds support for NVMe metadata formats and exposes block devices for
all namespaces regardless of their format. Namespace formats that are
unusable will have disk capacity set to 0, but a handle to the block
device is created to simplify device management. A namespace is not
usable when the format requires host interleave block and metadata in
single buffer, has no provisioned storage, or has metadata but failed
to register with blk integrity.
The namespace has to be scanned in two phases to support separate
metadata formats. The first establishes the sector size and capacity
prior to invoking add_disk. If metadata is required, the capacity will
be temporarily set to 0 until it can be revalidated and registered with
the integrity extensions after add_disk completes.
The driver relies on the integrity extensions to provide the metadata
buffer. NVMe requires this be a single physically contiguous region,
so only one integrity segment is allowed per command. If the metadata
is used for T10 PI, the driver provides mappings to save and restore
the reftag physical block translation. The driver provides no-op
functions for generate and verify if metadata is not used for protection
information. This way the setup is always provided by the block layer.
If a request does not supply a required metadata buffer, the command
is failed with bad address. This could only happen if a user manually
disables verify/generate on such a disk. The only exception to where
this is okay is if the controller is capable of stripping/generating
the metadata, which is possible on some types of formats.
The metadata scatter gather list now occupies the spot in the nvme_iod
that used to be used to link retryable IOD's, but we don't do that
anymore, so the field was unused.
Signed-off-by: Keith Busch <keith.busch@intel.com>
2015-02-20 03:39:03 +07:00
|
|
|
goto error_cmd;
|
2015-10-16 12:58:31 +07:00
|
|
|
}
|
NVMe: Metadata format support
Adds support for NVMe metadata formats and exposes block devices for
all namespaces regardless of their format. Namespace formats that are
unusable will have disk capacity set to 0, but a handle to the block
device is created to simplify device management. A namespace is not
usable when the format requires host interleave block and metadata in
single buffer, has no provisioned storage, or has metadata but failed
to register with blk integrity.
The namespace has to be scanned in two phases to support separate
metadata formats. The first establishes the sector size and capacity
prior to invoking add_disk. If metadata is required, the capacity will
be temporarily set to 0 until it can be revalidated and registered with
the integrity extensions after add_disk completes.
The driver relies on the integrity extensions to provide the metadata
buffer. NVMe requires this be a single physically contiguous region,
so only one integrity segment is allowed per command. If the metadata
is used for T10 PI, the driver provides mappings to save and restore
the reftag physical block translation. The driver provides no-op
functions for generate and verify if metadata is not used for protection
information. This way the setup is always provided by the block layer.
If a request does not supply a required metadata buffer, the command
is failed with bad address. This could only happen if a user manually
disables verify/generate on such a disk. The only exception to where
this is okay is if the controller is capable of stripping/generating
the metadata, which is possible on some types of formats.
The metadata scatter gather list now occupies the spot in the nvme_iod
that used to be used to link retryable IOD's, but we don't do that
anymore, so the field was unused.
Signed-off-by: Keith Busch <keith.busch@intel.com>
2015-02-20 03:39:03 +07:00
|
|
|
|
|
|
|
sg_init_table(iod->meta_sg, 1);
|
|
|
|
if (blk_rq_map_integrity_sg(
|
2015-10-16 12:58:31 +07:00
|
|
|
req->q, req->bio, iod->meta_sg) != 1) {
|
|
|
|
dma_unmap_sg(dev->dev, iod->sg, iod->nents,
|
|
|
|
dma_dir);
|
NVMe: Metadata format support
Adds support for NVMe metadata formats and exposes block devices for
all namespaces regardless of their format. Namespace formats that are
unusable will have disk capacity set to 0, but a handle to the block
device is created to simplify device management. A namespace is not
usable when the format requires host interleave block and metadata in
single buffer, has no provisioned storage, or has metadata but failed
to register with blk integrity.
The namespace has to be scanned in two phases to support separate
metadata formats. The first establishes the sector size and capacity
prior to invoking add_disk. If metadata is required, the capacity will
be temporarily set to 0 until it can be revalidated and registered with
the integrity extensions after add_disk completes.
The driver relies on the integrity extensions to provide the metadata
buffer. NVMe requires this be a single physically contiguous region,
so only one integrity segment is allowed per command. If the metadata
is used for T10 PI, the driver provides mappings to save and restore
the reftag physical block translation. The driver provides no-op
functions for generate and verify if metadata is not used for protection
information. This way the setup is always provided by the block layer.
If a request does not supply a required metadata buffer, the command
is failed with bad address. This could only happen if a user manually
disables verify/generate on such a disk. The only exception to where
this is okay is if the controller is capable of stripping/generating
the metadata, which is possible on some types of formats.
The metadata scatter gather list now occupies the spot in the nvme_iod
that used to be used to link retryable IOD's, but we don't do that
anymore, so the field was unused.
Signed-off-by: Keith Busch <keith.busch@intel.com>
2015-02-20 03:39:03 +07:00
|
|
|
goto error_cmd;
|
2015-10-16 12:58:31 +07:00
|
|
|
}
|
NVMe: Metadata format support
Adds support for NVMe metadata formats and exposes block devices for
all namespaces regardless of their format. Namespace formats that are
unusable will have disk capacity set to 0, but a handle to the block
device is created to simplify device management. A namespace is not
usable when the format requires host interleave block and metadata in
single buffer, has no provisioned storage, or has metadata but failed
to register with blk integrity.
The namespace has to be scanned in two phases to support separate
metadata formats. The first establishes the sector size and capacity
prior to invoking add_disk. If metadata is required, the capacity will
be temporarily set to 0 until it can be revalidated and registered with
the integrity extensions after add_disk completes.
The driver relies on the integrity extensions to provide the metadata
buffer. NVMe requires this be a single physically contiguous region,
so only one integrity segment is allowed per command. If the metadata
is used for T10 PI, the driver provides mappings to save and restore
the reftag physical block translation. The driver provides no-op
functions for generate and verify if metadata is not used for protection
information. This way the setup is always provided by the block layer.
If a request does not supply a required metadata buffer, the command
is failed with bad address. This could only happen if a user manually
disables verify/generate on such a disk. The only exception to where
this is okay is if the controller is capable of stripping/generating
the metadata, which is possible on some types of formats.
The metadata scatter gather list now occupies the spot in the nvme_iod
that used to be used to link retryable IOD's, but we don't do that
anymore, so the field was unused.
Signed-off-by: Keith Busch <keith.busch@intel.com>
2015-02-20 03:39:03 +07:00
|
|
|
|
|
|
|
if (rq_data_dir(req))
|
|
|
|
nvme_dif_remap(req, nvme_dif_prep);
|
|
|
|
|
2015-10-16 12:58:31 +07:00
|
|
|
if (!dma_map_sg(nvmeq->q_dmadev, iod->meta_sg, 1, dma_dir)) {
|
|
|
|
dma_unmap_sg(dev->dev, iod->sg, iod->nents,
|
|
|
|
dma_dir);
|
NVMe: Metadata format support
Adds support for NVMe metadata formats and exposes block devices for
all namespaces regardless of their format. Namespace formats that are
unusable will have disk capacity set to 0, but a handle to the block
device is created to simplify device management. A namespace is not
usable when the format requires host interleave block and metadata in
single buffer, has no provisioned storage, or has metadata but failed
to register with blk integrity.
The namespace has to be scanned in two phases to support separate
metadata formats. The first establishes the sector size and capacity
prior to invoking add_disk. If metadata is required, the capacity will
be temporarily set to 0 until it can be revalidated and registered with
the integrity extensions after add_disk completes.
The driver relies on the integrity extensions to provide the metadata
buffer. NVMe requires this be a single physically contiguous region,
so only one integrity segment is allowed per command. If the metadata
is used for T10 PI, the driver provides mappings to save and restore
the reftag physical block translation. The driver provides no-op
functions for generate and verify if metadata is not used for protection
information. This way the setup is always provided by the block layer.
If a request does not supply a required metadata buffer, the command
is failed with bad address. This could only happen if a user manually
disables verify/generate on such a disk. The only exception to where
this is okay is if the controller is capable of stripping/generating
the metadata, which is possible on some types of formats.
The metadata scatter gather list now occupies the spot in the nvme_iod
that used to be used to link retryable IOD's, but we don't do that
anymore, so the field was unused.
Signed-off-by: Keith Busch <keith.busch@intel.com>
2015-02-20 03:39:03 +07:00
|
|
|
goto error_cmd;
|
2015-10-16 12:58:31 +07:00
|
|
|
}
|
NVMe: Metadata format support
Adds support for NVMe metadata formats and exposes block devices for
all namespaces regardless of their format. Namespace formats that are
unusable will have disk capacity set to 0, but a handle to the block
device is created to simplify device management. A namespace is not
usable when the format requires host interleave block and metadata in
single buffer, has no provisioned storage, or has metadata but failed
to register with blk integrity.
The namespace has to be scanned in two phases to support separate
metadata formats. The first establishes the sector size and capacity
prior to invoking add_disk. If metadata is required, the capacity will
be temporarily set to 0 until it can be revalidated and registered with
the integrity extensions after add_disk completes.
The driver relies on the integrity extensions to provide the metadata
buffer. NVMe requires this be a single physically contiguous region,
so only one integrity segment is allowed per command. If the metadata
is used for T10 PI, the driver provides mappings to save and restore
the reftag physical block translation. The driver provides no-op
functions for generate and verify if metadata is not used for protection
information. This way the setup is always provided by the block layer.
If a request does not supply a required metadata buffer, the command
is failed with bad address. This could only happen if a user manually
disables verify/generate on such a disk. The only exception to where
this is okay is if the controller is capable of stripping/generating
the metadata, which is possible on some types of formats.
The metadata scatter gather list now occupies the spot in the nvme_iod
that used to be used to link retryable IOD's, but we don't do that
anymore, so the field was unused.
Signed-off-by: Keith Busch <keith.busch@intel.com>
2015-02-20 03:39:03 +07:00
|
|
|
}
|
2014-04-04 05:45:23 +07:00
|
|
|
}
|
2011-02-11 00:01:09 +07:00
|
|
|
|
2014-12-04 07:07:13 +07:00
|
|
|
nvme_set_info(cmd, iod, req_completion);
|
2014-11-04 22:20:14 +07:00
|
|
|
spin_lock_irq(&nvmeq->q_lock);
|
2015-05-22 16:12:46 +07:00
|
|
|
if (req->cmd_type == REQ_TYPE_DRV_PRIV)
|
|
|
|
nvme_submit_priv(nvmeq, req, iod);
|
|
|
|
else if (req->cmd_flags & REQ_DISCARD)
|
2014-11-04 22:20:14 +07:00
|
|
|
nvme_submit_discard(nvmeq, ns, req, iod);
|
|
|
|
else if (req->cmd_flags & REQ_FLUSH)
|
|
|
|
nvme_submit_flush(nvmeq, ns, req->tag);
|
|
|
|
else
|
|
|
|
nvme_submit_iod(nvmeq, iod, ns);
|
|
|
|
|
|
|
|
nvme_process_cq(nvmeq);
|
|
|
|
spin_unlock_irq(&nvmeq->q_lock);
|
|
|
|
return BLK_MQ_RQ_QUEUE_OK;
|
|
|
|
|
2014-12-12 03:58:39 +07:00
|
|
|
error_cmd:
|
2015-05-22 16:12:46 +07:00
|
|
|
nvme_free_iod(dev, iod);
|
2014-12-12 03:58:39 +07:00
|
|
|
return BLK_MQ_RQ_QUEUE_ERROR;
|
|
|
|
retry_cmd:
|
2015-05-22 16:12:46 +07:00
|
|
|
nvme_free_iod(dev, iod);
|
2014-12-12 03:58:39 +07:00
|
|
|
return BLK_MQ_RQ_QUEUE_BUSY;
|
2011-01-21 00:50:14 +07:00
|
|
|
}
|
|
|
|
|
2015-11-04 10:37:26 +07:00
|
|
|
static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
|
2011-01-21 00:50:14 +07:00
|
|
|
{
|
2011-01-21 01:24:06 +07:00
|
|
|
u16 head, phase;
|
2011-01-21 00:50:14 +07:00
|
|
|
|
|
|
|
head = nvmeq->cq_head;
|
2011-01-21 01:24:06 +07:00
|
|
|
phase = nvmeq->cq_phase;
|
2011-01-21 00:50:14 +07:00
|
|
|
|
|
|
|
for (;;) {
|
2011-10-15 18:33:46 +07:00
|
|
|
void *ctx;
|
|
|
|
nvme_completion_fn fn;
|
2011-01-21 00:50:14 +07:00
|
|
|
struct nvme_completion cqe = nvmeq->cqes[head];
|
2011-01-21 01:24:06 +07:00
|
|
|
if ((le16_to_cpu(cqe.status) & 1) != phase)
|
2011-01-21 00:50:14 +07:00
|
|
|
break;
|
|
|
|
nvmeq->sq_head = le16_to_cpu(cqe.sq_head);
|
|
|
|
if (++head == nvmeq->q_depth) {
|
|
|
|
head = 0;
|
2011-01-21 01:24:06 +07:00
|
|
|
phase = !phase;
|
2011-01-21 00:50:14 +07:00
|
|
|
}
|
2015-11-04 10:37:26 +07:00
|
|
|
if (tag && *tag == cqe.command_id)
|
|
|
|
*tag = -1;
|
2014-11-04 22:20:14 +07:00
|
|
|
ctx = nvme_finish_cmd(nvmeq, cqe.command_id, &fn);
|
2014-04-04 05:45:23 +07:00
|
|
|
fn(nvmeq, ctx, &cqe);
|
2011-01-21 00:50:14 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
/* If the controller ignores the cq head doorbell and continuously
|
|
|
|
* writes to the queue, it is theoretically possible to wrap around
|
|
|
|
* the queue twice and mistakenly return IRQ_NONE. Linux only
|
|
|
|
* requires that 0.1% of your interrupts are handled, so this isn't
|
|
|
|
* a big problem.
|
|
|
|
*/
|
2011-01-21 01:24:06 +07:00
|
|
|
if (head == nvmeq->cq_head && phase == nvmeq->cq_phase)
|
2015-11-04 10:37:26 +07:00
|
|
|
return;
|
2011-01-21 00:50:14 +07:00
|
|
|
|
2015-11-20 22:38:13 +07:00
|
|
|
if (likely(nvmeq->cq_vector >= 0))
|
|
|
|
writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
|
2011-01-21 00:50:14 +07:00
|
|
|
nvmeq->cq_head = head;
|
2011-01-21 01:24:06 +07:00
|
|
|
nvmeq->cq_phase = phase;
|
2011-01-21 00:50:14 +07:00
|
|
|
|
2013-06-24 22:47:34 +07:00
|
|
|
nvmeq->cqe_seen = 1;
|
2015-11-04 10:37:26 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
static void nvme_process_cq(struct nvme_queue *nvmeq)
|
|
|
|
{
|
|
|
|
__nvme_process_cq(nvmeq, NULL);
|
2011-01-21 00:50:14 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
static irqreturn_t nvme_irq(int irq, void *data)
|
2011-02-06 19:28:06 +07:00
|
|
|
{
|
|
|
|
irqreturn_t result;
|
|
|
|
struct nvme_queue *nvmeq = data;
|
|
|
|
spin_lock(&nvmeq->q_lock);
|
2013-06-24 22:47:34 +07:00
|
|
|
nvme_process_cq(nvmeq);
|
|
|
|
result = nvmeq->cqe_seen ? IRQ_HANDLED : IRQ_NONE;
|
|
|
|
nvmeq->cqe_seen = 0;
|
2011-02-06 19:28:06 +07:00
|
|
|
spin_unlock(&nvmeq->q_lock);
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
static irqreturn_t nvme_irq_check(int irq, void *data)
|
|
|
|
{
|
|
|
|
struct nvme_queue *nvmeq = data;
|
|
|
|
struct nvme_completion cqe = nvmeq->cqes[nvmeq->cq_head];
|
|
|
|
if ((le16_to_cpu(cqe.status) & 1) != nvmeq->cq_phase)
|
|
|
|
return IRQ_NONE;
|
|
|
|
return IRQ_WAKE_THREAD;
|
|
|
|
}
|
|
|
|
|
2015-11-04 10:37:26 +07:00
|
|
|
static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
|
|
|
|
{
|
|
|
|
struct nvme_queue *nvmeq = hctx->driver_data;
|
|
|
|
|
|
|
|
if ((le16_to_cpu(nvmeq->cqes[nvmeq->cq_head].status) & 1) ==
|
|
|
|
nvmeq->cq_phase) {
|
|
|
|
spin_lock_irq(&nvmeq->q_lock);
|
|
|
|
__nvme_process_cq(nvmeq, &tag);
|
|
|
|
spin_unlock_irq(&nvmeq->q_lock);
|
|
|
|
|
|
|
|
if (tag == -1)
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2014-11-04 22:20:14 +07:00
|
|
|
static int nvme_submit_async_admin_req(struct nvme_dev *dev)
|
|
|
|
{
|
|
|
|
struct nvme_queue *nvmeq = dev->queues[0];
|
|
|
|
struct nvme_command c;
|
|
|
|
struct nvme_cmd_info *cmd_info;
|
|
|
|
struct request *req;
|
|
|
|
|
2015-11-26 16:06:56 +07:00
|
|
|
req = blk_mq_alloc_request(dev->ctrl.admin_q, WRITE,
|
2015-11-26 15:13:05 +07:00
|
|
|
BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED);
|
2014-11-06 03:39:09 +07:00
|
|
|
if (IS_ERR(req))
|
|
|
|
return PTR_ERR(req);
|
2014-11-04 22:20:14 +07:00
|
|
|
|
2015-01-08 08:55:48 +07:00
|
|
|
req->cmd_flags |= REQ_NO_TIMEOUT;
|
2014-11-04 22:20:14 +07:00
|
|
|
cmd_info = blk_mq_rq_to_pdu(req);
|
2015-03-31 23:37:17 +07:00
|
|
|
nvme_set_info(cmd_info, NULL, async_req_completion);
|
2014-11-04 22:20:14 +07:00
|
|
|
|
|
|
|
memset(&c, 0, sizeof(c));
|
|
|
|
c.common.opcode = nvme_admin_async_event;
|
|
|
|
c.common.command_id = req->tag;
|
|
|
|
|
2015-06-01 22:29:54 +07:00
|
|
|
blk_mq_free_request(req);
|
2015-07-31 20:26:58 +07:00
|
|
|
__nvme_submit_cmd(nvmeq, &c);
|
|
|
|
return 0;
|
2014-11-04 22:20:14 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
static int nvme_submit_admin_async_cmd(struct nvme_dev *dev,
|
2013-12-11 03:10:40 +07:00
|
|
|
struct nvme_command *cmd,
|
|
|
|
struct async_cmd_info *cmdinfo, unsigned timeout)
|
|
|
|
{
|
2014-11-04 22:20:14 +07:00
|
|
|
struct nvme_queue *nvmeq = dev->queues[0];
|
|
|
|
struct request *req;
|
|
|
|
struct nvme_cmd_info *cmd_rq;
|
2013-12-11 03:10:40 +07:00
|
|
|
|
2015-11-26 16:06:56 +07:00
|
|
|
req = blk_mq_alloc_request(dev->ctrl.admin_q, WRITE, 0);
|
2014-11-06 03:39:09 +07:00
|
|
|
if (IS_ERR(req))
|
|
|
|
return PTR_ERR(req);
|
2014-11-04 22:20:14 +07:00
|
|
|
|
|
|
|
req->timeout = timeout;
|
|
|
|
cmd_rq = blk_mq_rq_to_pdu(req);
|
|
|
|
cmdinfo->req = req;
|
|
|
|
nvme_set_info(cmd_rq, cmdinfo, async_completion);
|
2013-12-11 03:10:40 +07:00
|
|
|
cmdinfo->status = -EINTR;
|
2014-11-04 22:20:14 +07:00
|
|
|
|
|
|
|
cmd->common.command_id = req->tag;
|
|
|
|
|
2015-07-31 20:26:58 +07:00
|
|
|
nvme_submit_cmd(nvmeq, cmd);
|
|
|
|
return 0;
|
2013-12-11 03:10:40 +07:00
|
|
|
}
|
|
|
|
|
2011-01-21 00:50:14 +07:00
|
|
|
/*
 * Synchronously send a delete-queue admin command.
 *
 * @dev:    controller owning the queue
 * @opcode: admin opcode to place in the command
 * @id:     queue identifier to delete
 *
 * Returns whatever nvme_submit_sync_cmd() returns.
 */
static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
{
	struct nvme_command del;

	memset(&del, 0, sizeof(del));
	del.delete_queue.qid = cpu_to_le16(id);
	del.delete_queue.opcode = opcode;

	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &del, NULL, 0);
}
|
|
|
|
|
|
|
|
/*
 * Ask the controller to create completion queue @qid backed by the memory
 * at nvmeq->cq_dma_addr, with interrupts enabled on nvmeq->cq_vector.
 *
 * Returns whatever nvme_submit_sync_cmd() returns.
 */
static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
						struct nvme_queue *nvmeq)
{
	const int cq_flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;
	struct nvme_command create;

	/*
	 * Note: we (ab)use the fact that the prp fields survive if no data
	 * is attached to the request.
	 */
	memset(&create, 0, sizeof(create));
	create.create_cq.opcode = nvme_admin_create_cq;
	create.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
	create.create_cq.cqid = cpu_to_le16(qid);
	/* The queue size field holds depth - 1. */
	create.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	create.create_cq.cq_flags = cpu_to_le16(cq_flags);
	create.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);

	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &create, NULL, 0);
}
|
|
|
|
|
|
|
|
/*
 * Ask the controller to create submission queue @qid backed by the memory
 * at nvmeq->sq_dma_addr and paired with the completion queue of the same id.
 *
 * Returns whatever nvme_submit_sync_cmd() returns.
 */
static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
						struct nvme_queue *nvmeq)
{
	const int sq_flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;
	struct nvme_command create;

	/*
	 * Note: we (ab)use the fact that the prp fields survive if no data
	 * is attached to the request.
	 */
	memset(&create, 0, sizeof(create));
	create.create_sq.opcode = nvme_admin_create_sq;
	create.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
	create.create_sq.sqid = cpu_to_le16(qid);
	/* The queue size field holds depth - 1. */
	create.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	create.create_sq.sq_flags = cpu_to_le16(sq_flags);
	create.create_sq.cqid = cpu_to_le16(qid);

	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &create, NULL, 0);
}
|
|
|
|
|
|
|
|
/* Delete completion queue @cqid; returns adapter_delete_queue()'s result. */
static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
{
	const u8 opcode = nvme_admin_delete_cq;

	return adapter_delete_queue(dev, opcode, cqid);
}
|
|
|
|
|
|
|
|
/* Delete submission queue @sqid; returns adapter_delete_queue()'s result. */
static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
{
	const u8 opcode = nvme_admin_delete_sq;

	return adapter_delete_queue(dev, opcode, sqid);
}
|
|
|
|
|
2013-12-11 03:10:38 +07:00
|
|
|
/**
|
2014-11-04 22:20:14 +07:00
|
|
|
* nvme_abort_req - Attempt aborting a request
|
2013-12-11 03:10:38 +07:00
|
|
|
*
|
|
|
|
* Schedule controller reset if the command was already aborted once before and
|
|
|
|
* still hasn't been returned to the driver, or if this is the admin queue.
|
|
|
|
*/
|
2014-11-04 22:20:14 +07:00
|
|
|
static void nvme_abort_req(struct request *req)
|
2013-12-11 03:10:38 +07:00
|
|
|
{
|
2014-11-04 22:20:14 +07:00
|
|
|
struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
|
|
|
|
struct nvme_queue *nvmeq = cmd_rq->nvmeq;
|
2013-12-11 03:10:38 +07:00
|
|
|
struct nvme_dev *dev = nvmeq->dev;
|
2014-11-04 22:20:14 +07:00
|
|
|
struct request *abort_req;
|
|
|
|
struct nvme_cmd_info *abort_cmd;
|
|
|
|
struct nvme_command cmd;
|
2013-12-11 03:10:38 +07:00
|
|
|
|
2014-11-04 22:20:14 +07:00
|
|
|
if (!nvmeq->qid || cmd_rq->aborted) {
|
2015-10-02 23:49:23 +07:00
|
|
|
spin_lock(&dev_list_lock);
|
|
|
|
if (!__nvme_reset(dev)) {
|
|
|
|
dev_warn(dev->dev,
|
|
|
|
"I/O %d QID %d timeout, reset controller\n",
|
|
|
|
req->tag, nvmeq->qid);
|
|
|
|
}
|
|
|
|
spin_unlock(&dev_list_lock);
|
2013-12-11 03:10:38 +07:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2015-11-26 16:06:56 +07:00
|
|
|
if (!dev->ctrl.abort_limit)
|
2013-12-11 03:10:38 +07:00
|
|
|
return;
|
|
|
|
|
2015-11-26 16:06:56 +07:00
|
|
|
abort_req = blk_mq_alloc_request(dev->ctrl.admin_q, WRITE,
|
2015-11-26 15:13:05 +07:00
|
|
|
BLK_MQ_REQ_NOWAIT);
|
2014-11-06 03:39:09 +07:00
|
|
|
if (IS_ERR(abort_req))
|
2013-12-11 03:10:38 +07:00
|
|
|
return;
|
|
|
|
|
2014-11-04 22:20:14 +07:00
|
|
|
abort_cmd = blk_mq_rq_to_pdu(abort_req);
|
|
|
|
nvme_set_info(abort_cmd, abort_req, abort_completion);
|
|
|
|
|
2013-12-11 03:10:38 +07:00
|
|
|
memset(&cmd, 0, sizeof(cmd));
|
|
|
|
cmd.abort.opcode = nvme_admin_abort_cmd;
|
2014-11-04 22:20:14 +07:00
|
|
|
cmd.abort.cid = req->tag;
|
2013-12-11 03:10:38 +07:00
|
|
|
cmd.abort.sqid = cpu_to_le16(nvmeq->qid);
|
2014-11-04 22:20:14 +07:00
|
|
|
cmd.abort.command_id = abort_req->tag;
|
2013-12-11 03:10:38 +07:00
|
|
|
|
2015-11-26 16:06:56 +07:00
|
|
|
--dev->ctrl.abort_limit;
|
2014-11-04 22:20:14 +07:00
|
|
|
cmd_rq->aborted = 1;
|
2013-12-11 03:10:38 +07:00
|
|
|
|
2014-11-04 22:20:14 +07:00
|
|
|
dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", req->tag,
|
2013-12-11 03:10:38 +07:00
|
|
|
nvmeq->qid);
|
2015-07-31 20:26:58 +07:00
|
|
|
nvme_submit_cmd(dev->queues[0], &cmd);
|
2013-12-11 03:10:38 +07:00
|
|
|
}
|
|
|
|
|
2015-06-01 22:29:54 +07:00
|
|
|
/*
 * Tag iterator callback (passed to blk_mq_all_tag_busy_iter() by
 * nvme_clear_queue()) that force-completes one outstanding request.
 *
 * @req:      request being visited
 * @data:     the struct nvme_queue the request belongs to
 * @reserved: unused
 *
 * Requests that were never started, or whose context is already
 * CMD_CTX_CANCELLED, are left alone.  Everything else is completed through
 * its registered handler with a synthetic "abort requested" status; the
 * do-not-retry bit is added when the request's queue is dying.
 */
static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved)
{
	struct nvme_queue *nvmeq = data;
	void *ctx;
	nvme_completion_fn fn;
	struct nvme_cmd_info *cmd;
	struct nvme_completion cqe;

	if (!blk_mq_request_started(req))
		return;

	cmd = blk_mq_rq_to_pdu(req);

	if (cmd->ctx == CMD_CTX_CANCELLED)
		return;

	/*
	 * The completion entry is fabricated on the stack and a pointer to
	 * it is handed to the completion handler.  Zero it first so that
	 * every field other than status has a defined value instead of
	 * whatever happened to be on the stack.
	 */
	memset(&cqe, 0, sizeof(cqe));
	if (blk_queue_dying(req->q))
		cqe.status = cpu_to_le16((NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1);
	else
		cqe.status = cpu_to_le16(NVME_SC_ABORT_REQ << 1);

	dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n",
						req->tag, nvmeq->qid);
	ctx = cancel_cmd_info(cmd, &fn);
	fn(nvmeq, ctx, &cqe);
}
|
|
|
|
|
2014-11-04 22:20:14 +07:00
|
|
|
/*
 * Block layer timeout handler: logs the timeout, tries to abort @req under
 * the queue lock and always asks for the timer to be re-armed.
 *
 * @req:      request that timed out
 * @reserved: unused
 */
static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
{
	struct nvme_cmd_info *info = blk_mq_rq_to_pdu(req);
	struct nvme_queue *q = info->nvmeq;

	dev_warn(q->q_dmadev, "Timeout I/O %d QID %d\n", req->tag,
							q->qid);

	spin_lock_irq(&q->q_lock);
	nvme_abort_req(req);
	spin_unlock_irq(&q->q_lock);

	/*
	 * The aborted req will be completed on receiving the abort req.
	 * We enable the timer again. If hit twice, it'll cause a device
	 * reset, as the device then is in a faulty state.
	 */
	return BLK_EH_RESET_TIMER;
}
|
2013-07-16 04:02:20 +07:00
|
|
|
|
2014-11-04 22:20:14 +07:00
|
|
|
static void nvme_free_queue(struct nvme_queue *nvmeq)
|
|
|
|
{
|
2012-08-04 00:55:56 +07:00
|
|
|
dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
|
|
|
|
(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
|
2015-07-20 23:14:09 +07:00
|
|
|
if (nvmeq->sq_cmds)
|
|
|
|
dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
|
2012-08-04 00:55:56 +07:00
|
|
|
nvmeq->sq_cmds, nvmeq->sq_dma_addr);
|
|
|
|
kfree(nvmeq);
|
|
|
|
}
|
|
|
|
|
2013-12-17 01:50:00 +07:00
|
|
|
static void nvme_free_queues(struct nvme_dev *dev, int lowest)
|
2013-07-16 04:02:20 +07:00
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
2013-12-17 01:50:00 +07:00
|
|
|
for (i = dev->queue_count - 1; i >= lowest; i--) {
|
2014-11-04 22:20:14 +07:00
|
|
|
struct nvme_queue *nvmeq = dev->queues[i];
|
2013-07-16 04:02:20 +07:00
|
|
|
dev->queue_count--;
|
2014-11-04 22:20:14 +07:00
|
|
|
dev->queues[i] = NULL;
|
2014-07-07 22:14:42 +07:00
|
|
|
nvme_free_queue(nvmeq);
|
2015-01-15 11:01:58 +07:00
|
|
|
}
|
2013-07-16 04:02:20 +07:00
|
|
|
}
|
|
|
|
|
2013-12-11 03:10:40 +07:00
|
|
|
/**
|
|
|
|
* nvme_suspend_queue - put queue into suspended state
|
|
|
|
* @nvmeq - queue to suspend
|
|
|
|
*/
|
|
|
|
static int nvme_suspend_queue(struct nvme_queue *nvmeq)
|
2011-01-21 00:50:14 +07:00
|
|
|
{
|
2014-12-23 02:59:04 +07:00
|
|
|
int vector;
|
2011-01-21 00:50:14 +07:00
|
|
|
|
2012-08-08 02:56:23 +07:00
|
|
|
spin_lock_irq(&nvmeq->q_lock);
|
2014-12-23 02:59:04 +07:00
|
|
|
if (nvmeq->cq_vector == -1) {
|
|
|
|
spin_unlock_irq(&nvmeq->q_lock);
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
vector = nvmeq->dev->entry[nvmeq->cq_vector].vector;
|
2014-03-24 23:46:25 +07:00
|
|
|
nvmeq->dev->online_queues--;
|
2014-12-23 02:59:04 +07:00
|
|
|
nvmeq->cq_vector = -1;
|
2012-08-08 02:56:23 +07:00
|
|
|
spin_unlock_irq(&nvmeq->q_lock);
|
|
|
|
|
2015-11-26 16:06:56 +07:00
|
|
|
if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q)
|
|
|
|
blk_mq_freeze_queue_start(nvmeq->dev->ctrl.admin_q);
|
2015-03-27 02:49:33 +07:00
|
|
|
|
2011-03-27 19:52:06 +07:00
|
|
|
irq_set_affinity_hint(vector, NULL);
|
|
|
|
free_irq(vector, nvmeq);
|
2011-01-21 00:50:14 +07:00
|
|
|
|
2013-12-11 03:10:40 +07:00
|
|
|
return 0;
|
|
|
|
}
|
2011-01-21 00:50:14 +07:00
|
|
|
|
2013-12-11 03:10:40 +07:00
|
|
|
static void nvme_clear_queue(struct nvme_queue *nvmeq)
|
|
|
|
{
|
2013-07-16 04:02:20 +07:00
|
|
|
spin_lock_irq(&nvmeq->q_lock);
|
2015-06-01 22:29:54 +07:00
|
|
|
if (nvmeq->tags && *nvmeq->tags)
|
|
|
|
blk_mq_all_tag_busy_iter(*nvmeq->tags, nvme_cancel_queue_ios, nvmeq);
|
2013-07-16 04:02:20 +07:00
|
|
|
spin_unlock_irq(&nvmeq->q_lock);
|
2011-01-21 00:50:14 +07:00
|
|
|
}
|
|
|
|
|
2013-12-11 03:10:40 +07:00
|
|
|
static void nvme_disable_queue(struct nvme_dev *dev, int qid)
|
|
|
|
{
|
2014-11-04 22:20:14 +07:00
|
|
|
struct nvme_queue *nvmeq = dev->queues[qid];
|
2013-12-11 03:10:40 +07:00
|
|
|
|
|
|
|
if (!nvmeq)
|
|
|
|
return;
|
|
|
|
if (nvme_suspend_queue(nvmeq))
|
|
|
|
return;
|
|
|
|
|
2013-12-11 03:10:39 +07:00
|
|
|
/* Don't tell the adapter to delete the admin queue.
|
|
|
|
* Don't tell a removed adapter to delete IO queues. */
|
2015-11-20 14:58:10 +07:00
|
|
|
if (qid && readl(dev->bar + NVME_REG_CSTS) != -1) {
|
2011-01-21 00:50:14 +07:00
|
|
|
adapter_delete_sq(dev, qid);
|
|
|
|
adapter_delete_cq(dev, qid);
|
|
|
|
}
|
2015-02-20 00:34:48 +07:00
|
|
|
|
|
|
|
spin_lock_irq(&nvmeq->q_lock);
|
|
|
|
nvme_process_cq(nvmeq);
|
|
|
|
spin_unlock_irq(&nvmeq->q_lock);
|
2011-01-21 00:50:14 +07:00
|
|
|
}
|
|
|
|
|
2015-07-20 23:14:09 +07:00
|
|
|
static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
|
|
|
|
int entry_size)
|
|
|
|
{
|
|
|
|
int q_depth = dev->q_depth;
|
|
|
|
unsigned q_size_aligned = roundup(q_depth * entry_size, dev->page_size);
|
|
|
|
|
|
|
|
if (q_size_aligned * nr_io_queues > dev->cmb_size) {
|
2015-07-22 04:08:13 +07:00
|
|
|
u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues);
|
|
|
|
mem_per_q = round_down(mem_per_q, dev->page_size);
|
|
|
|
q_depth = div_u64(mem_per_q, entry_size);
|
2015-07-20 23:14:09 +07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Ensure the reduced q_depth is above some threshold where it
|
|
|
|
* would be better to map queues in system memory with the
|
|
|
|
* original depth
|
|
|
|
*/
|
|
|
|
if (q_depth < 64)
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
|
|
|
return q_depth;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
|
|
|
|
int qid, int depth)
|
|
|
|
{
|
|
|
|
if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) {
|
|
|
|
unsigned offset = (qid - 1) *
|
|
|
|
roundup(SQ_SIZE(depth), dev->page_size);
|
|
|
|
nvmeq->sq_dma_addr = dev->cmb_dma_addr + offset;
|
|
|
|
nvmeq->sq_cmds_io = dev->cmb + offset;
|
|
|
|
} else {
|
|
|
|
nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
|
|
|
|
&nvmeq->sq_dma_addr, GFP_KERNEL);
|
|
|
|
if (!nvmeq->sq_cmds)
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2011-01-21 00:50:14 +07:00
|
|
|
static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
|
2014-12-23 02:59:04 +07:00
|
|
|
int depth)
|
2011-01-21 00:50:14 +07:00
|
|
|
{
|
2014-11-04 22:20:14 +07:00
|
|
|
struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq), GFP_KERNEL);
|
2011-01-21 00:50:14 +07:00
|
|
|
if (!nvmeq)
|
|
|
|
return NULL;
|
|
|
|
|
2015-05-22 16:12:39 +07:00
|
|
|
nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth),
|
2014-06-16 03:37:33 +07:00
|
|
|
&nvmeq->cq_dma_addr, GFP_KERNEL);
|
2011-01-21 00:50:14 +07:00
|
|
|
if (!nvmeq->cqes)
|
|
|
|
goto free_nvmeq;
|
|
|
|
|
2015-07-20 23:14:09 +07:00
|
|
|
if (nvme_alloc_sq_cmds(dev, nvmeq, qid, depth))
|
2011-01-21 00:50:14 +07:00
|
|
|
goto free_cqdma;
|
|
|
|
|
2015-05-22 16:12:39 +07:00
|
|
|
nvmeq->q_dmadev = dev->dev;
|
2011-02-10 21:56:01 +07:00
|
|
|
nvmeq->dev = dev;
|
2014-01-28 03:57:22 +07:00
|
|
|
snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d",
|
2015-11-26 16:06:56 +07:00
|
|
|
dev->ctrl.instance, qid);
|
2011-01-21 00:50:14 +07:00
|
|
|
spin_lock_init(&nvmeq->q_lock);
|
|
|
|
nvmeq->cq_head = 0;
|
2011-01-21 01:24:06 +07:00
|
|
|
nvmeq->cq_phase = 1;
|
2013-09-10 10:25:37 +07:00
|
|
|
nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
|
2011-01-21 00:50:14 +07:00
|
|
|
nvmeq->q_depth = depth;
|
2013-12-11 03:10:38 +07:00
|
|
|
nvmeq->qid = qid;
|
2015-07-01 00:22:52 +07:00
|
|
|
nvmeq->cq_vector = -1;
|
2014-11-04 22:20:14 +07:00
|
|
|
dev->queues[qid] = nvmeq;
|
2011-01-21 00:50:14 +07:00
|
|
|
|
2015-05-28 01:26:23 +07:00
|
|
|
/* make sure queue descriptor is set before queue count, for kthread */
|
|
|
|
mb();
|
|
|
|
dev->queue_count++;
|
|
|
|
|
2011-01-21 00:50:14 +07:00
|
|
|
return nvmeq;
|
|
|
|
|
|
|
|
free_cqdma:
|
2015-05-22 16:12:39 +07:00
|
|
|
dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes,
|
2011-01-21 00:50:14 +07:00
|
|
|
nvmeq->cq_dma_addr);
|
|
|
|
free_nvmeq:
|
|
|
|
kfree(nvmeq);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2011-01-20 21:10:15 +07:00
|
|
|
static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq,
|
|
|
|
const char *name)
|
|
|
|
{
|
2011-02-06 19:28:06 +07:00
|
|
|
if (use_threaded_interrupts)
|
|
|
|
return request_threaded_irq(dev->entry[nvmeq->cq_vector].vector,
|
2013-10-12 11:23:29 +07:00
|
|
|
nvme_irq_check, nvme_irq, IRQF_SHARED,
|
2011-02-06 19:28:06 +07:00
|
|
|
name, nvmeq);
|
2011-01-20 21:10:15 +07:00
|
|
|
return request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq,
|
2013-10-12 11:23:29 +07:00
|
|
|
IRQF_SHARED, name, nvmeq);
|
2011-01-20 21:10:15 +07:00
|
|
|
}
|
|
|
|
|
2013-07-16 04:02:20 +07:00
|
|
|
/*
 * Reset the host-side ring state of @nvmeq for use as queue @qid and count
 * it as online.  Head/tail indices, phase, doorbell pointer and the
 * completion queue memory are all reinitialised under q_lock.
 */
static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
{
	struct nvme_dev *dev = nvmeq->dev;

	spin_lock_irq(&nvmeq->q_lock);

	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
	nvmeq->sq_tail = 0;
	nvmeq->cq_head = 0;
	nvmeq->cq_phase = 1;
	memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));

	dev->online_queues++;

	spin_unlock_irq(&nvmeq->q_lock);
}
|
|
|
|
|
|
|
|
static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
|
|
|
|
{
|
|
|
|
struct nvme_dev *dev = nvmeq->dev;
|
|
|
|
int result;
|
2011-02-01 20:39:04 +07:00
|
|
|
|
2014-12-23 02:59:04 +07:00
|
|
|
nvmeq->cq_vector = qid - 1;
|
2011-01-21 00:50:14 +07:00
|
|
|
result = adapter_alloc_cq(dev, qid, nvmeq);
|
|
|
|
if (result < 0)
|
2013-07-16 04:02:20 +07:00
|
|
|
return result;
|
2011-01-21 00:50:14 +07:00
|
|
|
|
|
|
|
result = adapter_alloc_sq(dev, qid, nvmeq);
|
|
|
|
if (result < 0)
|
|
|
|
goto release_cq;
|
|
|
|
|
2014-01-28 03:57:22 +07:00
|
|
|
result = queue_request_irq(dev, nvmeq, nvmeq->irqname);
|
2011-01-21 00:50:14 +07:00
|
|
|
if (result < 0)
|
|
|
|
goto release_sq;
|
|
|
|
|
2013-07-16 04:02:20 +07:00
|
|
|
nvme_init_queue(nvmeq, qid);
|
|
|
|
return result;
|
2011-01-21 00:50:14 +07:00
|
|
|
|
|
|
|
release_sq:
|
|
|
|
adapter_delete_sq(dev, qid);
|
|
|
|
release_cq:
|
|
|
|
adapter_delete_cq(dev, qid);
|
2013-07-16 04:02:20 +07:00
|
|
|
return result;
|
2011-01-21 00:50:14 +07:00
|
|
|
}
|
|
|
|
|
2013-05-04 17:43:16 +07:00
|
|
|
/*
 * Poll CSTS.RDY until it is set (@enabled true) or clear (@enabled false),
 * sleeping 100 ms between reads.  The wait is bounded by
 * (NVME_CAP_TIMEOUT(@cap) + 1) * HZ / 2 jiffies from entry.
 *
 * Returns 0 once the bit has the wanted value, -EINTR if a fatal signal is
 * pending for the current task, -ENODEV when the deadline has passed.
 */
static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled)
{
	u32 wanted = enabled ? NVME_CSTS_RDY : 0;
	unsigned long deadline;

	deadline = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;

	for (;;) {
		if ((readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_RDY) == wanted)
			return 0;

		msleep(100);
		if (fatal_signal_pending(current))
			return -EINTR;
		if (time_after(jiffies, deadline)) {
			dev_err(dev->dev,
				"Device not ready; aborting %s\n", enabled ?
						"initialisation" : "reset");
			return -ENODEV;
		}
	}
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the device has been passed off to us in an enabled state, just clear
|
|
|
|
* the enabled bit. The spec says we should set the 'shutdown notification
|
|
|
|
* bits', but doing so may cause the device to complete commands to the
|
|
|
|
* admin queue ... and we don't know what memory that might be pointing at!
|
|
|
|
*/
|
|
|
|
static int nvme_disable_ctrl(struct nvme_dev *dev, u64 cap)
|
|
|
|
{
|
2014-06-23 21:24:36 +07:00
|
|
|
dev->ctrl_config &= ~NVME_CC_SHN_MASK;
|
|
|
|
dev->ctrl_config &= ~NVME_CC_ENABLE;
|
2015-11-20 14:58:10 +07:00
|
|
|
writel(dev->ctrl_config, dev->bar + NVME_REG_CC);
|
2013-05-04 17:43:17 +07:00
|
|
|
|
2013-05-04 17:43:16 +07:00
|
|
|
return nvme_wait_ready(dev, cap, false);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Set the enable bit in the controller configuration, with the shutdown
 * notification field cleared, write it to CC and wait for CSTS.RDY to be
 * set.  Returns the result of that wait.
 */
static int nvme_enable_ctrl(struct nvme_dev *dev, u64 cap)
{
	dev->ctrl_config = (dev->ctrl_config & ~NVME_CC_SHN_MASK) |
							NVME_CC_ENABLE;
	writel(dev->ctrl_config, dev->bar + NVME_REG_CC);

	return nvme_wait_ready(dev, cap, true);
}
|
|
|
|
|
2013-07-16 04:02:22 +07:00
|
|
|
static int nvme_shutdown_ctrl(struct nvme_dev *dev)
|
|
|
|
{
|
|
|
|
unsigned long timeout;
|
|
|
|
|
2014-06-23 21:24:36 +07:00
|
|
|
dev->ctrl_config &= ~NVME_CC_SHN_MASK;
|
|
|
|
dev->ctrl_config |= NVME_CC_SHN_NORMAL;
|
|
|
|
|
2015-11-20 14:58:10 +07:00
|
|
|
writel(dev->ctrl_config, dev->bar + NVME_REG_CC);
|
2013-07-16 04:02:22 +07:00
|
|
|
|
2014-07-01 22:33:32 +07:00
|
|
|
timeout = SHUTDOWN_TIMEOUT + jiffies;
|
2015-11-20 14:58:10 +07:00
|
|
|
while ((readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_SHST_MASK) !=
|
2013-07-16 04:02:22 +07:00
|
|
|
NVME_CSTS_SHST_CMPLT) {
|
|
|
|
msleep(100);
|
|
|
|
if (fatal_signal_pending(current))
|
|
|
|
return -EINTR;
|
|
|
|
if (time_after(jiffies, timeout)) {
|
2015-05-22 16:12:39 +07:00
|
|
|
dev_err(dev->dev,
|
2013-07-16 04:02:22 +07:00
|
|
|
"Device shutdown incomplete; abort shutdown\n");
|
|
|
|
return -ENODEV;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2014-11-04 22:20:14 +07:00
|
|
|
static struct blk_mq_ops nvme_mq_admin_ops = {
|
2015-05-22 16:12:46 +07:00
|
|
|
.queue_rq = nvme_queue_rq,
|
2014-11-04 22:20:14 +07:00
|
|
|
.map_queue = blk_mq_map_queue,
|
|
|
|
.init_hctx = nvme_admin_init_hctx,
|
2015-06-08 23:08:13 +07:00
|
|
|
.exit_hctx = nvme_admin_exit_hctx,
|
2014-11-04 22:20:14 +07:00
|
|
|
.init_request = nvme_admin_init_request,
|
|
|
|
.timeout = nvme_timeout,
|
|
|
|
};
|
|
|
|
|
|
|
|
static struct blk_mq_ops nvme_mq_ops = {
|
|
|
|
.queue_rq = nvme_queue_rq,
|
|
|
|
.map_queue = blk_mq_map_queue,
|
|
|
|
.init_hctx = nvme_init_hctx,
|
|
|
|
.init_request = nvme_init_request,
|
|
|
|
.timeout = nvme_timeout,
|
2015-11-04 10:37:26 +07:00
|
|
|
.poll = nvme_poll,
|
2014-11-04 22:20:14 +07:00
|
|
|
};
|
|
|
|
|
2015-01-08 08:55:49 +07:00
|
|
|
static void nvme_dev_remove_admin(struct nvme_dev *dev)
|
|
|
|
{
|
2015-11-26 16:06:56 +07:00
|
|
|
if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) {
|
|
|
|
blk_cleanup_queue(dev->ctrl.admin_q);
|
2015-01-08 08:55:49 +07:00
|
|
|
blk_mq_free_tag_set(&dev->admin_tagset);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-11-04 22:20:14 +07:00
|
|
|
static int nvme_alloc_admin_tags(struct nvme_dev *dev)
|
|
|
|
{
|
2015-11-26 16:06:56 +07:00
|
|
|
if (!dev->ctrl.admin_q) {
|
2014-11-04 22:20:14 +07:00
|
|
|
dev->admin_tagset.ops = &nvme_mq_admin_ops;
|
|
|
|
dev->admin_tagset.nr_hw_queues = 1;
|
|
|
|
dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1;
|
2015-03-31 23:37:17 +07:00
|
|
|
dev->admin_tagset.reserved_tags = 1;
|
2014-11-04 22:20:14 +07:00
|
|
|
dev->admin_tagset.timeout = ADMIN_TIMEOUT;
|
2015-05-22 16:12:39 +07:00
|
|
|
dev->admin_tagset.numa_node = dev_to_node(dev->dev);
|
2015-01-23 02:07:58 +07:00
|
|
|
dev->admin_tagset.cmd_size = nvme_cmd_size(dev);
|
2014-11-04 22:20:14 +07:00
|
|
|
dev->admin_tagset.driver_data = dev;
|
|
|
|
|
|
|
|
if (blk_mq_alloc_tag_set(&dev->admin_tagset))
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2015-11-26 16:06:56 +07:00
|
|
|
dev->ctrl.admin_q = blk_mq_init_queue(&dev->admin_tagset);
|
|
|
|
if (IS_ERR(dev->ctrl.admin_q)) {
|
2014-11-04 22:20:14 +07:00
|
|
|
blk_mq_free_tag_set(&dev->admin_tagset);
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
2015-11-26 16:06:56 +07:00
|
|
|
if (!blk_get_queue(dev->ctrl.admin_q)) {
|
2015-01-08 08:55:49 +07:00
|
|
|
nvme_dev_remove_admin(dev);
|
2015-11-26 16:06:56 +07:00
|
|
|
dev->ctrl.admin_q = NULL;
|
2015-01-08 08:55:49 +07:00
|
|
|
return -ENODEV;
|
|
|
|
}
|
2015-01-08 08:55:50 +07:00
|
|
|
} else
|
2015-11-26 16:06:56 +07:00
|
|
|
blk_mq_unfreeze_queue(dev->ctrl.admin_q);
|
2014-11-04 22:20:14 +07:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2012-12-22 06:13:49 +07:00
|
|
|
static int nvme_configure_admin_queue(struct nvme_dev *dev)
|
2011-01-21 00:50:14 +07:00
|
|
|
{
|
2013-05-04 17:43:16 +07:00
|
|
|
int result;
|
2011-01-21 00:50:14 +07:00
|
|
|
u32 aqa;
|
2015-11-20 14:58:10 +07:00
|
|
|
u64 cap = lo_hi_readq(dev->bar + NVME_REG_CAP);
|
2011-01-21 00:50:14 +07:00
|
|
|
struct nvme_queue *nvmeq;
|
NVMe: default to 4k device page size
We received a bug report recently when DDW (64-bit direct DMA on Power)
is not enabled for NVMe devices. In that case, we fall back to 32-bit
DMA via the IOMMU, which is always done via 4K TCEs (Translation Control
Entries).
The NVMe device driver, though, assumes that the DMA alignment for the
PRP entries will match the device's page size, and that the DMA aligment
matches the kernel's page aligment. On Power, the the IOMMU page size,
as mentioned above, can be 4K, while the device can have a page size of
8K, while the kernel has a page size of 64K. This eventually trips the
BUG_ON in nvme_setup_prps(), as we have a 'dma_len' that is a multiple
of 4K but not 8K (e.g., 0xF000).
In this particular case of page sizes, we clearly want to use the
IOMMU's page size in the driver. And generally, the NVMe driver in this
function should be using the IOMMU's page size for the default device
page size, rather than the kernel's page size. There is not currently an
API to obtain the IOMMU's page size across all architectures and in the
interest of a stop-gap fix to this functional issue, default the NVMe
device page size to 4K, with the intent of adding such an API and
implementation across all architectures in the next merge window.
With the functionally equivalent v3 of this patch, our hardware test
exerciser survives when using 32-bit DMA; without the patch, the kernel
will BUG within a few minutes.
Signed-off-by: Nishanth Aravamudan <nacc at linux.vnet.ibm.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
2015-11-24 23:55:05 +07:00
|
|
|
/*
|
|
|
|
* default to a 4K page size, with the intention to update this
|
|
|
|
* path in the future to accomodate architectures with differing
|
|
|
|
* kernel and IO page sizes.
|
|
|
|
*/
|
|
|
|
unsigned page_shift = 12;
|
2014-06-24 00:34:01 +07:00
|
|
|
unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12;
|
|
|
|
|
|
|
|
if (page_shift < dev_page_min) {
|
2015-05-22 16:12:39 +07:00
|
|
|
dev_err(dev->dev,
|
2014-06-24 00:34:01 +07:00
|
|
|
"Minimum device page size (%u) too large for "
|
|
|
|
"host (%u)\n", 1 << dev_page_min,
|
|
|
|
1 << page_shift);
|
|
|
|
return -ENODEV;
|
|
|
|
}
|
2011-01-21 00:50:14 +07:00
|
|
|
|
2015-11-20 14:58:10 +07:00
|
|
|
dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1) ?
|
2015-08-11 04:20:40 +07:00
|
|
|
NVME_CAP_NSSRC(cap) : 0;
|
|
|
|
|
2015-11-20 14:58:10 +07:00
|
|
|
if (dev->subsystem &&
|
|
|
|
(readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO))
|
|
|
|
writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS);
|
2015-08-11 04:20:40 +07:00
|
|
|
|
2013-05-04 17:43:16 +07:00
|
|
|
result = nvme_disable_ctrl(dev, cap);
|
|
|
|
if (result < 0)
|
|
|
|
return result;
|
2011-01-21 00:50:14 +07:00
|
|
|
|
2014-11-04 22:20:14 +07:00
|
|
|
nvmeq = dev->queues[0];
|
2013-07-16 04:02:23 +07:00
|
|
|
if (!nvmeq) {
|
2014-12-23 02:59:04 +07:00
|
|
|
nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH);
|
2013-07-16 04:02:23 +07:00
|
|
|
if (!nvmeq)
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
2011-01-21 00:50:14 +07:00
|
|
|
|
|
|
|
aqa = nvmeq->q_depth - 1;
|
|
|
|
aqa |= aqa << 16;
|
|
|
|
|
2014-06-24 00:34:01 +07:00
|
|
|
dev->page_size = 1 << page_shift;
|
|
|
|
|
2014-06-23 21:24:36 +07:00
|
|
|
dev->ctrl_config = NVME_CC_CSS_NVM;
|
2014-06-24 00:34:01 +07:00
|
|
|
dev->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT;
|
2011-01-21 00:50:14 +07:00
|
|
|
dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;
|
2011-03-23 02:55:45 +07:00
|
|
|
dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
|
2011-01-21 00:50:14 +07:00
|
|
|
|
2015-11-20 14:58:10 +07:00
|
|
|
writel(aqa, dev->bar + NVME_REG_AQA);
|
|
|
|
lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ);
|
|
|
|
lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ);
|
2011-01-21 00:50:14 +07:00
|
|
|
|
2013-05-04 17:43:16 +07:00
|
|
|
result = nvme_enable_ctrl(dev, cap);
|
2013-05-02 02:07:51 +07:00
|
|
|
if (result)
|
2014-11-04 22:20:14 +07:00
|
|
|
goto free_nvmeq;
|
|
|
|
|
2014-12-23 02:59:04 +07:00
|
|
|
nvmeq->cq_vector = 0;
|
2014-01-28 03:57:22 +07:00
|
|
|
result = queue_request_irq(dev, nvmeq, nvmeq->irqname);
|
2015-07-01 00:22:52 +07:00
|
|
|
if (result) {
|
|
|
|
nvmeq->cq_vector = -1;
|
2015-01-08 08:55:50 +07:00
|
|
|
goto free_nvmeq;
|
2015-07-01 00:22:52 +07:00
|
|
|
}
|
2013-05-02 02:07:51 +07:00
|
|
|
|
2011-01-21 00:50:14 +07:00
|
|
|
return result;
|
2014-11-04 22:20:14 +07:00
|
|
|
|
|
|
|
free_nvmeq:
|
|
|
|
nvme_free_queues(dev, 0);
|
|
|
|
return result;
|
2011-01-21 00:50:14 +07:00
|
|
|
}
|
|
|
|
|
2011-02-02 04:13:29 +07:00
|
|
|
/*
 * Handle NVME_IOCTL_SUBMIT_IO: run one user-supplied read, write or compare
 * command on namespace @ns.
 *
 * The transfer length is (nblocks + 1) logical blocks.  When the namespace
 * has ns->ext set, the per-block metadata is counted into the data length;
 * otherwise a separate metadata buffer is bounced through a coherent DMA
 * allocation, copied in from user space before the command when bit 0 of
 * the opcode is set and copied back out after a successful command when it
 * is clear.
 *
 * Returns the command status from __nvme_submit_sync_cmd(), or -EFAULT,
 * -EINVAL (unsupported opcode, or a missing / not 4-byte aligned metadata
 * pointer) or -ENOMEM.
 */
static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
{
	struct nvme_dev *dev = to_nvme_dev(ns->ctrl);
	struct nvme_user_io io;
	struct nvme_command c;
	unsigned length, meta_len;
	int status, write;
	dma_addr_t meta_dma = 0;
	void *meta = NULL;
	void __user *metadata;

	if (copy_from_user(&io, uio, sizeof(io)))
		return -EFAULT;

	if (io.opcode != nvme_cmd_write &&
	    io.opcode != nvme_cmd_read &&
	    io.opcode != nvme_cmd_compare)
		return -EINVAL;

	write = io.opcode & 1;
	metadata = (void __user *)(uintptr_t)io.metadata;
	length = (io.nblocks + 1) << ns->lba_shift;
	meta_len = (io.nblocks + 1) * ns->ms;

	if (ns->ext) {
		length += meta_len;
		meta_len = 0;
	}

	if (meta_len) {
		if (!ns->ext && (!io.metadata || (io.metadata & 3)))
			return -EINVAL;

		meta = dma_alloc_coherent(dev->dev, meta_len,
						&meta_dma, GFP_KERNEL);
		if (!meta)
			return -ENOMEM;

		if (write && copy_from_user(meta, metadata, meta_len)) {
			status = -EFAULT;
			goto free_meta;
		}
	}

	memset(&c, 0, sizeof(c));
	c.rw.opcode = io.opcode;
	c.rw.flags = io.flags;
	c.rw.nsid = cpu_to_le32(ns->ns_id);
	c.rw.slba = cpu_to_le64(io.slba);
	c.rw.length = cpu_to_le16(io.nblocks);
	c.rw.control = cpu_to_le16(io.control);
	c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
	c.rw.reftag = cpu_to_le32(io.reftag);
	c.rw.apptag = cpu_to_le16(io.apptag);
	c.rw.appmask = cpu_to_le16(io.appmask);
	c.rw.metadata = cpu_to_le64(meta_dma);

	status = __nvme_submit_sync_cmd(ns->queue, &c, NULL,
			(void __user *)(uintptr_t)io.addr, length, NULL, 0);
	if (!meta)
		return status;

	if (status == NVME_SC_SUCCESS && !write &&
	    copy_to_user(metadata, meta, meta_len))
		status = -EFAULT;
 free_meta:
	dma_free_coherent(dev->dev, meta_len, meta, meta_dma);
	return status;
}
|
|
|
|
|
2015-11-26 16:06:56 +07:00
|
|
|
static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
|
2014-11-04 22:20:14 +07:00
|
|
|
struct nvme_passthru_cmd __user *ucmd)
|
2011-02-03 22:58:26 +07:00
|
|
|
{
|
2014-09-13 05:07:20 +07:00
|
|
|
struct nvme_passthru_cmd cmd;
|
2011-02-03 22:58:26 +07:00
|
|
|
struct nvme_command c;
|
2015-05-22 16:12:46 +07:00
|
|
|
unsigned timeout = 0;
|
|
|
|
int status;
|
2011-02-03 22:58:26 +07:00
|
|
|
|
2011-05-21 00:03:42 +07:00
|
|
|
if (!capable(CAP_SYS_ADMIN))
|
|
|
|
return -EACCES;
|
|
|
|
if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
|
2011-02-03 22:58:26 +07:00
|
|
|
return -EFAULT;
|
|
|
|
|
|
|
|
memset(&c, 0, sizeof(c));
|
2011-05-21 00:03:42 +07:00
|
|
|
c.common.opcode = cmd.opcode;
|
|
|
|
c.common.flags = cmd.flags;
|
|
|
|
c.common.nsid = cpu_to_le32(cmd.nsid);
|
|
|
|
c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
|
|
|
|
c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
|
|
|
|
c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
|
|
|
|
c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
|
|
|
|
c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
|
|
|
|
c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
|
|
|
|
c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
|
|
|
|
c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);
|
|
|
|
|
2015-05-22 16:12:46 +07:00
|
|
|
if (cmd.timeout_ms)
|
|
|
|
timeout = msecs_to_jiffies(cmd.timeout_ms);
|
2011-12-21 01:34:52 +07:00
|
|
|
|
2015-11-26 16:06:56 +07:00
|
|
|
status = __nvme_submit_sync_cmd(ns ? ns->queue : ctrl->admin_q, &c,
|
2015-10-07 03:29:48 +07:00
|
|
|
NULL, (void __user *)(uintptr_t)cmd.addr, cmd.data_len,
|
2015-05-22 16:12:46 +07:00
|
|
|
&cmd.result, timeout);
|
|
|
|
if (status >= 0) {
|
|
|
|
if (put_user(cmd.result, &ucmd->result))
|
|
|
|
return -EFAULT;
|
2011-05-21 00:03:42 +07:00
|
|
|
}
|
2012-09-21 23:49:05 +07:00
|
|
|
|
2011-02-03 22:58:26 +07:00
|
|
|
return status;
|
|
|
|
}
|
|
|
|
|
2015-08-11 04:20:41 +07:00
|
|
|
static int nvme_subsys_reset(struct nvme_dev *dev)
|
|
|
|
{
|
|
|
|
if (!dev->subsystem)
|
|
|
|
return -ENOTTY;
|
|
|
|
|
2015-11-20 14:58:10 +07:00
|
|
|
writel(0x4E564D65, dev->bar + NVME_REG_NSSR); /* "NVMe" */
|
2015-08-11 04:20:41 +07:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2011-01-21 00:50:14 +07:00
|
|
|
/*
 * Block device ioctl entry point: dispatch the NVMe specific commands and
 * the two supported SG commands to their handlers.  Anything else gets
 * -ENOTTY.
 */
static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
							unsigned long arg)
{
	struct nvme_ns *ns = bdev->bd_disk->private_data;
	void __user *argp = (void __user *)arg;

	switch (cmd) {
	case NVME_IOCTL_ID:
		/* The namespace id itself is the return value. */
		force_successful_syscall_return();
		return ns->ns_id;
	case NVME_IOCTL_ADMIN_CMD:
		return nvme_user_cmd(ns->ctrl, NULL, argp);
	case NVME_IOCTL_IO_CMD:
		return nvme_user_cmd(ns->ctrl, ns, argp);
	case NVME_IOCTL_SUBMIT_IO:
		return nvme_submit_io(ns, argp);
	case SG_GET_VERSION_NUM:
		return nvme_sg_get_version_num(argp);
	case SG_IO:
		return nvme_sg_io(ns, argp);
	default:
		return -ENOTTY;
	}
}
|
|
|
|
|
2013-10-24 02:07:34 +07:00
|
|
|
#ifdef CONFIG_COMPAT
/*
 * Compat ioctl entry point: SG_IO is answered with -ENOIOCTLCMD here,
 * everything else is forwarded unchanged to nvme_ioctl().
 */
static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode,
					unsigned int cmd, unsigned long arg)
{
	if (cmd == SG_IO)
		return -ENOIOCTLCMD;

	return nvme_ioctl(bdev, mode, cmd, arg);
}
#else
#define nvme_compat_ioctl	NULL
#endif
|
|
|
|
|
2015-10-02 23:37:28 +07:00
|
|
|
static void nvme_free_dev(struct kref *kref);
|
2015-10-02 06:14:10 +07:00
|
|
|
static void nvme_free_ns(struct kref *kref)
|
|
|
|
{
|
|
|
|
struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
|
2015-11-26 16:06:56 +07:00
|
|
|
struct nvme_dev *dev = to_nvme_dev(ns->ctrl);
|
2015-10-02 06:14:10 +07:00
|
|
|
|
2015-10-29 15:57:29 +07:00
|
|
|
if (ns->type == NVME_NS_LIGHTNVM)
|
|
|
|
nvme_nvm_unregister(ns->queue, ns->disk->disk_name);
|
|
|
|
|
2015-10-02 06:14:10 +07:00
|
|
|
spin_lock(&dev_list_lock);
|
|
|
|
ns->disk->private_data = NULL;
|
|
|
|
spin_unlock(&dev_list_lock);
|
|
|
|
|
2015-11-26 16:06:56 +07:00
|
|
|
kref_put(&dev->kref, nvme_free_dev);
|
2015-10-02 06:14:10 +07:00
|
|
|
put_disk(ns->disk);
|
|
|
|
kfree(ns);
|
|
|
|
}
|
|
|
|
|
2014-02-01 06:53:39 +07:00
|
|
|
/*
 * Open the block device: take a reference on the namespace behind the disk.
 * Returns 0 on success, or -ENXIO when the disk no longer points at a
 * namespace or the namespace's reference count has already dropped to zero.
 */
static int nvme_open(struct block_device *bdev, fmode_t mode)
{
	struct nvme_ns *ns;
	int ret = -ENXIO;

	spin_lock(&dev_list_lock);
	ns = bdev->bd_disk->private_data;
	if (ns && kref_get_unless_zero(&ns->kref))
		ret = 0;
	spin_unlock(&dev_list_lock);

	return ret;
}
|
|
|
|
|
|
|
|
/*
 * Release the block device: drop one reference on the namespace behind
 * @disk; nvme_free_ns() runs when that was the last one.
 */
static void nvme_release(struct gendisk *disk, fmode_t mode)
{
	struct nvme_ns *namespace = disk->private_data;

	kref_put(&namespace->kref, nvme_free_ns);
}
|
|
|
|
|
2014-04-03 04:45:37 +07:00
|
|
|
static int nvme_getgeo(struct block_device *bd, struct hd_geometry *geo)
|
|
|
|
{
|
|
|
|
/* some standard values */
|
|
|
|
geo->heads = 1 << 6;
|
|
|
|
geo->sectors = 1 << 5;
|
|
|
|
geo->cylinders = get_capacity(bd->bd_disk) >> 11;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
NVMe: Metadata format support
Adds support for NVMe metadata formats and exposes block devices for
all namespaces regardless of their format. Namespace formats that are
unusable will have disk capacity set to 0, but a handle to the block
device is created to simplify device management. A namespace is not
usable when the format requires the host to interleave block data and
metadata in a single buffer, has no provisioned storage, or has metadata
but failed to register with blk integrity.
The namespace has to be scanned in two phases to support separate
metadata formats. The first establishes the sector size and capacity
prior to invoking add_disk. If metadata is required, the capacity will
be temporarily set to 0 until it can be revalidated and registered with
the integrity extensions after add_disk completes.
The driver relies on the integrity extensions to provide the metadata
buffer. NVMe requires this be a single physically contiguous region,
so only one integrity segment is allowed per command. If the metadata
is used for T10 PI, the driver provides mappings to save and restore
the reftag physical block translation. The driver provides no-op
functions for generate and verify if metadata is not used for protection
information. This way the setup is always provided by the block layer.
If a request does not supply a required metadata buffer, the command
is failed with bad address. This could only happen if a user manually
disables verify/generate on such a disk. The only exception to where
this is okay is if the controller is capable of stripping/generating
the metadata, which is possible on some types of formats.
The metadata scatter gather list now occupies the spot in the nvme_iod
that used to be used to link retryable IOD's, but we don't do that
anymore, so the field was unused.
Signed-off-by: Keith Busch <keith.busch@intel.com>
2015-02-20 03:39:03 +07:00
|
|
|
static void nvme_config_discard(struct nvme_ns *ns)
|
|
|
|
{
|
|
|
|
u32 logical_block_size = queue_logical_block_size(ns->queue);
|
|
|
|
ns->queue->limits.discard_zeroes_data = 0;
|
|
|
|
ns->queue->limits.discard_alignment = logical_block_size;
|
|
|
|
ns->queue->limits.discard_granularity = logical_block_size;
|
2015-07-14 21:15:12 +07:00
|
|
|
blk_queue_max_discard_sectors(ns->queue, 0xffffffff);
|
NVMe: Metadata format support
Adds support for NVMe metadata formats and exposes block devices for
all namespaces regardless of their format. Namespace formats that are
unusable will have disk capacity set to 0, but a handle to the block
device is created to simplify device management. A namespace is not
usable when the format requires host interleave block and metadata in
single buffer, has no provisioned storage, or has better data but failed
to register with blk integrity.
The namespace has to be scanned in two phases to support separate
metadata formats. The first establishes the sector size and capacity
prior to invoking add_disk. If metadata is required, the capacity will
be temporarilly set to 0 until it can be revalidated and registered with
the integrity extenstions after add_disk completes.
The driver relies on the integrity extensions to provide the metadata
buffer. NVMe requires this be a single physically contiguous region,
so only one integrity segment is allowed per command. If the metadata
is used for T10 PI, the driver provides mappings to save and restore
the reftag physical block translation. The driver provides no-op
functions for generate and verify if metadata is not used for protection
information. This way the setup is always provided by the block layer.
If a request does not supply a required metadata buffer, the command
is failed with bad address. This could only happen if a user manually
disables verify/generate on such a disk. The only exception to where
this is okay is if the controller is capable of stripping/generating
the metadata, which is possible on some types of formats.
The metadata scatter gather list now occupies the spot in the nvme_iod
that used to be used to link retryable IOD's, but we don't do that
anymore, so the field was unused.
Signed-off-by: Keith Busch <keith.busch@intel.com>
2015-02-20 03:39:03 +07:00
|
|
|
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
|
|
|
|
}
|
|
|
|
|
2014-09-11 06:21:14 +07:00
|
|
|
static int nvme_revalidate_disk(struct gendisk *disk)
|
|
|
|
{
|
|
|
|
struct nvme_ns *ns = disk->private_data;
|
2015-11-26 16:06:56 +07:00
|
|
|
struct nvme_dev *dev = to_nvme_dev(ns->ctrl);
|
2014-09-11 06:21:14 +07:00
|
|
|
struct nvme_id_ns *id;
|
2015-04-08 05:57:19 +07:00
|
|
|
u8 lbaf, pi_type;
|
|
|
|
u16 old_ms;
|
NVMe: Metadata format support
Adds support for NVMe metadata formats and exposes block devices for
all namespaces regardless of their format. Namespace formats that are
unusable will have disk capacity set to 0, but a handle to the block
device is created to simplify device management. A namespace is not
usable when the format requires the host to interleave block data and
metadata in a single buffer, has no provisioned storage, or has metadata
but failed to register with blk integrity.
The namespace has to be scanned in two phases to support separate
metadata formats. The first establishes the sector size and capacity
prior to invoking add_disk. If metadata is required, the capacity will
be temporarily set to 0 until it can be revalidated and registered with
the integrity extensions after add_disk completes.
The driver relies on the integrity extensions to provide the metadata
buffer. NVMe requires this be a single physically contiguous region,
so only one integrity segment is allowed per command. If the metadata
is used for T10 PI, the driver provides mappings to save and restore
the reftag physical block translation. The driver provides no-op
functions for generate and verify if metadata is not used for protection
information. This way the setup is always provided by the block layer.
If a request does not supply a required metadata buffer, the command
is failed with bad address. This could only happen if a user manually
disables verify/generate on such a disk. The only exception to where
this is okay is if the controller is capable of stripping/generating
the metadata, which is possible on some types of formats.
The metadata scatter gather list now occupies the spot in the nvme_iod
that used to be used to link retryable IOD's, but we don't do that
anymore, so the field was unused.
Signed-off-by: Keith Busch <keith.busch@intel.com>
2015-02-20 03:39:03 +07:00
|
|
|
unsigned short bs;
|
2014-09-11 06:21:14 +07:00
|
|
|
|
2015-11-26 16:06:56 +07:00
|
|
|
if (nvme_identify_ns(&dev->ctrl, ns->ns_id, &id)) {
|
2015-06-02 03:28:14 +07:00
|
|
|
dev_warn(dev->dev, "%s: Identify failure nvme%dn%d\n", __func__,
|
2015-11-26 16:06:56 +07:00
|
|
|
dev->ctrl.instance, ns->ns_id);
|
2015-06-02 03:28:14 +07:00
|
|
|
return -ENODEV;
|
2014-09-11 06:21:14 +07:00
|
|
|
}
|
2015-06-02 03:28:14 +07:00
|
|
|
if (id->ncap == 0) {
|
|
|
|
kfree(id);
|
|
|
|
return -ENODEV;
|
NVMe: Metadata format support
Adds support for NVMe metadata formats and exposes block devices for
all namespaces regardless of their format. Namespace formats that are
unusable will have disk capacity set to 0, but a handle to the block
device is created to simplify device management. A namespace is not
usable when the format requires host interleave block and metadata in
single buffer, has no provisioned storage, or has better data but failed
to register with blk integrity.
The namespace has to be scanned in two phases to support separate
metadata formats. The first establishes the sector size and capacity
prior to invoking add_disk. If metadata is required, the capacity will
be temporarily set to 0 until it can be revalidated and registered with
the integrity extensions after add_disk completes.
The driver relies on the integrity extensions to provide the metadata
buffer. NVMe requires this be a single physically contiguous region,
so only one integrity segment is allowed per command. If the metadata
is used for T10 PI, the driver provides mappings to save and restore
the reftag physical block translation. The driver provides no-op
functions for generate and verify if metadata is not used for protection
information. This way the setup is always provided by the block layer.
If a request does not supply a required metadata buffer, the command
is failed with bad address. This could only happen if a user manually
disables verify/generate on such a disk. The only exception to where
this is okay is if the controller is capable of stripping/generating
the metadata, which is possible on some types of formats.
The metadata scatter gather list now occupies the spot in the nvme_iod
that used to be used to link retryable IOD's, but we don't do that
anymore, so the field was unused.
Signed-off-by: Keith Busch <keith.busch@intel.com>
2015-02-20 03:39:03 +07:00
|
|
|
}
|
2014-09-11 06:21:14 +07:00
|
|
|
|
2015-10-29 15:57:29 +07:00
|
|
|
if (nvme_nvm_ns_supported(ns, id) && ns->type != NVME_NS_LIGHTNVM) {
|
|
|
|
if (nvme_nvm_register(ns->queue, disk->disk_name)) {
|
|
|
|
dev_warn(dev->dev,
|
|
|
|
"%s: LightNVM init failure\n", __func__);
|
|
|
|
kfree(id);
|
|
|
|
return -ENODEV;
|
|
|
|
}
|
|
|
|
ns->type = NVME_NS_LIGHTNVM;
|
|
|
|
}
|
|
|
|
|
NVMe: Metadata format support
Adds support for NVMe metadata formats and exposes block devices for
all namespaces regardless of their format. Namespace formats that are
unusable will have disk capacity set to 0, but a handle to the block
device is created to simplify device management. A namespace is not
usable when the format requires host interleave block and metadata in
single buffer, has no provisioned storage, or has better data but failed
to register with blk integrity.
The namespace has to be scanned in two phases to support separate
metadata formats. The first establishes the sector size and capacity
prior to invoking add_disk. If metadata is required, the capacity will
be temporarily set to 0 until it can be revalidated and registered with
the integrity extensions after add_disk completes.
The driver relies on the integrity extensions to provide the metadata
buffer. NVMe requires this be a single physically contiguous region,
so only one integrity segment is allowed per command. If the metadata
is used for T10 PI, the driver provides mappings to save and restore
the reftag physical block translation. The driver provides no-op
functions for generate and verify if metadata is not used for protection
information. This way the setup is always provided by the block layer.
If a request does not supply a required metadata buffer, the command
is failed with bad address. This could only happen if a user manually
disables verify/generate on such a disk. The only exception to where
this is okay is if the controller is capable of stripping/generating
the metadata, which is possible on some types of formats.
The metadata scatter gather list now occupies the spot in the nvme_iod
that used to be used to link retryable IOD's, but we don't do that
anymore, so the field was unused.
Signed-off-by: Keith Busch <keith.busch@intel.com>
2015-02-20 03:39:03 +07:00
|
|
|
old_ms = ns->ms;
|
|
|
|
lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
|
2014-09-11 06:21:14 +07:00
|
|
|
ns->lba_shift = id->lbaf[lbaf].ds;
|
NVMe: Metadata format support
Adds support for NVMe metadata formats and exposes block devices for
all namespaces regardless of their format. Namespace formats that are
unusable will have disk capacity set to 0, but a handle to the block
device is created to simplify device management. A namespace is not
usable when the format requires host interleave block and metadata in
single buffer, has no provisioned storage, or has better data but failed
to register with blk integrity.
The namespace has to be scanned in two phases to support separate
metadata formats. The first establishes the sector size and capacity
prior to invoking add_disk. If metadata is required, the capacity will
be temporarily set to 0 until it can be revalidated and registered with
the integrity extensions after add_disk completes.
The driver relies on the integrity extensions to provide the metadata
buffer. NVMe requires this be a single physically contiguous region,
so only one integrity segment is allowed per command. If the metadata
is used for T10 PI, the driver provides mappings to save and restore
the reftag physical block translation. The driver provides no-op
functions for generate and verify if metadata is not used for protection
information. This way the setup is always provided by the block layer.
If a request does not supply a required metadata buffer, the command
is failed with bad address. This could only happen if a user manually
disables verify/generate on such a disk. The only exception to where
this is okay is if the controller is capable of stripping/generating
the metadata, which is possible on some types of formats.
The metadata scatter gather list now occupies the spot in the nvme_iod
that used to be used to link retryable IOD's, but we don't do that
anymore, so the field was unused.
Signed-off-by: Keith Busch <keith.busch@intel.com>
2015-02-20 03:39:03 +07:00
|
|
|
ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
|
2015-04-08 05:57:19 +07:00
|
|
|
ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);
|
NVMe: Metadata format support
Adds support for NVMe metadata formats and exposes block devices for
all namespaces regardless of their format. Namespace formats that are
unusable will have disk capacity set to 0, but a handle to the block
device is created to simplify device management. A namespace is not
usable when the format requires host interleave block and metadata in
single buffer, has no provisioned storage, or has better data but failed
to register with blk integrity.
The namespace has to be scanned in two phases to support separate
metadata formats. The first establishes the sector size and capacity
prior to invoking add_disk. If metadata is required, the capacity will
be temporarily set to 0 until it can be revalidated and registered with
the integrity extensions after add_disk completes.
The driver relies on the integrity extensions to provide the metadata
buffer. NVMe requires this be a single physically contiguous region,
so only one integrity segment is allowed per command. If the metadata
is used for T10 PI, the driver provides mappings to save and restore
the reftag physical block translation. The driver provides no-op
functions for generate and verify if metadata is not used for protection
information. This way the setup is always provided by the block layer.
If a request does not supply a required metadata buffer, the command
is failed with bad address. This could only happen if a user manually
disables verify/generate on such a disk. The only exception to where
this is okay is if the controller is capable of stripping/generating
the metadata, which is possible on some types of formats.
The metadata scatter gather list now occupies the spot in the nvme_iod
that used to be used to link retryable IOD's, but we don't do that
anymore, so the field was unused.
Signed-off-by: Keith Busch <keith.busch@intel.com>
2015-02-20 03:39:03 +07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If identify namespace failed, use default 512 byte block size so
|
|
|
|
* block layer can use before failing read/write for 0 capacity.
|
|
|
|
*/
|
|
|
|
if (ns->lba_shift == 0)
|
|
|
|
ns->lba_shift = 9;
|
|
|
|
bs = 1 << ns->lba_shift;
|
|
|
|
|
|
|
|
/* XXX: PI implementation requires metadata equal t10 pi tuple size */
|
|
|
|
pi_type = ns->ms == sizeof(struct t10_pi_tuple) ?
|
|
|
|
id->dps & NVME_NS_DPS_PI_MASK : 0;
|
|
|
|
|
2015-10-22 00:20:07 +07:00
|
|
|
blk_mq_freeze_queue(disk->queue);
|
2015-02-23 23:16:21 +07:00
|
|
|
if (blk_get_integrity(disk) && (ns->pi_type != pi_type ||
|
|
|
|
ns->ms != old_ms ||
|
NVMe: Metadata format support
Adds support for NVMe metadata formats and exposes block devices for
all namespaces regardless of their format. Namespace formats that are
unusable will have disk capacity set to 0, but a handle to the block
device is created to simplify device management. A namespace is not
usable when the format requires host interleave block and metadata in
single buffer, has no provisioned storage, or has better data but failed
to register with blk integrity.
The namespace has to be scanned in two phases to support separate
metadata formats. The first establishes the sector size and capacity
prior to invoking add_disk. If metadata is required, the capacity will
be temporarily set to 0 until it can be revalidated and registered with
the integrity extensions after add_disk completes.
The driver relies on the integrity extensions to provide the metadata
buffer. NVMe requires this be a single physically contiguous region,
so only one integrity segment is allowed per command. If the metadata
is used for T10 PI, the driver provides mappings to save and restore
the reftag physical block translation. The driver provides no-op
functions for generate and verify if metadata is not used for protection
information. This way the setup is always provided by the block layer.
If a request does not supply a required metadata buffer, the command
is failed with bad address. This could only happen if a user manually
disables verify/generate on such a disk. The only exception to where
this is okay is if the controller is capable of stripping/generating
the metadata, which is possible on some types of formats.
The metadata scatter gather list now occupies the spot in the nvme_iod
that used to be used to link retryable IOD's, but we don't do that
anymore, so the field was unused.
Signed-off-by: Keith Busch <keith.busch@intel.com>
2015-02-20 03:39:03 +07:00
|
|
|
bs != queue_logical_block_size(disk->queue) ||
|
2015-04-08 05:57:19 +07:00
|
|
|
(ns->ms && ns->ext)))
|
NVMe: Metadata format support
Adds support for NVMe metadata formats and exposes block devices for
all namespaces regardless of their format. Namespace formats that are
unusable will have disk capacity set to 0, but a handle to the block
device is created to simplify device management. A namespace is not
usable when the format requires host interleave block and metadata in
single buffer, has no provisioned storage, or has better data but failed
to register with blk integrity.
The namespace has to be scanned in two phases to support separate
metadata formats. The first establishes the sector size and capacity
prior to invoking add_disk. If metadata is required, the capacity will
be temporarily set to 0 until it can be revalidated and registered with
the integrity extensions after add_disk completes.
The driver relies on the integrity extensions to provide the metadata
buffer. NVMe requires this be a single physically contiguous region,
so only one integrity segment is allowed per command. If the metadata
is used for T10 PI, the driver provides mappings to save and restore
the reftag physical block translation. The driver provides no-op
functions for generate and verify if metadata is not used for protection
information. This way the setup is always provided by the block layer.
If a request does not supply a required metadata buffer, the command
is failed with bad address. This could only happen if a user manually
disables verify/generate on such a disk. The only exception to where
this is okay is if the controller is capable of stripping/generating
the metadata, which is possible on some types of formats.
The metadata scatter gather list now occupies the spot in the nvme_iod
that used to be used to link retryable IOD's, but we don't do that
anymore, so the field was unused.
Signed-off-by: Keith Busch <keith.busch@intel.com>
2015-02-20 03:39:03 +07:00
|
|
|
blk_integrity_unregister(disk);
|
|
|
|
|
|
|
|
ns->pi_type = pi_type;
|
|
|
|
blk_queue_logical_block_size(ns->queue, bs);
|
|
|
|
|
2015-10-22 00:19:49 +07:00
|
|
|
if (ns->ms && !ns->ext)
|
NVMe: Metadata format support
Adds support for NVMe metadata formats and exposes block devices for
all namespaces regardless of their format. Namespace formats that are
unusable will have disk capacity set to 0, but a handle to the block
device is created to simplify device management. A namespace is not
usable when the format requires host interleave block and metadata in
single buffer, has no provisioned storage, or has better data but failed
to register with blk integrity.
The namespace has to be scanned in two phases to support separate
metadata formats. The first establishes the sector size and capacity
prior to invoking add_disk. If metadata is required, the capacity will
be temporarily set to 0 until it can be revalidated and registered with
the integrity extensions after add_disk completes.
The driver relies on the integrity extensions to provide the metadata
buffer. NVMe requires this be a single physically contiguous region,
so only one integrity segment is allowed per command. If the metadata
is used for T10 PI, the driver provides mappings to save and restore
the reftag physical block translation. The driver provides no-op
functions for generate and verify if metadata is not used for protection
information. This way the setup is always provided by the block layer.
If a request does not supply a required metadata buffer, the command
is failed with bad address. This could only happen if a user manually
disables verify/generate on such a disk. The only exception to where
this is okay is if the controller is capable of stripping/generating
the metadata, which is possible on some types of formats.
The metadata scatter gather list now occupies the spot in the nvme_iod
that used to be used to link retryable IOD's, but we don't do that
anymore, so the field was unused.
Signed-off-by: Keith Busch <keith.busch@intel.com>
2015-02-20 03:39:03 +07:00
|
|
|
nvme_init_integrity(ns);
|
|
|
|
|
2015-10-29 15:57:29 +07:00
|
|
|
if ((ns->ms && !(ns->ms == 8 && ns->pi_type) &&
|
|
|
|
!blk_get_integrity(disk)) ||
|
|
|
|
ns->type == NVME_NS_LIGHTNVM)
|
NVMe: Metadata format support
Adds support for NVMe metadata formats and exposes block devices for
all namespaces regardless of their format. Namespace formats that are
unusable will have disk capacity set to 0, but a handle to the block
device is created to simplify device management. A namespace is not
usable when the format requires host interleave block and metadata in
single buffer, has no provisioned storage, or has better data but failed
to register with blk integrity.
The namespace has to be scanned in two phases to support separate
metadata formats. The first establishes the sector size and capacity
prior to invoking add_disk. If metadata is required, the capacity will
be temporarily set to 0 until it can be revalidated and registered with
the integrity extensions after add_disk completes.
The driver relies on the integrity extensions to provide the metadata
buffer. NVMe requires this be a single physically contiguous region,
so only one integrity segment is allowed per command. If the metadata
is used for T10 PI, the driver provides mappings to save and restore
the reftag physical block translation. The driver provides no-op
functions for generate and verify if metadata is not used for protection
information. This way the setup is always provided by the block layer.
If a request does not supply a required metadata buffer, the command
is failed with bad address. This could only happen if a user manually
disables verify/generate on such a disk. The only exception to where
this is okay is if the controller is capable of stripping/generating
the metadata, which is possible on some types of formats.
The metadata scatter gather list now occupies the spot in the nvme_iod
that used to be used to link retryable IOD's, but we don't do that
anymore, so the field was unused.
Signed-off-by: Keith Busch <keith.busch@intel.com>
2015-02-20 03:39:03 +07:00
|
|
|
set_capacity(disk, 0);
|
|
|
|
else
|
|
|
|
set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
|
|
|
|
|
2015-11-26 16:06:56 +07:00
|
|
|
if (dev->ctrl.oncs & NVME_CTRL_ONCS_DSM)
|
NVMe: Metadata format support
Adds support for NVMe metadata formats and exposes block devices for
all namespaces regardless of their format. Namespace formats that are
unusable will have disk capacity set to 0, but a handle to the block
device is created to simplify device management. A namespace is not
usable when the format requires host interleave block and metadata in
single buffer, has no provisioned storage, or has better data but failed
to register with blk integrity.
The namespace has to be scanned in two phases to support separate
metadata formats. The first establishes the sector size and capacity
prior to invoking add_disk. If metadata is required, the capacity will
be temporarily set to 0 until it can be revalidated and registered with
the integrity extensions after add_disk completes.
The driver relies on the integrity extensions to provide the metadata
buffer. NVMe requires this be a single physically contiguous region,
so only one integrity segment is allowed per command. If the metadata
is used for T10 PI, the driver provides mappings to save and restore
the reftag physical block translation. The driver provides no-op
functions for generate and verify if metadata is not used for protection
information. This way the setup is always provided by the block layer.
If a request does not supply a required metadata buffer, the command
is failed with bad address. This could only happen if a user manually
disables verify/generate on such a disk. The only exception to where
this is okay is if the controller is capable of stripping/generating
the metadata, which is possible on some types of formats.
The metadata scatter gather list now occupies the spot in the nvme_iod
that used to be used to link retryable IOD's, but we don't do that
anymore, so the field was unused.
Signed-off-by: Keith Busch <keith.busch@intel.com>
2015-02-20 03:39:03 +07:00
|
|
|
nvme_config_discard(ns);
|
2015-10-22 00:20:07 +07:00
|
|
|
blk_mq_unfreeze_queue(disk->queue);
|
2014-09-11 06:21:14 +07:00
|
|
|
|
2015-05-22 16:12:46 +07:00
|
|
|
kfree(id);
|
2014-09-11 06:21:14 +07:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2015-10-15 19:10:52 +07:00
|
|
|
static char nvme_pr_type(enum pr_type type)
|
|
|
|
{
|
|
|
|
switch (type) {
|
|
|
|
case PR_WRITE_EXCLUSIVE:
|
|
|
|
return 1;
|
|
|
|
case PR_EXCLUSIVE_ACCESS:
|
|
|
|
return 2;
|
|
|
|
case PR_WRITE_EXCLUSIVE_REG_ONLY:
|
|
|
|
return 3;
|
|
|
|
case PR_EXCLUSIVE_ACCESS_REG_ONLY:
|
|
|
|
return 4;
|
|
|
|
case PR_WRITE_EXCLUSIVE_ALL_REGS:
|
|
|
|
return 5;
|
|
|
|
case PR_EXCLUSIVE_ACCESS_ALL_REGS:
|
|
|
|
return 6;
|
|
|
|
default:
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * Build a reservation command for the namespace behind @bdev and submit
 * it synchronously on that namespace's request queue.
 *
 * @bdev:   block device; its gendisk private_data is used as the nvme_ns
 * @cdw10:  value for command dword 10, in CPU byte order
 * @key:    written little-endian into payload bytes 0-7
 * @sa_key: written little-endian into payload bytes 8-15
 * @op:     opcode stored in the command
 *
 * Returns the result of nvme_submit_sync_cmd().
 */
static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
				u64 key, u64 sa_key, u8 op)
{
	struct nvme_ns *ns = bdev->bd_disk->private_data;
	struct nvme_command cmd;
	u8 payload[16] = { 0, };

	/* Start from an all-zero command so unused fields are cleared. */
	memset(&cmd, 0, sizeof(cmd));
	cmd.common.opcode = op;
	cmd.common.nsid = cpu_to_le32(ns->ns_id);
	cmd.common.cdw10[0] = cpu_to_le32(cdw10);

	/* 16-byte data buffer: first key, then second key. */
	put_unaligned_le64(key, &payload[0]);
	put_unaligned_le64(sa_key, &payload[8]);

	return nvme_submit_sync_cmd(ns->queue, &cmd, payload, sizeof(payload));
}
|
|
|
|
|
|
|
|
/*
 * pr_ops->pr_register handler: send a reservation register command.
 *
 * @bdev:  block device to operate on
 * @old:   current key; when non-zero the low action bits of cdw10 are set
 *         to 2, otherwise 0 (presumably "replace" vs. "register" - confirm
 *         against the NVMe Reservation Register command definition)
 * @new:   new key, passed as the second key of the command payload
 * @flags: only PR_FL_IGNORE_KEY is accepted
 *
 * Returns -EOPNOTSUPP if any flag other than PR_FL_IGNORE_KEY is set,
 * otherwise the result of nvme_pr_command().
 */
static int nvme_pr_register(struct block_device *bdev, u64 old,
		u64 new, unsigned flags)
{
	u32 cdw10;

	if (flags & ~PR_FL_IGNORE_KEY)
		return -EOPNOTSUPP;

	cdw10 = old ? 2 : 0;
	cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0;
	/*
	 * Set bits 31:30.  Use unsigned constants: shifting a signed 1 into
	 * bit 31 is undefined behaviour in ISO C.
	 */
	cdw10 |= (1U << 30) | (1U << 31); /* PTPL=1 */
	return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register);
}
|
|
|
|
|
|
|
|
/*
 * pr_ops->pr_reserve handler: send a reservation acquire command.
 *
 * @bdev:  block device to operate on
 * @key:   reservation key, passed as the first key of the payload
 * @type:  reservation type; its nvme_pr_type() code goes in cdw10 bits 8+
 * @flags: only PR_FL_IGNORE_KEY is accepted; when set, cdw10 bit 3 is set
 *
 * Returns -EOPNOTSUPP if any flag other than PR_FL_IGNORE_KEY is set,
 * otherwise the result of nvme_pr_command().
 */
static int nvme_pr_reserve(struct block_device *bdev, u64 key,
		enum pr_type type, unsigned flags)
{
	u32 cdw10;

	if (flags & ~PR_FL_IGNORE_KEY)
		return -EOPNOTSUPP;

	cdw10 = nvme_pr_type(type) << 8;
	if (flags & PR_FL_IGNORE_KEY)
		cdw10 |= 1 << 3;

	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire);
}
|
|
|
|
|
|
|
|
/*
 * pr_ops->pr_preempt handler: send a reservation acquire command with a
 * preempt action.
 *
 * @bdev:  block device to operate on
 * @old:   first key of the command payload
 * @new:   second key of the command payload
 * @type:  reservation type; its nvme_pr_type() code goes in cdw10 bits 8+
 * @abort: selects action value 2 when true, 1 when false, in the low bits
 *         of cdw10
 *
 * Returns the result of nvme_pr_command().
 */
static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new,
		enum pr_type type, bool abort)
{
	/*
	 * The conditional must be parenthesised: "?:" binds more loosely
	 * than "|", so without the parentheses the whole "type << 8 | abort"
	 * expression becomes the condition and the type bits are lost.
	 */
	u32 cdw10 = nvme_pr_type(type) << 8 | (abort ? 2 : 1);

	return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire);
}
|
|
|
|
|
|
|
|
/*
 * pr_ops->pr_clear handler.
 *
 * Sends a nvme_cmd_resv_register command with the low bit of cdw10 set,
 * plus bit 3 when @key is non-zero.  @key is passed as the first key of
 * the payload; the second key is 0.
 *
 * Returns the result of nvme_pr_command().
 */
static int nvme_pr_clear(struct block_device *bdev, u64 key)
{
	u32 cdw10 = 1;

	if (key)
		cdw10 |= 1 << 3;

	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register);
}
|
|
|
|
|
|
|
|
/*
 * pr_ops->pr_release handler: send a reservation release command.
 *
 * @bdev: block device to operate on
 * @key:  reservation key, passed as the first key of the payload; when
 *        non-zero, cdw10 bit 3 is also set (same convention as
 *        nvme_pr_clear() in this file)
 * @type: reservation type; its nvme_pr_type() code goes in cdw10 bits 8+
 *
 * Returns the result of nvme_pr_command().
 */
static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
{
	/*
	 * The conditional must be parenthesised: "?:" binds more loosely
	 * than "|", so without the parentheses the whole "type << 8 | key"
	 * expression becomes the condition and the type bits are lost.
	 */
	u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 1 << 3 : 0);

	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
}
|
|
|
|
|
|
|
|
static const struct pr_ops nvme_pr_ops = {
|
|
|
|
.pr_register = nvme_pr_register,
|
|
|
|
.pr_reserve = nvme_pr_reserve,
|
|
|
|
.pr_release = nvme_pr_release,
|
|
|
|
.pr_preempt = nvme_pr_preempt,
|
|
|
|
.pr_clear = nvme_pr_clear,
|
|
|
|
};
|
|
|
|
|
2011-01-21 00:50:14 +07:00
|
|
|
static const struct block_device_operations nvme_fops = {
|
|
|
|
.owner = THIS_MODULE,
|
|
|
|
.ioctl = nvme_ioctl,
|
2013-10-24 02:07:34 +07:00
|
|
|
.compat_ioctl = nvme_compat_ioctl,
|
2014-02-01 06:53:39 +07:00
|
|
|
.open = nvme_open,
|
|
|
|
.release = nvme_release,
|
2014-04-03 04:45:37 +07:00
|
|
|
.getgeo = nvme_getgeo,
|
2014-09-11 06:21:14 +07:00
|
|
|
.revalidate_disk= nvme_revalidate_disk,
|
2015-10-15 19:10:52 +07:00
|
|
|
.pr_ops = &nvme_pr_ops,
|
2011-01-21 00:50:14 +07:00
|
|
|
};
|
|
|
|
|
2011-03-03 06:37:18 +07:00
|
|
|
static int nvme_kthread(void *data)
|
|
|
|
{
|
2013-12-11 03:10:37 +07:00
|
|
|
struct nvme_dev *dev, *next;
|
2011-03-03 06:37:18 +07:00
|
|
|
|
|
|
|
while (!kthread_should_stop()) {
|
2013-05-02 03:38:23 +07:00
|
|
|
set_current_state(TASK_INTERRUPTIBLE);
|
2011-03-03 06:37:18 +07:00
|
|
|
spin_lock(&dev_list_lock);
|
2013-12-11 03:10:37 +07:00
|
|
|
list_for_each_entry_safe(dev, next, &dev_list, node) {
|
2011-03-03 06:37:18 +07:00
|
|
|
int i;
|
2015-11-20 14:58:10 +07:00
|
|
|
u32 csts = readl(dev->bar + NVME_REG_CSTS);
|
2015-08-11 04:20:40 +07:00
|
|
|
|
|
|
|
if ((dev->subsystem && (csts & NVME_CSTS_NSSRO)) ||
|
|
|
|
csts & NVME_CSTS_CFS) {
|
2015-10-02 23:49:23 +07:00
|
|
|
if (!__nvme_reset(dev)) {
|
|
|
|
dev_warn(dev->dev,
|
|
|
|
"Failed status: %x, reset controller\n",
|
2015-11-20 14:58:10 +07:00
|
|
|
readl(dev->bar + NVME_REG_CSTS));
|
2015-10-02 23:49:23 +07:00
|
|
|
}
|
2013-12-11 03:10:37 +07:00
|
|
|
continue;
|
|
|
|
}
|
2011-03-03 06:37:18 +07:00
|
|
|
for (i = 0; i < dev->queue_count; i++) {
|
2014-11-04 22:20:14 +07:00
|
|
|
struct nvme_queue *nvmeq = dev->queues[i];
|
2011-02-16 04:28:20 +07:00
|
|
|
if (!nvmeq)
|
|
|
|
continue;
|
2011-03-03 06:37:18 +07:00
|
|
|
spin_lock_irq(&nvmeq->q_lock);
|
2013-06-24 22:56:42 +07:00
|
|
|
nvme_process_cq(nvmeq);
|
2014-06-19 02:58:57 +07:00
|
|
|
|
2015-11-26 16:06:56 +07:00
|
|
|
while (i == 0 && dev->ctrl.event_limit > 0) {
|
2014-11-04 22:20:14 +07:00
|
|
|
if (nvme_submit_async_admin_req(dev))
|
2014-06-19 02:58:57 +07:00
|
|
|
break;
|
2015-11-26 16:06:56 +07:00
|
|
|
dev->ctrl.event_limit--;
|
2014-06-19 02:58:57 +07:00
|
|
|
}
|
2011-03-03 06:37:18 +07:00
|
|
|
spin_unlock_irq(&nvmeq->q_lock);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
spin_unlock(&dev_list_lock);
|
2013-02-05 05:44:33 +07:00
|
|
|
schedule_timeout(round_jiffies_relative(HZ));
|
2011-03-03 06:37:18 +07:00
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
NVMe: Metadata format support
Adds support for NVMe metadata formats and exposes block devices for
all namespaces regardless of their format. Namespace formats that are
unusable will have disk capacity set to 0, but a handle to the block
device is created to simplify device management. A namespace is not
usable when the format requires host interleave block and metadata in
single buffer, has no provisioned storage, or has better data but failed
to register with blk integrity.
The namespace has to be scanned in two phases to support separate
metadata formats. The first establishes the sector size and capacity
prior to invoking add_disk. If metadata is required, the capacity will
be temporarily set to 0 until it can be revalidated and registered with
the integrity extensions after add_disk completes.
The driver relies on the integrity extensions to provide the metadata
buffer. NVMe requires this be a single physically contiguous region,
so only one integrity segment is allowed per command. If the metadata
is used for T10 PI, the driver provides mappings to save and restore
the reftag physical block translation. The driver provides no-op
functions for generate and verify if metadata is not used for protection
information. This way the setup is always provided by the block layer.
If a request does not supply a required metadata buffer, the command
is failed with bad address. This could only happen if a user manually
disables verify/generate on such a disk. The only exception to where
this is okay is if the controller is capable of stripping/generating
the metadata, which is possible on some types of formats.
The metadata scatter gather list now occupies the spot in the nvme_iod
that used to be used to link retryable IOD's, but we don't do that
anymore, so the field was unused.
Signed-off-by: Keith Busch <keith.busch@intel.com>
2015-02-20 03:39:03 +07:00
|
|
|
static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid)
|
2011-01-21 00:50:14 +07:00
|
|
|
{
|
|
|
|
struct nvme_ns *ns;
|
|
|
|
struct gendisk *disk;
|
2015-05-22 16:12:39 +07:00
|
|
|
int node = dev_to_node(dev->dev);
|
2011-01-21 00:50:14 +07:00
|
|
|
|
2014-11-04 22:20:14 +07:00
|
|
|
ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
|
2011-01-21 00:50:14 +07:00
|
|
|
if (!ns)
|
NVMe: Metadata format support
Adds support for NVMe metadata formats and exposes block devices for
all namespaces regardless of their format. Namespace formats that are
unusable will have disk capacity set to 0, but a handle to the block
device is created to simplify device management. A namespace is not
usable when the format requires host interleave block and metadata in
single buffer, has no provisioned storage, or has better data but failed
to register with blk integrity.
The namespace has to be scanned in two phases to support separate
metadata formats. The first establishes the sector size and capacity
prior to invoking add_disk. If metadata is required, the capacity will
be temporarilly set to 0 until it can be revalidated and registered with
the integrity extenstions after add_disk completes.
The driver relies on the integrity extensions to provide the metadata
buffer. NVMe requires this be a single physically contiguous region,
so only one integrity segment is allowed per command. If the metadata
is used for T10 PI, the driver provides mappings to save and restore
the reftag physical block translation. The driver provides no-op
functions for generate and verify if metadata is not used for protection
information. This way the setup is always provided by the block layer.
If a request does not supply a required metadata buffer, the command
is failed with bad address. This could only happen if a user manually
disables verify/generate on such a disk. The only exception to where
this is okay is if the controller is capable of stripping/generating
the metadata, which is possible on some types of formats.
The metadata scatter gather list now occupies the spot in the nvme_iod
that used to be used to link retryable IOD's, but we don't do that
anymore, so the field was unused.
Signed-off-by: Keith Busch <keith.busch@intel.com>
2015-02-20 03:39:03 +07:00
|
|
|
return;
|
|
|
|
|
2014-11-04 22:20:14 +07:00
|
|
|
ns->queue = blk_mq_init_queue(&dev->tagset);
|
2014-11-06 03:39:09 +07:00
|
|
|
if (IS_ERR(ns->queue))
|
2011-01-21 00:50:14 +07:00
|
|
|
goto out_free_ns;
|
2012-01-11 04:35:08 +07:00
|
|
|
queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue);
|
|
|
|
queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
|
2015-11-26 16:06:56 +07:00
|
|
|
ns->ctrl = &dev->ctrl;
|
2011-01-21 00:50:14 +07:00
|
|
|
ns->queue->queuedata = ns;
|
|
|
|
|
2014-11-04 22:20:14 +07:00
|
|
|
disk = alloc_disk_node(0, node);
|
2011-01-21 00:50:14 +07:00
|
|
|
if (!disk)
|
|
|
|
goto out_free_queue;
|
2014-11-04 22:20:14 +07:00
|
|
|
|
2015-10-02 06:14:10 +07:00
|
|
|
kref_init(&ns->kref);
|
2011-05-06 19:45:47 +07:00
|
|
|
ns->ns_id = nsid;
|
2011-01-21 00:50:14 +07:00
|
|
|
ns->disk = disk;
|
NVMe: Metadata format support
Adds support for NVMe metadata formats and exposes block devices for
all namespaces regardless of their format. Namespace formats that are
unusable will have disk capacity set to 0, but a handle to the block
device is created to simplify device management. A namespace is not
usable when the format requires host interleave block and metadata in
single buffer, has no provisioned storage, or has better data but failed
to register with blk integrity.
The namespace has to be scanned in two phases to support separate
metadata formats. The first establishes the sector size and capacity
prior to invoking add_disk. If metadata is required, the capacity will
be temporarilly set to 0 until it can be revalidated and registered with
the integrity extenstions after add_disk completes.
The driver relies on the integrity extensions to provide the metadata
buffer. NVMe requires this be a single physically contiguous region,
so only one integrity segment is allowed per command. If the metadata
is used for T10 PI, the driver provides mappings to save and restore
the reftag physical block translation. The driver provides no-op
functions for generate and verify if metadata is not used for protection
information. This way the setup is always provided by the block layer.
If a request does not supply a required metadata buffer, the command
is failed with bad address. This could only happen if a user manually
disables verify/generate on such a disk. The only exception to where
this is okay is if the controller is capable of stripping/generating
the metadata, which is possible on some types of formats.
The metadata scatter gather list now occupies the spot in the nvme_iod
that used to be used to link retryable IOD's, but we don't do that
anymore, so the field was unused.
Signed-off-by: Keith Busch <keith.busch@intel.com>
2015-02-20 03:39:03 +07:00
|
|
|
ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */
|
|
|
|
list_add_tail(&ns->list, &dev->namespaces);
|
|
|
|
|
2012-07-25 04:01:04 +07:00
|
|
|
blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
|
2015-08-13 05:17:54 +07:00
|
|
|
if (dev->max_hw_sectors) {
|
2012-07-27 00:29:57 +07:00
|
|
|
blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors);
|
2015-08-13 05:17:54 +07:00
|
|
|
blk_queue_max_segments(ns->queue,
|
2015-11-19 06:33:08 +07:00
|
|
|
(dev->max_hw_sectors / (dev->page_size >> 9)) + 1);
|
2015-08-13 05:17:54 +07:00
|
|
|
}
|
2014-11-04 22:20:14 +07:00
|
|
|
if (dev->stripe_size)
|
|
|
|
blk_queue_chunk_sectors(ns->queue, dev->stripe_size >> 9);
|
2015-11-26 16:06:56 +07:00
|
|
|
if (dev->ctrl.vwc & NVME_CTRL_VWC_PRESENT)
|
2014-04-30 00:41:28 +07:00
|
|
|
blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA);
|
2015-08-20 04:24:05 +07:00
|
|
|
blk_queue_virt_boundary(ns->queue, dev->page_size - 1);
|
2011-01-21 00:50:14 +07:00
|
|
|
|
|
|
|
disk->major = nvme_major;
|
2013-12-10 00:58:46 +07:00
|
|
|
disk->first_minor = 0;
|
2011-01-21 00:50:14 +07:00
|
|
|
disk->fops = &nvme_fops;
|
|
|
|
disk->private_data = ns;
|
|
|
|
disk->queue = ns->queue;
|
2015-02-04 01:21:42 +07:00
|
|
|
disk->driverfs_dev = dev->device;
|
2013-12-10 00:58:46 +07:00
|
|
|
disk->flags = GENHD_FL_EXT_DEVT;
|
2015-11-26 16:06:56 +07:00
|
|
|
sprintf(disk->disk_name, "nvme%dn%d", dev->ctrl.instance, nsid);
|
2011-01-21 00:50:14 +07:00
|
|
|
|
NVMe: Metadata format support
Adds support for NVMe metadata formats and exposes block devices for
all namespaces regardless of their format. Namespace formats that are
unusable will have disk capacity set to 0, but a handle to the block
device is created to simplify device management. A namespace is not
usable when the format requires host interleave block and metadata in
single buffer, has no provisioned storage, or has better data but failed
to register with blk integrity.
The namespace has to be scanned in two phases to support separate
metadata formats. The first establishes the sector size and capacity
prior to invoking add_disk. If metadata is required, the capacity will
be temporarilly set to 0 until it can be revalidated and registered with
the integrity extenstions after add_disk completes.
The driver relies on the integrity extensions to provide the metadata
buffer. NVMe requires this be a single physically contiguous region,
so only one integrity segment is allowed per command. If the metadata
is used for T10 PI, the driver provides mappings to save and restore
the reftag physical block translation. The driver provides no-op
functions for generate and verify if metadata is not used for protection
information. This way the setup is always provided by the block layer.
If a request does not supply a required metadata buffer, the command
is failed with bad address. This could only happen if a user manually
disables verify/generate on such a disk. The only exception to where
this is okay is if the controller is capable of stripping/generating
the metadata, which is possible on some types of formats.
The metadata scatter gather list now occupies the spot in the nvme_iod
that used to be used to link retryable IOD's, but we don't do that
anymore, so the field was unused.
Signed-off-by: Keith Busch <keith.busch@intel.com>
2015-02-20 03:39:03 +07:00
|
|
|
/*
|
|
|
|
* Initialize capacity to 0 until we establish the namespace format and
|
|
|
|
* setup integrity extentions if necessary. The revalidate_disk after
|
|
|
|
* add_disk allows the driver to register with integrity if the format
|
|
|
|
* requires it.
|
|
|
|
*/
|
|
|
|
set_capacity(disk, 0);
|
2015-06-02 03:28:14 +07:00
|
|
|
if (nvme_revalidate_disk(ns->disk))
|
|
|
|
goto out_free_disk;
|
|
|
|
|
2015-10-02 23:37:28 +07:00
|
|
|
kref_get(&dev->kref);
|
2015-10-29 15:57:29 +07:00
|
|
|
if (ns->type != NVME_NS_LIGHTNVM) {
|
|
|
|
add_disk(ns->disk);
|
|
|
|
if (ns->ms) {
|
|
|
|
struct block_device *bd = bdget_disk(ns->disk, 0);
|
|
|
|
if (!bd)
|
|
|
|
return;
|
|
|
|
if (blkdev_get(bd, FMODE_READ, NULL)) {
|
|
|
|
bdput(bd);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
blkdev_reread_part(bd);
|
|
|
|
blkdev_put(bd, FMODE_READ);
|
2015-07-15 00:57:48 +07:00
|
|
|
}
|
|
|
|
}
|
NVMe: Metadata format support
Adds support for NVMe metadata formats and exposes block devices for
all namespaces regardless of their format. Namespace formats that are
unusable will have disk capacity set to 0, but a handle to the block
device is created to simplify device management. A namespace is not
usable when the format requires host interleave block and metadata in
single buffer, has no provisioned storage, or has better data but failed
to register with blk integrity.
The namespace has to be scanned in two phases to support separate
metadata formats. The first establishes the sector size and capacity
prior to invoking add_disk. If metadata is required, the capacity will
be temporarilly set to 0 until it can be revalidated and registered with
the integrity extenstions after add_disk completes.
The driver relies on the integrity extensions to provide the metadata
buffer. NVMe requires this be a single physically contiguous region,
so only one integrity segment is allowed per command. If the metadata
is used for T10 PI, the driver provides mappings to save and restore
the reftag physical block translation. The driver provides no-op
functions for generate and verify if metadata is not used for protection
information. This way the setup is always provided by the block layer.
If a request does not supply a required metadata buffer, the command
is failed with bad address. This could only happen if a user manually
disables verify/generate on such a disk. The only exception to where
this is okay is if the controller is capable of stripping/generating
the metadata, which is possible on some types of formats.
The metadata scatter gather list now occupies the spot in the nvme_iod
that used to be used to link retryable IOD's, but we don't do that
anymore, so the field was unused.
Signed-off-by: Keith Busch <keith.busch@intel.com>
2015-02-20 03:39:03 +07:00
|
|
|
return;
|
2015-06-02 03:28:14 +07:00
|
|
|
out_free_disk:
|
|
|
|
kfree(disk);
|
|
|
|
list_del(&ns->list);
|
2011-01-21 00:50:14 +07:00
|
|
|
out_free_queue:
|
|
|
|
blk_cleanup_queue(ns->queue);
|
|
|
|
out_free_ns:
|
|
|
|
kfree(ns);
|
|
|
|
}
|
|
|
|
|
2015-10-02 23:51:31 +07:00
|
|
|
/*
|
|
|
|
* Create I/O queues. Failing to create an I/O queue is not an issue,
|
|
|
|
* we can continue with less than the desired amount of queues, and
|
|
|
|
* even a controller without I/O queues an still be used to issue
|
|
|
|
* admin commands. This might be useful to upgrade a buggy firmware
|
|
|
|
* for example.
|
|
|
|
*/
|
2014-03-24 23:46:25 +07:00
|
|
|
static void nvme_create_io_queues(struct nvme_dev *dev)
|
|
|
|
{
|
2014-11-04 22:20:14 +07:00
|
|
|
unsigned i;
|
2014-03-24 23:46:25 +07:00
|
|
|
|
2014-11-04 22:20:14 +07:00
|
|
|
for (i = dev->queue_count; i <= dev->max_qid; i++)
|
2014-12-23 02:59:04 +07:00
|
|
|
if (!nvme_alloc_queue(dev, i, dev->q_depth))
|
2014-03-24 23:46:25 +07:00
|
|
|
break;
|
|
|
|
|
2014-11-04 22:20:14 +07:00
|
|
|
for (i = dev->online_queues; i <= dev->queue_count - 1; i++)
|
2015-10-02 23:51:31 +07:00
|
|
|
if (nvme_create_queue(dev->queues[i], i)) {
|
|
|
|
nvme_free_queues(dev, i);
|
2014-03-24 23:46:25 +07:00
|
|
|
break;
|
2015-10-02 23:51:31 +07:00
|
|
|
}
|
2014-03-24 23:46:25 +07:00
|
|
|
}
|
|
|
|
|
2011-01-20 21:14:34 +07:00
|
|
|
static int set_queue_count(struct nvme_dev *dev, int count)
|
2011-01-21 00:50:14 +07:00
|
|
|
{
|
|
|
|
int status;
|
|
|
|
u32 result;
|
2011-01-20 21:14:34 +07:00
|
|
|
u32 q_count = (count - 1) | ((count - 1) << 16);
|
2011-01-21 00:50:14 +07:00
|
|
|
|
2015-11-26 16:06:56 +07:00
|
|
|
status = nvme_set_features(&dev->ctrl, NVME_FEAT_NUM_QUEUES, q_count, 0,
|
2011-09-20 04:08:14 +07:00
|
|
|
&result);
|
2014-04-11 22:58:45 +07:00
|
|
|
if (status < 0)
|
|
|
|
return status;
|
|
|
|
if (status > 0) {
|
2015-05-22 16:12:39 +07:00
|
|
|
dev_err(dev->dev, "Could not set queue count (%d)\n", status);
|
2014-06-24 03:25:35 +07:00
|
|
|
return 0;
|
2014-04-11 22:58:45 +07:00
|
|
|
}
|
2011-01-21 00:50:14 +07:00
|
|
|
return min(result & 0xffff, result >> 16) + 1;
|
|
|
|
}
|
|
|
|
|
2015-07-20 23:14:09 +07:00
|
|
|
/*
 * Map the controller memory buffer described by the CMBSZ/CMBLOC registers.
 *
 * Returns the write-combined mapping and records its bus address and size
 * in dev->cmb_dma_addr / dev->cmb_size.  Returns NULL when use_cmb_sqes is
 * not set, when the CMBSZ size field is zero, when the offset lies beyond
 * the BAR, or when ioremap_wc() fails.
 */
static void __iomem *nvme_map_cmb(struct nvme_dev *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);
	resource_size_t bar_size;
	void __iomem *mapping;
	dma_addr_t base;
	u64 unit, len, ofst;
	u32 cmbloc;

	if (!use_cmb_sqes)
		return NULL;

	dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ);
	if (!(NVME_CMB_SZ(dev->cmbsz)))
		return NULL;

	cmbloc = readl(dev->bar + NVME_REG_CMBLOC);

	/* Size and offset are both expressed in multiples of this unit. */
	unit = (u64)1 << (12 + 4 * NVME_CMB_SZU(dev->cmbsz));
	len = unit * NVME_CMB_SZ(dev->cmbsz);
	ofst = unit * NVME_CMB_OFST(cmbloc);
	bar_size = pci_resource_len(pdev, NVME_CMB_BIR(cmbloc));

	if (ofst > bar_size)
		return NULL;

	/*
	 * Controllers may support a CMB size larger than their BAR,
	 * for example, due to being behind a bridge. Reduce the CMB to
	 * the reported size of the BAR
	 */
	if (len > bar_size - ofst)
		len = bar_size - ofst;

	base = pci_resource_start(pdev, NVME_CMB_BIR(cmbloc)) + ofst;
	mapping = ioremap_wc(base, len);
	if (mapping) {
		dev->cmb_dma_addr = base;
		dev->cmb_size = len;
	}
	return mapping;
}
|
|
|
|
|
|
|
|
static inline void nvme_release_cmb(struct nvme_dev *dev)
|
|
|
|
{
|
|
|
|
if (dev->cmb) {
|
|
|
|
iounmap(dev->cmb);
|
|
|
|
dev->cmb = NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-07-16 04:02:24 +07:00
|
|
|
static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
|
|
|
|
{
|
2013-09-10 10:25:37 +07:00
|
|
|
return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride);
|
2013-07-16 04:02:24 +07:00
|
|
|
}
|
|
|
|
|
2012-12-22 06:13:49 +07:00
|
|
|
static int nvme_setup_io_queues(struct nvme_dev *dev)
|
2011-01-21 00:50:14 +07:00
|
|
|
{
|
2014-11-04 22:20:14 +07:00
|
|
|
struct nvme_queue *adminq = dev->queues[0];
|
2015-05-22 16:12:39 +07:00
|
|
|
struct pci_dev *pdev = to_pci_dev(dev->dev);
|
2014-03-24 23:46:25 +07:00
|
|
|
int result, i, vecs, nr_io_queues, size;
|
2011-01-21 00:50:14 +07:00
|
|
|
|
2014-03-24 23:46:25 +07:00
|
|
|
nr_io_queues = num_possible_cpus();
|
2011-02-16 04:16:02 +07:00
|
|
|
result = set_queue_count(dev, nr_io_queues);
|
2014-06-24 03:25:35 +07:00
|
|
|
if (result <= 0)
|
2011-01-21 01:01:49 +07:00
|
|
|
return result;
|
2011-02-16 04:16:02 +07:00
|
|
|
if (result < nr_io_queues)
|
|
|
|
nr_io_queues = result;
|
2011-01-21 00:50:14 +07:00
|
|
|
|
2015-07-20 23:14:09 +07:00
|
|
|
if (dev->cmb && NVME_CMB_SQS(dev->cmbsz)) {
|
|
|
|
result = nvme_cmb_qdepth(dev, nr_io_queues,
|
|
|
|
sizeof(struct nvme_command));
|
|
|
|
if (result > 0)
|
|
|
|
dev->q_depth = result;
|
|
|
|
else
|
|
|
|
nvme_release_cmb(dev);
|
|
|
|
}
|
|
|
|
|
2013-07-16 04:02:24 +07:00
|
|
|
size = db_bar_size(dev, nr_io_queues);
|
|
|
|
if (size > 8192) {
|
2011-10-21 04:00:41 +07:00
|
|
|
iounmap(dev->bar);
|
2013-07-16 04:02:24 +07:00
|
|
|
do {
|
|
|
|
dev->bar = ioremap(pci_resource_start(pdev, 0), size);
|
|
|
|
if (dev->bar)
|
|
|
|
break;
|
|
|
|
if (!--nr_io_queues)
|
|
|
|
return -ENOMEM;
|
|
|
|
size = db_bar_size(dev, nr_io_queues);
|
|
|
|
} while (1);
|
2015-11-20 14:58:10 +07:00
|
|
|
dev->dbs = dev->bar + 4096;
|
2014-02-22 04:13:44 +07:00
|
|
|
adminq->q_db = dev->dbs;
|
2011-10-21 04:00:41 +07:00
|
|
|
}
|
|
|
|
|
2013-07-16 04:02:24 +07:00
|
|
|
/* Deregister the admin queue's interrupt */
|
2014-01-28 03:57:22 +07:00
|
|
|
free_irq(dev->entry[0].vector, adminq);
|
2013-07-16 04:02:24 +07:00
|
|
|
|
2014-11-14 23:49:26 +07:00
|
|
|
/*
|
|
|
|
* If we enable msix early due to not intx, disable it again before
|
|
|
|
* setting up the full range we need.
|
|
|
|
*/
|
|
|
|
if (!pdev->irq)
|
|
|
|
pci_disable_msix(pdev);
|
|
|
|
|
2014-03-04 22:22:00 +07:00
|
|
|
for (i = 0; i < nr_io_queues; i++)
|
2011-01-21 01:01:49 +07:00
|
|
|
dev->entry[i].entry = i;
|
2014-03-04 22:22:00 +07:00
|
|
|
vecs = pci_enable_msix_range(pdev, dev->entry, 1, nr_io_queues);
|
|
|
|
if (vecs < 0) {
|
|
|
|
vecs = pci_enable_msi_range(pdev, 1, min(nr_io_queues, 32));
|
|
|
|
if (vecs < 0) {
|
|
|
|
vecs = 1;
|
|
|
|
} else {
|
|
|
|
for (i = 0; i < vecs; i++)
|
|
|
|
dev->entry[i].vector = i + pdev->irq;
|
2013-05-12 05:19:31 +07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-06-20 21:53:48 +07:00
|
|
|
/*
|
|
|
|
* Should investigate if there's a performance win from allocating
|
|
|
|
* more queues than interrupt vectors; it might allow the submission
|
|
|
|
* path to scale better, even if the receive path is limited by the
|
|
|
|
* number of interrupts.
|
|
|
|
*/
|
|
|
|
nr_io_queues = vecs;
|
2014-03-24 23:46:25 +07:00
|
|
|
dev->max_qid = nr_io_queues;
|
2013-06-20 21:53:48 +07:00
|
|
|
|
2014-01-28 03:57:22 +07:00
|
|
|
result = queue_request_irq(dev, adminq, adminq->irqname);
|
2015-07-01 00:22:52 +07:00
|
|
|
if (result) {
|
|
|
|
adminq->cq_vector = -1;
|
2013-07-16 04:02:20 +07:00
|
|
|
goto free_queues;
|
2015-07-01 00:22:52 +07:00
|
|
|
}
|
2011-01-21 01:01:49 +07:00
|
|
|
|
2013-07-16 04:02:23 +07:00
|
|
|
/* Free previously allocated queues that are no longer usable */
|
2014-03-24 23:46:25 +07:00
|
|
|
nvme_free_queues(dev, nr_io_queues + 1);
|
2014-11-04 22:20:14 +07:00
|
|
|
nvme_create_io_queues(dev);
|
2011-03-17 03:52:19 +07:00
|
|
|
|
2013-07-16 04:02:20 +07:00
|
|
|
return 0;
|
2011-01-21 00:50:14 +07:00
|
|
|
|
2013-07-16 04:02:20 +07:00
|
|
|
free_queues:
|
2013-12-17 01:50:00 +07:00
|
|
|
nvme_free_queues(dev, 1);
|
2013-07-16 04:02:20 +07:00
|
|
|
return result;
|
2011-01-21 00:50:14 +07:00
|
|
|
}
|
|
|
|
|
2015-06-02 03:28:14 +07:00
|
|
|
static int ns_cmp(void *priv, struct list_head *a, struct list_head *b)
|
|
|
|
{
|
|
|
|
struct nvme_ns *nsa = container_of(a, struct nvme_ns, list);
|
|
|
|
struct nvme_ns *nsb = container_of(b, struct nvme_ns, list);
|
|
|
|
|
|
|
|
return nsa->ns_id - nsb->ns_id;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct nvme_ns *nvme_find_ns(struct nvme_dev *dev, unsigned nsid)
|
|
|
|
{
|
|
|
|
struct nvme_ns *ns;
|
|
|
|
|
|
|
|
list_for_each_entry(ns, &dev->namespaces, list) {
|
|
|
|
if (ns->ns_id == nsid)
|
|
|
|
return ns;
|
|
|
|
if (ns->ns_id > nsid)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline bool nvme_io_incapable(struct nvme_dev *dev)
|
|
|
|
{
|
2015-11-20 14:58:10 +07:00
|
|
|
return (!dev->bar ||
|
|
|
|
readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_CFS ||
|
|
|
|
dev->online_queues < 2);
|
2015-06-02 03:28:14 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
static void nvme_ns_remove(struct nvme_ns *ns)
|
|
|
|
{
|
2015-11-26 16:06:56 +07:00
|
|
|
bool kill = nvme_io_incapable(to_nvme_dev(ns->ctrl)) &&
|
|
|
|
!blk_queue_dying(ns->queue);
|
2015-06-02 03:28:14 +07:00
|
|
|
|
|
|
|
if (kill)
|
|
|
|
blk_set_queue_dying(ns->queue);
|
2015-10-22 00:19:55 +07:00
|
|
|
if (ns->disk->flags & GENHD_FL_UP)
|
2015-06-02 03:28:14 +07:00
|
|
|
del_gendisk(ns->disk);
|
|
|
|
if (kill || !blk_queue_dying(ns->queue)) {
|
|
|
|
blk_mq_abort_requeue_list(ns->queue);
|
|
|
|
blk_cleanup_queue(ns->queue);
|
2015-10-02 23:37:28 +07:00
|
|
|
}
|
|
|
|
list_del_init(&ns->list);
|
|
|
|
kref_put(&ns->kref, nvme_free_ns);
|
2015-06-02 03:28:14 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
static void nvme_scan_namespaces(struct nvme_dev *dev, unsigned nn)
|
|
|
|
{
|
|
|
|
struct nvme_ns *ns, *next;
|
|
|
|
unsigned i;
|
|
|
|
|
|
|
|
for (i = 1; i <= nn; i++) {
|
|
|
|
ns = nvme_find_ns(dev, i);
|
|
|
|
if (ns) {
|
2015-10-02 23:37:28 +07:00
|
|
|
if (revalidate_disk(ns->disk))
|
2015-06-02 03:28:14 +07:00
|
|
|
nvme_ns_remove(ns);
|
|
|
|
} else
|
|
|
|
nvme_alloc_ns(dev, i);
|
|
|
|
}
|
|
|
|
list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
|
2015-10-02 23:37:28 +07:00
|
|
|
if (ns->ns_id > nn)
|
2015-06-02 03:28:14 +07:00
|
|
|
nvme_ns_remove(ns);
|
|
|
|
}
|
|
|
|
list_sort(NULL, &dev->namespaces, ns_cmp);
|
|
|
|
}
|
|
|
|
|
2015-09-03 21:18:17 +07:00
|
|
|
static void nvme_set_irq_hints(struct nvme_dev *dev)
|
|
|
|
{
|
|
|
|
struct nvme_queue *nvmeq;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < dev->online_queues; i++) {
|
|
|
|
nvmeq = dev->queues[i];
|
|
|
|
|
|
|
|
if (!nvmeq->tags || !(*nvmeq->tags))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector,
|
|
|
|
blk_mq_tags_cpumask(*nvmeq->tags));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-06-02 03:28:14 +07:00
|
|
|
static void nvme_dev_scan(struct work_struct *work)
|
|
|
|
{
|
|
|
|
struct nvme_dev *dev = container_of(work, struct nvme_dev, scan_work);
|
|
|
|
struct nvme_id_ctrl *ctrl;
|
|
|
|
|
|
|
|
if (!dev->tagset.tags)
|
|
|
|
return;
|
2015-11-26 16:06:56 +07:00
|
|
|
if (nvme_identify_ctrl(&dev->ctrl, &ctrl))
|
2015-06-02 03:28:14 +07:00
|
|
|
return;
|
|
|
|
nvme_scan_namespaces(dev, le32_to_cpup(&ctrl->nn));
|
|
|
|
kfree(ctrl);
|
2015-09-03 21:18:17 +07:00
|
|
|
nvme_set_irq_hints(dev);
|
2015-06-02 03:28:14 +07:00
|
|
|
}
|
|
|
|
|
2013-04-16 22:22:36 +07:00
|
|
|
/*
|
|
|
|
* Return: error value if an error occurred setting up the queues or calling
|
|
|
|
* Identify Device. 0 if these succeeded, even if adding some of the
|
|
|
|
* namespaces failed. At the moment, these failures are silent. TBD which
|
|
|
|
* failures should be reported.
|
|
|
|
*/
|
2012-12-22 06:13:49 +07:00
|
|
|
static int nvme_dev_add(struct nvme_dev *dev)
|
2011-01-21 00:50:14 +07:00
|
|
|
{
|
2015-05-22 16:12:39 +07:00
|
|
|
struct pci_dev *pdev = to_pci_dev(dev->dev);
|
2013-07-09 04:26:25 +07:00
|
|
|
int res;
|
2011-02-02 04:18:08 +07:00
|
|
|
struct nvme_id_ctrl *ctrl;
|
2015-11-20 14:58:10 +07:00
|
|
|
int shift = NVME_CAP_MPSMIN(lo_hi_readq(dev->bar + NVME_REG_CAP)) + 12;
|
2011-01-21 00:50:14 +07:00
|
|
|
|
2015-11-26 16:06:56 +07:00
|
|
|
res = nvme_identify_ctrl(&dev->ctrl, &ctrl);
|
2011-01-21 00:50:14 +07:00
|
|
|
if (res) {
|
2015-05-22 16:12:39 +07:00
|
|
|
dev_err(dev->dev, "Identify Controller failed (%d)\n", res);
|
NVMe: Metadata format support
Adds support for NVMe metadata formats and exposes block devices for
all namespaces regardless of their format. Namespace formats that are
unusable will have disk capacity set to 0, but a handle to the block
device is created to simplify device management. A namespace is not
usable when the format requires host interleave block and metadata in
single buffer, has no provisioned storage, or has better data but failed
to register with blk integrity.
The namespace has to be scanned in two phases to support separate
metadata formats. The first establishes the sector size and capacity
prior to invoking add_disk. If metadata is required, the capacity will
be temporarilly set to 0 until it can be revalidated and registered with
the integrity extenstions after add_disk completes.
The driver relies on the integrity extensions to provide the metadata
buffer. NVMe requires this be a single physically contiguous region,
so only one integrity segment is allowed per command. If the metadata
is used for T10 PI, the driver provides mappings to save and restore
the reftag physical block translation. The driver provides no-op
functions for generate and verify if metadata is not used for protection
information. This way the setup is always provided by the block layer.
If a request does not supply a required metadata buffer, the command
is failed with bad address. This could only happen if a user manually
disables verify/generate on such a disk. The only exception to where
this is okay is if the controller is capable of stripping/generating
the metadata, which is possible on some types of formats.
The metadata scatter gather list now occupies the spot in the nvme_iod
that used to be used to link retryable IOD's, but we don't do that
anymore, so the field was unused.
Signed-off-by: Keith Busch <keith.busch@intel.com>
2015-02-20 03:39:03 +07:00
|
|
|
return -EIO;
|
2011-01-21 00:50:14 +07:00
|
|
|
}
|
|
|
|
|
2015-11-26 16:06:56 +07:00
|
|
|
dev->ctrl.oncs = le16_to_cpup(&ctrl->oncs);
|
|
|
|
dev->ctrl.abort_limit = ctrl->acl + 1;
|
|
|
|
dev->ctrl.vwc = ctrl->vwc;
|
|
|
|
memcpy(dev->ctrl.serial, ctrl->sn, sizeof(ctrl->sn));
|
|
|
|
memcpy(dev->ctrl.model, ctrl->mn, sizeof(ctrl->mn));
|
|
|
|
memcpy(dev->ctrl.firmware_rev, ctrl->fr, sizeof(ctrl->fr));
|
2013-04-10 06:13:20 +07:00
|
|
|
if (ctrl->mdts)
|
2012-07-27 00:29:57 +07:00
|
|
|
dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9);
|
2015-11-06 02:52:28 +07:00
|
|
|
else
|
|
|
|
dev->max_hw_sectors = UINT_MAX;
|
2013-06-22 01:36:34 +07:00
|
|
|
if ((pdev->vendor == PCI_VENDOR_ID_INTEL) &&
|
2014-11-04 22:20:14 +07:00
|
|
|
(pdev->device == 0x0953) && ctrl->vs[3]) {
|
|
|
|
unsigned int max_hw_sectors;
|
|
|
|
|
2013-04-10 06:13:20 +07:00
|
|
|
dev->stripe_size = 1 << (ctrl->vs[3] + shift);
|
2014-11-04 22:20:14 +07:00
|
|
|
max_hw_sectors = dev->stripe_size >> (shift - 9);
|
|
|
|
if (dev->max_hw_sectors) {
|
|
|
|
dev->max_hw_sectors = min(max_hw_sectors,
|
|
|
|
dev->max_hw_sectors);
|
|
|
|
} else
|
|
|
|
dev->max_hw_sectors = max_hw_sectors;
|
|
|
|
}
|
2015-05-22 16:12:46 +07:00
|
|
|
kfree(ctrl);
|
2014-11-04 22:20:14 +07:00
|
|
|
|
2015-06-08 23:08:15 +07:00
|
|
|
if (!dev->tagset.tags) {
|
|
|
|
dev->tagset.ops = &nvme_mq_ops;
|
|
|
|
dev->tagset.nr_hw_queues = dev->online_queues - 1;
|
|
|
|
dev->tagset.timeout = NVME_IO_TIMEOUT;
|
|
|
|
dev->tagset.numa_node = dev_to_node(dev->dev);
|
|
|
|
dev->tagset.queue_depth =
|
2014-11-04 22:20:14 +07:00
|
|
|
min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
|
2015-06-08 23:08:15 +07:00
|
|
|
dev->tagset.cmd_size = nvme_cmd_size(dev);
|
|
|
|
dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
|
|
|
|
dev->tagset.driver_data = dev;
|
2011-01-21 00:50:14 +07:00
|
|
|
|
2015-06-08 23:08:15 +07:00
|
|
|
if (blk_mq_alloc_tag_set(&dev->tagset))
|
|
|
|
return 0;
|
|
|
|
}
|
2015-06-02 03:28:14 +07:00
|
|
|
schedule_work(&dev->scan_work);
|
NVMe: Metadata format support
Adds support for NVMe metadata formats and exposes block devices for
all namespaces regardless of their format. Namespace formats that are
unusable will have disk capacity set to 0, but a handle to the block
device is created to simplify device management. A namespace is not
usable when the format requires host interleave block and metadata in
single buffer, has no provisioned storage, or has better data but failed
to register with blk integrity.
The namespace has to be scanned in two phases to support separate
metadata formats. The first establishes the sector size and capacity
prior to invoking add_disk. If metadata is required, the capacity will
be temporarilly set to 0 until it can be revalidated and registered with
the integrity extenstions after add_disk completes.
The driver relies on the integrity extensions to provide the metadata
buffer. NVMe requires this be a single physically contiguous region,
so only one integrity segment is allowed per command. If the metadata
is used for T10 PI, the driver provides mappings to save and restore
the reftag physical block translation. The driver provides no-op
functions for generate and verify if metadata is not used for protection
information. This way the setup is always provided by the block layer.
If a request does not supply a required metadata buffer, the command
is failed with bad address. This could only happen if a user manually
disables verify/generate on such a disk. The only exception to where
this is okay is if the controller is capable of stripping/generating
the metadata, which is possible on some types of formats.
The metadata scatter gather list now occupies the spot in the nvme_iod
that used to be used to link retryable IOD's, but we don't do that
anymore, so the field was unused.
Signed-off-by: Keith Busch <keith.busch@intel.com>
2015-02-20 03:39:03 +07:00
|
|
|
return 0;
|
2011-01-21 00:50:14 +07:00
|
|
|
}
|
|
|
|
|
2013-07-16 04:02:19 +07:00
|
|
|
static int nvme_dev_map(struct nvme_dev *dev)
|
|
|
|
{
|
2014-03-24 23:46:25 +07:00
|
|
|
u64 cap;
|
2013-07-16 04:02:19 +07:00
|
|
|
int bars, result = -ENOMEM;
|
2015-05-22 16:12:39 +07:00
|
|
|
struct pci_dev *pdev = to_pci_dev(dev->dev);
|
2013-07-16 04:02:19 +07:00
|
|
|
|
|
|
|
if (pci_enable_device_mem(pdev))
|
|
|
|
return result;
|
|
|
|
|
|
|
|
dev->entry[0].vector = pdev->irq;
|
|
|
|
pci_set_master(pdev);
|
|
|
|
bars = pci_select_bars(pdev, IORESOURCE_MEM);
|
2014-11-14 23:50:19 +07:00
|
|
|
if (!bars)
|
|
|
|
goto disable_pci;
|
|
|
|
|
2013-07-16 04:02:19 +07:00
|
|
|
if (pci_request_selected_regions(pdev, bars, "nvme"))
|
|
|
|
goto disable_pci;
|
|
|
|
|
2015-05-22 16:12:39 +07:00
|
|
|
if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) &&
|
|
|
|
dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32)))
|
2013-06-27 05:49:11 +07:00
|
|
|
goto disable;
|
2013-07-16 04:02:19 +07:00
|
|
|
|
|
|
|
dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
|
|
|
|
if (!dev->bar)
|
|
|
|
goto disable;
|
2014-11-14 23:49:26 +07:00
|
|
|
|
2015-11-20 14:58:10 +07:00
|
|
|
if (readl(dev->bar + NVME_REG_CSTS) == -1) {
|
2013-12-11 03:10:39 +07:00
|
|
|
result = -ENODEV;
|
|
|
|
goto unmap;
|
|
|
|
}
|
2014-11-14 23:49:26 +07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Some devices don't advertse INTx interrupts, pre-enable a single
|
|
|
|
* MSIX vec for setup. We'll adjust this later.
|
|
|
|
*/
|
|
|
|
if (!pdev->irq) {
|
|
|
|
result = pci_enable_msix(pdev, dev->entry, 1);
|
|
|
|
if (result < 0)
|
|
|
|
goto unmap;
|
|
|
|
}
|
|
|
|
|
2015-11-20 14:58:10 +07:00
|
|
|
cap = lo_hi_readq(dev->bar + NVME_REG_CAP);
|
|
|
|
|
2014-03-24 23:46:25 +07:00
|
|
|
dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH);
|
|
|
|
dev->db_stride = 1 << NVME_CAP_STRIDE(cap);
|
2015-11-20 14:58:10 +07:00
|
|
|
dev->dbs = dev->bar + 4096;
|
|
|
|
if (readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 2))
|
2015-07-20 23:14:09 +07:00
|
|
|
dev->cmb = nvme_map_cmb(dev);
|
2013-07-16 04:02:19 +07:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
2013-12-11 03:10:39 +07:00
|
|
|
unmap:
|
|
|
|
iounmap(dev->bar);
|
|
|
|
dev->bar = NULL;
|
2013-07-16 04:02:19 +07:00
|
|
|
disable:
|
|
|
|
pci_release_regions(pdev);
|
|
|
|
disable_pci:
|
|
|
|
pci_disable_device(pdev);
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void nvme_dev_unmap(struct nvme_dev *dev)
|
|
|
|
{
|
2015-05-22 16:12:39 +07:00
|
|
|
struct pci_dev *pdev = to_pci_dev(dev->dev);
|
|
|
|
|
|
|
|
if (pdev->msi_enabled)
|
|
|
|
pci_disable_msi(pdev);
|
|
|
|
else if (pdev->msix_enabled)
|
|
|
|
pci_disable_msix(pdev);
|
2013-07-16 04:02:19 +07:00
|
|
|
|
|
|
|
if (dev->bar) {
|
|
|
|
iounmap(dev->bar);
|
|
|
|
dev->bar = NULL;
|
2015-05-22 16:12:39 +07:00
|
|
|
pci_release_regions(pdev);
|
2013-07-16 04:02:19 +07:00
|
|
|
}
|
|
|
|
|
2015-05-22 16:12:39 +07:00
|
|
|
if (pci_is_enabled(pdev))
|
|
|
|
pci_disable_device(pdev);
|
2013-07-16 04:02:19 +07:00
|
|
|
}
|
|
|
|
|
2013-12-11 03:10:40 +07:00
|
|
|
struct nvme_delq_ctx {
|
|
|
|
struct task_struct *waiter;
|
|
|
|
struct kthread_worker *worker;
|
|
|
|
atomic_t refcount;
|
|
|
|
};
|
|
|
|
|
|
|
|
static void nvme_wait_dq(struct nvme_delq_ctx *dq, struct nvme_dev *dev)
|
|
|
|
{
|
|
|
|
dq->waiter = current;
|
|
|
|
mb();
|
|
|
|
|
|
|
|
for (;;) {
|
|
|
|
set_current_state(TASK_KILLABLE);
|
|
|
|
if (!atomic_read(&dq->refcount))
|
|
|
|
break;
|
|
|
|
if (!schedule_timeout(ADMIN_TIMEOUT) ||
|
|
|
|
fatal_signal_pending(current)) {
|
2015-01-08 08:55:50 +07:00
|
|
|
/*
|
|
|
|
* Disable the controller first since we can't trust it
|
|
|
|
* at this point, but leave the admin queue enabled
|
|
|
|
* until all queue deletion requests are flushed.
|
|
|
|
* FIXME: This may take a while if there are more h/w
|
|
|
|
* queues than admin tags.
|
|
|
|
*/
|
2013-12-11 03:10:40 +07:00
|
|
|
set_current_state(TASK_RUNNING);
|
2015-11-20 14:58:10 +07:00
|
|
|
nvme_disable_ctrl(dev,
|
|
|
|
lo_hi_readq(dev->bar + NVME_REG_CAP));
|
2015-01-08 08:55:50 +07:00
|
|
|
nvme_clear_queue(dev->queues[0]);
|
2013-12-11 03:10:40 +07:00
|
|
|
flush_kthread_worker(dq->worker);
|
2015-01-08 08:55:50 +07:00
|
|
|
nvme_disable_queue(dev, 0);
|
2013-12-11 03:10:40 +07:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
set_current_state(TASK_RUNNING);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void nvme_put_dq(struct nvme_delq_ctx *dq)
|
|
|
|
{
|
|
|
|
atomic_dec(&dq->refcount);
|
|
|
|
if (dq->waiter)
|
|
|
|
wake_up_process(dq->waiter);
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct nvme_delq_ctx *nvme_get_dq(struct nvme_delq_ctx *dq)
|
|
|
|
{
|
|
|
|
atomic_inc(&dq->refcount);
|
|
|
|
return dq;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void nvme_del_queue_end(struct nvme_queue *nvmeq)
|
|
|
|
{
|
|
|
|
struct nvme_delq_ctx *dq = nvmeq->cmdinfo.ctx;
|
|
|
|
nvme_put_dq(dq);
|
2015-11-20 22:38:13 +07:00
|
|
|
|
|
|
|
spin_lock_irq(&nvmeq->q_lock);
|
|
|
|
nvme_process_cq(nvmeq);
|
|
|
|
spin_unlock_irq(&nvmeq->q_lock);
|
2013-12-11 03:10:40 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
static int adapter_async_del_queue(struct nvme_queue *nvmeq, u8 opcode,
|
|
|
|
kthread_work_func_t fn)
|
|
|
|
{
|
|
|
|
struct nvme_command c;
|
|
|
|
|
|
|
|
memset(&c, 0, sizeof(c));
|
|
|
|
c.delete_queue.opcode = opcode;
|
|
|
|
c.delete_queue.qid = cpu_to_le16(nvmeq->qid);
|
|
|
|
|
|
|
|
init_kthread_work(&nvmeq->cmdinfo.work, fn);
|
2014-11-04 22:20:14 +07:00
|
|
|
return nvme_submit_admin_async_cmd(nvmeq->dev, &c, &nvmeq->cmdinfo,
|
|
|
|
ADMIN_TIMEOUT);
|
2013-12-11 03:10:40 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
static void nvme_del_cq_work_handler(struct kthread_work *work)
|
|
|
|
{
|
|
|
|
struct nvme_queue *nvmeq = container_of(work, struct nvme_queue,
|
|
|
|
cmdinfo.work);
|
|
|
|
nvme_del_queue_end(nvmeq);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int nvme_delete_cq(struct nvme_queue *nvmeq)
|
|
|
|
{
|
|
|
|
return adapter_async_del_queue(nvmeq, nvme_admin_delete_cq,
|
|
|
|
nvme_del_cq_work_handler);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void nvme_del_sq_work_handler(struct kthread_work *work)
|
|
|
|
{
|
|
|
|
struct nvme_queue *nvmeq = container_of(work, struct nvme_queue,
|
|
|
|
cmdinfo.work);
|
|
|
|
int status = nvmeq->cmdinfo.status;
|
|
|
|
|
|
|
|
if (!status)
|
|
|
|
status = nvme_delete_cq(nvmeq);
|
|
|
|
if (status)
|
|
|
|
nvme_del_queue_end(nvmeq);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int nvme_delete_sq(struct nvme_queue *nvmeq)
|
|
|
|
{
|
|
|
|
return adapter_async_del_queue(nvmeq, nvme_admin_delete_sq,
|
|
|
|
nvme_del_sq_work_handler);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void nvme_del_queue_start(struct kthread_work *work)
|
|
|
|
{
|
|
|
|
struct nvme_queue *nvmeq = container_of(work, struct nvme_queue,
|
|
|
|
cmdinfo.work);
|
|
|
|
if (nvme_delete_sq(nvmeq))
|
|
|
|
nvme_del_queue_end(nvmeq);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void nvme_disable_io_queues(struct nvme_dev *dev)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
DEFINE_KTHREAD_WORKER_ONSTACK(worker);
|
|
|
|
struct nvme_delq_ctx dq;
|
|
|
|
struct task_struct *kworker_task = kthread_run(kthread_worker_fn,
|
2015-11-26 16:06:56 +07:00
|
|
|
&worker, "nvme%d", dev->ctrl.instance);
|
2013-12-11 03:10:40 +07:00
|
|
|
|
|
|
|
if (IS_ERR(kworker_task)) {
|
2015-05-22 16:12:39 +07:00
|
|
|
dev_err(dev->dev,
|
2013-12-11 03:10:40 +07:00
|
|
|
"Failed to create queue del task\n");
|
|
|
|
for (i = dev->queue_count - 1; i > 0; i--)
|
|
|
|
nvme_disable_queue(dev, i);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
dq.waiter = NULL;
|
|
|
|
atomic_set(&dq.refcount, 0);
|
|
|
|
dq.worker = &worker;
|
|
|
|
for (i = dev->queue_count - 1; i > 0; i--) {
|
2014-11-04 22:20:14 +07:00
|
|
|
struct nvme_queue *nvmeq = dev->queues[i];
|
2013-12-11 03:10:40 +07:00
|
|
|
|
|
|
|
if (nvme_suspend_queue(nvmeq))
|
|
|
|
continue;
|
|
|
|
nvmeq->cmdinfo.ctx = nvme_get_dq(&dq);
|
|
|
|
nvmeq->cmdinfo.worker = dq.worker;
|
|
|
|
init_kthread_work(&nvmeq->cmdinfo.work, nvme_del_queue_start);
|
|
|
|
queue_kthread_work(dq.worker, &nvmeq->cmdinfo.work);
|
|
|
|
}
|
|
|
|
nvme_wait_dq(&dq, dev);
|
|
|
|
kthread_stop(kworker_task);
|
|
|
|
}
|
|
|
|
|
2014-04-08 06:10:11 +07:00
|
|
|
/*
|
|
|
|
* Remove the node from the device list and check
|
|
|
|
* for whether or not we need to stop the nvme_thread.
|
|
|
|
*/
|
|
|
|
static void nvme_dev_list_remove(struct nvme_dev *dev)
|
|
|
|
{
|
|
|
|
struct task_struct *tmp = NULL;
|
|
|
|
|
|
|
|
spin_lock(&dev_list_lock);
|
|
|
|
list_del_init(&dev->node);
|
|
|
|
if (list_empty(&dev_list) && !IS_ERR_OR_NULL(nvme_thread)) {
|
|
|
|
tmp = nvme_thread;
|
|
|
|
nvme_thread = NULL;
|
|
|
|
}
|
|
|
|
spin_unlock(&dev_list_lock);
|
|
|
|
|
|
|
|
if (tmp)
|
|
|
|
kthread_stop(tmp);
|
|
|
|
}
|
|
|
|
|
2015-01-08 08:55:52 +07:00
|
|
|
static void nvme_freeze_queues(struct nvme_dev *dev)
|
|
|
|
{
|
|
|
|
struct nvme_ns *ns;
|
|
|
|
|
|
|
|
list_for_each_entry(ns, &dev->namespaces, list) {
|
|
|
|
blk_mq_freeze_queue_start(ns->queue);
|
|
|
|
|
2015-05-07 14:38:14 +07:00
|
|
|
spin_lock_irq(ns->queue->queue_lock);
|
2015-01-08 08:55:52 +07:00
|
|
|
queue_flag_set(QUEUE_FLAG_STOPPED, ns->queue);
|
2015-05-07 14:38:14 +07:00
|
|
|
spin_unlock_irq(ns->queue->queue_lock);
|
2015-01-08 08:55:52 +07:00
|
|
|
|
|
|
|
blk_mq_cancel_requeue_work(ns->queue);
|
|
|
|
blk_mq_stop_hw_queues(ns->queue);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void nvme_unfreeze_queues(struct nvme_dev *dev)
|
|
|
|
{
|
|
|
|
struct nvme_ns *ns;
|
|
|
|
|
|
|
|
list_for_each_entry(ns, &dev->namespaces, list) {
|
|
|
|
queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, ns->queue);
|
|
|
|
blk_mq_unfreeze_queue(ns->queue);
|
|
|
|
blk_mq_start_stopped_hw_queues(ns->queue, true);
|
|
|
|
blk_mq_kick_requeue_list(ns->queue);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-07-16 04:02:21 +07:00
|
|
|
static void nvme_dev_shutdown(struct nvme_dev *dev)
|
2011-01-21 00:50:14 +07:00
|
|
|
{
|
2013-07-16 04:02:20 +07:00
|
|
|
int i;
|
2014-06-26 00:18:12 +07:00
|
|
|
u32 csts = -1;
|
2013-07-16 04:02:20 +07:00
|
|
|
|
2014-04-08 06:10:11 +07:00
|
|
|
nvme_dev_list_remove(dev);
|
2011-03-03 06:37:18 +07:00
|
|
|
|
2015-01-08 08:55:52 +07:00
|
|
|
if (dev->bar) {
|
|
|
|
nvme_freeze_queues(dev);
|
2015-11-20 14:58:10 +07:00
|
|
|
csts = readl(dev->bar + NVME_REG_CSTS);
|
2015-01-08 08:55:52 +07:00
|
|
|
}
|
2014-06-26 00:18:12 +07:00
|
|
|
if (csts & NVME_CSTS_CFS || !(csts & NVME_CSTS_RDY)) {
|
2013-12-11 03:10:40 +07:00
|
|
|
for (i = dev->queue_count - 1; i >= 0; i--) {
|
2014-11-04 22:20:14 +07:00
|
|
|
struct nvme_queue *nvmeq = dev->queues[i];
|
2013-12-11 03:10:40 +07:00
|
|
|
nvme_suspend_queue(nvmeq);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
nvme_disable_io_queues(dev);
|
2013-07-16 04:02:22 +07:00
|
|
|
nvme_shutdown_ctrl(dev);
|
2013-12-11 03:10:40 +07:00
|
|
|
nvme_disable_queue(dev, 0);
|
|
|
|
}
|
2013-07-16 04:02:21 +07:00
|
|
|
nvme_dev_unmap(dev);
|
2015-02-20 00:34:48 +07:00
|
|
|
|
|
|
|
for (i = dev->queue_count - 1; i >= 0; i--)
|
|
|
|
nvme_clear_queue(dev->queues[i]);
|
2013-07-16 04:02:21 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
static void nvme_dev_remove(struct nvme_dev *dev)
|
|
|
|
{
|
2015-10-02 23:37:28 +07:00
|
|
|
struct nvme_ns *ns, *next;
|
2013-07-16 04:02:21 +07:00
|
|
|
|
2015-10-02 23:37:28 +07:00
|
|
|
list_for_each_entry_safe(ns, next, &dev->namespaces, list)
|
2015-06-02 03:28:14 +07:00
|
|
|
nvme_ns_remove(ns);
|
2011-01-21 00:50:14 +07:00
|
|
|
}
|
|
|
|
|
2011-02-10 21:56:01 +07:00
|
|
|
static int nvme_setup_prp_pools(struct nvme_dev *dev)
|
|
|
|
{
|
2015-05-22 16:12:39 +07:00
|
|
|
dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
|
2011-02-10 21:56:01 +07:00
|
|
|
PAGE_SIZE, PAGE_SIZE, 0);
|
|
|
|
if (!dev->prp_page_pool)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2011-02-10 22:30:34 +07:00
|
|
|
/* Optimisation for I/Os between 4k and 128k */
|
2015-05-22 16:12:39 +07:00
|
|
|
dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev,
|
2011-02-10 22:30:34 +07:00
|
|
|
256, 256, 0);
|
|
|
|
if (!dev->prp_small_pool) {
|
|
|
|
dma_pool_destroy(dev->prp_page_pool);
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
2011-02-10 21:56:01 +07:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void nvme_release_prp_pools(struct nvme_dev *dev)
|
|
|
|
{
|
|
|
|
dma_pool_destroy(dev->prp_page_pool);
|
2011-02-10 22:30:34 +07:00
|
|
|
dma_pool_destroy(dev->prp_small_pool);
|
2011-02-10 21:56:01 +07:00
|
|
|
}
|
|
|
|
|
2012-02-22 06:50:53 +07:00
|
|
|
/* ID allocator that hands out the per-controller instance numbers. */
static DEFINE_IDA(nvme_instance_ida);
|
|
|
|
|
|
|
|
static int nvme_set_instance(struct nvme_dev *dev)
|
2011-01-21 00:50:14 +07:00
|
|
|
{
|
2012-02-22 06:50:53 +07:00
|
|
|
int instance, error;
|
|
|
|
|
|
|
|
do {
|
|
|
|
if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL))
|
|
|
|
return -ENODEV;
|
|
|
|
|
|
|
|
spin_lock(&dev_list_lock);
|
|
|
|
error = ida_get_new(&nvme_instance_ida, &instance);
|
|
|
|
spin_unlock(&dev_list_lock);
|
|
|
|
} while (error == -EAGAIN);
|
|
|
|
|
|
|
|
if (error)
|
|
|
|
return -ENODEV;
|
|
|
|
|
2015-11-26 16:06:56 +07:00
|
|
|
dev->ctrl.instance = instance;
|
2012-02-22 06:50:53 +07:00
|
|
|
return 0;
|
2011-01-21 00:50:14 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
static void nvme_release_instance(struct nvme_dev *dev)
|
|
|
|
{
|
2012-02-22 06:50:53 +07:00
|
|
|
spin_lock(&dev_list_lock);
|
2015-11-26 16:06:56 +07:00
|
|
|
ida_remove(&nvme_instance_ida, dev->ctrl.instance);
|
2012-02-22 06:50:53 +07:00
|
|
|
spin_unlock(&dev_list_lock);
|
2011-01-21 00:50:14 +07:00
|
|
|
}
|
|
|
|
|
2013-02-20 00:17:58 +07:00
|
|
|
static void nvme_free_dev(struct kref *kref)
|
|
|
|
{
|
|
|
|
struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref);
|
2014-02-01 06:53:39 +07:00
|
|
|
|
2015-05-22 16:12:39 +07:00
|
|
|
put_device(dev->dev);
|
2015-02-04 01:21:42 +07:00
|
|
|
put_device(dev->device);
|
2014-12-11 22:24:18 +07:00
|
|
|
nvme_release_instance(dev);
|
2015-06-08 23:08:13 +07:00
|
|
|
if (dev->tagset.tags)
|
|
|
|
blk_mq_free_tag_set(&dev->tagset);
|
2015-11-26 16:06:56 +07:00
|
|
|
if (dev->ctrl.admin_q)
|
|
|
|
blk_put_queue(dev->ctrl.admin_q);
|
2013-02-20 00:17:58 +07:00
|
|
|
kfree(dev->queues);
|
|
|
|
kfree(dev->entry);
|
|
|
|
kfree(dev);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int nvme_dev_open(struct inode *inode, struct file *f)
|
|
|
|
{
|
2015-02-04 01:21:42 +07:00
|
|
|
struct nvme_dev *dev;
|
|
|
|
int instance = iminor(inode);
|
|
|
|
int ret = -ENODEV;
|
|
|
|
|
|
|
|
spin_lock(&dev_list_lock);
|
|
|
|
list_for_each_entry(dev, &dev_list, node) {
|
2015-11-26 16:06:56 +07:00
|
|
|
if (dev->ctrl.instance == instance) {
|
|
|
|
if (!dev->ctrl.admin_q) {
|
2015-02-13 05:33:00 +07:00
|
|
|
ret = -EWOULDBLOCK;
|
|
|
|
break;
|
|
|
|
}
|
2015-02-04 01:21:42 +07:00
|
|
|
if (!kref_get_unless_zero(&dev->kref))
|
|
|
|
break;
|
|
|
|
f->private_data = dev;
|
|
|
|
ret = 0;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
spin_unlock(&dev_list_lock);
|
|
|
|
|
|
|
|
return ret;
|
2013-02-20 00:17:58 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
static int nvme_dev_release(struct inode *inode, struct file *f)
|
|
|
|
{
|
|
|
|
struct nvme_dev *dev = f->private_data;
|
|
|
|
kref_put(&dev->kref, nvme_free_dev);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
|
|
|
|
{
|
|
|
|
struct nvme_dev *dev = f->private_data;
|
2014-11-04 22:20:14 +07:00
|
|
|
struct nvme_ns *ns;
|
|
|
|
|
2013-02-20 00:17:58 +07:00
|
|
|
switch (cmd) {
|
|
|
|
case NVME_IOCTL_ADMIN_CMD:
|
2015-11-26 16:06:56 +07:00
|
|
|
return nvme_user_cmd(&dev->ctrl, NULL, (void __user *)arg);
|
2014-09-13 05:07:20 +07:00
|
|
|
case NVME_IOCTL_IO_CMD:
|
2014-11-04 22:20:14 +07:00
|
|
|
if (list_empty(&dev->namespaces))
|
|
|
|
return -ENOTTY;
|
|
|
|
ns = list_first_entry(&dev->namespaces, struct nvme_ns, list);
|
2015-11-26 16:06:56 +07:00
|
|
|
return nvme_user_cmd(&dev->ctrl, ns, (void __user *)arg);
|
2015-06-05 23:30:08 +07:00
|
|
|
case NVME_IOCTL_RESET:
|
|
|
|
dev_warn(dev->dev, "resetting controller\n");
|
|
|
|
return nvme_reset(dev);
|
2015-08-11 04:20:41 +07:00
|
|
|
case NVME_IOCTL_SUBSYS_RESET:
|
|
|
|
return nvme_subsys_reset(dev);
|
2013-02-20 00:17:58 +07:00
|
|
|
default:
|
|
|
|
return -ENOTTY;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static const struct file_operations nvme_dev_fops = {
|
|
|
|
.owner = THIS_MODULE,
|
|
|
|
.open = nvme_dev_open,
|
|
|
|
.release = nvme_dev_release,
|
|
|
|
.unlocked_ioctl = nvme_dev_ioctl,
|
|
|
|
.compat_ioctl = nvme_dev_ioctl,
|
|
|
|
};
|
|
|
|
|
2015-10-03 14:49:23 +07:00
|
|
|
static void nvme_probe_work(struct work_struct *work)
|
2013-07-16 04:02:21 +07:00
|
|
|
{
|
2015-10-03 14:49:23 +07:00
|
|
|
struct nvme_dev *dev = container_of(work, struct nvme_dev, probe_work);
|
2014-04-08 06:10:11 +07:00
|
|
|
bool start_thread = false;
|
2015-10-03 14:49:23 +07:00
|
|
|
int result;
|
2013-07-16 04:02:21 +07:00
|
|
|
|
|
|
|
result = nvme_dev_map(dev);
|
|
|
|
if (result)
|
2015-10-03 14:49:23 +07:00
|
|
|
goto out;
|
2013-07-16 04:02:21 +07:00
|
|
|
|
|
|
|
result = nvme_configure_admin_queue(dev);
|
|
|
|
if (result)
|
|
|
|
goto unmap;
|
|
|
|
|
|
|
|
spin_lock(&dev_list_lock);
|
2014-04-08 06:10:11 +07:00
|
|
|
if (list_empty(&dev_list) && IS_ERR_OR_NULL(nvme_thread)) {
|
|
|
|
start_thread = true;
|
|
|
|
nvme_thread = NULL;
|
|
|
|
}
|
2013-07-16 04:02:21 +07:00
|
|
|
list_add(&dev->node, &dev_list);
|
|
|
|
spin_unlock(&dev_list_lock);
|
|
|
|
|
2014-04-08 06:10:11 +07:00
|
|
|
if (start_thread) {
|
|
|
|
nvme_thread = kthread_run(nvme_kthread, NULL, "nvme");
|
2014-09-23 02:46:19 +07:00
|
|
|
wake_up_all(&nvme_kthread_wait);
|
2014-04-08 06:10:11 +07:00
|
|
|
} else
|
|
|
|
wait_event_killable(nvme_kthread_wait, nvme_thread);
|
|
|
|
|
|
|
|
if (IS_ERR_OR_NULL(nvme_thread)) {
|
|
|
|
result = nvme_thread ? PTR_ERR(nvme_thread) : -EINTR;
|
|
|
|
goto disable;
|
|
|
|
}
|
2014-11-04 22:20:14 +07:00
|
|
|
|
|
|
|
nvme_init_queue(dev->queues[0], 0);
|
2015-01-08 08:55:50 +07:00
|
|
|
result = nvme_alloc_admin_tags(dev);
|
|
|
|
if (result)
|
|
|
|
goto disable;
|
2014-04-08 06:10:11 +07:00
|
|
|
|
2013-07-16 04:02:21 +07:00
|
|
|
result = nvme_setup_io_queues(dev);
|
2014-06-24 03:25:35 +07:00
|
|
|
if (result)
|
2015-01-08 08:55:50 +07:00
|
|
|
goto free_tags;
|
2013-07-16 04:02:21 +07:00
|
|
|
|
2015-11-26 16:06:56 +07:00
|
|
|
dev->ctrl.event_limit = 1;
|
2015-10-03 14:49:23 +07:00
|
|
|
|
2015-10-02 23:51:31 +07:00
|
|
|
/*
|
|
|
|
* Keep the controller around but remove all namespaces if we don't have
|
|
|
|
* any working I/O queue.
|
|
|
|
*/
|
2015-10-03 14:49:23 +07:00
|
|
|
if (dev->online_queues < 2) {
|
|
|
|
dev_warn(dev->dev, "IO queues not created\n");
|
|
|
|
nvme_dev_remove(dev);
|
|
|
|
} else {
|
|
|
|
nvme_unfreeze_queues(dev);
|
|
|
|
nvme_dev_add(dev);
|
|
|
|
}
|
|
|
|
|
|
|
|
return;
|
2013-07-16 04:02:21 +07:00
|
|
|
|
2015-01-08 08:55:50 +07:00
|
|
|
free_tags:
|
|
|
|
nvme_dev_remove_admin(dev);
|
2015-11-26 16:06:56 +07:00
|
|
|
blk_put_queue(dev->ctrl.admin_q);
|
|
|
|
dev->ctrl.admin_q = NULL;
|
2015-06-08 23:08:13 +07:00
|
|
|
dev->queues[0]->tags = NULL;
|
2013-07-16 04:02:21 +07:00
|
|
|
disable:
|
2013-12-17 01:50:00 +07:00
|
|
|
nvme_disable_queue(dev, 0);
|
2014-04-08 06:10:11 +07:00
|
|
|
nvme_dev_list_remove(dev);
|
2013-07-16 04:02:21 +07:00
|
|
|
unmap:
|
|
|
|
nvme_dev_unmap(dev);
|
2015-10-03 14:49:23 +07:00
|
|
|
out:
|
|
|
|
if (!work_busy(&dev->reset_work))
|
|
|
|
nvme_dead_ctrl(dev);
|
2013-07-16 04:02:21 +07:00
|
|
|
}
|
|
|
|
|
2013-12-11 03:10:36 +07:00
|
|
|
static int nvme_remove_dead_ctrl(void *arg)
|
|
|
|
{
|
|
|
|
struct nvme_dev *dev = (struct nvme_dev *)arg;
|
2015-05-22 16:12:39 +07:00
|
|
|
struct pci_dev *pdev = to_pci_dev(dev->dev);
|
2013-12-11 03:10:36 +07:00
|
|
|
|
|
|
|
if (pci_get_drvdata(pdev))
|
2014-06-24 04:24:53 +07:00
|
|
|
pci_stop_and_remove_bus_device_locked(pdev);
|
2013-12-11 03:10:36 +07:00
|
|
|
kref_put(&dev->kref, nvme_free_dev);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2015-06-19 02:36:39 +07:00
|
|
|
static void nvme_dead_ctrl(struct nvme_dev *dev)
|
|
|
|
{
|
|
|
|
dev_warn(dev->dev, "Device failed to resume\n");
|
|
|
|
kref_get(&dev->kref);
|
|
|
|
if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d",
|
2015-11-26 16:06:56 +07:00
|
|
|
dev->ctrl.instance))) {
|
2015-06-19 02:36:39 +07:00
|
|
|
dev_err(dev->dev,
|
|
|
|
"Failed to start controller remove task\n");
|
|
|
|
kref_put(&dev->kref, nvme_free_dev);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-10-02 22:41:18 +07:00
|
|
|
static void nvme_reset_work(struct work_struct *ws)
|
2013-12-11 03:10:36 +07:00
|
|
|
{
|
2015-10-02 22:41:18 +07:00
|
|
|
struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work);
|
2015-06-08 23:08:15 +07:00
|
|
|
bool in_probe = work_busy(&dev->probe_work);
|
|
|
|
|
2013-12-11 03:10:36 +07:00
|
|
|
nvme_dev_shutdown(dev);
|
2015-06-08 23:08:15 +07:00
|
|
|
|
|
|
|
/* Synchronize with device probe so that work will see failure status
|
|
|
|
* and exit gracefully without trying to schedule another reset */
|
|
|
|
flush_work(&dev->probe_work);
|
|
|
|
|
|
|
|
/* Fail this device if reset occured during probe to avoid
|
|
|
|
* infinite initialization loops. */
|
|
|
|
if (in_probe) {
|
2015-06-19 02:36:39 +07:00
|
|
|
nvme_dead_ctrl(dev);
|
2015-06-08 23:08:15 +07:00
|
|
|
return;
|
2013-12-11 03:10:36 +07:00
|
|
|
}
|
2015-06-08 23:08:15 +07:00
|
|
|
/* Schedule device resume asynchronously so the reset work is available
|
|
|
|
* to cleanup errors that may occur during reinitialization */
|
|
|
|
schedule_work(&dev->probe_work);
|
2013-12-11 03:10:36 +07:00
|
|
|
}
|
|
|
|
|
2015-10-02 23:49:23 +07:00
|
|
|
static int __nvme_reset(struct nvme_dev *dev)
|
2014-03-07 22:24:49 +07:00
|
|
|
{
|
2015-10-02 23:49:23 +07:00
|
|
|
if (work_pending(&dev->reset_work))
|
|
|
|
return -EBUSY;
|
|
|
|
list_del_init(&dev->node);
|
|
|
|
queue_work(nvme_workq, &dev->reset_work);
|
|
|
|
return 0;
|
2014-03-07 22:24:49 +07:00
|
|
|
}
|
|
|
|
|
2015-06-05 23:30:08 +07:00
|
|
|
static int nvme_reset(struct nvme_dev *dev)
|
|
|
|
{
|
2015-10-02 23:49:23 +07:00
|
|
|
int ret;
|
2015-06-05 23:30:08 +07:00
|
|
|
|
2015-11-26 16:06:56 +07:00
|
|
|
if (!dev->ctrl.admin_q || blk_queue_dying(dev->ctrl.admin_q))
|
2015-06-05 23:30:08 +07:00
|
|
|
return -ENODEV;
|
|
|
|
|
|
|
|
spin_lock(&dev_list_lock);
|
2015-10-02 23:49:23 +07:00
|
|
|
ret = __nvme_reset(dev);
|
2015-06-05 23:30:08 +07:00
|
|
|
spin_unlock(&dev_list_lock);
|
|
|
|
|
|
|
|
if (!ret) {
|
|
|
|
flush_work(&dev->reset_work);
|
2015-06-08 23:08:15 +07:00
|
|
|
flush_work(&dev->probe_work);
|
2015-06-05 23:30:08 +07:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static ssize_t nvme_sysfs_reset(struct device *dev,
|
|
|
|
struct device_attribute *attr, const char *buf,
|
|
|
|
size_t count)
|
|
|
|
{
|
|
|
|
struct nvme_dev *ndev = dev_get_drvdata(dev);
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = nvme_reset(ndev);
|
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
return count;
|
|
|
|
}
|
|
|
|
static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset);
|
|
|
|
|
2015-11-26 16:06:56 +07:00
|
|
|
/*
 * Read the 32-bit controller register at byte offset @off from the mapped
 * BAR and store it in *@val. Always returns 0.
 */
static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
{
	struct nvme_dev *dev = to_nvme_dev(ctrl);

	*val = readl(dev->bar + off);
	return 0;
}
|
|
|
|
|
|
|
|
static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
|
|
|
|
.reg_read32 = nvme_pci_reg_read32,
|
|
|
|
};
|
|
|
|
|
2012-12-22 06:13:49 +07:00
|
|
|
static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
|
2011-01-21 00:50:14 +07:00
|
|
|
{
|
2014-11-04 22:20:14 +07:00
|
|
|
int node, result = -ENOMEM;
|
2011-01-21 00:50:14 +07:00
|
|
|
struct nvme_dev *dev;
|
|
|
|
|
2014-11-04 22:20:14 +07:00
|
|
|
node = dev_to_node(&pdev->dev);
|
|
|
|
if (node == NUMA_NO_NODE)
|
|
|
|
set_dev_node(&pdev->dev, 0);
|
|
|
|
|
|
|
|
dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
|
2011-01-21 00:50:14 +07:00
|
|
|
if (!dev)
|
|
|
|
return -ENOMEM;
|
2014-11-04 22:20:14 +07:00
|
|
|
dev->entry = kzalloc_node(num_possible_cpus() * sizeof(*dev->entry),
|
|
|
|
GFP_KERNEL, node);
|
2011-01-21 00:50:14 +07:00
|
|
|
if (!dev->entry)
|
|
|
|
goto free;
|
2014-11-04 22:20:14 +07:00
|
|
|
dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *),
|
|
|
|
GFP_KERNEL, node);
|
2011-01-21 00:50:14 +07:00
|
|
|
if (!dev->queues)
|
|
|
|
goto free;
|
|
|
|
|
|
|
|
INIT_LIST_HEAD(&dev->namespaces);
|
2015-10-02 22:41:18 +07:00
|
|
|
INIT_WORK(&dev->reset_work, nvme_reset_work);
|
2015-05-22 16:12:39 +07:00
|
|
|
dev->dev = get_device(&pdev->dev);
|
2013-12-11 03:10:36 +07:00
|
|
|
pci_set_drvdata(pdev, dev);
|
2015-11-26 16:06:56 +07:00
|
|
|
|
|
|
|
dev->ctrl.ops = &nvme_pci_ctrl_ops;
|
|
|
|
dev->ctrl.dev = dev->dev;
|
|
|
|
|
2012-02-22 06:50:53 +07:00
|
|
|
result = nvme_set_instance(dev);
|
|
|
|
if (result)
|
2014-08-20 08:15:59 +07:00
|
|
|
goto put_pci;
|
2011-01-21 00:50:14 +07:00
|
|
|
|
2011-02-10 21:56:01 +07:00
|
|
|
result = nvme_setup_prp_pools(dev);
|
|
|
|
if (result)
|
2013-07-16 04:02:19 +07:00
|
|
|
goto release;
|
2011-02-10 21:56:01 +07:00
|
|
|
|
2014-03-04 01:09:47 +07:00
|
|
|
kref_init(&dev->kref);
|
2015-02-04 01:21:42 +07:00
|
|
|
dev->device = device_create(nvme_class, &pdev->dev,
|
2015-11-26 16:06:56 +07:00
|
|
|
MKDEV(nvme_char_major, dev->ctrl.instance),
|
|
|
|
dev, "nvme%d", dev->ctrl.instance);
|
2015-02-04 01:21:42 +07:00
|
|
|
if (IS_ERR(dev->device)) {
|
|
|
|
result = PTR_ERR(dev->device);
|
2015-02-13 05:33:00 +07:00
|
|
|
goto release_pools;
|
2015-02-04 01:21:42 +07:00
|
|
|
}
|
|
|
|
get_device(dev->device);
|
2015-06-05 23:30:08 +07:00
|
|
|
dev_set_drvdata(dev->device, dev);
|
|
|
|
|
|
|
|
result = device_create_file(dev->device, &dev_attr_reset_controller);
|
|
|
|
if (result)
|
|
|
|
goto put_dev;
|
2011-02-16 04:28:20 +07:00
|
|
|
|
2015-03-23 22:32:37 +07:00
|
|
|
INIT_LIST_HEAD(&dev->node);
|
2015-06-02 03:28:14 +07:00
|
|
|
INIT_WORK(&dev->scan_work, nvme_dev_scan);
|
2015-10-03 14:49:23 +07:00
|
|
|
INIT_WORK(&dev->probe_work, nvme_probe_work);
|
2015-02-13 05:33:00 +07:00
|
|
|
schedule_work(&dev->probe_work);
|
2011-01-21 00:50:14 +07:00
|
|
|
return 0;
|
|
|
|
|
2015-06-05 23:30:08 +07:00
|
|
|
put_dev:
|
2015-11-26 16:06:56 +07:00
|
|
|
device_destroy(nvme_class, MKDEV(nvme_char_major, dev->ctrl.instance));
|
2015-06-05 23:30:08 +07:00
|
|
|
put_device(dev->device);
|
2013-07-16 04:02:19 +07:00
|
|
|
release_pools:
|
2011-02-10 21:56:01 +07:00
|
|
|
nvme_release_prp_pools(dev);
|
2013-07-16 04:02:19 +07:00
|
|
|
release:
|
|
|
|
nvme_release_instance(dev);
|
2014-08-20 08:15:59 +07:00
|
|
|
put_pci:
|
2015-05-22 16:12:39 +07:00
|
|
|
put_device(dev->dev);
|
2011-01-21 00:50:14 +07:00
|
|
|
free:
|
|
|
|
kfree(dev->queues);
|
|
|
|
kfree(dev->entry);
|
|
|
|
kfree(dev);
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2014-05-02 23:40:43 +07:00
|
|
|
/*
 * PCI reset notification callback. With @prepare set the device is shut
 * down; otherwise the probe work is scheduled.
 */
static void nvme_reset_notify(struct pci_dev *pdev, bool prepare)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	if (!prepare) {
		schedule_work(&dev->probe_work);
		return;
	}

	nvme_dev_shutdown(dev);
}
|
|
|
|
|
2014-01-27 23:29:40 +07:00
|
|
|
/* PCI shutdown callback: shut the controller down. */
static void nvme_shutdown(struct pci_dev *pdev)
{
	nvme_dev_shutdown(pci_get_drvdata(pdev));
}
|
|
|
|
|
2012-12-22 06:13:49 +07:00
|
|
|
static void nvme_remove(struct pci_dev *pdev)
|
2011-01-21 00:50:14 +07:00
|
|
|
{
|
|
|
|
struct nvme_dev *dev = pci_get_drvdata(pdev);
|
2013-12-11 03:10:36 +07:00
|
|
|
|
|
|
|
spin_lock(&dev_list_lock);
|
|
|
|
list_del_init(&dev->node);
|
|
|
|
spin_unlock(&dev_list_lock);
|
|
|
|
|
|
|
|
pci_set_drvdata(pdev, NULL);
|
2015-02-13 05:33:00 +07:00
|
|
|
flush_work(&dev->probe_work);
|
2013-12-11 03:10:36 +07:00
|
|
|
flush_work(&dev->reset_work);
|
2015-06-02 03:28:14 +07:00
|
|
|
flush_work(&dev->scan_work);
|
2015-06-05 23:30:08 +07:00
|
|
|
device_remove_file(dev->device, &dev_attr_reset_controller);
|
2015-01-08 08:55:52 +07:00
|
|
|
nvme_dev_remove(dev);
|
2015-06-19 02:36:40 +07:00
|
|
|
nvme_dev_shutdown(dev);
|
2014-11-04 22:20:14 +07:00
|
|
|
nvme_dev_remove_admin(dev);
|
2015-11-26 16:06:56 +07:00
|
|
|
device_destroy(nvme_class, MKDEV(nvme_char_major, dev->ctrl.instance));
|
2013-12-17 01:50:00 +07:00
|
|
|
nvme_free_queues(dev, 0);
|
2015-07-20 23:14:09 +07:00
|
|
|
nvme_release_cmb(dev);
|
2013-12-11 03:10:36 +07:00
|
|
|
nvme_release_prp_pools(dev);
|
2013-02-20 00:17:58 +07:00
|
|
|
kref_put(&dev->kref, nvme_free_dev);
|
2011-01-21 00:50:14 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * PCI error-recovery callbacks that are not implemented yet. They are
 * defined as NULL so the handler table can still name every slot.
 */
#define nvme_error_detected	NULL
#define nvme_dump_registers	NULL
#define nvme_link_reset		NULL
#define nvme_slot_reset		NULL
#define nvme_error_resume	NULL
|
2013-07-16 04:02:23 +07:00
|
|
|
|
2014-02-13 09:19:14 +07:00
|
|
|
#ifdef CONFIG_PM_SLEEP
|
2013-07-16 04:02:23 +07:00
|
|
|
/* PM suspend callback: shut the controller down. Always returns 0. */
static int nvme_suspend(struct device *dev)
{
	struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev));

	nvme_dev_shutdown(ndev);

	return 0;
}
|
|
|
|
|
|
|
|
static int nvme_resume(struct device *dev)
|
|
|
|
{
|
|
|
|
struct pci_dev *pdev = to_pci_dev(dev);
|
|
|
|
struct nvme_dev *ndev = pci_get_drvdata(pdev);
|
|
|
|
|
2015-10-02 23:37:29 +07:00
|
|
|
schedule_work(&ndev->probe_work);
|
2013-12-11 03:10:36 +07:00
|
|
|
return 0;
|
2013-07-16 04:02:23 +07:00
|
|
|
}
|
2014-02-13 09:19:14 +07:00
|
|
|
#endif
|
2013-07-16 04:02:23 +07:00
|
|
|
|
|
|
|
/* Power-management operations wired to nvme_suspend() and nvme_resume(). */
static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume);
|
2011-01-21 00:50:14 +07:00
|
|
|
|
2012-09-07 23:33:17 +07:00
|
|
|
static const struct pci_error_handlers nvme_err_handler = {
|
2011-01-21 00:50:14 +07:00
|
|
|
.error_detected = nvme_error_detected,
|
|
|
|
.mmio_enabled = nvme_dump_registers,
|
|
|
|
.link_reset = nvme_link_reset,
|
|
|
|
.slot_reset = nvme_slot_reset,
|
|
|
|
.resume = nvme_error_resume,
|
2014-05-02 23:40:43 +07:00
|
|
|
.reset_notify = nvme_reset_notify,
|
2011-01-21 00:50:14 +07:00
|
|
|
};
|
|
|
|
|
|
|
|
/* PCI class code matched by this driver. Move to pci_ids.h later. */
#define PCI_CLASS_STORAGE_EXPRESS	0x010802
|
|
|
|
|
2014-03-24 21:11:22 +07:00
|
|
|
static const struct pci_device_id nvme_id_table[] = {
|
2011-01-21 00:50:14 +07:00
|
|
|
{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
|
2015-11-04 06:49:45 +07:00
|
|
|
{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) },
|
2011-01-21 00:50:14 +07:00
|
|
|
{ 0, }
|
|
|
|
};
|
|
|
|
MODULE_DEVICE_TABLE(pci, nvme_id_table);
|
|
|
|
|
|
|
|
static struct pci_driver nvme_driver = {
|
|
|
|
.name = "nvme",
|
|
|
|
.id_table = nvme_id_table,
|
|
|
|
.probe = nvme_probe,
|
2012-12-22 06:13:49 +07:00
|
|
|
.remove = nvme_remove,
|
2014-01-27 23:29:40 +07:00
|
|
|
.shutdown = nvme_shutdown,
|
2013-07-16 04:02:23 +07:00
|
|
|
.driver = {
|
|
|
|
.pm = &nvme_dev_pm_ops,
|
|
|
|
},
|
2011-01-21 00:50:14 +07:00
|
|
|
.err_handler = &nvme_err_handler,
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * Module initialisation.
 *
 * Sets up the kthread wait queue, creates the driver workqueue, registers
 * the block and character device majors (adopting a dynamically assigned
 * major when one is returned), creates the device class and registers the
 * PCI driver. Each failure unwinds the steps that had already succeeded.
 *
 * Returns 0 on success or a negative errno.
 */
static int __init nvme_init(void)
{
	int ret;

	init_waitqueue_head(&nvme_kthread_wait);

	nvme_workq = create_singlethread_workqueue("nvme");
	if (!nvme_workq)
		return -ENOMEM;

	ret = register_blkdev(nvme_major, "nvme");
	if (ret < 0)
		goto kill_workq;
	if (ret > 0)
		nvme_major = ret;

	ret = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme",
							&nvme_dev_fops);
	if (ret < 0)
		goto unregister_blkdev;
	if (ret > 0)
		nvme_char_major = ret;

	nvme_class = class_create(THIS_MODULE, "nvme");
	if (IS_ERR(nvme_class)) {
		ret = PTR_ERR(nvme_class);
		goto unregister_chrdev;
	}

	ret = pci_register_driver(&nvme_driver);
	if (ret)
		goto destroy_class;

	return 0;

 destroy_class:
	class_destroy(nvme_class);
 unregister_chrdev:
	__unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
 unregister_blkdev:
	unregister_blkdev(nvme_major, "nvme");
 kill_workq:
	destroy_workqueue(nvme_workq);
	return ret;
}
|
|
|
|
|
|
|
|
/*
 * Module teardown: unregister the PCI driver and the block major, destroy
 * the workqueue and the device class, and unregister the character device
 * region. By that point no valid nvme_thread may remain.
 */
static void __exit nvme_exit(void)
{
	pci_unregister_driver(&nvme_driver);
	unregister_blkdev(nvme_major, "nvme");
	destroy_workqueue(nvme_workq);
	class_destroy(nvme_class);
	__unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");

	/* A live (non-NULL, non-error) thread pointer here is a driver bug. */
	BUG_ON(nvme_thread && !IS_ERR(nvme_thread));

	_nvme_check_size();
}
|
|
|
|
|
|
|
|
/* Module metadata and entry points. */
MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
MODULE_LICENSE("GPL");
MODULE_VERSION("1.0");

module_init(nvme_init);
module_exit(nvme_exit);
|