mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2025-01-22 18:39:54 +07:00
a4bd217b43
This patch introduces pblk, a host-side translation layer for Open-Channel SSDs to expose them like block devices. The translation layer allows data placement decisions, and I/O scheduling to be managed by the host, enabling users to optimize the SSD for their specific workloads. An open-channel SSD has a set of LUNs (parallel units) and a collection of blocks. Each block can be read in any order, but writes must be sequential. Writes may also fail, and if a block requires it, must also be reset before new writes can be applied. To manage the constraints, pblk maintains a logical to physical address (L2P) table, write cache, garbage collection logic, recovery scheme, and logic to rate-limit user I/Os versus garbage collection I/Os. The L2P table is fully-associative and manages sectors at a 4KB granularity. Pblk stores the L2P table in two places, in the out-of-band area of the media and on the last page of a line. In the case of a power failure, pblk will perform a scan to recover the L2P table. The user data is organized into lines. A line is data striped across blocks and LUNs. The lines enable the host to reduce the amount of metadata to maintain besides the user data and make it easier to implement RAID or erasure coding in the future. pblk implements multi-tenant support and can be instantiated multiple times on the same drive. Each instance owns a portion of the SSD - both regarding I/O bandwidth and capacity - providing I/O isolation for each case. Finally, pblk also exposes a sysfs interface that allows user-space to peek into the internals of pblk. The interface is available at /dev/block/*/pblk/ where * is the block device name exposed.
This work also contains contributions from: Matias Bjørling <matias@cnexlabs.com> Simon A. F. Lund <slund@cnexlabs.com> Young Tack Jin <youngtack.jin@gmail.com> Huaicheng Li <huaicheng@cs.uchicago.edu> Signed-off-by: Javier González <javier@cnexlabs.com> Signed-off-by: Matias Bjørling <matias@cnexlabs.com> Signed-off-by: Jens Axboe <axboe@fb.com>
853 lines
20 KiB
C
853 lines
20 KiB
C
/*
|
|
* Copyright (C) 2016 CNEX Labs
|
|
* Initial release: Javier Gonzalez <javier@cnexlabs.com>
|
|
*
|
|
* Based upon the circular ringbuffer.
|
|
*
|
|
* This program is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU General Public License version
|
|
* 2 as published by the Free Software Foundation.
|
|
*
|
|
* This program is distributed in the hope that it will be useful, but
|
|
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* General Public License for more details.
|
|
*
|
|
* pblk-rb.c - pblk's write buffer
|
|
*/
|
|
|
|
#include <linux/circ_buf.h>
|
|
|
|
#include "pblk.h"
|
|
|
|
/*
 * File-scope rw-semaphore taken for write around the allocation
 * (pblk_rb_init()) and release (pblk_rb_data_free()) of the write buffer's
 * data pages.
 */
static DECLARE_RWSEM(pblk_rb_lock);
|
|
|
|
void pblk_rb_data_free(struct pblk_rb *rb)
|
|
{
|
|
struct pblk_rb_pages *p, *t;
|
|
|
|
down_write(&pblk_rb_lock);
|
|
list_for_each_entry_safe(p, t, &rb->pages, list) {
|
|
free_pages((unsigned long)page_address(p->pages), p->order);
|
|
list_del(&p->list);
|
|
kfree(p);
|
|
}
|
|
up_write(&pblk_rb_lock);
|
|
}
|
|
|
|
/*
|
|
* Initialize ring buffer. The data and metadata buffers must be previously
|
|
* allocated and their size must be a power of two
|
|
* (Documentation/circular-buffers.txt)
|
|
*/
|
|
int pblk_rb_init(struct pblk_rb *rb, struct pblk_rb_entry *rb_entry_base,
|
|
unsigned int power_size, unsigned int power_seg_sz)
|
|
{
|
|
struct pblk *pblk = container_of(rb, struct pblk, rwb);
|
|
unsigned int init_entry = 0;
|
|
unsigned int alloc_order = power_size;
|
|
unsigned int max_order = MAX_ORDER - 1;
|
|
unsigned int order, iter;
|
|
|
|
down_write(&pblk_rb_lock);
|
|
rb->entries = rb_entry_base;
|
|
rb->seg_size = (1 << power_seg_sz);
|
|
rb->nr_entries = (1 << power_size);
|
|
rb->mem = rb->subm = rb->sync = rb->l2p_update = 0;
|
|
rb->sync_point = EMPTY_ENTRY;
|
|
|
|
spin_lock_init(&rb->w_lock);
|
|
spin_lock_init(&rb->s_lock);
|
|
|
|
INIT_LIST_HEAD(&rb->pages);
|
|
|
|
if (alloc_order >= max_order) {
|
|
order = max_order;
|
|
iter = (1 << (alloc_order - max_order));
|
|
} else {
|
|
order = alloc_order;
|
|
iter = 1;
|
|
}
|
|
|
|
do {
|
|
struct pblk_rb_entry *entry;
|
|
struct pblk_rb_pages *page_set;
|
|
void *kaddr;
|
|
unsigned long set_size;
|
|
int i;
|
|
|
|
page_set = kmalloc(sizeof(struct pblk_rb_pages), GFP_KERNEL);
|
|
if (!page_set) {
|
|
up_write(&pblk_rb_lock);
|
|
return -ENOMEM;
|
|
}
|
|
|
|
page_set->order = order;
|
|
page_set->pages = alloc_pages(GFP_KERNEL, order);
|
|
if (!page_set->pages) {
|
|
kfree(page_set);
|
|
pblk_rb_data_free(rb);
|
|
up_write(&pblk_rb_lock);
|
|
return -ENOMEM;
|
|
}
|
|
kaddr = page_address(page_set->pages);
|
|
|
|
entry = &rb->entries[init_entry];
|
|
entry->data = kaddr;
|
|
entry->cacheline = pblk_cacheline_to_addr(init_entry++);
|
|
entry->w_ctx.flags = PBLK_WRITABLE_ENTRY;
|
|
|
|
set_size = (1 << order);
|
|
for (i = 1; i < set_size; i++) {
|
|
entry = &rb->entries[init_entry];
|
|
entry->cacheline = pblk_cacheline_to_addr(init_entry++);
|
|
entry->data = kaddr + (i * rb->seg_size);
|
|
entry->w_ctx.flags = PBLK_WRITABLE_ENTRY;
|
|
bio_list_init(&entry->w_ctx.bios);
|
|
}
|
|
|
|
list_add_tail(&page_set->list, &rb->pages);
|
|
iter--;
|
|
} while (iter > 0);
|
|
up_write(&pblk_rb_lock);
|
|
|
|
#ifdef CONFIG_NVM_DEBUG
|
|
atomic_set(&rb->inflight_sync_point, 0);
|
|
#endif
|
|
|
|
/*
|
|
* Initialize rate-limiter, which controls access to the write buffer
|
|
* but user and GC I/O
|
|
*/
|
|
pblk_rl_init(&pblk->rl, rb->nr_entries);
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
 * pblk_rb_calculate_size -- calculate the size of the write buffer
 *
 * The result is 1 << order, where order is the count order of @nr_entries
 * but never less than 7, i.e. the buffer fits at least 128 entries.
 */
unsigned int pblk_rb_calculate_size(unsigned int nr_entries)
{
	int order = max(get_count_order(nr_entries), 7);

	return (1 << order);
}
|
|
|
|
void *pblk_rb_entries_ref(struct pblk_rb *rb)
|
|
{
|
|
return rb->entries;
|
|
}
|
|
|
|
static void clean_wctx(struct pblk_w_ctx *w_ctx)
|
|
{
|
|
int flags;
|
|
|
|
try:
|
|
flags = READ_ONCE(w_ctx->flags);
|
|
if (!(flags & PBLK_SUBMITTED_ENTRY))
|
|
goto try;
|
|
|
|
/* Release flags on context. Protect from writes and reads */
|
|
smp_store_release(&w_ctx->flags, PBLK_WRITABLE_ENTRY);
|
|
pblk_ppa_set_empty(&w_ctx->ppa);
|
|
}
|
|
|
|
/*
 * Ring arithmetic wrappers around the CIRC_CNT()/CIRC_SPACE() helpers.
 * The @rb argument of pblk_rb_ring_space() is not used by the expansion.
 */
#define pblk_rb_ring_count(head, tail, size) CIRC_CNT(head, tail, size)
#define pblk_rb_ring_space(rb, head, tail, size) (CIRC_SPACE(head, tail, size))
|
|
|
|
/*
|
|
* Buffer space is calculated with respect to the back pointer signaling
|
|
* synchronized entries to the media.
|
|
*/
|
|
static unsigned int pblk_rb_space(struct pblk_rb *rb)
|
|
{
|
|
unsigned int mem = READ_ONCE(rb->mem);
|
|
unsigned int sync = READ_ONCE(rb->sync);
|
|
|
|
return pblk_rb_ring_space(rb, mem, sync, rb->nr_entries);
|
|
}
|
|
|
|
/*
|
|
* Buffer count is calculated with respect to the submission entry signaling the
|
|
* entries that are available to send to the media
|
|
*/
|
|
unsigned int pblk_rb_read_count(struct pblk_rb *rb)
|
|
{
|
|
unsigned int mem = READ_ONCE(rb->mem);
|
|
unsigned int subm = READ_ONCE(rb->subm);
|
|
|
|
return pblk_rb_ring_count(mem, subm, rb->nr_entries);
|
|
}
|
|
|
|
unsigned int pblk_rb_read_commit(struct pblk_rb *rb, unsigned int nr_entries)
|
|
{
|
|
unsigned int subm;
|
|
|
|
subm = READ_ONCE(rb->subm);
|
|
/* Commit read means updating submission pointer */
|
|
smp_store_release(&rb->subm,
|
|
(subm + nr_entries) & (rb->nr_entries - 1));
|
|
|
|
return subm;
|
|
}
|
|
|
|
static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int *l2p_upd,
|
|
unsigned int to_update)
|
|
{
|
|
struct pblk *pblk = container_of(rb, struct pblk, rwb);
|
|
struct pblk_line *line;
|
|
struct pblk_rb_entry *entry;
|
|
struct pblk_w_ctx *w_ctx;
|
|
unsigned int i;
|
|
|
|
for (i = 0; i < to_update; i++) {
|
|
entry = &rb->entries[*l2p_upd];
|
|
w_ctx = &entry->w_ctx;
|
|
|
|
pblk_update_map_dev(pblk, w_ctx->lba, w_ctx->ppa,
|
|
entry->cacheline);
|
|
|
|
line = &pblk->lines[pblk_tgt_ppa_to_line(w_ctx->ppa)];
|
|
kref_put(&line->ref, pblk_line_put);
|
|
clean_wctx(w_ctx);
|
|
*l2p_upd = (*l2p_upd + 1) & (rb->nr_entries - 1);
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* When we move the l2p_update pointer, we update the l2p table - lookups will
|
|
* point to the physical address instead of to the cacheline in the write buffer
|
|
* from this moment on.
|
|
*/
|
|
static int pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int nr_entries,
|
|
unsigned int mem, unsigned int sync)
|
|
{
|
|
unsigned int space, count;
|
|
int ret = 0;
|
|
|
|
lockdep_assert_held(&rb->w_lock);
|
|
|
|
/* Update l2p only as buffer entries are being overwritten */
|
|
space = pblk_rb_ring_space(rb, mem, rb->l2p_update, rb->nr_entries);
|
|
if (space > nr_entries)
|
|
goto out;
|
|
|
|
count = nr_entries - space;
|
|
/* l2p_update used exclusively under rb->w_lock */
|
|
ret = __pblk_rb_update_l2p(rb, &rb->l2p_update, count);
|
|
|
|
out:
|
|
return ret;
|
|
}
|
|
|
|
/*
|
|
* Update the l2p entry for all sectors stored on the write buffer. This means
|
|
* that all future lookups to the l2p table will point to a device address, not
|
|
* to the cacheline in the write buffer.
|
|
*/
|
|
void pblk_rb_sync_l2p(struct pblk_rb *rb)
|
|
{
|
|
unsigned int sync;
|
|
unsigned int to_update;
|
|
|
|
spin_lock(&rb->w_lock);
|
|
|
|
/* Protect from reads and writes */
|
|
sync = smp_load_acquire(&rb->sync);
|
|
|
|
to_update = pblk_rb_ring_count(sync, rb->l2p_update, rb->nr_entries);
|
|
__pblk_rb_update_l2p(rb, &rb->l2p_update, to_update);
|
|
|
|
spin_unlock(&rb->w_lock);
|
|
}
|
|
|
|
/*
|
|
* Write @nr_entries to ring buffer from @data buffer if there is enough space.
|
|
* Typically, 4KB data chunks coming from a bio will be copied to the ring
|
|
* buffer, thus the write will fail if not all incoming data can be copied.
|
|
*
|
|
*/
|
|
static void __pblk_rb_write_entry(struct pblk_rb *rb, void *data,
|
|
struct pblk_w_ctx w_ctx,
|
|
struct pblk_rb_entry *entry)
|
|
{
|
|
memcpy(entry->data, data, rb->seg_size);
|
|
|
|
entry->w_ctx.lba = w_ctx.lba;
|
|
entry->w_ctx.ppa = w_ctx.ppa;
|
|
}
|
|
|
|
void pblk_rb_write_entry_user(struct pblk_rb *rb, void *data,
|
|
struct pblk_w_ctx w_ctx, unsigned int ring_pos)
|
|
{
|
|
struct pblk *pblk = container_of(rb, struct pblk, rwb);
|
|
struct pblk_rb_entry *entry;
|
|
int flags;
|
|
|
|
entry = &rb->entries[ring_pos];
|
|
flags = READ_ONCE(entry->w_ctx.flags);
|
|
#ifdef CONFIG_NVM_DEBUG
|
|
/* Caller must guarantee that the entry is free */
|
|
BUG_ON(!(flags & PBLK_WRITABLE_ENTRY));
|
|
#endif
|
|
|
|
__pblk_rb_write_entry(rb, data, w_ctx, entry);
|
|
|
|
pblk_update_map_cache(pblk, w_ctx.lba, entry->cacheline);
|
|
flags = w_ctx.flags | PBLK_WRITTEN_DATA;
|
|
|
|
/* Release flags on write context. Protect from writes */
|
|
smp_store_release(&entry->w_ctx.flags, flags);
|
|
}
|
|
|
|
void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data,
|
|
struct pblk_w_ctx w_ctx, struct pblk_line *gc_line,
|
|
unsigned int ring_pos)
|
|
{
|
|
struct pblk *pblk = container_of(rb, struct pblk, rwb);
|
|
struct pblk_rb_entry *entry;
|
|
int flags;
|
|
|
|
entry = &rb->entries[ring_pos];
|
|
flags = READ_ONCE(entry->w_ctx.flags);
|
|
#ifdef CONFIG_NVM_DEBUG
|
|
/* Caller must guarantee that the entry is free */
|
|
BUG_ON(!(flags & PBLK_WRITABLE_ENTRY));
|
|
#endif
|
|
|
|
__pblk_rb_write_entry(rb, data, w_ctx, entry);
|
|
|
|
if (!pblk_update_map_gc(pblk, w_ctx.lba, entry->cacheline, gc_line))
|
|
entry->w_ctx.lba = ADDR_EMPTY;
|
|
|
|
flags = w_ctx.flags | PBLK_WRITTEN_DATA;
|
|
|
|
/* Release flags on write context. Protect from writes */
|
|
smp_store_release(&entry->w_ctx.flags, flags);
|
|
}
|
|
|
|
static int pblk_rb_sync_point_set(struct pblk_rb *rb, struct bio *bio,
|
|
unsigned int pos)
|
|
{
|
|
struct pblk_rb_entry *entry;
|
|
unsigned int subm, sync_point;
|
|
int flags;
|
|
|
|
subm = READ_ONCE(rb->subm);
|
|
|
|
#ifdef CONFIG_NVM_DEBUG
|
|
atomic_inc(&rb->inflight_sync_point);
|
|
#endif
|
|
|
|
if (pos == subm)
|
|
return 0;
|
|
|
|
sync_point = (pos == 0) ? (rb->nr_entries - 1) : (pos - 1);
|
|
entry = &rb->entries[sync_point];
|
|
|
|
flags = READ_ONCE(entry->w_ctx.flags);
|
|
flags |= PBLK_FLUSH_ENTRY;
|
|
|
|
/* Release flags on context. Protect from writes */
|
|
smp_store_release(&entry->w_ctx.flags, flags);
|
|
|
|
/* Protect syncs */
|
|
smp_store_release(&rb->sync_point, sync_point);
|
|
|
|
spin_lock_irq(&rb->s_lock);
|
|
bio_list_add(&entry->w_ctx.bios, bio);
|
|
spin_unlock_irq(&rb->s_lock);
|
|
|
|
return 1;
|
|
}
|
|
|
|
static int __pblk_rb_may_write(struct pblk_rb *rb, unsigned int nr_entries,
|
|
unsigned int *pos)
|
|
{
|
|
unsigned int mem;
|
|
unsigned int sync;
|
|
|
|
sync = READ_ONCE(rb->sync);
|
|
mem = READ_ONCE(rb->mem);
|
|
|
|
if (pblk_rb_ring_space(rb, mem, sync, rb->nr_entries) < nr_entries)
|
|
return 0;
|
|
|
|
if (pblk_rb_update_l2p(rb, nr_entries, mem, sync))
|
|
return 0;
|
|
|
|
*pos = mem;
|
|
|
|
return 1;
|
|
}
|
|
|
|
static int pblk_rb_may_write(struct pblk_rb *rb, unsigned int nr_entries,
|
|
unsigned int *pos)
|
|
{
|
|
if (!__pblk_rb_may_write(rb, nr_entries, pos))
|
|
return 0;
|
|
|
|
/* Protect from read count */
|
|
smp_store_release(&rb->mem, (*pos + nr_entries) & (rb->nr_entries - 1));
|
|
return 1;
|
|
}
|
|
|
|
static int pblk_rb_may_write_flush(struct pblk_rb *rb, unsigned int nr_entries,
|
|
unsigned int *pos, struct bio *bio,
|
|
int *io_ret)
|
|
{
|
|
unsigned int mem;
|
|
|
|
if (!__pblk_rb_may_write(rb, nr_entries, pos))
|
|
return 0;
|
|
|
|
mem = (*pos + nr_entries) & (rb->nr_entries - 1);
|
|
*io_ret = NVM_IO_DONE;
|
|
|
|
if (bio->bi_opf & REQ_PREFLUSH) {
|
|
struct pblk *pblk = container_of(rb, struct pblk, rwb);
|
|
|
|
#ifdef CONFIG_NVM_DEBUG
|
|
atomic_long_inc(&pblk->nr_flush);
|
|
#endif
|
|
if (pblk_rb_sync_point_set(&pblk->rwb, bio, mem))
|
|
*io_ret = NVM_IO_OK;
|
|
}
|
|
|
|
/* Protect from read count */
|
|
smp_store_release(&rb->mem, mem);
|
|
return 1;
|
|
}
|
|
|
|
/*
|
|
* Atomically check that (i) there is space on the write buffer for the
|
|
* incoming I/O, and (ii) the current I/O type has enough budget in the write
|
|
* buffer (rate-limiter).
|
|
*/
|
|
int pblk_rb_may_write_user(struct pblk_rb *rb, struct bio *bio,
|
|
unsigned int nr_entries, unsigned int *pos)
|
|
{
|
|
struct pblk *pblk = container_of(rb, struct pblk, rwb);
|
|
int flush_done;
|
|
|
|
spin_lock(&rb->w_lock);
|
|
if (!pblk_rl_user_may_insert(&pblk->rl, nr_entries)) {
|
|
spin_unlock(&rb->w_lock);
|
|
return NVM_IO_REQUEUE;
|
|
}
|
|
|
|
if (!pblk_rb_may_write_flush(rb, nr_entries, pos, bio, &flush_done)) {
|
|
spin_unlock(&rb->w_lock);
|
|
return NVM_IO_REQUEUE;
|
|
}
|
|
|
|
pblk_rl_user_in(&pblk->rl, nr_entries);
|
|
spin_unlock(&rb->w_lock);
|
|
|
|
return flush_done;
|
|
}
|
|
|
|
/*
|
|
* Look at pblk_rb_may_write_user comment
|
|
*/
|
|
int pblk_rb_may_write_gc(struct pblk_rb *rb, unsigned int nr_entries,
|
|
unsigned int *pos)
|
|
{
|
|
struct pblk *pblk = container_of(rb, struct pblk, rwb);
|
|
|
|
spin_lock(&rb->w_lock);
|
|
if (!pblk_rl_gc_may_insert(&pblk->rl, nr_entries)) {
|
|
spin_unlock(&rb->w_lock);
|
|
return 0;
|
|
}
|
|
|
|
if (!pblk_rb_may_write(rb, nr_entries, pos)) {
|
|
spin_unlock(&rb->w_lock);
|
|
return 0;
|
|
}
|
|
|
|
pblk_rl_gc_in(&pblk->rl, nr_entries);
|
|
spin_unlock(&rb->w_lock);
|
|
|
|
return 1;
|
|
}
|
|
|
|
/*
|
|
* The caller of this function must ensure that the backpointer will not
|
|
* overwrite the entries passed on the list.
|
|
*/
|
|
unsigned int pblk_rb_read_to_bio_list(struct pblk_rb *rb, struct bio *bio,
|
|
struct list_head *list,
|
|
unsigned int max)
|
|
{
|
|
struct pblk_rb_entry *entry, *tentry;
|
|
struct page *page;
|
|
unsigned int read = 0;
|
|
int ret;
|
|
|
|
list_for_each_entry_safe(entry, tentry, list, index) {
|
|
if (read > max) {
|
|
pr_err("pblk: too many entries on list\n");
|
|
goto out;
|
|
}
|
|
|
|
page = virt_to_page(entry->data);
|
|
if (!page) {
|
|
pr_err("pblk: could not allocate write bio page\n");
|
|
goto out;
|
|
}
|
|
|
|
ret = bio_add_page(bio, page, rb->seg_size, 0);
|
|
if (ret != rb->seg_size) {
|
|
pr_err("pblk: could not add page to write bio\n");
|
|
goto out;
|
|
}
|
|
|
|
list_del(&entry->index);
|
|
read++;
|
|
}
|
|
|
|
out:
|
|
return read;
|
|
}
|
|
|
|
/*
|
|
* Read available entries on rb and add them to the given bio. To avoid a memory
|
|
* copy, a page reference to the write buffer is used to be added to the bio.
|
|
*
|
|
* This function is used by the write thread to form the write bio that will
|
|
* persist data on the write buffer to the media.
|
|
*/
|
|
unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct bio *bio,
|
|
struct pblk_c_ctx *c_ctx,
|
|
unsigned int pos,
|
|
unsigned int nr_entries,
|
|
unsigned int count)
|
|
{
|
|
struct pblk *pblk = container_of(rb, struct pblk, rwb);
|
|
struct pblk_rb_entry *entry;
|
|
struct page *page;
|
|
unsigned int pad = 0, read = 0, to_read = nr_entries;
|
|
unsigned int user_io = 0, gc_io = 0;
|
|
unsigned int i;
|
|
int flags;
|
|
int ret;
|
|
|
|
if (count < nr_entries) {
|
|
pad = nr_entries - count;
|
|
to_read = count;
|
|
}
|
|
|
|
c_ctx->sentry = pos;
|
|
c_ctx->nr_valid = to_read;
|
|
c_ctx->nr_padded = pad;
|
|
|
|
for (i = 0; i < to_read; i++) {
|
|
entry = &rb->entries[pos];
|
|
|
|
/* A write has been allowed into the buffer, but data is still
|
|
* being copied to it. It is ok to busy wait.
|
|
*/
|
|
try:
|
|
flags = READ_ONCE(entry->w_ctx.flags);
|
|
if (!(flags & PBLK_WRITTEN_DATA))
|
|
goto try;
|
|
|
|
if (flags & PBLK_IOTYPE_USER)
|
|
user_io++;
|
|
else if (flags & PBLK_IOTYPE_GC)
|
|
gc_io++;
|
|
else
|
|
WARN(1, "pblk: unknown IO type\n");
|
|
|
|
page = virt_to_page(entry->data);
|
|
if (!page) {
|
|
pr_err("pblk: could not allocate write bio page\n");
|
|
flags &= ~PBLK_WRITTEN_DATA;
|
|
flags |= PBLK_SUBMITTED_ENTRY;
|
|
/* Release flags on context. Protect from writes */
|
|
smp_store_release(&entry->w_ctx.flags, flags);
|
|
goto out;
|
|
}
|
|
|
|
ret = bio_add_page(bio, page, rb->seg_size, 0);
|
|
if (ret != rb->seg_size) {
|
|
pr_err("pblk: could not add page to write bio\n");
|
|
flags &= ~PBLK_WRITTEN_DATA;
|
|
flags |= PBLK_SUBMITTED_ENTRY;
|
|
/* Release flags on context. Protect from writes */
|
|
smp_store_release(&entry->w_ctx.flags, flags);
|
|
goto out;
|
|
}
|
|
|
|
if (flags & PBLK_FLUSH_ENTRY) {
|
|
unsigned int sync_point;
|
|
|
|
sync_point = READ_ONCE(rb->sync_point);
|
|
if (sync_point == pos) {
|
|
/* Protect syncs */
|
|
smp_store_release(&rb->sync_point, EMPTY_ENTRY);
|
|
}
|
|
|
|
flags &= ~PBLK_FLUSH_ENTRY;
|
|
#ifdef CONFIG_NVM_DEBUG
|
|
atomic_dec(&rb->inflight_sync_point);
|
|
#endif
|
|
}
|
|
|
|
flags &= ~PBLK_WRITTEN_DATA;
|
|
flags |= PBLK_SUBMITTED_ENTRY;
|
|
|
|
/* Release flags on context. Protect from writes */
|
|
smp_store_release(&entry->w_ctx.flags, flags);
|
|
|
|
pos = (pos + 1) & (rb->nr_entries - 1);
|
|
}
|
|
|
|
read = to_read;
|
|
pblk_rl_out(&pblk->rl, user_io, gc_io);
|
|
#ifdef CONFIG_NVM_DEBUG
|
|
atomic_long_add(pad, &((struct pblk *)
|
|
(container_of(rb, struct pblk, rwb)))->padded_writes);
|
|
#endif
|
|
out:
|
|
return read;
|
|
}
|
|
|
|
/*
|
|
* Copy to bio only if the lba matches the one on the given cache entry.
|
|
* Otherwise, it means that the entry has been overwritten, and the bio should
|
|
* be directed to disk.
|
|
*/
|
|
int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba,
|
|
u64 pos, int bio_iter)
|
|
{
|
|
struct pblk_rb_entry *entry;
|
|
struct pblk_w_ctx *w_ctx;
|
|
void *data;
|
|
int flags;
|
|
int ret = 1;
|
|
|
|
spin_lock(&rb->w_lock);
|
|
|
|
#ifdef CONFIG_NVM_DEBUG
|
|
/* Caller must ensure that the access will not cause an overflow */
|
|
BUG_ON(pos >= rb->nr_entries);
|
|
#endif
|
|
entry = &rb->entries[pos];
|
|
w_ctx = &entry->w_ctx;
|
|
flags = READ_ONCE(w_ctx->flags);
|
|
|
|
/* Check if the entry has been overwritten or is scheduled to be */
|
|
if (w_ctx->lba != lba || flags & PBLK_WRITABLE_ENTRY) {
|
|
ret = 0;
|
|
goto out;
|
|
}
|
|
|
|
/* Only advance the bio if it hasn't been advanced already. If advanced,
|
|
* this bio is at least a partial bio (i.e., it has partially been
|
|
* filled with data from the cache). If part of the data resides on the
|
|
* media, we will read later on
|
|
*/
|
|
if (unlikely(!bio->bi_iter.bi_idx))
|
|
bio_advance(bio, bio_iter * PBLK_EXPOSED_PAGE_SIZE);
|
|
|
|
data = bio_data(bio);
|
|
memcpy(data, entry->data, rb->seg_size);
|
|
|
|
out:
|
|
spin_unlock(&rb->w_lock);
|
|
return ret;
|
|
}
|
|
|
|
struct pblk_w_ctx *pblk_rb_w_ctx(struct pblk_rb *rb, unsigned int pos)
|
|
{
|
|
unsigned int entry = pos & (rb->nr_entries - 1);
|
|
|
|
return &rb->entries[entry].w_ctx;
|
|
}
|
|
|
|
unsigned int pblk_rb_sync_init(struct pblk_rb *rb, unsigned long *flags)
|
|
__acquires(&rb->s_lock)
|
|
{
|
|
if (flags)
|
|
spin_lock_irqsave(&rb->s_lock, *flags);
|
|
else
|
|
spin_lock_irq(&rb->s_lock);
|
|
|
|
return rb->sync;
|
|
}
|
|
|
|
void pblk_rb_sync_end(struct pblk_rb *rb, unsigned long *flags)
|
|
__releases(&rb->s_lock)
|
|
{
|
|
lockdep_assert_held(&rb->s_lock);
|
|
|
|
if (flags)
|
|
spin_unlock_irqrestore(&rb->s_lock, *flags);
|
|
else
|
|
spin_unlock_irq(&rb->s_lock);
|
|
}
|
|
|
|
unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries)
|
|
{
|
|
unsigned int sync;
|
|
unsigned int i;
|
|
|
|
lockdep_assert_held(&rb->s_lock);
|
|
|
|
sync = READ_ONCE(rb->sync);
|
|
|
|
for (i = 0; i < nr_entries; i++)
|
|
sync = (sync + 1) & (rb->nr_entries - 1);
|
|
|
|
/* Protect from counts */
|
|
smp_store_release(&rb->sync, sync);
|
|
|
|
return sync;
|
|
}
|
|
|
|
unsigned int pblk_rb_sync_point_count(struct pblk_rb *rb)
|
|
{
|
|
unsigned int subm, sync_point;
|
|
unsigned int count;
|
|
|
|
/* Protect syncs */
|
|
sync_point = smp_load_acquire(&rb->sync_point);
|
|
if (sync_point == EMPTY_ENTRY)
|
|
return 0;
|
|
|
|
subm = READ_ONCE(rb->subm);
|
|
|
|
/* The sync point itself counts as a sector to sync */
|
|
count = pblk_rb_ring_count(sync_point, subm, rb->nr_entries) + 1;
|
|
|
|
return count;
|
|
}
|
|
|
|
/*
|
|
* Scan from the current position of the sync pointer to find the entry that
|
|
* corresponds to the given ppa. This is necessary since write requests can be
|
|
* completed out of order. The assumption is that the ppa is close to the sync
|
|
* pointer thus the search will not take long.
|
|
*
|
|
* The caller of this function must guarantee that the sync pointer will no
|
|
* reach the entry while it is using the metadata associated with it. With this
|
|
* assumption in mind, there is no need to take the sync lock.
|
|
*/
|
|
struct pblk_rb_entry *pblk_rb_sync_scan_entry(struct pblk_rb *rb,
|
|
struct ppa_addr *ppa)
|
|
{
|
|
unsigned int sync, subm, count;
|
|
unsigned int i;
|
|
|
|
sync = READ_ONCE(rb->sync);
|
|
subm = READ_ONCE(rb->subm);
|
|
count = pblk_rb_ring_count(subm, sync, rb->nr_entries);
|
|
|
|
for (i = 0; i < count; i++)
|
|
sync = (sync + 1) & (rb->nr_entries - 1);
|
|
|
|
return NULL;
|
|
}
|
|
|
|
int pblk_rb_tear_down_check(struct pblk_rb *rb)
|
|
{
|
|
struct pblk_rb_entry *entry;
|
|
int i;
|
|
int ret = 0;
|
|
|
|
spin_lock(&rb->w_lock);
|
|
spin_lock_irq(&rb->s_lock);
|
|
|
|
if ((rb->mem == rb->subm) && (rb->subm == rb->sync) &&
|
|
(rb->sync == rb->l2p_update) &&
|
|
(rb->sync_point == EMPTY_ENTRY)) {
|
|
goto out;
|
|
}
|
|
|
|
if (!rb->entries) {
|
|
ret = 1;
|
|
goto out;
|
|
}
|
|
|
|
for (i = 0; i < rb->nr_entries; i++) {
|
|
entry = &rb->entries[i];
|
|
|
|
if (!entry->data) {
|
|
ret = 1;
|
|
goto out;
|
|
}
|
|
}
|
|
|
|
out:
|
|
spin_unlock(&rb->w_lock);
|
|
spin_unlock_irq(&rb->s_lock);
|
|
|
|
return ret;
|
|
}
|
|
|
|
unsigned int pblk_rb_wrap_pos(struct pblk_rb *rb, unsigned int pos)
|
|
{
|
|
return (pos & (rb->nr_entries - 1));
|
|
}
|
|
|
|
/* Returns 1 if @pos lies at or beyond rb->nr_entries, 0 otherwise */
int pblk_rb_pos_oob(struct pblk_rb *rb, u64 pos)
{
	if (pos >= rb->nr_entries)
		return 1;

	return 0;
}
|
|
|
|
ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf)
|
|
{
|
|
struct pblk *pblk = container_of(rb, struct pblk, rwb);
|
|
struct pblk_c_ctx *c;
|
|
ssize_t offset;
|
|
int queued_entries = 0;
|
|
|
|
spin_lock_irq(&rb->s_lock);
|
|
list_for_each_entry(c, &pblk->compl_list, list)
|
|
queued_entries++;
|
|
spin_unlock_irq(&rb->s_lock);
|
|
|
|
if (rb->sync_point != EMPTY_ENTRY)
|
|
offset = scnprintf(buf, PAGE_SIZE,
|
|
"%u\t%u\t%u\t%u\t%u\t%u\t%u - %u/%u/%u - %d\n",
|
|
rb->nr_entries,
|
|
rb->mem,
|
|
rb->subm,
|
|
rb->sync,
|
|
rb->l2p_update,
|
|
#ifdef CONFIG_NVM_DEBUG
|
|
atomic_read(&rb->inflight_sync_point),
|
|
#else
|
|
0,
|
|
#endif
|
|
rb->sync_point,
|
|
pblk_rb_read_count(rb),
|
|
pblk_rb_space(rb),
|
|
pblk_rb_sync_point_count(rb),
|
|
queued_entries);
|
|
else
|
|
offset = scnprintf(buf, PAGE_SIZE,
|
|
"%u\t%u\t%u\t%u\t%u\t%u\tNULL - %u/%u/%u - %d\n",
|
|
rb->nr_entries,
|
|
rb->mem,
|
|
rb->subm,
|
|
rb->sync,
|
|
rb->l2p_update,
|
|
#ifdef CONFIG_NVM_DEBUG
|
|
atomic_read(&rb->inflight_sync_point),
|
|
#else
|
|
0,
|
|
#endif
|
|
pblk_rb_read_count(rb),
|
|
pblk_rb_space(rb),
|
|
pblk_rb_sync_point_count(rb),
|
|
queued_entries);
|
|
|
|
return offset;
|
|
}
|