/*
 * Main bcache entry point - handle a read or a write request and decide what to
 * do with it; the make_request functions are called by the block layer.
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "request.h"
#include "writeback.h"

#include <linux/module.h>
#include <linux/hash.h>
#include <linux/random.h>
#include <linux/backing-dev.h>

#include <trace/events/bcache.h>

#define CUTOFF_CACHE_ADD	95
#define CUTOFF_CACHE_READA	90

struct kmem_cache *bch_search_cache;

static void bch_data_insert_start(struct closure *);

static unsigned cache_mode(struct cached_dev *dc)
{
	return BDEV_CACHE_MODE(&dc->sb);
}

static bool verify(struct cached_dev *dc)
{
	return dc->verify;
}

static void bio_csum(struct bio *bio, struct bkey *k)
{
	struct bio_vec bv;
	struct bvec_iter iter;
	uint64_t csum = 0;

	bio_for_each_segment(bv, bio, iter) {
		void *d = kmap(bv.bv_page) + bv.bv_offset;
		csum = bch_crc64_update(csum, d, bv.bv_len);
		kunmap(bv.bv_page);
	}

	k->ptr[KEY_PTRS(k)] = csum & (~0ULL >> 1);
}

/* Insert data into cache */

static void bch_data_insert_keys(struct closure *cl)
{
	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
	atomic_t *journal_ref = NULL;
	struct bkey *replace_key = op->replace ? &op->replace_key : NULL;
	int ret;

	/*
	 * If we're looping, might already be waiting on
	 * another journal write - can't wait on more than one journal write at
	 * a time
	 *
	 * XXX: this looks wrong
	 */
#if 0
	while (atomic_read(&s->cl.remaining) & CLOSURE_WAITING)
		closure_sync(&s->cl);
#endif

	if (!op->replace)
		journal_ref = bch_journal(op->c, &op->insert_keys,
					  op->flush_journal ? cl : NULL);

	ret = bch_btree_insert(op->c, &op->insert_keys,
			       journal_ref, replace_key);
	if (ret == -ESRCH) {
		op->replace_collision = true;
	} else if (ret) {
		op->status = BLK_STS_RESOURCE;
		op->insert_data_done = true;
	}

	if (journal_ref)
		atomic_dec_bug(journal_ref);

	if (!op->insert_data_done) {
		continue_at(cl, bch_data_insert_start, op->wq);
		return;
	}

	bch_keylist_free(&op->insert_keys);
	closure_return(cl);
}

static int bch_keylist_realloc(struct keylist *l, unsigned u64s,
			       struct cache_set *c)
{
	size_t oldsize = bch_keylist_nkeys(l);
	size_t newsize = oldsize + u64s;

	/*
	 * The journalling code doesn't handle the case where the keys to insert
	 * is bigger than an empty write: If we just return -ENOMEM here,
	 * bch_data_insert_start() and bch_data_invalidate() will insert the
	 * keys created so far and finish the rest when the keylist is empty.
	 */
	if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset))
		return -ENOMEM;

	return __bch_keylist_realloc(l, u64s);
}
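
/*
 * Note (editorial): a rough worked example of the limit above, assuming a
 * 4 KiB btree block: the keylist may keep growing only while
 * newsize * sizeof(uint64_t) still fits in 4096 - sizeof(struct jset)
 * bytes, i.e. while it could still be carried by a single otherwise-empty
 * journal write.  Callers such as bch_data_invalidate() and
 * bch_data_insert_start() treat the -ENOMEM return as "flush the keys we
 * already have via bch_data_insert_keys() and come back for the rest".
 */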

static void bch_data_invalidate(struct closure *cl)
{
	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
	struct bio *bio = op->bio;

	pr_debug("invalidating %i sectors from %llu",
		 bio_sectors(bio), (uint64_t) bio->bi_iter.bi_sector);

	while (bio_sectors(bio)) {
		unsigned sectors = min(bio_sectors(bio),
				       1U << (KEY_SIZE_BITS - 1));

		if (bch_keylist_realloc(&op->insert_keys, 2, op->c))
			goto out;

		bio->bi_iter.bi_sector += sectors;
		bio->bi_iter.bi_size -= sectors << 9;

		bch_keylist_add(&op->insert_keys,
				&KEY(op->inode, bio->bi_iter.bi_sector, sectors));
	}

	op->insert_data_done = true;
	bio_put(bio);
out:
	continue_at(cl, bch_data_insert_keys, op->wq);
}

static void bch_data_insert_error(struct closure *cl)
{
	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);

	/*
	 * Our data write just errored, which means we've got a bunch of keys to
	 * insert that point to data that wasn't successfully written.
	 *
	 * We don't have to insert those keys but we still have to invalidate
	 * that region of the cache - so, if we just strip off all the pointers
	 * from the keys we'll accomplish just that.
	 */

	struct bkey *src = op->insert_keys.keys, *dst = op->insert_keys.keys;

	while (src != op->insert_keys.top) {
		struct bkey *n = bkey_next(src);

		SET_KEY_PTRS(src, 0);
		memmove(dst, src, bkey_bytes(src));

		dst = bkey_next(dst);
		src = n;
	}

	op->insert_keys.top = dst;

	bch_data_insert_keys(cl);
}

static void bch_data_insert_endio(struct bio *bio)
{
	struct closure *cl = bio->bi_private;
	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);

	if (bio->bi_status) {
		/* TODO: We could try to recover from this. */
		if (op->writeback)
			op->status = bio->bi_status;
		else if (!op->replace)
			set_closure_fn(cl, bch_data_insert_error, op->wq);
		else
			set_closure_fn(cl, NULL, NULL);
	}

	bch_bbio_endio(op->c, bio, bio->bi_status, "writing data to cache");
}

static void bch_data_insert_start(struct closure *cl)
{
	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
	struct bio *bio = op->bio, *n;

	if (op->bypass)
		return bch_data_invalidate(cl);

	if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0)
		wake_up_gc(op->c);

	/*
	 * Journal writes are marked REQ_PREFLUSH; if the original write was a
	 * flush, it'll wait on the journal write.
	 */
	bio->bi_opf &= ~(REQ_PREFLUSH|REQ_FUA);

	do {
		unsigned i;
		struct bkey *k;
		struct bio_set *split = op->c->bio_split;

		/* 1 for the device pointer and 1 for the chksum */
		if (bch_keylist_realloc(&op->insert_keys,
					3 + (op->csum ? 1 : 0),
					op->c)) {
			continue_at(cl, bch_data_insert_keys, op->wq);
			return;
		}

		k = op->insert_keys.top;
		bkey_init(k);
		SET_KEY_INODE(k, op->inode);
		SET_KEY_OFFSET(k, bio->bi_iter.bi_sector);

		if (!bch_alloc_sectors(op->c, k, bio_sectors(bio),
				       op->write_point, op->write_prio,
				       op->writeback))
			goto err;

		n = bio_next_split(bio, KEY_SIZE(k), GFP_NOIO, split);

		n->bi_end_io = bch_data_insert_endio;
		n->bi_private = cl;

		if (op->writeback) {
			SET_KEY_DIRTY(k, true);

			for (i = 0; i < KEY_PTRS(k); i++)
				SET_GC_MARK(PTR_BUCKET(op->c, k, i),
					    GC_MARK_DIRTY);
		}

		SET_KEY_CSUM(k, op->csum);
		if (KEY_CSUM(k))
			bio_csum(n, k);

		trace_bcache_cache_insert(k);
		bch_keylist_push(&op->insert_keys);

		bio_set_op_attrs(n, REQ_OP_WRITE, 0);
		bch_submit_bbio(n, op->c, k, 0);
	} while (n != bio);

	op->insert_data_done = true;
	continue_at(cl, bch_data_insert_keys, op->wq);
	return;
err:
	/* bch_alloc_sectors() blocks if s->writeback = true */
	BUG_ON(op->writeback);

	/*
	 * But if it's not a writeback write we'd rather just bail out if
	 * there aren't any buckets ready to write to - it might take awhile and
	 * we might be starving btree writes for gc or something.
	 */

	if (!op->replace) {
		/*
		 * Writethrough write: We can't complete the write until we've
		 * updated the index. But we don't want to delay the write while
		 * we wait for buckets to be freed up, so just invalidate the
		 * rest of the write.
		 */
		op->bypass = true;
		return bch_data_invalidate(cl);
	} else {
		/*
		 * From a cache miss, we can just insert the keys for the data
		 * we have written or bail out if we didn't do anything.
		 */
		op->insert_data_done = true;
		bio_put(bio);

		if (!bch_keylist_empty(&op->insert_keys))
			continue_at(cl, bch_data_insert_keys, op->wq);
		else
			closure_return(cl);
	}
}

/**
 * bch_data_insert - stick some data in the cache
 *
 * This is the starting point for any data to end up in a cache device; it could
 * be from a normal write, or a writeback write, or a write to a flash only
 * volume - it's also used by the moving garbage collector to compact data in
 * mostly empty buckets.
 *
 * It first writes the data to the cache, creating a list of keys to be inserted
 * (if the data had to be fragmented there will be multiple keys); after the
 * data is written it calls bch_journal, and after the keys have been added to
 * the next journal write they're inserted into the btree.
 *
 * It inserts the data in op->bio; bi_sector is used for the key offset,
 * and op->inode is used for the key inode.
 *
 * If op->bypass is true, instead of inserting the data it invalidates the
 * region of the cache represented by op->bio and op->inode.
 */
void bch_data_insert(struct closure *cl)
{
	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);

	trace_bcache_write(op->c, op->inode, op->bio,
			   op->writeback, op->bypass);

	bch_keylist_init(&op->insert_keys);
	bio_get(op->bio);
	bch_data_insert_start(cl);
}
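
/*
 * Illustrative sketch (editorial, not compiled): roughly how the callers
 * later in this file drive bch_data_insert().  A struct data_insert_op is
 * embedded in a larger object (struct search), its fields are filled in,
 * and the insert is kicked off as a closure.  example_insert() is a
 * hypothetical name used only for illustration; see cached_dev_write()
 * and flash_dev_make_request() below for the real callers.
 */
#if 0
static void example_insert(struct search *s, struct bio *bio)
{
	s->iop.bio	 = bio;		/* data destined for the cache */
	s->iop.writeback = true;	/* insert dirty keys (writeback style) */

	/* runs bch_data_insert() and parents it on the search closure */
	closure_call(&s->iop.cl, bch_data_insert, NULL, &s->cl);
}
#endif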

/* Congested? */

unsigned bch_get_congested(struct cache_set *c)
{
	int i;
	long rand;

	if (!c->congested_read_threshold_us &&
	    !c->congested_write_threshold_us)
		return 0;

	i = (local_clock_us() - c->congested_last_us) / 1024;
	if (i < 0)
		return 0;

	i += atomic_read(&c->congested);
	if (i >= 0)
		return 0;

	i += CONGESTED_MAX;

	if (i > 0)
		i = fract_exp_two(i, 6);

	rand = get_random_int();
	i -= bitmap_weight(&rand, BITS_PER_LONG);

	return i > 0 ? i : 1;
}
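
/*
 * Note (editorial): the value returned above is consumed in
 * check_should_bypass(), where it acts as a cutoff in sectors - once a
 * task's recent sequential I/O reaches it, requests start bypassing the
 * cache.  A rough reading of the math: c->congested is driven negative by
 * the completion path when cache-device latency exceeds the configured
 * read/write thresholds, and the more negative it is (relative to the
 * time since it was last bumped), the smaller the returned cutoff, so
 * more I/O goes straight to the backing device.  Subtracting the popcount
 * of a random word just adds a little jitter so many tasks don't cross
 * the threshold at the same instant.
 */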

static void add_sequential(struct task_struct *t)
{
	ewma_add(t->sequential_io_avg,
		 t->sequential_io, 8, 0);

	t->sequential_io = 0;
}

static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k)
{
	return &dc->io_hash[hash_64(k, RECENT_IO_BITS)];
}
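
/*
 * Note (editorial): assuming bcache's ewma_add() keeps a standard
 * exponentially weighted moving average with weight 8 and no fixed-point
 * shift, the update in add_sequential() is roughly
 *
 *	sequential_io_avg = (7 * sequential_io_avg + sequential_io) / 8
 *
 * i.e. each finished sequential streak decays into the task's long-term
 * average with weight 1/8, and the per-stream counter is then reset.
 */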

static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
{
	struct cache_set *c = dc->disk.c;
	unsigned mode = cache_mode(dc);
	unsigned sectors, congested = bch_get_congested(c);
	struct task_struct *task = current;
	struct io *i;

	if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
	    c->gc_stats.in_use > CUTOFF_CACHE_ADD ||
	    (bio_op(bio) == REQ_OP_DISCARD))
		goto skip;

	if (mode == CACHE_MODE_NONE ||
	    (mode == CACHE_MODE_WRITEAROUND &&
	     op_is_write(bio_op(bio))))
		goto skip;

	/*
	 * Flag for bypass if the IO is for read-ahead or background,
	 * unless the read-ahead request is for metadata (eg, for gfs2).
	 */
	if (bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND) &&
	    !(bio->bi_opf & REQ_META))
		goto skip;

	if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) ||
	    bio_sectors(bio) & (c->sb.block_size - 1)) {
		pr_debug("skipping unaligned io");
		goto skip;
	}

	if (bypass_torture_test(dc)) {
		if ((get_random_int() & 3) == 3)
			goto skip;
		else
			goto rescale;
	}

	if (!congested && !dc->sequential_cutoff)
		goto rescale;

	spin_lock(&dc->io_lock);

	hlist_for_each_entry(i, iohash(dc, bio->bi_iter.bi_sector), hash)
		if (i->last == bio->bi_iter.bi_sector &&
		    time_before(jiffies, i->jiffies))
			goto found;

	i = list_first_entry(&dc->io_lru, struct io, lru);

	add_sequential(task);
	i->sequential = 0;
found:
	if (i->sequential + bio->bi_iter.bi_size > i->sequential)
		i->sequential += bio->bi_iter.bi_size;

	i->last = bio_end_sector(bio);
	i->jiffies = jiffies + msecs_to_jiffies(5000);
	task->sequential_io = i->sequential;

	hlist_del(&i->hash);
	hlist_add_head(&i->hash, iohash(dc, i->last));
	list_move_tail(&i->lru, &dc->io_lru);

	spin_unlock(&dc->io_lock);

	sectors = max(task->sequential_io,
		      task->sequential_io_avg) >> 9;

	if (dc->sequential_cutoff &&
	    sectors >= dc->sequential_cutoff >> 9) {
		trace_bcache_bypass_sequential(bio);
		goto skip;
	}

	if (congested && sectors >= congested) {
		trace_bcache_bypass_congested(bio);
		goto skip;
	}

rescale:
	bch_rescale_priorities(c, bio_sectors(bio));
	return false;
skip:
	bch_mark_sectors_bypassed(c, dc, bio_sectors(bio));
	return true;
}
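
/*
 * Note (editorial): summarizing the checks above, a request bypasses the
 * cache when any of the following hold:
 *
 *  - the backing device is detaching, the cache is more than
 *    CUTOFF_CACHE_ADD percent full, or the request is a discard;
 *  - the cache mode is none, or writearound and this is a write;
 *  - the I/O is flagged read-ahead/background and is not metadata;
 *  - the I/O is not aligned to the cache set's block size;
 *  - the task's recent sequential I/O reaches sequential_cutoff, or
 *    reaches the congestion cutoff from bch_get_congested().
 *
 * Everything else is considered cacheable and rescales bucket priorities.
 */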

/* Cache lookup */

struct search {
	/* Stack frame for bio_complete */
	struct closure		cl;

	struct bbio		bio;
	struct bio		*orig_bio;
	struct bio		*cache_miss;
	struct bcache_device	*d;

	unsigned		insert_bio_sectors;
	unsigned		recoverable:1;
	unsigned		write:1;
	unsigned		read_dirty_data:1;

	unsigned long		start_time;

	struct btree_op		op;
	struct data_insert_op	iop;
};

static void bch_cache_read_endio(struct bio *bio)
{
	struct bbio *b = container_of(bio, struct bbio, bio);
	struct closure *cl = bio->bi_private;
	struct search *s = container_of(cl, struct search, cl);

	/*
	 * If the bucket was reused while our bio was in flight, we might have
	 * read the wrong data. Set s->iop.status but don't report the error
	 * against the cache device, so it isn't counted there; we'll still
	 * reread the data from the backing device.
	 */

	if (bio->bi_status)
		s->iop.status = bio->bi_status;
	else if (!KEY_DIRTY(&b->key) &&
		 ptr_stale(s->iop.c, &b->key, 0)) {
		atomic_long_inc(&s->iop.c->cache_read_races);
		s->iop.status = BLK_STS_IOERR;
	}

	bch_bbio_endio(s->iop.c, bio, bio->bi_status, "reading from cache");
}

/*
 * Read from a single key, handling the initial cache miss if the key starts in
 * the middle of the bio
 */
static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k)
{
	struct search *s = container_of(op, struct search, op);
	struct bio *n, *bio = &s->bio.bio;
	struct bkey *bio_key;
	unsigned ptr;

	if (bkey_cmp(k, &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0)) <= 0)
		return MAP_CONTINUE;

	if (KEY_INODE(k) != s->iop.inode ||
	    KEY_START(k) > bio->bi_iter.bi_sector) {
		unsigned bio_sectors = bio_sectors(bio);
		unsigned sectors = KEY_INODE(k) == s->iop.inode
			? min_t(uint64_t, INT_MAX,
				KEY_START(k) - bio->bi_iter.bi_sector)
			: INT_MAX;

		int ret = s->d->cache_miss(b, s, bio, sectors);
		if (ret != MAP_CONTINUE)
			return ret;

		/* if this was a complete miss we shouldn't get here */
		BUG_ON(bio_sectors <= sectors);
	}

	if (!KEY_SIZE(k))
		return MAP_CONTINUE;

	/* XXX: figure out best pointer - for multiple cache devices */
	ptr = 0;

	PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO;

	if (KEY_DIRTY(k))
		s->read_dirty_data = true;

	n = bio_next_split(bio, min_t(uint64_t, INT_MAX,
				      KEY_OFFSET(k) - bio->bi_iter.bi_sector),
			   GFP_NOIO, s->d->bio_split);

	bio_key = &container_of(n, struct bbio, bio)->key;
	bch_bkey_copy_single_ptr(bio_key, k, ptr);

	bch_cut_front(&KEY(s->iop.inode, n->bi_iter.bi_sector, 0), bio_key);
	bch_cut_back(&KEY(s->iop.inode, bio_end_sector(n), 0), bio_key);

	n->bi_end_io = bch_cache_read_endio;
	n->bi_private = &s->cl;

	/*
	 * The bucket we're reading from might be reused while our bio
	 * is in flight, and we could then end up reading the wrong
	 * data.
	 *
	 * We guard against this by checking (in cache_read_endio()) if
	 * the pointer is stale again; if so, we treat it as an error
	 * and reread from the backing device (but we don't pass that
	 * error up anywhere).
	 */

	__bch_submit_bbio(n, b->c);
	return n == bio ? MAP_DONE : MAP_CONTINUE;
}

static void cache_lookup(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, iop.cl);
	struct bio *bio = &s->bio.bio;
	int ret;

	bch_btree_op_init(&s->op, -1);

	ret = bch_btree_map_keys(&s->op, s->iop.c,
				 &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0),
				 cache_lookup_fn, MAP_END_KEY);
	if (ret == -EAGAIN) {
		continue_at(cl, cache_lookup, bcache_wq);
		return;
	}

	closure_return(cl);
}
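
/*
 * Note (editorial): cache_lookup_fn() communicates with
 * bch_btree_map_keys() through its return value - MAP_CONTINUE asks for
 * the next key, MAP_DONE stops the walk once the whole bio has been
 * submitted, and a negative error (for example the -EINTR returned by
 * cached_dev_cache_miss() when the btree iterator can no longer be
 * trusted) aborts it.  cache_lookup() additionally retries itself from
 * the bcache workqueue when the map call returns -EAGAIN.
 */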

/* Common code for the make_request functions */

static void request_endio(struct bio *bio)
{
	struct closure *cl = bio->bi_private;

	if (bio->bi_status) {
		struct search *s = container_of(cl, struct search, cl);
		s->iop.status = bio->bi_status;
		/* Only cache read errors are recoverable */
		s->recoverable = false;
	}

	bio_put(bio);
	closure_put(cl);
}

static void bio_complete(struct search *s)
{
	if (s->orig_bio) {
		struct request_queue *q = s->orig_bio->bi_disk->queue;
		generic_end_io_acct(q, bio_data_dir(s->orig_bio),
				    &s->d->disk->part0, s->start_time);

		trace_bcache_request_end(s->d, s->orig_bio);
		s->orig_bio->bi_status = s->iop.status;
		bio_endio(s->orig_bio);
		s->orig_bio = NULL;
	}
}

static void do_bio_hook(struct search *s, struct bio *orig_bio)
{
	struct bio *bio = &s->bio.bio;

	bio_init(bio, NULL, 0);
	__bio_clone_fast(bio, orig_bio);
	bio->bi_end_io = request_endio;
	bio->bi_private = &s->cl;

	bio_cnt_set(bio, 3);
}

static void search_free(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, cl);
	bio_complete(s);

	if (s->iop.bio)
		bio_put(s->iop.bio);

	closure_debug_destroy(cl);
	mempool_free(s, s->d->c->search);
}

static inline struct search *search_alloc(struct bio *bio,
					  struct bcache_device *d)
{
	struct search *s;

	s = mempool_alloc(d->c->search, GFP_NOIO);

	closure_init(&s->cl, NULL);
	do_bio_hook(s, bio);

	s->orig_bio		= bio;
	s->cache_miss		= NULL;
	s->d			= d;
	s->recoverable		= 1;
	s->write		= op_is_write(bio_op(bio));
	s->read_dirty_data	= 0;
	s->start_time		= jiffies;

	s->iop.c		= d->c;
	s->iop.bio		= NULL;
	s->iop.inode		= d->id;
	s->iop.write_point	= hash_long((unsigned long) current, 16);
	s->iop.write_prio	= 0;
	s->iop.status		= 0;
	s->iop.flags		= 0;
	s->iop.flush_journal	= op_is_flush(bio->bi_opf);
	s->iop.wq		= bcache_wq;

	return s;
}

/* Cached devices */

static void cached_dev_bio_complete(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, cl);
	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);

	search_free(cl);
	cached_dev_put(dc);
}

/* Process reads */

static void cached_dev_cache_miss_done(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, cl);

	if (s->iop.replace_collision)
		bch_mark_cache_miss_collision(s->iop.c, s->d);

	if (s->iop.bio)
		bio_free_pages(s->iop.bio);

	cached_dev_bio_complete(cl);
}

static void cached_dev_read_error(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, cl);
	struct bio *bio = &s->bio.bio;
	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);

	/*
	 * If the cache device is dirty (dc->has_dirty is non-zero), then
	 * recovering a failed read by re-reading from the backing (cached)
	 * device may return stale data. So read failure recovery is only
	 * permitted when the cache device is clean.
	 */
	if (s->recoverable &&
	    (dc && !atomic_read(&dc->has_dirty))) {
		/* Retry from the backing device: */
		trace_bcache_read_retry(s->orig_bio);

		s->iop.status = 0;
		do_bio_hook(s, s->orig_bio);

		/* XXX: invalidate cache */

		closure_bio_submit(bio, cl);
	}

	continue_at(cl, cached_dev_cache_miss_done, NULL);
}

static void cached_dev_read_done(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, cl);
	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);

	/*
	 * We had a cache miss; cache_bio now contains data ready to be inserted
	 * into the cache.
	 *
	 * First, we copy the data we just read from cache_bio's bounce buffers
	 * to the buffers the original bio pointed to:
	 */

	if (s->iop.bio) {
		bio_reset(s->iop.bio);
		s->iop.bio->bi_iter.bi_sector = s->cache_miss->bi_iter.bi_sector;
		bio_copy_dev(s->iop.bio, s->cache_miss);
		s->iop.bio->bi_iter.bi_size = s->insert_bio_sectors << 9;
		bch_bio_map(s->iop.bio, NULL);

		bio_copy_data(s->cache_miss, s->iop.bio);

		bio_put(s->cache_miss);
		s->cache_miss = NULL;
	}

	if (verify(dc) && s->recoverable && !s->read_dirty_data)
		bch_data_verify(dc, s->orig_bio);

	bio_complete(s);

	if (s->iop.bio &&
	    !test_bit(CACHE_SET_STOPPING, &s->iop.c->flags)) {
		BUG_ON(!s->iop.replace);
		closure_call(&s->iop.cl, bch_data_insert, NULL, cl);
	}

	continue_at(cl, cached_dev_cache_miss_done, NULL);
}

static void cached_dev_read_done_bh(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, cl);
	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);

	bch_mark_cache_accounting(s->iop.c, s->d,
				  !s->cache_miss, s->iop.bypass);
	trace_bcache_read(s->orig_bio, !s->cache_miss, s->iop.bypass);

	if (s->iop.status)
		continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq);
	else if (s->iop.bio || verify(dc))
		continue_at_nobarrier(cl, cached_dev_read_done, bcache_wq);
	else
		continue_at_nobarrier(cl, cached_dev_bio_complete, NULL);
}

static int cached_dev_cache_miss(struct btree *b, struct search *s,
				 struct bio *bio, unsigned sectors)
{
	int ret = MAP_CONTINUE;
	unsigned reada = 0;
	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
	struct bio *miss, *cache_bio;

	if (s->cache_miss || s->iop.bypass) {
		miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split);
		ret = miss == bio ? MAP_DONE : MAP_CONTINUE;
		goto out_submit;
	}

	if (!(bio->bi_opf & REQ_RAHEAD) &&
	    !(bio->bi_opf & REQ_META) &&
	    s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA)
		reada = min_t(sector_t, dc->readahead >> 9,
			      get_capacity(bio->bi_disk) - bio_end_sector(bio));

	s->insert_bio_sectors = min(sectors, bio_sectors(bio) + reada);

	s->iop.replace_key = KEY(s->iop.inode,
				 bio->bi_iter.bi_sector + s->insert_bio_sectors,
				 s->insert_bio_sectors);

	ret = bch_btree_insert_check_key(b, &s->op, &s->iop.replace_key);
	if (ret)
		return ret;

	s->iop.replace = true;

	miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split);

	/* btree_search_recurse()'s btree iterator is no good anymore */
	ret = miss == bio ? MAP_DONE : -EINTR;

	cache_bio = bio_alloc_bioset(GFP_NOWAIT,
			DIV_ROUND_UP(s->insert_bio_sectors, PAGE_SECTORS),
			dc->disk.bio_split);
	if (!cache_bio)
		goto out_submit;

	cache_bio->bi_iter.bi_sector = miss->bi_iter.bi_sector;
	bio_copy_dev(cache_bio, miss);
	cache_bio->bi_iter.bi_size = s->insert_bio_sectors << 9;

	cache_bio->bi_end_io = request_endio;
	cache_bio->bi_private = &s->cl;

	bch_bio_map(cache_bio, NULL);
	if (bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO))
		goto out_put;

	if (reada)
		bch_mark_cache_readahead(s->iop.c, s->d);

	s->cache_miss = miss;
	s->iop.bio = cache_bio;
	bio_get(cache_bio);
	closure_bio_submit(cache_bio, &s->cl);

	return ret;
out_put:
	bio_put(cache_bio);
out_submit:
	miss->bi_end_io = request_endio;
	miss->bi_private = &s->cl;
	closure_bio_submit(miss, &s->cl);
	return ret;
}

static void cached_dev_read(struct cached_dev *dc, struct search *s)
{
	struct closure *cl = &s->cl;

	closure_call(&s->iop.cl, cache_lookup, NULL, cl);
	continue_at(cl, cached_dev_read_done_bh, NULL);
}

/* Process writes */

static void cached_dev_write_complete(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, cl);
	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);

	up_read_non_owner(&dc->writeback_lock);
	cached_dev_bio_complete(cl);
}

static void cached_dev_write(struct cached_dev *dc, struct search *s)
{
	struct closure *cl = &s->cl;
	struct bio *bio = &s->bio.bio;
	struct bkey start = KEY(dc->disk.id, bio->bi_iter.bi_sector, 0);
	struct bkey end = KEY(dc->disk.id, bio_end_sector(bio), 0);

	bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, &start, &end);

	down_read_non_owner(&dc->writeback_lock);
	if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) {
		/*
		 * We overlap with some dirty data undergoing background
		 * writeback, force this write to writeback
		 */
		s->iop.bypass = false;
		s->iop.writeback = true;
	}

	/*
	 * Discards aren't _required_ to do anything, so skipping if
	 * check_overlapping returned true is ok
	 *
	 * But check_overlapping drops dirty keys for which io hasn't started,
	 * so we still want to call it.
	 */
	if (bio_op(bio) == REQ_OP_DISCARD)
		s->iop.bypass = true;

	if (should_writeback(dc, s->orig_bio,
			     cache_mode(dc),
			     s->iop.bypass)) {
		s->iop.bypass = false;
		s->iop.writeback = true;
	}

	if (s->iop.bypass) {
		s->iop.bio = s->orig_bio;
		bio_get(s->iop.bio);

		if ((bio_op(bio) != REQ_OP_DISCARD) ||
		    blk_queue_discard(bdev_get_queue(dc->bdev)))
			closure_bio_submit(bio, cl);
	} else if (s->iop.writeback) {
		bch_writeback_add(dc);
		s->iop.bio = bio;

		if (bio->bi_opf & REQ_PREFLUSH) {
			/* Also need to send a flush to the backing device */
			struct bio *flush = bio_alloc_bioset(GFP_NOIO, 0,
							     dc->disk.bio_split);

			bio_copy_dev(flush, bio);
			flush->bi_end_io = request_endio;
			flush->bi_private = cl;
			flush->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;

			closure_bio_submit(flush, cl);
		}
	} else {
		s->iop.bio = bio_clone_fast(bio, GFP_NOIO, dc->disk.bio_split);

		closure_bio_submit(bio, cl);
	}

	closure_call(&s->iop.cl, bch_data_insert, NULL, cl);
	continue_at(cl, cached_dev_write_complete, NULL);
}
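
/*
 * Note (editorial): cached_dev_write() above ends up in one of three
 * shapes.  Bypass: the bio goes straight to the backing device and the
 * insert only invalidates the corresponding region of the cache.
 * Writeback: the data goes to the cache only, with dirty keys, and
 * reaches the backing device later via the writeback machinery (plus an
 * explicit flush to the backing device if the write was a preflush).
 * Writethrough (the final else branch): the bio is cloned, the original
 * is sent to the backing device and the clone is inserted into the
 * cache, so both copies stay in sync.
 */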

static void cached_dev_nodata(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, cl);
	struct bio *bio = &s->bio.bio;

	if (s->iop.flush_journal)
		bch_journal_meta(s->iop.c, cl);

	/* If it's a flush, we send the flush to the backing device too */
	closure_bio_submit(bio, cl);

	continue_at(cl, cached_dev_bio_complete, NULL);
}

/* Cached devices - read & write stuff */

static blk_qc_t cached_dev_make_request(struct request_queue *q,
					struct bio *bio)
{
	struct search *s;
	struct bcache_device *d = bio->bi_disk->private_data;
	struct cached_dev *dc = container_of(d, struct cached_dev, disk);
	int rw = bio_data_dir(bio);

	generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0);

	bio_set_dev(bio, dc->bdev);
	bio->bi_iter.bi_sector += dc->sb.data_offset;

	if (cached_dev_get(dc)) {
		s = search_alloc(bio, d);
		trace_bcache_request_start(s->d, bio);

		if (!bio->bi_iter.bi_size) {
			/*
			 * can't call bch_journal_meta from under
			 * generic_make_request
			 */
			continue_at_nobarrier(&s->cl,
					      cached_dev_nodata,
					      bcache_wq);
		} else {
			s->iop.bypass = check_should_bypass(dc, bio);

			if (rw)
				cached_dev_write(dc, s);
			else
				cached_dev_read(dc, s);
		}
	} else {
		if ((bio_op(bio) == REQ_OP_DISCARD) &&
		    !blk_queue_discard(bdev_get_queue(dc->bdev)))
			bio_endio(bio);
		else
			generic_make_request(bio);
	}

	return BLK_QC_T_NONE;
}

static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode,
			    unsigned int cmd, unsigned long arg)
{
	struct cached_dev *dc = container_of(d, struct cached_dev, disk);
	return __blkdev_driver_ioctl(dc->bdev, mode, cmd, arg);
}

static int cached_dev_congested(void *data, int bits)
{
	struct bcache_device *d = data;
	struct cached_dev *dc = container_of(d, struct cached_dev, disk);
	struct request_queue *q = bdev_get_queue(dc->bdev);
	int ret = 0;

	if (bdi_congested(q->backing_dev_info, bits))
		return 1;

	if (cached_dev_get(dc)) {
		unsigned i;
		struct cache *ca;

		for_each_cache(ca, d->c, i) {
			q = bdev_get_queue(ca->bdev);
			ret |= bdi_congested(q->backing_dev_info, bits);
		}

		cached_dev_put(dc);
	}

	return ret;
}

void bch_cached_dev_request_init(struct cached_dev *dc)
{
	struct gendisk *g = dc->disk.disk;

	g->queue->make_request_fn = cached_dev_make_request;
	g->queue->backing_dev_info->congested_fn = cached_dev_congested;
	dc->disk.cache_miss = cached_dev_cache_miss;
	dc->disk.ioctl = cached_dev_ioctl;
}

/* Flash backed devices */

static int flash_dev_cache_miss(struct btree *b, struct search *s,
				struct bio *bio, unsigned sectors)
{
	unsigned bytes = min(sectors, bio_sectors(bio)) << 9;

	swap(bio->bi_iter.bi_size, bytes);
	zero_fill_bio(bio);
	swap(bio->bi_iter.bi_size, bytes);

	bio_advance(bio, bytes);

	if (!bio->bi_iter.bi_size)
		return MAP_DONE;

	return MAP_CONTINUE;
}

static void flash_dev_nodata(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, cl);

	if (s->iop.flush_journal)
		bch_journal_meta(s->iop.c, cl);

	continue_at(cl, search_free, NULL);
}

static blk_qc_t flash_dev_make_request(struct request_queue *q,
				       struct bio *bio)
{
	struct search *s;
	struct closure *cl;
	struct bcache_device *d = bio->bi_disk->private_data;
	int rw = bio_data_dir(bio);

	generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0);

	s = search_alloc(bio, d);
	cl = &s->cl;
	bio = &s->bio.bio;

	trace_bcache_request_start(s->d, bio);

	if (!bio->bi_iter.bi_size) {
		/*
		 * can't call bch_journal_meta from under
		 * generic_make_request
		 */
		continue_at_nobarrier(&s->cl,
				      flash_dev_nodata,
				      bcache_wq);
		return BLK_QC_T_NONE;
	} else if (rw) {
		bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys,
					&KEY(d->id, bio->bi_iter.bi_sector, 0),
					&KEY(d->id, bio_end_sector(bio), 0));

		s->iop.bypass = (bio_op(bio) == REQ_OP_DISCARD) != 0;
		s->iop.writeback = true;
		s->iop.bio = bio;

		closure_call(&s->iop.cl, bch_data_insert, NULL, cl);
	} else {
		closure_call(&s->iop.cl, cache_lookup, NULL, cl);
	}

	continue_at(cl, search_free, NULL);
	return BLK_QC_T_NONE;
}
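
/*
 * Note (editorial): flash-only volumes have no backing device, so the
 * write path above always goes through bch_data_insert() in writeback
 * mode (the cache set *is* the storage), discards become bypass inserts
 * that merely invalidate the range, and reads that find no key get
 * zero-filled by flash_dev_cache_miss().
 */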

static int flash_dev_ioctl(struct bcache_device *d, fmode_t mode,
			   unsigned int cmd, unsigned long arg)
{
	return -ENOTTY;
}

static int flash_dev_congested(void *data, int bits)
{
	struct bcache_device *d = data;
	struct request_queue *q;
	struct cache *ca;
	unsigned i;
	int ret = 0;

	for_each_cache(ca, d->c, i) {
		q = bdev_get_queue(ca->bdev);
		ret |= bdi_congested(q->backing_dev_info, bits);
	}

	return ret;
}

void bch_flash_dev_request_init(struct bcache_device *d)
{
	struct gendisk *g = d->disk;

	g->queue->make_request_fn = flash_dev_make_request;
	g->queue->backing_dev_info->congested_fn = flash_dev_congested;
	d->cache_miss = flash_dev_cache_miss;
	d->ioctl = flash_dev_ioctl;
}

void bch_request_exit(void)
{
	if (bch_search_cache)
		kmem_cache_destroy(bch_search_cache);
}

int __init bch_request_init(void)
{
	bch_search_cache = KMEM_CACHE(search, 0);
	if (!bch_search_cache)
		return -ENOMEM;

	return 0;
}