linux_dsm_epyc7002/drivers/md/persistent-data/dm-block-manager.c

650 lines
14 KiB
C
Raw Normal View History

/*
* Copyright (C) 2011 Red Hat, Inc.
*
* This file is released under the GPL.
*/
#include "dm-block-manager.h"
#include "dm-persistent-data-internal.h"
#include <linux/dm-bufio.h>
#include <linux/crc32c.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/rwsem.h>
#include <linux/device-mapper.h>
#include <linux/stacktrace.h>
#include <linux/sched/task.h>
#define DM_MSG_PREFIX "block manager"
/*----------------------------------------------------------------*/
#ifdef CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING
/*
* This is a read/write semaphore with a couple of differences.
*
* i) There is a restriction on the number of concurrent read locks that
* may be held at once. This is just an implementation detail.
*
* ii) Recursive locking attempts are detected and return EINVAL. A stack
* trace is also emitted for the previous lock acquisition.
*
* iii) Priority is given to write locks.
*/
#define MAX_HOLDERS 4
#define MAX_STACK 10
dm persistent data: Simplify stack trace handling Replace the indirection through struct stack_trace with an invocation of the storage array based interface. This results in less storage space and indirection. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com> Cc: Andy Lutomirski <luto@kernel.org> Cc: dm-devel@redhat.com Cc: Mike Snitzer <snitzer@redhat.com> Cc: Alasdair Kergon <agk@redhat.com> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Alexander Potapenko <glider@google.com> Cc: Alexey Dobriyan <adobriyan@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Christoph Lameter <cl@linux.com> Cc: Pekka Enberg <penberg@kernel.org> Cc: linux-mm@kvack.org Cc: David Rientjes <rientjes@google.com> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Dmitry Vyukov <dvyukov@google.com> Cc: Andrey Ryabinin <aryabinin@virtuozzo.com> Cc: kasan-dev@googlegroups.com Cc: Mike Rapoport <rppt@linux.vnet.ibm.com> Cc: Akinobu Mita <akinobu.mita@gmail.com> Cc: Christoph Hellwig <hch@lst.de> Cc: iommu@lists.linux-foundation.org Cc: Robin Murphy <robin.murphy@arm.com> Cc: Marek Szyprowski <m.szyprowski@samsung.com> Cc: Johannes Thumshirn <jthumshirn@suse.de> Cc: David Sterba <dsterba@suse.com> Cc: Chris Mason <clm@fb.com> Cc: Josef Bacik <josef@toxicpanda.com> Cc: linux-btrfs@vger.kernel.org Cc: Daniel Vetter <daniel@ffwll.ch> Cc: intel-gfx@lists.freedesktop.org Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com> Cc: dri-devel@lists.freedesktop.org Cc: David Airlie <airlied@linux.ie> Cc: Jani Nikula <jani.nikula@linux.intel.com> Cc: Rodrigo Vivi <rodrigo.vivi@intel.com> Cc: Tom Zanussi <tom.zanussi@linux.intel.com> Cc: Miroslav Benes <mbenes@suse.cz> Cc: linux-arch@vger.kernel.org Link: https://lkml.kernel.org/r/20190425094802.533968922@linutronix.de
2019-04-25 16:45:08 +07:00
struct stack_store {
unsigned int nr_entries;
unsigned long entries[MAX_STACK];
};
struct block_lock {
spinlock_t lock;
__s32 count;
struct list_head waiters;
struct task_struct *holders[MAX_HOLDERS];
#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
dm persistent data: Simplify stack trace handling Replace the indirection through struct stack_trace with an invocation of the storage array based interface. This results in less storage space and indirection. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com> Cc: Andy Lutomirski <luto@kernel.org> Cc: dm-devel@redhat.com Cc: Mike Snitzer <snitzer@redhat.com> Cc: Alasdair Kergon <agk@redhat.com> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Alexander Potapenko <glider@google.com> Cc: Alexey Dobriyan <adobriyan@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Christoph Lameter <cl@linux.com> Cc: Pekka Enberg <penberg@kernel.org> Cc: linux-mm@kvack.org Cc: David Rientjes <rientjes@google.com> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Dmitry Vyukov <dvyukov@google.com> Cc: Andrey Ryabinin <aryabinin@virtuozzo.com> Cc: kasan-dev@googlegroups.com Cc: Mike Rapoport <rppt@linux.vnet.ibm.com> Cc: Akinobu Mita <akinobu.mita@gmail.com> Cc: Christoph Hellwig <hch@lst.de> Cc: iommu@lists.linux-foundation.org Cc: Robin Murphy <robin.murphy@arm.com> Cc: Marek Szyprowski <m.szyprowski@samsung.com> Cc: Johannes Thumshirn <jthumshirn@suse.de> Cc: David Sterba <dsterba@suse.com> Cc: Chris Mason <clm@fb.com> Cc: Josef Bacik <josef@toxicpanda.com> Cc: linux-btrfs@vger.kernel.org Cc: Daniel Vetter <daniel@ffwll.ch> Cc: intel-gfx@lists.freedesktop.org Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com> Cc: dri-devel@lists.freedesktop.org Cc: David Airlie <airlied@linux.ie> Cc: Jani Nikula <jani.nikula@linux.intel.com> Cc: Rodrigo Vivi <rodrigo.vivi@intel.com> Cc: Tom Zanussi <tom.zanussi@linux.intel.com> Cc: Miroslav Benes <mbenes@suse.cz> Cc: linux-arch@vger.kernel.org Link: https://lkml.kernel.org/r/20190425094802.533968922@linutronix.de
2019-04-25 16:45:08 +07:00
struct stack_store traces[MAX_HOLDERS];
#endif
};
struct waiter {
struct list_head list;
struct task_struct *task;
int wants_write;
};
static unsigned __find_holder(struct block_lock *lock,
struct task_struct *task)
{
unsigned i;
for (i = 0; i < MAX_HOLDERS; i++)
if (lock->holders[i] == task)
break;
BUG_ON(i == MAX_HOLDERS);
return i;
}
/* call this *after* you increment lock->count */
static void __add_holder(struct block_lock *lock, struct task_struct *task)
{
unsigned h = __find_holder(lock, NULL);
#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
dm persistent data: Simplify stack trace handling Replace the indirection through struct stack_trace with an invocation of the storage array based interface. This results in less storage space and indirection. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com> Cc: Andy Lutomirski <luto@kernel.org> Cc: dm-devel@redhat.com Cc: Mike Snitzer <snitzer@redhat.com> Cc: Alasdair Kergon <agk@redhat.com> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Alexander Potapenko <glider@google.com> Cc: Alexey Dobriyan <adobriyan@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Christoph Lameter <cl@linux.com> Cc: Pekka Enberg <penberg@kernel.org> Cc: linux-mm@kvack.org Cc: David Rientjes <rientjes@google.com> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Dmitry Vyukov <dvyukov@google.com> Cc: Andrey Ryabinin <aryabinin@virtuozzo.com> Cc: kasan-dev@googlegroups.com Cc: Mike Rapoport <rppt@linux.vnet.ibm.com> Cc: Akinobu Mita <akinobu.mita@gmail.com> Cc: Christoph Hellwig <hch@lst.de> Cc: iommu@lists.linux-foundation.org Cc: Robin Murphy <robin.murphy@arm.com> Cc: Marek Szyprowski <m.szyprowski@samsung.com> Cc: Johannes Thumshirn <jthumshirn@suse.de> Cc: David Sterba <dsterba@suse.com> Cc: Chris Mason <clm@fb.com> Cc: Josef Bacik <josef@toxicpanda.com> Cc: linux-btrfs@vger.kernel.org Cc: Daniel Vetter <daniel@ffwll.ch> Cc: intel-gfx@lists.freedesktop.org Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com> Cc: dri-devel@lists.freedesktop.org Cc: David Airlie <airlied@linux.ie> Cc: Jani Nikula <jani.nikula@linux.intel.com> Cc: Rodrigo Vivi <rodrigo.vivi@intel.com> Cc: Tom Zanussi <tom.zanussi@linux.intel.com> Cc: Miroslav Benes <mbenes@suse.cz> Cc: linux-arch@vger.kernel.org Link: https://lkml.kernel.org/r/20190425094802.533968922@linutronix.de
2019-04-25 16:45:08 +07:00
struct stack_store *t;
#endif
get_task_struct(task);
lock->holders[h] = task;
#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
t = lock->traces + h;
dm persistent data: Simplify stack trace handling Replace the indirection through struct stack_trace with an invocation of the storage array based interface. This results in less storage space and indirection. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com> Cc: Andy Lutomirski <luto@kernel.org> Cc: dm-devel@redhat.com Cc: Mike Snitzer <snitzer@redhat.com> Cc: Alasdair Kergon <agk@redhat.com> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Alexander Potapenko <glider@google.com> Cc: Alexey Dobriyan <adobriyan@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Christoph Lameter <cl@linux.com> Cc: Pekka Enberg <penberg@kernel.org> Cc: linux-mm@kvack.org Cc: David Rientjes <rientjes@google.com> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Dmitry Vyukov <dvyukov@google.com> Cc: Andrey Ryabinin <aryabinin@virtuozzo.com> Cc: kasan-dev@googlegroups.com Cc: Mike Rapoport <rppt@linux.vnet.ibm.com> Cc: Akinobu Mita <akinobu.mita@gmail.com> Cc: Christoph Hellwig <hch@lst.de> Cc: iommu@lists.linux-foundation.org Cc: Robin Murphy <robin.murphy@arm.com> Cc: Marek Szyprowski <m.szyprowski@samsung.com> Cc: Johannes Thumshirn <jthumshirn@suse.de> Cc: David Sterba <dsterba@suse.com> Cc: Chris Mason <clm@fb.com> Cc: Josef Bacik <josef@toxicpanda.com> Cc: linux-btrfs@vger.kernel.org Cc: Daniel Vetter <daniel@ffwll.ch> Cc: intel-gfx@lists.freedesktop.org Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com> Cc: dri-devel@lists.freedesktop.org Cc: David Airlie <airlied@linux.ie> Cc: Jani Nikula <jani.nikula@linux.intel.com> Cc: Rodrigo Vivi <rodrigo.vivi@intel.com> Cc: Tom Zanussi <tom.zanussi@linux.intel.com> Cc: Miroslav Benes <mbenes@suse.cz> Cc: linux-arch@vger.kernel.org Link: https://lkml.kernel.org/r/20190425094802.533968922@linutronix.de
2019-04-25 16:45:08 +07:00
t->nr_entries = stack_trace_save(t->entries, MAX_STACK, 2);
#endif
}
/* call this *before* you decrement lock->count */
static void __del_holder(struct block_lock *lock, struct task_struct *task)
{
unsigned h = __find_holder(lock, task);
lock->holders[h] = NULL;
put_task_struct(task);
}
static int __check_holder(struct block_lock *lock)
{
unsigned i;
for (i = 0; i < MAX_HOLDERS; i++) {
if (lock->holders[i] == current) {
DMERR("recursive lock detected in metadata");
#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
DMERR("previously held here:");
dm persistent data: Simplify stack trace handling Replace the indirection through struct stack_trace with an invocation of the storage array based interface. This results in less storage space and indirection. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com> Cc: Andy Lutomirski <luto@kernel.org> Cc: dm-devel@redhat.com Cc: Mike Snitzer <snitzer@redhat.com> Cc: Alasdair Kergon <agk@redhat.com> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Alexander Potapenko <glider@google.com> Cc: Alexey Dobriyan <adobriyan@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Christoph Lameter <cl@linux.com> Cc: Pekka Enberg <penberg@kernel.org> Cc: linux-mm@kvack.org Cc: David Rientjes <rientjes@google.com> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Dmitry Vyukov <dvyukov@google.com> Cc: Andrey Ryabinin <aryabinin@virtuozzo.com> Cc: kasan-dev@googlegroups.com Cc: Mike Rapoport <rppt@linux.vnet.ibm.com> Cc: Akinobu Mita <akinobu.mita@gmail.com> Cc: Christoph Hellwig <hch@lst.de> Cc: iommu@lists.linux-foundation.org Cc: Robin Murphy <robin.murphy@arm.com> Cc: Marek Szyprowski <m.szyprowski@samsung.com> Cc: Johannes Thumshirn <jthumshirn@suse.de> Cc: David Sterba <dsterba@suse.com> Cc: Chris Mason <clm@fb.com> Cc: Josef Bacik <josef@toxicpanda.com> Cc: linux-btrfs@vger.kernel.org Cc: Daniel Vetter <daniel@ffwll.ch> Cc: intel-gfx@lists.freedesktop.org Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com> Cc: dri-devel@lists.freedesktop.org Cc: David Airlie <airlied@linux.ie> Cc: Jani Nikula <jani.nikula@linux.intel.com> Cc: Rodrigo Vivi <rodrigo.vivi@intel.com> Cc: Tom Zanussi <tom.zanussi@linux.intel.com> Cc: Miroslav Benes <mbenes@suse.cz> Cc: linux-arch@vger.kernel.org Link: https://lkml.kernel.org/r/20190425094802.533968922@linutronix.de
2019-04-25 16:45:08 +07:00
stack_trace_print(lock->traces[i].entries,
lock->traces[i].nr_entries, 4);
DMERR("subsequent acquisition attempted here:");
dump_stack();
#endif
return -EINVAL;
}
}
return 0;
}
static void __wait(struct waiter *w)
{
for (;;) {
sched/core: Remove set_task_state() This is a nasty interface and setting the state of a foreign task must not be done. As of the following commit: be628be0956 ("bcache: Make gc wakeup sane, remove set_task_state()") ... everyone in the kernel calls set_task_state() with current, allowing the helper to be removed. However, as the comment indicates, it is still around for those archs where computing current is more expensive than using a pointer, at least in theory. An important arch that is affected is arm64, however this has been addressed now [1] and performance is up to par making no difference with either calls. Of all the callers, if any, it's the locking bits that would care most about this -- ie: we end up passing a tsk pointer to a lot of the lock slowpath, and setting ->state on that. The following numbers are based on two tests: a custom ad-hoc microbenchmark that just measures latencies (for ~65 million calls) between get_task_state() vs get_current_state(). Secondly for a higher overview, an unlink microbenchmark was used, which pounds on a single file with open, close,unlink combos with increasing thread counts (up to 4x ncpus). While the workload is quite unrealistic, it does contend a lot on the inode mutex or now rwsem. [1] https://lkml.kernel.org/r/1483468021-8237-1-git-send-email-mark.rutland@arm.com == 1. x86-64 == Avg runtime set_task_state(): 601 msecs Avg runtime set_current_state(): 552 msecs vanilla dirty Hmean unlink1-processes-2 36089.26 ( 0.00%) 38977.33 ( 8.00%) Hmean unlink1-processes-5 28555.01 ( 0.00%) 29832.55 ( 4.28%) Hmean unlink1-processes-8 37323.75 ( 0.00%) 44974.57 ( 20.50%) Hmean unlink1-processes-12 43571.88 ( 0.00%) 44283.01 ( 1.63%) Hmean unlink1-processes-21 34431.52 ( 0.00%) 38284.45 ( 11.19%) Hmean unlink1-processes-30 34813.26 ( 0.00%) 37975.17 ( 9.08%) Hmean unlink1-processes-48 37048.90 ( 0.00%) 39862.78 ( 7.59%) Hmean unlink1-processes-79 35630.01 ( 0.00%) 36855.30 ( 3.44%) Hmean unlink1-processes-110 36115.85 ( 0.00%) 39843.91 ( 10.32%) Hmean unlink1-processes-141 32546.96 ( 0.00%) 35418.52 ( 8.82%) Hmean unlink1-processes-172 34674.79 ( 0.00%) 36899.21 ( 6.42%) Hmean unlink1-processes-203 37303.11 ( 0.00%) 36393.04 ( -2.44%) Hmean unlink1-processes-224 35712.13 ( 0.00%) 36685.96 ( 2.73%) == 2. ppc64le == Avg runtime set_task_state(): 938 msecs Avg runtime set_current_state: 940 msecs vanilla dirty Hmean unlink1-processes-2 19269.19 ( 0.00%) 30704.50 ( 59.35%) Hmean unlink1-processes-5 20106.15 ( 0.00%) 21804.15 ( 8.45%) Hmean unlink1-processes-8 17496.97 ( 0.00%) 17243.28 ( -1.45%) Hmean unlink1-processes-12 14224.15 ( 0.00%) 17240.21 ( 21.20%) Hmean unlink1-processes-21 14155.66 ( 0.00%) 15681.23 ( 10.78%) Hmean unlink1-processes-30 14450.70 ( 0.00%) 15995.83 ( 10.69%) Hmean unlink1-processes-48 16945.57 ( 0.00%) 16370.42 ( -3.39%) Hmean unlink1-processes-79 15788.39 ( 0.00%) 14639.27 ( -7.28%) Hmean unlink1-processes-110 14268.48 ( 0.00%) 14377.40 ( 0.76%) Hmean unlink1-processes-141 14023.65 ( 0.00%) 16271.69 ( 16.03%) Hmean unlink1-processes-172 13417.62 ( 0.00%) 16067.55 ( 19.75%) Hmean unlink1-processes-203 15293.08 ( 0.00%) 15440.40 ( 0.96%) Hmean unlink1-processes-234 13719.32 ( 0.00%) 16190.74 ( 18.01%) Hmean unlink1-processes-265 16400.97 ( 0.00%) 16115.22 ( -1.74%) Hmean unlink1-processes-296 14388.60 ( 0.00%) 16216.13 ( 12.70%) Hmean unlink1-processes-320 15771.85 ( 0.00%) 15905.96 ( 0.85%) x86-64 (known to be fast for get_current()/this_cpu_read_stable() caching) and ppc64 (with paca) show similar improvements in the unlink microbenches. The small delta for ppc64 (2ms), does not represent the gains on the unlink runs. In the case of x86, there was a decent amount of variation in the latency runs, but always within a 20 to 50ms increase), ppc was more constant. Signed-off-by: Davidlohr Bueso <dbueso@suse.de> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: dave@stgolabs.net Cc: mark.rutland@arm.com Link: http://lkml.kernel.org/r/1483479794-14013-5-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-01-04 04:43:14 +07:00
set_current_state(TASK_UNINTERRUPTIBLE);
if (!w->task)
break;
schedule();
}
sched/core: Remove set_task_state() This is a nasty interface and setting the state of a foreign task must not be done. As of the following commit: be628be0956 ("bcache: Make gc wakeup sane, remove set_task_state()") ... everyone in the kernel calls set_task_state() with current, allowing the helper to be removed. However, as the comment indicates, it is still around for those archs where computing current is more expensive than using a pointer, at least in theory. An important arch that is affected is arm64, however this has been addressed now [1] and performance is up to par making no difference with either calls. Of all the callers, if any, it's the locking bits that would care most about this -- ie: we end up passing a tsk pointer to a lot of the lock slowpath, and setting ->state on that. The following numbers are based on two tests: a custom ad-hoc microbenchmark that just measures latencies (for ~65 million calls) between get_task_state() vs get_current_state(). Secondly for a higher overview, an unlink microbenchmark was used, which pounds on a single file with open, close,unlink combos with increasing thread counts (up to 4x ncpus). While the workload is quite unrealistic, it does contend a lot on the inode mutex or now rwsem. [1] https://lkml.kernel.org/r/1483468021-8237-1-git-send-email-mark.rutland@arm.com == 1. x86-64 == Avg runtime set_task_state(): 601 msecs Avg runtime set_current_state(): 552 msecs vanilla dirty Hmean unlink1-processes-2 36089.26 ( 0.00%) 38977.33 ( 8.00%) Hmean unlink1-processes-5 28555.01 ( 0.00%) 29832.55 ( 4.28%) Hmean unlink1-processes-8 37323.75 ( 0.00%) 44974.57 ( 20.50%) Hmean unlink1-processes-12 43571.88 ( 0.00%) 44283.01 ( 1.63%) Hmean unlink1-processes-21 34431.52 ( 0.00%) 38284.45 ( 11.19%) Hmean unlink1-processes-30 34813.26 ( 0.00%) 37975.17 ( 9.08%) Hmean unlink1-processes-48 37048.90 ( 0.00%) 39862.78 ( 7.59%) Hmean unlink1-processes-79 35630.01 ( 0.00%) 36855.30 ( 3.44%) Hmean unlink1-processes-110 36115.85 ( 0.00%) 39843.91 ( 10.32%) Hmean unlink1-processes-141 32546.96 ( 0.00%) 35418.52 ( 8.82%) Hmean unlink1-processes-172 34674.79 ( 0.00%) 36899.21 ( 6.42%) Hmean unlink1-processes-203 37303.11 ( 0.00%) 36393.04 ( -2.44%) Hmean unlink1-processes-224 35712.13 ( 0.00%) 36685.96 ( 2.73%) == 2. ppc64le == Avg runtime set_task_state(): 938 msecs Avg runtime set_current_state: 940 msecs vanilla dirty Hmean unlink1-processes-2 19269.19 ( 0.00%) 30704.50 ( 59.35%) Hmean unlink1-processes-5 20106.15 ( 0.00%) 21804.15 ( 8.45%) Hmean unlink1-processes-8 17496.97 ( 0.00%) 17243.28 ( -1.45%) Hmean unlink1-processes-12 14224.15 ( 0.00%) 17240.21 ( 21.20%) Hmean unlink1-processes-21 14155.66 ( 0.00%) 15681.23 ( 10.78%) Hmean unlink1-processes-30 14450.70 ( 0.00%) 15995.83 ( 10.69%) Hmean unlink1-processes-48 16945.57 ( 0.00%) 16370.42 ( -3.39%) Hmean unlink1-processes-79 15788.39 ( 0.00%) 14639.27 ( -7.28%) Hmean unlink1-processes-110 14268.48 ( 0.00%) 14377.40 ( 0.76%) Hmean unlink1-processes-141 14023.65 ( 0.00%) 16271.69 ( 16.03%) Hmean unlink1-processes-172 13417.62 ( 0.00%) 16067.55 ( 19.75%) Hmean unlink1-processes-203 15293.08 ( 0.00%) 15440.40 ( 0.96%) Hmean unlink1-processes-234 13719.32 ( 0.00%) 16190.74 ( 18.01%) Hmean unlink1-processes-265 16400.97 ( 0.00%) 16115.22 ( -1.74%) Hmean unlink1-processes-296 14388.60 ( 0.00%) 16216.13 ( 12.70%) Hmean unlink1-processes-320 15771.85 ( 0.00%) 15905.96 ( 0.85%) x86-64 (known to be fast for get_current()/this_cpu_read_stable() caching) and ppc64 (with paca) show similar improvements in the unlink microbenches. The small delta for ppc64 (2ms), does not represent the gains on the unlink runs. In the case of x86, there was a decent amount of variation in the latency runs, but always within a 20 to 50ms increase), ppc was more constant. Signed-off-by: Davidlohr Bueso <dbueso@suse.de> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: dave@stgolabs.net Cc: mark.rutland@arm.com Link: http://lkml.kernel.org/r/1483479794-14013-5-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-01-04 04:43:14 +07:00
set_current_state(TASK_RUNNING);
}
static void __wake_waiter(struct waiter *w)
{
struct task_struct *task;
list_del(&w->list);
task = w->task;
smp_mb();
w->task = NULL;
wake_up_process(task);
}
/*
* We either wake a few readers or a single writer.
*/
static void __wake_many(struct block_lock *lock)
{
struct waiter *w, *tmp;
BUG_ON(lock->count < 0);
list_for_each_entry_safe(w, tmp, &lock->waiters, list) {
if (lock->count >= MAX_HOLDERS)
return;
if (w->wants_write) {
if (lock->count > 0)
return; /* still read locked */
lock->count = -1;
__add_holder(lock, w->task);
__wake_waiter(w);
return;
}
lock->count++;
__add_holder(lock, w->task);
__wake_waiter(w);
}
}
static void bl_init(struct block_lock *lock)
{
int i;
spin_lock_init(&lock->lock);
lock->count = 0;
INIT_LIST_HEAD(&lock->waiters);
for (i = 0; i < MAX_HOLDERS; i++)
lock->holders[i] = NULL;
}
static int __available_for_read(struct block_lock *lock)
{
return lock->count >= 0 &&
lock->count < MAX_HOLDERS &&
list_empty(&lock->waiters);
}
static int bl_down_read(struct block_lock *lock)
{
int r;
struct waiter w;
spin_lock(&lock->lock);
r = __check_holder(lock);
if (r) {
spin_unlock(&lock->lock);
return r;
}
if (__available_for_read(lock)) {
lock->count++;
__add_holder(lock, current);
spin_unlock(&lock->lock);
return 0;
}
get_task_struct(current);
w.task = current;
w.wants_write = 0;
list_add_tail(&w.list, &lock->waiters);
spin_unlock(&lock->lock);
__wait(&w);
put_task_struct(current);
return 0;
}
static int bl_down_read_nonblock(struct block_lock *lock)
{
int r;
spin_lock(&lock->lock);
r = __check_holder(lock);
if (r)
goto out;
if (__available_for_read(lock)) {
lock->count++;
__add_holder(lock, current);
r = 0;
} else
r = -EWOULDBLOCK;
out:
spin_unlock(&lock->lock);
return r;
}
static void bl_up_read(struct block_lock *lock)
{
spin_lock(&lock->lock);
BUG_ON(lock->count <= 0);
__del_holder(lock, current);
--lock->count;
if (!list_empty(&lock->waiters))
__wake_many(lock);
spin_unlock(&lock->lock);
}
static int bl_down_write(struct block_lock *lock)
{
int r;
struct waiter w;
spin_lock(&lock->lock);
r = __check_holder(lock);
if (r) {
spin_unlock(&lock->lock);
return r;
}
if (lock->count == 0 && list_empty(&lock->waiters)) {
lock->count = -1;
__add_holder(lock, current);
spin_unlock(&lock->lock);
return 0;
}
get_task_struct(current);
w.task = current;
w.wants_write = 1;
/*
* Writers given priority. We know there's only one mutator in the
* system, so ignoring the ordering reversal.
*/
list_add(&w.list, &lock->waiters);
spin_unlock(&lock->lock);
__wait(&w);
put_task_struct(current);
return 0;
}
static void bl_up_write(struct block_lock *lock)
{
spin_lock(&lock->lock);
__del_holder(lock, current);
lock->count = 0;
if (!list_empty(&lock->waiters))
__wake_many(lock);
spin_unlock(&lock->lock);
}
static void report_recursive_bug(dm_block_t b, int r)
{
if (r == -EINVAL)
DMERR("recursive acquisition of block %llu requested.",
(unsigned long long) b);
}
#else /* !CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING */
#define bl_init(x) do { } while (0)
#define bl_down_read(x) 0
#define bl_down_read_nonblock(x) 0
#define bl_up_read(x) do { } while (0)
#define bl_down_write(x) 0
#define bl_up_write(x) do { } while (0)
#define report_recursive_bug(x, y) do { } while (0)
#endif /* CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING */
/*----------------------------------------------------------------*/
/*
* Block manager is currently implemented using dm-bufio. struct
* dm_block_manager and struct dm_block map directly onto a couple of
* structs in the bufio interface. I want to retain the freedom to move
* away from bufio in the future. So these structs are just cast within
* this .c file, rather than making it through to the public interface.
*/
static struct dm_buffer *to_buffer(struct dm_block *b)
{
return (struct dm_buffer *) b;
}
dm_block_t dm_block_location(struct dm_block *b)
{
return dm_bufio_get_block_number(to_buffer(b));
}
EXPORT_SYMBOL_GPL(dm_block_location);
void *dm_block_data(struct dm_block *b)
{
return dm_bufio_get_block_data(to_buffer(b));
}
EXPORT_SYMBOL_GPL(dm_block_data);
struct buffer_aux {
struct dm_block_validator *validator;
int write_locked;
#ifdef CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING
struct block_lock lock;
#endif
};
static void dm_block_manager_alloc_callback(struct dm_buffer *buf)
{
struct buffer_aux *aux = dm_bufio_get_aux_data(buf);
aux->validator = NULL;
bl_init(&aux->lock);
}
static void dm_block_manager_write_callback(struct dm_buffer *buf)
{
struct buffer_aux *aux = dm_bufio_get_aux_data(buf);
if (aux->validator) {
aux->validator->prepare_for_write(aux->validator, (struct dm_block *) buf,
dm_bufio_get_block_size(dm_bufio_get_client(buf)));
}
}
/*----------------------------------------------------------------
* Public interface
*--------------------------------------------------------------*/
struct dm_block_manager {
struct dm_bufio_client *bufio;
bool read_only:1;
};
struct dm_block_manager *dm_block_manager_create(struct block_device *bdev,
unsigned block_size,
unsigned max_held_per_thread)
{
int r;
struct dm_block_manager *bm;
bm = kmalloc(sizeof(*bm), GFP_KERNEL);
if (!bm) {
r = -ENOMEM;
goto bad;
}
bm->bufio = dm_bufio_client_create(bdev, block_size, max_held_per_thread,
sizeof(struct buffer_aux),
dm_block_manager_alloc_callback,
dm_block_manager_write_callback);
if (IS_ERR(bm->bufio)) {
r = PTR_ERR(bm->bufio);
kfree(bm);
goto bad;
}
bm->read_only = false;
return bm;
bad:
return ERR_PTR(r);
}
EXPORT_SYMBOL_GPL(dm_block_manager_create);
void dm_block_manager_destroy(struct dm_block_manager *bm)
{
dm_bufio_client_destroy(bm->bufio);
kfree(bm);
}
EXPORT_SYMBOL_GPL(dm_block_manager_destroy);
unsigned dm_bm_block_size(struct dm_block_manager *bm)
{
return dm_bufio_get_block_size(bm->bufio);
}
EXPORT_SYMBOL_GPL(dm_bm_block_size);
dm_block_t dm_bm_nr_blocks(struct dm_block_manager *bm)
{
return dm_bufio_get_device_size(bm->bufio);
}
static int dm_bm_validate_buffer(struct dm_block_manager *bm,
struct dm_buffer *buf,
struct buffer_aux *aux,
struct dm_block_validator *v)
{
if (unlikely(!aux->validator)) {
int r;
if (!v)
return 0;
r = v->check(v, (struct dm_block *) buf, dm_bufio_get_block_size(bm->bufio));
if (unlikely(r)) {
DMERR_LIMIT("%s validator check failed for block %llu", v->name,
(unsigned long long) dm_bufio_get_block_number(buf));
return r;
}
aux->validator = v;
} else {
if (unlikely(aux->validator != v)) {
DMERR_LIMIT("validator mismatch (old=%s vs new=%s) for block %llu",
aux->validator->name, v ? v->name : "NULL",
(unsigned long long) dm_bufio_get_block_number(buf));
return -EINVAL;
}
}
return 0;
}
int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b,
struct dm_block_validator *v,
struct dm_block **result)
{
struct buffer_aux *aux;
void *p;
int r;
p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result);
if (IS_ERR(p))
return PTR_ERR(p);
aux = dm_bufio_get_aux_data(to_buffer(*result));
r = bl_down_read(&aux->lock);
if (unlikely(r)) {
dm_bufio_release(to_buffer(*result));
report_recursive_bug(b, r);
return r;
}
aux->write_locked = 0;
r = dm_bm_validate_buffer(bm, to_buffer(*result), aux, v);
if (unlikely(r)) {
bl_up_read(&aux->lock);
dm_bufio_release(to_buffer(*result));
return r;
}
return 0;
}
EXPORT_SYMBOL_GPL(dm_bm_read_lock);
int dm_bm_write_lock(struct dm_block_manager *bm,
dm_block_t b, struct dm_block_validator *v,
struct dm_block **result)
{
struct buffer_aux *aux;
void *p;
int r;
if (dm_bm_is_read_only(bm))
return -EPERM;
p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result);
if (IS_ERR(p))
return PTR_ERR(p);
aux = dm_bufio_get_aux_data(to_buffer(*result));
r = bl_down_write(&aux->lock);
if (r) {
dm_bufio_release(to_buffer(*result));
report_recursive_bug(b, r);
return r;
}
aux->write_locked = 1;
r = dm_bm_validate_buffer(bm, to_buffer(*result), aux, v);
if (unlikely(r)) {
bl_up_write(&aux->lock);
dm_bufio_release(to_buffer(*result));
return r;
}
return 0;
}
EXPORT_SYMBOL_GPL(dm_bm_write_lock);
int dm_bm_read_try_lock(struct dm_block_manager *bm,
dm_block_t b, struct dm_block_validator *v,
struct dm_block **result)
{
struct buffer_aux *aux;
void *p;
int r;
p = dm_bufio_get(bm->bufio, b, (struct dm_buffer **) result);
if (IS_ERR(p))
return PTR_ERR(p);
if (unlikely(!p))
return -EWOULDBLOCK;
aux = dm_bufio_get_aux_data(to_buffer(*result));
r = bl_down_read_nonblock(&aux->lock);
if (r < 0) {
dm_bufio_release(to_buffer(*result));
report_recursive_bug(b, r);
return r;
}
aux->write_locked = 0;
r = dm_bm_validate_buffer(bm, to_buffer(*result), aux, v);
if (unlikely(r)) {
bl_up_read(&aux->lock);
dm_bufio_release(to_buffer(*result));
return r;
}
return 0;
}
int dm_bm_write_lock_zero(struct dm_block_manager *bm,
dm_block_t b, struct dm_block_validator *v,
struct dm_block **result)
{
int r;
struct buffer_aux *aux;
void *p;
if (dm_bm_is_read_only(bm))
return -EPERM;
p = dm_bufio_new(bm->bufio, b, (struct dm_buffer **) result);
if (IS_ERR(p))
return PTR_ERR(p);
memset(p, 0, dm_bm_block_size(bm));
aux = dm_bufio_get_aux_data(to_buffer(*result));
r = bl_down_write(&aux->lock);
if (r) {
dm_bufio_release(to_buffer(*result));
return r;
}
aux->write_locked = 1;
aux->validator = v;
return 0;
}
EXPORT_SYMBOL_GPL(dm_bm_write_lock_zero);
void dm_bm_unlock(struct dm_block *b)
{
struct buffer_aux *aux;
aux = dm_bufio_get_aux_data(to_buffer(b));
if (aux->write_locked) {
dm_bufio_mark_buffer_dirty(to_buffer(b));
bl_up_write(&aux->lock);
} else
bl_up_read(&aux->lock);
dm_bufio_release(to_buffer(b));
}
EXPORT_SYMBOL_GPL(dm_bm_unlock);
int dm_bm_flush(struct dm_block_manager *bm)
{
if (dm_bm_is_read_only(bm))
return -EPERM;
return dm_bufio_write_dirty_buffers(bm->bufio);
}
EXPORT_SYMBOL_GPL(dm_bm_flush);
void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b)
{
dm_bufio_prefetch(bm->bufio, b, 1);
}
bool dm_bm_is_read_only(struct dm_block_manager *bm)
{
return (bm ? bm->read_only : true);
}
EXPORT_SYMBOL_GPL(dm_bm_is_read_only);
void dm_bm_set_read_only(struct dm_block_manager *bm)
{
if (bm)
bm->read_only = true;
}
EXPORT_SYMBOL_GPL(dm_bm_set_read_only);
void dm_bm_set_read_write(struct dm_block_manager *bm)
{
if (bm)
bm->read_only = false;
}
EXPORT_SYMBOL_GPL(dm_bm_set_read_write);
u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor)
{
return crc32c(~(u32) 0, data, len) ^ init_xor;
}
EXPORT_SYMBOL_GPL(dm_bm_checksum);
/*----------------------------------------------------------------*/
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_DESCRIPTION("Immutable metadata library for dm");
/*----------------------------------------------------------------*/