linux_dsm_epyc7002/arch/arc/include/asm/atomic.h
Linus Torvalds e192832869 Merge branch 'locking-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull locking updates from Ingo Molnar:
 "The main changes in this cycle are:

   - rwsem scalability improvements, phase #2, by Waiman Long, which are
     rather impressive:

       "On a 2-socket 40-core 80-thread Skylake system with 40 reader
        and writer locking threads, the min/mean/max locking operations
        done in a 5-second testing window before the patchset were:

         40 readers, Iterations Min/Mean/Max = 1,807/1,808/1,810
         40 writers, Iterations Min/Mean/Max = 1,807/50,344/151,255

        After the patchset, they became:

         40 readers, Iterations Min/Mean/Max = 30,057/31,359/32,741
         40 writers, Iterations Min/Mean/Max = 94,466/95,845/97,098"

     There's a lot of changes to the locking implementation that makes
     it similar to qrwlock, including owner handoff for more fair
     locking.

     Another microbenchmark shows how across the spectrum the
     improvements are:

       "With a locking microbenchmark running on 5.1 based kernel, the
        total locking rates (in kops/s) on a 2-socket Skylake system
        with equal numbers of readers and writers (mixed) before and
        after this patchset were:

        # of Threads   Before Patch      After Patch
        ------------   ------------      -----------
             2            2,618             4,193
             4            1,202             3,726
             8              802             3,622
            16              729             3,359
            32              319             2,826
            64              102             2,744"

     The changes are extensive and the patch-set has been through
     several iterations addressing various locking workloads. There
     might be more regressions, but unless they are pathological I
     believe we want to use this new implementation as the baseline
     going forward.

   - jump-label optimizations by Daniel Bristot de Oliveira: the primary
     motivation was to remove IPI disturbance of isolated RT-workload
     CPUs, which resulted in the implementation of batched jump-label
     updates. Beyond the improvement of the real-time characteristics
     kernel, in one test this patchset improved static key update
     overhead from 57 msecs to just 1.4 msecs - which is a nice speedup
     as well.

   - atomic64_t cross-arch type cleanups by Mark Rutland: over the last
     ~10 years of atomic64_t existence the various types used by the
     APIs only had to be self-consistent within each architecture -
     which means they became wildly inconsistent across architectures.
     Mark puts and end to this by reworking all the atomic64
     implementations to use 's64' as the base type for atomic64_t, and
     to ensure that this type is consistently used for parameters and
     return values in the API, avoiding further problems in this area.

   - A large set of small improvements to lockdep by Yuyang Du: type
     cleanups, output cleanups, function return type and othr cleanups
     all around the place.

   - A set of percpu ops cleanups and fixes by Peter Zijlstra.

   - Misc other changes - please see the Git log for more details"

* 'locking-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (82 commits)
  locking/lockdep: increase size of counters for lockdep statistics
  locking/atomics: Use sed(1) instead of non-standard head(1) option
  locking/lockdep: Move mark_lock() inside CONFIG_TRACE_IRQFLAGS && CONFIG_PROVE_LOCKING
  x86/jump_label: Make tp_vec_nr static
  x86/percpu: Optimize raw_cpu_xchg()
  x86/percpu, sched/fair: Avoid local_clock()
  x86/percpu, x86/irq: Relax {set,get}_irq_regs()
  x86/percpu: Relax smp_processor_id()
  x86/percpu: Differentiate this_cpu_{}() and __this_cpu_{}()
  locking/rwsem: Guard against making count negative
  locking/rwsem: Adaptive disabling of reader optimistic spinning
  locking/rwsem: Enable time-based spinning on reader-owned rwsem
  locking/rwsem: Make rwsem->owner an atomic_long_t
  locking/rwsem: Enable readers spinning on writer
  locking/rwsem: Clarify usage of owner's nonspinaable bit
  locking/rwsem: Wake up almost all readers in wait queue
  locking/rwsem: More optimal RT task handling of null owner
  locking/rwsem: Always release wait_lock before waking up tasks
  locking/rwsem: Implement lock handoff to prevent lock starvation
  locking/rwsem: Make rwsem_spin_on_owner() return owner state
  ...
2019-07-08 16:12:03 -07:00

559 lines
13 KiB
C

/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com)
*/
#ifndef _ASM_ARC_ATOMIC_H
#define _ASM_ARC_ATOMIC_H
#ifndef __ASSEMBLY__
#include <linux/types.h>
#include <linux/compiler.h>
#include <asm/cmpxchg.h>
#include <asm/barrier.h>
#include <asm/smp.h>
#define ATOMIC_INIT(i) { (i) }
#ifndef CONFIG_ARC_PLAT_EZNPS
#define atomic_read(v) READ_ONCE((v)->counter)
#ifdef CONFIG_ARC_HAS_LLSC
#define atomic_set(v, i) WRITE_ONCE(((v)->counter), (i))
#define ATOMIC_OP(op, c_op, asm_op) \
static inline void atomic_##op(int i, atomic_t *v) \
{ \
unsigned int val; \
\
__asm__ __volatile__( \
"1: llock %[val], [%[ctr]] \n" \
" " #asm_op " %[val], %[val], %[i] \n" \
" scond %[val], [%[ctr]] \n" \
" bnz 1b \n" \
: [val] "=&r" (val) /* Early clobber to prevent reg reuse */ \
: [ctr] "r" (&v->counter), /* Not "m": llock only supports reg direct addr mode */ \
[i] "ir" (i) \
: "cc"); \
} \
#define ATOMIC_OP_RETURN(op, c_op, asm_op) \
static inline int atomic_##op##_return(int i, atomic_t *v) \
{ \
unsigned int val; \
\
/* \
* Explicit full memory barrier needed before/after as \
* LLOCK/SCOND thmeselves don't provide any such semantics \
*/ \
smp_mb(); \
\
__asm__ __volatile__( \
"1: llock %[val], [%[ctr]] \n" \
" " #asm_op " %[val], %[val], %[i] \n" \
" scond %[val], [%[ctr]] \n" \
" bnz 1b \n" \
: [val] "=&r" (val) \
: [ctr] "r" (&v->counter), \
[i] "ir" (i) \
: "cc"); \
\
smp_mb(); \
\
return val; \
}
#define ATOMIC_FETCH_OP(op, c_op, asm_op) \
static inline int atomic_fetch_##op(int i, atomic_t *v) \
{ \
unsigned int val, orig; \
\
/* \
* Explicit full memory barrier needed before/after as \
* LLOCK/SCOND thmeselves don't provide any such semantics \
*/ \
smp_mb(); \
\
__asm__ __volatile__( \
"1: llock %[orig], [%[ctr]] \n" \
" " #asm_op " %[val], %[orig], %[i] \n" \
" scond %[val], [%[ctr]] \n" \
" bnz 1b \n" \
: [val] "=&r" (val), \
[orig] "=&r" (orig) \
: [ctr] "r" (&v->counter), \
[i] "ir" (i) \
: "cc"); \
\
smp_mb(); \
\
return orig; \
}
#else /* !CONFIG_ARC_HAS_LLSC */
#ifndef CONFIG_SMP
/* violating atomic_xxx API locking protocol in UP for optimization sake */
#define atomic_set(v, i) WRITE_ONCE(((v)->counter), (i))
#else
static inline void atomic_set(atomic_t *v, int i)
{
/*
* Independent of hardware support, all of the atomic_xxx() APIs need
* to follow the same locking rules to make sure that a "hardware"
* atomic insn (e.g. LD) doesn't clobber an "emulated" atomic insn
* sequence
*
* Thus atomic_set() despite being 1 insn (and seemingly atomic)
* requires the locking.
*/
unsigned long flags;
atomic_ops_lock(flags);
WRITE_ONCE(v->counter, i);
atomic_ops_unlock(flags);
}
#define atomic_set_release(v, i) atomic_set((v), (i))
#endif
/*
* Non hardware assisted Atomic-R-M-W
* Locking would change to irq-disabling only (UP) and spinlocks (SMP)
*/
#define ATOMIC_OP(op, c_op, asm_op) \
static inline void atomic_##op(int i, atomic_t *v) \
{ \
unsigned long flags; \
\
atomic_ops_lock(flags); \
v->counter c_op i; \
atomic_ops_unlock(flags); \
}
#define ATOMIC_OP_RETURN(op, c_op, asm_op) \
static inline int atomic_##op##_return(int i, atomic_t *v) \
{ \
unsigned long flags; \
unsigned long temp; \
\
/* \
* spin lock/unlock provides the needed smp_mb() before/after \
*/ \
atomic_ops_lock(flags); \
temp = v->counter; \
temp c_op i; \
v->counter = temp; \
atomic_ops_unlock(flags); \
\
return temp; \
}
#define ATOMIC_FETCH_OP(op, c_op, asm_op) \
static inline int atomic_fetch_##op(int i, atomic_t *v) \
{ \
unsigned long flags; \
unsigned long orig; \
\
/* \
* spin lock/unlock provides the needed smp_mb() before/after \
*/ \
atomic_ops_lock(flags); \
orig = v->counter; \
v->counter c_op i; \
atomic_ops_unlock(flags); \
\
return orig; \
}
#endif /* !CONFIG_ARC_HAS_LLSC */
#define ATOMIC_OPS(op, c_op, asm_op) \
ATOMIC_OP(op, c_op, asm_op) \
ATOMIC_OP_RETURN(op, c_op, asm_op) \
ATOMIC_FETCH_OP(op, c_op, asm_op)
ATOMIC_OPS(add, +=, add)
ATOMIC_OPS(sub, -=, sub)
#define atomic_andnot atomic_andnot
#define atomic_fetch_andnot atomic_fetch_andnot
#undef ATOMIC_OPS
#define ATOMIC_OPS(op, c_op, asm_op) \
ATOMIC_OP(op, c_op, asm_op) \
ATOMIC_FETCH_OP(op, c_op, asm_op)
ATOMIC_OPS(and, &=, and)
ATOMIC_OPS(andnot, &= ~, bic)
ATOMIC_OPS(or, |=, or)
ATOMIC_OPS(xor, ^=, xor)
#else /* CONFIG_ARC_PLAT_EZNPS */
static inline int atomic_read(const atomic_t *v)
{
int temp;
__asm__ __volatile__(
" ld.di %0, [%1]"
: "=r"(temp)
: "r"(&v->counter)
: "memory");
return temp;
}
static inline void atomic_set(atomic_t *v, int i)
{
__asm__ __volatile__(
" st.di %0,[%1]"
:
: "r"(i), "r"(&v->counter)
: "memory");
}
#define ATOMIC_OP(op, c_op, asm_op) \
static inline void atomic_##op(int i, atomic_t *v) \
{ \
__asm__ __volatile__( \
" mov r2, %0\n" \
" mov r3, %1\n" \
" .word %2\n" \
: \
: "r"(i), "r"(&v->counter), "i"(asm_op) \
: "r2", "r3", "memory"); \
} \
#define ATOMIC_OP_RETURN(op, c_op, asm_op) \
static inline int atomic_##op##_return(int i, atomic_t *v) \
{ \
unsigned int temp = i; \
\
/* Explicit full memory barrier needed before/after */ \
smp_mb(); \
\
__asm__ __volatile__( \
" mov r2, %0\n" \
" mov r3, %1\n" \
" .word %2\n" \
" mov %0, r2" \
: "+r"(temp) \
: "r"(&v->counter), "i"(asm_op) \
: "r2", "r3", "memory"); \
\
smp_mb(); \
\
temp c_op i; \
\
return temp; \
}
#define ATOMIC_FETCH_OP(op, c_op, asm_op) \
static inline int atomic_fetch_##op(int i, atomic_t *v) \
{ \
unsigned int temp = i; \
\
/* Explicit full memory barrier needed before/after */ \
smp_mb(); \
\
__asm__ __volatile__( \
" mov r2, %0\n" \
" mov r3, %1\n" \
" .word %2\n" \
" mov %0, r2" \
: "+r"(temp) \
: "r"(&v->counter), "i"(asm_op) \
: "r2", "r3", "memory"); \
\
smp_mb(); \
\
return temp; \
}
#define ATOMIC_OPS(op, c_op, asm_op) \
ATOMIC_OP(op, c_op, asm_op) \
ATOMIC_OP_RETURN(op, c_op, asm_op) \
ATOMIC_FETCH_OP(op, c_op, asm_op)
ATOMIC_OPS(add, +=, CTOP_INST_AADD_DI_R2_R2_R3)
#define atomic_sub(i, v) atomic_add(-(i), (v))
#define atomic_sub_return(i, v) atomic_add_return(-(i), (v))
#define atomic_fetch_sub(i, v) atomic_fetch_add(-(i), (v))
#undef ATOMIC_OPS
#define ATOMIC_OPS(op, c_op, asm_op) \
ATOMIC_OP(op, c_op, asm_op) \
ATOMIC_FETCH_OP(op, c_op, asm_op)
ATOMIC_OPS(and, &=, CTOP_INST_AAND_DI_R2_R2_R3)
ATOMIC_OPS(or, |=, CTOP_INST_AOR_DI_R2_R2_R3)
ATOMIC_OPS(xor, ^=, CTOP_INST_AXOR_DI_R2_R2_R3)
#endif /* CONFIG_ARC_PLAT_EZNPS */
#undef ATOMIC_OPS
#undef ATOMIC_FETCH_OP
#undef ATOMIC_OP_RETURN
#undef ATOMIC_OP
#ifdef CONFIG_GENERIC_ATOMIC64
#include <asm-generic/atomic64.h>
#else /* Kconfig ensures this is only enabled with needed h/w assist */
/*
* ARCv2 supports 64-bit exclusive load (LLOCKD) / store (SCONDD)
* - The address HAS to be 64-bit aligned
* - There are 2 semantics involved here:
* = exclusive implies no interim update between load/store to same addr
* = both words are observed/updated together: this is guaranteed even
* for regular 64-bit load (LDD) / store (STD). Thus atomic64_set()
* is NOT required to use LLOCKD+SCONDD, STD suffices
*/
typedef struct {
s64 __aligned(8) counter;
} atomic64_t;
#define ATOMIC64_INIT(a) { (a) }
static inline s64 atomic64_read(const atomic64_t *v)
{
s64 val;
__asm__ __volatile__(
" ldd %0, [%1] \n"
: "=r"(val)
: "r"(&v->counter));
return val;
}
static inline void atomic64_set(atomic64_t *v, s64 a)
{
/*
* This could have been a simple assignment in "C" but would need
* explicit volatile. Otherwise gcc optimizers could elide the store
* which borked atomic64 self-test
* In the inline asm version, memory clobber needed for exact same
* reason, to tell gcc about the store.
*
* This however is not needed for sibling atomic64_add() etc since both
* load/store are explicitly done in inline asm. As long as API is used
* for each access, gcc has no way to optimize away any load/store
*/
__asm__ __volatile__(
" std %0, [%1] \n"
:
: "r"(a), "r"(&v->counter)
: "memory");
}
#define ATOMIC64_OP(op, op1, op2) \
static inline void atomic64_##op(s64 a, atomic64_t *v) \
{ \
s64 val; \
\
__asm__ __volatile__( \
"1: \n" \
" llockd %0, [%1] \n" \
" " #op1 " %L0, %L0, %L2 \n" \
" " #op2 " %H0, %H0, %H2 \n" \
" scondd %0, [%1] \n" \
" bnz 1b \n" \
: "=&r"(val) \
: "r"(&v->counter), "ir"(a) \
: "cc"); \
} \
#define ATOMIC64_OP_RETURN(op, op1, op2) \
static inline s64 atomic64_##op##_return(s64 a, atomic64_t *v) \
{ \
s64 val; \
\
smp_mb(); \
\
__asm__ __volatile__( \
"1: \n" \
" llockd %0, [%1] \n" \
" " #op1 " %L0, %L0, %L2 \n" \
" " #op2 " %H0, %H0, %H2 \n" \
" scondd %0, [%1] \n" \
" bnz 1b \n" \
: [val] "=&r"(val) \
: "r"(&v->counter), "ir"(a) \
: "cc"); /* memory clobber comes from smp_mb() */ \
\
smp_mb(); \
\
return val; \
}
#define ATOMIC64_FETCH_OP(op, op1, op2) \
static inline s64 atomic64_fetch_##op(s64 a, atomic64_t *v) \
{ \
s64 val, orig; \
\
smp_mb(); \
\
__asm__ __volatile__( \
"1: \n" \
" llockd %0, [%2] \n" \
" " #op1 " %L1, %L0, %L3 \n" \
" " #op2 " %H1, %H0, %H3 \n" \
" scondd %1, [%2] \n" \
" bnz 1b \n" \
: "=&r"(orig), "=&r"(val) \
: "r"(&v->counter), "ir"(a) \
: "cc"); /* memory clobber comes from smp_mb() */ \
\
smp_mb(); \
\
return orig; \
}
#define ATOMIC64_OPS(op, op1, op2) \
ATOMIC64_OP(op, op1, op2) \
ATOMIC64_OP_RETURN(op, op1, op2) \
ATOMIC64_FETCH_OP(op, op1, op2)
#define atomic64_andnot atomic64_andnot
#define atomic64_fetch_andnot atomic64_fetch_andnot
ATOMIC64_OPS(add, add.f, adc)
ATOMIC64_OPS(sub, sub.f, sbc)
ATOMIC64_OPS(and, and, and)
ATOMIC64_OPS(andnot, bic, bic)
ATOMIC64_OPS(or, or, or)
ATOMIC64_OPS(xor, xor, xor)
#undef ATOMIC64_OPS
#undef ATOMIC64_FETCH_OP
#undef ATOMIC64_OP_RETURN
#undef ATOMIC64_OP
static inline s64
atomic64_cmpxchg(atomic64_t *ptr, s64 expected, s64 new)
{
s64 prev;
smp_mb();
__asm__ __volatile__(
"1: llockd %0, [%1] \n"
" brne %L0, %L2, 2f \n"
" brne %H0, %H2, 2f \n"
" scondd %3, [%1] \n"
" bnz 1b \n"
"2: \n"
: "=&r"(prev)
: "r"(ptr), "ir"(expected), "r"(new)
: "cc"); /* memory clobber comes from smp_mb() */
smp_mb();
return prev;
}
static inline s64 atomic64_xchg(atomic64_t *ptr, s64 new)
{
s64 prev;
smp_mb();
__asm__ __volatile__(
"1: llockd %0, [%1] \n"
" scondd %2, [%1] \n"
" bnz 1b \n"
"2: \n"
: "=&r"(prev)
: "r"(ptr), "r"(new)
: "cc"); /* memory clobber comes from smp_mb() */
smp_mb();
return prev;
}
/**
* atomic64_dec_if_positive - decrement by 1 if old value positive
* @v: pointer of type atomic64_t
*
* The function returns the old value of *v minus 1, even if
* the atomic variable, v, was not decremented.
*/
static inline s64 atomic64_dec_if_positive(atomic64_t *v)
{
s64 val;
smp_mb();
__asm__ __volatile__(
"1: llockd %0, [%1] \n"
" sub.f %L0, %L0, 1 # w0 - 1, set C on borrow\n"
" sub.c %H0, %H0, 1 # if C set, w1 - 1\n"
" brlt %H0, 0, 2f \n"
" scondd %0, [%1] \n"
" bnz 1b \n"
"2: \n"
: "=&r"(val)
: "r"(&v->counter)
: "cc"); /* memory clobber comes from smp_mb() */
smp_mb();
return val;
}
#define atomic64_dec_if_positive atomic64_dec_if_positive
/**
* atomic64_fetch_add_unless - add unless the number is a given value
* @v: pointer of type atomic64_t
* @a: the amount to add to v...
* @u: ...unless v is equal to u.
*
* Atomically adds @a to @v, if it was not @u.
* Returns the old value of @v
*/
static inline s64 atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u)
{
s64 old, temp;
smp_mb();
__asm__ __volatile__(
"1: llockd %0, [%2] \n"
" brne %L0, %L4, 2f # continue to add since v != u \n"
" breq.d %H0, %H4, 3f # return since v == u \n"
"2: \n"
" add.f %L1, %L0, %L3 \n"
" adc %H1, %H0, %H3 \n"
" scondd %1, [%2] \n"
" bnz 1b \n"
"3: \n"
: "=&r"(old), "=&r" (temp)
: "r"(&v->counter), "r"(a), "r"(u)
: "cc"); /* memory clobber comes from smp_mb() */
smp_mb();
return old;
}
#define atomic64_fetch_add_unless atomic64_fetch_add_unless
#endif /* !CONFIG_GENERIC_ATOMIC64 */
#endif /* !__ASSEMBLY__ */
#endif