Mirror of https://github.com/AuxXxilium/linux_dsm_epyc7002.git
Merge branch 'sparc64-queued-locks'
Babu Moger says:

====================
Enable queued rwlock and queued spinlock for SPARC

This series of patches enables queued rwlock and queued spinlock support
for SPARC. These features were introduced upstream some time ago. Here are
some of the earlier discussions:

  https://lwn.net/Articles/572765/
  https://lwn.net/Articles/582200/
  https://lwn.net/Articles/561775/
  https://lwn.net/Articles/590243/

Tests: ran the AIM7 benchmark (https://github.com/davidlohr/areaim) to
verify the performance on various workloads. The same benchmark was used
when this feature was introduced and enabled on x86. Here are the test
results (average number of jobs):

  Workload                       4.11.0-rc6     4.11.0-rc6 +      Change
                                 baseline       queued locks
  High systime 10-100 users       17290.48       17295.18         +0.02%
  High systime 200-1000 users    109814.95      110248.87         +0.39%
  High systime 1200-2000 users   107912.40      127923.16        +18.54%
  Disk IO 10-100 users           168910.16      158834.17         -5.96%
  Disk IO 200-1000 users         242781.74      281285.80        +15.85%
  Disk IO 1200-2000 users        228518.23      218421.23         -4.41%
  Disk IO 10-100 users           183933.77      207928.67        +13.04%
  Disk IO 200-1000 users         491981.56      500162.33         +1.66%
  Disk IO 1200-2000 users        463395.66      467312.70         +0.84%
  fserver 10-100 users           254177.53      270283.08         +6.33%
  fserver IO 200-1000 users      269017.35      324812.20        +20.74%
  fserver IO 1200-2000 users     229538.87      284713.77        +24.03%

Some of the disk I/O results are slightly negative, but the majority of
the changes are positive, and the gains are significant in some cases.

Changes:
v3 -> v4:
  1. Addressed Geert Uytterhoeven's comment about patch #3 (def_bool y).
  2. Working on a separate patch set to define CPU_BIG_ENDIAN for all the
     default big-endian architectures, based on feedback from Geert and Arnd.
v2 -> v3:
  1. Rebased the patches on top of 4.12-rc2.
  2. Re-ordered patch #1 and patch #2 to match the order in which the issues
     were seen, so they are addressed in the same order. Patch #1 removes the
     __LINUX_SPINLOCK_TYPES_H check. Patch #2 addresses the compile error with
     qrwlock.c. This addresses Dave Miller's comments on v2.
v1 -> v2: Addressed David Miller's comments.
  1. Added CPU_BIG_ENDIAN for all SPARC.
  2. Removed the #ifndef __LINUX_SPINLOCK_TYPES_H guard from spinlock_types.h.
  3. Removed the check for CONFIG_QUEUED_RWLOCKS in SPARC64, as it is now the
     default definition for SPARC64. Cleaned up the previous arch_read_xxx and
     arch_write_xxx definitions, since they are now provided by qrwlock.h.
  4. Removed the check for CONFIG_QUEUED_SPINLOCKS in SPARC64, as it is now the
     default definition for SPARC64. Cleaned up the previous arch_spin_xxx
     definitions, since they are now provided by qspinlock.h.
v1: Initial version
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
commit 60925ee97e
arch/sparc/Kconfig

@@ -83,6 +83,8 @@ config SPARC64
	select ARCH_SUPPORTS_ATOMIC_RMW
	select HAVE_NMI
	select HAVE_REGS_AND_STACK_ACCESS_API
	select ARCH_USE_QUEUED_RWLOCKS
	select ARCH_USE_QUEUED_SPINLOCKS

config ARCH_DEFCONFIG
	string
@@ -92,6 +94,9 @@ config ARCH_DEFCONFIG
config ARCH_PROC_KCORE_TEXT
	def_bool y

config CPU_BIG_ENDIAN
	def_bool y

config ARCH_ATU
	bool
	default y if SPARC64
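The new CPU_BIG_ENDIAN symbol matters here because both the generic queued-lock code and the sub-word helpers added below (xchg16(), __cmpxchg_u8()) have to know where a byte or halfword sits inside an aligned 32-bit word. As a quick stand-alone illustration (not part of the patch), this user-space C snippet shows the layout those helpers assume on a big-endian CPU such as SPARC:

#include <stdint.h>
#include <stdio.h>

/* On big-endian SPARC the halfword at byte offset 0 occupies bits 31:16
 * of the containing 32-bit word; on little-endian it would occupy bits
 * 15:0.  The queued-lock helpers in this series rely on this layout. */
union word {
	uint32_t whole;
	uint16_t half[2];
	uint8_t  byte[4];
};

int main(void)
{
	union word w = { .whole = 0x11223344 };

	/* Big-endian output:    half[0] = 0x1122, byte[0] = 0x11
	 * Little-endian output: half[0] = 0x3344, byte[0] = 0x44 */
	printf("half[0] = 0x%04x, byte[0] = 0x%02x\n", w.half[0], w.byte[0]);
	return 0;
}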
arch/sparc/include/asm/cmpxchg_64.h

@@ -6,6 +6,17 @@
#ifndef __ARCH_SPARC64_CMPXCHG__
#define __ARCH_SPARC64_CMPXCHG__

static inline unsigned long
__cmpxchg_u32(volatile int *m, int old, int new)
{
	__asm__ __volatile__("cas [%2], %3, %0"
			     : "=&r" (new)
			     : "0" (new), "r" (m), "r" (old)
			     : "memory");

	return new;
}

static inline unsigned long xchg32(__volatile__ unsigned int *m, unsigned int val)
{
	unsigned long tmp1, tmp2;
@@ -44,10 +55,38 @@ static inline unsigned long xchg64(__volatile__ unsigned long *m, unsigned long

void __xchg_called_with_bad_pointer(void);

/*
 * Use 4 byte cas instruction to achieve 2 byte xchg. Main logic
 * here is to get the bit shift of the byte we are interested in.
 * The XOR is handy for reversing the bits for big-endian byte order.
 */
static inline unsigned long
xchg16(__volatile__ unsigned short *m, unsigned short val)
{
	unsigned long maddr = (unsigned long)m;
	int bit_shift = (((unsigned long)m & 2) ^ 2) << 3;
	unsigned int mask = 0xffff << bit_shift;
	unsigned int *ptr = (unsigned int *) (maddr & ~2);
	unsigned int old32, new32, load32;

	/* Read the old value */
	load32 = *ptr;

	do {
		old32 = load32;
		new32 = (load32 & (~mask)) | val << bit_shift;
		load32 = __cmpxchg_u32(ptr, old32, new32);
	} while (load32 != old32);

	return (load32 & mask) >> bit_shift;
}

static inline unsigned long __xchg(unsigned long x, __volatile__ void * ptr,
				   int size)
{
	switch (size) {
	case 2:
		return xchg16(ptr, x);
	case 4:
		return xchg32(ptr, x);
	case 8:
@@ -65,16 +104,6 @@ static inline unsigned long __xchg(unsigned long x, __volatile__ void * ptr,

#include <asm-generic/cmpxchg-local.h>

static inline unsigned long
__cmpxchg_u32(volatile int *m, int old, int new)
{
	__asm__ __volatile__("cas [%2], %3, %0"
			     : "=&r" (new)
			     : "0" (new), "r" (m), "r" (old)
			     : "memory");

	return new;
}

static inline unsigned long
__cmpxchg_u64(volatile long *m, unsigned long old, unsigned long new)
@@ -87,6 +116,33 @@ __cmpxchg_u64(volatile long *m, unsigned long old, unsigned long new)
	return new;
}

/*
 * Use 4 byte cas instruction to achieve 1 byte cmpxchg. Main logic
 * here is to get the bit shift of the byte we are interested in.
 * The XOR is handy for reversing the bits for big-endian byte order
 */
static inline unsigned long
__cmpxchg_u8(volatile unsigned char *m, unsigned char old, unsigned char new)
{
	unsigned long maddr = (unsigned long)m;
	int bit_shift = (((unsigned long)m & 3) ^ 3) << 3;
	unsigned int mask = 0xff << bit_shift;
	unsigned int *ptr = (unsigned int *) (maddr & ~3);
	unsigned int old32, new32, load;
	unsigned int load32 = *ptr;

	do {
		new32 = (load32 & ~mask) | (new << bit_shift);
		old32 = (load32 & ~mask) | (old << bit_shift);
		load32 = __cmpxchg_u32(ptr, old32, new32);
		if (load32 == old32)
			return old;
		load = (load32 & mask) >> bit_shift;
	} while (load == old);

	return load;
}

/* This function doesn't exist, so you'll get a linker error
   if something tries to do an invalid cmpxchg(). */
void __cmpxchg_called_with_bad_pointer(void);
@@ -95,6 +151,8 @@ static inline unsigned long
__cmpxchg(volatile void *ptr, unsigned long old, unsigned long new, int size)
{
	switch (size) {
	case 1:
		return __cmpxchg_u8(ptr, old, new);
	case 4:
		return __cmpxchg_u32(ptr, old, new);
	case 8:
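To make the bit_shift arithmetic above concrete: for a halfword at byte offset 0 the expression ((addr & 2) ^ 2) << 3 yields 16, selecting bits 31:16 of the aligned word (where that halfword lives on big-endian), while for offset 2 it yields 0, selecting bits 15:0. Below is a rough user-space sketch of the same retry loop, with a compiler builtin standing in for the SPARC cas instruction; it is an illustration only, not the kernel code, and the names cas32/xchg16_sketch are made up for this example:

#include <stdint.h>

/* Stand-in for the cas instruction: atomically compare *p with old and,
 * if they match, store new; always return the value actually observed. */
static uint32_t cas32(volatile uint32_t *p, uint32_t old, uint32_t new)
{
	__atomic_compare_exchange_n(p, &old, new, 0,
				    __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
	return old;	/* on failure, old was updated to the observed value */
}

/* 16-bit exchange built from an aligned 32-bit CAS, assuming the
 * big-endian layout shown earlier. */
static uint16_t xchg16_sketch(volatile uint16_t *m, uint16_t val)
{
	uintptr_t addr = (uintptr_t)m;
	int shift = ((addr & 2) ^ 2) << 3;	/* 16 for offset 0, 0 for offset 2 */
	uint32_t mask = (uint32_t)0xffff << shift;
	volatile uint32_t *word = (volatile uint32_t *)(addr & ~(uintptr_t)2);
	uint32_t old, seen = *word;

	do {
		old = seen;
		seen = cas32(word, old, (old & ~mask) | ((uint32_t)val << shift));
	} while (seen != old);	/* retry if another CPU changed the word */

	return (uint16_t)((seen & mask) >> shift);
}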
arch/sparc/include/asm/qrwlock.h (new file, 7 lines)

@@ -0,0 +1,7 @@
#ifndef _ASM_SPARC_QRWLOCK_H
#define _ASM_SPARC_QRWLOCK_H

#include <asm-generic/qrwlock_types.h>
#include <asm-generic/qrwlock.h>

#endif /* _ASM_SPARC_QRWLOCK_H */
arch/sparc/include/asm/qspinlock.h (new file, 7 lines)

@@ -0,0 +1,7 @@
#ifndef _ASM_SPARC_QSPINLOCK_H
#define _ASM_SPARC_QSPINLOCK_H

#include <asm-generic/qspinlock_types.h>
#include <asm-generic/qspinlock.h>

#endif /* _ASM_SPARC_QSPINLOCK_H */
arch/sparc/include/asm/spinlock_64.h

@@ -10,216 +10,12 @@

#include <asm/processor.h>
#include <asm/barrier.h>

/* To get debugging spinlocks which detect and catch
 * deadlock situations, set CONFIG_DEBUG_SPINLOCK
 * and rebuild your kernel.
 */

/* Because we play games to save cycles in the non-contention case, we
 * need to be extra careful about branch targets into the "spinning"
 * code. They live in their own section, but the newer V9 branches
 * have a shorter range than the traditional 32-bit sparc branch
 * variants. The rule is that the branches that go into and out of
 * the spinner sections must be pre-V9 branches.
 */

#define arch_spin_is_locked(lp)	((lp)->lock != 0)

static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
{
	smp_cond_load_acquire(&lock->lock, !VAL);
}

static inline void arch_spin_lock(arch_spinlock_t *lock)
{
	unsigned long tmp;

	__asm__ __volatile__(
"1:	ldstub	[%1], %0\n"
"	brnz,pn	%0, 2f\n"
"	nop\n"
"	.subsection 2\n"
"2:	ldub	[%1], %0\n"
"	brnz,pt	%0, 2b\n"
"	nop\n"
"	ba,a,pt	%%xcc, 1b\n"
"	.previous"
	: "=&r" (tmp)
	: "r" (lock)
	: "memory");
}

static inline int arch_spin_trylock(arch_spinlock_t *lock)
{
	unsigned long result;

	__asm__ __volatile__(
"	ldstub	[%1], %0\n"
	: "=r" (result)
	: "r" (lock)
	: "memory");

	return (result == 0UL);
}

static inline void arch_spin_unlock(arch_spinlock_t *lock)
{
	__asm__ __volatile__(
"	stb	%%g0, [%0]"
	: /* No outputs */
	: "r" (lock)
	: "memory");
}

static inline void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
{
	unsigned long tmp1, tmp2;

	__asm__ __volatile__(
"1:	ldstub	[%2], %0\n"
"	brnz,pn	%0, 2f\n"
"	nop\n"
"	.subsection 2\n"
"2:	rdpr	%%pil, %1\n"
"	wrpr	%3, %%pil\n"
"3:	ldub	[%2], %0\n"
"	brnz,pt	%0, 3b\n"
"	nop\n"
"	ba,pt	%%xcc, 1b\n"
"	wrpr	%1, %%pil\n"
"	.previous"
	: "=&r" (tmp1), "=&r" (tmp2)
	: "r"(lock), "r"(flags)
	: "memory");
}

/* Multi-reader locks, these are much saner than the 32-bit Sparc ones... */

static inline void arch_read_lock(arch_rwlock_t *lock)
{
	unsigned long tmp1, tmp2;

	__asm__ __volatile__ (
"1:	ldsw	[%2], %0\n"
"	brlz,pn	%0, 2f\n"
"4:	add	%0, 1, %1\n"
"	cas	[%2], %0, %1\n"
"	cmp	%0, %1\n"
"	bne,pn	%%icc, 1b\n"
"	nop\n"
"	.subsection 2\n"
"2:	ldsw	[%2], %0\n"
"	brlz,pt	%0, 2b\n"
"	nop\n"
"	ba,a,pt	%%xcc, 4b\n"
"	.previous"
	: "=&r" (tmp1), "=&r" (tmp2)
	: "r" (lock)
	: "memory");
}

static inline int arch_read_trylock(arch_rwlock_t *lock)
{
	int tmp1, tmp2;

	__asm__ __volatile__ (
"1:	ldsw	[%2], %0\n"
"	brlz,a,pn	%0, 2f\n"
"	mov	0, %0\n"
"	add	%0, 1, %1\n"
"	cas	[%2], %0, %1\n"
"	cmp	%0, %1\n"
"	bne,pn	%%icc, 1b\n"
"	mov	1, %0\n"
"2:"
	: "=&r" (tmp1), "=&r" (tmp2)
	: "r" (lock)
	: "memory");

	return tmp1;
}

static inline void arch_read_unlock(arch_rwlock_t *lock)
{
	unsigned long tmp1, tmp2;

	__asm__ __volatile__(
"1:	lduw	[%2], %0\n"
"	sub	%0, 1, %1\n"
"	cas	[%2], %0, %1\n"
"	cmp	%0, %1\n"
"	bne,pn	%%xcc, 1b\n"
"	nop"
	: "=&r" (tmp1), "=&r" (tmp2)
	: "r" (lock)
	: "memory");
}

static inline void arch_write_lock(arch_rwlock_t *lock)
{
	unsigned long mask, tmp1, tmp2;

	mask = 0x80000000UL;

	__asm__ __volatile__(
"1:	lduw	[%2], %0\n"
"	brnz,pn	%0, 2f\n"
"4:	or	%0, %3, %1\n"
"	cas	[%2], %0, %1\n"
"	cmp	%0, %1\n"
"	bne,pn	%%icc, 1b\n"
"	nop\n"
"	.subsection 2\n"
"2:	lduw	[%2], %0\n"
"	brnz,pt	%0, 2b\n"
"	nop\n"
"	ba,a,pt	%%xcc, 4b\n"
"	.previous"
	: "=&r" (tmp1), "=&r" (tmp2)
	: "r" (lock), "r" (mask)
	: "memory");
}

static inline void arch_write_unlock(arch_rwlock_t *lock)
{
	__asm__ __volatile__(
"	stw	%%g0, [%0]"
	: /* no outputs */
	: "r" (lock)
	: "memory");
}

static inline int arch_write_trylock(arch_rwlock_t *lock)
{
	unsigned long mask, tmp1, tmp2, result;

	mask = 0x80000000UL;

	__asm__ __volatile__(
"	mov	0, %2\n"
"1:	lduw	[%3], %0\n"
"	brnz,pn	%0, 2f\n"
"	or	%0, %4, %1\n"
"	cas	[%3], %0, %1\n"
"	cmp	%0, %1\n"
"	bne,pn	%%icc, 1b\n"
"	nop\n"
"	mov	1, %2\n"
"2:"
	: "=&r" (tmp1), "=&r" (tmp2), "=&r" (result)
	: "r" (lock), "r" (mask)
	: "memory");

	return result;
}
#include <asm/qrwlock.h>
#include <asm/qspinlock.h>

#define arch_read_lock_flags(p, f) arch_read_lock(p)
#define arch_write_lock_flags(p, f) arch_write_lock(p)

#define arch_read_can_lock(rw)	(!((rw)->lock & 0x80000000UL))
#define arch_write_can_lock(rw)	(!(rw)->lock)

#define arch_spin_relax(lock)	cpu_relax()
#define arch_read_relax(lock)	cpu_relax()
#define arch_write_relax(lock)	cpu_relax()
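The code removed above is a test-and-set (ldstub) spinlock and a cas-based rwlock in which every waiter spins on the same lock word, so under heavy contention all CPUs keep pulling the same cache line back and forth. The generic queued locks that replace them give each waiter its own queue node to spin on, which is the behavior behind the large-user-count gains in the benchmark table above. A minimal MCS-style sketch of that queued idea follows; it is an illustration only, not the kernel's qspinlock, which packs its state into a single 32-bit word:

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

/* One queue node per waiting thread; each waiter spins on its own
 * "locked" flag instead of on the shared lock word. */
struct mcs_node {
	struct mcs_node *_Atomic next;
	atomic_bool locked;
};

struct mcs_lock {
	struct mcs_node *_Atomic tail;	/* last waiter in the queue, or NULL */
};

static void mcs_lock_acquire(struct mcs_lock *l, struct mcs_node *me)
{
	struct mcs_node *prev;

	atomic_store(&me->next, NULL);
	atomic_store(&me->locked, true);
	prev = atomic_exchange(&l->tail, me);	/* append ourselves to the queue */
	if (prev) {
		atomic_store(&prev->next, me);	/* link in behind the previous waiter */
		while (atomic_load(&me->locked))
			;			/* spin on our own node only */
	}
}

static void mcs_lock_release(struct mcs_lock *l, struct mcs_node *me)
{
	struct mcs_node *next = atomic_load(&me->next);

	if (!next) {
		struct mcs_node *expected = me;

		/* No successor visible: if we are still the tail, empty the queue. */
		if (atomic_compare_exchange_strong(&l->tail, &expected, NULL))
			return;
		/* A successor is mid-enqueue; wait for it to link itself in. */
		while (!(next = atomic_load(&me->next)))
			;
	}
	atomic_store(&next->locked, false);	/* hand the lock to the next waiter */
}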
arch/sparc/include/asm/spinlock_types.h

@@ -1,20 +1,24 @@
#ifndef __SPARC_SPINLOCK_TYPES_H
#define __SPARC_SPINLOCK_TYPES_H

#ifndef __LINUX_SPINLOCK_TYPES_H
# error "please don't include this file directly"
#endif
#ifdef CONFIG_QUEUED_SPINLOCKS
#include <asm-generic/qspinlock_types.h>
#else

typedef struct {
	volatile unsigned char lock;
} arch_spinlock_t;

#define __ARCH_SPIN_LOCK_UNLOCKED	{ 0 }
#endif /* CONFIG_QUEUED_SPINLOCKS */

#ifdef CONFIG_QUEUED_RWLOCKS
#include <asm-generic/qrwlock_types.h>
#else
typedef struct {
	volatile unsigned int lock;
} arch_rwlock_t;

#define __ARCH_RW_LOCK_UNLOCKED		{ 0 }

#endif /* CONFIG_QUEUED_RWLOCKS */
#endif
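With CONFIG_QUEUED_SPINLOCKS and CONFIG_QUEUED_RWLOCKS selected, arch_spinlock_t and arch_rwlock_t are no longer the one-byte and one-word types shown above but the generic queued types. Roughly, their shape looks like the sketch below, modeled here with C11 atomics purely for illustration; the real definitions live in asm-generic/qspinlock_types.h and asm-generic/qrwlock_types.h and use the kernel's atomic_t:

#include <stdatomic.h>
#include <stdint.h>

/* Shape of the generic queued spinlock: one 32-bit word that packs the
 * locked byte, a pending bit and the tail of the waiter queue. */
struct qspinlock_like {
	_Atomic uint32_t val;
};

/* Shape of the generic queued rwlock: a reader/writer count word plus a
 * spinlock that queues contending lockers. */
struct qrwlock_like {
	_Atomic uint32_t cnts;
	struct qspinlock_like wait_lock;
};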
kernel/locking/qrwlock.c

@@ -20,6 +20,7 @@
#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/spinlock.h>
#include <asm/qrwlock.h>

/*