mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2024-12-28 11:18:45 +07:00
97e914b7de
The Cavium Octeon CPU uses a special sync instruction for implementing
wmb, and due to a CPU bug, the instruction must appear twice. A macro
had been defined to hide this:
#define __SYNC_rpt(type) (1 + (type == __SYNC_wmb))
which was intended to evaluate to 2 for __SYNC_wmb, and 1 for any other
type of sync. However, this expression is evaluated by the assembler,
and not the compiler, and the result of '==' in the assembler is 0 or
-1, not 0 or 1 as it is in C. The net result was wmb() producing no code
at all. The simple fix in this patch is to change the '+' to '-'.
Fixes: bf92927251
("MIPS: barrier: Add __SYNC() infrastructure")
Signed-off-by: Mark Tomlinson <mark.tomlinson@alliedtelesis.co.nz>
Tested-by: Chris Packham <chris.packham@alliedtelesis.co.nz>
Signed-off-by: Paul Burton <paulburton@kernel.org>
Cc: linux-mips@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
210 lines
7.6 KiB
C
210 lines
7.6 KiB
C
/* SPDX-License-Identifier: GPL-2.0-only */
|
|
#ifndef __MIPS_ASM_SYNC_H__
|
|
#define __MIPS_ASM_SYNC_H__
|
|
|
|
/*
 * sync types are defined by the MIPS64 Instruction Set documentation in Volume
 * II-A of the MIPS Architecture Reference Manual, which can be found here:
 *
 *   https://www.mips.com/?do-download=the-mips64-instruction-set-v6-06
 *
 * Two types of barrier are provided:
 *
 *   1) Completion barriers, which ensure that a memory operation has actually
 *      completed & often involve stalling the CPU pipeline to do so.
 *
 *   2) Ordering barriers, which only ensure that affected memory operations
 *      won't be reordered in the CPU pipeline in a manner that violates the
 *      restrictions imposed by the barrier.
 *
 * Ordering barriers can be more efficient than completion barriers, since:
 *
 *   a) Ordering barriers only require memory access instructions which precede
 *      them in program order (older instructions) to reach a point in the
 *      load/store datapath beyond which reordering is not possible before
 *      allowing memory access instructions which follow them (younger
 *      instructions) to be performed. That is, older instructions don't
 *      actually need to complete - they just need to get far enough that all
 *      other coherent CPUs will observe their completion before they observe
 *      the effects of younger instructions.
 *
 *   b) Multiple variants of ordering barrier are provided which allow the
 *      effects to be restricted to different combinations of older or younger
 *      loads or stores. By way of example, if we only care that stores older
 *      than a barrier are observed prior to stores that are younger than a
 *      barrier & don't care about the ordering of loads then the 'wmb'
 *      ordering barrier can be used. Limiting the barrier's effects to stores
 *      allows loads to continue unaffected & potentially allows the CPU to
 *      make progress faster than if younger loads had to wait for older stores
 *      to complete.
 */
|
|
|
|
/*
 * No sync instruction at all; used to allow code to nullify the effect of the
 * __SYNC() macro without needing lots of #ifdefery.
 *
 * ____SYNC() compares its type argument against this value and emits no sync
 * instruction when they match.
 */
#define __SYNC_none	-1
|
|
|
|
/*
 * A full completion barrier; all memory accesses appearing prior to this sync
 * instruction in program order must complete before any memory accesses
 * appearing after this sync instruction in program order.
 *
 * The value is used verbatim as the operand of the sync instruction.
 */
#define __SYNC_full	0x00
|
|
|
|
/*
 * For now we use a full completion barrier to implement all sync types, until
 * we're satisfied that lightweight ordering barriers defined by MIPSr6 are
 * sufficient to uphold our desired memory model.
 */
#define __SYNC_aq	__SYNC_full
#define __SYNC_rl	__SYNC_full
#define __SYNC_mb	__SYNC_full
|
|
|
|
/*
 * ...except on Cavium Octeon CPUs, which have been using the 'wmb' ordering
 * barrier since 2010 & omit 'rmb' barriers because the CPUs don't perform
 * speculative reads.
 *
 * On every other CPU both rmb & wmb fall back to the full completion barrier.
 */
#ifdef CONFIG_CPU_CAVIUM_OCTEON
# define __SYNC_rmb	__SYNC_none
# define __SYNC_wmb	0x04
#else
# define __SYNC_rmb	__SYNC_full
# define __SYNC_wmb	__SYNC_full
#endif
|
|
|
|
/*
 * A GINV sync is a little different; it doesn't relate directly to loads or
 * stores, but instead causes synchronization of an icache or TLB global
 * invalidation operation triggered by the ginvi or ginvt instructions
 * respectively. In cases where we need to know that a ginvi or ginvt operation
 * has been performed by all coherent CPUs, we must issue a sync instruction of
 * this type. Once this instruction graduates all coherent CPUs will have
 * observed the invalidation.
 */
#define __SYNC_ginv	0x14
|
|
|
|
/*
 * Trivial; indicate that we always need this sync instruction. This is a
 * constant non-zero "reason", so the reason test in ____SYNC() is always
 * satisfied.
 */
#define __SYNC_always	(1 << 0)
|
|
|
|
/*
 * Indicate that we need this sync instruction only on systems with weakly
 * ordered memory access. In general this is most MIPS systems, but there are
 * exceptions which provide strongly ordered memory.
 *
 * Evaluates to 0 (i.e. "no reason to sync") when CONFIG_WEAK_ORDERING is not
 * set.
 */
#ifdef CONFIG_WEAK_ORDERING
# define __SYNC_weak_ordering	(1 << 1)
#else
# define __SYNC_weak_ordering	0
#endif
|
|
|
|
/*
 * Indicate that we need this sync instruction only on systems where LL/SC
 * don't implicitly provide a memory barrier. In general this is most MIPS
 * systems.
 *
 * Evaluates to 0 (i.e. "no reason to sync") when
 * CONFIG_WEAK_REORDERING_BEYOND_LLSC is not set.
 */
#ifdef CONFIG_WEAK_REORDERING_BEYOND_LLSC
# define __SYNC_weak_llsc	(1 << 2)
#else
# define __SYNC_weak_llsc	0
#endif
|
|
|
|
/*
 * Some Loongson 3 CPUs have a bug wherein execution of a memory access (load,
 * store or prefetch) in between an LL & SC can cause the SC instruction to
 * erroneously succeed, breaking atomicity. Whilst it's unusual to write code
 * containing such sequences, this bug bites harder than we might otherwise
 * expect due to reordering & speculation:
 *
 * 1) A memory access appearing prior to the LL in program order may actually
 *    be executed after the LL - this is the reordering case.
 *
 *    In order to avoid this we need to place a memory barrier (ie. a SYNC
 *    instruction) prior to every LL instruction, in between it and any earlier
 *    memory access instructions.
 *
 *    This reordering case is fixed by 3A R2 CPUs, ie. 3A2000 models and later.
 *
 * 2) If a conditional branch exists between an LL & SC with a target outside
 *    of the LL-SC loop, for example an exit upon value mismatch in cmpxchg()
 *    or similar, then misprediction of the branch may allow speculative
 *    execution of memory accesses from outside of the LL-SC loop.
 *
 *    In order to avoid this we need a memory barrier (ie. a SYNC instruction)
 *    at each affected branch target.
 *
 *    This case affects all current Loongson 3 CPUs.
 *
 * The above described cases cause an error in the cache coherence protocol;
 * such that the Invalidate of a competing LL-SC goes 'missing' and SC
 * erroneously observes its core still has Exclusive state and lets the SC
 * proceed.
 *
 * Therefore the error only occurs on SMP systems.
 *
 * Keep the value a plain integer expression: ____SYNC() places it inside an
 * assembler .if directive, so C-only spellings such as integer suffixes or
 * casts must not be used here.
 */
#ifdef CONFIG_CPU_LOONGSON3_WORKAROUNDS
# define __SYNC_loongson3_war	(1 << 31)
#else
# define __SYNC_loongson3_war	0
#endif
|
|
|
|
/*
 * Some Cavium Octeon CPUs suffer from a bug that causes a single wmb ordering
 * barrier to be ineffective, requiring the use of 2 in sequence to provide an
 * effective barrier as noted by commit 6b07d38aaa52 ("MIPS: Octeon: Use
 * optimized memory barrier primitives."). Here we specify that the affected
 * sync instructions should be emitted twice.
 *
 * Note that this expression is evaluated by the assembler (not the compiler),
 * and that the assembler evaluates '==' as 0 or -1, not 0 or 1. Subtracting
 * the comparison result therefore yields 2 for __SYNC_wmb & 1 for every other
 * sync type; adding it would yield 0 for __SYNC_wmb & emit no code at all.
 *
 * The argument is parenthesised so that any expression passed as the type is
 * compared as a whole.
 */
#ifdef CONFIG_CPU_CAVIUM_OCTEON
# define __SYNC_rpt(type)	(1 - ((type) == __SYNC_wmb))
#else
# define __SYNC_rpt(type)	1
#endif
|
|
|
|
/*
 * The main event. Here we actually emit a sync instruction of a given type, if
 * reason is non-zero.
 *
 *   _type:   operand for the sync instruction; __SYNC_none suppresses it.
 *   _reason: assembler expression; the sync is emitted only if it is non-zero.
 *   _else:   assembly emitted instead when no sync instruction is emitted.
 *
 * The sync instruction is repeated __SYNC_rpt(_type) times & is assembled
 * under MIPS_ISA_LEVEL_RAW, with the previous assembler settings restored
 * afterwards by .set pop.
 *
 * NOTE(review): MIPS_ISA_LEVEL_RAW is not defined in this header; presumably
 * it is supplied by whatever includes this file - confirm.
 *
 * Without CONFIG_CPU_HAS_SYNC the macro expands to nothing at all, so _else is
 * not emitted either.
 *
 * In future we have the option of emitting entries in a fixups-style table
 * here that would allow us to opportunistically remove some sync instructions
 * when we detect at runtime that we're running on a CPU that doesn't need
 * them.
 */
#ifdef CONFIG_CPU_HAS_SYNC
# define ____SYNC(_type, _reason, _else)			\
	.if	(( _type ) != __SYNC_none) && ( _reason );	\
	.set	push;						\
	.set	MIPS_ISA_LEVEL_RAW;				\
	.rept	__SYNC_rpt(_type);				\
	sync	_type;						\
	.endr;							\
	.set	pop;						\
	.else;							\
	_else;							\
	.endif
#else
# define ____SYNC(_type, _reason, _else)
#endif
|
|
|
|
/*
 * Preprocessor magic to expand macros used as arguments before we insert them
 * into assembly code.
 *
 * The extra level of macro call lets the preprocessor fully expand type,
 * reason & else before ____SYNC() uses them. In assembly sources the
 * resulting directives are emitted directly; otherwise they are turned into a
 * string literal by __stringify().
 *
 * NOTE(review): __stringify() is not defined in this header; presumably the
 * including file provides it - confirm.
 */
#ifdef __ASSEMBLY__
# define ___SYNC(type, reason, else)				\
	____SYNC(type, reason, else)
#else
# define ___SYNC(type, reason, else)				\
	__stringify(____SYNC(type, reason, else))
#endif
|
|
|
|
/*
 * Public entry points. Both take the bare suffix of a sync type & of a reason
 * (for example "wmb" & "always"), paste the __SYNC_ prefix onto each & hand
 * the result to ___SYNC().
 *
 *   __SYNC(type, reason):            nothing is emitted in place of the sync
 *                                    when it is not needed.
 *   __SYNC_ELSE(type, reason, else): "else" is passed through as the
 *                                    alternative to emit when the sync is not.
 */
#define __SYNC(type, reason)					\
	___SYNC(__SYNC_##type, __SYNC_##reason, )
#define __SYNC_ELSE(type, reason, else)				\
	___SYNC(__SYNC_##type, __SYNC_##reason, else)
|
|
|
|
#endif /* __MIPS_ASM_SYNC_H__ */
|