linux_dsm_epyc7002/arch/blackfin/mach-bf561/atomic.S
Sonic Zhang 064cc44e62 Blackfin: SMP: kgdb: flush core internal write buffer before flushinv
KGDB single-stepping in an SMP kernel may hang forever in flushinv if no
CSYNC is issued first.  This is because the core's internal write buffers
need to be flushed before the data cache is invalidated, to make sure
instruction fetch does not get out of sync.

Signed-off-by: Sonic Zhang <sonic.zhang@analog.com>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
2011-01-10 07:18:21 -05:00


/*
* Copyright 2007-2008 Analog Devices Inc.
* Philippe Gerum <rpm@xenomai.org>
*
* Licensed under the GPL-2 or later.
*/
#include <linux/linkage.h>
#include <asm/blackfin.h>
#include <asm/cache.h>
#include <asm/asm-offsets.h>
#include <asm/rwlock.h>
#include <asm/cplb.h>
.text
.macro coreslot_loadaddr reg:req
\reg\().l = _corelock;
\reg\().h = _corelock;
.endm
.macro safe_testset addr:req, scratch:req
#if ANOMALY_05000477
cli \scratch;
testset (\addr);
sti \scratch;
#else
testset (\addr);
#endif
.endm
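/*
* Illustrative sketch (not part of the original source): the testset
* behaviour wrapped by safe_testset can be approximated in C as below.
* The helper name and types are invented for this sketch; on the real
* hardware the whole read-test-set sequence is a single atomic
* instruction, and ANOMALY_05000477 additionally requires interrupts
* to be disabled around it.
*
*   // hypothetical helper, for illustration only
*   static inline int testset_sketch(volatile unsigned char *byte)
*   {
*       unsigned char old = *byte;   // done atomically by the hardware
*       *byte = old | 0x80;          // TESTSET sets the MSB of the byte
*       return old == 0;             // CC: nonzero means the lock was free
*   }
*/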
/*
* r0 = address of atomic data to flush and invalidate (32bit).
*
* Clear interrupts and return the old mask.
* We assume that no atomic data can span cachelines.
*
* Clobbers: r2:0, p0
*/
ENTRY(_get_core_lock)
r1 = -L1_CACHE_BYTES;
r1 = r0 & r1;
cli r0;
coreslot_loadaddr p0;
.Lretry_corelock:
safe_testset p0, r2;
if cc jump .Ldone_corelock;
SSYNC(r2);
jump .Lretry_corelock
.Ldone_corelock:
p0 = r1;
/* flush core internal write buffer before invalidate dcache */
CSYNC(r2);
flushinv[p0];
SSYNC(r2);
rts;
ENDPROC(_get_core_lock)
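/*
* Rough C equivalent of _get_core_lock, for illustration only. All
* names below are invented for the sketch, testset_sketch() is the
* helper sketched above, and the flushinv_line_sketch() call stands in
* for the CSYNC/FLUSHINV/SSYNC sequence.
*
*   #define SKETCH_CACHE_LINE 32UL              // assumed L1 line size
*
*   extern volatile unsigned char corelock_sketch;     // shared lock byte
*   extern unsigned long irq_save_sketch(void);        // "cli r0"
*   extern void flushinv_line_sketch(unsigned long addr);
*
*   static unsigned long get_core_lock_sketch(void *atomic_data)
*   {
*       unsigned long flags = irq_save_sketch();
*
*       while (!testset_sketch(&corelock_sketch))
*           ;                                   // spin until we own the slot
*       // drain the core write buffer (CSYNC), then write back and
*       // invalidate the cache line holding the atomic data (FLUSHINV)
*       flushinv_line_sketch((unsigned long)atomic_data &
*                            ~(SKETCH_CACHE_LINE - 1));
*       return flags;
*   }
*/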
/*
* r0 = address of atomic data in uncacheable memory region (32bit).
*
* Clear interrupts and return the old mask.
*
* Clobbers: r0, r2, p0
*/
ENTRY(_get_core_lock_noflush)
cli r0;
coreslot_loadaddr p0;
.Lretry_corelock_noflush:
safe_testset p0, r2;
if cc jump .Ldone_corelock_noflush;
SSYNC(r2);
jump .Lretry_corelock_noflush
.Ldone_corelock_noflush:
rts;
ENDPROC(_get_core_lock_noflush)
/*
* r0 = interrupt mask to restore.
* r1 = address of atomic data to flush and invalidate (32bit).
*
* Interrupts are masked on entry (see _get_core_lock).
* Clobbers: r2:0, p0
*/
ENTRY(_put_core_lock)
/* Write-through cache assumed, so no flush needed here. */
coreslot_loadaddr p0;
r1 = 0;
[p0] = r1;
SSYNC(r2);
sti r0;
rts;
ENDPROC(_put_core_lock)
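/*
* Rough C equivalent of _put_core_lock, for illustration only (names
* invented, matching the sketch after _get_core_lock): with the cache
* assumed write-through, releasing the slot is a plain store, made
* visible by SSYNC before interrupts are restored.
*
*   extern void irq_restore_sketch(unsigned long flags);   // "sti r0"
*
*   static void put_core_lock_sketch(unsigned long flags)
*   {
*       corelock_sketch = 0;       // free the slot for the other core
*       // SSYNC here in the real code, so the store is visible first
*       irq_restore_sketch(flags);
*   }
*/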
#ifdef __ARCH_SYNC_CORE_DCACHE
ENTRY(___raw_smp_mark_barrier_asm)
[--sp] = rets;
[--sp] = ( r7:5 );
[--sp] = r0;
[--sp] = p1;
[--sp] = p0;
call _get_core_lock_noflush;
/*
* Calculate current core mask
*/
GET_CPUID(p1, r7);
r6 = 1;
r6 <<= r7;
/*
* Set the other cores' bits in the barrier mask. Don't change the current core's bit.
*/
p1.l = _barrier_mask;
p1.h = _barrier_mask;
r7 = [p1];
r5 = r7 & r6;
r7 = ~r6;
cc = r5 == 0;
if cc jump 1f;
r7 = r7 | r6;
1:
[p1] = r7;
SSYNC(r2);
call _put_core_lock;
p0 = [sp++];
p1 = [sp++];
r0 = [sp++];
( r7:5 ) = [sp++];
rets = [sp++];
rts;
ENDPROC(___raw_smp_mark_barrier_asm)
ENTRY(___raw_smp_check_barrier_asm)
[--sp] = rets;
[--sp] = ( r7:5 );
[--sp] = r0;
[--sp] = p1;
[--sp] = p0;
call _get_core_lock_noflush;
/*
* Calculate current core mask
*/
GET_CPUID(p1, r7);
r6 = 1;
r6 <<= r7;
/*
* Clear current core bit in barrier mask if it is set.
*/
p1.l = _barrier_mask;
p1.h = _barrier_mask;
r7 = [p1];
r5 = r7 & r6;
cc = r5 == 0;
if cc jump 1f;
r6 = ~r6;
r7 = r7 & r6;
[p1] = r7;
SSYNC(r2);
call _put_core_lock;
/*
* Invalidate the entire D-cache of current core.
*/
sp += -12;
call _resync_core_dcache
sp += 12;
jump 2f;
1:
call _put_core_lock;
2:
p0 = [sp++];
p1 = [sp++];
r0 = [sp++];
( r7:5 ) = [sp++];
rets = [sp++];
rts;
ENDPROC(___raw_smp_check_barrier_asm)
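/*
* Illustrative sketch (not part of the original source) of the barrier
* bookkeeping done by the two routines above, with invented names:
* barrier_mask_sketch stands in for _barrier_mask, and the core lock
* held around each update is omitted for brevity.
*
*   extern unsigned long barrier_mask_sketch;
*   extern void resync_core_dcache_sketch(void);
*
*   // mark: flag every core except this one as needing a D-cache resync
*   static void smp_mark_barrier_sketch(int cpu)
*   {
*       unsigned long mine = 1UL << cpu;
*       unsigned long others = ~mine;
*
*       if (barrier_mask_sketch & mine)
*           barrier_mask_sketch = others | mine;   // keep our own flag set
*       else
*           barrier_mask_sketch = others;          // our own flag stays clear
*   }
*
*   // check: if this core was flagged, clear the flag and invalidate the
*   // local D-cache so stale shared data gets re-read
*   static void smp_check_barrier_sketch(int cpu)
*   {
*       unsigned long mine = 1UL << cpu;
*
*       if (barrier_mask_sketch & mine) {
*           barrier_mask_sketch &= ~mine;
*           resync_core_dcache_sketch();
*       }
*   }
*/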
/*
* r0 = irqflags
* r1 = address of atomic data
*
* Clobbers: r2:0, p1:0
*/
_start_lock_coherent:
[--sp] = rets;
[--sp] = ( r7:6 );
r7 = r0;
p1 = r1;
/*
* Determine whether the atomic data was previously
* owned by another CPU (=r6).
*/
GET_CPUID(p0, r2);
r1 = 1;
r1 <<= r2;
r2 = ~r1;
r1 = [p1];
r1 >>= 28; /* CPU fingerprints are stored in the high nibble. */
r6 = r1 & r2;
r1 = [p1];
r1 <<= 4;
r1 >>= 4;
[p1] = r1;
/*
* Release the core lock now, but keep IRQs disabled while we are
* performing the remaining housekeeping chores for the current CPU.
*/
coreslot_loadaddr p0;
r1 = 0;
[p0] = r1;
/*
* If another CPU has owned the same atomic section before us,
* then our D-cached copy of the shared data protected by the
* current spin/write_lock may be obsolete.
*/
cc = r6 == 0;
if cc jump .Lcache_synced
/*
* Invalidate the entire D-cache of the current core.
*/
sp += -12;
call _resync_core_dcache
sp += 12;
.Lcache_synced:
SSYNC(r2);
sti r7;
( r7:6 ) = [sp++];
rets = [sp++];
rts
/*
* r0 = irqflags
* r1 = address of atomic data
*
* Clobbers: r2:0, p1:0
*/
_end_lock_coherent:
p1 = r1;
GET_CPUID(p0, r2);
r2 += 28;
r1 = 1;
r1 <<= r2;
r2 = [p1];
r2 = r1 | r2;
[p1] = r2;
r1 = p1;
jump _put_core_lock;
#endif /* __ARCH_SYNC_CORE_DCACHE */
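/*
* Illustrative sketch (not part of the original source) of the CPU
* "fingerprint" scheme used by _start_lock_coherent/_end_lock_coherent
* above, with invented names and the helpers sketched earlier. The top
* nibble of the lock word records which core last released the lock, so
* the next owner knows whether its D-cached copy may be stale.
*
*   static void start_lock_coherent_sketch(unsigned long flags,
*                                          volatile unsigned long *lock,
*                                          int cpu)
*   {
*       unsigned long mine = 1UL << cpu;
*       unsigned long owners = *lock >> 28;        // fingerprint nibble
*       int foreign_owner = (owners & ~mine) != 0;
*
*       *lock &= 0x0fffffffUL;                     // clear all fingerprints
*       corelock_sketch = 0;                       // drop the core lock,
*                                                  // IRQs stay masked
*       if (foreign_owner)
*           resync_core_dcache_sketch();           // our copy may be stale
*       irq_restore_sketch(flags);
*   }
*
*   static void end_lock_coherent_sketch(unsigned long flags,
*                                        volatile unsigned long *lock,
*                                        int cpu)
*   {
*       *lock |= 1UL << (cpu + 28);                // leave our fingerprint
*       put_core_lock_sketch(flags);
*   }
*/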
/*
* r0 = &spinlock->lock
*
* Clobbers: r3:0, p1:0
*/
ENTRY(___raw_spin_is_locked_asm)
p1 = r0;
[--sp] = rets;
call _get_core_lock;
r3 = [p1];
cc = bittst( r3, 0 );
r3 = cc;
r1 = p1;
call _put_core_lock;
rets = [sp++];
r0 = r3;
rts;
ENDPROC(___raw_spin_is_locked_asm)
/*
* r0 = &spinlock->lock
*
* Clobbers: r3:0, p1:0
*/
ENTRY(___raw_spin_lock_asm)
p1 = r0;
[--sp] = rets;
.Lretry_spinlock:
call _get_core_lock;
r1 = p1;
r2 = [p1];
cc = bittst( r2, 0 );
if cc jump .Lbusy_spinlock
#ifdef __ARCH_SYNC_CORE_DCACHE
r3 = p1;
bitset ( r2, 0 ); /* Raise the lock bit. */
[p1] = r2;
call _start_lock_coherent
#else
r2 = 1;
[p1] = r2;
call _put_core_lock;
#endif
rets = [sp++];
rts;
.Lbusy_spinlock:
/* We don't touch the atomic area if busy, so that the flush
will behave like a nop in _put_core_lock. */
call _put_core_lock;
SSYNC(r2);
r0 = p1;
jump .Lretry_spinlock
ENDPROC(___raw_spin_lock_asm)
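/*
* Rough C outline of the spinlock acquire path above, for illustration
* only; get_core_lock_sketch()/put_core_lock_sketch() are the invented
* helpers sketched earlier. All accesses to the lock word happen with
* the core lock held, so test-then-set on bit 0 is effectively atomic
* between the two cores.
*
*   static void spin_lock_sketch(volatile unsigned long *lock)
*   {
*       for (;;) {
*           unsigned long flags = get_core_lock_sketch((void *)lock);
*
*           if (!(*lock & 1UL)) {
*               // free: take it; with __ARCH_SYNC_CORE_DCACHE the real
*               // code goes through _start_lock_coherent here instead
*               *lock |= 1UL;
*               put_core_lock_sketch(flags);
*               return;
*           }
*           put_core_lock_sketch(flags);    // busy: release and retry
*       }
*   }
*/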
/*
* r0 = &spinlock->lock
*
* Clobbers: r3:0, p1:0
*/
ENTRY(___raw_spin_trylock_asm)
p1 = r0;
[--sp] = rets;
call _get_core_lock;
r1 = p1;
r3 = [p1];
cc = bittst( r3, 0 );
if cc jump .Lfailed_trylock
#ifdef __ARCH_SYNC_CORE_DCACHE
bitset ( r3, 0 ); /* Raise the lock bit. */
[p1] = r3;
call _start_lock_coherent
#else
r2 = 1;
[p1] = r2;
call _put_core_lock;
#endif
r0 = 1;
rets = [sp++];
rts;
.Lfailed_trylock:
call _put_core_lock;
r0 = 0;
rets = [sp++];
rts;
ENDPROC(___raw_spin_trylock_asm)
/*
* r0 = &spinlock->lock
*
* Clobbers: r2:0, p1:0
*/
ENTRY(___raw_spin_unlock_asm)
p1 = r0;
[--sp] = rets;
call _get_core_lock;
r2 = [p1];
bitclr ( r2, 0 );
[p1] = r2;
r1 = p1;
#ifdef __ARCH_SYNC_CORE_DCACHE
call _end_lock_coherent
#else
call _put_core_lock;
#endif
rets = [sp++];
rts;
ENDPROC(___raw_spin_unlock_asm)
/*
* r0 = &rwlock->lock
*
* Clobbers: r2:0, p1:0
*/
ENTRY(___raw_read_lock_asm)
p1 = r0;
[--sp] = rets;
call _get_core_lock;
.Lrdlock_try:
r1 = [p1];
r1 += -1;
[p1] = r1;
cc = r1 < 0;
if cc jump .Lrdlock_failed
r1 = p1;
#ifdef __ARCH_SYNC_CORE_DCACHE
call _start_lock_coherent
#else
call _put_core_lock;
#endif
rets = [sp++];
rts;
.Lrdlock_failed:
r1 += 1;
[p1] = r1;
.Lrdlock_wait:
r1 = p1;
call _put_core_lock;
SSYNC(r2);
r0 = p1;
call _get_core_lock;
r1 = [p1];
cc = r1 < 2;
if cc jump .Lrdlock_wait;
jump .Lrdlock_try
ENDPROC(___raw_read_lock_asm)
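/*
* Rough C outline of the reader path above, for illustration only
* (helpers as sketched earlier). The rwlock counter starts at
* RW_LOCK_BIAS; each reader decrements it, and a writer grabs the whole
* bias, so a decrement that goes negative means a writer is in the way.
*
*   static void read_lock_sketch(volatile long *lock)
*   {
*       unsigned long flags = get_core_lock_sketch((void *)lock);
*
*       for (;;) {
*           long v = *lock - 1;
*
*           *lock = v;
*           if (v >= 0) {                   // no writer: we are in
*               put_core_lock_sketch(flags);
*               return;
*           }
*           *lock = v + 1;                  // undo, then wait for the
*           do {                            // writer to go away
*               put_core_lock_sketch(flags);
*               flags = get_core_lock_sketch((void *)lock);
*           } while (*lock < 2);
*       }
*   }
*/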
/*
* r0 = &rwlock->lock
*
* Clobbers: r3:0, p1:0
*/
ENTRY(___raw_read_trylock_asm)
p1 = r0;
[--sp] = rets;
call _get_core_lock;
r1 = [p1];
cc = r1 <= 0;
if cc jump .Lfailed_tryrdlock;
r1 += -1;
[p1] = r1;
r1 = p1;
#ifdef __ARCH_SYNC_CORE_DCACHE
call _start_lock_coherent
#else
call _put_core_lock;
#endif
rets = [sp++];
r0 = 1;
rts;
.Lfailed_tryrdlock:
r1 = p1;
call _put_core_lock;
rets = [sp++];
r0 = 0;
rts;
ENDPROC(___raw_read_trylock_asm)
/*
* r0 = &rwlock->lock
*
* Note: Processing protected by a reader lock should not have any
* side effects on the cache state seen by the other core, so we
* just release the core lock and exit (no _end_lock_coherent).
*
* Clobbers: r3:0, p1:0
*/
ENTRY(___raw_read_unlock_asm)
p1 = r0;
[--sp] = rets;
call _get_core_lock;
r1 = [p1];
r1 += 1;
[p1] = r1;
r1 = p1;
call _put_core_lock;
rets = [sp++];
rts;
ENDPROC(___raw_read_unlock_asm)
/*
* r0 = &rwlock->lock
*
* Clobbers: r3:0, p1:0
*/
ENTRY(___raw_write_lock_asm)
p1 = r0;
r3.l = lo(RW_LOCK_BIAS);
r3.h = hi(RW_LOCK_BIAS);
[--sp] = rets;
call _get_core_lock;
.Lwrlock_try:
r1 = [p1];
r1 = r1 - r3;
#ifdef __ARCH_SYNC_CORE_DCACHE
r2 = r1;
r2 <<= 4;
r2 >>= 4;
cc = r2 == 0;
#else
cc = r1 == 0;
#endif
if !cc jump .Lwrlock_wait
[p1] = r1;
r1 = p1;
#ifdef __ARCH_SYNC_CORE_DCACHE
call _start_lock_coherent
#else
call _put_core_lock;
#endif
rets = [sp++];
rts;
.Lwrlock_wait:
r1 = p1;
call _put_core_lock;
SSYNC(r2);
r0 = p1;
call _get_core_lock;
r1 = [p1];
#ifdef __ARCH_SYNC_CORE_DCACHE
r1 <<= 4;
r1 >>= 4;
#endif
cc = r1 == r3;
if !cc jump .Lwrlock_wait;
jump .Lwrlock_try
ENDPROC(___raw_write_lock_asm)
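/*
* Rough C outline of the writer path above, for illustration only
* (helpers as sketched earlier; the bias value is an assumption). A
* writer claims the lock by subtracting the whole RW_LOCK_BIAS; the
* result is zero only when no reader and no other writer holds the
* lock. The top "fingerprint" nibble is masked out of the comparison
* when __ARCH_SYNC_CORE_DCACHE is enabled.
*
*   #define RW_LOCK_BIAS_SKETCH 0x01000000L        // assumed bias value
*
*   static void write_lock_sketch(volatile long *lock)
*   {
*       unsigned long flags = get_core_lock_sketch((void *)lock);
*
*       for (;;) {
*           long v = *lock - RW_LOCK_BIAS_SKETCH;
*
*           if ((v & 0x0fffffffL) == 0) {          // lock is uncontended
*               *lock = v;                         // claim it, keeping the
*               put_core_lock_sketch(flags);       // fingerprint bits
*               return;
*           }
*           do {                                   // wait for full release
*               put_core_lock_sketch(flags);
*               flags = get_core_lock_sketch((void *)lock);
*           } while ((*lock & 0x0fffffffL) != RW_LOCK_BIAS_SKETCH);
*       }
*   }
*/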
/*
* r0 = &rwlock->lock
*
* Clobbers: r3:0, p1:0
*/
ENTRY(___raw_write_trylock_asm)
p1 = r0;
[--sp] = rets;
call _get_core_lock;
r1 = [p1];
r2.l = lo(RW_LOCK_BIAS);
r2.h = hi(RW_LOCK_BIAS);
cc = r1 == r2;
if !cc jump .Lfailed_trywrlock;
#ifdef __ARCH_SYNC_CORE_DCACHE
r1 >>= 28;
r1 <<= 28;
#else
r1 = 0;
#endif
[p1] = r1;
r1 = p1;
#ifdef __ARCH_SYNC_CORE_DCACHE
call _start_lock_coherent
#else
call _put_core_lock;
#endif
rets = [sp++];
r0 = 1;
rts;
.Lfailed_trywrlock:
r1 = p1;
call _put_core_lock;
rets = [sp++];
r0 = 0;
rts;
ENDPROC(___raw_write_trylock_asm)
/*
* r0 = &rwlock->lock
*
* Clobbers: r3:0, p1:0
*/
ENTRY(___raw_write_unlock_asm)
p1 = r0;
r3.l = lo(RW_LOCK_BIAS);
r3.h = hi(RW_LOCK_BIAS);
[--sp] = rets;
call _get_core_lock;
r1 = [p1];
r1 = r1 + r3;
[p1] = r1;
r1 = p1;
#ifdef __ARCH_SYNC_CORE_DCACHE
call _end_lock_coherent
#else
call _put_core_lock;
#endif
rets = [sp++];
rts;
ENDPROC(___raw_write_unlock_asm)
/*
* r0 = ptr
* r1 = value
*
* Add a signed value to a 32bit word and return the new value atomically.
* Clobbers: r3:0, p1:0
*/
ENTRY(___raw_atomic_update_asm)
p1 = r0;
r3 = r1;
[--sp] = rets;
call _get_core_lock;
r2 = [p1];
r3 = r3 + r2;
[p1] = r3;
r1 = p1;
call _put_core_lock;
r0 = r3;
rets = [sp++];
rts;
ENDPROC(___raw_atomic_update_asm)
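/*
* Illustrative sketch (not part of the original source) of the pattern
* shared by the ___raw_atomic_*_asm routines: a plain read-modify-write
* on the 32bit word, made atomic between the cores by holding the core
* lock (and by the flushinv done while taking it). Helper names are the
* invented ones sketched earlier.
*
*   static int atomic_update_sketch(volatile int *ptr, int delta)
*   {
*       unsigned long flags = get_core_lock_sketch((void *)ptr);
*       int new = *ptr + delta;    // clear/set/xor variants change only
*                                  // this line
*       *ptr = new;
*       put_core_lock_sketch(flags);
*       return new;                // update returns the new value; the
*   }                              // mask variants return the old one
*/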
/*
* r0 = ptr
* r1 = mask
*
* Clear the mask bits from a 32bit word and return the old 32bit value
* atomically.
* Clobbers: r3:0, p1:0
*/
ENTRY(___raw_atomic_clear_asm)
p1 = r0;
r3 = ~r1;
[--sp] = rets;
call _get_core_lock;
r2 = [p1];
r3 = r2 & r3;
[p1] = r3;
r3 = r2;
r1 = p1;
call _put_core_lock;
r0 = r3;
rets = [sp++];
rts;
ENDPROC(___raw_atomic_clear_asm)
/*
* r0 = ptr
* r1 = mask
*
* Set the mask bits into a 32bit word and return the old 32bit value
* atomically.
* Clobbers: r3:0, p1:0
*/
ENTRY(___raw_atomic_set_asm)
p1 = r0;
r3 = r1;
[--sp] = rets;
call _get_core_lock;
r2 = [p1];
r3 = r2 | r3;
[p1] = r3;
r3 = r2;
r1 = p1;
call _put_core_lock;
r0 = r3;
rets = [sp++];
rts;
ENDPROC(___raw_atomic_set_asm)
/*
* r0 = ptr
* r1 = mask
*
* XOR the mask bits with a 32bit word and return the old 32bit value
* atomically.
* Clobbers: r3:0, p1:0
*/
ENTRY(___raw_atomic_xor_asm)
p1 = r0;
r3 = r1;
[--sp] = rets;
call _get_core_lock;
r2 = [p1];
r3 = r2 ^ r3;
[p1] = r3;
r3 = r2;
r1 = p1;
call _put_core_lock;
r0 = r3;
rets = [sp++];
rts;
ENDPROC(___raw_atomic_xor_asm)
/*
* r0 = ptr
* r1 = mask
*
* Perform a logical AND between the mask bits and a 32bit word, and
* return the masked value. We need this on this architecture in
* order to invalidate the local cache before testing.
*
* Clobbers: r3:0, p1:0
*/
ENTRY(___raw_atomic_test_asm)
p1 = r0;
r3 = r1;
r1 = -L1_CACHE_BYTES;
r1 = r0 & r1;
p0 = r1;
/* flush core internal write buffer before invalidate dcache */
CSYNC(r2);
flushinv[p0];
SSYNC(r2);
r0 = [p1];
r0 = r0 & r3;
rts;
ENDPROC(___raw_atomic_test_asm)
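/*
* Illustrative note (not part of the original source):
* ___raw_atomic_test_asm above and ___raw_uncached_fetch_asm at the end
* of this file take no lock at all; they only make sure the local cache
* line is written back and invalidated before the load, so the read
* observes the other core's latest store. In rough C, with the invented
* helpers sketched earlier:
*
*   static int atomic_test_sketch(volatile int *ptr, int mask)
*   {
*       flushinv_line_sketch((unsigned long)ptr &
*                            ~(SKETCH_CACHE_LINE - 1));
*       return *ptr & mask;        // read the freshly fetched value
*   }
*/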
/*
* r0 = ptr
* r1 = value
*
* Swap *ptr with value and return the old 32bit value atomically.
* Clobbers: r3:0, p1:0
*/
#define __do_xchg(src, dst) \
p1 = r0; \
r3 = r1; \
[--sp] = rets; \
call _get_core_lock; \
r2 = src; \
dst = r3; \
r3 = r2; \
r1 = p1; \
call _put_core_lock; \
r0 = r3; \
rets = [sp++]; \
rts;
ENTRY(___raw_xchg_1_asm)
__do_xchg(b[p1] (z), b[p1])
ENDPROC(___raw_xchg_1_asm)
ENTRY(___raw_xchg_2_asm)
__do_xchg(w[p1] (z), w[p1])
ENDPROC(___raw_xchg_2_asm)
ENTRY(___raw_xchg_4_asm)
__do_xchg([p1], [p1])
ENDPROC(___raw_xchg_4_asm)
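/*
* Rough C equivalent of the __do_xchg expansion, for illustration only
* (invented helpers as sketched earlier); the 1/2/4 byte variants
* differ only in the access width of the load and store.
*
*   static unsigned int xchg4_sketch(volatile unsigned int *ptr,
*                                    unsigned int value)
*   {
*       unsigned long flags = get_core_lock_sketch((void *)ptr);
*       unsigned int old = *ptr;
*
*       *ptr = value;
*       put_core_lock_sketch(flags);
*       return old;
*   }
*/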
/*
* r0 = ptr
* r1 = new
* r2 = old
*
* Swap *ptr with new if *ptr == old and return the previous *ptr
* value atomically.
*
* Clobbers: r3:0, p1:0
*/
#define __do_cmpxchg(src, dst) \
[--sp] = rets; \
[--sp] = r4; \
p1 = r0; \
r3 = r1; \
r4 = r2; \
call _get_core_lock; \
r2 = src; \
cc = r2 == r4; \
if !cc jump 1f; \
dst = r3; \
1: r3 = r2; \
r1 = p1; \
call _put_core_lock; \
r0 = r3; \
r4 = [sp++]; \
rets = [sp++]; \
rts;
ENTRY(___raw_cmpxchg_1_asm)
__do_cmpxchg(b[p1] (z), b[p1])
ENDPROC(___raw_cmpxchg_1_asm)
ENTRY(___raw_cmpxchg_2_asm)
__do_cmpxchg(w[p1] (z), w[p1])
ENDPROC(___raw_cmpxchg_2_asm)
ENTRY(___raw_cmpxchg_4_asm)
__do_cmpxchg([p1], [p1])
ENDPROC(___raw_cmpxchg_4_asm)
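/*
* Rough C equivalent of the __do_cmpxchg expansion, for illustration
* only (invented helpers as sketched earlier): the store happens only
* when the current value matches the expected one, and the previous
* value is always returned.
*
*   static unsigned int cmpxchg4_sketch(volatile unsigned int *ptr,
*                                       unsigned int new, unsigned int old)
*   {
*       unsigned long flags = get_core_lock_sketch((void *)ptr);
*       unsigned int cur = *ptr;
*
*       if (cur == old)
*           *ptr = new;
*       put_core_lock_sketch(flags);
*       return cur;
*   }
*/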
/*
* r0 = ptr
* r1 = bitnr
*
* Set a bit in a 32bit word and return the old 32bit value atomically.
* Clobbers: r3:0, p1:0
*/
ENTRY(___raw_bit_set_asm)
r2 = r1;
r1 = 1;
r1 <<= r2;
jump ___raw_atomic_set_asm
ENDPROC(___raw_bit_set_asm)
/*
* r0 = ptr
* r1 = bitnr
*
* Clear a bit in a 32bit word and return the old 32bit value atomically.
* Clobbers: r3:0, p1:0
*/
ENTRY(___raw_bit_clear_asm)
r2 = r1;
r1 = 1;
r1 <<= r2;
jump ___raw_atomic_clear_asm
ENDPROC(___raw_bit_clear_asm)
/*
* r0 = ptr
* r1 = bitnr
*
* Toggle a bit in a 32bit word and return the old 32bit value atomically.
* Clobbers: r3:0, p1:0
*/
ENTRY(___raw_bit_toggle_asm)
r2 = r1;
r1 = 1;
r1 <<= r2;
jump ___raw_atomic_xor_asm
ENDPROC(___raw_bit_toggle_asm)
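/*
* Illustrative note (not part of the original source): the three bit
* operations above simply turn the bit number into a mask and tail-call
* the matching word-wide atomic routine. In rough C, with an invented
* stand-in for ___raw_atomic_set_asm:
*
*   extern int atomic_set_sketch(volatile int *ptr, int mask);
*
*   static int bit_set_sketch(volatile int *ptr, int bitnr)
*   {
*       return atomic_set_sketch(ptr, 1 << bitnr);   // old 32bit value
*   }
*/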
/*
* r0 = ptr
* r1 = bitnr
*
* Test-and-set a bit in a 32bit word and return the old bit value atomically.
* Clobbers: r3:0, p1:0
*/
ENTRY(___raw_bit_test_set_asm)
[--sp] = rets;
[--sp] = r1;
call ___raw_bit_set_asm
r1 = [sp++];
r2 = 1;
r2 <<= r1;
r0 = r0 & r2;
cc = r0 == 0;
if cc jump 1f
r0 = 1;
1:
rets = [sp++];
rts;
ENDPROC(___raw_bit_test_set_asm)
/*
* r0 = ptr
* r1 = bitnr
*
* Test-and-clear a bit in a 32bit word and return the old bit value atomically.
* Clobbers: r3:0, p1:0
*/
ENTRY(___raw_bit_test_clear_asm)
[--sp] = rets;
[--sp] = r1;
call ___raw_bit_clear_asm
r1 = [sp++];
r2 = 1;
r2 <<= r1;
r0 = r0 & r2;
cc = r0 == 0;
if cc jump 1f
r0 = 1;
1:
rets = [sp++];
rts;
ENDPROC(___raw_bit_test_clear_asm)
/*
* r0 = ptr
* r1 = bitnr
*
* Test-and-toggle a bit in a 32bit word,
* and return the old bit value atomically.
* Clobbers: r3:0, p1:0
*/
ENTRY(___raw_bit_test_toggle_asm)
[--sp] = rets;
[--sp] = r1;
call ___raw_bit_toggle_asm
r1 = [sp++];
r2 = 1;
r2 <<= r1;
r0 = r0 & r2;
cc = r0 == 0;
if cc jump 1f
r0 = 1;
1:
rets = [sp++];
rts;
ENDPROC(___raw_bit_test_toggle_asm)
/*
* r0 = ptr
* r1 = bitnr
*
* Test a bit in a 32bit word and return its value.
* We need this on this architecture in order to invalidate
* the local cache before testing.
*
* Clobbers: r3:0, p1:0
*/
ENTRY(___raw_bit_test_asm)
r2 = r1;
r1 = 1;
r1 <<= r2;
jump ___raw_atomic_test_asm
ENDPROC(___raw_bit_test_asm)
/*
* r0 = ptr
*
* Fetch and return an uncached 32bit value.
*
* Clobbers: r2:0, p1:0
*/
ENTRY(___raw_uncached_fetch_asm)
p1 = r0;
r1 = -L1_CACHE_BYTES;
r1 = r0 & r1;
p0 = r1;
/* flush core internal write buffer before invalidate dcache */
CSYNC(r2);
flushinv[p0];
SSYNC(r2);
r0 = [p1];
rts;
ENDPROC(___raw_uncached_fetch_asm)