linux_dsm_epyc7002/arch/powerpc/include/asm/xive.h
Paul Mackerras da15c03b04 powerpc/xive: Implement get_irqchip_state method for XIVE to fix shutdown race
Testing has revealed the existence of a race condition where a XIVE
interrupt being shut down can be in one of the XIVE interrupt queues
(of which there are up to 8 per CPU, one for each priority) at the
point where free_irq() is called.  If this happens, can return an
interrupt number which has been shut down.  This can lead to various
symptoms:

- irq_to_desc(irq) can be NULL.  In this case, no end-of-interrupt
  function gets called, resulting in the CPU's elevated interrupt
  priority (numerically lowered CPPR) never gets reset.  That then
  means that the CPU stops processing interrupts, causing device
  timeouts and other errors in various device drivers.

- The irq descriptor or related data structures can be in the process
  of being freed as the interrupt code is using them.  This typically
  leads to crashes due to bad pointer dereferences.

This race is basically what commit 62e0468650 ("genirq: Add optional
hardware synchronization for shutdown", 2019-06-28) is intended to
fix, given a get_irqchip_state() method for the interrupt controller
being used.  It works by polling the interrupt controller when an
interrupt is being freed until the controller says it is not pending.

With XIVE, the PQ bits of the interrupt source indicate the state of
the interrupt source, and in particular the P bit goes from 0 to 1 at
the point where the hardware writes an entry into the interrupt queue
that this interrupt is directed towards.  Normally, the code will then
process the interrupt and do an end-of-interrupt (EOI) operation which
will reset PQ to 00 (assuming another interrupt hasn't been generated
in the meantime).  However, there are situations where the code resets
P even though a queue entry exists (for example, by setting PQ to 01,
which disables the interrupt source), and also situations where the
code leaves P at 1 after removing the queue entry (for example, this
is done for escalation interrupts so they cannot fire again until
they are explicitly re-enabled).

The code already has a 'saved_p' flag for the interrupt source which
indicates that a queue entry exists, although it isn't maintained
consistently.  This patch adds a 'stale_p' flag to indicate that
P has been left at 1 after processing a queue entry, and adds code
to set and clear saved_p and stale_p as necessary to maintain a
consistent indication of whether a queue entry may or may not exist.

With this, we can implement xive_get_irqchip_state() by looking at
stale_p, saved_p and the ESB PQ bits for the interrupt.

There is some additional code to handle escalation interrupts
properly; because they are enabled and disabled in KVM assembly code,
which does not have access to the xive_irq_data struct for the
escalation interrupt.  Hence, stale_p may be incorrect when the
escalation interrupt is freed in kvmppc_xive_{,native_}cleanup_vcpu().
Fortunately, we can fix it up by looking at vcpu->arch.xive_esc_on,
with some careful attention to barriers in order to ensure the correct
result if xive_esc_irq() races with kvmppc_xive_cleanup_vcpu().

Finally, this adds code to make noise on the console (pr_crit and
WARN_ON(1)) if we find an interrupt queue entry for an interrupt
which does not have a descriptor.  While this won't catch the race
reliably, if it does get triggered it will be an indication that
the race is occurring and needs to be debugged.

Fixes: 243e25112d ("powerpc/xive: Native exploitation of the XIVE interrupt controller")
Cc: stable@vger.kernel.org # v4.12+
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190813100648.GE9567@blackberry
2019-08-16 14:16:59 +10:00

159 lines
4.9 KiB
C

/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Copyright 2016,2017 IBM Corporation.
*/
#ifndef _ASM_POWERPC_XIVE_H
#define _ASM_POWERPC_XIVE_H
#define XIVE_INVALID_VP 0xffffffff
#ifdef CONFIG_PPC_XIVE
/*
* Thread Interrupt Management Area (TIMA)
*
* This is a global MMIO region divided in 4 pages of varying access
* permissions, providing access to per-cpu interrupt management
* functions. It always identifies the CPU doing the access based
* on the PowerBus initiator ID, thus we always access via the
* same offset regardless of where the code is executing
*/
extern void __iomem *xive_tima;
extern unsigned long xive_tima_os;
/*
* Offset in the TM area of our current execution level (provided by
* the backend)
*/
extern u32 xive_tima_offset;
/*
* Per-irq data (irq_get_handler_data for normal IRQs), IPIs
* have it stored in the xive_cpu structure. We also cache
* for normal interrupts the current target CPU.
*
* This structure is setup by the backend for each interrupt.
*/
struct xive_irq_data {
u64 flags;
u64 eoi_page;
void __iomem *eoi_mmio;
u64 trig_page;
void __iomem *trig_mmio;
u32 esb_shift;
int src_chip;
u32 hw_irq;
/* Setup/used by frontend */
int target;
/*
* saved_p means that there is a queue entry for this interrupt
* in some CPU's queue (not including guest vcpu queues), even
* if P is not set in the source ESB.
* stale_p means that there is no queue entry for this interrupt
* in some CPU's queue, even if P is set in the source ESB.
*/
bool saved_p;
bool stale_p;
};
#define XIVE_IRQ_FLAG_STORE_EOI 0x01
#define XIVE_IRQ_FLAG_LSI 0x02
#define XIVE_IRQ_FLAG_SHIFT_BUG 0x04
#define XIVE_IRQ_FLAG_MASK_FW 0x08
#define XIVE_IRQ_FLAG_EOI_FW 0x10
#define XIVE_IRQ_FLAG_H_INT_ESB 0x20
/* Special flag set by KVM for excalation interrupts */
#define XIVE_IRQ_NO_EOI 0x80
#define XIVE_INVALID_CHIP_ID -1
/* A queue tracking structure in a CPU */
struct xive_q {
__be32 *qpage;
u32 msk;
u32 idx;
u32 toggle;
u64 eoi_phys;
u32 esc_irq;
atomic_t count;
atomic_t pending_count;
u64 guest_qaddr;
u32 guest_qshift;
};
/* Global enable flags for the XIVE support */
extern bool __xive_enabled;
static inline bool xive_enabled(void) { return __xive_enabled; }
extern bool xive_spapr_init(void);
extern bool xive_native_init(void);
extern void xive_smp_probe(void);
extern int xive_smp_prepare_cpu(unsigned int cpu);
extern void xive_smp_setup_cpu(void);
extern void xive_smp_disable_cpu(void);
extern void xive_teardown_cpu(void);
extern void xive_shutdown(void);
extern void xive_flush_interrupt(void);
/* xmon hook */
extern void xmon_xive_do_dump(int cpu);
/* APIs used by KVM */
extern u32 xive_native_default_eq_shift(void);
extern u32 xive_native_alloc_vp_block(u32 max_vcpus);
extern void xive_native_free_vp_block(u32 vp_base);
extern int xive_native_populate_irq_data(u32 hw_irq,
struct xive_irq_data *data);
extern void xive_cleanup_irq_data(struct xive_irq_data *xd);
extern u32 xive_native_alloc_irq(void);
extern void xive_native_free_irq(u32 irq);
extern int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq);
extern int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio,
__be32 *qpage, u32 order, bool can_escalate);
extern void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio);
extern void xive_native_sync_source(u32 hw_irq);
extern void xive_native_sync_queue(u32 hw_irq);
extern bool is_xive_irq(struct irq_chip *chip);
extern int xive_native_enable_vp(u32 vp_id, bool single_escalation);
extern int xive_native_disable_vp(u32 vp_id);
extern int xive_native_get_vp_info(u32 vp_id, u32 *out_cam_id, u32 *out_chip_id);
extern bool xive_native_has_single_escalation(void);
extern int xive_native_get_queue_info(u32 vp_id, uint32_t prio,
u64 *out_qpage,
u64 *out_qsize,
u64 *out_qeoi_page,
u32 *out_escalate_irq,
u64 *out_qflags);
extern int xive_native_get_queue_state(u32 vp_id, uint32_t prio, u32 *qtoggle,
u32 *qindex);
extern int xive_native_set_queue_state(u32 vp_id, uint32_t prio, u32 qtoggle,
u32 qindex);
extern int xive_native_get_vp_state(u32 vp_id, u64 *out_state);
#else
static inline bool xive_enabled(void) { return false; }
static inline bool xive_spapr_init(void) { return false; }
static inline bool xive_native_init(void) { return false; }
static inline void xive_smp_probe(void) { }
static inline int xive_smp_prepare_cpu(unsigned int cpu) { return -EINVAL; }
static inline void xive_smp_setup_cpu(void) { }
static inline void xive_smp_disable_cpu(void) { }
static inline void xive_kexec_teardown_cpu(int secondary) { }
static inline void xive_shutdown(void) { }
static inline void xive_flush_interrupt(void) { }
static inline u32 xive_native_alloc_vp_block(u32 max_vcpus) { return XIVE_INVALID_VP; }
static inline void xive_native_free_vp_block(u32 vp_base) { }
#endif
#endif /* _ASM_POWERPC_XIVE_H */