mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2024-12-28 11:18:45 +07:00
4ae2182b1e
A Root Port's AER structure (rpc) contains a queue of events. aer_irq() enqueues AER status information and schedules aer_isr() to dequeue and process it. When we remove a device, aer_remove() waits for the queue to be empty, then frees the rpc struct.

But aer_isr() references the rpc struct after dequeueing and possibly emptying the queue, which can cause a use-after-free error as in the following scenario with two threads, aer_isr() on the left and a concurrent aer_remove() on the right:

  Thread A                        Thread B
  --------                        --------
  aer_irq():
    rpc->prod_idx++
                                  aer_remove():
                                    wait_event(rpc->prod_idx == rpc->cons_idx)
                                    # now blocked until queue becomes empty
  aer_isr():                        # ...
    rpc->cons_idx++                 # unblocked because queue is now empty
    ...                             kfree(rpc)
    mutex_unlock(&rpc->rpc_mutex)

To prevent this problem, use flush_work() to wait until the last scheduled instance of aer_isr() has completed before freeing the rpc struct in aer_remove().

I reproduced this use-after-free by flashing a device FPGA and re-enumerating the bus to find the new device. With SLUB debug, this crashes with 0x6b bytes (POISON_FREE, the use-after-free magic number) in GPR25:

  pcieport 0000:00:00.0: AER: Multiple Corrected error received: id=0000
  Unable to handle kernel paging request for data at address 0x27ef9e3e
  Workqueue: events aer_isr
  GPR24: dd6aa000 6b6b6b6b 605f8378 605f8360 d99b12c0 604fc674 606b1704 d99b12c0
  NIP [602f5328] pci_walk_bus+0xd4/0x104

[bhelgaas: changelog, stable tag]
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
CC: stable@vger.kernel.org
132 lines
3.3 KiB
C
132 lines
3.3 KiB
C
/*
|
|
* Copyright (C) 2006 Intel Corp.
|
|
* Tom Long Nguyen (tom.l.nguyen@intel.com)
|
|
* Zhang Yanmin (yanmin.zhang@intel.com)
|
|
*
|
|
*/
|
|
|
|
#ifndef _AERDRV_H_
|
|
#define _AERDRV_H_
|
|
|
|
#include <linux/workqueue.h>
|
|
#include <linux/pcieport_if.h>
|
|
#include <linux/aer.h>
|
|
#include <linux/interrupt.h>
|
|
|
|
/*
 * Combined mask of the PCI_EXP_RTCTL_SECEE, PCI_EXP_RTCTL_SENFEE and
 * PCI_EXP_RTCTL_SEFEE bits.
 */
#define SYSTEM_ERROR_INTR_ON_MESG_MASK	(PCI_EXP_RTCTL_SECEE|	\
					PCI_EXP_RTCTL_SENFEE|	\
					PCI_EXP_RTCTL_SEFEE)
|
|
/*
 * Combined mask of the PCI_ERR_ROOT_CMD_COR_EN, PCI_ERR_ROOT_CMD_NONFATAL_EN
 * and PCI_ERR_ROOT_CMD_FATAL_EN bits.
 */
#define ROOT_PORT_INTR_ON_MESG_MASK	(PCI_ERR_ROOT_CMD_COR_EN|	\
					PCI_ERR_ROOT_CMD_NONFATAL_EN|	\
					PCI_ERR_ROOT_CMD_FATAL_EN)
|
|
/*
 * Extract the two halves of a combined error-source value:
 *   ERR_COR_ID(d)   - the low 16 bits of d
 *   ERR_UNCOR_ID(d) - d shifted right by 16 bits
 *
 * The argument is parenthesized so that an expression argument such as
 * (a | b) is evaluated as a whole before the mask/shift is applied.
 */
#define ERR_COR_ID(d)			((d) & 0xffff)
#define ERR_UNCOR_ID(d)			((d) >> 16)
|
|
|
|
/* Number of entries in the e_sources[] array of struct aer_rpc. */
#define AER_ERROR_SOURCES_MAX 100
|
|
|
|
/*
 * Combined mask of the listed PCI_ERR_UNC_* status bits.
 * NOTE(review): judging by the name, these are presumably the uncorrectable
 * errors for which a TLP header is logged - confirm against the users.
 */
#define AER_LOG_TLP_MASKS		(PCI_ERR_UNC_POISON_TLP|	\
					PCI_ERR_UNC_ECRC|		\
					PCI_ERR_UNC_UNSUP|		\
					PCI_ERR_UNC_COMP_ABORT|		\
					PCI_ERR_UNC_UNX_COMP|		\
					PCI_ERR_UNC_MALF_TLP)
|
|
|
|
/* Capacity of the dev[] array in struct aer_err_info. */
#define AER_MAX_MULTI_ERR_DEVICES 5 /* Not likely to have more */
|
|
struct aer_err_info {
|
|
struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES];
|
|
int error_dev_num;
|
|
|
|
unsigned int id:16;
|
|
|
|
unsigned int severity:2; /* 0:NONFATAL | 1:FATAL | 2:COR */
|
|
unsigned int __pad1:5;
|
|
unsigned int multi_error_valid:1;
|
|
|
|
unsigned int first_error:5;
|
|
unsigned int __pad2:2;
|
|
unsigned int tlp_header_valid:1;
|
|
|
|
unsigned int status; /* COR/UNCOR Error Status */
|
|
unsigned int mask; /* COR/UNCOR Error Mask */
|
|
struct aer_header_log_regs tlp; /* TLP Header */
|
|
};
|
|
|
|
/*
 * One error-source record: a status word and an id word.  Records of this
 * type form the e_sources[] queue in struct aer_rpc.
 */
struct aer_err_source {
	unsigned int status;
	unsigned int id;
};
|
|
|
|
struct aer_rpc {
|
|
struct pcie_device *rpd; /* Root Port device */
|
|
struct work_struct dpc_handler;
|
|
struct aer_err_source e_sources[AER_ERROR_SOURCES_MAX];
|
|
unsigned short prod_idx; /* Error Producer Index */
|
|
unsigned short cons_idx; /* Error Consumer Index */
|
|
int isr;
|
|
spinlock_t e_lock; /*
|
|
* Lock access to Error Status/ID Regs
|
|
* and error producer/consumer index
|
|
*/
|
|
struct mutex rpc_mutex; /*
|
|
* only one thread could do
|
|
* recovery on the same
|
|
* root port hierarchy
|
|
*/
|
|
};
|
|
|
|
struct aer_broadcast_data {
|
|
enum pci_channel_state state;
|
|
enum pci_ers_result result;
|
|
};
|
|
|
|
static inline pci_ers_result_t merge_result(enum pci_ers_result orig,
|
|
enum pci_ers_result new)
|
|
{
|
|
if (new == PCI_ERS_RESULT_NO_AER_DRIVER)
|
|
return PCI_ERS_RESULT_NO_AER_DRIVER;
|
|
|
|
if (new == PCI_ERS_RESULT_NONE)
|
|
return orig;
|
|
|
|
switch (orig) {
|
|
case PCI_ERS_RESULT_CAN_RECOVER:
|
|
case PCI_ERS_RESULT_RECOVERED:
|
|
orig = new;
|
|
break;
|
|
case PCI_ERS_RESULT_DISCONNECT:
|
|
if (new == PCI_ERS_RESULT_NEED_RESET)
|
|
orig = PCI_ERS_RESULT_NEED_RESET;
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
return orig;
|
|
}
|
|
|
|
extern struct bus_type pcie_port_bus_type;
|
|
int aer_init(struct pcie_device *dev);
|
|
void aer_isr(struct work_struct *work);
|
|
void aer_print_error(struct pci_dev *dev, struct aer_err_info *info);
|
|
void aer_print_port_info(struct pci_dev *dev, struct aer_err_info *info);
|
|
irqreturn_t aer_irq(int irq, void *context);
|
|
|
|
#ifdef CONFIG_ACPI_APEI
|
|
int pcie_aer_get_firmware_first(struct pci_dev *pci_dev);
|
|
#else
|
|
static inline int pcie_aer_get_firmware_first(struct pci_dev *pci_dev)
|
|
{
|
|
if (pci_dev->__aer_firmware_first_valid)
|
|
return pci_dev->__aer_firmware_first;
|
|
return 0;
|
|
}
|
|
#endif
|
|
|
|
static inline void pcie_aer_force_firmware_first(struct pci_dev *pci_dev,
|
|
int enable)
|
|
{
|
|
pci_dev->__aer_firmware_first = !!enable;
|
|
pci_dev->__aer_firmware_first_valid = 1;
|
|
}
|
|
#endif /* _AERDRV_H_ */
|