mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2024-12-12 01:36:43 +07:00
b1c089b7ca
This patch is required not to lost error records by action invoked on error recovery, such as slot reset etc. Following sample (real machine + dummy record injected by aer-inject) shows that record of 28:00.1 could not be retrieved by recovery of 28:00.0: - Before: pcieport-driver 0000:00:02.0: AER: Multiple Uncorrected (Non-Fatal) error received: id=2801 e1000e 0000:28:00.0: PCIE Bus Error: severity=Uncorrected (Non-Fatal), type=Transaction Layer, id=2800(Receiver ID) e1000e 0000:28:00.0: device [8086:1096] error status/mask=00001000/00100000 e1000e 0000:28:00.0: [12] Poisoned TLP (First) e1000e 0000:28:00.0: TLP Header: 00000000 00000001 00000002 00000003 e1000e 0000:28:00.0: broadcast error_detected message e1000e 0000:28:00.0: broadcast slot_reset message e1000e 0000:28:00.0: setting latency timer to 64 e1000e 0000:28:00.0: restoring config space at offset 0x1 (was 0x100547, writing 0x100147) e1000e 0000:28:00.0: PME# disabled e1000e 0000:28:00.0: PME# disabled e1000e 0000:28:00.1: setting latency timer to 64 e1000e 0000:28:00.1: restoring config space at offset 0x1 (was 0x100547, writing 0x100147) e1000e 0000:28:00.1: PME# disabled e1000e 0000:28:00.1: PME# disabled e1000e 0000:28:00.0: broadcast resume message e1000e 0000:28:00.0: AER driver successfully recovered e1000e: eth0 NIC Link is Up 1000 Mbps Full Duplex, Flow Control: RX/TX - After: pcieport-driver 0000:00:02.0: AER: Multiple Uncorrected (Non-Fatal) error received: id=2801 e1000e 0000:28:00.0: PCIE Bus Error: severity=Uncorrected (Non-Fatal), type=Transaction Layer, id=2800(Receiver ID) e1000e 0000:28:00.0: device [8086:1096] error status/mask=00001000/00100000 e1000e 0000:28:00.0: [12] Poisoned TLP (First) e1000e 0000:28:00.0: TLP Header: 00000000 00000001 00000002 00000003 e1000e 0000:28:00.1: PCIE Bus Error: severity=Uncorrected (Non-Fatal), type=Transaction Layer, id=2801(Receiver ID) e1000e 0000:28:00.1: device [8086:1096] error status/mask=00081000/00100000 e1000e 0000:28:00.1: [12] Poisoned TLP (First) e1000e 0000:28:00.1: [19] ECRC e1000e 0000:28:00.1: TLP Header: 00000000 00000001 00000002 00000003 e1000e 0000:28:00.1: Error of this Agent(2801) is reported first e1000e 0000:28:00.0: broadcast error_detected message e1000e 0000:28:00.0: broadcast slot_reset message e1000e 0000:28:00.0: setting latency timer to 64 e1000e 0000:28:00.0: restoring config space at offset 0x1 (was 0x100547, writing 0x100147) e1000e 0000:28:00.0: PME# disabled e1000e 0000:28:00.0: PME# disabled e1000e 0000:28:00.1: setting latency timer to 64 e1000e 0000:28:00.1: restoring config space at offset 0x1 (was 0x100547, writing 0x100147) e1000e 0000:28:00.1: PME# disabled e1000e 0000:28:00.1: PME# disabled e1000e 0000:28:00.0: broadcast resume message e1000e 0000:28:00.0: AER driver successfully recovered e1000e: eth0 NIC Link is Up 1000 Mbps Full Duplex, Flow Control: RX/TX Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com> Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
138 lines
3.3 KiB
C
138 lines
3.3 KiB
C
/*
|
|
* Copyright (C) 2006 Intel Corp.
|
|
* Tom Long Nguyen (tom.l.nguyen@intel.com)
|
|
* Zhang Yanmin (yanmin.zhang@intel.com)
|
|
*
|
|
*/
|
|
|
|
#ifndef _AERDRV_H_
|
|
#define _AERDRV_H_
|
|
|
|
#include <linux/workqueue.h>
|
|
#include <linux/pcieport_if.h>
|
|
#include <linux/aer.h>
|
|
#include <linux/interrupt.h>
|
|
|
|
#define AER_NONFATAL 0
|
|
#define AER_FATAL 1
|
|
#define AER_CORRECTABLE 2
|
|
|
|
/* Root Error Status Register Bits */
|
|
#define ROOT_ERR_STATUS_MASKS 0x0f
|
|
|
|
#define SYSTEM_ERROR_INTR_ON_MESG_MASK (PCI_EXP_RTCTL_SECEE| \
|
|
PCI_EXP_RTCTL_SENFEE| \
|
|
PCI_EXP_RTCTL_SEFEE)
|
|
#define ROOT_PORT_INTR_ON_MESG_MASK (PCI_ERR_ROOT_CMD_COR_EN| \
|
|
PCI_ERR_ROOT_CMD_NONFATAL_EN| \
|
|
PCI_ERR_ROOT_CMD_FATAL_EN)
|
|
#define ERR_COR_ID(d) (d & 0xffff)
|
|
#define ERR_UNCOR_ID(d) (d >> 16)
|
|
|
|
#define AER_ERROR_SOURCES_MAX 100
|
|
|
|
#define AER_LOG_TLP_MASKS (PCI_ERR_UNC_POISON_TLP| \
|
|
PCI_ERR_UNC_ECRC| \
|
|
PCI_ERR_UNC_UNSUP| \
|
|
PCI_ERR_UNC_COMP_ABORT| \
|
|
PCI_ERR_UNC_UNX_COMP| \
|
|
PCI_ERR_UNC_MALF_TLP)
|
|
|
|
struct header_log_regs {
|
|
unsigned int dw0;
|
|
unsigned int dw1;
|
|
unsigned int dw2;
|
|
unsigned int dw3;
|
|
};
|
|
|
|
#define AER_MAX_MULTI_ERR_DEVICES 5 /* Not likely to have more */
|
|
struct aer_err_info {
|
|
struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES];
|
|
int error_dev_num;
|
|
|
|
unsigned int id:16;
|
|
|
|
unsigned int severity:2; /* 0:NONFATAL | 1:FATAL | 2:COR */
|
|
unsigned int __pad1:5;
|
|
unsigned int multi_error_valid:1;
|
|
|
|
unsigned int first_error:5;
|
|
unsigned int __pad2:2;
|
|
unsigned int tlp_header_valid:1;
|
|
|
|
unsigned int status; /* COR/UNCOR Error Status */
|
|
unsigned int mask; /* COR/UNCOR Error Mask */
|
|
struct header_log_regs tlp; /* TLP Header */
|
|
};
|
|
|
|
struct aer_err_source {
|
|
unsigned int status;
|
|
unsigned int id;
|
|
};
|
|
|
|
struct aer_rpc {
|
|
struct pcie_device *rpd; /* Root Port device */
|
|
struct work_struct dpc_handler;
|
|
struct aer_err_source e_sources[AER_ERROR_SOURCES_MAX];
|
|
unsigned short prod_idx; /* Error Producer Index */
|
|
unsigned short cons_idx; /* Error Consumer Index */
|
|
int isr;
|
|
spinlock_t e_lock; /*
|
|
* Lock access to Error Status/ID Regs
|
|
* and error producer/consumer index
|
|
*/
|
|
struct mutex rpc_mutex; /*
|
|
* only one thread could do
|
|
* recovery on the same
|
|
* root port hierarchy
|
|
*/
|
|
wait_queue_head_t wait_release;
|
|
};
|
|
|
|
struct aer_broadcast_data {
|
|
enum pci_channel_state state;
|
|
enum pci_ers_result result;
|
|
};
|
|
|
|
static inline pci_ers_result_t merge_result(enum pci_ers_result orig,
|
|
enum pci_ers_result new)
|
|
{
|
|
if (new == PCI_ERS_RESULT_NONE)
|
|
return orig;
|
|
|
|
switch (orig) {
|
|
case PCI_ERS_RESULT_CAN_RECOVER:
|
|
case PCI_ERS_RESULT_RECOVERED:
|
|
orig = new;
|
|
break;
|
|
case PCI_ERS_RESULT_DISCONNECT:
|
|
if (new == PCI_ERS_RESULT_NEED_RESET)
|
|
orig = new;
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
return orig;
|
|
}
|
|
|
|
extern struct bus_type pcie_port_bus_type;
|
|
extern void aer_enable_rootport(struct aer_rpc *rpc);
|
|
extern void aer_delete_rootport(struct aer_rpc *rpc);
|
|
extern int aer_init(struct pcie_device *dev);
|
|
extern void aer_isr(struct work_struct *work);
|
|
extern void aer_print_error(struct pci_dev *dev, struct aer_err_info *info);
|
|
extern void aer_print_port_info(struct pci_dev *dev, struct aer_err_info *info);
|
|
extern irqreturn_t aer_irq(int irq, void *context);
|
|
|
|
#ifdef CONFIG_ACPI
|
|
extern int aer_osc_setup(struct pcie_device *pciedev);
|
|
#else
|
|
static inline int aer_osc_setup(struct pcie_device *pciedev)
|
|
{
|
|
return 0;
|
|
}
|
|
#endif
|
|
|
|
#endif /* _AERDRV_H_ */
|