mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2024-12-12 20:46:49 +07:00
Merge branch 'pci/aer'
- Decode AER errors with names similar to "lspci" (Tyler Baicar) - Expose AER statistics in sysfs (Rajat Jain) - Clear AER status bits selectively based on the type of recovery (Oza Pawandeep) - Honor "pcie_ports=native" even if HEST sets FIRMWARE_FIRST (Alexandru Gagniuc) - Don't clear AER status bits if we're using the "Firmware-First" strategy where firmware owns the registers (Alexandru Gagniuc) * pci/aer: PCI/AER: Don't clear AER bits if error handling is Firmware-First PCI/AER: Remove duplicate PCI_EXP_AER_FLAGS definition PCI/portdrv: Remove pcie_portdrv_err_handler.slot_reset PCI/AER: Clear device status bits during ERR_COR handling PCI/AER: Clear device status bits during ERR_FATAL and ERR_NONFATAL PCI/AER: Remove ERR_FATAL code from ERR_NONFATAL path PCI/AER: Factor out ERR_NONFATAL status bit clearing PCI/AER: Clear only ERR_NONFATAL bits during non-fatal recovery PCI/AER: Clear only ERR_FATAL status bits during fatal recovery PCI/AER: Honor "pcie_ports=native" even if HEST sets FIRMWARE_FIRST PCI/AER: Add sysfs attributes for rootport cumulative stats PCI/AER: Add sysfs attributes to provide AER stats and breakdown PCI/AER: Define aer_stats structure for AER capable devices PCI/AER: Move internal declarations to drivers/pci/pci.h PCI/AER: Adopt lspci names for AER error decoding PCI/AER: Expose internal API for obtaining AER information # Conflicts: # drivers/pci/pci.h
This commit is contained in:
commit
3c3ab37f4c
122
Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats
Normal file
122
Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats
Normal file
@ -0,0 +1,122 @@
|
||||
==========================
|
||||
PCIe Device AER statistics
|
||||
==========================
|
||||
These attributes show up under all the devices that are AER capable. These
|
||||
statistical counters indicate the errors "as seen/reported by the device".
|
||||
Note that this may mean that if an endpoint is causing problems, the AER
|
||||
counters may increment at its link partner (e.g. root port) because the
|
||||
errors may be "seen" / reported by the link partner and not the
|
||||
problematic endpoint itself (which may report all counters as 0 as it never
|
||||
saw any problems).
|
||||
|
||||
Where: /sys/bus/pci/devices/<dev>/aer_dev_correctable
|
||||
Date: July 2018
|
||||
Kernel Version: 4.19.0
|
||||
Contact: linux-pci@vger.kernel.org, rajatja@google.com
|
||||
Description: List of correctable errors seen and reported by this
|
||||
PCI device using ERR_COR. Note that since multiple errors may
|
||||
be reported using a single ERR_COR message, thus
|
||||
TOTAL_ERR_COR at the end of the file may not match the actual
|
||||
total of all the errors in the file. Sample output:
|
||||
-------------------------------------------------------------------------
|
||||
localhost /sys/devices/pci0000:00/0000:00:1c.0 # cat aer_dev_correctable
|
||||
Receiver Error 2
|
||||
Bad TLP 0
|
||||
Bad DLLP 0
|
||||
RELAY_NUM Rollover 0
|
||||
Replay Timer Timeout 0
|
||||
Advisory Non-Fatal 0
|
||||
Corrected Internal Error 0
|
||||
Header Log Overflow 0
|
||||
TOTAL_ERR_COR 2
|
||||
-------------------------------------------------------------------------
|
||||
|
||||
Where: /sys/bus/pci/devices/<dev>/aer_dev_fatal
|
||||
Date: July 2018
|
||||
Kernel Version: 4.19.0
|
||||
Contact: linux-pci@vger.kernel.org, rajatja@google.com
|
||||
Description: List of uncorrectable fatal errors seen and reported by this
|
||||
PCI device using ERR_FATAL. Note that since multiple errors may
|
||||
be reported using a single ERR_FATAL message, thus
|
||||
TOTAL_ERR_FATAL at the end of the file may not match the actual
|
||||
total of all the errors in the file. Sample output:
|
||||
-------------------------------------------------------------------------
|
||||
localhost /sys/devices/pci0000:00/0000:00:1c.0 # cat aer_dev_fatal
|
||||
Undefined 0
|
||||
Data Link Protocol 0
|
||||
Surprise Down Error 0
|
||||
Poisoned TLP 0
|
||||
Flow Control Protocol 0
|
||||
Completion Timeout 0
|
||||
Completer Abort 0
|
||||
Unexpected Completion 0
|
||||
Receiver Overflow 0
|
||||
Malformed TLP 0
|
||||
ECRC 0
|
||||
Unsupported Request 0
|
||||
ACS Violation 0
|
||||
Uncorrectable Internal Error 0
|
||||
MC Blocked TLP 0
|
||||
AtomicOp Egress Blocked 0
|
||||
TLP Prefix Blocked Error 0
|
||||
TOTAL_ERR_FATAL 0
|
||||
-------------------------------------------------------------------------
|
||||
|
||||
Where: /sys/bus/pci/devices/<dev>/aer_dev_nonfatal
|
||||
Date: July 2018
|
||||
Kernel Version: 4.19.0
|
||||
Contact: linux-pci@vger.kernel.org, rajatja@google.com
|
||||
Description: List of uncorrectable nonfatal errors seen and reported by this
|
||||
PCI device using ERR_NONFATAL. Note that since multiple errors
|
||||
may be reported using a single ERR_FATAL message, thus
|
||||
TOTAL_ERR_NONFATAL at the end of the file may not match the
|
||||
actual total of all the errors in the file. Sample output:
|
||||
-------------------------------------------------------------------------
|
||||
localhost /sys/devices/pci0000:00/0000:00:1c.0 # cat aer_dev_nonfatal
|
||||
Undefined 0
|
||||
Data Link Protocol 0
|
||||
Surprise Down Error 0
|
||||
Poisoned TLP 0
|
||||
Flow Control Protocol 0
|
||||
Completion Timeout 0
|
||||
Completer Abort 0
|
||||
Unexpected Completion 0
|
||||
Receiver Overflow 0
|
||||
Malformed TLP 0
|
||||
ECRC 0
|
||||
Unsupported Request 0
|
||||
ACS Violation 0
|
||||
Uncorrectable Internal Error 0
|
||||
MC Blocked TLP 0
|
||||
AtomicOp Egress Blocked 0
|
||||
TLP Prefix Blocked Error 0
|
||||
TOTAL_ERR_NONFATAL 0
|
||||
-------------------------------------------------------------------------
|
||||
|
||||
============================
|
||||
PCIe Rootport AER statistics
|
||||
============================
|
||||
These attributes show up under only the rootports (or root complex event
|
||||
collectors) that are AER capable. These indicate the number of error messages as
|
||||
"reported to" the rootport. Please note that the rootports also transmit
|
||||
(internally) the ERR_* messages for errors seen by the internal rootport PCI
|
||||
device, so these counters include them and are thus cumulative of all the error
|
||||
messages on the PCI hierarchy originating at that root port.
|
||||
|
||||
Where: /sys/bus/pci/devices/<dev>/aer_stats/aer_rootport_total_err_cor
|
||||
Date: July 2018
|
||||
Kernel Version: 4.19.0
|
||||
Contact: linux-pci@vger.kernel.org, rajatja@google.com
|
||||
Description: Total number of ERR_COR messages reported to rootport.
|
||||
|
||||
Where: /sys/bus/pci/devices/<dev>/aer_stats/aer_rootport_total_err_fatal
|
||||
Date: July 2018
|
||||
Kernel Version: 4.19.0
|
||||
Contact: linux-pci@vger.kernel.org, rajatja@google.com
|
||||
Description: Total number of ERR_FATAL messages reported to rootport.
|
||||
|
||||
Where: /sys/bus/pci/devices/<dev>/aer_stats/aer_rootport_total_err_nonfatal
|
||||
Date: July 2018
|
||||
Kernel Version: 4.19.0
|
||||
Contact: linux-pci@vger.kernel.org, rajatja@google.com
|
||||
Description: Total number of ERR_NONFATAL messages reported to rootport.
|
@ -73,6 +73,11 @@ In the example, 'Requester ID' means the ID of the device who sends
|
||||
the error message to root port. Pls. refer to pci express specs for
|
||||
other fields.
|
||||
|
||||
2.4 AER Statistics / Counters
|
||||
|
||||
When PCIe AER errors are captured, the counters / statistics are also exposed
|
||||
in the form of sysfs attributes which are documented at
|
||||
Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats
|
||||
|
||||
3. Developer Guide
|
||||
|
||||
|
@ -1746,6 +1746,9 @@ static const struct attribute_group *pci_dev_attr_groups[] = {
|
||||
#endif
|
||||
&pci_bridge_attr_group,
|
||||
&pcie_dev_attr_group,
|
||||
#ifdef CONFIG_PCIEAER
|
||||
&aer_stats_attr_group,
|
||||
#endif
|
||||
NULL,
|
||||
};
|
||||
|
||||
|
@ -311,6 +311,34 @@ static inline bool pci_dev_is_added(const struct pci_dev *dev)
|
||||
return test_bit(PCI_DEV_ADDED, &dev->priv_flags);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_PCIEAER
|
||||
#include <linux/aer.h>
|
||||
|
||||
#define AER_MAX_MULTI_ERR_DEVICES 5 /* Not likely to have more */
|
||||
|
||||
struct aer_err_info {
|
||||
struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES];
|
||||
int error_dev_num;
|
||||
|
||||
unsigned int id:16;
|
||||
|
||||
unsigned int severity:2; /* 0:NONFATAL | 1:FATAL | 2:COR */
|
||||
unsigned int __pad1:5;
|
||||
unsigned int multi_error_valid:1;
|
||||
|
||||
unsigned int first_error:5;
|
||||
unsigned int __pad2:2;
|
||||
unsigned int tlp_header_valid:1;
|
||||
|
||||
unsigned int status; /* COR/UNCOR Error Status */
|
||||
unsigned int mask; /* COR/UNCOR Error Mask */
|
||||
struct aer_header_log_regs tlp; /* TLP Header */
|
||||
};
|
||||
|
||||
int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info);
|
||||
void aer_print_error(struct pci_dev *dev, struct aer_err_info *info);
|
||||
#endif /* CONFIG_PCIEAER */
|
||||
|
||||
#ifdef CONFIG_PCI_ATS
|
||||
void pci_restore_ats_state(struct pci_dev *dev);
|
||||
#else
|
||||
@ -467,4 +495,19 @@ static inline int devm_of_pci_get_host_bridge_resources(struct device *dev,
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_PCIEAER
|
||||
void pci_no_aer(void);
|
||||
void pci_aer_init(struct pci_dev *dev);
|
||||
void pci_aer_exit(struct pci_dev *dev);
|
||||
extern const struct attribute_group aer_stats_attr_group;
|
||||
void pci_aer_clear_fatal_status(struct pci_dev *dev);
|
||||
void pci_aer_clear_device_status(struct pci_dev *dev);
|
||||
#else
|
||||
static inline void pci_no_aer(void) { }
|
||||
static inline int pci_aer_init(struct pci_dev *d) { return -ENODEV; }
|
||||
static inline void pci_aer_exit(struct pci_dev *d) { }
|
||||
static inline void pci_aer_clear_fatal_status(struct pci_dev *dev) { }
|
||||
static inline void pci_aer_clear_device_status(struct pci_dev *dev) { }
|
||||
#endif
|
||||
|
||||
#endif /* DRIVERS_PCI_H */
|
||||
|
@ -31,26 +31,9 @@
|
||||
#include "portdrv.h"
|
||||
|
||||
#define AER_ERROR_SOURCES_MAX 100
|
||||
#define AER_MAX_MULTI_ERR_DEVICES 5 /* Not likely to have more */
|
||||
|
||||
struct aer_err_info {
|
||||
struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES];
|
||||
int error_dev_num;
|
||||
|
||||
unsigned int id:16;
|
||||
|
||||
unsigned int severity:2; /* 0:NONFATAL | 1:FATAL | 2:COR */
|
||||
unsigned int __pad1:5;
|
||||
unsigned int multi_error_valid:1;
|
||||
|
||||
unsigned int first_error:5;
|
||||
unsigned int __pad2:2;
|
||||
unsigned int tlp_header_valid:1;
|
||||
|
||||
unsigned int status; /* COR/UNCOR Error Status */
|
||||
unsigned int mask; /* COR/UNCOR Error Mask */
|
||||
struct aer_header_log_regs tlp; /* TLP Header */
|
||||
};
|
||||
#define AER_MAX_TYPEOF_COR_ERRS 16 /* as per PCI_ERR_COR_STATUS */
|
||||
#define AER_MAX_TYPEOF_UNCOR_ERRS 26 /* as per PCI_ERR_UNCOR_STATUS*/
|
||||
|
||||
struct aer_err_source {
|
||||
unsigned int status;
|
||||
@ -76,6 +59,42 @@ struct aer_rpc {
|
||||
*/
|
||||
};
|
||||
|
||||
/* AER stats for the device */
|
||||
struct aer_stats {
|
||||
|
||||
/*
|
||||
* Fields for all AER capable devices. They indicate the errors
|
||||
* "as seen by this device". Note that this may mean that if an
|
||||
* end point is causing problems, the AER counters may increment
|
||||
* at its link partner (e.g. root port) because the errors will be
|
||||
* "seen" by the link partner and not the the problematic end point
|
||||
* itself (which may report all counters as 0 as it never saw any
|
||||
* problems).
|
||||
*/
|
||||
/* Counters for different type of correctable errors */
|
||||
u64 dev_cor_errs[AER_MAX_TYPEOF_COR_ERRS];
|
||||
/* Counters for different type of fatal uncorrectable errors */
|
||||
u64 dev_fatal_errs[AER_MAX_TYPEOF_UNCOR_ERRS];
|
||||
/* Counters for different type of nonfatal uncorrectable errors */
|
||||
u64 dev_nonfatal_errs[AER_MAX_TYPEOF_UNCOR_ERRS];
|
||||
/* Total number of ERR_COR sent by this device */
|
||||
u64 dev_total_cor_errs;
|
||||
/* Total number of ERR_FATAL sent by this device */
|
||||
u64 dev_total_fatal_errs;
|
||||
/* Total number of ERR_NONFATAL sent by this device */
|
||||
u64 dev_total_nonfatal_errs;
|
||||
|
||||
/*
|
||||
* Fields for Root ports & root complex event collectors only, these
|
||||
* indicate the total number of ERR_COR, ERR_FATAL, and ERR_NONFATAL
|
||||
* messages received by the root port / event collector, INCLUDING the
|
||||
* ones that are generated internally (by the rootport itself)
|
||||
*/
|
||||
u64 rootport_total_cor_errs;
|
||||
u64 rootport_total_fatal_errs;
|
||||
u64 rootport_total_nonfatal_errs;
|
||||
};
|
||||
|
||||
#define AER_LOG_TLP_MASKS (PCI_ERR_UNC_POISON_TLP| \
|
||||
PCI_ERR_UNC_ECRC| \
|
||||
PCI_ERR_UNC_UNSUP| \
|
||||
@ -303,12 +322,13 @@ int pcie_aer_get_firmware_first(struct pci_dev *dev)
|
||||
if (!pci_is_pcie(dev))
|
||||
return 0;
|
||||
|
||||
if (pcie_ports_native)
|
||||
return 0;
|
||||
|
||||
if (!dev->__aer_firmware_first_valid)
|
||||
aer_set_firmware_first(dev);
|
||||
return dev->__aer_firmware_first;
|
||||
}
|
||||
#define PCI_EXP_AER_FLAGS (PCI_EXP_DEVCTL_CERE | PCI_EXP_DEVCTL_NFERE | \
|
||||
PCI_EXP_DEVCTL_FERE | PCI_EXP_DEVCTL_URRE)
|
||||
|
||||
static bool aer_firmware_first;
|
||||
|
||||
@ -323,6 +343,9 @@ bool aer_acpi_firmware_first(void)
|
||||
.firmware_first = 0,
|
||||
};
|
||||
|
||||
if (pcie_ports_native)
|
||||
return false;
|
||||
|
||||
if (!parsed) {
|
||||
apei_hest_parse(aer_hest_parse, &info);
|
||||
aer_firmware_first = info.firmware_first;
|
||||
@ -357,16 +380,30 @@ int pci_disable_pcie_error_reporting(struct pci_dev *dev)
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(pci_disable_pcie_error_reporting);
|
||||
|
||||
void pci_aer_clear_device_status(struct pci_dev *dev)
|
||||
{
|
||||
u16 sta;
|
||||
|
||||
pcie_capability_read_word(dev, PCI_EXP_DEVSTA, &sta);
|
||||
pcie_capability_write_word(dev, PCI_EXP_DEVSTA, sta);
|
||||
}
|
||||
|
||||
int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev)
|
||||
{
|
||||
int pos;
|
||||
u32 status;
|
||||
u32 status, sev;
|
||||
|
||||
pos = dev->aer_cap;
|
||||
if (!pos)
|
||||
return -EIO;
|
||||
|
||||
if (pcie_aer_get_firmware_first(dev))
|
||||
return -EIO;
|
||||
|
||||
/* Clear status bits for ERR_NONFATAL errors only */
|
||||
pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, &status);
|
||||
pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, &sev);
|
||||
status &= ~sev;
|
||||
if (status)
|
||||
pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, status);
|
||||
|
||||
@ -374,6 +411,26 @@ int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev)
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(pci_cleanup_aer_uncorrect_error_status);
|
||||
|
||||
void pci_aer_clear_fatal_status(struct pci_dev *dev)
|
||||
{
|
||||
int pos;
|
||||
u32 status, sev;
|
||||
|
||||
pos = dev->aer_cap;
|
||||
if (!pos)
|
||||
return;
|
||||
|
||||
if (pcie_aer_get_firmware_first(dev))
|
||||
return;
|
||||
|
||||
/* Clear status bits for ERR_FATAL errors only */
|
||||
pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, &status);
|
||||
pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, &sev);
|
||||
status &= sev;
|
||||
if (status)
|
||||
pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, status);
|
||||
}
|
||||
|
||||
int pci_cleanup_aer_error_status_regs(struct pci_dev *dev)
|
||||
{
|
||||
int pos;
|
||||
@ -387,6 +444,9 @@ int pci_cleanup_aer_error_status_regs(struct pci_dev *dev)
|
||||
if (!pos)
|
||||
return -EIO;
|
||||
|
||||
if (pcie_aer_get_firmware_first(dev))
|
||||
return -EIO;
|
||||
|
||||
port_type = pci_pcie_type(dev);
|
||||
if (port_type == PCI_EXP_TYPE_ROOT_PORT) {
|
||||
pci_read_config_dword(dev, pos + PCI_ERR_ROOT_STATUS, &status);
|
||||
@ -402,10 +462,20 @@ int pci_cleanup_aer_error_status_regs(struct pci_dev *dev)
|
||||
return 0;
|
||||
}
|
||||
|
||||
int pci_aer_init(struct pci_dev *dev)
|
||||
void pci_aer_init(struct pci_dev *dev)
|
||||
{
|
||||
dev->aer_cap = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR);
|
||||
return pci_cleanup_aer_error_status_regs(dev);
|
||||
|
||||
if (dev->aer_cap)
|
||||
dev->aer_stats = kzalloc(sizeof(struct aer_stats), GFP_KERNEL);
|
||||
|
||||
pci_cleanup_aer_error_status_regs(dev);
|
||||
}
|
||||
|
||||
void pci_aer_exit(struct pci_dev *dev)
|
||||
{
|
||||
kfree(dev->aer_stats);
|
||||
dev->aer_stats = NULL;
|
||||
}
|
||||
|
||||
#define AER_AGENT_RECEIVER 0
|
||||
@ -458,52 +528,52 @@ static const char *aer_error_layer[] = {
|
||||
"Transaction Layer"
|
||||
};
|
||||
|
||||
static const char *aer_correctable_error_string[] = {
|
||||
"Receiver Error", /* Bit Position 0 */
|
||||
static const char *aer_correctable_error_string[AER_MAX_TYPEOF_COR_ERRS] = {
|
||||
"RxErr", /* Bit Position 0 */
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
"Bad TLP", /* Bit Position 6 */
|
||||
"Bad DLLP", /* Bit Position 7 */
|
||||
"RELAY_NUM Rollover", /* Bit Position 8 */
|
||||
"BadTLP", /* Bit Position 6 */
|
||||
"BadDLLP", /* Bit Position 7 */
|
||||
"Rollover", /* Bit Position 8 */
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
"Replay Timer Timeout", /* Bit Position 12 */
|
||||
"Advisory Non-Fatal", /* Bit Position 13 */
|
||||
"Corrected Internal Error", /* Bit Position 14 */
|
||||
"Header Log Overflow", /* Bit Position 15 */
|
||||
"Timeout", /* Bit Position 12 */
|
||||
"NonFatalErr", /* Bit Position 13 */
|
||||
"CorrIntErr", /* Bit Position 14 */
|
||||
"HeaderOF", /* Bit Position 15 */
|
||||
};
|
||||
|
||||
static const char *aer_uncorrectable_error_string[] = {
|
||||
static const char *aer_uncorrectable_error_string[AER_MAX_TYPEOF_UNCOR_ERRS] = {
|
||||
"Undefined", /* Bit Position 0 */
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
"Data Link Protocol", /* Bit Position 4 */
|
||||
"Surprise Down Error", /* Bit Position 5 */
|
||||
"DLP", /* Bit Position 4 */
|
||||
"SDES", /* Bit Position 5 */
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
"Poisoned TLP", /* Bit Position 12 */
|
||||
"Flow Control Protocol", /* Bit Position 13 */
|
||||
"Completion Timeout", /* Bit Position 14 */
|
||||
"Completer Abort", /* Bit Position 15 */
|
||||
"Unexpected Completion", /* Bit Position 16 */
|
||||
"Receiver Overflow", /* Bit Position 17 */
|
||||
"Malformed TLP", /* Bit Position 18 */
|
||||
"TLP", /* Bit Position 12 */
|
||||
"FCP", /* Bit Position 13 */
|
||||
"CmpltTO", /* Bit Position 14 */
|
||||
"CmpltAbrt", /* Bit Position 15 */
|
||||
"UnxCmplt", /* Bit Position 16 */
|
||||
"RxOF", /* Bit Position 17 */
|
||||
"MalfTLP", /* Bit Position 18 */
|
||||
"ECRC", /* Bit Position 19 */
|
||||
"Unsupported Request", /* Bit Position 20 */
|
||||
"ACS Violation", /* Bit Position 21 */
|
||||
"Uncorrectable Internal Error", /* Bit Position 22 */
|
||||
"MC Blocked TLP", /* Bit Position 23 */
|
||||
"AtomicOp Egress Blocked", /* Bit Position 24 */
|
||||
"TLP Prefix Blocked Error", /* Bit Position 25 */
|
||||
"UnsupReq", /* Bit Position 20 */
|
||||
"ACSViol", /* Bit Position 21 */
|
||||
"UncorrIntErr", /* Bit Position 22 */
|
||||
"BlockedTLP", /* Bit Position 23 */
|
||||
"AtomicOpBlocked", /* Bit Position 24 */
|
||||
"TLPBlockedErr", /* Bit Position 25 */
|
||||
};
|
||||
|
||||
static const char *aer_agent_string[] = {
|
||||
@ -513,6 +583,144 @@ static const char *aer_agent_string[] = {
|
||||
"Transmitter ID"
|
||||
};
|
||||
|
||||
#define aer_stats_dev_attr(name, stats_array, strings_array, \
|
||||
total_string, total_field) \
|
||||
static ssize_t \
|
||||
name##_show(struct device *dev, struct device_attribute *attr, \
|
||||
char *buf) \
|
||||
{ \
|
||||
unsigned int i; \
|
||||
char *str = buf; \
|
||||
struct pci_dev *pdev = to_pci_dev(dev); \
|
||||
u64 *stats = pdev->aer_stats->stats_array; \
|
||||
\
|
||||
for (i = 0; i < ARRAY_SIZE(strings_array); i++) { \
|
||||
if (strings_array[i]) \
|
||||
str += sprintf(str, "%s %llu\n", \
|
||||
strings_array[i], stats[i]); \
|
||||
else if (stats[i]) \
|
||||
str += sprintf(str, #stats_array "_bit[%d] %llu\n",\
|
||||
i, stats[i]); \
|
||||
} \
|
||||
str += sprintf(str, "TOTAL_%s %llu\n", total_string, \
|
||||
pdev->aer_stats->total_field); \
|
||||
return str-buf; \
|
||||
} \
|
||||
static DEVICE_ATTR_RO(name)
|
||||
|
||||
aer_stats_dev_attr(aer_dev_correctable, dev_cor_errs,
|
||||
aer_correctable_error_string, "ERR_COR",
|
||||
dev_total_cor_errs);
|
||||
aer_stats_dev_attr(aer_dev_fatal, dev_fatal_errs,
|
||||
aer_uncorrectable_error_string, "ERR_FATAL",
|
||||
dev_total_fatal_errs);
|
||||
aer_stats_dev_attr(aer_dev_nonfatal, dev_nonfatal_errs,
|
||||
aer_uncorrectable_error_string, "ERR_NONFATAL",
|
||||
dev_total_nonfatal_errs);
|
||||
|
||||
#define aer_stats_rootport_attr(name, field) \
|
||||
static ssize_t \
|
||||
name##_show(struct device *dev, struct device_attribute *attr, \
|
||||
char *buf) \
|
||||
{ \
|
||||
struct pci_dev *pdev = to_pci_dev(dev); \
|
||||
return sprintf(buf, "%llu\n", pdev->aer_stats->field); \
|
||||
} \
|
||||
static DEVICE_ATTR_RO(name)
|
||||
|
||||
aer_stats_rootport_attr(aer_rootport_total_err_cor,
|
||||
rootport_total_cor_errs);
|
||||
aer_stats_rootport_attr(aer_rootport_total_err_fatal,
|
||||
rootport_total_fatal_errs);
|
||||
aer_stats_rootport_attr(aer_rootport_total_err_nonfatal,
|
||||
rootport_total_nonfatal_errs);
|
||||
|
||||
static struct attribute *aer_stats_attrs[] __ro_after_init = {
|
||||
&dev_attr_aer_dev_correctable.attr,
|
||||
&dev_attr_aer_dev_fatal.attr,
|
||||
&dev_attr_aer_dev_nonfatal.attr,
|
||||
&dev_attr_aer_rootport_total_err_cor.attr,
|
||||
&dev_attr_aer_rootport_total_err_fatal.attr,
|
||||
&dev_attr_aer_rootport_total_err_nonfatal.attr,
|
||||
NULL
|
||||
};
|
||||
|
||||
static umode_t aer_stats_attrs_are_visible(struct kobject *kobj,
|
||||
struct attribute *a, int n)
|
||||
{
|
||||
struct device *dev = kobj_to_dev(kobj);
|
||||
struct pci_dev *pdev = to_pci_dev(dev);
|
||||
|
||||
if (!pdev->aer_stats)
|
||||
return 0;
|
||||
|
||||
if ((a == &dev_attr_aer_rootport_total_err_cor.attr ||
|
||||
a == &dev_attr_aer_rootport_total_err_fatal.attr ||
|
||||
a == &dev_attr_aer_rootport_total_err_nonfatal.attr) &&
|
||||
pci_pcie_type(pdev) != PCI_EXP_TYPE_ROOT_PORT)
|
||||
return 0;
|
||||
|
||||
return a->mode;
|
||||
}
|
||||
|
||||
const struct attribute_group aer_stats_attr_group = {
|
||||
.attrs = aer_stats_attrs,
|
||||
.is_visible = aer_stats_attrs_are_visible,
|
||||
};
|
||||
|
||||
static void pci_dev_aer_stats_incr(struct pci_dev *pdev,
|
||||
struct aer_err_info *info)
|
||||
{
|
||||
int status, i, max = -1;
|
||||
u64 *counter = NULL;
|
||||
struct aer_stats *aer_stats = pdev->aer_stats;
|
||||
|
||||
if (!aer_stats)
|
||||
return;
|
||||
|
||||
switch (info->severity) {
|
||||
case AER_CORRECTABLE:
|
||||
aer_stats->dev_total_cor_errs++;
|
||||
counter = &aer_stats->dev_cor_errs[0];
|
||||
max = AER_MAX_TYPEOF_COR_ERRS;
|
||||
break;
|
||||
case AER_NONFATAL:
|
||||
aer_stats->dev_total_nonfatal_errs++;
|
||||
counter = &aer_stats->dev_nonfatal_errs[0];
|
||||
max = AER_MAX_TYPEOF_UNCOR_ERRS;
|
||||
break;
|
||||
case AER_FATAL:
|
||||
aer_stats->dev_total_fatal_errs++;
|
||||
counter = &aer_stats->dev_fatal_errs[0];
|
||||
max = AER_MAX_TYPEOF_UNCOR_ERRS;
|
||||
break;
|
||||
}
|
||||
|
||||
status = (info->status & ~info->mask);
|
||||
for (i = 0; i < max; i++)
|
||||
if (status & (1 << i))
|
||||
counter[i]++;
|
||||
}
|
||||
|
||||
static void pci_rootport_aer_stats_incr(struct pci_dev *pdev,
|
||||
struct aer_err_source *e_src)
|
||||
{
|
||||
struct aer_stats *aer_stats = pdev->aer_stats;
|
||||
|
||||
if (!aer_stats)
|
||||
return;
|
||||
|
||||
if (e_src->status & PCI_ERR_ROOT_COR_RCV)
|
||||
aer_stats->rootport_total_cor_errs++;
|
||||
|
||||
if (e_src->status & PCI_ERR_ROOT_UNCOR_RCV) {
|
||||
if (e_src->status & PCI_ERR_ROOT_FATAL_RCV)
|
||||
aer_stats->rootport_total_fatal_errs++;
|
||||
else
|
||||
aer_stats->rootport_total_nonfatal_errs++;
|
||||
}
|
||||
}
|
||||
|
||||
static void __print_tlp_header(struct pci_dev *dev,
|
||||
struct aer_header_log_regs *t)
|
||||
{
|
||||
@ -545,9 +753,10 @@ static void __aer_print_error(struct pci_dev *dev,
|
||||
pci_err(dev, " [%2d] Unknown Error Bit%s\n",
|
||||
i, info->first_error == i ? " (First)" : "");
|
||||
}
|
||||
pci_dev_aer_stats_incr(dev, info);
|
||||
}
|
||||
|
||||
static void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
|
||||
void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
|
||||
{
|
||||
int layer, agent;
|
||||
int id = ((dev->bus->number << 8) | dev->devfn);
|
||||
@ -799,6 +1008,7 @@ static void handle_error_source(struct pci_dev *dev, struct aer_err_info *info)
|
||||
if (pos)
|
||||
pci_write_config_dword(dev, pos + PCI_ERR_COR_STATUS,
|
||||
info->status);
|
||||
pci_aer_clear_device_status(dev);
|
||||
} else if (info->severity == AER_NONFATAL)
|
||||
pcie_do_nonfatal_recovery(dev);
|
||||
else if (info->severity == AER_FATAL)
|
||||
@ -876,7 +1086,7 @@ EXPORT_SYMBOL_GPL(aer_recover_queue);
|
||||
#endif
|
||||
|
||||
/**
|
||||
* get_device_error_info - read error status from dev and store it to info
|
||||
* aer_get_device_error_info - read error status from dev and store it to info
|
||||
* @dev: pointer to the device expected to have a error record
|
||||
* @info: pointer to structure to store the error record
|
||||
*
|
||||
@ -884,7 +1094,7 @@ EXPORT_SYMBOL_GPL(aer_recover_queue);
|
||||
*
|
||||
* Note that @info is reused among all error devices. Clear fields properly.
|
||||
*/
|
||||
static int get_device_error_info(struct pci_dev *dev, struct aer_err_info *info)
|
||||
int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info)
|
||||
{
|
||||
int pos, temp;
|
||||
|
||||
@ -942,11 +1152,11 @@ static inline void aer_process_err_devices(struct aer_err_info *e_info)
|
||||
|
||||
/* Report all before handle them, not to lost records by reset etc. */
|
||||
for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) {
|
||||
if (get_device_error_info(e_info->dev[i], e_info))
|
||||
if (aer_get_device_error_info(e_info->dev[i], e_info))
|
||||
aer_print_error(e_info->dev[i], e_info);
|
||||
}
|
||||
for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) {
|
||||
if (get_device_error_info(e_info->dev[i], e_info))
|
||||
if (aer_get_device_error_info(e_info->dev[i], e_info))
|
||||
handle_error_source(e_info->dev[i], e_info);
|
||||
}
|
||||
}
|
||||
@ -962,6 +1172,8 @@ static void aer_isr_one_error(struct aer_rpc *rpc,
|
||||
struct pci_dev *pdev = rpc->rpd;
|
||||
struct aer_err_info *e_info = &rpc->e_info;
|
||||
|
||||
pci_rootport_aer_stats_incr(pdev, e_src);
|
||||
|
||||
/*
|
||||
* There is a possibility that both correctable error and
|
||||
* uncorrectable error being logged. Report correctable error first.
|
||||
@ -1336,20 +1548,8 @@ static pci_ers_result_t aer_root_reset(struct pci_dev *dev)
|
||||
*/
|
||||
static void aer_error_resume(struct pci_dev *dev)
|
||||
{
|
||||
int pos;
|
||||
u32 status, mask;
|
||||
u16 reg16;
|
||||
|
||||
/* Clean up Root device status */
|
||||
pcie_capability_read_word(dev, PCI_EXP_DEVSTA, ®16);
|
||||
pcie_capability_write_word(dev, PCI_EXP_DEVSTA, reg16);
|
||||
|
||||
/* Clean AER Root Error Status */
|
||||
pos = dev->aer_cap;
|
||||
pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, &status);
|
||||
pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, &mask);
|
||||
status &= ~mask; /* Clear corresponding nonfatal bits */
|
||||
pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, status);
|
||||
pci_aer_clear_device_status(dev);
|
||||
pci_cleanup_aer_uncorrect_error_status(dev);
|
||||
}
|
||||
|
||||
static struct pcie_port_service_driver aerdriver = {
|
||||
|
@ -252,6 +252,7 @@ static pci_ers_result_t broadcast_error_message(struct pci_dev *dev,
|
||||
dev->error_state = state;
|
||||
pci_walk_bus(dev->subordinate, cb, &result_data);
|
||||
if (cb == report_resume) {
|
||||
pci_aer_clear_device_status(dev);
|
||||
pci_cleanup_aer_uncorrect_error_status(dev);
|
||||
dev->error_state = pci_channel_io_normal;
|
||||
}
|
||||
@ -259,15 +260,10 @@ static pci_ers_result_t broadcast_error_message(struct pci_dev *dev,
|
||||
/*
|
||||
* If the error is reported by an end point, we think this
|
||||
* error is related to the upstream link of the end point.
|
||||
* The error is non fatal so the bus is ok; just invoke
|
||||
* the callback for the function that logged the error.
|
||||
*/
|
||||
if (state == pci_channel_io_normal)
|
||||
/*
|
||||
* the error is non fatal so the bus is ok, just invoke
|
||||
* the callback for the function that logged the error.
|
||||
*/
|
||||
cb(dev, &result_data);
|
||||
else
|
||||
pci_walk_bus(dev->bus, cb, &result_data);
|
||||
cb(dev, &result_data);
|
||||
}
|
||||
|
||||
return result_data.result;
|
||||
@ -317,7 +313,8 @@ void pcie_do_fatal_recovery(struct pci_dev *dev, u32 service)
|
||||
* do error recovery on all subordinates of the bridge instead
|
||||
* of the bridge and clear the error status of the bridge.
|
||||
*/
|
||||
pci_cleanup_aer_uncorrect_error_status(dev);
|
||||
pci_aer_clear_fatal_status(dev);
|
||||
pci_aer_clear_device_status(dev);
|
||||
}
|
||||
|
||||
if (result == PCI_ERS_RESULT_RECOVERED) {
|
||||
|
@ -42,17 +42,6 @@ __setup("pcie_ports=", pcie_port_setup);
|
||||
|
||||
/* global data */
|
||||
|
||||
static int pcie_portdrv_restore_config(struct pci_dev *dev)
|
||||
{
|
||||
int retval;
|
||||
|
||||
retval = pci_enable_device(dev);
|
||||
if (retval)
|
||||
return retval;
|
||||
pci_set_master(dev);
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_PM
|
||||
static int pcie_port_runtime_suspend(struct device *dev)
|
||||
{
|
||||
@ -160,19 +149,6 @@ static pci_ers_result_t pcie_portdrv_mmio_enabled(struct pci_dev *dev)
|
||||
return PCI_ERS_RESULT_RECOVERED;
|
||||
}
|
||||
|
||||
static pci_ers_result_t pcie_portdrv_slot_reset(struct pci_dev *dev)
|
||||
{
|
||||
/* If fatal, restore cfg space for possible link reset at upstream */
|
||||
if (dev->error_state == pci_channel_io_frozen) {
|
||||
dev->state_saved = true;
|
||||
pci_restore_state(dev);
|
||||
pcie_portdrv_restore_config(dev);
|
||||
pci_enable_pcie_error_reporting(dev);
|
||||
}
|
||||
|
||||
return PCI_ERS_RESULT_RECOVERED;
|
||||
}
|
||||
|
||||
static int resume_iter(struct device *device, void *data)
|
||||
{
|
||||
struct pcie_device *pcie_device;
|
||||
@ -208,7 +184,6 @@ static const struct pci_device_id port_pci_ids[] = { {
|
||||
static const struct pci_error_handlers pcie_portdrv_err_handler = {
|
||||
.error_detected = pcie_portdrv_error_detected,
|
||||
.mmio_enabled = pcie_portdrv_mmio_enabled,
|
||||
.slot_reset = pcie_portdrv_slot_reset,
|
||||
.resume = pcie_portdrv_err_resume,
|
||||
};
|
||||
|
||||
|
@ -2064,6 +2064,7 @@ static void pci_configure_device(struct pci_dev *dev)
|
||||
|
||||
static void pci_release_capabilities(struct pci_dev *dev)
|
||||
{
|
||||
pci_aer_exit(dev);
|
||||
pci_vpd_release(dev);
|
||||
pci_iov_release(dev);
|
||||
pci_free_cap_save_buffers(dev);
|
||||
|
@ -299,6 +299,7 @@ struct pci_dev {
|
||||
u8 hdr_type; /* PCI header type (`multi' flag masked out) */
|
||||
#ifdef CONFIG_PCIEAER
|
||||
u16 aer_cap; /* AER capability offset */
|
||||
struct aer_stats *aer_stats; /* AER stats for this device */
|
||||
#endif
|
||||
u8 pcie_cap; /* PCIe capability offset */
|
||||
u8 msi_cap; /* MSI capability offset */
|
||||
@ -1469,13 +1470,9 @@ static inline bool pcie_aspm_support_enabled(void) { return false; }
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_PCIEAER
|
||||
void pci_no_aer(void);
|
||||
bool pci_aer_available(void);
|
||||
int pci_aer_init(struct pci_dev *dev);
|
||||
#else
|
||||
static inline void pci_no_aer(void) { }
|
||||
static inline bool pci_aer_available(void) { return false; }
|
||||
static inline int pci_aer_init(struct pci_dev *d) { return -ENODEV; }
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_PCIE_ECRC
|
||||
|
Loading…
Reference in New Issue
Block a user