linux_dsm_epyc7002/drivers/pci/pcie/err.c
Jonathan Cameron 068c29a248 PCI/ERR: Clear PCIe Device Status errors only if OS owns AER
pcie_clear_device_status() resets the error bits in the PCIe Device Status
Register (PCI_EXP_DEVSTA).

Previously we did this unconditionally, but on ACPI systems, the _OSC AER
bit negotiates control of the AER capability.  Per sec 4.5.1 of the System
Firmware Intermediary _OSC and DPC Updates ECN [1], this bit also covers
other error enable/status bits including the following:

  Correctable Error Reporting Enable
  Non-Fatal Error Reporting Enable
  Fatal Error Reporting Enable
  Unsupported Request Reporting Enable

These bits are all in the PCIe Device Control register (the ECN omitted
"Reporting", but I think that's a typo), so by implication the _OSC AER bit
also applies to the error status bits in the PCIe Device Status register:

  Correctable Error Detected
  Non-Fatal Error Detected
  Fatal Error Detected
  Unsupported Request Detected

Clear the PCIe Device Status error bits only when the OS controls the AER
capability and related error enable/status bits.  If platform firmware
controls the AER capability, firmware is responsible for clearing these
bits.

One call path leading here is:

  ghes_do_proc
    ghes_handle_aer
      aer_recover_queue
        schedule_work(&aer_recover_work)
  ...
  aer_recover_work_func
    pcie_do_recovery
      pcie_clear_device_status

[1] System Firmware Intermediary (SFI) _OSC and DPC Updates ECN, Feb 24,
    2020, affecting PCI Firmware Specification, Rev. 3.2
    https://members.pcisig.com/wg/PCI-SIG/document/14076
[bhelgaas: commit log, move test from pcie_clear_device_status() to callers]
Link: https://lore.kernel.org/r/20200622113523.891666-1-Jonathan.Cameron@huawei.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
2020-07-22 15:41:03 -05:00

214 lines
5.5 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* This file implements the error recovery as a core part of PCIe error
* reporting. When a PCIe error is delivered, an error message will be
* collected and printed to console, then, an error recovery procedure
* will be executed by following the PCI error recovery rules.
*
* Copyright (C) 2006 Intel Corp.
* Tom Long Nguyen (tom.l.nguyen@intel.com)
* Zhang Yanmin (yanmin.zhang@intel.com)
*/
#define dev_fmt(fmt) "AER: " fmt
#include <linux/pci.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/aer.h>
#include "portdrv.h"
#include "../pci.h"
static pci_ers_result_t merge_result(enum pci_ers_result orig,
enum pci_ers_result new)
{
if (new == PCI_ERS_RESULT_NO_AER_DRIVER)
return PCI_ERS_RESULT_NO_AER_DRIVER;
if (new == PCI_ERS_RESULT_NONE)
return orig;
switch (orig) {
case PCI_ERS_RESULT_CAN_RECOVER:
case PCI_ERS_RESULT_RECOVERED:
orig = new;
break;
case PCI_ERS_RESULT_DISCONNECT:
if (new == PCI_ERS_RESULT_NEED_RESET)
orig = PCI_ERS_RESULT_NEED_RESET;
break;
default:
break;
}
return orig;
}
static int report_error_detected(struct pci_dev *dev,
pci_channel_state_t state,
enum pci_ers_result *result)
{
pci_ers_result_t vote;
const struct pci_error_handlers *err_handler;
device_lock(&dev->dev);
if (!pci_dev_set_io_state(dev, state) ||
!dev->driver ||
!dev->driver->err_handler ||
!dev->driver->err_handler->error_detected) {
/*
* If any device in the subtree does not have an error_detected
* callback, PCI_ERS_RESULT_NO_AER_DRIVER prevents subsequent
* error callbacks of "any" device in the subtree, and will
* exit in the disconnected error state.
*/
if (dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) {
vote = PCI_ERS_RESULT_NO_AER_DRIVER;
pci_info(dev, "can't recover (no error_detected callback)\n");
} else {
vote = PCI_ERS_RESULT_NONE;
}
} else {
err_handler = dev->driver->err_handler;
vote = err_handler->error_detected(dev, state);
}
pci_uevent_ers(dev, vote);
*result = merge_result(*result, vote);
device_unlock(&dev->dev);
return 0;
}
static int report_frozen_detected(struct pci_dev *dev, void *data)
{
return report_error_detected(dev, pci_channel_io_frozen, data);
}
static int report_normal_detected(struct pci_dev *dev, void *data)
{
return report_error_detected(dev, pci_channel_io_normal, data);
}
static int report_mmio_enabled(struct pci_dev *dev, void *data)
{
pci_ers_result_t vote, *result = data;
const struct pci_error_handlers *err_handler;
device_lock(&dev->dev);
if (!dev->driver ||
!dev->driver->err_handler ||
!dev->driver->err_handler->mmio_enabled)
goto out;
err_handler = dev->driver->err_handler;
vote = err_handler->mmio_enabled(dev);
*result = merge_result(*result, vote);
out:
device_unlock(&dev->dev);
return 0;
}
static int report_slot_reset(struct pci_dev *dev, void *data)
{
pci_ers_result_t vote, *result = data;
const struct pci_error_handlers *err_handler;
device_lock(&dev->dev);
if (!dev->driver ||
!dev->driver->err_handler ||
!dev->driver->err_handler->slot_reset)
goto out;
err_handler = dev->driver->err_handler;
vote = err_handler->slot_reset(dev);
*result = merge_result(*result, vote);
out:
device_unlock(&dev->dev);
return 0;
}
static int report_resume(struct pci_dev *dev, void *data)
{
const struct pci_error_handlers *err_handler;
device_lock(&dev->dev);
if (!pci_dev_set_io_state(dev, pci_channel_io_normal) ||
!dev->driver ||
!dev->driver->err_handler ||
!dev->driver->err_handler->resume)
goto out;
err_handler = dev->driver->err_handler;
err_handler->resume(dev);
out:
pci_uevent_ers(dev, PCI_ERS_RESULT_RECOVERED);
device_unlock(&dev->dev);
return 0;
}
pci_ers_result_t pcie_do_recovery(struct pci_dev *dev,
pci_channel_state_t state,
pci_ers_result_t (*reset_link)(struct pci_dev *pdev))
{
pci_ers_result_t status = PCI_ERS_RESULT_CAN_RECOVER;
struct pci_bus *bus;
/*
* Error recovery runs on all subordinates of the first downstream port.
* If the downstream port detected the error, it is cleared at the end.
*/
if (!(pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT ||
pci_pcie_type(dev) == PCI_EXP_TYPE_DOWNSTREAM))
dev = dev->bus->self;
bus = dev->subordinate;
pci_dbg(dev, "broadcast error_detected message\n");
if (state == pci_channel_io_frozen) {
pci_walk_bus(bus, report_frozen_detected, &status);
status = reset_link(dev);
if (status != PCI_ERS_RESULT_RECOVERED) {
pci_warn(dev, "link reset failed\n");
goto failed;
}
} else {
pci_walk_bus(bus, report_normal_detected, &status);
}
if (status == PCI_ERS_RESULT_CAN_RECOVER) {
status = PCI_ERS_RESULT_RECOVERED;
pci_dbg(dev, "broadcast mmio_enabled message\n");
pci_walk_bus(bus, report_mmio_enabled, &status);
}
if (status == PCI_ERS_RESULT_NEED_RESET) {
/*
* TODO: Should call platform-specific
* functions to reset slot before calling
* drivers' slot_reset callbacks?
*/
status = PCI_ERS_RESULT_RECOVERED;
pci_dbg(dev, "broadcast slot_reset message\n");
pci_walk_bus(bus, report_slot_reset, &status);
}
if (status != PCI_ERS_RESULT_RECOVERED)
goto failed;
pci_dbg(dev, "broadcast resume message\n");
pci_walk_bus(bus, report_resume, &status);
if (pcie_aer_is_native(dev))
pcie_clear_device_status(dev);
pci_aer_clear_nonfatal_status(dev);
pci_info(dev, "device recovery successful\n");
return status;
failed:
pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
/* TODO: Should kernel panic here? */
pci_info(dev, "device recovery failed\n");
return status;
}