linux_dsm_epyc7002/drivers/pci/pcie/aspm.c

1244 lines
35 KiB
C
Raw Normal View History

/*
* File: drivers/pci/pcie/aspm.c
* Enabling PCIe link L0s/L1 state and Clock Power Management
*
* Copyright (C) 2007 Intel
* Copyright (C) Zhang Yanmin (yanmin.zhang@intel.com)
* Copyright (C) Shaohua Li (shaohua.li@intel.com)
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/pci.h>
#include <linux/pci_regs.h>
#include <linux/errno.h>
#include <linux/pm.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/jiffies.h>
#include <linux/delay.h>
#include <linux/pci-aspm.h>
#include "../pci.h"
#ifdef MODULE_PARAM_PREFIX
#undef MODULE_PARAM_PREFIX
#endif
#define MODULE_PARAM_PREFIX "pcie_aspm."
/* Note: those are not register definitions */
#define ASPM_STATE_L0S_UP (1) /* Upstream direction L0s state */
#define ASPM_STATE_L0S_DW (2) /* Downstream direction L0s state */
#define ASPM_STATE_L1 (4) /* L1 state */
#define ASPM_STATE_L1_1 (8) /* ASPM L1.1 state */
#define ASPM_STATE_L1_2 (0x10) /* ASPM L1.2 state */
#define ASPM_STATE_L1_1_PCIPM (0x20) /* PCI PM L1.1 state */
#define ASPM_STATE_L1_2_PCIPM (0x40) /* PCI PM L1.2 state */
#define ASPM_STATE_L1_SS_PCIPM (ASPM_STATE_L1_1_PCIPM | ASPM_STATE_L1_2_PCIPM)
#define ASPM_STATE_L1_2_MASK (ASPM_STATE_L1_2 | ASPM_STATE_L1_2_PCIPM)
#define ASPM_STATE_L1SS (ASPM_STATE_L1_1 | ASPM_STATE_L1_1_PCIPM |\
ASPM_STATE_L1_2_MASK)
#define ASPM_STATE_L0S (ASPM_STATE_L0S_UP | ASPM_STATE_L0S_DW)
#define ASPM_STATE_ALL (ASPM_STATE_L0S | ASPM_STATE_L1 | \
ASPM_STATE_L1SS)
/*
* When L1 substates are enabled, the LTR L1.2 threshold is a timing parameter
* that decides whether L1.1 or L1.2 is entered (Refer PCIe spec for details).
* Not sure is there is a way to "calculate" this on the fly, but maybe we
* could turn it into a parameter in future. This value has been taken from
* the following files from Intel's coreboot (which is the only code I found
* to have used this):
* https://www.coreboot.org/pipermail/coreboot-gerrit/2015-March/021134.html
* https://review.coreboot.org/#/c/8832/
*/
#define LTR_L1_2_THRESHOLD_BITS ((1 << 21) | (1 << 23) | (1 << 30))
struct aspm_latency {
u32 l0s; /* L0s latency (nsec) */
u32 l1; /* L1 latency (nsec) */
};
struct pcie_link_state {
struct pci_dev *pdev; /* Upstream component of the Link */
struct pci_dev *downstream; /* Downstream component, function 0 */
struct pcie_link_state *root; /* pointer to the root port link */
struct pcie_link_state *parent; /* pointer to the parent Link state */
struct list_head sibling; /* node in link_list */
struct list_head children; /* list of child link states */
struct list_head link; /* node in parent's children list */
/* ASPM state */
u32 aspm_support:7; /* Supported ASPM state */
u32 aspm_enabled:7; /* Enabled ASPM state */
u32 aspm_capable:7; /* Capable ASPM state with latency */
u32 aspm_default:7; /* Default ASPM state by BIOS */
u32 aspm_disable:7; /* Disabled ASPM state */
/* Clock PM state */
u32 clkpm_capable:1; /* Clock PM capable? */
u32 clkpm_enabled:1; /* Current Clock PM state */
u32 clkpm_default:1; /* Default Clock PM state by BIOS */
/* Exit latencies */
struct aspm_latency latency_up; /* Upstream direction exit latency */
struct aspm_latency latency_dw; /* Downstream direction exit latency */
/*
* Endpoint acceptable latencies. A pcie downstream port only
* has one slot under it, so at most there are 8 functions.
*/
struct aspm_latency acceptable[8];
/* L1 PM Substate info */
struct {
u32 up_cap_ptr; /* L1SS cap ptr in upstream dev */
u32 dw_cap_ptr; /* L1SS cap ptr in downstream dev */
u32 ctl1; /* value to be programmed in ctl1 */
u32 ctl2; /* value to be programmed in ctl2 */
} l1ss;
};
static int aspm_disabled, aspm_force;
static bool aspm_support_enabled = true;
static DEFINE_MUTEX(aspm_lock);
static LIST_HEAD(link_list);
#define POLICY_DEFAULT 0 /* BIOS default setting */
#define POLICY_PERFORMANCE 1 /* high performance */
#define POLICY_POWERSAVE 2 /* high power saving */
#define POLICY_POWER_SUPERSAVE 3 /* possibly even more power saving */
#ifdef CONFIG_PCIEASPM_PERFORMANCE
static int aspm_policy = POLICY_PERFORMANCE;
#elif defined CONFIG_PCIEASPM_POWERSAVE
static int aspm_policy = POLICY_POWERSAVE;
#elif defined CONFIG_PCIEASPM_POWER_SUPERSAVE
static int aspm_policy = POLICY_POWER_SUPERSAVE;
#else
static int aspm_policy;
#endif
static const char *policy_str[] = {
[POLICY_DEFAULT] = "default",
[POLICY_PERFORMANCE] = "performance",
[POLICY_POWERSAVE] = "powersave",
[POLICY_POWER_SUPERSAVE] = "powersupersave"
};
#define LINK_RETRAIN_TIMEOUT HZ
static int policy_to_aspm_state(struct pcie_link_state *link)
{
switch (aspm_policy) {
case POLICY_PERFORMANCE:
/* Disable ASPM and Clock PM */
return 0;
case POLICY_POWERSAVE:
/* Enable ASPM L0s/L1 */
return (ASPM_STATE_L0S | ASPM_STATE_L1);
case POLICY_POWER_SUPERSAVE:
/* Enable Everything */
return ASPM_STATE_ALL;
case POLICY_DEFAULT:
return link->aspm_default;
}
return 0;
}
static int policy_to_clkpm_state(struct pcie_link_state *link)
{
switch (aspm_policy) {
case POLICY_PERFORMANCE:
/* Disable ASPM and Clock PM */
return 0;
case POLICY_POWERSAVE:
case POLICY_POWER_SUPERSAVE:
/* Enable Clock PM */
return 1;
case POLICY_DEFAULT:
return link->clkpm_default;
}
return 0;
}
static void pcie_set_clkpm_nocheck(struct pcie_link_state *link, int enable)
{
struct pci_dev *child;
struct pci_bus *linkbus = link->pdev->subordinate;
u32 val = enable ? PCI_EXP_LNKCTL_CLKREQ_EN : 0;
list_for_each_entry(child, &linkbus->devices, bus_list)
pcie_capability_clear_and_set_word(child, PCI_EXP_LNKCTL,
PCI_EXP_LNKCTL_CLKREQ_EN,
val);
link->clkpm_enabled = !!enable;
}
static void pcie_set_clkpm(struct pcie_link_state *link, int enable)
{
/* Don't enable Clock PM if the link is not Clock PM capable */
if (!link->clkpm_capable)
enable = 0;
/* Need nothing if the specified equals to current state */
if (link->clkpm_enabled == enable)
return;
pcie_set_clkpm_nocheck(link, enable);
}
static void pcie_clkpm_cap_init(struct pcie_link_state *link, int blacklist)
{
int capable = 1, enabled = 1;
u32 reg32;
u16 reg16;
struct pci_dev *child;
struct pci_bus *linkbus = link->pdev->subordinate;
/* All functions should have the same cap and state, take the worst */
list_for_each_entry(child, &linkbus->devices, bus_list) {
pcie_capability_read_dword(child, PCI_EXP_LNKCAP, &reg32);
if (!(reg32 & PCI_EXP_LNKCAP_CLKPM)) {
capable = 0;
enabled = 0;
break;
}
pcie_capability_read_word(child, PCI_EXP_LNKCTL, &reg16);
if (!(reg16 & PCI_EXP_LNKCTL_CLKREQ_EN))
enabled = 0;
}
link->clkpm_enabled = enabled;
link->clkpm_default = enabled;
link->clkpm_capable = (blacklist) ? 0 : capable;
}
/*
* pcie_aspm_configure_common_clock: check if the 2 ends of a link
* could use common clock. If they are, configure them to use the
* common clock. That will reduce the ASPM state exit latency.
*/
static void pcie_aspm_configure_common_clock(struct pcie_link_state *link)
{
int same_clock = 1;
u16 reg16, parent_reg, child_reg[8];
unsigned long start_jiffies;
struct pci_dev *child, *parent = link->pdev;
struct pci_bus *linkbus = parent->subordinate;
/*
* All functions of a slot should have the same Slot Clock
* Configuration, so just check one function
*/
child = list_entry(linkbus->devices.next, struct pci_dev, bus_list);
BUG_ON(!pci_is_pcie(child));
/* Check downstream component if bit Slot Clock Configuration is 1 */
pcie_capability_read_word(child, PCI_EXP_LNKSTA, &reg16);
if (!(reg16 & PCI_EXP_LNKSTA_SLC))
same_clock = 0;
/* Check upstream component if bit Slot Clock Configuration is 1 */
pcie_capability_read_word(parent, PCI_EXP_LNKSTA, &reg16);
if (!(reg16 & PCI_EXP_LNKSTA_SLC))
same_clock = 0;
/* Configure downstream component, all functions */
list_for_each_entry(child, &linkbus->devices, bus_list) {
pcie_capability_read_word(child, PCI_EXP_LNKCTL, &reg16);
child_reg[PCI_FUNC(child->devfn)] = reg16;
if (same_clock)
reg16 |= PCI_EXP_LNKCTL_CCC;
else
reg16 &= ~PCI_EXP_LNKCTL_CCC;
pcie_capability_write_word(child, PCI_EXP_LNKCTL, reg16);
}
/* Configure upstream component */
pcie_capability_read_word(parent, PCI_EXP_LNKCTL, &reg16);
parent_reg = reg16;
if (same_clock)
reg16 |= PCI_EXP_LNKCTL_CCC;
else
reg16 &= ~PCI_EXP_LNKCTL_CCC;
pcie_capability_write_word(parent, PCI_EXP_LNKCTL, reg16);
/* Retrain link */
reg16 |= PCI_EXP_LNKCTL_RL;
pcie_capability_write_word(parent, PCI_EXP_LNKCTL, reg16);
/* Wait for link training end. Break out after waiting for timeout */
start_jiffies = jiffies;
for (;;) {
pcie_capability_read_word(parent, PCI_EXP_LNKSTA, &reg16);
if (!(reg16 & PCI_EXP_LNKSTA_LT))
break;
if (time_after(jiffies, start_jiffies + LINK_RETRAIN_TIMEOUT))
break;
msleep(1);
}
if (!(reg16 & PCI_EXP_LNKSTA_LT))
return;
/* Training failed. Restore common clock configurations */
dev_err(&parent->dev, "ASPM: Could not configure common clock\n");
list_for_each_entry(child, &linkbus->devices, bus_list)
pcie_capability_write_word(child, PCI_EXP_LNKCTL,
child_reg[PCI_FUNC(child->devfn)]);
pcie_capability_write_word(parent, PCI_EXP_LNKCTL, parent_reg);
}
/* Convert L0s latency encoding to ns */
static u32 calc_l0s_latency(u32 encoding)
{
if (encoding == 0x7)
return (5 * 1000); /* > 4us */
return (64 << encoding);
}
/* Convert L0s acceptable latency encoding to ns */
static u32 calc_l0s_acceptable(u32 encoding)
{
if (encoding == 0x7)
return -1U;
return (64 << encoding);
}
/* Convert L1 latency encoding to ns */
static u32 calc_l1_latency(u32 encoding)
{
if (encoding == 0x7)
return (65 * 1000); /* > 64us */
return (1000 << encoding);
}
/* Convert L1 acceptable latency encoding to ns */
static u32 calc_l1_acceptable(u32 encoding)
{
if (encoding == 0x7)
return -1U;
return (1000 << encoding);
}
/* Convert L1SS T_pwr encoding to usec */
static u32 calc_l1ss_pwron(struct pci_dev *pdev, u32 scale, u32 val)
{
switch (scale) {
case 0:
return val * 2;
case 1:
return val * 10;
case 2:
return val * 100;
}
dev_err(&pdev->dev, "%s: Invalid T_PwrOn scale: %u\n",
__func__, scale);
return 0;
}
struct aspm_register_info {
u32 support:2;
u32 enabled:2;
u32 latency_encoding_l0s;
u32 latency_encoding_l1;
/* L1 substates */
u32 l1ss_cap_ptr;
u32 l1ss_cap;
u32 l1ss_ctl1;
u32 l1ss_ctl2;
};
static void pcie_get_aspm_reg(struct pci_dev *pdev,
struct aspm_register_info *info)
{
u16 reg16;
u32 reg32;
pcie_capability_read_dword(pdev, PCI_EXP_LNKCAP, &reg32);
info->support = (reg32 & PCI_EXP_LNKCAP_ASPMS) >> 10;
info->latency_encoding_l0s = (reg32 & PCI_EXP_LNKCAP_L0SEL) >> 12;
info->latency_encoding_l1 = (reg32 & PCI_EXP_LNKCAP_L1EL) >> 15;
pcie_capability_read_word(pdev, PCI_EXP_LNKCTL, &reg16);
info->enabled = reg16 & PCI_EXP_LNKCTL_ASPMC;
/* Read L1 PM substate capabilities */
info->l1ss_cap = info->l1ss_ctl1 = info->l1ss_ctl2 = 0;
info->l1ss_cap_ptr = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_L1SS);
if (!info->l1ss_cap_ptr)
return;
pci_read_config_dword(pdev, info->l1ss_cap_ptr + PCI_L1SS_CAP,
&info->l1ss_cap);
if (!(info->l1ss_cap & PCI_L1SS_CAP_L1_PM_SS)) {
info->l1ss_cap = 0;
return;
}
pci_read_config_dword(pdev, info->l1ss_cap_ptr + PCI_L1SS_CTL1,
&info->l1ss_ctl1);
pci_read_config_dword(pdev, info->l1ss_cap_ptr + PCI_L1SS_CTL2,
&info->l1ss_ctl2);
}
static void pcie_aspm_check_latency(struct pci_dev *endpoint)
{
u32 latency, l1_switch_latency = 0;
struct aspm_latency *acceptable;
struct pcie_link_state *link;
/* Device not in D0 doesn't need latency check */
if ((endpoint->current_state != PCI_D0) &&
(endpoint->current_state != PCI_UNKNOWN))
return;
link = endpoint->bus->self->link_state;
acceptable = &link->acceptable[PCI_FUNC(endpoint->devfn)];
while (link) {
/* Check upstream direction L0s latency */
if ((link->aspm_capable & ASPM_STATE_L0S_UP) &&
(link->latency_up.l0s > acceptable->l0s))
link->aspm_capable &= ~ASPM_STATE_L0S_UP;
/* Check downstream direction L0s latency */
if ((link->aspm_capable & ASPM_STATE_L0S_DW) &&
(link->latency_dw.l0s > acceptable->l0s))
link->aspm_capable &= ~ASPM_STATE_L0S_DW;
/*
* Check L1 latency.
* Every switch on the path to root complex need 1
* more microsecond for L1. Spec doesn't mention L0s.
*
* The exit latencies for L1 substates are not advertised
* by a device. Since the spec also doesn't mention a way
* to determine max latencies introduced by enabling L1
* substates on the components, it is not clear how to do
* a L1 substate exit latency check. We assume that the
* L1 exit latencies advertised by a device include L1
* substate latencies (and hence do not do any check).
*/
latency = max_t(u32, link->latency_up.l1, link->latency_dw.l1);
if ((link->aspm_capable & ASPM_STATE_L1) &&
(latency + l1_switch_latency > acceptable->l1))
link->aspm_capable &= ~ASPM_STATE_L1;
l1_switch_latency += 1000;
link = link->parent;
}
}
/*
* The L1 PM substate capability is only implemented in function 0 in a
* multi function device.
*/
static struct pci_dev *pci_function_0(struct pci_bus *linkbus)
{
struct pci_dev *child;
list_for_each_entry(child, &linkbus->devices, bus_list)
if (PCI_FUNC(child->devfn) == 0)
return child;
return NULL;
}
/* Calculate L1.2 PM substate timing parameters */
static void aspm_calc_l1ss_info(struct pcie_link_state *link,
struct aspm_register_info *upreg,
struct aspm_register_info *dwreg)
{
u32 val1, val2, scale1, scale2;
link->l1ss.up_cap_ptr = upreg->l1ss_cap_ptr;
link->l1ss.dw_cap_ptr = dwreg->l1ss_cap_ptr;
link->l1ss.ctl1 = link->l1ss.ctl2 = 0;
if (!(link->aspm_support & ASPM_STATE_L1_2_MASK))
return;
/* Choose the greater of the two T_cmn_mode_rstr_time */
val1 = (upreg->l1ss_cap >> 8) & 0xFF;
val2 = (upreg->l1ss_cap >> 8) & 0xFF;
if (val1 > val2)
link->l1ss.ctl1 |= val1 << 8;
else
link->l1ss.ctl1 |= val2 << 8;
/*
* We currently use LTR L1.2 threshold to be fixed constant picked from
* Intel's coreboot.
*/
link->l1ss.ctl1 |= LTR_L1_2_THRESHOLD_BITS;
/* Choose the greater of the two T_pwr_on */
val1 = (upreg->l1ss_cap >> 19) & 0x1F;
scale1 = (upreg->l1ss_cap >> 16) & 0x03;
val2 = (dwreg->l1ss_cap >> 19) & 0x1F;
scale2 = (dwreg->l1ss_cap >> 16) & 0x03;
if (calc_l1ss_pwron(link->pdev, scale1, val1) >
calc_l1ss_pwron(link->downstream, scale2, val2))
link->l1ss.ctl2 |= scale1 | (val1 << 3);
else
link->l1ss.ctl2 |= scale2 | (val2 << 3);
}
static void pcie_aspm_cap_init(struct pcie_link_state *link, int blacklist)
{
PCI/ASPM: Always set link->downstream to avoid NULL dereference on remove We call pcie_aspm_exit_link_state() when we remove a device. If the device is the last PCIe function to be removed below a bridge and the bridge has an ASPM link_state struct, we disable ASPM on the link. Disabling ASPM requires link->downstream (used in pcie_config_aspm_link()). We previously set link->downstream in pcie_aspm_cap_init(), but only if the device was not blacklisted. Removing the blacklisted device caused a NULL pointer dereference in the pcie_aspm_exit_link_state() -> pcie_config_aspm_link() path: # echo 1 > /sys/bus/pci/devices/0000\:0b\:00.0/remove ... BUG: unable to handle kernel NULL pointer dereference at 0000000000000080 IP: pcie_config_aspm_link+0x5d/0x2b0 Call Trace: pcie_aspm_exit_link_state+0x75/0x130 pci_stop_bus_device+0xa4/0xb0 pci_stop_and_remove_bus_device_locked+0x1a/0x30 remove_store+0x50/0x70 dev_attr_store+0x18/0x30 sysfs_kf_write+0x44/0x60 kernfs_fop_write+0x10e/0x190 __vfs_write+0x28/0x110 ? rcu_read_lock_sched_held+0x5d/0x80 ? rcu_sync_lockdep_assert+0x2c/0x60 ? __sb_start_write+0x173/0x1a0 ? vfs_write+0xb3/0x180 vfs_write+0xc4/0x180 SyS_write+0x49/0xa0 do_syscall_64+0xa6/0x1c0 entry_SYSCALL64_slow_path+0x25/0x25 ---[ end trace bd187ee0267df5d9 ]--- To avoid this, set link->downstream in alloc_pcie_link_state(), so every pcie_link_state structure has a valid link->downstream pointer. [bhelgaas: changelog] Signed-off-by: Yinghai Lu <yinghai@kernel.org> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> Acked-by: Rajat Jain <rajatja@google.com> CC: stable@vger.kernel.org
2017-03-01 15:25:40 +07:00
struct pci_dev *child = link->downstream, *parent = link->pdev;
struct pci_bus *linkbus = parent->subordinate;
struct aspm_register_info upreg, dwreg;
if (blacklist) {
/* Set enabled/disable so that we will disable ASPM later */
link->aspm_enabled = ASPM_STATE_ALL;
link->aspm_disable = ASPM_STATE_ALL;
return;
}
/* Get upstream/downstream components' register state */
pcie_get_aspm_reg(parent, &upreg);
pcie_get_aspm_reg(child, &dwreg);
/*
* If ASPM not supported, don't mess with the clocks and link,
* bail out now.
*/
if (!(upreg.support & dwreg.support))
return;
/* Configure common clock before checking latencies */
pcie_aspm_configure_common_clock(link);
/*
* Re-read upstream/downstream components' register state
* after clock configuration
*/
pcie_get_aspm_reg(parent, &upreg);
pcie_get_aspm_reg(child, &dwreg);
/*
* Setup L0s state
*
* Note that we must not enable L0s in either direction on a
* given link unless components on both sides of the link each
* support L0s.
*/
if (dwreg.support & upreg.support & PCIE_LINK_STATE_L0S)
link->aspm_support |= ASPM_STATE_L0S;
if (dwreg.enabled & PCIE_LINK_STATE_L0S)
link->aspm_enabled |= ASPM_STATE_L0S_UP;
if (upreg.enabled & PCIE_LINK_STATE_L0S)
link->aspm_enabled |= ASPM_STATE_L0S_DW;
link->latency_up.l0s = calc_l0s_latency(upreg.latency_encoding_l0s);
link->latency_dw.l0s = calc_l0s_latency(dwreg.latency_encoding_l0s);
/* Setup L1 state */
if (upreg.support & dwreg.support & PCIE_LINK_STATE_L1)
link->aspm_support |= ASPM_STATE_L1;
if (upreg.enabled & dwreg.enabled & PCIE_LINK_STATE_L1)
link->aspm_enabled |= ASPM_STATE_L1;
link->latency_up.l1 = calc_l1_latency(upreg.latency_encoding_l1);
link->latency_dw.l1 = calc_l1_latency(dwreg.latency_encoding_l1);
/* Setup L1 substate */
if (upreg.l1ss_cap & dwreg.l1ss_cap & PCI_L1SS_CAP_ASPM_L1_1)
link->aspm_support |= ASPM_STATE_L1_1;
if (upreg.l1ss_cap & dwreg.l1ss_cap & PCI_L1SS_CAP_ASPM_L1_2)
link->aspm_support |= ASPM_STATE_L1_2;
if (upreg.l1ss_cap & dwreg.l1ss_cap & PCI_L1SS_CAP_PCIPM_L1_1)
link->aspm_support |= ASPM_STATE_L1_1_PCIPM;
if (upreg.l1ss_cap & dwreg.l1ss_cap & PCI_L1SS_CAP_PCIPM_L1_2)
link->aspm_support |= ASPM_STATE_L1_2_PCIPM;
if (upreg.l1ss_ctl1 & dwreg.l1ss_ctl1 & PCI_L1SS_CTL1_ASPM_L1_1)
link->aspm_enabled |= ASPM_STATE_L1_1;
if (upreg.l1ss_ctl1 & dwreg.l1ss_ctl1 & PCI_L1SS_CTL1_ASPM_L1_2)
link->aspm_enabled |= ASPM_STATE_L1_2;
if (upreg.l1ss_ctl1 & dwreg.l1ss_ctl1 & PCI_L1SS_CTL1_PCIPM_L1_1)
link->aspm_enabled |= ASPM_STATE_L1_1_PCIPM;
if (upreg.l1ss_ctl1 & dwreg.l1ss_ctl1 & PCI_L1SS_CTL1_PCIPM_L1_2)
link->aspm_enabled |= ASPM_STATE_L1_2_PCIPM;
if (link->aspm_support & ASPM_STATE_L1SS)
aspm_calc_l1ss_info(link, &upreg, &dwreg);
/* Save default state */
link->aspm_default = link->aspm_enabled;
/* Setup initial capable state. Will be updated later */
link->aspm_capable = link->aspm_support;
/*
* If the downstream component has pci bridge function, don't
* do ASPM for now.
*/
list_for_each_entry(child, &linkbus->devices, bus_list) {
if (pci_pcie_type(child) == PCI_EXP_TYPE_PCI_BRIDGE) {
link->aspm_disable = ASPM_STATE_ALL;
break;
}
}
/* Get and check endpoint acceptable latencies */
list_for_each_entry(child, &linkbus->devices, bus_list) {
u32 reg32, encoding;
struct aspm_latency *acceptable =
&link->acceptable[PCI_FUNC(child->devfn)];
if (pci_pcie_type(child) != PCI_EXP_TYPE_ENDPOINT &&
pci_pcie_type(child) != PCI_EXP_TYPE_LEG_END)
continue;
pcie_capability_read_dword(child, PCI_EXP_DEVCAP, &reg32);
/* Calculate endpoint L0s acceptable latency */
encoding = (reg32 & PCI_EXP_DEVCAP_L0S) >> 6;
acceptable->l0s = calc_l0s_acceptable(encoding);
/* Calculate endpoint L1 acceptable latency */
encoding = (reg32 & PCI_EXP_DEVCAP_L1) >> 9;
acceptable->l1 = calc_l1_acceptable(encoding);
pcie_aspm_check_latency(child);
}
}
static void pci_clear_and_set_dword(struct pci_dev *pdev, int pos,
u32 clear, u32 set)
{
u32 val;
pci_read_config_dword(pdev, pos, &val);
val &= ~clear;
val |= set;
pci_write_config_dword(pdev, pos, val);
}
/* Configure the ASPM L1 substates */
static void pcie_config_aspm_l1ss(struct pcie_link_state *link, u32 state)
{
u32 val, enable_req;
struct pci_dev *child = link->downstream, *parent = link->pdev;
u32 up_cap_ptr = link->l1ss.up_cap_ptr;
u32 dw_cap_ptr = link->l1ss.dw_cap_ptr;
enable_req = (link->aspm_enabled ^ state) & state;
/*
* Here are the rules specified in the PCIe spec for enabling L1SS:
* - When enabling L1.x, enable bit at parent first, then at child
* - When disabling L1.x, disable bit at child first, then at parent
* - When enabling ASPM L1.x, need to disable L1
* (at child followed by parent).
* - The ASPM/PCIPM L1.2 must be disabled while programming timing
* parameters
*
* To keep it simple, disable all L1SS bits first, and later enable
* what is needed.
*/
/* Disable all L1 substates */
pci_clear_and_set_dword(child, dw_cap_ptr + PCI_L1SS_CTL1,
PCI_L1SS_CTL1_L1SS_MASK, 0);
pci_clear_and_set_dword(parent, up_cap_ptr + PCI_L1SS_CTL1,
PCI_L1SS_CTL1_L1SS_MASK, 0);
/*
* If needed, disable L1, and it gets enabled later
* in pcie_config_aspm_link().
*/
if (enable_req & (ASPM_STATE_L1_1 | ASPM_STATE_L1_2)) {
pcie_capability_clear_and_set_word(child, PCI_EXP_LNKCTL,
PCI_EXP_LNKCTL_ASPM_L1, 0);
pcie_capability_clear_and_set_word(parent, PCI_EXP_LNKCTL,
PCI_EXP_LNKCTL_ASPM_L1, 0);
}
if (enable_req & ASPM_STATE_L1_2_MASK) {
/* Program T_pwr_on in both ports */
pci_write_config_dword(parent, up_cap_ptr + PCI_L1SS_CTL2,
link->l1ss.ctl2);
pci_write_config_dword(child, dw_cap_ptr + PCI_L1SS_CTL2,
link->l1ss.ctl2);
/* Program T_cmn_mode in parent */
pci_clear_and_set_dword(parent, up_cap_ptr + PCI_L1SS_CTL1,
0xFF00, link->l1ss.ctl1);
/* Program LTR L1.2 threshold in both ports */
pci_clear_and_set_dword(parent, dw_cap_ptr + PCI_L1SS_CTL1,
0xE3FF0000, link->l1ss.ctl1);
pci_clear_and_set_dword(child, dw_cap_ptr + PCI_L1SS_CTL1,
0xE3FF0000, link->l1ss.ctl1);
}
val = 0;
if (state & ASPM_STATE_L1_1)
val |= PCI_L1SS_CTL1_ASPM_L1_1;
if (state & ASPM_STATE_L1_2)
val |= PCI_L1SS_CTL1_ASPM_L1_2;
if (state & ASPM_STATE_L1_1_PCIPM)
val |= PCI_L1SS_CTL1_PCIPM_L1_1;
if (state & ASPM_STATE_L1_2_PCIPM)
val |= PCI_L1SS_CTL1_PCIPM_L1_2;
/* Enable what we need to enable */
pci_clear_and_set_dword(parent, up_cap_ptr + PCI_L1SS_CTL1,
PCI_L1SS_CAP_L1_PM_SS, val);
pci_clear_and_set_dword(child, dw_cap_ptr + PCI_L1SS_CTL1,
PCI_L1SS_CAP_L1_PM_SS, val);
}
static void pcie_config_aspm_dev(struct pci_dev *pdev, u32 val)
{
pcie_capability_clear_and_set_word(pdev, PCI_EXP_LNKCTL,
PCI_EXP_LNKCTL_ASPMC, val);
}
static void pcie_config_aspm_link(struct pcie_link_state *link, u32 state)
{
u32 upstream = 0, dwstream = 0;
struct pci_dev *child = link->downstream, *parent = link->pdev;
struct pci_bus *linkbus = parent->subordinate;
/* Enable only the states that were not explicitly disabled */
state &= (link->aspm_capable & ~link->aspm_disable);
/* Can't enable any substates if L1 is not enabled */
if (!(state & ASPM_STATE_L1))
state &= ~ASPM_STATE_L1SS;
/* Spec says both ports must be in D0 before enabling PCI PM substates*/
if (parent->current_state != PCI_D0 || child->current_state != PCI_D0) {
state &= ~ASPM_STATE_L1_SS_PCIPM;
state |= (link->aspm_enabled & ASPM_STATE_L1_SS_PCIPM);
}
/* Nothing to do if the link is already in the requested state */
if (link->aspm_enabled == state)
return;
/* Convert ASPM state to upstream/downstream ASPM register state */
if (state & ASPM_STATE_L0S_UP)
dwstream |= PCI_EXP_LNKCTL_ASPM_L0S;
if (state & ASPM_STATE_L0S_DW)
upstream |= PCI_EXP_LNKCTL_ASPM_L0S;
if (state & ASPM_STATE_L1) {
upstream |= PCI_EXP_LNKCTL_ASPM_L1;
dwstream |= PCI_EXP_LNKCTL_ASPM_L1;
}
if (link->aspm_capable & ASPM_STATE_L1SS)
pcie_config_aspm_l1ss(link, state);
/*
* Spec 2.0 suggests all functions should be configured the
* same setting for ASPM. Enabling ASPM L1 should be done in
* upstream component first and then downstream, and vice
* versa for disabling ASPM L1. Spec doesn't mention L0S.
*/
if (state & ASPM_STATE_L1)
pcie_config_aspm_dev(parent, upstream);
list_for_each_entry(child, &linkbus->devices, bus_list)
pcie_config_aspm_dev(child, dwstream);
if (!(state & ASPM_STATE_L1))
pcie_config_aspm_dev(parent, upstream);
link->aspm_enabled = state;
}
static void pcie_config_aspm_path(struct pcie_link_state *link)
{
while (link) {
pcie_config_aspm_link(link, policy_to_aspm_state(link));
link = link->parent;
}
}
static void free_link_state(struct pcie_link_state *link)
{
link->pdev->link_state = NULL;
kfree(link);
}
static int pcie_aspm_sanity_check(struct pci_dev *pdev)
{
struct pci_dev *child;
u32 reg32;
/*
* Some functions in a slot might not all be PCIe functions,
* very strange. Disable ASPM for the whole slot
*/
list_for_each_entry(child, &pdev->subordinate->devices, bus_list) {
if (!pci_is_pcie(child))
return -EINVAL;
/*
* If ASPM is disabled then we're not going to change
* the BIOS state. It's safe to continue even if it's a
* pre-1.1 device
*/
if (aspm_disabled)
continue;
/*
* Disable ASPM for pre-1.1 PCIe device, we follow MS to use
* RBER bit to determine if a function is 1.1 version device
*/
pcie_capability_read_dword(child, PCI_EXP_DEVCAP, &reg32);
if (!(reg32 & PCI_EXP_DEVCAP_RBER) && !aspm_force) {
dev_info(&child->dev, "disabling ASPM on pre-1.1 PCIe device. You can enable it with 'pcie_aspm=force'\n");
return -EINVAL;
}
}
return 0;
}
static struct pcie_link_state *alloc_pcie_link_state(struct pci_dev *pdev)
{
struct pcie_link_state *link;
link = kzalloc(sizeof(*link), GFP_KERNEL);
if (!link)
return NULL;
INIT_LIST_HEAD(&link->sibling);
INIT_LIST_HEAD(&link->children);
INIT_LIST_HEAD(&link->link);
link->pdev = pdev;
PCI/ASPM: Always set link->downstream to avoid NULL dereference on remove We call pcie_aspm_exit_link_state() when we remove a device. If the device is the last PCIe function to be removed below a bridge and the bridge has an ASPM link_state struct, we disable ASPM on the link. Disabling ASPM requires link->downstream (used in pcie_config_aspm_link()). We previously set link->downstream in pcie_aspm_cap_init(), but only if the device was not blacklisted. Removing the blacklisted device caused a NULL pointer dereference in the pcie_aspm_exit_link_state() -> pcie_config_aspm_link() path: # echo 1 > /sys/bus/pci/devices/0000\:0b\:00.0/remove ... BUG: unable to handle kernel NULL pointer dereference at 0000000000000080 IP: pcie_config_aspm_link+0x5d/0x2b0 Call Trace: pcie_aspm_exit_link_state+0x75/0x130 pci_stop_bus_device+0xa4/0xb0 pci_stop_and_remove_bus_device_locked+0x1a/0x30 remove_store+0x50/0x70 dev_attr_store+0x18/0x30 sysfs_kf_write+0x44/0x60 kernfs_fop_write+0x10e/0x190 __vfs_write+0x28/0x110 ? rcu_read_lock_sched_held+0x5d/0x80 ? rcu_sync_lockdep_assert+0x2c/0x60 ? __sb_start_write+0x173/0x1a0 ? vfs_write+0xb3/0x180 vfs_write+0xc4/0x180 SyS_write+0x49/0xa0 do_syscall_64+0xa6/0x1c0 entry_SYSCALL64_slow_path+0x25/0x25 ---[ end trace bd187ee0267df5d9 ]--- To avoid this, set link->downstream in alloc_pcie_link_state(), so every pcie_link_state structure has a valid link->downstream pointer. [bhelgaas: changelog] Signed-off-by: Yinghai Lu <yinghai@kernel.org> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> Acked-by: Rajat Jain <rajatja@google.com> CC: stable@vger.kernel.org
2017-03-01 15:25:40 +07:00
link->downstream = pci_function_0(pdev->subordinate);
/*
* Root Ports and PCI/PCI-X to PCIe Bridges are roots of PCIe
* hierarchies.
*/
if (pci_pcie_type(pdev) == PCI_EXP_TYPE_ROOT_PORT ||
pci_pcie_type(pdev) == PCI_EXP_TYPE_PCIE_BRIDGE) {
link->root = link;
} else {
struct pcie_link_state *parent;
parent = pdev->bus->parent->self->link_state;
if (!parent) {
kfree(link);
return NULL;
}
link->parent = parent;
link->root = link->parent->root;
list_add(&link->link, &parent->children);
}
list_add(&link->sibling, &link_list);
pdev->link_state = link;
return link;
}
/*
* pcie_aspm_init_link_state: Initiate PCI express link state.
* It is called after the pcie and its children devices are scanned.
* @pdev: the root port or switch downstream port
*/
void pcie_aspm_init_link_state(struct pci_dev *pdev)
{
struct pcie_link_state *link;
int blacklist = !!pcie_aspm_sanity_check(pdev);
if (!aspm_support_enabled)
return;
PCI/ASPM: Use dev->has_secondary_link to find downstream links We allocate pcie_link_state for the component at the upstream end of a Link. Previously we did this by allocating pcie_link_state for Root Ports and Downstream Ports. This works fine for the typical topology: 00:1c.0 Root Port [bridge to bus 02] 02:00.0 Upstream Port [bridge to bus 03] 03:00.0 Downstream Port [bridge to bus 04] 04:00.0 Endpoint or Switch Port However, it is possible to have a Root Port connected to a Downstream Port instead of an Upstream Port, as in Robert White's ATCA system: 00:1c.0 Root Port [bridge to bus 02] 02:00.0 Downstream Port [bridge to bus 03] 03:01.0 Downstream Port [bridge to bus 04] 04:00.0 Endpoint or Switch Port In this topology, we wrongly allocated pcie_link_state for the 02:00.0 Downstream Port, which is actually the *downstream* end of a link. This led to the following NULL pointer dereference when we tried to connect this link into the tree of links starting at the 00:1c.0 Root Port: BUG: unable to handle kernel NULL pointer dereference at 0000000000000088 IP: [<ffffffff81550324>] pcie_aspm_init_link_state+0x744/0x850 Hardware name: Kontron B3001/B3001, BIOS 4.6.3 08/07/2012 Call Trace: [<ffffffff8153b865>] pci_scan_slot+0xd5/0x120 [<ffffffff8153ca1d>] pci_scan_child_bus+0x2d/0xd0 ... Instead of relying on the component type to identify the upstream end of a link, use the "dev->has_secondary_link" field. This means it's now possible for an Upstream Port to have a link on its secondary side, so alloc_pcie_link_state() needs to connect links originating from both Upstream and Downstream Ports into the tree. [bhelgaas: changelog, add comment] Link: https://bugzilla.kernel.org/show_bug.cgi?id=94361 Link: http://lkml.kernel.org/r/54EB81B2.4050904@pobox.com Reported-by: Robert White <rwhite@pobox.com> Signed-off-by: Yijing Wang <wangyijing@huawei.com> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
2015-05-21 14:05:03 +07:00
if (pdev->link_state)
return;
PCI/ASPM: Use dev->has_secondary_link to find downstream links We allocate pcie_link_state for the component at the upstream end of a Link. Previously we did this by allocating pcie_link_state for Root Ports and Downstream Ports. This works fine for the typical topology: 00:1c.0 Root Port [bridge to bus 02] 02:00.0 Upstream Port [bridge to bus 03] 03:00.0 Downstream Port [bridge to bus 04] 04:00.0 Endpoint or Switch Port However, it is possible to have a Root Port connected to a Downstream Port instead of an Upstream Port, as in Robert White's ATCA system: 00:1c.0 Root Port [bridge to bus 02] 02:00.0 Downstream Port [bridge to bus 03] 03:01.0 Downstream Port [bridge to bus 04] 04:00.0 Endpoint or Switch Port In this topology, we wrongly allocated pcie_link_state for the 02:00.0 Downstream Port, which is actually the *downstream* end of a link. This led to the following NULL pointer dereference when we tried to connect this link into the tree of links starting at the 00:1c.0 Root Port: BUG: unable to handle kernel NULL pointer dereference at 0000000000000088 IP: [<ffffffff81550324>] pcie_aspm_init_link_state+0x744/0x850 Hardware name: Kontron B3001/B3001, BIOS 4.6.3 08/07/2012 Call Trace: [<ffffffff8153b865>] pci_scan_slot+0xd5/0x120 [<ffffffff8153ca1d>] pci_scan_child_bus+0x2d/0xd0 ... Instead of relying on the component type to identify the upstream end of a link, use the "dev->has_secondary_link" field. This means it's now possible for an Upstream Port to have a link on its secondary side, so alloc_pcie_link_state() needs to connect links originating from both Upstream and Downstream Ports into the tree. [bhelgaas: changelog, add comment] Link: https://bugzilla.kernel.org/show_bug.cgi?id=94361 Link: http://lkml.kernel.org/r/54EB81B2.4050904@pobox.com Reported-by: Robert White <rwhite@pobox.com> Signed-off-by: Yijing Wang <wangyijing@huawei.com> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
2015-05-21 14:05:03 +07:00
/*
* We allocate pcie_link_state for the component on the upstream
* end of a Link, so there's nothing to do unless this device has a
* Link on its secondary side.
*/
if (!pdev->has_secondary_link)
return;
/* VIA has a strange chipset, root port is under a bridge */
if (pci_pcie_type(pdev) == PCI_EXP_TYPE_ROOT_PORT &&
pdev->bus->self)
return;
down_read(&pci_bus_sem);
if (list_empty(&pdev->subordinate->devices))
goto out;
mutex_lock(&aspm_lock);
link = alloc_pcie_link_state(pdev);
if (!link)
goto unlock;
/*
* Setup initial ASPM state. Note that we need to configure
* upstream links also because capable state of them can be
* update through pcie_aspm_cap_init().
*/
pcie_aspm_cap_init(link, blacklist);
/* Setup initial Clock PM state */
pcie_clkpm_cap_init(link, blacklist);
/*
* At this stage drivers haven't had an opportunity to change the
* link policy setting. Enabling ASPM on broken hardware can cripple
* it even before the driver has had a chance to disable ASPM, so
* default to a safe level right now. If we're enabling ASPM beyond
* the BIOS's expectation, we'll do so once pci_enable_device() is
* called.
*/
if (aspm_policy != POLICY_POWERSAVE &&
aspm_policy != POLICY_POWER_SUPERSAVE) {
pcie_config_aspm_path(link);
pcie_set_clkpm(link, policy_to_clkpm_state(link));
}
unlock:
mutex_unlock(&aspm_lock);
out:
up_read(&pci_bus_sem);
}
/* Recheck latencies and update aspm_capable for links under the root */
static void pcie_update_aspm_capable(struct pcie_link_state *root)
{
struct pcie_link_state *link;
BUG_ON(root->parent);
list_for_each_entry(link, &link_list, sibling) {
if (link->root != root)
continue;
link->aspm_capable = link->aspm_support;
}
list_for_each_entry(link, &link_list, sibling) {
struct pci_dev *child;
struct pci_bus *linkbus = link->pdev->subordinate;
if (link->root != root)
continue;
list_for_each_entry(child, &linkbus->devices, bus_list) {
if ((pci_pcie_type(child) != PCI_EXP_TYPE_ENDPOINT) &&
(pci_pcie_type(child) != PCI_EXP_TYPE_LEG_END))
continue;
pcie_aspm_check_latency(child);
}
}
}
/* @pdev: the endpoint device */
void pcie_aspm_exit_link_state(struct pci_dev *pdev)
{
struct pci_dev *parent = pdev->bus->self;
struct pcie_link_state *link, *root, *parent_link;
if (!parent || !parent->link_state)
return;
down_read(&pci_bus_sem);
mutex_lock(&aspm_lock);
/*
* All PCIe functions are in one slot, remove one function will remove
* the whole slot, so just wait until we are the last function left.
*/
if (!list_is_last(&pdev->bus_list, &parent->subordinate->devices))
goto out;
link = parent->link_state;
root = link->root;
parent_link = link->parent;
/* All functions are removed, so just disable ASPM for the link */
pcie_config_aspm_link(link, 0);
list_del(&link->sibling);
list_del(&link->link);
/* Clock PM is for endpoint device */
free_link_state(link);
/* Recheck latencies and configure upstream links */
PCI: fix BUG_ON triggered by logical PCIe root port removal This problem happened when removing PCIe root port using PCI logical hotplug operation. The immediate cause of this problem is that the pointer to invalid data structure is passed to pcie_update_aspm_capable() by pcie_aspm_exit_link_state(). When pcie_aspm_exit_link_state() received a pointer to root port link, it unconfigures the root port link and frees its data structure at first. At this point, there are not links to configure under the root port and the data structure for root port link is already freed. So pcie_aspm_exit_link_state() must not call pcie_update_aspm_capable() and pcie_config_aspm_path(). This patch fixes the problem by changing pcie_aspm_exit_link_state() not to call pcie_update_aspm_capable() and pcie_config_aspm_path() if the specified link is root port link. ------------[ cut here ]------------ kernel BUG at drivers/pci/pcie/aspm.c:606! invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC last sysfs file: /sys/devices/pci0000:40/0000:40:13.0/remove CPU 1 Modules linked in: shpchp Pid: 9345, comm: sysfsd Not tainted 2.6.32-rc5 #98 ProLiant DL785 G6 RIP: 0010:[<ffffffff811df69b>] [<ffffffff811df69b>] pcie_update_aspm_capable+0x15/0xbe RSP: 0018:ffff88082a2f5ca0 EFLAGS: 00010202 RAX: 0000000000000e77 RBX: ffff88182cc3e000 RCX: ffff88082a33d006 RDX: 0000000000000001 RSI: ffffffff811dff4a RDI: ffff88182cc3e000 RBP: ffff88082a2f5cc0 R08: ffff88182cc3e000 R09: 0000000000000000 R10: ffff88182fc00180 R11: ffff88182fc00198 R12: ffff88182cc3e000 R13: 0000000000000000 R14: ffff88182cc3e000 R15: ffff88082a2f5e20 FS: 00007f259a64b6f0(0000) GS:ffff880864600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b CR2: 00007feb53f73da0 CR3: 000000102cc94000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process sysfsd (pid: 9345, threadinfo ffff88082a2f4000, task ffff88082a33cf00) Stack: ffff88182cc3e000 ffff88182cc3e000 0000000000000000 ffff88082a33cf00 <0> ffff88082a2f5cf0 ffffffff811dff52 ffff88082a2f5cf0 ffff88082c525168 <0> ffff88402c9fd2f8 ffff88402c9fd2f8 ffff88082a2f5d20 ffffffff811d7db2 Call Trace: [<ffffffff811dff52>] pcie_aspm_exit_link_state+0xf5/0x11e [<ffffffff811d7db2>] pci_stop_bus_device+0x76/0x7e [<ffffffff811d7d67>] pci_stop_bus_device+0x2b/0x7e [<ffffffff811d7e4f>] pci_remove_bus_device+0x15/0xb9 [<ffffffff811dcb8c>] remove_callback+0x29/0x3a [<ffffffff81135aeb>] sysfs_schedule_callback_work+0x15/0x6d [<ffffffff81072790>] worker_thread+0x19d/0x298 [<ffffffff8107273b>] ? worker_thread+0x148/0x298 [<ffffffff81135ad6>] ? sysfs_schedule_callback_work+0x0/0x6d [<ffffffff810765c0>] ? autoremove_wake_function+0x0/0x38 [<ffffffff810725f3>] ? worker_thread+0x0/0x298 [<ffffffff8107629e>] kthread+0x7d/0x85 [<ffffffff8102eafa>] child_rip+0xa/0x20 [<ffffffff8102e4bc>] ? restore_args+0x0/0x30 [<ffffffff81076221>] ? kthread+0x0/0x85 [<ffffffff8102eaf0>] ? child_rip+0x0/0x20 Code: 89 e5 8a 50 48 31 c0 c0 ea 03 83 e2 07 e8 b2 de fe ff c9 48 98 c3 55 48 89 e5 41 56 49 89 fe 41 55 41 54 53 48 83 7f 10 00 74 04 <0f> 0b eb fe 48 8b 05 da 7d 63 00 4c 8d 60 e8 4c 89 e1 eb 24 4c RIP [<ffffffff811df69b>] pcie_update_aspm_capable+0x15/0xbe RSP <ffff88082a2f5ca0> ---[ end trace 6ae0f65bdeab8555 ]--- Reported-by: Alex Chiang <achiang@hp.com> Tested-by: Alex Chiang <achiang@hp.com> Signed-off-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com> Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2009-11-06 09:25:13 +07:00
if (parent_link) {
pcie_update_aspm_capable(root);
pcie_config_aspm_path(parent_link);
}
out:
mutex_unlock(&aspm_lock);
up_read(&pci_bus_sem);
}
/* @pdev: the root port or switch downstream port */
void pcie_aspm_pm_state_change(struct pci_dev *pdev)
{
struct pcie_link_state *link = pdev->link_state;
if (aspm_disabled || !link)
return;
/*
* Devices changed PM state, we should recheck if latency
* meets all functions' requirement
*/
down_read(&pci_bus_sem);
mutex_lock(&aspm_lock);
pcie_update_aspm_capable(link->root);
pcie_config_aspm_path(link);
mutex_unlock(&aspm_lock);
up_read(&pci_bus_sem);
}
PCI: PCIe links may not get configured for ASPM under POWERSAVE mode v3 -> v2: Moved ASPM enabling logic to pci_set_power_state() v2 -> v1: Preserved the logic in pci_raw_set_power_state() : Added ASPM enabling logic after scanning Root Bridge : http://marc.info/?l=linux-pci&m=130046996216391&w=2 v1 : http://marc.info/?l=linux-pci&m=130013164703283&w=2 The assumption made in commit 41cd766b065970ff6f6c89dd1cf55fa706c84a3d (PCI: Don't enable aspm before drivers have had a chance to veto it) that pci_enable_device() will result in re-configuring ASPM when aspm_policy is POWERSAVE is no longer valid. This is due to commit 97c145f7c87453cec90e91238fba5fe2c1561b32 (PCI: read current power state at enable time) which resets dev->current_state to D0. Due to this the call to pcie_aspm_pm_state_change() is never made. Note the equality check (below) that returns early: ./drivers/pci/pci.c: pci_raw_set_pci_power_state() 546 /* Check if we're already there */ 547 if (dev->current_state == state) 548 return 0; Therefore OSPM never configures the PCIe links for ASPM to turn them "on". Fix it by configuring ASPM from the pci_enable_device() code path. This also allows a driver such as the e1000e networking driver a chance to disable ASPM (L0s, L1), if need be, prior to enabling the device. A driver may perform this action if the device is known to mis-behave wrt ASPM. Signed-off-by: Naga Chumbalkar <nagananda.chumbalkar@hp.com> Acked-by: Rafael J. Wysocki <rjw@sisk.pl> Cc: Matthew Garrett <mjg59@srcf.ucam.org> Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-03-21 10:29:08 +07:00
void pcie_aspm_powersave_config_link(struct pci_dev *pdev)
{
struct pcie_link_state *link = pdev->link_state;
if (aspm_disabled || !link)
PCI: PCIe links may not get configured for ASPM under POWERSAVE mode v3 -> v2: Moved ASPM enabling logic to pci_set_power_state() v2 -> v1: Preserved the logic in pci_raw_set_power_state() : Added ASPM enabling logic after scanning Root Bridge : http://marc.info/?l=linux-pci&m=130046996216391&w=2 v1 : http://marc.info/?l=linux-pci&m=130013164703283&w=2 The assumption made in commit 41cd766b065970ff6f6c89dd1cf55fa706c84a3d (PCI: Don't enable aspm before drivers have had a chance to veto it) that pci_enable_device() will result in re-configuring ASPM when aspm_policy is POWERSAVE is no longer valid. This is due to commit 97c145f7c87453cec90e91238fba5fe2c1561b32 (PCI: read current power state at enable time) which resets dev->current_state to D0. Due to this the call to pcie_aspm_pm_state_change() is never made. Note the equality check (below) that returns early: ./drivers/pci/pci.c: pci_raw_set_pci_power_state() 546 /* Check if we're already there */ 547 if (dev->current_state == state) 548 return 0; Therefore OSPM never configures the PCIe links for ASPM to turn them "on". Fix it by configuring ASPM from the pci_enable_device() code path. This also allows a driver such as the e1000e networking driver a chance to disable ASPM (L0s, L1), if need be, prior to enabling the device. A driver may perform this action if the device is known to mis-behave wrt ASPM. Signed-off-by: Naga Chumbalkar <nagananda.chumbalkar@hp.com> Acked-by: Rafael J. Wysocki <rjw@sisk.pl> Cc: Matthew Garrett <mjg59@srcf.ucam.org> Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-03-21 10:29:08 +07:00
return;
if (aspm_policy != POLICY_POWERSAVE &&
aspm_policy != POLICY_POWER_SUPERSAVE)
PCI: PCIe links may not get configured for ASPM under POWERSAVE mode v3 -> v2: Moved ASPM enabling logic to pci_set_power_state() v2 -> v1: Preserved the logic in pci_raw_set_power_state() : Added ASPM enabling logic after scanning Root Bridge : http://marc.info/?l=linux-pci&m=130046996216391&w=2 v1 : http://marc.info/?l=linux-pci&m=130013164703283&w=2 The assumption made in commit 41cd766b065970ff6f6c89dd1cf55fa706c84a3d (PCI: Don't enable aspm before drivers have had a chance to veto it) that pci_enable_device() will result in re-configuring ASPM when aspm_policy is POWERSAVE is no longer valid. This is due to commit 97c145f7c87453cec90e91238fba5fe2c1561b32 (PCI: read current power state at enable time) which resets dev->current_state to D0. Due to this the call to pcie_aspm_pm_state_change() is never made. Note the equality check (below) that returns early: ./drivers/pci/pci.c: pci_raw_set_pci_power_state() 546 /* Check if we're already there */ 547 if (dev->current_state == state) 548 return 0; Therefore OSPM never configures the PCIe links for ASPM to turn them "on". Fix it by configuring ASPM from the pci_enable_device() code path. This also allows a driver such as the e1000e networking driver a chance to disable ASPM (L0s, L1), if need be, prior to enabling the device. A driver may perform this action if the device is known to mis-behave wrt ASPM. Signed-off-by: Naga Chumbalkar <nagananda.chumbalkar@hp.com> Acked-by: Rafael J. Wysocki <rjw@sisk.pl> Cc: Matthew Garrett <mjg59@srcf.ucam.org> Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-03-21 10:29:08 +07:00
return;
down_read(&pci_bus_sem);
mutex_lock(&aspm_lock);
pcie_config_aspm_path(link);
pcie_set_clkpm(link, policy_to_clkpm_state(link));
mutex_unlock(&aspm_lock);
up_read(&pci_bus_sem);
}
static void __pci_disable_link_state(struct pci_dev *pdev, int state, bool sem)
{
struct pci_dev *parent = pdev->bus->self;
struct pcie_link_state *link;
if (!pci_is_pcie(pdev))
return;
PCI/ASPM: Use dev->has_secondary_link to find downstream links We allocate pcie_link_state for the component at the upstream end of a Link. Previously we did this by allocating pcie_link_state for Root Ports and Downstream Ports. This works fine for the typical topology: 00:1c.0 Root Port [bridge to bus 02] 02:00.0 Upstream Port [bridge to bus 03] 03:00.0 Downstream Port [bridge to bus 04] 04:00.0 Endpoint or Switch Port However, it is possible to have a Root Port connected to a Downstream Port instead of an Upstream Port, as in Robert White's ATCA system: 00:1c.0 Root Port [bridge to bus 02] 02:00.0 Downstream Port [bridge to bus 03] 03:01.0 Downstream Port [bridge to bus 04] 04:00.0 Endpoint or Switch Port In this topology, we wrongly allocated pcie_link_state for the 02:00.0 Downstream Port, which is actually the *downstream* end of a link. This led to the following NULL pointer dereference when we tried to connect this link into the tree of links starting at the 00:1c.0 Root Port: BUG: unable to handle kernel NULL pointer dereference at 0000000000000088 IP: [<ffffffff81550324>] pcie_aspm_init_link_state+0x744/0x850 Hardware name: Kontron B3001/B3001, BIOS 4.6.3 08/07/2012 Call Trace: [<ffffffff8153b865>] pci_scan_slot+0xd5/0x120 [<ffffffff8153ca1d>] pci_scan_child_bus+0x2d/0xd0 ... Instead of relying on the component type to identify the upstream end of a link, use the "dev->has_secondary_link" field. This means it's now possible for an Upstream Port to have a link on its secondary side, so alloc_pcie_link_state() needs to connect links originating from both Upstream and Downstream Ports into the tree. [bhelgaas: changelog, add comment] Link: https://bugzilla.kernel.org/show_bug.cgi?id=94361 Link: http://lkml.kernel.org/r/54EB81B2.4050904@pobox.com Reported-by: Robert White <rwhite@pobox.com> Signed-off-by: Yijing Wang <wangyijing@huawei.com> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
2015-05-21 14:05:03 +07:00
if (pdev->has_secondary_link)
parent = pdev;
if (!parent || !parent->link_state)
return;
/*
* A driver requested that ASPM be disabled on this device, but
* if we don't have permission to manage ASPM (e.g., on ACPI
* systems we have to observe the FADT ACPI_FADT_NO_ASPM bit and
* the _OSC method), we can't honor that request. Windows has
* a similar mechanism using "PciASPMOptOut", which is also
* ignored in this situation.
*/
if (aspm_disabled) {
dev_warn(&pdev->dev, "can't disable ASPM; OS doesn't have ASPM control\n");
return;
}
PCI/e1000e: Add and use pci_disable_link_state_locked() Need to use it in _e1000e_disable_aspm. This routine is used for error recovery, where the pci_bus_sem is already held, and we don't want pci_disable_link_state to try to take it again. So add a locked variant for use in cases like this. Found lock up: [ 2374.654557] kworker/32:1 D ffff881027f6b0f0 0 6075 2 0x00000000 [ 2374.654816] ffff88503f099a68 0000000000000046 ffff88503f098000 0000000000004000 [ 2374.654837] 00000000001d1ec0 ffff88503f099fd8 00000000001d1ec0 ffff88503f099fd8 [ 2374.654860] 0000000000004000 00000000001d1ec0 ffff88503dcc8000 ffff88503f090000 [ 2374.654880] Call Trace: [ 2374.654898] [<ffffffff810b1302>] ? __lock_acquired+0x3a/0x224 [ 2374.654914] [<ffffffff81c2b59c>] ? _raw_spin_unlock_irq+0x30/0x36 [ 2374.654925] [<ffffffff810b069d>] ? trace_hardirqs_on_caller+0x1f/0x178 [ 2374.654936] [<ffffffff81c2ab24>] rwsem_down_failed_common+0xd3/0x103 [ 2374.654945] [<ffffffff810b158f>] ? __lock_contended+0x3a/0x2a2 [ 2374.654955] [<ffffffff81c2ab7b>] rwsem_down_read_failed+0x12/0x14 [ 2374.654967] [<ffffffff813371e4>] call_rwsem_down_read_failed+0x14/0x30 [ 2374.654981] [<ffffffff8135df20>] ? pci_disable_link_state+0x5f/0xf5 [ 2374.654990] [<ffffffff81c2a0e6>] ? down_read+0x7e/0x91 [ 2374.654999] [<ffffffff8135df20>] ? pci_disable_link_state+0x5f/0xf5 [ 2374.655008] [<ffffffff8135df20>] pci_disable_link_state+0x5f/0xf5 [ 2374.655024] [<ffffffff81661796>] e1000e_disable_aspm+0x55/0x5a [ 2374.655037] [<ffffffff816677eb>] e1000_io_slot_reset+0x59/0xea [ 2374.655048] [<ffffffff8135fe0d>] ? report_mmio_enabled+0x5d/0x5d [ 2374.655057] [<ffffffff8135fe3b>] report_slot_reset+0x2e/0x5d [ 2374.655072] [<ffffffff8135369e>] pci_walk_bus+0x8a/0xb7 [ 2374.655081] [<ffffffff8135fe0d>] ? report_mmio_enabled+0x5d/0x5d [ 2374.655091] [<ffffffff813603be>] broadcast_error_message+0xa4/0xb2 [ 2374.655101] [<ffffffff81352c71>] ? pci_bus_read_config_dword+0x72/0x80 [ 2374.655110] [<ffffffff813606df>] do_recovery+0x9e/0xf9 [ 2374.655120] [<ffffffff81360786>] handle_error_source+0x4c/0x51 [ 2374.655129] [<ffffffff81360974>] aer_isr_one_error+0x1e9/0x21a [ 2374.655138] [<ffffffff81360a6c>] aer_isr+0xc7/0xcc [ 2374.655147] [<ffffffff813609a5>] ? aer_isr_one_error+0x21a/0x21a [ 2374.655159] [<ffffffff81096d9f>] process_one_work+0x237/0x3ec [ 2374.655168] [<ffffffff81096d10>] ? process_one_work+0x1a8/0x3ec [ 2374.655178] [<ffffffff8109728d>] worker_thread+0x17c/0x240 [ 2374.655186] [<ffffffff810b0803>] ? trace_hardirqs_on+0xd/0xf [ 2374.655196] [<ffffffff81097111>] ? manage_workers+0xab/0xab [ 2374.655209] [<ffffffff8109c8ed>] kthread+0xa0/0xa8 [ 2374.655223] [<ffffffff81c332d4>] kernel_thread_helper+0x4/0x10 [ 2374.655232] [<ffffffff81c2b880>] ? retint_restore_args+0xe/0xe [ 2374.655243] [<ffffffff8109c84d>] ? __init_kthread_worker+0x5b/0x5b [ 2374.655252] [<ffffffff81c332d0>] ? gs_change+0xb/0xb when aer happens, pci_walk_bus already have down_read(&pci_bus_sem)... then report_slot_reset ==> e1000_io_slot_reset ==> e1000e_disable_aspm ==> pci_disable_link_state... We can not use pci_disable_link_state, and it will try to hold pci_bus_sem again. Try to have __pci_disable_link_state that will not need to hold pci_bus_sem. -v2: change name to pci_disable_link_state_locked() according to Jesse. [jbarnes: make sure new function is exported for modules] Signed-off-by: Yinghai Lu <yinghai@kernel.org> Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-05-13 07:11:47 +07:00
if (sem)
down_read(&pci_bus_sem);
mutex_lock(&aspm_lock);
link = parent->link_state;
if (state & PCIE_LINK_STATE_L0S)
link->aspm_disable |= ASPM_STATE_L0S;
if (state & PCIE_LINK_STATE_L1)
link->aspm_disable |= ASPM_STATE_L1;
pcie_config_aspm_link(link, policy_to_aspm_state(link));
if (state & PCIE_LINK_STATE_CLKPM) {
link->clkpm_capable = 0;
pcie_set_clkpm(link, 0);
}
mutex_unlock(&aspm_lock);
PCI/e1000e: Add and use pci_disable_link_state_locked() Need to use it in _e1000e_disable_aspm. This routine is used for error recovery, where the pci_bus_sem is already held, and we don't want pci_disable_link_state to try to take it again. So add a locked variant for use in cases like this. Found lock up: [ 2374.654557] kworker/32:1 D ffff881027f6b0f0 0 6075 2 0x00000000 [ 2374.654816] ffff88503f099a68 0000000000000046 ffff88503f098000 0000000000004000 [ 2374.654837] 00000000001d1ec0 ffff88503f099fd8 00000000001d1ec0 ffff88503f099fd8 [ 2374.654860] 0000000000004000 00000000001d1ec0 ffff88503dcc8000 ffff88503f090000 [ 2374.654880] Call Trace: [ 2374.654898] [<ffffffff810b1302>] ? __lock_acquired+0x3a/0x224 [ 2374.654914] [<ffffffff81c2b59c>] ? _raw_spin_unlock_irq+0x30/0x36 [ 2374.654925] [<ffffffff810b069d>] ? trace_hardirqs_on_caller+0x1f/0x178 [ 2374.654936] [<ffffffff81c2ab24>] rwsem_down_failed_common+0xd3/0x103 [ 2374.654945] [<ffffffff810b158f>] ? __lock_contended+0x3a/0x2a2 [ 2374.654955] [<ffffffff81c2ab7b>] rwsem_down_read_failed+0x12/0x14 [ 2374.654967] [<ffffffff813371e4>] call_rwsem_down_read_failed+0x14/0x30 [ 2374.654981] [<ffffffff8135df20>] ? pci_disable_link_state+0x5f/0xf5 [ 2374.654990] [<ffffffff81c2a0e6>] ? down_read+0x7e/0x91 [ 2374.654999] [<ffffffff8135df20>] ? pci_disable_link_state+0x5f/0xf5 [ 2374.655008] [<ffffffff8135df20>] pci_disable_link_state+0x5f/0xf5 [ 2374.655024] [<ffffffff81661796>] e1000e_disable_aspm+0x55/0x5a [ 2374.655037] [<ffffffff816677eb>] e1000_io_slot_reset+0x59/0xea [ 2374.655048] [<ffffffff8135fe0d>] ? report_mmio_enabled+0x5d/0x5d [ 2374.655057] [<ffffffff8135fe3b>] report_slot_reset+0x2e/0x5d [ 2374.655072] [<ffffffff8135369e>] pci_walk_bus+0x8a/0xb7 [ 2374.655081] [<ffffffff8135fe0d>] ? report_mmio_enabled+0x5d/0x5d [ 2374.655091] [<ffffffff813603be>] broadcast_error_message+0xa4/0xb2 [ 2374.655101] [<ffffffff81352c71>] ? pci_bus_read_config_dword+0x72/0x80 [ 2374.655110] [<ffffffff813606df>] do_recovery+0x9e/0xf9 [ 2374.655120] [<ffffffff81360786>] handle_error_source+0x4c/0x51 [ 2374.655129] [<ffffffff81360974>] aer_isr_one_error+0x1e9/0x21a [ 2374.655138] [<ffffffff81360a6c>] aer_isr+0xc7/0xcc [ 2374.655147] [<ffffffff813609a5>] ? aer_isr_one_error+0x21a/0x21a [ 2374.655159] [<ffffffff81096d9f>] process_one_work+0x237/0x3ec [ 2374.655168] [<ffffffff81096d10>] ? process_one_work+0x1a8/0x3ec [ 2374.655178] [<ffffffff8109728d>] worker_thread+0x17c/0x240 [ 2374.655186] [<ffffffff810b0803>] ? trace_hardirqs_on+0xd/0xf [ 2374.655196] [<ffffffff81097111>] ? manage_workers+0xab/0xab [ 2374.655209] [<ffffffff8109c8ed>] kthread+0xa0/0xa8 [ 2374.655223] [<ffffffff81c332d4>] kernel_thread_helper+0x4/0x10 [ 2374.655232] [<ffffffff81c2b880>] ? retint_restore_args+0xe/0xe [ 2374.655243] [<ffffffff8109c84d>] ? __init_kthread_worker+0x5b/0x5b [ 2374.655252] [<ffffffff81c332d0>] ? gs_change+0xb/0xb when aer happens, pci_walk_bus already have down_read(&pci_bus_sem)... then report_slot_reset ==> e1000_io_slot_reset ==> e1000e_disable_aspm ==> pci_disable_link_state... We can not use pci_disable_link_state, and it will try to hold pci_bus_sem again. Try to have __pci_disable_link_state that will not need to hold pci_bus_sem. -v2: change name to pci_disable_link_state_locked() according to Jesse. [jbarnes: make sure new function is exported for modules] Signed-off-by: Yinghai Lu <yinghai@kernel.org> Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-05-13 07:11:47 +07:00
if (sem)
up_read(&pci_bus_sem);
}
void pci_disable_link_state_locked(struct pci_dev *pdev, int state)
{
__pci_disable_link_state(pdev, state, false);
PCI/e1000e: Add and use pci_disable_link_state_locked() Need to use it in _e1000e_disable_aspm. This routine is used for error recovery, where the pci_bus_sem is already held, and we don't want pci_disable_link_state to try to take it again. So add a locked variant for use in cases like this. Found lock up: [ 2374.654557] kworker/32:1 D ffff881027f6b0f0 0 6075 2 0x00000000 [ 2374.654816] ffff88503f099a68 0000000000000046 ffff88503f098000 0000000000004000 [ 2374.654837] 00000000001d1ec0 ffff88503f099fd8 00000000001d1ec0 ffff88503f099fd8 [ 2374.654860] 0000000000004000 00000000001d1ec0 ffff88503dcc8000 ffff88503f090000 [ 2374.654880] Call Trace: [ 2374.654898] [<ffffffff810b1302>] ? __lock_acquired+0x3a/0x224 [ 2374.654914] [<ffffffff81c2b59c>] ? _raw_spin_unlock_irq+0x30/0x36 [ 2374.654925] [<ffffffff810b069d>] ? trace_hardirqs_on_caller+0x1f/0x178 [ 2374.654936] [<ffffffff81c2ab24>] rwsem_down_failed_common+0xd3/0x103 [ 2374.654945] [<ffffffff810b158f>] ? __lock_contended+0x3a/0x2a2 [ 2374.654955] [<ffffffff81c2ab7b>] rwsem_down_read_failed+0x12/0x14 [ 2374.654967] [<ffffffff813371e4>] call_rwsem_down_read_failed+0x14/0x30 [ 2374.654981] [<ffffffff8135df20>] ? pci_disable_link_state+0x5f/0xf5 [ 2374.654990] [<ffffffff81c2a0e6>] ? down_read+0x7e/0x91 [ 2374.654999] [<ffffffff8135df20>] ? pci_disable_link_state+0x5f/0xf5 [ 2374.655008] [<ffffffff8135df20>] pci_disable_link_state+0x5f/0xf5 [ 2374.655024] [<ffffffff81661796>] e1000e_disable_aspm+0x55/0x5a [ 2374.655037] [<ffffffff816677eb>] e1000_io_slot_reset+0x59/0xea [ 2374.655048] [<ffffffff8135fe0d>] ? report_mmio_enabled+0x5d/0x5d [ 2374.655057] [<ffffffff8135fe3b>] report_slot_reset+0x2e/0x5d [ 2374.655072] [<ffffffff8135369e>] pci_walk_bus+0x8a/0xb7 [ 2374.655081] [<ffffffff8135fe0d>] ? report_mmio_enabled+0x5d/0x5d [ 2374.655091] [<ffffffff813603be>] broadcast_error_message+0xa4/0xb2 [ 2374.655101] [<ffffffff81352c71>] ? pci_bus_read_config_dword+0x72/0x80 [ 2374.655110] [<ffffffff813606df>] do_recovery+0x9e/0xf9 [ 2374.655120] [<ffffffff81360786>] handle_error_source+0x4c/0x51 [ 2374.655129] [<ffffffff81360974>] aer_isr_one_error+0x1e9/0x21a [ 2374.655138] [<ffffffff81360a6c>] aer_isr+0xc7/0xcc [ 2374.655147] [<ffffffff813609a5>] ? aer_isr_one_error+0x21a/0x21a [ 2374.655159] [<ffffffff81096d9f>] process_one_work+0x237/0x3ec [ 2374.655168] [<ffffffff81096d10>] ? process_one_work+0x1a8/0x3ec [ 2374.655178] [<ffffffff8109728d>] worker_thread+0x17c/0x240 [ 2374.655186] [<ffffffff810b0803>] ? trace_hardirqs_on+0xd/0xf [ 2374.655196] [<ffffffff81097111>] ? manage_workers+0xab/0xab [ 2374.655209] [<ffffffff8109c8ed>] kthread+0xa0/0xa8 [ 2374.655223] [<ffffffff81c332d4>] kernel_thread_helper+0x4/0x10 [ 2374.655232] [<ffffffff81c2b880>] ? retint_restore_args+0xe/0xe [ 2374.655243] [<ffffffff8109c84d>] ? __init_kthread_worker+0x5b/0x5b [ 2374.655252] [<ffffffff81c332d0>] ? gs_change+0xb/0xb when aer happens, pci_walk_bus already have down_read(&pci_bus_sem)... then report_slot_reset ==> e1000_io_slot_reset ==> e1000e_disable_aspm ==> pci_disable_link_state... We can not use pci_disable_link_state, and it will try to hold pci_bus_sem again. Try to have __pci_disable_link_state that will not need to hold pci_bus_sem. -v2: change name to pci_disable_link_state_locked() according to Jesse. [jbarnes: make sure new function is exported for modules] Signed-off-by: Yinghai Lu <yinghai@kernel.org> Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-05-13 07:11:47 +07:00
}
EXPORT_SYMBOL(pci_disable_link_state_locked);
/**
* pci_disable_link_state - Disable device's link state, so the link will
* never enter specific states. Note that if the BIOS didn't grant ASPM
* control to the OS, this does nothing because we can't touch the LNKCTL
* register.
*
* @pdev: PCI device
* @state: ASPM link state to disable
*/
PCI/e1000e: Add and use pci_disable_link_state_locked() Need to use it in _e1000e_disable_aspm. This routine is used for error recovery, where the pci_bus_sem is already held, and we don't want pci_disable_link_state to try to take it again. So add a locked variant for use in cases like this. Found lock up: [ 2374.654557] kworker/32:1 D ffff881027f6b0f0 0 6075 2 0x00000000 [ 2374.654816] ffff88503f099a68 0000000000000046 ffff88503f098000 0000000000004000 [ 2374.654837] 00000000001d1ec0 ffff88503f099fd8 00000000001d1ec0 ffff88503f099fd8 [ 2374.654860] 0000000000004000 00000000001d1ec0 ffff88503dcc8000 ffff88503f090000 [ 2374.654880] Call Trace: [ 2374.654898] [<ffffffff810b1302>] ? __lock_acquired+0x3a/0x224 [ 2374.654914] [<ffffffff81c2b59c>] ? _raw_spin_unlock_irq+0x30/0x36 [ 2374.654925] [<ffffffff810b069d>] ? trace_hardirqs_on_caller+0x1f/0x178 [ 2374.654936] [<ffffffff81c2ab24>] rwsem_down_failed_common+0xd3/0x103 [ 2374.654945] [<ffffffff810b158f>] ? __lock_contended+0x3a/0x2a2 [ 2374.654955] [<ffffffff81c2ab7b>] rwsem_down_read_failed+0x12/0x14 [ 2374.654967] [<ffffffff813371e4>] call_rwsem_down_read_failed+0x14/0x30 [ 2374.654981] [<ffffffff8135df20>] ? pci_disable_link_state+0x5f/0xf5 [ 2374.654990] [<ffffffff81c2a0e6>] ? down_read+0x7e/0x91 [ 2374.654999] [<ffffffff8135df20>] ? pci_disable_link_state+0x5f/0xf5 [ 2374.655008] [<ffffffff8135df20>] pci_disable_link_state+0x5f/0xf5 [ 2374.655024] [<ffffffff81661796>] e1000e_disable_aspm+0x55/0x5a [ 2374.655037] [<ffffffff816677eb>] e1000_io_slot_reset+0x59/0xea [ 2374.655048] [<ffffffff8135fe0d>] ? report_mmio_enabled+0x5d/0x5d [ 2374.655057] [<ffffffff8135fe3b>] report_slot_reset+0x2e/0x5d [ 2374.655072] [<ffffffff8135369e>] pci_walk_bus+0x8a/0xb7 [ 2374.655081] [<ffffffff8135fe0d>] ? report_mmio_enabled+0x5d/0x5d [ 2374.655091] [<ffffffff813603be>] broadcast_error_message+0xa4/0xb2 [ 2374.655101] [<ffffffff81352c71>] ? pci_bus_read_config_dword+0x72/0x80 [ 2374.655110] [<ffffffff813606df>] do_recovery+0x9e/0xf9 [ 2374.655120] [<ffffffff81360786>] handle_error_source+0x4c/0x51 [ 2374.655129] [<ffffffff81360974>] aer_isr_one_error+0x1e9/0x21a [ 2374.655138] [<ffffffff81360a6c>] aer_isr+0xc7/0xcc [ 2374.655147] [<ffffffff813609a5>] ? aer_isr_one_error+0x21a/0x21a [ 2374.655159] [<ffffffff81096d9f>] process_one_work+0x237/0x3ec [ 2374.655168] [<ffffffff81096d10>] ? process_one_work+0x1a8/0x3ec [ 2374.655178] [<ffffffff8109728d>] worker_thread+0x17c/0x240 [ 2374.655186] [<ffffffff810b0803>] ? trace_hardirqs_on+0xd/0xf [ 2374.655196] [<ffffffff81097111>] ? manage_workers+0xab/0xab [ 2374.655209] [<ffffffff8109c8ed>] kthread+0xa0/0xa8 [ 2374.655223] [<ffffffff81c332d4>] kernel_thread_helper+0x4/0x10 [ 2374.655232] [<ffffffff81c2b880>] ? retint_restore_args+0xe/0xe [ 2374.655243] [<ffffffff8109c84d>] ? __init_kthread_worker+0x5b/0x5b [ 2374.655252] [<ffffffff81c332d0>] ? gs_change+0xb/0xb when aer happens, pci_walk_bus already have down_read(&pci_bus_sem)... then report_slot_reset ==> e1000_io_slot_reset ==> e1000e_disable_aspm ==> pci_disable_link_state... We can not use pci_disable_link_state, and it will try to hold pci_bus_sem again. Try to have __pci_disable_link_state that will not need to hold pci_bus_sem. -v2: change name to pci_disable_link_state_locked() according to Jesse. [jbarnes: make sure new function is exported for modules] Signed-off-by: Yinghai Lu <yinghai@kernel.org> Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-05-13 07:11:47 +07:00
void pci_disable_link_state(struct pci_dev *pdev, int state)
{
__pci_disable_link_state(pdev, state, true);
}
EXPORT_SYMBOL(pci_disable_link_state);
static int pcie_aspm_set_policy(const char *val, struct kernel_param *kp)
{
int i;
struct pcie_link_state *link;
PCI: Changing ASPM policy, via /sys, to POWERSAVE could cause NMIs v3 -> v2: Modified the text that describes the problem v2 -> v1: Returned -EPERM v1 : http://marc.info/?l=linux-pci&m=130013194803727&w=2 For servers whose hardware cannot handle ASPM the BIOS ought to set the FADT bit shown below: In Sec 5.2.9.3 (IA-PC Boot Arch. Flags) of ACPI4.0a Specification, please see Table 5-11: PCIe ASPM Controls: If set, indicates to OSPM that it must not enable OPSM ASPM control on this platform. However there are shipping servers whose BIOS did not set this bit. (An example is the HP ProLiant DL385 G6. A Maintenance BIOS will fix that). For such servers even if a call is made via pci_no_aspm(), based on _OSC support in the BIOS, it may be too late because the ASPM code may have already allocated and filled its "link_list". So if a user sets the ASPM "policy" to "powersave" via /sys then pcie_aspm_set_policy() will run through the "link_list" and re-configure ASPM policy on devices that advertise ASPM L0s/L1 capability: # echo powersave > /sys/module/pcie_aspm/parameters/policy # cat /sys/module/pcie_aspm/parameters/policy default performance [powersave] That can cause NMIs since the hardware doesn't play well with ASPM: [ 1651.906015] NMI: PCI system error (SERR) for reason b1 on CPU 0. [ 1651.906015] Dazed and confused, but trying to continue Ideally, the BIOS should have set that FADT bit in the first place but we could be more robust - especially given the fact that Windows doesn't cause NMIs in the above scenario. There should be a sanity check to not allow a user to modify ASPM policy when aspm_disabled is set. Signed-off-by: Naga Chumbalkar <nagananda.chumbalkar@hp.com> Acked-by: Rafael J. Wysocki <rjw@sisk.pl> Cc: Matthew Garrett <mjg59@srcf.ucam.org> Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-03-21 10:29:14 +07:00
if (aspm_disabled)
return -EPERM;
for (i = 0; i < ARRAY_SIZE(policy_str); i++)
if (!strncmp(val, policy_str[i], strlen(policy_str[i])))
break;
if (i >= ARRAY_SIZE(policy_str))
return -EINVAL;
if (i == aspm_policy)
return 0;
down_read(&pci_bus_sem);
mutex_lock(&aspm_lock);
aspm_policy = i;
list_for_each_entry(link, &link_list, sibling) {
pcie_config_aspm_link(link, policy_to_aspm_state(link));
pcie_set_clkpm(link, policy_to_clkpm_state(link));
}
mutex_unlock(&aspm_lock);
up_read(&pci_bus_sem);
return 0;
}
static int pcie_aspm_get_policy(char *buffer, struct kernel_param *kp)
{
int i, cnt = 0;
for (i = 0; i < ARRAY_SIZE(policy_str); i++)
if (i == aspm_policy)
cnt += sprintf(buffer + cnt, "[%s] ", policy_str[i]);
else
cnt += sprintf(buffer + cnt, "%s ", policy_str[i]);
return cnt;
}
module_param_call(policy, pcie_aspm_set_policy, pcie_aspm_get_policy,
NULL, 0644);
#ifdef CONFIG_PCIEASPM_DEBUG
static ssize_t link_state_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
struct pci_dev *pci_device = to_pci_dev(dev);
struct pcie_link_state *link_state = pci_device->link_state;
return sprintf(buf, "%d\n", link_state->aspm_enabled);
}
static ssize_t link_state_store(struct device *dev,
struct device_attribute *attr,
const char *buf,
size_t n)
{
struct pci_dev *pdev = to_pci_dev(dev);
struct pcie_link_state *link, *root = pdev->link_state->root;
u32 state;
PCI: Changing ASPM policy, via /sys, to POWERSAVE could cause NMIs v3 -> v2: Modified the text that describes the problem v2 -> v1: Returned -EPERM v1 : http://marc.info/?l=linux-pci&m=130013194803727&w=2 For servers whose hardware cannot handle ASPM the BIOS ought to set the FADT bit shown below: In Sec 5.2.9.3 (IA-PC Boot Arch. Flags) of ACPI4.0a Specification, please see Table 5-11: PCIe ASPM Controls: If set, indicates to OSPM that it must not enable OPSM ASPM control on this platform. However there are shipping servers whose BIOS did not set this bit. (An example is the HP ProLiant DL385 G6. A Maintenance BIOS will fix that). For such servers even if a call is made via pci_no_aspm(), based on _OSC support in the BIOS, it may be too late because the ASPM code may have already allocated and filled its "link_list". So if a user sets the ASPM "policy" to "powersave" via /sys then pcie_aspm_set_policy() will run through the "link_list" and re-configure ASPM policy on devices that advertise ASPM L0s/L1 capability: # echo powersave > /sys/module/pcie_aspm/parameters/policy # cat /sys/module/pcie_aspm/parameters/policy default performance [powersave] That can cause NMIs since the hardware doesn't play well with ASPM: [ 1651.906015] NMI: PCI system error (SERR) for reason b1 on CPU 0. [ 1651.906015] Dazed and confused, but trying to continue Ideally, the BIOS should have set that FADT bit in the first place but we could be more robust - especially given the fact that Windows doesn't cause NMIs in the above scenario. There should be a sanity check to not allow a user to modify ASPM policy when aspm_disabled is set. Signed-off-by: Naga Chumbalkar <nagananda.chumbalkar@hp.com> Acked-by: Rafael J. Wysocki <rjw@sisk.pl> Cc: Matthew Garrett <mjg59@srcf.ucam.org> Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-03-21 10:29:14 +07:00
if (aspm_disabled)
return -EPERM;
if (kstrtouint(buf, 10, &state))
return -EINVAL;
if ((state & ~ASPM_STATE_ALL) != 0)
return -EINVAL;
down_read(&pci_bus_sem);
mutex_lock(&aspm_lock);
list_for_each_entry(link, &link_list, sibling) {
if (link->root != root)
continue;
pcie_config_aspm_link(link, state);
}
mutex_unlock(&aspm_lock);
up_read(&pci_bus_sem);
return n;
}
static ssize_t clk_ctl_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
struct pci_dev *pci_device = to_pci_dev(dev);
struct pcie_link_state *link_state = pci_device->link_state;
return sprintf(buf, "%d\n", link_state->clkpm_enabled);
}
static ssize_t clk_ctl_store(struct device *dev,
struct device_attribute *attr,
const char *buf,
size_t n)
{
struct pci_dev *pdev = to_pci_dev(dev);
bool state;
if (strtobool(buf, &state))
return -EINVAL;
down_read(&pci_bus_sem);
mutex_lock(&aspm_lock);
pcie_set_clkpm_nocheck(pdev->link_state, state);
mutex_unlock(&aspm_lock);
up_read(&pci_bus_sem);
return n;
}
static DEVICE_ATTR_RW(link_state);
static DEVICE_ATTR_RW(clk_ctl);
static char power_group[] = "power";
void pcie_aspm_create_sysfs_dev_files(struct pci_dev *pdev)
{
struct pcie_link_state *link_state = pdev->link_state;
if (!link_state)
return;
if (link_state->aspm_support)
sysfs_add_file_to_group(&pdev->dev.kobj,
&dev_attr_link_state.attr, power_group);
if (link_state->clkpm_capable)
sysfs_add_file_to_group(&pdev->dev.kobj,
&dev_attr_clk_ctl.attr, power_group);
}
void pcie_aspm_remove_sysfs_dev_files(struct pci_dev *pdev)
{
struct pcie_link_state *link_state = pdev->link_state;
if (!link_state)
return;
if (link_state->aspm_support)
sysfs_remove_file_from_group(&pdev->dev.kobj,
&dev_attr_link_state.attr, power_group);
if (link_state->clkpm_capable)
sysfs_remove_file_from_group(&pdev->dev.kobj,
&dev_attr_clk_ctl.attr, power_group);
}
#endif
static int __init pcie_aspm_disable(char *str)
{
if (!strcmp(str, "off")) {
aspm_policy = POLICY_DEFAULT;
aspm_disabled = 1;
aspm_support_enabled = false;
printk(KERN_INFO "PCIe ASPM is disabled\n");
} else if (!strcmp(str, "force")) {
aspm_force = 1;
printk(KERN_INFO "PCIe ASPM is forcibly enabled\n");
}
return 1;
}
__setup("pcie_aspm=", pcie_aspm_disable);
void pcie_no_aspm(void)
{
/*
* Disabling ASPM is intended to prevent the kernel from modifying
* existing hardware state, not to clear existing state. To that end:
* (a) set policy to POLICY_DEFAULT in order to avoid changing state
* (b) prevent userspace from changing policy
*/
if (!aspm_force) {
aspm_policy = POLICY_DEFAULT;
aspm_disabled = 1;
}
}
bool pcie_aspm_support_enabled(void)
{
return aspm_support_enabled;
}
EXPORT_SYMBOL(pcie_aspm_support_enabled);