Merge branch 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull RAS updates from Borislav Petkov:

 - Fully reworked thermal throttling notifications, there should be no
   more spamming of dmesg (Srinivas Pandruvada and Benjamin Berg)

 - More enablement for the Intel-compatible CPUs Zhaoxin (Tony W
   Wang-oc)

 - PPIN support for Icelake (Tony Luck)

* 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/mce/therm_throt: Optimize notifications of thermal throttle
  x86/mce: Add Xeon Icelake to list of CPUs that support PPIN
  x86/mce: Lower throttling MCE messages' priority to warning
  x86/mce: Add Zhaoxin LMCE support
  x86/mce: Add Zhaoxin CMCI support
  x86/mce: Add Zhaoxin MCE support
  x86/mce/amd: Make disable_err_thresholding() static
This commit is contained in:
Linus Torvalds 2019-11-25 17:31:39 -08:00
commit 28fcb77b38
5 changed files with 319 additions and 44 deletions

View File

@ -583,7 +583,7 @@ bool amd_filter_mce(struct mce *m)
* - Prevent possible spurious interrupts from the IF bank on Family 0x17
* Models 0x10-0x2F due to Erratum #1114.
*/
void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank)
static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank)
{
int i, num_msrs;
u64 hwcr;

View File

@ -488,8 +488,9 @@ int mce_usable_address(struct mce *m)
if (!(m->status & MCI_STATUS_ADDRV))
return 0;
/* Checks after this one are Intel-specific: */
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
/* Checks after this one are Intel/Zhaoxin-specific: */
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL &&
boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN)
return 1;
if (!(m->status & MCI_STATUS_MISCV))
@ -507,10 +508,13 @@ EXPORT_SYMBOL_GPL(mce_usable_address);
bool mce_is_memory_error(struct mce *m)
{
if (m->cpuvendor == X86_VENDOR_AMD ||
m->cpuvendor == X86_VENDOR_HYGON) {
switch (m->cpuvendor) {
case X86_VENDOR_AMD:
case X86_VENDOR_HYGON:
return amd_mce_is_memory_error(m);
} else if (m->cpuvendor == X86_VENDOR_INTEL) {
case X86_VENDOR_INTEL:
case X86_VENDOR_ZHAOXIN:
/*
* Intel SDM Volume 3B - 15.9.2 Compound Error Codes
*
@ -527,9 +531,10 @@ bool mce_is_memory_error(struct mce *m)
return (m->status & 0xef80) == BIT(7) ||
(m->status & 0xef00) == BIT(8) ||
(m->status & 0xeffc) == 0xc;
}
return false;
default:
return false;
}
}
EXPORT_SYMBOL_GPL(mce_is_memory_error);
@ -1127,6 +1132,12 @@ static bool __mc_check_crashing_cpu(int cpu)
u64 mcgstatus;
mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) {
if (mcgstatus & MCG_STATUS_LMCES)
return false;
}
if (mcgstatus & MCG_STATUS_RIPV) {
mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
return true;
@ -1277,9 +1288,10 @@ void do_machine_check(struct pt_regs *regs, long error_code)
/*
* Check if this MCE is signaled to only this logical processor,
* on Intel only.
* on Intel, Zhaoxin only.
*/
if (m.cpuvendor == X86_VENDOR_INTEL)
if (m.cpuvendor == X86_VENDOR_INTEL ||
m.cpuvendor == X86_VENDOR_ZHAOXIN)
lmce = m.mcgstatus & MCG_STATUS_LMCES;
/*
@ -1697,6 +1709,18 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
if (c->x86 == 6 && c->x86_model == 45)
quirk_no_way_out = quirk_sandybridge_ifu;
}
if (c->x86_vendor == X86_VENDOR_ZHAOXIN) {
/*
* All newer Zhaoxin CPUs support MCE broadcasting. Enable
* synchronization with a one second timeout.
*/
if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
if (cfg->monarch_timeout < 0)
cfg->monarch_timeout = USEC_PER_SEC;
}
}
if (cfg->monarch_timeout < 0)
cfg->monarch_timeout = 0;
if (cfg->bootlog != 0)
@ -1760,6 +1784,35 @@ static void mce_centaur_feature_init(struct cpuinfo_x86 *c)
}
}
static void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c)
{
struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
/*
* These CPUs have MCA bank 8 which reports only one error type called
* SVAD (System View Address Decoder). The reporting of that error is
* controlled by IA32_MC8.CTL.0.
*
* If enabled, prefetching on these CPUs will cause SVAD MCE when
* virtual machines start and result in a system panic. Always disable
* bank 8 SVAD error by default.
*/
if ((c->x86 == 7 && c->x86_model == 0x1b) ||
(c->x86_model == 0x19 || c->x86_model == 0x1f)) {
if (this_cpu_read(mce_num_banks) > 8)
mce_banks[8].ctl = 0;
}
intel_init_cmci();
intel_init_lmce();
mce_adjust_timer = cmci_intel_adjust_timer;
}
static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c)
{
intel_clear_lmce();
}
static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
{
switch (c->x86_vendor) {
@ -1781,6 +1834,10 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
mce_centaur_feature_init(c);
break;
case X86_VENDOR_ZHAOXIN:
mce_zhaoxin_feature_init(c);
break;
default:
break;
}
@ -1792,6 +1849,11 @@ static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
case X86_VENDOR_INTEL:
mce_intel_feature_clear(c);
break;
case X86_VENDOR_ZHAOXIN:
mce_zhaoxin_feature_clear(c);
break;
default:
break;
}
@ -2014,15 +2076,16 @@ static void mce_disable_error_reporting(void)
static void vendor_disable_error_reporting(void)
{
/*
* Don't clear on Intel or AMD or Hygon CPUs. Some of these MSRs
* are socket-wide.
* Disabling them for just a single offlined CPU is bad, since it will
* inhibit reporting for all shared resources on the socket like the
* last level cache (LLC), the integrated memory controller (iMC), etc.
* Don't clear on Intel or AMD or Hygon or Zhaoxin CPUs. Some of these
* MSRs are socket-wide. Disabling them for just a single offlined CPU
* is bad, since it will inhibit reporting for all shared resources on
* the socket like the last level cache (LLC), the integrated memory
* controller (iMC), etc.
*/
if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ||
boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ||
boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN)
return;
mce_disable_error_reporting();

View File

@ -85,8 +85,10 @@ static int cmci_supported(int *banks)
* initialization is vendor keyed and this
* makes sure none of the backdoors are entered otherwise.
*/
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL &&
boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN)
return 0;
if (!boot_cpu_has(X86_FEATURE_APIC) || lapic_get_maxlvt() < 6)
return 0;
rdmsrl(MSR_IA32_MCG_CAP, cap);
@ -423,7 +425,7 @@ void cmci_disable_bank(int bank)
raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
}
static void intel_init_cmci(void)
void intel_init_cmci(void)
{
int banks;
@ -442,7 +444,7 @@ static void intel_init_cmci(void)
cmci_recheck();
}
static void intel_init_lmce(void)
void intel_init_lmce(void)
{
u64 val;
@ -455,7 +457,7 @@ static void intel_init_lmce(void)
wrmsrl(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN);
}
static void intel_clear_lmce(void)
void intel_clear_lmce(void)
{
u64 val;
@ -482,6 +484,7 @@ static void intel_ppin_init(struct cpuinfo_x86 *c)
case INTEL_FAM6_BROADWELL_D:
case INTEL_FAM6_BROADWELL_X:
case INTEL_FAM6_SKYLAKE_X:
case INTEL_FAM6_ICELAKE_X:
case INTEL_FAM6_XEON_PHI_KNL:
case INTEL_FAM6_XEON_PHI_KNM:

View File

@ -45,11 +45,17 @@ unsigned long cmci_intel_adjust_timer(unsigned long interval);
bool mce_intel_cmci_poll(void);
void mce_intel_hcpu_update(unsigned long cpu);
void cmci_disable_bank(int bank);
void intel_init_cmci(void);
void intel_init_lmce(void);
void intel_clear_lmce(void);
#else
# define cmci_intel_adjust_timer mce_adjust_timer_default
static inline bool mce_intel_cmci_poll(void) { return false; }
static inline void mce_intel_hcpu_update(unsigned long cpu) { }
static inline void cmci_disable_bank(int bank) { }
static inline void intel_init_cmci(void) { }
static inline void intel_init_lmce(void) { }
static inline void intel_clear_lmce(void) { }
#endif
void mce_timer_kick(unsigned long interval);

View File

@ -40,15 +40,58 @@
#define THERMAL_THROTTLING_EVENT 0
#define POWER_LIMIT_EVENT 1
/*
* Current thermal event state:
/**
* struct _thermal_state - Represent the current thermal event state
* @next_check: Stores the next timestamp, when it is allowed
* to log the next warning message.
* @last_interrupt_time: Stores the timestamp for the last threshold
* high event.
* @therm_work: Delayed workqueue structure
* @count: Stores the current running count for thermal
* or power threshold interrupts.
* @last_count: Stores the previous running count for thermal
* or power threshold interrupts.
* @max_time_ms: This shows the maximum amount of time CPU was
* in throttled state for a single thermal
* threshold high to low state.
* @total_time_ms: This is a cumulative time during which CPU was
* in the throttled state.
* @rate_control_active: Set when a throttling message is logged.
* This is used for the purpose of rate-control.
* @new_event: Stores the last high/low status of the
* THERM_STATUS_PROCHOT or
* THERM_STATUS_POWER_LIMIT.
* @level: Stores whether this _thermal_state instance is
* for a CORE level or for PACKAGE level.
* @sample_index: Index for storing the next sample in the buffer
* temp_samples[].
* @sample_count: Total number of samples collected in the buffer
* temp_samples[].
* @average: The last moving average of temperature samples
* @baseline_temp: Temperature at which thermal threshold high
* interrupt was generated.
* @temp_samples: Storage for temperature samples to calculate
* moving average.
*
* This structure is used to represent data related to thermal state for a CPU.
* There is a separate storage for core and package level for each CPU.
*/
struct _thermal_state {
bool new_event;
int event;
u64 next_check;
u64 last_interrupt_time;
struct delayed_work therm_work;
unsigned long count;
unsigned long last_count;
unsigned long max_time_ms;
unsigned long total_time_ms;
bool rate_control_active;
bool new_event;
u8 level;
u8 sample_index;
u8 sample_count;
u8 average;
u8 baseline_temp;
u8 temp_samples[3];
};
struct thermal_state {
@ -121,8 +164,22 @@ define_therm_throt_device_one_ro(package_throttle_count);
define_therm_throt_device_show_func(package_power_limit, count);
define_therm_throt_device_one_ro(package_power_limit_count);
define_therm_throt_device_show_func(core_throttle, max_time_ms);
define_therm_throt_device_one_ro(core_throttle_max_time_ms);
define_therm_throt_device_show_func(package_throttle, max_time_ms);
define_therm_throt_device_one_ro(package_throttle_max_time_ms);
define_therm_throt_device_show_func(core_throttle, total_time_ms);
define_therm_throt_device_one_ro(core_throttle_total_time_ms);
define_therm_throt_device_show_func(package_throttle, total_time_ms);
define_therm_throt_device_one_ro(package_throttle_total_time_ms);
static struct attribute *thermal_throttle_attrs[] = {
&dev_attr_core_throttle_count.attr,
&dev_attr_core_throttle_max_time_ms.attr,
&dev_attr_core_throttle_total_time_ms.attr,
NULL
};
@ -135,6 +192,105 @@ static const struct attribute_group thermal_attr_group = {
#define CORE_LEVEL 0
#define PACKAGE_LEVEL 1
#define THERM_THROT_POLL_INTERVAL HZ
#define THERM_STATUS_PROCHOT_LOG BIT(1)
static void clear_therm_status_log(int level)
{
int msr;
u64 msr_val;
if (level == CORE_LEVEL)
msr = MSR_IA32_THERM_STATUS;
else
msr = MSR_IA32_PACKAGE_THERM_STATUS;
rdmsrl(msr, msr_val);
wrmsrl(msr, msr_val & ~THERM_STATUS_PROCHOT_LOG);
}
static void get_therm_status(int level, bool *proc_hot, u8 *temp)
{
int msr;
u64 msr_val;
if (level == CORE_LEVEL)
msr = MSR_IA32_THERM_STATUS;
else
msr = MSR_IA32_PACKAGE_THERM_STATUS;
rdmsrl(msr, msr_val);
if (msr_val & THERM_STATUS_PROCHOT_LOG)
*proc_hot = true;
else
*proc_hot = false;
*temp = (msr_val >> 16) & 0x7F;
}
static void throttle_active_work(struct work_struct *work)
{
struct _thermal_state *state = container_of(to_delayed_work(work),
struct _thermal_state, therm_work);
unsigned int i, avg, this_cpu = smp_processor_id();
u64 now = get_jiffies_64();
bool hot;
u8 temp;
get_therm_status(state->level, &hot, &temp);
/* temperature value is offset from the max so lesser means hotter */
if (!hot && temp > state->baseline_temp) {
if (state->rate_control_active)
pr_info("CPU%d: %s temperature/speed normal (total events = %lu)\n",
this_cpu,
state->level == CORE_LEVEL ? "Core" : "Package",
state->count);
state->rate_control_active = false;
return;
}
if (time_before64(now, state->next_check) &&
state->rate_control_active)
goto re_arm;
state->next_check = now + CHECK_INTERVAL;
if (state->count != state->last_count) {
/* There was one new thermal interrupt */
state->last_count = state->count;
state->average = 0;
state->sample_count = 0;
state->sample_index = 0;
}
state->temp_samples[state->sample_index] = temp;
state->sample_count++;
state->sample_index = (state->sample_index + 1) % ARRAY_SIZE(state->temp_samples);
if (state->sample_count < ARRAY_SIZE(state->temp_samples))
goto re_arm;
avg = 0;
for (i = 0; i < ARRAY_SIZE(state->temp_samples); ++i)
avg += state->temp_samples[i];
avg /= ARRAY_SIZE(state->temp_samples);
if (state->average > avg) {
pr_warn("CPU%d: %s temperature is above threshold, cpu clock is throttled (total events = %lu)\n",
this_cpu,
state->level == CORE_LEVEL ? "Core" : "Package",
state->count);
state->rate_control_active = true;
}
state->average = avg;
re_arm:
clear_therm_status_log(state->level);
schedule_delayed_work_on(this_cpu, &state->therm_work, THERM_THROT_POLL_INTERVAL);
}
/***
* therm_throt_process - Process thermal throttling event from interrupt
* @curr: Whether the condition is current or not (boolean), since the
@ -178,27 +334,33 @@ static void therm_throt_process(bool new_event, int event, int level)
if (new_event)
state->count++;
if (time_before64(now, state->next_check) &&
state->count != state->last_count)
if (event != THERMAL_THROTTLING_EVENT)
return;
state->next_check = now + CHECK_INTERVAL;
state->last_count = state->count;
if (new_event && !state->last_interrupt_time) {
bool hot;
u8 temp;
/* if we just entered the thermal event */
if (new_event) {
if (event == THERMAL_THROTTLING_EVENT)
pr_crit("CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
this_cpu,
level == CORE_LEVEL ? "Core" : "Package",
state->count);
return;
}
if (old_event) {
if (event == THERMAL_THROTTLING_EVENT)
pr_info("CPU%d: %s temperature/speed normal\n", this_cpu,
level == CORE_LEVEL ? "Core" : "Package");
return;
get_therm_status(state->level, &hot, &temp);
/*
* Ignore short temperature spike as the system is not close
* to PROCHOT. 10C offset is large enough to ignore. It is
* already dropped from the high threshold temperature.
*/
if (temp > 10)
return;
state->baseline_temp = temp;
state->last_interrupt_time = now;
schedule_delayed_work_on(this_cpu, &state->therm_work, THERM_THROT_POLL_INTERVAL);
} else if (old_event && state->last_interrupt_time) {
unsigned long throttle_time;
throttle_time = jiffies_delta_to_msecs(now - state->last_interrupt_time);
if (throttle_time > state->max_time_ms)
state->max_time_ms = throttle_time;
state->total_time_ms += throttle_time;
state->last_interrupt_time = 0;
}
}
@ -244,20 +406,47 @@ static int thermal_throttle_add_dev(struct device *dev, unsigned int cpu)
if (err)
return err;
if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) {
err = sysfs_add_file_to_group(&dev->kobj,
&dev_attr_core_power_limit_count.attr,
thermal_attr_group.name);
if (err)
goto del_group;
}
if (cpu_has(c, X86_FEATURE_PTS)) {
err = sysfs_add_file_to_group(&dev->kobj,
&dev_attr_package_throttle_count.attr,
thermal_attr_group.name);
if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
if (err)
goto del_group;
err = sysfs_add_file_to_group(&dev->kobj,
&dev_attr_package_throttle_max_time_ms.attr,
thermal_attr_group.name);
if (err)
goto del_group;
err = sysfs_add_file_to_group(&dev->kobj,
&dev_attr_package_throttle_total_time_ms.attr,
thermal_attr_group.name);
if (err)
goto del_group;
if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) {
err = sysfs_add_file_to_group(&dev->kobj,
&dev_attr_package_power_limit_count.attr,
thermal_attr_group.name);
if (err)
goto del_group;
}
}
return 0;
del_group:
sysfs_remove_group(&dev->kobj, &thermal_attr_group);
return err;
}
@ -269,15 +458,29 @@ static void thermal_throttle_remove_dev(struct device *dev)
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int thermal_throttle_online(unsigned int cpu)
{
struct thermal_state *state = &per_cpu(thermal_state, cpu);
struct device *dev = get_cpu_device(cpu);
state->package_throttle.level = PACKAGE_LEVEL;
state->core_throttle.level = CORE_LEVEL;
INIT_DELAYED_WORK(&state->package_throttle.therm_work, throttle_active_work);
INIT_DELAYED_WORK(&state->core_throttle.therm_work, throttle_active_work);
return thermal_throttle_add_dev(dev, cpu);
}
static int thermal_throttle_offline(unsigned int cpu)
{
struct thermal_state *state = &per_cpu(thermal_state, cpu);
struct device *dev = get_cpu_device(cpu);
cancel_delayed_work(&state->package_throttle.therm_work);
cancel_delayed_work(&state->core_throttle.therm_work);
state->package_throttle.rate_control_active = false;
state->core_throttle.rate_control_active = false;
thermal_throttle_remove_dev(dev);
return 0;
}