mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2025-01-18 23:26:06 +07:00
7f17b4a121
memory_failure() offlines or repairs pages of memory that have been discovered to be corrupt. These may be detected by an external component (e.g. the memory controller) and notified via an IRQ. In this case the work is queued, as not all of memory_failure()'s work can happen in IRQ context. If the error was detected as a result of user-space accessing a corrupt memory location, the CPU may take an abort instead. On arm64 this is a 'synchronous external abort', and on a firmware-first system it is replayed using NOTIFY_SEA. This notification has NMI-like properties (it can interrupt IRQ-masked code), so the memory_failure() work is queued. If we return to user-space before the queued memory_failure() work is processed, we will take the fault again. This loop may cause platform firmware to exceed some threshold and reboot when Linux could have recovered from this error. For NMI-like notifications, keep track of whether memory_failure() work was queued, and make task_work pending to flush out the queue. To save memory allocations, the task_work is allocated as part of the ghes_estatus_node, and free()ing it back to the pool is deferred. Signed-off-by: James Morse <james.morse@arm.com> Tested-by: Tyler Baicar <baicar@os.amperecomputing.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
131 lines
3.0 KiB
C
131 lines
3.0 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef GHES_H
|
|
#define GHES_H
|
|
|
|
#include <acpi/apei.h>
|
|
#include <acpi/hed.h>
|
|
|
|
/*
 * One struct ghes is created for each generic hardware error source.
 * It provides the context for the APEI hardware error timer/IRQ/SCI/NMI
 * handler.
 *
 * estatus: memory buffer for the error status block, allocated during
 * HEST parsing.
 */
|
|
/*
 * NOTE(review): presumably a bit for struct ghes::flags marking an error
 * source that is being torn down -- confirm against the users of this flag.
 */
#define GHES_EXITING 0x0002
|
|
|
|
struct ghes {
|
|
union {
|
|
struct acpi_hest_generic *generic;
|
|
struct acpi_hest_generic_v2 *generic_v2;
|
|
};
|
|
struct acpi_hest_generic_status *estatus;
|
|
unsigned long flags;
|
|
union {
|
|
struct list_head list;
|
|
struct timer_list timer;
|
|
unsigned int irq;
|
|
};
|
|
};
|
|
|
|
struct ghes_estatus_node {
|
|
struct llist_node llnode;
|
|
struct acpi_hest_generic *generic;
|
|
struct ghes *ghes;
|
|
|
|
int task_work_cpu;
|
|
struct callback_head task_work;
|
|
};
|
|
|
|
struct ghes_estatus_cache {
|
|
u32 estatus_len;
|
|
atomic_t count;
|
|
struct acpi_hest_generic *generic;
|
|
unsigned long long time_in;
|
|
struct rcu_head rcu;
|
|
};
|
|
|
|
/*
 * GHES severity levels, in ascending numeric order.
 * NOTE(review): presumably the values passed as the 'sev' argument of
 * ghes_edac_report_mem_error() -- confirm against the callers.
 */
enum {
	GHES_SEV_NO		= 0x0,
	GHES_SEV_CORRECTED	= 0x1,
	GHES_SEV_RECOVERABLE	= 0x2,
	GHES_SEV_PANIC		= 0x3,
};
|
|
|
|
/*
 * Declared here; the definition lives outside this header.
 * NOTE(review): presumably sizes the estatus pool for num_ghes error sources
 * and returns 0 or a negative errno -- confirm against the definition.
 */
int ghes_estatus_pool_init(int num_ghes);
|
|
|
|
/* From drivers/edac/ghes_edac.c */
|
|
|
|
/*
 * Only pointers to these types are used below, so forward declarations are
 * enough and keep the prototypes valid regardless of include order.
 */
struct cper_sec_mem_err;
struct device;

#ifdef CONFIG_EDAC_GHES
void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err);

int ghes_edac_register(struct ghes *ghes, struct device *dev);

void ghes_edac_unregister(struct ghes *ghes);

#else
/* Without CONFIG_EDAC_GHES, reporting a memory error is a no-op. */
static inline void ghes_edac_report_mem_error(int sev,
				       struct cper_sec_mem_err *mem_err)
{
}

/* Without CONFIG_EDAC_GHES, registration always fails with -ENODEV. */
static inline int ghes_edac_register(struct ghes *ghes, struct device *dev)
{
	return -ENODEV;
}

/* Without CONFIG_EDAC_GHES, there is nothing to unregister. */
static inline void ghes_edac_unregister(struct ghes *ghes)
{
}
#endif
|
|
|
|
static inline int acpi_hest_get_version(struct acpi_hest_generic_data *gdata)
|
|
{
|
|
return gdata->revision >> 8;
|
|
}
|
|
|
|
static inline void *acpi_hest_get_payload(struct acpi_hest_generic_data *gdata)
|
|
{
|
|
if (acpi_hest_get_version(gdata) >= 3)
|
|
return (void *)(((struct acpi_hest_generic_data_v300 *)(gdata)) + 1);
|
|
|
|
return gdata + 1;
|
|
}
|
|
|
|
static inline int acpi_hest_get_error_length(struct acpi_hest_generic_data *gdata)
|
|
{
|
|
return ((struct acpi_hest_generic_data *)(gdata))->error_data_length;
|
|
}
|
|
|
|
static inline int acpi_hest_get_size(struct acpi_hest_generic_data *gdata)
|
|
{
|
|
if (acpi_hest_get_version(gdata) >= 3)
|
|
return sizeof(struct acpi_hest_generic_data_v300);
|
|
|
|
return sizeof(struct acpi_hest_generic_data);
|
|
}
|
|
|
|
/*
 * Return the total size of one record: the version-dependent header size
 * plus the error data length stored in the header.
 */
static inline int acpi_hest_get_record_size(struct acpi_hest_generic_data *gdata)
{
	return acpi_hest_get_size(gdata) + acpi_hest_get_error_length(gdata);
}
|
|
|
|
/*
 * Return the address acpi_hest_get_record_size(gdata) bytes past gdata,
 * i.e. the position immediately following this record.
 *
 * The byte offset is applied through a char pointer so the helper does not
 * depend on the GNU extension that permits arithmetic on void pointers.
 */
static inline void *acpi_hest_get_next(struct acpi_hest_generic_data *gdata)
{
	return (char *)gdata + acpi_hest_get_record_size(gdata);
}
|
|
|
|
/*
 * apei_estatus_for_each_section - iterate over the sections that follow an
 * error status header.
 * @estatus: pointer to the status header; the first section starts directly
 *           after it and iteration stops once the cursor is
 *           estatus->data_length bytes or more past that point.
 * @section: struct acpi_hest_generic_data * lvalue used as the cursor.
 *
 * Both arguments are evaluated more than once, so they must not have side
 * effects.  Each use is parenthesised so that expression arguments (for
 * example "p + 0") expand correctly.
 */
#define apei_estatus_for_each_section(estatus, section)			\
	for ((section) = (struct acpi_hest_generic_data *)((estatus) + 1); \
	     (char *)(section) - (char *)((estatus) + 1) <		\
			(estatus)->data_length;				\
	     (section) = acpi_hest_get_next(section))
|
|
|
|
#ifdef CONFIG_ACPI_APEI_SEA
int ghes_notify_sea(void);
#else
/* Without CONFIG_ACPI_APEI_SEA this stub always returns -ENOENT. */
static inline int ghes_notify_sea(void)
{
	return -ENOENT;
}
#endif
|
|
|
|
#endif /* GHES_H */
|