linux_dsm_epyc7002/include/linux/oom.h
Michal Hocko c32b3cbe0d oom, PM: make OOM detection in the freezer path raceless
Commit 5695be142e ("OOM, PM: OOM killed task shouldn't escape PM
suspend") has left a race window when OOM killer manages to
note_oom_kill after freeze_processes checks the counter.  The race
window is quite small and really unlikely and partial solution deemed
sufficient at the time of submission.

Tejun wasn't happy about this partial solution though and insisted on a
full solution.  That requires the full OOM and freezer's task freezing
exclusion, though.  This is done by this patch which introduces oom_sem
RW lock and turns oom_killer_disable() into a full OOM barrier.

oom_killer_disabled check is moved from the allocation path to the OOM
level and we take oom_sem for reading for both the check and the whole
OOM invocation.

oom_killer_disable() takes oom_sem for writing so it waits for all
currently running OOM killer invocations.  Then it disable all the further
OOMs by setting oom_killer_disabled and checks for any oom victims.
Victims are counted via mark_tsk_oom_victim resp.  unmark_oom_victim.  The
last victim wakes up all waiters enqueued by oom_killer_disable().
Therefore this function acts as the full OOM barrier.

The page fault path is covered now as well although it was assumed to be
safe before.  As per Tejun, "We used to have freezing points deep in file
system code which may be reacheable from page fault." so it would be
better and more robust to not rely on freezing points here.  Same applies
to the memcg OOM killer.

out_of_memory tells the caller whether the OOM was allowed to trigger and
the callers are supposed to handle the situation.  The page allocation
path simply fails the allocation same as before.  The page fault path will
retry the fault (more on that later) and Sysrq OOM trigger will simply
complain to the log.

Normally there wouldn't be any unfrozen user tasks after
try_to_freeze_tasks so the function will not block. But if there was an
OOM killer racing with try_to_freeze_tasks and the OOM victim didn't
finish yet then we have to wait for it. This should complete in a finite
time, though, because

	- the victim cannot loop in the page fault handler (it would die
	  on the way out from the exception)
	- it cannot loop in the page allocator because all the further
	  allocation would fail and __GFP_NOFAIL allocations are not
	  acceptable at this stage
	- it shouldn't be blocked on any locks held by frozen tasks
	  (try_to_freeze expects lockless context) and kernel threads and
	  work queues are not frozen yet

Signed-off-by: Michal Hocko <mhocko@suse.cz>
Suggested-by: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-11 17:06:03 -08:00

102 lines
2.9 KiB
C

#ifndef __INCLUDE_LINUX_OOM_H
#define __INCLUDE_LINUX_OOM_H
#include <linux/sched.h>
#include <linux/types.h>
#include <linux/nodemask.h>
#include <uapi/linux/oom.h>
struct zonelist;
struct notifier_block;
struct mem_cgroup;
struct task_struct;
/*
* Types of limitations to the nodes from which allocations may occur
*/
enum oom_constraint {
CONSTRAINT_NONE,
CONSTRAINT_CPUSET,
CONSTRAINT_MEMORY_POLICY,
CONSTRAINT_MEMCG,
};
enum oom_scan_t {
OOM_SCAN_OK, /* scan thread and find its badness */
OOM_SCAN_CONTINUE, /* do not consider thread for oom kill */
OOM_SCAN_ABORT, /* abort the iteration and return */
OOM_SCAN_SELECT, /* always select this thread first */
};
/* Thread is the potential origin of an oom condition; kill first on oom */
#define OOM_FLAG_ORIGIN ((__force oom_flags_t)0x1)
static inline void set_current_oom_origin(void)
{
current->signal->oom_flags |= OOM_FLAG_ORIGIN;
}
static inline void clear_current_oom_origin(void)
{
current->signal->oom_flags &= ~OOM_FLAG_ORIGIN;
}
static inline bool oom_task_origin(const struct task_struct *p)
{
return !!(p->signal->oom_flags & OOM_FLAG_ORIGIN);
}
extern void mark_tsk_oom_victim(struct task_struct *tsk);
extern void unmark_oom_victim(void);
extern unsigned long oom_badness(struct task_struct *p,
struct mem_cgroup *memcg, const nodemask_t *nodemask,
unsigned long totalpages);
extern int oom_kills_count(void);
extern void note_oom_kill(void);
extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
unsigned int points, unsigned long totalpages,
struct mem_cgroup *memcg, nodemask_t *nodemask,
const char *message);
extern bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_flags);
extern void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_flags);
extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
int order, const nodemask_t *nodemask);
extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
unsigned long totalpages, const nodemask_t *nodemask,
bool force_kill);
extern bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
int order, nodemask_t *mask, bool force_kill);
extern int register_oom_notifier(struct notifier_block *nb);
extern int unregister_oom_notifier(struct notifier_block *nb);
extern bool oom_killer_disabled;
extern bool oom_killer_disable(void);
extern void oom_killer_enable(void);
extern struct task_struct *find_lock_task_mm(struct task_struct *p);
static inline bool task_will_free_mem(struct task_struct *task)
{
/*
* A coredumping process may sleep for an extended period in exit_mm(),
* so the oom killer cannot assume that the process will promptly exit
* and release memory.
*/
return (task->flags & PF_EXITING) &&
!(task->signal->flags & SIGNAL_GROUP_COREDUMP);
}
/* sysctls */
extern int sysctl_oom_dump_tasks;
extern int sysctl_oom_kill_allocating_task;
extern int sysctl_panic_on_oom;
#endif /* _INCLUDE_LINUX_OOM_H */