mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2025-01-18 09:36:11 +07:00
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
 "The main updates in this cycle were:

   - Group balancing enhancements and cleanups (Brendan Jackman)

   - Move CPU isolation related functionality into its separate
     kernel/sched/isolation.c file, with related 'housekeeping_*()'
     namespace and nomenclature et al. (Frederic Weisbecker)

   - Improve the interactive/cpu-intense fairness calculation (Josef Bacik)

   - Improve the PELT code and related cleanups (Peter Zijlstra)

   - Improve the logic of pick_next_task_fair() (Uladzislau Rezki)

   - Improve the RT IPI based balancing logic (Steven Rostedt)

   - Various micro-optimizations:

       - better !CONFIG_SCHED_DEBUG optimizations (Patrick Bellasi)

       - better idle loop (Cheng Jian)

   - ... plus misc fixes, cleanups and updates"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (54 commits)
  sched/core: Optimize sched_feat() for !CONFIG_SCHED_DEBUG builds
  sched/sysctl: Fix attributes of some extern declarations
  sched/isolation: Document isolcpus= boot parameter flags, mark it deprecated
  sched/isolation: Add basic isolcpus flags
  sched/isolation: Move isolcpus= handling to the housekeeping code
  sched/isolation: Handle the nohz_full= parameter
  sched/isolation: Introduce housekeeping flags
  sched/isolation: Split out new CONFIG_CPU_ISOLATION=y config from CONFIG_NO_HZ_FULL
  sched/isolation: Rename is_housekeeping_cpu() to housekeeping_cpu()
  sched/isolation: Use its own static key
  sched/isolation: Make the housekeeping cpumask private
  sched/isolation: Provide a dynamic off-case to housekeeping_any_cpu()
  sched/isolation, watchdog: Use housekeeping_cpumask() instead of ad-hoc version
  sched/isolation: Move housekeeping related code to its own file
  sched/idle: Micro-optimize the idle loop
  sched/isolcpus: Fix "isolcpus=" boot parameter handling when !CONFIG_CPUMASK_OFFSTACK
  x86/tsc: Append the 'tsc=' description for the 'tsc=unstable' boot parameter
  sched/rt: Simplify the IPI based RT balancing logic
  block/ioprio: Use a helper to check for RT prio
  sched/rt: Add a helper to test for a RT task
  ...
This commit is contained in:
commit
3e2014637c
@ -1730,20 +1730,33 @@
|
||||
isapnp= [ISAPNP]
|
||||
Format: <RDP>,<reset>,<pci_scan>,<verbosity>
|
||||
|
||||
isolcpus= [KNL,SMP] Isolate CPUs from the general scheduler.
|
||||
The argument is a cpu list, as described above.
|
||||
isolcpus= [KNL,SMP] Isolate a given set of CPUs from disturbance.
|
||||
[Deprecated - use cpusets instead]
|
||||
Format: [flag-list,]<cpu-list>
|
||||
|
||||
Specify one or more CPUs to isolate from disturbances
|
||||
specified in the flag list (default: domain):
|
||||
|
||||
nohz
|
||||
Disable the tick when a single task runs.
|
||||
domain
|
||||
Isolate from the general SMP balancing and scheduling
|
||||
algorithms. Note that performing domain isolation this way
|
||||
is irreversible: it's not possible to bring back a CPU to
|
||||
the domains once isolated through isolcpus. It's strongly
|
||||
advised to use cpusets instead to disable scheduler load
|
||||
balancing through the "cpuset.sched_load_balance" file.
|
||||
It offers a much more flexible interface where CPUs can
|
||||
move in and out of an isolated set anytime.
|
||||
|
||||
You can move a process onto or off an "isolated" CPU via
|
||||
the CPU affinity syscalls or cpuset.
|
||||
<cpu number> begins at 0 and the maximum value is
|
||||
"number of CPUs in system - 1".
|
||||
|
||||
The format of <cpu-list> is described above.
|
||||
|
||||
This option can be used to specify one or more CPUs
|
||||
to isolate from the general SMP balancing and scheduling
|
||||
algorithms. You can move a process onto or off an
|
||||
"isolated" CPU via the CPU affinity syscalls or cpuset.
|
||||
<cpu number> begins at 0 and the maximum value is
|
||||
"number of CPUs in system - 1".
|
||||
|
||||
This option is the preferred way to isolate CPUs. The
|
||||
alternative -- manually setting the CPU mask of all
|
||||
tasks in the system -- can cause problems and
|
||||
suboptimal load balancer performance.
|
||||
|
||||
iucv= [HW,NET]
|
||||
|
||||
@ -4209,6 +4222,9 @@
|
||||
Used to run time disable IRQ_TIME_ACCOUNTING on any
|
||||
platforms where RDTSC is slow and this accounting
|
||||
can add overhead.
|
||||
[x86] unstable: mark the TSC clocksource as unstable, this
|
||||
marks the TSC unconditionally unstable at bootup and
|
||||
avoids any further wobbles once the TSC watchdog notices.
|
||||
|
||||
turbografx.map[2|3]= [HW,JOY]
|
||||
TurboGraFX parallel port interface
|
||||
|
@ -18,6 +18,7 @@
|
||||
#include <linux/cpufeature.h>
|
||||
#include <linux/tick.h>
|
||||
#include <linux/pm_qos.h>
|
||||
#include <linux/sched/isolation.h>
|
||||
|
||||
#include "base.h"
|
||||
|
||||
@ -271,8 +272,16 @@ static ssize_t print_cpus_isolated(struct device *dev,
|
||||
struct device_attribute *attr, char *buf)
|
||||
{
|
||||
int n = 0, len = PAGE_SIZE-2;
|
||||
cpumask_var_t isolated;
|
||||
|
||||
n = scnprintf(buf, len, "%*pbl\n", cpumask_pr_args(cpu_isolated_map));
|
||||
if (!alloc_cpumask_var(&isolated, GFP_KERNEL))
|
||||
return -ENOMEM;
|
||||
|
||||
cpumask_andnot(isolated, cpu_possible_mask,
|
||||
housekeeping_cpumask(HK_FLAG_DOMAIN));
|
||||
n = scnprintf(buf, len, "%*pbl\n", cpumask_pr_args(isolated));
|
||||
|
||||
free_cpumask_var(isolated);
|
||||
|
||||
return n;
|
||||
}
|
||||
|
@ -40,7 +40,7 @@
|
||||
#include <linux/tcp.h>
|
||||
#include <linux/net_tstamp.h>
|
||||
#include <linux/ptp_clock_kernel.h>
|
||||
#include <linux/tick.h>
|
||||
#include <linux/sched/isolation.h>
|
||||
|
||||
#include <asm/checksum.h>
|
||||
#include <asm/homecache.h>
|
||||
@ -2270,8 +2270,8 @@ static int __init tile_net_init_module(void)
|
||||
tile_net_dev_init(name, mac);
|
||||
|
||||
if (!network_cpus_init())
|
||||
cpumask_and(&network_cpus_map, housekeeping_cpumask(),
|
||||
cpu_online_mask);
|
||||
cpumask_and(&network_cpus_map,
|
||||
housekeeping_cpumask(HK_FLAG_MISC), cpu_online_mask);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -138,7 +138,7 @@ static const char * const task_state_array[] = {
|
||||
static inline const char *get_task_state(struct task_struct *tsk)
|
||||
{
|
||||
BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != ARRAY_SIZE(task_state_array));
|
||||
return task_state_array[__get_task_state(tsk)];
|
||||
return task_state_array[task_state_index(tsk)];
|
||||
}
|
||||
|
||||
static inline int get_task_umask(struct task_struct *tsk)
|
||||
|
@ -131,6 +131,11 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp)
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * cpumask_last - stub variant that always reports CPU 0.
 * @srcp: the cpumask pointer (not inspected by this variant)
 *
 * NOTE(review): presumably the single-CPU build counterpart of the
 * find_last_bit()-based cpumask_last(); confirm against the enclosing #if.
 */
static inline unsigned int cpumask_last(const struct cpumask *srcp)
{
	const unsigned int only_cpu = 0;

	return only_cpu;
}
|
||||
|
||||
/* Valid inputs for n are -1 and 0. */
|
||||
static inline unsigned int cpumask_next(int n, const struct cpumask *srcp)
|
||||
{
|
||||
@ -179,6 +184,17 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp)
|
||||
return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits);
|
||||
}
|
||||
|
||||
/**
|
||||
* cpumask_last - get the last CPU in a cpumask
|
||||
* @srcp: - the cpumask pointer
|
||||
*
|
||||
* Returns >= nr_cpumask_bits if no CPUs set.
|
||||
*/
|
||||
static inline unsigned int cpumask_last(const struct cpumask *srcp)
|
||||
{
|
||||
return find_last_bit(cpumask_bits(srcp), nr_cpumask_bits);
|
||||
}
|
||||
|
||||
unsigned int cpumask_next(int n, const struct cpumask *srcp);
|
||||
|
||||
/**
|
||||
|
@ -3,6 +3,7 @@
|
||||
#define IOPRIO_H
|
||||
|
||||
#include <linux/sched.h>
|
||||
#include <linux/sched/rt.h>
|
||||
#include <linux/iocontext.h>
|
||||
|
||||
/*
|
||||
@ -63,7 +64,7 @@ static inline int task_nice_ioclass(struct task_struct *task)
|
||||
{
|
||||
if (task->policy == SCHED_IDLE)
|
||||
return IOPRIO_CLASS_IDLE;
|
||||
else if (task->policy == SCHED_FIFO || task->policy == SCHED_RR)
|
||||
else if (task_is_realtime(task))
|
||||
return IOPRIO_CLASS_RT;
|
||||
else
|
||||
return IOPRIO_CLASS_BE;
|
||||
|
@ -166,8 +166,6 @@ struct task_group;
|
||||
/* Task command name length: */
|
||||
#define TASK_COMM_LEN 16
|
||||
|
||||
extern cpumask_var_t cpu_isolated_map;
|
||||
|
||||
extern void scheduler_tick(void);
|
||||
|
||||
#define MAX_SCHEDULE_TIMEOUT LONG_MAX
|
||||
@ -332,9 +330,11 @@ struct load_weight {
|
||||
struct sched_avg {
|
||||
u64 last_update_time;
|
||||
u64 load_sum;
|
||||
u64 runnable_load_sum;
|
||||
u32 util_sum;
|
||||
u32 period_contrib;
|
||||
unsigned long load_avg;
|
||||
unsigned long runnable_load_avg;
|
||||
unsigned long util_avg;
|
||||
};
|
||||
|
||||
@ -377,6 +377,7 @@ struct sched_statistics {
|
||||
struct sched_entity {
|
||||
/* For load-balancing: */
|
||||
struct load_weight load;
|
||||
unsigned long runnable_weight;
|
||||
struct rb_node run_node;
|
||||
struct list_head group_node;
|
||||
unsigned int on_rq;
|
||||
@ -472,10 +473,10 @@ struct sched_dl_entity {
|
||||
* conditions between the inactive timer handler and the wakeup
|
||||
* code.
|
||||
*/
|
||||
int dl_throttled;
|
||||
int dl_boosted;
|
||||
int dl_yielded;
|
||||
int dl_non_contending;
|
||||
/*
 * Single-bit state flags. They are declared unsigned so that each one
 * holds exactly 0 or 1: a signed one-bit field can only represent 0 and
 * -1 (and whether a plain "int" bit-field is signed at all is
 * implementation-defined), which makes "flag = 1; flag == 1" evaluate
 * to false and triggers static-checker warnings.
 */
unsigned int dl_throttled : 1;
unsigned int dl_boosted : 1;
unsigned int dl_yielded : 1;
unsigned int dl_non_contending : 1;
|
||||
|
||||
/*
|
||||
* Bandwidth enforcement timer. Each -deadline task has its
|
||||
@ -1246,7 +1247,7 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk)
|
||||
#define TASK_REPORT_IDLE (TASK_REPORT + 1)
|
||||
#define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1)
|
||||
|
||||
static inline unsigned int __get_task_state(struct task_struct *tsk)
|
||||
static inline unsigned int task_state_index(struct task_struct *tsk)
|
||||
{
|
||||
unsigned int tsk_state = READ_ONCE(tsk->state);
|
||||
unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT;
|
||||
@ -1259,7 +1260,7 @@ static inline unsigned int __get_task_state(struct task_struct *tsk)
|
||||
return fls(state);
|
||||
}
|
||||
|
||||
static inline char __task_state_to_char(unsigned int state)
|
||||
static inline char task_index_to_char(unsigned int state)
|
||||
{
|
||||
static const char state_char[] = "RSDTtXZPI";
|
||||
|
||||
@ -1270,7 +1271,7 @@ static inline char __task_state_to_char(unsigned int state)
|
||||
|
||||
static inline char task_state_to_char(struct task_struct *tsk)
|
||||
{
|
||||
return __task_state_to_char(__get_task_state(tsk));
|
||||
return task_index_to_char(task_state_index(tsk));
|
||||
}
|
||||
|
||||
/**
|
||||
|
51
include/linux/sched/isolation.h
Normal file
51
include/linux/sched/isolation.h
Normal file
@ -0,0 +1,51 @@
|
||||
#ifndef _LINUX_SCHED_ISOLATION_H
|
||||
#define _LINUX_SCHED_ISOLATION_H
|
||||
|
||||
#include <linux/cpumask.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/tick.h>
|
||||
|
||||
/*
 * Housekeeping flags. Every enumerator is a distinct single bit, so a
 * value of this type can be tested against a bitmask of flags.
 */
enum hk_flags {
	HK_FLAG_TIMER	= (1 << 0),
	HK_FLAG_RCU	= (1 << 1),
	HK_FLAG_MISC	= (1 << 2),
	HK_FLAG_SCHED	= (1 << 3),
	HK_FLAG_TICK	= (1 << 4),
	HK_FLAG_DOMAIN	= (1 << 5),
};
|
||||
|
||||
#ifdef CONFIG_CPU_ISOLATION
|
||||
DECLARE_STATIC_KEY_FALSE(housekeeping_overriden);
|
||||
extern int housekeeping_any_cpu(enum hk_flags flags);
|
||||
extern const struct cpumask *housekeeping_cpumask(enum hk_flags flags);
|
||||
extern void housekeeping_affine(struct task_struct *t, enum hk_flags flags);
|
||||
extern bool housekeeping_test_cpu(int cpu, enum hk_flags flags);
|
||||
extern void __init housekeeping_init(void);
|
||||
|
||||
#else
|
||||
|
||||
static inline int housekeeping_any_cpu(enum hk_flags flags)
|
||||
{
|
||||
return smp_processor_id();
|
||||
}
|
||||
|
||||
static inline const struct cpumask *housekeeping_cpumask(enum hk_flags flags)
|
||||
{
|
||||
return cpu_possible_mask;
|
||||
}
|
||||
|
||||
static inline void housekeeping_affine(struct task_struct *t,
|
||||
enum hk_flags flags) { }
|
||||
static inline void housekeeping_init(void) { }
|
||||
#endif /* CONFIG_CPU_ISOLATION */
|
||||
|
||||
/*
 * housekeeping_cpu - report whether @cpu counts as a housekeeping CPU
 * @cpu:   CPU number to test
 * @flags: housekeeping flags handed on to housekeeping_test_cpu()
 *
 * With CONFIG_CPU_ISOLATION and the housekeeping_overriden static key
 * enabled, the answer is delegated to housekeeping_test_cpu(). In every
 * other case the function returns true.
 */
static inline bool housekeeping_cpu(int cpu, enum hk_flags flags)
{
	bool is_housekeeping = true;

#ifdef CONFIG_CPU_ISOLATION
	if (static_branch_unlikely(&housekeeping_overriden))
		is_housekeeping = housekeeping_test_cpu(cpu, flags);
#endif

	return is_housekeeping;
}
|
||||
|
||||
#endif /* _LINUX_SCHED_ISOLATION_H */
|
@ -18,6 +18,17 @@ static inline int rt_task(struct task_struct *p)
|
||||
return rt_prio(p->prio);
|
||||
}
|
||||
|
||||
static inline bool task_is_realtime(struct task_struct *tsk)
|
||||
{
|
||||
int policy = tsk->policy;
|
||||
|
||||
if (policy == SCHED_FIFO || policy == SCHED_RR)
|
||||
return true;
|
||||
if (policy == SCHED_DEADLINE)
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_RT_MUTEXES
|
||||
/*
|
||||
* Must hold either p->pi_lock or task_rq(p)->lock.
|
||||
|
@ -38,9 +38,9 @@ extern unsigned int sysctl_numa_balancing_scan_period_max;
|
||||
extern unsigned int sysctl_numa_balancing_scan_size;
|
||||
|
||||
#ifdef CONFIG_SCHED_DEBUG
|
||||
extern unsigned int sysctl_sched_migration_cost;
|
||||
extern unsigned int sysctl_sched_nr_migrate;
|
||||
extern unsigned int sysctl_sched_time_avg;
|
||||
extern __read_mostly unsigned int sysctl_sched_migration_cost;
|
||||
extern __read_mostly unsigned int sysctl_sched_nr_migrate;
|
||||
extern __read_mostly unsigned int sysctl_sched_time_avg;
|
||||
|
||||
int sched_proc_update_handler(struct ctl_table *table, int write,
|
||||
void __user *buffer, size_t *length,
|
||||
|
@ -138,7 +138,6 @@ static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; }
|
||||
#ifdef CONFIG_NO_HZ_FULL
|
||||
extern bool tick_nohz_full_running;
|
||||
extern cpumask_var_t tick_nohz_full_mask;
|
||||
extern cpumask_var_t housekeeping_mask;
|
||||
|
||||
static inline bool tick_nohz_full_enabled(void)
|
||||
{
|
||||
@ -162,11 +161,6 @@ static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask)
|
||||
cpumask_or(mask, mask, tick_nohz_full_mask);
|
||||
}
|
||||
|
||||
static inline int housekeeping_any_cpu(void)
|
||||
{
|
||||
return cpumask_any_and(housekeeping_mask, cpu_online_mask);
|
||||
}
|
||||
|
||||
extern void tick_nohz_dep_set(enum tick_dep_bits bit);
|
||||
extern void tick_nohz_dep_clear(enum tick_dep_bits bit);
|
||||
extern void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit);
|
||||
@ -235,11 +229,8 @@ static inline void tick_dep_clear_signal(struct signal_struct *signal,
|
||||
|
||||
extern void tick_nohz_full_kick_cpu(int cpu);
|
||||
extern void __tick_nohz_task_switch(void);
|
||||
extern void __init tick_nohz_full_setup(cpumask_var_t cpumask);
|
||||
#else
|
||||
static inline int housekeeping_any_cpu(void)
|
||||
{
|
||||
return smp_processor_id();
|
||||
}
|
||||
static inline bool tick_nohz_full_enabled(void) { return false; }
|
||||
static inline bool tick_nohz_full_cpu(int cpu) { return false; }
|
||||
static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask) { }
|
||||
@ -259,35 +250,9 @@ static inline void tick_dep_clear_signal(struct signal_struct *signal,
|
||||
|
||||
static inline void tick_nohz_full_kick_cpu(int cpu) { }
|
||||
static inline void __tick_nohz_task_switch(void) { }
|
||||
static inline void tick_nohz_full_setup(cpumask_var_t cpumask) { }
|
||||
#endif
|
||||
|
||||
static inline const struct cpumask *housekeeping_cpumask(void)
|
||||
{
|
||||
#ifdef CONFIG_NO_HZ_FULL
|
||||
if (tick_nohz_full_enabled())
|
||||
return housekeeping_mask;
|
||||
#endif
|
||||
return cpu_possible_mask;
|
||||
}
|
||||
|
||||
static inline bool is_housekeeping_cpu(int cpu)
|
||||
{
|
||||
#ifdef CONFIG_NO_HZ_FULL
|
||||
if (tick_nohz_full_enabled())
|
||||
return cpumask_test_cpu(cpu, housekeeping_mask);
|
||||
#endif
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline void housekeeping_affine(struct task_struct *t)
|
||||
{
|
||||
#ifdef CONFIG_NO_HZ_FULL
|
||||
if (tick_nohz_full_enabled())
|
||||
set_cpus_allowed_ptr(t, housekeeping_mask);
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void tick_nohz_task_switch(void)
|
||||
{
|
||||
if (tick_nohz_full_enabled())
|
||||
|
@ -118,7 +118,7 @@ static inline long __trace_sched_switch_state(bool preempt, struct task_struct *
|
||||
if (preempt)
|
||||
return TASK_STATE_MAX;
|
||||
|
||||
return __get_task_state(p);
|
||||
return task_state_index(p);
|
||||
}
|
||||
#endif /* CREATE_TRACE_POINTS */
|
||||
|
||||
|
@ -472,6 +472,13 @@ config TASK_IO_ACCOUNTING
|
||||
|
||||
endmenu # "CPU/Task time and stats accounting"
|
||||
|
||||
config CPU_ISOLATION
|
||||
bool "CPU isolation"
|
||||
help
|
||||
Make sure that CPUs running critical tasks are not disturbed by
|
||||
any source of "noise" such as unbound workqueues, timers, kthreads...
|
||||
Unbound jobs get offloaded to housekeeping CPUs.
|
||||
|
||||
source "kernel/rcu/Kconfig"
|
||||
|
||||
config BUILD_BIN2C
|
||||
|
@ -46,6 +46,7 @@
|
||||
#include <linux/cgroup.h>
|
||||
#include <linux/efi.h>
|
||||
#include <linux/tick.h>
|
||||
#include <linux/sched/isolation.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/taskstats_kern.h>
|
||||
#include <linux/delayacct.h>
|
||||
@ -606,6 +607,7 @@ asmlinkage __visible void __init start_kernel(void)
|
||||
early_irq_init();
|
||||
init_IRQ();
|
||||
tick_init();
|
||||
housekeeping_init();
|
||||
rcu_init_nohz();
|
||||
init_timers();
|
||||
hrtimers_init();
|
||||
|
@ -57,7 +57,7 @@
|
||||
#include <linux/backing-dev.h>
|
||||
#include <linux/sort.h>
|
||||
#include <linux/oom.h>
|
||||
|
||||
#include <linux/sched/isolation.h>
|
||||
#include <linux/uaccess.h>
|
||||
#include <linux/atomic.h>
|
||||
#include <linux/mutex.h>
|
||||
@ -656,7 +656,6 @@ static int generate_sched_domains(cpumask_var_t **domains,
|
||||
int csn; /* how many cpuset ptrs in csa so far */
|
||||
int i, j, k; /* indices for partition finding loops */
|
||||
cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
|
||||
cpumask_var_t non_isolated_cpus; /* load balanced CPUs */
|
||||
struct sched_domain_attr *dattr; /* attributes for custom domains */
|
||||
int ndoms = 0; /* number of sched domains in result */
|
||||
int nslot; /* next empty doms[] struct cpumask slot */
|
||||
@ -666,10 +665,6 @@ static int generate_sched_domains(cpumask_var_t **domains,
|
||||
dattr = NULL;
|
||||
csa = NULL;
|
||||
|
||||
if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL))
|
||||
goto done;
|
||||
cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
|
||||
|
||||
/* Special case for the 99% of systems with one, full, sched domain */
|
||||
if (is_sched_load_balance(&top_cpuset)) {
|
||||
ndoms = 1;
|
||||
@ -683,7 +678,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
|
||||
update_domain_attr_tree(dattr, &top_cpuset);
|
||||
}
|
||||
cpumask_and(doms[0], top_cpuset.effective_cpus,
|
||||
non_isolated_cpus);
|
||||
housekeeping_cpumask(HK_FLAG_DOMAIN));
|
||||
|
||||
goto done;
|
||||
}
|
||||
@ -707,7 +702,8 @@ static int generate_sched_domains(cpumask_var_t **domains,
|
||||
*/
|
||||
if (!cpumask_empty(cp->cpus_allowed) &&
|
||||
!(is_sched_load_balance(cp) &&
|
||||
cpumask_intersects(cp->cpus_allowed, non_isolated_cpus)))
|
||||
cpumask_intersects(cp->cpus_allowed,
|
||||
housekeeping_cpumask(HK_FLAG_DOMAIN))))
|
||||
continue;
|
||||
|
||||
if (is_sched_load_balance(cp))
|
||||
@ -789,7 +785,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
|
||||
|
||||
if (apn == b->pn) {
|
||||
cpumask_or(dp, dp, b->effective_cpus);
|
||||
cpumask_and(dp, dp, non_isolated_cpus);
|
||||
cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN));
|
||||
if (dattr)
|
||||
update_domain_attr_tree(dattr + nslot, b);
|
||||
|
||||
@ -802,7 +798,6 @@ static int generate_sched_domains(cpumask_var_t **domains,
|
||||
BUG_ON(nslot != ndoms);
|
||||
|
||||
done:
|
||||
free_cpumask_var(non_isolated_cpus);
|
||||
kfree(csa);
|
||||
|
||||
/*
|
||||
|
@ -29,6 +29,7 @@
|
||||
#include <linux/oom.h>
|
||||
#include <linux/sched/debug.h>
|
||||
#include <linux/smpboot.h>
|
||||
#include <linux/sched/isolation.h>
|
||||
#include <uapi/linux/sched/types.h>
|
||||
#include "../time/tick-internal.h"
|
||||
|
||||
@ -2587,7 +2588,7 @@ static void rcu_bind_gp_kthread(void)
|
||||
|
||||
if (!tick_nohz_full_enabled())
|
||||
return;
|
||||
housekeeping_affine(current);
|
||||
housekeeping_affine(current, HK_FLAG_RCU);
|
||||
}
|
||||
|
||||
/* Record the current task on dyntick-idle entry. */
|
||||
|
@ -51,6 +51,7 @@
|
||||
#include <linux/kthread.h>
|
||||
#include <linux/tick.h>
|
||||
#include <linux/rcupdate_wait.h>
|
||||
#include <linux/sched/isolation.h>
|
||||
|
||||
#define CREATE_TRACE_POINTS
|
||||
|
||||
@ -714,7 +715,7 @@ static int __noreturn rcu_tasks_kthread(void *arg)
|
||||
LIST_HEAD(rcu_tasks_holdouts);
|
||||
|
||||
/* Run on housekeeping CPUs by default. Sysadm can move if desired. */
|
||||
housekeeping_affine(current);
|
||||
housekeeping_affine(current, HK_FLAG_RCU);
|
||||
|
||||
/*
|
||||
* Each pass through the following loop makes one check for
|
||||
|
@ -27,3 +27,4 @@ obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
|
||||
obj-$(CONFIG_CPU_FREQ) += cpufreq.o
|
||||
obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
|
||||
obj-$(CONFIG_MEMBARRIER) += membarrier.o
|
||||
obj-$(CONFIG_CPU_ISOLATION) += isolation.o
|
||||
|
@ -26,6 +26,7 @@
|
||||
#include <linux/profile.h>
|
||||
#include <linux/security.h>
|
||||
#include <linux/syscalls.h>
|
||||
#include <linux/sched/isolation.h>
|
||||
|
||||
#include <asm/switch_to.h>
|
||||
#include <asm/tlb.h>
|
||||
@ -42,18 +43,21 @@
|
||||
|
||||
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
|
||||
|
||||
#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
|
||||
/*
|
||||
* Debugging: various feature bits
|
||||
*
|
||||
* If SCHED_DEBUG is disabled, each compilation unit has its own copy of
|
||||
* sysctl_sched_features, defined in sched.h, to allow constants propagation
|
||||
* at compile time and compiler optimization based on features default.
|
||||
*/
|
||||
|
||||
#define SCHED_FEAT(name, enabled) \
|
||||
(1UL << __SCHED_FEAT_##name) * enabled |
|
||||
|
||||
const_debug unsigned int sysctl_sched_features =
|
||||
#include "features.h"
|
||||
0;
|
||||
|
||||
#undef SCHED_FEAT
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Number of tasks to iterate in a single balance run.
|
||||
@ -83,9 +87,6 @@ __read_mostly int scheduler_running;
|
||||
*/
|
||||
int sysctl_sched_rt_runtime = 950000;
|
||||
|
||||
/* CPUs with isolated domains */
|
||||
cpumask_var_t cpu_isolated_map;
|
||||
|
||||
/*
|
||||
* __task_rq_lock - lock the rq @p resides on.
|
||||
*/
|
||||
@ -525,7 +526,7 @@ int get_nohz_timer_target(void)
|
||||
int i, cpu = smp_processor_id();
|
||||
struct sched_domain *sd;
|
||||
|
||||
if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
|
||||
if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER))
|
||||
return cpu;
|
||||
|
||||
rcu_read_lock();
|
||||
@ -534,15 +535,15 @@ int get_nohz_timer_target(void)
|
||||
if (cpu == i)
|
||||
continue;
|
||||
|
||||
if (!idle_cpu(i) && is_housekeeping_cpu(i)) {
|
||||
if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) {
|
||||
cpu = i;
|
||||
goto unlock;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!is_housekeeping_cpu(cpu))
|
||||
cpu = housekeeping_any_cpu();
|
||||
if (!housekeeping_cpu(cpu, HK_FLAG_TIMER))
|
||||
cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
|
||||
unlock:
|
||||
rcu_read_unlock();
|
||||
return cpu;
|
||||
@ -732,7 +733,7 @@ int tg_nop(struct task_group *tg, void *data)
|
||||
}
|
||||
#endif
|
||||
|
||||
static void set_load_weight(struct task_struct *p)
|
||||
static void set_load_weight(struct task_struct *p, bool update_load)
|
||||
{
|
||||
int prio = p->static_prio - MAX_RT_PRIO;
|
||||
struct load_weight *load = &p->se.load;
|
||||
@ -746,8 +747,16 @@ static void set_load_weight(struct task_struct *p)
|
||||
return;
|
||||
}
|
||||
|
||||
load->weight = scale_load(sched_prio_to_weight[prio]);
|
||||
load->inv_weight = sched_prio_to_wmult[prio];
|
||||
/*
|
||||
* SCHED_OTHER tasks have to update their load when changing their
|
||||
* weight
|
||||
*/
|
||||
if (update_load && p->sched_class == &fair_sched_class) {
|
||||
reweight_task(p, prio);
|
||||
} else {
|
||||
load->weight = scale_load(sched_prio_to_weight[prio]);
|
||||
load->inv_weight = sched_prio_to_wmult[prio];
|
||||
}
|
||||
}
|
||||
|
||||
static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
|
||||
@ -2357,7 +2366,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
|
||||
p->static_prio = NICE_TO_PRIO(0);
|
||||
|
||||
p->prio = p->normal_prio = __normal_prio(p);
|
||||
set_load_weight(p);
|
||||
set_load_weight(p, false);
|
||||
|
||||
/*
|
||||
* We don't need the reset flag anymore after the fork. It has
|
||||
@ -3804,7 +3813,7 @@ void set_user_nice(struct task_struct *p, long nice)
|
||||
put_prev_task(rq, p);
|
||||
|
||||
p->static_prio = NICE_TO_PRIO(nice);
|
||||
set_load_weight(p);
|
||||
set_load_weight(p, true);
|
||||
old_prio = p->prio;
|
||||
p->prio = effective_prio(p);
|
||||
delta = p->prio - old_prio;
|
||||
@ -3961,7 +3970,7 @@ static void __setscheduler_params(struct task_struct *p,
|
||||
*/
|
||||
p->rt_priority = attr->sched_priority;
|
||||
p->normal_prio = normal_prio(p);
|
||||
set_load_weight(p);
|
||||
set_load_weight(p, true);
|
||||
}
|
||||
|
||||
/* Actually do priority change: must hold pi & rq lock. */
|
||||
@ -5727,10 +5736,6 @@ static inline void sched_init_smt(void) { }
|
||||
|
||||
void __init sched_init_smp(void)
|
||||
{
|
||||
cpumask_var_t non_isolated_cpus;
|
||||
|
||||
alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
|
||||
|
||||
sched_init_numa();
|
||||
|
||||
/*
|
||||
@ -5740,16 +5745,12 @@ void __init sched_init_smp(void)
|
||||
*/
|
||||
mutex_lock(&sched_domains_mutex);
|
||||
sched_init_domains(cpu_active_mask);
|
||||
cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
|
||||
if (cpumask_empty(non_isolated_cpus))
|
||||
cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
|
||||
mutex_unlock(&sched_domains_mutex);
|
||||
|
||||
/* Move init over to a non-isolated CPU */
|
||||
if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
|
||||
if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
|
||||
BUG();
|
||||
sched_init_granularity();
|
||||
free_cpumask_var(non_isolated_cpus);
|
||||
|
||||
init_sched_rt_class();
|
||||
init_sched_dl_class();
|
||||
@ -5934,7 +5935,7 @@ void __init sched_init(void)
|
||||
atomic_set(&rq->nr_iowait, 0);
|
||||
}
|
||||
|
||||
set_load_weight(&init_task);
|
||||
set_load_weight(&init_task, false);
|
||||
|
||||
/*
|
||||
* The boot idle thread does lazy MMU switching as well:
|
||||
@ -5953,9 +5954,6 @@ void __init sched_init(void)
|
||||
calc_load_update = jiffies + LOAD_FREQ;
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
/* May be allocated at isolcpus cmdline parse time */
|
||||
if (cpu_isolated_map == NULL)
|
||||
zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
|
||||
idle_thread_set_boot_cpu();
|
||||
set_cpu_rq_start_time(smp_processor_id());
|
||||
#endif
|
||||
|
@ -243,7 +243,7 @@ static void task_non_contending(struct task_struct *p)
|
||||
if (p->state == TASK_DEAD)
|
||||
sub_rq_bw(p->dl.dl_bw, &rq->dl);
|
||||
raw_spin_lock(&dl_b->lock);
|
||||
__dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
|
||||
__dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
|
||||
__dl_clear_params(p);
|
||||
raw_spin_unlock(&dl_b->lock);
|
||||
}
|
||||
@ -1210,7 +1210,7 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
|
||||
}
|
||||
|
||||
raw_spin_lock(&dl_b->lock);
|
||||
__dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
|
||||
__dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
|
||||
raw_spin_unlock(&dl_b->lock);
|
||||
__dl_clear_params(p);
|
||||
|
||||
@ -1365,6 +1365,10 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
|
||||
update_dl_entity(dl_se, pi_se);
|
||||
} else if (flags & ENQUEUE_REPLENISH) {
|
||||
replenish_dl_entity(dl_se, pi_se);
|
||||
} else if ((flags & ENQUEUE_RESTORE) &&
|
||||
dl_time_before(dl_se->deadline,
|
||||
rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) {
|
||||
setup_new_dl_entity(dl_se);
|
||||
}
|
||||
|
||||
__enqueue_dl_entity(dl_se);
|
||||
@ -2167,7 +2171,7 @@ static void set_cpus_allowed_dl(struct task_struct *p,
|
||||
* until we complete the update.
|
||||
*/
|
||||
raw_spin_lock(&src_dl_b->lock);
|
||||
__dl_clear(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
|
||||
__dl_sub(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
|
||||
raw_spin_unlock(&src_dl_b->lock);
|
||||
}
|
||||
|
||||
@ -2256,13 +2260,6 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
|
||||
|
||||
return;
|
||||
}
|
||||
/*
|
||||
* If p is boosted we already updated its params in
|
||||
* rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH),
|
||||
* p's deadline being now already after rq_clock(rq).
|
||||
*/
|
||||
if (dl_time_before(p->dl.deadline, rq_clock(rq)))
|
||||
setup_new_dl_entity(&p->dl);
|
||||
|
||||
if (rq->curr != p) {
|
||||
#ifdef CONFIG_SMP
|
||||
@ -2452,7 +2449,7 @@ int sched_dl_overflow(struct task_struct *p, int policy,
|
||||
if (dl_policy(policy) && !task_has_dl_policy(p) &&
|
||||
!__dl_overflow(dl_b, cpus, 0, new_bw)) {
|
||||
if (hrtimer_active(&p->dl.inactive_timer))
|
||||
__dl_clear(dl_b, p->dl.dl_bw, cpus);
|
||||
__dl_sub(dl_b, p->dl.dl_bw, cpus);
|
||||
__dl_add(dl_b, new_bw, cpus);
|
||||
err = 0;
|
||||
} else if (dl_policy(policy) && task_has_dl_policy(p) &&
|
||||
@ -2464,7 +2461,7 @@ int sched_dl_overflow(struct task_struct *p, int policy,
|
||||
* But this would require to set the task's "inactive
|
||||
* timer" when the task is not inactive.
|
||||
*/
|
||||
__dl_clear(dl_b, p->dl.dl_bw, cpus);
|
||||
__dl_sub(dl_b, p->dl.dl_bw, cpus);
|
||||
__dl_add(dl_b, new_bw, cpus);
|
||||
dl_change_utilization(p, new_bw);
|
||||
err = 0;
|
||||
|
@ -441,9 +441,11 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
|
||||
P_SCHEDSTAT(se->statistics.wait_count);
|
||||
}
|
||||
P(se->load.weight);
|
||||
P(se->runnable_weight);
|
||||
#ifdef CONFIG_SMP
|
||||
P(se->avg.load_avg);
|
||||
P(se->avg.util_avg);
|
||||
P(se->avg.runnable_load_avg);
|
||||
#endif
|
||||
|
||||
#undef PN_SCHEDSTAT
|
||||
@ -558,16 +560,19 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
|
||||
SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
|
||||
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
|
||||
#ifdef CONFIG_SMP
|
||||
SEQ_printf(m, " .%-30s: %ld\n", "runnable_weight", cfs_rq->runnable_weight);
|
||||
SEQ_printf(m, " .%-30s: %lu\n", "load_avg",
|
||||
cfs_rq->avg.load_avg);
|
||||
SEQ_printf(m, " .%-30s: %lu\n", "runnable_load_avg",
|
||||
cfs_rq->runnable_load_avg);
|
||||
cfs_rq->avg.runnable_load_avg);
|
||||
SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
|
||||
cfs_rq->avg.util_avg);
|
||||
SEQ_printf(m, " .%-30s: %ld\n", "removed_load_avg",
|
||||
atomic_long_read(&cfs_rq->removed_load_avg));
|
||||
SEQ_printf(m, " .%-30s: %ld\n", "removed_util_avg",
|
||||
atomic_long_read(&cfs_rq->removed_util_avg));
|
||||
SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg",
|
||||
cfs_rq->removed.load_avg);
|
||||
SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg",
|
||||
cfs_rq->removed.util_avg);
|
||||
SEQ_printf(m, " .%-30s: %ld\n", "removed.runnable_sum",
|
||||
cfs_rq->removed.runnable_sum);
|
||||
#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib",
|
||||
cfs_rq->tg_load_avg_contrib);
|
||||
@ -1004,10 +1009,13 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
|
||||
"nr_involuntary_switches", (long long)p->nivcsw);
|
||||
|
||||
P(se.load.weight);
|
||||
P(se.runnable_weight);
|
||||
#ifdef CONFIG_SMP
|
||||
P(se.avg.load_sum);
|
||||
P(se.avg.runnable_load_sum);
|
||||
P(se.avg.util_sum);
|
||||
P(se.avg.load_avg);
|
||||
P(se.avg.runnable_load_avg);
|
||||
P(se.avg.util_avg);
|
||||
P(se.avg.last_update_time);
|
||||
#endif
|
||||
|
1091
kernel/sched/fair.c
1091
kernel/sched/fair.c
File diff suppressed because it is too large
Load Diff
@ -209,6 +209,7 @@ static void cpuidle_idle_call(void)
|
||||
*/
|
||||
static void do_idle(void)
|
||||
{
|
||||
int cpu = smp_processor_id();
|
||||
/*
|
||||
* If the arch has a polling bit, we maintain an invariant:
|
||||
*
|
||||
@ -219,14 +220,13 @@ static void do_idle(void)
|
||||
*/
|
||||
|
||||
__current_set_polling();
|
||||
quiet_vmstat();
|
||||
tick_nohz_idle_enter();
|
||||
|
||||
while (!need_resched()) {
|
||||
check_pgt_cache();
|
||||
rmb();
|
||||
|
||||
if (cpu_is_offline(smp_processor_id())) {
|
||||
if (cpu_is_offline(cpu)) {
|
||||
cpuhp_report_idle_dead();
|
||||
arch_cpu_idle_dead();
|
||||
}
|
||||
|
155
kernel/sched/isolation.c
Normal file
155
kernel/sched/isolation.c
Normal file
@ -0,0 +1,155 @@
|
||||
/*
|
||||
* Housekeeping management. Manage the targets for routine code that can run on
|
||||
* any CPU: unbound workqueues, timers, kthreads and any offloadable work.
|
||||
*
|
||||
* Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/sched/isolation.h>
|
||||
#include <linux/tick.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/static_key.h>
|
||||
#include <linux/ctype.h>
|
||||
|
||||
DEFINE_STATIC_KEY_FALSE(housekeeping_overriden);
|
||||
EXPORT_SYMBOL_GPL(housekeeping_overriden);
|
||||
static cpumask_var_t housekeeping_mask;
|
||||
static unsigned int housekeeping_flags;
|
||||
|
||||
int housekeeping_any_cpu(enum hk_flags flags)
|
||||
{
|
||||
if (static_branch_unlikely(&housekeeping_overriden))
|
||||
if (housekeeping_flags & flags)
|
||||
return cpumask_any_and(housekeeping_mask, cpu_online_mask);
|
||||
return smp_processor_id();
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(housekeeping_any_cpu);
|
||||
|
||||
const struct cpumask *housekeeping_cpumask(enum hk_flags flags)
|
||||
{
|
||||
if (static_branch_unlikely(&housekeeping_overriden))
|
||||
if (housekeeping_flags & flags)
|
||||
return housekeeping_mask;
|
||||
return cpu_possible_mask;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(housekeeping_cpumask);
|
||||
|
||||
void housekeeping_affine(struct task_struct *t, enum hk_flags flags)
|
||||
{
|
||||
if (static_branch_unlikely(&housekeeping_overriden))
|
||||
if (housekeeping_flags & flags)
|
||||
set_cpus_allowed_ptr(t, housekeeping_mask);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(housekeeping_affine);
|
||||
|
||||
bool housekeeping_test_cpu(int cpu, enum hk_flags flags)
|
||||
{
|
||||
if (static_branch_unlikely(&housekeeping_overriden))
|
||||
if (housekeeping_flags & flags)
|
||||
return cpumask_test_cpu(cpu, housekeeping_mask);
|
||||
return true;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(housekeeping_test_cpu);
|
||||
|
||||
/*
 * Turn on the housekeeping_overriden static key if any housekeeping flag
 * was recorded by the boot parameter handlers. Does nothing when
 * housekeeping_flags is zero.
 */
void __init housekeeping_init(void)
{
	if (housekeeping_flags) {
		static_branch_enable(&housekeeping_overriden);

		/* We need at least one CPU to handle housekeeping work */
		WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
	}
}
|
||||
|
||||
/*
 * Common handler for the "nohz_full=" and "isolcpus=" boot parameters.
 *
 * @str:   CPU list naming the CPUs to exclude from housekeeping.
 * @flags: housekeeping kinds (HK_FLAG_*) requested for this parameter.
 *
 * The first successful call builds housekeeping_mask as cpu_possible_mask
 * minus the listed CPUs, falling back to the current CPU if that leaves the
 * mask empty. Any later call must describe exactly the same mask and only
 * adds its @flags to housekeeping_flags.
 *
 * Returns 1 when the parameter was accepted, 0 when it was rejected.
 */
static int __init housekeeping_setup(char *str, enum hk_flags flags)
{
	cpumask_var_t non_housekeeping_mask;
	int err;

	/*
	 * Reject an unsupported tick request before allocating anything.
	 * Bailing out after housekeeping_mask has been allocated would leave
	 * it allocated while housekeeping_flags is still zero, so the next
	 * call would allocate it a second time.
	 */
	if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) {
		if (!IS_ENABLED(CONFIG_NO_HZ_FULL)) {
			pr_warn("Housekeeping: nohz unsupported."
				" Build with CONFIG_NO_HZ_FULL\n");
			return 0;
		}
	}

	alloc_bootmem_cpumask_var(&non_housekeeping_mask);
	err = cpulist_parse(str, non_housekeeping_mask);
	if (err < 0 || cpumask_last(non_housekeeping_mask) >= nr_cpu_ids) {
		pr_warn("Housekeeping: nohz_full= or isolcpus= incorrect CPU range\n");
		free_bootmem_cpumask_var(non_housekeeping_mask);
		return 0;
	}

	if (!housekeeping_flags) {
		/* First parameter seen: it defines the housekeeping mask. */
		alloc_bootmem_cpumask_var(&housekeeping_mask);
		cpumask_andnot(housekeeping_mask,
			       cpu_possible_mask, non_housekeeping_mask);
		if (cpumask_empty(housekeeping_mask))
			cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
	} else {
		cpumask_var_t tmp;

		/* Later parameters must agree with the mask already set. */
		alloc_bootmem_cpumask_var(&tmp);
		cpumask_andnot(tmp, cpu_possible_mask, non_housekeeping_mask);
		if (!cpumask_equal(tmp, housekeeping_mask)) {
			pr_warn("Housekeeping: nohz_full= must match isolcpus=\n");
			free_bootmem_cpumask_var(tmp);
			free_bootmem_cpumask_var(non_housekeeping_mask);
			return 0;
		}
		free_bootmem_cpumask_var(tmp);
	}

	/* Hand the list to the tick code the first time HK_FLAG_TICK is asked for. */
	if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK))
		tick_nohz_full_setup(non_housekeeping_mask);

	housekeeping_flags |= flags;

	free_bootmem_cpumask_var(non_housekeeping_mask);

	return 1;
}
|
||||
|
||||
static int __init housekeeping_nohz_full_setup(char *str)
|
||||
{
|
||||
unsigned int flags;
|
||||
|
||||
flags = HK_FLAG_TICK | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC;
|
||||
|
||||
return housekeeping_setup(str, flags);
|
||||
}
|
||||
__setup("nohz_full=", housekeeping_nohz_full_setup);
|
||||
|
||||
static int __init housekeeping_isolcpus_setup(char *str)
|
||||
{
|
||||
unsigned int flags = 0;
|
||||
|
||||
while (isalpha(*str)) {
|
||||
if (!strncmp(str, "nohz,", 5)) {
|
||||
str += 5;
|
||||
flags |= HK_FLAG_TICK;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!strncmp(str, "domain,", 7)) {
|
||||
str += 7;
|
||||
flags |= HK_FLAG_DOMAIN;
|
||||
continue;
|
||||
}
|
||||
|
||||
pr_warn("isolcpus: Error, unknown flag\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Default behaviour for isolcpus without flags */
|
||||
if (!flags)
|
||||
flags |= HK_FLAG_DOMAIN;
|
||||
|
||||
return housekeeping_setup(str, flags);
|
||||
}
|
||||
__setup("isolcpus=", housekeeping_isolcpus_setup);
|
@ -74,10 +74,6 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
|
||||
raw_spin_unlock(&rt_b->rt_runtime_lock);
|
||||
}
|
||||
|
||||
#if defined(CONFIG_SMP) && defined(HAVE_RT_PUSH_IPI)
|
||||
static void push_irq_work_func(struct irq_work *work);
|
||||
#endif
|
||||
|
||||
void init_rt_rq(struct rt_rq *rt_rq)
|
||||
{
|
||||
struct rt_prio_array *array;
|
||||
@ -97,13 +93,6 @@ void init_rt_rq(struct rt_rq *rt_rq)
|
||||
rt_rq->rt_nr_migratory = 0;
|
||||
rt_rq->overloaded = 0;
|
||||
plist_head_init(&rt_rq->pushable_tasks);
|
||||
|
||||
#ifdef HAVE_RT_PUSH_IPI
|
||||
rt_rq->push_flags = 0;
|
||||
rt_rq->push_cpu = nr_cpu_ids;
|
||||
raw_spin_lock_init(&rt_rq->push_lock);
|
||||
init_irq_work(&rt_rq->push_work, push_irq_work_func);
|
||||
#endif
|
||||
#endif /* CONFIG_SMP */
|
||||
/* We start in dequeued state, because no RT tasks are queued */
|
||||
rt_rq->rt_queued = 0;
|
||||
@ -1876,68 +1865,6 @@ static void push_rt_tasks(struct rq *rq)
|
||||
}
|
||||
|
||||
#ifdef HAVE_RT_PUSH_IPI
|
||||
/*
|
||||
* The search for the next cpu always starts at rq->cpu and ends
|
||||
* when we reach rq->cpu again. It will never return rq->cpu.
|
||||
* This returns the next cpu to check, or nr_cpu_ids if the loop
|
||||
* is complete.
|
||||
*
|
||||
* rq->rt.push_cpu holds the last cpu returned by this function,
|
||||
* or if this is the first instance, it must hold rq->cpu.
|
||||
*/
|
||||
static int rto_next_cpu(struct rq *rq)
|
||||
{
|
||||
int prev_cpu = rq->rt.push_cpu;
|
||||
int cpu;
|
||||
|
||||
cpu = cpumask_next(prev_cpu, rq->rd->rto_mask);
|
||||
|
||||
/*
|
||||
* If the previous cpu is less than the rq's CPU, then it already
|
||||
* passed the end of the mask, and has started from the beginning.
|
||||
* We end if the next CPU is greater or equal to rq's CPU.
|
||||
*/
|
||||
if (prev_cpu < rq->cpu) {
|
||||
if (cpu >= rq->cpu)
|
||||
return nr_cpu_ids;
|
||||
|
||||
} else if (cpu >= nr_cpu_ids) {
|
||||
/*
|
||||
* We passed the end of the mask, start at the beginning.
|
||||
* If the result is greater or equal to the rq's CPU, then
|
||||
* the loop is finished.
|
||||
*/
|
||||
cpu = cpumask_first(rq->rd->rto_mask);
|
||||
if (cpu >= rq->cpu)
|
||||
return nr_cpu_ids;
|
||||
}
|
||||
rq->rt.push_cpu = cpu;
|
||||
|
||||
/* Return cpu to let the caller know if the loop is finished or not */
|
||||
return cpu;
|
||||
}
|
||||
|
||||
static int find_next_push_cpu(struct rq *rq)
|
||||
{
|
||||
struct rq *next_rq;
|
||||
int cpu;
|
||||
|
||||
while (1) {
|
||||
cpu = rto_next_cpu(rq);
|
||||
if (cpu >= nr_cpu_ids)
|
||||
break;
|
||||
next_rq = cpu_rq(cpu);
|
||||
|
||||
/* Make sure the next rq can push to this rq */
|
||||
if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr)
|
||||
break;
|
||||
}
|
||||
|
||||
return cpu;
|
||||
}
|
||||
|
||||
#define RT_PUSH_IPI_EXECUTING 1
|
||||
#define RT_PUSH_IPI_RESTART 2
|
||||
|
||||
/*
|
||||
* When a high priority task schedules out from a CPU and a lower priority
|
||||
@ -1947,170 +1874,157 @@ static int find_next_push_cpu(struct rq *rq)
|
||||
* tasks queued on it (overloaded) needs to be notified that a CPU has opened
|
||||
* up that may be able to run one of its non-running queued RT tasks.
|
||||
*
|
||||
* On large CPU boxes, there's the case that several CPUs could schedule
|
||||
* a lower priority task at the same time, in which case it will look for
|
||||
* any overloaded CPUs that it could pull a task from. To do this, the runqueue
|
||||
* lock must be taken from that overloaded CPU. Having 10s of CPUs all fighting
|
||||
* for a single overloaded CPU's runqueue lock can produce a large latency.
|
||||
* (This has actually been observed on large boxes running cyclictest).
|
||||
* Instead of taking the runqueue lock of the overloaded CPU, each of the
|
||||
* CPUs that scheduled a lower priority task simply sends an IPI to the
|
||||
* overloaded CPU. An IPI is much cheaper than taking an runqueue lock with
|
||||
* lots of contention. The overloaded CPU will look to push its non-running
|
||||
* RT task off, and if it does, it can then ignore the other IPIs coming
|
||||
* in, and just pass those IPIs off to any other overloaded CPU.
|
||||
* All CPUs with overloaded RT tasks need to be notified as there is currently
|
||||
* no way to know which of these CPUs have the highest priority task waiting
|
||||
* to run. Instead of trying to take a spinlock on each of these CPUs,
|
||||
* which has shown to cause large latency when done on machines with many
|
||||
* CPUs, sending an IPI to the CPUs to have them push off the overloaded
|
||||
* RT tasks waiting to run.
|
||||
*
|
||||
* When a CPU schedules a lower priority task, it only sends an IPI to
|
||||
* the "next" CPU that has overloaded RT tasks. This prevents IPI storms,
|
||||
* as having 10 CPUs scheduling lower priority tasks and 10 CPUs with
|
||||
* RT overloaded tasks, would cause 100 IPIs to go out at once.
|
||||
* Just sending an IPI to each of the CPUs is also an issue, as on large
|
||||
* count CPU machines, this can cause an IPI storm on a CPU, especially
|
||||
* if its the only CPU with multiple RT tasks queued, and a large number
|
||||
* of CPUs scheduling a lower priority task at the same time.
|
||||
*
|
||||
* The overloaded RT CPU, when receiving an IPI, will try to push off its
|
||||
* overloaded RT tasks and then send an IPI to the next CPU that has
|
||||
* overloaded RT tasks. This stops when all CPUs with overloaded RT tasks
|
||||
* have completed. Just because a CPU may have pushed off its own overloaded
|
||||
* RT task does not mean it should stop sending the IPI around to other
|
||||
* overloaded CPUs. There may be another RT task waiting to run on one of
|
||||
* those CPUs that are of higher priority than the one that was just
|
||||
* pushed.
|
||||
* Each root domain has its own irq work function that can iterate over
|
||||
* all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
|
||||
* tasks must be checked if there's one or many CPUs that are lowering
|
||||
* their priority, there's a single irq work iterator that will try to
|
||||
* push off RT tasks that are waiting to run.
|
||||
*
|
||||
* An optimization that could possibly be made is to make a CPU array similar
|
||||
* to the cpupri array mask of all running RT tasks, but for the overloaded
|
||||
* case, then the IPI could be sent to only the CPU with the highest priority
|
||||
* RT task waiting, and that CPU could send off further IPIs to the CPU with
|
||||
* the next highest waiting task. Since the overloaded case is much less likely
|
||||
* to happen, the complexity of this implementation may not be worth it.
|
||||
* Instead, just send an IPI around to all overloaded CPUs.
|
||||
* When a CPU schedules a lower priority task, it will kick off the
|
||||
* irq work iterator that will jump to each CPU with overloaded RT tasks.
|
||||
* As it only takes the first CPU that schedules a lower priority task
|
||||
* to start the process, the rto_loop_start variable is incremented and if
|
||||
* the atomic result is one, then that CPU will try to take the rto_lock.
|
||||
* This prevents high contention on the lock as the process handles all
|
||||
* CPUs scheduling lower priority tasks.
|
||||
*
|
||||
* The rq->rt.push_flags holds the status of the IPI that is going around.
|
||||
* A run queue can only send out a single IPI at a time. The possible flags
|
||||
* for rq->rt.push_flags are:
|
||||
* All CPUs that are scheduling a lower priority task will increment the
|
||||
* rto_loop_next variable. This will make sure that the irq work iterator
|
||||
* checks all RT overloaded CPUs whenever a CPU schedules a new lower
|
||||
* priority task, even if the iterator is in the middle of a scan. Incrementing
|
||||
* the rto_loop_next will cause the iterator to perform another scan.
|
||||
*
|
||||
* (None or zero): No IPI is going around for the current rq
|
||||
* RT_PUSH_IPI_EXECUTING: An IPI for the rq is being passed around
|
||||
* RT_PUSH_IPI_RESTART: The priority of the running task for the rq
|
||||
* has changed, and the IPI should restart
|
||||
* circulating the overloaded CPUs again.
|
||||
*
|
||||
* rq->rt.push_cpu contains the CPU that is being sent the IPI. It is updated
|
||||
* before sending to the next CPU.
|
||||
*
|
||||
* Instead of having all CPUs that schedule a lower priority task send
|
||||
* an IPI to the same "first" CPU in the RT overload mask, they send it
|
||||
* to the next overloaded CPU after their own CPU. This helps distribute
|
||||
* the work when there's more than one overloaded CPU and multiple CPUs
|
||||
* scheduling in lower priority tasks.
|
||||
*
|
||||
* When a rq schedules a lower priority task than what was currently
|
||||
* running, the next CPU with overloaded RT tasks is examined first.
|
||||
* That is, if CPU 1 and 5 are overloaded, and CPU 3 schedules a lower
|
||||
* priority task, it will send an IPI first to CPU 5, then CPU 5 will
|
||||
* send to CPU 1 if it is still overloaded. CPU 1 will clear the
|
||||
* rq->rt.push_flags if RT_PUSH_IPI_RESTART is not set.
|
||||
*
|
||||
* The first CPU to notice IPI_RESTART is set, will clear that flag and then
|
||||
* send an IPI to the next overloaded CPU after the rq->cpu and not the next
|
||||
* CPU after push_cpu. That is, if CPU 1, 4 and 5 are overloaded when CPU 3
|
||||
* schedules a lower priority task, and the IPI_RESTART gets set while the
|
||||
* handling is being done on CPU 5, it will clear the flag and send it back to
|
||||
* CPU 4 instead of CPU 1.
|
||||
*
|
||||
* Note, the above logic can be disabled by turning off the sched_feature
|
||||
* RT_PUSH_IPI. Then the rq lock of the overloaded CPU will simply be
|
||||
* taken by the CPU requesting a pull and the waiting RT task will be pulled
|
||||
* by that CPU. This may be fine for machines with few CPUs.
|
||||
*/
|
||||
static void tell_cpu_to_push(struct rq *rq)
|
||||
static int rto_next_cpu(struct rq *rq)
|
||||
{
|
||||
struct root_domain *rd = rq->rd;
|
||||
int next;
|
||||
int cpu;
|
||||
|
||||
if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
|
||||
raw_spin_lock(&rq->rt.push_lock);
|
||||
/* Make sure it's still executing */
|
||||
if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
|
||||
/*
|
||||
* Tell the IPI to restart the loop as things have
|
||||
* changed since it started.
|
||||
*/
|
||||
rq->rt.push_flags |= RT_PUSH_IPI_RESTART;
|
||||
raw_spin_unlock(&rq->rt.push_lock);
|
||||
return;
|
||||
}
|
||||
raw_spin_unlock(&rq->rt.push_lock);
|
||||
/*
|
||||
* When starting the IPI RT pushing, the rto_cpu is set to -1,
|
||||
* rto_next_cpu() will simply return the first CPU found in
|
||||
* the rto_mask.
|
||||
*
|
||||
* If rto_next_cpu() is called when rto_cpu is a valid CPU, it
|
||||
* will return the next CPU found in the rto_mask.
|
||||
*
|
||||
* If there are no more CPUs left in the rto_mask, then a check is made
|
||||
* against rto_loop and rto_loop_next. rto_loop is only updated with
|
||||
* the rto_lock held, but any CPU may increment the rto_loop_next
|
||||
* without any locking.
|
||||
*/
|
||||
for (;;) {
|
||||
|
||||
/* When rto_cpu is -1 this acts like cpumask_first() */
|
||||
cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
|
||||
|
||||
rd->rto_cpu = cpu;
|
||||
|
||||
if (cpu < nr_cpu_ids)
|
||||
return cpu;
|
||||
|
||||
rd->rto_cpu = -1;
|
||||
|
||||
/*
|
||||
* ACQUIRE ensures we see the @rto_mask changes
|
||||
* made prior to the @next value observed.
|
||||
*
|
||||
* Matches WMB in rt_set_overload().
|
||||
*/
|
||||
next = atomic_read_acquire(&rd->rto_loop_next);
|
||||
|
||||
if (rd->rto_loop == next)
|
||||
break;
|
||||
|
||||
rd->rto_loop = next;
|
||||
}
|
||||
|
||||
/* When here, there's no IPI going around */
|
||||
return -1;
|
||||
}
|
||||
|
||||
rq->rt.push_cpu = rq->cpu;
|
||||
cpu = find_next_push_cpu(rq);
|
||||
if (cpu >= nr_cpu_ids)
|
||||
/*
 * Try to claim the start flag @v: succeeds (returns true) only for the
 * caller whose acquire-ordered cmpxchg moved *v from 0 to 1.
 */
static inline bool rto_start_trylock(atomic_t *v)
{
	return atomic_cmpxchg_acquire(v, 0, 1) == 0;
}
|
||||
|
||||
/*
 * Drop the start flag @v by storing 0 with release ordering, the
 * counterpart of rto_start_trylock().
 */
static inline void rto_start_unlock(atomic_t *v)
{
	atomic_set_release(v, 0);
}
|
||||
|
||||
static void tell_cpu_to_push(struct rq *rq)
|
||||
{
|
||||
int cpu = -1;
|
||||
|
||||
/* Keep the loop going if the IPI is currently active */
|
||||
atomic_inc(&rq->rd->rto_loop_next);
|
||||
|
||||
/* Only one CPU can initiate a loop at a time */
|
||||
if (!rto_start_trylock(&rq->rd->rto_loop_start))
|
||||
return;
|
||||
|
||||
rq->rt.push_flags = RT_PUSH_IPI_EXECUTING;
|
||||
raw_spin_lock(&rq->rd->rto_lock);
|
||||
|
||||
irq_work_queue_on(&rq->rt.push_work, cpu);
|
||||
/*
|
||||
* The rto_cpu is updated under the lock, if it has a valid cpu
|
||||
* then the IPI is still running and will continue due to the
|
||||
* update to loop_next, and nothing needs to be done here.
|
||||
* Otherwise it is finishing up and an ipi needs to be sent.
|
||||
*/
|
||||
if (rq->rd->rto_cpu < 0)
|
||||
cpu = rto_next_cpu(rq);
|
||||
|
||||
raw_spin_unlock(&rq->rd->rto_lock);
|
||||
|
||||
rto_start_unlock(&rq->rd->rto_loop_start);
|
||||
|
||||
if (cpu >= 0)
|
||||
irq_work_queue_on(&rq->rd->rto_push_work, cpu);
|
||||
}
|
||||
|
||||
/* Called from hardirq context */
|
||||
static void try_to_push_tasks(void *arg)
|
||||
void rto_push_irq_work_func(struct irq_work *work)
|
||||
{
|
||||
struct rt_rq *rt_rq = arg;
|
||||
struct rq *rq, *src_rq;
|
||||
int this_cpu;
|
||||
struct rq *rq;
|
||||
int cpu;
|
||||
|
||||
this_cpu = rt_rq->push_cpu;
|
||||
rq = this_rq();
|
||||
|
||||
/* Paranoid check */
|
||||
BUG_ON(this_cpu != smp_processor_id());
|
||||
|
||||
rq = cpu_rq(this_cpu);
|
||||
src_rq = rq_of_rt_rq(rt_rq);
|
||||
|
||||
again:
|
||||
/*
|
||||
* We do not need to grab the lock to check for has_pushable_tasks.
|
||||
* When it gets updated, a check is made if a push is possible.
|
||||
*/
|
||||
if (has_pushable_tasks(rq)) {
|
||||
raw_spin_lock(&rq->lock);
|
||||
push_rt_task(rq);
|
||||
push_rt_tasks(rq);
|
||||
raw_spin_unlock(&rq->lock);
|
||||
}
|
||||
|
||||
raw_spin_lock(&rq->rd->rto_lock);
|
||||
|
||||
/* Pass the IPI to the next rt overloaded queue */
|
||||
raw_spin_lock(&rt_rq->push_lock);
|
||||
/*
|
||||
* If the source queue changed since the IPI went out,
|
||||
* we need to restart the search from that CPU again.
|
||||
*/
|
||||
if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) {
|
||||
rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART;
|
||||
rt_rq->push_cpu = src_rq->cpu;
|
||||
}
|
||||
cpu = rto_next_cpu(rq);
|
||||
|
||||
cpu = find_next_push_cpu(src_rq);
|
||||
raw_spin_unlock(&rq->rd->rto_lock);
|
||||
|
||||
if (cpu >= nr_cpu_ids)
|
||||
rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING;
|
||||
raw_spin_unlock(&rt_rq->push_lock);
|
||||
|
||||
if (cpu >= nr_cpu_ids)
|
||||
if (cpu < 0)
|
||||
return;
|
||||
|
||||
/*
|
||||
* It is possible that a restart caused this CPU to be
|
||||
* chosen again. Don't bother with an IPI, just see if we
|
||||
* have more to push.
|
||||
*/
|
||||
if (unlikely(cpu == rq->cpu))
|
||||
goto again;
|
||||
|
||||
/* Try the next RT overloaded CPU */
|
||||
irq_work_queue_on(&rt_rq->push_work, cpu);
|
||||
}
|
||||
|
||||
static void push_irq_work_func(struct irq_work *work)
|
||||
{
|
||||
struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work);
|
||||
|
||||
try_to_push_tasks(rt_rq);
|
||||
irq_work_queue_on(&rq->rd->rto_push_work, cpu);
|
||||
}
|
||||
#endif /* HAVE_RT_PUSH_IPI */
|
||||
|
||||
|
@ -227,7 +227,7 @@ struct dl_bw {
|
||||
static inline void __dl_update(struct dl_bw *dl_b, s64 bw);
|
||||
|
||||
static inline
|
||||
void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
|
||||
void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
|
||||
{
|
||||
dl_b->total_bw -= tsk_bw;
|
||||
__dl_update(dl_b, (s32)tsk_bw / cpus);
|
||||
@ -256,7 +256,6 @@ extern int sched_dl_overflow(struct task_struct *p, int policy,
|
||||
extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr);
|
||||
extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
|
||||
extern bool __checkparam_dl(const struct sched_attr *attr);
|
||||
extern void __dl_clear_params(struct task_struct *p);
|
||||
extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
|
||||
extern int dl_task_can_attach(struct task_struct *p,
|
||||
const struct cpumask *cs_cpus_allowed);
|
||||
@ -419,6 +418,7 @@ struct cfs_bandwidth { };
|
||||
/* CFS-related fields in a runqueue */
|
||||
struct cfs_rq {
|
||||
struct load_weight load;
|
||||
unsigned long runnable_weight;
|
||||
unsigned int nr_running, h_nr_running;
|
||||
|
||||
u64 exec_clock;
|
||||
@ -444,18 +444,22 @@ struct cfs_rq {
|
||||
* CFS load tracking
|
||||
*/
|
||||
struct sched_avg avg;
|
||||
u64 runnable_load_sum;
|
||||
unsigned long runnable_load_avg;
|
||||
#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
unsigned long tg_load_avg_contrib;
|
||||
unsigned long propagate_avg;
|
||||
#endif
|
||||
atomic_long_t removed_load_avg, removed_util_avg;
|
||||
#ifndef CONFIG_64BIT
|
||||
u64 load_last_update_time_copy;
|
||||
#endif
|
||||
struct {
|
||||
raw_spinlock_t lock ____cacheline_aligned;
|
||||
int nr;
|
||||
unsigned long load_avg;
|
||||
unsigned long util_avg;
|
||||
unsigned long runnable_sum;
|
||||
} removed;
|
||||
|
||||
#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
unsigned long tg_load_avg_contrib;
|
||||
long propagate;
|
||||
long prop_runnable_sum;
|
||||
|
||||
/*
|
||||
* h_load = weight * f(tg)
|
||||
*
|
||||
@ -502,7 +506,7 @@ static inline int rt_bandwidth_enabled(void)
|
||||
}
|
||||
|
||||
/* RT IPI pull logic requires IRQ_WORK */
|
||||
#ifdef CONFIG_IRQ_WORK
|
||||
#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_SMP)
|
||||
# define HAVE_RT_PUSH_IPI
|
||||
#endif
|
||||
|
||||
@ -524,12 +528,6 @@ struct rt_rq {
|
||||
unsigned long rt_nr_total;
|
||||
int overloaded;
|
||||
struct plist_head pushable_tasks;
|
||||
#ifdef HAVE_RT_PUSH_IPI
|
||||
int push_flags;
|
||||
int push_cpu;
|
||||
struct irq_work push_work;
|
||||
raw_spinlock_t push_lock;
|
||||
#endif
|
||||
#endif /* CONFIG_SMP */
|
||||
int rt_queued;
|
||||
|
||||
@ -638,6 +636,19 @@ struct root_domain {
|
||||
struct dl_bw dl_bw;
|
||||
struct cpudl cpudl;
|
||||
|
||||
#ifdef HAVE_RT_PUSH_IPI
|
||||
/*
|
||||
* For IPI pull requests, loop across the rto_mask.
|
||||
*/
|
||||
struct irq_work rto_push_work;
|
||||
raw_spinlock_t rto_lock;
|
||||
/* These are only updated and read within rto_lock */
|
||||
int rto_loop;
|
||||
int rto_cpu;
|
||||
/* These atomics are updated outside of a lock */
|
||||
atomic_t rto_loop_next;
|
||||
atomic_t rto_loop_start;
|
||||
#endif
|
||||
/*
|
||||
* The "RT overload" flag: it gets set if a CPU has more than
|
||||
* one runnable RT task.
|
||||
@ -655,6 +666,9 @@ extern void init_defrootdomain(void);
|
||||
extern int sched_init_domains(const struct cpumask *cpu_map);
|
||||
extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
|
||||
|
||||
#ifdef HAVE_RT_PUSH_IPI
|
||||
extern void rto_push_irq_work_func(struct irq_work *work);
|
||||
#endif
|
||||
#endif /* CONFIG_SMP */
|
||||
|
||||
/*
|
||||
@ -1219,8 +1233,6 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
|
||||
# define const_debug const
|
||||
#endif
|
||||
|
||||
extern const_debug unsigned int sysctl_sched_features;
|
||||
|
||||
#define SCHED_FEAT(name, enabled) \
|
||||
__SCHED_FEAT_##name ,
|
||||
|
||||
@ -1232,6 +1244,13 @@ enum {
|
||||
#undef SCHED_FEAT
|
||||
|
||||
#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
|
||||
|
||||
/*
|
||||
* To support run-time toggling of sched features, all the translation units
|
||||
* (but core.c) reference the sysctl_sched_features defined in core.c.
|
||||
*/
|
||||
extern const_debug unsigned int sysctl_sched_features;
|
||||
|
||||
#define SCHED_FEAT(name, enabled) \
|
||||
static __always_inline bool static_branch_##name(struct static_key *key) \
|
||||
{ \
|
||||
@ -1239,13 +1258,27 @@ static __always_inline bool static_branch_##name(struct static_key *key) \
|
||||
}
|
||||
|
||||
#include "features.h"
|
||||
|
||||
#undef SCHED_FEAT
|
||||
|
||||
extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
|
||||
#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
|
||||
|
||||
#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */
|
||||
|
||||
/*
|
||||
* Each translation unit has its own copy of sysctl_sched_features to allow
|
||||
* constants propagation at compile time and compiler optimization based on
|
||||
* features default.
|
||||
*/
|
||||
#define SCHED_FEAT(name, enabled) \
|
||||
(1UL << __SCHED_FEAT_##name) * enabled |
|
||||
static const_debug __maybe_unused unsigned int sysctl_sched_features =
|
||||
#include "features.h"
|
||||
0;
|
||||
#undef SCHED_FEAT
|
||||
|
||||
#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
|
||||
|
||||
#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
|
||||
|
||||
extern struct static_key_false sched_numa_balancing;
|
||||
@ -1530,6 +1563,8 @@ extern void init_sched_dl_class(void);
|
||||
extern void init_sched_rt_class(void);
|
||||
extern void init_sched_fair_class(void);
|
||||
|
||||
extern void reweight_task(struct task_struct *p, int prio);
|
||||
|
||||
extern void resched_curr(struct rq *rq);
|
||||
extern void resched_cpu(int cpu);
|
||||
|
||||
|
@ -4,6 +4,7 @@
|
||||
*/
|
||||
#include <linux/sched.h>
|
||||
#include <linux/mutex.h>
|
||||
#include <linux/sched/isolation.h>
|
||||
|
||||
#include "sched.h"
|
||||
|
||||
@ -269,6 +270,12 @@ static int init_rootdomain(struct root_domain *rd)
|
||||
if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
|
||||
goto free_dlo_mask;
|
||||
|
||||
#ifdef HAVE_RT_PUSH_IPI
|
||||
rd->rto_cpu = -1;
|
||||
raw_spin_lock_init(&rd->rto_lock);
|
||||
init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
|
||||
#endif
|
||||
|
||||
init_dl_bw(&rd->dl_bw);
|
||||
if (cpudl_init(&rd->cpudl) != 0)
|
||||
goto free_rto_mask;
|
||||
@ -464,21 +471,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
|
||||
update_top_cache_domain(cpu);
|
||||
}
|
||||
|
||||
/* Setup the mask of CPUs configured for isolated domains */
|
||||
static int __init isolated_cpu_setup(char *str)
|
||||
{
|
||||
int ret;
|
||||
|
||||
alloc_bootmem_cpumask_var(&cpu_isolated_map);
|
||||
ret = cpulist_parse(str, cpu_isolated_map);
|
||||
if (ret) {
|
||||
pr_err("sched: Error, all isolcpus= values must be between 0 and %u\n", nr_cpu_ids);
|
||||
return 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
__setup("isolcpus=", isolated_cpu_setup);
|
||||
|
||||
struct s_data {
|
||||
struct sched_domain ** __percpu sd;
|
||||
struct root_domain *rd;
|
||||
@ -1158,6 +1150,7 @@ sd_init(struct sched_domain_topology_level *tl,
|
||||
sd->smt_gain = 1178; /* ~15% */
|
||||
|
||||
} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
|
||||
sd->flags |= SD_PREFER_SIBLING;
|
||||
sd->imbalance_pct = 117;
|
||||
sd->cache_nice_tries = 1;
|
||||
sd->busy_idx = 2;
|
||||
@ -1332,6 +1325,10 @@ void sched_init_numa(void)
|
||||
if (!sched_domains_numa_distance)
|
||||
return;
|
||||
|
||||
/* Includes NUMA identity node at level 0. */
|
||||
sched_domains_numa_distance[level++] = curr_distance;
|
||||
sched_domains_numa_levels = level;
|
||||
|
||||
/*
|
||||
* O(nr_nodes^2) deduplicating selection sort -- in order to find the
|
||||
* unique distances in the node_distance() table.
|
||||
@ -1379,8 +1376,7 @@ void sched_init_numa(void)
|
||||
return;
|
||||
|
||||
/*
|
||||
* 'level' contains the number of unique distances, excluding the
|
||||
* identity distance node_distance(i,i).
|
||||
* 'level' contains the number of unique distances
|
||||
*
|
||||
* The sched_domains_numa_distance[] array includes the actual distance
|
||||
* numbers.
|
||||
@ -1441,10 +1437,19 @@ void sched_init_numa(void)
|
||||
for (i = 0; sched_domain_topology[i].mask; i++)
|
||||
tl[i] = sched_domain_topology[i];
|
||||
|
||||
/*
|
||||
* Add the NUMA identity distance, aka single NODE.
|
||||
*/
|
||||
tl[i++] = (struct sched_domain_topology_level){
|
||||
.mask = sd_numa_mask,
|
||||
.numa_level = 0,
|
||||
SD_INIT_NAME(NODE)
|
||||
};
|
||||
|
||||
/*
|
||||
* .. and append 'j' levels of NUMA goodness.
|
||||
*/
|
||||
for (j = 0; j < level; i++, j++) {
|
||||
for (j = 1; j < level; i++, j++) {
|
||||
tl[i] = (struct sched_domain_topology_level){
|
||||
.mask = sd_numa_mask,
|
||||
.sd_flags = cpu_numa_flags,
|
||||
@ -1774,7 +1779,7 @@ int sched_init_domains(const struct cpumask *cpu_map)
|
||||
doms_cur = alloc_sched_domains(ndoms_cur);
|
||||
if (!doms_cur)
|
||||
doms_cur = &fallback_doms;
|
||||
cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
|
||||
cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_FLAG_DOMAIN));
|
||||
err = build_sched_domains(doms_cur[0], NULL);
|
||||
register_sched_domain_sysctl();
|
||||
|
||||
@ -1857,7 +1862,8 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
|
||||
doms_new = alloc_sched_domains(1);
|
||||
if (doms_new) {
|
||||
n = 1;
|
||||
cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
|
||||
cpumask_and(doms_new[0], cpu_active_mask,
|
||||
housekeeping_cpumask(HK_FLAG_DOMAIN));
|
||||
}
|
||||
} else {
|
||||
n = ndoms_new;
|
||||
@ -1880,7 +1886,8 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
|
||||
if (!doms_new) {
|
||||
n = 0;
|
||||
doms_new = &fallback_doms;
|
||||
cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
|
||||
cpumask_and(doms_new[0], cpu_active_mask,
|
||||
housekeeping_cpumask(HK_FLAG_DOMAIN));
|
||||
}
|
||||
|
||||
/* Build new domains: */
|
||||
|
@ -27,6 +27,7 @@
|
||||
#include <linux/irq_work.h>
|
||||
#include <linux/posix-timers.h>
|
||||
#include <linux/context_tracking.h>
|
||||
#include <linux/mm.h>
|
||||
|
||||
#include <asm/irq_regs.h>
|
||||
|
||||
@ -165,7 +166,6 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
|
||||
|
||||
#ifdef CONFIG_NO_HZ_FULL
|
||||
cpumask_var_t tick_nohz_full_mask;
|
||||
cpumask_var_t housekeeping_mask;
|
||||
bool tick_nohz_full_running;
|
||||
static atomic_t tick_dep_mask;
|
||||
|
||||
@ -385,20 +385,13 @@ void __tick_nohz_task_switch(void)
|
||||
local_irq_restore(flags);
|
||||
}
|
||||
|
||||
/* Parse the boot-time nohz CPU list from the kernel parameters. */
|
||||
static int __init tick_nohz_full_setup(char *str)
|
||||
/* Get the boot-time nohz CPU list from the kernel parameters. */
|
||||
void __init tick_nohz_full_setup(cpumask_var_t cpumask)
|
||||
{
|
||||
alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
|
||||
if (cpulist_parse(str, tick_nohz_full_mask) < 0) {
|
||||
pr_warn("NO_HZ: Incorrect nohz_full cpumask\n");
|
||||
free_bootmem_cpumask_var(tick_nohz_full_mask);
|
||||
return 1;
|
||||
}
|
||||
cpumask_copy(tick_nohz_full_mask, cpumask);
|
||||
tick_nohz_full_running = true;
|
||||
|
||||
return 1;
|
||||
}
|
||||
__setup("nohz_full=", tick_nohz_full_setup);
|
||||
|
||||
static int tick_nohz_cpu_down(unsigned int cpu)
|
||||
{
|
||||
@ -437,13 +430,6 @@ void __init tick_nohz_init(void)
|
||||
return;
|
||||
}
|
||||
|
||||
if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) {
|
||||
WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n");
|
||||
cpumask_clear(tick_nohz_full_mask);
|
||||
tick_nohz_full_running = false;
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* Full dynticks uses irq work to drive the tick rescheduling on safe
|
||||
* locking contexts. But then we need irq work to raise its own
|
||||
@ -452,7 +438,6 @@ void __init tick_nohz_init(void)
|
||||
if (!arch_irq_work_has_interrupt()) {
|
||||
pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support irq work self-IPIs\n");
|
||||
cpumask_clear(tick_nohz_full_mask);
|
||||
cpumask_copy(housekeeping_mask, cpu_possible_mask);
|
||||
tick_nohz_full_running = false;
|
||||
return;
|
||||
}
|
||||
@ -465,9 +450,6 @@ void __init tick_nohz_init(void)
|
||||
cpumask_clear_cpu(cpu, tick_nohz_full_mask);
|
||||
}
|
||||
|
||||
cpumask_andnot(housekeeping_mask,
|
||||
cpu_possible_mask, tick_nohz_full_mask);
|
||||
|
||||
for_each_cpu(cpu, tick_nohz_full_mask)
|
||||
context_tracking_cpu_set(cpu);
|
||||
|
||||
@ -477,12 +459,6 @@ void __init tick_nohz_init(void)
|
||||
WARN_ON(ret < 0);
|
||||
pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n",
|
||||
cpumask_pr_args(tick_nohz_full_mask));
|
||||
|
||||
/*
|
||||
* We need at least one CPU to handle housekeeping work such
|
||||
* as timekeeping, unbound timers, workqueues, ...
|
||||
*/
|
||||
WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -787,6 +763,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
|
||||
if (!ts->tick_stopped) {
|
||||
calc_load_nohz_start();
|
||||
cpu_load_update_nohz_start();
|
||||
quiet_vmstat();
|
||||
|
||||
ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
|
||||
ts->tick_stopped = 1;
|
||||
|
@ -921,8 +921,8 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
|
||||
|
||||
trace_assign_type(field, iter->ent);
|
||||
|
||||
T = __task_state_to_char(field->next_state);
|
||||
S = __task_state_to_char(field->prev_state);
|
||||
T = task_index_to_char(field->next_state);
|
||||
S = task_index_to_char(field->prev_state);
|
||||
trace_find_cmdline(field->next_pid, comm);
|
||||
trace_seq_printf(&iter->seq,
|
||||
" %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
|
||||
@ -957,8 +957,8 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
|
||||
trace_assign_type(field, iter->ent);
|
||||
|
||||
if (!S)
|
||||
S = __task_state_to_char(field->prev_state);
|
||||
T = __task_state_to_char(field->next_state);
|
||||
S = task_index_to_char(field->prev_state);
|
||||
T = task_index_to_char(field->next_state);
|
||||
trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n",
|
||||
field->prev_pid,
|
||||
field->prev_prio,
|
||||
@ -993,8 +993,8 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
|
||||
trace_assign_type(field, iter->ent);
|
||||
|
||||
if (!S)
|
||||
S = __task_state_to_char(field->prev_state);
|
||||
T = __task_state_to_char(field->next_state);
|
||||
S = task_index_to_char(field->prev_state);
|
||||
T = task_index_to_char(field->next_state);
|
||||
|
||||
SEQ_PUT_HEX_FIELD(s, field->prev_pid);
|
||||
SEQ_PUT_HEX_FIELD(s, field->prev_prio);
|
||||
|
@ -398,10 +398,10 @@ tracing_sched_switch_trace(struct trace_array *tr,
|
||||
entry = ring_buffer_event_data(event);
|
||||
entry->prev_pid = prev->pid;
|
||||
entry->prev_prio = prev->prio;
|
||||
entry->prev_state = __get_task_state(prev);
|
||||
entry->prev_state = task_state_index(prev);
|
||||
entry->next_pid = next->pid;
|
||||
entry->next_prio = next->prio;
|
||||
entry->next_state = __get_task_state(next);
|
||||
entry->next_state = task_state_index(next);
|
||||
entry->next_cpu = task_cpu(next);
|
||||
|
||||
if (!call_filter_check_discard(call, entry, buffer, event))
|
||||
@ -426,10 +426,10 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
|
||||
entry = ring_buffer_event_data(event);
|
||||
entry->prev_pid = curr->pid;
|
||||
entry->prev_prio = curr->prio;
|
||||
entry->prev_state = __get_task_state(curr);
|
||||
entry->prev_state = task_state_index(curr);
|
||||
entry->next_pid = wakee->pid;
|
||||
entry->next_prio = wakee->prio;
|
||||
entry->next_state = __get_task_state(wakee);
|
||||
entry->next_state = task_state_index(wakee);
|
||||
entry->next_cpu = task_cpu(wakee);
|
||||
|
||||
if (!call_filter_check_discard(call, entry, buffer, event))
|
||||
|
@ -25,6 +25,7 @@
|
||||
#include <linux/workqueue.h>
|
||||
#include <linux/sched/clock.h>
|
||||
#include <linux/sched/debug.h>
|
||||
#include <linux/sched/isolation.h>
|
||||
|
||||
#include <asm/irq_regs.h>
|
||||
#include <linux/kvm_para.h>
|
||||
@ -774,15 +775,11 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
|
||||
|
||||
void __init lockup_detector_init(void)
|
||||
{
|
||||
#ifdef CONFIG_NO_HZ_FULL
|
||||
if (tick_nohz_full_enabled()) {
|
||||
if (tick_nohz_full_enabled())
|
||||
pr_info("Disabling watchdog on nohz_full cores by default\n");
|
||||
cpumask_copy(&watchdog_cpumask, housekeeping_mask);
|
||||
} else
|
||||
cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
|
||||
#else
|
||||
cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
|
||||
#endif
|
||||
|
||||
cpumask_copy(&watchdog_cpumask,
|
||||
housekeeping_cpumask(HK_FLAG_TIMER));
|
||||
|
||||
if (!watchdog_nmi_probe())
|
||||
nmi_watchdog_available = true;
|
||||
|
Loading…
Reference in New Issue
Block a user