Scheduler changes for v5.10:

- Reorganize & clean up the SD* flags definitions and add a bunch
    of sanity checks. These new checks caught quite a few bugs or at
    least inconsistencies, resulting in another set of patches.
 
  - Rseq updates, add MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ
 
  - Add a new tracepoint to improve CPU capacity tracking
 
  - Improve overloaded SMP system load-balancing behavior
 
  - Tweak SMT balancing
 
  - Energy-aware scheduling updates
 
  - NUMA balancing improvements
 
  - Deadline scheduler fixes and improvements
 
  - CPU isolation fixes
 
  - Misc cleanups, simplifications and smaller optimizations.
 
 Signed-off-by: Ingo Molnar <mingo@kernel.org>
 -----BEGIN PGP SIGNATURE-----
 
 iQJFBAABCgAvFiEEBpT5eoXrXCwVQwEKEnMQ0APhK1gFAl+EWRERHG1pbmdvQGtl
 cm5lbC5vcmcACgkQEnMQ0APhK1hV8A/7BB0nt/zYVZ8Z3Di8V0b9hMtr0d1xtRM5
 ZAvg4hcZl/fVgobFndxBw6KdlK8lSce9Mcq+bTTWeD46CS13cK5Vrpiaf7x7Q00P
 m8YHeYEH13ME0pbBrhDoRCR4XzfXukzjkUl7LiyrTekAvRUtFikJ/uKl8MeJtYGZ
 gANEkadqforxUW0v45iUEGepmCWAl8hSlSMb2mDKsVhw4DFMD+px0EBmmA0VDqjE
 e0rkh6dEoUVNqlic2KoaXULld1rLg1xiaOcLUbTAXnucfhmuv5p/H11AC4ABuf+s
 7d0zLrLEfZrcLJkthYxfMHs7DYMtARiQM9Db/a5hAq9Af4Z2bvvVAaHt3gCGvkV1
 llB6BB2yWCki9Qv7oiGOAhANnyJHG/cU4r6WwMuHdlYi4dFT/iN5qkOMUL1IrDgi
 a6ZzvECChXBeisQXHSlMd8Y5O+j0gRvDR7E18z2q0/PlmO8PGJq4w34mEWveWIg3
 LaVF16bmvaARuNFJTQH/zaHhjqVQANSMx5OIv9swp0OkwvQkw21ICYHG0YxfzWCr
 oa/FESEpOL9XdYp8UwMPI0bmVIsEfx79pmDMF3zInYTpJpwMUhV2yjHE8uYVMqEf
 7U8rZv7gdbZ2us38Gjf2l73hY+recp/GrgZKnk0R98OUeMk1l/iVP6dwco6ITUV5
 czGmKlIB1ec=
 =bXy6
 -----END PGP SIGNATURE-----

Merge tag 'sched-core-2020-10-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:

 - reorganize & clean up the SD* flags definitions and add a bunch of
   sanity checks. These new checks caught quite a few bugs or at least
   inconsistencies, resulting in another set of patches.

 - rseq updates, add MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ

 - add a new tracepoint to improve CPU capacity tracking

 - improve overloaded SMP system load-balancing behavior

 - tweak SMT balancing

 - energy-aware scheduling updates

 - NUMA balancing improvements

 - deadline scheduler fixes and improvements

 - CPU isolation fixes

 - misc cleanups, simplifications and smaller optimizations

* tag 'sched-core-2020-10-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (42 commits)
  sched/deadline: Unthrottle PI boosted threads while enqueuing
  sched/debug: Add new tracepoint to track cpu_capacity
  sched/fair: Tweak pick_next_entity()
  rseq/selftests: Test MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ
  rseq/selftests,x86_64: Add rseq_offset_deref_addv()
  rseq/membarrier: Add MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ
  sched/fair: Use dst group while checking imbalance for NUMA balancer
  sched/fair: Reduce busy load balance interval
  sched/fair: Minimize concurrent LBs between domain level
  sched/fair: Reduce minimal imbalance threshold
  sched/fair: Relax constraint on task's load during load balance
  sched/fair: Remove the force parameter of update_tg_load_avg()
  sched/fair: Fix wrong cpu selecting from isolated domain
  sched: Remove unused inline function uclamp_bucket_base_value()
  sched/rt: Disable RT_RUNTIME_SHARE by default
  sched/deadline: Fix stale throttling on de-/boosted tasks
  sched/numa: Use runnable_avg to classify node
  sched/topology: Move sd_flag_debug out of #ifdef CONFIG_SYSCTL
  MAINTAINERS: Add myself as SCHED_DEADLINE reviewer
  sched/topology: Move SD_DEGENERATE_GROUPS_MASK out of linux/sched/topology.h
  ...
This commit is contained in:
Linus Torvalds 2020-10-12 12:56:01 -07:00
commit edaa5ddf38
19 changed files with 803 additions and 152 deletions

View File

@ -15407,6 +15407,7 @@ R: Dietmar Eggemann <dietmar.eggemann@arm.com> (SCHED_NORMAL)
R: Steven Rostedt <rostedt@goodmis.org> (SCHED_FIFO/SCHED_RR)
R: Ben Segall <bsegall@google.com> (CONFIG_CFS_BANDWIDTH)
R: Mel Gorman <mgorman@suse.de> (CONFIG_NUMA_BALANCING)
R: Daniel Bristot de Oliveira <bristot@redhat.com> (SCHED_DEADLINE)
L: linux-kernel@vger.kernel.org
S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched/core

View File

@ -177,15 +177,6 @@ static inline void parse_dt_topology(void) {}
static inline void update_cpu_capacity(unsigned int cpuid) {}
#endif
/*
* The current assumption is that we can power gate each core independently.
* This will be superseded by DT binding once available.
*/
const struct cpumask *cpu_corepower_mask(int cpu)
{
return &cpu_topology[cpu].thread_sibling;
}
/*
* store_cpu_topology is called at boot when only one cpu is running
* and with the mutex cpu_hotplug.lock locked, when several cpus have booted,
@ -241,20 +232,6 @@ void store_cpu_topology(unsigned int cpuid)
update_siblings_masks(cpuid);
}
static inline int cpu_corepower_flags(void)
{
return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN;
}
static struct sched_domain_topology_level arm_topology[] = {
#ifdef CONFIG_SCHED_MC
{ cpu_corepower_mask, cpu_corepower_flags, SD_INIT_NAME(GMC) },
{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
{ NULL, },
};
/*
* init_cpu_topology is called at boot when only one cpu is running
* which prevent simultaneous write access to cpu_topology array
@ -265,7 +242,4 @@ void __init init_cpu_topology(void)
smp_wmb();
parse_dt_topology();
/* Set scheduler topology descriptor */
set_sched_topology(arm_topology);
}

View File

@ -1491,9 +1491,10 @@ extern struct pid *cad_pid;
/*
* Per process flags
*/
#define PF_VCPU 0x00000001 /* I'm a virtual CPU */
#define PF_IDLE 0x00000002 /* I am an IDLE thread */
#define PF_EXITING 0x00000004 /* Getting shut down */
#define PF_VCPU 0x00000010 /* I'm a virtual CPU */
#define PF_IO_WORKER 0x00000010 /* Task is an IO worker */
#define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */
#define PF_FORKNOEXEC 0x00000040 /* Forked but didn't exec */
#define PF_MCE_PROCESS 0x00000080 /* Process policy on mce errors */
@ -1517,7 +1518,6 @@ extern struct pid *cad_pid;
#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */
#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
#define PF_MEMALLOC_NOCMA 0x10000000 /* All allocation request will have _GFP_MOVABLE cleared */
#define PF_IO_WORKER 0x20000000 /* Task is an IO worker */
#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */
#define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */
@ -2046,6 +2046,7 @@ const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq);
const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq);
int sched_trace_rq_cpu(struct rq *rq);
int sched_trace_rq_cpu_capacity(struct rq *rq);
int sched_trace_rq_nr_running(struct rq *rq);
const struct cpumask *sched_trace_rd_span(struct root_domain *rd);

View File

@ -348,10 +348,13 @@ enum {
MEMBARRIER_STATE_GLOBAL_EXPEDITED = (1U << 3),
MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY = (1U << 4),
MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE = (1U << 5),
MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY = (1U << 6),
MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ = (1U << 7),
};
enum {
MEMBARRIER_FLAG_SYNC_CORE = (1U << 0),
MEMBARRIER_FLAG_RSEQ = (1U << 1),
};
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS

View File

@ -0,0 +1,156 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* sched-domains (multiprocessor balancing) flag declarations.
*/
#ifndef SD_FLAG
# error "Incorrect import of SD flags definitions"
#endif
/*
* Hierarchical metaflags
*
* SHARED_CHILD: These flags are meant to be set from the base domain upwards.
* If a domain has this flag set, all of its children should have it set. This
* is usually because the flag describes some shared resource (all CPUs in that
* domain share the same resource), or because they are tied to a scheduling
* behaviour that we want to disable at some point in the hierarchy for
* scalability reasons.
*
* In those cases it doesn't make sense to have the flag set for a domain but
* not have it in (some of) its children: sched domains ALWAYS span their child
* domains, so operations done with parent domains will cover CPUs in the lower
* child domains.
*
*
* SHARED_PARENT: These flags are meant to be set from the highest domain
* downwards. If a domain has this flag set, all of its parents should have it
* set. This is usually for topology properties that start to appear above a
* certain level (e.g. domain starts spanning CPUs outside of the base CPU's
* socket).
*/
#define SDF_SHARED_CHILD 0x1
#define SDF_SHARED_PARENT 0x2
/*
* Behavioural metaflags
*
* NEEDS_GROUPS: These flags are only relevant if the domain they are set on has
* more than one group. This is usually for balancing flags (load balancing
* involves equalizing a metric between groups), or for flags describing some
* shared resource (which would be shared between groups).
*/
#define SDF_NEEDS_GROUPS 0x4
/*
* Balance when about to become idle
*
* SHARED_CHILD: Set from the base domain up to cpuset.sched_relax_domain_level.
* NEEDS_GROUPS: Load balancing flag.
*/
SD_FLAG(SD_BALANCE_NEWIDLE, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
/*
* Balance on exec
*
* SHARED_CHILD: Set from the base domain up to the NUMA reclaim level.
* NEEDS_GROUPS: Load balancing flag.
*/
SD_FLAG(SD_BALANCE_EXEC, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
/*
* Balance on fork, clone
*
* SHARED_CHILD: Set from the base domain up to the NUMA reclaim level.
* NEEDS_GROUPS: Load balancing flag.
*/
SD_FLAG(SD_BALANCE_FORK, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
/*
* Balance on wakeup
*
* SHARED_CHILD: Set from the base domain up to cpuset.sched_relax_domain_level.
* NEEDS_GROUPS: Load balancing flag.
*/
SD_FLAG(SD_BALANCE_WAKE, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
/*
* Consider waking task on waking CPU.
*
* SHARED_CHILD: Set from the base domain up to the NUMA reclaim level.
*/
SD_FLAG(SD_WAKE_AFFINE, SDF_SHARED_CHILD)
/*
* Domain members have different CPU capacities
*
* SHARED_PARENT: Set from the topmost domain down to the first domain where
* asymmetry is detected.
* NEEDS_GROUPS: Per-CPU capacity is asymmetric between groups.
*/
SD_FLAG(SD_ASYM_CPUCAPACITY, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
/*
* Domain members share CPU capacity (i.e. SMT)
*
* SHARED_CHILD: Set from the base domain up until spanned CPUs no longer share
* CPU capacity.
* NEEDS_GROUPS: Capacity is shared between groups.
*/
SD_FLAG(SD_SHARE_CPUCAPACITY, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
/*
* Domain members share CPU package resources (i.e. caches)
*
* SHARED_CHILD: Set from the base domain up until spanned CPUs no longer share
* the same cache(s).
* NEEDS_GROUPS: Caches are shared between groups.
*/
SD_FLAG(SD_SHARE_PKG_RESOURCES, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
/*
* Only a single load balancing instance
*
* SHARED_PARENT: Set for all NUMA levels above NODE. Could be set from a
* different level upwards, but it doesn't change that if a
* domain has this flag set, then all of its parents need to have
* it too (otherwise the serialization doesn't make sense).
* NEEDS_GROUPS: No point in preserving domain if it has a single group.
*/
SD_FLAG(SD_SERIALIZE, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
/*
* Place busy tasks earlier in the domain
*
* SHARED_CHILD: Usually set on the SMT level. Technically could be set further
* up, but currently assumed to be set from the base domain
* upwards (see update_top_cache_domain()).
* NEEDS_GROUPS: Load balancing flag.
*/
SD_FLAG(SD_ASYM_PACKING, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
/*
* Prefer to place tasks in a sibling domain
*
* Set up until domains start spanning NUMA nodes. Close to being a SHARED_CHILD
* flag, but cleared below domains with SD_ASYM_CPUCAPACITY.
*
* NEEDS_GROUPS: Load balancing flag.
*/
SD_FLAG(SD_PREFER_SIBLING, SDF_NEEDS_GROUPS)
/*
* sched_groups of this level overlap
*
* SHARED_PARENT: Set for all NUMA levels above NODE.
* NEEDS_GROUPS: Overlaps can only exist with more than one group.
*/
SD_FLAG(SD_OVERLAP, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
/*
* Cross-node balancing
*
* SHARED_PARENT: Set for all NUMA levels above NODE.
* NEEDS_GROUPS: No point in preserving domain if it has a single group.
*/
SD_FLAG(SD_NUMA, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)

View File

@ -11,20 +11,29 @@
*/
#ifdef CONFIG_SMP
#define SD_BALANCE_NEWIDLE 0x0001 /* Balance when about to become idle */
#define SD_BALANCE_EXEC 0x0002 /* Balance on exec */
#define SD_BALANCE_FORK 0x0004 /* Balance on fork, clone */
#define SD_BALANCE_WAKE 0x0008 /* Balance on wakeup */
#define SD_WAKE_AFFINE 0x0010 /* Wake task to waking CPU */
#define SD_ASYM_CPUCAPACITY 0x0020 /* Domain members have different CPU capacities */
#define SD_SHARE_CPUCAPACITY 0x0040 /* Domain members share CPU capacity */
#define SD_SHARE_POWERDOMAIN 0x0080 /* Domain members share power domain */
#define SD_SHARE_PKG_RESOURCES 0x0100 /* Domain members share CPU pkg resources */
#define SD_SERIALIZE 0x0200 /* Only a single load balancing instance */
#define SD_ASYM_PACKING 0x0400 /* Place busy groups earlier in the domain */
#define SD_PREFER_SIBLING 0x0800 /* Prefer to place tasks in a sibling domain */
#define SD_OVERLAP 0x1000 /* sched_domains of this level overlap */
#define SD_NUMA 0x2000 /* cross-node balancing */
/* Generate SD flag indexes */
#define SD_FLAG(name, mflags) __##name,
enum {
#include <linux/sched/sd_flags.h>
__SD_FLAG_CNT,
};
#undef SD_FLAG
/* Generate SD flag bits */
#define SD_FLAG(name, mflags) name = 1 << __##name,
enum {
#include <linux/sched/sd_flags.h>
};
#undef SD_FLAG
#ifdef CONFIG_SCHED_DEBUG
struct sd_flag_debug {
unsigned int meta_flags;
char *name;
};
extern const struct sd_flag_debug sd_flag_debug[];
#endif
#ifdef CONFIG_SCHED_SMT
static inline int cpu_smt_flags(void)

View File

@ -974,7 +974,7 @@ asmlinkage long sys_execveat(int dfd, const char __user *filename,
const char __user *const __user *argv,
const char __user *const __user *envp, int flags);
asmlinkage long sys_userfaultfd(int flags);
asmlinkage long sys_membarrier(int cmd, int flags);
asmlinkage long sys_membarrier(int cmd, unsigned int flags, int cpu_id);
asmlinkage long sys_mlock2(unsigned long start, size_t len, int flags);
asmlinkage long sys_copy_file_range(int fd_in, loff_t __user *off_in,
int fd_out, loff_t __user *off_out,

View File

@ -630,6 +630,10 @@ DECLARE_TRACE(pelt_se_tp,
TP_PROTO(struct sched_entity *se),
TP_ARGS(se));
DECLARE_TRACE(sched_cpu_capacity_tp,
TP_PROTO(struct rq *rq),
TP_ARGS(rq));
DECLARE_TRACE(sched_overutilized_tp,
TP_PROTO(struct root_domain *rd, bool overutilized),
TP_ARGS(rd, overutilized));

View File

@ -114,6 +114,26 @@
* If this command is not implemented by an
* architecture, -EINVAL is returned.
* Returns 0 on success.
* @MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
* Ensure the caller thread, upon return from
* system call, that all its running thread
* siblings have any currently running rseq
* critical sections restarted if @flags
* parameter is 0; if @flags parameter is
* MEMBARRIER_CMD_FLAG_CPU,
* then this operation is performed only
* on CPU indicated by @cpu_id. If this command is
* not implemented by an architecture, -EINVAL
* is returned. A process needs to register its
* intent to use the private expedited rseq
* command prior to using it, otherwise
* this command returns -EPERM.
* @MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ:
* Register the process intent to use
* MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ.
* If this command is not implemented by an
* architecture, -EINVAL is returned.
* Returns 0 on success.
* @MEMBARRIER_CMD_SHARED:
* Alias to MEMBARRIER_CMD_GLOBAL. Provided for
* header backward compatibility.
@ -131,9 +151,15 @@ enum membarrier_cmd {
MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED = (1 << 4),
MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE = (1 << 5),
MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE = (1 << 6),
MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ = (1 << 7),
MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ = (1 << 8),
/* Alias for header backward compatibility. */
MEMBARRIER_CMD_SHARED = MEMBARRIER_CMD_GLOBAL,
};
enum membarrier_cmd_flag {
MEMBARRIER_CMD_FLAG_CPU = (1 << 0),
};
#endif /* _UAPI_LINUX_MEMBARRIER_H */

View File

@ -36,6 +36,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
@ -940,11 +941,6 @@ static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
return clamp_value / UCLAMP_BUCKET_DELTA;
}
static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)
{
return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
}
static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
{
if (clamp_id == UCLAMP_MIN)
@ -4551,9 +4547,12 @@ void __noreturn do_task_dead(void)
static inline void sched_submit_work(struct task_struct *tsk)
{
unsigned int task_flags;
if (!tsk->state)
return;
task_flags = tsk->flags;
/*
* If a worker went to sleep, notify and ask workqueue whether
* it wants to wake up a task to maintain concurrency.
@ -4562,9 +4561,9 @@ static inline void sched_submit_work(struct task_struct *tsk)
* in the possible wakeup of a kworker and because wq_worker_sleeping()
* requires it.
*/
if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
preempt_disable();
if (tsk->flags & PF_WQ_WORKER)
if (task_flags & PF_WQ_WORKER)
wq_worker_sleeping(tsk);
else
io_wq_worker_sleeping(tsk);

View File

@ -1525,14 +1525,38 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
*/
if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) {
pi_se = &pi_task->dl;
/*
* Because of delays in the detection of the overrun of a
* thread's runtime, it might be the case that a thread
* goes to sleep in a rt mutex with negative runtime. As
* a consequence, the thread will be throttled.
*
* While waiting for the mutex, this thread can also be
* boosted via PI, resulting in a thread that is throttled
* and boosted at the same time.
*
* In this case, the boost overrides the throttle.
*/
if (p->dl.dl_throttled) {
/*
* The replenish timer needs to be canceled. No
* problem if it fires concurrently: boosted threads
* are ignored in dl_task_timer().
*/
hrtimer_try_to_cancel(&p->dl.dl_timer);
p->dl.dl_throttled = 0;
}
} else if (!dl_prio(p->normal_prio)) {
/*
* Special case in which we have a !SCHED_DEADLINE task
* that is going to be deboosted, but exceeds its
* runtime while doing so. No point in replenishing
* it, as it's going to return back to its original
* scheduling class after this.
* Special case in which we have a !SCHED_DEADLINE task that is going
* to be deboosted, but exceeds its runtime while doing so. No point in
* replenishing it, as it's going to return back to its original
* scheduling class after this. If it has been throttled, we need to
* clear the flag, otherwise the task may wake up as throttled after
* being boosted again with no means to replenish the runtime and clear
* the throttle.
*/
p->dl.dl_throttled = 0;
BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH);
return;
}

View File

@ -245,6 +245,60 @@ set_table_entry(struct ctl_table *entry,
entry->proc_handler = proc_handler;
}
static int sd_ctl_doflags(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
unsigned long flags = *(unsigned long *)table->data;
size_t data_size = 0;
size_t len = 0;
char *tmp;
int idx;
if (write)
return 0;
for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
char *name = sd_flag_debug[idx].name;
/* Name plus whitespace */
data_size += strlen(name) + 1;
}
if (*ppos > data_size) {
*lenp = 0;
return 0;
}
tmp = kcalloc(data_size + 1, sizeof(*tmp), GFP_KERNEL);
if (!tmp)
return -ENOMEM;
for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
char *name = sd_flag_debug[idx].name;
len += snprintf(tmp + len, strlen(name) + 2, "%s ", name);
}
tmp += *ppos;
len -= *ppos;
if (len > *lenp)
len = *lenp;
if (len)
memcpy(buffer, tmp, len);
if (len < *lenp) {
((char *)buffer)[len] = '\n';
len++;
}
*lenp = len;
*ppos += len;
kfree(tmp);
return 0;
}
static struct ctl_table *
sd_alloc_ctl_domain_table(struct sched_domain *sd)
{
@ -258,7 +312,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax);
set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax);
set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax);
set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0444, proc_dointvec_minmax);
set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0444, sd_ctl_doflags);
set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax);
set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring);
/* &table[8] is terminator */

View File

@ -831,7 +831,7 @@ void init_entity_runnable_average(struct sched_entity *se)
void post_init_entity_util_avg(struct task_struct *p)
{
}
static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
static void update_tg_load_avg(struct cfs_rq *cfs_rq)
{
}
#endif /* CONFIG_SMP */
@ -1504,6 +1504,7 @@ enum numa_type {
/* Cached statistics for all CPUs within a node */
struct numa_stats {
unsigned long load;
unsigned long runnable;
unsigned long util;
/* Total compute capacity of CPUs on a node */
unsigned long compute_capacity;
@ -1547,19 +1548,22 @@ struct task_numa_env {
};
static unsigned long cpu_load(struct rq *rq);
static unsigned long cpu_runnable(struct rq *rq);
static unsigned long cpu_util(int cpu);
static inline long adjust_numa_imbalance(int imbalance, int src_nr_running);
static inline long adjust_numa_imbalance(int imbalance, int nr_running);
static inline enum
numa_type numa_classify(unsigned int imbalance_pct,
struct numa_stats *ns)
{
if ((ns->nr_running > ns->weight) &&
((ns->compute_capacity * 100) < (ns->util * imbalance_pct)))
(((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) ||
((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100))))
return node_overloaded;
if ((ns->nr_running < ns->weight) ||
((ns->compute_capacity * 100) > (ns->util * imbalance_pct)))
(((ns->compute_capacity * 100) > (ns->util * imbalance_pct)) &&
((ns->compute_capacity * imbalance_pct) > (ns->runnable * 100))))
return node_has_spare;
return node_fully_busy;
@ -1610,6 +1614,7 @@ static void update_numa_stats(struct task_numa_env *env,
struct rq *rq = cpu_rq(cpu);
ns->load += cpu_load(rq);
ns->runnable += cpu_runnable(rq);
ns->util += cpu_util(cpu);
ns->nr_running += rq->cfs.h_nr_running;
ns->compute_capacity += capacity_of(cpu);
@ -1925,7 +1930,7 @@ static void task_numa_find_cpu(struct task_numa_env *env,
src_running = env->src_stats.nr_running - 1;
dst_running = env->dst_stats.nr_running + 1;
imbalance = max(0, dst_running - src_running);
imbalance = adjust_numa_imbalance(imbalance, src_running);
imbalance = adjust_numa_imbalance(imbalance, dst_running);
/* Use idle CPU if there is no imbalance */
if (!imbalance) {
@ -3084,7 +3089,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
/* commit outstanding execution time */
if (cfs_rq->curr == se)
update_curr(cfs_rq);
account_entity_dequeue(cfs_rq, se);
update_load_sub(&cfs_rq->load, se->load.weight);
}
dequeue_load_avg(cfs_rq, se);
@ -3100,7 +3105,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
enqueue_load_avg(cfs_rq, se);
if (se->on_rq)
account_entity_enqueue(cfs_rq, se);
update_load_add(&cfs_rq->load, se->load.weight);
}
@ -3288,7 +3293,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
/**
* update_tg_load_avg - update the tg's load avg
* @cfs_rq: the cfs_rq whose avg changed
* @force: update regardless of how small the difference
*
* This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
* However, because tg->load_avg is a global value there are performance
@ -3300,7 +3304,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
*
* Updating tg's load_avg is necessary before update_cfs_share().
*/
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
{
long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
@ -3310,7 +3314,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
if (cfs_rq->tg == &root_task_group)
return;
if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
atomic_long_add(delta, &cfs_rq->tg->load_avg);
cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
}
@ -3612,7 +3616,7 @@ static inline bool skip_blocked_update(struct sched_entity *se)
#else /* CONFIG_FAIR_GROUP_SCHED */
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}
static inline int propagate_entity_load_avg(struct sched_entity *se)
{
@ -3800,13 +3804,13 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
* IOW we're enqueueing a task on a new CPU.
*/
attach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq, 0);
update_tg_load_avg(cfs_rq);
} else if (decayed) {
cfs_rq_util_change(cfs_rq, 0);
if (flags & UPDATE_TG)
update_tg_load_avg(cfs_rq, 0);
update_tg_load_avg(cfs_rq);
}
}
@ -4461,17 +4465,17 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
se = second;
}
/*
* Prefer last buddy, try to return the CPU to a preempted task.
*/
if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
se = cfs_rq->last;
/*
* Someone really wants this to run. If it's not unfair, run it.
*/
if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
/*
* Someone really wants this to run. If it's not unfair, run it.
*/
se = cfs_rq->next;
} else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) {
/*
* Prefer last buddy, try to return the CPU to a preempted task.
*/
se = cfs_rq->last;
}
clear_buddies(cfs_rq, se);
@ -6075,7 +6079,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
/*
* Scan the local SMT mask for idle CPUs.
*/
static int select_idle_smt(struct task_struct *p, int target)
static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
{
int cpu;
@ -6083,7 +6087,8 @@ static int select_idle_smt(struct task_struct *p, int target)
return -1;
for_each_cpu(cpu, cpu_smt_mask(target)) {
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
!cpumask_test_cpu(cpu, sched_domain_span(sd)))
continue;
if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
return cpu;
@ -6099,7 +6104,7 @@ static inline int select_idle_core(struct task_struct *p, struct sched_domain *s
return -1;
}
static inline int select_idle_smt(struct task_struct *p, int target)
static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
{
return -1;
}
@ -6274,7 +6279,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
if ((unsigned)i < nr_cpumask_bits)
return i;
i = select_idle_smt(p, target);
i = select_idle_smt(p, sd, target);
if ((unsigned)i < nr_cpumask_bits)
return i;
@ -6594,7 +6599,8 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
util = cpu_util_next(cpu, p, cpu);
cpu_cap = capacity_of(cpu);
spare_cap = cpu_cap - util;
spare_cap = cpu_cap;
lsub_positive(&spare_cap, util);
/*
* Skip CPUs that cannot satisfy the capacity request.
@ -7402,6 +7408,10 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
if (unlikely(task_has_idle_policy(p)))
return 0;
/* SMT siblings share cache */
if (env->sd->flags & SD_SHARE_CPUCAPACITY)
return 0;
/*
* Buddy candidates are cache hot:
*/
@ -7669,8 +7679,8 @@ static int detach_tasks(struct lb_env *env)
* scheduler fails to find a good waiting task to
* migrate.
*/
if (load/2 > env->imbalance &&
env->sd->nr_balance_failed <= env->sd->cache_nice_tries)
if ((load >> env->sd->nr_balance_failed) > env->imbalance)
goto next;
env->imbalance -= load;
@ -7887,7 +7897,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done)
struct sched_entity *se;
if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
update_tg_load_avg(cfs_rq, 0);
update_tg_load_avg(cfs_rq);
if (cfs_rq == &rq->cfs)
decayed = true;
@ -8098,6 +8108,8 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
capacity = 1;
cpu_rq(cpu)->cpu_capacity = capacity;
trace_sched_cpu_capacity_tp(cpu_rq(cpu));
sdg->sgc->capacity = capacity;
sdg->sgc->min_capacity = capacity;
sdg->sgc->max_capacity = capacity;
@ -8957,7 +8969,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
}
}
static inline long adjust_numa_imbalance(int imbalance, int src_nr_running)
static inline long adjust_numa_imbalance(int imbalance, int nr_running)
{
unsigned int imbalance_min;
@ -8966,7 +8978,7 @@ static inline long adjust_numa_imbalance(int imbalance, int src_nr_running)
* tasks that remain local when the source domain is almost idle.
*/
imbalance_min = 2;
if (src_nr_running <= imbalance_min)
if (nr_running <= imbalance_min)
return 0;
return imbalance;
@ -9780,6 +9792,15 @@ get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
/* scale ms to jiffies */
interval = msecs_to_jiffies(interval);
/*
* Reduce likelihood of busy balancing at higher domains racing with
* balancing at lower domains by preventing their balancing periods
* from being multiples of each other.
*/
if (cpu_busy)
interval -= 1;
interval = clamp(interval, 1UL, max_load_balance_interval);
return interval;
@ -10786,7 +10807,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se)
/* Catch up with the cfs_rq and remove our load when we leave */
update_load_avg(cfs_rq, se, 0);
detach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq, false);
update_tg_load_avg(cfs_rq);
propagate_entity_cfs_rq(se);
}
@ -10805,7 +10826,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
/* Synchronize entity with its cfs_rq */
update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
attach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq, false);
update_tg_load_avg(cfs_rq);
propagate_entity_cfs_rq(se);
}
@ -11302,6 +11323,18 @@ int sched_trace_rq_cpu(struct rq *rq)
}
EXPORT_SYMBOL_GPL(sched_trace_rq_cpu);
int sched_trace_rq_cpu_capacity(struct rq *rq)
{
return rq ?
#ifdef CONFIG_SMP
rq->cpu_capacity
#else
SCHED_CAPACITY_SCALE
#endif
: -1;
}
EXPORT_SYMBOL_GPL(sched_trace_rq_cpu_capacity);
const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
{
#ifdef CONFIG_SMP

View File

@ -77,7 +77,7 @@ SCHED_FEAT(WARN_DOUBLE_CLOCK, false)
SCHED_FEAT(RT_PUSH_IPI, true)
#endif
SCHED_FEAT(RT_RUNTIME_SHARE, true)
SCHED_FEAT(RT_RUNTIME_SHARE, false)
SCHED_FEAT(LB_MIN, false)
SCHED_FEAT(ATTACH_AGE_LOAD, true)

View File

@ -18,6 +18,14 @@
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0
#endif
#ifdef CONFIG_RSEQ
#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK \
(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ \
| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ_BITMASK)
#else
#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK 0
#endif
#define MEMBARRIER_CMD_BITMASK \
(MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \
| MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \
@ -30,6 +38,11 @@ static void ipi_mb(void *info)
smp_mb(); /* IPIs should be serializing but paranoid. */
}
static void ipi_rseq(void *info)
{
rseq_preempt(current);
}
static void ipi_sync_rq_state(void *info)
{
struct mm_struct *mm = (struct mm_struct *) info;
@ -129,19 +142,27 @@ static int membarrier_global_expedited(void)
return 0;
}
static int membarrier_private_expedited(int flags)
static int membarrier_private_expedited(int flags, int cpu_id)
{
int cpu;
cpumask_var_t tmpmask;
struct mm_struct *mm = current->mm;
smp_call_func_t ipi_func = ipi_mb;
if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
return -EINVAL;
if (!(atomic_read(&mm->membarrier_state) &
MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
return -EPERM;
} else if (flags == MEMBARRIER_FLAG_RSEQ) {
if (!IS_ENABLED(CONFIG_RSEQ))
return -EINVAL;
if (!(atomic_read(&mm->membarrier_state) &
MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY))
return -EPERM;
ipi_func = ipi_rseq;
} else {
WARN_ON_ONCE(flags);
if (!(atomic_read(&mm->membarrier_state) &
MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
return -EPERM;
@ -156,35 +177,59 @@ static int membarrier_private_expedited(int flags)
*/
smp_mb(); /* system call entry is not a mb. */
if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
if (cpu_id < 0 && !zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
return -ENOMEM;
cpus_read_lock();
rcu_read_lock();
for_each_online_cpu(cpu) {
if (cpu_id >= 0) {
struct task_struct *p;
/*
* Skipping the current CPU is OK even through we can be
* migrated at any point. The current CPU, at the point
* where we read raw_smp_processor_id(), is ensured to
* be in program order with respect to the caller
* thread. Therefore, we can skip this CPU from the
* iteration.
*/
if (cpu == raw_smp_processor_id())
continue;
p = rcu_dereference(cpu_rq(cpu)->curr);
if (p && p->mm == mm)
__cpumask_set_cpu(cpu, tmpmask);
if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id))
goto out;
if (cpu_id == raw_smp_processor_id())
goto out;
rcu_read_lock();
p = rcu_dereference(cpu_rq(cpu_id)->curr);
if (!p || p->mm != mm) {
rcu_read_unlock();
goto out;
}
rcu_read_unlock();
} else {
int cpu;
rcu_read_lock();
for_each_online_cpu(cpu) {
struct task_struct *p;
/*
* Skipping the current CPU is OK even through we can be
* migrated at any point. The current CPU, at the point
* where we read raw_smp_processor_id(), is ensured to
* be in program order with respect to the caller
* thread. Therefore, we can skip this CPU from the
* iteration.
*/
if (cpu == raw_smp_processor_id())
continue;
p = rcu_dereference(cpu_rq(cpu)->curr);
if (p && p->mm == mm)
__cpumask_set_cpu(cpu, tmpmask);
}
rcu_read_unlock();
}
rcu_read_unlock();
preempt_disable();
smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
if (cpu_id >= 0)
smp_call_function_single(cpu_id, ipi_func, NULL, 1);
else
smp_call_function_many(tmpmask, ipi_func, NULL, 1);
preempt_enable();
free_cpumask_var(tmpmask);
out:
if (cpu_id < 0)
free_cpumask_var(tmpmask);
cpus_read_unlock();
/*
@ -283,11 +328,18 @@ static int membarrier_register_private_expedited(int flags)
set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED,
ret;
if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
return -EINVAL;
ready_state =
MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
} else if (flags == MEMBARRIER_FLAG_RSEQ) {
if (!IS_ENABLED(CONFIG_RSEQ))
return -EINVAL;
ready_state =
MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY;
} else {
WARN_ON_ONCE(flags);
}
/*
@ -299,6 +351,8 @@ static int membarrier_register_private_expedited(int flags)
return 0;
if (flags & MEMBARRIER_FLAG_SYNC_CORE)
set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
if (flags & MEMBARRIER_FLAG_RSEQ)
set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ;
atomic_or(set_state, &mm->membarrier_state);
ret = sync_runqueues_membarrier_state(mm);
if (ret)
@ -310,8 +364,15 @@ static int membarrier_register_private_expedited(int flags)
/**
* sys_membarrier - issue memory barriers on a set of threads
* @cmd: Takes command values defined in enum membarrier_cmd.
* @flags: Currently needs to be 0. For future extensions.
* @cmd: Takes command values defined in enum membarrier_cmd.
* @flags: Currently needs to be 0 for all commands other than
* MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: in the latter
* case it can be MEMBARRIER_CMD_FLAG_CPU, indicating that @cpu_id
* contains the CPU on which to interrupt (= restart)
* the RSEQ critical section.
* @cpu_id: if @flags == MEMBARRIER_CMD_FLAG_CPU, indicates the cpu on which
* RSEQ CS should be interrupted (@cmd must be
* MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ).
*
* If this system call is not implemented, -ENOSYS is returned. If the
* command specified does not exist, not available on the running
@ -337,10 +398,21 @@ static int membarrier_register_private_expedited(int flags)
* smp_mb() X O O
* sys_membarrier() O O O
*/
SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
SYSCALL_DEFINE3(membarrier, int, cmd, unsigned int, flags, int, cpu_id)
{
if (unlikely(flags))
return -EINVAL;
switch (cmd) {
case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
if (unlikely(flags && flags != MEMBARRIER_CMD_FLAG_CPU))
return -EINVAL;
break;
default:
if (unlikely(flags))
return -EINVAL;
}
if (!(flags & MEMBARRIER_CMD_FLAG_CPU))
cpu_id = -1;
switch (cmd) {
case MEMBARRIER_CMD_QUERY:
{
@ -362,13 +434,17 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
return membarrier_register_global_expedited();
case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
return membarrier_private_expedited(0);
return membarrier_private_expedited(0, cpu_id);
case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
return membarrier_register_private_expedited(0);
case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE, cpu_id);
case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
return membarrier_private_expedited(MEMBARRIER_FLAG_RSEQ, cpu_id);
case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ:
return membarrier_register_private_expedited(MEMBARRIER_FLAG_RSEQ);
default:
return -EINVAL;
}

View File

@ -25,10 +25,18 @@ static inline bool sched_debug(void)
return sched_debug_enabled;
}
#define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name },
const struct sd_flag_debug sd_flag_debug[] = {
#include <linux/sched/sd_flags.h>
};
#undef SD_FLAG
static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
struct cpumask *groupmask)
{
struct sched_group *group = sd->groups;
unsigned long flags = sd->flags;
unsigned int idx;
cpumask_clear(groupmask);
@ -43,6 +51,21 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
}
for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
unsigned int flag = BIT(idx);
unsigned int meta_flags = sd_flag_debug[idx].meta_flags;
if ((meta_flags & SDF_SHARED_CHILD) && sd->child &&
!(sd->child->flags & flag))
printk(KERN_ERR "ERROR: flag %s set here but not in child\n",
sd_flag_debug[idx].name);
if ((meta_flags & SDF_SHARED_PARENT) && sd->parent &&
!(sd->parent->flags & flag))
printk(KERN_ERR "ERROR: flag %s set here but not in parent\n",
sd_flag_debug[idx].name);
}
printk(KERN_DEBUG "%*s groups:", level + 1, "");
do {
if (!group) {
@ -137,22 +160,22 @@ static inline bool sched_debug(void)
}
#endif /* CONFIG_SCHED_DEBUG */
/* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */
#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) |
static const unsigned int SD_DEGENERATE_GROUPS_MASK =
#include <linux/sched/sd_flags.h>
0;
#undef SD_FLAG
static int sd_degenerate(struct sched_domain *sd)
{
if (cpumask_weight(sched_domain_span(sd)) == 1)
return 1;
/* Following flags need at least 2 groups */
if (sd->flags & (SD_BALANCE_NEWIDLE |
SD_BALANCE_FORK |
SD_BALANCE_EXEC |
SD_SHARE_CPUCAPACITY |
SD_ASYM_CPUCAPACITY |
SD_SHARE_PKG_RESOURCES |
SD_SHARE_POWERDOMAIN)) {
if (sd->groups != sd->groups->next)
return 0;
}
if ((sd->flags & SD_DEGENERATE_GROUPS_MASK) &&
(sd->groups != sd->groups->next))
return 0;
/* Following flags don't use groups */
if (sd->flags & (SD_WAKE_AFFINE))
@ -173,18 +196,9 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
return 0;
/* Flags needing groups don't count if only 1 group in parent */
if (parent->groups == parent->groups->next) {
pflags &= ~(SD_BALANCE_NEWIDLE |
SD_BALANCE_FORK |
SD_BALANCE_EXEC |
SD_ASYM_CPUCAPACITY |
SD_SHARE_CPUCAPACITY |
SD_SHARE_PKG_RESOURCES |
SD_PREFER_SIBLING |
SD_SHARE_POWERDOMAIN);
if (nr_node_ids == 1)
pflags &= ~SD_SERIALIZE;
}
if (parent->groups == parent->groups->next)
pflags &= ~SD_DEGENERATE_GROUPS_MASK;
if (~cflags & pflags)
return 0;
@ -1292,7 +1306,6 @@ int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
* SD_SHARE_CPUCAPACITY - describes SMT topologies
* SD_SHARE_PKG_RESOURCES - describes shared caches
* SD_NUMA - describes NUMA topologies
* SD_SHARE_POWERDOMAIN - describes shared power domain
*
* Odd one out, which beside describing the topology has a quirk also
* prescribes the desired behaviour that goes along with it:
@ -1303,8 +1316,7 @@ int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
(SD_SHARE_CPUCAPACITY | \
SD_SHARE_PKG_RESOURCES | \
SD_NUMA | \
SD_ASYM_PACKING | \
SD_SHARE_POWERDOMAIN)
SD_ASYM_PACKING)
static struct sched_domain *
sd_init(struct sched_domain_topology_level *tl,
@ -1336,8 +1348,8 @@ sd_init(struct sched_domain_topology_level *tl,
*sd = (struct sched_domain){
.min_interval = sd_weight,
.max_interval = 2*sd_weight,
.busy_factor = 32,
.imbalance_pct = 125,
.busy_factor = 16,
.imbalance_pct = 117,
.cache_nice_tries = 0,
@ -1989,11 +2001,10 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
/* Set up domains for CPUs specified by the cpu_map: */
for_each_cpu(i, cpu_map) {
struct sched_domain_topology_level *tl;
int dflags = 0;
sd = NULL;
for_each_sd_topology(tl) {
int dflags = 0;
if (tl == tl_asym) {
dflags |= SD_ASYM_CPUCAPACITY;
has_asym = true;

View File

@ -1,8 +1,10 @@
// SPDX-License-Identifier: LGPL-2.1
#define _GNU_SOURCE
#include <assert.h>
#include <linux/membarrier.h>
#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
@ -1131,6 +1133,220 @@ static int set_signal_handler(void)
return ret;
}
struct test_membarrier_thread_args {
int stop;
intptr_t percpu_list_ptr;
};
/* Worker threads modify data in their "active" percpu lists. */
void *test_membarrier_worker_thread(void *arg)
{
struct test_membarrier_thread_args *args =
(struct test_membarrier_thread_args *)arg;
const int iters = opt_reps;
int i;
if (rseq_register_current_thread()) {
fprintf(stderr, "Error: rseq_register_current_thread(...) failed(%d): %s\n",
errno, strerror(errno));
abort();
}
/* Wait for initialization. */
while (!atomic_load(&args->percpu_list_ptr)) {}
for (i = 0; i < iters; ++i) {
int ret;
do {
int cpu = rseq_cpu_start();
ret = rseq_offset_deref_addv(&args->percpu_list_ptr,
sizeof(struct percpu_list_entry) * cpu, 1, cpu);
} while (rseq_unlikely(ret));
}
if (rseq_unregister_current_thread()) {
fprintf(stderr, "Error: rseq_unregister_current_thread(...) failed(%d): %s\n",
errno, strerror(errno));
abort();
}
return NULL;
}
void test_membarrier_init_percpu_list(struct percpu_list *list)
{
int i;
memset(list, 0, sizeof(*list));
for (i = 0; i < CPU_SETSIZE; i++) {
struct percpu_list_node *node;
node = malloc(sizeof(*node));
assert(node);
node->data = 0;
node->next = NULL;
list->c[i].head = node;
}
}
void test_membarrier_free_percpu_list(struct percpu_list *list)
{
int i;
for (i = 0; i < CPU_SETSIZE; i++)
free(list->c[i].head);
}
static int sys_membarrier(int cmd, int flags, int cpu_id)
{
return syscall(__NR_membarrier, cmd, flags, cpu_id);
}
/*
* The manager thread swaps per-cpu lists that worker threads see,
* and validates that there are no unexpected modifications.
*/
void *test_membarrier_manager_thread(void *arg)
{
struct test_membarrier_thread_args *args =
(struct test_membarrier_thread_args *)arg;
struct percpu_list list_a, list_b;
intptr_t expect_a = 0, expect_b = 0;
int cpu_a = 0, cpu_b = 0;
if (rseq_register_current_thread()) {
fprintf(stderr, "Error: rseq_register_current_thread(...) failed(%d): %s\n",
errno, strerror(errno));
abort();
}
/* Init lists. */
test_membarrier_init_percpu_list(&list_a);
test_membarrier_init_percpu_list(&list_b);
atomic_store(&args->percpu_list_ptr, (intptr_t)&list_a);
while (!atomic_load(&args->stop)) {
/* list_a is "active". */
cpu_a = rand() % CPU_SETSIZE;
/*
* As list_b is "inactive", we should never see changes
* to list_b.
*/
if (expect_b != atomic_load(&list_b.c[cpu_b].head->data)) {
fprintf(stderr, "Membarrier test failed\n");
abort();
}
/* Make list_b "active". */
atomic_store(&args->percpu_list_ptr, (intptr_t)&list_b);
if (sys_membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ,
MEMBARRIER_CMD_FLAG_CPU, cpu_a) &&
errno != ENXIO /* missing CPU */) {
perror("sys_membarrier");
abort();
}
/*
* Cpu A should now only modify list_b, so the values
* in list_a should be stable.
*/
expect_a = atomic_load(&list_a.c[cpu_a].head->data);
cpu_b = rand() % CPU_SETSIZE;
/*
* As list_a is "inactive", we should never see changes
* to list_a.
*/
if (expect_a != atomic_load(&list_a.c[cpu_a].head->data)) {
fprintf(stderr, "Membarrier test failed\n");
abort();
}
/* Make list_a "active". */
atomic_store(&args->percpu_list_ptr, (intptr_t)&list_a);
if (sys_membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ,
MEMBARRIER_CMD_FLAG_CPU, cpu_b) &&
errno != ENXIO /* missing CPU*/) {
perror("sys_membarrier");
abort();
}
/* Remember a value from list_b. */
expect_b = atomic_load(&list_b.c[cpu_b].head->data);
}
test_membarrier_free_percpu_list(&list_a);
test_membarrier_free_percpu_list(&list_b);
if (rseq_unregister_current_thread()) {
fprintf(stderr, "Error: rseq_unregister_current_thread(...) failed(%d): %s\n",
errno, strerror(errno));
abort();
}
return NULL;
}
/* Test MEMBARRIER_CMD_PRIVATE_RESTART_RSEQ_ON_CPU membarrier command. */
#ifdef RSEQ_ARCH_HAS_OFFSET_DEREF_ADDV
void test_membarrier(void)
{
const int num_threads = opt_threads;
struct test_membarrier_thread_args thread_args;
pthread_t worker_threads[num_threads];
pthread_t manager_thread;
int i, ret;
if (sys_membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, 0, 0)) {
perror("sys_membarrier");
abort();
}
thread_args.stop = 0;
thread_args.percpu_list_ptr = 0;
ret = pthread_create(&manager_thread, NULL,
test_membarrier_manager_thread, &thread_args);
if (ret) {
errno = ret;
perror("pthread_create");
abort();
}
for (i = 0; i < num_threads; i++) {
ret = pthread_create(&worker_threads[i], NULL,
test_membarrier_worker_thread, &thread_args);
if (ret) {
errno = ret;
perror("pthread_create");
abort();
}
}
for (i = 0; i < num_threads; i++) {
ret = pthread_join(worker_threads[i], NULL);
if (ret) {
errno = ret;
perror("pthread_join");
abort();
}
}
atomic_store(&thread_args.stop, 1);
ret = pthread_join(manager_thread, NULL);
if (ret) {
errno = ret;
perror("pthread_join");
abort();
}
}
#else /* RSEQ_ARCH_HAS_OFFSET_DEREF_ADDV */
void test_membarrier(void)
{
fprintf(stderr, "rseq_offset_deref_addv is not implemented on this architecture. "
"Skipping membarrier test.\n");
}
#endif
static void show_usage(int argc, char **argv)
{
printf("Usage : %s <OPTIONS>\n",
@ -1153,7 +1369,7 @@ static void show_usage(int argc, char **argv)
printf(" [-r N] Number of repetitions per thread (default 5000)\n");
printf(" [-d] Disable rseq system call (no initialization)\n");
printf(" [-D M] Disable rseq for each M threads\n");
printf(" [-T test] Choose test: (s)pinlock, (l)ist, (b)uffer, (m)emcpy, (i)ncrement\n");
printf(" [-T test] Choose test: (s)pinlock, (l)ist, (b)uffer, (m)emcpy, (i)ncrement, membarrie(r)\n");
printf(" [-M] Push into buffer and memcpy buffer with memory barriers.\n");
printf(" [-v] Verbose output.\n");
printf(" [-h] Show this help.\n");
@ -1268,6 +1484,7 @@ int main(int argc, char **argv)
case 'i':
case 'b':
case 'm':
case 'r':
break;
default:
show_usage(argc, argv);
@ -1320,6 +1537,10 @@ int main(int argc, char **argv)
printf_verbose("counter increment\n");
test_percpu_inc();
break;
case 'r':
printf_verbose("membarrier\n");
test_membarrier();
break;
}
if (!opt_disable_rseq && rseq_unregister_current_thread())
abort();

View File

@ -279,6 +279,63 @@ int rseq_addv(intptr_t *v, intptr_t count, int cpu)
#endif
}
#define RSEQ_ARCH_HAS_OFFSET_DEREF_ADDV
/*
* pval = *(ptr+off)
* *pval += inc;
*/
static inline __attribute__((always_inline))
int rseq_offset_deref_addv(intptr_t *ptr, off_t off, intptr_t inc, int cpu)
{
RSEQ_INJECT_C(9)
__asm__ __volatile__ goto (
RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
#endif
/* Start rseq by storing table entry pointer into rseq_cs. */
RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_CS_OFFSET(%[rseq_abi]))
RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_CPU_ID_OFFSET(%[rseq_abi]), 4f)
RSEQ_INJECT_ASM(3)
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_CPU_ID_OFFSET(%[rseq_abi]), %l[error1])
#endif
/* get p+v */
"movq %[ptr], %%rbx\n\t"
"addq %[off], %%rbx\n\t"
/* get pv */
"movq (%%rbx), %%rcx\n\t"
/* *pv += inc */
"addq %[inc], (%%rcx)\n\t"
"2:\n\t"
RSEQ_INJECT_ASM(4)
RSEQ_ASM_DEFINE_ABORT(4, "", abort)
: /* gcc asm goto does not allow outputs */
: [cpu_id] "r" (cpu),
[rseq_abi] "r" (&__rseq_abi),
/* final store input */
[ptr] "m" (*ptr),
[off] "er" (off),
[inc] "er" (inc)
: "memory", "cc", "rax", "rbx", "rcx"
RSEQ_INJECT_CLOBBER
: abort
#ifdef RSEQ_COMPARE_TWICE
, error1
#endif
);
return 0;
abort:
RSEQ_INJECT_FAILED
return -1;
#ifdef RSEQ_COMPARE_TWICE
error1:
rseq_bug("cpu_id comparison failed");
#endif
}
static inline __attribute__((always_inline))
int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
intptr_t *v2, intptr_t newv2,

View File

@ -15,6 +15,7 @@ TEST_LIST=(
"-T m"
"-T m -M"
"-T i"
"-T r"
)
TEST_NAME=(
@ -25,6 +26,7 @@ TEST_NAME=(
"memcpy"
"memcpy with barrier"
"increment"
"membarrier"
)
IFS="$OLDIFS"