mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2024-11-24 01:20:52 +07:00
Scheduler changes for v5.10:

Merge tag 'sched-core-2020-10-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:

 - Reorganize & clean up the SD* flags definitions and add a bunch of
   sanity checks. These new checks caught quite a few bugs or at least
   inconsistencies, resulting in another set of patches.
 - Rseq updates, add MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ
 - Add a new tracepoint to improve CPU capacity tracking
 - Improve overloaded SMP system load-balancing behavior
 - Tweak SMT balancing
 - Energy-aware scheduling updates
 - NUMA balancing improvements
 - Deadline scheduler fixes and improvements
 - CPU isolation fixes
 - Misc cleanups, simplifications and smaller optimizations

Signed-off-by: Ingo Molnar <mingo@kernel.org>

* tag 'sched-core-2020-10-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (42 commits)
  sched/deadline: Unthrottle PI boosted threads while enqueuing
  sched/debug: Add new tracepoint to track cpu_capacity
  sched/fair: Tweak pick_next_entity()
  rseq/selftests: Test MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ
  rseq/selftests,x86_64: Add rseq_offset_deref_addv()
  rseq/membarrier: Add MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ
  sched/fair: Use dst group while checking imbalance for NUMA balancer
  sched/fair: Reduce busy load balance interval
  sched/fair: Minimize concurrent LBs between domain level
  sched/fair: Reduce minimal imbalance threshold
  sched/fair: Relax constraint on task's load during load balance
  sched/fair: Remove the force parameter of update_tg_load_avg()
  sched/fair: Fix wrong cpu selecting from isolated domain
  sched: Remove unused inline function uclamp_bucket_base_value()
  sched/rt: Disable RT_RUNTIME_SHARE by default
  sched/deadline: Fix stale throttling on de-/boosted tasks
  sched/numa: Use runnable_avg to classify node
  sched/topology: Move sd_flag_debug out of #ifdef CONFIG_SYSCTL
  MAINTAINERS: Add myself as SCHED_DEADLINE reviewer
  sched/topology: Move SD_DEGENERATE_GROUPS_MASK out of linux/sched/topology.h
  ...
This commit is contained in commit edaa5ddf38
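The headline userspace-visible addition in this pull is MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ and the extra cpu_id argument to sys_membarrier(). A minimal usage sketch (illustrative only, not part of the diff below; it assumes uapi headers new enough to carry the new constants, and mirrors the selftest further down):

/* Minimal sketch of the new membarrier() usage for rseq restart. */
#define _GNU_SOURCE
#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

static int sys_membarrier(int cmd, unsigned int flags, int cpu_id)
{
    /* The syscall now takes a third cpu_id argument (see the membarrier.c hunks below). */
    return syscall(__NR_membarrier, cmd, flags, cpu_id);
}

int main(void)
{
    /* A process must register its intent first, otherwise the command returns -EPERM. */
    if (sys_membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, 0, 0))
        perror("register");

    /* Restart rseq critical sections on all running thread siblings... */
    if (sys_membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ, 0, 0))
        perror("all cpus");

    /* ...or only on the sibling currently running on CPU 2. */
    if (sys_membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ,
                       MEMBARRIER_CMD_FLAG_CPU, 2))
        perror("cpu 2");
    return 0;
}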
@@ -15407,6 +15407,7 @@ R: Dietmar Eggemann <dietmar.eggemann@arm.com> (SCHED_NORMAL)
R: Steven Rostedt <rostedt@goodmis.org> (SCHED_FIFO/SCHED_RR)
R: Ben Segall <bsegall@google.com> (CONFIG_CFS_BANDWIDTH)
R: Mel Gorman <mgorman@suse.de> (CONFIG_NUMA_BALANCING)
R: Daniel Bristot de Oliveira <bristot@redhat.com> (SCHED_DEADLINE)
L: linux-kernel@vger.kernel.org
S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched/core
@@ -177,15 +177,6 @@ static inline void parse_dt_topology(void) {}
static inline void update_cpu_capacity(unsigned int cpuid) {}
#endif

/*
 * The current assumption is that we can power gate each core independently.
 * This will be superseded by DT binding once available.
 */
const struct cpumask *cpu_corepower_mask(int cpu)
{
    return &cpu_topology[cpu].thread_sibling;
}

/*
 * store_cpu_topology is called at boot when only one cpu is running
 * and with the mutex cpu_hotplug.lock locked, when several cpus have booted,
@@ -241,20 +232,6 @@ void store_cpu_topology(unsigned int cpuid)
    update_siblings_masks(cpuid);
}

static inline int cpu_corepower_flags(void)
{
    return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN;
}

static struct sched_domain_topology_level arm_topology[] = {
#ifdef CONFIG_SCHED_MC
    { cpu_corepower_mask, cpu_corepower_flags, SD_INIT_NAME(GMC) },
    { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
    { cpu_cpu_mask, SD_INIT_NAME(DIE) },
    { NULL, },
};

/*
 * init_cpu_topology is called at boot when only one cpu is running
 * which prevent simultaneous write access to cpu_topology array
@@ -265,7 +242,4 @@ void __init init_cpu_topology(void)
    smp_wmb();

    parse_dt_topology();

    /* Set scheduler topology descriptor */
    set_sched_topology(arm_topology);
}
@@ -1491,9 +1491,10 @@ extern struct pid *cad_pid;
/*
 * Per process flags
 */
#define PF_VCPU 0x00000001 /* I'm a virtual CPU */
#define PF_IDLE 0x00000002 /* I am an IDLE thread */
#define PF_EXITING 0x00000004 /* Getting shut down */
#define PF_VCPU 0x00000010 /* I'm a virtual CPU */
#define PF_IO_WORKER 0x00000010 /* Task is an IO worker */
#define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */
#define PF_FORKNOEXEC 0x00000040 /* Forked but didn't exec */
#define PF_MCE_PROCESS 0x00000080 /* Process policy on mce errors */
@@ -1517,7 +1518,6 @@ extern struct pid *cad_pid;
#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */
#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
#define PF_MEMALLOC_NOCMA 0x10000000 /* All allocation request will have _GFP_MOVABLE cleared */
#define PF_IO_WORKER 0x20000000 /* Task is an IO worker */
#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */
#define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */

@@ -2046,6 +2046,7 @@ const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq);
const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq);

int sched_trace_rq_cpu(struct rq *rq);
int sched_trace_rq_cpu_capacity(struct rq *rq);
int sched_trace_rq_nr_running(struct rq *rq);

const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
@@ -348,10 +348,13 @@ enum {
    MEMBARRIER_STATE_GLOBAL_EXPEDITED = (1U << 3),
    MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY = (1U << 4),
    MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE = (1U << 5),
    MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY = (1U << 6),
    MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ = (1U << 7),
};

enum {
    MEMBARRIER_FLAG_SYNC_CORE = (1U << 0),
    MEMBARRIER_FLAG_RSEQ = (1U << 1),
};

#ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
include/linux/sched/sd_flags.h (new file, 156 lines)
@@ -0,0 +1,156 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * sched-domains (multiprocessor balancing) flag declarations.
 */

#ifndef SD_FLAG
# error "Incorrect import of SD flags definitions"
#endif

/*
 * Hierarchical metaflags
 *
 * SHARED_CHILD: These flags are meant to be set from the base domain upwards.
 * If a domain has this flag set, all of its children should have it set. This
 * is usually because the flag describes some shared resource (all CPUs in that
 * domain share the same resource), or because they are tied to a scheduling
 * behaviour that we want to disable at some point in the hierarchy for
 * scalability reasons.
 *
 * In those cases it doesn't make sense to have the flag set for a domain but
 * not have it in (some of) its children: sched domains ALWAYS span their child
 * domains, so operations done with parent domains will cover CPUs in the lower
 * child domains.
 *
 *
 * SHARED_PARENT: These flags are meant to be set from the highest domain
 * downwards. If a domain has this flag set, all of its parents should have it
 * set. This is usually for topology properties that start to appear above a
 * certain level (e.g. domain starts spanning CPUs outside of the base CPU's
 * socket).
 */
#define SDF_SHARED_CHILD 0x1
#define SDF_SHARED_PARENT 0x2

/*
 * Behavioural metaflags
 *
 * NEEDS_GROUPS: These flags are only relevant if the domain they are set on has
 * more than one group. This is usually for balancing flags (load balancing
 * involves equalizing a metric between groups), or for flags describing some
 * shared resource (which would be shared between groups).
 */
#define SDF_NEEDS_GROUPS 0x4

/*
 * Balance when about to become idle
 *
 * SHARED_CHILD: Set from the base domain up to cpuset.sched_relax_domain_level.
 * NEEDS_GROUPS: Load balancing flag.
 */
SD_FLAG(SD_BALANCE_NEWIDLE, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)

/*
 * Balance on exec
 *
 * SHARED_CHILD: Set from the base domain up to the NUMA reclaim level.
 * NEEDS_GROUPS: Load balancing flag.
 */
SD_FLAG(SD_BALANCE_EXEC, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)

/*
 * Balance on fork, clone
 *
 * SHARED_CHILD: Set from the base domain up to the NUMA reclaim level.
 * NEEDS_GROUPS: Load balancing flag.
 */
SD_FLAG(SD_BALANCE_FORK, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)

/*
 * Balance on wakeup
 *
 * SHARED_CHILD: Set from the base domain up to cpuset.sched_relax_domain_level.
 * NEEDS_GROUPS: Load balancing flag.
 */
SD_FLAG(SD_BALANCE_WAKE, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)

/*
 * Consider waking task on waking CPU.
 *
 * SHARED_CHILD: Set from the base domain up to the NUMA reclaim level.
 */
SD_FLAG(SD_WAKE_AFFINE, SDF_SHARED_CHILD)

/*
 * Domain members have different CPU capacities
 *
 * SHARED_PARENT: Set from the topmost domain down to the first domain where
 * asymmetry is detected.
 * NEEDS_GROUPS: Per-CPU capacity is asymmetric between groups.
 */
SD_FLAG(SD_ASYM_CPUCAPACITY, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)

/*
 * Domain members share CPU capacity (i.e. SMT)
 *
 * SHARED_CHILD: Set from the base domain up until spanned CPUs no longer share
 * CPU capacity.
 * NEEDS_GROUPS: Capacity is shared between groups.
 */
SD_FLAG(SD_SHARE_CPUCAPACITY, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)

/*
 * Domain members share CPU package resources (i.e. caches)
 *
 * SHARED_CHILD: Set from the base domain up until spanned CPUs no longer share
 * the same cache(s).
 * NEEDS_GROUPS: Caches are shared between groups.
 */
SD_FLAG(SD_SHARE_PKG_RESOURCES, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)

/*
 * Only a single load balancing instance
 *
 * SHARED_PARENT: Set for all NUMA levels above NODE. Could be set from a
 * different level upwards, but it doesn't change that if a
 * domain has this flag set, then all of its parents need to have
 * it too (otherwise the serialization doesn't make sense).
 * NEEDS_GROUPS: No point in preserving domain if it has a single group.
 */
SD_FLAG(SD_SERIALIZE, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)

/*
 * Place busy tasks earlier in the domain
 *
 * SHARED_CHILD: Usually set on the SMT level. Technically could be set further
 * up, but currently assumed to be set from the base domain
 * upwards (see update_top_cache_domain()).
 * NEEDS_GROUPS: Load balancing flag.
 */
SD_FLAG(SD_ASYM_PACKING, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)

/*
 * Prefer to place tasks in a sibling domain
 *
 * Set up until domains start spanning NUMA nodes. Close to being a SHARED_CHILD
 * flag, but cleared below domains with SD_ASYM_CPUCAPACITY.
 *
 * NEEDS_GROUPS: Load balancing flag.
 */
SD_FLAG(SD_PREFER_SIBLING, SDF_NEEDS_GROUPS)

/*
 * sched_groups of this level overlap
 *
 * SHARED_PARENT: Set for all NUMA levels above NODE.
 * NEEDS_GROUPS: Overlaps can only exist with more than one group.
 */
SD_FLAG(SD_OVERLAP, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)

/*
 * Cross-node balancing
 *
 * SHARED_PARENT: Set for all NUMA levels above NODE.
 * NEEDS_GROUPS: No point in preserving domain if it has a single group.
 */
SD_FLAG(SD_NUMA, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
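This header is only ever consumed through an SD_FLAG() X-macro; the following hunk to the scheduler topology header includes it twice to generate flag indexes and bit values. A hand-expanded sketch of what the first few entries become after preprocessing (illustration only, not literal kernel output):

/* First inclusion, with SD_FLAG(name, mflags) defined as __##name, */
enum {
    __SD_BALANCE_NEWIDLE,    /* 0 */
    __SD_BALANCE_EXEC,       /* 1 */
    __SD_BALANCE_FORK,       /* 2 */
    /* ... one index per SD_FLAG() line above ... */
    __SD_FLAG_CNT,
};

/* Second inclusion, with SD_FLAG(name, mflags) defined as name = 1 << __##name, */
enum {
    SD_BALANCE_NEWIDLE = 1 << __SD_BALANCE_NEWIDLE,    /* 0x0001 */
    SD_BALANCE_EXEC    = 1 << __SD_BALANCE_EXEC,       /* 0x0002 */
    SD_BALANCE_FORK    = 1 << __SD_BALANCE_FORK,       /* 0x0004 */
    /* ... */
};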
@@ -11,20 +11,29 @@
 */
#ifdef CONFIG_SMP

#define SD_BALANCE_NEWIDLE 0x0001 /* Balance when about to become idle */
#define SD_BALANCE_EXEC 0x0002 /* Balance on exec */
#define SD_BALANCE_FORK 0x0004 /* Balance on fork, clone */
#define SD_BALANCE_WAKE 0x0008 /* Balance on wakeup */
#define SD_WAKE_AFFINE 0x0010 /* Wake task to waking CPU */
#define SD_ASYM_CPUCAPACITY 0x0020 /* Domain members have different CPU capacities */
#define SD_SHARE_CPUCAPACITY 0x0040 /* Domain members share CPU capacity */
#define SD_SHARE_POWERDOMAIN 0x0080 /* Domain members share power domain */
#define SD_SHARE_PKG_RESOURCES 0x0100 /* Domain members share CPU pkg resources */
#define SD_SERIALIZE 0x0200 /* Only a single load balancing instance */
#define SD_ASYM_PACKING 0x0400 /* Place busy groups earlier in the domain */
#define SD_PREFER_SIBLING 0x0800 /* Prefer to place tasks in a sibling domain */
#define SD_OVERLAP 0x1000 /* sched_domains of this level overlap */
#define SD_NUMA 0x2000 /* cross-node balancing */
/* Generate SD flag indexes */
#define SD_FLAG(name, mflags) __##name,
enum {
#include <linux/sched/sd_flags.h>
    __SD_FLAG_CNT,
};
#undef SD_FLAG
/* Generate SD flag bits */
#define SD_FLAG(name, mflags) name = 1 << __##name,
enum {
#include <linux/sched/sd_flags.h>
};
#undef SD_FLAG

#ifdef CONFIG_SCHED_DEBUG

struct sd_flag_debug {
    unsigned int meta_flags;
    char *name;
};
extern const struct sd_flag_debug sd_flag_debug[];

#endif

#ifdef CONFIG_SCHED_SMT
static inline int cpu_smt_flags(void)
@@ -974,7 +974,7 @@ asmlinkage long sys_execveat(int dfd, const char __user *filename,
            const char __user *const __user *argv,
            const char __user *const __user *envp, int flags);
asmlinkage long sys_userfaultfd(int flags);
asmlinkage long sys_membarrier(int cmd, int flags);
asmlinkage long sys_membarrier(int cmd, unsigned int flags, int cpu_id);
asmlinkage long sys_mlock2(unsigned long start, size_t len, int flags);
asmlinkage long sys_copy_file_range(int fd_in, loff_t __user *off_in,
            int fd_out, loff_t __user *off_out,
@@ -630,6 +630,10 @@ DECLARE_TRACE(pelt_se_tp,
    TP_PROTO(struct sched_entity *se),
    TP_ARGS(se));

DECLARE_TRACE(sched_cpu_capacity_tp,
    TP_PROTO(struct rq *rq),
    TP_ARGS(rq));

DECLARE_TRACE(sched_overutilized_tp,
    TP_PROTO(struct root_domain *rd, bool overutilized),
    TP_ARGS(rd, overutilized));
@@ -114,6 +114,26 @@
 * If this command is not implemented by an
 * architecture, -EINVAL is returned.
 * Returns 0 on success.
 * @MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
 * Ensure the caller thread, upon return from
 * system call, that all its running thread
 * siblings have any currently running rseq
 * critical sections restarted if @flags
 * parameter is 0; if @flags parameter is
 * MEMBARRIER_CMD_FLAG_CPU,
 * then this operation is performed only
 * on CPU indicated by @cpu_id. If this command is
 * not implemented by an architecture, -EINVAL
 * is returned. A process needs to register its
 * intent to use the private expedited rseq
 * command prior to using it, otherwise
 * this command returns -EPERM.
 * @MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ:
 * Register the process intent to use
 * MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ.
 * If this command is not implemented by an
 * architecture, -EINVAL is returned.
 * Returns 0 on success.
 * @MEMBARRIER_CMD_SHARED:
 * Alias to MEMBARRIER_CMD_GLOBAL. Provided for
 * header backward compatibility.
@@ -131,9 +151,15 @@ enum membarrier_cmd {
    MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED = (1 << 4),
    MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE = (1 << 5),
    MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE = (1 << 6),
    MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ = (1 << 7),
    MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ = (1 << 8),

    /* Alias for header backward compatibility. */
    MEMBARRIER_CMD_SHARED = MEMBARRIER_CMD_GLOBAL,
};

enum membarrier_cmd_flag {
    MEMBARRIER_CMD_FLAG_CPU = (1 << 0),
};

#endif /* _UAPI_LINUX_MEMBARRIER_H */
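Since MEMBARRIER_CMD_QUERY returns a bitmask of the commands the running kernel supports, userspace can probe for the new RSEQ commands before depending on them. A small sketch (assumes the same includes and sys_membarrier() wrapper as the example near the top of this page):

static int have_membarrier_rseq(void)
{
    int mask = sys_membarrier(MEMBARRIER_CMD_QUERY, 0, 0);

    /* A negative return means the query itself is unsupported. */
    return mask >= 0 &&
           (mask & MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ) &&
           (mask & MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ);
}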
@@ -36,6 +36,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
@@ -940,11 +941,6 @@ static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
    return clamp_value / UCLAMP_BUCKET_DELTA;
}

static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)
{
    return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
}

static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
{
    if (clamp_id == UCLAMP_MIN)
@@ -4551,9 +4547,12 @@ void __noreturn do_task_dead(void)

static inline void sched_submit_work(struct task_struct *tsk)
{
    unsigned int task_flags;

    if (!tsk->state)
        return;

    task_flags = tsk->flags;
    /*
     * If a worker went to sleep, notify and ask workqueue whether
     * it wants to wake up a task to maintain concurrency.
@@ -4562,9 +4561,9 @@ static inline void sched_submit_work(struct task_struct *tsk)
     * in the possible wakeup of a kworker and because wq_worker_sleeping()
     * requires it.
     */
    if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
    if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
        preempt_disable();
        if (tsk->flags & PF_WQ_WORKER)
        if (task_flags & PF_WQ_WORKER)
            wq_worker_sleeping(tsk);
        else
            io_wq_worker_sleeping(tsk);
@@ -1525,14 +1525,38 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
     */
    if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) {
        pi_se = &pi_task->dl;
        /*
         * Because of delays in the detection of the overrun of a
         * thread's runtime, it might be the case that a thread
         * goes to sleep in a rt mutex with negative runtime. As
         * a consequence, the thread will be throttled.
         *
         * While waiting for the mutex, this thread can also be
         * boosted via PI, resulting in a thread that is throttled
         * and boosted at the same time.
         *
         * In this case, the boost overrides the throttle.
         */
        if (p->dl.dl_throttled) {
            /*
             * The replenish timer needs to be canceled. No
             * problem if it fires concurrently: boosted threads
             * are ignored in dl_task_timer().
             */
            hrtimer_try_to_cancel(&p->dl.dl_timer);
            p->dl.dl_throttled = 0;
        }
    } else if (!dl_prio(p->normal_prio)) {
        /*
         * Special case in which we have a !SCHED_DEADLINE task
         * that is going to be deboosted, but exceeds its
         * runtime while doing so. No point in replenishing
         * it, as it's going to return back to its original
         * scheduling class after this.
         * Special case in which we have a !SCHED_DEADLINE task that is going
         * to be deboosted, but exceeds its runtime while doing so. No point in
         * replenishing it, as it's going to return back to its original
         * scheduling class after this. If it has been throttled, we need to
         * clear the flag, otherwise the task may wake up as throttled after
         * being boosted again with no means to replenish the runtime and clear
         * the throttle.
         */
        p->dl.dl_throttled = 0;
        BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH);
        return;
    }
@@ -245,6 +245,60 @@ set_table_entry(struct ctl_table *entry,
    entry->proc_handler = proc_handler;
}

static int sd_ctl_doflags(struct ctl_table *table, int write,
                          void *buffer, size_t *lenp, loff_t *ppos)
{
    unsigned long flags = *(unsigned long *)table->data;
    size_t data_size = 0;
    size_t len = 0;
    char *tmp;
    int idx;

    if (write)
        return 0;

    for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
        char *name = sd_flag_debug[idx].name;

        /* Name plus whitespace */
        data_size += strlen(name) + 1;
    }

    if (*ppos > data_size) {
        *lenp = 0;
        return 0;
    }

    tmp = kcalloc(data_size + 1, sizeof(*tmp), GFP_KERNEL);
    if (!tmp)
        return -ENOMEM;

    for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
        char *name = sd_flag_debug[idx].name;

        len += snprintf(tmp + len, strlen(name) + 2, "%s ", name);
    }

    tmp += *ppos;
    len -= *ppos;

    if (len > *lenp)
        len = *lenp;
    if (len)
        memcpy(buffer, tmp, len);
    if (len < *lenp) {
        ((char *)buffer)[len] = '\n';
        len++;
    }

    *lenp = len;
    *ppos += len;

    kfree(tmp);

    return 0;
}

static struct ctl_table *
sd_alloc_ctl_domain_table(struct sched_domain *sd)
{
@@ -258,7 +312,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
    set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax);
    set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax);
    set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax);
    set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0444, proc_dointvec_minmax);
    set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0444, sd_ctl_doflags);
    set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax);
    set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring);
    /* &table[8] is terminator */
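With sd_ctl_doflags() wired up above, the per-domain "flags" sysctl file prints flag names instead of a raw integer. A throwaway reader sketch, assuming the existing /proc/sys/kernel/sched_domain/cpuN/domainN layout and a CONFIG_SCHED_DEBUG kernel (cpu0/domain0 are only example indices):

#include <stdio.h>

int main(void)
{
    FILE *f = fopen("/proc/sys/kernel/sched_domain/cpu0/domain0/flags", "r");
    char buf[4096];

    if (!f)
        return 1;
    /* Prints e.g. "SD_BALANCE_NEWIDLE SD_BALANCE_EXEC SD_BALANCE_FORK ..." */
    if (fgets(buf, sizeof(buf), f))
        fputs(buf, stdout);
    fclose(f);
    return 0;
}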
@@ -831,7 +831,7 @@ void init_entity_runnable_average(struct sched_entity *se)
void post_init_entity_util_avg(struct task_struct *p)
{
}
static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
static void update_tg_load_avg(struct cfs_rq *cfs_rq)
{
}
#endif /* CONFIG_SMP */
@@ -1504,6 +1504,7 @@ enum numa_type {
/* Cached statistics for all CPUs within a node */
struct numa_stats {
    unsigned long load;
    unsigned long runnable;
    unsigned long util;
    /* Total compute capacity of CPUs on a node */
    unsigned long compute_capacity;
@@ -1547,19 +1548,22 @@ struct task_numa_env {
};

static unsigned long cpu_load(struct rq *rq);
static unsigned long cpu_runnable(struct rq *rq);
static unsigned long cpu_util(int cpu);
static inline long adjust_numa_imbalance(int imbalance, int src_nr_running);
static inline long adjust_numa_imbalance(int imbalance, int nr_running);

static inline enum
numa_type numa_classify(unsigned int imbalance_pct,
                        struct numa_stats *ns)
{
    if ((ns->nr_running > ns->weight) &&
        ((ns->compute_capacity * 100) < (ns->util * imbalance_pct)))
        (((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) ||
         ((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100))))
        return node_overloaded;

    if ((ns->nr_running < ns->weight) ||
        ((ns->compute_capacity * 100) > (ns->util * imbalance_pct)))
        (((ns->compute_capacity * 100) > (ns->util * imbalance_pct)) &&
         ((ns->compute_capacity * imbalance_pct) > (ns->runnable * 100))))
        return node_has_spare;

    return node_fully_busy;
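The numa_classify() hunk above now treats a node as overloaded when either utilization or runnable pressure crosses the imbalance_pct threshold, and as having spare capacity only when both stay below it. A standalone restatement of that rule (a sketch that mirrors, but is not, the kernel code; the type names are simplified):

enum node_state { NODE_HAS_SPARE, NODE_FULLY_BUSY, NODE_OVERLOADED };

struct node_stats {
    unsigned long util, runnable, compute_capacity;
    unsigned int nr_running, weight;
};

static enum node_state classify_node(unsigned int imbalance_pct, const struct node_stats *ns)
{
    /* Too many tasks and either utilization or runnable pressure over the threshold. */
    if (ns->nr_running > ns->weight &&
        ((ns->compute_capacity * 100 < ns->util * imbalance_pct) ||
         (ns->compute_capacity * imbalance_pct < ns->runnable * 100)))
        return NODE_OVERLOADED;

    /* Idle CPUs available, or both metrics comfortably under the threshold. */
    if (ns->nr_running < ns->weight ||
        ((ns->compute_capacity * 100 > ns->util * imbalance_pct) &&
         (ns->compute_capacity * imbalance_pct > ns->runnable * 100)))
        return NODE_HAS_SPARE;

    return NODE_FULLY_BUSY;
}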
@@ -1610,6 +1614,7 @@ static void update_numa_stats(struct task_numa_env *env,
        struct rq *rq = cpu_rq(cpu);

        ns->load += cpu_load(rq);
        ns->runnable += cpu_runnable(rq);
        ns->util += cpu_util(cpu);
        ns->nr_running += rq->cfs.h_nr_running;
        ns->compute_capacity += capacity_of(cpu);
@@ -1925,7 +1930,7 @@ static void task_numa_find_cpu(struct task_numa_env *env,
        src_running = env->src_stats.nr_running - 1;
        dst_running = env->dst_stats.nr_running + 1;
        imbalance = max(0, dst_running - src_running);
        imbalance = adjust_numa_imbalance(imbalance, src_running);
        imbalance = adjust_numa_imbalance(imbalance, dst_running);

        /* Use idle CPU if there is no imbalance */
        if (!imbalance) {
@@ -3084,7 +3089,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
        /* commit outstanding execution time */
        if (cfs_rq->curr == se)
            update_curr(cfs_rq);
        account_entity_dequeue(cfs_rq, se);
        update_load_sub(&cfs_rq->load, se->load.weight);
    }
    dequeue_load_avg(cfs_rq, se);

@@ -3100,7 +3105,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,

    enqueue_load_avg(cfs_rq, se);
    if (se->on_rq)
        account_entity_enqueue(cfs_rq, se);
        update_load_add(&cfs_rq->load, se->load.weight);

}

@@ -3288,7 +3293,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
/**
 * update_tg_load_avg - update the tg's load avg
 * @cfs_rq: the cfs_rq whose avg changed
 * @force: update regardless of how small the difference
 *
 * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
 * However, because tg->load_avg is a global value there are performance
@@ -3300,7 +3304,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
 *
 * Updating tg's load_avg is necessary before update_cfs_share().
 */
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
{
    long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;

@@ -3310,7 +3314,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
    if (cfs_rq->tg == &root_task_group)
        return;

    if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
    if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
        atomic_long_add(delta, &cfs_rq->tg->load_avg);
        cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
    }
@@ -3612,7 +3616,7 @@ static inline bool skip_blocked_update(struct sched_entity *se)

#else /* CONFIG_FAIR_GROUP_SCHED */

static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}

static inline int propagate_entity_load_avg(struct sched_entity *se)
{
@@ -3800,13 +3804,13 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
         * IOW we're enqueueing a task on a new CPU.
         */
        attach_entity_load_avg(cfs_rq, se);
        update_tg_load_avg(cfs_rq, 0);
        update_tg_load_avg(cfs_rq);

    } else if (decayed) {
        cfs_rq_util_change(cfs_rq, 0);

        if (flags & UPDATE_TG)
            update_tg_load_avg(cfs_rq, 0);
            update_tg_load_avg(cfs_rq);
    }
}

@@ -4461,17 +4465,17 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
            se = second;
    }

    /*
     * Prefer last buddy, try to return the CPU to a preempted task.
     */
    if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
        se = cfs_rq->last;

    /*
     * Someone really wants this to run. If it's not unfair, run it.
     */
    if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
    if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
        /*
         * Someone really wants this to run. If it's not unfair, run it.
         */
        se = cfs_rq->next;
    } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) {
        /*
         * Prefer last buddy, try to return the CPU to a preempted task.
         */
        se = cfs_rq->last;
    }

    clear_buddies(cfs_rq, se);

@@ -6075,7 +6079,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
/*
 * Scan the local SMT mask for idle CPUs.
 */
static int select_idle_smt(struct task_struct *p, int target)
static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
{
    int cpu;

@@ -6083,7 +6087,8 @@ static int select_idle_smt(struct task_struct *p, int target)
        return -1;

    for_each_cpu(cpu, cpu_smt_mask(target)) {
        if (!cpumask_test_cpu(cpu, p->cpus_ptr))
        if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
            !cpumask_test_cpu(cpu, sched_domain_span(sd)))
            continue;
        if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
            return cpu;
@@ -6099,7 +6104,7 @@ static inline int select_idle_core(struct task_struct *p, struct sched_domain *s
    return -1;
}

static inline int select_idle_smt(struct task_struct *p, int target)
static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
{
    return -1;
}
@@ -6274,7 +6279,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
    if ((unsigned)i < nr_cpumask_bits)
        return i;

    i = select_idle_smt(p, target);
    i = select_idle_smt(p, sd, target);
    if ((unsigned)i < nr_cpumask_bits)
        return i;

@@ -6594,7 +6599,8 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)

            util = cpu_util_next(cpu, p, cpu);
            cpu_cap = capacity_of(cpu);
            spare_cap = cpu_cap - util;
            spare_cap = cpu_cap;
            lsub_positive(&spare_cap, util);

            /*
             * Skip CPUs that cannot satisfy the capacity request.
@@ -7402,6 +7408,10 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
    if (unlikely(task_has_idle_policy(p)))
        return 0;

    /* SMT siblings share cache */
    if (env->sd->flags & SD_SHARE_CPUCAPACITY)
        return 0;

    /*
     * Buddy candidates are cache hot:
     */
@@ -7669,8 +7679,8 @@ static int detach_tasks(struct lb_env *env)
             * scheduler fails to find a good waiting task to
             * migrate.
             */
            if (load/2 > env->imbalance &&
                env->sd->nr_balance_failed <= env->sd->cache_nice_tries)

            if ((load >> env->sd->nr_balance_failed) > env->imbalance)
                goto next;

            env->imbalance -= load;
@@ -7887,7 +7897,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done)
        struct sched_entity *se;

        if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
            update_tg_load_avg(cfs_rq, 0);
            update_tg_load_avg(cfs_rq);

            if (cfs_rq == &rq->cfs)
                decayed = true;
@@ -8098,6 +8108,8 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
        capacity = 1;

    cpu_rq(cpu)->cpu_capacity = capacity;
    trace_sched_cpu_capacity_tp(cpu_rq(cpu));

    sdg->sgc->capacity = capacity;
    sdg->sgc->min_capacity = capacity;
    sdg->sgc->max_capacity = capacity;
@@ -8957,7 +8969,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
    }
}

static inline long adjust_numa_imbalance(int imbalance, int src_nr_running)
static inline long adjust_numa_imbalance(int imbalance, int nr_running)
{
    unsigned int imbalance_min;

@@ -8966,7 +8978,7 @@ static inline long adjust_numa_imbalance(int imbalance, int src_nr_running)
     * tasks that remain local when the source domain is almost idle.
     */
    imbalance_min = 2;
    if (src_nr_running <= imbalance_min)
    if (nr_running <= imbalance_min)
        return 0;

    return imbalance;
@@ -9780,6 +9792,15 @@ get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)

    /* scale ms to jiffies */
    interval = msecs_to_jiffies(interval);

    /*
     * Reduce likelihood of busy balancing at higher domains racing with
     * balancing at lower domains by preventing their balancing periods
     * from being multiples of each other.
     */
    if (cpu_busy)
        interval -= 1;

    interval = clamp(interval, 1UL, max_load_balance_interval);

    return interval;
@@ -10786,7 +10807,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se)
    /* Catch up with the cfs_rq and remove our load when we leave */
    update_load_avg(cfs_rq, se, 0);
    detach_entity_load_avg(cfs_rq, se);
    update_tg_load_avg(cfs_rq, false);
    update_tg_load_avg(cfs_rq);
    propagate_entity_cfs_rq(se);
}

@@ -10805,7 +10826,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
    /* Synchronize entity with its cfs_rq */
    update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
    attach_entity_load_avg(cfs_rq, se);
    update_tg_load_avg(cfs_rq, false);
    update_tg_load_avg(cfs_rq);
    propagate_entity_cfs_rq(se);
}

@@ -11302,6 +11323,18 @@ int sched_trace_rq_cpu(struct rq *rq)
}
EXPORT_SYMBOL_GPL(sched_trace_rq_cpu);

int sched_trace_rq_cpu_capacity(struct rq *rq)
{
    return rq ?
#ifdef CONFIG_SMP
        rq->cpu_capacity
#else
        SCHED_CAPACITY_SCALE
#endif
        : -1;
}
EXPORT_SYMBOL_GPL(sched_trace_rq_cpu_capacity);

const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
{
#ifdef CONFIG_SMP
@@ -77,7 +77,7 @@ SCHED_FEAT(WARN_DOUBLE_CLOCK, false)
SCHED_FEAT(RT_PUSH_IPI, true)
#endif

SCHED_FEAT(RT_RUNTIME_SHARE, true)
SCHED_FEAT(RT_RUNTIME_SHARE, false)
SCHED_FEAT(LB_MIN, false)
SCHED_FEAT(ATTACH_AGE_LOAD, true)
@@ -18,6 +18,14 @@
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0
#endif

#ifdef CONFIG_RSEQ
#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK \
    (MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ \
    | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ_BITMASK)
#else
#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK 0
#endif

#define MEMBARRIER_CMD_BITMASK \
    (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \
    | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \
@@ -30,6 +38,11 @@ static void ipi_mb(void *info)
    smp_mb(); /* IPIs should be serializing but paranoid. */
}

static void ipi_rseq(void *info)
{
    rseq_preempt(current);
}

static void ipi_sync_rq_state(void *info)
{
    struct mm_struct *mm = (struct mm_struct *) info;
@@ -129,19 +142,27 @@ static int membarrier_global_expedited(void)
    return 0;
}

static int membarrier_private_expedited(int flags)
static int membarrier_private_expedited(int flags, int cpu_id)
{
    int cpu;
    cpumask_var_t tmpmask;
    struct mm_struct *mm = current->mm;
    smp_call_func_t ipi_func = ipi_mb;

    if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
    if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
        if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
            return -EINVAL;
        if (!(atomic_read(&mm->membarrier_state) &
              MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
            return -EPERM;
    } else if (flags == MEMBARRIER_FLAG_RSEQ) {
        if (!IS_ENABLED(CONFIG_RSEQ))
            return -EINVAL;
        if (!(atomic_read(&mm->membarrier_state) &
              MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY))
            return -EPERM;
        ipi_func = ipi_rseq;
    } else {
        WARN_ON_ONCE(flags);
        if (!(atomic_read(&mm->membarrier_state) &
              MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
            return -EPERM;
@@ -156,35 +177,59 @@ static int membarrier_private_expedited(int flags)
     */
    smp_mb(); /* system call entry is not a mb. */

    if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
    if (cpu_id < 0 && !zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
        return -ENOMEM;

    cpus_read_lock();
    rcu_read_lock();
    for_each_online_cpu(cpu) {

    if (cpu_id >= 0) {
        struct task_struct *p;

        /*
         * Skipping the current CPU is OK even through we can be
         * migrated at any point. The current CPU, at the point
         * where we read raw_smp_processor_id(), is ensured to
         * be in program order with respect to the caller
         * thread. Therefore, we can skip this CPU from the
         * iteration.
         */
        if (cpu == raw_smp_processor_id())
            continue;
        p = rcu_dereference(cpu_rq(cpu)->curr);
        if (p && p->mm == mm)
            __cpumask_set_cpu(cpu, tmpmask);
        if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id))
            goto out;
        if (cpu_id == raw_smp_processor_id())
            goto out;
        rcu_read_lock();
        p = rcu_dereference(cpu_rq(cpu_id)->curr);
        if (!p || p->mm != mm) {
            rcu_read_unlock();
            goto out;
        }
        rcu_read_unlock();
    } else {
        int cpu;

        rcu_read_lock();
        for_each_online_cpu(cpu) {
            struct task_struct *p;

            /*
             * Skipping the current CPU is OK even through we can be
             * migrated at any point. The current CPU, at the point
             * where we read raw_smp_processor_id(), is ensured to
             * be in program order with respect to the caller
             * thread. Therefore, we can skip this CPU from the
             * iteration.
             */
            if (cpu == raw_smp_processor_id())
                continue;
            p = rcu_dereference(cpu_rq(cpu)->curr);
            if (p && p->mm == mm)
                __cpumask_set_cpu(cpu, tmpmask);
        }
        rcu_read_unlock();
    }
    rcu_read_unlock();

    preempt_disable();
    smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
    if (cpu_id >= 0)
        smp_call_function_single(cpu_id, ipi_func, NULL, 1);
    else
        smp_call_function_many(tmpmask, ipi_func, NULL, 1);
    preempt_enable();

    free_cpumask_var(tmpmask);
out:
    if (cpu_id < 0)
        free_cpumask_var(tmpmask);
    cpus_read_unlock();

    /*
@@ -283,11 +328,18 @@ static int membarrier_register_private_expedited(int flags)
        set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED,
        ret;

    if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
    if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
        if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
            return -EINVAL;
        ready_state =
            MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
    } else if (flags == MEMBARRIER_FLAG_RSEQ) {
        if (!IS_ENABLED(CONFIG_RSEQ))
            return -EINVAL;
        ready_state =
            MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY;
    } else {
        WARN_ON_ONCE(flags);
    }

    /*
@@ -299,6 +351,8 @@ static int membarrier_register_private_expedited(int flags)
        return 0;
    if (flags & MEMBARRIER_FLAG_SYNC_CORE)
        set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
    if (flags & MEMBARRIER_FLAG_RSEQ)
        set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ;
    atomic_or(set_state, &mm->membarrier_state);
    ret = sync_runqueues_membarrier_state(mm);
    if (ret)
@@ -310,8 +364,15 @@ static int membarrier_register_private_expedited(int flags)

/**
 * sys_membarrier - issue memory barriers on a set of threads
 * @cmd: Takes command values defined in enum membarrier_cmd.
 * @flags: Currently needs to be 0. For future extensions.
 * @cmd: Takes command values defined in enum membarrier_cmd.
 * @flags: Currently needs to be 0 for all commands other than
 *         MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: in the latter
 *         case it can be MEMBARRIER_CMD_FLAG_CPU, indicating that @cpu_id
 *         contains the CPU on which to interrupt (= restart)
 *         the RSEQ critical section.
 * @cpu_id: if @flags == MEMBARRIER_CMD_FLAG_CPU, indicates the cpu on which
 *         RSEQ CS should be interrupted (@cmd must be
 *         MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ).
 *
 * If this system call is not implemented, -ENOSYS is returned. If the
 * command specified does not exist, not available on the running
@@ -337,10 +398,21 @@ static int membarrier_register_private_expedited(int flags)
 *        smp_mb()           X      O      O
 *        sys_membarrier()   O      O      O
 */
SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
SYSCALL_DEFINE3(membarrier, int, cmd, unsigned int, flags, int, cpu_id)
{
    if (unlikely(flags))
        return -EINVAL;
    switch (cmd) {
    case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
        if (unlikely(flags && flags != MEMBARRIER_CMD_FLAG_CPU))
            return -EINVAL;
        break;
    default:
        if (unlikely(flags))
            return -EINVAL;
    }

    if (!(flags & MEMBARRIER_CMD_FLAG_CPU))
        cpu_id = -1;

    switch (cmd) {
    case MEMBARRIER_CMD_QUERY:
    {
@@ -362,13 +434,17 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
    case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
        return membarrier_register_global_expedited();
    case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
        return membarrier_private_expedited(0);
        return membarrier_private_expedited(0, cpu_id);
    case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
        return membarrier_register_private_expedited(0);
    case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
        return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
        return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE, cpu_id);
    case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
        return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
    case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
        return membarrier_private_expedited(MEMBARRIER_FLAG_RSEQ, cpu_id);
    case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ:
        return membarrier_register_private_expedited(MEMBARRIER_FLAG_RSEQ);
    default:
        return -EINVAL;
    }
@@ -25,10 +25,18 @@ static inline bool sched_debug(void)
    return sched_debug_enabled;
}

#define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name },
const struct sd_flag_debug sd_flag_debug[] = {
#include <linux/sched/sd_flags.h>
};
#undef SD_FLAG

static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                                  struct cpumask *groupmask)
{
    struct sched_group *group = sd->groups;
    unsigned long flags = sd->flags;
    unsigned int idx;

    cpumask_clear(groupmask);

@@ -43,6 +51,21 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
        printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
    }

    for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
        unsigned int flag = BIT(idx);
        unsigned int meta_flags = sd_flag_debug[idx].meta_flags;

        if ((meta_flags & SDF_SHARED_CHILD) && sd->child &&
            !(sd->child->flags & flag))
            printk(KERN_ERR "ERROR: flag %s set here but not in child\n",
                   sd_flag_debug[idx].name);

        if ((meta_flags & SDF_SHARED_PARENT) && sd->parent &&
            !(sd->parent->flags & flag))
            printk(KERN_ERR "ERROR: flag %s set here but not in parent\n",
                   sd_flag_debug[idx].name);
    }

    printk(KERN_DEBUG "%*s groups:", level + 1, "");
    do {
        if (!group) {
@@ -137,22 +160,22 @@ static inline bool sched_debug(void)
}
#endif /* CONFIG_SCHED_DEBUG */

/* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */
#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) |
static const unsigned int SD_DEGENERATE_GROUPS_MASK =
#include <linux/sched/sd_flags.h>
0;
#undef SD_FLAG

static int sd_degenerate(struct sched_domain *sd)
{
    if (cpumask_weight(sched_domain_span(sd)) == 1)
        return 1;

    /* Following flags need at least 2 groups */
    if (sd->flags & (SD_BALANCE_NEWIDLE |
                     SD_BALANCE_FORK |
                     SD_BALANCE_EXEC |
                     SD_SHARE_CPUCAPACITY |
                     SD_ASYM_CPUCAPACITY |
                     SD_SHARE_PKG_RESOURCES |
                     SD_SHARE_POWERDOMAIN)) {
        if (sd->groups != sd->groups->next)
            return 0;
    }
    if ((sd->flags & SD_DEGENERATE_GROUPS_MASK) &&
        (sd->groups != sd->groups->next))
        return 0;

    /* Following flags don't use groups */
    if (sd->flags & (SD_WAKE_AFFINE))
@@ -173,18 +196,9 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
        return 0;

    /* Flags needing groups don't count if only 1 group in parent */
    if (parent->groups == parent->groups->next) {
        pflags &= ~(SD_BALANCE_NEWIDLE |
                    SD_BALANCE_FORK |
                    SD_BALANCE_EXEC |
                    SD_ASYM_CPUCAPACITY |
                    SD_SHARE_CPUCAPACITY |
                    SD_SHARE_PKG_RESOURCES |
                    SD_PREFER_SIBLING |
                    SD_SHARE_POWERDOMAIN);
        if (nr_node_ids == 1)
            pflags &= ~SD_SERIALIZE;
    }
    if (parent->groups == parent->groups->next)
        pflags &= ~SD_DEGENERATE_GROUPS_MASK;

    if (~cflags & pflags)
        return 0;

@@ -1292,7 +1306,6 @@ int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
 * SD_SHARE_CPUCAPACITY - describes SMT topologies
 * SD_SHARE_PKG_RESOURCES - describes shared caches
 * SD_NUMA - describes NUMA topologies
 * SD_SHARE_POWERDOMAIN - describes shared power domain
 *
 * Odd one out, which beside describing the topology has a quirk also
 * prescribes the desired behaviour that goes along with it:
@@ -1303,8 +1316,7 @@ int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
    (SD_SHARE_CPUCAPACITY | \
     SD_SHARE_PKG_RESOURCES | \
     SD_NUMA | \
     SD_ASYM_PACKING | \
     SD_SHARE_POWERDOMAIN)
     SD_ASYM_PACKING)

static struct sched_domain *
sd_init(struct sched_domain_topology_level *tl,
@@ -1336,8 +1348,8 @@ sd_init(struct sched_domain_topology_level *tl,
    *sd = (struct sched_domain){
        .min_interval = sd_weight,
        .max_interval = 2*sd_weight,
        .busy_factor = 32,
        .imbalance_pct = 125,
        .busy_factor = 16,
        .imbalance_pct = 117,

        .cache_nice_tries = 0,

@@ -1989,11 +2001,10 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
    /* Set up domains for CPUs specified by the cpu_map: */
    for_each_cpu(i, cpu_map) {
        struct sched_domain_topology_level *tl;
        int dflags = 0;

        sd = NULL;
        for_each_sd_topology(tl) {
            int dflags = 0;

            if (tl == tl_asym) {
                dflags |= SD_ASYM_CPUCAPACITY;
                has_asym = true;
@@ -1,8 +1,10 @@
// SPDX-License-Identifier: LGPL-2.1
#define _GNU_SOURCE
#include <assert.h>
#include <linux/membarrier.h>
#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
@@ -1131,6 +1133,220 @@ static int set_signal_handler(void)
    return ret;
}

struct test_membarrier_thread_args {
    int stop;
    intptr_t percpu_list_ptr;
};

/* Worker threads modify data in their "active" percpu lists. */
void *test_membarrier_worker_thread(void *arg)
{
    struct test_membarrier_thread_args *args =
        (struct test_membarrier_thread_args *)arg;
    const int iters = opt_reps;
    int i;

    if (rseq_register_current_thread()) {
        fprintf(stderr, "Error: rseq_register_current_thread(...) failed(%d): %s\n",
            errno, strerror(errno));
        abort();
    }

    /* Wait for initialization. */
    while (!atomic_load(&args->percpu_list_ptr)) {}

    for (i = 0; i < iters; ++i) {
        int ret;

        do {
            int cpu = rseq_cpu_start();

            ret = rseq_offset_deref_addv(&args->percpu_list_ptr,
                sizeof(struct percpu_list_entry) * cpu, 1, cpu);
        } while (rseq_unlikely(ret));
    }

    if (rseq_unregister_current_thread()) {
        fprintf(stderr, "Error: rseq_unregister_current_thread(...) failed(%d): %s\n",
            errno, strerror(errno));
        abort();
    }
    return NULL;
}

void test_membarrier_init_percpu_list(struct percpu_list *list)
{
    int i;

    memset(list, 0, sizeof(*list));
    for (i = 0; i < CPU_SETSIZE; i++) {
        struct percpu_list_node *node;

        node = malloc(sizeof(*node));
        assert(node);
        node->data = 0;
        node->next = NULL;
        list->c[i].head = node;
    }
}

void test_membarrier_free_percpu_list(struct percpu_list *list)
{
    int i;

    for (i = 0; i < CPU_SETSIZE; i++)
        free(list->c[i].head);
}

static int sys_membarrier(int cmd, int flags, int cpu_id)
{
    return syscall(__NR_membarrier, cmd, flags, cpu_id);
}

/*
 * The manager thread swaps per-cpu lists that worker threads see,
 * and validates that there are no unexpected modifications.
 */
void *test_membarrier_manager_thread(void *arg)
{
    struct test_membarrier_thread_args *args =
        (struct test_membarrier_thread_args *)arg;
    struct percpu_list list_a, list_b;
    intptr_t expect_a = 0, expect_b = 0;
    int cpu_a = 0, cpu_b = 0;

    if (rseq_register_current_thread()) {
        fprintf(stderr, "Error: rseq_register_current_thread(...) failed(%d): %s\n",
            errno, strerror(errno));
        abort();
    }

    /* Init lists. */
    test_membarrier_init_percpu_list(&list_a);
    test_membarrier_init_percpu_list(&list_b);

    atomic_store(&args->percpu_list_ptr, (intptr_t)&list_a);

    while (!atomic_load(&args->stop)) {
        /* list_a is "active". */
        cpu_a = rand() % CPU_SETSIZE;
        /*
         * As list_b is "inactive", we should never see changes
         * to list_b.
         */
        if (expect_b != atomic_load(&list_b.c[cpu_b].head->data)) {
            fprintf(stderr, "Membarrier test failed\n");
            abort();
        }

        /* Make list_b "active". */
        atomic_store(&args->percpu_list_ptr, (intptr_t)&list_b);
        if (sys_membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ,
                MEMBARRIER_CMD_FLAG_CPU, cpu_a) &&
                errno != ENXIO /* missing CPU */) {
            perror("sys_membarrier");
            abort();
        }
        /*
         * Cpu A should now only modify list_b, so the values
         * in list_a should be stable.
         */
        expect_a = atomic_load(&list_a.c[cpu_a].head->data);

        cpu_b = rand() % CPU_SETSIZE;
        /*
         * As list_a is "inactive", we should never see changes
         * to list_a.
         */
        if (expect_a != atomic_load(&list_a.c[cpu_a].head->data)) {
            fprintf(stderr, "Membarrier test failed\n");
            abort();
        }

        /* Make list_a "active". */
        atomic_store(&args->percpu_list_ptr, (intptr_t)&list_a);
        if (sys_membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ,
                MEMBARRIER_CMD_FLAG_CPU, cpu_b) &&
                errno != ENXIO /* missing CPU*/) {
            perror("sys_membarrier");
            abort();
        }
        /* Remember a value from list_b. */
        expect_b = atomic_load(&list_b.c[cpu_b].head->data);
    }

    test_membarrier_free_percpu_list(&list_a);
    test_membarrier_free_percpu_list(&list_b);

    if (rseq_unregister_current_thread()) {
        fprintf(stderr, "Error: rseq_unregister_current_thread(...) failed(%d): %s\n",
            errno, strerror(errno));
        abort();
    }
    return NULL;
}

/* Test MEMBARRIER_CMD_PRIVATE_RESTART_RSEQ_ON_CPU membarrier command. */
#ifdef RSEQ_ARCH_HAS_OFFSET_DEREF_ADDV
void test_membarrier(void)
{
    const int num_threads = opt_threads;
    struct test_membarrier_thread_args thread_args;
    pthread_t worker_threads[num_threads];
    pthread_t manager_thread;
    int i, ret;

    if (sys_membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, 0, 0)) {
        perror("sys_membarrier");
        abort();
    }

    thread_args.stop = 0;
    thread_args.percpu_list_ptr = 0;
    ret = pthread_create(&manager_thread, NULL,
        test_membarrier_manager_thread, &thread_args);
    if (ret) {
        errno = ret;
        perror("pthread_create");
        abort();
    }

    for (i = 0; i < num_threads; i++) {
        ret = pthread_create(&worker_threads[i], NULL,
            test_membarrier_worker_thread, &thread_args);
        if (ret) {
            errno = ret;
            perror("pthread_create");
            abort();
        }
    }


    for (i = 0; i < num_threads; i++) {
        ret = pthread_join(worker_threads[i], NULL);
        if (ret) {
            errno = ret;
            perror("pthread_join");
            abort();
        }
    }

    atomic_store(&thread_args.stop, 1);
    ret = pthread_join(manager_thread, NULL);
    if (ret) {
        errno = ret;
        perror("pthread_join");
        abort();
    }
}
#else /* RSEQ_ARCH_HAS_OFFSET_DEREF_ADDV */
void test_membarrier(void)
{
    fprintf(stderr, "rseq_offset_deref_addv is not implemented on this architecture. "
            "Skipping membarrier test.\n");
}
#endif

static void show_usage(int argc, char **argv)
{
    printf("Usage : %s <OPTIONS>\n",
@@ -1153,7 +1369,7 @@ static void show_usage(int argc, char **argv)
    printf("	[-r N] Number of repetitions per thread (default 5000)\n");
    printf("	[-d] Disable rseq system call (no initialization)\n");
    printf("	[-D M] Disable rseq for each M threads\n");
    printf("	[-T test] Choose test: (s)pinlock, (l)ist, (b)uffer, (m)emcpy, (i)ncrement\n");
    printf("	[-T test] Choose test: (s)pinlock, (l)ist, (b)uffer, (m)emcpy, (i)ncrement, membarrie(r)\n");
    printf("	[-M] Push into buffer and memcpy buffer with memory barriers.\n");
    printf("	[-v] Verbose output.\n");
    printf("	[-h] Show this help.\n");
@@ -1268,6 +1484,7 @@ int main(int argc, char **argv)
        case 'i':
        case 'b':
        case 'm':
        case 'r':
            break;
        default:
            show_usage(argc, argv);
@@ -1320,6 +1537,10 @@ int main(int argc, char **argv)
        printf_verbose("counter increment\n");
        test_percpu_inc();
        break;
    case 'r':
        printf_verbose("membarrier\n");
        test_membarrier();
        break;
    }
    if (!opt_disable_rseq && rseq_unregister_current_thread())
        abort();
@@ -279,6 +279,63 @@ int rseq_addv(intptr_t *v, intptr_t count, int cpu)
#endif
}

#define RSEQ_ARCH_HAS_OFFSET_DEREF_ADDV

/*
 *   pval = *(ptr+off)
 *  *pval += inc;
 */
static inline __attribute__((always_inline))
int rseq_offset_deref_addv(intptr_t *ptr, off_t off, intptr_t inc, int cpu)
{
    RSEQ_INJECT_C(9)

    __asm__ __volatile__ goto (
        RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
#ifdef RSEQ_COMPARE_TWICE
        RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
#endif
        /* Start rseq by storing table entry pointer into rseq_cs. */
        RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_CS_OFFSET(%[rseq_abi]))
        RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_CPU_ID_OFFSET(%[rseq_abi]), 4f)
        RSEQ_INJECT_ASM(3)
#ifdef RSEQ_COMPARE_TWICE
        RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_CPU_ID_OFFSET(%[rseq_abi]), %l[error1])
#endif
        /* get p+v */
        "movq %[ptr], %%rbx\n\t"
        "addq %[off], %%rbx\n\t"
        /* get pv */
        "movq (%%rbx), %%rcx\n\t"
        /* *pv += inc */
        "addq %[inc], (%%rcx)\n\t"
        "2:\n\t"
        RSEQ_INJECT_ASM(4)
        RSEQ_ASM_DEFINE_ABORT(4, "", abort)
        : /* gcc asm goto does not allow outputs */
        : [cpu_id] "r" (cpu),
          [rseq_abi] "r" (&__rseq_abi),
          /* final store input */
          [ptr] "m" (*ptr),
          [off] "er" (off),
          [inc] "er" (inc)
        : "memory", "cc", "rax", "rbx", "rcx"
          RSEQ_INJECT_CLOBBER
        : abort
#ifdef RSEQ_COMPARE_TWICE
        , error1
#endif
    );
    return 0;
abort:
    RSEQ_INJECT_FAILED
    return -1;
#ifdef RSEQ_COMPARE_TWICE
error1:
    rseq_bug("cpu_id comparison failed");
#endif
}

static inline __attribute__((always_inline))
int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
                                 intptr_t *v2, intptr_t newv2,
@@ -15,6 +15,7 @@ TEST_LIST=(
    "-T m"
    "-T m -M"
    "-T i"
    "-T r"
)

TEST_NAME=(
@@ -25,6 +26,7 @@ TEST_NAME=(
    "memcpy"
    "memcpy with barrier"
    "increment"
    "membarrier"
)
IFS="$OLDIFS"