linux_dsm_epyc7002/kernel/sched/idle_task.c
Peter Zijlstra 10e2f1acd0 sched/core: Rewrite and improve select_idle_siblings()
select_idle_siblings() is a known pain point for a number of
workloads; it either does too much or not enough and sometimes just
does plain wrong.

This rewrite attempts to address a number of issues (but sadly not
all).

The current code does an unconditional sched_domain iteration; with
the intent of finding an idle core (on SMT hardware). The problems
which this patch tries to address are:

 - its pointless to look for idle cores if the machine is real busy;
   at which point you're just wasting cycles.

 - it's behaviour is inconsistent between SMT and !SMT hardware in
   that !SMT hardware ends up doing a scan for any idle CPU in the LLC
   domain, while SMT hardware does a scan for idle cores and if that
   fails, falls back to a scan for idle threads on the 'target' core.

The new code replaces the sched_domain scan with 3 explicit scans:

 1) search for an idle core in the LLC
 2) search for an idle CPU in the LLC
 3) search for an idle thread in the 'target' core

where 1 and 3 are conditional on SMT support and 1 and 2 have runtime
heuristics to skip the step.

Step 1) is conditional on sd_llc_shared->has_idle_cores; when a cpu
goes idle and sd_llc_shared->has_idle_cores is false, we scan all SMT
siblings of the CPU going idle. Similarly, we clear
sd_llc_shared->has_idle_cores when we fail to find an idle core.

Step 2) tracks the average cost of the scan and compares this to the
average idle time guestimate for the CPU doing the wakeup. There is a
significant fudge factor involved to deal with the variability of the
averages. Esp. hackbench was sensitive to this.

Step 3) is unconditional; we assume (also per step 1) that scanning
all SMT siblings in a core is 'cheap'.

With this; SMT systems gain step 2, which cures a few benchmarks --
notably one from Facebook.

One 'feature' of the sched_domain iteration, which we preserve in the
new code, is that it would start scanning from the 'target' CPU,
instead of scanning the cpumask in cpu id order. This avoids multiple
CPUs in the LLC scanning for idle to gang up and find the same CPU
quite as much. The down side is that tasks can end up hopping across
the LLC for no apparent reason.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2016-09-30 11:03:09 +02:00

110 lines
2.3 KiB
C

#include "sched.h"
/*
* idle-task scheduling class.
*
* (NOTE: these are not related to SCHED_IDLE tasks which are
* handled in sched/fair.c)
*/
#ifdef CONFIG_SMP
static int
select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
{
return task_cpu(p); /* IDLE tasks as never migrated */
}
#endif /* CONFIG_SMP */
/*
* Idle tasks are unconditionally rescheduled:
*/
static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
{
resched_curr(rq);
}
static struct task_struct *
pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
{
put_prev_task(rq, prev);
update_idle_core(rq);
schedstat_inc(rq->sched_goidle);
return rq->idle;
}
/*
* It is not legal to sleep in the idle task - print a warning
* message if some code attempts to do it:
*/
static void
dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
{
raw_spin_unlock_irq(&rq->lock);
printk(KERN_ERR "bad: scheduling from the idle thread!\n");
dump_stack();
raw_spin_lock_irq(&rq->lock);
}
static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
{
rq_last_tick_reset(rq);
}
static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
{
}
static void set_curr_task_idle(struct rq *rq)
{
}
static void switched_to_idle(struct rq *rq, struct task_struct *p)
{
BUG();
}
static void
prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
{
BUG();
}
static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
{
return 0;
}
static void update_curr_idle(struct rq *rq)
{
}
/*
* Simple, special scheduling class for the per-CPU idle tasks:
*/
const struct sched_class idle_sched_class = {
/* .next is NULL */
/* no enqueue/yield_task for idle tasks */
/* dequeue is not valid, we print a debug message there: */
.dequeue_task = dequeue_task_idle,
.check_preempt_curr = check_preempt_curr_idle,
.pick_next_task = pick_next_task_idle,
.put_prev_task = put_prev_task_idle,
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_idle,
.set_cpus_allowed = set_cpus_allowed_common,
#endif
.set_curr_task = set_curr_task_idle,
.task_tick = task_tick_idle,
.get_rr_interval = get_rr_interval_idle,
.prio_changed = prio_changed_idle,
.switched_to = switched_to_idle,
.update_curr = update_curr_idle,
};