sched/fair: Rework find_idlest_group()
The slow wake up path computes per sched_group statistics to select the
idlest group, which is quite similar to what load_balance() is doing for
selecting the busiest group. Rework find_idlest_group() to classify the
sched_group and select the idlest one following the same steps as
load_balance().

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Ben Segall <bsegall@google.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Morten.Rasmussen@arm.com
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: hdanton@sina.com
Cc: parth@linux.ibm.com
Cc: pauld@redhat.com
Cc: quentin.perret@arm.com
Cc: riel@surriel.com
Cc: srikar@linux.vnet.ibm.com
Cc: valentin.schneider@arm.com
Link: https://lkml.kernel.org/r/1571405198-27570-12-git-send-email-vincent.guittot@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
parent fc1273f4ce
commit 57abff067a
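Before the diff itself, here is a small, self-contained C sketch of the selection pattern the rework introduces: classify every group, compare candidates by group type first, then break ties with a type-specific metric, much as the new update_pick_idlest() does. This is illustrative only, not kernel code; the group names, the numbers and the simplified tie-break rule are invented for the example.

/*
 * Illustrative only: a stripped-down "classify then pick" loop in the
 * spirit of update_pick_idlest(); it is not the kernel implementation.
 */
#include <stdio.h>

/* Ordered from least to most loaded, mirroring the scheduler's group_type idea. */
enum group_type {
	group_has_spare = 0,
	group_fully_busy,
	group_overloaded,
};

struct group_stats {
	const char *name;
	enum group_type type;
	unsigned long avg_load;	/* used when the group is busy */
	unsigned int idle_cpus;	/* used when the group has spare capacity */
};

/* Return 1 when the candidate should replace the current idlest pick. */
static int pick_idlest(const struct group_stats *idlest,
		       const struct group_stats *cand)
{
	if (cand->type < idlest->type)
		return 1;
	if (cand->type > idlest->type)
		return 0;

	/* Same classification: fall back to a type-specific tie-break. */
	switch (cand->type) {
	case group_has_spare:
		return cand->idle_cpus > idlest->idle_cpus;
	case group_fully_busy:
	case group_overloaded:
		return cand->avg_load < idlest->avg_load;
	}
	return 0;
}

int main(void)
{
	struct group_stats groups[] = {
		{ "node0", group_overloaded, 900, 0 },
		{ "node1", group_fully_busy, 600, 0 },
		{ "node2", group_has_spare,  200, 3 },
		{ "node3", group_has_spare,  250, 5 },
	};
	/* Start from the worst possible pick, as the rework does. */
	struct group_stats idlest = { "none", group_overloaded, ~0UL, 0 };

	for (unsigned int i = 0; i < sizeof(groups) / sizeof(groups[0]); i++)
		if (pick_idlest(&idlest, &groups[i]))
			idlest = groups[i];

	printf("idlest group: %s\n", idlest.name);
	return 0;
}

Built as an ordinary userspace program, the loop settles on node3: both spare-capacity groups beat the busy ones on type alone, and node3 wins the tie on idle CPU count.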
@@ -5531,127 +5531,9 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 	return target;
 }
 
-static unsigned long cpu_util_without(int cpu, struct task_struct *p);
-
-static unsigned long capacity_spare_without(int cpu, struct task_struct *p)
-{
-	return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0);
-}
-
-/*
- * find_idlest_group finds and returns the least busy CPU group within the
- * domain.
- *
- * Assumes p is allowed on at least one CPU in sd.
- */
 static struct sched_group *
 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
-		  int this_cpu, int sd_flag)
-{
-	struct sched_group *idlest = NULL, *group = sd->groups;
-	struct sched_group *most_spare_sg = NULL;
-	unsigned long min_load = ULONG_MAX, this_load = ULONG_MAX;
-	unsigned long most_spare = 0, this_spare = 0;
-	int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
-	unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
-				(sd->imbalance_pct-100) / 100;
-
-	do {
-		unsigned long load;
-		unsigned long spare_cap, max_spare_cap;
-		int local_group;
-		int i;
-
-		/* Skip over this group if it has no CPUs allowed */
-		if (!cpumask_intersects(sched_group_span(group),
-					p->cpus_ptr))
-			continue;
-
-		local_group = cpumask_test_cpu(this_cpu,
-					       sched_group_span(group));
-
-		/*
-		 * Tally up the load of all CPUs in the group and find
-		 * the group containing the CPU with most spare capacity.
-		 */
-		load = 0;
-		max_spare_cap = 0;
-
-		for_each_cpu(i, sched_group_span(group)) {
-			load += cpu_load(cpu_rq(i));
-
-			spare_cap = capacity_spare_without(i, p);
-
-			if (spare_cap > max_spare_cap)
-				max_spare_cap = spare_cap;
-		}
-
-		/* Adjust by relative CPU capacity of the group */
-		load = (load * SCHED_CAPACITY_SCALE) /
-			group->sgc->capacity;
-
-		if (local_group) {
-			this_load = load;
-			this_spare = max_spare_cap;
-		} else {
-			if (load < min_load) {
-				min_load = load;
-				idlest = group;
-			}
-
-			if (most_spare < max_spare_cap) {
-				most_spare = max_spare_cap;
-				most_spare_sg = group;
-			}
-		}
-	} while (group = group->next, group != sd->groups);
-
-	/*
-	 * The cross-over point between using spare capacity or least load
-	 * is too conservative for high utilization tasks on partially
-	 * utilized systems if we require spare_capacity > task_util(p),
-	 * so we allow for some task stuffing by using
-	 * spare_capacity > task_util(p)/2.
-	 *
-	 * Spare capacity can't be used for fork because the utilization has
-	 * not been set yet, we must first select a rq to compute the initial
-	 * utilization.
-	 */
-	if (sd_flag & SD_BALANCE_FORK)
-		goto skip_spare;
-
-	if (this_spare > task_util(p) / 2 &&
-	    imbalance_scale*this_spare > 100*most_spare)
-		return NULL;
-
-	if (most_spare > task_util(p) / 2)
-		return most_spare_sg;
-
-skip_spare:
-	if (!idlest)
-		return NULL;
-
-	/*
-	 * When comparing groups across NUMA domains, it's possible for the
-	 * local domain to be very lightly loaded relative to the remote
-	 * domains but "imbalance" skews the comparison making remote CPUs
-	 * look much more favourable. When considering cross-domain, add
-	 * imbalance to the load on the remote node and consider staying
-	 * local.
-	 */
-	if ((sd->flags & SD_NUMA) &&
-	    min_load + imbalance >= this_load)
-		return NULL;
-
-	if (min_load >= this_load + imbalance)
-		return NULL;
-
-	if ((this_load < (min_load + imbalance)) &&
-	    (100*this_load < imbalance_scale*min_load))
-		return NULL;
-
-	return idlest;
-}
+		  int this_cpu, int sd_flag);
 
 /*
  * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
@@ -5724,7 +5606,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
 		return prev_cpu;
 
 	/*
-	 * We need task's util for capacity_spare_without, sync it up to
+	 * We need task's util for cpu_util_without, sync it up to
 	 * prev_cpu's last_update_time.
 	 */
 	if (!(sd_flag & SD_BALANCE_FORK))
@@ -7905,13 +7787,13 @@ static inline int sg_imbalanced(struct sched_group *group)
  * any benefit for the load balance.
  */
 static inline bool
-group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
+group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
 {
 	if (sgs->sum_nr_running < sgs->group_weight)
 		return true;
 
 	if ((sgs->group_capacity * 100) >
-			(sgs->group_util * env->sd->imbalance_pct))
+			(sgs->group_util * imbalance_pct))
 		return true;
 
 	return false;
@@ -7926,13 +7808,13 @@ group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
  * false.
  */
 static inline bool
-group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
+group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
 {
 	if (sgs->sum_nr_running <= sgs->group_weight)
 		return false;
 
 	if ((sgs->group_capacity * 100) <
-			(sgs->group_util * env->sd->imbalance_pct))
+			(sgs->group_util * imbalance_pct))
 		return true;
 
 	return false;
@@ -7959,11 +7841,11 @@ group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
 }
 
 static inline enum
-group_type group_classify(struct lb_env *env,
+group_type group_classify(unsigned int imbalance_pct,
			  struct sched_group *group,
			  struct sg_lb_stats *sgs)
 {
-	if (group_is_overloaded(env, sgs))
+	if (group_is_overloaded(imbalance_pct, sgs))
 		return group_overloaded;
 
 	if (sg_imbalanced(group))
@@ -7975,7 +7857,7 @@ group_type group_classify(struct lb_env *env,
 	if (sgs->group_misfit_task_load)
 		return group_misfit_task;
 
-	if (!group_has_capacity(env, sgs))
+	if (!group_has_capacity(imbalance_pct, sgs))
 		return group_fully_busy;
 
 	return group_has_spare;
@@ -8076,7 +7958,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 
 	sgs->group_weight = group->group_weight;
 
-	sgs->group_type = group_classify(env, group, sgs);
+	sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
 
 	/* Computing avg_load makes sense only when group is overloaded */
 	if (sgs->group_type == group_overloaded)
@@ -8231,6 +8113,252 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
+
+struct sg_lb_stats;
+
+/*
+ * update_sg_wakeup_stats - Update sched_group's statistics for wakeup.
+ * @denv: The ched_domain level to look for idlest group.
+ * @group: sched_group whose statistics are to be updated.
+ * @sgs: variable to hold the statistics for this group.
+ */
+static inline void update_sg_wakeup_stats(struct sched_domain *sd,
+					  struct sched_group *group,
+					  struct sg_lb_stats *sgs,
+					  struct task_struct *p)
+{
+	int i, nr_running;
+
+	memset(sgs, 0, sizeof(*sgs));
+
+	for_each_cpu(i, sched_group_span(group)) {
+		struct rq *rq = cpu_rq(i);
+
+		sgs->group_load += cpu_load(rq);
+		sgs->group_util += cpu_util_without(i, p);
+		sgs->sum_h_nr_running += rq->cfs.h_nr_running;
+
+		nr_running = rq->nr_running;
+		sgs->sum_nr_running += nr_running;
+
+		/*
+		 * No need to call idle_cpu() if nr_running is not 0
+		 */
+		if (!nr_running && idle_cpu(i))
+			sgs->idle_cpus++;
+
+
+	}
+
+	/* Check if task fits in the group */
+	if (sd->flags & SD_ASYM_CPUCAPACITY &&
+	    !task_fits_capacity(p, group->sgc->max_capacity)) {
+		sgs->group_misfit_task_load = 1;
+	}
+
+	sgs->group_capacity = group->sgc->capacity;
+
+	sgs->group_type = group_classify(sd->imbalance_pct, group, sgs);
+
+	/*
+	 * Computing avg_load makes sense only when group is fully busy or
+	 * overloaded
+	 */
+	if (sgs->group_type < group_fully_busy)
+		sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
+				sgs->group_capacity;
+}
+
+static bool update_pick_idlest(struct sched_group *idlest,
+			       struct sg_lb_stats *idlest_sgs,
+			       struct sched_group *group,
+			       struct sg_lb_stats *sgs)
+{
+	if (sgs->group_type < idlest_sgs->group_type)
+		return true;
+
+	if (sgs->group_type > idlest_sgs->group_type)
+		return false;
+
+	/*
+	 * The candidate and the current idlest group are the same type of
+	 * group. Let check which one is the idlest according to the type.
+	 */
+
+	switch (sgs->group_type) {
+	case group_overloaded:
+	case group_fully_busy:
+		/* Select the group with lowest avg_load. */
+		if (idlest_sgs->avg_load <= sgs->avg_load)
+			return false;
+		break;
+
+	case group_imbalanced:
+	case group_asym_packing:
+		/* Those types are not used in the slow wakeup path */
+		return false;
+
+	case group_misfit_task:
+		/* Select group with the highest max capacity */
+		if (idlest->sgc->max_capacity >= group->sgc->max_capacity)
+			return false;
+		break;
+
+	case group_has_spare:
+		/* Select group with most idle CPUs */
+		if (idlest_sgs->idle_cpus >= sgs->idle_cpus)
+			return false;
+		break;
+	}
+
+	return true;
+}
+
+/*
+ * find_idlest_group() finds and returns the least busy CPU group within the
+ * domain.
+ *
+ * Assumes p is allowed on at least one CPU in sd.
+ */
+static struct sched_group *
+find_idlest_group(struct sched_domain *sd, struct task_struct *p,
+		  int this_cpu, int sd_flag)
+{
+	struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups;
+	struct sg_lb_stats local_sgs, tmp_sgs;
+	struct sg_lb_stats *sgs;
+	unsigned long imbalance;
+	struct sg_lb_stats idlest_sgs = {
+			.avg_load = UINT_MAX,
+			.group_type = group_overloaded,
+	};
+
+	imbalance = scale_load_down(NICE_0_LOAD) *
+				(sd->imbalance_pct-100) / 100;
+
+	do {
+		int local_group;
+
+		/* Skip over this group if it has no CPUs allowed */
+		if (!cpumask_intersects(sched_group_span(group),
+					p->cpus_ptr))
+			continue;
+
+		local_group = cpumask_test_cpu(this_cpu,
+					       sched_group_span(group));
+
+		if (local_group) {
+			sgs = &local_sgs;
+			local = group;
+		} else {
+			sgs = &tmp_sgs;
+		}
+
+		update_sg_wakeup_stats(sd, group, sgs, p);
+
+		if (!local_group && update_pick_idlest(idlest, &idlest_sgs, group, sgs)) {
+			idlest = group;
+			idlest_sgs = *sgs;
+		}
+
+	} while (group = group->next, group != sd->groups);
+
+
+	/* There is no idlest group to push tasks to */
+	if (!idlest)
+		return NULL;
+
+	/*
+	 * If the local group is idler than the selected idlest group
+	 * don't try and push the task.
+	 */
+	if (local_sgs.group_type < idlest_sgs.group_type)
+		return NULL;
+
+	/*
+	 * If the local group is busier than the selected idlest group
+	 * try and push the task.
+	 */
+	if (local_sgs.group_type > idlest_sgs.group_type)
+		return idlest;
+
+	switch (local_sgs.group_type) {
+	case group_overloaded:
+	case group_fully_busy:
+		/*
+		 * When comparing groups across NUMA domains, it's possible for
+		 * the local domain to be very lightly loaded relative to the
+		 * remote domains but "imbalance" skews the comparison making
+		 * remote CPUs look much more favourable. When considering
+		 * cross-domain, add imbalance to the load on the remote node
+		 * and consider staying local.
+		 */
+
+		if ((sd->flags & SD_NUMA) &&
+		    ((idlest_sgs.avg_load + imbalance) >= local_sgs.avg_load))
+			return NULL;
+
+		/*
+		 * If the local group is less loaded than the selected
+		 * idlest group don't try and push any tasks.
+		 */
+		if (idlest_sgs.avg_load >= (local_sgs.avg_load + imbalance))
+			return NULL;
+
+		if (100 * local_sgs.avg_load <= sd->imbalance_pct * idlest_sgs.avg_load)
+			return NULL;
+		break;
+
+	case group_imbalanced:
+	case group_asym_packing:
+		/* Those type are not used in the slow wakeup path */
+		return NULL;
+
+	case group_misfit_task:
+		/* Select group with the highest max capacity */
+		if (local->sgc->max_capacity >= idlest->sgc->max_capacity)
+			return NULL;
+		break;
+
+	case group_has_spare:
+		if (sd->flags & SD_NUMA) {
+#ifdef CONFIG_NUMA_BALANCING
+			int idlest_cpu;
+			/*
+			 * If there is spare capacity at NUMA, try to select
+			 * the preferred node
+			 */
+			if (cpu_to_node(this_cpu) == p->numa_preferred_nid)
+				return NULL;
+
+			idlest_cpu = cpumask_first(sched_group_span(idlest));
+			if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
+				return idlest;
+#endif
+			/*
+			 * Otherwise, keep the task on this node to stay close
+			 * its wakeup source and improve locality. If there is
+			 * a real need of migration, periodic load balance will
+			 * take care of it.
+			 */
+			if (local_sgs.idle_cpus)
+				return NULL;
+		}
+
+		/*
+		 * Select group with highest number of idle CPUs. We could also
+		 * compare the utilization which is more stable but it can end
+		 * up that the group has less spare capacity but finally more
+		 * idle CPUs which means more opportunity to run task.
+		 */
+		if (local_sgs.idle_cpus >= idlest_sgs.idle_cpus)
+			return NULL;
+		break;
+	}
+
+	return idlest;
+}
+
 /**
  * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
  * @env: The load balancing environment.
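One worked number for the margin used in the avg_load comparisons above (an editorial illustration, not part of the commit): scale_load_down(NICE_0_LOAD) is 1024, so with an imbalance_pct of 125, for example, the computed imbalance is 1024 * (125 - 100) / 100 = 256. Across NUMA domains that margin is added to the remote group's average load before the comparison, so a remote node has to be more than 256 load units idler than the local one before the task is pushed away.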