mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2024-11-25 00:40:56 +07:00
804d402fb6
Capacity Awareness refers to the fact that on heterogeneous systems (like Arm big.LITTLE), the capacity of the CPUs is not uniform, hence when placing tasks we need to be aware of this difference of CPU capacities. In such scenarios we want to ensure that the selected CPU has enough capacity to meet the requirement of the running task. Enough capacity means here that: capacity_orig_of(cpu) >= task.requirement. The definition of task.requirement is dependent on the scheduling class. For CFS, utilization is used to select a CPU that has a capacity value >= the cfs_task.util: capacity_orig_of(cpu) >= cfs_task.util. DL isn't capacity aware at the moment but can make use of the bandwidth reservation to implement that in a similar manner to how CFS uses utilization. The following patchset implements that: https://lore.kernel.org/lkml/20190506044836.2914-1-luca.abeni@santannapisa.it/ : capacity_orig_of(cpu)/SCHED_CAPACITY >= dl_deadline/dl_runtime. For RT we don't have a per task utilization signal and we lack any information in general about what performance requirement the RT task needs. But with the introduction of uclamp, RT tasks can now control that by setting uclamp_min to guarantee a minimum performance point. ATM the uclamp values are only used for frequency selection; but on heterogeneous systems this is not enough and we need to ensure that the capacity of the CPU is >= uclamp_min, which is what is implemented here: capacity_orig_of(cpu) >= rt_task.uclamp_min. Note that by default uclamp.min is 1024, which means that RT tasks will always be biased towards the big CPUs, which makes for a better, more predictable behavior for the default case. It must be stressed that the bias acts as a hint rather than a definite placement strategy. For example, if all big cores are busy executing other RT tasks we can't guarantee that a new RT task will be placed there. On non-heterogeneous systems the original behavior of RT should be retained. Similarly if uclamp is not selected in the config. 
[ mingo: Minor edits to comments. ] Signed-off-by: Qais Yousef <qais.yousef@arm.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com> Reviewed-by: Steven Rostedt (VMware) <rostedt@goodmis.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Link: https://lkml.kernel.org/r/20191009104611.15363-1-qais.yousef@arm.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
259 lines
7.0 KiB
C
259 lines
7.0 KiB
C
// SPDX-License-Identifier: GPL-2.0-only
|
|
/*
 *  kernel/sched/cpupri.c
 *
 *  CPU priority management
 *
 *  Copyright (C) 2007-2008 Novell
 *
 *  Author: Gregory Haskins <ghaskins@novell.com>
 *
 *  This code tracks the priority of each CPU so that global migration
 *  decisions are easy to calculate.  Each CPU can be in a state as follows:
 *
 *                 (INVALID), IDLE, NORMAL, RT1, ... RT99
 *
 *  going from the lowest priority to the highest.  CPUs in the INVALID state
 *  are not eligible for routing.  The system maintains this state with
 *  a 2-dimensional bitmap (the first for priority class, the second for CPUs
 *  in that class).  Therefore a typical application without affinity
 *  restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
 *  searches).  For tasks with affinity restrictions, the algorithm has a
 *  worst case complexity of O(min(102, nr_domcpus)), though the scenario that
 *  yields the worst case search is fairly contrived.
 */
|
|
#include "sched.h"
|
|
|
|
/* Convert between a 140 based task->prio, and our 102 based cpupri */
|
|
static int convert_prio(int prio)
|
|
{
|
|
int cpupri;
|
|
|
|
if (prio == CPUPRI_INVALID)
|
|
cpupri = CPUPRI_INVALID;
|
|
else if (prio == MAX_PRIO)
|
|
cpupri = CPUPRI_IDLE;
|
|
else if (prio >= MAX_RT_PRIO)
|
|
cpupri = CPUPRI_NORMAL;
|
|
else
|
|
cpupri = MAX_RT_PRIO - prio + 1;
|
|
|
|
return cpupri;
|
|
}
|
|
|
|
/**
|
|
* cpupri_find - find the best (lowest-pri) CPU in the system
|
|
* @cp: The cpupri context
|
|
* @p: The task
|
|
* @lowest_mask: A mask to fill in with selected CPUs (or NULL)
|
|
* @fitness_fn: A pointer to a function to do custom checks whether the CPU
|
|
* fits a specific criteria so that we only return those CPUs.
|
|
*
|
|
* Note: This function returns the recommended CPUs as calculated during the
|
|
* current invocation. By the time the call returns, the CPUs may have in
|
|
* fact changed priorities any number of times. While not ideal, it is not
|
|
* an issue of correctness since the normal rebalancer logic will correct
|
|
* any discrepancies created by racing against the uncertainty of the current
|
|
* priority configuration.
|
|
*
|
|
* Return: (int)bool - CPUs were found
|
|
*/
|
|
int cpupri_find(struct cpupri *cp, struct task_struct *p,
|
|
struct cpumask *lowest_mask,
|
|
bool (*fitness_fn)(struct task_struct *p, int cpu))
|
|
{
|
|
int idx = 0;
|
|
int task_pri = convert_prio(p->prio);
|
|
|
|
BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES);
|
|
|
|
for (idx = 0; idx < task_pri; idx++) {
|
|
struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
|
|
int skip = 0;
|
|
|
|
if (!atomic_read(&(vec)->count))
|
|
skip = 1;
|
|
/*
|
|
* When looking at the vector, we need to read the counter,
|
|
* do a memory barrier, then read the mask.
|
|
*
|
|
* Note: This is still all racey, but we can deal with it.
|
|
* Ideally, we only want to look at masks that are set.
|
|
*
|
|
* If a mask is not set, then the only thing wrong is that we
|
|
* did a little more work than necessary.
|
|
*
|
|
* If we read a zero count but the mask is set, because of the
|
|
* memory barriers, that can only happen when the highest prio
|
|
* task for a run queue has left the run queue, in which case,
|
|
* it will be followed by a pull. If the task we are processing
|
|
* fails to find a proper place to go, that pull request will
|
|
* pull this task if the run queue is running at a lower
|
|
* priority.
|
|
*/
|
|
smp_rmb();
|
|
|
|
/* Need to do the rmb for every iteration */
|
|
if (skip)
|
|
continue;
|
|
|
|
if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids)
|
|
continue;
|
|
|
|
if (lowest_mask) {
|
|
int cpu;
|
|
|
|
cpumask_and(lowest_mask, p->cpus_ptr, vec->mask);
|
|
|
|
/*
|
|
* We have to ensure that we have at least one bit
|
|
* still set in the array, since the map could have
|
|
* been concurrently emptied between the first and
|
|
* second reads of vec->mask. If we hit this
|
|
* condition, simply act as though we never hit this
|
|
* priority level and continue on.
|
|
*/
|
|
if (cpumask_empty(lowest_mask))
|
|
continue;
|
|
|
|
if (!fitness_fn)
|
|
return 1;
|
|
|
|
/* Ensure the capacity of the CPUs fit the task */
|
|
for_each_cpu(cpu, lowest_mask) {
|
|
if (!fitness_fn(p, cpu))
|
|
cpumask_clear_cpu(cpu, lowest_mask);
|
|
}
|
|
|
|
/*
|
|
* If no CPU at the current priority can fit the task
|
|
* continue looking
|
|
*/
|
|
if (cpumask_empty(lowest_mask))
|
|
continue;
|
|
}
|
|
|
|
return 1;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/**
|
|
* cpupri_set - update the CPU priority setting
|
|
* @cp: The cpupri context
|
|
* @cpu: The target CPU
|
|
* @newpri: The priority (INVALID-RT99) to assign to this CPU
|
|
*
|
|
* Note: Assumes cpu_rq(cpu)->lock is locked
|
|
*
|
|
* Returns: (void)
|
|
*/
|
|
void cpupri_set(struct cpupri *cp, int cpu, int newpri)
|
|
{
|
|
int *currpri = &cp->cpu_to_pri[cpu];
|
|
int oldpri = *currpri;
|
|
int do_mb = 0;
|
|
|
|
newpri = convert_prio(newpri);
|
|
|
|
BUG_ON(newpri >= CPUPRI_NR_PRIORITIES);
|
|
|
|
if (newpri == oldpri)
|
|
return;
|
|
|
|
/*
|
|
* If the CPU was currently mapped to a different value, we
|
|
* need to map it to the new value then remove the old value.
|
|
* Note, we must add the new value first, otherwise we risk the
|
|
* cpu being missed by the priority loop in cpupri_find.
|
|
*/
|
|
if (likely(newpri != CPUPRI_INVALID)) {
|
|
struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
|
|
|
|
cpumask_set_cpu(cpu, vec->mask);
|
|
/*
|
|
* When adding a new vector, we update the mask first,
|
|
* do a write memory barrier, and then update the count, to
|
|
* make sure the vector is visible when count is set.
|
|
*/
|
|
smp_mb__before_atomic();
|
|
atomic_inc(&(vec)->count);
|
|
do_mb = 1;
|
|
}
|
|
if (likely(oldpri != CPUPRI_INVALID)) {
|
|
struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
|
|
|
|
/*
|
|
* Because the order of modification of the vec->count
|
|
* is important, we must make sure that the update
|
|
* of the new prio is seen before we decrement the
|
|
* old prio. This makes sure that the loop sees
|
|
* one or the other when we raise the priority of
|
|
* the run queue. We don't care about when we lower the
|
|
* priority, as that will trigger an rt pull anyway.
|
|
*
|
|
* We only need to do a memory barrier if we updated
|
|
* the new priority vec.
|
|
*/
|
|
if (do_mb)
|
|
smp_mb__after_atomic();
|
|
|
|
/*
|
|
* When removing from the vector, we decrement the counter first
|
|
* do a memory barrier and then clear the mask.
|
|
*/
|
|
atomic_dec(&(vec)->count);
|
|
smp_mb__after_atomic();
|
|
cpumask_clear_cpu(cpu, vec->mask);
|
|
}
|
|
|
|
*currpri = newpri;
|
|
}
|
|
|
|
/**
|
|
* cpupri_init - initialize the cpupri structure
|
|
* @cp: The cpupri context
|
|
*
|
|
* Return: -ENOMEM on memory allocation failure.
|
|
*/
|
|
int cpupri_init(struct cpupri *cp)
|
|
{
|
|
int i;
|
|
|
|
for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
|
|
struct cpupri_vec *vec = &cp->pri_to_cpu[i];
|
|
|
|
atomic_set(&vec->count, 0);
|
|
if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
|
|
goto cleanup;
|
|
}
|
|
|
|
cp->cpu_to_pri = kcalloc(nr_cpu_ids, sizeof(int), GFP_KERNEL);
|
|
if (!cp->cpu_to_pri)
|
|
goto cleanup;
|
|
|
|
for_each_possible_cpu(i)
|
|
cp->cpu_to_pri[i] = CPUPRI_INVALID;
|
|
|
|
return 0;
|
|
|
|
cleanup:
|
|
for (i--; i >= 0; i--)
|
|
free_cpumask_var(cp->pri_to_cpu[i].mask);
|
|
return -ENOMEM;
|
|
}
|
|
|
|
/**
|
|
* cpupri_cleanup - clean up the cpupri structure
|
|
* @cp: The cpupri context
|
|
*/
|
|
void cpupri_cleanup(struct cpupri *cp)
|
|
{
|
|
int i;
|
|
|
|
kfree(cp->cpu_to_pri);
|
|
for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
|
|
free_cpumask_var(cp->pri_to_cpu[i].mask);
|
|
}
|