mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2025-03-01 21:06:53 +07:00
sched: mix tasks and groups
This patch allows tasks and groups to exist in the same cfs_rq. With this change, CFS group scheduling moves from a 1/(1+N) fairness model to a 1/(M+N) fairness model, where M tasks and N groups exist at the cfs_rq level. [a.p.zijlstra@chello.nl: rt bits and assorted fixes] Signed-off-by: Dhaval Giani <dhaval@linux.vnet.ibm.com> Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Ingo Molnar <mingo@elte.hu>
This commit is contained in:
parent
ea736ed5d3
commit
354d60c2ff
@ -273,6 +273,7 @@ struct task_group {
|
|||||||
struct list_head list;
|
struct list_head list;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#ifdef CONFIG_USER_SCHED
|
||||||
#ifdef CONFIG_FAIR_GROUP_SCHED
|
#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||||
/* Default task group's sched entity on each cpu */
|
/* Default task group's sched entity on each cpu */
|
||||||
static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
|
static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
|
||||||
@ -284,6 +285,7 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
|
|||||||
static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
|
static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
|
||||||
static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
|
static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
/* task_group_lock serializes add/remove of task groups and also changes to
|
/* task_group_lock serializes add/remove of task groups and also changes to
|
||||||
* a task group's cpu shares.
|
* a task group's cpu shares.
|
||||||
@ -7447,6 +7449,10 @@ static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg,
|
|||||||
list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
|
list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
|
||||||
|
|
||||||
tg->se[cpu] = se;
|
tg->se[cpu] = se;
|
||||||
|
/* se could be NULL for init_task_group */
|
||||||
|
if (!se)
|
||||||
|
return;
|
||||||
|
|
||||||
se->cfs_rq = &rq->cfs;
|
se->cfs_rq = &rq->cfs;
|
||||||
se->my_q = cfs_rq;
|
se->my_q = cfs_rq;
|
||||||
se->load.weight = tg->shares;
|
se->load.weight = tg->shares;
|
||||||
@ -7469,6 +7475,9 @@ static void init_tg_rt_entry(struct rq *rq, struct task_group *tg,
|
|||||||
list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
|
list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
|
||||||
|
|
||||||
tg->rt_se[cpu] = rt_se;
|
tg->rt_se[cpu] = rt_se;
|
||||||
|
if (!rt_se)
|
||||||
|
return;
|
||||||
|
|
||||||
rt_se->rt_rq = &rq->rt;
|
rt_se->rt_rq = &rq->rt;
|
||||||
rt_se->my_q = rt_rq;
|
rt_se->my_q = rt_rq;
|
||||||
rt_se->parent = NULL;
|
rt_se->parent = NULL;
|
||||||
@ -7539,18 +7548,56 @@ void __init sched_init(void)
|
|||||||
#ifdef CONFIG_FAIR_GROUP_SCHED
|
#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||||
init_task_group.shares = init_task_group_load;
|
init_task_group.shares = init_task_group_load;
|
||||||
INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
|
INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
|
||||||
|
#ifdef CONFIG_CGROUP_SCHED
|
||||||
|
/*
|
||||||
|
* How much cpu bandwidth does init_task_group get?
|
||||||
|
*
|
||||||
|
* In case of task-groups formed through the cgroup filesystem, it
|
||||||
|
* gets 100% of the cpu resources in the system. This overall
|
||||||
|
* system cpu resource is divided among the tasks of
|
||||||
|
* init_task_group and its child task-groups in a fair manner,
|
||||||
|
* based on each entity's (task or task-group's) weight
|
||||||
|
* (se->load.weight).
|
||||||
|
*
|
||||||
|
* In other words, if init_task_group has 10 tasks (of weight
|
||||||
|
* 1024) and two child groups A0 and A1 (of weight 1024 each),
|
||||||
|
* then A0's share of the cpu resource is:
|
||||||
|
*
|
||||||
|
* A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
|
||||||
|
*
|
||||||
|
* We achieve this by letting init_task_group's tasks sit
|
||||||
|
* directly in rq->cfs (i.e init_task_group->se[] = NULL).
|
||||||
|
*/
|
||||||
|
init_tg_cfs_entry(rq, &init_task_group, &rq->cfs, NULL, i, 1);
|
||||||
|
#elif defined CONFIG_USER_SCHED
|
||||||
|
/*
|
||||||
|
* In case of task-groups formed through the user id of tasks,
|
||||||
|
* init_task_group represents tasks belonging to root user.
|
||||||
|
* Hence it forms a sibling of all subsequent groups formed.
|
||||||
|
* In this case, init_task_group gets only a fraction of overall
|
||||||
|
* system cpu resource, based on the weight assigned to root
|
||||||
|
* user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
|
||||||
|
* by letting tasks of init_task_group sit in a separate cfs_rq
|
||||||
|
* (init_cfs_rq) and having one entity represent this group of
|
||||||
|
* tasks in rq->cfs (i.e init_task_group->se[] != NULL).
|
||||||
|
*/
|
||||||
init_tg_cfs_entry(rq, &init_task_group,
|
init_tg_cfs_entry(rq, &init_task_group,
|
||||||
&per_cpu(init_cfs_rq, i),
|
&per_cpu(init_cfs_rq, i),
|
||||||
&per_cpu(init_sched_entity, i), i, 1);
|
&per_cpu(init_sched_entity, i), i, 1);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
#endif /* CONFIG_FAIR_GROUP_SCHED */
|
||||||
|
|
||||||
|
rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
|
||||||
#ifdef CONFIG_RT_GROUP_SCHED
|
#ifdef CONFIG_RT_GROUP_SCHED
|
||||||
INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
|
INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
|
||||||
|
#ifdef CONFIG_CGROUP_SCHED
|
||||||
|
init_tg_rt_entry(rq, &init_task_group, &rq->rt, NULL, i, 1);
|
||||||
|
#elif defined CONFIG_USER_SCHED
|
||||||
init_tg_rt_entry(rq, &init_task_group,
|
init_tg_rt_entry(rq, &init_task_group,
|
||||||
&per_cpu(init_rt_rq, i),
|
&per_cpu(init_rt_rq, i),
|
||||||
&per_cpu(init_sched_rt_entity, i), i, 1);
|
&per_cpu(init_sched_rt_entity, i), i, 1);
|
||||||
#else
|
#endif
|
||||||
rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
|
for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
|
||||||
|
@ -1133,6 +1133,17 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* return depth at which a sched entity is present in the hierarchy */
|
||||||
|
static inline int depth_se(struct sched_entity *se)
|
||||||
|
{
|
||||||
|
int depth = 0;
|
||||||
|
|
||||||
|
for_each_sched_entity(se)
|
||||||
|
depth++;
|
||||||
|
|
||||||
|
return depth;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Preempt the current task with a newly woken task if needed:
|
* Preempt the current task with a newly woken task if needed:
|
||||||
*/
|
*/
|
||||||
@ -1141,6 +1152,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
|
|||||||
struct task_struct *curr = rq->curr;
|
struct task_struct *curr = rq->curr;
|
||||||
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
|
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
|
||||||
struct sched_entity *se = &curr->se, *pse = &p->se;
|
struct sched_entity *se = &curr->se, *pse = &p->se;
|
||||||
|
int se_depth, pse_depth;
|
||||||
|
|
||||||
if (unlikely(rt_prio(p->prio))) {
|
if (unlikely(rt_prio(p->prio))) {
|
||||||
update_rq_clock(rq);
|
update_rq_clock(rq);
|
||||||
@ -1165,6 +1177,27 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
|
|||||||
if (!sched_feat(WAKEUP_PREEMPT))
|
if (!sched_feat(WAKEUP_PREEMPT))
|
||||||
return;
|
return;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* preemption test can be made between sibling entities who are in the
|
||||||
|
* same cfs_rq, i.e. who have a common parent. Walk up the hierarchy of
|
||||||
|
* both tasks until we find their ancestors who are siblings of a common
|
||||||
|
* parent.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* First walk up until both entities are at same depth */
|
||||||
|
se_depth = depth_se(se);
|
||||||
|
pse_depth = depth_se(pse);
|
||||||
|
|
||||||
|
while (se_depth > pse_depth) {
|
||||||
|
se_depth--;
|
||||||
|
se = parent_entity(se);
|
||||||
|
}
|
||||||
|
|
||||||
|
while (pse_depth > se_depth) {
|
||||||
|
pse_depth--;
|
||||||
|
pse = parent_entity(pse);
|
||||||
|
}
|
||||||
|
|
||||||
while (!is_same_group(se, pse)) {
|
while (!is_same_group(se, pse)) {
|
||||||
se = parent_entity(se);
|
se = parent_entity(se);
|
||||||
pse = parent_entity(pse);
|
pse = parent_entity(pse);
|
||||||
@ -1223,13 +1256,22 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
|
|||||||
static struct task_struct *
|
static struct task_struct *
|
||||||
__load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr)
|
__load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr)
|
||||||
{
|
{
|
||||||
struct task_struct *p;
|
struct task_struct *p = NULL;
|
||||||
|
struct sched_entity *se;
|
||||||
|
|
||||||
if (!curr)
|
if (!curr)
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
p = rb_entry(curr, struct task_struct, se.run_node);
|
/* Skip over entities that are not tasks */
|
||||||
cfs_rq->rb_load_balance_curr = rb_next(curr);
|
do {
|
||||||
|
se = rb_entry(curr, struct sched_entity, run_node);
|
||||||
|
curr = rb_next(curr);
|
||||||
|
} while (curr && !entity_is_task(se));
|
||||||
|
|
||||||
|
cfs_rq->rb_load_balance_curr = curr;
|
||||||
|
|
||||||
|
if (entity_is_task(se))
|
||||||
|
p = task_of(se);
|
||||||
|
|
||||||
return p;
|
return p;
|
||||||
}
|
}
|
||||||
@ -1489,9 +1531,6 @@ static void print_cfs_stats(struct seq_file *m, int cpu)
|
|||||||
{
|
{
|
||||||
struct cfs_rq *cfs_rq;
|
struct cfs_rq *cfs_rq;
|
||||||
|
|
||||||
#ifdef CONFIG_FAIR_GROUP_SCHED
|
|
||||||
print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs);
|
|
||||||
#endif
|
|
||||||
rcu_read_lock();
|
rcu_read_lock();
|
||||||
for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
|
for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
|
||||||
print_cfs_rq(m, cpu, cfs_rq);
|
print_cfs_rq(m, cpu, cfs_rq);
|
||||||
|
@ -374,11 +374,15 @@ static void update_curr_rt(struct rq *rq)
|
|||||||
curr->se.exec_start = rq->clock;
|
curr->se.exec_start = rq->clock;
|
||||||
cpuacct_charge(curr, delta_exec);
|
cpuacct_charge(curr, delta_exec);
|
||||||
|
|
||||||
spin_lock(&rt_rq->rt_runtime_lock);
|
for_each_sched_rt_entity(rt_se) {
|
||||||
rt_rq->rt_time += delta_exec;
|
rt_rq = rt_rq_of_se(rt_se);
|
||||||
if (sched_rt_runtime_exceeded(rt_rq))
|
|
||||||
resched_task(curr);
|
spin_lock(&rt_rq->rt_runtime_lock);
|
||||||
spin_unlock(&rt_rq->rt_runtime_lock);
|
rt_rq->rt_time += delta_exec;
|
||||||
|
if (sched_rt_runtime_exceeded(rt_rq))
|
||||||
|
resched_task(curr);
|
||||||
|
spin_unlock(&rt_rq->rt_runtime_lock);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline
|
static inline
|
||||||
@ -477,7 +481,6 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
|
|||||||
* entries, we must remove entries top - down.
|
* entries, we must remove entries top - down.
|
||||||
*
|
*
|
||||||
* XXX: O(1/2 h^2) because we can only walk up, not down the chain.
|
* XXX: O(1/2 h^2) because we can only walk up, not down the chain.
|
||||||
* doesn't matter much for now, as h=2 for GROUP_SCHED.
|
|
||||||
*/
|
*/
|
||||||
static void dequeue_rt_stack(struct task_struct *p)
|
static void dequeue_rt_stack(struct task_struct *p)
|
||||||
{
|
{
|
||||||
|
Loading…
Reference in New Issue
Block a user