/*
 * include/linux/topology.h
 *
 * Written by: Matthew Dobson, IBM Corporation
 *
 * Copyright (C) 2002, IBM Corp.
 *
 * All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT.  See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * Send feedback to <colpatch@us.ibm.com>
 */
#ifndef _LINUX_TOPOLOGY_H
#define _LINUX_TOPOLOGY_H

#include <linux/cpumask.h>
#include <linux/bitops.h>
#include <linux/mmzone.h>
#include <linux/smp.h>
#include <asm/topology.h>

#ifndef node_has_online_mem
#define node_has_online_mem(nid) (1)
#endif

#ifndef nr_cpus_node
#define nr_cpus_node(node) cpumask_weight(cpumask_of_node(node))
#endif

#define for_each_node_with_cpus(node)			\
	for_each_online_node(node)			\
		if (nr_cpus_node(node))

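/*
 * Example usage (hypothetical sketch, not part of this header): walk the
 * online nodes that actually have CPUs attached and report their CPU count:
 *
 *	int node;
 *
 *	for_each_node_with_cpus(node)
 *		printk(KERN_INFO "node %d: %d cpus\n",
 *		       node, nr_cpus_node(node));
 */
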
int arch_update_cpu_topology(void);

/* Conform to ACPI 2.0 SLIT distance definitions */
#define LOCAL_DISTANCE		10
#define REMOTE_DISTANCE		20
#ifndef node_distance
#define node_distance(from,to)	((from) == (to) ? LOCAL_DISTANCE : REMOTE_DISTANCE)
#endif
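/*
 * Illustration (assumes the default node_distance() above, not an
 * arch-specific override): node_distance(0, 0) evaluates to LOCAL_DISTANCE
 * (10) and node_distance(0, 1) to REMOTE_DISTANCE (20), matching the SLIT
 * convention that a node is distance 10 from itself.
 */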
#ifndef RECLAIM_DISTANCE
/*
 * If the distance between nodes in a system is larger than RECLAIM_DISTANCE
 * (in whatever arch-specific measurement units are returned by
 * node_distance()) then switch on zone reclaim on boot.
 */
#define RECLAIM_DISTANCE 20
#endif
#ifndef PENALTY_FOR_NODE_WITH_CPUS
#define PENALTY_FOR_NODE_WITH_CPUS	(1)
#endif

/*
 * Below are the 3 major initializers used in building sched_domains:
 * SD_SIBLING_INIT, for SMT domains
 * SD_CPU_INIT, for SMP domains
 * SD_NODE_INIT, for NUMA domains
 *
 * Any architecture that cares to do any tuning to these values should do so
 * by defining their own arch-specific initializer in include/asm/topology.h.
 * A definition there will automagically override these default initializers
 * and allow arch-specific performance tuning of sched_domains.
 * (Only non-zero and non-null fields need be specified.)
 */

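/*
 * For instance (hypothetical sketch, not a definition this header supplies),
 * an architecture could provide its own SD_NODE_INIT from asm/topology.h and
 * thereby override the generic defaults:
 *
 *	#define SD_NODE_INIT (struct sched_domain) {		\
 *		.min_interval		= 8,			\
 *		.max_interval		= 32,			\
 *		.busy_factor		= 32,			\
 *		.imbalance_pct		= 125,			\
 *		.cache_nice_tries	= 2,			\
 *		.flags			= SD_LOAD_BALANCE	\
 *					| SD_BALANCE_EXEC	\
 *					| SD_SERIALIZE,		\
 *		.last_balance		= jiffies,		\
 *		.balance_interval	= 64,			\
 *	}
 */
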
#ifdef CONFIG_SCHED_SMT
/* MCD - Do we really need this?  It is always on if CONFIG_SCHED_SMT is,
 * so can't we drop this in favor of CONFIG_SCHED_SMT?
 */
#define ARCH_HAS_SCHED_WAKE_IDLE
/* Common values for SMT siblings */
#ifndef SD_SIBLING_INIT
#define SD_SIBLING_INIT (struct sched_domain) {			\
	.min_interval		= 1,					\
	.max_interval		= 2,					\
	.busy_factor		= 64,					\
	.imbalance_pct		= 110,					\
									\
	.flags			= 1*SD_LOAD_BALANCE			\
				| 1*SD_BALANCE_NEWIDLE			\
				| 1*SD_BALANCE_EXEC			\
				| 1*SD_BALANCE_FORK			\
				| 1*SD_BALANCE_WAKE			\
				| 1*SD_WAKE_AFFINE			\
				| 1*SD_SHARE_CPUPOWER			\
				| 0*SD_POWERSAVINGS_BALANCE		\
				| 0*SD_SHARE_PKG_RESOURCES		\
				| 0*SD_SERIALIZE			\
				| 0*SD_PREFER_SIBLING			\
				,					\
	.last_balance		= jiffies,				\
	.balance_interval	= 1,					\
	.smt_gain		= 1178,	/* 15% */			\
}
#endif
#endif /* CONFIG_SCHED_SMT */

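/*
 * Note on the initializer style above: the 1*SD_FOO / 0*SD_FOO notation
 * keeps every scheduler-domain flag visible in the initializer; a flag
 * multiplied by 0 contributes nothing to .flags, so toggling it on or off
 * is a one-character change rather than adding or deleting a line.
 */
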
#ifdef CONFIG_SCHED_MC
/* Common values for MC siblings; for now mostly derived from SD_CPU_INIT */
#ifndef SD_MC_INIT
#define SD_MC_INIT (struct sched_domain) {				\
	.min_interval		= 1,					\
	.max_interval		= 4,					\
	.busy_factor		= 64,					\
	.imbalance_pct		= 125,					\
	.cache_nice_tries	= 1,					\
	.busy_idx		= 2,					\
	.wake_idx		= 0,					\
	.forkexec_idx		= 0,					\
									\
	.flags			= 1*SD_LOAD_BALANCE			\
				| 1*SD_BALANCE_NEWIDLE			\
				| 1*SD_BALANCE_EXEC			\
				| 1*SD_BALANCE_FORK			\
				| 1*SD_BALANCE_WAKE			\
				| 1*SD_WAKE_AFFINE			\
				| 1*SD_PREFER_LOCAL			\
				| 0*SD_SHARE_CPUPOWER			\
				| 1*SD_SHARE_PKG_RESOURCES		\
				| 0*SD_SERIALIZE			\
				| sd_balance_for_mc_power()		\
				| sd_power_saving_flags()		\
				,					\
	.last_balance		= jiffies,				\
	.balance_interval	= 1,					\
}
#endif
#endif /* CONFIG_SCHED_MC */

/* Common values for CPUs */
#ifndef SD_CPU_INIT
#define SD_CPU_INIT (struct sched_domain) {				\
	.min_interval		= 1,					\
	.max_interval		= 4,					\
	.busy_factor		= 64,					\
	.imbalance_pct		= 125,					\
	.cache_nice_tries	= 1,					\
	.busy_idx		= 2,					\
	.idle_idx		= 1,					\
	.newidle_idx		= 0,					\
	.wake_idx		= 0,					\
	.forkexec_idx		= 0,					\
									\
	.flags			= 1*SD_LOAD_BALANCE			\
				| 1*SD_BALANCE_NEWIDLE			\
				| 1*SD_BALANCE_EXEC			\
				| 1*SD_BALANCE_FORK			\
				| 1*SD_BALANCE_WAKE			\
				| 1*SD_WAKE_AFFINE			\
				| 1*SD_PREFER_LOCAL			\
				| 0*SD_SHARE_CPUPOWER			\
				| 0*SD_SHARE_PKG_RESOURCES		\
				| 0*SD_SERIALIZE			\
				| sd_balance_for_package_power()	\
				| sd_power_saving_flags()		\
				,					\
	.last_balance		= jiffies,				\
	.balance_interval	= 1,					\
}
#endif

/* sched_domains SD_ALLNODES_INIT for NUMA machines */
#define SD_ALLNODES_INIT (struct sched_domain) {			\
	.min_interval		= 64,					\
	.max_interval		= 64*num_online_cpus(),			\
	.busy_factor		= 128,					\
	.imbalance_pct		= 133,					\
	.cache_nice_tries	= 1,					\
	.busy_idx		= 3,					\
	.idle_idx		= 3,					\
	.flags			= 1*SD_LOAD_BALANCE			\
				| 1*SD_BALANCE_NEWIDLE			\
				| 0*SD_BALANCE_EXEC			\
				| 0*SD_BALANCE_FORK			\
				| 0*SD_BALANCE_WAKE			\
				| 0*SD_WAKE_AFFINE			\
				| 0*SD_SHARE_CPUPOWER			\
				| 0*SD_POWERSAVINGS_BALANCE		\
				| 0*SD_SHARE_PKG_RESOURCES		\
				| 1*SD_SERIALIZE			\
				| 0*SD_PREFER_SIBLING			\
				,					\
	.last_balance		= jiffies,				\
	.balance_interval	= 64,					\
}

#ifdef CONFIG_NUMA
#ifndef SD_NODE_INIT
#error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!!
#endif
#endif /* CONFIG_NUMA */

#ifndef topology_physical_package_id
#define topology_physical_package_id(cpu)	((void)(cpu), -1)
#endif
#ifndef topology_core_id
#define topology_core_id(cpu)			((void)(cpu), 0)
#endif
#ifndef topology_thread_siblings
#define topology_thread_siblings(cpu)		cpumask_of_cpu(cpu)
#endif
#ifndef topology_core_siblings
#define topology_core_siblings(cpu)		cpumask_of_cpu(cpu)
#endif
#ifndef topology_thread_cpumask
#define topology_thread_cpumask(cpu)		cpumask_of(cpu)
#endif
#ifndef topology_core_cpumask
#define topology_core_cpumask(cpu)		cpumask_of(cpu)
#endif

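/*
 * Example (hypothetical sketch, not part of this header): a caller could
 * report where a given CPU sits in the package topology via the accessors
 * above; on architectures that don't override them, the fallbacks yield
 * package -1 and core 0:
 *
 *	int cpu = raw_smp_processor_id();
 *
 *	printk(KERN_INFO "cpu %d: package %d, core %d\n", cpu,
 *	       topology_physical_package_id(cpu),
 *	       topology_core_id(cpu));
 */
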
/* Returns the number of the current Node. */
#ifndef numa_node_id
#define numa_node_id()		(cpu_to_node(raw_smp_processor_id()))
#endif
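/*
 * Usage sketch (hypothetical, not part of this header): allocate memory on
 * the node the caller is currently running on:
 *
 *	struct foo *p = kmalloc_node(sizeof(*p), GFP_KERNEL, numa_node_id());
 */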

#endif /* _LINUX_TOPOLOGY_H */