From 2b17c545a4cdbbbadcd7f1e9684c2d7db8f085a6 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Thu, 4 Oct 2012 01:46:44 +0200
Subject: [PATCH 1/3] nohz: Fix one jiffy count too far in idle cputime

When we stop the tick in idle, we save the current jiffies value in
ts->idle_jiffies. This snapshot is subtracted from the later value of
jiffies when the tick is restarted, and the resulting delta is accounted
as idle cputime. This is how we handle idle cputime accounting without
the tick.

But sometimes we need to schedule the next tick to some time in the
future instead of completely stopping it. In this case, a tick may happen
before we restart the periodic behaviour, and from that tick we account
one jiffy to idle cputime as usual. We also increment the
ts->idle_jiffies snapshot by one, so that when we compute the delta to
account, we subtract the one jiffy we just accounted.

To prepare for stopping the tick outside idle, we introduced a check
that skips that fixup of ts->idle_jiffies if we are not running the
idle task. But we use idle_cpu() for that check, and this is a problem
if we run the tick while another CPU remotely enqueues a ttwu to our
runqueue:

CPU 0:                                CPU 1:

tick_sched_timer() {                  ttwu_queue_remote()
    if (idle_cpu(CPU 0))
        ts->idle_jiffies++;
}

Here, idle_cpu() notes that &rq->wake_list is not empty and hence won't
consider the CPU as idle. As a result, ts->idle_jiffies won't be
incremented. But this is wrong because we actually account the current
jiffy to idle cputime, and that jiffy won't get subtracted from the
nohz time delta. So in the end, this jiffy is accounted twice.

Fix this by replacing idle_cpu(smp_processor_id()) with
is_idle_task(current). This way the jiffy is subtracted correctly even
if a ttwu operation is enqueued on the CPU.

Signed-off-by: Frederic Weisbecker
Cc: Peter Zijlstra
Cc: # 3.5+
Link: http://lkml.kernel.org/r/1349308004-3482-1-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar
---
 kernel/time/tick-sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f423bdd035c2..a40260885265 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -835,7 +835,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
 	 */
 	if (ts->tick_stopped) {
 		touch_softlockup_watchdog();
-		if (idle_cpu(cpu))
+		if (is_idle_task(current))
 			ts->idle_jiffies++;
 	}
 	update_process_times(user_mode(regs));
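[Editor's note] To make the double accounting concrete, here is a minimal standalone userspace sketch of the scheme described in the changelog above. This is not kernel code: jiffies, idle_jiffies and idle_cputime are hypothetical stand-ins for the tick_sched fields, and the two scenarios only model whether the snapshot fixup happens at an intermediate tick.

/* Standalone illustration (not kernel code): a tick that fires while the
 * tick is "stopped" accounts one idle jiffy immediately, so the snapshot
 * must be bumped or that jiffy is counted again when the stopped period
 * is folded in. All names are hypothetical. */
#include <stdio.h>

static unsigned long jiffies;        /* global tick counter                 */
static unsigned long idle_jiffies;   /* snapshot taken when tick stopped    */
static unsigned long idle_cputime;   /* jiffies accounted as idle so far    */

static void tick_while_stopped(int fixup_snapshot)
{
	jiffies++;                   /* the tick advances time ...          */
	idle_cputime++;              /* ... and accounts one idle jiffy     */
	if (fixup_snapshot)
		idle_jiffies++;      /* compensate so the delta below does
					not count this jiffy a second time */
}

static void restart_tick(void)
{
	idle_cputime += jiffies - idle_jiffies;  /* account the stopped span */
}

int main(void)
{
	/* Correct case: snapshot fixed up at the intermediate tick. */
	jiffies = 100; idle_jiffies = jiffies; idle_cputime = 0;
	tick_while_stopped(1);
	jiffies += 3;                /* three more idle jiffies pass */
	restart_tick();
	printf("with fixup:    idle_cputime = %lu (expected 4)\n", idle_cputime);

	/* Buggy case: idle_cpu() wrongly said "not idle", so no fixup. */
	jiffies = 100; idle_jiffies = jiffies; idle_cputime = 0;
	tick_while_stopped(0);
	jiffies += 3;
	restart_tick();
	printf("without fixup: idle_cputime = %lu (one jiffy counted twice)\n",
	       idle_cputime);
	return 0;
}

Running it prints 4 idle jiffies for the fixed-up case and 5 for the buggy one, which is exactly the "one jiffy count too far" the patch title refers to.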
From 5f7865f3e44db4c73fdc454fb2af40806212a7ca Mon Sep 17 00:00:00 2001
From: Tang Chen
Date: Tue, 25 Sep 2012 21:12:30 +0800
Subject: [PATCH 2/3] sched: Ensure 'sched_domains_numa_levels' is safe to use in other functions

We should temporarily reset 'sched_domains_numa_levels' to 0 in
sched_init_numa(), and only set it back to 'level' once the masks are
fully initialized. If allocating memory for the
sched_domains_numa_masks[][] array fails part-way, the array will
contain fewer than 'level' valid members, which is dangerous for other
functions that use 'sched_domains_numa_levels' to iterate over
sched_domains_numa_masks[][].

This patch sets sched_domains_numa_levels to 0 before initializing the
sched_domains_numa_masks[][] array, and resets it to 'level' once
sched_domains_numa_masks[][] is fully initialized.

Signed-off-by: Tang Chen
Signed-off-by: Wen Congyang
Signed-off-by: Peter Zijlstra
Link: http://lkml.kernel.org/r/1348578751-16904-2-git-send-email-tangchen@cn.fujitsu.com
Signed-off-by: Ingo Molnar
---
 kernel/sched/core.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c17747236438..f895fdd32c5b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6122,6 +6122,17 @@ static void sched_init_numa(void)
 	 * numbers.
 	 */

+	/*
+	 * Here, we should temporarily reset sched_domains_numa_levels to 0.
+	 * If it fails to allocate memory for array sched_domains_numa_masks[][],
+	 * the array will contain less then 'level' members. This could be
+	 * dangerous when we use it to iterate array sched_domains_numa_masks[][]
+	 * in other functions.
+	 *
+	 * We reset it to 'level' at the end of this function.
+	 */
+	sched_domains_numa_levels = 0;
+
 	sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
 	if (!sched_domains_numa_masks)
 		return;
@@ -6176,6 +6187,8 @@ static void sched_init_numa(void)
 	}

 	sched_domain_topology = tl;
+
+	sched_domains_numa_levels = level;
 }
 #else
 static inline void sched_init_numa(void)
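[Editor's note] The pattern applied above can be shown in a self-contained way. The sketch below is plain userspace C with hypothetical names, single-threaded, and deliberately ignores the memory-ordering questions real kernel code would raise: it keeps the published level count at 0 while the table is being built and publishes it only after every slot is valid, so an iterator over [0, count) can never walk into an unallocated entry.

/* Illustration only (not kernel code): publish the element count last. */
#include <stdio.h>
#include <stdlib.h>

static int masks_levels;      /* how many entries readers may iterate   */
static int **masks;           /* the table being (re)built               */

static void init_masks(int level)
{
	int i;

	masks_levels = 0;                 /* hide the table while building */

	masks = calloc(level, sizeof(*masks));
	if (!masks)
		return;                   /* count stays 0: readers see nothing */

	for (i = 0; i < level; i++) {
		masks[i] = calloc(16, sizeof(**masks));
		if (!masks[i])
			return;           /* partial table is never published */
	}

	masks_levels = level;             /* fully built: publish the count */
}

static void walk_masks(void)
{
	int i;

	/* Safe even if init_masks() failed part-way: the loop bound is 0. */
	for (i = 0; i < masks_levels; i++)
		printf("level %d: first word = %d\n", i, masks[i][0]);
}

int main(void)
{
	init_masks(3);
	walk_masks();
	return 0;
}

If either allocation fails, walk_masks() simply sees a count of 0, which mirrors what the patch achieves for sched_domains_numa_masks[][].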
From 301a5cba2887d1f640e6d5184b05a6d7132017d5 Mon Sep 17 00:00:00 2001
From: Tang Chen
Date: Tue, 25 Sep 2012 21:12:31 +0800
Subject: [PATCH 3/3] sched: Update sched_domains_numa_masks[][] when new cpus are onlined

Once the array sched_domains_numa_masks[][] is defined, it is never
updated. When a new cpu on a new node is onlined, the corresponding
member in sched_domains_numa_masks[][] is not initialized, and all the
masks are 0. As a result, build_overlap_sched_groups() will initialize a
NULL sched_group for the new cpu on the new node, which leads to a
kernel panic:

[ 3189.403280] Call Trace:
[ 3189.403286] [] warn_slowpath_common+0x7f/0xc0
[ 3189.403289] [] warn_slowpath_null+0x1a/0x20
[ 3189.403292] [] build_sched_domains+0x467/0x470
[ 3189.403296] [] partition_sched_domains+0x307/0x510
[ 3189.403299] [] ? partition_sched_domains+0x142/0x510
[ 3189.403305] [] cpuset_update_active_cpus+0x83/0x90
[ 3189.403308] [] cpuset_cpu_active+0x38/0x70
[ 3189.403316] [] notifier_call_chain+0x67/0x150
[ 3189.403320] [] ? native_cpu_up+0x18a/0x1b5
[ 3189.403328] [] __raw_notifier_call_chain+0xe/0x10
[ 3189.403333] [] __cpu_notify+0x20/0x40
[ 3189.403337] [] _cpu_up+0xe9/0x131
[ 3189.403340] [] cpu_up+0xdb/0xee
[ 3189.403348] [] store_online+0x9c/0xd0
[ 3189.403355] [] dev_attr_store+0x20/0x30
[ 3189.403361] [] sysfs_write_file+0xa3/0x100
[ 3189.403368] [] vfs_write+0xd0/0x1a0
[ 3189.403371] [] sys_write+0x54/0xa0
[ 3189.403375] [] system_call_fastpath+0x16/0x1b
[ 3189.403377] ---[ end trace 1e6cf85d0859c941 ]---
[ 3189.403398] BUG: unable to handle kernel NULL pointer dereference at 0000000000000018

This patch registers a new notifier on the cpu hotplug notifier chain,
and updates sched_domains_numa_masks[][] every time a cpu is onlined or
offlined.

Signed-off-by: Tang Chen
Signed-off-by: Wen Congyang
[ fixed compile warning ]
Signed-off-by: Peter Zijlstra
Link: http://lkml.kernel.org/r/1348578751-16904-3-git-send-email-tangchen@cn.fujitsu.com
Signed-off-by: Ingo Molnar
---
 kernel/sched/core.c | 56 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f895fdd32c5b..8322d73b4392 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6190,10 +6190,65 @@ static void sched_init_numa(void)

 	sched_domains_numa_levels = level;
 }
+
+static void sched_domains_numa_masks_set(int cpu)
+{
+	int i, j;
+	int node = cpu_to_node(cpu);
+
+	for (i = 0; i < sched_domains_numa_levels; i++) {
+		for (j = 0; j < nr_node_ids; j++) {
+			if (node_distance(j, node) <= sched_domains_numa_distance[i])
+				cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
+		}
+	}
+}
+
+static void sched_domains_numa_masks_clear(int cpu)
+{
+	int i, j;
+	for (i = 0; i < sched_domains_numa_levels; i++) {
+		for (j = 0; j < nr_node_ids; j++)
+			cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
+	}
+}
+
+/*
+ * Update sched_domains_numa_masks[level][node] array when new cpus
+ * are onlined.
+ */
+static int sched_domains_numa_masks_update(struct notifier_block *nfb,
+					   unsigned long action,
+					   void *hcpu)
+{
+	int cpu = (long)hcpu;
+
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_ONLINE:
+		sched_domains_numa_masks_set(cpu);
+		break;
+
+	case CPU_DEAD:
+		sched_domains_numa_masks_clear(cpu);
+		break;
+
+	default:
+		return NOTIFY_DONE;
+	}
+
+	return NOTIFY_OK;
+}
 #else
 static inline void sched_init_numa(void)
 {
 }
+
+static int sched_domains_numa_masks_update(struct notifier_block *nfb,
+					   unsigned long action,
+					   void *hcpu)
+{
+	return 0;
+}
 #endif /* CONFIG_NUMA */

 static int __sdt_alloc(const struct cpumask *cpu_map)
@@ -6642,6 +6697,7 @@ void __init sched_init_smp(void)
 	mutex_unlock(&sched_domains_mutex);
 	put_online_cpus();

+	hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
 	hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
 	hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
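[Editor's note] To see the effect of the mask-update logic in isolation, here is a small standalone sketch in plain userspace C. It only mirrors the shape of sched_domains_numa_masks_set()/_clear() from the hunk above; the level and node counts, the cpu-to-node mapping, the distance table, the event codes and the use of a uint64_t in place of a cpumask are all made up for illustration.

/* Illustration only (not kernel code): on an "online" event, set the new
 * cpu's bit in every mask whose level distance covers its node; on a
 * "dead" event, clear the bit everywhere. */
#include <stdio.h>
#include <stdint.h>

#define LEVELS 2
#define NODES  2
#define LOCAL_DISTANCE  10
#define REMOTE_DISTANCE 20

static const int level_distance[LEVELS] = { LOCAL_DISTANCE, REMOTE_DISTANCE };
static const int node_distance[NODES][NODES] = {
	{ LOCAL_DISTANCE,  REMOTE_DISTANCE },
	{ REMOTE_DISTANCE, LOCAL_DISTANCE  },
};
static uint64_t masks[LEVELS][NODES];           /* bit c set => cpu c belongs */
static const int cpu_node[4] = { 0, 0, 1, 1 };  /* hypothetical cpu->node map */

enum hp_event { HP_ONLINE, HP_DEAD };

/* Body of the notifier: analogous to masks_set()/masks_clear() above. */
static void masks_update(enum hp_event event, int cpu)
{
	int node = cpu_node[cpu];
	int i, j;

	for (i = 0; i < LEVELS; i++) {
		for (j = 0; j < NODES; j++) {
			if (event == HP_DEAD)
				masks[i][j] &= ~(1ULL << cpu);
			else if (node_distance[j][node] <= level_distance[i])
				masks[i][j] |= 1ULL << cpu;
		}
	}
}

int main(void)
{
	int i, j;

	masks_update(HP_ONLINE, 2);   /* cpu 2 (node 1) comes online */
	masks_update(HP_ONLINE, 3);   /* cpu 3 (node 1) comes online */
	masks_update(HP_DEAD, 3);     /* cpu 3 goes away again       */

	for (i = 0; i < LEVELS; i++)
		for (j = 0; j < NODES; j++)
			printf("level %d node %d: mask=%#llx\n",
			       i, j, (unsigned long long)masks[i][j]);
	return 0;
}

Without such an update step, a cpu onlined on a previously unseen node would keep all-zero masks, which is the condition that triggered the NULL sched_group and the panic shown in the changelog.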