linux_dsm_epyc7002/include/linux/smp.h

196 lines
4.9 KiB
C
Raw Normal View History

#ifndef __LINUX_SMP_H
#define __LINUX_SMP_H
/*
* Generic SMP support
* Alan Cox. <alan@redhat.com>
*/
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/list.h>
#include <linux/cpumask.h>
#include <linux/init.h>
extern void cpu_idle(void);
typedef void (*smp_call_func_t)(void *info);
struct call_single_data {
struct list_head list;
smp_call_func_t func;
void *info;
u16 flags;
};
cpumask: add sysfs displays for configured and disabled cpu maps Impact: add new sysfs files. Add sysfs files "kernel_max" and "offline" to display the max CPU index allowed (NR_CPUS-1), and the map of cpus that are offline. Cpus can be offlined via HOTPLUG, disabled by the BIOS ACPI tables, or if they exceed the number of cpus allowed by the NR_CPUS config option, or the "maxcpus=NUM" kernel start parameter. The "possible_cpus=NUM" parameter can also extend the number of possible cpus allowed, in which case the cpus not present at startup will be in the offline state. (These cpus can be HOTPLUGGED ON after system startup [pending a follow-on patch to provide the capability via the /sys/devices/sys/cpu/cpuN/online mechanism to bring them online.]) By design, the "offlined cpus > possible cpus" display will always use the following formats: * all possible cpus online: "x$" or "x-y$" * some possible cpus offline: ".*,x$" or ".*,x-y$" where: x == number of possible cpus (nr_cpu_ids); and y == number of cpus >= NR_CPUS or maxcpus (if y > x). One use of this feature is for distros to select (or configure) the appropriate kernel to install for the resident system. Notes: * cpus offlined <= possible cpus will be printed for all architectures. * cpus offlined > possible cpus will only be printed for arches that set 'total_cpus' [X86 only in this patch]. Based on tip/cpus4096 + .../rusty/linux-2.6-for-ingo.git/master + x86-only-patches sent 12/15. Signed-off-by: Mike Travis <travis@sgi.com> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2008-12-16 11:26:48 +07:00
/* total number of cpus in this system (may exceed NR_CPUS) */
extern unsigned int total_cpus;
int smp_call_function_single(int cpuid, smp_call_func_t func, void *info,
int wait);
/*
* Call a function on all processors
*/
int on_each_cpu(smp_call_func_t func, void *info, int wait);
/*
* Call a function on processors specified by mask, which might include
* the local one.
*/
void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func,
void *info, bool wait);
/*
* Call a function on each processor for which the supplied function
* cond_func returns a positive value. This may include the local
* processor.
*/
void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
smp_call_func_t func, void *info, bool wait,
gfp_t gfp_flags);
#ifdef CONFIG_SMP
#include <linux/preempt.h>
#include <linux/kernel.h>
#include <linux/compiler.h>
#include <linux/thread_info.h>
#include <asm/smp.h>
/*
* main cross-CPU interfaces, handles INIT, TLB flush, STOP, etc.
* (defined in asm header):
*/
/*
* stops all CPUs but the current one:
*/
extern void smp_send_stop(void);
/*
* sends a 'reschedule' event to another CPU:
*/
extern void smp_send_reschedule(int cpu);
/*
* Prepare machine for booting other CPUs.
*/
extern void smp_prepare_cpus(unsigned int max_cpus);
/*
* Bring a CPU up
*/
extern int __cpu_up(unsigned int cpunum, struct task_struct *tidle);
/*
* Final polishing of CPUs
*/
extern void smp_cpus_done(unsigned int max_cpus);
/*
* Call a function on all other processors
*/
int smp_call_function(smp_call_func_t func, void *info, int wait);
void smp_call_function_many(const struct cpumask *mask,
smp_call_func_t func, void *info, bool wait);
void __smp_call_function_single(int cpuid, struct call_single_data *data,
int wait);
int smp_call_function_any(const struct cpumask *mask,
smp_call_func_t func, void *info, int wait);
void kick_all_cpus_sync(void);
/*
* Generic and arch helpers
*/
generic-ipi: Fix kexec boot crash by initializing call_single_queue before enabling interrupts There is a problem that kdump(2nd kernel) sometimes hangs up due to a pending IPI from 1st kernel. Kernel panic occurs because IPI comes before call_single_queue is initialized. To fix the crash, rename init_call_single_data() to call_function_init() and call it in start_kernel() so that call_single_queue can be initialized before enabling interrupts. The details of the crash are: (1) 2nd kernel boots up (2) A pending IPI from 1st kernel comes when irqs are first enabled in start_kernel(). (3) Kernel tries to handle the interrupt, but call_single_queue is not initialized yet at this point. As a result, in the generic_smp_call_function_single_interrupt(), NULL pointer dereference occurs when list_replace_init() tries to access &q->list.next. Therefore this patch changes the name of init_call_single_data() to call_function_init() and calls it before local_irq_enable() in start_kernel(). Signed-off-by: Takao Indoh <indou.takao@jp.fujitsu.com> Reviewed-by: WANG Cong <xiyou.wangcong@gmail.com> Acked-by: Neil Horman <nhorman@tuxdriver.com> Acked-by: Vivek Goyal <vgoyal@redhat.com> Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Milton Miller <miltonm@bga.com> Cc: Jens Axboe <axboe@kernel.dk> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: kexec@lists.infradead.org Link: http://lkml.kernel.org/r/D6CBEE2F420741indou.takao@jp.fujitsu.com Signed-off-by: Ingo Molnar <mingo@elte.hu>
2011-03-29 23:35:04 +07:00
void __init call_function_init(void);
void generic_smp_call_function_single_interrupt(void);
smp: make smp_call_function_many() use logic similar to smp_call_function_single() I'm testing swapout workload in a two-socket Xeon machine. The workload has 10 threads, each thread sequentially accesses separate memory region. TLB flush overhead is very big in the workload. For each page, page reclaim need move it from active lru list and then unmap it. Both need a TLB flush. And this is a multthread workload, TLB flush happens in 10 CPUs. In X86, TLB flush uses generic smp_call)function. So this workload stress smp_call_function_many heavily. Without patch, perf shows: + 24.49% [k] generic_smp_call_function_interrupt - 21.72% [k] _raw_spin_lock - _raw_spin_lock + 79.80% __page_check_address + 6.42% generic_smp_call_function_interrupt + 3.31% get_swap_page + 2.37% free_pcppages_bulk + 1.75% handle_pte_fault + 1.54% put_super + 1.41% grab_super_passive + 1.36% __swap_duplicate + 0.68% blk_flush_plug_list + 0.62% swap_info_get + 6.55% [k] flush_tlb_func + 6.46% [k] smp_call_function_many + 5.09% [k] call_function_interrupt + 4.75% [k] default_send_IPI_mask_sequence_phys + 2.18% [k] find_next_bit swapout throughput is around 1300M/s. With the patch, perf shows: - 27.23% [k] _raw_spin_lock - _raw_spin_lock + 80.53% __page_check_address + 8.39% generic_smp_call_function_single_interrupt + 2.44% get_swap_page + 1.76% free_pcppages_bulk + 1.40% handle_pte_fault + 1.15% __swap_duplicate + 1.05% put_super + 0.98% grab_super_passive + 0.86% blk_flush_plug_list + 0.57% swap_info_get + 8.25% [k] default_send_IPI_mask_sequence_phys + 7.55% [k] call_function_interrupt + 7.47% [k] smp_call_function_many + 7.25% [k] flush_tlb_func + 3.81% [k] _raw_spin_lock_irqsave + 3.78% [k] generic_smp_call_function_single_interrupt swapout throughput is around 1400M/s. So there is around a 7% improvement, and total cpu utilization doesn't change. Without the patch, cfd_data is shared by all CPUs. generic_smp_call_function_interrupt does read/write cfd_data several times which will create a lot of cache ping-pong. With the patch, the data becomes per-cpu. The ping-pong is avoided. And from the perf data, this doesn't make call_single_queue lock contend. Next step is to remove generic_smp_call_function_interrupt() from arch code. Signed-off-by: Shaohua Li <shli@fusionio.com> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Ingo Molnar <mingo@elte.hu> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Jens Axboe <axboe@kernel.dk> Cc: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-02-22 07:43:03 +07:00
#define generic_smp_call_function_interrupt \
generic_smp_call_function_single_interrupt
/*
* Mark the boot cpu "online" so that it can call console drivers in
* printk() and can access its per-cpu storage.
*/
void smp_prepare_boot_cpu(void);
extern unsigned int setup_max_cpus;
extern void __init setup_nr_cpu_ids(void);
extern void __init smp_init(void);
#else /* !SMP */
static inline void smp_send_stop(void) { }
/*
* These macros fold the SMP functionality into a single CPU system
*/
#define raw_smp_processor_id() 0
static inline int up_smp_call_function(smp_call_func_t func, void *info)
{
return 0;
}
#define smp_call_function(func, info, wait) \
(up_smp_call_function(func, info))
static inline void smp_send_reschedule(int cpu) { }
#define smp_prepare_boot_cpu() do {} while (0)
#define smp_call_function_many(mask, func, info, wait) \
(up_smp_call_function(func, info))
generic-ipi: Fix kexec boot crash by initializing call_single_queue before enabling interrupts There is a problem that kdump(2nd kernel) sometimes hangs up due to a pending IPI from 1st kernel. Kernel panic occurs because IPI comes before call_single_queue is initialized. To fix the crash, rename init_call_single_data() to call_function_init() and call it in start_kernel() so that call_single_queue can be initialized before enabling interrupts. The details of the crash are: (1) 2nd kernel boots up (2) A pending IPI from 1st kernel comes when irqs are first enabled in start_kernel(). (3) Kernel tries to handle the interrupt, but call_single_queue is not initialized yet at this point. As a result, in the generic_smp_call_function_single_interrupt(), NULL pointer dereference occurs when list_replace_init() tries to access &q->list.next. Therefore this patch changes the name of init_call_single_data() to call_function_init() and calls it before local_irq_enable() in start_kernel(). Signed-off-by: Takao Indoh <indou.takao@jp.fujitsu.com> Reviewed-by: WANG Cong <xiyou.wangcong@gmail.com> Acked-by: Neil Horman <nhorman@tuxdriver.com> Acked-by: Vivek Goyal <vgoyal@redhat.com> Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Milton Miller <miltonm@bga.com> Cc: Jens Axboe <axboe@kernel.dk> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: kexec@lists.infradead.org Link: http://lkml.kernel.org/r/D6CBEE2F420741indou.takao@jp.fujitsu.com Signed-off-by: Ingo Molnar <mingo@elte.hu>
2011-03-29 23:35:04 +07:00
static inline void call_function_init(void) { }
static inline int
smp_call_function_any(const struct cpumask *mask, smp_call_func_t func,
void *info, int wait)
{
return smp_call_function_single(0, func, info, wait);
}
static inline void kick_all_cpus_sync(void) { }
watchdog: update watchdog_thresh properly watchdog_tresh controls how often nmi perf event counter checks per-cpu hrtimer_interrupts counter and blows up if the counter hasn't changed since the last check. The counter is updated by per-cpu watchdog_hrtimer hrtimer which is scheduled with 2/5 watchdog_thresh period which guarantees that hrtimer is scheduled 2 times per the main period. Both hrtimer and perf event are started together when the watchdog is enabled. So far so good. But... But what happens when watchdog_thresh is updated from sysctl handler? proc_dowatchdog will set a new sampling period and hrtimer callback (watchdog_timer_fn) will use the new value in the next round. The problem, however, is that nobody tells the perf event that the sampling period has changed so it is ticking with the period configured when it has been set up. This might result in an ear ripping dissonance between perf and hrtimer parts if the watchdog_thresh is increased. And even worse it might lead to KABOOM if the watchdog is configured to panic on such a spurious lockup. This patch fixes the issue by updating both nmi perf even counter and hrtimers if the threshold value has changed. The nmi one is disabled and then reinitialized from scratch. This has an unpleasant side effect that the allocation of the new event might fail theoretically so the hard lockup detector would be disabled for such cpus. On the other hand such a memory allocation failure is very unlikely because the original event is deallocated right before. It would be much nicer if we just changed perf event period but there doesn't seem to be any API to do that right now. It is also unfortunate that perf_event_alloc uses GFP_KERNEL allocation unconditionally so we cannot use on_each_cpu() and do the same thing from the per-cpu context. The update from the current CPU should be safe because perf_event_disable removes the event atomically before it clears the per-cpu watchdog_ev so it cannot change anything under running handler feet. The hrtimer is simply restarted (thanks to Don Zickus who has pointed this out) if it is queued because we cannot rely it will fire&adopt to the new sampling period before a new nmi event triggers (when the treshold is decreased). [akpm@linux-foundation.org: the UP version of __smp_call_function_single ended up in the wrong place] Signed-off-by: Michal Hocko <mhocko@suse.cz> Acked-by: Don Zickus <dzickus@redhat.com> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Ingo Molnar <mingo@kernel.org> Cc: Fabio Estevam <festevam@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-09-25 05:27:30 +07:00
static inline void __smp_call_function_single(int cpuid,
struct call_single_data *data, int wait)
{
on_each_cpu(data->func, data->info, wait);
}
#endif /* !SMP */
/*
* smp_processor_id(): get the current CPU ID.
*
* if DEBUG_PREEMPT is enabled then we check whether it is
* used in a preemption-safe way. (smp_processor_id() is safe
* if it's used in a preemption-off critical section, or in
* a thread that is bound to the current CPU.)
*
* NOTE: raw_smp_processor_id() is for internal use only
* (smp_processor_id() is the preferred variant), but in rare
* instances it might also be used to turn off false positives
* (i.e. smp_processor_id() use that the debugging code reports but
* which use for some reason is legal). Don't use this to hack around
* the warning message, as your code might not work under PREEMPT.
*/
#ifdef CONFIG_DEBUG_PREEMPT
extern unsigned int debug_smp_processor_id(void);
# define smp_processor_id() debug_smp_processor_id()
#else
# define smp_processor_id() raw_smp_processor_id()
#endif
#define get_cpu() ({ preempt_disable(); smp_processor_id(); })
#define put_cpu() preempt_enable()
/*
* Callback to arch code if there's nosmp or maxcpus=0 on the
* boot command line:
*/
extern void arch_disable_smp_support(void);
void smp_setup_processor_id(void);
#endif /* __LINUX_SMP_H */