linux_dsm_epyc7002/kernel/sched_clock.c
Steven Rostedt a83bc47c33 sched_clock: record TSC after gtod
To read the gtod we need to grab the xtime lock for read. Reading the gtod
before the TSC can cause a bigger gab if the xtime lock is contended.

This patch simply reverses the order to read the TSC after the gtod.
The locking in the reading of the gtod handles any barriers one might
think is needed.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Cc: Steven Rostedt <srostedt@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: john stultz <johnstul@us.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-07-11 15:53:27 +02:00

299 lines
6.6 KiB
C

/*
* sched_clock for unstable cpu clocks
*
* Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
*
* Based on code by:
* Ingo Molnar <mingo@redhat.com>
* Guillaume Chazarain <guichaz@gmail.com>
*
* Create a semi stable clock from a mixture of other events, including:
* - gtod
* - jiffies
* - sched_clock()
* - explicit idle events
*
* We use gtod as base and the unstable clock deltas. The deltas are filtered,
* making it monotonic and keeping it within an expected window. This window
* is set up using jiffies.
*
* Furthermore, explicit sleep and wakeup hooks allow us to account for time
* that is otherwise invisible (TSC gets stopped).
*
* The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat
* consistent between cpus (never more than 1 jiffies difference).
*/
#include <linux/sched.h>
#include <linux/percpu.h>
#include <linux/spinlock.h>
#include <linux/ktime.h>
#include <linux/module.h>
#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
struct sched_clock_data {
/*
* Raw spinlock - this is a special case: this might be called
* from within instrumentation code so we dont want to do any
* instrumentation ourselves.
*/
raw_spinlock_t lock;
unsigned long tick_jiffies;
u64 prev_raw;
u64 tick_raw;
u64 tick_gtod;
u64 clock;
#ifdef CONFIG_NO_HZ
int check_max;
#endif
};
static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
static inline struct sched_clock_data *this_scd(void)
{
return &__get_cpu_var(sched_clock_data);
}
static inline struct sched_clock_data *cpu_sdc(int cpu)
{
return &per_cpu(sched_clock_data, cpu);
}
static __read_mostly int sched_clock_running;
void sched_clock_init(void)
{
u64 ktime_now = ktime_to_ns(ktime_get());
unsigned long now_jiffies = jiffies;
int cpu;
for_each_possible_cpu(cpu) {
struct sched_clock_data *scd = cpu_sdc(cpu);
scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
scd->tick_jiffies = now_jiffies;
scd->prev_raw = 0;
scd->tick_raw = 0;
scd->tick_gtod = ktime_now;
scd->clock = ktime_now;
#ifdef CONFIG_NO_HZ
scd->check_max = 1;
#endif
}
sched_clock_running = 1;
}
#ifdef CONFIG_NO_HZ
/*
* The dynamic ticks makes the delta jiffies inaccurate. This
* prevents us from checking the maximum time update.
* Disable the maximum check during stopped ticks.
*/
void sched_clock_tick_stop(int cpu)
{
struct sched_clock_data *scd = cpu_sdc(cpu);
scd->check_max = 0;
}
void sched_clock_tick_start(int cpu)
{
struct sched_clock_data *scd = cpu_sdc(cpu);
scd->check_max = 1;
}
static int check_max(struct sched_clock_data *scd)
{
return scd->check_max;
}
#else
static int check_max(struct sched_clock_data *scd)
{
return 1;
}
#endif /* CONFIG_NO_HZ */
/*
* update the percpu scd from the raw @now value
*
* - filter out backward motion
* - use jiffies to generate a min,max window to clip the raw values
*/
static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *time)
{
unsigned long now_jiffies = jiffies;
long delta_jiffies = now_jiffies - scd->tick_jiffies;
u64 clock = scd->clock;
u64 min_clock, max_clock;
s64 delta = now - scd->prev_raw;
WARN_ON_ONCE(!irqs_disabled());
min_clock = scd->tick_gtod +
(delta_jiffies ? delta_jiffies - 1 : 0) * TICK_NSEC;
if (unlikely(delta < 0)) {
clock++;
goto out;
}
/*
* The clock must stay within a jiffie of the gtod.
* But since we may be at the start of a jiffy or the end of one
* we add another jiffy buffer.
*/
max_clock = scd->tick_gtod + (2 + delta_jiffies) * TICK_NSEC;
if (unlikely(clock + delta > max_clock) && check_max(scd)) {
if (clock < max_clock)
clock = max_clock;
else
clock++;
} else {
clock += delta;
}
out:
if (unlikely(clock < min_clock))
clock = min_clock;
if (time)
*time = clock;
else {
scd->prev_raw = now;
scd->clock = clock;
}
}
static void lock_double_clock(struct sched_clock_data *data1,
struct sched_clock_data *data2)
{
if (data1 < data2) {
__raw_spin_lock(&data1->lock);
__raw_spin_lock(&data2->lock);
} else {
__raw_spin_lock(&data2->lock);
__raw_spin_lock(&data1->lock);
}
}
u64 sched_clock_cpu(int cpu)
{
struct sched_clock_data *scd = cpu_sdc(cpu);
u64 now, clock;
if (unlikely(!sched_clock_running))
return 0ull;
WARN_ON_ONCE(!irqs_disabled());
now = sched_clock();
if (cpu != raw_smp_processor_id()) {
/*
* in order to update a remote cpu's clock based on our
* unstable raw time rebase it against:
* tick_raw (offset between raw counters)
* tick_gotd (tick offset between cpus)
*/
struct sched_clock_data *my_scd = this_scd();
lock_double_clock(scd, my_scd);
now -= my_scd->tick_raw;
now += scd->tick_raw;
now += my_scd->tick_gtod;
now -= scd->tick_gtod;
__raw_spin_unlock(&my_scd->lock);
__update_sched_clock(scd, now, &clock);
__raw_spin_unlock(&scd->lock);
} else {
__raw_spin_lock(&scd->lock);
__update_sched_clock(scd, now, NULL);
clock = scd->clock;
__raw_spin_unlock(&scd->lock);
}
return clock;
}
void sched_clock_tick(void)
{
struct sched_clock_data *scd = this_scd();
unsigned long now_jiffies = jiffies;
u64 now, now_gtod;
if (unlikely(!sched_clock_running))
return;
WARN_ON_ONCE(!irqs_disabled());
now_gtod = ktime_to_ns(ktime_get());
now = sched_clock();
__raw_spin_lock(&scd->lock);
__update_sched_clock(scd, now, NULL);
/*
* update tick_gtod after __update_sched_clock() because that will
* already observe 1 new jiffy; adding a new tick_gtod to that would
* increase the clock 2 jiffies.
*/
scd->tick_jiffies = now_jiffies;
scd->tick_raw = now;
scd->tick_gtod = now_gtod;
__raw_spin_unlock(&scd->lock);
}
/*
* We are going deep-idle (irqs are disabled):
*/
void sched_clock_idle_sleep_event(void)
{
sched_clock_cpu(smp_processor_id());
}
EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
/*
* We just idled delta nanoseconds (called with irqs disabled):
*/
void sched_clock_idle_wakeup_event(u64 delta_ns)
{
struct sched_clock_data *scd = this_scd();
u64 now = sched_clock();
/*
* Override the previous timestamp and ignore all
* sched_clock() deltas that occured while we idled,
* and use the PM-provided delta_ns to advance the
* rq clock:
*/
__raw_spin_lock(&scd->lock);
scd->prev_raw = now;
scd->clock += delta_ns;
__raw_spin_unlock(&scd->lock);
touch_softlockup_watchdog();
}
EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
#endif
/*
* Scheduler clock - returns current time in nanosec units.
* This is default implementation.
* Architectures and sub-architectures can override this.
*/
unsigned long long __attribute__((weak)) sched_clock(void)
{
return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
}