mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2024-11-26 22:40:55 +07:00
e49a5bd381
Scheduler's task migration events don't work because they always pass NULL regs perf_sw_event(). The event hence gets filtered in perf_swevent_add(). Scheduler's context switches events use task_pt_regs() to get the context when the event occured which is a wrong thing to do as this won't give us the place in the kernel where we went to sleep but the place where we left userspace. The result is even more wrong if we switch from a kernel thread. Use the hot regs snapshot for both events as they belong to the non-interrupt/exception based events family. Unlike page faults or so that provide the regs matching the exact origin of the event, we need to save the current context. This makes the task migration event working and fix the context switch callchains and origin ip. Example: perf record -a -e cs Before: 10.91% ksoftirqd/0 0 [k] 0000000000000000 | --- (nil) perf_callchain perf_prepare_sample __perf_event_overflow perf_swevent_overflow perf_swevent_add perf_swevent_ctx_event do_perf_sw_event __perf_sw_event perf_event_task_sched_out schedule run_ksoftirqd kthread kernel_thread_helper After: 23.77% hald-addon-stor [kernel.kallsyms] [k] schedule | --- schedule | |--60.00%-- schedule_timeout | wait_for_common | wait_for_completion | blk_execute_rq | scsi_execute | scsi_execute_req | sr_test_unit_ready | | | |--66.67%-- sr_media_change | | media_changed | | cdrom_media_changed | | sr_block_media_changed | | check_disk_change | | cdrom_open v2: Always build perf_arch_fetch_caller_regs() now that software events need that too. They don't need it from modules, unlike trace events, so we keep the EXPORT_SYMBOL in trace_event_perf.c Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Paul Mackerras <paulus@samba.org> Cc: Ingo Molnar <mingo@elte.hu> Cc: David Miller <davem@davemloft.net>
1005 lines
25 KiB
C
1005 lines
25 KiB
C
/*
|
|
* Performance events:
|
|
*
|
|
* Copyright (C) 2008-2009, Thomas Gleixner <tglx@linutronix.de>
|
|
* Copyright (C) 2008-2009, Red Hat, Inc., Ingo Molnar
|
|
* Copyright (C) 2008-2009, Red Hat, Inc., Peter Zijlstra
|
|
*
|
|
* Data type definitions, declarations, prototypes.
|
|
*
|
|
* Started by: Thomas Gleixner and Ingo Molnar
|
|
*
|
|
* For licencing details see kernel-base/COPYING
|
|
*/
|
|
#ifndef _LINUX_PERF_EVENT_H
|
|
#define _LINUX_PERF_EVENT_H
|
|
|
|
#include <linux/types.h>
|
|
#include <linux/ioctl.h>
|
|
#include <asm/byteorder.h>
|
|
|
|
/*
|
|
* User-space ABI bits:
|
|
*/
|
|
|
|
/*
|
|
* attr.type
|
|
*/
|
|
enum perf_type_id {
|
|
PERF_TYPE_HARDWARE = 0,
|
|
PERF_TYPE_SOFTWARE = 1,
|
|
PERF_TYPE_TRACEPOINT = 2,
|
|
PERF_TYPE_HW_CACHE = 3,
|
|
PERF_TYPE_RAW = 4,
|
|
PERF_TYPE_BREAKPOINT = 5,
|
|
|
|
PERF_TYPE_MAX, /* non-ABI */
|
|
};
|
|
|
|
/*
|
|
* Generalized performance event event_id types, used by the
|
|
* attr.event_id parameter of the sys_perf_event_open()
|
|
* syscall:
|
|
*/
|
|
enum perf_hw_id {
|
|
/*
|
|
* Common hardware events, generalized by the kernel:
|
|
*/
|
|
PERF_COUNT_HW_CPU_CYCLES = 0,
|
|
PERF_COUNT_HW_INSTRUCTIONS = 1,
|
|
PERF_COUNT_HW_CACHE_REFERENCES = 2,
|
|
PERF_COUNT_HW_CACHE_MISSES = 3,
|
|
PERF_COUNT_HW_BRANCH_INSTRUCTIONS = 4,
|
|
PERF_COUNT_HW_BRANCH_MISSES = 5,
|
|
PERF_COUNT_HW_BUS_CYCLES = 6,
|
|
|
|
PERF_COUNT_HW_MAX, /* non-ABI */
|
|
};
|
|
|
|
/*
|
|
* Generalized hardware cache events:
|
|
*
|
|
* { L1-D, L1-I, LLC, ITLB, DTLB, BPU } x
|
|
* { read, write, prefetch } x
|
|
* { accesses, misses }
|
|
*/
|
|
enum perf_hw_cache_id {
|
|
PERF_COUNT_HW_CACHE_L1D = 0,
|
|
PERF_COUNT_HW_CACHE_L1I = 1,
|
|
PERF_COUNT_HW_CACHE_LL = 2,
|
|
PERF_COUNT_HW_CACHE_DTLB = 3,
|
|
PERF_COUNT_HW_CACHE_ITLB = 4,
|
|
PERF_COUNT_HW_CACHE_BPU = 5,
|
|
|
|
PERF_COUNT_HW_CACHE_MAX, /* non-ABI */
|
|
};
|
|
|
|
enum perf_hw_cache_op_id {
|
|
PERF_COUNT_HW_CACHE_OP_READ = 0,
|
|
PERF_COUNT_HW_CACHE_OP_WRITE = 1,
|
|
PERF_COUNT_HW_CACHE_OP_PREFETCH = 2,
|
|
|
|
PERF_COUNT_HW_CACHE_OP_MAX, /* non-ABI */
|
|
};
|
|
|
|
enum perf_hw_cache_op_result_id {
|
|
PERF_COUNT_HW_CACHE_RESULT_ACCESS = 0,
|
|
PERF_COUNT_HW_CACHE_RESULT_MISS = 1,
|
|
|
|
PERF_COUNT_HW_CACHE_RESULT_MAX, /* non-ABI */
|
|
};
|
|
|
|
/*
|
|
* Special "software" events provided by the kernel, even if the hardware
|
|
* does not support performance events. These events measure various
|
|
* physical and sw events of the kernel (and allow the profiling of them as
|
|
* well):
|
|
*/
|
|
enum perf_sw_ids {
|
|
PERF_COUNT_SW_CPU_CLOCK = 0,
|
|
PERF_COUNT_SW_TASK_CLOCK = 1,
|
|
PERF_COUNT_SW_PAGE_FAULTS = 2,
|
|
PERF_COUNT_SW_CONTEXT_SWITCHES = 3,
|
|
PERF_COUNT_SW_CPU_MIGRATIONS = 4,
|
|
PERF_COUNT_SW_PAGE_FAULTS_MIN = 5,
|
|
PERF_COUNT_SW_PAGE_FAULTS_MAJ = 6,
|
|
PERF_COUNT_SW_ALIGNMENT_FAULTS = 7,
|
|
PERF_COUNT_SW_EMULATION_FAULTS = 8,
|
|
|
|
PERF_COUNT_SW_MAX, /* non-ABI */
|
|
};
|
|
|
|
/*
|
|
* Bits that can be set in attr.sample_type to request information
|
|
* in the overflow packets.
|
|
*/
|
|
enum perf_event_sample_format {
|
|
PERF_SAMPLE_IP = 1U << 0,
|
|
PERF_SAMPLE_TID = 1U << 1,
|
|
PERF_SAMPLE_TIME = 1U << 2,
|
|
PERF_SAMPLE_ADDR = 1U << 3,
|
|
PERF_SAMPLE_READ = 1U << 4,
|
|
PERF_SAMPLE_CALLCHAIN = 1U << 5,
|
|
PERF_SAMPLE_ID = 1U << 6,
|
|
PERF_SAMPLE_CPU = 1U << 7,
|
|
PERF_SAMPLE_PERIOD = 1U << 8,
|
|
PERF_SAMPLE_STREAM_ID = 1U << 9,
|
|
PERF_SAMPLE_RAW = 1U << 10,
|
|
|
|
PERF_SAMPLE_MAX = 1U << 11, /* non-ABI */
|
|
};
|
|
|
|
/*
|
|
* The format of the data returned by read() on a perf event fd,
|
|
* as specified by attr.read_format:
|
|
*
|
|
* struct read_format {
|
|
* { u64 value;
|
|
* { u64 time_enabled; } && PERF_FORMAT_ENABLED
|
|
* { u64 time_running; } && PERF_FORMAT_RUNNING
|
|
* { u64 id; } && PERF_FORMAT_ID
|
|
* } && !PERF_FORMAT_GROUP
|
|
*
|
|
* { u64 nr;
|
|
* { u64 time_enabled; } && PERF_FORMAT_ENABLED
|
|
* { u64 time_running; } && PERF_FORMAT_RUNNING
|
|
* { u64 value;
|
|
* { u64 id; } && PERF_FORMAT_ID
|
|
* } cntr[nr];
|
|
* } && PERF_FORMAT_GROUP
|
|
* };
|
|
*/
|
|
enum perf_event_read_format {
|
|
PERF_FORMAT_TOTAL_TIME_ENABLED = 1U << 0,
|
|
PERF_FORMAT_TOTAL_TIME_RUNNING = 1U << 1,
|
|
PERF_FORMAT_ID = 1U << 2,
|
|
PERF_FORMAT_GROUP = 1U << 3,
|
|
|
|
PERF_FORMAT_MAX = 1U << 4, /* non-ABI */
|
|
};
|
|
|
|
#define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */
|
|
|
|
/*
|
|
* Hardware event_id to monitor via a performance monitoring event:
|
|
*/
|
|
struct perf_event_attr {
|
|
|
|
/*
|
|
* Major type: hardware/software/tracepoint/etc.
|
|
*/
|
|
__u32 type;
|
|
|
|
/*
|
|
* Size of the attr structure, for fwd/bwd compat.
|
|
*/
|
|
__u32 size;
|
|
|
|
/*
|
|
* Type specific configuration information.
|
|
*/
|
|
__u64 config;
|
|
|
|
union {
|
|
__u64 sample_period;
|
|
__u64 sample_freq;
|
|
};
|
|
|
|
__u64 sample_type;
|
|
__u64 read_format;
|
|
|
|
__u64 disabled : 1, /* off by default */
|
|
inherit : 1, /* children inherit it */
|
|
pinned : 1, /* must always be on PMU */
|
|
exclusive : 1, /* only group on PMU */
|
|
exclude_user : 1, /* don't count user */
|
|
exclude_kernel : 1, /* ditto kernel */
|
|
exclude_hv : 1, /* ditto hypervisor */
|
|
exclude_idle : 1, /* don't count when idle */
|
|
mmap : 1, /* include mmap data */
|
|
comm : 1, /* include comm data */
|
|
freq : 1, /* use freq, not period */
|
|
inherit_stat : 1, /* per task counts */
|
|
enable_on_exec : 1, /* next exec enables */
|
|
task : 1, /* trace fork/exit */
|
|
watermark : 1, /* wakeup_watermark */
|
|
|
|
__reserved_1 : 49;
|
|
|
|
union {
|
|
__u32 wakeup_events; /* wakeup every n events */
|
|
__u32 wakeup_watermark; /* bytes before wakeup */
|
|
};
|
|
|
|
__u32 bp_type;
|
|
__u64 bp_addr;
|
|
__u64 bp_len;
|
|
};
|
|
|
|
/*
|
|
* Ioctls that can be done on a perf event fd:
|
|
*/
|
|
#define PERF_EVENT_IOC_ENABLE _IO ('$', 0)
|
|
#define PERF_EVENT_IOC_DISABLE _IO ('$', 1)
|
|
#define PERF_EVENT_IOC_REFRESH _IO ('$', 2)
|
|
#define PERF_EVENT_IOC_RESET _IO ('$', 3)
|
|
#define PERF_EVENT_IOC_PERIOD _IOW('$', 4, __u64)
|
|
#define PERF_EVENT_IOC_SET_OUTPUT _IO ('$', 5)
|
|
#define PERF_EVENT_IOC_SET_FILTER _IOW('$', 6, char *)
|
|
|
|
enum perf_event_ioc_flags {
|
|
PERF_IOC_FLAG_GROUP = 1U << 0,
|
|
};
|
|
|
|
/*
|
|
* Structure of the page that can be mapped via mmap
|
|
*/
|
|
struct perf_event_mmap_page {
|
|
__u32 version; /* version number of this structure */
|
|
__u32 compat_version; /* lowest version this is compat with */
|
|
|
|
/*
|
|
* Bits needed to read the hw events in user-space.
|
|
*
|
|
* u32 seq;
|
|
* s64 count;
|
|
*
|
|
* do {
|
|
* seq = pc->lock;
|
|
*
|
|
* barrier()
|
|
* if (pc->index) {
|
|
* count = pmc_read(pc->index - 1);
|
|
* count += pc->offset;
|
|
* } else
|
|
* goto regular_read;
|
|
*
|
|
* barrier();
|
|
* } while (pc->lock != seq);
|
|
*
|
|
* NOTE: for obvious reason this only works on self-monitoring
|
|
* processes.
|
|
*/
|
|
__u32 lock; /* seqlock for synchronization */
|
|
__u32 index; /* hardware event identifier */
|
|
__s64 offset; /* add to hardware event value */
|
|
__u64 time_enabled; /* time event active */
|
|
__u64 time_running; /* time event on cpu */
|
|
|
|
/*
|
|
* Hole for extension of the self monitor capabilities
|
|
*/
|
|
|
|
__u64 __reserved[123]; /* align to 1k */
|
|
|
|
/*
|
|
* Control data for the mmap() data buffer.
|
|
*
|
|
* User-space reading the @data_head value should issue an rmb(), on
|
|
* SMP capable platforms, after reading this value -- see
|
|
* perf_event_wakeup().
|
|
*
|
|
* When the mapping is PROT_WRITE the @data_tail value should be
|
|
* written by userspace to reflect the last read data. In this case
|
|
* the kernel will not over-write unread data.
|
|
*/
|
|
__u64 data_head; /* head in the data section */
|
|
__u64 data_tail; /* user-space written tail */
|
|
};
|
|
|
|
#define PERF_RECORD_MISC_CPUMODE_MASK (3 << 0)
|
|
#define PERF_RECORD_MISC_CPUMODE_UNKNOWN (0 << 0)
|
|
#define PERF_RECORD_MISC_KERNEL (1 << 0)
|
|
#define PERF_RECORD_MISC_USER (2 << 0)
|
|
#define PERF_RECORD_MISC_HYPERVISOR (3 << 0)
|
|
|
|
struct perf_event_header {
|
|
__u32 type;
|
|
__u16 misc;
|
|
__u16 size;
|
|
};
|
|
|
|
enum perf_event_type {
|
|
|
|
/*
|
|
* The MMAP events record the PROT_EXEC mappings so that we can
|
|
* correlate userspace IPs to code. They have the following structure:
|
|
*
|
|
* struct {
|
|
* struct perf_event_header header;
|
|
*
|
|
* u32 pid, tid;
|
|
* u64 addr;
|
|
* u64 len;
|
|
* u64 pgoff;
|
|
* char filename[];
|
|
* };
|
|
*/
|
|
PERF_RECORD_MMAP = 1,
|
|
|
|
/*
|
|
* struct {
|
|
* struct perf_event_header header;
|
|
* u64 id;
|
|
* u64 lost;
|
|
* };
|
|
*/
|
|
PERF_RECORD_LOST = 2,
|
|
|
|
/*
|
|
* struct {
|
|
* struct perf_event_header header;
|
|
*
|
|
* u32 pid, tid;
|
|
* char comm[];
|
|
* };
|
|
*/
|
|
PERF_RECORD_COMM = 3,
|
|
|
|
/*
|
|
* struct {
|
|
* struct perf_event_header header;
|
|
* u32 pid, ppid;
|
|
* u32 tid, ptid;
|
|
* u64 time;
|
|
* };
|
|
*/
|
|
PERF_RECORD_EXIT = 4,
|
|
|
|
/*
|
|
* struct {
|
|
* struct perf_event_header header;
|
|
* u64 time;
|
|
* u64 id;
|
|
* u64 stream_id;
|
|
* };
|
|
*/
|
|
PERF_RECORD_THROTTLE = 5,
|
|
PERF_RECORD_UNTHROTTLE = 6,
|
|
|
|
/*
|
|
* struct {
|
|
* struct perf_event_header header;
|
|
* u32 pid, ppid;
|
|
* u32 tid, ptid;
|
|
* u64 time;
|
|
* };
|
|
*/
|
|
PERF_RECORD_FORK = 7,
|
|
|
|
/*
|
|
* struct {
|
|
* struct perf_event_header header;
|
|
* u32 pid, tid;
|
|
*
|
|
* struct read_format values;
|
|
* };
|
|
*/
|
|
PERF_RECORD_READ = 8,
|
|
|
|
/*
|
|
* struct {
|
|
* struct perf_event_header header;
|
|
*
|
|
* { u64 ip; } && PERF_SAMPLE_IP
|
|
* { u32 pid, tid; } && PERF_SAMPLE_TID
|
|
* { u64 time; } && PERF_SAMPLE_TIME
|
|
* { u64 addr; } && PERF_SAMPLE_ADDR
|
|
* { u64 id; } && PERF_SAMPLE_ID
|
|
* { u64 stream_id;} && PERF_SAMPLE_STREAM_ID
|
|
* { u32 cpu, res; } && PERF_SAMPLE_CPU
|
|
* { u64 period; } && PERF_SAMPLE_PERIOD
|
|
*
|
|
* { struct read_format values; } && PERF_SAMPLE_READ
|
|
*
|
|
* { u64 nr,
|
|
* u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN
|
|
*
|
|
* #
|
|
* # The RAW record below is opaque data wrt the ABI
|
|
* #
|
|
* # That is, the ABI doesn't make any promises wrt to
|
|
* # the stability of its content, it may vary depending
|
|
* # on event, hardware, kernel version and phase of
|
|
* # the moon.
|
|
* #
|
|
* # In other words, PERF_SAMPLE_RAW contents are not an ABI.
|
|
* #
|
|
*
|
|
* { u32 size;
|
|
* char data[size];}&& PERF_SAMPLE_RAW
|
|
* };
|
|
*/
|
|
PERF_RECORD_SAMPLE = 9,
|
|
|
|
PERF_RECORD_MAX, /* non-ABI */
|
|
};
|
|
|
|
enum perf_callchain_context {
|
|
PERF_CONTEXT_HV = (__u64)-32,
|
|
PERF_CONTEXT_KERNEL = (__u64)-128,
|
|
PERF_CONTEXT_USER = (__u64)-512,
|
|
|
|
PERF_CONTEXT_GUEST = (__u64)-2048,
|
|
PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176,
|
|
PERF_CONTEXT_GUEST_USER = (__u64)-2560,
|
|
|
|
PERF_CONTEXT_MAX = (__u64)-4095,
|
|
};
|
|
|
|
#define PERF_FLAG_FD_NO_GROUP (1U << 0)
|
|
#define PERF_FLAG_FD_OUTPUT (1U << 1)
|
|
|
|
#ifdef __KERNEL__
|
|
/*
|
|
* Kernel-internal data types and definitions:
|
|
*/
|
|
|
|
#ifdef CONFIG_PERF_EVENTS
|
|
# include <asm/perf_event.h>
|
|
#endif
|
|
|
|
#ifdef CONFIG_HAVE_HW_BREAKPOINT
|
|
#include <asm/hw_breakpoint.h>
|
|
#endif
|
|
|
|
#include <linux/list.h>
|
|
#include <linux/mutex.h>
|
|
#include <linux/rculist.h>
|
|
#include <linux/rcupdate.h>
|
|
#include <linux/spinlock.h>
|
|
#include <linux/hrtimer.h>
|
|
#include <linux/fs.h>
|
|
#include <linux/pid_namespace.h>
|
|
#include <linux/workqueue.h>
|
|
#include <linux/ftrace.h>
|
|
#include <linux/cpu.h>
|
|
#include <asm/atomic.h>
|
|
|
|
#define PERF_MAX_STACK_DEPTH 255
|
|
|
|
struct perf_callchain_entry {
|
|
__u64 nr;
|
|
__u64 ip[PERF_MAX_STACK_DEPTH];
|
|
};
|
|
|
|
struct perf_raw_record {
|
|
u32 size;
|
|
void *data;
|
|
};
|
|
|
|
struct task_struct;
|
|
|
|
/**
|
|
* struct hw_perf_event - performance event hardware details:
|
|
*/
|
|
struct hw_perf_event {
|
|
#ifdef CONFIG_PERF_EVENTS
|
|
union {
|
|
struct { /* hardware */
|
|
u64 config;
|
|
u64 last_tag;
|
|
unsigned long config_base;
|
|
unsigned long event_base;
|
|
int idx;
|
|
int last_cpu;
|
|
};
|
|
struct { /* software */
|
|
s64 remaining;
|
|
struct hrtimer hrtimer;
|
|
};
|
|
#ifdef CONFIG_HAVE_HW_BREAKPOINT
|
|
/* breakpoint */
|
|
struct arch_hw_breakpoint info;
|
|
#endif
|
|
};
|
|
atomic64_t prev_count;
|
|
u64 sample_period;
|
|
u64 last_period;
|
|
atomic64_t period_left;
|
|
u64 interrupts;
|
|
|
|
u64 freq_time_stamp;
|
|
u64 freq_count_stamp;
|
|
#endif
|
|
};
|
|
|
|
struct perf_event;
|
|
|
|
/**
|
|
* struct pmu - generic performance monitoring unit
|
|
*/
|
|
struct pmu {
|
|
int (*enable) (struct perf_event *event);
|
|
void (*disable) (struct perf_event *event);
|
|
int (*start) (struct perf_event *event);
|
|
void (*stop) (struct perf_event *event);
|
|
void (*read) (struct perf_event *event);
|
|
void (*unthrottle) (struct perf_event *event);
|
|
};
|
|
|
|
/**
|
|
* enum perf_event_active_state - the states of a event
|
|
*/
|
|
enum perf_event_active_state {
|
|
PERF_EVENT_STATE_ERROR = -2,
|
|
PERF_EVENT_STATE_OFF = -1,
|
|
PERF_EVENT_STATE_INACTIVE = 0,
|
|
PERF_EVENT_STATE_ACTIVE = 1,
|
|
};
|
|
|
|
struct file;
|
|
|
|
struct perf_mmap_data {
|
|
struct rcu_head rcu_head;
|
|
#ifdef CONFIG_PERF_USE_VMALLOC
|
|
struct work_struct work;
|
|
#endif
|
|
int data_order;
|
|
int nr_pages; /* nr of data pages */
|
|
int writable; /* are we writable */
|
|
int nr_locked; /* nr pages mlocked */
|
|
|
|
atomic_t poll; /* POLL_ for wakeups */
|
|
atomic_t events; /* event_id limit */
|
|
|
|
atomic_long_t head; /* write position */
|
|
atomic_long_t done_head; /* completed head */
|
|
|
|
atomic_t lock; /* concurrent writes */
|
|
atomic_t wakeup; /* needs a wakeup */
|
|
atomic_t lost; /* nr records lost */
|
|
|
|
long watermark; /* wakeup watermark */
|
|
|
|
struct perf_event_mmap_page *user_page;
|
|
void *data_pages[0];
|
|
};
|
|
|
|
struct perf_pending_entry {
|
|
struct perf_pending_entry *next;
|
|
void (*func)(struct perf_pending_entry *);
|
|
};
|
|
|
|
struct perf_sample_data;
|
|
|
|
typedef void (*perf_overflow_handler_t)(struct perf_event *, int,
|
|
struct perf_sample_data *,
|
|
struct pt_regs *regs);
|
|
|
|
enum perf_group_flag {
|
|
PERF_GROUP_SOFTWARE = 0x1,
|
|
};
|
|
|
|
/**
|
|
* struct perf_event - performance event kernel representation:
|
|
*/
|
|
struct perf_event {
|
|
#ifdef CONFIG_PERF_EVENTS
|
|
struct list_head group_entry;
|
|
struct list_head event_entry;
|
|
struct list_head sibling_list;
|
|
int nr_siblings;
|
|
int group_flags;
|
|
struct perf_event *group_leader;
|
|
struct perf_event *output;
|
|
const struct pmu *pmu;
|
|
|
|
enum perf_event_active_state state;
|
|
atomic64_t count;
|
|
|
|
/*
|
|
* These are the total time in nanoseconds that the event
|
|
* has been enabled (i.e. eligible to run, and the task has
|
|
* been scheduled in, if this is a per-task event)
|
|
* and running (scheduled onto the CPU), respectively.
|
|
*
|
|
* They are computed from tstamp_enabled, tstamp_running and
|
|
* tstamp_stopped when the event is in INACTIVE or ACTIVE state.
|
|
*/
|
|
u64 total_time_enabled;
|
|
u64 total_time_running;
|
|
|
|
/*
|
|
* These are timestamps used for computing total_time_enabled
|
|
* and total_time_running when the event is in INACTIVE or
|
|
* ACTIVE state, measured in nanoseconds from an arbitrary point
|
|
* in time.
|
|
* tstamp_enabled: the notional time when the event was enabled
|
|
* tstamp_running: the notional time when the event was scheduled on
|
|
* tstamp_stopped: in INACTIVE state, the notional time when the
|
|
* event was scheduled off.
|
|
*/
|
|
u64 tstamp_enabled;
|
|
u64 tstamp_running;
|
|
u64 tstamp_stopped;
|
|
|
|
struct perf_event_attr attr;
|
|
struct hw_perf_event hw;
|
|
|
|
struct perf_event_context *ctx;
|
|
struct file *filp;
|
|
|
|
/*
|
|
* These accumulate total time (in nanoseconds) that children
|
|
* events have been enabled and running, respectively.
|
|
*/
|
|
atomic64_t child_total_time_enabled;
|
|
atomic64_t child_total_time_running;
|
|
|
|
/*
|
|
* Protect attach/detach and child_list:
|
|
*/
|
|
struct mutex child_mutex;
|
|
struct list_head child_list;
|
|
struct perf_event *parent;
|
|
|
|
int oncpu;
|
|
int cpu;
|
|
|
|
struct list_head owner_entry;
|
|
struct task_struct *owner;
|
|
|
|
/* mmap bits */
|
|
struct mutex mmap_mutex;
|
|
atomic_t mmap_count;
|
|
struct perf_mmap_data *data;
|
|
|
|
/* poll related */
|
|
wait_queue_head_t waitq;
|
|
struct fasync_struct *fasync;
|
|
|
|
/* delayed work for NMIs and such */
|
|
int pending_wakeup;
|
|
int pending_kill;
|
|
int pending_disable;
|
|
struct perf_pending_entry pending;
|
|
|
|
atomic_t event_limit;
|
|
|
|
void (*destroy)(struct perf_event *);
|
|
struct rcu_head rcu_head;
|
|
|
|
struct pid_namespace *ns;
|
|
u64 id;
|
|
|
|
perf_overflow_handler_t overflow_handler;
|
|
|
|
#ifdef CONFIG_EVENT_TRACING
|
|
struct event_filter *filter;
|
|
#endif
|
|
|
|
#endif /* CONFIG_PERF_EVENTS */
|
|
};
|
|
|
|
/**
|
|
* struct perf_event_context - event context structure
|
|
*
|
|
* Used as a container for task events and CPU events as well:
|
|
*/
|
|
struct perf_event_context {
|
|
/*
|
|
* Protect the states of the events in the list,
|
|
* nr_active, and the list:
|
|
*/
|
|
raw_spinlock_t lock;
|
|
/*
|
|
* Protect the list of events. Locking either mutex or lock
|
|
* is sufficient to ensure the list doesn't change; to change
|
|
* the list you need to lock both the mutex and the spinlock.
|
|
*/
|
|
struct mutex mutex;
|
|
|
|
struct list_head pinned_groups;
|
|
struct list_head flexible_groups;
|
|
struct list_head event_list;
|
|
int nr_events;
|
|
int nr_active;
|
|
int is_active;
|
|
int nr_stat;
|
|
atomic_t refcount;
|
|
struct task_struct *task;
|
|
|
|
/*
|
|
* Context clock, runs when context enabled.
|
|
*/
|
|
u64 time;
|
|
u64 timestamp;
|
|
|
|
/*
|
|
* These fields let us detect when two contexts have both
|
|
* been cloned (inherited) from a common ancestor.
|
|
*/
|
|
struct perf_event_context *parent_ctx;
|
|
u64 parent_gen;
|
|
u64 generation;
|
|
int pin_count;
|
|
struct rcu_head rcu_head;
|
|
};
|
|
|
|
/**
|
|
* struct perf_event_cpu_context - per cpu event context structure
|
|
*/
|
|
struct perf_cpu_context {
|
|
struct perf_event_context ctx;
|
|
struct perf_event_context *task_ctx;
|
|
int active_oncpu;
|
|
int max_pertask;
|
|
int exclusive;
|
|
|
|
/*
|
|
* Recursion avoidance:
|
|
*
|
|
* task, softirq, irq, nmi context
|
|
*/
|
|
int recursion[4];
|
|
};
|
|
|
|
struct perf_output_handle {
|
|
struct perf_event *event;
|
|
struct perf_mmap_data *data;
|
|
unsigned long head;
|
|
unsigned long offset;
|
|
int nmi;
|
|
int sample;
|
|
int locked;
|
|
};
|
|
|
|
#ifdef CONFIG_PERF_EVENTS
|
|
|
|
/*
|
|
* Set by architecture code:
|
|
*/
|
|
extern int perf_max_events;
|
|
|
|
extern const struct pmu *hw_perf_event_init(struct perf_event *event);
|
|
|
|
extern void perf_event_task_sched_in(struct task_struct *task);
|
|
extern void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next);
|
|
extern void perf_event_task_tick(struct task_struct *task);
|
|
extern int perf_event_init_task(struct task_struct *child);
|
|
extern void perf_event_exit_task(struct task_struct *child);
|
|
extern void perf_event_free_task(struct task_struct *task);
|
|
extern void set_perf_event_pending(void);
|
|
extern void perf_event_do_pending(void);
|
|
extern void perf_event_print_debug(void);
|
|
extern void __perf_disable(void);
|
|
extern bool __perf_enable(void);
|
|
extern void perf_disable(void);
|
|
extern void perf_enable(void);
|
|
extern int perf_event_task_disable(void);
|
|
extern int perf_event_task_enable(void);
|
|
extern int hw_perf_group_sched_in(struct perf_event *group_leader,
|
|
struct perf_cpu_context *cpuctx,
|
|
struct perf_event_context *ctx);
|
|
extern void perf_event_update_userpage(struct perf_event *event);
|
|
extern int perf_event_release_kernel(struct perf_event *event);
|
|
extern struct perf_event *
|
|
perf_event_create_kernel_counter(struct perf_event_attr *attr,
|
|
int cpu,
|
|
pid_t pid,
|
|
perf_overflow_handler_t callback);
|
|
extern u64 perf_event_read_value(struct perf_event *event,
|
|
u64 *enabled, u64 *running);
|
|
|
|
struct perf_sample_data {
|
|
u64 type;
|
|
|
|
u64 ip;
|
|
struct {
|
|
u32 pid;
|
|
u32 tid;
|
|
} tid_entry;
|
|
u64 time;
|
|
u64 addr;
|
|
u64 id;
|
|
u64 stream_id;
|
|
struct {
|
|
u32 cpu;
|
|
u32 reserved;
|
|
} cpu_entry;
|
|
u64 period;
|
|
struct perf_callchain_entry *callchain;
|
|
struct perf_raw_record *raw;
|
|
};
|
|
|
|
static inline
|
|
void perf_sample_data_init(struct perf_sample_data *data, u64 addr)
|
|
{
|
|
data->addr = addr;
|
|
data->raw = NULL;
|
|
}
|
|
|
|
extern void perf_output_sample(struct perf_output_handle *handle,
|
|
struct perf_event_header *header,
|
|
struct perf_sample_data *data,
|
|
struct perf_event *event);
|
|
extern void perf_prepare_sample(struct perf_event_header *header,
|
|
struct perf_sample_data *data,
|
|
struct perf_event *event,
|
|
struct pt_regs *regs);
|
|
|
|
extern int perf_event_overflow(struct perf_event *event, int nmi,
|
|
struct perf_sample_data *data,
|
|
struct pt_regs *regs);
|
|
|
|
/*
|
|
* Return 1 for a software event, 0 for a hardware event
|
|
*/
|
|
static inline int is_software_event(struct perf_event *event)
|
|
{
|
|
switch (event->attr.type) {
|
|
case PERF_TYPE_SOFTWARE:
|
|
case PERF_TYPE_TRACEPOINT:
|
|
/* for now the breakpoint stuff also works as software event */
|
|
case PERF_TYPE_BREAKPOINT:
|
|
return 1;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
extern atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
|
|
|
|
extern void __perf_sw_event(u32, u64, int, struct pt_regs *, u64);
|
|
|
|
extern void
|
|
perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip);
|
|
|
|
/*
|
|
* Take a snapshot of the regs. Skip ip and frame pointer to
|
|
* the nth caller. We only need a few of the regs:
|
|
* - ip for PERF_SAMPLE_IP
|
|
* - cs for user_mode() tests
|
|
* - bp for callchains
|
|
* - eflags, for future purposes, just in case
|
|
*/
|
|
static inline void perf_fetch_caller_regs(struct pt_regs *regs, int skip)
|
|
{
|
|
unsigned long ip;
|
|
|
|
memset(regs, 0, sizeof(*regs));
|
|
|
|
switch (skip) {
|
|
case 1 :
|
|
ip = CALLER_ADDR0;
|
|
break;
|
|
case 2 :
|
|
ip = CALLER_ADDR1;
|
|
break;
|
|
case 3 :
|
|
ip = CALLER_ADDR2;
|
|
break;
|
|
case 4:
|
|
ip = CALLER_ADDR3;
|
|
break;
|
|
/* No need to support further for now */
|
|
default:
|
|
ip = 0;
|
|
}
|
|
|
|
return perf_arch_fetch_caller_regs(regs, ip, skip);
|
|
}
|
|
|
|
static inline void
|
|
perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
|
|
{
|
|
if (atomic_read(&perf_swevent_enabled[event_id])) {
|
|
struct pt_regs hot_regs;
|
|
|
|
if (!regs) {
|
|
perf_fetch_caller_regs(&hot_regs, 1);
|
|
regs = &hot_regs;
|
|
}
|
|
__perf_sw_event(event_id, nr, nmi, regs, addr);
|
|
}
|
|
}
|
|
|
|
extern void __perf_event_mmap(struct vm_area_struct *vma);
|
|
|
|
static inline void perf_event_mmap(struct vm_area_struct *vma)
|
|
{
|
|
if (vma->vm_flags & VM_EXEC)
|
|
__perf_event_mmap(vma);
|
|
}
|
|
|
|
extern void perf_event_comm(struct task_struct *tsk);
|
|
extern void perf_event_fork(struct task_struct *tsk);
|
|
|
|
extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
|
|
|
|
extern int sysctl_perf_event_paranoid;
|
|
extern int sysctl_perf_event_mlock;
|
|
extern int sysctl_perf_event_sample_rate;
|
|
|
|
static inline bool perf_paranoid_tracepoint_raw(void)
|
|
{
|
|
return sysctl_perf_event_paranoid > -1;
|
|
}
|
|
|
|
static inline bool perf_paranoid_cpu(void)
|
|
{
|
|
return sysctl_perf_event_paranoid > 0;
|
|
}
|
|
|
|
static inline bool perf_paranoid_kernel(void)
|
|
{
|
|
return sysctl_perf_event_paranoid > 1;
|
|
}
|
|
|
|
extern void perf_event_init(void);
|
|
extern void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
|
|
int entry_size, struct pt_regs *regs);
|
|
extern void perf_bp_event(struct perf_event *event, void *data);
|
|
|
|
#ifndef perf_misc_flags
|
|
#define perf_misc_flags(regs) (user_mode(regs) ? PERF_RECORD_MISC_USER : \
|
|
PERF_RECORD_MISC_KERNEL)
|
|
#define perf_instruction_pointer(regs) instruction_pointer(regs)
|
|
#endif
|
|
|
|
extern int perf_output_begin(struct perf_output_handle *handle,
|
|
struct perf_event *event, unsigned int size,
|
|
int nmi, int sample);
|
|
extern void perf_output_end(struct perf_output_handle *handle);
|
|
extern void perf_output_copy(struct perf_output_handle *handle,
|
|
const void *buf, unsigned int len);
|
|
extern int perf_swevent_get_recursion_context(void);
|
|
extern void perf_swevent_put_recursion_context(int rctx);
|
|
extern void perf_event_enable(struct perf_event *event);
|
|
extern void perf_event_disable(struct perf_event *event);
|
|
#else
|
|
static inline void
|
|
perf_event_task_sched_in(struct task_struct *task) { }
|
|
static inline void
|
|
perf_event_task_sched_out(struct task_struct *task,
|
|
struct task_struct *next) { }
|
|
static inline void
|
|
perf_event_task_tick(struct task_struct *task) { }
|
|
static inline int perf_event_init_task(struct task_struct *child) { return 0; }
|
|
static inline void perf_event_exit_task(struct task_struct *child) { }
|
|
static inline void perf_event_free_task(struct task_struct *task) { }
|
|
static inline void perf_event_do_pending(void) { }
|
|
static inline void perf_event_print_debug(void) { }
|
|
static inline void perf_disable(void) { }
|
|
static inline void perf_enable(void) { }
|
|
static inline int perf_event_task_disable(void) { return -EINVAL; }
|
|
static inline int perf_event_task_enable(void) { return -EINVAL; }
|
|
|
|
static inline void
|
|
perf_sw_event(u32 event_id, u64 nr, int nmi,
|
|
struct pt_regs *regs, u64 addr) { }
|
|
static inline void
|
|
perf_bp_event(struct perf_event *event, void *data) { }
|
|
|
|
static inline void perf_event_mmap(struct vm_area_struct *vma) { }
|
|
static inline void perf_event_comm(struct task_struct *tsk) { }
|
|
static inline void perf_event_fork(struct task_struct *tsk) { }
|
|
static inline void perf_event_init(void) { }
|
|
static inline int perf_swevent_get_recursion_context(void) { return -1; }
|
|
static inline void perf_swevent_put_recursion_context(int rctx) { }
|
|
static inline void perf_event_enable(struct perf_event *event) { }
|
|
static inline void perf_event_disable(struct perf_event *event) { }
|
|
#endif
|
|
|
|
#define perf_output_put(handle, x) \
|
|
perf_output_copy((handle), &(x), sizeof(x))
|
|
|
|
/*
|
|
* This has to have a higher priority than migration_notifier in sched.c.
|
|
*/
|
|
#define perf_cpu_notifier(fn) \
|
|
do { \
|
|
static struct notifier_block fn##_nb __cpuinitdata = \
|
|
{ .notifier_call = fn, .priority = 20 }; \
|
|
fn(&fn##_nb, (unsigned long)CPU_UP_PREPARE, \
|
|
(void *)(unsigned long)smp_processor_id()); \
|
|
fn(&fn##_nb, (unsigned long)CPU_STARTING, \
|
|
(void *)(unsigned long)smp_processor_id()); \
|
|
fn(&fn##_nb, (unsigned long)CPU_ONLINE, \
|
|
(void *)(unsigned long)smp_processor_id()); \
|
|
register_cpu_notifier(&fn##_nb); \
|
|
} while (0)
|
|
|
|
#endif /* __KERNEL__ */
|
|
#endif /* _LINUX_PERF_EVENT_H */
|