2012-10-13 16:46:48 +07:00
|
|
|
/*
|
|
|
|
* Performance events:
|
|
|
|
*
|
|
|
|
* Copyright (C) 2008-2009, Thomas Gleixner <tglx@linutronix.de>
|
|
|
|
* Copyright (C) 2008-2011, Red Hat, Inc., Ingo Molnar
|
|
|
|
* Copyright (C) 2008-2011, Red Hat, Inc., Peter Zijlstra
|
|
|
|
*
|
|
|
|
* Data type definitions, declarations, prototypes.
|
|
|
|
*
|
|
|
|
* Started by: Thomas Gleixner and Ingo Molnar
|
|
|
|
*
|
|
|
|
* For licencing details see kernel-base/COPYING
|
|
|
|
*/
|
|
|
|
#ifndef _UAPI_LINUX_PERF_EVENT_H
|
|
|
|
#define _UAPI_LINUX_PERF_EVENT_H
|
|
|
|
|
|
|
|
#include <linux/types.h>
|
|
|
|
#include <linux/ioctl.h>
|
|
|
|
#include <asm/byteorder.h>
|
|
|
|
|
|
|
|
/*
|
|
|
|
* User-space ABI bits:
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
 * attr.type
 *
 * Major event class, stored in perf_event_attr.type.  Every value except
 * PERF_TYPE_MAX is part of the user-space ABI.
 */
enum perf_type_id {
	PERF_TYPE_HARDWARE			= 0,
	PERF_TYPE_SOFTWARE			= 1,
	PERF_TYPE_TRACEPOINT			= 2,
	PERF_TYPE_HW_CACHE			= 3,
	PERF_TYPE_RAW				= 4,
	PERF_TYPE_BREAKPOINT			= 5,

	PERF_TYPE_MAX,				/* non-ABI */
};
|
|
|
|
|
|
|
|
/*
 * Generalized performance event event_id types, used by the
 * attr.event_id parameter of the sys_perf_event_open()
 * syscall:
 */
enum perf_hw_id {
	/*
	 * Common hardware events, generalized by the kernel:
	 */
	PERF_COUNT_HW_CPU_CYCLES		= 0,
	PERF_COUNT_HW_INSTRUCTIONS		= 1,
	PERF_COUNT_HW_CACHE_REFERENCES		= 2,
	PERF_COUNT_HW_CACHE_MISSES		= 3,
	PERF_COUNT_HW_BRANCH_INSTRUCTIONS	= 4,
	PERF_COUNT_HW_BRANCH_MISSES		= 5,
	PERF_COUNT_HW_BUS_CYCLES		= 6,
	PERF_COUNT_HW_STALLED_CYCLES_FRONTEND	= 7,
	PERF_COUNT_HW_STALLED_CYCLES_BACKEND	= 8,
	PERF_COUNT_HW_REF_CPU_CYCLES		= 9,

	PERF_COUNT_HW_MAX,			/* non-ABI */
};
|
|
|
|
|
|
|
|
/*
 * Generalized hardware cache events:
 *
 *       { L1-D, L1-I, LLC, ITLB, DTLB, BPU, NODE } x
 *       { read, write, prefetch } x
 *       { accesses, misses }
 *
 * This enum names the first factor (which cache); the operation and the
 * result are named by perf_hw_cache_op_id and perf_hw_cache_op_result_id.
 */
enum perf_hw_cache_id {
	PERF_COUNT_HW_CACHE_L1D			= 0,
	PERF_COUNT_HW_CACHE_L1I			= 1,
	PERF_COUNT_HW_CACHE_LL			= 2,
	PERF_COUNT_HW_CACHE_DTLB		= 3,
	PERF_COUNT_HW_CACHE_ITLB		= 4,
	PERF_COUNT_HW_CACHE_BPU			= 5,
	PERF_COUNT_HW_CACHE_NODE		= 6,

	PERF_COUNT_HW_CACHE_MAX,		/* non-ABI */
};
|
|
|
|
|
|
|
|
/* Operation component of a generalized hardware cache event. */
enum perf_hw_cache_op_id {
	PERF_COUNT_HW_CACHE_OP_READ		= 0,
	PERF_COUNT_HW_CACHE_OP_WRITE		= 1,
	PERF_COUNT_HW_CACHE_OP_PREFETCH		= 2,

	PERF_COUNT_HW_CACHE_OP_MAX,		/* non-ABI */
};
|
|
|
|
|
|
|
|
/* Result component (access or miss) of a generalized hardware cache event. */
enum perf_hw_cache_op_result_id {
	PERF_COUNT_HW_CACHE_RESULT_ACCESS	= 0,
	PERF_COUNT_HW_CACHE_RESULT_MISS		= 1,

	PERF_COUNT_HW_CACHE_RESULT_MAX,		/* non-ABI */
};
|
|
|
|
|
|
|
|
/*
 * Special "software" events provided by the kernel, even if the hardware
 * does not support performance events. These events measure various
 * physical and sw events of the kernel (and allow the profiling of them as
 * well):
 */
enum perf_sw_ids {
	PERF_COUNT_SW_CPU_CLOCK			= 0,
	PERF_COUNT_SW_TASK_CLOCK		= 1,
	PERF_COUNT_SW_PAGE_FAULTS		= 2,
	PERF_COUNT_SW_CONTEXT_SWITCHES		= 3,
	PERF_COUNT_SW_CPU_MIGRATIONS		= 4,
	PERF_COUNT_SW_PAGE_FAULTS_MIN		= 5,
	PERF_COUNT_SW_PAGE_FAULTS_MAJ		= 6,
	PERF_COUNT_SW_ALIGNMENT_FAULTS		= 7,
	PERF_COUNT_SW_EMULATION_FAULTS		= 8,
	PERF_COUNT_SW_DUMMY			= 9,
	PERF_COUNT_SW_BPF_OUTPUT		= 10,

	PERF_COUNT_SW_MAX,			/* non-ABI */
};
|
|
|
|
|
|
|
|
/*
 * Bits that can be set in attr.sample_type to request information
 * in the overflow packets.
 *
 * Each enumerator is a distinct single bit; PERF_SAMPLE_MAX is the first
 * unused bit and is not part of the ABI.
 */
enum perf_event_sample_format {
	PERF_SAMPLE_IP				= 1U << 0,
	PERF_SAMPLE_TID				= 1U << 1,
	PERF_SAMPLE_TIME			= 1U << 2,
	PERF_SAMPLE_ADDR			= 1U << 3,
	PERF_SAMPLE_READ			= 1U << 4,
	PERF_SAMPLE_CALLCHAIN			= 1U << 5,
	PERF_SAMPLE_ID				= 1U << 6,
	PERF_SAMPLE_CPU				= 1U << 7,
	PERF_SAMPLE_PERIOD			= 1U << 8,
	PERF_SAMPLE_STREAM_ID			= 1U << 9,
	PERF_SAMPLE_RAW				= 1U << 10,
	PERF_SAMPLE_BRANCH_STACK		= 1U << 11,
	PERF_SAMPLE_REGS_USER			= 1U << 12,
	PERF_SAMPLE_STACK_USER			= 1U << 13,
	PERF_SAMPLE_WEIGHT			= 1U << 14,
	PERF_SAMPLE_DATA_SRC			= 1U << 15,
	PERF_SAMPLE_IDENTIFIER			= 1U << 16,
	PERF_SAMPLE_TRANSACTION			= 1U << 17,
	PERF_SAMPLE_REGS_INTR			= 1U << 18,

	PERF_SAMPLE_MAX = 1U << 19,		/* non-ABI */
};
|
|
|
|
|
|
|
|
/*
 * values to program into branch_sample_type when PERF_SAMPLE_BRANCH_STACK
 * is set in attr.sample_type
 *
 * If the user does not pass priv level information via branch_sample_type,
 * the kernel uses the event's priv level. Branch and event priv levels do
 * not have to match. Branch priv level is checked for permissions.
 *
 * The branch types can be combined, however BRANCH_ANY covers all types
 * of branches and therefore it supersedes all the other types.
 *
 * perf_branch_sample_type_shift holds the bit positions;
 * perf_branch_sample_type holds the matching single-bit masks
 * (1U << <name>_SHIFT).
 */
enum perf_branch_sample_type_shift {
	PERF_SAMPLE_BRANCH_USER_SHIFT		= 0, /* user branches */
	PERF_SAMPLE_BRANCH_KERNEL_SHIFT		= 1, /* kernel branches */
	PERF_SAMPLE_BRANCH_HV_SHIFT		= 2, /* hypervisor branches */

	PERF_SAMPLE_BRANCH_ANY_SHIFT		= 3, /* any branch types */
	PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT	= 4, /* any call branch */
	PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT	= 5, /* any return branch */
	PERF_SAMPLE_BRANCH_IND_CALL_SHIFT	= 6, /* indirect calls */
	PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT	= 7, /* transaction aborts */
	PERF_SAMPLE_BRANCH_IN_TX_SHIFT		= 8, /* in transaction */
	PERF_SAMPLE_BRANCH_NO_TX_SHIFT		= 9, /* not in transaction */
	PERF_SAMPLE_BRANCH_COND_SHIFT		= 10, /* conditional branches */

	PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT	= 11, /* call/ret stack */
	PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT	= 12, /* indirect jumps */
	PERF_SAMPLE_BRANCH_CALL_SHIFT		= 13, /* direct call */

	PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT	= 14, /* no flags */
	PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT	= 15, /* no cycles */

	PERF_SAMPLE_BRANCH_MAX_SHIFT		/* non-ABI */
};

enum perf_branch_sample_type {
	PERF_SAMPLE_BRANCH_USER		= 1U << PERF_SAMPLE_BRANCH_USER_SHIFT,
	PERF_SAMPLE_BRANCH_KERNEL	= 1U << PERF_SAMPLE_BRANCH_KERNEL_SHIFT,
	PERF_SAMPLE_BRANCH_HV		= 1U << PERF_SAMPLE_BRANCH_HV_SHIFT,

	PERF_SAMPLE_BRANCH_ANY		= 1U << PERF_SAMPLE_BRANCH_ANY_SHIFT,
	PERF_SAMPLE_BRANCH_ANY_CALL	= 1U << PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT,
	PERF_SAMPLE_BRANCH_ANY_RETURN	= 1U << PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT,
	PERF_SAMPLE_BRANCH_IND_CALL	= 1U << PERF_SAMPLE_BRANCH_IND_CALL_SHIFT,
	PERF_SAMPLE_BRANCH_ABORT_TX	= 1U << PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT,
	PERF_SAMPLE_BRANCH_IN_TX	= 1U << PERF_SAMPLE_BRANCH_IN_TX_SHIFT,
	PERF_SAMPLE_BRANCH_NO_TX	= 1U << PERF_SAMPLE_BRANCH_NO_TX_SHIFT,
	PERF_SAMPLE_BRANCH_COND		= 1U << PERF_SAMPLE_BRANCH_COND_SHIFT,

	PERF_SAMPLE_BRANCH_CALL_STACK	= 1U << PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT,
	PERF_SAMPLE_BRANCH_IND_JUMP	= 1U << PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT,
	PERF_SAMPLE_BRANCH_CALL		= 1U << PERF_SAMPLE_BRANCH_CALL_SHIFT,

	PERF_SAMPLE_BRANCH_NO_FLAGS	= 1U << PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT,
	PERF_SAMPLE_BRANCH_NO_CYCLES	= 1U << PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT,

	PERF_SAMPLE_BRANCH_MAX		= 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT,
};
|
|
|
|
|
|
|
|
/* Mask of all privilege-level bits (user, kernel, hypervisor) of branch_sample_type. */
#define PERF_SAMPLE_BRANCH_PLM_ALL \
	(PERF_SAMPLE_BRANCH_USER|\
	 PERF_SAMPLE_BRANCH_KERNEL|\
	 PERF_SAMPLE_BRANCH_HV)
|
|
|
|
|
|
|
|
/*
 * Values to determine ABI of the registers dump.
 */
enum perf_sample_regs_abi {
	PERF_SAMPLE_REGS_ABI_NONE	= 0,
	PERF_SAMPLE_REGS_ABI_32		= 1,
	PERF_SAMPLE_REGS_ABI_64		= 2,
};
|
|
|
|
|
2013-09-20 21:40:39 +07:00
|
|
|
/*
 * Values for the memory transaction event qualifier, mostly for
 * abort events. Multiple bits can be set.
 *
 * Bits 0..7 are flag bits; the abort code is carried in bits 32..63 and
 * is extracted with PERF_TXN_ABORT_MASK / PERF_TXN_ABORT_SHIFT.
 */
enum {
	PERF_TXN_ELISION        = (1 << 0), /* From elision */
	PERF_TXN_TRANSACTION    = (1 << 1), /* From transaction */
	PERF_TXN_SYNC           = (1 << 2), /* Instruction is related */
	PERF_TXN_ASYNC          = (1 << 3), /* Instruction not related */
	PERF_TXN_RETRY          = (1 << 4), /* Retry possible */
	PERF_TXN_CONFLICT       = (1 << 5), /* Conflict abort */
	PERF_TXN_CAPACITY_WRITE = (1 << 6), /* Capacity write abort */
	PERF_TXN_CAPACITY_READ  = (1 << 7), /* Capacity read abort */

	PERF_TXN_MAX	        = (1 << 8), /* non-ABI */

	/* bits 32..63 are reserved for the abort code */

	PERF_TXN_ABORT_MASK  = (0xffffffffULL << 32),
	PERF_TXN_ABORT_SHIFT = 32,
};
|
|
|
|
|
2012-10-13 16:46:48 +07:00
|
|
|
/*
 * The format of the data returned by read() on a perf event fd,
 * as specified by attr.read_format:
 *
 * struct read_format {
 *	{ u64		value;
 *	  { u64		time_enabled; } && PERF_FORMAT_TOTAL_TIME_ENABLED
 *	  { u64		time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING
 *	  { u64		id;           } && PERF_FORMAT_ID
 *	} && !PERF_FORMAT_GROUP
 *
 *	{ u64		nr;
 *	  { u64		time_enabled; } && PERF_FORMAT_TOTAL_TIME_ENABLED
 *	  { u64		time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING
 *	  { u64		value;
 *	    { u64	id;           } && PERF_FORMAT_ID
 *	  }		cntr[nr];
 *	} && PERF_FORMAT_GROUP
 * };
 */
enum perf_event_read_format {
	PERF_FORMAT_TOTAL_TIME_ENABLED		= 1U << 0,
	PERF_FORMAT_TOTAL_TIME_RUNNING		= 1U << 1,
	PERF_FORMAT_ID				= 1U << 2,
	PERF_FORMAT_GROUP			= 1U << 3,

	PERF_FORMAT_MAX = 1U << 4,		/* non-ABI */
};
|
|
|
|
|
|
|
|
/*
 * Published sizes, in bytes, of struct perf_event_attr; each version
 * notes the field(s) that were appended to reach that size.
 */
#define PERF_ATTR_SIZE_VER0	64	/* sizeof first published struct */
#define PERF_ATTR_SIZE_VER1	72	/* add: config2 */
#define PERF_ATTR_SIZE_VER2	80	/* add: branch_sample_type */
#define PERF_ATTR_SIZE_VER3	96	/* add: sample_regs_user */
					/* add: sample_stack_user */
#define PERF_ATTR_SIZE_VER4	104	/* add: sample_regs_intr */
#define PERF_ATTR_SIZE_VER5	112	/* add: aux_watermark */
|
2012-10-13 16:46:48 +07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Hardware event_id to monitor via a performance monitoring event:
|
2016-04-28 23:16:33 +07:00
|
|
|
*
|
|
|
|
* @sample_max_stack: Max number of frame pointers in a callchain,
|
|
|
|
* should be < /proc/sys/kernel/perf_event_max_stack
|
2012-10-13 16:46:48 +07:00
|
|
|
*/
|
|
|
|
struct perf_event_attr {
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Major type: hardware/software/tracepoint/etc.
|
|
|
|
*/
|
|
|
|
__u32 type;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Size of the attr structure, for fwd/bwd compat.
|
|
|
|
*/
|
|
|
|
__u32 size;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Type specific configuration information.
|
|
|
|
*/
|
|
|
|
__u64 config;
|
|
|
|
|
|
|
|
union {
|
|
|
|
__u64 sample_period;
|
|
|
|
__u64 sample_freq;
|
|
|
|
};
|
|
|
|
|
|
|
|
__u64 sample_type;
|
|
|
|
__u64 read_format;
|
|
|
|
|
|
|
|
__u64 disabled : 1, /* off by default */
|
|
|
|
inherit : 1, /* children inherit it */
|
|
|
|
pinned : 1, /* must always be on PMU */
|
|
|
|
exclusive : 1, /* only group on PMU */
|
|
|
|
exclude_user : 1, /* don't count user */
|
|
|
|
exclude_kernel : 1, /* ditto kernel */
|
|
|
|
exclude_hv : 1, /* ditto hypervisor */
|
|
|
|
exclude_idle : 1, /* don't count when idle */
|
|
|
|
mmap : 1, /* include mmap data */
|
|
|
|
comm : 1, /* include comm data */
|
|
|
|
freq : 1, /* use freq, not period */
|
|
|
|
inherit_stat : 1, /* per task counts */
|
|
|
|
enable_on_exec : 1, /* next exec enables */
|
|
|
|
task : 1, /* trace fork/exit */
|
|
|
|
watermark : 1, /* wakeup_watermark */
|
|
|
|
/*
|
|
|
|
* precise_ip:
|
|
|
|
*
|
|
|
|
* 0 - SAMPLE_IP can have arbitrary skid
|
|
|
|
* 1 - SAMPLE_IP must have constant skid
|
|
|
|
* 2 - SAMPLE_IP requested to have 0 skid
|
|
|
|
* 3 - SAMPLE_IP must have 0 skid
|
|
|
|
*
|
|
|
|
* See also PERF_RECORD_MISC_EXACT_IP
|
|
|
|
*/
|
|
|
|
precise_ip : 2, /* skid constraint */
|
|
|
|
mmap_data : 1, /* non-exec mmap data */
|
|
|
|
sample_id_all : 1, /* sample_type all events */
|
|
|
|
|
|
|
|
exclude_host : 1, /* don't count in host */
|
|
|
|
exclude_guest : 1, /* don't count in guest */
|
|
|
|
|
|
|
|
exclude_callchain_kernel : 1, /* exclude kernel callchains */
|
|
|
|
exclude_callchain_user : 1, /* exclude user callchains */
|
2013-08-21 17:10:24 +07:00
|
|
|
mmap2 : 1, /* include mmap with inode data */
|
2014-05-28 15:45:04 +07:00
|
|
|
comm_exec : 1, /* flag comm events that are due to an exec */
|
2015-02-20 20:05:38 +07:00
|
|
|
use_clockid : 1, /* use @clockid for time fields */
|
2015-07-21 16:44:02 +07:00
|
|
|
context_switch : 1, /* context switch data */
|
perf/core: Add ::write_backward attribute to perf event
This patch introduces 'write_backward' bit to perf_event_attr, which
controls the direction of a ring buffer. After set, the corresponding
ring buffer is written from end to beginning. This feature is design to
support reading from overwritable ring buffer.
Ring buffer can be created by mapping a perf event fd. Kernel puts event
records into ring buffer, user tooling like perf fetch them from
address returned by mmap(). To prevent racing between kernel and tooling,
they communicate to each other through 'head' and 'tail' pointers.
Kernel maintains 'head' pointer, points it to the next free area (tail
of the last record). Tooling maintains 'tail' pointer, points it to the
tail of last consumed record (record has already been fetched). Kernel
determines the available space in a ring buffer using these two
pointers to avoid overwrite unfetched records.
By mapping without 'PROT_WRITE', an overwritable ring buffer is created.
Different from normal ring buffer, tooling is unable to maintain 'tail'
pointer because writing is forbidden. Therefore, for this type of ring
buffers, kernel overwrite old records unconditionally, works like flight
recorder. This feature would be useful if reading from overwritable ring
buffer were as easy as reading from normal ring buffer. However,
there's an obscure problem.
The following figure demonstrates a full overwritable ring buffer. In
this figure, the 'head' pointer points to the end of last record, and a
long record 'E' is pending. For a normal ring buffer, a 'tail' pointer
would have pointed to position (X), so kernel knows there's no more
space in the ring buffer. However, for an overwritable ring buffer,
kernel ignore the 'tail' pointer.
(X) head
. |
. V
+------+-------+----------+------+---+
|A....A|B.....B|C........C|D....D| |
+------+-------+----------+------+---+
Record 'A' is overwritten by event 'E':
head
|
V
+--+---+-------+----------+------+---+
|.E|..A|B.....B|C........C|D....D|E..|
+--+---+-------+----------+------+---+
Now tooling decides to read from this ring buffer. However, none of these
two natural positions, 'head' and the start of this ring buffer, are
pointing to the head of a record. Even the full ring buffer can be
accessed by tooling, it is unable to find a position to start decoding.
The first attempt tries to solve this problem AFAIK can be found from
[1]. It makes kernel to maintain 'tail' pointer: updates it when ring
buffer is half full. However, this approach introduces overhead to
fast path. Test result shows a 1% overhead [2]. In addition, this method
utilizes no more tham 50% records.
Another attempt can be found from [3], which allows putting the size of
an event at the end of each record. This approach allows tooling to find
records in a backward manner from 'head' pointer by reading size of a
record from its tail. However, because of alignment requirement, it
needs 8 bytes to record the size of a record, which is a huge waste. Its
performance is also not good, because more data need to be written.
This approach also introduces some extra branch instructions to fast
path.
'write_backward' is a better solution to this problem.
Following figure demonstrates the state of the overwritable ring buffer
when 'write_backward' is set before overwriting:
head
|
V
+---+------+----------+-------+------+
| |D....D|C........C|B.....B|A....A|
+---+------+----------+-------+------+
and after overwriting:
head
|
V
+---+------+----------+-------+---+--+
|..E|D....D|C........C|B.....B|A..|E.|
+---+------+----------+-------+---+--+
In each situation, 'head' points to the beginning of the newest record.
From this record, tooling can iterate over the full ring buffer and fetch
records one by one.
The only limitation that needs to be considered is back-to-back reading.
Due to the non-deterministic of user programs, it is impossible to ensure
the ring buffer keeps stable during reading. Consider an extreme situation:
tooling is scheduled out after reading record 'D', then a burst of events
come, eat up the whole ring buffer (one or multiple rounds). When the
tooling process comes back, reading after 'D' is incorrect now.
To prevent this problem, we need to find a way to ensure the ring buffer
is stable during reading. ioctl(PERF_EVENT_IOC_PAUSE_OUTPUT) is
suggested because its overhead is lower than
ioctl(PERF_EVENT_IOC_ENABLE).
By carefully verifying 'header' pointer, reader can avoid pausing the
ring-buffer. For example:
/* A union of all possible events */
union perf_event event;
p = head = perf_mmap__read_head();
while (true) {
/* copy header of next event */
fetch(&event.header, p, sizeof(event.header));
/* read 'head' pointer */
head = perf_mmap__read_head();
/* check overwritten: is the header good? */
if (!verify(sizeof(event.header), p, head))
break;
/* copy the whole event */
fetch(&event, p, event.header.size);
/* read 'head' pointer again */
head = perf_mmap__read_head();
/* is the whole event good? */
if (!verify(event.header.size, p, head))
break;
p += event.header.size;
}
However, the overhead is high because:
a) In-place decoding is not safe.
Copying-verifying-decoding is required.
b) Fetching 'head' pointer requires additional synchronization.
(From Alexei Starovoitov:
Even when this trick works, pause is needed for more than stability of
reading. When we collect the events into overwrite buffer we're waiting
for some other trigger (like all cpu utilization spike or just one cpu
running and all others are idle) and when it happens the buffer has
valuable info from the past. At this point new events are no longer
interesting and buffer should be paused, events read and unpaused until
next trigger comes.)
This patch utilizes event's default overflow_handler introduced
previously. perf_event_output_backward() is created as the default
overflow handler for backward ring buffers. To avoid extra overhead to
fast path, original perf_event_output() becomes __perf_event_output()
and marked '__always_inline'. In theory, there's no extra overhead
introduced to fast path.
Performance testing:
Calling 3000000 times of 'close(-1)', use gettimeofday() to check
duration. Use 'perf record -o /dev/null -e raw_syscalls:*' to capture
system calls. In ns.
Testing environment:
CPU : Intel(R) Core(TM) i7-4790 CPU @ 3.60GHz
Kernel : v4.5.0
MEAN STDVAR
BASE 800214.950 2853.083
PRE1 2253846.700 9997.014
PRE2 2257495.540 8516.293
POST 2250896.100 8933.921
Where 'BASE' is pure performance without capturing. 'PRE1' is test
result of pure 'v4.5.0' kernel. 'PRE2' is test result before this
patch. 'POST' is test result after this patch. See [4] for the detailed
experimental setup.
Considering the stdvar, this patch doesn't introduce performance
overhead to the fast path.
[1] http://lkml.iu.edu/hypermail/linux/kernel/1304.1/04584.html
[2] http://lkml.iu.edu/hypermail/linux/kernel/1307.1/00535.html
[3] http://lkml.iu.edu/hypermail/linux/kernel/1512.0/01265.html
[4] http://lkml.kernel.org/g/56F89DCD.1040202@huawei.com
Signed-off-by: Wang Nan <wangnan0@huawei.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Cc: <acme@kernel.org>
Cc: <pi3orama@163.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Brendan Gregg <brendan.d.gregg@gmail.com>
Cc: He Kuang <hekuang@huawei.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: Zefan Li <lizefan@huawei.com>
Link: http://lkml.kernel.org/r/1459865478-53413-1-git-send-email-wangnan0@huawei.com
[ Fixed the changelog some more. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2016-04-05 21:11:18 +07:00
|
|
|
write_backward : 1, /* Write ring buffer from end to beginning */
|
2017-03-08 03:41:36 +07:00
|
|
|
namespaces : 1, /* include namespaces data */
|
|
|
|
__reserved_1 : 35;
|
2012-10-13 16:46:48 +07:00
|
|
|
|
|
|
|
union {
|
|
|
|
__u32 wakeup_events; /* wakeup every n events */
|
|
|
|
__u32 wakeup_watermark; /* bytes before wakeup */
|
|
|
|
};
|
|
|
|
|
|
|
|
__u32 bp_type;
|
|
|
|
union {
|
|
|
|
__u64 bp_addr;
|
|
|
|
__u64 config1; /* extension of config */
|
|
|
|
};
|
|
|
|
union {
|
|
|
|
__u64 bp_len;
|
|
|
|
__u64 config2; /* extension of config1 */
|
|
|
|
};
|
|
|
|
__u64 branch_sample_type; /* enum perf_branch_sample_type */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Defines set of user regs to dump on samples.
|
|
|
|
* See asm/perf_regs.h for details.
|
|
|
|
*/
|
|
|
|
__u64 sample_regs_user;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Defines size of the user stack to dump on samples.
|
|
|
|
*/
|
|
|
|
__u32 sample_stack_user;
|
|
|
|
|
2015-02-20 20:05:38 +07:00
|
|
|
__s32 clockid;
|
2014-09-24 18:48:37 +07:00
|
|
|
/*
|
|
|
|
* Defines set of regs to dump for each sample
|
|
|
|
* state captured on:
|
|
|
|
* - precise = 0: PMU interrupt
|
|
|
|
* - precise > 0: sampled instruction
|
|
|
|
*
|
|
|
|
* See asm/perf_regs.h for details.
|
|
|
|
*/
|
|
|
|
__u64 sample_regs_intr;
|
2015-01-14 19:18:18 +07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Wakeup watermark for AUX area
|
|
|
|
*/
|
|
|
|
__u32 aux_watermark;
|
2016-04-28 23:16:33 +07:00
|
|
|
__u16 sample_max_stack;
|
|
|
|
__u16 __reserved_2; /* align to __u64 */
|
2012-10-13 16:46:48 +07:00
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * Yields (as an lvalue) the __u64 located immediately after
 * attr->read_format.  In struct perf_event_attr that word is the one
 * holding the bitfield flags (disabled, inherit, ...), so this lets the
 * flags be read or written as a single value.  'attr' must be a pointer.
 */
#define perf_flags(attr) (*(&(attr)->read_format + 1))
|
|
|
|
|
|
|
|
/*
 * Ioctls that can be done on a perf event fd:
 *
 * All requests use the '$' ioctl type character; the request numbers and
 * argument types below are user-space ABI.
 */
#define PERF_EVENT_IOC_ENABLE		_IO ('$', 0)
#define PERF_EVENT_IOC_DISABLE		_IO ('$', 1)
#define PERF_EVENT_IOC_REFRESH		_IO ('$', 2)
#define PERF_EVENT_IOC_RESET		_IO ('$', 3)
#define PERF_EVENT_IOC_PERIOD		_IOW('$', 4, __u64)
#define PERF_EVENT_IOC_SET_OUTPUT	_IO ('$', 5)
#define PERF_EVENT_IOC_SET_FILTER	_IOW('$', 6, char *)
#define PERF_EVENT_IOC_ID		_IOR('$', 7, __u64 *)
|
tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
struct perf_event_attr attr = {
.type = PERF_TYPE_TRACEPOINT,
.config = event_id,
...
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wrapper of probe_kernel_read() used to access any
kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-26 02:49:20 +07:00
|
|
|
/* Attach a previously loaded BPF program, given by its fd, to the event. */
#define PERF_EVENT_IOC_SET_BPF		_IOW('$', 8, __u32)
/* Pause output to the event's ring buffer, e.g. to keep it stable while reading. */
#define PERF_EVENT_IOC_PAUSE_OUTPUT	_IOW('$', 9, __u32)
|
2012-10-13 16:46:48 +07:00
|
|
|
|
|
|
|
/* Flag bits for the perf event ioctls. */
enum perf_event_ioc_flags {
	PERF_IOC_FLAG_GROUP		= 1U << 0,
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Structure of the page that can be mapped via mmap
|
|
|
|
*/
|
|
|
|
struct perf_event_mmap_page {
|
|
|
|
__u32 version; /* version number of this structure */
|
|
|
|
__u32 compat_version; /* lowest version this is compat with */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Bits needed to read the hw events in user-space.
|
|
|
|
*
|
2014-10-03 12:16:36 +07:00
|
|
|
* u32 seq, time_mult, time_shift, index, width;
|
2012-10-13 16:46:48 +07:00
|
|
|
* u64 count, enabled, running;
|
|
|
|
* u64 cyc, time_offset;
|
|
|
|
* s64 pmc = 0;
|
|
|
|
*
|
|
|
|
* do {
|
|
|
|
* seq = pc->lock;
|
|
|
|
* barrier();
|
|
|
|
*
|
|
|
|
* enabled = pc->time_enabled;
|
|
|
|
* running = pc->time_running;
|
|
|
|
*
|
|
|
|
* if (pc->cap_user_time && enabled != running) {
|
|
|
|
* cyc = rdtsc();
|
|
|
|
* time_offset = pc->time_offset;
|
|
|
|
* time_mult = pc->time_mult;
|
|
|
|
* time_shift = pc->time_shift;
|
|
|
|
* }
|
|
|
|
*
|
2014-10-03 12:16:36 +07:00
|
|
|
* index = pc->index;
|
2012-10-13 16:46:48 +07:00
|
|
|
* count = pc->offset;
|
2014-10-03 12:16:36 +07:00
|
|
|
* if (pc->cap_user_rdpmc && index) {
|
2012-10-13 16:46:48 +07:00
|
|
|
* width = pc->pmc_width;
|
2014-10-03 12:16:36 +07:00
|
|
|
* pmc = rdpmc(index - 1);
|
2012-10-13 16:46:48 +07:00
|
|
|
* }
|
|
|
|
*
|
|
|
|
* barrier();
|
|
|
|
* } while (pc->lock != seq);
|
|
|
|
*
|
|
|
|
* NOTE: for obvious reason this only works on self-monitoring
|
|
|
|
* processes.
|
|
|
|
*/
|
|
|
|
__u32 lock; /* seqlock for synchronization */
|
|
|
|
__u32 index; /* hardware event identifier */
|
|
|
|
__s64 offset; /* add to hardware event value */
|
|
|
|
__u64 time_enabled; /* time event active */
|
|
|
|
__u64 time_running; /* time event on cpu */
|
|
|
|
union {
|
|
|
|
__u64 capabilities;
|
2013-06-28 20:22:17 +07:00
|
|
|
struct {
|
perf: Fix capabilities bitfield compatibility in 'struct perf_event_mmap_page'
Solve the problems around the broken definition of perf_event_mmap_page::
cap_usr_time and cap_usr_rdpmc fields which used to overlap, partially
fixed by:
860f085b74e9 ("perf: Fix broken union in 'struct perf_event_mmap_page'")
The problem with the fix (merged in v3.12-rc1 and not yet released
officially), noticed by Vince Weaver is that the new behavior is
not detectable by new user-space, and that due to the reuse of the
field names it's easy to mis-compile a binary if old headers are used
on a new kernel or new headers are used on an old kernel.
To solve all that make this change explicit, detectable and self-contained,
by iterating the ABI the following way:
- Always clear bit 0, and rename it to usrpage->cap_bit0, to at least not
confuse old user-space binaries. RDPMC will be marked as unavailable
to old binaries but that's within the ABI, this is a capability bit.
- Rename bit 1 to ->cap_bit0_is_deprecated and always set it to 1, so new
libraries can reliably detect that bit 0 is deprecated and perma-zero
without having to check the kernel version.
- Use bits 2, 3, 4 for the newly defined, correct functionality:
cap_user_rdpmc : 1, /* The RDPMC instruction can be used to read counts */
cap_user_time : 1, /* The time_* fields are used */
cap_user_time_zero : 1, /* The time_zero field is used */
- Rename all the bitfield names in perf_event.h to be different from the
old names, to make sure it's not possible to mis-compile it
accidentally with old assumptions.
The 'size' field can then be used in the future to add new fields and it
will act as a natural ABI version indicator as well.
Also adjust tools/perf/ userspace for the new definitions, noticed by
Adrian Hunter.
Reported-by: Vince Weaver <vincent.weaver@maine.edu>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Also-Fixed-by: Adrian Hunter <adrian.hunter@intel.com>
Link: http://lkml.kernel.org/n/tip-zr03yxjrpXesOzzupszqglbv@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2013-09-19 15:16:42 +07:00
|
|
|
__u64 cap_bit0 : 1, /* Always 0, deprecated, see commit 860f085b74e9 */
|
|
|
|
cap_bit0_is_deprecated : 1, /* Always 1, signals that bit 0 is zero */
|
|
|
|
|
|
|
|
cap_user_rdpmc : 1, /* The RDPMC instruction can be used to read counts */
|
|
|
|
cap_user_time : 1, /* The time_* fields are used */
|
|
|
|
cap_user_time_zero : 1, /* The time_zero field is used */
|
|
|
|
cap_____res : 59;
|
2013-06-28 20:22:17 +07:00
|
|
|
};
|
2012-10-13 16:46:48 +07:00
|
|
|
};
|
|
|
|
|
|
|
|
/*
|
2014-10-03 12:16:36 +07:00
|
|
|
* If cap_user_rdpmc this field provides the bit-width of the value
|
2012-10-13 16:46:48 +07:00
|
|
|
* read using the rdpmc() or equivalent instruction. This can be used
|
|
|
|
* to sign extend the result like:
|
|
|
|
*
|
|
|
|
* pmc <<= 64 - width;
|
|
|
|
* pmc >>= 64 - width; // signed shift right
|
|
|
|
* count += pmc;
|
|
|
|
*/
|
|
|
|
__u16 pmc_width;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If cap_user_time the below fields can be used to compute the time
|
|
|
|
* delta since time_enabled (in ns) using rdtsc or similar.
|
|
|
|
*
|
|
|
|
* u64 quot, rem;
|
|
|
|
* u64 delta;
|
|
|
|
*
|
|
|
|
* quot = (cyc >> time_shift);
|
2015-10-16 20:24:05 +07:00
|
|
|
* rem = cyc & (((u64)1 << time_shift) - 1);
|
2012-10-13 16:46:48 +07:00
|
|
|
* delta = time_offset + quot * time_mult +
|
|
|
|
* ((rem * time_mult) >> time_shift);
|
|
|
|
*
|
|
|
|
* Where time_offset,time_mult,time_shift and cyc are read in the
|
|
|
|
* seqcount loop described above. This delta can then be added to
|
2014-10-03 12:16:36 +07:00
|
|
|
* enabled and possibly running (if index), improving the scaling:
|
2012-10-13 16:46:48 +07:00
|
|
|
*
|
|
|
|
* enabled += delta;
|
2014-10-03 12:16:36 +07:00
|
|
|
* if (index)
|
2012-10-13 16:46:48 +07:00
|
|
|
* running += delta;
|
|
|
|
*
|
|
|
|
* quot = count / running;
|
|
|
|
* rem = count % running;
|
|
|
|
* count = quot * enabled + (rem * enabled) / running;
|
|
|
|
*/
|
|
|
|
__u16 time_shift;
|
|
|
|
__u32 time_mult;
|
|
|
|
__u64 time_offset;
|
2013-06-28 20:22:18 +07:00
|
|
|
/*
|
|
|
|
* If cap_user_time_zero, the hardware clock (e.g. TSC) can be calculated
|
|
|
|
* from sample timestamps.
|
|
|
|
*
|
|
|
|
* time = timestamp - time_zero;
|
|
|
|
* quot = time / time_mult;
|
|
|
|
* rem = time % time_mult;
|
|
|
|
* cyc = (quot << time_shift) + (rem << time_shift) / time_mult;
|
|
|
|
*
|
|
|
|
* And vice versa:
|
|
|
|
*
|
|
|
|
* quot = cyc >> time_shift;
|
2015-10-16 20:24:05 +07:00
|
|
|
* rem = cyc & (((u64)1 << time_shift) - 1);
|
2013-06-28 20:22:18 +07:00
|
|
|
* timestamp = time_zero + quot * time_mult +
|
|
|
|
* ((rem * time_mult) >> time_shift);
|
|
|
|
*/
|
|
|
|
__u64 time_zero;
|
perf: Fix capabilities bitfield compatibility in 'struct perf_event_mmap_page'
Solve the problems around the broken definition of perf_event_mmap_page::
cap_usr_time and cap_usr_rdpmc fields which used to overlap, partially
fixed by:
860f085b74e9 ("perf: Fix broken union in 'struct perf_event_mmap_page'")
The problem with the fix (merged in v3.12-rc1 and not yet released
officially), noticed by Vince Weaver is that the new behavior is
not detectable by new user-space, and that due to the reuse of the
field names it's easy to mis-compile a binary if old headers are used
on a new kernel or new headers are used on an old kernel.
To solve all that make this change explicit, detectable and self-contained,
by iterating the ABI the following way:
- Always clear bit 0, and rename it to usrpage->cap_bit0, to at least not
confuse old user-space binaries. RDPMC will be marked as unavailable
to old binaries but that's within the ABI, this is a capability bit.
- Rename bit 1 to ->cap_bit0_is_deprecated and always set it to 1, so new
libraries can reliably detect that bit 0 is deprecated and perma-zero
without having to check the kernel version.
- Use bits 2, 3, 4 for the newly defined, correct functionality:
cap_user_rdpmc : 1, /* The RDPMC instruction can be used to read counts */
cap_user_time : 1, /* The time_* fields are used */
cap_user_time_zero : 1, /* The time_zero field is used */
- Rename all the bitfield names in perf_event.h to be different from the
old names, to make sure it's not possible to mis-compile it
accidentally with old assumptions.
The 'size' field can then be used in the future to add new fields and it
will act as a natural ABI version indicator as well.
Also adjust tools/perf/ userspace for the new definitions, noticed by
Adrian Hunter.
Reported-by: Vince Weaver <vincent.weaver@maine.edu>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Also-Fixed-by: Adrian Hunter <adrian.hunter@intel.com>
Link: http://lkml.kernel.org/n/tip-zr03yxjrpXesOzzupszqglbv@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2013-09-19 15:16:42 +07:00
|
|
|
__u32 size; /* Header size up to __reserved[] fields. */
|
2012-10-13 16:46:48 +07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Hole for extension of the self monitor capabilities
|
|
|
|
*/
|
|
|
|
|
perf: Fix capabilities bitfield compatibility in 'struct perf_event_mmap_page'
Solve the problems around the broken definition of perf_event_mmap_page::
cap_usr_time and cap_usr_rdpmc fields which used to overlap, partially
fixed by:
860f085b74e9 ("perf: Fix broken union in 'struct perf_event_mmap_page'")
The problem with the fix (merged in v3.12-rc1 and not yet released
officially), noticed by Vince Weaver is that the new behavior is
not detectable by new user-space, and that due to the reuse of the
field names it's easy to mis-compile a binary if old headers are used
on a new kernel or new headers are used on an old kernel.
To solve all that make this change explicit, detectable and self-contained,
by iterating the ABI the following way:
- Always clear bit 0, and rename it to usrpage->cap_bit0, to at least not
confuse old user-space binaries. RDPMC will be marked as unavailable
to old binaries but that's within the ABI, this is a capability bit.
- Rename bit 1 to ->cap_bit0_is_deprecated and always set it to 1, so new
libraries can reliably detect that bit 0 is deprecated and perma-zero
without having to check the kernel version.
- Use bits 2, 3, 4 for the newly defined, correct functionality:
cap_user_rdpmc : 1, /* The RDPMC instruction can be used to read counts */
cap_user_time : 1, /* The time_* fields are used */
cap_user_time_zero : 1, /* The time_zero field is used */
- Rename all the bitfield names in perf_event.h to be different from the
old names, to make sure it's not possible to mis-compile it
accidentally with old assumptions.
The 'size' field can then be used in the future to add new fields and it
will act as a natural ABI version indicator as well.
Also adjust tools/perf/ userspace for the new definitions, noticed by
Adrian Hunter.
Reported-by: Vince Weaver <vincent.weaver@maine.edu>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Also-Fixed-by: Adrian Hunter <adrian.hunter@intel.com>
Link: http://lkml.kernel.org/n/tip-zr03yxjrpXesOzzupszqglbv@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2013-09-19 15:16:42 +07:00
|
|
|
__u8 __reserved[118*8+4]; /* align to 1k. */
|
2012-10-13 16:46:48 +07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Control data for the mmap() data buffer.
|
|
|
|
*
|
2013-10-28 19:55:29 +07:00
|
|
|
* User-space reading the @data_head value should issue an smp_rmb(),
|
|
|
|
* after reading this value.
|
2012-10-13 16:46:48 +07:00
|
|
|
*
|
|
|
|
* When the mapping is PROT_WRITE the @data_tail value should be
|
2013-10-28 19:55:29 +07:00
|
|
|
* written by userspace to reflect the last read data, after issuing
|
|
|
|
* an smp_mb() to separate the data read from the ->data_tail store.
|
|
|
|
* In this case the kernel will not over-write unread data.
|
|
|
|
*
|
|
|
|
* See perf_output_put_handle() for the data ordering.
|
2015-01-14 19:18:10 +07:00
|
|
|
*
|
|
|
|
* data_{offset,size} indicate the location and size of the perf record
|
|
|
|
* buffer within the mmapped area.
|
2012-10-13 16:46:48 +07:00
|
|
|
*/
|
|
|
|
__u64 data_head; /* head in the data section */
|
|
|
|
__u64 data_tail; /* user-space written tail */
|
2015-01-14 19:18:10 +07:00
|
|
|
__u64 data_offset; /* where the buffer starts */
|
|
|
|
__u64 data_size; /* data buffer size */
|
perf: Add AUX area to ring buffer for raw data streams
This patch introduces "AUX space" in the perf mmap buffer, intended for
exporting high bandwidth data streams to userspace, such as instruction
flow traces.
AUX space is a ring buffer, defined by aux_{offset,size} fields in the
user_page structure, and read/write pointers aux_{head,tail}, which abide
by the same rules as data_* counterparts of the main perf buffer.
In order to allocate/mmap AUX, userspace needs to set up aux_offset to
such an offset that will be greater than data_offset+data_size and
aux_size to be the desired buffer size. Both need to be page aligned.
Then, same aux_offset and aux_size should be passed to mmap() call and
if everything adds up, you should have an AUX buffer as a result.
Pages that are mapped into this buffer also come out of user's mlock
rlimit plus perf_event_mlock_kb allowance.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-3-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 19:18:11 +07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* AUX area is defined by aux_{offset,size} fields that should be set
|
|
|
|
* by the userspace, so that
|
|
|
|
*
|
|
|
|
* aux_offset >= data_offset + data_size
|
|
|
|
*
|
|
|
|
* prior to mmap()ing it. Size of the mmap()ed area should be aux_size.
|
|
|
|
*
|
|
|
|
* Ring buffer pointers aux_{head,tail} have the same semantics as
|
|
|
|
* data_{head,tail} and same ordering rules apply.
|
|
|
|
*/
|
|
|
|
__u64 aux_head;
|
|
|
|
__u64 aux_tail;
|
|
|
|
__u64 aux_offset;
|
|
|
|
__u64 aux_size;
|
2012-10-13 16:46:48 +07:00
|
|
|
};
|
|
|
|
|
|
|
|
#define PERF_RECORD_MISC_CPUMODE_MASK (7 << 0)
|
|
|
|
#define PERF_RECORD_MISC_CPUMODE_UNKNOWN (0 << 0)
|
|
|
|
#define PERF_RECORD_MISC_KERNEL (1 << 0)
|
|
|
|
#define PERF_RECORD_MISC_USER (2 << 0)
|
|
|
|
#define PERF_RECORD_MISC_HYPERVISOR (3 << 0)
|
|
|
|
#define PERF_RECORD_MISC_GUEST_KERNEL (4 << 0)
|
|
|
|
#define PERF_RECORD_MISC_GUEST_USER (5 << 0)
|
|
|
|
|
2015-06-17 20:51:10 +07:00
|
|
|
/*
|
|
|
|
* Indicates that /proc/PID/maps parsing was truncated by a timeout.
|
|
|
|
*/
|
|
|
|
#define PERF_RECORD_MISC_PROC_MAP_PARSE_TIMEOUT (1 << 12)
|
2014-05-28 15:45:04 +07:00
|
|
|
/*
|
|
|
|
* PERF_RECORD_MISC_MMAP_DATA and PERF_RECORD_MISC_COMM_EXEC are used on
|
|
|
|
* different events so can reuse the same bit position.
|
2015-07-21 16:44:02 +07:00
|
|
|
* Ditto PERF_RECORD_MISC_SWITCH_OUT.
|
2014-05-28 15:45:04 +07:00
|
|
|
*/
|
2013-01-24 22:10:39 +07:00
|
|
|
#define PERF_RECORD_MISC_MMAP_DATA (1 << 13)
|
2014-05-28 15:45:04 +07:00
|
|
|
#define PERF_RECORD_MISC_COMM_EXEC (1 << 13)
|
2015-07-21 16:44:02 +07:00
|
|
|
#define PERF_RECORD_MISC_SWITCH_OUT (1 << 13)
|
2012-10-13 16:46:48 +07:00
|
|
|
/*
|
|
|
|
* Indicates that the content of PERF_SAMPLE_IP points to
|
|
|
|
* the actual instruction that triggered the event. See also
|
|
|
|
* perf_event_attr::precise_ip.
|
|
|
|
*/
|
|
|
|
#define PERF_RECORD_MISC_EXACT_IP (1 << 14)
|
|
|
|
/*
|
|
|
|
* Reserve the last bit to indicate some extended misc field
|
|
|
|
*/
|
|
|
|
#define PERF_RECORD_MISC_EXT_RESERVED (1 << 15)
|
|
|
|
|
|
|
|
/*
 * Common header placed at the start of every record layout described
 * under enum perf_event_type: a 32-bit type, a 16-bit misc field and a
 * 16-bit size (8 bytes in total).
 *
 * NOTE(review): type presumably holds an enum perf_event_type value,
 * misc the PERF_RECORD_MISC_* bits, and size the total record length in
 * bytes including this header; confirm against the record writers.
 */
struct perf_event_header {
|
|
|
|
__u32 type;
|
|
|
|
__u16 misc;
|
|
|
|
__u16 size;
|
|
|
|
};
|
|
|
|
|
2017-03-08 03:41:36 +07:00
|
|
|
/*
 * A pair of 64-bit values, dev and ino. The PERF_RECORD_NAMESPACES
 * layout carries an array of nr_namespaces such pairs.
 *
 * NOTE(review): presumably the device and inode number of one namespace
 * link of the task; confirm against the code that emits the record.
 */
struct perf_ns_link_info {
|
|
|
|
__u64 dev;
|
|
|
|
__u64 ino;
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * Fixed indices, one per namespace type, numbered 0..6. NR_NAMESPACES
 * follows the last index and therefore evaluates to 7, the number of
 * entries.
 *
 * NOTE(review): presumably these index the { dev, ino } array of a
 * PERF_RECORD_NAMESPACES record; confirm against the record writer.
 */
enum {
|
|
|
|
NET_NS_INDEX = 0,
|
|
|
|
UTS_NS_INDEX = 1,
|
|
|
|
IPC_NS_INDEX = 2,
|
|
|
|
PID_NS_INDEX = 3,
|
|
|
|
USER_NS_INDEX = 4,
|
|
|
|
MNT_NS_INDEX = 5,
|
|
|
|
CGROUP_NS_INDEX = 6,
|
|
|
|
|
|
|
|
NR_NAMESPACES, /* number of available namespaces */
|
|
|
|
};
|
|
|
|
|
2012-10-13 16:46:48 +07:00
|
|
|
enum perf_event_type {
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If perf_event_attr.sample_id_all is set then all event types will
|
|
|
|
* have the sample_type selected fields related to where/when
|
2013-08-27 15:23:07 +07:00
|
|
|
* (identity) an event took place (TID, TIME, ID, STREAM_ID, CPU,
|
|
|
|
* IDENTIFIER) described in PERF_RECORD_SAMPLE below, it will be stashed
|
|
|
|
* just after the perf_event_header and the fields already present for
|
|
|
|
* the existing fields, i.e. at the end of the payload. That way a newer
|
|
|
|
* perf.data file will be supported by older perf tools, with these new
|
|
|
|
* optional fields being ignored.
|
2012-10-13 16:46:48 +07:00
|
|
|
*
|
2013-07-16 22:09:07 +07:00
|
|
|
* struct sample_id {
|
|
|
|
* { u32 pid, tid; } && PERF_SAMPLE_TID
|
|
|
|
* { u64 time; } && PERF_SAMPLE_TIME
|
|
|
|
* { u64 id; } && PERF_SAMPLE_ID
|
|
|
|
* { u64 stream_id;} && PERF_SAMPLE_STREAM_ID
|
|
|
|
* { u32 cpu, res; } && PERF_SAMPLE_CPU
|
2013-08-27 15:23:07 +07:00
|
|
|
* { u64 id; } && PERF_SAMPLE_IDENTIFIER
|
2013-07-16 22:09:07 +07:00
|
|
|
* } && perf_event_attr::sample_id_all
|
2013-08-27 15:23:07 +07:00
|
|
|
*
|
|
|
|
* Note that PERF_SAMPLE_IDENTIFIER duplicates PERF_SAMPLE_ID. The
|
|
|
|
* advantage of PERF_SAMPLE_IDENTIFIER is that its position is fixed
|
|
|
|
* relative to header.size.
|
2013-07-16 22:09:07 +07:00
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
2012-10-13 16:46:48 +07:00
|
|
|
* The MMAP events record the PROT_EXEC mappings so that we can
|
|
|
|
* correlate userspace IPs to code. They have the following structure:
|
|
|
|
*
|
|
|
|
* struct {
|
|
|
|
* struct perf_event_header header;
|
|
|
|
*
|
|
|
|
* u32 pid, tid;
|
|
|
|
* u64 addr;
|
|
|
|
* u64 len;
|
|
|
|
* u64 pgoff;
|
|
|
|
* char filename[];
|
2013-09-14 04:39:17 +07:00
|
|
|
* struct sample_id sample_id;
|
2012-10-13 16:46:48 +07:00
|
|
|
* };
|
|
|
|
*/
|
|
|
|
PERF_RECORD_MMAP = 1,
|
|
|
|
|
|
|
|
/*
|
|
|
|
* struct {
|
|
|
|
* struct perf_event_header header;
|
|
|
|
* u64 id;
|
|
|
|
* u64 lost;
|
2013-07-16 22:09:07 +07:00
|
|
|
* struct sample_id sample_id;
|
2012-10-13 16:46:48 +07:00
|
|
|
* };
|
|
|
|
*/
|
|
|
|
PERF_RECORD_LOST = 2,
|
|
|
|
|
|
|
|
/*
|
|
|
|
* struct {
|
|
|
|
* struct perf_event_header header;
|
|
|
|
*
|
|
|
|
* u32 pid, tid;
|
|
|
|
* char comm[];
|
2013-07-16 22:09:07 +07:00
|
|
|
* struct sample_id sample_id;
|
2012-10-13 16:46:48 +07:00
|
|
|
* };
|
|
|
|
*/
|
|
|
|
PERF_RECORD_COMM = 3,
|
|
|
|
|
|
|
|
/*
|
|
|
|
* struct {
|
|
|
|
* struct perf_event_header header;
|
|
|
|
* u32 pid, ppid;
|
|
|
|
* u32 tid, ptid;
|
|
|
|
* u64 time;
|
2013-07-16 22:09:07 +07:00
|
|
|
* struct sample_id sample_id;
|
2012-10-13 16:46:48 +07:00
|
|
|
* };
|
|
|
|
*/
|
|
|
|
PERF_RECORD_EXIT = 4,
|
|
|
|
|
|
|
|
/*
|
|
|
|
* struct {
|
|
|
|
* struct perf_event_header header;
|
|
|
|
* u64 time;
|
|
|
|
* u64 id;
|
|
|
|
* u64 stream_id;
|
2013-07-16 22:09:07 +07:00
|
|
|
* struct sample_id sample_id;
|
2012-10-13 16:46:48 +07:00
|
|
|
* };
|
|
|
|
*/
|
|
|
|
PERF_RECORD_THROTTLE = 5,
|
|
|
|
PERF_RECORD_UNTHROTTLE = 6,
|
|
|
|
|
|
|
|
/*
|
|
|
|
* struct {
|
|
|
|
* struct perf_event_header header;
|
|
|
|
* u32 pid, ppid;
|
|
|
|
* u32 tid, ptid;
|
|
|
|
* u64 time;
|
2013-07-16 22:09:07 +07:00
|
|
|
* struct sample_id sample_id;
|
2012-10-13 16:46:48 +07:00
|
|
|
* };
|
|
|
|
*/
|
|
|
|
PERF_RECORD_FORK = 7,
|
|
|
|
|
|
|
|
/*
|
|
|
|
* struct {
|
|
|
|
* struct perf_event_header header;
|
|
|
|
* u32 pid, tid;
|
|
|
|
*
|
|
|
|
* struct read_format values;
|
2013-07-16 22:09:07 +07:00
|
|
|
* struct sample_id sample_id;
|
2012-10-13 16:46:48 +07:00
|
|
|
* };
|
|
|
|
*/
|
|
|
|
PERF_RECORD_READ = 8,
|
|
|
|
|
|
|
|
/*
|
|
|
|
* struct {
|
|
|
|
* struct perf_event_header header;
|
|
|
|
*
|
2013-08-27 15:23:07 +07:00
|
|
|
* #
|
|
|
|
* # Note that PERF_SAMPLE_IDENTIFIER duplicates PERF_SAMPLE_ID.
|
|
|
|
* # The advantage of PERF_SAMPLE_IDENTIFIER is that its position
|
|
|
|
* # is fixed relative to header.
|
|
|
|
* #
|
|
|
|
*
|
|
|
|
* { u64 id; } && PERF_SAMPLE_IDENTIFIER
|
2012-10-13 16:46:48 +07:00
|
|
|
* { u64 ip; } && PERF_SAMPLE_IP
|
|
|
|
* { u32 pid, tid; } && PERF_SAMPLE_TID
|
|
|
|
* { u64 time; } && PERF_SAMPLE_TIME
|
|
|
|
* { u64 addr; } && PERF_SAMPLE_ADDR
|
|
|
|
* { u64 id; } && PERF_SAMPLE_ID
|
|
|
|
* { u64 stream_id;} && PERF_SAMPLE_STREAM_ID
|
|
|
|
* { u32 cpu, res; } && PERF_SAMPLE_CPU
|
|
|
|
* { u64 period; } && PERF_SAMPLE_PERIOD
|
|
|
|
*
|
|
|
|
* { struct read_format values; } && PERF_SAMPLE_READ
|
|
|
|
*
|
|
|
|
* { u64 nr;
|
|
|
|
* u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN
|
|
|
|
*
|
|
|
|
* #
|
|
|
|
* # The RAW record below is opaque data wrt the ABI
|
|
|
|
* #
|
|
|
|
* # That is, the ABI doesn't make any promises wrt
|
|
|
|
* # the stability of its content, it may vary depending
|
|
|
|
* # on event, hardware, kernel version and phase of
|
|
|
|
* # the moon.
|
|
|
|
* #
|
|
|
|
* # In other words, PERF_SAMPLE_RAW contents are not an ABI.
|
|
|
|
* #
|
|
|
|
*
|
|
|
|
* { u32 size;
|
|
|
|
* char data[size];}&& PERF_SAMPLE_RAW
|
|
|
|
*
|
2013-01-09 02:44:25 +07:00
|
|
|
* { u64 nr;
|
|
|
|
* { u64 from, to, flags } lbr[nr];} && PERF_SAMPLE_BRANCH_STACK
|
2012-10-13 16:46:48 +07:00
|
|
|
*
|
|
|
|
* { u64 abi; # enum perf_sample_regs_abi
|
|
|
|
* u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_USER
|
|
|
|
*
|
|
|
|
* { u64 size;
|
|
|
|
* char data[size];
|
|
|
|
* u64 dyn_size; } && PERF_SAMPLE_STACK_USER
|
2013-01-24 22:10:28 +07:00
|
|
|
*
|
|
|
|
* { u64 weight; } && PERF_SAMPLE_WEIGHT
|
2013-07-16 22:09:07 +07:00
|
|
|
* { u64 data_src; } && PERF_SAMPLE_DATA_SRC
|
2013-12-14 03:52:25 +07:00
|
|
|
* { u64 transaction; } && PERF_SAMPLE_TRANSACTION
|
2014-09-24 18:48:37 +07:00
|
|
|
* { u64 abi; # enum perf_sample_regs_abi
|
|
|
|
* u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
|
2012-10-13 16:46:48 +07:00
|
|
|
* };
|
|
|
|
*/
|
|
|
|
PERF_RECORD_SAMPLE = 9,
|
|
|
|
|
2013-08-21 17:10:24 +07:00
|
|
|
/*
|
|
|
|
* The MMAP2 records are an augmented version of MMAP, they add
|
|
|
|
* maj, min, ino numbers to be used to uniquely identify each mapping
|
|
|
|
*
|
|
|
|
* struct {
|
|
|
|
* struct perf_event_header header;
|
|
|
|
*
|
|
|
|
* u32 pid, tid;
|
|
|
|
* u64 addr;
|
|
|
|
* u64 len;
|
|
|
|
* u64 pgoff;
|
|
|
|
* u32 maj;
|
|
|
|
* u32 min;
|
|
|
|
* u64 ino;
|
|
|
|
* u64 ino_generation;
|
2014-05-20 02:13:47 +07:00
|
|
|
* u32 prot, flags;
|
2013-08-21 17:10:24 +07:00
|
|
|
* char filename[];
|
|
|
|
* struct sample_id sample_id;
|
|
|
|
* };
|
|
|
|
*/
|
|
|
|
PERF_RECORD_MMAP2 = 10,
|
|
|
|
|
2015-01-14 19:18:15 +07:00
|
|
|
/*
|
|
|
|
* Records that new data landed in the AUX buffer part.
|
|
|
|
*
|
|
|
|
* struct {
|
|
|
|
* struct perf_event_header header;
|
|
|
|
*
|
|
|
|
* u64 aux_offset;
|
|
|
|
* u64 aux_size;
|
|
|
|
* u64 flags;
|
|
|
|
* struct sample_id sample_id;
|
|
|
|
* };
|
|
|
|
*/
|
|
|
|
PERF_RECORD_AUX = 11,
|
|
|
|
|
perf: Add ITRACE_START record to indicate that tracing has started
For counters that generate AUX data that is bound to the context of a
running task, such as instruction tracing, the decoder needs to know
exactly which task is running when the event is first scheduled in,
before the first sched_switch. The decoder's need to know this stems
from the fact that instruction flow trace decoding will almost always
require program's object code in order to reconstruct said flow and
for that we need at least its pid/tid in the perf stream.
To single out such instruction tracing pmus, this patch introduces
ITRACE PMU capability. The reason this is not part of RECORD_AUX
record is that not all pmus capable of generating AUX data need this,
and the opposite is *probably* also true.
While sched_switch covers for most cases, there are two problems with it:
the consumer will need to process events out of order (that is, having
found RECORD_AUX, it will have to skip forward to the nearest sched_switch
to figure out which task it was, then go back to the actual trace to
decode it) and it completely misses the case when the tracing is enabled
and disabled before sched_switch, for example, via PERF_EVENT_IOC_DISABLE.
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-15-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 19:18:23 +07:00
|
|
|
/*
|
|
|
|
* Indicates that instruction trace has started
|
|
|
|
*
|
|
|
|
* struct {
|
|
|
|
* struct perf_event_header header;
|
|
|
|
* u32 pid;
|
|
|
|
* u32 tid;
|
|
|
|
* };
|
|
|
|
*/
|
|
|
|
PERF_RECORD_ITRACE_START = 12,
|
|
|
|
|
2015-05-11 02:13:14 +07:00
|
|
|
/*
|
|
|
|
* Records the dropped/lost sample number.
|
|
|
|
*
|
|
|
|
* struct {
|
|
|
|
* struct perf_event_header header;
|
|
|
|
*
|
|
|
|
* u64 lost;
|
|
|
|
* struct sample_id sample_id;
|
|
|
|
* };
|
|
|
|
*/
|
|
|
|
PERF_RECORD_LOST_SAMPLES = 13,
|
|
|
|
|
2015-07-21 16:44:02 +07:00
|
|
|
/*
|
|
|
|
* Records a context switch in or out (flagged by
|
|
|
|
* PERF_RECORD_MISC_SWITCH_OUT). See also
|
|
|
|
* PERF_RECORD_SWITCH_CPU_WIDE.
|
|
|
|
*
|
|
|
|
* struct {
|
|
|
|
* struct perf_event_header header;
|
|
|
|
* struct sample_id sample_id;
|
|
|
|
* };
|
|
|
|
*/
|
|
|
|
PERF_RECORD_SWITCH = 14,
|
|
|
|
|
|
|
|
/*
|
|
|
|
* CPU-wide version of PERF_RECORD_SWITCH with next_prev_pid and
|
|
|
|
* next_prev_tid that are the next (switching out) or previous
|
|
|
|
* (switching in) pid/tid.
|
|
|
|
*
|
|
|
|
* struct {
|
|
|
|
* struct perf_event_header header;
|
|
|
|
* u32 next_prev_pid;
|
|
|
|
* u32 next_prev_tid;
|
|
|
|
* struct sample_id sample_id;
|
|
|
|
* };
|
|
|
|
*/
|
|
|
|
PERF_RECORD_SWITCH_CPU_WIDE = 15,
|
|
|
|
|
2017-03-08 03:41:36 +07:00
|
|
|
/*
|
|
|
|
* struct {
|
|
|
|
* struct perf_event_header header;
|
|
|
|
* u32 pid;
|
|
|
|
* u32 tid;
|
|
|
|
* u64 nr_namespaces;
|
|
|
|
* { u64 dev, ino; } [nr_namespaces];
|
|
|
|
* struct sample_id sample_id;
|
|
|
|
* };
|
|
|
|
*/
|
|
|
|
PERF_RECORD_NAMESPACES = 16,
|
|
|
|
|
2012-10-13 16:46:48 +07:00
|
|
|
PERF_RECORD_MAX, /* non-ABI */
|
|
|
|
};
|
|
|
|
|
|
|
|
#define PERF_MAX_STACK_DEPTH 127
|
2016-05-12 23:06:21 +07:00
|
|
|
#define PERF_MAX_CONTEXTS_PER_STACK 8
|
2012-10-13 16:46:48 +07:00
|
|
|
|
|
|
|
/*
 * Callchain context marker values. Each constant is a small negative
 * number cast to __u64, i.e. a value at the very top of the 64-bit
 * range; PERF_CONTEXT_MAX (-4095) is the lowest of them.
 *
 * NOTE(review): presumably these appear as entries in the ips[] array of
 * a PERF_SAMPLE_CALLCHAIN to mark which context the following addresses
 * belong to; confirm against the callchain code.
 */
enum perf_callchain_context {
|
|
|
|
PERF_CONTEXT_HV = (__u64)-32,
|
|
|
|
PERF_CONTEXT_KERNEL = (__u64)-128,
|
|
|
|
PERF_CONTEXT_USER = (__u64)-512,
|
|
|
|
|
|
|
|
PERF_CONTEXT_GUEST = (__u64)-2048,
|
|
|
|
PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176,
|
|
|
|
PERF_CONTEXT_GUEST_USER = (__u64)-2560,
|
|
|
|
|
|
|
|
PERF_CONTEXT_MAX = (__u64)-4095,
|
|
|
|
};
|
|
|
|
|
2015-01-14 19:18:15 +07:00
|
|
|
/**
|
|
|
|
* PERF_RECORD_AUX::flags bits
|
|
|
|
*/
|
|
|
|
#define PERF_AUX_FLAG_TRUNCATED 0x01 /* record was truncated to fit */
|
2015-01-14 19:18:17 +07:00
|
|
|
#define PERF_AUX_FLAG_OVERWRITE 0x02 /* snapshot from overwrite mode */
|
2017-02-20 20:33:51 +07:00
|
|
|
#define PERF_AUX_FLAG_PARTIAL 0x04 /* record contains gaps */
|
2015-01-14 19:18:15 +07:00
|
|
|
|
2014-04-23 17:22:54 +07:00
|
|
|
#define PERF_FLAG_FD_NO_GROUP (1UL << 0)
|
|
|
|
#define PERF_FLAG_FD_OUTPUT (1UL << 1)
|
|
|
|
#define PERF_FLAG_PID_CGROUP (1UL << 2) /* pid=cgroup id, per-cpu mode only */
|
|
|
|
#define PERF_FLAG_FD_CLOEXEC (1UL << 3) /* O_CLOEXEC */
|
2012-10-13 16:46:48 +07:00
|
|
|
|
powerpc/perf: Define big-endian version of perf_mem_data_src
perf_mem_data_src is a union that is initialized in the kernel via the ->val
field and accessed by userspace via the mem_xxx bitfields. For this to work
correctly on big endian platforms, we need a big-endian definition for the
bitfields.
Currently on a big endian system, if a user requests PERF_SAMPLE_DATA_SRC (perf
report -d), they will get the default value from perf_sample_data_init(), which
is PERF_MEM_NA. The value for PERF_MEM_NA is constructed using shifts:
/* TLB access */
#define PERF_MEM_TLB_NA 0x01 /* not available */
...
#define PERF_MEM_TLB_SHIFT 26
#define PERF_MEM_S(a, s) \
(((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT)
#define PERF_MEM_NA (PERF_MEM_S(OP, NA) |\
PERF_MEM_S(LVL, NA) |\
PERF_MEM_S(SNOOP, NA) |\
PERF_MEM_S(LOCK, NA) |\
PERF_MEM_S(TLB, NA))
Which works out as:
((0x01 << 0) | (0x01 << 5) | (0x01 << 19) | (0x01 << 24) | (0x01 << 26))
Which means the PERF_MEM_NA value comes out of the kernel as 0x5080021
in CPU endian.
But then in the perf tool, the code uses the bitfields to inspect the value, and
currently the bitfields are defined using little endian ordering.
So eg. in perf_mem__tlb_scnprintf() we see:
data_src->val = 0x5080021
op = 0x0
lvl = 0x0
snoop = 0x0
lock = 0x0
dtlb = 0x0
rsvd = 0x5080021
Because of the way the perf tool code is written this is still displayed to the
user as "N/A", so there is no bug visible at the UI level.
Currently there are no big endian architectures which export a meaningful
value (ie. other than PERF_MEM_NA), so the extent of the bug on big endian
platforms is that the PERF_MEM_NA value is exported incorrectly as described
above. Subsequent patches will add support on big endian powerpc for populating
the data source value.
This patch does a minimal fix of adding big endian definition of the bitfields
to match the values that are already exported by the kernel on big endian. And
it makes no change on little endian.
Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Signed-off-by: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2017-04-11 08:51:05 +07:00
|
|
|
#if defined(__LITTLE_ENDIAN_BITFIELD)
|
2013-01-24 22:10:31 +07:00
|
|
|
union perf_mem_data_src {
|
|
|
|
__u64 val;
|
|
|
|
struct {
|
|
|
|
__u64 mem_op:5, /* type of opcode */
|
|
|
|
mem_lvl:14, /* memory hierarchy level */
|
|
|
|
mem_snoop:5, /* snoop mode */
|
|
|
|
mem_lock:2, /* lock instr */
|
|
|
|
mem_dtlb:7, /* tlb access */
|
|
|
|
mem_rsvd:31;
|
|
|
|
};
|
|
|
|
};
|
powerpc/perf: Define big-endian version of perf_mem_data_src
perf_mem_data_src is a union that is initialized in the kernel via the ->val
field and accessed by userspace via the mem_xxx bitfields. For this to work
correctly on big endian platforms, we need a big-endian definition for the
bitfields.
Currently on a big endian system, if a user requests PERF_SAMPLE_DATA_SRC (perf
report -d), they will get the default value from perf_sample_data_init(), which
is PERF_MEM_NA. The value for PERF_MEM_NA is constructed using shifts:
/* TLB access */
#define PERF_MEM_TLB_NA 0x01 /* not available */
...
#define PERF_MEM_TLB_SHIFT 26
#define PERF_MEM_S(a, s) \
(((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT)
#define PERF_MEM_NA (PERF_MEM_S(OP, NA) |\
PERF_MEM_S(LVL, NA) |\
PERF_MEM_S(SNOOP, NA) |\
PERF_MEM_S(LOCK, NA) |\
PERF_MEM_S(TLB, NA))
Which works out as:
((0x01 << 0) | (0x01 << 5) | (0x01 << 19) | (0x01 << 24) | (0x01 << 26))
Which means the PERF_MEM_NA value comes out of the kernel as 0x5080021
in CPU endian.
But then in the perf tool, the code uses the bitfields to inspect the value, and
currently the bitfields are defined using little endian ordering.
So eg. in perf_mem__tlb_scnprintf() we see:
data_src->val = 0x5080021
op = 0x0
lvl = 0x0
snoop = 0x0
lock = 0x0
dtlb = 0x0
rsvd = 0x5080021
Because of the way the perf tool code is written this is still displayed to the
user as "N/A", so there is no bug visible at the UI level.
Currently there are no big endian architectures which export a meaningful
value (ie. other than PERF_MEM_NA), so the extent of the bug on big endian
platforms is that the PERF_MEM_NA value is exported incorrectly as described
above. Subsequent patches will add support on big endian powerpc for populating
the data source value.
This patch does a minimal fix of adding big endian definition of the bitfields
to match the values that are already exported by the kernel on big endian. And
it makes no change on little endian.
Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Signed-off-by: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2017-04-11 08:51:05 +07:00
|
|
|
#elif defined(__BIG_ENDIAN_BITFIELD)
|
|
|
|
union perf_mem_data_src {
|
|
|
|
__u64 val;
|
|
|
|
struct {
|
|
|
|
__u64 mem_rsvd:31,
|
|
|
|
mem_dtlb:7, /* tlb access */
|
|
|
|
mem_lock:2, /* lock instr */
|
|
|
|
mem_snoop:5, /* snoop mode */
|
|
|
|
mem_lvl:14, /* memory hierarchy level */
|
|
|
|
mem_op:5; /* type of opcode */
|
|
|
|
};
|
|
|
|
};
|
|
|
|
#else
|
|
|
|
#error "Unknown endianness"
|
|
|
|
#endif
|
2013-01-24 22:10:31 +07:00
|
|
|
|
|
|
|
/*
 * Opcode type (load / store / prefetch / code).
 * Each value is a single bit; PERF_MEM_OP_SHIFT is the shift that
 * PERF_MEM_S(OP, ...) applies to it.
 */
#define PERF_MEM_OP_NA		0x01	/* not available */
#define PERF_MEM_OP_LOAD	0x02	/* load instruction */
#define PERF_MEM_OP_STORE	0x04	/* store instruction */
#define PERF_MEM_OP_PFETCH	0x08	/* prefetch */
#define PERF_MEM_OP_EXEC	0x10	/* code (execution) */
#define PERF_MEM_OP_SHIFT	0
|
|
|
|
|
|
|
|
/*
 * Memory hierarchy: which level was involved, and whether it hit or
 * missed. Each value is a single bit; PERF_MEM_LVL_SHIFT is the shift
 * that PERF_MEM_S(LVL, ...) applies to it.
 */
#define PERF_MEM_LVL_NA		0x01	/* not available */
#define PERF_MEM_LVL_HIT	0x02	/* hit level */
#define PERF_MEM_LVL_MISS	0x04	/* miss level */
#define PERF_MEM_LVL_L1		0x08	/* L1 */
#define PERF_MEM_LVL_LFB	0x10	/* Line Fill Buffer */
#define PERF_MEM_LVL_L2		0x20	/* L2 */
#define PERF_MEM_LVL_L3		0x40	/* L3 */
#define PERF_MEM_LVL_LOC_RAM	0x80	/* Local DRAM */
#define PERF_MEM_LVL_REM_RAM1	0x100	/* Remote DRAM (1 hop) */
#define PERF_MEM_LVL_REM_RAM2	0x200	/* Remote DRAM (2 hops) */
#define PERF_MEM_LVL_REM_CCE1	0x400	/* Remote Cache (1 hop) */
#define PERF_MEM_LVL_REM_CCE2	0x800	/* Remote Cache (2 hops) */
#define PERF_MEM_LVL_IO		0x1000	/* I/O memory */
#define PERF_MEM_LVL_UNC	0x2000	/* Uncached memory */
#define PERF_MEM_LVL_SHIFT	5
|
|
|
|
|
|
|
|
/*
 * Snoop mode.
 * Each value is a single bit; PERF_MEM_SNOOP_SHIFT is the shift that
 * PERF_MEM_S(SNOOP, ...) applies to it.
 */
#define PERF_MEM_SNOOP_NA	0x01	/* not available */
#define PERF_MEM_SNOOP_NONE	0x02	/* no snoop */
#define PERF_MEM_SNOOP_HIT	0x04	/* snoop hit */
#define PERF_MEM_SNOOP_MISS	0x08	/* snoop miss */
#define PERF_MEM_SNOOP_HITM	0x10	/* snoop hit modified */
#define PERF_MEM_SNOOP_SHIFT	19
|
|
|
|
|
|
|
|
/*
 * Locked instruction.
 * Each value is a single bit; PERF_MEM_LOCK_SHIFT is the shift that
 * PERF_MEM_S(LOCK, ...) applies to it.
 */
#define PERF_MEM_LOCK_NA	0x01	/* not available */
#define PERF_MEM_LOCK_LOCKED	0x02	/* locked transaction */
#define PERF_MEM_LOCK_SHIFT	24
|
|
|
|
|
|
|
|
/*
 * TLB access.
 * Each value is a single bit; PERF_MEM_TLB_SHIFT is the shift that
 * PERF_MEM_S(TLB, ...) applies to it.
 */
#define PERF_MEM_TLB_NA		0x01	/* not available */
#define PERF_MEM_TLB_HIT	0x02	/* hit level */
#define PERF_MEM_TLB_MISS	0x04	/* miss level */
#define PERF_MEM_TLB_L1		0x08	/* L1 */
#define PERF_MEM_TLB_L2		0x10	/* L2 */
#define PERF_MEM_TLB_WK		0x20	/* Hardware Walker */
#define PERF_MEM_TLB_OS		0x40	/* OS fault handler */
#define PERF_MEM_TLB_SHIFT	26
|
|
|
|
|
|
|
|
/*
 * Build one field of a data source value: take the PERF_MEM_<a>_<s>
 * constant, convert it to __u64, then shift it left by
 * PERF_MEM_<a>_SHIFT. The cast happens before the shift, so the shift
 * is done on a 64-bit value.
 */
#define PERF_MEM_S(a, s) \
	((__u64)(PERF_MEM_##a##_##s) << (PERF_MEM_##a##_SHIFT))
|
2013-01-24 22:10:31 +07:00
|
|
|
|
2013-08-24 02:51:03 +07:00
|
|
|
/*
|
|
|
|
* single taken branch record layout:
|
|
|
|
*
|
|
|
|
* from: source instruction (may not always be a branch insn)
|
|
|
|
* to: branch target
|
|
|
|
* mispred: branch target was mispredicted
|
|
|
|
* predicted: branch target was predicted
|
|
|
|
*
|
|
|
|
* support for mispred, predicted is optional. In case it
|
|
|
|
* is not supported mispred = predicted = 0.
|
|
|
|
*
|
|
|
|
* in_tx: running in a hardware transaction
|
|
|
|
* abort: aborting a hardware transaction
|
2015-05-11 02:22:42 +07:00
|
|
|
* cycles: cycles from last branch (or 0 if not supported)
|
2013-08-24 02:51:03 +07:00
|
|
|
*/
|
|
|
|
struct perf_branch_entry {
|
|
|
|
__u64 from;
|
|
|
|
__u64 to;
|
|
|
|
__u64 mispred:1, /* target mispredicted */
|
|
|
|
predicted:1,/* target predicted */
|
|
|
|
in_tx:1, /* in transaction */
|
|
|
|
abort:1, /* transaction abort */
|
2015-05-11 02:22:42 +07:00
|
|
|
cycles:16, /* cycle count to last branch */
|
|
|
|
reserved:44;
|
2013-08-24 02:51:03 +07:00
|
|
|
};
|
|
|
|
|
2012-10-13 16:46:48 +07:00
|
|
|
#endif /* _UAPI_LINUX_PERF_EVENT_H */
|