mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2025-01-18 17:57:36 +07:00
perf_counter: update documentation
Impact: documentation fix This updates the perfcounter documentation to reflect recent changes. Signed-off-by: Paul Mackerras <paulus@samba.org>
This commit is contained in:
parent
0fd112e41c
commit
f66c6b2066
@ -11,7 +11,9 @@ thus be used to profile the code that runs on that CPU.
|
||||
|
||||
The Linux Performance Counter subsystem provides an abstraction of these
|
||||
hardware capabilities. It provides per task and per CPU counters, counter
|
||||
groups, and it provides event capabilities on top of those.
|
||||
groups, and it provides event capabilities on top of those. It
|
||||
provides "virtual" 64-bit counters, regardless of the width of the
|
||||
underlying hardware counters.
|
||||
|
||||
Performance counters are accessed via special file descriptors.
|
||||
There's one file descriptor per virtual counter used.
|
||||
@ -20,7 +22,8 @@ The special file descriptor is opened via the perf_counter_open()
|
||||
system call:
|
||||
|
||||
int sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr,
|
||||
pid_t pid, int cpu, int group_fd);
|
||||
pid_t pid, int cpu, int group_fd,
|
||||
unsigned long flags);
|
||||
|
||||
The syscall returns the new fd. The fd can be used via the normal
|
||||
VFS system calls: read() can be used to read the counter, fcntl()
|
||||
@ -32,90 +35,180 @@ can be poll()ed.
|
||||
When creating a new counter fd, 'perf_counter_hw_event' is:
|
||||
|
||||
/*
|
||||
* Hardware event to monitor via a performance monitoring counter:
|
||||
* Event to monitor via a performance monitoring counter:
|
||||
*/
|
||||
struct perf_counter_hw_event {
|
||||
s64 type;
|
||||
__u64 event_config;
|
||||
|
||||
u64 irq_period;
|
||||
u32 record_type;
|
||||
__u64 irq_period;
|
||||
__u64 record_type;
|
||||
__u64 read_format;
|
||||
|
||||
u32 disabled : 1, /* off by default */
|
||||
nmi : 1, /* NMI sampling */
|
||||
raw : 1, /* raw event type */
|
||||
__reserved_1 : 29;
|
||||
__u64 disabled : 1, /* off by default */
|
||||
nmi : 1, /* NMI sampling */
|
||||
inherit : 1, /* children inherit it */
|
||||
pinned : 1, /* must always be on PMU */
|
||||
exclusive : 1, /* only group on PMU */
|
||||
exclude_user : 1, /* don't count user */
|
||||
exclude_kernel : 1, /* ditto kernel */
|
||||
exclude_hv : 1, /* ditto hypervisor */
|
||||
exclude_idle : 1, /* don't count when idle */
|
||||
|
||||
u64 __reserved_2;
|
||||
__reserved_1 : 55;
|
||||
|
||||
__u32 extra_config_len;
|
||||
|
||||
__u32 __reserved_4;
|
||||
__u64 __reserved_2;
|
||||
__u64 __reserved_3;
|
||||
};
|
||||
|
||||
The 'event_config' field specifies what the counter should count. It
|
||||
is divided into 3 bit-fields:
|
||||
|
||||
raw_type: 1 bit (most significant bit) 0x8000_0000_0000_0000
|
||||
type: 7 bits (next most significant) 0x7f00_0000_0000_0000
|
||||
event_id: 56 bits (least significant) 0x00ff_0000_0000_0000
|
||||
|
||||
If 'raw_type' is 1, then the counter will count a hardware event
|
||||
specified by the remaining 63 bits of event_config. The encoding is
|
||||
machine-specific.
|
||||
|
||||
If 'raw_type' is 0, then the 'type' field says what kind of counter
|
||||
this is, with the following encoding:
|
||||
|
||||
enum perf_event_types {
|
||||
PERF_TYPE_HARDWARE = 0,
|
||||
PERF_TYPE_SOFTWARE = 1,
|
||||
PERF_TYPE_TRACEPOINT = 2,
|
||||
};
|
||||
|
||||
A counter of PERF_TYPE_HARDWARE will count the hardware event
|
||||
specified by 'event_id':
|
||||
|
||||
/*
|
||||
* Generalized performance counter event types, used by the hw_event.type
|
||||
* Generalized performance counter event types, used by the hw_event.event_id
|
||||
* parameter of the sys_perf_counter_open() syscall:
|
||||
*/
|
||||
enum hw_event_types {
|
||||
enum hw_event_ids {
|
||||
/*
|
||||
* Common hardware events, generalized by the kernel:
|
||||
*/
|
||||
PERF_COUNT_CYCLES = 0,
|
||||
PERF_COUNT_INSTRUCTIONS = 1,
|
||||
PERF_COUNT_CACHE_REFERENCES = 2,
|
||||
PERF_COUNT_CACHE_MISSES = 3,
|
||||
PERF_COUNT_BRANCH_INSTRUCTIONS = 4,
|
||||
PERF_COUNT_BRANCH_MISSES = 5,
|
||||
|
||||
/*
|
||||
* Special "software" counters provided by the kernel, even if
|
||||
* the hardware does not support performance counters. These
|
||||
* counters measure various physical and sw events of the
|
||||
* kernel (and allow the profiling of them as well):
|
||||
*/
|
||||
PERF_COUNT_CPU_CLOCK = -1,
|
||||
PERF_COUNT_TASK_CLOCK = -2,
|
||||
/*
|
||||
* Future software events:
|
||||
*/
|
||||
/* PERF_COUNT_PAGE_FAULTS = -3,
|
||||
PERF_COUNT_CONTEXT_SWITCHES = -4, */
|
||||
PERF_COUNT_CPU_CYCLES = 0,
|
||||
PERF_COUNT_INSTRUCTIONS = 1,
|
||||
PERF_COUNT_CACHE_REFERENCES = 2,
|
||||
PERF_COUNT_CACHE_MISSES = 3,
|
||||
PERF_COUNT_BRANCH_INSTRUCTIONS = 4,
|
||||
PERF_COUNT_BRANCH_MISSES = 5,
|
||||
PERF_COUNT_BUS_CYCLES = 6,
|
||||
};
|
||||
|
||||
These are standardized types of events that work uniformly on all CPUs
|
||||
that implements Performance Counters support under Linux. If a CPU is
|
||||
not able to count branch-misses, then the system call will return
|
||||
-EINVAL.
|
||||
These are standardized types of events that work relatively uniformly
|
||||
on all CPUs that implement Performance Counters support under Linux,
|
||||
although there may be variations (e.g., different CPUs might count
|
||||
cache references and misses at different levels of the cache hierarchy).
|
||||
If a CPU is not able to count the selected event, then the system call
|
||||
will return -EINVAL.
|
||||
|
||||
More hw_event_types are supported as well, but they are CPU
|
||||
specific and are enumerated via /sys on a per CPU basis. Raw hw event
|
||||
types can be passed in under hw_event.type if hw_event.raw is 1.
|
||||
For example, to count "External bus cycles while bus lock signal asserted"
|
||||
events on Intel Core CPUs, pass in a 0x4064 event type value and set
|
||||
hw_event.raw to 1.
|
||||
More hw_event_types are supported as well, but they are CPU-specific
|
||||
and accessed as raw events. For example, to count "External bus
|
||||
cycles while bus lock signal asserted" events on Intel Core CPUs, pass
|
||||
in a 0x4064 event_id value and set hw_event.raw_type to 1.
|
||||
|
||||
'record_type' is the type of data that a read() will provide for the
|
||||
counter, and it can be one of:
|
||||
A counter of type PERF_TYPE_SOFTWARE will count one of the available
|
||||
software events, selected by 'event_id':
|
||||
|
||||
/*
|
||||
* Special "software" counters provided by the kernel, even if the hardware
|
||||
* does not support performance counters. These counters measure various
|
||||
* physical and sw events of the kernel (and allow the profiling of them as
|
||||
* well):
|
||||
*/
|
||||
enum sw_event_ids {
|
||||
PERF_COUNT_CPU_CLOCK = 0,
|
||||
PERF_COUNT_TASK_CLOCK = 1,
|
||||
PERF_COUNT_PAGE_FAULTS = 2,
|
||||
PERF_COUNT_CONTEXT_SWITCHES = 3,
|
||||
PERF_COUNT_CPU_MIGRATIONS = 4,
|
||||
PERF_COUNT_PAGE_FAULTS_MIN = 5,
|
||||
PERF_COUNT_PAGE_FAULTS_MAJ = 6,
|
||||
};
|
||||
|
||||
Counters come in two flavours: counting counters and sampling
|
||||
counters. A "counting" counter is one that is used for counting the
|
||||
number of events that occur, and is characterised by having
|
||||
irq_period = 0 and record_type = PERF_RECORD_SIMPLE. A read() on a
|
||||
counting counter simply returns the current value of the counter as
|
||||
an 8-byte number.
|
||||
|
||||
A "sampling" counter is one that is set up to generate an interrupt
|
||||
every N events, where N is given by 'irq_period'. A sampling counter
|
||||
has irq_period > 0 and record_type != PERF_RECORD_SIMPLE. The
|
||||
record_type controls what data is recorded on each interrupt, and the
|
||||
available values are currently:
|
||||
|
||||
/*
|
||||
* IRQ-notification data record type:
|
||||
*/
|
||||
enum perf_counter_record_type {
|
||||
PERF_RECORD_SIMPLE = 0,
|
||||
PERF_RECORD_IRQ = 1,
|
||||
PERF_RECORD_GROUP = 2,
|
||||
PERF_RECORD_SIMPLE = 0,
|
||||
PERF_RECORD_IRQ = 1,
|
||||
PERF_RECORD_GROUP = 2,
|
||||
};
|
||||
|
||||
a "simple" counter is one that counts hardware events and allows
|
||||
them to be read out into a u64 count value. (read() returns 8 on
|
||||
a successful read of a simple counter.)
|
||||
A record_type value of PERF_RECORD_IRQ will record the instruction
|
||||
pointer (IP) at which the interrupt occurred. A record_type value of
|
||||
PERF_RECORD_GROUP will record the event_config and counter value of
|
||||
all of the other counters in the group, and should only be used on a
|
||||
group leader (see below). Currently these two values are mutually
|
||||
exclusive, but record_type will become a bit-mask in future and
|
||||
support other values.
|
||||
|
||||
An "irq" counter is one that will also provide an IRQ context information:
|
||||
the IP of the interrupted context. In this case read() will return
|
||||
the 8-byte counter value, plus the Instruction Pointer address of the
|
||||
interrupted context.
|
||||
A sampling counter has an event queue, into which an event is placed
|
||||
on each interrupt. A read() on a sampling counter will read the next
|
||||
event from the event queue. If the queue is empty, the read() will
|
||||
either block or return an EAGAIN error, depending on whether the fd
|
||||
has been set to non-blocking mode or not.
|
||||
|
||||
The parameter 'hw_event_period' is the number of events before waking up
|
||||
a read() that is blocked on a counter fd. Zero value means a non-blocking
|
||||
counter.
|
||||
The 'disabled' bit specifies whether the counter starts out disabled
|
||||
or enabled. If it is initially disabled, it can be enabled by ioctl
|
||||
or prctl (see below).
|
||||
|
||||
The 'pid' parameter allows the counter to be specific to a task:
|
||||
The 'nmi' bit specifies, for hardware events, whether the counter
|
||||
should be set up to request non-maskable interrupts (NMIs) or normal
|
||||
interrupts. This bit is ignored if the user doesn't have
|
||||
CAP_SYS_ADMIN privilege (i.e. is not root) or if the CPU doesn't
|
||||
generate NMIs from hardware counters.
|
||||
|
||||
The 'inherit' bit, if set, specifies that this counter should count
|
||||
events on descendant tasks as well as the task specified. This only
|
||||
applies to new descendents, not to any existing descendents at the
|
||||
time the counter is created (nor to any new descendents of existing
|
||||
descendents).
|
||||
|
||||
The 'pinned' bit, if set, specifies that the counter should always be
|
||||
on the CPU if at all possible. It only applies to hardware counters
|
||||
and only to group leaders. If a pinned counter cannot be put onto the
|
||||
CPU (e.g. because there are not enough hardware counters or because of
|
||||
a conflict with some other event), then the counter goes into an
|
||||
'error' state, where reads return end-of-file (i.e. read() returns 0)
|
||||
until the counter is subsequently enabled or disabled.
|
||||
|
||||
The 'exclusive' bit, if set, specifies that when this counter's group
|
||||
is on the CPU, it should be the only group using the CPU's counters.
|
||||
In future, this will allow sophisticated monitoring programs to supply
|
||||
extra configuration information via 'extra_config_len' to exploit
|
||||
advanced features of the CPU's Performance Monitor Unit (PMU) that are
|
||||
not otherwise accessible and that might disrupt other hardware
|
||||
counters.
|
||||
|
||||
The 'exclude_user', 'exclude_kernel' and 'exclude_hv' bits provide a
|
||||
way to request that counting of events be restricted to times when the
|
||||
CPU is in user, kernel and/or hypervisor mode.
|
||||
|
||||
|
||||
The 'pid' parameter to the perf_counter_open() system call allows the
|
||||
counter to be specific to a task:
|
||||
|
||||
pid == 0: if the pid parameter is zero, the counter is attached to the
|
||||
current task.
|
||||
@ -125,8 +218,7 @@ The 'pid' parameter allows the counter to be specific to a task:
|
||||
|
||||
pid < 0: all tasks are counted (per cpu counters)
|
||||
|
||||
The 'cpu' parameter allows a counter to be made specific to a full
|
||||
CPU:
|
||||
The 'cpu' parameter allows a counter to be made specific to a CPU:
|
||||
|
||||
cpu >= 0: the counter is restricted to a specific CPU
|
||||
cpu == -1: the counter counts on all CPUs
|
||||
@ -141,7 +233,51 @@ their own tasks.
|
||||
A 'pid == -1' and 'cpu == x' counter is a per CPU counter that counts
|
||||
all events on CPU-x. Per CPU counters need CAP_SYS_ADMIN privilege.
|
||||
|
||||
Group counters are created by passing in a group_fd of another counter.
|
||||
Groups are scheduled at once and can be used with PERF_RECORD_GROUP
|
||||
to record multi-dimensional timestamps.
|
||||
The 'flags' parameter is currently unused and must be zero.
|
||||
|
||||
The 'group_fd' parameter allows counter "groups" to be set up. A
|
||||
counter group has one counter which is the group "leader". The leader
|
||||
is created first, with group_fd = -1 in the perf_counter_open call
|
||||
that creates it. The rest of the group members are created
|
||||
subsequently, with group_fd giving the fd of the group leader.
|
||||
(A single counter on its own is created with group_fd = -1 and is
|
||||
considered to be a group with only 1 member.)
|
||||
|
||||
A counter group is scheduled onto the CPU as a unit, that is, it will
|
||||
only be put onto the CPU if all of the counters in the group can be
|
||||
put onto the CPU. This means that the values of the member counters
|
||||
can be meaningfully compared, added, divided (to get ratios), etc.,
|
||||
with each other, since they have counted events for the same set of
|
||||
executed instructions.
|
||||
|
||||
Counters can be enabled and disabled in two ways: via ioctl and via
|
||||
prctl. When a counter is disabled, it doesn't count or generate
|
||||
events but does continue to exist and maintain its count value.
|
||||
|
||||
An individual counter or counter group can be enabled with
|
||||
|
||||
ioctl(fd, PERF_COUNTER_IOC_ENABLE);
|
||||
|
||||
or disabled with
|
||||
|
||||
ioctl(fd, PERF_COUNTER_IOC_DISABLE);
|
||||
|
||||
Enabling or disabling the leader of a group enables or disables the
|
||||
whole group; that is, while the group leader is disabled, none of the
|
||||
counters in the group will count. Enabling or disabling a member of a
|
||||
group other than the leader only affects that counter - disabling an
|
||||
non-leader stops that counter from counting but doesn't affect any
|
||||
other counter.
|
||||
|
||||
A process can enable or disable all the counter groups that are
|
||||
attached to it, using prctl:
|
||||
|
||||
prctl(PR_TASK_PERF_COUNTERS_ENABLE);
|
||||
|
||||
prctl(PR_TASK_PERF_COUNTERS_DISABLE);
|
||||
|
||||
This applies to all counters on the current process, whether created
|
||||
by this process or by another, and doesn't affect any counters that
|
||||
this process has created on other processes. It only enables or
|
||||
disables the group leaders, not any other members in the groups.
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user