linux_dsm_epyc7002/samples/bpf/cpustat_kern.c

282 lines
7.0 KiB
C
Raw Permalink Normal View History

samples/bpf: Add program for CPU state statistics CPU is active when have running tasks on it and CPUFreq governor can select different operating points (OPP) according to different workload; we use 'pstate' to present CPU state which have running tasks with one specific OPP. On the other hand, CPU is idle which only idle task on it, CPUIdle governor can select one specific idle state to power off hardware logics; we use 'cstate' to present CPU idle state. Based on trace events 'cpu_idle' and 'cpu_frequency' we can accomplish the duration statistics for every state. Every time when CPU enters into or exits from idle states, the trace event 'cpu_idle' is recorded; trace event 'cpu_frequency' records the event for CPU OPP changing, so it's easily to know how long time the CPU stays in the specified OPP, and the CPU must be not in any idle state. This patch is to utilize the mentioned trace events for pstate and cstate statistics. To achieve more accurate profiling data, the program uses below sequence to insure CPU running/idle time aren't missed: - Before profiling the user space program wakes up all CPUs for once, so can avoid to missing account time for CPU staying in idle state for long time; the program forces to set 'scaling_max_freq' to lowest frequency and then restore 'scaling_max_freq' to highest frequency, this can ensure the frequency to be set to lowest frequency and later after start to run workload the frequency can be easily to be changed to higher frequency; - User space program reads map data and update statistics for every 5s, so this is same with other sample bpf programs for avoiding big overload introduced by bpf program self; - When send signal to terminate program, the signal handler wakes up all CPUs, set lowest frequency and restore highest frequency to 'scaling_max_freq'; this is exactly same with the first step so avoid to missing account CPU pstate and cstate time during last stage. Finally it reports the latest statistics. 
The program has been tested on Hikey board with octa CA53 CPUs, below is one example for statistics result, the format mainly follows up Jesper Dangaard Brouer suggestion. Jesper reminds to 'get printf to pretty print with thousands separators use %' and setlocale(LC_NUMERIC, "en_US")', tried three different arm64 GCC toolchains (5.4.0 20160609, 6.2.1 20161016, 6.3.0 20170516) but all of them cannot support printf flag character %' on arm64 platform, so go back print number without grouping mode. CPU states statistics: state(ms) cstate-0 cstate-1 cstate-2 pstate-0 pstate-1 pstate-2 pstate-3 pstate-4 CPU-0 767 6111 111863 561 31 756 853 190 CPU-1 241 10606 107956 484 125 646 990 85 CPU-2 413 19721 98735 636 84 696 757 89 CPU-3 84 11711 79989 17516 909 4811 5773 341 CPU-4 152 19610 98229 444 53 649 708 1283 CPU-5 185 8781 108697 666 91 671 677 1365 CPU-6 157 21964 95825 581 67 566 684 1284 CPU-7 125 15238 102704 398 20 665 786 1197 Cc: Daniel Lezcano <daniel.lezcano@linaro.org> Cc: Vincent Guittot <vincent.guittot@linaro.org> Signed-off-by: Leo Yan <leo.yan@linaro.org> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2018-02-26 08:19:12 +07:00
// SPDX-License-Identifier: GPL-2.0
#include <linux/version.h>
#include <linux/ptrace.h>
#include <uapi/linux/bpf.h>
#include <bpf/bpf_helpers.h>
samples/bpf: Add program for CPU state statistics CPU is active when have running tasks on it and CPUFreq governor can select different operating points (OPP) according to different workload; we use 'pstate' to present CPU state which have running tasks with one specific OPP. On the other hand, CPU is idle which only idle task on it, CPUIdle governor can select one specific idle state to power off hardware logics; we use 'cstate' to present CPU idle state. Based on trace events 'cpu_idle' and 'cpu_frequency' we can accomplish the duration statistics for every state. Every time when CPU enters into or exits from idle states, the trace event 'cpu_idle' is recorded; trace event 'cpu_frequency' records the event for CPU OPP changing, so it's easily to know how long time the CPU stays in the specified OPP, and the CPU must be not in any idle state. This patch is to utilize the mentioned trace events for pstate and cstate statistics. To achieve more accurate profiling data, the program uses below sequence to insure CPU running/idle time aren't missed: - Before profiling the user space program wakes up all CPUs for once, so can avoid to missing account time for CPU staying in idle state for long time; the program forces to set 'scaling_max_freq' to lowest frequency and then restore 'scaling_max_freq' to highest frequency, this can ensure the frequency to be set to lowest frequency and later after start to run workload the frequency can be easily to be changed to higher frequency; - User space program reads map data and update statistics for every 5s, so this is same with other sample bpf programs for avoiding big overload introduced by bpf program self; - When send signal to terminate program, the signal handler wakes up all CPUs, set lowest frequency and restore highest frequency to 'scaling_max_freq'; this is exactly same with the first step so avoid to missing account CPU pstate and cstate time during last stage. Finally it reports the latest statistics. 
The program has been tested on Hikey board with octa CA53 CPUs, below is one example for statistics result, the format mainly follows up Jesper Dangaard Brouer suggestion. Jesper reminds to 'get printf to pretty print with thousands separators use %' and setlocale(LC_NUMERIC, "en_US")', tried three different arm64 GCC toolchains (5.4.0 20160609, 6.2.1 20161016, 6.3.0 20170516) but all of them cannot support printf flag character %' on arm64 platform, so go back print number without grouping mode. CPU states statistics: state(ms) cstate-0 cstate-1 cstate-2 pstate-0 pstate-1 pstate-2 pstate-3 pstate-4 CPU-0 767 6111 111863 561 31 756 853 190 CPU-1 241 10606 107956 484 125 646 990 85 CPU-2 413 19721 98735 636 84 696 757 89 CPU-3 84 11711 79989 17516 909 4811 5773 341 CPU-4 152 19610 98229 444 53 649 708 1283 CPU-5 185 8781 108697 666 91 671 677 1365 CPU-6 157 21964 95825 581 67 566 684 1284 CPU-7 125 15238 102704 398 20 665 786 1197 Cc: Daniel Lezcano <daniel.lezcano@linaro.org> Cc: Vincent Guittot <vincent.guittot@linaro.org> Signed-off-by: Leo Yan <leo.yan@linaro.org> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2018-02-26 08:19:12 +07:00
/*
* The CPU number, cstate number and pstate number are based
* on 96boards Hikey with octa CA53 CPUs.
*
* Every CPU has three idle states for cstate:
* WFI, CPU_OFF, CLUSTER_OFF
*
* Every CPU has 5 operating points:
* 208MHz, 432MHz, 729MHz, 960MHz, 1200MHz
*
* This code is based on these assumptions; other platforms
* need to adjust these definitions.
*/
#define MAX_CPU			8	/* number of CPUs the maps are sized for */
#define MAX_PSTATE_ENTRIES	5	/* operating points per CPU */
#define MAX_CSTATE_ENTRIES	3	/* idle states per CPU */

/*
 * Operating-point frequencies, lowest first.  The position of a frequency
 * in this table is the pstate index used to address pstate_duration.
 */
static int cpu_opps[] = {
	208000,
	432000,
	729000,
	960000,
	1200000,
};
/*
* The my_map structure is used to record the cstate and pstate index and
* timestamp (Idx, Ts); when a new event arrives we need to update the
* combination with the new state index and timestamp (Idx`, Ts`).
*
* Based on (Idx, Ts) and (Idx`, Ts`) we can calculate the time
* interval for the previous state: Duration(Idx) = Ts` - Ts.
*
* Every CPU has one array as shown below for recording state index and
* timestamp, with cstate and pstate recorded separately:
*
* +--------------------------+
* | cstate timestamp |
* +--------------------------+
* | cstate index |
* +--------------------------+
* | pstate timestamp |
* +--------------------------+
* | pstate index |
* +--------------------------+
*/
/* Slot offsets inside the group of my_map entries owned by one CPU. */
enum {
	MAP_OFF_CSTATE_TIME	= 0,	/* timestamp of the last cpu_idle event */
	MAP_OFF_CSTATE_IDX	= 1,	/* state value of the last cpu_idle event */
	MAP_OFF_PSTATE_TIME	= 2,	/* start timestamp of the current pstate */
	MAP_OFF_PSTATE_IDX	= 3,	/* state value of the last cpu_frequency event */
	MAP_OFF_NUM		= 4,	/* number of slots per CPU */
};
/*
 * my_map: array map with u32 keys and u64 values, sized for MAX_CPU groups
 * of MAP_OFF_NUM slots.  Both tracepoint programs in this file address a
 * slot with key = cpu * MAP_OFF_NUM + MAP_OFF_*.
 */
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__type(key, u32);
__type(value, u64);
__uint(max_entries, MAX_CPU * MAP_OFF_NUM);
} my_map SEC(".maps");
samples/bpf: Add program for CPU state statistics CPU is active when have running tasks on it and CPUFreq governor can select different operating points (OPP) according to different workload; we use 'pstate' to present CPU state which have running tasks with one specific OPP. On the other hand, CPU is idle which only idle task on it, CPUIdle governor can select one specific idle state to power off hardware logics; we use 'cstate' to present CPU idle state. Based on trace events 'cpu_idle' and 'cpu_frequency' we can accomplish the duration statistics for every state. Every time when CPU enters into or exits from idle states, the trace event 'cpu_idle' is recorded; trace event 'cpu_frequency' records the event for CPU OPP changing, so it's easily to know how long time the CPU stays in the specified OPP, and the CPU must be not in any idle state. This patch is to utilize the mentioned trace events for pstate and cstate statistics. To achieve more accurate profiling data, the program uses below sequence to insure CPU running/idle time aren't missed: - Before profiling the user space program wakes up all CPUs for once, so can avoid to missing account time for CPU staying in idle state for long time; the program forces to set 'scaling_max_freq' to lowest frequency and then restore 'scaling_max_freq' to highest frequency, this can ensure the frequency to be set to lowest frequency and later after start to run workload the frequency can be easily to be changed to higher frequency; - User space program reads map data and update statistics for every 5s, so this is same with other sample bpf programs for avoiding big overload introduced by bpf program self; - When send signal to terminate program, the signal handler wakes up all CPUs, set lowest frequency and restore highest frequency to 'scaling_max_freq'; this is exactly same with the first step so avoid to missing account CPU pstate and cstate time during last stage. Finally it reports the latest statistics. 
The program has been tested on Hikey board with octa CA53 CPUs, below is one example for statistics result, the format mainly follows up Jesper Dangaard Brouer suggestion. Jesper reminds to 'get printf to pretty print with thousands separators use %' and setlocale(LC_NUMERIC, "en_US")', tried three different arm64 GCC toolchains (5.4.0 20160609, 6.2.1 20161016, 6.3.0 20170516) but all of them cannot support printf flag character %' on arm64 platform, so go back print number without grouping mode. CPU states statistics: state(ms) cstate-0 cstate-1 cstate-2 pstate-0 pstate-1 pstate-2 pstate-3 pstate-4 CPU-0 767 6111 111863 561 31 756 853 190 CPU-1 241 10606 107956 484 125 646 990 85 CPU-2 413 19721 98735 636 84 696 757 89 CPU-3 84 11711 79989 17516 909 4811 5773 341 CPU-4 152 19610 98229 444 53 649 708 1283 CPU-5 185 8781 108697 666 91 671 677 1365 CPU-6 157 21964 95825 581 67 566 684 1284 CPU-7 125 15238 102704 398 20 665 786 1197 Cc: Daniel Lezcano <daniel.lezcano@linaro.org> Cc: Vincent Guittot <vincent.guittot@linaro.org> Signed-off-by: Leo Yan <leo.yan@linaro.org> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2018-02-26 08:19:12 +07:00
/*
 * cstate_duration accumulates, per CPU, the time spent in every idle state.
 * Array map with u32 keys and u64 values; the slot for an idle state of a
 * CPU is key = cpu * MAX_CSTATE_ENTRIES + state.  Values are sums of
 * differences between bpf_ktime_get_ns() timestamps.
 */
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__type(key, u32);
__type(value, u64);
__uint(max_entries, MAX_CPU * MAX_CSTATE_ENTRIES);
} cstate_duration SEC(".maps");
samples/bpf: Add program for CPU state statistics CPU is active when have running tasks on it and CPUFreq governor can select different operating points (OPP) according to different workload; we use 'pstate' to present CPU state which have running tasks with one specific OPP. On the other hand, CPU is idle which only idle task on it, CPUIdle governor can select one specific idle state to power off hardware logics; we use 'cstate' to present CPU idle state. Based on trace events 'cpu_idle' and 'cpu_frequency' we can accomplish the duration statistics for every state. Every time when CPU enters into or exits from idle states, the trace event 'cpu_idle' is recorded; trace event 'cpu_frequency' records the event for CPU OPP changing, so it's easily to know how long time the CPU stays in the specified OPP, and the CPU must be not in any idle state. This patch is to utilize the mentioned trace events for pstate and cstate statistics. To achieve more accurate profiling data, the program uses below sequence to insure CPU running/idle time aren't missed: - Before profiling the user space program wakes up all CPUs for once, so can avoid to missing account time for CPU staying in idle state for long time; the program forces to set 'scaling_max_freq' to lowest frequency and then restore 'scaling_max_freq' to highest frequency, this can ensure the frequency to be set to lowest frequency and later after start to run workload the frequency can be easily to be changed to higher frequency; - User space program reads map data and update statistics for every 5s, so this is same with other sample bpf programs for avoiding big overload introduced by bpf program self; - When send signal to terminate program, the signal handler wakes up all CPUs, set lowest frequency and restore highest frequency to 'scaling_max_freq'; this is exactly same with the first step so avoid to missing account CPU pstate and cstate time during last stage. Finally it reports the latest statistics. 
The program has been tested on Hikey board with octa CA53 CPUs, below is one example for statistics result, the format mainly follows up Jesper Dangaard Brouer suggestion. Jesper reminds to 'get printf to pretty print with thousands separators use %' and setlocale(LC_NUMERIC, "en_US")', tried three different arm64 GCC toolchains (5.4.0 20160609, 6.2.1 20161016, 6.3.0 20170516) but all of them cannot support printf flag character %' on arm64 platform, so go back print number without grouping mode. CPU states statistics: state(ms) cstate-0 cstate-1 cstate-2 pstate-0 pstate-1 pstate-2 pstate-3 pstate-4 CPU-0 767 6111 111863 561 31 756 853 190 CPU-1 241 10606 107956 484 125 646 990 85 CPU-2 413 19721 98735 636 84 696 757 89 CPU-3 84 11711 79989 17516 909 4811 5773 341 CPU-4 152 19610 98229 444 53 649 708 1283 CPU-5 185 8781 108697 666 91 671 677 1365 CPU-6 157 21964 95825 581 67 566 684 1284 CPU-7 125 15238 102704 398 20 665 786 1197 Cc: Daniel Lezcano <daniel.lezcano@linaro.org> Cc: Vincent Guittot <vincent.guittot@linaro.org> Signed-off-by: Leo Yan <leo.yan@linaro.org> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2018-02-26 08:19:12 +07:00
/*
 * pstate_duration accumulates, per CPU, the time spent at every operating
 * point.  Array map with u32 keys and u64 values; the slot for an operating
 * point of a CPU is key = cpu * MAX_PSTATE_ENTRIES + pstate_idx, where
 * pstate_idx is the value returned by find_cpu_pstate_idx().  Values are
 * sums of differences between bpf_ktime_get_ns() timestamps.
 */
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__type(key, u32);
__type(value, u64);
__uint(max_entries, MAX_CPU * MAX_PSTATE_ENTRIES);
} pstate_duration SEC(".maps");
samples/bpf: Add program for CPU state statistics CPU is active when have running tasks on it and CPUFreq governor can select different operating points (OPP) according to different workload; we use 'pstate' to present CPU state which have running tasks with one specific OPP. On the other hand, CPU is idle which only idle task on it, CPUIdle governor can select one specific idle state to power off hardware logics; we use 'cstate' to present CPU idle state. Based on trace events 'cpu_idle' and 'cpu_frequency' we can accomplish the duration statistics for every state. Every time when CPU enters into or exits from idle states, the trace event 'cpu_idle' is recorded; trace event 'cpu_frequency' records the event for CPU OPP changing, so it's easily to know how long time the CPU stays in the specified OPP, and the CPU must be not in any idle state. This patch is to utilize the mentioned trace events for pstate and cstate statistics. To achieve more accurate profiling data, the program uses below sequence to insure CPU running/idle time aren't missed: - Before profiling the user space program wakes up all CPUs for once, so can avoid to missing account time for CPU staying in idle state for long time; the program forces to set 'scaling_max_freq' to lowest frequency and then restore 'scaling_max_freq' to highest frequency, this can ensure the frequency to be set to lowest frequency and later after start to run workload the frequency can be easily to be changed to higher frequency; - User space program reads map data and update statistics for every 5s, so this is same with other sample bpf programs for avoiding big overload introduced by bpf program self; - When send signal to terminate program, the signal handler wakes up all CPUs, set lowest frequency and restore highest frequency to 'scaling_max_freq'; this is exactly same with the first step so avoid to missing account CPU pstate and cstate time during last stage. Finally it reports the latest statistics. 
The program has been tested on Hikey board with octa CA53 CPUs, below is one example for statistics result, the format mainly follows up Jesper Dangaard Brouer suggestion. Jesper reminds to 'get printf to pretty print with thousands separators use %' and setlocale(LC_NUMERIC, "en_US")', tried three different arm64 GCC toolchains (5.4.0 20160609, 6.2.1 20161016, 6.3.0 20170516) but all of them cannot support printf flag character %' on arm64 platform, so go back print number without grouping mode. CPU states statistics: state(ms) cstate-0 cstate-1 cstate-2 pstate-0 pstate-1 pstate-2 pstate-3 pstate-4 CPU-0 767 6111 111863 561 31 756 853 190 CPU-1 241 10606 107956 484 125 646 990 85 CPU-2 413 19721 98735 636 84 696 757 89 CPU-3 84 11711 79989 17516 909 4811 5773 341 CPU-4 152 19610 98229 444 53 649 708 1283 CPU-5 185 8781 108697 666 91 671 677 1365 CPU-6 157 21964 95825 581 67 566 684 1284 CPU-7 125 15238 102704 398 20 665 786 1197 Cc: Daniel Lezcano <daniel.lezcano@linaro.org> Cc: Vincent Guittot <vincent.guittot@linaro.org> Signed-off-by: Leo Yan <leo.yan@linaro.org> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2018-02-26 08:19:12 +07:00
/*
* The trace events for cpu_idle and cpu_frequency are taken from:
* /sys/kernel/debug/tracing/events/power/cpu_idle/format
* /sys/kernel/debug/tracing/events/power/cpu_frequency/format
*
* These two events have the same format, so define one common structure.
*/
/*
 * Context layout shared by the cpu_idle and cpu_frequency tracepoint
 * programs in this file.
 */
struct cpu_args {
/* 8 bytes ahead of the event fields; presumably the common tracepoint
 * header - confirm against the format files before relying on it. */
u64 pad;
/* cpu_idle: idle state, or (u32)-1 on idle exit; cpu_frequency: the value
 * matched against cpu_opps[] by find_cpu_pstate_idx(). */
u32 state;
/* CPU the event refers to; used to select the per-CPU map slots. */
u32 cpu_id;
};
/*
 * Map a frequency onto its position in cpu_opps[].
 *
 * Returns the index of the first matching entry, or the number of entries
 * in cpu_opps[] (MAX_PSTATE_ENTRIES for the table in this file) when the
 * frequency is not listed.
 */
static u32 find_cpu_pstate_idx(u32 frequency)
{
	const u32 nr_opps = sizeof(cpu_opps) / sizeof(cpu_opps[0]);
	u32 idx = 0;

	/* Linear scan; stops on the first hit or one past the last entry. */
	while (idx < nr_opps && cpu_opps[idx] != frequency)
		idx++;

	return idx;
}
/*
 * Handler for the power/cpu_idle tracepoint.
 *
 * ctx->state is the idle state being entered, or (u32)-1 when the CPU leaves
 * idle.  On idle entry the time since the pstate timestamp is added to
 * pstate_duration for the current frequency; on idle exit the time since the
 * previous cpu_idle event is added to cstate_duration for the idle state that
 * was recorded by that previous event.
 *
 * Events for CPUs outside 0 .. MAX_CPU - 1 and idle states outside
 * 0 .. MAX_CSTATE_ENTRIES - 1 are not accounted.  Always returns 0.
 */
SEC("tracepoint/power/cpu_idle")
int bpf_prog1(struct cpu_args *ctx)
{
	u64 *cts, *pts, *cstate, *pstate, prev_state, cur_ts, delta;
	u32 key, cpu, pstate_idx;
	u64 *val;

	/* Valid CPU ids are 0 .. MAX_CPU - 1; MAX_CPU itself is out of range. */
	if (ctx->cpu_id >= MAX_CPU)
		return 0;

	cpu = ctx->cpu_id;

	key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_TIME;
	cts = bpf_map_lookup_elem(&my_map, &key);
	if (!cts)
		return 0;

	key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_IDX;
	cstate = bpf_map_lookup_elem(&my_map, &key);
	if (!cstate)
		return 0;

	key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_TIME;
	pts = bpf_map_lookup_elem(&my_map, &key);
	if (!pts)
		return 0;

	key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_IDX;
	pstate = bpf_map_lookup_elem(&my_map, &key);
	if (!pstate)
		return 0;

	/* Remember the state recorded by the previous event, store the new one. */
	prev_state = *cstate;
	*cstate = ctx->state;

	/* First cpu_idle event on this CPU: only start the clock. */
	if (!*cts) {
		*cts = bpf_ktime_get_ns();
		return 0;
	}

	cur_ts = bpf_ktime_get_ns();
	delta = cur_ts - *cts;
	*cts = cur_ts;

	if (ctx->state != (u32)-1) {
		/*
		 * When state doesn't equal (u32)-1, the cpu is entering an
		 * idle state; for this case we need to record the interval
		 * for the pstate.
		 *
		 *              OPP2
		 *          +---------------------+
		 *     OPP1 |                     |
		 *  --------+                     |
		 *                                |  Idle state
		 *                                +---------------
		 *
		 *          |<- pstate duration ->|
		 *          ^                     ^
		 *         pts                  cur_ts
		 */

		/* record pstate only after the first cpu_frequency event */
		if (!*pts)
			return 0;

		delta = cur_ts - *pts;

		pstate_idx = find_cpu_pstate_idx(*pstate);
		if (pstate_idx >= MAX_PSTATE_ENTRIES)
			return 0;

		key = cpu * MAX_PSTATE_ENTRIES + pstate_idx;
		val = bpf_map_lookup_elem(&pstate_duration, &key);
		if (val)
			__sync_fetch_and_add((long *)val, delta);
	} else {
		/*
		 * When state equals (u32)-1, the cpu just exited from one
		 * specific idle state; for this case we need to record the
		 * interval for the cstate.
		 *
		 *     OPP2
		 *  -----------+
		 *             |                       OPP1
		 *             |                     +-----------
		 *             |     Idle state      |
		 *             +---------------------+
		 *
		 *             |<- cstate duration ->|
		 *             ^                     ^
		 *            cts                  cur_ts
		 *
		 * prev_state is whatever the previous event reported.  It
		 * must be range-checked before it is used as an index: a
		 * larger idle state number, or a stale (u32)-1 from a
		 * repeated exit event, would otherwise select a slot that
		 * belongs to a different CPU.
		 */
		if (prev_state < MAX_CSTATE_ENTRIES) {
			key = cpu * MAX_CSTATE_ENTRIES + prev_state;
			val = bpf_map_lookup_elem(&cstate_duration, &key);
			if (val)
				__sync_fetch_and_add((long *)val, delta);
		}
	}

	/* Update timestamp for pstate as new start time */
	if (*pts)
		*pts = cur_ts;

	return 0;
}
/*
 * Handler for the power/cpu_frequency tracepoint.
 *
 * ctx->state is the new frequency.  The time elapsed since the pstate
 * timestamp was spent at the frequency recorded by the previous event, so
 * it is added to pstate_duration for that previous frequency - unless the
 * recorded cstate shows the CPU as idle, in which case nothing is accounted.
 *
 * Events for CPUs outside 0 .. MAX_CPU - 1 are ignored.  Always returns 0.
 */
SEC("tracepoint/power/cpu_frequency")
int bpf_prog2(struct cpu_args *ctx)
{
	u64 *pts, *cstate, *pstate, prev_state, cur_ts, delta;
	u32 key, cpu, pstate_idx;
	u64 *val;

	/* Same CPU range check as the cpu_idle handler. */
	if (ctx->cpu_id >= MAX_CPU)
		return 0;

	cpu = ctx->cpu_id;

	key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_TIME;
	pts = bpf_map_lookup_elem(&my_map, &key);
	if (!pts)
		return 0;

	key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_IDX;
	pstate = bpf_map_lookup_elem(&my_map, &key);
	if (!pstate)
		return 0;

	key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_IDX;
	cstate = bpf_map_lookup_elem(&my_map, &key);
	if (!cstate)
		return 0;

	/* Keep the frequency that was in effect until now, store the new one. */
	prev_state = *pstate;
	*pstate = ctx->state;

	/* First cpu_frequency event on this CPU: only start the clock. */
	if (!*pts) {
		*pts = bpf_ktime_get_ns();
		return 0;
	}

	cur_ts = bpf_ktime_get_ns();
	delta = cur_ts - *pts;
	*pts = cur_ts;

	/* When CPU is in idle, bail out to skip pstate statistics */
	if (*cstate != (u32)(-1))
		return 0;

	/*
	 * The cpu changes to another different OPP (in below diagram
	 * change frequency from OPP3 to OPP1), need recording interval
	 * for previous frequency OPP3 and update timestamp as start
	 * time for new frequency OPP1.
	 *
	 *              OPP3
	 *          +---------------------+
	 *     OPP2 |                     |
	 *  --------+                     |
	 *                                |    OPP1
	 *                                +---------------
	 *
	 *          |<- pstate duration ->|
	 *          ^                     ^
	 *         pts                  cur_ts
	 *
	 * The interval therefore belongs to prev_state, not to the value
	 * that was just written into *pstate.
	 */
	pstate_idx = find_cpu_pstate_idx(prev_state);
	if (pstate_idx >= MAX_PSTATE_ENTRIES)
		return 0;

	key = cpu * MAX_PSTATE_ENTRIES + pstate_idx;
	val = bpf_map_lookup_elem(&pstate_duration, &key);
	if (val)
		__sync_fetch_and_add((long *)val, delta);

	return 0;
}
/* License string placed in the "license" section of the object file. */
char _license[] SEC("license") = "GPL";
/* LINUX_VERSION_CODE of the build headers, placed in the "version" section. */
u32 _version SEC("version") = LINUX_VERSION_CODE;