2014-09-05 12:17:18 +07:00
|
|
|
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
|
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of version 2 of the GNU General Public
|
|
|
|
* License as published by the Free Software Foundation.
|
|
|
|
*/
|
|
|
|
#ifndef _UAPI__LINUX_BPF_H__
|
|
|
|
#define _UAPI__LINUX_BPF_H__
|
|
|
|
|
|
|
|
#include <linux/types.h>
|
2014-10-14 16:08:54 +07:00
|
|
|
#include <linux/bpf_common.h>
|
2014-09-05 12:17:18 +07:00
|
|
|
|
|
|
|
/* Extended instruction set based on top of classic BPF */
|
|
|
|
|
|
|
|
/* instruction classes */
|
|
|
|
#define BPF_ALU64 0x07 /* alu mode in double word width */
|
|
|
|
|
|
|
|
/* ld/ldx fields */
|
|
|
|
#define BPF_DW 0x18 /* double word */
|
|
|
|
#define BPF_XADD 0xc0 /* exclusive add */
|
|
|
|
|
|
|
|
/* alu/jmp fields */
|
|
|
|
#define BPF_MOV 0xb0 /* mov reg to reg */
|
|
|
|
#define BPF_ARSH 0xc0 /* sign extending arithmetic shift right */
|
|
|
|
|
|
|
|
/* change endianness of a register */
|
|
|
|
#define BPF_END 0xd0 /* flags for endianness conversion: */
|
|
|
|
#define BPF_TO_LE 0x00 /* convert to little-endian */
|
|
|
|
#define BPF_TO_BE 0x08 /* convert to big-endian */
|
|
|
|
#define BPF_FROM_LE BPF_TO_LE /* alias: expands to the same flag value as BPF_TO_LE */
|
|
|
|
#define BPF_FROM_BE BPF_TO_BE /* alias: expands to the same flag value as BPF_TO_BE */
|
|
|
|
|
|
|
|
#define BPF_JNE 0x50 /* jump != */
|
|
|
|
#define BPF_JSGT 0x60 /* SGT is signed '>', GT in x86 */
|
|
|
|
#define BPF_JSGE 0x70 /* SGE is signed '>=', GE in x86 */
|
|
|
|
#define BPF_CALL 0x80 /* function call */
|
|
|
|
#define BPF_EXIT 0x90 /* function return */
|
|
|
|
|
|
|
|
/* Register numbers */
|
|
|
|
enum {
|
|
|
|
BPF_REG_0 = 0, /* numbering starts at 0; each following entry is one higher */
|
|
|
|
BPF_REG_1,
|
|
|
|
BPF_REG_2,
|
|
|
|
BPF_REG_3,
|
|
|
|
BPF_REG_4,
|
|
|
|
BPF_REG_5,
|
|
|
|
BPF_REG_6,
|
|
|
|
BPF_REG_7,
|
|
|
|
BPF_REG_8,
|
|
|
|
BPF_REG_9,
|
|
|
|
BPF_REG_10,
|
|
|
|
__MAX_BPF_REG, /* sentinel: one past BPF_REG_10, so it equals the number of entries (11) */
|
|
|
|
};
|
|
|
|
|
|
|
|
/* BPF has 10 general purpose 64-bit registers and stack frame. */
|
|
|
|
#define MAX_BPF_REG __MAX_BPF_REG
|
|
|
|
|
|
|
|
/*
 * One eBPF instruction: an opcode byte, two 4-bit register bit-fields,
 * a signed 16-bit offset and a signed 32-bit immediate.
 */
struct bpf_insn {
|
|
|
|
__u8 code; /* opcode */
|
|
|
|
__u8 dst_reg:4; /* dest register */
|
|
|
|
__u8 src_reg:4; /* source register */
|
|
|
|
__s16 off; /* signed offset */
|
|
|
|
__s32 imm; /* signed immediate constant */
|
|
|
|
};
|
|
|
|
|
2014-09-26 14:16:57 +07:00
|
|
|
/* BPF syscall commands */
|
|
|
|
enum bpf_cmd {
|
|
|
|
/* create a map with given type and attributes
|
|
|
|
* fd = bpf(BPF_MAP_CREATE, union bpf_attr *, u32 size)
|
|
|
|
* returns fd or negative error
|
|
|
|
* map is deleted when fd is closed
|
|
|
|
*/
|
|
|
|
BPF_MAP_CREATE,
|
bpf: add lookup/update/delete/iterate methods to BPF maps
'maps' is a generic storage of different types for sharing data between kernel
and userspace.
The maps are accessed from user space via BPF syscall, which has commands:
- create a map with given type and attributes
fd = bpf(BPF_MAP_CREATE, union bpf_attr *attr, u32 size)
returns fd or negative error
- lookup key in a given map referenced by fd
err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size)
using attr->map_fd, attr->key, attr->value
returns zero and stores found elem into value or negative error
- create or update key/value pair in a given map
err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size)
using attr->map_fd, attr->key, attr->value
returns zero or negative error
- find and delete element by key in a given map
err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size)
using attr->map_fd, attr->key
- iterate map elements (based on input key return next_key)
err = bpf(BPF_MAP_GET_NEXT_KEY, union bpf_attr *attr, u32 size)
using attr->map_fd, attr->key, attr->next_key
- close(fd) deletes the map
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-09-26 14:16:59 +07:00
|
|
|
|
|
|
|
/* lookup key in a given map
|
|
|
|
* err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size)
|
|
|
|
* Using attr->map_fd, attr->key, attr->value
|
|
|
|
* returns zero and stores found elem into value
|
|
|
|
* or negative error
|
|
|
|
*/
|
|
|
|
BPF_MAP_LOOKUP_ELEM,
|
|
|
|
|
|
|
|
/* create or update key/value pair in a given map
|
|
|
|
* err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size)
|
bpf: add 'flags' attribute to BPF_MAP_UPDATE_ELEM command
the current meaning of BPF_MAP_UPDATE_ELEM syscall command is:
either update existing map element or create a new one.
Initially the plan was to add a new command to handle the case of
'create new element if it didn't exist', but 'flags' style looks
cleaner and overall diff is much smaller (more code reused), so add 'flags'
attribute to BPF_MAP_UPDATE_ELEM command with the following meaning:
#define BPF_ANY 0 /* create new element or update existing */
#define BPF_NOEXIST 1 /* create new element if it didn't exist */
#define BPF_EXIST 2 /* update existing element */
bpf_update_elem(fd, key, value, BPF_NOEXIST) call can fail with EEXIST
if element already exists.
bpf_update_elem(fd, key, value, BPF_EXIST) can fail with ENOENT
if element doesn't exist.
Userspace will call it as:
int bpf_update_elem(int fd, void *key, void *value, __u64 flags)
{
union bpf_attr attr = {
.map_fd = fd,
.key = ptr_to_u64(key),
.value = ptr_to_u64(value),
.flags = flags,
};
return bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
}
First two bits of 'flags' are used to encode style of bpf_update_elem() command.
Bits 2-63 are reserved for future use.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-11-14 08:36:44 +07:00
|
|
|
* Using attr->map_fd, attr->key, attr->value, attr->flags
|
bpf: add lookup/update/delete/iterate methods to BPF maps
'maps' is a generic storage of different types for sharing data between kernel
and userspace.
The maps are accessed from user space via BPF syscall, which has commands:
- create a map with given type and attributes
fd = bpf(BPF_MAP_CREATE, union bpf_attr *attr, u32 size)
returns fd or negative error
- lookup key in a given map referenced by fd
err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size)
using attr->map_fd, attr->key, attr->value
returns zero and stores found elem into value or negative error
- create or update key/value pair in a given map
err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size)
using attr->map_fd, attr->key, attr->value
returns zero or negative error
- find and delete element by key in a given map
err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size)
using attr->map_fd, attr->key
- iterate map elements (based on input key return next_key)
err = bpf(BPF_MAP_GET_NEXT_KEY, union bpf_attr *attr, u32 size)
using attr->map_fd, attr->key, attr->next_key
- close(fd) deletes the map
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-09-26 14:16:59 +07:00
|
|
|
* returns zero or negative error
|
|
|
|
*/
|
|
|
|
BPF_MAP_UPDATE_ELEM,
|
|
|
|
|
|
|
|
/* find and delete elem by key in a given map
|
|
|
|
* err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size)
|
|
|
|
* Using attr->map_fd, attr->key
|
|
|
|
* returns zero or negative error
|
|
|
|
*/
|
|
|
|
BPF_MAP_DELETE_ELEM,
|
|
|
|
|
|
|
|
/* lookup key in a given map and return next key
|
|
|
|
* err = bpf(BPF_MAP_GET_NEXT_KEY, union bpf_attr *attr, u32 size)
|
|
|
|
* Using attr->map_fd, attr->key, attr->next_key
|
|
|
|
* returns zero and stores next key or negative error
|
|
|
|
*/
|
|
|
|
BPF_MAP_GET_NEXT_KEY,
|
2014-09-26 14:17:00 +07:00
|
|
|
|
|
|
|
/* verify and load eBPF program
|
|
|
|
* prog_fd = bpf(BPF_PROG_LOAD, union bpf_attr *attr, u32 size)
|
|
|
|
* Using attr->prog_type, attr->insns, attr->license
|
|
|
|
* returns fd or negative error
|
|
|
|
*/
|
|
|
|
BPF_PROG_LOAD,
|
2014-09-26 14:16:57 +07:00
|
|
|
};
|
|
|
|
|
|
|
|
enum bpf_map_type {
|
|
|
|
BPF_MAP_TYPE_UNSPEC,
|
2014-11-14 08:36:45 +07:00
|
|
|
BPF_MAP_TYPE_HASH,
|
2014-11-14 08:36:46 +07:00
|
|
|
BPF_MAP_TYPE_ARRAY,
|
bpf: allow bpf programs to tail-call other bpf programs
introduce bpf_tail_call(ctx, &jmp_table, index) helper function
which can be used from BPF programs like:
int bpf_prog(struct pt_regs *ctx)
{
...
bpf_tail_call(ctx, &jmp_table, index);
...
}
that is roughly equivalent to:
int bpf_prog(struct pt_regs *ctx)
{
...
if (jmp_table[index])
return (*jmp_table[index])(ctx);
...
}
The important detail is that it's not a normal call, but a tail call.
The kernel stack is precious, so this helper reuses the current
stack frame and jumps into another BPF program without adding
extra call frame.
It's trivially done in interpreter and a bit trickier in JITs.
In case of x64 JIT the bigger part of generated assembler prologue
is common for all programs, so it is simply skipped while jumping.
Other JITs can do similar prologue-skipping optimization or
do stack unwind before jumping into the next program.
bpf_tail_call() arguments:
ctx - context pointer
jmp_table - one of BPF_MAP_TYPE_PROG_ARRAY maps used as the jump table
index - index in the jump table
Since all BPF programs are identified by file descriptor, user space
needs to populate the jmp_table with FDs of other BPF programs.
If jmp_table[index] is empty the bpf_tail_call() doesn't jump anywhere
and program execution continues as normal.
New BPF_MAP_TYPE_PROG_ARRAY map type is introduced so that user space can
populate this jmp_table array with FDs of other bpf programs.
Programs can share the same jmp_table array or use multiple jmp_tables.
The chain of tail calls can form unpredictable dynamic loops therefore
tail_call_cnt is used to limit the number of calls and currently is set to 32.
Use cases:
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
==========
- simplify complex programs by splitting them into a sequence of small programs
- dispatch routine
For tracing and future seccomp the program may be triggered on all system
calls, but processing of syscall arguments will be different. It's more
efficient to implement them as:
int syscall_entry(struct seccomp_data *ctx)
{
bpf_tail_call(ctx, &syscall_jmp_table, ctx->nr /* syscall number */);
... default: process unknown syscall ...
}
int sys_write_event(struct seccomp_data *ctx) {...}
int sys_read_event(struct seccomp_data *ctx) {...}
syscall_jmp_table[__NR_write] = sys_write_event;
syscall_jmp_table[__NR_read] = sys_read_event;
For networking the program may call into different parsers depending on
packet format, like:
int packet_parser(struct __sk_buff *skb)
{
... parse L2, L3 here ...
__u8 ipproto = load_byte(skb, ... offsetof(struct iphdr, protocol));
bpf_tail_call(skb, &ipproto_jmp_table, ipproto);
... default: process unknown protocol ...
}
int parse_tcp(struct __sk_buff *skb) {...}
int parse_udp(struct __sk_buff *skb) {...}
ipproto_jmp_table[IPPROTO_TCP] = parse_tcp;
ipproto_jmp_table[IPPROTO_UDP] = parse_udp;
- for TC use case, bpf_tail_call() allows to implement reclassify-like logic
- bpf_map_update_elem/delete calls into BPF_MAP_TYPE_PROG_ARRAY jump table
are atomic, so user space can build chains of BPF programs on the fly
Implementation details:
=======================
- high performance of bpf_tail_call() is the goal.
It could have been implemented without JIT changes as a wrapper on top of
BPF_PROG_RUN() macro, but with two downsides:
. all programs would have to pay performance penalty for this feature and
tail call itself would be slower, since mandatory stack unwind, return,
stack allocate would be done for every tailcall.
. tailcall would be limited to programs running preempt_disabled, since
generic 'void *ctx' doesn't have room for 'tail_call_cnt' and it would
need to be either global per_cpu variable accessed by helper and by wrapper
or global variable protected by locks.
In this implementation x64 JIT bypasses stack unwind and jumps into the
callee program after prologue.
- bpf_prog_array_compatible() ensures that prog_type of callee and caller
are the same and JITed/non-JITed flag is the same, since calling JITed
program from non-JITed is invalid, since stack frames are different.
Similarly calling kprobe type program from socket type program is invalid.
- jump table is implemented as BPF_MAP_TYPE_PROG_ARRAY to reuse 'map'
abstraction, its user space API and all of verifier logic.
It's in the existing arraymap.c file, since several functions are
shared with regular array map.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-20 06:59:03 +07:00
|
|
|
BPF_MAP_TYPE_PROG_ARRAY,
|
2014-09-26 14:16:57 +07:00
|
|
|
};
|
|
|
|
|
2014-09-26 14:17:00 +07:00
|
|
|
enum bpf_prog_type {
|
|
|
|
BPF_PROG_TYPE_UNSPEC,
|
2014-12-02 06:06:34 +07:00
|
|
|
BPF_PROG_TYPE_SOCKET_FILTER,
|
tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
struct perf_event_attr attr = {
.type = PERF_TYPE_TRACEPOINT,
.config = event_id,
...
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wrapper of probe_kernel_read() used to access any
kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-26 02:49:20 +07:00
|
|
|
BPF_PROG_TYPE_KPROBE,
|
ebpf: add sched_cls_type and map it to sk_filter's verifier ops
As discussed recently and at netconf/netdev01, we want to prevent making
bpf_verifier_ops registration available for modules, but have them at a
controlled place inside the kernel instead.
The reason for this is, that out-of-tree modules can go crazy and define
and register any verifier ops they want, doing all sorts of crap, even
bypassing available GPLed eBPF helper functions. We don't want to offer
such a shiny playground, of course, but keep strict control to ourselves
inside the core kernel.
This also encourages us to design eBPF user helpers carefully and
generically, so they can be shared among various subsystems using eBPF.
For the eBPF traffic classifier (cls_bpf), it's a good start to share
the same helper facilities as we currently do in eBPF for socket filters.
That way, we have BPF_PROG_TYPE_SCHED_CLS look like its own type, thus
one day if there's a good reason to diverge the set of helper functions
from the set available to socket filters, we keep ABI compatibility.
In future, we could place all bpf_prog_type_list at a central place,
perhaps.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-03-01 18:31:46 +07:00
|
|
|
BPF_PROG_TYPE_SCHED_CLS,
|
2015-03-20 21:11:11 +07:00
|
|
|
BPF_PROG_TYPE_SCHED_ACT,
|
2014-09-26 14:17:00 +07:00
|
|
|
};
|
|
|
|
|
2015-03-01 18:31:43 +07:00
|
|
|
#define BPF_PSEUDO_MAP_FD 1
|
|
|
|
|
bpf: add 'flags' attribute to BPF_MAP_UPDATE_ELEM command
the current meaning of BPF_MAP_UPDATE_ELEM syscall command is:
either update existing map element or create a new one.
Initially the plan was to add a new command to handle the case of
'create new element if it didn't exist', but 'flags' style looks
cleaner and overall diff is much smaller (more code reused), so add 'flags'
attribute to BPF_MAP_UPDATE_ELEM command with the following meaning:
#define BPF_ANY 0 /* create new element or update existing */
#define BPF_NOEXIST 1 /* create new element if it didn't exist */
#define BPF_EXIST 2 /* update existing element */
bpf_update_elem(fd, key, value, BPF_NOEXIST) call can fail with EEXIST
if element already exists.
bpf_update_elem(fd, key, value, BPF_EXIST) can fail with ENOENT
if element doesn't exist.
Userspace will call it as:
int bpf_update_elem(int fd, void *key, void *value, __u64 flags)
{
union bpf_attr attr = {
.map_fd = fd,
.key = ptr_to_u64(key),
.value = ptr_to_u64(value),
.flags = flags,
};
return bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
}
First two bits of 'flags' are used to encode style of bpf_update_elem() command.
Bits 2-63 are reserved for future use.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-11-14 08:36:44 +07:00
|
|
|
/* flags for BPF_MAP_UPDATE_ELEM command */
|
|
|
|
#define BPF_ANY 0 /* create new element or update existing */
|
|
|
|
#define BPF_NOEXIST 1 /* create new element if it didn't exist */
|
|
|
|
#define BPF_EXIST 2 /* update existing element */
|
|
|
|
|
2014-09-26 14:16:57 +07:00
|
|
|
union bpf_attr {
|
|
|
|
struct { /* anonymous struct used by BPF_MAP_CREATE command */
|
|
|
|
__u32 map_type; /* one of enum bpf_map_type */
|
|
|
|
__u32 key_size; /* size of key in bytes */
|
|
|
|
__u32 value_size; /* size of value in bytes */
|
|
|
|
__u32 max_entries; /* max number of entries in a map */
|
|
|
|
};
|
bpf: add lookup/update/delete/iterate methods to BPF maps
'maps' is a generic storage of different types for sharing data between kernel
and userspace.
The maps are accessed from user space via BPF syscall, which has commands:
- create a map with given type and attributes
fd = bpf(BPF_MAP_CREATE, union bpf_attr *attr, u32 size)
returns fd or negative error
- lookup key in a given map referenced by fd
err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size)
using attr->map_fd, attr->key, attr->value
returns zero and stores found elem into value or negative error
- create or update key/value pair in a given map
err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size)
using attr->map_fd, attr->key, attr->value
returns zero or negative error
- find and delete element by key in a given map
err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size)
using attr->map_fd, attr->key
- iterate map elements (based on input key return next_key)
err = bpf(BPF_MAP_GET_NEXT_KEY, union bpf_attr *attr, u32 size)
using attr->map_fd, attr->key, attr->next_key
- close(fd) deletes the map
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-09-26 14:16:59 +07:00
|
|
|
|
|
|
|
struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
|
|
|
|
__u32 map_fd;
|
|
|
|
__aligned_u64 key;
|
|
|
|
union {
|
|
|
|
__aligned_u64 value;
|
|
|
|
__aligned_u64 next_key;
|
|
|
|
};
|
bpf: add 'flags' attribute to BPF_MAP_UPDATE_ELEM command
the current meaning of BPF_MAP_UPDATE_ELEM syscall command is:
either update existing map element or create a new one.
Initially the plan was to add a new command to handle the case of
'create new element if it didn't exist', but 'flags' style looks
cleaner and overall diff is much smaller (more code reused), so add 'flags'
attribute to BPF_MAP_UPDATE_ELEM command with the following meaning:
#define BPF_ANY 0 /* create new element or update existing */
#define BPF_NOEXIST 1 /* create new element if it didn't exist */
#define BPF_EXIST 2 /* update existing element */
bpf_update_elem(fd, key, value, BPF_NOEXIST) call can fail with EEXIST
if element already exists.
bpf_update_elem(fd, key, value, BPF_EXIST) can fail with ENOENT
if element doesn't exist.
Userspace will call it as:
int bpf_update_elem(int fd, void *key, void *value, __u64 flags)
{
union bpf_attr attr = {
.map_fd = fd,
.key = ptr_to_u64(key),
.value = ptr_to_u64(value),
.flags = flags,
};
return bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
}
First two bits of 'flags' are used to encode style of bpf_update_elem() command.
Bits 2-63 are reserved for future use.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-11-14 08:36:44 +07:00
|
|
|
__u64 flags;
|
bpf: add lookup/update/delete/iterate methods to BPF maps
'maps' is a generic storage of different types for sharing data between kernel
and userspace.
The maps are accessed from user space via BPF syscall, which has commands:
- create a map with given type and attributes
fd = bpf(BPF_MAP_CREATE, union bpf_attr *attr, u32 size)
returns fd or negative error
- lookup key in a given map referenced by fd
err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size)
using attr->map_fd, attr->key, attr->value
returns zero and stores found elem into value or negative error
- create or update key/value pair in a given map
err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size)
using attr->map_fd, attr->key, attr->value
returns zero or negative error
- find and delete element by key in a given map
err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size)
using attr->map_fd, attr->key
- iterate map elements (based on input key return next_key)
err = bpf(BPF_MAP_GET_NEXT_KEY, union bpf_attr *attr, u32 size)
using attr->map_fd, attr->key, attr->next_key
- close(fd) deletes the map
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-09-26 14:16:59 +07:00
|
|
|
};
|
2014-09-26 14:17:00 +07:00
|
|
|
|
|
|
|
struct { /* anonymous struct used by BPF_PROG_LOAD command */
|
|
|
|
__u32 prog_type; /* one of enum bpf_prog_type */
|
|
|
|
__u32 insn_cnt;
|
|
|
|
__aligned_u64 insns;
|
|
|
|
__aligned_u64 license;
|
bpf: verifier (add ability to receive verification log)
add optional attributes for BPF_PROG_LOAD syscall:
union bpf_attr {
struct {
...
__u32 log_level; /* verbosity level of eBPF verifier */
__u32 log_size; /* size of user buffer */
__aligned_u64 log_buf; /* user supplied 'char *buffer' */
};
};
when log_level > 0 the verifier will return its verification log in the user
supplied buffer 'log_buf' which can be used by program author to analyze why
verifier rejected given program.
'Understanding eBPF verifier messages' section of Documentation/networking/filter.txt
provides several examples of these messages, like the program:
BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
BPF_LD_MAP_FD(BPF_REG_1, 0),
BPF_CALL_FUNC(BPF_FUNC_map_lookup_elem),
BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
BPF_ST_MEM(BPF_DW, BPF_REG_0, 4, 0),
BPF_EXIT_INSN(),
will be rejected with the following multi-line message in log_buf:
0: (7a) *(u64 *)(r10 -8) = 0
1: (bf) r2 = r10
2: (07) r2 += -8
3: (b7) r1 = 0
4: (85) call 1
5: (15) if r0 == 0x0 goto pc+1
R0=map_ptr R10=fp
6: (7a) *(u64 *)(r0 +4) = 0
misaligned access off 4 size 8
The format of the output can change at any time as verifier evolves.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-09-26 14:17:03 +07:00
|
|
|
__u32 log_level; /* verbosity level of verifier */
|
|
|
|
__u32 log_size; /* size of user buffer */
|
|
|
|
__aligned_u64 log_buf; /* user supplied buffer */
|
tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
struct perf_event_attr attr = {
.type = PERF_TYPE_TRACEPOINT,
.config = event_id,
...
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wrapper of probe_kernel_read() used to access any
kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-26 02:49:20 +07:00
|
|
|
__u32 kern_version; /* checked when prog_type=kprobe */
|
2014-09-26 14:17:00 +07:00
|
|
|
};
|
2014-09-26 14:16:57 +07:00
|
|
|
} __attribute__((aligned(8)));
|
|
|
|
|
2014-09-26 14:17:00 +07:00
|
|
|
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
|
|
|
|
* function eBPF program intends to call
|
|
|
|
*/
|
|
|
|
enum bpf_func_id {
|
|
|
|
BPF_FUNC_unspec,
|
2014-11-14 08:36:49 +07:00
|
|
|
BPF_FUNC_map_lookup_elem, /* void *map_lookup_elem(&map, &key) */
|
|
|
|
BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */
|
|
|
|
BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */
|
tracing, perf: Implement BPF programs attached to kprobes
BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.
The user interface is to attach to a kprobe via the perf syscall:
struct perf_event_attr attr = {
.type = PERF_TYPE_TRACEPOINT,
.config = event_id,
...
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
'prog_fd' is a file descriptor associated with BPF program
previously loaded.
'event_id' is an ID of the kprobe created.
Closing 'event_fd':
close(event_fd);
... automatically detaches BPF program from it.
BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wrapper of probe_kernel_read() used to access any
kernel data structures
BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.
Note, kprobes are fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-03-26 02:49:20 +07:00
|
|
|
BPF_FUNC_probe_read, /* int bpf_probe_read(void *dst, int size, void *src) */
|
2015-03-26 02:49:21 +07:00
|
|
|
BPF_FUNC_ktime_get_ns, /* u64 bpf_ktime_get_ns(void) */
|
2015-03-26 02:49:22 +07:00
|
|
|
BPF_FUNC_trace_printk, /* int bpf_trace_printk(const char *fmt, int fmt_size, ...) */
|
2015-03-14 08:27:16 +07:00
|
|
|
BPF_FUNC_get_prandom_u32, /* u32 prandom_u32(void) */
|
2015-03-14 08:27:17 +07:00
|
|
|
BPF_FUNC_get_smp_processor_id, /* u32 raw_smp_processor_id(void) */
|
2015-04-02 07:12:13 +07:00
|
|
|
|
|
|
|
/**
|
|
|
|
* skb_store_bytes(skb, offset, from, len, flags) - store bytes into packet
|
|
|
|
* @skb: pointer to skb
|
2015-04-16 02:55:45 +07:00
|
|
|
* @offset: offset within packet from skb->mac_header
|
2015-04-02 07:12:13 +07:00
|
|
|
* @from: pointer where to copy bytes from
|
|
|
|
* @len: number of bytes to store into packet
|
|
|
|
* @flags: bit 0 - if true, recompute skb->csum
|
|
|
|
* other bits - reserved
|
|
|
|
* Return: 0 on success
|
|
|
|
*/
|
|
|
|
BPF_FUNC_skb_store_bytes,
|
|
|
|
|
|
|
|
/**
|
|
|
|
* l3_csum_replace(skb, offset, from, to, flags) - recompute IP checksum
|
|
|
|
* @skb: pointer to skb
|
|
|
|
* @offset: offset within packet where IP checksum is located
|
|
|
|
* @from: old value of header field
|
|
|
|
* @to: new value of header field
|
|
|
|
* @flags: bits 0-3 - size of header field
|
|
|
|
* other bits - reserved
|
|
|
|
* Return: 0 on success
|
|
|
|
*/
|
|
|
|
BPF_FUNC_l3_csum_replace,
|
|
|
|
|
|
|
|
/**
|
|
|
|
* l4_csum_replace(skb, offset, from, to, flags) - recompute TCP/UDP checksum
|
|
|
|
* @skb: pointer to skb
|
|
|
|
* @offset: offset within packet where TCP/UDP checksum is located
|
|
|
|
* @from: old value of header field
|
|
|
|
* @to: new value of header field
|
|
|
|
* @flags: bits 0-3 - size of header field
|
|
|
|
* bit 4 - is pseudo header
|
|
|
|
* other bits - reserved
|
|
|
|
* Return: 0 on success
|
|
|
|
*/
|
|
|
|
BPF_FUNC_l4_csum_replace,
|
bpf: allow bpf programs to tail-call other bpf programs
introduce bpf_tail_call(ctx, &jmp_table, index) helper function
which can be used from BPF programs like:
int bpf_prog(struct pt_regs *ctx)
{
...
bpf_tail_call(ctx, &jmp_table, index);
...
}
that is roughly equivalent to:
int bpf_prog(struct pt_regs *ctx)
{
...
if (jmp_table[index])
return (*jmp_table[index])(ctx);
...
}
The important detail is that it's not a normal call, but a tail call.
The kernel stack is precious, so this helper reuses the current
stack frame and jumps into another BPF program without adding
extra call frame.
It's trivially done in interpreter and a bit trickier in JITs.
In case of x64 JIT the bigger part of generated assembler prologue
is common for all programs, so it is simply skipped while jumping.
Other JITs can do similar prologue-skipping optimization or
do stack unwind before jumping into the next program.
bpf_tail_call() arguments:
ctx - context pointer
jmp_table - one of BPF_MAP_TYPE_PROG_ARRAY maps used as the jump table
index - index in the jump table
Since all BPF programs are identified by file descriptor, user space
needs to populate the jmp_table with FDs of other BPF programs.
If jmp_table[index] is empty the bpf_tail_call() doesn't jump anywhere
and program execution continues as normal.
New BPF_MAP_TYPE_PROG_ARRAY map type is introduced so that user space can
populate this jmp_table array with FDs of other bpf programs.
Programs can share the same jmp_table array or use multiple jmp_tables.
The chain of tail calls can form unpredictable dynamic loops therefore
tail_call_cnt is used to limit the number of calls and currently is set to 32.
Use cases:
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
==========
- simplify complex programs by splitting them into a sequence of small programs
- dispatch routine
For tracing and future seccomp the program may be triggered on all system
calls, but processing of syscall arguments will be different. It's more
efficient to implement them as:
int syscall_entry(struct seccomp_data *ctx)
{
bpf_tail_call(ctx, &syscall_jmp_table, ctx->nr /* syscall number */);
... default: process unknown syscall ...
}
int sys_write_event(struct seccomp_data *ctx) {...}
int sys_read_event(struct seccomp_data *ctx) {...}
syscall_jmp_table[__NR_write] = sys_write_event;
syscall_jmp_table[__NR_read] = sys_read_event;
For networking the program may call into different parsers depending on
packet format, like:
int packet_parser(struct __sk_buff *skb)
{
... parse L2, L3 here ...
__u8 ipproto = load_byte(skb, ... offsetof(struct iphdr, protocol));
bpf_tail_call(skb, &ipproto_jmp_table, ipproto);
... default: process unknown protocol ...
}
int parse_tcp(struct __sk_buff *skb) {...}
int parse_udp(struct __sk_buff *skb) {...}
ipproto_jmp_table[IPPROTO_TCP] = parse_tcp;
ipproto_jmp_table[IPPROTO_UDP] = parse_udp;
- for the TC use case, bpf_tail_call() allows implementing reclassify-like logic
- bpf_map_update_elem/delete calls into BPF_MAP_TYPE_PROG_ARRAY jump table
are atomic, so user space can build chains of BPF programs on the fly
Implementation details:
=======================
- high performance of bpf_tail_call() is the goal.
It could have been implemented without JIT changes as a wrapper on top of
BPF_PROG_RUN() macro, but with two downsides:
. all programs would have to pay performance penalty for this feature and
tail call itself would be slower, since mandatory stack unwind, return,
stack allocate would be done for every tailcall.
. tailcall would be limited to programs running preempt_disabled, since
generic 'void *ctx' doesn't have room for 'tail_call_cnt' and it would
need to be either global per_cpu variable accessed by helper and by wrapper
or global variable protected by locks.
In this implementation x64 JIT bypasses stack unwind and jumps into the
callee program after prologue.
- bpf_prog_array_compatible() ensures that prog_type of callee and caller
are the same and JITed/non-JITed flag is the same, since calling JITed
program from non-JITed is invalid, since stack frames are different.
Similarly calling kprobe type program from socket type program is invalid.
- jump table is implemented as BPF_MAP_TYPE_PROG_ARRAY to reuse 'map'
abstraction, its user space API and all of verifier logic.
It's in the existing arraymap.c file, since several functions are
shared with regular array map.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-20 06:59:03 +07:00
|
|
|
|
|
|
|
/**
|
|
|
|
* bpf_tail_call(ctx, prog_array_map, index) - jump into another BPF program
|
|
|
|
* @ctx: context pointer passed to next program
|
|
|
|
* @prog_array_map: pointer to a map whose type is BPF_MAP_TYPE_PROG_ARRAY
|
|
|
|
* @index: index inside array that selects specific program to run
|
|
|
|
* Return: 0 on success
|
|
|
|
*/
|
|
|
|
BPF_FUNC_tail_call,
|
2015-06-03 06:03:14 +07:00
|
|
|
|
|
|
|
/**
|
|
|
|
* bpf_clone_redirect(skb, ifindex, flags) - redirect to another netdev
|
|
|
|
* @skb: pointer to skb
|
|
|
|
* @ifindex: ifindex of the net device
|
|
|
|
* @flags: bit 0 - if set, redirect to ingress instead of egress
|
|
|
|
* other bits - reserved
|
|
|
|
* Return: 0 on success
|
|
|
|
*/
|
|
|
|
BPF_FUNC_clone_redirect,
|
2015-06-13 09:39:12 +07:00
|
|
|
|
|
|
|
/**
|
|
|
|
* u64 bpf_get_current_pid_tgid(void)
|
|
|
|
* Return: current->tgid << 32 | current->pid
|
|
|
|
*/
|
|
|
|
BPF_FUNC_get_current_pid_tgid,
|
|
|
|
|
|
|
|
/**
|
|
|
|
* u64 bpf_get_current_uid_gid(void)
|
|
|
|
* Return: current_gid << 32 | current_uid
|
|
|
|
*/
|
|
|
|
BPF_FUNC_get_current_uid_gid,
|
|
|
|
|
|
|
|
/**
|
|
|
|
* bpf_get_current_comm(char *buf, int size_of_buf)
|
|
|
|
* stores current->comm into buf
|
|
|
|
* Return: 0 on success
|
|
|
|
*/
|
|
|
|
BPF_FUNC_get_current_comm,
|
2014-09-26 14:17:00 +07:00
|
|
|
__BPF_FUNC_MAX_ID,
|
|
|
|
};
|
|
|
|
|
2015-03-14 01:57:42 +07:00
|
|
|
/* User-accessible mirror of the in-kernel sk_buff.
|
|
|
|
* New fields can only be added to the end of this structure, so the
* position of every existing member stays fixed.
* Every member is a __u32; cb is an array of five __u32 words.
* NOTE(review): member names suggest they track the like-named sk_buff
* state, but the mapping is not visible here - confirm against the
* kernel-side code that translates accesses to this structure.
|
|
|
|
*/
|
|
|
|
struct __sk_buff {
|
|
|
|
__u32 len;
|
|
|
|
__u32 pkt_type;
|
|
|
|
__u32 mark;
|
|
|
|
__u32 queue_mapping;
|
2015-03-17 08:06:02 +07:00
|
|
|
__u32 protocol;
|
|
|
|
__u32 vlan_present;
|
|
|
|
__u32 vlan_tci;
|
2015-03-24 20:48:41 +07:00
|
|
|
__u32 vlan_proto;
|
2015-04-04 01:52:24 +07:00
|
|
|
__u32 priority;
|
2015-05-28 05:30:39 +07:00
|
|
|
__u32 ingress_ifindex;
|
|
|
|
__u32 ifindex;
|
2015-06-05 00:11:54 +07:00
|
|
|
__u32 tc_index;
|
|
|
|
__u32 cb[5]; /* five 32-bit words */
|
2015-03-14 01:57:42 +07:00
|
|
|
};
|
|
|
|
|
2014-09-05 12:17:18 +07:00
|
|
|
#endif /* _UAPI__LINUX_BPF_H__ */
|