io_uring: add a few test tools

This adds two test programs in tools/io_uring/ that demonstrate both
the raw io_uring API (and all features) through a small benchmark
app, io_uring-bench, and the liburing exposed API in a simplified
cp(1) implementation through io_uring-cp.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
This commit is contained in:
Jens Axboe 2019-03-06 09:03:50 -07:00
parent 31b5151064
commit 21b4aa5d20
9 changed files with 1380 additions and 0 deletions

18
tools/io_uring/Makefile Normal file
View File

@ -0,0 +1,18 @@
# SPDX-License-Identifier: GPL-2.0
# Makefile for io_uring test tools
CFLAGS += -Wall -Wextra -g -D_GNU_SOURCE
LDLIBS += -lpthread
all: io_uring-cp io_uring-bench
%: %.c
$(CC) $(CFLAGS) -o $@ $^
io_uring-bench: syscall.o io_uring-bench.o
$(CC) $(CFLAGS) $(LDLIBS) -o $@ $^
io_uring-cp: setup.o syscall.o queue.o
clean:
$(RM) io_uring-cp io_uring-bench *.o
.PHONY: all clean

29
tools/io_uring/README Normal file
View File

@ -0,0 +1,29 @@
This directory includes a few programs that demonstrate how to use io_uring
in an application. The examples are:
io_uring-cp
A very basic io_uring implementation of cp(1). It takes two
arguments, copies the first argument to the second. This example
is part of liburing, and hence uses the simplified liburing API
for setting up an io_uring instance, submitting IO, completing IO,
etc. The support functions in queue.c and setup.c are straight
out of liburing.
io_uring-bench
Benchmark program that does random reads on a number of files. This
app demonstrates the various features of io_uring, like fixed files,
fixed buffers, and polled IO. There are options in the program to
control which features to use. Arguments is the file (or files) that
io_uring-bench should operate on. This uses the raw io_uring
interface.
liburing can be cloned with git here:
git://git.kernel.dk/liburing
and contains a number of unit tests as well for testing io_uring. It also
comes with man pages for the three system calls.
Fio includes an io_uring engine, you can clone fio here:
git://git.kernel.dk/fio

16
tools/io_uring/barrier.h Normal file
View File

@ -0,0 +1,16 @@
#ifndef LIBURING_BARRIER_H
#define LIBURING_BARRIER_H
#if defined(__x86_64) || defined(__i386__)
#define read_barrier() __asm__ __volatile__("":::"memory")
#define write_barrier() __asm__ __volatile__("":::"memory")
#else
/*
* Add arch appropriate definitions. Be safe and use full barriers for
* archs we don't have support for.
*/
#define read_barrier() __sync_synchronize()
#define write_barrier() __sync_synchronize()
#endif
#endif

View File

@ -0,0 +1,616 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Simple benchmark program that uses the various features of io_uring
* to provide fast random access to a device/file. It has various
* options that are control how we use io_uring, see the OPTIONS section
* below. This uses the raw io_uring interface.
*
* Copyright (C) 2018-2019 Jens Axboe
*/
#include <stdio.h>
#include <errno.h>
#include <assert.h>
#include <stdlib.h>
#include <stddef.h>
#include <signal.h>
#include <inttypes.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <sys/resource.h>
#include <sys/mman.h>
#include <sys/uio.h>
#include <linux/fs.h>
#include <fcntl.h>
#include <unistd.h>
#include <string.h>
#include <pthread.h>
#include <sched.h>
#include "liburing.h"
#include "barrier.h"
#ifndef IOCQE_FLAG_CACHEHIT
#define IOCQE_FLAG_CACHEHIT (1U << 0)
#endif
#define min(a, b) ((a < b) ? (a) : (b))
struct io_sq_ring {
unsigned *head;
unsigned *tail;
unsigned *ring_mask;
unsigned *ring_entries;
unsigned *flags;
unsigned *array;
};
struct io_cq_ring {
unsigned *head;
unsigned *tail;
unsigned *ring_mask;
unsigned *ring_entries;
struct io_uring_cqe *cqes;
};
#define DEPTH 128
#define BATCH_SUBMIT 32
#define BATCH_COMPLETE 32
#define BS 4096
#define MAX_FDS 16
static unsigned sq_ring_mask, cq_ring_mask;
struct file {
unsigned long max_blocks;
unsigned pending_ios;
int real_fd;
int fixed_fd;
};
struct submitter {
pthread_t thread;
int ring_fd;
struct drand48_data rand;
struct io_sq_ring sq_ring;
struct io_uring_sqe *sqes;
struct iovec iovecs[DEPTH];
struct io_cq_ring cq_ring;
int inflight;
unsigned long reaps;
unsigned long done;
unsigned long calls;
unsigned long cachehit, cachemiss;
volatile int finish;
__s32 *fds;
struct file files[MAX_FDS];
unsigned nr_files;
unsigned cur_file;
};
static struct submitter submitters[1];
static volatile int finish;
/*
* OPTIONS: Set these to test the various features of io_uring.
*/
static int polled = 1; /* use IO polling */
static int fixedbufs = 1; /* use fixed user buffers */
static int register_files = 1; /* use fixed files */
static int buffered = 0; /* use buffered IO, not O_DIRECT */
static int sq_thread_poll = 0; /* use kernel submission/poller thread */
static int sq_thread_cpu = -1; /* pin above thread to this CPU */
static int do_nop = 0; /* no-op SQ ring commands */
static int io_uring_register_buffers(struct submitter *s)
{
if (do_nop)
return 0;
return io_uring_register(s->ring_fd, IORING_REGISTER_BUFFERS, s->iovecs,
DEPTH);
}
static int io_uring_register_files(struct submitter *s)
{
unsigned i;
if (do_nop)
return 0;
s->fds = calloc(s->nr_files, sizeof(__s32));
for (i = 0; i < s->nr_files; i++) {
s->fds[i] = s->files[i].real_fd;
s->files[i].fixed_fd = i;
}
return io_uring_register(s->ring_fd, IORING_REGISTER_FILES, s->fds,
s->nr_files);
}
static int gettid(void)
{
return syscall(__NR_gettid);
}
static unsigned file_depth(struct submitter *s)
{
return (DEPTH + s->nr_files - 1) / s->nr_files;
}
static void init_io(struct submitter *s, unsigned index)
{
struct io_uring_sqe *sqe = &s->sqes[index];
unsigned long offset;
struct file *f;
long r;
if (do_nop) {
sqe->opcode = IORING_OP_NOP;
return;
}
if (s->nr_files == 1) {
f = &s->files[0];
} else {
f = &s->files[s->cur_file];
if (f->pending_ios >= file_depth(s)) {
s->cur_file++;
if (s->cur_file == s->nr_files)
s->cur_file = 0;
f = &s->files[s->cur_file];
}
}
f->pending_ios++;
lrand48_r(&s->rand, &r);
offset = (r % (f->max_blocks - 1)) * BS;
if (register_files) {
sqe->flags = IOSQE_FIXED_FILE;
sqe->fd = f->fixed_fd;
} else {
sqe->flags = 0;
sqe->fd = f->real_fd;
}
if (fixedbufs) {
sqe->opcode = IORING_OP_READ_FIXED;
sqe->addr = (unsigned long) s->iovecs[index].iov_base;
sqe->len = BS;
sqe->buf_index = index;
} else {
sqe->opcode = IORING_OP_READV;
sqe->addr = (unsigned long) &s->iovecs[index];
sqe->len = 1;
sqe->buf_index = 0;
}
sqe->ioprio = 0;
sqe->off = offset;
sqe->user_data = (unsigned long) f;
}
static int prep_more_ios(struct submitter *s, unsigned max_ios)
{
struct io_sq_ring *ring = &s->sq_ring;
unsigned index, tail, next_tail, prepped = 0;
next_tail = tail = *ring->tail;
do {
next_tail++;
read_barrier();
if (next_tail == *ring->head)
break;
index = tail & sq_ring_mask;
init_io(s, index);
ring->array[index] = index;
prepped++;
tail = next_tail;
} while (prepped < max_ios);
if (*ring->tail != tail) {
/* order tail store with writes to sqes above */
write_barrier();
*ring->tail = tail;
write_barrier();
}
return prepped;
}
static int get_file_size(struct file *f)
{
struct stat st;
if (fstat(f->real_fd, &st) < 0)
return -1;
if (S_ISBLK(st.st_mode)) {
unsigned long long bytes;
if (ioctl(f->real_fd, BLKGETSIZE64, &bytes) != 0)
return -1;
f->max_blocks = bytes / BS;
return 0;
} else if (S_ISREG(st.st_mode)) {
f->max_blocks = st.st_size / BS;
return 0;
}
return -1;
}
static int reap_events(struct submitter *s)
{
struct io_cq_ring *ring = &s->cq_ring;
struct io_uring_cqe *cqe;
unsigned head, reaped = 0;
head = *ring->head;
do {
struct file *f;
read_barrier();
if (head == *ring->tail)
break;
cqe = &ring->cqes[head & cq_ring_mask];
if (!do_nop) {
f = (struct file *) (uintptr_t) cqe->user_data;
f->pending_ios--;
if (cqe->res != BS) {
printf("io: unexpected ret=%d\n", cqe->res);
if (polled && cqe->res == -EOPNOTSUPP)
printf("Your filesystem doesn't support poll\n");
return -1;
}
}
if (cqe->flags & IOCQE_FLAG_CACHEHIT)
s->cachehit++;
else
s->cachemiss++;
reaped++;
head++;
} while (1);
s->inflight -= reaped;
*ring->head = head;
write_barrier();
return reaped;
}
static void *submitter_fn(void *data)
{
struct submitter *s = data;
struct io_sq_ring *ring = &s->sq_ring;
int ret, prepped;
printf("submitter=%d\n", gettid());
srand48_r(pthread_self(), &s->rand);
prepped = 0;
do {
int to_wait, to_submit, this_reap, to_prep;
if (!prepped && s->inflight < DEPTH) {
to_prep = min(DEPTH - s->inflight, BATCH_SUBMIT);
prepped = prep_more_ios(s, to_prep);
}
s->inflight += prepped;
submit_more:
to_submit = prepped;
submit:
if (to_submit && (s->inflight + to_submit <= DEPTH))
to_wait = 0;
else
to_wait = min(s->inflight + to_submit, BATCH_COMPLETE);
/*
* Only need to call io_uring_enter if we're not using SQ thread
* poll, or if IORING_SQ_NEED_WAKEUP is set.
*/
if (!sq_thread_poll || (*ring->flags & IORING_SQ_NEED_WAKEUP)) {
unsigned flags = 0;
if (to_wait)
flags = IORING_ENTER_GETEVENTS;
if ((*ring->flags & IORING_SQ_NEED_WAKEUP))
flags |= IORING_ENTER_SQ_WAKEUP;
ret = io_uring_enter(s->ring_fd, to_submit, to_wait,
flags, NULL);
s->calls++;
}
/*
* For non SQ thread poll, we already got the events we needed
* through the io_uring_enter() above. For SQ thread poll, we
* need to loop here until we find enough events.
*/
this_reap = 0;
do {
int r;
r = reap_events(s);
if (r == -1) {
s->finish = 1;
break;
} else if (r > 0)
this_reap += r;
} while (sq_thread_poll && this_reap < to_wait);
s->reaps += this_reap;
if (ret >= 0) {
if (!ret) {
to_submit = 0;
if (s->inflight)
goto submit;
continue;
} else if (ret < to_submit) {
int diff = to_submit - ret;
s->done += ret;
prepped -= diff;
goto submit_more;
}
s->done += ret;
prepped = 0;
continue;
} else if (ret < 0) {
if (errno == EAGAIN) {
if (s->finish)
break;
if (this_reap)
goto submit;
to_submit = 0;
goto submit;
}
printf("io_submit: %s\n", strerror(errno));
break;
}
} while (!s->finish);
finish = 1;
return NULL;
}
static void sig_int(int sig)
{
printf("Exiting on signal %d\n", sig);
submitters[0].finish = 1;
finish = 1;
}
static void arm_sig_int(void)
{
struct sigaction act;
memset(&act, 0, sizeof(act));
act.sa_handler = sig_int;
act.sa_flags = SA_RESTART;
sigaction(SIGINT, &act, NULL);
}
static int setup_ring(struct submitter *s)
{
struct io_sq_ring *sring = &s->sq_ring;
struct io_cq_ring *cring = &s->cq_ring;
struct io_uring_params p;
int ret, fd;
void *ptr;
memset(&p, 0, sizeof(p));
if (polled && !do_nop)
p.flags |= IORING_SETUP_IOPOLL;
if (sq_thread_poll) {
p.flags |= IORING_SETUP_SQPOLL;
if (sq_thread_cpu != -1) {
p.flags |= IORING_SETUP_SQ_AFF;
p.sq_thread_cpu = sq_thread_cpu;
}
}
fd = io_uring_setup(DEPTH, &p);
if (fd < 0) {
perror("io_uring_setup");
return 1;
}
s->ring_fd = fd;
if (fixedbufs) {
ret = io_uring_register_buffers(s);
if (ret < 0) {
perror("io_uring_register_buffers");
return 1;
}
}
if (register_files) {
ret = io_uring_register_files(s);
if (ret < 0) {
perror("io_uring_register_files");
return 1;
}
}
ptr = mmap(0, p.sq_off.array + p.sq_entries * sizeof(__u32),
PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
IORING_OFF_SQ_RING);
printf("sq_ring ptr = 0x%p\n", ptr);
sring->head = ptr + p.sq_off.head;
sring->tail = ptr + p.sq_off.tail;
sring->ring_mask = ptr + p.sq_off.ring_mask;
sring->ring_entries = ptr + p.sq_off.ring_entries;
sring->flags = ptr + p.sq_off.flags;
sring->array = ptr + p.sq_off.array;
sq_ring_mask = *sring->ring_mask;
s->sqes = mmap(0, p.sq_entries * sizeof(struct io_uring_sqe),
PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
IORING_OFF_SQES);
printf("sqes ptr = 0x%p\n", s->sqes);
ptr = mmap(0, p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe),
PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
IORING_OFF_CQ_RING);
printf("cq_ring ptr = 0x%p\n", ptr);
cring->head = ptr + p.cq_off.head;
cring->tail = ptr + p.cq_off.tail;
cring->ring_mask = ptr + p.cq_off.ring_mask;
cring->ring_entries = ptr + p.cq_off.ring_entries;
cring->cqes = ptr + p.cq_off.cqes;
cq_ring_mask = *cring->ring_mask;
return 0;
}
static void file_depths(char *buf)
{
struct submitter *s = &submitters[0];
unsigned i;
char *p;
buf[0] = '\0';
p = buf;
for (i = 0; i < s->nr_files; i++) {
struct file *f = &s->files[i];
if (i + 1 == s->nr_files)
p += sprintf(p, "%d", f->pending_ios);
else
p += sprintf(p, "%d, ", f->pending_ios);
}
}
int main(int argc, char *argv[])
{
struct submitter *s = &submitters[0];
unsigned long done, calls, reap, cache_hit, cache_miss;
int err, i, flags, fd;
char *fdepths;
void *ret;
if (!do_nop && argc < 2) {
printf("%s: filename\n", argv[0]);
return 1;
}
flags = O_RDONLY | O_NOATIME;
if (!buffered)
flags |= O_DIRECT;
i = 1;
while (!do_nop && i < argc) {
struct file *f;
if (s->nr_files == MAX_FDS) {
printf("Max number of files (%d) reached\n", MAX_FDS);
break;
}
fd = open(argv[i], flags);
if (fd < 0) {
perror("open");
return 1;
}
f = &s->files[s->nr_files];
f->real_fd = fd;
if (get_file_size(f)) {
printf("failed getting size of device/file\n");
return 1;
}
if (f->max_blocks <= 1) {
printf("Zero file/device size?\n");
return 1;
}
f->max_blocks--;
printf("Added file %s\n", argv[i]);
s->nr_files++;
i++;
}
if (fixedbufs) {
struct rlimit rlim;
rlim.rlim_cur = RLIM_INFINITY;
rlim.rlim_max = RLIM_INFINITY;
if (setrlimit(RLIMIT_MEMLOCK, &rlim) < 0) {
perror("setrlimit");
return 1;
}
}
arm_sig_int();
for (i = 0; i < DEPTH; i++) {
void *buf;
if (posix_memalign(&buf, BS, BS)) {
printf("failed alloc\n");
return 1;
}
s->iovecs[i].iov_base = buf;
s->iovecs[i].iov_len = BS;
}
err = setup_ring(s);
if (err) {
printf("ring setup failed: %s, %d\n", strerror(errno), err);
return 1;
}
printf("polled=%d, fixedbufs=%d, buffered=%d", polled, fixedbufs, buffered);
printf(" QD=%d, sq_ring=%d, cq_ring=%d\n", DEPTH, *s->sq_ring.ring_entries, *s->cq_ring.ring_entries);
pthread_create(&s->thread, NULL, submitter_fn, s);
fdepths = malloc(8 * s->nr_files);
cache_hit = cache_miss = reap = calls = done = 0;
do {
unsigned long this_done = 0;
unsigned long this_reap = 0;
unsigned long this_call = 0;
unsigned long this_cache_hit = 0;
unsigned long this_cache_miss = 0;
unsigned long rpc = 0, ipc = 0;
double hit = 0.0;
sleep(1);
this_done += s->done;
this_call += s->calls;
this_reap += s->reaps;
this_cache_hit += s->cachehit;
this_cache_miss += s->cachemiss;
if (this_cache_hit && this_cache_miss) {
unsigned long hits, total;
hits = this_cache_hit - cache_hit;
total = hits + this_cache_miss - cache_miss;
hit = (double) hits / (double) total;
hit *= 100.0;
}
if (this_call - calls) {
rpc = (this_done - done) / (this_call - calls);
ipc = (this_reap - reap) / (this_call - calls);
} else
rpc = ipc = -1;
file_depths(fdepths);
printf("IOPS=%lu, IOS/call=%ld/%ld, inflight=%u (%s), Cachehit=%0.2f%%\n",
this_done - done, rpc, ipc, s->inflight,
fdepths, hit);
done = this_done;
calls = this_call;
reap = this_reap;
cache_hit = s->cachehit;
cache_miss = s->cachemiss;
} while (!finish);
pthread_join(s->thread, &ret);
close(s->ring_fd);
free(fdepths);
return 0;
}

View File

@ -0,0 +1,251 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Simple test program that demonstrates a file copy through io_uring. This
* uses the API exposed by liburing.
*
* Copyright (C) 2018-2019 Jens Axboe
*/
#include <stdio.h>
#include <fcntl.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <assert.h>
#include <errno.h>
#include <inttypes.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include "liburing.h"
#define QD 64
#define BS (32*1024)
static int infd, outfd;
struct io_data {
int read;
off_t first_offset, offset;
size_t first_len;
struct iovec iov;
};
static int setup_context(unsigned entries, struct io_uring *ring)
{
int ret;
ret = io_uring_queue_init(entries, ring, 0);
if (ret < 0) {
fprintf(stderr, "queue_init: %s\n", strerror(-ret));
return -1;
}
return 0;
}
static int get_file_size(int fd, off_t *size)
{
struct stat st;
if (fstat(fd, &st) < 0)
return -1;
if (S_ISREG(st.st_mode)) {
*size = st.st_size;
return 0;
} else if (S_ISBLK(st.st_mode)) {
unsigned long long bytes;
if (ioctl(fd, BLKGETSIZE64, &bytes) != 0)
return -1;
*size = bytes;
return 0;
}
return -1;
}
static void queue_prepped(struct io_uring *ring, struct io_data *data)
{
struct io_uring_sqe *sqe;
sqe = io_uring_get_sqe(ring);
assert(sqe);
if (data->read)
io_uring_prep_readv(sqe, infd, &data->iov, 1, data->offset);
else
io_uring_prep_writev(sqe, outfd, &data->iov, 1, data->offset);
io_uring_sqe_set_data(sqe, data);
}
static int queue_read(struct io_uring *ring, off_t size, off_t offset)
{
struct io_uring_sqe *sqe;
struct io_data *data;
sqe = io_uring_get_sqe(ring);
if (!sqe)
return 1;
data = malloc(size + sizeof(*data));
data->read = 1;
data->offset = data->first_offset = offset;
data->iov.iov_base = data + 1;
data->iov.iov_len = size;
data->first_len = size;
io_uring_prep_readv(sqe, infd, &data->iov, 1, offset);
io_uring_sqe_set_data(sqe, data);
return 0;
}
static void queue_write(struct io_uring *ring, struct io_data *data)
{
data->read = 0;
data->offset = data->first_offset;
data->iov.iov_base = data + 1;
data->iov.iov_len = data->first_len;
queue_prepped(ring, data);
io_uring_submit(ring);
}
static int copy_file(struct io_uring *ring, off_t insize)
{
unsigned long reads, writes;
struct io_uring_cqe *cqe;
off_t write_left, offset;
int ret;
write_left = insize;
writes = reads = offset = 0;
while (insize || write_left) {
unsigned long had_reads;
int got_comp;
/*
* Queue up as many reads as we can
*/
had_reads = reads;
while (insize) {
off_t this_size = insize;
if (reads + writes >= QD)
break;
if (this_size > BS)
this_size = BS;
else if (!this_size)
break;
if (queue_read(ring, this_size, offset))
break;
insize -= this_size;
offset += this_size;
reads++;
}
if (had_reads != reads) {
ret = io_uring_submit(ring);
if (ret < 0) {
fprintf(stderr, "io_uring_submit: %s\n", strerror(-ret));
break;
}
}
/*
* Queue is full at this point. Find at least one completion.
*/
got_comp = 0;
while (write_left) {
struct io_data *data;
if (!got_comp) {
ret = io_uring_wait_completion(ring, &cqe);
got_comp = 1;
} else
ret = io_uring_get_completion(ring, &cqe);
if (ret < 0) {
fprintf(stderr, "io_uring_get_completion: %s\n",
strerror(-ret));
return 1;
}
if (!cqe)
break;
data = (struct io_data *) (uintptr_t) cqe->user_data;
if (cqe->res < 0) {
if (cqe->res == -EAGAIN) {
queue_prepped(ring, data);
continue;
}
fprintf(stderr, "cqe failed: %s\n",
strerror(-cqe->res));
return 1;
} else if ((size_t) cqe->res != data->iov.iov_len) {
/* Short read/write, adjust and requeue */
data->iov.iov_base += cqe->res;
data->iov.iov_len -= cqe->res;
data->offset += cqe->res;
queue_prepped(ring, data);
continue;
}
/*
* All done. if write, nothing else to do. if read,
* queue up corresponding write.
*/
if (data->read) {
queue_write(ring, data);
write_left -= data->first_len;
reads--;
writes++;
} else {
free(data);
writes--;
}
}
}
return 0;
}
int main(int argc, char *argv[])
{
struct io_uring ring;
off_t insize;
int ret;
if (argc < 3) {
printf("%s: infile outfile\n", argv[0]);
return 1;
}
infd = open(argv[1], O_RDONLY);
if (infd < 0) {
perror("open infile");
return 1;
}
outfd = open(argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0644);
if (outfd < 0) {
perror("open outfile");
return 1;
}
if (setup_context(QD, &ring))
return 1;
if (get_file_size(infd, &insize))
return 1;
ret = copy_file(&ring, insize);
close(infd);
close(outfd);
io_uring_queue_exit(&ring);
return ret;
}

143
tools/io_uring/liburing.h Normal file
View File

@ -0,0 +1,143 @@
#ifndef LIB_URING_H
#define LIB_URING_H
#include <sys/uio.h>
#include <signal.h>
#include <string.h>
#include "../../include/uapi/linux/io_uring.h"
/*
* Library interface to io_uring
*/
struct io_uring_sq {
unsigned *khead;
unsigned *ktail;
unsigned *kring_mask;
unsigned *kring_entries;
unsigned *kflags;
unsigned *kdropped;
unsigned *array;
struct io_uring_sqe *sqes;
unsigned sqe_head;
unsigned sqe_tail;
size_t ring_sz;
};
struct io_uring_cq {
unsigned *khead;
unsigned *ktail;
unsigned *kring_mask;
unsigned *kring_entries;
unsigned *koverflow;
struct io_uring_cqe *cqes;
size_t ring_sz;
};
struct io_uring {
struct io_uring_sq sq;
struct io_uring_cq cq;
int ring_fd;
};
/*
* System calls
*/
extern int io_uring_setup(unsigned entries, struct io_uring_params *p);
extern int io_uring_enter(unsigned fd, unsigned to_submit,
unsigned min_complete, unsigned flags, sigset_t *sig);
extern int io_uring_register(int fd, unsigned int opcode, void *arg,
unsigned int nr_args);
/*
* Library interface
*/
extern int io_uring_queue_init(unsigned entries, struct io_uring *ring,
unsigned flags);
extern int io_uring_queue_mmap(int fd, struct io_uring_params *p,
struct io_uring *ring);
extern void io_uring_queue_exit(struct io_uring *ring);
extern int io_uring_get_completion(struct io_uring *ring,
struct io_uring_cqe **cqe_ptr);
extern int io_uring_wait_completion(struct io_uring *ring,
struct io_uring_cqe **cqe_ptr);
extern int io_uring_submit(struct io_uring *ring);
extern struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring);
/*
* Command prep helpers
*/
static inline void io_uring_sqe_set_data(struct io_uring_sqe *sqe, void *data)
{
sqe->user_data = (unsigned long) data;
}
static inline void io_uring_prep_rw(int op, struct io_uring_sqe *sqe, int fd,
void *addr, unsigned len, off_t offset)
{
memset(sqe, 0, sizeof(*sqe));
sqe->opcode = op;
sqe->fd = fd;
sqe->off = offset;
sqe->addr = (unsigned long) addr;
sqe->len = len;
}
static inline void io_uring_prep_readv(struct io_uring_sqe *sqe, int fd,
struct iovec *iovecs, unsigned nr_vecs,
off_t offset)
{
io_uring_prep_rw(IORING_OP_READV, sqe, fd, iovecs, nr_vecs, offset);
}
static inline void io_uring_prep_read_fixed(struct io_uring_sqe *sqe, int fd,
void *buf, unsigned nbytes,
off_t offset)
{
io_uring_prep_rw(IORING_OP_READ_FIXED, sqe, fd, buf, nbytes, offset);
}
static inline void io_uring_prep_writev(struct io_uring_sqe *sqe, int fd,
struct iovec *iovecs, unsigned nr_vecs,
off_t offset)
{
io_uring_prep_rw(IORING_OP_WRITEV, sqe, fd, iovecs, nr_vecs, offset);
}
static inline void io_uring_prep_write_fixed(struct io_uring_sqe *sqe, int fd,
void *buf, unsigned nbytes,
off_t offset)
{
io_uring_prep_rw(IORING_OP_WRITE_FIXED, sqe, fd, buf, nbytes, offset);
}
static inline void io_uring_prep_poll_add(struct io_uring_sqe *sqe, int fd,
short poll_mask)
{
memset(sqe, 0, sizeof(*sqe));
sqe->opcode = IORING_OP_POLL_ADD;
sqe->fd = fd;
sqe->poll_events = poll_mask;
}
static inline void io_uring_prep_poll_remove(struct io_uring_sqe *sqe,
void *user_data)
{
memset(sqe, 0, sizeof(*sqe));
sqe->opcode = IORING_OP_POLL_REMOVE;
sqe->addr = (unsigned long) user_data;
}
static inline void io_uring_prep_fsync(struct io_uring_sqe *sqe, int fd,
int datasync)
{
memset(sqe, 0, sizeof(*sqe));
sqe->opcode = IORING_OP_FSYNC;
sqe->fd = fd;
if (datasync)
sqe->fsync_flags = IORING_FSYNC_DATASYNC;
}
#endif

164
tools/io_uring/queue.c Normal file
View File

@ -0,0 +1,164 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include "liburing.h"
#include "barrier.h"
static int __io_uring_get_completion(struct io_uring *ring,
struct io_uring_cqe **cqe_ptr, int wait)
{
struct io_uring_cq *cq = &ring->cq;
const unsigned mask = *cq->kring_mask;
unsigned head;
int ret;
*cqe_ptr = NULL;
head = *cq->khead;
do {
/*
* It's necessary to use a read_barrier() before reading
* the CQ tail, since the kernel updates it locklessly. The
* kernel has the matching store barrier for the update. The
* kernel also ensures that previous stores to CQEs are ordered
* with the tail update.
*/
read_barrier();
if (head != *cq->ktail) {
*cqe_ptr = &cq->cqes[head & mask];
break;
}
if (!wait)
break;
ret = io_uring_enter(ring->ring_fd, 0, 1,
IORING_ENTER_GETEVENTS, NULL);
if (ret < 0)
return -errno;
} while (1);
if (*cqe_ptr) {
*cq->khead = head + 1;
/*
* Ensure that the kernel sees our new head, the kernel has
* the matching read barrier.
*/
write_barrier();
}
return 0;
}
/*
* Return an IO completion, if one is readily available
*/
int io_uring_get_completion(struct io_uring *ring,
struct io_uring_cqe **cqe_ptr)
{
return __io_uring_get_completion(ring, cqe_ptr, 0);
}
/*
* Return an IO completion, waiting for it if necessary
*/
int io_uring_wait_completion(struct io_uring *ring,
struct io_uring_cqe **cqe_ptr)
{
return __io_uring_get_completion(ring, cqe_ptr, 1);
}
/*
* Submit sqes acquired from io_uring_get_sqe() to the kernel.
*
* Returns number of sqes submitted
*/
int io_uring_submit(struct io_uring *ring)
{
struct io_uring_sq *sq = &ring->sq;
const unsigned mask = *sq->kring_mask;
unsigned ktail, ktail_next, submitted;
int ret;
/*
* If we have pending IO in the kring, submit it first. We need a
* read barrier here to match the kernels store barrier when updating
* the SQ head.
*/
read_barrier();
if (*sq->khead != *sq->ktail) {
submitted = *sq->kring_entries;
goto submit;
}
if (sq->sqe_head == sq->sqe_tail)
return 0;
/*
* Fill in sqes that we have queued up, adding them to the kernel ring
*/
submitted = 0;
ktail = ktail_next = *sq->ktail;
while (sq->sqe_head < sq->sqe_tail) {
ktail_next++;
read_barrier();
sq->array[ktail & mask] = sq->sqe_head & mask;
ktail = ktail_next;
sq->sqe_head++;
submitted++;
}
if (!submitted)
return 0;
if (*sq->ktail != ktail) {
/*
* First write barrier ensures that the SQE stores are updated
* with the tail update. This is needed so that the kernel
* will never see a tail update without the preceeding sQE
* stores being done.
*/
write_barrier();
*sq->ktail = ktail;
/*
* The kernel has the matching read barrier for reading the
* SQ tail.
*/
write_barrier();
}
submit:
ret = io_uring_enter(ring->ring_fd, submitted, 0,
IORING_ENTER_GETEVENTS, NULL);
if (ret < 0)
return -errno;
return 0;
}
/*
* Return an sqe to fill. Application must later call io_uring_submit()
* when it's ready to tell the kernel about it. The caller may call this
* function multiple times before calling io_uring_submit().
*
* Returns a vacant sqe, or NULL if we're full.
*/
struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring)
{
struct io_uring_sq *sq = &ring->sq;
unsigned next = sq->sqe_tail + 1;
struct io_uring_sqe *sqe;
/*
* All sqes are used
*/
if (next - sq->sqe_head > *sq->kring_entries)
return NULL;
sqe = &sq->sqes[sq->sqe_tail & *sq->kring_mask];
sq->sqe_tail = next;
return sqe;
}

103
tools/io_uring/setup.c Normal file
View File

@ -0,0 +1,103 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include "liburing.h"
static int io_uring_mmap(int fd, struct io_uring_params *p,
struct io_uring_sq *sq, struct io_uring_cq *cq)
{
size_t size;
void *ptr;
int ret;
sq->ring_sz = p->sq_off.array + p->sq_entries * sizeof(unsigned);
ptr = mmap(0, sq->ring_sz, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
if (ptr == MAP_FAILED)
return -errno;
sq->khead = ptr + p->sq_off.head;
sq->ktail = ptr + p->sq_off.tail;
sq->kring_mask = ptr + p->sq_off.ring_mask;
sq->kring_entries = ptr + p->sq_off.ring_entries;
sq->kflags = ptr + p->sq_off.flags;
sq->kdropped = ptr + p->sq_off.dropped;
sq->array = ptr + p->sq_off.array;
size = p->sq_entries * sizeof(struct io_uring_sqe),
sq->sqes = mmap(0, size, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_POPULATE, fd,
IORING_OFF_SQES);
if (sq->sqes == MAP_FAILED) {
ret = -errno;
err:
munmap(sq->khead, sq->ring_sz);
return ret;
}
cq->ring_sz = p->cq_off.cqes + p->cq_entries * sizeof(struct io_uring_cqe);
ptr = mmap(0, cq->ring_sz, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING);
if (ptr == MAP_FAILED) {
ret = -errno;
munmap(sq->sqes, p->sq_entries * sizeof(struct io_uring_sqe));
goto err;
}
cq->khead = ptr + p->cq_off.head;
cq->ktail = ptr + p->cq_off.tail;
cq->kring_mask = ptr + p->cq_off.ring_mask;
cq->kring_entries = ptr + p->cq_off.ring_entries;
cq->koverflow = ptr + p->cq_off.overflow;
cq->cqes = ptr + p->cq_off.cqes;
return 0;
}
/*
* For users that want to specify sq_thread_cpu or sq_thread_idle, this
* interface is a convenient helper for mmap()ing the rings.
* Returns -1 on error, or zero on success. On success, 'ring'
* contains the necessary information to read/write to the rings.
*/
int io_uring_queue_mmap(int fd, struct io_uring_params *p, struct io_uring *ring)
{
int ret;
memset(ring, 0, sizeof(*ring));
ret = io_uring_mmap(fd, p, &ring->sq, &ring->cq);
if (!ret)
ring->ring_fd = fd;
return ret;
}
/*
* Returns -1 on error, or zero on success. On success, 'ring'
* contains the necessary information to read/write to the rings.
*/
int io_uring_queue_init(unsigned entries, struct io_uring *ring, unsigned flags)
{
struct io_uring_params p;
int fd;
memset(&p, 0, sizeof(p));
p.flags = flags;
fd = io_uring_setup(entries, &p);
if (fd < 0)
return fd;
return io_uring_queue_mmap(fd, &p, ring);
}
void io_uring_queue_exit(struct io_uring *ring)
{
struct io_uring_sq *sq = &ring->sq;
struct io_uring_cq *cq = &ring->cq;
munmap(sq->sqes, *sq->kring_entries * sizeof(struct io_uring_sqe));
munmap(sq->khead, sq->ring_sz);
munmap(cq->khead, cq->ring_sz);
close(ring->ring_fd);
}

40
tools/io_uring/syscall.c Normal file
View File

@ -0,0 +1,40 @@
/*
* Will go away once libc support is there
*/
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/uio.h>
#include <signal.h>
#include "liburing.h"
#if defined(__x86_64) || defined(__i386__)
#ifndef __NR_sys_io_uring_setup
#define __NR_sys_io_uring_setup 425
#endif
#ifndef __NR_sys_io_uring_enter
#define __NR_sys_io_uring_enter 426
#endif
#ifndef __NR_sys_io_uring_register
#define __NR_sys_io_uring_register 427
#endif
#else
#error "Arch not supported yet"
#endif
int io_uring_register(int fd, unsigned int opcode, void *arg,
unsigned int nr_args)
{
return syscall(__NR_sys_io_uring_register, fd, opcode, arg, nr_args);
}
int io_uring_setup(unsigned entries, struct io_uring_params *p)
{
return syscall(__NR_sys_io_uring_setup, entries, p);
}
int io_uring_enter(unsigned fd, unsigned to_submit, unsigned min_complete,
unsigned flags, sigset_t *sig)
{
return syscall(__NR_sys_io_uring_enter, fd, to_submit, min_complete,
flags, sig, _NSIG / 8);
}