author | Jens Axboe <axboe@kernel.dk> | 2019-01-08 06:51:07 -0700
---|---|---
committer | Jens Axboe <axboe@kernel.dk> | 2019-01-08 07:38:15 -0700
commit | f93c84e1b07474a7d776403b3516feeff4f3c933 (patch) |
tree | 4b71b6a4013c643d669398fdb8f33f74b6081cd2 /src |
Initial commit
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Diffstat (limited to 'src')
-rw-r--r-- | src/Makefile | 61
-rw-r--r-- | src/barrier.h | 16
-rw-r--r-- | src/io_uring.c | 193
-rw-r--r-- | src/io_uring.h | 115
-rw-r--r-- | src/liburing.h | 57
-rw-r--r-- | src/liburing.map | 11
-rw-r--r-- | src/syscall.c | 31
7 files changed, 484 insertions, 0 deletions
```diff
diff --git a/src/Makefile b/src/Makefile
new file mode 100644
index 0000000..635f65a
--- /dev/null
+++ b/src/Makefile
@@ -0,0 +1,61 @@
+prefix=/usr
+includedir=$(prefix)/include
+libdir=$(prefix)/lib
+
+CFLAGS ?= -g -fomit-frame-pointer -O2
+CFLAGS += -Wall -I. -fPIC
+SO_CFLAGS=-shared $(CFLAGS)
+L_CFLAGS=$(CFLAGS)
+LINK_FLAGS=
+LINK_FLAGS+=$(LDFLAGS)
+ENABLE_SHARED ?= 1
+
+soname=liburing.so.1
+minor=0
+micro=1
+libname=$(soname).$(minor).$(micro)
+all_targets += liburing.a
+
+ifeq ($(ENABLE_SHARED),1)
+all_targets += $(libname)
+endif
+
+all: $(all_targets)
+
+liburing_srcs := io_uring.c syscall.c
+
+liburing_objs := $(patsubst %.c,%.ol,$(liburing_srcs))
+liburing_sobjs := $(patsubst %.c,%.os,$(liburing_srcs))
+
+$(liburing_objs) $(liburing_sobjs): io_uring.h
+
+%.os: %.c
+	$(CC) $(SO_CFLAGS) -c -o $@ $<
+
+%.ol: %.c
+	$(CC) $(L_CFLAGS) -c -o $@ $<
+
+AR ?= ar
+RANLIB ?= ranlib
+liburing.a: $(liburing_objs)
+	rm -f liburing.a
+	$(AR) r liburing.a $^
+	$(RANLIB) liburing.a
+
+$(libname): $(liburing_sobjs) liburing.map
+	$(CC) $(SO_CFLAGS) -Wl,--version-script=liburing.map -Wl,-soname=$(soname) -o $@ $(liburing_sobjs) $(LINK_FLAGS)
+
+install: $(all_targets)
+	install -D -m 644 io_uring.h $(includedir)/io_uring.h
+	install -D -m 644 liburing.a $(libdir)/liburing.a
+ifeq ($(ENABLE_SHARED),1)
+	install -D -m 755 $(libname) $(libdir)/$(libname)
+	ln -sf $(libname) $(libdir)/$(soname)
+	ln -sf $(libname) $(libdir)/liburing.so
+endif
+
+$(liburing_objs): liburing.h
+
+clean:
+	rm -f $(all_targets) $(liburing_objs) $(liburing_sobjs) $(soname).new
+	rm -f *.so* *.a *.o
diff --git a/src/barrier.h b/src/barrier.h
new file mode 100644
index 0000000..0fd5c39
--- /dev/null
+++ b/src/barrier.h
@@ -0,0 +1,16 @@
+#ifndef LIBURING_BARRIER_H
+#define LIBURING_BARRIER_H
+
+#if defined(__x86_64)
+#define read_barrier()	__asm__ __volatile__("lfence":::"memory")
+#define write_barrier()	__asm__ __volatile__("sfence":::"memory")
+#else
+/*
+ * Add arch appropriate definitions. Be safe and use full barriers for
+ * archs we don't have support for.
+ */
+#define read_barrier()	__sync_synchronize()
+#define write_barrier()	__sync_synchronize()
+#endif
+
+#endif
diff --git a/src/io_uring.c b/src/io_uring.c
new file mode 100644
index 0000000..52b3553
--- /dev/null
+++ b/src/io_uring.c
@@ -0,0 +1,193 @@
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+
+#include "io_uring.h"
+#include "liburing.h"
+#include "barrier.h"
+
+/*
+ * Return an IO completion, waiting for it it necessary.
+ */
+int io_uring_get_completion(int fd, struct io_uring_cq *cq,
+			    struct io_uring_event **ev_ptr)
+{
+	const unsigned mask = *cq->kring_mask;
+	struct io_uring_event *ev = NULL;
+	unsigned head;
+	int ret;
+
+	head = *cq->khead;
+	do {
+		read_barrier();
+		if (head != *cq->ktail) {
+			ev = &cq->events[head & mask];
+			break;
+		}
+		ret = io_uring_enter(fd, 0, 1, IORING_ENTER_GETEVENTS);
+		if (ret < 0)
+			return -errno;
+	} while (1);
+
+	if (ev) {
+		*cq->khead = head + 1;
+		write_barrier();
+	}
+
+	*ev_ptr = ev;
+	return 0;
+}
+
+/*
+ * Submit iocbs acquired from io_uring_get_iocb() to the kernel.
+ *
+ * Returns number of iocbs submitted
+ */
+int io_uring_submit(int fd, struct io_uring_sq *sq)
+{
+	const unsigned mask = *sq->kring_mask;
+	unsigned ktail, ktail_next, submitted;
+
+	/*
+	 * If we have pending IO in the kring, submit it first
+	 */
+	read_barrier();
+	if (*sq->khead != *sq->ktail) {
+		submitted = *sq->kring_entries;
+		goto submit;
+	}
+
+	if (sq->iocb_head == sq->iocb_tail)
+		return 0;
+
+	/*
+	 * Fill in iocbs that we have queued up, adding them to the kernel ring
+	 */
+	submitted = 0;
+	ktail = ktail_next = *sq->ktail;
+	while (sq->iocb_head < sq->iocb_tail) {
+		ktail_next++;
+		read_barrier();
+		if (ktail_next == *sq->khead)
+			break;
+
+		sq->array[ktail & mask] = sq->iocb_head & mask;
+		ktail = ktail_next;
+
+		sq->iocb_head++;
+		submitted++;
+	}
+
+	if (!submitted)
+		return 0;
+
+	if (*sq->ktail != ktail) {
+		write_barrier();
+		*sq->ktail = ktail;
+		write_barrier();
+	}
+
+submit:
+	return io_uring_enter(fd, submitted, 0, IORING_ENTER_GETEVENTS);
+}
+
+/*
+ * Return an iocb to fill. Application must later call io_uring_submit()
+ * when it's ready to tell the kernel about it. The caller may call this
+ * function multiple times before calling io_uring_submit().
+ *
+ * Returns a vacant iocb, or NULL if we're full.
+ */
+struct io_uring_iocb *io_uring_get_iocb(struct io_uring_sq *sq)
+{
+	unsigned next = sq->iocb_tail + 1;
+	struct io_uring_iocb *iocb;
+
+	/*
+	 * All iocbs are used
+	 */
+	if (next - sq->iocb_head > *sq->kring_entries)
+		return NULL;
+
+	iocb = &sq->iocbs[sq->iocb_tail & *sq->kring_mask];
+	sq->iocb_tail = next;
+	return iocb;
+}
+
+static int io_uring_mmap(int fd, struct io_uring_params *p,
+			 struct io_uring_sq *sq, struct io_uring_cq *cq)
+{
+	size_t size;
+	void *ptr;
+	int ret;
+
+	sq->ring_sz = p->sq_off.array + p->sq_entries * sizeof(unsigned);
+	ptr = mmap(0, sq->ring_sz, PROT_READ | PROT_WRITE,
+			MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
+	if (ptr == MAP_FAILED)
+		return -errno;
+	sq->khead = ptr + p->sq_off.head;
+	sq->ktail = ptr + p->sq_off.tail;
+	sq->kring_mask = ptr + p->sq_off.ring_mask;
+	sq->kring_entries = ptr + p->sq_off.ring_entries;
+	sq->kflags = ptr + p->sq_off.flags;
+	sq->kdropped = ptr + p->sq_off.dropped;
+	sq->array = ptr + p->sq_off.array;
+
+	size = p->sq_entries * sizeof(struct io_uring_iocb);
+	sq->iocbs = mmap(0, size, PROT_READ | PROT_WRITE,
+				MAP_SHARED | MAP_POPULATE, fd,
+				IORING_OFF_IOCB);
+	if (sq->iocbs == MAP_FAILED) {
+		ret = -errno;
+err:
+		munmap(sq->khead, sq->ring_sz);
+		return ret;
+	}
+
+	cq->ring_sz = p->cq_off.events + p->cq_entries * sizeof(struct io_uring_event);
+	ptr = mmap(0, cq->ring_sz, PROT_READ | PROT_WRITE,
+			MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING);
+	if (ptr == MAP_FAILED) {
+		ret = -errno;
+		munmap(sq->iocbs, p->sq_entries * sizeof(struct io_uring_iocb));
+		goto err;
+	}
+	cq->khead = ptr + p->cq_off.head;
+	cq->ktail = ptr + p->cq_off.tail;
+	cq->kring_mask = ptr + p->cq_off.ring_mask;
+	cq->kring_entries = ptr + p->cq_off.ring_entries;
+	cq->koverflow = ptr + p->cq_off.overflow;
+	cq->events = ptr + p->cq_off.events;
+	return fd;
+}
+
+/*
+ * Returns -1 on error, or an 'fd' on success. On success, 'sq' and 'cq'
+ * contain the necessary information to read/write to the rings.
+ */
+int io_uring_queue_init(unsigned entries, struct io_uring_params *p,
+			struct iovec *iovecs, struct io_uring_sq *sq,
+			struct io_uring_cq *cq)
+{
+	int fd;
+
+	fd = io_uring_setup(entries, iovecs, p);
+	if (fd < 0)
+		return fd;
+
+	memset(sq, 0, sizeof(*sq));
+	memset(cq, 0, sizeof(*cq));
+	return io_uring_mmap(fd, p, sq, cq);
+}
+
+void io_uring_queue_exit(int fd, struct io_uring_sq *sq, struct io_uring_cq *cq)
+{
+	munmap(sq->iocbs, *sq->kring_entries * sizeof(struct io_uring_iocb));
+	munmap(sq->khead, sq->ring_sz);
+	munmap(cq->khead, cq->ring_sz);
+	close(fd);
+}
diff --git a/src/io_uring.h b/src/io_uring.h
new file mode 100644
index 0000000..7dd2112
--- /dev/null
+++ b/src/io_uring.h
@@ -0,0 +1,115 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Header file for the io_uring interface.
+ *
+ * Copyright (C) 2019 Jens Axboe
+ * Copyright (C) 2019 Christoph Hellwig
+ */
+#ifndef LINUX_IO_URING_H
+#define LINUX_IO_URING_H
+
+#include <linux/fs.h>
+#include <linux/types.h>
+
+/*
+ * IO submission data structure
+ */
+struct io_uring_iocb {
+	__u8	opcode;
+	__u8	flags;
+	__u16	ioprio;
+	__s32	fd;
+	__u64	off;
+	union {
+		void	*addr;
+		__u64	__pad;
+	};
+	__u32	len;
+	union {
+		__kernel_rwf_t	rw_flags;
+		__u32		__resv;
+	};
+};
+
+/*
+ * io_uring_setup() flags
+ */
+#define IORING_SETUP_IOPOLL	(1 << 0)	/* io_context is polled */
+#define IORING_SETUP_FIXEDBUFS	(1 << 1)	/* IO buffers are fixed */
+#define IORING_SETUP_SQTHREAD	(1 << 2)	/* Use SQ thread */
+#define IORING_SETUP_SQWQ	(1 << 3)	/* Use SQ workqueue */
+#define IORING_SETUP_SQPOLL	(1 << 4)	/* SQ thread polls */
+
+#define IORING_OP_READ		1
+#define IORING_OP_WRITE		2
+#define IORING_OP_FSYNC		3
+#define IORING_OP_FDSYNC	4
+#define IORING_OP_READ_FIXED	5
+#define IORING_OP_WRITE_FIXED	6
+
+/*
+ * IO completion data structure
+ */
+struct io_uring_event {
+	__u64	index;		/* what iocb this event came from */
+	__s32	res;		/* result code for this event */
+	__u32	flags;
+};
+
+/*
+ * io_uring_event->flags
+ */
+#define IOEV_FLAG_CACHEHIT	(1 << 0)	/* IO did not hit media */
+
+/*
+ * Magic offsets for the application to mmap the data it needs
+ */
+#define IORING_OFF_SQ_RING	0ULL
+#define IORING_OFF_CQ_RING	0x8000000ULL
+#define IORING_OFF_IOCB		0x10000000ULL
+
+/*
+ * Filled with the offset for mmap(2)
+ */
+struct io_sqring_offsets {
+	__u32 head;
+	__u32 tail;
+	__u32 ring_mask;
+	__u32 ring_entries;
+	__u32 flags;
+	__u32 dropped;
+	__u32 array;
+	__u32 resv[3];
+};
+
+#define IORING_SQ_NEED_WAKEUP	(1 << 0) /* needs io_uring_enter wakeup */
+
+struct io_cqring_offsets {
+	__u32 head;
+	__u32 tail;
+	__u32 ring_mask;
+	__u32 ring_entries;
+	__u32 overflow;
+	__u32 events;
+	__u32 resv[4];
+};
+
+/*
+ * io_uring_enter(2) flags
+ */
+#define IORING_ENTER_GETEVENTS	(1 << 0)
+
+/*
+ * Passed in for io_uring_setup(2). Copied back with updated info on success
+ */
+struct io_uring_params {
+	__u32 sq_entries;
+	__u32 cq_entries;
+	__u32 flags;
+	__u16 sq_thread_cpu;
+	__u16 resv[9];
+	struct io_sqring_offsets sq_off;
+	struct io_cqring_offsets cq_off;
+};
+
+#endif
diff --git a/src/liburing.h b/src/liburing.h
new file mode 100644
index 0000000..454591a
--- /dev/null
+++ b/src/liburing.h
@@ -0,0 +1,57 @@
+#ifndef LIB_URING_H
+#define LIB_URING_H
+
+#include <sys/uio.h>
+#include "io_uring.h"
+
+/*
+ * Library interface to io_uring
+ */
+struct io_uring_sq {
+	unsigned *khead;
+	unsigned *ktail;
+	unsigned *kring_mask;
+	unsigned *kring_entries;
+	unsigned *kflags;
+	unsigned *kdropped;
+	unsigned *array;
+	struct io_uring_iocb *iocbs;
+
+	unsigned iocb_head;
+	unsigned iocb_tail;
+
+	size_t ring_sz;
+};
+
+struct io_uring_cq {
+	unsigned *khead;
+	unsigned *ktail;
+	unsigned *kring_mask;
+	unsigned *kring_entries;
+	unsigned *koverflow;
+	struct io_uring_event *events;
+
+	size_t ring_sz;
+};
+
+/*
+ * System calls
+ */
+extern int io_uring_setup(unsigned entries, struct iovec *iovecs,
+	struct io_uring_params *p);
+extern int io_uring_enter(unsigned fd, unsigned to_submit,
+	unsigned min_complete, unsigned flags);
+
+/*
+ * Library interface
+ */
+extern int io_uring_queue_init(unsigned entries, struct io_uring_params *p,
+	struct iovec *iovecs, struct io_uring_sq *sq, struct io_uring_cq *cq);
+extern void io_uring_queue_exit(int fd, struct io_uring_sq *sq,
+	struct io_uring_cq *cq);
+extern int io_uring_get_completion(int fd, struct io_uring_cq *cq,
+	struct io_uring_event **ev_ptr);
+extern int io_uring_submit(int fd, struct io_uring_sq *sq);
+extern struct io_uring_iocb *io_uring_get_iocb(struct io_uring_sq *sq);
+
+#endif
diff --git a/src/liburing.map b/src/liburing.map
new file mode 100644
index 0000000..ef48835
--- /dev/null
+++ b/src/liburing.map
@@ -0,0 +1,11 @@
+LIBURING_0.1 {
+	global:
+		io_uring_queue_init;
+		io_uring_queue_exit;
+		io_uring_get_completion;
+		io_uring_submit;
+		io_uring_get_iocb;
+	local:
+		*;
+
+};
diff --git a/src/syscall.c b/src/syscall.c
new file mode 100644
index 0000000..eafdd39
--- /dev/null
+++ b/src/syscall.c
@@ -0,0 +1,31 @@
+/*
+ * Will go away once libc support is there
+ */
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/uio.h>
+#include "io_uring.h"
+
+#if defined(__x86_64)
+#ifndef __NR_sys_io_uring_setup
+#define __NR_sys_io_uring_setup	335
+#endif
+#ifndef __NR_sys_io_uring_enter
+#define __NR_sys_io_uring_enter	336
+#endif
+#else
+#error "Arch not supported yet"
+#endif
+
+int io_uring_setup(unsigned int entries, struct iovec *iovecs,
+		   struct io_uring_params *p)
+{
+	return syscall(__NR_sys_io_uring_setup, entries, iovecs, p);
+}
+
+int io_uring_enter(int fd, unsigned int to_submit, unsigned int min_complete,
+		   unsigned int flags)
+{
+	return syscall(__NR_sys_io_uring_enter, fd, to_submit, min_complete,
+			flags);
+}
```
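For context, the interface exported in this commit (io_uring_queue_init(), io_uring_get_iocb(), io_uring_submit(), io_uring_get_completion() and io_uring_queue_exit()) is meant to be driven roughly as sketched below. This is not part of the commit: the file name read_one.c, the 4 KiB buffer size, and the error handling are illustrative only, and it assumes a ring set up without fixed buffers (so no iovecs are passed to io_uring_queue_init()).

```c
/*
 * read_one.c -- hypothetical example, not part of this commit.
 * Reads the first 4 KiB of a file through the ring interface above.
 */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "liburing.h"

#define BS	4096

int main(int argc, char *argv[])
{
	struct io_uring_params p;
	struct io_uring_sq sq;
	struct io_uring_cq cq;
	struct io_uring_iocb *iocb;
	struct io_uring_event *ev;
	void *buf;
	int fd, ring_fd, ret;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* small ring, no fixed buffers, so no iovecs at setup time */
	memset(&p, 0, sizeof(p));
	ring_fd = io_uring_queue_init(4, &p, NULL, &sq, &cq);
	if (ring_fd < 0) {
		fprintf(stderr, "io_uring_queue_init: %d\n", ring_fd);
		return 1;
	}

	if (posix_memalign(&buf, BS, BS))
		return 1;

	/* grab a vacant iocb and describe a read of the first BS bytes */
	iocb = io_uring_get_iocb(&sq);
	if (!iocb)
		return 1;
	iocb->opcode = IORING_OP_READ;
	iocb->fd = fd;
	iocb->off = 0;
	iocb->addr = buf;
	iocb->len = BS;

	/* hand the queued iocb to the kernel, then wait for its completion */
	ret = io_uring_submit(ring_fd, &sq);
	if (ret < 0)
		return 1;
	ret = io_uring_get_completion(ring_fd, &cq, &ev);
	if (ret < 0 || !ev)
		return 1;

	printf("read returned %d (iocb index %llu)\n", ev->res,
	       (unsigned long long) ev->index);

	io_uring_queue_exit(ring_fd, &sq, &cq);
	free(buf);
	return 0;
}
```

Assuming the library has been built with the Makefile above, this would be compiled from the src directory with something like `cc -I. -o read_one read_one.c liburing.a`.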