author     Jens Axboe <axboe@kernel.dk>    2019-01-08 06:51:07 -0700
committer  Jens Axboe <axboe@kernel.dk>    2019-01-08 07:38:15 -0700
commit     f93c84e1b07474a7d776403b3516feeff4f3c933 (patch)
tree       4b71b6a4013c643d669398fdb8f33f74b6081cd2 /src
Initial commit
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Diffstat (limited to 'src')
-rw-r--r--  src/Makefile      61
-rw-r--r--  src/barrier.h     16
-rw-r--r--  src/io_uring.c   193
-rw-r--r--  src/io_uring.h   115
-rw-r--r--  src/liburing.h    57
-rw-r--r--  src/liburing.map  11
-rw-r--r--  src/syscall.c     31
7 files changed, 484 insertions, 0 deletions
diff --git a/src/Makefile b/src/Makefile
new file mode 100644
index 0000000..635f65a
--- /dev/null
+++ b/src/Makefile
@@ -0,0 +1,61 @@
+prefix=/usr
+includedir=$(prefix)/include
+libdir=$(prefix)/lib
+
+CFLAGS ?= -g -fomit-frame-pointer -O2
+CFLAGS += -Wall -I. -fPIC
+SO_CFLAGS=-shared $(CFLAGS)
+L_CFLAGS=$(CFLAGS)
+LINK_FLAGS=
+LINK_FLAGS+=$(LDFLAGS)
+ENABLE_SHARED ?= 1
+
+soname=liburing.so.1
+minor=0
+micro=1
+libname=$(soname).$(minor).$(micro)
+all_targets += liburing.a
+
+ifeq ($(ENABLE_SHARED),1)
+all_targets += $(libname)
+endif
+
+all: $(all_targets)
+
+liburing_srcs := io_uring.c syscall.c
+
+liburing_objs := $(patsubst %.c,%.ol,$(liburing_srcs))
+liburing_sobjs := $(patsubst %.c,%.os,$(liburing_srcs))
+
+$(liburing_objs) $(liburing_sobjs): io_uring.h
+
+%.os: %.c
+ $(CC) $(SO_CFLAGS) -c -o $@ $<
+
+%.ol: %.c
+ $(CC) $(L_CFLAGS) -c -o $@ $<
+
+AR ?= ar
+RANLIB ?= ranlib
+liburing.a: $(liburing_objs)
+ rm -f liburing.a
+ $(AR) r liburing.a $^
+ $(RANLIB) liburing.a
+
+$(libname): $(liburing_sobjs) liburing.map
+ $(CC) $(SO_CFLAGS) -Wl,--version-script=liburing.map -Wl,-soname=$(soname) -o $@ $(liburing_sobjs) $(LINK_FLAGS)
+
+install: $(all_targets)
+ install -D -m 644 io_uring.h $(includedir)/io_uring.h
+ install -D -m 644 liburing.a $(libdir)/liburing.a
+ifeq ($(ENABLE_SHARED),1)
+ install -D -m 755 $(libname) $(libdir)/$(libname)
+ ln -sf $(libname) $(libdir)/$(soname)
+ ln -sf $(libname) $(libdir)/liburing.so
+endif
+
+$(liburing_objs): liburing.h
+
+clean:
+ rm -f $(all_targets) $(liburing_objs) $(liburing_sobjs) $(soname).new
+ rm -f *.so* *.a *.o
diff --git a/src/barrier.h b/src/barrier.h
new file mode 100644
index 0000000..0fd5c39
--- /dev/null
+++ b/src/barrier.h
@@ -0,0 +1,16 @@
+#ifndef LIBURING_BARRIER_H
+#define LIBURING_BARRIER_H
+
+#if defined(__x86_64)
+#define read_barrier() __asm__ __volatile__("lfence":::"memory")
+#define write_barrier() __asm__ __volatile__("sfence":::"memory")
+#else
+/*
+ * Add arch appropriate definitions. Be safe and use full barriers for
+ * archs we don't have support for.
+ */
+#define read_barrier() __sync_synchronize()
+#define write_barrier() __sync_synchronize()
+#endif
+
+#endif
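
Editor's note: the barrier pair above is what the rest of this commit uses to coordinate with the kernel's ring updates: read_barrier() before inspecting a kernel-written index, write_barrier() after publishing an application-written one. A minimal sketch of the completion-ring consumer side follows; cq_peek() and cq_advance() are hypothetical helper names, not part of this commit, and they assume the io_uring_cq layout introduced later in this patch.

#include "liburing.h"
#include "barrier.h"

/* Peek at the next completion without consuming it; NULL if the ring is empty. */
static struct io_uring_event *cq_peek(struct io_uring_cq *cq)
{
	unsigned head = *cq->khead;

	read_barrier();			/* observe the kernel's latest ktail */
	if (head == *cq->ktail)
		return NULL;
	return &cq->events[head & *cq->kring_mask];
}

/* Mark one completion as consumed. */
static void cq_advance(struct io_uring_cq *cq)
{
	(*cq->khead)++;
	write_barrier();		/* publish the new khead to the kernel */
}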
diff --git a/src/io_uring.c b/src/io_uring.c
new file mode 100644
index 0000000..52b3553
--- /dev/null
+++ b/src/io_uring.c
@@ -0,0 +1,193 @@
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+
+#include "io_uring.h"
+#include "liburing.h"
+#include "barrier.h"
+
+/*
+ * Return an IO completion, waiting for it if necessary.
+ */
+int io_uring_get_completion(int fd, struct io_uring_cq *cq,
+ struct io_uring_event **ev_ptr)
+{
+ const unsigned mask = *cq->kring_mask;
+ struct io_uring_event *ev = NULL;
+ unsigned head;
+ int ret;
+
+ head = *cq->khead;
+ do {
+ read_barrier();
+ if (head != *cq->ktail) {
+ ev = &cq->events[head & mask];
+ break;
+ }
+ ret = io_uring_enter(fd, 0, 1, IORING_ENTER_GETEVENTS);
+ if (ret < 0)
+ return -errno;
+ } while (1);
+
+ if (ev) {
+ *cq->khead = head + 1;
+ write_barrier();
+ }
+
+ *ev_ptr = ev;
+ return 0;
+}
+
+/*
+ * Submit iocbs acquired from io_uring_get_iocb() to the kernel.
+ *
+ * Returns number of iocbs submitted
+ */
+int io_uring_submit(int fd, struct io_uring_sq *sq)
+{
+ const unsigned mask = *sq->kring_mask;
+ unsigned ktail, ktail_next, submitted;
+
+ /*
+ * If we have pending IO in the kring, submit it first
+ */
+ read_barrier();
+ if (*sq->khead != *sq->ktail) {
+ submitted = *sq->kring_entries;
+ goto submit;
+ }
+
+ if (sq->iocb_head == sq->iocb_tail)
+ return 0;
+
+ /*
+ * Fill in iocbs that we have queued up, adding them to the kernel ring
+ */
+ submitted = 0;
+ ktail = ktail_next = *sq->ktail;
+ while (sq->iocb_head < sq->iocb_tail) {
+ ktail_next++;
+ read_barrier();
+ if (ktail_next == *sq->khead)
+ break;
+
+ sq->array[ktail & mask] = sq->iocb_head & mask;
+ ktail = ktail_next;
+
+ sq->iocb_head++;
+ submitted++;
+ }
+
+ if (!submitted)
+ return 0;
+
+ if (*sq->ktail != ktail) {
+ write_barrier();
+ *sq->ktail = ktail;
+ write_barrier();
+ }
+
+submit:
+ return io_uring_enter(fd, submitted, 0, IORING_ENTER_GETEVENTS);
+}
+
+/*
+ * Return an iocb to fill. Application must later call io_uring_submit()
+ * when it's ready to tell the kernel about it. The caller may call this
+ * function multiple times before calling io_uring_submit().
+ *
+ * Returns a vacant iocb, or NULL if we're full.
+ */
+struct io_uring_iocb *io_uring_get_iocb(struct io_uring_sq *sq)
+{
+ unsigned next = sq->iocb_tail + 1;
+ struct io_uring_iocb *iocb;
+
+ /*
+ * All iocbs are used
+ */
+ if (next - sq->iocb_head > *sq->kring_entries)
+ return NULL;
+
+ iocb = &sq->iocbs[sq->iocb_tail & *sq->kring_mask];
+ sq->iocb_tail = next;
+ return iocb;
+}
+
+static int io_uring_mmap(int fd, struct io_uring_params *p,
+ struct io_uring_sq *sq, struct io_uring_cq *cq)
+{
+ size_t size;
+ void *ptr;
+ int ret;
+
+ sq->ring_sz = p->sq_off.array + p->sq_entries * sizeof(unsigned);
+ ptr = mmap(0, sq->ring_sz, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
+ if (ptr == MAP_FAILED)
+ return -errno;
+ sq->khead = ptr + p->sq_off.head;
+ sq->ktail = ptr + p->sq_off.tail;
+ sq->kring_mask = ptr + p->sq_off.ring_mask;
+ sq->kring_entries = ptr + p->sq_off.ring_entries;
+ sq->kflags = ptr + p->sq_off.flags;
+ sq->kdropped = ptr + p->sq_off.dropped;
+ sq->array = ptr + p->sq_off.array;
+
+ size = p->sq_entries * sizeof(struct io_uring_iocb);
+ sq->iocbs = mmap(0, size, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE, fd,
+ IORING_OFF_IOCB);
+ if (sq->iocbs == MAP_FAILED) {
+ ret = -errno;
+err:
+ munmap(sq->khead, sq->ring_sz);
+ return ret;
+ }
+
+ cq->ring_sz = p->cq_off.events + p->cq_entries * sizeof(struct io_uring_event);
+ ptr = mmap(0, cq->ring_sz, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING);
+ if (ptr == MAP_FAILED) {
+ ret = -errno;
+ munmap(sq->iocbs, p->sq_entries * sizeof(struct io_uring_iocb));
+ goto err;
+ }
+ cq->khead = ptr + p->cq_off.head;
+ cq->ktail = ptr + p->cq_off.tail;
+ cq->kring_mask = ptr + p->cq_off.ring_mask;
+ cq->kring_entries = ptr + p->cq_off.ring_entries;
+ cq->koverflow = ptr + p->cq_off.overflow;
+ cq->events = ptr + p->cq_off.events;
+ return fd;
+}
+
+/*
+ * Returns -1 on error, or an 'fd' on success. On success, 'sq' and 'cq'
+ * contain the necessary information to read/write to the rings.
+ */
+int io_uring_queue_init(unsigned entries, struct io_uring_params *p,
+ struct iovec *iovecs, struct io_uring_sq *sq,
+ struct io_uring_cq *cq)
+{
+ int fd;
+
+ fd = io_uring_setup(entries, iovecs, p);
+ if (fd < 0)
+ return fd;
+
+ memset(sq, 0, sizeof(*sq));
+ memset(cq, 0, sizeof(*cq));
+ return io_uring_mmap(fd, p, sq, cq);
+}
+
+void io_uring_queue_exit(int fd, struct io_uring_sq *sq, struct io_uring_cq *cq)
+{
+ munmap(sq->iocbs, *sq->kring_entries * sizeof(struct io_uring_iocb));
+ munmap(sq->khead, sq->ring_sz);
+ munmap(cq->khead, cq->ring_sz);
+ close(fd);
+}
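
Editor's note: putting io_uring_queue_init() and io_uring_queue_exit() together, a plausible setup/teardown sequence looks like the sketch below. The zeroed io_uring_params, the NULL iovecs argument (a ring without fixed buffers), and the queue depth of 32 are illustrative assumptions, not something this commit prescribes.

#include <string.h>
#include <stdio.h>
#include "liburing.h"

int main(void)
{
	struct io_uring_params p;
	struct io_uring_sq sq;
	struct io_uring_cq cq;
	int fd;

	memset(&p, 0, sizeof(p));

	/* 32-entry ring, no fixed buffers, default interrupt-driven mode */
	fd = io_uring_queue_init(32, &p, NULL, &sq, &cq);
	if (fd < 0) {
		perror("io_uring_queue_init");
		return 1;
	}

	/* ... queue iocbs, submit, reap completions ... */

	io_uring_queue_exit(fd, &sq, &cq);
	return 0;
}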
diff --git a/src/io_uring.h b/src/io_uring.h
new file mode 100644
index 0000000..7dd2112
--- /dev/null
+++ b/src/io_uring.h
@@ -0,0 +1,115 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Header file for the io_uring interface.
+ *
+ * Copyright (C) 2019 Jens Axboe
+ * Copyright (C) 2019 Christoph Hellwig
+ */
+#ifndef LINUX_IO_URING_H
+#define LINUX_IO_URING_H
+
+#include <linux/fs.h>
+#include <linux/types.h>
+
+/*
+ * IO submission data structure
+ */
+struct io_uring_iocb {
+ __u8 opcode;
+ __u8 flags;
+ __u16 ioprio;
+ __s32 fd;
+ __u64 off;
+ union {
+ void *addr;
+ __u64 __pad;
+ };
+ __u32 len;
+ union {
+ __kernel_rwf_t rw_flags;
+ __u32 __resv;
+ };
+};
+
+/*
+ * io_uring_setup() flags
+ */
+#define IORING_SETUP_IOPOLL (1 << 0) /* io_context is polled */
+#define IORING_SETUP_FIXEDBUFS (1 << 1) /* IO buffers are fixed */
+#define IORING_SETUP_SQTHREAD (1 << 2) /* Use SQ thread */
+#define IORING_SETUP_SQWQ (1 << 3) /* Use SQ workqueue */
+#define IORING_SETUP_SQPOLL (1 << 4) /* SQ thread polls */
+
+#define IORING_OP_READ 1
+#define IORING_OP_WRITE 2
+#define IORING_OP_FSYNC 3
+#define IORING_OP_FDSYNC 4
+#define IORING_OP_READ_FIXED 5
+#define IORING_OP_WRITE_FIXED 6
+
+/*
+ * IO completion data structure
+ */
+struct io_uring_event {
+ __u64 index; /* what iocb this event came from */
+ __s32 res; /* result code for this event */
+ __u32 flags;
+};
+
+/*
+ * io_uring_event->flags
+ */
+#define IOEV_FLAG_CACHEHIT (1 << 0) /* IO did not hit media */
+
+/*
+ * Magic offsets for the application to mmap the data it needs
+ */
+#define IORING_OFF_SQ_RING 0ULL
+#define IORING_OFF_CQ_RING 0x8000000ULL
+#define IORING_OFF_IOCB 0x10000000ULL
+
+/*
+ * Filled with the offset for mmap(2)
+ */
+struct io_sqring_offsets {
+ __u32 head;
+ __u32 tail;
+ __u32 ring_mask;
+ __u32 ring_entries;
+ __u32 flags;
+ __u32 dropped;
+ __u32 array;
+ __u32 resv[3];
+};
+
+#define IORING_SQ_NEED_WAKEUP (1 << 0) /* needs io_uring_enter wakeup */
+
+struct io_cqring_offsets {
+ __u32 head;
+ __u32 tail;
+ __u32 ring_mask;
+ __u32 ring_entries;
+ __u32 overflow;
+ __u32 events;
+ __u32 resv[4];
+};
+
+/*
+ * io_uring_enter(2) flags
+ */
+#define IORING_ENTER_GETEVENTS (1 << 0)
+
+/*
+ * Passed in for io_uring_setup(2). Copied back with updated info on success
+ */
+struct io_uring_params {
+ __u32 sq_entries;
+ __u32 cq_entries;
+ __u32 flags;
+ __u16 sq_thread_cpu;
+ __u16 resv[9];
+ struct io_sqring_offsets sq_off;
+ struct io_cqring_offsets cq_off;
+};
+
+#endif
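
Editor's note: for reference, filling the submission descriptor defined above for a plain read might look like the sketch below. iocb_prep_read() is a hypothetical helper (not part of this commit), and the buffer, length, and offset parameters are placeholders; IORING_OP_READ reads len bytes from fd at off into addr.

#include <string.h>
#include "io_uring.h"

/* Prepare a read of 'len' bytes from 'fd' at 'offset' into 'buf'. */
static void iocb_prep_read(struct io_uring_iocb *iocb, int fd,
			   void *buf, __u32 len, __u64 offset)
{
	memset(iocb, 0, sizeof(*iocb));
	iocb->opcode = IORING_OP_READ;
	iocb->fd = fd;
	iocb->off = offset;
	iocb->addr = buf;
	iocb->len = len;
}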
diff --git a/src/liburing.h b/src/liburing.h
new file mode 100644
index 0000000..454591a
--- /dev/null
+++ b/src/liburing.h
@@ -0,0 +1,57 @@
+#ifndef LIB_URING_H
+#define LIB_URING_H
+
+#include <sys/uio.h>
+#include "io_uring.h"
+
+/*
+ * Library interface to io_uring
+ */
+struct io_uring_sq {
+ unsigned *khead;
+ unsigned *ktail;
+ unsigned *kring_mask;
+ unsigned *kring_entries;
+ unsigned *kflags;
+ unsigned *kdropped;
+ unsigned *array;
+ struct io_uring_iocb *iocbs;
+
+ unsigned iocb_head;
+ unsigned iocb_tail;
+
+ size_t ring_sz;
+};
+
+struct io_uring_cq {
+ unsigned *khead;
+ unsigned *ktail;
+ unsigned *kring_mask;
+ unsigned *kring_entries;
+ unsigned *koverflow;
+ struct io_uring_event *events;
+
+ size_t ring_sz;
+};
+
+/*
+ * System calls
+ */
+extern int io_uring_setup(unsigned entries, struct iovec *iovecs,
+ struct io_uring_params *p);
+extern int io_uring_enter(unsigned fd, unsigned to_submit,
+ unsigned min_complete, unsigned flags);
+
+/*
+ * Library interface
+ */
+extern int io_uring_queue_init(unsigned entries, struct io_uring_params *p,
+ struct iovec *iovecs, struct io_uring_sq *sq, struct io_uring_cq *cq);
+extern void io_uring_queue_exit(int fd, struct io_uring_sq *sq,
+ struct io_uring_cq *cq);
+extern int io_uring_get_completion(int fd, struct io_uring_cq *cq,
+ struct io_uring_event **ev_ptr);
+extern int io_uring_submit(int fd, struct io_uring_sq *sq);
+extern struct io_uring_iocb *io_uring_get_iocb(struct io_uring_sq *sq);
+
+#endif
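
Editor's note: taken together, the library interface above suggests a submit-and-wait loop along the lines of the sketch below. read_and_wait() is a hypothetical helper; it assumes a ring already set up with io_uring_queue_init(), and treating ev->res as the byte count or negative error of the read is an assumption about how the kernel fills the result field.

#include <string.h>
#include "liburing.h"

/* Queue one read, submit it, and wait for its completion. */
static int read_and_wait(int fd, struct io_uring_sq *sq, struct io_uring_cq *cq,
			 int data_fd, void *buf, __u32 len)
{
	struct io_uring_iocb *iocb;
	struct io_uring_event *ev;
	int ret;

	iocb = io_uring_get_iocb(sq);
	if (!iocb)
		return -1;		/* submission ring is full */

	memset(iocb, 0, sizeof(*iocb));
	iocb->opcode = IORING_OP_READ;
	iocb->fd = data_fd;
	iocb->addr = buf;
	iocb->len = len;
	iocb->off = 0;

	ret = io_uring_submit(fd, sq);
	if (ret < 0)
		return ret;

	ret = io_uring_get_completion(fd, cq, &ev);
	if (ret < 0)
		return ret;

	return ev->res;			/* bytes read, or a negative error */
}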
diff --git a/src/liburing.map b/src/liburing.map
new file mode 100644
index 0000000..ef48835
--- /dev/null
+++ b/src/liburing.map
@@ -0,0 +1,11 @@
+LIBURING_0.1 {
+ global:
+ io_uring_queue_init;
+ io_uring_queue_exit;
+ io_uring_get_completion;
+ io_uring_submit;
+ io_uring_get_iocb;
+ local:
+ *;
+
+};
diff --git a/src/syscall.c b/src/syscall.c
new file mode 100644
index 0000000..eafdd39
--- /dev/null
+++ b/src/syscall.c
@@ -0,0 +1,31 @@
+/*
+ * Will go away once libc support is there
+ */
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/uio.h>
+#include "io_uring.h"
+
+#if defined(__x86_64)
+#ifndef __NR_sys_io_uring_setup
+#define __NR_sys_io_uring_setup 335
+#endif
+#ifndef __NR_sys_io_uring_enter
+#define __NR_sys_io_uring_enter 336
+#endif
+#else
+#error "Arch not supported yet"
+#endif
+
+int io_uring_setup(unsigned int entries, struct iovec *iovecs,
+ struct io_uring_params *p)
+{
+ return syscall(__NR_sys_io_uring_setup, entries, iovecs, p);
+}
+
+int io_uring_enter(int fd, unsigned int to_submit, unsigned int min_complete,
+ unsigned int flags)
+{
+ return syscall(__NR_sys_io_uring_enter, fd, to_submit, min_complete,
+ flags);
+}
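
Editor's note: because these wrappers invoke the raw syscall numbers directly, a quick way to check whether the running kernel carries the io_uring patches is to call io_uring_setup() and inspect errno. The sketch below is illustrative only; passing NULL iovecs and reading ENOSYS as "kernel too old" are the usual conventions, not behavior this commit documents.

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include "liburing.h"

int main(void)
{
	struct io_uring_params p;
	int fd;

	memset(&p, 0, sizeof(p));
	fd = io_uring_setup(4, NULL, &p);
	if (fd < 0) {
		if (errno == ENOSYS)
			fprintf(stderr, "io_uring not supported by this kernel\n");
		else
			perror("io_uring_setup");
		return 1;
	}

	printf("sq entries %u, cq entries %u\n", p.sq_entries, p.cq_entries);
	close(fd);
	return 0;
}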