summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--man/io_uring_setup.2300
1 files changed, 300 insertions, 0 deletions
diff --git a/man/io_uring_setup.2 b/man/io_uring_setup.2
new file mode 100644
index 0000000..bf249e7
--- /dev/null
+++ b/man/io_uring_setup.2
@@ -0,0 +1,300 @@
+.\" Copyright (C) 2019 Jens Axboe <axboe@kernel.dk>
+.\" Copyright (C) 2019 Jon Corbet <corbet@lwn.net>
+.\" Copyright (C) 2019 Red Hat, Inc.
+.\"
+.\" %%%LICENSE_START(LGPL_V2.1)
+.\" This file is distributed according to the GNU Lesser General Public License.
+.\" %%%LICENSE_END
+.\"
+.TH IO_URING_SETUP 2 2019-01-29 "Linux" "Linux Programmer's Manual"
+.SH NAME
+io_uring_setup \- setup a context for performing asynchronous I/O
+.SH SYNOPSIS
+.nf
+.BR "#include <linux/io_uring.h>"
+.PP
+.BI "int io_uring_setup(u32 " entries ", struct io_uring_params *" p);
+.fi
+.PP
+.SH DESCRIPTION
+.PP
+The io_uring_setup() system call sets up a submission queue (SQ) and
+completion queue (CQ) with at least
+.I entries
+entries, and returns a file descriptor which can be used to perform
+subsequent operations on the io_uring instance. The submission and
+completion queues are shared between userspace and the kernel, which
+eliminates the need to copy data when initiating and completing I/O.
+
+.I params
+is used by the application to pass options to the kernel, and by the
+kernel to convey information about the ring buffers.
+.PP
+.in +4n
+.EX
+struct io_uring_params {
+ __u32 sq_entries;
+ __u32 cq_entries;
+ __u32 flags;
+ __u32 sq_thread_cpu;
+ __u32 sq_thread_idle;
+ __u32 resv[5];
+ struct io_sqring_offsets sq_off;
+ struct io_cqring_offsets cq_off;
+};
+.EE
+.in
+.PP
+The
+.I flags, sq_thread_cpu
+and
+.I sq_thread_idle
+fields are used to configure the io_uring instance.
+.I flags
+is a bit mask of 0 or more of the following values ORed
+together:
+.TP
+.BR IORING_SETUP_IOPOLL
+Perform busy-waiting for an I/O completion, as opposed to getting
+notifications via an asynchronous IRQ (Interrupt Request). The file
+system (if any) and block device must support polling in order for
+this to work. Busy-waiting provides lower latency, but may consume
+more CPU resources than interrupt driven I/O. Currently, this feature
+is usable only on a file descriptor opened using the
+.B O_DIRECT
+flag. When a read or write is submitted to a polled context, the
+application must poll for completions on the CQ ring by calling
+.BR io_uring_enter(2).
+It is illegal to mix and match polled and non-polled I/O on an io_uring
+instance.
+
+.TP
+.BR IORING_SETUP_SQPOLL
+When this flag is specified, a kernel thread is created to perform
+submission queue polling. An io_uring instance configured in this way
+enables an application to issue I/O without ever context switching
+into the kernel. By using the submission queue to fill in new
+submission queue entries and watching for completions on the
+completion queue, the application can submit and reap I/Os without
+doing a single system call.
+
+If the kernel thread is idle for more than
+.I sq_thread_idle
+microseconds, it will set the
+.B IORING_SQ_NEED_WAKEUP
+bit in the
+.I flags
+field of the
+.I struct io_sq_ring.
+When this happens, the application must call
+.BR io_uring_enter(2)
+to wake the kernel thread. If I/O is kept busy, the kernel thread
+will never sleep. An application making use of this feature will need
+to guard the
+.BR io_uring_enter(2)
+call with the following code sequence:
+
+.in +4n
+.EX
+read_barrier();
+if (*sq_ring->flags & IORING_SQ_NEED_WAKEUP)
+ io_uring_enter(fd, 0, 0, IORING_ENTER_SQ_WAKEUP);
+.EE
+.in
+
+where
+.I sq_ring
+is a submission queue ring setup using the
+.I struct io_sqring_offsets
+described below.
+.TP
+.BR IORING_SETUP_SQ_AFF
+If this flag is specified, then the poll thread will be bound to the
+cpu set in the
+.I sq_thread_cpu
+field of the
+.I struct io_uring_params.
+This flag is only meaningful when
+.B IORING_SETUP_SQPOLL
+is specified.
+.PP
+If no flags are specified, the io_uring instance is setup for
+interrupt driven I/O. I/O may be submitted using
+.BR io_uring_enter(2)
+and can be reaped by polling the completion queue.
+
+The
+.I resv
+array must be initialized to zero.
+
+The rest of the fields in the
+.I struct io_uring_params
+are filled in by the kernel, and provide the information necessary to
+memory map the submission queue, completion queue, and the array of
+submission queue entries.
+.I sq_entries
+specifies the number of submission queue entries allocated.
+.I sq_off
+describes the offsets of various ring buffer fields:
+.PP
+.in +4n
+.EX
+struct io_sqring_offsets {
+ __u32 head;
+ __u32 tail;
+ __u32 ring_mask;
+ __u32 ring_entries;
+ __u32 flags;
+ __u32 dropped;
+ __u32 array;
+ __u32 resv[3];
+};
+.EE
+.in
+.PP
+Taken together,
+.I sq_entries
+and
+.I sq_off
+provide all of the information necessary for accessing the submission
+queue ring buffer and the submission queue entry array. The
+submission queue can be mapped with a call like:
+.PP
+.in +4n
+.EX
+ptr = mmap(0, sq_off.array + sq_entries * sizeof(__u32),
+ PROT_READ|PROT_WRITE|MAP_SHARED|MAP_POPULATE,
+ ring_fd, IORING_OFF_SQ_RING);
+.EE
+.in
+.PP
+Where sq_off is the
+.I io_sqring_offsets
+structure, and
+.I ring_fd
+is the file descriptor returned from
+.BR io_uring_setup(2).
+The addition of
+.I sq_off.array
+to the length of the region accounts for the fact that the ring
+located at the end of the data structure. As an example, the ring
+buffer head pointer can be accessed by adding
+.I sq_off.head
+to the address returned from
+.BR mmap(2):
+.PP
+.in +4n
+.EX
+head = ptr + sq_off.head;
+.EE
+.in
+
+The
+.I flags
+field is used by the kernel to communicate state information to the
+application. Currently, it is used to inform the application when a
+call to
+.BR io_uring_enter(2)
+is necessary. See the documentation for the
+.B IORING_SETUP_SQPOLL
+flag above.
+The
+.I dropped
+member is incremented for each invalid submission queue entry
+encountered in the ring buffer.
+
+The head and tail track the ring buffer state. The tail is
+incremented by the application when submitting new I/O, and the head
+is incremented by the kernel when the I/O has been successfully
+submitted. Determining the index of the head or tail into the ring is
+accomplished by applying a mask:
+.PP
+.in +4n
+.EX
+index = tail & ring_mask;
+.EE
+.in
+.PP
+The array of submission queue entries is mapped with:
+.PP
+.in +4n
+.EX
+sqentries = mmap(0, sq_entries * sizeof(struct io_uring_sqe),
+ PROT_READ|PROT_WRITE|MAP_SHARED|MAP_POPULATE,
+ ring_fd, IORING_OFF_SQES);
+.EE
+.in
+.PP
+The completion queue is described by
+.I cq_entries
+and
+.I cq_off,
+shown here:
+.PP
+.in +4n
+.EX
+struct io_cqring_offsets {
+ __u32 head;
+ __u32 tail;
+ __u32 ring_mask;
+ __u32 ring_entries;
+ __u32 overflow;
+ __u32 cqes;
+ __u32 resv[4];
+};
+.EE
+.in
+.PP
+The completion queue is simpler, since the entries are not separated
+from the queue itself, and can be mapped with:
+.PP
+.in +4n
+.EX
+ptr = mmap(0, cq_off.cqes + cq_entries * sizeof(struct io_uring_cqe),
+ PROT_READ|PROT_WRITE|MAP_SHARED|MAP_POPULATE, ring_fd,
+ IORING_OFF_CQ_RING);
+.EE
+.in
+.PP
+Closing the fd returned by io_uring_setup(2) will free all resources
+associated with the io_uring context.
+.PP
+.SH RETURN VALUE
+io_uring_setup() returns a new file descriptor on success. The
+application may then provide the file descriptor in a subsequent mmap
+call to map the submission and completion queues, or to the
+io_uring_register or io_uring_enter system calls.
+
+On error, -1 is returned and errno is set appropriately.
+.PP
+.SH ERRORS
+.TP
+.BR EFAULT
+params is outside your accessible address space.
+.TP
+.BR EINVAL
+The resv array contains non-zero data, p.flags contains an unsupported
+flag,
+.I entries
+is out of bounds, or
+.B IORING_SETUP_SQ_AFF
+was specified, but
+.B IORING_SETUP_SQPOLL
+was not.
+.TP
+.BR EMFILE
+The per-process limit on the number of open file descriptors has been
+reached (see the description of
+.B RLIMIT_NOFILE
+in
+.BR getrlimit(2)).
+.TP
+.BR ENFILE
+The system-wide limit on the total number of open files has been
+reached.
+.TP
+.BR ENOMEM
+Insufficient kernel resources are available.
+.SH SEE ALSO
+.BR io_uring_register(2),
+.BR io_uring_enter(2)