From 76b61ebf1bd17d3a31c3bf2d8236b9bd50d0f9a8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 17 Apr 2019 09:25:32 -0600 Subject: Add io_uring_cqe_seen() There's a failure case where an application gets a cqe entry, but the kernel can then overwrite it before the application is done reading it. This can happen since the io_uring_{get,wait}_completion() interface both returns a CQE pointer AND increments the ring index. If the kernel reuses this entry before the applications is done reading it, the contents may be corrupted. Remove the CQ head increment from the CQE retrieval, and put it into a separate helper, io_uring_cqe_seen(). The application must call this helper when it got a new CQE entry through one of the above calls, and it's now done reading it. Signed-off-by: Jens Axboe --- src/liburing.h | 20 ++++++++++++++++++++ src/queue.c | 9 --------- test/cq-full.c | 1 + test/fsync.c | 2 ++ test/io_uring-cp.c | 3 +++ test/io_uring-test.c | 6 +++++- test/io_uring_enter.c | 1 + test/io_uring_register.c | 6 ++++-- test/nop.c | 2 ++ test/poll-cancel.c | 2 ++ test/poll.c | 1 + 11 files changed, 41 insertions(+), 12 deletions(-) diff --git a/src/liburing.h b/src/liburing.h index a544fa5..158239a 100644 --- a/src/liburing.h +++ b/src/liburing.h @@ -6,6 +6,7 @@ #include #include "compat.h" #include "io_uring.h" +#include "barrier.h" /* * Library interface to io_uring @@ -67,6 +68,25 @@ extern int io_uring_wait_completion(struct io_uring *ring, extern int io_uring_submit(struct io_uring *ring); extern struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring); +/* + * Must be called after io_uring_{get,wait}_completion() after the cqe has + * been processed by the application. + */ +static inline void io_uring_cqe_seen(struct io_uring *ring, + struct io_uring_cqe *cqe) +{ + if (cqe) { + struct io_uring_cq *cq = &ring->cq; + + (*cq->khead)++; + /* + * Ensure that the kernel sees our new head, the kernel has + * the matching read barrier. + */ + write_barrier(); + } +} + /* * Command prep helpers */ diff --git a/src/queue.c b/src/queue.c index 6767790..72b3af2 100644 --- a/src/queue.c +++ b/src/queue.c @@ -41,15 +41,6 @@ static int __io_uring_get_completion(struct io_uring *ring, return -errno; } while (1); - if (*cqe_ptr) { - *cq->khead = head + 1; - /* - * Ensure that the kernel sees our new head, the kernel has - * the matching read barrier. - */ - write_barrier(); - } - return 0; } diff --git a/test/cq-full.c b/test/cq-full.c index 426eb4a..42021bd 100644 --- a/test/cq-full.c +++ b/test/cq-full.c @@ -67,6 +67,7 @@ int main(int argc, char *argv[]) printf("wait completion %d\n", ret); goto err; } + io_uring_cqe_seen(&ring, cqe); if (!cqe) break; i++; diff --git a/test/fsync.c b/test/fsync.c index deacc5d..f127b17 100644 --- a/test/fsync.c +++ b/test/fsync.c @@ -45,6 +45,7 @@ static int test_single_fsync(struct io_uring *ring) goto err; } + io_uring_cqe_seen(ring, cqe); unlink(buf); return 0; err: @@ -122,6 +123,7 @@ static int test_barrier_fsync(struct io_uring *ring) goto err; } } + io_uring_cqe_seen(ring, cqe); } unlink("testfile"); diff --git a/test/io_uring-cp.c b/test/io_uring-cp.c index 4df8139..f704ff6 100644 --- a/test/io_uring-cp.c +++ b/test/io_uring-cp.c @@ -177,6 +177,7 @@ static int copy_file(struct io_uring *ring, off_t insize) if (cqe->res < 0) { if (cqe->res == -EAGAIN) { queue_prepped(ring, data); + io_uring_cqe_seen(ring, cqe); continue; } fprintf(stderr, "cqe failed: %s\n", @@ -188,6 +189,7 @@ static int copy_file(struct io_uring *ring, off_t insize) data->iov.iov_len -= cqe->res; data->offset += cqe->res; queue_prepped(ring, data); + io_uring_cqe_seen(ring, cqe); continue; } @@ -204,6 +206,7 @@ static int copy_file(struct io_uring *ring, off_t insize) free(data); writes--; } + io_uring_cqe_seen(ring, cqe); } } diff --git a/test/io_uring-test.c b/test/io_uring-test.c index caca379..fe0098c 100644 --- a/test/io_uring-test.c +++ b/test/io_uring-test.c @@ -74,10 +74,14 @@ int main(int argc, char *argv[]) } done++; + ret = 0; if (cqe->res != 4096) { fprintf(stderr, "ret=%d, wanted 4096\n", cqe->res); - return 1; + ret = 1; } + io_uring_cqe_seen(&ring, cqe); + if (ret) + break; } printf("Submitted=%d, completed=%d\n", pending, done); diff --git a/test/io_uring_enter.c b/test/io_uring_enter.c index 4dc46c4..51ab8bb 100644 --- a/test/io_uring_enter.c +++ b/test/io_uring_enter.c @@ -158,6 +158,7 @@ reap_events(struct io_uring *ring, unsigned nr) free(iov->iov_base); free(iov); left--; + io_uring_cqe_seen(ring, cqe); gettimeofday(&now, NULL); timersub(&now, &start, &elapsed); diff --git a/test/io_uring_register.c b/test/io_uring_register.c index 30a225b..61ea64e 100644 --- a/test/io_uring_register.c +++ b/test/io_uring_register.c @@ -428,13 +428,15 @@ ioring_poll(struct io_uring *ring, int fd, int fixed) printf("io_uring_wait_completion failed with %d\n", ret); return 1; } + ret = 0; if (cqe->res != POLLOUT) { printf("io_uring_wait_completion: expected 0x%.8x, got 0x%.8x\n", POLLOUT, cqe->res); - return 1; + ret = 1; } - return 0; + io_uring_cqe_seen(ring, cqe); + return ret; } int diff --git a/test/nop.c b/test/nop.c index a37246b..7fe2455 100644 --- a/test/nop.c +++ b/test/nop.c @@ -37,6 +37,7 @@ static int test_single_nop(struct io_uring *ring) goto err; } + io_uring_cqe_seen(ring, cqe); return 0; err: return 1; @@ -75,6 +76,7 @@ static int test_barrier_nop(struct io_uring *ring) printf("wait completion %d\n", ret); goto err; } + io_uring_cqe_seen(ring, cqe); } return 0; diff --git a/test/poll-cancel.c b/test/poll-cancel.c index 4a3626d..c318a61 100644 --- a/test/poll-cancel.c +++ b/test/poll-cancel.c @@ -99,6 +99,7 @@ int main(int argc, char *argv[]) pd->is_cancel, (long) cqe->res); return 1; } + io_uring_cqe_seen(&ring, cqe); ret = io_uring_wait_completion(&ring, &cqe); if (ret < 0) { @@ -113,5 +114,6 @@ int main(int argc, char *argv[]) return 1; } + io_uring_cqe_seen(&ring, cqe); return 0; } diff --git a/test/poll.c b/test/poll.c index c0a66cc..02d7899 100644 --- a/test/poll.c +++ b/test/poll.c @@ -74,6 +74,7 @@ int main(int argc, char *argv[]) printf("child: wait completion %d\n", ret); break; } + io_uring_cqe_seen(&ring, cqe); } while (ret != 0); if (ret < 0) { -- cgit