5 files changed, 107 insertions, 39 deletions
diff --git a/man/io_uring_setup.2 b/man/io_uring_setup.2
index ebaee2d..9ab0143 100644
--- a/man/io_uring_setup.2
+++ b/man/io_uring_setup.2
@@ -97,7 +97,11 @@ call with the following code sequence:
 
 .in +4n
 .EX
-read_barrier();
+/*
+ * Ensure that the wakeup flag is read after the tail pointer has been
+ * written.
+ */
+smp_mb();
 if (*sq_ring->flags & IORING_SQ_NEED_WAKEUP)
     io_uring_enter(fd, 0, 0, IORING_ENTER_SQ_WAKEUP);
 .EE
diff --git a/src/barrier.h b/src/barrier.h
index ef00f67..e1a407f 100644
--- a/src/barrier.h
+++ b/src/barrier.h
@@ -1,16 +1,95 @@
 #ifndef LIBURING_BARRIER_H
 #define LIBURING_BARRIER_H
 
+/*
+From the kernel documentation file refcount-vs-atomic.rst:
+
+A RELEASE memory ordering guarantees that all prior loads and
+stores (all po-earlier instructions) on the same CPU are completed
+before the operation. It also guarantees that all po-earlier
+stores on the same CPU and all propagated stores from other CPUs
+must propagate to all other CPUs before the release operation
+(A-cumulative property). This is implemented using
+:c:func:`smp_store_release`.
+
+An ACQUIRE memory ordering guarantees that all post loads and
+stores (all po-later instructions) on the same CPU are
+completed after the acquire operation. It also guarantees that all
+po-later stores on the same CPU must propagate to all other CPUs
+after the acquire operation executes. This is implemented using
+:c:func:`smp_acquire__after_ctrl_dep`.
+*/
+
+/* From tools/include/linux/compiler.h */
+/* Optimization barrier */
+/* The "volatile" is due to gcc bugs */
+#define barrier() __asm__ __volatile__("": : :"memory")
+
+/* From tools/virtio/linux/compiler.h */
+#define WRITE_ONCE(var, val) \
+	(*((volatile typeof(val) *)(&(var))) = (val))
+#define READ_ONCE(var) (*((volatile typeof(var) *)(&(var))))
+
+
 #if defined(__x86_64) || defined(__i386__)
-#define read_barrier()	__asm__ __volatile__("":::"memory")
-#define write_barrier()	__asm__ __volatile__("":::"memory")
+/* From tools/arch/x86/include/asm/barrier.h */
+#if defined(__i386__)
+/*
+ * Some non-Intel clones support out of order store. wmb() ceases to be a
+ * nop for these.
+ */
+#define mb()	asm volatile("lock; addl $0,0(%%esp)" ::: "memory")
+#define rmb()	asm volatile("lock; addl $0,0(%%esp)" ::: "memory")
+#define wmb()	asm volatile("lock; addl $0,0(%%esp)" ::: "memory")
+#elif defined(__x86_64__)
+#define mb()	asm volatile("mfence" ::: "memory")
+#define rmb()	asm volatile("lfence" ::: "memory")
+#define wmb()	asm volatile("sfence" ::: "memory")
+#define smp_rmb() barrier()
+#define smp_wmb() barrier()
+#define smp_mb()  asm volatile("lock; addl $0,-132(%%rsp)" ::: "memory", "cc")
+#endif
+
+#if defined(__x86_64__)
+#define smp_store_release(p, v)			\
+do {						\
+	barrier();				\
+	WRITE_ONCE(*(p), (v));			\
+} while (0)
+
+#define smp_load_acquire(p)			\
+({						\
+	typeof(*p) ___p1 = READ_ONCE(*(p));	\
+	barrier();				\
+	___p1;					\
+})
+#endif /* defined(__x86_64__) */
 #else
 /*
  * Add arch appropriate definitions. Be safe and use full barriers for
  * archs we don't have support for.
  */
-#define read_barrier()	__sync_synchronize()
-#define write_barrier()	__sync_synchronize()
+#define smp_rmb()	__sync_synchronize()
+#define smp_wmb()	__sync_synchronize()
+#endif
+
+/* From tools/include/asm/barrier.h */
+
+#ifndef smp_store_release
+# define smp_store_release(p, v)		\
+do {						\
+	smp_mb();				\
+	WRITE_ONCE(*p, v);			\
+} while (0)
+#endif
+
+#ifndef smp_load_acquire
+# define smp_load_acquire(p)			\
+({						\
+	typeof(*p) ___p1 = READ_ONCE(*p);	\
+	smp_mb();				\
+	___p1;					\
+})
 #endif
 
 #endif
diff --git a/src/liburing.h b/src/liburing.h
index d3fcd15..a350a01 100644
--- a/src/liburing.h
+++ b/src/liburing.h
@@ -88,11 +88,10 @@ extern int io_uring_register_eventfd(struct io_uring *ring, int fd);
 extern int io_uring_unregister_eventfd(struct io_uring *ring);
 
 #define io_uring_for_each_cqe(ring, head, cqe)					\
+	/* smp_load_acquire() enforces the order of tail and CQE reads. */	\
 	for (head = *(ring)->cq.khead;						\
-	     /* See read_barrier() explanation in __io_uring_get_cqe() */	\
-	     ({read_barrier();							\
-	       cqe = (head != *(ring)->cq.ktail ?				\
-		&(ring)->cq.cqes[head & (*(ring)->cq.kring_mask)] : NULL);});	\
+	     (cqe = (head != smp_load_acquire((ring)->cq.ktail) ?		\
+		&(ring)->cq.cqes[head & (*(ring)->cq.kring_mask)] : NULL));	\
 	     head++)								\
 
 
@@ -105,13 +104,11 @@ static inline void io_uring_cq_advance(struct io_uring *ring,
 	if (nr) {
 		struct io_uring_cq *cq = &ring->cq;
 
-		(*cq->khead) += nr;
-
 		/*
-		 * Ensure that the kernel sees our new head, the kernel has
-		 * the matching read barrier.
+		 * Ensure that the kernel only sees the new value of the head
+		 * index after the CQEs have been read.
 		 */
-		write_barrier();
+		smp_store_release(cq->khead, *cq->khead + nr);
 	}
 }
 
diff --git a/src/queue.c b/src/queue.c
index bec363f..72b2293 100644
--- a/src/queue.c
+++ b/src/queue.c
@@ -77,7 +77,7 @@ static int __io_uring_submit(struct io_uring *ring, unsigned wait_nr)
 {
 	struct io_uring_sq *sq = &ring->sq;
 	const unsigned mask = *sq->kring_mask;
-	unsigned ktail, ktail_next, submitted, to_submit;
+	unsigned ktail, submitted, to_submit;
 	unsigned flags;
 	int ret;
 
@@ -88,15 +88,11 @@ static int __io_uring_submit(struct io_uring *ring, unsigned wait_nr)
 	 * Fill in sqes that we have queued up, adding them to the kernel ring
 	 */
 	submitted = 0;
-	ktail = ktail_next = *sq->ktail;
+	ktail = *sq->ktail;
 	to_submit = sq->sqe_tail - sq->sqe_head;
 	while (to_submit--) {
-		ktail_next++;
-		read_barrier();
-
 		sq->array[ktail & mask] = sq->sqe_head & mask;
-		ktail = ktail_next;
-
+		ktail++;
 		sq->sqe_head++;
 		submitted++;
 	}
@@ -104,21 +100,11 @@ static int __io_uring_submit(struct io_uring *ring, unsigned wait_nr)
 	if (!submitted)
 		return 0;
 
-	if (*sq->ktail != ktail) {
-		/*
-		 * First write barrier ensures that the SQE stores are updated
-		 * with the tail update. This is needed so that the kernel
-		 * will never see a tail update without the preceeding sQE
-		 * stores being done.
-		 */
-		write_barrier();
-		*sq->ktail = ktail;
-		/*
-		 * The kernel has the matching read barrier for reading the
-		 * SQ tail.
-		 */
-		write_barrier();
-	}
+	/*
+	 * Ensure that the kernel sees the SQE updates before it sees the tail
+	 * update.
+	 */
+	smp_store_release(sq->ktail, ktail);
 
 	flags = 0;
 	if (wait_nr || sq_ring_needs_enter(ring, &flags)) {
diff --git a/test/io_uring_enter.c b/test/io_uring_enter.c
index d6e407e..b25afd5 100644
--- a/test/io_uring_enter.c
+++ b/test/io_uring_enter.c
@@ -262,9 +262,11 @@ main(int argc, char **argv)
 	ktail = *sq->ktail;
 	sq->array[ktail & mask] = index;
 	++ktail;
-	write_barrier();
-	*sq->ktail = ktail;
-	write_barrier();
+	/*
+	 * Ensure that the kernel sees the SQE update before it sees the tail
+	 * update.
+	 */
+	smp_store_release(sq->ktail, ktail);
 
 	ret = io_uring_enter(ring.ring_fd, 1, 0, 0, NULL);
 	/* now check to see if our sqe was dropped */