Merge tag 'io_uring-7.1-20260430' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux

Pull io_uring fixes from Jens Axboe:

 - Remove dead struct io_buffer_list member

 - Fix for incrementally consumed buffers with recvmsg multishot, which
   requires a minimum value left in a buffer for any receive for the
   headers. If there's still a bit of buffer left but it's smaller than
   that value, then userspace will see a spurious -EFAULT returned in
   the CQE

 - Locking fix for the DEFER_TASKRUN retry list, which otherwise could
   race with fallback cancelations. If the task is exiting with
   task_work left in both the normal and retry list AND the exit cleanup
   races with the task running task work, then entries could either be
   doubly completed or lost

 - Cap NAPI busy poll timeout to something sane, to avoid syzbot running
   into excessive polling and triggering warnings around that

* tag 'io_uring-7.1-20260430' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux:
  io_uring/tw: serialize ctx->retry_llist with ->uring_lock
  io_uring/napi: cap busy_poll_to 10 msec
  io_uring/kbuf: support min length left for incremental buffers
  io_uring/kbuf: kill dead struct io_buffer_list 'nr_entries' member
This commit is contained in:
Linus Torvalds
2026-05-01 11:01:31 -07:00
5 changed files with 29 additions and 5 deletions

View File

@@ -905,7 +905,8 @@ struct io_uring_buf_reg {
__u32 ring_entries;
__u16 bgid;
__u16 flags;
__u64 resv[3];
__u32 min_left;
__u32 resv[5];
};
/* argument for IORING_REGISTER_PBUF_STATUS */

View File

@@ -47,7 +47,7 @@ static bool io_kbuf_inc_commit(struct io_buffer_list *bl, int len)
this_len = min_t(u32, len, buf_len);
buf_len -= this_len;
/* Stop looping for invalid buffer length of 0 */
if (buf_len || !this_len) {
if (buf_len > bl->min_left_sub_one || !this_len) {
WRITE_ONCE(buf->addr, READ_ONCE(buf->addr) + this_len);
WRITE_ONCE(buf->len, buf_len);
return false;
@@ -637,6 +637,10 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
if (reg.ring_entries >= 65536)
return -EINVAL;
/* minimum left byte count is a property of incremental buffers */
if (!(reg.flags & IOU_PBUF_RING_INC) && reg.min_left)
return -EINVAL;
bl = io_buffer_get_list(ctx, reg.bgid);
if (bl) {
/* if mapped buffer ring OR classic exists, don't allow */
@@ -680,10 +684,11 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
}
#endif
bl->nr_entries = reg.ring_entries;
bl->mask = reg.ring_entries - 1;
bl->flags |= IOBL_BUF_RING;
bl->buf_ring = br;
if (reg.min_left)
bl->min_left_sub_one = reg.min_left - 1;
if (reg.flags & IOU_PBUF_RING_INC)
bl->flags |= IOBL_INC;
ret = io_buffer_add_list(ctx, bl, reg.bgid);

View File

@@ -27,12 +27,18 @@ struct io_buffer_list {
__u16 bgid;
/* below is for ring provided buffers */
__u16 nr_entries;
__u16 head;
__u16 mask;
__u16 flags;
/*
* minimum required amount to be left to reuse an incrementally
* consumed buffer. If less than this is left at consumption time,
* buffer is done and head is incremented to the next buffer.
*/
__u32 min_left_sub_one;
struct io_mapped_region region;
};

View File

@@ -276,6 +276,8 @@ static int io_napi_register_napi(struct io_ring_ctx *ctx,
/* clean the napi list for new settings */
io_napi_free(ctx);
WRITE_ONCE(ctx->napi_track_mode, napi->op_param);
/* cap NAPI at 10 msec of spin time */
napi->busy_poll_to = min(10000, napi->busy_poll_to);
WRITE_ONCE(ctx->napi_busy_poll_dt, napi->busy_poll_to * NSEC_PER_USEC);
WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi->prefer_busy_poll);
return 0;

View File

@@ -273,8 +273,18 @@ void io_req_task_work_add_remote(struct io_kiocb *req, unsigned flags)
void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx)
{
struct llist_node *node = llist_del_all(&ctx->work_llist);
struct llist_node *node;
/*
* Running the work items may utilize ->retry_llist as a means
* for capping the number of task_work entries run at the same
* time. But that list can potentially race with moving the work
* from here, if the task is exiting. As any normal task_work
* running holds ->uring_lock already, just guard this slow path
* with ->uring_lock to avoid racing on ->retry_llist.
*/
guard(mutex)(&ctx->uring_lock);
node = llist_del_all(&ctx->work_llist);
__io_fallback_tw(node, false);
node = llist_del_all(&ctx->retry_llist);
__io_fallback_tw(node, false);