mirror of
https://github.com/torvalds/linux.git
synced 2026-05-30 00:29:35 +08:00
Merge tag 'io_uring-7.1-20260430' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux
Pull io_uring fixes from Jens Axboe:

 - Remove dead struct io_buffer_list member

 - Fix for incrementally consumed buffers with recvmsg multishot, which
   requires a minimum value left in a buffer for any receive for the
   headers. If there's still a bit of buffer left but it's smaller than
   that value, then userspace will see a spurious -EFAULT returned in
   the CQE

 - Locking fix for the DEFER_TASKRUN retry list, which otherwise could
   race with fallback cancelations. If the task is exiting with
   task_work left in both the normal and retry list AND the exit cleanup
   races with the task running task work, then entries could either be
   doubly completed or lost

 - Cap NAPI busy poll timeout to something sane, to avoid syzbot running
   into excessive polling and triggering warnings around that

* tag 'io_uring-7.1-20260430' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux:
  io_uring/tw: serialize ctx->retry_llist with ->uring_lock
  io_uring/napi: cap busy_poll_to 10 msec
  io_uring/kbuf: support min length left for incremental buffers
  io_uring/kbuf: kill dead struct io_buffer_list 'nr_entries' member
This commit is contained in:
@@ -905,7 +905,8 @@ struct io_uring_buf_reg {
|
||||
__u32 ring_entries;
|
||||
__u16 bgid;
|
||||
__u16 flags;
|
||||
__u64 resv[3];
|
||||
__u32 min_left;
|
||||
__u32 resv[5];
|
||||
};
|
||||
|
||||
/* argument for IORING_REGISTER_PBUF_STATUS */
|
||||
|
||||
@@ -47,7 +47,7 @@ static bool io_kbuf_inc_commit(struct io_buffer_list *bl, int len)
|
||||
this_len = min_t(u32, len, buf_len);
|
||||
buf_len -= this_len;
|
||||
/* Stop looping for invalid buffer length of 0 */
|
||||
if (buf_len || !this_len) {
|
||||
if (buf_len > bl->min_left_sub_one || !this_len) {
|
||||
WRITE_ONCE(buf->addr, READ_ONCE(buf->addr) + this_len);
|
||||
WRITE_ONCE(buf->len, buf_len);
|
||||
return false;
|
||||
@@ -637,6 +637,10 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
|
||||
if (reg.ring_entries >= 65536)
|
||||
return -EINVAL;
|
||||
|
||||
/* minimum left byte count is a property of incremental buffers */
|
||||
if (!(reg.flags & IOU_PBUF_RING_INC) && reg.min_left)
|
||||
return -EINVAL;
|
||||
|
||||
bl = io_buffer_get_list(ctx, reg.bgid);
|
||||
if (bl) {
|
||||
/* if mapped buffer ring OR classic exists, don't allow */
|
||||
@@ -680,10 +684,11 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
|
||||
}
|
||||
#endif
|
||||
|
||||
bl->nr_entries = reg.ring_entries;
|
||||
bl->mask = reg.ring_entries - 1;
|
||||
bl->flags |= IOBL_BUF_RING;
|
||||
bl->buf_ring = br;
|
||||
if (reg.min_left)
|
||||
bl->min_left_sub_one = reg.min_left - 1;
|
||||
if (reg.flags & IOU_PBUF_RING_INC)
|
||||
bl->flags |= IOBL_INC;
|
||||
ret = io_buffer_add_list(ctx, bl, reg.bgid);
|
||||
|
||||
@@ -27,12 +27,18 @@ struct io_buffer_list {
|
||||
__u16 bgid;
|
||||
|
||||
/* below is for ring provided buffers */
|
||||
__u16 nr_entries;
|
||||
__u16 head;
|
||||
__u16 mask;
|
||||
|
||||
__u16 flags;
|
||||
|
||||
/*
|
||||
* minimum required amount to be left to reuse an incrementally
|
||||
* consumed buffer. If less than this is left at consumption time,
|
||||
* buffer is done and head is incremented to the next buffer.
|
||||
*/
|
||||
__u32 min_left_sub_one;
|
||||
|
||||
struct io_mapped_region region;
|
||||
};
|
||||
|
||||
|
||||
@@ -276,6 +276,8 @@ static int io_napi_register_napi(struct io_ring_ctx *ctx,
|
||||
/* clean the napi list for new settings */
|
||||
io_napi_free(ctx);
|
||||
WRITE_ONCE(ctx->napi_track_mode, napi->op_param);
|
||||
/* cap NAPI at 10 msec of spin time */
|
||||
napi->busy_poll_to = min(10000, napi->busy_poll_to);
|
||||
WRITE_ONCE(ctx->napi_busy_poll_dt, napi->busy_poll_to * NSEC_PER_USEC);
|
||||
WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi->prefer_busy_poll);
|
||||
return 0;
|
||||
|
||||
@@ -273,8 +273,18 @@ void io_req_task_work_add_remote(struct io_kiocb *req, unsigned flags)
|
||||
|
||||
void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx)
|
||||
{
|
||||
struct llist_node *node = llist_del_all(&ctx->work_llist);
|
||||
struct llist_node *node;
|
||||
|
||||
/*
|
||||
* Running the work items may utilize ->retry_llist as a means
|
||||
* for capping the number of task_work entries run at the same
|
||||
* time. But that list can potentially race with moving the work
|
||||
* from here, if the task is exiting. As any normal task_work
|
||||
* running holds ->uring_lock already, just guard this slow path
|
||||
* with ->uring_lock to avoid racing on ->retry_llist.
|
||||
*/
|
||||
guard(mutex)(&ctx->uring_lock);
|
||||
node = llist_del_all(&ctx->work_llist);
|
||||
__io_fallback_tw(node, false);
|
||||
node = llist_del_all(&ctx->retry_llist);
|
||||
__io_fallback_tw(node, false);
|
||||
|
||||
Reference in New Issue
Block a user