diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 17ac1b785440..909fb7aea638 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -905,7 +905,8 @@ struct io_uring_buf_reg { __u32 ring_entries; __u16 bgid; __u16 flags; - __u64 resv[3]; + __u32 min_left; + __u32 resv[5]; }; /* argument for IORING_REGISTER_PBUF_STATUS */ diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 8da2ff798170..63061aa1cab9 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -47,7 +47,7 @@ static bool io_kbuf_inc_commit(struct io_buffer_list *bl, int len) this_len = min_t(u32, len, buf_len); buf_len -= this_len; /* Stop looping for invalid buffer length of 0 */ - if (buf_len || !this_len) { + if (buf_len > bl->min_left_sub_one || !this_len) { WRITE_ONCE(buf->addr, READ_ONCE(buf->addr) + this_len); WRITE_ONCE(buf->len, buf_len); return false; @@ -637,6 +637,10 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (reg.ring_entries >= 65536) return -EINVAL; + /* minimum left byte count is a property of incremental buffers */ + if (!(reg.flags & IOU_PBUF_RING_INC) && reg.min_left) + return -EINVAL; + bl = io_buffer_get_list(ctx, reg.bgid); if (bl) { /* if mapped buffer ring OR classic exists, don't allow */ @@ -680,10 +684,11 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) } #endif - bl->nr_entries = reg.ring_entries; bl->mask = reg.ring_entries - 1; bl->flags |= IOBL_BUF_RING; bl->buf_ring = br; + if (reg.min_left) + bl->min_left_sub_one = reg.min_left - 1; if (reg.flags & IOU_PBUF_RING_INC) bl->flags |= IOBL_INC; ret = io_buffer_add_list(ctx, bl, reg.bgid); diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index bf15e26520d3..401773e1ef80 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -27,12 +27,18 @@ struct io_buffer_list { __u16 bgid; /* below is for ring provided buffers */ - __u16 nr_entries; __u16 head; __u16 mask; __u16 flags; + /* + * minimum required amount to be left to reuse an incrementally + * consumed buffer. If less than this is left at consumption time, + * buffer is done and head is incremented to the next buffer. + */ + __u32 min_left_sub_one; + struct io_mapped_region region; }; diff --git a/io_uring/napi.c b/io_uring/napi.c index 4a10de03e426..8d68366a4b90 100644 --- a/io_uring/napi.c +++ b/io_uring/napi.c @@ -276,6 +276,8 @@ static int io_napi_register_napi(struct io_ring_ctx *ctx, /* clean the napi list for new settings */ io_napi_free(ctx); WRITE_ONCE(ctx->napi_track_mode, napi->op_param); + /* cap NAPI at 10 msec of spin time */ + napi->busy_poll_to = min(10000, napi->busy_poll_to); WRITE_ONCE(ctx->napi_busy_poll_dt, napi->busy_poll_to * NSEC_PER_USEC); WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi->prefer_busy_poll); return 0; diff --git a/io_uring/tw.c b/io_uring/tw.c index fdff81eebc95..023d5e6bc491 100644 --- a/io_uring/tw.c +++ b/io_uring/tw.c @@ -273,8 +273,18 @@ void io_req_task_work_add_remote(struct io_kiocb *req, unsigned flags) void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) { - struct llist_node *node = llist_del_all(&ctx->work_llist); + struct llist_node *node; + /* + * Running the work items may utilize ->retry_llist as a means + * for capping the number of task_work entries run at the same + * time. But that list can potentially race with moving the work + * from here, if the task is exiting. As any normal task_work + * running holds ->uring_lock already, just guard this slow path + * with ->uring_lock to avoid racing on ->retry_llist. + */ + guard(mutex)(&ctx->uring_lock); + node = llist_del_all(&ctx->work_llist); __io_fallback_tw(node, false); node = llist_del_all(&ctx->retry_llist); __io_fallback_tw(node, false);