#include <linux/namei.h>
#include <linux/fsnotify.h>
#include <linux/fadvise.h>
+#include <linux/eventpoll.h>
#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>
struct socket *ring_sock;
#endif
+ struct idr personality_idr;
+
struct {
unsigned cached_cq_tail;
unsigned cq_entries;
u32 advice;
};
+struct io_epoll {	/* IORING_OP_EPOLL_CTL arguments, filled in by io_epoll_ctl_prep() */
+ struct file *file;	/* not written by the epoll prep/issue code; NOTE(review): presumably kept first to match the other per-opcode request structs - confirm */
+ int epfd;	/* read from sqe->fd; first argument to do_epoll_ctl() */
+ int op;	/* read from sqe->len; the operation passed to do_epoll_ctl() */
+ int fd;	/* read from sqe->off; third argument to do_epoll_ctl() */
+ struct epoll_event event;	/* copied from user memory at sqe->addr, only when ep_op_has_event(op) */
+};
+
struct io_async_connect {
struct sockaddr_storage address;
};
struct io_files_update files_update;
struct io_fadvise fadvise;
struct io_madvise madvise;
+ struct io_epoll epoll;
};
struct io_async_ctx *io;
unsigned unbound_nonreg_file : 1;
/* opcode is not supported by this kernel */
unsigned not_supported : 1;
+ /* needs file table */
+ unsigned file_table : 1;
};
static const struct io_op_def io_op_defs[] = {
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
+ .file_table = 1,
},
[IORING_OP_ASYNC_CANCEL] = {},
[IORING_OP_LINK_TIMEOUT] = {
[IORING_OP_OPENAT] = {
.needs_file = 1,
.fd_non_neg = 1,
+ .file_table = 1,
},
[IORING_OP_CLOSE] = {
.needs_file = 1,
+ .file_table = 1,
},
[IORING_OP_FILES_UPDATE] = {
.needs_mm = 1,
+ .file_table = 1,
},
[IORING_OP_STATX] = {
.needs_mm = 1,
[IORING_OP_OPENAT2] = {
.needs_file = 1,
.fd_non_neg = 1,
+ .file_table = 1,
+ },
+ [IORING_OP_EPOLL_CTL] = {
+ .unbound_nonreg_file = 1,
+ .file_table = 1,
},
};
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
struct io_uring_files_update *ip,
unsigned nr_args);
+static int io_grab_files(struct io_kiocb *req);
static struct kmem_cache *req_cachep;
INIT_LIST_HEAD(&ctx->cq_overflow_list);
init_completion(&ctx->completions[0]);
init_completion(&ctx->completions[1]);
+ idr_init(&ctx->personality_idr);
mutex_init(&ctx->uring_lock);
init_waitqueue_head(&ctx->wait);
spin_lock_init(&ctx->completion_lock);
}
}
+/*
+ * Capture the pieces of the submitting task's environment on @req->work:
+ * a reference on current->mm (only when @def says the opcode needs an mm)
+ * and a reference on the current credentials. Fields that are already set
+ * are left alone, so calling this more than once for the same request is
+ * harmless. The references are released by io_req_work_drop_env().
+ */
+static inline void io_req_work_grab_env(struct io_kiocb *req,
+					const struct io_op_def *def)
+{
+	const bool want_mm = def->needs_mm && !req->work.mm;
+
+	if (want_mm) {
+		struct mm_struct *mm = current->mm;
+
+		mmgrab(mm);
+		req->work.mm = mm;
+	}
+
+	if (req->work.creds)
+		return;
+
+	req->work.creds = get_current_cred();
+}
+
+/*
+ * Release whatever io_req_work_grab_env() stored on @req->work: drop the
+ * mm reference and the credential reference if present, and clear both
+ * pointers so a repeated call does nothing.
+ */
+static inline void io_req_work_drop_env(struct io_kiocb *req)
+{
+	struct mm_struct *mm = req->work.mm;
+	const struct cred *creds;
+
+	if (mm) {
+		mmdrop(mm);
+		req->work.mm = NULL;
+	}
+
+	creds = req->work.creds;
+	if (!creds)
+		return;
+
+	put_cred(creds);
+	req->work.creds = NULL;
+}
+
static inline bool io_prep_async_work(struct io_kiocb *req,
struct io_kiocb **link)
{
if (def->unbound_nonreg_file)
req->work.flags |= IO_WQ_WORK_UNBOUND;
}
- if (def->needs_mm)
- req->work.flags |= IO_WQ_WORK_NEEDS_USER;
+
+ io_req_work_grab_env(req, def);
*link = io_prep_linked_timeout(req);
return do_hashed;
else
fput(req->file);
}
+
+ io_req_work_drop_env(req);
}
static void __io_free_req(struct io_kiocb *req)
struct file *file;
int ret;
- if (force_nonblock) {
- req->work.flags |= IO_WQ_WORK_NEEDS_FILES;
+ if (force_nonblock)
return -EAGAIN;
- }
ret = build_open_flags(&req->open.how, &op);
if (ret)
return io_openat2(req, nxt, force_nonblock);
}
+/*
+ * Decode an IORING_OP_EPOLL_CTL submission into req->epoll.
+ *
+ * sqe->fd, sqe->len and sqe->off supply epfd, op and fd respectively.
+ * When ep_op_has_event() says the operation carries an event, it is
+ * copied in from the user pointer in sqe->addr.
+ *
+ * Returns 0 on success, -EINVAL if ioprio or buf_index is set, -EFAULT
+ * if the event cannot be copied from user space, and -EOPNOTSUPP when
+ * the kernel is built without CONFIG_EPOLL.
+ */
+static int io_epoll_ctl_prep(struct io_kiocb *req,
+			     const struct io_uring_sqe *sqe)
+{
+#if defined(CONFIG_EPOLL)
+	struct io_epoll *ie = &req->epoll;
+	struct epoll_event __user *uevent;
+
+	if (sqe->ioprio || sqe->buf_index)
+		return -EINVAL;
+
+	ie->epfd = READ_ONCE(sqe->fd);
+	ie->op = READ_ONCE(sqe->len);
+	ie->fd = READ_ONCE(sqe->off);
+
+	/* operations without an event payload have nothing more to fetch */
+	if (!ep_op_has_event(ie->op))
+		return 0;
+
+	uevent = u64_to_user_ptr(READ_ONCE(sqe->addr));
+	if (copy_from_user(&ie->event, uevent, sizeof(*uevent)))
+		return -EFAULT;
+
+	return 0;
+#else
+	return -EOPNOTSUPP;
+#endif
+}
+
+/*
+ * Issue a prepared IORING_OP_EPOLL_CTL request via do_epoll_ctl().
+ *
+ * If the attempt was non-blocking and do_epoll_ctl() reported -EAGAIN,
+ * -EAGAIN is returned without posting a completion. Otherwise the result
+ * is posted as the completion event (failing any links on a negative
+ * result), the request reference is dropped and 0 is returned.
+ * Returns -EOPNOTSUPP when the kernel is built without CONFIG_EPOLL.
+ */
+static int io_epoll_ctl(struct io_kiocb *req, struct io_kiocb **nxt,
+			bool force_nonblock)
+{
+#if defined(CONFIG_EPOLL)
+	int res;
+
+	res = do_epoll_ctl(req->epoll.epfd, req->epoll.op, req->epoll.fd,
+			   &req->epoll.event, force_nonblock);
+	if (res == -EAGAIN && force_nonblock)
+		return -EAGAIN;
+
+	if (res < 0)
+		req_set_fail_links(req);
+
+	io_cqring_add_event(req, res);
+	io_put_req_find_next(req, nxt);
+	return 0;
+#else
+	return -EOPNOTSUPP;
+#endif
+}
+
static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
return ret;
/* if the file has a flush method, be safe and punt to async */
- if (req->close.put_file->f_op->flush && !io_wq_current_is_worker()) {
- req->work.flags |= IO_WQ_WORK_NEEDS_FILES;
+ if (req->close.put_file->f_op->flush && !io_wq_current_is_worker())
goto eagain;
- }
/*
* No ->flush(), safely close from here and just punt the
ret = __io_accept(req, nxt, force_nonblock);
if (ret == -EAGAIN && force_nonblock) {
req->work.func = io_accept_finish;
- req->work.flags |= IO_WQ_WORK_NEEDS_FILES;
io_put_req(req);
return -EAGAIN;
}
struct io_uring_files_update up;
int ret;
- if (force_nonblock) {
- req->work.flags |= IO_WQ_WORK_NEEDS_FILES;
+ if (force_nonblock)
return -EAGAIN;
- }
up.offset = req->files_update.offset;
up.fds = req->files_update.arg;
{
ssize_t ret = 0;
+ if (io_op_defs[req->opcode].file_table) {
+ ret = io_grab_files(req);
+ if (unlikely(ret))
+ return ret;
+ }
+
+ io_req_work_grab_env(req, &io_op_defs[req->opcode]);
+
switch (req->opcode) {
case IORING_OP_NOP:
break;
case IORING_OP_OPENAT2:
ret = io_openat2_prep(req, sqe);
break;
+ case IORING_OP_EPOLL_CTL:
+ ret = io_epoll_ctl_prep(req, sqe);
+ break;
default:
printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
req->opcode);
}
ret = io_openat2(req, nxt, force_nonblock);
break;
+ case IORING_OP_EPOLL_CTL:
+ if (sqe) {
+ ret = io_epoll_ctl_prep(req, sqe);
+ if (ret)
+ break;
+ }
+ ret = io_epoll_ctl(req, nxt, force_nonblock);
+ break;
default:
ret = -EINVAL;
break;
int ret = -EBADF;
struct io_ring_ctx *ctx = req->ctx;
+ if (req->work.files)
+ return 0;
if (!ctx->ring_file)
return -EBADF;
if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
(req->flags & REQ_F_MUST_PUNT))) {
punt:
- if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) {
+ if (io_op_defs[req->opcode].file_table) {
ret = io_grab_files(req);
if (ret)
goto err;
static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
struct io_submit_state *state, struct io_kiocb **link)
{
+ const struct cred *old_creds = NULL;
struct io_ring_ctx *ctx = req->ctx;
unsigned int sqe_flags;
- int ret;
+ int ret, id;
sqe_flags = READ_ONCE(sqe->flags);
ret = -EINVAL;
goto err_req;
}
+
+ id = READ_ONCE(sqe->personality);
+ if (id) {
+ const struct cred *personality_creds;
+
+ personality_creds = idr_find(&ctx->personality_idr, id);
+ if (unlikely(!personality_creds)) {
+ ret = -EINVAL;
+ goto err_req;
+ }
+ old_creds = override_creds(personality_creds);
+ }
+
/* same numerical values with corresponding REQ_F_*, safe to copy */
req->flags |= sqe_flags & (IOSQE_IO_DRAIN|IOSQE_IO_HARDLINK|
IOSQE_ASYNC);
err_req:
io_cqring_add_event(req, ret);
io_double_put_req(req);
+ if (old_creds)
+ revert_creds(old_creds);
return false;
}
}
}
+ if (old_creds)
+ revert_creds(old_creds);
return true;
}
refcount_inc(&req->refs);
}
+/*
+ * Set up ctx->io_wq.
+ *
+ * Without IORING_SETUP_ATTACH_WQ a new io-wq is created, sized to the
+ * smaller of the SQ ring size and four times the number of online CPUs.
+ * With the flag set, the io-wq of the ring referred to by p->wq_fd is
+ * shared instead.
+ *
+ * Returns 0 on success; the io_wq_create() error if creation fails (with
+ * ctx->io_wq reset to NULL); -EBADF if p->wq_fd is not an open file;
+ * -EINVAL if it is not an io_uring file or io_wq_get() refuses.
+ */
+static int io_init_wq_offload(struct io_ring_ctx *ctx,
+			      struct io_uring_params *p)
+{
+	struct io_ring_ctx *ctx_attach;
+	struct io_wq_data data;
+	unsigned int concurrency;
+	struct fd f;
+	int ret;
+
+	data.user = ctx->user;
+	data.get_work = io_get_work;
+	data.put_work = io_put_work;
+
+	if (!(p->flags & IORING_SETUP_ATTACH_WQ)) {
+		/* queue depth or 4 * CPUs, whichever is smaller */
+		concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
+
+		ctx->io_wq = io_wq_create(concurrency, &data);
+		if (!IS_ERR(ctx->io_wq))
+			return 0;
+
+		ret = PTR_ERR(ctx->io_wq);
+		ctx->io_wq = NULL;
+		return ret;
+	}
+
+	f = fdget(p->wq_fd);
+	if (!f.file)
+		return -EBADF;
+
+	/* both remaining failure modes report -EINVAL */
+	ret = -EINVAL;
+	if (f.file->f_op != &io_uring_fops)
+		goto out_fput;
+
+	ctx_attach = f.file->private_data;
+	/* @io_wq is protected by holding the fd */
+	if (!io_wq_get(ctx_attach->io_wq, &data))
+		goto out_fput;
+
+	ctx->io_wq = ctx_attach->io_wq;
+	ret = 0;
+out_fput:
+	fdput(f);
+	return ret;
+}
+
static int io_sq_offload_start(struct io_ring_ctx *ctx,
struct io_uring_params *p)
{
- struct io_wq_data data;
- unsigned concurrency;
int ret;
init_waitqueue_head(&ctx->sqo_wait);
goto err;
}
- data.mm = ctx->sqo_mm;
- data.user = ctx->user;
- data.creds = ctx->creds;
- data.get_work = io_get_work;
- data.put_work = io_put_work;
-
- /* Do QD, or 4 * CPUS, whatever is smallest */
- concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
- ctx->io_wq = io_wq_create(concurrency, &data);
- if (IS_ERR(ctx->io_wq)) {
- ret = PTR_ERR(ctx->io_wq);
- ctx->io_wq = NULL;
+ ret = io_init_wq_offload(ctx, p);
+ if (ret)
goto err;
- }
return 0;
err:
struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
for (j = 0; j < imu->nr_bvecs; j++)
- put_user_page(imu->bvec[j].bv_page);
+ unpin_user_page(imu->bvec[j].bv_page);
if (ctx->account_mem)
io_unaccount_mem(ctx->user, imu->nr_bvecs);
ret = 0;
down_read(&current->mm->mmap_sem);
- pret = get_user_pages(ubuf, nr_pages,
+ pret = pin_user_pages(ubuf, nr_pages,
FOLL_WRITE | FOLL_LONGTERM,
pages, vmas);
if (pret == nr_pages) {
* release any pages we did get
*/
if (pret > 0)
- put_user_pages(pages, pret);
+ unpin_user_pages(pages, pret);
if (ctx->account_mem)
io_unaccount_mem(ctx->user, nr_pages);
kvfree(imu->bvec);
return fasync_helper(fd, file, on, &ctx->cq_fasync);
}
+/*
+ * idr_for_each() callback: remove personality @id from the ring passed in
+ * @data and drop the credential reference that was stored under it.
+ * Always returns 0.
+ */
+static int io_remove_personalities(int id, void *p, void *data)
+{
+	struct io_ring_ctx *ctx = data;
+	const struct cred *removed = idr_remove(&ctx->personality_idr, id);
+
+	if (removed)
+		put_cred(removed);
+
+	return 0;
+}
+
static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
{
mutex_lock(&ctx->uring_lock);
/* if we failed setting up the ctx, we might not have any rings */
if (ctx->rings)
io_cqring_overflow_flush(ctx, true);
+ idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
wait_for_completion(&ctx->completions[0]);
io_ring_ctx_free(ctx);
}
struct io_ring_ctx *ctx = file->private_data;
io_uring_cancel_files(ctx, data);
- if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) {
- io_cqring_overflow_flush(ctx, true);
- io_wq_cancel_all(ctx->io_wq);
- }
return 0;
}
} else if (to_submit) {
struct mm_struct *cur_mm;
- if (current->mm != ctx->sqo_mm ||
- current_cred() != ctx->creds) {
- ret = -EPERM;
- goto out;
- }
-
mutex_lock(&ctx->uring_lock);
/* already have mm, so io_submit_sqes() won't try to grab it */
cur_mm = ctx->sqo_mm;
goto err;
p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
- IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS;
+ IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
+ IORING_FEAT_CUR_PERSONALITY;
trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
return ret;
err:
if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
- IORING_SETUP_CLAMP))
+ IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ))
return -EINVAL;
ret = io_uring_create(entries, &p);
return ret;
}
+/*
+ * Store a reference to the caller's current credentials in the ring's
+ * personality IDR.
+ *
+ * Returns the id allocated by idr_alloc_cyclic() on success. On failure
+ * the credential reference is dropped again and the negative error from
+ * idr_alloc_cyclic() is returned.
+ */
+static int io_register_personality(struct io_ring_ctx *ctx)
+{
+	const struct cred *creds = get_current_cred();
+	int id = idr_alloc_cyclic(&ctx->personality_idr, (void *) creds, 1,
+				  USHRT_MAX, GFP_KERNEL);
+
+	if (id >= 0)
+		return id;
+
+	/* nothing was stored, so give the reference back */
+	put_cred(creds);
+	return id;
+}
+
+/*
+ * Remove personality @id from the ring's IDR and drop the credential
+ * reference held for it.
+ *
+ * Returns 0 on success, or -EINVAL if nothing was registered under @id.
+ */
+static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
+{
+	const struct cred *creds = idr_remove(&ctx->personality_idr, id);
+
+	if (!creds)
+		return -EINVAL;
+
+	put_cred(creds);
+	return 0;
+}
+
+/*
+ * Tell __io_uring_register() whether @op needs ctx->refs killed before it
+ * runs and reinitialised afterwards. The file unregister/update, probe and
+ * personality opcodes are exempt; every other opcode must quiesce.
+ */
+static bool io_register_op_must_quiesce(int op)
+{
+	return op != IORING_UNREGISTER_FILES &&
+	       op != IORING_REGISTER_FILES_UPDATE &&
+	       op != IORING_REGISTER_PROBE &&
+	       op != IORING_REGISTER_PERSONALITY &&
+	       op != IORING_UNREGISTER_PERSONALITY;
+}
+
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
void __user *arg, unsigned nr_args)
__releases(ctx->uring_lock)
if (percpu_ref_is_dying(&ctx->refs))
return -ENXIO;
- if (opcode != IORING_UNREGISTER_FILES &&
- opcode != IORING_REGISTER_FILES_UPDATE &&
- opcode != IORING_REGISTER_PROBE) {
+ if (io_register_op_must_quiesce(opcode)) {
percpu_ref_kill(&ctx->refs);
/*
break;
ret = io_probe(ctx, arg, nr_args);
break;
+ case IORING_REGISTER_PERSONALITY:
+ ret = -EINVAL;
+ if (arg || nr_args)
+ break;
+ ret = io_register_personality(ctx);
+ break;
+ case IORING_UNREGISTER_PERSONALITY:
+ ret = -EINVAL;
+ if (arg)
+ break;
+ ret = io_unregister_personality(ctx, nr_args);
+ break;
default:
ret = -EINVAL;
break;
}
-
- if (opcode != IORING_UNREGISTER_FILES &&
- opcode != IORING_REGISTER_FILES_UPDATE &&
- opcode != IORING_REGISTER_PROBE) {
+ if (io_register_op_must_quiesce(opcode)) {
/* bring the ctx back to life */
percpu_ref_reinit(&ctx->refs);
out: