fs/io_uring.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Shared application/kernel submission and completion ring pairs, for
   4  * supporting fast/efficient IO.
   5  *
   6  * A note on the read/write ordering memory barriers that are matched between
   7  * the application and kernel side. When the application reads the CQ ring
   8  * tail, it must use an appropriate smp_rmb() to order with the smp_wmb()
   9  * the kernel uses after writing the tail. Failure to do so could cause a
  10  * delay in when the application notices that completion events are available.
  11  * This isn't a fatal condition. Likewise, the application must use an
  12  * appropriate smp_wmb() both before writing the SQ tail, and after writing
  13  * the SQ tail. The first one orders the sqe writes with the tail write, and
  14  * the latter is paired with the smp_rmb() the kernel will issue before
  15  * reading the SQ tail on submission.
  16  *
  17  * Also see the examples in the liburing library:
  18  *
  19  *      git://git.kernel.dk/liburing
  20  *
  21  * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
  22  * from data shared between the kernel and application. This is done both
  23  * for ordering purposes, but also to ensure that once a value is loaded from
  24  * data that the application could potentially modify, it remains stable.
  25  *
  26  * Copyright (C) 2018-2019 Jens Axboe
  27  * Copyright (c) 2018-2019 Christoph Hellwig
  28  */
  29 #include <linux/kernel.h>
  30 #include <linux/init.h>
  31 #include <linux/errno.h>
  32 #include <linux/syscalls.h>
  33 #include <linux/compat.h>
  34 #include <linux/refcount.h>
  35 #include <linux/uio.h>
  36
  37 #include <linux/sched/signal.h>
  38 #include <linux/fs.h>
  39 #include <linux/file.h>
  40 #include <linux/fdtable.h>
  41 #include <linux/mm.h>
  42 #include <linux/mman.h>
  43 #include <linux/mmu_context.h>
  44 #include <linux/percpu.h>
  45 #include <linux/slab.h>
  46 #include <linux/workqueue.h>
  47 #include <linux/blkdev.h>
  48 #include <linux/bvec.h>
  49 #include <linux/net.h>
  50 #include <net/sock.h>
  51 #include <net/af_unix.h>
  52 #include <net/scm.h>
  53 #include <linux/anon_inodes.h>
  54 #include <linux/sched/mm.h>
  55 #include <linux/uaccess.h>
  56 #include <linux/nospec.h>
  57 #include <linux/sizes.h>
  58 #include <linux/hugetlb.h>
  59
  60 #include <uapi/linux/io_uring.h>
  61
  62 #include "internal.h"
  63
  64 #define IORING_MAX_ENTRIES      4096    /* NOTE(review): presumably the cap on ring entries; not referenced in this part of the file - confirm at the use site */
  65 #define IORING_MAX_FIXED_FILES  1024    /* NOTE(review): presumably the cap on registered fixed files (cf. ->user_files) - confirm at the use site */
  66
  67 struct io_uring {    /* one head/tail index pair; embedded as ->r in io_sq_ring and io_cq_ring */
  68         u32 head ____cacheline_aligned_in_smp;    /* aligned to its own cacheline on SMP */
  69         u32 tail ____cacheline_aligned_in_smp;    /* aligned to its own cacheline on SMP */
  70 };
  71
  72 struct io_sq_ring {    /* submission ring header; reached through ctx->sq_ring */
  73         struct io_uring         r;    /* head/tail indices */
  74         u32                     ring_mask;
  75         u32                     ring_entries;
  76         u32                     dropped;    /* NOTE(review): not touched in this part of the file; presumably counts rejected sqes - confirm */
  77         u32                     flags;
  78         u32                     array[];    /* flexible trailing array; NOTE(review): presumably indices into ctx->sq_sqes - confirm */
  79 };
  80
  81 struct io_cq_ring {    /* completion ring header; reached through ctx->cq_ring */
  82         struct io_uring         r;    /* r.tail is published by io_commit_cqring(), r.head is read in io_get_cqring() */
  83         u32                     ring_mask;
  84         u32                     ring_entries;
  85         u32                     overflow;    /* bumped by io_cqring_fill_event() when no cqe slot could be obtained */
  86         struct io_uring_cqe     cqes[];    /* completion slots, indexed with (tail & ctx->cq_mask) */
  87 };
  88
  89 struct io_mapped_ubuf {    /* one registered ("fixed") user buffer, see io_import_fixed() */
  90         u64             ubuf;    /* start address of the buffer as the application names it */
  91         size_t          len;    /* length in bytes; requests must fall inside [ubuf, ubuf + len] */
  92         struct          bio_vec *bvec;    /* segment array handed to iov_iter_bvec() */
  93         unsigned int    nr_bvecs;    /* number of entries in ->bvec */
  94 };
  95
  96 struct io_ring_ctx {    /* per-ring kernel state; allocated by io_ring_ctx_alloc() */
  97         struct {
  98                 struct percpu_ref       refs;    /* one ref per request (io_get_req()); release callback completes ->ctx_done */
  99         } ____cacheline_aligned_in_smp;
 100
 101         struct {
 102                 unsigned int            flags;    /* copied from io_uring_params.flags; tested for IORING_SETUP_IOPOLL */
 103                 bool                    compat;    /* selects compat_import_iovec() in io_import_iovec() */
 104                 bool                    account_mem;
 105
 106                 /* SQ ring */
 107                 struct io_sq_ring       *sq_ring;
 108                 unsigned                cached_sq_head;
 109                 unsigned                sq_entries;
 110                 unsigned                sq_mask;
 111                 struct io_uring_sqe     *sq_sqes;
 112         } ____cacheline_aligned_in_smp;
 113
 114         /* IO offload */
 115         struct workqueue_struct *sqo_wq;
 116         struct mm_struct        *sqo_mm;
 117
 118         struct {
 119                 /* CQ ring */
 120                 struct io_cq_ring       *cq_ring;
 121                 unsigned                cached_cq_tail;    /* kernel-private tail; stored to cq_ring->r.tail by io_commit_cqring() */
 122                 unsigned                cq_entries;
 123                 unsigned                cq_mask;    /* masks the tail into a cqes[] slot in io_get_cqring() */
 124                 struct wait_queue_head  cq_wait;    /* woken by io_commit_cqring() after the tail moves */
 125                 struct fasync_struct    *cq_fasync;    /* SIGIO/POLL_IN sent from io_commit_cqring() */
 126         } ____cacheline_aligned_in_smp;
 127
 128         /*
 129          * If used, fixed file set. Writers must ensure that ->refs is dead,
 130          * readers must ensure that ->refs is alive as long as the file* is
 131          * used. Only updated through io_uring_register(2).
 132          */
 133         struct file             **user_files;
 134         unsigned                nr_user_files;
 135
 136         /* if used, fixed mapped user buffers */
 137         unsigned                nr_user_bufs;
 138         struct io_mapped_ubuf   *user_bufs;
 139
 140         struct user_struct      *user;
 141
 142         struct completion       ctx_done;    /* completed from io_ring_ctx_ref_free() */
 143
 144         struct {
 145                 struct mutex            uring_lock;    /* held around the poll loop in io_iopoll_reap_events() */
 146                 wait_queue_head_t       wait;    /* woken from io_cqring_add_event() and io_ring_drop_ctx_refs() */
 147         } ____cacheline_aligned_in_smp;
 148
 149         struct {
 150                 spinlock_t              completion_lock;    /* taken irqsave around cqe fill + commit in io_cqring_add_event() */
 151                 bool                    poll_multi_file;    /* set by io_iopoll_req_issued() when poll_list holds more than one file; disables spinning */
 152                 /*
 153                  * ->poll_list is protected by the ctx->uring_lock for
 154                  * io_uring instances that don't use IORING_SETUP_SQPOLL.
 155                  * For SQPOLL, only the single threaded io_sq_thread() will
 156                  * manipulate the list, hence no extra locking is needed there.
 157                  */
 158                 struct list_head        poll_list;
 159         } ____cacheline_aligned_in_smp;
 160
 161 #if defined(CONFIG_UNIX)
 162         struct socket           *ring_sock;    /* its ->sk is what io_uring_get_socket() returns */
 163 #endif
 164 };
 165
 166 struct sqe_submit {    /* one submission as handed to the opcode handlers (io_read(), io_write()) */
 167         const struct io_uring_sqe       *sqe;    /* the sqe itself; fields are re-read with READ_ONCE() */
 168         unsigned short                  index;
 169         bool                            has_user;    /* when false, io_import_iovec() fails non-fixed requests with -EFAULT */
 170         bool                            needs_lock;    /* NOTE(review): not used in this part of the file - confirm meaning at the use site */
 171 };
 172
 173 struct io_kiocb {    /* one in-flight request; allocated from req_cachep in io_get_req() */
 174         struct kiocb            rw;    /* completion handlers recover the io_kiocb from this via container_of() */
 175
 176         struct sqe_submit       submit;
 177
 178         struct io_ring_ctx      *ctx;    /* owning ring; a ctx->refs reference is held for the request's lifetime */
 179         struct list_head        list;    /* links the request on ctx->poll_list or a local done list */
 180         unsigned int            flags;    /* REQ_F_* bits below; zeroed in io_get_req() */
 181 #define REQ_F_FORCE_NONBLOCK    1       /* inline submission attempt */
 182 #define REQ_F_IOPOLL_COMPLETED  2       /* polled IO has completed */
 183 #define REQ_F_FIXED_FILE        4       /* ctx owns file */
 184         u64                     user_data;    /* copied into the cqe on completion */
 185         u64                     error;    /* result stashed by io_complete_rw_iopoll(), posted by io_iopoll_complete() */
 186
 187         struct work_struct      work;
 188 };
 189
 190 #define IO_PLUG_THRESHOLD               2    /* NOTE(review): not referenced in this part of the file; presumably the batch size from which block plugging is used - confirm */
 191 #define IO_IOPOLL_BATCH                 8    /* sizes the reqs[] arrays in io_submit_state and io_iopoll_complete() */
 192
 193 struct io_submit_state {    /* per-submission-batch caches; a NULL state means "no batching" to the helpers */
 194         struct blk_plug         plug;
 195
 196         /*
 197          * io_kiocb alloc cache
 198          */
 199         void                    *reqs[IO_IOPOLL_BATCH];    /* filled by kmem_cache_alloc_bulk() in io_get_req() */
 200         unsigned                int free_reqs;    /* cached requests not yet handed out */
 201         unsigned                int cur_req;    /* index in ->reqs of the next request to hand out */
 202
 203         /*
 204          * File reference cache
 205          */
 206         struct file             *file;    /* file the cached references belong to, NULL when none */
 207         unsigned int            fd;    /* descriptor ->file was looked up from */
 208         unsigned int            has_refs;    /* references taken with fget_many() */
 209         unsigned int            used_refs;    /* references already handed to requests */
 210         unsigned int            ios_left;    /* submissions remaining in this batch; sizes the bulk allocations */
 211 };
 212
 213 static struct kmem_cache *req_cachep;    /* slab cache that io_kiocb requests are allocated from and freed to */
 214
 215 static const struct file_operations io_uring_fops;    /* forward declaration; compared against file->f_op to recognise io_uring files */
 216
 217 struct sock *io_uring_get_socket(struct file *file)
 218 {
 219 #if defined(CONFIG_UNIX)
 220         if (file->f_op == &io_uring_fops) {
 221                 struct io_ring_ctx *ctx = file->private_data;
 222
 223                 return ctx->ring_sock->sk;
 224         }
 225 #endif
 226         return NULL;
 227 }
 228 EXPORT_SYMBOL(io_uring_get_socket);
 229
 230 static void io_ring_ctx_ref_free(struct percpu_ref *ref)
 231 {
 232         struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
 233
 234         complete(&ctx->ctx_done);
 235 }
 236
 237 static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 238 {
 239         struct io_ring_ctx *ctx;
 240
 241         ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
 242         if (!ctx)
 243                 return NULL;
 244
 245         if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, 0, GFP_KERNEL)) {
 246                 kfree(ctx);
 247                 return NULL;
 248         }
 249
 250         ctx->flags = p->flags;
 251         init_waitqueue_head(&ctx->cq_wait);
 252         init_completion(&ctx->ctx_done);
 253         mutex_init(&ctx->uring_lock);
 254         init_waitqueue_head(&ctx->wait);
 255         spin_lock_init(&ctx->completion_lock);
 256         INIT_LIST_HEAD(&ctx->poll_list);
 257         return ctx;
 258 }
 259
 260 static void io_commit_cqring(struct io_ring_ctx *ctx)
 261 {
 262         struct io_cq_ring *ring = ctx->cq_ring;
 263
 264         if (ctx->cached_cq_tail != READ_ONCE(ring->r.tail)) {
 265                 /* order cqe stores with ring update */
 266                 smp_store_release(&ring->r.tail, ctx->cached_cq_tail);
 267
 268                 /*
 269                  * Write sider barrier of tail update, app has read side. See
 270                  * comment at the top of this file.
 271                  */
 272                 smp_wmb();
 273
 274                 if (wq_has_sleeper(&ctx->cq_wait)) {
 275                         wake_up_interruptible(&ctx->cq_wait);
 276                         kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
 277                 }
 278         }
 279 }
 280
 281 static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
 282 {
 283         struct io_cq_ring *ring = ctx->cq_ring;
 284         unsigned tail;
 285
 286         tail = ctx->cached_cq_tail;
 287         /* See comment at the top of the file */
 288         smp_rmb();
 289         if (tail + 1 == READ_ONCE(ring->r.head))
 290                 return NULL;
 291
 292         ctx->cached_cq_tail++;
 293         return &ring->cqes[tail & ctx->cq_mask];
 294 }
 295
 296 static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
 297                                  long res, unsigned ev_flags)
 298 {
 299         struct io_uring_cqe *cqe;
 300
 301         /*
 302          * If we can't get a cq entry, userspace overflowed the
 303          * submission (by quite a lot). Increment the overflow count in
 304          * the ring.
 305          */
 306         cqe = io_get_cqring(ctx);
 307         if (cqe) {
 308                 WRITE_ONCE(cqe->user_data, ki_user_data);
 309                 WRITE_ONCE(cqe->res, res);
 310                 WRITE_ONCE(cqe->flags, ev_flags);
 311         } else {
 312                 unsigned overflow = READ_ONCE(ctx->cq_ring->overflow);
 313
 314                 WRITE_ONCE(ctx->cq_ring->overflow, overflow + 1);
 315         }
 316 }
 317
 318 static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 ki_user_data,
 319                                 long res, unsigned ev_flags)
 320 {
 321         unsigned long flags;
 322
 323         spin_lock_irqsave(&ctx->completion_lock, flags);
 324         io_cqring_fill_event(ctx, ki_user_data, res, ev_flags);
 325         io_commit_cqring(ctx);
 326         spin_unlock_irqrestore(&ctx->completion_lock, flags);
 327
 328         if (waitqueue_active(&ctx->wait))
 329                 wake_up(&ctx->wait);
 330 }
 331
 332 static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs)
 333 {
 334         percpu_ref_put_many(&ctx->refs, refs);
 335
 336         if (waitqueue_active(&ctx->wait))
 337                 wake_up(&ctx->wait);
 338 }
 339
 340 static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
 341                                    struct io_submit_state *state)
 342 {
 343         struct io_kiocb *req;
 344
 345         if (!percpu_ref_tryget(&ctx->refs))
 346                 return NULL;
 347
 348         if (!state) {
 349                 req = kmem_cache_alloc(req_cachep, __GFP_NOWARN);
 350                 if (unlikely(!req))
 351                         goto out;
 352         } else if (!state->free_reqs) {
 353                 size_t sz;
 354                 int ret;
 355
 356                 sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
 357                 ret = kmem_cache_alloc_bulk(req_cachep, __GFP_NOWARN, sz,
 358                                                 state->reqs);
 359                 if (unlikely(ret <= 0))
 360                         goto out;
 361                 state->free_reqs = ret - 1;
 362                 state->cur_req = 1;
 363                 req = state->reqs[0];
 364         } else {
 365                 req = state->reqs[state->cur_req];
 366                 state->free_reqs--;
 367                 state->cur_req++;
 368         }
 369
 370         req->ctx = ctx;
 371         req->flags = 0;
 372         return req;
 373 out:
 374         io_ring_drop_ctx_refs(ctx, 1);
 375         return NULL;
 376 }
 377
 378 static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
 379 {
 380         if (*nr) {
 381                 kmem_cache_free_bulk(req_cachep, *nr, reqs);
 382                 io_ring_drop_ctx_refs(ctx, *nr);
 383                 *nr = 0;
 384         }
 385 }
 386
 387 static void io_free_req(struct io_kiocb *req)
 388 {
 389         io_ring_drop_ctx_refs(req->ctx, 1);
 390         kmem_cache_free(req_cachep, req);
 391 }
 392
 393 /*
 394  * Find and free completed poll iocbs
 395  */
 396 static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 397                                struct list_head *done)
 398 {
 399         void *reqs[IO_IOPOLL_BATCH];
 400         int file_count, to_free;
 401         struct file *file = NULL;
 402         struct io_kiocb *req;
 403
 404         file_count = to_free = 0;
 405         while (!list_empty(done)) {
 406                 req = list_first_entry(done, struct io_kiocb, list);
 407                 list_del(&req->list);
 408
 409                 io_cqring_fill_event(ctx, req->user_data, req->error, 0);
 410
 411                 reqs[to_free++] = req;
 412                 (*nr_events)++;
 413
 414                 /*
 415                  * Batched puts of the same file, to avoid dirtying the
 416                  * file usage count multiple times, if avoidable.
 417                  */
 418                 if (!(req->flags & REQ_F_FIXED_FILE)) {
 419                         if (!file) {
 420                                 file = req->rw.ki_filp;
 421                                 file_count = 1;
 422                         } else if (file == req->rw.ki_filp) {
 423                                 file_count++;
 424                         } else {
 425                                 fput_many(file, file_count);
 426                                 file = req->rw.ki_filp;
 427                                 file_count = 1;
 428                         }
 429                 }
 430
 431                 if (to_free == ARRAY_SIZE(reqs))
 432                         io_free_req_many(ctx, reqs, &to_free);
 433         }
 434         io_commit_cqring(ctx);
 435
 436         if (file)
 437                 fput_many(file, file_count);
 438         io_free_req_many(ctx, reqs, &to_free);
 439 }
 440
 441 static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
 442                         long min)
 443 {
 444         struct io_kiocb *req, *tmp;
 445         LIST_HEAD(done);
 446         bool spin;
 447         int ret;
 448
 449         /*
 450          * Only spin for completions if we don't have multiple devices hanging
 451          * off our complete list, and we're under the requested amount.
 452          */
 453         spin = !ctx->poll_multi_file && *nr_events < min;
 454
 455         ret = 0;
 456         list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) {
 457                 struct kiocb *kiocb = &req->rw;
 458
 459                 /*
 460                  * Move completed entries to our local list. If we find a
 461                  * request that requires polling, break out and complete
 462                  * the done list first, if we have entries there.
 463                  */
 464                 if (req->flags & REQ_F_IOPOLL_COMPLETED) {
 465                         list_move_tail(&req->list, &done);
 466                         continue;
 467                 }
 468                 if (!list_empty(&done))
 469                         break;
 470
 471                 ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
 472                 if (ret < 0)
 473                         break;
 474
 475                 if (ret && spin)
 476                         spin = false;
 477                 ret = 0;
 478         }
 479
 480         if (!list_empty(&done))
 481                 io_iopoll_complete(ctx, nr_events, &done);
 482
 483         return ret;
 484 }
 485
 486 /*
 487  * Poll for a mininum of 'min' events. Note that if min == 0 we consider that a
 488  * non-spinning poll check - we'll still enter the driver poll loop, but only
 489  * as a non-spinning completion check.
 490  */
 491 static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
 492                                 long min)
 493 {
 494         while (!list_empty(&ctx->poll_list)) {
 495                 int ret;
 496
 497                 ret = io_do_iopoll(ctx, nr_events, min);
 498                 if (ret < 0)
 499                         return ret;
 500                 if (!min || *nr_events >= min)
 501                         return 0;
 502         }
 503
 504         return 1;
 505 }
 506
 507 /*
 508  * We can't just wait for polled events to come to us, we have to actively
 509  * find and complete them.
 510  */
 511 static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
 512 {
 513         if (!(ctx->flags & IORING_SETUP_IOPOLL))
 514                 return;
 515
 516         mutex_lock(&ctx->uring_lock);
 517         while (!list_empty(&ctx->poll_list)) {
 518                 unsigned int nr_events = 0;
 519
 520                 io_iopoll_getevents(ctx, &nr_events, 1);
 521         }
 522         mutex_unlock(&ctx->uring_lock);
 523 }
 524
 525 static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
 526                            long min)
 527 {
 528         int ret = 0;
 529
 530         do {
 531                 int tmin = 0;
 532
 533                 if (*nr_events < min)
 534                         tmin = min - *nr_events;
 535
 536                 ret = io_iopoll_getevents(ctx, nr_events, tmin);
 537                 if (ret <= 0)
 538                         break;
 539                 ret = 0;
 540         } while (min && !*nr_events && !need_resched());
 541
 542         return ret;
 543 }
 544
 545 static void kiocb_end_write(struct kiocb *kiocb)
 546 {
 547         if (kiocb->ki_flags & IOCB_WRITE) {
 548                 struct inode *inode = file_inode(kiocb->ki_filp);
 549
 550                 /*
 551                  * Tell lockdep we inherited freeze protection from submission
 552                  * thread.
 553                  */
 554                 if (S_ISREG(inode->i_mode))
 555                         __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
 556                 file_end_write(kiocb->ki_filp);
 557         }
 558 }
 559
 560 static void io_fput(struct io_kiocb *req)
 561 {
 562         if (!(req->flags & REQ_F_FIXED_FILE))
 563                 fput(req->rw.ki_filp);
 564 }
 565
 566 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
 567 {
 568         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
 569
 570         kiocb_end_write(kiocb);
 571
 572         io_fput(req);
 573         io_cqring_add_event(req->ctx, req->user_data, res, 0);
 574         io_free_req(req);
 575 }
 576
 577 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
 578 {
 579         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
 580
 581         kiocb_end_write(kiocb);
 582
 583         req->error = res;
 584         if (res != -EAGAIN)
 585                 req->flags |= REQ_F_IOPOLL_COMPLETED;
 586 }
 587
 588 /*
 589  * After the iocb has been issued, it's safe to be found on the poll list.
 590  * Adding the kiocb to the list AFTER submission ensures that we don't
 591  * find it from a io_iopoll_getevents() thread before the issuer is done
 592  * accessing the kiocb cookie.
 593  */
 594 static void io_iopoll_req_issued(struct io_kiocb *req)
 595 {
 596         struct io_ring_ctx *ctx = req->ctx;
 597
 598         /*
 599          * Track whether we have multiple files in our lists. This will impact
 600          * how we do polling eventually, not spinning if we're on potentially
 601          * different devices.
 602          */
 603         if (list_empty(&ctx->poll_list)) {
 604                 ctx->poll_multi_file = false;
 605         } else if (!ctx->poll_multi_file) {
 606                 struct io_kiocb *list_req;
 607
 608                 list_req = list_first_entry(&ctx->poll_list, struct io_kiocb,
 609                                                 list);
 610                 if (list_req->rw.ki_filp != req->rw.ki_filp)
 611                         ctx->poll_multi_file = true;
 612         }
 613
 614         /*
 615          * For fast devices, IO may have already completed. If it has, add
 616          * it to the front so we find it first.
 617          */
 618         if (req->flags & REQ_F_IOPOLL_COMPLETED)
 619                 list_add(&req->list, &ctx->poll_list);
 620         else
 621                 list_add_tail(&req->list, &ctx->poll_list);
 622 }
 623
 624 static void io_file_put(struct io_submit_state *state, struct file *file)
 625 {
 626         if (!state) {
 627                 fput(file);
 628         } else if (state->file) {
 629                 int diff = state->has_refs - state->used_refs;
 630
 631                 if (diff)
 632                         fput_many(state->file, diff);
 633                 state->file = NULL;
 634         }
 635 }
 636
 637 /*
 638  * Get as many references to a file as we have IOs left in this submission,
 639  * assuming most submissions are for one file, or at least that each file
 640  * has more than one submission.
 641  */
 642 static struct file *io_file_get(struct io_submit_state *state, int fd)
 643 {
 644         if (!state)
 645                 return fget(fd);
 646
 647         if (state->file) {
 648                 if (state->fd == fd) {
 649                         state->used_refs++;
 650                         state->ios_left--;
 651                         return state->file;
 652                 }
 653                 io_file_put(state, NULL);
 654         }
 655         state->file = fget_many(fd, state->ios_left);
 656         if (!state->file)
 657                 return NULL;
 658
 659         state->fd = fd;
 660         state->has_refs = state->ios_left;
 661         state->used_refs = 1;
 662         state->ios_left--;
 663         return state->file;
 664 }
 665
 666 /*
 667  * If we tracked the file through the SCM inflight mechanism, we could support
 668  * any file. For now, just ensure that anything potentially problematic is done
 669  * inline.
 670  */
 671 static bool io_file_supports_async(struct file *file)
 672 {
 673         umode_t mode = file_inode(file)->i_mode;
 674
 675         if (S_ISBLK(mode) || S_ISCHR(mode))
 676                 return true;
 677         if (S_ISREG(mode) && file->f_op != &io_uring_fops)
 678                 return true;
 679
 680         return false;
 681 }
 682
 683 static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 684                       bool force_nonblock, struct io_submit_state *state)
 685 {
 686         struct io_ring_ctx *ctx = req->ctx;
 687         struct kiocb *kiocb = &req->rw;
 688         unsigned ioprio, flags;
 689         int fd, ret;
 690
 691         /* For -EAGAIN retry, everything is already prepped */
 692         if (kiocb->ki_filp)
 693                 return 0;
 694
 695         flags = READ_ONCE(sqe->flags);
 696         fd = READ_ONCE(sqe->fd);
 697
 698         if (flags & IOSQE_FIXED_FILE) {
 699                 if (unlikely(!ctx->user_files ||
 700                     (unsigned) fd >= ctx->nr_user_files))
 701                         return -EBADF;
 702                 kiocb->ki_filp = ctx->user_files[fd];
 703                 req->flags |= REQ_F_FIXED_FILE;
 704         } else {
 705                 kiocb->ki_filp = io_file_get(state, fd);
 706                 if (unlikely(!kiocb->ki_filp))
 707                         return -EBADF;
 708                 if (force_nonblock && !io_file_supports_async(kiocb->ki_filp))
 709                         force_nonblock = false;
 710         }
 711         kiocb->ki_pos = READ_ONCE(sqe->off);
 712         kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
 713         kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
 714
 715         ioprio = READ_ONCE(sqe->ioprio);
 716         if (ioprio) {
 717                 ret = ioprio_check_cap(ioprio);
 718                 if (ret)
 719                         goto out_fput;
 720
 721                 kiocb->ki_ioprio = ioprio;
 722         } else
 723                 kiocb->ki_ioprio = get_current_ioprio();
 724
 725         ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
 726         if (unlikely(ret))
 727                 goto out_fput;
 728         if (force_nonblock) {
 729                 kiocb->ki_flags |= IOCB_NOWAIT;
 730                 req->flags |= REQ_F_FORCE_NONBLOCK;
 731         }
 732         if (ctx->flags & IORING_SETUP_IOPOLL) {
 733                 ret = -EOPNOTSUPP;
 734                 if (!(kiocb->ki_flags & IOCB_DIRECT) ||
 735                     !kiocb->ki_filp->f_op->iopoll)
 736                         goto out_fput;
 737
 738                 req->error = 0;
 739                 kiocb->ki_flags |= IOCB_HIPRI;
 740                 kiocb->ki_complete = io_complete_rw_iopoll;
 741         } else {
 742                 if (kiocb->ki_flags & IOCB_HIPRI) {
 743                         ret = -EINVAL;
 744                         goto out_fput;
 745                 }
 746                 kiocb->ki_complete = io_complete_rw;
 747         }
 748         return 0;
 749 out_fput:
 750         if (!(flags & IOSQE_FIXED_FILE)) {
 751                 /*
 752                  * in case of error, we didn't use this file reference. drop it.
 753                  */
 754                 if (state)
 755                         state->used_refs--;
 756                 io_file_put(state, kiocb->ki_filp);
 757         }
 758         return ret;
 759 }
 760
 761 static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
 762 {
 763         switch (ret) {
 764         case -EIOCBQUEUED:
 765                 break;
 766         case -ERESTARTSYS:
 767         case -ERESTARTNOINTR:
 768         case -ERESTARTNOHAND:
 769         case -ERESTART_RESTARTBLOCK:
 770                 /*
 771                  * We can't just restart the syscall, since previously
 772                  * submitted sqes may already be in progress. Just fail this
 773                  * IO with EINTR.
 774                  */
 775                 ret = -EINTR;
 776                 /* fall through */
 777         default:
 778                 kiocb->ki_complete(kiocb, ret, 0);
 779         }
 780 }
 781
 782 static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
 783                            const struct io_uring_sqe *sqe,
 784                            struct iov_iter *iter)
 785 {
 786         size_t len = READ_ONCE(sqe->len);
 787         struct io_mapped_ubuf *imu;
 788         unsigned index, buf_index;
 789         size_t offset;
 790         u64 buf_addr;
 791
 792         /* attempt to use fixed buffers without having provided iovecs */
 793         if (unlikely(!ctx->user_bufs))
 794                 return -EFAULT;
 795
 796         buf_index = READ_ONCE(sqe->buf_index);
 797         if (unlikely(buf_index >= ctx->nr_user_bufs))
 798                 return -EFAULT;
 799
 800         index = array_index_nospec(buf_index, ctx->nr_user_bufs);
 801         imu = &ctx->user_bufs[index];
 802         buf_addr = READ_ONCE(sqe->addr);
 803
 804         /* overflow */
 805         if (buf_addr + len < buf_addr)
 806                 return -EFAULT;
 807         /* not inside the mapped region */
 808         if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
 809                 return -EFAULT;
 810
 811         /*
 812          * May not be a start of buffer, set size appropriately
 813          * and advance us to the beginning.
 814          */
 815         offset = buf_addr - imu->ubuf;
 816         iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
 817         if (offset)
 818                 iov_iter_advance(iter, offset);
 819         return 0;
 820 }
 821
 822 static int io_import_iovec(struct io_ring_ctx *ctx, int rw,
 823                            const struct sqe_submit *s, struct iovec **iovec,
 824                            struct iov_iter *iter)
 825 {
 826         const struct io_uring_sqe *sqe = s->sqe;
 827         void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
 828         size_t sqe_len = READ_ONCE(sqe->len);
 829         u8 opcode;
 830
 831         /*
 832          * We're reading ->opcode for the second time, but the first read
 833          * doesn't care whether it's _FIXED or not, so it doesn't matter
 834          * whether ->opcode changes concurrently. The first read does care
 835          * about whether it is a READ or a WRITE, so we don't trust this read
 836          * for that purpose and instead let the caller pass in the read/write
 837          * flag.
 838          */
 839         opcode = READ_ONCE(sqe->opcode);
 840         if (opcode == IORING_OP_READ_FIXED ||
 841             opcode == IORING_OP_WRITE_FIXED) {
 842                 ssize_t ret = io_import_fixed(ctx, rw, sqe, iter);
 843                 *iovec = NULL;
 844                 return ret;
 845         }
 846
 847         if (!s->has_user)
 848                 return -EFAULT;
 849
 850 #ifdef CONFIG_COMPAT
 851         if (ctx->compat)
 852                 return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
 853                                                 iovec, iter);
 854 #endif
 855
 856         return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
 857 }
 858
 859 static ssize_t io_read(struct io_kiocb *req, const struct sqe_submit *s,
 860                        bool force_nonblock, struct io_submit_state *state)
 861 {
 862         struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 863         struct kiocb *kiocb = &req->rw;
 864         struct iov_iter iter;
 865         struct file *file;
 866         ssize_t ret;
 867
 868         ret = io_prep_rw(req, s->sqe, force_nonblock, state);
 869         if (ret)
 870                 return ret;
 871         file = kiocb->ki_filp;
 872
 873         ret = -EBADF;
 874         if (unlikely(!(file->f_mode & FMODE_READ)))
 875                 goto out_fput;
 876         ret = -EINVAL;
 877         if (unlikely(!file->f_op->read_iter))
 878                 goto out_fput;
 879
 880         ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter);
 881         if (ret)
 882                 goto out_fput;
 883
 884         ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_iter_count(&iter));
 885         if (!ret) {
 886                 ssize_t ret2;
 887
 888                 /* Catch -EAGAIN return for forced non-blocking submission */
 889                 ret2 = call_read_iter(file, kiocb, &iter);
 890                 if (!force_nonblock || ret2 != -EAGAIN)
 891                         io_rw_done(kiocb, ret2);
 892                 else
 893                         ret = -EAGAIN;
 894         }
 895         kfree(iovec);
 896 out_fput:
 897         /* Hold on to the file for -EAGAIN */
 898         if (unlikely(ret && ret != -EAGAIN))
 899                 io_fput(req);
 900         return ret;
 901 }
 902
 903 static ssize_t io_write(struct io_kiocb *req, const struct sqe_submit *s,
 904                         bool force_nonblock, struct io_submit_state *state)
 905 {
 906         struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 907         struct kiocb *kiocb = &req->rw;
 908         struct iov_iter iter;
 909         struct file *file;
 910         ssize_t ret;
 911
 912         ret = io_prep_rw(req, s->sqe, force_nonblock, state);
 913         if (ret)
 914                 return ret;
 915         /* Hold on to the file for -EAGAIN */
 916         if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT))
 917                 return -EAGAIN;
 918
 919         ret = -EBADF;
 920         file = kiocb->ki_filp;
 921         if (unlikely(!(file->f_mode & FMODE_WRITE)))
 922                 goto out_fput;
 923         ret = -EINVAL;
 924         if (unlikely(!file->f_op->write_iter))
 925                 goto out_fput;
 926
 927         ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter);
 928         if (ret)
 929                 goto out_fput;
 930
 931         ret = rw_verify_area(WRITE, file, &kiocb->ki_pos,
 932                                 iov_iter_count(&iter));
 933         if (!ret) {
 934                 /*
 935                  * Open-code file_start_write here to grab freeze protection,
 936                  * which will be released by another thread in
 937                  * io_complete_rw().  Fool lockdep by telling it the lock got
 938                  * released so that it doesn't complain about the held lock when
 939                  * we return to userspace.
 940                  */
 941                 if (S_ISREG(file_inode(file)->i_mode)) {
 942                         __sb_start_write(file_inode(file)->i_sb,
 943                                                 SB_FREEZE_WRITE, true);
 944                         __sb_writers_release(file_inode(file)->i_sb,
 945                                                 SB_FREEZE_WRITE);
 946                 }
 947                 kiocb->ki_flags |= IOCB_WRITE;
 948                 io_rw_done(kiocb, call_write_iter(file, kiocb, &iter));
 949         }
 950         kfree(iovec);
 951 out_fput:
 952         if (unlikely(ret))
 953                 io_fput(req);
 954         return ret;
 955 }
 956
 957 /*
 958  * IORING_OP_NOP just posts a completion event, nothing else.
 959  */
 960 static int io_nop(struct io_kiocb *req, u64 user_data)
 961 {
 962         struct io_ring_ctx *ctx = req->ctx;
 963         long err = 0;
 964
 965         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
 966                 return -EINVAL;
 967
 968         /*
 969          * Twilight zone - it's possible that someone issued an opcode that
 970          * has a file attached, then got -EAGAIN on submission, and changed
 971          * the sqe before we retried it from async context. Avoid dropping
 972          * a file reference for this malicious case, and flag the error.
 973          */
 974         if (req->rw.ki_filp) {
 975                 err = -EBADF;
 976                 io_fput(req);
 977         }
 978         io_cqring_add_event(ctx, user_data, err, 0);
 979         io_free_req(req);
 980         return 0;
 981 }
 982
 983 static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 984 {
 985         struct io_ring_ctx *ctx = req->ctx;
 986         unsigned flags;
 987         int fd;
 988
 989         /* Prep already done */
 990         if (req->rw.ki_filp)
 991                 return 0;
 992
 993         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
 994                 return -EINVAL;
 995         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
 996                 return -EINVAL;
 997
 998         fd = READ_ONCE(sqe->fd);
 999         flags = READ_ONCE(sqe->flags);
1000
1001         if (flags & IOSQE_FIXED_FILE) {
1002                 if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files))
1003                         return -EBADF;
1004                 req->rw.ki_filp = ctx->user_files[fd];
1005                 req->flags |= REQ_F_FIXED_FILE;
1006         } else {
1007                 req->rw.ki_filp = fget(fd);
1008                 if (unlikely(!req->rw.ki_filp))
1009                         return -EBADF;
1010         }
1011
1012         return 0;
1013 }
1014
1015 static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
1016                     bool force_nonblock)
1017 {
1018         loff_t sqe_off = READ_ONCE(sqe->off);
1019         loff_t sqe_len = READ_ONCE(sqe->len);
1020         loff_t end = sqe_off + sqe_len;
1021         unsigned fsync_flags;
1022         int ret;
1023
1024         fsync_flags = READ_ONCE(sqe->fsync_flags);
1025         if (unlikely(fsync_flags & ~IORING_FSYNC_DATASYNC))
1026                 return -EINVAL;
1027
1028         ret = io_prep_fsync(req, sqe);
1029         if (ret)
1030                 return ret;
1031
1032         /* fsync always requires a blocking context */
1033         if (force_nonblock)
1034                 return -EAGAIN;
1035
1036         ret = vfs_fsync_range(req->rw.ki_filp, sqe_off,
1037                                 end > 0 ? end : LLONG_MAX,
1038                                 fsync_flags & IORING_FSYNC_DATASYNC);
1039
1040         io_fput(req);
1041         io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
1042         io_free_req(req);
1043         return 0;
1044 }
1045
1046 static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
1047                            const struct sqe_submit *s, bool force_nonblock,
1048                            struct io_submit_state *state)
1049 {
1050         ssize_t ret;
1051         int opcode;
1052
1053         if (unlikely(s->index >= ctx->sq_entries))
1054                 return -EINVAL;
1055         req->user_data = READ_ONCE(s->sqe->user_data);
1056
1057         opcode = READ_ONCE(s->sqe->opcode);
1058         switch (opcode) {
1059         case IORING_OP_NOP:
1060                 ret = io_nop(req, req->user_data);
1061                 break;
1062         case IORING_OP_READV:
1063                 if (unlikely(s->sqe->buf_index))
1064                         return -EINVAL;
1065                 ret = io_read(req, s, force_nonblock, state);
1066                 break;
1067         case IORING_OP_WRITEV:
1068                 if (unlikely(s->sqe->buf_index))
1069                         return -EINVAL;
1070                 ret = io_write(req, s, force_nonblock, state);
1071                 break;
1072         case IORING_OP_READ_FIXED:
1073                 ret = io_read(req, s, force_nonblock, state);
1074                 break;
1075         case IORING_OP_WRITE_FIXED:
1076                 ret = io_write(req, s, force_nonblock, state);
1077                 break;
1078         case IORING_OP_FSYNC:
1079                 ret = io_fsync(req, s->sqe, force_nonblock);
1080                 break;
1081         default:
1082                 ret = -EINVAL;
1083                 break;
1084         }
1085
1086         if (ret)
1087                 return ret;
1088
1089         if (ctx->flags & IORING_SETUP_IOPOLL) {
1090                 if (req->error == -EAGAIN)
1091                         return -EAGAIN;
1092
1093                 /* workqueue context doesn't hold uring_lock, grab it now */
1094                 if (s->needs_lock)
1095                         mutex_lock(&ctx->uring_lock);
1096                 io_iopoll_req_issued(req);
1097                 if (s->needs_lock)
1098                         mutex_unlock(&ctx->uring_lock);
1099         }
1100
1101         return 0;
1102 }
1103
1104 static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe)
1105 {
1106         u8 opcode = READ_ONCE(sqe->opcode);
1107
1108         return !(opcode == IORING_OP_READ_FIXED ||
1109                  opcode == IORING_OP_WRITE_FIXED);
1110 }
1111
1112 static void io_sq_wq_submit_work(struct work_struct *work)
1113 {
1114         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
1115         struct sqe_submit *s = &req->submit;
1116         const struct io_uring_sqe *sqe = s->sqe;
1117         struct io_ring_ctx *ctx = req->ctx;
1118         mm_segment_t old_fs;
1119         bool needs_user;
1120         int ret;
1121
1122          /* Ensure we clear previously set forced non-block flag */
1123         req->flags &= ~REQ_F_FORCE_NONBLOCK;
1124         req->rw.ki_flags &= ~IOCB_NOWAIT;
1125
1126         s->needs_lock = true;
1127         s->has_user = false;
1128
1129         /*
1130          * If we're doing IO to fixed buffers, we don't need to get/set
1131          * user context
1132          */
1133         needs_user = io_sqe_needs_user(s->sqe);
1134         if (needs_user) {
1135                 if (!mmget_not_zero(ctx->sqo_mm)) {
1136                         ret = -EFAULT;
1137                         goto err;
1138                 }
1139                 use_mm(ctx->sqo_mm);
1140                 old_fs = get_fs();
1141                 set_fs(USER_DS);
1142                 s->has_user = true;
1143         }
1144
1145         do {
1146                 ret = __io_submit_sqe(ctx, req, s, false, NULL);
1147                 /*
1148                  * We can get EAGAIN for polled IO even though we're forcing
1149                  * a sync submission from here, since we can't wait for
1150                  * request slots on the block side.
1151                  */
1152                 if (ret != -EAGAIN)
1153                         break;
1154                 cond_resched();
1155         } while (1);
1156
1157         if (needs_user) {
1158                 set_fs(old_fs);
1159                 unuse_mm(ctx->sqo_mm);
1160                 mmput(ctx->sqo_mm);
1161         }
1162 err:
1163         if (ret) {
1164                 io_cqring_add_event(ctx, sqe->user_data, ret, 0);
1165                 io_free_req(req);
1166         }
1167
1168         /* async context always use a copy of the sqe */
1169         kfree(sqe);
1170 }
1171
1172 static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
1173                          struct io_submit_state *state)
1174 {
1175         struct io_kiocb *req;
1176         ssize_t ret;
1177
1178         /* enforce forwards compatibility on users */
1179         if (unlikely(s->sqe->flags & ~IOSQE_FIXED_FILE))
1180                 return -EINVAL;
1181
1182         req = io_get_req(ctx, state);
1183         if (unlikely(!req))
1184                 return -EAGAIN;
1185
1186         req->rw.ki_filp = NULL;
1187
1188         ret = __io_submit_sqe(ctx, req, s, true, state);
1189         if (ret == -EAGAIN) {
1190                 struct io_uring_sqe *sqe_copy;
1191
1192                 sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL);
1193                 if (sqe_copy) {
1194                         memcpy(sqe_copy, s->sqe, sizeof(*sqe_copy));
1195                         s->sqe = sqe_copy;
1196
1197                         memcpy(&req->submit, s, sizeof(*s));
1198                         INIT_WORK(&req->work, io_sq_wq_submit_work);
1199                         queue_work(ctx->sqo_wq, &req->work);
1200                         ret = 0;
1201                 }
1202         }
1203         if (ret)
1204                 io_free_req(req);
1205
1206         return ret;
1207 }
1208
1209 /*
1210  * Batched submission is done, ensure local IO is flushed out.
1211  */
1212 static void io_submit_state_end(struct io_submit_state *state)
1213 {
1214         blk_finish_plug(&state->plug);
1215         io_file_put(state, NULL);
1216         if (state->free_reqs)
1217                 kmem_cache_free_bulk(req_cachep, state->free_reqs,
1218                                         &state->reqs[state->cur_req]);
1219 }
1220
1221 /*
1222  * Start submission side cache.
1223  */
1224 static void io_submit_state_start(struct io_submit_state *state,
1225                                   struct io_ring_ctx *ctx, unsigned max_ios)
1226 {
1227         blk_start_plug(&state->plug);
1228         state->free_reqs = 0;
1229         state->file = NULL;
1230         state->ios_left = max_ios;
1231 }
1232
1233 static void io_commit_sqring(struct io_ring_ctx *ctx)
1234 {
1235         struct io_sq_ring *ring = ctx->sq_ring;
1236
1237         if (ctx->cached_sq_head != READ_ONCE(ring->r.head)) {
1238                 /*
1239                  * Ensure any loads from the SQEs are done at this point,
1240                  * since once we write the new head, the application could
1241                  * write new data to them.
1242                  */
1243                 smp_store_release(&ring->r.head, ctx->cached_sq_head);
1244
1245                 /*
1246                  * write side barrier of head update, app has read side. See
1247                  * comment at the top of this file
1248                  */
1249                 smp_wmb();
1250         }
1251 }
1252
1253 /*
1254  * Undo last io_get_sqring()
1255  */
1256 static void io_drop_sqring(struct io_ring_ctx *ctx)
1257 {
1258         ctx->cached_sq_head--;
1259 }
1260
1261 /*
1262  * Fetch an sqe, if one is available. Note that s->sqe will point to memory
1263  * that is mapped by userspace. This means that care needs to be taken to
1264  * ensure that reads are stable, as we cannot rely on userspace always
1265  * being a good citizen. If members of the sqe are validated and then later
1266  * used, it's important that those reads are done through READ_ONCE() to
1267  * prevent a re-load down the line.
1268  */
1269 static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
1270 {
1271         struct io_sq_ring *ring = ctx->sq_ring;
1272         unsigned head;
1273
1274         /*
1275          * The cached sq head (or cq tail) serves two purposes:
1276          *
1277          * 1) allows us to batch the cost of updating the user visible
1278          *    head updates.
1279          * 2) allows the kernel side to track the head on its own, even
1280          *    though the application is the one updating it.
1281          */
1282         head = ctx->cached_sq_head;
1283         /* See comment at the top of this file */
1284         smp_rmb();
1285         if (head == READ_ONCE(ring->r.tail))
1286                 return false;
1287
1288         head = READ_ONCE(ring->array[head & ctx->sq_mask]);
1289         if (head < ctx->sq_entries) {
1290                 s->index = head;
1291                 s->sqe = &ctx->sq_sqes[head];
1292                 ctx->cached_sq_head++;
1293                 return true;
1294         }
1295
1296         /* drop invalid entries */
1297         ctx->cached_sq_head++;
1298         ring->dropped++;
1299         /* See comment at the top of this file */
1300         smp_wmb();
1301         return false;
1302 }
1303
1304 static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
1305 {
1306         struct io_submit_state state, *statep = NULL;
1307         int i, ret = 0, submit = 0;
1308
1309         if (to_submit > IO_PLUG_THRESHOLD) {
1310                 io_submit_state_start(&state, ctx, to_submit);
1311                 statep = &state;
1312         }
1313
1314         for (i = 0; i < to_submit; i++) {
1315                 struct sqe_submit s;
1316
1317                 if (!io_get_sqring(ctx, &s))
1318                         break;
1319
1320                 s.has_user = true;
1321                 s.needs_lock = false;
1322
1323                 ret = io_submit_sqe(ctx, &s, statep);
1324                 if (ret) {
1325                         io_drop_sqring(ctx);
1326                         break;
1327                 }
1328
1329                 submit++;
1330         }
1331         io_commit_sqring(ctx);
1332
1333         if (statep)
1334                 io_submit_state_end(statep);
1335
1336         return submit ? submit : ret;
1337 }
1338
1339 static unsigned io_cqring_events(struct io_cq_ring *ring)
1340 {
1341         return READ_ONCE(ring->r.tail) - READ_ONCE(ring->r.head);
1342 }
1343
1344 /*
1345  * Wait until events become available, if we don't already have some. The
1346  * application must reap them itself, as they reside on the shared cq ring.
1347  */
1348 static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
1349                           const sigset_t __user *sig, size_t sigsz)
1350 {
1351         struct io_cq_ring *ring = ctx->cq_ring;
1352         sigset_t ksigmask, sigsaved;
1353         DEFINE_WAIT(wait);
1354         int ret;
1355
1356         /* See comment at the top of this file */
1357         smp_rmb();
1358         if (io_cqring_events(ring) >= min_events)
1359                 return 0;
1360
1361         if (sig) {
1362                 ret = set_user_sigmask(sig, &ksigmask, &sigsaved, sigsz);
1363                 if (ret)
1364                         return ret;
1365         }
1366
1367         do {
1368                 prepare_to_wait(&ctx->wait, &wait, TASK_INTERRUPTIBLE);
1369
1370                 ret = 0;
1371                 /* See comment at the top of this file */
1372                 smp_rmb();
1373                 if (io_cqring_events(ring) >= min_events)
1374                         break;
1375
1376                 schedule();
1377
1378                 ret = -EINTR;
1379                 if (signal_pending(current))
1380                         break;
1381         } while (1);
1382
1383         finish_wait(&ctx->wait, &wait);
1384
1385         if (sig)
1386                 restore_user_sigmask(sig, &sigsaved);
1387
1388         return READ_ONCE(ring->r.head) == READ_ONCE(ring->r.tail) ? ret : 0;
1389 }
1390
1391 static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
1392 {
1393 #if defined(CONFIG_UNIX)
1394         if (ctx->ring_sock) {
1395                 struct sock *sock = ctx->ring_sock->sk;
1396                 struct sk_buff *skb;
1397
1398                 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
1399                         kfree_skb(skb);
1400         }
1401 #else
1402         int i;
1403
1404         for (i = 0; i < ctx->nr_user_files; i++)
1405                 fput(ctx->user_files[i]);
1406 #endif
1407 }
1408
1409 static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
1410 {
1411         if (!ctx->user_files)
1412                 return -ENXIO;
1413
1414         __io_sqe_files_unregister(ctx);
1415         kfree(ctx->user_files);
1416         ctx->user_files = NULL;
1417         ctx->nr_user_files = 0;
1418         return 0;
1419 }
1420
1421 static void io_finish_async(struct io_ring_ctx *ctx)
1422 {
1423         if (ctx->sqo_wq) {
1424                 destroy_workqueue(ctx->sqo_wq);
1425                 ctx->sqo_wq = NULL;
1426         }
1427 }
1428
1429 #if defined(CONFIG_UNIX)
1430 static void io_destruct_skb(struct sk_buff *skb)
1431 {
1432         struct io_ring_ctx *ctx = skb->sk->sk_user_data;
1433
1434         io_finish_async(ctx);
1435         unix_destruct_scm(skb);
1436 }
1437
1438 /*
1439  * Ensure the UNIX gc is aware of our file set, so we are certain that
1440  * the io_uring can be safely unregistered on process exit, even if we have
1441  * loops in the file referencing.
1442  */
1443 static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
1444 {
1445         struct sock *sk = ctx->ring_sock->sk;
1446         struct scm_fp_list *fpl;
1447         struct sk_buff *skb;
1448         int i;
1449
1450         if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
1451                 unsigned long inflight = ctx->user->unix_inflight + nr;
1452
1453                 if (inflight > task_rlimit(current, RLIMIT_NOFILE))
1454                         return -EMFILE;
1455         }
1456
1457         fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
1458         if (!fpl)
1459                 return -ENOMEM;
1460
1461         skb = alloc_skb(0, GFP_KERNEL);
1462         if (!skb) {
1463                 kfree(fpl);
1464                 return -ENOMEM;
1465         }
1466
1467         skb->sk = sk;
1468         skb->destructor = io_destruct_skb;
1469
1470         fpl->user = get_uid(ctx->user);
1471         for (i = 0; i < nr; i++) {
1472                 fpl->fp[i] = get_file(ctx->user_files[i + offset]);
1473                 unix_inflight(fpl->user, fpl->fp[i]);
1474         }
1475
1476         fpl->max = fpl->count = nr;
1477         UNIXCB(skb).fp = fpl;
1478         refcount_add(skb->truesize, &sk->sk_wmem_alloc);
1479         skb_queue_head(&sk->sk_receive_queue, skb);
1480
1481         for (i = 0; i < nr; i++)
1482                 fput(fpl->fp[i]);
1483
1484         return 0;
1485 }
1486
1487 /*
1488  * If UNIX sockets are enabled, fd passing can cause a reference cycle which
1489  * causes regular reference counting to break down. We rely on the UNIX
1490  * garbage collection to take care of this problem for us.
1491  */
1492 static int io_sqe_files_scm(struct io_ring_ctx *ctx)
1493 {
1494         unsigned left, total;
1495         int ret = 0;
1496
1497         total = 0;
1498         left = ctx->nr_user_files;
1499         while (left) {
1500                 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
1501                 int ret;
1502
1503                 ret = __io_sqe_files_scm(ctx, this_files, total);
1504                 if (ret)
1505                         break;
1506                 left -= this_files;
1507                 total += this_files;
1508         }
1509
1510         if (!ret)
1511                 return 0;
1512
1513         while (total < ctx->nr_user_files) {
1514                 fput(ctx->user_files[total]);
1515                 total++;
1516         }
1517
1518         return ret;
1519 }
1520 #else
/* Variant for builds without CONFIG_UNIX: nothing to do, always returns 0. */
static int io_sqe_files_scm(struct io_ring_ctx *ctx)
{
        return 0;
}
1525 #endif
1526
1527 static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
1528                                  unsigned nr_args)
1529 {
1530         __s32 __user *fds = (__s32 __user *) arg;
1531         int fd, ret = 0;
1532         unsigned i;
1533
1534         if (ctx->user_files)
1535                 return -EBUSY;
1536         if (!nr_args)
1537                 return -EINVAL;
1538         if (nr_args > IORING_MAX_FIXED_FILES)
1539                 return -EMFILE;
1540
1541         ctx->user_files = kcalloc(nr_args, sizeof(struct file *), GFP_KERNEL);
1542         if (!ctx->user_files)
1543                 return -ENOMEM;
1544
1545         for (i = 0; i < nr_args; i++) {
1546                 ret = -EFAULT;
1547                 if (copy_from_user(&fd, &fds[i], sizeof(fd)))
1548                         break;
1549
1550                 ctx->user_files[i] = fget(fd);
1551
1552                 ret = -EBADF;
1553                 if (!ctx->user_files[i])
1554                         break;
1555                 /*
1556                  * Don't allow io_uring instances to be registered. If UNIX
1557                  * isn't enabled, then this causes a reference cycle and this
1558                  * instance can never get freed. If UNIX is enabled we'll
1559                  * handle it just fine, but there's still no point in allowing
1560                  * a ring fd as it doesn't support regular read/write anyway.
1561                  */
1562                 if (ctx->user_files[i]->f_op == &io_uring_fops) {
1563                         fput(ctx->user_files[i]);
1564                         break;
1565                 }
1566                 ctx->nr_user_files++;
1567                 ret = 0;
1568         }
1569
1570         if (ret) {
1571                 for (i = 0; i < ctx->nr_user_files; i++)
1572                         fput(ctx->user_files[i]);
1573
1574                 kfree(ctx->user_files);
1575                 ctx->nr_user_files = 0;
1576                 return ret;
1577         }
1578
1579         ret = io_sqe_files_scm(ctx);
1580         if (ret)
1581                 io_sqe_files_unregister(ctx);
1582
1583         return ret;
1584 }
1585
1586 static int io_sq_offload_start(struct io_ring_ctx *ctx)
1587 {
1588         int ret;
1589
1590         mmgrab(current->mm);
1591         ctx->sqo_mm = current->mm;
1592
1593         /* Do QD, or 2 * CPUS, whatever is smallest */
1594         ctx->sqo_wq = alloc_workqueue("io_ring-wq", WQ_UNBOUND | WQ_FREEZABLE,
1595                         min(ctx->sq_entries - 1, 2 * num_online_cpus()));
1596         if (!ctx->sqo_wq) {
1597                 ret = -ENOMEM;
1598                 goto err;
1599         }
1600
1601         return 0;
1602 err:
1603         mmdrop(ctx->sqo_mm);
1604         ctx->sqo_mm = NULL;
1605         return ret;
1606 }
1607
1608 static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages)
1609 {
1610         atomic_long_sub(nr_pages, &user->locked_vm);
1611 }
1612
1613 static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
1614 {
1615         unsigned long page_limit, cur_pages, new_pages;
1616
1617         /* Don't allow more pages than we can safely lock */
1618         page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
1619
1620         do {
1621                 cur_pages = atomic_long_read(&user->locked_vm);
1622                 new_pages = cur_pages + nr_pages;
1623                 if (new_pages > page_limit)
1624                         return -ENOMEM;
1625         } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
1626                                         new_pages) != cur_pages);
1627
1628         return 0;
1629 }
1630
/*
 * Drop a reference on the head page backing @ptr and free the compound
 * page once that was the last reference.
 */
static void io_mem_free(void *ptr)
{
        struct page *head = virt_to_head_page(ptr);

        if (!put_page_testzero(head))
                return;

        free_compound_page(head);
}
1638
1639 static void *io_mem_alloc(size_t size)
1640 {
1641         gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
1642                                 __GFP_NORETRY;
1643
1644         return (void *) __get_free_pages(gfp_flags, get_order(size));
1645 }
1646
1647 static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
1648 {
1649         struct io_sq_ring *sq_ring;
1650         struct io_cq_ring *cq_ring;
1651         size_t bytes;
1652
1653         bytes = struct_size(sq_ring, array, sq_entries);
1654         bytes += array_size(sizeof(struct io_uring_sqe), sq_entries);
1655         bytes += struct_size(cq_ring, cqes, cq_entries);
1656
1657         return (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
1658 }
1659
1660 static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
1661 {
1662         int i, j;
1663
1664         if (!ctx->user_bufs)
1665                 return -ENXIO;
1666
1667         for (i = 0; i < ctx->nr_user_bufs; i++) {
1668                 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
1669
1670                 for (j = 0; j < imu->nr_bvecs; j++)
1671                         put_page(imu->bvec[j].bv_page);
1672
1673                 if (ctx->account_mem)
1674                         io_unaccount_mem(ctx->user, imu->nr_bvecs);
1675                 kfree(imu->bvec);
1676                 imu->nr_bvecs = 0;
1677         }
1678
1679         kfree(ctx->user_bufs);
1680         ctx->user_bufs = NULL;
1681         ctx->nr_user_bufs = 0;
1682         return 0;
1683 }
1684
1685 static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
1686                        void __user *arg, unsigned index)
1687 {
1688         struct iovec __user *src;
1689
1690 #ifdef CONFIG_COMPAT
1691         if (ctx->compat) {
1692                 struct compat_iovec __user *ciovs;
1693                 struct compat_iovec ciov;
1694
1695                 ciovs = (struct compat_iovec __user *) arg;
1696                 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
1697                         return -EFAULT;
1698
1699                 dst->iov_base = (void __user *) (unsigned long) ciov.iov_base;
1700                 dst->iov_len = ciov.iov_len;
1701                 return 0;
1702         }
1703 #endif
1704         src = (struct iovec __user *) arg;
1705         if (copy_from_user(dst, &src[index], sizeof(*dst)))
1706                 return -EFAULT;
1707         return 0;
1708 }
1709
1710 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
1711                                   unsigned nr_args)
1712 {
1713         struct vm_area_struct **vmas = NULL;
1714         struct page **pages = NULL;
1715         int i, j, got_pages = 0;
1716         int ret = -EINVAL;
1717
1718         if (ctx->user_bufs)
1719                 return -EBUSY;
1720         if (!nr_args || nr_args > UIO_MAXIOV)
1721                 return -EINVAL;
1722
1723         ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
1724                                         GFP_KERNEL);
1725         if (!ctx->user_bufs)
1726                 return -ENOMEM;
1727
1728         for (i = 0; i < nr_args; i++) {
1729                 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
1730                 unsigned long off, start, end, ubuf;
1731                 int pret, nr_pages;
1732                 struct iovec iov;
1733                 size_t size;
1734
1735                 ret = io_copy_iov(ctx, &iov, arg, i);
1736                 if (ret)
1737                         break;
1738
1739                 /*
1740                  * Don't impose further limits on the size and buffer
1741                  * constraints here, we'll -EINVAL later when IO is
1742                  * submitted if they are wrong.
1743                  */
1744                 ret = -EFAULT;
1745                 if (!iov.iov_base || !iov.iov_len)
1746                         goto err;
1747
1748                 /* arbitrary limit, but we need something */
1749                 if (iov.iov_len > SZ_1G)
1750                         goto err;
1751
1752                 ubuf = (unsigned long) iov.iov_base;
1753                 end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
1754                 start = ubuf >> PAGE_SHIFT;
1755                 nr_pages = end - start;
1756
1757                 if (ctx->account_mem) {
1758                         ret = io_account_mem(ctx->user, nr_pages);
1759                         if (ret)
1760                                 goto err;
1761                 }
1762
1763                 ret = 0;
1764                 if (!pages || nr_pages > got_pages) {
1765                         kfree(vmas);
1766                         kfree(pages);
1767                         pages = kmalloc_array(nr_pages, sizeof(struct page *),
1768                                                 GFP_KERNEL);
1769                         vmas = kmalloc_array(nr_pages,
1770                                         sizeof(struct vm_area_struct *),
1771                                         GFP_KERNEL);
1772                         if (!pages || !vmas) {
1773                                 ret = -ENOMEM;
1774                                 if (ctx->account_mem)
1775                                         io_unaccount_mem(ctx->user, nr_pages);
1776                                 goto err;
1777                         }
1778                         got_pages = nr_pages;
1779                 }
1780
1781                 imu->bvec = kmalloc_array(nr_pages, sizeof(struct bio_vec),
1782                                                 GFP_KERNEL);
1783                 ret = -ENOMEM;
1784                 if (!imu->bvec) {
1785                         if (ctx->account_mem)
1786                                 io_unaccount_mem(ctx->user, nr_pages);
1787                         goto err;
1788                 }
1789
1790                 ret = 0;
1791                 down_read(&current->mm->mmap_sem);
1792                 pret = get_user_pages_longterm(ubuf, nr_pages, FOLL_WRITE,
1793                                                 pages, vmas);
1794                 if (pret == nr_pages) {
1795                         /* don't support file backed memory */
1796                         for (j = 0; j < nr_pages; j++) {
1797                                 struct vm_area_struct *vma = vmas[j];
1798
1799                                 if (vma->vm_file &&
1800                                     !is_file_hugepages(vma->vm_file)) {
1801                                         ret = -EOPNOTSUPP;
1802                                         break;
1803                                 }
1804                         }
1805                 } else {
1806                         ret = pret < 0 ? pret : -EFAULT;
1807                 }
1808                 up_read(&current->mm->mmap_sem);
1809                 if (ret) {
1810                         /*
1811                          * if we did partial map, or found file backed vmas,
1812                          * release any pages we did get
1813                          */
1814                         if (pret > 0) {
1815                                 for (j = 0; j < pret; j++)
1816                                         put_page(pages[j]);
1817                         }
1818                         if (ctx->account_mem)
1819                                 io_unaccount_mem(ctx->user, nr_pages);
1820                         goto err;
1821                 }
1822
1823                 off = ubuf & ~PAGE_MASK;
1824                 size = iov.iov_len;
1825                 for (j = 0; j < nr_pages; j++) {
1826                         size_t vec_len;
1827
1828                         vec_len = min_t(size_t, size, PAGE_SIZE - off);
1829                         imu->bvec[j].bv_page = pages[j];
1830                         imu->bvec[j].bv_len = vec_len;
1831                         imu->bvec[j].bv_offset = off;
1832                         off = 0;
1833                         size -= vec_len;
1834                 }
1835                 /* store original address for later verification */
1836                 imu->ubuf = ubuf;
1837                 imu->len = iov.iov_len;
1838                 imu->nr_bvecs = nr_pages;
1839
1840                 ctx->nr_user_bufs++;
1841         }
1842         kfree(pages);
1843         kfree(vmas);
1844         return 0;
1845 err:
1846         kfree(pages);
1847         kfree(vmas);
1848         io_sqe_buffer_unregister(ctx);
1849         return ret;
1850 }
1851
1852 static void io_ring_ctx_free(struct io_ring_ctx *ctx)
1853 {
1854         io_finish_async(ctx);
1855         if (ctx->sqo_mm)
1856                 mmdrop(ctx->sqo_mm);
1857
1858         io_iopoll_reap_events(ctx);
1859         io_sqe_buffer_unregister(ctx);
1860         io_sqe_files_unregister(ctx);
1861
1862 #if defined(CONFIG_UNIX)
1863         if (ctx->ring_sock)
1864                 sock_release(ctx->ring_sock);
1865 #endif
1866
1867         io_mem_free(ctx->sq_ring);
1868         io_mem_free(ctx->sq_sqes);
1869         io_mem_free(ctx->cq_ring);
1870
1871         percpu_ref_exit(&ctx->refs);
1872         if (ctx->account_mem)
1873                 io_unaccount_mem(ctx->user,
1874                                 ring_pages(ctx->sq_entries, ctx->cq_entries));
1875         free_uid(ctx->user);
1876         kfree(ctx);
1877 }
1878
1879 static __poll_t io_uring_poll(struct file *file, poll_table *wait)
1880 {
1881         struct io_ring_ctx *ctx = file->private_data;
1882         __poll_t mask = 0;
1883
1884         poll_wait(file, &ctx->cq_wait, wait);
1885         /* See comment at the top of this file */
1886         smp_rmb();
1887         if (READ_ONCE(ctx->sq_ring->r.tail) + 1 != ctx->cached_sq_head)
1888                 mask |= EPOLLOUT | EPOLLWRNORM;
1889         if (READ_ONCE(ctx->cq_ring->r.head) != ctx->cached_cq_tail)
1890                 mask |= EPOLLIN | EPOLLRDNORM;
1891
1892         return mask;
1893 }
1894
1895 static int io_uring_fasync(int fd, struct file *file, int on)
1896 {
1897         struct io_ring_ctx *ctx = file->private_data;
1898
1899         return fasync_helper(fd, file, on, &ctx->cq_fasync);
1900 }
1901
/*
 * Kill the ctx reference, wait for ctx_done to be completed, then free the
 * ctx. Used by io_uring_release() and by the error path of io_uring_create().
 */
static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
{
        /* the ref is killed under uring_lock */
        mutex_lock(&ctx->uring_lock);
        percpu_ref_kill(&ctx->refs);
        mutex_unlock(&ctx->uring_lock);

        io_iopoll_reap_events(ctx);
        /* block until ctx_done is signalled before tearing anything down */
        wait_for_completion(&ctx->ctx_done);
        io_ring_ctx_free(ctx);
}
1912
1913 static int io_uring_release(struct inode *inode, struct file *file)
1914 {
1915         struct io_ring_ctx *ctx = file->private_data;
1916
1917         file->private_data = NULL;
1918         io_ring_ctx_wait_and_kill(ctx);
1919         return 0;
1920 }
1921
1922 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
1923 {
1924         loff_t offset = (loff_t) vma->vm_pgoff << PAGE_SHIFT;
1925         unsigned long sz = vma->vm_end - vma->vm_start;
1926         struct io_ring_ctx *ctx = file->private_data;
1927         unsigned long pfn;
1928         struct page *page;
1929         void *ptr;
1930
1931         switch (offset) {
1932         case IORING_OFF_SQ_RING:
1933                 ptr = ctx->sq_ring;
1934                 break;
1935         case IORING_OFF_SQES:
1936                 ptr = ctx->sq_sqes;
1937                 break;
1938         case IORING_OFF_CQ_RING:
1939                 ptr = ctx->cq_ring;
1940                 break;
1941         default:
1942                 return -EINVAL;
1943         }
1944
1945         page = virt_to_head_page(ptr);
1946         if (sz > (PAGE_SIZE << compound_order(page)))
1947                 return -EINVAL;
1948
1949         pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
1950         return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
1951 }
1952
1953 SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
1954                 u32, min_complete, u32, flags, const sigset_t __user *, sig,
1955                 size_t, sigsz)
1956 {
1957         struct io_ring_ctx *ctx;
1958         long ret = -EBADF;
1959         int submitted = 0;
1960         struct fd f;
1961
1962         if (flags & ~IORING_ENTER_GETEVENTS)
1963                 return -EINVAL;
1964
1965         f = fdget(fd);
1966         if (!f.file)
1967                 return -EBADF;
1968
1969         ret = -EOPNOTSUPP;
1970         if (f.file->f_op != &io_uring_fops)
1971                 goto out_fput;
1972
1973         ret = -ENXIO;
1974         ctx = f.file->private_data;
1975         if (!percpu_ref_tryget(&ctx->refs))
1976                 goto out_fput;
1977
1978         ret = 0;
1979         if (to_submit) {
1980                 to_submit = min(to_submit, ctx->sq_entries);
1981
1982                 mutex_lock(&ctx->uring_lock);
1983                 submitted = io_ring_submit(ctx, to_submit);
1984                 mutex_unlock(&ctx->uring_lock);
1985
1986                 if (submitted < 0)
1987                         goto out_ctx;
1988         }
1989         if (flags & IORING_ENTER_GETEVENTS) {
1990                 unsigned nr_events = 0;
1991
1992                 min_complete = min(min_complete, ctx->cq_entries);
1993
1994                 /*
1995                  * The application could have included the 'to_submit' count
1996                  * in how many events it wanted to wait for. If we failed to
1997                  * submit the desired count, we may need to adjust the number
1998                  * of events to poll/wait for.
1999                  */
2000                 if (submitted < to_submit)
2001                         min_complete = min_t(unsigned, submitted, min_complete);
2002
2003                 if (ctx->flags & IORING_SETUP_IOPOLL) {
2004                         mutex_lock(&ctx->uring_lock);
2005                         ret = io_iopoll_check(ctx, &nr_events, min_complete);
2006                         mutex_unlock(&ctx->uring_lock);
2007                 } else {
2008                         ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
2009                 }
2010         }
2011
2012 out_ctx:
2013         io_ring_drop_ctx_refs(ctx, 1);
2014 out_fput:
2015         fdput(f);
2016         return submitted ? submitted : ret;
2017 }
2018
2019 static const struct file_operations io_uring_fops = {
2020         .release        = io_uring_release,
2021         .mmap           = io_uring_mmap,
2022         .poll           = io_uring_poll,
2023         .fasync         = io_uring_fasync,
2024 };
2025
2026 static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
2027                                   struct io_uring_params *p)
2028 {
2029         struct io_sq_ring *sq_ring;
2030         struct io_cq_ring *cq_ring;
2031         size_t size;
2032
2033         sq_ring = io_mem_alloc(struct_size(sq_ring, array, p->sq_entries));
2034         if (!sq_ring)
2035                 return -ENOMEM;
2036
2037         ctx->sq_ring = sq_ring;
2038         sq_ring->ring_mask = p->sq_entries - 1;
2039         sq_ring->ring_entries = p->sq_entries;
2040         ctx->sq_mask = sq_ring->ring_mask;
2041         ctx->sq_entries = sq_ring->ring_entries;
2042
2043         size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
2044         if (size == SIZE_MAX)
2045                 return -EOVERFLOW;
2046
2047         ctx->sq_sqes = io_mem_alloc(size);
2048         if (!ctx->sq_sqes) {
2049                 io_mem_free(ctx->sq_ring);
2050                 return -ENOMEM;
2051         }
2052
2053         cq_ring = io_mem_alloc(struct_size(cq_ring, cqes, p->cq_entries));
2054         if (!cq_ring) {
2055                 io_mem_free(ctx->sq_ring);
2056                 io_mem_free(ctx->sq_sqes);
2057                 return -ENOMEM;
2058         }
2059
2060         ctx->cq_ring = cq_ring;
2061         cq_ring->ring_mask = p->cq_entries - 1;
2062         cq_ring->ring_entries = p->cq_entries;
2063         ctx->cq_mask = cq_ring->ring_mask;
2064         ctx->cq_entries = cq_ring->ring_entries;
2065         return 0;
2066 }
2067
2068 /*
2069  * Allocate an anonymous fd, this is what constitutes the application
2070  * visible backing of an io_uring instance. The application mmaps this
2071  * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
2072  * we have to tie this fd to a socket for file garbage collection purposes.
2073  */
2074 static int io_uring_get_fd(struct io_ring_ctx *ctx)
2075 {
2076         struct file *file;
2077         int ret;
2078
2079 #if defined(CONFIG_UNIX)
2080         ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
2081                                 &ctx->ring_sock);
2082         if (ret)
2083                 return ret;
2084 #endif
2085
2086         ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
2087         if (ret < 0)
2088                 goto err;
2089
2090         file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
2091                                         O_RDWR | O_CLOEXEC);
2092         if (IS_ERR(file)) {
2093                 put_unused_fd(ret);
2094                 ret = PTR_ERR(file);
2095                 goto err;
2096         }
2097
2098 #if defined(CONFIG_UNIX)
2099         ctx->ring_sock->file = file;
2100         ctx->ring_sock->sk->sk_user_data = ctx;
2101 #endif
2102         fd_install(ret, file);
2103         return ret;
2104 err:
2105 #if defined(CONFIG_UNIX)
2106         sock_release(ctx->ring_sock);
2107         ctx->ring_sock = NULL;
2108 #endif
2109         return ret;
2110 }
2111
2112 static int io_uring_create(unsigned entries, struct io_uring_params *p)
2113 {
2114         struct user_struct *user = NULL;
2115         struct io_ring_ctx *ctx;
2116         bool account_mem;
2117         int ret;
2118
2119         if (!entries || entries > IORING_MAX_ENTRIES)
2120                 return -EINVAL;
2121
2122         /*
2123          * Use twice as many entries for the CQ ring. It's possible for the
2124          * application to drive a higher depth than the size of the SQ ring,
2125          * since the sqes are only used at submission time. This allows for
2126          * some flexibility in overcommitting a bit.
2127          */
2128         p->sq_entries = roundup_pow_of_two(entries);
2129         p->cq_entries = 2 * p->sq_entries;
2130
2131         user = get_uid(current_user());
2132         account_mem = !capable(CAP_IPC_LOCK);
2133
2134         if (account_mem) {
2135                 ret = io_account_mem(user,
2136                                 ring_pages(p->sq_entries, p->cq_entries));
2137                 if (ret) {
2138                         free_uid(user);
2139                         return ret;
2140                 }
2141         }
2142
2143         ctx = io_ring_ctx_alloc(p);
2144         if (!ctx) {
2145                 if (account_mem)
2146                         io_unaccount_mem(user, ring_pages(p->sq_entries,
2147                                                                 p->cq_entries));
2148                 free_uid(user);
2149                 return -ENOMEM;
2150         }
2151         ctx->compat = in_compat_syscall();
2152         ctx->account_mem = account_mem;
2153         ctx->user = user;
2154
2155         ret = io_allocate_scq_urings(ctx, p);
2156         if (ret)
2157                 goto err;
2158
2159         ret = io_sq_offload_start(ctx);
2160         if (ret)
2161                 goto err;
2162
2163         ret = io_uring_get_fd(ctx);
2164         if (ret < 0)
2165                 goto err;
2166
2167         memset(&p->sq_off, 0, sizeof(p->sq_off));
2168         p->sq_off.head = offsetof(struct io_sq_ring, r.head);
2169         p->sq_off.tail = offsetof(struct io_sq_ring, r.tail);
2170         p->sq_off.ring_mask = offsetof(struct io_sq_ring, ring_mask);
2171         p->sq_off.ring_entries = offsetof(struct io_sq_ring, ring_entries);
2172         p->sq_off.flags = offsetof(struct io_sq_ring, flags);
2173         p->sq_off.dropped = offsetof(struct io_sq_ring, dropped);
2174         p->sq_off.array = offsetof(struct io_sq_ring, array);
2175
2176         memset(&p->cq_off, 0, sizeof(p->cq_off));
2177         p->cq_off.head = offsetof(struct io_cq_ring, r.head);
2178         p->cq_off.tail = offsetof(struct io_cq_ring, r.tail);
2179         p->cq_off.ring_mask = offsetof(struct io_cq_ring, ring_mask);
2180         p->cq_off.ring_entries = offsetof(struct io_cq_ring, ring_entries);
2181         p->cq_off.overflow = offsetof(struct io_cq_ring, overflow);
2182         p->cq_off.cqes = offsetof(struct io_cq_ring, cqes);
2183         return ret;
2184 err:
2185         io_ring_ctx_wait_and_kill(ctx);
2186         return ret;
2187 }
2188
2189 /*
2190  * Sets up an aio uring context, and returns the fd. Applications asks for a
2191  * ring size, we return the actual sq/cq ring sizes (among other things) in the
2192  * params structure passed in.
2193  */
2194 static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
2195 {
2196         struct io_uring_params p;
2197         long ret;
2198         int i;
2199
2200         if (copy_from_user(&p, params, sizeof(p)))
2201                 return -EFAULT;
2202         for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
2203                 if (p.resv[i])
2204                         return -EINVAL;
2205         }
2206
2207         if (p.flags & ~IORING_SETUP_IOPOLL)
2208                 return -EINVAL;
2209
2210         ret = io_uring_create(entries, &p);
2211         if (ret < 0)
2212                 return ret;
2213
2214         if (copy_to_user(params, &p, sizeof(p)))
2215                 return -EFAULT;
2216
2217         return ret;
2218 }
2219
2220 SYSCALL_DEFINE2(io_uring_setup, u32, entries,
2221                 struct io_uring_params __user *, params)
2222 {
2223         return io_uring_setup(entries, params);
2224 }
2225
2226 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
2227                                void __user *arg, unsigned nr_args)
2228 {
2229         int ret;
2230
2231         percpu_ref_kill(&ctx->refs);
2232         wait_for_completion(&ctx->ctx_done);
2233
2234         switch (opcode) {
2235         case IORING_REGISTER_BUFFERS:
2236                 ret = io_sqe_buffer_register(ctx, arg, nr_args);
2237                 break;
2238         case IORING_UNREGISTER_BUFFERS:
2239                 ret = -EINVAL;
2240                 if (arg || nr_args)
2241                         break;
2242                 ret = io_sqe_buffer_unregister(ctx);
2243                 break;
2244         case IORING_REGISTER_FILES:
2245                 ret = io_sqe_files_register(ctx, arg, nr_args);
2246                 break;
2247         case IORING_UNREGISTER_FILES:
2248                 ret = -EINVAL;
2249                 if (arg || nr_args)
2250                         break;
2251                 ret = io_sqe_files_unregister(ctx);
2252                 break;
2253         default:
2254                 ret = -EINVAL;
2255                 break;
2256         }
2257
2258         /* bring the ctx back to life */
2259         reinit_completion(&ctx->ctx_done);
2260         percpu_ref_reinit(&ctx->refs);
2261         return ret;
2262 }
2263
2264 SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
2265                 void __user *, arg, unsigned int, nr_args)
2266 {
2267         struct io_ring_ctx *ctx;
2268         long ret = -EBADF;
2269         struct fd f;
2270
2271         f = fdget(fd);
2272         if (!f.file)
2273                 return -EBADF;
2274
2275         ret = -EOPNOTSUPP;
2276         if (f.file->f_op != &io_uring_fops)
2277                 goto out_fput;
2278
2279         ctx = f.file->private_data;
2280
2281         mutex_lock(&ctx->uring_lock);
2282         ret = __io_uring_register(ctx, opcode, arg, nr_args);
2283         mutex_unlock(&ctx->uring_lock);
2284 out_fput:
2285         fdput(f);
2286         return ret;
2287 }
2288
2289 static int __init io_uring_init(void)
2290 {
2291         req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
2292         return 0;
2293 };
2294 __initcall(io_uring_init);