fs/io_uring.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Shared application/kernel submission and completion ring pairs, for
   4  * supporting fast/efficient IO.
   5  *
   6  * A note on the read/write ordering memory barriers that are matched between
   7  * the application and kernel side.
   8  *
   9  * After the application reads the CQ ring tail, it must use an
  10  * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
  11  * before writing the tail (using smp_load_acquire to read the tail will
  12  * do). It also needs a smp_mb() before updating CQ head (ordering the
  13  * entry load(s) with the head store), pairing with an implicit barrier
  14  * through a control-dependency in io_get_cqring (smp_store_release to
  15  * store head will do). Failure to do so could lead to reading invalid
  16  * CQ entries.
  17  *
  18  * Likewise, the application must use an appropriate smp_wmb() before
  19  * writing the SQ tail (ordering SQ entry stores with the tail store),
  20  * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
  21  * to store the tail will do). And it needs a barrier ordering the SQ
  22  * head load before writing new SQ entries (smp_load_acquire to read
  23  * head will do).
  24  *
  25  * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
  26  * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
  27  * updating the SQ tail; a full memory barrier smp_mb() is needed
  28  * between.
  29  *
  30  * Also see the examples in the liburing library:
  31  *
  32  *      git://git.kernel.dk/liburing
  33  *
  34  * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
  35  * from data shared between the kernel and application. This is done both
  36  * for ordering purposes, but also to ensure that once a value is loaded from
  37  * data that the application could potentially modify, it remains stable.
  38  *
  39  * Copyright (C) 2018-2019 Jens Axboe
  40  * Copyright (c) 2018-2019 Christoph Hellwig
  41  */
  42 #include <linux/kernel.h>
  43 #include <linux/init.h>
  44 #include <linux/errno.h>
  45 #include <linux/syscalls.h>
  46 #include <linux/compat.h>
  47 #include <linux/refcount.h>
  48 #include <linux/uio.h>
  49
  50 #include <linux/sched/signal.h>
  51 #include <linux/fs.h>
  52 #include <linux/file.h>
  53 #include <linux/fdtable.h>
  54 #include <linux/mm.h>
  55 #include <linux/mman.h>
  56 #include <linux/mmu_context.h>
  57 #include <linux/percpu.h>
  58 #include <linux/slab.h>
  59 #include <linux/kthread.h>
  60 #include <linux/blkdev.h>
  61 #include <linux/bvec.h>
  62 #include <linux/net.h>
  63 #include <net/sock.h>
  64 #include <net/af_unix.h>
  65 #include <net/scm.h>
  66 #include <linux/anon_inodes.h>
  67 #include <linux/sched/mm.h>
  68 #include <linux/uaccess.h>
  69 #include <linux/nospec.h>
  70 #include <linux/sizes.h>
  71 #include <linux/hugetlb.h>
  72 #include <linux/highmem.h>
  73
  74 #define CREATE_TRACE_POINTS
  75 #include <trace/events/io_uring.h>
  76
  77 #include <uapi/linux/io_uring.h>
  78
  79 #include "internal.h"
  80 #include "io-wq.h"
  81
  82 #define IORING_MAX_ENTRIES      32768
  83 #define IORING_MAX_CQ_ENTRIES   (2 * IORING_MAX_ENTRIES)
  84
  85 /*
  86  * Shift of 9 is 512 entries, or exactly one page on 64-bit archs
  87  */
  88 #define IORING_FILE_TABLE_SHIFT 9
  89 #define IORING_MAX_FILES_TABLE  (1U << IORING_FILE_TABLE_SHIFT)
  90 #define IORING_FILE_TABLE_MASK  (IORING_MAX_FILES_TABLE - 1)
  91 #define IORING_MAX_FIXED_FILES  (64 * IORING_MAX_FILES_TABLE)
  92
  93 struct io_uring {
  94         u32 head ____cacheline_aligned_in_smp;
  95         u32 tail ____cacheline_aligned_in_smp;
  96 };
  97
  98 /*
  99  * This data is shared with the application through the mmap at offsets
 100  * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
 101  *
 102  * The offsets to the member fields are published through struct
 103  * io_sqring_offsets when calling io_uring_setup.
 104  */
 105 struct io_rings {
 106         /*
 107          * Head and tail offsets into the ring; the offsets need to be
 108          * masked to get valid indices.
 109          *
 110          * The kernel controls head of the sq ring and the tail of the cq ring,
 111          * and the application controls tail of the sq ring and the head of the
 112          * cq ring.
 113          */
 114         struct io_uring         sq, cq;
 115         /*
 116          * Bitmasks to apply to head and tail offsets (constant, equals
 117          * ring_entries - 1)
 118          */
 119         u32                     sq_ring_mask, cq_ring_mask;
 120         /* Ring sizes (constant, power of 2) */
 121         u32                     sq_ring_entries, cq_ring_entries;
 122         /*
 123          * Number of invalid entries dropped by the kernel due to
 124          * invalid index stored in array
 125          *
 126          * Written by the kernel, shouldn't be modified by the
 127          * application (i.e. get number of "new events" by comparing to
 128          * cached value).
 129          *
 130          * After a new SQ head value was read by the application this
 131          * counter includes all submissions that were dropped reaching
 132          * the new SQ head (and possibly more).
 133          */
 134         u32                     sq_dropped;
 135         /*
 136          * Runtime flags
 137          *
 138          * Written by the kernel, shouldn't be modified by the
 139          * application.
 140          *
 141          * The application needs a full memory barrier before checking
 142          * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
 143          */
 144         u32                     sq_flags;
 145         /*
 146          * Number of completion events lost because the queue was full;
 147          * this should be avoided by the application by making sure
 148          * there are not more requests pending than there is space in
 149          * the completion queue.
 150          *
 151          * Written by the kernel, shouldn't be modified by the
 152          * application (i.e. get number of "new events" by comparing to
 153          * cached value).
 154          *
 155          * As completion events come in out of order this counter is not
 156          * ordered with any other data.
 157          */
 158         u32                     cq_overflow;
 159         /*
 160          * Ring buffer of completion events.
 161          *
 162          * The kernel writes completion events fresh every time they are
 163          * produced, so the application is allowed to modify pending
 164          * entries.
 165          */
 166         struct io_uring_cqe     cqes[] ____cacheline_aligned_in_smp;
 167 };
 168
 169 struct io_mapped_ubuf {
 170         u64             ubuf;
 171         size_t          len;
 172         struct          bio_vec *bvec;
 173         unsigned int    nr_bvecs;
 174 };
 175
 176 struct fixed_file_table {
 177         struct file             **files;
 178 };
 179
 180 struct io_ring_ctx {
 181         struct {
 182                 struct percpu_ref       refs;
 183         } ____cacheline_aligned_in_smp;
 184
 185         struct {
 186                 unsigned int            flags;
 187                 bool                    compat;
 188                 bool                    account_mem;
 189                 bool                    cq_overflow_flushed;
 190                 bool                    drain_next;
 191
 192                 /*
 193                  * Ring buffer of indices into array of io_uring_sqe, which is
 194                  * mmapped by the application using the IORING_OFF_SQES offset.
 195                  *
 196                  * This indirection could e.g. be used to assign fixed
 197                  * io_uring_sqe entries to operations and only submit them to
 198                  * the queue when needed.
 199                  *
 200                  * The kernel modifies neither the indices array nor the entries
 201                  * array.
 202                  */
 203                 u32                     *sq_array;
 204                 unsigned                cached_sq_head;
 205                 unsigned                sq_entries;
 206                 unsigned                sq_mask;
 207                 unsigned                sq_thread_idle;
 208                 unsigned                cached_sq_dropped;
 209                 atomic_t                cached_cq_overflow;
 210                 struct io_uring_sqe     *sq_sqes;
 211
 212                 struct list_head        defer_list;
 213                 struct list_head        timeout_list;
 214                 struct list_head        cq_overflow_list;
 215
 216                 wait_queue_head_t       inflight_wait;
 217         } ____cacheline_aligned_in_smp;
 218
 219         struct io_rings *rings;
 220
 221         /* IO offload */
 222         struct io_wq            *io_wq;
 223         struct task_struct      *sqo_thread;    /* if using sq thread polling */
 224         struct mm_struct        *sqo_mm;
 225         wait_queue_head_t       sqo_wait;
 226
 227         /*
 228          * If used, fixed file set. Writers must ensure that ->refs is dead,
 229          * readers must ensure that ->refs is alive as long as the file* is
 230          * used. Only updated through io_uring_register(2).
 231          */
 232         struct fixed_file_table *file_table;
 233         unsigned                nr_user_files;
 234
 235         /* if used, fixed mapped user buffers */
 236         unsigned                nr_user_bufs;
 237         struct io_mapped_ubuf   *user_bufs;
 238
 239         struct user_struct      *user;
 240
 241         const struct cred       *creds;
 242
 243         /* 0 is for ctx quiesce/reinit/free, 1 is for sqo_thread started */
 244         struct completion       *completions;
 245
 246         /* if all else fails... */
 247         struct io_kiocb         *fallback_req;
 248
 249 #if defined(CONFIG_UNIX)
 250         struct socket           *ring_sock;
 251 #endif
 252
 253         struct {
 254                 unsigned                cached_cq_tail;
 255                 unsigned                cq_entries;
 256                 unsigned                cq_mask;
 257                 atomic_t                cq_timeouts;
 258                 struct wait_queue_head  cq_wait;
 259                 struct fasync_struct    *cq_fasync;
 260                 struct eventfd_ctx      *cq_ev_fd;
 261         } ____cacheline_aligned_in_smp;
 262
 263         struct {
 264                 struct mutex            uring_lock;
 265                 wait_queue_head_t       wait;
 266         } ____cacheline_aligned_in_smp;
 267
 268         struct {
 269                 spinlock_t              completion_lock;
 270                 bool                    poll_multi_file;
 271                 /*
 272                  * ->poll_list is protected by the ctx->uring_lock for
 273                  * io_uring instances that don't use IORING_SETUP_SQPOLL.
 274                  * For SQPOLL, only the single threaded io_sq_thread() will
 275                  * manipulate the list, hence no extra locking is needed there.
 276                  */
 277                 struct list_head        poll_list;
 278                 struct hlist_head       *cancel_hash;
 279                 unsigned                cancel_hash_bits;
 280
 281                 spinlock_t              inflight_lock;
 282                 struct list_head        inflight_list;
 283         } ____cacheline_aligned_in_smp;
 284 };
 285
 286 /*
 287  * First field must be the file pointer in all the
 288  * iocb unions! See also 'struct kiocb' in <linux/fs.h>
 289  */
 290 struct io_poll_iocb {
 291         struct file                     *file;
 292         union {
 293                 struct wait_queue_head  *head;
 294                 u64                     addr;
 295         };
 296         __poll_t                        events;
 297         bool                            done;
 298         bool                            canceled;
 299         struct wait_queue_entry         wait;
 300 };
 301
 302 struct io_timeout_data {
 303         struct io_kiocb                 *req;
 304         struct hrtimer                  timer;
 305         struct timespec64               ts;
 306         enum hrtimer_mode               mode;
 307         u32                             seq_offset;
 308 };
 309
 310 struct io_accept {
 311         struct file                     *file;
 312         struct sockaddr __user          *addr;
 313         int __user                      *addr_len;
 314         int                             flags;
 315 };
 316
 317 struct io_sync {
 318         struct file                     *file;
 319         loff_t                          len;
 320         loff_t                          off;
 321         int                             flags;
 322 };
 323
 324 struct io_cancel {
 325         struct file                     *file;
 326         u64                             addr;
 327 };
 328
 329 struct io_timeout {
 330         struct file                     *file;
 331         u64                             addr;
 332         int                             flags;
 333         unsigned                        count;
 334 };
 335
 336 struct io_rw {
 337         /* NOTE: kiocb has the file as the first member, so don't do it here */
 338         struct kiocb                    kiocb;
 339         u64                             addr;
 340         u64                             len;
 341 };
 342
 343 struct io_connect {
 344         struct file                     *file;
 345         struct sockaddr __user          *addr;
 346         int                             addr_len;
 347 };
 348
 349 struct io_sr_msg {
 350         struct file                     *file;
 351         struct user_msghdr __user       *msg;
 352         int                             msg_flags;
 353 };
 354
 355 struct io_async_connect {
 356         struct sockaddr_storage         address;
 357 };
 358
 359 struct io_async_msghdr {
 360         struct iovec                    fast_iov[UIO_FASTIOV];
 361         struct iovec                    *iov;
 362         struct sockaddr __user          *uaddr;
 363         struct msghdr                   msg;
 364 };
 365
 366 struct io_async_rw {
 367         struct iovec                    fast_iov[UIO_FASTIOV];
 368         struct iovec                    *iov;
 369         ssize_t                         nr_segs;
 370         ssize_t                         size;
 371 };
 372
 373 struct io_async_ctx {
 374         union {
 375                 struct io_async_rw      rw;
 376                 struct io_async_msghdr  msg;
 377                 struct io_async_connect connect;
 378                 struct io_timeout_data  timeout;
 379         };
 380 };
 381
 382 /*
 383  * NOTE! Each of the iocb union members has the file pointer
 384  * as the first entry in their struct definition. So you can
 385  * access the file pointer through any of the sub-structs,
 386  * or directly as just 'ki_filp' in this struct.
 387  */
 388 struct io_kiocb {
 389         union {
 390                 struct file             *file;
 391                 struct io_rw            rw;
 392                 struct io_poll_iocb     poll;
 393                 struct io_accept        accept;
 394                 struct io_sync          sync;
 395                 struct io_cancel        cancel;
 396                 struct io_timeout       timeout;
 397                 struct io_connect       connect;
 398                 struct io_sr_msg        sr_msg;
 399         };
 400
 401         struct io_async_ctx             *io;
 402         struct file                     *ring_file;
 403         int                             ring_fd;
 404         bool                            has_user;
 405         bool                            in_async;
 406         bool                            needs_fixed_file;
 407         u8                              opcode;
 408
 409         struct io_ring_ctx      *ctx;
 410         union {
 411                 struct list_head        list;
 412                 struct hlist_node       hash_node;
 413         };
 414         struct list_head        link_list;
 415         unsigned int            flags;
 416         refcount_t              refs;
 417 #define REQ_F_NOWAIT            1       /* must not punt to workers */
 418 #define REQ_F_IOPOLL_COMPLETED  2       /* polled IO has completed */
 419 #define REQ_F_FIXED_FILE        4       /* ctx owns file */
 420 #define REQ_F_LINK_NEXT         8       /* already grabbed next link */
 421 #define REQ_F_IO_DRAIN          16      /* drain existing IO first */
 422 #define REQ_F_IO_DRAINED        32      /* drain done */
 423 #define REQ_F_LINK              64      /* linked sqes */
 424 #define REQ_F_LINK_TIMEOUT      128     /* has linked timeout */
 425 #define REQ_F_FAIL_LINK         256     /* fail rest of links */
 426 #define REQ_F_DRAIN_LINK        512     /* link should be fully drained */
 427 #define REQ_F_TIMEOUT           1024    /* timeout request */
 428 #define REQ_F_ISREG             2048    /* regular file */
 429 #define REQ_F_MUST_PUNT         4096    /* must be punted even for NONBLOCK */
 430 #define REQ_F_TIMEOUT_NOSEQ     8192    /* no timeout sequence */
 431 #define REQ_F_INFLIGHT          16384   /* on inflight list */
 432 #define REQ_F_COMP_LOCKED       32768   /* completion under lock */
 433 #define REQ_F_HARDLINK          65536   /* doesn't sever on completion < 0 */
 434         u64                     user_data;
 435         u32                     result;
 436         u32                     sequence;
 437
 438         struct list_head        inflight_entry;
 439
 440         struct io_wq_work       work;
 441 };
 442
 443 #define IO_PLUG_THRESHOLD               2
 444 #define IO_IOPOLL_BATCH                 8
 445
 446 struct io_submit_state {
 447         struct blk_plug         plug;
 448
 449         /*
 450          * io_kiocb alloc cache
 451          */
 452         void                    *reqs[IO_IOPOLL_BATCH];
 453         unsigned                int free_reqs;
 454         unsigned                int cur_req;
 455
 456         /*
 457          * File reference cache
 458          */
 459         struct file             *file;
 460         unsigned int            fd;
 461         unsigned int            has_refs;
 462         unsigned int            used_refs;
 463         unsigned int            ios_left;
 464 };
 465
 466 static void io_wq_submit_work(struct io_wq_work **workptr);
 467 static void io_cqring_fill_event(struct io_kiocb *req, long res);
 468 static void __io_free_req(struct io_kiocb *req);
 469 static void io_put_req(struct io_kiocb *req);
 470 static void io_double_put_req(struct io_kiocb *req);
 471 static void __io_double_put_req(struct io_kiocb *req);
 472 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
 473 static void io_queue_linked_timeout(struct io_kiocb *req);
 474
 475 static struct kmem_cache *req_cachep;
 476
 477 static const struct file_operations io_uring_fops;
 478
 479 struct sock *io_uring_get_socket(struct file *file)
 480 {
 481 #if defined(CONFIG_UNIX)
 482         if (file->f_op == &io_uring_fops) {
 483                 struct io_ring_ctx *ctx = file->private_data;
 484
 485                 return ctx->ring_sock->sk;
 486         }
 487 #endif
 488         return NULL;
 489 }
 490 EXPORT_SYMBOL(io_uring_get_socket);
 491
 492 static void io_ring_ctx_ref_free(struct percpu_ref *ref)
 493 {
 494         struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
 495
 496         complete(&ctx->completions[0]);
 497 }
 498
 499 static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 500 {
 501         struct io_ring_ctx *ctx;
 502         int hash_bits;
 503
 504         ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
 505         if (!ctx)
 506                 return NULL;
 507
 508         ctx->fallback_req = kmem_cache_alloc(req_cachep, GFP_KERNEL);
 509         if (!ctx->fallback_req)
 510                 goto err;
 511
 512         ctx->completions = kmalloc(2 * sizeof(struct completion), GFP_KERNEL);
 513         if (!ctx->completions)
 514                 goto err;
 515
 516         /*
 517          * Use 5 bits less than the max cq entries, that should give us around
 518          * 32 entries per hash list if totally full and uniformly spread.
 519          */
 520         hash_bits = ilog2(p->cq_entries);
 521         hash_bits -= 5;
 522         if (hash_bits <= 0)
 523                 hash_bits = 1;
 524         ctx->cancel_hash_bits = hash_bits;
 525         ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
 526                                         GFP_KERNEL);
 527         if (!ctx->cancel_hash)
 528                 goto err;
 529         __hash_init(ctx->cancel_hash, 1U << hash_bits);
 530
 531         if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
 532                             PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
 533                 goto err;
 534
 535         ctx->flags = p->flags;
 536         init_waitqueue_head(&ctx->cq_wait);
 537         INIT_LIST_HEAD(&ctx->cq_overflow_list);
 538         init_completion(&ctx->completions[0]);
 539         init_completion(&ctx->completions[1]);
 540         mutex_init(&ctx->uring_lock);
 541         init_waitqueue_head(&ctx->wait);
 542         spin_lock_init(&ctx->completion_lock);
 543         INIT_LIST_HEAD(&ctx->poll_list);
 544         INIT_LIST_HEAD(&ctx->defer_list);
 545         INIT_LIST_HEAD(&ctx->timeout_list);
 546         init_waitqueue_head(&ctx->inflight_wait);
 547         spin_lock_init(&ctx->inflight_lock);
 548         INIT_LIST_HEAD(&ctx->inflight_list);
 549         return ctx;
 550 err:
 551         if (ctx->fallback_req)
 552                 kmem_cache_free(req_cachep, ctx->fallback_req);
 553         kfree(ctx->completions);
 554         kfree(ctx->cancel_hash);
 555         kfree(ctx);
 556         return NULL;
 557 }
 558
 559 static inline bool __req_need_defer(struct io_kiocb *req)
 560 {
 561         struct io_ring_ctx *ctx = req->ctx;
 562
 563         return req->sequence != ctx->cached_cq_tail + ctx->cached_sq_dropped
 564                                         + atomic_read(&ctx->cached_cq_overflow);
 565 }
 566
 567 static inline bool req_need_defer(struct io_kiocb *req)
 568 {
 569         if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) == REQ_F_IO_DRAIN)
 570                 return __req_need_defer(req);
 571
 572         return false;
 573 }
 574
 575 static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
 576 {
 577         struct io_kiocb *req;
 578
 579         req = list_first_entry_or_null(&ctx->defer_list, struct io_kiocb, list);
 580         if (req && !req_need_defer(req)) {
 581                 list_del_init(&req->list);
 582                 return req;
 583         }
 584
 585         return NULL;
 586 }
 587
 588 static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx)
 589 {
 590         struct io_kiocb *req;
 591
 592         req = list_first_entry_or_null(&ctx->timeout_list, struct io_kiocb, list);
 593         if (req) {
 594                 if (req->flags & REQ_F_TIMEOUT_NOSEQ)
 595                         return NULL;
 596                 if (!__req_need_defer(req)) {
 597                         list_del_init(&req->list);
 598                         return req;
 599                 }
 600         }
 601
 602         return NULL;
 603 }
 604
 605 static void __io_commit_cqring(struct io_ring_ctx *ctx)
 606 {
 607         struct io_rings *rings = ctx->rings;
 608
 609         if (ctx->cached_cq_tail != READ_ONCE(rings->cq.tail)) {
 610                 /* order cqe stores with ring update */
 611                 smp_store_release(&rings->cq.tail, ctx->cached_cq_tail);
 612
 613                 if (wq_has_sleeper(&ctx->cq_wait)) {
 614                         wake_up_interruptible(&ctx->cq_wait);
 615                         kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
 616                 }
 617         }
 618 }
 619
 620 static inline bool io_req_needs_user(struct io_kiocb *req)
 621 {
 622         return !(req->opcode == IORING_OP_READ_FIXED ||
 623                  req->opcode == IORING_OP_WRITE_FIXED);
 624 }
 625
 626 static inline bool io_prep_async_work(struct io_kiocb *req,
 627                                       struct io_kiocb **link)
 628 {
 629         bool do_hashed = false;
 630
 631         switch (req->opcode) {
 632         case IORING_OP_WRITEV:
 633         case IORING_OP_WRITE_FIXED:
 634                 /* only regular files should be hashed for writes */
 635                 if (req->flags & REQ_F_ISREG)
 636                         do_hashed = true;
 637                 /* fall-through */
 638         case IORING_OP_READV:
 639         case IORING_OP_READ_FIXED:
 640         case IORING_OP_SENDMSG:
 641         case IORING_OP_RECVMSG:
 642         case IORING_OP_ACCEPT:
 643         case IORING_OP_POLL_ADD:
 644         case IORING_OP_CONNECT:
 645                 /*
 646                  * We know REQ_F_ISREG is not set on some of these
 647                  * opcodes, but this enables us to keep the check in
 648                  * just one place.
 649                  */
 650                 if (!(req->flags & REQ_F_ISREG))
 651                         req->work.flags |= IO_WQ_WORK_UNBOUND;
 652                 break;
 653         }
 654         if (io_req_needs_user(req))
 655                 req->work.flags |= IO_WQ_WORK_NEEDS_USER;
 656
 657         *link = io_prep_linked_timeout(req);
 658         return do_hashed;
 659 }
 660
 661 static inline void io_queue_async_work(struct io_kiocb *req)
 662 {
 663         struct io_ring_ctx *ctx = req->ctx;
 664         struct io_kiocb *link;
 665         bool do_hashed;
 666
 667         do_hashed = io_prep_async_work(req, &link);
 668
 669         trace_io_uring_queue_async_work(ctx, do_hashed, req, &req->work,
 670                                         req->flags);
 671         if (!do_hashed) {
 672                 io_wq_enqueue(ctx->io_wq, &req->work);
 673         } else {
 674                 io_wq_enqueue_hashed(ctx->io_wq, &req->work,
 675                                         file_inode(req->file));
 676         }
 677
 678         if (link)
 679                 io_queue_linked_timeout(link);
 680 }
 681
 682 static void io_kill_timeout(struct io_kiocb *req)
 683 {
 684         int ret;
 685
 686         ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
 687         if (ret != -1) {
 688                 atomic_inc(&req->ctx->cq_timeouts);
 689                 list_del_init(&req->list);
 690                 io_cqring_fill_event(req, 0);
 691                 io_put_req(req);
 692         }
 693 }
 694
 695 static void io_kill_timeouts(struct io_ring_ctx *ctx)
 696 {
 697         struct io_kiocb *req, *tmp;
 698
 699         spin_lock_irq(&ctx->completion_lock);
 700         list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list)
 701                 io_kill_timeout(req);
 702         spin_unlock_irq(&ctx->completion_lock);
 703 }
 704
 705 static void io_commit_cqring(struct io_ring_ctx *ctx)
 706 {
 707         struct io_kiocb *req;
 708
 709         while ((req = io_get_timeout_req(ctx)) != NULL)
 710                 io_kill_timeout(req);
 711
 712         __io_commit_cqring(ctx);
 713
 714         while ((req = io_get_deferred_req(ctx)) != NULL) {
 715                 req->flags |= REQ_F_IO_DRAINED;
 716                 io_queue_async_work(req);
 717         }
 718 }
 719
 720 static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
 721 {
 722         struct io_rings *rings = ctx->rings;
 723         unsigned tail;
 724
 725         tail = ctx->cached_cq_tail;
 726         /*
 727          * writes to the cq entry need to come after reading head; the
 728          * control dependency is enough as we're using WRITE_ONCE to
 729          * fill the cq entry
 730          */
 731         if (tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries)
 732                 return NULL;
 733
 734         ctx->cached_cq_tail++;
 735         return &rings->cqes[tail & ctx->cq_mask];
 736 }
 737
 738 static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
 739 {
 740         if (waitqueue_active(&ctx->wait))
 741                 wake_up(&ctx->wait);
 742         if (waitqueue_active(&ctx->sqo_wait))
 743                 wake_up(&ctx->sqo_wait);
 744         if (ctx->cq_ev_fd)
 745                 eventfd_signal(ctx->cq_ev_fd, 1);
 746 }
 747
 748 /* Returns true if there are no backlogged entries after the flush */
 749 static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 750 {
 751         struct io_rings *rings = ctx->rings;
 752         struct io_uring_cqe *cqe;
 753         struct io_kiocb *req;
 754         unsigned long flags;
 755         LIST_HEAD(list);
 756
 757         if (!force) {
 758                 if (list_empty_careful(&ctx->cq_overflow_list))
 759                         return true;
 760                 if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) ==
 761                     rings->cq_ring_entries))
 762                         return false;
 763         }
 764
 765         spin_lock_irqsave(&ctx->completion_lock, flags);
 766
 767         /* if force is set, the ring is going away. always drop after that */
 768         if (force)
 769                 ctx->cq_overflow_flushed = true;
 770
 771         cqe = NULL;
 772         while (!list_empty(&ctx->cq_overflow_list)) {
 773                 cqe = io_get_cqring(ctx);
 774                 if (!cqe && !force)
 775                         break;
 776
 777                 req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb,
 778                                                 list);
 779                 list_move(&req->list, &list);
 780                 if (cqe) {
 781                         WRITE_ONCE(cqe->user_data, req->user_data);
 782                         WRITE_ONCE(cqe->res, req->result);
 783                         WRITE_ONCE(cqe->flags, 0);
 784                 } else {
 785                         WRITE_ONCE(ctx->rings->cq_overflow,
 786                                 atomic_inc_return(&ctx->cached_cq_overflow));
 787                 }
 788         }
 789
 790         io_commit_cqring(ctx);
 791         spin_unlock_irqrestore(&ctx->completion_lock, flags);
 792         io_cqring_ev_posted(ctx);
 793
 794         while (!list_empty(&list)) {
 795                 req = list_first_entry(&list, struct io_kiocb, list);
 796                 list_del(&req->list);
 797                 io_put_req(req);
 798         }
 799
 800         return cqe != NULL;
 801 }
 802
 803 static void io_cqring_fill_event(struct io_kiocb *req, long res)
 804 {
 805         struct io_ring_ctx *ctx = req->ctx;
 806         struct io_uring_cqe *cqe;
 807
 808         trace_io_uring_complete(ctx, req->user_data, res);
 809
 810         /*
 811          * If we can't get a cq entry, userspace overflowed the
 812          * submission (by quite a lot). Increment the overflow count in
 813          * the ring.
 814          */
 815         cqe = io_get_cqring(ctx);
 816         if (likely(cqe)) {
 817                 WRITE_ONCE(cqe->user_data, req->user_data);
 818                 WRITE_ONCE(cqe->res, res);
 819                 WRITE_ONCE(cqe->flags, 0);
 820         } else if (ctx->cq_overflow_flushed) {
 821                 WRITE_ONCE(ctx->rings->cq_overflow,
 822                                 atomic_inc_return(&ctx->cached_cq_overflow));
 823         } else {
 824                 refcount_inc(&req->refs);
 825                 req->result = res;
 826                 list_add_tail(&req->list, &ctx->cq_overflow_list);
 827         }
 828 }
 829
 830 static void io_cqring_add_event(struct io_kiocb *req, long res)
 831 {
 832         struct io_ring_ctx *ctx = req->ctx;
 833         unsigned long flags;
 834
 835         spin_lock_irqsave(&ctx->completion_lock, flags);
 836         io_cqring_fill_event(req, res);
 837         io_commit_cqring(ctx);
 838         spin_unlock_irqrestore(&ctx->completion_lock, flags);
 839
 840         io_cqring_ev_posted(ctx);
 841 }
 842
 843 static inline bool io_is_fallback_req(struct io_kiocb *req)
 844 {
 845         return req == (struct io_kiocb *)
 846                         ((unsigned long) req->ctx->fallback_req & ~1UL);
 847 }
 848
 849 static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx)
 850 {
 851         struct io_kiocb *req;
 852
 853         req = ctx->fallback_req;
 854         if (!test_and_set_bit_lock(0, (unsigned long *) ctx->fallback_req))
 855                 return req;
 856
 857         return NULL;
 858 }
 859
 860 static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
 861                                    struct io_submit_state *state)
 862 {
 863         gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
 864         struct io_kiocb *req;
 865
 866         if (!percpu_ref_tryget(&ctx->refs))
 867                 return NULL;
 868
 869         if (!state) {
 870                 req = kmem_cache_alloc(req_cachep, gfp);
 871                 if (unlikely(!req))
 872                         goto fallback;
 873         } else if (!state->free_reqs) {
 874                 size_t sz;
 875                 int ret;
 876
 877                 sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
 878                 ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
 879
 880                 /*
 881                  * Bulk alloc is all-or-nothing. If we fail to get a batch,
 882                  * retry single alloc to be on the safe side.
 883                  */
 884                 if (unlikely(ret <= 0)) {
 885                         state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
 886                         if (!state->reqs[0])
 887                                 goto fallback;
 888                         ret = 1;
 889                 }
 890                 state->free_reqs = ret - 1;
 891                 state->cur_req = 1;
 892                 req = state->reqs[0];
 893         } else {
 894                 req = state->reqs[state->cur_req];
 895                 state->free_reqs--;
 896                 state->cur_req++;
 897         }
 898
 899 got_it:
 900         req->io = NULL;
 901         req->ring_file = NULL;
 902         req->file = NULL;
 903         req->ctx = ctx;
 904         req->flags = 0;
 905         /* one is dropped after submission, the other at completion */
 906         refcount_set(&req->refs, 2);
 907         req->result = 0;
 908         INIT_IO_WORK(&req->work, io_wq_submit_work);
 909         return req;
 910 fallback:
 911         req = io_get_fallback_req(ctx);
 912         if (req)
 913                 goto got_it;
 914         percpu_ref_put(&ctx->refs);
 915         return NULL;
 916 }
 917
 918 static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
 919 {
 920         if (*nr) {
 921                 kmem_cache_free_bulk(req_cachep, *nr, reqs);
 922                 percpu_ref_put_many(&ctx->refs, *nr);
 923                 *nr = 0;
 924         }
 925 }
 926
 927 static void __io_free_req(struct io_kiocb *req)
 928 {
 929         struct io_ring_ctx *ctx = req->ctx;
 930
 931         if (req->io)
 932                 kfree(req->io);
 933         if (req->file && !(req->flags & REQ_F_FIXED_FILE))
 934                 fput(req->file);
 935         if (req->flags & REQ_F_INFLIGHT) {
 936                 unsigned long flags;
 937
 938                 spin_lock_irqsave(&ctx->inflight_lock, flags);
 939                 list_del(&req->inflight_entry);
 940                 if (waitqueue_active(&ctx->inflight_wait))
 941                         wake_up(&ctx->inflight_wait);
 942                 spin_unlock_irqrestore(&ctx->inflight_lock, flags);
 943         }
 944         percpu_ref_put(&ctx->refs);
 945         if (likely(!io_is_fallback_req(req)))
 946                 kmem_cache_free(req_cachep, req);
 947         else
 948                 clear_bit_unlock(0, (unsigned long *) ctx->fallback_req);
 949 }
 950
 951 static bool io_link_cancel_timeout(struct io_kiocb *req)
 952 {
 953         struct io_ring_ctx *ctx = req->ctx;
 954         int ret;
 955
 956         ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
 957         if (ret != -1) {
 958                 io_cqring_fill_event(req, -ECANCELED);
 959                 io_commit_cqring(ctx);
 960                 req->flags &= ~REQ_F_LINK;
 961                 io_put_req(req);
 962                 return true;
 963         }
 964
 965         return false;
 966 }
 967
 968 static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
 969 {
 970         struct io_ring_ctx *ctx = req->ctx;
 971         bool wake_ev = false;
 972
 973         /* Already got next link */
 974         if (req->flags & REQ_F_LINK_NEXT)
 975                 return;
 976
 977         /*
 978          * The list should never be empty when we are called here. But could
 979          * potentially happen if the chain is messed up, check to be on the
 980          * safe side.
 981          */
 982         while (!list_empty(&req->link_list)) {
 983                 struct io_kiocb *nxt = list_first_entry(&req->link_list,
 984                                                 struct io_kiocb, link_list);
 985
 986                 if (unlikely((req->flags & REQ_F_LINK_TIMEOUT) &&
 987                              (nxt->flags & REQ_F_TIMEOUT))) {
 988                         list_del_init(&nxt->link_list);
 989                         wake_ev |= io_link_cancel_timeout(nxt);
 990                         req->flags &= ~REQ_F_LINK_TIMEOUT;
 991                         continue;
 992                 }
 993
 994                 list_del_init(&req->link_list);
 995                 if (!list_empty(&nxt->link_list))
 996                         nxt->flags |= REQ_F_LINK;
 997                 *nxtptr = nxt;
 998                 break;
 999         }
1000
1001         req->flags |= REQ_F_LINK_NEXT;
1002         if (wake_ev)
1003                 io_cqring_ev_posted(ctx);
1004 }
1005
1006 /*
1007  * Called if REQ_F_LINK is set, and we fail the head request
1008  */
1009 static void io_fail_links(struct io_kiocb *req)
1010 {
1011         struct io_ring_ctx *ctx = req->ctx;
1012         unsigned long flags;
1013
1014         spin_lock_irqsave(&ctx->completion_lock, flags);
1015
1016         while (!list_empty(&req->link_list)) {
1017                 struct io_kiocb *link = list_first_entry(&req->link_list,
1018                                                 struct io_kiocb, link_list);
1019
1020                 list_del_init(&link->link_list);
1021                 trace_io_uring_fail_link(req, link);
1022
1023                 if ((req->flags & REQ_F_LINK_TIMEOUT) &&
1024                     link->opcode == IORING_OP_LINK_TIMEOUT) {
1025                         io_link_cancel_timeout(link);
1026                 } else {
1027                         io_cqring_fill_event(link, -ECANCELED);
1028                         __io_double_put_req(link);
1029                 }
1030                 req->flags &= ~REQ_F_LINK_TIMEOUT;
1031         }
1032
1033         io_commit_cqring(ctx);
1034         spin_unlock_irqrestore(&ctx->completion_lock, flags);
1035         io_cqring_ev_posted(ctx);
1036 }
1037
1038 static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
1039 {
1040         if (likely(!(req->flags & REQ_F_LINK)))
1041                 return;
1042
1043         /*
1044          * If LINK is set, we have dependent requests in this chain. If we
1045          * didn't fail this request, queue the first one up, moving any other
1046          * dependencies to the next request. In case of failure, fail the rest
1047          * of the chain.
1048          */
1049         if (req->flags & REQ_F_FAIL_LINK) {
1050                 io_fail_links(req);
1051         } else if ((req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_COMP_LOCKED)) ==
1052                         REQ_F_LINK_TIMEOUT) {
1053                 struct io_ring_ctx *ctx = req->ctx;
1054                 unsigned long flags;
1055
1056                 /*
1057                  * If this is a timeout link, we could be racing with the
1058                  * timeout timer. Grab the completion lock for this case to
1059                  * protect against that.
1060                  */
1061                 spin_lock_irqsave(&ctx->completion_lock, flags);
1062                 io_req_link_next(req, nxt);
1063                 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1064         } else {
1065                 io_req_link_next(req, nxt);
1066         }
1067 }
1068
1069 static void io_free_req(struct io_kiocb *req)
1070 {
1071         struct io_kiocb *nxt = NULL;
1072
1073         io_req_find_next(req, &nxt);
1074         __io_free_req(req);
1075
1076         if (nxt)
1077                 io_queue_async_work(nxt);
1078 }
1079
1080 /*
1081  * Drop reference to request, return next in chain (if there is one) if this
1082  * was the last reference to this request.
1083  */
1084 __attribute__((nonnull))
1085 static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
1086 {
1087         io_req_find_next(req, nxtptr);
1088
1089         if (refcount_dec_and_test(&req->refs))
1090                 __io_free_req(req);
1091 }
1092
1093 static void io_put_req(struct io_kiocb *req)
1094 {
1095         if (refcount_dec_and_test(&req->refs))
1096                 io_free_req(req);
1097 }
1098
1099 /*
1100  * Must only be used if we don't need to care about links, usually from
1101  * within the completion handling itself.
1102  */
1103 static void __io_double_put_req(struct io_kiocb *req)
1104 {
1105         /* drop both submit and complete references */
1106         if (refcount_sub_and_test(2, &req->refs))
1107                 __io_free_req(req);
1108 }
1109
1110 static void io_double_put_req(struct io_kiocb *req)
1111 {
1112         /* drop both submit and complete references */
1113         if (refcount_sub_and_test(2, &req->refs))
1114                 io_free_req(req);
1115 }
1116
1117 static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush)
1118 {
1119         struct io_rings *rings = ctx->rings;
1120
1121         /*
1122          * noflush == true is from the waitqueue handler, just ensure we wake
1123          * up the task, and the next invocation will flush the entries. We
1124          * cannot safely to it from here.
1125          */
1126         if (noflush && !list_empty(&ctx->cq_overflow_list))
1127                 return -1U;
1128
1129         io_cqring_overflow_flush(ctx, false);
1130
1131         /* See comment at the top of this file */
1132         smp_rmb();
1133         return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head);
1134 }
1135
1136 static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
1137 {
1138         struct io_rings *rings = ctx->rings;
1139
1140         /* make sure SQ entry isn't read before tail */
1141         return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
1142 }
1143
1144 /*
1145  * Find and free completed poll iocbs
1146  */
1147 static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
1148                                struct list_head *done)
1149 {
1150         void *reqs[IO_IOPOLL_BATCH];
1151         struct io_kiocb *req;
1152         int to_free;
1153
1154         to_free = 0;
1155         while (!list_empty(done)) {
1156                 req = list_first_entry(done, struct io_kiocb, list);
1157                 list_del(&req->list);
1158
1159                 io_cqring_fill_event(req, req->result);
1160                 (*nr_events)++;
1161
1162                 if (refcount_dec_and_test(&req->refs)) {
1163                         /* If we're not using fixed files, we have to pair the
1164                          * completion part with the file put. Use regular
1165                          * completions for those, only batch free for fixed
1166                          * file and non-linked commands.
1167                          */
1168                         if (((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) ==
1169                             REQ_F_FIXED_FILE) && !io_is_fallback_req(req) &&
1170                             !req->io) {
1171                                 reqs[to_free++] = req;
1172                                 if (to_free == ARRAY_SIZE(reqs))
1173                                         io_free_req_many(ctx, reqs, &to_free);
1174                         } else {
1175                                 io_free_req(req);
1176                         }
1177                 }
1178         }
1179
1180         io_commit_cqring(ctx);
1181         io_free_req_many(ctx, reqs, &to_free);
1182 }
1183
1184 static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
1185                         long min)
1186 {
1187         struct io_kiocb *req, *tmp;
1188         LIST_HEAD(done);
1189         bool spin;
1190         int ret;
1191
1192         /*
1193          * Only spin for completions if we don't have multiple devices hanging
1194          * off our complete list, and we're under the requested amount.
1195          */
1196         spin = !ctx->poll_multi_file && *nr_events < min;
1197
1198         ret = 0;
1199         list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) {
1200                 struct kiocb *kiocb = &req->rw.kiocb;
1201
1202                 /*
1203                  * Move completed entries to our local list. If we find a
1204                  * request that requires polling, break out and complete
1205                  * the done list first, if we have entries there.
1206                  */
1207                 if (req->flags & REQ_F_IOPOLL_COMPLETED) {
1208                         list_move_tail(&req->list, &done);
1209                         continue;
1210                 }
1211                 if (!list_empty(&done))
1212                         break;
1213
1214                 ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
1215                 if (ret < 0)
1216                         break;
1217
1218                 if (ret && spin)
1219                         spin = false;
1220                 ret = 0;
1221         }
1222
1223         if (!list_empty(&done))
1224                 io_iopoll_complete(ctx, nr_events, &done);
1225
1226         return ret;
1227 }
1228
1229 /*
1230  * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a
1231  * non-spinning poll check - we'll still enter the driver poll loop, but only
1232  * as a non-spinning completion check.
1233  */
1234 static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
1235                                 long min)
1236 {
1237         while (!list_empty(&ctx->poll_list) && !need_resched()) {
1238                 int ret;
1239
1240                 ret = io_do_iopoll(ctx, nr_events, min);
1241                 if (ret < 0)
1242                         return ret;
1243                 if (!min || *nr_events >= min)
1244                         return 0;
1245         }
1246
1247         return 1;
1248 }
1249
1250 /*
1251  * We can't just wait for polled events to come to us, we have to actively
1252  * find and complete them.
1253  */
1254 static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
1255 {
1256         if (!(ctx->flags & IORING_SETUP_IOPOLL))
1257                 return;
1258
1259         mutex_lock(&ctx->uring_lock);
1260         while (!list_empty(&ctx->poll_list)) {
1261                 unsigned int nr_events = 0;
1262
1263                 io_iopoll_getevents(ctx, &nr_events, 1);
1264
1265                 /*
1266                  * Ensure we allow local-to-the-cpu processing to take place,
1267                  * in this case we need to ensure that we reap all events.
1268                  */
1269                 cond_resched();
1270         }
1271         mutex_unlock(&ctx->uring_lock);
1272 }
1273
1274 static int __io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
1275                             long min)
1276 {
1277         int iters = 0, ret = 0;
1278
1279         do {
1280                 int tmin = 0;
1281
1282                 /*
1283                  * Don't enter poll loop if we already have events pending.
1284                  * If we do, we can potentially be spinning for commands that
1285                  * already triggered a CQE (eg in error).
1286                  */
1287                 if (io_cqring_events(ctx, false))
1288                         break;
1289
1290                 /*
1291                  * If a submit got punted to a workqueue, we can have the
1292                  * application entering polling for a command before it gets
1293                  * issued. That app will hold the uring_lock for the duration
1294                  * of the poll right here, so we need to take a breather every
1295                  * now and then to ensure that the issue has a chance to add
1296                  * the poll to the issued list. Otherwise we can spin here
1297                  * forever, while the workqueue is stuck trying to acquire the
1298                  * very same mutex.
1299                  */
1300                 if (!(++iters & 7)) {
1301                         mutex_unlock(&ctx->uring_lock);
1302                         mutex_lock(&ctx->uring_lock);
1303                 }
1304
1305                 if (*nr_events < min)
1306                         tmin = min - *nr_events;
1307
1308                 ret = io_iopoll_getevents(ctx, nr_events, tmin);
1309                 if (ret <= 0)
1310                         break;
1311                 ret = 0;
1312         } while (min && !*nr_events && !need_resched());
1313
1314         return ret;
1315 }
1316
1317 static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
1318                            long min)
1319 {
1320         int ret;
1321
1322         /*
1323          * We disallow the app entering submit/complete with polling, but we
1324          * still need to lock the ring to prevent racing with polled issue
1325          * that got punted to a workqueue.
1326          */
1327         mutex_lock(&ctx->uring_lock);
1328         ret = __io_iopoll_check(ctx, nr_events, min);
1329         mutex_unlock(&ctx->uring_lock);
1330         return ret;
1331 }
1332
1333 static void kiocb_end_write(struct io_kiocb *req)
1334 {
1335         /*
1336          * Tell lockdep we inherited freeze protection from submission
1337          * thread.
1338          */
1339         if (req->flags & REQ_F_ISREG) {
1340                 struct inode *inode = file_inode(req->file);
1341
1342                 __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
1343         }
1344         file_end_write(req->file);
1345 }
1346
1347 static inline void req_set_fail_links(struct io_kiocb *req)
1348 {
1349         if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
1350                 req->flags |= REQ_F_FAIL_LINK;
1351 }
1352
1353 static void io_complete_rw_common(struct kiocb *kiocb, long res)
1354 {
1355         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
1356
1357         if (kiocb->ki_flags & IOCB_WRITE)
1358                 kiocb_end_write(req);
1359
1360         if (res != req->result)
1361                 req_set_fail_links(req);
1362         io_cqring_add_event(req, res);
1363 }
1364
1365 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
1366 {
1367         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
1368
1369         io_complete_rw_common(kiocb, res);
1370         io_put_req(req);
1371 }
1372
1373 static struct io_kiocb *__io_complete_rw(struct kiocb *kiocb, long res)
1374 {
1375         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
1376         struct io_kiocb *nxt = NULL;
1377
1378         io_complete_rw_common(kiocb, res);
1379         io_put_req_find_next(req, &nxt);
1380
1381         return nxt;
1382 }
1383
1384 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
1385 {
1386         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
1387
1388         if (kiocb->ki_flags & IOCB_WRITE)
1389                 kiocb_end_write(req);
1390
1391         if (res != req->result)
1392                 req_set_fail_links(req);
1393         req->result = res;
1394         if (res != -EAGAIN)
1395                 req->flags |= REQ_F_IOPOLL_COMPLETED;
1396 }
1397
1398 /*
1399  * After the iocb has been issued, it's safe to be found on the poll list.
1400  * Adding the kiocb to the list AFTER submission ensures that we don't
1401  * find it from a io_iopoll_getevents() thread before the issuer is done
1402  * accessing the kiocb cookie.
1403  */
1404 static void io_iopoll_req_issued(struct io_kiocb *req)
1405 {
1406         struct io_ring_ctx *ctx = req->ctx;
1407
1408         /*
1409          * Track whether we have multiple files in our lists. This will impact
1410          * how we do polling eventually, not spinning if we're on potentially
1411          * different devices.
1412          */
1413         if (list_empty(&ctx->poll_list)) {
1414                 ctx->poll_multi_file = false;
1415         } else if (!ctx->poll_multi_file) {
1416                 struct io_kiocb *list_req;
1417
1418                 list_req = list_first_entry(&ctx->poll_list, struct io_kiocb,
1419                                                 list);
1420                 if (list_req->file != req->file)
1421                         ctx->poll_multi_file = true;
1422         }
1423
1424         /*
1425          * For fast devices, IO may have already completed. If it has, add
1426          * it to the front so we find it first.
1427          */
1428         if (req->flags & REQ_F_IOPOLL_COMPLETED)
1429                 list_add(&req->list, &ctx->poll_list);
1430         else
1431                 list_add_tail(&req->list, &ctx->poll_list);
1432 }
1433
1434 static void io_file_put(struct io_submit_state *state)
1435 {
1436         if (state->file) {
1437                 int diff = state->has_refs - state->used_refs;
1438
1439                 if (diff)
1440                         fput_many(state->file, diff);
1441                 state->file = NULL;
1442         }
1443 }
1444
1445 /*
1446  * Get as many references to a file as we have IOs left in this submission,
1447  * assuming most submissions are for one file, or at least that each file
1448  * has more than one submission.
1449  */
1450 static struct file *io_file_get(struct io_submit_state *state, int fd)
1451 {
1452         if (!state)
1453                 return fget(fd);
1454
1455         if (state->file) {
1456                 if (state->fd == fd) {
1457                         state->used_refs++;
1458                         state->ios_left--;
1459                         return state->file;
1460                 }
1461                 io_file_put(state);
1462         }
1463         state->file = fget_many(fd, state->ios_left);
1464         if (!state->file)
1465                 return NULL;
1466
1467         state->fd = fd;
1468         state->has_refs = state->ios_left;
1469         state->used_refs = 1;
1470         state->ios_left--;
1471         return state->file;
1472 }
1473
1474 /*
1475  * If we tracked the file through the SCM inflight mechanism, we could support
1476  * any file. For now, just ensure that anything potentially problematic is done
1477  * inline.
1478  */
1479 static bool io_file_supports_async(struct file *file)
1480 {
1481         umode_t mode = file_inode(file)->i_mode;
1482
1483         if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISSOCK(mode))
1484                 return true;
1485         if (S_ISREG(mode) && file->f_op != &io_uring_fops)
1486                 return true;
1487
1488         return false;
1489 }
1490
1491 static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
1492                       bool force_nonblock)
1493 {
1494         struct io_ring_ctx *ctx = req->ctx;
1495         struct kiocb *kiocb = &req->rw.kiocb;
1496         unsigned ioprio;
1497         int ret;
1498
1499         if (!req->file)
1500                 return -EBADF;
1501
1502         if (S_ISREG(file_inode(req->file)->i_mode))
1503                 req->flags |= REQ_F_ISREG;
1504
1505         kiocb->ki_pos = READ_ONCE(sqe->off);
1506         kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
1507         kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
1508
1509         ioprio = READ_ONCE(sqe->ioprio);
1510         if (ioprio) {
1511                 ret = ioprio_check_cap(ioprio);
1512                 if (ret)
1513                         return ret;
1514
1515                 kiocb->ki_ioprio = ioprio;
1516         } else
1517                 kiocb->ki_ioprio = get_current_ioprio();
1518
1519         ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
1520         if (unlikely(ret))
1521                 return ret;
1522
1523         /* don't allow async punt if RWF_NOWAIT was requested */
1524         if ((kiocb->ki_flags & IOCB_NOWAIT) ||
1525             (req->file->f_flags & O_NONBLOCK))
1526                 req->flags |= REQ_F_NOWAIT;
1527
1528         if (force_nonblock)
1529                 kiocb->ki_flags |= IOCB_NOWAIT;
1530
1531         if (ctx->flags & IORING_SETUP_IOPOLL) {
1532                 if (!(kiocb->ki_flags & IOCB_DIRECT) ||
1533                     !kiocb->ki_filp->f_op->iopoll)
1534                         return -EOPNOTSUPP;
1535
1536                 kiocb->ki_flags |= IOCB_HIPRI;
1537                 kiocb->ki_complete = io_complete_rw_iopoll;
1538                 req->result = 0;
1539         } else {
1540                 if (kiocb->ki_flags & IOCB_HIPRI)
1541                         return -EINVAL;
1542                 kiocb->ki_complete = io_complete_rw;
1543         }
1544
1545         req->rw.addr = READ_ONCE(sqe->addr);
1546         req->rw.len = READ_ONCE(sqe->len);
1547         /* we own ->private, reuse it for the buffer index */
1548         req->rw.kiocb.private = (void *) (unsigned long)
1549                                         READ_ONCE(sqe->buf_index);
1550         return 0;
1551 }
1552
1553 static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
1554 {
1555         switch (ret) {
1556         case -EIOCBQUEUED:
1557                 break;
1558         case -ERESTARTSYS:
1559         case -ERESTARTNOINTR:
1560         case -ERESTARTNOHAND:
1561         case -ERESTART_RESTARTBLOCK:
1562                 /*
1563                  * We can't just restart the syscall, since previously
1564                  * submitted sqes may already be in progress. Just fail this
1565                  * IO with EINTR.
1566                  */
1567                 ret = -EINTR;
1568                 /* fall through */
1569         default:
1570                 kiocb->ki_complete(kiocb, ret, 0);
1571         }
1572 }
1573
1574 static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt,
1575                        bool in_async)
1576 {
1577         if (in_async && ret >= 0 && kiocb->ki_complete == io_complete_rw)
1578                 *nxt = __io_complete_rw(kiocb, ret);
1579         else
1580                 io_rw_done(kiocb, ret);
1581 }
1582
1583 static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
1584                                struct iov_iter *iter)
1585 {
1586         struct io_ring_ctx *ctx = req->ctx;
1587         size_t len = req->rw.len;
1588         struct io_mapped_ubuf *imu;
1589         unsigned index, buf_index;
1590         size_t offset;
1591         u64 buf_addr;
1592
1593         /* attempt to use fixed buffers without having provided iovecs */
1594         if (unlikely(!ctx->user_bufs))
1595                 return -EFAULT;
1596
1597         buf_index = (unsigned long) req->rw.kiocb.private;
1598         if (unlikely(buf_index >= ctx->nr_user_bufs))
1599                 return -EFAULT;
1600
1601         index = array_index_nospec(buf_index, ctx->nr_user_bufs);
1602         imu = &ctx->user_bufs[index];
1603         buf_addr = req->rw.addr;
1604
1605         /* overflow */
1606         if (buf_addr + len < buf_addr)
1607                 return -EFAULT;
1608         /* not inside the mapped region */
1609         if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
1610                 return -EFAULT;
1611
1612         /*
1613          * May not be a start of buffer, set size appropriately
1614          * and advance us to the beginning.
1615          */
1616         offset = buf_addr - imu->ubuf;
1617         iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
1618
1619         if (offset) {
1620                 /*
1621                  * Don't use iov_iter_advance() here, as it's really slow for
1622                  * using the latter parts of a big fixed buffer - it iterates
1623                  * over each segment manually. We can cheat a bit here, because
1624                  * we know that:
1625                  *
1626                  * 1) it's a BVEC iter, we set it up
1627                  * 2) all bvecs are PAGE_SIZE in size, except potentially the
1628                  *    first and last bvec
1629                  *
1630                  * So just find our index, and adjust the iterator afterwards.
1631                  * If the offset is within the first bvec (or the whole first
1632                  * bvec, just use iov_iter_advance(). This makes it easier
1633                  * since we can just skip the first segment, which may not
1634                  * be PAGE_SIZE aligned.
1635                  */
1636                 const struct bio_vec *bvec = imu->bvec;
1637
1638                 if (offset <= bvec->bv_len) {
1639                         iov_iter_advance(iter, offset);
1640                 } else {
1641                         unsigned long seg_skip;
1642
1643                         /* skip first vec */
1644                         offset -= bvec->bv_len;
1645                         seg_skip = 1 + (offset >> PAGE_SHIFT);
1646
1647                         iter->bvec = bvec + seg_skip;
1648                         iter->nr_segs -= seg_skip;
1649                         iter->count -= bvec->bv_len + offset;
1650                         iter->iov_offset = offset & ~PAGE_MASK;
1651                 }
1652         }
1653
1654         return len;
1655 }
1656
1657 static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
1658                                struct iovec **iovec, struct iov_iter *iter)
1659 {
1660         void __user *buf = u64_to_user_ptr(req->rw.addr);
1661         size_t sqe_len = req->rw.len;
1662         u8 opcode;
1663
1664         opcode = req->opcode;
1665         if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
1666                 *iovec = NULL;
1667                 return io_import_fixed(req, rw, iter);
1668         }
1669
1670         /* buffer index only valid with fixed read/write */
1671         if (req->rw.kiocb.private)
1672                 return -EINVAL;
1673
1674         if (req->io) {
1675                 struct io_async_rw *iorw = &req->io->rw;
1676
1677                 *iovec = iorw->iov;
1678                 iov_iter_init(iter, rw, *iovec, iorw->nr_segs, iorw->size);
1679                 if (iorw->iov == iorw->fast_iov)
1680                         *iovec = NULL;
1681                 return iorw->size;
1682         }
1683
1684         if (!req->has_user)
1685                 return -EFAULT;
1686
1687 #ifdef CONFIG_COMPAT
1688         if (req->ctx->compat)
1689                 return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
1690                                                 iovec, iter);
1691 #endif
1692
1693         return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
1694 }
1695
1696 /*
1697  * For files that don't have ->read_iter() and ->write_iter(), handle them
1698  * by looping over ->read() or ->write() manually.
1699  */
1700 static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
1701                            struct iov_iter *iter)
1702 {
1703         ssize_t ret = 0;
1704
1705         /*
1706          * Don't support polled IO through this interface, and we can't
1707          * support non-blocking either. For the latter, this just causes
1708          * the kiocb to be handled from an async context.
1709          */
1710         if (kiocb->ki_flags & IOCB_HIPRI)
1711                 return -EOPNOTSUPP;
1712         if (kiocb->ki_flags & IOCB_NOWAIT)
1713                 return -EAGAIN;
1714
1715         while (iov_iter_count(iter)) {
1716                 struct iovec iovec;
1717                 ssize_t nr;
1718
1719                 if (!iov_iter_is_bvec(iter)) {
1720                         iovec = iov_iter_iovec(iter);
1721                 } else {
1722                         /* fixed buffers import bvec */
1723                         iovec.iov_base = kmap(iter->bvec->bv_page)
1724                                                 + iter->iov_offset;
1725                         iovec.iov_len = min(iter->count,
1726                                         iter->bvec->bv_len - iter->iov_offset);
1727                 }
1728
1729                 if (rw == READ) {
1730                         nr = file->f_op->read(file, iovec.iov_base,
1731                                               iovec.iov_len, &kiocb->ki_pos);
1732                 } else {
1733                         nr = file->f_op->write(file, iovec.iov_base,
1734                                                iovec.iov_len, &kiocb->ki_pos);
1735                 }
1736
1737                 if (iov_iter_is_bvec(iter))
1738                         kunmap(iter->bvec->bv_page);
1739
1740                 if (nr < 0) {
1741                         if (!ret)
1742                                 ret = nr;
1743                         break;
1744                 }
1745                 ret += nr;
1746                 if (nr != iovec.iov_len)
1747                         break;
1748                 iov_iter_advance(iter, nr);
1749         }
1750
1751         return ret;
1752 }
1753
1754 static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size,
1755                           struct iovec *iovec, struct iovec *fast_iov,
1756                           struct iov_iter *iter)
1757 {
1758         req->io->rw.nr_segs = iter->nr_segs;
1759         req->io->rw.size = io_size;
1760         req->io->rw.iov = iovec;
1761         if (!req->io->rw.iov) {
1762                 req->io->rw.iov = req->io->rw.fast_iov;
1763                 memcpy(req->io->rw.iov, fast_iov,
1764                         sizeof(struct iovec) * iter->nr_segs);
1765         }
1766 }
1767
1768 static int io_alloc_async_ctx(struct io_kiocb *req)
1769 {
1770         req->io = kmalloc(sizeof(*req->io), GFP_KERNEL);
1771         return req->io == NULL;
1772 }
1773
1774 static void io_rw_async(struct io_wq_work **workptr)
1775 {
1776         struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
1777         struct iovec *iov = NULL;
1778
1779         if (req->io->rw.iov != req->io->rw.fast_iov)
1780                 iov = req->io->rw.iov;
1781         io_wq_submit_work(workptr);
1782         kfree(iov);
1783 }
1784
1785 static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
1786                              struct iovec *iovec, struct iovec *fast_iov,
1787                              struct iov_iter *iter)
1788 {
1789         if (!req->io && io_alloc_async_ctx(req))
1790                 return -ENOMEM;
1791
1792         io_req_map_rw(req, io_size, iovec, fast_iov, iter);
1793         req->work.func = io_rw_async;
1794         return 0;
1795 }
1796
1797 static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
1798                         bool force_nonblock)
1799 {
1800         struct io_async_ctx *io;
1801         struct iov_iter iter;
1802         ssize_t ret;
1803
1804         ret = io_prep_rw(req, sqe, force_nonblock);
1805         if (ret)
1806                 return ret;
1807
1808         if (unlikely(!(req->file->f_mode & FMODE_READ)))
1809                 return -EBADF;
1810
1811         if (!req->io)
1812                 return 0;
1813
1814         io = req->io;
1815         io->rw.iov = io->rw.fast_iov;
1816         req->io = NULL;
1817         ret = io_import_iovec(READ, req, &io->rw.iov, &iter);
1818         req->io = io;
1819         if (ret < 0)
1820                 return ret;
1821
1822         io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
1823         return 0;
1824 }
1825
1826 static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
1827                    bool force_nonblock)
1828 {
1829         struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
1830         struct kiocb *kiocb = &req->rw.kiocb;
1831         struct iov_iter iter;
1832         size_t iov_count;
1833         ssize_t io_size, ret;
1834
1835         ret = io_import_iovec(READ, req, &iovec, &iter);
1836         if (ret < 0)
1837                 return ret;
1838
1839         /* Ensure we clear previously set non-block flag */
1840         if (!force_nonblock)
1841                 req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT;
1842
1843         io_size = ret;
1844         if (req->flags & REQ_F_LINK)
1845                 req->result = io_size;
1846
1847         /*
1848          * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
1849          * we know to async punt it even if it was opened O_NONBLOCK
1850          */
1851         if (force_nonblock && !io_file_supports_async(req->file)) {
1852                 req->flags |= REQ_F_MUST_PUNT;
1853                 goto copy_iov;
1854         }
1855
1856         iov_count = iov_iter_count(&iter);
1857         ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count);
1858         if (!ret) {
1859                 ssize_t ret2;
1860
1861                 if (req->file->f_op->read_iter)
1862                         ret2 = call_read_iter(req->file, kiocb, &iter);
1863                 else
1864                         ret2 = loop_rw_iter(READ, req->file, kiocb, &iter);
1865
1866                 /*
1867                  * In case of a short read, punt to async. This can happen
1868                  * if we have data partially cached. Alternatively we can
1869                  * return the short read, in which case the application will
1870                  * need to issue another SQE and wait for it. That SQE will
1871                  * need async punt anyway, so it's more efficient to do it
1872                  * here.
1873                  */
1874                 if (force_nonblock && !(req->flags & REQ_F_NOWAIT) &&
1875                     (req->flags & REQ_F_ISREG) &&
1876                     ret2 > 0 && ret2 < io_size)
1877                         ret2 = -EAGAIN;
1878                 /* Catch -EAGAIN return for forced non-blocking submission */
1879                 if (!force_nonblock || ret2 != -EAGAIN) {
1880                         kiocb_done(kiocb, ret2, nxt, req->in_async);
1881                 } else {
1882 copy_iov:
1883                         ret = io_setup_async_rw(req, io_size, iovec,
1884                                                 inline_vecs, &iter);
1885                         if (ret)
1886                                 goto out_free;
1887                         return -EAGAIN;
1888                 }
1889         }
1890 out_free:
1891         if (!io_wq_current_is_worker())
1892                 kfree(iovec);
1893         return ret;
1894 }
1895
1896 static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
1897                          bool force_nonblock)
1898 {
1899         struct io_async_ctx *io;
1900         struct iov_iter iter;
1901         ssize_t ret;
1902
1903         ret = io_prep_rw(req, sqe, force_nonblock);
1904         if (ret)
1905                 return ret;
1906
1907         if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
1908                 return -EBADF;
1909
1910         if (!req->io)
1911                 return 0;
1912
1913         io = req->io;
1914         io->rw.iov = io->rw.fast_iov;
1915         req->io = NULL;
1916         ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter);
1917         req->io = io;
1918         if (ret < 0)
1919                 return ret;
1920
1921         io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
1922         return 0;
1923 }
1924
1925 static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
1926                     bool force_nonblock)
1927 {
1928         struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
1929         struct kiocb *kiocb = &req->rw.kiocb;
1930         struct iov_iter iter;
1931         size_t iov_count;
1932         ssize_t ret, io_size;
1933
1934         ret = io_import_iovec(WRITE, req, &iovec, &iter);
1935         if (ret < 0)
1936                 return ret;
1937
1938         /* Ensure we clear previously set non-block flag */
1939         if (!force_nonblock)
1940                 req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT;
1941
1942         io_size = ret;
1943         if (req->flags & REQ_F_LINK)
1944                 req->result = io_size;
1945
1946         /*
1947          * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
1948          * we know to async punt it even if it was opened O_NONBLOCK
1949          */
1950         if (force_nonblock && !io_file_supports_async(req->file)) {
1951                 req->flags |= REQ_F_MUST_PUNT;
1952                 goto copy_iov;
1953         }
1954
1955         /* file path doesn't support NOWAIT for non-direct_IO */
1956         if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
1957             (req->flags & REQ_F_ISREG))
1958                 goto copy_iov;
1959
1960         iov_count = iov_iter_count(&iter);
1961         ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count);
1962         if (!ret) {
1963                 ssize_t ret2;
1964
1965                 /*
1966                  * Open-code file_start_write here to grab freeze protection,
1967                  * which will be released by another thread in
1968                  * io_complete_rw().  Fool lockdep by telling it the lock got
1969                  * released so that it doesn't complain about the held lock when
1970                  * we return to userspace.
1971                  */
1972                 if (req->flags & REQ_F_ISREG) {
1973                         __sb_start_write(file_inode(req->file)->i_sb,
1974                                                 SB_FREEZE_WRITE, true);
1975                         __sb_writers_release(file_inode(req->file)->i_sb,
1976                                                 SB_FREEZE_WRITE);
1977                 }
1978                 kiocb->ki_flags |= IOCB_WRITE;
1979
1980                 if (req->file->f_op->write_iter)
1981                         ret2 = call_write_iter(req->file, kiocb, &iter);
1982                 else
1983                         ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter);
1984                 if (!force_nonblock || ret2 != -EAGAIN) {
1985                         kiocb_done(kiocb, ret2, nxt, req->in_async);
1986                 } else {
1987 copy_iov:
1988                         ret = io_setup_async_rw(req, io_size, iovec,
1989                                                 inline_vecs, &iter);
1990                         if (ret)
1991                                 goto out_free;
1992                         return -EAGAIN;
1993                 }
1994         }
1995 out_free:
1996         if (!io_wq_current_is_worker())
1997                 kfree(iovec);
1998         return ret;
1999 }
2000
2001 /*
2002  * IORING_OP_NOP just posts a completion event, nothing else.
2003  */
2004 static int io_nop(struct io_kiocb *req)
2005 {
2006         struct io_ring_ctx *ctx = req->ctx;
2007
2008         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
2009                 return -EINVAL;
2010
2011         io_cqring_add_event(req, 0);
2012         io_put_req(req);
2013         return 0;
2014 }
2015
2016 static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2017 {
2018         struct io_ring_ctx *ctx = req->ctx;
2019
2020         if (!req->file)
2021                 return -EBADF;
2022
2023         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
2024                 return -EINVAL;
2025         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
2026                 return -EINVAL;
2027
2028         req->sync.flags = READ_ONCE(sqe->fsync_flags);
2029         if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
2030                 return -EINVAL;
2031
2032         req->sync.off = READ_ONCE(sqe->off);
2033         req->sync.len = READ_ONCE(sqe->len);
2034         return 0;
2035 }
2036
2037 static bool io_req_cancelled(struct io_kiocb *req)
2038 {
2039         if (req->work.flags & IO_WQ_WORK_CANCEL) {
2040                 req_set_fail_links(req);
2041                 io_cqring_add_event(req, -ECANCELED);
2042                 io_put_req(req);
2043                 return true;
2044         }
2045
2046         return false;
2047 }
2048
2049 static void io_fsync_finish(struct io_wq_work **workptr)
2050 {
2051         struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
2052         loff_t end = req->sync.off + req->sync.len;
2053         struct io_kiocb *nxt = NULL;
2054         int ret;
2055
2056         if (io_req_cancelled(req))
2057                 return;
2058
2059         ret = vfs_fsync_range(req->file, req->sync.off,
2060                                 end > 0 ? end : LLONG_MAX,
2061                                 req->sync.flags & IORING_FSYNC_DATASYNC);
2062         if (ret < 0)
2063                 req_set_fail_links(req);
2064         io_cqring_add_event(req, ret);
2065         io_put_req_find_next(req, &nxt);
2066         if (nxt)
2067                 *workptr = &nxt->work;
2068 }
2069
2070 static int io_fsync(struct io_kiocb *req, struct io_kiocb **nxt,
2071                     bool force_nonblock)
2072 {
2073         struct io_wq_work *work, *old_work;
2074
2075         /* fsync always requires a blocking context */
2076         if (force_nonblock) {
2077                 io_put_req(req);
2078                 req->work.func = io_fsync_finish;
2079                 return -EAGAIN;
2080         }
2081
2082         work = old_work = &req->work;
2083         io_fsync_finish(&work);
2084         if (work && work != old_work)
2085                 *nxt = container_of(work, struct io_kiocb, work);
2086         return 0;
2087 }
2088
2089 static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2090 {
2091         struct io_ring_ctx *ctx = req->ctx;
2092
2093         if (!req->file)
2094                 return -EBADF;
2095
2096         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
2097                 return -EINVAL;
2098         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
2099                 return -EINVAL;
2100
2101         req->sync.off = READ_ONCE(sqe->off);
2102         req->sync.len = READ_ONCE(sqe->len);
2103         req->sync.flags = READ_ONCE(sqe->sync_range_flags);
2104         return 0;
2105 }
2106
2107 static void io_sync_file_range_finish(struct io_wq_work **workptr)
2108 {
2109         struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
2110         struct io_kiocb *nxt = NULL;
2111         int ret;
2112
2113         if (io_req_cancelled(req))
2114                 return;
2115
2116         ret = sync_file_range(req->file, req->sync.off, req->sync.len,
2117                                 req->sync.flags);
2118         if (ret < 0)
2119                 req_set_fail_links(req);
2120         io_cqring_add_event(req, ret);
2121         io_put_req_find_next(req, &nxt);
2122         if (nxt)
2123                 *workptr = &nxt->work;
2124 }
2125
2126 static int io_sync_file_range(struct io_kiocb *req, struct io_kiocb **nxt,
2127                               bool force_nonblock)
2128 {
2129         struct io_wq_work *work, *old_work;
2130
2131         /* sync_file_range always requires a blocking context */
2132         if (force_nonblock) {
2133                 io_put_req(req);
2134                 req->work.func = io_sync_file_range_finish;
2135                 return -EAGAIN;
2136         }
2137
2138         work = old_work = &req->work;
2139         io_sync_file_range_finish(&work);
2140         if (work && work != old_work)
2141                 *nxt = container_of(work, struct io_kiocb, work);
2142         return 0;
2143 }
2144
2145 #if defined(CONFIG_NET)
2146 static void io_sendrecv_async(struct io_wq_work **workptr)
2147 {
2148         struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
2149         struct iovec *iov = NULL;
2150
2151         if (req->io->rw.iov != req->io->rw.fast_iov)
2152                 iov = req->io->msg.iov;
2153         io_wq_submit_work(workptr);
2154         kfree(iov);
2155 }
2156 #endif
2157
2158 static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2159 {
2160 #if defined(CONFIG_NET)
2161         struct io_sr_msg *sr = &req->sr_msg;
2162         struct io_async_ctx *io = req->io;
2163
2164         sr->msg_flags = READ_ONCE(sqe->msg_flags);
2165         sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
2166
2167         if (!io)
2168                 return 0;
2169
2170         io->msg.iov = io->msg.fast_iov;
2171         return sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
2172                                         &io->msg.iov);
2173 #else
2174         return -EOPNOTSUPP;
2175 #endif
2176 }
2177
2178 static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt,
2179                       bool force_nonblock)
2180 {
2181 #if defined(CONFIG_NET)
2182         struct io_async_msghdr *kmsg = NULL;
2183         struct socket *sock;
2184         int ret;
2185
2186         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
2187                 return -EINVAL;
2188
2189         sock = sock_from_file(req->file, &ret);
2190         if (sock) {
2191                 struct io_async_ctx io;
2192                 struct sockaddr_storage addr;
2193                 unsigned flags;
2194
2195                 if (req->io) {
2196                         kmsg = &req->io->msg;
2197                         kmsg->msg.msg_name = &addr;
2198                         /* if iov is set, it's allocated already */
2199                         if (!kmsg->iov)
2200                                 kmsg->iov = kmsg->fast_iov;
2201                         kmsg->msg.msg_iter.iov = kmsg->iov;
2202                 } else {
2203                         struct io_sr_msg *sr = &req->sr_msg;
2204
2205                         kmsg = &io.msg;
2206                         kmsg->msg.msg_name = &addr;
2207
2208                         io.msg.iov = io.msg.fast_iov;
2209                         ret = sendmsg_copy_msghdr(&io.msg.msg, sr->msg,
2210                                         sr->msg_flags, &io.msg.iov);
2211                         if (ret)
2212                                 return ret;
2213                 }
2214
2215                 flags = req->sr_msg.msg_flags;
2216                 if (flags & MSG_DONTWAIT)
2217                         req->flags |= REQ_F_NOWAIT;
2218                 else if (force_nonblock)
2219                         flags |= MSG_DONTWAIT;
2220
2221                 ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
2222                 if (force_nonblock && ret == -EAGAIN) {
2223                         if (req->io)
2224                                 return -EAGAIN;
2225                         if (io_alloc_async_ctx(req))
2226                                 return -ENOMEM;
2227                         memcpy(&req->io->msg, &io.msg, sizeof(io.msg));
2228                         req->work.func = io_sendrecv_async;
2229                         return -EAGAIN;
2230                 }
2231                 if (ret == -ERESTARTSYS)
2232                         ret = -EINTR;
2233         }
2234
2235         if (!io_wq_current_is_worker() && kmsg && kmsg->iov != kmsg->fast_iov)
2236                 kfree(kmsg->iov);
2237         io_cqring_add_event(req, ret);
2238         if (ret < 0)
2239                 req_set_fail_links(req);
2240         io_put_req_find_next(req, nxt);
2241         return 0;
2242 #else
2243         return -EOPNOTSUPP;
2244 #endif
2245 }
2246
2247 static int io_recvmsg_prep(struct io_kiocb *req,
2248                            const struct io_uring_sqe *sqe)
2249 {
2250 #if defined(CONFIG_NET)
2251         struct io_sr_msg *sr = &req->sr_msg;
2252         struct io_async_ctx *io = req->io;
2253
2254         sr->msg_flags = READ_ONCE(sqe->msg_flags);
2255         sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
2256
2257         if (!io)
2258                 return 0;
2259
2260         io->msg.iov = io->msg.fast_iov;
2261         return recvmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
2262                                         &io->msg.uaddr, &io->msg.iov);
2263 #else
2264         return -EOPNOTSUPP;
2265 #endif
2266 }
2267
2268 static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt,
2269                       bool force_nonblock)
2270 {
2271 #if defined(CONFIG_NET)
2272         struct io_async_msghdr *kmsg = NULL;
2273         struct socket *sock;
2274         int ret;
2275
2276         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
2277                 return -EINVAL;
2278
2279         sock = sock_from_file(req->file, &ret);
2280         if (sock) {
2281                 struct io_async_ctx io;
2282                 struct sockaddr_storage addr;
2283                 unsigned flags;
2284
2285                 if (req->io) {
2286                         kmsg = &req->io->msg;
2287                         kmsg->msg.msg_name = &addr;
2288                         /* if iov is set, it's allocated already */
2289                         if (!kmsg->iov)
2290                                 kmsg->iov = kmsg->fast_iov;
2291                         kmsg->msg.msg_iter.iov = kmsg->iov;
2292                 } else {
2293                         struct io_sr_msg *sr = &req->sr_msg;
2294
2295                         kmsg = &io.msg;
2296                         kmsg->msg.msg_name = &addr;
2297
2298                         io.msg.iov = io.msg.fast_iov;
2299                         ret = recvmsg_copy_msghdr(&io.msg.msg, sr->msg,
2300                                         sr->msg_flags, &io.msg.uaddr,
2301                                         &io.msg.iov);
2302                         if (ret)
2303                                 return ret;
2304                 }
2305
2306                 flags = req->sr_msg.msg_flags;
2307                 if (flags & MSG_DONTWAIT)
2308                         req->flags |= REQ_F_NOWAIT;
2309                 else if (force_nonblock)
2310                         flags |= MSG_DONTWAIT;
2311
2312                 ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.msg,
2313                                                 kmsg->uaddr, flags);
2314                 if (force_nonblock && ret == -EAGAIN) {
2315                         if (req->io)
2316                                 return -EAGAIN;
2317                         if (io_alloc_async_ctx(req))
2318                                 return -ENOMEM;
2319                         memcpy(&req->io->msg, &io.msg, sizeof(io.msg));
2320                         req->work.func = io_sendrecv_async;
2321                         return -EAGAIN;
2322                 }
2323                 if (ret == -ERESTARTSYS)
2324                         ret = -EINTR;
2325         }
2326
2327         if (!io_wq_current_is_worker() && kmsg && kmsg->iov != kmsg->fast_iov)
2328                 kfree(kmsg->iov);
2329         io_cqring_add_event(req, ret);
2330         if (ret < 0)
2331                 req_set_fail_links(req);
2332         io_put_req_find_next(req, nxt);
2333         return 0;
2334 #else
2335         return -EOPNOTSUPP;
2336 #endif
2337 }
2338
2339 static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2340 {
2341 #if defined(CONFIG_NET)
2342         struct io_accept *accept = &req->accept;
2343
2344         if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
2345                 return -EINVAL;
2346         if (sqe->ioprio || sqe->len || sqe->buf_index)
2347                 return -EINVAL;
2348
2349         accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
2350         accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
2351         accept->flags = READ_ONCE(sqe->accept_flags);
2352         return 0;
2353 #else
2354         return -EOPNOTSUPP;
2355 #endif
2356 }
2357
2358 #if defined(CONFIG_NET)
2359 static int __io_accept(struct io_kiocb *req, struct io_kiocb **nxt,
2360                        bool force_nonblock)
2361 {
2362         struct io_accept *accept = &req->accept;
2363         unsigned file_flags;
2364         int ret;
2365
2366         file_flags = force_nonblock ? O_NONBLOCK : 0;
2367         ret = __sys_accept4_file(req->file, file_flags, accept->addr,
2368                                         accept->addr_len, accept->flags);
2369         if (ret == -EAGAIN && force_nonblock)
2370                 return -EAGAIN;
2371         if (ret == -ERESTARTSYS)
2372                 ret = -EINTR;
2373         if (ret < 0)
2374                 req_set_fail_links(req);
2375         io_cqring_add_event(req, ret);
2376         io_put_req_find_next(req, nxt);
2377         return 0;
2378 }
2379
2380 static void io_accept_finish(struct io_wq_work **workptr)
2381 {
2382         struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
2383         struct io_kiocb *nxt = NULL;
2384
2385         if (io_req_cancelled(req))
2386                 return;
2387         __io_accept(req, &nxt, false);
2388         if (nxt)
2389                 *workptr = &nxt->work;
2390 }
2391 #endif
2392
2393 static int io_accept(struct io_kiocb *req, struct io_kiocb **nxt,
2394                      bool force_nonblock)
2395 {
2396 #if defined(CONFIG_NET)
2397         int ret;
2398
2399         ret = __io_accept(req, nxt, force_nonblock);
2400         if (ret == -EAGAIN && force_nonblock) {
2401                 req->work.func = io_accept_finish;
2402                 req->work.flags |= IO_WQ_WORK_NEEDS_FILES;
2403                 io_put_req(req);
2404                 return -EAGAIN;
2405         }
2406         return 0;
2407 #else
2408         return -EOPNOTSUPP;
2409 #endif
2410 }
2411
2412 static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2413 {
2414 #if defined(CONFIG_NET)
2415         struct io_connect *conn = &req->connect;
2416         struct io_async_ctx *io = req->io;
2417
2418         if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
2419                 return -EINVAL;
2420         if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
2421                 return -EINVAL;
2422
2423         conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
2424         conn->addr_len =  READ_ONCE(sqe->addr2);
2425
2426         if (!io)
2427                 return 0;
2428
2429         return move_addr_to_kernel(conn->addr, conn->addr_len,
2430                                         &io->connect.address);
2431 #else
2432         return -EOPNOTSUPP;
2433 #endif
2434 }
2435
2436 static int io_connect(struct io_kiocb *req, struct io_kiocb **nxt,
2437                       bool force_nonblock)
2438 {
2439 #if defined(CONFIG_NET)
2440         struct io_async_ctx __io, *io;
2441         unsigned file_flags;
2442         int ret;
2443
2444         if (req->io) {
2445                 io = req->io;
2446         } else {
2447                 ret = move_addr_to_kernel(req->connect.addr,
2448                                                 req->connect.addr_len,
2449                                                 &__io.connect.address);
2450                 if (ret)
2451                         goto out;
2452                 io = &__io;
2453         }
2454
2455         file_flags = force_nonblock ? O_NONBLOCK : 0;
2456
2457         ret = __sys_connect_file(req->file, &io->connect.address,
2458                                         req->connect.addr_len, file_flags);
2459         if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
2460                 if (req->io)
2461                         return -EAGAIN;
2462                 if (io_alloc_async_ctx(req)) {
2463                         ret = -ENOMEM;
2464                         goto out;
2465                 }
2466                 memcpy(&req->io->connect, &__io.connect, sizeof(__io.connect));
2467                 return -EAGAIN;
2468         }
2469         if (ret == -ERESTARTSYS)
2470                 ret = -EINTR;
2471 out:
2472         if (ret < 0)
2473                 req_set_fail_links(req);
2474         io_cqring_add_event(req, ret);
2475         io_put_req_find_next(req, nxt);
2476         return 0;
2477 #else
2478         return -EOPNOTSUPP;
2479 #endif
2480 }
2481
2482 static void io_poll_remove_one(struct io_kiocb *req)
2483 {
2484         struct io_poll_iocb *poll = &req->poll;
2485
2486         spin_lock(&poll->head->lock);
2487         WRITE_ONCE(poll->canceled, true);
2488         if (!list_empty(&poll->wait.entry)) {
2489                 list_del_init(&poll->wait.entry);
2490                 io_queue_async_work(req);
2491         }
2492         spin_unlock(&poll->head->lock);
2493         hash_del(&req->hash_node);
2494 }
2495
2496 static void io_poll_remove_all(struct io_ring_ctx *ctx)
2497 {
2498         struct hlist_node *tmp;
2499         struct io_kiocb *req;
2500         int i;
2501
2502         spin_lock_irq(&ctx->completion_lock);
2503         for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
2504                 struct hlist_head *list;
2505
2506                 list = &ctx->cancel_hash[i];
2507                 hlist_for_each_entry_safe(req, tmp, list, hash_node)
2508                         io_poll_remove_one(req);
2509         }
2510         spin_unlock_irq(&ctx->completion_lock);
2511 }
2512
2513 static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
2514 {
2515         struct hlist_head *list;
2516         struct io_kiocb *req;
2517
2518         list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
2519         hlist_for_each_entry(req, list, hash_node) {
2520                 if (sqe_addr == req->user_data) {
2521                         io_poll_remove_one(req);
2522                         return 0;
2523                 }
2524         }
2525
2526         return -ENOENT;
2527 }
2528
2529 static int io_poll_remove_prep(struct io_kiocb *req,
2530                                const struct io_uring_sqe *sqe)
2531 {
2532         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
2533                 return -EINVAL;
2534         if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
2535             sqe->poll_events)
2536                 return -EINVAL;
2537
2538         req->poll.addr = READ_ONCE(sqe->addr);
2539         return 0;
2540 }
2541
2542 /*
2543  * Find a running poll command that matches one specified in sqe->addr,
2544  * and remove it if found.
2545  */
2546 static int io_poll_remove(struct io_kiocb *req)
2547 {
2548         struct io_ring_ctx *ctx = req->ctx;
2549         u64 addr;
2550         int ret;
2551
2552         addr = req->poll.addr;
2553         spin_lock_irq(&ctx->completion_lock);
2554         ret = io_poll_cancel(ctx, addr);
2555         spin_unlock_irq(&ctx->completion_lock);
2556
2557         io_cqring_add_event(req, ret);
2558         if (ret < 0)
2559                 req_set_fail_links(req);
2560         io_put_req(req);
2561         return 0;
2562 }
2563
2564 static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
2565 {
2566         struct io_ring_ctx *ctx = req->ctx;
2567
2568         req->poll.done = true;
2569         if (error)
2570                 io_cqring_fill_event(req, error);
2571         else
2572                 io_cqring_fill_event(req, mangle_poll(mask));
2573         io_commit_cqring(ctx);
2574 }
2575
2576 static void io_poll_complete_work(struct io_wq_work **workptr)
2577 {
2578         struct io_wq_work *work = *workptr;
2579         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
2580         struct io_poll_iocb *poll = &req->poll;
2581         struct poll_table_struct pt = { ._key = poll->events };
2582         struct io_ring_ctx *ctx = req->ctx;
2583         struct io_kiocb *nxt = NULL;
2584         __poll_t mask = 0;
2585         int ret = 0;
2586
2587         if (work->flags & IO_WQ_WORK_CANCEL) {
2588                 WRITE_ONCE(poll->canceled, true);
2589                 ret = -ECANCELED;
2590         } else if (READ_ONCE(poll->canceled)) {
2591                 ret = -ECANCELED;
2592         }
2593
2594         if (ret != -ECANCELED)
2595                 mask = vfs_poll(poll->file, &pt) & poll->events;
2596
2597         /*
2598          * Note that ->ki_cancel callers also delete iocb from active_reqs after
2599          * calling ->ki_cancel.  We need the ctx_lock roundtrip here to
2600          * synchronize with them.  In the cancellation case the list_del_init
2601          * itself is not actually needed, but harmless so we keep it in to
2602          * avoid further branches in the fast path.
2603          */
2604         spin_lock_irq(&ctx->completion_lock);
2605         if (!mask && ret != -ECANCELED) {
2606                 add_wait_queue(poll->head, &poll->wait);
2607                 spin_unlock_irq(&ctx->completion_lock);
2608                 return;
2609         }
2610         hash_del(&req->hash_node);
2611         io_poll_complete(req, mask, ret);
2612         spin_unlock_irq(&ctx->completion_lock);
2613
2614         io_cqring_ev_posted(ctx);
2615
2616         if (ret < 0)
2617                 req_set_fail_links(req);
2618         io_put_req_find_next(req, &nxt);
2619         if (nxt)
2620                 *workptr = &nxt->work;
2621 }
2622
2623 static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
2624                         void *key)
2625 {
2626         struct io_poll_iocb *poll = wait->private;
2627         struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
2628         struct io_ring_ctx *ctx = req->ctx;
2629         __poll_t mask = key_to_poll(key);
2630         unsigned long flags;
2631
2632         /* for instances that support it check for an event match first: */
2633         if (mask && !(mask & poll->events))
2634                 return 0;
2635
2636         list_del_init(&poll->wait.entry);
2637
2638         /*
2639          * Run completion inline if we can. We're using trylock here because
2640          * we are violating the completion_lock -> poll wq lock ordering.
2641          * If we have a link timeout we're going to need the completion_lock
2642          * for finalizing the request, mark us as having grabbed that already.
2643          */
2644         if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) {
2645                 hash_del(&req->hash_node);
2646                 io_poll_complete(req, mask, 0);
2647                 req->flags |= REQ_F_COMP_LOCKED;
2648                 io_put_req(req);
2649                 spin_unlock_irqrestore(&ctx->completion_lock, flags);
2650
2651                 io_cqring_ev_posted(ctx);
2652         } else {
2653                 io_queue_async_work(req);
2654         }
2655
2656         return 1;
2657 }
2658
2659 struct io_poll_table {
2660         struct poll_table_struct pt;
2661         struct io_kiocb *req;
2662         int error;
2663 };
2664
2665 static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
2666                                struct poll_table_struct *p)
2667 {
2668         struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
2669
2670         if (unlikely(pt->req->poll.head)) {
2671                 pt->error = -EINVAL;
2672                 return;
2673         }
2674
2675         pt->error = 0;
2676         pt->req->poll.head = head;
2677         add_wait_queue(head, &pt->req->poll.wait);
2678 }
2679
2680 static void io_poll_req_insert(struct io_kiocb *req)
2681 {
2682         struct io_ring_ctx *ctx = req->ctx;
2683         struct hlist_head *list;
2684
2685         list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
2686         hlist_add_head(&req->hash_node, list);
2687 }
2688
2689 static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2690 {
2691         struct io_poll_iocb *poll = &req->poll;
2692         u16 events;
2693
2694         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
2695                 return -EINVAL;
2696         if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
2697                 return -EINVAL;
2698         if (!poll->file)
2699                 return -EBADF;
2700
2701         events = READ_ONCE(sqe->poll_events);
2702         poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
2703         return 0;
2704 }
2705
2706 static int io_poll_add(struct io_kiocb *req, struct io_kiocb **nxt)
2707 {
2708         struct io_poll_iocb *poll = &req->poll;
2709         struct io_ring_ctx *ctx = req->ctx;
2710         struct io_poll_table ipt;
2711         bool cancel = false;
2712         __poll_t mask;
2713
2714         INIT_IO_WORK(&req->work, io_poll_complete_work);
2715         INIT_HLIST_NODE(&req->hash_node);
2716
2717         poll->head = NULL;
2718         poll->done = false;
2719         poll->canceled = false;
2720
2721         ipt.pt._qproc = io_poll_queue_proc;
2722         ipt.pt._key = poll->events;
2723         ipt.req = req;
2724         ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
2725
2726         /* initialized the list so that we can do list_empty checks */
2727         INIT_LIST_HEAD(&poll->wait.entry);
2728         init_waitqueue_func_entry(&poll->wait, io_poll_wake);
2729         poll->wait.private = poll;
2730
2731         INIT_LIST_HEAD(&req->list);
2732
2733         mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
2734
2735         spin_lock_irq(&ctx->completion_lock);
2736         if (likely(poll->head)) {
2737                 spin_lock(&poll->head->lock);
2738                 if (unlikely(list_empty(&poll->wait.entry))) {
2739                         if (ipt.error)
2740                                 cancel = true;
2741                         ipt.error = 0;
2742                         mask = 0;
2743                 }
2744                 if (mask || ipt.error)
2745                         list_del_init(&poll->wait.entry);
2746                 else if (cancel)
2747                         WRITE_ONCE(poll->canceled, true);
2748                 else if (!poll->done) /* actually waiting for an event */
2749                         io_poll_req_insert(req);
2750                 spin_unlock(&poll->head->lock);
2751         }
2752         if (mask) { /* no async, we'd stolen it */
2753                 ipt.error = 0;
2754                 io_poll_complete(req, mask, 0);
2755         }
2756         spin_unlock_irq(&ctx->completion_lock);
2757
2758         if (mask) {
2759                 io_cqring_ev_posted(ctx);
2760                 io_put_req_find_next(req, nxt);
2761         }
2762         return ipt.error;
2763 }
2764
2765 static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
2766 {
2767         struct io_timeout_data *data = container_of(timer,
2768                                                 struct io_timeout_data, timer);
2769         struct io_kiocb *req = data->req;
2770         struct io_ring_ctx *ctx = req->ctx;
2771         unsigned long flags;
2772
2773         atomic_inc(&ctx->cq_timeouts);
2774
2775         spin_lock_irqsave(&ctx->completion_lock, flags);
2776         /*
2777          * We could be racing with timeout deletion. If the list is empty,
2778          * then timeout lookup already found it and will be handling it.
2779          */
2780         if (!list_empty(&req->list)) {
2781                 struct io_kiocb *prev;
2782
2783                 /*
2784                  * Adjust the reqs sequence before the current one because it
2785                  * will consume a slot in the cq_ring and the cq_tail
2786                  * pointer will be increased, otherwise other timeout reqs may
2787                  * return in advance without waiting for enough wait_nr.
2788                  */
2789                 prev = req;
2790                 list_for_each_entry_continue_reverse(prev, &ctx->timeout_list, list)
2791                         prev->sequence++;
2792                 list_del_init(&req->list);
2793         }
2794
2795         io_cqring_fill_event(req, -ETIME);
2796         io_commit_cqring(ctx);
2797         spin_unlock_irqrestore(&ctx->completion_lock, flags);
2798
2799         io_cqring_ev_posted(ctx);
2800         req_set_fail_links(req);
2801         io_put_req(req);
2802         return HRTIMER_NORESTART;
2803 }
2804
2805 static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
2806 {
2807         struct io_kiocb *req;
2808         int ret = -ENOENT;
2809
2810         list_for_each_entry(req, &ctx->timeout_list, list) {
2811                 if (user_data == req->user_data) {
2812                         list_del_init(&req->list);
2813                         ret = 0;
2814                         break;
2815                 }
2816         }
2817
2818         if (ret == -ENOENT)
2819                 return ret;
2820
2821         ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
2822         if (ret == -1)
2823                 return -EALREADY;
2824
2825         req_set_fail_links(req);
2826         io_cqring_fill_event(req, -ECANCELED);
2827         io_put_req(req);
2828         return 0;
2829 }
2830
2831 static int io_timeout_remove_prep(struct io_kiocb *req,
2832                                   const struct io_uring_sqe *sqe)
2833 {
2834         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
2835                 return -EINVAL;
2836         if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len)
2837                 return -EINVAL;
2838
2839         req->timeout.addr = READ_ONCE(sqe->addr);
2840         req->timeout.flags = READ_ONCE(sqe->timeout_flags);
2841         if (req->timeout.flags)
2842                 return -EINVAL;
2843
2844         return 0;
2845 }
2846
2847 /*
2848  * Remove or update an existing timeout command
2849  */
2850 static int io_timeout_remove(struct io_kiocb *req)
2851 {
2852         struct io_ring_ctx *ctx = req->ctx;
2853         int ret;
2854
2855         spin_lock_irq(&ctx->completion_lock);
2856         ret = io_timeout_cancel(ctx, req->timeout.addr);
2857
2858         io_cqring_fill_event(req, ret);
2859         io_commit_cqring(ctx);
2860         spin_unlock_irq(&ctx->completion_lock);
2861         io_cqring_ev_posted(ctx);
2862         if (ret < 0)
2863                 req_set_fail_links(req);
2864         io_put_req(req);
2865         return 0;
2866 }
2867
2868 static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2869                            bool is_timeout_link)
2870 {
2871         struct io_timeout_data *data;
2872         unsigned flags;
2873
2874         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
2875                 return -EINVAL;
2876         if (sqe->ioprio || sqe->buf_index || sqe->len != 1)
2877                 return -EINVAL;
2878         if (sqe->off && is_timeout_link)
2879                 return -EINVAL;
2880         flags = READ_ONCE(sqe->timeout_flags);
2881         if (flags & ~IORING_TIMEOUT_ABS)
2882                 return -EINVAL;
2883
2884         req->timeout.count = READ_ONCE(sqe->off);
2885
2886         if (!req->io && io_alloc_async_ctx(req))
2887                 return -ENOMEM;
2888
2889         data = &req->io->timeout;
2890         data->req = req;
2891         req->flags |= REQ_F_TIMEOUT;
2892
2893         if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
2894                 return -EFAULT;
2895
2896         if (flags & IORING_TIMEOUT_ABS)
2897                 data->mode = HRTIMER_MODE_ABS;
2898         else
2899                 data->mode = HRTIMER_MODE_REL;
2900
2901         hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
2902         return 0;
2903 }
2904
2905 static int io_timeout(struct io_kiocb *req)
2906 {
2907         unsigned count;
2908         struct io_ring_ctx *ctx = req->ctx;
2909         struct io_timeout_data *data;
2910         struct list_head *entry;
2911         unsigned span = 0;
2912
2913         data = &req->io->timeout;
2914
2915         /*
2916          * sqe->off holds how many events that need to occur for this
2917          * timeout event to be satisfied. If it isn't set, then this is
2918          * a pure timeout request, sequence isn't used.
2919          */
2920         count = req->timeout.count;
2921         if (!count) {
2922                 req->flags |= REQ_F_TIMEOUT_NOSEQ;
2923                 spin_lock_irq(&ctx->completion_lock);
2924                 entry = ctx->timeout_list.prev;
2925                 goto add;
2926         }
2927
2928         req->sequence = ctx->cached_sq_head + count - 1;
2929         data->seq_offset = count;
2930
2931         /*
2932          * Insertion sort, ensuring the first entry in the list is always
2933          * the one we need first.
2934          */
2935         spin_lock_irq(&ctx->completion_lock);
2936         list_for_each_prev(entry, &ctx->timeout_list) {
2937                 struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
2938                 unsigned nxt_sq_head;
2939                 long long tmp, tmp_nxt;
2940                 u32 nxt_offset = nxt->io->timeout.seq_offset;
2941
2942                 if (nxt->flags & REQ_F_TIMEOUT_NOSEQ)
2943                         continue;
2944
2945                 /*
2946                  * Since cached_sq_head + count - 1 can overflow, use type long
2947                  * long to store it.
2948                  */
2949                 tmp = (long long)ctx->cached_sq_head + count - 1;
2950                 nxt_sq_head = nxt->sequence - nxt_offset + 1;
2951                 tmp_nxt = (long long)nxt_sq_head + nxt_offset - 1;
2952
2953                 /*
2954                  * cached_sq_head may overflow, and it will never overflow twice
2955                  * once there is some timeout req still be valid.
2956                  */
2957                 if (ctx->cached_sq_head < nxt_sq_head)
2958                         tmp += UINT_MAX;
2959
2960                 if (tmp > tmp_nxt)
2961                         break;
2962
2963                 /*
2964                  * Sequence of reqs after the insert one and itself should
2965                  * be adjusted because each timeout req consumes a slot.
2966                  */
2967                 span++;
2968                 nxt->sequence++;
2969         }
2970         req->sequence -= span;
2971 add:
2972         list_add(&req->list, entry);
2973         data->timer.function = io_timeout_fn;
2974         hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
2975         spin_unlock_irq(&ctx->completion_lock);
2976         return 0;
2977 }
2978
2979 static bool io_cancel_cb(struct io_wq_work *work, void *data)
2980 {
2981         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
2982
2983         return req->user_data == (unsigned long) data;
2984 }
2985
2986 static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr)
2987 {
2988         enum io_wq_cancel cancel_ret;
2989         int ret = 0;
2990
2991         cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr);
2992         switch (cancel_ret) {
2993         case IO_WQ_CANCEL_OK:
2994                 ret = 0;
2995                 break;
2996         case IO_WQ_CANCEL_RUNNING:
2997                 ret = -EALREADY;
2998                 break;
2999         case IO_WQ_CANCEL_NOTFOUND:
3000                 ret = -ENOENT;
3001                 break;
3002         }
3003
3004         return ret;
3005 }
3006
3007 static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
3008                                      struct io_kiocb *req, __u64 sqe_addr,
3009                                      struct io_kiocb **nxt, int success_ret)
3010 {
3011         unsigned long flags;
3012         int ret;
3013
3014         ret = io_async_cancel_one(ctx, (void *) (unsigned long) sqe_addr);
3015         if (ret != -ENOENT) {
3016                 spin_lock_irqsave(&ctx->completion_lock, flags);
3017                 goto done;
3018         }
3019
3020         spin_lock_irqsave(&ctx->completion_lock, flags);
3021         ret = io_timeout_cancel(ctx, sqe_addr);
3022         if (ret != -ENOENT)
3023                 goto done;
3024         ret = io_poll_cancel(ctx, sqe_addr);
3025 done:
3026         if (!ret)
3027                 ret = success_ret;
3028         io_cqring_fill_event(req, ret);
3029         io_commit_cqring(ctx);
3030         spin_unlock_irqrestore(&ctx->completion_lock, flags);
3031         io_cqring_ev_posted(ctx);
3032
3033         if (ret < 0)
3034                 req_set_fail_links(req);
3035         io_put_req_find_next(req, nxt);
3036 }
3037
3038 static int io_async_cancel_prep(struct io_kiocb *req,
3039                                 const struct io_uring_sqe *sqe)
3040 {
3041         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3042                 return -EINVAL;
3043         if (sqe->flags || sqe->ioprio || sqe->off || sqe->len ||
3044             sqe->cancel_flags)
3045                 return -EINVAL;
3046
3047         req->cancel.addr = READ_ONCE(sqe->addr);
3048         return 0;
3049 }
3050
3051 static int io_async_cancel(struct io_kiocb *req, struct io_kiocb **nxt)
3052 {
3053         struct io_ring_ctx *ctx = req->ctx;
3054
3055         io_async_find_and_cancel(ctx, req, req->cancel.addr, nxt, 0);
3056         return 0;
3057 }
3058
3059 static int io_req_defer_prep(struct io_kiocb *req,
3060                              const struct io_uring_sqe *sqe)
3061 {
3062         ssize_t ret = 0;
3063
3064         switch (req->opcode) {
3065         case IORING_OP_NOP:
3066                 break;
3067         case IORING_OP_READV:
3068         case IORING_OP_READ_FIXED:
3069                 ret = io_read_prep(req, sqe, true);
3070                 break;
3071         case IORING_OP_WRITEV:
3072         case IORING_OP_WRITE_FIXED:
3073                 ret = io_write_prep(req, sqe, true);
3074                 break;
3075         case IORING_OP_POLL_ADD:
3076                 ret = io_poll_add_prep(req, sqe);
3077                 break;
3078         case IORING_OP_POLL_REMOVE:
3079                 ret = io_poll_remove_prep(req, sqe);
3080                 break;
3081         case IORING_OP_FSYNC:
3082                 ret = io_prep_fsync(req, sqe);
3083                 break;
3084         case IORING_OP_SYNC_FILE_RANGE:
3085                 ret = io_prep_sfr(req, sqe);
3086                 break;
3087         case IORING_OP_SENDMSG:
3088                 ret = io_sendmsg_prep(req, sqe);
3089                 break;
3090         case IORING_OP_RECVMSG:
3091                 ret = io_recvmsg_prep(req, sqe);
3092                 break;
3093         case IORING_OP_CONNECT:
3094                 ret = io_connect_prep(req, sqe);
3095                 break;
3096         case IORING_OP_TIMEOUT:
3097                 ret = io_timeout_prep(req, sqe, false);
3098                 break;
3099         case IORING_OP_TIMEOUT_REMOVE:
3100                 ret = io_timeout_remove_prep(req, sqe);
3101                 break;
3102         case IORING_OP_ASYNC_CANCEL:
3103                 ret = io_async_cancel_prep(req, sqe);
3104                 break;
3105         case IORING_OP_LINK_TIMEOUT:
3106                 ret = io_timeout_prep(req, sqe, true);
3107                 break;
3108         case IORING_OP_ACCEPT:
3109                 ret = io_accept_prep(req, sqe);
3110                 break;
3111         default:
3112                 printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
3113                                 req->opcode);
3114                 ret = -EINVAL;
3115                 break;
3116         }
3117
3118         return ret;
3119 }
3120
3121 static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3122 {
3123         struct io_ring_ctx *ctx = req->ctx;
3124         int ret;
3125
3126         /* Still need defer if there is pending req in defer list. */
3127         if (!req_need_defer(req) && list_empty(&ctx->defer_list))
3128                 return 0;
3129
3130         if (!req->io && io_alloc_async_ctx(req))
3131                 return -EAGAIN;
3132
3133         ret = io_req_defer_prep(req, sqe);
3134         if (ret < 0)
3135                 return ret;
3136
3137         spin_lock_irq(&ctx->completion_lock);
3138         if (!req_need_defer(req) && list_empty(&ctx->defer_list)) {
3139                 spin_unlock_irq(&ctx->completion_lock);
3140                 return 0;
3141         }
3142
3143         trace_io_uring_defer(ctx, req, req->user_data);
3144         list_add_tail(&req->list, &ctx->defer_list);
3145         spin_unlock_irq(&ctx->completion_lock);
3146         return -EIOCBQUEUED;
3147 }
3148
3149 static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
3150                         struct io_kiocb **nxt, bool force_nonblock)
3151 {
3152         struct io_ring_ctx *ctx = req->ctx;
3153         int ret;
3154
3155         switch (req->opcode) {
3156         case IORING_OP_NOP:
3157                 ret = io_nop(req);
3158                 break;
3159         case IORING_OP_READV:
3160         case IORING_OP_READ_FIXED:
3161                 if (sqe) {
3162                         ret = io_read_prep(req, sqe, force_nonblock);
3163                         if (ret < 0)
3164                                 break;
3165                 }
3166                 ret = io_read(req, nxt, force_nonblock);
3167                 break;
3168         case IORING_OP_WRITEV:
3169         case IORING_OP_WRITE_FIXED:
3170                 if (sqe) {
3171                         ret = io_write_prep(req, sqe, force_nonblock);
3172                         if (ret < 0)
3173                                 break;
3174                 }
3175                 ret = io_write(req, nxt, force_nonblock);
3176                 break;
3177         case IORING_OP_FSYNC:
3178                 if (sqe) {
3179                         ret = io_prep_fsync(req, sqe);
3180                         if (ret < 0)
3181                                 break;
3182                 }
3183                 ret = io_fsync(req, nxt, force_nonblock);
3184                 break;
3185         case IORING_OP_POLL_ADD:
3186                 if (sqe) {
3187                         ret = io_poll_add_prep(req, sqe);
3188                         if (ret)
3189                                 break;
3190                 }
3191                 ret = io_poll_add(req, nxt);
3192                 break;
3193         case IORING_OP_POLL_REMOVE:
3194                 if (sqe) {
3195                         ret = io_poll_remove_prep(req, sqe);
3196                         if (ret < 0)
3197                                 break;
3198                 }
3199                 ret = io_poll_remove(req);
3200                 break;
3201         case IORING_OP_SYNC_FILE_RANGE:
3202                 if (sqe) {
3203                         ret = io_prep_sfr(req, sqe);
3204                         if (ret < 0)
3205                                 break;
3206                 }
3207                 ret = io_sync_file_range(req, nxt, force_nonblock);
3208                 break;
3209         case IORING_OP_SENDMSG:
3210                 if (sqe) {
3211                         ret = io_sendmsg_prep(req, sqe);
3212                         if (ret < 0)
3213                                 break;
3214                 }
3215                 ret = io_sendmsg(req, nxt, force_nonblock);
3216                 break;
3217         case IORING_OP_RECVMSG:
3218                 if (sqe) {
3219                         ret = io_recvmsg_prep(req, sqe);
3220                         if (ret)
3221                                 break;
3222                 }
3223                 ret = io_recvmsg(req, nxt, force_nonblock);
3224                 break;
3225         case IORING_OP_TIMEOUT:
3226                 if (sqe) {
3227                         ret = io_timeout_prep(req, sqe, false);
3228                         if (ret)
3229                                 break;
3230                 }
3231                 ret = io_timeout(req);
3232                 break;
3233         case IORING_OP_TIMEOUT_REMOVE:
3234                 if (sqe) {
3235                         ret = io_timeout_remove_prep(req, sqe);
3236                         if (ret)
3237                                 break;
3238                 }
3239                 ret = io_timeout_remove(req);
3240                 break;
3241         case IORING_OP_ACCEPT:
3242                 if (sqe) {
3243                         ret = io_accept_prep(req, sqe);
3244                         if (ret)
3245                                 break;
3246                 }
3247                 ret = io_accept(req, nxt, force_nonblock);
3248                 break;
3249         case IORING_OP_CONNECT:
3250                 if (sqe) {
3251                         ret = io_connect_prep(req, sqe);
3252                         if (ret)
3253                                 break;
3254                 }
3255                 ret = io_connect(req, nxt, force_nonblock);
3256                 break;
3257         case IORING_OP_ASYNC_CANCEL:
3258                 if (sqe) {
3259                         ret = io_async_cancel_prep(req, sqe);
3260                         if (ret)
3261                                 break;
3262                 }
3263                 ret = io_async_cancel(req, nxt);
3264                 break;
3265         default:
3266                 ret = -EINVAL;
3267                 break;
3268         }
3269
3270         if (ret)
3271                 return ret;
3272
3273         if (ctx->flags & IORING_SETUP_IOPOLL) {
3274                 if (req->result == -EAGAIN)
3275                         return -EAGAIN;
3276
3277                 io_iopoll_req_issued(req);
3278         }
3279
3280         return 0;
3281 }
3282
3283 static void io_link_work_cb(struct io_wq_work **workptr)
3284 {
3285         struct io_wq_work *work = *workptr;
3286         struct io_kiocb *link = work->data;
3287
3288         io_queue_linked_timeout(link);
3289         work->func = io_wq_submit_work;
3290 }
3291
3292 static void io_wq_submit_work(struct io_wq_work **workptr)
3293 {
3294         struct io_wq_work *work = *workptr;
3295         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
3296         struct io_kiocb *nxt = NULL;
3297         int ret = 0;
3298
3299         if (work->flags & IO_WQ_WORK_CANCEL)
3300                 ret = -ECANCELED;
3301
3302         if (!ret) {
3303                 req->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0;
3304                 req->in_async = true;
3305                 do {
3306                         ret = io_issue_sqe(req, NULL, &nxt, false);
3307                         /*
3308                          * We can get EAGAIN for polled IO even though we're
3309                          * forcing a sync submission from here, since we can't
3310                          * wait for request slots on the block side.
3311                          */
3312                         if (ret != -EAGAIN)
3313                                 break;
3314                         cond_resched();
3315                 } while (1);
3316         }
3317
3318         /* drop submission reference */
3319         io_put_req(req);
3320
3321         if (ret) {
3322                 req_set_fail_links(req);
3323                 io_cqring_add_event(req, ret);
3324                 io_put_req(req);
3325         }
3326
3327         /* if a dependent link is ready, pass it back */
3328         if (!ret && nxt) {
3329                 struct io_kiocb *link;
3330
3331                 io_prep_async_work(nxt, &link);
3332                 *workptr = &nxt->work;
3333                 if (link) {
3334                         nxt->work.flags |= IO_WQ_WORK_CB;
3335                         nxt->work.func = io_link_work_cb;
3336                         nxt->work.data = link;
3337                 }
3338         }
3339 }
3340
3341 static bool io_req_op_valid(int op)
3342 {
3343         return op >= IORING_OP_NOP && op < IORING_OP_LAST;
3344 }
3345
3346 static int io_req_needs_file(struct io_kiocb *req)
3347 {
3348         switch (req->opcode) {
3349         case IORING_OP_NOP:
3350         case IORING_OP_POLL_REMOVE:
3351         case IORING_OP_TIMEOUT:
3352         case IORING_OP_TIMEOUT_REMOVE:
3353         case IORING_OP_ASYNC_CANCEL:
3354         case IORING_OP_LINK_TIMEOUT:
3355                 return 0;
3356         default:
3357                 if (io_req_op_valid(req->opcode))
3358                         return 1;
3359                 return -EINVAL;
3360         }
3361 }
3362
3363 static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
3364                                               int index)
3365 {
3366         struct fixed_file_table *table;
3367
3368         table = &ctx->file_table[index >> IORING_FILE_TABLE_SHIFT];
3369         return table->files[index & IORING_FILE_TABLE_MASK];
3370 }
3371
3372 static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
3373                            const struct io_uring_sqe *sqe)
3374 {
3375         struct io_ring_ctx *ctx = req->ctx;
3376         unsigned flags;
3377         int fd, ret;
3378
3379         flags = READ_ONCE(sqe->flags);
3380         fd = READ_ONCE(sqe->fd);
3381
3382         if (flags & IOSQE_IO_DRAIN)
3383                 req->flags |= REQ_F_IO_DRAIN;
3384
3385         ret = io_req_needs_file(req);
3386         if (ret <= 0)
3387                 return ret;
3388
3389         if (flags & IOSQE_FIXED_FILE) {
3390                 if (unlikely(!ctx->file_table ||
3391                     (unsigned) fd >= ctx->nr_user_files))
3392                         return -EBADF;
3393                 fd = array_index_nospec(fd, ctx->nr_user_files);
3394                 req->file = io_file_from_index(ctx, fd);
3395                 if (!req->file)
3396                         return -EBADF;
3397                 req->flags |= REQ_F_FIXED_FILE;
3398         } else {
3399                 if (req->needs_fixed_file)
3400                         return -EBADF;
3401                 trace_io_uring_file_get(ctx, fd);
3402                 req->file = io_file_get(state, fd);
3403                 if (unlikely(!req->file))
3404                         return -EBADF;
3405         }
3406
3407         return 0;
3408 }
3409
3410 static int io_grab_files(struct io_kiocb *req)
3411 {
3412         int ret = -EBADF;
3413         struct io_ring_ctx *ctx = req->ctx;
3414
3415         rcu_read_lock();
3416         spin_lock_irq(&ctx->inflight_lock);
3417         /*
3418          * We use the f_ops->flush() handler to ensure that we can flush
3419          * out work accessing these files if the fd is closed. Check if
3420          * the fd has changed since we started down this path, and disallow
3421          * this operation if it has.
3422          */
3423         if (fcheck(req->ring_fd) == req->ring_file) {
3424                 list_add(&req->inflight_entry, &ctx->inflight_list);
3425                 req->flags |= REQ_F_INFLIGHT;
3426                 req->work.files = current->files;
3427                 ret = 0;
3428         }
3429         spin_unlock_irq(&ctx->inflight_lock);
3430         rcu_read_unlock();
3431
3432         return ret;
3433 }
3434
3435 static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
3436 {
3437         struct io_timeout_data *data = container_of(timer,
3438                                                 struct io_timeout_data, timer);
3439         struct io_kiocb *req = data->req;
3440         struct io_ring_ctx *ctx = req->ctx;
3441         struct io_kiocb *prev = NULL;
3442         unsigned long flags;
3443
3444         spin_lock_irqsave(&ctx->completion_lock, flags);
3445
3446         /*
3447          * We don't expect the list to be empty, that will only happen if we
3448          * race with the completion of the linked work.
3449          */
3450         if (!list_empty(&req->link_list)) {
3451                 prev = list_entry(req->link_list.prev, struct io_kiocb,
3452                                   link_list);
3453                 if (refcount_inc_not_zero(&prev->refs)) {
3454                         list_del_init(&req->link_list);
3455                         prev->flags &= ~REQ_F_LINK_TIMEOUT;
3456                 } else
3457                         prev = NULL;
3458         }
3459
3460         spin_unlock_irqrestore(&ctx->completion_lock, flags);
3461
3462         if (prev) {
3463                 req_set_fail_links(prev);
3464                 io_async_find_and_cancel(ctx, req, prev->user_data, NULL,
3465                                                 -ETIME);
3466                 io_put_req(prev);
3467         } else {
3468                 io_cqring_add_event(req, -ETIME);
3469                 io_put_req(req);
3470         }
3471         return HRTIMER_NORESTART;
3472 }
3473
3474 static void io_queue_linked_timeout(struct io_kiocb *req)
3475 {
3476         struct io_ring_ctx *ctx = req->ctx;
3477
3478         /*
3479          * If the list is now empty, then our linked request finished before
3480          * we got a chance to setup the timer
3481          */
3482         spin_lock_irq(&ctx->completion_lock);
3483         if (!list_empty(&req->link_list)) {
3484                 struct io_timeout_data *data = &req->io->timeout;
3485
3486                 data->timer.function = io_link_timeout_fn;
3487                 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
3488                                 data->mode);
3489         }
3490         spin_unlock_irq(&ctx->completion_lock);
3491
3492         /* drop submission reference */
3493         io_put_req(req);
3494 }
3495
3496 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
3497 {
3498         struct io_kiocb *nxt;
3499
3500         if (!(req->flags & REQ_F_LINK))
3501                 return NULL;
3502
3503         nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb,
3504                                         link_list);
3505         if (!nxt || nxt->opcode != IORING_OP_LINK_TIMEOUT)
3506                 return NULL;
3507
3508         req->flags |= REQ_F_LINK_TIMEOUT;
3509         return nxt;
3510 }
3511
3512 static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3513 {
3514         struct io_kiocb *linked_timeout;
3515         struct io_kiocb *nxt = NULL;
3516         int ret;
3517
3518 again:
3519         linked_timeout = io_prep_linked_timeout(req);
3520
3521         ret = io_issue_sqe(req, sqe, &nxt, true);
3522
3523         /*
3524          * We async punt it if the file wasn't marked NOWAIT, or if the file
3525          * doesn't support non-blocking read/write attempts
3526          */
3527         if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
3528             (req->flags & REQ_F_MUST_PUNT))) {
3529                 if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) {
3530                         ret = io_grab_files(req);
3531                         if (ret)
3532                                 goto err;
3533                 }
3534
3535                 /*
3536                  * Queued up for async execution, worker will release
3537                  * submit reference when the iocb is actually submitted.
3538                  */
3539                 io_queue_async_work(req);
3540                 goto done_req;
3541         }
3542
3543 err:
3544         /* drop submission reference */
3545         io_put_req(req);
3546
3547         if (linked_timeout) {
3548                 if (!ret)
3549                         io_queue_linked_timeout(linked_timeout);
3550                 else
3551                         io_put_req(linked_timeout);
3552         }
3553
3554         /* and drop final reference, if we failed */
3555         if (ret) {
3556                 io_cqring_add_event(req, ret);
3557                 req_set_fail_links(req);
3558                 io_put_req(req);
3559         }
3560 done_req:
3561         if (nxt) {
3562                 req = nxt;
3563                 nxt = NULL;
3564                 goto again;
3565         }
3566 }
3567
3568 static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3569 {
3570         int ret;
3571
3572         if (unlikely(req->ctx->drain_next)) {
3573                 req->flags |= REQ_F_IO_DRAIN;
3574                 req->ctx->drain_next = false;
3575         }
3576         req->ctx->drain_next = (req->flags & REQ_F_DRAIN_LINK);
3577
3578         ret = io_req_defer(req, sqe);
3579         if (ret) {
3580                 if (ret != -EIOCBQUEUED) {
3581                         io_cqring_add_event(req, ret);
3582                         req_set_fail_links(req);
3583                         io_double_put_req(req);
3584                 }
3585         } else
3586                 __io_queue_sqe(req, sqe);
3587 }
3588
3589 static inline void io_queue_link_head(struct io_kiocb *req)
3590 {
3591         if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
3592                 io_cqring_add_event(req, -ECANCELED);
3593                 io_double_put_req(req);
3594         } else
3595                 io_queue_sqe(req, NULL);
3596 }
3597
3598 #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
3599                                 IOSQE_IO_HARDLINK)
3600
3601 static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
3602                           struct io_submit_state *state, struct io_kiocb **link)
3603 {
3604         struct io_ring_ctx *ctx = req->ctx;
3605         int ret;
3606
3607         /* enforce forwards compatibility on users */
3608         if (unlikely(sqe->flags & ~SQE_VALID_FLAGS)) {
3609                 ret = -EINVAL;
3610                 goto err_req;
3611         }
3612
3613         ret = io_req_set_file(state, req, sqe);
3614         if (unlikely(ret)) {
3615 err_req:
3616                 io_cqring_add_event(req, ret);
3617                 io_double_put_req(req);
3618                 return false;
3619         }
3620
3621         /*
3622          * If we already have a head request, queue this one for async
3623          * submittal once the head completes. If we don't have a head but
3624          * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
3625          * submitted sync once the chain is complete. If none of those
3626          * conditions are true (normal request), then just queue it.
3627          */
3628         if (*link) {
3629                 struct io_kiocb *prev = *link;
3630
3631                 if (sqe->flags & IOSQE_IO_DRAIN)
3632                         (*link)->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN;
3633
3634                 if (sqe->flags & IOSQE_IO_HARDLINK)
3635                         req->flags |= REQ_F_HARDLINK;
3636
3637                 if (io_alloc_async_ctx(req)) {
3638                         ret = -EAGAIN;
3639                         goto err_req;
3640                 }
3641
3642                 ret = io_req_defer_prep(req, sqe);
3643                 if (ret) {
3644                         /* fail even hard links since we don't submit */
3645                         prev->flags |= REQ_F_FAIL_LINK;
3646                         goto err_req;
3647                 }
3648                 trace_io_uring_link(ctx, req, prev);
3649                 list_add_tail(&req->link_list, &prev->link_list);
3650         } else if (sqe->flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) {
3651                 req->flags |= REQ_F_LINK;
3652                 if (sqe->flags & IOSQE_IO_HARDLINK)
3653                         req->flags |= REQ_F_HARDLINK;
3654
3655                 INIT_LIST_HEAD(&req->link_list);
3656                 ret = io_req_defer_prep(req, sqe);
3657                 if (ret)
3658                         req->flags |= REQ_F_FAIL_LINK;
3659                 *link = req;
3660         } else {
3661                 io_queue_sqe(req, sqe);
3662         }
3663
3664         return true;
3665 }
3666
3667 /*
3668  * Batched submission is done, ensure local IO is flushed out.
3669  */
3670 static void io_submit_state_end(struct io_submit_state *state)
3671 {
3672         blk_finish_plug(&state->plug);
3673         io_file_put(state);
3674         if (state->free_reqs)
3675                 kmem_cache_free_bulk(req_cachep, state->free_reqs,
3676                                         &state->reqs[state->cur_req]);
3677 }
3678
3679 /*
3680  * Start submission side cache.
3681  */
3682 static void io_submit_state_start(struct io_submit_state *state,
3683                                   unsigned int max_ios)
3684 {
3685         blk_start_plug(&state->plug);
3686         state->free_reqs = 0;
3687         state->file = NULL;
3688         state->ios_left = max_ios;
3689 }
3690
3691 static void io_commit_sqring(struct io_ring_ctx *ctx)
3692 {
3693         struct io_rings *rings = ctx->rings;
3694
3695         if (ctx->cached_sq_head != READ_ONCE(rings->sq.head)) {
3696                 /*
3697                  * Ensure any loads from the SQEs are done at this point,
3698                  * since once we write the new head, the application could
3699                  * write new data to them.
3700                  */
3701                 smp_store_release(&rings->sq.head, ctx->cached_sq_head);
3702         }
3703 }
3704
3705 /*
3706  * Fetch an sqe, if one is available. Note that sqe_ptr will point to memory
3707  * that is mapped by userspace. This means that care needs to be taken to
3708  * ensure that reads are stable, as we cannot rely on userspace always
3709  * being a good citizen. If members of the sqe are validated and then later
3710  * used, it's important that those reads are done through READ_ONCE() to
3711  * prevent a re-load down the line.
3712  */
3713 static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req,
3714                           const struct io_uring_sqe **sqe_ptr)
3715 {
3716         struct io_rings *rings = ctx->rings;
3717         u32 *sq_array = ctx->sq_array;
3718         unsigned head;
3719
3720         /*
3721          * The cached sq head (or cq tail) serves two purposes:
3722          *
3723          * 1) allows us to batch the cost of updating the user visible
3724          *    head updates.
3725          * 2) allows the kernel side to track the head on its own, even
3726          *    though the application is the one updating it.
3727          */
3728         head = ctx->cached_sq_head;
3729         /* make sure SQ entry isn't read before tail */
3730         if (unlikely(head == smp_load_acquire(&rings->sq.tail)))
3731                 return false;
3732
3733         head = READ_ONCE(sq_array[head & ctx->sq_mask]);
3734         if (likely(head < ctx->sq_entries)) {
3735                 /*
3736                  * All io need record the previous position, if LINK vs DARIN,
3737                  * it can be used to mark the position of the first IO in the
3738                  * link list.
3739                  */
3740                 req->sequence = ctx->cached_sq_head;
3741                 *sqe_ptr = &ctx->sq_sqes[head];
3742                 req->opcode = READ_ONCE((*sqe_ptr)->opcode);
3743                 req->user_data = READ_ONCE((*sqe_ptr)->user_data);
3744                 ctx->cached_sq_head++;
3745                 return true;
3746         }
3747
3748         /* drop invalid entries */
3749         ctx->cached_sq_head++;
3750         ctx->cached_sq_dropped++;
3751         WRITE_ONCE(rings->sq_dropped, ctx->cached_sq_dropped);
3752         return false;
3753 }
3754
3755 static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
3756                           struct file *ring_file, int ring_fd,
3757                           struct mm_struct **mm, bool async)
3758 {
3759         struct io_submit_state state, *statep = NULL;
3760         struct io_kiocb *link = NULL;
3761         int i, submitted = 0;
3762         bool mm_fault = false;
3763
3764         /* if we have a backlog and couldn't flush it all, return BUSY */
3765         if (!list_empty(&ctx->cq_overflow_list) &&
3766             !io_cqring_overflow_flush(ctx, false))
3767                 return -EBUSY;
3768
3769         if (nr > IO_PLUG_THRESHOLD) {
3770                 io_submit_state_start(&state, nr);
3771                 statep = &state;
3772         }
3773
3774         for (i = 0; i < nr; i++) {
3775                 const struct io_uring_sqe *sqe;
3776                 struct io_kiocb *req;
3777                 unsigned int sqe_flags;
3778
3779                 req = io_get_req(ctx, statep);
3780                 if (unlikely(!req)) {
3781                         if (!submitted)
3782                                 submitted = -EAGAIN;
3783                         break;
3784                 }
3785                 if (!io_get_sqring(ctx, req, &sqe)) {
3786                         __io_free_req(req);
3787                         break;
3788                 }
3789
3790                 if (io_req_needs_user(req) && !*mm) {
3791                         mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm);
3792                         if (!mm_fault) {
3793                                 use_mm(ctx->sqo_mm);
3794                                 *mm = ctx->sqo_mm;
3795                         }
3796                 }
3797
3798                 submitted++;
3799                 sqe_flags = sqe->flags;
3800
3801                 req->ring_file = ring_file;
3802                 req->ring_fd = ring_fd;
3803                 req->has_user = *mm != NULL;
3804                 req->in_async = async;
3805                 req->needs_fixed_file = async;
3806                 trace_io_uring_submit_sqe(ctx, req->user_data, true, async);
3807                 if (!io_submit_sqe(req, sqe, statep, &link))
3808                         break;
3809                 /*
3810                  * If previous wasn't linked and we have a linked command,
3811                  * that's the end of the chain. Submit the previous link.
3812                  */
3813                 if (!(sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) && link) {
3814                         io_queue_link_head(link);
3815                         link = NULL;
3816                 }
3817         }
3818
3819         if (link)
3820                 io_queue_link_head(link);
3821         if (statep)
3822                 io_submit_state_end(&state);
3823
3824          /* Commit SQ ring head once we've consumed and submitted all SQEs */
3825         io_commit_sqring(ctx);
3826
3827         return submitted;
3828 }
3829
3830 static int io_sq_thread(void *data)
3831 {
3832         struct io_ring_ctx *ctx = data;
3833         struct mm_struct *cur_mm = NULL;
3834         const struct cred *old_cred;
3835         mm_segment_t old_fs;
3836         DEFINE_WAIT(wait);
3837         unsigned inflight;
3838         unsigned long timeout;
3839         int ret;
3840
3841         complete(&ctx->completions[1]);
3842
3843         old_fs = get_fs();
3844         set_fs(USER_DS);
3845         old_cred = override_creds(ctx->creds);
3846
3847         ret = timeout = inflight = 0;
3848         while (!kthread_should_park()) {
3849                 unsigned int to_submit;
3850
3851                 if (inflight) {
3852                         unsigned nr_events = 0;
3853
3854                         if (ctx->flags & IORING_SETUP_IOPOLL) {
3855                                 /*
3856                                  * inflight is the count of the maximum possible
3857                                  * entries we submitted, but it can be smaller
3858                                  * if we dropped some of them. If we don't have
3859                                  * poll entries available, then we know that we
3860                                  * have nothing left to poll for. Reset the
3861                                  * inflight count to zero in that case.
3862                                  */
3863                                 mutex_lock(&ctx->uring_lock);
3864                                 if (!list_empty(&ctx->poll_list))
3865                                         __io_iopoll_check(ctx, &nr_events, 0);
3866                                 else
3867                                         inflight = 0;
3868                                 mutex_unlock(&ctx->uring_lock);
3869                         } else {
3870                                 /*
3871                                  * Normal IO, just pretend everything completed.
3872                                  * We don't have to poll completions for that.
3873                                  */
3874                                 nr_events = inflight;
3875                         }
3876
3877                         inflight -= nr_events;
3878                         if (!inflight)
3879                                 timeout = jiffies + ctx->sq_thread_idle;
3880                 }
3881
3882                 to_submit = io_sqring_entries(ctx);
3883
3884                 /*
3885                  * If submit got -EBUSY, flag us as needing the application
3886                  * to enter the kernel to reap and flush events.
3887                  */
3888                 if (!to_submit || ret == -EBUSY) {
3889                         /*
3890                          * We're polling. If we're within the defined idle
3891                          * period, then let us spin without work before going
3892                          * to sleep. The exception is if we got EBUSY doing
3893                          * more IO, we should wait for the application to
3894                          * reap events and wake us up.
3895                          */
3896                         if (inflight ||
3897                             (!time_after(jiffies, timeout) && ret != -EBUSY)) {
3898                                 cond_resched();
3899                                 continue;
3900                         }
3901
3902                         /*
3903                          * Drop cur_mm before scheduling, we can't hold it for
3904                          * long periods (or over schedule()). Do this before
3905                          * adding ourselves to the waitqueue, as the unuse/drop
3906                          * may sleep.
3907                          */
3908                         if (cur_mm) {
3909                                 unuse_mm(cur_mm);
3910                                 mmput(cur_mm);
3911                                 cur_mm = NULL;
3912                         }
3913
3914                         prepare_to_wait(&ctx->sqo_wait, &wait,
3915                                                 TASK_INTERRUPTIBLE);
3916
3917                         /* Tell userspace we may need a wakeup call */
3918                         ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
3919                         /* make sure to read SQ tail after writing flags */
3920                         smp_mb();
3921
3922                         to_submit = io_sqring_entries(ctx);
3923                         if (!to_submit || ret == -EBUSY) {
3924                                 if (kthread_should_park()) {
3925                                         finish_wait(&ctx->sqo_wait, &wait);
3926                                         break;
3927                                 }
3928                                 if (signal_pending(current))
3929                                         flush_signals(current);
3930                                 schedule();
3931                                 finish_wait(&ctx->sqo_wait, &wait);
3932
3933                                 ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
3934                                 continue;
3935                         }
3936                         finish_wait(&ctx->sqo_wait, &wait);
3937
3938                         ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
3939                 }
3940
3941                 to_submit = min(to_submit, ctx->sq_entries);
3942                 mutex_lock(&ctx->uring_lock);
3943                 ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true);
3944                 mutex_unlock(&ctx->uring_lock);
3945                 if (ret > 0)
3946                         inflight += ret;
3947         }
3948
3949         set_fs(old_fs);
3950         if (cur_mm) {
3951                 unuse_mm(cur_mm);
3952                 mmput(cur_mm);
3953         }
3954         revert_creds(old_cred);
3955
3956         kthread_parkme();
3957
3958         return 0;
3959 }
3960
3961 struct io_wait_queue {
3962         struct wait_queue_entry wq;
3963         struct io_ring_ctx *ctx;
3964         unsigned to_wait;
3965         unsigned nr_timeouts;
3966 };
3967
3968 static inline bool io_should_wake(struct io_wait_queue *iowq, bool noflush)
3969 {
3970         struct io_ring_ctx *ctx = iowq->ctx;
3971
3972         /*
3973          * Wake up if we have enough events, or if a timeout occurred since we
3974          * started waiting. For timeouts, we always want to return to userspace,
3975          * regardless of event count.
3976          */
3977         return io_cqring_events(ctx, noflush) >= iowq->to_wait ||
3978                         atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
3979 }
3980
3981 static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
3982                             int wake_flags, void *key)
3983 {
3984         struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
3985                                                         wq);
3986
3987         /* use noflush == true, as we can't safely rely on locking context */
3988         if (!io_should_wake(iowq, true))
3989                 return -1;
3990
3991         return autoremove_wake_function(curr, mode, wake_flags, key);
3992 }
3993
3994 /*
3995  * Wait until events become available, if we don't already have some. The
3996  * application must reap them itself, as they reside on the shared cq ring.
3997  */
3998 static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
3999                           const sigset_t __user *sig, size_t sigsz)
4000 {
4001         struct io_wait_queue iowq = {
4002                 .wq = {
4003                         .private        = current,
4004                         .func           = io_wake_function,
4005                         .entry          = LIST_HEAD_INIT(iowq.wq.entry),
4006                 },
4007                 .ctx            = ctx,
4008                 .to_wait        = min_events,
4009         };
4010         struct io_rings *rings = ctx->rings;
4011         int ret = 0;
4012
4013         if (io_cqring_events(ctx, false) >= min_events)
4014                 return 0;
4015
4016         if (sig) {
4017 #ifdef CONFIG_COMPAT
4018                 if (in_compat_syscall())
4019                         ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
4020                                                       sigsz);
4021                 else
4022 #endif
4023                         ret = set_user_sigmask(sig, sigsz);
4024
4025                 if (ret)
4026                         return ret;
4027         }
4028
4029         iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
4030         trace_io_uring_cqring_wait(ctx, min_events);
4031         do {
4032                 prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
4033                                                 TASK_INTERRUPTIBLE);
4034                 if (io_should_wake(&iowq, false))
4035                         break;
4036                 schedule();
4037                 if (signal_pending(current)) {
4038                         ret = -EINTR;
4039                         break;
4040                 }
4041         } while (1);
4042         finish_wait(&ctx->wait, &iowq.wq);
4043
4044         restore_saved_sigmask_unless(ret == -EINTR);
4045
4046         return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
4047 }
4048
4049 static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
4050 {
4051 #if defined(CONFIG_UNIX)
4052         if (ctx->ring_sock) {
4053                 struct sock *sock = ctx->ring_sock->sk;
4054                 struct sk_buff *skb;
4055
4056                 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
4057                         kfree_skb(skb);
4058         }
4059 #else
4060         int i;
4061
4062         for (i = 0; i < ctx->nr_user_files; i++) {
4063                 struct file *file;
4064
4065                 file = io_file_from_index(ctx, i);
4066                 if (file)
4067                         fput(file);
4068         }
4069 #endif
4070 }
4071
4072 static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
4073 {
4074         unsigned nr_tables, i;
4075
4076         if (!ctx->file_table)
4077                 return -ENXIO;
4078
4079         __io_sqe_files_unregister(ctx);
4080         nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
4081         for (i = 0; i < nr_tables; i++)
4082                 kfree(ctx->file_table[i].files);
4083         kfree(ctx->file_table);
4084         ctx->file_table = NULL;
4085         ctx->nr_user_files = 0;
4086         return 0;
4087 }
4088
4089 static void io_sq_thread_stop(struct io_ring_ctx *ctx)
4090 {
4091         if (ctx->sqo_thread) {
4092                 wait_for_completion(&ctx->completions[1]);
4093                 /*
4094                  * The park is a bit of a work-around, without it we get
4095                  * warning spews on shutdown with SQPOLL set and affinity
4096                  * set to a single CPU.
4097                  */
4098                 kthread_park(ctx->sqo_thread);
4099                 kthread_stop(ctx->sqo_thread);
4100                 ctx->sqo_thread = NULL;
4101         }
4102 }
4103
4104 static void io_finish_async(struct io_ring_ctx *ctx)
4105 {
4106         io_sq_thread_stop(ctx);
4107
4108         if (ctx->io_wq) {
4109                 io_wq_destroy(ctx->io_wq);
4110                 ctx->io_wq = NULL;
4111         }
4112 }
4113
4114 #if defined(CONFIG_UNIX)
4115 static void io_destruct_skb(struct sk_buff *skb)
4116 {
4117         struct io_ring_ctx *ctx = skb->sk->sk_user_data;
4118
4119         if (ctx->io_wq)
4120                 io_wq_flush(ctx->io_wq);
4121
4122         unix_destruct_scm(skb);
4123 }
4124
4125 /*
4126  * Ensure the UNIX gc is aware of our file set, so we are certain that
4127  * the io_uring can be safely unregistered on process exit, even if we have
4128  * loops in the file referencing.
4129  */
4130 static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
4131 {
4132         struct sock *sk = ctx->ring_sock->sk;
4133         struct scm_fp_list *fpl;
4134         struct sk_buff *skb;
4135         int i, nr_files;
4136
4137         if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
4138                 unsigned long inflight = ctx->user->unix_inflight + nr;
4139
4140                 if (inflight > task_rlimit(current, RLIMIT_NOFILE))
4141                         return -EMFILE;
4142         }
4143
4144         fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
4145         if (!fpl)
4146                 return -ENOMEM;
4147
4148         skb = alloc_skb(0, GFP_KERNEL);
4149         if (!skb) {
4150                 kfree(fpl);
4151                 return -ENOMEM;
4152         }
4153
4154         skb->sk = sk;
4155
4156         nr_files = 0;
4157         fpl->user = get_uid(ctx->user);
4158         for (i = 0; i < nr; i++) {
4159                 struct file *file = io_file_from_index(ctx, i + offset);
4160
4161                 if (!file)
4162                         continue;
4163                 fpl->fp[nr_files] = get_file(file);
4164                 unix_inflight(fpl->user, fpl->fp[nr_files]);
4165                 nr_files++;
4166         }
4167
4168         if (nr_files) {
4169                 fpl->max = SCM_MAX_FD;
4170                 fpl->count = nr_files;
4171                 UNIXCB(skb).fp = fpl;
4172                 skb->destructor = io_destruct_skb;
4173                 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
4174                 skb_queue_head(&sk->sk_receive_queue, skb);
4175
4176                 for (i = 0; i < nr_files; i++)
4177                         fput(fpl->fp[i]);
4178         } else {
4179                 kfree_skb(skb);
4180                 kfree(fpl);
4181         }
4182
4183         return 0;
4184 }
4185
4186 /*
4187  * If UNIX sockets are enabled, fd passing can cause a reference cycle which
4188  * causes regular reference counting to break down. We rely on the UNIX
4189  * garbage collection to take care of this problem for us.
4190  */
4191 static int io_sqe_files_scm(struct io_ring_ctx *ctx)
4192 {
4193         unsigned left, total;
4194         int ret = 0;
4195
4196         total = 0;
4197         left = ctx->nr_user_files;
4198         while (left) {
4199                 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
4200
4201                 ret = __io_sqe_files_scm(ctx, this_files, total);
4202                 if (ret)
4203                         break;
4204                 left -= this_files;
4205                 total += this_files;
4206         }
4207
4208         if (!ret)
4209                 return 0;
4210
4211         while (total < ctx->nr_user_files) {
4212                 struct file *file = io_file_from_index(ctx, total);
4213
4214                 if (file)
4215                         fput(file);
4216                 total++;
4217         }
4218
4219         return ret;
4220 }
4221 #else
4222 static int io_sqe_files_scm(struct io_ring_ctx *ctx)
4223 {
4224         return 0;
4225 }
4226 #endif
4227
4228 static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables,
4229                                     unsigned nr_files)
4230 {
4231         int i;
4232
4233         for (i = 0; i < nr_tables; i++) {
4234                 struct fixed_file_table *table = &ctx->file_table[i];
4235                 unsigned this_files;
4236
4237                 this_files = min(nr_files, IORING_MAX_FILES_TABLE);
4238                 table->files = kcalloc(this_files, sizeof(struct file *),
4239                                         GFP_KERNEL);
4240                 if (!table->files)
4241                         break;
4242                 nr_files -= this_files;
4243         }
4244
4245         if (i == nr_tables)
4246                 return 0;
4247
4248         for (i = 0; i < nr_tables; i++) {
4249                 struct fixed_file_table *table = &ctx->file_table[i];
4250                 kfree(table->files);
4251         }
4252         return 1;
4253 }
4254
4255 static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
4256                                  unsigned nr_args)
4257 {
4258         __s32 __user *fds = (__s32 __user *) arg;
4259         unsigned nr_tables;
4260         int fd, ret = 0;
4261         unsigned i;
4262
4263         if (ctx->file_table)
4264                 return -EBUSY;
4265         if (!nr_args)
4266                 return -EINVAL;
4267         if (nr_args > IORING_MAX_FIXED_FILES)
4268                 return -EMFILE;
4269
4270         nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
4271         ctx->file_table = kcalloc(nr_tables, sizeof(struct fixed_file_table),
4272                                         GFP_KERNEL);
4273         if (!ctx->file_table)
4274                 return -ENOMEM;
4275
4276         if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) {
4277                 kfree(ctx->file_table);
4278                 ctx->file_table = NULL;
4279                 return -ENOMEM;
4280         }
4281
4282         for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
4283                 struct fixed_file_table *table;
4284                 unsigned index;
4285
4286                 ret = -EFAULT;
4287                 if (copy_from_user(&fd, &fds[i], sizeof(fd)))
4288                         break;
4289                 /* allow sparse sets */
4290                 if (fd == -1) {
4291                         ret = 0;
4292                         continue;
4293                 }
4294
4295                 table = &ctx->file_table[i >> IORING_FILE_TABLE_SHIFT];
4296                 index = i & IORING_FILE_TABLE_MASK;
4297                 table->files[index] = fget(fd);
4298
4299                 ret = -EBADF;
4300                 if (!table->files[index])
4301                         break;
4302                 /*
4303                  * Don't allow io_uring instances to be registered. If UNIX
4304                  * isn't enabled, then this causes a reference cycle and this
4305                  * instance can never get freed. If UNIX is enabled we'll
4306                  * handle it just fine, but there's still no point in allowing
4307                  * a ring fd as it doesn't support regular read/write anyway.
4308                  */
4309                 if (table->files[index]->f_op == &io_uring_fops) {
4310                         fput(table->files[index]);
4311                         break;
4312                 }
4313                 ret = 0;
4314         }
4315
4316         if (ret) {
4317                 for (i = 0; i < ctx->nr_user_files; i++) {
4318                         struct file *file;
4319
4320                         file = io_file_from_index(ctx, i);
4321                         if (file)
4322                                 fput(file);
4323                 }
4324                 for (i = 0; i < nr_tables; i++)
4325                         kfree(ctx->file_table[i].files);
4326
4327                 kfree(ctx->file_table);
4328                 ctx->file_table = NULL;
4329                 ctx->nr_user_files = 0;
4330                 return ret;
4331         }
4332
4333         ret = io_sqe_files_scm(ctx);
4334         if (ret)
4335                 io_sqe_files_unregister(ctx);
4336
4337         return ret;
4338 }
4339
4340 static void io_sqe_file_unregister(struct io_ring_ctx *ctx, int index)
4341 {
4342 #if defined(CONFIG_UNIX)
4343         struct file *file = io_file_from_index(ctx, index);
4344         struct sock *sock = ctx->ring_sock->sk;
4345         struct sk_buff_head list, *head = &sock->sk_receive_queue;
4346         struct sk_buff *skb;
4347         int i;
4348
4349         __skb_queue_head_init(&list);
4350
4351         /*
4352          * Find the skb that holds this file in its SCM_RIGHTS. When found,
4353          * remove this entry and rearrange the file array.
4354          */
4355         skb = skb_dequeue(head);
4356         while (skb) {
4357                 struct scm_fp_list *fp;
4358
4359                 fp = UNIXCB(skb).fp;
4360                 for (i = 0; i < fp->count; i++) {
4361                         int left;
4362
4363                         if (fp->fp[i] != file)
4364                                 continue;
4365
4366                         unix_notinflight(fp->user, fp->fp[i]);
4367                         left = fp->count - 1 - i;
4368                         if (left) {
4369                                 memmove(&fp->fp[i], &fp->fp[i + 1],
4370                                                 left * sizeof(struct file *));
4371                         }
4372                         fp->count--;
4373                         if (!fp->count) {
4374                                 kfree_skb(skb);
4375                                 skb = NULL;
4376                         } else {
4377                                 __skb_queue_tail(&list, skb);
4378                         }
4379                         fput(file);
4380                         file = NULL;
4381                         break;
4382                 }
4383
4384                 if (!file)
4385                         break;
4386
4387                 __skb_queue_tail(&list, skb);
4388
4389                 skb = skb_dequeue(head);
4390         }
4391
4392         if (skb_peek(&list)) {
4393                 spin_lock_irq(&head->lock);
4394                 while ((skb = __skb_dequeue(&list)) != NULL)
4395                         __skb_queue_tail(head, skb);
4396                 spin_unlock_irq(&head->lock);
4397         }
4398 #else
4399         fput(io_file_from_index(ctx, index));
4400 #endif
4401 }
4402
4403 static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
4404                                 int index)
4405 {
4406 #if defined(CONFIG_UNIX)
4407         struct sock *sock = ctx->ring_sock->sk;
4408         struct sk_buff_head *head = &sock->sk_receive_queue;
4409         struct sk_buff *skb;
4410
4411         /*
4412          * See if we can merge this file into an existing skb SCM_RIGHTS
4413          * file set. If there's no room, fall back to allocating a new skb
4414          * and filling it in.
4415          */
4416         spin_lock_irq(&head->lock);
4417         skb = skb_peek(head);
4418         if (skb) {
4419                 struct scm_fp_list *fpl = UNIXCB(skb).fp;
4420
4421                 if (fpl->count < SCM_MAX_FD) {
4422                         __skb_unlink(skb, head);
4423                         spin_unlock_irq(&head->lock);
4424                         fpl->fp[fpl->count] = get_file(file);
4425                         unix_inflight(fpl->user, fpl->fp[fpl->count]);
4426                         fpl->count++;
4427                         spin_lock_irq(&head->lock);
4428                         __skb_queue_head(head, skb);
4429                 } else {
4430                         skb = NULL;
4431                 }
4432         }
4433         spin_unlock_irq(&head->lock);
4434
4435         if (skb) {
4436                 fput(file);
4437                 return 0;
4438         }
4439
4440         return __io_sqe_files_scm(ctx, 1, index);
4441 #else
4442         return 0;
4443 #endif
4444 }
4445
4446 static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
4447                                unsigned nr_args)
4448 {
4449         struct io_uring_files_update up;
4450         __s32 __user *fds;
4451         int fd, i, err;
4452         __u32 done;
4453
4454         if (!ctx->file_table)
4455                 return -ENXIO;
4456         if (!nr_args)
4457                 return -EINVAL;
4458         if (copy_from_user(&up, arg, sizeof(up)))
4459                 return -EFAULT;
4460         if (check_add_overflow(up.offset, nr_args, &done))
4461                 return -EOVERFLOW;
4462         if (done > ctx->nr_user_files)
4463                 return -EINVAL;
4464
4465         done = 0;
4466         fds = (__s32 __user *) up.fds;
4467         while (nr_args) {
4468                 struct fixed_file_table *table;
4469                 unsigned index;
4470
4471                 err = 0;
4472                 if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
4473                         err = -EFAULT;
4474                         break;
4475                 }
4476                 i = array_index_nospec(up.offset, ctx->nr_user_files);
4477                 table = &ctx->file_table[i >> IORING_FILE_TABLE_SHIFT];
4478                 index = i & IORING_FILE_TABLE_MASK;
4479                 if (table->files[index]) {
4480                         io_sqe_file_unregister(ctx, i);
4481                         table->files[index] = NULL;
4482                 }
4483                 if (fd != -1) {
4484                         struct file *file;
4485
4486                         file = fget(fd);
4487                         if (!file) {
4488                                 err = -EBADF;
4489                                 break;
4490                         }
4491                         /*
4492                          * Don't allow io_uring instances to be registered. If
4493                          * UNIX isn't enabled, then this causes a reference
4494                          * cycle and this instance can never get freed. If UNIX
4495                          * is enabled we'll handle it just fine, but there's
4496                          * still no point in allowing a ring fd as it doesn't
4497                          * support regular read/write anyway.
4498                          */
4499                         if (file->f_op == &io_uring_fops) {
4500                                 fput(file);
4501                                 err = -EBADF;
4502                                 break;
4503                         }
4504                         table->files[index] = file;
4505                         err = io_sqe_file_register(ctx, file, i);
4506                         if (err)
4507                                 break;
4508                 }
4509                 nr_args--;
4510                 done++;
4511                 up.offset++;
4512         }
4513
4514         return done ? done : err;
4515 }
4516
4517 static void io_put_work(struct io_wq_work *work)
4518 {
4519         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
4520
4521         io_put_req(req);
4522 }
4523
4524 static void io_get_work(struct io_wq_work *work)
4525 {
4526         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
4527
4528         refcount_inc(&req->refs);
4529 }
4530
4531 static int io_sq_offload_start(struct io_ring_ctx *ctx,
4532                                struct io_uring_params *p)
4533 {
4534         struct io_wq_data data;
4535         unsigned concurrency;
4536         int ret;
4537
4538         init_waitqueue_head(&ctx->sqo_wait);
4539         mmgrab(current->mm);
4540         ctx->sqo_mm = current->mm;
4541
4542         if (ctx->flags & IORING_SETUP_SQPOLL) {
4543                 ret = -EPERM;
4544                 if (!capable(CAP_SYS_ADMIN))
4545                         goto err;
4546
4547                 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
4548                 if (!ctx->sq_thread_idle)
4549                         ctx->sq_thread_idle = HZ;
4550
4551                 if (p->flags & IORING_SETUP_SQ_AFF) {
4552                         int cpu = p->sq_thread_cpu;
4553
4554                         ret = -EINVAL;
4555                         if (cpu >= nr_cpu_ids)
4556                                 goto err;
4557                         if (!cpu_online(cpu))
4558                                 goto err;
4559
4560                         ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
4561                                                         ctx, cpu,
4562                                                         "io_uring-sq");
4563                 } else {
4564                         ctx->sqo_thread = kthread_create(io_sq_thread, ctx,
4565                                                         "io_uring-sq");
4566                 }
4567                 if (IS_ERR(ctx->sqo_thread)) {
4568                         ret = PTR_ERR(ctx->sqo_thread);
4569                         ctx->sqo_thread = NULL;
4570                         goto err;
4571                 }
4572                 wake_up_process(ctx->sqo_thread);
4573         } else if (p->flags & IORING_SETUP_SQ_AFF) {
4574                 /* Can't have SQ_AFF without SQPOLL */
4575                 ret = -EINVAL;
4576                 goto err;
4577         }
4578
4579         data.mm = ctx->sqo_mm;
4580         data.user = ctx->user;
4581         data.creds = ctx->creds;
4582         data.get_work = io_get_work;
4583         data.put_work = io_put_work;
4584
4585         /* Do QD, or 4 * CPUS, whatever is smallest */
4586         concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
4587         ctx->io_wq = io_wq_create(concurrency, &data);
4588         if (IS_ERR(ctx->io_wq)) {
4589                 ret = PTR_ERR(ctx->io_wq);
4590                 ctx->io_wq = NULL;
4591                 goto err;
4592         }
4593
4594         return 0;
4595 err:
4596         io_finish_async(ctx);
4597         mmdrop(ctx->sqo_mm);
4598         ctx->sqo_mm = NULL;
4599         return ret;
4600 }
4601
4602 static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages)
4603 {
4604         atomic_long_sub(nr_pages, &user->locked_vm);
4605 }
4606
4607 static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
4608 {
4609         unsigned long page_limit, cur_pages, new_pages;
4610
4611         /* Don't allow more pages than we can safely lock */
4612         page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
4613
4614         do {
4615                 cur_pages = atomic_long_read(&user->locked_vm);
4616                 new_pages = cur_pages + nr_pages;
4617                 if (new_pages > page_limit)
4618                         return -ENOMEM;
4619         } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
4620                                         new_pages) != cur_pages);
4621
4622         return 0;
4623 }
4624
4625 static void io_mem_free(void *ptr)
4626 {
4627         struct page *page;
4628
4629         if (!ptr)
4630                 return;
4631
4632         page = virt_to_head_page(ptr);
4633         if (put_page_testzero(page))
4634                 free_compound_page(page);
4635 }
4636
4637 static void *io_mem_alloc(size_t size)
4638 {
4639         gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
4640                                 __GFP_NORETRY;
4641
4642         return (void *) __get_free_pages(gfp_flags, get_order(size));
4643 }
4644
4645 static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
4646                                 size_t *sq_offset)
4647 {
4648         struct io_rings *rings;
4649         size_t off, sq_array_size;
4650
4651         off = struct_size(rings, cqes, cq_entries);
4652         if (off == SIZE_MAX)
4653                 return SIZE_MAX;
4654
4655 #ifdef CONFIG_SMP
4656         off = ALIGN(off, SMP_CACHE_BYTES);
4657         if (off == 0)
4658                 return SIZE_MAX;
4659 #endif
4660
4661         sq_array_size = array_size(sizeof(u32), sq_entries);
4662         if (sq_array_size == SIZE_MAX)
4663                 return SIZE_MAX;
4664
4665         if (check_add_overflow(off, sq_array_size, &off))
4666                 return SIZE_MAX;
4667
4668         if (sq_offset)
4669                 *sq_offset = off;
4670
4671         return off;
4672 }
4673
4674 static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
4675 {
4676         size_t pages;
4677
4678         pages = (size_t)1 << get_order(
4679                 rings_size(sq_entries, cq_entries, NULL));
4680         pages += (size_t)1 << get_order(
4681                 array_size(sizeof(struct io_uring_sqe), sq_entries));
4682
4683         return pages;
4684 }
4685
4686 static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
4687 {
4688         int i, j;
4689
4690         if (!ctx->user_bufs)
4691                 return -ENXIO;
4692
4693         for (i = 0; i < ctx->nr_user_bufs; i++) {
4694                 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
4695
4696                 for (j = 0; j < imu->nr_bvecs; j++)
4697                         put_user_page(imu->bvec[j].bv_page);
4698
4699                 if (ctx->account_mem)
4700                         io_unaccount_mem(ctx->user, imu->nr_bvecs);
4701                 kvfree(imu->bvec);
4702                 imu->nr_bvecs = 0;
4703         }
4704
4705         kfree(ctx->user_bufs);
4706         ctx->user_bufs = NULL;
4707         ctx->nr_user_bufs = 0;
4708         return 0;
4709 }
4710
4711 static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
4712                        void __user *arg, unsigned index)
4713 {
4714         struct iovec __user *src;
4715
4716 #ifdef CONFIG_COMPAT
4717         if (ctx->compat) {
4718                 struct compat_iovec __user *ciovs;
4719                 struct compat_iovec ciov;
4720
4721                 ciovs = (struct compat_iovec __user *) arg;
4722                 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
4723                         return -EFAULT;
4724
4725                 dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
4726                 dst->iov_len = ciov.iov_len;
4727                 return 0;
4728         }
4729 #endif
4730         src = (struct iovec __user *) arg;
4731         if (copy_from_user(dst, &src[index], sizeof(*dst)))
4732                 return -EFAULT;
4733         return 0;
4734 }
4735
4736 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
4737                                   unsigned nr_args)
4738 {
4739         struct vm_area_struct **vmas = NULL;
4740         struct page **pages = NULL;
4741         int i, j, got_pages = 0;
4742         int ret = -EINVAL;
4743
4744         if (ctx->user_bufs)
4745                 return -EBUSY;
4746         if (!nr_args || nr_args > UIO_MAXIOV)
4747                 return -EINVAL;
4748
4749         ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
4750                                         GFP_KERNEL);
4751         if (!ctx->user_bufs)
4752                 return -ENOMEM;
4753
4754         for (i = 0; i < nr_args; i++) {
4755                 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
4756                 unsigned long off, start, end, ubuf;
4757                 int pret, nr_pages;
4758                 struct iovec iov;
4759                 size_t size;
4760
4761                 ret = io_copy_iov(ctx, &iov, arg, i);
4762                 if (ret)
4763                         goto err;
4764
4765                 /*
4766                  * Don't impose further limits on the size and buffer
4767                  * constraints here, we'll -EINVAL later when IO is
4768                  * submitted if they are wrong.
4769                  */
4770                 ret = -EFAULT;
4771                 if (!iov.iov_base || !iov.iov_len)
4772                         goto err;
4773
4774                 /* arbitrary limit, but we need something */
4775                 if (iov.iov_len > SZ_1G)
4776                         goto err;
4777
4778                 ubuf = (unsigned long) iov.iov_base;
4779                 end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
4780                 start = ubuf >> PAGE_SHIFT;
4781                 nr_pages = end - start;
4782
4783                 if (ctx->account_mem) {
4784                         ret = io_account_mem(ctx->user, nr_pages);
4785                         if (ret)
4786                                 goto err;
4787                 }
4788
4789                 ret = 0;
4790                 if (!pages || nr_pages > got_pages) {
4791                         kfree(vmas);
4792                         kfree(pages);
4793                         pages = kvmalloc_array(nr_pages, sizeof(struct page *),
4794                                                 GFP_KERNEL);
4795                         vmas = kvmalloc_array(nr_pages,
4796                                         sizeof(struct vm_area_struct *),
4797                                         GFP_KERNEL);
4798                         if (!pages || !vmas) {
4799                                 ret = -ENOMEM;
4800                                 if (ctx->account_mem)
4801                                         io_unaccount_mem(ctx->user, nr_pages);
4802                                 goto err;
4803                         }
4804                         got_pages = nr_pages;
4805                 }
4806
4807                 imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
4808                                                 GFP_KERNEL);
4809                 ret = -ENOMEM;
4810                 if (!imu->bvec) {
4811                         if (ctx->account_mem)
4812                                 io_unaccount_mem(ctx->user, nr_pages);
4813                         goto err;
4814                 }
4815
4816                 ret = 0;
4817                 down_read(&current->mm->mmap_sem);
4818                 pret = get_user_pages(ubuf, nr_pages,
4819                                       FOLL_WRITE | FOLL_LONGTERM,
4820                                       pages, vmas);
4821                 if (pret == nr_pages) {
4822                         /* don't support file backed memory */
4823                         for (j = 0; j < nr_pages; j++) {
4824                                 struct vm_area_struct *vma = vmas[j];
4825
4826                                 if (vma->vm_file &&
4827                                     !is_file_hugepages(vma->vm_file)) {
4828                                         ret = -EOPNOTSUPP;
4829                                         break;
4830                                 }
4831                         }
4832                 } else {
4833                         ret = pret < 0 ? pret : -EFAULT;
4834                 }
4835                 up_read(&current->mm->mmap_sem);
4836                 if (ret) {
4837                         /*
4838                          * if we did partial map, or found file backed vmas,
4839                          * release any pages we did get
4840                          */
4841                         if (pret > 0)
4842                                 put_user_pages(pages, pret);
4843                         if (ctx->account_mem)
4844                                 io_unaccount_mem(ctx->user, nr_pages);
4845                         kvfree(imu->bvec);
4846                         goto err;
4847                 }
4848
4849                 off = ubuf & ~PAGE_MASK;
4850                 size = iov.iov_len;
4851                 for (j = 0; j < nr_pages; j++) {
4852                         size_t vec_len;
4853
4854                         vec_len = min_t(size_t, size, PAGE_SIZE - off);
4855                         imu->bvec[j].bv_page = pages[j];
4856                         imu->bvec[j].bv_len = vec_len;
4857                         imu->bvec[j].bv_offset = off;
4858                         off = 0;
4859                         size -= vec_len;
4860                 }
4861                 /* store original address for later verification */
4862                 imu->ubuf = ubuf;
4863                 imu->len = iov.iov_len;
4864                 imu->nr_bvecs = nr_pages;
4865
4866                 ctx->nr_user_bufs++;
4867         }
4868         kvfree(pages);
4869         kvfree(vmas);
4870         return 0;
4871 err:
4872         kvfree(pages);
4873         kvfree(vmas);
4874         io_sqe_buffer_unregister(ctx);
4875         return ret;
4876 }
4877
4878 static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
4879 {
4880         __s32 __user *fds = arg;
4881         int fd;
4882
4883         if (ctx->cq_ev_fd)
4884                 return -EBUSY;
4885
4886         if (copy_from_user(&fd, fds, sizeof(*fds)))
4887                 return -EFAULT;
4888
4889         ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
4890         if (IS_ERR(ctx->cq_ev_fd)) {
4891                 int ret = PTR_ERR(ctx->cq_ev_fd);
4892                 ctx->cq_ev_fd = NULL;
4893                 return ret;
4894         }
4895
4896         return 0;
4897 }
4898
4899 static int io_eventfd_unregister(struct io_ring_ctx *ctx)
4900 {
4901         if (ctx->cq_ev_fd) {
4902                 eventfd_ctx_put(ctx->cq_ev_fd);
4903                 ctx->cq_ev_fd = NULL;
4904                 return 0;
4905         }
4906
4907         return -ENXIO;
4908 }
4909
4910 static void io_ring_ctx_free(struct io_ring_ctx *ctx)
4911 {
4912         io_finish_async(ctx);
4913         if (ctx->sqo_mm)
4914                 mmdrop(ctx->sqo_mm);
4915
4916         io_iopoll_reap_events(ctx);
4917         io_sqe_buffer_unregister(ctx);
4918         io_sqe_files_unregister(ctx);
4919         io_eventfd_unregister(ctx);
4920
4921 #if defined(CONFIG_UNIX)
4922         if (ctx->ring_sock) {
4923                 ctx->ring_sock->file = NULL; /* so that iput() is called */
4924                 sock_release(ctx->ring_sock);
4925         }
4926 #endif
4927
4928         io_mem_free(ctx->rings);
4929         io_mem_free(ctx->sq_sqes);
4930
4931         percpu_ref_exit(&ctx->refs);
4932         if (ctx->account_mem)
4933                 io_unaccount_mem(ctx->user,
4934                                 ring_pages(ctx->sq_entries, ctx->cq_entries));
4935         free_uid(ctx->user);
4936         put_cred(ctx->creds);
4937         kfree(ctx->completions);
4938         kfree(ctx->cancel_hash);
4939         kmem_cache_free(req_cachep, ctx->fallback_req);
4940         kfree(ctx);
4941 }
4942
4943 static __poll_t io_uring_poll(struct file *file, poll_table *wait)
4944 {
4945         struct io_ring_ctx *ctx = file->private_data;
4946         __poll_t mask = 0;
4947
4948         poll_wait(file, &ctx->cq_wait, wait);
4949         /*
4950          * synchronizes with barrier from wq_has_sleeper call in
4951          * io_commit_cqring
4952          */
4953         smp_rmb();
4954         if (READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head !=
4955             ctx->rings->sq_ring_entries)
4956                 mask |= EPOLLOUT | EPOLLWRNORM;
4957         if (READ_ONCE(ctx->rings->cq.head) != ctx->cached_cq_tail)
4958                 mask |= EPOLLIN | EPOLLRDNORM;
4959
4960         return mask;
4961 }
4962
4963 static int io_uring_fasync(int fd, struct file *file, int on)
4964 {
4965         struct io_ring_ctx *ctx = file->private_data;
4966
4967         return fasync_helper(fd, file, on, &ctx->cq_fasync);
4968 }
4969
4970 static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
4971 {
4972         mutex_lock(&ctx->uring_lock);
4973         percpu_ref_kill(&ctx->refs);
4974         mutex_unlock(&ctx->uring_lock);
4975
4976         io_kill_timeouts(ctx);
4977         io_poll_remove_all(ctx);
4978
4979         if (ctx->io_wq)
4980                 io_wq_cancel_all(ctx->io_wq);
4981
4982         io_iopoll_reap_events(ctx);
4983         /* if we failed setting up the ctx, we might not have any rings */
4984         if (ctx->rings)
4985                 io_cqring_overflow_flush(ctx, true);
4986         wait_for_completion(&ctx->completions[0]);
4987         io_ring_ctx_free(ctx);
4988 }
4989
4990 static int io_uring_release(struct inode *inode, struct file *file)
4991 {
4992         struct io_ring_ctx *ctx = file->private_data;
4993
4994         file->private_data = NULL;
4995         io_ring_ctx_wait_and_kill(ctx);
4996         return 0;
4997 }
4998
4999 static void io_uring_cancel_files(struct io_ring_ctx *ctx,
5000                                   struct files_struct *files)
5001 {
5002         struct io_kiocb *req;
5003         DEFINE_WAIT(wait);
5004
5005         while (!list_empty_careful(&ctx->inflight_list)) {
5006                 struct io_kiocb *cancel_req = NULL;
5007
5008                 spin_lock_irq(&ctx->inflight_lock);
5009                 list_for_each_entry(req, &ctx->inflight_list, inflight_entry) {
5010                         if (req->work.files != files)
5011                                 continue;
5012                         /* req is being completed, ignore */
5013                         if (!refcount_inc_not_zero(&req->refs))
5014                                 continue;
5015                         cancel_req = req;
5016                         break;
5017                 }
5018                 if (cancel_req)
5019                         prepare_to_wait(&ctx->inflight_wait, &wait,
5020                                                 TASK_UNINTERRUPTIBLE);
5021                 spin_unlock_irq(&ctx->inflight_lock);
5022
5023                 /* We need to keep going until we don't find a matching req */
5024                 if (!cancel_req)
5025                         break;
5026
5027                 io_wq_cancel_work(ctx->io_wq, &cancel_req->work);
5028                 io_put_req(cancel_req);
5029                 schedule();
5030         }
5031         finish_wait(&ctx->inflight_wait, &wait);
5032 }
5033
5034 static int io_uring_flush(struct file *file, void *data)
5035 {
5036         struct io_ring_ctx *ctx = file->private_data;
5037
5038         io_uring_cancel_files(ctx, data);
5039         if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) {
5040                 io_cqring_overflow_flush(ctx, true);
5041                 io_wq_cancel_all(ctx->io_wq);
5042         }
5043         return 0;
5044 }
5045
5046 static void *io_uring_validate_mmap_request(struct file *file,
5047                                             loff_t pgoff, size_t sz)
5048 {
5049         struct io_ring_ctx *ctx = file->private_data;
5050         loff_t offset = pgoff << PAGE_SHIFT;
5051         struct page *page;
5052         void *ptr;
5053
5054         switch (offset) {
5055         case IORING_OFF_SQ_RING:
5056         case IORING_OFF_CQ_RING:
5057                 ptr = ctx->rings;
5058                 break;
5059         case IORING_OFF_SQES:
5060                 ptr = ctx->sq_sqes;
5061                 break;
5062         default:
5063                 return ERR_PTR(-EINVAL);
5064         }
5065
5066         page = virt_to_head_page(ptr);
5067         if (sz > page_size(page))
5068                 return ERR_PTR(-EINVAL);
5069
5070         return ptr;
5071 }
5072
5073 #ifdef CONFIG_MMU
5074
5075 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
5076 {
5077         size_t sz = vma->vm_end - vma->vm_start;
5078         unsigned long pfn;
5079         void *ptr;
5080
5081         ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
5082         if (IS_ERR(ptr))
5083                 return PTR_ERR(ptr);
5084
5085         pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
5086         return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
5087 }
5088
5089 #else /* !CONFIG_MMU */
5090
5091 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
5092 {
5093         return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
5094 }
5095
5096 static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
5097 {
5098         return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
5099 }
5100
5101 static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
5102         unsigned long addr, unsigned long len,
5103         unsigned long pgoff, unsigned long flags)
5104 {
5105         void *ptr;
5106
5107         ptr = io_uring_validate_mmap_request(file, pgoff, len);
5108         if (IS_ERR(ptr))
5109                 return PTR_ERR(ptr);
5110
5111         return (unsigned long) ptr;
5112 }
5113
5114 #endif /* !CONFIG_MMU */
5115
5116 SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
5117                 u32, min_complete, u32, flags, const sigset_t __user *, sig,
5118                 size_t, sigsz)
5119 {
5120         struct io_ring_ctx *ctx;
5121         long ret = -EBADF;
5122         int submitted = 0;
5123         struct fd f;
5124
5125         if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
5126                 return -EINVAL;
5127
5128         f = fdget(fd);
5129         if (!f.file)
5130                 return -EBADF;
5131
5132         ret = -EOPNOTSUPP;
5133         if (f.file->f_op != &io_uring_fops)
5134                 goto out_fput;
5135
5136         ret = -ENXIO;
5137         ctx = f.file->private_data;
5138         if (!percpu_ref_tryget(&ctx->refs))
5139                 goto out_fput;
5140
5141         /*
5142          * For SQ polling, the thread will do all submissions and completions.
5143          * Just return the requested submit count, and wake the thread if
5144          * we were asked to.
5145          */
5146         ret = 0;
5147         if (ctx->flags & IORING_SETUP_SQPOLL) {
5148                 if (!list_empty_careful(&ctx->cq_overflow_list))
5149                         io_cqring_overflow_flush(ctx, false);
5150                 if (flags & IORING_ENTER_SQ_WAKEUP)
5151                         wake_up(&ctx->sqo_wait);
5152                 submitted = to_submit;
5153         } else if (to_submit) {
5154                 struct mm_struct *cur_mm;
5155
5156                 to_submit = min(to_submit, ctx->sq_entries);
5157                 mutex_lock(&ctx->uring_lock);
5158                 /* already have mm, so io_submit_sqes() won't try to grab it */
5159                 cur_mm = ctx->sqo_mm;
5160                 submitted = io_submit_sqes(ctx, to_submit, f.file, fd,
5161                                            &cur_mm, false);
5162                 mutex_unlock(&ctx->uring_lock);
5163
5164                 if (submitted != to_submit)
5165                         goto out;
5166         }
5167         if (flags & IORING_ENTER_GETEVENTS) {
5168                 unsigned nr_events = 0;
5169
5170                 min_complete = min(min_complete, ctx->cq_entries);
5171
5172                 if (ctx->flags & IORING_SETUP_IOPOLL) {
5173                         ret = io_iopoll_check(ctx, &nr_events, min_complete);
5174                 } else {
5175                         ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
5176                 }
5177         }
5178
5179 out:
5180         percpu_ref_put(&ctx->refs);
5181 out_fput:
5182         fdput(f);
5183         return submitted ? submitted : ret;
5184 }
5185
5186 static const struct file_operations io_uring_fops = {
5187         .release        = io_uring_release,
5188         .flush          = io_uring_flush,
5189         .mmap           = io_uring_mmap,
5190 #ifndef CONFIG_MMU
5191         .get_unmapped_area = io_uring_nommu_get_unmapped_area,
5192         .mmap_capabilities = io_uring_nommu_mmap_capabilities,
5193 #endif
5194         .poll           = io_uring_poll,
5195         .fasync         = io_uring_fasync,
5196 };
5197
5198 static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
5199                                   struct io_uring_params *p)
5200 {
5201         struct io_rings *rings;
5202         size_t size, sq_array_offset;
5203
5204         size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
5205         if (size == SIZE_MAX)
5206                 return -EOVERFLOW;
5207
5208         rings = io_mem_alloc(size);
5209         if (!rings)
5210                 return -ENOMEM;
5211
5212         ctx->rings = rings;
5213         ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
5214         rings->sq_ring_mask = p->sq_entries - 1;
5215         rings->cq_ring_mask = p->cq_entries - 1;
5216         rings->sq_ring_entries = p->sq_entries;
5217         rings->cq_ring_entries = p->cq_entries;
5218         ctx->sq_mask = rings->sq_ring_mask;
5219         ctx->cq_mask = rings->cq_ring_mask;
5220         ctx->sq_entries = rings->sq_ring_entries;
5221         ctx->cq_entries = rings->cq_ring_entries;
5222
5223         size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
5224         if (size == SIZE_MAX) {
5225                 io_mem_free(ctx->rings);
5226                 ctx->rings = NULL;
5227                 return -EOVERFLOW;
5228         }
5229
5230         ctx->sq_sqes = io_mem_alloc(size);
5231         if (!ctx->sq_sqes) {
5232                 io_mem_free(ctx->rings);
5233                 ctx->rings = NULL;
5234                 return -ENOMEM;
5235         }
5236
5237         return 0;
5238 }
5239
5240 /*
5241  * Allocate an anonymous fd, this is what constitutes the application
5242  * visible backing of an io_uring instance. The application mmaps this
5243  * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
5244  * we have to tie this fd to a socket for file garbage collection purposes.
5245  */
5246 static int io_uring_get_fd(struct io_ring_ctx *ctx)
5247 {
5248         struct file *file;
5249         int ret;
5250
5251 #if defined(CONFIG_UNIX)
5252         ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
5253                                 &ctx->ring_sock);
5254         if (ret)
5255                 return ret;
5256 #endif
5257
5258         ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
5259         if (ret < 0)
5260                 goto err;
5261
5262         file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
5263                                         O_RDWR | O_CLOEXEC);
5264         if (IS_ERR(file)) {
5265                 put_unused_fd(ret);
5266                 ret = PTR_ERR(file);
5267                 goto err;
5268         }
5269
5270 #if defined(CONFIG_UNIX)
5271         ctx->ring_sock->file = file;
5272         ctx->ring_sock->sk->sk_user_data = ctx;
5273 #endif
5274         fd_install(ret, file);
5275         return ret;
5276 err:
5277 #if defined(CONFIG_UNIX)
5278         sock_release(ctx->ring_sock);
5279         ctx->ring_sock = NULL;
5280 #endif
5281         return ret;
5282 }
5283
5284 static int io_uring_create(unsigned entries, struct io_uring_params *p)
5285 {
5286         struct user_struct *user = NULL;
5287         struct io_ring_ctx *ctx;
5288         bool account_mem;
5289         int ret;
5290
5291         if (!entries || entries > IORING_MAX_ENTRIES)
5292                 return -EINVAL;
5293
5294         /*
5295          * Use twice as many entries for the CQ ring. It's possible for the
5296          * application to drive a higher depth than the size of the SQ ring,
5297          * since the sqes are only used at submission time. This allows for
5298          * some flexibility in overcommitting a bit. If the application has
5299          * set IORING_SETUP_CQSIZE, it will have passed in the desired number
5300          * of CQ ring entries manually.
5301          */
5302         p->sq_entries = roundup_pow_of_two(entries);
5303         if (p->flags & IORING_SETUP_CQSIZE) {
5304                 /*
5305                  * If IORING_SETUP_CQSIZE is set, we do the same roundup
5306                  * to a power-of-two, if it isn't already. We do NOT impose
5307                  * any cq vs sq ring sizing.
5308                  */
5309                 if (p->cq_entries < p->sq_entries || p->cq_entries > IORING_MAX_CQ_ENTRIES)
5310                         return -EINVAL;
5311                 p->cq_entries = roundup_pow_of_two(p->cq_entries);
5312         } else {
5313                 p->cq_entries = 2 * p->sq_entries;
5314         }
5315
5316         user = get_uid(current_user());
5317         account_mem = !capable(CAP_IPC_LOCK);
5318
5319         if (account_mem) {
5320                 ret = io_account_mem(user,
5321                                 ring_pages(p->sq_entries, p->cq_entries));
5322                 if (ret) {
5323                         free_uid(user);
5324                         return ret;
5325                 }
5326         }
5327
5328         ctx = io_ring_ctx_alloc(p);
5329         if (!ctx) {
5330                 if (account_mem)
5331                         io_unaccount_mem(user, ring_pages(p->sq_entries,
5332                                                                 p->cq_entries));
5333                 free_uid(user);
5334                 return -ENOMEM;
5335         }
5336         ctx->compat = in_compat_syscall();
5337         ctx->account_mem = account_mem;
5338         ctx->user = user;
5339         ctx->creds = get_current_cred();
5340
5341         ret = io_allocate_scq_urings(ctx, p);
5342         if (ret)
5343                 goto err;
5344
5345         ret = io_sq_offload_start(ctx, p);
5346         if (ret)
5347                 goto err;
5348
5349         memset(&p->sq_off, 0, sizeof(p->sq_off));
5350         p->sq_off.head = offsetof(struct io_rings, sq.head);
5351         p->sq_off.tail = offsetof(struct io_rings, sq.tail);
5352         p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
5353         p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
5354         p->sq_off.flags = offsetof(struct io_rings, sq_flags);
5355         p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
5356         p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
5357
5358         memset(&p->cq_off, 0, sizeof(p->cq_off));
5359         p->cq_off.head = offsetof(struct io_rings, cq.head);
5360         p->cq_off.tail = offsetof(struct io_rings, cq.tail);
5361         p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
5362         p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
5363         p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
5364         p->cq_off.cqes = offsetof(struct io_rings, cqes);
5365
5366         /*
5367          * Install ring fd as the very last thing, so we don't risk someone
5368          * having closed it before we finish setup
5369          */
5370         ret = io_uring_get_fd(ctx);
5371         if (ret < 0)
5372                 goto err;
5373
5374         p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
5375                         IORING_FEAT_SUBMIT_STABLE;
5376         trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
5377         return ret;
5378 err:
5379         io_ring_ctx_wait_and_kill(ctx);
5380         return ret;
5381 }
5382
5383 /*
5384  * Sets up an aio uring context, and returns the fd. Applications asks for a
5385  * ring size, we return the actual sq/cq ring sizes (among other things) in the
5386  * params structure passed in.
5387  */
5388 static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
5389 {
5390         struct io_uring_params p;
5391         long ret;
5392         int i;
5393
5394         if (copy_from_user(&p, params, sizeof(p)))
5395                 return -EFAULT;
5396         for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
5397                 if (p.resv[i])
5398                         return -EINVAL;
5399         }
5400
5401         if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
5402                         IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE))
5403                 return -EINVAL;
5404
5405         ret = io_uring_create(entries, &p);
5406         if (ret < 0)
5407                 return ret;
5408
5409         if (copy_to_user(params, &p, sizeof(p)))
5410                 return -EFAULT;
5411
5412         return ret;
5413 }
5414
5415 SYSCALL_DEFINE2(io_uring_setup, u32, entries,
5416                 struct io_uring_params __user *, params)
5417 {
5418         return io_uring_setup(entries, params);
5419 }
5420
5421 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
5422                                void __user *arg, unsigned nr_args)
5423         __releases(ctx->uring_lock)
5424         __acquires(ctx->uring_lock)
5425 {
5426         int ret;
5427
5428         /*
5429          * We're inside the ring mutex, if the ref is already dying, then
5430          * someone else killed the ctx or is already going through
5431          * io_uring_register().
5432          */
5433         if (percpu_ref_is_dying(&ctx->refs))
5434                 return -ENXIO;
5435
5436         percpu_ref_kill(&ctx->refs);
5437
5438         /*
5439          * Drop uring mutex before waiting for references to exit. If another
5440          * thread is currently inside io_uring_enter() it might need to grab
5441          * the uring_lock to make progress. If we hold it here across the drain
5442          * wait, then we can deadlock. It's safe to drop the mutex here, since
5443          * no new references will come in after we've killed the percpu ref.
5444          */
5445         mutex_unlock(&ctx->uring_lock);
5446         wait_for_completion(&ctx->completions[0]);
5447         mutex_lock(&ctx->uring_lock);
5448
5449         switch (opcode) {
5450         case IORING_REGISTER_BUFFERS:
5451                 ret = io_sqe_buffer_register(ctx, arg, nr_args);
5452                 break;
5453         case IORING_UNREGISTER_BUFFERS:
5454                 ret = -EINVAL;
5455                 if (arg || nr_args)
5456                         break;
5457                 ret = io_sqe_buffer_unregister(ctx);
5458                 break;
5459         case IORING_REGISTER_FILES:
5460                 ret = io_sqe_files_register(ctx, arg, nr_args);
5461                 break;
5462         case IORING_UNREGISTER_FILES:
5463                 ret = -EINVAL;
5464                 if (arg || nr_args)
5465                         break;
5466                 ret = io_sqe_files_unregister(ctx);
5467                 break;
5468         case IORING_REGISTER_FILES_UPDATE:
5469                 ret = io_sqe_files_update(ctx, arg, nr_args);
5470                 break;
5471         case IORING_REGISTER_EVENTFD:
5472                 ret = -EINVAL;
5473                 if (nr_args != 1)
5474                         break;
5475                 ret = io_eventfd_register(ctx, arg);
5476                 break;
5477         case IORING_UNREGISTER_EVENTFD:
5478                 ret = -EINVAL;
5479                 if (arg || nr_args)
5480                         break;
5481                 ret = io_eventfd_unregister(ctx);
5482                 break;
5483         default:
5484                 ret = -EINVAL;
5485                 break;
5486         }
5487
5488         /* bring the ctx back to life */
5489         reinit_completion(&ctx->completions[0]);
5490         percpu_ref_reinit(&ctx->refs);
5491         return ret;
5492 }
5493
5494 SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
5495                 void __user *, arg, unsigned int, nr_args)
5496 {
5497         struct io_ring_ctx *ctx;
5498         long ret = -EBADF;
5499         struct fd f;
5500
5501         f = fdget(fd);
5502         if (!f.file)
5503                 return -EBADF;
5504
5505         ret = -EOPNOTSUPP;
5506         if (f.file->f_op != &io_uring_fops)
5507                 goto out_fput;
5508
5509         ctx = f.file->private_data;
5510
5511         mutex_lock(&ctx->uring_lock);
5512         ret = __io_uring_register(ctx, opcode, arg, nr_args);
5513         mutex_unlock(&ctx->uring_lock);
5514         trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
5515                                                         ctx->cq_ev_fd != NULL, ret);
5516 out_fput:
5517         fdput(f);
5518         return ret;
5519 }
5520
5521 static int __init io_uring_init(void)
5522 {
5523         req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
5524         return 0;
5525 };
5526 __initcall(io_uring_init);