mm/memcontrol.c

   1 /* memcontrol.c - Memory Controller
   2  *
   3  * Copyright IBM Corporation, 2007
   4  * Author Balbir Singh <balbir@linux.vnet.ibm.com>
   5  *
   6  * Copyright 2007 OpenVZ SWsoft Inc
   7  * Author: Pavel Emelianov <xemul@openvz.org>
   8  *
   9  * Memory thresholds
  10  * Copyright (C) 2009 Nokia Corporation
  11  * Author: Kirill A. Shutemov
  12  *
  13  * Kernel Memory Controller
  14  * Copyright (C) 2012 Parallels Inc. and Google Inc.
  15  * Authors: Glauber Costa and Suleiman Souhlal
  16  *
  17  * Native page reclaim
  18  * Charge lifetime sanitation
  19  * Lockless page tracking & accounting
  20  * Unified hierarchy configuration model
  21  * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
  22  *
  23  * This program is free software; you can redistribute it and/or modify
  24  * it under the terms of the GNU General Public License as published by
  25  * the Free Software Foundation; either version 2 of the License, or
  26  * (at your option) any later version.
  27  *
  28  * This program is distributed in the hope that it will be useful,
  29  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  30  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  31  * GNU General Public License for more details.
  32  */
  33
  34 #include <linux/page_counter.h>
  35 #include <linux/memcontrol.h>
  36 #include <linux/cgroup.h>
  37 #include <linux/mm.h>
  38 #include <linux/sched/mm.h>
  39 #include <linux/shmem_fs.h>
  40 #include <linux/hugetlb.h>
  41 #include <linux/pagemap.h>
  42 #include <linux/smp.h>
  43 #include <linux/page-flags.h>
  44 #include <linux/backing-dev.h>
  45 #include <linux/bit_spinlock.h>
  46 #include <linux/rcupdate.h>
  47 #include <linux/limits.h>
  48 #include <linux/export.h>
  49 #include <linux/mutex.h>
  50 #include <linux/rbtree.h>
  51 #include <linux/slab.h>
  52 #include <linux/swap.h>
  53 #include <linux/swapops.h>
  54 #include <linux/spinlock.h>
  55 #include <linux/eventfd.h>
  56 #include <linux/poll.h>
  57 #include <linux/sort.h>
  58 #include <linux/fs.h>
  59 #include <linux/seq_file.h>
  60 #include <linux/vmpressure.h>
  61 #include <linux/mm_inline.h>
  62 #include <linux/swap_cgroup.h>
  63 #include <linux/cpu.h>
  64 #include <linux/oom.h>
  65 #include <linux/lockdep.h>
  66 #include <linux/file.h>
  67 #include <linux/tracehook.h>
  68 #include "internal.h"
  69 #include <net/sock.h>
  70 #include <net/ip.h>
  71 #include "slab.h"
  72
  73 #include <linux/uaccess.h>
  74
  75 #include <trace/events/vmscan.h>
  76
  77 struct cgroup_subsys memory_cgrp_subsys __read_mostly;
  78 EXPORT_SYMBOL(memory_cgrp_subsys);
  79
  80 struct mem_cgroup *root_mem_cgroup __read_mostly;
  81
  82 #define MEM_CGROUP_RECLAIM_RETRIES      5
  83
  84 /* Socket memory accounting disabled? */
  85 static bool cgroup_memory_nosocket;
  86
  87 /* Kernel memory accounting disabled? */
  88 static bool cgroup_memory_nokmem;
  89
  90 /* Whether the swap controller is active */
  91 #ifdef CONFIG_MEMCG_SWAP
  92 int do_swap_account __read_mostly;
  93 #else
  94 #define do_swap_account         0
  95 #endif
  96
  97 /* Whether legacy memory+swap accounting is active */
  98 static bool do_memsw_account(void)
  99 {
 100         return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account;
 101 }
 102
  103 static const char * const mem_cgroup_stat_names[] = { /* NOTE(review): presumably indexed by enum mem_cgroup_stat_index; confirm the order matches the enum */
  104         "cache",
  105         "rss",
  106         "rss_huge",
  107         "shmem",
  108         "mapped_file",
  109         "dirty",
  110         "writeback",
  111         "swap",
  112 };
 113
  114 static const char * const mem_cgroup_events_names[] = { /* NOTE(review): presumably indexed by enum mem_cgroup_events_index; confirm the order matches the enum */
  115         "pgpgin",
  116         "pgpgout",
  117         "pgfault",
  118         "pgmajfault",
  119 };
 120
  121 static const char * const mem_cgroup_lru_names[] = { /* NOTE(review): presumably indexed by enum lru_list; confirm the order matches the enum */
  122         "inactive_anon",
  123         "active_anon",
  124         "inactive_file",
  125         "active_file",
  126         "unevictable",
  127 };
 128
 129 #define THRESHOLDS_EVENTS_TARGET 128
 130 #define SOFTLIMIT_EVENTS_TARGET 1024
 131 #define NUMAINFO_EVENTS_TARGET  1024
 132
 133 /*
 134  * Cgroups above their limits are maintained in a RB-Tree, independent of
 135  * their hierarchy representation
 136  */
 137
  138 struct mem_cgroup_tree_per_node {
  139         struct rb_root rb_root; /* mem_cgroup_per_node entries linked via ->tree_node, ordered by ->usage_in_excess */
  140         spinlock_t lock;        /* held (with irqs disabled) around insertions into and removals from rb_root */
  141 };
 142
  143 struct mem_cgroup_tree {
  144         struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; /* indexed by node id; users check the entry for NULL before use */
  145 };
 146
 147 static struct mem_cgroup_tree soft_limit_tree __read_mostly;
 148
 149 /* for OOM */
  150 struct mem_cgroup_eventfd_list {
  151         struct list_head list;          /* list linkage; NOTE(review): the owning list head is not visible in this part of the file */
  152         struct eventfd_ctx *eventfd;    /* eventfd context carried by this list entry */
  153 };
 154
  155 /*
  156  * mem_cgroup_event represents an event which userspace wants to receive.
  157  */
  158 struct mem_cgroup_event {
  159         /*
  160          * memcg which the event belongs to.
  161          */
  162         struct mem_cgroup *memcg;
  163         /*
  164          * eventfd to signal userspace about the event.
  165          */
  166         struct eventfd_ctx *eventfd;
  167         /*
  168          * Each of these is stored in a list by the cgroup.
  169          */
  170         struct list_head list;
  171         /*
  172          * register_event() callback will be used to add a new userspace
  173          * waiter for changes related to this event.  Use eventfd_signal()
  174          * on eventfd to send a notification to userspace.
  175          */
  176         int (*register_event)(struct mem_cgroup *memcg,
  177                               struct eventfd_ctx *eventfd, const char *args);
  178         /*
  179          * unregister_event() callback will be called when userspace closes
  180          * the eventfd or on cgroup removal.  This callback must be set
  181          * if you want to provide notification functionality.
  182          */
  183         void (*unregister_event)(struct mem_cgroup *memcg,
  184                                  struct eventfd_ctx *eventfd);
  185         /*
  186          * All fields below are needed to unregister the event when
  187          * userspace closes the eventfd.
  188          */
  189         poll_table pt;
  190         wait_queue_head_t *wqh;
  191         wait_queue_t wait;
  192         struct work_struct remove;
  193 };
 194
 195 static void mem_cgroup_threshold(struct mem_cgroup *memcg);
 196 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
 197
  198 /* State used for moving charges at task migration. */
 199 /*
 200  * Types of charges to be moved.
 201  */
 202 #define MOVE_ANON       0x1U
 203 #define MOVE_FILE       0x2U
 204 #define MOVE_MASK       (MOVE_ANON | MOVE_FILE)
 205
 206 /* "mc" and its members are protected by cgroup_mutex */
  207 static struct move_charge_struct {
  208         spinlock_t        lock; /* protects the from and to members */
  209         struct mm_struct  *mm;
  210         struct mem_cgroup *from; /* NOTE(review): presumably the memcg charges are moved away from -- confirm */
  211         struct mem_cgroup *to;   /* NOTE(review): presumably the memcg charges are moved to -- confirm */
  212         unsigned long flags;     /* NOTE(review): presumably a mask of MOVE_ANON / MOVE_FILE -- confirm */
  213         unsigned long precharge;
  214         unsigned long moved_charge;
  215         unsigned long moved_swap;
  216         struct task_struct *moving_task;        /* the task that is moving charges */
  217         wait_queue_head_t waitq;                /* a waitq for other contexts */
  218 } mc = {
  219         .lock = __SPIN_LOCK_UNLOCKED(mc.lock),  /* only lock and waitq need explicit static initializers */
  220         .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
  221 };
 222
 223 /*
 224  * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 225  * limit reclaim to prevent infinite loops, if they ever occur.
 226  */
 227 #define MEM_CGROUP_MAX_RECLAIM_LOOPS            100
 228 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
 229
  230 enum charge_type {
  231         MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
  232         MEM_CGROUP_CHARGE_TYPE_ANON,
  233         MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
  234         MEM_CGROUP_CHARGE_TYPE_DROP,    /* a page was unused swap cache */
  235         NR_CHARGE_TYPE,                 /* number of charge types above; keep last */
  236 };
 237
  238 /* Resource types, used when encoding the cft->private value of a control file (NOTE(review): presumably the 'x' argument of MEMFILE_PRIVATE() -- confirm) */
  239 enum res_type {
  240         _MEM,
  241         _MEMSWAP,
  242         _OOM_TYPE,
  243         _KMEM,
  244         _TCP,
  245 };
 246
  247 #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))      /* pack x above bit 15 and val into the low bits */
  248 #define MEMFILE_TYPE(val)       ((val) >> 16 & 0xffff)   /* recover the 16-bit field stored above bit 15 */
  249 #define MEMFILE_ATTR(val)       ((val) & 0xffff)         /* recover the low 16 bits */
  250 /* Used for OOM notifier */
 251 #define OOM_CONTROL             (0)
 252
 253 /* Some nice accessors for the vmpressure. */
 254 struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
 255 {
 256         if (!memcg)
 257                 memcg = root_mem_cgroup;
 258         return &memcg->vmpressure;
 259 }
 260
 261 struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
 262 {
 263         return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
 264 }
 265
 266 static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
 267 {
 268         return (memcg == root_mem_cgroup);
 269 }
 270
 271 #ifndef CONFIG_SLOB
 272 /*
 273  * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
 274  * The main reason for not using cgroup id for this:
 275  *  this works better in sparse environments, where we have a lot of memcgs,
 276  *  but only a few kmem-limited. Or also, if we have, for instance, 200
 277  *  memcgs, and none but the 200th is kmem-limited, we'd have to have a
 278  *  200 entry array for that.
 279  *
 280  * The current size of the caches array is stored in memcg_nr_cache_ids. It
 281  * will double each time we have to increase it.
 282  */
 283 static DEFINE_IDA(memcg_cache_ida);
 284 int memcg_nr_cache_ids;
 285
 286 /* Protects memcg_nr_cache_ids */
 287 static DECLARE_RWSEM(memcg_cache_ids_sem);
 288
  289 void memcg_get_cache_ids(void)
  290 {
  291         down_read(&memcg_cache_ids_sem);        /* take, for reading, the rwsem that protects memcg_nr_cache_ids */
  292 }
 293
  294 void memcg_put_cache_ids(void)
  295 {
  296         up_read(&memcg_cache_ids_sem);          /* drop the read hold taken by memcg_get_cache_ids() */
  297 }
 298
 299 /*
 300  * MIN_SIZE is different than 1, because we would like to avoid going through
 301  * the alloc/free process all the time. In a small machine, 4 kmem-limited
 302  * cgroups is a reasonable guess. In the future, it could be a parameter or
 303  * tunable, but that is strictly not necessary.
 304  *
 305  * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
 306  * this constant directly from cgroup, but it is understandable that this is
 307  * better kept as an internal representation in cgroup.c. In any case, the
 308  * cgrp_id space is not getting any smaller, and we don't have to necessarily
 309  * increase ours as well if it increases.
 310  */
 311 #define MEMCG_CACHES_MIN_SIZE 4
 312 #define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
 313
 314 /*
 315  * A lot of the calls to the cache allocation functions are expected to be
 316  * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
  317  * conditional on this static branch, we have to allow modules that do
  318  * kmem_cache_alloc and the like to see this symbol as well.
 319  */
 320 DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
 321 EXPORT_SYMBOL(memcg_kmem_enabled_key);
 322
 323 struct workqueue_struct *memcg_kmem_cache_wq;
 324
 325 #endif /* !CONFIG_SLOB */
 326
 327 /**
 328  * mem_cgroup_css_from_page - css of the memcg associated with a page
 329  * @page: page of interest
 330  *
 331  * If memcg is bound to the default hierarchy, css of the memcg associated
 332  * with @page is returned.  The returned css remains associated with @page
 333  * until it is released.
 334  *
 335  * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
 336  * is returned.
 337  */
 338 struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
 339 {
 340         struct mem_cgroup *memcg;
 341
 342         memcg = page->mem_cgroup;
 343
 344         if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
 345                 memcg = root_mem_cgroup;
 346
 347         return &memcg->css;
 348 }
 349
 350 /**
 351  * page_cgroup_ino - return inode number of the memcg a page is charged to
 352  * @page: the page
 353  *
 354  * Look up the closest online ancestor of the memory cgroup @page is charged to
 355  * and return its inode number or 0 if @page is not charged to any cgroup. It
 356  * is safe to call this function without holding a reference to @page.
 357  *
 358  * Note, this function is inherently racy, because there is nothing to prevent
 359  * the cgroup inode from getting torn down and potentially reallocated a moment
 360  * after page_cgroup_ino() returns, so it only should be used by callers that
 361  * do not care (such as procfs interfaces).
 362  */
 363 ino_t page_cgroup_ino(struct page *page)
 364 {
 365         struct mem_cgroup *memcg;
 366         unsigned long ino = 0;
 367
 368         rcu_read_lock();
 369         memcg = READ_ONCE(page->mem_cgroup);
 370         while (memcg && !(memcg->css.flags & CSS_ONLINE))
 371                 memcg = parent_mem_cgroup(memcg);
 372         if (memcg)
 373                 ino = cgroup_ino(memcg->css.cgroup);
 374         rcu_read_unlock();
 375         return ino;
 376 }
 377
 378 static struct mem_cgroup_per_node *
 379 mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page)
 380 {
 381         int nid = page_to_nid(page);
 382
 383         return memcg->nodeinfo[nid];
 384 }
 385
 386 static struct mem_cgroup_tree_per_node *
 387 soft_limit_tree_node(int nid)
 388 {
 389         return soft_limit_tree.rb_tree_per_node[nid];
 390 }
 391
 392 static struct mem_cgroup_tree_per_node *
 393 soft_limit_tree_from_page(struct page *page)
 394 {
 395         int nid = page_to_nid(page);
 396
 397         return soft_limit_tree.rb_tree_per_node[nid];
 398 }
 399
 400 static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
 401                                          struct mem_cgroup_tree_per_node *mctz,
 402                                          unsigned long new_usage_in_excess)
 403 {
 404         struct rb_node **p = &mctz->rb_root.rb_node;
 405         struct rb_node *parent = NULL;
 406         struct mem_cgroup_per_node *mz_node;
 407
 408         if (mz->on_tree)
 409                 return;
 410
 411         mz->usage_in_excess = new_usage_in_excess;
 412         if (!mz->usage_in_excess)
 413                 return;
 414         while (*p) {
 415                 parent = *p;
 416                 mz_node = rb_entry(parent, struct mem_cgroup_per_node,
 417                                         tree_node);
 418                 if (mz->usage_in_excess < mz_node->usage_in_excess)
 419                         p = &(*p)->rb_left;
 420                 /*
 421                  * We can't avoid mem cgroups that are over their soft
 422                  * limit by the same amount
 423                  */
 424                 else if (mz->usage_in_excess >= mz_node->usage_in_excess)
 425                         p = &(*p)->rb_right;
 426         }
 427         rb_link_node(&mz->tree_node, parent, p);
 428         rb_insert_color(&mz->tree_node, &mctz->rb_root);
 429         mz->on_tree = true;
 430 }
 431
 432 static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
 433                                          struct mem_cgroup_tree_per_node *mctz)
 434 {
 435         if (!mz->on_tree)
 436                 return;
 437         rb_erase(&mz->tree_node, &mctz->rb_root);
 438         mz->on_tree = false;
 439 }
 440
 441 static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
 442                                        struct mem_cgroup_tree_per_node *mctz)
 443 {
 444         unsigned long flags;
 445
 446         spin_lock_irqsave(&mctz->lock, flags);
 447         __mem_cgroup_remove_exceeded(mz, mctz);
 448         spin_unlock_irqrestore(&mctz->lock, flags);
 449 }
 450
 451 static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
 452 {
 453         unsigned long nr_pages = page_counter_read(&memcg->memory);
 454         unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
 455         unsigned long excess = 0;
 456
 457         if (nr_pages > soft_limit)
 458                 excess = nr_pages - soft_limit;
 459
 460         return excess;
 461 }
 462
 463 static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
 464 {
 465         unsigned long excess;
 466         struct mem_cgroup_per_node *mz;
 467         struct mem_cgroup_tree_per_node *mctz;
 468
 469         mctz = soft_limit_tree_from_page(page);
 470         if (!mctz)
 471                 return;
 472         /*
 473          * Necessary to update all ancestors when hierarchy is used.
 474          * because their event counter is not touched.
 475          */
 476         for (; memcg; memcg = parent_mem_cgroup(memcg)) {
 477                 mz = mem_cgroup_page_nodeinfo(memcg, page);
 478                 excess = soft_limit_excess(memcg);
 479                 /*
 480                  * We have to update the tree if mz is on RB-tree or
 481                  * mem is over its softlimit.
 482                  */
 483                 if (excess || mz->on_tree) {
 484                         unsigned long flags;
 485
 486                         spin_lock_irqsave(&mctz->lock, flags);
 487                         /* if on-tree, remove it */
 488                         if (mz->on_tree)
 489                                 __mem_cgroup_remove_exceeded(mz, mctz);
 490                         /*
 491                          * Insert again. mz->usage_in_excess will be updated.
 492                          * If excess is 0, no tree ops.
 493                          */
 494                         __mem_cgroup_insert_exceeded(mz, mctz, excess);
 495                         spin_unlock_irqrestore(&mctz->lock, flags);
 496                 }
 497         }
 498 }
 499
 500 static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
 501 {
 502         struct mem_cgroup_tree_per_node *mctz;
 503         struct mem_cgroup_per_node *mz;
 504         int nid;
 505
 506         for_each_node(nid) {
 507                 mz = mem_cgroup_nodeinfo(memcg, nid);
 508                 mctz = soft_limit_tree_node(nid);
 509                 if (mctz)
 510                         mem_cgroup_remove_exceeded(mz, mctz);
 511         }
 512 }
 513
 514 static struct mem_cgroup_per_node *
 515 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
 516 {
 517         struct rb_node *rightmost = NULL;
 518         struct mem_cgroup_per_node *mz;
 519
 520 retry:
 521         mz = NULL;
 522         rightmost = rb_last(&mctz->rb_root);
 523         if (!rightmost)
 524                 goto done;              /* Nothing to reclaim from */
 525
 526         mz = rb_entry(rightmost, struct mem_cgroup_per_node, tree_node);
 527         /*
 528          * Remove the node now but someone else can add it back,
 529          * we will to add it back at the end of reclaim to its correct
 530          * position in the tree.
 531          */
 532         __mem_cgroup_remove_exceeded(mz, mctz);
 533         if (!soft_limit_excess(mz->memcg) ||
 534             !css_tryget_online(&mz->memcg->css))
 535                 goto retry;
 536 done:
 537         return mz;
 538 }
 539
 540 static struct mem_cgroup_per_node *
 541 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
 542 {
 543         struct mem_cgroup_per_node *mz;
 544
 545         spin_lock_irq(&mctz->lock);
 546         mz = __mem_cgroup_largest_soft_limit_node(mctz);
 547         spin_unlock_irq(&mctz->lock);
 548         return mz;
 549 }
 550
 551 /*
 552  * Return page count for single (non recursive) @memcg.
 553  *
 554  * Implementation Note: reading percpu statistics for memcg.
 555  *
 556  * Both of vmstat[] and percpu_counter has threshold and do periodic
 557  * synchronization to implement "quick" read. There are trade-off between
 558  * reading cost and precision of value. Then, we may have a chance to implement
 559  * a periodic synchronization of counter in memcg's counter.
 560  *
 561  * But this _read() function is used for user interface now. The user accounts
 562  * memory usage by memory cgroup and he _always_ requires exact value because
 563  * he accounts memory. Even if we provide quick-and-fuzzy read, we always
 564  * have to visit all online cpus and make sum. So, for now, unnecessary
 565  * synchronization is not implemented. (just implemented for cpu hotplug)
 566  *
 567  * If there are kernel internal actions which can make use of some not-exact
 568  * value, and reading all cpu value can be performance bottleneck in some
 569  * common workload, threshold and synchronization as vmstat[] should be
 570  * implemented.
 571  */
 572 static unsigned long
 573 mem_cgroup_read_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx)
 574 {
 575         long val = 0;
 576         int cpu;
 577
 578         /* Per-cpu values can be negative, use a signed accumulator */
 579         for_each_possible_cpu(cpu)
 580                 val += per_cpu(memcg->stat->count[idx], cpu);
 581         /*
 582          * Summing races with updates, so val may be negative.  Avoid exposing
 583          * transient negative values.
 584          */
 585         if (val < 0)
 586                 val = 0;
 587         return val;
 588 }
 589
 590 static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
 591                                             enum mem_cgroup_events_index idx)
 592 {
 593         unsigned long val = 0;
 594         int cpu;
 595
 596         for_each_possible_cpu(cpu)
 597                 val += per_cpu(memcg->stat->events[idx], cpu);
 598         return val;
 599 }
 600
 601 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
 602                                          struct page *page,
 603                                          bool compound, int nr_pages)
 604 {
 605         /*
 606          * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
 607          * counted as CACHE even if it's on ANON LRU.
 608          */
 609         if (PageAnon(page))
 610                 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
 611                                 nr_pages);
 612         else {
 613                 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
 614                                 nr_pages);
 615                 if (PageSwapBacked(page))
 616                         __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SHMEM],
 617                                        nr_pages);
 618         }
 619
 620         if (compound) {
 621                 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
 622                 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
 623                                 nr_pages);
 624         }
 625
 626         /* pagein of a big page is an event. So, ignore page size */
 627         if (nr_pages > 0)
 628                 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
 629         else {
 630                 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
 631                 nr_pages = -nr_pages; /* for event */
 632         }
 633
 634         __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
 635 }
 636
 637 unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
 638                                            int nid, unsigned int lru_mask)
 639 {
 640         struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
 641         unsigned long nr = 0;
 642         enum lru_list lru;
 643
 644         VM_BUG_ON((unsigned)nid >= nr_node_ids);
 645
 646         for_each_lru(lru) {
 647                 if (!(BIT(lru) & lru_mask))
 648                         continue;
 649                 nr += mem_cgroup_get_lru_size(lruvec, lru);
 650         }
 651         return nr;
 652 }
 653
 654 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
 655                         unsigned int lru_mask)
 656 {
 657         unsigned long nr = 0;
 658         int nid;
 659
 660         for_each_node_state(nid, N_MEMORY)
 661                 nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
 662         return nr;
 663 }
 664
 665 static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 666                                        enum mem_cgroup_events_target target)
 667 {
 668         unsigned long val, next;
 669
 670         val = __this_cpu_read(memcg->stat->nr_page_events);
 671         next = __this_cpu_read(memcg->stat->targets[target]);
 672         /* from time_after() in jiffies.h */
 673         if ((long)next - (long)val < 0) {
 674                 switch (target) {
 675                 case MEM_CGROUP_TARGET_THRESH:
 676                         next = val + THRESHOLDS_EVENTS_TARGET;
 677                         break;
 678                 case MEM_CGROUP_TARGET_SOFTLIMIT:
 679                         next = val + SOFTLIMIT_EVENTS_TARGET;
 680                         break;
 681                 case MEM_CGROUP_TARGET_NUMAINFO:
 682                         next = val + NUMAINFO_EVENTS_TARGET;
 683                         break;
 684                 default:
 685                         break;
 686                 }
 687                 __this_cpu_write(memcg->stat->targets[target], next);
 688                 return true;
 689         }
 690         return false;
 691 }
 692
 693 /*
 694  * Check events in order.
 695  *
 696  */
 697 static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 698 {
 699         /* threshold event is triggered in finer grain than soft limit */
 700         if (unlikely(mem_cgroup_event_ratelimit(memcg,
 701                                                 MEM_CGROUP_TARGET_THRESH))) {
 702                 bool do_softlimit;
 703                 bool do_numainfo __maybe_unused;
 704
 705                 do_softlimit = mem_cgroup_event_ratelimit(memcg,
 706                                                 MEM_CGROUP_TARGET_SOFTLIMIT);
 707 #if MAX_NUMNODES > 1
 708                 do_numainfo = mem_cgroup_event_ratelimit(memcg,
 709                                                 MEM_CGROUP_TARGET_NUMAINFO);
 710 #endif
 711                 mem_cgroup_threshold(memcg);
 712                 if (unlikely(do_softlimit))
 713                         mem_cgroup_update_tree(memcg, page);
 714 #if MAX_NUMNODES > 1
 715                 if (unlikely(do_numainfo))
 716                         atomic_inc(&memcg->numainfo_events);
 717 #endif
 718         }
 719 }
 720
 721 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 722 {
 723         /*
 724          * mm_update_next_owner() may clear mm->owner to NULL
 725          * if it races with swapoff, page migration, etc.
 726          * So this can be called with p == NULL.
 727          */
 728         if (unlikely(!p))
 729                 return NULL;
 730
 731         return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
 732 }
 733 EXPORT_SYMBOL(mem_cgroup_from_task);
 734
 735 static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 736 {
 737         struct mem_cgroup *memcg = NULL;
 738
 739         rcu_read_lock();
 740         do {
 741                 /*
 742                  * Page cache insertions can happen withou an
 743                  * actual mm context, e.g. during disk probing
 744                  * on boot, loopback IO, acct() writes etc.
 745                  */
 746                 if (unlikely(!mm))
 747                         memcg = root_mem_cgroup;
 748                 else {
 749                         memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
 750                         if (unlikely(!memcg))
 751                                 memcg = root_mem_cgroup;
 752                 }
 753         } while (!css_tryget_online(&memcg->css));
 754         rcu_read_unlock();
 755         return memcg;
 756 }
 757
 758 /**
 759  * mem_cgroup_iter - iterate over memory cgroup hierarchy
 760  * @root: hierarchy root
 761  * @prev: previously returned memcg, NULL on first invocation
 762  * @reclaim: cookie for shared reclaim walks, NULL for full walks
 763  *
 764  * Returns references to children of the hierarchy below @root, or
 765  * @root itself, or %NULL after a full round-trip.
 766  *
 767  * Caller must pass the return value in @prev on subsequent
 768  * invocations for reference counting, or use mem_cgroup_iter_break()
 769  * to cancel a hierarchy walk before the round-trip is complete.
 770  *
 771  * Reclaimers can specify a zone and a priority level in @reclaim to
 772  * divide up the memcgs in the hierarchy among all concurrent
 773  * reclaimers operating on the same zone and priority.
 774  */
 775 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 776                                    struct mem_cgroup *prev,
 777                                    struct mem_cgroup_reclaim_cookie *reclaim)
 778 {
 779         struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
 780         struct cgroup_subsys_state *css = NULL;
 781         struct mem_cgroup *memcg = NULL;
 782         struct mem_cgroup *pos = NULL;
 783
 784         if (mem_cgroup_disabled())
 785                 return NULL;
 786
 787         if (!root)
 788                 root = root_mem_cgroup;
 789
 790         if (prev && !reclaim)
 791                 pos = prev;
 792
 793         if (!root->use_hierarchy && root != root_mem_cgroup) {
 794                 if (prev)
 795                         goto out;
 796                 return root;
 797         }
 798
 799         rcu_read_lock();
 800
 801         if (reclaim) {
 802                 struct mem_cgroup_per_node *mz;
 803
 804                 mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
 805                 iter = &mz->iter[reclaim->priority];
 806
 807                 if (prev && reclaim->generation != iter->generation)
 808                         goto out_unlock;
 809
 810                 while (1) {
 811                         pos = READ_ONCE(iter->position);
 812                         if (!pos || css_tryget(&pos->css))
 813                                 break;
 814                         /*
 815                          * css reference reached zero, so iter->position will
 816                          * be cleared by ->css_released. However, we should not
 817                          * rely on this happening soon, because ->css_released
 818                          * is called from a work queue, and by busy-waiting we
 819                          * might block it. So we clear iter->position right
 820                          * away.
 821                          */
 822                         (void)cmpxchg(&iter->position, pos, NULL);
 823                 }
 824         }
 825
 826         if (pos)
 827                 css = &pos->css;
 828
 829         for (;;) {
 830                 css = css_next_descendant_pre(css, &root->css);
 831                 if (!css) {
 832                         /*
 833                          * Reclaimers share the hierarchy walk, and a
 834                          * new one might jump in right at the end of
 835                          * the hierarchy - make sure they see at least
 836                          * one group and restart from the beginning.
 837                          */
 838                         if (!prev)
 839                                 continue;
 840                         break;
 841                 }
 842
 843                 /*
 844                  * Verify the css and acquire a reference.  The root
 845                  * is provided by the caller, so we know it's alive
 846                  * and kicking, and don't take an extra reference.
 847                  */
 848                 memcg = mem_cgroup_from_css(css);
 849
 850                 if (css == &root->css)
 851                         break;
 852
 853                 if (css_tryget(css))
 854                         break;
 855
 856                 memcg = NULL;
 857         }
 858
 859         if (reclaim) {
 860                 /*
 861                  * The position could have already been updated by a competing
 862                  * thread, so check that the value hasn't changed since we read
 863                  * it to avoid reclaiming from the same cgroup twice.
 864                  */
 865                 (void)cmpxchg(&iter->position, pos, memcg);
 866
 867                 if (pos)
 868                         css_put(&pos->css);
 869
 870                 if (!memcg)
 871                         iter->generation++;
 872                 else if (!prev)
 873                         reclaim->generation = iter->generation;
 874         }
 875
 876 out_unlock:
 877         rcu_read_unlock();
 878 out:
 879         if (prev && prev != root)
 880                 css_put(&prev->css);
 881
 882         return memcg;
 883 }
 884
 885 /**
 886  * mem_cgroup_iter_break - abort a hierarchy walk prematurely
 887  * @root: hierarchy root
 888  * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
 889  */
 890 void mem_cgroup_iter_break(struct mem_cgroup *root,
 891                            struct mem_cgroup *prev)
 892 {
 893         if (!root)
 894                 root = root_mem_cgroup;
 895         if (prev && prev != root)
 896                 css_put(&prev->css);
 897 }
 898
 899 static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
 900 {
 901         struct mem_cgroup *memcg = dead_memcg;
 902         struct mem_cgroup_reclaim_iter *iter;
 903         struct mem_cgroup_per_node *mz;
 904         int nid;
 905         int i;
 906
 907         while ((memcg = parent_mem_cgroup(memcg))) {
 908                 for_each_node(nid) {
 909                         mz = mem_cgroup_nodeinfo(memcg, nid);
 910                         for (i = 0; i <= DEF_PRIORITY; i++) {
 911                                 iter = &mz->iter[i];
 912                                 cmpxchg(&iter->position,
 913                                         dead_memcg, NULL);
 914                         }
 915                 }
 916         }
 917 }
 918
 919 /*
 920  * Iteration constructs for visiting all cgroups (under a tree).  If
 921  * loops are exited prematurely (break), mem_cgroup_iter_break() must
 922  * be used for reference counting.
 923  */
 924 #define for_each_mem_cgroup_tree(iter, root)            \
 925         for (iter = mem_cgroup_iter(root, NULL, NULL);  \
 926              iter != NULL;                              \
 927              iter = mem_cgroup_iter(root, iter, NULL))
 928
 929 #define for_each_mem_cgroup(iter)                       \
 930         for (iter = mem_cgroup_iter(NULL, NULL, NULL);  \
 931              iter != NULL;                              \
 932              iter = mem_cgroup_iter(NULL, iter, NULL))
 933
 934 /**
 935  * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
 936  * @memcg: hierarchy root
 937  * @fn: function to call for each task
 938  * @arg: argument passed to @fn
 939  *
 940  * This function iterates over tasks attached to @memcg or to any of its
 941  * descendants and calls @fn for each task. If @fn returns a non-zero
 942  * value, the function breaks the iteration loop and returns the value.
 943  * Otherwise, it will iterate over all tasks and return 0.
 944  *
 945  * This function must not be called for the root memory cgroup.
 946  */
 947 int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
 948                           int (*fn)(struct task_struct *, void *), void *arg)
 949 {
 950         struct mem_cgroup *iter;
 951         int ret = 0;
 952
 953         BUG_ON(memcg == root_mem_cgroup);
 954
 955         for_each_mem_cgroup_tree(iter, memcg) {
 956                 struct css_task_iter it;
 957                 struct task_struct *task;
 958
 959                 css_task_iter_start(&iter->css, &it);
 960                 while (!ret && (task = css_task_iter_next(&it)))
 961                         ret = fn(task, arg);
 962                 css_task_iter_end(&it);
 963                 if (ret) {
 964                         mem_cgroup_iter_break(memcg, iter);
 965                         break;
 966                 }
 967         }
 968         return ret;
 969 }
 970
 971 /**
 972  * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
 973  * @page: the page
 974  * @zone: zone of the page
 975  *
 976  * This function is only safe when following the LRU page isolation
 977  * and putback protocol: the LRU lock must be held, and the page must
 978  * either be PageLRU() or the caller must have isolated/allocated it.
 979  */
 980 struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
 981 {
 982         struct mem_cgroup_per_node *mz;
 983         struct mem_cgroup *memcg;
 984         struct lruvec *lruvec;
 985
 986         if (mem_cgroup_disabled()) {
 987                 lruvec = &pgdat->lruvec;
 988                 goto out;
 989         }
 990
 991         memcg = page->mem_cgroup;
 992         /*
 993          * Swapcache readahead pages are added to the LRU - and
 994          * possibly migrated - before they are charged.
 995          */
 996         if (!memcg)
 997                 memcg = root_mem_cgroup;
 998
 999         mz = mem_cgroup_page_nodeinfo(memcg, page);
1000         lruvec = &mz->lruvec;
1001 out:
1002         /*
1003          * Since a node can be onlined after the mem_cgroup was created,
1004          * we have to be prepared to initialize lruvec->zone here;
1005          * and if offlined then reonlined, we need to reinitialize it.
1006          */
1007         if (unlikely(lruvec->pgdat != pgdat))
1008                 lruvec->pgdat = pgdat;
1009         return lruvec;
1010 }
1011
1012 /**
1013  * mem_cgroup_update_lru_size - account for adding or removing an lru page
1014  * @lruvec: mem_cgroup per zone lru vector
1015  * @lru: index of lru list the page is sitting on
1016  * @zid: zone id of the accounted pages
1017  * @nr_pages: positive when adding or negative when removing
1018  *
1019  * This function must be called under lru_lock, just before a page is added
1020  * to or just after a page is removed from an lru list (that ordering being
1021  * so as to allow it to check that lru_size 0 is consistent with list_empty).
1022  */
1023 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1024                                 int zid, int nr_pages)
1025 {
1026         struct mem_cgroup_per_node *mz;
1027         unsigned long *lru_size;
1028         long size;
1029
1030         if (mem_cgroup_disabled())
1031                 return;
1032
1033         mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
1034         lru_size = &mz->lru_zone_size[zid][lru];
1035
1036         if (nr_pages < 0)
1037                 *lru_size += nr_pages;
1038
1039         size = *lru_size;
1040         if (WARN_ONCE(size < 0,
1041                 "%s(%p, %d, %d): lru_size %ld\n",
1042                 __func__, lruvec, lru, nr_pages, size)) {
1043                 VM_BUG_ON(1);
1044                 *lru_size = 0;
1045         }
1046
1047         if (nr_pages > 0)
1048                 *lru_size += nr_pages;
1049 }
1050
1051 bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
1052 {
1053         struct mem_cgroup *task_memcg;
1054         struct task_struct *p;
1055         bool ret;
1056
1057         p = find_lock_task_mm(task);
1058         if (p) {
1059                 task_memcg = get_mem_cgroup_from_mm(p->mm);
1060                 task_unlock(p);
1061         } else {
1062                 /*
1063                  * All threads may have already detached their mm's, but the oom
1064                  * killer still needs to detect if they have already been oom
1065                  * killed to prevent needlessly killing additional tasks.
1066                  */
1067                 rcu_read_lock();
1068                 task_memcg = mem_cgroup_from_task(task);
1069                 css_get(&task_memcg->css);
1070                 rcu_read_unlock();
1071         }
1072         ret = mem_cgroup_is_descendant(task_memcg, memcg);
1073         css_put(&task_memcg->css);
1074         return ret;
1075 }
1076
1077 /**
1078  * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1079  * @memcg: the memory cgroup
1080  *
1081  * Returns the maximum amount of memory @mem can be charged with, in
1082  * pages.
1083  */
1084 static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1085 {
1086         unsigned long margin = 0;
1087         unsigned long count;
1088         unsigned long limit;
1089
1090         count = page_counter_read(&memcg->memory);
1091         limit = READ_ONCE(memcg->memory.limit);
1092         if (count < limit)
1093                 margin = limit - count;
1094
1095         if (do_memsw_account()) {
1096                 count = page_counter_read(&memcg->memsw);
1097                 limit = READ_ONCE(memcg->memsw.limit);
1098                 if (count <= limit)
1099                         margin = min(margin, limit - count);
1100                 else
1101                         margin = 0;
1102         }
1103
1104         return margin;
1105 }
1106
1107 /*
1108  * A routine for checking "mem" is under move_account() or not.
1109  *
1110  * Checking a cgroup is mc.from or mc.to or under hierarchy of
1111  * moving cgroups. This is for waiting at high-memory pressure
1112  * caused by "move".
1113  */
1114 static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1115 {
1116         struct mem_cgroup *from;
1117         struct mem_cgroup *to;
1118         bool ret = false;
1119         /*
1120          * Unlike task_move routines, we access mc.to, mc.from not under
1121          * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
1122          */
1123         spin_lock(&mc.lock);
1124         from = mc.from;
1125         to = mc.to;
1126         if (!from)
1127                 goto unlock;
1128
1129         ret = mem_cgroup_is_descendant(from, memcg) ||
1130                 mem_cgroup_is_descendant(to, memcg);
1131 unlock:
1132         spin_unlock(&mc.lock);
1133         return ret;
1134 }
1135
1136 static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1137 {
1138         if (mc.moving_task && current != mc.moving_task) {
1139                 if (mem_cgroup_under_move(memcg)) {
1140                         DEFINE_WAIT(wait);
1141                         prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1142                         /* moving charge context might have finished. */
1143                         if (mc.moving_task)
1144                                 schedule();
1145                         finish_wait(&mc.waitq, &wait);
1146                         return true;
1147                 }
1148         }
1149         return false;
1150 }
1151
1152 #define K(x) ((x) << (PAGE_SHIFT-10))
1153 /**
1154  * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
1155  * @memcg: The memory cgroup that went over limit
1156  * @p: Task that is going to be killed
1157  *
1158  * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
1159  * enabled
1160  */
1161 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1162 {
1163         struct mem_cgroup *iter;
1164         unsigned int i;
1165
1166         rcu_read_lock();
1167
1168         if (p) {
1169                 pr_info("Task in ");
1170                 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1171                 pr_cont(" killed as a result of limit of ");
1172         } else {
1173                 pr_info("Memory limit reached of cgroup ");
1174         }
1175
1176         pr_cont_cgroup_path(memcg->css.cgroup);
1177         pr_cont("\n");
1178
1179         rcu_read_unlock();
1180
1181         pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
1182                 K((u64)page_counter_read(&memcg->memory)),
1183                 K((u64)memcg->memory.limit), memcg->memory.failcnt);
1184         pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1185                 K((u64)page_counter_read(&memcg->memsw)),
1186                 K((u64)memcg->memsw.limit), memcg->memsw.failcnt);
1187         pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1188                 K((u64)page_counter_read(&memcg->kmem)),
1189                 K((u64)memcg->kmem.limit), memcg->kmem.failcnt);
1190
1191         for_each_mem_cgroup_tree(iter, memcg) {
1192                 pr_info("Memory cgroup stats for ");
1193                 pr_cont_cgroup_path(iter->css.cgroup);
1194                 pr_cont(":");
1195
1196                 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
1197                         if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
1198                                 continue;
1199                         pr_cont(" %s:%luKB", mem_cgroup_stat_names[i],
1200                                 K(mem_cgroup_read_stat(iter, i)));
1201                 }
1202
1203                 for (i = 0; i < NR_LRU_LISTS; i++)
1204                         pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
1205                                 K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
1206
1207                 pr_cont("\n");
1208         }
1209 }
1210
1211 /*
1212  * This function returns the number of memcg under hierarchy tree. Returns
1213  * 1(self count) if no children.
1214  */
1215 static int mem_cgroup_count_children(struct mem_cgroup *memcg)
1216 {
1217         int num = 0;
1218         struct mem_cgroup *iter;
1219
1220         for_each_mem_cgroup_tree(iter, memcg)
1221                 num++;
1222         return num;
1223 }
1224
1225 /*
1226  * Return the memory (and swap, if configured) limit for a memcg.
1227  */
1228 unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
1229 {
1230         unsigned long limit;
1231
1232         limit = memcg->memory.limit;
1233         if (mem_cgroup_swappiness(memcg)) {
1234                 unsigned long memsw_limit;
1235                 unsigned long swap_limit;
1236
1237                 memsw_limit = memcg->memsw.limit;
1238                 swap_limit = memcg->swap.limit;
1239                 swap_limit = min(swap_limit, (unsigned long)total_swap_pages);
1240                 limit = min(limit + swap_limit, memsw_limit);
1241         }
1242         return limit;
1243 }
1244
1245 static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1246                                      int order)
1247 {
1248         struct oom_control oc = {
1249                 .zonelist = NULL,
1250                 .nodemask = NULL,
1251                 .memcg = memcg,
1252                 .gfp_mask = gfp_mask,
1253                 .order = order,
1254         };
1255         bool ret;
1256
1257         mutex_lock(&oom_lock);
1258         ret = out_of_memory(&oc);
1259         mutex_unlock(&oom_lock);
1260         return ret;
1261 }
1262
1263 #if MAX_NUMNODES > 1
1264
1265 /**
1266  * test_mem_cgroup_node_reclaimable
1267  * @memcg: the target memcg
1268  * @nid: the node ID to be checked.
1269  * @noswap : specify true here if the user wants flle only information.
1270  *
1271  * This function returns whether the specified memcg contains any
1272  * reclaimable pages on a node. Returns true if there are any reclaimable
1273  * pages in the node.
1274  */
1275 static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1276                 int nid, bool noswap)
1277 {
1278         if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
1279                 return true;
1280         if (noswap || !total_swap_pages)
1281                 return false;
1282         if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
1283                 return true;
1284         return false;
1285
1286 }
1287
1288 /*
1289  * Always updating the nodemask is not very good - even if we have an empty
1290  * list or the wrong list here, we can start from some node and traverse all
1291  * nodes based on the zonelist. So update the list loosely once per 10 secs.
1292  *
1293  */
1294 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1295 {
1296         int nid;
1297         /*
1298          * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
1299          * pagein/pageout changes since the last update.
1300          */
1301         if (!atomic_read(&memcg->numainfo_events))
1302                 return;
1303         if (atomic_inc_return(&memcg->numainfo_updating) > 1)
1304                 return;
1305
1306         /* make a nodemask where this memcg uses memory from */
1307         memcg->scan_nodes = node_states[N_MEMORY];
1308
1309         for_each_node_mask(nid, node_states[N_MEMORY]) {
1310
1311                 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1312                         node_clear(nid, memcg->scan_nodes);
1313         }
1314
1315         atomic_set(&memcg->numainfo_events, 0);
1316         atomic_set(&memcg->numainfo_updating, 0);
1317 }
1318
1319 /*
1320  * Selecting a node where we start reclaim from. Because what we need is just
1321  * reducing usage counter, start from anywhere is O,K. Considering
1322  * memory reclaim from current node, there are pros. and cons.
1323  *
1324  * Freeing memory from current node means freeing memory from a node which
1325  * we'll use or we've used. So, it may make LRU bad. And if several threads
1326  * hit limits, it will see a contention on a node. But freeing from remote
1327  * node means more costs for memory reclaim because of memory latency.
1328  *
1329  * Now, we use round-robin. Better algorithm is welcomed.
1330  */
1331 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1332 {
1333         int node;
1334
1335         mem_cgroup_may_update_nodemask(memcg);
1336         node = memcg->last_scanned_node;
1337
1338         node = next_node_in(node, memcg->scan_nodes);
1339         /*
1340          * mem_cgroup_may_update_nodemask might have seen no reclaimmable pages
1341          * last time it really checked all the LRUs due to rate limiting.
1342          * Fallback to the current node in that case for simplicity.
1343          */
1344         if (unlikely(node == MAX_NUMNODES))
1345                 node = numa_node_id();
1346
1347         memcg->last_scanned_node = node;
1348         return node;
1349 }
1350 #else
/* Variant for builds where MAX_NUMNODES is not greater than 1: always node 0. */
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
        return 0;
}
1355 #endif
1356
1357 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1358                                    pg_data_t *pgdat,
1359                                    gfp_t gfp_mask,
1360                                    unsigned long *total_scanned)
1361 {
1362         struct mem_cgroup *victim = NULL;
1363         int total = 0;
1364         int loop = 0;
1365         unsigned long excess;
1366         unsigned long nr_scanned;
1367         struct mem_cgroup_reclaim_cookie reclaim = {
1368                 .pgdat = pgdat,
1369                 .priority = 0,
1370         };
1371
1372         excess = soft_limit_excess(root_memcg);
1373
1374         while (1) {
1375                 victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
1376                 if (!victim) {
1377                         loop++;
1378                         if (loop >= 2) {
1379                                 /*
1380                                  * If we have not been able to reclaim
1381                                  * anything, it might because there are
1382                                  * no reclaimable pages under this hierarchy
1383                                  */
1384                                 if (!total)
1385                                         break;
1386                                 /*
1387                                  * We want to do more targeted reclaim.
1388                                  * excess >> 2 is not to excessive so as to
1389                                  * reclaim too much, nor too less that we keep
1390                                  * coming back to reclaim from this cgroup
1391                                  */
1392                                 if (total >= (excess >> 2) ||
1393                                         (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
1394                                         break;
1395                         }
1396                         continue;
1397                 }
1398                 total += mem_cgroup_shrink_node(victim, gfp_mask, false,
1399                                         pgdat, &nr_scanned);
1400                 *total_scanned += nr_scanned;
1401                 if (!soft_limit_excess(root_memcg))
1402                         break;
1403         }
1404         mem_cgroup_iter_break(root_memcg, victim);
1405         return total;
1406 }
1407
/*
 * Lockdep map passed to mutex_acquire()/mutex_release() by
 * mem_cgroup_oom_trylock() and mem_cgroup_oom_unlock().
 */
#ifdef CONFIG_LOCKDEP
static struct lockdep_map memcg_oom_lock_dep_map = {
        .name = "memcg_oom_lock",
};
#endif

/* Held while the oom_lock and under_oom fields of a hierarchy are updated. */
static DEFINE_SPINLOCK(memcg_oom_lock);
1415
1416 /*
1417  * Check OOM-Killer is already running under our hierarchy.
1418  * If someone is running, return false.
1419  */
1420 static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
1421 {
1422         struct mem_cgroup *iter, *failed = NULL;
1423
1424         spin_lock(&memcg_oom_lock);
1425
1426         for_each_mem_cgroup_tree(iter, memcg) {
1427                 if (iter->oom_lock) {
1428                         /*
1429                          * this subtree of our hierarchy is already locked
1430                          * so we cannot give a lock.
1431                          */
1432                         failed = iter;
1433                         mem_cgroup_iter_break(memcg, iter);
1434                         break;
1435                 } else
1436                         iter->oom_lock = true;
1437         }
1438
1439         if (failed) {
1440                 /*
1441                  * OK, we failed to lock the whole subtree so we have
1442                  * to clean up what we set up to the failing subtree
1443                  */
1444                 for_each_mem_cgroup_tree(iter, memcg) {
1445                         if (iter == failed) {
1446                                 mem_cgroup_iter_break(memcg, iter);
1447                                 break;
1448                         }
1449                         iter->oom_lock = false;
1450                 }
1451         } else
1452                 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
1453
1454         spin_unlock(&memcg_oom_lock);
1455
1456         return !failed;
1457 }
1458
1459 static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
1460 {
1461         struct mem_cgroup *iter;
1462
1463         spin_lock(&memcg_oom_lock);
1464         mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
1465         for_each_mem_cgroup_tree(iter, memcg)
1466                 iter->oom_lock = false;
1467         spin_unlock(&memcg_oom_lock);
1468 }
1469
1470 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
1471 {
1472         struct mem_cgroup *iter;
1473
1474         spin_lock(&memcg_oom_lock);
1475         for_each_mem_cgroup_tree(iter, memcg)
1476                 iter->under_oom++;
1477         spin_unlock(&memcg_oom_lock);
1478 }
1479
1480 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
1481 {
1482         struct mem_cgroup *iter;
1483
1484         /*
1485          * When a new child is created while the hierarchy is under oom,
1486          * mem_cgroup_oom_lock() may not be called. Watch for underflow.
1487          */
1488         spin_lock(&memcg_oom_lock);
1489         for_each_mem_cgroup_tree(iter, memcg)
1490                 if (iter->under_oom > 0)
1491                         iter->under_oom--;
1492         spin_unlock(&memcg_oom_lock);
1493 }
1494
/* Wait queue that mem_cgroup_oom_synchronize() sleeps on and memcg_oom_recover() wakes. */
static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);

/*
 * Wait queue entry for memcg_oom_waitq, paired with the memcg the sleeper
 * is waiting for; memcg_oom_wake_function() gets back here from @wait via
 * container_of().
 */
struct oom_wait_info {
        struct mem_cgroup *memcg;       /* memcg whose OOM the task waits on */
        wait_queue_t    wait;           /* entry queued on memcg_oom_waitq */
};
1501
1502 static int memcg_oom_wake_function(wait_queue_t *wait,
1503         unsigned mode, int sync, void *arg)
1504 {
1505         struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
1506         struct mem_cgroup *oom_wait_memcg;
1507         struct oom_wait_info *oom_wait_info;
1508
1509         oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1510         oom_wait_memcg = oom_wait_info->memcg;
1511
1512         if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
1513             !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
1514                 return 0;
1515         return autoremove_wake_function(wait, mode, sync, arg);
1516 }
1517
1518 static void memcg_oom_recover(struct mem_cgroup *memcg)
1519 {
1520         /*
1521          * For the following lockless ->under_oom test, the only required
1522          * guarantee is that it must see the state asserted by an OOM when
1523          * this function is called as a result of userland actions
1524          * triggered by the notification of the OOM.  This is trivially
1525          * achieved by invoking mem_cgroup_mark_under_oom() before
1526          * triggering notification.
1527          */
1528         if (memcg && memcg->under_oom)
1529                 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
1530 }
1531
1532 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
1533 {
1534         if (!current->memcg_may_oom)
1535                 return;
1536         /*
1537          * We are in the middle of the charge context here, so we
1538          * don't want to block when potentially sitting on a callstack
1539          * that holds all kinds of filesystem and mm locks.
1540          *
1541          * Also, the caller may handle a failed allocation gracefully
1542          * (like optional page cache readahead) and so an OOM killer
1543          * invocation might not even be necessary.
1544          *
1545          * That's why we don't do anything here except remember the
1546          * OOM context and then deal with it at the end of the page
1547          * fault when the stack is unwound, the locks are released,
1548          * and when we know whether the fault was overall successful.
1549          */
1550         css_get(&memcg->css);
1551         current->memcg_in_oom = memcg;
1552         current->memcg_oom_gfp_mask = mask;
1553         current->memcg_oom_order = order;
1554 }
1555
1556 /**
1557  * mem_cgroup_oom_synchronize - complete memcg OOM handling
1558  * @handle: actually kill/wait or just clean up the OOM state
1559  *
1560  * This has to be called at the end of a page fault if the memcg OOM
1561  * handler was enabled.
1562  *
1563  * Memcg supports userspace OOM handling where failed allocations must
1564  * sleep on a waitqueue until the userspace task resolves the
1565  * situation.  Sleeping directly in the charge context with all kinds
1566  * of locks held is not a good idea, instead we remember an OOM state
1567  * in the task and mem_cgroup_oom_synchronize() has to be called at
1568  * the end of the page fault to complete the OOM handling.
1569  *
1570  * Returns %true if an ongoing memcg OOM situation was detected and
1571  * completed, %false otherwise.
1572  */
1573 bool mem_cgroup_oom_synchronize(bool handle)
1574 {
1575         struct mem_cgroup *memcg = current->memcg_in_oom;
1576         struct oom_wait_info owait;
1577         bool locked;
1578
1579         /* OOM is global, do not handle */
1580         if (!memcg)
1581                 return false;
1582
1583         if (!handle)
1584                 goto cleanup;
1585
1586         owait.memcg = memcg;
1587         owait.wait.flags = 0;
1588         owait.wait.func = memcg_oom_wake_function;
1589         owait.wait.private = current;
1590         INIT_LIST_HEAD(&owait.wait.task_list);
1591
1592         prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1593         mem_cgroup_mark_under_oom(memcg);
1594
1595         locked = mem_cgroup_oom_trylock(memcg);
1596
1597         if (locked)
1598                 mem_cgroup_oom_notify(memcg);
1599
1600         if (locked && !memcg->oom_kill_disable) {
1601                 mem_cgroup_unmark_under_oom(memcg);
1602                 finish_wait(&memcg_oom_waitq, &owait.wait);
1603                 mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
1604                                          current->memcg_oom_order);
1605         } else {
1606                 schedule();
1607                 mem_cgroup_unmark_under_oom(memcg);
1608                 finish_wait(&memcg_oom_waitq, &owait.wait);
1609         }
1610
1611         if (locked) {
1612                 mem_cgroup_oom_unlock(memcg);
1613                 /*
1614                  * There is no guarantee that an OOM-lock contender
1615                  * sees the wakeups triggered by the OOM kill
1616                  * uncharges.  Wake any sleepers explicitely.
1617                  */
1618                 memcg_oom_recover(memcg);
1619         }
1620 cleanup:
1621         current->memcg_in_oom = NULL;
1622         css_put(&memcg->css);
1623         return true;
1624 }
1625
1626 /**
1627  * lock_page_memcg - lock a page->mem_cgroup binding
1628  * @page: the page
1629  *
1630  * This function protects unlocked LRU pages from being moved to
1631  * another cgroup and stabilizes their page->mem_cgroup binding.
1632  */
1633 void lock_page_memcg(struct page *page)
1634 {
1635         struct mem_cgroup *memcg;
1636         unsigned long flags;
1637
1638         /*
1639          * The RCU lock is held throughout the transaction.  The fast
1640          * path can get away without acquiring the memcg->move_lock
1641          * because page moving starts with an RCU grace period.
1642          */
1643         rcu_read_lock();
1644
1645         if (mem_cgroup_disabled())
1646                 return;
1647 again:
1648         memcg = page->mem_cgroup;
1649         if (unlikely(!memcg))
1650                 return;
1651
1652         if (atomic_read(&memcg->moving_account) <= 0)
1653                 return;
1654
1655         spin_lock_irqsave(&memcg->move_lock, flags);
1656         if (memcg != page->mem_cgroup) {
1657                 spin_unlock_irqrestore(&memcg->move_lock, flags);
1658                 goto again;
1659         }
1660
1661         /*
1662          * When charge migration first begins, we can have locked and
1663          * unlocked page stat updates happening concurrently.  Track
1664          * the task who has the lock for unlock_page_memcg().
1665          */
1666         memcg->move_lock_task = current;
1667         memcg->move_lock_flags = flags;
1668
1669         return;
1670 }
1671 EXPORT_SYMBOL(lock_page_memcg);
1672
1673 /**
1674  * unlock_page_memcg - unlock a page->mem_cgroup binding
1675  * @page: the page
1676  */
1677 void unlock_page_memcg(struct page *page)
1678 {
1679         struct mem_cgroup *memcg = page->mem_cgroup;
1680
1681         if (memcg && memcg->move_lock_task == current) {
1682                 unsigned long flags = memcg->move_lock_flags;
1683
1684                 memcg->move_lock_task = NULL;
1685                 memcg->move_lock_flags = 0;
1686
1687                 spin_unlock_irqrestore(&memcg->move_lock, flags);
1688         }
1689
1690         rcu_read_unlock();
1691 }
1692 EXPORT_SYMBOL(unlock_page_memcg);
1693
1694 /*
1695  * size of first charge trial. "32" comes from vmscan.c's magic value.
1696  * TODO: maybe necessary to use big numbers in big irons.
1697  */
1698 #define CHARGE_BATCH    32U
1699 struct memcg_stock_pcp {
1700         struct mem_cgroup *cached; /* this never be root cgroup */
1701         unsigned int nr_pages;
1702         struct work_struct work;
1703         unsigned long flags;
1704 #define FLUSHING_CACHED_CHARGE  0
1705 };
1706 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
1707 static DEFINE_MUTEX(percpu_charge_mutex);
1708
1709 /**
1710  * consume_stock: Try to consume stocked charge on this cpu.
1711  * @memcg: memcg to consume from.
1712  * @nr_pages: how many pages to charge.
1713  *
1714  * The charges will only happen if @memcg matches the current cpu's memcg
1715  * stock, and at least @nr_pages are available in that stock.  Failure to
1716  * service an allocation will refill the stock.
1717  *
1718  * returns true if successful, false otherwise.
1719  */
1720 static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
1721 {
1722         struct memcg_stock_pcp *stock;
1723         unsigned long flags;
1724         bool ret = false;
1725
1726         if (nr_pages > CHARGE_BATCH)
1727                 return ret;
1728
1729         local_irq_save(flags);
1730
1731         stock = this_cpu_ptr(&memcg_stock);
1732         if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
1733                 stock->nr_pages -= nr_pages;
1734                 ret = true;
1735         }
1736
1737         local_irq_restore(flags);
1738
1739         return ret;
1740 }
1741
1742 /*
1743  * Returns stocks cached in percpu and reset cached information.
1744  */
1745 static void drain_stock(struct memcg_stock_pcp *stock)
1746 {
1747         struct mem_cgroup *old = stock->cached;
1748
1749         if (stock->nr_pages) {
1750                 page_counter_uncharge(&old->memory, stock->nr_pages);
1751                 if (do_memsw_account())
1752                         page_counter_uncharge(&old->memsw, stock->nr_pages);
1753                 css_put_many(&old->css, stock->nr_pages);
1754                 stock->nr_pages = 0;
1755         }
1756         stock->cached = NULL;
1757 }
1758
1759 static void drain_local_stock(struct work_struct *dummy)
1760 {
1761         struct memcg_stock_pcp *stock;
1762         unsigned long flags;
1763
1764         local_irq_save(flags);
1765
1766         stock = this_cpu_ptr(&memcg_stock);
1767         drain_stock(stock);
1768         clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
1769
1770         local_irq_restore(flags);
1771 }
1772
1773 /*
1774  * Cache charges(val) to local per_cpu area.
1775  * This will be consumed by consume_stock() function, later.
1776  */
1777 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
1778 {
1779         struct memcg_stock_pcp *stock;
1780         unsigned long flags;
1781
1782         local_irq_save(flags);
1783
1784         stock = this_cpu_ptr(&memcg_stock);
1785         if (stock->cached != memcg) { /* reset if necessary */
1786                 drain_stock(stock);
1787                 stock->cached = memcg;
1788         }
1789         stock->nr_pages += nr_pages;
1790
1791         local_irq_restore(flags);
1792 }
1793
1794 /*
1795  * Drains all per-CPU charge caches for given root_memcg resp. subtree
1796  * of the hierarchy under it.
1797  */
1798 static void drain_all_stock(struct mem_cgroup *root_memcg)
1799 {
1800         int cpu, curcpu;
1801
1802         /* If someone's already draining, avoid adding running more workers. */
1803         if (!mutex_trylock(&percpu_charge_mutex))
1804                 return;
1805         /* Notify other cpus that system-wide "drain" is running */
1806         get_online_cpus();
1807         curcpu = get_cpu();
1808         for_each_online_cpu(cpu) {
1809                 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
1810                 struct mem_cgroup *memcg;
1811
1812                 memcg = stock->cached;
1813                 if (!memcg || !stock->nr_pages)
1814                         continue;
1815                 if (!mem_cgroup_is_descendant(memcg, root_memcg))
1816                         continue;
1817                 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
1818                         if (cpu == curcpu)
1819                                 drain_local_stock(&stock->work);
1820                         else
1821                                 schedule_work_on(cpu, &stock->work);
1822                 }
1823         }
1824         put_cpu();
1825         put_online_cpus();
1826         mutex_unlock(&percpu_charge_mutex);
1827 }
1828
1829 static int memcg_hotplug_cpu_dead(unsigned int cpu)
1830 {
1831         struct memcg_stock_pcp *stock;
1832
1833         stock = &per_cpu(memcg_stock, cpu);
1834         drain_stock(stock);
1835         return 0;
1836 }
1837
1838 static void reclaim_high(struct mem_cgroup *memcg,
1839                          unsigned int nr_pages,
1840                          gfp_t gfp_mask)
1841 {
1842         do {
1843                 if (page_counter_read(&memcg->memory) <= memcg->high)
1844                         continue;
1845                 mem_cgroup_events(memcg, MEMCG_HIGH, 1);
1846                 try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
1847         } while ((memcg = parent_mem_cgroup(memcg)));
1848 }
1849
1850 static void high_work_func(struct work_struct *work)
1851 {
1852         struct mem_cgroup *memcg;
1853
1854         memcg = container_of(work, struct mem_cgroup, high_work);
1855         reclaim_high(memcg, CHARGE_BATCH, GFP_KERNEL);
1856 }
1857
1858 /*
1859  * Scheduled by try_charge() to be executed from the userland return path
1860  * and reclaims memory over the high limit.
1861  */
1862 void mem_cgroup_handle_over_high(void)
1863 {
1864         unsigned int nr_pages = current->memcg_nr_pages_over_high;
1865         struct mem_cgroup *memcg;
1866
1867         if (likely(!nr_pages))
1868                 return;
1869
1870         memcg = get_mem_cgroup_from_mm(current->mm);
1871         reclaim_high(memcg, nr_pages, GFP_KERNEL);
1872         css_put(&memcg->css);
1873         current->memcg_nr_pages_over_high = 0;
1874 }
1875
/*
 * try_charge - charge @nr_pages to @memcg, reclaiming if necessary
 * @memcg: memcg to charge; the root cgroup is never charged
 * @gfp_mask: allocation context, decides whether reclaim may block
 * @nr_pages: number of pages to charge
 *
 * Tries the per-cpu stock first, then the page counters.  The counters
 * are first charged with a batch of at least CHARGE_BATCH pages; the
 * surplus goes into the per-cpu stock.  When a limit is hit the function
 * reclaims from the memcg that is over its limit and retries, and may
 * finally invoke the memcg OOM handling.
 *
 * On success css references are taken for the charged pages.
 *
 * Returns 0 if the pages were charged (possibly by force, exceeding the
 * limit) or -ENOMEM.
 */
static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
                      unsigned int nr_pages)
{
        unsigned int batch = max(CHARGE_BATCH, nr_pages);
        int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
        struct mem_cgroup *mem_over_limit;
        struct page_counter *counter;
        unsigned long nr_reclaimed;
        bool may_swap = true;
        bool drained = false;

        if (mem_cgroup_is_root(memcg))
                return 0;
retry:
        /* Fast path: pages already charged and parked on this cpu. */
        if (consume_stock(memcg, nr_pages))
                return 0;

        /*
         * Charge memsw first (when accounted), then memory.  If the
         * memory charge fails, the memsw charge is rolled back.
         * @counter is set by the failing page_counter_try_charge() and
         * is used to find the memcg that is over its limit.
         */
        if (!do_memsw_account() ||
            page_counter_try_charge(&memcg->memsw, batch, &counter)) {
                if (page_counter_try_charge(&memcg->memory, batch, &counter))
                        goto done_restock;
                if (do_memsw_account())
                        page_counter_uncharge(&memcg->memsw, batch);
                mem_over_limit = mem_cgroup_from_counter(counter, memory);
        } else {
                /* memsw limit hit: reclaim is told not to swap */
                mem_over_limit = mem_cgroup_from_counter(counter, memsw);
                may_swap = false;
        }

        /* The batched charge failed: retry with just what was asked for. */
        if (batch > nr_pages) {
                batch = nr_pages;
                goto retry;
        }

        /*
         * Unlike in global OOM situations, memcg is not in a physical
         * memory shortage.  Allow dying and OOM-killed tasks to
         * bypass the last charges so that they can exit quickly and
         * free their memory.
         */
        if (unlikely(test_thread_flag(TIF_MEMDIE) ||
                     fatal_signal_pending(current) ||
                     current->flags & PF_EXITING))
                goto force;

        /*
         * Prevent unbounded recursion when reclaim operations need to
         * allocate memory. This might exceed the limits temporarily,
         * but we prefer facilitating memory reclaim and getting back
         * under the limit over triggering OOM kills in these cases.
         */
        if (unlikely(current->flags & PF_MEMALLOC))
                goto force;

        if (unlikely(task_in_memcg_oom(current)))
                goto nomem;

        /* Reclaim below may block; callers that cannot block give up here. */
        if (!gfpflags_allow_blocking(gfp_mask))
                goto nomem;

        mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1);

        nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
                                                    gfp_mask, may_swap);

        if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
                goto retry;

        /* Flush the per-cpu stocks of the offending subtree, but only once. */
        if (!drained) {
                drain_all_stock(mem_over_limit);
                drained = true;
                goto retry;
        }

        if (gfp_mask & __GFP_NORETRY)
                goto nomem;
        /*
         * Even though the limit is exceeded at this point, reclaim
         * may have been able to free some pages.  Retry the charge
         * before killing the task.
         *
         * Only for regular pages, though: huge pages are rather
         * unlikely to succeed so close to the limit, and we fall back
         * to regular pages anyway in case of failure.
         */
        if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
                goto retry;
        /*
         * At task move, charge accounts can be doubly counted. So, it's
         * better to wait until the end of task_move if something is going on.
         */
        if (mem_cgroup_wait_acct_move(mem_over_limit))
                goto retry;

        if (nr_retries--)
                goto retry;

        if (gfp_mask & __GFP_NOFAIL)
                goto force;

        if (fatal_signal_pending(current))
                goto force;

        mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1);

        mem_cgroup_oom(mem_over_limit, gfp_mask,
                       get_order(nr_pages * PAGE_SIZE));
nomem:
        /* __GFP_NOFAIL charges fall through to the forced charge below. */
        if (!(gfp_mask & __GFP_NOFAIL))
                return -ENOMEM;
force:
        /*
         * The allocation either can't fail or will lead to more memory
         * being freed very soon.  Allow memory usage go over the limit
         * temporarily by force charging it.
         */
        page_counter_charge(&memcg->memory, nr_pages);
        if (do_memsw_account())
                page_counter_charge(&memcg->memsw, nr_pages);
        css_get_many(&memcg->css, nr_pages);

        return 0;

done_restock:
        /* One css reference per charged page; the surplus goes to the stock. */
        css_get_many(&memcg->css, batch);
        if (batch > nr_pages)
                refill_stock(memcg, batch - nr_pages);

        /*
         * If the hierarchy is above the normal consumption range, schedule
         * reclaim on returning to userland.  We can perform reclaim here
         * if __GFP_RECLAIM but let's always punt for simplicity and so that
         * GFP_KERNEL can consistently be used during reclaim.  @memcg is
         * not recorded as it most likely matches current's and won't
         * change in the meantime.  As high limit is checked again before
         * reclaim, the cost of mismatch is negligible.
         *
         * Note that @memcg is reused as the cursor of the walk towards
         * the root and no longer refers to the charged memcg afterwards.
         */
        do {
                if (page_counter_read(&memcg->memory) > memcg->high) {
                        /* Don't bother a random interrupted task */
                        if (in_interrupt()) {
                                schedule_work(&memcg->high_work);
                                break;
                        }
                        current->memcg_nr_pages_over_high += batch;
                        set_notify_resume(current);
                        break;
                }
        } while ((memcg = parent_mem_cgroup(memcg)));

        return 0;
}
2028
2029 static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2030 {
2031         if (mem_cgroup_is_root(memcg))
2032                 return;
2033
2034         page_counter_uncharge(&memcg->memory, nr_pages);
2035         if (do_memsw_account())
2036                 page_counter_uncharge(&memcg->memsw, nr_pages);
2037
2038         css_put_many(&memcg->css, nr_pages);
2039 }
2040
2041 static void lock_page_lru(struct page *page, int *isolated)
2042 {
2043         struct zone *zone = page_zone(page);
2044
2045         spin_lock_irq(zone_lru_lock(zone));
2046         if (PageLRU(page)) {
2047                 struct lruvec *lruvec;
2048
2049                 lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
2050                 ClearPageLRU(page);
2051                 del_page_from_lru_list(page, lruvec, page_lru(page));
2052                 *isolated = 1;
2053         } else
2054                 *isolated = 0;
2055 }
2056
2057 static void unlock_page_lru(struct page *page, int isolated)
2058 {
2059         struct zone *zone = page_zone(page);
2060
2061         if (isolated) {
2062                 struct lruvec *lruvec;
2063
2064                 lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
2065                 VM_BUG_ON_PAGE(PageLRU(page), page);
2066                 SetPageLRU(page);
2067                 add_page_to_lru_list(page, lruvec, page_lru(page));
2068         }
2069         spin_unlock_irq(zone_lru_lock(zone));
2070 }
2071
2072 static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2073                           bool lrucare)
2074 {
2075         int isolated;
2076
2077         VM_BUG_ON_PAGE(page->mem_cgroup, page);
2078
2079         /*
2080          * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
2081          * may already be on some other mem_cgroup's LRU.  Take care of it.
2082          */
2083         if (lrucare)
2084                 lock_page_lru(page, &isolated);
2085
2086         /*
2087          * Nobody should be changing or seriously looking at
2088          * page->mem_cgroup at this point:
2089          *
2090          * - the page is uncharged
2091          *
2092          * - the page is off-LRU
2093          *
2094          * - an anonymous fault has exclusive page access, except for
2095          *   a locked page table
2096          *
2097          * - a page cache insertion, a swapin fault, or a migration
2098          *   have the page locked
2099          */
2100         page->mem_cgroup = memcg;
2101
2102         if (lrucare)
2103                 unlock_page_lru(page, isolated);
2104 }
2105
2106 #ifndef CONFIG_SLOB
2107 static int memcg_alloc_cache_id(void)
2108 {
2109         int id, size;
2110         int err;
2111
2112         id = ida_simple_get(&memcg_cache_ida,
2113                             0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
2114         if (id < 0)
2115                 return id;
2116
2117         if (id < memcg_nr_cache_ids)
2118                 return id;
2119
2120         /*
2121          * There's no space for the new id in memcg_caches arrays,
2122          * so we have to grow them.
2123          */
2124         down_write(&memcg_cache_ids_sem);
2125
2126         size = 2 * (id + 1);
2127         if (size < MEMCG_CACHES_MIN_SIZE)
2128                 size = MEMCG_CACHES_MIN_SIZE;
2129         else if (size > MEMCG_CACHES_MAX_SIZE)
2130                 size = MEMCG_CACHES_MAX_SIZE;
2131
2132         err = memcg_update_all_caches(size);
2133         if (!err)
2134                 err = memcg_update_all_list_lrus(size);
2135         if (!err)
2136                 memcg_nr_cache_ids = size;
2137
2138         up_write(&memcg_cache_ids_sem);
2139
2140         if (err) {
2141                 ida_simple_remove(&memcg_cache_ida, id);
2142                 return err;
2143         }
2144         return id;
2145 }
2146
/* Return a cache id obtained from memcg_alloc_cache_id() to the ida. */
static void memcg_free_cache_id(int id)
{
        ida_simple_remove(&memcg_cache_ida, id);
}
2151
/* Deferred request to create the per-memcg copy of a kmem cache. */
struct memcg_kmem_cache_create_work {
        struct mem_cgroup *memcg;       /* css reference held until the work ran */
        struct kmem_cache *cachep;      /* cache to create the per-memcg copy of */
        struct work_struct work;
};
2157
2158 static void memcg_kmem_cache_create_func(struct work_struct *w)
2159 {
2160         struct memcg_kmem_cache_create_work *cw =
2161                 container_of(w, struct memcg_kmem_cache_create_work, work);
2162         struct mem_cgroup *memcg = cw->memcg;
2163         struct kmem_cache *cachep = cw->cachep;
2164
2165         memcg_create_kmem_cache(memcg, cachep);
2166
2167         css_put(&memcg->css);
2168         kfree(cw);
2169 }
2170
2171 /*
2172  * Enqueue the creation of a per-memcg kmem_cache.
2173  */
2174 static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2175                                                struct kmem_cache *cachep)
2176 {
2177         struct memcg_kmem_cache_create_work *cw;
2178
2179         cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
2180         if (!cw)
2181                 return;
2182
2183         css_get(&memcg->css);
2184
2185         cw->memcg = memcg;
2186         cw->cachep = cachep;
2187         INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
2188
2189         queue_work(memcg_kmem_cache_wq, &cw->work);
2190 }
2191
/*
 * Schedule the creation of the per-memcg copy of @cachep for @memcg,
 * with kmem accounting switched off for the current task while doing so.
 */
static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
                                             struct kmem_cache *cachep)
{
        /*
         * We need to stop accounting when we kmalloc, because if the
         * corresponding kmalloc cache is not yet created, the first allocation
         * in __memcg_schedule_kmem_cache_create will recurse.
         *
         * However, it is better to enclose the whole function. Depending on
         * the debugging options enabled, INIT_WORK(), for instance, can
         * trigger an allocation. This, too, would make us recurse. Because at
         * this point we can't allow ourselves back into memcg_kmem_get_cache,
         * the safest choice is to do it like this, wrapping the whole function.
         */
        current->memcg_kmem_skip_account = 1;
        __memcg_schedule_kmem_cache_create(memcg, cachep);
        current->memcg_kmem_skip_account = 0;
}
2210
2211 static inline bool memcg_kmem_bypass(void)
2212 {
2213         if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
2214                 return true;
2215         return false;
2216 }
2217
2218 /**
2219  * memcg_kmem_get_cache: select the correct per-memcg cache for allocation
2220  * @cachep: the original global kmem cache
2221  *
2222  * Return the kmem_cache we're supposed to use for a slab allocation.
2223  * We try to use the current memcg's version of the cache.
2224  *
2225  * If the cache does not exist yet, if we are the first user of it, we
2226  * create it asynchronously in a workqueue and let the current allocation
2227  * go through with the original cache.
2228  *
2229  * This function takes a reference to the cache it returns to assure it
2230  * won't get destroyed while we are working with it. Once the caller is
2231  * done with it, memcg_kmem_put_cache() must be called to release the
2232  * reference.
2233  */
2234 struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
2235 {
2236         struct mem_cgroup *memcg;
2237         struct kmem_cache *memcg_cachep;
2238         int kmemcg_id;
2239
2240         VM_BUG_ON(!is_root_cache(cachep));
2241
2242         if (memcg_kmem_bypass())
2243                 return cachep;
2244
2245         if (current->memcg_kmem_skip_account)
2246                 return cachep;
2247
2248         memcg = get_mem_cgroup_from_mm(current->mm);
2249         kmemcg_id = READ_ONCE(memcg->kmemcg_id);
2250         if (kmemcg_id < 0)
2251                 goto out;
2252
2253         memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
2254         if (likely(memcg_cachep))
2255                 return memcg_cachep;
2256
2257         /*
2258          * If we are in a safe context (can wait, and not in interrupt
2259          * context), we could be be predictable and return right away.
2260          * This would guarantee that the allocation being performed
2261          * already belongs in the new cache.
2262          *
2263          * However, there are some clashes that can arrive from locking.
2264          * For instance, because we acquire the slab_mutex while doing
2265          * memcg_create_kmem_cache, this means no further allocation
2266          * could happen with the slab_mutex held. So it's better to
2267          * defer everything.
2268          */
2269         memcg_schedule_kmem_cache_create(memcg, cachep);
2270 out:
2271         css_put(&memcg->css);
2272         return cachep;
2273 }
2274
2275 /**
2276  * memcg_kmem_put_cache: drop reference taken by memcg_kmem_get_cache
2277  * @cachep: the cache returned by memcg_kmem_get_cache
2278  */
2279 void memcg_kmem_put_cache(struct kmem_cache *cachep)
2280 {
2281         if (!is_root_cache(cachep))
2282                 css_put(&cachep->memcg_params.memcg->css);
2283 }
2284
2285 /**
2286  * memcg_kmem_charge: charge a kmem page
2287  * @page: page to charge
2288  * @gfp: reclaim mode
2289  * @order: allocation order
2290  * @memcg: memory cgroup to charge
2291  *
2292  * Returns 0 on success, an error code on failure.
2293  */
2294 int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
2295                             struct mem_cgroup *memcg)
2296 {
2297         unsigned int nr_pages = 1 << order;
2298         struct page_counter *counter;
2299         int ret;
2300
2301         ret = try_charge(memcg, gfp, nr_pages);
2302         if (ret)
2303                 return ret;
2304
2305         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
2306             !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
2307                 cancel_charge(memcg, nr_pages);
2308                 return -ENOMEM;
2309         }
2310
2311         page->mem_cgroup = memcg;
2312
2313         return 0;
2314 }
2315
2316 /**
2317  * memcg_kmem_charge: charge a kmem page to the current memory cgroup
2318  * @page: page to charge
2319  * @gfp: reclaim mode
2320  * @order: allocation order
2321  *
2322  * Returns 0 on success, an error code on failure.
2323  */
2324 int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
2325 {
2326         struct mem_cgroup *memcg;
2327         int ret = 0;
2328
2329         if (memcg_kmem_bypass())
2330                 return 0;
2331
2332         memcg = get_mem_cgroup_from_mm(current->mm);
2333         if (!mem_cgroup_is_root(memcg)) {
2334                 ret = memcg_kmem_charge_memcg(page, gfp, order, memcg);
2335                 if (!ret)
2336                         __SetPageKmemcg(page);
2337         }
2338         css_put(&memcg->css);
2339         return ret;
2340 }
2341 /**
2342  * memcg_kmem_uncharge: uncharge a kmem page
2343  * @page: page to uncharge
2344  * @order: allocation order
2345  */
2346 void memcg_kmem_uncharge(struct page *page, int order)
2347 {
2348         struct mem_cgroup *memcg = page->mem_cgroup;
2349         unsigned int nr_pages = 1 << order;
2350
2351         if (!memcg)
2352                 return;
2353
2354         VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
2355
2356         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
2357                 page_counter_uncharge(&memcg->kmem, nr_pages);
2358
2359         page_counter_uncharge(&memcg->memory, nr_pages);
2360         if (do_memsw_account())
2361                 page_counter_uncharge(&memcg->memsw, nr_pages);
2362
2363         page->mem_cgroup = NULL;
2364
2365         /* slab pages do not have PageKmemcg flag set */
2366         if (PageKmemcg(page))
2367                 __ClearPageKmemcg(page);
2368
2369         css_put_many(&memcg->css, nr_pages);
2370 }
2371 #endif /* !CONFIG_SLOB */
2372
2373 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
2374
2375 /*
2376  * Because tail pages are not marked as "used", set it. We're under
2377  * zone_lru_lock and migration entries setup in all page mappings.
2378  */
2379 void mem_cgroup_split_huge_fixup(struct page *head)
2380 {
2381         int i;
2382
2383         if (mem_cgroup_disabled())
2384                 return;
2385
2386         for (i = 1; i < HPAGE_PMD_NR; i++)
2387                 head[i].mem_cgroup = head->mem_cgroup;
2388
2389         __this_cpu_sub(head->mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
2390                        HPAGE_PMD_NR);
2391 }
2392 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
2393
2394 #ifdef CONFIG_MEMCG_SWAP
2395 static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
2396                                          bool charge)
2397 {
2398         int val = (charge) ? 1 : -1;
2399         this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
2400 }
2401
2402 /**
2403  * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
2404  * @entry: swap entry to be moved
2405  * @from:  mem_cgroup which the entry is moved from
2406  * @to:  mem_cgroup which the entry is moved to
2407  *
2408  * It succeeds only when the swap_cgroup's record for this entry is the same
2409  * as the mem_cgroup's id of @from.
2410  *
2411  * Returns 0 on success, -EINVAL on failure.
2412  *
2413  * The caller must have charged to @to, IOW, called page_counter_charge() about
2414  * both res and memsw, and called css_get().
2415  */
2416 static int mem_cgroup_move_swap_account(swp_entry_t entry,
2417                                 struct mem_cgroup *from, struct mem_cgroup *to)
2418 {
2419         unsigned short old_id, new_id;
2420
2421         old_id = mem_cgroup_id(from);
2422         new_id = mem_cgroup_id(to);
2423
2424         if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
2425                 mem_cgroup_swap_statistics(from, false);
2426                 mem_cgroup_swap_statistics(to, true);
2427                 return 0;
2428         }
2429         return -EINVAL;
2430 }
2431 #else
/* Without CONFIG_MEMCG_SWAP there is no swap record to move: always fails. */
static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
                                struct mem_cgroup *from, struct mem_cgroup *to)
{
        return -EINVAL;
}
2437 #endif
2438
2439 static DEFINE_MUTEX(memcg_limit_mutex);
2440
2441 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
2442                                    unsigned long limit)
2443 {
2444         unsigned long curusage;
2445         unsigned long oldusage;
2446         bool enlarge = false;
2447         int retry_count;
2448         int ret;
2449
2450         /*
2451          * For keeping hierarchical_reclaim simple, how long we should retry
2452          * is depends on callers. We set our retry-count to be function
2453          * of # of children which we should visit in this loop.
2454          */
2455         retry_count = MEM_CGROUP_RECLAIM_RETRIES *
2456                       mem_cgroup_count_children(memcg);
2457
2458         oldusage = page_counter_read(&memcg->memory);
2459
2460         do {
2461                 if (signal_pending(current)) {
2462                         ret = -EINTR;
2463                         break;
2464                 }
2465
2466                 mutex_lock(&memcg_limit_mutex);
2467                 if (limit > memcg->memsw.limit) {
2468                         mutex_unlock(&memcg_limit_mutex);
2469                         ret = -EINVAL;
2470                         break;
2471                 }
2472                 if (limit > memcg->memory.limit)
2473                         enlarge = true;
2474                 ret = page_counter_limit(&memcg->memory, limit);
2475                 mutex_unlock(&memcg_limit_mutex);
2476
2477                 if (!ret)
2478                         break;
2479
2480                 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);
2481
2482                 curusage = page_counter_read(&memcg->memory);
2483                 /* Usage is reduced ? */
2484                 if (curusage >= oldusage)
2485                         retry_count--;
2486                 else
2487                         oldusage = curusage;
2488         } while (retry_count);
2489
2490         if (!ret && enlarge)
2491                 memcg_oom_recover(memcg);
2492
2493         return ret;
2494 }
2495
2496 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
2497                                          unsigned long limit)
2498 {
2499         unsigned long curusage;
2500         unsigned long oldusage;
2501         bool enlarge = false;
2502         int retry_count;
2503         int ret;
2504
2505         /* see mem_cgroup_resize_res_limit */
2506         retry_count = MEM_CGROUP_RECLAIM_RETRIES *
2507                       mem_cgroup_count_children(memcg);
2508
2509         oldusage = page_counter_read(&memcg->memsw);
2510
2511         do {
2512                 if (signal_pending(current)) {
2513                         ret = -EINTR;
2514                         break;
2515                 }
2516
2517                 mutex_lock(&memcg_limit_mutex);
2518                 if (limit < memcg->memory.limit) {
2519                         mutex_unlock(&memcg_limit_mutex);
2520                         ret = -EINVAL;
2521                         break;
2522                 }
2523                 if (limit > memcg->memsw.limit)
2524                         enlarge = true;
2525                 ret = page_counter_limit(&memcg->memsw, limit);
2526                 mutex_unlock(&memcg_limit_mutex);
2527
2528                 if (!ret)
2529                         break;
2530
2531                 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false);
2532
2533                 curusage = page_counter_read(&memcg->memsw);
2534                 /* Usage is reduced ? */
2535                 if (curusage >= oldusage)
2536                         retry_count--;
2537                 else
2538                         oldusage = curusage;
2539         } while (retry_count);
2540
2541         if (!ret && enlarge)
2542                 memcg_oom_recover(memcg);
2543
2544         return ret;
2545 }
2546
2547 unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
2548                                             gfp_t gfp_mask,
2549                                             unsigned long *total_scanned)
2550 {
2551         unsigned long nr_reclaimed = 0;
2552         struct mem_cgroup_per_node *mz, *next_mz = NULL;
2553         unsigned long reclaimed;
2554         int loop = 0;
2555         struct mem_cgroup_tree_per_node *mctz;
2556         unsigned long excess;
2557         unsigned long nr_scanned;
2558
2559         if (order > 0)
2560                 return 0;
2561
2562         mctz = soft_limit_tree_node(pgdat->node_id);
2563
2564         /*
2565          * Do not even bother to check the largest node if the root
2566          * is empty. Do it lockless to prevent lock bouncing. Races
2567          * are acceptable as soft limit is best effort anyway.
2568          */
2569         if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
2570                 return 0;
2571
2572         /*
2573          * This loop can run a while, specially if mem_cgroup's continuously
2574          * keep exceeding their soft limit and putting the system under
2575          * pressure
2576          */
2577         do {
2578                 if (next_mz)
2579                         mz = next_mz;
2580                 else
2581                         mz = mem_cgroup_largest_soft_limit_node(mctz);
2582                 if (!mz)
2583                         break;
2584
2585                 nr_scanned = 0;
2586                 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
2587                                                     gfp_mask, &nr_scanned);
2588                 nr_reclaimed += reclaimed;
2589                 *total_scanned += nr_scanned;
2590                 spin_lock_irq(&mctz->lock);
2591                 __mem_cgroup_remove_exceeded(mz, mctz);
2592
2593                 /*
2594                  * If we failed to reclaim anything from this memory cgroup
2595                  * it is time to move on to the next cgroup
2596                  */
2597                 next_mz = NULL;
2598                 if (!reclaimed)
2599                         next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
2600
2601                 excess = soft_limit_excess(mz->memcg);
2602                 /*
2603                  * One school of thought says that we should not add
2604                  * back the node to the tree if reclaim returns 0.
2605                  * But our reclaim could return 0, simply because due
2606                  * to priority we are exposing a smaller subset of
2607                  * memory to reclaim from. Consider this as a longer
2608                  * term TODO.
2609                  */
2610                 /* If excess == 0, no tree ops */
2611                 __mem_cgroup_insert_exceeded(mz, mctz, excess);
2612                 spin_unlock_irq(&mctz->lock);
2613                 css_put(&mz->memcg->css);
2614                 loop++;
2615                 /*
2616                  * Could not reclaim anything and there are no more
2617                  * mem cgroups to try or we seem to be looping without
2618                  * reclaiming anything.
2619                  */
2620                 if (!nr_reclaimed &&
2621                         (next_mz == NULL ||
2622                         loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
2623                         break;
2624         } while (!nr_reclaimed);
2625         if (next_mz)
2626                 css_put(&next_mz->memcg->css);
2627         return nr_reclaimed;
2628 }
2629
2630 /*
2631  * Test whether @memcg has children, dead or alive.  Note that this
2632  * function doesn't care whether @memcg has use_hierarchy enabled and
2633  * returns %true if there are child csses according to the cgroup
2634  * hierarchy.  Testing use_hierarchy is the caller's responsiblity.
2635  */
2636 static inline bool memcg_has_children(struct mem_cgroup *memcg)
2637 {
2638         bool ret;
2639
2640         rcu_read_lock();
2641         ret = css_next_child(NULL, &memcg->css);
2642         rcu_read_unlock();
2643         return ret;
2644 }
2645
2646 /*
2647  * Reclaims as many pages from the given memcg as possible.
2648  *
2649  * Caller is responsible for holding css reference for memcg.
2650  */
2651 static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
2652 {
2653         int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2654
2655         /* we call try-to-free pages for make this cgroup empty */
2656         lru_add_drain_all();
2657         /* try to free all pages in this cgroup */
2658         while (nr_retries && page_counter_read(&memcg->memory)) {
2659                 int progress;
2660
2661                 if (signal_pending(current))
2662                         return -EINTR;
2663
2664                 progress = try_to_free_mem_cgroup_pages(memcg, 1,
2665                                                         GFP_KERNEL, true);
2666                 if (!progress) {
2667                         nr_retries--;
2668                         /* maybe some writeback is necessary */
2669                         congestion_wait(BLK_RW_ASYNC, HZ/10);
2670                 }
2671
2672         }
2673
2674         return 0;
2675 }
2676
2677 static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
2678                                             char *buf, size_t nbytes,
2679                                             loff_t off)
2680 {
2681         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
2682
2683         if (mem_cgroup_is_root(memcg))
2684                 return -EINVAL;
2685         return mem_cgroup_force_empty(memcg) ?: nbytes;
2686 }
2687
2688 static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
2689                                      struct cftype *cft)
2690 {
2691         return mem_cgroup_from_css(css)->use_hierarchy;
2692 }
2693
2694 static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
2695                                       struct cftype *cft, u64 val)
2696 {
2697         int retval = 0;
2698         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
2699         struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);
2700
2701         if (memcg->use_hierarchy == val)
2702                 return 0;
2703
2704         /*
2705          * If parent's use_hierarchy is set, we can't make any modifications
2706          * in the child subtrees. If it is unset, then the change can
2707          * occur, provided the current cgroup has no children.
2708          *
2709          * For the root cgroup, parent_mem is NULL, we allow value to be
2710          * set if there are no children.
2711          */
2712         if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
2713                                 (val == 1 || val == 0)) {
2714                 if (!memcg_has_children(memcg))
2715                         memcg->use_hierarchy = val;
2716                 else
2717                         retval = -EBUSY;
2718         } else
2719                 retval = -EINVAL;
2720
2721         return retval;
2722 }
2723
2724 static void tree_stat(struct mem_cgroup *memcg, unsigned long *stat)
2725 {
2726         struct mem_cgroup *iter;
2727         int i;
2728
2729         memset(stat, 0, sizeof(*stat) * MEMCG_NR_STAT);
2730
2731         for_each_mem_cgroup_tree(iter, memcg) {
2732                 for (i = 0; i < MEMCG_NR_STAT; i++)
2733                         stat[i] += mem_cgroup_read_stat(iter, i);
2734         }
2735 }
2736
2737 static void tree_events(struct mem_cgroup *memcg, unsigned long *events)
2738 {
2739         struct mem_cgroup *iter;
2740         int i;
2741
2742         memset(events, 0, sizeof(*events) * MEMCG_NR_EVENTS);
2743
2744         for_each_mem_cgroup_tree(iter, memcg) {
2745                 for (i = 0; i < MEMCG_NR_EVENTS; i++)
2746                         events[i] += mem_cgroup_read_events(iter, i);
2747         }
2748 }
2749
2750 static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
2751 {
2752         unsigned long val = 0;
2753
2754         if (mem_cgroup_is_root(memcg)) {
2755                 struct mem_cgroup *iter;
2756
2757                 for_each_mem_cgroup_tree(iter, memcg) {
2758                         val += mem_cgroup_read_stat(iter,
2759                                         MEM_CGROUP_STAT_CACHE);
2760                         val += mem_cgroup_read_stat(iter,
2761                                         MEM_CGROUP_STAT_RSS);
2762                         if (swap)
2763                                 val += mem_cgroup_read_stat(iter,
2764                                                 MEM_CGROUP_STAT_SWAP);
2765                 }
2766         } else {
2767                 if (!swap)
2768                         val = page_counter_read(&memcg->memory);
2769                 else
2770                         val = page_counter_read(&memcg->memsw);
2771         }
2772         return val;
2773 }
2774
/* Attribute selectors extracted from cftype->private with MEMFILE_ATTR(). */
enum {
	RES_USAGE = 0,		/* current usage */
	RES_LIMIT,		/* counter limit */
	RES_MAX_USAGE,		/* counter watermark */
	RES_FAILCNT,		/* counter failcnt */
	RES_SOFT_LIMIT,		/* memcg soft_limit */
};
2782
2783 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
2784                                struct cftype *cft)
2785 {
2786         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
2787         struct page_counter *counter;
2788
2789         switch (MEMFILE_TYPE(cft->private)) {
2790         case _MEM:
2791                 counter = &memcg->memory;
2792                 break;
2793         case _MEMSWAP:
2794                 counter = &memcg->memsw;
2795                 break;
2796         case _KMEM:
2797                 counter = &memcg->kmem;
2798                 break;
2799         case _TCP:
2800                 counter = &memcg->tcpmem;
2801                 break;
2802         default:
2803                 BUG();
2804         }
2805
2806         switch (MEMFILE_ATTR(cft->private)) {
2807         case RES_USAGE:
2808                 if (counter == &memcg->memory)
2809                         return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
2810                 if (counter == &memcg->memsw)
2811                         return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
2812                 return (u64)page_counter_read(counter) * PAGE_SIZE;
2813         case RES_LIMIT:
2814                 return (u64)counter->limit * PAGE_SIZE;
2815         case RES_MAX_USAGE:
2816                 return (u64)counter->watermark * PAGE_SIZE;
2817         case RES_FAILCNT:
2818                 return counter->failcnt;
2819         case RES_SOFT_LIMIT:
2820                 return (u64)memcg->soft_limit * PAGE_SIZE;
2821         default:
2822                 BUG();
2823         }
2824 }
2825
2826 #ifndef CONFIG_SLOB
2827 static int memcg_online_kmem(struct mem_cgroup *memcg)
2828 {
2829         int memcg_id;
2830
2831         if (cgroup_memory_nokmem)
2832                 return 0;
2833
2834         BUG_ON(memcg->kmemcg_id >= 0);
2835         BUG_ON(memcg->kmem_state);
2836
2837         memcg_id = memcg_alloc_cache_id();
2838         if (memcg_id < 0)
2839                 return memcg_id;
2840
2841         static_branch_inc(&memcg_kmem_enabled_key);
2842         /*
2843          * A memory cgroup is considered kmem-online as soon as it gets
2844          * kmemcg_id. Setting the id after enabling static branching will
2845          * guarantee no one starts accounting before all call sites are
2846          * patched.
2847          */
2848         memcg->kmemcg_id = memcg_id;
2849         memcg->kmem_state = KMEM_ONLINE;
2850         INIT_LIST_HEAD(&memcg->kmem_caches);
2851
2852         return 0;
2853 }
2854
2855 static void memcg_offline_kmem(struct mem_cgroup *memcg)
2856 {
2857         struct cgroup_subsys_state *css;
2858         struct mem_cgroup *parent, *child;
2859         int kmemcg_id;
2860
2861         if (memcg->kmem_state != KMEM_ONLINE)
2862                 return;
2863         /*
2864          * Clear the online state before clearing memcg_caches array
2865          * entries. The slab_mutex in memcg_deactivate_kmem_caches()
2866          * guarantees that no cache will be created for this cgroup
2867          * after we are done (see memcg_create_kmem_cache()).
2868          */
2869         memcg->kmem_state = KMEM_ALLOCATED;
2870
2871         memcg_deactivate_kmem_caches(memcg);
2872
2873         kmemcg_id = memcg->kmemcg_id;
2874         BUG_ON(kmemcg_id < 0);
2875
2876         parent = parent_mem_cgroup(memcg);
2877         if (!parent)
2878                 parent = root_mem_cgroup;
2879
2880         /*
2881          * Change kmemcg_id of this cgroup and all its descendants to the
2882          * parent's id, and then move all entries from this cgroup's list_lrus
2883          * to ones of the parent. After we have finished, all list_lrus
2884          * corresponding to this cgroup are guaranteed to remain empty. The
2885          * ordering is imposed by list_lru_node->lock taken by
2886          * memcg_drain_all_list_lrus().
2887          */
2888         rcu_read_lock(); /* can be called from css_free w/o cgroup_mutex */
2889         css_for_each_descendant_pre(css, &memcg->css) {
2890                 child = mem_cgroup_from_css(css);
2891                 BUG_ON(child->kmemcg_id != kmemcg_id);
2892                 child->kmemcg_id = parent->kmemcg_id;
2893                 if (!memcg->use_hierarchy)
2894                         break;
2895         }
2896         rcu_read_unlock();
2897
2898         memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id);
2899
2900         memcg_free_cache_id(kmemcg_id);
2901 }
2902
2903 static void memcg_free_kmem(struct mem_cgroup *memcg)
2904 {
2905         /* css_alloc() failed, offlining didn't happen */
2906         if (unlikely(memcg->kmem_state == KMEM_ONLINE))
2907                 memcg_offline_kmem(memcg);
2908
2909         if (memcg->kmem_state == KMEM_ALLOCATED) {
2910                 memcg_destroy_kmem_caches(memcg);
2911                 static_branch_dec(&memcg_kmem_enabled_key);
2912                 WARN_ON(page_counter_read(&memcg->kmem));
2913         }
2914 }
2915 #else
/* CONFIG_SLOB build: bringing kmem online is a no-op that always succeeds. */
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
	return 0;
}

/* CONFIG_SLOB build: nothing to take offline. */
static void memcg_offline_kmem(struct mem_cgroup *memcg)
{
}

/* CONFIG_SLOB build: nothing to free. */
static void memcg_free_kmem(struct mem_cgroup *memcg)
{
}
2926 #endif /* !CONFIG_SLOB */
2927
2928 static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
2929                                    unsigned long limit)
2930 {
2931         int ret;
2932
2933         mutex_lock(&memcg_limit_mutex);
2934         ret = page_counter_limit(&memcg->kmem, limit);
2935         mutex_unlock(&memcg_limit_mutex);
2936         return ret;
2937 }
2938
2939 static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit)
2940 {
2941         int ret;
2942
2943         mutex_lock(&memcg_limit_mutex);
2944
2945         ret = page_counter_limit(&memcg->tcpmem, limit);
2946         if (ret)
2947                 goto out;
2948
2949         if (!memcg->tcpmem_active) {
2950                 /*
2951                  * The active flag needs to be written after the static_key
2952                  * update. This is what guarantees that the socket activation
2953                  * function is the last one to run. See mem_cgroup_sk_alloc()
2954                  * for details, and note that we don't mark any socket as
2955                  * belonging to this memcg until that flag is up.
2956                  *
2957                  * We need to do this, because static_keys will span multiple
2958                  * sites, but we can't control their order. If we mark a socket
2959                  * as accounted, but the accounting functions are not patched in
2960                  * yet, we'll lose accounting.
2961                  *
2962                  * We never race with the readers in mem_cgroup_sk_alloc(),
2963                  * because when this value change, the code to process it is not
2964                  * patched in yet.
2965                  */
2966                 static_branch_inc(&memcg_sockets_enabled_key);
2967                 memcg->tcpmem_active = true;
2968         }
2969 out:
2970         mutex_unlock(&memcg_limit_mutex);
2971         return ret;
2972 }
2973
2974 /*
2975  * The user of this function is...
2976  * RES_LIMIT.
2977  */
2978 static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
2979                                 char *buf, size_t nbytes, loff_t off)
2980 {
2981         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
2982         unsigned long nr_pages;
2983         int ret;
2984
2985         buf = strstrip(buf);
2986         ret = page_counter_memparse(buf, "-1", &nr_pages);
2987         if (ret)
2988                 return ret;
2989
2990         switch (MEMFILE_ATTR(of_cft(of)->private)) {
2991         case RES_LIMIT:
2992                 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
2993                         ret = -EINVAL;
2994                         break;
2995                 }
2996                 switch (MEMFILE_TYPE(of_cft(of)->private)) {
2997                 case _MEM:
2998                         ret = mem_cgroup_resize_limit(memcg, nr_pages);
2999                         break;
3000                 case _MEMSWAP:
3001                         ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages);
3002                         break;
3003                 case _KMEM:
3004                         ret = memcg_update_kmem_limit(memcg, nr_pages);
3005                         break;
3006                 case _TCP:
3007                         ret = memcg_update_tcp_limit(memcg, nr_pages);
3008                         break;
3009                 }
3010                 break;
3011         case RES_SOFT_LIMIT:
3012                 memcg->soft_limit = nr_pages;
3013                 ret = 0;
3014                 break;
3015         }
3016         return ret ?: nbytes;
3017 }
3018
3019 static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
3020                                 size_t nbytes, loff_t off)
3021 {
3022         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3023         struct page_counter *counter;
3024
3025         switch (MEMFILE_TYPE(of_cft(of)->private)) {
3026         case _MEM:
3027                 counter = &memcg->memory;
3028                 break;
3029         case _MEMSWAP:
3030                 counter = &memcg->memsw;
3031                 break;
3032         case _KMEM:
3033                 counter = &memcg->kmem;
3034                 break;
3035         case _TCP:
3036                 counter = &memcg->tcpmem;
3037                 break;
3038         default:
3039                 BUG();
3040         }
3041
3042         switch (MEMFILE_ATTR(of_cft(of)->private)) {
3043         case RES_MAX_USAGE:
3044                 page_counter_reset_watermark(counter);
3045                 break;
3046         case RES_FAILCNT:
3047                 counter->failcnt = 0;
3048                 break;
3049         default:
3050                 BUG();
3051         }
3052
3053         return nbytes;
3054 }
3055
3056 static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
3057                                         struct cftype *cft)
3058 {
3059         return mem_cgroup_from_css(css)->move_charge_at_immigrate;
3060 }
3061
3062 #ifdef CONFIG_MMU
3063 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3064                                         struct cftype *cft, u64 val)
3065 {
3066         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3067
3068         if (val & ~MOVE_MASK)
3069                 return -EINVAL;
3070
3071         /*
3072          * No kind of locking is needed in here, because ->can_attach() will
3073          * check this value once in the beginning of the process, and then carry
3074          * on with stale data. This means that changes to this value will only
3075          * affect task migrations starting after the change.
3076          */
3077         memcg->move_charge_at_immigrate = val;
3078         return 0;
3079 }
3080 #else
3081 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3082                                         struct cftype *cft, u64 val)
3083 {
3084         return -ENOSYS;
3085 }
3086 #endif
3087
3088 #ifdef CONFIG_NUMA
3089 static int memcg_numa_stat_show(struct seq_file *m, void *v)
3090 {
3091         struct numa_stat {
3092                 const char *name;
3093                 unsigned int lru_mask;
3094         };
3095
3096         static const struct numa_stat stats[] = {
3097                 { "total", LRU_ALL },
3098                 { "file", LRU_ALL_FILE },
3099                 { "anon", LRU_ALL_ANON },
3100                 { "unevictable", BIT(LRU_UNEVICTABLE) },
3101         };
3102         const struct numa_stat *stat;
3103         int nid;
3104         unsigned long nr;
3105         struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
3106
3107         for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3108                 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
3109                 seq_printf(m, "%s=%lu", stat->name, nr);
3110                 for_each_node_state(nid, N_MEMORY) {
3111                         nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
3112                                                           stat->lru_mask);
3113                         seq_printf(m, " N%d=%lu", nid, nr);
3114                 }
3115                 seq_putc(m, '\n');
3116         }
3117
3118         for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3119                 struct mem_cgroup *iter;
3120
3121                 nr = 0;
3122                 for_each_mem_cgroup_tree(iter, memcg)
3123                         nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
3124                 seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
3125                 for_each_node_state(nid, N_MEMORY) {
3126                         nr = 0;
3127                         for_each_mem_cgroup_tree(iter, memcg)
3128                                 nr += mem_cgroup_node_nr_lru_pages(
3129                                         iter, nid, stat->lru_mask);
3130                         seq_printf(m, " N%d=%lu", nid, nr);
3131                 }
3132                 seq_putc(m, '\n');
3133         }
3134
3135         return 0;
3136 }
3137 #endif /* CONFIG_NUMA */
3138
3139 static int memcg_stat_show(struct seq_file *m, void *v)
3140 {
3141         struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
3142         unsigned long memory, memsw;
3143         struct mem_cgroup *mi;
3144         unsigned int i;
3145
3146         BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_stat_names) !=
3147                      MEM_CGROUP_STAT_NSTATS);
3148         BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_events_names) !=
3149                      MEM_CGROUP_EVENTS_NSTATS);
3150         BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
3151
3152         for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
3153                 if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account())
3154                         continue;
3155                 seq_printf(m, "%s %lu\n", mem_cgroup_stat_names[i],
3156                            mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
3157         }
3158
3159         for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
3160                 seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],
3161                            mem_cgroup_read_events(memcg, i));
3162
3163         for (i = 0; i < NR_LRU_LISTS; i++)
3164                 seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
3165                            mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
3166
3167         /* Hierarchical information */
3168         memory = memsw = PAGE_COUNTER_MAX;
3169         for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
3170                 memory = min(memory, mi->memory.limit);
3171                 memsw = min(memsw, mi->memsw.limit);
3172         }
3173         seq_printf(m, "hierarchical_memory_limit %llu\n",
3174                    (u64)memory * PAGE_SIZE);
3175         if (do_memsw_account())
3176                 seq_printf(m, "hierarchical_memsw_limit %llu\n",
3177                            (u64)memsw * PAGE_SIZE);
3178
3179         for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
3180                 unsigned long long val = 0;
3181
3182                 if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account())
3183                         continue;
3184                 for_each_mem_cgroup_tree(mi, memcg)
3185                         val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
3186                 seq_printf(m, "total_%s %llu\n", mem_cgroup_stat_names[i], val);
3187         }
3188
3189         for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
3190                 unsigned long long val = 0;
3191
3192                 for_each_mem_cgroup_tree(mi, memcg)
3193                         val += mem_cgroup_read_events(mi, i);
3194                 seq_printf(m, "total_%s %llu\n",
3195                            mem_cgroup_events_names[i], val);
3196         }
3197
3198         for (i = 0; i < NR_LRU_LISTS; i++) {
3199                 unsigned long long val = 0;
3200
3201                 for_each_mem_cgroup_tree(mi, memcg)
3202                         val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
3203                 seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);
3204         }
3205
3206 #ifdef CONFIG_DEBUG_VM
3207         {
3208                 pg_data_t *pgdat;
3209                 struct mem_cgroup_per_node *mz;
3210                 struct zone_reclaim_stat *rstat;
3211                 unsigned long recent_rotated[2] = {0, 0};
3212                 unsigned long recent_scanned[2] = {0, 0};
3213
3214                 for_each_online_pgdat(pgdat) {
3215                         mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
3216                         rstat = &mz->lruvec.reclaim_stat;
3217
3218                         recent_rotated[0] += rstat->recent_rotated[0];
3219                         recent_rotated[1] += rstat->recent_rotated[1];
3220                         recent_scanned[0] += rstat->recent_scanned[0];
3221                         recent_scanned[1] += rstat->recent_scanned[1];
3222                 }
3223                 seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
3224                 seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
3225                 seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
3226                 seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
3227         }
3228 #endif
3229
3230         return 0;
3231 }
3232
3233 static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
3234                                       struct cftype *cft)
3235 {
3236         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3237
3238         return mem_cgroup_swappiness(memcg);
3239 }
3240
3241 static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
3242                                        struct cftype *cft, u64 val)
3243 {
3244         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3245
3246         if (val > 100)
3247                 return -EINVAL;
3248
3249         if (css->parent)
3250                 memcg->swappiness = val;
3251         else
3252                 vm_swappiness = val;
3253
3254         return 0;
3255 }
3256
3257 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
3258 {
3259         struct mem_cgroup_threshold_ary *t;
3260         unsigned long usage;
3261         int i;
3262
3263         rcu_read_lock();
3264         if (!swap)
3265                 t = rcu_dereference(memcg->thresholds.primary);
3266         else
3267                 t = rcu_dereference(memcg->memsw_thresholds.primary);
3268
3269         if (!t)
3270                 goto unlock;
3271
3272         usage = mem_cgroup_usage(memcg, swap);
3273
3274         /*
3275          * current_threshold points to threshold just below or equal to usage.
3276          * If it's not true, a threshold was crossed after last
3277          * call of __mem_cgroup_threshold().
3278          */
3279         i = t->current_threshold;
3280
3281         /*
3282          * Iterate backward over array of thresholds starting from
3283          * current_threshold and check if a threshold is crossed.
3284          * If none of thresholds below usage is crossed, we read
3285          * only one element of the array here.
3286          */
3287         for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
3288                 eventfd_signal(t->entries[i].eventfd, 1);
3289
3290         /* i = current_threshold + 1 */
3291         i++;
3292
3293         /*
3294          * Iterate forward over array of thresholds starting from
3295          * current_threshold+1 and check if a threshold is crossed.
3296          * If none of thresholds above usage is crossed, we read
3297          * only one element of the array here.
3298          */
3299         for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
3300                 eventfd_signal(t->entries[i].eventfd, 1);
3301
3302         /* Update current_threshold */
3303         t->current_threshold = i - 1;
3304 unlock:
3305         rcu_read_unlock();
3306 }
3307
3308 static void mem_cgroup_threshold(struct mem_cgroup *memcg)
3309 {
3310         while (memcg) {
3311                 __mem_cgroup_threshold(memcg, false);
3312                 if (do_memsw_account())
3313                         __mem_cgroup_threshold(memcg, true);
3314
3315                 memcg = parent_mem_cgroup(memcg);
3316         }
3317 }
3318
3319 static int compare_thresholds(const void *a, const void *b)
3320 {
3321         const struct mem_cgroup_threshold *_a = a;
3322         const struct mem_cgroup_threshold *_b = b;
3323
3324         if (_a->threshold > _b->threshold)
3325                 return 1;
3326
3327         if (_a->threshold < _b->threshold)
3328                 return -1;
3329
3330         return 0;
3331 }
3332
3333 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
3334 {
3335         struct mem_cgroup_eventfd_list *ev;
3336
3337         spin_lock(&memcg_oom_lock);
3338
3339         list_for_each_entry(ev, &memcg->oom_notify, list)
3340                 eventfd_signal(ev->eventfd, 1);
3341
3342         spin_unlock(&memcg_oom_lock);
3343         return 0;
3344 }
3345
3346 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
3347 {
3348         struct mem_cgroup *iter;
3349
3350         for_each_mem_cgroup_tree(iter, memcg)
3351                 mem_cgroup_oom_notify_cb(iter);
3352 }
3353
3354 static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
3355         struct eventfd_ctx *eventfd, const char *args, enum res_type type)
3356 {
3357         struct mem_cgroup_thresholds *thresholds;
3358         struct mem_cgroup_threshold_ary *new;
3359         unsigned long threshold;
3360         unsigned long usage;
3361         int i, size, ret;
3362
3363         ret = page_counter_memparse(args, "-1", &threshold);
3364         if (ret)
3365                 return ret;
3366
3367         mutex_lock(&memcg->thresholds_lock);
3368
3369         if (type == _MEM) {
3370                 thresholds = &memcg->thresholds;
3371                 usage = mem_cgroup_usage(memcg, false);
3372         } else if (type == _MEMSWAP) {
3373                 thresholds = &memcg->memsw_thresholds;
3374                 usage = mem_cgroup_usage(memcg, true);
3375         } else
3376                 BUG();
3377
3378         /* Check if a threshold crossed before adding a new one */
3379         if (thresholds->primary)
3380                 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
3381
3382         size = thresholds->primary ? thresholds->primary->size + 1 : 1;
3383
3384         /* Allocate memory for new array of thresholds */
3385         new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
3386                         GFP_KERNEL);
3387         if (!new) {
3388                 ret = -ENOMEM;
3389                 goto unlock;
3390         }
3391         new->size = size;
3392
3393         /* Copy thresholds (if any) to new array */
3394         if (thresholds->primary) {
3395                 memcpy(new->entries, thresholds->primary->entries, (size - 1) *
3396                                 sizeof(struct mem_cgroup_threshold));
3397         }
3398
3399         /* Add new threshold */
3400         new->entries[size - 1].eventfd = eventfd;
3401         new->entries[size - 1].threshold = threshold;
3402
3403         /* Sort thresholds. Registering of new threshold isn't time-critical */
3404         sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
3405                         compare_thresholds, NULL);
3406
3407         /* Find current threshold */
3408         new->current_threshold = -1;
3409         for (i = 0; i < size; i++) {
3410                 if (new->entries[i].threshold <= usage) {
3411                         /*
3412                          * new->current_threshold will not be used until
3413                          * rcu_assign_pointer(), so it's safe to increment
3414                          * it here.
3415                          */
3416                         ++new->current_threshold;
3417                 } else
3418                         break;
3419         }
3420
3421         /* Free old spare buffer and save old primary buffer as spare */
3422         kfree(thresholds->spare);
3423         thresholds->spare = thresholds->primary;
3424
3425         rcu_assign_pointer(thresholds->primary, new);
3426
3427         /* To be sure that nobody uses thresholds */
3428         synchronize_rcu();
3429
3430 unlock:
3431         mutex_unlock(&memcg->thresholds_lock);
3432
3433         return ret;
3434 }
3435
3436 static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
3437         struct eventfd_ctx *eventfd, const char *args)
3438 {
3439         return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
3440 }
3441
3442 static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
3443         struct eventfd_ctx *eventfd, const char *args)
3444 {
3445         return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
3446 }
3447
3448 static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
3449         struct eventfd_ctx *eventfd, enum res_type type)
3450 {
3451         struct mem_cgroup_thresholds *thresholds;
3452         struct mem_cgroup_threshold_ary *new;
3453         unsigned long usage;
3454         int i, j, size;
3455
3456         mutex_lock(&memcg->thresholds_lock);
3457
3458         if (type == _MEM) {
3459                 thresholds = &memcg->thresholds;
3460                 usage = mem_cgroup_usage(memcg, false);
3461         } else if (type == _MEMSWAP) {
3462                 thresholds = &memcg->memsw_thresholds;
3463                 usage = mem_cgroup_usage(memcg, true);
3464         } else
3465                 BUG();
3466
3467         if (!thresholds->primary)
3468                 goto unlock;
3469
3470         /* Check if a threshold crossed before removing */
3471         __mem_cgroup_threshold(memcg, type == _MEMSWAP);
3472
3473         /* Calculate new number of threshold */
3474         size = 0;
3475         for (i = 0; i < thresholds->primary->size; i++) {
3476                 if (thresholds->primary->entries[i].eventfd != eventfd)
3477                         size++;
3478         }
3479
3480         new = thresholds->spare;
3481
3482         /* Set thresholds array to NULL if we don't have thresholds */
3483         if (!size) {
3484                 kfree(new);
3485                 new = NULL;
3486                 goto swap_buffers;
3487         }
3488
3489         new->size = size;
3490
3491         /* Copy thresholds and find current threshold */
3492         new->current_threshold = -1;
3493         for (i = 0, j = 0; i < thresholds->primary->size; i++) {
3494                 if (thresholds->primary->entries[i].eventfd == eventfd)
3495                         continue;
3496
3497                 new->entries[j] = thresholds->primary->entries[i];
3498                 if (new->entries[j].threshold <= usage) {
3499                         /*
3500                          * new->current_threshold will not be used
3501                          * until rcu_assign_pointer(), so it's safe to increment
3502                          * it here.
3503                          */
3504                         ++new->current_threshold;
3505                 }
3506                 j++;
3507         }
3508
3509 swap_buffers:
3510         /* Swap primary and spare array */
3511         thresholds->spare = thresholds->primary;
3512
3513         rcu_assign_pointer(thresholds->primary, new);
3514
3515         /* To be sure that nobody uses thresholds */
3516         synchronize_rcu();
3517
3518         /* If all events are unregistered, free the spare array */
3519         if (!new) {
3520                 kfree(thresholds->spare);
3521                 thresholds->spare = NULL;
3522         }
3523 unlock:
3524         mutex_unlock(&memcg->thresholds_lock);
3525 }
3526
3527 static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
3528         struct eventfd_ctx *eventfd)
3529 {
3530         return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
3531 }
3532
3533 static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
3534         struct eventfd_ctx *eventfd)
3535 {
3536         return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
3537 }
3538
3539 static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
3540         struct eventfd_ctx *eventfd, const char *args)
3541 {
3542         struct mem_cgroup_eventfd_list *event;
3543
3544         event = kmalloc(sizeof(*event), GFP_KERNEL);
3545         if (!event)
3546                 return -ENOMEM;
3547
3548         spin_lock(&memcg_oom_lock);
3549
3550         event->eventfd = eventfd;
3551         list_add(&event->list, &memcg->oom_notify);
3552
3553         /* already in OOM ? */
3554         if (memcg->under_oom)
3555                 eventfd_signal(eventfd, 1);
3556         spin_unlock(&memcg_oom_lock);
3557
3558         return 0;
3559 }
3560
3561 static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
3562         struct eventfd_ctx *eventfd)
3563 {
3564         struct mem_cgroup_eventfd_list *ev, *tmp;
3565
3566         spin_lock(&memcg_oom_lock);
3567
3568         list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
3569                 if (ev->eventfd == eventfd) {
3570                         list_del(&ev->list);
3571                         kfree(ev);
3572                 }
3573         }
3574
3575         spin_unlock(&memcg_oom_lock);
3576 }
3577
3578 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
3579 {
3580         struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
3581
3582         seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
3583         seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
3584         return 0;
3585 }
3586
3587 static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
3588         struct cftype *cft, u64 val)
3589 {
3590         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3591
3592         /* cannot set to root cgroup and only 0 and 1 are allowed */
3593         if (!css->parent || !((val == 0) || (val == 1)))
3594                 return -EINVAL;
3595
3596         memcg->oom_kill_disable = val;
3597         if (!val)
3598                 memcg_oom_recover(memcg);
3599
3600         return 0;
3601 }
3602
3603 #ifdef CONFIG_CGROUP_WRITEBACK
3604
3605 struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg)
3606 {
3607         return &memcg->cgwb_list;
3608 }
3609
3610 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
3611 {
3612         return wb_domain_init(&memcg->cgwb_domain, gfp);
3613 }
3614
3615 static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
3616 {
3617         wb_domain_exit(&memcg->cgwb_domain);
3618 }
3619
3620 static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
3621 {
3622         wb_domain_size_changed(&memcg->cgwb_domain);
3623 }
3624
3625 struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
3626 {
3627         struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
3628
3629         if (!memcg->css.parent)
3630                 return NULL;
3631
3632         return &memcg->cgwb_domain;
3633 }
3634
3635 /**
3636  * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
3637  * @wb: bdi_writeback in question
3638  * @pfilepages: out parameter for number of file pages
3639  * @pheadroom: out parameter for number of allocatable pages according to memcg
3640  * @pdirty: out parameter for number of dirty pages
3641  * @pwriteback: out parameter for number of pages under writeback
3642  *
3643  * Determine the numbers of file, headroom, dirty, and writeback pages in
3644  * @wb's memcg.  File, dirty and writeback are self-explanatory.  Headroom
3645  * is a bit more involved.
3646  *
3647  * A memcg's headroom is "min(max, high) - used".  In the hierarchy, the
3648  * headroom is calculated as the lowest headroom of itself and the
3649  * ancestors.  Note that this doesn't consider the actual amount of
3650  * available memory in the system.  The caller should further cap
3651  * *@pheadroom accordingly.
3652  */
3653 void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
3654                          unsigned long *pheadroom, unsigned long *pdirty,
3655                          unsigned long *pwriteback)
3656 {
3657         struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
3658         struct mem_cgroup *parent;
3659
3660         *pdirty = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_DIRTY);
3661
3662         /* this should eventually include NR_UNSTABLE_NFS */
3663         *pwriteback = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
3664         *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
3665                                                      (1 << LRU_ACTIVE_FILE));
3666         *pheadroom = PAGE_COUNTER_MAX;
3667
3668         while ((parent = parent_mem_cgroup(memcg))) {
3669                 unsigned long ceiling = min(memcg->memory.limit, memcg->high);
3670                 unsigned long used = page_counter_read(&memcg->memory);
3671
3672                 *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
3673                 memcg = parent;
3674         }
3675 }
3676
3677 #else   /* CONFIG_CGROUP_WRITEBACK */
3678
3679 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
3680 {
3681         return 0;
3682 }
3683
3684 static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
3685 {
3686 }
3687
3688 static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
3689 {
3690 }
3691
3692 #endif  /* CONFIG_CGROUP_WRITEBACK */
3693
3694 /*
3695  * DO NOT USE IN NEW FILES.
3696  *
3697  * "cgroup.event_control" implementation.
3698  *
3699  * This is way over-engineered.  It tries to support fully configurable
3700  * events for each user.  Such level of flexibility is completely
3701  * unnecessary especially in the light of the planned unified hierarchy.
3702  *
3703  * Please deprecate this and replace with something simpler if at all
3704  * possible.
3705  */
3706
3707 /*
3708  * Unregister event and free resources.
3709  *
3710  * Gets called from workqueue.
3711  */
3712 static void memcg_event_remove(struct work_struct *work)
3713 {
3714         struct mem_cgroup_event *event =
3715                 container_of(work, struct mem_cgroup_event, remove);
3716         struct mem_cgroup *memcg = event->memcg;
3717
3718         remove_wait_queue(event->wqh, &event->wait);
3719
3720         event->unregister_event(memcg, event->eventfd);
3721
3722         /* Notify userspace the event is going away. */
3723         eventfd_signal(event->eventfd, 1);
3724
3725         eventfd_ctx_put(event->eventfd);
3726         kfree(event);
3727         css_put(&memcg->css);
3728 }
3729
3730 /*
3731  * Gets called on POLLHUP on eventfd when user closes it.
3732  *
3733  * Called with wqh->lock held and interrupts disabled.
3734  */
3735 static int memcg_event_wake(wait_queue_t *wait, unsigned mode,
3736                             int sync, void *key)
3737 {
3738         struct mem_cgroup_event *event =
3739                 container_of(wait, struct mem_cgroup_event, wait);
3740         struct mem_cgroup *memcg = event->memcg;
3741         unsigned long flags = (unsigned long)key;
3742
3743         if (flags & POLLHUP) {
3744                 /*
3745                  * If the event has been detached at cgroup removal, we
3746                  * can simply return knowing the other side will cleanup
3747                  * for us.
3748                  *
3749                  * We can't race against event freeing since the other
3750                  * side will require wqh->lock via remove_wait_queue(),
3751                  * which we hold.
3752                  */
3753                 spin_lock(&memcg->event_list_lock);
3754                 if (!list_empty(&event->list)) {
3755                         list_del_init(&event->list);
3756                         /*
3757                          * We are in atomic context, but cgroup_event_remove()
3758                          * may sleep, so we have to call it in workqueue.
3759                          */
3760                         schedule_work(&event->remove);
3761                 }
3762                 spin_unlock(&memcg->event_list_lock);
3763         }
3764
3765         return 0;
3766 }
3767
3768 static void memcg_event_ptable_queue_proc(struct file *file,
3769                 wait_queue_head_t *wqh, poll_table *pt)
3770 {
3771         struct mem_cgroup_event *event =
3772                 container_of(pt, struct mem_cgroup_event, pt);
3773
3774         event->wqh = wqh;
3775         add_wait_queue(wqh, &event->wait);
3776 }
3777
3778 /*
3779  * DO NOT USE IN NEW FILES.
3780  *
3781  * Parse input and register new cgroup event handler.
3782  *
3783  * Input must be in format '<event_fd> <control_fd> <args>'.
3784  * Interpretation of args is defined by control file implementation.
3785  */
3786 static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
3787                                          char *buf, size_t nbytes, loff_t off)
3788 {
3789         struct cgroup_subsys_state *css = of_css(of);
3790         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3791         struct mem_cgroup_event *event;
3792         struct cgroup_subsys_state *cfile_css;
3793         unsigned int efd, cfd;
3794         struct fd efile;
3795         struct fd cfile;
3796         const char *name;
3797         char *endp;
3798         int ret;
3799
3800         buf = strstrip(buf);
3801
3802         efd = simple_strtoul(buf, &endp, 10);
3803         if (*endp != ' ')
3804                 return -EINVAL;
3805         buf = endp + 1;
3806
3807         cfd = simple_strtoul(buf, &endp, 10);
3808         if ((*endp != ' ') && (*endp != '\0'))
3809                 return -EINVAL;
3810         buf = endp + 1;
3811
3812         event = kzalloc(sizeof(*event), GFP_KERNEL);
3813         if (!event)
3814                 return -ENOMEM;
3815
3816         event->memcg = memcg;
3817         INIT_LIST_HEAD(&event->list);
3818         init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
3819         init_waitqueue_func_entry(&event->wait, memcg_event_wake);
3820         INIT_WORK(&event->remove, memcg_event_remove);
3821
3822         efile = fdget(efd);
3823         if (!efile.file) {
3824                 ret = -EBADF;
3825                 goto out_kfree;
3826         }
3827
3828         event->eventfd = eventfd_ctx_fileget(efile.file);
3829         if (IS_ERR(event->eventfd)) {
3830                 ret = PTR_ERR(event->eventfd);
3831                 goto out_put_efile;
3832         }
3833
3834         cfile = fdget(cfd);
3835         if (!cfile.file) {
3836                 ret = -EBADF;
3837                 goto out_put_eventfd;
3838         }
3839
3840         /* the process need read permission on control file */
3841         /* AV: shouldn't we check that it's been opened for read instead? */
3842         ret = inode_permission(file_inode(cfile.file), MAY_READ);
3843         if (ret < 0)
3844                 goto out_put_cfile;
3845
3846         /*
3847          * Determine the event callbacks and set them in @event.  This used
3848          * to be done via struct cftype but cgroup core no longer knows
3849          * about these events.  The following is crude but the whole thing
3850          * is for compatibility anyway.
3851          *
3852          * DO NOT ADD NEW FILES.
3853          */
3854         name = cfile.file->f_path.dentry->d_name.name;
3855
3856         if (!strcmp(name, "memory.usage_in_bytes")) {
3857                 event->register_event = mem_cgroup_usage_register_event;
3858                 event->unregister_event = mem_cgroup_usage_unregister_event;
3859         } else if (!strcmp(name, "memory.oom_control")) {
3860                 event->register_event = mem_cgroup_oom_register_event;
3861                 event->unregister_event = mem_cgroup_oom_unregister_event;
3862         } else if (!strcmp(name, "memory.pressure_level")) {
3863                 event->register_event = vmpressure_register_event;
3864                 event->unregister_event = vmpressure_unregister_event;
3865         } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
3866                 event->register_event = memsw_cgroup_usage_register_event;
3867                 event->unregister_event = memsw_cgroup_usage_unregister_event;
3868         } else {
3869                 ret = -EINVAL;
3870                 goto out_put_cfile;
3871         }
3872
3873         /*
3874          * Verify @cfile should belong to @css.  Also, remaining events are
3875          * automatically removed on cgroup destruction but the removal is
3876          * asynchronous, so take an extra ref on @css.
3877          */
3878         cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent,
3879                                                &memory_cgrp_subsys);
3880         ret = -EINVAL;
3881         if (IS_ERR(cfile_css))
3882                 goto out_put_cfile;
3883         if (cfile_css != css) {
3884                 css_put(cfile_css);
3885                 goto out_put_cfile;
3886         }
3887
3888         ret = event->register_event(memcg, event->eventfd, buf);
3889         if (ret)
3890                 goto out_put_css;
3891
3892         efile.file->f_op->poll(efile.file, &event->pt);
3893
3894         spin_lock(&memcg->event_list_lock);
3895         list_add(&event->list, &memcg->event_list);
3896         spin_unlock(&memcg->event_list_lock);
3897
3898         fdput(cfile);
3899         fdput(efile);
3900
3901         return nbytes;
3902
3903 out_put_css:
3904         css_put(css);
3905 out_put_cfile:
3906         fdput(cfile);
3907 out_put_eventfd:
3908         eventfd_ctx_put(event->eventfd);
3909 out_put_efile:
3910         fdput(efile);
3911 out_kfree:
3912         kfree(event);
3913
3914         return ret;
3915 }
3916
3917 static struct cftype mem_cgroup_legacy_files[] = {
3918         {
3919                 .name = "usage_in_bytes",
3920                 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
3921                 .read_u64 = mem_cgroup_read_u64,
3922         },
3923         {
3924                 .name = "max_usage_in_bytes",
3925                 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
3926                 .write = mem_cgroup_reset,
3927                 .read_u64 = mem_cgroup_read_u64,
3928         },
3929         {
3930                 .name = "limit_in_bytes",
3931                 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
3932                 .write = mem_cgroup_write,
3933                 .read_u64 = mem_cgroup_read_u64,
3934         },
3935         {
3936                 .name = "soft_limit_in_bytes",
3937                 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
3938                 .write = mem_cgroup_write,
3939                 .read_u64 = mem_cgroup_read_u64,
3940         },
3941         {
3942                 .name = "failcnt",
3943                 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
3944                 .write = mem_cgroup_reset,
3945                 .read_u64 = mem_cgroup_read_u64,
3946         },
3947         {
3948                 .name = "stat",
3949                 .seq_show = memcg_stat_show,
3950         },
3951         {
3952                 .name = "force_empty",
3953                 .write = mem_cgroup_force_empty_write,
3954         },
3955         {
3956                 .name = "use_hierarchy",
3957                 .write_u64 = mem_cgroup_hierarchy_write,
3958                 .read_u64 = mem_cgroup_hierarchy_read,
3959         },
3960         {
3961                 .name = "cgroup.event_control",         /* XXX: for compat */
3962                 .write = memcg_write_event_control,
3963                 .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
3964         },
3965         {
3966                 .name = "swappiness",
3967                 .read_u64 = mem_cgroup_swappiness_read,
3968                 .write_u64 = mem_cgroup_swappiness_write,
3969         },
3970         {
3971                 .name = "move_charge_at_immigrate",
3972                 .read_u64 = mem_cgroup_move_charge_read,
3973                 .write_u64 = mem_cgroup_move_charge_write,
3974         },
3975         {
3976                 .name = "oom_control",
3977                 .seq_show = mem_cgroup_oom_control_read,
3978                 .write_u64 = mem_cgroup_oom_control_write,
3979                 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
3980         },
3981         {
3982                 .name = "pressure_level",
3983         },
3984 #ifdef CONFIG_NUMA
3985         {
3986                 .name = "numa_stat",
3987                 .seq_show = memcg_numa_stat_show,
3988         },
3989 #endif
3990         {
3991                 .name = "kmem.limit_in_bytes",
3992                 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
3993                 .write = mem_cgroup_write,
3994                 .read_u64 = mem_cgroup_read_u64,
3995         },
3996         {
3997                 .name = "kmem.usage_in_bytes",
3998                 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
3999                 .read_u64 = mem_cgroup_read_u64,
4000         },
4001         {
4002                 .name = "kmem.failcnt",
4003                 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
4004                 .write = mem_cgroup_reset,
4005                 .read_u64 = mem_cgroup_read_u64,
4006         },
4007         {
4008                 .name = "kmem.max_usage_in_bytes",
4009                 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
4010                 .write = mem_cgroup_reset,
4011                 .read_u64 = mem_cgroup_read_u64,
4012         },
4013 #ifdef CONFIG_SLABINFO
4014         {
4015                 .name = "kmem.slabinfo",
4016                 .seq_start = memcg_slab_start,
4017                 .seq_next = memcg_slab_next,
4018                 .seq_stop = memcg_slab_stop,
4019                 .seq_show = memcg_slab_show,
4020         },
4021 #endif
4022         {
4023                 .name = "kmem.tcp.limit_in_bytes",
4024                 .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
4025                 .write = mem_cgroup_write,
4026                 .read_u64 = mem_cgroup_read_u64,
4027         },
4028         {
4029                 .name = "kmem.tcp.usage_in_bytes",
4030                 .private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
4031                 .read_u64 = mem_cgroup_read_u64,
4032         },
4033         {
4034                 .name = "kmem.tcp.failcnt",
4035                 .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
4036                 .write = mem_cgroup_reset,
4037                 .read_u64 = mem_cgroup_read_u64,
4038         },
4039         {
4040                 .name = "kmem.tcp.max_usage_in_bytes",
4041                 .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
4042                 .write = mem_cgroup_reset,
4043                 .read_u64 = mem_cgroup_read_u64,
4044         },
4045         { },    /* terminate */
4046 };
4047
4048 /*
4049  * Private memory cgroup IDR
4050  *
4051  * Swap-out records and page cache shadow entries need to store memcg
4052  * references in constrained space, so we maintain an ID space that is
4053  * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of
4054  * memory-controlled cgroups to 64k.
4055  *
4056  * However, there usually are many references to the offline CSS after
4057  * the cgroup has been destroyed, such as page cache or reclaimable
4058  * slab objects, that don't need to hang on to the ID. We want to keep
4059  * those dead CSS from occupying IDs, or we might quickly exhaust the
4060  * relatively small ID space and prevent the creation of new cgroups
4061  * even when there are much fewer than 64k cgroups - possibly none.
4062  *
4063  * Maintain a private 16-bit ID space for memcg, and allow the ID to
4064  * be freed and recycled when it's no longer needed, which is usually
4065  * when the CSS is offlined.
4066  *
4067  * The only exception to that are records of swapped out tmpfs/shmem
4068  * pages that need to be attributed to live ancestors on swapin. But
4069  * those references are manageable from userspace.
4070  */
4071
/* ID -> memcg table: filled by mem_cgroup_alloc(), read by mem_cgroup_from_id(). */
4072 static DEFINE_IDR(mem_cgroup_idr);
4073
4074 static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n)
4075 {
4076         VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0);
4077         atomic_add(n, &memcg->id.ref);
4078 }
4079
4080 static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
4081 {
4082         VM_BUG_ON(atomic_read(&memcg->id.ref) < n);
4083         if (atomic_sub_and_test(n, &memcg->id.ref)) {
4084                 idr_remove(&mem_cgroup_idr, memcg->id.id);
4085                 memcg->id.id = 0;
4086
4087                 /* Memcg ID pins CSS */
4088                 css_put(&memcg->css);
4089         }
4090 }
4091
/* Take a single reference on @memcg's ID. */
static inline void mem_cgroup_id_get(struct mem_cgroup *memcg)
{
	mem_cgroup_id_get_many(memcg, 1U);
}
4096
/* Drop a single reference on @memcg's ID. */
static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
{
	mem_cgroup_id_put_many(memcg, 1U);
}
4101
4102 /**
4103  * mem_cgroup_from_id - look up a memcg from a memcg id
4104  * @id: the memcg id to look up
4105  *
4106  * Caller must hold rcu_read_lock().
4107  */
4108 struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
4109 {
4110         WARN_ON_ONCE(!rcu_read_lock_held());
4111         return idr_find(&mem_cgroup_idr, id);
4112 }
4113
4114 static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
4115 {
4116         struct mem_cgroup_per_node *pn;
4117         int tmp = node;
4118         /*
4119          * This routine is called against possible nodes.
4120          * But it's BUG to call kmalloc() against offline node.
4121          *
4122          * TODO: this routine can waste much memory for nodes which will
4123          *       never be onlined. It's better to use memory hotplug callback
4124          *       function.
4125          */
4126         if (!node_state(node, N_NORMAL_MEMORY))
4127                 tmp = -1;
4128         pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
4129         if (!pn)
4130                 return 1;
4131
4132         lruvec_init(&pn->lruvec);
4133         pn->usage_in_excess = 0;
4134         pn->on_tree = false;
4135         pn->memcg = memcg;
4136
4137         memcg->nodeinfo[node] = pn;
4138         return 0;
4139 }
4140
4141 static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
4142 {
4143         kfree(memcg->nodeinfo[node]);
4144 }
4145
4146 static void __mem_cgroup_free(struct mem_cgroup *memcg)
4147 {
4148         int node;
4149
4150         for_each_node(node)
4151                 free_mem_cgroup_per_node_info(memcg, node);
4152         free_percpu(memcg->stat);
4153         kfree(memcg);
4154 }
4155
/*
 * Tear down @memcg's writeback domain, then release the memcg's memory
 * through __mem_cgroup_free().
 */
static void mem_cgroup_free(struct mem_cgroup *memcg)
{
	/* The domain lives inside @memcg, so it has to go first */
	memcg_wb_domain_exit(memcg);

	__mem_cgroup_free(memcg);
}
4161
4162 static struct mem_cgroup *mem_cgroup_alloc(void)
4163 {
4164         struct mem_cgroup *memcg;
4165         size_t size;
4166         int node;
4167
4168         size = sizeof(struct mem_cgroup);
4169         size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
4170
4171         memcg = kzalloc(size, GFP_KERNEL);
4172         if (!memcg)
4173                 return NULL;
4174
4175         memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
4176                                  1, MEM_CGROUP_ID_MAX,
4177                                  GFP_KERNEL);
4178         if (memcg->id.id < 0)
4179                 goto fail;
4180
4181         memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
4182         if (!memcg->stat)
4183                 goto fail;
4184
4185         for_each_node(node)
4186                 if (alloc_mem_cgroup_per_node_info(memcg, node))
4187                         goto fail;
4188
4189         if (memcg_wb_domain_init(memcg, GFP_KERNEL))
4190                 goto fail;
4191
4192         INIT_WORK(&memcg->high_work, high_work_func);
4193         memcg->last_scanned_node = MAX_NUMNODES;
4194         INIT_LIST_HEAD(&memcg->oom_notify);
4195         mutex_init(&memcg->thresholds_lock);
4196         spin_lock_init(&memcg->move_lock);
4197         vmpressure_init(&memcg->vmpressure);
4198         INIT_LIST_HEAD(&memcg->event_list);
4199         spin_lock_init(&memcg->event_list_lock);
4200         memcg->socket_pressure = jiffies;
4201 #ifndef CONFIG_SLOB
4202         memcg->kmemcg_id = -1;
4203 #endif
4204 #ifdef CONFIG_CGROUP_WRITEBACK
4205         INIT_LIST_HEAD(&memcg->cgwb_list);
4206 #endif
4207         idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
4208         return memcg;
4209 fail:
4210         if (memcg->id.id > 0)
4211                 idr_remove(&mem_cgroup_idr, memcg->id.id);
4212         __mem_cgroup_free(memcg);
4213         return NULL;
4214 }
4215
4216 static struct cgroup_subsys_state * __ref
4217 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
4218 {
4219         struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
4220         struct mem_cgroup *memcg;
4221         long error = -ENOMEM;
4222
4223         memcg = mem_cgroup_alloc();
4224         if (!memcg)
4225                 return ERR_PTR(error);
4226
4227         memcg->high = PAGE_COUNTER_MAX;
4228         memcg->soft_limit = PAGE_COUNTER_MAX;
4229         if (parent) {
4230                 memcg->swappiness = mem_cgroup_swappiness(parent);
4231                 memcg->oom_kill_disable = parent->oom_kill_disable;
4232         }
4233         if (parent && parent->use_hierarchy) {
4234                 memcg->use_hierarchy = true;
4235                 page_counter_init(&memcg->memory, &parent->memory);
4236                 page_counter_init(&memcg->swap, &parent->swap);
4237                 page_counter_init(&memcg->memsw, &parent->memsw);
4238                 page_counter_init(&memcg->kmem, &parent->kmem);
4239                 page_counter_init(&memcg->tcpmem, &parent->tcpmem);
4240         } else {
4241                 page_counter_init(&memcg->memory, NULL);
4242                 page_counter_init(&memcg->swap, NULL);
4243                 page_counter_init(&memcg->memsw, NULL);
4244                 page_counter_init(&memcg->kmem, NULL);
4245                 page_counter_init(&memcg->tcpmem, NULL);
4246                 /*
4247                  * Deeper hierachy with use_hierarchy == false doesn't make
4248                  * much sense so let cgroup subsystem know about this
4249                  * unfortunate state in our controller.
4250                  */
4251                 if (parent != root_mem_cgroup)
4252                         memory_cgrp_subsys.broken_hierarchy = true;
4253         }
4254
4255         /* The following stuff does not apply to the root */
4256         if (!parent) {
4257                 root_mem_cgroup = memcg;
4258                 return &memcg->css;
4259         }
4260
4261         error = memcg_online_kmem(memcg);
4262         if (error)
4263                 goto fail;
4264
4265         if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
4266                 static_branch_inc(&memcg_sockets_enabled_key);
4267
4268         return &memcg->css;
4269 fail:
4270         mem_cgroup_free(memcg);
4271         return ERR_PTR(-ENOMEM);
4272 }
4273
4274 static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
4275 {
4276         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4277
4278         /* Online state pins memcg ID, memcg ID pins CSS */
4279         atomic_set(&memcg->id.ref, 1);
4280         css_get(css);
4281         return 0;
4282 }
4283
4284 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
4285 {
4286         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4287         struct mem_cgroup_event *event, *tmp;
4288
4289         /*
4290          * Unregister events and notify userspace.
4291          * Notify userspace about cgroup removing only after rmdir of cgroup
4292          * directory to avoid race between userspace and kernelspace.
4293          */
4294         spin_lock(&memcg->event_list_lock);
4295         list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
4296                 list_del_init(&event->list);
4297                 schedule_work(&event->remove);
4298         }
4299         spin_unlock(&memcg->event_list_lock);
4300
4301         memcg_offline_kmem(memcg);
4302         wb_memcg_offline(memcg);
4303
4304         mem_cgroup_id_put(memcg);
4305 }
4306
/* css_released callback: invalidate the reclaim iterators of this memcg. */
static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
{
	invalidate_reclaim_iterators(mem_cgroup_from_css(css));
}
4313
4314 static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
4315 {
4316         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4317
4318         if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
4319                 static_branch_dec(&memcg_sockets_enabled_key);
4320
4321         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
4322                 static_branch_dec(&memcg_sockets_enabled_key);
4323
4324         vmpressure_cleanup(&memcg->vmpressure);
4325         cancel_work_sync(&memcg->high_work);
4326         mem_cgroup_remove_from_trees(memcg);
4327         memcg_free_kmem(memcg);
4328         mem_cgroup_free(memcg);
4329 }
4330
4331 /**
4332  * mem_cgroup_css_reset - reset the states of a mem_cgroup
4333  * @css: the target css
4334  *
4335  * Reset the states of the mem_cgroup associated with @css.  This is
4336  * invoked when the userland requests disabling on the default hierarchy
4337  * but the memcg is pinned through dependency.  The memcg should stop
4338  * applying policies and should revert to the vanilla state as it may be
4339  * made visible again.
4340  *
4341  * The current implementation only resets the essential configurations.
4342  * This needs to be expanded to cover all the visible parts.
4343  */
4344 static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
4345 {
4346         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4347
4348         page_counter_limit(&memcg->memory, PAGE_COUNTER_MAX);
4349         page_counter_limit(&memcg->swap, PAGE_COUNTER_MAX);
4350         page_counter_limit(&memcg->memsw, PAGE_COUNTER_MAX);
4351         page_counter_limit(&memcg->kmem, PAGE_COUNTER_MAX);
4352         page_counter_limit(&memcg->tcpmem, PAGE_COUNTER_MAX);
4353         memcg->low = 0;
4354         memcg->high = PAGE_COUNTER_MAX;
4355         memcg->soft_limit = PAGE_COUNTER_MAX;
4356         memcg_wb_domain_size_changed(memcg);
4357 }
4358
4359 #ifdef CONFIG_MMU
4360 /* Handlers for move charge at task migration. */
4361 static int mem_cgroup_do_precharge(unsigned long count)
4362 {
4363         int ret;
4364
4365         /* Try a single bulk charge without reclaim first, kswapd may wake */
4366         ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
4367         if (!ret) {
4368                 mc.precharge += count;
4369                 return ret;
4370         }
4371
4372         /* Try charges one by one with reclaim, but do not retry */
4373         while (count--) {
4374                 ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1);
4375                 if (ret)
4376                         return ret;
4377                 mc.precharge++;
4378                 cond_resched();
4379         }
4380         return 0;
4381 }
4382
4383 union mc_target {
4384         struct page     *page;
4385         swp_entry_t     ent;
4386 };
4387
/* Classification of a pte/pmd as a candidate for charge moving. */
enum mc_target_type {
	MC_TARGET_NONE = 0,	/* nothing to move */
	MC_TARGET_PAGE = 1,	/* a page is the target */
	MC_TARGET_SWAP = 2,	/* a swap entry is the target */
};
4393
4394 static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
4395                                                 unsigned long addr, pte_t ptent)
4396 {
4397         struct page *page = vm_normal_page(vma, addr, ptent);
4398
4399         if (!page || !page_mapped(page))
4400                 return NULL;
4401         if (PageAnon(page)) {
4402                 if (!(mc.flags & MOVE_ANON))
4403                         return NULL;
4404         } else {
4405                 if (!(mc.flags & MOVE_FILE))
4406                         return NULL;
4407         }
4408         if (!get_page_unless_zero(page))
4409                 return NULL;
4410
4411         return page;
4412 }
4413
4414 #ifdef CONFIG_SWAP
4415 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
4416                         pte_t ptent, swp_entry_t *entry)
4417 {
4418         struct page *page = NULL;
4419         swp_entry_t ent = pte_to_swp_entry(ptent);
4420
4421         if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
4422                 return NULL;
4423         /*
4424          * Because lookup_swap_cache() updates some statistics counter,
4425          * we call find_get_page() with swapper_space directly.
4426          */
4427         page = find_get_page(swap_address_space(ent), swp_offset(ent));
4428         if (do_memsw_account())
4429                 entry->val = ent.val;
4430
4431         return page;
4432 }
4433 #else
4434 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
4435                         pte_t ptent, swp_entry_t *entry)
4436 {
4437         return NULL;
4438 }
4439 #endif
4440
4441 static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
4442                         unsigned long addr, pte_t ptent, swp_entry_t *entry)
4443 {
4444         struct page *page = NULL;
4445         struct address_space *mapping;
4446         pgoff_t pgoff;
4447
4448         if (!vma->vm_file) /* anonymous vma */
4449                 return NULL;
4450         if (!(mc.flags & MOVE_FILE))
4451                 return NULL;
4452
4453         mapping = vma->vm_file->f_mapping;
4454         pgoff = linear_page_index(vma, addr);
4455
4456         /* page is moved even if it's not RSS of this task(page-faulted). */
4457 #ifdef CONFIG_SWAP
4458         /* shmem/tmpfs may report page out on swap: account for that too. */
4459         if (shmem_mapping(mapping)) {
4460                 page = find_get_entry(mapping, pgoff);
4461                 if (radix_tree_exceptional_entry(page)) {
4462                         swp_entry_t swp = radix_to_swp_entry(page);
4463                         if (do_memsw_account())
4464                                 *entry = swp;
4465                         page = find_get_page(swap_address_space(swp),
4466                                              swp_offset(swp));
4467                 }
4468         } else
4469                 page = find_get_page(mapping, pgoff);
4470 #else
4471         page = find_get_page(mapping, pgoff);
4472 #endif
4473         return page;
4474 }
4475
4476 /**
4477  * mem_cgroup_move_account - move account of the page
4478  * @page: the page
4479  * @compound: charge the page as compound or small page
4480  * @from: mem_cgroup which the page is moved from.
4481  * @to: mem_cgroup which the page is moved to. @from != @to.
4482  *
4483  * The caller must make sure the page is not on LRU (isolate_page() is useful.)
4484  *
4485  * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
4486  * from old cgroup.
4487  */
4488 static int mem_cgroup_move_account(struct page *page,
4489                                    bool compound,
4490                                    struct mem_cgroup *from,
4491                                    struct mem_cgroup *to)
4492 {
4493         unsigned long flags;
4494         unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
4495         int ret;
4496         bool anon;
4497
4498         VM_BUG_ON(from == to);
4499         VM_BUG_ON_PAGE(PageLRU(page), page);
4500         VM_BUG_ON(compound && !PageTransHuge(page));
4501
4502         /*
4503          * Prevent mem_cgroup_migrate() from looking at
4504          * page->mem_cgroup of its source page while we change it.
4505          */
4506         ret = -EBUSY;
4507         if (!trylock_page(page))
4508                 goto out;
4509
4510         ret = -EINVAL;
4511         if (page->mem_cgroup != from)
4512                 goto out_unlock;
4513
4514         anon = PageAnon(page);
4515
4516         spin_lock_irqsave(&from->move_lock, flags);
4517
4518         if (!anon && page_mapped(page)) {
4519                 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
4520                                nr_pages);
4521                 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
4522                                nr_pages);
4523         }
4524
4525         /*
4526          * move_lock grabbed above and caller set from->moving_account, so
4527          * mem_cgroup_update_page_stat() will serialize updates to PageDirty.
4528          * So mapping should be stable for dirty pages.
4529          */
4530         if (!anon && PageDirty(page)) {
4531                 struct address_space *mapping = page_mapping(page);
4532
4533                 if (mapping_cap_account_dirty(mapping)) {
4534                         __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_DIRTY],
4535                                        nr_pages);
4536                         __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_DIRTY],
4537                                        nr_pages);
4538                 }
4539         }
4540
4541         if (PageWriteback(page)) {
4542                 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
4543                                nr_pages);
4544                 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
4545                                nr_pages);
4546         }
4547
4548         /*
4549          * It is safe to change page->mem_cgroup here because the page
4550          * is referenced, charged, and isolated - we can't race with
4551          * uncharging, charging, migration, or LRU putback.
4552          */
4553
4554         /* caller should have done css_get */
4555         page->mem_cgroup = to;
4556         spin_unlock_irqrestore(&from->move_lock, flags);
4557
4558         ret = 0;
4559
4560         local_irq_disable();
4561         mem_cgroup_charge_statistics(to, page, compound, nr_pages);
4562         memcg_check_events(to, page);
4563         mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
4564         memcg_check_events(from, page);
4565         local_irq_enable();
4566 out_unlock:
4567         unlock_page(page);
4568 out:
4569         return ret;
4570 }
4571
4572 /**
4573  * get_mctgt_type - get target type of moving charge
4574  * @vma: the vma the pte to be checked belongs
4575  * @addr: the address corresponding to the pte to be checked
4576  * @ptent: the pte to be checked
4577  * @target: the pointer the target page or swap ent will be stored(can be NULL)
4578  *
4579  * Returns
4580  *   0(MC_TARGET_NONE): if the pte is not a target for move charge.
4581  *   1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
4582  *     move charge. if @target is not NULL, the page is stored in target->page
4583  *     with extra refcnt got(Callers should handle it).
4584  *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
4585  *     target for charge migration. if @target is not NULL, the entry is stored
4586  *     in target->ent.
4587  *
4588  * Called with pte lock held.
4589  */
4590
4591 static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
4592                 unsigned long addr, pte_t ptent, union mc_target *target)
4593 {
4594         struct page *page = NULL;
4595         enum mc_target_type ret = MC_TARGET_NONE;
4596         swp_entry_t ent = { .val = 0 };
4597
4598         if (pte_present(ptent))
4599                 page = mc_handle_present_pte(vma, addr, ptent);
4600         else if (is_swap_pte(ptent))
4601                 page = mc_handle_swap_pte(vma, ptent, &ent);
4602         else if (pte_none(ptent))
4603                 page = mc_handle_file_pte(vma, addr, ptent, &ent);
4604
4605         if (!page && !ent.val)
4606                 return ret;
4607         if (page) {
4608                 /*
4609                  * Do only loose check w/o serialization.
4610                  * mem_cgroup_move_account() checks the page is valid or
4611                  * not under LRU exclusion.
4612                  */
4613                 if (page->mem_cgroup == mc.from) {
4614                         ret = MC_TARGET_PAGE;
4615                         if (target)
4616                                 target->page = page;
4617                 }
4618                 if (!ret || !target)
4619                         put_page(page);
4620         }
4621         /* There is a swap entry and a page doesn't exist or isn't charged */
4622         if (ent.val && !ret &&
4623             mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
4624                 ret = MC_TARGET_SWAP;
4625                 if (target)
4626                         target->ent = ent;
4627         }
4628         return ret;
4629 }
4630
4631 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
4632 /*
4633  * We don't consider swapping or file mapped pages because THP does not
4634  * support them for now.
4635  * Caller should make sure that pmd_trans_huge(pmd) is true.
4636  */
4637 static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
4638                 unsigned long addr, pmd_t pmd, union mc_target *target)
4639 {
4640         struct page *page = NULL;
4641         enum mc_target_type ret = MC_TARGET_NONE;
4642
4643         page = pmd_page(pmd);
4644         VM_BUG_ON_PAGE(!page || !PageHead(page), page);
4645         if (!(mc.flags & MOVE_ANON))
4646                 return ret;
4647         if (page->mem_cgroup == mc.from) {
4648                 ret = MC_TARGET_PAGE;
4649                 if (target) {
4650                         get_page(page);
4651                         target->page = page;
4652                 }
4653         }
4654         return ret;
4655 }
4656 #else
4657 static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
4658                 unsigned long addr, pmd_t pmd, union mc_target *target)
4659 {
4660         return MC_TARGET_NONE;
4661 }
4662 #endif
4663
4664 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
4665                                         unsigned long addr, unsigned long end,
4666                                         struct mm_walk *walk)
4667 {
4668         struct vm_area_struct *vma = walk->vma;
4669         pte_t *pte;
4670         spinlock_t *ptl;
4671
4672         ptl = pmd_trans_huge_lock(pmd, vma);
4673         if (ptl) {
4674                 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
4675                         mc.precharge += HPAGE_PMD_NR;
4676                 spin_unlock(ptl);
4677                 return 0;
4678         }
4679
4680         if (pmd_trans_unstable(pmd))
4681                 return 0;
4682         pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4683         for (; addr != end; pte++, addr += PAGE_SIZE)
4684                 if (get_mctgt_type(vma, addr, *pte, NULL))
4685                         mc.precharge++; /* increment precharge temporarily */
4686         pte_unmap_unlock(pte - 1, ptl);
4687         cond_resched();
4688
4689         return 0;
4690 }
4691
4692 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
4693 {
4694         unsigned long precharge;
4695
4696         struct mm_walk mem_cgroup_count_precharge_walk = {
4697                 .pmd_entry = mem_cgroup_count_precharge_pte_range,
4698                 .mm = mm,
4699         };
4700         down_read(&mm->mmap_sem);
4701         walk_page_range(0, mm->highest_vm_end,
4702                         &mem_cgroup_count_precharge_walk);
4703         up_read(&mm->mmap_sem);
4704
4705         precharge = mc.precharge;
4706         mc.precharge = 0;
4707
4708         return precharge;
4709 }
4710
4711 static int mem_cgroup_precharge_mc(struct mm_struct *mm)
4712 {
4713         unsigned long precharge = mem_cgroup_count_precharge(mm);
4714
4715         VM_BUG_ON(mc.moving_task);
4716         mc.moving_task = current;
4717         return mem_cgroup_do_precharge(precharge);
4718 }
4719
4720 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
4721 static void __mem_cgroup_clear_mc(void)
4722 {
4723         struct mem_cgroup *from = mc.from;
4724         struct mem_cgroup *to = mc.to;
4725
4726         /* we must uncharge all the leftover precharges from mc.to */
4727         if (mc.precharge) {
4728                 cancel_charge(mc.to, mc.precharge);
4729                 mc.precharge = 0;
4730         }
4731         /*
4732          * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
4733          * we must uncharge here.
4734          */
4735         if (mc.moved_charge) {
4736                 cancel_charge(mc.from, mc.moved_charge);
4737                 mc.moved_charge = 0;
4738         }
4739         /* we must fixup refcnts and charges */
4740         if (mc.moved_swap) {
4741                 /* uncharge swap account from the old cgroup */
4742                 if (!mem_cgroup_is_root(mc.from))
4743                         page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
4744
4745                 mem_cgroup_id_put_many(mc.from, mc.moved_swap);
4746
4747                 /*
4748                  * we charged both to->memory and to->memsw, so we
4749                  * should uncharge to->memory.
4750                  */
4751                 if (!mem_cgroup_is_root(mc.to))
4752                         page_counter_uncharge(&mc.to->memory, mc.moved_swap);
4753
4754                 mem_cgroup_id_get_many(mc.to, mc.moved_swap);
4755                 css_put_many(&mc.to->css, mc.moved_swap);
4756
4757                 mc.moved_swap = 0;
4758         }
4759         memcg_oom_recover(from);
4760         memcg_oom_recover(to);
4761         wake_up_all(&mc.waitq);
4762 }
4763
4764 static void mem_cgroup_clear_mc(void)
4765 {
4766         struct mm_struct *mm = mc.mm;
4767
4768         /*
4769          * we must clear moving_task before waking up waiters at the end of
4770          * task migration.
4771          */
4772         mc.moving_task = NULL;
4773         __mem_cgroup_clear_mc();
4774         spin_lock(&mc.lock);
4775         mc.from = NULL;
4776         mc.to = NULL;
4777         mc.mm = NULL;
4778         spin_unlock(&mc.lock);
4779
4780         mmput(mm);
4781 }
4782
4783 static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
4784 {
4785         struct cgroup_subsys_state *css;
4786         struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */
4787         struct mem_cgroup *from;
4788         struct task_struct *leader, *p;
4789         struct mm_struct *mm;
4790         unsigned long move_flags;
4791         int ret = 0;
4792
4793         /* charge immigration isn't supported on the default hierarchy */
4794         if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
4795                 return 0;
4796
4797         /*
4798          * Multi-process migrations only happen on the default hierarchy
4799          * where charge immigration is not used.  Perform charge
4800          * immigration if @tset contains a leader and whine if there are
4801          * multiple.
4802          */
4803         p = NULL;
4804         cgroup_taskset_for_each_leader(leader, css, tset) {
4805                 WARN_ON_ONCE(p);
4806                 p = leader;
4807                 memcg = mem_cgroup_from_css(css);
4808         }
4809         if (!p)
4810                 return 0;
4811
4812         /*
4813          * We are now commited to this value whatever it is. Changes in this
4814          * tunable will only affect upcoming migrations, not the current one.
4815          * So we need to save it, and keep it going.
4816          */
4817         move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
4818         if (!move_flags)
4819                 return 0;
4820
4821         from = mem_cgroup_from_task(p);
4822
4823         VM_BUG_ON(from == memcg);
4824
4825         mm = get_task_mm(p);
4826         if (!mm)
4827                 return 0;
4828         /* We move charges only when we move a owner of the mm */
4829         if (mm->owner == p) {
4830                 VM_BUG_ON(mc.from);
4831                 VM_BUG_ON(mc.to);
4832                 VM_BUG_ON(mc.precharge);
4833                 VM_BUG_ON(mc.moved_charge);
4834                 VM_BUG_ON(mc.moved_swap);
4835
4836                 spin_lock(&mc.lock);
4837                 mc.mm = mm;
4838                 mc.from = from;
4839                 mc.to = memcg;
4840                 mc.flags = move_flags;
4841                 spin_unlock(&mc.lock);
4842                 /* We set mc.moving_task later */
4843
4844                 ret = mem_cgroup_precharge_mc(mm);
4845                 if (ret)
4846                         mem_cgroup_clear_mc();
4847         } else {
4848                 mmput(mm);
4849         }
4850         return ret;
4851 }
4852
4853 static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
4854 {
4855         if (mc.to)
4856                 mem_cgroup_clear_mc();
4857 }
4858
4859 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
4860                                 unsigned long addr, unsigned long end,
4861                                 struct mm_walk *walk)
4862 {
4863         int ret = 0;
4864         struct vm_area_struct *vma = walk->vma;
4865         pte_t *pte;
4866         spinlock_t *ptl;
4867         enum mc_target_type target_type;
4868         union mc_target target;
4869         struct page *page;
4870
4871         ptl = pmd_trans_huge_lock(pmd, vma);
4872         if (ptl) {
4873                 if (mc.precharge < HPAGE_PMD_NR) {
4874                         spin_unlock(ptl);
4875                         return 0;
4876                 }
4877                 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
4878                 if (target_type == MC_TARGET_PAGE) {
4879                         page = target.page;
4880                         if (!isolate_lru_page(page)) {
4881                                 if (!mem_cgroup_move_account(page, true,
4882                                                              mc.from, mc.to)) {
4883                                         mc.precharge -= HPAGE_PMD_NR;
4884                                         mc.moved_charge += HPAGE_PMD_NR;
4885                                 }
4886                                 putback_lru_page(page);
4887                         }
4888                         put_page(page);
4889                 }
4890                 spin_unlock(ptl);
4891                 return 0;
4892         }
4893
4894         if (pmd_trans_unstable(pmd))
4895                 return 0;
4896 retry:
4897         pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4898         for (; addr != end; addr += PAGE_SIZE) {
4899                 pte_t ptent = *(pte++);
4900                 swp_entry_t ent;
4901
4902                 if (!mc.precharge)
4903                         break;
4904
4905                 switch (get_mctgt_type(vma, addr, ptent, &target)) {
4906                 case MC_TARGET_PAGE:
4907                         page = target.page;
4908                         /*
4909                          * We can have a part of the split pmd here. Moving it
4910                          * can be done but it would be too convoluted so simply
4911                          * ignore such a partial THP and keep it in original
4912                          * memcg. There should be somebody mapping the head.
4913                          */
4914                         if (PageTransCompound(page))
4915                                 goto put;
4916                         if (isolate_lru_page(page))
4917                                 goto put;
4918                         if (!mem_cgroup_move_account(page, false,
4919                                                 mc.from, mc.to)) {
4920                                 mc.precharge--;
4921                                 /* we uncharge from mc.from later. */
4922                                 mc.moved_charge++;
4923                         }
4924                         putback_lru_page(page);
4925 put:                    /* get_mctgt_type() gets the page */
4926                         put_page(page);
4927                         break;
4928                 case MC_TARGET_SWAP:
4929                         ent = target.ent;
4930                         if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
4931                                 mc.precharge--;
4932                                 /* we fixup refcnts and charges later. */
4933                                 mc.moved_swap++;
4934                         }
4935                         break;
4936                 default:
4937                         break;
4938                 }
4939         }
4940         pte_unmap_unlock(pte - 1, ptl);
4941         cond_resched();
4942
4943         if (addr != end) {
4944                 /*
4945                  * We have consumed all precharges we got in can_attach().
4946                  * We try charge one by one, but don't do any additional
4947                  * charges to mc.to if we have failed in charge once in attach()
4948                  * phase.
4949                  */
4950                 ret = mem_cgroup_do_precharge(1);
4951                 if (!ret)
4952                         goto retry;
4953         }
4954
4955         return ret;
4956 }
4957
4958 static void mem_cgroup_move_charge(void)
4959 {
4960         struct mm_walk mem_cgroup_move_charge_walk = {
4961                 .pmd_entry = mem_cgroup_move_charge_pte_range,
4962                 .mm = mc.mm,
4963         };
4964
4965         lru_add_drain_all();
4966         /*
4967          * Signal lock_page_memcg() to take the memcg's move_lock
4968          * while we're moving its pages to another memcg. Then wait
4969          * for already started RCU-only updates to finish.
4970          */
4971         atomic_inc(&mc.from->moving_account);
4972         synchronize_rcu();
4973 retry:
4974         if (unlikely(!down_read_trylock(&mc.mm->mmap_sem))) {
4975                 /*
4976                  * Someone who are holding the mmap_sem might be waiting in
4977                  * waitq. So we cancel all extra charges, wake up all waiters,
4978                  * and retry. Because we cancel precharges, we might not be able
4979                  * to move enough charges, but moving charge is a best-effort
4980                  * feature anyway, so it wouldn't be a big problem.
4981                  */
4982                 __mem_cgroup_clear_mc();
4983                 cond_resched();
4984                 goto retry;
4985         }
4986         /*
4987          * When we have consumed all precharges and failed in doing
4988          * additional charge, the page walk just aborts.
4989          */
4990         walk_page_range(0, mc.mm->highest_vm_end, &mem_cgroup_move_charge_walk);
4991
4992         up_read(&mc.mm->mmap_sem);
4993         atomic_dec(&mc.from->moving_account);
4994 }
4995
4996 static void mem_cgroup_move_task(void)
4997 {
4998         if (mc.to) {
4999                 mem_cgroup_move_charge();
5000                 mem_cgroup_clear_mc();
5001         }
5002 }
5003 #else   /* !CONFIG_MMU */
/* !CONFIG_MMU: no charge moving, attaching is always allowed. */
static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
{
	return 0;
}
/* !CONFIG_MMU: nothing was set up, so there is nothing to cancel. */
static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
{
}
/* !CONFIG_MMU: no charges to move. */
static void mem_cgroup_move_task(void)
{
}
5014 #endif
5015
5016 /*
5017  * Cgroup retains root cgroups across [un]mount cycles making it necessary
5018  * to verify whether we're attached to the default hierarchy on each mount
5019  * attempt.
5020  */
5021 static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
5022 {
5023         /*
5024          * use_hierarchy is forced on the default hierarchy.  cgroup core
5025          * guarantees that @root doesn't have any children, so turning it
5026          * on for the root memcg is enough.
5027          */
5028         if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
5029                 root_mem_cgroup->use_hierarchy = true;
5030         else
5031                 root_mem_cgroup->use_hierarchy = false;
5032 }
5033
5034 static u64 memory_current_read(struct cgroup_subsys_state *css,
5035                                struct cftype *cft)
5036 {
5037         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5038
5039         return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
5040 }
5041
5042 static int memory_low_show(struct seq_file *m, void *v)
5043 {
5044         struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5045         unsigned long low = READ_ONCE(memcg->low);
5046
5047         if (low == PAGE_COUNTER_MAX)
5048                 seq_puts(m, "max\n");
5049         else
5050                 seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE);
5051
5052         return 0;
5053 }
5054
5055 static ssize_t memory_low_write(struct kernfs_open_file *of,
5056                                 char *buf, size_t nbytes, loff_t off)
5057 {
5058         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5059         unsigned long low;
5060         int err;
5061
5062         buf = strstrip(buf);
5063         err = page_counter_memparse(buf, "max", &low);
5064         if (err)
5065                 return err;
5066
5067         memcg->low = low;
5068
5069         return nbytes;
5070 }
5071
5072 static int memory_high_show(struct seq_file *m, void *v)
5073 {
5074         struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5075         unsigned long high = READ_ONCE(memcg->high);
5076
5077         if (high == PAGE_COUNTER_MAX)
5078                 seq_puts(m, "max\n");
5079         else
5080                 seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE);
5081
5082         return 0;
5083 }
5084
5085 static ssize_t memory_high_write(struct kernfs_open_file *of,
5086                                  char *buf, size_t nbytes, loff_t off)
5087 {
5088         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5089         unsigned long nr_pages;
5090         unsigned long high;
5091         int err;
5092
5093         buf = strstrip(buf);
5094         err = page_counter_memparse(buf, "max", &high);
5095         if (err)
5096                 return err;
5097
5098         memcg->high = high;
5099
5100         nr_pages = page_counter_read(&memcg->memory);
5101         if (nr_pages > high)
5102                 try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
5103                                              GFP_KERNEL, true);
5104
5105         memcg_wb_domain_size_changed(memcg);
5106         return nbytes;
5107 }
5108
5109 static int memory_max_show(struct seq_file *m, void *v)
5110 {
5111         struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5112         unsigned long max = READ_ONCE(memcg->memory.limit);
5113
5114         if (max == PAGE_COUNTER_MAX)
5115                 seq_puts(m, "max\n");
5116         else
5117                 seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
5118
5119         return 0;
5120 }
5121
5122 static ssize_t memory_max_write(struct kernfs_open_file *of,
5123                                 char *buf, size_t nbytes, loff_t off)
5124 {
5125         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5126         unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES;
5127         bool drained = false;
5128         unsigned long max;
5129         int err;
5130
5131         buf = strstrip(buf);
5132         err = page_counter_memparse(buf, "max", &max);
5133         if (err)
5134                 return err;
5135
5136         xchg(&memcg->memory.limit, max);
5137
5138         for (;;) {
5139                 unsigned long nr_pages = page_counter_read(&memcg->memory);
5140
5141                 if (nr_pages <= max)
5142                         break;
5143
5144                 if (signal_pending(current)) {
5145                         err = -EINTR;
5146                         break;
5147                 }
5148
5149                 if (!drained) {
5150                         drain_all_stock(memcg);
5151                         drained = true;
5152                         continue;
5153                 }
5154
5155                 if (nr_reclaims) {
5156                         if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
5157                                                           GFP_KERNEL, true))
5158                                 nr_reclaims--;
5159                         continue;
5160                 }
5161
5162                 mem_cgroup_events(memcg, MEMCG_OOM, 1);
5163                 if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
5164                         break;
5165         }
5166
5167         memcg_wb_domain_size_changed(memcg);
5168         return nbytes;
5169 }
5170
5171 static int memory_events_show(struct seq_file *m, void *v)
5172 {
5173         struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5174
5175         seq_printf(m, "low %lu\n", mem_cgroup_read_events(memcg, MEMCG_LOW));
5176         seq_printf(m, "high %lu\n", mem_cgroup_read_events(memcg, MEMCG_HIGH));
5177         seq_printf(m, "max %lu\n", mem_cgroup_read_events(memcg, MEMCG_MAX));
5178         seq_printf(m, "oom %lu\n", mem_cgroup_read_events(memcg, MEMCG_OOM));
5179
5180         return 0;
5181 }
5182
5183 static int memory_stat_show(struct seq_file *m, void *v)
5184 {
5185         struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5186         unsigned long stat[MEMCG_NR_STAT];
5187         unsigned long events[MEMCG_NR_EVENTS];
5188         int i;
5189
5190         /*
5191          * Provide statistics on the state of the memory subsystem as
5192          * well as cumulative event counters that show past behavior.
5193          *
5194          * This list is ordered following a combination of these gradients:
5195          * 1) generic big picture -> specifics and details
5196          * 2) reflecting userspace activity -> reflecting kernel heuristics
5197          *
5198          * Current memory state:
5199          */
5200
5201         tree_stat(memcg, stat);
5202         tree_events(memcg, events);
5203
5204         seq_printf(m, "anon %llu\n",
5205                    (u64)stat[MEM_CGROUP_STAT_RSS] * PAGE_SIZE);
5206         seq_printf(m, "file %llu\n",
5207                    (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE);
5208         seq_printf(m, "kernel_stack %llu\n",
5209                    (u64)stat[MEMCG_KERNEL_STACK_KB] * 1024);
5210         seq_printf(m, "slab %llu\n",
5211                    (u64)(stat[MEMCG_SLAB_RECLAIMABLE] +
5212                          stat[MEMCG_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
5213         seq_printf(m, "sock %llu\n",
5214                    (u64)stat[MEMCG_SOCK] * PAGE_SIZE);
5215
5216         seq_printf(m, "shmem %llu\n",
5217                    (u64)stat[MEM_CGROUP_STAT_SHMEM] * PAGE_SIZE);
5218         seq_printf(m, "file_mapped %llu\n",
5219                    (u64)stat[MEM_CGROUP_STAT_FILE_MAPPED] * PAGE_SIZE);
5220         seq_printf(m, "file_dirty %llu\n",
5221                    (u64)stat[MEM_CGROUP_STAT_DIRTY] * PAGE_SIZE);
5222         seq_printf(m, "file_writeback %llu\n",
5223                    (u64)stat[MEM_CGROUP_STAT_WRITEBACK] * PAGE_SIZE);
5224
5225         for (i = 0; i < NR_LRU_LISTS; i++) {
5226                 struct mem_cgroup *mi;
5227                 unsigned long val = 0;
5228
5229                 for_each_mem_cgroup_tree(mi, memcg)
5230                         val += mem_cgroup_nr_lru_pages(mi, BIT(i));
5231                 seq_printf(m, "%s %llu\n",
5232                            mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE);
5233         }
5234
5235         seq_printf(m, "slab_reclaimable %llu\n",
5236                    (u64)stat[MEMCG_SLAB_RECLAIMABLE] * PAGE_SIZE);
5237         seq_printf(m, "slab_unreclaimable %llu\n",
5238                    (u64)stat[MEMCG_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
5239
5240         /* Accumulated memory events */
5241
5242         seq_printf(m, "pgfault %lu\n",
5243                    events[MEM_CGROUP_EVENTS_PGFAULT]);
5244         seq_printf(m, "pgmajfault %lu\n",
5245                    events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
5246
5247         return 0;
5248 }
5249
5250 static struct cftype memory_files[] = {
5251         {
5252                 .name = "current",
5253                 .flags = CFTYPE_NOT_ON_ROOT,
5254                 .read_u64 = memory_current_read,
5255         },
5256         {
5257                 .name = "low",
5258                 .flags = CFTYPE_NOT_ON_ROOT,
5259                 .seq_show = memory_low_show,
5260                 .write = memory_low_write,
5261         },
5262         {
5263                 .name = "high",
5264                 .flags = CFTYPE_NOT_ON_ROOT,
5265                 .seq_show = memory_high_show,
5266                 .write = memory_high_write,
5267         },
5268         {
5269                 .name = "max",
5270                 .flags = CFTYPE_NOT_ON_ROOT,
5271                 .seq_show = memory_max_show,
5272                 .write = memory_max_write,
5273         },
5274         {
5275                 .name = "events",
5276                 .flags = CFTYPE_NOT_ON_ROOT,
5277                 .file_offset = offsetof(struct mem_cgroup, events_file),
5278                 .seq_show = memory_events_show,
5279         },
5280         {
5281                 .name = "stat",
5282                 .flags = CFTYPE_NOT_ON_ROOT,
5283                 .seq_show = memory_stat_show,
5284         },
5285         { }     /* terminate */
5286 };
5287
5288 struct cgroup_subsys memory_cgrp_subsys = {
5289         .css_alloc = mem_cgroup_css_alloc,
5290         .css_online = mem_cgroup_css_online,
5291         .css_offline = mem_cgroup_css_offline,
5292         .css_released = mem_cgroup_css_released,
5293         .css_free = mem_cgroup_css_free,
5294         .css_reset = mem_cgroup_css_reset,
5295         .can_attach = mem_cgroup_can_attach,
5296         .cancel_attach = mem_cgroup_cancel_attach,
5297         .post_attach = mem_cgroup_move_task,
5298         .bind = mem_cgroup_bind,
5299         .dfl_cftypes = memory_files,
5300         .legacy_cftypes = mem_cgroup_legacy_files,
5301         .early_init = 0,
5302 };
5303
5304 /**
5305  * mem_cgroup_low - check if memory consumption is below the normal range
5306  * @root: the highest ancestor to consider
5307  * @memcg: the memory cgroup to check
5308  *
5309  * Returns %true if memory consumption of @memcg, and that of all
5310  * configurable ancestors up to @root, is below the normal range.
5311  */
5312 bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
5313 {
5314         if (mem_cgroup_disabled())
5315                 return false;
5316
5317         /*
5318          * The toplevel group doesn't have a configurable range, so
5319          * it's never low when looked at directly, and it is not
5320          * considered an ancestor when assessing the hierarchy.
5321          */
5322
5323         if (memcg == root_mem_cgroup)
5324                 return false;
5325
5326         if (page_counter_read(&memcg->memory) >= memcg->low)
5327                 return false;
5328
5329         while (memcg != root) {
5330                 memcg = parent_mem_cgroup(memcg);
5331
5332                 if (memcg == root_mem_cgroup)
5333                         break;
5334
5335                 if (page_counter_read(&memcg->memory) >= memcg->low)
5336                         return false;
5337         }
5338         return true;
5339 }
5340
5341 /**
5342  * mem_cgroup_try_charge - try charging a page
5343  * @page: page to charge
5344  * @mm: mm context of the victim
5345  * @gfp_mask: reclaim mode
5346  * @memcgp: charged memcg return
5347  * @compound: charge the page as compound or small page
5348  *
5349  * Try to charge @page to the memcg that @mm belongs to, reclaiming
5350  * pages according to @gfp_mask if necessary.
5351  *
5352  * Returns 0 on success, with *@memcgp pointing to the charged memcg.
5353  * Otherwise, an error code is returned.
5354  *
5355  * After page->mapping has been set up, the caller must finalize the
5356  * charge with mem_cgroup_commit_charge().  Or abort the transaction
5357  * with mem_cgroup_cancel_charge() in case page instantiation fails.
5358  */
5359 int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
5360                           gfp_t gfp_mask, struct mem_cgroup **memcgp,
5361                           bool compound)
5362 {
5363         struct mem_cgroup *memcg = NULL;
5364         unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
5365         int ret = 0;
5366
5367         if (mem_cgroup_disabled())
5368                 goto out;
5369
5370         if (PageSwapCache(page)) {
5371                 /*
5372                  * Every swap fault against a single page tries to charge the
5373                  * page, bail as early as possible.  shmem_unuse() encounters
5374                  * already charged pages, too.  The USED bit is protected by
5375                  * the page lock, which serializes swap cache removal, which
5376                  * in turn serializes uncharging.
5377                  */
5378                 VM_BUG_ON_PAGE(!PageLocked(page), page);
5379                 if (page->mem_cgroup)
5380                         goto out;
5381
5382                 if (do_swap_account) {
5383                         swp_entry_t ent = { .val = page_private(page), };
5384                         unsigned short id = lookup_swap_cgroup_id(ent);
5385
5386                         rcu_read_lock();
5387                         memcg = mem_cgroup_from_id(id);
5388                         if (memcg && !css_tryget_online(&memcg->css))
5389                                 memcg = NULL;
5390                         rcu_read_unlock();
5391                 }
5392         }
5393
5394         if (!memcg)
5395                 memcg = get_mem_cgroup_from_mm(mm);
5396
5397         ret = try_charge(memcg, gfp_mask, nr_pages);
5398
5399         css_put(&memcg->css);
5400 out:
5401         *memcgp = memcg;
5402         return ret;
5403 }
5404
5405 /**
5406  * mem_cgroup_commit_charge - commit a page charge
5407  * @page: page to charge
5408  * @memcg: memcg to charge the page to
5409  * @lrucare: page might be on LRU already
5410  * @compound: charge the page as compound or small page
5411  *
5412  * Finalize a charge transaction started by mem_cgroup_try_charge(),
5413  * after page->mapping has been set up.  This must happen atomically
5414  * as part of the page instantiation, i.e. under the page table lock
5415  * for anonymous pages, under the page lock for page and swap cache.
5416  *
5417  * In addition, the page must not be on the LRU during the commit, to
5418  * prevent racing with task migration.  If it might be, use @lrucare.
5419  *
5420  * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
5421  */
5422 void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
5423                               bool lrucare, bool compound)
5424 {
5425         unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
5426
5427         VM_BUG_ON_PAGE(!page->mapping, page);
5428         VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
5429
5430         if (mem_cgroup_disabled())
5431                 return;
5432         /*
5433          * Swap faults will attempt to charge the same page multiple
5434          * times.  But reuse_swap_page() might have removed the page
5435          * from swapcache already, so we can't check PageSwapCache().
5436          */
5437         if (!memcg)
5438                 return;
5439
5440         commit_charge(page, memcg, lrucare);
5441
5442         local_irq_disable();
5443         mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
5444         memcg_check_events(memcg, page);
5445         local_irq_enable();
5446
5447         if (do_memsw_account() && PageSwapCache(page)) {
5448                 swp_entry_t entry = { .val = page_private(page) };
5449                 /*
5450                  * The swap entry might not get freed for a long time,
5451                  * let's not wait for it.  The page already received a
5452                  * memory+swap charge, drop the swap entry duplicate.
5453                  */
5454                 mem_cgroup_uncharge_swap(entry);
5455         }
5456 }
5457
5458 /**
5459  * mem_cgroup_cancel_charge - cancel a page charge
5460  * @page: page to charge
5461  * @memcg: memcg to charge the page to
5462  * @compound: charge the page as compound or small page
5463  *
5464  * Cancel a charge transaction started by mem_cgroup_try_charge().
5465  */
5466 void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
5467                 bool compound)
5468 {
5469         unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
5470
5471         if (mem_cgroup_disabled())
5472                 return;
5473         /*
5474          * Swap faults will attempt to charge the same page multiple
5475          * times.  But reuse_swap_page() might have removed the page
5476          * from swapcache already, so we can't check PageSwapCache().
5477          */
5478         if (!memcg)
5479                 return;
5480
5481         cancel_charge(memcg, nr_pages);
5482 }
5483
5484 static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
5485                            unsigned long nr_anon, unsigned long nr_file,
5486                            unsigned long nr_kmem, unsigned long nr_huge,
5487                            unsigned long nr_shmem, struct page *dummy_page)
5488 {
5489         unsigned long nr_pages = nr_anon + nr_file + nr_kmem;
5490         unsigned long flags;
5491
5492         if (!mem_cgroup_is_root(memcg)) {
5493                 page_counter_uncharge(&memcg->memory, nr_pages);
5494                 if (do_memsw_account())
5495                         page_counter_uncharge(&memcg->memsw, nr_pages);
5496                 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && nr_kmem)
5497                         page_counter_uncharge(&memcg->kmem, nr_kmem);
5498                 memcg_oom_recover(memcg);
5499         }
5500
5501         local_irq_save(flags);
5502         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
5503         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
5504         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
5505         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_SHMEM], nr_shmem);
5506         __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
5507         __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
5508         memcg_check_events(memcg, dummy_page);
5509         local_irq_restore(flags);
5510
5511         if (!mem_cgroup_is_root(memcg))
5512                 css_put_many(&memcg->css, nr_pages);
5513 }
5514
5515 static void uncharge_list(struct list_head *page_list)
5516 {
5517         struct mem_cgroup *memcg = NULL;
5518         unsigned long nr_shmem = 0;
5519         unsigned long nr_anon = 0;
5520         unsigned long nr_file = 0;
5521         unsigned long nr_huge = 0;
5522         unsigned long nr_kmem = 0;
5523         unsigned long pgpgout = 0;
5524         struct list_head *next;
5525         struct page *page;
5526
5527         /*
5528          * Note that the list can be a single page->lru; hence the
5529          * do-while loop instead of a simple list_for_each_entry().
5530          */
5531         next = page_list->next;
5532         do {
5533                 page = list_entry(next, struct page, lru);
5534                 next = page->lru.next;
5535
5536                 VM_BUG_ON_PAGE(PageLRU(page), page);
5537                 VM_BUG_ON_PAGE(page_count(page), page);
5538
5539                 if (!page->mem_cgroup)
5540                         continue;
5541
5542                 /*
5543                  * Nobody should be changing or seriously looking at
5544                  * page->mem_cgroup at this point, we have fully
5545                  * exclusive access to the page.
5546                  */
5547
5548                 if (memcg != page->mem_cgroup) {
5549                         if (memcg) {
5550                                 uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
5551                                                nr_kmem, nr_huge, nr_shmem, page);
5552                                 pgpgout = nr_anon = nr_file = nr_kmem = 0;
5553                                 nr_huge = nr_shmem = 0;
5554                         }
5555                         memcg = page->mem_cgroup;
5556                 }
5557
5558                 if (!PageKmemcg(page)) {
5559                         unsigned int nr_pages = 1;
5560
5561                         if (PageTransHuge(page)) {
5562                                 nr_pages <<= compound_order(page);
5563                                 nr_huge += nr_pages;
5564                         }
5565                         if (PageAnon(page))
5566                                 nr_anon += nr_pages;
5567                         else {
5568                                 nr_file += nr_pages;
5569                                 if (PageSwapBacked(page))
5570                                         nr_shmem += nr_pages;
5571                         }
5572                         pgpgout++;
5573                 } else {
5574                         nr_kmem += 1 << compound_order(page);
5575                         __ClearPageKmemcg(page);
5576                 }
5577
5578                 page->mem_cgroup = NULL;
5579         } while (next != page_list);
5580
5581         if (memcg)
5582                 uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
5583                                nr_kmem, nr_huge, nr_shmem, page);
5584 }
5585
5586 /**
5587  * mem_cgroup_uncharge - uncharge a page
5588  * @page: page to uncharge
5589  *
5590  * Uncharge a page previously charged with mem_cgroup_try_charge() and
5591  * mem_cgroup_commit_charge().
5592  */
5593 void mem_cgroup_uncharge(struct page *page)
5594 {
5595         if (mem_cgroup_disabled())
5596                 return;
5597
5598         /* Don't touch page->lru of any random page, pre-check: */
5599         if (!page->mem_cgroup)
5600                 return;
5601
5602         INIT_LIST_HEAD(&page->lru);
5603         uncharge_list(&page->lru);
5604 }
5605
/**
 * mem_cgroup_uncharge_list - uncharge a list of pages
 * @page_list: list of pages to uncharge
 *
 * Uncharge a list of pages previously charged with
 * mem_cgroup_try_charge() and mem_cgroup_commit_charge().  Nothing
 * happens if the controller is disabled or @page_list is empty.
 */
void mem_cgroup_uncharge_list(struct list_head *page_list)
{
	if (mem_cgroup_disabled() || list_empty(page_list))
		return;

	uncharge_list(page_list);
}
5621
5622 /**
5623  * mem_cgroup_migrate - charge a page's replacement
5624  * @oldpage: currently circulating page
5625  * @newpage: replacement page
5626  *
5627  * Charge @newpage as a replacement page for @oldpage. @oldpage will
5628  * be uncharged upon free.
5629  *
5630  * Both pages must be locked, @newpage->mapping must be set up.
5631  */
5632 void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
5633 {
5634         struct mem_cgroup *memcg;
5635         unsigned int nr_pages;
5636         bool compound;
5637         unsigned long flags;
5638
5639         VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
5640         VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
5641         VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
5642         VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
5643                        newpage);
5644
5645         if (mem_cgroup_disabled())
5646                 return;
5647
5648         /* Page cache replacement: new page already charged? */
5649         if (newpage->mem_cgroup)
5650                 return;
5651
5652         /* Swapcache readahead pages can get replaced before being charged */
5653         memcg = oldpage->mem_cgroup;
5654         if (!memcg)
5655                 return;
5656
5657         /* Force-charge the new page. The old one will be freed soon */
5658         compound = PageTransHuge(newpage);
5659         nr_pages = compound ? hpage_nr_pages(newpage) : 1;
5660
5661         page_counter_charge(&memcg->memory, nr_pages);
5662         if (do_memsw_account())
5663                 page_counter_charge(&memcg->memsw, nr_pages);
5664         css_get_many(&memcg->css, nr_pages);
5665
5666         commit_charge(newpage, memcg, false);
5667
5668         local_irq_save(flags);
5669         mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
5670         memcg_check_events(memcg, newpage);
5671         local_irq_restore(flags);
5672 }
5673
5674 DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
5675 EXPORT_SYMBOL(memcg_sockets_enabled_key);
5676
5677 void mem_cgroup_sk_alloc(struct sock *sk)
5678 {
5679         struct mem_cgroup *memcg;
5680
5681         if (!mem_cgroup_sockets_enabled)
5682                 return;
5683
5684         /*
5685          * Socket cloning can throw us here with sk_memcg already
5686          * filled. It won't however, necessarily happen from
5687          * process context. So the test for root memcg given
5688          * the current task's memcg won't help us in this case.
5689          *
5690          * Respecting the original socket's memcg is a better
5691          * decision in this case.
5692          */
5693         if (sk->sk_memcg) {
5694                 BUG_ON(mem_cgroup_is_root(sk->sk_memcg));
5695                 css_get(&sk->sk_memcg->css);
5696                 return;
5697         }
5698
5699         rcu_read_lock();
5700         memcg = mem_cgroup_from_task(current);
5701         if (memcg == root_mem_cgroup)
5702                 goto out;
5703         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
5704                 goto out;
5705         if (css_tryget_online(&memcg->css))
5706                 sk->sk_memcg = memcg;
5707 out:
5708         rcu_read_unlock();
5709 }
5710
5711 void mem_cgroup_sk_free(struct sock *sk)
5712 {
5713         if (sk->sk_memcg)
5714                 css_put(&sk->sk_memcg->css);
5715 }
5716
5717 /**
5718  * mem_cgroup_charge_skmem - charge socket memory
5719  * @memcg: memcg to charge
5720  * @nr_pages: number of pages to charge
5721  *
5722  * Charges @nr_pages to @memcg. Returns %true if the charge fit within
5723  * @memcg's configured limit, %false if the charge had to be forced.
5724  */
5725 bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
5726 {
5727         gfp_t gfp_mask = GFP_KERNEL;
5728
5729         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
5730                 struct page_counter *fail;
5731
5732                 if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
5733                         memcg->tcpmem_pressure = 0;
5734                         return true;
5735                 }
5736                 page_counter_charge(&memcg->tcpmem, nr_pages);
5737                 memcg->tcpmem_pressure = 1;
5738                 return false;
5739         }
5740
5741         /* Don't block in the packet receive path */
5742         if (in_softirq())
5743                 gfp_mask = GFP_NOWAIT;
5744
5745         this_cpu_add(memcg->stat->count[MEMCG_SOCK], nr_pages);
5746
5747         if (try_charge(memcg, gfp_mask, nr_pages) == 0)
5748                 return true;
5749
5750         try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages);
5751         return false;
5752 }
5753
5754 /**
5755  * mem_cgroup_uncharge_skmem - uncharge socket memory
5756  * @memcg - memcg to uncharge
5757  * @nr_pages - number of pages to uncharge
5758  */
5759 void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
5760 {
5761         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
5762                 page_counter_uncharge(&memcg->tcpmem, nr_pages);
5763                 return;
5764         }
5765
5766         this_cpu_sub(memcg->stat->count[MEMCG_SOCK], nr_pages);
5767
5768         page_counter_uncharge(&memcg->memory, nr_pages);
5769         css_put_many(&memcg->css, nr_pages);
5770 }
5771
5772 static int __init cgroup_memory(char *s)
5773 {
5774         char *token;
5775
5776         while ((token = strsep(&s, ",")) != NULL) {
5777                 if (!*token)
5778                         continue;
5779                 if (!strcmp(token, "nosocket"))
5780                         cgroup_memory_nosocket = true;
5781                 if (!strcmp(token, "nokmem"))
5782                         cgroup_memory_nokmem = true;
5783         }
5784         return 0;
5785 }
5786 __setup("cgroup.memory=", cgroup_memory);
5787
5788 /*
5789  * subsys_initcall() for memory controller.
5790  *
5791  * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this
5792  * context because of lock dependencies (cgroup_lock -> cpu hotplug) but
5793  * basically everything that doesn't depend on a specific mem_cgroup structure
5794  * should be initialized from here.
5795  */
5796 static int __init mem_cgroup_init(void)
5797 {
5798         int cpu, node;
5799
5800 #ifndef CONFIG_SLOB
5801         /*
5802          * Kmem cache creation is mostly done with the slab_mutex held,
5803          * so use a workqueue with limited concurrency to avoid stalling
5804          * all worker threads in case lots of cgroups are created and
5805          * destroyed simultaneously.
5806          */
5807         memcg_kmem_cache_wq = alloc_workqueue("memcg_kmem_cache", 0, 1);
5808         BUG_ON(!memcg_kmem_cache_wq);
5809 #endif
5810
5811         cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
5812                                   memcg_hotplug_cpu_dead);
5813
5814         for_each_possible_cpu(cpu)
5815                 INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
5816                           drain_local_stock);
5817
5818         for_each_node(node) {
5819                 struct mem_cgroup_tree_per_node *rtpn;
5820
5821                 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
5822                                     node_online(node) ? node : NUMA_NO_NODE);
5823
5824                 rtpn->rb_root = RB_ROOT;
5825                 spin_lock_init(&rtpn->lock);
5826                 soft_limit_tree.rb_tree_per_node[node] = rtpn;
5827         }
5828
5829         return 0;
5830 }
5831 subsys_initcall(mem_cgroup_init);
5832
5833 #ifdef CONFIG_MEMCG_SWAP
5834 static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
5835 {
5836         while (!atomic_inc_not_zero(&memcg->id.ref)) {
5837                 /*
5838                  * The root cgroup cannot be destroyed, so it's refcount must
5839                  * always be >= 1.
5840                  */
5841                 if (WARN_ON_ONCE(memcg == root_mem_cgroup)) {
5842                         VM_BUG_ON(1);
5843                         break;
5844                 }
5845                 memcg = parent_mem_cgroup(memcg);
5846                 if (!memcg)
5847                         memcg = root_mem_cgroup;
5848         }
5849         return memcg;
5850 }
5851
5852 /**
5853  * mem_cgroup_swapout - transfer a memsw charge to swap
5854  * @page: page whose memsw charge to transfer
5855  * @entry: swap entry to move the charge to
5856  *
5857  * Transfer the memsw charge of @page to @entry.
5858  */
5859 void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
5860 {
5861         struct mem_cgroup *memcg, *swap_memcg;
5862         unsigned short oldid;
5863
5864         VM_BUG_ON_PAGE(PageLRU(page), page);
5865         VM_BUG_ON_PAGE(page_count(page), page);
5866
5867         if (!do_memsw_account())
5868                 return;
5869
5870         memcg = page->mem_cgroup;
5871
5872         /* Readahead page, never charged */
5873         if (!memcg)
5874                 return;
5875
5876         /*
5877          * In case the memcg owning these pages has been offlined and doesn't
5878          * have an ID allocated to it anymore, charge the closest online
5879          * ancestor for the swap instead and transfer the memory+swap charge.
5880          */
5881         swap_memcg = mem_cgroup_id_get_online(memcg);
5882         oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg));
5883         VM_BUG_ON_PAGE(oldid, page);
5884         mem_cgroup_swap_statistics(swap_memcg, true);
5885
5886         page->mem_cgroup = NULL;
5887
5888         if (!mem_cgroup_is_root(memcg))
5889                 page_counter_uncharge(&memcg->memory, 1);
5890
5891         if (memcg != swap_memcg) {
5892                 if (!mem_cgroup_is_root(swap_memcg))
5893                         page_counter_charge(&swap_memcg->memsw, 1);
5894                 page_counter_uncharge(&memcg->memsw, 1);
5895         }
5896
5897         /*
5898          * Interrupts should be disabled here because the caller holds the
5899          * mapping->tree_lock lock which is taken with interrupts-off. It is
5900          * important here to have the interrupts disabled because it is the
5901          * only synchronisation we have for udpating the per-CPU variables.
5902          */
5903         VM_BUG_ON(!irqs_disabled());
5904         mem_cgroup_charge_statistics(memcg, page, false, -1);
5905         memcg_check_events(memcg, page);
5906
5907         if (!mem_cgroup_is_root(memcg))
5908                 css_put(&memcg->css);
5909 }
5910
5911 /*
5912  * mem_cgroup_try_charge_swap - try charging a swap entry
5913  * @page: page being added to swap
5914  * @entry: swap entry to charge
5915  *
5916  * Try to charge @entry to the memcg that @page belongs to.
5917  *
5918  * Returns 0 on success, -ENOMEM on failure.
5919  */
5920 int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
5921 {
5922         struct mem_cgroup *memcg;
5923         struct page_counter *counter;
5924         unsigned short oldid;
5925
5926         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account)
5927                 return 0;
5928
5929         memcg = page->mem_cgroup;
5930
5931         /* Readahead page, never charged */
5932         if (!memcg)
5933                 return 0;
5934
5935         memcg = mem_cgroup_id_get_online(memcg);
5936
5937         if (!mem_cgroup_is_root(memcg) &&
5938             !page_counter_try_charge(&memcg->swap, 1, &counter)) {
5939                 mem_cgroup_id_put(memcg);
5940                 return -ENOMEM;
5941         }
5942
5943         oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
5944         VM_BUG_ON_PAGE(oldid, page);
5945         mem_cgroup_swap_statistics(memcg, true);
5946
5947         return 0;
5948 }
5949
5950 /**
5951  * mem_cgroup_uncharge_swap - uncharge a swap entry
5952  * @entry: swap entry to uncharge
5953  *
5954  * Drop the swap charge associated with @entry.
5955  */
5956 void mem_cgroup_uncharge_swap(swp_entry_t entry)
5957 {
5958         struct mem_cgroup *memcg;
5959         unsigned short id;
5960
5961         if (!do_swap_account)
5962                 return;
5963
5964         id = swap_cgroup_record(entry, 0);
5965         rcu_read_lock();
5966         memcg = mem_cgroup_from_id(id);
5967         if (memcg) {
5968                 if (!mem_cgroup_is_root(memcg)) {
5969                         if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
5970                                 page_counter_uncharge(&memcg->swap, 1);
5971                         else
5972                                 page_counter_uncharge(&memcg->memsw, 1);
5973                 }
5974                 mem_cgroup_swap_statistics(memcg, false);
5975                 mem_cgroup_id_put(memcg);
5976         }
5977         rcu_read_unlock();
5978 }
5979
5980 long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
5981 {
5982         long nr_swap_pages = get_nr_swap_pages();
5983
5984         if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
5985                 return nr_swap_pages;
5986         for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
5987                 nr_swap_pages = min_t(long, nr_swap_pages,
5988                                       READ_ONCE(memcg->swap.limit) -
5989                                       page_counter_read(&memcg->swap));
5990         return nr_swap_pages;
5991 }
5992
5993 bool mem_cgroup_swap_full(struct page *page)
5994 {
5995         struct mem_cgroup *memcg;
5996
5997         VM_BUG_ON_PAGE(!PageLocked(page), page);
5998
5999         if (vm_swap_full())
6000                 return true;
6001         if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
6002                 return false;
6003
6004         memcg = page->mem_cgroup;
6005         if (!memcg)
6006                 return false;
6007
6008         for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
6009                 if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.limit)
6010                         return true;
6011
6012         return false;
6013 }
6014
6015 /* for remember boot option*/
6016 #ifdef CONFIG_MEMCG_SWAP_ENABLED
6017 static int really_do_swap_account __initdata = 1;
6018 #else
6019 static int really_do_swap_account __initdata;
6020 #endif
6021
6022 static int __init enable_swap_account(char *s)
6023 {
6024         if (!strcmp(s, "1"))
6025                 really_do_swap_account = 1;
6026         else if (!strcmp(s, "0"))
6027                 really_do_swap_account = 0;
6028         return 1;
6029 }
6030 __setup("swapaccount=", enable_swap_account);
6031
6032 static u64 swap_current_read(struct cgroup_subsys_state *css,
6033                              struct cftype *cft)
6034 {
6035         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6036
6037         return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
6038 }
6039
6040 static int swap_max_show(struct seq_file *m, void *v)
6041 {
6042         struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
6043         unsigned long max = READ_ONCE(memcg->swap.limit);
6044
6045         if (max == PAGE_COUNTER_MAX)
6046                 seq_puts(m, "max\n");
6047         else
6048                 seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
6049
6050         return 0;
6051 }
6052
6053 static ssize_t swap_max_write(struct kernfs_open_file *of,
6054                               char *buf, size_t nbytes, loff_t off)
6055 {
6056         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6057         unsigned long max;
6058         int err;
6059
6060         buf = strstrip(buf);
6061         err = page_counter_memparse(buf, "max", &max);
6062         if (err)
6063                 return err;
6064
6065         mutex_lock(&memcg_limit_mutex);
6066         err = page_counter_limit(&memcg->swap, max);
6067         mutex_unlock(&memcg_limit_mutex);
6068         if (err)
6069                 return err;
6070
6071         return nbytes;
6072 }
6073
6074 static struct cftype swap_files[] = {
6075         {
6076                 .name = "swap.current",
6077                 .flags = CFTYPE_NOT_ON_ROOT,
6078                 .read_u64 = swap_current_read,
6079         },
6080         {
6081                 .name = "swap.max",
6082                 .flags = CFTYPE_NOT_ON_ROOT,
6083                 .seq_show = swap_max_show,
6084                 .write = swap_max_write,
6085         },
6086         { }     /* terminate */
6087 };
6088
6089 static struct cftype memsw_cgroup_files[] = {
6090         {
6091                 .name = "memsw.usage_in_bytes",
6092                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
6093                 .read_u64 = mem_cgroup_read_u64,
6094         },
6095         {
6096                 .name = "memsw.max_usage_in_bytes",
6097                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
6098                 .write = mem_cgroup_reset,
6099                 .read_u64 = mem_cgroup_read_u64,
6100         },
6101         {
6102                 .name = "memsw.limit_in_bytes",
6103                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
6104                 .write = mem_cgroup_write,
6105                 .read_u64 = mem_cgroup_read_u64,
6106         },
6107         {
6108                 .name = "memsw.failcnt",
6109                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
6110                 .write = mem_cgroup_reset,
6111                 .read_u64 = mem_cgroup_read_u64,
6112         },
6113         { },    /* terminate */
6114 };
6115
6116 static int __init mem_cgroup_swap_init(void)
6117 {
6118         if (!mem_cgroup_disabled() && really_do_swap_account) {
6119                 do_swap_account = 1;
6120                 WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys,
6121                                                swap_files));
6122                 WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
6123                                                   memsw_cgroup_files));
6124         }
6125         return 0;
6126 }
6127 subsys_initcall(mem_cgroup_swap_init);
6128
6129 #endif /* CONFIG_MEMCG_SWAP */