diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 2aba1c519138c2f2a61eef51dcd7ce8283cac5e9..d6551cd452380b6c1f398e2410e47d83fb208b53 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -162,6 +162,9 @@ static u16 cgrp_dfl_inhibit_ss_mask;
 /* some controllers are implicitly enabled on the default hierarchy */
 static u16 cgrp_dfl_implicit_ss_mask;
 
+/* some controllers can be threaded on the default hierarchy */
+static u16 cgrp_dfl_threaded_ss_mask;
+
 /* The list of hierarchy roots */
 LIST_HEAD(cgroup_roots);
 static int cgroup_root_count;
@@ -316,13 +319,87 @@ static void cgroup_idr_remove(struct idr *idr, int id)
        spin_unlock_bh(&cgroup_idr_lock);
 }
 
-static struct cgroup *cgroup_parent(struct cgroup *cgrp)
+static bool cgroup_has_tasks(struct cgroup *cgrp)
 {
-       struct cgroup_subsys_state *parent_css = cgrp->self.parent;
+       return cgrp->nr_populated_csets;
+}
 
-       if (parent_css)
-               return container_of(parent_css, struct cgroup, self);
-       return NULL;
+bool cgroup_is_threaded(struct cgroup *cgrp)
+{
+       return cgrp->dom_cgrp != cgrp;
+}
+
+/* can @cgrp host both domain and threaded children? */
+static bool cgroup_is_mixable(struct cgroup *cgrp)
+{
+       /*
+        * Root isn't under domain level resource control, which exempts
+        * it from the no-internal-process constraint, so it can serve as
+        * a thread root and a parent of resource domains at the same time.
+        */
+       return !cgroup_parent(cgrp);
+}
+
+/* can @cgrp become a thread root? should always be true for a thread root */
+static bool cgroup_can_be_thread_root(struct cgroup *cgrp)
+{
+       /* mixables don't care */
+       if (cgroup_is_mixable(cgrp))
+               return true;
+
+       /* domain roots can't be nested under threaded */
+       if (cgroup_is_threaded(cgrp))
+               return false;
+
+       /* can only have either domain or threaded children */
+       if (cgrp->nr_populated_domain_children)
+               return false;
+
+       /* and no domain controllers can be enabled */
+       if (cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
+               return false;
+
+       return true;
+}
+
+/* is @cgrp root of a threaded subtree? */
+bool cgroup_is_thread_root(struct cgroup *cgrp)
+{
+       /* thread root should be a domain */
+       if (cgroup_is_threaded(cgrp))
+               return false;
+
+       /* a domain w/ threaded children is a thread root */
+       if (cgrp->nr_threaded_children)
+               return true;
+
+       /*
+        * A domain which has tasks and explicit threaded controllers
+        * enabled is a thread root.
+        */
+       if (cgroup_has_tasks(cgrp) &&
+           (cgrp->subtree_control & cgrp_dfl_threaded_ss_mask))
+               return true;
+
+       return false;
+}
+
+/* a domain which isn't connected to the root w/o breakage can't be used */
+static bool cgroup_is_valid_domain(struct cgroup *cgrp)
+{
+       /* the cgroup itself can be a thread root */
+       if (cgroup_is_threaded(cgrp))
+               return false;
+
+       /* but the ancestors can't be unless mixable */
+       while ((cgrp = cgroup_parent(cgrp))) {
+               if (!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp))
+                       return false;
+               if (cgroup_is_threaded(cgrp))
+                       return false;
+       }
+
+       return true;
 }
 
 /* subsystems visibly enabled on a cgroup */
@@ -331,8 +408,14 @@ static u16 cgroup_control(struct cgroup *cgrp)
        struct cgroup *parent = cgroup_parent(cgrp);
        u16 root_ss_mask = cgrp->root->subsys_mask;
 
-       if (parent)
-               return parent->subtree_control;
+       if (parent) {
+               u16 ss_mask = parent->subtree_control;
+
+               /* threaded cgroups can only have threaded controllers */
+               if (cgroup_is_threaded(cgrp))
+                       ss_mask &= cgrp_dfl_threaded_ss_mask;
+               return ss_mask;
+       }
 
        if (cgroup_on_dfl(cgrp))
                root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
@@ -345,8 +428,14 @@ static u16 cgroup_ss_mask(struct cgroup *cgrp)
 {
        struct cgroup *parent = cgroup_parent(cgrp);
 
-       if (parent)
-               return parent->subtree_ss_mask;
+       if (parent) {
+               u16 ss_mask = parent->subtree_ss_mask;
+
+               /* threaded cgroups can only have threaded controllers */
+               if (cgroup_is_threaded(cgrp))
+                       ss_mask &= cgrp_dfl_threaded_ss_mask;
+               return ss_mask;
+       }
 
        return cgrp->root->subsys_mask;
 }
@@ -436,22 +525,12 @@ struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
        return css;
 }
 
-static void __maybe_unused cgroup_get(struct cgroup *cgrp)
-{
-       css_get(&cgrp->self);
-}
-
 static void cgroup_get_live(struct cgroup *cgrp)
 {
        WARN_ON_ONCE(cgroup_is_dead(cgrp));
        css_get(&cgrp->self);
 }
 
-static bool cgroup_tryget(struct cgroup *cgrp)
-{
-       return css_tryget(&cgrp->self);
-}
-
 struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
 {
        struct cgroup *cgrp = of->kn->parent->priv;
@@ -560,9 +639,11 @@ EXPORT_SYMBOL_GPL(of_css);
  */
 struct css_set init_css_set = {
        .refcount               = REFCOUNT_INIT(1),
+       .dom_cset               = &init_css_set,
        .tasks                  = LIST_HEAD_INIT(init_css_set.tasks),
        .mg_tasks               = LIST_HEAD_INIT(init_css_set.mg_tasks),
        .task_iters             = LIST_HEAD_INIT(init_css_set.task_iters),
+       .threaded_csets         = LIST_HEAD_INIT(init_css_set.threaded_csets),
        .cgrp_links             = LIST_HEAD_INIT(init_css_set.cgrp_links),
        .mg_preload_node        = LIST_HEAD_INIT(init_css_set.mg_preload_node),
        .mg_node                = LIST_HEAD_INIT(init_css_set.mg_node),
@@ -570,6 +651,11 @@ struct css_set init_css_set = {
 
 static int css_set_count       = 1;    /* 1 for init_css_set */
 
+static bool css_set_threaded(struct css_set *cset)
+{
+       return cset->dom_cset != cset;
+}
+
 /**
  * css_set_populated - does a css_set contain any tasks?
  * @cset: target css_set
@@ -587,39 +673,48 @@ static bool css_set_populated(struct css_set *cset)
 }
 
 /**
- * cgroup_update_populated - updated populated count of a cgroup
+ * cgroup_update_populated - update the populated count of a cgroup
  * @cgrp: the target cgroup
  * @populated: inc or dec populated count
  *
  * One of the css_sets associated with @cgrp is either getting its first
- * task or losing the last.  Update @cgrp->populated_cnt accordingly.  The
- * count is propagated towards root so that a given cgroup's populated_cnt
- * is zero iff the cgroup and all its descendants don't contain any tasks.
+ * task or losing the last.  Update @cgrp->nr_populated_* accordingly.  The
+ * count is propagated towards root so that a given cgroup's
+ * nr_populated_children is zero iff none of its descendants contain any
+ * tasks.
  *
- * @cgrp's interface file "cgroup.populated" is zero if
- * @cgrp->populated_cnt is zero and 1 otherwise.  When @cgrp->populated_cnt
- * changes from or to zero, userland is notified that the content of the
- * interface file has changed.  This can be used to detect when @cgrp and
- * its descendants become populated or empty.
+ * @cgrp's interface file "cgroup.populated" is zero if both
+ * @cgrp->nr_populated_csets and @cgrp->nr_populated_children are zero and
+ * 1 otherwise.  When the sum changes from or to zero, userland is notified
+ * that the content of the interface file has changed.  This can be used to
+ * detect when @cgrp and its descendants become populated or empty.
  */
 static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
 {
+       struct cgroup *child = NULL;
+       int adj = populated ? 1 : -1;
+
        lockdep_assert_held(&css_set_lock);
 
        do {
-               bool trigger;
+               bool was_populated = cgroup_is_populated(cgrp);
 
-               if (populated)
-                       trigger = !cgrp->populated_cnt++;
-               else
-                       trigger = !--cgrp->populated_cnt;
+               if (!child) {
+                       cgrp->nr_populated_csets += adj;
+               } else {
+                       if (cgroup_is_threaded(child))
+                               cgrp->nr_populated_threaded_children += adj;
+                       else
+                               cgrp->nr_populated_domain_children += adj;
+               }
 
-               if (!trigger)
+               if (was_populated == cgroup_is_populated(cgrp))
                        break;
 
                cgroup1_check_for_release(cgrp);
                cgroup_file_notify(&cgrp->events_file);
 
+               child = cgrp;
                cgrp = cgroup_parent(cgrp);
        } while (cgrp);
 }
@@ -630,7 +725,7 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
  * @populated: whether @cset is populated or depopulated
  *
  * @cset is either getting the first task or losing the last.  Update the
- * ->populated_cnt of all associated cgroups accordingly.
+ * populated counters of all associated cgroups accordingly.
  */
 static void css_set_update_populated(struct css_set *cset, bool populated)
 {
@@ -653,7 +748,7 @@ static void css_set_update_populated(struct css_set *cset, bool populated)
  * css_set, @from_cset can be NULL.  If @task is being disassociated
  * instead of moved, @to_cset can be NULL.
  *
- * This function automatically handles populated_cnt updates and
+ * This function automatically handles populated counter updates and
  * css_task_iter adjustments but the caller is responsible for managing
  * @from_cset and @to_cset's reference counts.
  */
@@ -737,6 +832,8 @@ void put_css_set_locked(struct css_set *cset)
        if (!refcount_dec_and_test(&cset->refcount))
                return;
 
+       WARN_ON_ONCE(!list_empty(&cset->threaded_csets));
+
        /* This css_set is dead. unlink it and release cgroup and css refs */
        for_each_subsys(ss, ssid) {
                list_del(&cset->e_cset_node[ssid]);
@@ -753,6 +850,11 @@ void put_css_set_locked(struct css_set *cset)
                kfree(link);
        }
 
+       if (css_set_threaded(cset)) {
+               list_del(&cset->threaded_csets_node);
+               put_css_set_locked(cset->dom_cset);
+       }
+
        kfree_rcu(cset, rcu_head);
 }
 
@@ -771,6 +873,7 @@ static bool compare_css_sets(struct css_set *cset,
                             struct cgroup *new_cgrp,
                             struct cgroup_subsys_state *template[])
 {
+       struct cgroup *new_dfl_cgrp;
        struct list_head *l1, *l2;
 
        /*
@@ -781,6 +884,16 @@ static bool compare_css_sets(struct css_set *cset,
        if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
                return false;
 
+
+       /* @cset's domain should match the default cgroup's */
+       if (cgroup_on_dfl(new_cgrp))
+               new_dfl_cgrp = new_cgrp;
+       else
+               new_dfl_cgrp = old_cset->dfl_cgrp;
+
+       if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp)
+               return false;
+
        /*
         * Compare cgroup pointers in order to distinguish between
         * different cgroups in hierarchies.  As different cgroups may
@@ -988,9 +1101,11 @@ static struct css_set *find_css_set(struct css_set *old_cset,
        }
 
        refcount_set(&cset->refcount, 1);
+       cset->dom_cset = cset;
        INIT_LIST_HEAD(&cset->tasks);
        INIT_LIST_HEAD(&cset->mg_tasks);
        INIT_LIST_HEAD(&cset->task_iters);
+       INIT_LIST_HEAD(&cset->threaded_csets);
        INIT_HLIST_NODE(&cset->hlist);
        INIT_LIST_HEAD(&cset->cgrp_links);
        INIT_LIST_HEAD(&cset->mg_preload_node);
@@ -1028,6 +1143,28 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 
        spin_unlock_irq(&css_set_lock);
 
+       /*
+        * If @cset should be threaded, look up the matching dom_cset and
+        * link them up.  We first fully initialize @cset then look for the
+        * dom_cset.  It's simpler this way and safe as @cset is guaranteed
+        * to stay empty until we return.
+        */
+       if (cgroup_is_threaded(cset->dfl_cgrp)) {
+               struct css_set *dcset;
+
+               dcset = find_css_set(cset, cset->dfl_cgrp->dom_cgrp);
+               if (!dcset) {
+                       put_css_set(cset);
+                       return NULL;
+               }
+
+               spin_lock_irq(&css_set_lock);
+               cset->dom_cset = dcset;
+               list_add_tail(&cset->threaded_csets_node,
+                             &dcset->threaded_csets);
+               spin_unlock_irq(&css_set_lock);
+       }
+
        return cset;
 }
 
@@ -1155,6 +1292,8 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
 
        if (cset == &init_css_set) {
                res = &root->cgrp;
+       } else if (root == &cgrp_dfl_root) {
+               res = cset->dfl_cgrp;
        } else {
                struct cgrp_cset_link *link;
 
@@ -1670,6 +1809,9 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
        mutex_init(&cgrp->pidlist_mutex);
        cgrp->self.cgroup = cgrp;
        cgrp->self.flags |= CSS_ONLINE;
+       cgrp->dom_cgrp = cgrp;
+       cgrp->max_descendants = INT_MAX;
+       cgrp->max_depth = INT_MAX;
 
        for_each_subsys(ss, ssid)
                INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
@@ -2007,6 +2149,8 @@ static void cgroup_migrate_add_task(struct task_struct *task,
        if (!cset->mg_src_cgrp)
                return;
 
+       mgctx->tset.nr_tasks++;
+
        list_move_tail(&task->cg_list, &cset->mg_tasks);
        if (list_empty(&cset->mg_node))
                list_add_tail(&cset->mg_node,
@@ -2095,21 +2239,19 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
        struct css_set *cset, *tmp_cset;
        int ssid, failed_ssid, ret;
 
-       /* methods shouldn't be called if no task is actually migrating */
-       if (list_empty(&tset->src_csets))
-               return 0;
-
        /* check that we can legitimately attach to the cgroup */
-       do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
-               if (ss->can_attach) {
-                       tset->ssid = ssid;
-                       ret = ss->can_attach(tset);
-                       if (ret) {
-                               failed_ssid = ssid;
-                               goto out_cancel_attach;
+       if (tset->nr_tasks) {
+               do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
+                       if (ss->can_attach) {
+                               tset->ssid = ssid;
+                               ret = ss->can_attach(tset);
+                               if (ret) {
+                                       failed_ssid = ssid;
+                                       goto out_cancel_attach;
+                               }
                        }
-               }
-       } while_each_subsys_mask();
+               } while_each_subsys_mask();
+       }
 
        /*
         * Now that we're guaranteed success, proceed to move all tasks to
@@ -2138,25 +2280,29 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
         */
        tset->csets = &tset->dst_csets;
 
-       do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
-               if (ss->attach) {
-                       tset->ssid = ssid;
-                       ss->attach(tset);
-               }
-       } while_each_subsys_mask();
+       if (tset->nr_tasks) {
+               do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
+                       if (ss->attach) {
+                               tset->ssid = ssid;
+                               ss->attach(tset);
+                       }
+               } while_each_subsys_mask();
+       }
 
        ret = 0;
        goto out_release_tset;
 
 out_cancel_attach:
-       do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
-               if (ssid == failed_ssid)
-                       break;
-               if (ss->cancel_attach) {
-                       tset->ssid = ssid;
-                       ss->cancel_attach(tset);
-               }
-       } while_each_subsys_mask();
+       if (tset->nr_tasks) {
+               do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
+                       if (ssid == failed_ssid)
+                               break;
+                       if (ss->cancel_attach) {
+                               tset->ssid = ssid;
+                               ss->cancel_attach(tset);
+                       }
+               } while_each_subsys_mask();
+       }
 out_release_tset:
        spin_lock_irq(&css_set_lock);
        list_splice_init(&tset->dst_csets, &tset->src_csets);
@@ -2169,17 +2315,40 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
 }
 
 /**
- * cgroup_may_migrate_to - verify whether a cgroup can be migration destination
+ * cgroup_migrate_vet_dst - verify whether a cgroup can be migration destination
  * @dst_cgrp: destination cgroup to test
  *
- * On the default hierarchy, except for the root, subtree_control must be
- * zero for migration destination cgroups with tasks so that child cgroups
- * don't compete against tasks.
+ * On the default hierarchy, except for the mixable, (possible) thread root
+ * and threaded cgroups, subtree_control must be zero for migration
+ * destination cgroups with tasks so that child cgroups don't compete
+ * against tasks.
  */
-bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
+int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
 {
-       return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) ||
-               !dst_cgrp->subtree_control;
+       /* v1 doesn't have any restriction */
+       if (!cgroup_on_dfl(dst_cgrp))
+               return 0;
+
+       /* verify @dst_cgrp can host resources */
+       if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp))
+               return -EOPNOTSUPP;
+
+       /* mixables don't care */
+       if (cgroup_is_mixable(dst_cgrp))
+               return 0;
+
+       /*
+        * If @dst_cgrp is already or can become a thread root or is
+        * threaded, it doesn't matter.
+        */
+       if (cgroup_can_be_thread_root(dst_cgrp) || cgroup_is_threaded(dst_cgrp))
+               return 0;
+
+       /* apply no-internal-process constraint */
+       if (dst_cgrp->subtree_control)
+               return -EBUSY;
+
+       return 0;
 }
 
 /**
@@ -2384,8 +2553,9 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
        struct task_struct *task;
        int ret;
 
-       if (!cgroup_may_migrate_to(dst_cgrp))
-               return -EBUSY;
+       ret = cgroup_migrate_vet_dst(dst_cgrp);
+       if (ret)
+               return ret;
 
        /* look up all src csets */
        spin_lock_irq(&css_set_lock);
@@ -2412,96 +2582,23 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
        return ret;
 }
 
-static int cgroup_procs_write_permission(struct task_struct *task,
-                                        struct cgroup *dst_cgrp,
-                                        struct kernfs_open_file *of)
-{
-       struct super_block *sb = of->file->f_path.dentry->d_sb;
-       struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
-       struct cgroup *root_cgrp = ns->root_cset->dfl_cgrp;
-       struct cgroup *src_cgrp, *com_cgrp;
-       struct inode *inode;
-       int ret;
-
-       if (!cgroup_on_dfl(dst_cgrp)) {
-               const struct cred *cred = current_cred();
-               const struct cred *tcred = get_task_cred(task);
-
-               /*
-                * even if we're attaching all tasks in the thread group,
-                * we only need to check permissions on one of them.
-                */
-               if (uid_eq(cred->euid, GLOBAL_ROOT_UID) ||
-                   uid_eq(cred->euid, tcred->uid) ||
-                   uid_eq(cred->euid, tcred->suid))
-                       ret = 0;
-               else
-                       ret = -EACCES;
-
-               put_cred(tcred);
-               return ret;
-       }
-
-       /* find the source cgroup */
-       spin_lock_irq(&css_set_lock);
-       src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
-       spin_unlock_irq(&css_set_lock);
-
-       /* and the common ancestor */
-       com_cgrp = src_cgrp;
-       while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
-               com_cgrp = cgroup_parent(com_cgrp);
-
-       /* %current should be authorized to migrate to the common ancestor */
-       inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
-       if (!inode)
-               return -ENOMEM;
-
-       ret = inode_permission(inode, MAY_WRITE);
-       iput(inode);
-       if (ret)
-               return ret;
-
-       /*
-        * If namespaces are delegation boundaries, %current must be able
-        * to see both source and destination cgroups from its namespace.
-        */
-       if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
-           (!cgroup_is_descendant(src_cgrp, root_cgrp) ||
-            !cgroup_is_descendant(dst_cgrp, root_cgrp)))
-               return -ENOENT;
-
-       return 0;
-}
-
-/*
- * Find the task_struct of the task to attach by vpid and pass it along to the
- * function to attach either it or all tasks in its threadgroup. Will lock
- * cgroup_mutex and threadgroup.
- */
-ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
-                            size_t nbytes, loff_t off, bool threadgroup)
+struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup)
+       __acquires(&cgroup_threadgroup_rwsem)
 {
        struct task_struct *tsk;
-       struct cgroup_subsys *ss;
-       struct cgroup *cgrp;
        pid_t pid;
-       int ssid, ret;
 
        if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
-               return -EINVAL;
-
-       cgrp = cgroup_kn_lock_live(of->kn, false);
-       if (!cgrp)
-               return -ENODEV;
+               return ERR_PTR(-EINVAL);
 
        percpu_down_write(&cgroup_threadgroup_rwsem);
+
        rcu_read_lock();
        if (pid) {
                tsk = find_task_by_vpid(pid);
                if (!tsk) {
-                       ret = -ESRCH;
-                       goto out_unlock_rcu;
+                       tsk = ERR_PTR(-ESRCH);
+                       goto out_unlock_threadgroup;
                }
        } else {
                tsk = current;
@@ -2517,35 +2614,33 @@ ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
         * cgroup with no rt_runtime allocated.  Just say no.
         */
        if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
-               ret = -EINVAL;
-               goto out_unlock_rcu;
+               tsk = ERR_PTR(-EINVAL);
+               goto out_unlock_threadgroup;
        }
 
        get_task_struct(tsk);
+       goto out_unlock_rcu;
+
+out_unlock_threadgroup:
+       percpu_up_write(&cgroup_threadgroup_rwsem);
+out_unlock_rcu:
        rcu_read_unlock();
+       return tsk;
+}
 
-       ret = cgroup_procs_write_permission(tsk, cgrp, of);
-       if (!ret)
-               ret = cgroup_attach_task(cgrp, tsk, threadgroup);
+void cgroup_procs_write_finish(struct task_struct *task)
+       __releases(&cgroup_threadgroup_rwsem)
+{
+       struct cgroup_subsys *ss;
+       int ssid;
 
-       put_task_struct(tsk);
-       goto out_unlock_threadgroup;
+       /* release reference from cgroup_procs_write_start() */
+       put_task_struct(task);
 
-out_unlock_rcu:
-       rcu_read_unlock();
-out_unlock_threadgroup:
        percpu_up_write(&cgroup_threadgroup_rwsem);
        for_each_subsys(ss, ssid)
                if (ss->post_attach)
                        ss->post_attach();
-       cgroup_kn_unlock(of->kn);
-       return ret ?: nbytes;
-}
-
-ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
-                          loff_t off)
-{
-       return __cgroup_procs_write(of, buf, nbytes, off, true);
 }
 
 static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
@@ -2888,6 +2983,46 @@ static void cgroup_finalize_control(struct cgroup *cgrp, int ret)
        cgroup_apply_control_disable(cgrp);
 }
 
+static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable)
+{
+       u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask;
+
+       /* if nothing is getting enabled, nothing to worry about */
+       if (!enable)
+               return 0;
+
+       /* can @cgrp host any resources? */
+       if (!cgroup_is_valid_domain(cgrp->dom_cgrp))
+               return -EOPNOTSUPP;
+
+       /* mixables don't care */
+       if (cgroup_is_mixable(cgrp))
+               return 0;
+
+       if (domain_enable) {
+               /* can't enable domain controllers inside a thread subtree */
+               if (cgroup_is_thread_root(cgrp) || cgroup_is_threaded(cgrp))
+                       return -EOPNOTSUPP;
+       } else {
+               /*
+                * Threaded controllers can handle internal competitions
+                * and are always allowed inside a (prospective) thread
+                * subtree.
+                */
+               if (cgroup_can_be_thread_root(cgrp) || cgroup_is_threaded(cgrp))
+                       return 0;
+       }
+
+       /*
+        * Controllers can't be enabled for a cgroup with tasks to avoid
+        * child cgroups competing against tasks.
+        */
+       if (cgroup_has_tasks(cgrp))
+               return -EBUSY;
+
+       return 0;
+}
+
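The vetting above is what a write to cgroup.subtree_control now goes through. As a rough user-space illustration (the /sys/fs/cgroup/app path and the choice of the memory controller are assumptions for the sketch, not mandated by the patch), enabling a domain controller on a non-root cgroup that still contains processes is rejected with EBUSY, i.e. the no-internal-process constraint:

    /* Sketch only: try to enable a domain controller below a populated cgroup. */
    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/sys/fs/cgroup/app/cgroup.subtree_control", O_WRONLY);

            if (fd < 0)
                    return 1;
            /* fails with EBUSY while "app" still has member processes */
            if (write(fd, "+memory", 7) < 0)
                    fprintf(stderr, "subtree_control: %s\n", strerror(errno));
            close(fd);
            return 0;
    }
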
 /* change the enabled child controllers for a cgroup in the default hierarchy */
 static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
                                            char *buf, size_t nbytes,
@@ -2963,33 +3098,9 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
                goto out_unlock;
        }
 
-       /*
-        * Except for the root, subtree_control must be zero for a cgroup
-        * with tasks so that child cgroups don't compete against tasks.
-        */
-       if (enable && cgroup_parent(cgrp)) {
-               struct cgrp_cset_link *link;
-
-               /*
-                * Because namespaces pin csets too, @cgrp->cset_links
-                * might not be empty even when @cgrp is empty.  Walk and
-                * verify each cset.
-                */
-               spin_lock_irq(&css_set_lock);
-
-               ret = 0;
-               list_for_each_entry(link, &cgrp->cset_links, cset_link) {
-                       if (css_set_populated(link->cset)) {
-                               ret = -EBUSY;
-                               break;
-                       }
-               }
-
-               spin_unlock_irq(&css_set_lock);
-
-               if (ret)
-                       goto out_unlock;
-       }
+       ret = cgroup_vet_subtree_control_enable(cgrp, enable);
+       if (ret)
+               goto out_unlock;
 
        /* save and update control masks and prepare csses */
        cgroup_save_control(cgrp);
@@ -2998,16 +3109,182 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
        cgrp->subtree_control &= ~disable;
 
        ret = cgroup_apply_control(cgrp);
-
        cgroup_finalize_control(cgrp, ret);
+       if (ret)
+               goto out_unlock;
 
        kernfs_activate(cgrp->kn);
-       ret = 0;
 out_unlock:
        cgroup_kn_unlock(of->kn);
        return ret ?: nbytes;
 }
 
+/**
+ * cgroup_enable_threaded - make @cgrp threaded
+ * @cgrp: the target cgroup
+ *
+ * Called when "threaded" is written to the cgroup.type interface file and
+ * tries to make @cgrp threaded and join the parent's resource domain.
+ * This function is never called on the root cgroup as cgroup.type doesn't
+ * exist on it.
+ */
+static int cgroup_enable_threaded(struct cgroup *cgrp)
+{
+       struct cgroup *parent = cgroup_parent(cgrp);
+       struct cgroup *dom_cgrp = parent->dom_cgrp;
+       int ret;
+
+       lockdep_assert_held(&cgroup_mutex);
+
+       /* noop if already threaded */
+       if (cgroup_is_threaded(cgrp))
+               return 0;
+
+       /* we're joining the parent's domain, ensure its validity */
+       if (!cgroup_is_valid_domain(dom_cgrp) ||
+           !cgroup_can_be_thread_root(dom_cgrp))
+               return -EOPNOTSUPP;
+
+       /*
+        * The following shouldn't cause actual migrations and should
+        * always succeed.
+        */
+       cgroup_save_control(cgrp);
+
+       cgrp->dom_cgrp = dom_cgrp;
+       ret = cgroup_apply_control(cgrp);
+       if (!ret)
+               parent->nr_threaded_children++;
+       else
+               cgrp->dom_cgrp = cgrp;
+
+       cgroup_finalize_control(cgrp, ret);
+       return ret;
+}
+
+static int cgroup_type_show(struct seq_file *seq, void *v)
+{
+       struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+       if (cgroup_is_threaded(cgrp))
+               seq_puts(seq, "threaded\n");
+       else if (!cgroup_is_valid_domain(cgrp))
+               seq_puts(seq, "domain invalid\n");
+       else if (cgroup_is_thread_root(cgrp))
+               seq_puts(seq, "domain threaded\n");
+       else
+               seq_puts(seq, "domain\n");
+
+       return 0;
+}
+
+static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf,
+                                size_t nbytes, loff_t off)
+{
+       struct cgroup *cgrp;
+       int ret;
+
+       /* only switching to threaded mode is supported */
+       if (strcmp(strstrip(buf), "threaded"))
+               return -EINVAL;
+
+       cgrp = cgroup_kn_lock_live(of->kn, false);
+       if (!cgrp)
+               return -ENOENT;
+
+       /* threaded can only be enabled */
+       ret = cgroup_enable_threaded(cgrp);
+
+       cgroup_kn_unlock(of->kn);
+       return ret ?: nbytes;
+}
+
+static int cgroup_max_descendants_show(struct seq_file *seq, void *v)
+{
+       struct cgroup *cgrp = seq_css(seq)->cgroup;
+       int descendants = READ_ONCE(cgrp->max_descendants);
+
+       if (descendants == INT_MAX)
+               seq_puts(seq, "max\n");
+       else
+               seq_printf(seq, "%d\n", descendants);
+
+       return 0;
+}
+
+static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of,
+                                          char *buf, size_t nbytes, loff_t off)
+{
+       struct cgroup *cgrp;
+       int descendants;
+       ssize_t ret;
+
+       buf = strstrip(buf);
+       if (!strcmp(buf, "max")) {
+               descendants = INT_MAX;
+       } else {
+               ret = kstrtoint(buf, 0, &descendants);
+               if (ret)
+                       return ret;
+       }
+
+       if (descendants < 0)
+               return -ERANGE;
+
+       cgrp = cgroup_kn_lock_live(of->kn, false);
+       if (!cgrp)
+               return -ENOENT;
+
+       cgrp->max_descendants = descendants;
+
+       cgroup_kn_unlock(of->kn);
+
+       return nbytes;
+}
+
+static int cgroup_max_depth_show(struct seq_file *seq, void *v)
+{
+       struct cgroup *cgrp = seq_css(seq)->cgroup;
+       int depth = READ_ONCE(cgrp->max_depth);
+
+       if (depth == INT_MAX)
+               seq_puts(seq, "max\n");
+       else
+               seq_printf(seq, "%d\n", depth);
+
+       return 0;
+}
+
+static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of,
+                                     char *buf, size_t nbytes, loff_t off)
+{
+       struct cgroup *cgrp;
+       ssize_t ret;
+       int depth;
+
+       buf = strstrip(buf);
+       if (!strcmp(buf, "max")) {
+               depth = INT_MAX;
+       } else {
+               ret = kstrtoint(buf, 0, &depth);
+               if (ret)
+                       return ret;
+       }
+
+       if (depth < 0)
+               return -ERANGE;
+
+       cgrp = cgroup_kn_lock_live(of->kn, false);
+       if (!cgrp)
+               return -ENOENT;
+
+       cgrp->max_depth = depth;
+
+       cgroup_kn_unlock(of->kn);
+
+       return nbytes;
+}
+
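Both knobs above accept a non-negative integer or the string "max". A hedged user-space sketch of their effect (the /sys/fs/cgroup/app path, the chosen limits and the pre-existing a/b hierarchy are illustrative assumptions): once either limit would be exceeded, cgroup_mkdir() further down refuses new children with EAGAIN via cgroup_check_hierarchy_limits():

    /* Sketch only: cap the "app" subtree at 16 descendants and depth 2. */
    #include <errno.h>
    #include <stdio.h>
    #include <sys/stat.h>

    static void set_limit(const char *path, const char *val)
    {
            FILE *f = fopen(path, "w");

            if (f) {
                    fputs(val, f);
                    fclose(f);
            }
    }

    int main(void)
    {
            set_limit("/sys/fs/cgroup/app/cgroup.max.descendants", "16\n");
            set_limit("/sys/fs/cgroup/app/cgroup.max.depth", "2\n");

            /* a third level below "app" exceeds max.depth == 2 */
            if (mkdir("/sys/fs/cgroup/app/a/b/c", 0755) < 0 && errno == EAGAIN)
                    perror("hierarchy limit reached");
            return 0;
    }
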
 static int cgroup_events_show(struct seq_file *seq, void *v)
 {
        seq_printf(seq, "populated %d\n",
@@ -3015,6 +3292,18 @@ static int cgroup_events_show(struct seq_file *seq, void *v)
        return 0;
 }
 
+static int cgroup_stat_show(struct seq_file *seq, void *v)
+{
+       struct cgroup *cgroup = seq_css(seq)->cgroup;
+
+       seq_printf(seq, "nr_descendants %d\n",
+                  cgroup->nr_descendants);
+       seq_printf(seq, "nr_dying_descendants %d\n",
+                  cgroup->nr_dying_descendants);
+
+       return 0;
+}
+
 static int cgroup_file_open(struct kernfs_open_file *of)
 {
        struct cftype *cft = of->kn->priv;
@@ -3231,7 +3520,6 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
 
 static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
 {
-       LIST_HEAD(pending);
        struct cgroup_subsys *ss = cfts[0].ss;
        struct cgroup *root = &ss->root->cgrp;
        struct cgroup_subsys_state *css;
@@ -3656,6 +3944,58 @@ bool css_has_online_children(struct cgroup_subsys_state *css)
        return ret;
 }
 
+static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it)
+{
+       struct list_head *l;
+       struct cgrp_cset_link *link;
+       struct css_set *cset;
+
+       lockdep_assert_held(&css_set_lock);
+
+       /* find the next threaded cset */
+       if (it->tcset_pos) {
+               l = it->tcset_pos->next;
+
+               if (l != it->tcset_head) {
+                       it->tcset_pos = l;
+                       return container_of(l, struct css_set,
+                                           threaded_csets_node);
+               }
+
+               it->tcset_pos = NULL;
+       }
+
+       /* find the next cset */
+       l = it->cset_pos;
+       l = l->next;
+       if (l == it->cset_head) {
+               it->cset_pos = NULL;
+               return NULL;
+       }
+
+       if (it->ss) {
+               cset = container_of(l, struct css_set, e_cset_node[it->ss->id]);
+       } else {
+               link = list_entry(l, struct cgrp_cset_link, cset_link);
+               cset = link->cset;
+       }
+
+       it->cset_pos = l;
+
+       /* initialize threaded css_set walking */
+       if (it->flags & CSS_TASK_ITER_THREADED) {
+               if (it->cur_dcset)
+                       put_css_set_locked(it->cur_dcset);
+               it->cur_dcset = cset;
+               get_css_set(cset);
+
+               it->tcset_head = &cset->threaded_csets;
+               it->tcset_pos = &cset->threaded_csets;
+       }
+
+       return cset;
+}
+
 /**
  * css_task_iter_advance_css_set - advance a task iterator to the next css_set
  * @it: the iterator to advance
@@ -3664,32 +4004,19 @@ bool css_has_online_children(struct cgroup_subsys_state *css)
  */
 static void css_task_iter_advance_css_set(struct css_task_iter *it)
 {
-       struct list_head *l = it->cset_pos;
-       struct cgrp_cset_link *link;
        struct css_set *cset;
 
        lockdep_assert_held(&css_set_lock);
 
        /* Advance to the next non-empty css_set */
        do {
-               l = l->next;
-               if (l == it->cset_head) {
-                       it->cset_pos = NULL;
+               cset = css_task_iter_next_css_set(it);
+               if (!cset) {
                        it->task_pos = NULL;
                        return;
                }
-
-               if (it->ss) {
-                       cset = container_of(l, struct css_set,
-                                           e_cset_node[it->ss->id]);
-               } else {
-                       link = list_entry(l, struct cgrp_cset_link, cset_link);
-                       cset = link->cset;
-               }
        } while (!css_set_populated(cset));
 
-       it->cset_pos = l;
-
        if (!list_empty(&cset->tasks))
                it->task_pos = cset->tasks.next;
        else
@@ -3729,6 +4056,7 @@ static void css_task_iter_advance(struct css_task_iter *it)
        lockdep_assert_held(&css_set_lock);
        WARN_ON_ONCE(!l);
 
+repeat:
        /*
         * Advance iterator to find next entry.  cset->tasks is consumed
         * first and then ->mg_tasks.  After ->mg_tasks, we move onto the
@@ -3743,11 +4071,18 @@ static void css_task_iter_advance(struct css_task_iter *it)
                css_task_iter_advance_css_set(it);
        else
                it->task_pos = l;
+
+       /* if PROCS, skip over tasks which aren't group leaders */
+       if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos &&
+           !thread_group_leader(list_entry(it->task_pos, struct task_struct,
+                                           cg_list)))
+               goto repeat;
 }
 
 /**
  * css_task_iter_start - initiate task iteration
  * @css: the css to walk tasks of
+ * @flags: CSS_TASK_ITER_* flags
  * @it: the task iterator to use
  *
  * Initiate iteration through the tasks of @css.  The caller can call
@@ -3755,7 +4090,7 @@ static void css_task_iter_advance(struct css_task_iter *it)
  * returns NULL.  On completion of iteration, css_task_iter_end() must be
  * called.
  */
-void css_task_iter_start(struct cgroup_subsys_state *css,
+void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
                         struct css_task_iter *it)
 {
        /* no one should try to iterate before mounting cgroups */
@@ -3766,6 +4101,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
        spin_lock_irq(&css_set_lock);
 
        it->ss = css->ss;
+       it->flags = flags;
 
        if (it->ss)
                it->cset_pos = &css->cgroup->e_csets[css->ss->id];
@@ -3823,6 +4159,9 @@ void css_task_iter_end(struct css_task_iter *it)
                spin_unlock_irq(&css_set_lock);
        }
 
+       if (it->cur_dcset)
+               put_css_set(it->cur_dcset);
+
        if (it->cur_task)
                put_task_struct(it->cur_task);
 }
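For orientation, a minimal in-kernel sketch of how a caller would combine the new iterator flags (the helper name is made up and not part of the patch; the usual cgroup headers are assumed): CSS_TASK_ITER_PROCS restricts the walk to thread-group leaders, and CSS_TASK_ITER_THREADED additionally visits the threaded child csets linked onto each domain cset:

    /* Sketch only: report every process in @cgrp's domain, including
     * those whose csets live in threaded child cgroups. */
    static void example_walk_procs(struct cgroup *cgrp)
    {
            struct css_task_iter it;
            struct task_struct *task;

            css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS |
                                             CSS_TASK_ITER_THREADED, &it);
            while ((task = css_task_iter_next(&it)))
                    pr_info("pid %d\n", task_pid_nr(task));
            css_task_iter_end(&it);
    }
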
@@ -3839,16 +4178,12 @@ static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
 {
        struct kernfs_open_file *of = s->private;
        struct css_task_iter *it = of->priv;
-       struct task_struct *task;
 
-       do {
-               task = css_task_iter_next(it);
-       } while (task && !thread_group_leader(task));
-
-       return task;
+       return css_task_iter_next(it);
 }
 
-static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
+static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
+                                 unsigned int iter_flags)
 {
        struct kernfs_open_file *of = s->private;
        struct cgroup *cgrp = seq_css(s)->cgroup;
@@ -3866,23 +4201,168 @@ static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
                if (!it)
                        return ERR_PTR(-ENOMEM);
                of->priv = it;
-               css_task_iter_start(&cgrp->self, it);
+               css_task_iter_start(&cgrp->self, iter_flags, it);
        } else if (!(*pos)++) {
                css_task_iter_end(it);
-               css_task_iter_start(&cgrp->self, it);
+               css_task_iter_start(&cgrp->self, iter_flags, it);
        }
 
        return cgroup_procs_next(s, NULL, NULL);
 }
 
+static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
+{
+       struct cgroup *cgrp = seq_css(s)->cgroup;
+
+       /*
+        * All processes of a threaded subtree belong to the domain cgroup
+        * of the subtree.  Only threads can be distributed across the
+        * subtree.  Reject reads on cgroup.procs in the subtree proper.
+        * They're always empty anyway.
+        */
+       if (cgroup_is_threaded(cgrp))
+               return ERR_PTR(-EOPNOTSUPP);
+
+       return __cgroup_procs_start(s, pos, CSS_TASK_ITER_PROCS |
+                                           CSS_TASK_ITER_THREADED);
+}
+
 static int cgroup_procs_show(struct seq_file *s, void *v)
 {
-       seq_printf(s, "%d\n", task_tgid_vnr(v));
+       seq_printf(s, "%d\n", task_pid_vnr(v));
        return 0;
 }
 
+static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
+                                        struct cgroup *dst_cgrp,
+                                        struct super_block *sb)
+{
+       struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
+       struct cgroup *com_cgrp = src_cgrp;
+       struct inode *inode;
+       int ret;
+
+       lockdep_assert_held(&cgroup_mutex);
+
+       /* find the common ancestor */
+       while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
+               com_cgrp = cgroup_parent(com_cgrp);
+
+       /* %current should be authorized to migrate to the common ancestor */
+       inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
+       if (!inode)
+               return -ENOMEM;
+
+       ret = inode_permission(inode, MAY_WRITE);
+       iput(inode);
+       if (ret)
+               return ret;
+
+       /*
+        * If namespaces are delegation boundaries, %current must be able
+        * to see both source and destination cgroups from its namespace.
+        */
+       if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
+           (!cgroup_is_descendant(src_cgrp, ns->root_cset->dfl_cgrp) ||
+            !cgroup_is_descendant(dst_cgrp, ns->root_cset->dfl_cgrp)))
+               return -ENOENT;
+
+       return 0;
+}
+
+static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
+                                 char *buf, size_t nbytes, loff_t off)
+{
+       struct cgroup *src_cgrp, *dst_cgrp;
+       struct task_struct *task;
+       ssize_t ret;
+
+       dst_cgrp = cgroup_kn_lock_live(of->kn, false);
+       if (!dst_cgrp)
+               return -ENODEV;
+
+       task = cgroup_procs_write_start(buf, true);
+       ret = PTR_ERR_OR_ZERO(task);
+       if (ret)
+               goto out_unlock;
+
+       /* find the source cgroup */
+       spin_lock_irq(&css_set_lock);
+       src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
+       spin_unlock_irq(&css_set_lock);
+
+       ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
+                                           of->file->f_path.dentry->d_sb);
+       if (ret)
+               goto out_finish;
+
+       ret = cgroup_attach_task(dst_cgrp, task, true);
+
+out_finish:
+       cgroup_procs_write_finish(task);
+out_unlock:
+       cgroup_kn_unlock(of->kn);
+
+       return ret ?: nbytes;
+}
+
+static void *cgroup_threads_start(struct seq_file *s, loff_t *pos)
+{
+       return __cgroup_procs_start(s, pos, 0);
+}
+
+static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
+                                   char *buf, size_t nbytes, loff_t off)
+{
+       struct cgroup *src_cgrp, *dst_cgrp;
+       struct task_struct *task;
+       ssize_t ret;
+
+       buf = strstrip(buf);
+
+       dst_cgrp = cgroup_kn_lock_live(of->kn, false);
+       if (!dst_cgrp)
+               return -ENODEV;
+
+       task = cgroup_procs_write_start(buf, false);
+       ret = PTR_ERR_OR_ZERO(task);
+       if (ret)
+               goto out_unlock;
+
+       /* find the source cgroup */
+       spin_lock_irq(&css_set_lock);
+       src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
+       spin_unlock_irq(&css_set_lock);
+
+       /* thread migrations follow the cgroup.procs delegation rule */
+       ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
+                                           of->file->f_path.dentry->d_sb);
+       if (ret)
+               goto out_finish;
+
+       /* and must be contained in the same domain */
+       ret = -EOPNOTSUPP;
+       if (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp)
+               goto out_finish;
+
+       ret = cgroup_attach_task(dst_cgrp, task, false);
+
+out_finish:
+       cgroup_procs_write_finish(task);
+out_unlock:
+       cgroup_kn_unlock(of->kn);
+
+       return ret ?: nbytes;
+}
+
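Taken together with cgroup_type_write() above, the intended user-space flow looks roughly like the following sketch (the paths and the TID are illustrative assumptions): a child cgroup is first switched to threaded mode through cgroup.type, after which individual threads can be placed into it through cgroup.threads while the enclosing process stays in the domain cgroup:

    /* Sketch only: make "worker" threaded and move one thread into it. */
    #include <fcntl.h>
    #include <string.h>
    #include <unistd.h>

    static int write_str(const char *path, const char *val)
    {
            int fd = open(path, O_WRONLY);
            ssize_t ret;

            if (fd < 0)
                    return -1;
            ret = write(fd, val, strlen(val));
            close(fd);
            return ret < 0 ? -1 : 0;
    }

    int main(void)
    {
            /* cgroup_type_write() only accepts the string "threaded" */
            write_str("/sys/fs/cgroup/app/worker/cgroup.type", "threaded");

            /* cgroup_threads_write() takes one TID per write; source and
             * destination must share the same resource domain */
            write_str("/sys/fs/cgroup/app/worker/cgroup.threads", "1234");
            return 0;
    }
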
 /* cgroup core interface files for the default hierarchy */
 static struct cftype cgroup_base_files[] = {
+       {
+               .name = "cgroup.type",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = cgroup_type_show,
+               .write = cgroup_type_write,
+       },
        {
                .name = "cgroup.procs",
                .flags = CFTYPE_NS_DELEGATABLE,
@@ -3893,6 +4373,14 @@ static struct cftype cgroup_base_files[] = {
                .seq_show = cgroup_procs_show,
                .write = cgroup_procs_write,
        },
+       {
+               .name = "cgroup.threads",
+               .release = cgroup_procs_release,
+               .seq_start = cgroup_threads_start,
+               .seq_next = cgroup_procs_next,
+               .seq_show = cgroup_procs_show,
+               .write = cgroup_threads_write,
+       },
        {
                .name = "cgroup.controllers",
                .seq_show = cgroup_controllers_show,
@@ -3909,6 +4397,20 @@ static struct cftype cgroup_base_files[] = {
                .file_offset = offsetof(struct cgroup, events_file),
                .seq_show = cgroup_events_show,
        },
+       {
+               .name = "cgroup.max.descendants",
+               .seq_show = cgroup_max_descendants_show,
+               .write = cgroup_max_descendants_write,
+       },
+       {
+               .name = "cgroup.max.depth",
+               .seq_show = cgroup_max_depth_show,
+               .write = cgroup_max_depth_write,
+       },
+       {
+               .name = "cgroup.stat",
+               .seq_show = cgroup_stat_show,
+       },
        { }     /* terminate */
 };
 
@@ -4008,9 +4510,15 @@ static void css_release_work_fn(struct work_struct *work)
                if (ss->css_released)
                        ss->css_released(css);
        } else {
+               struct cgroup *tcgrp;
+
                /* cgroup release path */
                trace_cgroup_release(cgrp);
 
+               for (tcgrp = cgroup_parent(cgrp); tcgrp;
+                    tcgrp = cgroup_parent(tcgrp))
+                       tcgrp->nr_dying_descendants--;
+
                cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
                cgrp->id = -1;
 
@@ -4097,9 +4605,6 @@ static void offline_css(struct cgroup_subsys_state *css)
        if (!(css->flags & CSS_ONLINE))
                return;
 
-       if (ss->css_reset)
-               ss->css_reset(css);
-
        if (ss->css_offline)
                ss->css_offline(css);
 
@@ -4209,9 +4714,13 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
        cgrp->root = root;
        cgrp->level = level;
 
-       for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp))
+       for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
                cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
 
+               if (tcgrp != cgrp)
+                       tcgrp->nr_descendants++;
+       }
+
        if (notify_on_release(parent))
                set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
 
@@ -4252,6 +4761,29 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
        return ERR_PTR(ret);
 }
 
+static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
+{
+       struct cgroup *cgroup;
+       int ret = false;
+       int level = 1;
+
+       lockdep_assert_held(&cgroup_mutex);
+
+       for (cgroup = parent; cgroup; cgroup = cgroup_parent(cgroup)) {
+               if (cgroup->nr_descendants >= cgroup->max_descendants)
+                       goto fail;
+
+               if (level > cgroup->max_depth)
+                       goto fail;
+
+               level++;
+       }
+
+       ret = true;
+fail:
+       return ret;
+}
+
 int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
 {
        struct cgroup *parent, *cgrp;
@@ -4266,6 +4798,11 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
        if (!parent)
                return -ENODEV;
 
+       if (!cgroup_check_hierarchy_limits(parent)) {
+               ret = -EAGAIN;
+               goto out_unlock;
+       }
+
        cgrp = cgroup_create(parent);
        if (IS_ERR(cgrp)) {
                ret = PTR_ERR(cgrp);
@@ -4417,6 +4954,7 @@ static void kill_css(struct cgroup_subsys_state *css)
 static int cgroup_destroy_locked(struct cgroup *cgrp)
        __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
 {
+       struct cgroup *tcgrp, *parent = cgroup_parent(cgrp);
        struct cgroup_subsys_state *css;
        struct cgrp_cset_link *link;
        int ssid;
@@ -4461,7 +4999,15 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
         */
        kernfs_remove(cgrp->kn);
 
-       cgroup1_check_for_release(cgroup_parent(cgrp));
+       if (parent && cgroup_is_threaded(cgrp))
+               parent->nr_threaded_children--;
+
+       for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) {
+               tcgrp->nr_descendants--;
+               tcgrp->nr_dying_descendants++;
+       }
+
+       cgroup1_check_for_release(parent);
 
        /* put the base reference */
        percpu_ref_kill(&cgrp->self.refcnt);
@@ -4656,11 +5202,17 @@ int __init cgroup_init(void)
 
                cgrp_dfl_root.subsys_mask |= 1 << ss->id;
 
+               /* implicit controllers must be threaded too */
+               WARN_ON(ss->implicit_on_dfl && !ss->threaded);
+
                if (ss->implicit_on_dfl)
                        cgrp_dfl_implicit_ss_mask |= 1 << ss->id;
                else if (!ss->dfl_cftypes)
                        cgrp_dfl_inhibit_ss_mask |= 1 << ss->id;
 
+               if (ss->threaded)
+                       cgrp_dfl_threaded_ss_mask |= 1 << ss->id;
+
                if (ss->dfl_cftypes == ss->legacy_cftypes) {
                        WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
                } else {
@@ -4670,6 +5222,10 @@ int __init cgroup_init(void)
 
                if (ss->bind)
                        ss->bind(init_css_set.subsys[ssid]);
+
+               mutex_lock(&cgroup_mutex);
+               css_populate_dir(init_css_set.subsys[ssid]);
+               mutex_unlock(&cgroup_mutex);
        }
 
        /* init_css_set.subsys[] has been updated, re-hash */