asedeno.scripts.mit.edu Git - linux.git/blobdiff - kernel/fork.c
Merge tag 'iommu-updates-v5.4' of git://git.kernel.org/pub/scm/linux/kernel/git/joro...
[linux.git] / kernel / fork.c
index 61667909ce837bcaaa378cbdb760938cdd41b46b..0ad65a932936d62ea5dd82f2e48c74bc31b2ffc6 100644 (file)
@@ -677,7 +677,6 @@ void __mmdrop(struct mm_struct *mm)
        WARN_ON_ONCE(mm == current->active_mm);
        mm_free_pgd(mm);
        destroy_context(mm);
-       hmm_mm_destroy(mm);
        mmu_notifier_mm_destroy(mm);
        check_mm(mm);
        put_user_ns(mm->user_ns);
@@ -727,7 +726,7 @@ void __put_task_struct(struct task_struct *tsk)
        WARN_ON(tsk == current);
 
        cgroup_free(tsk);
-       task_numa_free(tsk);
+       task_numa_free(tsk, true);
        security_task_free(tsk);
        exit_creds(tsk);
        delayacct_tsk_free(tsk);
@@ -898,6 +897,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 #ifdef CONFIG_STACKPROTECTOR
        tsk->stack_canary = get_random_canary();
 #endif
+       if (orig->cpus_ptr == &orig->cpus_mask)
+               tsk->cpus_ptr = &tsk->cpus_mask;
 
        /*
         * One for us, one for whoever does the "release_task()" (usually
@@ -1689,6 +1690,14 @@ static inline void rcu_copy_process(struct task_struct *p)
 #endif /* #ifdef CONFIG_TASKS_RCU */
 }
 
+struct pid *pidfd_pid(const struct file *file)
+{
+       if (file->f_op == &pidfd_fops)
+               return file->private_data;
+
+       return ERR_PTR(-EBADF);
+}
+
 static int pidfd_release(struct inode *inode, struct file *file)
 {
        struct pid *pid = file->private_data;
@@ -1709,8 +1718,34 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
 }
 #endif
 
+/*
+ * Poll support for process exit notification.
+ */
+static unsigned int pidfd_poll(struct file *file, struct poll_table_struct *pts)
+{
+       struct task_struct *task;
+       struct pid *pid = file->private_data;
+       int poll_flags = 0;
+
+       poll_wait(file, &pid->wait_pidfd, pts);
+
+       rcu_read_lock();
+       task = pid_task(pid, PIDTYPE_PID);
+       /*
+        * Inform pollers only when the whole thread group exits.
+        * If the thread group leader exits before all other threads in the
+        * group, then poll(2) should block, similar to the wait(2) family.
+        */
+       if (!task || (task->exit_state && thread_group_empty(task)))
+               poll_flags = POLLIN | POLLRDNORM;
+       rcu_read_unlock();
+
+       return poll_flags;
+}
+
 const struct file_operations pidfd_fops = {
        .release = pidfd_release,
+       .poll = pidfd_poll,
 #ifdef CONFIG_PROC_FS
        .show_fdinfo = pidfd_show_fdinfo,
 #endif
@@ -1740,20 +1775,16 @@ static __always_inline void delayed_free_task(struct task_struct *tsk)
  * flags). The actual kick-off is left to the caller.
  */
 static __latent_entropy struct task_struct *copy_process(
-                                       unsigned long clone_flags,
-                                       unsigned long stack_start,
-                                       unsigned long stack_size,
-                                       int __user *parent_tidptr,
-                                       int __user *child_tidptr,
                                        struct pid *pid,
                                        int trace,
-                                       unsigned long tls,
-                                       int node)
+                                       int node,
+                                       struct kernel_clone_args *args)
 {
        int pidfd = -1, retval;
        struct task_struct *p;
        struct multiprocess_signals delayed;
        struct file *pidfile = NULL;
+       u64 clone_flags = args->flags;
 
        /*
         * Don't allow sharing the root directory with processes in a different
@@ -1803,14 +1834,11 @@ static __latent_entropy struct task_struct *copy_process(
 
        if (clone_flags & CLONE_PIDFD) {
                /*
-                * - CLONE_PARENT_SETTID is useless for pidfds and also
-                *   parent_tidptr is used to return pidfds.
                 * - CLONE_DETACHED is blocked so that we can potentially
                 *   reuse it later for CLONE_PIDFD.
                 * - CLONE_THREAD is blocked until someone really needs it.
                 */
-               if (clone_flags &
-                   (CLONE_DETACHED | CLONE_PARENT_SETTID | CLONE_THREAD))
+               if (clone_flags & (CLONE_DETACHED | CLONE_THREAD))
                        return ERR_PTR(-EINVAL);
        }
 
@@ -1843,11 +1871,11 @@ static __latent_entropy struct task_struct *copy_process(
         * p->set_child_tid which is (ab)used as a kthread's data pointer for
         * kernel threads (PF_KTHREAD).
         */
-       p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
+       p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
        /*
         * Clear TID on mm_release()?
         */
-       p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
+       p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL;
 
        ftrace_graph_init_task(p);
 
@@ -1952,9 +1980,6 @@ static __latent_entropy struct task_struct *copy_process(
        p->pagefault_disabled = 0;
 
 #ifdef CONFIG_LOCKDEP
-       p->lockdep_depth = 0; /* no locks held yet */
-       p->curr_chain_key = 0;
-       p->lockdep_recursion = 0;
        lockdep_init_task(p);
 #endif
 
@@ -2006,7 +2031,8 @@ static __latent_entropy struct task_struct *copy_process(
        retval = copy_io(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_namespaces;
-       retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls);
+       retval = copy_thread_tls(clone_flags, args->stack, args->stack_size, p,
+                                args->tls);
        if (retval)
                goto bad_fork_cleanup_io;
 
@@ -2036,11 +2062,12 @@ static __latent_entropy struct task_struct *copy_process(
                                              O_RDWR | O_CLOEXEC);
                if (IS_ERR(pidfile)) {
                        put_unused_fd(pidfd);
+                       retval = PTR_ERR(pidfile);
                        goto bad_fork_free_pid;
                }
                get_pid(pid);   /* held by pidfile now */
 
-               retval = put_user(pidfd, parent_tidptr);
+               retval = put_user(pidfd, args->pidfd);
                if (retval)
                        goto bad_fork_put_pidfd;
        }
@@ -2083,7 +2110,7 @@ static __latent_entropy struct task_struct *copy_process(
                if (clone_flags & CLONE_PARENT)
                        p->exit_signal = current->group_leader->exit_signal;
                else
-                       p->exit_signal = (clone_flags & CSIGNAL);
+                       p->exit_signal = args->exit_signal;
                p->group_leader = p;
                p->tgid = p->pid;
        }
@@ -2116,7 +2143,7 @@ static __latent_entropy struct task_struct *copy_process(
         */
 
        p->start_time = ktime_get_ns();
-       p->real_start_time = ktime_get_boot_ns();
+       p->real_start_time = ktime_get_boottime_ns();
 
        /*
         * Make it visible to the rest of the system, but dont wake it up yet.
@@ -2296,8 +2323,11 @@ static inline void init_idle_pids(struct task_struct *idle)
 struct task_struct *fork_idle(int cpu)
 {
        struct task_struct *task;
-       task = copy_process(CLONE_VM, 0, 0, NULL, NULL, &init_struct_pid, 0, 0,
-                           cpu_to_node(cpu));
+       struct kernel_clone_args args = {
+               .flags = CLONE_VM,
+       };
+
+       task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args);
        if (!IS_ERR(task)) {
                init_idle_pids(task);
                init_idle(task, cpu);
@@ -2316,14 +2346,12 @@ struct mm_struct *copy_init_mm(void)
  *
  * It copies the process, and if successful kick-starts
  * it and waits for it to finish using the VM if required.
+ *
+ * args->exit_signal is expected to be checked for sanity by the caller.
  */
-long _do_fork(unsigned long clone_flags,
-             unsigned long stack_start,
-             unsigned long stack_size,
-             int __user *parent_tidptr,
-             int __user *child_tidptr,
-             unsigned long tls)
+long _do_fork(struct kernel_clone_args *args)
 {
+       u64 clone_flags = args->flags;
        struct completion vfork;
        struct pid *pid;
        struct task_struct *p;
@@ -2339,7 +2367,7 @@ long _do_fork(unsigned long clone_flags,
        if (!(clone_flags & CLONE_UNTRACED)) {
                if (clone_flags & CLONE_VFORK)
                        trace = PTRACE_EVENT_VFORK;
-               else if ((clone_flags & CSIGNAL) != SIGCHLD)
+               else if (args->exit_signal != SIGCHLD)
                        trace = PTRACE_EVENT_CLONE;
                else
                        trace = PTRACE_EVENT_FORK;
@@ -2348,8 +2376,7 @@ long _do_fork(unsigned long clone_flags,
                        trace = 0;
        }
 
-       p = copy_process(clone_flags, stack_start, stack_size, parent_tidptr,
-                        child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
+       p = copy_process(NULL, trace, NUMA_NO_NODE, args);
        add_latent_entropy();
 
        if (IS_ERR(p))
@@ -2365,7 +2392,7 @@ long _do_fork(unsigned long clone_flags,
        nr = pid_vnr(pid);
 
        if (clone_flags & CLONE_PARENT_SETTID)
-               put_user(nr, parent_tidptr);
+               put_user(nr, args->parent_tid);
 
        if (clone_flags & CLONE_VFORK) {
                p->vfork_done = &vfork;
@@ -2388,6 +2415,16 @@ long _do_fork(unsigned long clone_flags,
        return nr;
 }
 
+bool legacy_clone_args_valid(const struct kernel_clone_args *kargs)
+{
+       /* clone(CLONE_PIDFD) uses parent_tidptr to return a pidfd */
+       if ((kargs->flags & CLONE_PIDFD) &&
+           (kargs->flags & CLONE_PARENT_SETTID))
+               return false;
+
+       return true;
+}
+
 #ifndef CONFIG_HAVE_COPY_THREAD_TLS
 /* For compatibility with architectures that call do_fork directly rather than
  * using the syscall entry points below. */
@@ -2397,8 +2434,20 @@ long do_fork(unsigned long clone_flags,
              int __user *parent_tidptr,
              int __user *child_tidptr)
 {
-       return _do_fork(clone_flags, stack_start, stack_size,
-                       parent_tidptr, child_tidptr, 0);
+       struct kernel_clone_args args = {
+               .flags          = (clone_flags & ~CSIGNAL),
+               .pidfd          = parent_tidptr,
+               .child_tid      = child_tidptr,
+               .parent_tid     = parent_tidptr,
+               .exit_signal    = (clone_flags & CSIGNAL),
+               .stack          = stack_start,
+               .stack_size     = stack_size,
+       };
+
+       if (!legacy_clone_args_valid(&args))
+               return -EINVAL;
+
+       return _do_fork(&args);
 }
 #endif
 
@@ -2407,15 +2456,25 @@ long do_fork(unsigned long clone_flags,
  */
 pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
 {
-       return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
-               (unsigned long)arg, NULL, NULL, 0);
+       struct kernel_clone_args args = {
+               .flags          = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL),
+               .exit_signal    = (flags & CSIGNAL),
+               .stack          = (unsigned long)fn,
+               .stack_size     = (unsigned long)arg,
+       };
+
+       return _do_fork(&args);
 }
 
 #ifdef __ARCH_WANT_SYS_FORK
 SYSCALL_DEFINE0(fork)
 {
 #ifdef CONFIG_MMU
-       return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0);
+       struct kernel_clone_args args = {
+               .exit_signal = SIGCHLD,
+       };
+
+       return _do_fork(&args);
 #else
        /* can not support in nommu mode */
        return -EINVAL;
@@ -2426,8 +2485,12 @@ SYSCALL_DEFINE0(fork)
 #ifdef __ARCH_WANT_SYS_VFORK
 SYSCALL_DEFINE0(vfork)
 {
-       return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
-                       0, NULL, NULL, 0);
+       struct kernel_clone_args args = {
+               .flags          = CLONE_VFORK | CLONE_VM,
+               .exit_signal    = SIGCHLD,
+       };
+
+       return _do_fork(&args);
 }
 #endif
 
@@ -2455,7 +2518,119 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
                 unsigned long, tls)
 #endif
 {
-       return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls);
+       struct kernel_clone_args args = {
+               .flags          = (clone_flags & ~CSIGNAL),
+               .pidfd          = parent_tidptr,
+               .child_tid      = child_tidptr,
+               .parent_tid     = parent_tidptr,
+               .exit_signal    = (clone_flags & CSIGNAL),
+               .stack          = newsp,
+               .tls            = tls,
+       };
+
+       if (!legacy_clone_args_valid(&args))
+               return -EINVAL;
+
+       return _do_fork(&args);
+}
+#endif
+
+#ifdef __ARCH_WANT_SYS_CLONE3
+noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
+                                             struct clone_args __user *uargs,
+                                             size_t size)
+{
+       struct clone_args args;
+
+       if (unlikely(size > PAGE_SIZE))
+               return -E2BIG;
+
+       if (unlikely(size < sizeof(struct clone_args)))
+               return -EINVAL;
+
+       if (unlikely(!access_ok(uargs, size)))
+               return -EFAULT;
+
+       if (size > sizeof(struct clone_args)) {
+               unsigned char __user *addr;
+               unsigned char __user *end;
+               unsigned char val;
+
+               addr = (void __user *)uargs + sizeof(struct clone_args);
+               end = (void __user *)uargs + size;
+
+               for (; addr < end; addr++) {
+                       if (get_user(val, addr))
+                               return -EFAULT;
+                       if (val)
+                               return -E2BIG;
+               }
+
+               size = sizeof(struct clone_args);
+       }
+
+       if (copy_from_user(&args, uargs, size))
+               return -EFAULT;
+
+       /*
+        * Verify that higher 32bits of exit_signal are unset and that
+        * it is a valid signal
+        */
+       if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) ||
+                    !valid_signal(args.exit_signal)))
+               return -EINVAL;
+
+       *kargs = (struct kernel_clone_args){
+               .flags          = args.flags,
+               .pidfd          = u64_to_user_ptr(args.pidfd),
+               .child_tid      = u64_to_user_ptr(args.child_tid),
+               .parent_tid     = u64_to_user_ptr(args.parent_tid),
+               .exit_signal    = args.exit_signal,
+               .stack          = args.stack,
+               .stack_size     = args.stack_size,
+               .tls            = args.tls,
+       };
+
+       return 0;
+}
+
+static bool clone3_args_valid(const struct kernel_clone_args *kargs)
+{
+       /*
+        * All lower bits of the flag word are taken.
+        * Verify that no other unknown flags are passed along.
+        */
+       if (kargs->flags & ~CLONE_LEGACY_FLAGS)
+               return false;
+
+       /*
+        * - make the CLONE_DETACHED bit reusable for clone3
+        * - make the CSIGNAL bits reusable for clone3
+        */
+       if (kargs->flags & (CLONE_DETACHED | CSIGNAL))
+               return false;
+
+       if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) &&
+           kargs->exit_signal)
+               return false;
+
+       return true;
+}
+
+SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
+{
+       int err;
+
+       struct kernel_clone_args kargs;
+
+       err = copy_clone_args_from_user(&kargs, uargs, size);
+       if (err)
+               return err;
+
+       if (!clone3_args_valid(&kargs))
+               return -EINVAL;
+
+       return _do_fork(&kargs);
 }
 #endif