perf/core: Rework memory accounting in perf_mmap()

[linux.git] / kernel / events / core.c
diff --git a/kernel/events/core.c b/kernel/events/core.c

index 4f08b17d642672f9822e3d842f07f2d836af6f9f..2b8265ad7bf5ef37f46770c7b1174ce559b0eff4 100644 (file)
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2239,7 +2239,7 @@ static void __perf_event_disable(struct perf_event *event,
   *
   * If event->ctx is a cloned context, callers must make sure that
   * every task struct that event->ctx->task could possibly point to
- * remains valid.  This condition is satisifed when called through
+ * remains valid.  This condition is satisfied when called through
   * perf_event_for_each_child or perf_event_for_each because they
   * hold the top-level event's child_mutex, so any descendant that
   * goes to exit will block in perf_event_exit_event().
@@ -5668,7 +5668,8 @@ static void perf_mmap_close(struct vm_area_struct *vma)
          * undo the VM accounting.
          */
  
-       atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
+       atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked,
+                       &mmap_user->locked_vm);
         atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm);
         free_uid(mmap_user);
  
@@ -5812,8 +5813,20 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
  
         user_locked = atomic_long_read(&user->locked_vm) + user_extra;
  
-       if (user_locked > user_lock_limit)
+       if (user_locked <= user_lock_limit) {
+               /* charge all to locked_vm */
+       } else if (atomic_long_read(&user->locked_vm) >= user_lock_limit) {
+               /* charge all to pinned_vm */
+               extra = user_extra;
+               user_extra = 0;
+       } else {
+               /*
+                * charge locked_vm until it hits user_lock_limit;
+                * charge the rest to pinned_vm
+                */
                 extra = user_locked - user_lock_limit;
+               user_extra -= extra;
+       }
  
         lock_limit = rlimit(RLIMIT_MEMLOCK);
         lock_limit >>= PAGE_SHIFT;
@@ -6054,7 +6067,7 @@ static void perf_sample_regs_intr(struct perf_regs *regs_intr,
   * Get remaining task size from user stack pointer.
   *
   * It'd be better to take stack vma map and limit this more
- * precisly, but there's no way to get it safely under interrupt,
+ * precisely, but there's no way to get it safely under interrupt,
   * so using TASK_SIZE as limit.
   */
  static u64 perf_ustack_task_size(struct pt_regs *regs)
@@ -6616,7 +6629,7 @@ void perf_prepare_sample(struct perf_event_header *header,
  
         if (sample_type & PERF_SAMPLE_STACK_USER) {
                 /*
-                * Either we need PERF_SAMPLE_STACK_USER bit to be allways
+                * Either we need PERF_SAMPLE_STACK_USER bit to be always
                  * processed as the last one or have additional check added
                  * in case new sample type is added, because we could eat
                  * up the rest of the sample size.
@@ -10586,55 +10599,26 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
         u32 size;
         int ret;
  
-       if (!access_ok(uattr, PERF_ATTR_SIZE_VER0))
-               return -EFAULT;
-
-       /*
-        * zero the full structure, so that a short copy will be nice.
-        */
+       /* Zero the full structure, so that a short copy will be nice. */
         memset(attr, 0, sizeof(*attr));
  
         ret = get_user(size, &uattr->size);
         if (ret)
                 return ret;
  
-       if (size > PAGE_SIZE)   /* silly large */
-               goto err_size;
-
-       if (!size)              /* abi compat */
+       /* ABI compatibility quirk: */
+       if (!size)
                 size = PERF_ATTR_SIZE_VER0;
-
-       if (size < PERF_ATTR_SIZE_VER0)
+       if (size < PERF_ATTR_SIZE_VER0 || size > PAGE_SIZE)
                 goto err_size;
  
-       /*
-        * If we're handed a bigger struct than we know of,
-        * ensure all the unknown bits are 0 - i.e. new
-        * user-space does not rely on any kernel feature
-        * extensions we dont know about yet.
-        */
-       if (size > sizeof(*attr)) {
-               unsigned char __user *addr;
-               unsigned char __user *end;
-               unsigned char val;
-
-               addr = (void __user *)uattr + sizeof(*attr);
-               end  = (void __user *)uattr + size;
-
-               for (; addr < end; addr++) {
-                       ret = get_user(val, addr);
-                       if (ret)
-                               return ret;
-                       if (val)
-                               goto err_size;
-               }
-               size = sizeof(*attr);
+       ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
+       if (ret) {
+               if (ret == -E2BIG)
+                       goto err_size;
+               return ret;
         }
  
-       ret = copy_from_user(attr, uattr, size);
-       if (ret)
-               return -EFAULT;
-
         attr->size = size;
  
         if (attr->__reserved_1)
@@ -10917,6 +10901,13 @@ SYSCALL_DEFINE5(perf_event_open,
             perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
                 return -EACCES;
  
+       err = security_locked_down(LOCKDOWN_PERF);
+       if (err && (attr.sample_type & PERF_SAMPLE_REGS_INTR))
+               /* REGS_INTR can leak data, lockdown must prevent this */
+               return err;
+
+       err = 0;
+
         /*
          * In cgroup mode, the pid argument is used to pass the fd
          * opened to the cgroup directory in cgroupfs. The cpu argument
@@ -11884,6 +11875,10 @@ static int inherit_group(struct perf_event *parent_event,
                                             child, leader, child_ctx);
                 if (IS_ERR(child_ctr))
                         return PTR_ERR(child_ctr);
+
+               if (sub->aux_event == parent_event &&
+                   !perf_get_aux_event(child_ctr, leader))
+                       return -EINVAL;
         }
         return 0;
  }