// SPDX-License-Identifier: GPL-2.0-only
/*
 * Functions to manage eBPF programs attached to cgroups
 *
 * Copyright (c) 2016 Daniel Mack
 */

#include <linux/kernel.h>
#include <linux/atomic.h>
#include <linux/cgroup.h>
#include <linux/filter.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/string.h>
#include <linux/bpf.h>
#include <linux/bpf-cgroup.h>
#include <net/sock.h>
#include <net/bpf_sk_storage.h>

#include "../cgroup/cgroup-internal.h"

DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key);
EXPORT_SYMBOL(cgroup_bpf_enabled_key);
void cgroup_bpf_offline(struct cgroup *cgrp)
{
	cgroup_get(cgrp);
	percpu_ref_kill(&cgrp->bpf.refcnt);
}
/**
 * cgroup_bpf_release() - put references of all bpf programs and
 *                        release all cgroup bpf data
 * @work: work structure embedded into the cgroup to modify
 */
static void cgroup_bpf_release(struct work_struct *work)
{
	struct cgroup *cgrp = container_of(work, struct cgroup,
					   bpf.release_work);
	enum bpf_cgroup_storage_type stype;
	struct bpf_prog_array *old_array;
	unsigned int type;

	mutex_lock(&cgroup_mutex);

	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
		struct list_head *progs = &cgrp->bpf.progs[type];
		struct bpf_prog_list *pl, *tmp;

		list_for_each_entry_safe(pl, tmp, progs, node) {
			list_del(&pl->node);
			bpf_prog_put(pl->prog);
			for_each_cgroup_storage_type(stype) {
				bpf_cgroup_storage_unlink(pl->storage[stype]);
				bpf_cgroup_storage_free(pl->storage[stype]);
			}
			kfree(pl);
			static_branch_dec(&cgroup_bpf_enabled_key);
		}
		old_array = rcu_dereference_protected(
				cgrp->bpf.effective[type],
				lockdep_is_held(&cgroup_mutex));
		bpf_prog_array_free(old_array);
	}

	mutex_unlock(&cgroup_mutex);

	percpu_ref_exit(&cgrp->bpf.refcnt);
	cgroup_put(cgrp);
}
/**
 * cgroup_bpf_release_fn() - callback used to schedule releasing
 *                           of bpf cgroup data
 * @ref: percpu ref counter structure
 */
static void cgroup_bpf_release_fn(struct percpu_ref *ref)
{
	struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt);

	INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release);
	queue_work(system_wq, &cgrp->bpf.release_work);
}
/* count number of elements in the list.
 * it's slow but the list cannot be long
 */
static u32 prog_list_length(struct list_head *head)
{
	struct bpf_prog_list *pl;
	u32 cnt = 0;

	list_for_each_entry(pl, head, node) {
		if (!pl->prog)
			continue;
		cnt++;
	}
	return cnt;
}
/* if parent has non-overridable prog attached,
 * disallow attaching new programs to the descendant cgroup.
 * if parent has overridable or multi-prog, allow attaching
 */
static bool hierarchy_allows_attach(struct cgroup *cgrp,
				    enum bpf_attach_type type)
{
	struct cgroup *p;

	p = cgroup_parent(cgrp);
	if (!p)
		return true;
	do {
		u32 flags = p->bpf.flags[type];
		u32 cnt;

		if (flags & BPF_F_ALLOW_MULTI)
			return true;
		cnt = prog_list_length(&p->bpf.progs[type]);
		WARN_ON_ONCE(cnt > 1);
		if (cnt == 1)
			return !!(flags & BPF_F_ALLOW_OVERRIDE);
		p = cgroup_parent(p);
	} while (p);
	return true;
}
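
/* For example (illustrative, not part of the original file): in a
 * hierarchy where an ancestor attached with no flags, attaching in a
 * descendant is rejected; the other flags relax that:
 *
 *	root (no flags)             -> attach in /child fails (-EPERM)
 *	root (BPF_F_ALLOW_OVERRIDE) -> attach in /child succeeds and the
 *	                               parent program yields to the child's
 *	root (BPF_F_ALLOW_MULTI)    -> attach in /child succeeds and both
 *	                               programs run
 */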
/* compute a chain of effective programs for a given cgroup:
 * start from the list of programs in this cgroup and add
 * all parent programs.
 * Note that parent's F_ALLOW_OVERRIDE-type program is yielding
 * to programs in this cgroup
 */
static int compute_effective_progs(struct cgroup *cgrp,
				   enum bpf_attach_type type,
				   struct bpf_prog_array **array)
{
	enum bpf_cgroup_storage_type stype;
	struct bpf_prog_array *progs;
	struct bpf_prog_list *pl;
	struct cgroup *p = cgrp;
	int cnt = 0;

	/* count number of effective programs by walking parents */
	do {
		if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
			cnt += prog_list_length(&p->bpf.progs[type]);
		p = cgroup_parent(p);
	} while (p);

	progs = bpf_prog_array_alloc(cnt, GFP_KERNEL);
	if (!progs)
		return -ENOMEM;

	/* populate the array with effective progs */
	cnt = 0;
	p = cgrp;
	do {
		if (cnt > 0 && !(p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
			continue;

		list_for_each_entry(pl, &p->bpf.progs[type], node) {
			if (!pl->prog)
				continue;

			progs->items[cnt].prog = pl->prog;
			for_each_cgroup_storage_type(stype)
				progs->items[cnt].cgroup_storage[stype] =
					pl->storage[stype];
			cnt++;
		}
	} while ((p = cgroup_parent(p)));

	*array = progs;
	return 0;
}
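
/* Worked example (illustrative): with BPF_F_ALLOW_MULTI on every level,
 * a cgroup /a/b ends up with
 *
 *	effective[type] = { progs(b)..., progs(a)..., progs(root)... }
 *
 * i.e. descendant programs run before their ancestors'. An ancestor that
 * attached without BPF_F_ALLOW_MULTI only contributes while cnt == 0,
 * which is what makes an F_ALLOW_OVERRIDE parent yield to this cgroup.
 */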
static void activate_effective_progs(struct cgroup *cgrp,
				     enum bpf_attach_type type,
				     struct bpf_prog_array *old_array)
{
	old_array = rcu_replace_pointer(cgrp->bpf.effective[type], old_array,
					lockdep_is_held(&cgroup_mutex));
	/* free prog array after grace period, since __cgroup_bpf_run_*()
	 * might be still walking the array
	 */
	bpf_prog_array_free(old_array);
}
/**
 * cgroup_bpf_inherit() - inherit effective programs from parent
 * @cgrp: the cgroup to modify
 */
int cgroup_bpf_inherit(struct cgroup *cgrp)
{
/* has to use macro instead of const int, since compiler thinks
 * that array below is variable length
 */
#define	NR ARRAY_SIZE(cgrp->bpf.effective)
	struct bpf_prog_array *arrays[NR] = {};
	int ret, i;

	ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0,
			      GFP_KERNEL);
	if (ret)
		return ret;

	for (i = 0; i < NR; i++)
		INIT_LIST_HEAD(&cgrp->bpf.progs[i]);

	for (i = 0; i < NR; i++)
		if (compute_effective_progs(cgrp, i, &arrays[i]))
			goto cleanup;

	for (i = 0; i < NR; i++)
		activate_effective_progs(cgrp, i, arrays[i]);

	return 0;
cleanup:
	for (i = 0; i < NR; i++)
		bpf_prog_array_free(arrays[i]);

	percpu_ref_exit(&cgrp->bpf.refcnt);

	return -ENOMEM;
}
static int update_effective_progs(struct cgroup *cgrp,
				  enum bpf_attach_type type)
{
	struct cgroup_subsys_state *css;
	int err;

	/* allocate and recompute effective prog arrays */
	css_for_each_descendant_pre(css, &cgrp->self) {
		struct cgroup *desc = container_of(css, struct cgroup, self);

		if (percpu_ref_is_zero(&desc->bpf.refcnt))
			continue;

		err = compute_effective_progs(desc, type, &desc->bpf.inactive);
		if (err)
			goto cleanup;
	}

	/* all allocations were successful. Activate all prog arrays */
	css_for_each_descendant_pre(css, &cgrp->self) {
		struct cgroup *desc = container_of(css, struct cgroup, self);

		if (percpu_ref_is_zero(&desc->bpf.refcnt)) {
			if (unlikely(desc->bpf.inactive)) {
				bpf_prog_array_free(desc->bpf.inactive);
				desc->bpf.inactive = NULL;
			}
			continue;
		}

		activate_effective_progs(desc, type, desc->bpf.inactive);
		desc->bpf.inactive = NULL;
	}

	return 0;

cleanup:
	/* oom while computing effective. Free all computed effective arrays
	 * since they were not activated
	 */
	css_for_each_descendant_pre(css, &cgrp->self) {
		struct cgroup *desc = container_of(css, struct cgroup, self);

		bpf_prog_array_free(desc->bpf.inactive);
		desc->bpf.inactive = NULL;
	}

	return err;
}
#define BPF_CGROUP_MAX_PROGS 64

/**
 * __cgroup_bpf_attach() - Attach the program to a cgroup, and
 *                         propagate the change to descendants
 * @cgrp: The cgroup whose descendants to traverse
 * @prog: A program to attach
 * @type: Type of attach operation
 * @flags: Option flags
 *
 * Must be called with cgroup_mutex held.
 */
int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
			enum bpf_attach_type type, u32 flags)
{
	struct list_head *progs = &cgrp->bpf.progs[type];
	struct bpf_prog *old_prog = NULL;
	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE],
		*old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {NULL};
	struct bpf_prog_list *pl, *replace_pl = NULL;
	enum bpf_cgroup_storage_type stype;
	int err;

	if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI))
		/* invalid combination */
		return -EINVAL;

	if (!hierarchy_allows_attach(cgrp, type))
		return -EPERM;

	if (!list_empty(progs) && cgrp->bpf.flags[type] != flags)
		/* Disallow attaching non-overridable on top
		 * of existing overridable in this cgroup.
		 * Disallow attaching multi-prog if overridable or none
		 */
		return -EPERM;

	if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
		return -E2BIG;

	if (flags & BPF_F_ALLOW_MULTI) {
		list_for_each_entry(pl, progs, node) {
			if (pl->prog == prog)
				/* disallow attaching the same prog twice */
				return -EINVAL;
		}
	} else if (!list_empty(progs)) {
		replace_pl = list_first_entry(progs, typeof(*pl), node);
	}

	for_each_cgroup_storage_type(stype) {
		storage[stype] = bpf_cgroup_storage_alloc(prog, stype);
		if (IS_ERR(storage[stype])) {
			storage[stype] = NULL;
			for_each_cgroup_storage_type(stype)
				bpf_cgroup_storage_free(storage[stype]);
			return -ENOMEM;
		}
	}

	if (replace_pl) {
		pl = replace_pl;
		old_prog = pl->prog;
		for_each_cgroup_storage_type(stype) {
			old_storage[stype] = pl->storage[stype];
			bpf_cgroup_storage_unlink(old_storage[stype]);
		}
	} else {
		pl = kmalloc(sizeof(*pl), GFP_KERNEL);
		if (!pl) {
			for_each_cgroup_storage_type(stype)
				bpf_cgroup_storage_free(storage[stype]);
			return -ENOMEM;
		}
		list_add_tail(&pl->node, progs);
	}

	pl->prog = prog;
	for_each_cgroup_storage_type(stype)
		pl->storage[stype] = storage[stype];

	cgrp->bpf.flags[type] = flags;

	err = update_effective_progs(cgrp, type);
	if (err)
		goto cleanup;

	static_branch_inc(&cgroup_bpf_enabled_key);
	for_each_cgroup_storage_type(stype) {
		if (!old_storage[stype])
			continue;
		bpf_cgroup_storage_free(old_storage[stype]);
	}
	if (old_prog) {
		bpf_prog_put(old_prog);
		static_branch_dec(&cgroup_bpf_enabled_key);
	}
	for_each_cgroup_storage_type(stype)
		bpf_cgroup_storage_link(storage[stype], cgrp, type);
	return 0;

cleanup:
	/* and cleanup the prog list */
	pl->prog = old_prog;
	for_each_cgroup_storage_type(stype) {
		bpf_cgroup_storage_free(pl->storage[stype]);
		pl->storage[stype] = old_storage[stype];
		bpf_cgroup_storage_link(old_storage[stype], cgrp, type);
	}
	if (!replace_pl) {
		list_del(&pl->node);
		kfree(pl);
	}
	return err;
}
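
/* Illustrative userspace counterpart (a minimal sketch; the fd values and
 * error handling are assumed, not part of this file):
 *
 *	union bpf_attr attr = {};
 *
 *	attr.target_fd	   = cgroup_fd;	// fd of the cgroup directory
 *	attr.attach_bpf_fd = prog_fd;	// fd of a loaded cgroup program
 *	attr.attach_type   = BPF_CGROUP_INET_EGRESS;
 *	attr.attach_flags  = BPF_F_ALLOW_MULTI;
 *	syscall(__NR_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr));
 *
 * which reaches __cgroup_bpf_attach() above via cgroup_bpf_prog_attach().
 */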
/**
 * __cgroup_bpf_detach() - Detach the program from a cgroup, and
 *                         propagate the change to descendants
 * @cgrp: The cgroup whose descendants to traverse
 * @prog: A program to detach or NULL
 * @type: Type of detach operation
 *
 * Must be called with cgroup_mutex held.
 */
int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
			enum bpf_attach_type type)
{
	struct list_head *progs = &cgrp->bpf.progs[type];
	enum bpf_cgroup_storage_type stype;
	u32 flags = cgrp->bpf.flags[type];
	struct bpf_prog *old_prog = NULL;
	struct bpf_prog_list *pl;
	int err;

	if (flags & BPF_F_ALLOW_MULTI) {
		if (!prog)
			/* to detach MULTI prog the user has to specify valid FD
			 * of the program to be detached
			 */
			return -EINVAL;
	} else {
		if (list_empty(progs))
			/* report error when trying to detach and nothing is attached */
			return -ENOENT;
	}

	if (flags & BPF_F_ALLOW_MULTI) {
		/* find the prog and detach it */
		list_for_each_entry(pl, progs, node) {
			if (pl->prog != prog)
				continue;
			old_prog = prog;
			/* mark it deleted, so it's ignored while
			 * recomputing effective
			 */
			pl->prog = NULL;
			break;
		}
		if (!old_prog)
			return -ENOENT;
	} else {
		/* to maintain backward compatibility NONE and OVERRIDE cgroups
		 * allow detaching with invalid FD (prog==NULL)
		 */
		pl = list_first_entry(progs, typeof(*pl), node);
		old_prog = pl->prog;
		pl->prog = NULL;
	}

	err = update_effective_progs(cgrp, type);
	if (err)
		goto cleanup;

	/* now can actually delete it from this cgroup list */
	list_del(&pl->node);
	for_each_cgroup_storage_type(stype) {
		bpf_cgroup_storage_unlink(pl->storage[stype]);
		bpf_cgroup_storage_free(pl->storage[stype]);
	}
	kfree(pl);
	if (list_empty(progs))
		/* last program was detached, reset flags to zero */
		cgrp->bpf.flags[type] = 0;

	bpf_prog_put(old_prog);
	static_branch_dec(&cgroup_bpf_enabled_key);
	return 0;

cleanup:
	/* and restore back old_prog */
	pl->prog = old_prog;
	return err;
}
/* Must be called with cgroup_mutex held to avoid races. */
int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
		       union bpf_attr __user *uattr)
{
	__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
	enum bpf_attach_type type = attr->query.attach_type;
	struct list_head *progs = &cgrp->bpf.progs[type];
	u32 flags = cgrp->bpf.flags[type];
	struct bpf_prog_array *effective;
	int cnt, ret = 0, i;

	effective = rcu_dereference_protected(cgrp->bpf.effective[type],
					      lockdep_is_held(&cgroup_mutex));

	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
		cnt = bpf_prog_array_length(effective);
	else
		cnt = prog_list_length(progs);

	if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
		return -EFAULT;
	if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt)))
		return -EFAULT;
	if (attr->query.prog_cnt == 0 || !prog_ids || !cnt)
		/* return early if user requested only program count + flags */
		return 0;
	if (attr->query.prog_cnt < cnt) {
		cnt = attr->query.prog_cnt;
		ret = -ENOSPC;
	}

	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
		return bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
	} else {
		struct bpf_prog_list *pl;
		u32 id;

		i = 0;
		list_for_each_entry(pl, progs, node) {
			id = pl->prog->aux->id;
			if (copy_to_user(prog_ids + i, &id, sizeof(id)))
				return -EFAULT;
			if (++i == cnt)
				break;
		}
	}
	return ret;
}
int cgroup_bpf_prog_attach(const union bpf_attr *attr,
			   enum bpf_prog_type ptype, struct bpf_prog *prog)
{
	struct cgroup *cgrp;
	int ret;

	cgrp = cgroup_get_from_fd(attr->target_fd);
	if (IS_ERR(cgrp))
		return PTR_ERR(cgrp);

	ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type,
				attr->attach_flags);
	cgroup_put(cgrp);
	return ret;
}

int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
{
	struct bpf_prog *prog;
	struct cgroup *cgrp;
	int ret;

	cgrp = cgroup_get_from_fd(attr->target_fd);
	if (IS_ERR(cgrp))
		return PTR_ERR(cgrp);

	prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
	if (IS_ERR(prog))
		prog = NULL;

	ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0);
	if (prog)
		bpf_prog_put(prog);

	cgroup_put(cgrp);
	return ret;
}

int cgroup_bpf_prog_query(const union bpf_attr *attr,
			  union bpf_attr __user *uattr)
{
	struct cgroup *cgrp;
	int ret;

	cgrp = cgroup_get_from_fd(attr->query.target_fd);
	if (IS_ERR(cgrp))
		return PTR_ERR(cgrp);

	ret = cgroup_bpf_query(cgrp, attr, uattr);

	cgroup_put(cgrp);
	return ret;
}
/**
 * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
 * @sk: The socket sending or receiving traffic
 * @skb: The skb that is being sent or received
 * @type: The type of program to be executed
 *
 * If no socket is passed, or the socket is not of type INET or INET6,
 * this function does nothing and returns 0.
 *
 * The program type passed in via @type must be suitable for network
 * filtering. No further check is performed to assert that.
 *
 * For egress packets, this function can return:
 *   NET_XMIT_SUCCESS    (0)	- continue with packet output
 *   NET_XMIT_DROP       (1)	- drop packet and notify TCP to call cwr
 *   NET_XMIT_CN         (2)	- continue with packet output and notify TCP
 *				  to call cwr
 *   -EPERM			- drop packet
 *
 * For ingress packets, this function will return -EPERM if any
 * attached program was found and if it returned != 1 during execution.
 * Otherwise 0 is returned.
 */
int __cgroup_bpf_run_filter_skb(struct sock *sk,
				struct sk_buff *skb,
				enum bpf_attach_type type)
{
	unsigned int offset = skb->data - skb_network_header(skb);
	struct sock *save_sk;
	void *saved_data_end;
	struct cgroup *cgrp;
	int ret;

	if (!sk || !sk_fullsock(sk))
		return 0;

	if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
		return 0;

	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
	save_sk = skb->sk;
	skb->sk = sk;
	__skb_push(skb, offset);

	/* compute pointers for the bpf prog */
	bpf_compute_and_save_data_end(skb, &saved_data_end);

	if (type == BPF_CGROUP_INET_EGRESS) {
		ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(
			cgrp->bpf.effective[type], skb, __bpf_prog_run_save_cb);
	} else {
		ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
					 __bpf_prog_run_save_cb);
		ret = (ret == 1 ? 0 : -EPERM);
	}
	bpf_restore_data_end(skb, saved_data_end);
	__skb_pull(skb, offset);
	skb->sk = save_sk;

	return ret;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
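
/* For example: on egress a program returning 1 lets the packet through
 * (NET_XMIT_SUCCESS) while a return of 0 maps to -EPERM and drops it; the
 * NET_XMIT_DROP/NET_XMIT_CN cases are derived inside
 * BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY from the congestion-notification
 * bit of the program's return code. On ingress only the simpler
 * "1 -> 0, otherwise -EPERM" mapping computed above applies.
 */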
/**
 * __cgroup_bpf_run_filter_sk() - Run a program on a sock
 * @sk: sock structure to manipulate
 * @type: The type of program to be executed
 *
 * The socket passed is expected to be of type INET or INET6.
 *
 * The program type passed in via @type must be suitable for sock
 * filtering. No further check is performed to assert that.
 *
 * This function will return %-EPERM if any attached program was found
 * and if it returned != 1 during execution. In all other cases, 0 is returned.
 */
int __cgroup_bpf_run_filter_sk(struct sock *sk,
			       enum bpf_attach_type type)
{
	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
	int ret;

	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, BPF_PROG_RUN);
	return ret == 1 ? 0 : -EPERM;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
/**
 * __cgroup_bpf_run_filter_sock_addr() - Run a program on a sock and the
 *                                       sockaddr provided by user space
 * @sk: sock struct that will use sockaddr
 * @uaddr: sockaddr struct provided by user
 * @type: The type of program to be executed
 * @t_ctx: Pointer to attach type specific context
 *
 * The socket is expected to be of type INET or INET6.
 *
 * This function will return %-EPERM if an attached program is found and
 * returned value != 1 during execution. In all other cases, 0 is returned.
 */
int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
				      struct sockaddr *uaddr,
				      enum bpf_attach_type type,
				      void *t_ctx)
{
	struct bpf_sock_addr_kern ctx = {
		.sk = sk,
		.uaddr = uaddr,
		.t_ctx = t_ctx,
	};
	struct sockaddr_storage unspec;
	struct cgroup *cgrp;
	int ret;

	/* Check socket family since not all sockets represent network
	 * endpoint (e.g. AF_UNIX).
	 */
	if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
		return 0;

	if (!ctx.uaddr) {
		memset(&unspec, 0, sizeof(unspec));
		ctx.uaddr = (struct sockaddr *)&unspec;
	}

	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN);

	return ret == 1 ? 0 : -EPERM;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
/**
 * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock
 * @sk: socket to get cgroup from
 * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
 * sk with connection information (IP addresses, etc.) May not contain
 * cgroup info if it is a req sock.
 * @type: The type of program to be executed
 *
 * The socket passed is expected to be of type INET or INET6.
 *
 * The program type passed in via @type must be suitable for sock_ops
 * filtering. No further check is performed to assert that.
 *
 * This function will return %-EPERM if any attached program was found
 * and if it returned != 1 during execution. In all other cases, 0 is returned.
 */
int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
				     struct bpf_sock_ops_kern *sock_ops,
				     enum bpf_attach_type type)
{
	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
	int ret;

	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops,
				 BPF_PROG_RUN);
	return ret == 1 ? 0 : -EPERM;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
				      short access, enum bpf_attach_type type)
{
	struct cgroup *cgrp;
	struct bpf_cgroup_dev_ctx ctx = {
		.access_type = (access << 16) | dev_type,
		.major = major,
		.minor = minor,
	};
	int allow = 1;

	rcu_read_lock();
	cgrp = task_dfl_cgroup(current);
	allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx,
				   BPF_PROG_RUN);
	rcu_read_unlock();

	return !allow;
}
EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission);
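
/* The access_type field packs the access mask into the upper 16 bits and
 * the device type into the lower 16, mirroring the encoding above, so a
 * BPF_CGROUP_DEVICE program typically unpacks it like this (sketch of the
 * program side, not part of this file):
 *
 *	short type = ctx->access_type & 0xFFFF;
 *	short access = ctx->access_type >> 16;
 */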
static const struct bpf_func_proto *
cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
	switch (func_id) {
	case BPF_FUNC_map_lookup_elem:
		return &bpf_map_lookup_elem_proto;
	case BPF_FUNC_map_update_elem:
		return &bpf_map_update_elem_proto;
	case BPF_FUNC_map_delete_elem:
		return &bpf_map_delete_elem_proto;
	case BPF_FUNC_map_push_elem:
		return &bpf_map_push_elem_proto;
	case BPF_FUNC_map_pop_elem:
		return &bpf_map_pop_elem_proto;
	case BPF_FUNC_map_peek_elem:
		return &bpf_map_peek_elem_proto;
	case BPF_FUNC_get_current_uid_gid:
		return &bpf_get_current_uid_gid_proto;
	case BPF_FUNC_get_local_storage:
		return &bpf_get_local_storage_proto;
	case BPF_FUNC_get_current_cgroup_id:
		return &bpf_get_current_cgroup_id_proto;
	case BPF_FUNC_trace_printk:
		if (capable(CAP_SYS_ADMIN))
			return bpf_get_trace_printk_proto();
		/* fall through */
	default:
		return NULL;
	}
}
static const struct bpf_func_proto *
cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
	return cgroup_base_func_proto(func_id, prog);
}

static bool cgroup_dev_is_valid_access(int off, int size,
				       enum bpf_access_type type,
				       const struct bpf_prog *prog,
				       struct bpf_insn_access_aux *info)
{
	const int size_default = sizeof(__u32);

	if (type == BPF_WRITE)
		return false;

	if (off < 0 || off + size > sizeof(struct bpf_cgroup_dev_ctx))
		return false;
	/* The verifier guarantees that size > 0. */
	if (off % size != 0)
		return false;

	switch (off) {
	case bpf_ctx_range(struct bpf_cgroup_dev_ctx, access_type):
		bpf_ctx_record_field_size(info, size_default);
		if (!bpf_ctx_narrow_access_ok(off, size, size_default))
			return false;
		break;
	default:
		if (size != size_default)
			return false;
	}

	return true;
}
const struct bpf_prog_ops cg_dev_prog_ops = {
};

const struct bpf_verifier_ops cg_dev_verifier_ops = {
	.get_func_proto		= cgroup_dev_func_proto,
	.is_valid_access	= cgroup_dev_is_valid_access,
};
/**
 * __cgroup_bpf_run_filter_sysctl - Run a program on sysctl
 *
 * @head: sysctl table header
 * @table: sysctl table
 * @write: sysctl is being read (= 0) or written (= 1)
 * @buf: pointer to buffer passed by user space
 * @pcount: value-result argument: value is size of buffer pointed to by @buf,
 *	result is size of @new_buf if program set new value, initial value
 *	otherwise
 * @ppos: value-result argument: value is position at which read from or write
 *	to sysctl is happening, result is new position if program overrode it,
 *	initial value otherwise
 * @new_buf: pointer to pointer to new buffer that will be allocated if program
 *	overrides new value provided by user space on sysctl write
 *	NOTE: it's the caller's responsibility to free *new_buf if it was set
 * @type: type of program to be executed
 *
 * Program is run when sysctl is being accessed, either read or written, and
 * can allow or deny such access.
 *
 * This function will return %-EPERM if an attached program is found and
 * returned value != 1 during execution. In all other cases 0 is returned.
 */
int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
				   struct ctl_table *table, int write,
				   void __user *buf, size_t *pcount,
				   loff_t *ppos, void **new_buf,
				   enum bpf_attach_type type)
{
	struct bpf_sysctl_kern ctx = {
		.head = head,
		.table = table,
		.write = write,
		.ppos = ppos,
		.cur_val = NULL,
		.cur_len = PAGE_SIZE,
		.new_val = NULL,
		.new_len = 0,
		.new_updated = 0,
	};
	struct cgroup *cgrp;
	int ret;

	ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL);
	if (ctx.cur_val) {
		mm_segment_t old_fs;
		loff_t pos = 0;

		old_fs = get_fs();
		set_fs(KERNEL_DS);
		if (table->proc_handler(table, 0, (void __user *)ctx.cur_val,
					&ctx.cur_len, &pos)) {
			/* Let BPF program decide how to proceed. */
			ctx.cur_len = 0;
		}
		set_fs(old_fs);
	} else {
		/* Let BPF program decide how to proceed. */
		ctx.cur_len = 0;
	}

	if (write && buf && *pcount) {
		/* BPF program should be able to override new value with a
		 * buffer bigger than provided by user.
		 */
		ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL);
		ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount);
		if (!ctx.new_val ||
		    copy_from_user(ctx.new_val, buf, ctx.new_len))
			/* Let BPF program decide how to proceed. */
			ctx.new_len = 0;
	}

	rcu_read_lock();
	cgrp = task_dfl_cgroup(current);
	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN);
	rcu_read_unlock();

	kfree(ctx.cur_val);

	if (ret == 1 && ctx.new_updated) {
		*new_buf = ctx.new_val;
		*pcount = ctx.new_len;
	} else {
		kfree(ctx.new_val);
	}

	return ret == 1 ? 0 : -EPERM;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl);
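
/* A minimal BPF_CGROUP_SYSCTL program (illustrative sketch of the program
 * side; the section and function names are assumptions, not part of this
 * file):
 *
 *	SEC("cgroup/sysctl")
 *	int sysctl_guard(struct bpf_sysctl *ctx)
 *	{
 *		if (ctx->write)
 *			return 0;	// deny writes; caller sees -EPERM
 *		return 1;		// allow reads
 *	}
 */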
static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
					     enum bpf_attach_type attach_type)
{
	struct bpf_prog_array *prog_array;
	bool empty;

	rcu_read_lock();
	prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]);
	empty = bpf_prog_array_is_empty(prog_array);
	rcu_read_unlock();

	return empty;
}
static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
{
	if (unlikely(max_optlen > PAGE_SIZE) || max_optlen < 0)
		return -EINVAL;

	ctx->optval = kzalloc(max_optlen, GFP_USER);
	if (!ctx->optval)
		return -ENOMEM;

	ctx->optval_end = ctx->optval + max_optlen;

	return 0;
}

static void sockopt_free_buf(struct bpf_sockopt_kern *ctx)
{
	kfree(ctx->optval);
}
int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
				       int *optname, char __user *optval,
				       int *optlen, char **kernel_optval)
{
	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
	struct bpf_sockopt_kern ctx = {
		.sk = sk,
		.level = *level,
		.optname = *optname,
	};
	int ret, max_optlen;

	/* Opportunistic check to see whether we have any BPF program
	 * attached to the hook so we don't waste time allocating
	 * memory and locking the socket.
	 */
	if (!cgroup_bpf_enabled ||
	    __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT))
		return 0;

	/* Allocate a bit more than the initial user buffer for
	 * BPF program. The canonical use case is overriding
	 * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
	 */
	max_optlen = max_t(int, 16, *optlen);

	ret = sockopt_alloc_buf(&ctx, max_optlen);
	if (ret)
		return ret;

	ctx.optlen = *optlen;

	if (copy_from_user(ctx.optval, optval, *optlen) != 0) {
		ret = -EFAULT;
		goto out;
	}

	lock_sock(sk);
	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT],
				 &ctx, BPF_PROG_RUN);
	release_sock(sk);

	if (!ret) {
		ret = -EPERM;
		goto out;
	}

	if (ctx.optlen == -1) {
		/* optlen set to -1, bypass kernel */
		ret = 1;
	} else if (ctx.optlen > max_optlen || ctx.optlen < -1) {
		/* optlen is out of bounds */
		ret = -EFAULT;
	} else {
		/* optlen within bounds, run kernel handler */
		ret = 0;

		/* export any potential modifications */
		*level = ctx.level;
		*optname = ctx.optname;
		*optlen = ctx.optlen;
		*kernel_optval = ctx.optval;
	}

out:
	if (ret)
		sockopt_free_buf(&ctx);
	return ret;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_setsockopt);
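
/* For instance, a BPF_CGROUP_SETSOCKOPT program implementing the canonical
 * use case above would memcpy "cubic" over the "nv" string in ctx->optval
 * and set ctx->optlen to the new length; setting ctx->optlen to -1 instead
 * makes the kernel skip its own setsockopt handler entirely.
 */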
int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
				       int optname, char __user *optval,
				       int __user *optlen, int max_optlen,
				       int retval)
{
	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
	struct bpf_sockopt_kern ctx = {
		.sk = sk,
		.level = level,
		.optname = optname,
		.retval = retval,
	};
	int ret;

	/* Opportunistic check to see whether we have any BPF program
	 * attached to the hook so we don't waste time allocating
	 * memory and locking the socket.
	 */
	if (!cgroup_bpf_enabled ||
	    __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT))
		return retval;

	ret = sockopt_alloc_buf(&ctx, max_optlen);
	if (ret)
		return ret;

	ctx.optlen = max_optlen;

	if (!retval) {
		/* If kernel getsockopt finished successfully,
		 * copy whatever was returned to the user back
		 * into our temporary buffer. Set optlen to the
		 * one that kernel returned as well to let
		 * BPF programs inspect the value.
		 */

		if (get_user(ctx.optlen, optlen)) {
			ret = -EFAULT;
			goto out;
		}

		if (ctx.optlen > max_optlen)
			ctx.optlen = max_optlen;

		if (copy_from_user(ctx.optval, optval, ctx.optlen) != 0) {
			ret = -EFAULT;
			goto out;
		}
	}

	lock_sock(sk);
	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT],
				 &ctx, BPF_PROG_RUN);
	release_sock(sk);

	if (!ret) {
		ret = -EPERM;
		goto out;
	}

	if (ctx.optlen > max_optlen) {
		ret = -EFAULT;
		goto out;
	}

	/* BPF programs only allowed to set retval to 0, not some
	 * arbitrary value.
	 */
	if (ctx.retval != 0 && ctx.retval != retval) {
		ret = -EFAULT;
		goto out;
	}

	if (copy_to_user(optval, ctx.optval, ctx.optlen) ||
	    put_user(ctx.optlen, optlen)) {
		ret = -EFAULT;
		goto out;
	}

	ret = ctx.retval;

out:
	sockopt_free_buf(&ctx);
	return ret;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_getsockopt);
static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
			      size_t *lenp)
{
	ssize_t tmp_ret = 0, ret;

	if (dir->header.parent) {
		tmp_ret = sysctl_cpy_dir(dir->header.parent, bufp, lenp);
		if (tmp_ret < 0)
			return tmp_ret;
	}

	ret = strscpy(*bufp, dir->header.ctl_table[0].procname, *lenp);
	if (ret < 0)
		return ret;
	*bufp += ret;
	*lenp -= ret;
	ret += tmp_ret;

	/* Avoid leading slash. */
	if (!ret)
		return ret;

	tmp_ret = strscpy(*bufp, "/", *lenp);
	if (tmp_ret < 0)
		return tmp_ret;
	*bufp += tmp_ret;
	*lenp -= tmp_ret;

	return ret + tmp_ret;
}
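
/* E.g. for /proc/sys/net/ipv4/tcp_mem the recursion above builds the
 * directory prefix root-first, yielding "net/ipv4/" in *bufp;
 * bpf_sysctl_get_name() below then appends the table's procname
 * ("tcp_mem").
 */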
BPF_CALL_4(bpf_sysctl_get_name, struct bpf_sysctl_kern *, ctx, char *, buf,
	   size_t, buf_len, u64, flags)
{
	ssize_t tmp_ret = 0, ret;

	if (!buf)
		return -EINVAL;

	if (!(flags & BPF_F_SYSCTL_BASE_NAME)) {
		if (!ctx->head)
			return -EINVAL;
		tmp_ret = sysctl_cpy_dir(ctx->head->parent, &buf, &buf_len);
		if (tmp_ret < 0)
			return tmp_ret;
	}

	ret = strscpy(buf, ctx->table->procname, buf_len);

	return ret < 0 ? ret : tmp_ret + ret;
}

static const struct bpf_func_proto bpf_sysctl_get_name_proto = {
	.func		= bpf_sysctl_get_name,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_MEM,
	.arg3_type	= ARG_CONST_SIZE,
	.arg4_type	= ARG_ANYTHING,
};
static int copy_sysctl_value(char *dst, size_t dst_len, char *src,
			     size_t src_len)
{
	if (!dst)
		return -EINVAL;

	if (!dst_len)
		return -E2BIG;

	if (!src || !src_len) {
		memset(dst, 0, dst_len);
		return -EINVAL;
	}

	memcpy(dst, src, min(dst_len, src_len));

	if (dst_len > src_len) {
		memset(dst + src_len, '\0', dst_len - src_len);
		return 0;
	}

	dst[dst_len - 1] = '\0';

	return -E2BIG;
}
BPF_CALL_3(bpf_sysctl_get_current_value, struct bpf_sysctl_kern *, ctx,
	   char *, buf, size_t, buf_len)
{
	return copy_sysctl_value(buf, buf_len, ctx->cur_val, ctx->cur_len);
}

static const struct bpf_func_proto bpf_sysctl_get_current_value_proto = {
	.func		= bpf_sysctl_get_current_value,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg3_type	= ARG_CONST_SIZE,
};
BPF_CALL_3(bpf_sysctl_get_new_value, struct bpf_sysctl_kern *, ctx, char *, buf,
	   size_t, buf_len)
{
	if (!ctx->write) {
		if (buf && buf_len)
			memset(buf, '\0', buf_len);
		return -EINVAL;
	}
	return copy_sysctl_value(buf, buf_len, ctx->new_val, ctx->new_len);
}

static const struct bpf_func_proto bpf_sysctl_get_new_value_proto = {
	.func		= bpf_sysctl_get_new_value,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg3_type	= ARG_CONST_SIZE,
};
BPF_CALL_3(bpf_sysctl_set_new_value, struct bpf_sysctl_kern *, ctx,
	   const char *, buf, size_t, buf_len)
{
	if (!ctx->write || !ctx->new_val || !ctx->new_len || !buf || !buf_len)
		return -EINVAL;

	if (buf_len > PAGE_SIZE - 1)
		return -E2BIG;

	memcpy(ctx->new_val, buf, buf_len);
	ctx->new_len = buf_len;
	ctx->new_updated = 1;

	return 0;
}

static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = {
	.func		= bpf_sysctl_set_new_value,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_MEM,
	.arg3_type	= ARG_CONST_SIZE,
};
static const struct bpf_func_proto *
sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
	switch (func_id) {
	case BPF_FUNC_strtol:
		return &bpf_strtol_proto;
	case BPF_FUNC_strtoul:
		return &bpf_strtoul_proto;
	case BPF_FUNC_sysctl_get_name:
		return &bpf_sysctl_get_name_proto;
	case BPF_FUNC_sysctl_get_current_value:
		return &bpf_sysctl_get_current_value_proto;
	case BPF_FUNC_sysctl_get_new_value:
		return &bpf_sysctl_get_new_value_proto;
	case BPF_FUNC_sysctl_set_new_value:
		return &bpf_sysctl_set_new_value_proto;
	default:
		return cgroup_base_func_proto(func_id, prog);
	}
}
static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type,
				   const struct bpf_prog *prog,
				   struct bpf_insn_access_aux *info)
{
	const int size_default = sizeof(__u32);

	if (off < 0 || off + size > sizeof(struct bpf_sysctl) || off % size)
		return false;

	switch (off) {
	case bpf_ctx_range(struct bpf_sysctl, write):
		if (type != BPF_READ)
			return false;
		bpf_ctx_record_field_size(info, size_default);
		return bpf_ctx_narrow_access_ok(off, size, size_default);
	case bpf_ctx_range(struct bpf_sysctl, file_pos):
		if (type == BPF_READ) {
			bpf_ctx_record_field_size(info, size_default);
			return bpf_ctx_narrow_access_ok(off, size, size_default);
		} else {
			return size == size_default;
		}
	default:
		return false;
	}
}
static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
				     const struct bpf_insn *si,
				     struct bpf_insn *insn_buf,
				     struct bpf_prog *prog, u32 *target_size)
{
	struct bpf_insn *insn = insn_buf;
	u32 read_size;

	switch (si->off) {
	case offsetof(struct bpf_sysctl, write):
		*insn++ = BPF_LDX_MEM(
			BPF_SIZE(si->code), si->dst_reg, si->src_reg,
			bpf_target_off(struct bpf_sysctl_kern, write,
				       FIELD_SIZEOF(struct bpf_sysctl_kern,
						    write),
				       target_size));
		break;
	case offsetof(struct bpf_sysctl, file_pos):
		/* ppos is a pointer so it should be accessed via indirect
		 * loads and stores. Also for stores additional temporary
		 * register is used since neither src_reg nor dst_reg can be
		 * overridden.
		 */
		if (type == BPF_WRITE) {
			int treg = BPF_REG_9;

			if (si->src_reg == treg || si->dst_reg == treg)
				--treg;
			if (si->src_reg == treg || si->dst_reg == treg)
				--treg;
			*insn++ = BPF_STX_MEM(
				BPF_DW, si->dst_reg, treg,
				offsetof(struct bpf_sysctl_kern, tmp_reg));
			*insn++ = BPF_LDX_MEM(
				BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
				treg, si->dst_reg,
				offsetof(struct bpf_sysctl_kern, ppos));
			*insn++ = BPF_STX_MEM(
				BPF_SIZEOF(u32), treg, si->src_reg,
				bpf_ctx_narrow_access_offset(
					0, sizeof(u32), sizeof(loff_t)));
			*insn++ = BPF_LDX_MEM(
				BPF_DW, treg, si->dst_reg,
				offsetof(struct bpf_sysctl_kern, tmp_reg));
		} else {
			*insn++ = BPF_LDX_MEM(
				BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
				si->dst_reg, si->src_reg,
				offsetof(struct bpf_sysctl_kern, ppos));
			read_size = bpf_size_to_bytes(BPF_SIZE(si->code));
			*insn++ = BPF_LDX_MEM(
				BPF_SIZE(si->code), si->dst_reg, si->dst_reg,
				bpf_ctx_narrow_access_offset(
					0, read_size, sizeof(loff_t)));
		}
		*target_size = sizeof(u32);
		break;
	}

	return insn - insn_buf;
}
const struct bpf_verifier_ops cg_sysctl_verifier_ops = {
	.get_func_proto		= sysctl_func_proto,
	.is_valid_access	= sysctl_is_valid_access,
	.convert_ctx_access	= sysctl_convert_ctx_access,
};

const struct bpf_prog_ops cg_sysctl_prog_ops = {
};
static const struct bpf_func_proto *
cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
	switch (func_id) {
#ifdef CONFIG_NET
	case BPF_FUNC_sk_storage_get:
		return &bpf_sk_storage_get_proto;
	case BPF_FUNC_sk_storage_delete:
		return &bpf_sk_storage_delete_proto;
#endif
#ifdef CONFIG_INET
	case BPF_FUNC_tcp_sock:
		return &bpf_tcp_sock_proto;
#endif
	default:
		return cgroup_base_func_proto(func_id, prog);
	}
}
static bool cg_sockopt_is_valid_access(int off, int size,
				       enum bpf_access_type type,
				       const struct bpf_prog *prog,
				       struct bpf_insn_access_aux *info)
{
	const int size_default = sizeof(__u32);

	if (off < 0 || off >= sizeof(struct bpf_sockopt))
		return false;

	if (off % size != 0)
		return false;

	if (type == BPF_WRITE) {
		switch (off) {
		case offsetof(struct bpf_sockopt, retval):
			if (size != size_default)
				return false;
			return prog->expected_attach_type ==
				BPF_CGROUP_GETSOCKOPT;
		case offsetof(struct bpf_sockopt, optname):
			/* fallthrough */
		case offsetof(struct bpf_sockopt, level):
			if (size != size_default)
				return false;
			return prog->expected_attach_type ==
				BPF_CGROUP_SETSOCKOPT;
		case offsetof(struct bpf_sockopt, optlen):
			return size == size_default;
		default:
			return false;
		}
	}

	switch (off) {
	case offsetof(struct bpf_sockopt, sk):
		if (size != sizeof(__u64))
			return false;
		info->reg_type = PTR_TO_SOCKET;
		break;
	case offsetof(struct bpf_sockopt, optval):
		if (size != sizeof(__u64))
			return false;
		info->reg_type = PTR_TO_PACKET;
		break;
	case offsetof(struct bpf_sockopt, optval_end):
		if (size != sizeof(__u64))
			return false;
		info->reg_type = PTR_TO_PACKET_END;
		break;
	case offsetof(struct bpf_sockopt, retval):
		if (size != size_default)
			return false;
		return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT;
	default:
		if (size != size_default)
			return false;
		break;
	}

	return true;
}
#define CG_SOCKOPT_ACCESS_FIELD(T, F)					\
	T(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F),			\
	  si->dst_reg, si->src_reg,					\
	  offsetof(struct bpf_sockopt_kern, F))
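
/* E.g. CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen) expands to
 *
 *	BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, optlen),
 *		    si->dst_reg, si->src_reg,
 *		    offsetof(struct bpf_sockopt_kern, optlen))
 *
 * i.e. a load of bpf_sockopt_kern::optlen into dst_reg.
 */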
static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
					 const struct bpf_insn *si,
					 struct bpf_insn *insn_buf,
					 struct bpf_prog *prog,
					 u32 *target_size)
{
	struct bpf_insn *insn = insn_buf;

	switch (si->off) {
	case offsetof(struct bpf_sockopt, sk):
		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, sk);
		break;
	case offsetof(struct bpf_sockopt, level):
		if (type == BPF_WRITE)
			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, level);
		else
			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, level);
		break;
	case offsetof(struct bpf_sockopt, optname):
		if (type == BPF_WRITE)
			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optname);
		else
			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optname);
		break;
	case offsetof(struct bpf_sockopt, optlen):
		if (type == BPF_WRITE)
			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optlen);
		else
			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen);
		break;
	case offsetof(struct bpf_sockopt, retval):
		if (type == BPF_WRITE)
			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, retval);
		else
			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, retval);
		break;
	case offsetof(struct bpf_sockopt, optval):
		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval);
		break;
	case offsetof(struct bpf_sockopt, optval_end):
		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval_end);
		break;
	}

	return insn - insn_buf;
}
static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf,
				   bool direct_write,
				   const struct bpf_prog *prog)
{
	/* Nothing to do for sockopt argument. The data is kzalloc'ated.
	 */
	return 0;
}
const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
	.get_func_proto		= cg_sockopt_func_proto,
	.is_valid_access	= cg_sockopt_is_valid_access,
	.convert_ctx_access	= cg_sockopt_convert_ctx_access,
	.gen_prologue		= cg_sockopt_get_prologue,
};

const struct bpf_prog_ops cg_sockopt_prog_ops = {
};