net/xdp/xdp_umem.c
// SPDX-License-Identifier: GPL-2.0
/* XDP user-space packet buffer
 * Copyright(c) 2018 Intel Corporation.
 */

#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/bpf.h>
#include <linux/mm.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/idr.h>
#include <linux/vmalloc.h>

#include "xdp_umem.h"
#include "xsk_queue.h"

#define XDP_UMEM_MIN_CHUNK_SIZE 2048

static DEFINE_IDA(umem_ida);

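/* All sockets sharing a umem are kept on the umem's xsk_list. The list is
 * modified under xsk_list_lock and traversed under RCU on the data path,
 * hence the RCU list primitives below.
 */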
void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
{
        unsigned long flags;

        spin_lock_irqsave(&umem->xsk_list_lock, flags);
        list_add_rcu(&xs->list, &umem->xsk_list);
        spin_unlock_irqrestore(&umem->xsk_list_lock, flags);
}

void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
{
        unsigned long flags;

        spin_lock_irqsave(&umem->xsk_list_lock, flags);
        list_del_rcu(&xs->list);
        spin_unlock_irqrestore(&umem->xsk_list_lock, flags);
}

/* The umem is stored both in the _rx struct and the _tx struct as we do
 * not know if the device has more tx queues than rx, or vice versa.
 * This might also change at run time.
 */
static int xdp_reg_umem_at_qid(struct net_device *dev, struct xdp_umem *umem,
                               u16 queue_id)
{
        if (queue_id >= max_t(unsigned int,
                              dev->real_num_rx_queues,
                              dev->real_num_tx_queues))
                return -EINVAL;

        if (queue_id < dev->real_num_rx_queues)
                dev->_rx[queue_id].umem = umem;
        if (queue_id < dev->real_num_tx_queues)
                dev->_tx[queue_id].umem = umem;

        return 0;
}

struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev,
                                       u16 queue_id)
{
        if (queue_id < dev->real_num_rx_queues)
                return dev->_rx[queue_id].umem;
        if (queue_id < dev->real_num_tx_queues)
                return dev->_tx[queue_id].umem;

        return NULL;
}
EXPORT_SYMBOL(xdp_get_umem_from_qid);

static void xdp_clear_umem_at_qid(struct net_device *dev, u16 queue_id)
{
        if (queue_id < dev->real_num_rx_queues)
                dev->_rx[queue_id].umem = NULL;
        if (queue_id < dev->real_num_tx_queues)
                dev->_tx[queue_id].umem = NULL;
}

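/* Bind the umem to a specific queue of a netdev. Unless copy mode was
 * forced, try to set up zero-copy through the driver's ndo_bpf() callback;
 * if the driver lacks support, fall back to copy mode (or fail when
 * XDP_ZEROCOPY was explicitly requested).
 */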
int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
                        u16 queue_id, u16 flags)
{
        bool force_zc, force_copy;
        struct netdev_bpf bpf;
        int err = 0;

        ASSERT_RTNL();

        force_zc = flags & XDP_ZEROCOPY;
        force_copy = flags & XDP_COPY;

        if (force_zc && force_copy)
                return -EINVAL;

        if (xdp_get_umem_from_qid(dev, queue_id))
                return -EBUSY;

        err = xdp_reg_umem_at_qid(dev, umem, queue_id);
        if (err)
                return err;

        umem->dev = dev;
        umem->queue_id = queue_id;

        if (flags & XDP_USE_NEED_WAKEUP) {
                umem->flags |= XDP_UMEM_USES_NEED_WAKEUP;
                /* Tx needs to be explicitly woken up the first time. This
                 * also covers drivers that do not implement the feature;
                 * they will always have to call sendto().
                 */
                xsk_set_tx_need_wakeup(umem);
        }

        dev_hold(dev);

        if (force_copy)
                /* For copy-mode, we are done. */
                return 0;

        if (!dev->netdev_ops->ndo_bpf || !dev->netdev_ops->ndo_xsk_wakeup) {
                err = -EOPNOTSUPP;
                goto err_unreg_umem;
        }

        bpf.command = XDP_SETUP_XSK_UMEM;
        bpf.xsk.umem = umem;
        bpf.xsk.queue_id = queue_id;

        err = dev->netdev_ops->ndo_bpf(dev, &bpf);
        if (err)
                goto err_unreg_umem;

        umem->zc = true;
        return 0;

err_unreg_umem:
        if (!force_zc)
                err = 0; /* fallback to copy mode */
        if (err)
                xdp_clear_umem_at_qid(dev, queue_id);
        return err;
}

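/* Undo xdp_umem_assign_dev(): tell the driver to drop the umem if it was
 * running in zero-copy mode, clear the queue pointers and release the
 * reference on the netdev. Must be called with the rtnl lock held.
 */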
void xdp_umem_clear_dev(struct xdp_umem *umem)
{
        struct netdev_bpf bpf;
        int err;

        ASSERT_RTNL();

        if (!umem->dev)
                return;

        if (umem->zc) {
                bpf.command = XDP_SETUP_XSK_UMEM;
                bpf.xsk.umem = NULL;
                bpf.xsk.queue_id = umem->queue_id;

                err = umem->dev->netdev_ops->ndo_bpf(umem->dev, &bpf);

                if (err)
                        WARN(1, "failed to disable umem!\n");
        }

        xdp_clear_umem_at_qid(umem->dev, umem->queue_id);

        dev_put(umem->dev);
        umem->dev = NULL;
        umem->zc = false;
}

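/* Tear down the kernel mappings set up by xdp_umem_map_pages(). Only
 * highmem pages were mapped with vmap() and need a matching vunmap();
 * lowmem pages were accessed through page_address() and need no cleanup.
 */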
static void xdp_umem_unmap_pages(struct xdp_umem *umem)
{
        unsigned int i;

        for (i = 0; i < umem->npgs; i++)
                if (PageHighMem(umem->pgs[i]))
                        vunmap(umem->pages[i].addr);
}

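/* Store a kernel virtual address for every pinned page: highmem pages get
 * a single-page vmap() mapping, lowmem pages can use page_address()
 * directly.
 */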
static int xdp_umem_map_pages(struct xdp_umem *umem)
{
        unsigned int i;
        void *addr;

        for (i = 0; i < umem->npgs; i++) {
                if (PageHighMem(umem->pgs[i]))
                        addr = vmap(&umem->pgs[i], 1, VM_MAP, PAGE_KERNEL);
                else
                        addr = page_address(umem->pgs[i]);

                if (!addr) {
                        xdp_umem_unmap_pages(umem);
                        return -ENOMEM;
                }

                umem->pages[i].addr = addr;
        }

        return 0;
}

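/* Drop the pins taken by xdp_umem_pin_pages(), marking the pages dirty
 * since the kernel may have written packet data into them.
 */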
static void xdp_umem_unpin_pages(struct xdp_umem *umem)
{
        put_user_pages_dirty_lock(umem->pgs, umem->npgs, true);

        kfree(umem->pgs);
        umem->pgs = NULL;
}

static void xdp_umem_unaccount_pages(struct xdp_umem *umem)
{
        if (umem->user) {
                atomic_long_sub(umem->npgs, &umem->user->locked_vm);
                free_uid(umem->user);
        }
}

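/* Final teardown, run once the last reference to the umem is gone: detach
 * from the device, destroy the fill and completion rings, unmap and unpin
 * the user pages and return the locked-memory accounting.
 */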
static void xdp_umem_release(struct xdp_umem *umem)
{
        rtnl_lock();
        xdp_umem_clear_dev(umem);
        rtnl_unlock();

        ida_simple_remove(&umem_ida, umem->id);

        if (umem->fq) {
                xskq_destroy(umem->fq);
                umem->fq = NULL;
        }

        if (umem->cq) {
                xskq_destroy(umem->cq);
                umem->cq = NULL;
        }

        xsk_reuseq_destroy(umem);

        xdp_umem_unmap_pages(umem);
        xdp_umem_unpin_pages(umem);

        kfree(umem->pages);
        umem->pages = NULL;

        xdp_umem_unaccount_pages(umem);
        kfree(umem);
}

static void xdp_umem_release_deferred(struct work_struct *work)
{
        struct xdp_umem *umem = container_of(work, struct xdp_umem, work);

        xdp_umem_release(umem);
}

void xdp_get_umem(struct xdp_umem *umem)
{
        refcount_inc(&umem->users);
}

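/* Drop a reference to the umem. The release path takes the rtnl lock, so
 * the actual teardown is deferred to a workqueue rather than run directly
 * in the caller's context.
 */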
void xdp_put_umem(struct xdp_umem *umem)
{
        if (!umem)
                return;

        if (refcount_dec_and_test(&umem->users)) {
                INIT_WORK(&umem->work, xdp_umem_release_deferred);
                schedule_work(&umem->work);
        }
}

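/* Pin the user pages backing the umem. FOLL_LONGTERM is used since the
 * pages stay pinned for the lifetime of the umem, and FOLL_WRITE since
 * the kernel writes received packet data into them.
 */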
static int xdp_umem_pin_pages(struct xdp_umem *umem)
{
        unsigned int gup_flags = FOLL_WRITE;
        long npgs;
        int err;

        umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs),
                            GFP_KERNEL | __GFP_NOWARN);
        if (!umem->pgs)
                return -ENOMEM;

        down_read(&current->mm->mmap_sem);
        npgs = get_user_pages(umem->address, umem->npgs,
                              gup_flags | FOLL_LONGTERM, &umem->pgs[0], NULL);
        up_read(&current->mm->mmap_sem);

        if (npgs != umem->npgs) {
                if (npgs >= 0) {
                        umem->npgs = npgs;
                        err = -ENOMEM;
                        goto out_pin;
                }
                err = npgs;
                goto out_pgs;
        }
        return 0;

out_pin:
        xdp_umem_unpin_pages(umem);
out_pgs:
        kfree(umem->pgs);
        umem->pgs = NULL;
        return err;
}

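/* Charge the pinned pages against the owning user's RLIMIT_MEMLOCK
 * (locked_vm). CAP_IPC_LOCK bypasses the limit; the cmpxchg loop handles
 * concurrent updates to locked_vm.
 */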
static int xdp_umem_account_pages(struct xdp_umem *umem)
{
        unsigned long lock_limit, new_npgs, old_npgs;

        if (capable(CAP_IPC_LOCK))
                return 0;

        lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
        umem->user = get_uid(current_user());

        do {
                old_npgs = atomic_long_read(&umem->user->locked_vm);
                new_npgs = old_npgs + umem->npgs;
                if (new_npgs > lock_limit) {
                        free_uid(umem->user);
                        umem->user = NULL;
                        return -ENOBUFS;
                }
        } while (atomic_long_cmpxchg(&umem->user->locked_vm, old_npgs,
                                     new_npgs) != old_npgs);
        return 0;
}

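/* Validate a registration request from userspace (chunk size, alignment,
 * flags and headroom), fill in the umem, then account, pin and map the
 * user pages.
 */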
static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
{
        bool unaligned_chunks = mr->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG;
        u32 chunk_size = mr->chunk_size, headroom = mr->headroom;
        unsigned int chunks, chunks_per_page;
        u64 addr = mr->addr, size = mr->len;
        int size_chk, err;

        if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) {
                /* Strictly speaking we could support this, if:
                 * - huge pages, or
                 * - using an IOMMU, or
                 * - making sure the memory area is consecutive
                 * but for now, we simply say "computer says no".
                 */
                return -EINVAL;
        }

        if (mr->flags & ~(XDP_UMEM_UNALIGNED_CHUNK_FLAG |
                        XDP_UMEM_USES_NEED_WAKEUP))
                return -EINVAL;

        if (!unaligned_chunks && !is_power_of_2(chunk_size))
                return -EINVAL;

        if (!PAGE_ALIGNED(addr)) {
                /* Memory area has to be page size aligned, for
                 * simplicity. This might change in the future.
                 */
                return -EINVAL;
        }

        if ((addr + size) < addr)
                return -EINVAL;

        chunks = (unsigned int)div_u64(size, chunk_size);
        if (chunks == 0)
                return -EINVAL;

        if (!unaligned_chunks) {
                chunks_per_page = PAGE_SIZE / chunk_size;
                if (chunks < chunks_per_page || chunks % chunks_per_page)
                        return -EINVAL;
        }

        size_chk = chunk_size - headroom - XDP_PACKET_HEADROOM;
        if (size_chk < 0)
                return -EINVAL;

        umem->address = (unsigned long)addr;
        umem->chunk_mask = unaligned_chunks ? XSK_UNALIGNED_BUF_ADDR_MASK
                                            : ~((u64)chunk_size - 1);
        umem->size = size;
        umem->headroom = headroom;
        umem->chunk_size_nohr = chunk_size - headroom;
        umem->npgs = size / PAGE_SIZE;
        umem->pgs = NULL;
        umem->user = NULL;
        umem->flags = mr->flags;
        INIT_LIST_HEAD(&umem->xsk_list);
        spin_lock_init(&umem->xsk_list_lock);

        refcount_set(&umem->users, 1);

        err = xdp_umem_account_pages(umem);
        if (err)
                return err;

        err = xdp_umem_pin_pages(umem);
        if (err)
                goto out_account;

        umem->pages = kcalloc(umem->npgs, sizeof(*umem->pages), GFP_KERNEL);
        if (!umem->pages) {
                err = -ENOMEM;
                goto out_pin;
        }

        err = xdp_umem_map_pages(umem);
        if (!err)
                return 0;

        kfree(umem->pages);

out_pin:
        xdp_umem_unpin_pages(umem);
out_account:
        xdp_umem_unaccount_pages(umem);
        return err;
}

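/* Allocate a new umem, reserve an id for it and register the user memory
 * described by @mr. Returns the umem on success, an ERR_PTR() otherwise.
 */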
struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr)
{
        struct xdp_umem *umem;
        int err;

        umem = kzalloc(sizeof(*umem), GFP_KERNEL);
        if (!umem)
                return ERR_PTR(-ENOMEM);

        err = ida_simple_get(&umem_ida, 0, 0, GFP_KERNEL);
        if (err < 0) {
                kfree(umem);
                return ERR_PTR(err);
        }
        umem->id = err;

        err = xdp_umem_reg(umem, mr);
        if (err) {
                ida_simple_remove(&umem_ida, umem->id);
                kfree(umem);
                return ERR_PTR(err);
        }

        return umem;
}

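/* Both the fill ring (fq) and the completion ring (cq) must have been
 * created before the umem can be bound to a device.
 */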
bool xdp_umem_validate_queues(struct xdp_umem *umem)
{
        return umem->fq && umem->cq;
}
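
/* Usage sketch (illustrative only, not part of this file): the code above
 * is reached from userspace by registering a memory area on an AF_XDP
 * socket with the XDP_UMEM_REG setsockopt, which ends up in
 * xdp_umem_create()/xdp_umem_reg(). The values below are examples, error
 * handling is omitted, and the definitions come from <linux/if_xdp.h>.
 *
 *	void *buf;
 *	struct xdp_umem_reg mr = {};
 *	int fd = socket(AF_XDP, SOCK_RAW, 0);
 *
 *	posix_memalign(&buf, getpagesize(), 2048 * 4096);
 *
 *	mr.addr = (unsigned long long)buf;	(page-aligned, as required above)
 *	mr.len = 2048 * 4096;
 *	mr.chunk_size = 2048;			(>= XDP_UMEM_MIN_CHUNK_SIZE)
 *	mr.headroom = 0;
 *	mr.flags = 0;
 *
 *	setsockopt(fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
 */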