]> asedeno.scripts.mit.edu Git - linux.git/blob - net/core/dev.c
gro: Enter slow-path if there is no tailroom
[linux.git] / net / core / dev.c
1 /*
2  *      NET3    Protocol independent device support routines.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  *      Derived from the non IP parts of dev.c 1.0.19
10  *              Authors:        Ross Biro
11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *      Additional Authors:
15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
17  *              David Hinds <dahinds@users.sourceforge.net>
18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *              Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *      Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *                                      to 2 if register_netdev gets called
25  *                                      before net_dev_init & also removed a
26  *                                      few lines of code in the process.
27  *              Alan Cox        :       device private ioctl copies fields back.
28  *              Alan Cox        :       Transmit queue code does relevant
29  *                                      stunts to keep the queue safe.
30  *              Alan Cox        :       Fixed double lock.
31  *              Alan Cox        :       Fixed promisc NULL pointer trap
32  *              ????????        :       Support the full private ioctl range
33  *              Alan Cox        :       Moved ioctl permission check into
34  *                                      drivers
35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
36  *              Alan Cox        :       100 backlog just doesn't cut it when
37  *                                      you start doing multicast video 8)
38  *              Alan Cox        :       Rewrote net_bh and list manager.
39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
40  *              Alan Cox        :       Took out transmit every packet pass
41  *                                      Saved a few bytes in the ioctl handler
42  *              Alan Cox        :       Network driver sets packet type before
43  *                                      calling netif_rx. Saves a function
44  *                                      call a packet.
45  *              Alan Cox        :       Hashed net_bh()
46  *              Richard Kooijman:       Timestamp fixes.
47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
48  *              Alan Cox        :       Device lock protection.
49  *              Alan Cox        :       Fixed nasty side effect of device close
50  *                                      changes.
51  *              Rudi Cilibrasi  :       Pass the right thing to
52  *                                      set_mac_address()
53  *              Dave Miller     :       32bit quantity for the device lock to
54  *                                      make it work out on a Sparc.
55  *              Bjorn Ekwall    :       Added KERNELD hack.
56  *              Alan Cox        :       Cleaned up the backlog initialise.
57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
58  *                                      1 device.
59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
60  *                                      is no device open function.
61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
63  *              Cyrus Durgin    :       Cleaned for KMOD
64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
65  *                                      A network device unload needs to purge
66  *                                      the backlog queue.
67  *      Paul Rusty Russell      :       SIOCSIFNAME
68  *              Pekka Riikonen  :       Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *                                      indefinitely on dev->refcnt
71  *              J Hadi Salim    :       - Backlog queue sampling
72  *                                      - netif_rx() feedback
73  */
74
75 #include <linux/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
86 #include <linux/mm.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <linux/bpf.h>
98 #include <net/net_namespace.h>
99 #include <net/sock.h>
100 #include <net/busy_poll.h>
101 #include <linux/rtnetlink.h>
102 #include <linux/stat.h>
103 #include <net/dst.h>
104 #include <net/dst_metadata.h>
105 #include <net/pkt_sched.h>
106 #include <net/checksum.h>
107 #include <net/xfrm.h>
108 #include <linux/highmem.h>
109 #include <linux/init.h>
110 #include <linux/module.h>
111 #include <linux/netpoll.h>
112 #include <linux/rcupdate.h>
113 #include <linux/delay.h>
114 #include <net/iw_handler.h>
115 #include <asm/current.h>
116 #include <linux/audit.h>
117 #include <linux/dmaengine.h>
118 #include <linux/err.h>
119 #include <linux/ctype.h>
120 #include <linux/if_arp.h>
121 #include <linux/if_vlan.h>
122 #include <linux/ip.h>
123 #include <net/ip.h>
124 #include <net/mpls.h>
125 #include <linux/ipv6.h>
126 #include <linux/in.h>
127 #include <linux/jhash.h>
128 #include <linux/random.h>
129 #include <trace/events/napi.h>
130 #include <trace/events/net.h>
131 #include <trace/events/skb.h>
132 #include <linux/pci.h>
133 #include <linux/inetdevice.h>
134 #include <linux/cpu_rmap.h>
135 #include <linux/static_key.h>
136 #include <linux/hashtable.h>
137 #include <linux/vmalloc.h>
138 #include <linux/if_macvlan.h>
139 #include <linux/errqueue.h>
140 #include <linux/hrtimer.h>
141 #include <linux/netfilter_ingress.h>
142 #include <linux/crash_dump.h>
143
144 #include "net-sysfs.h"
145
146 /* Instead of increasing this, you should create a hash table. */
147 #define MAX_GRO_SKBS 8
148
149 /* This should be increased if a protocol with a bigger head is added. */
150 #define GRO_MAX_HEAD (MAX_HEADER + 128)
151
152 static DEFINE_SPINLOCK(ptype_lock);
153 static DEFINE_SPINLOCK(offload_lock);
154 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
155 struct list_head ptype_all __read_mostly;       /* Taps */
156 static struct list_head offload_base __read_mostly;
157
158 static int netif_rx_internal(struct sk_buff *skb);
159 static int call_netdevice_notifiers_info(unsigned long val,
160                                          struct net_device *dev,
161                                          struct netdev_notifier_info *info);
162
/*
 * Locking rules for the @dev_base_head device list:
 *
 *  - Pure readers take dev_base_lock for reading, or run under
 *    rcu_read_lock().
 *  - Writers hold the rtnl semaphore for as long as they walk the
 *    dev_base_head list, and additionally take dev_base_lock for writing
 *    around the actual update.  Pure readers can therefore keep using the
 *    list while a writer is still preparing its change.
 *
 * Put differently: the write side of dev_base_lock only keeps pure
 * readers out; writers are serialised against each other by the rtnl
 * semaphore.
 *
 * register_netdevice() and unregister_netdevice() are example users; both
 * must be called with the rtnl semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);
184
185 /* protects napi_hash addition/deletion and napi_gen_id */
186 static DEFINE_SPINLOCK(napi_hash_lock);
187
188 static unsigned int napi_gen_id = NR_CPUS;
189 static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
190
191 static seqcount_t devnet_rename_seq;
192
193 static inline void dev_base_seq_inc(struct net *net)
194 {
195         while (++net->dev_base_seq == 0);
196 }
197
198 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
199 {
200         unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
201
202         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
203 }
204
205 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
206 {
207         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
208 }
209
/*
 * Take the spinlock of @sd's input packet queue.  Without CONFIG_RPS the
 * function body is empty.
 */
static inline void rps_lock(struct softnet_data *sd)
{
#if defined(CONFIG_RPS)
	spin_lock(&sd->input_pkt_queue.lock);
#endif /* CONFIG_RPS */
}
216
/*
 * Release the spinlock taken by rps_lock().  Without CONFIG_RPS the
 * function body is empty.
 */
static inline void rps_unlock(struct softnet_data *sd)
{
#if defined(CONFIG_RPS)
	spin_unlock(&sd->input_pkt_queue.lock);
#endif /* CONFIG_RPS */
}
223
224 /* Device list insertion */
225 static void list_netdevice(struct net_device *dev)
226 {
227         struct net *net = dev_net(dev);
228
229         ASSERT_RTNL();
230
231         write_lock_bh(&dev_base_lock);
232         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
233         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
234         hlist_add_head_rcu(&dev->index_hlist,
235                            dev_index_hash(net, dev->ifindex));
236         write_unlock_bh(&dev_base_lock);
237
238         dev_base_seq_inc(net);
239 }
240
241 /* Device list removal
242  * caller must respect a RCU grace period before freeing/reusing dev
243  */
244 static void unlist_netdevice(struct net_device *dev)
245 {
246         ASSERT_RTNL();
247
248         /* Unlink dev from the device chain */
249         write_lock_bh(&dev_base_lock);
250         list_del_rcu(&dev->dev_list);
251         hlist_del_rcu(&dev->name_hlist);
252         hlist_del_rcu(&dev->index_hlist);
253         write_unlock_bh(&dev_base_lock);
254
255         dev_base_seq_inc(dev_net(dev));
256 }
257
258 /*
259  *      Our notifier list
260  */
261
262 static RAW_NOTIFIER_HEAD(netdev_chain);
263
264 /*
265  *      Device drivers call our routines to queue packets here. We empty the
266  *      queue in the local softnet handler.
267  */
268
269 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
270 EXPORT_PER_CPU_SYMBOL(softnet_data);
271
272 #ifdef CONFIG_LOCKDEP
273 /*
274  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
275  * according to dev->type
276  */
277 static const unsigned short netdev_lock_type[] =
278         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
279          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
280          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
281          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
282          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
283          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
284          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
285          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
286          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
287          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
288          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
289          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
290          ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
291          ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
292          ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
293
/*
 * Lockdep class names, one per netdev_lock_type[] entry and in the same
 * order; the two tables are always indexed with the same position.
 */
static const char *const netdev_lock_name[] = {
	"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	"_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	"_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	"_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	"_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	"_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	"_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	"_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	"_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	"_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	"_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	"_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	"_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
	"_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
	"_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE",
};
310
311 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
312 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
313
314 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
315 {
316         int i;
317
318         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
319                 if (netdev_lock_type[i] == dev_type)
320                         return i;
321         /* the last key is used by default */
322         return ARRAY_SIZE(netdev_lock_type) - 1;
323 }
324
325 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
326                                                  unsigned short dev_type)
327 {
328         int i;
329
330         i = netdev_lock_pos(dev_type);
331         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
332                                    netdev_lock_name[i]);
333 }
334
335 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
336 {
337         int i;
338
339         i = netdev_lock_pos(dev->type);
340         lockdep_set_class_and_name(&dev->addr_list_lock,
341                                    &netdev_addr_lock_key[i],
342                                    netdev_lock_name[i]);
343 }
344 #else
345 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
346                                                  unsigned short dev_type)
347 {
348 }
/* !CONFIG_LOCKDEP variant: there is no lock class to assign. */
static inline void
netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
352 #endif
353
354 /*******************************************************************************
355
356                 Protocol management and registration routines
357
358 *******************************************************************************/
359
360 /*
361  *      Add a protocol ID to the list. Now that the input handler is
362  *      smarter we can dispense with all the messy stuff that used to be
363  *      here.
364  *
365  *      BEWARE!!! Protocol handlers, mangling input packets,
366  *      MUST BE last in hash buckets and checking protocol handlers
367  *      MUST start from promiscuous ptype_all chain in net_bh.
368  *      It is true now, do not change it.
369  *      Explanation follows: if protocol handler, mangling packet, will
370  *      be the first on list, it is not able to sense, that packet
371  *      is cloned and should be copied-on-write, so that it will
372  *      change it and subsequent readers will get broken packet.
373  *                                                      --ANK (980803)
374  */
375
376 static inline struct list_head *ptype_head(const struct packet_type *pt)
377 {
378         if (pt->type == htons(ETH_P_ALL))
379                 return pt->dev ? &pt->dev->ptype_all : &ptype_all;
380         else
381                 return pt->dev ? &pt->dev->ptype_specific :
382                                  &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
383 }
384
385 /**
386  *      dev_add_pack - add packet handler
387  *      @pt: packet type declaration
388  *
389  *      Add a protocol handler to the networking stack. The passed &packet_type
390  *      is linked into kernel lists and may not be freed until it has been
391  *      removed from the kernel lists.
392  *
393  *      This call does not sleep therefore it can not
394  *      guarantee all CPU's that are in middle of receiving packets
395  *      will see the new packet type (until the next received packet).
396  */
397
398 void dev_add_pack(struct packet_type *pt)
399 {
400         struct list_head *head = ptype_head(pt);
401
402         spin_lock(&ptype_lock);
403         list_add_rcu(&pt->list, head);
404         spin_unlock(&ptype_lock);
405 }
406 EXPORT_SYMBOL(dev_add_pack);
407
408 /**
409  *      __dev_remove_pack        - remove packet handler
410  *      @pt: packet type declaration
411  *
412  *      Remove a protocol handler that was previously added to the kernel
413  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
414  *      from the kernel lists and can be freed or reused once this function
415  *      returns.
416  *
417  *      The packet type might still be in use by receivers
418  *      and must not be freed until after all the CPU's have gone
419  *      through a quiescent state.
420  */
421 void __dev_remove_pack(struct packet_type *pt)
422 {
423         struct list_head *head = ptype_head(pt);
424         struct packet_type *pt1;
425
426         spin_lock(&ptype_lock);
427
428         list_for_each_entry(pt1, head, list) {
429                 if (pt == pt1) {
430                         list_del_rcu(&pt->list);
431                         goto out;
432                 }
433         }
434
435         pr_warn("dev_remove_pack: %p not found\n", pt);
436 out:
437         spin_unlock(&ptype_lock);
438 }
439 EXPORT_SYMBOL(__dev_remove_pack);
440
/**
 *	dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Unlink a protocol handler that was previously registered with
 *	dev_add_pack().  Once this function has returned, @pt is off the
 *	kernel lists and may be freed or reused.
 *
 *	This call sleeps to guarantee that no CPU is still looking at the
 *	packet type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	/* Wait until no receiver can still be using @pt. */
	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);
460
461
462 /**
463  *      dev_add_offload - register offload handlers
464  *      @po: protocol offload declaration
465  *
466  *      Add protocol offload handlers to the networking stack. The passed
467  *      &proto_offload is linked into kernel lists and may not be freed until
468  *      it has been removed from the kernel lists.
469  *
470  *      This call does not sleep therefore it can not
471  *      guarantee all CPU's that are in middle of receiving packets
472  *      will see the new offload handlers (until the next received packet).
473  */
474 void dev_add_offload(struct packet_offload *po)
475 {
476         struct packet_offload *elem;
477
478         spin_lock(&offload_lock);
479         list_for_each_entry(elem, &offload_base, list) {
480                 if (po->priority < elem->priority)
481                         break;
482         }
483         list_add_rcu(&po->list, elem->list.prev);
484         spin_unlock(&offload_lock);
485 }
486 EXPORT_SYMBOL(dev_add_offload);
487
488 /**
489  *      __dev_remove_offload     - remove offload handler
490  *      @po: packet offload declaration
491  *
492  *      Remove a protocol offload handler that was previously added to the
493  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
494  *      is removed from the kernel lists and can be freed or reused once this
495  *      function returns.
496  *
497  *      The packet type might still be in use by receivers
498  *      and must not be freed until after all the CPU's have gone
499  *      through a quiescent state.
500  */
501 static void __dev_remove_offload(struct packet_offload *po)
502 {
503         struct list_head *head = &offload_base;
504         struct packet_offload *po1;
505
506         spin_lock(&offload_lock);
507
508         list_for_each_entry(po1, head, list) {
509                 if (po == po1) {
510                         list_del_rcu(&po->list);
511                         goto out;
512                 }
513         }
514
515         pr_warn("dev_remove_offload: %p not found\n", po);
516 out:
517         spin_unlock(&offload_lock);
518 }
519
/**
 *	dev_remove_offload	 - remove packet offload handler
 *	@po: packet offload declaration
 *
 *	Unlink a packet offload handler that was previously registered
 *	with dev_add_offload().  Once this function has returned, the
 *	&packet_offload is off the kernel lists and may be freed or reused.
 *
 *	This call sleeps to guarantee that no CPU is still looking at the
 *	offload handler after return.
 */
void dev_remove_offload(struct packet_offload *po)
{
	__dev_remove_offload(po);

	/* Wait until no receiver can still be using @po. */
	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);
539
540 /******************************************************************************
541
542                       Device Boot-time Settings Routines
543
544 *******************************************************************************/
545
546 /* Boot time configuration table */
547 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
548
549 /**
550  *      netdev_boot_setup_add   - add new setup entry
551  *      @name: name of the device
552  *      @map: configured settings for the device
553  *
554  *      Adds new setup entry to the dev_boot_setup list.  The function
555  *      returns 0 on error and 1 on success.  This is a generic routine to
556  *      all netdevices.
557  */
558 static int netdev_boot_setup_add(char *name, struct ifmap *map)
559 {
560         struct netdev_boot_setup *s;
561         int i;
562
563         s = dev_boot_setup;
564         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
565                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
566                         memset(s[i].name, 0, sizeof(s[i].name));
567                         strlcpy(s[i].name, name, IFNAMSIZ);
568                         memcpy(&s[i].map, map, sizeof(s[i].map));
569                         break;
570                 }
571         }
572
573         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
574 }
575
576 /**
577  *      netdev_boot_setup_check - check boot time settings
578  *      @dev: the netdevice
579  *
580  *      Check boot time settings for the device.
581  *      The found settings are set for the device to be used
582  *      later in the device probing.
583  *      Returns 0 if no settings found, 1 if they are.
584  */
585 int netdev_boot_setup_check(struct net_device *dev)
586 {
587         struct netdev_boot_setup *s = dev_boot_setup;
588         int i;
589
590         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
591                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
592                     !strcmp(dev->name, s[i].name)) {
593                         dev->irq        = s[i].map.irq;
594                         dev->base_addr  = s[i].map.base_addr;
595                         dev->mem_start  = s[i].map.mem_start;
596                         dev->mem_end    = s[i].map.mem_end;
597                         return 1;
598                 }
599         }
600         return 0;
601 }
602 EXPORT_SYMBOL(netdev_boot_setup_check);
603
604
605 /**
606  *      netdev_boot_base        - get address from boot time settings
607  *      @prefix: prefix for network device
608  *      @unit: id for network device
609  *
610  *      Check boot time settings for the base address of device.
611  *      The found settings are set for the device to be used
612  *      later in the device probing.
613  *      Returns 0 if no settings found.
614  */
615 unsigned long netdev_boot_base(const char *prefix, int unit)
616 {
617         const struct netdev_boot_setup *s = dev_boot_setup;
618         char name[IFNAMSIZ];
619         int i;
620
621         sprintf(name, "%s%d", prefix, unit);
622
623         /*
624          * If device already registered then return base of 1
625          * to indicate not to probe for this interface
626          */
627         if (__dev_get_by_name(&init_net, name))
628                 return 1;
629
630         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
631                 if (!strcmp(name, s[i].name))
632                         return s[i].map.base_addr;
633         return 0;
634 }
635
636 /*
637  * Saves at boot time configured settings for any netdevice.
638  */
639 int __init netdev_boot_setup(char *str)
640 {
641         int ints[5];
642         struct ifmap map;
643
644         str = get_options(str, ARRAY_SIZE(ints), ints);
645         if (!str || !*str)
646                 return 0;
647
648         /* Save settings */
649         memset(&map, 0, sizeof(map));
650         if (ints[0] > 0)
651                 map.irq = ints[1];
652         if (ints[0] > 1)
653                 map.base_addr = ints[2];
654         if (ints[0] > 2)
655                 map.mem_start = ints[3];
656         if (ints[0] > 3)
657                 map.mem_end = ints[4];
658
659         /* Add new entry to the list */
660         return netdev_boot_setup_add(str, &map);
661 }
662
663 __setup("netdev=", netdev_boot_setup);
664
665 /*******************************************************************************
666
667                             Device Interface Subroutines
668
669 *******************************************************************************/
670
671 /**
672  *      dev_get_iflink  - get 'iflink' value of a interface
673  *      @dev: targeted interface
674  *
675  *      Indicates the ifindex the interface is linked to.
676  *      Physical interfaces have the same 'ifindex' and 'iflink' values.
677  */
678
679 int dev_get_iflink(const struct net_device *dev)
680 {
681         if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
682                 return dev->netdev_ops->ndo_get_iflink(dev);
683
684         return dev->ifindex;
685 }
686 EXPORT_SYMBOL(dev_get_iflink);
687
688 /**
689  *      dev_fill_metadata_dst - Retrieve tunnel egress information.
690  *      @dev: targeted interface
691  *      @skb: The packet.
692  *
693  *      For better visibility of tunnel traffic OVS needs to retrieve
694  *      egress tunnel information for a packet. Following API allows
695  *      user to get this info.
696  */
697 int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
698 {
699         struct ip_tunnel_info *info;
700
701         if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
702                 return -EINVAL;
703
704         info = skb_tunnel_info_unclone(skb);
705         if (!info)
706                 return -ENOMEM;
707         if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
708                 return -EINVAL;
709
710         return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
711 }
712 EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
713
714 /**
715  *      __dev_get_by_name       - find a device by its name
716  *      @net: the applicable net namespace
717  *      @name: name to find
718  *
719  *      Find an interface by name. Must be called under RTNL semaphore
720  *      or @dev_base_lock. If the name is found a pointer to the device
721  *      is returned. If the name is not found then %NULL is returned. The
722  *      reference counters are not incremented so the caller must be
723  *      careful with locks.
724  */
725
726 struct net_device *__dev_get_by_name(struct net *net, const char *name)
727 {
728         struct net_device *dev;
729         struct hlist_head *head = dev_name_hash(net, name);
730
731         hlist_for_each_entry(dev, head, name_hlist)
732                 if (!strncmp(dev->name, name, IFNAMSIZ))
733                         return dev;
734
735         return NULL;
736 }
737 EXPORT_SYMBOL(__dev_get_by_name);
738
739 /**
740  *      dev_get_by_name_rcu     - find a device by its name
741  *      @net: the applicable net namespace
742  *      @name: name to find
743  *
744  *      Find an interface by name.
745  *      If the name is found a pointer to the device is returned.
746  *      If the name is not found then %NULL is returned.
747  *      The reference counters are not incremented so the caller must be
748  *      careful with locks. The caller must hold RCU lock.
749  */
750
751 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
752 {
753         struct net_device *dev;
754         struct hlist_head *head = dev_name_hash(net, name);
755
756         hlist_for_each_entry_rcu(dev, head, name_hlist)
757                 if (!strncmp(dev->name, name, IFNAMSIZ))
758                         return dev;
759
760         return NULL;
761 }
762 EXPORT_SYMBOL(dev_get_by_name_rcu);
763
/**
 *	dev_get_by_name		- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Look up an interface by name.  This can be called from any
 *	context and does its own locking.  The returned device has its
 *	usage count incremented; the caller must release it with
 *	dev_put() when it is no longer needed.  %NULL is returned if no
 *	matching device is found.
 */

struct net_device *dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *found;

	rcu_read_lock();
	found = dev_get_by_name_rcu(net, name);
	/* Take the reference while the RCU read lock is still held. */
	if (found != NULL)
		dev_hold(found);
	rcu_read_unlock();

	return found;
}
EXPORT_SYMBOL(dev_get_by_name);
788
789 /**
790  *      __dev_get_by_index - find a device by its ifindex
791  *      @net: the applicable net namespace
792  *      @ifindex: index of device
793  *
794  *      Search for an interface by index. Returns %NULL if the device
795  *      is not found or a pointer to the device. The device has not
796  *      had its reference counter increased so the caller must be careful
797  *      about locking. The caller must hold either the RTNL semaphore
798  *      or @dev_base_lock.
799  */
800
801 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
802 {
803         struct net_device *dev;
804         struct hlist_head *head = dev_index_hash(net, ifindex);
805
806         hlist_for_each_entry(dev, head, index_hlist)
807                 if (dev->ifindex == ifindex)
808                         return dev;
809
810         return NULL;
811 }
812 EXPORT_SYMBOL(__dev_get_by_index);
813
814 /**
815  *      dev_get_by_index_rcu - find a device by its ifindex
816  *      @net: the applicable net namespace
817  *      @ifindex: index of device
818  *
819  *      Search for an interface by index. Returns %NULL if the device
820  *      is not found or a pointer to the device. The device has not
821  *      had its reference counter increased so the caller must be careful
822  *      about locking. The caller must hold RCU lock.
823  */
824
825 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
826 {
827         struct net_device *dev;
828         struct hlist_head *head = dev_index_hash(net, ifindex);
829
830         hlist_for_each_entry_rcu(dev, head, index_hlist)
831                 if (dev->ifindex == ifindex)
832                         return dev;
833
834         return NULL;
835 }
836 EXPORT_SYMBOL(dev_get_by_index_rcu);
837
838
839 /**
840  *      dev_get_by_index - find a device by its ifindex
841  *      @net: the applicable net namespace
842  *      @ifindex: index of device
843  *
844  *      Search for an interface by index. Returns NULL if the device
845  *      is not found or a pointer to the device. The device returned has
846  *      had a reference added and the pointer is safe until the user calls
847  *      dev_put to indicate they have finished with it.
848  */
849
850 struct net_device *dev_get_by_index(struct net *net, int ifindex)
851 {
852         struct net_device *dev;
853
854         rcu_read_lock();
855         dev = dev_get_by_index_rcu(net, ifindex);
856         if (dev)
857                 dev_hold(dev);
858         rcu_read_unlock();
859         return dev;
860 }
861 EXPORT_SYMBOL(dev_get_by_index);
862
863 /**
864  *      netdev_get_name - get a netdevice name, knowing its ifindex.
865  *      @net: network namespace
866  *      @name: a pointer to the buffer where the name will be stored.
867  *      @ifindex: the ifindex of the interface to get the name from.
868  *
869  *      The use of raw_seqcount_begin() and cond_resched() before
870  *      retrying is required as we want to give the writers a chance
871  *      to complete when CONFIG_PREEMPT is not set.
872  */
873 int netdev_get_name(struct net *net, char *name, int ifindex)
874 {
875         struct net_device *dev;
876         unsigned int seq;
877
878 retry:
879         seq = raw_seqcount_begin(&devnet_rename_seq);
880         rcu_read_lock();
881         dev = dev_get_by_index_rcu(net, ifindex);
882         if (!dev) {
883                 rcu_read_unlock();
884                 return -ENODEV;
885         }
886
887         strcpy(name, dev->name);
888         rcu_read_unlock();
889         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
890                 cond_resched();
891                 goto retry;
892         }
893
894         return 0;
895 }
896
897 /**
898  *      dev_getbyhwaddr_rcu - find a device by its hardware address
899  *      @net: the applicable net namespace
900  *      @type: media type of device
901  *      @ha: hardware address
902  *
903  *      Search for an interface by MAC address. Returns NULL if the device
904  *      is not found or a pointer to the device.
905  *      The caller must hold RCU or RTNL.
906  *      The returned device has not had its ref count increased
907  *      and the caller must therefore be careful about locking
908  *
909  */
910
911 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
912                                        const char *ha)
913 {
914         struct net_device *dev;
915
916         for_each_netdev_rcu(net, dev)
917                 if (dev->type == type &&
918                     !memcmp(dev->dev_addr, ha, dev->addr_len))
919                         return dev;
920
921         return NULL;
922 }
923 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
924
925 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
926 {
927         struct net_device *dev;
928
929         ASSERT_RTNL();
930         for_each_netdev(net, dev)
931                 if (dev->type == type)
932                         return dev;
933
934         return NULL;
935 }
936 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
937
938 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
939 {
940         struct net_device *dev, *ret = NULL;
941
942         rcu_read_lock();
943         for_each_netdev_rcu(net, dev)
944                 if (dev->type == type) {
945                         dev_hold(dev);
946                         ret = dev;
947                         break;
948                 }
949         rcu_read_unlock();
950         return ret;
951 }
952 EXPORT_SYMBOL(dev_getfirstbyhwtype);
953
954 /**
955  *      __dev_get_by_flags - find any device with given flags
956  *      @net: the applicable net namespace
957  *      @if_flags: IFF_* values
958  *      @mask: bitmask of bits in if_flags to check
959  *
960  *      Search for any interface with the given flags. Returns NULL if a device
961  *      is not found or a pointer to the device. Must be called inside
962  *      rtnl_lock(), and result refcount is unchanged.
963  */
964
965 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
966                                       unsigned short mask)
967 {
968         struct net_device *dev, *ret;
969
970         ASSERT_RTNL();
971
972         ret = NULL;
973         for_each_netdev(net, dev) {
974                 if (((dev->flags ^ if_flags) & mask) == 0) {
975                         ret = dev;
976                         break;
977                 }
978         }
979         return ret;
980 }
981 EXPORT_SYMBOL(__dev_get_by_flags);
982
/**
 *	dev_valid_name - check if name is okay for network device
 *	@name: name string
 *
 *	Network device names need to be valid file names to
 *	allow sysfs to work.  We also disallow any kind of
 *	whitespace.
 *
 *	Returns false for an empty name, for "." and "..", for a name
 *	that is not NUL-terminated within IFNAMSIZ bytes, and for a name
 *	containing '/', ':' or a whitespace character; true otherwise.
 *
 *	At most IFNAMSIZ bytes of @name are examined, so a buffer of
 *	IFNAMSIZ bytes that lacks a terminating NUL is rejected instead
 *	of being read past its end.
 */
bool dev_valid_name(const char *name)
{
	size_t len;

	if (*name == '\0')
		return false;
	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return false;

	for (len = 0; name[len] != '\0'; len++) {
		/* A character at index IFNAMSIZ-1 leaves no room for the NUL. */
		if (len >= IFNAMSIZ - 1)
			return false;
		if (name[len] == '/' || name[len] == ':' ||
		    isspace((unsigned char)name[len]))
			return false;
	}
	return true;
}
1007 EXPORT_SYMBOL(dev_valid_name);
1008
1009 /**
1010  *      __dev_alloc_name - allocate a name for a device
1011  *      @net: network namespace to allocate the device name in
1012  *      @name: name format string
1013  *      @buf:  scratch buffer and result name string
1014  *
1015  *      Passed a format string - eg "lt%d" it will try and find a suitable
1016  *      id. It scans list of devices to build up a free map, then chooses
1017  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1018  *      while allocating the name and adding the device in order to avoid
1019  *      duplicates.
1020  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1021  *      Returns the number of the unit assigned or a negative errno code.
1022  */
1023
1024 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1025 {
1026         int i = 0;
1027         const char *p;
1028         const int max_netdevices = 8*PAGE_SIZE;
1029         unsigned long *inuse;
1030         struct net_device *d;
1031
1032         p = strnchr(name, IFNAMSIZ-1, '%');
1033         if (p) {
1034                 /*
1035                  * Verify the string as this thing may have come from
1036                  * the user.  There must be either one "%d" and no other "%"
1037                  * characters.
1038                  */
1039                 if (p[1] != 'd' || strchr(p + 2, '%'))
1040                         return -EINVAL;
1041
1042                 /* Use one page as a bit array of possible slots */
1043                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1044                 if (!inuse)
1045                         return -ENOMEM;
1046
1047                 for_each_netdev(net, d) {
1048                         if (!sscanf(d->name, name, &i))
1049                                 continue;
1050                         if (i < 0 || i >= max_netdevices)
1051                                 continue;
1052
1053                         /*  avoid cases where sscanf is not exact inverse of printf */
1054                         snprintf(buf, IFNAMSIZ, name, i);
1055                         if (!strncmp(buf, d->name, IFNAMSIZ))
1056                                 set_bit(i, inuse);
1057                 }
1058
1059                 i = find_first_zero_bit(inuse, max_netdevices);
1060                 free_page((unsigned long) inuse);
1061         }
1062
1063         if (buf != name)
1064                 snprintf(buf, IFNAMSIZ, name, i);
1065         if (!__dev_get_by_name(net, buf))
1066                 return i;
1067
1068         /* It is possible to run out of possible slots
1069          * when the name is long and there isn't enough space left
1070          * for the digits, or if all bits are used.
1071          */
1072         return -ENFILE;
1073 }
1074
1075 /**
1076  *      dev_alloc_name - allocate a name for a device
1077  *      @dev: device
1078  *      @name: name format string
1079  *
1080  *      Passed a format string - eg "lt%d" it will try and find a suitable
1081  *      id. It scans list of devices to build up a free map, then chooses
1082  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1083  *      while allocating the name and adding the device in order to avoid
1084  *      duplicates.
1085  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1086  *      Returns the number of the unit assigned or a negative errno code.
1087  */
1088
1089 int dev_alloc_name(struct net_device *dev, const char *name)
1090 {
1091         char buf[IFNAMSIZ];
1092         struct net *net;
1093         int ret;
1094
1095         BUG_ON(!dev_net(dev));
1096         net = dev_net(dev);
1097         ret = __dev_alloc_name(net, name, buf);
1098         if (ret >= 0)
1099                 strlcpy(dev->name, buf, IFNAMSIZ);
1100         return ret;
1101 }
1102 EXPORT_SYMBOL(dev_alloc_name);
1103
1104 static int dev_alloc_name_ns(struct net *net,
1105                              struct net_device *dev,
1106                              const char *name)
1107 {
1108         char buf[IFNAMSIZ];
1109         int ret;
1110
1111         ret = __dev_alloc_name(net, name, buf);
1112         if (ret >= 0)
1113                 strlcpy(dev->name, buf, IFNAMSIZ);
1114         return ret;
1115 }
1116
1117 static int dev_get_valid_name(struct net *net,
1118                               struct net_device *dev,
1119                               const char *name)
1120 {
1121         BUG_ON(!net);
1122
1123         if (!dev_valid_name(name))
1124                 return -EINVAL;
1125
1126         if (strchr(name, '%'))
1127                 return dev_alloc_name_ns(net, dev, name);
1128         else if (__dev_get_by_name(net, name))
1129                 return -EEXIST;
1130         else if (dev->name != name)
1131                 strlcpy(dev->name, name, IFNAMSIZ);
1132
1133         return 0;
1134 }
1135
1136 /**
1137  *      dev_change_name - change name of a device
1138  *      @dev: device
1139  *      @newname: name (or format string) must be at least IFNAMSIZ
1140  *
1141  *      Change name of a device, can pass format strings "eth%d".
1142  *      for wildcarding.
1143  */
1144 int dev_change_name(struct net_device *dev, const char *newname)
1145 {
1146         unsigned char old_assign_type;
1147         char oldname[IFNAMSIZ];
1148         int err = 0;
1149         int ret;
1150         struct net *net;
1151
1152         ASSERT_RTNL();
1153         BUG_ON(!dev_net(dev));
1154
1155         net = dev_net(dev);
1156         if (dev->flags & IFF_UP)
1157                 return -EBUSY;
1158
1159         write_seqcount_begin(&devnet_rename_seq);
1160
1161         if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1162                 write_seqcount_end(&devnet_rename_seq);
1163                 return 0;
1164         }
1165
1166         memcpy(oldname, dev->name, IFNAMSIZ);
1167
1168         err = dev_get_valid_name(net, dev, newname);
1169         if (err < 0) {
1170                 write_seqcount_end(&devnet_rename_seq);
1171                 return err;
1172         }
1173
1174         if (oldname[0] && !strchr(oldname, '%'))
1175                 netdev_info(dev, "renamed from %s\n", oldname);
1176
1177         old_assign_type = dev->name_assign_type;
1178         dev->name_assign_type = NET_NAME_RENAMED;
1179
1180 rollback:
1181         ret = device_rename(&dev->dev, dev->name);
1182         if (ret) {
1183                 memcpy(dev->name, oldname, IFNAMSIZ);
1184                 dev->name_assign_type = old_assign_type;
1185                 write_seqcount_end(&devnet_rename_seq);
1186                 return ret;
1187         }
1188
1189         write_seqcount_end(&devnet_rename_seq);
1190
1191         netdev_adjacent_rename_links(dev, oldname);
1192
1193         write_lock_bh(&dev_base_lock);
1194         hlist_del_rcu(&dev->name_hlist);
1195         write_unlock_bh(&dev_base_lock);
1196
1197         synchronize_rcu();
1198
1199         write_lock_bh(&dev_base_lock);
1200         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1201         write_unlock_bh(&dev_base_lock);
1202
1203         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1204         ret = notifier_to_errno(ret);
1205
1206         if (ret) {
1207                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1208                 if (err >= 0) {
1209                         err = ret;
1210                         write_seqcount_begin(&devnet_rename_seq);
1211                         memcpy(dev->name, oldname, IFNAMSIZ);
1212                         memcpy(oldname, newname, IFNAMSIZ);
1213                         dev->name_assign_type = old_assign_type;
1214                         old_assign_type = NET_NAME_RENAMED;
1215                         goto rollback;
1216                 } else {
1217                         pr_err("%s: name change rollback failed: %d\n",
1218                                dev->name, ret);
1219                 }
1220         }
1221
1222         return err;
1223 }
1224
1225 /**
1226  *      dev_set_alias - change ifalias of a device
1227  *      @dev: device
1228  *      @alias: name up to IFALIASZ
1229  *      @len: limit of bytes to copy from info
1230  *
1231  *      Set ifalias for a device,
1232  */
1233 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1234 {
1235         char *new_ifalias;
1236
1237         ASSERT_RTNL();
1238
1239         if (len >= IFALIASZ)
1240                 return -EINVAL;
1241
1242         if (!len) {
1243                 kfree(dev->ifalias);
1244                 dev->ifalias = NULL;
1245                 return 0;
1246         }
1247
1248         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1249         if (!new_ifalias)
1250                 return -ENOMEM;
1251         dev->ifalias = new_ifalias;
1252
1253         strlcpy(dev->ifalias, alias, len+1);
1254         return len;
1255 }
1256
1257
1258 /**
1259  *      netdev_features_change - device changes features
1260  *      @dev: device to cause notification
1261  *
1262  *      Called to indicate a device has changed features.
1263  */
1264 void netdev_features_change(struct net_device *dev)
1265 {
1266         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1267 }
1268 EXPORT_SYMBOL(netdev_features_change);
1269
1270 /**
1271  *      netdev_state_change - device changes state
1272  *      @dev: device to cause notification
1273  *
1274  *      Called to indicate a device has changed state. This function calls
1275  *      the notifier chains for netdev_chain and sends a NEWLINK message
1276  *      to the routing socket.
1277  */
1278 void netdev_state_change(struct net_device *dev)
1279 {
1280         if (dev->flags & IFF_UP) {
1281                 struct netdev_notifier_change_info change_info;
1282
1283                 change_info.flags_changed = 0;
1284                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1285                                               &change_info.info);
1286                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1287         }
1288 }
1289 EXPORT_SYMBOL(netdev_state_change);
1290
1291 /**
1292  *      netdev_notify_peers - notify network peers about existence of @dev
1293  *      @dev: network device
1294  *
1295  * Generate traffic such that interested network peers are aware of
1296  * @dev, such as by generating a gratuitous ARP. This may be used when
1297  * a device wants to inform the rest of the network about some sort of
1298  * reconfiguration such as a failover event or virtual machine
1299  * migration.
1300  */
1301 void netdev_notify_peers(struct net_device *dev)
1302 {
1303         rtnl_lock();
1304         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1305         rtnl_unlock();
1306 }
1307 EXPORT_SYMBOL(netdev_notify_peers);
1308
1309 static int __dev_open(struct net_device *dev)
1310 {
1311         const struct net_device_ops *ops = dev->netdev_ops;
1312         int ret;
1313
1314         ASSERT_RTNL();
1315
1316         if (!netif_device_present(dev))
1317                 return -ENODEV;
1318
1319         /* Block netpoll from trying to do any rx path servicing.
1320          * If we don't do this there is a chance ndo_poll_controller
1321          * or ndo_poll may be running while we open the device
1322          */
1323         netpoll_poll_disable(dev);
1324
1325         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1326         ret = notifier_to_errno(ret);
1327         if (ret)
1328                 return ret;
1329
1330         set_bit(__LINK_STATE_START, &dev->state);
1331
1332         if (ops->ndo_validate_addr)
1333                 ret = ops->ndo_validate_addr(dev);
1334
1335         if (!ret && ops->ndo_open)
1336                 ret = ops->ndo_open(dev);
1337
1338         netpoll_poll_enable(dev);
1339
1340         if (ret)
1341                 clear_bit(__LINK_STATE_START, &dev->state);
1342         else {
1343                 dev->flags |= IFF_UP;
1344                 dev_set_rx_mode(dev);
1345                 dev_activate(dev);
1346                 add_device_randomness(dev->dev_addr, dev->addr_len);
1347         }
1348
1349         return ret;
1350 }
1351
1352 /**
1353  *      dev_open        - prepare an interface for use.
1354  *      @dev:   device to open
1355  *
1356  *      Takes a device from down to up state. The device's private open
1357  *      function is invoked and then the multicast lists are loaded. Finally
1358  *      the device is moved into the up state and a %NETDEV_UP message is
1359  *      sent to the netdev notifier chain.
1360  *
1361  *      Calling this function on an active interface is a nop. On a failure
1362  *      a negative errno code is returned.
1363  */
1364 int dev_open(struct net_device *dev)
1365 {
1366         int ret;
1367
1368         if (dev->flags & IFF_UP)
1369                 return 0;
1370
1371         ret = __dev_open(dev);
1372         if (ret < 0)
1373                 return ret;
1374
1375         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1376         call_netdevice_notifiers(NETDEV_UP, dev);
1377
1378         return ret;
1379 }
1380 EXPORT_SYMBOL(dev_open);
1381
1382 static int __dev_close_many(struct list_head *head)
1383 {
1384         struct net_device *dev;
1385
1386         ASSERT_RTNL();
1387         might_sleep();
1388
1389         list_for_each_entry(dev, head, close_list) {
1390                 /* Temporarily disable netpoll until the interface is down */
1391                 netpoll_poll_disable(dev);
1392
1393                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1394
1395                 clear_bit(__LINK_STATE_START, &dev->state);
1396
1397                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1398                  * can be even on different cpu. So just clear netif_running().
1399                  *
1400                  * dev->stop() will invoke napi_disable() on all of it's
1401                  * napi_struct instances on this device.
1402                  */
1403                 smp_mb__after_atomic(); /* Commit netif_running(). */
1404         }
1405
1406         dev_deactivate_many(head);
1407
1408         list_for_each_entry(dev, head, close_list) {
1409                 const struct net_device_ops *ops = dev->netdev_ops;
1410
1411                 /*
1412                  *      Call the device specific close. This cannot fail.
1413                  *      Only if device is UP
1414                  *
1415                  *      We allow it to be called even after a DETACH hot-plug
1416                  *      event.
1417                  */
1418                 if (ops->ndo_stop)
1419                         ops->ndo_stop(dev);
1420
1421                 dev->flags &= ~IFF_UP;
1422                 netpoll_poll_enable(dev);
1423         }
1424
1425         return 0;
1426 }
1427
1428 static int __dev_close(struct net_device *dev)
1429 {
1430         int retval;
1431         LIST_HEAD(single);
1432
1433         list_add(&dev->close_list, &single);
1434         retval = __dev_close_many(&single);
1435         list_del(&single);
1436
1437         return retval;
1438 }
1439
1440 int dev_close_many(struct list_head *head, bool unlink)
1441 {
1442         struct net_device *dev, *tmp;
1443
1444         /* Remove the devices that don't need to be closed */
1445         list_for_each_entry_safe(dev, tmp, head, close_list)
1446                 if (!(dev->flags & IFF_UP))
1447                         list_del_init(&dev->close_list);
1448
1449         __dev_close_many(head);
1450
1451         list_for_each_entry_safe(dev, tmp, head, close_list) {
1452                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1453                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1454                 if (unlink)
1455                         list_del_init(&dev->close_list);
1456         }
1457
1458         return 0;
1459 }
1460 EXPORT_SYMBOL(dev_close_many);
1461
1462 /**
1463  *      dev_close - shutdown an interface.
1464  *      @dev: device to shutdown
1465  *
1466  *      This function moves an active device into down state. A
1467  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1468  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1469  *      chain.
1470  */
1471 int dev_close(struct net_device *dev)
1472 {
1473         if (dev->flags & IFF_UP) {
1474                 LIST_HEAD(single);
1475
1476                 list_add(&dev->close_list, &single);
1477                 dev_close_many(&single, true);
1478                 list_del(&single);
1479         }
1480         return 0;
1481 }
1482 EXPORT_SYMBOL(dev_close);
1483
1484
1485 /**
1486  *      dev_disable_lro - disable Large Receive Offload on a device
1487  *      @dev: device
1488  *
1489  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1490  *      called under RTNL.  This is needed if received packets may be
1491  *      forwarded to another interface.
1492  */
1493 void dev_disable_lro(struct net_device *dev)
1494 {
1495         struct net_device *lower_dev;
1496         struct list_head *iter;
1497
1498         dev->wanted_features &= ~NETIF_F_LRO;
1499         netdev_update_features(dev);
1500
1501         if (unlikely(dev->features & NETIF_F_LRO))
1502                 netdev_WARN(dev, "failed to disable LRO!\n");
1503
1504         netdev_for_each_lower_dev(dev, lower_dev, iter)
1505                 dev_disable_lro(lower_dev);
1506 }
1507 EXPORT_SYMBOL(dev_disable_lro);
1508
1509 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1510                                    struct net_device *dev)
1511 {
1512         struct netdev_notifier_info info;
1513
1514         netdev_notifier_info_init(&info, dev);
1515         return nb->notifier_call(nb, val, &info);
1516 }
1517
1518 static int dev_boot_phase = 1;
1519
1520 /**
1521  *      register_netdevice_notifier - register a network notifier block
1522  *      @nb: notifier
1523  *
1524  *      Register a notifier to be called when network device events occur.
1525  *      The notifier passed is linked into the kernel structures and must
1526  *      not be reused until it has been unregistered. A negative errno code
1527  *      is returned on a failure.
1528  *
1529  *      When registered all registration and up events are replayed
1530  *      to the new notifier to allow device to have a race free
1531  *      view of the network device list.
1532  */
1533
1534 int register_netdevice_notifier(struct notifier_block *nb)
1535 {
1536         struct net_device *dev;
1537         struct net_device *last;
1538         struct net *net;
1539         int err;
1540
1541         rtnl_lock();
1542         err = raw_notifier_chain_register(&netdev_chain, nb);
1543         if (err)
1544                 goto unlock;
1545         if (dev_boot_phase)
1546                 goto unlock;
1547         for_each_net(net) {
1548                 for_each_netdev(net, dev) {
1549                         err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1550                         err = notifier_to_errno(err);
1551                         if (err)
1552                                 goto rollback;
1553
1554                         if (!(dev->flags & IFF_UP))
1555                                 continue;
1556
1557                         call_netdevice_notifier(nb, NETDEV_UP, dev);
1558                 }
1559         }
1560
1561 unlock:
1562         rtnl_unlock();
1563         return err;
1564
1565 rollback:
1566         last = dev;
1567         for_each_net(net) {
1568                 for_each_netdev(net, dev) {
1569                         if (dev == last)
1570                                 goto outroll;
1571
1572                         if (dev->flags & IFF_UP) {
1573                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1574                                                         dev);
1575                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1576                         }
1577                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1578                 }
1579         }
1580
1581 outroll:
1582         raw_notifier_chain_unregister(&netdev_chain, nb);
1583         goto unlock;
1584 }
1585 EXPORT_SYMBOL(register_netdevice_notifier);
1586
1587 /**
1588  *      unregister_netdevice_notifier - unregister a network notifier block
1589  *      @nb: notifier
1590  *
1591  *      Unregister a notifier previously registered by
1592  *      register_netdevice_notifier(). The notifier is unlinked into the
1593  *      kernel structures and may then be reused. A negative errno code
1594  *      is returned on a failure.
1595  *
1596  *      After unregistering unregister and down device events are synthesized
1597  *      for all devices on the device list to the removed notifier to remove
1598  *      the need for special case cleanup code.
1599  */
1600
1601 int unregister_netdevice_notifier(struct notifier_block *nb)
1602 {
1603         struct net_device *dev;
1604         struct net *net;
1605         int err;
1606
1607         rtnl_lock();
1608         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1609         if (err)
1610                 goto unlock;
1611
1612         for_each_net(net) {
1613                 for_each_netdev(net, dev) {
1614                         if (dev->flags & IFF_UP) {
1615                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1616                                                         dev);
1617                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1618                         }
1619                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1620                 }
1621         }
1622 unlock:
1623         rtnl_unlock();
1624         return err;
1625 }
1626 EXPORT_SYMBOL(unregister_netdevice_notifier);
1627
1628 /**
1629  *      call_netdevice_notifiers_info - call all network notifier blocks
1630  *      @val: value passed unmodified to notifier function
1631  *      @dev: net_device pointer passed unmodified to notifier function
1632  *      @info: notifier information data
1633  *
1634  *      Call all network notifier blocks.  Parameters and return value
1635  *      are as for raw_notifier_call_chain().
1636  */
1637
1638 static int call_netdevice_notifiers_info(unsigned long val,
1639                                          struct net_device *dev,
1640                                          struct netdev_notifier_info *info)
1641 {
1642         ASSERT_RTNL();
1643         netdev_notifier_info_init(info, dev);
1644         return raw_notifier_call_chain(&netdev_chain, val, info);
1645 }
1646
1647 /**
1648  *      call_netdevice_notifiers - call all network notifier blocks
1649  *      @val: value passed unmodified to notifier function
1650  *      @dev: net_device pointer passed unmodified to notifier function
1651  *
1652  *      Call all network notifier blocks.  Parameters and return value
1653  *      are as for raw_notifier_call_chain().
1654  */
1655
1656 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1657 {
1658         struct netdev_notifier_info info;
1659
1660         return call_netdevice_notifiers_info(val, dev, &info);
1661 }
1662 EXPORT_SYMBOL(call_netdevice_notifiers);
1663
#ifdef CONFIG_NET_INGRESS
/* Static key adjusted by the two helpers below. */
static struct static_key ingress_needed __read_mostly;

/* Increment the ingress static key. */
void net_inc_ingress_queue(void)
{
	static_key_slow_inc(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);

/* Decrement the ingress static key. */
void net_dec_ingress_queue(void)
{
	static_key_slow_dec(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif /* CONFIG_NET_INGRESS */
1679
#ifdef CONFIG_NET_EGRESS
/* Static key adjusted by the two helpers below. */
static struct static_key egress_needed __read_mostly;

/* Increment the egress static key. */
void net_inc_egress_queue(void)
{
	static_key_slow_inc(&egress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_egress_queue);

/* Decrement the egress static key. */
void net_dec_egress_queue(void)
{
	static_key_slow_dec(&egress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif /* CONFIG_NET_EGRESS */
1695
1696 static struct static_key netstamp_needed __read_mostly;
1697 #ifdef HAVE_JUMP_LABEL
1698 /* We are not allowed to call static_key_slow_dec() from irq context
1699  * If net_disable_timestamp() is called from irq context, defer the
1700  * static_key_slow_dec() calls.
1701  */
1702 static atomic_t netstamp_needed_deferred;
1703 #endif
1704
1705 void net_enable_timestamp(void)
1706 {
1707 #ifdef HAVE_JUMP_LABEL
1708         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1709
1710         if (deferred) {
1711                 while (--deferred)
1712                         static_key_slow_dec(&netstamp_needed);
1713                 return;
1714         }
1715 #endif
1716         static_key_slow_inc(&netstamp_needed);
1717 }
1718 EXPORT_SYMBOL(net_enable_timestamp);
1719
1720 void net_disable_timestamp(void)
1721 {
1722 #ifdef HAVE_JUMP_LABEL
1723         if (in_interrupt()) {
1724                 atomic_inc(&netstamp_needed_deferred);
1725                 return;
1726         }
1727 #endif
1728         static_key_slow_dec(&netstamp_needed);
1729 }
1730 EXPORT_SYMBOL(net_disable_timestamp);
1731
1732 static inline void net_timestamp_set(struct sk_buff *skb)
1733 {
1734         skb->tstamp = 0;
1735         if (static_key_false(&netstamp_needed))
1736                 __net_timestamp(skb);
1737 }
1738
/*
 * net_timestamp_check - stamp SKB if timestamping is enabled, COND holds
 * and SKB has no timestamp yet.
 *
 * Wrapped in do { } while (0) so the expansion behaves as a single
 * statement: a bare "if" would capture an "else" that follows the macro
 * in the caller. Use it with a trailing semicolon.
 */
#define net_timestamp_check(COND, SKB)				\
	do {							\
		if (static_key_false(&netstamp_needed)) {	\
			if ((COND) && !(SKB)->tstamp)		\
				__net_timestamp(SKB);		\
		}						\
	} while (0)
1744
1745 bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
1746 {
1747         unsigned int len;
1748
1749         if (!(dev->flags & IFF_UP))
1750                 return false;
1751
1752         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1753         if (skb->len <= len)
1754                 return true;
1755
1756         /* if TSO is enabled, we don't care about the length as the packet
1757          * could be forwarded without being segmented before
1758          */
1759         if (skb_is_gso(skb))
1760                 return true;
1761
1762         return false;
1763 }
1764 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1765
1766 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1767 {
1768         int ret = ____dev_forward_skb(dev, skb);
1769
1770         if (likely(!ret)) {
1771                 skb->protocol = eth_type_trans(skb, dev);
1772                 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1773         }
1774
1775         return ret;
1776 }
1777 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1778
/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped, but freed)
 *
 * Meant for a device's start_xmit function that wants to inject an skb
 * into the receive queue of another device.
 *
 * Because the receiving device may live in a different namespace, all
 * information in the skb that could impact namespace isolation has to
 * be cleared.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	int err = __dev_forward_skb(dev, skb);

	if (err)
		return err;

	return netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
1802
1803 static inline int deliver_skb(struct sk_buff *skb,
1804                               struct packet_type *pt_prev,
1805                               struct net_device *orig_dev)
1806 {
1807         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1808                 return -ENOMEM;
1809         atomic_inc(&skb->users);
1810         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1811 }
1812
1813 static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1814                                           struct packet_type **pt,
1815                                           struct net_device *orig_dev,
1816                                           __be16 type,
1817                                           struct list_head *ptype_list)
1818 {
1819         struct packet_type *ptype, *pt_prev = *pt;
1820
1821         list_for_each_entry_rcu(ptype, ptype_list, list) {
1822                 if (ptype->type != type)
1823                         continue;
1824                 if (pt_prev)
1825                         deliver_skb(skb, pt_prev, orig_dev);
1826                 pt_prev = ptype;
1827         }
1828         *pt = pt_prev;
1829 }
1830
1831 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1832 {
1833         if (!ptype->af_packet_priv || !skb->sk)
1834                 return false;
1835
1836         if (ptype->id_match)
1837                 return ptype->id_match(ptype, skb->sk);
1838         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1839                 return true;
1840
1841         return false;
1842 }
1843
1844 /*
1845  *      Support routine. Sends outgoing frames to any network
1846  *      taps currently in use.
1847  */
1848
1849 void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1850 {
1851         struct packet_type *ptype;
1852         struct sk_buff *skb2 = NULL;
1853         struct packet_type *pt_prev = NULL;
1854         struct list_head *ptype_list = &ptype_all;
1855
1856         rcu_read_lock();
1857 again:
1858         list_for_each_entry_rcu(ptype, ptype_list, list) {
1859                 /* Never send packets back to the socket
1860                  * they originated from - MvS (miquels@drinkel.ow.org)
1861                  */
1862                 if (skb_loop_sk(ptype, skb))
1863                         continue;
1864
1865                 if (pt_prev) {
1866                         deliver_skb(skb2, pt_prev, skb->dev);
1867                         pt_prev = ptype;
1868                         continue;
1869                 }
1870
1871                 /* need to clone skb, done only once */
1872                 skb2 = skb_clone(skb, GFP_ATOMIC);
1873                 if (!skb2)
1874                         goto out_unlock;
1875
1876                 net_timestamp_set(skb2);
1877
1878                 /* skb->nh should be correctly
1879                  * set by sender, so that the second statement is
1880                  * just protection against buggy protocols.
1881                  */
1882                 skb_reset_mac_header(skb2);
1883
1884                 if (skb_network_header(skb2) < skb2->data ||
1885                     skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1886                         net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1887                                              ntohs(skb2->protocol),
1888                                              dev->name);
1889                         skb_reset_network_header(skb2);
1890                 }
1891
1892                 skb2->transport_header = skb2->network_header;
1893                 skb2->pkt_type = PACKET_OUTGOING;
1894                 pt_prev = ptype;
1895         }
1896
1897         if (ptype_list == &ptype_all) {
1898                 ptype_list = &dev->ptype_all;
1899                 goto again;
1900         }
1901 out_unlock:
1902         if (pt_prev)
1903                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1904         rcu_read_unlock();
1905 }
1906 EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
1907
1908 /**
1909  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1910  * @dev: Network device
1911  * @txq: number of queues available
1912  *
1913  * If real_num_tx_queues is changed the tc mappings may no longer be
1914  * valid. To resolve this verify the tc mapping remains valid and if
1915  * not NULL the mapping. With no priorities mapping to this
1916  * offset/count pair it will no longer be used. In the worst case TC0
1917  * is invalid nothing can be done so disable priority mappings. If is
1918  * expected that drivers will fix this mapping if they can before
1919  * calling netif_set_real_num_tx_queues.
1920  */
1921 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1922 {
1923         int i;
1924         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1925
1926         /* If TC0 is invalidated disable TC mapping */
1927         if (tc->offset + tc->count > txq) {
1928                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1929                 dev->num_tc = 0;
1930                 return;
1931         }
1932
1933         /* Invalidated prio to tc mappings set to TC0 */
1934         for (i = 1; i < TC_BITMASK + 1; i++) {
1935                 int q = netdev_get_prio_tc_map(dev, i);
1936
1937                 tc = &dev->tc_to_txq[q];
1938                 if (tc->offset + tc->count > txq) {
1939                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1940                                 i, q);
1941                         netdev_set_prio_tc_map(dev, i, 0);
1942                 }
1943         }
1944 }
1945
1946 int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
1947 {
1948         if (dev->num_tc) {
1949                 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1950                 int i;
1951
1952                 for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
1953                         if ((txq - tc->offset) < tc->count)
1954                                 return i;
1955                 }
1956
1957                 return -1;
1958         }
1959
1960         return 0;
1961 }
1962
1963 #ifdef CONFIG_XPS
1964 static DEFINE_MUTEX(xps_map_mutex);
1965 #define xmap_dereference(P)             \
1966         rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1967
1968 static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
1969                              int tci, u16 index)
1970 {
1971         struct xps_map *map = NULL;
1972         int pos;
1973
1974         if (dev_maps)
1975                 map = xmap_dereference(dev_maps->cpu_map[tci]);
1976         if (!map)
1977                 return false;
1978
1979         for (pos = map->len; pos--;) {
1980                 if (map->queues[pos] != index)
1981                         continue;
1982
1983                 if (map->len > 1) {
1984                         map->queues[pos] = map->queues[--map->len];
1985                         break;
1986                 }
1987
1988                 RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL);
1989                 kfree_rcu(map, rcu);
1990                 return false;
1991         }
1992
1993         return true;
1994 }
1995
1996 static bool remove_xps_queue_cpu(struct net_device *dev,
1997                                  struct xps_dev_maps *dev_maps,
1998                                  int cpu, u16 offset, u16 count)
1999 {
2000         int num_tc = dev->num_tc ? : 1;
2001         bool active = false;
2002         int tci;
2003
2004         for (tci = cpu * num_tc; num_tc--; tci++) {
2005                 int i, j;
2006
2007                 for (i = count, j = offset; i--; j++) {
2008                         if (!remove_xps_queue(dev_maps, cpu, j))
2009                                 break;
2010                 }
2011
2012                 active |= i < 0;
2013         }
2014
2015         return active;
2016 }
2017
2018 static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
2019                                    u16 count)
2020 {
2021         struct xps_dev_maps *dev_maps;
2022         int cpu, i;
2023         bool active = false;
2024
2025         mutex_lock(&xps_map_mutex);
2026         dev_maps = xmap_dereference(dev->xps_maps);
2027
2028         if (!dev_maps)
2029                 goto out_no_maps;
2030
2031         for_each_possible_cpu(cpu)
2032                 active |= remove_xps_queue_cpu(dev, dev_maps, cpu,
2033                                                offset, count);
2034
2035         if (!active) {
2036                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2037                 kfree_rcu(dev_maps, rcu);
2038         }
2039
2040         for (i = offset + (count - 1); count--; i--)
2041                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
2042                                              NUMA_NO_NODE);
2043
2044 out_no_maps:
2045         mutex_unlock(&xps_map_mutex);
2046 }
2047
2048 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2049 {
2050         netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
2051 }
2052
2053 static struct xps_map *expand_xps_map(struct xps_map *map,
2054                                       int cpu, u16 index)
2055 {
2056         struct xps_map *new_map;
2057         int alloc_len = XPS_MIN_MAP_ALLOC;
2058         int i, pos;
2059
2060         for (pos = 0; map && pos < map->len; pos++) {
2061                 if (map->queues[pos] != index)
2062                         continue;
2063                 return map;
2064         }
2065
2066         /* Need to add queue to this CPU's existing map */
2067         if (map) {
2068                 if (pos < map->alloc_len)
2069                         return map;
2070
2071                 alloc_len = map->alloc_len * 2;
2072         }
2073
2074         /* Need to allocate new map to store queue on this CPU's map */
2075         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2076                                cpu_to_node(cpu));
2077         if (!new_map)
2078                 return NULL;
2079
2080         for (i = 0; i < pos; i++)
2081                 new_map->queues[i] = map->queues[i];
2082         new_map->alloc_len = alloc_len;
2083         new_map->len = pos;
2084
2085         return new_map;
2086 }
2087
2088 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2089                         u16 index)
2090 {
2091         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2092         int i, cpu, tci, numa_node_id = -2;
2093         int maps_sz, num_tc = 1, tc = 0;
2094         struct xps_map *map, *new_map;
2095         bool active = false;
2096
2097         if (dev->num_tc) {
2098                 num_tc = dev->num_tc;
2099                 tc = netdev_txq_to_tc(dev, index);
2100                 if (tc < 0)
2101                         return -EINVAL;
2102         }
2103
2104         maps_sz = XPS_DEV_MAPS_SIZE(num_tc);
2105         if (maps_sz < L1_CACHE_BYTES)
2106                 maps_sz = L1_CACHE_BYTES;
2107
2108         mutex_lock(&xps_map_mutex);
2109
2110         dev_maps = xmap_dereference(dev->xps_maps);
2111
2112         /* allocate memory for queue storage */
2113         for_each_cpu_and(cpu, cpu_online_mask, mask) {
2114                 if (!new_dev_maps)
2115                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2116                 if (!new_dev_maps) {
2117                         mutex_unlock(&xps_map_mutex);
2118                         return -ENOMEM;
2119                 }
2120
2121                 tci = cpu * num_tc + tc;
2122                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[tci]) :
2123                                  NULL;
2124
2125                 map = expand_xps_map(map, cpu, index);
2126                 if (!map)
2127                         goto error;
2128
2129                 RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2130         }
2131
2132         if (!new_dev_maps)
2133                 goto out_no_new_maps;
2134
2135         for_each_possible_cpu(cpu) {
2136                 /* copy maps belonging to foreign traffic classes */
2137                 for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) {
2138                         /* fill in the new device map from the old device map */
2139                         map = xmap_dereference(dev_maps->cpu_map[tci]);
2140                         RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2141                 }
2142
2143                 /* We need to explicitly update tci as prevous loop
2144                  * could break out early if dev_maps is NULL.
2145                  */
2146                 tci = cpu * num_tc + tc;
2147
2148                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2149                         /* add queue to CPU maps */
2150                         int pos = 0;
2151
2152                         map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2153                         while ((pos < map->len) && (map->queues[pos] != index))
2154                                 pos++;
2155
2156                         if (pos == map->len)
2157                                 map->queues[map->len++] = index;
2158 #ifdef CONFIG_NUMA
2159                         if (numa_node_id == -2)
2160                                 numa_node_id = cpu_to_node(cpu);
2161                         else if (numa_node_id != cpu_to_node(cpu))
2162                                 numa_node_id = -1;
2163 #endif
2164                 } else if (dev_maps) {
2165                         /* fill in the new device map from the old device map */
2166                         map = xmap_dereference(dev_maps->cpu_map[tci]);
2167                         RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2168                 }
2169
2170                 /* copy maps belonging to foreign traffic classes */
2171                 for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
2172                         /* fill in the new device map from the old device map */
2173                         map = xmap_dereference(dev_maps->cpu_map[tci]);
2174                         RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2175                 }
2176         }
2177
2178         rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2179
2180         /* Cleanup old maps */
2181         if (!dev_maps)
2182                 goto out_no_old_maps;
2183
2184         for_each_possible_cpu(cpu) {
2185                 for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
2186                         new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2187                         map = xmap_dereference(dev_maps->cpu_map[tci]);
2188                         if (map && map != new_map)
2189                                 kfree_rcu(map, rcu);
2190                 }
2191         }
2192
2193         kfree_rcu(dev_maps, rcu);
2194
2195 out_no_old_maps:
2196         dev_maps = new_dev_maps;
2197         active = true;
2198
2199 out_no_new_maps:
2200         /* update Tx queue numa node */
2201         netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2202                                      (numa_node_id >= 0) ? numa_node_id :
2203                                      NUMA_NO_NODE);
2204
2205         if (!dev_maps)
2206                 goto out_no_maps;
2207
2208         /* removes queue from unused CPUs */
2209         for_each_possible_cpu(cpu) {
2210                 for (i = tc, tci = cpu * num_tc; i--; tci++)
2211                         active |= remove_xps_queue(dev_maps, tci, index);
2212                 if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu))
2213                         active |= remove_xps_queue(dev_maps, tci, index);
2214                 for (i = num_tc - tc, tci++; --i; tci++)
2215                         active |= remove_xps_queue(dev_maps, tci, index);
2216         }
2217
2218         /* free map if not active */
2219         if (!active) {
2220                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2221                 kfree_rcu(dev_maps, rcu);
2222         }
2223
2224 out_no_maps:
2225         mutex_unlock(&xps_map_mutex);
2226
2227         return 0;
2228 error:
2229         /* remove any maps that we added */
2230         for_each_possible_cpu(cpu) {
2231                 for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
2232                         new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2233                         map = dev_maps ?
2234                               xmap_dereference(dev_maps->cpu_map[tci]) :
2235                               NULL;
2236                         if (new_map && new_map != map)
2237                                 kfree(new_map);
2238                 }
2239         }
2240
2241         mutex_unlock(&xps_map_mutex);
2242
2243         kfree(new_dev_maps);
2244         return -ENOMEM;
2245 }
2246 EXPORT_SYMBOL(netif_set_xps_queue);
2247
2248 #endif
2249 void netdev_reset_tc(struct net_device *dev)
2250 {
2251 #ifdef CONFIG_XPS
2252         netif_reset_xps_queues_gt(dev, 0);
2253 #endif
2254         dev->num_tc = 0;
2255         memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
2256         memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
2257 }
2258 EXPORT_SYMBOL(netdev_reset_tc);
2259
2260 int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
2261 {
2262         if (tc >= dev->num_tc)
2263                 return -EINVAL;
2264
2265 #ifdef CONFIG_XPS
2266         netif_reset_xps_queues(dev, offset, count);
2267 #endif
2268         dev->tc_to_txq[tc].count = count;
2269         dev->tc_to_txq[tc].offset = offset;
2270         return 0;
2271 }
2272 EXPORT_SYMBOL(netdev_set_tc_queue);
2273
2274 int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
2275 {
2276         if (num_tc > TC_MAX_QUEUE)
2277                 return -EINVAL;
2278
2279 #ifdef CONFIG_XPS
2280         netif_reset_xps_queues_gt(dev, 0);
2281 #endif
2282         dev->num_tc = num_tc;
2283         return 0;
2284 }
2285 EXPORT_SYMBOL(netdev_set_num_tc);
2286
2287 /*
2288  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2289  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2290  */
2291 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2292 {
2293         int rc;
2294
2295         if (txq < 1 || txq > dev->num_tx_queues)
2296                 return -EINVAL;
2297
2298         if (dev->reg_state == NETREG_REGISTERED ||
2299             dev->reg_state == NETREG_UNREGISTERING) {
2300                 ASSERT_RTNL();
2301
2302                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2303                                                   txq);
2304                 if (rc)
2305                         return rc;
2306
2307                 if (dev->num_tc)
2308                         netif_setup_tc(dev, txq);
2309
2310                 if (txq < dev->real_num_tx_queues) {
2311                         qdisc_reset_all_tx_gt(dev, txq);
2312 #ifdef CONFIG_XPS
2313                         netif_reset_xps_queues_gt(dev, txq);
2314 #endif
2315                 }
2316         }
2317
2318         dev->real_num_tx_queues = txq;
2319         return 0;
2320 }
2321 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2322
2323 #ifdef CONFIG_SYSFS
2324 /**
2325  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2326  *      @dev: Network device
2327  *      @rxq: Actual number of RX queues
2328  *
2329  *      This must be called either with the rtnl_lock held or before
2330  *      registration of the net device.  Returns 0 on success, or a
2331  *      negative error code.  If called before registration, it always
2332  *      succeeds.
2333  */
2334 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2335 {
2336         int rc;
2337
2338         if (rxq < 1 || rxq > dev->num_rx_queues)
2339                 return -EINVAL;
2340
2341         if (dev->reg_state == NETREG_REGISTERED) {
2342                 ASSERT_RTNL();
2343
2344                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2345                                                   rxq);
2346                 if (rc)
2347                         return rc;
2348         }
2349
2350         dev->real_num_rx_queues = rxq;
2351         return 0;
2352 }
2353 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2354 #endif
2355
2356 /**
2357  * netif_get_num_default_rss_queues - default number of RSS queues
2358  *
2359  * This routine should set an upper limit on the number of RSS queues
2360  * used by default by multiqueue devices.
2361  */
2362 int netif_get_num_default_rss_queues(void)
2363 {
2364         return is_kdump_kernel() ?
2365                 1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2366 }
2367 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2368
2369 static void __netif_reschedule(struct Qdisc *q)
2370 {
2371         struct softnet_data *sd;
2372         unsigned long flags;
2373
2374         local_irq_save(flags);
2375         sd = this_cpu_ptr(&softnet_data);
2376         q->next_sched = NULL;
2377         *sd->output_queue_tailp = q;
2378         sd->output_queue_tailp = &q->next_sched;
2379         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2380         local_irq_restore(flags);
2381 }
2382
2383 void __netif_schedule(struct Qdisc *q)
2384 {
2385         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2386                 __netif_reschedule(q);
2387 }
2388 EXPORT_SYMBOL(__netif_schedule);
2389
2390 struct dev_kfree_skb_cb {
2391         enum skb_free_reason reason;
2392 };
2393
2394 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2395 {
2396         return (struct dev_kfree_skb_cb *)skb->cb;
2397 }
2398
2399 void netif_schedule_queue(struct netdev_queue *txq)
2400 {
2401         rcu_read_lock();
2402         if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2403                 struct Qdisc *q = rcu_dereference(txq->qdisc);
2404
2405                 __netif_schedule(q);
2406         }
2407         rcu_read_unlock();
2408 }
2409 EXPORT_SYMBOL(netif_schedule_queue);
2410
2411 /**
2412  *      netif_wake_subqueue - allow sending packets on subqueue
2413  *      @dev: network device
2414  *      @queue_index: sub queue index
2415  *
2416  * Resume individual transmit queue of a device with multiple transmit queues.
2417  */
2418 void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2419 {
2420         struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2421
2422         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2423                 struct Qdisc *q;
2424
2425                 rcu_read_lock();
2426                 q = rcu_dereference(txq->qdisc);
2427                 __netif_schedule(q);
2428                 rcu_read_unlock();
2429         }
2430 }
2431 EXPORT_SYMBOL(netif_wake_subqueue);
2432
2433 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2434 {
2435         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2436                 struct Qdisc *q;
2437
2438                 rcu_read_lock();
2439                 q = rcu_dereference(dev_queue->qdisc);
2440                 __netif_schedule(q);
2441                 rcu_read_unlock();
2442         }
2443 }
2444 EXPORT_SYMBOL(netif_tx_wake_queue);
2445
2446 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2447 {
2448         unsigned long flags;
2449
2450         if (likely(atomic_read(&skb->users) == 1)) {
2451                 smp_rmb();
2452                 atomic_set(&skb->users, 0);
2453         } else if (likely(!atomic_dec_and_test(&skb->users))) {
2454                 return;
2455         }
2456         get_kfree_skb_cb(skb)->reason = reason;
2457         local_irq_save(flags);
2458         skb->next = __this_cpu_read(softnet_data.completion_queue);
2459         __this_cpu_write(softnet_data.completion_queue, skb);
2460         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2461         local_irq_restore(flags);
2462 }
2463 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2464
2465 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2466 {
2467         if (in_irq() || irqs_disabled())
2468                 __dev_kfree_skb_irq(skb, reason);
2469         else
2470                 dev_kfree_skb(skb);
2471 }
2472 EXPORT_SYMBOL(__dev_kfree_skb_any);
2473
2474
2475 /**
2476  * netif_device_detach - mark device as removed
2477  * @dev: network device
2478  *
2479  * Mark device as removed from system and therefore no longer available.
2480  */
2481 void netif_device_detach(struct net_device *dev)
2482 {
2483         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2484             netif_running(dev)) {
2485                 netif_tx_stop_all_queues(dev);
2486         }
2487 }
2488 EXPORT_SYMBOL(netif_device_detach);
2489
2490 /**
2491  * netif_device_attach - mark device as attached
2492  * @dev: network device
2493  *
2494  * Mark device as attached from system and restart if needed.
2495  */
2496 void netif_device_attach(struct net_device *dev)
2497 {
2498         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2499             netif_running(dev)) {
2500                 netif_tx_wake_all_queues(dev);
2501                 __netdev_watchdog_up(dev);
2502         }
2503 }
2504 EXPORT_SYMBOL(netif_device_attach);
2505
2506 /*
2507  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2508  * to be used as a distribution range.
2509  */
2510 u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2511                   unsigned int num_tx_queues)
2512 {
2513         u32 hash;
2514         u16 qoffset = 0;
2515         u16 qcount = num_tx_queues;
2516
2517         if (skb_rx_queue_recorded(skb)) {
2518                 hash = skb_get_rx_queue(skb);
2519                 while (unlikely(hash >= num_tx_queues))
2520                         hash -= num_tx_queues;
2521                 return hash;
2522         }
2523
2524         if (dev->num_tc) {
2525                 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2526                 qoffset = dev->tc_to_txq[tc].offset;
2527                 qcount = dev->tc_to_txq[tc].count;
2528         }
2529
2530         return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2531 }
2532 EXPORT_SYMBOL(__skb_tx_hash);
2533
2534 static void skb_warn_bad_offload(const struct sk_buff *skb)
2535 {
2536         static const netdev_features_t null_features;
2537         struct net_device *dev = skb->dev;
2538         const char *name = "";
2539
2540         if (!net_ratelimit())
2541                 return;
2542
2543         if (dev) {
2544                 if (dev->dev.parent)
2545                         name = dev_driver_string(dev->dev.parent);
2546                 else
2547                         name = netdev_name(dev);
2548         }
2549         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2550              "gso_type=%d ip_summed=%d\n",
2551              name, dev ? &dev->features : &null_features,
2552              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2553              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2554              skb_shinfo(skb)->gso_type, skb->ip_summed);
2555 }
2556
2557 /*
2558  * Invalidate hardware checksum when packet is to be mangled, and
2559  * complete checksum manually on outgoing path.
2560  */
2561 int skb_checksum_help(struct sk_buff *skb)
2562 {
2563         __wsum csum;
2564         int ret = 0, offset;
2565
2566         if (skb->ip_summed == CHECKSUM_COMPLETE)
2567                 goto out_set_summed;
2568
2569         if (unlikely(skb_shinfo(skb)->gso_size)) {
2570                 skb_warn_bad_offload(skb);
2571                 return -EINVAL;
2572         }
2573
2574         /* Before computing a checksum, we should make sure no frag could
2575          * be modified by an external entity : checksum could be wrong.
2576          */
2577         if (skb_has_shared_frag(skb)) {
2578                 ret = __skb_linearize(skb);
2579                 if (ret)
2580                         goto out;
2581         }
2582
2583         offset = skb_checksum_start_offset(skb);
2584         BUG_ON(offset >= skb_headlen(skb));
2585         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2586
2587         offset += skb->csum_offset;
2588         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2589
2590         if (skb_cloned(skb) &&
2591             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2592                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2593                 if (ret)
2594                         goto out;
2595         }
2596
2597         *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
2598 out_set_summed:
2599         skb->ip_summed = CHECKSUM_NONE;
2600 out:
2601         return ret;
2602 }
2603 EXPORT_SYMBOL(skb_checksum_help);
2604
2605 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2606 {
2607         __be16 type = skb->protocol;
2608
2609         /* Tunnel gso handlers can set protocol to ethernet. */
2610         if (type == htons(ETH_P_TEB)) {
2611                 struct ethhdr *eth;
2612
2613                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2614                         return 0;
2615
2616                 eth = (struct ethhdr *)skb_mac_header(skb);
2617                 type = eth->h_proto;
2618         }
2619
2620         return __vlan_get_protocol(skb, type, depth);
2621 }
2622
2623 /**
2624  *      skb_mac_gso_segment - mac layer segmentation handler.
2625  *      @skb: buffer to segment
2626  *      @features: features for the output path (see dev->features)
2627  */
2628 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2629                                     netdev_features_t features)
2630 {
2631         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2632         struct packet_offload *ptype;
2633         int vlan_depth = skb->mac_len;
2634         __be16 type = skb_network_protocol(skb, &vlan_depth);
2635
2636         if (unlikely(!type))
2637                 return ERR_PTR(-EINVAL);
2638
2639         __skb_pull(skb, vlan_depth);
2640
2641         rcu_read_lock();
2642         list_for_each_entry_rcu(ptype, &offload_base, list) {
2643                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2644                         segs = ptype->callbacks.gso_segment(skb, features);
2645                         break;
2646                 }
2647         }
2648         rcu_read_unlock();
2649
2650         __skb_push(skb, skb->data - skb_mac_header(skb));
2651
2652         return segs;
2653 }
2654 EXPORT_SYMBOL(skb_mac_gso_segment);
2655
2656
2657 /* openvswitch calls this on rx path, so we need a different check.
2658  */
2659 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2660 {
2661         if (tx_path)
2662                 return skb->ip_summed != CHECKSUM_PARTIAL;
2663         else
2664                 return skb->ip_summed == CHECKSUM_NONE;
2665 }
2666
2667 /**
2668  *      __skb_gso_segment - Perform segmentation on skb.
2669  *      @skb: buffer to segment
2670  *      @features: features for the output path (see dev->features)
2671  *      @tx_path: whether it is called in TX path
2672  *
2673  *      This function segments the given skb and returns a list of segments.
2674  *
2675  *      It may return NULL if the skb requires no segmentation.  This is
2676  *      only possible when GSO is used for verifying header integrity.
2677  *
2678  *      Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
2679  */
2680 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2681                                   netdev_features_t features, bool tx_path)
2682 {
2683         if (unlikely(skb_needs_check(skb, tx_path))) {
2684                 int err;
2685
2686                 skb_warn_bad_offload(skb);
2687
2688                 err = skb_cow_head(skb, 0);
2689                 if (err < 0)
2690                         return ERR_PTR(err);
2691         }
2692
2693         /* Only report GSO partial support if it will enable us to
2694          * support segmentation on this frame without needing additional
2695          * work.
2696          */
2697         if (features & NETIF_F_GSO_PARTIAL) {
2698                 netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
2699                 struct net_device *dev = skb->dev;
2700
2701                 partial_features |= dev->features & dev->gso_partial_features;
2702                 if (!skb_gso_ok(skb, features | partial_features))
2703                         features &= ~NETIF_F_GSO_PARTIAL;
2704         }
2705
2706         BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
2707                      sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
2708
2709         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2710         SKB_GSO_CB(skb)->encap_level = 0;
2711
2712         skb_reset_mac_header(skb);
2713         skb_reset_mac_len(skb);
2714
2715         return skb_mac_gso_segment(skb, features);
2716 }
2717 EXPORT_SYMBOL(__skb_gso_segment);
2718
/* Take action when hardware reception checksum errors are detected:
 * log the device name (or "<unknown>" when @dev is NULL) and dump the
 * stack, subject to net_ratelimit().
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (!net_ratelimit())
		return;

	pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
2730
/* Report whether @skb has a page fragment that @dev cannot DMA from.
 *
 * Only compiled in with CONFIG_HIGHMEM; otherwise always returns 0.
 * Returns 1 when
 *  - the device lacks NETIF_F_HIGHDMA and a fragment lives in highmem, or
 *  - PCI_DMA_BUS_IS_PHYS and a fragment's page extends beyond the parent
 *    device's dma_mask (or the parent has no dma_mask at all).
 *
 * Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	int i;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (i = 0; i < shinfo->nr_frags; i++) {
			if (PageHighMem(skb_frag_page(&shinfo->frags[i])))
				return 1;
		}
	}

	if (PCI_DMA_BUS_IS_PHYS) {
		struct device *parent = dev->dev.parent;

		/* No parent device, nothing to compare the address against. */
		if (!parent)
			return 0;

		for (i = 0; i < shinfo->nr_frags; i++) {
			dma_addr_t addr = page_to_phys(skb_frag_page(&shinfo->frags[i]));

			if (!parent->dma_mask ||
			    addr + PAGE_SIZE - 1 > *parent->dma_mask)
				return 1;
		}
	}
#endif
	return 0;
}
2763
2764 /* If MPLS offload request, verify we are testing hardware MPLS features
2765  * instead of standard features for the netdev.
2766  */
2767 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2768 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2769                                            netdev_features_t features,
2770                                            __be16 type)
2771 {
2772         if (eth_p_mpls(type))
2773                 features &= skb->dev->mpls_features;
2774
2775         return features;
2776 }
2777 #else
2778 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2779                                            netdev_features_t features,
2780                                            __be16 type)
2781 {
2782         return features;
2783 }
2784 #endif
2785
2786 static netdev_features_t harmonize_features(struct sk_buff *skb,
2787         netdev_features_t features)
2788 {
2789         int tmp;
2790         __be16 type;
2791
2792         type = skb_network_protocol(skb, &tmp);
2793         features = net_mpls_features(skb, features, type);
2794
2795         if (skb->ip_summed != CHECKSUM_NONE &&
2796             !can_checksum_protocol(features, type)) {
2797                 features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
2798         } else if (illegal_highdma(skb->dev, skb)) {
2799                 features &= ~NETIF_F_SG;
2800         }
2801
2802         return features;
2803 }
2804
2805 netdev_features_t passthru_features_check(struct sk_buff *skb,
2806                                           struct net_device *dev,
2807                                           netdev_features_t features)
2808 {
2809         return features;
2810 }
2811 EXPORT_SYMBOL(passthru_features_check);
2812
2813 static netdev_features_t dflt_features_check(const struct sk_buff *skb,
2814                                              struct net_device *dev,
2815                                              netdev_features_t features)
2816 {
2817         return vlan_features_check(skb, features);
2818 }
2819
2820 static netdev_features_t gso_features_check(const struct sk_buff *skb,
2821                                             struct net_device *dev,
2822                                             netdev_features_t features)
2823 {
2824         u16 gso_segs = skb_shinfo(skb)->gso_segs;
2825
2826         if (gso_segs > dev->gso_max_segs)
2827                 return features & ~NETIF_F_GSO_MASK;
2828
2829         /* Support for GSO partial features requires software
2830          * intervention before we can actually process the packets
2831          * so we need to strip support for any partial features now
2832          * and we can pull them back in after we have partially
2833          * segmented the frame.
2834          */
2835         if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
2836                 features &= ~dev->gso_partial_features;
2837
2838         /* Make sure to clear the IPv4 ID mangling feature if the
2839          * IPv4 header has the potential to be fragmented.
2840          */
2841         if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
2842                 struct iphdr *iph = skb->encapsulation ?
2843                                     inner_ip_hdr(skb) : ip_hdr(skb);
2844
2845                 if (!(iph->frag_off & htons(IP_DF)))
2846                         features &= ~NETIF_F_TSO_MANGLEID;
2847         }
2848
2849         return features;
2850 }
2851
2852 netdev_features_t netif_skb_features(struct sk_buff *skb)
2853 {
2854         struct net_device *dev = skb->dev;
2855         netdev_features_t features = dev->features;
2856
2857         if (skb_is_gso(skb))
2858                 features = gso_features_check(skb, dev, features);
2859
2860         /* If encapsulation offload request, verify we are testing
2861          * hardware encapsulation features instead of standard
2862          * features for the netdev
2863          */
2864         if (skb->encapsulation)
2865                 features &= dev->hw_enc_features;
2866
2867         if (skb_vlan_tagged(skb))
2868                 features = netdev_intersect_features(features,
2869                                                      dev->vlan_features |
2870                                                      NETIF_F_HW_VLAN_CTAG_TX |
2871                                                      NETIF_F_HW_VLAN_STAG_TX);
2872
2873         if (dev->netdev_ops->ndo_features_check)
2874                 features &= dev->netdev_ops->ndo_features_check(skb, dev,
2875                                                                 features);
2876         else
2877                 features &= dflt_features_check(skb, dev, features);
2878
2879         return harmonize_features(skb, features);
2880 }
2881 EXPORT_SYMBOL(netif_skb_features);
2882
2883 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2884                     struct netdev_queue *txq, bool more)
2885 {
2886         unsigned int len;
2887         int rc;
2888
2889         if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2890                 dev_queue_xmit_nit(skb, dev);
2891
2892         len = skb->len;
2893         trace_net_dev_start_xmit(skb, dev);
2894         rc = netdev_start_xmit(skb, dev, txq, more);
2895         trace_net_dev_xmit(skb, rc, dev, len);
2896
2897         return rc;
2898 }
2899
2900 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2901                                     struct netdev_queue *txq, int *ret)
2902 {
2903         struct sk_buff *skb = first;
2904         int rc = NETDEV_TX_OK;
2905
2906         while (skb) {
2907                 struct sk_buff *next = skb->next;
2908
2909                 skb->next = NULL;
2910                 rc = xmit_one(skb, dev, txq, next != NULL);
2911                 if (unlikely(!dev_xmit_complete(rc))) {
2912                         skb->next = next;
2913                         goto out;
2914                 }
2915
2916                 skb = next;
2917                 if (netif_xmit_stopped(txq) && skb) {
2918                         rc = NETDEV_TX_BUSY;
2919                         break;
2920                 }
2921         }
2922
2923 out:
2924         *ret = rc;
2925         return skb;
2926 }
2927
2928 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2929                                           netdev_features_t features)
2930 {
2931         if (skb_vlan_tag_present(skb) &&
2932             !vlan_hw_offload_capable(features, skb->vlan_proto))
2933                 skb = __vlan_hwaccel_push_inside(skb);
2934         return skb;
2935 }
2936
2937 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2938 {
2939         netdev_features_t features;
2940
2941         features = netif_skb_features(skb);
2942         skb = validate_xmit_vlan(skb, features);
2943         if (unlikely(!skb))
2944                 goto out_null;
2945
2946         if (netif_needs_gso(skb, features)) {
2947                 struct sk_buff *segs;
2948
2949                 segs = skb_gso_segment(skb, features);
2950                 if (IS_ERR(segs)) {
2951                         goto out_kfree_skb;
2952                 } else if (segs) {
2953                         consume_skb(skb);
2954                         skb = segs;
2955                 }
2956         } else {
2957                 if (skb_needs_linearize(skb, features) &&
2958                     __skb_linearize(skb))
2959                         goto out_kfree_skb;
2960
2961                 /* If packet is not checksummed and device does not
2962                  * support checksumming for this protocol, complete
2963                  * checksumming here.
2964                  */
2965                 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2966                         if (skb->encapsulation)
2967                                 skb_set_inner_transport_header(skb,
2968                                                                skb_checksum_start_offset(skb));
2969                         else
2970                                 skb_set_transport_header(skb,
2971                                                          skb_checksum_start_offset(skb));
2972                         if (!(features & NETIF_F_CSUM_MASK) &&
2973                             skb_checksum_help(skb))
2974                                 goto out_kfree_skb;
2975                 }
2976         }
2977
2978         return skb;
2979
2980 out_kfree_skb:
2981         kfree_skb(skb);
2982 out_null:
2983         atomic_long_inc(&dev->tx_dropped);
2984         return NULL;
2985 }
2986
2987 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2988 {
2989         struct sk_buff *next, *head = NULL, *tail;
2990
2991         for (; skb != NULL; skb = next) {
2992                 next = skb->next;
2993                 skb->next = NULL;
2994
2995                 /* in case skb wont be segmented, point to itself */
2996                 skb->prev = skb;
2997
2998                 skb = validate_xmit_skb(skb, dev);
2999                 if (!skb)
3000                         continue;
3001
3002                 if (!head)
3003                         head = skb;
3004                 else
3005                         tail->next = skb;
3006                 /* If skb was segmented, skb->prev points to
3007                  * the last segment. If not, it still contains skb.
3008                  */
3009                 tail = skb->prev;
3010         }
3011         return head;
3012 }
3013 EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
3014
3015 static void qdisc_pkt_len_init(struct sk_buff *skb)
3016 {
3017         const struct skb_shared_info *shinfo = skb_shinfo(skb);
3018
3019         qdisc_skb_cb(skb)->pkt_len = skb->len;
3020
3021         /* To get more precise estimation of bytes sent on wire,
3022          * we add to pkt_len the headers size of all segments
3023          */
3024         if (shinfo->gso_size)  {
3025                 unsigned int hdr_len;
3026                 u16 gso_segs = shinfo->gso_segs;
3027
3028                 /* mac layer + network layer */
3029                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3030
3031                 /* + transport layer */
3032                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
3033                         hdr_len += tcp_hdrlen(skb);
3034                 else
3035                         hdr_len += sizeof(struct udphdr);
3036
3037                 if (shinfo->gso_type & SKB_GSO_DODGY)
3038                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3039                                                 shinfo->gso_size);
3040
3041                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3042         }
3043 }
3044
3045 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3046                                  struct net_device *dev,
3047                                  struct netdev_queue *txq)
3048 {
3049         spinlock_t *root_lock = qdisc_lock(q);
3050         struct sk_buff *to_free = NULL;
3051         bool contended;
3052         int rc;
3053
3054         qdisc_calculate_pkt_len(skb, q);
3055         /*
3056          * Heuristic to force contended enqueues to serialize on a
3057          * separate lock before trying to get qdisc main lock.
3058          * This permits qdisc->running owner to get the lock more
3059          * often and dequeue packets faster.
3060          */
3061         contended = qdisc_is_running(q);
3062         if (unlikely(contended))
3063                 spin_lock(&q->busylock);
3064
3065         spin_lock(root_lock);
3066         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3067                 __qdisc_drop(skb, &to_free);
3068                 rc = NET_XMIT_DROP;
3069         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3070                    qdisc_run_begin(q)) {
3071                 /*
3072                  * This is a work-conserving queue; there are no old skbs
3073                  * waiting to be sent out; and the qdisc is not running -
3074                  * xmit the skb directly.
3075                  */
3076
3077                 qdisc_bstats_update(q, skb);
3078
3079                 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
3080                         if (unlikely(contended)) {
3081                                 spin_unlock(&q->busylock);
3082                                 contended = false;
3083                         }
3084                         __qdisc_run(q);
3085                 } else
3086                         qdisc_run_end(q);
3087
3088                 rc = NET_XMIT_SUCCESS;
3089         } else {
3090                 rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3091                 if (qdisc_run_begin(q)) {
3092                         if (unlikely(contended)) {
3093                                 spin_unlock(&q->busylock);
3094                                 contended = false;
3095                         }
3096                         __qdisc_run(q);
3097                 }
3098         }
3099         spin_unlock(root_lock);
3100         if (unlikely(to_free))
3101                 kfree_skb_list(to_free);
3102         if (unlikely(contended))
3103                 spin_unlock(&q->busylock);
3104         return rc;
3105 }
3106
3107 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3108 static void skb_update_prio(struct sk_buff *skb)
3109 {
3110         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
3111
3112         if (!skb->priority && skb->sk && map) {
3113                 unsigned int prioidx =
3114                         sock_cgroup_prioidx(&skb->sk->sk_cgrp_data);
3115
3116                 if (prioidx < map->priomap_len)
3117                         skb->priority = map->priomap[prioidx];
3118         }
3119 }
3120 #else
3121 #define skb_update_prio(skb)
3122 #endif
3123
3124 DEFINE_PER_CPU(int, xmit_recursion);
3125 EXPORT_SYMBOL(xmit_recursion);
3126
3127 /**
3128  *      dev_loopback_xmit - loop back @skb
3129  *      @net: network namespace this loopback is happening in
3130  *      @sk:  sk needed to be a netfilter okfn
3131  *      @skb: buffer to transmit
3132  */
3133 int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3134 {
3135         skb_reset_mac_header(skb);
3136         __skb_pull(skb, skb_network_offset(skb));
3137         skb->pkt_type = PACKET_LOOPBACK;
3138         skb->ip_summed = CHECKSUM_UNNECESSARY;
3139         WARN_ON(!skb_dst(skb));
3140         skb_dst_force(skb);
3141         netif_rx_ni(skb);
3142         return 0;
3143 }
3144 EXPORT_SYMBOL(dev_loopback_xmit);
3145
#ifdef CONFIG_NET_EGRESS
/* Run the device's egress classifier list over @skb.
 *
 * Returns @skb when transmission should continue (no classifier
 * attached, or a verdict that lets the packet through).  Returns NULL
 * when the packet was dropped, taken over or redirected; in that case
 * *@ret holds the code the caller should report.
 */
static struct sk_buff *
sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
{
	struct tcf_proto *filters = rcu_dereference_bh(dev->egress_cl_list);
	struct tcf_result res;
	int verdict;

	if (!filters)
		return skb;

	/* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set
	 * earlier by the caller.
	 */
	qdisc_bstats_cpu_update(filters->q, skb);

	verdict = tc_classify(skb, filters, &res, false);
	switch (verdict) {
	case TC_ACT_OK:
	case TC_ACT_RECLASSIFY:
		skb->tc_index = TC_H_MIN(res.classid);
		break;
	case TC_ACT_SHOT:
		qdisc_qstats_cpu_drop(filters->q);
		*ret = NET_XMIT_DROP;
		kfree_skb(skb);
		return NULL;
	case TC_ACT_STOLEN:
	case TC_ACT_QUEUED:
		*ret = NET_XMIT_SUCCESS;
		consume_skb(skb);
		return NULL;
	case TC_ACT_REDIRECT:
		/* No need to push/pop skb's mac_header here on egress! */
		skb_do_redirect(skb);
		*ret = NET_XMIT_SUCCESS;
		return NULL;
	default:
		break;
	}

	return skb;
}
#endif /* CONFIG_NET_EGRESS */
3188
/* Look up a transmit queue for @skb in the device's XPS maps.
 *
 * The map is indexed by skb->sender_cpu - 1, scaled and offset by the
 * traffic class when the device has num_tc set.  A map with several
 * queues is spread over by the skb hash.  Returns the queue index, or
 * -1 when XPS is not built in, no map applies, or the mapped queue is
 * beyond real_num_tx_queues.
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
	struct xps_dev_maps *maps;
	struct xps_map *map;
	unsigned int tci;
	int queue = -1;

	rcu_read_lock();
	maps = rcu_dereference(dev->xps_maps);
	if (!maps)
		goto unlock;

	tci = skb->sender_cpu - 1;
	if (dev->num_tc) {
		tci *= dev->num_tc;
		tci += netdev_get_prio_tc_map(dev, skb->priority);
	}

	map = rcu_dereference(maps->cpu_map[tci]);
	if (!map)
		goto unlock;

	if (map->len == 1)
		queue = map->queues[0];
	else
		queue = map->queues[reciprocal_scale(skb_get_hash(skb),
						     map->len)];

	if (unlikely(queue >= dev->real_num_tx_queues))
		queue = -1;

unlock:
	rcu_read_unlock();

	return queue;
#else
	return -1;
#endif
}
3224
3225 static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
3226 {
3227         struct sock *sk = skb->sk;
3228         int queue_index = sk_tx_queue_get(sk);
3229
3230         if (queue_index < 0 || skb->ooo_okay ||
3231             queue_index >= dev->real_num_tx_queues) {
3232                 int new_index = get_xps_queue(dev, skb);
3233                 if (new_index < 0)
3234                         new_index = skb_tx_hash(dev, skb);
3235
3236                 if (queue_index != new_index && sk &&
3237                     sk_fullsock(sk) &&
3238                     rcu_access_pointer(sk->sk_dst_cache))
3239                         sk_tx_queue_set(sk, new_index);
3240
3241                 queue_index = new_index;
3242         }
3243
3244         return queue_index;
3245 }
3246
3247 struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3248                                     struct sk_buff *skb,
3249                                     void *accel_priv)
3250 {
3251         int queue_index = 0;
3252
3253 #ifdef CONFIG_XPS
3254         u32 sender_cpu = skb->sender_cpu - 1;
3255
3256         if (sender_cpu >= (u32)NR_CPUS)
3257                 skb->sender_cpu = raw_smp_processor_id() + 1;
3258 #endif
3259
3260         if (dev->real_num_tx_queues != 1) {
3261                 const struct net_device_ops *ops = dev->netdev_ops;
3262                 if (ops->ndo_select_queue)
3263                         queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3264                                                             __netdev_pick_tx);
3265                 else
3266                         queue_index = __netdev_pick_tx(dev, skb);
3267
3268                 if (!accel_priv)
3269                         queue_index = netdev_cap_txqueue(dev, queue_index);
3270         }
3271
3272         skb_set_queue_mapping(skb, queue_index);
3273         return netdev_get_tx_queue(dev, queue_index);
3274 }
3275
3276 /**
3277  *      __dev_queue_xmit - transmit a buffer
3278  *      @skb: buffer to transmit
3279  *      @accel_priv: private data used for L2 forwarding offload
3280  *
3281  *      Queue a buffer for transmission to a network device. The caller must
3282  *      have set the device and priority and built the buffer before calling
3283  *      this function. The function can be called from an interrupt.
3284  *
3285  *      A negative errno code is returned on a failure. A success does not
3286  *      guarantee the frame will be transmitted as it may be dropped due
3287  *      to congestion or traffic shaping.
3288  *
3289  * -----------------------------------------------------------------------------------
3290  *      I notice this method can also return errors from the queue disciplines,
3291  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
3292  *      be positive.
3293  *
3294  *      Regardless of the return value, the skb is consumed, so it is currently
3295  *      difficult to retry a send to this method.  (You can bump the ref count
3296  *      before sending to hold a reference for retry if you are careful.)
3297  *
3298  *      When calling this method, interrupts MUST be enabled.  This is because
3299  *      the BH enable code must have IRQs enabled so that it will not deadlock.
3300  *          --BLG
3301  */
3302 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3303 {
3304         struct net_device *dev = skb->dev;
3305         struct netdev_queue *txq;
3306         struct Qdisc *q;
3307         int rc = -ENOMEM;
3308
3309         skb_reset_mac_header(skb);
3310
3311         if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3312                 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3313
3314         /* Disable soft irqs for various locks below. Also
3315          * stops preemption for RCU.
3316          */
3317         rcu_read_lock_bh();
3318
3319         skb_update_prio(skb);
3320
3321         qdisc_pkt_len_init(skb);
3322 #ifdef CONFIG_NET_CLS_ACT
3323         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
3324 # ifdef CONFIG_NET_EGRESS
3325         if (static_key_false(&egress_needed)) {
3326                 skb = sch_handle_egress(skb, &rc, dev);
3327                 if (!skb)
3328                         goto out;
3329         }
3330 # endif
3331 #endif
3332         /* If device/qdisc don't need skb->dst, release it right now while
3333          * its hot in this cpu cache.
3334          */
3335         if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3336                 skb_dst_drop(skb);
3337         else
3338                 skb_dst_force(skb);
3339
3340         txq = netdev_pick_tx(dev, skb, accel_priv);
3341         q = rcu_dereference_bh(txq->qdisc);
3342
3343         trace_net_dev_queue(skb);
3344         if (q->enqueue) {
3345                 rc = __dev_xmit_skb(skb, q, dev, txq);
3346                 goto out;
3347         }
3348
3349         /* The device has no queue. Common case for software devices:
3350            loopback, all the sorts of tunnels...
3351
3352            Really, it is unlikely that netif_tx_lock protection is necessary
3353            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
3354            counters.)
3355            However, it is possible, that they rely on protection
3356            made by us here.
3357
3358            Check this and shot the lock. It is not prone from deadlocks.
3359            Either shot noqueue qdisc, it is even simpler 8)
3360          */
3361         if (dev->flags & IFF_UP) {
3362                 int cpu = smp_processor_id(); /* ok because BHs are off */
3363
3364                 if (txq->xmit_lock_owner != cpu) {
3365                         if (unlikely(__this_cpu_read(xmit_recursion) >
3366                                      XMIT_RECURSION_LIMIT))
3367                                 goto recursion_alert;
3368
3369                         skb = validate_xmit_skb(skb, dev);
3370                         if (!skb)
3371                                 goto out;
3372
3373                         HARD_TX_LOCK(dev, txq, cpu);
3374
3375                         if (!netif_xmit_stopped(txq)) {
3376                                 __this_cpu_inc(xmit_recursion);
3377                                 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3378                                 __this_cpu_dec(xmit_recursion);
3379                                 if (dev_xmit_complete(rc)) {
3380                                         HARD_TX_UNLOCK(dev, txq);
3381                                         goto out;
3382                                 }
3383                         }
3384                         HARD_TX_UNLOCK(dev, txq);
3385                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3386                                              dev->name);
3387                 } else {
3388                         /* Recursion is detected! It is possible,
3389                          * unfortunately
3390                          */
3391 recursion_alert:
3392                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3393                                              dev->name);
3394                 }
3395         }
3396
3397         rc = -ENETDOWN;
3398         rcu_read_unlock_bh();
3399
3400         atomic_long_inc(&dev->tx_dropped);
3401         kfree_skb_list(skb);
3402         return rc;
3403 out:
3404         rcu_read_unlock_bh();
3405         return rc;
3406 }
3407
3408 int dev_queue_xmit(struct sk_buff *skb)
3409 {
3410         return __dev_queue_xmit(skb, NULL);
3411 }
3412 EXPORT_SYMBOL(dev_queue_xmit);
3413
/**
 *	dev_queue_xmit_accel - transmit a buffer with acceleration data
 *	@skb: buffer to transmit
 *	@accel_priv: private data handed through to __dev_queue_xmit()
 *
 *	Returns the result of __dev_queue_xmit().
 */
int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
{
	return __dev_queue_xmit(skb, accel_priv);
}
EXPORT_SYMBOL(dev_queue_xmit_accel);
3419
3420
3421 /*=======================================================================
3422                         Receiver routines
3423   =======================================================================*/
3424
3425 int netdev_max_backlog __read_mostly = 1000;
3426 EXPORT_SYMBOL(netdev_max_backlog);
3427
3428 int netdev_tstamp_prequeue __read_mostly = 1;
3429 int netdev_budget __read_mostly = 300;
3430 int weight_p __read_mostly = 64;            /* old backlog weight */
3431
3432 /* Called with irq disabled */
3433 static inline void ____napi_schedule(struct softnet_data *sd,
3434                                      struct napi_struct *napi)
3435 {
3436         list_add_tail(&napi->poll_list, &sd->poll_list);
3437         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3438 }
3439
3440 #ifdef CONFIG_RPS
3441
3442 /* One global table that all flow-based protocols share. */
3443 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3444 EXPORT_SYMBOL(rps_sock_flow_table);
3445 u32 rps_cpu_mask __read_mostly;
3446 EXPORT_SYMBOL(rps_cpu_mask);
3447
3448 struct static_key rps_needed __read_mostly;
3449 EXPORT_SYMBOL(rps_needed);
3450 struct static_key rfs_needed __read_mostly;
3451 EXPORT_SYMBOL(rfs_needed);
3452
/*
 * Record @next_cpu as the CPU for the flow tracked by @rflow and return
 * the flow entry that now tracks the flow.
 *
 * When @next_cpu is a valid CPU id (< nr_cpu_ids) the entry's last_qtail
 * is refreshed from that CPU's input_queue_head.  With CONFIG_RFS_ACCEL
 * the driver is first asked, through ndo_rx_flow_steer(), to steer the
 * flow to the RX queue that cpu_rmap_lookup_index() gives for @next_cpu;
 * if that succeeds the returned entry is the one in that queue's flow
 * table rather than the entry passed in.
 */
3453 static struct rps_dev_flow *
3454 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3455             struct rps_dev_flow *rflow, u16 next_cpu)
3456 {
3457         if (next_cpu < nr_cpu_ids) {
3458 #ifdef CONFIG_RFS_ACCEL
3459                 struct netdev_rx_queue *rxqueue;
3460                 struct rps_dev_flow_table *flow_table;
3461                 struct rps_dev_flow *old_rflow;
3462                 u32 flow_id;
3463                 u16 rxq_index;
3464                 int rc;
3465
3466                 /* Should we steer this flow to a different hardware queue? */
3467                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3468                     !(dev->features & NETIF_F_NTUPLE))
3469                         goto out;
3470                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
                     /* Already arriving on the queue mapped to next_cpu. */
3471                 if (rxq_index == skb_get_rx_queue(skb))
3472                         goto out;
3473
3474                 rxqueue = dev->_rx + rxq_index;
3475                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3476                 if (!flow_table)
3477                         goto out;
3478                 flow_id = skb_get_hash(skb) & flow_table->mask;
3479                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3480                                                         rxq_index, flow_id);
                     /* A negative return leaves the current entry in use. */
3481                 if (rc < 0)
3482                         goto out;
                     /* Switch to the target queue's entry and store the
                      * driver's return value as its filter id.  If the
                      * old entry holds the same filter id, reset it to
                      * RPS_NO_FILTER.
                      */
3483                 old_rflow = rflow;
3484                 rflow = &flow_table->flows[flow_id];
3485                 rflow->filter = rc;
3486                 if (old_rflow->filter == rflow->filter)
3487                         old_rflow->filter = RPS_NO_FILTER;
3488         out:
3489 #endif
3490                 rflow->last_qtail =
3491                         per_cpu(softnet_data, next_cpu).input_queue_head;
3492         }
3493
             /* Stored unconditionally, even when next_cpu >= nr_cpu_ids. */
3494         rflow->cpu = next_cpu;
3495         return rflow;
3496 }
3497
3498 /*
3499  * get_rps_cpu is called from netif_receive_skb and returns the target
3500  * CPU from the RPS map of the receiving queue for a given skb.
3501  * rcu_read_lock must be held on entry.
      *
      * Returns -1 when no target CPU was selected, otherwise the number of
      * an online CPU.  *rflowp is written only when the CPU was chosen
      * through the per-queue flow table; on every other path it is left
      * as the caller initialised it.
3502  */
3503 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3504                        struct rps_dev_flow **rflowp)
3505 {
3506         const struct rps_sock_flow_table *sock_flow_table;
3507         struct netdev_rx_queue *rxqueue = dev->_rx;
3508         struct rps_dev_flow_table *flow_table;
3509         struct rps_map *map;
3510         int cpu = -1;
3511         u32 tcpu;
3512         u32 hash;
3513
             /* Use the recorded RX queue if there is one, else queue 0. */
3514         if (skb_rx_queue_recorded(skb)) {
3515                 u16 index = skb_get_rx_queue(skb);
3516
3517                 if (unlikely(index >= dev->real_num_rx_queues)) {
3518                         WARN_ONCE(dev->real_num_rx_queues > 1,
3519                                   "%s received packet on queue %u, but number "
3520                                   "of RX queues is %u\n",
3521                                   dev->name, index, dev->real_num_rx_queues);
3522                         goto done;
3523                 }
3524                 rxqueue += index;
3525         }
3526
3527         /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3528
3529         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3530         map = rcu_dereference(rxqueue->rps_map);
3531         if (!flow_table && !map)
3532                 goto done;
3533
3534         skb_reset_network_header(skb);
3535         hash = skb_get_hash(skb);
             /* A zero hash results in no steering (-1 is returned). */
3536         if (!hash)
3537                 goto done;
3538
3539         sock_flow_table = rcu_dereference(rps_sock_flow_table);
3540         if (flow_table && sock_flow_table) {
3541                 struct rps_dev_flow *rflow;
3542                 u32 next_cpu;
3543                 u32 ident;
3544
3545                 /* First check into global flow table if there is a match */
3546                 ident = sock_flow_table->ents[hash & sock_flow_table->mask];
                     /* The entry matches only if its bits outside
                      * rps_cpu_mask equal those of the hash.
                      */
3547                 if ((ident ^ hash) & ~rps_cpu_mask)
3548                         goto try_rps;
3549
3550                 next_cpu = ident & rps_cpu_mask;
3551
3552                 /* OK, now we know there is a match,
3553                  * we can look at the local (per receive queue) flow table
3554                  */
3555                 rflow = &flow_table->flows[hash & flow_table->mask];
3556                 tcpu = rflow->cpu;
3557
3558                 /*
3559                  * If the desired CPU (where last recvmsg was done) is
3560                  * different from current CPU (one in the rx-queue flow
3561                  * table entry), switch if one of the following holds:
3562                  *   - Current CPU is unset (>= nr_cpu_ids).
3563                  *   - Current CPU is offline.
3564                  *   - The current CPU's queue tail has advanced beyond the
3565                  *     last packet that was enqueued using this table entry.
3566                  *     This guarantees that all previous packets for the flow
3567                  *     have been dequeued, thus preserving in order delivery.
3568                  */
                     /* The head/tail difference is compared as a signed
                      * int so the test still works after the counters wrap.
                      */
3569                 if (unlikely(tcpu != next_cpu) &&
3570                     (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3571                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3572                       rflow->last_qtail)) >= 0)) {
3573                         tcpu = next_cpu;
3574                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3575                 }
3576
3577                 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3578                         *rflowp = rflow;
3579                         cpu = tcpu;
3580                         goto done;
3581                 }
3582         }
3583
3584 try_rps:
3585
             /* Fall back to the queue's RPS map, if it has one: the index
              * into map->cpus is derived from the hash by reciprocal_scale().
              */
3586         if (map) {
3587                 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3588                 if (cpu_online(tcpu)) {
3589                         cpu = tcpu;
3590                         goto done;
3591                 }
3592         }
3593
3594 done:
3595         return cpu;
3596 }
3597
3598 #ifdef CONFIG_RFS_ACCEL
3599
3600 /**
3601  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3602  * @dev: Device on which the filter was set
3603  * @rxq_index: RX queue index
3604  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3605  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3606  *
3607  * Drivers that implement ndo_rx_flow_steer() should periodically call
3608  * this function for each installed filter and remove the filters for
3609  * which it returns %true.
3610  */
3611 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3612                          u32 flow_id, u16 filter_id)
3613 {
3614         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3615         struct rps_dev_flow_table *flow_table;
3616         struct rps_dev_flow *rflow;
3617         bool expire = true;
3618         unsigned int cpu;
3619
3620         rcu_read_lock();
3621         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3622         if (flow_table && flow_id <= flow_table->mask) {
3623                 rflow = &flow_table->flows[flow_id];
3624                 cpu = ACCESS_ONCE(rflow->cpu);
3625                 if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3626                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3627                            rflow->last_qtail) <
3628                      (int)(10 * flow_table->mask)))
3629                         expire = false;
3630         }
3631         rcu_read_unlock();
3632         return expire;
3633 }
3634 EXPORT_SYMBOL(rps_may_expire_flow);
3635
3636 #endif /* CONFIG_RFS_ACCEL */
3637
3638 /* Called from hardirq (IPI) context */
3639 static void rps_trigger_softirq(void *data)
3640 {
3641         struct softnet_data *sd = data;
3642
3643         ____napi_schedule(sd, &sd->backlog);
3644         sd->received_rps++;
3645 }
3646
3647 #endif /* CONFIG_RPS */
3648
/*
 * Decide whether @sd belongs to another CPU.
 *
 * If it does (only possible with CONFIG_RPS), push it on the front of
 * this CPU's rps_ipi_list, raise NET_RX_SOFTIRQ and return 1.
 * Otherwise return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        struct softnet_data *local = this_cpu_ptr(&softnet_data);

        if (sd == local)
                return 0;

        sd->rps_ipi_next = local->rps_ipi_list;
        local->rps_ipi_list = sd;

        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
        return 1;
#else
        return 0;
#endif /* CONFIG_RPS */
}
3669
3670 #ifdef CONFIG_NET_FLOW_LIMIT
/* Defaults to 4096 (1 << 12).  NOTE(review): presumably the bucket count
 * used when a flow limit table is set up; it is not referenced in this
 * part of the file, so confirm against its user.
 */
3671 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3672 #endif
3673
3674 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3675 {
3676 #ifdef CONFIG_NET_FLOW_LIMIT
3677         struct sd_flow_limit *fl;
3678         struct softnet_data *sd;
3679         unsigned int old_flow, new_flow;
3680
3681         if (qlen < (netdev_max_backlog >> 1))
3682                 return false;
3683
3684         sd = this_cpu_ptr(&softnet_data);
3685
3686         rcu_read_lock();
3687         fl = rcu_dereference(sd->flow_limit);
3688         if (fl) {
3689                 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3690                 old_flow = fl->history[fl->history_head];
3691                 fl->history[fl->history_head] = new_flow;
3692
3693                 fl->history_head++;
3694                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3695
3696                 if (likely(fl->buckets[old_flow]))
3697                         fl->buckets[old_flow]--;
3698
3699                 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3700                         fl->count++;
3701                         rcu_read_unlock();
3702                         return true;
3703                 }
3704         }
3705         rcu_read_unlock();
3706 #endif
3707         return false;
3708 }
3709
3710 /*
3711  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3712  * queue (may be a remote CPU queue).
      *
      * Returns NET_RX_SUCCESS when the skb was appended to the queue; in
      * that case the queue tail is also stored through @qtail.  Returns
      * NET_RX_DROP when the skb was dropped: it is freed and both the
      * per-CPU dropped counter and the device's rx_dropped are incremented.
3713  */
3714 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3715                               unsigned int *qtail)
3716 {
3717         struct softnet_data *sd;
3718         unsigned long flags;
3719         unsigned int qlen;
3720
3721         sd = &per_cpu(softnet_data, cpu);
3722
3723         local_irq_save(flags);
3724
3725         rps_lock(sd);
3726         if (!netif_running(skb->dev))
3727                 goto drop;
3728         qlen = skb_queue_len(&sd->input_pkt_queue);
3729         if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
                     /* Non-empty queue: append without touching the
                      * backlog NAPI state.  The empty-queue path below
                      * jumps back to the enqueue label after scheduling.
                      */
3730                 if (qlen) {
3731 enqueue:
3732                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3733                         input_queue_tail_incr_save(sd, qtail);
3734                         rps_unlock(sd);
3735                         local_irq_restore(flags);
3736                         return NET_RX_SUCCESS;
3737                 }
3738
3739                 /* Schedule NAPI for backlog device
3740                  * We can use non atomic operation since we own the queue lock
3741                  */
3742                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
                             /* rps_ipi_queued() returns 1 when sd belongs
                              * to another CPU and was put on the IPI list;
                              * otherwise schedule the backlog NAPI here.
                              */
3743                         if (!rps_ipi_queued(sd))
3744                                 ____napi_schedule(sd, &sd->backlog);
3745                 }
3746                 goto enqueue;
3747         }
3748
     /* Reached when the device is not running, the queue is longer than
      * netdev_max_backlog, or skb_flow_limit() returned true.
      */
3749 drop:
3750         sd->dropped++;
3751         rps_unlock(sd);
3752
3753         local_irq_restore(flags);
3754
3755         atomic_long_inc(&skb->dev->rx_dropped);
3756         kfree_skb(skb);
3757         return NET_RX_DROP;
3758 }
3759
3760 static int netif_rx_internal(struct sk_buff *skb)
3761 {
3762         int ret;
3763
3764         net_timestamp_check(netdev_tstamp_prequeue, skb);
3765
3766         trace_netif_rx(skb);
3767 #ifdef CONFIG_RPS
3768         if (static_key_false(&rps_needed)) {
3769                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3770                 int cpu;
3771
3772                 preempt_disable();
3773                 rcu_read_lock();
3774
3775                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3776                 if (cpu < 0)
3777                         cpu = smp_processor_id();
3778
3779                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3780
3781                 rcu_read_unlock();
3782                 preempt_enable();
3783         } else
3784 #endif
3785         {
3786                 unsigned int qtail;
3787                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3788                 put_cpu();
3789         }
3790         return ret;
3791 }
3792
/**
 *      netif_rx        -       post buffer to the network code
 *      @skb: buffer to post
 *
 *      Takes a packet handed over by a device driver and queues it for
 *      processing by the upper (protocol) levels.  The call itself always
 *      succeeds; the buffer may still be dropped during processing, either
 *      for congestion control or by the protocol layers.
 *
 *      return values:
 *      NET_RX_SUCCESS  (no congestion)
 *      NET_RX_DROP     (packet was dropped)
 *
 */

int netif_rx(struct sk_buff *skb)
{
        int ret;

        trace_netif_rx_entry(skb);

        ret = netif_rx_internal(skb);
        return ret;
}
EXPORT_SYMBOL(netif_rx);
3815
/*
 * netif_rx_ni - post buffer to the network code, then run softirqs
 * @skb: buffer to post
 *
 * Queues @skb through netif_rx_internal() with preemption disabled and,
 * before re-enabling preemption, runs do_softirq() if any softirq is
 * pending on this CPU. Returns the netif_rx_internal() result.
 */
int netif_rx_ni(struct sk_buff *skb)
{
        int ret;

        trace_netif_rx_ni_entry(skb);

        preempt_disable();

        ret = netif_rx_internal(skb);
        if (local_softirq_pending()) {
                do_softirq();
        }

        preempt_enable();

        return ret;
}
EXPORT_SYMBOL(netif_rx_ni);
3831
3832 static __latent_entropy void net_tx_action(struct softirq_action *h)
3833 {
3834         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3835
3836         if (sd->completion_queue) {
3837                 struct sk_buff *clist;
3838
3839                 local_irq_disable();
3840                 clist = sd->completion_queue;
3841                 sd->completion_queue = NULL;
3842                 local_irq_enable();
3843
3844                 while (clist) {
3845                         struct sk_buff *skb = clist;
3846                         clist = clist->next;
3847
3848                         WARN_ON(atomic_read(&skb->users));
3849                         if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3850                                 trace_consume_skb(skb);
3851                         else
3852                                 trace_kfree_skb(skb, net_tx_action);
3853
3854                         if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
3855                                 __kfree_skb(skb);
3856                         else
3857                                 __kfree_skb_defer(skb);
3858                 }
3859
3860                 __kfree_skb_flush();
3861         }
3862
3863         if (sd->output_queue) {
3864                 struct Qdisc *head;
3865
3866                 local_irq_disable();
3867                 head = sd->output_queue;
3868                 sd->output_queue = NULL;
3869                 sd->output_queue_tailp = &sd->output_queue;
3870                 local_irq_enable();
3871
3872                 while (head) {
3873                         struct Qdisc *q = head;
3874                         spinlock_t *root_lock;
3875
3876                         head = head->next_sched;
3877
3878                         root_lock = qdisc_lock(q);
3879                         spin_lock(root_lock);
3880                         /* We need to make sure head->next_sched is read
3881                          * before clearing __QDISC_STATE_SCHED
3882                          */
3883                         smp_mb__before_atomic();
3884                         clear_bit(__QDISC_STATE_SCHED, &q->state);
3885                         qdisc_run(q);
3886                         spin_unlock(root_lock);
3887                 }
3888         }
3889 }
3890
3891 #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
3892 /* This hook is defined here for ATM LANE */
/* GPL-exported function pointer, built only when both the bridge and
 * ATM LANE are enabled.  Nothing in this part of the file assigns or
 * calls it.
 */
3893 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3894                              unsigned char *addr) __read_mostly;
3895 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3896 #endif
3897
/*
 * Run the ingress classifier chain of skb->dev on @skb.
 *
 * Returns @skb when processing should continue, or NULL when the
 * classifier verdict disposed of the packet (dropped, stolen/queued, or
 * redirected). A handler pending in *@pt_prev is delivered to first and
 * *@pt_prev is cleared. Without CONFIG_NET_CLS_ACT, or when the device
 * has no ingress classifier list, @skb is returned untouched.
 */
static inline struct sk_buff *
sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
                   struct net_device *orig_dev)
{
#ifdef CONFIG_NET_CLS_ACT
        struct tcf_proto *filters = rcu_dereference_bh(skb->dev->ingress_cl_list);
        struct tcf_result res;
        int verdict;

        /* We only get here because at least one ingress is present
         * somewhere (the static key is enabled); devices without an
         * ingress qdisc of their own bail out right away.
         */
        if (!filters)
                return skb;

        if (*pt_prev) {
                *ret = deliver_skb(skb, *pt_prev, orig_dev);
                *pt_prev = NULL;
        }

        qdisc_skb_cb(skb)->pkt_len = skb->len;
        skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
        qdisc_bstats_cpu_update(filters->q, skb);

        verdict = tc_classify(skb, filters, &res, false);
        switch (verdict) {
        case TC_ACT_OK:
        case TC_ACT_RECLASSIFY:
                skb->tc_index = TC_H_MIN(res.classid);
                break;
        case TC_ACT_SHOT:
                qdisc_qstats_cpu_drop(filters->q);
                kfree_skb(skb);
                return NULL;
        case TC_ACT_STOLEN:
        case TC_ACT_QUEUED:
                consume_skb(skb);
                return NULL;
        case TC_ACT_REDIRECT:
                /* cls/act_bpf already checked skb_mac_header, so the L2
                 * header can safely be pushed back before the skb is
                 * redirected to another netdev
                 */
                __skb_push(skb, skb->mac_len);
                skb_do_redirect(skb);
                return NULL;
        default:
                break;
        }
#endif /* CONFIG_NET_CLS_ACT */
        return skb;
}
3949
3950 /**
3951  *      netdev_is_rx_handler_busy - check if receive handler is registered
3952  *      @dev: device to check
3953  *
3954  *      Check if a receive handler is already registered for a given device.
3955  *      Return true if there one.
3956  *
3957  *      The caller must hold the rtnl_mutex.
3958  */
3959 bool netdev_is_rx_handler_busy(struct net_device *dev)
3960 {
3961         ASSERT_RTNL();
3962         return dev && rtnl_dereference(dev->rx_handler);
3963 }
3964 EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
3965
3966 /**
3967  *      netdev_rx_handler_register - register receive handler
3968  *      @dev: device to register a handler for
3969  *      @rx_handler: receive handler to register
3970  *      @rx_handler_data: data pointer that is used by rx handler
3971  *
3972  *      Register a receive handler for a device. This handler will then be
3973  *      called from __netif_receive_skb. A negative errno code is returned
3974  *      on a failure.
3975  *
3976  *      The caller must hold the rtnl_mutex.
3977  *
3978  *      For a general description of rx_handler, see enum rx_handler_result.
3979  */
3980 int netdev_rx_handler_register(struct net_device *dev,
3981                                rx_handler_func_t *rx_handler,
3982                                void *rx_handler_data)
3983 {
3984         ASSERT_RTNL();
3985
3986         if (dev->rx_handler)
3987                 return -EBUSY;
3988
3989         /* Note: rx_handler_data must be set before rx_handler */
3990         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3991         rcu_assign_pointer(dev->rx_handler, rx_handler);
3992
3993         return 0;
3994 }
3995 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3996
3997 /**
3998  *      netdev_rx_handler_unregister - unregister receive handler
3999  *      @dev: device to unregister a handler from
4000  *
4001  *      Unregister a receive handler from a device.
4002  *
4003  *      The caller must hold the rtnl_mutex.
4004  */
4005 void netdev_rx_handler_unregister(struct net_device *dev)
4006 {
4007
4008         ASSERT_RTNL();
4009         RCU_INIT_POINTER(dev->rx_handler, NULL);
4010         /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
4011          * section has a guarantee to see a non NULL rx_handler_data
4012          * as well.
4013          */
4014         synchronize_net();
4015         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
4016 }
4017 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
4018
4019 /*
4020  * Limit the use of PFMEMALLOC reserves to those protocols that implement
4021  * the special handling of PFMEMALLOC skbs.
4022  */
4023 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
4024 {
4025         switch (skb->protocol) {
4026         case htons(ETH_P_ARP):
4027         case htons(ETH_P_IP):
4028         case htons(ETH_P_IPV6):
4029         case htons(ETH_P_8021Q):
4030         case htons(ETH_P_8021AD):
4031                 return true;
4032         default:
4033                 return false;
4034         }
4035 }
4036
/*
 * Run the netfilter ingress hook on @skb when one is active for it.
 *
 * A handler pending in *@pt_prev is delivered to first (result stored in
 * *@ret) and *@pt_prev is cleared. Returns the nf_hook_ingress() result,
 * or 0 when no hook is active or CONFIG_NETFILTER_INGRESS is off.
 */
static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
                             int *ret, struct net_device *orig_dev)
{
#ifdef CONFIG_NETFILTER_INGRESS
        int verdict;

        if (!nf_hook_ingress_active(skb))
                return 0;

        if (*pt_prev) {
                *ret = deliver_skb(skb, *pt_prev, orig_dev);
                *pt_prev = NULL;
        }

        rcu_read_lock();
        verdict = nf_hook_ingress(skb);
        rcu_read_unlock();

        return verdict;
#else
        return 0;
#endif /* CONFIG_NETFILTER_INGRESS */
}
4057
/*
 * Core of the receive path: hand @skb to taps, ingress hooks, VLAN
 * handling, the device's rx_handler and finally the protocol handlers.
 *
 * @pfmemalloc: when true the taps are skipped and the packet is dropped
 * unless skb_pfmemalloc_protocol() accepts its protocol.
 *
 * pt_prev always holds the most recent matching handler that has not
 * been delivered to yet: each new match first delivers to the previous
 * one through deliver_skb(), and the last one is called directly at the
 * end of the function.
 *
 * Returns NET_RX_SUCCESS when an rx_handler consumed the skb,
 * NET_RX_DROP when it was dropped here, otherwise the result of the
 * last delivery that was made.
 */
4058 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
4059 {
4060         struct packet_type *ptype, *pt_prev;
4061         rx_handler_func_t *rx_handler;
4062         struct net_device *orig_dev;
4063         bool deliver_exact = false;
4064         int ret = NET_RX_DROP;
4065         __be16 type;
4066
4067         net_timestamp_check(!netdev_tstamp_prequeue, skb);
4068
4069         trace_netif_receive_skb(skb);
4070
4071         orig_dev = skb->dev;
4072
4073         skb_reset_network_header(skb);
4074         if (!skb_transport_header_was_set(skb))
4075                 skb_reset_transport_header(skb);
4076         skb_reset_mac_len(skb);
4077
4078         pt_prev = NULL;
4079
     /* Re-entered when vlan_do_receive() returns true or the rx_handler
      * returns RX_HANDLER_ANOTHER.
      */
4080 another_round:
4081         skb->skb_iif = skb->dev->ifindex;
4082
4083         __this_cpu_inc(softnet_data.processed);
4084
4085         if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
4086             skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
4087                 skb = skb_vlan_untag(skb);
4088                 if (unlikely(!skb))
4089                         goto out;
4090         }
4091
4092 #ifdef CONFIG_NET_CLS_ACT
             /* TC_NCLS set: clear it and skip taps and ingress hooks. */
4093         if (skb->tc_verd & TC_NCLS) {
4094                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
4095                 goto ncls;
4096         }
4097 #endif
4098
4099         if (pfmemalloc)
4100                 goto skip_taps;
4101
             /* Handlers on the global ptype_all list, then those on the
              * device's own ptype_all list.
              */
4102         list_for_each_entry_rcu(ptype, &ptype_all, list) {
4103                 if (pt_prev)
4104                         ret = deliver_skb(skb, pt_prev, orig_dev);
4105                 pt_prev = ptype;
4106         }
4107
4108         list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
4109                 if (pt_prev)
4110                         ret = deliver_skb(skb, pt_prev, orig_dev);
4111                 pt_prev = ptype;
4112         }
4113
4114 skip_taps:
4115 #ifdef CONFIG_NET_INGRESS
4116         if (static_key_false(&ingress_needed)) {
4117                 skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
4118                 if (!skb)
4119                         goto out;
4120
4121                 if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
4122                         goto out;
4123         }
4124 #endif
4125 #ifdef CONFIG_NET_CLS_ACT
4126         skb->tc_verd = 0;
4127 ncls:
4128 #endif
4129         if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
4130                 goto drop;
4131
4132         if (skb_vlan_tag_present(skb)) {
4133                 if (pt_prev) {
4134                         ret = deliver_skb(skb, pt_prev, orig_dev);
4135                         pt_prev = NULL;
4136                 }
4137                 if (vlan_do_receive(&skb))
4138                         goto another_round;
4139                 else if (unlikely(!skb))
4140                         goto out;
4141         }
4142
4143         rx_handler = rcu_dereference(skb->dev->rx_handler);
4144         if (rx_handler) {
4145                 if (pt_prev) {
4146                         ret = deliver_skb(skb, pt_prev, orig_dev);
4147                         pt_prev = NULL;
4148                 }
4149                 switch (rx_handler(&skb)) {
4150                 case RX_HANDLER_CONSUMED:
4151                         ret = NET_RX_SUCCESS;
4152                         goto out;
4153                 case RX_HANDLER_ANOTHER:
4154                         goto another_round;
4155                 case RX_HANDLER_EXACT:
4156                         deliver_exact = true;
                             /* fall through */
4157                 case RX_HANDLER_PASS:
4158                         break;
4159                 default:
4160                         BUG();
4161                 }
4162         }
4163
             /* A VLAN tag is still present at this point: a non-zero
              * VLAN id marks the packet PACKET_OTHERHOST, and the tag
              * is cleared either way.
              */
4164         if (unlikely(skb_vlan_tag_present(skb))) {
4165                 if (skb_vlan_tag_get_id(skb))
4166                         skb->pkt_type = PACKET_OTHERHOST;
4167                 /* Note: we might in the future use prio bits
4168                  * and set skb->priority like in vlan_do_receive()
4169                  * For the time being, just ignore Priority Code Point
4170                  */
4171                 skb->vlan_tci = 0;
4172         }
4173
4174         type = skb->protocol;
4175
4176         /* deliver only exact match when indicated */
4177         if (likely(!deliver_exact)) {
4178                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4179                                        &ptype_base[ntohs(type) &
4180                                                    PTYPE_HASH_MASK]);
4181         }
4182
4183         deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4184                                &orig_dev->ptype_specific);
4185
4186         if (unlikely(skb->dev != orig_dev)) {
4187                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4188                                        &skb->dev->ptype_specific);
4189         }
4190
             /* The last matching handler is called directly rather than
              * through deliver_skb().
              */
4191         if (pt_prev) {
4192                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
4193                         goto drop;
4194                 else
4195                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
4196         } else {
     /* Also the target of the "goto drop" statements above.  Counted as
      * rx_nohandler instead of rx_dropped when the rx_handler asked for
      * exact delivery.
      */
4197 drop:
4198                 if (!deliver_exact)
4199                         atomic_long_inc(&skb->dev->rx_dropped);
4200                 else
4201                         atomic_long_inc(&skb->dev->rx_nohandler);
4202                 kfree_skb(skb);
4203                 /* Jamal, now you will not able to escape explaining
4204                  * me how you were going to use this. :-)
4205                  */
4206                 ret = NET_RX_DROP;
4207         }
4208
4209 out:
4210         return ret;
4211 }
4212
4213 static int __netif_receive_skb(struct sk_buff *skb)
4214 {
4215         int ret;
4216
4217         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
4218                 unsigned long pflags = current->flags;
4219
4220                 /*
4221                  * PFMEMALLOC skbs are special, they should
4222                  * - be delivered to SOCK_MEMALLOC sockets only
4223                  * - stay away from userspace
4224                  * - have bounded memory usage
4225                  *
4226                  * Use PF_MEMALLOC as this saves us from propagating the allocation
4227                  * context down to all allocation sites.
4228                  */
4229                 current->flags |= PF_MEMALLOC;
4230                 ret = __netif_receive_skb_core(skb, true);
4231                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
4232         } else
4233                 ret = __netif_receive_skb_core(skb, false);
4234
4235         return ret;
4236 }
4237
4238 static int netif_receive_skb_internal(struct sk_buff *skb)
4239 {
4240         int ret;
4241
4242         net_timestamp_check(netdev_tstamp_prequeue, skb);
4243
4244         if (skb_defer_rx_timestamp(skb))
4245                 return NET_RX_SUCCESS;
4246
4247         rcu_read_lock();
4248
4249 #ifdef CONFIG_RPS
4250         if (static_key_false(&rps_needed)) {
4251                 struct rps_dev_flow voidflow, *rflow = &voidflow;
4252                 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
4253
4254                 if (cpu >= 0) {
4255                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4256                         rcu_read_unlock();
4257                         return ret;
4258                 }
4259         }
4260 #endif
4261         ret = __netif_receive_skb(skb);
4262         rcu_read_unlock();
4263         return ret;
4264 }
4265
/**
 *      netif_receive_skb - process receive buffer from network
 *      @skb: buffer to process
 *
 *      netif_receive_skb() is the main receive data processing function.
 *      The call itself always succeeds; the buffer may still be dropped
 *      during processing, for congestion control or by the protocol layers.
 *
 *      This function may only be called from softirq context and interrupts
 *      should be enabled.
 *
 *      Return values (usually ignored):
 *      NET_RX_SUCCESS: no congestion
 *      NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
        int ret;

        trace_netif_receive_skb_entry(skb);

        ret = netif_receive_skb_internal(skb);
        return ret;
}
EXPORT_SYMBOL(netif_receive_skb);
4288
4289 DEFINE_PER_CPU(struct work_struct, flush_works);
4290
4291 /* Network device is going away, flush any packets still pending */
4292 static void flush_backlog(struct work_struct *work)
4293 {
4294         struct sk_buff *skb, *tmp;
4295         struct softnet_data *sd;
4296
4297         local_bh_disable();
4298         sd = this_cpu_ptr(&softnet_data);
4299
4300         local_irq_disable();
4301         rps_lock(sd);
4302         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4303                 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4304                         __skb_unlink(skb, &sd->input_pkt_queue);
4305                         kfree_skb(skb);
4306                         input_queue_head_incr(sd);
4307                 }
4308         }
4309         rps_unlock(sd);
4310         local_irq_enable();
4311
4312         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4313                 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4314                         __skb_unlink(skb, &sd->process_queue);
4315                         kfree_skb(skb);
4316                         input_queue_head_incr(sd);
4317                 }
4318         }
4319         local_bh_enable();
4320 }
4321
4322 static void flush_all_backlogs(void)
4323 {
4324         unsigned int cpu;
4325
4326         get_online_cpus();
4327
4328         for_each_online_cpu(cpu)
4329                 queue_work_on(cpu, system_highpri_wq,
4330                               per_cpu_ptr(&flush_works, cpu));
4331
4332         for_each_online_cpu(cpu)
4333                 flush_work(per_cpu_ptr(&flush_works, cpu));
4334
4335         put_online_cpus();
4336 }
4337
4338 static int napi_gro_complete(struct sk_buff *skb)
4339 {
4340         struct packet_offload *ptype;
4341         __be16 type = skb->protocol;
4342         struct list_head *head = &offload_base;
4343         int err = -ENOENT;
4344
4345         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4346
4347         if (NAPI_GRO_CB(skb)->count == 1) {
4348                 skb_shinfo(skb)->gso_size = 0;
4349                 goto out;
4350         }
4351
4352         rcu_read_lock();
4353         list_for_each_entry_rcu(ptype, head, list) {
4354                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4355                         continue;
4356
4357                 err = ptype->callbacks.gro_complete(skb, 0);
4358                 break;
4359         }
4360         rcu_read_unlock();
4361
4362         if (err) {
4363                 WARN_ON(&ptype->list == head);
4364                 kfree_skb(skb);
4365                 return NET_RX_SUCCESS;
4366         }
4367
4368 out:
4369         return netif_receive_skb_internal(skb);
4370 }
4371
4372 /* napi->gro_list contains packets ordered by age.
4373  * youngest packets at the head of it.
4374  * Complete skbs in reverse order to reduce latencies.
4375  */
4376 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4377 {
4378         struct sk_buff *skb, *prev = NULL;
4379
4380         /* scan list and build reverse chain */
4381         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4382                 skb->prev = prev;
4383                 prev = skb;
4384         }
4385
4386         for (skb = prev; skb; skb = prev) {
4387                 skb->next = NULL;
4388
4389                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4390                         return;
4391
4392                 prev = skb->prev;
4393                 napi_gro_complete(skb);
4394                 napi->gro_count--;
4395         }
4396
4397         napi->gro_list = NULL;
4398 }
4399 EXPORT_SYMBOL(napi_gro_flush);
4400
4401 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4402 {
4403         struct sk_buff *p;
4404         unsigned int maclen = skb->dev->hard_header_len;
4405         u32 hash = skb_get_hash_raw(skb);
4406
4407         for (p = napi->gro_list; p; p = p->next) {
4408                 unsigned long diffs;
4409
4410                 NAPI_GRO_CB(p)->flush = 0;
4411
4412                 if (hash != skb_get_hash_raw(p)) {
4413                         NAPI_GRO_CB(p)->same_flow = 0;
4414                         continue;
4415                 }
4416
4417                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4418                 diffs |= p->vlan_tci ^ skb->vlan_tci;
4419                 diffs |= skb_metadata_dst_cmp(p, skb);
4420                 if (maclen == ETH_HLEN)
4421                         diffs |= compare_ether_header(skb_mac_header(p),
4422                                                       skb_mac_header(skb));
4423                 else if (!diffs)
4424                         diffs = memcmp(skb_mac_header(p),
4425                                        skb_mac_header(skb),
4426                                        maclen);
4427                 NAPI_GRO_CB(p)->same_flow = !diffs;
4428         }
4429 }
4430
4431 static void skb_gro_reset_offset(struct sk_buff *skb)
4432 {
4433         const struct skb_shared_info *pinfo = skb_shinfo(skb);
4434         const skb_frag_t *frag0 = &pinfo->frags[0];
4435
4436         NAPI_GRO_CB(skb)->data_offset = 0;
4437         NAPI_GRO_CB(skb)->frag0 = NULL;
4438         NAPI_GRO_CB(skb)->frag0_len = 0;
4439
4440         if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4441             pinfo->nr_frags &&
4442             !PageHighMem(skb_frag_page(frag0))) {
4443                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4444                 NAPI_GRO_CB(skb)->frag0_len = min(skb_frag_size(frag0),
4445                                                   skb->end - skb->tail);
4446         }
4447 }
4448
4449 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4450 {
4451         struct skb_shared_info *pinfo = skb_shinfo(skb);
4452
4453         BUG_ON(skb->end - skb->tail < grow);
4454
4455         memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4456
4457         skb->data_len -= grow;
4458         skb->tail += grow;
4459
4460         pinfo->frags[0].page_offset += grow;
4461         skb_frag_size_sub(&pinfo->frags[0], grow);
4462
4463         if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4464                 skb_frag_unref(skb, 0);
4465                 memmove(pinfo->frags, pinfo->frags + 1,
4466                         --pinfo->nr_frags * sizeof(pinfo->frags[0]));
4467         }
4468 }
4469
/* Offer @skb to the GRO machinery of @napi.
 *
 * The packet_offload entry matching skb->protocol is looked up and its
 * gro_receive() callback is given the chance to merge @skb with one of the
 * packets held on napi->gro_list.
 *
 * Returns:
 *   GRO_NORMAL      - GRO was not applied: the device lacks NETIF_F_GRO,
 *                     csum_bad is set, no offload handler matched, or the
 *                     flush flag was set for a packet without a same-flow
 *                     match.
 *   GRO_MERGED      - same_flow was set after the callback and the free
 *                     flag is clear.
 *   GRO_MERGED_FREE - same_flow was set after the callback and the free
 *                     flag is set.
 *   GRO_HELD        - @skb was inserted at the head of napi->gro_list.
 */
static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
        struct sk_buff **pp = NULL;
        struct packet_offload *ptype;
        __be16 type = skb->protocol;
        struct list_head *head = &offload_base;
        int same_flow;
        enum gro_result ret;
        int grow;

        if (!(skb->dev->features & NETIF_F_GRO))
                goto normal;

        if (skb->csum_bad)
                goto normal;

        /* Pre-compute same_flow for every packet already held. */
        gro_list_prepare(napi, skb);

        rcu_read_lock();
        list_for_each_entry_rcu(ptype, head, list) {
                if (ptype->type != type || !ptype->callbacks.gro_receive)
                        continue;

                /* Reset the per-packet GRO state before calling the handler. */
                skb_set_network_header(skb, skb_gro_offset(skb));
                skb_reset_mac_len(skb);
                NAPI_GRO_CB(skb)->same_flow = 0;
                NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
                NAPI_GRO_CB(skb)->free = 0;
                NAPI_GRO_CB(skb)->encap_mark = 0;
                NAPI_GRO_CB(skb)->recursion_counter = 0;
                NAPI_GRO_CB(skb)->is_fou = 0;
                NAPI_GRO_CB(skb)->is_atomic = 1;
                NAPI_GRO_CB(skb)->gro_remcsum_start = 0;

                /* Setup for GRO checksum validation */
                switch (skb->ip_summed) {
                case CHECKSUM_COMPLETE:
                        NAPI_GRO_CB(skb)->csum = skb->csum;
                        NAPI_GRO_CB(skb)->csum_valid = 1;
                        NAPI_GRO_CB(skb)->csum_cnt = 0;
                        break;
                case CHECKSUM_UNNECESSARY:
                        NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
                        NAPI_GRO_CB(skb)->csum_valid = 0;
                        break;
                default:
                        NAPI_GRO_CB(skb)->csum_cnt = 0;
                        NAPI_GRO_CB(skb)->csum_valid = 0;
                }

                pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
                break;
        }
        rcu_read_unlock();

        /* The walk reached the list head again: no handler for this type. */
        if (&ptype->list == head)
                goto normal;

        same_flow = NAPI_GRO_CB(skb)->same_flow;
        ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;

        /* The handler handed back a list slot: unlink that packet from
         * gro_list and complete it right away.
         */
        if (pp) {
                struct sk_buff *nskb = *pp;

                *pp = nskb->next;
                nskb->next = NULL;
                napi_gro_complete(nskb);
                napi->gro_count--;
        }

        if (same_flow)
                goto ok;

        if (NAPI_GRO_CB(skb)->flush)
                goto normal;

        /* @skb is going to be held.  When the list is already full the tail
         * entry is completed to make room, so gro_count stays unchanged.
         */
        if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
                struct sk_buff *nskb = napi->gro_list;

                /* locate the end of the list to select the 'oldest' flow */
                while (nskb->next) {
                        pp = &nskb->next;
                        nskb = *pp;
                }
                *pp = NULL;
                nskb->next = NULL;
                napi_gro_complete(nskb);
        } else {
                napi->gro_count++;
        }
        NAPI_GRO_CB(skb)->count = 1;
        NAPI_GRO_CB(skb)->age = jiffies;
        NAPI_GRO_CB(skb)->last = skb;
        skb_shinfo(skb)->gso_size = skb_gro_len(skb);
        skb->next = napi->gro_list;
        napi->gro_list = skb;
        ret = GRO_HELD;

pull:
        /* If the GRO offset lies beyond the linear head, copy the missing
         * bytes from frag0 into the linear area.
         */
        grow = skb_gro_offset(skb) - skb_headlen(skb);
        if (grow > 0)
                gro_pull_from_frag0(skb, grow);
ok:
        return ret;

normal:
        ret = GRO_NORMAL;
        goto pull;
}
4579
4580 struct packet_offload *gro_find_receive_by_type(__be16 type)
4581 {
4582         struct list_head *offload_head = &offload_base;
4583         struct packet_offload *ptype;
4584
4585         list_for_each_entry_rcu(ptype, offload_head, list) {
4586                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4587                         continue;
4588                 return ptype;
4589         }
4590         return NULL;
4591 }
4592 EXPORT_SYMBOL(gro_find_receive_by_type);
4593
4594 struct packet_offload *gro_find_complete_by_type(__be16 type)
4595 {
4596         struct list_head *offload_head = &offload_base;
4597         struct packet_offload *ptype;
4598
4599         list_for_each_entry_rcu(ptype, offload_head, list) {
4600                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4601                         continue;
4602                 return ptype;
4603         }
4604         return NULL;
4605 }
4606 EXPORT_SYMBOL(gro_find_complete_by_type);
4607
4608 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4609 {
4610         switch (ret) {
4611         case GRO_NORMAL:
4612                 if (netif_receive_skb_internal(skb))
4613                         ret = GRO_DROP;
4614                 break;
4615
4616         case GRO_DROP:
4617                 kfree_skb(skb);
4618                 break;
4619
4620         case GRO_MERGED_FREE:
4621                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
4622                         skb_dst_drop(skb);
4623                         kmem_cache_free(skbuff_head_cache, skb);
4624                 } else {
4625                         __kfree_skb(skb);
4626                 }
4627                 break;
4628
4629         case GRO_HELD:
4630         case GRO_MERGED:
4631                 break;
4632         }
4633
4634         return ret;
4635 }
4636
4637 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4638 {
4639         skb_mark_napi_id(skb, napi);
4640         trace_napi_gro_receive_entry(skb);
4641
4642         skb_gro_reset_offset(skb);
4643
4644         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4645 }
4646 EXPORT_SYMBOL(napi_gro_receive);
4647
4648 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4649 {
4650         if (unlikely(skb->pfmemalloc)) {
4651                 consume_skb(skb);
4652                 return;
4653         }
4654         __skb_pull(skb, skb_headlen(skb));
4655         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4656         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4657         skb->vlan_tci = 0;
4658         skb->dev = napi->dev;
4659         skb->skb_iif = 0;
4660         skb->encapsulation = 0;
4661         skb_shinfo(skb)->gso_type = 0;
4662         skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4663
4664         napi->skb = skb;
4665 }
4666
4667 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4668 {
4669         struct sk_buff *skb = napi->skb;
4670
4671         if (!skb) {
4672                 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4673                 if (skb) {
4674                         napi->skb = skb;
4675                         skb_mark_napi_id(skb, napi);
4676                 }
4677         }
4678         return skb;
4679 }
4680 EXPORT_SYMBOL(napi_get_frags);
4681
4682 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4683                                       struct sk_buff *skb,
4684                                       gro_result_t ret)
4685 {
4686         switch (ret) {
4687         case GRO_NORMAL:
4688         case GRO_HELD:
4689                 __skb_push(skb, ETH_HLEN);
4690                 skb->protocol = eth_type_trans(skb, skb->dev);
4691                 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4692                         ret = GRO_DROP;
4693                 break;
4694
4695         case GRO_DROP:
4696         case GRO_MERGED_FREE:
4697                 napi_reuse_skb(napi, skb);
4698                 break;
4699
4700         case GRO_MERGED:
4701                 break;
4702         }
4703
4704         return ret;
4705 }
4706
/* Upper GRO stack assumes network header starts at gro_offset=0
 * Drivers could call both napi_gro_frags() and napi_gro_receive()
 * We copy ethernet header into skb->data to have a common layout.
 *
 * Takes the skb cached in napi->skb (clearing that slot), makes sure the
 * Ethernet header is in the linear area, pulls past it and sets
 * skb->protocol from the header's h_proto field.
 *
 * Returns the prepared skb, or NULL when the header could not be obtained;
 * in that case the skb has been handed to napi_reuse_skb().
 */
static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
{
        struct sk_buff *skb = napi->skb;
        const struct ethhdr *eth;
        unsigned int hlen = sizeof(*eth);

        napi->skb = NULL;

        skb_reset_mac_header(skb);
        skb_gro_reset_offset(skb);

        eth = skb_gro_header_fast(skb, 0);
        /* NOTE(review): skb_gro_header_hard() presumably reports that the
         * frag0 shortcut cannot supply hlen bytes - confirm against its
         * definition.
         */
        if (unlikely(skb_gro_header_hard(skb, hlen))) {
                eth = skb_gro_header_slow(skb, hlen, 0);
                if (unlikely(!eth)) {
                        net_warn_ratelimited("%s: dropping impossible skb from %s\n",
                                             __func__, napi->dev->name);
                        napi_reuse_skb(napi, skb);
                        return NULL;
                }
        } else {
                /* Copy the header out of frag0 into the linear area and
                 * advance the cached frag0 pointer/length accordingly.
                 */
                gro_pull_from_frag0(skb, hlen);
                NAPI_GRO_CB(skb)->frag0 += hlen;
                NAPI_GRO_CB(skb)->frag0_len -= hlen;
        }
        /* Step over the Ethernet header. */
        __skb_pull(skb, hlen);

        /*
         * This works because the only protocols we care about don't require
         * special handling.
         * We'll fix it up properly in napi_frags_finish()
         */
        skb->protocol = eth->h_proto;

        return skb;
}
4747
4748 gro_result_t napi_gro_frags(struct napi_struct *napi)
4749 {
4750         struct sk_buff *skb = napi_frags_skb(napi);
4751
4752         if (!skb)
4753                 return GRO_DROP;
4754
4755         trace_napi_gro_frags_entry(skb);
4756
4757         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4758 }
4759 EXPORT_SYMBOL(napi_gro_frags);
4760
4761 /* Compute the checksum from gro_offset and return the folded value
4762  * after adding in any pseudo checksum.
4763  */
4764 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4765 {
4766         __wsum wsum;
4767         __sum16 sum;
4768
4769         wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4770
4771         /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4772         sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4773         if (likely(!sum)) {
4774                 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4775                     !skb->csum_complete_sw)
4776                         netdev_rx_csum_fault(skb->dev);
4777         }
4778
4779         NAPI_GRO_CB(skb)->csum = wsum;
4780         NAPI_GRO_CB(skb)->csum_valid = 1;
4781
4782         return sum;
4783 }
4784 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4785
/*
 * Send any IPIs that are pending for RPS.
 * Must be entered with local irqs disabled; always leaves with local irqs
 * enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        struct softnet_data *pending = sd->rps_ipi_list;

        if (pending) {
                struct softnet_data *following;

                sd->rps_ipi_list = NULL;

                local_irq_enable();

                /* Send pending IPI's to kick RPS processing on remote cpus. */
                for (; pending; pending = following) {
                        /* fetch the link before the entry is kicked */
                        following = pending->rps_ipi_next;

                        if (cpu_online(pending->cpu))
                                smp_call_function_single_async(pending->cpu,
                                                               &pending->csd);
                }
                return;
        }
#endif
        local_irq_enable();
}
4813
4814 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4815 {
4816 #ifdef CONFIG_RPS
4817         return sd->rps_ipi_list != NULL;
4818 #else
4819         return false;
4820 #endif
4821 }
4822
/* Deliver packets from the backlog queues of the softnet_data that embeds
 * @napi (as its 'backlog' member).
 *
 * Packets are taken from process_queue and passed to __netif_receive_skb().
 * Whenever process_queue runs dry, input_pkt_queue is spliced onto it under
 * the rps lock with irqs disabled; if input_pkt_queue is empty as well, the
 * napi state is cleared and the loop ends.
 *
 * Returns the number of packets processed; returns early as soon as that
 * count reaches @quota.
 */
static int process_backlog(struct napi_struct *napi, int quota)
{
        struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
        bool again = true;
        int work = 0;

        /* Check if we have pending ipi, it's better to send them now,
         * not waiting net_rx_action() end.
         */
        if (sd_has_rps_ipi_waiting(sd)) {
                local_irq_disable();
                net_rps_action_and_irq_enable(sd);
        }

        napi->weight = weight_p;
        while (again) {
                struct sk_buff *skb;

                while ((skb = __skb_dequeue(&sd->process_queue))) {
                        rcu_read_lock();
                        __netif_receive_skb(skb);
                        rcu_read_unlock();
                        input_queue_head_incr(sd);
                        if (++work >= quota)
                                return work;

                }

                local_irq_disable();
                rps_lock(sd);
                if (skb_queue_empty(&sd->input_pkt_queue)) {
                        /*
                         * Inline a custom version of __napi_complete().
                         * only current cpu owns and manipulates this napi,
                         * and NAPI_STATE_SCHED is the only possible flag set
                         * on backlog.
                         * We can use a plain write instead of clear_bit(),
                         * and we don't need an smp_mb() memory barrier.
                         */
                        napi->state = 0;
                        again = false;
                } else {
                        /* Move everything that arrived meanwhile over to
                         * process_queue and go around again.
                         */
                        skb_queue_splice_tail_init(&sd->input_pkt_queue,
                                                   &sd->process_queue);
                }
                rps_unlock(sd);
                local_irq_enable();
        }

        return work;
}
4874
4875 /**
4876  * __napi_schedule - schedule for receive
4877  * @n: entry to schedule
4878  *
4879  * The entry's receive function will be scheduled to run.
4880  * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4881  */
4882 void __napi_schedule(struct napi_struct *n)
4883 {
4884         unsigned long flags;
4885
4886         local_irq_save(flags);
4887         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4888         local_irq_restore(flags);
4889 }
4890 EXPORT_SYMBOL(__napi_schedule);
4891
4892 /**
4893  * __napi_schedule_irqoff - schedule for receive
4894  * @n: entry to schedule
4895  *
4896  * Variant of __napi_schedule() assuming hard irqs are masked
4897  */
4898 void __napi_schedule_irqoff(struct napi_struct *n)
4899 {
4900         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4901 }
4902 EXPORT_SYMBOL(__napi_schedule_irqoff);
4903
/* Mark @n as no longer scheduled.
 *
 * BUGs if NAPI_STATE_SCHED is not set on entry.  Returns false without
 * changing anything while NAPI_STATE_IN_BUSY_POLL is set; otherwise removes
 * @n from its poll list, clears NAPI_STATE_SCHED and returns true.
 */
bool __napi_complete(struct napi_struct *n)
{
        BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));

        /* Some drivers call us directly, instead of calling
         * napi_complete_done().
         */
        if (unlikely(test_bit(NAPI_STATE_IN_BUSY_POLL, &n->state)))
                return false;

        list_del_init(&n->poll_list);
        /* Barrier between the list removal and clearing the SCHED bit. */
        smp_mb__before_atomic();
        clear_bit(NAPI_STATE_SCHED, &n->state);
        return true;
}
EXPORT_SYMBOL(__napi_complete);
4920
/* Complete a NAPI poll round for @n.
 *
 * @work_done: amount of work the poll reported; only tested for zero here,
 *             to decide whether the GRO flush timer may be used.
 *
 * Returns false, leaving the state untouched, while NAPIF_STATE_NPSVC or
 * NAPIF_STATE_IN_BUSY_POLL is set.  Otherwise deals with packets still held
 * by GRO, clears NAPI_STATE_SCHED and returns true.
 */
bool napi_complete_done(struct napi_struct *n, int work_done)
{
        unsigned long flags;

        /*
         * 1) Don't let napi dequeue from the cpu poll list
         *    just in case it's running on a different cpu.
         * 2) If we are busy polling, do nothing here, we have
         *    the guarantee we will be called later.
         */
        if (unlikely(n->state & (NAPIF_STATE_NPSVC |
                                 NAPIF_STATE_IN_BUSY_POLL)))
                return false;

        if (n->gro_list) {
                unsigned long timeout = 0;

                /* The device's flush timeout is only honoured when some
                 * work was done in this round.
                 */
                if (work_done)
                        timeout = n->dev->gro_flush_timeout;

                /* Either arm the timer (see napi_watchdog()) or flush
                 * everything held right now.
                 */
                if (timeout)
                        hrtimer_start(&n->timer, ns_to_ktime(timeout),
                                      HRTIMER_MODE_REL_PINNED);
                else
                        napi_gro_flush(n, false);
        }
        if (likely(list_empty(&n->poll_list))) {
                WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
        } else {
                /* If n->poll_list is not empty, we need to mask irqs */
                local_irq_save(flags);
                __napi_complete(n);
                local_irq_restore(flags);
        }
        return true;
}
EXPORT_SYMBOL(napi_complete_done);
4958
4959 /* must be called under rcu_read_lock(), as we dont take a reference */
4960 static struct napi_struct *napi_by_id(unsigned int napi_id)
4961 {
4962         unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4963         struct napi_struct *napi;
4964
4965         hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4966                 if (napi->napi_id == napi_id)
4967                         return napi;
4968
4969         return NULL;
4970 }
4971
4972 #if defined(CONFIG_NET_RX_BUSY_POLL)
4973
/* Budget passed to the napi poll callback on each busy-poll invocation. */
#define BUSY_POLL_BUDGET 8
4975
/* Leave busy-poll mode on @napi.
 *
 * Clears NAPI_STATE_IN_BUSY_POLL, runs the poll callback once more with
 * bottom halves disabled, releases the netpoll lock identified by
 * @have_poll_lock, and re-schedules @napi if that poll used up the whole
 * budget.  Pending softirqs are run before returning.
 */
static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
{
        int rc;

        clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);

        local_bh_disable();

        /* All we really want here is to re-enable device interrupts.
         * Ideally, a new ndo_busy_poll_stop() could avoid another round.
         */
        rc = napi->poll(napi, BUSY_POLL_BUDGET);
        netpoll_poll_unlock(have_poll_lock);
        /* The whole budget was consumed: schedule a regular poll. */
        if (rc == BUSY_POLL_BUDGET)
                __napi_schedule(napi);
        local_bh_enable();
        if (local_softirq_pending())
                do_softirq();
}
4995
/* Busy-poll the NAPI instance recorded in sk->sk_napi_id on behalf of @sk.
 *
 * @sk:       socket whose receive queue is being waited on
 * @nonblock: when non-zero, a single poll iteration is made and no end time
 *            is computed
 *
 * The device's ndo_busy_poll method is used when it exists.  Otherwise the
 * napi is claimed by setting NAPIF_STATE_IN_BUSY_POLL and NAPIF_STATE_SCHED
 * with cmpxchg() and its regular poll callback is invoked with
 * BUSY_POLL_BUDGET.
 *
 * Returns false when no napi matches the id; otherwise returns whether
 * sk->sk_receive_queue is non-empty at the time the loop is left.
 */
bool sk_busy_loop(struct sock *sk, int nonblock)
{
        unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
        int (*napi_poll)(struct napi_struct *napi, int budget);
        int (*busy_poll)(struct napi_struct *dev);
        void *have_poll_lock = NULL;
        struct napi_struct *napi;
        int rc;

restart:
        rc = false;
        /* NULL means this thread does not own the napi (yet). */
        napi_poll = NULL;

        rcu_read_lock();

        napi = napi_by_id(sk->sk_napi_id);
        if (!napi)
                goto out;

        /* Note: ndo_busy_poll method is optional in linux-4.5 */
        busy_poll = napi->dev->netdev_ops->ndo_busy_poll;

        preempt_disable();
        for (;;) {
                rc = 0;
                local_bh_disable();
                if (busy_poll) {
                        rc = busy_poll(napi);
                        goto count;
                }
                if (!napi_poll) {
                        unsigned long val = READ_ONCE(napi->state);

                        /* If multiple threads are competing for this napi,
                         * we avoid dirtying napi->state as much as we can.
                         */
                        if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
                                   NAPIF_STATE_IN_BUSY_POLL))
                                goto count;
                        /* Lost the race for the state word: skip polling
                         * in this iteration.
                         */
                        if (cmpxchg(&napi->state, val,
                                    val | NAPIF_STATE_IN_BUSY_POLL |
                                          NAPIF_STATE_SCHED) != val)
                                goto count;
                        have_poll_lock = netpoll_poll_lock(napi);
                        napi_poll = napi->poll;
                }
                rc = napi_poll(napi, BUSY_POLL_BUDGET);
                trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
count:
                if (rc > 0)
                        __NET_ADD_STATS(sock_net(sk),
                                        LINUX_MIB_BUSYPOLLRXPACKETS, rc);
                local_bh_enable();

                if (rc == LL_FLUSH_FAILED)
                        break; /* permanent failure */

                if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) ||
                    busy_loop_timeout(end_time))
                        break;

                /* Give up the CPU: release the napi if we own it, drop the
                 * preempt/RCU sections, reschedule, then either return or
                 * start over from the napi lookup.
                 */
                if (unlikely(need_resched())) {
                        if (napi_poll)
                                busy_poll_stop(napi, have_poll_lock);
                        preempt_enable();
                        rcu_read_unlock();
                        cond_resched();
                        rc = !skb_queue_empty(&sk->sk_receive_queue);
                        if (rc || busy_loop_timeout(end_time))
                                return rc;
                        goto restart;
                }
                cpu_relax();
        }
        if (napi_poll)
                busy_poll_stop(napi, have_poll_lock);
        preempt_enable();
        rc = !skb_queue_empty(&sk->sk_receive_queue);
out:
        rcu_read_unlock();
        return rc;
}
EXPORT_SYMBOL(sk_busy_loop);
5079
5080 #endif /* CONFIG_NET_RX_BUSY_POLL */
5081
5082 static void napi_hash_add(struct napi_struct *napi)
5083 {
5084         if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
5085             test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
5086                 return;
5087
5088         spin_lock(&napi_hash_lock);
5089
5090         /* 0..NR_CPUS+1 range is reserved for sender_cpu use */
5091         do {
5092                 if (unlikely(++napi_gen_id < NR_CPUS + 1))
5093                         napi_gen_id = NR_CPUS + 1;
5094         } while (napi_by_id(napi_gen_id));
5095         napi->napi_id = napi_gen_id;
5096
5097         hlist_add_head_rcu(&napi->napi_hash_node,
5098                            &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
5099
5100         spin_unlock(&napi_hash_lock);
5101 }
5102
5103 /* Warning : caller is responsible to make sure rcu grace period
5104  * is respected before freeing memory containing @napi
5105  */
5106 bool napi_hash_del(struct napi_struct *napi)
5107 {
5108         bool rcu_sync_needed = false;
5109
5110         spin_lock(&napi_hash_lock);
5111
5112         if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
5113                 rcu_sync_needed = true;
5114                 hlist_del_rcu(&napi->napi_hash_node);
5115         }
5116         spin_unlock(&napi_hash_lock);
5117         return rcu_sync_needed;
5118 }
5119 EXPORT_SYMBOL_GPL(napi_hash_del);
5120
5121 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
5122 {
5123         struct napi_struct *napi;
5124
5125         napi = container_of(timer, struct napi_struct, timer);
5126         if (napi->gro_list)
5127                 napi_schedule(napi);
5128
5129         return HRTIMER_NORESTART;
5130 }
5131
5132 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
5133                     int (*poll)(struct napi_struct *, int), int weight)
5134 {
5135         INIT_LIST_HEAD(&napi->poll_list);
5136         hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
5137         napi->timer.function = napi_watchdog;
5138         napi->gro_count = 0;
5139         napi->gro_list = NULL;
5140         napi->skb = NULL;
5141         napi->poll = poll;
5142         if (weight > NAPI_POLL_WEIGHT)
5143                 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
5144                             weight, dev->name);
5145         napi->weight = weight;
5146         list_add(&napi->dev_list, &dev->napi_list);
5147         napi->dev = dev;
5148 #ifdef CONFIG_NETPOLL
5149         napi->poll_owner = -1;
5150 #endif
5151         set_bit(NAPI_STATE_SCHED, &napi->state);
5152         napi_hash_add(napi);
5153 }
5154 EXPORT_SYMBOL(netif_napi_add);
5155
/* Disable @n; may sleep.
 *
 * NAPI_STATE_DISABLE is set for the duration of the call.  The function
 * sleeps in 1 ms steps until it has itself set NAPI_STATE_SCHED and then
 * NAPI_STATE_NPSVC, cancels the napi timer, and finally clears
 * NAPI_STATE_DISABLE again.  SCHED and NPSVC are left set on return.
 */
void napi_disable(struct napi_struct *n)
{
        might_sleep();
        set_bit(NAPI_STATE_DISABLE, &n->state);

        /* Wait until the SCHED bit can be taken ... */
        while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
                msleep(1);
        /* ... and then the NPSVC bit as well. */
        while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
                msleep(1);

        hrtimer_cancel(&n->timer);

        clear_bit(NAPI_STATE_DISABLE, &n->state);
}
EXPORT_SYMBOL(napi_disable);
5171
5172 /* Must be called in process context */
5173 void netif_napi_del(struct napi_struct *napi)
5174 {
5175         might_sleep();
5176         if (napi_hash_del(napi))
5177                 synchronize_net();
5178         list_del_init(&napi->dev_list);
5179         napi_free_frags(napi);
5180
5181         kfree_skb_list(napi->gro_list);
5182         napi->gro_list = NULL;
5183         napi->gro_count = 0;
5184 }
5185 EXPORT_SYMBOL(netif_napi_del);
5186
/* Run one poll round for @n and return the amount of work it reported.
 *
 * @n is first taken off its poll list.  If the poll used less than its
 * weight nothing more is done.  If the full weight was used, @n is added to
 * the tail of @repoll, unless a disable is pending (then it is completed) or
 * the driver has already put it back on a poll list.
 */
static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
        void *have;
        int work, weight;

        list_del_init(&n->poll_list);

        have = netpoll_poll_lock(n);

        weight = n->weight;

        /* This NAPI_STATE_SCHED test is for avoiding a race
         * with netpoll's poll_napi().  Only the entity which
         * obtains the lock and sees NAPI_STATE_SCHED set will
         * actually make the ->poll() call.  Therefore we avoid
         * accidentally calling ->poll() when NAPI is not scheduled.
         */
        work = 0;
        if (test_bit(NAPI_STATE_SCHED, &n->state)) {
                work = n->poll(n, weight);
                trace_napi_poll(n, work, weight);
        }

        WARN_ON_ONCE(work > weight);

        if (likely(work < weight))
                goto out_unlock;

        /* Drivers must not modify the NAPI state if they
         * consume the entire weight.  In such cases this code
         * still "owns" the NAPI instance and therefore can
         * move the instance around on the list at-will.
         */
        if (unlikely(napi_disable_pending(n))) {
                napi_complete(n);
                goto out_unlock;
        }

        if (n->gro_list) {
                /* flush too old packets
                 * If HZ < 1000, flush all packets.
                 */
                napi_gro_flush(n, HZ >= 1000);
        }

        /* Some drivers may have called napi_schedule
         * prior to exhausting their budget.
         */
        if (unlikely(!list_empty(&n->poll_list))) {
                pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
                             n->dev ? n->dev->name : "backlog");
                goto out_unlock;
        }

        /* Full weight consumed: queue for another round. */
        list_add_tail(&n->poll_list, repoll);

out_unlock:
        netpoll_poll_unlock(have);

        return work;
}
5248
5249 static __latent_entropy void net_rx_action(struct softirq_action *h)
5250 {
5251         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
5252         unsigned long time_limit = jiffies + 2;
5253         int budget = netdev_budget;
5254         LIST_HEAD(list);
5255         LIST_HEAD(repoll);
5256
5257         local_irq_disable();
5258         list_splice_init(&sd->poll_list, &list);
5259         local_irq_enable();
5260
5261         for (;;) {
5262                 struct napi_struct *n;
5263
5264                 if (list_empty(&list)) {
5265                         if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
5266                                 goto out;
5267                         break;
5268                 }
5269
5270                 n = list_first_entry(&list, struct napi_struct, poll_list);
5271                 budget -= napi_poll(n, &repoll);
5272
5273                 /* If softirq window is exhausted then punt.
5274                  * Allow this to run for 2 jiffies since which will allow
5275                  * an average latency of 1.5/HZ.
5276                  */
5277                 if (unlikely(budget <= 0 ||
5278                              time_after_eq(jiffies, time_limit))) {
5279                         sd->time_squeeze++;
5280                         break;
5281                 }
5282         }
5283
5284         local_irq_disable();
5285
5286         list_splice_tail_init(&sd->poll_list, &list);
5287         list_splice_tail(&repoll, &list);
5288         list_splice(&list, &sd->poll_list);
5289         if (!list_empty(&sd->poll_list))
5290                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
5291
5292         net_rps_action_and_irq_enable(sd);
5293 out:
5294         __kfree_skb_flush();
5295 }
5296
5297 struct netdev_adjacent {               /* one entry of a device's upper or lower adjacency list */
5298         struct net_device *dev;         /* the neighbouring device this entry refers to */
5299
5300         /* upper master flag, there can only be one master device per list */
5301         bool master;
5302
5303         /* number of times this device was added to the list; an entry is reused, not duplicated */
5304         u16 ref_nr;
5305
5306         /* private field for the users */
5307         void *private;
5308
5309         struct list_head list;          /* linkage in the owning adjacency list */
5310         struct rcu_head rcu;            /* handed to kfree_rcu() when the entry is removed */
5311 };
5312
5313 static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
5314                                                  struct list_head *adj_list)
5315 {
5316         struct netdev_adjacent *adj;
5317
5318         list_for_each_entry(adj, adj_list, list) {
5319                 if (adj->dev == adj_dev)
5320                         return adj;
5321         }
5322         return NULL;
5323 }
5324
/*
 * Walker callback: @data is the device being searched for.
 * Returns 1 when @upper_dev is that device, 0 otherwise.
 */
static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data)
{
	return upper_dev == (struct net_device *)data;
}
5331
5332 /**
5333  * netdev_has_upper_dev - Check if device is linked to an upper device
5334  * @dev: device
5335  * @upper_dev: upper device to check
5336  *
5337  * Find out if a device is linked to specified upper device and return true
5338  * in case it is. Note that this checks only immediate upper device,
5339  * not through a complete stack of devices. The caller must hold the RTNL lock.
5340  */
5341 bool netdev_has_upper_dev(struct net_device *dev,
5342                           struct net_device *upper_dev)
5343 {
5344         ASSERT_RTNL();
5345
5346         return netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
5347                                              upper_dev);
5348 }
5349 EXPORT_SYMBOL(netdev_has_upper_dev);
5350
5351 /**
5352  * netdev_has_upper_dev_all - Check if device is linked to an upper device
5353  * @dev: device
5354  * @upper_dev: upper device to check
5355  *
5356  * Find out if a device is linked to specified upper device and return true
5357  * in case it is. Note that this checks the entire upper device chain.
5358  * The caller must hold rcu lock.
5359  */
5360
5361 bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
5362                                   struct net_device *upper_dev)
5363 {
5364         return !!netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
5365                                                upper_dev);
5366 }
5367 EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
5368
5369 /**
5370  * netdev_has_any_upper_dev - Check if device is linked to some device
5371  * @dev: device
5372  *
5373  * Find out if a device is linked to an upper device and return true in case
5374  * it is. The caller must hold the RTNL lock.
5375  */
5376 static bool netdev_has_any_upper_dev(struct net_device *dev)
5377 {
5378         ASSERT_RTNL();
5379
5380         return !list_empty(&dev->adj_list.upper);
5381 }
5382
5383 /**
5384  * netdev_master_upper_dev_get - Get master upper device
5385  * @dev: device
5386  *
5387  * Find a master upper device and return pointer to it or NULL in case
5388  * it's not there. The caller must hold the RTNL lock.
5389  */
5390 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
5391 {
5392         struct netdev_adjacent *upper;
5393
5394         ASSERT_RTNL();
5395
5396         if (list_empty(&dev->adj_list.upper))
5397                 return NULL;
5398
5399         upper = list_first_entry(&dev->adj_list.upper,
5400                                  struct netdev_adjacent, list);
5401         if (likely(upper->master))
5402                 return upper->dev;
5403         return NULL;
5404 }
5405 EXPORT_SYMBOL(netdev_master_upper_dev_get);
5406
5407 /**
5408  * netdev_has_any_lower_dev - Check if device is linked to some device
5409  * @dev: device
5410  *
5411  * Find out if a device is linked to a lower device and return true in case
5412  * it is. The caller must hold the RTNL lock.
5413  */
5414 static bool netdev_has_any_lower_dev(struct net_device *dev)
5415 {
5416         ASSERT_RTNL();
5417
5418         return !list_empty(&dev->adj_list.lower);
5419 }
5420
5421 void *netdev_adjacent_get_private(struct list_head *adj_list)
5422 {
5423         struct netdev_adjacent *adj;
5424
5425         adj = list_entry(adj_list, struct netdev_adjacent, list);
5426
5427         return adj->private;
5428 }
5429 EXPORT_SYMBOL(netdev_adjacent_get_private);
5430
5431 /**
5432  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
5433  * @dev: device
5434  * @iter: list_head ** of the current position
5435  *
5436  * Gets the next device from the dev's upper list, starting from iter
5437  * position. The caller must hold RCU read lock.
5438  */
5439 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
5440                                                  struct list_head **iter)
5441 {
5442         struct netdev_adjacent *upper;
5443
5444         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5445
5446         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5447
5448         if (&upper->list == &dev->adj_list.upper)
5449                 return NULL;
5450
5451         *iter = &upper->list;
5452
5453         return upper->dev;
5454 }
5455 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
5456
5457 static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
5458                                                     struct list_head **iter)
5459 {
5460         struct netdev_adjacent *upper;
5461
5462         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5463
5464         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5465
5466         if (&upper->list == &dev->adj_list.upper)
5467                 return NULL;
5468
5469         *iter = &upper->list;
5470
5471         return upper->dev;
5472 }
5473
5474 int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
5475                                   int (*fn)(struct net_device *dev,
5476                                             void *data),
5477                                   void *data)
5478 {
5479         struct net_device *udev;
5480         struct list_head *iter;
5481         int ret;
5482
5483         for (iter = &dev->adj_list.upper,
5484              udev = netdev_next_upper_dev_rcu(dev, &iter);
5485              udev;
5486              udev = netdev_next_upper_dev_rcu(dev, &iter)) {
5487                 /* first is the upper device itself */
5488                 ret = fn(udev, data);
5489                 if (ret)
5490                         return ret;
5491
5492                 /* then look at all of its upper devices */
5493                 ret = netdev_walk_all_upper_dev_rcu(udev, fn, data);
5494                 if (ret)
5495                         return ret;
5496         }
5497
5498         return 0;
5499 }
5500 EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
5501
5502 /**
5503  * netdev_lower_get_next_private - Get the next ->private from the
5504  *                                 lower neighbour list
5505  * @dev: device
5506  * @iter: list_head ** of the current position
5507  *
5508  * Gets the next netdev_adjacent->private from the dev's lower neighbour
5509  * list, starting from iter position. The caller must hold either hold the
5510  * RTNL lock or its own locking that guarantees that the neighbour lower
5511  * list will remain unchanged.
5512  */
5513 void *netdev_lower_get_next_private(struct net_device *dev,
5514                                     struct list_head **iter)
5515 {
5516         struct netdev_adjacent *lower;
5517
5518         lower = list_entry(*iter, struct netdev_adjacent, list);
5519
5520         if (&lower->list == &dev->adj_list.lower)
5521                 return NULL;
5522
5523         *iter = lower->list.next;
5524
5525         return lower->private;
5526 }
5527 EXPORT_SYMBOL(netdev_lower_get_next_private);
5528
5529 /**
5530  * netdev_lower_get_next_private_rcu - Get the next ->private from the
5531  *                                     lower neighbour list, RCU
5532  *                                     variant
5533  * @dev: device
5534  * @iter: list_head ** of the current position
5535  *
5536  * Gets the next netdev_adjacent->private from the dev's lower neighbour
5537  * list, starting from iter position. The caller must hold RCU read lock.
5538  */
5539 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5540                                         struct list_head **iter)
5541 {
5542         struct netdev_adjacent *lower;
5543
5544         WARN_ON_ONCE(!rcu_read_lock_held());
5545
5546         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5547
5548         if (&lower->list == &dev->adj_list.lower)
5549                 return NULL;
5550
5551         *iter = &lower->list;
5552
5553         return lower->private;
5554 }
5555 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5556
5557 /**
5558  * netdev_lower_get_next - Get the next device from the lower neighbour
5559  *                         list
5560  * @dev: device
5561  * @iter: list_head ** of the current position
5562  *
5563  * Gets the next netdev_adjacent from the dev's lower neighbour
5564  * list, starting from iter position. The caller must hold RTNL lock or
5565  * its own locking that guarantees that the neighbour lower
5566  * list will remain unchanged.
5567  */
5568 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5569 {
5570         struct netdev_adjacent *lower;
5571
5572         lower = list_entry(*iter, struct netdev_adjacent, list);
5573
5574         if (&lower->list == &dev->adj_list.lower)
5575                 return NULL;
5576
5577         *iter = lower->list.next;
5578
5579         return lower->dev;
5580 }
5581 EXPORT_SYMBOL(netdev_lower_get_next);
5582
5583 static struct net_device *netdev_next_lower_dev(struct net_device *dev,
5584                                                 struct list_head **iter)
5585 {
5586         struct netdev_adjacent *lower;
5587
5588         lower = list_entry((*iter)->next, struct netdev_adjacent, list);
5589
5590         if (&lower->list == &dev->adj_list.lower)
5591                 return NULL;
5592
5593         *iter = &lower->list;
5594
5595         return lower->dev;
5596 }
5597
5598 int netdev_walk_all_lower_dev(struct net_device *dev,
5599                               int (*fn)(struct net_device *dev,
5600                                         void *data),
5601                               void *data)
5602 {
5603         struct net_device *ldev;
5604         struct list_head *iter;
5605         int ret;
5606
5607         for (iter = &dev->adj_list.lower,
5608              ldev = netdev_next_lower_dev(dev, &iter);
5609              ldev;
5610              ldev = netdev_next_lower_dev(dev, &iter)) {
5611                 /* first is the lower device itself */
5612                 ret = fn(ldev, data);
5613                 if (ret)
5614                         return ret;
5615
5616                 /* then look at all of its lower devices */
5617                 ret = netdev_walk_all_lower_dev(ldev, fn, data);
5618                 if (ret)
5619                         return ret;
5620         }
5621
5622         return 0;
5623 }
5624 EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
5625
5626 static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
5627                                                     struct list_head **iter)
5628 {
5629         struct netdev_adjacent *lower;
5630
5631         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5632         if (&lower->list == &dev->adj_list.lower)
5633                 return NULL;
5634
5635         *iter = &lower->list;
5636
5637         return lower->dev;
5638 }
5639
5640 int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
5641                                   int (*fn)(struct net_device *dev,
5642                                             void *data),
5643                                   void *data)
5644 {
5645         struct net_device *ldev;
5646         struct list_head *iter;
5647         int ret;
5648
5649         for (iter = &dev->adj_list.lower,
5650              ldev = netdev_next_lower_dev_rcu(dev, &iter);
5651              ldev;
5652              ldev = netdev_next_lower_dev_rcu(dev, &iter)) {
5653                 /* first is the lower device itself */
5654                 ret = fn(ldev, data);
5655                 if (ret)
5656                         return ret;
5657
5658                 /* then look at all of its lower devices */
5659                 ret = netdev_walk_all_lower_dev_rcu(ldev, fn, data);
5660                 if (ret)
5661                         return ret;
5662         }
5663
5664         return 0;
5665 }
5666 EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
5667
5668 /**
5669  * netdev_lower_get_first_private_rcu - Get the first ->private from the
5670  *                                     lower neighbour list, RCU
5671  *                                     variant
5672  * @dev: device
5673  *
5674  * Gets the first netdev_adjacent->private from the dev's lower neighbour
5675  * list. The caller must hold RCU read lock.
5676  */
5677 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5678 {
5679         struct netdev_adjacent *lower;
5680
5681         lower = list_first_or_null_rcu(&dev->adj_list.lower,
5682                         struct netdev_adjacent, list);
5683         if (lower)
5684                 return lower->private;
5685         return NULL;
5686 }
5687 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5688
5689 /**
5690  * netdev_master_upper_dev_get_rcu - Get master upper device
5691  * @dev: device
5692  *
5693  * Find a master upper device and return pointer to it or NULL in case
5694  * it's not there. The caller must hold the RCU read lock.
5695  */
5696 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5697 {
5698         struct netdev_adjacent *upper;
5699
5700         upper = list_first_or_null_rcu(&dev->adj_list.upper,
5701                                        struct netdev_adjacent, list);
5702         if (upper && likely(upper->master))
5703                 return upper->dev;
5704         return NULL;
5705 }
5706 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
5707
5708 static int netdev_adjacent_sysfs_add(struct net_device *dev,
5709                               struct net_device *adj_dev,
5710                               struct list_head *dev_list)
5711 {
5712         char linkname[IFNAMSIZ+7];
5713         sprintf(linkname, dev_list == &dev->adj_list.upper ?
5714                 "upper_%s" : "lower_%s", adj_dev->name);
5715         return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5716                                  linkname);
5717 }
5718 static void netdev_adjacent_sysfs_del(struct net_device *dev,
5719                                char *name,
5720                                struct list_head *dev_list)
5721 {
5722         char linkname[IFNAMSIZ+7];
5723         sprintf(linkname, dev_list == &dev->adj_list.upper ?
5724                 "upper_%s" : "lower_%s", name);
5725         sysfs_remove_link(&(dev->dev.kobj), linkname);
5726 }
5727
5728 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5729                                                  struct net_device *adj_dev,
5730                                                  struct list_head *dev_list)
5731 {
5732         return (dev_list == &dev->adj_list.upper ||
5733                 dev_list == &dev->adj_list.lower) &&
5734                 net_eq(dev_net(dev), dev_net(adj_dev));
5735 }
5736
5737 static int __netdev_adjacent_dev_insert(struct net_device *dev,
5738                                         struct net_device *adj_dev,
5739                                         struct list_head *dev_list,
5740                                         void *private, bool master)
5741 {
5742         struct netdev_adjacent *adj;
5743         int ret;
5744
5745         adj = __netdev_find_adj(adj_dev, dev_list);
5746
5747         if (adj) {
5748                 adj->ref_nr += 1;
5749                 pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
5750                          dev->name, adj_dev->name, adj->ref_nr);
5751
5752                 return 0;
5753         }
5754
5755         adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5756         if (!adj)
5757                 return -ENOMEM;
5758
5759         adj->dev = adj_dev;
5760         adj->master = master;
5761         adj->ref_nr = 1;
5762         adj->private = private;
5763         dev_hold(adj_dev);
5764
5765         pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
5766                  dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
5767
5768         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5769                 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5770                 if (ret)
5771                         goto free_adj;
5772         }
5773
5774         /* Ensure that master link is always the first item in list. */
5775         if (master) {
5776                 ret = sysfs_create_link(&(dev->dev.kobj),
5777                                         &(adj_dev->dev.kobj), "master");
5778                 if (ret)
5779                         goto remove_symlinks;
5780
5781                 list_add_rcu(&adj->list, dev_list);
5782         } else {
5783                 list_add_tail_rcu(&adj->list, dev_list);
5784         }
5785
5786         return 0;
5787
5788 remove_symlinks:
5789         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5790                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5791 free_adj:
5792         kfree(adj);
5793         dev_put(adj_dev);
5794
5795         return ret;
5796 }
5797
5798 static void __netdev_adjacent_dev_remove(struct net_device *dev,
5799                                          struct net_device *adj_dev,
5800                                          u16 ref_nr,
5801                                          struct list_head *dev_list)
5802 {
5803         struct netdev_adjacent *adj;
5804
5805         pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
5806                  dev->name, adj_dev->name, ref_nr);
5807
5808         adj = __netdev_find_adj(adj_dev, dev_list);
5809
5810         if (!adj) {
5811                 pr_err("Adjacency does not exist for device %s from %s\n",
5812                        dev->name, adj_dev->name);
5813                 WARN_ON(1);
5814                 return;
5815         }
5816
5817         if (adj->ref_nr > ref_nr) {
5818                 pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
5819                          dev->name, adj_dev->name, ref_nr,
5820                          adj->ref_nr - ref_nr);
5821                 adj->ref_nr -= ref_nr;
5822                 return;
5823         }
5824
5825         if (adj->master)
5826                 sysfs_remove_link(&(dev->dev.kobj), "master");
5827
5828         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5829                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5830
5831         list_del_rcu(&adj->list);
5832         pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
5833                  adj_dev->name, dev->name, adj_dev->name);
5834         dev_put(adj_dev);
5835         kfree_rcu(adj, rcu);
5836 }
5837
5838 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5839                                             struct net_device *upper_dev,
5840                                             struct list_head *up_list,
5841                                             struct list_head *down_list,
5842                                             void *private, bool master)
5843 {
5844         int ret;
5845
5846         ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
5847                                            private, master);
5848         if (ret)
5849                 return ret;
5850
5851         ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
5852                                            private, false);
5853         if (ret) {
5854                 __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
5855                 return ret;
5856         }
5857
5858         return 0;
5859 }
5860
5861 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5862                                                struct net_device *upper_dev,
5863                                                u16 ref_nr,
5864                                                struct list_head *up_list,
5865                                                struct list_head *down_list)
5866 {
5867         __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
5868         __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
5869 }
5870
5871 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5872                                                 struct net_device *upper_dev,
5873                                                 void *private, bool master)
5874 {
5875         return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5876                                                 &dev->adj_list.upper,
5877                                                 &upper_dev->adj_list.lower,
5878                                                 private, master);
5879 }
5880
5881 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5882                                                    struct net_device *upper_dev)
5883 {
5884         __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
5885                                            &dev->adj_list.upper,
5886                                            &upper_dev->adj_list.lower);
5887 }
5888
5889 static int __netdev_upper_dev_link(struct net_device *dev,
5890                                    struct net_device *upper_dev, bool master,
5891                                    void *upper_priv, void *upper_info)
5892 {
5893         struct netdev_notifier_changeupper_info changeupper_info;
5894         int ret = 0;
5895
5896         ASSERT_RTNL();
5897
5898         if (dev == upper_dev)
5899                 return -EBUSY;
5900
5901         /* To prevent loops, check if dev is not upper device to upper_dev. */
5902         if (netdev_has_upper_dev(upper_dev, dev))
5903                 return -EBUSY;
5904
5905         if (netdev_has_upper_dev(dev, upper_dev))
5906                 return -EEXIST;
5907
5908         if (master && netdev_master_upper_dev_get(dev))
5909                 return -EBUSY;
5910
5911         changeupper_info.upper_dev = upper_dev;
5912         changeupper_info.master = master;
5913         changeupper_info.linking = true;
5914         changeupper_info.upper_info = upper_info;
5915
5916         ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5917                                             &changeupper_info.info);
5918         ret = notifier_to_errno(ret);
5919         if (ret)
5920                 return ret;
5921
5922         ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
5923                                                    master);
5924         if (ret)
5925                 return ret;
5926
5927         ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5928                                             &changeupper_info.info);
5929         ret = notifier_to_errno(ret);
5930         if (ret)
5931                 goto rollback;
5932
5933         return 0;
5934
5935 rollback:
5936         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5937
5938         return ret;
5939 }
5940
5941 /**
5942  * netdev_upper_dev_link - Add a link to the upper device
5943  * @dev: device
5944  * @upper_dev: new upper device
5945  *
5946  * Adds a link to device which is upper to this one. The caller must hold
5947  * the RTNL lock. On a failure a negative errno code is returned.
5948  * On success the reference counts are adjusted and the function
5949  * returns zero.
5950  */
5951 int netdev_upper_dev_link(struct net_device *dev,
5952                           struct net_device *upper_dev)
5953 {
5954         return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL);
5955 }
5956 EXPORT_SYMBOL(netdev_upper_dev_link);
5957
5958 /**
5959  * netdev_master_upper_dev_link - Add a master link to the upper device
5960  * @dev: device
5961  * @upper_dev: new upper device
5962  * @upper_priv: upper device private
5963  * @upper_info: upper info to be passed down via notifier
5964  *
5965  * Adds a link to device which is upper to this one. In this case, only
5966  * one master upper device can be linked, although other non-master devices
5967  * might be linked as well. The caller must hold the RTNL lock.
5968  * On a failure a negative errno code is returned. On success the reference
5969  * counts are adjusted and the function returns zero.
5970  */
5971 int netdev_master_upper_dev_link(struct net_device *dev,
5972                                  struct net_device *upper_dev,
5973                                  void *upper_priv, void *upper_info)
5974 {
5975         return __netdev_upper_dev_link(dev, upper_dev, true,
5976                                        upper_priv, upper_info);
5977 }
5978 EXPORT_SYMBOL(netdev_master_upper_dev_link);
5979
5980 /**
5981  * netdev_upper_dev_unlink - Removes a link to upper device
5982  * @dev: device
5983  * @upper_dev: new upper device
5984  *
5985  * Removes a link to device which is upper to this one. The caller must hold
5986  * the RTNL lock.
5987  */
5988 void netdev_upper_dev_unlink(struct net_device *dev,
5989                              struct net_device *upper_dev)
5990 {
5991         struct netdev_notifier_changeupper_info changeupper_info;
5992         ASSERT_RTNL();
5993
5994         changeupper_info.upper_dev = upper_dev;
5995         changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
5996         changeupper_info.linking = false;
5997
5998         call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5999                                       &changeupper_info.info);
6000
6001         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
6002
6003         call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
6004                                       &changeupper_info.info);
6005 }
6006 EXPORT_SYMBOL(netdev_upper_dev_unlink);
6007
6008 /**
6009  * netdev_bonding_info_change - Dispatch event about slave change
6010  * @dev: device
6011  * @bonding_info: info to dispatch
6012  *
6013  * Send NETDEV_BONDING_INFO to netdev notifiers with info.
6014  * The caller must hold the RTNL lock.
6015  */
6016 void netdev_bonding_info_change(struct net_device *dev,
6017                                 struct netdev_bonding_info *bonding_info)
6018 {
6019         struct netdev_notifier_bonding_info     info;
6020
6021         memcpy(&info.bonding_info, bonding_info,
6022                sizeof(struct netdev_bonding_info));
6023         call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
6024                                       &info.info);
6025 }
6026 EXPORT_SYMBOL(netdev_bonding_info_change);
6027
6028 static void netdev_adjacent_add_links(struct net_device *dev)
6029 {
6030         struct netdev_adjacent *iter;
6031
6032         struct net *net = dev_net(dev);
6033
6034         list_for_each_entry(iter, &dev->adj_list.upper, list) {
6035                 if (!net_eq(net, dev_net(iter->dev)))
6036                         continue;
6037                 netdev_adjacent_sysfs_add(iter->dev, dev,
6038                                           &iter->dev->adj_list.lower);
6039                 netdev_adjacent_sysfs_add(dev, iter->dev,
6040                                           &dev->adj_list.upper);
6041         }
6042
6043         list_for_each_entry(iter, &dev->adj_list.lower, list) {
6044                 if (!net_eq(net, dev_net(iter->dev)))
6045                         continue;
6046                 netdev_adjacent_sysfs_add(iter->dev, dev,
6047                                           &iter->dev->adj_list.upper);
6048                 netdev_adjacent_sysfs_add(dev, iter->dev,
6049                                           &dev->adj_list.lower);
6050         }
6051 }
6052
6053 static void netdev_adjacent_del_links(struct net_device *dev)
6054 {
6055         struct netdev_adjacent *iter;
6056
6057         struct net *net = dev_net(dev);
6058
6059         list_for_each_entry(iter, &dev->adj_list.upper, list) {
6060                 if (!net_eq(net, dev_net(iter->dev)))
6061                         continue;
6062                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
6063                                           &iter->dev->adj_list.lower);
6064                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
6065                                           &dev->adj_list.upper);
6066         }
6067
6068         list_for_each_entry(iter, &dev->adj_list.lower, list) {
6069                 if (!net_eq(net, dev_net(iter->dev)))
6070                         continue;
6071                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
6072                                           &iter->dev->adj_list.upper);
6073                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
6074                                           &dev->adj_list.lower);
6075         }
6076 }
6077
6078 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
6079 {
6080         struct netdev_adjacent *iter;
6081
6082         struct net *net = dev_net(dev);
6083
6084         list_for_each_entry(iter, &dev->adj_list.upper, list) {
6085                 if (!net_eq(net, dev_net(iter->dev)))
6086                         continue;
6087                 netdev_adjacent_sysfs_del(iter->dev, oldname,
6088                                           &iter->dev->adj_list.lower);
6089                 netdev_adjacent_sysfs_add(iter->dev, dev,
6090                                           &iter->dev->adj_list.lower);
6091         }
6092
6093         list_for_each_entry(iter, &dev->adj_list.lower, list) {
6094                 if (!net_eq(net, dev_net(iter->dev)))
6095                         continue;
6096                 netdev_adjacent_sysfs_del(iter->dev, oldname,
6097                                           &iter->dev->adj_list.upper);
6098                 netdev_adjacent_sysfs_add(iter->dev, dev,
6099                                           &iter->dev->adj_list.upper);
6100         }
6101 }
6102
6103 void *netdev_lower_dev_get_private(struct net_device *dev,
6104                                    struct net_device *lower_dev)
6105 {
6106         struct netdev_adjacent *lower;
6107
6108         if (!lower_dev)
6109                 return NULL;
6110         lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
6111         if (!lower)
6112                 return NULL;
6113
6114         return lower->private;
6115 }
6116 EXPORT_SYMBOL(netdev_lower_dev_get_private);
6117
6118
6119 int dev_get_nest_level(struct net_device *dev)
6120 {
6121         struct net_device *lower = NULL;
6122         struct list_head *iter;
6123         int max_nest = -1;
6124         int nest;
6125
6126         ASSERT_RTNL();
6127
6128         netdev_for_each_lower_dev(dev, lower, iter) {
6129                 nest = dev_get_nest_level(lower);
6130                 if (max_nest < nest)
6131                         max_nest = nest;
6132         }
6133
6134         return max_nest + 1;
6135 }
6136 EXPORT_SYMBOL(dev_get_nest_level);
6137
6138 /**
6139  * netdev_lower_change - Dispatch event about lower device state change
6140  * @lower_dev: device
6141  * @lower_state_info: state to dispatch
6142  *
6143  * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
6144  * The caller must hold the RTNL lock.
6145  */
6146 void netdev_lower_state_changed(struct net_device *lower_dev,
6147                                 void *lower_state_info)
6148 {
6149         struct netdev_notifier_changelowerstate_info changelowerstate_info;
6150
6151         ASSERT_RTNL();
6152         changelowerstate_info.lower_state_info = lower_state_info;
6153         call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev,
6154                                       &changelowerstate_info.info);
6155 }
6156 EXPORT_SYMBOL(netdev_lower_state_changed);
6157
6158 int netdev_default_l2upper_neigh_construct(struct net_device *dev,
6159                                            struct neighbour *n)
6160 {
6161         struct net_device *lower_dev, *stop_dev;
6162         struct list_head *iter;
6163         int err;
6164
6165         netdev_for_each_lower_dev(dev, lower_dev, iter) {
6166                 if (!lower_dev->netdev_ops->ndo_neigh_construct)
6167                         continue;
6168                 err = lower_dev->netdev_ops->ndo_neigh_construct(lower_dev, n);
6169                 if (err) {
6170                         stop_dev = lower_dev;
6171                         goto rollback;
6172                 }
6173         }
6174         return 0;
6175
6176 rollback:
6177         netdev_for_each_lower_dev(dev, lower_dev, iter) {
6178                 if (lower_dev == stop_dev)
6179                         break;
6180                 if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6181                         continue;
6182                 lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6183         }
6184         return err;
6185 }
6186 EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_construct);
6187
6188 void netdev_default_l2upper_neigh_destroy(struct net_device *dev,
6189                                           struct neighbour *n)
6190 {
6191         struct net_device *lower_dev;
6192         struct list_head *iter;
6193
6194         netdev_for_each_lower_dev(dev, lower_dev, iter) {
6195                 if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6196                         continue;
6197                 lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6198         }
6199 }
6200 EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_destroy);
6201
6202 static void dev_change_rx_flags(struct net_device *dev, int flags)
6203 {
6204         const struct net_device_ops *ops = dev->netdev_ops;
6205
6206         if (ops->ndo_change_rx_flags)
6207                 ops->ndo_change_rx_flags(dev, flags);
6208 }
6209
6210 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
6211 {
6212         unsigned int old_flags = dev->flags;
6213         kuid_t uid;
6214         kgid_t gid;
6215
6216         ASSERT_RTNL();
6217
6218         dev->flags |= IFF_PROMISC;
6219         dev->promiscuity += inc;
6220         if (dev->promiscuity == 0) {
6221                 /*
6222                  * Avoid overflow.
6223                  * If inc causes overflow, untouch promisc and return error.
6224                  */
6225                 if (inc < 0)
6226                         dev->flags &= ~IFF_PROMISC;
6227                 else {
6228                         dev->promiscuity -= inc;
6229                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
6230                                 dev->name);
6231                         return -EOVERFLOW;
6232                 }
6233         }
6234         if (dev->flags != old_flags) {
6235                 pr_info("device %s %s promiscuous mode\n",
6236                         dev->name,
6237                         dev->flags & IFF_PROMISC ? "entered" : "left");
6238                 if (audit_enabled) {
6239                         current_uid_gid(&uid, &gid);
6240                         audit_log(current->audit_context, GFP_ATOMIC,
6241                                 AUDIT_ANOM_PROMISCUOUS,
6242                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
6243                                 dev->name, (dev->flags & IFF_PROMISC),
6244                                 (old_flags & IFF_PROMISC),
6245                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
6246                                 from_kuid(&init_user_ns, uid),
6247                                 from_kgid(&init_user_ns, gid),
6248                                 audit_get_sessionid(current));
6249                 }
6250
6251                 dev_change_rx_flags(dev, IFF_PROMISC);
6252         }
6253         if (notify)
6254                 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
6255         return 0;
6256 }
6257
6258 /**
6259  *      dev_set_promiscuity     - update promiscuity count on a device
6260  *      @dev: device
6261  *      @inc: modifier
6262  *
6263  *      Add or remove promiscuity from a device. While the count in the device
6264  *      remains above zero the interface remains promiscuous. Once it hits zero
6265  *      the device reverts back to normal filtering operation. A negative inc
6266  *      value is used to drop promiscuity on the device.
6267  *      Return 0 if successful or a negative errno code on error.
6268  */
6269 int dev_set_promiscuity(struct net_device *dev, int inc)
6270 {
6271         unsigned int old_flags = dev->flags;
6272         int err;
6273
6274         err = __dev_set_promiscuity(dev, inc, true);
6275         if (err < 0)
6276                 return err;
6277         if (dev->flags != old_flags)
6278                 dev_set_rx_mode(dev);
6279         return err;
6280 }
6281 EXPORT_SYMBOL(dev_set_promiscuity);
6282
6283 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
6284 {
6285         unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
6286
6287         ASSERT_RTNL();
6288
6289         dev->flags |= IFF_ALLMULTI;
6290         dev->allmulti += inc;
6291         if (dev->allmulti == 0) {
6292                 /*
6293                  * Avoid overflow.
6294                  * If inc causes overflow, untouch allmulti and return error.
6295                  */
6296                 if (inc < 0)
6297                         dev->flags &= ~IFF_ALLMULTI;
6298                 else {
6299                         dev->allmulti -= inc;
6300                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
6301                                 dev->name);
6302                         return -EOVERFLOW;
6303                 }
6304         }
6305         if (dev->flags ^ old_flags) {
6306                 dev_change_rx_flags(dev, IFF_ALLMULTI);
6307                 dev_set_rx_mode(dev);
6308                 if (notify)
6309                         __dev_notify_flags(dev, old_flags,
6310                                            dev->gflags ^ old_gflags);
6311         }
6312         return 0;
6313 }
6314
6315 /**
6316  *      dev_set_allmulti        - update allmulti count on a device
6317  *      @dev: device
6318  *      @inc: modifier
6319  *
6320  *      Add or remove reception of all multicast frames to a device. While the
6321  *      count in the device remains above zero the interface remains listening
6322  *      to all interfaces. Once it hits zero the device reverts back to normal
6323  *      filtering operation. A negative @inc value is used to drop the counter
6324  *      when releasing a resource needing all multicasts.
6325  *      Return 0 if successful or a negative errno code on error.
6326  */
6327
6328 int dev_set_allmulti(struct net_device *dev, int inc)
6329 {
6330         return __dev_set_allmulti(dev, inc, true);
6331 }
6332 EXPORT_SYMBOL(dev_set_allmulti);
6333
6334 /*
6335  *      Upload unicast and multicast address lists to device and
6336  *      configure RX filtering. When the device doesn't support unicast
6337  *      filtering it is put in promiscuous mode while unicast addresses
6338  *      are present.
6339  */
6340 void __dev_set_rx_mode(struct net_device *dev)
6341 {
6342         const struct net_device_ops *ops = dev->netdev_ops;
6343
6344         /* dev_open will call this function so the list will stay sane. */
6345         if (!(dev->flags&IFF_UP))
6346                 return;
6347
6348         if (!netif_device_present(dev))
6349                 return;
6350
6351         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
6352                 /* Unicast addresses changes may only happen under the rtnl,
6353                  * therefore calling __dev_set_promiscuity here is safe.
6354                  */
6355                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
6356                         __dev_set_promiscuity(dev, 1, false);
6357                         dev->uc_promisc = true;
6358                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
6359                         __dev_set_promiscuity(dev, -1, false);
6360                         dev->uc_promisc = false;
6361                 }
6362         }
6363
6364         if (ops->ndo_set_rx_mode)
6365                 ops->ndo_set_rx_mode(dev);
6366 }
6367
/* Run __dev_set_rx_mode() on @dev between netif_addr_lock_bh() and
 * netif_addr_unlock_bh().
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);

	__dev_set_rx_mode(dev);

	netif_addr_unlock_bh(dev);
}
6374
6375 /**
6376  *      dev_get_flags - get flags reported to userspace
6377  *      @dev: device
6378  *
6379  *      Get the combination of flag bits exported through APIs to userspace.
6380  */
6381 unsigned int dev_get_flags(const struct net_device *dev)
6382 {
6383         unsigned int flags;
6384
6385         flags = (dev->flags & ~(IFF_PROMISC |
6386                                 IFF_ALLMULTI |
6387                                 IFF_RUNNING |
6388                                 IFF_LOWER_UP |
6389                                 IFF_DORMANT)) |
6390                 (dev->gflags & (IFF_PROMISC |
6391                                 IFF_ALLMULTI));
6392
6393         if (netif_running(dev)) {
6394                 if (netif_oper_up(dev))
6395                         flags |= IFF_RUNNING;
6396                 if (netif_carrier_ok(dev))
6397                         flags |= IFF_LOWER_UP;
6398                 if (netif_dormant(dev))
6399                         flags |= IFF_DORMANT;
6400         }
6401
6402         return flags;
6403 }
6404 EXPORT_SYMBOL(dev_get_flags);
6405
6406 int __dev_change_flags(struct net_device *dev, unsigned int flags)
6407 {
6408         unsigned int old_flags = dev->flags;
6409         int ret;
6410
6411         ASSERT_RTNL();
6412
6413         /*
6414          *      Set the flags on our device.
6415          */
6416
6417         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
6418                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
6419                                IFF_AUTOMEDIA)) |
6420                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
6421                                     IFF_ALLMULTI));
6422
6423         /*
6424          *      Load in the correct multicast list now the flags have changed.
6425          */
6426
6427         if ((old_flags ^ flags) & IFF_MULTICAST)
6428                 dev_change_rx_flags(dev, IFF_MULTICAST);
6429
6430         dev_set_rx_mode(dev);
6431
6432         /*
6433          *      Have we downed the interface. We handle IFF_UP ourselves
6434          *      according to user attempts to set it, rather than blindly
6435          *      setting it.
6436          */
6437
6438         ret = 0;
6439         if ((old_flags ^ flags) & IFF_UP)
6440                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
6441
6442         if ((flags ^ dev->gflags) & IFF_PROMISC) {
6443                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
6444                 unsigned int old_flags = dev->flags;
6445
6446                 dev->gflags ^= IFF_PROMISC;
6447
6448                 if (__dev_set_promiscuity(dev, inc, false) >= 0)
6449                         if (dev->flags != old_flags)
6450                                 dev_set_rx_mode(dev);
6451         }
6452
6453         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
6454            is important. Some (broken) drivers set IFF_PROMISC, when
6455            IFF_ALLMULTI is requested not asking us and not reporting.
6456          */
6457         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
6458                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
6459
6460                 dev->gflags ^= IFF_ALLMULTI;
6461                 __dev_set_allmulti(dev, inc, false);
6462         }
6463
6464         return ret;
6465 }
6466
6467 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
6468                         unsigned int gchanges)
6469 {
6470         unsigned int changes = dev->flags ^ old_flags;
6471
6472         if (gchanges)
6473                 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
6474
6475         if (changes & IFF_UP) {
6476                 if (dev->flags & IFF_UP)
6477                         call_netdevice_notifiers(NETDEV_UP, dev);
6478                 else
6479                         call_netdevice_notifiers(NETDEV_DOWN, dev);
6480         }
6481
6482         if (dev->flags & IFF_UP &&
6483             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
6484                 struct netdev_notifier_change_info change_info;
6485
6486                 change_info.flags_changed = changes;
6487                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
6488                                               &change_info.info);
6489         }
6490 }
6491
6492 /**
6493  *      dev_change_flags - change device settings
6494  *      @dev: device
6495  *      @flags: device state flags
6496  *
6497  *      Change settings on device based state flags. The flags are
6498  *      in the userspace exported format.
6499  */
6500 int dev_change_flags(struct net_device *dev, unsigned int flags)
6501 {
6502         int ret;
6503         unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
6504
6505         ret = __dev_change_flags(dev, flags);
6506         if (ret < 0)
6507                 return ret;
6508
6509         changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
6510         __dev_notify_flags(dev, old_flags, changes);
6511         return ret;
6512 }
6513 EXPORT_SYMBOL(dev_change_flags);
6514
6515 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
6516 {
6517         const struct net_device_ops *ops = dev->netdev_ops;
6518
6519         if (ops->ndo_change_mtu)
6520                 return ops->ndo_change_mtu(dev, new_mtu);
6521
6522         dev->mtu = new_mtu;
6523         return 0;
6524 }
6525
6526 /**
6527  *      dev_set_mtu - Change maximum transfer unit
6528  *      @dev: device
6529  *      @new_mtu: new transfer unit
6530  *
6531  *      Change the maximum transfer size of the network device.
6532  */
6533 int dev_set_mtu(struct net_device *dev, int new_mtu)
6534 {
6535         int err, orig_mtu;
6536
6537         if (new_mtu == dev->mtu)
6538                 return 0;
6539
6540         /* MTU must be positive, and in range */
6541         if (new_mtu < 0 || new_mtu < dev->min_mtu) {
6542                 net_err_ratelimited("%s: Invalid MTU %d requested, hw min %d\n",
6543                                     dev->name, new_mtu, dev->min_mtu);
6544                 return -EINVAL;
6545         }
6546
6547         if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
6548                 net_err_ratelimited("%s: Invalid MTU %d requested, hw max %d\n",
6549                                     dev->name, new_mtu, dev->max_mtu);
6550                 return -EINVAL;
6551         }
6552
6553         if (!netif_device_present(dev))
6554                 return -ENODEV;
6555
6556         err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
6557         err = notifier_to_errno(err);
6558         if (err)
6559                 return err;
6560
6561         orig_mtu = dev->mtu;
6562         err = __dev_set_mtu(dev, new_mtu);
6563
6564         if (!err) {
6565                 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6566                 err = notifier_to_errno(err);
6567                 if (err) {
6568                         /* setting mtu back and notifying everyone again,
6569                          * so that they have a chance to revert changes.
6570                          */
6571                         __dev_set_mtu(dev, orig_mtu);
6572                         call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6573                 }
6574         }
6575         return err;
6576 }
6577 EXPORT_SYMBOL(dev_set_mtu);
6578
6579 /**
6580  *      dev_set_group - Change group this device belongs to
6581  *      @dev: device
6582  *      @new_group: group this device should belong to
6583  */
6584 void dev_set_group(struct net_device *dev, int new_group)
6585 {
6586         dev->group = new_group;
6587 }
6588 EXPORT_SYMBOL(dev_set_group);
6589
6590 /**
6591  *      dev_set_mac_address - Change Media Access Control Address
6592  *      @dev: device
6593  *      @sa: new address
6594  *
6595  *      Change the hardware (MAC) address of the device
6596  */
6597 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
6598 {
6599         const struct net_device_ops *ops = dev->netdev_ops;
6600         int err;
6601
6602         if (!ops->ndo_set_mac_address)
6603                 return -EOPNOTSUPP;
6604         if (sa->sa_family != dev->type)
6605                 return -EINVAL;
6606         if (!netif_device_present(dev))
6607                 return -ENODEV;
6608         err = ops->ndo_set_mac_address(dev, sa);
6609         if (err)
6610                 return err;
6611         dev->addr_assign_type = NET_ADDR_SET;
6612         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
6613         add_device_randomness(dev->dev_addr, dev->addr_len);
6614         return 0;
6615 }
6616 EXPORT_SYMBOL(dev_set_mac_address);
6617
6618 /**
6619  *      dev_change_carrier - Change device carrier
6620  *      @dev: device
6621  *      @new_carrier: new value
6622  *
6623  *      Change device carrier
6624  */
6625 int dev_change_carrier(struct net_device *dev, bool new_carrier)
6626 {
6627         const struct net_device_ops *ops = dev->netdev_ops;
6628
6629         if (!ops->ndo_change_carrier)
6630                 return -EOPNOTSUPP;
6631         if (!netif_device_present(dev))
6632                 return -ENODEV;
6633         return ops->ndo_change_carrier(dev, new_carrier);
6634 }
6635 EXPORT_SYMBOL(dev_change_carrier);
6636
6637 /**
6638  *      dev_get_phys_port_id - Get device physical port ID
6639  *      @dev: device
6640  *      @ppid: port ID
6641  *
6642  *      Get device physical port ID
6643  */
6644 int dev_get_phys_port_id(struct net_device *dev,
6645                          struct netdev_phys_item_id *ppid)
6646 {
6647         const struct net_device_ops *ops = dev->netdev_ops;
6648
6649         if (!ops->ndo_get_phys_port_id)
6650                 return -EOPNOTSUPP;
6651         return ops->ndo_get_phys_port_id(dev, ppid);
6652 }
6653 EXPORT_SYMBOL(dev_get_phys_port_id);
6654
6655 /**
6656  *      dev_get_phys_port_name - Get device physical port name
6657  *      @dev: device
6658  *      @name: port name
6659  *      @len: limit of bytes to copy to name
6660  *
6661  *      Get device physical port name
6662  */
6663 int dev_get_phys_port_name(struct net_device *dev,
6664                            char *name, size_t len)
6665 {
6666         const struct net_device_ops *ops = dev->netdev_ops;
6667
6668         if (!ops->ndo_get_phys_port_name)
6669                 return -EOPNOTSUPP;
6670         return ops->ndo_get_phys_port_name(dev, name, len);
6671 }
6672 EXPORT_SYMBOL(dev_get_phys_port_name);
6673
6674 /**
6675  *      dev_change_proto_down - update protocol port state information
6676  *      @dev: device
6677  *      @proto_down: new value
6678  *
6679  *      This info can be used by switch drivers to set the phys state of the
6680  *      port.
6681  */
6682 int dev_change_proto_down(struct net_device *dev, bool proto_down)
6683 {
6684         const struct net_device_ops *ops = dev->netdev_ops;
6685
6686         if (!ops->ndo_change_proto_down)
6687                 return -EOPNOTSUPP;
6688         if (!netif_device_present(dev))
6689                 return -ENODEV;
6690         return ops->ndo_change_proto_down(dev, proto_down);
6691 }
6692 EXPORT_SYMBOL(dev_change_proto_down);
6693
6694 /**
6695  *      dev_change_xdp_fd - set or clear a bpf program for a device rx path
6696  *      @dev: device
6697  *      @fd: new program fd or negative value to clear
6698  *      @flags: xdp-related flags
6699  *
6700  *      Set or clear a bpf program for a device
6701  */
6702 int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags)
6703 {
6704         const struct net_device_ops *ops = dev->netdev_ops;
6705         struct bpf_prog *prog = NULL;
6706         struct netdev_xdp xdp;
6707         int err;
6708
6709         ASSERT_RTNL();
6710
6711         if (!ops->ndo_xdp)
6712                 return -EOPNOTSUPP;
6713         if (fd >= 0) {
6714                 if (flags & XDP_FLAGS_UPDATE_IF_NOEXIST) {
6715                         memset(&xdp, 0, sizeof(xdp));
6716                         xdp.command = XDP_QUERY_PROG;
6717
6718                         err = ops->ndo_xdp(dev, &xdp);
6719                         if (err < 0)
6720                                 return err;
6721                         if (xdp.prog_attached)
6722                                 return -EBUSY;
6723                 }
6724
6725                 prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
6726                 if (IS_ERR(prog))
6727                         return PTR_ERR(prog);
6728         }
6729
6730         memset(&xdp, 0, sizeof(xdp));
6731         xdp.command = XDP_SETUP_PROG;
6732         xdp.prog = prog;
6733
6734         err = ops->ndo_xdp(dev, &xdp);
6735         if (err < 0 && prog)
6736                 bpf_prog_put(prog);
6737
6738         return err;
6739 }
6740 EXPORT_SYMBOL(dev_change_xdp_fd);
6741
6742 /**
6743  *      dev_new_index   -       allocate an ifindex
6744  *      @net: the applicable net namespace
6745  *
6746  *      Returns a suitable unique value for a new device interface
6747  *      number.  The caller must hold the rtnl semaphore or the
6748  *      dev_base_lock to be sure it remains unique.
6749  */
6750 static int dev_new_index(struct net *net)
6751 {
6752         int ifindex = net->ifindex;
6753         for (;;) {
6754                 if (++ifindex <= 0)
6755                         ifindex = 1;
6756                 if (!__dev_get_by_index(net, ifindex))
6757                         return net->ifindex = ifindex;
6758         }
6759 }
6760
/* Delayed registration/unregistration */
6762 static LIST_HEAD(net_todo_list);
6763 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
6764
6765 static void net_set_todo(struct net_device *dev)
6766 {
6767         list_add_tail(&dev->todo_list, &net_todo_list);
6768         dev_net(dev)->dev_unreg_count++;
6769 }
6770
6771 static void rollback_registered_many(struct list_head *head)
6772 {
6773         struct net_device *dev, *tmp;
6774         LIST_HEAD(close_head);
6775
6776         BUG_ON(dev_boot_phase);
6777         ASSERT_RTNL();
6778
6779         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
6780                 /* Some devices call without registering
6781                  * for initialization unwind. Remove those
6782                  * devices and proceed with the remaining.
6783                  */
6784                 if (dev->reg_state == NETREG_UNINITIALIZED) {
6785                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
6786                                  dev->name, dev);
6787
6788                         WARN_ON(1);
6789                         list_del(&dev->unreg_list);
6790                         continue;
6791                 }
6792                 dev->dismantle = true;
6793                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
6794         }
6795
6796         /* If device is running, close it first. */
6797         list_for_each_entry(dev, head, unreg_list)
6798                 list_add_tail(&dev->close_list, &close_head);
6799         dev_close_many(&close_head, true);
6800
6801         list_for_each_entry(dev, head, unreg_list) {
6802                 /* And unlink it from device chain. */
6803                 unlist_netdevice(dev);
6804
6805                 dev->reg_state = NETREG_UNREGISTERING;
6806         }
6807         flush_all_backlogs();
6808
6809         synchronize_net();
6810
6811         list_for_each_entry(dev, head, unreg_list) {
6812                 struct sk_buff *skb = NULL;
6813
6814                 /* Shutdown queueing discipline. */
6815                 dev_shutdown(dev);
6816
6817
6818                 /* Notify protocols, that we are about to destroy
6819                    this device. They should clean all the things.
6820                 */
6821                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6822
6823                 if (!dev->rtnl_link_ops ||
6824                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6825                         skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6826                                                      GFP_KERNEL);
6827
6828                 /*
6829                  *      Flush the unicast and multicast chains
6830                  */
6831                 dev_uc_flush(dev);
6832                 dev_mc_flush(dev);
6833
6834                 if (dev->netdev_ops->ndo_uninit)
6835                         dev->netdev_ops->ndo_uninit(dev);
6836
6837                 if (skb)
6838                         rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
6839
6840                 /* Notifier chain MUST detach us all upper devices. */
6841                 WARN_ON(netdev_has_any_upper_dev(dev));
6842                 WARN_ON(netdev_has_any_lower_dev(dev));
6843
6844                 /* Remove entries from kobject tree */
6845                 netdev_unregister_kobject(dev);
6846 #ifdef CONFIG_XPS
6847                 /* Remove XPS queueing entries */
6848                 netif_reset_xps_queues_gt(dev, 0);
6849 #endif
6850         }
6851
6852         synchronize_net();
6853
6854         list_for_each_entry(dev, head, unreg_list)
6855                 dev_put(dev);
6856 }
6857
6858 static void rollback_registered(struct net_device *dev)
6859 {
6860         LIST_HEAD(single);
6861
6862         list_add(&dev->unreg_list, &single);
6863         rollback_registered_many(&single);
6864         list_del(&single);
6865 }
6866
6867 static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
6868         struct net_device *upper, netdev_features_t features)
6869 {
6870         netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6871         netdev_features_t feature;
6872         int feature_bit;
6873
6874         for_each_netdev_feature(&upper_disables, feature_bit) {
6875                 feature = __NETIF_F_BIT(feature_bit);
6876                 if (!(upper->wanted_features & feature)
6877                     && (features & feature)) {
6878                         netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
6879                                    &feature, upper->name);
6880                         features &= ~feature;
6881                 }
6882         }
6883
6884         return features;
6885 }
6886
6887 static void netdev_sync_lower_features(struct net_device *upper,
6888         struct net_device *lower, netdev_features_t features)
6889 {
6890         netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6891         netdev_features_t feature;
6892         int feature_bit;
6893
6894         for_each_netdev_feature(&upper_disables, feature_bit) {
6895                 feature = __NETIF_F_BIT(feature_bit);
6896                 if (!(features & feature) && (lower->features & feature)) {
6897                         netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
6898                                    &feature, lower->name);
6899                         lower->wanted_features &= ~feature;
6900                         netdev_update_features(lower);
6901
6902                         if (unlikely(lower->features & feature))
6903                                 netdev_WARN(upper, "failed to disable %pNF on %s!\n",
6904                                             &feature, lower->name);
6905                 }
6906         }
6907 }
6908
6909 static netdev_features_t netdev_fix_features(struct net_device *dev,
6910         netdev_features_t features)
6911 {
6912         /* Fix illegal checksum combinations */
6913         if ((features & NETIF_F_HW_CSUM) &&
6914             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6915                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
6916                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6917         }
6918
6919         /* TSO requires that SG is present as well. */
6920         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6921                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
6922                 features &= ~NETIF_F_ALL_TSO;
6923         }
6924
6925         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6926                                         !(features & NETIF_F_IP_CSUM)) {
6927                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6928                 features &= ~NETIF_F_TSO;
6929                 features &= ~NETIF_F_TSO_ECN;
6930         }
6931
6932         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6933                                          !(features & NETIF_F_IPV6_CSUM)) {
6934                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6935                 features &= ~NETIF_F_TSO6;
6936         }
6937
6938         /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
6939         if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
6940                 features &= ~NETIF_F_TSO_MANGLEID;
6941
6942         /* TSO ECN requires that TSO is present as well. */
6943         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6944                 features &= ~NETIF_F_TSO_ECN;
6945
6946         /* Software GSO depends on SG. */
6947         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6948                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6949                 features &= ~NETIF_F_GSO;
6950         }
6951
6952         /* UFO needs SG and checksumming */
6953         if (features & NETIF_F_UFO) {
6954                 /* maybe split UFO into V4 and V6? */
6955                 if (!(features & NETIF_F_HW_CSUM) &&
6956                     ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) !=
6957                      (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) {
6958                         netdev_dbg(dev,
6959                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
6960                         features &= ~NETIF_F_UFO;
6961                 }
6962
6963                 if (!(features & NETIF_F_SG)) {
6964                         netdev_dbg(dev,
6965                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6966                         features &= ~NETIF_F_UFO;
6967                 }
6968         }
6969
6970         /* GSO partial features require GSO partial be set */
6971         if ((features & dev->gso_partial_features) &&
6972             !(features & NETIF_F_GSO_PARTIAL)) {
6973                 netdev_dbg(dev,
6974                            "Dropping partially supported GSO features since no GSO partial.\n");
6975                 features &= ~dev->gso_partial_features;
6976         }
6977
6978 #ifdef CONFIG_NET_RX_BUSY_POLL
6979         if (dev->netdev_ops->ndo_busy_poll)
6980                 features |= NETIF_F_BUSY_POLL;
6981         else
6982 #endif
6983                 features &= ~NETIF_F_BUSY_POLL;
6984
6985         return features;
6986 }
6987
6988 int __netdev_update_features(struct net_device *dev)
6989 {
6990         struct net_device *upper, *lower;
6991         netdev_features_t features;
6992         struct list_head *iter;
6993         int err = -1;
6994
6995         ASSERT_RTNL();
6996
6997         features = netdev_get_wanted_features(dev);
6998
6999         if (dev->netdev_ops->ndo_fix_features)
7000                 features = dev->netdev_ops->ndo_fix_features(dev, features);
7001
7002         /* driver might be less strict about feature dependencies */
7003         features = netdev_fix_features(dev, features);
7004
7005         /* some features can't be enabled if they're off an an upper device */
7006         netdev_for_each_upper_dev_rcu(dev, upper, iter)
7007                 features = netdev_sync_upper_features(dev, upper, features);
7008
7009         if (dev->features == features)
7010                 goto sync_lower;
7011
7012         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
7013                 &dev->features, &features);
7014
7015         if (dev->netdev_ops->ndo_set_features)
7016                 err = dev->netdev_ops->ndo_set_features(dev, features);
7017         else
7018                 err = 0;
7019
7020         if (unlikely(err < 0)) {
7021                 netdev_err(dev,
7022                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
7023                         err, &features, &dev->features);
7024                 /* return non-0 since some features might have changed and
7025                  * it's better to fire a spurious notification than miss it
7026                  */
7027                 return -1;
7028         }
7029
7030 sync_lower:
7031         /* some features must be disabled on lower devices when disabled
7032          * on an upper device (think: bonding master or bridge)
7033          */
7034         netdev_for_each_lower_dev(dev, lower, iter)
7035                 netdev_sync_lower_features(dev, lower, features);
7036
7037         if (!err)
7038                 dev->features = features;
7039
7040         return err < 0 ? 0 : 1;
7041 }
7042
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recompute the dev->features set and, when the recomputation
 *	reports a change (non-zero result), send a features-changed
 *	notification. Intended to be called after driver or hardware
 *	dependent conditions that influence the features may have changed.
 */
void netdev_update_features(struct net_device *dev)
{
	if (!__netdev_update_features(dev))
		return;

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
7057
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recompute the dev->features set and send the notification
 *	unconditionally, i.e. even when nothing changed. Use this instead
 *	of netdev_update_features() when dev->vlan_features might also
 *	have changed, so that the change can be propagated to stacked
 *	VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* result deliberately ignored: the notification is always sent */
	__netdev_update_features(dev);

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
7074
7075 /**
7076  *      netif_stacked_transfer_operstate -      transfer operstate
7077  *      @rootdev: the root or lower level device to transfer state from
7078  *      @dev: the device to transfer operstate to
7079  *
7080  *      Transfer operational state from root to device. This is normally
7081  *      called when a stacking relationship exists between the root
7082  *      device and the device(a leaf device).
7083  */
7084 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
7085                                         struct net_device *dev)
7086 {
7087         if (rootdev->operstate == IF_OPER_DORMANT)
7088                 netif_dormant_on(dev);
7089         else
7090                 netif_dormant_off(dev);
7091
7092         if (netif_carrier_ok(rootdev)) {
7093                 if (!netif_carrier_ok(dev))
7094                         netif_carrier_on(dev);
7095         } else {
7096                 if (netif_carrier_ok(dev))
7097                         netif_carrier_off(dev);
7098         }
7099 }
7100 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
7101
#ifdef CONFIG_SYSFS
/* Allocate and initialise the RX queue array of @dev.
 *
 * dev->num_rx_queues zeroed entries are allocated, first with kzalloc()
 * and, if that fails, with vzalloc(). Each entry gets a back pointer to
 * @dev and the array is stored in dev->_rx.
 *
 * Returns 0 on success or -ENOMEM if both allocators fail.
 * A queue count of zero is a BUG.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	unsigned int nqueues = dev->num_rx_queues;
	struct netdev_rx_queue *rx;
	size_t bytes = nqueues * sizeof(*rx);
	unsigned int q;

	BUG_ON(nqueues < 1);

	rx = kzalloc(bytes, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
	if (!rx)
		rx = vzalloc(bytes);	/* fall back to vmalloc space */
	if (!rx)
		return -ENOMEM;

	dev->_rx = rx;

	for (q = 0; q < nqueues; q++)
		rx[q].dev = dev;

	return 0;
}
#endif
7124
7125 static void netdev_init_one_queue(struct net_device *dev,
7126                                   struct netdev_queue *queue, void *_unused)
7127 {
7128         /* Initialize queue lock */
7129         spin_lock_init(&queue->_xmit_lock);
7130         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
7131         queue->xmit_lock_owner = -1;
7132         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
7133         queue->dev = dev;
7134 #ifdef CONFIG_BQL
7135         dql_init(&queue->dql, HZ);
7136 #endif
7137 }
7138
7139 static void netif_free_tx_queues(struct net_device *dev)
7140 {
7141         kvfree(dev->_tx);
7142 }
7143
7144 static int netif_alloc_netdev_queues(struct net_device *dev)
7145 {
7146         unsigned int count = dev->num_tx_queues;
7147         struct netdev_queue *tx;
7148         size_t sz = count * sizeof(*tx);
7149
7150         if (count < 1 || count > 0xffff)
7151                 return -EINVAL;
7152
7153         tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7154         if (!tx) {
7155                 tx = vzalloc(sz);
7156                 if (!tx)
7157                         return -ENOMEM;
7158         }
7159         dev->_tx = tx;
7160
7161         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
7162         spin_lock_init(&dev->tx_global_lock);
7163
7164         return 0;
7165 }
7166
7167 void netif_tx_stop_all_queues(struct net_device *dev)
7168 {
7169         unsigned int i;
7170
7171         for (i = 0; i < dev->num_tx_queues; i++) {
7172                 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
7173                 netif_tx_stop_queue(txq);
7174         }
7175 }
7176 EXPORT_SYMBOL(netif_tx_stop_all_queues);
7177
7178 /**
7179  *      register_netdevice      - register a network device
7180  *      @dev: device to register
7181  *
7182  *      Take a completed network device structure and add it to the kernel
7183  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
7184  *      chain. 0 is returned on success. A negative errno code is returned
7185  *      on a failure to set up the device, or if the name is a duplicate.
7186  *
7187  *      Callers must hold the rtnl semaphore. You may want
7188  *      register_netdev() instead of this.
7189  *
7190  *      BUGS:
7191  *      The locking appears insufficient to guarantee two parallel registers
7192  *      will not get the same name.
7193  */
7194
7195 int register_netdevice(struct net_device *dev)
7196 {
7197         int ret;
7198         struct net *net = dev_net(dev);
7199
7200         BUG_ON(dev_boot_phase);
7201         ASSERT_RTNL();
7202
7203         might_sleep();
7204
7205         /* When net_device's are persistent, this will be fatal. */
7206         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
7207         BUG_ON(!net);
7208
7209         spin_lock_init(&dev->addr_list_lock);
7210         netdev_set_addr_lockdep_class(dev);
7211
7212         ret = dev_get_valid_name(net, dev, dev->name);
7213         if (ret < 0)
7214                 goto out;
7215
7216         /* Init, if this function is available */
7217         if (dev->netdev_ops->ndo_init) {
7218                 ret = dev->netdev_ops->ndo_init(dev);
7219                 if (ret) {
7220                         if (ret > 0)
7221                                 ret = -EIO;
7222                         goto out;
7223                 }
7224         }
7225
7226         if (((dev->hw_features | dev->features) &
7227              NETIF_F_HW_VLAN_CTAG_FILTER) &&
7228             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
7229              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
7230                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
7231                 ret = -EINVAL;
7232                 goto err_uninit;
7233         }
7234
7235         ret = -EBUSY;
7236         if (!dev->ifindex)
7237                 dev->ifindex = dev_new_index(net);
7238         else if (__dev_get_by_index(net, dev->ifindex))
7239                 goto err_uninit;
7240
7241         /* Transfer changeable features to wanted_features and enable
7242          * software offloads (GSO and GRO).
7243          */
7244         dev->hw_features |= NETIF_F_SOFT_FEATURES;
7245         dev->features |= NETIF_F_SOFT_FEATURES;
7246         dev->wanted_features = dev->features & dev->hw_features;
7247
7248         if (!(dev->flags & IFF_LOOPBACK))
7249                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
7250
7251         /* If IPv4 TCP segmentation offload is supported we should also
7252          * allow the device to enable segmenting the frame with the option
7253          * of ignoring a static IP ID value.  This doesn't enable the
7254          * feature itself but allows the user to enable it later.
7255          */
7256         if (dev->hw_features & NETIF_F_TSO)
7257                 dev->hw_features |= NETIF_F_TSO_MANGLEID;
7258         if (dev->vlan_features & NETIF_F_TSO)
7259                 dev->vlan_features |= NETIF_F_TSO_MANGLEID;
7260         if (dev->mpls_features & NETIF_F_TSO)
7261                 dev->mpls_features |= NETIF_F_TSO_MANGLEID;
7262         if (dev->hw_enc_features & NETIF_F_TSO)
7263                 dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
7264
7265         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
7266          */
7267         dev->vlan_features |= NETIF_F_HIGHDMA;
7268
7269         /* Make NETIF_F_SG inheritable to tunnel devices.
7270          */
7271         dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
7272
7273         /* Make NETIF_F_SG inheritable to MPLS.
7274          */
7275         dev->mpls_features |= NETIF_F_SG;
7276
7277         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
7278         ret = notifier_to_errno(ret);
7279         if (ret)
7280                 goto err_uninit;
7281
7282         ret = netdev_register_kobject(dev);
7283         if (ret)
7284                 goto err_uninit;
7285         dev->reg_state = NETREG_REGISTERED;
7286
7287         __netdev_update_features(dev);
7288
7289         /*
7290          *      Default initial state at registry is that the
7291          *      device is present.
7292          */
7293
7294         set_bit(__LINK_STATE_PRESENT, &dev->state);
7295
7296         linkwatch_init_dev(dev);
7297
7298         dev_init_scheduler(dev);
7299         dev_hold(dev);
7300         list_netdevice(dev);
7301         add_device_randomness(dev->dev_addr, dev->addr_len);
7302
7303         /* If the device has permanent device address, driver should
7304          * set dev_addr and also addr_assign_type should be set to
7305          * NET_ADDR_PERM (default value).
7306          */
7307         if (dev->addr_assign_type == NET_ADDR_PERM)
7308                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
7309
7310         /* Notify protocols, that a new device appeared. */
7311         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
7312         ret = notifier_to_errno(ret);
7313         if (ret) {
7314                 rollback_registered(dev);
7315                 dev->reg_state = NETREG_UNREGISTERED;
7316         }
7317         /*
7318          *      Prevent userspace races by waiting until the network
7319          *      device is fully setup before sending notifications.
7320          */
7321         if (!dev->rtnl_link_ops ||
7322             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
7323                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7324
7325 out:
7326         return ret;
7327
7328 err_uninit:
7329         if (dev->netdev_ops->ndo_uninit)
7330                 dev->netdev_ops->ndo_uninit(dev);
7331         goto out;
7332 }
7333 EXPORT_SYMBOL(register_netdevice);
7334
7335 /**
7336  *      init_dummy_netdev       - init a dummy network device for NAPI
7337  *      @dev: device to init
7338  *
7339  *      This takes a network device structure and initialize the minimum
7340  *      amount of fields so it can be used to schedule NAPI polls without
7341  *      registering a full blown interface. This is to be used by drivers
7342  *      that need to tie several hardware interfaces to a single NAPI
7343  *      poll scheduler due to HW limitations.
7344  */
7345 int init_dummy_netdev(struct net_device *dev)
7346 {
7347         /* Clear everything. Note we don't initialize spinlocks
7348          * are they aren't supposed to be taken by any of the
7349          * NAPI code and this dummy netdev is supposed to be
7350          * only ever used for NAPI polls
7351          */
7352         memset(dev, 0, sizeof(struct net_device));
7353
7354         /* make sure we BUG if trying to hit standard
7355          * register/unregister code path
7356          */
7357         dev->reg_state = NETREG_DUMMY;
7358
7359         /* NAPI wants this */
7360         INIT_LIST_HEAD(&dev->napi_list);
7361
7362         /* a dummy interface is started by default */
7363         set_bit(__LINK_STATE_PRESENT, &dev->state);
7364         set_bit(__LINK_STATE_START, &dev->state);
7365
7366         /* Note : We dont allocate pcpu_refcnt for dummy devices,
7367          * because users of this 'device' dont need to change
7368          * its refcount.
7369          */
7370
7371         return 0;
7372 }
7373 EXPORT_SYMBOL_GPL(init_dummy_netdev);
7374
7375
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	This is a wrapper around register_netdevice() that takes the rtnl
 *	semaphore for the duration of the call; the device name is expanded
 *	if a format string was passed to alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
	int ret;

	rtnl_lock();
	ret = register_netdevice(dev);
	rtnl_unlock();

	return ret;
}
EXPORT_SYMBOL(register_netdev);
7399
7400 int netdev_refcnt_read(const struct net_device *dev)
7401 {
7402         int i, refcnt = 0;
7403
7404         for_each_possible_cpu(i)
7405                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
7406         return refcnt;
7407 }
7408 EXPORT_SYMBOL(netdev_refcnt_read);
7409
7410 /**
7411  * netdev_wait_allrefs - wait until all references are gone.
7412  * @dev: target net_device
7413  *
7414  * This is called when unregistering network devices.
7415  *
7416  * Any protocol or device that holds a reference should register
7417  * for netdevice notification, and cleanup and put back the
7418  * reference if they receive an UNREGISTER event.
7419  * We can get stuck here if buggy protocols don't correctly
7420  * call dev_put.
7421  */
7422 static void netdev_wait_allrefs(struct net_device *dev)
7423 {
7424         unsigned long rebroadcast_time, warning_time;
7425         int refcnt;
7426
7427         linkwatch_forget_dev(dev);
7428
7429         rebroadcast_time = warning_time = jiffies;
7430         refcnt = netdev_refcnt_read(dev);
7431
7432         while (refcnt != 0) {
7433                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
7434                         rtnl_lock();
7435
7436                         /* Rebroadcast unregister notification */
7437                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7438
7439                         __rtnl_unlock();
7440                         rcu_barrier();
7441                         rtnl_lock();
7442
7443                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7444                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
7445                                      &dev->state)) {
7446                                 /* We must not have linkwatch events
7447                                  * pending on unregister. If this
7448                                  * happens, we simply run the queue
7449                                  * unscheduled, resulting in a noop
7450                                  * for this device.
7451                                  */
7452                                 linkwatch_run_queue();
7453                         }
7454
7455                         __rtnl_unlock();
7456
7457                         rebroadcast_time = jiffies;
7458                 }
7459
7460                 msleep(250);
7461
7462                 refcnt = netdev_refcnt_read(dev);
7463
7464                 if (time_after(jiffies, warning_time + 10 * HZ)) {
7465                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
7466                                  dev->name, refcnt);
7467                         warning_time = jiffies;
7468                 }
7469         }
7470 }
7471
7472 /* The sequence is:
7473  *
7474  *      rtnl_lock();
7475  *      ...
7476  *      register_netdevice(x1);
7477  *      register_netdevice(x2);
7478  *      ...
7479  *      unregister_netdevice(y1);
7480  *      unregister_netdevice(y2);
7481  *      ...
7482  *      rtnl_unlock();
7483  *      free_netdev(y1);
7484  *      free_netdev(y2);
7485  *
7486  * We are invoked by rtnl_unlock().
7487  * This allows us to deal with problems:
7488  * 1) We can delete sysfs objects which invoke hotplug
7489  *    without deadlocking with linkwatch via keventd.
7490  * 2) Since we run with the RTNL semaphore not held, we can sleep
7491  *    safely in order to wait for the netdev refcnt to drop to zero.
7492  *
7493  * We must not return until all unregister events added during
7494  * the interval the lock was held have been completed.
7495  */
7496 void netdev_run_todo(void)
7497 {
7498         struct list_head list;
7499
7500         /* Snapshot list, allow later requests */
7501         list_replace_init(&net_todo_list, &list);
7502
7503         __rtnl_unlock();
7504
7505
7506         /* Wait for rcu callbacks to finish before next phase */
7507         if (!list_empty(&list))
7508                 rcu_barrier();
7509
7510         while (!list_empty(&list)) {
7511                 struct net_device *dev
7512                         = list_first_entry(&list, struct net_device, todo_list);
7513                 list_del(&dev->todo_list);
7514
7515                 rtnl_lock();
7516                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7517                 __rtnl_unlock();
7518
7519                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
7520                         pr_err("network todo '%s' but state %d\n",
7521                                dev->name, dev->reg_state);
7522                         dump_stack();
7523                         continue;
7524                 }
7525
7526                 dev->reg_state = NETREG_UNREGISTERED;
7527
7528                 netdev_wait_allrefs(dev);
7529
7530                 /* paranoia */
7531                 BUG_ON(netdev_refcnt_read(dev));
7532                 BUG_ON(!list_empty(&dev->ptype_all));
7533                 BUG_ON(!list_empty(&dev->ptype_specific));
7534                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
7535                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
7536                 WARN_ON(dev->dn_ptr);
7537
7538                 if (dev->destructor)
7539                         dev->destructor(dev);
7540
7541                 /* Report a network device has been unregistered */
7542                 rtnl_lock();
7543                 dev_net(dev)->dev_unreg_count--;
7544                 __rtnl_unlock();
7545                 wake_up(&netdev_unregistering_wq);
7546
7547                 /* Free network device */
7548                 kobject_put(&dev->dev.kobj);
7549         }
7550 }
7551
7552 /* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
7553  * all the same fields in the same order as net_device_stats, with only
7554  * the type differing, but rtnl_link_stats64 may have additional fields
7555  * at the end for newer counters.
7556  */
7557 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
7558                              const struct net_device_stats *netdev_stats)
7559 {
7560 #if BITS_PER_LONG == 64
7561         BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
7562         memcpy(stats64, netdev_stats, sizeof(*stats64));
7563         /* zero out counters that only exist in rtnl_link_stats64 */
7564         memset((char *)stats64 + sizeof(*netdev_stats), 0,
7565                sizeof(*stats64) - sizeof(*netdev_stats));
7566 #else
7567         size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
7568         const unsigned long *src = (const unsigned long *)netdev_stats;
7569         u64 *dst = (u64 *)stats64;
7570
7571         BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
7572         for (i = 0; i < n; i++)
7573                 dst[i] = src[i];
7574         /* zero out counters that only exist in rtnl_link_stats64 */
7575         memset((char *)stats64 + n * sizeof(u64), 0,
7576                sizeof(*stats64) - n * sizeof(u64));
7577 #endif
7578 }
7579 EXPORT_SYMBOL(netdev_stats_to_stats64);
7580
7581 /**
7582  *      dev_get_stats   - get network device statistics
7583  *      @dev: device to get statistics from
7584  *      @storage: place to store stats
7585  *
7586  *      Get network statistics from device. Return @storage.
7587  *      The device driver may provide its own method by setting
7588  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
7589  *      otherwise the internal statistics structure is used.
7590  */
7591 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
7592                                         struct rtnl_link_stats64 *storage)
7593 {
7594         const struct net_device_ops *ops = dev->netdev_ops;
7595
7596         if (ops->ndo_get_stats64) {
7597                 memset(storage, 0, sizeof(*storage));
7598                 ops->ndo_get_stats64(dev, storage);
7599         } else if (ops->ndo_get_stats) {
7600                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
7601         } else {
7602                 netdev_stats_to_stats64(storage, &dev->stats);
7603         }
7604         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
7605         storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
7606         storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler);
7607         return storage;
7608 }
7609 EXPORT_SYMBOL(dev_get_stats);
7610
/* Return the ingress queue of @dev, creating it on first use.
 *
 * With CONFIG_NET_CLS_ACT, a missing queue is allocated, initialised via
 * netdev_init_one_queue(), attached to noop_qdisc and published in
 * dev->ingress_queue; NULL is returned if the allocation fails. Without
 * that option, whatever dev_ingress_queue() reports is returned as is.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!queue) {
		queue = kzalloc(sizeof(*queue), GFP_KERNEL);
		if (!queue)
			return NULL;

		netdev_init_one_queue(dev, queue, NULL);
		RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
		queue->qdisc_sleeping = &noop_qdisc;
		/* publish only once the queue is fully set up */
		rcu_assign_pointer(dev->ingress_queue, queue);
	}
#endif
	return queue;
}
7628
7629 static const struct ethtool_ops default_ethtool_ops;
7630
7631 void netdev_set_default_ethtool_ops(struct net_device *dev,
7632                                     const struct ethtool_ops *ops)
7633 {
7634         if (dev->ethtool_ops == &default_ethtool_ops)
7635                 dev->ethtool_ops = ops;
7636 }
7637 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
7638
7639 void netdev_freemem(struct net_device *dev)
7640 {
7641         char *addr = (char *)dev - dev->padded;
7642
7643         kvfree(addr);
7644 }
7645
7646 /**
7647  *      alloc_netdev_mqs - allocate network device
7648  *      @sizeof_priv:           size of private data to allocate space for
7649  *      @name:                  device name format string
7650  *      @name_assign_type:      origin of device name
7651  *      @setup:                 callback to initialize device
7652  *      @txqs:                  the number of TX subqueues to allocate
7653  *      @rxqs:                  the number of RX subqueues to allocate
7654  *
7655  *      Allocates a struct net_device with private data area for driver use
7656  *      and performs basic initialization.  Also allocates subqueue structs
7657  *      for each queue on the device.
7658  */
7659 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
7660                 unsigned char name_assign_type,
7661                 void (*setup)(struct net_device *),
7662                 unsigned int txqs, unsigned int rxqs)
7663 {
7664         struct net_device *dev;
7665         size_t alloc_size;
7666         struct net_device *p;
7667
7668         BUG_ON(strlen(name) >= sizeof(dev->name));
7669
7670         if (txqs < 1) {
7671                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
7672                 return NULL;
7673         }
7674
7675 #ifdef CONFIG_SYSFS
7676         if (rxqs < 1) {
7677                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
7678                 return NULL;
7679         }
7680 #endif
7681
7682         alloc_size = sizeof(struct net_device);
7683         if (sizeof_priv) {
7684                 /* ensure 32-byte alignment of private area */
7685                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
7686                 alloc_size += sizeof_priv;
7687         }
7688         /* ensure 32-byte alignment of whole construct */
7689         alloc_size += NETDEV_ALIGN - 1;
7690
7691         p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7692         if (!p)
7693                 p = vzalloc(alloc_size);
7694         if (!p)
7695                 return NULL;
7696
7697         dev = PTR_ALIGN(p, NETDEV_ALIGN);
7698         dev->padded = (char *)dev - (char *)p;
7699
7700         dev->pcpu_refcnt = alloc_percpu(int);
7701         if (!dev->pcpu_refcnt)
7702                 goto free_dev;
7703
7704         if (dev_addr_init(dev))
7705                 goto free_pcpu;
7706
7707         dev_mc_init(dev);
7708         dev_uc_init(dev);
7709
7710         dev_net_set(dev, &init_net);
7711
7712         dev->gso_max_size = GSO_MAX_SIZE;
7713         dev->gso_max_segs = GSO_MAX_SEGS;
7714
7715         INIT_LIST_HEAD(&dev->napi_list);
7716         INIT_LIST_HEAD(&dev->unreg_list);
7717         INIT_LIST_HEAD(&dev->close_list);
7718         INIT_LIST_HEAD(&dev->link_watch_list);
7719         INIT_LIST_HEAD(&dev->adj_list.upper);
7720         INIT_LIST_HEAD(&dev->adj_list.lower);
7721         INIT_LIST_HEAD(&dev->ptype_all);
7722         INIT_LIST_HEAD(&dev->ptype_specific);
7723 #ifdef CONFIG_NET_SCHED
7724         hash_init(dev->qdisc_hash);
7725 #endif
7726         dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
7727         setup(dev);
7728
7729         if (!dev->tx_queue_len) {
7730                 dev->priv_flags |= IFF_NO_QUEUE;
7731                 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
7732         }
7733
7734         dev->num_tx_queues = txqs;
7735         dev->real_num_tx_queues = txqs;
7736         if (netif_alloc_netdev_queues(dev))
7737                 goto free_all;
7738
7739 #ifdef CONFIG_SYSFS
7740         dev->num_rx_queues = rxqs;
7741         dev->real_num_rx_queues = rxqs;
7742         if (netif_alloc_rx_queues(dev))
7743                 goto free_all;
7744 #endif
7745
7746         strcpy(dev->name, name);
7747         dev->name_assign_type = name_assign_type;
7748         dev->group = INIT_NETDEV_GROUP;
7749         if (!dev->ethtool_ops)
7750                 dev->ethtool_ops = &default_ethtool_ops;
7751
7752         nf_hook_ingress_init(dev);
7753
7754         return dev;
7755
7756 free_all:
7757         free_netdev(dev);
7758         return NULL;
7759
7760 free_pcpu:
7761         free_percpu(dev->pcpu_refcnt);
7762 free_dev:
7763         netdev_freemem(dev);
7764         return NULL;
7765 }
7766 EXPORT_SYMBOL(alloc_netdev_mqs);
7767
7768 /**
7769  *      free_netdev - free network device
7770  *      @dev: device
7771  *
7772  *      This function does the last stage of destroying an allocated device
7773  *      interface. The reference to the device object is released.
7774  *      If this is the last reference then it will be freed.
7775  *      Must be called in process context.
7776  */
7777 void free_netdev(struct net_device *dev)
7778 {
7779         struct napi_struct *p, *n;
7780
7781         might_sleep();
7782         netif_free_tx_queues(dev);
7783 #ifdef CONFIG_SYSFS
7784         kvfree(dev->_rx);
7785 #endif
7786
7787         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
7788
7789         /* Flush device addresses */
7790         dev_addr_flush(dev);
7791
7792         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
7793                 netif_napi_del(p);
7794
7795         free_percpu(dev->pcpu_refcnt);
7796         dev->pcpu_refcnt = NULL;
7797
7798         /*  Compatibility with error handling in drivers */
7799         if (dev->reg_state == NETREG_UNINITIALIZED) {
7800                 netdev_freemem(dev);
7801                 return;
7802         }
7803
7804         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
7805         dev->reg_state = NETREG_RELEASED;
7806
7807         /* will free via device release */
7808         put_device(&dev->dev);
7809 }
7810 EXPORT_SYMBOL(free_netdev);
7811
/**
 *	synchronize_net - wait for in-flight packet receive processing
 *
 *	Returns once the packets that were being received when the call was
 *	made are done.  Packets that start being received later are not
 *	waited for.
 *
 *	An expedited RCU grace period is used when the RTNL lock is held,
 *	a normal one otherwise.  May sleep.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
7827
7828 /**
7829  *      unregister_netdevice_queue - remove device from the kernel
7830  *      @dev: device
7831  *      @head: list
7832  *
7833  *      This function shuts down a device interface and removes it
7834  *      from the kernel tables.
7835  *      If head not NULL, device is queued to be unregistered later.
7836  *
7837  *      Callers must hold the rtnl semaphore.  You may want
7838  *      unregister_netdev() instead of this.
7839  */
7840
7841 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
7842 {
7843         ASSERT_RTNL();
7844
7845         if (head) {
7846                 list_move_tail(&dev->unreg_list, head);
7847         } else {
7848                 rollback_registered(dev);
7849                 /* Finish processing unregister after unlock */
7850                 net_set_todo(dev);
7851         }
7852 }
7853 EXPORT_SYMBOL(unregister_netdevice_queue);
7854
7855 /**
7856  *      unregister_netdevice_many - unregister many devices
7857  *      @head: list of devices
7858  *
7859  *  Note: As most callers use a stack allocated list_head,
7860  *  we force a list_del() to make sure stack wont be corrupted later.
7861  */
7862 void unregister_netdevice_many(struct list_head *head)
7863 {
7864         struct net_device *dev;
7865
7866         if (!list_empty(head)) {
7867                 rollback_registered_many(head);
7868                 list_for_each_entry(dev, head, unreg_list)
7869                         net_set_todo(dev);
7870                 list_del(head);
7871         }
7872 }
7873 EXPORT_SYMBOL(unregister_netdevice_many);
7874
/**
 *	unregister_netdev - remove a device from the kernel
 *	@dev: device to remove
 *
 *	Shuts down a device interface and removes it from the kernel tables.
 *
 *	Thin wrapper that takes the rtnl semaphore around
 *	unregister_netdevice().  In general this is the function to use,
 *	rather than unregister_netdevice() itself.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();

	unregister_netdevice(dev);

	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
7893
7894 /**
7895  *      dev_change_net_namespace - move device to different nethost namespace
7896  *      @dev: device
7897  *      @net: network namespace
7898  *      @pat: If not NULL name pattern to try if the current device name
7899  *            is already taken in the destination network namespace.
7900  *
7901  *      This function shuts down a device interface and moves it
7902  *      to a new network namespace. On success 0 is returned, on
7903  *      a failure a netagive errno code is returned.
7904  *
7905  *      Callers must hold the rtnl semaphore.
7906  */
7907
7908 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
7909 {
7910         int err;
7911
7912         ASSERT_RTNL();
7913
7914         /* Don't allow namespace local devices to be moved. */
7915         err = -EINVAL;
7916         if (dev->features & NETIF_F_NETNS_LOCAL)
7917                 goto out;
7918
7919         /* Ensure the device has been registrered */
7920         if (dev->reg_state != NETREG_REGISTERED)
7921                 goto out;
7922
7923         /* Get out if there is nothing todo */
7924         err = 0;
7925         if (net_eq(dev_net(dev), net))
7926                 goto out;
7927
7928         /* Pick the destination device name, and ensure
7929          * we can use it in the destination network namespace.
7930          */
7931         err = -EEXIST;
7932         if (__dev_get_by_name(net, dev->name)) {
7933                 /* We get here if we can't use the current device name */
7934                 if (!pat)
7935                         goto out;
7936                 if (dev_get_valid_name(net, dev, pat) < 0)
7937                         goto out;
7938         }
7939
7940         /*
7941          * And now a mini version of register_netdevice unregister_netdevice.
7942          */
7943
7944         /* If device is running close it first. */
7945         dev_close(dev);
7946
7947         /* And unlink it from device chain */
7948         err = -ENODEV;
7949         unlist_netdevice(dev);
7950
7951         synchronize_net();
7952
7953         /* Shutdown queueing discipline. */
7954         dev_shutdown(dev);
7955
7956         /* Notify protocols, that we are about to destroy
7957            this device. They should clean all the things.
7958
7959            Note that dev->reg_state stays at NETREG_REGISTERED.
7960            This is wanted because this way 8021q and macvlan know
7961            the device is just moving and can keep their slaves up.
7962         */
7963         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7964         rcu_barrier();
7965         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7966         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
7967
7968         /*
7969          *      Flush the unicast and multicast chains
7970          */
7971         dev_uc_flush(dev);
7972         dev_mc_flush(dev);
7973
7974         /* Send a netdev-removed uevent to the old namespace */
7975         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
7976         netdev_adjacent_del_links(dev);
7977
7978         /* Actually switch the network namespace */
7979         dev_net_set(dev, net);
7980
7981         /* If there is an ifindex conflict assign a new one */
7982         if (__dev_get_by_index(net, dev->ifindex))
7983                 dev->ifindex = dev_new_index(net);
7984
7985         /* Send a netdev-add uevent to the new namespace */
7986         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
7987         netdev_adjacent_add_links(dev);
7988
7989         /* Fixup kobjects */
7990         err = device_rename(&dev->dev, dev->name);
7991         WARN_ON(err);
7992
7993         /* Add the device back in the hashes */
7994         list_netdevice(dev);
7995
7996         /* Notify protocols, that a new device appeared. */
7997         call_netdevice_notifiers(NETDEV_REGISTER, dev);
7998
7999         /*
8000          *      Prevent userspace races by waiting until the network
8001          *      device is fully setup before sending notifications.
8002          */
8003         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
8004
8005         synchronize_net();
8006         err = 0;
8007 out:
8008         return err;
8009 }
8010 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
8011
8012 static int dev_cpu_dead(unsigned int oldcpu)
8013 {
8014         struct sk_buff **list_skb;
8015         struct sk_buff *skb;
8016         unsigned int cpu;
8017         struct softnet_data *sd, *oldsd;
8018
8019         local_irq_disable();
8020         cpu = smp_processor_id();
8021         sd = &per_cpu(softnet_data, cpu);
8022         oldsd = &per_cpu(softnet_data, oldcpu);
8023
8024         /* Find end of our completion_queue. */
8025         list_skb = &sd->completion_queue;
8026         while (*list_skb)
8027                 list_skb = &(*list_skb)->next;
8028         /* Append completion queue from offline CPU. */
8029         *list_skb = oldsd->completion_queue;
8030         oldsd->completion_queue = NULL;
8031
8032         /* Append output queue from offline CPU. */
8033         if (oldsd->output_queue) {
8034                 *sd->output_queue_tailp = oldsd->output_queue;
8035                 sd->output_queue_tailp = oldsd->output_queue_tailp;
8036                 oldsd->output_queue = NULL;
8037                 oldsd->output_queue_tailp = &oldsd->output_queue;
8038         }
8039         /* Append NAPI poll list from offline CPU, with one exception :
8040          * process_backlog() must be called by cpu owning percpu backlog.
8041          * We properly handle process_queue & input_pkt_queue later.
8042          */
8043         while (!list_empty(&oldsd->poll_list)) {
8044                 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
8045                                                             struct napi_struct,
8046                                                             poll_list);
8047
8048                 list_del_init(&napi->poll_list);
8049                 if (napi->poll == process_backlog)
8050                         napi->state = 0;
8051                 else
8052                         ____napi_schedule(sd, napi);
8053         }
8054
8055         raise_softirq_irqoff(NET_TX_SOFTIRQ);
8056         local_irq_enable();
8057
8058         /* Process offline CPU's input_pkt_queue */
8059         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
8060                 netif_rx_ni(skb);
8061                 input_queue_head_incr(oldsd);
8062         }
8063         while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
8064                 netif_rx_ni(skb);
8065                 input_queue_head_incr(oldsd);
8066         }
8067
8068         return 0;
8069 }
8070
8071 /**
8072  *      netdev_increment_features - increment feature set by one
8073  *      @all: current feature set
8074  *      @one: new feature set
8075  *      @mask: mask feature set
8076  *
8077  *      Computes a new feature set after adding a device with feature set
8078  *      @one to the master device with current feature set @all.  Will not
8079  *      enable anything that is off in @mask. Returns the new feature set.
8080  */
8081 netdev_features_t netdev_increment_features(netdev_features_t all,
8082         netdev_features_t one, netdev_features_t mask)
8083 {
8084         if (mask & NETIF_F_HW_CSUM)
8085                 mask |= NETIF_F_CSUM_MASK;
8086         mask |= NETIF_F_VLAN_CHALLENGED;
8087
8088         all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
8089         all &= one | ~NETIF_F_ALL_FOR_ALL;
8090
8091         /* If one device supports hw checksumming, set for all. */
8092         if (all & NETIF_F_HW_CSUM)
8093                 all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
8094
8095         return all;
8096 }
8097 EXPORT_SYMBOL(netdev_increment_features);
8098
8099 static struct hlist_head * __net_init netdev_create_hash(void)
8100 {
8101         int i;
8102         struct hlist_head *hash;
8103
8104         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
8105         if (hash != NULL)
8106                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
8107                         INIT_HLIST_HEAD(&hash[i]);
8108
8109         return hash;
8110 }
8111
8112 /* Initialize per network namespace state */
8113 static int __net_init netdev_init(struct net *net)
8114 {
8115         if (net != &init_net)
8116                 INIT_LIST_HEAD(&net->dev_base_head);
8117
8118         net->dev_name_head = netdev_create_hash();
8119         if (net->dev_name_head == NULL)
8120                 goto err_name;
8121
8122         net->dev_index_head = netdev_create_hash();
8123         if (net->dev_index_head == NULL)
8124                 goto err_idx;
8125
8126         return 0;
8127
8128 err_idx:
8129         kfree(net->dev_name_head);
8130 err_name:
8131         return -ENOMEM;
8132 }
8133
8134 /**
8135  *      netdev_drivername - network driver for the device
8136  *      @dev: network device
8137  *
8138  *      Determine network driver for device.
8139  */
8140 const char *netdev_drivername(const struct net_device *dev)
8141 {
8142         const struct device_driver *driver;
8143         const struct device *parent;
8144         const char *empty = "";
8145
8146         parent = dev->dev.parent;
8147         if (!parent)
8148                 return empty;
8149
8150         driver = parent->driver;
8151         if (driver && driver->name)
8152                 return driver->name;
8153         return empty;
8154 }
8155
8156 static void __netdev_printk(const char *level, const struct net_device *dev,
8157                             struct va_format *vaf)
8158 {
8159         if (dev && dev->dev.parent) {
8160                 dev_printk_emit(level[1] - '0',
8161                                 dev->dev.parent,
8162                                 "%s %s %s%s: %pV",
8163                                 dev_driver_string(dev->dev.parent),
8164                                 dev_name(dev->dev.parent),
8165                                 netdev_name(dev), netdev_reg_state(dev),
8166                                 vaf);
8167         } else if (dev) {
8168                 printk("%s%s%s: %pV",
8169                        level, netdev_name(dev), netdev_reg_state(dev), vaf);
8170         } else {
8171                 printk("%s(NULL net_device): %pV", level, vaf);
8172         }
8173 }
8174
8175 void netdev_printk(const char *level, const struct net_device *dev,
8176                    const char *format, ...)
8177 {
8178         struct va_format vaf;
8179         va_list args;
8180
8181         va_start(args, format);
8182
8183         vaf.fmt = format;
8184         vaf.va = &args;
8185
8186         __netdev_printk(level, dev, &vaf);
8187
8188         va_end(args);
8189 }
8190 EXPORT_SYMBOL(netdev_printk);
8191
8192 #define define_netdev_printk_level(func, level)                 \
8193 void func(const struct net_device *dev, const char *fmt, ...)   \
8194 {                                                               \
8195         struct va_format vaf;                                   \
8196         va_list args;                                           \
8197                                                                 \
8198         va_start(args, fmt);                                    \
8199                                                                 \
8200         vaf.fmt = fmt;                                          \
8201         vaf.va = &args;                                         \
8202                                                                 \
8203         __netdev_printk(level, dev, &vaf);                      \
8204                                                                 \
8205         va_end(args);                                           \
8206 }                                                               \
8207 EXPORT_SYMBOL(func);
8208
8209 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
8210 define_netdev_printk_level(netdev_alert, KERN_ALERT);
8211 define_netdev_printk_level(netdev_crit, KERN_CRIT);
8212 define_netdev_printk_level(netdev_err, KERN_ERR);
8213 define_netdev_printk_level(netdev_warn, KERN_WARNING);
8214 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
8215 define_netdev_printk_level(netdev_info, KERN_INFO);
8216
8217 static void __net_exit netdev_exit(struct net *net)
8218 {
8219         kfree(net->dev_name_head);
8220         kfree(net->dev_index_head);
8221 }
8222
8223 static struct pernet_operations __net_initdata netdev_net_ops = {
8224         .init = netdev_init,
8225         .exit = netdev_exit,
8226 };
8227
8228 static void __net_exit default_device_exit(struct net *net)
8229 {
8230         struct net_device *dev, *aux;
8231         /*
8232          * Push all migratable network devices back to the
8233          * initial network namespace
8234          */
8235         rtnl_lock();
8236         for_each_netdev_safe(net, dev, aux) {
8237                 int err;
8238                 char fb_name[IFNAMSIZ];
8239
8240                 /* Ignore unmoveable devices (i.e. loopback) */
8241                 if (dev->features & NETIF_F_NETNS_LOCAL)
8242                         continue;
8243
8244                 /* Leave virtual devices for the generic cleanup */
8245                 if (dev->rtnl_link_ops)
8246                         continue;
8247
8248                 /* Push remaining network devices to init_net */
8249                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
8250                 err = dev_change_net_namespace(dev, &init_net, fb_name);
8251                 if (err) {
8252                         pr_emerg("%s: failed to move %s to init_net: %d\n",
8253                                  __func__, dev->name, err);
8254                         BUG();
8255                 }
8256         }
8257         rtnl_unlock();
8258 }
8259
8260 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
8261 {
8262         /* Return with the rtnl_lock held when there are no network
8263          * devices unregistering in any network namespace in net_list.
8264          */
8265         struct net *net;
8266         bool unregistering;
8267         DEFINE_WAIT_FUNC(wait, woken_wake_function);
8268
8269         add_wait_queue(&netdev_unregistering_wq, &wait);
8270         for (;;) {
8271                 unregistering = false;
8272                 rtnl_lock();
8273                 list_for_each_entry(net, net_list, exit_list) {
8274                         if (net->dev_unreg_count > 0) {
8275                                 unregistering = true;
8276                                 break;
8277                         }
8278                 }
8279                 if (!unregistering)
8280                         break;
8281                 __rtnl_unlock();
8282
8283                 wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
8284         }
8285         remove_wait_queue(&netdev_unregistering_wq, &wait);
8286 }
8287
8288 static void __net_exit default_device_exit_batch(struct list_head *net_list)
8289 {
8290         /* At exit all network devices most be removed from a network
8291          * namespace.  Do this in the reverse order of registration.
8292          * Do this across as many network namespaces as possible to
8293          * improve batching efficiency.
8294          */
8295         struct net_device *dev;
8296         struct net *net;
8297         LIST_HEAD(dev_kill_list);
8298
8299         /* To prevent network device cleanup code from dereferencing
8300          * loopback devices or network devices that have been freed
8301          * wait here for all pending unregistrations to complete,
8302          * before unregistring the loopback device and allowing the
8303          * network namespace be freed.
8304          *
8305          * The netdev todo list containing all network devices
8306          * unregistrations that happen in default_device_exit_batch
8307          * will run in the rtnl_unlock() at the end of
8308          * default_device_exit_batch.
8309          */
8310         rtnl_lock_unregistering(net_list);
8311         list_for_each_entry(net, net_list, exit_list) {
8312                 for_each_netdev_reverse(net, dev) {
8313                         if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
8314                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
8315                         else
8316                                 unregister_netdevice_queue(dev, &dev_kill_list);
8317                 }
8318         }
8319         unregister_netdevice_many(&dev_kill_list);
8320         rtnl_unlock();
8321 }
8322
8323 static struct pernet_operations __net_initdata default_device_ops = {
8324         .exit = default_device_exit,
8325         .exit_batch = default_device_exit_batch,
8326 };
8327
8328 /*
8329  *      Initialize the DEV module. At boot time this walks the device list and
8330  *      unhooks any devices that fail to initialise (normally hardware not
8331  *      present) and leaves us with a valid list of present and active devices.
8332  *
8333  */
8334
8335 /*
8336  *       This is called single threaded during boot, so no need
8337  *       to take the rtnl semaphore.
8338  */
8339 static int __init net_dev_init(void)
8340 {
8341         int i, rc = -ENOMEM;
8342
8343         BUG_ON(!dev_boot_phase);
8344
8345         if (dev_proc_init())
8346                 goto out;
8347
8348         if (netdev_kobject_init())
8349                 goto out;
8350
8351         INIT_LIST_HEAD(&ptype_all);
8352         for (i = 0; i < PTYPE_HASH_SIZE; i++)
8353                 INIT_LIST_HEAD(&ptype_base[i]);
8354
8355         INIT_LIST_HEAD(&offload_base);
8356
8357         if (register_pernet_subsys(&netdev_net_ops))
8358                 goto out;
8359
8360         /*
8361          *      Initialise the packet receive queues.
8362          */
8363
8364         for_each_possible_cpu(i) {
8365                 struct work_struct *flush = per_cpu_ptr(&flush_works, i);
8366                 struct softnet_data *sd = &per_cpu(softnet_data, i);
8367
8368                 INIT_WORK(flush, flush_backlog);
8369
8370                 skb_queue_head_init(&sd->input_pkt_queue);
8371                 skb_queue_head_init(&sd->process_queue);
8372                 INIT_LIST_HEAD(&sd->poll_list);
8373                 sd->output_queue_tailp = &sd->output_queue;
8374 #ifdef CONFIG_RPS
8375                 sd->csd.func = rps_trigger_softirq;
8376                 sd->csd.info = sd;
8377                 sd->cpu = i;
8378 #endif
8379
8380                 sd->backlog.poll = process_backlog;
8381                 sd->backlog.weight = weight_p;
8382         }
8383
8384         dev_boot_phase = 0;
8385
8386         /* The loopback device is special if any other network devices
8387          * is present in a network namespace the loopback device must
8388          * be present. Since we now dynamically allocate and free the
8389          * loopback device ensure this invariant is maintained by
8390          * keeping the loopback device as the first device on the
8391          * list of network devices.  Ensuring the loopback devices
8392          * is the first device that appears and the last network device
8393          * that disappears.
8394          */
8395         if (register_pernet_device(&loopback_net_ops))
8396                 goto out;
8397
8398         if (register_pernet_device(&default_device_ops))
8399                 goto out;
8400
8401         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
8402         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
8403
8404         rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
8405                                        NULL, dev_cpu_dead);
8406         WARN_ON(rc < 0);
8407         dst_subsys_init();
8408         rc = 0;
8409 out:
8410         return rc;
8411 }
8412
8413 subsys_initcall(net_dev_init);