asedeno.scripts.mit.edu Git - linux.git/commitdiff
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
author David S. Miller <davem@davemloft.net>
Sat, 2 Sep 2017 00:42:05 +0000 (17:42 -0700)
committer David S. Miller <davem@davemloft.net>
Sat, 2 Sep 2017 00:42:05 +0000 (17:42 -0700)
Three cases of simple overlapping changes.

Signed-off-by: David S. Miller <davem@davemloft.net>
78 files changed:
drivers/net/dsa/bcm_sf2.c
drivers/net/dsa/bcm_sf2.h
drivers/net/ethernet/aquantia/atlantic/aq_ring.c
drivers/net/ethernet/aquantia/atlantic/aq_vec.c
drivers/net/ethernet/broadcom/bcmsysport.c
drivers/net/ethernet/broadcom/bnxt/bnxt.c
drivers/net/ethernet/broadcom/genet/bcmgenet.c
drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
drivers/net/ethernet/faraday/ftgmac100.c
drivers/net/ethernet/freescale/fman/mac.c
drivers/net/ethernet/marvell/mvpp2.c
drivers/net/ethernet/mellanox/mlx5/core/en.h
drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
drivers/net/ethernet/mellanox/mlx5/core/en_main.c
drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
drivers/net/ethernet/mellanox/mlx5/core/main.c
drivers/net/ethernet/mellanox/mlxsw/spectrum.c
drivers/net/ethernet/netronome/nfp/flower/offload.c
drivers/net/ethernet/netronome/nfp/nfp_main.c
drivers/net/ethernet/netronome/nfp/nfp_net_common.c
drivers/net/ethernet/netronome/nfp/nfp_net_main.c
drivers/net/ethernet/qlogic/qlge/qlge_dbg.c
drivers/net/hyperv/netvsc_drv.c
drivers/net/phy/phy.c
drivers/net/phy/phy_device.c
drivers/net/usb/cdc_ncm.c
drivers/net/virtio_net.c
drivers/net/wireless/intel/iwlwifi/pcie/internal.h
drivers/net/wireless/intel/iwlwifi/pcie/rx.c
drivers/net/wireless/intel/iwlwifi/pcie/trans.c
include/linux/mlx5/driver.h
include/linux/netdevice.h
include/linux/skbuff.h
include/net/ip6_fib.h
include/net/sch_generic.h
include/net/tcp.h
include/net/udp.h
kernel/events/core.c
net/core/datagram.c
net/core/dev.c
net/core/filter.c
net/core/skbuff.c
net/dsa/dsa2.c
net/dsa/tag_ksz.c
net/dsa/tag_trailer.c
net/ipv4/esp4.c
net/ipv4/esp4_offload.c
net/ipv4/tcp.c
net/ipv4/tcp_cong.c
net/ipv4/udp.c
net/ipv6/addrconf.c
net/ipv6/esp6.c
net/ipv6/esp6_offload.c
net/ipv6/ip6_fib.c
net/ipv6/route.c
net/ipv6/udp.c
net/kcm/kcmsock.c
net/packet/af_packet.c
net/sched/cls_api.c
net/sched/sch_api.c
net/sched/sch_cbq.c
net/sched/sch_fq_codel.c
net/sched/sch_generic.c
net/sched/sch_hfsc.c
net/sched/sch_htb.c
net/sched/sch_multiq.c
net/sched/sch_netem.c
net/sched/sch_sfq.c
net/sched/sch_tbf.c
net/sctp/socket.c
net/tipc/bearer.c
net/tipc/bearer.h
net/tipc/node.c
net/xfrm/xfrm_policy.c
net/xfrm/xfrm_state.c
net/xfrm/xfrm_user.c

index 8492c9d64004d1a76cc0a63fdc572b2e001d1d2d,9b6ce7c3f6c3228c88fced286a31c3be0c19900f..554fe2df9365c1dcac7da70c17da37a8dcb0fa74
@@@ -327,8 -327,12 +327,8 @@@ static void bcm_sf2_port_disable(struc
  static int bcm_sf2_eee_init(struct dsa_switch *ds, int port,
                            struct phy_device *phy)
  {
 -      struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
 -      struct ethtool_eee *p = &priv->port_sts[port].eee;
        int ret;
  
 -      p->supported = (SUPPORTED_1000baseT_Full | SUPPORTED_100baseT_Full);
 -
        ret = phy_init_eee(phy, 0);
        if (ret)
                return 0;
        return 1;
  }
  
 -static int bcm_sf2_sw_get_eee(struct dsa_switch *ds, int port,
 -                            struct ethtool_eee *e)
 +static int bcm_sf2_sw_get_mac_eee(struct dsa_switch *ds, int port,
 +                                struct ethtool_eee *e)
  {
        struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
        struct ethtool_eee *p = &priv->port_sts[port].eee;
        return 0;
  }
  
 -static int bcm_sf2_sw_set_eee(struct dsa_switch *ds, int port,
 -                            struct phy_device *phydev,
 -                            struct ethtool_eee *e)
 +static int bcm_sf2_sw_set_mac_eee(struct dsa_switch *ds, int port,
 +                                struct ethtool_eee *e)
  {
        struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
        struct ethtool_eee *p = &priv->port_sts[port].eee;
  
        p->eee_enabled = e->eee_enabled;
 -
 -      if (!p->eee_enabled) {
 -              bcm_sf2_eee_enable_set(ds, port, false);
 -      } else {
 -              p->eee_enabled = bcm_sf2_eee_init(ds, port, phydev);
 -              if (!p->eee_enabled)
 -                      return -EOPNOTSUPP;
 -      }
 +      bcm_sf2_eee_enable_set(ds, port, e->eee_enabled);
  
        return 0;
  }
@@@ -788,7 -800,7 +788,7 @@@ static int bcm_sf2_sw_resume(struct dsa
  static void bcm_sf2_sw_get_wol(struct dsa_switch *ds, int port,
                               struct ethtool_wolinfo *wol)
  {
 -      struct net_device *p = ds->dst[ds->index].cpu_dp->netdev;
 +      struct net_device *p = ds->dst->cpu_dp->netdev;
        struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
        struct ethtool_wolinfo pwol;
  
  static int bcm_sf2_sw_set_wol(struct dsa_switch *ds, int port,
                              struct ethtool_wolinfo *wol)
  {
 -      struct net_device *p = ds->dst[ds->index].cpu_dp->netdev;
 +      struct net_device *p = ds->dst->cpu_dp->netdev;
        struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
        s8 cpu_port = ds->dst->cpu_dp->index;
        struct ethtool_wolinfo pwol;
@@@ -983,7 -995,7 +983,7 @@@ static int bcm_sf2_core_write64(struct 
        return 0;
  }
  
 -static struct b53_io_ops bcm_sf2_io_ops = {
 +static const struct b53_io_ops bcm_sf2_io_ops = {
        .read8  = bcm_sf2_core_read8,
        .read16 = bcm_sf2_core_read16,
        .read32 = bcm_sf2_core_read32,
@@@ -1011,8 -1023,8 +1011,8 @@@ static const struct dsa_switch_ops bcm_
        .set_wol                = bcm_sf2_sw_set_wol,
        .port_enable            = bcm_sf2_port_setup,
        .port_disable           = bcm_sf2_port_disable,
 -      .get_eee                = bcm_sf2_sw_get_eee,
 -      .set_eee                = bcm_sf2_sw_set_eee,
 +      .get_mac_eee            = bcm_sf2_sw_get_mac_eee,
 +      .set_mac_eee            = bcm_sf2_sw_set_mac_eee,
        .port_bridge_join       = b53_br_join,
        .port_bridge_leave      = b53_br_leave,
        .port_stp_state_set     = b53_br_set_stp_state,
        .port_vlan_prepare      = b53_vlan_prepare,
        .port_vlan_add          = b53_vlan_add,
        .port_vlan_del          = b53_vlan_del,
 -      .port_vlan_dump         = b53_vlan_dump,
 -      .port_fdb_prepare       = b53_fdb_prepare,
        .port_fdb_dump          = b53_fdb_dump,
        .port_fdb_add           = b53_fdb_add,
        .port_fdb_del           = b53_fdb_del,
@@@ -1034,6 -1048,7 +1034,7 @@@ struct bcm_sf2_of_data 
        u32 type;
        const u16 *reg_offsets;
        unsigned int core_reg_align;
+       unsigned int num_cfp_rules;
  };
  
  /* Register offsets for the SWITCH_REG_* block */
@@@ -1057,6 -1072,7 +1058,7 @@@ static const struct bcm_sf2_of_data bcm
        .type           = BCM7445_DEVICE_ID,
        .core_reg_align = 0,
        .reg_offsets    = bcm_sf2_7445_reg_offsets,
+       .num_cfp_rules  = 256,
  };
  
  static const u16 bcm_sf2_7278_reg_offsets[] = {
@@@ -1079,6 -1095,7 +1081,7 @@@ static const struct bcm_sf2_of_data bcm
        .type           = BCM7278_DEVICE_ID,
        .core_reg_align = 1,
        .reg_offsets    = bcm_sf2_7278_reg_offsets,
+       .num_cfp_rules  = 128,
  };
  
  static const struct of_device_id bcm_sf2_of_match[] = {
@@@ -1135,6 -1152,7 +1138,7 @@@ static int bcm_sf2_sw_probe(struct plat
        priv->type = data->type;
        priv->reg_offsets = data->reg_offsets;
        priv->core_reg_align = data->core_reg_align;
+       priv->num_cfp_rules = data->num_cfp_rules;
  
        /* Auto-detection using standard registers will not work, so
         * provide an indication of what kind of device we are for
index d9c96b281fc0af229d505aa4fa58a7568b34bbda,7f9125eef3df42038fdce368d4a5fe2690d3276c..02c499f9c56b3bdd0cccb277ce0be9e13d318055
@@@ -72,6 -72,7 +72,7 @@@ struct bcm_sf2_priv 
        u32                             type;
        const u16                       *reg_offsets;
        unsigned int                    core_reg_align;
+       unsigned int                    num_cfp_rules;
  
        /* spinlock protecting access to the indirect registers */
        spinlock_t                      indir_lock;
@@@ -130,12 -131,12 +131,12 @@@ static inline u32 bcm_sf2_mangle_addr(s
  #define SF2_IO_MACRO(name) \
  static inline u32 name##_readl(struct bcm_sf2_priv *priv, u32 off)    \
  {                                                                     \
 -      return __raw_readl(priv->name + off);                           \
 +      return readl_relaxed(priv->name + off);                         \
  }                                                                     \
  static inline void name##_writel(struct bcm_sf2_priv *priv,           \
                                  u32 val, u32 off)                     \
  {                                                                     \
 -      __raw_writel(val, priv->name + off);                            \
 +      writel_relaxed(val, priv->name + off);                          \
  }                                                                     \
  
  /* Accesses to 64-bits register requires us to latch the hi/lo pairs
@@@ -179,23 -180,23 +180,23 @@@ static inline void intrl2_##which##_mas
  static inline u32 core_readl(struct bcm_sf2_priv *priv, u32 off)
  {
        u32 tmp = bcm_sf2_mangle_addr(priv, off);
 -      return __raw_readl(priv->core + tmp);
 +      return readl_relaxed(priv->core + tmp);
  }
  
  static inline void core_writel(struct bcm_sf2_priv *priv, u32 val, u32 off)
  {
        u32 tmp = bcm_sf2_mangle_addr(priv, off);
 -      __raw_writel(val, priv->core + tmp);
 +      writel_relaxed(val, priv->core + tmp);
  }
  
  static inline u32 reg_readl(struct bcm_sf2_priv *priv, u16 off)
  {
 -      return __raw_readl(priv->reg + priv->reg_offsets[off]);
 +      return readl_relaxed(priv->reg + priv->reg_offsets[off]);
  }
  
  static inline void reg_writel(struct bcm_sf2_priv *priv, u32 val, u16 off)
  {
 -      __raw_writel(val, priv->reg + priv->reg_offsets[off]);
 +      writel_relaxed(val, priv->reg + priv->reg_offsets[off]);
  }
  
  SF2_IO64_MACRO(core);
index 4b445750b93e15733ff63b67945f88d7a2ca84df,ec5579fb8268b29c6040f524fe20495b43fa617d..4eee1996a8259e561c15a17b9a3792ef79f280e2
@@@ -101,7 -101,6 +101,6 @@@ int aq_ring_init(struct aq_ring_s *self
        self->hw_head = 0;
        self->sw_head = 0;
        self->sw_tail = 0;
-       spin_lock_init(&self->header.lock);
        return 0;
  }
  
@@@ -134,10 -133,7 +133,10 @@@ static inline unsigned int aq_ring_dx_i
  }
  
  #define AQ_SKB_ALIGN SKB_DATA_ALIGN(sizeof(struct skb_shared_info))
 -int aq_ring_rx_clean(struct aq_ring_s *self, int *work_done, int budget)
 +int aq_ring_rx_clean(struct aq_ring_s *self,
 +                   struct napi_struct *napi,
 +                   int *work_done,
 +                   int budget)
  {
        struct net_device *ndev = aq_nic_get_ndev(self->aq_nic);
        int err = 0;
  
                skb_record_rx_queue(skb, self->idx);
  
 -              netif_receive_skb(skb);
 +              napi_gro_receive(napi, skb);
  
                ++self->stats.rx.packets;
                self->stats.rx.bytes += skb->len;
index ec390c5eed35efb54d838a94f466833cdd3806eb,fee446af748ff1a64a984cf4b5ec12dce46471d1..ebf588004c4677140934b152eeab13d4d3aecbf7
@@@ -34,8 -34,6 +34,6 @@@ struct aq_vec_s 
  #define AQ_VEC_RX_ID 1
  
  static int aq_vec_poll(struct napi_struct *napi, int budget)
- __releases(&self->lock)
- __acquires(&self->lock)
  {
        struct aq_vec_s *self = container_of(napi, struct aq_vec_s, napi);
        struct aq_ring_s *ring = NULL;
@@@ -47,7 -45,7 +45,7 @@@
  
        if (!self) {
                err = -EINVAL;
-       } else if (spin_trylock(&self->header.lock)) {
+       } else {
                for (i = 0U, ring = self->ring[0];
                        self->tx_rings > i; ++i, ring = self->ring[i]) {
                        if (self->aq_hw_ops->hw_ring_tx_head_update) {
@@@ -78,7 -76,6 +76,7 @@@
                        if (ring[AQ_VEC_RX_ID].sw_head !=
                                ring[AQ_VEC_RX_ID].hw_head) {
                                err = aq_ring_rx_clean(&ring[AQ_VEC_RX_ID],
 +                                                     napi,
                                                       &work_done,
                                                       budget - work_done);
                                if (err < 0)
                        self->aq_hw_ops->hw_irq_enable(self->aq_hw,
                                        1U << self->aq_ring_param.vec_idx);
                }
- err_exit:
-               spin_unlock(&self->header.lock);
        }
+ err_exit:
        return work_done;
  }
  
@@@ -186,8 -180,6 +181,6 @@@ int aq_vec_init(struct aq_vec_s *self, 
        self->aq_hw_ops = aq_hw_ops;
        self->aq_hw = aq_hw;
  
-       spin_lock_init(&self->header.lock);
        for (i = 0U, ring = self->ring[0];
                self->tx_rings > i; ++i, ring = self->ring[i]) {
                err = aq_ring_init(&ring[AQ_VEC_TX_ID]);
index 931751e4f3692ffa00dfa385a68aad34675117a0,c28fa5a8734cbc769adc16dfc5e36a8cd13b35cb..eec77fae12a14c5c1c1f1136dd3f286968f57069
  #define BCM_SYSPORT_IO_MACRO(name, offset) \
  static inline u32 name##_readl(struct bcm_sysport_priv *priv, u32 off)        \
  {                                                                     \
 -      u32 reg = __raw_readl(priv->base + offset + off);               \
 +      u32 reg = readl_relaxed(priv->base + offset + off);             \
        return reg;                                                     \
  }                                                                     \
  static inline void name##_writel(struct bcm_sysport_priv *priv,               \
                                  u32 val, u32 off)                     \
  {                                                                     \
 -      __raw_writel(val, priv->base + offset + off);                   \
 +      writel_relaxed(val, priv->base + offset + off);                 \
  }                                                                     \
  
  BCM_SYSPORT_IO_MACRO(intrl2_0, SYS_PORT_INTRL2_0_OFFSET);
@@@ -59,14 -59,14 +59,14 @@@ static inline u32 rdma_readl(struct bcm
  {
        if (priv->is_lite && off >= RDMA_STATUS)
                off += 4;
 -      return __raw_readl(priv->base + SYS_PORT_RDMA_OFFSET + off);
 +      return readl_relaxed(priv->base + SYS_PORT_RDMA_OFFSET + off);
  }
  
  static inline void rdma_writel(struct bcm_sysport_priv *priv, u32 val, u32 off)
  {
        if (priv->is_lite && off >= RDMA_STATUS)
                off += 4;
 -      __raw_writel(val, priv->base + SYS_PORT_RDMA_OFFSET + off);
 +      writel_relaxed(val, priv->base + SYS_PORT_RDMA_OFFSET + off);
  }
  
  static inline u32 tdma_control_bit(struct bcm_sysport_priv *priv, u32 bit)
@@@ -110,10 -110,10 +110,10 @@@ static inline void dma_desc_set_addr(st
                                     dma_addr_t addr)
  {
  #ifdef CONFIG_PHYS_ADDR_T_64BIT
 -      __raw_writel(upper_32_bits(addr) & DESC_ADDR_HI_MASK,
 +      writel_relaxed(upper_32_bits(addr) & DESC_ADDR_HI_MASK,
                     d + DESC_ADDR_HI_STATUS_LEN);
  #endif
 -      __raw_writel(lower_32_bits(addr), d + DESC_ADDR_LO);
 +      writel_relaxed(lower_32_bits(addr), d + DESC_ADDR_LO);
  }
  
  static inline void tdma_port_write_desc_addr(struct bcm_sysport_priv *priv,
@@@ -201,10 -201,10 +201,10 @@@ static int bcm_sysport_set_features(str
   */
  static const struct bcm_sysport_stats bcm_sysport_gstrings_stats[] = {
        /* general stats */
 -      STAT_NETDEV(rx_packets),
 -      STAT_NETDEV(tx_packets),
 -      STAT_NETDEV(rx_bytes),
 -      STAT_NETDEV(tx_bytes),
 +      STAT_NETDEV64(rx_packets),
 +      STAT_NETDEV64(tx_packets),
 +      STAT_NETDEV64(rx_bytes),
 +      STAT_NETDEV64(tx_bytes),
        STAT_NETDEV(rx_errors),
        STAT_NETDEV(tx_errors),
        STAT_NETDEV(rx_dropped),
@@@ -316,7 -316,6 +316,7 @@@ static inline bool bcm_sysport_lite_sta
  {
        switch (type) {
        case BCM_SYSPORT_STAT_NETDEV:
 +      case BCM_SYSPORT_STAT_NETDEV64:
        case BCM_SYSPORT_STAT_RXCHK:
        case BCM_SYSPORT_STAT_RBUF:
        case BCM_SYSPORT_STAT_SOFT:
@@@ -399,7 -398,6 +399,7 @@@ static void bcm_sysport_update_mib_coun
                s = &bcm_sysport_gstrings_stats[i];
                switch (s->type) {
                case BCM_SYSPORT_STAT_NETDEV:
 +              case BCM_SYSPORT_STAT_NETDEV64:
                case BCM_SYSPORT_STAT_SOFT:
                        continue;
                case BCM_SYSPORT_STAT_MIB_RX:
@@@ -436,10 -434,7 +436,10 @@@ static void bcm_sysport_get_stats(struc
                                  struct ethtool_stats *stats, u64 *data)
  {
        struct bcm_sysport_priv *priv = netdev_priv(dev);
 +      struct bcm_sysport_stats64 *stats64 = &priv->stats64;
 +      struct u64_stats_sync *syncp = &priv->syncp;
        struct bcm_sysport_tx_ring *ring;
 +      unsigned int start;
        int i, j;
  
        if (netif_running(dev))
                s = &bcm_sysport_gstrings_stats[i];
                if (s->type == BCM_SYSPORT_STAT_NETDEV)
                        p = (char *)&dev->stats;
 +              else if (s->type == BCM_SYSPORT_STAT_NETDEV64)
 +                      p = (char *)stats64;
                else
                        p = (char *)priv;
  
                if (priv->is_lite && !bcm_sysport_lite_stat_valid(s->type))
                        continue;
 -
                p += s->stat_offset;
 -              data[j] = *(unsigned long *)p;
 +
 +              if (s->stat_sizeof == sizeof(u64))
 +                      do {
 +                              start = u64_stats_fetch_begin_irq(syncp);
 +                              data[i] = *(u64 *)p;
 +                      } while (u64_stats_fetch_retry_irq(syncp, start));
 +              else
 +                      data[i] = *(u32 *)p;
                j++;
        }
  
@@@ -610,7 -597,7 +610,7 @@@ static int bcm_sysport_set_coalesce(str
  
  static void bcm_sysport_free_cb(struct bcm_sysport_cb *cb)
  {
-       dev_kfree_skb_any(cb->skb);
+       dev_consume_skb_any(cb->skb);
        cb->skb = NULL;
        dma_unmap_addr_set(cb, dma_addr, 0);
  }
@@@ -679,7 -666,6 +679,7 @@@ static int bcm_sysport_alloc_rx_bufs(st
  static unsigned int bcm_sysport_desc_rx(struct bcm_sysport_priv *priv,
                                        unsigned int budget)
  {
 +      struct bcm_sysport_stats64 *stats64 = &priv->stats64;
        struct net_device *ndev = priv->netdev;
        unsigned int processed = 0, to_process;
        struct bcm_sysport_cb *cb;
                skb->protocol = eth_type_trans(skb, ndev);
                ndev->stats.rx_packets++;
                ndev->stats.rx_bytes += len;
 +              u64_stats_update_begin(&priv->syncp);
 +              stats64->rx_packets++;
 +              stats64->rx_bytes += len;
 +              u64_stats_update_end(&priv->syncp);
  
                napi_gro_receive(&priv->napi, skb);
  next:
@@@ -809,15 -791,17 +809,15 @@@ static void bcm_sysport_tx_reclaim_one(
        struct device *kdev = &priv->pdev->dev;
  
        if (cb->skb) {
 -              ring->bytes += cb->skb->len;
                *bytes_compl += cb->skb->len;
                dma_unmap_single(kdev, dma_unmap_addr(cb, dma_addr),
                                 dma_unmap_len(cb, dma_len),
                                 DMA_TO_DEVICE);
 -              ring->packets++;
                (*pkts_compl)++;
                bcm_sysport_free_cb(cb);
        /* SKB fragment */
        } else if (dma_unmap_addr(cb, dma_addr)) {
 -              ring->bytes += dma_unmap_len(cb, dma_len);
 +              *bytes_compl += dma_unmap_len(cb, dma_len);
                dma_unmap_page(kdev, dma_unmap_addr(cb, dma_addr),
                               dma_unmap_len(cb, dma_len), DMA_TO_DEVICE);
                dma_unmap_addr_set(cb, dma_addr, 0);
  static unsigned int __bcm_sysport_tx_reclaim(struct bcm_sysport_priv *priv,
                                             struct bcm_sysport_tx_ring *ring)
  {
 -      struct net_device *ndev = priv->netdev;
        unsigned int c_index, last_c_index, last_tx_cn, num_tx_cbs;
        unsigned int pkts_compl = 0, bytes_compl = 0;
 +      struct net_device *ndev = priv->netdev;
        struct bcm_sysport_cb *cb;
        u32 hw_ind;
  
                last_c_index &= (num_tx_cbs - 1);
        }
  
 +      u64_stats_update_begin(&priv->syncp);
 +      ring->packets += pkts_compl;
 +      ring->bytes += bytes_compl;
 +      u64_stats_update_end(&priv->syncp);
 +
        ring->c_index = c_index;
  
        netif_dbg(priv, tx_done, ndev,
@@@ -1367,6 -1346,8 +1367,8 @@@ static int bcm_sysport_init_tx_ring(str
  
        ring->cbs = kcalloc(size, sizeof(struct bcm_sysport_cb), GFP_KERNEL);
        if (!ring->cbs) {
+               dma_free_coherent(kdev, sizeof(struct dma_desc),
+                                 ring->desc_cpu, ring->desc_dma);
                netif_err(priv, hw, priv->netdev, "CB allocation failed\n");
                return -ENOMEM;
        }
@@@ -1696,41 -1677,22 +1698,41 @@@ static int bcm_sysport_change_mac(struc
        return 0;
  }
  
 -static struct net_device_stats *bcm_sysport_get_nstats(struct net_device *dev)
 +static void bcm_sysport_get_stats64(struct net_device *dev,
 +                                  struct rtnl_link_stats64 *stats)
  {
        struct bcm_sysport_priv *priv = netdev_priv(dev);
 -      unsigned long tx_bytes = 0, tx_packets = 0;
 +      struct bcm_sysport_stats64 *stats64 = &priv->stats64;
        struct bcm_sysport_tx_ring *ring;
 +      u64 tx_packets = 0, tx_bytes = 0;
 +      unsigned int start;
        unsigned int q;
  
 +      netdev_stats_to_stats64(stats, &dev->stats);
 +
        for (q = 0; q < dev->num_tx_queues; q++) {
                ring = &priv->tx_rings[q];
 -              tx_bytes += ring->bytes;
 -              tx_packets += ring->packets;
 +              do {
 +                      start = u64_stats_fetch_begin_irq(&priv->syncp);
 +                      tx_bytes = ring->bytes;
 +                      tx_packets = ring->packets;
 +              } while (u64_stats_fetch_retry_irq(&priv->syncp, start));
 +
 +              stats->tx_bytes += tx_bytes;
 +              stats->tx_packets += tx_packets;
        }
  
 -      dev->stats.tx_bytes = tx_bytes;
 -      dev->stats.tx_packets = tx_packets;
 -      return &dev->stats;
 +      /* lockless update tx_bytes and tx_packets */
 +      u64_stats_update_begin(&priv->syncp);
 +      stats64->tx_bytes = stats->tx_bytes;
 +      stats64->tx_packets = stats->tx_packets;
 +      u64_stats_update_end(&priv->syncp);
 +
 +      do {
 +              start = u64_stats_fetch_begin_irq(&priv->syncp);
 +              stats->rx_packets = stats64->rx_packets;
 +              stats->rx_bytes = stats64->rx_bytes;
 +      } while (u64_stats_fetch_retry_irq(&priv->syncp, start));
  }
  
  static void bcm_sysport_netif_start(struct net_device *dev)
@@@ -1762,14 -1724,10 +1764,14 @@@ static void rbuf_init(struct bcm_syspor
        reg = rbuf_readl(priv, RBUF_CONTROL);
        reg |= RBUF_4B_ALGN | RBUF_RSB_EN;
        /* Set a correct RSB format on SYSTEMPORT Lite */
 -      if (priv->is_lite) {
 +      if (priv->is_lite)
                reg &= ~RBUF_RSB_SWAP1;
 +
 +      /* Set a correct RSB format based on host endian */
 +      if (!IS_ENABLED(CONFIG_CPU_BIG_ENDIAN))
                reg |= RBUF_RSB_SWAP0;
 -      }
 +      else
 +              reg &= ~RBUF_RSB_SWAP0;
        rbuf_writel(priv, reg, RBUF_CONTROL);
  }
  
@@@ -1998,7 -1956,7 +2000,7 @@@ static const struct net_device_ops bcm_
  #ifdef CONFIG_NET_POLL_CONTROLLER
        .ndo_poll_controller    = bcm_sysport_poll_controller,
  #endif
 -      .ndo_get_stats          = bcm_sysport_get_nstats,
 +      .ndo_get_stats64        = bcm_sysport_get_stats64,
  };
  
  #define REV_FMT       "v%2x.%02x"
@@@ -2146,8 -2104,6 +2148,8 @@@ static int bcm_sysport_probe(struct pla
        /* libphy will adjust the link state accordingly */
        netif_carrier_off(dev);
  
 +      u64_stats_init(&priv->syncp);
 +
        ret = register_netdev(dev);
        if (ret) {
                dev_err(&pdev->dev, "failed to register net_device\n");
index d6367c10afb56f3daa9d938b896f49e3d7593575,f20b3d2a4c2330543f64eee1334ae4e543317f9c..aacec8bc19d5fbf6fe0f007d8a6a59fe2df23c8d
@@@ -33,7 -33,6 +33,7 @@@
  #include <linux/mii.h>
  #include <linux/if.h>
  #include <linux/if_vlan.h>
 +#include <linux/if_bridge.h>
  #include <linux/rtc.h>
  #include <linux/bpf.h>
  #include <net/ip.h>
@@@ -49,8 -48,6 +49,8 @@@
  #include <linux/aer.h>
  #include <linux/bitmap.h>
  #include <linux/cpu_rmap.h>
 +#include <linux/cpumask.h>
 +#include <net/pkt_cls.h>
  
  #include "bnxt_hsi.h"
  #include "bnxt.h"
@@@ -59,8 -56,6 +59,8 @@@
  #include "bnxt_ethtool.h"
  #include "bnxt_dcb.h"
  #include "bnxt_xdp.h"
 +#include "bnxt_vfr.h"
 +#include "bnxt_tc.h"
  
  #define BNXT_TX_TIMEOUT               (5 * HZ)
  
@@@ -106,8 -101,6 +106,8 @@@ enum board_idx 
        BCM57416_NPAR,
        BCM57452,
        BCM57454,
 +      BCM58802,
 +      BCM58808,
        NETXTREME_E_VF,
        NETXTREME_C_VF,
  };
  static const struct {
        char *name;
  } board_info[] = {
 -      { "Broadcom BCM57301 NetXtreme-C 10Gb Ethernet" },
 -      { "Broadcom BCM57302 NetXtreme-C 10Gb/25Gb Ethernet" },
 -      { "Broadcom BCM57304 NetXtreme-C 10Gb/25Gb/40Gb/50Gb Ethernet" },
 -      { "Broadcom BCM57417 NetXtreme-E Ethernet Partition" },
 -      { "Broadcom BCM58700 Nitro 1Gb/2.5Gb/10Gb Ethernet" },
 -      { "Broadcom BCM57311 NetXtreme-C 10Gb Ethernet" },
 -      { "Broadcom BCM57312 NetXtreme-C 10Gb/25Gb Ethernet" },
 -      { "Broadcom BCM57402 NetXtreme-E 10Gb Ethernet" },
 -      { "Broadcom BCM57404 NetXtreme-E 10Gb/25Gb Ethernet" },
 -      { "Broadcom BCM57406 NetXtreme-E 10GBase-T Ethernet" },
 -      { "Broadcom BCM57402 NetXtreme-E Ethernet Partition" },
 -      { "Broadcom BCM57407 NetXtreme-E 10GBase-T Ethernet" },
 -      { "Broadcom BCM57412 NetXtreme-E 10Gb Ethernet" },
 -      { "Broadcom BCM57414 NetXtreme-E 10Gb/25Gb Ethernet" },
 -      { "Broadcom BCM57416 NetXtreme-E 10GBase-T Ethernet" },
 -      { "Broadcom BCM57417 NetXtreme-E 10GBase-T Ethernet" },
 -      { "Broadcom BCM57412 NetXtreme-E Ethernet Partition" },
 -      { "Broadcom BCM57314 NetXtreme-C 10Gb/25Gb/40Gb/50Gb Ethernet" },
 -      { "Broadcom BCM57417 NetXtreme-E 10Gb/25Gb Ethernet" },
 -      { "Broadcom BCM57416 NetXtreme-E 10Gb Ethernet" },
 -      { "Broadcom BCM57404 NetXtreme-E Ethernet Partition" },
 -      { "Broadcom BCM57406 NetXtreme-E Ethernet Partition" },
 -      { "Broadcom BCM57407 NetXtreme-E 25Gb Ethernet" },
 -      { "Broadcom BCM57407 NetXtreme-E Ethernet Partition" },
 -      { "Broadcom BCM57414 NetXtreme-E Ethernet Partition" },
 -      { "Broadcom BCM57416 NetXtreme-E Ethernet Partition" },
 -      { "Broadcom BCM57452 NetXtreme-E 10Gb/25Gb/40Gb/50Gb Ethernet" },
 -      { "Broadcom BCM57454 NetXtreme-E 10Gb/25Gb/40Gb/50Gb/100Gb Ethernet" },
 -      { "Broadcom NetXtreme-E Ethernet Virtual Function" },
 -      { "Broadcom NetXtreme-C Ethernet Virtual Function" },
 +      [BCM57301] = { "Broadcom BCM57301 NetXtreme-C 10Gb Ethernet" },
 +      [BCM57302] = { "Broadcom BCM57302 NetXtreme-C 10Gb/25Gb Ethernet" },
 +      [BCM57304] = { "Broadcom BCM57304 NetXtreme-C 10Gb/25Gb/40Gb/50Gb Ethernet" },
 +      [BCM57417_NPAR] = { "Broadcom BCM57417 NetXtreme-E Ethernet Partition" },
 +      [BCM58700] = { "Broadcom BCM58700 Nitro 1Gb/2.5Gb/10Gb Ethernet" },
 +      [BCM57311] = { "Broadcom BCM57311 NetXtreme-C 10Gb Ethernet" },
 +      [BCM57312] = { "Broadcom BCM57312 NetXtreme-C 10Gb/25Gb Ethernet" },
 +      [BCM57402] = { "Broadcom BCM57402 NetXtreme-E 10Gb Ethernet" },
 +      [BCM57404] = { "Broadcom BCM57404 NetXtreme-E 10Gb/25Gb Ethernet" },
 +      [BCM57406] = { "Broadcom BCM57406 NetXtreme-E 10GBase-T Ethernet" },
 +      [BCM57402_NPAR] = { "Broadcom BCM57402 NetXtreme-E Ethernet Partition" },
 +      [BCM57407] = { "Broadcom BCM57407 NetXtreme-E 10GBase-T Ethernet" },
 +      [BCM57412] = { "Broadcom BCM57412 NetXtreme-E 10Gb Ethernet" },
 +      [BCM57414] = { "Broadcom BCM57414 NetXtreme-E 10Gb/25Gb Ethernet" },
 +      [BCM57416] = { "Broadcom BCM57416 NetXtreme-E 10GBase-T Ethernet" },
 +      [BCM57417] = { "Broadcom BCM57417 NetXtreme-E 10GBase-T Ethernet" },
 +      [BCM57412_NPAR] = { "Broadcom BCM57412 NetXtreme-E Ethernet Partition" },
 +      [BCM57314] = { "Broadcom BCM57314 NetXtreme-C 10Gb/25Gb/40Gb/50Gb Ethernet" },
 +      [BCM57417_SFP] = { "Broadcom BCM57417 NetXtreme-E 10Gb/25Gb Ethernet" },
 +      [BCM57416_SFP] = { "Broadcom BCM57416 NetXtreme-E 10Gb Ethernet" },
 +      [BCM57404_NPAR] = { "Broadcom BCM57404 NetXtreme-E Ethernet Partition" },
 +      [BCM57406_NPAR] = { "Broadcom BCM57406 NetXtreme-E Ethernet Partition" },
 +      [BCM57407_SFP] = { "Broadcom BCM57407 NetXtreme-E 25Gb Ethernet" },
 +      [BCM57407_NPAR] = { "Broadcom BCM57407 NetXtreme-E Ethernet Partition" },
 +      [BCM57414_NPAR] = { "Broadcom BCM57414 NetXtreme-E Ethernet Partition" },
 +      [BCM57416_NPAR] = { "Broadcom BCM57416 NetXtreme-E Ethernet Partition" },
 +      [BCM57452] = { "Broadcom BCM57452 NetXtreme-E 10Gb/25Gb/40Gb/50Gb Ethernet" },
 +      [BCM57454] = { "Broadcom BCM57454 NetXtreme-E 10Gb/25Gb/40Gb/50Gb/100Gb Ethernet" },
 +      [BCM58802] = { "Broadcom BCM58802 NetXtreme-S 10Gb/25Gb/40Gb/50Gb Ethernet" },
 +      [BCM58808] = { "Broadcom BCM58808 NetXtreme-S 10Gb/25Gb/40Gb/50Gb/100Gb Ethernet" },
 +      [NETXTREME_E_VF] = { "Broadcom NetXtreme-E Ethernet Virtual Function" },
 +      [NETXTREME_C_VF] = { "Broadcom NetXtreme-C Ethernet Virtual Function" },
  };
  
  static const struct pci_device_id bnxt_pci_tbl[] = {
 +      { PCI_VDEVICE(BROADCOM, 0x1614), .driver_data = BCM57454 },
        { PCI_VDEVICE(BROADCOM, 0x16c0), .driver_data = BCM57417_NPAR },
        { PCI_VDEVICE(BROADCOM, 0x16c8), .driver_data = BCM57301 },
        { PCI_VDEVICE(BROADCOM, 0x16c9), .driver_data = BCM57302 },
        { PCI_VDEVICE(BROADCOM, 0x16ed), .driver_data = BCM57414_NPAR },
        { PCI_VDEVICE(BROADCOM, 0x16ee), .driver_data = BCM57416_NPAR },
        { PCI_VDEVICE(BROADCOM, 0x16ef), .driver_data = BCM57416_NPAR },
 +      { PCI_VDEVICE(BROADCOM, 0x16f0), .driver_data = BCM58808 },
        { PCI_VDEVICE(BROADCOM, 0x16f1), .driver_data = BCM57452 },
 -      { PCI_VDEVICE(BROADCOM, 0x1614), .driver_data = BCM57454 },
 +      { PCI_VDEVICE(BROADCOM, 0xd802), .driver_data = BCM58802 },
  #ifdef CONFIG_BNXT_SRIOV
        { PCI_VDEVICE(BROADCOM, 0x1606), .driver_data = NETXTREME_E_VF },
        { PCI_VDEVICE(BROADCOM, 0x1609), .driver_data = NETXTREME_E_VF },
@@@ -254,16 -243,6 +254,16 @@@ const u16 bnxt_lhint_arr[] = 
        TX_BD_FLAGS_LHINT_2048_AND_LARGER,
  };
  
 +static u16 bnxt_xmit_get_cfa_action(struct sk_buff *skb)
 +{
 +      struct metadata_dst *md_dst = skb_metadata_dst(skb);
 +
 +      if (!md_dst || md_dst->type != METADATA_HW_PORT_MUX)
 +              return 0;
 +
 +      return md_dst->u.port_info.port_id;
 +}
 +
  static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, struct net_device *dev)
  {
        struct bnxt *bp = netdev_priv(dev);
        tx_buf->nr_frags = last_frag;
  
        vlan_tag_flags = 0;
 -      cfa_action = 0;
 +      cfa_action = bnxt_xmit_get_cfa_action(skb);
        if (skb_vlan_tag_present(skb)) {
                vlan_tag_flags = TX_BD_CFA_META_KEY_VLAN |
                                 skb_vlan_tag_get(skb);
                        tx_push1->tx_bd_hsize_lflags = 0;
  
                tx_push1->tx_bd_cfa_meta = cpu_to_le32(vlan_tag_flags);
 -              tx_push1->tx_bd_cfa_action = cpu_to_le32(cfa_action);
 +              tx_push1->tx_bd_cfa_action =
 +                      cpu_to_le32(cfa_action << TX_BD_CFA_ACTION_SHIFT);
  
                end = pdata + length;
                end = PTR_ALIGN(end, 8) - 1;
@@@ -449,8 -427,7 +449,8 @@@ normal_tx
        txbd->tx_bd_len_flags_type = cpu_to_le32(flags);
  
        txbd1->tx_bd_cfa_meta = cpu_to_le32(vlan_tag_flags);
 -      txbd1->tx_bd_cfa_action = cpu_to_le32(cfa_action);
 +      txbd1->tx_bd_cfa_action =
 +                      cpu_to_le32(cfa_action << TX_BD_CFA_ACTION_SHIFT);
        for (i = 0; i < last_frag; i++) {
                skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
  
@@@ -1055,10 -1032,7 +1055,10 @@@ static void bnxt_tpa_start(struct bnxt 
                bnxt_sched_reset(bp, rxr);
                return;
        }
 -
 +      /* Store cfa_code in tpa_info to use in tpa_end
 +       * completion processing.
 +       */
 +      tpa_info->cfa_code = TPA_START_CFA_CODE(tpa_start1);
        prod_rx_buf->data = tpa_info->data;
        prod_rx_buf->data_ptr = tpa_info->data_ptr;
  
@@@ -1293,17 -1267,6 +1293,17 @@@ static inline struct sk_buff *bnxt_gro_
        return skb;
  }
  
 +/* Given the cfa_code of a received packet determine which
 + * netdev (vf-rep or PF) the packet is destined to.
 + */
 +static struct net_device *bnxt_get_pkt_dev(struct bnxt *bp, u16 cfa_code)
 +{
 +      struct net_device *dev = bnxt_get_vf_rep(bp, cfa_code);
 +
 +      /* if vf-rep dev is NULL, the packet must belong to the PF */
 +      return dev ? dev : bp->dev;
 +}
 +
  static inline struct sk_buff *bnxt_tpa_end(struct bnxt *bp,
                                           struct bnxt_napi *bnapi,
                                           u32 *raw_cons,
                        return NULL;
                }
        }
 -      skb->protocol = eth_type_trans(skb, bp->dev);
 +
 +      skb->protocol =
 +              eth_type_trans(skb, bnxt_get_pkt_dev(bp, tpa_info->cfa_code));
  
        if (tpa_info->hash_type != PKT_HASH_TYPE_NONE)
                skb_set_hash(skb, tpa_info->rss_hash, tpa_info->hash_type);
        return skb;
  }
  
 +static void bnxt_deliver_skb(struct bnxt *bp, struct bnxt_napi *bnapi,
 +                           struct sk_buff *skb)
 +{
 +      if (skb->dev != bp->dev) {
 +              /* this packet belongs to a vf-rep */
 +              bnxt_vf_rep_rx(bp, skb);
 +              return;
 +      }
 +      skb_record_rx_queue(skb, bnapi->index);
 +      napi_gro_receive(&bnapi->napi, skb);
 +}
 +
  /* returns the following:
   * 1       - 1 packet successfully received
   * 0       - successful TPA_START, packet not completed yet
@@@ -1454,7 -1403,7 +1454,7 @@@ static int bnxt_rx_pkt(struct bnxt *bp
        struct rx_cmp *rxcmp;
        struct rx_cmp_ext *rxcmp1;
        u32 tmp_raw_cons = *raw_cons;
 -      u16 cons, prod, cp_cons = RING_CMP(tmp_raw_cons);
 +      u16 cfa_code, cons, prod, cp_cons = RING_CMP(tmp_raw_cons);
        struct bnxt_sw_rx_bd *rx_buf;
        unsigned int len;
        u8 *data_ptr, agg_bufs, cmp_type;
  
                rc = -ENOMEM;
                if (likely(skb)) {
 -                      skb_record_rx_queue(skb, bnapi->index);
 -                      napi_gro_receive(&bnapi->napi, skb);
 +                      bnxt_deliver_skb(bp, bnapi, skb);
                        rc = 1;
                }
                *event |= BNXT_RX_EVENT;
                skb_set_hash(skb, le32_to_cpu(rxcmp->rx_cmp_rss_hash), type);
        }
  
 -      skb->protocol = eth_type_trans(skb, dev);
 +      cfa_code = RX_CMP_CFA_CODE(rxcmp1);
 +      skb->protocol = eth_type_trans(skb, bnxt_get_pkt_dev(bp, cfa_code));
  
        if ((rxcmp1->rx_cmp_flags2 &
             cpu_to_le32(RX_CMP_FLAGS2_META_FORMAT_VLAN)) &&
                }
        }
  
 -      skb_record_rx_queue(skb, bnapi->index);
 -      napi_gro_receive(&bnapi->napi, skb);
 +      bnxt_deliver_skb(bp, bnapi, skb);
        rc = 1;
  
  next_rx:
@@@ -1852,13 -1802,6 +1852,13 @@@ static int bnxt_poll_work(struct bnxt *
                                                           &event);
                        if (likely(rc >= 0))
                                rx_pkts += rc;
 +                      /* Increment rx_pkts when rc is -ENOMEM to count towards
 +                       * the NAPI budget.  Otherwise, we may potentially loop
 +                       * here forever if we consistently cannot allocate
 +                       * buffers.
 +                       */
 +                      else if (rc == -ENOMEM)
 +                              rx_pkts++;
                        else if (rc == -EBUSY)  /* partial completion */
                                break;
                } else if (unlikely((TX_CMP_TYPE(txcmp) ==
@@@ -4477,33 -4420,9 +4477,33 @@@ static int bnxt_hwrm_reserve_tx_rings(s
        mutex_lock(&bp->hwrm_cmd_lock);
        rc = __bnxt_hwrm_get_tx_rings(bp, 0xffff, tx_rings);
        mutex_unlock(&bp->hwrm_cmd_lock);
 +      if (!rc)
 +              bp->tx_reserved_rings = *tx_rings;
        return rc;
  }
  
 +static int bnxt_hwrm_check_tx_rings(struct bnxt *bp, int tx_rings)
 +{
 +      struct hwrm_func_cfg_input req = {0};
 +      int rc;
 +
 +      if (bp->hwrm_spec_code < 0x10801)
 +              return 0;
 +
 +      if (BNXT_VF(bp))
 +              return 0;
 +
 +      bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_CFG, -1, -1);
 +      req.fid = cpu_to_le16(0xffff);
 +      req.flags = cpu_to_le32(FUNC_CFG_REQ_FLAGS_TX_ASSETS_TEST);
 +      req.enables = cpu_to_le32(FUNC_CFG_REQ_ENABLES_NUM_TX_RINGS);
 +      req.num_tx_rings = cpu_to_le16(tx_rings);
 +      rc = hwrm_send_message_silent(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
 +      if (rc)
 +              return -ENOMEM;
 +      return 0;
 +}
 +
  static void bnxt_hwrm_set_coal_params(struct bnxt *bp, u32 max_bufs,
        u32 buf_tmrs, u16 flags,
        struct hwrm_ring_cmpl_ring_cfg_aggint_params_input *req)
@@@ -4658,7 -4577,6 +4658,7 @@@ static int bnxt_hwrm_func_qcfg(struct b
  {
        struct hwrm_func_qcfg_input req = {0};
        struct hwrm_func_qcfg_output *resp = bp->hwrm_cmd_resp_addr;
 +      u16 flags;
        int rc;
  
        bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_QCFG, -1, -1);
                vf->vlan = le16_to_cpu(resp->vlan) & VLAN_VID_MASK;
        }
  #endif
 -      if (BNXT_PF(bp)) {
 -              u16 flags = le16_to_cpu(resp->flags);
 -
 -              if (flags & (FUNC_QCFG_RESP_FLAGS_FW_DCBX_AGENT_ENABLED |
 -                           FUNC_QCFG_RESP_FLAGS_FW_LLDP_AGENT_ENABLED))
 -                      bp->flags |= BNXT_FLAG_FW_LLDP_AGENT;
 -              if (flags & FUNC_QCFG_RESP_FLAGS_MULTI_HOST)
 -                      bp->flags |= BNXT_FLAG_MULTI_HOST;
 +      flags = le16_to_cpu(resp->flags);
 +      if (flags & (FUNC_QCFG_RESP_FLAGS_FW_DCBX_AGENT_ENABLED |
 +                   FUNC_QCFG_RESP_FLAGS_FW_LLDP_AGENT_ENABLED)) {
 +              bp->flags |= BNXT_FLAG_FW_LLDP_AGENT;
 +              if (flags & FUNC_QCFG_RESP_FLAGS_FW_DCBX_AGENT_ENABLED)
 +                      bp->flags |= BNXT_FLAG_FW_DCBX_AGENT;
        }
 +      if (BNXT_PF(bp) && (flags & FUNC_QCFG_RESP_FLAGS_MULTI_HOST))
 +              bp->flags |= BNXT_FLAG_MULTI_HOST;
  
        switch (resp->port_partition_type) {
        case FUNC_QCFG_RESP_PORT_PARTITION_TYPE_NPAR1_0:
                bp->port_partition_type = resp->port_partition_type;
                break;
        }
 +      if (bp->hwrm_spec_code < 0x10707 ||
 +          resp->evb_mode == FUNC_QCFG_RESP_EVB_MODE_VEB)
 +              bp->br_mode = BRIDGE_MODE_VEB;
 +      else if (resp->evb_mode == FUNC_QCFG_RESP_EVB_MODE_VEPA)
 +              bp->br_mode = BRIDGE_MODE_VEPA;
 +      else
 +              bp->br_mode = BRIDGE_MODE_UNDEF;
  
  func_qcfg_exit:
        mutex_unlock(&bp->hwrm_cmd_lock);
@@@ -4736,7 -4647,6 +4736,6 @@@ static int bnxt_hwrm_func_qcaps(struct 
                pf->port_id = le16_to_cpu(resp->port_id);
                bp->dev->dev_port = pf->port_id;
                memcpy(pf->mac_addr, resp->mac_address, ETH_ALEN);
-               memcpy(bp->dev->dev_addr, pf->mac_addr, ETH_ALEN);
                pf->max_rsscos_ctxs = le16_to_cpu(resp->max_rsscos_ctx);
                pf->max_cp_rings = le16_to_cpu(resp->max_cmpl_rings);
                pf->max_tx_rings = le16_to_cpu(resp->max_tx_rings);
                vf->max_stat_ctxs = le16_to_cpu(resp->max_stat_ctx);
  
                memcpy(vf->mac_addr, resp->mac_address, ETH_ALEN);
-               mutex_unlock(&bp->hwrm_cmd_lock);
-               if (is_valid_ether_addr(vf->mac_addr)) {
-                       /* overwrite netdev dev_adr with admin VF MAC */
-                       memcpy(bp->dev->dev_addr, vf->mac_addr, ETH_ALEN);
-               } else {
-                       eth_hw_addr_random(bp->dev);
-                       rc = bnxt_approve_mac(bp, bp->dev->dev_addr);
-               }
-               return rc;
  #endif
        }
  
@@@ -5000,26 -4900,6 +4989,26 @@@ static void bnxt_hwrm_resource_free(str
        }
  }
  
 +static int bnxt_hwrm_set_br_mode(struct bnxt *bp, u16 br_mode)
 +{
 +      struct hwrm_func_cfg_input req = {0};
 +      int rc;
 +
 +      bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_CFG, -1, -1);
 +      req.fid = cpu_to_le16(0xffff);
 +      req.enables = cpu_to_le32(FUNC_CFG_REQ_ENABLES_EVB_MODE);
 +      if (br_mode == BRIDGE_MODE_VEB)
 +              req.evb_mode = FUNC_CFG_REQ_EVB_MODE_VEB;
 +      else if (br_mode == BRIDGE_MODE_VEPA)
 +              req.evb_mode = FUNC_CFG_REQ_EVB_MODE_VEPA;
 +      else
 +              return -EINVAL;
 +      rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
 +      if (rc)
 +              rc = -EIO;
 +      return rc;
 +}
 +
  static int bnxt_setup_vnic(struct bnxt *bp, u16 vnic_id)
  {
        struct bnxt_vnic_info *vnic = &bp->vnic_info[vnic_id];
@@@ -5155,15 -5035,6 +5144,15 @@@ static int bnxt_init_chip(struct bnxt *
                                   rc);
                        goto err_out;
                }
 +              if (bp->tx_reserved_rings != bp->tx_nr_rings) {
 +                      int tx = bp->tx_nr_rings;
 +
 +                      if (bnxt_hwrm_reserve_tx_rings(bp, &tx) ||
 +                          tx < bp->tx_nr_rings) {
 +                              rc = -ENOMEM;
 +                              goto err_out;
 +                      }
 +              }
        }
  
        rc = bnxt_hwrm_ring_alloc(bp);
@@@ -5570,15 -5441,8 +5559,15 @@@ static void bnxt_free_irq(struct bnxt *
  
        for (i = 0; i < bp->cp_nr_rings; i++) {
                irq = &bp->irq_tbl[i];
 -              if (irq->requested)
 +              if (irq->requested) {
 +                      if (irq->have_cpumask) {
 +                              irq_set_affinity_hint(irq->vector, NULL);
 +                              free_cpumask_var(irq->cpu_mask);
 +                              irq->have_cpumask = 0;
 +                      }
                        free_irq(irq->vector, bp->bnapi[i]);
 +              }
 +
                irq->requested = 0;
        }
  }
@@@ -5611,21 -5475,6 +5600,21 @@@ static int bnxt_request_irq(struct bnx
                        break;
  
                irq->requested = 1;
 +
 +              if (zalloc_cpumask_var(&irq->cpu_mask, GFP_KERNEL)) {
 +                      int numa_node = dev_to_node(&bp->pdev->dev);
 +
 +                      irq->have_cpumask = 1;
 +                      cpumask_set_cpu(cpumask_local_spread(i, numa_node),
 +                                      irq->cpu_mask);
 +                      rc = irq_set_affinity_hint(irq->vector, irq->cpu_mask);
 +                      if (rc) {
 +                              netdev_warn(bp->dev,
 +                                          "Set affinity failed, IRQ = %d\n",
 +                                          irq->vector);
 +                              break;
 +                      }
 +              }
        }
        return rc;
  }
@@@ -5699,10 -5548,12 +5688,10 @@@ void bnxt_tx_disable(struct bnxt *bp
  {
        int i;
        struct bnxt_tx_ring_info *txr;
 -      struct netdev_queue *txq;
  
        if (bp->tx_ring) {
                for (i = 0; i < bp->tx_nr_rings; i++) {
                        txr = &bp->tx_ring[i];
 -                      txq = netdev_get_tx_queue(bp->dev, i);
                        txr->dev_state = BNXT_DEV_STATE_CLOSING;
                }
        }
@@@ -5715,9 -5566,11 +5704,9 @@@ void bnxt_tx_enable(struct bnxt *bp
  {
        int i;
        struct bnxt_tx_ring_info *txr;
 -      struct netdev_queue *txq;
  
        for (i = 0; i < bp->tx_nr_rings; i++) {
                txr = &bp->tx_ring[i];
 -              txq = netdev_get_tx_queue(bp->dev, i);
                txr->dev_state = 0;
        }
        netif_tx_wake_all_queues(bp->dev);
@@@ -5782,7 -5635,7 +5771,7 @@@ static int bnxt_hwrm_phy_qcaps(struct b
        if (rc)
                goto hwrm_phy_qcaps_exit;
  
 -      if (resp->eee_supported & PORT_PHY_QCAPS_RESP_EEE_SUPPORTED) {
 +      if (resp->flags & PORT_PHY_QCAPS_RESP_FLAGS_EEE_SUPPORTED) {
                struct ethtool_eee *eee = &bp->eee;
                u16 fw_speeds = le16_to_cpu(resp->supported_speeds_eee_mode);
  
                link_info->support_auto_speeds =
                        le16_to_cpu(resp->supported_speeds_auto_mode);
  
 +      bp->port_count = resp->port_cnt;
 +
  hwrm_phy_qcaps_exit:
        mutex_unlock(&bp->hwrm_cmd_lock);
        return rc;
@@@ -5824,15 -5675,13 +5813,15 @@@ static int bnxt_update_link(struct bnx
  
        memcpy(&link_info->phy_qcfg_resp, resp, sizeof(*resp));
        link_info->phy_link_status = resp->link;
 -      link_info->duplex =  resp->duplex;
 +      link_info->duplex = resp->duplex_cfg;
 +      if (bp->hwrm_spec_code >= 0x10800)
 +              link_info->duplex = resp->duplex_state;
        link_info->pause = resp->pause;
        link_info->auto_mode = resp->auto_mode;
        link_info->auto_pause_setting = resp->auto_pause;
        link_info->lp_pause = resp->link_partner_adv_pause;
        link_info->force_pause_setting = resp->force_pause;
 -      link_info->duplex_setting = resp->duplex;
 +      link_info->duplex_setting = resp->duplex_cfg;
        if (link_info->phy_link_status == BNXT_LINK_LINK)
                link_info->link_speed = le16_to_cpu(resp->link_speed);
        else
@@@ -6354,9 -6203,6 +6343,9 @@@ static int __bnxt_open_nic(struct bnxt 
        /* Poll link status and check for SFP+ module status */
        bnxt_get_port_module_status(bp);
  
 +      /* VF-reps may need to be re-opened after the PF is re-opened */
 +      if (BNXT_PF(bp))
 +              bnxt_vf_reps_open(bp);
        return 0;
  
  open_err:
@@@ -6445,10 -6291,6 +6434,10 @@@ int bnxt_close_nic(struct bnxt *bp, boo
                if (rc)
                        netdev_warn(bp->dev, "timeout waiting for SRIOV config operation to complete!\n");
        }
 +
 +      /* Close the VF-reps before closing PF */
 +      if (BNXT_PF(bp))
 +              bnxt_vf_reps_close(bp);
  #endif
        /* Change device state to avoid TX queue wake up's */
        bnxt_tx_disable(bp);
@@@ -6960,8 -6802,7 +6949,8 @@@ static void bnxt_timer(unsigned long da
        if (atomic_read(&bp->intr_sem) != 0)
                goto bnxt_restart_timer;
  
 -      if (bp->link_info.link_up && (bp->flags & BNXT_FLAG_PORT_STATS)) {
 +      if (bp->link_info.link_up && (bp->flags & BNXT_FLAG_PORT_STATS) &&
 +          bp->stats_coal_ticks) {
                set_bit(BNXT_PERIODIC_STATS_SP_EVENT, &bp->sp_event);
                schedule_work(&bp->sp_task);
        }
@@@ -7071,8 -6912,8 +7060,8 @@@ static void bnxt_sp_task(struct work_st
  }
  
  /* Under rtnl_lock */
 -int bnxt_reserve_rings(struct bnxt *bp, int tx, int rx, bool sh, int tcs,
 -                     int tx_xdp)
 +int bnxt_check_rings(struct bnxt *bp, int tx, int rx, bool sh, int tcs,
 +                   int tx_xdp)
  {
        int max_rx, max_tx, tx_sets = 1;
        int tx_rings_needed;
        if (max_tx < tx_rings_needed)
                return -ENOMEM;
  
 -      if (bnxt_hwrm_reserve_tx_rings(bp, &tx_rings_needed) ||
 -          tx_rings_needed < (tx * tx_sets + tx_xdp))
 -              return -ENOMEM;
 -      return 0;
 +      return bnxt_hwrm_check_tx_rings(bp, tx_rings_needed);
  }
  
  static void bnxt_unmap_bars(struct bnxt *bp, struct pci_dev *pdev)
@@@ -7281,8 -7125,8 +7270,8 @@@ int bnxt_setup_mq_tc(struct net_device 
        if (bp->flags & BNXT_FLAG_SHARED_RINGS)
                sh = true;
  
 -      rc = bnxt_reserve_rings(bp, bp->tx_nr_rings_per_tc, bp->rx_nr_rings,
 -                              sh, tc, bp->tx_nr_rings_xdp);
 +      rc = bnxt_check_rings(bp, bp->tx_nr_rings_per_tc, bp->rx_nr_rings,
 +                            sh, tc, bp->tx_nr_rings_xdp);
        if (rc)
                return rc;
  
                bp->tx_nr_rings = bp->tx_nr_rings_per_tc;
                netdev_reset_tc(dev);
        }
+       bp->tx_nr_rings += bp->tx_nr_rings_xdp;
        bp->cp_nr_rings = sh ? max_t(int, bp->tx_nr_rings, bp->rx_nr_rings) :
                               bp->tx_nr_rings + bp->rx_nr_rings;
        bp->num_stat_ctxs = bp->cp_nr_rings;
        return 0;
  }
  
 -static int bnxt_setup_tc(struct net_device *dev, u32 handle, u32 chain_index,
 -                       __be16 proto, struct tc_to_netdev *ntc)
 +static int bnxt_setup_flower(struct net_device *dev,
 +                           struct tc_cls_flower_offload *cls_flower)
  {
 -      if (ntc->type != TC_SETUP_MQPRIO)
 -              return -EINVAL;
 +      struct bnxt *bp = netdev_priv(dev);
  
 -      ntc->mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS;
 +      if (BNXT_VF(bp))
 +              return -EOPNOTSUPP;
  
 -      return bnxt_setup_mq_tc(dev, ntc->mqprio->num_tc);
 +      return bnxt_tc_setup_flower(bp, bp->pf.fw_fid, cls_flower);
 +}
 +
 +static int bnxt_setup_tc(struct net_device *dev, enum tc_setup_type type,
 +                       void *type_data)
 +{
 +      switch (type) {
 +      case TC_SETUP_CLSFLOWER:
 +              return bnxt_setup_flower(dev, type_data);
 +      case TC_SETUP_MQPRIO: {
 +              struct tc_mqprio_qopt *mqprio = type_data;
 +
 +              mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS;
 +
 +              return bnxt_setup_mq_tc(dev, mqprio->num_tc);
 +      }
 +      default:
 +              return -EOPNOTSUPP;
 +      }
  }
  
  #ifdef CONFIG_RFS_ACCEL
@@@ -7585,102 -7412,6 +7575,102 @@@ static void bnxt_udp_tunnel_del(struct 
        schedule_work(&bp->sp_task);
  }
  
 +static int bnxt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
 +                             struct net_device *dev, u32 filter_mask,
 +                             int nlflags)
 +{
 +      struct bnxt *bp = netdev_priv(dev);
 +
 +      return ndo_dflt_bridge_getlink(skb, pid, seq, dev, bp->br_mode, 0, 0,
 +                                     nlflags, filter_mask, NULL);
 +}
 +
 +static int bnxt_bridge_setlink(struct net_device *dev, struct nlmsghdr *nlh,
 +                             u16 flags)
 +{
 +      struct bnxt *bp = netdev_priv(dev);
 +      struct nlattr *attr, *br_spec;
 +      int rem, rc = 0;
 +
 +      if (bp->hwrm_spec_code < 0x10708 || !BNXT_SINGLE_PF(bp))
 +              return -EOPNOTSUPP;
 +
 +      br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC);
 +      if (!br_spec)
 +              return -EINVAL;
 +
 +      nla_for_each_nested(attr, br_spec, rem) {
 +              u16 mode;
 +
 +              if (nla_type(attr) != IFLA_BRIDGE_MODE)
 +                      continue;
 +
 +              if (nla_len(attr) < sizeof(mode))
 +                      return -EINVAL;
 +
 +              mode = nla_get_u16(attr);
 +              if (mode == bp->br_mode)
 +                      break;
 +
 +              rc = bnxt_hwrm_set_br_mode(bp, mode);
 +              if (!rc)
 +                      bp->br_mode = mode;
 +              break;
 +      }
 +      return rc;
 +}
 +
 +static int bnxt_get_phys_port_name(struct net_device *dev, char *buf,
 +                                 size_t len)
 +{
 +      struct bnxt *bp = netdev_priv(dev);
 +      int rc;
 +
 +      /* The PF and its VF-reps only support the switchdev framework */
 +      if (!BNXT_PF(bp))
 +              return -EOPNOTSUPP;
 +
 +      rc = snprintf(buf, len, "p%d", bp->pf.port_id);
 +
 +      if (rc >= len)
 +              return -EOPNOTSUPP;
 +      return 0;
 +}
 +
 +int bnxt_port_attr_get(struct bnxt *bp, struct switchdev_attr *attr)
 +{
 +      if (bp->eswitch_mode != DEVLINK_ESWITCH_MODE_SWITCHDEV)
 +              return -EOPNOTSUPP;
 +
 +      /* The PF and its VF-reps only support the switchdev framework */
 +      if (!BNXT_PF(bp))
 +              return -EOPNOTSUPP;
 +
 +      switch (attr->id) {
 +      case SWITCHDEV_ATTR_ID_PORT_PARENT_ID:
 +              /* In SRIOV each PF-pool (PF + child VFs) serves as a
 +               * switching domain, the PF's perm mac-addr can be used
 +               * as the unique parent-id
 +               */
 +              attr->u.ppid.id_len = ETH_ALEN;
 +              ether_addr_copy(attr->u.ppid.id, bp->pf.mac_addr);
 +              break;
 +      default:
 +              return -EOPNOTSUPP;
 +      }
 +      return 0;
 +}
 +
 +static int bnxt_swdev_port_attr_get(struct net_device *dev,
 +                                  struct switchdev_attr *attr)
 +{
 +      return bnxt_port_attr_get(netdev_priv(dev), attr);
 +}
 +
 +static const struct switchdev_ops bnxt_switchdev_ops = {
 +      .switchdev_port_attr_get        = bnxt_swdev_port_attr_get
 +};
 +
  static const struct net_device_ops bnxt_netdev_ops = {
        .ndo_open               = bnxt_open,
        .ndo_start_xmit         = bnxt_start_xmit,
        .ndo_udp_tunnel_add     = bnxt_udp_tunnel_add,
        .ndo_udp_tunnel_del     = bnxt_udp_tunnel_del,
        .ndo_xdp                = bnxt_xdp,
 +      .ndo_bridge_getlink     = bnxt_bridge_getlink,
 +      .ndo_bridge_setlink     = bnxt_bridge_setlink,
 +      .ndo_get_phys_port_name = bnxt_get_phys_port_name
  };
  
  static void bnxt_remove_one(struct pci_dev *pdev)
        struct net_device *dev = pci_get_drvdata(pdev);
        struct bnxt *bp = netdev_priv(dev);
  
 -      if (BNXT_PF(bp))
 +      if (BNXT_PF(bp)) {
                bnxt_sriov_disable(bp);
 +              bnxt_dl_unregister(bp);
 +      }
  
        pci_disable_pcie_error_reporting(pdev);
        unregister_netdev(dev);
 +      bnxt_shutdown_tc(bp);
        cancel_work_sync(&bp->sp_task);
        bp->sp_event = 0;
  
@@@ -7898,9 -7623,6 +7888,9 @@@ static int bnxt_set_dflt_rings(struct b
        if (sh)
                bp->flags |= BNXT_FLAG_SHARED_RINGS;
        dflt_rings = netif_get_num_default_rss_queues();
 +      /* Reduce default rings to reduce memory usage on multi-port cards */
 +      if (bp->port_count > 1)
 +              dflt_rings = min_t(int, dflt_rings, 4);
        rc = bnxt_get_dflt_rings(bp, &max_rx_rings, &max_tx_rings, sh);
        if (rc)
                return rc;
@@@ -7929,6 -7651,28 +7919,28 @@@ void bnxt_restore_pf_fw_resources(struc
        bnxt_subtract_ulp_resources(bp, BNXT_ROCE_ULP);
  }
  
+ static int bnxt_init_mac_addr(struct bnxt *bp)
+ {
+       int rc = 0;
+       if (BNXT_PF(bp)) {
+               memcpy(bp->dev->dev_addr, bp->pf.mac_addr, ETH_ALEN);
+       } else {
+ #ifdef CONFIG_BNXT_SRIOV
+               struct bnxt_vf_info *vf = &bp->vf;
+               if (is_valid_ether_addr(vf->mac_addr)) {
+                       /* overwrite netdev dev_addr with admin VF MAC */
+                       memcpy(bp->dev->dev_addr, vf->mac_addr, ETH_ALEN);
+               } else {
+                       eth_hw_addr_random(bp->dev);
+                       rc = bnxt_approve_mac(bp, bp->dev->dev_addr);
+               }
+ #endif
+       }
+       return rc;
+ }
  static void bnxt_parse_log_pcie_link(struct bnxt *bp)
  {
        enum pcie_link_width width = PCIE_LNK_WIDTH_UNKNOWN;
@@@ -7978,7 -7722,6 +7990,7 @@@ static int bnxt_init_one(struct pci_de
        dev->netdev_ops = &bnxt_netdev_ops;
        dev->watchdog_timeo = BNXT_TX_TIMEOUT;
        dev->ethtool_ops = &bnxt_ethtool_ops;
 +      SWITCHDEV_SET_OPS(dev, &bnxt_switchdev_ops);
        pci_set_drvdata(pdev, dev);
  
        rc = bnxt_alloc_hwrm_resources(bp);
  
  #ifdef CONFIG_BNXT_SRIOV
        init_waitqueue_head(&bp->sriov_cfg_wait);
 +      mutex_init(&bp->sriov_lock);
  #endif
        bp->gro_func = bnxt_gro_func_5730x;
        if (BNXT_CHIP_P4_PLUS(bp))
                rc = -1;
                goto init_err_pci_clean;
        }
+       rc = bnxt_init_mac_addr(bp);
+       if (rc) {
+               dev_err(&pdev->dev, "Unable to initialize mac address.\n");
+               rc = -EADDRNOTAVAIL;
+               goto init_err_pci_clean;
+       }
        rc = bnxt_hwrm_queue_qportcfg(bp);
        if (rc) {
                netdev_err(bp->dev, "hwrm query qportcfg failure rc: %x\n",
        bnxt_ethtool_init(bp);
        bnxt_dcb_init(bp);
  
 +      rc = bnxt_probe_phy(bp);
 +      if (rc)
 +              goto init_err_pci_clean;
 +
        bnxt_set_rx_skb_mode(bp, false);
        bnxt_set_tpa_flags(bp);
        bnxt_set_ring_params(bp);
        if (dev->hw_features & NETIF_F_HW_VLAN_CTAG_RX)
                bp->flags |= BNXT_FLAG_STRIP_VLAN;
  
 -      rc = bnxt_probe_phy(bp);
 -      if (rc)
 -              goto init_err_pci_clean;
 -
        rc = bnxt_init_int_mode(bp);
        if (rc)
                goto init_err_pci_clean;
        else
                device_set_wakeup_capable(&pdev->dev, false);
  
 +      if (BNXT_PF(bp))
 +              bnxt_init_tc(bp);
 +
        rc = register_netdev(dev);
        if (rc)
 -              goto init_err_clr_int;
 +              goto init_err_cleanup_tc;
 +
 +      if (BNXT_PF(bp))
 +              bnxt_dl_register(bp);
  
        netdev_info(dev, "%s found at mem %lx, node addr %pM\n",
                    board_info[ent->driver_data].name,
  
        return 0;
  
 -init_err_clr_int:
 +init_err_cleanup_tc:
 +      bnxt_shutdown_tc(bp);
        bnxt_clear_int_mode(bp);
  
  init_err_pci_clean:
index 612d1ef3b5f5ebe622ae1b74b74697d8d8a2c001,fea3f9a5fb2d37221cbf9abed77cdcdcf906a00c..9cebca89691380fd34aacb9c7dbb341c80b68229
  #define GENET_RDMA_REG_OFF    (priv->hw_params->rdma_offset + \
                                TOTAL_DESC * DMA_DESC_SIZE)
  
 +static inline void bcmgenet_writel(u32 value, void __iomem *offset)
 +{
 +      /* MIPS chips strapped for BE will automagically configure the
 +       * peripheral registers for CPU-native byte order.
 +       */
 +      if (IS_ENABLED(CONFIG_MIPS) && IS_ENABLED(CONFIG_CPU_BIG_ENDIAN))
 +              __raw_writel(value, offset);
 +      else
 +              writel_relaxed(value, offset);
 +}
 +
 +static inline u32 bcmgenet_readl(void __iomem *offset)
 +{
 +      if (IS_ENABLED(CONFIG_MIPS) && IS_ENABLED(CONFIG_CPU_BIG_ENDIAN))
 +              return __raw_readl(offset);
 +      else
 +              return readl_relaxed(offset);
 +}
 +
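For illustration only, a minimal userspace sketch of the same compile-time byte-order dispatch used by bcmgenet_writel()/bcmgenet_readl(); raw_write32() and le_write32() are hypothetical stand-ins for __raw_writel()/writel_relaxed(), not kernel helpers:

#include <stdint.h>
#include <stdio.h>

/* Stand-ins for the kernel MMIO accessors; illustration only. */
static void raw_write32(uint32_t val) { printf("CPU-native write %#x\n", (unsigned)val); }
static void le_write32(uint32_t val)  { printf("little-endian write %#x\n", (unsigned)val); }

/* Models the IS_ENABLED(CONFIG_MIPS) && IS_ENABLED(CONFIG_CPU_BIG_ENDIAN)
 * test: big-endian MIPS presents these registers in CPU byte order, so no
 * byte swap is wanted there.
 */
#if defined(__mips__) && defined(__MIPSEB__)
#define NATIVE_ORDER_REGS 1
#else
#define NATIVE_ORDER_REGS 0
#endif

static void genet_write32(uint32_t val)
{
	if (NATIVE_ORDER_REGS)
		raw_write32(val);
	else
		le_write32(val);
}

int main(void)
{
	genet_write32(0x12345678);
	return 0;
}
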
  static inline void dmadesc_set_length_status(struct bcmgenet_priv *priv,
                                             void __iomem *d, u32 value)
  {
 -      __raw_writel(value, d + DMA_DESC_LENGTH_STATUS);
 +      bcmgenet_writel(value, d + DMA_DESC_LENGTH_STATUS);
  }
  
  static inline u32 dmadesc_get_length_status(struct bcmgenet_priv *priv,
                                            void __iomem *d)
  {
 -      return __raw_readl(d + DMA_DESC_LENGTH_STATUS);
 +      return bcmgenet_readl(d + DMA_DESC_LENGTH_STATUS);
  }
  
  static inline void dmadesc_set_addr(struct bcmgenet_priv *priv,
                                    void __iomem *d,
                                    dma_addr_t addr)
  {
 -      __raw_writel(lower_32_bits(addr), d + DMA_DESC_ADDRESS_LO);
 +      bcmgenet_writel(lower_32_bits(addr), d + DMA_DESC_ADDRESS_LO);
  
        /* Register writes to GISB bus can take a couple hundred nanoseconds
         * and are done for each packet, save these expensive writes unless
         */
  #ifdef CONFIG_PHYS_ADDR_T_64BIT
        if (priv->hw_params->flags & GENET_HAS_40BITS)
 -              __raw_writel(upper_32_bits(addr), d + DMA_DESC_ADDRESS_HI);
 +              bcmgenet_writel(upper_32_bits(addr), d + DMA_DESC_ADDRESS_HI);
  #endif
  }
  
@@@ -132,7 -113,7 +132,7 @@@ static inline dma_addr_t dmadesc_get_ad
  {
        dma_addr_t addr;
  
 -      addr = __raw_readl(d + DMA_DESC_ADDRESS_LO);
 +      addr = bcmgenet_readl(d + DMA_DESC_ADDRESS_LO);
  
        /* Register writes to GISB bus can take a couple hundred nanoseconds
         * and are done for each packet, save these expensive writes unless
         */
  #ifdef CONFIG_PHYS_ADDR_T_64BIT
        if (priv->hw_params->flags & GENET_HAS_40BITS)
 -              addr |= (u64)__raw_readl(d + DMA_DESC_ADDRESS_HI) << 32;
 +              addr |= (u64)bcmgenet_readl(d + DMA_DESC_ADDRESS_HI) << 32;
  #endif
        return addr;
  }
@@@ -175,8 -156,8 +175,8 @@@ static inline u32 bcmgenet_tbuf_ctrl_ge
        if (GENET_IS_V1(priv))
                return bcmgenet_rbuf_readl(priv, TBUF_CTRL_V1);
        else
 -              return __raw_readl(priv->base +
 -                              priv->hw_params->tbuf_offset + TBUF_CTRL);
 +              return bcmgenet_readl(priv->base +
 +                                    priv->hw_params->tbuf_offset + TBUF_CTRL);
  }
  
  static inline void bcmgenet_tbuf_ctrl_set(struct bcmgenet_priv *priv, u32 val)
        if (GENET_IS_V1(priv))
                bcmgenet_rbuf_writel(priv, val, TBUF_CTRL_V1);
        else
 -              __raw_writel(val, priv->base +
 +              bcmgenet_writel(val, priv->base +
                                priv->hw_params->tbuf_offset + TBUF_CTRL);
  }
  
@@@ -193,8 -174,8 +193,8 @@@ static inline u32 bcmgenet_bp_mc_get(st
        if (GENET_IS_V1(priv))
                return bcmgenet_rbuf_readl(priv, TBUF_BP_MC_V1);
        else
 -              return __raw_readl(priv->base +
 -                              priv->hw_params->tbuf_offset + TBUF_BP_MC);
 +              return bcmgenet_readl(priv->base +
 +                                    priv->hw_params->tbuf_offset + TBUF_BP_MC);
  }
  
  static inline void bcmgenet_bp_mc_set(struct bcmgenet_priv *priv, u32 val)
        if (GENET_IS_V1(priv))
                bcmgenet_rbuf_writel(priv, val, TBUF_BP_MC_V1);
        else
 -              __raw_writel(val, priv->base +
 +              bcmgenet_writel(val, priv->base +
                                priv->hw_params->tbuf_offset + TBUF_BP_MC);
  }
  
@@@ -345,28 -326,28 +345,28 @@@ static inline struct bcmgenet_priv *dev
  static inline u32 bcmgenet_tdma_readl(struct bcmgenet_priv *priv,
                                      enum dma_reg r)
  {
 -      return __raw_readl(priv->base + GENET_TDMA_REG_OFF +
 -                      DMA_RINGS_SIZE + bcmgenet_dma_regs[r]);
 +      return bcmgenet_readl(priv->base + GENET_TDMA_REG_OFF +
 +                            DMA_RINGS_SIZE + bcmgenet_dma_regs[r]);
  }
  
  static inline void bcmgenet_tdma_writel(struct bcmgenet_priv *priv,
                                        u32 val, enum dma_reg r)
  {
 -      __raw_writel(val, priv->base + GENET_TDMA_REG_OFF +
 +      bcmgenet_writel(val, priv->base + GENET_TDMA_REG_OFF +
                        DMA_RINGS_SIZE + bcmgenet_dma_regs[r]);
  }
  
  static inline u32 bcmgenet_rdma_readl(struct bcmgenet_priv *priv,
                                      enum dma_reg r)
  {
 -      return __raw_readl(priv->base + GENET_RDMA_REG_OFF +
 -                      DMA_RINGS_SIZE + bcmgenet_dma_regs[r]);
 +      return bcmgenet_readl(priv->base + GENET_RDMA_REG_OFF +
 +                            DMA_RINGS_SIZE + bcmgenet_dma_regs[r]);
  }
  
  static inline void bcmgenet_rdma_writel(struct bcmgenet_priv *priv,
                                        u32 val, enum dma_reg r)
  {
 -      __raw_writel(val, priv->base + GENET_RDMA_REG_OFF +
 +      bcmgenet_writel(val, priv->base + GENET_RDMA_REG_OFF +
                        DMA_RINGS_SIZE + bcmgenet_dma_regs[r]);
  }
  
@@@ -437,16 -418,16 +437,16 @@@ static inline u32 bcmgenet_tdma_ring_re
                                           unsigned int ring,
                                           enum dma_ring_reg r)
  {
 -      return __raw_readl(priv->base + GENET_TDMA_REG_OFF +
 -                      (DMA_RING_SIZE * ring) +
 -                      genet_dma_ring_regs[r]);
 +      return bcmgenet_readl(priv->base + GENET_TDMA_REG_OFF +
 +                            (DMA_RING_SIZE * ring) +
 +                            genet_dma_ring_regs[r]);
  }
  
  static inline void bcmgenet_tdma_ring_writel(struct bcmgenet_priv *priv,
                                             unsigned int ring, u32 val,
                                             enum dma_ring_reg r)
  {
 -      __raw_writel(val, priv->base + GENET_TDMA_REG_OFF +
 +      bcmgenet_writel(val, priv->base + GENET_TDMA_REG_OFF +
                        (DMA_RING_SIZE * ring) +
                        genet_dma_ring_regs[r]);
  }
@@@ -455,16 -436,16 +455,16 @@@ static inline u32 bcmgenet_rdma_ring_re
                                           unsigned int ring,
                                           enum dma_ring_reg r)
  {
 -      return __raw_readl(priv->base + GENET_RDMA_REG_OFF +
 -                      (DMA_RING_SIZE * ring) +
 -                      genet_dma_ring_regs[r]);
 +      return bcmgenet_readl(priv->base + GENET_RDMA_REG_OFF +
 +                            (DMA_RING_SIZE * ring) +
 +                            genet_dma_ring_regs[r]);
  }
  
  static inline void bcmgenet_rdma_ring_writel(struct bcmgenet_priv *priv,
                                             unsigned int ring, u32 val,
                                             enum dma_ring_reg r)
  {
 -      __raw_writel(val, priv->base + GENET_RDMA_REG_OFF +
 +      bcmgenet_writel(val, priv->base + GENET_RDMA_REG_OFF +
                        (DMA_RING_SIZE * ring) +
                        genet_dma_ring_regs[r]);
  }
@@@ -1010,12 -991,12 +1010,12 @@@ static void bcmgenet_eee_enable_set(str
        bcmgenet_umac_writel(priv, reg, UMAC_EEE_CTRL);
  
        /* Enable EEE and switch to a 27Mhz clock automatically */
 -      reg = __raw_readl(priv->base + off);
 +      reg = bcmgenet_readl(priv->base + off);
        if (enable)
                reg |= TBUF_EEE_EN | TBUF_PM_EN;
        else
                reg &= ~(TBUF_EEE_EN | TBUF_PM_EN);
 -      __raw_writel(reg, priv->base + off);
 +      bcmgenet_writel(reg, priv->base + off);
  
        /* Do the same thing for RBUF */
        reg = bcmgenet_rbuf_readl(priv, RBUF_ENERGY_CTRL);
@@@ -1379,7 -1360,7 +1379,7 @@@ static unsigned int __bcmgenet_tx_recla
                if (skb) {
                        pkts_compl++;
                        bytes_compl += GENET_CB(skb)->bytes_sent;
-                       dev_kfree_skb_any(skb);
+                       dev_consume_skb_any(skb);
                }
  
                txbds_processed++;
@@@ -1894,7 -1875,7 +1894,7 @@@ static int bcmgenet_alloc_rx_buffers(st
                cb = ring->cbs + i;
                skb = bcmgenet_rx_refill(priv, cb);
                if (skb)
-                       dev_kfree_skb_any(skb);
+                       dev_consume_skb_any(skb);
                if (!cb->skb)
                        return -ENOMEM;
        }
@@@ -1913,7 -1894,7 +1913,7 @@@ static void bcmgenet_free_rx_buffers(st
  
                skb = bcmgenet_free_rx_cb(&priv->pdev->dev, cb);
                if (skb)
-                       dev_kfree_skb_any(skb);
+                       dev_consume_skb_any(skb);
        }
  }
  
index a4a33ebd0b98e24cab187921938ab133f4719f6e,0293b41171a5d90070c2ff9a954e1dd0e42aea4d..08624db8a6e9b4f0b19f8977cd7fbec4f9cf3a10
@@@ -369,12 -369,12 +369,12 @@@ int t4_wr_mbox_meat_timeout(struct adap
                list_del(&entry.list);
                spin_unlock(&adap->mbox_lock);
                ret = (v == MBOX_OWNER_FW) ? -EBUSY : -ETIMEDOUT;
-               t4_record_mbox(adap, cmd, MBOX_LEN, access, ret);
+               t4_record_mbox(adap, cmd, size, access, ret);
                return ret;
        }
  
        /* Copy in the new mailbox command and send it on its way ... */
-       t4_record_mbox(adap, cmd, MBOX_LEN, access, 0);
+       t4_record_mbox(adap, cmd, size, access, 0);
        for (i = 0; i < size; i += 8)
                t4_write_reg64(adap, data_reg + i, be64_to_cpu(*p++));
  
        }
  
        ret = (pcie_fw & PCIE_FW_ERR_F) ? -ENXIO : -ETIMEDOUT;
-       t4_record_mbox(adap, cmd, MBOX_LEN, access, ret);
+       t4_record_mbox(adap, cmd, size, access, ret);
        dev_err(adap->pdev_dev, "command %#x in mailbox %d timed out\n",
                *(const u8 *)cmd, mbox);
        t4_report_fw_error(adap);
@@@ -913,8 -913,7 +913,8 @@@ void t4_get_regs(struct adapter *adap, 
                0xd010, 0xd03c,
                0xdfc0, 0xdfe0,
                0xe000, 0xea7c,
 -              0xf000, 0x11190,
 +              0xf000, 0x11110,
 +              0x11118, 0x11190,
                0x19040, 0x1906c,
                0x19078, 0x19080,
                0x1908c, 0x190e4,
                0x1ff00, 0x1ff84,
                0x1ffc0, 0x1ffc8,
                0x30000, 0x30030,
 -              0x30038, 0x30038,
 -              0x30040, 0x30040,
                0x30100, 0x30144,
                0x30190, 0x301a0,
                0x301a8, 0x301b8,
                0x33c3c, 0x33c50,
                0x33cf0, 0x33cfc,
                0x34000, 0x34030,
 -              0x34038, 0x34038,
 -              0x34040, 0x34040,
                0x34100, 0x34144,
                0x34190, 0x341a0,
                0x341a8, 0x341b8,
                0x37c3c, 0x37c50,
                0x37cf0, 0x37cfc,
                0x38000, 0x38030,
 -              0x38038, 0x38038,
 -              0x38040, 0x38040,
                0x38100, 0x38144,
                0x38190, 0x381a0,
                0x381a8, 0x381b8,
                0x3bc3c, 0x3bc50,
                0x3bcf0, 0x3bcfc,
                0x3c000, 0x3c030,
 -              0x3c038, 0x3c038,
 -              0x3c040, 0x3c040,
                0x3c100, 0x3c144,
                0x3c190, 0x3c1a0,
                0x3c1a8, 0x3c1b8,
                0x1190, 0x1194,
                0x11a0, 0x11a4,
                0x11b0, 0x11b4,
 -              0x11fc, 0x1258,
 -              0x1280, 0x12d4,
 -              0x12d9, 0x12d9,
 -              0x12de, 0x12de,
 -              0x12e3, 0x12e3,
 -              0x12e8, 0x133c,
 +              0x11fc, 0x1274,
 +              0x1280, 0x133c,
                0x1800, 0x18fc,
                0x3000, 0x302c,
                0x3060, 0x30b0,
                0x5ea0, 0x5eb0,
                0x5ec0, 0x5ec0,
                0x5ec8, 0x5ed0,
 +              0x5ee0, 0x5ee0,
 +              0x5ef0, 0x5ef0,
 +              0x5f00, 0x5f00,
                0x6000, 0x6020,
                0x6028, 0x6040,
                0x6058, 0x609c,
                0xd300, 0xd31c,
                0xdfc0, 0xdfe0,
                0xe000, 0xf008,
 +              0xf010, 0xf018,
 +              0xf020, 0xf028,
                0x11000, 0x11014,
                0x11048, 0x1106c,
                0x11074, 0x11088,
                0x1ff00, 0x1ff84,
                0x1ffc0, 0x1ffc8,
                0x30000, 0x30030,
 -              0x30038, 0x30038,
 -              0x30040, 0x30040,
 -              0x30048, 0x30048,
 -              0x30050, 0x30050,
 -              0x3005c, 0x30060,
 -              0x30068, 0x30068,
 -              0x30070, 0x30070,
                0x30100, 0x30168,
                0x30190, 0x301a0,
                0x301a8, 0x301b8,
                0x326a8, 0x326a8,
                0x326ec, 0x326ec,
                0x32a00, 0x32abc,
 -              0x32b00, 0x32b38,
 +              0x32b00, 0x32b18,
 +              0x32b20, 0x32b38,
                0x32b40, 0x32b58,
                0x32b60, 0x32b78,
                0x32c00, 0x32c00,
                0x32c08, 0x32c3c,
 -              0x32e00, 0x32e2c,
 -              0x32f00, 0x32f2c,
                0x33000, 0x3302c,
                0x33034, 0x33050,
                0x33058, 0x33058,
                0x33c38, 0x33c50,
                0x33cf0, 0x33cfc,
                0x34000, 0x34030,
 -              0x34038, 0x34038,
 -              0x34040, 0x34040,
 -              0x34048, 0x34048,
 -              0x34050, 0x34050,
 -              0x3405c, 0x34060,
 -              0x34068, 0x34068,
 -              0x34070, 0x34070,
                0x34100, 0x34168,
                0x34190, 0x341a0,
                0x341a8, 0x341b8,
                0x366a8, 0x366a8,
                0x366ec, 0x366ec,
                0x36a00, 0x36abc,
 -              0x36b00, 0x36b38,
 +              0x36b00, 0x36b18,
 +              0x36b20, 0x36b38,
                0x36b40, 0x36b58,
                0x36b60, 0x36b78,
                0x36c00, 0x36c00,
                0x36c08, 0x36c3c,
 -              0x36e00, 0x36e2c,
 -              0x36f00, 0x36f2c,
                0x37000, 0x3702c,
                0x37034, 0x37050,
                0x37058, 0x37058,
                0x40280, 0x40280,
                0x40304, 0x40304,
                0x40330, 0x4033c,
 -              0x41304, 0x413b8,
 -              0x413c0, 0x413c8,
 +              0x41304, 0x413c8,
                0x413d0, 0x413dc,
                0x413f0, 0x413f0,
                0x41400, 0x4140c,
@@@ -3076,179 -3099,6 +3076,179 @@@ int t4_get_exprom_version(struct adapte
        return 0;
  }
  
 +/**
 + *      t4_get_vpd_version - return the VPD version
 + *      @adapter: the adapter
 + *      @vers: where to place the version
 + *
 + *      Reads the VPD via the Firmware interface (thus this can only be called
 + *      once we're ready to issue Firmware commands).  The format of the
 + *      VPD version is adapter specific.  Returns 0 on success, an error on
 + *      failure.
 + *
 + *      Note that early versions of the Firmware didn't include the ability
 + *      to retrieve the VPD version, so we zero-out the return-value parameter
 + *      in that case to avoid leaving it with garbage in it.
 + *
 + *      Also note that the Firmware will return its cached copy of the VPD
 + *      Revision ID, not the actual Revision ID as written in the Serial
 + *      EEPROM.  This is only an issue if a new VPD has been written and the
 + *      Firmware/Chip haven't yet gone through a RESET sequence.  So it's best
 + *      to defer calling this routine till after a FW_RESET_CMD has been issued
 + *      if the Host Driver will be performing a full adapter initialization.
 + */
 +int t4_get_vpd_version(struct adapter *adapter, u32 *vers)
 +{
 +      u32 vpdrev_param;
 +      int ret;
 +
 +      vpdrev_param = (FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_DEV) |
 +                      FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_DEV_VPDREV));
 +      ret = t4_query_params(adapter, adapter->mbox, adapter->pf, 0,
 +                            1, &vpdrev_param, vers);
 +      if (ret)
 +              *vers = 0;
 +      return ret;
 +}
 +
 +/**
 + *      t4_get_scfg_version - return the Serial Configuration version
 + *      @adapter: the adapter
 + *      @vers: where to place the version
 + *
 + *      Reads the Serial Configuration Version via the Firmware interface
 + *      (thus this can only be called once we're ready to issue Firmware
 + *      commands).  The format of the Serial Configuration version is
 + *      adapter specific.  Returns 0 on success, an error on failure.
 + *
 + *      Note that early versions of the Firmware didn't include the ability
 + *      to retrieve the Serial Configuration version, so we zero-out the
 + *      return-value parameter in that case to avoid leaving it with
 + *      garbage in it.
 + *
 + *      Also note that the Firmware will return its cached copy of the Serial
 + *      Initialization Revision ID, not the actual Revision ID as written in
 + *      the Serial EEPROM.  This is only an issue if a new VPD has been written
 + *      and the Firmware/Chip haven't yet gone through a RESET sequence.  So
 + *      it's best to defer calling this routine till after a FW_RESET_CMD has
 + *      been issued if the Host Driver will be performing a full adapter
 + *      initialization.
 + */
 +int t4_get_scfg_version(struct adapter *adapter, u32 *vers)
 +{
 +      u32 scfgrev_param;
 +      int ret;
 +
 +      scfgrev_param = (FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_DEV) |
 +                       FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_DEV_SCFGREV));
 +      ret = t4_query_params(adapter, adapter->mbox, adapter->pf, 0,
 +                            1, &scfgrev_param, vers);
 +      if (ret)
 +              *vers = 0;
 +      return ret;
 +}
 +
 +/**
 + *      t4_get_version_info - extract various chip/firmware version information
 + *      @adapter: the adapter
 + *
 + *      Reads various chip/firmware version numbers and stores them into the
 + *      adapter's Adapter Parameters structure.  If any of the efforts fails,
 + *      the first failure will be returned, but all of the version numbers
 + *      will be read.
 + */
 +int t4_get_version_info(struct adapter *adapter)
 +{
 +      int ret = 0;
 +
 +      #define FIRST_RET(__getvinfo) \
 +      do { \
 +              int __ret = __getvinfo; \
 +              if (__ret && !ret) \
 +                      ret = __ret; \
 +      } while (0)
 +
 +      FIRST_RET(t4_get_fw_version(adapter, &adapter->params.fw_vers));
 +      FIRST_RET(t4_get_bs_version(adapter, &adapter->params.bs_vers));
 +      FIRST_RET(t4_get_tp_version(adapter, &adapter->params.tp_vers));
 +      FIRST_RET(t4_get_exprom_version(adapter, &adapter->params.er_vers));
 +      FIRST_RET(t4_get_scfg_version(adapter, &adapter->params.scfg_vers));
 +      FIRST_RET(t4_get_vpd_version(adapter, &adapter->params.vpd_vers));
 +
 +      #undef FIRST_RET
 +      return ret;
 +}
 +
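A standalone sketch of the FIRST_RET pattern above (remember the first failure, but still attempt every read); read_a()/read_b()/read_c() are hypothetical stand-ins for the t4_get_*_version() calls:

#include <stdio.h>

/* Hypothetical version readers; one simulates a failure. */
static int read_a(int *v) { *v = 1; return 0; }
static int read_b(int *v) { *v = 0; return -5; }
static int read_c(int *v) { *v = 3; return 0; }

/* Remember the first failure, but still run every read. */
static int get_all_versions(int *a, int *b, int *c)
{
	int ret = 0;

	#define FIRST_RET(call) \
	do { \
		int __ret = (call); \
		if (__ret && !ret) \
			ret = __ret; \
	} while (0)

	FIRST_RET(read_a(a));
	FIRST_RET(read_b(b));
	FIRST_RET(read_c(c));

	#undef FIRST_RET
	return ret;	/* -5: first error seen, yet read_c() still ran */
}

int main(void)
{
	int a, b, c;

	printf("ret=%d a=%d b=%d c=%d\n", get_all_versions(&a, &b, &c), a, b, c);
	return 0;
}
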
 +/**
 + *      t4_dump_version_info - dump all of the adapter configuration IDs
 + *      @adapter: the adapter
 + *
 + *      Dumps all of the various bits of adapter configuration version/revision
 + *      IDs information.  This is typically called at some point after
 + *      t4_get_version_info() has been called.
 + */
 +void t4_dump_version_info(struct adapter *adapter)
 +{
 +      /* Device information */
 +      dev_info(adapter->pdev_dev, "Chelsio %s rev %d\n",
 +               adapter->params.vpd.id,
 +               CHELSIO_CHIP_RELEASE(adapter->params.chip));
 +      dev_info(adapter->pdev_dev, "S/N: %s, P/N: %s\n",
 +               adapter->params.vpd.sn, adapter->params.vpd.pn);
 +
 +      /* Firmware Version */
 +      if (!adapter->params.fw_vers)
 +              dev_warn(adapter->pdev_dev, "No firmware loaded\n");
 +      else
 +              dev_info(adapter->pdev_dev, "Firmware version: %u.%u.%u.%u\n",
 +                       FW_HDR_FW_VER_MAJOR_G(adapter->params.fw_vers),
 +                       FW_HDR_FW_VER_MINOR_G(adapter->params.fw_vers),
 +                       FW_HDR_FW_VER_MICRO_G(adapter->params.fw_vers),
 +                       FW_HDR_FW_VER_BUILD_G(adapter->params.fw_vers));
 +
 +      /* Bootstrap Firmware Version. (Some adapters don't have Bootstrap
 +       * Firmware, so dev_info() is more appropriate here.)
 +       */
 +      if (!adapter->params.bs_vers)
 +              dev_info(adapter->pdev_dev, "No bootstrap loaded\n");
 +      else
 +              dev_info(adapter->pdev_dev, "Bootstrap version: %u.%u.%u.%u\n",
 +                       FW_HDR_FW_VER_MAJOR_G(adapter->params.bs_vers),
 +                       FW_HDR_FW_VER_MINOR_G(adapter->params.bs_vers),
 +                       FW_HDR_FW_VER_MICRO_G(adapter->params.bs_vers),
 +                       FW_HDR_FW_VER_BUILD_G(adapter->params.bs_vers));
 +
 +      /* TP Microcode Version */
 +      if (!adapter->params.tp_vers)
 +              dev_warn(adapter->pdev_dev, "No TP Microcode loaded\n");
 +      else
 +              dev_info(adapter->pdev_dev,
 +                       "TP Microcode version: %u.%u.%u.%u\n",
 +                       FW_HDR_FW_VER_MAJOR_G(adapter->params.tp_vers),
 +                       FW_HDR_FW_VER_MINOR_G(adapter->params.tp_vers),
 +                       FW_HDR_FW_VER_MICRO_G(adapter->params.tp_vers),
 +                       FW_HDR_FW_VER_BUILD_G(adapter->params.tp_vers));
 +
 +      /* Expansion ROM version */
 +      if (!adapter->params.er_vers)
 +              dev_info(adapter->pdev_dev, "No Expansion ROM loaded\n");
 +      else
 +              dev_info(adapter->pdev_dev,
 +                       "Expansion ROM version: %u.%u.%u.%u\n",
 +                       FW_HDR_FW_VER_MAJOR_G(adapter->params.er_vers),
 +                       FW_HDR_FW_VER_MINOR_G(adapter->params.er_vers),
 +                       FW_HDR_FW_VER_MICRO_G(adapter->params.er_vers),
 +                       FW_HDR_FW_VER_BUILD_G(adapter->params.er_vers));
 +
 +      /* Serial Configuration version */
 +      dev_info(adapter->pdev_dev, "Serial Configuration version: %#x\n",
 +               adapter->params.scfg_vers);
 +
 +      /* VPD Version */
 +      dev_info(adapter->pdev_dev, "VPD version: %#x\n",
 +               adapter->params.vpd_vers);
 +}
 +
  /**
   *    t4_check_fw_version - check if the FW is supported with this driver
   *    @adap: the adapter
@@@ -3835,143 -3685,16 +3835,143 @@@ void t4_ulprx_read_la(struct adapter *a
        }
  }
  
 -#define ADVERT_MASK (FW_PORT_CAP_SPEED_100M | FW_PORT_CAP_SPEED_1G |\
 -                   FW_PORT_CAP_SPEED_10G | FW_PORT_CAP_SPEED_25G | \
 -                   FW_PORT_CAP_SPEED_40G | FW_PORT_CAP_SPEED_100G | \
 -                   FW_PORT_CAP_ANEG)
 +#define ADVERT_MASK (FW_PORT_CAP32_SPEED_V(FW_PORT_CAP32_SPEED_M) | \
 +                   FW_PORT_CAP32_ANEG)
 +
 +/**
 + *    fwcaps16_to_caps32 - convert 16-bit Port Capabilities to 32-bits
 + *    @caps16: a 16-bit Port Capabilities value
 + *
 + *    Returns the equivalent 32-bit Port Capabilities value.
 + */
 +static fw_port_cap32_t fwcaps16_to_caps32(fw_port_cap16_t caps16)
 +{
 +      fw_port_cap32_t caps32 = 0;
 +
 +      #define CAP16_TO_CAP32(__cap) \
 +              do { \
 +                      if (caps16 & FW_PORT_CAP_##__cap) \
 +                              caps32 |= FW_PORT_CAP32_##__cap; \
 +              } while (0)
 +
 +      CAP16_TO_CAP32(SPEED_100M);
 +      CAP16_TO_CAP32(SPEED_1G);
 +      CAP16_TO_CAP32(SPEED_25G);
 +      CAP16_TO_CAP32(SPEED_10G);
 +      CAP16_TO_CAP32(SPEED_40G);
 +      CAP16_TO_CAP32(SPEED_100G);
 +      CAP16_TO_CAP32(FC_RX);
 +      CAP16_TO_CAP32(FC_TX);
 +      CAP16_TO_CAP32(ANEG);
 +      CAP16_TO_CAP32(MDIX);
 +      CAP16_TO_CAP32(MDIAUTO);
 +      CAP16_TO_CAP32(FEC_RS);
 +      CAP16_TO_CAP32(FEC_BASER_RS);
 +      CAP16_TO_CAP32(802_3_PAUSE);
 +      CAP16_TO_CAP32(802_3_ASM_DIR);
 +
 +      #undef CAP16_TO_CAP32
 +
 +      return caps32;
 +}
 +
 +/**
 + *    fwcaps32_to_caps16 - convert 32-bit Port Capabilities to 16-bits
 + *    @caps32: a 32-bit Port Capabilities value
 + *
 + *    Returns the equivalent 16-bit Port Capabilities value.  Note that
 + *    not all 32-bit Port Capabilities can be represented in the 16-bit
 + *    Port Capabilities and some fields/values may not make it.
 + */
 +static fw_port_cap16_t fwcaps32_to_caps16(fw_port_cap32_t caps32)
 +{
 +      fw_port_cap16_t caps16 = 0;
 +
 +      #define CAP32_TO_CAP16(__cap) \
 +              do { \
 +                      if (caps32 & FW_PORT_CAP32_##__cap) \
 +                              caps16 |= FW_PORT_CAP_##__cap; \
 +              } while (0)
 +
 +      CAP32_TO_CAP16(SPEED_100M);
 +      CAP32_TO_CAP16(SPEED_1G);
 +      CAP32_TO_CAP16(SPEED_10G);
 +      CAP32_TO_CAP16(SPEED_25G);
 +      CAP32_TO_CAP16(SPEED_40G);
 +      CAP32_TO_CAP16(SPEED_100G);
 +      CAP32_TO_CAP16(FC_RX);
 +      CAP32_TO_CAP16(FC_TX);
 +      CAP32_TO_CAP16(802_3_PAUSE);
 +      CAP32_TO_CAP16(802_3_ASM_DIR);
 +      CAP32_TO_CAP16(ANEG);
 +      CAP32_TO_CAP16(MDIX);
 +      CAP32_TO_CAP16(MDIAUTO);
 +      CAP32_TO_CAP16(FEC_RS);
 +      CAP32_TO_CAP16(FEC_BASER_RS);
 +
 +      #undef CAP32_TO_CAP16
 +
 +      return caps16;
 +}
 +
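A compilable sketch of the same name-pasting translation macro, using hypothetical capability bits rather than the real FW_PORT_CAP_*/FW_PORT_CAP32_* values; it also shows why the 32-to-16 direction can be lossy (the 50G bit has no 16-bit counterpart here):

#include <stdint.h>
#include <stdio.h>

/* Hypothetical capability bits, illustration only. */
#define CAP16_SPEED_1G	0x01
#define CAP16_ANEG	0x02

#define CAP32_SPEED_1G	0x0001
#define CAP32_ANEG	0x0002
#define CAP32_SPEED_50G	0x0004	/* exists only in the 32-bit namespace */

static uint32_t caps16_to_caps32(uint16_t caps16)
{
	uint32_t caps32 = 0;

	#define C16_TO_C32(cap) \
		do { \
			if (caps16 & CAP16_##cap) \
				caps32 |= CAP32_##cap; \
		} while (0)

	C16_TO_C32(SPEED_1G);
	C16_TO_C32(ANEG);

	#undef C16_TO_C32
	return caps32;
}

int main(void)
{
	/* 1G + ANEG translate exactly; a 50G bit would be dropped going
	 * the other way because no 16-bit equivalent is defined.
	 */
	printf("%#x\n", (unsigned)caps16_to_caps32(CAP16_SPEED_1G | CAP16_ANEG));
	return 0;
}
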
 +/* Translate Firmware Port Capabilities Pause specification to Common Code */
 +static inline enum cc_pause fwcap_to_cc_pause(fw_port_cap32_t fw_pause)
 +{
 +      enum cc_pause cc_pause = 0;
 +
 +      if (fw_pause & FW_PORT_CAP32_FC_RX)
 +              cc_pause |= PAUSE_RX;
 +      if (fw_pause & FW_PORT_CAP32_FC_TX)
 +              cc_pause |= PAUSE_TX;
 +
 +      return cc_pause;
 +}
 +
 +/* Translate Common Code Pause specification into Firmware Port Capabilities */
 +static inline fw_port_cap32_t cc_to_fwcap_pause(enum cc_pause cc_pause)
 +{
 +      fw_port_cap32_t fw_pause = 0;
 +
 +      if (cc_pause & PAUSE_RX)
 +              fw_pause |= FW_PORT_CAP32_FC_RX;
 +      if (cc_pause & PAUSE_TX)
 +              fw_pause |= FW_PORT_CAP32_FC_TX;
 +
 +      return fw_pause;
 +}
 +
 +/* Translate Firmware Forward Error Correction specification to Common Code */
 +static inline enum cc_fec fwcap_to_cc_fec(fw_port_cap32_t fw_fec)
 +{
 +      enum cc_fec cc_fec = 0;
 +
 +      if (fw_fec & FW_PORT_CAP32_FEC_RS)
 +              cc_fec |= FEC_RS;
 +      if (fw_fec & FW_PORT_CAP32_FEC_BASER_RS)
 +              cc_fec |= FEC_BASER_RS;
 +
 +      return cc_fec;
 +}
 +
 +/* Translate Common Code Forward Error Correction specification to Firmware */
 +static inline fw_port_cap32_t cc_to_fwcap_fec(enum cc_fec cc_fec)
 +{
 +      fw_port_cap32_t fw_fec = 0;
 +
 +      if (cc_fec & FEC_RS)
 +              fw_fec |= FW_PORT_CAP32_FEC_RS;
 +      if (cc_fec & FEC_BASER_RS)
 +              fw_fec |= FW_PORT_CAP32_FEC_BASER_RS;
 +
 +      return fw_fec;
 +}
  
  /**
   *    t4_link_l1cfg - apply link configuration to MAC/PHY
 - *    @phy: the PHY to setup
 - *    @mac: the MAC to setup
 - *    @lc: the requested link configuration
 + *    @adapter: the adapter
 + *    @mbox: the Firmware Mailbox to use
 + *    @port: the Port ID
 + *    @lc: the Port's Link Configuration
   *
   *    Set up a port's MAC and PHY according to a desired link configuration.
   *    - If the PHY can auto-negotiate first decide what to advertise, then
   *    - If auto-negotiation is off set the MAC to the proper speed/duplex/FC,
   *      otherwise do it later based on the outcome of auto-negotiation.
   */
 -int t4_link_l1cfg(struct adapter *adap, unsigned int mbox, unsigned int port,
 -                struct link_config *lc)
 +int t4_link_l1cfg(struct adapter *adapter, unsigned int mbox,
 +                unsigned int port, struct link_config *lc)
  {
 -      struct fw_port_cmd c;
 -      unsigned int mdi = FW_PORT_CAP_MDI_V(FW_PORT_CAP_MDI_AUTO);
 -      unsigned int fc = 0, fec = 0, fw_fec = 0;
 +      unsigned int fw_caps = adapter->params.fw_caps_support;
 +      struct fw_port_cmd cmd;
 +      unsigned int fw_mdi = FW_PORT_CAP32_MDI_V(FW_PORT_CAP32_MDI_AUTO);
 +      fw_port_cap32_t fw_fc, cc_fec, fw_fec, rcap;
  
        lc->link_ok = 0;
 -      if (lc->requested_fc & PAUSE_RX)
 -              fc |= FW_PORT_CAP_FC_RX;
 -      if (lc->requested_fc & PAUSE_TX)
 -              fc |= FW_PORT_CAP_FC_TX;
 -
 -      fec = lc->requested_fec & FEC_AUTO ? lc->auto_fec : lc->requested_fec;
  
 -      if (fec & FEC_RS)
 -              fw_fec |= FW_PORT_CAP_FEC_RS;
 -      if (fec & FEC_BASER_RS)
 -              fw_fec |= FW_PORT_CAP_FEC_BASER_RS;
 -
 -      memset(&c, 0, sizeof(c));
 -      c.op_to_portid = cpu_to_be32(FW_CMD_OP_V(FW_PORT_CMD) |
 -                                   FW_CMD_REQUEST_F | FW_CMD_EXEC_F |
 -                                   FW_PORT_CMD_PORTID_V(port));
 -      c.action_to_len16 =
 -              cpu_to_be32(FW_PORT_CMD_ACTION_V(FW_PORT_ACTION_L1_CFG) |
 -                          FW_LEN16(c));
 +      /* Convert driver coding of Pause Frame Flow Control settings into the
 +       * Firmware's API.
 +       */
 +      fw_fc = cc_to_fwcap_pause(lc->requested_fc);
 +
 +      /* Convert Common Code Forward Error Correction settings into the
 +       * Firmware's API.  If the current Requested FEC has "Automatic"
 +       * (IEEE 802.3) specified, then we use whatever the Firmware
 +       * sent us as part of its IEEE 802.3-based interpretation of
 +       * the Transceiver Module EPROM FEC parameters.  Otherwise we
 +       * use whatever is in the current Requested FEC settings.
 +       */
 +      if (lc->requested_fec & FEC_AUTO)
 +              cc_fec = fwcap_to_cc_fec(lc->def_acaps);
 +      else
 +              cc_fec = lc->requested_fec;
 +      fw_fec = cc_to_fwcap_fec(cc_fec);
  
 -      if (!(lc->supported & FW_PORT_CAP_ANEG)) {
 -              c.u.l1cfg.rcap = cpu_to_be32((lc->supported & ADVERT_MASK) |
 -                                           fc | fw_fec);
 -              lc->fc = lc->requested_fc & (PAUSE_RX | PAUSE_TX);
 +      /* Figure out what our Requested Port Capabilities are going to be.
 +       */
 +      if (!(lc->pcaps & FW_PORT_CAP32_ANEG)) {
 +              rcap = (lc->pcaps & ADVERT_MASK) | fw_fc | fw_fec;
 +              lc->fc = lc->requested_fc & ~PAUSE_AUTONEG;
 +              lc->fec = cc_fec;
        } else if (lc->autoneg == AUTONEG_DISABLE) {
 -              c.u.l1cfg.rcap = cpu_to_be32(lc->requested_speed | fc |
 -                                           fw_fec | mdi);
 -              lc->fc = lc->requested_fc & (PAUSE_RX | PAUSE_TX);
 -      } else
 -              c.u.l1cfg.rcap = cpu_to_be32(lc->advertising | fc |
 -                                           fw_fec | mdi);
 +              rcap = lc->speed_caps | fw_fc | fw_fec | fw_mdi;
 +              lc->fc = lc->requested_fc & ~PAUSE_AUTONEG;
 +              lc->fec = cc_fec;
 +      } else {
 +              rcap = lc->acaps | fw_fc | fw_fec | fw_mdi;
 +      }
  
 -      return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL);
 +      /* And send that on to the Firmware ...
 +       */
 +      memset(&cmd, 0, sizeof(cmd));
 +      cmd.op_to_portid = cpu_to_be32(FW_CMD_OP_V(FW_PORT_CMD) |
 +                                     FW_CMD_REQUEST_F | FW_CMD_EXEC_F |
 +                                     FW_PORT_CMD_PORTID_V(port));
 +      cmd.action_to_len16 =
 +              cpu_to_be32(FW_PORT_CMD_ACTION_V(fw_caps == FW_CAPS16
 +                                               ? FW_PORT_ACTION_L1_CFG
 +                                               : FW_PORT_ACTION_L1_CFG32) |
 +                          FW_LEN16(cmd));
 +      if (fw_caps == FW_CAPS16)
 +              cmd.u.l1cfg.rcap = cpu_to_be32(fwcaps32_to_caps16(rcap));
 +      else
 +              cmd.u.l1cfg32.rcap32 = cpu_to_be32(rcap);
 +      return t4_wr_mbox(adapter, mbox, &cmd, sizeof(cmd), NULL);
  }
  
  /**
@@@ -4059,7 -3765,7 +4059,7 @@@ int t4_restart_aneg(struct adapter *ada
        c.action_to_len16 =
                cpu_to_be32(FW_PORT_CMD_ACTION_V(FW_PORT_ACTION_L1_CFG) |
                            FW_LEN16(c));
 -      c.u.l1cfg.rcap = cpu_to_be32(FW_PORT_CAP_ANEG);
 +      c.u.l1cfg.rcap = cpu_to_be32(FW_PORT_CAP32_ANEG);
        return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL);
  }
  
@@@ -6742,17 -6448,6 +6742,17 @@@ int t4_fw_upgrade(struct adapter *adap
        if (ret < 0)
                goto out;
  
 +      /*
 +       * If there was a Firmware Configuration File stored in FLASH,
 +       * there's a good chance that it won't be compatible with the new
 +       * Firmware.  In order to prevent difficult to diagnose adapter
 +       * initialization issues, we clear out the Firmware Configuration File
 +       * portion of the FLASH.  The user will need to re-FLASH a new
 +       * Firmware Configuration File which is compatible with the new
 +       * Firmware if that's desired.
 +       */
 +      (void)t4_load_cfg(adap, NULL, 0);
 +
        /*
         * Older versions of the firmware don't understand the new
         * PCIE_FW.HALT flag and so won't know to perform a RESET when they
@@@ -7775,98 -7470,6 +7775,98 @@@ static const char *t4_link_down_rc_str(
        return reason[link_down_rc];
  }
  
 +/**
 + * Return the highest speed set in the port capabilities, in Mb/s.
 + */
 +static unsigned int fwcap_to_speed(fw_port_cap32_t caps)
 +{
 +      #define TEST_SPEED_RETURN(__caps_speed, __speed) \
 +              do { \
 +                      if (caps & FW_PORT_CAP32_SPEED_##__caps_speed) \
 +                              return __speed; \
 +              } while (0)
 +
 +      TEST_SPEED_RETURN(400G, 400000);
 +      TEST_SPEED_RETURN(200G, 200000);
 +      TEST_SPEED_RETURN(100G, 100000);
 +      TEST_SPEED_RETURN(50G,   50000);
 +      TEST_SPEED_RETURN(40G,   40000);
 +      TEST_SPEED_RETURN(25G,   25000);
 +      TEST_SPEED_RETURN(10G,   10000);
 +      TEST_SPEED_RETURN(1G,     1000);
 +      TEST_SPEED_RETURN(100M,    100);
 +
 +      #undef TEST_SPEED_RETURN
 +
 +      return 0;
 +}
 +
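A small sketch of the highest-speed-wins selection performed by fwcap_to_speed(): test from the fastest speed downwards and return on the first match. The speed bits below are hypothetical, standing in for FW_PORT_CAP32_SPEED_*:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical speed bits; the real FW_PORT_CAP32_SPEED_* values differ. */
#define SPEED_CAP_1G	0x1
#define SPEED_CAP_10G	0x2
#define SPEED_CAP_100G	0x4

static unsigned int caps_to_mbps(uint32_t caps)
{
	if (caps & SPEED_CAP_100G)
		return 100000;
	if (caps & SPEED_CAP_10G)
		return 10000;
	if (caps & SPEED_CAP_1G)
		return 1000;
	return 0;
}

int main(void)
{
	/* 1G and 100G both advertised -> report 100000 Mb/s. */
	printf("%u\n", caps_to_mbps(SPEED_CAP_1G | SPEED_CAP_100G));
	return 0;
}
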
 +/**
 + *    fwcap_to_fwspeed - return highest speed in Port Capabilities
 + *    @acaps: advertised Port Capabilities
 + *
 + *    Get the highest speed for the port from the advertised Port
 + *    Capabilities.  It will be either the highest speed from the list of
 + *    speeds or whatever the user has set using ethtool.
 + */
 +static fw_port_cap32_t fwcap_to_fwspeed(fw_port_cap32_t acaps)
 +{
 +      #define TEST_SPEED_RETURN(__caps_speed) \
 +              do { \
 +                      if (acaps & FW_PORT_CAP32_SPEED_##__caps_speed) \
 +                              return FW_PORT_CAP32_SPEED_##__caps_speed; \
 +              } while (0)
 +
 +      TEST_SPEED_RETURN(400G);
 +      TEST_SPEED_RETURN(200G);
 +      TEST_SPEED_RETURN(100G);
 +      TEST_SPEED_RETURN(50G);
 +      TEST_SPEED_RETURN(40G);
 +      TEST_SPEED_RETURN(25G);
 +      TEST_SPEED_RETURN(10G);
 +      TEST_SPEED_RETURN(1G);
 +      TEST_SPEED_RETURN(100M);
 +
 +      #undef TEST_SPEED_RETURN
 +
 +      return 0;
 +}
 +
 +/**
 + *    lstatus_to_fwcap - translate old lstatus to 32-bit Port Capabilities
 + *    @lstatus: old FW_PORT_ACTION_GET_PORT_INFO lstatus value
 + *
 + *    Translates old FW_PORT_ACTION_GET_PORT_INFO lstatus field into new
 + *    32-bit Port Capabilities value.
 + */
 +static fw_port_cap32_t lstatus_to_fwcap(u32 lstatus)
 +{
 +      fw_port_cap32_t linkattr = 0;
 +
 +      /* Unfortunately the format of the Link Status in the old
 +       * 16-bit Port Information message isn't the same as the
 +       * 16-bit Port Capabilities bitfield used everywhere else ...
 +       */
 +      if (lstatus & FW_PORT_CMD_RXPAUSE_F)
 +              linkattr |= FW_PORT_CAP32_FC_RX;
 +      if (lstatus & FW_PORT_CMD_TXPAUSE_F)
 +              linkattr |= FW_PORT_CAP32_FC_TX;
 +      if (lstatus & FW_PORT_CMD_LSPEED_V(FW_PORT_CAP_SPEED_100M))
 +              linkattr |= FW_PORT_CAP32_SPEED_100M;
 +      if (lstatus & FW_PORT_CMD_LSPEED_V(FW_PORT_CAP_SPEED_1G))
 +              linkattr |= FW_PORT_CAP32_SPEED_1G;
 +      if (lstatus & FW_PORT_CMD_LSPEED_V(FW_PORT_CAP_SPEED_10G))
 +              linkattr |= FW_PORT_CAP32_SPEED_10G;
 +      if (lstatus & FW_PORT_CMD_LSPEED_V(FW_PORT_CAP_SPEED_25G))
 +              linkattr |= FW_PORT_CAP32_SPEED_25G;
 +      if (lstatus & FW_PORT_CMD_LSPEED_V(FW_PORT_CAP_SPEED_40G))
 +              linkattr |= FW_PORT_CAP32_SPEED_40G;
 +      if (lstatus & FW_PORT_CMD_LSPEED_V(FW_PORT_CAP_SPEED_100G))
 +              linkattr |= FW_PORT_CAP32_SPEED_100G;
 +
 +      return linkattr;
 +}
 +
  /**
   *    t4_handle_get_port_info - process a FW reply message
   *    @pi: the port info
   */
  void t4_handle_get_port_info(struct port_info *pi, const __be64 *rpl)
  {
 -      const struct fw_port_cmd *p = (const void *)rpl;
 -      struct adapter *adap = pi->adapter;
 -
 -      /* link/module state change message */
 -      int speed = 0, fc = 0;
 -      struct link_config *lc;
 -      u32 stat = be32_to_cpu(p->u.info.lstatus_to_modtype);
 -      int link_ok = (stat & FW_PORT_CMD_LSTATUS_F) != 0;
 -      u32 mod = FW_PORT_CMD_MODTYPE_G(stat);
 -
 -      if (stat & FW_PORT_CMD_RXPAUSE_F)
 -              fc |= PAUSE_RX;
 -      if (stat & FW_PORT_CMD_TXPAUSE_F)
 -              fc |= PAUSE_TX;
 -      if (stat & FW_PORT_CMD_LSPEED_V(FW_PORT_CAP_SPEED_100M))
 -              speed = 100;
 -      else if (stat & FW_PORT_CMD_LSPEED_V(FW_PORT_CAP_SPEED_1G))
 -              speed = 1000;
 -      else if (stat & FW_PORT_CMD_LSPEED_V(FW_PORT_CAP_SPEED_10G))
 -              speed = 10000;
 -      else if (stat & FW_PORT_CMD_LSPEED_V(FW_PORT_CAP_SPEED_25G))
 -              speed = 25000;
 -      else if (stat & FW_PORT_CMD_LSPEED_V(FW_PORT_CAP_SPEED_40G))
 -              speed = 40000;
 -      else if (stat & FW_PORT_CMD_LSPEED_V(FW_PORT_CAP_SPEED_100G))
 -              speed = 100000;
 -
 -      lc = &pi->link_cfg;
 -
 -      if (mod != pi->mod_type) {
 -              pi->mod_type = mod;
 -              t4_os_portmod_changed(adap, pi->port_id);
 +      const struct fw_port_cmd *cmd = (const void *)rpl;
 +      int action = FW_PORT_CMD_ACTION_G(be32_to_cpu(cmd->action_to_len16));
 +      struct adapter *adapter = pi->adapter;
 +      struct link_config *lc = &pi->link_cfg;
 +      int link_ok, linkdnrc;
 +      enum fw_port_type port_type;
 +      enum fw_port_module_type mod_type;
 +      unsigned int speed, fc, fec;
 +      fw_port_cap32_t pcaps, acaps, lpacaps, linkattr;
 +
 +      /* Extract the various fields from the Port Information message.
 +       */
 +      switch (action) {
 +      case FW_PORT_ACTION_GET_PORT_INFO: {
 +              u32 lstatus = be32_to_cpu(cmd->u.info.lstatus_to_modtype);
 +
 +              link_ok = (lstatus & FW_PORT_CMD_LSTATUS_F) != 0;
 +              linkdnrc = FW_PORT_CMD_LINKDNRC_G(lstatus);
 +              port_type = FW_PORT_CMD_PTYPE_G(lstatus);
 +              mod_type = FW_PORT_CMD_MODTYPE_G(lstatus);
 +              pcaps = fwcaps16_to_caps32(be16_to_cpu(cmd->u.info.pcap));
 +              acaps = fwcaps16_to_caps32(be16_to_cpu(cmd->u.info.acap));
 +              lpacaps = fwcaps16_to_caps32(be16_to_cpu(cmd->u.info.lpacap));
 +              linkattr = lstatus_to_fwcap(lstatus);
 +              break;
        }
 +
 +      case FW_PORT_ACTION_GET_PORT_INFO32: {
 +              u32 lstatus32;
 +
 +              lstatus32 = be32_to_cpu(cmd->u.info32.lstatus32_to_cbllen32);
 +              link_ok = (lstatus32 & FW_PORT_CMD_LSTATUS32_F) != 0;
 +              linkdnrc = FW_PORT_CMD_LINKDNRC32_G(lstatus32);
 +              port_type = FW_PORT_CMD_PORTTYPE32_G(lstatus32);
 +              mod_type = FW_PORT_CMD_MODTYPE32_G(lstatus32);
 +              pcaps = be32_to_cpu(cmd->u.info32.pcaps32);
 +              acaps = be32_to_cpu(cmd->u.info32.acaps32);
 +              lpacaps = be32_to_cpu(cmd->u.info32.lpacaps32);
 +              linkattr = be32_to_cpu(cmd->u.info32.linkattr32);
 +              break;
 +      }
 +
 +      default:
 +              dev_err(adapter->pdev_dev, "Handle Port Information: Bad Command/Action %#x\n",
 +                      be32_to_cpu(cmd->action_to_len16));
 +              return;
 +      }
 +
 +      fec = fwcap_to_cc_fec(acaps);
 +      fc = fwcap_to_cc_pause(linkattr);
 +      speed = fwcap_to_speed(linkattr);
 +
 +      if (mod_type != pi->mod_type) {
 +              /* With the newer SFP28 and QSFP28 Transceiver Module Types,
 +               * various fundamental Port Capabilities which used to be
 +               * immutable can now change radically.  We can now have
 +               * Speeds, Auto-Negotiation, Forward Error Correction, etc.
 +               * all change based on what Transceiver Module is inserted.
 +               * So we need to record the Physical "Port" Capabilities on
 +               * every Transceiver Module change.
 +               */
 +              lc->pcaps = pcaps;
 +
 +              /* When a new Transceiver Module is inserted, the Firmware
 +               * will examine its i2c EPROM to determine its type and
 +               * general operating parameters including things like Forward
 +               * Error Control, etc.  Various IEEE 802.3 standards dictate
 +               * how to interpret these i2c values to determine default
 +               * "automatic" settings.  We record these for future use when
 +               * the user explicitly requests these standards-based values.
 +               */
 +              lc->def_acaps = acaps;
 +
 +              /* Some versions of the early T6 Firmware "cheated" when
 +               * handling different Transceiver Modules by changing the
 +               * underlying Port Type reported to the Host Drivers.  As
 +               * such we need to capture whatever Port Type the Firmware
 +               * sends us and record it in case it's different from what we
 +               * were told earlier.  Unfortunately, since Firmware is
 +               * forever, we'll need to keep this code here forever, but in
 +               * later T6 Firmware it should just be an assignment of the
 +               * same value already recorded.
 +               */
 +              pi->port_type = port_type;
 +
 +              pi->mod_type = mod_type;
 +              t4_os_portmod_changed(adapter, pi->port_id);
 +      }
 +
        if (link_ok != lc->link_ok || speed != lc->speed ||
 -          fc != lc->fc) {     /* something changed */
 +          fc != lc->fc || fec != lc->fec) {   /* something changed */
                if (!link_ok && lc->link_ok) {
 -                      unsigned char rc = FW_PORT_CMD_LINKDNRC_G(stat);
 -
 -                      lc->link_down_rc = rc;
 -                      dev_warn(adap->pdev_dev,
 -                               "Port %d link down, reason: %s\n",
 -                               pi->port_id, t4_link_down_rc_str(rc));
 +                      lc->link_down_rc = linkdnrc;
 +                      dev_warn(adapter->pdev_dev, "Port %d link down, reason: %s\n",
 +                               pi->tx_chan, t4_link_down_rc_str(linkdnrc));
                }
                lc->link_ok = link_ok;
                lc->speed = speed;
                lc->fc = fc;
 -              lc->supported = be16_to_cpu(p->u.info.pcap);
 -              lc->lp_advertising = be16_to_cpu(p->u.info.lpacap);
 +              lc->fec = fec;
 +
 +              lc->lpacaps = lpacaps;
 +              lc->acaps = acaps & ADVERT_MASK;
 +
 +              if (lc->acaps & FW_PORT_CAP32_ANEG) {
 +                      lc->autoneg = AUTONEG_ENABLE;
 +              } else {
 +                      /* When Autoneg is disabled, the user needs to set
 +                       * a single speed.
 +                       * Similar to cxgb4_ethtool.c: set_link_ksettings
 +                       */
 +                      lc->acaps = 0;
 +                      lc->speed_caps = fwcap_to_fwspeed(acaps);
 +                      lc->autoneg = AUTONEG_DISABLE;
 +              }
  
 -              t4_os_link_changed(adap, pi->port_id, link_ok);
 +              t4_os_link_changed(adapter, pi->port_id, link_ok);
        }
  }
  
   */
  int t4_update_port_info(struct port_info *pi)
  {
 +      unsigned int fw_caps = pi->adapter->params.fw_caps_support;
        struct fw_port_cmd port_cmd;
        int ret;
  
        memset(&port_cmd, 0, sizeof(port_cmd));
        port_cmd.op_to_portid = cpu_to_be32(FW_CMD_OP_V(FW_PORT_CMD) |
                                            FW_CMD_REQUEST_F | FW_CMD_READ_F |
 -                                          FW_PORT_CMD_PORTID_V(pi->port_id));
 +                                          FW_PORT_CMD_PORTID_V(pi->tx_chan));
        port_cmd.action_to_len16 = cpu_to_be32(
 -              FW_PORT_CMD_ACTION_V(FW_PORT_ACTION_GET_PORT_INFO) |
 +              FW_PORT_CMD_ACTION_V(fw_caps == FW_CAPS16
 +                                   ? FW_PORT_ACTION_GET_PORT_INFO
 +                                   : FW_PORT_ACTION_GET_PORT_INFO32) |
                FW_LEN16(port_cmd));
        ret = t4_wr_mbox(pi->adapter, pi->adapter->mbox,
                         &port_cmd, sizeof(port_cmd), &port_cmd);
        return 0;
  }
  
 +/**
 + *    t4_get_link_params - retrieve basic link parameters for given port
 + *    @pi: the port
 + *    @link_okp: value return pointer for link up/down
 + *    @speedp: value return pointer for speed (Mb/s)
 + *    @mtup: value return pointer for mtu
 + *
 + *    Retrieves basic link parameters for a port: link up/down, speed (Mb/s),
 + *    and MTU for a specified port.  A negative error is returned on
 + *    failure; 0 on success.
 + */
 +int t4_get_link_params(struct port_info *pi, unsigned int *link_okp,
 +                     unsigned int *speedp, unsigned int *mtup)
 +{
 +      unsigned int fw_caps = pi->adapter->params.fw_caps_support;
 +      struct fw_port_cmd port_cmd;
 +      unsigned int action, link_ok, speed, mtu;
 +      fw_port_cap32_t linkattr;
 +      int ret;
 +
 +      memset(&port_cmd, 0, sizeof(port_cmd));
 +      port_cmd.op_to_portid = cpu_to_be32(FW_CMD_OP_V(FW_PORT_CMD) |
 +                                          FW_CMD_REQUEST_F | FW_CMD_READ_F |
 +                                          FW_PORT_CMD_PORTID_V(pi->tx_chan));
 +      action = (fw_caps == FW_CAPS16
 +                ? FW_PORT_ACTION_GET_PORT_INFO
 +                : FW_PORT_ACTION_GET_PORT_INFO32);
 +      port_cmd.action_to_len16 = cpu_to_be32(
 +              FW_PORT_CMD_ACTION_V(action) |
 +              FW_LEN16(port_cmd));
 +      ret = t4_wr_mbox(pi->adapter, pi->adapter->mbox,
 +                       &port_cmd, sizeof(port_cmd), &port_cmd);
 +      if (ret)
 +              return ret;
 +
 +      if (action == FW_PORT_ACTION_GET_PORT_INFO) {
 +              u32 lstatus = be32_to_cpu(port_cmd.u.info.lstatus_to_modtype);
 +
 +              link_ok = !!(lstatus & FW_PORT_CMD_LSTATUS_F);
 +              linkattr = lstatus_to_fwcap(lstatus);
 +              mtu = be16_to_cpu(port_cmd.u.info.mtu);
 +      } else {
 +              u32 lstatus32 =
 +                         be32_to_cpu(port_cmd.u.info32.lstatus32_to_cbllen32);
 +
 +              link_ok = !!(lstatus32 & FW_PORT_CMD_LSTATUS32_F);
 +              linkattr = be32_to_cpu(port_cmd.u.info32.linkattr32);
 +              mtu = FW_PORT_CMD_MTU32_G(
 +                      be32_to_cpu(port_cmd.u.info32.auxlinfo32_mtu32));
 +      }
 +      speed = fwcap_to_speed(linkattr);
 +
 +      *link_okp = link_ok;
 +      *speedp = speed;
 +      *mtup = mtu;
 +
 +      return 0;
 +}
 +
  /**
   *      t4_handle_fw_rpl - process a FW reply message
   *      @adap: the adapter
@@@ -8107,9 -7581,7 +8107,9 @@@ int t4_handle_fw_rpl(struct adapter *ad
        unsigned int action =
                FW_PORT_CMD_ACTION_G(be32_to_cpu(p->action_to_len16));
  
 -      if (opcode == FW_PORT_CMD && action == FW_PORT_ACTION_GET_PORT_INFO) {
 +      if (opcode == FW_PORT_CMD &&
 +          (action == FW_PORT_ACTION_GET_PORT_INFO ||
 +           action == FW_PORT_ACTION_GET_PORT_INFO32)) {
                int i;
                int chan = FW_PORT_CMD_PORTID_G(be32_to_cpu(p->op_to_portid));
                struct port_info *pi = NULL;
  
                t4_handle_get_port_info(pi, rpl);
        } else {
 -              dev_warn(adap->pdev_dev, "Unknown firmware reply %d\n", opcode);
 +              dev_warn(adap->pdev_dev, "Unknown firmware reply %d\n",
 +                       opcode);
                return -EINVAL;
        }
        return 0;
@@@ -8142,35 -7613,38 +8142,35 @@@ static void get_pci_mode(struct adapte
  
  /**
   *    init_link_config - initialize a link's SW state
 - *    @lc: structure holding the link state
 - *    @caps: link capabilities
 + *    @lc: pointer to structure holding the link state
 + *    @pcaps: link Port Capabilities
 + *    @acaps: link current Advertised Port Capabilities
   *
   *    Initializes the SW state maintained for each link, including the link's
   *    capabilities and default speed/flow-control/autonegotiation settings.
   */
 -static void init_link_config(struct link_config *lc, unsigned int pcaps,
 -                           unsigned int acaps)
 +static void init_link_config(struct link_config *lc, fw_port_cap32_t pcaps,
 +                           fw_port_cap32_t acaps)
  {
 -      lc->supported = pcaps;
 -      lc->lp_advertising = 0;
 -      lc->requested_speed = 0;
 +      lc->pcaps = pcaps;
 +      lc->def_acaps = acaps;
 +      lc->lpacaps = 0;
 +      lc->speed_caps = 0;
        lc->speed = 0;
        lc->requested_fc = lc->fc = PAUSE_RX | PAUSE_TX;
 -      lc->auto_fec = 0;
  
        /* For Forward Error Control, we default to whatever the Firmware
         * tells us the Link is currently advertising.
         */
 -      if (acaps & FW_PORT_CAP_FEC_RS)
 -              lc->auto_fec |= FEC_RS;
 -      if (acaps & FW_PORT_CAP_FEC_BASER_RS)
 -              lc->auto_fec |= FEC_BASER_RS;
        lc->requested_fec = FEC_AUTO;
 -      lc->fec = lc->auto_fec;
 +      lc->fec = fwcap_to_cc_fec(lc->def_acaps);
  
 -      if (lc->supported & FW_PORT_CAP_ANEG) {
 -              lc->advertising = lc->supported & ADVERT_MASK;
 +      if (lc->pcaps & FW_PORT_CAP32_ANEG) {
 +              lc->acaps = lc->pcaps & ADVERT_MASK;
                lc->autoneg = AUTONEG_ENABLE;
                lc->requested_fc |= PAUSE_AUTONEG;
        } else {
 -              lc->advertising = 0;
 +              lc->acaps = 0;
                lc->autoneg = AUTONEG_DISABLE;
        }
  }
@@@ -8695,7 -8169,7 +8695,7 @@@ int t4_init_rss_mode(struct adapter *ad
  }
  
  /**
 - *    t4_init_portinfo - allocate a virtual interface amd initialize port_info
 + *    t4_init_portinfo - allocate a virtual interface and initialize port_info
   *    @pi: the port_info
   *    @mbox: mailbox to use for the FW command
   *    @port: physical port associated with the VI
  int t4_init_portinfo(struct port_info *pi, int mbox,
                     int port, int pf, int vf, u8 mac[])
  {
 -      int ret;
 -      struct fw_port_cmd c;
 +      struct adapter *adapter = pi->adapter;
 +      unsigned int fw_caps = adapter->params.fw_caps_support;
 +      struct fw_port_cmd cmd;
        unsigned int rss_size;
 +      enum fw_port_type port_type;
 +      int mdio_addr;
 +      fw_port_cap32_t pcaps, acaps;
 +      int ret;
  
 -      memset(&c, 0, sizeof(c));
 -      c.op_to_portid = cpu_to_be32(FW_CMD_OP_V(FW_PORT_CMD) |
 -                                   FW_CMD_REQUEST_F | FW_CMD_READ_F |
 -                                   FW_PORT_CMD_PORTID_V(port));
 -      c.action_to_len16 = cpu_to_be32(
 -              FW_PORT_CMD_ACTION_V(FW_PORT_ACTION_GET_PORT_INFO) |
 -              FW_LEN16(c));
 -      ret = t4_wr_mbox(pi->adapter, mbox, &c, sizeof(c), &c);
 +      /* If we haven't yet determined whether we're talking to Firmware
 +       * which knows the new 32-bit Port Capabilities, it's time to find
 +       * out now.  This will also tell new Firmware to send us Port Status
 +       * Updates using the new 32-bit Port Capabilities version of the
 +       * Port Information message.
 +       */
 +      if (fw_caps == FW_CAPS_UNKNOWN) {
 +              u32 param, val;
 +
 +              param = (FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_PFVF) |
 +                       FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_PFVF_PORT_CAPS32));
 +              val = 1;
 +              ret = t4_set_params(adapter, mbox, pf, vf, 1, &param, &val);
 +              fw_caps = (ret == 0 ? FW_CAPS32 : FW_CAPS16);
 +              adapter->params.fw_caps_support = fw_caps;
 +      }
 +
 +      memset(&cmd, 0, sizeof(cmd));
 +      cmd.op_to_portid = cpu_to_be32(FW_CMD_OP_V(FW_PORT_CMD) |
 +                                     FW_CMD_REQUEST_F | FW_CMD_READ_F |
 +                                     FW_PORT_CMD_PORTID_V(port));
 +      cmd.action_to_len16 = cpu_to_be32(
 +              FW_PORT_CMD_ACTION_V(fw_caps == FW_CAPS16
 +                                   ? FW_PORT_ACTION_GET_PORT_INFO
 +                                   : FW_PORT_ACTION_GET_PORT_INFO32) |
 +              FW_LEN16(cmd));
 +      ret = t4_wr_mbox(pi->adapter, mbox, &cmd, sizeof(cmd), &cmd);
        if (ret)
                return ret;
  
 +      /* Extract the various fields from the Port Information message.
 +       */
 +      if (fw_caps == FW_CAPS16) {
 +              u32 lstatus = be32_to_cpu(cmd.u.info.lstatus_to_modtype);
 +
 +              port_type = FW_PORT_CMD_PTYPE_G(lstatus);
 +              mdio_addr = ((lstatus & FW_PORT_CMD_MDIOCAP_F)
 +                           ? FW_PORT_CMD_MDIOADDR_G(lstatus)
 +                           : -1);
 +              pcaps = fwcaps16_to_caps32(be16_to_cpu(cmd.u.info.pcap));
 +              acaps = fwcaps16_to_caps32(be16_to_cpu(cmd.u.info.acap));
 +      } else {
 +              u32 lstatus32 = be32_to_cpu(cmd.u.info32.lstatus32_to_cbllen32);
 +
 +              port_type = FW_PORT_CMD_PORTTYPE32_G(lstatus32);
 +              mdio_addr = ((lstatus32 & FW_PORT_CMD_MDIOCAP32_F)
 +                           ? FW_PORT_CMD_MDIOADDR32_G(lstatus32)
 +                           : -1);
 +              pcaps = be32_to_cpu(cmd.u.info32.pcaps32);
 +              acaps = be32_to_cpu(cmd.u.info32.acaps32);
 +      }
 +
        ret = t4_alloc_vi(pi->adapter, mbox, port, pf, vf, 1, mac, &rss_size);
        if (ret < 0)
                return ret;
        pi->lport = port;
        pi->rss_size = rss_size;
  
 -      ret = be32_to_cpu(c.u.info.lstatus_to_modtype);
 -      pi->mdio_addr = (ret & FW_PORT_CMD_MDIOCAP_F) ?
 -              FW_PORT_CMD_MDIOADDR_G(ret) : -1;
 -      pi->port_type = FW_PORT_CMD_PTYPE_G(ret);
 +      pi->port_type = port_type;
 +      pi->mdio_addr = mdio_addr;
        pi->mod_type = FW_PORT_MOD_TYPE_NA;
  
 -      init_link_config(&pi->link_cfg, be16_to_cpu(c.u.info.pcap),
 -                       be16_to_cpu(c.u.info.acap));
 +      init_link_config(&pi->link_cfg, pcaps, acaps);
        return 0;
  }
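The hunk above makes the Port Information query capabilities-aware: a one-time write of the FW_PARAMS_PARAM_PFVF_PORT_CAPS32 parameter both detects whether the firmware speaks the new 32-bit Port Capabilities and, when it does, switches it to the 32-bit Port Information format; the result is cached in adapter->params.fw_caps_support so the right FW_PORT_ACTION_GET_PORT_INFO / FW_PORT_ACTION_GET_PORT_INFO32 action is chosen without re-probing. A minimal sketch of that probe-and-fall-back step, factored into a hypothetical helper (the helper name is illustrative; the symbols are the ones used in the hunk):

/* Illustrative helper only -- not part of this commit. */
static unsigned int t4_probe_fw_caps(struct adapter *adapter, unsigned int mbox,
				     unsigned int pf, unsigned int vf)
{
	u32 param = FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_PFVF) |
		    FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_PFVF_PORT_CAPS32);
	u32 val = 1;

	/* Old firmware rejects the parameter and only speaks the legacy
	 * 16-bit capabilities; new firmware accepts it and switches to
	 * the 32-bit format.
	 */
	return t4_set_params(adapter, mbox, pf, vf, 1, &param, &val) ?
		FW_CAPS16 : FW_CAPS32;
}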
  
@@@ -9232,65 -8663,6 +9232,65 @@@ void t4_idma_monitor(struct adapter *ad
        }
  }
  
 +/**
 + *    t4_load_cfg - download config file
 + *    @adap: the adapter
 + *    @cfg_data: the cfg text file to write
 + *    @size: text file size
 + *
 + *    Write the supplied config text file to the card's serial flash.
 + */
 +int t4_load_cfg(struct adapter *adap, const u8 *cfg_data, unsigned int size)
 +{
 +      int ret, i, n, cfg_addr;
 +      unsigned int addr;
 +      unsigned int flash_cfg_start_sec;
 +      unsigned int sf_sec_size = adap->params.sf_size / adap->params.sf_nsec;
 +
 +      cfg_addr = t4_flash_cfg_addr(adap);
 +      if (cfg_addr < 0)
 +              return cfg_addr;
 +
 +      addr = cfg_addr;
 +      flash_cfg_start_sec = addr / SF_SEC_SIZE;
 +
 +      if (size > FLASH_CFG_MAX_SIZE) {
 +              dev_err(adap->pdev_dev, "cfg file too large, max is %u bytes\n",
 +                      FLASH_CFG_MAX_SIZE);
 +              return -EFBIG;
 +      }
 +
 +      i = DIV_ROUND_UP(FLASH_CFG_MAX_SIZE,    /* # of sectors spanned */
 +                       sf_sec_size);
 +      ret = t4_flash_erase_sectors(adap, flash_cfg_start_sec,
 +                                   flash_cfg_start_sec + i - 1);
 +      /* If size == 0 then we're simply erasing the FLASH sectors associated
 +       * with the on-adapter Firmware Configuration File.
 +       */
 +      if (ret || size == 0)
 +              goto out;
 +
 +      /* this will write to the flash up to SF_PAGE_SIZE at a time */
 +      for (i = 0; i < size; i += SF_PAGE_SIZE) {
 +              if ((size - i) < SF_PAGE_SIZE)
 +                      n = size - i;
 +              else
 +                      n = SF_PAGE_SIZE;
 +              ret = t4_write_flash(adap, addr, n, cfg_data);
 +              if (ret)
 +                      goto out;
 +
 +              addr += SF_PAGE_SIZE;
 +              cfg_data += SF_PAGE_SIZE;
 +      }
 +
 +out:
 +      if (ret)
 +              dev_err(adap->pdev_dev, "config file %s failed %d\n",
 +                      (size == 0 ? "clear" : "download"), ret);
 +      return ret;
 +}
 +
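t4_load_cfg() first erases the FLASH sectors backing the Firmware Configuration File and then, unless size is zero (which simply clears the stored file), writes the new contents one SF_PAGE_SIZE chunk at a time. A hedged usage sketch of the new export, with a hypothetical caller name and assuming the file was obtained through request_firmware():

/* Illustrative caller only -- not part of this commit. */
static int example_flash_cfg(struct adapter *adap, const struct firmware *fw)
{
	/* A NULL/zero-length file clears the on-adapter config file. */
	if (!fw)
		return t4_load_cfg(adap, NULL, 0);

	return t4_load_cfg(adap, fw->data, fw->size);
}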
  /**
   *    t4_set_vf_mac - Set MAC address for the specified VF
   *    @adapter: The adapter
index 05fe7123d5ae8868a1f4b1a0ef5f23de33b71ad6,59da7ac3c1087c03f52b514c3fc9b81074284a5e..9ed8e4b815304f1e443ccb7e4c863e800584b8ab
@@@ -1623,8 -1623,6 +1623,8 @@@ static const struct net_device_ops ftgm
  #ifdef CONFIG_NET_POLL_CONTROLLER
        .ndo_poll_controller    = ftgmac100_poll_controller,
  #endif
 +      .ndo_vlan_rx_add_vid    = ncsi_vlan_rx_add_vid,
 +      .ndo_vlan_rx_kill_vid   = ncsi_vlan_rx_kill_vid,
  };
  
  static int ftgmac100_setup_mdio(struct net_device *netdev)
@@@ -1839,9 -1837,6 +1839,9 @@@ static int ftgmac100_probe(struct platf
                NETIF_F_GRO | NETIF_F_SG | NETIF_F_HW_VLAN_CTAG_RX |
                NETIF_F_HW_VLAN_CTAG_TX;
  
 +      if (priv->use_ncsi)
 +              netdev->hw_features |= NETIF_F_HW_VLAN_CTAG_FILTER;
 +
        /* AST2400  doesn't have working HW checksum generation */
        if (np && (of_device_is_compatible(np, "aspeed,ast2400-mac")))
                netdev->hw_features &= ~NETIF_F_HW_CSUM;
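Taken together, the ftgmac100 hunks route VLAN filtering through the NC-SI core: the new ndo hooks hand VLAN ids to ncsi_vlan_rx_add_vid()/ncsi_vlan_rx_kill_vid(), and NETIF_F_HW_VLAN_CTAG_FILTER is advertised only when the interface is NC-SI managed (priv->use_ncsi), since in that configuration it is the NC-SI stack that programs the filter entries.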
@@@ -1868,7 -1863,6 +1868,6 @@@ err_setup_mdio
  err_ioremap:
        release_resource(priv->res);
  err_req_mem:
-       netif_napi_del(&priv->napi);
        free_netdev(netdev);
  err_alloc_etherdev:
        return err;
index 14cd2c8b00248298088337c62a288f8c99039ec3,1c7da16ad0ffe5de0bbed64f027ed765daec7464..387eb4a88b723f2de346b7335e4fd818478efa1d
@@@ -623,6 -623,8 +623,8 @@@ static struct platform_device *dpaa_eth
                goto no_mem;
        }
  
+       pdev->dev.of_node = node;
+       pdev->dev.parent = priv->dev;
        set_dma_ops(&pdev->dev, get_dma_ops(priv->dev));
  
        ret = platform_device_add_data(pdev, &data, sizeof(data));
@@@ -698,8 -700,8 +700,8 @@@ static int mac_probe(struct platform_de
                priv->internal_phy_node = of_parse_phandle(mac_node,
                                                          "pcsphy-handle", 0);
        } else {
 -              dev_err(dev, "MAC node (%s) contains unsupported MAC\n",
 -                      mac_node->full_name);
 +              dev_err(dev, "MAC node (%pOF) contains unsupported MAC\n",
 +                      mac_node);
                err = -EINVAL;
                goto _return;
        }
        /* Get the FM node */
        dev_node = of_get_parent(mac_node);
        if (!dev_node) {
 -              dev_err(dev, "of_get_parent(%s) failed\n",
 -                      mac_node->full_name);
 +              dev_err(dev, "of_get_parent(%pOF) failed\n",
 +                      mac_node);
                err = -EINVAL;
                goto _return_dev_set_drvdata;
        }
  
        of_dev = of_find_device_by_node(dev_node);
        if (!of_dev) {
 -              dev_err(dev, "of_find_device_by_node(%s) failed\n",
 -                      dev_node->full_name);
 +              dev_err(dev, "of_find_device_by_node(%pOF) failed\n", dev_node);
                err = -EINVAL;
                goto _return_of_node_put;
        }
        /* Get the FMan cell-index */
        err = of_property_read_u32(dev_node, "cell-index", &val);
        if (err) {
 -              dev_err(dev, "failed to read cell-index for %s\n",
 -                      dev_node->full_name);
 +              dev_err(dev, "failed to read cell-index for %pOF\n", dev_node);
                err = -EINVAL;
                goto _return_of_node_put;
        }
  
        priv->fman = fman_bind(&of_dev->dev);
        if (!priv->fman) {
 -              dev_err(dev, "fman_bind(%s) failed\n", dev_node->full_name);
 +              dev_err(dev, "fman_bind(%pOF) failed\n", dev_node);
                err = -ENODEV;
                goto _return_of_node_put;
        }
        /* Get the address of the memory mapped registers */
        err = of_address_to_resource(mac_node, 0, &res);
        if (err < 0) {
 -              dev_err(dev, "of_address_to_resource(%s) = %d\n",
 -                      mac_node->full_name, err);
 +              dev_err(dev, "of_address_to_resource(%pOF) = %d\n",
 +                      mac_node, err);
                goto _return_dev_set_drvdata;
        }
  
        /* Get the cell-index */
        err = of_property_read_u32(mac_node, "cell-index", &val);
        if (err) {
 -              dev_err(dev, "failed to read cell-index for %s\n",
 -                      mac_node->full_name);
 +              dev_err(dev, "failed to read cell-index for %pOF\n", mac_node);
                err = -EINVAL;
                goto _return_dev_set_drvdata;
        }
        /* Get the MAC address */
        mac_addr = of_get_mac_address(mac_node);
        if (!mac_addr) {
 -              dev_err(dev, "of_get_mac_address(%s) failed\n",
 -                      mac_node->full_name);
 +              dev_err(dev, "of_get_mac_address(%pOF) failed\n", mac_node);
                err = -EINVAL;
                goto _return_dev_set_drvdata;
        }
        /* Get the port handles */
        nph = of_count_phandle_with_args(mac_node, "fsl,fman-ports", NULL);
        if (unlikely(nph < 0)) {
 -              dev_err(dev, "of_count_phandle_with_args(%s, fsl,fman-ports) failed\n",
 -                      mac_node->full_name);
 +              dev_err(dev, "of_count_phandle_with_args(%pOF, fsl,fman-ports) failed\n",
 +                      mac_node);
                err = nph;
                goto _return_dev_set_drvdata;
        }
  
        if (nph != ARRAY_SIZE(mac_dev->port)) {
 -              dev_err(dev, "Not supported number of fman-ports handles of mac node %s from device tree\n",
 -                      mac_node->full_name);
 +              dev_err(dev, "Not supported number of fman-ports handles of mac node %pOF from device tree\n",
 +                      mac_node);
                err = -EINVAL;
                goto _return_dev_set_drvdata;
        }
                /* Find the port node */
                dev_node = of_parse_phandle(mac_node, "fsl,fman-ports", i);
                if (!dev_node) {
 -                      dev_err(dev, "of_parse_phandle(%s, fsl,fman-ports) failed\n",
 -                              mac_node->full_name);
 +                      dev_err(dev, "of_parse_phandle(%pOF, fsl,fman-ports) failed\n",
 +                              mac_node);
                        err = -EINVAL;
                        goto _return_of_node_put;
                }
  
                of_dev = of_find_device_by_node(dev_node);
                if (!of_dev) {
 -                      dev_err(dev, "of_find_device_by_node(%s) failed\n",
 -                              dev_node->full_name);
 +                      dev_err(dev, "of_find_device_by_node(%pOF) failed\n",
 +                              dev_node);
                        err = -EINVAL;
                        goto _return_of_node_put;
                }
  
                mac_dev->port[i] = fman_port_bind(&of_dev->dev);
                if (!mac_dev->port[i]) {
 -                      dev_err(dev, "dev_get_drvdata(%s) failed\n",
 -                              dev_node->full_name);
 +                      dev_err(dev, "dev_get_drvdata(%pOF) failed\n",
 +                              dev_node);
                        err = -EINVAL;
                        goto _return_of_node_put;
                }
        phy_if = of_get_phy_mode(mac_node);
        if (phy_if < 0) {
                dev_warn(dev,
 -                       "of_get_phy_mode() for %s failed. Defaulting to SGMII\n",
 -                       mac_node->full_name);
 +                       "of_get_phy_mode() for %pOF failed. Defaulting to SGMII\n",
 +                       mac_node);
                phy_if = PHY_INTERFACE_MODE_SGMII;
        }
        priv->phy_if = phy_if;
index f37c05fed5bcf3c601ef64c912c9f58051fd7686,4d598ca8503a50952576354ae35f1b3b6a574b6e..d5624894152e1e6317b50092cdda223837e0ad0f
@@@ -18,7 -18,6 +18,7 @@@
  #include <linux/inetdevice.h>
  #include <linux/mbus.h>
  #include <linux/module.h>
 +#include <linux/mfd/syscon.h>
  #include <linux/interrupt.h>
  #include <linux/cpumask.h>
  #include <linux/of.h>
  #include <linux/of_address.h>
  #include <linux/of_device.h>
  #include <linux/phy.h>
 +#include <linux/phy/phy.h>
  #include <linux/clk.h>
  #include <linux/hrtimer.h>
  #include <linux/ktime.h>
 +#include <linux/regmap.h>
  #include <uapi/linux/ppp_defs.h>
  #include <net/ip.h>
  #include <net/ipv6.h>
 +#include <net/tso.h>
  
  /* RX Fifo Registers */
  #define MVPP2_RX_DATA_FIFO_SIZE_REG(port)     (0x00 + 4 * (port))
  #define MVPP2_TXQ_DESC_ADDR_REG                       0x2084
  #define MVPP2_TXQ_DESC_SIZE_REG                       0x2088
  #define     MVPP2_TXQ_DESC_SIZE_MASK          0x3ff0
 +#define MVPP2_TXQ_THRESH_REG                  0x2094
 +#define           MVPP2_TXQ_THRESH_OFFSET             16
 +#define           MVPP2_TXQ_THRESH_MASK               0x3fff
  #define MVPP2_AGGR_TXQ_UPDATE_REG             0x2090
  #define MVPP2_TXQ_INDEX_REG                   0x2098
  #define MVPP2_TXQ_PREF_BUF_REG                        0x209c
  #define MVPP22_AXI_CODE_DOMAIN_SYSTEM         3
  
  /* Interrupt Cause and Mask registers */
 +#define MVPP2_ISR_TX_THRESHOLD_REG(port)      (0x5140 + 4 * (port))
 +#define     MVPP2_MAX_ISR_TX_THRESHOLD                0xfffff0
 +
  #define MVPP2_ISR_RX_THRESHOLD_REG(rxq)               (0x5200 + 4 * (rxq))
  #define     MVPP2_MAX_ISR_RX_THRESHOLD                0xfffff0
 -#define MVPP21_ISR_RXQ_GROUP_REG(rxq)         (0x5400 + 4 * (rxq))
 +#define MVPP21_ISR_RXQ_GROUP_REG(port)                (0x5400 + 4 * (port))
  
 -#define MVPP22_ISR_RXQ_GROUP_INDEX_REG          0x5400
 +#define MVPP22_ISR_RXQ_GROUP_INDEX_REG                0x5400
  #define MVPP22_ISR_RXQ_GROUP_INDEX_SUBGROUP_MASK 0xf
 -#define MVPP22_ISR_RXQ_GROUP_INDEX_GROUP_MASK   0x380
 -#define MVPP22_ISR_RXQ_GROUP_INDEX_GROUP_OFFSET 7
 +#define MVPP22_ISR_RXQ_GROUP_INDEX_GROUP_MASK 0x380
 +#define MVPP22_ISR_RXQ_GROUP_INDEX_GROUP_OFFSET       7
  
  #define MVPP22_ISR_RXQ_GROUP_INDEX_SUBGROUP_MASK 0xf
 -#define MVPP22_ISR_RXQ_GROUP_INDEX_GROUP_MASK   0x380
 +#define MVPP22_ISR_RXQ_GROUP_INDEX_GROUP_MASK 0x380
  
 -#define MVPP22_ISR_RXQ_SUB_GROUP_CONFIG_REG     0x5404
 -#define MVPP22_ISR_RXQ_SUB_GROUP_STARTQ_MASK    0x1f
 -#define MVPP22_ISR_RXQ_SUB_GROUP_SIZE_MASK      0xf00
 -#define MVPP22_ISR_RXQ_SUB_GROUP_SIZE_OFFSET    8
 +#define MVPP22_ISR_RXQ_SUB_GROUP_CONFIG_REG   0x5404
 +#define MVPP22_ISR_RXQ_SUB_GROUP_STARTQ_MASK  0x1f
 +#define MVPP22_ISR_RXQ_SUB_GROUP_SIZE_MASK    0xf00
 +#define MVPP22_ISR_RXQ_SUB_GROUP_SIZE_OFFSET  8
  
  #define MVPP2_ISR_ENABLE_REG(port)            (0x5420 + 4 * (port))
  #define     MVPP2_ISR_ENABLE_INTERRUPT(mask)  ((mask) & 0xffff)
  #define MVPP2_ISR_RX_TX_CAUSE_REG(port)               (0x5480 + 4 * (port))
  #define     MVPP2_CAUSE_RXQ_OCCUP_DESC_ALL_MASK       0xffff
  #define     MVPP2_CAUSE_TXQ_OCCUP_DESC_ALL_MASK       0xff0000
 +#define     MVPP2_CAUSE_TXQ_OCCUP_DESC_ALL_OFFSET     16
  #define     MVPP2_CAUSE_RX_FIFO_OVERRUN_MASK  BIT(24)
  #define     MVPP2_CAUSE_FCS_ERR_MASK          BIT(25)
  #define     MVPP2_CAUSE_TX_FIFO_UNDERRUN_MASK BIT(26)
  #define MVPP2_BM_VIRT_RLS_REG                 0x64c0
  #define MVPP22_BM_ADDR_HIGH_RLS_REG           0x64c4
  #define     MVPP22_BM_ADDR_HIGH_PHYS_RLS_MASK 0xff
 -#define           MVPP22_BM_ADDR_HIGH_VIRT_RLS_MASK   0xff00
 +#define     MVPP22_BM_ADDR_HIGH_VIRT_RLS_MASK 0xff00
  #define     MVPP22_BM_ADDR_HIGH_VIRT_RLS_SHIFT        8
  
  /* TX Scheduler registers */
  
  /* Per-port registers */
  #define MVPP2_GMAC_CTRL_0_REG                 0x0
 -#define      MVPP2_GMAC_PORT_EN_MASK          BIT(0)
 -#define      MVPP2_GMAC_MAX_RX_SIZE_OFFS      2
 -#define      MVPP2_GMAC_MAX_RX_SIZE_MASK      0x7ffc
 -#define      MVPP2_GMAC_MIB_CNTR_EN_MASK      BIT(15)
 +#define     MVPP2_GMAC_PORT_EN_MASK           BIT(0)
 +#define     MVPP2_GMAC_PORT_TYPE_MASK         BIT(1)
 +#define     MVPP2_GMAC_MAX_RX_SIZE_OFFS               2
 +#define     MVPP2_GMAC_MAX_RX_SIZE_MASK               0x7ffc
 +#define     MVPP2_GMAC_MIB_CNTR_EN_MASK               BIT(15)
  #define MVPP2_GMAC_CTRL_1_REG                 0x4
 -#define      MVPP2_GMAC_PERIODIC_XON_EN_MASK  BIT(1)
 -#define      MVPP2_GMAC_GMII_LB_EN_MASK               BIT(5)
 -#define      MVPP2_GMAC_PCS_LB_EN_BIT         6
 -#define      MVPP2_GMAC_PCS_LB_EN_MASK                BIT(6)
 -#define      MVPP2_GMAC_SA_LOW_OFFS           7
 +#define     MVPP2_GMAC_PERIODIC_XON_EN_MASK   BIT(1)
 +#define     MVPP2_GMAC_GMII_LB_EN_MASK                BIT(5)
 +#define     MVPP2_GMAC_PCS_LB_EN_BIT          6
 +#define     MVPP2_GMAC_PCS_LB_EN_MASK         BIT(6)
 +#define     MVPP2_GMAC_SA_LOW_OFFS            7
  #define MVPP2_GMAC_CTRL_2_REG                 0x8
 -#define      MVPP2_GMAC_INBAND_AN_MASK                BIT(0)
 -#define      MVPP2_GMAC_PCS_ENABLE_MASK               BIT(3)
 -#define      MVPP2_GMAC_PORT_RGMII_MASK               BIT(4)
 -#define      MVPP2_GMAC_PORT_RESET_MASK               BIT(6)
 +#define     MVPP2_GMAC_INBAND_AN_MASK         BIT(0)
 +#define     MVPP2_GMAC_FLOW_CTRL_MASK         GENMASK(2, 1)
 +#define     MVPP2_GMAC_PCS_ENABLE_MASK                BIT(3)
 +#define     MVPP2_GMAC_PORT_RGMII_MASK                BIT(4)
 +#define     MVPP2_GMAC_DISABLE_PADDING                BIT(5)
 +#define     MVPP2_GMAC_PORT_RESET_MASK                BIT(6)
  #define MVPP2_GMAC_AUTONEG_CONFIG             0xc
 -#define      MVPP2_GMAC_FORCE_LINK_DOWN               BIT(0)
 -#define      MVPP2_GMAC_FORCE_LINK_PASS               BIT(1)
 -#define      MVPP2_GMAC_CONFIG_MII_SPEED      BIT(5)
 -#define      MVPP2_GMAC_CONFIG_GMII_SPEED     BIT(6)
 -#define      MVPP2_GMAC_AN_SPEED_EN           BIT(7)
 -#define      MVPP2_GMAC_FC_ADV_EN             BIT(9)
 -#define      MVPP2_GMAC_CONFIG_FULL_DUPLEX    BIT(12)
 -#define      MVPP2_GMAC_AN_DUPLEX_EN          BIT(13)
 +#define     MVPP2_GMAC_FORCE_LINK_DOWN                BIT(0)
 +#define     MVPP2_GMAC_FORCE_LINK_PASS                BIT(1)
 +#define     MVPP2_GMAC_IN_BAND_AUTONEG                BIT(2)
 +#define     MVPP2_GMAC_IN_BAND_AUTONEG_BYPASS BIT(3)
 +#define     MVPP2_GMAC_CONFIG_MII_SPEED       BIT(5)
 +#define     MVPP2_GMAC_CONFIG_GMII_SPEED      BIT(6)
 +#define     MVPP2_GMAC_AN_SPEED_EN            BIT(7)
 +#define     MVPP2_GMAC_FC_ADV_EN              BIT(9)
 +#define     MVPP2_GMAC_FLOW_CTRL_AUTONEG      BIT(11)
 +#define     MVPP2_GMAC_CONFIG_FULL_DUPLEX     BIT(12)
 +#define     MVPP2_GMAC_AN_DUPLEX_EN           BIT(13)
 +#define MVPP2_GMAC_STATUS0                    0x10
 +#define     MVPP2_GMAC_STATUS0_LINK_UP                BIT(0)
  #define MVPP2_GMAC_PORT_FIFO_CFG_1_REG                0x1c
 -#define      MVPP2_GMAC_TX_FIFO_MIN_TH_OFFS   6
 -#define      MVPP2_GMAC_TX_FIFO_MIN_TH_ALL_MASK       0x1fc0
 -#define      MVPP2_GMAC_TX_FIFO_MIN_TH_MASK(v)        (((v) << 6) & \
 +#define     MVPP2_GMAC_TX_FIFO_MIN_TH_OFFS    6
 +#define     MVPP2_GMAC_TX_FIFO_MIN_TH_ALL_MASK        0x1fc0
 +#define     MVPP2_GMAC_TX_FIFO_MIN_TH_MASK(v) (((v) << 6) & \
                                        MVPP2_GMAC_TX_FIFO_MIN_TH_ALL_MASK)
 +#define MVPP22_GMAC_INT_STAT                  0x20
 +#define     MVPP22_GMAC_INT_STAT_LINK         BIT(1)
 +#define MVPP22_GMAC_INT_MASK                  0x24
 +#define     MVPP22_GMAC_INT_MASK_LINK_STAT    BIT(1)
  #define MVPP22_GMAC_CTRL_4_REG                        0x90
 -#define      MVPP22_CTRL4_EXT_PIN_GMII_SEL    BIT(0)
 -#define      MVPP22_CTRL4_DP_CLK_SEL          BIT(5)
 -#define      MVPP22_CTRL4_SYNC_BYPASS         BIT(6)
 -#define      MVPP22_CTRL4_QSGMII_BYPASS_ACTIVE        BIT(7)
 +#define     MVPP22_CTRL4_EXT_PIN_GMII_SEL     BIT(0)
 +#define     MVPP22_CTRL4_DP_CLK_SEL           BIT(5)
 +#define     MVPP22_CTRL4_SYNC_BYPASS_DIS      BIT(6)
 +#define     MVPP22_CTRL4_QSGMII_BYPASS_ACTIVE BIT(7)
 +#define MVPP22_GMAC_INT_SUM_MASK              0xa4
 +#define     MVPP22_GMAC_INT_SUM_MASK_LINK_STAT        BIT(1)
  
  /* Per-port XGMAC registers. PPv2.2 only, only for GOP port 0,
   * relative to port->base.
   */
  #define MVPP22_XLG_CTRL0_REG                  0x100
 -#define      MVPP22_XLG_CTRL0_PORT_EN         BIT(0)
 -#define      MVPP22_XLG_CTRL0_MAC_RESET_DIS   BIT(1)
 -#define      MVPP22_XLG_CTRL0_MIB_CNT_DIS     BIT(14)
 -
 +#define     MVPP22_XLG_CTRL0_PORT_EN          BIT(0)
 +#define     MVPP22_XLG_CTRL0_MAC_RESET_DIS    BIT(1)
 +#define     MVPP22_XLG_CTRL0_RX_FLOW_CTRL_EN  BIT(7)
 +#define     MVPP22_XLG_CTRL0_MIB_CNT_DIS      BIT(14)
 +#define MVPP22_XLG_CTRL1_REG                  0x104
 +#define     MVPP22_XLG_CTRL1_FRAMESIZELIMIT_OFFS      0
 +#define     MVPP22_XLG_CTRL1_FRAMESIZELIMIT_MASK      0x1fff
 +#define MVPP22_XLG_STATUS                     0x10c
 +#define     MVPP22_XLG_STATUS_LINK_UP         BIT(0)
 +#define MVPP22_XLG_INT_STAT                   0x114
 +#define     MVPP22_XLG_INT_STAT_LINK          BIT(1)
 +#define MVPP22_XLG_INT_MASK                   0x118
 +#define     MVPP22_XLG_INT_MASK_LINK          BIT(1)
  #define MVPP22_XLG_CTRL3_REG                  0x11c
 -#define      MVPP22_XLG_CTRL3_MACMODESELECT_MASK      (7 << 13)
 -#define      MVPP22_XLG_CTRL3_MACMODESELECT_GMAC      (0 << 13)
 -#define      MVPP22_XLG_CTRL3_MACMODESELECT_10G               (1 << 13)
 +#define     MVPP22_XLG_CTRL3_MACMODESELECT_MASK       (7 << 13)
 +#define     MVPP22_XLG_CTRL3_MACMODESELECT_GMAC       (0 << 13)
 +#define     MVPP22_XLG_CTRL3_MACMODESELECT_10G        (1 << 13)
 +#define MVPP22_XLG_EXT_INT_MASK                       0x15c
 +#define     MVPP22_XLG_EXT_INT_MASK_XLG               BIT(1)
 +#define     MVPP22_XLG_EXT_INT_MASK_GIG               BIT(2)
 +#define MVPP22_XLG_CTRL4_REG                  0x184
 +#define     MVPP22_XLG_CTRL4_FWD_FC           BIT(5)
 +#define     MVPP22_XLG_CTRL4_FWD_PFC          BIT(6)
 +#define     MVPP22_XLG_CTRL4_MACMODSELECT_GMAC        BIT(12)
  
  /* SMI registers. PPv2.2 only, relative to priv->iface_base. */
  #define MVPP22_SMI_MISC_CFG_REG                       0x1204
 -#define      MVPP22_SMI_POLLING_EN            BIT(10)
 +#define     MVPP22_SMI_POLLING_EN             BIT(10)
  
  #define MVPP22_GMAC_BASE(port)                (0x7000 + (port) * 0x1000 + 0xe00)
  
  #define MVPP2_QUEUE_NEXT_DESC(q, index) \
        (((index) < (q)->last_desc) ? ((index) + 1) : 0)
  
 +/* MPCS registers. PPv2.2 only */
 +#define MVPP22_MPCS_BASE(port)                        (0x7000 + (port) * 0x1000)
 +#define MVPP22_MPCS_CTRL                      0x14
 +#define     MVPP22_MPCS_CTRL_FWD_ERR_CONN     BIT(10)
 +#define MVPP22_MPCS_CLK_RESET                 0x14c
 +#define     MAC_CLK_RESET_SD_TX                       BIT(0)
 +#define     MAC_CLK_RESET_SD_RX                       BIT(1)
 +#define     MAC_CLK_RESET_MAC                 BIT(2)
 +#define     MVPP22_MPCS_CLK_RESET_DIV_RATIO(n)        ((n) << 4)
 +#define     MVPP22_MPCS_CLK_RESET_DIV_SET     BIT(11)
 +
 +/* XPCS registers. PPv2.2 only */
 +#define MVPP22_XPCS_BASE(port)                        (0x7400 + (port) * 0x1000)
 +#define MVPP22_XPCS_CFG0                      0x0
 +#define     MVPP22_XPCS_CFG0_PCS_MODE(n)      ((n) << 3)
 +#define     MVPP22_XPCS_CFG0_ACTIVE_LANE(n)   ((n) << 5)
 +
 +/* System controller registers. Accessed through a regmap. */
 +#define GENCONF_SOFT_RESET1                           0x1108
 +#define     GENCONF_SOFT_RESET1_GOP                   BIT(6)
 +#define GENCONF_PORT_CTRL0                            0x1110
 +#define     GENCONF_PORT_CTRL0_BUS_WIDTH_SELECT               BIT(1)
 +#define     GENCONF_PORT_CTRL0_RX_DATA_SAMPLE         BIT(29)
 +#define     GENCONF_PORT_CTRL0_CLK_DIV_PHASE_CLR      BIT(31)
 +#define GENCONF_PORT_CTRL1                            0x1114
 +#define     GENCONF_PORT_CTRL1_EN(p)                  BIT(p)
 +#define     GENCONF_PORT_CTRL1_RESET(p)                       (BIT(p) << 28)
 +#define GENCONF_CTRL0                                 0x1120
 +#define     GENCONF_CTRL0_PORT0_RGMII                 BIT(0)
 +#define     GENCONF_CTRL0_PORT1_RGMII_MII             BIT(1)
 +#define     GENCONF_CTRL0_PORT1_RGMII                 BIT(2)
 +
  /* Various constants */
  
  /* Coalescing */
  #define MVPP2_TXDONE_COAL_PKTS_THRESH 15
  #define MVPP2_TXDONE_HRTIMER_PERIOD_NS        1000000UL
 +#define MVPP2_TXDONE_COAL_USEC                1000
  #define MVPP2_RX_COAL_PKTS            32
  #define MVPP2_RX_COAL_USEC            100
  
@@@ -759,8 -685,7 +759,8 @@@ enum mvpp2_prs_l3_cast 
  #define MVPP21_ADDR_SPACE_SZ          0
  #define MVPP22_ADDR_SPACE_SZ          SZ_64K
  
 -#define MVPP2_MAX_CPUS                        4
 +#define MVPP2_MAX_THREADS             8
 +#define MVPP2_MAX_QVECS                       MVPP2_MAX_THREADS
  
  enum mvpp2_bm_type {
        MVPP2_BM_FREE,
@@@ -776,17 -701,11 +776,17 @@@ struct mvpp2 
        void __iomem *lms_base;
        void __iomem *iface_base;
  
 -      /* On PPv2.2, each CPU can access the base register through a
 -       * separate address space, each 64 KB apart from each
 -       * other.
 +      /* On PPv2.2, each "software thread" can access the base
 +       * register through a separate address space, each 64 KB apart
 +       * from each other. Typically, such address spaces will be
 +       * used per CPU.
 +       */
 +      void __iomem *swth_base[MVPP2_MAX_THREADS];
 +
 +      /* On PPv2.2, some port control registers are located in the system
 +       * controller space. These registers are accessible through a regmap.
         */
 -      void __iomem *cpu_base[MVPP2_MAX_CPUS];
 +      struct regmap *sysctrl_base;
  
        /* Common clocks */
        struct clk *pp_clk;
@@@ -833,18 -752,6 +833,18 @@@ struct mvpp2_port_pcpu 
        struct tasklet_struct tx_done_tasklet;
  };
  
 +struct mvpp2_queue_vector {
 +      int irq;
 +      struct napi_struct napi;
 +      enum { MVPP2_QUEUE_VECTOR_SHARED, MVPP2_QUEUE_VECTOR_PRIVATE } type;
 +      int sw_thread_id;
 +      u16 sw_thread_mask;
 +      int first_rxq;
 +      int nrxqs;
 +      u32 pending_cause_rx;
 +      struct mvpp2_port *port;
 +};
 +
  struct mvpp2_port {
        u8 id;
  
         */
        int gop_id;
  
 -      int irq;
 +      int link_irq;
  
        struct mvpp2 *priv;
  
        void __iomem *base;
  
        struct mvpp2_rx_queue **rxqs;
 +      unsigned int nrxqs;
        struct mvpp2_tx_queue **txqs;
 +      unsigned int ntxqs;
        struct net_device *dev;
  
        int pkt_size;
  
 -      u32 pending_cause_rx;
 -      struct napi_struct napi;
 -
        /* Per-CPU port control */
        struct mvpp2_port_pcpu __percpu *pcpu;
  
  
        phy_interface_t phy_interface;
        struct device_node *phy_node;
 +      struct phy *comphy;
        unsigned int link;
        unsigned int duplex;
        unsigned int speed;
  
        /* Index of first port's physical RXQ */
        u8 first_rxq;
 +
 +      struct mvpp2_queue_vector qvecs[MVPP2_MAX_QVECS];
 +      unsigned int nqvecs;
 +      bool has_tx_irqs;
 +
 +      u32 tx_time_coal;
  };
  
  /* The mvpp2_tx_desc and mvpp2_rx_desc structures describe the
@@@ -1031,10 -932,6 +1031,10 @@@ struct mvpp2_txq_pcpu 
  
        /* Index of the TX DMA descriptor to be cleaned up */
        int txq_get_index;
 +
 +      /* DMA buffer for TSO headers */
 +      char *tso_headers;
 +      dma_addr_t tso_headers_dma;
  };
  
  struct mvpp2_tx_queue {
@@@ -1165,14 -1062,12 +1165,14 @@@ struct mvpp2_bm_pool 
        u32 port_map;
  };
  
 -/* Static declaractions */
 +/* Queue modes */
 +#define MVPP2_QDIST_SINGLE_MODE       0
 +#define MVPP2_QDIST_MULTI_MODE        1
  
 -/* Number of RXQs used by single port */
 -static int rxq_number = MVPP2_DEFAULT_RXQ;
 -/* Number of TXQs used by single port */
 -static int txq_number = MVPP2_MAX_TXQ;
 +static int queue_mode = MVPP2_QDIST_SINGLE_MODE;
 +
 +module_param(queue_mode, int, 0444);
 +MODULE_PARM_DESC(queue_mode, "Set queue_mode (single=0, multi=1)");
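The global rxq_number/txq_number variables are replaced by per-port counts plus a load-time queue_mode knob. Because the parameter is registered with mode 0444 it is read-only at runtime; the distribution mode is picked when the module is loaded, e.g. passing queue_mode=1 to modprobe selects MVPP2_QDIST_MULTI_MODE.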
  
  #define MVPP2_DRIVER_NAME "mvpp2"
  #define MVPP2_DRIVER_VERSION "1.0"
  
  static void mvpp2_write(struct mvpp2 *priv, u32 offset, u32 data)
  {
 -      writel(data, priv->cpu_base[0] + offset);
 +      writel(data, priv->swth_base[0] + offset);
  }
  
  static u32 mvpp2_read(struct mvpp2 *priv, u32 offset)
  {
 -      return readl(priv->cpu_base[0] + offset);
 +      return readl(priv->swth_base[0] + offset);
  }
  
  /* These accessors should be used to access:
  static void mvpp2_percpu_write(struct mvpp2 *priv, int cpu,
                               u32 offset, u32 data)
  {
 -      writel(data, priv->cpu_base[cpu] + offset);
 +      writel(data, priv->swth_base[cpu] + offset);
  }
  
  static u32 mvpp2_percpu_read(struct mvpp2 *priv, int cpu,
                             u32 offset)
  {
 -      return readl(priv->cpu_base[cpu] + offset);
 +      return readl(priv->swth_base[cpu] + offset);
  }
  
  static dma_addr_t mvpp2_txdesc_dma_addr_get(struct mvpp2_port *port,
@@@ -4175,7 -4070,7 +4175,7 @@@ static int mvpp2_swf_bm_pool_init(struc
  
                port->pool_long->port_map |= (1 << port->id);
  
 -              for (rxq = 0; rxq < rxq_number; rxq++)
 +              for (rxq = 0; rxq < port->nrxqs; rxq++)
                        mvpp2_rxq_long_pool_set(port, rxq, port->pool_long->id);
        }
  
  
                port->pool_short->port_map |= (1 << port->id);
  
 -              for (rxq = 0; rxq < rxq_number; rxq++)
 +              for (rxq = 0; rxq < port->nrxqs; rxq++)
                        mvpp2_rxq_short_pool_set(port, rxq,
                                                 port->pool_short->id);
        }
@@@ -4230,40 -4125,22 +4230,40 @@@ static int mvpp2_bm_update_mtu(struct n
  
  static inline void mvpp2_interrupts_enable(struct mvpp2_port *port)
  {
 -      int cpu, cpu_mask = 0;
 +      int i, sw_thread_mask = 0;
 +
 +      for (i = 0; i < port->nqvecs; i++)
 +              sw_thread_mask |= port->qvecs[i].sw_thread_mask;
  
 -      for_each_present_cpu(cpu)
 -              cpu_mask |= 1 << cpu;
        mvpp2_write(port->priv, MVPP2_ISR_ENABLE_REG(port->id),
 -                  MVPP2_ISR_ENABLE_INTERRUPT(cpu_mask));
 +                  MVPP2_ISR_ENABLE_INTERRUPT(sw_thread_mask));
  }
  
  static inline void mvpp2_interrupts_disable(struct mvpp2_port *port)
  {
 -      int cpu, cpu_mask = 0;
 +      int i, sw_thread_mask = 0;
 +
 +      for (i = 0; i < port->nqvecs; i++)
 +              sw_thread_mask |= port->qvecs[i].sw_thread_mask;
 +
 +      mvpp2_write(port->priv, MVPP2_ISR_ENABLE_REG(port->id),
 +                  MVPP2_ISR_DISABLE_INTERRUPT(sw_thread_mask));
 +}
 +
 +static inline void mvpp2_qvec_interrupt_enable(struct mvpp2_queue_vector *qvec)
 +{
 +      struct mvpp2_port *port = qvec->port;
 +
 +      mvpp2_write(port->priv, MVPP2_ISR_ENABLE_REG(port->id),
 +                  MVPP2_ISR_ENABLE_INTERRUPT(qvec->sw_thread_mask));
 +}
 +
 +static inline void mvpp2_qvec_interrupt_disable(struct mvpp2_queue_vector *qvec)
 +{
 +      struct mvpp2_port *port = qvec->port;
  
 -      for_each_present_cpu(cpu)
 -              cpu_mask |= 1 << cpu;
        mvpp2_write(port->priv, MVPP2_ISR_ENABLE_REG(port->id),
 -                  MVPP2_ISR_DISABLE_INTERRUPT(cpu_mask));
 +                  MVPP2_ISR_DISABLE_INTERRUPT(qvec->sw_thread_mask));
  }
  
  /* Mask the current CPU's Rx/Tx interrupts
@@@ -4285,346 -4162,15 +4285,346 @@@ static void mvpp2_interrupts_mask(void 
  static void mvpp2_interrupts_unmask(void *arg)
  {
        struct mvpp2_port *port = arg;
 +      u32 val;
 +
 +      val = MVPP2_CAUSE_MISC_SUM_MASK |
 +              MVPP2_CAUSE_RXQ_OCCUP_DESC_ALL_MASK;
 +      if (port->has_tx_irqs)
 +              val |= MVPP2_CAUSE_TXQ_OCCUP_DESC_ALL_MASK;
  
        mvpp2_percpu_write(port->priv, smp_processor_id(),
 -                         MVPP2_ISR_RX_TX_MASK_REG(port->id),
 -                         (MVPP2_CAUSE_MISC_SUM_MASK |
 -                          MVPP2_CAUSE_RXQ_OCCUP_DESC_ALL_MASK));
 +                         MVPP2_ISR_RX_TX_MASK_REG(port->id), val);
 +}
 +
 +static void
 +mvpp2_shared_interrupt_mask_unmask(struct mvpp2_port *port, bool mask)
 +{
 +      u32 val;
 +      int i;
 +
 +      if (port->priv->hw_version != MVPP22)
 +              return;
 +
 +      if (mask)
 +              val = 0;
 +      else
 +              val = MVPP2_CAUSE_RXQ_OCCUP_DESC_ALL_MASK;
 +
 +      for (i = 0; i < port->nqvecs; i++) {
 +              struct mvpp2_queue_vector *v = port->qvecs + i;
 +
 +              if (v->type != MVPP2_QUEUE_VECTOR_SHARED)
 +                      continue;
 +
 +              mvpp2_percpu_write(port->priv, v->sw_thread_id,
 +                                 MVPP2_ISR_RX_TX_MASK_REG(port->id), val);
 +      }
  }
  
  /* Port configuration routines */
  
 +static void mvpp22_gop_init_rgmii(struct mvpp2_port *port)
 +{
 +      struct mvpp2 *priv = port->priv;
 +      u32 val;
 +
 +      regmap_read(priv->sysctrl_base, GENCONF_PORT_CTRL0, &val);
 +      val |= GENCONF_PORT_CTRL0_BUS_WIDTH_SELECT;
 +      regmap_write(priv->sysctrl_base, GENCONF_PORT_CTRL0, val);
 +
 +      regmap_read(priv->sysctrl_base, GENCONF_CTRL0, &val);
 +      if (port->gop_id == 2)
 +              val |= GENCONF_CTRL0_PORT0_RGMII | GENCONF_CTRL0_PORT1_RGMII;
 +      else if (port->gop_id == 3)
 +              val |= GENCONF_CTRL0_PORT1_RGMII_MII;
 +      regmap_write(priv->sysctrl_base, GENCONF_CTRL0, val);
 +}
 +
 +static void mvpp22_gop_init_sgmii(struct mvpp2_port *port)
 +{
 +      struct mvpp2 *priv = port->priv;
 +      u32 val;
 +
 +      regmap_read(priv->sysctrl_base, GENCONF_PORT_CTRL0, &val);
 +      val |= GENCONF_PORT_CTRL0_BUS_WIDTH_SELECT |
 +             GENCONF_PORT_CTRL0_RX_DATA_SAMPLE;
 +      regmap_write(priv->sysctrl_base, GENCONF_PORT_CTRL0, val);
 +
 +      if (port->gop_id > 1) {
 +              regmap_read(priv->sysctrl_base, GENCONF_CTRL0, &val);
 +              if (port->gop_id == 2)
 +                      val &= ~GENCONF_CTRL0_PORT0_RGMII;
 +              else if (port->gop_id == 3)
 +                      val &= ~GENCONF_CTRL0_PORT1_RGMII_MII;
 +              regmap_write(priv->sysctrl_base, GENCONF_CTRL0, val);
 +      }
 +}
 +
 +static void mvpp22_gop_init_10gkr(struct mvpp2_port *port)
 +{
 +      struct mvpp2 *priv = port->priv;
 +      void __iomem *mpcs = priv->iface_base + MVPP22_MPCS_BASE(port->gop_id);
 +      void __iomem *xpcs = priv->iface_base + MVPP22_XPCS_BASE(port->gop_id);
 +      u32 val;
 +
 +      /* XPCS */
 +      val = readl(xpcs + MVPP22_XPCS_CFG0);
 +      val &= ~(MVPP22_XPCS_CFG0_PCS_MODE(0x3) |
 +               MVPP22_XPCS_CFG0_ACTIVE_LANE(0x3));
 +      val |= MVPP22_XPCS_CFG0_ACTIVE_LANE(2);
 +      writel(val, xpcs + MVPP22_XPCS_CFG0);
 +
 +      /* MPCS */
 +      val = readl(mpcs + MVPP22_MPCS_CTRL);
 +      val &= ~MVPP22_MPCS_CTRL_FWD_ERR_CONN;
 +      writel(val, mpcs + MVPP22_MPCS_CTRL);
 +
 +      val = readl(mpcs + MVPP22_MPCS_CLK_RESET);
 +      val &= ~(MVPP22_MPCS_CLK_RESET_DIV_RATIO(0x7) | MAC_CLK_RESET_MAC |
 +               MAC_CLK_RESET_SD_RX | MAC_CLK_RESET_SD_TX);
 +      val |= MVPP22_MPCS_CLK_RESET_DIV_RATIO(1);
 +      writel(val, mpcs + MVPP22_MPCS_CLK_RESET);
 +
 +      val &= ~MVPP22_MPCS_CLK_RESET_DIV_SET;
 +      val |= MAC_CLK_RESET_MAC | MAC_CLK_RESET_SD_RX | MAC_CLK_RESET_SD_TX;
 +      writel(val, mpcs + MVPP22_MPCS_CLK_RESET);
 +}
 +
 +static int mvpp22_gop_init(struct mvpp2_port *port)
 +{
 +      struct mvpp2 *priv = port->priv;
 +      u32 val;
 +
 +      if (!priv->sysctrl_base)
 +              return 0;
 +
 +      switch (port->phy_interface) {
 +      case PHY_INTERFACE_MODE_RGMII:
 +      case PHY_INTERFACE_MODE_RGMII_ID:
 +      case PHY_INTERFACE_MODE_RGMII_RXID:
 +      case PHY_INTERFACE_MODE_RGMII_TXID:
 +              if (port->gop_id == 0)
 +                      goto invalid_conf;
 +              mvpp22_gop_init_rgmii(port);
 +              break;
 +      case PHY_INTERFACE_MODE_SGMII:
 +              mvpp22_gop_init_sgmii(port);
 +              break;
 +      case PHY_INTERFACE_MODE_10GKR:
 +              if (port->gop_id != 0)
 +                      goto invalid_conf;
 +              mvpp22_gop_init_10gkr(port);
 +              break;
 +      default:
 +              goto unsupported_conf;
 +      }
 +
 +      regmap_read(priv->sysctrl_base, GENCONF_PORT_CTRL1, &val);
 +      val |= GENCONF_PORT_CTRL1_RESET(port->gop_id) |
 +             GENCONF_PORT_CTRL1_EN(port->gop_id);
 +      regmap_write(priv->sysctrl_base, GENCONF_PORT_CTRL1, val);
 +
 +      regmap_read(priv->sysctrl_base, GENCONF_PORT_CTRL0, &val);
 +      val |= GENCONF_PORT_CTRL0_CLK_DIV_PHASE_CLR;
 +      regmap_write(priv->sysctrl_base, GENCONF_PORT_CTRL0, val);
 +
 +      regmap_read(priv->sysctrl_base, GENCONF_SOFT_RESET1, &val);
 +      val |= GENCONF_SOFT_RESET1_GOP;
 +      regmap_write(priv->sysctrl_base, GENCONF_SOFT_RESET1, val);
 +
 +unsupported_conf:
 +      return 0;
 +
 +invalid_conf:
 +      netdev_err(port->dev, "Invalid port configuration\n");
 +      return -EINVAL;
 +}
 +
 +static void mvpp22_gop_unmask_irq(struct mvpp2_port *port)
 +{
 +      u32 val;
 +
 +      if (phy_interface_mode_is_rgmii(port->phy_interface) ||
 +          port->phy_interface == PHY_INTERFACE_MODE_SGMII) {
 +              /* Enable the GMAC link status irq for this port */
 +              val = readl(port->base + MVPP22_GMAC_INT_SUM_MASK);
 +              val |= MVPP22_GMAC_INT_SUM_MASK_LINK_STAT;
 +              writel(val, port->base + MVPP22_GMAC_INT_SUM_MASK);
 +      }
 +
 +      if (port->gop_id == 0) {
 +              /* Enable the XLG/GIG irqs for this port */
 +              val = readl(port->base + MVPP22_XLG_EXT_INT_MASK);
 +              if (port->phy_interface == PHY_INTERFACE_MODE_10GKR)
 +                      val |= MVPP22_XLG_EXT_INT_MASK_XLG;
 +              else
 +                      val |= MVPP22_XLG_EXT_INT_MASK_GIG;
 +              writel(val, port->base + MVPP22_XLG_EXT_INT_MASK);
 +      }
 +}
 +
 +static void mvpp22_gop_mask_irq(struct mvpp2_port *port)
 +{
 +      u32 val;
 +
 +      if (port->gop_id == 0) {
 +              val = readl(port->base + MVPP22_XLG_EXT_INT_MASK);
 +              val &= ~(MVPP22_XLG_EXT_INT_MASK_XLG |
 +                       MVPP22_XLG_EXT_INT_MASK_GIG);
 +              writel(val, port->base + MVPP22_XLG_EXT_INT_MASK);
 +      }
 +
 +      if (phy_interface_mode_is_rgmii(port->phy_interface) ||
 +          port->phy_interface == PHY_INTERFACE_MODE_SGMII) {
 +              val = readl(port->base + MVPP22_GMAC_INT_SUM_MASK);
 +              val &= ~MVPP22_GMAC_INT_SUM_MASK_LINK_STAT;
 +              writel(val, port->base + MVPP22_GMAC_INT_SUM_MASK);
 +      }
 +}
 +
 +static void mvpp22_gop_setup_irq(struct mvpp2_port *port)
 +{
 +      u32 val;
 +
 +      if (phy_interface_mode_is_rgmii(port->phy_interface) ||
 +          port->phy_interface == PHY_INTERFACE_MODE_SGMII) {
 +              val = readl(port->base + MVPP22_GMAC_INT_MASK);
 +              val |= MVPP22_GMAC_INT_MASK_LINK_STAT;
 +              writel(val, port->base + MVPP22_GMAC_INT_MASK);
 +      }
 +
 +      if (port->gop_id == 0) {
 +              val = readl(port->base + MVPP22_XLG_INT_MASK);
 +              val |= MVPP22_XLG_INT_MASK_LINK;
 +              writel(val, port->base + MVPP22_XLG_INT_MASK);
 +      }
 +
 +      mvpp22_gop_unmask_irq(port);
 +}
 +
 +static int mvpp22_comphy_init(struct mvpp2_port *port)
 +{
 +      enum phy_mode mode;
 +      int ret;
 +
 +      if (!port->comphy)
 +              return 0;
 +
 +      switch (port->phy_interface) {
 +      case PHY_INTERFACE_MODE_SGMII:
 +              mode = PHY_MODE_SGMII;
 +              break;
 +      case PHY_INTERFACE_MODE_10GKR:
 +              mode = PHY_MODE_10GKR;
 +              break;
 +      default:
 +              return -EINVAL;
 +      }
 +
 +      ret = phy_set_mode(port->comphy, mode);
 +      if (ret)
 +              return ret;
 +
 +      return phy_power_on(port->comphy);
 +}
 +
 +static void mvpp2_port_mii_gmac_configure_mode(struct mvpp2_port *port)
 +{
 +      u32 val;
 +
 +      if (port->phy_interface == PHY_INTERFACE_MODE_SGMII) {
 +              val = readl(port->base + MVPP22_GMAC_CTRL_4_REG);
 +              val |= MVPP22_CTRL4_SYNC_BYPASS_DIS | MVPP22_CTRL4_DP_CLK_SEL |
 +                     MVPP22_CTRL4_QSGMII_BYPASS_ACTIVE;
 +              val &= ~MVPP22_CTRL4_EXT_PIN_GMII_SEL;
 +              writel(val, port->base + MVPP22_GMAC_CTRL_4_REG);
 +
 +              val = readl(port->base + MVPP2_GMAC_CTRL_2_REG);
 +              val |= MVPP2_GMAC_DISABLE_PADDING;
 +              val &= ~MVPP2_GMAC_FLOW_CTRL_MASK;
 +              writel(val, port->base + MVPP2_GMAC_CTRL_2_REG);
 +      } else if (phy_interface_mode_is_rgmii(port->phy_interface)) {
 +              val = readl(port->base + MVPP22_GMAC_CTRL_4_REG);
 +              val |= MVPP22_CTRL4_EXT_PIN_GMII_SEL |
 +                     MVPP22_CTRL4_SYNC_BYPASS_DIS |
 +                     MVPP22_CTRL4_QSGMII_BYPASS_ACTIVE;
 +              val &= ~MVPP22_CTRL4_DP_CLK_SEL;
 +              writel(val, port->base + MVPP22_GMAC_CTRL_4_REG);
 +
 +              val = readl(port->base + MVPP2_GMAC_CTRL_2_REG);
 +              val &= ~MVPP2_GMAC_DISABLE_PADDING;
 +              writel(val, port->base + MVPP2_GMAC_CTRL_2_REG);
 +      }
 +
 +      /* The port is connected to a copper PHY */
 +      val = readl(port->base + MVPP2_GMAC_CTRL_0_REG);
 +      val &= ~MVPP2_GMAC_PORT_TYPE_MASK;
 +      writel(val, port->base + MVPP2_GMAC_CTRL_0_REG);
 +
 +      val = readl(port->base + MVPP2_GMAC_AUTONEG_CONFIG);
 +      val |= MVPP2_GMAC_IN_BAND_AUTONEG_BYPASS |
 +             MVPP2_GMAC_AN_SPEED_EN | MVPP2_GMAC_FLOW_CTRL_AUTONEG |
 +             MVPP2_GMAC_AN_DUPLEX_EN;
 +      if (port->phy_interface == PHY_INTERFACE_MODE_SGMII)
 +              val |= MVPP2_GMAC_IN_BAND_AUTONEG;
 +      writel(val, port->base + MVPP2_GMAC_AUTONEG_CONFIG);
 +}
 +
 +static void mvpp2_port_mii_gmac_configure(struct mvpp2_port *port)
 +{
 +      u32 val;
 +
 +      /* Force link down */
 +      val = readl(port->base + MVPP2_GMAC_AUTONEG_CONFIG);
 +      val &= ~MVPP2_GMAC_FORCE_LINK_PASS;
 +      val |= MVPP2_GMAC_FORCE_LINK_DOWN;
 +      writel(val, port->base + MVPP2_GMAC_AUTONEG_CONFIG);
 +
 +      /* Set the GMAC in a reset state */
 +      val = readl(port->base + MVPP2_GMAC_CTRL_2_REG);
 +      val |= MVPP2_GMAC_PORT_RESET_MASK;
 +      writel(val, port->base + MVPP2_GMAC_CTRL_2_REG);
 +
 +      /* Configure the PCS and in-band AN */
 +      val = readl(port->base + MVPP2_GMAC_CTRL_2_REG);
 +      if (port->phy_interface == PHY_INTERFACE_MODE_SGMII) {
 +              val |= MVPP2_GMAC_INBAND_AN_MASK | MVPP2_GMAC_PCS_ENABLE_MASK;
 +      } else if (phy_interface_mode_is_rgmii(port->phy_interface)) {
 +              val &= ~MVPP2_GMAC_PCS_ENABLE_MASK;
 +              val |= MVPP2_GMAC_PORT_RGMII_MASK;
 +      }
 +      writel(val, port->base + MVPP2_GMAC_CTRL_2_REG);
 +
 +      mvpp2_port_mii_gmac_configure_mode(port);
 +
 +      /* Unset the GMAC reset state */
 +      val = readl(port->base + MVPP2_GMAC_CTRL_2_REG);
 +      val &= ~MVPP2_GMAC_PORT_RESET_MASK;
 +      writel(val, port->base + MVPP2_GMAC_CTRL_2_REG);
 +
 +      /* Stop forcing link down */
 +      val = readl(port->base + MVPP2_GMAC_AUTONEG_CONFIG);
 +      val &= ~MVPP2_GMAC_FORCE_LINK_DOWN;
 +      writel(val, port->base + MVPP2_GMAC_AUTONEG_CONFIG);
 +}
 +
 +static void mvpp2_port_mii_xlg_configure(struct mvpp2_port *port)
 +{
 +      u32 val;
 +
 +      if (port->gop_id != 0)
 +              return;
 +
 +      val = readl(port->base + MVPP22_XLG_CTRL0_REG);
 +      val |= MVPP22_XLG_CTRL0_RX_FLOW_CTRL_EN;
 +      writel(val, port->base + MVPP22_XLG_CTRL0_REG);
 +
 +      val = readl(port->base + MVPP22_XLG_CTRL4_REG);
 +      val &= ~MVPP22_XLG_CTRL4_MACMODSELECT_GMAC;
 +      val |= MVPP22_XLG_CTRL4_FWD_FC | MVPP22_XLG_CTRL4_FWD_PFC;
 +      writel(val, port->base + MVPP22_XLG_CTRL4_REG);
 +}
 +
  static void mvpp22_port_mii_set(struct mvpp2_port *port)
  {
        u32 val;
  
                writel(val, port->base + MVPP22_XLG_CTRL3_REG);
        }
 -
 -      val = readl(port->base + MVPP22_GMAC_CTRL_4_REG);
 -      if (port->phy_interface == PHY_INTERFACE_MODE_RGMII)
 -              val |= MVPP22_CTRL4_EXT_PIN_GMII_SEL;
 -      else
 -              val &= ~MVPP22_CTRL4_EXT_PIN_GMII_SEL;
 -      val &= ~MVPP22_CTRL4_DP_CLK_SEL;
 -      val |= MVPP22_CTRL4_SYNC_BYPASS;
 -      val |= MVPP22_CTRL4_QSGMII_BYPASS_ACTIVE;
 -      writel(val, port->base + MVPP22_GMAC_CTRL_4_REG);
  }
  
  static void mvpp2_port_mii_set(struct mvpp2_port *port)
  {
 -      u32 val;
 -
        if (port->priv->hw_version == MVPP22)
                mvpp22_port_mii_set(port);
  
 -      val = readl(port->base + MVPP2_GMAC_CTRL_2_REG);
 -
 -      switch (port->phy_interface) {
 -      case PHY_INTERFACE_MODE_SGMII:
 -              val |= MVPP2_GMAC_INBAND_AN_MASK;
 -              break;
 -      case PHY_INTERFACE_MODE_RGMII:
 -              val |= MVPP2_GMAC_PORT_RGMII_MASK;
 -      default:
 -              val &= ~MVPP2_GMAC_PCS_ENABLE_MASK;
 -      }
 -
 -      writel(val, port->base + MVPP2_GMAC_CTRL_2_REG);
 +      if (phy_interface_mode_is_rgmii(port->phy_interface) ||
 +          port->phy_interface == PHY_INTERFACE_MODE_SGMII)
 +              mvpp2_port_mii_gmac_configure(port);
 +      else if (port->phy_interface == PHY_INTERFACE_MODE_10GKR)
 +              mvpp2_port_mii_xlg_configure(port);
  }
  
  static void mvpp2_port_fc_adv_enable(struct mvpp2_port *port)
@@@ -4760,18 -4326,6 +4760,18 @@@ static inline void mvpp2_gmac_max_rx_si
        writel(val, port->base + MVPP2_GMAC_CTRL_0_REG);
  }
  
 +/* Change maximum receive size of the port */
 +static inline void mvpp2_xlg_max_rx_size_set(struct mvpp2_port *port)
 +{
 +      u32 val;
 +
 +      val = readl(port->base + MVPP22_XLG_CTRL1_REG);
 +      val &= ~MVPP22_XLG_CTRL1_FRAMESIZELIMIT_MASK;
 +      val |= ((port->pkt_size - MVPP2_MH_SIZE) / 2) <<
 +             MVPP22_XLG_CTRL1_FRAMESIZELIMIT_OFFS;
 +      writel(val, port->base + MVPP22_XLG_CTRL1_REG);
 +}
 +
  /* Set defaults to the MVPP2 port */
  static void mvpp2_defaults_set(struct mvpp2_port *port)
  {
                    MVPP2_RX_LOW_LATENCY_PKT_SIZE(256));
  
        /* Enable Rx cache snoop */
 -      for (lrxq = 0; lrxq < rxq_number; lrxq++) {
 +      for (lrxq = 0; lrxq < port->nrxqs; lrxq++) {
                queue = port->rxqs[lrxq]->id;
                val = mvpp2_read(port->priv, MVPP2_RXQ_CONFIG_REG(queue));
                val |= MVPP2_SNOOP_PKT_SIZE_MASK |
@@@ -4840,7 -4394,7 +4840,7 @@@ static void mvpp2_ingress_enable(struc
        u32 val;
        int lrxq, queue;
  
 -      for (lrxq = 0; lrxq < rxq_number; lrxq++) {
 +      for (lrxq = 0; lrxq < port->nrxqs; lrxq++) {
                queue = port->rxqs[lrxq]->id;
                val = mvpp2_read(port->priv, MVPP2_RXQ_CONFIG_REG(queue));
                val &= ~MVPP2_RXQ_DISABLE_MASK;
@@@ -4853,7 -4407,7 +4853,7 @@@ static void mvpp2_ingress_disable(struc
        u32 val;
        int lrxq, queue;
  
 -      for (lrxq = 0; lrxq < rxq_number; lrxq++) {
 +      for (lrxq = 0; lrxq < port->nrxqs; lrxq++) {
                queue = port->rxqs[lrxq]->id;
                val = mvpp2_read(port->priv, MVPP2_RXQ_CONFIG_REG(queue));
                val |= MVPP2_RXQ_DISABLE_MASK;
@@@ -4872,7 -4426,7 +4872,7 @@@ static void mvpp2_egress_enable(struct 
  
        /* Enable all initialized TXs. */
        qmap = 0;
 -      for (queue = 0; queue < txq_number; queue++) {
 +      for (queue = 0; queue < port->ntxqs; queue++) {
                struct mvpp2_tx_queue *txq = port->txqs[queue];
  
                if (txq->descs)
@@@ -5158,7 -4712,7 +5158,7 @@@ static void mvpp2_txq_sent_counter_clea
        struct mvpp2_port *port = arg;
        int queue;
  
 -      for (queue = 0; queue < txq_number; queue++) {
 +      for (queue = 0; queue < port->ntxqs; queue++) {
                int id = port->txqs[queue]->id;
  
                mvpp2_percpu_read(port->priv, smp_processor_id(),
@@@ -5199,7 -4753,7 +5199,7 @@@ static void mvpp2_txp_max_tx_size_set(s
                mvpp2_write(port->priv, MVPP2_TXP_SCHED_TOKEN_SIZE_REG, val);
        }
  
 -      for (txq = 0; txq < txq_number; txq++) {
 +      for (txq = 0; txq < port->ntxqs; txq++) {
                val = mvpp2_read(port->priv,
                                 MVPP2_TXQ_SCHED_TOKEN_SIZE_REG(txq));
                size = val & MVPP2_TXQ_TOKEN_SIZE_MAX;
@@@ -5233,23 -4787,6 +5233,23 @@@ static void mvpp2_rx_pkts_coal_set(stru
        put_cpu();
  }
  
 +/* For some reason in the LSP this is done on each CPU. Why? */
 +static void mvpp2_tx_pkts_coal_set(struct mvpp2_port *port,
 +                                 struct mvpp2_tx_queue *txq)
 +{
 +      int cpu = get_cpu();
 +      u32 val;
 +
 +      if (txq->done_pkts_coal > MVPP2_TXQ_THRESH_MASK)
 +              txq->done_pkts_coal = MVPP2_TXQ_THRESH_MASK;
 +
 +      val = (txq->done_pkts_coal << MVPP2_TXQ_THRESH_OFFSET);
 +      mvpp2_percpu_write(port->priv, cpu, MVPP2_TXQ_NUM_REG, txq->id);
 +      mvpp2_percpu_write(port->priv, cpu, MVPP2_TXQ_THRESH_REG, val);
 +
 +      put_cpu();
 +}
 +
  static u32 mvpp2_usec_to_cycles(u32 usec, unsigned long clk_hz)
  {
        u64 tmp = (u64)clk_hz * usec;
@@@ -5286,22 -4823,6 +5286,22 @@@ static void mvpp2_rx_time_coal_set(stru
        mvpp2_write(port->priv, MVPP2_ISR_RX_THRESHOLD_REG(rxq->id), val);
  }
  
 +static void mvpp2_tx_time_coal_set(struct mvpp2_port *port)
 +{
 +      unsigned long freq = port->priv->tclk;
 +      u32 val = mvpp2_usec_to_cycles(port->tx_time_coal, freq);
 +
 +      if (val > MVPP2_MAX_ISR_TX_THRESHOLD) {
 +              port->tx_time_coal =
 +                      mvpp2_cycles_to_usec(MVPP2_MAX_ISR_TX_THRESHOLD, freq);
 +
 +              /* re-evaluate to get actual register value */
 +              val = mvpp2_usec_to_cycles(port->tx_time_coal, freq);
 +      }
 +
 +      mvpp2_write(port->priv, MVPP2_ISR_TX_THRESHOLD_REG(port->id), val);
 +}
 +
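For scale: with the new default MVPP2_TXDONE_COAL_USEC of 1000 and, assuming for illustration a 250 MHz tclk, mvpp2_usec_to_cycles() gives 250000000 * 1000 / 1000000 = 250000 cycles, far below MVPP2_MAX_ISR_TX_THRESHOLD (0xfffff0, roughly 16.7 million), so no clamping occurs; only much larger timeouts are clamped and converted back through mvpp2_cycles_to_usec() so that the stored tx_time_coal reflects the value actually programmed.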
  /* Free Tx queue skbuffs */
  static void mvpp2_txq_bufs_free(struct mvpp2_port *port,
                                struct mvpp2_tx_queue *txq,
@@@ -5360,8 -4881,7 +5360,8 @@@ static void mvpp2_txq_done(struct mvpp2
                        netif_tx_wake_queue(nq);
  }
  
 -static unsigned int mvpp2_tx_done(struct mvpp2_port *port, u32 cause)
 +static unsigned int mvpp2_tx_done(struct mvpp2_port *port, u32 cause,
 +                                int cpu)
  {
        struct mvpp2_tx_queue *txq;
        struct mvpp2_txq_pcpu *txq_pcpu;
                if (!txq)
                        break;
  
 -              txq_pcpu = this_cpu_ptr(txq->pcpu);
 +              txq_pcpu = per_cpu_ptr(txq->pcpu, cpu);
  
                if (txq_pcpu->count) {
                        mvpp2_txq_done(port, txq, txq_pcpu);
  
  /* Allocate and initialize descriptors for aggr TXQ */
  static int mvpp2_aggr_txq_init(struct platform_device *pdev,
 -                             struct mvpp2_tx_queue *aggr_txq,
 -                             int desc_num, int cpu,
 +                             struct mvpp2_tx_queue *aggr_txq, int cpu,
                               struct mvpp2 *priv)
  {
        u32 txq_dma;
  
        /* Allocate memory for TX descriptors */
        aggr_txq->descs = dma_alloc_coherent(&pdev->dev,
 -                              desc_num * MVPP2_DESC_ALIGNED_SIZE,
 +                              MVPP2_AGGR_TXQ_SIZE * MVPP2_DESC_ALIGNED_SIZE,
                                &aggr_txq->descs_dma, GFP_KERNEL);
        if (!aggr_txq->descs)
                return -ENOMEM;
                        MVPP22_AGGR_TXQ_DESC_ADDR_OFFS;
  
        mvpp2_write(priv, MVPP2_AGGR_TXQ_DESC_ADDR_REG(cpu), txq_dma);
 -      mvpp2_write(priv, MVPP2_AGGR_TXQ_DESC_SIZE_REG(cpu), desc_num);
 +      mvpp2_write(priv, MVPP2_AGGR_TXQ_DESC_SIZE_REG(cpu),
 +                  MVPP2_AGGR_TXQ_SIZE);
  
        return 0;
  }
@@@ -5598,14 -5118,6 +5598,14 @@@ static int mvpp2_txq_init(struct mvpp2_
                txq_pcpu->reserved_num = 0;
                txq_pcpu->txq_put_index = 0;
                txq_pcpu->txq_get_index = 0;
 +
 +              txq_pcpu->tso_headers =
 +                      dma_alloc_coherent(port->dev->dev.parent,
 +                                         MVPP2_AGGR_TXQ_SIZE * TSO_HEADER_SIZE,
 +                                         &txq_pcpu->tso_headers_dma,
 +                                         GFP_KERNEL);
 +              if (!txq_pcpu->tso_headers)
 +                      goto cleanup;
        }
  
        return 0;
@@@ -5613,11 -5125,6 +5613,11 @@@ cleanup
        for_each_present_cpu(cpu) {
                txq_pcpu = per_cpu_ptr(txq->pcpu, cpu);
                kfree(txq_pcpu->buffs);
 +
 +              dma_free_coherent(port->dev->dev.parent,
 +                                MVPP2_AGGR_TXQ_SIZE * MVPP2_DESC_ALIGNED_SIZE,
 +                                txq_pcpu->tso_headers,
 +                                txq_pcpu->tso_headers_dma);
        }
  
        dma_free_coherent(port->dev->dev.parent,
@@@ -5637,11 -5144,6 +5637,11 @@@ static void mvpp2_txq_deinit(struct mvp
        for_each_present_cpu(cpu) {
                txq_pcpu = per_cpu_ptr(txq->pcpu, cpu);
                kfree(txq_pcpu->buffs);
 +
 +              dma_free_coherent(port->dev->dev.parent,
 +                                MVPP2_AGGR_TXQ_SIZE * MVPP2_DESC_ALIGNED_SIZE,
 +                                txq_pcpu->tso_headers,
 +                                txq_pcpu->tso_headers_dma);
        }
  
        if (txq->descs)
@@@ -5727,7 -5229,7 +5727,7 @@@ static void mvpp2_cleanup_txqs(struct m
        val |= MVPP2_TX_PORT_FLUSH_MASK(port->id);
        mvpp2_write(port->priv, MVPP2_TX_PORT_FLUSH_REG, val);
  
 -      for (queue = 0; queue < txq_number; queue++) {
 +      for (queue = 0; queue < port->ntxqs; queue++) {
                txq = port->txqs[queue];
                mvpp2_txq_clean(port, txq);
                mvpp2_txq_deinit(port, txq);
@@@ -5744,7 -5246,7 +5744,7 @@@ static void mvpp2_cleanup_rxqs(struct m
  {
        int queue;
  
 -      for (queue = 0; queue < rxq_number; queue++)
 +      for (queue = 0; queue < port->nrxqs; queue++)
                mvpp2_rxq_deinit(port, port->rxqs[queue]);
  }
  
@@@ -5753,7 -5255,7 +5753,7 @@@ static int mvpp2_setup_rxqs(struct mvpp
  {
        int queue, err;
  
 -      for (queue = 0; queue < rxq_number; queue++) {
 +      for (queue = 0; queue < port->nrxqs; queue++) {
                err = mvpp2_rxq_init(port, port->rxqs[queue]);
                if (err)
                        goto err_cleanup;
@@@ -5771,21 -5273,13 +5771,21 @@@ static int mvpp2_setup_txqs(struct mvpp
        struct mvpp2_tx_queue *txq;
        int queue, err;
  
 -      for (queue = 0; queue < txq_number; queue++) {
 +      for (queue = 0; queue < port->ntxqs; queue++) {
                txq = port->txqs[queue];
                err = mvpp2_txq_init(port, txq);
                if (err)
                        goto err_cleanup;
        }
  
 +      if (port->has_tx_irqs) {
 +              mvpp2_tx_time_coal_set(port);
 +              for (queue = 0; queue < port->ntxqs; queue++) {
 +                      txq = port->txqs[queue];
 +                      mvpp2_tx_pkts_coal_set(port, txq);
 +              }
 +      }
 +
        on_each_cpu(mvpp2_txq_sent_counter_clear, port, 1);
        return 0;
  
@@@ -5797,170 -5291,72 +5797,170 @@@ err_cleanup
  /* The callback for per-port interrupt */
  static irqreturn_t mvpp2_isr(int irq, void *dev_id)
  {
 -      struct mvpp2_port *port = (struct mvpp2_port *)dev_id;
 +      struct mvpp2_queue_vector *qv = dev_id;
  
 -      mvpp2_interrupts_disable(port);
 +      mvpp2_qvec_interrupt_disable(qv);
  
 -      napi_schedule(&port->napi);
 +      napi_schedule(&qv->napi);
  
        return IRQ_HANDLED;
  }
  
 -/* Adjust link */
 -static void mvpp2_link_event(struct net_device *dev)
 +/* Per-port interrupt for link status changes */
 +static irqreturn_t mvpp2_link_status_isr(int irq, void *dev_id)
  {
 -      struct mvpp2_port *port = netdev_priv(dev);
 -      struct phy_device *phydev = dev->phydev;
 -      int status_change = 0;
 +      struct mvpp2_port *port = (struct mvpp2_port *)dev_id;
 +      struct net_device *dev = port->dev;
 +      bool event = false, link = false;
        u32 val;
  
 -      if (phydev->link) {
 -              if ((port->speed != phydev->speed) ||
 -                  (port->duplex != phydev->duplex)) {
 -                      u32 val;
 +      mvpp22_gop_mask_irq(port);
  
 -                      val = readl(port->base + MVPP2_GMAC_AUTONEG_CONFIG);
 -                      val &= ~(MVPP2_GMAC_CONFIG_MII_SPEED |
 -                               MVPP2_GMAC_CONFIG_GMII_SPEED |
 -                               MVPP2_GMAC_CONFIG_FULL_DUPLEX |
 -                               MVPP2_GMAC_AN_SPEED_EN |
 -                               MVPP2_GMAC_AN_DUPLEX_EN);
 +      if (port->gop_id == 0 &&
 +          port->phy_interface == PHY_INTERFACE_MODE_10GKR) {
 +              val = readl(port->base + MVPP22_XLG_INT_STAT);
 +              if (val & MVPP22_XLG_INT_STAT_LINK) {
 +                      event = true;
 +                      val = readl(port->base + MVPP22_XLG_STATUS);
 +                      if (val & MVPP22_XLG_STATUS_LINK_UP)
 +                              link = true;
 +              }
 +      } else if (phy_interface_mode_is_rgmii(port->phy_interface) ||
 +                 port->phy_interface == PHY_INTERFACE_MODE_SGMII) {
 +              val = readl(port->base + MVPP22_GMAC_INT_STAT);
 +              if (val & MVPP22_GMAC_INT_STAT_LINK) {
 +                      event = true;
 +                      val = readl(port->base + MVPP2_GMAC_STATUS0);
 +                      if (val & MVPP2_GMAC_STATUS0_LINK_UP)
 +                              link = true;
 +              }
 +      }
  
 -                      if (phydev->duplex)
 -                              val |= MVPP2_GMAC_CONFIG_FULL_DUPLEX;
 +      if (!netif_running(dev) || !event)
 +              goto handled;
  
 -                      if (phydev->speed == SPEED_1000)
 -                              val |= MVPP2_GMAC_CONFIG_GMII_SPEED;
 -                      else if (phydev->speed == SPEED_100)
 -                              val |= MVPP2_GMAC_CONFIG_MII_SPEED;
 +      if (link) {
 +              mvpp2_interrupts_enable(port);
  
 -                      writel(val, port->base + MVPP2_GMAC_AUTONEG_CONFIG);
 +              mvpp2_egress_enable(port);
 +              mvpp2_ingress_enable(port);
 +              netif_carrier_on(dev);
 +              netif_tx_wake_all_queues(dev);
 +      } else {
 +              netif_tx_stop_all_queues(dev);
 +              netif_carrier_off(dev);
 +              mvpp2_ingress_disable(port);
 +              mvpp2_egress_disable(port);
  
 -                      port->duplex = phydev->duplex;
 -                      port->speed  = phydev->speed;
 -              }
 +              mvpp2_interrupts_disable(port);
        }
  
 -      if (phydev->link != port->link) {
 -              if (!phydev->link) {
 -                      port->duplex = -1;
 -                      port->speed = 0;
 +handled:
 +      mvpp22_gop_unmask_irq(port);
 +      return IRQ_HANDLED;
 +}
 +
 +static void mvpp2_gmac_set_autoneg(struct mvpp2_port *port,
 +                                 struct phy_device *phydev)
 +{
 +      u32 val;
 +
 +      if (port->phy_interface != PHY_INTERFACE_MODE_RGMII &&
 +          port->phy_interface != PHY_INTERFACE_MODE_RGMII_ID &&
 +          port->phy_interface != PHY_INTERFACE_MODE_RGMII_RXID &&
 +          port->phy_interface != PHY_INTERFACE_MODE_RGMII_TXID &&
 +          port->phy_interface != PHY_INTERFACE_MODE_SGMII)
 +              return;
 +
 +      val = readl(port->base + MVPP2_GMAC_AUTONEG_CONFIG);
 +      val &= ~(MVPP2_GMAC_CONFIG_MII_SPEED |
 +               MVPP2_GMAC_CONFIG_GMII_SPEED |
 +               MVPP2_GMAC_CONFIG_FULL_DUPLEX |
 +               MVPP2_GMAC_AN_SPEED_EN |
 +               MVPP2_GMAC_AN_DUPLEX_EN);
 +
 +      if (phydev->duplex)
 +              val |= MVPP2_GMAC_CONFIG_FULL_DUPLEX;
 +
 +      if (phydev->speed == SPEED_1000)
 +              val |= MVPP2_GMAC_CONFIG_GMII_SPEED;
 +      else if (phydev->speed == SPEED_100)
 +              val |= MVPP2_GMAC_CONFIG_MII_SPEED;
 +
 +      writel(val, port->base + MVPP2_GMAC_AUTONEG_CONFIG);
 +}
 +
 +/* Adjust link */
 +static void mvpp2_link_event(struct net_device *dev)
 +{
 +      struct mvpp2_port *port = netdev_priv(dev);
 +      struct phy_device *phydev = dev->phydev;
 +      bool link_reconfigured = false;
 +      u32 val;
 +
 +      if (phydev->link) {
 +              if (port->phy_interface != phydev->interface && port->comphy) {
 +                      /* disable current port for reconfiguration */
 +                      mvpp2_interrupts_disable(port);
 +                      netif_carrier_off(port->dev);
 +                      mvpp2_port_disable(port);
 +                      phy_power_off(port->comphy);
 +
 +                      /* comphy reconfiguration */
 +                      port->phy_interface = phydev->interface;
 +                      mvpp22_comphy_init(port);
 +
 +                      /* gop/mac reconfiguration */
 +                      mvpp22_gop_init(port);
 +                      mvpp2_port_mii_set(port);
 +
 +                      link_reconfigured = true;
                }
  
 -              port->link = phydev->link;
 -              status_change = 1;
 +              if ((port->speed != phydev->speed) ||
 +                  (port->duplex != phydev->duplex)) {
 +                      mvpp2_gmac_set_autoneg(port, phydev);
 +
 +                      port->duplex = phydev->duplex;
 +                      port->speed  = phydev->speed;
 +              }
        }
  
 -      if (status_change) {
 +      if (phydev->link != port->link || link_reconfigured) {
 +              port->link = phydev->link;
 +
                if (phydev->link) {
 -                      val = readl(port->base + MVPP2_GMAC_AUTONEG_CONFIG);
 -                      val |= (MVPP2_GMAC_FORCE_LINK_PASS |
 -                              MVPP2_GMAC_FORCE_LINK_DOWN);
 -                      writel(val, port->base + MVPP2_GMAC_AUTONEG_CONFIG);
 +                      if (port->phy_interface == PHY_INTERFACE_MODE_RGMII ||
 +                          port->phy_interface == PHY_INTERFACE_MODE_RGMII_ID ||
 +                          port->phy_interface == PHY_INTERFACE_MODE_RGMII_RXID ||
 +                          port->phy_interface == PHY_INTERFACE_MODE_RGMII_TXID ||
 +                          port->phy_interface == PHY_INTERFACE_MODE_SGMII) {
 +                              val = readl(port->base + MVPP2_GMAC_AUTONEG_CONFIG);
 +                              val |= (MVPP2_GMAC_FORCE_LINK_PASS |
 +                                      MVPP2_GMAC_FORCE_LINK_DOWN);
 +                              writel(val, port->base + MVPP2_GMAC_AUTONEG_CONFIG);
 +                      }
 +
 +                      mvpp2_interrupts_enable(port);
 +                      mvpp2_port_enable(port);
 +
                        mvpp2_egress_enable(port);
                        mvpp2_ingress_enable(port);
 +                      netif_carrier_on(dev);
 +                      netif_tx_wake_all_queues(dev);
                } else {
 +                      port->duplex = -1;
 +                      port->speed = 0;
 +
 +                      netif_tx_stop_all_queues(dev);
 +                      netif_carrier_off(dev);
                        mvpp2_ingress_disable(port);
                        mvpp2_egress_disable(port);
 +
 +                      mvpp2_port_disable(port);
 +                      mvpp2_interrupts_disable(port);
                }
 +
                phy_print_status(phydev);
        }
  }
@@@ -5989,8 -5385,8 +5989,8 @@@ static void mvpp2_tx_proc_cb(unsigned l
        port_pcpu->timer_scheduled = false;
  
        /* Process all the Tx queues */
 -      cause = (1 << txq_number) - 1;
 -      tx_todo = mvpp2_tx_done(port, cause);
 +      cause = (1 << port->ntxqs) - 1;
 +      tx_todo = mvpp2_tx_done(port, cause, smp_processor_id());
  
        /* Set the timer in case not all the packets were processed */
        if (tx_todo)
@@@ -6102,8 -5498,8 +6102,8 @@@ static u32 mvpp2_skb_tx_csum(struct mvp
  }
  
  /* Main rx processing */
 -static int mvpp2_rx(struct mvpp2_port *port, int rx_todo,
 -                  struct mvpp2_rx_queue *rxq)
 +static int mvpp2_rx(struct mvpp2_port *port, struct napi_struct *napi,
 +                  int rx_todo, struct mvpp2_rx_queue *rxq)
  {
        struct net_device *dev = port->dev;
        int rx_received;
@@@ -6181,7 -5577,7 +6181,7 @@@ err_drop_frame
                skb->protocol = eth_type_trans(skb, dev);
                mvpp2_rx_csum(port, rx_status, skb);
  
 -              napi_gro_receive(&port->napi, skb);
 +              napi_gro_receive(napi, skb);
        }
  
        if (rcvd_pkts) {
@@@ -6269,123 -5665,6 +6269,123 @@@ cleanup
        return -ENOMEM;
  }
  
 +static inline void mvpp2_tso_put_hdr(struct sk_buff *skb,
 +                                   struct net_device *dev,
 +                                   struct mvpp2_tx_queue *txq,
 +                                   struct mvpp2_tx_queue *aggr_txq,
 +                                   struct mvpp2_txq_pcpu *txq_pcpu,
 +                                   int hdr_sz)
 +{
 +      struct mvpp2_port *port = netdev_priv(dev);
 +      struct mvpp2_tx_desc *tx_desc = mvpp2_txq_next_desc_get(aggr_txq);
 +      dma_addr_t addr;
 +
 +      mvpp2_txdesc_txq_set(port, tx_desc, txq->id);
 +      mvpp2_txdesc_size_set(port, tx_desc, hdr_sz);
 +
 +      addr = txq_pcpu->tso_headers_dma +
 +             txq_pcpu->txq_put_index * TSO_HEADER_SIZE;
 +      mvpp2_txdesc_offset_set(port, tx_desc, addr & MVPP2_TX_DESC_ALIGN);
 +      mvpp2_txdesc_dma_addr_set(port, tx_desc, addr & ~MVPP2_TX_DESC_ALIGN);
 +
 +      mvpp2_txdesc_cmd_set(port, tx_desc, mvpp2_skb_tx_csum(port, skb) |
 +                                          MVPP2_TXD_F_DESC |
 +                                          MVPP2_TXD_PADDING_DISABLE);
 +      mvpp2_txq_inc_put(port, txq_pcpu, NULL, tx_desc);
 +}
 +
 +static inline int mvpp2_tso_put_data(struct sk_buff *skb,
 +                                   struct net_device *dev, struct tso_t *tso,
 +                                   struct mvpp2_tx_queue *txq,
 +                                   struct mvpp2_tx_queue *aggr_txq,
 +                                   struct mvpp2_txq_pcpu *txq_pcpu,
 +                                   int sz, bool left, bool last)
 +{
 +      struct mvpp2_port *port = netdev_priv(dev);
 +      struct mvpp2_tx_desc *tx_desc = mvpp2_txq_next_desc_get(aggr_txq);
 +      dma_addr_t buf_dma_addr;
 +
 +      mvpp2_txdesc_txq_set(port, tx_desc, txq->id);
 +      mvpp2_txdesc_size_set(port, tx_desc, sz);
 +
 +      buf_dma_addr = dma_map_single(dev->dev.parent, tso->data, sz,
 +                                    DMA_TO_DEVICE);
 +      if (unlikely(dma_mapping_error(dev->dev.parent, buf_dma_addr))) {
 +              mvpp2_txq_desc_put(txq);
 +              return -ENOMEM;
 +      }
 +
 +      mvpp2_txdesc_offset_set(port, tx_desc,
 +                              buf_dma_addr & MVPP2_TX_DESC_ALIGN);
 +      mvpp2_txdesc_dma_addr_set(port, tx_desc,
 +                                buf_dma_addr & ~MVPP2_TX_DESC_ALIGN);
 +
 +      if (!left) {
 +              mvpp2_txdesc_cmd_set(port, tx_desc, MVPP2_TXD_L_DESC);
 +              if (last) {
 +                      mvpp2_txq_inc_put(port, txq_pcpu, skb, tx_desc);
 +                      return 0;
 +              }
 +      } else {
 +              mvpp2_txdesc_cmd_set(port, tx_desc, 0);
 +      }
 +
 +      mvpp2_txq_inc_put(port, txq_pcpu, NULL, tx_desc);
 +      return 0;
 +}
 +
 +static int mvpp2_tx_tso(struct sk_buff *skb, struct net_device *dev,
 +                      struct mvpp2_tx_queue *txq,
 +                      struct mvpp2_tx_queue *aggr_txq,
 +                      struct mvpp2_txq_pcpu *txq_pcpu)
 +{
 +      struct mvpp2_port *port = netdev_priv(dev);
 +      struct tso_t tso;
 +      int hdr_sz = skb_transport_offset(skb) + tcp_hdrlen(skb);
 +      int i, len, descs = 0;
 +
 +      /* Check number of available descriptors */
 +      if (mvpp2_aggr_desc_num_check(port->priv, aggr_txq,
 +                                    tso_count_descs(skb)) ||
 +          mvpp2_txq_reserved_desc_num_proc(port->priv, txq, txq_pcpu,
 +                                           tso_count_descs(skb)))
 +              return 0;
 +
 +      tso_start(skb, &tso);
 +      len = skb->len - hdr_sz;
 +      while (len > 0) {
 +              int left = min_t(int, skb_shinfo(skb)->gso_size, len);
 +              char *hdr = txq_pcpu->tso_headers +
 +                          txq_pcpu->txq_put_index * TSO_HEADER_SIZE;
 +
 +              len -= left;
 +              descs++;
 +
 +              tso_build_hdr(skb, hdr, &tso, left, len == 0);
 +              mvpp2_tso_put_hdr(skb, dev, txq, aggr_txq, txq_pcpu, hdr_sz);
 +
 +              while (left > 0) {
 +                      int sz = min_t(int, tso.size, left);
 +                      left -= sz;
 +                      descs++;
 +
 +                      if (mvpp2_tso_put_data(skb, dev, &tso, txq, aggr_txq,
 +                                             txq_pcpu, sz, left, len == 0))
 +                              goto release;
 +                      tso_build_data(skb, &tso, sz);
 +              }
 +      }
 +
 +      return descs;
 +
 +release:
 +      for (i = descs - 1; i >= 0; i--) {
 +              struct mvpp2_tx_desc *tx_desc = txq->descs + i;
 +              tx_desc_unmap_put(port, txq, tx_desc);
 +      }
 +      return 0;
 +}
 +
  /* Main tx processing */
  static int mvpp2_tx(struct sk_buff *skb, struct net_device *dev)
  {
        txq_pcpu = this_cpu_ptr(txq->pcpu);
        aggr_txq = &port->priv->aggr_txqs[smp_processor_id()];
  
 +      if (skb_is_gso(skb)) {
 +              frags = mvpp2_tx_tso(skb, dev, txq, aggr_txq, txq_pcpu);
 +              goto out;
 +      }
        frags = skb_shinfo(skb)->nr_frags + 1;
  
        /* Check number of available descriptors */
                }
        }
  
 -      txq_pcpu->reserved_num -= frags;
 -      txq_pcpu->count += frags;
 -      aggr_txq->count += frags;
 -
 -      /* Enable transmit */
 -      wmb();
 -      mvpp2_aggr_txq_pend_desc_add(port, frags);
 -
 -      if (txq_pcpu->size - txq_pcpu->count < MAX_SKB_FRAGS + 1) {
 -              struct netdev_queue *nq = netdev_get_tx_queue(dev, txq_id);
 -
 -              netif_tx_stop_queue(nq);
 -      }
  out:
        if (frags > 0) {
                struct mvpp2_pcpu_stats *stats = this_cpu_ptr(port->stats);
 +              struct netdev_queue *nq = netdev_get_tx_queue(dev, txq_id);
 +
 +              txq_pcpu->reserved_num -= frags;
 +              txq_pcpu->count += frags;
 +              aggr_txq->count += frags;
 +
 +              /* Enable transmit */
 +              wmb();
 +              mvpp2_aggr_txq_pend_desc_add(port, frags);
 +
 +              if (txq_pcpu->size - txq_pcpu->count < MAX_SKB_FRAGS + 1)
 +                      netif_tx_stop_queue(nq);
  
                u64_stats_update_begin(&stats->syncp);
                stats->tx_packets++;
                mvpp2_txq_done(port, txq, txq_pcpu);
  
        /* Set the timer in case not all frags were processed */
 -      if (txq_pcpu->count <= frags && txq_pcpu->count > 0) {
 +      if (!port->has_tx_irqs && txq_pcpu->count <= frags &&
 +          txq_pcpu->count > 0) {
                struct mvpp2_port_pcpu *port_pcpu = this_cpu_ptr(port->pcpu);
  
                mvpp2_timer_set(port_pcpu);
@@@ -6508,14 -5783,11 +6508,14 @@@ static inline void mvpp2_cause_error(st
  
  static int mvpp2_poll(struct napi_struct *napi, int budget)
  {
 -      u32 cause_rx_tx, cause_rx, cause_misc;
 +      u32 cause_rx_tx, cause_rx, cause_tx, cause_misc;
        int rx_done = 0;
        struct mvpp2_port *port = netdev_priv(napi->dev);
 +      struct mvpp2_queue_vector *qv;
        int cpu = smp_processor_id();
  
 +      qv = container_of(napi, struct mvpp2_queue_vector, napi);
 +
        /* Rx/Tx cause register
         *
         * Bits 0-15: each bit indicates received packets on the Rx queue
         *
         * Each CPU has its own Rx/Tx cause register
         */
 -      cause_rx_tx = mvpp2_percpu_read(port->priv, cpu,
 +      cause_rx_tx = mvpp2_percpu_read(port->priv, qv->sw_thread_id,
                                        MVPP2_ISR_RX_TX_CAUSE_REG(port->id));
 -      cause_rx_tx &= ~MVPP2_CAUSE_TXQ_OCCUP_DESC_ALL_MASK;
 -      cause_misc = cause_rx_tx & MVPP2_CAUSE_MISC_SUM_MASK;
  
 +      cause_misc = cause_rx_tx & MVPP2_CAUSE_MISC_SUM_MASK;
        if (cause_misc) {
                mvpp2_cause_error(port->dev, cause_misc);
  
                                   cause_rx_tx & ~MVPP2_CAUSE_MISC_SUM_MASK);
        }
  
 -      cause_rx = cause_rx_tx & MVPP2_CAUSE_RXQ_OCCUP_DESC_ALL_MASK;
 +      cause_tx = cause_rx_tx & MVPP2_CAUSE_TXQ_OCCUP_DESC_ALL_MASK;
 +      if (cause_tx) {
 +              cause_tx >>= MVPP2_CAUSE_TXQ_OCCUP_DESC_ALL_OFFSET;
 +              mvpp2_tx_done(port, cause_tx, qv->sw_thread_id);
 +      }
  
        /* Process RX packets */
 -      cause_rx |= port->pending_cause_rx;
 +      cause_rx = cause_rx_tx & MVPP2_CAUSE_RXQ_OCCUP_DESC_ALL_MASK;
 +      cause_rx <<= qv->first_rxq;
 +      cause_rx |= qv->pending_cause_rx;
        while (cause_rx && budget > 0) {
                int count;
                struct mvpp2_rx_queue *rxq;
                if (!rxq)
                        break;
  
 -              count = mvpp2_rx(port, budget, rxq);
 +              count = mvpp2_rx(port, napi, budget, rxq);
                rx_done += count;
                budget -= count;
                if (budget > 0) {
                cause_rx = 0;
                napi_complete_done(napi, rx_done);
  
 -              mvpp2_interrupts_enable(port);
 +              mvpp2_qvec_interrupt_enable(qv);
        }
 -      port->pending_cause_rx = cause_rx;
 +      qv->pending_cause_rx = cause_rx;
        return rx_done;
  }
  
  static void mvpp2_start_dev(struct mvpp2_port *port)
  {
        struct net_device *ndev = port->dev;
 +      int i;
 +
 +      if (port->gop_id == 0 &&
 +          (port->phy_interface == PHY_INTERFACE_MODE_XAUI ||
 +           port->phy_interface == PHY_INTERFACE_MODE_10GKR))
 +              mvpp2_xlg_max_rx_size_set(port);
 +      else
 +              mvpp2_gmac_max_rx_size_set(port);
  
 -      mvpp2_gmac_max_rx_size_set(port);
        mvpp2_txp_max_tx_size_set(port);
  
 -      napi_enable(&port->napi);
 +      for (i = 0; i < port->nqvecs; i++)
 +              napi_enable(&port->qvecs[i].napi);
  
        /* Enable interrupts on all CPUs */
        mvpp2_interrupts_enable(port);
  
 +      if (port->priv->hw_version == MVPP22) {
 +              mvpp22_comphy_init(port);
 +              mvpp22_gop_init(port);
 +      }
 +
 +      mvpp2_port_mii_set(port);
        mvpp2_port_enable(port);
 -      phy_start(ndev->phydev);
 +      if (ndev->phydev)
 +              phy_start(ndev->phydev);
        netif_tx_start_all_queues(port->dev);
  }
  
  static void mvpp2_stop_dev(struct mvpp2_port *port)
  {
        struct net_device *ndev = port->dev;
 +      int i;
  
        /* Stop new packets from arriving to RXQs */
        mvpp2_ingress_disable(port);
        /* Disable interrupts on all CPUs */
        mvpp2_interrupts_disable(port);
  
 -      napi_disable(&port->napi);
 +      for (i = 0; i < port->nqvecs; i++)
 +              napi_disable(&port->qvecs[i].napi);
  
        netif_carrier_off(port->dev);
        netif_tx_stop_all_queues(port->dev);
  
        mvpp2_egress_disable(port);
        mvpp2_port_disable(port);
 -      phy_stop(ndev->phydev);
 +      if (ndev->phydev)
 +              phy_stop(ndev->phydev);
 +      phy_power_off(port->comphy);
  }
  
  static int mvpp2_check_ringparam_valid(struct net_device *dev,
@@@ -6693,10 -5941,6 +6693,10 @@@ static int mvpp2_phy_connect(struct mvp
  {
        struct phy_device *phy_dev;
  
 +      /* No PHY is attached */
 +      if (!port->phy_node)
 +              return 0;
 +
        phy_dev = of_phy_connect(port->dev, port->phy_node, mvpp2_link_event, 0,
                                 port->phy_interface);
        if (!phy_dev) {
@@@ -6717,56 -5961,12 +6717,56 @@@ static void mvpp2_phy_disconnect(struc
  {
        struct net_device *ndev = port->dev;
  
 +      if (!ndev->phydev)
 +              return;
 +
        phy_disconnect(ndev->phydev);
  }
  
 +static int mvpp2_irqs_init(struct mvpp2_port *port)
 +{
 +      int err, i;
 +
 +      for (i = 0; i < port->nqvecs; i++) {
 +              struct mvpp2_queue_vector *qv = port->qvecs + i;
 +
 +              err = request_irq(qv->irq, mvpp2_isr, 0, port->dev->name, qv);
 +              if (err)
 +                      goto err;
 +
 +              if (qv->type == MVPP2_QUEUE_VECTOR_PRIVATE)
 +                      irq_set_affinity_hint(qv->irq,
 +                                            cpumask_of(qv->sw_thread_id));
 +      }
 +
 +      return 0;
 +err:
 +      for (i = 0; i < port->nqvecs; i++) {
 +              struct mvpp2_queue_vector *qv = port->qvecs + i;
 +
 +              irq_set_affinity_hint(qv->irq, NULL);
 +              free_irq(qv->irq, qv);
 +      }
 +
 +      return err;
 +}
 +
 +static void mvpp2_irqs_deinit(struct mvpp2_port *port)
 +{
 +      int i;
 +
 +      for (i = 0; i < port->nqvecs; i++) {
 +              struct mvpp2_queue_vector *qv = port->qvecs + i;
 +
 +              irq_set_affinity_hint(qv->irq, NULL);
 +              free_irq(qv->irq, qv);
 +      }
 +}
 +
  static int mvpp2_open(struct net_device *dev)
  {
        struct mvpp2_port *port = netdev_priv(dev);
 +      struct mvpp2 *priv = port->priv;
        unsigned char mac_bcast[ETH_ALEN] = {
                        0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
        int err;
                goto err_cleanup_rxqs;
        }
  
 -      err = request_irq(port->irq, mvpp2_isr, 0, dev->name, port);
 +      err = mvpp2_irqs_init(port);
        if (err) {
 -              netdev_err(port->dev, "cannot request IRQ %d\n", port->irq);
 +              netdev_err(port->dev, "cannot init IRQs\n");
                goto err_cleanup_txqs;
        }
  
 +      if (priv->hw_version == MVPP22 && !port->phy_node && port->link_irq) {
 +              err = request_irq(port->link_irq, mvpp2_link_status_isr, 0,
 +                                dev->name, port);
 +              if (err) {
 +                      netdev_err(port->dev, "cannot request link IRQ %d\n",
 +                                 port->link_irq);
 +                      goto err_free_irq;
 +              }
 +
 +              mvpp22_gop_setup_irq(port);
 +      }
 +
        /* By default, the link is down */
        netif_carrier_off(port->dev);
  
        err = mvpp2_phy_connect(port);
        if (err < 0)
 -              goto err_free_irq;
 +              goto err_free_link_irq;
  
        /* Unmask interrupts on all CPUs */
        on_each_cpu(mvpp2_interrupts_unmask, port, 1);
 +      mvpp2_shared_interrupt_mask_unmask(port, false);
  
        mvpp2_start_dev(port);
  
        return 0;
  
 +err_free_link_irq:
 +      if (priv->hw_version == MVPP22 && !port->phy_node && port->link_irq)
 +              free_irq(port->link_irq, port);
  err_free_irq:
 -      free_irq(port->irq, port);
 +      mvpp2_irqs_deinit(port);
  err_cleanup_txqs:
        mvpp2_cleanup_txqs(port);
  err_cleanup_rxqs:
@@@ -6855,7 -6039,6 +6855,7 @@@ static int mvpp2_stop(struct net_devic
  {
        struct mvpp2_port *port = netdev_priv(dev);
        struct mvpp2_port_pcpu *port_pcpu;
 +      struct mvpp2 *priv = port->priv;
        int cpu;
  
        mvpp2_stop_dev(port);
  
        /* Mask interrupts on all CPUs */
        on_each_cpu(mvpp2_interrupts_mask, port, 1);
 +      mvpp2_shared_interrupt_mask_unmask(port, true);
  
 -      free_irq(port->irq, port);
 -      for_each_present_cpu(cpu) {
 -              port_pcpu = per_cpu_ptr(port->pcpu, cpu);
 +      if (priv->hw_version == MVPP22 && !port->phy_node && port->link_irq)
 +              free_irq(port->link_irq, port);
  
 -              hrtimer_cancel(&port_pcpu->tx_done_timer);
 -              port_pcpu->timer_scheduled = false;
 -              tasklet_kill(&port_pcpu->tx_done_tasklet);
 +      mvpp2_irqs_deinit(port);
 +      if (!port->has_tx_irqs) {
 +              for_each_present_cpu(cpu) {
 +                      port_pcpu = per_cpu_ptr(port->pcpu, cpu);
 +
 +                      hrtimer_cancel(&port_pcpu->tx_done_timer);
 +                      port_pcpu->timer_scheduled = false;
 +                      tasklet_kill(&port_pcpu->tx_done_tasklet);
 +              }
        }
        mvpp2_cleanup_rxqs(port);
        mvpp2_cleanup_txqs(port);
@@@ -7051,7 -6228,7 +7051,7 @@@ static int mvpp2_ethtool_set_coalesce(s
        struct mvpp2_port *port = netdev_priv(dev);
        int queue;
  
 -      for (queue = 0; queue < rxq_number; queue++) {
 +      for (queue = 0; queue < port->nrxqs; queue++) {
                struct mvpp2_rx_queue *rxq = port->rxqs[queue];
  
                rxq->time_coal = c->rx_coalesce_usecs;
                mvpp2_rx_time_coal_set(port, rxq);
        }
  
 -      for (queue = 0; queue < txq_number; queue++) {
 +      if (port->has_tx_irqs) {
 +              port->tx_time_coal = c->tx_coalesce_usecs;
 +              mvpp2_tx_time_coal_set(port);
 +      }
 +
 +      for (queue = 0; queue < port->ntxqs; queue++) {
                struct mvpp2_tx_queue *txq = port->txqs[queue];
  
                txq->done_pkts_coal = c->tx_max_coalesced_frames;
 +
 +              if (port->has_tx_irqs)
 +                      mvpp2_tx_pkts_coal_set(port, txq);
        }
  
        return 0;
@@@ -7196,129 -6365,6 +7196,129 @@@ static const struct ethtool_ops mvpp2_e
        .set_link_ksettings = phy_ethtool_set_link_ksettings,
  };
  
 +/* Used for PPv2.1, or PPv2.2 with the old Device Tree binding that
 + * had a single IRQ defined per-port.
 + */
 +static int mvpp2_simple_queue_vectors_init(struct mvpp2_port *port,
 +                                         struct device_node *port_node)
 +{
 +      struct mvpp2_queue_vector *v = &port->qvecs[0];
 +
 +      v->first_rxq = 0;
 +      v->nrxqs = port->nrxqs;
 +      v->type = MVPP2_QUEUE_VECTOR_SHARED;
 +      v->sw_thread_id = 0;
 +      v->sw_thread_mask = *cpumask_bits(cpu_online_mask);
 +      v->port = port;
 +      v->irq = irq_of_parse_and_map(port_node, 0);
 +      if (v->irq <= 0)
 +              return -EINVAL;
 +      netif_napi_add(port->dev, &v->napi, mvpp2_poll,
 +                     NAPI_POLL_WEIGHT);
 +
 +      port->nqvecs = 1;
 +
 +      return 0;
 +}
 +
 +static int mvpp2_multi_queue_vectors_init(struct mvpp2_port *port,
 +                                        struct device_node *port_node)
 +{
 +      struct mvpp2_queue_vector *v;
 +      int i, ret;
 +
 +      port->nqvecs = num_possible_cpus();
 +      if (queue_mode == MVPP2_QDIST_SINGLE_MODE)
 +              port->nqvecs += 1;
 +
 +      for (i = 0; i < port->nqvecs; i++) {
 +              char irqname[16];
 +
 +              v = port->qvecs + i;
 +
 +              v->port = port;
 +              v->type = MVPP2_QUEUE_VECTOR_PRIVATE;
 +              v->sw_thread_id = i;
 +              v->sw_thread_mask = BIT(i);
 +
 +              snprintf(irqname, sizeof(irqname), "tx-cpu%d", i);
 +
 +              if (queue_mode == MVPP2_QDIST_MULTI_MODE) {
 +                      v->first_rxq = i * MVPP2_DEFAULT_RXQ;
 +                      v->nrxqs = MVPP2_DEFAULT_RXQ;
 +              } else if (queue_mode == MVPP2_QDIST_SINGLE_MODE &&
 +                         i == (port->nqvecs - 1)) {
 +                      v->first_rxq = 0;
 +                      v->nrxqs = port->nrxqs;
 +                      v->type = MVPP2_QUEUE_VECTOR_SHARED;
 +                      strncpy(irqname, "rx-shared", sizeof(irqname));
 +              }
 +
 +              v->irq = of_irq_get_byname(port_node, irqname);
 +              if (v->irq <= 0) {
 +                      ret = -EINVAL;
 +                      goto err;
 +              }
 +
 +              netif_napi_add(port->dev, &v->napi, mvpp2_poll,
 +                             NAPI_POLL_WEIGHT);
 +      }
 +
 +      return 0;
 +
 +err:
 +      for (i = 0; i < port->nqvecs; i++)
 +              irq_dispose_mapping(port->qvecs[i].irq);
 +      return ret;
 +}
 +
 +static int mvpp2_queue_vectors_init(struct mvpp2_port *port,
 +                                  struct device_node *port_node)
 +{
 +      if (port->has_tx_irqs)
 +              return mvpp2_multi_queue_vectors_init(port, port_node);
 +      else
 +              return mvpp2_simple_queue_vectors_init(port, port_node);
 +}
 +
 +static void mvpp2_queue_vectors_deinit(struct mvpp2_port *port)
 +{
 +      int i;
 +
 +      for (i = 0; i < port->nqvecs; i++)
 +              irq_dispose_mapping(port->qvecs[i].irq);
 +}
 +
 +/* Configure Rx queue group interrupt for this port */
 +static void mvpp2_rx_irqs_setup(struct mvpp2_port *port)
 +{
 +      struct mvpp2 *priv = port->priv;
 +      u32 val;
 +      int i;
 +
 +      if (priv->hw_version == MVPP21) {
 +              mvpp2_write(priv, MVPP21_ISR_RXQ_GROUP_REG(port->id),
 +                          port->nrxqs);
 +              return;
 +      }
 +
 +      /* Handle the more complicated PPv2.2 case */
 +      for (i = 0; i < port->nqvecs; i++) {
 +              struct mvpp2_queue_vector *qv = port->qvecs + i;
 +
 +              if (!qv->nrxqs)
 +                      continue;
 +
 +              val = qv->sw_thread_id;
 +              val |= port->id << MVPP22_ISR_RXQ_GROUP_INDEX_GROUP_OFFSET;
 +              mvpp2_write(priv, MVPP22_ISR_RXQ_GROUP_INDEX_REG, val);
 +
 +              val = qv->first_rxq;
 +              val |= qv->nrxqs << MVPP22_ISR_RXQ_SUB_GROUP_SIZE_OFFSET;
 +              mvpp2_write(priv, MVPP22_ISR_RXQ_SUB_GROUP_CONFIG_REG, val);
 +      }
 +}
 +
  /* Initialize port HW */
  static int mvpp2_port_init(struct mvpp2_port *port)
  {
        struct mvpp2_txq_pcpu *txq_pcpu;
        int queue, cpu, err;
  
 -      if (port->first_rxq + rxq_number >
 +      /* Checks for hardware constraints */
 +      if (port->first_rxq + port->nrxqs >
            MVPP2_MAX_PORTS * priv->max_port_rxqs)
                return -EINVAL;
  
 +      if (port->nrxqs % 4 || (port->nrxqs > priv->max_port_rxqs) ||
 +          (port->ntxqs > MVPP2_MAX_TXQ))
 +              return -EINVAL;
 +
        /* Disable port */
        mvpp2_egress_disable(port);
        mvpp2_port_disable(port);
  
 -      port->txqs = devm_kcalloc(dev, txq_number, sizeof(*port->txqs),
 +      port->tx_time_coal = MVPP2_TXDONE_COAL_USEC;
 +
 +      port->txqs = devm_kcalloc(dev, port->ntxqs, sizeof(*port->txqs),
                                  GFP_KERNEL);
        if (!port->txqs)
                return -ENOMEM;
        /* Associate physical Tx queues to this port and initialize.
         * The mapping is predefined.
         */
 -      for (queue = 0; queue < txq_number; queue++) {
 +      for (queue = 0; queue < port->ntxqs; queue++) {
                int queue_phy_id = mvpp2_txq_phys(port->id, queue);
                struct mvpp2_tx_queue *txq;
  
                port->txqs[queue] = txq;
        }
  
 -      port->rxqs = devm_kcalloc(dev, rxq_number, sizeof(*port->rxqs),
 +      port->rxqs = devm_kcalloc(dev, port->nrxqs, sizeof(*port->rxqs),
                                  GFP_KERNEL);
        if (!port->rxqs) {
                err = -ENOMEM;
        }
  
        /* Allocate and initialize Rx queue for this port */
 -      for (queue = 0; queue < rxq_number; queue++) {
 +      for (queue = 0; queue < port->nrxqs; queue++) {
                struct mvpp2_rx_queue *rxq;
  
                /* Map physical Rx queue to port's logical Rx queue */
                port->rxqs[queue] = rxq;
        }
  
 -      /* Configure Rx queue group interrupt for this port */
 -      if (priv->hw_version == MVPP21) {
 -              mvpp2_write(priv, MVPP21_ISR_RXQ_GROUP_REG(port->id),
 -                          rxq_number);
 -      } else {
 -              u32 val;
 -
 -              val = (port->id << MVPP22_ISR_RXQ_GROUP_INDEX_GROUP_OFFSET);
 -              mvpp2_write(priv, MVPP22_ISR_RXQ_GROUP_INDEX_REG, val);
 -
 -              val = (rxq_number << MVPP22_ISR_RXQ_SUB_GROUP_SIZE_OFFSET);
 -              mvpp2_write(priv, MVPP22_ISR_RXQ_SUB_GROUP_CONFIG_REG, val);
 -      }
 +      mvpp2_rx_irqs_setup(port);
  
        /* Create Rx descriptor rings */
 -      for (queue = 0; queue < rxq_number; queue++) {
 +      for (queue = 0; queue < port->nrxqs; queue++) {
                struct mvpp2_rx_queue *rxq = port->rxqs[queue];
  
                rxq->size = port->rx_ring_size;
        return 0;
  
  err_free_percpu:
 -      for (queue = 0; queue < txq_number; queue++) {
 +      for (queue = 0; queue < port->ntxqs; queue++) {
                if (!port->txqs[queue])
                        continue;
                free_percpu(port->txqs[queue]->pcpu);
        return err;
  }
  
 +/* Checks if the port DT description has the TX interrupts
 + * described. On PPv2.1, there are no such interrupts. On PPv2.2,
 + * they are available, but we need to keep support for old DTs.
 + */
 +static bool mvpp2_port_has_tx_irqs(struct mvpp2 *priv,
 +                                 struct device_node *port_node)
 +{
 +      char *irqs[5] = { "rx-shared", "tx-cpu0", "tx-cpu1",
 +                        "tx-cpu2", "tx-cpu3" };
 +      int ret, i;
 +
 +      if (priv->hw_version == MVPP21)
 +              return false;
 +
 +      for (i = 0; i < 5; i++) {
 +              ret = of_property_match_string(port_node, "interrupt-names",
 +                                             irqs[i]);
 +              if (ret < 0)
 +                      return false;
 +      }
 +
 +      return true;
 +}
 +
  /* Ports initialization */
  static int mvpp2_port_probe(struct platform_device *pdev,
                            struct device_node *port_node,
                            struct mvpp2 *priv)
  {
        struct device_node *phy_node;
 +      struct phy *comphy;
        struct mvpp2_port *port;
        struct mvpp2_port_pcpu *port_pcpu;
        struct net_device *dev;
        struct resource *res;
        const char *dt_mac_addr;
        const char *mac_from;
-       char hw_mac_addr[ETH_ALEN];
+       char hw_mac_addr[ETH_ALEN] = {0};
 +      unsigned int ntxqs, nrxqs;
 +      bool has_tx_irqs;
        u32 id;
        int features;
        int phy_mode;
        int err, i, cpu;
  
 -      dev = alloc_etherdev_mqs(sizeof(*port), txq_number, rxq_number);
 +      has_tx_irqs = mvpp2_port_has_tx_irqs(priv, port_node);
 +
 +      if (!has_tx_irqs)
 +              queue_mode = MVPP2_QDIST_SINGLE_MODE;
 +
 +      ntxqs = MVPP2_MAX_TXQ;
 +      if (priv->hw_version == MVPP22 && queue_mode == MVPP2_QDIST_MULTI_MODE)
 +              nrxqs = MVPP2_DEFAULT_RXQ * num_possible_cpus();
 +      else
 +              nrxqs = MVPP2_DEFAULT_RXQ;
 +
 +      dev = alloc_etherdev_mqs(sizeof(*port), ntxqs, nrxqs);
        if (!dev)
                return -ENOMEM;
  
        phy_node = of_parse_phandle(port_node, "phy", 0);
 -      if (!phy_node) {
 -              dev_err(&pdev->dev, "missing phy\n");
 -              err = -ENODEV;
 -              goto err_free_netdev;
 -      }
 -
        phy_mode = of_get_phy_mode(port_node);
        if (phy_mode < 0) {
                dev_err(&pdev->dev, "incorrect phy mode\n");
                goto err_free_netdev;
        }
  
 +      comphy = devm_of_phy_get(&pdev->dev, port_node, NULL);
 +      if (IS_ERR(comphy)) {
 +              if (PTR_ERR(comphy) == -EPROBE_DEFER) {
 +                      err = -EPROBE_DEFER;
 +                      goto err_free_netdev;
 +              }
 +              comphy = NULL;
 +      }
 +
        if (of_property_read_u32(port_node, "port-id", &id)) {
                err = -EINVAL;
                dev_err(&pdev->dev, "missing port-id value\n");
        dev->ethtool_ops = &mvpp2_eth_tool_ops;
  
        port = netdev_priv(dev);
 +      port->dev = dev;
 +      port->ntxqs = ntxqs;
 +      port->nrxqs = nrxqs;
 +      port->priv = priv;
 +      port->has_tx_irqs = has_tx_irqs;
  
 -      port->irq = irq_of_parse_and_map(port_node, 0);
 -      if (port->irq <= 0) {
 -              err = -EINVAL;
 +      err = mvpp2_queue_vectors_init(port, port_node);
 +      if (err)
                goto err_free_netdev;
 +
 +      port->link_irq = of_irq_get_byname(port_node, "link");
 +      if (port->link_irq == -EPROBE_DEFER) {
 +              err = -EPROBE_DEFER;
 +              goto err_deinit_qvecs;
        }
 +      if (port->link_irq <= 0)
 +              /* the link irq is optional */
 +              port->link_irq = 0;
  
        if (of_property_read_bool(port_node, "marvell,loopback"))
                port->flags |= MVPP2_F_LOOPBACK;
  
 -      port->priv = priv;
        port->id = id;
        if (priv->hw_version == MVPP21)
 -              port->first_rxq = port->id * rxq_number;
 +              port->first_rxq = port->id * port->nrxqs;
        else
                port->first_rxq = port->id * priv->max_port_rxqs;
  
        port->phy_node = phy_node;
        port->phy_interface = phy_mode;
 +      port->comphy = comphy;
  
        if (priv->hw_version == MVPP21) {
                res = platform_get_resource(pdev, IORESOURCE_MEM, 2 + id);
                                         &port->gop_id)) {
                        err = -EINVAL;
                        dev_err(&pdev->dev, "missing gop-port-id value\n");
 -                      goto err_free_irq;
 +                      goto err_deinit_qvecs;
                }
  
                port->base = priv->iface_base + MVPP22_GMAC_BASE(port->gop_id);
  
        port->tx_ring_size = MVPP2_MAX_TXD;
        port->rx_ring_size = MVPP2_MAX_RXD;
 -      port->dev = dev;
        SET_NETDEV_DEV(dev, &pdev->dev);
  
        err = mvpp2_port_init(port);
                goto err_free_stats;
        }
  
 -      mvpp2_port_mii_set(port);
        mvpp2_port_periodic_xon_disable(port);
  
        if (priv->hw_version == MVPP21)
                goto err_free_txq_pcpu;
        }
  
 -      for_each_present_cpu(cpu) {
 -              port_pcpu = per_cpu_ptr(port->pcpu, cpu);
 +      if (!port->has_tx_irqs) {
 +              for_each_present_cpu(cpu) {
 +                      port_pcpu = per_cpu_ptr(port->pcpu, cpu);
  
 -              hrtimer_init(&port_pcpu->tx_done_timer, CLOCK_MONOTONIC,
 -                           HRTIMER_MODE_REL_PINNED);
 -              port_pcpu->tx_done_timer.function = mvpp2_hr_timer_cb;
 -              port_pcpu->timer_scheduled = false;
 +                      hrtimer_init(&port_pcpu->tx_done_timer, CLOCK_MONOTONIC,
 +                                   HRTIMER_MODE_REL_PINNED);
 +                      port_pcpu->tx_done_timer.function = mvpp2_hr_timer_cb;
 +                      port_pcpu->timer_scheduled = false;
  
 -              tasklet_init(&port_pcpu->tx_done_tasklet, mvpp2_tx_proc_cb,
 -                           (unsigned long)dev);
 +                      tasklet_init(&port_pcpu->tx_done_tasklet,
 +                                   mvpp2_tx_proc_cb,
 +                                   (unsigned long)dev);
 +              }
        }
  
 -      netif_napi_add(dev, &port->napi, mvpp2_poll, NAPI_POLL_WEIGHT);
 -      features = NETIF_F_SG | NETIF_F_IP_CSUM;
 +      features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO;
        dev->features = features | NETIF_F_RXCSUM;
        dev->hw_features |= features | NETIF_F_RXCSUM | NETIF_F_GRO;
        dev->vlan_features |= features;
  err_free_port_pcpu:
        free_percpu(port->pcpu);
  err_free_txq_pcpu:
 -      for (i = 0; i < txq_number; i++)
 +      for (i = 0; i < port->ntxqs; i++)
                free_percpu(port->txqs[i]->pcpu);
  err_free_stats:
        free_percpu(port->stats);
  err_free_irq:
 -      irq_dispose_mapping(port->irq);
 +      if (port->link_irq)
 +              irq_dispose_mapping(port->link_irq);
 +err_deinit_qvecs:
 +      mvpp2_queue_vectors_deinit(port);
  err_free_netdev:
        of_node_put(phy_node);
        free_netdev(dev);
@@@ -7688,11 -6683,9 +7688,11 @@@ static void mvpp2_port_remove(struct mv
        of_node_put(port->phy_node);
        free_percpu(port->pcpu);
        free_percpu(port->stats);
 -      for (i = 0; i < txq_number; i++)
 +      for (i = 0; i < port->ntxqs; i++)
                free_percpu(port->txqs[i]->pcpu);
 -      irq_dispose_mapping(port->irq);
 +      mvpp2_queue_vectors_deinit(port);
 +      if (port->link_irq)
 +              irq_dispose_mapping(port->link_irq);
        free_netdev(port->dev);
  }
  
@@@ -7807,6 -6800,13 +7807,6 @@@ static int mvpp2_init(struct platform_d
        int err, i;
        u32 val;
  
 -      /* Checks for hardware constraints */
 -      if (rxq_number % 4 || (rxq_number > priv->max_port_rxqs) ||
 -          (txq_number > MVPP2_MAX_TXQ)) {
 -              dev_err(&pdev->dev, "invalid queue size parameter\n");
 -              return -EINVAL;
 -      }
 -
        /* MBUS windows configuration */
        dram_target_info = mv_mbus_dram_info();
        if (dram_target_info)
        for_each_present_cpu(i) {
                priv->aggr_txqs[i].id = i;
                priv->aggr_txqs[i].size = MVPP2_AGGR_TXQ_SIZE;
 -              err = mvpp2_aggr_txq_init(pdev, &priv->aggr_txqs[i],
 -                                        MVPP2_AGGR_TXQ_SIZE, i, priv);
 +              err = mvpp2_aggr_txq_init(pdev, &priv->aggr_txqs[i], i, priv);
                if (err < 0)
                        return err;
        }
        /* Rx Fifo Init */
        mvpp2_rx_fifo_init(priv);
  
 -      /* Reset Rx queue group interrupt configuration */
 -      for (i = 0; i < MVPP2_MAX_PORTS; i++) {
 -              if (priv->hw_version == MVPP21) {
 -                      mvpp2_write(priv, MVPP21_ISR_RXQ_GROUP_REG(i),
 -                                  rxq_number);
 -                      continue;
 -              } else {
 -                      u32 val;
 -
 -                      val = (i << MVPP22_ISR_RXQ_GROUP_INDEX_GROUP_OFFSET);
 -                      mvpp2_write(priv, MVPP22_ISR_RXQ_GROUP_INDEX_REG, val);
 -
 -                      val = (rxq_number << MVPP22_ISR_RXQ_SUB_GROUP_SIZE_OFFSET);
 -                      mvpp2_write(priv, MVPP22_ISR_RXQ_SUB_GROUP_CONFIG_REG, val);
 -              }
 -      }
 -
        if (priv->hw_version == MVPP21)
                writel(MVPP2_EXT_GLOBAL_CTRL_DEFAULT,
                       priv->lms_base + MVPP2_MNG_EXTENDED_GLOBAL_CTRL_REG);
@@@ -7874,7 -6892,7 +7874,7 @@@ static int mvpp2_probe(struct platform_
        struct mvpp2 *priv;
        struct resource *res;
        void __iomem *base;
 -      int port_count, cpu;
 +      int port_count, i;
        int err;
  
        priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
                priv->iface_base = devm_ioremap_resource(&pdev->dev, res);
                if (IS_ERR(priv->iface_base))
                        return PTR_ERR(priv->iface_base);
 +
 +              priv->sysctrl_base =
 +                      syscon_regmap_lookup_by_phandle(pdev->dev.of_node,
 +                                                      "marvell,system-controller");
 +              if (IS_ERR(priv->sysctrl_base))
 +                      /* The system controller regmap is optional for dt
 +                       * compatibility reasons. When not provided, the
 +                       * configuration of the GoP relies on the
 +                       * firmware/bootloader.
 +                       */
 +                      priv->sysctrl_base = NULL;
        }
  
 -      for_each_present_cpu(cpu) {
 +      for (i = 0; i < MVPP2_MAX_THREADS; i++) {
                u32 addr_space_sz;
  
                addr_space_sz = (priv->hw_version == MVPP21 ?
                                 MVPP21_ADDR_SPACE_SZ : MVPP22_ADDR_SPACE_SZ);
 -              priv->cpu_base[cpu] = base + cpu * addr_space_sz;
 +              priv->swth_base[i] = base + i * addr_space_sz;
        }
  
        if (priv->hw_version == MVPP21)
index a31912415264dda786037eaa0d332a0073a478e7,2f26fb34d7416b88ee8bf7a3464e40837ab90c3c..6c2abeccfa5a380ff05375393417f99de52ec2ca
@@@ -263,6 -263,7 +263,7 @@@ struct mlx5e_dcbx 
  
        /* The only setting that cannot be read from FW */
        u8                         tc_tsa[IEEE_8021QAZ_MAX_TCS];
+       u8                         cap;
  };
  #endif
  
@@@ -620,12 -621,6 +621,12 @@@ enum mlx5e_traffic_types 
        MLX5E_NUM_INDIR_TIRS = MLX5E_TT_ANY,
  };
  
 +enum mlx5e_tunnel_types {
 +      MLX5E_TT_IPV4_GRE,
 +      MLX5E_TT_IPV6_GRE,
 +      MLX5E_NUM_TUNNEL_TT,
 +};
 +
  enum {
        MLX5E_STATE_ASYNC_EVENTS_ENABLED,
        MLX5E_STATE_OPENED,
@@@ -685,7 -680,6 +686,7 @@@ struct mlx5e_l2_table 
  struct mlx5e_ttc_table {
        struct mlx5e_flow_table  ft;
        struct mlx5_flow_handle  *rules[MLX5E_NUM_TT];
 +      struct mlx5_flow_handle  *tunnel_rules[MLX5E_NUM_TUNNEL_TT];
  };
  
  #define ARFS_HASH_SHIFT BITS_PER_BYTE
@@@ -718,7 -712,6 +719,7 @@@ enum 
        MLX5E_VLAN_FT_LEVEL = 0,
        MLX5E_L2_FT_LEVEL,
        MLX5E_TTC_FT_LEVEL,
 +      MLX5E_INNER_TTC_FT_LEVEL,
        MLX5E_ARFS_FT_LEVEL
  };
  
@@@ -744,7 -737,6 +745,7 @@@ struct mlx5e_flow_steering 
        struct mlx5e_vlan_table         vlan;
        struct mlx5e_l2_table           l2;
        struct mlx5e_ttc_table          ttc;
 +      struct mlx5e_ttc_table          inner_ttc;
        struct mlx5e_arfs_tables        arfs;
  };
  
@@@ -778,7 -770,6 +779,7 @@@ struct mlx5e_priv 
        u32                        tisn[MLX5E_MAX_NUM_TC];
        struct mlx5e_rqt           indir_rqt;
        struct mlx5e_tir           indir_tir[MLX5E_NUM_INDIR_TIRS];
 +      struct mlx5e_tir           inner_indir_tir[MLX5E_NUM_INDIR_TIRS];
        struct mlx5e_tir           direct_tir[MLX5E_MAX_NUM_CHANNELS];
        u32                        tx_rates[MLX5E_MAX_NUM_SQS];
        int                        hard_mtu;
@@@ -913,7 -904,7 +914,7 @@@ int mlx5e_redirect_rqt(struct mlx5e_pri
                       struct mlx5e_redirect_rqt_param rrp);
  void mlx5e_build_indir_tir_ctx_hash(struct mlx5e_params *params,
                                    enum mlx5e_traffic_types tt,
 -                                  void *tirc);
 +                                  void *tirc, bool inner);
  
  int mlx5e_open_locked(struct net_device *netdev);
  int mlx5e_close_locked(struct net_device *netdev);
@@@ -942,12 -933,6 +943,12 @@@ void mlx5e_set_rx_cq_mode_params(struc
  void mlx5e_set_rq_type_params(struct mlx5_core_dev *mdev,
                              struct mlx5e_params *params, u8 rq_type);
  
 +static inline bool mlx5e_tunnel_inner_ft_supported(struct mlx5_core_dev *mdev)
 +{
 +      return (MLX5_CAP_ETH(mdev, tunnel_stateless_gre) &&
 +              MLX5_CAP_FLOWTABLE_NIC_RX(mdev, ft_field_support.inner_ip_version));
 +}
 +
  static inline
  struct mlx5e_tx_wqe *mlx5e_post_nop(struct mlx5_wq_cyc *wq, u32 sqn, u16 *pc)
  {
index c6ec90e9c95bb23d51b3a19a94a790333be9659d,f5594014715bbbd1c281f95ecd871408b4949e06..6127e0d2f310cad2d56f6a54c8c3b694475615cf
@@@ -176,6 -176,7 +176,6 @@@ static bool mlx5e_query_global_pause_co
  
  int mlx5e_ethtool_get_sset_count(struct mlx5e_priv *priv, int sset)
  {
 -
        switch (sset) {
        case ETH_SS_STATS:
                return NUM_SW_COUNTERS +
@@@ -206,7 -207,7 +206,7 @@@ static int mlx5e_get_sset_count(struct 
        return mlx5e_ethtool_get_sset_count(priv, sset);
  }
  
 -static void mlx5e_fill_stats_strings(struct mlx5e_priv *priv, uint8_t *data)
 +static void mlx5e_fill_stats_strings(struct mlx5e_priv *priv, u8 *data)
  {
        int i, j, tc, prio, idx = 0;
        unsigned long pfc_combined;
                strcpy(data + (idx++) * ETH_GSTRING_LEN,
                       pport_phy_statistical_stats_desc[i].format);
  
 +      for (i = 0; i < NUM_PPORT_ETH_EXT_COUNTERS(priv); i++)
 +              strcpy(data + (idx++) * ETH_GSTRING_LEN,
 +                     pport_eth_ext_stats_desc[i].format);
 +
        for (i = 0; i < NUM_PCIE_PERF_COUNTERS(priv); i++)
                strcpy(data + (idx++) * ETH_GSTRING_LEN,
                       pcie_perf_stats_desc[i].format);
  
 +      for (i = 0; i < NUM_PCIE_PERF_COUNTERS64(priv); i++)
 +              strcpy(data + (idx++) * ETH_GSTRING_LEN,
 +                     pcie_perf_stats_desc64[i].format);
 +
 +      for (i = 0; i < NUM_PCIE_PERF_STALL_COUNTERS(priv); i++)
 +              strcpy(data + (idx++) * ETH_GSTRING_LEN,
 +                     pcie_perf_stall_stats_desc[i].format);
 +
        for (prio = 0; prio < NUM_PPORT_PRIO; prio++) {
                for (i = 0; i < NUM_PPORT_PER_PRIO_TRAFFIC_COUNTERS; i++)
                        sprintf(data + (idx++) * ETH_GSTRING_LEN,
                                        priv->channel_tc2txq[i][tc]);
  }
  
 -void mlx5e_ethtool_get_strings(struct mlx5e_priv *priv,
 -                             uint32_t stringset, uint8_t *data)
 +void mlx5e_ethtool_get_strings(struct mlx5e_priv *priv, u32 stringset, u8 *data)
  {
        int i;
  
        }
  }
  
 -static void mlx5e_get_strings(struct net_device *dev,
 -                            uint32_t stringset, uint8_t *data)
 +static void mlx5e_get_strings(struct net_device *dev, u32 stringset, u8 *data)
  {
        struct mlx5e_priv *priv = netdev_priv(dev);
  
@@@ -382,22 -373,10 +382,22 @@@ void mlx5e_ethtool_get_ethtool_stats(st
                data[idx++] = MLX5E_READ_CTR64_BE(&priv->stats.pport.phy_statistical_counters,
                                                  pport_phy_statistical_stats_desc, i);
  
 +      for (i = 0; i < NUM_PPORT_ETH_EXT_COUNTERS(priv); i++)
 +              data[idx++] = MLX5E_READ_CTR64_BE(&priv->stats.pport.eth_ext_counters,
 +                                                pport_eth_ext_stats_desc, i);
 +
        for (i = 0; i < NUM_PCIE_PERF_COUNTERS(priv); i++)
                data[idx++] = MLX5E_READ_CTR32_BE(&priv->stats.pcie.pcie_perf_counters,
                                                  pcie_perf_stats_desc, i);
  
 +      for (i = 0; i < NUM_PCIE_PERF_COUNTERS64(priv); i++)
 +              data[idx++] = MLX5E_READ_CTR64_BE(&priv->stats.pcie.pcie_perf_counters,
 +                                                pcie_perf_stats_desc64, i);
 +
 +      for (i = 0; i < NUM_PCIE_PERF_STALL_COUNTERS(priv); i++)
 +              data[idx++] = MLX5E_READ_CTR32_BE(&priv->stats.pcie.pcie_perf_counters,
 +                                                pcie_perf_stall_stats_desc, i);
 +
        for (prio = 0; prio < NUM_PPORT_PRIO; prio++) {
                for (i = 0; i < NUM_PPORT_PER_PRIO_TRAFFIC_COUNTERS; i++)
                        data[idx++] = MLX5E_READ_CTR64_BE(&priv->stats.pport.per_prio_counters[prio],
@@@ -662,8 -641,10 +662,10 @@@ int mlx5e_ethtool_set_channels(struct m
  
        new_channels.params = priv->channels.params;
        new_channels.params.num_channels = count;
-       mlx5e_build_default_indir_rqt(priv->mdev, new_channels.params.indirection_rqt,
-                                     MLX5E_INDIR_RQT_SIZE, count);
+       if (!netif_is_rxfh_configured(priv->netdev))
+               mlx5e_build_default_indir_rqt(priv->mdev,
+                                             new_channels.params.indirection_rqt,
+                                             MLX5E_INDIR_RQT_SIZE, count);
  
        if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) {
                priv->channels.params = new_channels.params;
@@@ -985,27 -966,24 +987,27 @@@ static u8 get_connector_port(u32 eth_pr
        if (connector_type && connector_type < MLX5E_CONNECTOR_TYPE_NUMBER)
                return ptys2connector_type[connector_type];
  
 -      if (eth_proto & (MLX5E_PROT_MASK(MLX5E_10GBASE_SR)
 -                       | MLX5E_PROT_MASK(MLX5E_40GBASE_SR4)
 -                       | MLX5E_PROT_MASK(MLX5E_100GBASE_SR4)
 -                       | MLX5E_PROT_MASK(MLX5E_1000BASE_CX_SGMII))) {
 -                      return PORT_FIBRE;
 +      if (eth_proto &
 +          (MLX5E_PROT_MASK(MLX5E_10GBASE_SR)   |
 +           MLX5E_PROT_MASK(MLX5E_40GBASE_SR4)  |
 +           MLX5E_PROT_MASK(MLX5E_100GBASE_SR4) |
 +           MLX5E_PROT_MASK(MLX5E_1000BASE_CX_SGMII))) {
 +              return PORT_FIBRE;
        }
  
 -      if (eth_proto & (MLX5E_PROT_MASK(MLX5E_40GBASE_CR4)
 -                       | MLX5E_PROT_MASK(MLX5E_10GBASE_CR)
 -                       | MLX5E_PROT_MASK(MLX5E_100GBASE_CR4))) {
 -                      return PORT_DA;
 +      if (eth_proto &
 +          (MLX5E_PROT_MASK(MLX5E_40GBASE_CR4) |
 +           MLX5E_PROT_MASK(MLX5E_10GBASE_CR)  |
 +           MLX5E_PROT_MASK(MLX5E_100GBASE_CR4))) {
 +              return PORT_DA;
        }
  
 -      if (eth_proto & (MLX5E_PROT_MASK(MLX5E_10GBASE_KX4)
 -                       | MLX5E_PROT_MASK(MLX5E_10GBASE_KR)
 -                       | MLX5E_PROT_MASK(MLX5E_40GBASE_KR4)
 -                       | MLX5E_PROT_MASK(MLX5E_100GBASE_KR4))) {
 -                      return PORT_NONE;
 +      if (eth_proto &
 +          (MLX5E_PROT_MASK(MLX5E_10GBASE_KX4) |
 +           MLX5E_PROT_MASK(MLX5E_10GBASE_KR)  |
 +           MLX5E_PROT_MASK(MLX5E_40GBASE_KR4) |
 +           MLX5E_PROT_MASK(MLX5E_100GBASE_KR4))) {
 +              return PORT_NONE;
        }
  
        return PORT_OTHER;
@@@ -1212,18 -1190,9 +1214,18 @@@ static void mlx5e_modify_tirs_hash(stru
  
        for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++) {
                memset(tirc, 0, ctxlen);
 -              mlx5e_build_indir_tir_ctx_hash(&priv->channels.params, tt, tirc);
 +              mlx5e_build_indir_tir_ctx_hash(&priv->channels.params, tt, tirc, false);
                mlx5_core_modify_tir(mdev, priv->indir_tir[tt].tirn, in, inlen);
        }
 +
 +      if (!mlx5e_tunnel_inner_ft_supported(priv->mdev))
 +              return;
 +
 +      for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++) {
 +              memset(tirc, 0, ctxlen);
 +              mlx5e_build_indir_tir_ctx_hash(&priv->channels.params, tt, tirc, true);
 +              mlx5_core_modify_tir(mdev, priv->inner_indir_tir[tt].tirn, in, inlen);
 +      }
  }
  
  static int mlx5e_set_rxfh(struct net_device *dev, const u32 *indir,
index 111c7523d4486c24378c697d6a87e41a617f0203,6ad7f07e7861d9c8d6922b0c115fc950e0126dfc..85841e24c65b5a2453a7c6853b33380ce4ec2bf6
@@@ -288,12 -288,6 +288,12 @@@ static void mlx5e_update_pport_counters
                mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
        }
  
 +      if (MLX5_CAP_PCAM_FEATURE(mdev, rx_buffer_fullness_counters)) {
 +              out = pstats->eth_ext_counters;
 +              MLX5_SET(ppcnt_reg, in, grp, MLX5_ETHERNET_EXTENDED_COUNTERS_GROUP);
 +              mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
 +      }
 +
        MLX5_SET(ppcnt_reg, in, grp, MLX5_PER_PRIORITY_COUNTERS_GROUP);
        for (prio = 0; prio < NUM_PPORT_PRIO; prio++) {
                out = pstats->per_prio_counters[prio];
@@@ -1975,6 -1969,7 +1975,7 @@@ static void mlx5e_build_rx_cq_param(str
        }
  
        mlx5e_build_common_cq_param(priv, param);
+       param->cq_period_mode = params->rx_cq_period_mode;
  }
  
  static void mlx5e_build_tx_cq_param(struct mlx5e_priv *priv,
@@@ -2349,10 -2344,9 +2350,10 @@@ static void mlx5e_build_tir_ctx_lro(str
  
  void mlx5e_build_indir_tir_ctx_hash(struct mlx5e_params *params,
                                    enum mlx5e_traffic_types tt,
 -                                  void *tirc)
 +                                  void *tirc, bool inner)
  {
 -      void *hfso = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer);
 +      void *hfso = inner ? MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_inner) :
 +                           MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer);
  
  #define MLX5_HASH_IP            (MLX5_HASH_FIELD_SEL_SRC_IP   |\
                                 MLX5_HASH_FIELD_SEL_DST_IP)
@@@ -2501,21 -2495,6 +2502,21 @@@ free_in
        return err;
  }
  
 +static void mlx5e_build_inner_indir_tir_ctx(struct mlx5e_priv *priv,
 +                                          enum mlx5e_traffic_types tt,
 +                                          u32 *tirc)
 +{
 +      MLX5_SET(tirc, tirc, transport_domain, priv->mdev->mlx5e_res.td.tdn);
 +
 +      mlx5e_build_tir_ctx_lro(&priv->channels.params, tirc);
 +
 +      MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT);
 +      MLX5_SET(tirc, tirc, indirect_table, priv->indir_rqt.rqtn);
 +      MLX5_SET(tirc, tirc, tunneled_offload_en, 0x1);
 +
 +      mlx5e_build_indir_tir_ctx_hash(&priv->channels.params, tt, tirc, true);
 +}
 +
  static int mlx5e_set_mtu(struct mlx5e_priv *priv, u16 mtu)
  {
        struct mlx5_core_dev *mdev = priv->mdev;
@@@ -2603,6 -2582,12 +2604,6 @@@ static void mlx5e_build_channels_tx_map
        }
  }
  
 -static bool mlx5e_is_eswitch_vport_mngr(struct mlx5_core_dev *mdev)
 -{
 -      return (MLX5_CAP_GEN(mdev, vport_group_manager) &&
 -              MLX5_CAP_GEN(mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH);
 -}
 -
  void mlx5e_activate_priv_channels(struct mlx5e_priv *priv)
  {
        int num_txqs = priv->channels.num * priv->channels.params.num_tc;
        mlx5e_activate_channels(&priv->channels);
        netif_tx_start_all_queues(priv->netdev);
  
 -      if (mlx5e_is_eswitch_vport_mngr(priv->mdev))
 +      if (MLX5_VPORT_MANAGER(priv->mdev))
                mlx5e_add_sqs_fwd_rules(priv);
  
        mlx5e_wait_channels_min_rx_wqes(&priv->channels);
@@@ -2627,7 -2612,7 +2628,7 @@@ void mlx5e_deactivate_priv_channels(str
  {
        mlx5e_redirect_rqts_to_drop(priv);
  
 -      if (mlx5e_is_eswitch_vport_mngr(priv->mdev))
 +      if (MLX5_VPORT_MANAGER(priv->mdev))
                mlx5e_remove_sqs_fwd_rules(priv);
  
        /* FIXME: This is a W/A only for tx timeout watch dog false alarm when
@@@ -2704,8 -2689,6 +2705,8 @@@ int mlx5e_open(struct net_device *netde
  
        mutex_lock(&priv->state_lock);
        err = mlx5e_open_locked(netdev);
 +      if (!err)
 +              mlx5_set_port_admin_status(priv->mdev, MLX5_PORT_UP);
        mutex_unlock(&priv->state_lock);
  
        return err;
@@@ -2740,7 -2723,6 +2741,7 @@@ int mlx5e_close(struct net_device *netd
                return -ENODEV;
  
        mutex_lock(&priv->state_lock);
 +      mlx5_set_port_admin_status(priv->mdev, MLX5_PORT_DOWN);
        err = mlx5e_close_locked(netdev);
        mutex_unlock(&priv->state_lock);
  
@@@ -2881,7 -2863,7 +2882,7 @@@ static void mlx5e_build_indir_tir_ctx(s
  
        MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT);
        MLX5_SET(tirc, tirc, indirect_table, priv->indir_rqt.rqtn);
 -      mlx5e_build_indir_tir_ctx_hash(&priv->channels.params, tt, tirc);
 +      mlx5e_build_indir_tir_ctx_hash(&priv->channels.params, tt, tirc, false);
  }
  
  static void mlx5e_build_direct_tir_ctx(struct mlx5e_priv *priv, u32 rqtn, u32 *tirc)
@@@ -2900,7 -2882,6 +2901,7 @@@ int mlx5e_create_indirect_tirs(struct m
        struct mlx5e_tir *tir;
        void *tirc;
        int inlen;
 +      int i = 0;
        int err;
        u32 *in;
        int tt;
                tirc = MLX5_ADDR_OF(create_tir_in, in, ctx);
                mlx5e_build_indir_tir_ctx(priv, tt, tirc);
                err = mlx5e_create_tir(priv->mdev, tir, in, inlen);
 -              if (err)
 -                      goto err_destroy_tirs;
 +              if (err) {
 +                      mlx5_core_warn(priv->mdev, "create indirect tirs failed, %d\n", err);
 +                      goto err_destroy_inner_tirs;
 +              }
 +      }
 +
 +      if (!mlx5e_tunnel_inner_ft_supported(priv->mdev))
 +              goto out;
 +
 +      for (i = 0; i < MLX5E_NUM_INDIR_TIRS; i++) {
 +              memset(in, 0, inlen);
 +              tir = &priv->inner_indir_tir[i];
 +              tirc = MLX5_ADDR_OF(create_tir_in, in, ctx);
 +              mlx5e_build_inner_indir_tir_ctx(priv, i, tirc);
 +              err = mlx5e_create_tir(priv->mdev, tir, in, inlen);
 +              if (err) {
 +                      mlx5_core_warn(priv->mdev, "create inner indirect tirs failed, %d\n", err);
 +                      goto err_destroy_inner_tirs;
 +              }
        }
  
 +out:
        kvfree(in);
  
        return 0;
  
 -err_destroy_tirs:
 -      mlx5_core_warn(priv->mdev, "create indirect tirs failed, %d\n", err);
 +err_destroy_inner_tirs:
 +      for (i--; i >= 0; i--)
 +              mlx5e_destroy_tir(priv->mdev, &priv->inner_indir_tir[i]);
 +
        for (tt--; tt >= 0; tt--)
                mlx5e_destroy_tir(priv->mdev, &priv->indir_tir[tt]);
  
@@@ -2999,12 -2960,6 +3000,12 @@@ void mlx5e_destroy_indirect_tirs(struc
  
        for (i = 0; i < MLX5E_NUM_INDIR_TIRS; i++)
                mlx5e_destroy_tir(priv->mdev, &priv->indir_tir[i]);
 +
 +      if (!mlx5e_tunnel_inner_ft_supported(priv->mdev))
 +              return;
 +
 +      for (i = 0; i < MLX5E_NUM_INDIR_TIRS; i++)
 +              mlx5e_destroy_tir(priv->mdev, &priv->inner_indir_tir[i]);
  }
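The TIR hunks above create the outer indirect TIRs, then (only when the device reports inner flow-table support) the inner ones, unwinding whatever was created so far in reverse order on any failure; the destroy path mirrors that split. A minimal standalone sketch of the create-with-rollback idiom, with a hypothetical make_resource()/destroy_resource() pair standing in for mlx5e_create_tir()/mlx5e_destroy_tir():

#include <stdio.h>

#define NUM_RES 4

/* Hypothetical stand-ins for the driver's create/destroy calls. */
static int make_resource(int i)
{
	if (i == 2)		/* simulate a failure partway through */
		return -1;
	printf("created %d\n", i);
	return 0;
}

static void destroy_resource(int i)
{
	printf("destroyed %d\n", i);
}

static int create_all(void)
{
	int i, err;

	for (i = 0; i < NUM_RES; i++) {
		err = make_resource(i);
		if (err)
			goto err_unwind;
	}
	return 0;

err_unwind:
	/* Undo only what was actually created, newest first. */
	for (i--; i >= 0; i--)
		destroy_resource(i);
	return err;
}

int main(void)
{
	return create_all() ? 1 : 0;
}

The "for (i--; i >= 0; i--)" step is the same shape as the err_destroy_inner_tirs unwind above: it skips the resource whose creation just failed and releases the earlier ones.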
  
  void mlx5e_destroy_direct_tirs(struct mlx5e_priv *priv)
@@@ -3044,16 -2999,12 +3045,16 @@@ static int mlx5e_modify_channels_vsd(st
        return 0;
  }
  
 -static int mlx5e_setup_tc(struct net_device *netdev, u8 tc)
 +static int mlx5e_setup_tc_mqprio(struct net_device *netdev,
 +                               struct tc_mqprio_qopt *mqprio)
  {
        struct mlx5e_priv *priv = netdev_priv(netdev);
        struct mlx5e_channels new_channels = {};
 +      u8 tc = mqprio->num_tc;
        int err = 0;
  
 +      mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS;
 +
        if (tc && tc != MLX5E_MAX_NUM_TC)
                return -EINVAL;
  
        return err;
  }
  
 -static int mlx5e_ndo_setup_tc(struct net_device *dev, u32 handle,
 -                            u32 chain_index, __be16 proto,
 -                            struct tc_to_netdev *tc)
 +#ifdef CONFIG_MLX5_ESWITCH
 +static int mlx5e_setup_tc_cls_flower(struct net_device *dev,
 +                                   struct tc_cls_flower_offload *cls_flower)
  {
        struct mlx5e_priv *priv = netdev_priv(dev);
  
 -      if (TC_H_MAJ(handle) != TC_H_MAJ(TC_H_INGRESS))
 -              goto mqprio;
 +      if (!is_classid_clsact_ingress(cls_flower->common.classid) ||
 +          cls_flower->common.chain_index)
 +              return -EOPNOTSUPP;
  
 -      if (chain_index)
 +      switch (cls_flower->command) {
 +      case TC_CLSFLOWER_REPLACE:
 +              return mlx5e_configure_flower(priv, cls_flower);
 +      case TC_CLSFLOWER_DESTROY:
 +              return mlx5e_delete_flower(priv, cls_flower);
 +      case TC_CLSFLOWER_STATS:
 +              return mlx5e_stats_flower(priv, cls_flower);
 +      default:
                return -EOPNOTSUPP;
 +      }
 +}
 +#endif
  
 -      switch (tc->type) {
 +static int mlx5e_setup_tc(struct net_device *dev, enum tc_setup_type type,
 +                        void *type_data)
 +{
 +      switch (type) {
 +#ifdef CONFIG_MLX5_ESWITCH
        case TC_SETUP_CLSFLOWER:
 -              switch (tc->cls_flower->command) {
 -              case TC_CLSFLOWER_REPLACE:
 -                      return mlx5e_configure_flower(priv, proto, tc->cls_flower);
 -              case TC_CLSFLOWER_DESTROY:
 -                      return mlx5e_delete_flower(priv, tc->cls_flower);
 -              case TC_CLSFLOWER_STATS:
 -                      return mlx5e_stats_flower(priv, tc->cls_flower);
 -              }
 +              return mlx5e_setup_tc_cls_flower(dev, type_data);
 +#endif
 +      case TC_SETUP_MQPRIO:
 +              return mlx5e_setup_tc_mqprio(dev, type_data);
        default:
                return -EOPNOTSUPP;
        }
 -
 -mqprio:
 -      if (tc->type != TC_SETUP_MQPRIO)
 -              return -EINVAL;
 -
 -      tc->mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS;
 -
 -      return mlx5e_setup_tc(dev, tc->mqprio->num_tc);
  }
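The old mlx5e_ndo_setup_tc() multiplexed mqprio and flower through one entry point keyed off the TC handle; the replacement dispatches on the enum tc_setup_type argument and hands the opaque type_data to a per-type handler. A small standalone sketch of that dispatch-on-type shape, using hypothetical setup_type values and payload structs rather than the real TC offload structures:

#include <stdio.h>
#include <errno.h>

/* Hypothetical stand-ins for enum tc_setup_type and its payloads. */
enum setup_type { SETUP_MQPRIO, SETUP_FLOWER };

struct mqprio_req { int num_tc; };
struct flower_req { int command; };

static int setup_mqprio(struct mqprio_req *req)
{
	printf("mqprio: %d traffic classes\n", req->num_tc);
	return 0;
}

static int setup_flower(struct flower_req *req)
{
	printf("flower: command %d\n", req->command);
	return 0;
}

/* One entry point, one switch; each case interprets the opaque payload. */
static int setup_tc(enum setup_type type, void *type_data)
{
	switch (type) {
	case SETUP_MQPRIO:
		return setup_mqprio(type_data);
	case SETUP_FLOWER:
		return setup_flower(type_data);
	default:
		return -EOPNOTSUPP;
	}
}

int main(void)
{
	struct mqprio_req mq = { .num_tc = 4 };

	return setup_tc(SETUP_MQPRIO, &mq);
}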
  
  static void
@@@ -3409,7 -3357,6 +3410,7 @@@ static int mlx5e_ioctl(struct net_devic
        }
  }
  
 +#ifdef CONFIG_MLX5_ESWITCH
  static int mlx5e_set_vf_mac(struct net_device *dev, int vf, u8 *mac)
  {
        struct mlx5e_priv *priv = netdev_priv(dev);
@@@ -3512,7 -3459,6 +3513,7 @@@ static int mlx5e_get_vf_stats(struct ne
        return mlx5_eswitch_get_vport_stats(mdev->priv.eswitch, vf + 1,
                                            vf_stats);
  }
 +#endif
  
  static void mlx5e_add_vxlan_port(struct net_device *netdev,
                                 struct udp_tunnel_info *ti)
@@@ -3542,13 -3488,13 +3543,13 @@@ static void mlx5e_del_vxlan_port(struc
        mlx5e_vxlan_queue_work(priv, ti->sa_family, be16_to_cpu(ti->port), 0);
  }
  
 -static netdev_features_t mlx5e_vxlan_features_check(struct mlx5e_priv *priv,
 -                                                  struct sk_buff *skb,
 -                                                  netdev_features_t features)
 +static netdev_features_t mlx5e_tunnel_features_check(struct mlx5e_priv *priv,
 +                                                   struct sk_buff *skb,
 +                                                   netdev_features_t features)
  {
        struct udphdr *udph;
 -      u16 proto;
 -      u16 port = 0;
 +      u8 proto;
 +      u16 port;
  
        switch (vlan_get_protocol(skb)) {
        case htons(ETH_P_IP):
                goto out;
        }
  
 -      if (proto == IPPROTO_UDP) {
 +      switch (proto) {
 +      case IPPROTO_GRE:
 +              return features;
 +      case IPPROTO_UDP:
                udph = udp_hdr(skb);
                port = be16_to_cpu(udph->dest);
 -      }
  
 -      /* Verify if UDP port is being offloaded by HW */
 -      if (port && mlx5e_vxlan_lookup_port(priv, port))
 -              return features;
 +              /* Verify if UDP port is being offloaded by HW */
 +              if (mlx5e_vxlan_lookup_port(priv, port))
 +                      return features;
 +      }
  
  out:
        /* Disable CSUM and GSO if the udp dport is not offloaded by HW */
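mlx5e_tunnel_features_check() above keeps the offload feature bits for GRE unconditionally, keeps them for UDP only when the destination port is a known tunnel port, and otherwise strips checksum and GSO offload for the packet. A condensed standalone sketch of that decision, with a hypothetical port_is_offloaded() in place of the driver's VXLAN port lookup:

#include <stdio.h>

#define PROTO_UDP 17
#define PROTO_GRE 47

#define FEAT_CSUM (1u << 0)
#define FEAT_GSO  (1u << 1)

/* Hypothetical lookup of UDP ports the hardware offloads. */
static int port_is_offloaded(int port)
{
	return port == 4789;	/* e.g. the VXLAN default port */
}

static unsigned int tunnel_features_check(int proto, int dport,
					  unsigned int features)
{
	switch (proto) {
	case PROTO_GRE:
		return features;
	case PROTO_UDP:
		if (port_is_offloaded(dport))
			return features;
		break;
	}
	/* Unknown tunnel: drop checksum and GSO offload for this packet. */
	return features & ~(FEAT_CSUM | FEAT_GSO);
}

int main(void)
{
	unsigned int all = FEAT_CSUM | FEAT_GSO;

	printf("gre: %#x\n", tunnel_features_check(PROTO_GRE, 0, all));
	printf("udp/4789: %#x\n", tunnel_features_check(PROTO_UDP, 4789, all));
	printf("udp/53: %#x\n", tunnel_features_check(PROTO_UDP, 53, all));
	return 0;
}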
@@@ -3595,7 -3538,7 +3596,7 @@@ static netdev_features_t mlx5e_features
        /* Validate if the tunneled packet is being offloaded by HW */
        if (skb->encapsulation &&
            (features & NETIF_F_CSUM_MASK || features & NETIF_F_GSO_MASK))
 -              return mlx5e_vxlan_features_check(priv, skb, features);
 +              return mlx5e_tunnel_features_check(priv, skb, features);
  
        return features;
  }
@@@ -3749,11 -3692,11 +3750,11 @@@ static void mlx5e_netpoll(struct net_de
  }
  #endif
  
 -static const struct net_device_ops mlx5e_netdev_ops_basic = {
 +static const struct net_device_ops mlx5e_netdev_ops = {
        .ndo_open                = mlx5e_open,
        .ndo_stop                = mlx5e_close,
        .ndo_start_xmit          = mlx5e_xmit,
 -      .ndo_setup_tc            = mlx5e_ndo_setup_tc,
 +      .ndo_setup_tc            = mlx5e_setup_tc,
        .ndo_select_queue        = mlx5e_select_queue,
        .ndo_get_stats64         = mlx5e_get_stats,
        .ndo_set_rx_mode         = mlx5e_set_rx_mode,
        .ndo_change_mtu          = mlx5e_change_mtu,
        .ndo_do_ioctl            = mlx5e_ioctl,
        .ndo_set_tx_maxrate      = mlx5e_set_tx_maxrate,
 +      .ndo_udp_tunnel_add      = mlx5e_add_vxlan_port,
 +      .ndo_udp_tunnel_del      = mlx5e_del_vxlan_port,
 +      .ndo_features_check      = mlx5e_features_check,
  #ifdef CONFIG_RFS_ACCEL
        .ndo_rx_flow_steer       = mlx5e_rx_flow_steer,
  #endif
  #ifdef CONFIG_NET_POLL_CONTROLLER
        .ndo_poll_controller     = mlx5e_netpoll,
  #endif
 -};
 -
 -static const struct net_device_ops mlx5e_netdev_ops_sriov = {
 -      .ndo_open                = mlx5e_open,
 -      .ndo_stop                = mlx5e_close,
 -      .ndo_start_xmit          = mlx5e_xmit,
 -      .ndo_setup_tc            = mlx5e_ndo_setup_tc,
 -      .ndo_select_queue        = mlx5e_select_queue,
 -      .ndo_get_stats64         = mlx5e_get_stats,
 -      .ndo_set_rx_mode         = mlx5e_set_rx_mode,
 -      .ndo_set_mac_address     = mlx5e_set_mac,
 -      .ndo_vlan_rx_add_vid     = mlx5e_vlan_rx_add_vid,
 -      .ndo_vlan_rx_kill_vid    = mlx5e_vlan_rx_kill_vid,
 -      .ndo_set_features        = mlx5e_set_features,
 -      .ndo_change_mtu          = mlx5e_change_mtu,
 -      .ndo_do_ioctl            = mlx5e_ioctl,
 -      .ndo_udp_tunnel_add      = mlx5e_add_vxlan_port,
 -      .ndo_udp_tunnel_del      = mlx5e_del_vxlan_port,
 -      .ndo_set_tx_maxrate      = mlx5e_set_tx_maxrate,
 -      .ndo_features_check      = mlx5e_features_check,
 -#ifdef CONFIG_RFS_ACCEL
 -      .ndo_rx_flow_steer       = mlx5e_rx_flow_steer,
 -#endif
 +#ifdef CONFIG_MLX5_ESWITCH
 +      /* SRIOV E-Switch NDOs */
        .ndo_set_vf_mac          = mlx5e_set_vf_mac,
        .ndo_set_vf_vlan         = mlx5e_set_vf_vlan,
        .ndo_set_vf_spoofchk     = mlx5e_set_vf_spoofchk,
        .ndo_get_vf_config       = mlx5e_get_vf_config,
        .ndo_set_vf_link_state   = mlx5e_set_vf_link_state,
        .ndo_get_vf_stats        = mlx5e_get_vf_stats,
 -      .ndo_tx_timeout          = mlx5e_tx_timeout,
 -      .ndo_xdp                 = mlx5e_xdp,
 -#ifdef CONFIG_NET_POLL_CONTROLLER
 -      .ndo_poll_controller     = mlx5e_netpoll,
 -#endif
        .ndo_has_offload_stats   = mlx5e_has_offload_stats,
        .ndo_get_offload_stats   = mlx5e_get_offload_stats,
 +#endif
  };
  
  static int mlx5e_check_required_hca_cap(struct mlx5_core_dev *mdev)
@@@ -4017,11 -3982,9 +4018,11 @@@ static void mlx5e_set_netdev_dev_addr(s
        }
  }
  
 +#if IS_ENABLED(CONFIG_NET_SWITCHDEV) && IS_ENABLED(CONFIG_MLX5_ESWITCH)
  static const struct switchdev_ops mlx5e_switchdev_ops = {
        .switchdev_port_attr_get        = mlx5e_attr_get,
  };
 +#endif
  
  static void mlx5e_build_nic_netdev(struct net_device *netdev)
  {
  
        SET_NETDEV_DEV(netdev, &mdev->pdev->dev);
  
 -      if (MLX5_CAP_GEN(mdev, vport_group_manager)) {
 -              netdev->netdev_ops = &mlx5e_netdev_ops_sriov;
 +      netdev->netdev_ops = &mlx5e_netdev_ops;
 +
  #ifdef CONFIG_MLX5_CORE_EN_DCB
 -              if (MLX5_CAP_GEN(mdev, qos))
 -                      netdev->dcbnl_ops = &mlx5e_dcbnl_ops;
 +      if (MLX5_CAP_GEN(mdev, vport_group_manager) && MLX5_CAP_GEN(mdev, qos))
 +              netdev->dcbnl_ops = &mlx5e_dcbnl_ops;
  #endif
 -      } else {
 -              netdev->netdev_ops = &mlx5e_netdev_ops_basic;
 -      }
  
        netdev->watchdog_timeo    = 15 * HZ;
  
        netdev->hw_features      |= NETIF_F_HW_VLAN_CTAG_RX;
        netdev->hw_features      |= NETIF_F_HW_VLAN_CTAG_FILTER;
  
 -      if (mlx5e_vxlan_allowed(mdev)) {
 -              netdev->hw_features     |= NETIF_F_GSO_UDP_TUNNEL |
 -                                         NETIF_F_GSO_UDP_TUNNEL_CSUM |
 -                                         NETIF_F_GSO_PARTIAL;
 +      if (mlx5e_vxlan_allowed(mdev) || MLX5_CAP_ETH(mdev, tunnel_stateless_gre)) {
 +              netdev->hw_features     |= NETIF_F_GSO_PARTIAL;
                netdev->hw_enc_features |= NETIF_F_IP_CSUM;
                netdev->hw_enc_features |= NETIF_F_IPV6_CSUM;
                netdev->hw_enc_features |= NETIF_F_TSO;
                netdev->hw_enc_features |= NETIF_F_TSO6;
 -              netdev->hw_enc_features |= NETIF_F_GSO_UDP_TUNNEL;
 -              netdev->hw_enc_features |= NETIF_F_GSO_UDP_TUNNEL_CSUM |
 -                                         NETIF_F_GSO_PARTIAL;
 +              netdev->hw_enc_features |= NETIF_F_GSO_PARTIAL;
 +      }
 +
 +      if (mlx5e_vxlan_allowed(mdev)) {
 +              netdev->hw_features     |= NETIF_F_GSO_UDP_TUNNEL |
 +                                         NETIF_F_GSO_UDP_TUNNEL_CSUM;
 +              netdev->hw_enc_features |= NETIF_F_GSO_UDP_TUNNEL |
 +                                         NETIF_F_GSO_UDP_TUNNEL_CSUM;
                netdev->gso_partial_features = NETIF_F_GSO_UDP_TUNNEL_CSUM;
        }
  
 +      if (MLX5_CAP_ETH(mdev, tunnel_stateless_gre)) {
 +              netdev->hw_features     |= NETIF_F_GSO_GRE |
 +                                         NETIF_F_GSO_GRE_CSUM;
 +              netdev->hw_enc_features |= NETIF_F_GSO_GRE |
 +                                         NETIF_F_GSO_GRE_CSUM;
 +              netdev->gso_partial_features |= NETIF_F_GSO_GRE |
 +                                              NETIF_F_GSO_GRE_CSUM;
 +      }
 +
        mlx5_query_port_fcs(mdev, &fcs_supported, &fcs_enabled);
  
        if (fcs_supported)
  
        mlx5e_set_netdev_dev_addr(netdev);
  
 -#ifdef CONFIG_NET_SWITCHDEV
 -      if (MLX5_CAP_GEN(mdev, vport_group_manager))
 +#if IS_ENABLED(CONFIG_NET_SWITCHDEV) && IS_ENABLED(CONFIG_MLX5_ESWITCH)
 +      if (MLX5_VPORT_MANAGER(mdev))
                netdev->switchdev_ops = &mlx5e_switchdev_ops;
  #endif
  
@@@ -4254,10 -4208,6 +4255,10 @@@ static void mlx5e_nic_enable(struct mlx
  
        mlx5e_init_l2_addr(priv);
  
 +      /* Marking the link as currently not needed by the Driver */
 +      if (!netif_running(netdev))
 +              mlx5_set_port_admin_status(mdev, MLX5_PORT_DOWN);
 +
        /* MTU range: 68 - hw-specific max */
        netdev->min_mtu = ETH_MIN_MTU;
        mlx5_query_port_max_mtu(priv->mdev, &max_mtu, 1);
  
        mlx5e_enable_async_events(priv);
  
 -      if (MLX5_CAP_GEN(mdev, vport_group_manager))
 +      if (MLX5_VPORT_MANAGER(priv->mdev))
                mlx5e_register_vport_reps(priv);
  
        if (netdev->reg_state != NETREG_REGISTERED)
@@@ -4302,7 -4252,7 +4303,7 @@@ static void mlx5e_nic_disable(struct ml
  
        queue_work(priv->wq, &priv->set_rx_mode_work);
  
 -      if (MLX5_CAP_GEN(mdev, vport_group_manager))
 +      if (MLX5_VPORT_MANAGER(priv->mdev))
                mlx5e_unregister_vport_reps(priv);
  
        mlx5e_disable_async_events(priv);
@@@ -4475,29 -4425,32 +4476,29 @@@ static void mlx5e_detach(struct mlx5_co
  
  static void *mlx5e_add(struct mlx5_core_dev *mdev)
  {
 -      struct mlx5_eswitch *esw = mdev->priv.eswitch;
 -      int total_vfs = MLX5_TOTAL_VPORTS(mdev);
 -      struct mlx5e_rep_priv *rpriv = NULL;
 +      struct net_device *netdev;
 +      void *rpriv = NULL;
        void *priv;
 -      int vport;
        int err;
 -      struct net_device *netdev;
  
        err = mlx5e_check_required_hca_cap(mdev);
        if (err)
                return NULL;
  
 -      if (MLX5_CAP_GEN(mdev, vport_group_manager)) {
 -              rpriv = kzalloc(sizeof(*rpriv), GFP_KERNEL);
 +#ifdef CONFIG_MLX5_ESWITCH
 +      if (MLX5_VPORT_MANAGER(mdev)) {
 +              rpriv = mlx5e_alloc_nic_rep_priv(mdev);
                if (!rpriv) {
 -                      mlx5_core_warn(mdev,
 -                                     "Not creating net device, Failed to alloc rep priv data\n");
 +                      mlx5_core_warn(mdev, "Failed to alloc NIC rep priv data\n");
                        return NULL;
                }
 -              rpriv->rep = &esw->offloads.vport_reps[0];
        }
 +#endif
  
        netdev = mlx5e_create_netdev(mdev, &mlx5e_nic_profile, rpriv);
        if (!netdev) {
                mlx5_core_err(mdev, "mlx5e_create_netdev failed\n");
 -              goto err_unregister_reps;
 +              goto err_free_rpriv;
        }
  
        priv = netdev_priv(netdev);
  
  err_detach:
        mlx5e_detach(mdev, priv);
 -
  err_destroy_netdev:
        mlx5e_destroy_netdev(priv);
 -
 -err_unregister_reps:
 -      for (vport = 1; vport < total_vfs; vport++)
 -              mlx5_eswitch_unregister_vport_rep(esw, vport);
 -
 +err_free_rpriv:
        kfree(rpriv);
        return NULL;
  }
index 55a6786d3c4ccb7ecb44dce4eaeab1b66f894456,7344433259fca32fe288ba4c63dff6c64ab2d126..be8197a75a634340a07575ac90af8b61465b03a5
@@@ -222,13 -222,13 +222,13 @@@ static inline int mlx5e_page_alloc_mapp
        if (unlikely(!page))
                return -ENOMEM;
  
-       dma_info->page = page;
        dma_info->addr = dma_map_page(rq->pdev, page, 0,
                                      RQ_PAGE_SIZE(rq), rq->buff.map_dir);
        if (unlikely(dma_mapping_error(rq->pdev, dma_info->addr))) {
                put_page(page);
                return -ENOMEM;
        }
+       dma_info->page = page;
  
        return 0;
  }
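The one-line move above makes mlx5e_page_alloc_mapped() store dma_info->page only after dma_mapping_error() has confirmed the mapping, so a failed mapping never leaves a stale page pointer behind. A short standalone sketch of that validate-before-publish ordering, with hypothetical types and a fake map_page() in place of the DMA API:

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-ins for the driver's page/DMA bookkeeping. */
struct dma_info { void *page; unsigned long addr; };

static unsigned long map_page(void *page)
{
	/* Pretend the mapping can fail by returning an invalid cookie. */
	return page ? (unsigned long)page : 0;
}

static int page_alloc_mapped(struct dma_info *di)
{
	void *page = malloc(4096);
	unsigned long addr;

	if (!page)
		return -1;

	addr = map_page(page);
	if (!addr) {
		free(page);
		return -1;	/* di is left untouched on failure */
	}

	/* Publish the page only once the mapping is known to be good. */
	di->page = page;
	di->addr = addr;
	return 0;
}

int main(void)
{
	struct dma_info di = { 0 };

	if (page_alloc_mapped(&di))
		return 1;
	printf("mapped at %#lx\n", di.addr);
	free(di.page);
	return 0;
}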
@@@ -509,8 -509,8 +509,8 @@@ static void mlx5e_lro_update_hdr(struc
        u16 tot_len;
  
        u8 l4_hdr_type = get_cqe_l4_hdr_type(cqe);
 -      int tcp_ack = ((CQE_L4_HDR_TYPE_TCP_ACK_NO_DATA  == l4_hdr_type) ||
 -                     (CQE_L4_HDR_TYPE_TCP_ACK_AND_DATA == l4_hdr_type));
 +      int tcp_ack = ((l4_hdr_type == CQE_L4_HDR_TYPE_TCP_ACK_NO_DATA) ||
 +                     (l4_hdr_type == CQE_L4_HDR_TYPE_TCP_ACK_AND_DATA));
  
        skb->mac_len = ETH_HLEN;
        proto = __vlan_get_protocol(skb, eth->h_proto, &network_depth);
@@@ -857,7 -857,6 +857,7 @@@ wq_ll_pop
                       &wqe->next.next_wqe_index);
  }
  
 +#ifdef CONFIG_MLX5_ESWITCH
  void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
  {
        struct net_device *netdev = rq->netdev;
@@@ -902,7 -901,6 +902,7 @@@ wq_ll_pop
        mlx5_wq_ll_pop(&rq->wq, wqe_counter_be,
                       &wqe->next.next_wqe_index);
  }
 +#endif
  
  static inline void mlx5e_mpwqe_fill_rx_skb(struct mlx5e_rq *rq,
                                           struct mlx5_cqe64 *cqe,
index 3b10d3df7627a9fdf9d00953e83cd1c32b91d6a5,7f282e8f4e7fee460e1140dacc8f86ece6b1ea9e..da503e6411da07374f3250d450fd131c64c63c88
@@@ -1326,7 -1326,7 +1326,7 @@@ static int parse_tc_nic_actions(struct 
        LIST_HEAD(actions);
        int err;
  
 -      if (tc_no_actions(exts))
 +      if (!tcf_exts_has_actions(exts))
                return -EINVAL;
  
        attr->flow_tag = MLX5_FS_DEFAULT_FLOW_TAG;
@@@ -1443,12 -1443,10 +1443,10 @@@ static int mlx5e_route_lookup_ipv6(stru
        struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
        int ret;
  
-       dst = ip6_route_output(dev_net(mirred_dev), NULL, fl6);
-       ret = dst->error;
-       if (ret) {
-               dst_release(dst);
+       ret = ipv6_stub->ipv6_dst_lookup(dev_net(mirred_dev), NULL, &dst,
+                                        fl6);
+       if (ret < 0)
                return ret;
-       }
  
        *out_ttl = ip6_dst_hoplimit(dst);
  
@@@ -1839,7 -1837,7 +1837,7 @@@ static int parse_tc_fdb_actions(struct 
        bool encap = false;
        int err = 0;
  
 -      if (tc_no_actions(exts))
 +      if (!tcf_exts_has_actions(exts))
                return -EINVAL;
  
        memset(attr, 0, sizeof(*attr));
        return err;
  }
  
 -int mlx5e_configure_flower(struct mlx5e_priv *priv, __be16 protocol,
 +int mlx5e_configure_flower(struct mlx5e_priv *priv,
                           struct tc_cls_flower_offload *f)
  {
        struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
index e7c186b585796d1726d242331e1e92f6c6e4a195,5bc0593bd76e706e2b5c38a0a767af0324e67685..d9fd8570b07c8344bb1dc04934392e20f7628245
@@@ -433,8 -433,6 +433,8 @@@ static int esw_create_offloads_fast_fdb
        struct mlx5_flow_table *fdb = NULL;
        int esw_size, err = 0;
        u32 flags = 0;
 +      u32 max_flow_counter = (MLX5_CAP_GEN(dev, max_flow_counter_31_16) << 16) |
 +                              MLX5_CAP_GEN(dev, max_flow_counter_15_0);
  
        root_ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_FDB);
        if (!root_ns) {
  
        esw_debug(dev, "Create offloads FDB table, min (max esw size(2^%d), max counters(%d)*groups(%d))\n",
                  MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size),
 -                MLX5_CAP_GEN(dev, max_flow_counter), ESW_OFFLOADS_NUM_GROUPS);
 +                max_flow_counter, ESW_OFFLOADS_NUM_GROUPS);
  
 -      esw_size = min_t(int, MLX5_CAP_GEN(dev, max_flow_counter) * ESW_OFFLOADS_NUM_GROUPS,
 +      esw_size = min_t(int, max_flow_counter * ESW_OFFLOADS_NUM_GROUPS,
                         1 << MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size));
  
        if (esw->offloads.encap != DEVLINK_ESWITCH_ENCAP_MODE_NONE)
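In the hunk above the single max_flow_counter capability has been split into 16-bit high and low halves, and the driver reassembles a 32-bit value as (hi << 16) | lo before sizing the offloads FDB table. A tiny standalone illustration of that reassembly, with hypothetical field readers in place of MLX5_CAP_GEN():

#include <stdio.h>
#include <stdint.h>

/* Hypothetical readers for the two 16-bit capability halves. */
static uint32_t cap_max_flow_counter_31_16(void) { return 0x0001; }
static uint32_t cap_max_flow_counter_15_0(void)  { return 0x8000; }

int main(void)
{
	uint32_t max_flow_counter =
		(cap_max_flow_counter_31_16() << 16) |
		cap_max_flow_counter_15_0();

	/* (0x0001 << 16) | 0x8000 == 0x18000 == 98304 counters. */
	printf("max_flow_counter = %u\n", max_flow_counter);
	return 0;
}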
@@@ -817,7 -815,7 +817,7 @@@ void esw_offloads_cleanup(struct mlx5_e
        struct mlx5_eswitch_rep *rep;
        int vport;
  
-       for (vport = 0; vport < nvports; vport++) {
+       for (vport = nvports - 1; vport >= 0; vport--) {
                rep = &esw->offloads.vport_reps[vport];
                if (!rep->valid)
                        continue;
index 514c22d21729869162686c014bcf002f26af15c6,16885827367bfd9153f96faaf7c2afedc6c6a5b0..bd84bdf56a83fcb03a5a7a1d0fe7642f8b070ea1
@@@ -53,8 -53,9 +53,8 @@@
  #include <net/devlink.h>
  #include "mlx5_core.h"
  #include "fs_core.h"
 -#ifdef CONFIG_MLX5_CORE_EN
 +#include "lib/mpfs.h"
  #include "eswitch.h"
 -#endif
  #include "lib/mlx5.h"
  #include "fpga/core.h"
  #include "accel/ipsec.h"
@@@ -836,6 -837,7 +836,6 @@@ static int mlx5_core_set_issi(struct ml
        return -EOPNOTSUPP;
  }
  
 -
  static int mlx5_pci_init(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
  {
        struct pci_dev *pdev = dev->pdev;
@@@ -944,17 -946,13 +944,17 @@@ static int mlx5_init_once(struct mlx5_c
                goto err_tables_cleanup;
        }
  
 -#ifdef CONFIG_MLX5_CORE_EN
 +      err = mlx5_mpfs_init(dev);
 +      if (err) {
 +              dev_err(&pdev->dev, "Failed to init l2 table %d\n", err);
 +              goto err_rl_cleanup;
 +      }
 +
        err = mlx5_eswitch_init(dev);
        if (err) {
                dev_err(&pdev->dev, "Failed to init eswitch %d\n", err);
 -              goto err_rl_cleanup;
 +              goto err_mpfs_cleanup;
        }
 -#endif
  
        err = mlx5_sriov_init(dev);
        if (err) {
  err_sriov_cleanup:
        mlx5_sriov_cleanup(dev);
  err_eswitch_cleanup:
 -#ifdef CONFIG_MLX5_CORE_EN
        mlx5_eswitch_cleanup(dev->priv.eswitch);
 -
 +err_mpfs_cleanup:
 +      mlx5_mpfs_cleanup(dev);
  err_rl_cleanup:
 -#endif
        mlx5_cleanup_rl_table(dev);
 -
  err_tables_cleanup:
        mlx5_cleanup_mkey_table(dev);
        mlx5_cleanup_srq_table(dev);
@@@ -995,8 -995,9 +995,8 @@@ static void mlx5_cleanup_once(struct ml
  {
        mlx5_fpga_cleanup(dev);
        mlx5_sriov_cleanup(dev);
 -#ifdef CONFIG_MLX5_CORE_EN
        mlx5_eswitch_cleanup(dev->priv.eswitch);
 -#endif
 +      mlx5_mpfs_cleanup(dev);
        mlx5_cleanup_rl_table(dev);
        mlx5_cleanup_reserved_gids(dev);
        mlx5_cleanup_mkey_table(dev);
@@@ -1154,6 -1155,10 +1154,6 @@@ static int mlx5_load_one(struct mlx5_co
                goto err_fs;
        }
  
 -#ifdef CONFIG_MLX5_CORE_EN
 -      mlx5_eswitch_attach(dev->priv.eswitch);
 -#endif
 -
        err = mlx5_sriov_attach(dev);
        if (err) {
                dev_err(&pdev->dev, "sriov init failed %d\n", err);
                }
        }
  
-       clear_bit(MLX5_INTERFACE_STATE_DOWN, &dev->intf_state);
        set_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state);
  out:
        mutex_unlock(&dev->intf_state_mutex);
@@@ -1197,6 -1201,9 +1196,6 @@@ err_fpga_start
        mlx5_sriov_detach(dev);
  
  err_sriov:
 -#ifdef CONFIG_MLX5_CORE_EN
 -      mlx5_eswitch_detach(dev->priv.eswitch);
 -#endif
        mlx5_cleanup_fs(dev);
  
  err_fs:
@@@ -1253,7 -1260,7 +1252,7 @@@ static int mlx5_unload_one(struct mlx5_
                mlx5_drain_health_recovery(dev);
  
        mutex_lock(&dev->intf_state_mutex);
-       if (test_bit(MLX5_INTERFACE_STATE_DOWN, &dev->intf_state)) {
+       if (!test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state)) {
                dev_warn(&dev->pdev->dev, "%s: interface is down, NOP\n",
                         __func__);
                if (cleanup)
        }
  
        clear_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state);
-       set_bit(MLX5_INTERFACE_STATE_DOWN, &dev->intf_state);
  
        if (mlx5_device_registered(dev))
                mlx5_detach_device(dev);
        mlx5_fpga_device_stop(dev);
  
        mlx5_sriov_detach(dev);
 -#ifdef CONFIG_MLX5_CORE_EN
 -      mlx5_eswitch_detach(dev->priv.eswitch);
 -#endif
        mlx5_cleanup_fs(dev);
        mlx5_irq_clear_affinity_hints(dev);
        free_comp_eqs(dev);
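The load/unload hunks above drop the separate DOWN flag and key everything off a single UP bit: set on a successful load, cleared on unload, and tested with !test_bit(...UP...) to short-circuit when the device is already down. A minimal sketch of single-bit state tracking, using plain bit operations instead of the kernel's test_bit()/set_bit()/clear_bit() helpers:

#include <stdio.h>

#define STATE_UP (1u << 0)

static unsigned int intf_state;

static int load_one(void)
{
	/* ... bring the device up ... */
	intf_state |= STATE_UP;
	return 0;
}

static int unload_one(void)
{
	if (!(intf_state & STATE_UP)) {
		printf("interface is down, NOP\n");
		return 0;
	}
	intf_state &= ~STATE_UP;
	/* ... tear the device down ... */
	return 0;
}

int main(void)
{
	load_one();
	unload_one();
	unload_one();	/* second call hits the "already down" path */
	return 0;
}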
@@@ -1302,7 -1311,7 +1300,7 @@@ struct mlx5_core_event_handler 
  };
  
  static const struct devlink_ops mlx5_devlink_ops = {
 -#ifdef CONFIG_MLX5_CORE_EN
 +#ifdef CONFIG_MLX5_ESWITCH
        .eswitch_mode_set = mlx5_devlink_eswitch_mode_set,
        .eswitch_mode_get = mlx5_devlink_eswitch_mode_get,
        .eswitch_inline_mode_set = mlx5_devlink_eswitch_inline_mode_set,
@@@ -1342,9 -1351,6 +1340,9 @@@ static int init_one(struct pci_dev *pde
        mutex_init(&dev->pci_status_mutex);
        mutex_init(&dev->intf_state_mutex);
  
 +      INIT_LIST_HEAD(&priv->waiting_events_list);
 +      priv->is_accum_events = false;
 +
  #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
        err = init_srcu_struct(&priv->pfault_srcu);
        if (err) {
@@@ -1399,6 -1405,7 +1397,6 @@@ clean_srcu
        cleanup_srcu_struct(&priv->pfault_srcu);
  clean_dev:
  #endif
 -      pci_set_drvdata(pdev, NULL);
        devlink_free(devlink);
  
        return err;
@@@ -1425,6 -1432,7 +1423,6 @@@ static void remove_one(struct pci_dev *
  #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
        cleanup_srcu_struct(&priv->pfault_srcu);
  #endif
 -      pci_set_drvdata(pdev, NULL);
        devlink_free(devlink);
  }
  
@@@ -1555,8 -1563,6 +1553,6 @@@ static void shutdown(struct pci_dev *pd
        int err;
  
        dev_info(&pdev->dev, "Shutdown was called\n");
-       /* Notify mlx5 clients that the kernel is being shut down */
-       set_bit(MLX5_INTERFACE_STATE_SHUTDOWN, &dev->intf_state);
        err = mlx5_try_fast_unload(dev);
        if (err)
                mlx5_unload_one(dev, priv, false);
index 17fc98881642d10e0cf1f03dec9d0eac3111faa7,c6a3e61b53bdbf0c32212a6415f4a1d2da769bf8..992cbfa1f2bcd82df2c261c426dcc56283da0670
@@@ -58,7 -58,6 +58,7 @@@
  #include <net/tc_act/tc_mirred.h>
  #include <net/netevent.h>
  #include <net/tc_act/tc_sample.h>
 +#include <net/addrconf.h>
  
  #include "spectrum.h"
  #include "pci.h"
@@@ -382,14 -381,12 +382,14 @@@ int mlxsw_sp_flow_counter_get(struct ml
        int err;
  
        mlxsw_reg_mgpc_pack(mgpc_pl, counter_index, MLXSW_REG_MGPC_OPCODE_NOP,
 -                          MLXSW_REG_MGPC_COUNTER_SET_TYPE_PACKETS_BYTES);
 +                          MLXSW_REG_FLOW_COUNTER_SET_TYPE_PACKETS_BYTES);
        err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(mgpc), mgpc_pl);
        if (err)
                return err;
 -      *packets = mlxsw_reg_mgpc_packet_counter_get(mgpc_pl);
 -      *bytes = mlxsw_reg_mgpc_byte_counter_get(mgpc_pl);
 +      if (packets)
 +              *packets = mlxsw_reg_mgpc_packet_counter_get(mgpc_pl);
 +      if (bytes)
 +              *bytes = mlxsw_reg_mgpc_byte_counter_get(mgpc_pl);
        return 0;
  }
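mlxsw_sp_flow_counter_get() above now treats its packets/bytes output pointers as optional and writes back only the counters the caller asked for. A short standalone sketch of the optional-out-parameter convention, with a hypothetical read_counters() in place of the MGPC register query:

#include <stdio.h>
#include <stdint.h>

/* Hypothetical counter read; a NULL output pointer means "not interested". */
static int read_counters(uint64_t *packets, uint64_t *bytes)
{
	uint64_t hw_packets = 1500, hw_bytes = 96000;

	if (packets)
		*packets = hw_packets;
	if (bytes)
		*bytes = hw_bytes;
	return 0;
}

int main(void)
{
	uint64_t pkts;

	/* Caller that only cares about the packet count. */
	if (read_counters(&pkts, NULL))
		return 1;
	printf("packets: %llu\n", (unsigned long long)pkts);
	return 0;
}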
  
@@@ -399,7 -396,7 +399,7 @@@ static int mlxsw_sp_flow_counter_clear(
        char mgpc_pl[MLXSW_REG_MGPC_LEN];
  
        mlxsw_reg_mgpc_pack(mgpc_pl, counter_index, MLXSW_REG_MGPC_OPCODE_CLEAR,
 -                          MLXSW_REG_MGPC_COUNTER_SET_TYPE_PACKETS_BYTES);
 +                          MLXSW_REG_FLOW_COUNTER_SET_TYPE_PACKETS_BYTES);
        return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(mgpc), mgpc_pl);
  }
  
@@@ -1619,16 -1616,16 +1619,16 @@@ mlxsw_sp_port_del_cls_matchall_sample(s
  }
  
  static int mlxsw_sp_port_add_cls_matchall(struct mlxsw_sp_port *mlxsw_sp_port,
 -                                        __be16 protocol,
 -                                        struct tc_cls_matchall_offload *cls,
 +                                        struct tc_cls_matchall_offload *f,
                                          bool ingress)
  {
        struct mlxsw_sp_port_mall_tc_entry *mall_tc_entry;
 +      __be16 protocol = f->common.protocol;
        const struct tc_action *a;
        LIST_HEAD(actions);
        int err;
  
 -      if (!tc_single_action(cls->exts)) {
 +      if (!tcf_exts_has_one_action(f->exts)) {
                netdev_err(mlxsw_sp_port->dev, "only singular actions are supported\n");
                return -EOPNOTSUPP;
        }
        mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL);
        if (!mall_tc_entry)
                return -ENOMEM;
 -      mall_tc_entry->cookie = cls->cookie;
 +      mall_tc_entry->cookie = f->cookie;
  
 -      tcf_exts_to_list(cls->exts, &actions);
 +      tcf_exts_to_list(f->exts, &actions);
        a = list_first_entry(&actions, struct tc_action, list);
  
        if (is_tcf_mirred_egress_mirror(a) && protocol == htons(ETH_P_ALL)) {
                                                            mirror, a, ingress);
        } else if (is_tcf_sample(a) && protocol == htons(ETH_P_ALL)) {
                mall_tc_entry->type = MLXSW_SP_PORT_MALL_SAMPLE;
 -              err = mlxsw_sp_port_add_cls_matchall_sample(mlxsw_sp_port, cls,
 +              err = mlxsw_sp_port_add_cls_matchall_sample(mlxsw_sp_port, f,
                                                            a, ingress);
        } else {
                err = -EOPNOTSUPP;
@@@ -1668,12 -1665,12 +1668,12 @@@ err_add_action
  }
  
  static void mlxsw_sp_port_del_cls_matchall(struct mlxsw_sp_port *mlxsw_sp_port,
 -                                         struct tc_cls_matchall_offload *cls)
 +                                         struct tc_cls_matchall_offload *f)
  {
        struct mlxsw_sp_port_mall_tc_entry *mall_tc_entry;
  
        mall_tc_entry = mlxsw_sp_port_mall_tc_entry_find(mlxsw_sp_port,
 -                                                       cls->cookie);
 +                                                       f->cookie);
        if (!mall_tc_entry) {
                netdev_dbg(mlxsw_sp_port->dev, "tc entry not found on port\n");
                return;
        kfree(mall_tc_entry);
  }
  
 -static int mlxsw_sp_setup_tc(struct net_device *dev, u32 handle,
 -                           u32 chain_index, __be16 proto,
 -                           struct tc_to_netdev *tc)
 +static int mlxsw_sp_setup_tc_cls_matchall(struct mlxsw_sp_port *mlxsw_sp_port,
 +                                        struct tc_cls_matchall_offload *f)
  {
 -      struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
 -      bool ingress = TC_H_MAJ(handle) == TC_H_MAJ(TC_H_INGRESS);
 +      bool ingress;
  
 -      if (chain_index)
 +      if (is_classid_clsact_ingress(f->common.classid))
 +              ingress = true;
 +      else if (is_classid_clsact_egress(f->common.classid))
 +              ingress = false;
 +      else
                return -EOPNOTSUPP;
  
 -      switch (tc->type) {
 -      case TC_SETUP_MATCHALL:
 -              switch (tc->cls_mall->command) {
 -              case TC_CLSMATCHALL_REPLACE:
 -                      return mlxsw_sp_port_add_cls_matchall(mlxsw_sp_port,
 -                                                            proto,
 -                                                            tc->cls_mall,
 -                                                            ingress);
 -              case TC_CLSMATCHALL_DESTROY:
 -                      mlxsw_sp_port_del_cls_matchall(mlxsw_sp_port,
 -                                                     tc->cls_mall);
 -                      return 0;
 -              default:
 -                      return -EOPNOTSUPP;
 -              }
 -      case TC_SETUP_CLSFLOWER:
 -              switch (tc->cls_flower->command) {
 -              case TC_CLSFLOWER_REPLACE:
 -                      return mlxsw_sp_flower_replace(mlxsw_sp_port, ingress,
 -                                                     proto, tc->cls_flower);
 -              case TC_CLSFLOWER_DESTROY:
 -                      mlxsw_sp_flower_destroy(mlxsw_sp_port, ingress,
 -                                              tc->cls_flower);
 -                      return 0;
 -              case TC_CLSFLOWER_STATS:
 -                      return mlxsw_sp_flower_stats(mlxsw_sp_port, ingress,
 -                                                   tc->cls_flower);
 -              default:
 -                      return -EOPNOTSUPP;
 -              }
 +      if (f->common.chain_index)
 +              return -EOPNOTSUPP;
 +
 +      switch (f->command) {
 +      case TC_CLSMATCHALL_REPLACE:
 +              return mlxsw_sp_port_add_cls_matchall(mlxsw_sp_port, f,
 +                                                    ingress);
 +      case TC_CLSMATCHALL_DESTROY:
 +              mlxsw_sp_port_del_cls_matchall(mlxsw_sp_port, f);
 +              return 0;
 +      default:
 +              return -EOPNOTSUPP;
 +      }
 +}
 +
 +static int
 +mlxsw_sp_setup_tc_cls_flower(struct mlxsw_sp_port *mlxsw_sp_port,
 +                           struct tc_cls_flower_offload *f)
 +{
 +      bool ingress;
 +
 +      if (is_classid_clsact_ingress(f->common.classid))
 +              ingress = true;
 +      else if (is_classid_clsact_egress(f->common.classid))
 +              ingress = false;
 +      else
 +              return -EOPNOTSUPP;
 +
 +      switch (f->command) {
 +      case TC_CLSFLOWER_REPLACE:
 +              return mlxsw_sp_flower_replace(mlxsw_sp_port, ingress, f);
 +      case TC_CLSFLOWER_DESTROY:
 +              mlxsw_sp_flower_destroy(mlxsw_sp_port, ingress, f);
 +              return 0;
 +      case TC_CLSFLOWER_STATS:
 +              return mlxsw_sp_flower_stats(mlxsw_sp_port, ingress, f);
 +      default:
 +              return -EOPNOTSUPP;
        }
 +}
  
 -      return -EOPNOTSUPP;
 +static int mlxsw_sp_setup_tc(struct net_device *dev, enum tc_setup_type type,
 +                           void *type_data)
 +{
 +      struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
 +
 +      switch (type) {
 +      case TC_SETUP_CLSMATCHALL:
 +              return mlxsw_sp_setup_tc_cls_matchall(mlxsw_sp_port, type_data);
 +      case TC_SETUP_CLSFLOWER:
 +              return mlxsw_sp_setup_tc_cls_flower(mlxsw_sp_port, type_data);
 +      default:
 +              return -EOPNOTSUPP;
 +      }
  }
  
  static const struct net_device_ops mlxsw_sp_port_netdev_ops = {
@@@ -3359,47 -3333,15 +3359,47 @@@ static const struct mlxsw_listener mlxs
        MLXSW_SP_RXL_MARK(ARPBC, MIRROR_TO_CPU, ARP, false),
        MLXSW_SP_RXL_MARK(ARPUC, MIRROR_TO_CPU, ARP, false),
        MLXSW_SP_RXL_NO_MARK(FID_MISS, TRAP_TO_CPU, IP2ME, false),
 +      MLXSW_SP_RXL_MARK(IPV6_MLDV12_LISTENER_QUERY, MIRROR_TO_CPU, IPV6_MLD,
 +                        false),
 +      MLXSW_SP_RXL_NO_MARK(IPV6_MLDV1_LISTENER_REPORT, TRAP_TO_CPU, IPV6_MLD,
 +                           false),
 +      MLXSW_SP_RXL_NO_MARK(IPV6_MLDV1_LISTENER_DONE, TRAP_TO_CPU, IPV6_MLD,
 +                           false),
 +      MLXSW_SP_RXL_NO_MARK(IPV6_MLDV2_LISTENER_REPORT, TRAP_TO_CPU, IPV6_MLD,
 +                           false),
        /* L3 traps */
 -      MLXSW_SP_RXL_NO_MARK(MTUERROR, TRAP_TO_CPU, ROUTER_EXP, false),
 -      MLXSW_SP_RXL_NO_MARK(TTLERROR, TRAP_TO_CPU, ROUTER_EXP, false),
 -      MLXSW_SP_RXL_NO_MARK(LBERROR, TRAP_TO_CPU, ROUTER_EXP, false),
 -      MLXSW_SP_RXL_MARK(OSPF, TRAP_TO_CPU, OSPF, false),
 -      MLXSW_SP_RXL_NO_MARK(IP2ME, TRAP_TO_CPU, IP2ME, false),
 -      MLXSW_SP_RXL_NO_MARK(RTR_INGRESS0, TRAP_TO_CPU, REMOTE_ROUTE, false),
 -      MLXSW_SP_RXL_NO_MARK(HOST_MISS_IPV4, TRAP_TO_CPU, ARP_MISS, false),
 -      MLXSW_SP_RXL_NO_MARK(BGP_IPV4, TRAP_TO_CPU, BGP_IPV4, false),
 +      MLXSW_SP_RXL_MARK(MTUERROR, TRAP_TO_CPU, ROUTER_EXP, false),
 +      MLXSW_SP_RXL_MARK(TTLERROR, TRAP_TO_CPU, ROUTER_EXP, false),
 +      MLXSW_SP_RXL_MARK(LBERROR, TRAP_TO_CPU, ROUTER_EXP, false),
 +      MLXSW_SP_RXL_MARK(IP2ME, TRAP_TO_CPU, IP2ME, false),
 +      MLXSW_SP_RXL_MARK(IPV6_UNSPECIFIED_ADDRESS, TRAP_TO_CPU, ROUTER_EXP,
 +                        false),
 +      MLXSW_SP_RXL_MARK(IPV6_LINK_LOCAL_DEST, TRAP_TO_CPU, ROUTER_EXP, false),
 +      MLXSW_SP_RXL_MARK(IPV6_LINK_LOCAL_SRC, TRAP_TO_CPU, ROUTER_EXP, false),
 +      MLXSW_SP_RXL_MARK(IPV6_ALL_NODES_LINK, TRAP_TO_CPU, ROUTER_EXP, false),
 +      MLXSW_SP_RXL_MARK(IPV6_ALL_ROUTERS_LINK, TRAP_TO_CPU, ROUTER_EXP,
 +                        false),
 +      MLXSW_SP_RXL_MARK(IPV4_OSPF, TRAP_TO_CPU, OSPF, false),
 +      MLXSW_SP_RXL_MARK(IPV6_OSPF, TRAP_TO_CPU, OSPF, false),
 +      MLXSW_SP_RXL_MARK(IPV6_DHCP, TRAP_TO_CPU, DHCP, false),
 +      MLXSW_SP_RXL_MARK(RTR_INGRESS0, TRAP_TO_CPU, REMOTE_ROUTE, false),
 +      MLXSW_SP_RXL_MARK(IPV4_BGP, TRAP_TO_CPU, BGP, false),
 +      MLXSW_SP_RXL_MARK(IPV6_BGP, TRAP_TO_CPU, BGP, false),
 +      MLXSW_SP_RXL_MARK(L3_IPV6_ROUTER_SOLICITATION, TRAP_TO_CPU, IPV6_ND,
 +                        false),
 +      MLXSW_SP_RXL_MARK(L3_IPV6_ROUTER_ADVERTISMENT, TRAP_TO_CPU, IPV6_ND,
 +                        false),
 +      MLXSW_SP_RXL_MARK(L3_IPV6_NEIGHBOR_SOLICITATION, TRAP_TO_CPU, IPV6_ND,
 +                        false),
 +      MLXSW_SP_RXL_MARK(L3_IPV6_NEIGHBOR_ADVERTISMENT, TRAP_TO_CPU, IPV6_ND,
 +                        false),
 +      MLXSW_SP_RXL_MARK(L3_IPV6_REDIRECTION, TRAP_TO_CPU, IPV6_ND, false),
 +      MLXSW_SP_RXL_MARK(IPV6_MC_LINK_LOCAL_DEST, TRAP_TO_CPU, ROUTER_EXP,
 +                        false),
 +      MLXSW_SP_RXL_MARK(HOST_MISS_IPV4, TRAP_TO_CPU, HOST_MISS, false),
 +      MLXSW_SP_RXL_MARK(HOST_MISS_IPV6, TRAP_TO_CPU, HOST_MISS, false),
 +      MLXSW_SP_RXL_MARK(ROUTER_ALERT_IPV4, TRAP_TO_CPU, ROUTER_EXP, false),
 +      MLXSW_SP_RXL_MARK(ROUTER_ALERT_IPV6, TRAP_TO_CPU, ROUTER_EXP, false),
        /* PKT Sample trap */
        MLXSW_RXL(mlxsw_sp_rx_listener_sample_func, PKT_SAMPLE, MIRROR_TO_CPU,
                  false, SP_IP2ME, DISCARD),
@@@ -3434,17 -3376,15 +3434,17 @@@ static int mlxsw_sp_cpu_policers_set(st
                        burst_size = 7;
                        break;
                case MLXSW_REG_HTGT_TRAP_GROUP_SP_IGMP:
 +              case MLXSW_REG_HTGT_TRAP_GROUP_SP_IPV6_MLD:
                        rate = 16 * 1024;
                        burst_size = 10;
                        break;
 -              case MLXSW_REG_HTGT_TRAP_GROUP_SP_BGP_IPV4:
 +              case MLXSW_REG_HTGT_TRAP_GROUP_SP_BGP:
                case MLXSW_REG_HTGT_TRAP_GROUP_SP_ARP:
                case MLXSW_REG_HTGT_TRAP_GROUP_SP_DHCP:
 -              case MLXSW_REG_HTGT_TRAP_GROUP_SP_ARP_MISS:
 +              case MLXSW_REG_HTGT_TRAP_GROUP_SP_HOST_MISS:
                case MLXSW_REG_HTGT_TRAP_GROUP_SP_ROUTER_EXP:
                case MLXSW_REG_HTGT_TRAP_GROUP_SP_REMOTE_ROUTE:
 +              case MLXSW_REG_HTGT_TRAP_GROUP_SP_IPV6_ND:
                        rate = 1024;
                        burst_size = 7;
                        break;
@@@ -3493,23 -3433,21 +3493,23 @@@ static int mlxsw_sp_trap_groups_set(str
                        priority = 5;
                        tc = 5;
                        break;
 -              case MLXSW_REG_HTGT_TRAP_GROUP_SP_BGP_IPV4:
 +              case MLXSW_REG_HTGT_TRAP_GROUP_SP_BGP:
                case MLXSW_REG_HTGT_TRAP_GROUP_SP_DHCP:
                        priority = 4;
                        tc = 4;
                        break;
                case MLXSW_REG_HTGT_TRAP_GROUP_SP_IGMP:
                case MLXSW_REG_HTGT_TRAP_GROUP_SP_IP2ME:
 +              case MLXSW_REG_HTGT_TRAP_GROUP_SP_IPV6_MLD:
                        priority = 3;
                        tc = 3;
                        break;
                case MLXSW_REG_HTGT_TRAP_GROUP_SP_ARP:
 +              case MLXSW_REG_HTGT_TRAP_GROUP_SP_IPV6_ND:
                        priority = 2;
                        tc = 2;
                        break;
 -              case MLXSW_REG_HTGT_TRAP_GROUP_SP_ARP_MISS:
 +              case MLXSW_REG_HTGT_TRAP_GROUP_SP_HOST_MISS:
                case MLXSW_REG_HTGT_TRAP_GROUP_SP_ROUTER_EXP:
                case MLXSW_REG_HTGT_TRAP_GROUP_SP_REMOTE_ROUTE:
                        priority = 1;
@@@ -3756,7 -3694,7 +3756,7 @@@ static void mlxsw_sp_fini(struct mlxsw_
        mlxsw_sp_fids_fini(mlxsw_sp);
  }
  
 -static struct mlxsw_config_profile mlxsw_sp_config_profile = {
 +static const struct mlxsw_config_profile mlxsw_sp_config_profile = {
        .used_max_vepa_channels         = 1,
        .max_vepa_channels              = 0,
        .used_max_mid                   = 1,
@@@ -4201,6 -4139,8 +4201,8 @@@ static int mlxsw_sp_netdevice_port_uppe
                        return -EINVAL;
                if (!info->linking)
                        break;
+               if (netdev_has_any_upper_dev(upper_dev))
+                       return -EINVAL;
                if (netif_is_lag_master(upper_dev) &&
                    !mlxsw_sp_master_lag_check(mlxsw_sp, upper_dev,
                                               info->upper_info))
@@@ -4320,6 -4260,10 +4322,10 @@@ static int mlxsw_sp_netdevice_port_vlan
                upper_dev = info->upper_dev;
                if (!netif_is_bridge_master(upper_dev))
                        return -EINVAL;
+               if (!info->linking)
+                       break;
+               if (netdev_has_any_upper_dev(upper_dev))
+                       return -EINVAL;
                break;
        case NETDEV_CHANGEUPPER:
                upper_dev = info->upper_dev;
@@@ -4419,10 -4363,6 +4425,10 @@@ static struct notifier_block mlxsw_sp_i
        .priority = 10, /* Must be called before FIB notifier block */
  };
  
 +static struct notifier_block mlxsw_sp_inet6addr_nb __read_mostly = {
 +      .notifier_call = mlxsw_sp_inet6addr_event,
 +};
 +
  static struct notifier_block mlxsw_sp_router_netevent_nb __read_mostly = {
        .notifier_call = mlxsw_sp_router_netevent_event,
  };
@@@ -4443,7 -4383,6 +4449,7 @@@ static int __init mlxsw_sp_module_init(
  
        register_netdevice_notifier(&mlxsw_sp_netdevice_nb);
        register_inetaddr_notifier(&mlxsw_sp_inetaddr_nb);
 +      register_inet6addr_notifier(&mlxsw_sp_inet6addr_nb);
        register_netevent_notifier(&mlxsw_sp_router_netevent_nb);
  
        err = mlxsw_core_driver_register(&mlxsw_sp_driver);
@@@ -4460,7 -4399,6 +4466,7 @@@ err_pci_driver_register
        mlxsw_core_driver_unregister(&mlxsw_sp_driver);
  err_core_driver_register:
        unregister_netevent_notifier(&mlxsw_sp_router_netevent_nb);
 +      unregister_inet6addr_notifier(&mlxsw_sp_inet6addr_nb);
        unregister_inetaddr_notifier(&mlxsw_sp_inetaddr_nb);
        unregister_netdevice_notifier(&mlxsw_sp_netdevice_nb);
        return err;
@@@ -4471,7 -4409,6 +4477,7 @@@ static void __exit mlxsw_sp_module_exit
        mlxsw_pci_driver_unregister(&mlxsw_sp_pci_driver);
        mlxsw_core_driver_unregister(&mlxsw_sp_driver);
        unregister_netevent_notifier(&mlxsw_sp_router_netevent_nb);
 +      unregister_inet6addr_notifier(&mlxsw_sp_inet6addr_nb);
        unregister_inetaddr_notifier(&mlxsw_sp_inetaddr_nb);
        unregister_netdevice_notifier(&mlxsw_sp_netdevice_nb);
  }
index d868a5700e01595ddc763ab526324403380ab116,74a96d6bb05ce1c4b3a64d03d18016327b697a84..d396183108f76dc25f0d28724c5bee42b1948666
@@@ -105,43 -105,62 +105,62 @@@ static in
  nfp_flower_calculate_key_layers(struct nfp_fl_key_ls *ret_key_ls,
                                struct tc_cls_flower_offload *flow)
  {
-       struct flow_dissector_key_control *mask_enc_ctl;
-       struct flow_dissector_key_basic *mask_basic;
-       struct flow_dissector_key_basic *key_basic;
+       struct flow_dissector_key_basic *mask_basic = NULL;
+       struct flow_dissector_key_basic *key_basic = NULL;
+       struct flow_dissector_key_ip *mask_ip = NULL;
        u32 key_layer_two;
        u8 key_layer;
        int key_size;
  
-       mask_enc_ctl = skb_flow_dissector_target(flow->dissector,
-                                                FLOW_DISSECTOR_KEY_ENC_CONTROL,
-                                                flow->mask);
+       if (dissector_uses_key(flow->dissector,
+                              FLOW_DISSECTOR_KEY_ENC_CONTROL)) {
+               struct flow_dissector_key_control *mask_enc_ctl =
+                       skb_flow_dissector_target(flow->dissector,
+                                                 FLOW_DISSECTOR_KEY_ENC_CONTROL,
+                                                 flow->mask);
+               /* We are expecting a tunnel. For now we ignore offloading. */
+               if (mask_enc_ctl->addr_type)
+                       return -EOPNOTSUPP;
+       }
+       if (dissector_uses_key(flow->dissector, FLOW_DISSECTOR_KEY_BASIC)) {
+               mask_basic = skb_flow_dissector_target(flow->dissector,
+                                                      FLOW_DISSECTOR_KEY_BASIC,
+                                                      flow->mask);
  
-       mask_basic = skb_flow_dissector_target(flow->dissector,
-                                              FLOW_DISSECTOR_KEY_BASIC,
-                                              flow->mask);
+               key_basic = skb_flow_dissector_target(flow->dissector,
+                                                     FLOW_DISSECTOR_KEY_BASIC,
+                                                     flow->key);
+       }
+       if (dissector_uses_key(flow->dissector, FLOW_DISSECTOR_KEY_IP))
+               mask_ip = skb_flow_dissector_target(flow->dissector,
+                                                   FLOW_DISSECTOR_KEY_IP,
+                                                   flow->mask);
  
-       key_basic = skb_flow_dissector_target(flow->dissector,
-                                             FLOW_DISSECTOR_KEY_BASIC,
-                                             flow->key);
        key_layer_two = 0;
        key_layer = NFP_FLOWER_LAYER_PORT | NFP_FLOWER_LAYER_MAC;
        key_size = sizeof(struct nfp_flower_meta_one) +
                   sizeof(struct nfp_flower_in_port) +
                   sizeof(struct nfp_flower_mac_mpls);
  
-       /* We are expecting a tunnel. For now we ignore offloading. */
-       if (mask_enc_ctl->addr_type)
-               return -EOPNOTSUPP;
-       if (mask_basic->n_proto) {
+       if (mask_basic && mask_basic->n_proto) {
                /* Ethernet type is present in the key. */
                switch (key_basic->n_proto) {
                case cpu_to_be16(ETH_P_IP):
+                       if (mask_ip && mask_ip->tos)
+                               return -EOPNOTSUPP;
+                       if (mask_ip && mask_ip->ttl)
+                               return -EOPNOTSUPP;
                        key_layer |= NFP_FLOWER_LAYER_IPV4;
                        key_size += sizeof(struct nfp_flower_ipv4);
                        break;
  
                case cpu_to_be16(ETH_P_IPV6):
+                       if (mask_ip && mask_ip->tos)
+                               return -EOPNOTSUPP;
+                       if (mask_ip && mask_ip->ttl)
+                               return -EOPNOTSUPP;
                        key_layer |= NFP_FLOWER_LAYER_IPV6;
                        key_size += sizeof(struct nfp_flower_ipv6);
                        break;
                case cpu_to_be16(ETH_P_ARP):
                        return -EOPNOTSUPP;
  
+               /* Currently we do not offload MPLS. */
+               case cpu_to_be16(ETH_P_MPLS_UC):
+               case cpu_to_be16(ETH_P_MPLS_MC):
+                       return -EOPNOTSUPP;
                /* Will be included in layer 2. */
                case cpu_to_be16(ETH_P_8021Q):
                        break;
                }
        }
  
-       if (mask_basic->ip_proto) {
+       if (mask_basic && mask_basic->ip_proto) {
                /* Ethernet type is present in the key. */
                switch (key_basic->ip_proto) {
                case IPPROTO_TCP:
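The reworked key-layer calculation above only calls skb_flow_dissector_target() for keys the dissector actually carries, leaving the mask pointers NULL otherwise and NULL-checking them before every use. A compact standalone sketch of that look-up-then-guard pattern, with a hypothetical find_key() standing in for the dissector helpers:

#include <stdio.h>
#include <stddef.h>

struct basic_key { int n_proto; int ip_proto; };

/* Hypothetical lookup: returns NULL when the flow has no such key. */
static struct basic_key *find_key(int present)
{
	static struct basic_key key = { .n_proto = 0x0800, .ip_proto = 6 };

	return present ? &key : NULL;
}

static int classify(int has_basic_key)
{
	struct basic_key *mask = find_key(has_basic_key);

	/* Guard every dereference: a missing key is not an error here. */
	if (mask && mask->n_proto)
		printf("match on ethertype 0x%04x\n", mask->n_proto);
	if (mask && mask->ip_proto)
		printf("match on ip proto %d\n", mask->ip_proto);
	return 0;
}

int main(void)
{
	classify(1);	/* key present: both matches printed */
	classify(0);	/* key absent: nothing dereferenced */
	return 0;
}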
@@@ -385,15 -409,16 +409,15 @@@ nfp_flower_repr_offload(struct nfp_app 
  }
  
  int nfp_flower_setup_tc(struct nfp_app *app, struct net_device *netdev,
 -                      u32 handle, __be16 proto, struct tc_to_netdev *tc)
 +                      enum tc_setup_type type, void *type_data)
  {
 -      if (TC_H_MAJ(handle) != TC_H_MAJ(TC_H_INGRESS))
 -              return -EOPNOTSUPP;
 +      struct tc_cls_flower_offload *cls_flower = type_data;
  
 -      if (!eth_proto_is_802_3(proto))
 +      if (type != TC_SETUP_CLSFLOWER ||
 +          !is_classid_clsact_ingress(cls_flower->common.classid) ||
 +          !eth_proto_is_802_3(cls_flower->common.protocol) ||
 +          cls_flower->common.chain_index)
                return -EOPNOTSUPP;
  
 -      if (tc->type != TC_SETUP_CLSFLOWER)
 -              return -EINVAL;
 -
 -      return nfp_flower_repr_offload(app, netdev, tc->cls_flower);
 +      return nfp_flower_repr_offload(app, netdev, cls_flower);
  }
index dd769eceb33d3a91ebb237aa4ccdb8b2de84ff0d,3f199db2002e5ce1c4a0dabc1475647426f6c876..f055b1774d65312a492010dd92b790559435c71c
@@@ -98,21 -98,20 +98,20 @@@ static int nfp_pcie_sriov_enable(struc
        struct nfp_pf *pf = pci_get_drvdata(pdev);
        int err;
  
-       mutex_lock(&pf->lock);
        if (num_vfs > pf->limit_vfs) {
                nfp_info(pf->cpp, "Firmware limits number of VFs to %u\n",
                         pf->limit_vfs);
-               err = -EINVAL;
-               goto err_unlock;
+               return -EINVAL;
        }
  
        err = pci_enable_sriov(pdev, num_vfs);
        if (err) {
                dev_warn(&pdev->dev, "Failed to enable PCI SR-IOV: %d\n", err);
-               goto err_unlock;
+               return err;
        }
  
+       mutex_lock(&pf->lock);
        err = nfp_app_sriov_enable(pf->app, num_vfs);
        if (err) {
                dev_warn(&pdev->dev,
        return num_vfs;
  
  err_sriov_disable:
-       pci_disable_sriov(pdev);
- err_unlock:
        mutex_unlock(&pf->lock);
+       pci_disable_sriov(pdev);
        return err;
  #endif
        return 0;
@@@ -158,10 -156,10 +156,10 @@@ static int nfp_pcie_sriov_disable(struc
  
        pf->num_vfs = 0;
  
+       mutex_unlock(&pf->lock);
        pci_disable_sriov(pdev);
        dev_dbg(&pdev->dev, "Removed VFs.\n");
-       mutex_unlock(&pf->lock);
  #endif
        return 0;
  }
@@@ -174,21 -172,6 +172,21 @@@ static int nfp_pcie_sriov_configure(str
                return nfp_pcie_sriov_enable(pdev, num_vfs);
  }
  
 +static const struct firmware *
 +nfp_net_fw_request(struct pci_dev *pdev, struct nfp_pf *pf, const char *name)
 +{
 +      const struct firmware *fw = NULL;
 +      int err;
 +
 +      err = request_firmware_direct(&fw, name, &pdev->dev);
 +      nfp_info(pf->cpp, "  %s: %s\n",
 +               name, err ? "not found" : "found, loading...");
 +      if (err)
 +              return NULL;
 +
 +      return fw;
 +}
 +
  /**
   * nfp_net_fw_find() - Find the correct firmware image for netdev mode
   * @pdev:     PCI Device structure
  static const struct firmware *
  nfp_net_fw_find(struct pci_dev *pdev, struct nfp_pf *pf)
  {
 -      const struct firmware *fw = NULL;
        struct nfp_eth_table_port *port;
 +      const struct firmware *fw;
        const char *fw_model;
        char fw_name[256];
 -      int spc, err = 0;
 -      int i, j;
 -
 +      const u8 *serial;
 +      u16 interface;
 +      int spc, i, j;
 +
 +      nfp_info(pf->cpp, "Looking for firmware file in order of priority:\n");
 +
 +      /* First try to find a firmware image specific for this device */
 +      interface = nfp_cpp_interface(pf->cpp);
 +      nfp_cpp_serial(pf->cpp, &serial);
 +      sprintf(fw_name, "netronome/serial-%pMF-%02hhx-%02hhx.nffw",
 +              serial, interface >> 8, interface & 0xff);
 +      fw = nfp_net_fw_request(pdev, pf, fw_name);
 +      if (fw)
 +              return fw;
 +
 +      /* Then try the PCI name */
 +      sprintf(fw_name, "netronome/pci-%s.nffw", pci_name(pdev));
 +      fw = nfp_net_fw_request(pdev, pf, fw_name);
 +      if (fw)
 +              return fw;
 +
 +      /* Finally try the card type and media */
        if (!pf->eth_tbl) {
                dev_err(&pdev->dev, "Error: can't identify media config\n");
                return NULL;
        if (spc <= 0)
                return NULL;
  
 -      err = request_firmware(&fw, fw_name, &pdev->dev);
 -      if (err)
 -              return NULL;
 -
 -      dev_info(&pdev->dev, "Loading FW image: %s\n", fw_name);
 -
 -      return fw;
 +      return nfp_net_fw_request(pdev, pf, fw_name);
  }
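nfp_net_fw_find() above now probes firmware names in a fixed priority order (serial-specific, then PCI address, then card/media type), stopping at the first image request_firmware_direct() can satisfy. A small standalone sketch of that first-match-wins probing, with a hypothetical try_load() and made-up firmware names in place of the real firmware API and naming scheme:

#include <stdio.h>
#include <string.h>

/* Hypothetical loader: succeeds only for the board-specific name. */
static int try_load(const char *name)
{
	int found = strcmp(name, "vendor/board-media.fw") == 0;

	printf("  %s: %s\n", name, found ? "found, loading..." : "not found");
	return found;
}

static const char *find_firmware(void)
{
	/* Most specific name first, most generic last. */
	static const char *candidates[] = {
		"vendor/serial-00-11-22-33-44-55.fw",
		"vendor/pci-0000:04:00.0.fw",
		"vendor/board-media.fw",
	};
	size_t i;

	for (i = 0; i < sizeof(candidates) / sizeof(candidates[0]); i++)
		if (try_load(candidates[i]))
			return candidates[i];
	return NULL;
}

int main(void)
{
	const char *fw = find_firmware();

	printf("selected: %s\n", fw ? fw : "(none)");
	return fw ? 0 : 1;
}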
  
  /**
index ecbec28cfa76441bed835b94204fca16ccf75027,66a09e490cf5a5f2ce19193653910e0fcbe5632e..2920889fa6d6198a0a44abe32c079fd6419c3e32
@@@ -71,7 -71,6 +71,7 @@@
  #include "nfp_app.h"
  #include "nfp_net_ctrl.h"
  #include "nfp_net.h"
 +#include "nfp_net_sriov.h"
  #include "nfp_port.h"
  
  /**
@@@ -896,6 -895,8 +896,8 @@@ static int nfp_net_tx(struct sk_buff *s
  
        netdev_tx_sent_queue(nd_q, txbuf->real_len);
  
+       skb_tx_timestamp(skb);
        tx_ring->wr_p += nr_frags + 1;
        if (nfp_net_tx_ring_should_stop(tx_ring))
                nfp_net_tx_ring_stop(nd_q, tx_ring);
        if (!skb->xmit_more || netif_xmit_stopped(nd_q))
                nfp_net_tx_xmit_more_flush(tx_ring);
  
-       skb_tx_timestamp(skb);
        return NETDEV_TX_OK;
  
  err_unmap:
@@@ -1752,6 -1751,10 +1752,10 @@@ static int nfp_net_rx(struct nfp_net_rx
                        continue;
                }
  
+               nfp_net_dma_unmap_rx(dp, rxbuf->dma_addr);
+               nfp_net_rx_give_one(dp, rx_ring, new_frag, new_dma_addr);
                if (likely(!meta.portid)) {
                        netdev = dp->netdev;
                } else {
                        nn = netdev_priv(dp->netdev);
                        netdev = nfp_app_repr_get(nn->app, meta.portid);
                        if (unlikely(!netdev)) {
-                               nfp_net_rx_drop(dp, r_vec, rx_ring, rxbuf, skb);
+                               nfp_net_rx_drop(dp, r_vec, rx_ring, NULL, skb);
                                continue;
                        }
                        nfp_repr_inc_rx_stats(netdev, pkt_len);
                }
  
-               nfp_net_dma_unmap_rx(dp, rxbuf->dma_addr);
-               nfp_net_rx_give_one(dp, rx_ring, new_frag, new_dma_addr);
                skb_reserve(skb, pkt_off);
                skb_put(skb, pkt_len);
  
@@@ -2660,7 -2659,6 +2660,7 @@@ static int nfp_net_netdev_close(struct 
        /* Step 2: Tell NFP
         */
        nfp_net_clear_config_and_disable(nn);
 +      nfp_port_configure(netdev, false);
  
        /* Step 3: Free resources
         */
@@@ -2778,21 -2776,16 +2778,21 @@@ static int nfp_net_netdev_open(struct n
                goto err_free_all;
  
        /* Step 2: Configure the NFP
 +       * - Ifup the physical interface if it exists
         * - Enable rings from 0 to tx_rings/rx_rings - 1.
         * - Write MAC address (in case it changed)
         * - Set the MTU
         * - Set the Freelist buffer size
         * - Enable the FW
         */
 -      err = nfp_net_set_config_and_enable(nn);
 +      err = nfp_port_configure(netdev, true);
        if (err)
                goto err_free_all;
  
 +      err = nfp_net_set_config_and_enable(nn);
 +      if (err)
 +              goto err_port_disable;
 +
        /* Step 3: Enable for kernel
         * - put some freelist descriptors on each RX ring
         * - enable NAPI on each ring
  
        return 0;
  
 +err_port_disable:
 +      nfp_port_configure(netdev, false);
  err_free_all:
        nfp_net_close_free_all(nn);
        return err;
@@@ -3422,11 -3413,6 +3422,11 @@@ const struct net_device_ops nfp_net_net
        .ndo_get_stats64        = nfp_net_stat64,
        .ndo_vlan_rx_add_vid    = nfp_net_vlan_rx_add_vid,
        .ndo_vlan_rx_kill_vid   = nfp_net_vlan_rx_kill_vid,
 +      .ndo_set_vf_mac         = nfp_app_set_vf_mac,
 +      .ndo_set_vf_vlan        = nfp_app_set_vf_vlan,
 +      .ndo_set_vf_spoofchk    = nfp_app_set_vf_spoofchk,
 +      .ndo_get_vf_config      = nfp_app_get_vf_config,
 +      .ndo_set_vf_link_state  = nfp_app_set_vf_link_state,
        .ndo_setup_tc           = nfp_port_setup_tc,
        .ndo_tx_timeout         = nfp_net_tx_timeout,
        .ndo_set_rx_mode        = nfp_net_set_rx_mode,
index 2da083fd5e137c25e06376816314f332cdba9271,34b985384d26129435686dca6c86b3e6e092cde2..7c22cc4654b74867338c9e087f6f2af7e7d63a2f
@@@ -57,7 -57,6 +57,7 @@@
  #include "nfpcore/nfp6000_pcie.h"
  #include "nfp_app.h"
  #include "nfp_net_ctrl.h"
 +#include "nfp_net_sriov.h"
  #include "nfp_net.h"
  #include "nfp_main.h"
  #include "nfp_port.h"
@@@ -389,7 -388,7 +389,7 @@@ nfp_net_pf_app_init(struct nfp_pf *pf, 
                                        NFP_PF_CSR_SLICE_SIZE,
                                        &pf->ctrl_vnic_bar);
        if (IS_ERR(ctrl_bar)) {
 -              nfp_err(pf->cpp, "Failed to find data vNIC memory symbol\n");
 +              nfp_err(pf->cpp, "Failed to find ctrl vNIC memory symbol\n");
                err = PTR_ERR(ctrl_bar);
                goto err_app_clean;
        }
@@@ -457,13 -456,9 +457,9 @@@ static int nfp_net_pf_app_start(struct 
  {
        int err;
  
-       err = nfp_net_pf_app_start_ctrl(pf);
-       if (err)
-               return err;
        err = nfp_app_start(pf->app, pf->ctrl_vnic);
        if (err)
-               goto err_ctrl_stop;
+               return err;
  
        if (pf->num_vfs) {
                err = nfp_app_sriov_enable(pf->app, pf->num_vfs);
  
  err_app_stop:
        nfp_app_stop(pf->app);
- err_ctrl_stop:
-       nfp_net_pf_app_stop_ctrl(pf);
        return err;
  }
  
@@@ -485,13 -478,10 +479,12 @@@ static void nfp_net_pf_app_stop(struct 
        if (pf->num_vfs)
                nfp_app_sriov_disable(pf->app);
        nfp_app_stop(pf->app);
-       nfp_net_pf_app_stop_ctrl(pf);
  }
  
  static void nfp_net_pci_unmap_mem(struct nfp_pf *pf)
  {
 +      if (pf->vfcfg_tbl2_area)
 +              nfp_cpp_area_release_free(pf->vfcfg_tbl2_area);
        if (pf->vf_cfg_bar)
                nfp_cpp_area_release_free(pf->vf_cfg_bar);
        if (pf->mac_stats_bar)
@@@ -507,7 -497,7 +500,7 @@@ static int nfp_net_pci_map_mem(struct n
        int err;
  
        min_size = pf->max_data_vnics * NFP_PF_CSR_SLICE_SIZE;
 -      mem = nfp_net_pf_map_rtsym(pf, "net.ctrl", "_pf%d_net_bar0",
 +      mem = nfp_net_pf_map_rtsym(pf, "net.bar0", "_pf%d_net_bar0",
                                   min_size, &pf->data_vnic_bar);
        if (IS_ERR(mem)) {
                nfp_err(pf->cpp, "Failed to find data vNIC memory symbol\n");
                pf->vf_cfg_mem = NULL;
        }
  
 +      min_size = NFP_NET_VF_CFG_SZ * pf->limit_vfs + NFP_NET_VF_CFG_MB_SZ;
 +      pf->vfcfg_tbl2 = nfp_net_pf_map_rtsym(pf, "net.vfcfg_tbl2",
 +                                            "_pf%d_net_vf_cfg2",
 +                                            min_size, &pf->vfcfg_tbl2_area);
 +      if (IS_ERR(pf->vfcfg_tbl2)) {
 +              if (PTR_ERR(pf->vfcfg_tbl2) != -ENOENT) {
 +                      err = PTR_ERR(pf->vfcfg_tbl2);
 +                      goto err_unmap_vf_cfg;
 +              }
 +              pf->vfcfg_tbl2 = NULL;
 +      }
 +
        mem = nfp_cpp_map_area(pf->cpp, "net.qc", 0, 0,
                               NFP_PCIE_QUEUE(0), NFP_QCP_QUEUE_AREA_SZ,
                               &pf->qc_area);
        if (IS_ERR(mem)) {
                nfp_err(pf->cpp, "Failed to map Queue Controller area.\n");
                err = PTR_ERR(mem);
 -              goto err_unmap_vf_cfg;
 +              goto err_unmap_vfcfg_tbl2;
        }
  
        return 0;
  
 +err_unmap_vfcfg_tbl2:
 +      if (pf->vfcfg_tbl2_area)
 +              nfp_cpp_area_release_free(pf->vfcfg_tbl2_area);
  err_unmap_vf_cfg:
        if (pf->vf_cfg_bar)
                nfp_cpp_area_release_free(pf->vf_cfg_bar);
@@@ -577,7 -552,7 +570,7 @@@ err_unmap_ctrl
  
  static void nfp_net_pci_remove_finish(struct nfp_pf *pf)
  {
-       nfp_net_pf_app_stop(pf);
+       nfp_net_pf_app_stop_ctrl(pf);
        /* stop app first, to avoid double free of ctrl vNIC's ddir */
        nfp_net_debugfs_dir_clean(&pf->ddir);
  
@@@ -708,6 -683,7 +701,7 @@@ int nfp_net_pci_probe(struct nfp_pf *pf
  {
        struct nfp_net_fw_version fw_ver;
        u8 __iomem *ctrl_bar, *qc_bar;
+       struct nfp_net *nn;
        int stride;
        int err;
  
        if (!pf->rtbl) {
                nfp_err(pf->cpp, "No %s, giving up.\n",
                        pf->fw_loaded ? "symbol table" : "firmware found");
 -              return -EPROBE_DEFER;
 +              return -EINVAL;
        }
  
        mutex_lock(&pf->lock);
        if (err)
                goto err_free_vnics;
  
-       err = nfp_net_pf_app_start(pf);
+       err = nfp_net_pf_app_start_ctrl(pf);
        if (err)
                goto err_free_irqs;
  
        if (err)
                goto err_stop_app;
  
+       err = nfp_net_pf_app_start(pf);
+       if (err)
+               goto err_clean_vnics;
        mutex_unlock(&pf->lock);
  
        return 0;
  
+ err_clean_vnics:
+       list_for_each_entry(nn, &pf->vnics, vnic_list)
+               if (nfp_net_is_data_vnic(nn))
+                       nfp_net_pf_clean_vnic(pf, nn);
  err_stop_app:
-       nfp_net_pf_app_stop(pf);
+       nfp_net_pf_app_stop_ctrl(pf);
  err_free_irqs:
        nfp_net_pf_free_irqs(pf);
  err_free_vnics:
@@@ -821,6 -805,8 +823,8 @@@ void nfp_net_pci_remove(struct nfp_pf *
        if (list_empty(&pf->vnics))
                goto out;
  
+       nfp_net_pf_app_stop(pf);
        list_for_each_entry(nn, &pf->vnics, vnic_list)
                if (nfp_net_is_data_vnic(nn))
                        nfp_net_pf_clean_vnic(pf, nn);
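
The nfp_main.c changes above map an additional, optional resource (the VF config table): a missing run-time symbol (-ENOENT) leaves the pointer NULL and the probe continues, while any other error unwinds. A small user-space sketch of that "optional resource" shape, with made-up names and a fake lookup:

#include <errno.h>
#include <stdio.h>

/* Pretend lookup: names starting with '_' "exist", others do not. */
static int map_symbol(const char *name, void **out)
{
        if (name[0] == '_') {
                *out = (void *)0x1000;
                return 0;
        }
        return -ENOENT;
}

static int map_resources(void)
{
        void *bar0, *tbl2;
        int err;

        err = map_symbol("_pf0_net_bar0", &bar0);
        if (err)
                return err;             /* mandatory: fail the probe */

        err = map_symbol("vfcfg_tbl2", &tbl2);
        if (err) {
                if (err != -ENOENT)
                        return err;     /* real error: unwind */
                tbl2 = NULL;            /* optional: carry on without it */
        }

        printf("bar0=%p tbl2=%p\n", bar0, tbl2);
        return 0;
}

int main(void) { return map_resources() ? 1 : 0; }
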
index 458d55ba423f599ec824dc327f248a9b64d8af96,e3223f2fe2ffc9d4b186a42e0cac87fc37021afd..fe2599b83d09066353ecd98d8f33c0f5ae75756b
@@@ -144,23 -144,42 +144,23 @@@ static int ql_get_serdes_regs(struct ql
        xaui_direct_valid = xaui_indirect_valid = 1;
  
        /* The XAUI needs to be read out per port */
 -      if (qdev->func & 1) {
 -              /* We are NIC 2 */
 -              status = ql_read_other_func_serdes_reg(qdev,
 -                              XG_SERDES_XAUI_HSS_PCS_START, &temp);
 -              if (status)
 -                      temp = XG_SERDES_ADDR_XAUI_PWR_DOWN;
 -              if ((temp & XG_SERDES_ADDR_XAUI_PWR_DOWN) ==
 -                                      XG_SERDES_ADDR_XAUI_PWR_DOWN)
 -                      xaui_indirect_valid = 0;
 -
 -              status = ql_read_serdes_reg(qdev,
 -                              XG_SERDES_XAUI_HSS_PCS_START, &temp);
 -              if (status)
 -                      temp = XG_SERDES_ADDR_XAUI_PWR_DOWN;
 -
 -              if ((temp & XG_SERDES_ADDR_XAUI_PWR_DOWN) ==
 -                                      XG_SERDES_ADDR_XAUI_PWR_DOWN)
 -                      xaui_direct_valid = 0;
 -      } else {
 -              /* We are NIC 1 */
 -              status = ql_read_other_func_serdes_reg(qdev,
 -                              XG_SERDES_XAUI_HSS_PCS_START, &temp);
 -              if (status)
 -                      temp = XG_SERDES_ADDR_XAUI_PWR_DOWN;
 -              if ((temp & XG_SERDES_ADDR_XAUI_PWR_DOWN) ==
 -                                      XG_SERDES_ADDR_XAUI_PWR_DOWN)
 -                      xaui_indirect_valid = 0;
 -
 -              status = ql_read_serdes_reg(qdev,
 -                              XG_SERDES_XAUI_HSS_PCS_START, &temp);
 -              if (status)
 -                      temp = XG_SERDES_ADDR_XAUI_PWR_DOWN;
 -              if ((temp & XG_SERDES_ADDR_XAUI_PWR_DOWN) ==
 -                                      XG_SERDES_ADDR_XAUI_PWR_DOWN)
 -                      xaui_direct_valid = 0;
 -      }
 +      status = ql_read_other_func_serdes_reg(qdev,
 +                      XG_SERDES_XAUI_HSS_PCS_START, &temp);
 +      if (status)
 +              temp = XG_SERDES_ADDR_XAUI_PWR_DOWN;
 +
 +      if ((temp & XG_SERDES_ADDR_XAUI_PWR_DOWN) ==
 +                              XG_SERDES_ADDR_XAUI_PWR_DOWN)
 +              xaui_indirect_valid = 0;
 +
 +      status = ql_read_serdes_reg(qdev, XG_SERDES_XAUI_HSS_PCS_START, &temp);
 +
 +      if (status)
 +              temp = XG_SERDES_ADDR_XAUI_PWR_DOWN;
 +
 +      if ((temp & XG_SERDES_ADDR_XAUI_PWR_DOWN) ==
 +                              XG_SERDES_ADDR_XAUI_PWR_DOWN)
 +              xaui_direct_valid = 0;
  
        /*
         * XFI register is shared so only need to read one
@@@ -705,7 -724,7 +705,7 @@@ static void ql_build_coredump_seg_heade
        seg_hdr->cookie = MPI_COREDUMP_COOKIE;
        seg_hdr->segNum = seg_number;
        seg_hdr->segSize = seg_size;
-       memcpy(seg_hdr->description, desc, (sizeof(seg_hdr->description)) - 1);
+       strncpy(seg_hdr->description, desc, (sizeof(seg_hdr->description)) - 1);
  }
  
  /*
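
The qlge hunk above swaps memcpy() for strncpy() when filling the fixed-size description field: memcpy() always reads sizeof(description) - 1 bytes from the source regardless of its length, while strncpy() stops at the terminating NUL and zero-pads the rest. A small user-space illustration of the difference, using made-up sizes:

#include <stdio.h>
#include <string.h>

int main(void)
{
        char dst[16];
        const char desc[32] = "CORE";   /* NUL-terminated, shorter than dst */

        /* memcpy would read sizeof(dst) - 1 bytes from the source even
         * when the string is shorter; with a plain short C string that
         * is a read past the end of the string.
         */
        memcpy(dst, desc, sizeof(dst) - 1);

        /* strncpy copies at most sizeof(dst) - 1 bytes, stops at the
         * NUL and pads the remainder of dst with zeros.
         */
        strncpy(dst, desc, sizeof(dst) - 1);
        dst[sizeof(dst) - 1] = '\0';

        printf("%s\n", dst);
        return 0;
}
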
index fac44c5c8d0d6fa5f4ced3e267f3b6a5e3f653f2,d91cbc6c3ca4eee43090bccc70c76b9a9d1fbb85..05ee870c3636cca58cac402d7ab98d1d111ad54d
@@@ -33,9 -33,6 +33,9 @@@
  #include <linux/if_vlan.h>
  #include <linux/in.h>
  #include <linux/slab.h>
 +#include <linux/rtnetlink.h>
 +#include <linux/netpoll.h>
 +
  #include <net/arp.h>
  #include <net/route.h>
  #include <net/sock.h>
  
  #include "hyperv_net.h"
  
 -#define RING_SIZE_MIN 64
 +#define RING_SIZE_MIN         64
 +#define NETVSC_MIN_TX_SECTIONS        10
 +#define NETVSC_DEFAULT_TX     192     /* ~1M */
 +#define NETVSC_MIN_RX_SECTIONS        10      /* ~64K */
 +#define NETVSC_DEFAULT_RX     2048    /* ~4M */
 +
  #define LINKCHANGE_INT (2 * HZ)
 +#define VF_TAKEOVER_INT (HZ / 10)
  
  static int ring_size = 128;
  module_param(ring_size, int, S_IRUGO);
@@@ -78,8 -69,7 +78,8 @@@ static void netvsc_set_multicast_list(s
  static int netvsc_open(struct net_device *net)
  {
        struct net_device_context *ndev_ctx = netdev_priv(net);
 -      struct netvsc_device *nvdev = ndev_ctx->nvdev;
 +      struct net_device *vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
 +      struct netvsc_device *nvdev = rtnl_dereference(ndev_ctx->nvdev);
        struct rndis_device *rdev;
        int ret = 0;
  
        netif_tx_wake_all_queues(net);
  
        rdev = nvdev->extension;
 -      if (!rdev->link_state && !ndev_ctx->datapath)
 +
 +      if (!rdev->link_state)
                netif_carrier_on(net);
  
 -      return ret;
 +      if (vf_netdev) {
 +              /* Setting synthetic device up transparently sets
 +               * slave as up. If open fails, then slave will
 +               * still be offline (and not used).
 +               */
 +              ret = dev_open(vf_netdev);
 +              if (ret)
 +                      netdev_warn(net,
 +                                  "unable to open slave: %s: %d\n",
 +                                  vf_netdev->name, ret);
 +      }
 +      return 0;
  }
  
  static int netvsc_close(struct net_device *net)
  {
        struct net_device_context *net_device_ctx = netdev_priv(net);
 +      struct net_device *vf_netdev
 +              = rtnl_dereference(net_device_ctx->vf_netdev);
        struct netvsc_device *nvdev = rtnl_dereference(net_device_ctx->nvdev);
 -      int ret;
 +      int ret = 0;
        u32 aread, i, msec = 10, retry = 0, retry_max = 20;
        struct vmbus_channel *chn;
  
        netif_tx_disable(net);
  
 +      /* No need to close rndis filter if it is removed already */
 +      if (!nvdev)
 +              goto out;
 +
        ret = rndis_filter_close(nvdev);
        if (ret != 0) {
                netdev_err(net, "unable to close device (ret %d).\n", ret);
                ret = -ETIMEDOUT;
        }
  
 +out:
 +      if (vf_netdev)
 +              dev_close(vf_netdev);
 +
        return ret;
  }
  
  static void *init_ppi_data(struct rndis_message *msg, u32 ppi_size,
 -                              int pkt_type)
 +                         int pkt_type)
  {
        struct rndis_packet *rndis_pkt;
        struct rndis_per_packet_info *ppi;
        return ppi;
  }
  
 -/* Azure hosts don't support non-TCP port numbers in hashing yet. We compute
 - * hash for non-TCP traffic with only IP numbers.
 +/* Azure hosts don't support non-TCP port numbers in hashing for fragmented
 + * packets. We can use ethtool to change UDP hash level when necessary.
   */
 -static inline u32 netvsc_get_hash(struct sk_buff *skb, struct sock *sk)
 +static inline u32 netvsc_get_hash(
 +      struct sk_buff *skb,
 +      const struct net_device_context *ndc)
  {
        struct flow_keys flow;
        u32 hash;
        if (!skb_flow_dissect_flow_keys(skb, &flow, 0))
                return 0;
  
 -      if (flow.basic.ip_proto == IPPROTO_TCP) {
 +      if (flow.basic.ip_proto == IPPROTO_TCP ||
 +          (flow.basic.ip_proto == IPPROTO_UDP &&
 +           ((flow.basic.n_proto == htons(ETH_P_IP) && ndc->udp4_l4_hash) ||
 +            (flow.basic.n_proto == htons(ETH_P_IPV6) &&
 +             ndc->udp6_l4_hash)))) {
                return skb_get_hash(skb);
        } else {
                if (flow.basic.n_proto == htons(ETH_P_IP))
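
Per the updated comment, netvsc_get_hash() now takes the device context and folds UDP ports into the hash only when the per-device udp4_l4_hash/udp6_l4_hash flags allow it; TCP keeps the full 4-tuple hash. A condensed sketch of that decision with a toy hash function and invented field names:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct flow { bool is_ipv4, is_tcp, is_udp;
              uint32_t saddr, daddr; uint16_t sport, dport; };

struct dev_ctx { bool udp4_l4_hash, udp6_l4_hash; };

static uint32_t mix(uint32_t h, uint32_t v) { return (h ^ v) * 2654435761u; }

static uint32_t pick_hash(const struct flow *f, const struct dev_ctx *c)
{
        bool l4 = f->is_tcp ||
                  (f->is_udp && (f->is_ipv4 ? c->udp4_l4_hash
                                            : c->udp6_l4_hash));
        uint32_t h = mix(mix(1, f->saddr), f->daddr);

        if (l4)                         /* include ports only when allowed */
                h = mix(h, ((uint32_t)f->sport << 16) | f->dport);
        return h;
}

int main(void)
{
        struct flow f = { true, false, true, 1, 2, 1000, 53 };
        struct dev_ctx c = { .udp4_l4_hash = false };

        printf("%08x\n", pick_hash(&f, &c));    /* addresses only */
        c.udp4_l4_hash = true;
        printf("%08x\n", pick_hash(&f, &c));    /* addresses + ports */
        return 0;
}
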
@@@ -238,7 -200,7 +238,7 @@@ static inline int netvsc_get_tx_queue(s
        struct sock *sk = skb->sk;
        int q_idx;
  
 -      q_idx = ndc->tx_send_table[netvsc_get_hash(skb, sk) &
 +      q_idx = ndc->tx_send_table[netvsc_get_hash(skb, ndc) &
                                   (VRSS_SEND_TAB_SIZE - 1)];
  
        /* If queue index changed record the new value */
   *
   * TODO support XPS - but get_xps_queue not exported
   */
 -static u16 netvsc_select_queue(struct net_device *ndev, struct sk_buff *skb,
 -                      void *accel_priv, select_queue_fallback_t fallback)
 +static u16 netvsc_pick_tx(struct net_device *ndev, struct sk_buff *skb)
  {
 -      unsigned int num_tx_queues = ndev->real_num_tx_queues;
        int q_idx = sk_tx_queue_get(skb->sk);
  
 -      if (q_idx < 0 || skb->ooo_okay) {
 +      if (q_idx < 0 || skb->ooo_okay || q_idx >= ndev->real_num_tx_queues) {
                /* If forwarding a packet, we use the recorded queue when
                 * available for better cache locality.
                 */
                        q_idx = netvsc_get_tx_queue(ndev, skb, q_idx);
        }
  
 -      while (unlikely(q_idx >= num_tx_queues))
 -              q_idx -= num_tx_queues;
 -
        return q_idx;
  }
  
 +static u16 netvsc_select_queue(struct net_device *ndev, struct sk_buff *skb,
 +                             void *accel_priv,
 +                             select_queue_fallback_t fallback)
 +{
 +      struct net_device_context *ndc = netdev_priv(ndev);
 +      struct net_device *vf_netdev;
 +      u16 txq;
 +
 +      rcu_read_lock();
 +      vf_netdev = rcu_dereference(ndc->vf_netdev);
 +      if (vf_netdev) {
 +              txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) : 0;
 +              qdisc_skb_cb(skb)->slave_dev_queue_mapping = skb->queue_mapping;
 +      } else {
 +              txq = netvsc_pick_tx(ndev, skb);
 +      }
 +      rcu_read_unlock();
 +
 +      while (unlikely(txq >= ndev->real_num_tx_queues))
 +              txq -= ndev->real_num_tx_queues;
 +
 +      return txq;
 +}
 +
  static u32 fill_pg_buf(struct page *page, u32 offset, u32 len,
 -                      struct hv_page_buffer *pb)
 +                     struct hv_page_buffer *pb)
  {
        int j = 0;
  
  
  static u32 init_page_array(void *hdr, u32 len, struct sk_buff *skb,
                           struct hv_netvsc_packet *packet,
 -                         struct hv_page_buffer **page_buf)
 +                         struct hv_page_buffer *pb)
  {
 -      struct hv_page_buffer *pb = *page_buf;
        u32 slots_used = 0;
        char *data = skb->data;
        int frags = skb_shinfo(skb)->nr_frags;
         * 2. skb linear data
         * 3. skb fragment data
         */
 -      if (hdr != NULL)
 -              slots_used += fill_pg_buf(virt_to_page(hdr),
 -                                      offset_in_page(hdr),
 -                                      len, &pb[slots_used]);
 +      slots_used += fill_pg_buf(virt_to_page(hdr),
 +                                offset_in_page(hdr),
 +                                len, &pb[slots_used]);
  
        packet->rmsg_size = len;
        packet->rmsg_pgcnt = slots_used;
@@@ -414,40 -359,13 +414,40 @@@ static u32 net_checksum_info(struct sk_
  
                if (ip6->nexthdr == IPPROTO_TCP)
                        return TRANSPORT_INFO_IPV6_TCP;
 -              else if (ipv6_hdr(skb)->nexthdr == IPPROTO_UDP)
 +              else if (ip6->nexthdr == IPPROTO_UDP)
                        return TRANSPORT_INFO_IPV6_UDP;
        }
  
        return TRANSPORT_INFO_NOT_IP;
  }
  
 +/* Send skb on the slave VF device. */
 +static int netvsc_vf_xmit(struct net_device *net, struct net_device *vf_netdev,
 +                        struct sk_buff *skb)
 +{
 +      struct net_device_context *ndev_ctx = netdev_priv(net);
 +      unsigned int len = skb->len;
 +      int rc;
 +
 +      skb->dev = vf_netdev;
 +      skb->queue_mapping = qdisc_skb_cb(skb)->slave_dev_queue_mapping;
 +
 +      rc = dev_queue_xmit(skb);
 +      if (likely(rc == NET_XMIT_SUCCESS || rc == NET_XMIT_CN)) {
 +              struct netvsc_vf_pcpu_stats *pcpu_stats
 +                      = this_cpu_ptr(ndev_ctx->vf_stats);
 +
 +              u64_stats_update_begin(&pcpu_stats->syncp);
 +              pcpu_stats->tx_packets++;
 +              pcpu_stats->tx_bytes += len;
 +              u64_stats_update_end(&pcpu_stats->syncp);
 +      } else {
 +              this_cpu_inc(ndev_ctx->vf_stats->tx_dropped);
 +      }
 +
 +      return rc;
 +}
 +
  static int netvsc_start_xmit(struct sk_buff *skb, struct net_device *net)
  {
        struct net_device_context *net_device_ctx = netdev_priv(net);
        unsigned int num_data_pgs;
        struct rndis_message *rndis_msg;
        struct rndis_packet *rndis_pkt;
 +      struct net_device *vf_netdev;
        u32 rndis_msg_size;
        struct rndis_per_packet_info *ppi;
        u32 hash;
 -      struct hv_page_buffer page_buf[MAX_PAGE_BUFFER_COUNT];
 -      struct hv_page_buffer *pb = page_buf;
 +      struct hv_page_buffer pb[MAX_PAGE_BUFFER_COUNT];
 +
 +      /* if VF is present and up then redirect packets
 +       * already called with rcu_read_lock_bh
 +       */
 +      vf_netdev = rcu_dereference_bh(net_device_ctx->vf_netdev);
 +      if (vf_netdev && netif_running(vf_netdev) &&
 +          !netpoll_tx_running(net))
 +              return netvsc_vf_xmit(net, vf_netdev, skb);
  
        /* We will at most need two pages to describe the rndis
         * header. We can only transmit MAX_PAGE_BUFFER_COUNT number
  
                rndis_msg_size += NDIS_VLAN_PPI_SIZE;
                ppi = init_ppi_data(rndis_msg, NDIS_VLAN_PPI_SIZE,
 -                                      IEEE_8021Q_INFO);
 -              vlan = (struct ndis_pkt_8021q_info *)((void *)ppi +
 -                                              ppi->ppi_offset);
 +                                  IEEE_8021Q_INFO);
 +
 +              vlan = (void *)ppi + ppi->ppi_offset;
                vlan->vlanid = skb->vlan_tci & VLAN_VID_MASK;
                vlan->pri = (skb->vlan_tci & VLAN_PRIO_MASK) >>
                                VLAN_PRIO_SHIFT;
                ppi = init_ppi_data(rndis_msg, NDIS_LSO_PPI_SIZE,
                                    TCP_LARGESEND_PKTINFO);
  
 -              lso_info = (struct ndis_tcp_lso_info *)((void *)ppi +
 -                                                      ppi->ppi_offset);
 +              lso_info = (void *)ppi + ppi->ppi_offset;
  
                lso_info->lso_v2_transmit.type = NDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE;
                if (skb->protocol == htons(ETH_P_IP)) {
        rndis_msg->msg_len += rndis_msg_size;
        packet->total_data_buflen = rndis_msg->msg_len;
        packet->page_buf_cnt = init_page_array(rndis_msg, rndis_msg_size,
 -                                             skb, packet, &pb);
 +                                             skb, packet, pb);
  
        /* timestamp packet in software */
        skb_tx_timestamp(skb);
 -      ret = netvsc_send(net_device_ctx->device_ctx, packet,
 -                        rndis_msg, &pb, skb);
 +
 +      ret = netvsc_send(net_device_ctx, packet, rndis_msg, pb, skb);
        if (likely(ret == 0))
                return NETDEV_TX_OK;
  
@@@ -640,7 -551,6 +640,7 @@@ no_memory
        ++net_device_ctx->eth_stats.tx_no_memory;
        goto drop;
  }
 +
  /*
   * netvsc_linkstatus_callback - Link up/down notification
   */
@@@ -664,8 -574,8 +664,8 @@@ void netvsc_linkstatus_callback(struct 
        if (indicate->status == RNDIS_STATUS_LINK_SPEED_CHANGE) {
                u32 speed;
  
 -              speed = *(u32 *)((void *)indicate + indicate->
 -                               status_buf_offset) / 10000;
 +              speed = *(u32 *)((void *)indicate
 +                               + indicate->status_buf_offset) / 10000;
                ndev_ctx->speed = speed;
                return;
        }
@@@ -748,18 -658,29 +748,18 @@@ int netvsc_recv_callback(struct net_dev
        struct netvsc_device *net_device;
        u16 q_idx = channel->offermsg.offer.sub_channel_index;
        struct netvsc_channel *nvchan;
 -      struct net_device *vf_netdev;
        struct sk_buff *skb;
        struct netvsc_stats *rx_stats;
  
        if (net->reg_state != NETREG_REGISTERED)
                return NVSP_STAT_FAIL;
  
 -      /*
 -       * If necessary, inject this packet into the VF interface.
 -       * On Hyper-V, multicast and brodcast packets are only delivered
 -       * to the synthetic interface (after subjecting these to
 -       * policy filters on the host). Deliver these via the VF
 -       * interface in the guest.
 -       */
        rcu_read_lock();
        net_device = rcu_dereference(net_device_ctx->nvdev);
        if (unlikely(!net_device))
                goto drop;
  
        nvchan = &net_device->chan_table[q_idx];
 -      vf_netdev = rcu_dereference(net_device_ctx->vf_netdev);
 -      if (vf_netdev && (vf_netdev->flags & IFF_UP))
 -              net = vf_netdev;
  
        /* Allocate a skb - TODO direct I/O to pages? */
        skb = netvsc_alloc_recv_skb(net, &nvchan->napi,
@@@ -771,7 -692,8 +771,7 @@@ drop
                return NVSP_STAT_FAIL;
        }
  
 -      if (net != vf_netdev)
 -              skb_record_rx_queue(skb, q_idx);
 +      skb_record_rx_queue(skb, q_idx);
  
        /*
         * Even if injecting the packet, record the statistics
@@@ -814,16 -736,39 +814,16 @@@ static void netvsc_get_channels(struct 
        }
  }
  
 -static int netvsc_set_queues(struct net_device *net, struct hv_device *dev,
 -                           u32 num_chn)
 -{
 -      struct netvsc_device_info device_info;
 -      int ret;
 -
 -      memset(&device_info, 0, sizeof(device_info));
 -      device_info.num_chn = num_chn;
 -      device_info.ring_size = ring_size;
 -      device_info.max_num_vrss_chns = num_chn;
 -
 -      ret = rndis_filter_device_add(dev, &device_info);
 -      if (ret)
 -              return ret;
 -
 -      ret = netif_set_real_num_tx_queues(net, num_chn);
 -      if (ret)
 -              return ret;
 -
 -      ret = netif_set_real_num_rx_queues(net, num_chn);
 -
 -      return ret;
 -}
 -
  static int netvsc_set_channels(struct net_device *net,
                               struct ethtool_channels *channels)
  {
        struct net_device_context *net_device_ctx = netdev_priv(net);
        struct hv_device *dev = net_device_ctx->device_ctx;
        struct netvsc_device *nvdev = rtnl_dereference(net_device_ctx->nvdev);
 -      unsigned int count = channels->combined_count;
 -      bool was_running;
 -      int ret;
 +      unsigned int orig, count = channels->combined_count;
 +      struct netvsc_device_info device_info;
 +      bool was_opened;
 +      int ret = 0;
  
        /* We do not support separate count for rx, tx, or other */
        if (count == 0 ||
        if (count > nvdev->max_chn)
                return -EINVAL;
  
 -      was_running = netif_running(net);
 -      if (was_running) {
 -              ret = netvsc_close(net);
 -              if (ret)
 -                      return ret;
 -      }
 +      orig = nvdev->num_chn;
 +      was_opened = rndis_filter_opened(nvdev);
 +      if (was_opened)
 +              rndis_filter_close(nvdev);
 +
 +      memset(&device_info, 0, sizeof(device_info));
 +      device_info.num_chn = count;
 +      device_info.ring_size = ring_size;
 +      device_info.send_sections = nvdev->send_section_cnt;
 +      device_info.recv_sections = nvdev->recv_section_cnt;
  
        rndis_filter_device_remove(dev, nvdev);
  
 -      ret = netvsc_set_queues(net, dev, count);
 -      if (ret == 0)
 -              nvdev->num_chn = count;
 -      else
 -              netvsc_set_queues(net, dev, nvdev->num_chn);
 +      nvdev = rndis_filter_device_add(dev, &device_info);
 +      if (!IS_ERR(nvdev)) {
 +              netif_set_real_num_tx_queues(net, nvdev->num_chn);
 +              netif_set_real_num_rx_queues(net, nvdev->num_chn);
 +      } else {
 +              ret = PTR_ERR(nvdev);
 +              device_info.num_chn = orig;
 +              nvdev = rndis_filter_device_add(dev, &device_info);
  
 -      if (was_running)
 -              ret = netvsc_open(net);
 +              if (IS_ERR(nvdev)) {
 +                      netdev_err(net, "restoring channel setting failed: %ld\n",
 +                                 PTR_ERR(nvdev));
 +                      return ret;
 +              }
 +      }
 +
 +      if (was_opened)
 +              rndis_filter_open(nvdev);
  
        /* We may have missed link change notifications */
 +      net_device_ctx->last_reconfig = 0;
        schedule_delayed_work(&net_device_ctx->dwork, 0);
  
        return ret;
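
netvsc_set_channels() (and the MTU and ringparam paths further down) now follow a remove / re-add pattern: tear the RNDIS device down, try to add it back with the new parameters, and on failure re-add it with the original ones so the interface is never left without a device. A minimal sketch of that try-then-rollback shape with a hypothetical apply_config():

#include <stdio.h>

struct config { int channels; };

/* Pretend only configurations with at most 4 channels can be applied. */
static int apply_config(const struct config *c)
{
        return c->channels <= 4 ? 0 : -1;
}

static int change_channels(struct config *cur, int want)
{
        struct config next = { .channels = want };
        int err;

        err = apply_config(&next);
        if (!err) {
                *cur = next;            /* commit the new configuration */
                return 0;
        }

        /* Roll back: reapply the original configuration so the device
         * keeps working with its previous settings.
         */
        if (apply_config(cur))
                fprintf(stderr, "restoring channel setting failed\n");
        return err;
}

int main(void)
{
        struct config cfg = { .channels = 2 };

        printf("set 8 -> %d (still %d channels)\n",
               change_channels(&cfg, 8), cfg.channels);
        return 0;
}
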
@@@ -902,9 -832,6 +902,9 @@@ static void netvsc_init_settings(struc
  {
        struct net_device_context *ndc = netdev_priv(dev);
  
 +      ndc->udp4_l4_hash = true;
 +      ndc->udp6_l4_hash = true;
 +
        ndc->speed = SPEED_UNKNOWN;
        ndc->duplex = DUPLEX_FULL;
  }
@@@ -942,61 -869,41 +942,61 @@@ static int netvsc_set_link_ksettings(st
  static int netvsc_change_mtu(struct net_device *ndev, int mtu)
  {
        struct net_device_context *ndevctx = netdev_priv(ndev);
 +      struct net_device *vf_netdev = rtnl_dereference(ndevctx->vf_netdev);
        struct netvsc_device *nvdev = rtnl_dereference(ndevctx->nvdev);
        struct hv_device *hdev = ndevctx->device_ctx;
 +      int orig_mtu = ndev->mtu;
        struct netvsc_device_info device_info;
 -      bool was_running;
 +      bool was_opened;
        int ret = 0;
  
        if (!nvdev || nvdev->destroy)
                return -ENODEV;
  
 -      was_running = netif_running(ndev);
 -      if (was_running) {
 -              ret = netvsc_close(ndev);
 +      /* Change MTU of underlying VF netdev first. */
 +      if (vf_netdev) {
 +              ret = dev_set_mtu(vf_netdev, mtu);
                if (ret)
                        return ret;
        }
  
 +      netif_device_detach(ndev);
 +      was_opened = rndis_filter_opened(nvdev);
 +      if (was_opened)
 +              rndis_filter_close(nvdev);
 +
        memset(&device_info, 0, sizeof(device_info));
        device_info.ring_size = ring_size;
        device_info.num_chn = nvdev->num_chn;
 -      device_info.max_num_vrss_chns = nvdev->num_chn;
 +      device_info.send_sections = nvdev->send_section_cnt;
 +      device_info.recv_sections = nvdev->recv_section_cnt;
  
        rndis_filter_device_remove(hdev, nvdev);
  
 -      /* 'nvdev' has been freed in rndis_filter_device_remove() ->
 -       * netvsc_device_remove () -> free_netvsc_device().
 -       * We mustn't access it before it's re-created in
 -       * rndis_filter_device_add() -> netvsc_device_add().
 -       */
 -
        ndev->mtu = mtu;
  
 -      rndis_filter_device_add(hdev, &device_info);
 +      nvdev = rndis_filter_device_add(hdev, &device_info);
 +      if (IS_ERR(nvdev)) {
 +              ret = PTR_ERR(nvdev);
 +
 +              /* Attempt rollback to original MTU */
 +              ndev->mtu = orig_mtu;
 +              nvdev = rndis_filter_device_add(hdev, &device_info);
 +
 +              if (vf_netdev)
 +                      dev_set_mtu(vf_netdev, orig_mtu);
 +
 +              if (IS_ERR(nvdev)) {
 +                      netdev_err(ndev, "restoring mtu failed: %ld\n",
 +                                 PTR_ERR(nvdev));
 +                      return ret;
 +              }
 +      }
 +
 +      if (was_opened)
 +              rndis_filter_open(nvdev);
  
 -      if (was_running)
 -              ret = netvsc_open(ndev);
 +      netif_device_attach(ndev);
  
        /* We may have missed link change notifications */
        schedule_delayed_work(&ndevctx->dwork, 0);
        return ret;
  }
  
 +static void netvsc_get_vf_stats(struct net_device *net,
 +                              struct netvsc_vf_pcpu_stats *tot)
 +{
 +      struct net_device_context *ndev_ctx = netdev_priv(net);
 +      int i;
 +
 +      memset(tot, 0, sizeof(*tot));
 +
 +      for_each_possible_cpu(i) {
 +              const struct netvsc_vf_pcpu_stats *stats
 +                      = per_cpu_ptr(ndev_ctx->vf_stats, i);
 +              u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
 +              unsigned int start;
 +
 +              do {
 +                      start = u64_stats_fetch_begin_irq(&stats->syncp);
 +                      rx_packets = stats->rx_packets;
 +                      tx_packets = stats->tx_packets;
 +                      rx_bytes = stats->rx_bytes;
 +                      tx_bytes = stats->tx_bytes;
 +              } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
 +
 +              tot->rx_packets += rx_packets;
 +              tot->tx_packets += tx_packets;
 +              tot->rx_bytes   += rx_bytes;
 +              tot->tx_bytes   += tx_bytes;
 +              tot->tx_dropped += stats->tx_dropped;
 +      }
 +}
 +
  static void netvsc_get_stats64(struct net_device *net,
                               struct rtnl_link_stats64 *t)
  {
        struct net_device_context *ndev_ctx = netdev_priv(net);
        struct netvsc_device *nvdev = rcu_dereference_rtnl(ndev_ctx->nvdev);
 +      struct netvsc_vf_pcpu_stats vf_tot;
        int i;
  
        if (!nvdev)
                return;
  
 +      netdev_stats_to_stats64(t, &net->stats);
 +
 +      netvsc_get_vf_stats(net, &vf_tot);
 +      t->rx_packets += vf_tot.rx_packets;
 +      t->tx_packets += vf_tot.tx_packets;
 +      t->rx_bytes   += vf_tot.rx_bytes;
 +      t->tx_bytes   += vf_tot.tx_bytes;
 +      t->tx_dropped += vf_tot.tx_dropped;
 +
        for (i = 0; i < nvdev->num_chn; i++) {
                const struct netvsc_channel *nvchan = &nvdev->chan_table[i];
                const struct netvsc_stats *stats;
                t->rx_packets   += packets;
                t->multicast    += multicast;
        }
 -
 -      t->tx_dropped   = net->stats.tx_dropped;
 -      t->tx_errors    = net->stats.tx_errors;
 -
 -      t->rx_dropped   = net->stats.rx_dropped;
 -      t->rx_errors    = net->stats.rx_errors;
  }
  
  static int netvsc_set_mac_addr(struct net_device *ndev, void *p)
  {
 +      struct net_device_context *ndc = netdev_priv(ndev);
 +      struct net_device *vf_netdev = rtnl_dereference(ndc->vf_netdev);
 +      struct netvsc_device *nvdev = rtnl_dereference(ndc->nvdev);
        struct sockaddr *addr = p;
 -      char save_adr[ETH_ALEN];
 -      unsigned char save_aatype;
        int err;
  
 -      memcpy(save_adr, ndev->dev_addr, ETH_ALEN);
 -      save_aatype = ndev->addr_assign_type;
 -
 -      err = eth_mac_addr(ndev, p);
 -      if (err != 0)
 +      err = eth_prepare_mac_addr_change(ndev, p);
 +      if (err)
                return err;
  
 -      err = rndis_filter_set_device_mac(ndev, addr->sa_data);
 -      if (err != 0) {
 -              /* roll back to saved MAC */
 -              memcpy(ndev->dev_addr, save_adr, ETH_ALEN);
 -              ndev->addr_assign_type = save_aatype;
 +      if (!nvdev)
 +              return -ENODEV;
 +
 +      if (vf_netdev) {
 +              err = dev_set_mac_address(vf_netdev, addr);
 +              if (err)
 +                      return err;
 +      }
 +
 +      err = rndis_filter_set_device_mac(nvdev, addr->sa_data);
 +      if (!err) {
 +              eth_commit_mac_addr_change(ndev, p);
 +      } else if (vf_netdev) {
 +              /* rollback change on VF */
 +              memcpy(addr->sa_data, ndev->dev_addr, ETH_ALEN);
 +              dev_set_mac_address(vf_netdev, addr);
        }
  
        return err;
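
netvsc_get_vf_stats() above sums the per-CPU VF counters into one structure; the u64_stats_fetch_begin_irq()/retry() pair protects against torn 64-bit reads on 32-bit hosts while a CPU is updating its counters. A simplified user-space sketch of the aggregation step (without the seqcount retry, which needs the kernel helpers):

#include <stdint.h>
#include <stdio.h>

#define NCPUS 4

struct vf_stats { uint64_t rx_packets, rx_bytes, tx_packets, tx_bytes; };

static void sum_stats(const struct vf_stats per_cpu[NCPUS],
                      struct vf_stats *tot)
{
        *tot = (struct vf_stats){ 0 };

        for (int i = 0; i < NCPUS; i++) {
                /* In the kernel this block is wrapped in
                 * u64_stats_fetch_begin_irq()/_retry() so a concurrent
                 * writer is never observed halfway through an update.
                 */
                tot->rx_packets += per_cpu[i].rx_packets;
                tot->rx_bytes   += per_cpu[i].rx_bytes;
                tot->tx_packets += per_cpu[i].tx_packets;
                tot->tx_bytes   += per_cpu[i].tx_bytes;
        }
}

int main(void)
{
        struct vf_stats cpus[NCPUS] = { { 1, 100, 2, 200 }, { 3, 300, 4, 400 } };
        struct vf_stats tot;

        sum_stats(cpus, &tot);
        printf("rx %llu/%llu tx %llu/%llu\n",
               (unsigned long long)tot.rx_packets,
               (unsigned long long)tot.rx_bytes,
               (unsigned long long)tot.tx_packets,
               (unsigned long long)tot.tx_bytes);
        return 0;
}
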
@@@ -1126,18 -990,9 +1126,18 @@@ static const struct 
        { "tx_no_space",  offsetof(struct netvsc_ethtool_stats, tx_no_space) },
        { "tx_too_big",   offsetof(struct netvsc_ethtool_stats, tx_too_big) },
        { "tx_busy",      offsetof(struct netvsc_ethtool_stats, tx_busy) },
 +      { "tx_send_full", offsetof(struct netvsc_ethtool_stats, tx_send_full) },
 +      { "rx_comp_busy", offsetof(struct netvsc_ethtool_stats, rx_comp_busy) },
 +}, vf_stats[] = {
 +      { "vf_rx_packets", offsetof(struct netvsc_vf_pcpu_stats, rx_packets) },
 +      { "vf_rx_bytes",   offsetof(struct netvsc_vf_pcpu_stats, rx_bytes) },
 +      { "vf_tx_packets", offsetof(struct netvsc_vf_pcpu_stats, tx_packets) },
 +      { "vf_tx_bytes",   offsetof(struct netvsc_vf_pcpu_stats, tx_bytes) },
 +      { "vf_tx_dropped", offsetof(struct netvsc_vf_pcpu_stats, tx_dropped) },
  };
  
  #define NETVSC_GLOBAL_STATS_LEN       ARRAY_SIZE(netvsc_stats)
 +#define NETVSC_VF_STATS_LEN   ARRAY_SIZE(vf_stats)
  
  /* 4 statistics per queue (rx/tx packets/bytes) */
  #define NETVSC_QUEUE_STATS_LEN(dev) ((dev)->num_chn * 4)
@@@ -1152,9 -1007,7 +1152,9 @@@ static int netvsc_get_sset_count(struc
  
        switch (string_set) {
        case ETH_SS_STATS:
 -              return NETVSC_GLOBAL_STATS_LEN + NETVSC_QUEUE_STATS_LEN(nvdev);
 +              return NETVSC_GLOBAL_STATS_LEN
 +                      + NETVSC_VF_STATS_LEN
 +                      + NETVSC_QUEUE_STATS_LEN(nvdev);
        default:
                return -EINVAL;
        }
@@@ -1164,10 -1017,9 +1164,10 @@@ static void netvsc_get_ethtool_stats(st
                                     struct ethtool_stats *stats, u64 *data)
  {
        struct net_device_context *ndc = netdev_priv(dev);
 -      struct netvsc_device *nvdev = rcu_dereference(ndc->nvdev);
 +      struct netvsc_device *nvdev = rtnl_dereference(ndc->nvdev);
        const void *nds = &ndc->eth_stats;
        const struct netvsc_stats *qstats;
 +      struct netvsc_vf_pcpu_stats sum;
        unsigned int start;
        u64 packets, bytes;
        int i, j;
        for (i = 0; i < NETVSC_GLOBAL_STATS_LEN; i++)
                data[i] = *(unsigned long *)(nds + netvsc_stats[i].offset);
  
 +      netvsc_get_vf_stats(dev, &sum);
 +      for (j = 0; j < NETVSC_VF_STATS_LEN; j++)
 +              data[i++] = *(u64 *)((void *)&sum + vf_stats[j].offset);
 +
        for (j = 0; j < nvdev->num_chn; j++) {
                qstats = &nvdev->chan_table[j].tx_stats;
  
  static void netvsc_get_strings(struct net_device *dev, u32 stringset, u8 *data)
  {
        struct net_device_context *ndc = netdev_priv(dev);
 -      struct netvsc_device *nvdev = rcu_dereference(ndc->nvdev);
 +      struct netvsc_device *nvdev = rtnl_dereference(ndc->nvdev);
        u8 *p = data;
        int i;
  
  
        switch (stringset) {
        case ETH_SS_STATS:
 -              for (i = 0; i < ARRAY_SIZE(netvsc_stats); i++)
 -                      memcpy(p + i * ETH_GSTRING_LEN,
 -                             netvsc_stats[i].name, ETH_GSTRING_LEN);
 +              for (i = 0; i < ARRAY_SIZE(netvsc_stats); i++) {
 +                      memcpy(p, netvsc_stats[i].name, ETH_GSTRING_LEN);
 +                      p += ETH_GSTRING_LEN;
 +              }
 +
 +              for (i = 0; i < ARRAY_SIZE(vf_stats); i++) {
 +                      memcpy(p, vf_stats[i].name, ETH_GSTRING_LEN);
 +                      p += ETH_GSTRING_LEN;
 +              }
  
 -              p += i * ETH_GSTRING_LEN;
                for (i = 0; i < nvdev->num_chn; i++) {
                        sprintf(p, "tx_queue_%u_packets", i);
                        p += ETH_GSTRING_LEN;
  }
  
  static int
 -netvsc_get_rss_hash_opts(struct netvsc_device *nvdev,
 +netvsc_get_rss_hash_opts(struct net_device_context *ndc,
                         struct ethtool_rxnfc *info)
  {
        info->data = RXH_IP_SRC | RXH_IP_DST;
        case TCP_V4_FLOW:
        case TCP_V6_FLOW:
                info->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3;
 -              /* fallthrough */
 +              break;
 +
        case UDP_V4_FLOW:
 +              if (ndc->udp4_l4_hash)
 +                      info->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3;
 +
 +              break;
 +
        case UDP_V6_FLOW:
 +              if (ndc->udp6_l4_hash)
 +                      info->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3;
 +
 +              break;
 +
        case IPV4_FLOW:
        case IPV6_FLOW:
                break;
@@@ -1281,7 -1113,7 +1281,7 @@@ netvsc_get_rxnfc(struct net_device *dev
                 u32 *rules)
  {
        struct net_device_context *ndc = netdev_priv(dev);
 -      struct netvsc_device *nvdev = rcu_dereference(ndc->nvdev);
 +      struct netvsc_device *nvdev = rtnl_dereference(ndc->nvdev);
  
        if (!nvdev)
                return -ENODEV;
                return 0;
  
        case ETHTOOL_GRXFH:
 -              return netvsc_get_rss_hash_opts(nvdev, info);
 +              return netvsc_get_rss_hash_opts(ndc, info);
 +      }
 +      return -EOPNOTSUPP;
 +}
 +
 +static int netvsc_set_rss_hash_opts(struct net_device_context *ndc,
 +                                  struct ethtool_rxnfc *info)
 +{
 +      if (info->data == (RXH_IP_SRC | RXH_IP_DST |
 +                         RXH_L4_B_0_1 | RXH_L4_B_2_3)) {
 +              if (info->flow_type == UDP_V4_FLOW)
 +                      ndc->udp4_l4_hash = true;
 +              else if (info->flow_type == UDP_V6_FLOW)
 +                      ndc->udp6_l4_hash = true;
 +              else
 +                      return -EOPNOTSUPP;
 +
 +              return 0;
 +      }
 +
 +      if (info->data == (RXH_IP_SRC | RXH_IP_DST)) {
 +              if (info->flow_type == UDP_V4_FLOW)
 +                      ndc->udp4_l4_hash = false;
 +              else if (info->flow_type == UDP_V6_FLOW)
 +                      ndc->udp6_l4_hash = false;
 +              else
 +                      return -EOPNOTSUPP;
 +
 +              return 0;
        }
 +
 +      return -EOPNOTSUPP;
 +}
 +
 +static int
 +netvsc_set_rxnfc(struct net_device *ndev, struct ethtool_rxnfc *info)
 +{
 +      struct net_device_context *ndc = netdev_priv(ndev);
 +
 +      if (info->cmd == ETHTOOL_SRXFH)
 +              return netvsc_set_rss_hash_opts(ndc, info);
 +
        return -EOPNOTSUPP;
  }
  
@@@ -1371,7 -1163,7 +1371,7 @@@ static int netvsc_get_rxfh(struct net_d
                           u8 *hfunc)
  {
        struct net_device_context *ndc = netdev_priv(dev);
 -      struct netvsc_device *ndev = rcu_dereference(ndc->nvdev);
 +      struct netvsc_device *ndev = rtnl_dereference(ndc->nvdev);
        struct rndis_device *rndis_dev;
        int i;
  
@@@ -1427,104 -1219,6 +1427,104 @@@ static int netvsc_set_rxfh(struct net_d
        return rndis_filter_set_rss_param(rndis_dev, key, ndev->num_chn);
  }
  
 +/* Hyper-V RNDIS protocol does not have ring in the HW sense.
 + * It does have pre-allocated receive area which is divided into sections.
 + */
 +static void __netvsc_get_ringparam(struct netvsc_device *nvdev,
 +                                 struct ethtool_ringparam *ring)
 +{
 +      u32 max_buf_size;
 +
 +      ring->rx_pending = nvdev->recv_section_cnt;
 +      ring->tx_pending = nvdev->send_section_cnt;
 +
 +      if (nvdev->nvsp_version <= NVSP_PROTOCOL_VERSION_2)
 +              max_buf_size = NETVSC_RECEIVE_BUFFER_SIZE_LEGACY;
 +      else
 +              max_buf_size = NETVSC_RECEIVE_BUFFER_SIZE;
 +
 +      ring->rx_max_pending = max_buf_size / nvdev->recv_section_size;
 +      ring->tx_max_pending = NETVSC_SEND_BUFFER_SIZE
 +              / nvdev->send_section_size;
 +}
 +
 +static void netvsc_get_ringparam(struct net_device *ndev,
 +                               struct ethtool_ringparam *ring)
 +{
 +      struct net_device_context *ndevctx = netdev_priv(ndev);
 +      struct netvsc_device *nvdev = rtnl_dereference(ndevctx->nvdev);
 +
 +      if (!nvdev)
 +              return;
 +
 +      __netvsc_get_ringparam(nvdev, ring);
 +}
 +
 +static int netvsc_set_ringparam(struct net_device *ndev,
 +                              struct ethtool_ringparam *ring)
 +{
 +      struct net_device_context *ndevctx = netdev_priv(ndev);
 +      struct netvsc_device *nvdev = rtnl_dereference(ndevctx->nvdev);
 +      struct hv_device *hdev = ndevctx->device_ctx;
 +      struct netvsc_device_info device_info;
 +      struct ethtool_ringparam orig;
 +      u32 new_tx, new_rx;
 +      bool was_opened;
 +      int ret = 0;
 +
 +      if (!nvdev || nvdev->destroy)
 +              return -ENODEV;
 +
 +      memset(&orig, 0, sizeof(orig));
 +      __netvsc_get_ringparam(nvdev, &orig);
 +
 +      new_tx = clamp_t(u32, ring->tx_pending,
 +                       NETVSC_MIN_TX_SECTIONS, orig.tx_max_pending);
 +      new_rx = clamp_t(u32, ring->rx_pending,
 +                       NETVSC_MIN_RX_SECTIONS, orig.rx_max_pending);
 +
 +      if (new_tx == orig.tx_pending &&
 +          new_rx == orig.rx_pending)
 +              return 0;        /* no change */
 +
 +      memset(&device_info, 0, sizeof(device_info));
 +      device_info.num_chn = nvdev->num_chn;
 +      device_info.ring_size = ring_size;
 +      device_info.send_sections = new_tx;
 +      device_info.recv_sections = new_rx;
 +
 +      netif_device_detach(ndev);
 +      was_opened = rndis_filter_opened(nvdev);
 +      if (was_opened)
 +              rndis_filter_close(nvdev);
 +
 +      rndis_filter_device_remove(hdev, nvdev);
 +
 +      nvdev = rndis_filter_device_add(hdev, &device_info);
 +      if (IS_ERR(nvdev)) {
 +              ret = PTR_ERR(nvdev);
 +
 +              device_info.send_sections = orig.tx_pending;
 +              device_info.recv_sections = orig.rx_pending;
 +              nvdev = rndis_filter_device_add(hdev, &device_info);
 +              if (IS_ERR(nvdev)) {
 +                      netdev_err(ndev, "restoring ringparam failed: %ld\n",
 +                                 PTR_ERR(nvdev));
 +                      return ret;
 +              }
 +      }
 +
 +      if (was_opened)
 +              rndis_filter_open(nvdev);
 +      netif_device_attach(ndev);
 +
 +      /* We may have missed link change notifications */
 +      ndevctx->last_reconfig = 0;
 +      schedule_delayed_work(&ndevctx->dwork, 0);
 +
 +      return ret;
 +}
 +
  static const struct ethtool_ops ethtool_ops = {
        .get_drvinfo    = netvsc_get_drvinfo,
        .get_link       = ethtool_op_get_link,
        .set_channels   = netvsc_set_channels,
        .get_ts_info    = ethtool_op_get_ts_info,
        .get_rxnfc      = netvsc_get_rxnfc,
 +      .set_rxnfc      = netvsc_set_rxnfc,
        .get_rxfh_key_size = netvsc_get_rxfh_key_size,
        .get_rxfh_indir_size = netvsc_rss_indir_size,
        .get_rxfh       = netvsc_get_rxfh,
        .set_rxfh       = netvsc_set_rxfh,
        .get_link_ksettings = netvsc_get_link_ksettings,
        .set_link_ksettings = netvsc_set_link_ksettings,
 +      .get_ringparam  = netvsc_get_ringparam,
 +      .set_ringparam  = netvsc_set_ringparam,
  };
  
  static const struct net_device_ops device_ops = {
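
The new netvsc_set_ringparam() clamps the requested section counts to the limits reported by __netvsc_get_ringparam() and returns early when nothing changes, before doing the same remove/re-add dance as the channel path. A small sketch of the clamp-and-short-circuit step, with made-up limits:

#include <stdint.h>
#include <stdio.h>

static uint32_t clamp_u32(uint32_t v, uint32_t lo, uint32_t hi)
{
        return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
        const uint32_t min_tx = 10, max_tx = 2560;      /* hypothetical limits */
        uint32_t cur_tx = 192;
        uint32_t req = 1000000;                         /* user asks for too much */

        uint32_t new_tx = clamp_u32(req, min_tx, max_tx);

        if (new_tx == cur_tx) {
                puts("no change, nothing to do");
                return 0;
        }
        printf("reconfigure tx sections: %u -> %u\n", cur_tx, new_tx);
        return 0;
}
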
@@@ -1578,7 -1269,12 +1578,12 @@@ static void netvsc_link_change(struct w
        bool notify = false, reschedule = false;
        unsigned long flags, next_reconfig, delay;
  
-       rtnl_lock();
+       /* if changes are happening, comeback later */
+       if (!rtnl_trylock()) {
+               schedule_delayed_work(&ndev_ctx->dwork, LINKCHANGE_INT);
+               return;
+       }
        net_device = rtnl_dereference(ndev_ctx->nvdev);
        if (!net_device)
                goto out_unlock;
        case RNDIS_STATUS_MEDIA_CONNECT:
                if (rdev->link_state) {
                        rdev->link_state = false;
 -                      if (!ndev_ctx->datapath)
 -                              netif_carrier_on(net);
 +                      netif_carrier_on(net);
                        netif_tx_wake_all_queues(net);
                } else {
                        notify = true;
@@@ -1694,7 -1391,7 +1699,7 @@@ static struct net_device *get_netvsc_by
                        continue;       /* not a netvsc device */
  
                net_device_ctx = netdev_priv(dev);
 -              if (net_device_ctx->nvdev == NULL)
 +              if (!rtnl_dereference(net_device_ctx->nvdev))
                        continue;       /* device is removed */
  
                if (rtnl_dereference(net_device_ctx->vf_netdev) == vf_netdev)
        return NULL;
  }
  
 +/* Called when VF is injecting data into network stack.
 + * Change the associated network device from VF to netvsc.
 + * note: already called with rcu_read_lock
 + */
 +static rx_handler_result_t netvsc_vf_handle_frame(struct sk_buff **pskb)
 +{
 +      struct sk_buff *skb = *pskb;
 +      struct net_device *ndev = rcu_dereference(skb->dev->rx_handler_data);
 +      struct net_device_context *ndev_ctx = netdev_priv(ndev);
 +      struct netvsc_vf_pcpu_stats *pcpu_stats
 +               = this_cpu_ptr(ndev_ctx->vf_stats);
 +
 +      skb->dev = ndev;
 +
 +      u64_stats_update_begin(&pcpu_stats->syncp);
 +      pcpu_stats->rx_packets++;
 +      pcpu_stats->rx_bytes += skb->len;
 +      u64_stats_update_end(&pcpu_stats->syncp);
 +
 +      return RX_HANDLER_ANOTHER;
 +}
 +
 +static int netvsc_vf_join(struct net_device *vf_netdev,
 +                        struct net_device *ndev)
 +{
 +      struct net_device_context *ndev_ctx = netdev_priv(ndev);
 +      int ret;
 +
 +      ret = netdev_rx_handler_register(vf_netdev,
 +                                       netvsc_vf_handle_frame, ndev);
 +      if (ret != 0) {
 +              netdev_err(vf_netdev,
 +                         "can not register netvsc VF receive handler (err = %d)\n",
 +                         ret);
 +              goto rx_handler_failed;
 +      }
 +
 +      ret = netdev_upper_dev_link(vf_netdev, ndev);
 +      if (ret != 0) {
 +              netdev_err(vf_netdev,
 +                         "can not set master device %s (err = %d)\n",
 +                         ndev->name, ret);
 +              goto upper_link_failed;
 +      }
 +
 +      /* set slave flag before open to prevent IPv6 addrconf */
 +      vf_netdev->flags |= IFF_SLAVE;
 +
 +      schedule_delayed_work(&ndev_ctx->vf_takeover, VF_TAKEOVER_INT);
 +
 +      call_netdevice_notifiers(NETDEV_JOIN, vf_netdev);
 +
 +      netdev_info(vf_netdev, "joined to %s\n", ndev->name);
 +      return 0;
 +
 +upper_link_failed:
 +      netdev_rx_handler_unregister(vf_netdev);
 +rx_handler_failed:
 +      return ret;
 +}
 +
 +static void __netvsc_vf_setup(struct net_device *ndev,
 +                            struct net_device *vf_netdev)
 +{
 +      int ret;
 +
 +      /* Align MTU of VF with master */
 +      ret = dev_set_mtu(vf_netdev, ndev->mtu);
 +      if (ret)
 +              netdev_warn(vf_netdev,
 +                          "unable to change mtu to %u\n", ndev->mtu);
 +
 +      if (netif_running(ndev)) {
 +              ret = dev_open(vf_netdev);
 +              if (ret)
 +                      netdev_warn(vf_netdev,
 +                                  "unable to open: %d\n", ret);
 +      }
 +}
 +
 +/* Setup VF as slave of the synthetic device.
 + * Runs in workqueue to avoid recursion in netlink callbacks.
 + */
 +static void netvsc_vf_setup(struct work_struct *w)
 +{
 +      struct net_device_context *ndev_ctx
 +              = container_of(w, struct net_device_context, vf_takeover.work);
 +      struct net_device *ndev = hv_get_drvdata(ndev_ctx->device_ctx);
 +      struct net_device *vf_netdev;
 +
 +      if (!rtnl_trylock()) {
 +              schedule_delayed_work(&ndev_ctx->vf_takeover, 0);
 +              return;
 +      }
 +
 +      vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
 +      if (vf_netdev)
 +              __netvsc_vf_setup(ndev, vf_netdev);
 +
 +      rtnl_unlock();
 +}
 +
  static int netvsc_register_vf(struct net_device *vf_netdev)
  {
        struct net_device *ndev;
        if (!netvsc_dev || rtnl_dereference(net_device_ctx->vf_netdev))
                return NOTIFY_DONE;
  
 +      if (netvsc_vf_join(vf_netdev, ndev) != 0)
 +              return NOTIFY_DONE;
 +
        netdev_info(ndev, "VF registering: %s\n", vf_netdev->name);
 -      /*
 -       * Take a reference on the module.
 -       */
 +
 +      /* Prevent this module from being unloaded while VF is registered */
        try_module_get(THIS_MODULE);
  
        dev_hold(vf_netdev);
  
  static int netvsc_vf_up(struct net_device *vf_netdev)
  {
 -      struct net_device *ndev;
 -      struct netvsc_device *netvsc_dev;
        struct net_device_context *net_device_ctx;
 +      struct netvsc_device *netvsc_dev;
 +      struct net_device *ndev;
  
        ndev = get_netvsc_byref(vf_netdev);
        if (!ndev)
  
        net_device_ctx = netdev_priv(ndev);
        netvsc_dev = rtnl_dereference(net_device_ctx->nvdev);
 +      if (!netvsc_dev)
 +              return NOTIFY_DONE;
  
 -      netdev_info(ndev, "VF up: %s\n", vf_netdev->name);
 -
 -      /*
 -       * Open the device before switching data path.
 -       */
 +      /* Bump refcount when datapath is active - Why? */
        rndis_filter_open(netvsc_dev);
  
 -      /*
 -       * notify the host to switch the data path.
 -       */
 +      /* notify the host to switch the data path. */
        netvsc_switch_datapath(ndev, true);
        netdev_info(ndev, "Data path switched to VF: %s\n", vf_netdev->name);
  
 -      netif_carrier_off(ndev);
 -
 -      /* Now notify peers through VF device. */
 -      call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, vf_netdev);
 -
        return NOTIFY_OK;
  }
  
  static int netvsc_vf_down(struct net_device *vf_netdev)
  {
 -      struct net_device *ndev;
 -      struct netvsc_device *netvsc_dev;
        struct net_device_context *net_device_ctx;
 +      struct netvsc_device *netvsc_dev;
 +      struct net_device *ndev;
  
        ndev = get_netvsc_byref(vf_netdev);
        if (!ndev)
  
        net_device_ctx = netdev_priv(ndev);
        netvsc_dev = rtnl_dereference(net_device_ctx->nvdev);
 +      if (!netvsc_dev)
 +              return NOTIFY_DONE;
  
 -      netdev_info(ndev, "VF down: %s\n", vf_netdev->name);
        netvsc_switch_datapath(ndev, false);
        netdev_info(ndev, "Data path switched from VF: %s\n", vf_netdev->name);
        rndis_filter_close(netvsc_dev);
 -      netif_carrier_on(ndev);
 -
 -      /* Now notify peers through netvsc device. */
 -      call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, ndev);
  
        return NOTIFY_OK;
  }
@@@ -1899,11 -1504,9 +1904,11 @@@ static int netvsc_unregister_vf(struct 
                return NOTIFY_DONE;
  
        net_device_ctx = netdev_priv(ndev);
 +      cancel_delayed_work_sync(&net_device_ctx->vf_takeover);
  
        netdev_info(ndev, "VF unregistering: %s\n", vf_netdev->name);
  
 +      netdev_upper_dev_unlink(vf_netdev, ndev);
        RCU_INIT_POINTER(net_device_ctx->vf_netdev, NULL);
        dev_put(vf_netdev);
        module_put(THIS_MODULE);
@@@ -1917,12 -1520,12 +1922,12 @@@ static int netvsc_probe(struct hv_devic
        struct net_device_context *net_device_ctx;
        struct netvsc_device_info device_info;
        struct netvsc_device *nvdev;
 -      int ret;
 +      int ret = -ENOMEM;
  
        net = alloc_etherdev_mq(sizeof(struct net_device_context),
                                VRSS_CHANNEL_MAX);
        if (!net)
 -              return -ENOMEM;
 +              goto no_net;
  
        netif_carrier_off(net);
  
  
        spin_lock_init(&net_device_ctx->lock);
        INIT_LIST_HEAD(&net_device_ctx->reconfig_events);
 +      INIT_DELAYED_WORK(&net_device_ctx->vf_takeover, netvsc_vf_setup);
 +
 +      net_device_ctx->vf_stats
 +              = netdev_alloc_pcpu_stats(struct netvsc_vf_pcpu_stats);
 +      if (!net_device_ctx->vf_stats)
 +              goto no_stats;
  
        net->netdev_ops = &device_ops;
        net->ethtool_ops = &ethtool_ops;
        memset(&device_info, 0, sizeof(device_info));
        device_info.ring_size = ring_size;
        device_info.num_chn = VRSS_CHANNEL_DEFAULT;
 -      ret = rndis_filter_device_add(dev, &device_info);
 -      if (ret != 0) {
 +      device_info.send_sections = NETVSC_DEFAULT_TX;
 +      device_info.recv_sections = NETVSC_DEFAULT_RX;
 +
 +      nvdev = rndis_filter_device_add(dev, &device_info);
 +      if (IS_ERR(nvdev)) {
 +              ret = PTR_ERR(nvdev);
                netdev_err(net, "unable to add netvsc device (ret %d)\n", ret);
 -              free_netdev(net);
 -              hv_set_drvdata(dev, NULL);
 -              return ret;
 +              goto rndis_failed;
        }
 +
        memcpy(net->dev_addr, device_info.mac_adr, ETH_ALEN);
  
        /* hw_features computed in rndis_filter_device_add */
                NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX;
        net->vlan_features = net->features;
  
 -      /* RCU not necessary here, device not registered */
 -      nvdev = net_device_ctx->nvdev;
        netif_set_real_num_tx_queues(net, nvdev->num_chn);
        netif_set_real_num_rx_queues(net, nvdev->num_chn);
  
 +      netdev_lockdep_set_classes(net);
 +
        /* MTU range: 68 - 1500 or 65521 */
        net->min_mtu = NETVSC_MTU_MIN;
        if (nvdev->nvsp_version >= NVSP_PROTOCOL_VERSION_2)
        ret = register_netdev(net);
        if (ret != 0) {
                pr_err("Unable to register netdev.\n");
 -              rndis_filter_device_remove(dev, nvdev);
 -              free_netdev(net);
 +              goto register_failed;
        }
  
        return ret;
 +
 +register_failed:
 +      rndis_filter_device_remove(dev, nvdev);
 +rndis_failed:
 +      free_percpu(net_device_ctx->vf_stats);
 +no_stats:
 +      hv_set_drvdata(dev, NULL);
 +      free_netdev(net);
 +no_net:
 +      return ret;
  }
  
  static int netvsc_remove(struct hv_device *dev)
         * removed. Also blocks mtu and channel changes.
         */
        rtnl_lock();
 -      rndis_filter_device_remove(dev, ndev_ctx->nvdev);
 +      rndis_filter_device_remove(dev,
 +                                 rtnl_dereference(ndev_ctx->nvdev));
        rtnl_unlock();
  
        unregister_netdev(net);
  
        hv_set_drvdata(dev, NULL);
  
 +      free_percpu(ndev_ctx->vf_stats);
        free_netdev(net);
        return 0;
  }
diff --combined drivers/net/phy/phy.c
index dae13f028c84ee177800a138dd80e3a5228a2d60,d0626bf5c540911b0d15bdbab1b960145b6d124c..e842d2cd1ee750f8930028370c1c7fac5a52dc77
@@@ -30,6 -30,7 +30,6 @@@
  #include <linux/ethtool.h>
  #include <linux/phy.h>
  #include <linux/phy_led_triggers.h>
 -#include <linux/timer.h>
  #include <linux/workqueue.h>
  #include <linux/mdio.h>
  #include <linux/io.h>
  
  #include <asm/irq.h>
  
 -static const char *phy_speed_to_str(int speed)
 -{
 -      switch (speed) {
 -      case SPEED_10:
 -              return "10Mbps";
 -      case SPEED_100:
 -              return "100Mbps";
 -      case SPEED_1000:
 -              return "1Gbps";
 -      case SPEED_2500:
 -              return "2.5Gbps";
 -      case SPEED_5000:
 -              return "5Gbps";
 -      case SPEED_10000:
 -              return "10Gbps";
 -      case SPEED_14000:
 -              return "14Gbps";
 -      case SPEED_20000:
 -              return "20Gbps";
 -      case SPEED_25000:
 -              return "25Gbps";
 -      case SPEED_40000:
 -              return "40Gbps";
 -      case SPEED_50000:
 -              return "50Gbps";
 -      case SPEED_56000:
 -              return "56Gbps";
 -      case SPEED_100000:
 -              return "100Gbps";
 -      case SPEED_UNKNOWN:
 -              return "Unknown";
 -      default:
 -              return "Unsupported (update phy.c)";
 -      }
 -}
 -
  #define PHY_STATE_STR(_state)                 \
        case PHY_##_state:                      \
                return __stringify(_state);     \
@@@ -73,7 -110,7 +73,7 @@@ void phy_print_status(struct phy_devic
                netdev_info(phydev->attached_dev,
                        "Link is Up - %s/%s - flow control %s\n",
                        phy_speed_to_str(phydev->speed),
 -                      DUPLEX_FULL == phydev->duplex ? "Full" : "Half",
 +                      phy_duplex_to_str(phydev->duplex),
                        phydev->pause ? "rx/tx" : "off");
        } else  {
                netdev_info(phydev->attached_dev, "Link is Down\n");
@@@ -157,6 -194,123 +157,6 @@@ int phy_aneg_done(struct phy_device *ph
  }
  EXPORT_SYMBOL(phy_aneg_done);
  
 -/* A structure for mapping a particular speed and duplex
 - * combination to a particular SUPPORTED and ADVERTISED value
 - */
 -struct phy_setting {
 -      int speed;
 -      int duplex;
 -      u32 setting;
 -};
 -
 -/* A mapping of all SUPPORTED settings to speed/duplex.  This table
 - * must be grouped by speed and sorted in descending match priority
 - * - iow, descending speed. */
 -static const struct phy_setting settings[] = {
 -      {
 -              .speed = SPEED_10000,
 -              .duplex = DUPLEX_FULL,
 -              .setting = SUPPORTED_10000baseKR_Full,
 -      },
 -      {
 -              .speed = SPEED_10000,
 -              .duplex = DUPLEX_FULL,
 -              .setting = SUPPORTED_10000baseKX4_Full,
 -      },
 -      {
 -              .speed = SPEED_10000,
 -              .duplex = DUPLEX_FULL,
 -              .setting = SUPPORTED_10000baseT_Full,
 -      },
 -      {
 -              .speed = SPEED_2500,
 -              .duplex = DUPLEX_FULL,
 -              .setting = SUPPORTED_2500baseX_Full,
 -      },
 -      {
 -              .speed = SPEED_1000,
 -              .duplex = DUPLEX_FULL,
 -              .setting = SUPPORTED_1000baseKX_Full,
 -      },
 -      {
 -              .speed = SPEED_1000,
 -              .duplex = DUPLEX_FULL,
 -              .setting = SUPPORTED_1000baseT_Full,
 -      },
 -      {
 -              .speed = SPEED_1000,
 -              .duplex = DUPLEX_HALF,
 -              .setting = SUPPORTED_1000baseT_Half,
 -      },
 -      {
 -              .speed = SPEED_100,
 -              .duplex = DUPLEX_FULL,
 -              .setting = SUPPORTED_100baseT_Full,
 -      },
 -      {
 -              .speed = SPEED_100,
 -              .duplex = DUPLEX_HALF,
 -              .setting = SUPPORTED_100baseT_Half,
 -      },
 -      {
 -              .speed = SPEED_10,
 -              .duplex = DUPLEX_FULL,
 -              .setting = SUPPORTED_10baseT_Full,
 -      },
 -      {
 -              .speed = SPEED_10,
 -              .duplex = DUPLEX_HALF,
 -              .setting = SUPPORTED_10baseT_Half,
 -      },
 -};
 -
 -/**
 - * phy_lookup_setting - lookup a PHY setting
 - * @speed: speed to match
 - * @duplex: duplex to match
 - * @features: allowed link modes
 - * @exact: an exact match is required
 - *
 - * Search the settings array for a setting that matches the speed and
 - * duplex, and which is supported.
 - *
 - * If @exact is unset, either an exact match or %NULL for no match will
 - * be returned.
 - *
 - * If @exact is set, an exact match, the fastest supported setting at
 - * or below the specified speed, the slowest supported setting, or if
 - * they all fail, %NULL will be returned.
 - */
 -static const struct phy_setting *
 -phy_lookup_setting(int speed, int duplex, u32 features, bool exact)
 -{
 -      const struct phy_setting *p, *match = NULL, *last = NULL;
 -      int i;
 -
 -      for (i = 0, p = settings; i < ARRAY_SIZE(settings); i++, p++) {
 -              if (p->setting & features) {
 -                      last = p;
 -                      if (p->speed == speed && p->duplex == duplex) {
 -                              /* Exact match for speed and duplex */
 -                              match = p;
 -                              break;
 -                      } else if (!exact) {
 -                              if (!match && p->speed <= speed)
 -                                      /* Candidate */
 -                                      match = p;
 -
 -                              if (p->speed < speed)
 -                                      break;
 -                      }
 -              }
 -      }
 -
 -      if (!match && !exact)
 -              match = last;
 -
 -      return match;
 -}
 -
  /**
   * phy_find_valid - find a PHY setting that matches the requested parameters
   * @speed: desired speed
  static const struct phy_setting *
  phy_find_valid(int speed, int duplex, u32 supported)
  {
 -      return phy_lookup_setting(speed, duplex, supported, false);
 +      unsigned long mask = supported;
 +
 +      return phy_lookup_setting(speed, duplex, &mask, BITS_PER_LONG, false);
  }
  
  /**
@@@ -192,9 -344,16 +192,9 @@@ unsigned int phy_supported_speeds(struc
                                  unsigned int *speeds,
                                  unsigned int size)
  {
 -      unsigned int count = 0;
 -      unsigned int idx = 0;
 +      unsigned long supported = phy->supported;
  
 -      for (idx = 0; idx < ARRAY_SIZE(settings) && count < size; idx++)
 -              /* Assumes settings are grouped by speed */
 -              if ((settings[idx].setting & phy->supported) &&
 -                  (count == 0 || speeds[count - 1] != settings[idx].speed))
 -                      speeds[count++] = settings[idx].speed;
 -
 -      return count;
 +      return phy_speeds(speeds, size, &supported, BITS_PER_LONG);
  }
  
  /**
   */
  static inline bool phy_check_valid(int speed, int duplex, u32 features)
  {
 -      return !!phy_lookup_setting(speed, duplex, features, true);
 +      unsigned long mask = features;
 +
 +      return !!phy_lookup_setting(speed, duplex, &mask, BITS_PER_LONG, true);
  }
  
  /**
@@@ -548,15 -705,14 +548,15 @@@ EXPORT_SYMBOL(phy_start_aneg)
   *
   * Description: The PHY infrastructure can run a state machine
   *   which tracks whether the PHY is starting up, negotiating,
 - *   etc.  This function starts the timer which tracks the state
 - *   of the PHY.  If you want to maintain your own state machine,
 + *   etc.  This function starts the delayed workqueue which tracks
 + *   the state of the PHY. If you want to maintain your own state machine,
   *   do not call this function.
   */
  void phy_start_machine(struct phy_device *phydev)
  {
        queue_delayed_work(system_power_efficient_wq, &phydev->state_queue, HZ);
  }
 +EXPORT_SYMBOL_GPL(phy_start_machine);
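  (Not part of the patch: a rough sketch of a MAC driver that lets phylib's delayed-work state machine do the tracking, as the kernel-doc above describes. The mydrv_* names and the "mdio0:01" bus id are invented.)

  #include <linux/netdevice.h>
  #include <linux/phy.h>

  static void mydrv_adjust_link(struct net_device *ndev)
  {
  	/* called back by the state machine on link/speed/duplex changes */
  }

  static int mydrv_open(struct net_device *ndev)
  {
  	struct phy_device *phydev;

  	phydev = phy_connect(ndev, "mdio0:01", mydrv_adjust_link,
  			     PHY_INTERFACE_MODE_RGMII);
  	if (IS_ERR(phydev))
  		return PTR_ERR(phydev);

  	phy_start(phydev);	/* phy_connect() already queued the state machine work */
  	return 0;
  }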
  
  /**
   * phy_trigger_machine - trigger the state machine to run
@@@ -581,9 -737,9 +581,9 @@@ void phy_trigger_machine(struct phy_dev
   * phy_stop_machine - stop the PHY state machine tracking
   * @phydev: target phy_device struct
   *
 - * Description: Stops the state machine timer, sets the state to UP
 - *   (unless it wasn't up yet). This function must be called BEFORE
 - *   phy_detach.
 + * Description: Stops the state machine delayed workqueue, sets the
 + *   state to UP (unless it wasn't up yet). This function must be
 + *   called BEFORE phy_detach.
   */
  void phy_stop_machine(struct phy_device *phydev)
  {
        if (phydev->state > PHY_UP && phydev->state != PHY_HALTED)
                phydev->state = PHY_UP;
        mutex_unlock(&phydev->lock);
-       /* Now we can run the state machine synchronously */
-       phy_state_machine(&phydev->state_queue.work);
  }
  
  /**
@@@ -866,15 -1019,9 +863,15 @@@ void phy_start(struct phy_device *phyde
  }
  EXPORT_SYMBOL(phy_start);
  
 -static void phy_adjust_link(struct phy_device *phydev)
 +static void phy_link_up(struct phy_device *phydev)
  {
 -      phydev->adjust_link(phydev->attached_dev);
 +      phydev->phy_link_change(phydev, true, true);
 +      phy_led_trigger_change_speed(phydev);
 +}
 +
 +static void phy_link_down(struct phy_device *phydev, bool do_carrier)
 +{
 +      phydev->phy_link_change(phydev, false, do_carrier);
        phy_led_trigger_change_speed(phydev);
  }
  
@@@ -919,7 -1066,8 +916,7 @@@ void phy_state_machine(struct work_stru
                /* If the link is down, give up on negotiation for now */
                if (!phydev->link) {
                        phydev->state = PHY_NOLINK;
 -                      netif_carrier_off(phydev->attached_dev);
 -                      phy_adjust_link(phydev);
 +                      phy_link_down(phydev, true);
                        break;
                }
  
                /* If AN is done, we're running */
                if (err > 0) {
                        phydev->state = PHY_RUNNING;
 -                      netif_carrier_on(phydev->attached_dev);
 -                      phy_adjust_link(phydev);
 -
 +                      phy_link_up(phydev);
                } else if (0 == phydev->link_timeout--)
                        needs_aneg = true;
                break;
                                }
                        }
                        phydev->state = PHY_RUNNING;
 -                      netif_carrier_on(phydev->attached_dev);
 -                      phy_adjust_link(phydev);
 +                      phy_link_up(phydev);
                }
                break;
        case PHY_FORCING:
  
                if (phydev->link) {
                        phydev->state = PHY_RUNNING;
 -                      netif_carrier_on(phydev->attached_dev);
 +                      phy_link_up(phydev);
                } else {
                        if (0 == phydev->link_timeout--)
                                needs_aneg = true;
 +                      phy_link_down(phydev, false);
                }
 -
 -              phy_adjust_link(phydev);
                break;
        case PHY_RUNNING:
                /* Only register a CHANGE if we are polling and link changed
  
                if (phydev->link) {
                        phydev->state = PHY_RUNNING;
 -                      netif_carrier_on(phydev->attached_dev);
 +                      phy_link_up(phydev);
                } else {
                        phydev->state = PHY_NOLINK;
 -                      netif_carrier_off(phydev->attached_dev);
 +                      phy_link_down(phydev, true);
                }
  
 -              phy_adjust_link(phydev);
 -
                if (phy_interrupt_is_valid(phydev))
                        err = phy_config_interrupt(phydev,
                                                   PHY_INTERRUPT_ENABLED);
        case PHY_HALTED:
                if (phydev->link) {
                        phydev->link = 0;
 -                      netif_carrier_off(phydev->attached_dev);
 -                      phy_adjust_link(phydev);
 +                      phy_link_down(phydev, true);
                        do_suspend = true;
                }
                break;
  
                                if (phydev->link) {
                                        phydev->state = PHY_RUNNING;
 -                                      netif_carrier_on(phydev->attached_dev);
 +                                      phy_link_up(phydev);
                                } else  {
                                        phydev->state = PHY_NOLINK;
 +                                      phy_link_down(phydev, false);
                                }
 -                              phy_adjust_link(phydev);
                        } else {
                                phydev->state = PHY_AN;
                                phydev->link_timeout = PHY_AN_TIMEOUT;
  
                        if (phydev->link) {
                                phydev->state = PHY_RUNNING;
 -                              netif_carrier_on(phydev->attached_dev);
 +                              phy_link_up(phydev);
                        } else  {
                                phydev->state = PHY_NOLINK;
 +                              phy_link_down(phydev, false);
                        }
 -                      phy_adjust_link(phydev);
                }
                break;
        }
        if (err < 0)
                phy_error(phydev);
  
 -      phydev_dbg(phydev, "PHY state change %s -> %s\n",
 -                 phy_state_to_str(old_state),
 -                 phy_state_to_str(phydev->state));
 +      if (old_state != phydev->state)
 +              phydev_dbg(phydev, "PHY state change %s -> %s\n",
 +                         phy_state_to_str(old_state),
 +                         phy_state_to_str(phydev->state));
  
        /* Only re-schedule a PHY state machine change if we are polling the
         * PHY, if PHY_IGNORE_INTERRUPT is set, then we will be moving
diff --combined drivers/net/phy/phy_device.c
index 9493fb369682eba2f5a6b80e3517ec9f31598746,2f742ae5b92ee7d7be080ec60fca7958f722e576..810f6fd2f6391e508432091d171b42664869013c
@@@ -688,19 -688,6 +688,19 @@@ struct phy_device *phy_find_first(struc
  }
  EXPORT_SYMBOL(phy_find_first);
  
 +static void phy_link_change(struct phy_device *phydev, bool up, bool do_carrier)
 +{
 +      struct net_device *netdev = phydev->attached_dev;
 +
 +      if (do_carrier) {
 +              if (up)
 +                      netif_carrier_on(netdev);
 +              else
 +                      netif_carrier_off(netdev);
 +      }
 +      phydev->adjust_link(netdev);
 +}
 +
  /**
   * phy_prepare_link - prepares the PHY layer to monitor link status
   * @phydev: target phy_device struct
@@@ -877,15 -864,17 +877,17 @@@ EXPORT_SYMBOL(phy_attached_info)
  #define ATTACHED_FMT "attached PHY driver [%s] (mii_bus:phy_addr=%s, irq=%d)"
  void phy_attached_print(struct phy_device *phydev, const char *fmt, ...)
  {
+       const char *drv_name = phydev->drv ? phydev->drv->name : "unbound";
        if (!fmt) {
                dev_info(&phydev->mdio.dev, ATTACHED_FMT "\n",
-                        phydev->drv->name, phydev_name(phydev),
+                        drv_name, phydev_name(phydev),
                         phydev->irq);
        } else {
                va_list ap;
  
                dev_info(&phydev->mdio.dev, ATTACHED_FMT,
-                        phydev->drv->name, phydev_name(phydev),
+                        drv_name, phydev_name(phydev),
                         phydev->irq);
  
                va_start(ap, fmt);
@@@ -964,7 -953,6 +966,7 @@@ int phy_attach_direct(struct net_devic
                goto error;
        }
  
 +      phydev->phy_link_change = phy_link_change;
        phydev->attached_dev = dev;
        dev->phydev = phydev;
  
@@@ -1084,7 -1072,6 +1086,7 @@@ void phy_detach(struct phy_device *phyd
        phydev->attached_dev->phydev = NULL;
        phydev->attached_dev = NULL;
        phy_suspend(phydev);
 +      phydev->phylink = NULL;
  
        phy_led_triggers_unregister(phydev);
  
diff --combined drivers/net/usb/cdc_ncm.c
index 811b18215cae1f3f6636d15050d5e76ea5be441f,9c80e80c5493b4f5a5d56c06f0d188debe53eca1..47cab1bde0659a518cea55e1ae54d9dccccdaa25
@@@ -367,7 -367,7 +367,7 @@@ static struct attribute *cdc_ncm_sysfs_
        NULL,
  };
  
 -static struct attribute_group cdc_ncm_sysfs_attr_group = {
 +static const struct attribute_group cdc_ncm_sysfs_attr_group = {
        .name = "cdc_ncm",
        .attrs = cdc_ncm_sysfs_attrs,
  };
@@@ -1758,6 -1758,13 +1758,13 @@@ static const struct usb_device_id cdc_d
          .driver_info = (unsigned long)&wwan_noarp_info,
        },
  
+       /* u-blox TOBY-L4 */
+       { USB_DEVICE_AND_INTERFACE_INFO(0x1546, 0x1010,
+               USB_CLASS_COMM,
+               USB_CDC_SUBCLASS_NCM, USB_CDC_PROTO_NONE),
+         .driver_info = (unsigned long)&wwan_info,
+       },
        /* Generic CDC-NCM devices */
        { USB_INTERFACE_INFO(USB_CLASS_COMM,
                USB_CDC_SUBCLASS_NCM, USB_CDC_PROTO_NONE),
diff --combined drivers/net/virtio_net.c
index 52ae78ca3d3865e8ce589de047ac3268a133fadf,b06169ea60dc9d519f9a7673e9aea809539dbd5c..511f8339fa963c300d41cfd2e27480ea43c6ed73
@@@ -57,13 -57,6 +57,13 @@@ DECLARE_EWMA(pkt_len, 0, 64
  
  #define VIRTNET_DRIVER_VERSION "1.0.0"
  
 +static const unsigned long guest_offloads[] = {
 +      VIRTIO_NET_F_GUEST_TSO4,
 +      VIRTIO_NET_F_GUEST_TSO6,
 +      VIRTIO_NET_F_GUEST_ECN,
 +      VIRTIO_NET_F_GUEST_UFO
 +};
 +
  struct virtnet_stats {
        struct u64_stats_sync tx_syncp;
        struct u64_stats_sync rx_syncp;
@@@ -171,13 -164,10 +171,13 @@@ struct virtnet_info 
        u8 ctrl_promisc;
        u8 ctrl_allmulti;
        u16 ctrl_vid;
 +      u64 ctrl_offloads;
  
        /* Ethtool settings */
        u8 duplex;
        u32 speed;
 +
 +      unsigned long guest_offloads;
  };
  
  struct padded_vnet_hdr {
@@@ -280,23 -270,6 +280,23 @@@ static void skb_xmit_done(struct virtqu
                netif_wake_subqueue(vi->dev, vq2txq(vq));
  }
  
 +#define MRG_CTX_HEADER_SHIFT 22
 +static void *mergeable_len_to_ctx(unsigned int truesize,
 +                                unsigned int headroom)
 +{
 +      return (void *)(unsigned long)((headroom << MRG_CTX_HEADER_SHIFT) | truesize);
 +}
 +
 +static unsigned int mergeable_ctx_to_headroom(void *mrg_ctx)
 +{
 +      return (unsigned long)mrg_ctx >> MRG_CTX_HEADER_SHIFT;
 +}
 +
 +static unsigned int mergeable_ctx_to_truesize(void *mrg_ctx)
 +{
 +      return (unsigned long)mrg_ctx & ((1 << MRG_CTX_HEADER_SHIFT) - 1);
 +}
 +
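  (Not from the driver itself: a tiny userspace-style check of the packing scheme above, with made-up headroom/truesize values, showing that the decode helpers invert mergeable_len_to_ctx(). The context is treated here as a plain unsigned long rather than a void pointer.)

  #include <assert.h>

  #define MRG_CTX_HEADER_SHIFT 22

  int main(void)
  {
  	unsigned long truesize = 1536;	/* hypothetical; must fit below bit 22 */
  	unsigned long headroom = 256;	/* hypothetical XDP headroom */
  	unsigned long ctx = (headroom << MRG_CTX_HEADER_SHIFT) | truesize;

  	/* mirrors mergeable_ctx_to_headroom()/mergeable_ctx_to_truesize() above */
  	assert((ctx >> MRG_CTX_HEADER_SHIFT) == headroom);
  	assert((ctx & ((1UL << MRG_CTX_HEADER_SHIFT) - 1)) == truesize);
  	return 0;
  }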
  /* Called from bottom half context */
  static struct sk_buff *page_to_skb(struct virtnet_info *vi,
                                   struct receive_queue *rq,
  
        hdr_len = vi->hdr_len;
        if (vi->mergeable_rx_bufs)
 -              hdr_padded_len = sizeof *hdr;
 +              hdr_padded_len = sizeof(*hdr);
        else
                hdr_padded_len = sizeof(struct padded_vnet_hdr);
  
@@@ -417,85 -390,19 +417,85 @@@ static unsigned int virtnet_get_headroo
        return vi->xdp_queue_pairs ? VIRTIO_XDP_HEADROOM : 0;
  }
  
 +/* We copy the packet for XDP in the following cases:
 + *
 + * 1) Packet is scattered across multiple rx buffers.
 + * 2) Headroom space is insufficient.
 + *
 + * This is inefficient but it's a temporary condition that
 + * we hit right after XDP is enabled and until queue is refilled
 + * with large buffers with sufficient headroom - so it should affect
 + * at most queue size packets.
 + * Afterwards, the conditions to enable
 + * XDP should preclude the underlying device from sending packets
 + * across multiple buffers (num_buf > 1), and we make sure buffers
 + * have enough headroom.
 + */
 +static struct page *xdp_linearize_page(struct receive_queue *rq,
 +                                     u16 *num_buf,
 +                                     struct page *p,
 +                                     int offset,
 +                                     int page_off,
 +                                     unsigned int *len)
 +{
 +      struct page *page = alloc_page(GFP_ATOMIC);
 +
 +      if (!page)
 +              return NULL;
 +
 +      memcpy(page_address(page) + page_off, page_address(p) + offset, *len);
 +      page_off += *len;
 +
 +      while (--*num_buf) {
 +              unsigned int buflen;
 +              void *buf;
 +              int off;
 +
 +              buf = virtqueue_get_buf(rq->vq, &buflen);
 +              if (unlikely(!buf))
 +                      goto err_buf;
 +
 +              p = virt_to_head_page(buf);
 +              off = buf - page_address(p);
 +
 +              /* guard against a misconfigured or uncooperative backend that
 +               * is sending packet larger than the MTU.
 +               */
 +              if ((page_off + buflen) > PAGE_SIZE) {
 +                      put_page(p);
 +                      goto err_buf;
 +              }
 +
 +              memcpy(page_address(page) + page_off,
 +                     page_address(p) + off, buflen);
 +              page_off += buflen;
 +              put_page(p);
 +      }
 +
 +      /* Headroom does not contribute to packet length */
 +      *len = page_off - VIRTIO_XDP_HEADROOM;
 +      return page;
 +err_buf:
 +      __free_pages(page, 0);
 +      return NULL;
 +}
 +
  static struct sk_buff *receive_small(struct net_device *dev,
                                     struct virtnet_info *vi,
                                     struct receive_queue *rq,
 -                                   void *buf, unsigned int len)
 +                                   void *buf, void *ctx,
 +                                   unsigned int len)
  {
        struct sk_buff *skb;
        struct bpf_prog *xdp_prog;
 -      unsigned int xdp_headroom = virtnet_get_headroom(vi);
 +      unsigned int xdp_headroom = (unsigned long)ctx;
        unsigned int header_offset = VIRTNET_RX_PAD + xdp_headroom;
        unsigned int headroom = vi->hdr_len + header_offset;
        unsigned int buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
                              SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 +      struct page *page = virt_to_head_page(buf);
        unsigned int delta = 0;
 +      struct page *xdp_page;
        len -= vi->hdr_len;
  
        rcu_read_lock();
                if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
                        goto err_xdp;
  
 +              if (unlikely(xdp_headroom < virtnet_get_headroom(vi))) {
 +                      int offset = buf - page_address(page) + header_offset;
 +                      unsigned int tlen = len + vi->hdr_len;
 +                      u16 num_buf = 1;
 +
 +                      xdp_headroom = virtnet_get_headroom(vi);
 +                      header_offset = VIRTNET_RX_PAD + xdp_headroom;
 +                      headroom = vi->hdr_len + header_offset;
 +                      buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
 +                               SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 +                      xdp_page = xdp_linearize_page(rq, &num_buf, page,
 +                                                    offset, header_offset,
 +                                                    &tlen);
 +                      if (!xdp_page)
 +                              goto err_xdp;
 +
 +                      buf = page_address(xdp_page);
 +                      put_page(page);
 +                      page = xdp_page;
 +              }
 +
                xdp.data_hard_start = buf + VIRTNET_RX_PAD + vi->hdr_len;
                xdp.data = xdp.data_hard_start + xdp_headroom;
                xdp.data_end = xdp.data + len;
  
        skb = build_skb(buf, buflen);
        if (!skb) {
 -              put_page(virt_to_head_page(buf));
 +              put_page(page);
                goto err;
        }
        skb_reserve(skb, headroom - delta);
@@@ -574,7 -460,7 +574,7 @@@ err
  err_xdp:
        rcu_read_unlock();
        dev->stats.rx_dropped++;
 -      put_page(virt_to_head_page(buf));
 +      put_page(page);
  xdp_xmit:
        return NULL;
  }
@@@ -599,6 -485,66 +599,6 @@@ err
        return NULL;
  }
  
 -/* The conditions to enable XDP should preclude the underlying device from
 - * sending packets across multiple buffers (num_buf > 1). However per spec
 - * it does not appear to be illegal to do so but rather just against convention.
 - * So in order to avoid making a system unresponsive the packets are pushed
 - * into a page and the XDP program is run. This will be extremely slow and we
 - * push a warning to the user to fix this as soon as possible. Fixing this may
 - * require resolving the underlying hardware to determine why multiple buffers
 - * are being received or simply loading the XDP program in the ingress stack
 - * after the skb is built because there is no advantage to running it here
 - * anymore.
 - */
 -static struct page *xdp_linearize_page(struct receive_queue *rq,
 -                                     u16 *num_buf,
 -                                     struct page *p,
 -                                     int offset,
 -                                     unsigned int *len)
 -{
 -      struct page *page = alloc_page(GFP_ATOMIC);
 -      unsigned int page_off = VIRTIO_XDP_HEADROOM;
 -
 -      if (!page)
 -              return NULL;
 -
 -      memcpy(page_address(page) + page_off, page_address(p) + offset, *len);
 -      page_off += *len;
 -
 -      while (--*num_buf) {
 -              unsigned int buflen;
 -              void *buf;
 -              int off;
 -
 -              buf = virtqueue_get_buf(rq->vq, &buflen);
 -              if (unlikely(!buf))
 -                      goto err_buf;
 -
 -              p = virt_to_head_page(buf);
 -              off = buf - page_address(p);
 -
 -              /* guard against a misconfigured or uncooperative backend that
 -               * is sending packet larger than the MTU.
 -               */
 -              if ((page_off + buflen) > PAGE_SIZE) {
 -                      put_page(p);
 -                      goto err_buf;
 -              }
 -
 -              memcpy(page_address(page) + page_off,
 -                     page_address(p) + off, buflen);
 -              page_off += buflen;
 -              put_page(p);
 -      }
 -
 -      /* Headroom does not contribute to packet length */
 -      *len = page_off - VIRTIO_XDP_HEADROOM;
 -      return page;
 -err_buf:
 -      __free_pages(page, 0);
 -      return NULL;
 -}
 -
  static struct sk_buff *receive_mergeable(struct net_device *dev,
                                         struct virtnet_info *vi,
                                         struct receive_queue *rq,
        struct sk_buff *head_skb, *curr_skb;
        struct bpf_prog *xdp_prog;
        unsigned int truesize;
 +      unsigned int headroom = mergeable_ctx_to_headroom(ctx);
  
        head_skb = NULL;
  
                u32 act;
  
                /* This happens when rx buffer size is underestimated */
 -              if (unlikely(num_buf > 1)) {
 +              if (unlikely(num_buf > 1 ||
 +                           headroom < virtnet_get_headroom(vi))) {
                        /* linearize data for XDP */
                        xdp_page = xdp_linearize_page(rq, &num_buf,
 -                                                    page, offset, &len);
 +                                                    page, offset,
 +                                                    VIRTIO_XDP_HEADROOM,
 +                                                    &len);
                        if (!xdp_page)
                                goto err_xdp;
                        offset = VIRTIO_XDP_HEADROOM;
        }
        rcu_read_unlock();
  
 -      if (unlikely(len > (unsigned long)ctx)) {
 +      truesize = mergeable_ctx_to_truesize(ctx);
 +      if (unlikely(len > truesize)) {
                pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
                         dev->name, len, (unsigned long)ctx);
                dev->stats.rx_length_errors++;
                goto err_skb;
        }
 -      truesize = (unsigned long)ctx;
 +
        head_skb = page_to_skb(vi, rq, page, offset, len, truesize);
        curr_skb = head_skb;
  
                }
  
                page = virt_to_head_page(buf);
 -              if (unlikely(len > (unsigned long)ctx)) {
 +
 +              truesize = mergeable_ctx_to_truesize(ctx);
 +              if (unlikely(len > truesize)) {
                        pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
                                 dev->name, len, (unsigned long)ctx);
                        dev->stats.rx_length_errors++;
                        goto err_skb;
                }
 -              truesize = (unsigned long)ctx;
  
                num_skb_frags = skb_shinfo(curr_skb)->nr_frags;
                if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
@@@ -814,7 -754,7 +814,7 @@@ static int receive_buf(struct virtnet_i
        else if (vi->big_packets)
                skb = receive_big(dev, vi, rq, buf, len);
        else
 -              skb = receive_small(dev, vi, rq, buf, len);
 +              skb = receive_small(dev, vi, rq, buf, ctx, len);
  
        if (unlikely(!skb))
                return 0;
@@@ -847,18 -787,12 +847,18 @@@ frame_err
        return 0;
  }
  
 +/* Unlike mergeable buffers, all buffers are allocated to the
 + * same size, except for the headroom. For this reason we do
 + * not need to use  mergeable_len_to_ctx here - it is enough
 + * to store the headroom as the context ignoring the truesize.
 + */
  static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
                             gfp_t gfp)
  {
        struct page_frag *alloc_frag = &rq->alloc_frag;
        char *buf;
        unsigned int xdp_headroom = virtnet_get_headroom(vi);
 +      void *ctx = (void *)(unsigned long)xdp_headroom;
        int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
        int err;
  
        alloc_frag->offset += len;
        sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
                    vi->hdr_len + GOOD_PACKET_LEN);
 -      err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, buf, gfp);
 +      err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
        if (err < 0)
                put_page(virt_to_head_page(buf));
 -
        return err;
  }
  
@@@ -967,7 -902,7 +967,7 @@@ static int add_recvbuf_mergeable(struc
        }
  
        sg_init_one(rq->sg, buf, len);
 -      ctx = (void *)(unsigned long)len;
 +      ctx = mergeable_len_to_ctx(len, headroom);
        err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
        if (err < 0)
                put_page(virt_to_head_page(buf));
@@@ -1079,7 -1014,7 +1079,7 @@@ static int virtnet_receive(struct recei
        void *buf;
        struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
  
 -      if (vi->mergeable_rx_bufs) {
 +      if (!vi->big_packets || vi->mergeable_rx_bufs) {
                void *ctx;
  
                while (received < budget &&
@@@ -1123,7 -1058,7 +1123,7 @@@ static void free_old_xmit_skbs(struct s
                bytes += skb->len;
                packets++;
  
-               dev_kfree_skb_any(skb);
+               dev_consume_skb_any(skb);
        }
  
        /* Avoid overhead when no packets have been processed
@@@ -1878,6 -1813,7 +1878,6 @@@ static void virtnet_freeze_down(struct 
  }
  
  static int init_vqs(struct virtnet_info *vi);
 -static void _remove_vq_common(struct virtnet_info *vi);
  
  static int virtnet_restore_up(struct virtio_device *vdev)
  {
        return err;
  }
  
 -static int virtnet_reset(struct virtnet_info *vi, int curr_qp, int xdp_qp)
 +static int virtnet_set_guest_offloads(struct virtnet_info *vi, u64 offloads)
  {
 -      struct virtio_device *dev = vi->vdev;
 -      int ret;
 +      struct scatterlist sg;
 +      vi->ctrl_offloads = cpu_to_virtio64(vi->vdev, offloads);
  
 -      virtio_config_disable(dev);
 -      dev->failed = dev->config->get_status(dev) & VIRTIO_CONFIG_S_FAILED;
 -      virtnet_freeze_down(dev);
 -      _remove_vq_common(vi);
 +      sg_init_one(&sg, &vi->ctrl_offloads, sizeof(vi->ctrl_offloads));
  
 -      virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE);
 -      virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER);
 +      if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_GUEST_OFFLOADS,
 +                                VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET, &sg)) {
 +              dev_warn(&vi->dev->dev, "Fail to set guest offload. \n");
 +              return -EINVAL;
 +      }
  
 -      ret = virtio_finalize_features(dev);
 -      if (ret)
 -              goto err;
 +      return 0;
 +}
  
 -      vi->xdp_queue_pairs = xdp_qp;
 -      ret = virtnet_restore_up(dev);
 -      if (ret)
 -              goto err;
 -      ret = _virtnet_set_queues(vi, curr_qp);
 -      if (ret)
 -              goto err;
 +static int virtnet_clear_guest_offloads(struct virtnet_info *vi)
 +{
 +      u64 offloads = 0;
  
 -      virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
 -      virtio_config_enable(dev);
 -      return 0;
 -err:
 -      virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED);
 -      return ret;
 +      if (!vi->guest_offloads)
 +              return 0;
 +
 +      if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_CSUM))
 +              offloads = 1ULL << VIRTIO_NET_F_GUEST_CSUM;
 +
 +      return virtnet_set_guest_offloads(vi, offloads);
 +}
 +
 +static int virtnet_restore_guest_offloads(struct virtnet_info *vi)
 +{
 +      u64 offloads = vi->guest_offloads;
 +
 +      if (!vi->guest_offloads)
 +              return 0;
 +      if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_CSUM))
 +              offloads |= 1ULL << VIRTIO_NET_F_GUEST_CSUM;
 +
 +      return virtnet_set_guest_offloads(vi, offloads);
  }
  
  static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
        u16 xdp_qp = 0, curr_qp;
        int i, err;
  
 -      if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) ||
 -          virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) ||
 -          virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) ||
 -          virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO)) {
 +      if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)
 +          && (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) ||
 +              virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) ||
 +              virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) ||
 +              virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO))) {
                NL_SET_ERR_MSG_MOD(extack, "Can't set XDP while host is implementing LRO, disable LRO first");
                return -EOPNOTSUPP;
        }
                        return PTR_ERR(prog);
        }
  
 -      /* Changing the headroom in buffers is a disruptive operation because
 -       * existing buffers must be flushed and reallocated. This will happen
 -       * when a xdp program is initially added or xdp is disabled by removing
 -       * the xdp program resulting in number of XDP queues changing.
 -       */
 -      if (vi->xdp_queue_pairs != xdp_qp) {
 -              err = virtnet_reset(vi, curr_qp + xdp_qp, xdp_qp);
 -              if (err) {
 -                      dev_warn(&dev->dev, "XDP reset failure.\n");
 -                      goto virtio_reset_err;
 -              }
 -      }
 +      /* Make sure NAPI is not using any XDP TX queues for RX. */
 +      for (i = 0; i < vi->max_queue_pairs; i++)
 +              napi_disable(&vi->rq[i].napi);
  
        netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp);
 +      err = _virtnet_set_queues(vi, curr_qp + xdp_qp);
 +      if (err)
 +              goto err;
 +      vi->xdp_queue_pairs = xdp_qp;
  
        for (i = 0; i < vi->max_queue_pairs; i++) {
                old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
                rcu_assign_pointer(vi->rq[i].xdp_prog, prog);
 +              if (i == 0) {
 +                      if (!old_prog)
 +                              virtnet_clear_guest_offloads(vi);
 +                      if (!prog)
 +                              virtnet_restore_guest_offloads(vi);
 +              }
                if (old_prog)
                        bpf_prog_put(old_prog);
 +              virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
        }
  
        return 0;
  
 -virtio_reset_err:
 -      /* On reset error do our best to unwind XDP changes inflight and return
 -       * error up to user space for resolution. The underlying reset hung on
 -       * us so not much we can do here.
 -       */
 +err:
 +      for (i = 0; i < vi->max_queue_pairs; i++)
 +              virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
        if (prog)
                bpf_prog_sub(prog, vi->max_queue_pairs - 1);
        return err;
@@@ -2255,7 -2182,7 +2255,7 @@@ static int virtnet_find_vqs(struct virt
        names = kmalloc(total_vqs * sizeof(*names), GFP_KERNEL);
        if (!names)
                goto err_names;
 -      if (vi->mergeable_rx_bufs) {
 +      if (!vi->big_packets || vi->mergeable_rx_bufs) {
                ctx = kzalloc(total_vqs * sizeof(*ctx), GFP_KERNEL);
                if (!ctx)
                        goto err_ctx;
@@@ -2376,7 -2303,7 +2376,7 @@@ err
  
  #ifdef CONFIG_SYSFS
  static ssize_t mergeable_rx_buffer_size_show(struct netdev_rx_queue *queue,
 -              struct rx_queue_attribute *attribute, char *buf)
 +              char *buf)
  {
        struct virtnet_info *vi = netdev_priv(queue->dev);
        unsigned int queue_index = get_netdev_rx_queue_index(queue);
@@@ -2501,7 -2428,7 +2501,7 @@@ static int virtnet_probe(struct virtio_
                        dev->features |= NETIF_F_HW_CSUM | NETIF_F_SG;
  
                if (virtio_has_feature(vdev, VIRTIO_NET_F_GSO)) {
 -                      dev->hw_features |= NETIF_F_TSO | NETIF_F_UFO
 +                      dev->hw_features |= NETIF_F_TSO
                                | NETIF_F_TSO_ECN | NETIF_F_TSO6;
                }
                /* Individual feature bits: what can host handle? */
                        dev->hw_features |= NETIF_F_TSO6;
                if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_ECN))
                        dev->hw_features |= NETIF_F_TSO_ECN;
 -              if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_UFO))
 -                      dev->hw_features |= NETIF_F_UFO;
  
                dev->features |= NETIF_F_GSO_ROBUST;
  
                if (gso)
 -                      dev->features |= dev->hw_features & (NETIF_F_ALL_TSO|NETIF_F_UFO);
 +                      dev->features |= dev->hw_features & NETIF_F_ALL_TSO;
                /* (!csum && gso) case will be fixed by register_netdev() */
        }
        if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_CSUM))
                netif_carrier_on(dev);
        }
  
 +      for (i = 0; i < ARRAY_SIZE(guest_offloads); i++)
 +              if (virtio_has_feature(vi->vdev, guest_offloads[i]))
 +                      set_bit(guest_offloads[i], &vi->guest_offloads);
 +
        pr_debug("virtnet: registered device %s with %d RX and TX vq's\n",
                 dev->name, max_queue_pairs);
  
@@@ -2672,6 -2597,15 +2672,6 @@@ free
        return err;
  }
  
 -static void _remove_vq_common(struct virtnet_info *vi)
 -{
 -      vi->vdev->config->reset(vi->vdev);
 -      free_unused_bufs(vi);
 -      _free_receive_bufs(vi);
 -      free_receive_page_frags(vi);
 -      virtnet_del_vqs(vi);
 -}
 -
  static void remove_vq_common(struct virtnet_info *vi)
  {
        vi->vdev->config->reset(vi->vdev);
@@@ -2703,7 -2637,8 +2703,7 @@@ static void virtnet_remove(struct virti
        free_netdev(vi->dev);
  }
  
 -#ifdef CONFIG_PM_SLEEP
 -static int virtnet_freeze(struct virtio_device *vdev)
 +static __maybe_unused int virtnet_freeze(struct virtio_device *vdev)
  {
        struct virtnet_info *vi = vdev->priv;
  
        return 0;
  }
  
 -static int virtnet_restore(struct virtio_device *vdev)
 +static __maybe_unused int virtnet_restore(struct virtio_device *vdev)
  {
        struct virtnet_info *vi = vdev->priv;
        int err;
  
        return 0;
  }
 -#endif
  
  static struct virtio_device_id id_table[] = {
        { VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
        VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, \
        VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \
        VIRTIO_NET_F_CTRL_MAC_ADDR, \
 -      VIRTIO_NET_F_MTU
 +      VIRTIO_NET_F_MTU, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS
  
  static unsigned int features[] = {
        VIRTNET_FEATURES,
diff --combined drivers/net/wireless/intel/iwlwifi/pcie/internal.h
index 79020cf8c79cfd7b09a4310dd4f94d31c5254fdc,a1ea9ef97ed97adc31e7c399b66123cbab8058e1..4fb7647995c3964590e3d4b984da054e887016cb
@@@ -661,16 -661,10 +661,16 @@@ static inline void iwl_pcie_sw_reset(st
        usleep_range(5000, 6000);
  }
  
 +static inline u8 iwl_pcie_get_cmd_index(struct iwl_txq *q, u32 index)
 +{
 +      return index & (q->n_window - 1);
 +}
 +
  static inline void *iwl_pcie_get_tfd(struct iwl_trans_pcie *trans_pcie,
                                     struct iwl_txq *txq, int idx)
  {
 -      return txq->tfds + trans_pcie->tfd_size * idx;
 +      return txq->tfds + trans_pcie->tfd_size * iwl_pcie_get_cmd_index(txq,
 +                                                                       idx);
  }
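  (One small note on the helper added above: it assumes the queue window n_window is a power of two, so the bitwise AND is a cheap modulo. A throwaway standalone check with invented numbers:)

  #include <assert.h>

  int main(void)
  {
  	unsigned int n_window = 64;	/* hypothetical TFD window size, power of two */
  	unsigned int index = 70;	/* pointer that has wrapped past the window */

  	/* same masking as iwl_pcie_get_cmd_index(): both expressions give 6 */
  	assert((index & (n_window - 1)) == index % n_window);
  	return 0;
  }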
  
  static inline void iwl_enable_rfkill_int(struct iwl_trans *trans)
@@@ -732,6 -726,11 +732,6 @@@ static inline bool iwl_queue_used(cons
                !(i < q->read_ptr && i >= q->write_ptr);
  }
  
 -static inline u8 get_cmd_index(struct iwl_txq *q, u32 index)
 -{
 -      return index & (q->n_window - 1);
 -}
 -
  static inline bool iwl_is_rfkill_set(struct iwl_trans *trans)
  {
        struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans);
@@@ -788,11 -787,13 +788,13 @@@ int iwl_pci_fw_enter_d0i3(struct iwl_tr
  
  void iwl_pcie_enable_rx_wake(struct iwl_trans *trans, bool enable);
  
+ void iwl_pcie_rx_allocator_work(struct work_struct *data);
  /* common functions that are used by gen2 transport */
  void iwl_pcie_apm_config(struct iwl_trans *trans);
  int iwl_pcie_prepare_card_hw(struct iwl_trans *trans);
  void iwl_pcie_synchronize_irqs(struct iwl_trans *trans);
 -bool iwl_trans_check_hw_rf_kill(struct iwl_trans *trans);
 +bool iwl_pcie_check_hw_rf_kill(struct iwl_trans *trans);
  void iwl_trans_pcie_handle_stop_rfkill(struct iwl_trans *trans,
                                       bool was_in_rfkill);
  void iwl_pcie_txq_free_tfd(struct iwl_trans *trans, struct iwl_txq *txq);
@@@ -807,8 -808,6 +809,8 @@@ int iwl_pcie_alloc_dma_ptr(struct iwl_t
                           struct iwl_dma_ptr *ptr, size_t size);
  void iwl_pcie_free_dma_ptr(struct iwl_trans *trans, struct iwl_dma_ptr *ptr);
  void iwl_pcie_apply_destination(struct iwl_trans *trans);
 +void iwl_pcie_free_tso_page(struct iwl_trans_pcie *trans_pcie,
 +                          struct sk_buff *skb);
  #ifdef CONFIG_INET
  struct iwl_tso_hdr_page *get_page_hdr(struct iwl_trans *trans, size_t len);
  #endif
diff --combined drivers/net/wireless/intel/iwlwifi/pcie/rx.c
index e5d2bf0bde3738b898398a3f4483be95a610ae86,942736d3fa75521018580aed3518dd2fea251fc6..a06b6612b6583d6b5efa1d2396bc2060a2ffa37f
@@@ -597,7 -597,7 +597,7 @@@ static void iwl_pcie_rx_allocator_get(s
        rxq->free_count += RX_CLAIM_REQ_ALLOC;
  }
  
- static void iwl_pcie_rx_allocator_work(struct work_struct *data)
+ void iwl_pcie_rx_allocator_work(struct work_struct *data)
  {
        struct iwl_rb_allocator *rba_p =
                container_of(data, struct iwl_rb_allocator, rx_alloc);
@@@ -900,10 -900,6 +900,6 @@@ static int _iwl_pcie_rx_init(struct iwl
                        return err;
        }
        def_rxq = trans_pcie->rxq;
-       if (!rba->alloc_wq)
-               rba->alloc_wq = alloc_workqueue("rb_allocator",
-                                               WQ_HIGHPRI | WQ_UNBOUND, 1);
-       INIT_WORK(&rba->rx_alloc, iwl_pcie_rx_allocator_work);
  
        spin_lock(&rba->lock);
        atomic_set(&rba->req_pending, 0);
@@@ -1017,10 -1013,6 +1013,6 @@@ void iwl_pcie_rx_free(struct iwl_trans 
        }
  
        cancel_work_sync(&rba->rx_alloc);
-       if (rba->alloc_wq) {
-               destroy_workqueue(rba->alloc_wq);
-               rba->alloc_wq = NULL;
-       }
  
        iwl_pcie_free_rbs_pool(trans);
  
@@@ -1176,7 -1168,7 +1168,7 @@@ static void iwl_pcie_rx_handle_rb(struc
  
                sequence = le16_to_cpu(pkt->hdr.sequence);
                index = SEQ_TO_INDEX(sequence);
 -              cmd_index = get_cmd_index(txq, index);
 +              cmd_index = iwl_pcie_get_cmd_index(txq, index);
  
                if (rxq->id == 0)
                        iwl_op_mode_rx(trans->op_mode, &rxq->napi,
diff --combined drivers/net/wireless/intel/iwlwifi/pcie/trans.c
index 58873cc27396f038812b33658a916cff1e408000,3927bbf04f727d5e0b9a9c0a9d1c78cee42e3d94..2e3e013ec95acf94eecb843edef305864cf6939b
@@@ -986,7 -986,7 +986,7 @@@ static int iwl_pcie_load_given_ucode_80
                                               &first_ucode_section);
  }
  
 -bool iwl_trans_check_hw_rf_kill(struct iwl_trans *trans)
 +bool iwl_pcie_check_hw_rf_kill(struct iwl_trans *trans)
  {
        struct iwl_trans_pcie *trans_pcie =  IWL_TRANS_GET_PCIE_TRANS(trans);
        bool hw_rfkill = iwl_is_rfkill_set(trans);
@@@ -1252,7 -1252,7 +1252,7 @@@ static int iwl_trans_pcie_start_fw(stru
        mutex_lock(&trans_pcie->mutex);
  
        /* If platform's RF_KILL switch is NOT set to KILL */
 -      hw_rfkill = iwl_trans_check_hw_rf_kill(trans);
 +      hw_rfkill = iwl_pcie_check_hw_rf_kill(trans);
        if (hw_rfkill && !run_in_rfkill) {
                ret = -ERFKILL;
                goto out;
                ret = iwl_pcie_load_given_ucode(trans, fw);
  
        /* re-check RF-Kill state since we may have missed the interrupt */
 -      hw_rfkill = iwl_trans_check_hw_rf_kill(trans);
 +      hw_rfkill = iwl_pcie_check_hw_rf_kill(trans);
        if (hw_rfkill && !run_in_rfkill)
                ret = -ERFKILL;
  
@@@ -1663,7 -1663,7 +1663,7 @@@ static int _iwl_trans_pcie_start_hw(str
        trans_pcie->is_down = false;
  
        /* ...rfkill can call stop_device and set it false if needed */
 -      iwl_trans_check_hw_rf_kill(trans);
 +      iwl_pcie_check_hw_rf_kill(trans);
  
        /* Make sure we sync here, because we'll need full access later */
        if (low_power)
@@@ -1786,6 -1786,11 +1786,11 @@@ void iwl_trans_pcie_free(struct iwl_tra
                iwl_pcie_tx_free(trans);
        iwl_pcie_rx_free(trans);
  
+       if (trans_pcie->rba.alloc_wq) {
+               destroy_workqueue(trans_pcie->rba.alloc_wq);
+               trans_pcie->rba.alloc_wq = NULL;
+       }
        if (trans_pcie->msix_enabled) {
                for (i = 0; i < trans_pcie->alloc_vecs; i++) {
                        irq_set_affinity_hint(
@@@ -1842,8 -1847,8 +1847,8 @@@ static bool iwl_trans_pcie_grab_nic_acc
         * These bits say the device is running, and should keep running for
         * at least a short while (at least as long as MAC_ACCESS_REQ stays 1),
         * but they do not indicate that embedded SRAM is restored yet;
 -       * 3945 and 4965 have volatile SRAM, and must save/restore contents
 -       * to/from host DRAM when sleeping/waking for power-saving.
 +       * HW with volatile SRAM must save/restore contents to/from
 +       * host DRAM when sleeping/waking for power-saving.
         * Each direction takes approximately 1/4 millisecond; with this
         * overhead, it's a good idea to grab and hold MAC_ACCESS_REQUEST if a
         * series of register accesses are expected (e.g. reading Event Log),
         *
         * CSR_UCODE_DRV_GP1 register bit MAC_SLEEP == 0 indicates that
         * SRAM is okay/restored.  We don't check that here because this call
 -       * is just for hardware register access; but GP1 MAC_SLEEP check is a
 -       * good idea before accessing 3945/4965 SRAM (e.g. reading Event Log).
 +       * is just for hardware register access; but GP1 MAC_SLEEP
 +       * check is a good idea before accessing the SRAM of HW with
 +       * volatile SRAM (e.g. reading Event Log).
         *
         * 5000 series and later (including 1000 series) have non-volatile SRAM,
         * and do not save/restore SRAM when power cycling.
@@@ -2835,7 -2839,7 +2840,7 @@@ static struct iwl_trans_dump_dat
        spin_lock_bh(&cmdq->lock);
        ptr = cmdq->write_ptr;
        for (i = 0; i < cmdq->n_window; i++) {
 -              u8 idx = get_cmd_index(cmdq, ptr);
 +              u8 idx = iwl_pcie_get_cmd_index(cmdq, ptr);
                u32 caplen, cmdlen;
  
                cmdlen = iwl_trans_pcie_get_cmdlen(trans, cmdq->tfds +
@@@ -3138,18 -3142,7 +3143,18 @@@ struct iwl_trans *iwl_trans_pcie_alloc(
                iwl_set_bit(trans, CSR_HOST_CHICKEN,
                            CSR_HOST_CHICKEN_PM_IDLE_SRC_DIS_SB_PME);
  
 +#if IS_ENABLED(CONFIG_IWLMVM)
        trans->hw_rf_id = iwl_read32(trans, CSR_HW_RF_ID);
 +      if (trans->hw_rf_id == CSR_HW_RF_ID_TYPE_HR) {
 +              u32 hw_status;
 +
 +              hw_status = iwl_read_prph(trans, UMAG_GEN_HW_STATUS);
 +              if (hw_status & UMAG_GEN_HW_IS_FPGA)
 +                      trans->cfg = &iwla000_2ax_cfg_qnj_hr_f0;
 +              else
 +                      trans->cfg = &iwla000_2ac_cfg_hr;
 +      }
 +#endif
  
        iwl_pcie_set_interrupt_capa(pdev, trans);
        trans->hw_id = (pdev->device << 16) + pdev->subsystem_device;
                trans_pcie->inta_mask = CSR_INI_SET_MASK;
         }
  
+       trans_pcie->rba.alloc_wq = alloc_workqueue("rb_allocator",
+                                                  WQ_HIGHPRI | WQ_UNBOUND, 1);
+       INIT_WORK(&trans_pcie->rba.rx_alloc, iwl_pcie_rx_allocator_work);
  #ifdef CONFIG_IWLWIFI_PCIE_RTPM
        trans->runtime_pm_mode = IWL_PLAT_PM_MODE_D0I3;
  #else
diff --combined include/linux/mlx5/driver.h
index d5b6f6a9fcc508bab79c642239e103f4ef0502eb,205d82d4c468717ac26050358acb65a968481097..023b29d973e6f16310e0a2d6e909d40f208cc228
@@@ -550,7 -550,6 +550,7 @@@ struct mlx5_fc_stats 
        unsigned long sampling_interval; /* jiffies */
  };
  
 +struct mlx5_mpfs;
  struct mlx5_eswitch;
  struct mlx5_lag;
  struct mlx5_pagefault;
@@@ -647,11 -646,7 +647,11 @@@ struct mlx5_priv 
        struct list_head        ctx_list;
        spinlock_t              ctx_lock;
  
 +      struct list_head        waiting_events_list;
 +      bool                    is_accum_events;
 +
        struct mlx5_flow_steering *steering;
 +      struct mlx5_mpfs        *mpfs;
        struct mlx5_eswitch     *eswitch;
        struct mlx5_core_sriov  sriov;
        struct mlx5_lag         *lag;
@@@ -678,9 -673,7 +678,7 @@@ enum mlx5_device_state 
  };
  
  enum mlx5_interface_state {
-       MLX5_INTERFACE_STATE_DOWN = BIT(0),
-       MLX5_INTERFACE_STATE_UP = BIT(1),
-       MLX5_INTERFACE_STATE_SHUTDOWN = BIT(2),
+       MLX5_INTERFACE_STATE_UP = BIT(0),
  };
  
  enum mlx5_pci_status {
@@@ -890,6 -883,8 +888,6 @@@ static inline void *mlx5_buf_offset(str
                return buf->direct.buf + offset;
  }
  
 -extern struct workqueue_struct *mlx5_core_wq;
 -
  #define STRUCT_FIELD(header, field) \
        .struct_offset_bytes = offsetof(struct ib_unpacked_ ## header, field),      \
        .struct_size_bytes   = sizeof((struct ib_unpacked_ ## header *)0)->field
diff --combined include/linux/netdevice.h
index 35de8312e0b5a53b2851905a158995611d1709e2,c99ba7914c0a41d6d829e6018df16f4a229412b5..8aba119bb005be3e8bb07a07cdd7ca69129ee12f
@@@ -35,6 -35,7 +35,6 @@@
  
  #include <linux/percpu.h>
  #include <linux/rculist.h>
 -#include <linux/dmaengine.h>
  #include <linux/workqueue.h>
  #include <linux/dynamic_queue_limits.h>
  
@@@ -65,7 -66,6 +65,7 @@@ struct mpls_dev
  /* UDP Tunnel offloads */
  struct udp_tunnel_info;
  struct bpf_prog;
 +struct xdp_buff;
  
  void netdev_set_default_ethtool_ops(struct net_device *dev,
                                    const struct ethtool_ops *ops);
@@@ -693,9 -693,10 +693,9 @@@ struct netdev_rx_queue 
   */
  struct rx_queue_attribute {
        struct attribute attr;
 -      ssize_t (*show)(struct netdev_rx_queue *queue,
 -          struct rx_queue_attribute *attr, char *buf);
 +      ssize_t (*show)(struct netdev_rx_queue *queue, char *buf);
        ssize_t (*store)(struct netdev_rx_queue *queue,
 -          struct rx_queue_attribute *attr, const char *buf, size_t len);
 +                       const char *buf, size_t len);
  };
  
  #ifdef CONFIG_XPS
@@@ -769,14 -770,31 +769,14 @@@ static inline bool netdev_phys_item_id_
  typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
                                       struct sk_buff *skb);
  
 -/* These structures hold the attributes of qdisc and classifiers
 - * that are being passed to the netdevice through the setup_tc op.
 - */
 -enum {
 +enum tc_setup_type {
        TC_SETUP_MQPRIO,
        TC_SETUP_CLSU32,
        TC_SETUP_CLSFLOWER,
 -      TC_SETUP_MATCHALL,
 +      TC_SETUP_CLSMATCHALL,
        TC_SETUP_CLSBPF,
  };
  
 -struct tc_cls_u32_offload;
 -
 -struct tc_to_netdev {
 -      unsigned int type;
 -      union {
 -              struct tc_cls_u32_offload *cls_u32;
 -              struct tc_cls_flower_offload *cls_flower;
 -              struct tc_cls_matchall_offload *cls_mall;
 -              struct tc_cls_bpf_offload *cls_bpf;
 -              struct tc_mqprio_qopt *mqprio;
 -      };
 -      bool egress_dev;
 -};
 -
  /* These structures hold the attributes of xdp state that are being passed
   * to the netdevice through the xdp op.
   */
@@@ -959,8 -977,8 +959,8 @@@ struct xfrmdev_ops 
   *      with PF and querying it may introduce a theoretical security risk.
   * int (*ndo_set_vf_rss_query_en)(struct net_device *dev, int vf, bool setting);
   * int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct sk_buff *skb);
 - * int (*ndo_setup_tc)(struct net_device *dev, u32 handle, u32 chain_index,
 - *                   __be16 protocol, struct tc_to_netdev *tc);
 + * int (*ndo_setup_tc)(struct net_device *dev, enum tc_setup_type type,
 + *                   void *type_data);
   *    Called to setup any 'tc' scheduler, classifier or action on @dev.
   *    This is always called from the stack with the rtnl lock held and netif
   *    tx queues stopped. This allows the netdevice to perform queue
   * int (*ndo_xdp)(struct net_device *dev, struct netdev_xdp *xdp);
   *    This function is used to set or query state related to XDP on the
   *    netdevice. See definition of enum xdp_netdev_command for details.
 - *
 + * int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_buff *xdp);
 + *    This function is used to submit a XDP packet for transmit on a
 + *    netdevice.
 + * void (*ndo_xdp_flush)(struct net_device *dev);
 + *    This function is used to inform the driver to flush a particular
 + *    xdp tx queue. Must be called on same CPU as xdp_xmit.
   */
  struct net_device_ops {
        int                     (*ndo_init)(struct net_device *dev);
                                                   struct net_device *dev,
                                                   int vf, bool setting);
        int                     (*ndo_setup_tc)(struct net_device *dev,
 -                                              u32 handle, u32 chain_index,
 -                                              __be16 protocol,
 -                                              struct tc_to_netdev *tc);
 +                                              enum tc_setup_type type,
 +                                              void *type_data);
  #if IS_ENABLED(CONFIG_FCOE)
        int                     (*ndo_fcoe_enable)(struct net_device *dev);
        int                     (*ndo_fcoe_disable)(struct net_device *dev);
                                                       int needed_headroom);
        int                     (*ndo_xdp)(struct net_device *dev,
                                           struct netdev_xdp *xdp);
 +      int                     (*ndo_xdp_xmit)(struct net_device *dev,
 +                                              struct xdp_buff *xdp);
 +      void                    (*ndo_xdp_flush)(struct net_device *dev);
  };
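  (The kernel-doc above for ndo_xdp_xmit/ndo_xdp_flush can be read alongside a driver-side sketch; everything below — the mydrv_* names and the stub bodies — is illustrative only and not taken from this patch.)

  #include <linux/netdevice.h>

  static int mydrv_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
  {
  	/* a real driver would place the frame described by @xdp on its
  	 * per-CPU XDP TX ring here; this stub just reports no room.
  	 */
  	return -ENOSPC;
  }

  static void mydrv_xdp_flush(struct net_device *dev)
  {
  	/* ring the TX doorbell for frames queued by mydrv_xdp_xmit() on
  	 * this CPU; nothing to do in the stub.
  	 */
  }

  static const struct net_device_ops mydrv_netdev_ops = {
  	.ndo_xdp_xmit	= mydrv_xdp_xmit,
  	.ndo_xdp_flush	= mydrv_xdp_flush,
  };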
  
  /**
@@@ -2297,7 -2308,6 +2297,7 @@@ struct netdev_lag_lower_state_info 
  #define NETDEV_PRECHANGEUPPER 0x001A
  #define NETDEV_CHANGELOWERSTATE       0x001B
  #define NETDEV_UDP_TUNNEL_PUSH_INFO   0x001C
 +#define NETDEV_UDP_TUNNEL_DROP_INFO   0x001D
  #define NETDEV_CHANGE_TX_QUEUE_LEN    0x001E
  
  int register_netdevice_notifier(struct notifier_block *nb);
@@@ -2413,8 -2423,8 +2413,8 @@@ struct net_device *dev_get_by_name_rcu(
  struct net_device *__dev_get_by_name(struct net *net, const char *name);
  int dev_alloc_name(struct net_device *dev, const char *name);
  int dev_open(struct net_device *dev);
 -int dev_close(struct net_device *dev);
 -int dev_close_many(struct list_head *head, bool unlink);
 +void dev_close(struct net_device *dev);
 +void dev_close_many(struct list_head *head, bool unlink);
  void dev_disable_lro(struct net_device *dev);
  int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *newskb);
  int dev_queue_xmit(struct sk_buff *skb);
@@@ -3241,8 -3251,6 +3241,8 @@@ static inline void dev_consume_skb_any(
        __dev_kfree_skb_any(skb, SKB_REASON_CONSUMED);
  }
  
 +void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog);
 +int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb);
  int netif_rx(struct sk_buff *skb);
  int netif_rx_ni(struct sk_buff *skb);
  int netif_receive_skb(struct sk_buff *skb);
@@@ -3858,6 -3866,8 +3858,8 @@@ int netdev_walk_all_upper_dev_rcu(struc
  bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
                                  struct net_device *upper_dev);
  
+ bool netdev_has_any_upper_dev(struct net_device *dev);
  void *netdev_lower_get_next_private(struct net_device *dev,
                                    struct list_head **iter);
  void *netdev_lower_get_next_private_rcu(struct net_device *dev,
@@@ -4011,22 -4021,22 +4013,22 @@@ static inline netdev_tx_t netdev_start_
        return rc;
  }
  
 -int netdev_class_create_file_ns(struct class_attribute *class_attr,
 +int netdev_class_create_file_ns(const struct class_attribute *class_attr,
                                const void *ns);
 -void netdev_class_remove_file_ns(struct class_attribute *class_attr,
 +void netdev_class_remove_file_ns(const struct class_attribute *class_attr,
                                 const void *ns);
  
 -static inline int netdev_class_create_file(struct class_attribute *class_attr)
 +static inline int netdev_class_create_file(const struct class_attribute *class_attr)
  {
        return netdev_class_create_file_ns(class_attr, NULL);
  }
  
 -static inline void netdev_class_remove_file(struct class_attribute *class_attr)
 +static inline void netdev_class_remove_file(const struct class_attribute *class_attr)
  {
        netdev_class_remove_file_ns(class_attr, NULL);
  }
  
 -extern struct kobj_ns_type_operations net_ns_type_operations;
 +extern const struct kobj_ns_type_operations net_ns_type_operations;
  
  const char *netdev_drivername(const struct net_device *dev);
  
@@@ -4081,6 -4091,7 +4083,6 @@@ static inline bool net_gso_ok(netdev_fe
  
        /* check flags correspondence */
        BUILD_BUG_ON(SKB_GSO_TCPV4   != (NETIF_F_TSO >> NETIF_F_GSO_SHIFT));
 -      BUILD_BUG_ON(SKB_GSO_UDP     != (NETIF_F_UFO >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_DODGY   != (NETIF_F_GSO_ROBUST >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_TCP_ECN != (NETIF_F_TSO_ECN >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_TCP_FIXEDID != (NETIF_F_TSO_MANGLEID >> NETIF_F_GSO_SHIFT));
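
The two new ndo_xdp_xmit()/ndo_xdp_flush() hooks above let XDP_REDIRECT queue frames on a driver's TX ring and batch the doorbell. A minimal sketch of how a driver might wire them up, for illustration only; mydrv_xdp, mydrv_priv, mydrv_queue_xdp_frame() and mydrv_kick_tx() are hypothetical helpers, not part of this diff.

static int mydrv_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
{
	struct mydrv_priv *priv = netdev_priv(dev);

	/* Queue the frame; the TX doorbell is deferred to ->ndo_xdp_flush()
	 * so a burst of redirected frames costs a single kick.
	 */
	return mydrv_queue_xdp_frame(priv, xdp);
}

static void mydrv_xdp_flush(struct net_device *dev)
{
	mydrv_kick_tx(netdev_priv(dev));
}

static const struct net_device_ops mydrv_netdev_ops = {
	.ndo_xdp	= mydrv_xdp,
	.ndo_xdp_xmit	= mydrv_xdp_xmit,
	.ndo_xdp_flush	= mydrv_xdp_flush,
};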
diff --combined include/linux/skbuff.h
index 7594e19bce622a38dc39c054093c3da15b99b67b,d67a8182e5eb2177d978ca8a5effeaf6bd579394..f93cc01064cb720d4ddf76c363044d1a6c62b68b
@@@ -345,42 -345,6 +345,42 @@@ static inline void skb_frag_size_sub(sk
        frag->size -= delta;
  }
  
 +static inline bool skb_frag_must_loop(struct page *p)
 +{
 +#if defined(CONFIG_HIGHMEM)
 +      if (PageHighMem(p))
 +              return true;
 +#endif
 +      return false;
 +}
 +
 +/**
 + *    skb_frag_foreach_page - loop over pages in a fragment
 + *
 + *    @f:             skb frag to operate on
 + *    @f_off:         offset from start of f->page.p
 + *    @f_len:         length from f_off to loop over
 + *    @p:             (temp var) current page
 + *    @p_off:         (temp var) offset from start of current page,
 + *                               non-zero only on first page.
 + *    @p_len:         (temp var) length in current page,
 + *                               < PAGE_SIZE only on first and last page.
 + *    @copied:        (temp var) length so far, excluding current p_len.
 + *
 + *    A fragment can hold a compound page, in which case per-page
 + *    operations, notably kmap_atomic, must be called for each
 + *    regular page.
 + */
 +#define skb_frag_foreach_page(f, f_off, f_len, p, p_off, p_len, copied)       \
 +      for (p = skb_frag_page(f) + ((f_off) >> PAGE_SHIFT),            \
 +           p_off = (f_off) & (PAGE_SIZE - 1),                         \
 +           p_len = skb_frag_must_loop(p) ?                            \
 +           min_t(u32, f_len, PAGE_SIZE - p_off) : f_len,              \
 +           copied = 0;                                                \
 +           copied < f_len;                                            \
 +           copied += p_len, p++, p_off = 0,                           \
 +           p_len = min_t(u32, f_len - copied, PAGE_SIZE))             \
 +
  #define HAVE_HW_TIME_STAMP
  
  /**
@@@ -429,7 -393,6 +429,7 @@@ enum 
        SKBTX_SCHED_TSTAMP = 1 << 6,
  };
  
 +#define SKBTX_ZEROCOPY_FRAG   (SKBTX_DEV_ZEROCOPY | SKBTX_SHARED_FRAG)
  #define SKBTX_ANY_SW_TSTAMP   (SKBTX_SW_TSTAMP    | \
                                 SKBTX_SCHED_TSTAMP)
  #define SKBTX_ANY_TSTAMP      (SKBTX_HW_TSTAMP | SKBTX_ANY_SW_TSTAMP)
   */
  struct ubuf_info {
        void (*callback)(struct ubuf_info *, bool zerocopy_success);
 -      void *ctx;
 -      unsigned long desc;
 +      union {
 +              struct {
 +                      unsigned long desc;
 +                      void *ctx;
 +              };
 +              struct {
 +                      u32 id;
 +                      u16 len;
 +                      u16 zerocopy:1;
 +                      u32 bytelen;
 +              };
 +      };
 +      atomic_t refcnt;
 +
 +      struct mmpin {
 +              struct user_struct *user;
 +              unsigned int num_pg;
 +      } mmp;
  };
  
 +#define skb_uarg(SKB) ((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg))
 +
 +struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size);
 +struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
 +                                      struct ubuf_info *uarg);
 +
 +static inline void sock_zerocopy_get(struct ubuf_info *uarg)
 +{
 +      atomic_inc(&uarg->refcnt);
 +}
 +
 +void sock_zerocopy_put(struct ubuf_info *uarg);
 +void sock_zerocopy_put_abort(struct ubuf_info *uarg);
 +
 +void sock_zerocopy_callback(struct ubuf_info *uarg, bool success);
 +
 +int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
 +                           struct msghdr *msg, int len,
 +                           struct ubuf_info *uarg);
 +
  /* This data is invariant across clones and lives at
   * the end of the header data, ie. at skb->end.
   */
@@@ -536,38 -463,39 +536,38 @@@ enum 
  
  enum {
        SKB_GSO_TCPV4 = 1 << 0,
 -      SKB_GSO_UDP = 1 << 1,
  
        /* This indicates the skb is from an untrusted source. */
 -      SKB_GSO_DODGY = 1 << 2,
 +      SKB_GSO_DODGY = 1 << 1,
  
        /* This indicates the tcp segment has CWR set. */
 -      SKB_GSO_TCP_ECN = 1 << 3,
 +      SKB_GSO_TCP_ECN = 1 << 2,
  
 -      SKB_GSO_TCP_FIXEDID = 1 << 4,
 +      SKB_GSO_TCP_FIXEDID = 1 << 3,
  
 -      SKB_GSO_TCPV6 = 1 << 5,
 +      SKB_GSO_TCPV6 = 1 << 4,
  
 -      SKB_GSO_FCOE = 1 << 6,
 +      SKB_GSO_FCOE = 1 << 5,
  
 -      SKB_GSO_GRE = 1 << 7,
 +      SKB_GSO_GRE = 1 << 6,
  
 -      SKB_GSO_GRE_CSUM = 1 << 8,
 +      SKB_GSO_GRE_CSUM = 1 << 7,
  
 -      SKB_GSO_IPXIP4 = 1 << 9,
 +      SKB_GSO_IPXIP4 = 1 << 8,
  
 -      SKB_GSO_IPXIP6 = 1 << 10,
 +      SKB_GSO_IPXIP6 = 1 << 9,
  
 -      SKB_GSO_UDP_TUNNEL = 1 << 11,
 +      SKB_GSO_UDP_TUNNEL = 1 << 10,
  
 -      SKB_GSO_UDP_TUNNEL_CSUM = 1 << 12,
 +      SKB_GSO_UDP_TUNNEL_CSUM = 1 << 11,
  
 -      SKB_GSO_PARTIAL = 1 << 13,
 +      SKB_GSO_PARTIAL = 1 << 12,
  
 -      SKB_GSO_TUNNEL_REMCSUM = 1 << 14,
 +      SKB_GSO_TUNNEL_REMCSUM = 1 << 13,
  
 -      SKB_GSO_SCTP = 1 << 15,
 +      SKB_GSO_SCTP = 1 << 14,
  
 -      SKB_GSO_ESP = 1 << 16,
 +      SKB_GSO_ESP = 1 << 15,
  };
  
  #if BITS_PER_LONG > 32
@@@ -1017,6 -945,12 +1017,6 @@@ static inline struct sk_buff *alloc_skb
        return __alloc_skb(size, priority, SKB_ALLOC_FCLONE, NUMA_NO_NODE);
  }
  
 -struct sk_buff *__alloc_skb_head(gfp_t priority, int node);
 -static inline struct sk_buff *alloc_skb_head(gfp_t priority)
 -{
 -      return __alloc_skb_head(priority, -1);
 -}
 -
  struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src);
  int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask);
  struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t priority);
@@@ -1039,7 -973,23 +1039,23 @@@ int __must_check skb_to_sgvec_nomark(st
  int __must_check skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg,
                              int offset, int len);
  int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer);
- int skb_pad(struct sk_buff *skb, int pad);
+ int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error);
+ /**
+  *    skb_pad                 -       zero pad the tail of an skb
+  *    @skb: buffer to pad
+  *    @pad: space to pad
+  *
+  *    Ensure that a buffer is followed by a padding area that is zero
+  *    filled. Used by network drivers which may DMA or transfer data
+  *    beyond the buffer end onto the wire.
+  *
+  *    May return error in out of memory cases. The skb is freed on error.
+  */
+ static inline int skb_pad(struct sk_buff *skb, int pad)
+ {
+       return __skb_pad(skb, pad, true);
+ }
  #define dev_kfree_skb(a)      consume_skb(a)
  
  int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
@@@ -1195,6 -1145,8 +1211,6 @@@ static inline __u32 skb_get_hash(struc
        return skb->hash;
  }
  
 -__u32 __skb_get_hash_flowi6(struct sk_buff *skb, const struct flowi6 *fl6);
 -
  static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, const struct flowi6 *fl6)
  {
        if (!skb->l4_hash && !skb->sw_hash) {
        return skb->hash;
  }
  
 -__u32 __skb_get_hash_flowi4(struct sk_buff *skb, const struct flowi4 *fl);
 -
 -static inline __u32 skb_get_hash_flowi4(struct sk_buff *skb, const struct flowi4 *fl4)
 -{
 -      if (!skb->l4_hash && !skb->sw_hash) {
 -              struct flow_keys keys;
 -              __u32 hash = __get_hash_from_flowi4(fl4, &keys);
 -
 -              __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys));
 -      }
 -
 -      return skb->hash;
 -}
 -
  __u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb);
  
  static inline __u32 skb_get_hash_raw(const struct sk_buff *skb)
@@@ -1251,50 -1217,6 +1267,50 @@@ static inline struct skb_shared_hwtstam
        return &skb_shinfo(skb)->hwtstamps;
  }
  
 +static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb)
 +{
 +      bool is_zcopy = skb && skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY;
 +
 +      return is_zcopy ? skb_uarg(skb) : NULL;
 +}
 +
 +static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg)
 +{
 +      if (skb && uarg && !skb_zcopy(skb)) {
 +              sock_zerocopy_get(uarg);
 +              skb_shinfo(skb)->destructor_arg = uarg;
 +              skb_shinfo(skb)->tx_flags |= SKBTX_ZEROCOPY_FRAG;
 +      }
 +}
 +
 +/* Release a reference on a zerocopy structure */
 +static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy)
 +{
 +      struct ubuf_info *uarg = skb_zcopy(skb);
 +
 +      if (uarg) {
 +              if (uarg->callback == sock_zerocopy_callback) {
 +                      uarg->zerocopy = uarg->zerocopy && zerocopy;
 +                      sock_zerocopy_put(uarg);
 +              } else {
 +                      uarg->callback(uarg, zerocopy);
 +              }
 +
 +              skb_shinfo(skb)->tx_flags &= ~SKBTX_ZEROCOPY_FRAG;
 +      }
 +}
 +
 +/* Abort a zerocopy operation and revert zckey on error in send syscall */
 +static inline void skb_zcopy_abort(struct sk_buff *skb)
 +{
 +      struct ubuf_info *uarg = skb_zcopy(skb);
 +
 +      if (uarg) {
 +              sock_zerocopy_put_abort(uarg);
 +              skb_shinfo(skb)->tx_flags &= ~SKBTX_ZEROCOPY_FRAG;
 +      }
 +}
 +
  /**
   *    skb_queue_empty - check if a queue is empty
   *    @list: queue head
@@@ -1877,18 -1799,13 +1893,18 @@@ static inline unsigned int skb_headlen(
        return skb->len - skb->data_len;
  }
  
 -static inline unsigned int skb_pagelen(const struct sk_buff *skb)
 +static inline unsigned int __skb_pagelen(const struct sk_buff *skb)
  {
        unsigned int i, len = 0;
  
        for (i = skb_shinfo(skb)->nr_frags - 1; (int)i >= 0; i--)
                len += skb_frag_size(&skb_shinfo(skb)->frags[i]);
 -      return len + skb_headlen(skb);
 +      return len;
 +}
 +
 +static inline unsigned int skb_pagelen(const struct sk_buff *skb)
 +{
 +      return skb_headlen(skb) + __skb_pagelen(skb);
  }
  
  /**
@@@ -2533,17 -2450,7 +2549,17 @@@ static inline void skb_orphan(struct sk
   */
  static inline int skb_orphan_frags(struct sk_buff *skb, gfp_t gfp_mask)
  {
 -      if (likely(!(skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY)))
 +      if (likely(!skb_zcopy(skb)))
 +              return 0;
 +      if (skb_uarg(skb)->callback == sock_zerocopy_callback)
 +              return 0;
 +      return skb_copy_ubufs(skb, gfp_mask);
 +}
 +
 +/* Frags must be orphaned, even if refcounted, if skb might loop to rx path */
 +static inline int skb_orphan_frags_rx(struct sk_buff *skb, gfp_t gfp_mask)
 +{
 +      if (likely(!skb_zcopy(skb)))
                return 0;
        return skb_copy_ubufs(skb, gfp_mask);
  }
@@@ -2934,25 -2841,42 +2950,42 @@@ static inline int skb_padto(struct sk_b
   *    skb_put_padto - increase size and pad an skbuff up to a minimal size
   *    @skb: buffer to pad
   *    @len: minimal length
+  *    @free_on_error: free buffer on error
   *
   *    Pads up a buffer to ensure the trailing bytes exist and are
   *    blanked. If the buffer already contains sufficient data it
   *    is untouched. Otherwise it is extended. Returns zero on
-  *    success. The skb is freed on error.
+  *    success. The skb is freed on error if @free_on_error is true.
   */
- static inline int skb_put_padto(struct sk_buff *skb, unsigned int len)
+ static inline int __skb_put_padto(struct sk_buff *skb, unsigned int len,
+                                 bool free_on_error)
  {
        unsigned int size = skb->len;
  
        if (unlikely(size < len)) {
                len -= size;
-               if (skb_pad(skb, len))
+               if (__skb_pad(skb, len, free_on_error))
                        return -ENOMEM;
                __skb_put(skb, len);
        }
        return 0;
  }
  
+ /**
+  *    skb_put_padto - increase size and pad an skbuff up to a minimal size
+  *    @skb: buffer to pad
+  *    @len: minimal length
+  *
+  *    Pads up a buffer to ensure the trailing bytes exist and are
+  *    blanked. If the buffer already contains sufficient data it
+  *    is untouched. Otherwise it is extended. Returns zero on
+  *    success. The skb is freed on error.
+  */
+ static inline int skb_put_padto(struct sk_buff *skb, unsigned int len)
+ {
+       return __skb_put_padto(skb, len, true);
+ }
  static inline int skb_add_data(struct sk_buff *skb,
                               struct iov_iter *from, int copy)
  {
  static inline bool skb_can_coalesce(struct sk_buff *skb, int i,
                                    const struct page *page, int off)
  {
 +      if (skb_zcopy(skb))
 +              return false;
        if (i) {
                const struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[i - 1];
  
@@@ -3231,9 -3153,6 +3264,9 @@@ __wsum skb_copy_and_csum_bits(const str
  int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
                    struct pipe_inode_info *pipe, unsigned int len,
                    unsigned int flags);
 +int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
 +                       int len);
 +int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len);
  void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to);
  unsigned int skb_zerocopy_headlen(const struct sk_buff *from);
  int skb_zerocopy(struct sk_buff *to, struct sk_buff *from,
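
The zerocopy additions above introduce reference-counted completion notifications via struct ubuf_info. A hedged sketch of how a sendmsg path is expected to use the declared helpers; the real TCP MSG_ZEROCOPY user differs in detail, and example_zerocopy_send() is illustrative only.

static int example_zerocopy_send(struct sock *sk, struct sk_buff *skb,
				 struct msghdr *msg, int len)
{
	struct ubuf_info *uarg;
	int err;

	/* Reuse an existing notification on the skb if possible. */
	uarg = sock_zerocopy_realloc(sk, len, skb_zcopy(skb));
	if (!uarg)
		return -ENOBUFS;

	/* Pins user pages into frags and associates uarg with the skb. */
	err = skb_zerocopy_iter_stream(sk, skb, msg, len, uarg);
	if (err < 0) {
		sock_zerocopy_put_abort(uarg);	/* revert zckey on error */
		return err;
	}

	/* On TX completion sock_zerocopy_callback() queues the notification
	 * on the socket error queue and drops the final reference.
	 */
	return err;
}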
diff --combined include/net/ip6_fib.h
index 71c1646298ae369e29f266d9e3b079635f22b95e,af509f801084dcf19a27f9d4c82344c5e54fe792..d060d711a6245b63a662c5ecdf839a0d4afe1af8
  #include <linux/ipv6_route.h>
  #include <linux/rtnetlink.h>
  #include <linux/spinlock.h>
 +#include <linux/notifier.h>
  #include <net/dst.h>
  #include <net/flow.h>
  #include <net/netlink.h>
  #include <net/inetpeer.h>
 +#include <net/fib_notifier.h>
  
  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
  #define FIB6_TABLE_HASHSZ 256
@@@ -72,6 -70,7 +72,7 @@@ struct fib6_node 
        __u16                   fn_flags;
        int                     fn_sernum;
        struct rt6_info         *rr_ptr;
+       struct rcu_head         rcu;
  };
  
  #ifndef CONFIG_IPV6_SUBTREES
@@@ -106,7 -105,7 +107,7 @@@ struct rt6_info 
         * the same cache line.
         */
        struct fib6_table               *rt6i_table;
-       struct fib6_node                *rt6i_node;
+       struct fib6_node __rcu          *rt6i_node;
  
        struct in6_addr                 rt6i_gateway;
  
  
        atomic_t                        rt6i_ref;
  
 +      unsigned int                    rt6i_nh_flags;
 +
        /* These are in a separate cache line. */
        struct rt6key                   rt6i_dst ____cacheline_aligned_in_smp;
        u32                             rt6i_flags;
@@@ -171,13 -168,40 +172,40 @@@ static inline void rt6_update_expires(s
        rt0->rt6i_flags |= RTF_EXPIRES;
  }
  
+ /* Function to safely get fn->sernum for passed in rt
+  * and store result in passed in cookie.
+  * Return true if we can get cookie safely
+  * Return false if not
+  */
+ static inline bool rt6_get_cookie_safe(const struct rt6_info *rt,
+                                      u32 *cookie)
+ {
+       struct fib6_node *fn;
+       bool status = false;
+       rcu_read_lock();
+       fn = rcu_dereference(rt->rt6i_node);
+       if (fn) {
+               *cookie = fn->fn_sernum;
+               status = true;
+       }
+       rcu_read_unlock();
+       return status;
+ }
  static inline u32 rt6_get_cookie(const struct rt6_info *rt)
  {
+       u32 cookie = 0;
        if (rt->rt6i_flags & RTF_PCPU ||
            (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from))
                rt = (struct rt6_info *)(rt->dst.from);
  
-       return rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
+       rt6_get_cookie_safe(rt, &cookie);
+       return cookie;
  }
  
  static inline void ip6_rt_put(struct rt6_info *rt)
        dst_release(&rt->dst);
  }
  
 +void rt6_free_pcpu(struct rt6_info *non_pcpu_rt);
 +
 +static inline void rt6_hold(struct rt6_info *rt)
 +{
 +      atomic_inc(&rt->rt6i_ref);
 +}
 +
 +static inline void rt6_release(struct rt6_info *rt)
 +{
 +      if (atomic_dec_and_test(&rt->rt6i_ref)) {
 +              rt6_free_pcpu(rt);
 +              dst_dev_put(&rt->dst);
 +              dst_release(&rt->dst);
 +      }
 +}
 +
  enum fib6_walk_state {
  #ifdef CONFIG_IPV6_SUBTREES
        FWS_S,
@@@ -253,7 -261,6 +281,7 @@@ struct fib6_table 
        struct fib6_node        tb6_root;
        struct inet_peer_base   tb6_peers;
        unsigned int            flags;
 +      unsigned int            fib_seq;
  #define RT6_TABLE_HAS_DFLT_ROUTER     BIT(0)
  };
  
@@@ -277,11 -284,6 +305,11 @@@ typedef struct rt6_info *(*pol_lookup_t
                                         struct fib6_table *,
                                         struct flowi6 *, int);
  
 +struct fib6_entry_notifier_info {
 +      struct fib_notifier_info info; /* must be first */
 +      struct rt6_info *rt;
 +};
 +
  /*
   *    exported functions
   */
@@@ -318,24 -320,9 +346,24 @@@ int fib6_init(void)
  
  int ipv6_route_open(struct inode *inode, struct file *file);
  
 +int call_fib6_notifier(struct notifier_block *nb, struct net *net,
 +                     enum fib_event_type event_type,
 +                     struct fib_notifier_info *info);
 +int call_fib6_notifiers(struct net *net, enum fib_event_type event_type,
 +                      struct fib_notifier_info *info);
 +
 +int __net_init fib6_notifier_init(struct net *net);
 +void __net_exit fib6_notifier_exit(struct net *net);
 +
 +unsigned int fib6_tables_seq_read(struct net *net);
 +int fib6_tables_dump(struct net *net, struct notifier_block *nb);
 +
  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
  int fib6_rules_init(void);
  void fib6_rules_cleanup(void);
 +bool fib6_rule_default(const struct fib_rule *rule);
 +int fib6_rules_dump(struct net *net, struct notifier_block *nb);
 +unsigned int fib6_rules_seq_read(struct net *net);
  #else
  static inline int               fib6_rules_init(void)
  {
@@@ -345,17 -332,5 +373,17 @@@ static inline void              fib6_ru
  {
        return ;
  }
 +static inline bool fib6_rule_default(const struct fib_rule *rule)
 +{
 +      return true;
 +}
 +static inline int fib6_rules_dump(struct net *net, struct notifier_block *nb)
 +{
 +      return 0;
 +}
 +static inline unsigned int fib6_rules_seq_read(struct net *net)
 +{
 +      return 0;
 +}
  #endif
  #endif
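
rt6_get_cookie_safe() exists because rt6i_node is now RCU-protected and can disappear under a reader. A sketch of the kind of cache-validation check it enables; ip6_dst_check() in net/ipv6/route.c is the real consumer, example_dst_still_valid() is illustrative only.

static bool example_dst_still_valid(struct rt6_info *rt, u32 saved_cookie)
{
	u32 cookie;

	/* Fails if the route was already unlinked from the FIB tree. */
	if (!rt6_get_cookie_safe(rt, &cookie))
		return false;

	/* An unchanged sernum means no tree change invalidated the dst. */
	return cookie == saved_cookie;
}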
index d6247a3c40df1209bdf50aacdca5d260c6962e4c,c1109cdbbfa6afb9aff0d6033aef7b615630ffc1..135f5a2dd93122dd905557028068a31aeea37cb0
@@@ -75,6 -75,7 +75,6 @@@ struct Qdisc 
        struct hlist_node       hash;
        u32                     handle;
        u32                     parent;
 -      void                    *u32_node;
  
        struct netdev_queue     *dev_queue;
  
        spinlock_t              busylock ____cacheline_aligned_in_smp;
  };
  
+ static inline void qdisc_refcount_inc(struct Qdisc *qdisc)
+ {
+       if (qdisc->flags & TCQ_F_BUILTIN)
+               return;
+       refcount_inc(&qdisc->refcnt);
+ }
  static inline bool qdisc_is_running(const struct Qdisc *qdisc)
  {
        return (raw_read_seqcount(&qdisc->running) & 1) ? true : false;
@@@ -146,7 -154,8 +153,7 @@@ struct Qdisc_class_ops 
        void                    (*qlen_notify)(struct Qdisc *, unsigned long);
  
        /* Class manipulation routines */
 -      unsigned long           (*get)(struct Qdisc *, u32 classid);
 -      void                    (*put)(struct Qdisc *, unsigned long);
 +      unsigned long           (*find)(struct Qdisc *, u32 classid);
        int                     (*change)(struct Qdisc *, u32, u32,
                                        struct nlattr **, unsigned long *);
        int                     (*delete)(struct Qdisc *, unsigned long);
  
        /* Filter manipulation */
        struct tcf_block *      (*tcf_block)(struct Qdisc *, unsigned long);
 -      bool                    (*tcf_cl_offload)(u32 classid);
        unsigned long           (*bind_tcf)(struct Qdisc *, unsigned long,
                                        u32 classid);
        void                    (*unbind_tcf)(struct Qdisc *, unsigned long);
@@@ -210,17 -220,16 +217,17 @@@ struct tcf_proto_ops 
        int                     (*init)(struct tcf_proto*);
        void                    (*destroy)(struct tcf_proto*);
  
 -      unsigned long           (*get)(struct tcf_proto*, u32 handle);
 +      void*                   (*get)(struct tcf_proto*, u32 handle);
        int                     (*change)(struct net *net, struct sk_buff *,
                                        struct tcf_proto*, unsigned long,
                                        u32 handle, struct nlattr **,
 -                                      unsigned long *, bool);
 -      int                     (*delete)(struct tcf_proto*, unsigned long, bool*);
 +                                      void **, bool);
 +      int                     (*delete)(struct tcf_proto*, void *, bool*);
        void                    (*walk)(struct tcf_proto*, struct tcf_walker *arg);
 +      void                    (*bind_class)(void *, u32, unsigned long);
  
        /* rtnetlink specific */
 -      int                     (*dump)(struct net*, struct tcf_proto*, unsigned long,
 +      int                     (*dump)(struct net*, struct tcf_proto*, void *,
                                        struct sk_buff *skb, struct tcmsg*);
  
        struct module           *owner;
@@@ -392,9 -401,6 +399,9 @@@ qdisc_class_find(const struct Qdisc_cla
        struct Qdisc_class_common *cl;
        unsigned int h;
  
 +      if (!id)
 +              return NULL;
 +
        h = qdisc_class_hash(id, hash->hashmask);
        hlist_for_each_entry(cl, &hash->hash[h], hnode) {
                if (cl->classid == id)
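
With the ->get()/->put() pair collapsed into ->find(), classful qdiscs return a bare handle and take no reference, and the new classid 0 check in qdisc_class_find() makes the lookup safe to call unconditionally. A hedged sketch of a converted class op; example_sched_data and its clhash are hypothetical.

struct example_sched_data {
	struct Qdisc_class_hash	clhash;
};

static unsigned long example_find(struct Qdisc *sch, u32 classid)
{
	struct example_sched_data *q = qdisc_priv(sch);

	/* No refcount is taken any more; 0 simply means "not found". */
	return (unsigned long)qdisc_class_find(&q->clhash, classid);
}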
diff --combined include/net/tcp.h
index 9c3db054e47f1a27fe3ff4a4a081674f424a7a93,f642a39f9eeeeb3a1bbff48fd467c3a3acb96160..b510f284427aabc1f508d24d29d0f812e5e0aa61
@@@ -139,7 -139,6 +139,7 @@@ void tcp_time_wait(struct sock *sk, in
  #endif
  #define TCP_RTO_MAX   ((unsigned)(120*HZ))
  #define TCP_RTO_MIN   ((unsigned)(HZ/5))
 +#define TCP_TIMEOUT_MIN       (2U) /* Min timeout for TCP timers in jiffies */
  #define TCP_TIMEOUT_INIT ((unsigned)(1*HZ))   /* RFC6298 2.1 initial RTO value        */
  #define TCP_TIMEOUT_FALLBACK ((unsigned)(3*HZ))       /* RFC 1122 initial RTO value, now
                                                 * used as a fallback RTO for the
  #define TCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ/2U)) /* Maximal interval between probes
                                                         * for local resources.
                                                         */
 -#define TCP_REO_TIMEOUT_MIN   (2000) /* Min RACK reordering timeout in usec */
 -
  #define TCP_KEEPALIVE_TIME    (120*60*HZ)     /* two hours */
  #define TCP_KEEPALIVE_PROBES  9               /* Max of 9 keepalive probes    */
  #define TCP_KEEPALIVE_INTVL   (75*HZ)
@@@ -256,6 -257,7 +256,6 @@@ extern int sysctl_tcp_rmem[3]
  extern int sysctl_tcp_app_win;
  extern int sysctl_tcp_adv_win_scale;
  extern int sysctl_tcp_frto;
 -extern int sysctl_tcp_low_latency;
  extern int sysctl_tcp_nometrics_save;
  extern int sysctl_tcp_moderate_rcvbuf;
  extern int sysctl_tcp_tso_win_divisor;
@@@ -350,11 -352,8 +350,11 @@@ int tcp_v4_rcv(struct sk_buff *skb)
  
  int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw);
  int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
 +int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size);
  int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size,
                 int flags);
 +int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
 +                      size_t size, int flags);
  ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
                 size_t size, int flags);
  void tcp_release_cb(struct sock *sk);
@@@ -364,7 -363,7 +364,7 @@@ void tcp_delack_timer_handler(struct so
  int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg);
  int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb);
  void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 -                       const struct tcphdr *th, unsigned int len);
 +                       const struct tcphdr *th);
  void tcp_rcv_space_adjust(struct sock *sk);
  int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp);
  void tcp_twsk_destructor(struct sock *sk);
@@@ -797,12 -796,6 +797,12 @@@ struct tcp_skb_cb 
                        u16     tcp_gso_segs;
                        u16     tcp_gso_size;
                };
 +
 +              /* Used to stash the receive timestamp while this skb is in the
 +               * out of order queue, as skb->tstamp is overwritten by the
 +               * rbnode.
 +               */
 +              ktime_t         swtstamp;
        };
        __u8            tcp_flags;      /* TCP header flags. (tcp[13])  */
  
        __u8            ip_dsfield;     /* IPv4 tos or IPv6 dsfield     */
        __u8            txstamp_ack:1,  /* Record TX timestamp for ack? */
                        eor:1,          /* Is skb MSG_EOR marked? */
 -                      unused:6;
 +                      has_rxtstamp:1, /* SKB has a RX timestamp       */
 +                      unused:5;
        __u32           ack_seq;        /* Sequence number ACK'd        */
        union {
                struct {
@@@ -857,16 -849,6 +857,16 @@@ static inline int tcp_v6_iif(const stru
  
        return l3_slave ? skb->skb_iif : TCP_SKB_CB(skb)->header.h6.iif;
  }
 +
 +/* TCP_SKB_CB reference means this can not be used from early demux */
 +static inline int tcp_v6_sdif(const struct sk_buff *skb)
 +{
 +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
 +      if (skb && ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags))
 +              return TCP_SKB_CB(skb)->header.h6.iif;
 +#endif
 +      return 0;
 +}
  #endif
  
  /* TCP_SKB_CB reference means this can not be used from early demux */
@@@ -880,16 -862,6 +880,16 @@@ static inline bool inet_exact_dif_match
        return false;
  }
  
 +/* TCP_SKB_CB reference means this can not be used from early demux */
 +static inline int tcp_v4_sdif(struct sk_buff *skb)
 +{
 +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
 +      if (skb && ipv4_l3mdev_skb(TCP_SKB_CB(skb)->header.h4.flags))
 +              return TCP_SKB_CB(skb)->header.h4.iif;
 +#endif
 +      return 0;
 +}
 +
  /* Due to TSO, an SKB can be composed of multiple actual
   * packets.  To keep these tracked properly, we use this.
   */
@@@ -1032,9 -1004,7 +1032,7 @@@ void tcp_get_default_congestion_control
  void tcp_get_available_congestion_control(char *buf, size_t len);
  void tcp_get_allowed_congestion_control(char *buf, size_t len);
  int tcp_set_allowed_congestion_control(char *allowed);
- int tcp_set_congestion_control(struct sock *sk, const char *name, bool load);
- void tcp_reinit_congestion_control(struct sock *sk,
-                                  const struct tcp_congestion_ops *ca);
+ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool reinit);
  u32 tcp_slow_start(struct tcp_sock *tp, u32 acked);
  void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked);
  
@@@ -1273,6 -1243,17 +1271,6 @@@ static inline bool tcp_checksum_complet
                __tcp_checksum_complete(skb);
  }
  
 -/* Prequeue for VJ style copy to user, combined with checksumming. */
 -
 -static inline void tcp_prequeue_init(struct tcp_sock *tp)
 -{
 -      tp->ucopy.task = NULL;
 -      tp->ucopy.len = 0;
 -      tp->ucopy.memory = 0;
 -      skb_queue_head_init(&tp->ucopy.prequeue);
 -}
 -
 -bool tcp_prequeue(struct sock *sk, struct sk_buff *skb);
  bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb);
  int tcp_filter(struct sock *sk, struct sk_buff *skb);
  
@@@ -1564,7 -1545,8 +1562,7 @@@ int tcp_fastopen_reset_cipher(void *key
  void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb);
  struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
                              struct request_sock *req,
 -                            struct tcp_fastopen_cookie *foc,
 -                            struct dst_entry *dst);
 +                            struct tcp_fastopen_cookie *foc);
  void tcp_fastopen_init_key_once(bool publish);
  bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
                             struct tcp_fastopen_cookie *cookie);
@@@ -1945,8 -1927,7 +1943,8 @@@ static inline s64 tcp_rto_delta_us(cons
  /*
   * Save and compile IPv4 options, return a pointer to it
   */
 -static inline struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
 +static inline struct ip_options_rcu *tcp_v4_save_options(struct net *net,
 +                                                       struct sk_buff *skb)
  {
        const struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt;
        struct ip_options_rcu *dopt = NULL;
                int opt_size = sizeof(*dopt) + opt->optlen;
  
                dopt = kmalloc(opt_size, GFP_ATOMIC);
 -              if (dopt && __ip_options_echo(&dopt->opt, skb, opt)) {
 +              if (dopt && __ip_options_echo(net, &dopt->opt, skb, opt)) {
                        kfree(dopt);
                        dopt = NULL;
                }
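
tcp_sendmsg_locked() and tcp_sendpage_locked() are split out so callers that already own the socket lock can reuse the TCP send path. A sketch of the calling convention, for illustration; this mirrors what tcp_sendmsg() itself does around the locked variant.

static int example_send_locked(struct sock *sk, struct msghdr *msg,
			       size_t size)
{
	int ret;

	lock_sock(sk);
	ret = tcp_sendmsg_locked(sk, msg, size);
	release_sock(sk);
	return ret;
}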
diff --combined include/net/udp.h
index 4e5f23fec35e6d4eb165872cc9ebec7dc31bc6d3,626c2d8a70c59f51fb5b2558433d222b56610246..12dfbfe2e2d7853427e244f9d6e2e39ca19bd41e
@@@ -260,7 -260,7 +260,7 @@@ static inline struct sk_buff *skb_recv_
  }
  
  void udp_v4_early_demux(struct sk_buff *skb);
- void udp_v4_early_demux(struct sk_buff *skb);
- void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst);
+ bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst);
  int udp_get_port(struct sock *sk, unsigned short snum,
                 int (*saddr_cmp)(const struct sock *,
                                  const struct sock *));
@@@ -287,7 -287,7 +287,7 @@@ int udp_lib_setsockopt(struct sock *sk
  struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
                             __be32 daddr, __be16 dport, int dif);
  struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
 -                             __be32 daddr, __be16 dport, int dif,
 +                             __be32 daddr, __be16 dport, int dif, int sdif,
                               struct udp_table *tbl, struct sk_buff *skb);
  struct sock *udp4_lib_lookup_skb(struct sk_buff *skb,
                                 __be16 sport, __be16 dport);
@@@ -298,7 -298,7 +298,7 @@@ struct sock *udp6_lib_lookup(struct ne
  struct sock *__udp6_lib_lookup(struct net *net,
                               const struct in6_addr *saddr, __be16 sport,
                               const struct in6_addr *daddr, __be16 dport,
 -                             int dif, struct udp_table *tbl,
 +                             int dif, int sdif, struct udp_table *tbl,
                               struct sk_buff *skb);
  struct sock *udp6_lib_lookup_skb(struct sk_buff *skb,
                                 __be16 sport, __be16 dport);
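
The extra sdif argument carries the slave device index for VRF/L3 master setups. Existing callers with no L3 master context can pass sdif == 0; a hedged sketch of such a wrapper, mirroring what udp4_lib_lookup() is expected to do.

static struct sock *example_udp4_lookup(struct net *net,
					__be32 saddr, __be16 sport,
					__be32 daddr, __be16 dport, int dif)
{
	/* sdif == 0: not received through an L3 master (VRF) device. */
	return __udp4_lib_lookup(net, saddr, sport, daddr, dport,
				 dif, 0, &udp_table, NULL);
}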
diff --combined kernel/events/core.c
index 8c01572709aca5c4144d30d2226e1deeab9e3e0e,3504125871d2f058fa717638e785b9c85220213a..36f98198877c71b3ba9a86a6296623a1467f6b89
@@@ -8081,7 -8081,7 +8081,7 @@@ static void perf_event_free_bpf_handler
  
  static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
  {
 -      bool is_kprobe, is_tracepoint;
 +      bool is_kprobe, is_tracepoint, is_syscall_tp;
        struct bpf_prog *prog;
  
        if (event->attr.type != PERF_TYPE_TRACEPOINT)
  
        is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
        is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
 -      if (!is_kprobe && !is_tracepoint)
 +      is_syscall_tp = is_syscall_trace_event(event->tp_event);
 +      if (!is_kprobe && !is_tracepoint && !is_syscall_tp)
                /* bpf programs can only be attached to u/kprobe or tracepoint */
                return -EINVAL;
  
                return PTR_ERR(prog);
  
        if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
 -          (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
 +          (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
 +          (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
                /* valid fd, but invalid bpf program type */
                bpf_prog_put(prog);
                return -EINVAL;
        }
  
 -      if (is_tracepoint) {
 +      if (is_tracepoint || is_syscall_tp) {
                int off = trace_event_get_offsets(event->tp_event);
  
                if (prog->aux->max_ctx_offset > off) {
@@@ -10034,28 -10032,27 +10034,27 @@@ SYSCALL_DEFINE5(perf_event_open
                        goto err_context;
  
                /*
-                * Do not allow to attach to a group in a different
-                * task or CPU context:
+                * Make sure we're both events for the same CPU;
+                * grouping events for different CPUs is broken; since
+                * you can never concurrently schedule them anyhow.
                 */
-               if (move_group) {
-                       /*
-                        * Make sure we're both on the same task, or both
-                        * per-cpu events.
-                        */
-                       if (group_leader->ctx->task != ctx->task)
-                               goto err_context;
+               if (group_leader->cpu != event->cpu)
+                       goto err_context;
  
-                       /*
-                        * Make sure we're both events for the same CPU;
-                        * grouping events for different CPUs is broken; since
-                        * you can never concurrently schedule them anyhow.
-                        */
-                       if (group_leader->cpu != event->cpu)
-                               goto err_context;
-               } else {
-                       if (group_leader->ctx != ctx)
-                               goto err_context;
-               }
+               /*
+                * Make sure we're both on the same task, or both
+                * per-CPU events.
+                */
+               if (group_leader->ctx->task != ctx->task)
+                       goto err_context;
+               /*
+                * Do not allow to attach to a group in a different task
+                * or CPU context. If we're moving SW events, we'll fix
+                * this up later, so allow that.
+                */
+               if (!move_group && group_leader->ctx != ctx)
+                       goto err_context;
  
                /*
                 * Only a group leader can be exclusive or pinned
diff --combined net/core/datagram.c
index a4d5f10d83a1ca6cf9bb1e8dc6d6faeae5947e4d,8c2f4489ff8f18680543b6adcad7604036458d5c..f7fb7e3f2acf33e42e1140372d87dc543a2f2c7c
@@@ -362,7 -362,7 +362,7 @@@ int __sk_queue_drop_skb(struct sock *sk
        if (flags & MSG_PEEK) {
                err = -ENOENT;
                spin_lock_bh(&sk_queue->lock);
-               if (skb == skb_peek(sk_queue)) {
+               if (skb->next) {
                        __skb_unlink(skb, sk_queue);
                        refcount_dec(&skb->users);
                        if (destructor)
@@@ -579,12 -579,27 +579,12 @@@ fault
  }
  EXPORT_SYMBOL(skb_copy_datagram_from_iter);
  
 -/**
 - *    zerocopy_sg_from_iter - Build a zerocopy datagram from an iov_iter
 - *    @skb: buffer to copy
 - *    @from: the source to copy from
 - *
 - *    The function will first copy up to headlen, and then pin the userspace
 - *    pages and build frags through them.
 - *
 - *    Returns 0, -EFAULT or -EMSGSIZE.
 - */
 -int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
 +int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
 +                          struct iov_iter *from, size_t length)
  {
 -      int len = iov_iter_count(from);
 -      int copy = min_t(int, skb_headlen(skb), len);
 -      int frag = 0;
 +      int frag = skb_shinfo(skb)->nr_frags;
  
 -      /* copy up to skb headlen */
 -      if (skb_copy_datagram_from_iter(skb, 0, from, copy))
 -              return -EFAULT;
 -
 -      while (iov_iter_count(from)) {
 +      while (length && iov_iter_count(from)) {
                struct page *pages[MAX_SKB_FRAGS];
                size_t start;
                ssize_t copied;
                if (frag == MAX_SKB_FRAGS)
                        return -EMSGSIZE;
  
 -              copied = iov_iter_get_pages(from, pages, ~0U,
 +              copied = iov_iter_get_pages(from, pages, length,
                                            MAX_SKB_FRAGS - frag, &start);
                if (copied < 0)
                        return -EFAULT;
  
                iov_iter_advance(from, copied);
 +              length -= copied;
  
                truesize = PAGE_ALIGN(copied + start);
                skb->data_len += copied;
                skb->len += copied;
                skb->truesize += truesize;
 -              refcount_add(truesize, &skb->sk->sk_wmem_alloc);
 +              if (sk && sk->sk_type == SOCK_STREAM) {
 +                      sk->sk_wmem_queued += truesize;
 +                      sk_mem_charge(sk, truesize);
 +              } else {
 +                      refcount_add(truesize, &skb->sk->sk_wmem_alloc);
 +              }
                while (copied) {
                        int size = min_t(int, copied, PAGE_SIZE - start);
                        skb_fill_page_desc(skb, frag++, pages[n], start, size);
        }
        return 0;
  }
 +EXPORT_SYMBOL(__zerocopy_sg_from_iter);
 +
 +/**
 + *    zerocopy_sg_from_iter - Build a zerocopy datagram from an iov_iter
 + *    @skb: buffer to copy
 + *    @from: the source to copy from
 + *
 + *    The function will first copy up to headlen, and then pin the userspace
 + *    pages and build frags through them.
 + *
 + *    Returns 0, -EFAULT or -EMSGSIZE.
 + */
 +int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
 +{
 +      int copy = min_t(int, skb_headlen(skb), iov_iter_count(from));
 +
 +      /* copy up to skb headlen */
 +      if (skb_copy_datagram_from_iter(skb, 0, from, copy))
 +              return -EFAULT;
 +
 +      return __zerocopy_sg_from_iter(NULL, skb, from, ~0U);
 +}
  EXPORT_SYMBOL(zerocopy_sg_from_iter);
  
  static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
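
The refactored __zerocopy_sg_from_iter() takes an explicit sock and length so stream sockets can charge pinned pages to sk_wmem_queued instead of sk_wmem_alloc. A hedged sketch of a caller appending user data to an existing skb; example_append_user_pages() is illustrative only.

static int example_append_user_pages(struct sock *sk, struct sk_buff *skb,
				     struct msghdr *msg, size_t len)
{
	int err = __zerocopy_sg_from_iter(sk, skb, &msg->msg_iter, len);

	/* -EMSGSIZE: no frag slots left; a real caller would fall back
	 * to copying the remainder rather than failing the send.
	 */
	return err;
}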
diff --combined net/core/dev.c
index 270b547548213438c6cde035205758c08105ba9a,86b4b0a79e7abb6554af07ed81a7b91e2f8762bf..6f845e4fec175f333b568bfc44ed85a6a400ff6f
  #include <linux/netfilter_ingress.h>
  #include <linux/crash_dump.h>
  #include <linux/sctp.h>
 +#include <net/udp_tunnel.h>
  
  #include "net-sysfs.h"
  
@@@ -1414,7 -1413,7 +1414,7 @@@ int dev_open(struct net_device *dev
  }
  EXPORT_SYMBOL(dev_open);
  
 -static int __dev_close_many(struct list_head *head)
 +static void __dev_close_many(struct list_head *head)
  {
        struct net_device *dev;
  
                dev->flags &= ~IFF_UP;
                netpoll_poll_enable(dev);
        }
 -
 -      return 0;
  }
  
 -static int __dev_close(struct net_device *dev)
 +static void __dev_close(struct net_device *dev)
  {
 -      int retval;
        LIST_HEAD(single);
  
        list_add(&dev->close_list, &single);
 -      retval = __dev_close_many(&single);
 +      __dev_close_many(&single);
        list_del(&single);
 -
 -      return retval;
  }
  
 -int dev_close_many(struct list_head *head, bool unlink)
 +void dev_close_many(struct list_head *head, bool unlink)
  {
        struct net_device *dev, *tmp;
  
                if (unlink)
                        list_del_init(&dev->close_list);
        }
 -
 -      return 0;
  }
  EXPORT_SYMBOL(dev_close_many);
  
   *    is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
   *    chain.
   */
 -int dev_close(struct net_device *dev)
 +void dev_close(struct net_device *dev)
  {
        if (dev->flags & IFF_UP) {
                LIST_HEAD(single);
                dev_close_many(&single, true);
                list_del(&single);
        }
 -      return 0;
  }
  EXPORT_SYMBOL(dev_close);
  
@@@ -1853,7 -1860,7 +1853,7 @@@ static inline int deliver_skb(struct sk
                              struct packet_type *pt_prev,
                              struct net_device *orig_dev)
  {
 -      if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
 +      if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
                return -ENOMEM;
        refcount_inc(&skb->users);
        return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
@@@ -2731,7 -2738,8 +2731,7 @@@ EXPORT_SYMBOL(skb_mac_gso_segment)
  static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
  {
        if (tx_path)
 -              return skb->ip_summed != CHECKSUM_PARTIAL &&
 -                     skb->ip_summed != CHECKSUM_UNNECESSARY;
 +              return skb->ip_summed != CHECKSUM_PARTIAL;
  
        return skb->ip_summed == CHECKSUM_NONE;
  }
        return NET_RX_DROP;
  }
  
 +static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 +                                   struct bpf_prog *xdp_prog)
 +{
 +      struct xdp_buff xdp;
 +      u32 act = XDP_DROP;
 +      void *orig_data;
 +      int hlen, off;
 +      u32 mac_len;
 +
 +      /* Reinjected packets coming from act_mirred or similar should
 +       * not get XDP generic processing.
 +       */
 +      if (skb_cloned(skb))
 +              return XDP_PASS;
 +
 +      if (skb_linearize(skb))
 +              goto do_drop;
 +
 +      /* The XDP program wants to see the packet starting at the MAC
 +       * header.
 +       */
 +      mac_len = skb->data - skb_mac_header(skb);
 +      hlen = skb_headlen(skb) + mac_len;
 +      xdp.data = skb->data - mac_len;
 +      xdp.data_end = xdp.data + hlen;
 +      xdp.data_hard_start = skb->data - skb_headroom(skb);
 +      orig_data = xdp.data;
 +
 +      act = bpf_prog_run_xdp(xdp_prog, &xdp);
 +
 +      off = xdp.data - orig_data;
 +      if (off > 0)
 +              __skb_pull(skb, off);
 +      else if (off < 0)
 +              __skb_push(skb, -off);
 +
 +      switch (act) {
 +      case XDP_REDIRECT:
 +      case XDP_TX:
 +              __skb_push(skb, mac_len);
 +              /* fall through */
 +      case XDP_PASS:
 +              break;
 +
 +      default:
 +              bpf_warn_invalid_xdp_action(act);
 +              /* fall through */
 +      case XDP_ABORTED:
 +              trace_xdp_exception(skb->dev, xdp_prog, act);
 +              /* fall through */
 +      case XDP_DROP:
 +      do_drop:
 +              kfree_skb(skb);
 +              break;
 +      }
 +
 +      return act;
 +}
 +
 +/* When doing generic XDP we have to bypass the qdisc layer and the
 + * network taps in order to match in-driver-XDP behavior.
 + */
 +void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
 +{
 +      struct net_device *dev = skb->dev;
 +      struct netdev_queue *txq;
 +      bool free_skb = true;
 +      int cpu, rc;
 +
 +      txq = netdev_pick_tx(dev, skb, NULL);
 +      cpu = smp_processor_id();
 +      HARD_TX_LOCK(dev, txq, cpu);
 +      if (!netif_xmit_stopped(txq)) {
 +              rc = netdev_start_xmit(skb, dev, txq, 0);
 +              if (dev_xmit_complete(rc))
 +                      free_skb = false;
 +      }
 +      HARD_TX_UNLOCK(dev, txq);
 +      if (free_skb) {
 +              trace_xdp_exception(dev, xdp_prog, XDP_TX);
 +              kfree_skb(skb);
 +      }
 +}
 +EXPORT_SYMBOL_GPL(generic_xdp_tx);
 +
 +static struct static_key generic_xdp_needed __read_mostly;
 +
 +int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
 +{
 +      if (xdp_prog) {
 +              u32 act = netif_receive_generic_xdp(skb, xdp_prog);
 +              int err;
 +
 +              if (act != XDP_PASS) {
 +                      switch (act) {
 +                      case XDP_REDIRECT:
 +                              err = xdp_do_generic_redirect(skb->dev, skb,
 +                                                            xdp_prog);
 +                              if (err)
 +                                      goto out_redir;
 +                      /* fallthru to submit skb */
 +                      case XDP_TX:
 +                              generic_xdp_tx(skb, xdp_prog);
 +                              break;
 +                      }
 +                      return XDP_DROP;
 +              }
 +      }
 +      return XDP_PASS;
 +out_redir:
 +      kfree_skb(skb);
 +      return XDP_DROP;
 +}
 +EXPORT_SYMBOL_GPL(do_xdp_generic);
 +
  static int netif_rx_internal(struct sk_buff *skb)
  {
        int ret;
        net_timestamp_check(netdev_tstamp_prequeue, skb);
  
        trace_netif_rx(skb);
 +
 +      if (static_key_false(&generic_xdp_needed)) {
 +              int ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog),
 +                                       skb);
 +
 +              /* Consider XDP consuming the packet a success from
 +               * the netdev point of view we do not want to count
 +               * this as an error.
 +               */
 +              if (ret != XDP_PASS)
 +                      return NET_RX_SUCCESS;
 +      }
 +
  #ifdef CONFIG_RPS
        if (static_key_false(&rps_needed)) {
                struct rps_dev_flow voidflow, *rflow = &voidflow;
@@@ -4412,7 -4292,7 +4412,7 @@@ skip_classify
        }
  
        if (pt_prev) {
 -              if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
 +              if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
                        goto drop;
                else
                        ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
@@@ -4458,6 -4338,8 +4458,6 @@@ static int __netif_receive_skb(struct s
        return ret;
  }
  
 -static struct static_key generic_xdp_needed __read_mostly;
 -
  static int generic_xdp_install(struct net_device *dev, struct netdev_xdp *xdp)
  {
        struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
        return ret;
  }
  
 -static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 -                                   struct bpf_prog *xdp_prog)
 -{
 -      struct xdp_buff xdp;
 -      u32 act = XDP_DROP;
 -      void *orig_data;
 -      int hlen, off;
 -      u32 mac_len;
 -
 -      /* Reinjected packets coming from act_mirred or similar should
 -       * not get XDP generic processing.
 -       */
 -      if (skb_cloned(skb))
 -              return XDP_PASS;
 -
 -      if (skb_linearize(skb))
 -              goto do_drop;
 -
 -      /* The XDP program wants to see the packet starting at the MAC
 -       * header.
 -       */
 -      mac_len = skb->data - skb_mac_header(skb);
 -      hlen = skb_headlen(skb) + mac_len;
 -      xdp.data = skb->data - mac_len;
 -      xdp.data_end = xdp.data + hlen;
 -      xdp.data_hard_start = skb->data - skb_headroom(skb);
 -      orig_data = xdp.data;
 -
 -      act = bpf_prog_run_xdp(xdp_prog, &xdp);
 -
 -      off = xdp.data - orig_data;
 -      if (off > 0)
 -              __skb_pull(skb, off);
 -      else if (off < 0)
 -              __skb_push(skb, -off);
 -
 -      switch (act) {
 -      case XDP_TX:
 -              __skb_push(skb, mac_len);
 -              /* fall through */
 -      case XDP_PASS:
 -              break;
 -
 -      default:
 -              bpf_warn_invalid_xdp_action(act);
 -              /* fall through */
 -      case XDP_ABORTED:
 -              trace_xdp_exception(skb->dev, xdp_prog, act);
 -              /* fall through */
 -      case XDP_DROP:
 -      do_drop:
 -              kfree_skb(skb);
 -              break;
 -      }
 -
 -      return act;
 -}
 -
 -/* When doing generic XDP we have to bypass the qdisc layer and the
 - * network taps in order to match in-driver-XDP behavior.
 - */
 -static void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
 -{
 -      struct net_device *dev = skb->dev;
 -      struct netdev_queue *txq;
 -      bool free_skb = true;
 -      int cpu, rc;
 -
 -      txq = netdev_pick_tx(dev, skb, NULL);
 -      cpu = smp_processor_id();
 -      HARD_TX_LOCK(dev, txq, cpu);
 -      if (!netif_xmit_stopped(txq)) {
 -              rc = netdev_start_xmit(skb, dev, txq, 0);
 -              if (dev_xmit_complete(rc))
 -                      free_skb = false;
 -      }
 -      HARD_TX_UNLOCK(dev, txq);
 -      if (free_skb) {
 -              trace_xdp_exception(dev, xdp_prog, XDP_TX);
 -              kfree_skb(skb);
 -      }
 -}
 -
  static int netif_receive_skb_internal(struct sk_buff *skb)
  {
        int ret;
        rcu_read_lock();
  
        if (static_key_false(&generic_xdp_needed)) {
 -              struct bpf_prog *xdp_prog = rcu_dereference(skb->dev->xdp_prog);
 +              int ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog),
 +                                       skb);
  
 -              if (xdp_prog) {
 -                      u32 act = netif_receive_generic_xdp(skb, xdp_prog);
 -
 -                      if (act != XDP_PASS) {
 -                              rcu_read_unlock();
 -                              if (act == XDP_TX)
 -                                      generic_xdp_tx(skb, xdp_prog);
 -                              return NET_RX_DROP;
 -                      }
 +              if (ret != XDP_PASS) {
 +                      rcu_read_unlock();
 +                      return NET_RX_DROP;
                }
        }
  
@@@ -5319,6 -5289,7 +5319,7 @@@ static void busy_poll_stop(struct napi_
         * Ideally, a new ndo_busy_poll_stop() could avoid another round.
         */
        rc = napi->poll(napi, BUSY_POLL_BUDGET);
+       trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
        netpoll_poll_unlock(have_poll_lock);
        if (rc == BUSY_POLL_BUDGET)
                __napi_schedule(napi);
@@@ -5697,12 -5668,13 +5698,13 @@@ EXPORT_SYMBOL(netdev_has_upper_dev_all_
   * Find out if a device is linked to an upper device and return true in case
   * it is. The caller must hold the RTNL lock.
   */
- static bool netdev_has_any_upper_dev(struct net_device *dev)
+ bool netdev_has_any_upper_dev(struct net_device *dev)
  {
        ASSERT_RTNL();
  
        return !list_empty(&dev->adj_list.upper);
  }
+ EXPORT_SYMBOL(netdev_has_any_upper_dev);
  
  /**
   * netdev_master_upper_dev_get - Get master upper device
@@@ -6719,12 -6691,8 +6721,12 @@@ int __dev_change_flags(struct net_devic
         */
  
        ret = 0;
 -      if ((old_flags ^ flags) & IFF_UP)
 -              ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
 +      if ((old_flags ^ flags) & IFF_UP) {
 +              if (old_flags & IFF_UP)
 +                      __dev_close(dev);
 +              else
 +                      ret = __dev_open(dev);
 +      }
  
        if ((flags ^ dev->gflags) & IFF_PROMISC) {
                int inc = (flags & IFF_PROMISC) ? 1 : -1;
@@@ -7269,6 -7237,24 +7271,6 @@@ static netdev_features_t netdev_fix_fea
                features &= ~NETIF_F_GSO;
        }
  
 -      /* UFO needs SG and checksumming */
 -      if (features & NETIF_F_UFO) {
 -              /* maybe split UFO into V4 and V6? */
 -              if (!(features & NETIF_F_HW_CSUM) &&
 -                  ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) !=
 -                   (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) {
 -                      netdev_dbg(dev,
 -                              "Dropping NETIF_F_UFO since no checksum offload features.\n");
 -                      features &= ~NETIF_F_UFO;
 -              }
 -
 -              if (!(features & NETIF_F_SG)) {
 -                      netdev_dbg(dev,
 -                              "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
 -                      features &= ~NETIF_F_UFO;
 -              }
 -      }
 -
        /* GSO partial features require GSO partial be set */
        if ((features & dev->gso_partial_features) &&
            !(features & NETIF_F_GSO_PARTIAL)) {
@@@ -7329,27 -7315,8 +7331,27 @@@ sync_lower
        netdev_for_each_lower_dev(dev, lower, iter)
                netdev_sync_lower_features(dev, lower, features);
  
 -      if (!err)
 +      if (!err) {
 +              netdev_features_t diff = features ^ dev->features;
 +
 +              if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) {
 +                      /* udp_tunnel_{get,drop}_rx_info both need
 +                       * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the
 +                       * device, or they won't do anything.
 +                       * Thus we need to update dev->features
 +                       * *before* calling udp_tunnel_get_rx_info,
 +                       * but *after* calling udp_tunnel_drop_rx_info.
 +                       */
 +                      if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
 +                              dev->features = features;
 +                              udp_tunnel_get_rx_info(dev);
 +                      } else {
 +                              udp_tunnel_drop_rx_info(dev);
 +                      }
 +              }
 +
                dev->features = features;
 +      }
  
        return err < 0 ? 0 : 1;
  }
@@@ -7551,12 -7518,6 +7553,12 @@@ int register_netdevice(struct net_devic
         */
        dev->hw_features |= NETIF_F_SOFT_FEATURES;
        dev->features |= NETIF_F_SOFT_FEATURES;
 +
 +      if (dev->netdev_ops->ndo_udp_tunnel_add) {
 +              dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
 +              dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
 +      }
 +
        dev->wanted_features = dev->features & dev->hw_features;
  
        if (!(dev->flags & IFF_LOOPBACK))
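
generic_xdp_tx() and do_xdp_generic() are exported here so drivers that inject skbs outside the normal receive path (tun/virtio style) can honour a generic XDP program themselves. A sketch of such a call site; example_rx() is hypothetical.

static int example_rx(struct net_device *dev, struct sk_buff *skb)
{
	u32 act;

	skb->dev = dev;

	rcu_read_lock();
	act = do_xdp_generic(rcu_dereference(dev->xdp_prog), skb);
	rcu_read_unlock();

	/* Anything but XDP_PASS means the skb was already dropped,
	 * transmitted or redirected; treat it as handled.
	 */
	if (act != XDP_PASS)
		return NET_RX_SUCCESS;

	return netif_rx(skb);
}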
diff --combined net/core/filter.c
index f9add024d92fcec8fb6dacce0a2251336204158b,169974998c7692b063947cb925fede167f2fb817..5912c738a7b272e3fb3eb1d66317fe2df9ce0c6d
@@@ -55,7 -55,6 +55,7 @@@
  #include <net/sock_reuseport.h>
  #include <net/busy_poll.h>
  #include <net/tcp.h>
 +#include <linux/bpf_trace.h>
  
  /**
   *    sk_filter_trim_cap - run a packet through a socket filter
@@@ -514,27 -513,14 +514,27 @@@ do_pass
                                break;
                        }
  
 -                      /* Convert JEQ into JNE when 'jump_true' is next insn. */
 -                      if (fp->jt == 0 && BPF_OP(fp->code) == BPF_JEQ) {
 -                              insn->code = BPF_JMP | BPF_JNE | bpf_src;
 +                      /* Convert some jumps when 'jump_true' is next insn. */
 +                      if (fp->jt == 0) {
 +                              switch (BPF_OP(fp->code)) {
 +                              case BPF_JEQ:
 +                                      insn->code = BPF_JMP | BPF_JNE | bpf_src;
 +                                      break;
 +                              case BPF_JGT:
 +                                      insn->code = BPF_JMP | BPF_JLE | bpf_src;
 +                                      break;
 +                              case BPF_JGE:
 +                                      insn->code = BPF_JMP | BPF_JLT | bpf_src;
 +                                      break;
 +                              default:
 +                                      goto jmp_rest;
 +                              }
 +
                                target = i + fp->jf + 1;
                                BPF_EMIT_JMP;
                                break;
                        }
 -
 +jmp_rest:
                        /* Other jumps are mapped into two insns: Jxx and JA. */
                        target = i + fp->jt + 1;
                        insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
@@@ -1792,8 -1778,6 +1792,8 @@@ static const struct bpf_func_proto bpf_
  struct redirect_info {
        u32 ifindex;
        u32 flags;
 +      struct bpf_map *map;
 +      struct bpf_map *map_to_flush;
  };
  
  static DEFINE_PER_CPU(struct redirect_info, redirect_info);
@@@ -1807,7 -1791,6 +1807,7 @@@ BPF_CALL_2(bpf_redirect, u32, ifindex, 
  
        ri->ifindex = ifindex;
        ri->flags = flags;
 +      ri->map = NULL;
  
        return TC_ACT_REDIRECT;
  }
@@@ -1835,45 -1818,6 +1835,45 @@@ static const struct bpf_func_proto bpf_
        .arg2_type      = ARG_ANYTHING,
  };
  
 +BPF_CALL_3(bpf_sk_redirect_map, struct bpf_map *, map, u32, key, u64, flags)
 +{
 +      struct redirect_info *ri = this_cpu_ptr(&redirect_info);
 +
 +      if (unlikely(flags))
 +              return SK_ABORTED;
 +
 +      ri->ifindex = key;
 +      ri->flags = flags;
 +      ri->map = map;
 +
 +      return SK_REDIRECT;
 +}
 +
 +struct sock *do_sk_redirect_map(void)
 +{
 +      struct redirect_info *ri = this_cpu_ptr(&redirect_info);
 +      struct sock *sk = NULL;
 +
 +      if (ri->map) {
 +              sk = __sock_map_lookup_elem(ri->map, ri->ifindex);
 +
 +              ri->ifindex = 0;
 +              ri->map = NULL;
 +              /* we do not clear flags for future lookup */
 +      }
 +
 +      return sk;
 +}
 +
 +static const struct bpf_func_proto bpf_sk_redirect_map_proto = {
 +      .func           = bpf_sk_redirect_map,
 +      .gpl_only       = false,
 +      .ret_type       = RET_INTEGER,
 +      .arg1_type      = ARG_CONST_MAP_PTR,
 +      .arg2_type      = ARG_ANYTHING,
 +      .arg3_type      = ARG_ANYTHING,
 +};
 +
  BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
  {
        return task_get_classid(skb);
@@@ -2080,8 -2024,8 +2080,8 @@@ static int bpf_skb_proto_4_to_6(struct 
                return ret;
  
        if (skb_is_gso(skb)) {
 -              /* SKB_GSO_UDP stays as is. SKB_GSO_TCPV4 needs to
 -               * be changed into SKB_GSO_TCPV6.
 +              /* SKB_GSO_TCPV4 needs to be changed into
 +               * SKB_GSO_TCPV6.
                 */
                if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
                        skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV4;
@@@ -2116,8 -2060,8 +2116,8 @@@ static int bpf_skb_proto_6_to_4(struct 
                return ret;
  
        if (skb_is_gso(skb)) {
 -              /* SKB_GSO_UDP stays as is. SKB_GSO_TCPV6 needs to
 -               * be changed into SKB_GSO_TCPV4.
 +              /* SKB_GSO_TCPV6 needs to be changed into
 +               * SKB_GSO_TCPV4.
                 */
                if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) {
                        skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV6;
@@@ -2468,180 -2412,6 +2468,180 @@@ static const struct bpf_func_proto bpf_
        .arg2_type      = ARG_ANYTHING,
  };
  
 +static int __bpf_tx_xdp(struct net_device *dev,
 +                      struct bpf_map *map,
 +                      struct xdp_buff *xdp,
 +                      u32 index)
 +{
 +      int err;
 +
 +      if (!dev->netdev_ops->ndo_xdp_xmit) {
 +              return -EOPNOTSUPP;
 +      }
 +
 +      err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp);
 +      if (err)
 +              return err;
 +      if (map)
 +              __dev_map_insert_ctx(map, index);
 +      else
 +              dev->netdev_ops->ndo_xdp_flush(dev);
 +      return 0;
 +}
 +
 +void xdp_do_flush_map(void)
 +{
 +      struct redirect_info *ri = this_cpu_ptr(&redirect_info);
 +      struct bpf_map *map = ri->map_to_flush;
 +
 +      ri->map_to_flush = NULL;
 +      if (map)
 +              __dev_map_flush(map);
 +}
 +EXPORT_SYMBOL_GPL(xdp_do_flush_map);
 +
 +static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
 +                             struct bpf_prog *xdp_prog)
 +{
 +      struct redirect_info *ri = this_cpu_ptr(&redirect_info);
 +      struct bpf_map *map = ri->map;
 +      u32 index = ri->ifindex;
 +      struct net_device *fwd;
 +      int err;
 +
 +      ri->ifindex = 0;
 +      ri->map = NULL;
 +
 +      fwd = __dev_map_lookup_elem(map, index);
 +      if (!fwd) {
 +              err = -EINVAL;
 +              goto err;
 +      }
 +      if (ri->map_to_flush && ri->map_to_flush != map)
 +              xdp_do_flush_map();
 +
 +      err = __bpf_tx_xdp(fwd, map, xdp, index);
 +      if (unlikely(err))
 +              goto err;
 +
 +      ri->map_to_flush = map;
 +      _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index);
 +      return 0;
 +err:
 +      _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err);
 +      return err;
 +}
 +
 +int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
 +                  struct bpf_prog *xdp_prog)
 +{
 +      struct redirect_info *ri = this_cpu_ptr(&redirect_info);
 +      struct net_device *fwd;
 +      u32 index = ri->ifindex;
 +      int err;
 +
 +      if (ri->map)
 +              return xdp_do_redirect_map(dev, xdp, xdp_prog);
 +
 +      fwd = dev_get_by_index_rcu(dev_net(dev), index);
 +      ri->ifindex = 0;
 +      if (unlikely(!fwd)) {
 +              err = -EINVAL;
 +              goto err;
 +      }
 +
 +      err = __bpf_tx_xdp(fwd, NULL, xdp, 0);
 +      if (unlikely(err))
 +              goto err;
 +
 +      _trace_xdp_redirect(dev, xdp_prog, index);
 +      return 0;
 +err:
 +      _trace_xdp_redirect_err(dev, xdp_prog, index, err);
 +      return err;
 +}
 +EXPORT_SYMBOL_GPL(xdp_do_redirect);
 +
 +int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
 +                          struct bpf_prog *xdp_prog)
 +{
 +      struct redirect_info *ri = this_cpu_ptr(&redirect_info);
 +      u32 index = ri->ifindex;
 +      struct net_device *fwd;
 +      unsigned int len;
 +      int err = 0;
 +
 +      fwd = dev_get_by_index_rcu(dev_net(dev), index);
 +      ri->ifindex = 0;
 +      if (unlikely(!fwd)) {
 +              err = -EINVAL;
 +              goto err;
 +      }
 +
 +      if (unlikely(!(fwd->flags & IFF_UP))) {
 +              err = -ENETDOWN;
 +              goto err;
 +      }
 +
 +      len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN;
 +      if (skb->len > len) {
 +              err = -EMSGSIZE;
 +              goto err;
 +      }
 +
 +      skb->dev = fwd;
 +      _trace_xdp_redirect(dev, xdp_prog, index);
 +      return 0;
 +err:
 +      _trace_xdp_redirect_err(dev, xdp_prog, index, err);
 +      return err;
 +}
 +EXPORT_SYMBOL_GPL(xdp_do_generic_redirect);
 +
 +BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags)
 +{
 +      struct redirect_info *ri = this_cpu_ptr(&redirect_info);
 +
 +      if (unlikely(flags))
 +              return XDP_ABORTED;
 +
 +      ri->ifindex = ifindex;
 +      ri->flags = flags;
 +
 +      return XDP_REDIRECT;
 +}
 +
 +static const struct bpf_func_proto bpf_xdp_redirect_proto = {
 +      .func           = bpf_xdp_redirect,
 +      .gpl_only       = false,
 +      .ret_type       = RET_INTEGER,
 +      .arg1_type      = ARG_ANYTHING,
 +      .arg2_type      = ARG_ANYTHING,
 +};
 +
 +BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags)
 +{
 +      struct redirect_info *ri = this_cpu_ptr(&redirect_info);
 +
 +      if (unlikely(flags))
 +              return XDP_ABORTED;
 +
 +      ri->ifindex = ifindex;
 +      ri->flags = flags;
 +      ri->map = map;
 +
 +      return XDP_REDIRECT;
 +}
 +
 +static const struct bpf_func_proto bpf_xdp_redirect_map_proto = {
 +      .func           = bpf_xdp_redirect_map,
 +      .gpl_only       = false,
 +      .ret_type       = RET_INTEGER,
 +      .arg1_type      = ARG_CONST_MAP_PTR,
 +      .arg2_type      = ARG_ANYTHING,
 +      .arg3_type      = ARG_ANYTHING,
 +};
 +
  bool bpf_helper_changes_pkt_data(void *func)
  {
        if (func == bpf_skb_vlan_push ||
@@@ -3066,15 -2836,12 +3066,12 @@@ BPF_CALL_5(bpf_setsockopt, struct bpf_s
                   sk->sk_prot->setsockopt == tcp_setsockopt) {
                if (optname == TCP_CONGESTION) {
                        char name[TCP_CA_NAME_MAX];
+                       bool reinit = bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN;
  
                        strncpy(name, optval, min_t(long, optlen,
                                                    TCP_CA_NAME_MAX-1));
                        name[TCP_CA_NAME_MAX-1] = 0;
-                       ret = tcp_set_congestion_control(sk, name, false);
-                       if (!ret && bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN)
-                               /* replacing an existing ca */
-                               tcp_reinit_congestion_control(sk,
-                                       inet_csk(sk)->icsk_ca_ops);
+                       ret = tcp_set_congestion_control(sk, name, false, reinit);
                } else {
                        struct tcp_sock *tp = tcp_sk(sk);
  
                                ret = -EINVAL;
                        }
                }
-               ret = -EINVAL;
  #endif
        } else {
                ret = -EINVAL;
@@@ -3149,20 -2915,6 +3145,20 @@@ bpf_base_func_proto(enum bpf_func_id fu
        }
  }
  
 +static const struct bpf_func_proto *
 +sock_filter_func_proto(enum bpf_func_id func_id)
 +{
 +      switch (func_id) {
 +      /* inet and inet6 sockets are created in a process
 +       * context so there is always a valid uid/gid
 +       */
 +      case BPF_FUNC_get_current_uid_gid:
 +              return &bpf_get_current_uid_gid_proto;
 +      default:
 +              return bpf_base_func_proto(func_id);
 +      }
 +}
 +
  static const struct bpf_func_proto *
  sk_filter_func_proto(enum bpf_func_id func_id)
  {
@@@ -3255,10 -3007,6 +3251,10 @@@ xdp_func_proto(enum bpf_func_id func_id
                return &bpf_get_smp_processor_id_proto;
        case BPF_FUNC_xdp_adjust_head:
                return &bpf_xdp_adjust_head_proto;
 +      case BPF_FUNC_redirect:
 +              return &bpf_xdp_redirect_proto;
 +      case BPF_FUNC_redirect_map:
 +              return &bpf_xdp_redirect_map_proto;
        default:
                return bpf_base_func_proto(func_id);
        }
@@@ -3297,32 -3045,6 +3293,32 @@@ static const struct bpf_func_proto 
        switch (func_id) {
        case BPF_FUNC_setsockopt:
                return &bpf_setsockopt_proto;
 +      case BPF_FUNC_sock_map_update:
 +              return &bpf_sock_map_update_proto;
 +      default:
 +              return bpf_base_func_proto(func_id);
 +      }
 +}
 +
 +static const struct bpf_func_proto *sk_skb_func_proto(enum bpf_func_id func_id)
 +{
 +      switch (func_id) {
 +      case BPF_FUNC_skb_store_bytes:
 +              return &bpf_skb_store_bytes_proto;
 +      case BPF_FUNC_skb_load_bytes:
 +              return &bpf_skb_load_bytes_proto;
 +      case BPF_FUNC_skb_pull_data:
 +              return &bpf_skb_pull_data_proto;
 +      case BPF_FUNC_skb_change_tail:
 +              return &bpf_skb_change_tail_proto;
 +      case BPF_FUNC_skb_change_head:
 +              return &bpf_skb_change_head_proto;
 +      case BPF_FUNC_get_socket_cookie:
 +              return &bpf_get_socket_cookie_proto;
 +      case BPF_FUNC_get_socket_uid:
 +              return &bpf_get_socket_uid_proto;
 +      case BPF_FUNC_sk_redirect_map:
 +              return &bpf_sk_redirect_map_proto;
        default:
                return bpf_base_func_proto(func_id);
        }
@@@ -3380,10 -3102,6 +3376,10 @@@ static bool bpf_skb_is_valid_access(in
                if (off + size > offsetofend(struct __sk_buff, cb[4]))
                        return false;
                break;
 +      case bpf_ctx_range_till(struct __sk_buff, remote_ip6[0], remote_ip6[3]):
 +      case bpf_ctx_range_till(struct __sk_buff, local_ip6[0], local_ip6[3]):
 +      case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4):
 +      case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4):
        case bpf_ctx_range(struct __sk_buff, data):
        case bpf_ctx_range(struct __sk_buff, data_end):
                if (size != size_default)
@@@ -3412,7 -3130,6 +3408,7 @@@ static bool sk_filter_is_valid_access(i
        case bpf_ctx_range(struct __sk_buff, tc_classid):
        case bpf_ctx_range(struct __sk_buff, data):
        case bpf_ctx_range(struct __sk_buff, data_end):
 +      case bpf_ctx_range_till(struct __sk_buff, family, local_port):
                return false;
        }
  
@@@ -3434,7 -3151,6 +3430,7 @@@ static bool lwt_is_valid_access(int off
  {
        switch (off) {
        case bpf_ctx_range(struct __sk_buff, tc_classid):
 +      case bpf_ctx_range_till(struct __sk_buff, family, local_port):
                return false;
        }
  
@@@ -3468,8 -3184,6 +3464,8 @@@ static bool sock_filter_is_valid_access
        if (type == BPF_WRITE) {
                switch (off) {
                case offsetof(struct bpf_sock, bound_dev_if):
 +              case offsetof(struct bpf_sock, mark):
 +              case offsetof(struct bpf_sock, priority):
                        break;
                default:
                        return false;
        return true;
  }
  
 -static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
 -                             const struct bpf_prog *prog)
 +static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write,
 +                              const struct bpf_prog *prog, int drop_verdict)
  {
        struct bpf_insn *insn = insn_buf;
  
         * return TC_ACT_SHOT;
         */
        *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2);
 -      *insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, TC_ACT_SHOT);
 +      *insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, drop_verdict);
        *insn++ = BPF_EXIT_INSN();
  
        /* restore: */
        return insn - insn_buf;
  }
  
 +static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
 +                             const struct bpf_prog *prog)
 +{
 +      return bpf_unclone_prologue(insn_buf, direct_write, prog, TC_ACT_SHOT);
 +}
 +
  static bool tc_cls_act_is_valid_access(int off, int size,
                                       enum bpf_access_type type,
                                       struct bpf_insn_access_aux *info)
        case bpf_ctx_range(struct __sk_buff, data_end):
                info->reg_type = PTR_TO_PACKET_END;
                break;
 +      case bpf_ctx_range_till(struct __sk_buff, family, local_port):
 +              return false;
        }
  
        return bpf_skb_is_valid_access(off, size, type, info);
@@@ -3630,41 -3336,6 +3626,41 @@@ static bool sock_ops_is_valid_access(in
        return __is_valid_sock_ops_access(off, size);
  }
  
 +static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write,
 +                         const struct bpf_prog *prog)
 +{
 +      return bpf_unclone_prologue(insn_buf, direct_write, prog, SK_DROP);
 +}
 +
 +static bool sk_skb_is_valid_access(int off, int size,
 +                                 enum bpf_access_type type,
 +                                 struct bpf_insn_access_aux *info)
 +{
 +      if (type == BPF_WRITE) {
 +              switch (off) {
 +              case bpf_ctx_range(struct __sk_buff, mark):
 +              case bpf_ctx_range(struct __sk_buff, tc_index):
 +              case bpf_ctx_range(struct __sk_buff, priority):
 +                      break;
 +              default:
 +                      return false;
 +              }
 +      }
 +
 +      switch (off) {
 +      case bpf_ctx_range(struct __sk_buff, tc_classid):
 +              return false;
 +      case bpf_ctx_range(struct __sk_buff, data):
 +              info->reg_type = PTR_TO_PACKET;
 +              break;
 +      case bpf_ctx_range(struct __sk_buff, data_end):
 +              info->reg_type = PTR_TO_PACKET_END;
 +              break;
 +      }
 +
 +      return bpf_skb_is_valid_access(off, size, type, info);
 +}
 +
  static u32 bpf_convert_ctx_access(enum bpf_access_type type,
                                  const struct bpf_insn *si,
                                  struct bpf_insn *insn_buf,
                *insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
  #endif
                break;
 +      case offsetof(struct __sk_buff, family):
 +              BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2);
 +
 +              *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
 +                                    si->dst_reg, si->src_reg,
 +                                    offsetof(struct sk_buff, sk));
 +              *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
 +                                    bpf_target_off(struct sock_common,
 +                                                   skc_family,
 +                                                   2, target_size));
 +              break;
 +      case offsetof(struct __sk_buff, remote_ip4):
 +              BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4);
 +
 +              *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
 +                                    si->dst_reg, si->src_reg,
 +                                    offsetof(struct sk_buff, sk));
 +              *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
 +                                    bpf_target_off(struct sock_common,
 +                                                   skc_daddr,
 +                                                   4, target_size));
 +              break;
 +      case offsetof(struct __sk_buff, local_ip4):
 +              BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
 +                                        skc_rcv_saddr) != 4);
 +
 +              *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
 +                                    si->dst_reg, si->src_reg,
 +                                    offsetof(struct sk_buff, sk));
 +              *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
 +                                    bpf_target_off(struct sock_common,
 +                                                   skc_rcv_saddr,
 +                                                   4, target_size));
 +              break;
 +      case offsetof(struct __sk_buff, remote_ip6[0]) ...
 +           offsetof(struct __sk_buff, remote_ip6[3]):
 +#if IS_ENABLED(CONFIG_IPV6)
 +              BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
 +                                        skc_v6_daddr.s6_addr32[0]) != 4);
 +
 +              off = si->off;
 +              off -= offsetof(struct __sk_buff, remote_ip6[0]);
 +
 +              *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
 +                                    si->dst_reg, si->src_reg,
 +                                    offsetof(struct sk_buff, sk));
 +              *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
 +                                    offsetof(struct sock_common,
 +                                             skc_v6_daddr.s6_addr32[0]) +
 +                                    off);
 +#else
 +              *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
 +#endif
 +              break;
 +      case offsetof(struct __sk_buff, local_ip6[0]) ...
 +           offsetof(struct __sk_buff, local_ip6[3]):
 +#if IS_ENABLED(CONFIG_IPV6)
 +              BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
 +                                        skc_v6_rcv_saddr.s6_addr32[0]) != 4);
 +
 +              off = si->off;
 +              off -= offsetof(struct __sk_buff, local_ip6[0]);
 +
 +              *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
 +                                    si->dst_reg, si->src_reg,
 +                                    offsetof(struct sk_buff, sk));
 +              *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
 +                                    offsetof(struct sock_common,
 +                                             skc_v6_rcv_saddr.s6_addr32[0]) +
 +                                    off);
 +#else
 +              *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
 +#endif
 +              break;
 +
 +      case offsetof(struct __sk_buff, remote_port):
 +              BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2);
 +
 +              *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
 +                                    si->dst_reg, si->src_reg,
 +                                    offsetof(struct sk_buff, sk));
 +              *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
 +                                    bpf_target_off(struct sock_common,
 +                                                   skc_dport,
 +                                                   2, target_size));
 +#ifndef __BIG_ENDIAN_BITFIELD
 +              *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16);
 +#endif
 +              break;
 +
 +      case offsetof(struct __sk_buff, local_port):
 +              BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2);
 +
 +              *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
 +                                    si->dst_reg, si->src_reg,
 +                                    offsetof(struct sk_buff, sk));
 +              *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
 +                                    bpf_target_off(struct sock_common,
 +                                                   skc_num, 2, target_size));
 +              break;
        }
  
        return insn - insn_buf;
@@@ -3974,28 -3545,6 +3970,28 @@@ static u32 sock_filter_convert_ctx_acce
                                      offsetof(struct sock, sk_bound_dev_if));
                break;
  
 +      case offsetof(struct bpf_sock, mark):
 +              BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_mark) != 4);
 +
 +              if (type == BPF_WRITE)
 +                      *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
 +                                      offsetof(struct sock, sk_mark));
 +              else
 +                      *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
 +                                    offsetof(struct sock, sk_mark));
 +              break;
 +
 +      case offsetof(struct bpf_sock, priority):
 +              BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_priority) != 4);
 +
 +              if (type == BPF_WRITE)
 +                      *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
 +                                      offsetof(struct sock, sk_priority));
 +              else
 +                      *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
 +                                    offsetof(struct sock, sk_priority));
 +              break;
 +
        case offsetof(struct bpf_sock, family):
                BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2);
  
@@@ -4245,7 -3794,7 +4241,7 @@@ const struct bpf_verifier_ops lwt_xmit_
  };
  
  const struct bpf_verifier_ops cg_sock_prog_ops = {
 -      .get_func_proto         = bpf_base_func_proto,
 +      .get_func_proto         = sock_filter_func_proto,
        .is_valid_access        = sock_filter_is_valid_access,
        .convert_ctx_access     = sock_filter_convert_ctx_access,
  };
@@@ -4256,13 -3805,6 +4252,13 @@@ const struct bpf_verifier_ops sock_ops_
        .convert_ctx_access     = sock_ops_convert_ctx_access,
  };
  
 +const struct bpf_verifier_ops sk_skb_prog_ops = {
 +      .get_func_proto         = sk_skb_func_proto,
 +      .is_valid_access        = sk_skb_is_valid_access,
 +      .convert_ctx_access     = bpf_convert_ctx_access,
 +      .gen_prologue           = sk_skb_prologue,
 +};
 +
  int sk_detach_filter(struct sock *sk)
  {
        int ret = -ENOENT;
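
[Editor's sketch, not part of the diff] The filter.c changes above add the XDP redirect helpers (bpf_redirect, bpf_redirect_map) and the kernel plumbing in __bpf_tx_xdp()/xdp_do_redirect*() with per-CPU redirect_info and batched map flushes. A minimal BPF program that exercises bpf_redirect_map() against a devmap; the SEC()/bpf_map_def loader conventions follow the samples/bpf style of this era and are assumptions, as is the local helper declaration:

/* Compile with: clang -O2 -target bpf -c xdp_redirect_map_kern.c
 * Requires uapi headers that already contain BPF_MAP_TYPE_DEVMAP and
 * BPF_FUNC_redirect_map (i.e. a tree with the changes above).
 */
#include <linux/bpf.h>

#define SEC(name) __attribute__((section(name), used))

struct bpf_map_def {
	unsigned int type;
	unsigned int key_size;
	unsigned int value_size;
	unsigned int max_entries;
};

/* key = slot index, value = ifindex of the egress device */
struct bpf_map_def SEC("maps") tx_port = {
	.type		= BPF_MAP_TYPE_DEVMAP,
	.key_size	= sizeof(unsigned int),
	.value_size	= sizeof(unsigned int),
	.max_entries	= 1,
};

static int (*bpf_redirect_map)(void *map, unsigned int key,
			       unsigned long long flags) =
	(void *) BPF_FUNC_redirect_map;

SEC("xdp")
int xdp_redirect_map_prog(struct xdp_md *ctx)
{
	/* Redirect every frame to the device stored at slot 0; the
	 * kernel-side xdp_do_redirect_map() above does the lookup and
	 * defers the device flush via ri->map_to_flush.
	 */
	return bpf_redirect_map(&tx_port, 0, 0);
}

char _license[] SEC("license") = "GPL";
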
diff --combined net/core/skbuff.c
index 917da73d3ab3b82163cf0a9ee944da09cb5a391f,e0755660628407e5a1cefc9ed2c4a725f68628a0..246ca1c81715787bb8e58a424670965ee9fc2d95
@@@ -158,6 -158,31 +158,6 @@@ out
   *
   */
  
 -struct sk_buff *__alloc_skb_head(gfp_t gfp_mask, int node)
 -{
 -      struct sk_buff *skb;
 -
 -      /* Get the HEAD */
 -      skb = kmem_cache_alloc_node(skbuff_head_cache,
 -                                  gfp_mask & ~__GFP_DMA, node);
 -      if (!skb)
 -              goto out;
 -
 -      /*
 -       * Only clear those fields we need to clear, not those that we will
 -       * actually initialise below. Hence, don't put any more fields after
 -       * the tail pointer in struct sk_buff!
 -       */
 -      memset(skb, 0, offsetof(struct sk_buff, tail));
 -      skb->head = NULL;
 -      skb->truesize = sizeof(struct sk_buff);
 -      refcount_set(&skb->users, 1);
 -
 -      skb->mac_header = (typeof(skb->mac_header))~0U;
 -out:
 -      return skb;
 -}
 -
  /**
   *    __alloc_skb     -       allocate a network buffer
   *    @size: size to allocate
@@@ -567,10 -592,21 +567,10 @@@ static void skb_release_data(struct sk_
        for (i = 0; i < shinfo->nr_frags; i++)
                __skb_frag_unref(&shinfo->frags[i]);
  
 -      /*
 -       * If skb buf is from userspace, we need to notify the caller
 -       * the lower device DMA has done;
 -       */
 -      if (shinfo->tx_flags & SKBTX_DEV_ZEROCOPY) {
 -              struct ubuf_info *uarg;
 -
 -              uarg = shinfo->destructor_arg;
 -              if (uarg->callback)
 -                      uarg->callback(uarg, true);
 -      }
 -
        if (shinfo->frag_list)
                kfree_skb_list(shinfo->frag_list);
  
 +      skb_zcopy_clear(skb, true);
        skb_free_head(skb);
  }
  
@@@ -684,7 -720,14 +684,7 @@@ EXPORT_SYMBOL(kfree_skb_list)
   */
  void skb_tx_error(struct sk_buff *skb)
  {
 -      if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
 -              struct ubuf_info *uarg;
 -
 -              uarg = skb_shinfo(skb)->destructor_arg;
 -              if (uarg->callback)
 -                      uarg->callback(uarg, false);
 -              skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY;
 -      }
 +      skb_zcopy_clear(skb, true);
  }
  EXPORT_SYMBOL(skb_tx_error);
  
@@@ -719,7 -762,8 +719,7 @@@ void consume_stateless_skb(struct sk_bu
                return;
  
        trace_consume_skb(skb);
 -      if (likely(skb->head))
 -              skb_release_data(skb);
 +      skb_release_data(skb);
        kfree_skbmem(skb);
  }
  
@@@ -897,273 -941,6 +897,273 @@@ struct sk_buff *skb_morph(struct sk_buf
  }
  EXPORT_SYMBOL_GPL(skb_morph);
  
 +static int mm_account_pinned_pages(struct mmpin *mmp, size_t size)
 +{
 +      unsigned long max_pg, num_pg, new_pg, old_pg;
 +      struct user_struct *user;
 +
 +      if (capable(CAP_IPC_LOCK) || !size)
 +              return 0;
 +
 +      num_pg = (size >> PAGE_SHIFT) + 2;      /* worst case */
 +      max_pg = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 +      user = mmp->user ? : current_user();
 +
 +      do {
 +              old_pg = atomic_long_read(&user->locked_vm);
 +              new_pg = old_pg + num_pg;
 +              if (new_pg > max_pg)
 +                      return -ENOBUFS;
 +      } while (atomic_long_cmpxchg(&user->locked_vm, old_pg, new_pg) !=
 +               old_pg);
 +
 +      if (!mmp->user) {
 +              mmp->user = get_uid(user);
 +              mmp->num_pg = num_pg;
 +      } else {
 +              mmp->num_pg += num_pg;
 +      }
 +
 +      return 0;
 +}
 +
 +static void mm_unaccount_pinned_pages(struct mmpin *mmp)
 +{
 +      if (mmp->user) {
 +              atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm);
 +              free_uid(mmp->user);
 +      }
 +}
 +
 +struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
 +{
 +      struct ubuf_info *uarg;
 +      struct sk_buff *skb;
 +
 +      WARN_ON_ONCE(!in_task());
 +
 +      if (!sock_flag(sk, SOCK_ZEROCOPY))
 +              return NULL;
 +
 +      skb = sock_omalloc(sk, 0, GFP_KERNEL);
 +      if (!skb)
 +              return NULL;
 +
 +      BUILD_BUG_ON(sizeof(*uarg) > sizeof(skb->cb));
 +      uarg = (void *)skb->cb;
 +      uarg->mmp.user = NULL;
 +
 +      if (mm_account_pinned_pages(&uarg->mmp, size)) {
 +              kfree_skb(skb);
 +              return NULL;
 +      }
 +
 +      uarg->callback = sock_zerocopy_callback;
 +      uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1;
 +      uarg->len = 1;
 +      uarg->bytelen = size;
 +      uarg->zerocopy = 1;
 +      atomic_set(&uarg->refcnt, 0);
 +      sock_hold(sk);
 +
 +      return uarg;
 +}
 +EXPORT_SYMBOL_GPL(sock_zerocopy_alloc);
 +
 +static inline struct sk_buff *skb_from_uarg(struct ubuf_info *uarg)
 +{
 +      return container_of((void *)uarg, struct sk_buff, cb);
 +}
 +
 +struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
 +                                      struct ubuf_info *uarg)
 +{
 +      if (uarg) {
 +              const u32 byte_limit = 1 << 19;         /* limit to a few TSO */
 +              u32 bytelen, next;
 +
 +              /* realloc only when socket is locked (TCP, UDP cork),
 +               * so uarg->len and sk_zckey access is serialized
 +               */
 +              if (!sock_owned_by_user(sk)) {
 +                      WARN_ON_ONCE(1);
 +                      return NULL;
 +              }
 +
 +              bytelen = uarg->bytelen + size;
 +              if (uarg->len == USHRT_MAX - 1 || bytelen > byte_limit) {
 +                      /* TCP can create new skb to attach new uarg */
 +                      if (sk->sk_type == SOCK_STREAM)
 +                              goto new_alloc;
 +                      return NULL;
 +              }
 +
 +              next = (u32)atomic_read(&sk->sk_zckey);
 +              if ((u32)(uarg->id + uarg->len) == next) {
 +                      if (mm_account_pinned_pages(&uarg->mmp, size))
 +                              return NULL;
 +                      uarg->len++;
 +                      uarg->bytelen = bytelen;
 +                      atomic_set(&sk->sk_zckey, ++next);
 +                      return uarg;
 +              }
 +      }
 +
 +new_alloc:
 +      return sock_zerocopy_alloc(sk, size);
 +}
 +EXPORT_SYMBOL_GPL(sock_zerocopy_realloc);
 +
 +static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len)
 +{
 +      struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
 +      u32 old_lo, old_hi;
 +      u64 sum_len;
 +
 +      old_lo = serr->ee.ee_info;
 +      old_hi = serr->ee.ee_data;
 +      sum_len = old_hi - old_lo + 1ULL + len;
 +
 +      if (sum_len >= (1ULL << 32))
 +              return false;
 +
 +      if (lo != old_hi + 1)
 +              return false;
 +
 +      serr->ee.ee_data += len;
 +      return true;
 +}
 +
 +void sock_zerocopy_callback(struct ubuf_info *uarg, bool success)
 +{
 +      struct sk_buff *tail, *skb = skb_from_uarg(uarg);
 +      struct sock_exterr_skb *serr;
 +      struct sock *sk = skb->sk;
 +      struct sk_buff_head *q;
 +      unsigned long flags;
 +      u32 lo, hi;
 +      u16 len;
 +
 +      mm_unaccount_pinned_pages(&uarg->mmp);
 +
 +      /* if !len, there was only 1 call, and it was aborted
 +       * so do not queue a completion notification
 +       */
 +      if (!uarg->len || sock_flag(sk, SOCK_DEAD))
 +              goto release;
 +
 +      len = uarg->len;
 +      lo = uarg->id;
 +      hi = uarg->id + len - 1;
 +
 +      serr = SKB_EXT_ERR(skb);
 +      memset(serr, 0, sizeof(*serr));
 +      serr->ee.ee_errno = 0;
 +      serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY;
 +      serr->ee.ee_data = hi;
 +      serr->ee.ee_info = lo;
 +      if (!success)
 +              serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED;
 +
 +      q = &sk->sk_error_queue;
 +      spin_lock_irqsave(&q->lock, flags);
 +      tail = skb_peek_tail(q);
 +      if (!tail || SKB_EXT_ERR(tail)->ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY ||
 +          !skb_zerocopy_notify_extend(tail, lo, len)) {
 +              __skb_queue_tail(q, skb);
 +              skb = NULL;
 +      }
 +      spin_unlock_irqrestore(&q->lock, flags);
 +
 +      sk->sk_error_report(sk);
 +
 +release:
 +      consume_skb(skb);
 +      sock_put(sk);
 +}
 +EXPORT_SYMBOL_GPL(sock_zerocopy_callback);
 +
 +void sock_zerocopy_put(struct ubuf_info *uarg)
 +{
 +      if (uarg && atomic_dec_and_test(&uarg->refcnt)) {
 +              if (uarg->callback)
 +                      uarg->callback(uarg, uarg->zerocopy);
 +              else
 +                      consume_skb(skb_from_uarg(uarg));
 +      }
 +}
 +EXPORT_SYMBOL_GPL(sock_zerocopy_put);
 +
 +void sock_zerocopy_put_abort(struct ubuf_info *uarg)
 +{
 +      if (uarg) {
 +              struct sock *sk = skb_from_uarg(uarg)->sk;
 +
 +              atomic_dec(&sk->sk_zckey);
 +              uarg->len--;
 +
 +              /* sock_zerocopy_put expects a ref. Most sockets take one per
 +               * skb, which is zero on abort. tcp_sendmsg holds one extra, to
 +               * avoid an skb send inside the main loop triggering uarg free.
 +               */
 +              if (sk->sk_type != SOCK_STREAM)
 +                      atomic_inc(&uarg->refcnt);
 +
 +              sock_zerocopy_put(uarg);
 +      }
 +}
 +EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort);
 +
 +extern int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
 +                                 struct iov_iter *from, size_t length);
 +
 +int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
 +                           struct msghdr *msg, int len,
 +                           struct ubuf_info *uarg)
 +{
 +      struct ubuf_info *orig_uarg = skb_zcopy(skb);
 +      struct iov_iter orig_iter = msg->msg_iter;
 +      int err, orig_len = skb->len;
 +
 +      /* An skb can only point to one uarg. This edge case happens when
 +       * TCP appends to an skb, but zerocopy_realloc triggered a new alloc.
 +       */
 +      if (orig_uarg && uarg != orig_uarg)
 +              return -EEXIST;
 +
 +      err = __zerocopy_sg_from_iter(sk, skb, &msg->msg_iter, len);
 +      if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
 +              /* Streams do not free skb on error. Reset to prev state. */
 +              msg->msg_iter = orig_iter;
 +              ___pskb_trim(skb, orig_len);
 +              return err;
 +      }
 +
 +      skb_zcopy_set(skb, uarg);
 +      return skb->len - orig_len;
 +}
 +EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream);
 +
 +static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig,
 +                            gfp_t gfp_mask)
 +{
 +      if (skb_zcopy(orig)) {
 +              if (skb_zcopy(nskb)) {
 +                      /* !gfp_mask callers are verified to !skb_zcopy(nskb) */
 +                      if (!gfp_mask) {
 +                              WARN_ON_ONCE(1);
 +                              return -ENOMEM;
 +                      }
 +                      if (skb_uarg(nskb) == skb_uarg(orig))
 +                              return 0;
 +                      if (skb_copy_ubufs(nskb, GFP_ATOMIC))
 +                              return -EIO;
 +              }
 +              skb_zcopy_set(nskb, skb_uarg(orig));
 +      }
 +      return 0;
 +}
 +
  /**
   *    skb_copy_ubufs  -       copy userspace skb frags buffers to kernel
   *    @skb: the skb to modify
   */
  int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
  {
 -      int i;
        int num_frags = skb_shinfo(skb)->nr_frags;
        struct page *page, *head = NULL;
 -      struct ubuf_info *uarg = skb_shinfo(skb)->destructor_arg;
 +      int i, new_frags;
 +      u32 d_off;
  
 -      for (i = 0; i < num_frags; i++) {
 -              u8 *vaddr;
 -              skb_frag_t *f = &skb_shinfo(skb)->frags[i];
 +      if (!num_frags)
 +              return 0;
 +
 +      if (skb_shared(skb) || skb_unclone(skb, gfp_mask))
 +              return -EINVAL;
  
 +      new_frags = (__skb_pagelen(skb) + PAGE_SIZE - 1) >> PAGE_SHIFT;
 +      for (i = 0; i < new_frags; i++) {
                page = alloc_page(gfp_mask);
                if (!page) {
                        while (head) {
                        }
                        return -ENOMEM;
                }
 -              vaddr = kmap_atomic(skb_frag_page(f));
 -              memcpy(page_address(page),
 -                     vaddr + f->page_offset, skb_frag_size(f));
 -              kunmap_atomic(vaddr);
                set_page_private(page, (unsigned long)head);
                head = page;
        }
  
 +      page = head;
 +      d_off = 0;
 +      for (i = 0; i < num_frags; i++) {
 +              skb_frag_t *f = &skb_shinfo(skb)->frags[i];
 +              u32 p_off, p_len, copied;
 +              struct page *p;
 +              u8 *vaddr;
 +
 +              skb_frag_foreach_page(f, f->page_offset, skb_frag_size(f),
 +                                    p, p_off, p_len, copied) {
 +                      u32 copy, done = 0;
 +                      vaddr = kmap_atomic(p);
 +
 +                      while (done < p_len) {
 +                              if (d_off == PAGE_SIZE) {
 +                                      d_off = 0;
 +                                      page = (struct page *)page_private(page);
 +                              }
 +                              copy = min_t(u32, PAGE_SIZE - d_off, p_len - done);
 +                              memcpy(page_address(page) + d_off,
 +                                     vaddr + p_off + done, copy);
 +                              done += copy;
 +                              d_off += copy;
 +                      }
 +                      kunmap_atomic(vaddr);
 +              }
 +      }
 +
        /* skb frags release userspace buffers */
        for (i = 0; i < num_frags; i++)
                skb_frag_unref(skb, i);
  
 -      uarg->callback(uarg, false);
 -
        /* skb frags point to kernel buffers */
 -      for (i = num_frags - 1; i >= 0; i--) {
 -              __skb_fill_page_desc(skb, i, head, 0,
 -                                   skb_shinfo(skb)->frags[i].size);
 +      for (i = 0; i < new_frags - 1; i++) {
 +              __skb_fill_page_desc(skb, i, head, 0, PAGE_SIZE);
                head = (struct page *)page_private(head);
        }
 +      __skb_fill_page_desc(skb, new_frags - 1, head, 0, d_off);
 +      skb_shinfo(skb)->nr_frags = new_frags;
  
 -      skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY;
 +      skb_zcopy_clear(skb, false);
        return 0;
  }
  EXPORT_SYMBOL_GPL(skb_copy_ubufs);
@@@ -1408,8 -1158,7 +1408,8 @@@ struct sk_buff *__pskb_copy_fclone(stru
        if (skb_shinfo(skb)->nr_frags) {
                int i;
  
 -              if (skb_orphan_frags(skb, gfp_mask)) {
 +              if (skb_orphan_frags(skb, gfp_mask) ||
 +                  skb_zerocopy_clone(n, skb, gfp_mask)) {
                        kfree_skb(n);
                        n = NULL;
                        goto out;
@@@ -1486,10 -1235,9 +1486,10 @@@ int pskb_expand_head(struct sk_buff *sk
         * be since all we did is relocate the values
         */
        if (skb_cloned(skb)) {
 -              /* copy this zero copy skb frags */
                if (skb_orphan_frags(skb, gfp_mask))
                        goto nofrags;
 +              if (skb_zcopy(skb))
 +                      atomic_inc(&skb_uarg(skb)->refcnt);
                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
                        skb_frag_ref(skb, i);
  
@@@ -1615,18 -1363,20 +1615,20 @@@ struct sk_buff *skb_copy_expand(const s
  EXPORT_SYMBOL(skb_copy_expand);
  
  /**
-  *    skb_pad                 -       zero pad the tail of an skb
+  *    __skb_pad               -       zero pad the tail of an skb
   *    @skb: buffer to pad
   *    @pad: space to pad
+  *    @free_on_error: free buffer on error
   *
   *    Ensure that a buffer is followed by a padding area that is zero
   *    filled. Used by network drivers which may DMA or transfer data
   *    beyond the buffer end onto the wire.
   *
-  *    May return error in out of memory cases. The skb is freed on error.
+  *    May return error in out of memory cases. The skb is freed on error
+  *    if @free_on_error is true.
   */
  
- int skb_pad(struct sk_buff *skb, int pad)
+ int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error)
  {
        int err;
        int ntail;
        return 0;
  
  free_skb:
-       kfree_skb(skb);
+       if (free_on_error)
+               kfree_skb(skb);
        return err;
  }
- EXPORT_SYMBOL(skb_pad);
+ EXPORT_SYMBOL(__skb_pad);
  
  /**
   *    pskb_put - add data to the tail of a potentially fragmented buffer
@@@ -1971,8 -1722,6 +1974,8 @@@ pull_pages
                        if (eat) {
                                skb_shinfo(skb)->frags[k].page_offset += eat;
                                skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat);
 +                              if (!i)
 +                                      goto end;
                                eat = 0;
                        }
                        k++;
        }
        skb_shinfo(skb)->nr_frags = k;
  
 +end:
        skb->tail     += delta;
        skb->data_len -= delta;
  
 +      if (!skb->data_len)
 +              skb_zcopy_clear(skb, false);
 +
        return skb_tail_pointer(skb);
  }
  EXPORT_SYMBOL(__pskb_pull_tail);
@@@ -2034,20 -1779,16 +2037,20 @@@ int skb_copy_bits(const struct sk_buff 
  
                end = start + skb_frag_size(f);
                if ((copy = end - offset) > 0) {
 +                      u32 p_off, p_len, copied;
 +                      struct page *p;
                        u8 *vaddr;
  
                        if (copy > len)
                                copy = len;
  
 -                      vaddr = kmap_atomic(skb_frag_page(f));
 -                      memcpy(to,
 -                             vaddr + f->page_offset + offset - start,
 -                             copy);
 -                      kunmap_atomic(vaddr);
 +                      skb_frag_foreach_page(f,
 +                                            f->page_offset + offset - start,
 +                                            copy, p, p_off, p_len, copied) {
 +                              vaddr = kmap_atomic(p);
 +                              memcpy(to + copied, vaddr + p_off, p_len);
 +                              kunmap_atomic(vaddr);
 +                      }
  
                        if ((len -= copy) == 0)
                                return 0;
@@@ -2267,107 -2008,6 +2270,107 @@@ int skb_splice_bits(struct sk_buff *skb
  }
  EXPORT_SYMBOL_GPL(skb_splice_bits);
  
 +/* Send skb data on a socket. Socket must be locked. */
 +int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
 +                       int len)
 +{
 +      unsigned int orig_len = len;
 +      struct sk_buff *head = skb;
 +      unsigned short fragidx;
 +      int slen, ret;
 +
 +do_frag_list:
 +
 +      /* Deal with head data */
 +      while (offset < skb_headlen(skb) && len) {
 +              struct kvec kv;
 +              struct msghdr msg;
 +
 +              slen = min_t(int, len, skb_headlen(skb) - offset);
 +              kv.iov_base = skb->data + offset;
 +              kv.iov_len = slen;
 +              memset(&msg, 0, sizeof(msg));
 +
 +              ret = kernel_sendmsg_locked(sk, &msg, &kv, 1, slen);
 +              if (ret <= 0)
 +                      goto error;
 +
 +              offset += ret;
 +              len -= ret;
 +      }
 +
 +      /* All the data was skb head? */
 +      if (!len)
 +              goto out;
 +
 +      /* Make offset relative to start of frags */
 +      offset -= skb_headlen(skb);
 +
 +      /* Find where we are in frag list */
 +      for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags; fragidx++) {
 +              skb_frag_t *frag  = &skb_shinfo(skb)->frags[fragidx];
 +
 +              if (offset < frag->size)
 +                      break;
 +
 +              offset -= frag->size;
 +      }
 +
 +      for (; len && fragidx < skb_shinfo(skb)->nr_frags; fragidx++) {
 +              skb_frag_t *frag  = &skb_shinfo(skb)->frags[fragidx];
 +
 +              slen = min_t(size_t, len, frag->size - offset);
 +
 +              while (slen) {
 +                      ret = kernel_sendpage_locked(sk, frag->page.p,
 +                                                   frag->page_offset + offset,
 +                                                   slen, MSG_DONTWAIT);
 +                      if (ret <= 0)
 +                              goto error;
 +
 +                      len -= ret;
 +                      offset += ret;
 +                      slen -= ret;
 +              }
 +
 +              offset = 0;
 +      }
 +
 +      if (len) {
 +              /* Process any frag lists */
 +
 +              if (skb == head) {
 +                      if (skb_has_frag_list(skb)) {
 +                              skb = skb_shinfo(skb)->frag_list;
 +                              goto do_frag_list;
 +                      }
 +              } else if (skb->next) {
 +                      skb = skb->next;
 +                      goto do_frag_list;
 +              }
 +      }
 +
 +out:
 +      return orig_len - len;
 +
 +error:
 +      return orig_len == len ? ret : orig_len - len;
 +}
 +EXPORT_SYMBOL_GPL(skb_send_sock_locked);
 +
 +/* Send skb data on a socket. */
 +int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len)
 +{
 +      int ret = 0;
 +
 +      lock_sock(sk);
 +      ret = skb_send_sock_locked(sk, skb, offset, len);
 +      release_sock(sk);
 +
 +      return ret;
 +}
 +EXPORT_SYMBOL_GPL(skb_send_sock);
 +
  /**
   *    skb_store_bits - store bits from kernel buffer to skb
   *    @skb: destination buffer
@@@ -2407,20 -2047,15 +2410,20 @@@ int skb_store_bits(struct sk_buff *skb
  
                end = start + skb_frag_size(frag);
                if ((copy = end - offset) > 0) {
 +                      u32 p_off, p_len, copied;
 +                      struct page *p;
                        u8 *vaddr;
  
                        if (copy > len)
                                copy = len;
  
 -                      vaddr = kmap_atomic(skb_frag_page(frag));
 -                      memcpy(vaddr + frag->page_offset + offset - start,
 -                             from, copy);
 -                      kunmap_atomic(vaddr);
 +                      skb_frag_foreach_page(frag,
 +                                            frag->page_offset + offset - start,
 +                                            copy, p, p_off, p_len, copied) {
 +                              vaddr = kmap_atomic(p);
 +                              memcpy(vaddr + p_off, from + copied, p_len);
 +                              kunmap_atomic(vaddr);
 +                      }
  
                        if ((len -= copy) == 0)
                                return 0;
@@@ -2485,27 -2120,20 +2488,27 @@@ __wsum __skb_checksum(const struct sk_b
  
                end = start + skb_frag_size(frag);
                if ((copy = end - offset) > 0) {
 +                      u32 p_off, p_len, copied;
 +                      struct page *p;
                        __wsum csum2;
                        u8 *vaddr;
  
                        if (copy > len)
                                copy = len;
 -                      vaddr = kmap_atomic(skb_frag_page(frag));
 -                      csum2 = ops->update(vaddr + frag->page_offset +
 -                                          offset - start, copy, 0);
 -                      kunmap_atomic(vaddr);
 -                      csum = ops->combine(csum, csum2, pos, copy);
 +
 +                      skb_frag_foreach_page(frag,
 +                                            frag->page_offset + offset - start,
 +                                            copy, p, p_off, p_len, copied) {
 +                              vaddr = kmap_atomic(p);
 +                              csum2 = ops->update(vaddr + p_off, p_len, 0);
 +                              kunmap_atomic(vaddr);
 +                              csum = ops->combine(csum, csum2, pos, p_len);
 +                              pos += p_len;
 +                      }
 +
                        if (!(len -= copy))
                                return csum;
                        offset += copy;
 -                      pos    += copy;
                }
                start = end;
        }
@@@ -2578,31 -2206,24 +2581,31 @@@ __wsum skb_copy_and_csum_bits(const str
  
                end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
                if ((copy = end - offset) > 0) {
 +                      skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 +                      u32 p_off, p_len, copied;
 +                      struct page *p;
                        __wsum csum2;
                        u8 *vaddr;
 -                      skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
  
                        if (copy > len)
                                copy = len;
 -                      vaddr = kmap_atomic(skb_frag_page(frag));
 -                      csum2 = csum_partial_copy_nocheck(vaddr +
 -                                                        frag->page_offset +
 -                                                        offset - start, to,
 -                                                        copy, 0);
 -                      kunmap_atomic(vaddr);
 -                      csum = csum_block_add(csum, csum2, pos);
 +
 +                      skb_frag_foreach_page(frag,
 +                                            frag->page_offset + offset - start,
 +                                            copy, p, p_off, p_len, copied) {
 +                              vaddr = kmap_atomic(p);
 +                              csum2 = csum_partial_copy_nocheck(vaddr + p_off,
 +                                                                to + copied,
 +                                                                p_len, 0);
 +                              kunmap_atomic(vaddr);
 +                              csum = csum_block_add(csum, csum2, pos);
 +                              pos += p_len;
 +                      }
 +
                        if (!(len -= copy))
                                return csum;
                        offset += copy;
                        to     += copy;
 -                      pos    += copy;
                }
                start = end;
        }
@@@ -2742,7 -2363,6 +2745,7 @@@ skb_zerocopy(struct sk_buff *to, struc
                skb_tx_error(from);
                return -ENOMEM;
        }
 +      skb_zerocopy_clone(to, from, GFP_ATOMIC);
  
        for (i = 0; i < skb_shinfo(from)->nr_frags; i++) {
                if (!len)
@@@ -3040,7 -2660,6 +3043,7 @@@ void skb_split(struct sk_buff *skb, str
  
        skb_shinfo(skb1)->tx_flags |= skb_shinfo(skb)->tx_flags &
                                      SKBTX_SHARED_FRAG;
 +      skb_zerocopy_clone(skb1, skb, 0);
        if (len < pos)  /* Split line is inside header. */
                skb_split_inside_header(skb, skb1, len, pos);
        else            /* Second chunk has no header, nothing to copy. */
@@@ -3084,8 -2703,6 +3087,8 @@@ int skb_shift(struct sk_buff *tgt, stru
  
        if (skb_headlen(skb))
                return 0;
 +      if (skb_zcopy(tgt) || skb_zcopy(skb))
 +              return 0;
  
        todo = shiftlen;
        from = 0;
@@@ -3659,8 -3276,6 +3662,8 @@@ normal
  
                skb_shinfo(nskb)->tx_flags |= skb_shinfo(head_skb)->tx_flags &
                                              SKBTX_SHARED_FRAG;
 +              if (skb_zerocopy_clone(nskb, head_skb, GFP_ATOMIC))
 +                      goto err;
  
                while (pos < offset + len) {
                        if (i >= nfrags) {
@@@ -4784,8 -4399,6 +4787,8 @@@ bool skb_try_coalesce(struct sk_buff *t
  
        if (skb_has_frag_list(to) || skb_has_frag_list(from))
                return false;
 +      if (skb_zcopy(to) || skb_zcopy(from))
 +              return false;
  
        if (skb_headlen(from) != 0) {
                struct page *page;
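
[Editor's sketch, not part of the diff] The skbuff.c changes above add the MSG_ZEROCOPY completion machinery (sock_zerocopy_alloc/realloc and sock_zerocopy_callback), which reports finished sends as a range on the socket error queue and sets SO_EE_CODE_ZEROCOPY_COPIED when the kernel had to fall back to copying (skb_copy_ubufs). A minimal user-space sketch of how such completions are consumed; constants come from linux/errqueue.h and the socket uapi headers, and error handling plus cmsg level/type checks are elided:

#include <stdio.h>
#include <stddef.h>
#include <sys/socket.h>
#include <linux/errqueue.h>

#ifndef SO_ZEROCOPY
#define SO_ZEROCOPY	60
#endif
#ifndef MSG_ZEROCOPY
#define MSG_ZEROCOPY	0x4000000
#endif

static void send_zerocopy(int fd, const char *buf, size_t len)
{
	int one = 1;

	/* Opt in first; without SOCK_ZEROCOPY the flag is a plain copy. */
	setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one));
	send(fd, buf, len, MSG_ZEROCOPY);
}

/* Drain one notification from the error queue.  ee_info..ee_data is the
 * range of zerocopy sends that completed (see sock_zerocopy_callback()
 * and skb_zerocopy_notify_extend() above).
 */
static void read_completion(int fd)
{
	char control[128];
	struct msghdr msg = { .msg_control = control,
			      .msg_controllen = sizeof(control) };
	struct sock_extended_err *serr;
	struct cmsghdr *cm;

	if (recvmsg(fd, &msg, MSG_ERRQUEUE) < 0)
		return;

	cm = CMSG_FIRSTHDR(&msg);
	serr = (struct sock_extended_err *)CMSG_DATA(cm);
	if (serr->ee_origin == SO_EE_ORIGIN_ZEROCOPY)
		printf("zerocopy sends %u..%u done%s\n",
		       serr->ee_info, serr->ee_data,
		       (serr->ee_code & SO_EE_CODE_ZEROCOPY_COPIED) ?
		       " (copied)" : "");
}
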
diff --combined net/dsa/dsa2.c
index cceaa4dd9f53c30c5a3d9d647379a24358fca669,20bc9c56fca05c230477b15d6dad15e02b488800..873af0108e243fc269f591ec7d7c59a9c8b374fc
@@@ -219,7 -219,7 +219,7 @@@ static int dsa_dsa_port_apply(struct ds
        struct dsa_switch *ds = port->ds;
        int err;
  
 -      err = dsa_cpu_dsa_setup(ds, ds->dev, port, port->index);
 +      err = dsa_cpu_dsa_setup(port);
        if (err) {
                dev_warn(ds->dev, "Failed to setup dsa port %d: %d\n",
                         port->index, err);
@@@ -243,7 -243,7 +243,7 @@@ static int dsa_cpu_port_apply(struct ds
        struct dsa_switch *ds = port->ds;
        int err;
  
 -      err = dsa_cpu_dsa_setup(ds, ds->dev, port, port->index);
 +      err = dsa_cpu_dsa_setup(port);
        if (err) {
                dev_warn(ds->dev, "Failed to setup cpu port %d: %d\n",
                         port->index, err);
@@@ -275,7 -275,7 +275,7 @@@ static int dsa_user_port_apply(struct d
        if (!name)
                name = "eth%d";
  
 -      err = dsa_slave_create(ds, ds->dev, port->index, name);
 +      err = dsa_slave_create(port, name);
        if (err) {
                dev_warn(ds->dev, "Failed to create slave %d: %d\n",
                         port->index, err);
@@@ -577,7 -577,7 +577,7 @@@ static int dsa_dst_parse(struct dsa_swi
                        return err;
        }
  
-       if (!dst->cpu_dp->netdev) {
+       if (!dst->cpu_dp) {
                pr_warn("Tree has no master device\n");
                return -EINVAL;
        }
diff --combined net/dsa/tag_ksz.c
index 17f30675c15cee18642642334120b3f03eb09183,fcd90f79458e20fefd76661fe7bc7e07d42ed1a3..010ca0a336c46a34f6a89d8c6975ca4b00642e6e
@@@ -42,7 -42,8 +42,8 @@@ static struct sk_buff *ksz_xmit(struct 
        padlen = (skb->len >= ETH_ZLEN) ? 0 : ETH_ZLEN - skb->len;
  
        if (skb_tailroom(skb) >= padlen + KSZ_INGRESS_TAG_LEN) {
-               if (skb_put_padto(skb, skb->len + padlen))
+               /* Let dsa_slave_xmit() free skb */
+               if (__skb_put_padto(skb, skb->len + padlen, false))
                        return NULL;
  
                nskb = skb;
                                         skb_transport_header(skb) - skb->head);
                skb_copy_and_csum_dev(skb, skb_put(nskb, skb->len));
  
-               if (skb_put_padto(nskb, nskb->len + padlen)) {
-                       kfree_skb(nskb);
+               /* Let skb_put_padto() free nskb, and let dsa_slave_xmit() free
+                * skb
+                */
+               if (skb_put_padto(nskb, nskb->len + padlen))
                        return NULL;
-               }
  
-               kfree_skb(skb);
+               consume_skb(skb);
        }
  
        tag = skb_put(nskb, KSZ_INGRESS_TAG_LEN);
@@@ -76,7 -78,8 +78,7 @@@
  }
  
  static struct sk_buff *ksz_rcv(struct sk_buff *skb, struct net_device *dev,
 -                             struct packet_type *pt,
 -                             struct net_device *orig_dev)
 +                             struct packet_type *pt)
  {
        struct dsa_switch_tree *dst = dev->dsa_ptr;
        struct dsa_port *cpu_dp = dsa_get_cpu_port(dst);
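
[Editor's sketch, not part of the diff] The tag_ksz.c hunk above switches the in-place case to __skb_put_padto(skb, len, false) so a padding failure does not free an skb that dsa_slave_xmit() still owns and will free itself. A sketch of that caller pattern with a hypothetical helper name, assuming the free_on_error parameter added to __skb_pad() in the skbuff.c hunk above:

#include <linux/skbuff.h>

/* Hypothetical tagger xmit step: pad in place without giving up
 * ownership of skb on error.
 */
static struct sk_buff *tag_pad_in_place(struct sk_buff *skb,
					unsigned int padlen)
{
	/* false: on allocation failure leave skb alive; the caller
	 * (e.g. dsa_slave_xmit()) frees it when we return NULL.
	 */
	if (__skb_put_padto(skb, skb->len + padlen, false))
		return NULL;
	return skb;
}
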
diff --combined net/dsa/tag_trailer.c
index 8707157dea32ee88b77a306f9a5c3b4342bb0c1e,9c7b1d74a5c6cc6a80e51a259ef6645060390d31..d2fd4923aa3eb3d1d56f20dd159d0ca87408b835
@@@ -40,7 -40,7 +40,7 @@@ static struct sk_buff *trailer_xmit(str
        skb_set_network_header(nskb, skb_network_header(skb) - skb->head);
        skb_set_transport_header(nskb, skb_transport_header(skb) - skb->head);
        skb_copy_and_csum_dev(skb, skb_put(nskb, skb->len));
-       kfree_skb(skb);
+       consume_skb(skb);
  
        if (padlen) {
                skb_put_zero(nskb, padlen);
@@@ -56,7 -56,8 +56,7 @@@
  }
  
  static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev,
 -                                 struct packet_type *pt,
 -                                 struct net_device *orig_dev)
 +                                 struct packet_type *pt)
  {
        struct dsa_switch_tree *dst = dev->dsa_ptr;
        struct dsa_port *cpu_dp = dsa_get_cpu_port(dst);
diff --combined net/ipv4/esp4.c
index 319000573bc7a6dc12966289c08f08521dead451,df68963dc90ada0ec19f8997d920f6faf3186e05..b00e4a43b4dc8538e016f8d1c23707a1f49060a8
@@@ -258,7 -258,7 +258,7 @@@ int esp_output_head(struct xfrm_state *
                esp_output_udp_encap(x, skb, esp);
  
        if (!skb_cloned(skb)) {
-               if (tailen <= skb_availroom(skb)) {
+               if (tailen <= skb_tailroom(skb)) {
                        nfrags = 1;
                        trailer = skb;
                        tail = skb_tail_pointer(trailer);
  
                        kunmap_atomic(vaddr);
  
-                       spin_unlock_bh(&x->lock);
                        nfrags = skb_shinfo(skb)->nr_frags;
  
                        __skb_fill_page_desc(skb, nfrags, page, pfrag->offset,
                        skb_shinfo(skb)->nr_frags = ++nfrags;
  
                        pfrag->offset = pfrag->offset + allocsize;
+                       spin_unlock_bh(&x->lock);
                        nfrags++;
  
                        skb->len += tailen;
@@@ -381,7 -382,7 +382,7 @@@ int esp_output_tail(struct xfrm_state *
                           (unsigned char *)esph - skb->data,
                           assoclen + ivlen + esp->clen + alen);
        if (unlikely(err < 0))
-               goto error;
+               goto error_free;
  
        if (!esp->inplace) {
                int allocsize;
                spin_lock_bh(&x->lock);
                if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
                        spin_unlock_bh(&x->lock);
-                       goto error;
+                       goto error_free;
                }
  
                skb_shinfo(skb)->nr_frags = 1;
                                   (unsigned char *)esph - skb->data,
                                   assoclen + ivlen + esp->clen + alen);
                if (unlikely(err < 0))
-                       goto error;
+                       goto error_free;
        }
  
        if ((x->props.flags & XFRM_STATE_ESN))
  
        if (sg != dsg)
                esp_ssg_unref(x, tmp);
-       kfree(tmp);
  
+ error_free:
+       kfree(tmp);
  error:
        return err;
  }
@@@ -499,59 -501,18 +501,59 @@@ static int esp_output(struct xfrm_stat
        return esp_output_tail(x, skb, &esp);
  }
  
 +static inline int esp_remove_trailer(struct sk_buff *skb)
 +{
 +      struct xfrm_state *x = xfrm_input_state(skb);
 +      struct xfrm_offload *xo = xfrm_offload(skb);
 +      struct crypto_aead *aead = x->data;
 +      int alen, hlen, elen;
 +      int padlen, trimlen;
 +      __wsum csumdiff;
 +      u8 nexthdr[2];
 +      int ret;
 +
 +      alen = crypto_aead_authsize(aead);
 +      hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
 +      elen = skb->len - hlen;
 +
 +      if (xo && (xo->flags & XFRM_ESP_NO_TRAILER)) {
 +              ret = xo->proto;
 +              goto out;
 +      }
 +
 +      if (skb_copy_bits(skb, skb->len - alen - 2, nexthdr, 2))
 +              BUG();
 +
 +      ret = -EINVAL;
 +      padlen = nexthdr[0];
 +      if (padlen + 2 + alen >= elen) {
 +              net_dbg_ratelimited("ipsec esp packet is garbage padlen=%d, elen=%d\n",
 +                                  padlen + 2, elen - alen);
 +              goto out;
 +      }
 +
 +      trimlen = alen + padlen + 2;
 +      if (skb->ip_summed == CHECKSUM_COMPLETE) {
 +              csumdiff = skb_checksum(skb, skb->len - trimlen, trimlen, 0);
 +              skb->csum = csum_block_sub(skb->csum, csumdiff,
 +                                         skb->len - trimlen);
 +      }
 +      pskb_trim(skb, skb->len - trimlen);
 +
 +      ret = nexthdr[1];
 +
 +out:
 +      return ret;
 +}
 +
  int esp_input_done2(struct sk_buff *skb, int err)
  {
        const struct iphdr *iph;
        struct xfrm_state *x = xfrm_input_state(skb);
        struct xfrm_offload *xo = xfrm_offload(skb);
        struct crypto_aead *aead = x->data;
 -      int alen = crypto_aead_authsize(aead);
        int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
 -      int elen = skb->len - hlen;
        int ihl;
 -      u8 nexthdr[2];
 -      int padlen;
  
        if (!xo || (xo && !(xo->flags & CRYPTO_DONE)))
                kfree(ESP_SKB_CB(skb)->tmp);
        if (unlikely(err))
                goto out;
  
 -      if (skb_copy_bits(skb, skb->len-alen-2, nexthdr, 2))
 -              BUG();
 -
 -      err = -EINVAL;
 -      padlen = nexthdr[0];
 -      if (padlen + 2 + alen >= elen)
 +      err = esp_remove_trailer(skb);
 +      if (unlikely(err < 0))
                goto out;
  
 -      /* ... check padding bits here. Silly. :-) */
 -
        iph = ip_hdr(skb);
        ihl = iph->ihl * 4;
  
                        skb->ip_summed = CHECKSUM_UNNECESSARY;
        }
  
 -      pskb_trim(skb, skb->len - alen - padlen - 2);
 -      __skb_pull(skb, hlen);
 +      skb_pull_rcsum(skb, hlen);
        if (x->props.mode == XFRM_MODE_TUNNEL)
                skb_reset_transport_header(skb);
        else
                skb_set_transport_header(skb, -ihl);
  
 -      err = nexthdr[1];
 -
        /* RFC4303: Drop dummy packets without any error */
        if (err == IPPROTO_NONE)
                err = -EINVAL;
@@@ -727,8 -697,10 +729,10 @@@ skip_cow
  
        sg_init_table(sg, nfrags);
        err = skb_to_sgvec(skb, sg, 0, skb->len);
-       if (unlikely(err < 0))
+       if (unlikely(err < 0)) {
+               kfree(tmp);
                goto out;
+       }
  
        skb->ip_summed = CHECKSUM_NONE;
  
diff --combined net/ipv4/esp4_offload.c
index aca1c85f079528d61c6816f430f32df94d805f98,50112324fa5c3638527b12477c356ce406ba9a36..f8b918c766b0af1e572ed895dcf2435af92016a9
@@@ -182,13 -182,11 +182,13 @@@ out
  static int esp_input_tail(struct xfrm_state *x, struct sk_buff *skb)
  {
        struct crypto_aead *aead = x->data;
 +      struct xfrm_offload *xo = xfrm_offload(skb);
  
        if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead)))
                return -EINVAL;
  
 -      skb->ip_summed = CHECKSUM_NONE;
 +      if (!(xo->flags & CRYPTO_DONE))
 +              skb->ip_summed = CHECKSUM_NONE;
  
        return esp_input_done2(skb, 0);
  }
@@@ -259,7 -257,7 +259,7 @@@ static int esp_xmit(struct xfrm_state *
        esp.seqno = cpu_to_be64(xo->seq.low + ((u64)xo->seq.hi << 32));
  
        err = esp_output_tail(x, skb, &esp);
-       if (err < 0)
+       if (err)
                return err;
  
        secpath_reset(skb);
@@@ -305,4 -303,3 +305,4 @@@ module_init(esp4_offload_init)
  module_exit(esp4_offload_exit);
  MODULE_LICENSE("GPL");
  MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>");
 +MODULE_ALIAS_XFRM_OFFLOAD_TYPE(AF_INET, XFRM_PROTO_ESP);
diff --combined net/ipv4/tcp.c
index 21ca2df274c5130a13d31a391a1408d779af34af,a3e91b552edce4edee0d3b9ee5e07105946d2dd9..7a3d843758363af44c27bf3716ba2f488688fed6
  #include <linux/err.h>
  #include <linux/time.h>
  #include <linux/slab.h>
 +#include <linux/errqueue.h>
  
  #include <net/icmp.h>
  #include <net/inet_common.h>
@@@ -389,19 -388,6 +389,19 @@@ static int retrans_to_secs(u8 retrans, 
        return period;
  }
  
 +static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp)
 +{
 +      u32 rate = READ_ONCE(tp->rate_delivered);
 +      u32 intv = READ_ONCE(tp->rate_interval_us);
 +      u64 rate64 = 0;
 +
 +      if (rate && intv) {
 +              rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC;
 +              do_div(rate64, intv);
 +      }
 +      return rate64;
 +}
 +
  /* Address-family independent initialization for a tcp_sock.
   *
   * NOTE: A lot of things set to zero explicitly by call to
@@@ -414,6 -400,7 +414,6 @@@ void tcp_init_sock(struct sock *sk
  
        tp->out_of_order_queue = RB_ROOT;
        tcp_init_xmit_timers(sk);
 -      tcp_prequeue_init(tp);
        INIT_LIST_HEAD(&tp->tsq_node);
  
        icsk->icsk_rto = TCP_TIMEOUT_INIT;
@@@ -1047,29 -1034,23 +1047,29 @@@ out_err
  }
  EXPORT_SYMBOL_GPL(do_tcp_sendpages);
  
 -int tcp_sendpage(struct sock *sk, struct page *page, int offset,
 -               size_t size, int flags)
 +int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
 +                      size_t size, int flags)
  {
 -      ssize_t res;
 -
        if (!(sk->sk_route_caps & NETIF_F_SG) ||
            !sk_check_csum_caps(sk))
 -              return sock_no_sendpage(sk->sk_socket, page, offset, size,
 -                                      flags);
 -
 -      lock_sock(sk);
 +              return sock_no_sendpage_locked(sk, page, offset, size, flags);
  
        tcp_rate_check_app_limited(sk);  /* is sending application-limited? */
  
 -      res = do_tcp_sendpages(sk, page, offset, size, flags);
 +      return do_tcp_sendpages(sk, page, offset, size, flags);
 +}
 +EXPORT_SYMBOL_GPL(tcp_sendpage_locked);
 +
 +int tcp_sendpage(struct sock *sk, struct page *page, int offset,
 +               size_t size, int flags)
 +{
 +      int ret;
 +
 +      lock_sock(sk);
 +      ret = tcp_sendpage_locked(sk, page, offset, size, flags);
        release_sock(sk);
 -      return res;
 +
 +      return ret;
  }
  EXPORT_SYMBOL(tcp_sendpage);
  
@@@ -1163,10 -1144,9 +1163,10 @@@ static int tcp_sendmsg_fastopen(struct 
        return err;
  }
  
 -int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 +int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
  {
        struct tcp_sock *tp = tcp_sk(sk);
 +      struct ubuf_info *uarg = NULL;
        struct sk_buff *skb;
        struct sockcm_cookie sockc;
        int flags, err, copied = 0;
        bool sg;
        long timeo;
  
 -      lock_sock(sk);
 -
        flags = msg->msg_flags;
 +
 +      if (flags & MSG_ZEROCOPY && size) {
 +              if (sk->sk_state != TCP_ESTABLISHED) {
 +                      err = -EINVAL;
 +                      goto out_err;
 +              }
 +
 +              skb = tcp_send_head(sk) ? tcp_write_queue_tail(sk) : NULL;
 +              uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb));
 +              if (!uarg) {
 +                      err = -ENOBUFS;
 +                      goto out_err;
 +              }
 +
 +              /* skb may be freed in main loop, keep extra ref on uarg */
 +              sock_zerocopy_get(uarg);
 +              if (!(sk_check_csum_caps(sk) && sk->sk_route_caps & NETIF_F_SG))
 +                      uarg->zerocopy = 0;
 +      }
 +
        if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect)) {
                err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size);
                if (err == -EINPROGRESS && copied_syn > 0)
@@@ -1319,7 -1281,7 +1319,7 @@@ new_segment
                        err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy);
                        if (err)
                                goto do_fault;
 -              } else {
 +              } else if (!uarg || !uarg->zerocopy) {
                        bool merge = true;
                        int i = skb_shinfo(skb)->nr_frags;
                        struct page_frag *pfrag = sk_page_frag(sk);
                                page_ref_inc(pfrag->page);
                        }
                        pfrag->offset += copy;
 +              } else {
 +                      err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
 +                      if (err == -EMSGSIZE || err == -EEXIST)
 +                              goto new_segment;
 +                      if (err < 0)
 +                              goto do_error;
 +                      copy = err;
                }
  
                if (!copied)
@@@ -1410,7 -1365,7 +1410,7 @@@ out
                tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
        }
  out_nopush:
 -      release_sock(sk);
 +      sock_zerocopy_put(uarg);
        return copied + copied_syn;
  
  do_fault:
@@@ -1427,7 -1382,6 +1427,7 @@@ do_error
        if (copied + copied_syn)
                goto out;
  out_err:
 +      sock_zerocopy_put_abort(uarg);
        err = sk_stream_error(sk, flags, err);
        /* make sure we wake any epoll edge trigger waiter */
        if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
                sk->sk_write_space(sk);
                tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
        }
 -      release_sock(sk);
        return err;
  }
 +EXPORT_SYMBOL_GPL(tcp_sendmsg_locked);
 +
 +int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 +{
 +      int ret;
 +
 +      lock_sock(sk);
 +      ret = tcp_sendmsg_locked(sk, msg, size);
 +      release_sock(sk);
 +
 +      return ret;
 +}
  EXPORT_SYMBOL(tcp_sendmsg);
  
  /*
@@@ -1582,6 -1525,20 +1582,6 @@@ static void tcp_cleanup_rbuf(struct soc
                tcp_send_ack(sk);
  }
  
 -static void tcp_prequeue_process(struct sock *sk)
 -{
 -      struct sk_buff *skb;
 -      struct tcp_sock *tp = tcp_sk(sk);
 -
 -      NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUED);
 -
 -      while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
 -              sk_backlog_rcv(sk, skb);
 -
 -      /* Clear memory counter. */
 -      tp->ucopy.memory = 0;
 -}
 -
  static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
  {
        struct sk_buff *skb;
@@@ -1695,61 -1652,6 +1695,61 @@@ int tcp_peek_len(struct socket *sock
  }
  EXPORT_SYMBOL(tcp_peek_len);
  
 +static void tcp_update_recv_tstamps(struct sk_buff *skb,
 +                                  struct scm_timestamping *tss)
 +{
 +      if (skb->tstamp)
 +              tss->ts[0] = ktime_to_timespec(skb->tstamp);
 +      else
 +              tss->ts[0] = (struct timespec) {0};
 +
 +      if (skb_hwtstamps(skb)->hwtstamp)
 +              tss->ts[2] = ktime_to_timespec(skb_hwtstamps(skb)->hwtstamp);
 +      else
 +              tss->ts[2] = (struct timespec) {0};
 +}
 +
 +/* Similar to __sock_recv_timestamp, but does not require an skb */
 +void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
 +                      struct scm_timestamping *tss)
 +{
 +      struct timeval tv;
 +      bool has_timestamping = false;
 +
 +      if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) {
 +              if (sock_flag(sk, SOCK_RCVTSTAMP)) {
 +                      if (sock_flag(sk, SOCK_RCVTSTAMPNS)) {
 +                              put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPNS,
 +                                       sizeof(tss->ts[0]), &tss->ts[0]);
 +                      } else {
 +                              tv.tv_sec = tss->ts[0].tv_sec;
 +                              tv.tv_usec = tss->ts[0].tv_nsec / 1000;
 +
 +                              put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP,
 +                                       sizeof(tv), &tv);
 +                      }
 +              }
 +
 +              if (sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE)
 +                      has_timestamping = true;
 +              else
 +                      tss->ts[0] = (struct timespec) {0};
 +      }
 +
 +      if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) {
 +              if (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)
 +                      has_timestamping = true;
 +              else
 +                      tss->ts[2] = (struct timespec) {0};
 +      }
 +
 +      if (has_timestamping) {
 +              tss->ts[1] = (struct timespec) {0};
 +              put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING,
 +                       sizeof(*tss), tss);
 +      }
 +}
 +
  /*
   *    This routine copies from a sock struct into the user buffer.
   *
@@@ -1769,10 -1671,9 +1769,10 @@@ int tcp_recvmsg(struct sock *sk, struc
        int err;
        int target;             /* Read at least this many bytes */
        long timeo;
 -      struct task_struct *user_recv = NULL;
        struct sk_buff *skb, *last;
        u32 urg_hole = 0;
 +      struct scm_timestamping tss;
 +      bool has_tss = false;
  
        if (unlikely(flags & MSG_ERRQUEUE))
                return inet_recv_error(sk, msg, len, addr_len);
  
                tcp_cleanup_rbuf(sk, copied);
  
 -              if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
 -                      /* Install new reader */
 -                      if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
 -                              user_recv = current;
 -                              tp->ucopy.task = user_recv;
 -                              tp->ucopy.msg = msg;
 -                      }
 -
 -                      tp->ucopy.len = len;
 -
 -                      WARN_ON(tp->copied_seq != tp->rcv_nxt &&
 -                              !(flags & (MSG_PEEK | MSG_TRUNC)));
 -
 -                      /* Ugly... If prequeue is not empty, we have to
 -                       * process it before releasing socket, otherwise
 -                       * order will be broken at second iteration.
 -                       * More elegant solution is required!!!
 -                       *
 -                       * Look: we have the following (pseudo)queues:
 -                       *
 -                       * 1. packets in flight
 -                       * 2. backlog
 -                       * 3. prequeue
 -                       * 4. receive_queue
 -                       *
 -                       * Each queue can be processed only if the next ones
 -                       * are empty. At this point we have empty receive_queue.
 -                       * But prequeue _can_ be not empty after 2nd iteration,
 -                       * when we jumped to start of loop because backlog
 -                       * processing added something to receive_queue.
 -                       * We cannot release_sock(), because backlog contains
 -                       * packets arrived _after_ prequeued ones.
 -                       *
 -                       * Shortly, algorithm is clear --- to process all
 -                       * the queues in order. We could make it more directly,
 -                       * requeueing packets from backlog to prequeue, if
 -                       * is not empty. It is more elegant, but eats cycles,
 -                       * unfortunately.
 -                       */
 -                      if (!skb_queue_empty(&tp->ucopy.prequeue))
 -                              goto do_prequeue;
 -
 -                      /* __ Set realtime policy in scheduler __ */
 -              }
 -
                if (copied >= target) {
                        /* Do not sleep, just process backlog. */
                        release_sock(sk);
                        sk_wait_data(sk, &timeo, last);
                }
  
 -              if (user_recv) {
 -                      int chunk;
 -
 -                      /* __ Restore normal policy in scheduler __ */
 -
 -                      chunk = len - tp->ucopy.len;
 -                      if (chunk != 0) {
 -                              NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
 -                              len -= chunk;
 -                              copied += chunk;
 -                      }
 -
 -                      if (tp->rcv_nxt == tp->copied_seq &&
 -                          !skb_queue_empty(&tp->ucopy.prequeue)) {
 -do_prequeue:
 -                              tcp_prequeue_process(sk);
 -
 -                              chunk = len - tp->ucopy.len;
 -                              if (chunk != 0) {
 -                                      NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
 -                                      len -= chunk;
 -                                      copied += chunk;
 -                              }
 -                      }
 -              }
                if ((flags & MSG_PEEK) &&
                    (peek_seq - copied - urg_hole != tp->copied_seq)) {
                        net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
@@@ -1970,10 -1941,6 +1970,10 @@@ skip_copy
                if (used + offset < skb->len)
                        continue;
  
 +              if (TCP_SKB_CB(skb)->has_rxtstamp) {
 +                      tcp_update_recv_tstamps(skb, &tss);
 +                      has_tss = true;
 +              }
                if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
                        goto found_fin_ok;
                if (!(flags & MSG_PEEK))
                break;
        } while (len > 0);
  
 -      if (user_recv) {
 -              if (!skb_queue_empty(&tp->ucopy.prequeue)) {
 -                      int chunk;
 -
 -                      tp->ucopy.len = copied > 0 ? len : 0;
 -
 -                      tcp_prequeue_process(sk);
 -
 -                      if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
 -                              NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
 -                              len -= chunk;
 -                              copied += chunk;
 -                      }
 -              }
 -
 -              tp->ucopy.task = NULL;
 -              tp->ucopy.len = 0;
 -      }
 -
        /* According to UNIX98, msg_name/msg_namelen are ignored
         * on connected socket. I was just happy when found this 8) --ANK
         */
  
 +      if (has_tss)
 +              tcp_recv_timestamp(msg, sk, &tss);
 +
        /* Clean up data we have read: This will do ACK frames. */
        tcp_cleanup_rbuf(sk, copied);
  
@@@ -2498,7 -2481,7 +2498,7 @@@ static int do_tcp_setsockopt(struct soc
                name[val] = 0;
  
                lock_sock(sk);
-               err = tcp_set_congestion_control(sk, name, true);
+               err = tcp_set_congestion_control(sk, name, true, true);
                release_sock(sk);
                return err;
        }
@@@ -2840,7 -2823,7 +2840,7 @@@ void tcp_get_info(struct sock *sk, stru
  {
        const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
        const struct inet_connection_sock *icsk = inet_csk(sk);
 -      u32 now, intv;
 +      u32 now;
        u64 rate64;
        bool slow;
        u32 rate;
        info->tcpi_data_segs_out = tp->data_segs_out;
  
        info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0;
 -      rate = READ_ONCE(tp->rate_delivered);
 -      intv = READ_ONCE(tp->rate_interval_us);
 -      if (rate && intv) {
 -              rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC;
 -              do_div(rate64, intv);
 +      rate64 = tcp_compute_delivery_rate(tp);
 +      if (rate64)
                info->tcpi_delivery_rate = rate64;
 -      }
        unlock_sock_fast(sk, slow);
  }
  EXPORT_SYMBOL_GPL(tcp_get_info);
@@@ -2951,12 -2938,8 +2951,12 @@@ struct sk_buff *tcp_get_timestamping_op
        const struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *stats;
        struct tcp_info info;
 +      u64 rate64;
 +      u32 rate;
  
 -      stats = alloc_skb(5 * nla_total_size_64bit(sizeof(u64)), GFP_ATOMIC);
 +      stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) +
 +                        3 * nla_total_size(sizeof(u32)) +
 +                        2 * nla_total_size(sizeof(u8)), GFP_ATOMIC);
        if (!stats)
                return NULL;
  
                          tp->data_segs_out, TCP_NLA_PAD);
        nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS,
                          tp->total_retrans, TCP_NLA_PAD);
 +
 +      rate = READ_ONCE(sk->sk_pacing_rate);
 +      rate64 = rate != ~0U ? rate : ~0ULL;
 +      nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD);
 +
 +      rate64 = tcp_compute_delivery_rate(tp);
 +      nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD);
 +
 +      nla_put_u32(stats, TCP_NLA_SND_CWND, tp->snd_cwnd);
 +      nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering);
 +      nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp));
 +
 +      nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits);
 +      nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited);
        return stats;
  }
  
diff --combined net/ipv4/tcp_cong.c
index c2b1744696459540c989a64e05eb2453cf2e366e,421ea1b918da5bc4a3974531539cd67266f70798..2f26124fd1601ad23662d1fc9152b4370270c112
@@@ -189,8 -189,8 +189,8 @@@ void tcp_init_congestion_control(struc
                INET_ECN_dontxmit(sk);
  }
  
- void tcp_reinit_congestion_control(struct sock *sk,
-                                  const struct tcp_congestion_ops *ca)
+ static void tcp_reinit_congestion_control(struct sock *sk,
+                                         const struct tcp_congestion_ops *ca)
  {
        struct inet_connection_sock *icsk = inet_csk(sk);
  
@@@ -338,7 -338,7 +338,7 @@@ out
   * tcp_reinit_congestion_control (if the current congestion control was
   * already initialized.
   */
- int tcp_set_congestion_control(struct sock *sk, const char *name, bool load)
+ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool reinit)
  {
        struct inet_connection_sock *icsk = inet_csk(sk);
        const struct tcp_congestion_ops *ca;
        if (!ca) {
                err = -ENOENT;
        } else if (!load) {
-               icsk->icsk_ca_ops = ca;
-               if (!try_module_get(ca->owner))
+               const struct tcp_congestion_ops *old_ca = icsk->icsk_ca_ops;
+               if (try_module_get(ca->owner)) {
+                       if (reinit) {
+                               tcp_reinit_congestion_control(sk, ca);
+                       } else {
+                               icsk->icsk_ca_ops = ca;
+                               module_put(old_ca->owner);
+                       }
+               } else {
                        err = -EBUSY;
+               }
        } else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) ||
                     ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))) {
                err = -EPERM;
@@@ -456,7 -465,7 +465,7 @@@ u32 tcp_reno_undo_cwnd(struct sock *sk
  {
        const struct tcp_sock *tp = tcp_sk(sk);
  
 -      return max(tp->snd_cwnd, tp->snd_ssthresh << 1);
 +      return max(tp->snd_cwnd, tp->prior_cwnd);
  }
  EXPORT_SYMBOL_GPL(tcp_reno_undo_cwnd);
  
diff --combined net/ipv4/udp.c
index bf6c406bf5e71f0200fcfc1439c48e2843fae6c2,62344804baaef96daf405dbdd5418db541b95864..f900cdd0fbfb00e4422da04daee88ebda69b2510
@@@ -380,8 -380,8 +380,8 @@@ int udp_v4_get_port(struct sock *sk, un
  
  static int compute_score(struct sock *sk, struct net *net,
                         __be32 saddr, __be16 sport,
 -                       __be32 daddr, unsigned short hnum, int dif,
 -                       bool exact_dif)
 +                       __be32 daddr, unsigned short hnum,
 +                       int dif, int sdif, bool exact_dif)
  {
        int score;
        struct inet_sock *inet;
        }
  
        if (sk->sk_bound_dev_if || exact_dif) {
 -              if (sk->sk_bound_dev_if != dif)
 +              bool dev_match = (sk->sk_bound_dev_if == dif ||
 +                                sk->sk_bound_dev_if == sdif);
 +
 +              if (exact_dif && !dev_match)
                        return -1;
 -              score += 4;
 +              if (sk->sk_bound_dev_if && dev_match)
 +                      score += 4;
        }
 +
        if (sk->sk_incoming_cpu == raw_smp_processor_id())
                score++;
        return score;
@@@ -441,11 -436,10 +441,11 @@@ static u32 udp_ehashfn(const struct ne
  
  /* called with rcu_read_lock() */
  static struct sock *udp4_lib_lookup2(struct net *net,
 -              __be32 saddr, __be16 sport,
 -              __be32 daddr, unsigned int hnum, int dif, bool exact_dif,
 -              struct udp_hslot *hslot2,
 -              struct sk_buff *skb)
 +                                   __be32 saddr, __be16 sport,
 +                                   __be32 daddr, unsigned int hnum,
 +                                   int dif, int sdif, bool exact_dif,
 +                                   struct udp_hslot *hslot2,
 +                                   struct sk_buff *skb)
  {
        struct sock *sk, *result;
        int score, badness, matches = 0, reuseport = 0;
        badness = 0;
        udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
                score = compute_score(sk, net, saddr, sport,
 -                                    daddr, hnum, dif, exact_dif);
 +                                    daddr, hnum, dif, sdif, exact_dif);
                if (score > badness) {
                        reuseport = sk->sk_reuseport;
                        if (reuseport) {
   * harder than this. -DaveM
   */
  struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 -              __be16 sport, __be32 daddr, __be16 dport,
 -              int dif, struct udp_table *udptable, struct sk_buff *skb)
 +              __be16 sport, __be32 daddr, __be16 dport, int dif,
 +              int sdif, struct udp_table *udptable, struct sk_buff *skb)
  {
        struct sock *sk, *result;
        unsigned short hnum = ntohs(dport);
                        goto begin;
  
                result = udp4_lib_lookup2(net, saddr, sport,
 -                                        daddr, hnum, dif,
 +                                        daddr, hnum, dif, sdif,
                                          exact_dif, hslot2, skb);
                if (!result) {
                        unsigned int old_slot2 = slot2;
                                goto begin;
  
                        result = udp4_lib_lookup2(net, saddr, sport,
 -                                                daddr, hnum, dif,
 +                                                daddr, hnum, dif, sdif,
                                                  exact_dif, hslot2, skb);
                }
                return result;
@@@ -527,7 -521,7 +527,7 @@@ begin
        badness = 0;
        sk_for_each_rcu(sk, &hslot->head) {
                score = compute_score(sk, net, saddr, sport,
 -                                    daddr, hnum, dif, exact_dif);
 +                                    daddr, hnum, dif, sdif, exact_dif);
                if (score > badness) {
                        reuseport = sk->sk_reuseport;
                        if (reuseport) {
@@@ -560,7 -554,7 +560,7 @@@ static inline struct sock *__udp4_lib_l
  
        return __udp4_lib_lookup(dev_net(skb->dev), iph->saddr, sport,
                                 iph->daddr, dport, inet_iif(skb),
 -                               udptable, skb);
 +                               inet_sdif(skb), udptable, skb);
  }
  
  struct sock *udp4_lib_lookup_skb(struct sk_buff *skb,
@@@ -582,7 -576,7 +582,7 @@@ struct sock *udp4_lib_lookup(struct ne
        struct sock *sk;
  
        sk = __udp4_lib_lookup(net, saddr, sport, daddr, dport,
 -                             dif, &udp_table, NULL);
 +                             dif, 0, &udp_table, NULL);
        if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
                sk = NULL;
        return sk;
@@@ -593,7 -587,7 +593,7 @@@ EXPORT_SYMBOL_GPL(udp4_lib_lookup)
  static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
                                       __be16 loc_port, __be32 loc_addr,
                                       __be16 rmt_port, __be32 rmt_addr,
 -                                     int dif, unsigned short hnum)
 +                                     int dif, int sdif, unsigned short hnum)
  {
        struct inet_sock *inet = inet_sk(sk);
  
            (inet->inet_dport != rmt_port && inet->inet_dport) ||
            (inet->inet_rcv_saddr && inet->inet_rcv_saddr != loc_addr) ||
            ipv6_only_sock(sk) ||
 -          (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
 +          (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif &&
 +           sk->sk_bound_dev_if != sdif))
                return false;
 -      if (!ip_mc_sf_allow(sk, loc_addr, rmt_addr, dif))
 +      if (!ip_mc_sf_allow(sk, loc_addr, rmt_addr, dif, sdif))
                return false;
        return true;
  }
@@@ -635,8 -628,8 +635,8 @@@ void __udp4_lib_err(struct sk_buff *skb
        struct net *net = dev_net(skb->dev);
  
        sk = __udp4_lib_lookup(net, iph->daddr, uh->dest,
 -                      iph->saddr, uh->source, skb->dev->ifindex, udptable,
 -                      NULL);
 +                             iph->saddr, uh->source, skb->dev->ifindex, 0,
 +                             udptable, NULL);
        if (!sk) {
                __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
                return; /* No socket for error */
@@@ -809,7 -802,7 +809,7 @@@ static int udp_send_skb(struct sk_buff 
        if (is_udplite)                                  /*     UDP-Lite      */
                csum = udplite_csum(skb);
  
 -      else if (sk->sk_no_check_tx && !skb_is_gso(skb)) {   /* UDP csum off */
 +      else if (sk->sk_no_check_tx) {                   /* UDP csum off */
  
                skb->ip_summed = CHECKSUM_NONE;
                goto send;
@@@ -1183,11 -1176,7 +1183,11 @@@ static void udp_set_dev_scratch(struct 
        scratch->csum_unnecessary = !!skb_csum_unnecessary(skb);
        scratch->is_linear = !skb_is_nonlinear(skb);
  #endif
 -      if (likely(!skb->_skb_refdst && !skb_sec_path(skb)))
 +      /* all head states execept sp (dst, sk, nf) are always cleared by
 +       * udp_rcv() and we need to preserve secpath, if present, to eventually
 +       * process IP_CMSG_PASSSEC at recvmsg() time
 +       */
 +      if (likely(!skb_sec_path(skb)))
                scratch->_tsize_state |= UDP_SKB_IS_STATELESS;
  }
  
@@@ -1794,6 -1783,13 +1794,6 @@@ static int __udp_queue_rcv_skb(struct s
                sk_mark_napi_id_once(sk, skb);
        }
  
 -      /* At recvmsg() time we may access skb->dst or skb->sp depending on
 -       * the IP options and the cmsg flags, elsewhere can we clear all
 -       * pending head states while they are hot in the cache
 -       */
 -      if (likely(IPCB(skb)->opt.optlen == 0 && !skb_sec_path(skb)))
 -              skb_release_head_state(skb);
 -
        rc = __udp_enqueue_schedule_skb(sk, skb);
        if (rc < 0) {
                int is_udplite = IS_UDPLITE(sk);
@@@ -1933,14 -1929,16 +1933,16 @@@ drop
  /* For TCP sockets, sk_rx_dst is protected by socket lock
   * For UDP, we use xchg() to guard against concurrent changes.
   */
- void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst)
+ bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst)
  {
        struct dst_entry *old;
  
        if (dst_hold_safe(dst)) {
                old = xchg(&sk->sk_rx_dst, dst);
                dst_release(old);
+               return old != dst;
        }
+       return false;
  }
  EXPORT_SYMBOL(udp_sk_rx_dst_set);
  
@@@ -1961,7 -1959,6 +1963,7 @@@ static int __udp4_lib_mcast_deliver(str
        unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
        unsigned int offset = offsetof(typeof(*sk), sk_node);
        int dif = skb->dev->ifindex;
 +      int sdif = inet_sdif(skb);
        struct hlist_node *node;
        struct sk_buff *nskb;
  
@@@ -1976,7 -1973,7 +1978,7 @@@ start_lookup
  
        sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) {
                if (!__udp_is_mcast_sock(net, sk, uh->dest, daddr,
 -                                       uh->source, saddr, dif, hnum))
 +                                       uh->source, saddr, dif, sdif, hnum))
                        continue;
  
                if (!first) {
@@@ -2166,7 -2163,7 +2168,7 @@@ drop
  static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net,
                                                  __be16 loc_port, __be32 loc_addr,
                                                  __be16 rmt_port, __be32 rmt_addr,
 -                                                int dif)
 +                                                int dif, int sdif)
  {
        struct sock *sk, *result;
        unsigned short hnum = ntohs(loc_port);
        result = NULL;
        sk_for_each_rcu(sk, &hslot->head) {
                if (__udp_is_mcast_sock(net, sk, loc_port, loc_addr,
 -                                      rmt_port, rmt_addr, dif, hnum)) {
 +                                      rmt_port, rmt_addr, dif, sdif, hnum)) {
                        if (result)
                                return NULL;
                        result = sk;
  static struct sock *__udp4_lib_demux_lookup(struct net *net,
                                            __be16 loc_port, __be32 loc_addr,
                                            __be16 rmt_port, __be32 rmt_addr,
 -                                          int dif)
 +                                          int dif, int sdif)
  {
        unsigned short hnum = ntohs(loc_port);
        unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum);
  
        udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
                if (INET_MATCH(sk, net, acookie, rmt_addr,
 -                             loc_addr, ports, dif))
 +                             loc_addr, ports, dif, sdif))
                        return sk;
                /* Only check first socket in chain */
                break;
@@@ -2225,7 -2222,6 +2227,7 @@@ void udp_v4_early_demux(struct sk_buff 
        struct sock *sk = NULL;
        struct dst_entry *dst;
        int dif = skb->dev->ifindex;
 +      int sdif = inet_sdif(skb);
        int ours;
  
        /* validate the packet */
                }
  
                sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr,
 -                                                 uh->source, iph->saddr, dif);
 +                                                 uh->source, iph->saddr,
 +                                                 dif, sdif);
        } else if (skb->pkt_type == PACKET_HOST) {
                sk = __udp4_lib_demux_lookup(net, uh->dest, iph->daddr,
 -                                           uh->source, iph->saddr, dif);
 +                                           uh->source, iph->saddr, dif, sdif);
        }
  
        if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt))
diff --combined net/ipv6/addrconf.c
index 45d0a24644debc829604ea2120897192c52f0ee3,936e9ab4dda5453ce30b8640b85693b9728502fd..c2e2a78787ec990f4dac2040fb1e26dc150860e2
@@@ -3030,6 -3030,9 +3030,6 @@@ static void sit_add_v4_addrs(struct ine
  static void init_loopback(struct net_device *dev)
  {
        struct inet6_dev  *idev;
 -      struct net_device *sp_dev;
 -      struct inet6_ifaddr *sp_ifa;
 -      struct rt6_info *sp_rt;
  
        /* ::1 */
  
        }
  
        add_addr(idev, &in6addr_loopback, 128, IFA_HOST);
 -
 -      /* Add routes to other interface's IPv6 addresses */
 -      for_each_netdev(dev_net(dev), sp_dev) {
 -              if (!strcmp(sp_dev->name, dev->name))
 -                      continue;
 -
 -              idev = __in6_dev_get(sp_dev);
 -              if (!idev)
 -                      continue;
 -
 -              read_lock_bh(&idev->lock);
 -              list_for_each_entry(sp_ifa, &idev->addr_list, if_list) {
 -
 -                      if (sp_ifa->flags & (IFA_F_DADFAILED | IFA_F_TENTATIVE))
 -                              continue;
 -
 -                      if (sp_ifa->rt) {
 -                              /* This dst has been added to garbage list when
 -                               * lo device down, release this obsolete dst and
 -                               * reallocate a new router for ifa.
 -                               */
 -                              if (!atomic_read(&sp_ifa->rt->rt6i_ref)) {
 -                                      ip6_rt_put(sp_ifa->rt);
 -                                      sp_ifa->rt = NULL;
 -                              } else {
 -                                      continue;
 -                              }
 -                      }
 -
 -                      sp_rt = addrconf_dst_alloc(idev, &sp_ifa->addr, false);
 -
 -                      /* Failure cases are ignored */
 -                      if (!IS_ERR(sp_rt)) {
 -                              sp_ifa->rt = sp_rt;
 -                              ip6_ins_rt(sp_rt);
 -                      }
 -              }
 -              read_unlock_bh(&idev->lock);
 -      }
  }
  
  void addrconf_add_linklocal(struct inet6_dev *idev,
@@@ -3279,11 -3321,11 +3279,11 @@@ static void addrconf_gre_config(struct 
  static int fixup_permanent_addr(struct inet6_dev *idev,
                                struct inet6_ifaddr *ifp)
  {
 -      /* rt6i_ref == 0 means the host route was removed from the
 +      /* !rt6i_node means the host route was removed from the
         * FIB, for example, if 'lo' device is taken down. In that
         * case regenerate the host route.
         */
 -      if (!ifp->rt || !atomic_read(&ifp->rt->rt6i_ref)) {
 +      if (!ifp->rt || !ifp->rt->rt6i_node) {
                struct rt6_info *rt, *prev;
  
                rt = addrconf_dst_alloc(idev, &ifp->addr, false);
@@@ -5514,7 -5556,7 +5514,7 @@@ static void __ipv6_ifa_notify(int event
                 * our DAD process, so we don't need
                 * to do it again
                 */
-               if (!(ifp->rt->rt6i_node))
+               if (!rcu_access_pointer(ifp->rt->rt6i_node))
                        ip6_ins_rt(ifp->rt);
                if (ifp->idev->cnf.forwarding)
                        addrconf_join_anycast(ifp);
@@@ -6563,21 -6605,21 +6563,21 @@@ int __init addrconf_init(void
        rtnl_af_register(&inet6_ops);
  
        err = __rtnl_register(PF_INET6, RTM_GETLINK, NULL, inet6_dump_ifinfo,
 -                            NULL);
 +                            0);
        if (err < 0)
                goto errout;
  
        /* Only the first call to __rtnl_register can fail */
 -      __rtnl_register(PF_INET6, RTM_NEWADDR, inet6_rtm_newaddr, NULL, NULL);
 -      __rtnl_register(PF_INET6, RTM_DELADDR, inet6_rtm_deladdr, NULL, NULL);
 +      __rtnl_register(PF_INET6, RTM_NEWADDR, inet6_rtm_newaddr, NULL, 0);
 +      __rtnl_register(PF_INET6, RTM_DELADDR, inet6_rtm_deladdr, NULL, 0);
        __rtnl_register(PF_INET6, RTM_GETADDR, inet6_rtm_getaddr,
 -                      inet6_dump_ifaddr, NULL);
 +                      inet6_dump_ifaddr, 0);
        __rtnl_register(PF_INET6, RTM_GETMULTICAST, NULL,
 -                      inet6_dump_ifmcaddr, NULL);
 +                      inet6_dump_ifmcaddr, 0);
        __rtnl_register(PF_INET6, RTM_GETANYCAST, NULL,
 -                      inet6_dump_ifacaddr, NULL);
 +                      inet6_dump_ifacaddr, 0);
        __rtnl_register(PF_INET6, RTM_GETNETCONF, inet6_netconf_get_devconf,
 -                      inet6_netconf_dump_devconf, NULL);
 +                      inet6_netconf_dump_devconf, 0);
  
        ipv6_addr_label_rtnl_register();
  
diff --combined net/ipv6/esp6.c
index 7fb41b0ad437d79223b104bfd5d6b3092779d29c,ab64f367d11cc256ddc56527d979a06e32170745..89910e2c10f4a63bcd285e28820141266f6d056f
@@@ -226,7 -226,7 +226,7 @@@ int esp6_output_head(struct xfrm_state 
        int tailen = esp->tailen;
  
        if (!skb_cloned(skb)) {
-               if (tailen <= skb_availroom(skb)) {
+               if (tailen <= skb_tailroom(skb)) {
                        nfrags = 1;
                        trailer = skb;
                        tail = skb_tail_pointer(trailer);
  
                        kunmap_atomic(vaddr);
  
-                       spin_unlock_bh(&x->lock);
                        nfrags = skb_shinfo(skb)->nr_frags;
  
                        __skb_fill_page_desc(skb, nfrags, page, pfrag->offset,
                        skb_shinfo(skb)->nr_frags = ++nfrags;
  
                        pfrag->offset = pfrag->offset + allocsize;
+                       spin_unlock_bh(&x->lock);
                        nfrags++;
  
                        skb->len += tailen;
@@@ -345,7 -346,7 +346,7 @@@ int esp6_output_tail(struct xfrm_state 
                           (unsigned char *)esph - skb->data,
                           assoclen + ivlen + esp->clen + alen);
        if (unlikely(err < 0))
-               goto error;
+               goto error_free;
  
        if (!esp->inplace) {
                int allocsize;
                spin_lock_bh(&x->lock);
                if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
                        spin_unlock_bh(&x->lock);
-                       goto error;
+                       goto error_free;
                }
  
                skb_shinfo(skb)->nr_frags = 1;
                                   (unsigned char *)esph - skb->data,
                                   assoclen + ivlen + esp->clen + alen);
                if (unlikely(err < 0))
-                       goto error;
+                       goto error_free;
        }
  
        if ((x->props.flags & XFRM_STATE_ESN))
  
        if (sg != dsg)
                esp_ssg_unref(x, tmp);
-       kfree(tmp);
  
+ error_free:
+       kfree(tmp);
  error:
        return err;
  }
@@@ -461,30 -463,28 +463,30 @@@ static int esp6_output(struct xfrm_stat
        return esp6_output_tail(x, skb, &esp);
  }
  
 -int esp6_input_done2(struct sk_buff *skb, int err)
 +static inline int esp_remove_trailer(struct sk_buff *skb)
  {
        struct xfrm_state *x = xfrm_input_state(skb);
        struct xfrm_offload *xo = xfrm_offload(skb);
        struct crypto_aead *aead = x->data;
 -      int alen = crypto_aead_authsize(aead);
 -      int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
 -      int elen = skb->len - hlen;
 -      int hdr_len = skb_network_header_len(skb);
 -      int padlen;
 +      int alen, hlen, elen;
 +      int padlen, trimlen;
 +      __wsum csumdiff;
        u8 nexthdr[2];
 +      int ret;
  
 -      if (!xo || (xo && !(xo->flags & CRYPTO_DONE)))
 -              kfree(ESP_SKB_CB(skb)->tmp);
 +      alen = crypto_aead_authsize(aead);
 +      hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
 +      elen = skb->len - hlen;
  
 -      if (unlikely(err))
 +      if (xo && (xo->flags & XFRM_ESP_NO_TRAILER)) {
 +              ret = xo->proto;
                goto out;
 +      }
  
        if (skb_copy_bits(skb, skb->len - alen - 2, nexthdr, 2))
                BUG();
  
 -      err = -EINVAL;
 +      ret = -EINVAL;
        padlen = nexthdr[0];
        if (padlen + 2 + alen >= elen) {
                net_dbg_ratelimited("ipsec esp packet is garbage padlen=%d, elen=%d\n",
                goto out;
        }
  
 -      /* ... check padding bits here. Silly. :-) */
 +      trimlen = alen + padlen + 2;
 +      if (skb->ip_summed == CHECKSUM_COMPLETE) {
 +              csumdiff = skb_checksum(skb, skb->len - trimlen, trimlen, 0);
 +              skb->csum = csum_block_sub(skb->csum, csumdiff,
 +                                         skb->len - trimlen);
 +      }
 +      pskb_trim(skb, skb->len - trimlen);
 +
 +      ret = nexthdr[1];
 +
 +out:
 +      return ret;
 +}
  
 -      pskb_trim(skb, skb->len - alen - padlen - 2);
 -      __skb_pull(skb, hlen);
 +int esp6_input_done2(struct sk_buff *skb, int err)
 +{
 +      struct xfrm_state *x = xfrm_input_state(skb);
 +      struct xfrm_offload *xo = xfrm_offload(skb);
 +      struct crypto_aead *aead = x->data;
 +      int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
 +      int hdr_len = skb_network_header_len(skb);
 +
 +      if (!xo || (xo && !(xo->flags & CRYPTO_DONE)))
 +              kfree(ESP_SKB_CB(skb)->tmp);
 +
 +      if (unlikely(err))
 +              goto out;
 +
 +      err = esp_remove_trailer(skb);
 +      if (unlikely(err < 0))
 +              goto out;
 +
 +      skb_postpull_rcsum(skb, skb_network_header(skb),
 +                         skb_network_header_len(skb));
 +      skb_pull_rcsum(skb, hlen);
        if (x->props.mode == XFRM_MODE_TUNNEL)
                skb_reset_transport_header(skb);
        else
                skb_set_transport_header(skb, -hdr_len);
  
 -      err = nexthdr[1];
 -
        /* RFC4303: Drop dummy packets without any error */
        if (err == IPPROTO_NONE)
                err = -EINVAL;
diff --combined net/ipv6/esp6_offload.c
index 8d4e2ba9163da0831946a43cf01f403c09efb2db,1cf437f75b0bf2bc446337ededbf58bd22673823..333a478aa1610441ce08e3ac82e92d3b48e3222d
@@@ -209,13 -209,11 +209,13 @@@ out
  static int esp6_input_tail(struct xfrm_state *x, struct sk_buff *skb)
  {
        struct crypto_aead *aead = x->data;
 +      struct xfrm_offload *xo = xfrm_offload(skb);
  
        if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead)))
                return -EINVAL;
  
 -      skb->ip_summed = CHECKSUM_NONE;
 +      if (!(xo->flags & CRYPTO_DONE))
 +              skb->ip_summed = CHECKSUM_NONE;
  
        return esp6_input_done2(skb, 0);
  }
@@@ -288,7 -286,7 +288,7 @@@ static int esp6_xmit(struct xfrm_state 
        esp.seqno = cpu_to_be64(xo->seq.low + ((u64)xo->seq.hi << 32));
  
        err = esp6_output_tail(x, skb, &esp);
-       if (err < 0)
+       if (err)
                return err;
  
        secpath_reset(skb);
@@@ -334,4 -332,3 +334,4 @@@ module_init(esp6_offload_init)
  module_exit(esp6_offload_exit);
  MODULE_LICENSE("GPL");
  MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>");
 +MODULE_ALIAS_XFRM_OFFLOAD_TYPE(AF_INET6, XFRM_PROTO_ESP);
diff --combined net/ipv6/ip6_fib.c
index 549aacc3cb2c6f803a19d97e295ceac56ce6ef44,e1c85bb4eac0fd50905fc441e726eca843fc36a8..a3b5c163325fa3448818c446c7de502eb4f5a9c8
@@@ -33,7 -33,6 +33,7 @@@
  #include <net/ndisc.h>
  #include <net/addrconf.h>
  #include <net/lwtunnel.h>
 +#include <net/fib_notifier.h>
  
  #include <net/ip6_fib.h>
  #include <net/ip6_route.h>
@@@ -149,12 -148,24 +149,24 @@@ static struct fib6_node *node_alloc(voi
        return fn;
  }
  
- static void node_free(struct fib6_node *fn)
+ static void node_free_immediate(struct fib6_node *fn)
+ {
+       kmem_cache_free(fib6_node_kmem, fn);
+ }
+ static void node_free_rcu(struct rcu_head *head)
  {
+       struct fib6_node *fn = container_of(head, struct fib6_node, rcu);
        kmem_cache_free(fib6_node_kmem, fn);
  }
  
 -static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt)
+ static void node_free(struct fib6_node *fn)
+ {
+       call_rcu(&fn->rcu, node_free_rcu);
+ }
 +void rt6_free_pcpu(struct rt6_info *non_pcpu_rt)
  {
        int cpu;
  
        free_percpu(non_pcpu_rt->rt6i_pcpu);
        non_pcpu_rt->rt6i_pcpu = NULL;
  }
 -
 -static void rt6_release(struct rt6_info *rt)
 -{
 -      if (atomic_dec_and_test(&rt->rt6i_ref)) {
 -              rt6_free_pcpu(rt);
 -              dst_dev_put(&rt->dst);
 -              dst_release(&rt->dst);
 -      }
 -}
 +EXPORT_SYMBOL_GPL(rt6_free_pcpu);
  
  static void fib6_link_table(struct net *net, struct fib6_table *tb)
  {
@@@ -295,109 -314,6 +307,109 @@@ static void __net_init fib6_tables_init
  
  #endif
  
 +unsigned int fib6_tables_seq_read(struct net *net)
 +{
 +      unsigned int h, fib_seq = 0;
 +
 +      rcu_read_lock();
 +      for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
 +              struct hlist_head *head = &net->ipv6.fib_table_hash[h];
 +              struct fib6_table *tb;
 +
 +              hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
 +                      read_lock_bh(&tb->tb6_lock);
 +                      fib_seq += tb->fib_seq;
 +                      read_unlock_bh(&tb->tb6_lock);
 +              }
 +      }
 +      rcu_read_unlock();
 +
 +      return fib_seq;
 +}
 +
 +static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net,
 +                                  enum fib_event_type event_type,
 +                                  struct rt6_info *rt)
 +{
 +      struct fib6_entry_notifier_info info = {
 +              .rt = rt,
 +      };
 +
 +      return call_fib6_notifier(nb, net, event_type, &info.info);
 +}
 +
 +static int call_fib6_entry_notifiers(struct net *net,
 +                                   enum fib_event_type event_type,
 +                                   struct rt6_info *rt)
 +{
 +      struct fib6_entry_notifier_info info = {
 +              .rt = rt,
 +      };
 +
 +      rt->rt6i_table->fib_seq++;
 +      return call_fib6_notifiers(net, event_type, &info.info);
 +}
 +
 +struct fib6_dump_arg {
 +      struct net *net;
 +      struct notifier_block *nb;
 +};
 +
 +static void fib6_rt_dump(struct rt6_info *rt, struct fib6_dump_arg *arg)
 +{
 +      if (rt == arg->net->ipv6.ip6_null_entry)
 +              return;
 +      call_fib6_entry_notifier(arg->nb, arg->net, FIB_EVENT_ENTRY_ADD, rt);
 +}
 +
 +static int fib6_node_dump(struct fib6_walker *w)
 +{
 +      struct rt6_info *rt;
 +
 +      for (rt = w->leaf; rt; rt = rt->dst.rt6_next)
 +              fib6_rt_dump(rt, w->args);
 +      w->leaf = NULL;
 +      return 0;
 +}
 +
 +static void fib6_table_dump(struct net *net, struct fib6_table *tb,
 +                          struct fib6_walker *w)
 +{
 +      w->root = &tb->tb6_root;
 +      read_lock_bh(&tb->tb6_lock);
 +      fib6_walk(net, w);
 +      read_unlock_bh(&tb->tb6_lock);
 +}
 +
 +/* Called with rcu_read_lock() */
 +int fib6_tables_dump(struct net *net, struct notifier_block *nb)
 +{
 +      struct fib6_dump_arg arg;
 +      struct fib6_walker *w;
 +      unsigned int h;
 +
 +      w = kzalloc(sizeof(*w), GFP_ATOMIC);
 +      if (!w)
 +              return -ENOMEM;
 +
 +      w->func = fib6_node_dump;
 +      arg.net = net;
 +      arg.nb = nb;
 +      w->args = &arg;
 +
 +      for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
 +              struct hlist_head *head = &net->ipv6.fib_table_hash[h];
 +              struct fib6_table *tb;
 +
 +              hlist_for_each_entry_rcu(tb, head, tb6_hlist)
 +                      fib6_table_dump(net, tb, w);
 +      }
 +
 +      kfree(w);
 +
 +      return 0;
 +}
 +
  static int fib6_dump_node(struct fib6_walker *w)
  {
        int res;
@@@ -697,9 -613,9 +709,9 @@@ insert_above
  
                if (!in || !ln) {
                        if (in)
-                               node_free(in);
+                               node_free_immediate(in);
                        if (ln)
-                               node_free(ln);
+                               node_free_immediate(ln);
                        return ERR_PTR(-ENOMEM);
                }
  
@@@ -829,6 -745,8 +841,6 @@@ static void fib6_purge_rt(struct rt6_in
                        }
                        fn = fn->parent;
                }
 -              /* No more references are possible at this point. */
 -              BUG_ON(atomic_read(&rt->rt6i_ref) != 1);
        }
  }
  
@@@ -971,10 -889,8 +983,10 @@@ add
  
                rt->dst.rt6_next = iter;
                *ins = rt;
-               rt->rt6i_node = fn;
+               rcu_assign_pointer(rt->rt6i_node, fn);
                atomic_inc(&rt->rt6i_ref);
 +              call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_ADD,
 +                                        rt);
                if (!info->skip_notify)
                        inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
                info->nl_net->ipv6.rt6_stats->fib_rt_entries++;
                        return err;
  
                *ins = rt;
-               rt->rt6i_node = fn;
+               rcu_assign_pointer(rt->rt6i_node, fn);
                rt->dst.rt6_next = iter->dst.rt6_next;
                atomic_inc(&rt->rt6i_ref);
 +              call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE,
 +                                        rt);
                if (!info->skip_notify)
                        inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
                if (!(fn->fn_flags & RTN_RTINFO)) {
                        fn->fn_flags |= RTN_RTINFO;
                }
                nsiblings = iter->rt6i_nsiblings;
 +              iter->rt6i_node = NULL;
                fib6_purge_rt(iter, fn, info->nl_net);
                if (fn->rr_ptr == iter)
                        fn->rr_ptr = NULL;
                                        break;
                                if (rt6_qualify_for_ecmp(iter)) {
                                        *ins = iter->dst.rt6_next;
 +                                      iter->rt6i_node = NULL;
                                        fib6_purge_rt(iter, fn, info->nl_net);
                                        if (fn->rr_ptr == iter)
                                                fn->rr_ptr = NULL;
@@@ -1138,7 -1050,7 +1150,7 @@@ int fib6_add(struct fib6_node *root, st
                                   root, and then (in failure) stale node
                                   in main tree.
                                 */
-                               node_free(sfn);
+                               node_free_immediate(sfn);
                                err = PTR_ERR(sn);
                                goto failure;
                        }
@@@ -1561,7 -1473,6 +1573,7 @@@ static void fib6_del_route(struct fib6_
  
        fib6_purge_rt(rt, fn, net);
  
 +      call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt);
        if (!info->skip_notify)
                inet6_rt_notify(RTM_DELROUTE, rt, info, 0);
        rt6_release(rt);
  
  int fib6_del(struct rt6_info *rt, struct nl_info *info)
  {
+       struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node,
+                                   lockdep_is_held(&rt->rt6i_table->tb6_lock));
        struct net *net = info->nl_net;
-       struct fib6_node *fn = rt->rt6i_node;
        struct rt6_info **rtp;
  
  #if RT6_DEBUG >= 2
@@@ -1759,7 -1671,9 +1772,9 @@@ static int fib6_clean_node(struct fib6_
                        if (res) {
  #if RT6_DEBUG >= 2
                                pr_debug("%s: del failed: rt=%p@%p err=%d\n",
-                                        __func__, rt, rt->rt6i_node, res);
+                                        __func__, rt,
+                                        rcu_access_pointer(rt->rt6i_node),
+                                        res);
  #endif
                                continue;
                        }
@@@ -1881,8 -1795,10 +1896,10 @@@ static int fib6_age(struct rt6_info *rt
                }
                gc_args->more++;
        } else if (rt->rt6i_flags & RTF_CACHE) {
+               if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout))
+                       rt->dst.obsolete = DST_OBSOLETE_KILL;
                if (atomic_read(&rt->dst.__refcnt) == 1 &&
-                   time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
+                   rt->dst.obsolete == DST_OBSOLETE_KILL) {
                        RT6_TRACE("aging clone %p\n", rt);
                        return -1;
                } else if (rt->rt6i_flags & RTF_GATEWAY) {
@@@ -1942,11 -1858,6 +1959,11 @@@ static void fib6_gc_timer_cb(unsigned l
  static int __net_init fib6_net_init(struct net *net)
  {
        size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ;
 +      int err;
 +
 +      err = fib6_notifier_init(net);
 +      if (err)
 +              return err;
  
        spin_lock_init(&net->ipv6.fib6_gc_lock);
        rwlock_init(&net->ipv6.fib6_walker_lock);
@@@ -1999,7 -1910,6 +2016,7 @@@ out_fib_table_hash
  out_rt6_stats:
        kfree(net->ipv6.rt6_stats);
  out_timer:
 +      fib6_notifier_exit(net);
        return -ENOMEM;
  }
  
@@@ -2016,7 -1926,6 +2033,7 @@@ static void fib6_net_exit(struct net *n
        kfree(net->ipv6.fib6_main_tbl);
        kfree(net->ipv6.fib_table_hash);
        kfree(net->ipv6.rt6_stats);
 +      fib6_notifier_exit(net);
  }
  
  static struct pernet_operations fib6_net_ops = {
@@@ -2040,7 -1949,7 +2057,7 @@@ int __init fib6_init(void
                goto out_kmem_cache_create;
  
        ret = __rtnl_register(PF_INET6, RTM_GETROUTE, NULL, inet6_dump_fib,
 -                            NULL);
 +                            0);
        if (ret)
                goto out_unregister_subsys;
  
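For illustration only (not part of this merge): the ip6_fib.c hunks above switch rt->rt6i_node over to RCU accessors, publishing with rcu_assign_pointer() and reading with rcu_dereference*(). Below is a minimal user-space sketch of that publish/read ordering; C11 release/consume atomics stand in for the kernel RCU primitives, and every name in it is hypothetical.

#include <stdatomic.h>
#include <stdio.h>

struct fib_node { int sernum; };

struct route {
        _Atomic(struct fib_node *) node;        /* stands in for rt->rt6i_node */
};

/* analogue of rcu_assign_pointer(): initialise the node, then publish it */
static void route_link(struct route *rt, struct fib_node *fn)
{
        atomic_store_explicit(&rt->node, fn, memory_order_release);
}

/* analogue of rcu_dereference(): a reader sees a fully initialised node */
static struct fib_node *route_node(struct route *rt)
{
        return atomic_load_explicit(&rt->node, memory_order_consume);
}

int main(void)
{
        static struct fib_node fn = { .sernum = 1 };
        struct route rt;

        atomic_init(&rt.node, NULL);
        route_link(&rt, &fn);
        printf("sernum=%d\n", route_node(&rt)->sernum);
        return 0;
}
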
diff --combined net/ipv6/route.c
index 4d0273459d49cc3b6cd72f0de135b63ac9d02250,2d0e7798c793a4058dc0ef3a5b50734e774500a9..26cc9f483b6d282f0a665bfc4c2c206da7981921
@@@ -440,11 -440,22 +440,12 @@@ static bool rt6_check_expired(const str
                if (time_after(jiffies, rt->dst.expires))
                        return true;
        } else if (rt->dst.from) {
-               return rt6_check_expired((struct rt6_info *) rt->dst.from);
+               return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
+                      rt6_check_expired((struct rt6_info *)rt->dst.from);
        }
        return false;
  }
  
 -/* Multipath route selection:
 - *   Hash based function using packet header and flowlabel.
 - * Adapted from fib_info_hashfn()
 - */
 -static int rt6_info_hash_nhsfn(unsigned int candidate_count,
 -                             const struct flowi6 *fl6)
 -{
 -      return get_hash_from_flowi6(fl6) % candidate_count;
 -}
 -
  static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
                                             struct flowi6 *fl6, int oif,
                                             int strict)
        struct rt6_info *sibling, *next_sibling;
        int route_choosen;
  
 -      route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
 +      /* We might have already computed the hash for ICMPv6 errors. In such
 +       * case it will always be non-zero. Otherwise now is the time to do it.
 +       */
 +      if (!fl6->mp_hash)
 +              fl6->mp_hash = rt6_multipath_hash(fl6, NULL);
 +
 +      route_choosen = fl6->mp_hash % (match->rt6i_nsiblings + 1);
        /* Don't change the route, if route_choosen == 0
         * (siblings do not include ourselves)
         */
@@@ -954,34 -959,10 +955,34 @@@ int ip6_ins_rt(struct rt6_info *rt
        return __ip6_ins_rt(rt, &info, &mxc, NULL);
  }
  
 +/* called with rcu_read_lock held */
 +static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
 +{
 +      struct net_device *dev = rt->dst.dev;
 +
 +      if (rt->rt6i_flags & RTF_LOCAL) {
 +              /* for copies of local routes, dst->dev needs to be the device
 +               * itself if it is a master, the master device if the device is
 +               * enslaved, and the loopback device by default
 +               */
 +              if (netif_is_l3_slave(dev) &&
 +                  !rt6_need_strict(&rt->rt6i_dst.addr))
 +                      dev = l3mdev_master_dev_rcu(dev);
 +              else if (!netif_is_l3_master(dev))
 +                      dev = dev_net(dev)->loopback_dev;
 +              /* the remaining case is netif_is_l3_master(dev) being true,
 +               * in which case dev itself is what we want returned
 +               */
 +      }
 +
 +      return dev;
 +}
 +
  static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
                                           const struct in6_addr *daddr,
                                           const struct in6_addr *saddr)
  {
 +      struct net_device *dev;
        struct rt6_info *rt;
  
        /*
        if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
                ort = (struct rt6_info *)ort->dst.from;
  
 -      rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, 0);
 -
 +      rcu_read_lock();
 +      dev = ip6_rt_get_dev_rcu(ort);
 +      rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
 +      rcu_read_unlock();
        if (!rt)
                return NULL;
  
  
  static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
  {
 +      struct net_device *dev;
        struct rt6_info *pcpu_rt;
  
 -      pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
 -                                rt->dst.dev, rt->dst.flags);
 -
 +      rcu_read_lock();
 +      dev = ip6_rt_get_dev_rcu(rt);
 +      pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
 +      rcu_read_unlock();
        if (!pcpu_rt)
                return NULL;
        ip6_rt_copy_init(pcpu_rt, rt);
@@@ -1210,54 -1187,6 +1211,54 @@@ struct dst_entry *ip6_route_input_looku
  }
  EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
  
 +static void ip6_multipath_l3_keys(const struct sk_buff *skb,
 +                                struct flow_keys *keys)
 +{
 +      const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
 +      const struct ipv6hdr *key_iph = outer_iph;
 +      const struct ipv6hdr *inner_iph;
 +      const struct icmp6hdr *icmph;
 +      struct ipv6hdr _inner_iph;
 +
 +      if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
 +              goto out;
 +
 +      icmph = icmp6_hdr(skb);
 +      if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
 +          icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
 +          icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
 +          icmph->icmp6_type != ICMPV6_PARAMPROB)
 +              goto out;
 +
 +      inner_iph = skb_header_pointer(skb,
 +                                     skb_transport_offset(skb) + sizeof(*icmph),
 +                                     sizeof(_inner_iph), &_inner_iph);
 +      if (!inner_iph)
 +              goto out;
 +
 +      key_iph = inner_iph;
 +out:
 +      memset(keys, 0, sizeof(*keys));
 +      keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
 +      keys->addrs.v6addrs.src = key_iph->saddr;
 +      keys->addrs.v6addrs.dst = key_iph->daddr;
 +      keys->tags.flow_label = ip6_flowinfo(key_iph);
 +      keys->basic.ip_proto = key_iph->nexthdr;
 +}
 +
 +/* if skb is set it will be used and fl6 can be NULL */
 +u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
 +{
 +      struct flow_keys hash_keys;
 +
 +      if (skb) {
 +              ip6_multipath_l3_keys(skb, &hash_keys);
 +              return flow_hash_from_keys(&hash_keys);
 +      }
 +
 +      return get_hash_from_flowi6(fl6);
 +}
 +
  void ip6_route_input(struct sk_buff *skb)
  {
        const struct ipv6hdr *iph = ipv6_hdr(skb);
        tun_info = skb_tunnel_info(skb);
        if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
                fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
 +      if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
 +              fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
        skb_dst_drop(skb);
        skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
  }
@@@ -1363,7 -1290,9 +1364,9 @@@ static void rt6_dst_from_metrics_check(
  
  static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
  {
-       if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
+       u32 rt_cookie = 0;
+       if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
                return NULL;
  
        if (rt6_check_expired(rt))
@@@ -1431,8 -1360,14 +1434,14 @@@ static void ip6_link_failure(struct sk_
                if (rt->rt6i_flags & RTF_CACHE) {
                        if (dst_hold_safe(&rt->dst))
                                ip6_del_rt(rt);
-               } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
-                       rt->rt6i_node->fn_sernum = -1;
+               } else {
+                       struct fib6_node *fn;
+                       rcu_read_lock();
+                       fn = rcu_dereference(rt->rt6i_node);
+                       if (fn && (rt->rt6i_flags & RTF_DEFAULT))
+                               fn->fn_sernum = -1;
+                       rcu_read_unlock();
                }
        }
  }
@@@ -1449,7 -1384,8 +1458,8 @@@ static void rt6_do_update_pmtu(struct r
  static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
  {
        return !(rt->rt6i_flags & RTF_CACHE) &&
-               (rt->rt6i_flags & RTF_PCPU || rt->rt6i_node);
+               (rt->rt6i_flags & RTF_PCPU ||
+                rcu_access_pointer(rt->rt6i_node));
  }
  
  static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
@@@ -2762,9 -2698,15 +2772,9 @@@ struct rt6_info *addrconf_dst_alloc(str
  {
        u32 tb_id;
        struct net *net = dev_net(idev->dev);
 -      struct net_device *dev = net->loopback_dev;
 +      struct net_device *dev = idev->dev;
        struct rt6_info *rt;
  
 -      /* use L3 Master device as loopback for host routes if device
 -       * is enslaved and address is not link local or multicast
 -       */
 -      if (!rt6_need_strict(addr))
 -              dev = l3mdev_master_dev_rcu(idev->dev) ? : dev;
 -
        rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
        if (!rt)
                return ERR_PTR(-ENOMEM);
@@@ -3395,9 -3337,6 +3405,9 @@@ static int rt6_nexthop_info(struct sk_b
                        goto nla_put_failure;
        }
  
 +      if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
 +              *flags |= RTNH_F_OFFLOAD;
 +
        /* not needed for multipath encoding b/c it has a rtnexthop struct */
        if (!skip_oif && rt->dst.dev &&
            nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
@@@ -3676,11 -3615,8 +3686,11 @@@ static int inet6_rtm_getroute(struct sk
                struct net_device *dev;
                int flags = 0;
  
 -              dev = __dev_get_by_index(net, iif);
 +              rcu_read_lock();
 +
 +              dev = dev_get_by_index_rcu(net, iif);
                if (!dev) {
 +                      rcu_read_unlock();
                        err = -ENODEV;
                        goto errout;
                }
  
                if (!fibmatch)
                        dst = ip6_route_input_lookup(net, dev, &fl6, flags);
 +              else
 +                      dst = ip6_route_lookup(net, &fl6, 0);
 +
 +              rcu_read_unlock();
        } else {
                fl6.flowi6_oif = oif;
  
                if (!fibmatch)
                        dst = ip6_route_output(net, NULL, &fl6);
 +              else
 +                      dst = ip6_route_lookup(net, &fl6, 0);
        }
  
 -      if (fibmatch)
 -              dst = ip6_route_lookup(net, &fl6, 0);
  
        rt = container_of(dst, struct rt6_info, dst);
        if (rt->dst.error) {
@@@ -3996,7 -3928,6 +4006,7 @@@ static int __net_init ip6_route_net_ini
                         ip6_template_metrics, true);
  
  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
 +      net->ipv6.fib6_has_custom_rules = false;
        net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
                                               sizeof(*net->ipv6.ip6_prohibit_entry),
                                               GFP_KERNEL);
@@@ -4172,10 -4103,9 +4182,10 @@@ int __init ip6_route_init(void
                goto fib6_rules_init;
  
        ret = -ENOBUFS;
 -      if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
 -          __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
 -          __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
 +      if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, 0) ||
 +          __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, 0) ||
 +          __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL,
 +                          RTNL_FLAG_DOIT_UNLOCKED))
                goto out_register_late_subsys;
  
        ret = register_netdevice_notifier(&ip6_route_dev_notifier);
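For illustration only (not part of this merge): the route.c hunks above pick an ECMP sibling as mp_hash % (rt6i_nsiblings + 1), computing the hash once (e.g. for ICMPv6 errors) and reusing it when it is already non-zero. A minimal user-space sketch of that selection; the hash function and every name below are hypothetical stand-ins for flow_hash_from_keys().

#include <stdint.h>
#include <stdio.h>

/* toy stand-in for flow_hash_from_keys(): mix src/dst/flowlabel */
static uint32_t toy_flow_hash(uint32_t saddr, uint32_t daddr, uint32_t flowlabel)
{
        uint32_t h = saddr * 2654435761u;

        h ^= daddr + 0x9e3779b9u + (h << 6) + (h >> 2);
        h ^= flowlabel + 0x9e3779b9u + (h << 6) + (h >> 2);
        return h ? h : 1;       /* keep non-zero so "already computed" is detectable */
}

static unsigned int pick_sibling(uint32_t *mp_hash, uint32_t saddr,
                                 uint32_t daddr, uint32_t flowlabel,
                                 unsigned int nsiblings)
{
        if (!*mp_hash)                          /* compute once, reuse later */
                *mp_hash = toy_flow_hash(saddr, daddr, flowlabel);
        return *mp_hash % (nsiblings + 1);      /* 0 means "keep the matched route" */
}

int main(void)
{
        uint32_t mp_hash = 0;
        unsigned int idx = pick_sibling(&mp_hash, 0x0a000001, 0x0a000002, 7, 3);

        printf("chosen sibling index: %u (hash 0x%x)\n", idx, (unsigned int)mp_hash);
        return 0;
}
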
diff --combined net/ipv6/udp.c
index 976f3039135612ffea9da4861f7cf17fa797d77c,d6886228e1d05c4dd192f5fe431fdaca1ffadabd..42ebb9ad46cc16405a6b4c6d948a8e82238ef732
@@@ -129,7 -129,7 +129,7 @@@ static void udp_v6_rehash(struct sock *
  static int compute_score(struct sock *sk, struct net *net,
                         const struct in6_addr *saddr, __be16 sport,
                         const struct in6_addr *daddr, unsigned short hnum,
 -                       int dif, bool exact_dif)
 +                       int dif, int sdif, bool exact_dif)
  {
        int score;
        struct inet_sock *inet;
        }
  
        if (sk->sk_bound_dev_if || exact_dif) {
 -              if (sk->sk_bound_dev_if != dif)
 +              bool dev_match = (sk->sk_bound_dev_if == dif ||
 +                                sk->sk_bound_dev_if == sdif);
 +
 +              if (exact_dif && !dev_match)
                        return -1;
 -              score++;
 +              if (sk->sk_bound_dev_if && dev_match)
 +                      score++;
        }
  
        if (sk->sk_incoming_cpu == raw_smp_processor_id())
  /* called with rcu_read_lock() */
  static struct sock *udp6_lib_lookup2(struct net *net,
                const struct in6_addr *saddr, __be16 sport,
 -              const struct in6_addr *daddr, unsigned int hnum, int dif,
 -              bool exact_dif, struct udp_hslot *hslot2,
 -              struct sk_buff *skb)
 +              const struct in6_addr *daddr, unsigned int hnum,
 +              int dif, int sdif, bool exact_dif,
 +              struct udp_hslot *hslot2, struct sk_buff *skb)
  {
        struct sock *sk, *result;
        int score, badness, matches = 0, reuseport = 0;
        badness = -1;
        udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
                score = compute_score(sk, net, saddr, sport,
 -                                    daddr, hnum, dif, exact_dif);
 +                                    daddr, hnum, dif, sdif, exact_dif);
                if (score > badness) {
                        reuseport = sk->sk_reuseport;
                        if (reuseport) {
  
  /* rcu_read_lock() must be held */
  struct sock *__udp6_lib_lookup(struct net *net,
 -                                    const struct in6_addr *saddr, __be16 sport,
 -                                    const struct in6_addr *daddr, __be16 dport,
 -                                    int dif, struct udp_table *udptable,
 -                                    struct sk_buff *skb)
 +                             const struct in6_addr *saddr, __be16 sport,
 +                             const struct in6_addr *daddr, __be16 dport,
 +                             int dif, int sdif, struct udp_table *udptable,
 +                             struct sk_buff *skb)
  {
        struct sock *sk, *result;
        unsigned short hnum = ntohs(dport);
                        goto begin;
  
                result = udp6_lib_lookup2(net, saddr, sport,
 -                                        daddr, hnum, dif, exact_dif,
 +                                        daddr, hnum, dif, sdif, exact_dif,
                                          hslot2, skb);
                if (!result) {
                        unsigned int old_slot2 = slot2;
                                goto begin;
  
                        result = udp6_lib_lookup2(net, saddr, sport,
 -                                                daddr, hnum, dif,
 +                                                daddr, hnum, dif, sdif,
                                                  exact_dif, hslot2,
                                                  skb);
                }
@@@ -265,7 -261,7 +265,7 @@@ begin
        badness = -1;
        sk_for_each_rcu(sk, &hslot->head) {
                score = compute_score(sk, net, saddr, sport, daddr, hnum, dif,
 -                                    exact_dif);
 +                                    sdif, exact_dif);
                if (score > badness) {
                        reuseport = sk->sk_reuseport;
                        if (reuseport) {
@@@ -298,7 -294,7 +298,7 @@@ static struct sock *__udp6_lib_lookup_s
  
        return __udp6_lib_lookup(dev_net(skb->dev), &iph->saddr, sport,
                                 &iph->daddr, dport, inet6_iif(skb),
 -                               udptable, skb);
 +                               inet6_sdif(skb), udptable, skb);
  }
  
  struct sock *udp6_lib_lookup_skb(struct sk_buff *skb,
  
        return __udp6_lib_lookup(dev_net(skb->dev), &iph->saddr, sport,
                                 &iph->daddr, dport, inet6_iif(skb),
 -                               &udp_table, skb);
 +                               inet6_sdif(skb), &udp_table, skb);
  }
  EXPORT_SYMBOL_GPL(udp6_lib_lookup_skb);
  
@@@ -324,7 -320,7 +324,7 @@@ struct sock *udp6_lib_lookup(struct ne
        struct sock *sk;
  
        sk =  __udp6_lib_lookup(net, saddr, sport, daddr, dport,
 -                              dif, &udp_table, NULL);
 +                              dif, 0, &udp_table, NULL);
        if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
                sk = NULL;
        return sk;
@@@ -506,7 -502,7 +506,7 @@@ void __udp6_lib_err(struct sk_buff *skb
        struct net *net = dev_net(skb->dev);
  
        sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source,
 -                             inet6_iif(skb), udptable, skb);
 +                             inet6_iif(skb), 0, udptable, skb);
        if (!sk) {
                __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
                                  ICMP6_MIB_INERRORS);
@@@ -772,6 -768,15 +772,15 @@@ start_lookup
        return 0;
  }
  
+ static void udp6_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst)
+ {
+       if (udp_sk_rx_dst_set(sk, dst)) {
+               const struct rt6_info *rt = (const struct rt6_info *)dst;
+               inet6_sk(sk)->rx_dst_cookie = rt6_get_cookie(rt);
+       }
+ }
  int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
                   int proto)
  {
                int ret;
  
                if (unlikely(sk->sk_rx_dst != dst))
-                       udp_sk_rx_dst_set(sk, dst);
+                       udp6_sk_rx_dst_set(sk, dst);
  
                ret = udpv6_queue_rcv_skb(sk, skb);
                sock_put(sk);
@@@ -898,7 -903,7 +907,7 @@@ discard
  static struct sock *__udp6_lib_demux_lookup(struct net *net,
                        __be16 loc_port, const struct in6_addr *loc_addr,
                        __be16 rmt_port, const struct in6_addr *rmt_addr,
 -                      int dif)
 +                      int dif, int sdif)
  {
        unsigned short hnum = ntohs(loc_port);
        unsigned int hash2 = udp6_portaddr_hash(net, loc_addr, hnum);
  
        udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
                if (sk->sk_state == TCP_ESTABLISHED &&
 -                  INET6_MATCH(sk, net, rmt_addr, loc_addr, ports, dif))
 +                  INET6_MATCH(sk, net, rmt_addr, loc_addr, ports, dif, sdif))
                        return sk;
                /* Only check first socket in chain */
                break;
@@@ -924,7 -929,6 +933,7 @@@ static void udp_v6_early_demux(struct s
        struct sock *sk;
        struct dst_entry *dst;
        int dif = skb->dev->ifindex;
 +      int sdif = inet6_sdif(skb);
  
        if (!pskb_may_pull(skb, skb_transport_offset(skb) +
            sizeof(struct udphdr)))
                sk = __udp6_lib_demux_lookup(net, uh->dest,
                                             &ipv6_hdr(skb)->daddr,
                                             uh->source, &ipv6_hdr(skb)->saddr,
 -                                           dif);
 +                                           dif, sdif);
        else
                return;
  
@@@ -1472,9 -1476,6 +1481,9 @@@ int compat_udpv6_getsockopt(struct soc
  }
  #endif
  
 +/* thinking of making this const? Don't.
 + * early_demux can change based on sysctl.
 + */
  static struct inet6_protocol udpv6_protocol = {
        .early_demux    =       udp_v6_early_demux,
        .early_demux_handler =  udp_v6_early_demux,
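For illustration only (not part of this merge): compute_score() in the udp.c hunks above now takes the L3 master ifindex (sdif) alongside the receiving ifindex (dif) and counts a socket bound to either one as a device match. A simplified, hypothetical user-space version of that scoring:

#include <stdbool.h>
#include <stdio.h>

struct toy_sock {
        int bound_dev_if;       /* 0 = not bound to a device */
};

/* returns -1 when the socket cannot match, otherwise a score >= 0 */
static int toy_score(const struct toy_sock *sk, int dif, int sdif, bool exact_dif)
{
        int score = 0;

        if (sk->bound_dev_if || exact_dif) {
                bool dev_match = (sk->bound_dev_if == dif ||
                                  sk->bound_dev_if == sdif);

                if (exact_dif && !dev_match)
                        return -1;
                if (sk->bound_dev_if && dev_match)
                        score++;
        }
        return score;
}

int main(void)
{
        struct toy_sock vrf_bound = { .bound_dev_if = 10 };     /* bound to a VRF device */

        /* packet received on ifindex 3, which is enslaved to VRF ifindex 10 */
        printf("score: %d\n", toy_score(&vrf_bound, 3, 10, true));
        return 0;
}
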
diff --combined net/kcm/kcmsock.c
index 48e993b2dbcf1afae04968ed840e2e98c2cf6772,4abf6287d7e1c29314db5c846acd16c3a2a377db..af4e76ac88ff0817398d1d7460a41f0cd5fe6f30
@@@ -96,12 -96,12 +96,12 @@@ static void kcm_update_rx_mux_stats(str
                                    struct kcm_psock *psock)
  {
        STRP_STATS_ADD(mux->stats.rx_bytes,
 -                     psock->strp.stats.rx_bytes -
 +                     psock->strp.stats.bytes -
                       psock->saved_rx_bytes);
        mux->stats.rx_msgs +=
 -              psock->strp.stats.rx_msgs - psock->saved_rx_msgs;
 -      psock->saved_rx_msgs = psock->strp.stats.rx_msgs;
 -      psock->saved_rx_bytes = psock->strp.stats.rx_bytes;
 +              psock->strp.stats.msgs - psock->saved_rx_msgs;
 +      psock->saved_rx_msgs = psock->strp.stats.msgs;
 +      psock->saved_rx_bytes = psock->strp.stats.bytes;
  }
  
  static void kcm_update_tx_mux_stats(struct kcm_mux *mux,
@@@ -1118,7 -1118,7 +1118,7 @@@ static int kcm_recvmsg(struct socket *s
        struct kcm_sock *kcm = kcm_sk(sk);
        int err = 0;
        long timeo;
 -      struct strp_rx_msg *rxm;
 +      struct strp_msg *stm;
        int copied = 0;
        struct sk_buff *skb;
  
  
        /* Okay, have a message on the receive queue */
  
 -      rxm = strp_rx_msg(skb);
 +      stm = strp_msg(skb);
  
 -      if (len > rxm->full_len)
 -              len = rxm->full_len;
 +      if (len > stm->full_len)
 +              len = stm->full_len;
  
 -      err = skb_copy_datagram_msg(skb, rxm->offset, msg, len);
 +      err = skb_copy_datagram_msg(skb, stm->offset, msg, len);
        if (err < 0)
                goto out;
  
        copied = len;
        if (likely(!(flags & MSG_PEEK))) {
                KCM_STATS_ADD(kcm->stats.rx_bytes, copied);
 -              if (copied < rxm->full_len) {
 +              if (copied < stm->full_len) {
                        if (sock->type == SOCK_DGRAM) {
                                /* Truncated message */
                                msg->msg_flags |= MSG_TRUNC;
                                goto msg_finished;
                        }
 -                      rxm->offset += copied;
 -                      rxm->full_len -= copied;
 +                      stm->offset += copied;
 +                      stm->full_len -= copied;
                } else {
  msg_finished:
                        /* Finished with message */
@@@ -1175,7 -1175,7 +1175,7 @@@ static ssize_t kcm_splice_read(struct s
        struct sock *sk = sock->sk;
        struct kcm_sock *kcm = kcm_sk(sk);
        long timeo;
 -      struct strp_rx_msg *rxm;
 +      struct strp_msg *stm;
        int err = 0;
        ssize_t copied;
        struct sk_buff *skb;
  
        /* Okay, have a message on the receive queue */
  
 -      rxm = strp_rx_msg(skb);
 +      stm = strp_msg(skb);
  
 -      if (len > rxm->full_len)
 -              len = rxm->full_len;
 +      if (len > stm->full_len)
 +              len = stm->full_len;
  
 -      copied = skb_splice_bits(skb, sk, rxm->offset, pipe, len, flags);
 +      copied = skb_splice_bits(skb, sk, stm->offset, pipe, len, flags);
        if (copied < 0) {
                err = copied;
                goto err_out;
  
        KCM_STATS_ADD(kcm->stats.rx_bytes, copied);
  
 -      rxm->offset += copied;
 -      rxm->full_len -= copied;
 +      stm->offset += copied;
 +      stm->full_len -= copied;
  
        /* We have no way to return MSG_EOR. If all the bytes have been
         * read we still leave the message in the receive socket buffer.
@@@ -1376,17 -1376,17 +1376,21 @@@ static int kcm_attach(struct socket *so
        struct kcm_psock *psock = NULL, *tpsock;
        struct list_head *head;
        int index = 0;
 -      struct strp_callbacks cb;
 +      static const struct strp_callbacks cb = {
 +              .rcv_msg = kcm_rcv_strparser,
 +              .parse_msg = kcm_parse_func_strparser,
 +              .read_sock_done = kcm_read_sock_done,
 +      };
        int err;
  
        csk = csock->sk;
        if (!csk)
                return -EINVAL;
  
+       /* We must prevent loops or risk deadlock! */
+       if (csk->sk_family == PF_KCM)
+               return -EOPNOTSUPP;
        psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL);
        if (!psock)
                return -ENOMEM;
        psock->sk = csk;
        psock->bpf_prog = prog;
  
 -      cb.rcv_msg = kcm_rcv_strparser;
 -      cb.abort_parser = NULL;
 -      cb.parse_msg = kcm_parse_func_strparser;
 -      cb.read_sock_done = kcm_read_sock_done;
 -
        err = strp_init(&psock->strp, csk, &cb);
        if (err) {
                kmem_cache_free(kcm_psockp, psock);
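For illustration only (not part of this merge): kcm_attach() above replaces a strp_callbacks struct built on the stack at every call with a single static const table of function pointers. A minimal, hypothetical sketch of that pattern:

#include <stdio.h>

struct toy_callbacks {
        int  (*parse_msg)(const char *buf);
        void (*rcv_msg)(const char *buf);
};

static int toy_parse(const char *buf)   { return buf[0] != '\0'; }
static void toy_rcv(const char *buf)    { printf("got: %s\n", buf); }

/* one read-only instance shared by every attach, instead of
 * re-initialising the struct on the stack each time */
static const struct toy_callbacks cb = {
        .parse_msg = toy_parse,
        .rcv_msg   = toy_rcv,
};

static void toy_attach(const struct toy_callbacks *ops, const char *buf)
{
        if (ops->parse_msg(buf))
                ops->rcv_msg(buf);
}

int main(void)
{
        toy_attach(&cb, "hello");
        return 0;
}
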
diff --combined net/packet/af_packet.c
index f31cb71172e00cf0d00e05e2771281795d42213b,1c61af9af67dae10ea9675a45b191d7302c69151..c26172995511f77bf9ed4c36d55fd1f430f6de5e
@@@ -177,6 -177,8 +177,6 @@@ static int packet_set_ring(struct sock 
  #define BLK_PLUS_PRIV(sz_of_priv) \
        (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
  
 -#define PGV_FROM_VMALLOC 1
 -
  #define BLOCK_STATUS(x)       ((x)->hdr.bh1.block_status)
  #define BLOCK_NUM_PKTS(x)     ((x)->hdr.bh1.num_pkts)
  #define BLOCK_O2FP(x)         ((x)->hdr.bh1.offset_to_first_pkt)
@@@ -2189,6 -2191,7 +2189,7 @@@ static int tpacket_rcv(struct sk_buff *
        struct timespec ts;
        __u32 ts_status;
        bool is_drop_n_account = false;
+       bool do_vnet = false;
  
        /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
         * We may add members to them until current aligned size without forcing
                netoff = TPACKET_ALIGN(po->tp_hdrlen +
                                       (maclen < 16 ? 16 : maclen)) +
                                       po->tp_reserve;
-               if (po->has_vnet_hdr)
+               if (po->has_vnet_hdr) {
                        netoff += sizeof(struct virtio_net_hdr);
+                       do_vnet = true;
+               }
                macoff = netoff - maclen;
        }
        if (po->tp_version <= TPACKET_V2) {
                                        skb_set_owner_r(copy_skb, sk);
                        }
                        snaplen = po->rx_ring.frame_size - macoff;
-                       if ((int)snaplen < 0)
+                       if ((int)snaplen < 0) {
                                snaplen = 0;
+                               do_vnet = false;
+                       }
                }
        } else if (unlikely(macoff + snaplen >
                            GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
                if (unlikely((int)snaplen < 0)) {
                        snaplen = 0;
                        macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
+                       do_vnet = false;
                }
        }
        spin_lock(&sk->sk_receive_queue.lock);
        }
        spin_unlock(&sk->sk_receive_queue.lock);
  
-       if (po->has_vnet_hdr) {
+       if (do_vnet) {
                if (virtio_net_hdr_from_skb(skb, h.raw + macoff -
                                            sizeof(struct virtio_net_hdr),
                                            vio_le(), true)) {
diff --combined net/sched/cls_api.c
index d470a4e2de58f16afc534945a00a2c6c3284f300,6c5ea84d2682ab81fb9755361fa77326fa9d9935..ea6c65fd5fc5fa31669191470d963bc851822a00
@@@ -100,6 -100,21 +100,6 @@@ int unregister_tcf_proto_ops(struct tcf
  }
  EXPORT_SYMBOL(unregister_tcf_proto_ops);
  
 -static int tfilter_notify(struct net *net, struct sk_buff *oskb,
 -                        struct nlmsghdr *n, struct tcf_proto *tp,
 -                        unsigned long fh, int event, bool unicast);
 -
 -static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb,
 -                               struct nlmsghdr *n,
 -                               struct tcf_chain *chain, int event)
 -{
 -      struct tcf_proto *tp;
 -
 -      for (tp = rtnl_dereference(chain->filter_chain);
 -           tp; tp = rtnl_dereference(tp->next))
 -              tfilter_notify(net, oskb, n, tp, 0, event, false);
 -}
 -
  /* Select new prio value from the range, managed by kernel. */
  
  static inline u32 tcf_auto_prio(struct tcf_proto *tp)
@@@ -200,9 -215,15 +200,15 @@@ static void tcf_chain_flush(struct tcf_
  
  static void tcf_chain_destroy(struct tcf_chain *chain)
  {
-       list_del(&chain->list);
-       tcf_chain_flush(chain);
-       kfree(chain);
+       /* May be already removed from the list by the previous call. */
+       if (!list_empty(&chain->list))
+               list_del_init(&chain->list);
+       /* There might still be a reference held when we got here from
+        * tcf_block_put. Wait for the user to drop reference before free.
+        */
+       if (!chain->refcnt)
+               kfree(chain);
  }
  
  struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index,
@@@ -273,8 -294,10 +279,10 @@@ void tcf_block_put(struct tcf_block *bl
        if (!block)
                return;
  
-       list_for_each_entry_safe(chain, tmp, &block->chain_list, list)
+       list_for_each_entry_safe(chain, tmp, &block->chain_list, list) {
+               tcf_chain_flush(chain);
                tcf_chain_destroy(chain);
+       }
        kfree(block);
  }
  EXPORT_SYMBOL(tcf_block_put);
@@@ -392,109 -415,6 +400,109 @@@ static struct tcf_proto *tcf_chain_tp_f
        return tp;
  }
  
 +static int tcf_fill_node(struct net *net, struct sk_buff *skb,
 +                       struct tcf_proto *tp, void *fh, u32 portid,
 +                       u32 seq, u16 flags, int event)
 +{
 +      struct tcmsg *tcm;
 +      struct nlmsghdr  *nlh;
 +      unsigned char *b = skb_tail_pointer(skb);
 +
 +      nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
 +      if (!nlh)
 +              goto out_nlmsg_trim;
 +      tcm = nlmsg_data(nlh);
 +      tcm->tcm_family = AF_UNSPEC;
 +      tcm->tcm__pad1 = 0;
 +      tcm->tcm__pad2 = 0;
 +      tcm->tcm_ifindex = qdisc_dev(tp->q)->ifindex;
 +      tcm->tcm_parent = tp->classid;
 +      tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol);
 +      if (nla_put_string(skb, TCA_KIND, tp->ops->kind))
 +              goto nla_put_failure;
 +      if (nla_put_u32(skb, TCA_CHAIN, tp->chain->index))
 +              goto nla_put_failure;
 +      if (!fh) {
 +              tcm->tcm_handle = 0;
 +      } else {
 +              if (tp->ops->dump && tp->ops->dump(net, tp, fh, skb, tcm) < 0)
 +                      goto nla_put_failure;
 +      }
 +      nlh->nlmsg_len = skb_tail_pointer(skb) - b;
 +      return skb->len;
 +
 +out_nlmsg_trim:
 +nla_put_failure:
 +      nlmsg_trim(skb, b);
 +      return -1;
 +}
 +
 +static int tfilter_notify(struct net *net, struct sk_buff *oskb,
 +                        struct nlmsghdr *n, struct tcf_proto *tp,
 +                        void *fh, int event, bool unicast)
 +{
 +      struct sk_buff *skb;
 +      u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
 +
 +      skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
 +      if (!skb)
 +              return -ENOBUFS;
 +
 +      if (tcf_fill_node(net, skb, tp, fh, portid, n->nlmsg_seq,
 +                        n->nlmsg_flags, event) <= 0) {
 +              kfree_skb(skb);
 +              return -EINVAL;
 +      }
 +
 +      if (unicast)
 +              return netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT);
 +
 +      return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
 +                            n->nlmsg_flags & NLM_F_ECHO);
 +}
 +
 +static int tfilter_del_notify(struct net *net, struct sk_buff *oskb,
 +                            struct nlmsghdr *n, struct tcf_proto *tp,
 +                            void *fh, bool unicast, bool *last)
 +{
 +      struct sk_buff *skb;
 +      u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
 +      int err;
 +
 +      skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
 +      if (!skb)
 +              return -ENOBUFS;
 +
 +      if (tcf_fill_node(net, skb, tp, fh, portid, n->nlmsg_seq,
 +                        n->nlmsg_flags, RTM_DELTFILTER) <= 0) {
 +              kfree_skb(skb);
 +              return -EINVAL;
 +      }
 +
 +      err = tp->ops->delete(tp, fh, last);
 +      if (err) {
 +              kfree_skb(skb);
 +              return err;
 +      }
 +
 +      if (unicast)
 +              return netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT);
 +
 +      return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
 +                            n->nlmsg_flags & NLM_F_ECHO);
 +}
 +
 +static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb,
 +                               struct nlmsghdr *n,
 +                               struct tcf_chain *chain, int event)
 +{
 +      struct tcf_proto *tp;
 +
 +      for (tp = rtnl_dereference(chain->filter_chain);
 +           tp; tp = rtnl_dereference(tp->next))
 +              tfilter_notify(net, oskb, n, tp, 0, event, false);
 +}
 +
  /* Add/change/delete/get a filter node */
  
  static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
        struct tcf_proto *tp;
        const struct Qdisc_class_ops *cops;
        unsigned long cl;
 -      unsigned long fh;
 +      void *fh;
        int err;
        int tp_created;
  
@@@ -586,7 -506,7 +594,7 @@@ replay
  
        /* Do we search for filter, attached to class? */
        if (TC_H_MIN(parent)) {
 -              cl = cops->get(q, parent);
 +              cl = cops->find(q, parent);
                if (cl == 0)
                        return -ENOENT;
        }
  
        fh = tp->ops->get(tp, t->tcm_handle);
  
 -      if (fh == 0) {
 +      if (!fh) {
                if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) {
                        tcf_chain_tp_remove(chain, &chain_info, tp);
                        tfilter_notify(net, skb, n, tp, fh,
                        }
                        break;
                case RTM_DELTFILTER:
 -                      err = tp->ops->delete(tp, fh, &last);
 +                      err = tfilter_del_notify(net, skb, n, tp, fh, false,
 +                                               &last);
                        if (err)
                                goto errout;
 -                      tfilter_notify(net, skb, n, tp, t->tcm_handle,
 -                                     RTM_DELTFILTER, false);
                        if (last) {
                                tcf_chain_tp_remove(chain, &chain_info, tp);
                                tcf_proto_destroy(tp);
  errout:
        if (chain)
                tcf_chain_put(chain);
 -      if (cl)
 -              cops->put(q, cl);
        if (err == -EAGAIN)
                /* Replay the request. */
                goto replay;
        return err;
  }
  
 -static int tcf_fill_node(struct net *net, struct sk_buff *skb,
 -                       struct tcf_proto *tp, unsigned long fh, u32 portid,
 -                       u32 seq, u16 flags, int event)
 -{
 -      struct tcmsg *tcm;
 -      struct nlmsghdr  *nlh;
 -      unsigned char *b = skb_tail_pointer(skb);
 -
 -      nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
 -      if (!nlh)
 -              goto out_nlmsg_trim;
 -      tcm = nlmsg_data(nlh);
 -      tcm->tcm_family = AF_UNSPEC;
 -      tcm->tcm__pad1 = 0;
 -      tcm->tcm__pad2 = 0;
 -      tcm->tcm_ifindex = qdisc_dev(tp->q)->ifindex;
 -      tcm->tcm_parent = tp->classid;
 -      tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol);
 -      if (nla_put_string(skb, TCA_KIND, tp->ops->kind))
 -              goto nla_put_failure;
 -      if (nla_put_u32(skb, TCA_CHAIN, tp->chain->index))
 -              goto nla_put_failure;
 -      tcm->tcm_handle = fh;
 -      if (RTM_DELTFILTER != event) {
 -              tcm->tcm_handle = 0;
 -              if (tp->ops->dump && tp->ops->dump(net, tp, fh, skb, tcm) < 0)
 -                      goto nla_put_failure;
 -      }
 -      nlh->nlmsg_len = skb_tail_pointer(skb) - b;
 -      return skb->len;
 -
 -out_nlmsg_trim:
 -nla_put_failure:
 -      nlmsg_trim(skb, b);
 -      return -1;
 -}
 -
 -static int tfilter_notify(struct net *net, struct sk_buff *oskb,
 -                        struct nlmsghdr *n, struct tcf_proto *tp,
 -                        unsigned long fh, int event, bool unicast)
 -{
 -      struct sk_buff *skb;
 -      u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
 -
 -      skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
 -      if (!skb)
 -              return -ENOBUFS;
 -
 -      if (tcf_fill_node(net, skb, tp, fh, portid, n->nlmsg_seq,
 -                        n->nlmsg_flags, event) <= 0) {
 -              kfree_skb(skb);
 -              return -EINVAL;
 -      }
 -
 -      if (unicast)
 -              return netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT);
 -
 -      return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
 -                            n->nlmsg_flags & NLM_F_ECHO);
 -}
 -
  struct tcf_dump_args {
        struct tcf_walker w;
        struct sk_buff *skb;
        struct netlink_callback *cb;
  };
  
 -static int tcf_node_dump(struct tcf_proto *tp, unsigned long n,
 -                       struct tcf_walker *arg)
 +static int tcf_node_dump(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
  {
        struct tcf_dump_args *a = (void *)arg;
        struct net *net = sock_net(a->skb->sk);
@@@ -820,17 -805,17 +828,17 @@@ static int tc_dump_tfilter(struct sk_bu
                goto out;
        cops = q->ops->cl_ops;
        if (!cops)
 -              goto errout;
 +              goto out;
        if (!cops->tcf_block)
 -              goto errout;
 +              goto out;
        if (TC_H_MIN(tcm->tcm_parent)) {
 -              cl = cops->get(q, tcm->tcm_parent);
 +              cl = cops->find(q, tcm->tcm_parent);
                if (cl == 0)
 -                      goto errout;
 +                      goto out;
        }
        block = cops->tcf_block(q, cl);
        if (!block)
 -              goto errout;
 +              goto out;
  
        index_start = cb->args[0];
        index = 0;
  
        cb->args[0] = index;
  
 -errout:
 -      if (cl)
 -              cops->put(q, cl);
  out:
        return skb->len;
  }
@@@ -903,12 -891,18 +911,12 @@@ int tcf_exts_validate(struct net *net, 
  }
  EXPORT_SYMBOL(tcf_exts_validate);
  
 -void tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst,
 -                   struct tcf_exts *src)
 +void tcf_exts_change(struct tcf_exts *dst, struct tcf_exts *src)
  {
  #ifdef CONFIG_NET_CLS_ACT
        struct tcf_exts old = *dst;
  
 -      tcf_tree_lock(tp);
 -      dst->nr_actions = src->nr_actions;
 -      dst->actions = src->actions;
 -      dst->type = src->type;
 -      tcf_tree_unlock(tp);
 -
 +      *dst = *src;
        tcf_exts_destroy(&old);
  #endif
  }
@@@ -929,7 -923,7 +937,7 @@@ int tcf_exts_dump(struct sk_buff *skb, 
  #ifdef CONFIG_NET_CLS_ACT
        struct nlattr *nest;
  
 -      if (exts->action && exts->nr_actions) {
 +      if (exts->action && tcf_exts_has_actions(exts)) {
                /*
                 * again for backward compatible mode - we want
                 * to work with both old and new modes of entering
@@@ -986,7 -980,7 +994,7 @@@ int tcf_exts_get_dev(struct net_device 
        const struct tc_action *a;
        LIST_HEAD(actions);
  
 -      if (tc_no_actions(exts))
 +      if (!tcf_exts_has_actions(exts))
                return -EINVAL;
  
        tcf_exts_to_list(exts, &actions);
@@@ -1005,10 -999,10 +1013,10 @@@ EXPORT_SYMBOL(tcf_exts_get_dev)
  
  static int __init tc_filter_init(void)
  {
 -      rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL, NULL);
 -      rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_ctl_tfilter, NULL, NULL);
 +      rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL, 0);
 +      rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_ctl_tfilter, NULL, 0);
        rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_ctl_tfilter,
 -                    tc_dump_tfilter, NULL);
 +                    tc_dump_tfilter, 0);
  
        return 0;
  }
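For illustration only (not part of this merge): tfilter_del_notify() above, like tclass_del_notify() in sch_api.c below, fills the notification skb before calling ->delete(), since after deletion the filter or class may already be gone. A hypothetical user-space analogue of that ordering:

#include <stdio.h>
#include <stdlib.h>

struct toy_filter { int handle; };

static int delete_with_notify(struct toy_filter **slot, char *msg, size_t len)
{
        struct toy_filter *f = *slot;

        if (!f)
                return -1;

        /* 1. serialise while the object is still valid */
        snprintf(msg, len, "DELFILTER handle=%d", f->handle);

        /* 2. actually delete it */
        free(f);
        *slot = NULL;

        /* 3. only now "send" the prebuilt notification */
        printf("notify: %s\n", msg);
        return 0;
}

int main(void)
{
        struct toy_filter *f = malloc(sizeof(*f));
        char msg[64];

        if (!f)
                return 1;
        f->handle = 42;
        return delete_with_notify(&f, msg, sizeof(msg)) ? 1 : 0;
}
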
diff --combined net/sched/sch_api.c
index 929b024f41ba02dcf3296e13f5442e1d787f1d82,4fb5a3222d0d324167f079f755be14eb028b4a50..c6deb74e3d2f4a007554b9cf78e4ddf7b7b84535
  #include <net/sock.h>
  #include <net/netlink.h>
  #include <net/pkt_sched.h>
 -
 -static int qdisc_notify(struct net *net, struct sk_buff *oskb,
 -                      struct nlmsghdr *n, u32 clid,
 -                      struct Qdisc *old, struct Qdisc *new);
 -static int tclass_notify(struct net *net, struct sk_buff *oskb,
 -                       struct nlmsghdr *n, struct Qdisc *q,
 -                       unsigned long cl, int event);
 +#include <net/pkt_cls.h>
  
  /*
  
@@@ -154,7 -160,7 +154,7 @@@ int register_qdisc(struct Qdisc_ops *qo
        if (qops->cl_ops) {
                const struct Qdisc_class_ops *cops = qops->cl_ops;
  
 -              if (!(cops->get && cops->put && cops->walk && cops->leaf))
 +              if (!(cops->find && cops->walk && cops->leaf))
                        goto out_einval;
  
                if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
@@@ -321,11 -327,12 +321,11 @@@ static struct Qdisc *qdisc_leaf(struct 
  
        if (cops == NULL)
                return NULL;
 -      cl = cops->get(p, classid);
 +      cl = cops->find(p, classid);
  
        if (cl == 0)
                return NULL;
        leaf = cops->leaf(p, cl);
 -      cops->put(p, cl);
        return leaf;
  }
  
@@@ -614,10 -621,14 +614,10 @@@ EXPORT_SYMBOL(qdisc_watchdog_cancel)
  
  static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
  {
 -      unsigned int size = n * sizeof(struct hlist_head), i;
        struct hlist_head *h;
 +      unsigned int i;
  
 -      if (size <= PAGE_SIZE)
 -              h = kmalloc(size, GFP_KERNEL);
 -      else
 -              h = (struct hlist_head *)
 -                      __get_free_pages(GFP_KERNEL, get_order(size));
 +      h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
  
        if (h != NULL) {
                for (i = 0; i < n; i++)
        return h;
  }
  
 -static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
 -{
 -      unsigned int size = n * sizeof(struct hlist_head);
 -
 -      if (size <= PAGE_SIZE)
 -              kfree(h);
 -      else
 -              free_pages((unsigned long)h, get_order(size));
 -}
 -
  void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
  {
        struct Qdisc_class_common *cl;
        clhash->hashmask = nmask;
        sch_tree_unlock(sch);
  
 -      qdisc_class_hash_free(ohash, osize);
 +      kvfree(ohash);
  }
  EXPORT_SYMBOL(qdisc_class_hash_grow);
  
@@@ -678,7 -699,7 +678,7 @@@ EXPORT_SYMBOL(qdisc_class_hash_init)
  
  void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
  {
 -      qdisc_class_hash_free(clhash->hash, clhash->hashsize);
 +      kvfree(clhash->hash);
  }
  EXPORT_SYMBOL(qdisc_class_hash_destroy);
  
@@@ -728,7 -749,6 +728,7 @@@ void qdisc_tree_reduce_backlog(struct Q
        const struct Qdisc_class_ops *cops;
        unsigned long cl;
        u32 parentid;
 +      bool notify;
        int drops;
  
        if (n == 0 && len == 0)
  
                if (sch->flags & TCQ_F_NOPARENT)
                        break;
 +              /* Notify parent qdisc only if child qdisc becomes empty.
 +               *
 +               * If child was empty even before update then backlog
 +               * counter is screwed and we skip notification because
 +               * parent class is already passive.
 +               */
 +              notify = !sch->q.qlen && !WARN_ON_ONCE(!n);
                /* TODO: perform the search on a per txq basis */
                sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
                if (sch == NULL) {
                        break;
                }
                cops = sch->ops->cl_ops;
 -              if (cops->qlen_notify) {
 -                      cl = cops->get(sch, parentid);
 +              if (notify && cops->qlen_notify) {
 +                      cl = cops->find(sch, parentid);
                        cops->qlen_notify(sch, cl);
 -                      cops->put(sch, cl);
                }
                sch->q.qlen -= n;
                sch->qstats.backlog -= len;
  }
  EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
  
 +static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
 +                       u32 portid, u32 seq, u16 flags, int event)
 +{
 +      struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
 +      struct gnet_stats_queue __percpu *cpu_qstats = NULL;
 +      struct tcmsg *tcm;
 +      struct nlmsghdr  *nlh;
 +      unsigned char *b = skb_tail_pointer(skb);
 +      struct gnet_dump d;
 +      struct qdisc_size_table *stab;
 +      __u32 qlen;
 +
 +      cond_resched();
 +      nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
 +      if (!nlh)
 +              goto out_nlmsg_trim;
 +      tcm = nlmsg_data(nlh);
 +      tcm->tcm_family = AF_UNSPEC;
 +      tcm->tcm__pad1 = 0;
 +      tcm->tcm__pad2 = 0;
 +      tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
 +      tcm->tcm_parent = clid;
 +      tcm->tcm_handle = q->handle;
 +      tcm->tcm_info = refcount_read(&q->refcnt);
 +      if (nla_put_string(skb, TCA_KIND, q->ops->id))
 +              goto nla_put_failure;
 +      if (q->ops->dump && q->ops->dump(q, skb) < 0)
 +              goto nla_put_failure;
 +      qlen = q->q.qlen;
 +
 +      stab = rtnl_dereference(q->stab);
 +      if (stab && qdisc_dump_stab(skb, stab) < 0)
 +              goto nla_put_failure;
 +
 +      if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
 +                                       NULL, &d, TCA_PAD) < 0)
 +              goto nla_put_failure;
 +
 +      if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
 +              goto nla_put_failure;
 +
 +      if (qdisc_is_percpu_stats(q)) {
 +              cpu_bstats = q->cpu_bstats;
 +              cpu_qstats = q->cpu_qstats;
 +      }
 +
 +      if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
 +                                &d, cpu_bstats, &q->bstats) < 0 ||
 +          gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
 +          gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
 +              goto nla_put_failure;
 +
 +      if (gnet_stats_finish_copy(&d) < 0)
 +              goto nla_put_failure;
 +
 +      nlh->nlmsg_len = skb_tail_pointer(skb) - b;
 +      return skb->len;
 +
 +out_nlmsg_trim:
 +nla_put_failure:
 +      nlmsg_trim(skb, b);
 +      return -1;
 +}
 +
 +static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
 +{
 +      if (q->flags & TCQ_F_BUILTIN)
 +              return true;
 +      if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
 +              return true;
 +
 +      return false;
 +}
 +
 +static int qdisc_notify(struct net *net, struct sk_buff *oskb,
 +                      struct nlmsghdr *n, u32 clid,
 +                      struct Qdisc *old, struct Qdisc *new)
 +{
 +      struct sk_buff *skb;
 +      u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
 +
 +      skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
 +      if (!skb)
 +              return -ENOBUFS;
 +
 +      if (old && !tc_qdisc_dump_ignore(old, false)) {
 +              if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
 +                                0, RTM_DELQDISC) < 0)
 +                      goto err_out;
 +      }
 +      if (new && !tc_qdisc_dump_ignore(new, false)) {
 +              if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
 +                                old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
 +                      goto err_out;
 +      }
 +
 +      if (skb->len)
 +              return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
 +                                    n->nlmsg_flags & NLM_F_ECHO);
 +
 +err_out:
 +      kfree_skb(skb);
 +      return -EINVAL;
 +}
 +
  static void notify_and_destroy(struct net *net, struct sk_buff *skb,
                               struct nlmsghdr *n, u32 clid,
                               struct Qdisc *old, struct Qdisc *new)
@@@ -927,7 -836,7 +927,7 @@@ static int qdisc_graft(struct net_devic
  
                        old = dev_graft_qdisc(dev_queue, new);
                        if (new && i > 0)
-                               refcount_inc(&new->refcnt);
+                               qdisc_refcount_inc(new);
  
                        if (!ingress)
                                qdisc_destroy(old);
@@@ -938,7 -847,7 +938,7 @@@ skip
                        notify_and_destroy(net, skb, n, classid,
                                           dev->qdisc, new);
                        if (new && !new->ops->attach)
-                               refcount_inc(&new->refcnt);
+                               qdisc_refcount_inc(new);
                        dev->qdisc = new ? : &noop_qdisc;
  
                        if (new && new->ops->attach)
  
                err = -EOPNOTSUPP;
                if (cops && cops->graft) {
 -                      unsigned long cl = cops->get(parent, classid);
 -                      if (cl) {
 +                      unsigned long cl = cops->find(parent, classid);
 +
 +                      if (cl)
                                err = cops->graft(parent, cl, new, &old);
 -                              cops->put(parent, cl);
 -                      } else
 +                      else
                                err = -ENOENT;
                }
                if (!err)
@@@ -1347,7 -1256,7 +1347,7 @@@ replay
                                if (q == p ||
                                    (p && check_loop(q, p, 0)))
                                        return -ELOOP;
-                               refcount_inc(&q->refcnt);
+                               qdisc_refcount_inc(q);
                                goto graft;
                        } else {
                                if (!q)
@@@ -1439,6 -1348,111 +1439,6 @@@ graft
        return 0;
  }
  
 -static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
 -                       u32 portid, u32 seq, u16 flags, int event)
 -{
 -      struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
 -      struct gnet_stats_queue __percpu *cpu_qstats = NULL;
 -      struct tcmsg *tcm;
 -      struct nlmsghdr  *nlh;
 -      unsigned char *b = skb_tail_pointer(skb);
 -      struct gnet_dump d;
 -      struct qdisc_size_table *stab;
 -      __u32 qlen;
 -
 -      cond_resched();
 -      nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
 -      if (!nlh)
 -              goto out_nlmsg_trim;
 -      tcm = nlmsg_data(nlh);
 -      tcm->tcm_family = AF_UNSPEC;
 -      tcm->tcm__pad1 = 0;
 -      tcm->tcm__pad2 = 0;
 -      tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
 -      tcm->tcm_parent = clid;
 -      tcm->tcm_handle = q->handle;
 -      tcm->tcm_info = refcount_read(&q->refcnt);
 -      if (nla_put_string(skb, TCA_KIND, q->ops->id))
 -              goto nla_put_failure;
 -      if (q->ops->dump && q->ops->dump(q, skb) < 0)
 -              goto nla_put_failure;
 -      qlen = q->q.qlen;
 -
 -      stab = rtnl_dereference(q->stab);
 -      if (stab && qdisc_dump_stab(skb, stab) < 0)
 -              goto nla_put_failure;
 -
 -      if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
 -                                       NULL, &d, TCA_PAD) < 0)
 -              goto nla_put_failure;
 -
 -      if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
 -              goto nla_put_failure;
 -
 -      if (qdisc_is_percpu_stats(q)) {
 -              cpu_bstats = q->cpu_bstats;
 -              cpu_qstats = q->cpu_qstats;
 -      }
 -
 -      if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
 -                                &d, cpu_bstats, &q->bstats) < 0 ||
 -          gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
 -          gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
 -              goto nla_put_failure;
 -
 -      if (gnet_stats_finish_copy(&d) < 0)
 -              goto nla_put_failure;
 -
 -      nlh->nlmsg_len = skb_tail_pointer(skb) - b;
 -      return skb->len;
 -
 -out_nlmsg_trim:
 -nla_put_failure:
 -      nlmsg_trim(skb, b);
 -      return -1;
 -}
 -
 -static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
 -{
 -      if (q->flags & TCQ_F_BUILTIN)
 -              return true;
 -      if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
 -              return true;
 -
 -      return false;
 -}
 -
 -static int qdisc_notify(struct net *net, struct sk_buff *oskb,
 -                      struct nlmsghdr *n, u32 clid,
 -                      struct Qdisc *old, struct Qdisc *new)
 -{
 -      struct sk_buff *skb;
 -      u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
 -
 -      skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
 -      if (!skb)
 -              return -ENOBUFS;
 -
 -      if (old && !tc_qdisc_dump_ignore(old, false)) {
 -              if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
 -                                0, RTM_DELQDISC) < 0)
 -                      goto err_out;
 -      }
 -      if (new && !tc_qdisc_dump_ignore(new, false)) {
 -              if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
 -                                old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
 -                      goto err_out;
 -      }
 -
 -      if (skb->len)
 -              return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
 -                                    n->nlmsg_flags & NLM_F_ECHO);
 -
 -err_out:
 -      kfree_skb(skb);
 -      return -EINVAL;
 -}
 -
  static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
                              struct netlink_callback *cb,
                              int *q_idx_p, int s_q_idx, bool recur,
   *    Traffic classes manipulation.           *
   ************************************************/
  
 +static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
 +                        unsigned long cl,
 +                        u32 portid, u32 seq, u16 flags, int event)
 +{
 +      struct tcmsg *tcm;
 +      struct nlmsghdr  *nlh;
 +      unsigned char *b = skb_tail_pointer(skb);
 +      struct gnet_dump d;
 +      const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
 +
 +      cond_resched();
 +      nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
 +      if (!nlh)
 +              goto out_nlmsg_trim;
 +      tcm = nlmsg_data(nlh);
 +      tcm->tcm_family = AF_UNSPEC;
 +      tcm->tcm__pad1 = 0;
 +      tcm->tcm__pad2 = 0;
 +      tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
 +      tcm->tcm_parent = q->handle;
 +      tcm->tcm_handle = q->handle;
 +      tcm->tcm_info = 0;
 +      if (nla_put_string(skb, TCA_KIND, q->ops->id))
 +              goto nla_put_failure;
 +      if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
 +              goto nla_put_failure;
 +
 +      if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
 +                                       NULL, &d, TCA_PAD) < 0)
 +              goto nla_put_failure;
 +
 +      if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
 +              goto nla_put_failure;
 +
 +      if (gnet_stats_finish_copy(&d) < 0)
 +              goto nla_put_failure;
 +
 +      nlh->nlmsg_len = skb_tail_pointer(skb) - b;
 +      return skb->len;
 +
 +out_nlmsg_trim:
 +nla_put_failure:
 +      nlmsg_trim(skb, b);
 +      return -1;
 +}
 +
 +static int tclass_notify(struct net *net, struct sk_buff *oskb,
 +                       struct nlmsghdr *n, struct Qdisc *q,
 +                       unsigned long cl, int event)
 +{
 +      struct sk_buff *skb;
 +      u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
 +
 +      skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
 +      if (!skb)
 +              return -ENOBUFS;
 +
 +      if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
 +              kfree_skb(skb);
 +              return -EINVAL;
 +      }
 +
 +      return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
 +                            n->nlmsg_flags & NLM_F_ECHO);
 +}
 +
 +static int tclass_del_notify(struct net *net,
 +                           const struct Qdisc_class_ops *cops,
 +                           struct sk_buff *oskb, struct nlmsghdr *n,
 +                           struct Qdisc *q, unsigned long cl)
 +{
 +      u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
 +      struct sk_buff *skb;
 +      int err = 0;
 +
 +      if (!cops->delete)
 +              return -EOPNOTSUPP;
 +
 +      skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
 +      if (!skb)
 +              return -ENOBUFS;
 +
 +      if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
 +                         RTM_DELTCLASS) < 0) {
 +              kfree_skb(skb);
 +              return -EINVAL;
 +      }
 +
 +      err = cops->delete(q, cl);
 +      if (err) {
 +              kfree_skb(skb);
 +              return err;
 +      }
 +
 +      return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
 +                            n->nlmsg_flags & NLM_F_ECHO);
 +}
 +
 +#ifdef CONFIG_NET_CLS
 +
 +struct tcf_bind_args {
 +      struct tcf_walker w;
 +      u32 classid;
 +      unsigned long cl;
 +};
 +
 +static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
 +{
 +      struct tcf_bind_args *a = (void *)arg;
 +
 +      if (tp->ops->bind_class) {
 +              tcf_tree_lock(tp);
 +              tp->ops->bind_class(n, a->classid, a->cl);
 +              tcf_tree_unlock(tp);
 +      }
 +      return 0;
 +}
 +
 +static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
 +                         unsigned long new_cl)
 +{
 +      const struct Qdisc_class_ops *cops = q->ops->cl_ops;
 +      struct tcf_block *block;
 +      struct tcf_chain *chain;
 +      unsigned long cl;
 +
 +      cl = cops->find(q, portid);
 +      if (!cl)
 +              return;
 +      block = cops->tcf_block(q, cl);
 +      if (!block)
 +              return;
 +      list_for_each_entry(chain, &block->chain_list, list) {
 +              struct tcf_proto *tp;
 +
 +              for (tp = rtnl_dereference(chain->filter_chain);
 +                   tp; tp = rtnl_dereference(tp->next)) {
 +                      struct tcf_bind_args arg = {};
 +
 +                      arg.w.fn = tcf_node_bind;
 +                      arg.classid = clid;
 +                      arg.cl = new_cl;
 +                      tp->ops->walk(tp, &arg.w);
 +              }
 +      }
 +}
 +
 +#else
  
 +static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
 +                         unsigned long new_cl)
 +{
 +}
 +
 +#endif
  
  static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
                         struct netlink_ext_ack *extack)
                clid = TC_H_MAKE(qid, clid);
  
        if (clid)
 -              cl = cops->get(q, clid);
 +              cl = cops->find(q, clid);
  
        if (cl == 0) {
                err = -ENOENT;
                                goto out;
                        break;
                case RTM_DELTCLASS:
 -                      err = -EOPNOTSUPP;
 -                      if (cops->delete)
 -                              err = cops->delete(q, cl);
 -                      if (err == 0)
 -                              tclass_notify(net, skb, n, q, cl,
 -                                            RTM_DELTCLASS);
 +                      err = tclass_del_notify(net, cops, skb, n, q, cl);
 +                      /* Unbind the class from its filters by binding them to 0 */
 +                      tc_bind_tclass(q, portid, clid, 0);
                        goto out;
                case RTM_GETTCLASS:
                        err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
        err = -EOPNOTSUPP;
        if (cops->change)
                err = cops->change(q, clid, portid, tca, &new_cl);
 -      if (err == 0)
 +      if (err == 0) {
                tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
 -
 +              /* We just created a new class, need to do the reverse binding. */
 +              if (cl != new_cl)
 +                      tc_bind_tclass(q, portid, clid, new_cl);
 +      }
  out:
 -      if (cl)
 -              cops->put(q, cl);
 -
        return err;
  }
  
 -
 -static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
 -                        unsigned long cl,
 -                        u32 portid, u32 seq, u16 flags, int event)
 -{
 -      struct tcmsg *tcm;
 -      struct nlmsghdr  *nlh;
 -      unsigned char *b = skb_tail_pointer(skb);
 -      struct gnet_dump d;
 -      const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
 -
 -      cond_resched();
 -      nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
 -      if (!nlh)
 -              goto out_nlmsg_trim;
 -      tcm = nlmsg_data(nlh);
 -      tcm->tcm_family = AF_UNSPEC;
 -      tcm->tcm__pad1 = 0;
 -      tcm->tcm__pad2 = 0;
 -      tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
 -      tcm->tcm_parent = q->handle;
 -      tcm->tcm_handle = q->handle;
 -      tcm->tcm_info = 0;
 -      if (nla_put_string(skb, TCA_KIND, q->ops->id))
 -              goto nla_put_failure;
 -      if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
 -              goto nla_put_failure;
 -
 -      if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
 -                                       NULL, &d, TCA_PAD) < 0)
 -              goto nla_put_failure;
 -
 -      if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
 -              goto nla_put_failure;
 -
 -      if (gnet_stats_finish_copy(&d) < 0)
 -              goto nla_put_failure;
 -
 -      nlh->nlmsg_len = skb_tail_pointer(skb) - b;
 -      return skb->len;
 -
 -out_nlmsg_trim:
 -nla_put_failure:
 -      nlmsg_trim(skb, b);
 -      return -1;
 -}
 -
 -static int tclass_notify(struct net *net, struct sk_buff *oskb,
 -                       struct nlmsghdr *n, struct Qdisc *q,
 -                       unsigned long cl, int event)
 -{
 -      struct sk_buff *skb;
 -      u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
 -
 -      skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
 -      if (!skb)
 -              return -ENOBUFS;
 -
 -      if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
 -              kfree_skb(skb);
 -              return -EINVAL;
 -      }
 -
 -      return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
 -                            n->nlmsg_flags & NLM_F_ECHO);
 -}
 -
  struct qdisc_dump_args {
        struct qdisc_walker     w;
        struct sk_buff          *skb;
@@@ -2019,14 -1949,14 +2019,14 @@@ static int __init pktsched_init(void
        register_qdisc(&mq_qdisc_ops);
        register_qdisc(&noqueue_qdisc_ops);
  
 -      rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
 -      rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
 +      rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
 +      rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
        rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
 -                    NULL);
 -      rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL);
 -      rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL);
 +                    0);
 +      rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
 +      rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
        rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
 -                    NULL);
 +                    0);
  
        return 0;
  }
diff --combined net/sched/sch_cbq.c
index 3ec8bec109bbee2014c8b2204906bd123c12fd58,156c8a33c6777a644c77b1adec9057b482bac109..dcef97fa804739df3ae3bc837e1938f3914d6d4f
@@@ -129,6 -129,7 +129,6 @@@ struct cbq_class 
        struct tcf_proto __rcu  *filter_list;
        struct tcf_block        *block;
  
 -      int                     refcnt;
        int                     filters;
  
        struct cbq_class        *defaults[TC_PRIO_MAX + 1];
@@@ -1138,6 -1139,13 +1138,13 @@@ static int cbq_init(struct Qdisc *sch, 
        struct tc_ratespec *r;
        int err;
  
+       qdisc_watchdog_init(&q->watchdog, sch);
+       hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+       q->delay_timer.function = cbq_undelay;
+       if (!opt)
+               return -EINVAL;
        err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy, NULL);
        if (err < 0)
                return err;
        if (err < 0)
                goto put_rtab;
  
 -      q->link.refcnt = 1;
        q->link.sibling = &q->link;
        q->link.common.classid = sch->handle;
        q->link.qdisc = sch;
        q->link.avpkt = q->link.allot/2;
        q->link.minidle = -0x7FFFFFFF;
  
-       qdisc_watchdog_init(&q->watchdog, sch);
-       hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
-       q->delay_timer.function = cbq_undelay;
        q->toplevel = TC_CBQ_MAXLEVEL;
        q->now = psched_get_time();
  
@@@ -1383,14 -1389,20 +1387,14 @@@ static void cbq_qlen_notify(struct Qdis
  {
        struct cbq_class *cl = (struct cbq_class *)arg;
  
 -      if (cl->q->q.qlen == 0)
 -              cbq_deactivate_class(cl);
 +      cbq_deactivate_class(cl);
  }
  
 -static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
 +static unsigned long cbq_find(struct Qdisc *sch, u32 classid)
  {
        struct cbq_sched_data *q = qdisc_priv(sch);
 -      struct cbq_class *cl = cbq_class_lookup(q, classid);
  
 -      if (cl) {
 -              cl->refcnt++;
 -              return (unsigned long)cl;
 -      }
 -      return 0;
 +      return (unsigned long)cbq_class_lookup(q, classid);
  }
  
  static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
@@@ -1436,6 -1448,25 +1440,6 @@@ static void cbq_destroy(struct Qdisc *s
        qdisc_class_hash_destroy(&q->clhash);
  }
  
 -static void cbq_put(struct Qdisc *sch, unsigned long arg)
 -{
 -      struct cbq_class *cl = (struct cbq_class *)arg;
 -
 -      if (--cl->refcnt == 0) {
 -#ifdef CONFIG_NET_CLS_ACT
 -              spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);
 -              struct cbq_sched_data *q = qdisc_priv(sch);
 -
 -              spin_lock_bh(root_lock);
 -              if (q->rx_class == cl)
 -                      q->rx_class = NULL;
 -              spin_unlock_bh(root_lock);
 -#endif
 -
 -              cbq_destroy_class(sch, cl);
 -      }
 -}
 -
  static int
  cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
                 unsigned long *arg)
  
        cl->R_tab = rtab;
        rtab = NULL;
 -      cl->refcnt = 1;
        cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid);
        if (!cl->q)
                cl->q = &noop_qdisc;
@@@ -1662,7 -1694,12 +1666,7 @@@ static int cbq_delete(struct Qdisc *sch
        cbq_rmprio(q, cl);
        sch_tree_unlock(sch);
  
 -      BUG_ON(--cl->refcnt == 0);
 -      /*
 -       * This shouldn't happen: we "hold" one cops->get() when called
 -       * from tc_ctl_tclass; the destroy method is done from cops->put().
 -       */
 -
 +      cbq_destroy_class(sch, cl);
        return 0;
  }
  
@@@ -1728,7 -1765,8 +1732,7 @@@ static const struct Qdisc_class_ops cbq
        .graft          =       cbq_graft,
        .leaf           =       cbq_leaf,
        .qlen_notify    =       cbq_qlen_notify,
 -      .get            =       cbq_get,
 -      .put            =       cbq_put,
 +      .find           =       cbq_find,
        .change         =       cbq_change_class,
        .delete         =       cbq_delete,
        .walk           =       cbq_walk,
diff --combined net/sched/sch_fq_codel.c
index 7699b50688cd6f2eec4d86d15f3559876e522f49,2c0c05f2cc34a9de51390c45f29dd8db810075c7..de3b57ceca7bd625c874fbece917c944aabc26d8
@@@ -491,10 -491,8 +491,8 @@@ static int fq_codel_init(struct Qdisc *
                if (!q->flows)
                        return -ENOMEM;
                q->backlogs = kvzalloc(q->flows_cnt * sizeof(u32), GFP_KERNEL);
-               if (!q->backlogs) {
-                       kvfree(q->flows);
+               if (!q->backlogs)
                        return -ENOMEM;
-               }
                for (i = 0; i < q->flows_cnt; i++) {
                        struct fq_codel_flow *flow = q->flows + i;
  
@@@ -579,7 -577,7 +577,7 @@@ static struct Qdisc *fq_codel_leaf(stru
        return NULL;
  }
  
 -static unsigned long fq_codel_get(struct Qdisc *sch, u32 classid)
 +static unsigned long fq_codel_find(struct Qdisc *sch, u32 classid)
  {
        return 0;
  }
@@@ -592,7 -590,7 +590,7 @@@ static unsigned long fq_codel_bind(stru
        return 0;
  }
  
 -static void fq_codel_put(struct Qdisc *q, unsigned long cl)
 +static void fq_codel_unbind(struct Qdisc *q, unsigned long cl)
  {
  }
  
@@@ -683,10 -681,11 +681,10 @@@ static void fq_codel_walk(struct Qdisc 
  
  static const struct Qdisc_class_ops fq_codel_class_ops = {
        .leaf           =       fq_codel_leaf,
 -      .get            =       fq_codel_get,
 -      .put            =       fq_codel_put,
 +      .find           =       fq_codel_find,
        .tcf_block      =       fq_codel_tcf_block,
        .bind_tcf       =       fq_codel_bind,
 -      .unbind_tcf     =       fq_codel_put,
 +      .unbind_tcf     =       fq_codel_unbind,
        .dump           =       fq_codel_dump_class,
        .dump_stats     =       fq_codel_dump_class_stats,
        .walk           =       fq_codel_walk,
diff --combined net/sched/sch_generic.c
index c6b89a34e8d2e8eecc7ad6d21950bd84335443a0,4ba6da5fb2546c35ad48fe1f3632df8ca9957b34..92237e75dbbc5e3e7dab124fb67baca98ae2160f
@@@ -29,7 -29,6 +29,7 @@@
  #include <net/sch_generic.h>
  #include <net/pkt_sched.h>
  #include <net/dst.h>
 +#include <trace/events/qdisc.h>
  
  /* Qdisc to use by default */
  const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops;
@@@ -127,7 -126,7 +127,7 @@@ static struct sk_buff *dequeue_skb(stru
                        q->q.qlen--;
                } else
                        skb = NULL;
 -              return skb;
 +              goto trace;
        }
        *validate = true;
        skb = q->skb_bad_txq;
                        q->q.qlen--;
                        goto bulk;
                }
 -              return NULL;
 +              skb = NULL;
 +              goto trace;
        }
        if (!(q->flags & TCQ_F_ONETXQUEUE) ||
            !netif_xmit_frozen_or_stopped(txq))
@@@ -153,8 -151,6 +153,8 @@@ bulk
                else
                        try_bulk_dequeue_skb_slow(q, skb, packets);
        }
 +trace:
 +      trace_qdisc_dequeue(q, txq, *packets, skb);
        return skb;
  }
  
@@@ -789,7 -785,7 +789,7 @@@ static void attach_default_qdiscs(struc
            dev->priv_flags & IFF_NO_QUEUE) {
                netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
                dev->qdisc = txq->qdisc_sleeping;
-               refcount_inc(&dev->qdisc->refcnt);
+               qdisc_refcount_inc(dev->qdisc);
        } else {
                qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT);
                if (qdisc) {
diff --combined net/sched/sch_hfsc.c
index 7c7820d0fdc7afbf5e38c6845b09de88515d7551,11ab8dace901534b23b8f376ac704f995dc6b66b..daaf214e5201919ca3681e1670ac1389cb7985a4
@@@ -110,6 -110,7 +110,6 @@@ enum hfsc_class_flags 
  
  struct hfsc_class {
        struct Qdisc_class_common cl_common;
 -      unsigned int    refcnt;         /* usage count */
  
        struct gnet_stats_basic_packed bstats;
        struct gnet_stats_queue qstats;
@@@ -828,6 -829,28 +828,6 @@@ update_vf(struct hfsc_class *cl, unsign
        }
  }
  
 -static void
 -set_active(struct hfsc_class *cl, unsigned int len)
 -{
 -      if (cl->cl_flags & HFSC_RSC)
 -              init_ed(cl, len);
 -      if (cl->cl_flags & HFSC_FSC)
 -              init_vf(cl, len);
 -
 -}
 -
 -static void
 -set_passive(struct hfsc_class *cl)
 -{
 -      if (cl->cl_flags & HFSC_RSC)
 -              eltree_remove(cl);
 -
 -      /*
 -       * vttree is now handled in update_vf() so that update_vf(cl, 0, 0)
 -       * needs to be called explicitly to remove a class from vttree.
 -       */
 -}
 -
  static unsigned int
  qdisc_peek_len(struct Qdisc *sch)
  {
@@@ -1044,6 -1067,7 +1044,6 @@@ hfsc_change_class(struct Qdisc *sch, u3
                hfsc_change_usc(cl, usc, 0);
  
        cl->cl_common.classid = classid;
 -      cl->refcnt    = 1;
        cl->sched     = q;
        cl->cl_parent = parent;
        cl->qdisc = qdisc_create_dflt(sch->dev_queue,
@@@ -1099,9 -1123,13 +1099,9 @@@ hfsc_delete_class(struct Qdisc *sch, un
        hfsc_purge_queue(sch, cl);
        qdisc_class_hash_remove(&q->clhash, &cl->cl_common);
  
 -      BUG_ON(--cl->refcnt == 0);
 -      /*
 -       * This shouldn't happen: we "hold" one cops->get() when called
 -       * from tc_ctl_tclass; the destroy method is done from cops->put().
 -       */
 -
        sch_tree_unlock(sch);
 +
 +      hfsc_destroy_class(sch, cl);
        return 0;
  }
  
@@@ -1193,18 -1221,30 +1193,18 @@@ hfsc_qlen_notify(struct Qdisc *sch, uns
  {
        struct hfsc_class *cl = (struct hfsc_class *)arg;
  
 -      if (cl->qdisc->q.qlen == 0) {
 -              update_vf(cl, 0, 0);
 -              set_passive(cl);
 -      }
 +      /* vttree is now handled in update_vf() so that update_vf(cl, 0, 0)
 +       * needs to be called explicitly to remove a class from vttree.
 +       */
 +      update_vf(cl, 0, 0);
 +      if (cl->cl_flags & HFSC_RSC)
 +              eltree_remove(cl);
  }
  
  static unsigned long
 -hfsc_get_class(struct Qdisc *sch, u32 classid)
 -{
 -      struct hfsc_class *cl = hfsc_find_class(classid, sch);
 -
 -      if (cl != NULL)
 -              cl->refcnt++;
 -
 -      return (unsigned long)cl;
 -}
 -
 -static void
 -hfsc_put_class(struct Qdisc *sch, unsigned long arg)
 +hfsc_search_class(struct Qdisc *sch, u32 classid)
  {
 -      struct hfsc_class *cl = (struct hfsc_class *)arg;
 -
 -      if (--cl->refcnt == 0)
 -              hfsc_destroy_class(sch, cl);
 +      return (unsigned long)hfsc_find_class(classid, sch);
  }
  
  static unsigned long
@@@ -1378,6 -1418,8 +1378,8 @@@ hfsc_init_qdisc(struct Qdisc *sch, stru
        struct tc_hfsc_qopt *qopt;
        int err;
  
+       qdisc_watchdog_init(&q->watchdog, sch);
        if (opt == NULL || nla_len(opt) < sizeof(*qopt))
                return -EINVAL;
        qopt = nla_data(opt);
  
        err = tcf_block_get(&q->root.block, &q->root.filter_list);
        if (err)
-               goto err_tcf;
+               return err;
  
        q->root.cl_common.classid = sch->handle;
 -      q->root.refcnt  = 1;
        q->root.sched   = q;
        q->root.qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
                                          sch->handle);
        qdisc_class_hash_insert(&q->clhash, &q->root.cl_common);
        qdisc_class_hash_grow(sch, &q->clhash);
  
-       qdisc_watchdog_init(&q->watchdog, sch);
        return 0;
- err_tcf:
-       qdisc_class_hash_destroy(&q->clhash);
-       return err;
  }
  
  static int
@@@ -1544,12 -1581,7 +1540,12 @@@ hfsc_enqueue(struct sk_buff *skb, struc
        }
  
        if (cl->qdisc->q.qlen == 1) {
 -              set_active(cl, qdisc_pkt_len(skb));
 +              unsigned int len = qdisc_pkt_len(skb);
 +
 +              if (cl->cl_flags & HFSC_RSC)
 +                      init_ed(cl, len);
 +              if (cl->cl_flags & HFSC_FSC)
 +                      init_vf(cl, len);
                /*
                 * If this is the first packet, isolate the head so an eventual
                 * head drop before the first dequeue operation has no chance
@@@ -1613,18 -1645,18 +1609,18 @@@ hfsc_dequeue(struct Qdisc *sch
        if (realtime)
                cl->cl_cumul += qdisc_pkt_len(skb);
  
 -      if (cl->qdisc->q.qlen != 0) {
 -              if (cl->cl_flags & HFSC_RSC) {
 +      if (cl->cl_flags & HFSC_RSC) {
 +              if (cl->qdisc->q.qlen != 0) {
                        /* update ed */
                        next_len = qdisc_peek_len(cl->qdisc);
                        if (realtime)
                                update_ed(cl, next_len);
                        else
                                update_d(cl, next_len);
 +              } else {
 +                      /* the class becomes passive */
 +                      eltree_remove(cl);
                }
 -      } else {
 -              /* the class becomes passive */
 -              set_passive(cl);
        }
  
        qdisc_bstats_update(sch, skb);
@@@ -1640,7 -1672,8 +1636,7 @@@ static const struct Qdisc_class_ops hfs
        .graft          = hfsc_graft_class,
        .leaf           = hfsc_class_leaf,
        .qlen_notify    = hfsc_qlen_notify,
 -      .get            = hfsc_get_class,
 -      .put            = hfsc_put_class,
 +      .find           = hfsc_search_class,
        .bind_tcf       = hfsc_bind_tcf,
        .unbind_tcf     = hfsc_unbind_tcf,
        .tcf_block      = hfsc_tcf_block,
diff --combined net/sched/sch_htb.c
index f955b59d3c7c4b3a1a765e0c1f2ec669c0bca247,5bf5177b2bd3f6aa1b0ba9e4e59a946e1c739e0a..7e148376ba528efabe5a53a09653f9161c264be7
@@@ -107,6 -107,7 +107,6 @@@ struct htb_class 
        struct tcf_proto __rcu  *filter_list;   /* class attached filters */
        struct tcf_block        *block;
        int                     filter_cnt;
 -      int                     refcnt;         /* usage count of this class */
  
        int                     level;          /* our level (see above) */
        unsigned int            children;
@@@ -192,10 -193,6 +192,10 @@@ static inline struct htb_class *htb_fin
        return container_of(clc, struct htb_class, common);
  }
  
 +static unsigned long htb_search(struct Qdisc *sch, u32 handle)
 +{
 +      return (unsigned long)htb_find(handle, sch);
 +}
  /**
   * htb_classify - classify a packet into class
   *
@@@ -1020,6 -1017,9 +1020,9 @@@ static int htb_init(struct Qdisc *sch, 
        int err;
        int i;
  
+       qdisc_watchdog_init(&q->watchdog, sch);
+       INIT_WORK(&q->work, htb_work_func);
        if (!opt)
                return -EINVAL;
  
        for (i = 0; i < TC_HTB_NUMPRIO; i++)
                INIT_LIST_HEAD(q->drops + i);
  
-       qdisc_watchdog_init(&q->watchdog, sch);
-       INIT_WORK(&q->work, htb_work_func);
        qdisc_skb_head_init(&q->direct_queue);
  
        if (tb[TCA_HTB_DIRECT_QLEN])
@@@ -1189,7 -1187,16 +1190,7 @@@ static void htb_qlen_notify(struct Qdis
  {
        struct htb_class *cl = (struct htb_class *)arg;
  
 -      if (cl->un.leaf.q->q.qlen == 0)
 -              htb_deactivate(qdisc_priv(sch), cl);
 -}
 -
 -static unsigned long htb_get(struct Qdisc *sch, u32 classid)
 -{
 -      struct htb_class *cl = htb_find(classid, sch);
 -      if (cl)
 -              cl->refcnt++;
 -      return (unsigned long)cl;
 +      htb_deactivate(qdisc_priv(sch), cl);
  }
  
  static inline int htb_parent_last_child(struct htb_class *cl)
@@@ -1311,10 -1318,22 +1312,10 @@@ static int htb_delete(struct Qdisc *sch
        if (last_child)
                htb_parent_to_leaf(q, cl, new_q);
  
 -      BUG_ON(--cl->refcnt == 0);
 -      /*
 -       * This shouldn't happen: we "hold" one cops->get() when called
 -       * from tc_ctl_tclass; the destroy method is done from cops->put().
 -       */
 -
        sch_tree_unlock(sch);
 -      return 0;
 -}
  
 -static void htb_put(struct Qdisc *sch, unsigned long arg)
 -{
 -      struct htb_class *cl = (struct htb_class *)arg;
 -
 -      if (--cl->refcnt == 0)
 -              htb_destroy_class(sch, cl);
 +      htb_destroy_class(sch, cl);
 +      return 0;
  }
  
  static int htb_change_class(struct Qdisc *sch, u32 classid,
                        }
                }
  
 -              cl->refcnt = 1;
                cl->children = 0;
                INIT_LIST_HEAD(&cl->un.leaf.drop_list);
                RB_CLEAR_NODE(&cl->pq_node);
@@@ -1580,7 -1600,8 +1581,7 @@@ static const struct Qdisc_class_ops htb
        .graft          =       htb_graft,
        .leaf           =       htb_leaf,
        .qlen_notify    =       htb_qlen_notify,
 -      .get            =       htb_get,
 -      .put            =       htb_put,
 +      .find           =       htb_search,
        .change         =       htb_change_class,
        .delete         =       htb_delete,
        .walk           =       htb_walk,
diff --combined net/sched/sch_multiq.c
index a5df979b624811e78ce07f228832face6f2c13b8,9c454f5d6c38820512485cceecbc06c9fa86f634..ff4fc3e0facd7d98b504b5315d7cfe7d0ffdf68a
@@@ -257,12 -257,7 +257,7 @@@ static int multiq_init(struct Qdisc *sc
        for (i = 0; i < q->max_bands; i++)
                q->queues[i] = &noop_qdisc;
  
-       err = multiq_tune(sch, opt);
-       if (err)
-               kfree(q->queues);
-       return err;
+       return multiq_tune(sch, opt);
  }
  
  static int multiq_dump(struct Qdisc *sch, struct sk_buff *skb)
@@@ -306,7 -301,7 +301,7 @@@ multiq_leaf(struct Qdisc *sch, unsigne
        return q->queues[band];
  }
  
 -static unsigned long multiq_get(struct Qdisc *sch, u32 classid)
 +static unsigned long multiq_find(struct Qdisc *sch, u32 classid)
  {
        struct multiq_sched_data *q = qdisc_priv(sch);
        unsigned long band = TC_H_MIN(classid);
  static unsigned long multiq_bind(struct Qdisc *sch, unsigned long parent,
                                 u32 classid)
  {
 -      return multiq_get(sch, classid);
 +      return multiq_find(sch, classid);
  }
  
  
 -static void multiq_put(struct Qdisc *q, unsigned long cl)
 +static void multiq_unbind(struct Qdisc *q, unsigned long cl)
  {
  }
  
@@@ -385,11 -380,12 +380,11 @@@ static struct tcf_block *multiq_tcf_blo
  static const struct Qdisc_class_ops multiq_class_ops = {
        .graft          =       multiq_graft,
        .leaf           =       multiq_leaf,
 -      .get            =       multiq_get,
 -      .put            =       multiq_put,
 +      .find           =       multiq_find,
        .walk           =       multiq_walk,
        .tcf_block      =       multiq_tcf_block,
        .bind_tcf       =       multiq_bind,
 -      .unbind_tcf     =       multiq_put,
 +      .unbind_tcf     =       multiq_unbind,
        .dump           =       multiq_dump_class,
        .dump_stats     =       multiq_dump_class_stats,
  };
diff --combined net/sched/sch_netem.c
index cf5aad0aabfcc5ed7ea28402e25b3ee1a5c027b5,14d1724e0dc436f49da643be8606be273ce22ebd..b1266e75ca43cf5a66b951ecabccfc5b24069444
@@@ -933,11 -933,11 +933,11 @@@ static int netem_init(struct Qdisc *sch
        struct netem_sched_data *q = qdisc_priv(sch);
        int ret;
  
+       qdisc_watchdog_init(&q->watchdog, sch);
        if (!opt)
                return -EINVAL;
  
-       qdisc_watchdog_init(&q->watchdog, sch);
        q->loss_model = CLG_RANDOM;
        ret = netem_change(sch, opt);
        if (ret)
@@@ -1096,11 -1096,15 +1096,11 @@@ static struct Qdisc *netem_leaf(struct 
        return q->qdisc;
  }
  
 -static unsigned long netem_get(struct Qdisc *sch, u32 classid)
 +static unsigned long netem_find(struct Qdisc *sch, u32 classid)
  {
        return 1;
  }
  
 -static void netem_put(struct Qdisc *sch, unsigned long arg)
 -{
 -}
 -
  static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
  {
        if (!walker->stop) {
  static const struct Qdisc_class_ops netem_class_ops = {
        .graft          =       netem_graft,
        .leaf           =       netem_leaf,
 -      .get            =       netem_get,
 -      .put            =       netem_put,
 +      .find           =       netem_find,
        .walk           =       netem_walk,
        .dump           =       netem_dump_class,
  };
diff --combined net/sched/sch_sfq.c
index e0f029a887ac5dec95706624419c0a1354576baf,fc69fc5956e9d4d2dbfe645e4c25e83328517371..74ea863b824009acb94b956facc889dd80970edf
@@@ -292,7 -292,7 +292,7 @@@ static inline void slot_queue_add(struc
        slot->skblist_prev = skb;
  }
  
 -static unsigned int sfq_drop(struct Qdisc *sch)
 +static unsigned int sfq_drop(struct Qdisc *sch, struct sk_buff **to_free)
  {
        struct sfq_sched_data *q = qdisc_priv(sch);
        sfq_index x, d = q->cur_depth;
@@@ -310,8 -310,9 +310,8 @@@ drop
                slot->backlog -= len;
                sfq_dec(q, x);
                sch->q.qlen--;
 -              qdisc_qstats_drop(sch);
                qdisc_qstats_backlog_dec(sch, skb);
 -              kfree_skb(skb);
 +              qdisc_drop(skb, sch, to_free);
                return len;
        }
  
@@@ -359,7 -360,7 +359,7 @@@ sfq_enqueue(struct sk_buff *skb, struc
        if (hash == 0) {
                if (ret & __NET_XMIT_BYPASS)
                        qdisc_qstats_drop(sch);
 -              kfree_skb(skb);
 +              __qdisc_drop(skb, to_free);
                return ret;
        }
        hash--;
@@@ -464,7 -465,7 +464,7 @@@ enqueue
                return NET_XMIT_SUCCESS;
  
        qlen = slot->qlen;
 -      dropped = sfq_drop(sch);
 +      dropped = sfq_drop(sch, to_free);
        /* Return Congestion Notification only if we dropped a packet
         * from this flow.
         */
@@@ -627,8 -628,6 +627,8 @@@ static int sfq_change(struct Qdisc *sch
        struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
        unsigned int qlen, dropped = 0;
        struct red_parms *p = NULL;
 +      struct sk_buff *to_free = NULL;
 +      struct sk_buff *tail = NULL;
  
        if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
                return -EINVAL;
        }
  
        qlen = sch->q.qlen;
 -      while (sch->q.qlen > q->limit)
 -              dropped += sfq_drop(sch);
 +      while (sch->q.qlen > q->limit) {
 +              dropped += sfq_drop(sch, &to_free);
 +              if (!tail)
 +                      tail = to_free;
 +      }
 +
 +      rtnl_kfree_skbs(to_free, tail);
        qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);
  
        del_timer(&q->perturb_timer);
@@@ -722,13 -716,13 +722,13 @@@ static int sfq_init(struct Qdisc *sch, 
        int i;
        int err;
  
+       setup_deferrable_timer(&q->perturb_timer, sfq_perturbation,
+                              (unsigned long)sch);
        err = tcf_block_get(&q->block, &q->filter_list);
        if (err)
                return err;
  
-       setup_deferrable_timer(&q->perturb_timer, sfq_perturbation,
-                              (unsigned long)sch);
        for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
                q->dep[i].next = i + SFQ_MAX_FLOWS;
                q->dep[i].prev = i + SFQ_MAX_FLOWS;
@@@ -814,7 -808,7 +814,7 @@@ static struct Qdisc *sfq_leaf(struct Qd
        return NULL;
  }
  
 -static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
 +static unsigned long sfq_find(struct Qdisc *sch, u32 classid)
  {
        return 0;
  }
@@@ -827,7 -821,7 +827,7 @@@ static unsigned long sfq_bind(struct Qd
        return 0;
  }
  
 -static void sfq_put(struct Qdisc *q, unsigned long cl)
 +static void sfq_unbind(struct Qdisc *q, unsigned long cl)
  {
  }
  
@@@ -891,10 -885,11 +891,10 @@@ static void sfq_walk(struct Qdisc *sch
  
  static const struct Qdisc_class_ops sfq_class_ops = {
        .leaf           =       sfq_leaf,
 -      .get            =       sfq_get,
 -      .put            =       sfq_put,
 +      .find           =       sfq_find,
        .tcf_block      =       sfq_tcf_block,
        .bind_tcf       =       sfq_bind,
 -      .unbind_tcf     =       sfq_put,
 +      .unbind_tcf     =       sfq_unbind,
        .dump           =       sfq_dump_class,
        .dump_stats     =       sfq_dump_class_stats,
        .walk           =       sfq_walk,
diff --combined net/sched/sch_tbf.c
index d5dba972ab06842da7a72cf0359dbed5af4dc3ab,493270f0d5b055fa07d4dee2b35ec9d40bddc3d0..120f4f36596786746b89a2832c125e1814d6fd9b
@@@ -425,12 -425,13 +425,13 @@@ static int tbf_init(struct Qdisc *sch, 
  {
        struct tbf_sched_data *q = qdisc_priv(sch);
  
+       qdisc_watchdog_init(&q->watchdog, sch);
+       q->qdisc = &noop_qdisc;
        if (opt == NULL)
                return -EINVAL;
  
        q->t_c = ktime_get_ns();
-       qdisc_watchdog_init(&q->watchdog, sch);
-       q->qdisc = &noop_qdisc;
  
        return tbf_change(sch, opt);
  }
@@@ -510,11 -511,15 +511,11 @@@ static struct Qdisc *tbf_leaf(struct Qd
        return q->qdisc;
  }
  
 -static unsigned long tbf_get(struct Qdisc *sch, u32 classid)
 +static unsigned long tbf_find(struct Qdisc *sch, u32 classid)
  {
        return 1;
  }
  
 -static void tbf_put(struct Qdisc *sch, unsigned long arg)
 -{
 -}
 -
  static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
  {
        if (!walker->stop) {
  static const struct Qdisc_class_ops tbf_class_ops = {
        .graft          =       tbf_graft,
        .leaf           =       tbf_leaf,
 -      .get            =       tbf_get,
 -      .put            =       tbf_put,
 +      .find           =       tbf_find,
        .walk           =       tbf_walk,
        .dump           =       tbf_dump_class,
  };
diff --combined net/sctp/socket.c
index c01af72cc603c794204db4b63dff001ae15360ba,8d760863bc411023835b20383620f38d14ee2df1..1b00a1e09b93e4106a38b4f6d45df7175e27598b
@@@ -100,9 -100,8 +100,9 @@@ static int sctp_send_asconf(struct sctp
                            struct sctp_chunk *chunk);
  static int sctp_do_bind(struct sock *, union sctp_addr *, int);
  static int sctp_autobind(struct sock *sk);
 -static void sctp_sock_migrate(struct sock *, struct sock *,
 -                            struct sctp_association *, sctp_socket_type_t);
 +static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
 +                            struct sctp_association *assoc,
 +                            enum sctp_socket_type type);
  
  static unsigned long sctp_memory_pressure;
  static atomic_long_t sctp_memory_allocated;
@@@ -1056,7 -1055,7 +1056,7 @@@ static int __sctp_connect(struct sock *
        struct sctp_association *asoc2;
        struct sctp_transport *transport;
        union sctp_addr to;
 -      sctp_scope_t scope;
 +      enum sctp_scope scope;
        long timeo;
        int err = 0;
        int addrcnt = 0;
@@@ -1594,8 -1593,7 +1594,8 @@@ static int sctp_error(struct sock *sk, 
   */
  /* BUG:  We do not implement the equivalent of sk_stream_wait_memory(). */
  
 -static int sctp_msghdr_parse(const struct msghdr *, sctp_cmsgs_t *);
 +static int sctp_msghdr_parse(const struct msghdr *msg,
 +                           struct sctp_cmsgs *cmsgs);
  
  static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
  {
        struct sctp_sndrcvinfo *sinfo;
        struct sctp_initmsg *sinit;
        sctp_assoc_t associd = 0;
 -      sctp_cmsgs_t cmsgs = { NULL };
 -      sctp_scope_t scope;
 +      struct sctp_cmsgs cmsgs = { NULL };
 +      enum sctp_scope scope;
        bool fill_sinfo_ttl = false, wait_connect = false;
        struct sctp_datamsg *datamsg;
        int msg_flags = msg->msg_flags;
@@@ -4540,8 -4538,7 +4540,7 @@@ int sctp_get_sctp_info(struct sock *sk
        info->sctpi_ictrlchunks = asoc->stats.ictrlchunks;
  
        prim = asoc->peer.primary_path;
-       memcpy(&info->sctpi_p_address, &prim->ipaddr,
-              sizeof(struct sockaddr_storage));
+       memcpy(&info->sctpi_p_address, &prim->ipaddr, sizeof(prim->ipaddr));
        info->sctpi_p_state = prim->state;
        info->sctpi_p_cwnd = prim->cwnd;
        info->sctpi_p_srtt = prim->srtt;
@@@ -7447,10 -7444,10 +7446,10 @@@ static int sctp_autobind(struct sock *s
   * msg_control
   * points here
   */
 -static int sctp_msghdr_parse(const struct msghdr *msg, sctp_cmsgs_t *cmsgs)
 +static int sctp_msghdr_parse(const struct msghdr *msg, struct sctp_cmsgs *cmsgs)
  {
 -      struct cmsghdr *cmsg;
        struct msghdr *my_msg = (struct msghdr *)msg;
 +      struct cmsghdr *cmsg;
  
        for_each_cmsghdr(cmsg, my_msg) {
                if (!CMSG_OK(my_msg, cmsg))
@@@ -8087,7 -8084,7 +8086,7 @@@ static inline void sctp_copy_descendant
   */
  static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
                              struct sctp_association *assoc,
 -                            sctp_socket_type_t type)
 +                            enum sctp_socket_type type)
  {
        struct sctp_sock *oldsp = sctp_sk(oldsk);
        struct sctp_sock *newsp = sctp_sk(newsk);
diff --combined net/tipc/bearer.c
index d49598f6002bc154182f4eb7c120942d67f11de0,89cd061c4468247cf761541ff1a2ca27f0836d6f..ac1d66d7e1fdddcfc53e2251e542c0c8e28ef6d8
@@@ -65,6 -65,8 +65,8 @@@ static struct tipc_bearer *bearer_get(s
  }
  
  static void bearer_disable(struct net *net, struct tipc_bearer *b);
+ static int tipc_l2_rcv_msg(struct sk_buff *skb, struct net_device *dev,
+                          struct packet_type *pt, struct net_device *orig_dev);
  
  /**
   * tipc_media_find - locates specified media object by name
@@@ -365,6 -367,30 +367,6 @@@ static int tipc_reset_bearer(struct ne
        return 0;
  }
  
 -/* tipc_bearer_reset_all - reset all links on all bearers
 - */
 -void tipc_bearer_reset_all(struct net *net)
 -{
 -      struct tipc_bearer *b;
 -      int i;
 -
 -      for (i = 0; i < MAX_BEARERS; i++) {
 -              b = bearer_get(net, i);
 -              if (b)
 -                      clear_bit_unlock(0, &b->up);
 -      }
 -      for (i = 0; i < MAX_BEARERS; i++) {
 -              b = bearer_get(net, i);
 -              if (b)
 -                      tipc_reset_bearer(net, b);
 -      }
 -      for (i = 0; i < MAX_BEARERS; i++) {
 -              b = bearer_get(net, i);
 -              if (b)
 -                      test_and_set_bit_lock(0, &b->up);
 -      }
 -}
 -
  /**
   * bearer_disable
   *
@@@ -404,6 -430,10 +406,10 @@@ int tipc_enable_l2_media(struct net *ne
  
        /* Associate TIPC bearer with L2 bearer */
        rcu_assign_pointer(b->media_ptr, dev);
+       b->pt.dev = dev;
+       b->pt.type = htons(ETH_P_TIPC);
+       b->pt.func = tipc_l2_rcv_msg;
+       dev_add_pack(&b->pt);
        memset(&b->bcast_addr, 0, sizeof(b->bcast_addr));
        memcpy(b->bcast_addr.value, dev->broadcast, b->media->hwaddr_len);
        b->bcast_addr.media_id = b->media->type_id;
@@@ -423,6 -453,7 +429,7 @@@ void tipc_disable_l2_media(struct tipc_
        struct net_device *dev;
  
        dev = (struct net_device *)rtnl_dereference(b->media_ptr);
+       dev_remove_pack(&b->pt);
        RCU_INIT_POINTER(dev->tipc_ptr, NULL);
        synchronize_net();
        dev_put(dev);
@@@ -570,11 -601,12 +577,12 @@@ static int tipc_l2_rcv_msg(struct sk_bu
        struct tipc_bearer *b;
  
        rcu_read_lock();
-       b = rcu_dereference_rtnl(dev->tipc_ptr);
+       b = rcu_dereference_rtnl(dev->tipc_ptr) ?:
+               rcu_dereference_rtnl(orig_dev->tipc_ptr);
        if (likely(b && test_bit(0, &b->up) &&
                   (skb->pkt_type <= PACKET_MULTICAST))) {
                skb->next = NULL;
-               tipc_rcv(dev_net(dev), skb, b);
+               tipc_rcv(dev_net(b->pt.dev), skb, b);
                rcu_read_unlock();
                return NET_RX_SUCCESS;
        }
@@@ -635,11 -667,6 +643,6 @@@ static int tipc_l2_device_event(struct 
        return NOTIFY_OK;
  }
  
- static struct packet_type tipc_packet_type __read_mostly = {
-       .type = htons(ETH_P_TIPC),
-       .func = tipc_l2_rcv_msg,
- };
  static struct notifier_block notifier = {
        .notifier_call  = tipc_l2_device_event,
        .priority       = 0,
  
  int tipc_bearer_setup(void)
  {
-       int err;
-       err = register_netdevice_notifier(&notifier);
-       if (err)
-               return err;
-       dev_add_pack(&tipc_packet_type);
-       return 0;
+       return register_netdevice_notifier(&notifier);
  }
  
  void tipc_bearer_cleanup(void)
  {
        unregister_netdevice_notifier(&notifier);
-       dev_remove_pack(&tipc_packet_type);
  }
  
  void tipc_bearer_stop(struct net *net)
diff --combined net/tipc/bearer.h
index 865cb0901a20094a9abbf8eb66da4faac5539950,e07a55a80c18ba0f3c4f1187b7544faeca395a4e..42d6eeeb646ddca457aec269de1650b1269cb411
@@@ -131,6 -131,7 +131,7 @@@ struct tipc_media 
   * @name: bearer name (format = media:interface)
   * @media: ptr to media structure associated with bearer
   * @bcast_addr: media address used in broadcasting
+  * @pt: packet type for bearer
   * @rcu: rcu struct for tipc_bearer
   * @priority: default link priority for bearer
   * @window: default window size for bearer
@@@ -151,6 -152,7 +152,7 @@@ struct tipc_bearer 
        char name[TIPC_MAX_BEARER_NAME];
        struct tipc_media *media;
        struct tipc_media_addr bcast_addr;
+       struct packet_type pt;
        struct rcu_head rcu;
        u32 priority;
        u32 window;
@@@ -210,6 -212,7 +212,6 @@@ void tipc_bearer_remove_dest(struct ne
  struct tipc_bearer *tipc_bearer_find(struct net *net, const char *name);
  int tipc_bearer_get_name(struct net *net, char *name, u32 bearer_id);
  struct tipc_media *tipc_media_find(const char *name);
 -void tipc_bearer_reset_all(struct net *net);
  int tipc_bearer_setup(void);
  void tipc_bearer_cleanup(void);
  void tipc_bearer_stop(struct net *net);
diff --combined net/tipc/node.c
index eb728397c810af8c63a1d5501226f04a40e6a65d,7dd22330a6b4bf9113e189c613a863fce13425a2..198dbc7adbe126cdb00d8e4508ff47a250f5b2f4
@@@ -1126,8 -1126,8 +1126,8 @@@ int tipc_node_get_linkname(struct net *
                strncpy(linkname, tipc_link_name(link), len);
                err = 0;
        }
- exit:
        tipc_node_read_unlock(node);
+ exit:
        tipc_node_put(node);
        return err;
  }
@@@ -1284,7 -1284,7 +1284,7 @@@ static void tipc_node_bc_sync_rcv(struc
        rc = tipc_bcast_sync_rcv(n->net, n->bc_entry.link, hdr);
  
        if (rc & TIPC_LINK_DOWN_EVT) {
 -              tipc_bearer_reset_all(n->net);
 +              tipc_node_reset_links(n);
                return;
        }
  
@@@ -1351,9 -1351,15 +1351,9 @@@ static void tipc_node_bc_rcv(struct ne
        if (!skb_queue_empty(&be->inputq1))
                tipc_node_mcast_rcv(n);
  
 -      if (rc & TIPC_LINK_DOWN_EVT) {
 -              /* Reception reassembly failure => reset all links to peer */
 -              if (!tipc_link_is_up(be->link))
 -                      tipc_node_reset_links(n);
 -
 -              /* Retransmission failure => reset all links to all peers */
 -              if (!tipc_link_is_up(tipc_bc_sndlink(net)))
 -                      tipc_bearer_reset_all(net);
 -      }
 +      /* If reassembly or retransmission fails => reset all links to peer */
 +      if (rc & TIPC_LINK_DOWN_EVT)
 +              tipc_node_reset_links(n);
  
        tipc_node_put(n);
  }
@@@ -1551,6 -1557,8 +1551,8 @@@ void tipc_rcv(struct net *net, struct s
  
        /* Check/update node state before receiving */
        if (unlikely(skb)) {
+               if (unlikely(skb_linearize(skb)))
+                       goto discard;
                tipc_node_write_lock(n);
                if (tipc_node_check_state(n, skb, bearer_id, &xmitq)) {
                        if (le->link) {
diff --combined net/xfrm/xfrm_policy.c
index cc0d783ccbad0b52713dae77d564468b966b9b3c,69b16ee327d9958769f09c66d54ace50889d6665..f06253969972aa3489e557faf1ef76f54b1eb3d3
@@@ -24,7 -24,6 +24,7 @@@
  #include <linux/netfilter.h>
  #include <linux/module.h>
  #include <linux/cache.h>
 +#include <linux/cpu.h>
  #include <linux/audit.h>
  #include <net/dst.h>
  #include <net/flow.h>
@@@ -45,8 -44,6 +45,8 @@@ struct xfrm_flo 
        u8 flags;
  };
  
 +static DEFINE_PER_CPU(struct xfrm_dst *, xfrm_last_dst);
 +static struct work_struct *xfrm_pcpu_work __read_mostly;
  static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock);
  static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1]
                                                __read_mostly;
@@@ -122,7 -119,7 +122,7 @@@ static const struct xfrm_policy_afinfo 
  struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif,
                                    const xfrm_address_t *saddr,
                                    const xfrm_address_t *daddr,
 -                                  int family)
 +                                  int family, u32 mark)
  {
        const struct xfrm_policy_afinfo *afinfo;
        struct dst_entry *dst;
        if (unlikely(afinfo == NULL))
                return ERR_PTR(-EAFNOSUPPORT);
  
 -      dst = afinfo->dst_lookup(net, tos, oif, saddr, daddr);
 +      dst = afinfo->dst_lookup(net, tos, oif, saddr, daddr, mark);
  
        rcu_read_unlock();
  
@@@ -143,7 -140,7 +143,7 @@@ static inline struct dst_entry *xfrm_ds
                                                int tos, int oif,
                                                xfrm_address_t *prev_saddr,
                                                xfrm_address_t *prev_daddr,
 -                                              int family)
 +                                              int family, u32 mark)
  {
        struct net *net = xs_net(x);
        xfrm_address_t *saddr = &x->props.saddr;
                daddr = x->coaddr;
        }
  
 -      dst = __xfrm_dst_lookup(net, tos, oif, saddr, daddr, family);
 +      dst = __xfrm_dst_lookup(net, tos, oif, saddr, daddr, family, mark);
  
        if (!IS_ERR(dst)) {
                if (prev_saddr != saddr)
@@@ -249,6 -246,36 +249,6 @@@ expired
        xfrm_pol_put(xp);
  }
  
 -static struct flow_cache_object *xfrm_policy_flo_get(struct flow_cache_object *flo)
 -{
 -      struct xfrm_policy *pol = container_of(flo, struct xfrm_policy, flo);
 -
 -      if (unlikely(pol->walk.dead))
 -              flo = NULL;
 -      else
 -              xfrm_pol_hold(pol);
 -
 -      return flo;
 -}
 -
 -static int xfrm_policy_flo_check(struct flow_cache_object *flo)
 -{
 -      struct xfrm_policy *pol = container_of(flo, struct xfrm_policy, flo);
 -
 -      return !pol->walk.dead;
 -}
 -
 -static void xfrm_policy_flo_delete(struct flow_cache_object *flo)
 -{
 -      xfrm_pol_put(container_of(flo, struct xfrm_policy, flo));
 -}
 -
 -static const struct flow_cache_ops xfrm_policy_fc_ops = {
 -      .get = xfrm_policy_flo_get,
 -      .check = xfrm_policy_flo_check,
 -      .delete = xfrm_policy_flo_delete,
 -};
 -
  /* Allocate xfrm_policy. Not used here, it is supposed to be used by pfkeyv2
   * SPD calls.
   */
@@@ -271,6 -298,7 +271,6 @@@ struct xfrm_policy *xfrm_policy_alloc(s
                                (unsigned long)policy);
                setup_timer(&policy->polq.hold_timer, xfrm_policy_queue_process,
                            (unsigned long)policy);
 -              policy->flo.ops = &xfrm_policy_fc_ops;
        }
        return policy;
  }
@@@ -770,6 -798,7 +770,6 @@@ int xfrm_policy_insert(int dir, struct 
        else
                hlist_add_head(&policy->bydst, chain);
        __xfrm_policy_link(policy, dir);
 -      atomic_inc(&net->xfrm.flow_cache_genid);
  
        /* After previous checking, family can either be AF_INET or AF_INET6 */
        if (policy->family == AF_INET)
@@@ -975,8 -1004,6 +975,8 @@@ int xfrm_policy_flush(struct net *net, 
        }
        if (!cnt)
                err = -ESRCH;
 +      else
 +              xfrm_policy_cache_flush();
  out:
        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
        return err;
@@@ -1148,7 -1175,7 +1148,7 @@@ fail
  }
  
  static struct xfrm_policy *
 -__xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir)
 +xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir)
  {
  #ifdef CONFIG_XFRM_SUB_POLICY
        struct xfrm_policy *pol;
        return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir);
  }
  
 -static int flow_to_policy_dir(int dir)
 -{
 -      if (XFRM_POLICY_IN == FLOW_DIR_IN &&
 -          XFRM_POLICY_OUT == FLOW_DIR_OUT &&
 -          XFRM_POLICY_FWD == FLOW_DIR_FWD)
 -              return dir;
 -
 -      switch (dir) {
 -      default:
 -      case FLOW_DIR_IN:
 -              return XFRM_POLICY_IN;
 -      case FLOW_DIR_OUT:
 -              return XFRM_POLICY_OUT;
 -      case FLOW_DIR_FWD:
 -              return XFRM_POLICY_FWD;
 -      }
 -}
 -
 -static struct flow_cache_object *
 -xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family,
 -                 u8 dir, struct flow_cache_object *old_obj, void *ctx)
 -{
 -      struct xfrm_policy *pol;
 -
 -      if (old_obj)
 -              xfrm_pol_put(container_of(old_obj, struct xfrm_policy, flo));
 -
 -      pol = __xfrm_policy_lookup(net, fl, family, flow_to_policy_dir(dir));
 -      if (IS_ERR_OR_NULL(pol))
 -              return ERR_CAST(pol);
 -
 -      /* Resolver returns two references:
 -       * one for cache and one for caller of flow_cache_lookup() */
 -      xfrm_pol_hold(pol);
 -
 -      return &pol->flo;
 -}
 -
 -static inline int policy_to_flow_dir(int dir)
 -{
 -      if (XFRM_POLICY_IN == FLOW_DIR_IN &&
 -          XFRM_POLICY_OUT == FLOW_DIR_OUT &&
 -          XFRM_POLICY_FWD == FLOW_DIR_FWD)
 -              return dir;
 -      switch (dir) {
 -      default:
 -      case XFRM_POLICY_IN:
 -              return FLOW_DIR_IN;
 -      case XFRM_POLICY_OUT:
 -              return FLOW_DIR_OUT;
 -      case XFRM_POLICY_FWD:
 -              return FLOW_DIR_FWD;
 -      }
 -}
 -
  static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
                                                 const struct flowi *fl, u16 family)
  {
                        }
                        err = security_xfrm_policy_lookup(pol->security,
                                                      fl->flowi_secid,
 -                                                    policy_to_flow_dir(dir));
 +                                                    dir);
                        if (!err) {
                                if (!xfrm_pol_hold_rcu(pol))
                                        goto again;
@@@ -1340,14 -1422,14 +1340,14 @@@ int __xfrm_sk_clone_policy(struct sock 
  
  static int
  xfrm_get_saddr(struct net *net, int oif, xfrm_address_t *local,
 -             xfrm_address_t *remote, unsigned short family)
 +             xfrm_address_t *remote, unsigned short family, u32 mark)
  {
        int err;
        const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
  
        if (unlikely(afinfo == NULL))
                return -EINVAL;
 -      err = afinfo->get_saddr(net, oif, local, remote);
 +      err = afinfo->get_saddr(net, oif, local, remote, mark);
        rcu_read_unlock();
        return err;
  }
@@@ -1378,7 -1460,7 +1378,7 @@@ xfrm_tmpl_resolve_one(struct xfrm_polic
                        if (xfrm_addr_any(local, tmpl->encap_family)) {
                                error = xfrm_get_saddr(net, fl->flowi_oif,
                                                       &tmp, remote,
 -                                                     tmpl->encap_family);
 +                                                     tmpl->encap_family, 0);
                                if (error)
                                        goto fail;
                                local = &tmp;
@@@ -1463,6 -1545,58 +1463,6 @@@ static int xfrm_get_tos(const struct fl
        return tos;
  }
  
 -static struct flow_cache_object *xfrm_bundle_flo_get(struct flow_cache_object *flo)
 -{
 -      struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo);
 -      struct dst_entry *dst = &xdst->u.dst;
 -
 -      if (xdst->route == NULL) {
 -              /* Dummy bundle - if it has xfrms we were not
 -               * able to build bundle as template resolution failed.
 -               * It means we need to try again resolving. */
 -              if (xdst->num_xfrms > 0)
 -                      return NULL;
 -      } else if (dst->flags & DST_XFRM_QUEUE) {
 -              return NULL;
 -      } else {
 -              /* Real bundle */
 -              if (stale_bundle(dst))
 -                      return NULL;
 -      }
 -
 -      dst_hold(dst);
 -      return flo;
 -}
 -
 -static int xfrm_bundle_flo_check(struct flow_cache_object *flo)
 -{
 -      struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo);
 -      struct dst_entry *dst = &xdst->u.dst;
 -
 -      if (!xdst->route)
 -              return 0;
 -      if (stale_bundle(dst))
 -              return 0;
 -
 -      return 1;
 -}
 -
 -static void xfrm_bundle_flo_delete(struct flow_cache_object *flo)
 -{
 -      struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo);
 -      struct dst_entry *dst = &xdst->u.dst;
 -
 -      /* Mark DST_OBSOLETE_DEAD to fail the next xfrm_dst_check() */
 -      dst->obsolete = DST_OBSOLETE_DEAD;
 -      dst_release_immediate(dst);
 -}
 -
 -static const struct flow_cache_ops xfrm_bundle_fc_ops = {
 -      .get = xfrm_bundle_flo_get,
 -      .check = xfrm_bundle_flo_check,
 -      .delete = xfrm_bundle_flo_delete,
 -};
 -
  static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
  {
        const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
                struct dst_entry *dst = &xdst->u.dst;
  
                memset(dst + 1, 0, sizeof(*xdst) - sizeof(*dst));
 -              xdst->flo.ops = &xfrm_bundle_fc_ops;
        } else
                xdst = ERR_PTR(-ENOBUFS);
  
@@@ -1598,8 -1733,7 +1598,8 @@@ static struct dst_entry *xfrm_bundle_cr
                if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) {
                        family = xfrm[i]->props.family;
                        dst = xfrm_dst_lookup(xfrm[i], tos, fl->flowi_oif,
 -                                            &saddr, &daddr, family);
 +                                            &saddr, &daddr, family,
 +                                            xfrm[i]->props.output_mark);
                        err = PTR_ERR(dst);
                        if (IS_ERR(dst))
                                goto put_states;
@@@ -1706,102 -1840,6 +1706,102 @@@ static int xfrm_expand_policies(const s
  
  }
  
 +static void xfrm_last_dst_update(struct xfrm_dst *xdst, struct xfrm_dst *old)
 +{
 +      this_cpu_write(xfrm_last_dst, xdst);
 +      if (old)
 +              dst_release(&old->u.dst);
 +}
 +
 +static void __xfrm_pcpu_work_fn(void)
 +{
 +      struct xfrm_dst *old;
 +
 +      old = this_cpu_read(xfrm_last_dst);
 +      if (old && !xfrm_bundle_ok(old))
 +              xfrm_last_dst_update(NULL, old);
 +}
 +
 +static void xfrm_pcpu_work_fn(struct work_struct *work)
 +{
 +      local_bh_disable();
 +      rcu_read_lock();
 +      __xfrm_pcpu_work_fn();
 +      rcu_read_unlock();
 +      local_bh_enable();
 +}
 +
 +void xfrm_policy_cache_flush(void)
 +{
 +      struct xfrm_dst *old;
 +      bool found = false;
 +      int cpu;
 +
 +      local_bh_disable();
 +      rcu_read_lock();
 +      for_each_possible_cpu(cpu) {
 +              old = per_cpu(xfrm_last_dst, cpu);
 +              if (old && !xfrm_bundle_ok(old)) {
 +                      if (smp_processor_id() == cpu) {
 +                              __xfrm_pcpu_work_fn();
 +                              continue;
 +                      }
 +                      found = true;
 +                      break;
 +              }
 +      }
 +
 +      rcu_read_unlock();
 +      local_bh_enable();
 +
 +      if (!found)
 +              return;
 +
 +      get_online_cpus();
 +
 +      for_each_possible_cpu(cpu) {
 +              bool bundle_release;
 +
 +              rcu_read_lock();
 +              old = per_cpu(xfrm_last_dst, cpu);
 +              bundle_release = old && !xfrm_bundle_ok(old);
 +              rcu_read_unlock();
 +
 +              if (!bundle_release)
 +                      continue;
 +
 +              if (cpu_online(cpu)) {
 +                      schedule_work_on(cpu, &xfrm_pcpu_work[cpu]);
 +                      continue;
 +              }
 +
 +              rcu_read_lock();
 +              old = per_cpu(xfrm_last_dst, cpu);
 +              if (old && !xfrm_bundle_ok(old)) {
 +                      per_cpu(xfrm_last_dst, cpu) = NULL;
 +                      dst_release(&old->u.dst);
 +              }
 +              rcu_read_unlock();
 +      }
 +
 +      put_online_cpus();
 +}
 +
 +static bool xfrm_pol_dead(struct xfrm_dst *xdst)
 +{
 +      unsigned int num_pols = xdst->num_pols;
 +      unsigned int pol_dead = 0, i;
 +
 +      for (i = 0; i < num_pols; i++)
 +              pol_dead |= xdst->pols[i]->walk.dead;
 +
 +      /* Mark DST_OBSOLETE_DEAD to fail the next xfrm_dst_check() */
 +      if (pol_dead)
 +              xdst->u.dst.obsolete = DST_OBSOLETE_DEAD;
 +
 +      return pol_dead;
 +}
 +
  static struct xfrm_dst *
  xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols,
                               const struct flowi *fl, u16 family,
  {
        struct net *net = xp_net(pols[0]);
        struct xfrm_state *xfrm[XFRM_MAX_DEPTH];
 +      struct xfrm_dst *xdst, *old;
        struct dst_entry *dst;
 -      struct xfrm_dst *xdst;
        int err;
  
 +      xdst = this_cpu_read(xfrm_last_dst);
 +      if (xdst &&
 +          xdst->u.dst.dev == dst_orig->dev &&
 +          xdst->num_pols == num_pols &&
 +          !xfrm_pol_dead(xdst) &&
 +          memcmp(xdst->pols, pols,
 +                 sizeof(struct xfrm_policy *) * num_pols) == 0 &&
 +          xfrm_bundle_ok(xdst)) {
 +              dst_hold(&xdst->u.dst);
 +              return xdst;
 +      }
 +
 +      old = xdst;
        /* Try to instantiate a bundle */
        err = xfrm_tmpl_resolve(pols, num_pols, fl, xfrm, family);
        if (err <= 0) {
        memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols);
        xdst->policy_genid = atomic_read(&pols[0]->genid);
  
 +      atomic_set(&xdst->u.dst.__refcnt, 2);
 +      xfrm_last_dst_update(xdst, old);
 +
        return xdst;
  }
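In xfrm_resolve_and_create_bundle() the cached entry is reused only when the output device, the policy count and the policy pointers themselves all match and both the policies and the bundle are still valid; otherwise a new bundle is built and installed with a reference count of 2 (one for the caller, one kept by the per-CPU cache). A sketch of just the comparison step, with a hypothetical struct cached layout standing in for struct xfrm_dst:

#include <stdbool.h>
#include <string.h>

#define MAX_POLS 8

struct cached {
    const void *dev;
    int num_pols;
    const void *pols[MAX_POLS];
    bool valid;
};

/* Reuse the cached entry only when it describes exactly the same request. */
static bool cache_matches(const struct cached *c, const void *dev,
                          const void *const *pols, int num_pols)
{
    return c && c->valid &&
           c->dev == dev &&
           c->num_pols == num_pols &&
           memcmp(c->pols, pols, sizeof(pols[0]) * num_pols) == 0;
}

Note that memcmp() compares the policy pointers themselves, exactly as the kernel code does.
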
  
@@@ -2029,39 -2051,86 +2029,39 @@@ free_dst
        goto out;
  }
  
 -static struct flow_cache_object *
 -xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir,
 -                 struct flow_cache_object *oldflo, void *ctx)
 +static struct xfrm_dst *
 +xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, struct xfrm_flo *xflo)
  {
 -      struct xfrm_flo *xflo = (struct xfrm_flo *)ctx;
        struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
 -      struct xfrm_dst *xdst, *new_xdst;
 -      int num_pols = 0, num_xfrms = 0, i, err, pol_dead;
 -
 -      /* Check if the policies from old bundle are usable */
 -      xdst = NULL;
 -      if (oldflo) {
 -              xdst = container_of(oldflo, struct xfrm_dst, flo);
 -              num_pols = xdst->num_pols;
 -              num_xfrms = xdst->num_xfrms;
 -              pol_dead = 0;
 -              for (i = 0; i < num_pols; i++) {
 -                      pols[i] = xdst->pols[i];
 -                      pol_dead |= pols[i]->walk.dead;
 -              }
 -              if (pol_dead) {
 -                      /* Mark DST_OBSOLETE_DEAD to fail the next
 -                       * xfrm_dst_check()
 -                       */
 -                      xdst->u.dst.obsolete = DST_OBSOLETE_DEAD;
 -                      dst_release_immediate(&xdst->u.dst);
 -                      xdst = NULL;
 -                      num_pols = 0;
 -                      num_xfrms = 0;
 -                      oldflo = NULL;
 -              }
 -      }
 +      int num_pols = 0, num_xfrms = 0, err;
 +      struct xfrm_dst *xdst;
  
        /* Resolve policies to use if we couldn't get them from
         * previous cache entry */
 -      if (xdst == NULL) {
 -              num_pols = 1;
 -              pols[0] = __xfrm_policy_lookup(net, fl, family,
 -                                             flow_to_policy_dir(dir));
 -              err = xfrm_expand_policies(fl, family, pols,
 +      num_pols = 1;
 +      pols[0] = xfrm_policy_lookup(net, fl, family, dir);
 +      err = xfrm_expand_policies(fl, family, pols,
                                           &num_pols, &num_xfrms);
 -              if (err < 0)
 -                      goto inc_error;
 -              if (num_pols == 0)
 -                      return NULL;
 -              if (num_xfrms <= 0)
 -                      goto make_dummy_bundle;
 -      }
 +      if (err < 0)
 +              goto inc_error;
 +      if (num_pols == 0)
 +              return NULL;
 +      if (num_xfrms <= 0)
 +              goto make_dummy_bundle;
  
 -      new_xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family,
 +      xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family,
                                                  xflo->dst_orig);
 -      if (IS_ERR(new_xdst)) {
 -              err = PTR_ERR(new_xdst);
 +      if (IS_ERR(xdst)) {
 +              err = PTR_ERR(xdst);
                if (err != -EAGAIN)
                        goto error;
 -              if (oldflo == NULL)
 -                      goto make_dummy_bundle;
 -              dst_hold(&xdst->u.dst);
 -              return oldflo;
 -      } else if (new_xdst == NULL) {
 +              goto make_dummy_bundle;
 +      } else if (xdst == NULL) {
                num_xfrms = 0;
 -              if (oldflo == NULL)
 -                      goto make_dummy_bundle;
 -              xdst->num_xfrms = 0;
 -              dst_hold(&xdst->u.dst);
 -              return oldflo;
 -      }
 -
 -      /* Kill the previous bundle */
 -      if (xdst) {
 -              /* The policies were stolen for newly generated bundle */
 -              xdst->num_pols = 0;
 -              /* Mark DST_OBSOLETE_DEAD to fail the next xfrm_dst_check() */
 -              xdst->u.dst.obsolete = DST_OBSOLETE_DEAD;
 -              dst_release_immediate(&xdst->u.dst);
 +              goto make_dummy_bundle;
        }
  
 -      /* We do need to return one reference for original caller */
 -      dst_hold(&new_xdst->u.dst);
 -      return &new_xdst->flo;
 +      return xdst;
  
  make_dummy_bundle:
        /* We found policies, but there's no bundles to instantiate:
        memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols);
  
        dst_hold(&xdst->u.dst);
 -      return &xdst->flo;
 +      return xdst;
  
  inc_error:
        XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
  error:
 -      if (xdst != NULL) {
 -              /* Mark DST_OBSOLETE_DEAD to fail the next xfrm_dst_check() */
 -              xdst->u.dst.obsolete = DST_OBSOLETE_DEAD;
 -              dst_release_immediate(&xdst->u.dst);
 -      } else
 -              xfrm_pols_put(pols, num_pols);
 +      xfrm_pols_put(pols, num_pols);
        return ERR_PTR(err);
  }
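With the flow cache gone, xfrm_bundle_lookup() no longer inspects a previously cached object: it resolves the policies for the flow and then either returns a real bundle, falls back to a dummy bundle when policies exist but no states can be resolved yet, returns NULL when there is no policy at all, or propagates an error. Callers such as xfrm_lookup() distinguish these outcomes with IS_ERR()/PTR_ERR() on the returned pointer. That pointer-encoded error convention can be reproduced in a few lines of userspace C; this is a simplified rendering of the idea, not the kernel's err.h:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_ERRNO 4095

/* Encode a small negative errno value in an otherwise invalid pointer. */
static inline void *ERR_PTR_(long error) { return (void *)error; }
static inline long PTR_ERR_(const void *ptr) { return (long)ptr; }
static inline int IS_ERR_(const void *ptr)
{
    return (uintptr_t)ptr >= (uintptr_t)-MAX_ERRNO;
}

int main(void)
{
    void *p = ERR_PTR_(-ENOBUFS);

    if (IS_ERR_(p))
        printf("lookup failed: %ld\n", PTR_ERR_(p));   /* -105 on Linux */
    return 0;
}
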
  
@@@ -2113,10 -2187,11 +2113,10 @@@ struct dst_entry *xfrm_lookup(struct ne
                              const struct sock *sk, int flags)
  {
        struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
 -      struct flow_cache_object *flo;
        struct xfrm_dst *xdst;
        struct dst_entry *dst, *route;
        u16 family = dst_orig->ops->family;
 -      u8 dir = policy_to_flow_dir(XFRM_POLICY_OUT);
 +      u8 dir = XFRM_POLICY_OUT;
        int i, err, num_pols, num_xfrms = 0, drop_pols = 0;
  
        dst = NULL;
                                goto no_transform;
                        }
  
-                       dst_hold(&xdst->u.dst);
                        route = xdst->route;
                }
        }
                    !net->xfrm.policy_count[XFRM_POLICY_OUT])
                        goto nopol;
  
 -              flo = flow_cache_lookup(net, fl, family, dir,
 -                                      xfrm_bundle_lookup, &xflo);
 -              if (flo == NULL)
 +              xdst = xfrm_bundle_lookup(net, fl, family, dir, &xflo);
 +              if (xdst == NULL)
                        goto nopol;
 -              if (IS_ERR(flo)) {
 -                      err = PTR_ERR(flo);
 +              if (IS_ERR(xdst)) {
 +                      err = PTR_ERR(xdst);
                        goto dropdst;
                }
 -              xdst = container_of(flo, struct xfrm_dst, flo);
  
                num_pols = xdst->num_pols;
                num_xfrms = xdst->num_xfrms;
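Previously xfrm_lookup() received a struct flow_cache_object embedded inside the bundle and had to recover the enclosing struct xfrm_dst with container_of(); returning the xfrm_dst directly removes that step. For reference, container_of() can be written portably as below (simplified, without the type checking the kernel macro adds):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

struct flo { int dummy; };
struct xdst_like {
    int num_pols;
    struct flo flo;             /* embedded member, as the old xfrm_dst embedded flo */
};

int main(void)
{
    struct xdst_like x = { .num_pols = 2 };
    struct flo *inner = &x.flo;
    struct xdst_like *outer = container_of(inner, struct xdst_like, flo);

    printf("num_pols = %d\n", outer->num_pols);   /* 2 */
    return 0;
}
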
@@@ -2372,10 -2448,12 +2371,10 @@@ int __xfrm_policy_check(struct sock *sk
        int pi;
        int reverse;
        struct flowi fl;
 -      u8 fl_dir;
        int xerr_idx = -1;
  
        reverse = dir & ~XFRM_POLICY_MASK;
        dir &= XFRM_POLICY_MASK;
 -      fl_dir = policy_to_flow_dir(dir);
  
        if (__xfrm_decode_session(skb, &fl, family, reverse) < 0) {
                XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR);
                }
        }
  
 -      if (!pol) {
 -              struct flow_cache_object *flo;
 -
 -              flo = flow_cache_lookup(net, &fl, family, fl_dir,
 -                                      xfrm_policy_lookup, NULL);
 -              if (IS_ERR_OR_NULL(flo))
 -                      pol = ERR_CAST(flo);
 -              else
 -                      pol = container_of(flo, struct xfrm_policy, flo);
 -      }
 +      if (!pol)
 +              pol = xfrm_policy_lookup(net, &fl, family, dir);
  
        if (IS_ERR(pol)) {
                XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
@@@ -2554,9 -2640,11 +2553,9 @@@ static struct dst_entry *xfrm_dst_check
         * notice.  That's what we are validating here via the
         * stale_bundle() check.
         *
 -       * When an xdst is removed from flow cache, DST_OBSOLETE_DEAD will
 -       * be marked on it.
         * When a dst is removed from the fib tree, DST_OBSOLETE_DEAD will
         * be marked on it.
 -       * Both will force stable_bundle() to fail on any xdst bundle with
 +       * This will force stale_bundle() to fail on any xdst bundle with
         * this dst linked in it.
         */
        if (dst->obsolete < 0 && !stale_bundle(dst))
@@@ -2596,6 -2684,18 +2595,6 @@@ static struct dst_entry *xfrm_negative_
        return dst;
  }
  
 -void xfrm_garbage_collect(struct net *net)
 -{
 -      flow_cache_flush(net);
 -}
 -EXPORT_SYMBOL(xfrm_garbage_collect);
 -
 -void xfrm_garbage_collect_deferred(struct net *net)
 -{
 -      flow_cache_flush_deferred(net);
 -}
 -EXPORT_SYMBOL(xfrm_garbage_collect_deferred);
 -
  static void xfrm_init_pmtu(struct dst_entry *dst)
  {
        do {
@@@ -2933,9 -3033,14 +2932,9 @@@ static int __net_init xfrm_net_init(str
        rv = xfrm_sysctl_init(net);
        if (rv < 0)
                goto out_sysctl;
 -      rv = flow_cache_init(net);
 -      if (rv < 0)
 -              goto out;
  
        return 0;
  
 -out:
 -      xfrm_sysctl_fini(net);
  out_sysctl:
        xfrm_policy_fini(net);
  out_policy:
@@@ -2948,6 -3053,7 +2947,6 @@@ out_statistics
  
  static void __net_exit xfrm_net_exit(struct net *net)
  {
 -      flow_cache_fini(net);
        xfrm_sysctl_fini(net);
        xfrm_policy_fini(net);
        xfrm_state_fini(net);
@@@ -2961,15 -3067,7 +2960,15 @@@ static struct pernet_operations __net_i
  
  void __init xfrm_init(void)
  {
 -      flow_cache_hp_init();
 +      int i;
 +
 +      xfrm_pcpu_work = kmalloc_array(NR_CPUS, sizeof(*xfrm_pcpu_work),
 +                                     GFP_KERNEL);
 +      BUG_ON(!xfrm_pcpu_work);
 +
 +      for (i = 0; i < NR_CPUS; i++)
 +              INIT_WORK(&xfrm_pcpu_work[i], xfrm_pcpu_work_fn);
 +
        register_pernet_subsys(&xfrm_net_ops);
        seqcount_init(&xfrm_policy_hash_generation);
        xfrm_input_init();
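xfrm_init() now allocates one work item per possible CPU up front and binds each to xfrm_pcpu_work_fn(), so xfrm_policy_cache_flush() can later schedule the flush on the CPU that owns the cache entry; this replaces the flow cache hotplug initialization. kmalloc_array() is used rather than an open-coded kmalloc(n * size) because it rejects multiplications that would overflow. The same guard can be expressed in userspace with __builtin_mul_overflow (a GCC/Clang builtin); the helper and struct below are illustrative only:

#include <stdio.h>
#include <stdlib.h>

/* Array allocation that refuses to allocate on size_t overflow. */
static void *alloc_array(size_t n, size_t size)
{
    size_t bytes;

    if (__builtin_mul_overflow(n, size, &bytes))
        return NULL;
    return malloc(bytes);
}

struct work { void (*fn)(struct work *); };

static void flush_fn(struct work *w) { (void)w; }

int main(void)
{
    size_t ncpus = 8;                           /* stands in for NR_CPUS */
    struct work *w = alloc_array(ncpus, sizeof(*w));

    if (!w)
        return 1;
    for (size_t i = 0; i < ncpus; i++)          /* stands in for INIT_WORK() */
        w[i].fn = flush_fn;
    printf("initialized %zu work items\n", ncpus);
    free(w);
    return 0;
}
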
@@@ -3209,9 -3307,15 +3208,15 @@@ int xfrm_migrate(const struct xfrm_sele
        struct xfrm_state *x_new[XFRM_MAX_DEPTH];
        struct xfrm_migrate *mp;
  
+       /* Stage 0 - sanity checks */
        if ((err = xfrm_migrate_check(m, num_migrate)) < 0)
                goto out;
  
+       if (dir >= XFRM_POLICY_MAX) {
+               err = -EINVAL;
+               goto out;
+       }
        /* Stage 1 - find policy */
        if ((pol = xfrm_migrate_policy_find(sel, dir, type, net)) == NULL) {
                err = -ENOENT;
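The added stage-0 check rejects a direction of XFRM_POLICY_MAX or higher before it is used to select a policy table; the value arrives with the migrate request and must be treated as untrusted input. The pattern is simply "validate the index before indexing", for example:

#include <errno.h>
#include <stdio.h>

#define POLICY_MAX 3                    /* stands in for XFRM_POLICY_MAX */

static const char *tables[POLICY_MAX] = { "in", "out", "fwd" };

static int lookup_table(unsigned int dir, const char **out)
{
    if (dir >= POLICY_MAX)              /* reject out-of-range input */
        return -EINVAL;
    *out = tables[dir];
    return 0;
}

int main(void)
{
    const char *name;

    printf("dir 7 -> %d\n", lookup_table(7, &name));   /* -22 (-EINVAL) */
    if (lookup_table(1, &name) == 0)
        printf("dir 1 -> %s\n", name);                 /* "out" */
    return 0;
}
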
diff --combined net/xfrm/xfrm_state.c
index a41e2ef789c025ce5b4612208223a7fd65d5e9d4,a792effdb0b5d51fb88835349a44a756d3a9e5e7..0dab1cd79ce4d1afe84ba9422a740689a9ebdf71
@@@ -296,14 -296,12 +296,14 @@@ int xfrm_unregister_type_offload(const 
  }
  EXPORT_SYMBOL(xfrm_unregister_type_offload);
  
 -static const struct xfrm_type_offload *xfrm_get_type_offload(u8 proto, unsigned short family)
 +static const struct xfrm_type_offload *
 +xfrm_get_type_offload(u8 proto, unsigned short family, bool try_load)
  {
        struct xfrm_state_afinfo *afinfo;
        const struct xfrm_type_offload **typemap;
        const struct xfrm_type_offload *type;
  
 +retry:
        afinfo = xfrm_state_get_afinfo(family);
        if (unlikely(afinfo == NULL))
                return NULL;
        if ((type && !try_module_get(type->owner)))
                type = NULL;
  
 +      if (!type && try_load) {
 +              request_module("xfrm-offload-%d-%d", family, proto);
 +              try_load = 0;
 +              goto retry;
 +      }
 +
        rcu_read_unlock();
        return type;
  }
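When no offload type is registered for the (family, proto) pair and the caller asked for auto-loading, the function requests the "xfrm-offload-%d-%d" module alias, clears try_load so at most one retry happens, and repeats the lookup. The "look up, lazily load once, retry" shape looks like this in isolation (load_provider() below is a stand-in, not request_module()):

#include <stdbool.h>
#include <stdio.h>

static const char *registered;          /* stands in for the typemap entry */

static void load_provider(void)         /* stands in for request_module() */
{
    registered = "esp-offload";
}

static const char *get_offload(bool try_load)
{
    const char *type;
retry:
    type = registered;
    if (!type && try_load) {
        load_provider();
        try_load = false;               /* retry at most once */
        goto retry;
    }
    return type;
}

int main(void)
{
    printf("first lookup: %s\n", get_offload(true));   /* loads, then finds it */
    return 0;
}
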
@@@ -732,10 -724,9 +732,10 @@@ restart
                        }
                }
        }
 -      if (cnt)
 +      if (cnt) {
                err = 0;
 -
 +              xfrm_policy_cache_flush();
 +      }
  out:
        spin_unlock_bh(&net->xfrm.xfrm_state_lock);
        return err;
@@@ -1629,6 -1620,7 +1629,7 @@@ in
  xfrm_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n,
               unsigned short family, struct net *net)
  {
+       int i;
        int err = 0;
        struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
        if (!afinfo)
        spin_lock_bh(&net->xfrm.xfrm_state_lock); /*FIXME*/
        if (afinfo->tmpl_sort)
                err = afinfo->tmpl_sort(dst, src, n);
+       else
+               for (i = 0; i < n; i++)
+                       dst[i] = src[i];
        spin_unlock_bh(&net->xfrm.xfrm_state_lock);
        rcu_read_unlock();
        return err;
@@@ -1647,6 -1642,7 +1651,7 @@@ in
  xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n,
                unsigned short family)
  {
+       int i;
        int err = 0;
        struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
        struct net *net = xs_net(*src);
        spin_lock_bh(&net->xfrm.xfrm_state_lock);
        if (afinfo->state_sort)
                err = afinfo->state_sort(dst, src, n);
+       else
+               for (i = 0; i < n; i++)
+                       dst[i] = src[i];
        spin_unlock_bh(&net->xfrm.xfrm_state_lock);
        rcu_read_unlock();
        return err;
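Both sort helpers previously left dst untouched when the address family provided no sort callback, so callers could go on to use an uninitialized array; the new else branch makes the fallback an identity copy of the caller's order. In isolation the fallback is just:

#include <stdio.h>

struct state { int id; };

/* Fallback when no family-specific sort callback exists: keep the caller's order. */
static void copy_unsorted(struct state **dst, struct state **src, int n)
{
    for (int i = 0; i < n; i++)
        dst[i] = src[i];
}

int main(void)
{
    struct state a = { 1 }, b = { 2 };
    struct state *src[] = { &a, &b };
    struct state *dst[2];

    copy_unsorted(dst, src, 2);
    printf("%d %d\n", dst[0]->id, dst[1]->id);   /* 1 2 */
    return 0;
}
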
@@@ -2173,7 -2172,7 +2181,7 @@@ int xfrm_state_mtu(struct xfrm_state *x
        return mtu - x->props.header_len;
  }
  
 -int __xfrm_init_state(struct xfrm_state *x, bool init_replay)
 +int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload)
  {
        struct xfrm_state_afinfo *afinfo;
        struct xfrm_mode *inner_mode;
        if (x->type == NULL)
                goto error;
  
 -      x->type_offload = xfrm_get_type_offload(x->id.proto, family);
 +      x->type_offload = xfrm_get_type_offload(x->id.proto, family, offload);
  
        err = x->type->init_state(x);
        if (err)
@@@ -2266,7 -2265,7 +2274,7 @@@ EXPORT_SYMBOL(__xfrm_init_state)
  
  int xfrm_init_state(struct xfrm_state *x)
  {
 -      return __xfrm_init_state(x, true);
 +      return __xfrm_init_state(x, true, false);
  }
  
  EXPORT_SYMBOL(xfrm_init_state);
diff --combined net/xfrm/xfrm_user.c
index 490132d6dc36dd4c1060b8848ff7d7c1a362f5ae,9391ced0525986ce72938a9ed59c27ea124f7ba5..2bfbd9121e3b21b0eb793d2d3a685bd4cebde22b
@@@ -584,10 -584,7 +584,10 @@@ static struct xfrm_state *xfrm_state_co
  
        xfrm_mark_get(attrs, &x->mark);
  
 -      err = __xfrm_init_state(x, false);
 +      if (attrs[XFRMA_OUTPUT_MARK])
 +              x->props.output_mark = nla_get_u32(attrs[XFRMA_OUTPUT_MARK]);
 +
 +      err = __xfrm_init_state(x, false, attrs[XFRMA_OFFLOAD_DEV]);
        if (err)
                goto error;
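Here the presence of the XFRMA_OFFLOAD_DEV attribute (a struct nlattr pointer) is passed straight into the new bool parameter of __xfrm_init_state(), relying on C's rule that any non-null scalar converts to true, so offload module auto-loading only happens when the request actually carried an offload attribute. A two-line illustration of that conversion (names below are made up):

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
    int present = 0;
    void *attrs[2] = { NULL, &present };        /* [0] absent, [1] supplied */
    bool try_load0 = attrs[0];                  /* false: attribute missing */
    bool try_load1 = attrs[1];                  /* true: attribute supplied */

    printf("%d %d\n", try_load0, try_load1);    /* 0 1 */
    return 0;
}
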
  
@@@ -799,7 -796,7 +799,7 @@@ static int copy_user_offload(struct xfr
                return -EMSGSIZE;
  
        xuo = nla_data(attr);
+       memset(xuo, 0, sizeof(*xuo));
        xuo->ifindex = xso->dev->ifindex;
        xuo->flags = xso->flags;
  
@@@ -900,11 -897,6 +900,11 @@@ static int copy_to_user_state_extra(str
                ret = copy_user_offload(&x->xso, skb);
        if (ret)
                goto out;
 +      if (x->props.output_mark) {
 +              ret = nla_put_u32(skb, XFRMA_OUTPUT_MARK, x->props.output_mark);
 +              if (ret)
 +                      goto out;
 +      }
        if (x->security)
                ret = copy_sec_ctx(x->security, skb);
  out:
@@@ -1823,6 -1815,8 +1823,6 @@@ static int xfrm_get_policy(struct sk_bu
  
  out:
        xfrm_pol_put(xp);
 -      if (delete && err == 0)
 -              xfrm_garbage_collect(net);
        return err;
  }
  
@@@ -1875,6 -1869,7 +1875,7 @@@ static int build_aevent(struct sk_buff 
                return -EMSGSIZE;
  
        id = nlmsg_data(nlh);
+       memset(&id->sa_id, 0, sizeof(id->sa_id));
        memcpy(&id->sa_id.daddr, &x->id.daddr, sizeof(x->id.daddr));
        id->sa_id.spi = x->id.spi;
        id->sa_id.family = x->props.family;
@@@ -2033,6 -2028,7 +2034,6 @@@ static int xfrm_flush_policy(struct sk_
                        return 0;
                return err;
        }
 -      xfrm_garbage_collect(net);
  
        c.data.type = type;
        c.event = nlh->nlmsg_type;
@@@ -2462,7 -2458,6 +2463,7 @@@ static const struct nla_policy xfrma_po
        [XFRMA_PROTO]           = { .type = NLA_U8 },
        [XFRMA_ADDRESS_FILTER]  = { .len = sizeof(struct xfrm_address_filter) },
        [XFRMA_OFFLOAD_DEV]     = { .len = sizeof(struct xfrm_user_offload) },
 +      [XFRMA_OUTPUT_MARK]     = { .len = NLA_U32 },
  };
  
  static const struct nla_policy xfrma_spd_policy[XFRMA_SPD_MAX+1] = {
@@@ -2584,6 -2579,8 +2585,8 @@@ static int build_expire(struct sk_buff 
        ue = nlmsg_data(nlh);
        copy_to_user_state(x, &ue->state);
        ue->hard = (c->data.hard != 0) ? 1 : 0;
+       /* clear the padding bytes */
+       memset(&ue->hard + 1, 0, sizeof(*ue) - offsetofend(typeof(*ue), hard));
  
        err = xfrm_mark_put(skb, &x->mark);
        if (err)
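Several hunks in this file zero the netlink payload, or just its trailing padding, before filling in fields, so that compiler-inserted padding bytes never carry stale kernel memory out to userspace; build_expire() clears exactly the bytes between the end of 'hard' and the end of the structure. offsetofend() is not standard C, but it is a one-line macro over offsetof(); a userspace demonstration with a made-up message layout:

#include <stddef.h>
#include <stdio.h>
#include <string.h>

#define offsetofend(type, member) \
    (offsetof(type, member) + sizeof(((type *)0)->member))

struct expire_msg {
    long long state;            /* stand-in for the copied state */
    unsigned char hard;         /* last explicitly written field */
    /* the compiler typically inserts padding bytes here */
};

int main(void)
{
    struct expire_msg ue;

    memset(&ue, 0xAA, sizeof(ue));      /* simulate stale memory */
    ue.state = 42;
    ue.hard = 1;
    /* clear the padding after 'hard', as build_expire() now does */
    memset(&ue.hard + 1, 0, sizeof(ue) - offsetofend(struct expire_msg, hard));
    printf("last byte: %u\n", ((unsigned char *)&ue)[sizeof(ue) - 1]);   /* 0 */
    return 0;
}
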
@@@ -2682,8 -2679,6 +2685,8 @@@ static inline size_t xfrm_sa_len(struc
                l += nla_total_size(sizeof(x->props.extra_flags));
        if (x->xso.dev)
                 l += nla_total_size(sizeof(x->xso));
 +      if (x->props.output_mark)
 +              l += nla_total_size(sizeof(x->props.output_mark));
  
        /* Must count x->lastused as it may become non-zero behind our back. */
        l += nla_total_size_64bit(sizeof(u64));
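The optional XFRMA_OUTPUT_MARK attribute is only counted in the size estimate when the mark is nonzero, and copy_to_user_state_extra() emits it under exactly the same condition, so the reserved message space and the bytes actually written stay in step. A toy TLV writer showing why the size pass and the fill pass must share the condition (the 4-byte header format below is invented for the example, not the netlink attribute layout):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Toy TLV attribute: 2-byte type, 2-byte length, then the payload. */
static size_t tlv_size(size_t payload) { return 4 + payload; }

static size_t tlv_put_u32(unsigned char *buf, uint16_t type, uint32_t value)
{
    uint16_t len = (uint16_t)tlv_size(sizeof(value));

    memcpy(buf, &type, 2);
    memcpy(buf + 2, &len, 2);
    memcpy(buf + 4, &value, 4);
    return len;
}

int main(void)
{
    uint32_t output_mark = 0x17;        /* illustrative value */
    unsigned char msg[64];
    size_t need = 0, used = 0;

    /* Size pass and fill pass gate on the same condition, as xfrm_sa_len()
     * and copy_to_user_state_extra() do for the output mark. */
    if (output_mark)
        need += tlv_size(sizeof(output_mark));
    if (output_mark)
        used += tlv_put_u32(msg + used, 1, output_mark);

    assert(used <= sizeof(msg) && used == need);
    printf("need=%zu used=%zu\n", need, used);
    return 0;
}
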
@@@ -2723,6 -2718,7 +2726,7 @@@ static int xfrm_notify_sa(struct xfrm_s
                struct nlattr *attr;
  
                id = nlmsg_data(nlh);
+               memset(id, 0, sizeof(*id));
                memcpy(&id->daddr, &x->id.daddr, sizeof(id->daddr));
                id->spi = x->id.spi;
                id->family = x->props.family;