Three cases of simple overlapping changes.
Signed-off-by: David S. Miller <davem@davemloft.net>
static int bcm_sf2_eee_init(struct dsa_switch *ds, int port,
struct phy_device *phy)
{
- struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
- struct ethtool_eee *p = &priv->port_sts[port].eee;
int ret;
- p->supported = (SUPPORTED_1000baseT_Full | SUPPORTED_100baseT_Full);
-
ret = phy_init_eee(phy, 0);
if (ret)
return 0;
return 1;
}
-static int bcm_sf2_sw_get_eee(struct dsa_switch *ds, int port,
- struct ethtool_eee *e)
+static int bcm_sf2_sw_get_mac_eee(struct dsa_switch *ds, int port,
+ struct ethtool_eee *e)
{
struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
struct ethtool_eee *p = &priv->port_sts[port].eee;
return 0;
}
-static int bcm_sf2_sw_set_eee(struct dsa_switch *ds, int port,
- struct phy_device *phydev,
- struct ethtool_eee *e)
+static int bcm_sf2_sw_set_mac_eee(struct dsa_switch *ds, int port,
+ struct ethtool_eee *e)
{
struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
struct ethtool_eee *p = &priv->port_sts[port].eee;
p->eee_enabled = e->eee_enabled;
-
- if (!p->eee_enabled) {
- bcm_sf2_eee_enable_set(ds, port, false);
- } else {
- p->eee_enabled = bcm_sf2_eee_init(ds, port, phydev);
- if (!p->eee_enabled)
- return -EOPNOTSUPP;
- }
+ bcm_sf2_eee_enable_set(ds, port, e->eee_enabled);
return 0;
}
static void bcm_sf2_sw_get_wol(struct dsa_switch *ds, int port,
struct ethtool_wolinfo *wol)
{
- struct net_device *p = ds->dst[ds->index].cpu_dp->netdev;
+ struct net_device *p = ds->dst->cpu_dp->netdev;
struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
struct ethtool_wolinfo pwol;
static int bcm_sf2_sw_set_wol(struct dsa_switch *ds, int port,
struct ethtool_wolinfo *wol)
{
- struct net_device *p = ds->dst[ds->index].cpu_dp->netdev;
+ struct net_device *p = ds->dst->cpu_dp->netdev;
struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
s8 cpu_port = ds->dst->cpu_dp->index;
struct ethtool_wolinfo pwol;
return 0;
}
-static struct b53_io_ops bcm_sf2_io_ops = {
+static const struct b53_io_ops bcm_sf2_io_ops = {
.read8 = bcm_sf2_core_read8,
.read16 = bcm_sf2_core_read16,
.read32 = bcm_sf2_core_read32,
.set_wol = bcm_sf2_sw_set_wol,
.port_enable = bcm_sf2_port_setup,
.port_disable = bcm_sf2_port_disable,
- .get_eee = bcm_sf2_sw_get_eee,
- .set_eee = bcm_sf2_sw_set_eee,
+ .get_mac_eee = bcm_sf2_sw_get_mac_eee,
+ .set_mac_eee = bcm_sf2_sw_set_mac_eee,
.port_bridge_join = b53_br_join,
.port_bridge_leave = b53_br_leave,
.port_stp_state_set = b53_br_set_stp_state,
.port_vlan_prepare = b53_vlan_prepare,
.port_vlan_add = b53_vlan_add,
.port_vlan_del = b53_vlan_del,
- .port_vlan_dump = b53_vlan_dump,
- .port_fdb_prepare = b53_fdb_prepare,
.port_fdb_dump = b53_fdb_dump,
.port_fdb_add = b53_fdb_add,
.port_fdb_del = b53_fdb_del,
u32 type;
const u16 *reg_offsets;
unsigned int core_reg_align;
+ unsigned int num_cfp_rules;
};
/* Register offsets for the SWITCH_REG_* block */
.type = BCM7445_DEVICE_ID,
.core_reg_align = 0,
.reg_offsets = bcm_sf2_7445_reg_offsets,
+ .num_cfp_rules = 256,
};
static const u16 bcm_sf2_7278_reg_offsets[] = {
.type = BCM7278_DEVICE_ID,
.core_reg_align = 1,
.reg_offsets = bcm_sf2_7278_reg_offsets,
+ .num_cfp_rules = 128,
};
static const struct of_device_id bcm_sf2_of_match[] = {
priv->type = data->type;
priv->reg_offsets = data->reg_offsets;
priv->core_reg_align = data->core_reg_align;
+ priv->num_cfp_rules = data->num_cfp_rules;
/* Auto-detection using standard registers will not work, so
* provide an indication of what kind of device we are for
u32 type;
const u16 *reg_offsets;
unsigned int core_reg_align;
+ unsigned int num_cfp_rules;
/* spinlock protecting access to the indirect registers */
spinlock_t indir_lock;
#define SF2_IO_MACRO(name) \
static inline u32 name##_readl(struct bcm_sf2_priv *priv, u32 off) \
{ \
- return __raw_readl(priv->name + off); \
+ return readl_relaxed(priv->name + off); \
} \
static inline void name##_writel(struct bcm_sf2_priv *priv, \
u32 val, u32 off) \
{ \
- __raw_writel(val, priv->name + off); \
+ writel_relaxed(val, priv->name + off); \
} \
/* Accesses to 64-bits register requires us to latch the hi/lo pairs
static inline u32 core_readl(struct bcm_sf2_priv *priv, u32 off)
{
u32 tmp = bcm_sf2_mangle_addr(priv, off);
- return __raw_readl(priv->core + tmp);
+ return readl_relaxed(priv->core + tmp);
}
static inline void core_writel(struct bcm_sf2_priv *priv, u32 val, u32 off)
{
u32 tmp = bcm_sf2_mangle_addr(priv, off);
- __raw_writel(val, priv->core + tmp);
+ writel_relaxed(val, priv->core + tmp);
}
static inline u32 reg_readl(struct bcm_sf2_priv *priv, u16 off)
{
- return __raw_readl(priv->reg + priv->reg_offsets[off]);
+ return readl_relaxed(priv->reg + priv->reg_offsets[off]);
}
static inline void reg_writel(struct bcm_sf2_priv *priv, u32 val, u16 off)
{
- __raw_writel(val, priv->reg + priv->reg_offsets[off]);
+ writel_relaxed(val, priv->reg + priv->reg_offsets[off]);
}
SF2_IO64_MACRO(core);
self->hw_head = 0;
self->sw_head = 0;
self->sw_tail = 0;
- spin_lock_init(&self->header.lock);
return 0;
}
}
#define AQ_SKB_ALIGN SKB_DATA_ALIGN(sizeof(struct skb_shared_info))
-int aq_ring_rx_clean(struct aq_ring_s *self, int *work_done, int budget)
+int aq_ring_rx_clean(struct aq_ring_s *self,
+ struct napi_struct *napi,
+ int *work_done,
+ int budget)
{
struct net_device *ndev = aq_nic_get_ndev(self->aq_nic);
int err = 0;
skb_record_rx_queue(skb, self->idx);
- netif_receive_skb(skb);
+ napi_gro_receive(napi, skb);
++self->stats.rx.packets;
self->stats.rx.bytes += skb->len;
#define AQ_VEC_RX_ID 1
static int aq_vec_poll(struct napi_struct *napi, int budget)
- __releases(&self->lock)
- __acquires(&self->lock)
{
struct aq_vec_s *self = container_of(napi, struct aq_vec_s, napi);
struct aq_ring_s *ring = NULL;
if (!self) {
err = -EINVAL;
- } else if (spin_trylock(&self->header.lock)) {
+ } else {
for (i = 0U, ring = self->ring[0];
self->tx_rings > i; ++i, ring = self->ring[i]) {
if (self->aq_hw_ops->hw_ring_tx_head_update) {
if (ring[AQ_VEC_RX_ID].sw_head !=
ring[AQ_VEC_RX_ID].hw_head) {
err = aq_ring_rx_clean(&ring[AQ_VEC_RX_ID],
+ napi,
&work_done,
budget - work_done);
if (err < 0)
self->aq_hw_ops->hw_irq_enable(self->aq_hw,
1U << self->aq_ring_param.vec_idx);
}
-
- err_exit:
- spin_unlock(&self->header.lock);
}
-
+ err_exit:
return work_done;
}
self->aq_hw_ops = aq_hw_ops;
self->aq_hw = aq_hw;
- spin_lock_init(&self->header.lock);
-
for (i = 0U, ring = self->ring[0];
self->tx_rings > i; ++i, ring = self->ring[i]) {
err = aq_ring_init(&ring[AQ_VEC_TX_ID]);
#define BCM_SYSPORT_IO_MACRO(name, offset) \
static inline u32 name##_readl(struct bcm_sysport_priv *priv, u32 off) \
{ \
- u32 reg = __raw_readl(priv->base + offset + off); \
+ u32 reg = readl_relaxed(priv->base + offset + off); \
return reg; \
} \
static inline void name##_writel(struct bcm_sysport_priv *priv, \
u32 val, u32 off) \
{ \
- __raw_writel(val, priv->base + offset + off); \
+ writel_relaxed(val, priv->base + offset + off); \
} \
BCM_SYSPORT_IO_MACRO(intrl2_0, SYS_PORT_INTRL2_0_OFFSET);
{
if (priv->is_lite && off >= RDMA_STATUS)
off += 4;
- return __raw_readl(priv->base + SYS_PORT_RDMA_OFFSET + off);
+ return readl_relaxed(priv->base + SYS_PORT_RDMA_OFFSET + off);
}
static inline void rdma_writel(struct bcm_sysport_priv *priv, u32 val, u32 off)
{
if (priv->is_lite && off >= RDMA_STATUS)
off += 4;
- __raw_writel(val, priv->base + SYS_PORT_RDMA_OFFSET + off);
+ writel_relaxed(val, priv->base + SYS_PORT_RDMA_OFFSET + off);
}
static inline u32 tdma_control_bit(struct bcm_sysport_priv *priv, u32 bit)
dma_addr_t addr)
{
#ifdef CONFIG_PHYS_ADDR_T_64BIT
- __raw_writel(upper_32_bits(addr) & DESC_ADDR_HI_MASK,
+ writel_relaxed(upper_32_bits(addr) & DESC_ADDR_HI_MASK,
d + DESC_ADDR_HI_STATUS_LEN);
#endif
- __raw_writel(lower_32_bits(addr), d + DESC_ADDR_LO);
+ writel_relaxed(lower_32_bits(addr), d + DESC_ADDR_LO);
}
static inline void tdma_port_write_desc_addr(struct bcm_sysport_priv *priv,
*/
static const struct bcm_sysport_stats bcm_sysport_gstrings_stats[] = {
/* general stats */
- STAT_NETDEV(rx_packets),
- STAT_NETDEV(tx_packets),
- STAT_NETDEV(rx_bytes),
- STAT_NETDEV(tx_bytes),
+ STAT_NETDEV64(rx_packets),
+ STAT_NETDEV64(tx_packets),
+ STAT_NETDEV64(rx_bytes),
+ STAT_NETDEV64(tx_bytes),
STAT_NETDEV(rx_errors),
STAT_NETDEV(tx_errors),
STAT_NETDEV(rx_dropped),
{
switch (type) {
case BCM_SYSPORT_STAT_NETDEV:
+ case BCM_SYSPORT_STAT_NETDEV64:
case BCM_SYSPORT_STAT_RXCHK:
case BCM_SYSPORT_STAT_RBUF:
case BCM_SYSPORT_STAT_SOFT:
s = &bcm_sysport_gstrings_stats[i];
switch (s->type) {
case BCM_SYSPORT_STAT_NETDEV:
+ case BCM_SYSPORT_STAT_NETDEV64:
case BCM_SYSPORT_STAT_SOFT:
continue;
case BCM_SYSPORT_STAT_MIB_RX:
struct ethtool_stats *stats, u64 *data)
{
struct bcm_sysport_priv *priv = netdev_priv(dev);
+ struct bcm_sysport_stats64 *stats64 = &priv->stats64;
+ struct u64_stats_sync *syncp = &priv->syncp;
struct bcm_sysport_tx_ring *ring;
+ unsigned int start;
int i, j;
if (netif_running(dev))
s = &bcm_sysport_gstrings_stats[i];
if (s->type == BCM_SYSPORT_STAT_NETDEV)
p = (char *)&dev->stats;
+ else if (s->type == BCM_SYSPORT_STAT_NETDEV64)
+ p = (char *)stats64;
else
p = (char *)priv;
if (priv->is_lite && !bcm_sysport_lite_stat_valid(s->type))
continue;
-
p += s->stat_offset;
- data[j] = *(unsigned long *)p;
+
+ if (s->stat_sizeof == sizeof(u64))
+ do {
+ start = u64_stats_fetch_begin_irq(syncp);
+ data[i] = *(u64 *)p;
+ } while (u64_stats_fetch_retry_irq(syncp, start));
+ else
+ data[i] = *(u32 *)p;
j++;
}
static void bcm_sysport_free_cb(struct bcm_sysport_cb *cb)
{
- dev_kfree_skb_any(cb->skb);
+ dev_consume_skb_any(cb->skb);
cb->skb = NULL;
dma_unmap_addr_set(cb, dma_addr, 0);
}
static unsigned int bcm_sysport_desc_rx(struct bcm_sysport_priv *priv,
unsigned int budget)
{
+ struct bcm_sysport_stats64 *stats64 = &priv->stats64;
struct net_device *ndev = priv->netdev;
unsigned int processed = 0, to_process;
struct bcm_sysport_cb *cb;
skb->protocol = eth_type_trans(skb, ndev);
ndev->stats.rx_packets++;
ndev->stats.rx_bytes += len;
+ u64_stats_update_begin(&priv->syncp);
+ stats64->rx_packets++;
+ stats64->rx_bytes += len;
+ u64_stats_update_end(&priv->syncp);
napi_gro_receive(&priv->napi, skb);
next:
struct device *kdev = &priv->pdev->dev;
if (cb->skb) {
- ring->bytes += cb->skb->len;
*bytes_compl += cb->skb->len;
dma_unmap_single(kdev, dma_unmap_addr(cb, dma_addr),
dma_unmap_len(cb, dma_len),
DMA_TO_DEVICE);
- ring->packets++;
(*pkts_compl)++;
bcm_sysport_free_cb(cb);
/* SKB fragment */
} else if (dma_unmap_addr(cb, dma_addr)) {
- ring->bytes += dma_unmap_len(cb, dma_len);
+ *bytes_compl += dma_unmap_len(cb, dma_len);
dma_unmap_page(kdev, dma_unmap_addr(cb, dma_addr),
dma_unmap_len(cb, dma_len), DMA_TO_DEVICE);
dma_unmap_addr_set(cb, dma_addr, 0);
static unsigned int __bcm_sysport_tx_reclaim(struct bcm_sysport_priv *priv,
struct bcm_sysport_tx_ring *ring)
{
- struct net_device *ndev = priv->netdev;
unsigned int c_index, last_c_index, last_tx_cn, num_tx_cbs;
unsigned int pkts_compl = 0, bytes_compl = 0;
+ struct net_device *ndev = priv->netdev;
struct bcm_sysport_cb *cb;
u32 hw_ind;
last_c_index &= (num_tx_cbs - 1);
}
+ u64_stats_update_begin(&priv->syncp);
+ ring->packets += pkts_compl;
+ ring->bytes += bytes_compl;
+ u64_stats_update_end(&priv->syncp);
+
ring->c_index = c_index;
netif_dbg(priv, tx_done, ndev,
ring->cbs = kcalloc(size, sizeof(struct bcm_sysport_cb), GFP_KERNEL);
if (!ring->cbs) {
+ dma_free_coherent(kdev, sizeof(struct dma_desc),
+ ring->desc_cpu, ring->desc_dma);
netif_err(priv, hw, priv->netdev, "CB allocation failed\n");
return -ENOMEM;
}
return 0;
}
-static struct net_device_stats *bcm_sysport_get_nstats(struct net_device *dev)
+static void bcm_sysport_get_stats64(struct net_device *dev,
+ struct rtnl_link_stats64 *stats)
{
struct bcm_sysport_priv *priv = netdev_priv(dev);
- unsigned long tx_bytes = 0, tx_packets = 0;
+ struct bcm_sysport_stats64 *stats64 = &priv->stats64;
struct bcm_sysport_tx_ring *ring;
+ u64 tx_packets = 0, tx_bytes = 0;
+ unsigned int start;
unsigned int q;
+ netdev_stats_to_stats64(stats, &dev->stats);
+
for (q = 0; q < dev->num_tx_queues; q++) {
ring = &priv->tx_rings[q];
- tx_bytes += ring->bytes;
- tx_packets += ring->packets;
+ do {
+ start = u64_stats_fetch_begin_irq(&priv->syncp);
+ tx_bytes = ring->bytes;
+ tx_packets = ring->packets;
+ } while (u64_stats_fetch_retry_irq(&priv->syncp, start));
+
+ stats->tx_bytes += tx_bytes;
+ stats->tx_packets += tx_packets;
}
- dev->stats.tx_bytes = tx_bytes;
- dev->stats.tx_packets = tx_packets;
- return &dev->stats;
+ /* lockless update tx_bytes and tx_packets */
+ u64_stats_update_begin(&priv->syncp);
+ stats64->tx_bytes = stats->tx_bytes;
+ stats64->tx_packets = stats->tx_packets;
+ u64_stats_update_end(&priv->syncp);
+
+ do {
+ start = u64_stats_fetch_begin_irq(&priv->syncp);
+ stats->rx_packets = stats64->rx_packets;
+ stats->rx_bytes = stats64->rx_bytes;
+ } while (u64_stats_fetch_retry_irq(&priv->syncp, start));
}
static void bcm_sysport_netif_start(struct net_device *dev)
reg = rbuf_readl(priv, RBUF_CONTROL);
reg |= RBUF_4B_ALGN | RBUF_RSB_EN;
/* Set a correct RSB format on SYSTEMPORT Lite */
- if (priv->is_lite) {
+ if (priv->is_lite)
reg &= ~RBUF_RSB_SWAP1;
+
+ /* Set a correct RSB format based on host endian */
+ if (!IS_ENABLED(CONFIG_CPU_BIG_ENDIAN))
reg |= RBUF_RSB_SWAP0;
- }
+ else
+ reg &= ~RBUF_RSB_SWAP0;
rbuf_writel(priv, reg, RBUF_CONTROL);
}
#ifdef CONFIG_NET_POLL_CONTROLLER
.ndo_poll_controller = bcm_sysport_poll_controller,
#endif
- .ndo_get_stats = bcm_sysport_get_nstats,
+ .ndo_get_stats64 = bcm_sysport_get_stats64,
};
#define REV_FMT "v%2x.%02x"
/* libphy will adjust the link state accordingly */
netif_carrier_off(dev);
+ u64_stats_init(&priv->syncp);
+
ret = register_netdev(dev);
if (ret) {
dev_err(&pdev->dev, "failed to register net_device\n");
#include <linux/mii.h>
#include <linux/if.h>
#include <linux/if_vlan.h>
+#include <linux/if_bridge.h>
#include <linux/rtc.h>
#include <linux/bpf.h>
#include <net/ip.h>
#include <linux/aer.h>
#include <linux/bitmap.h>
#include <linux/cpu_rmap.h>
+#include <linux/cpumask.h>
+#include <net/pkt_cls.h>
#include "bnxt_hsi.h"
#include "bnxt.h"
#include "bnxt_ethtool.h"
#include "bnxt_dcb.h"
#include "bnxt_xdp.h"
+#include "bnxt_vfr.h"
+#include "bnxt_tc.h"
#define BNXT_TX_TIMEOUT (5 * HZ)
BCM57416_NPAR,
BCM57452,
BCM57454,
+ BCM58802,
+ BCM58808,
NETXTREME_E_VF,
NETXTREME_C_VF,
};
static const struct {
char *name;
} board_info[] = {
- { "Broadcom BCM57301 NetXtreme-C 10Gb Ethernet" },
- { "Broadcom BCM57302 NetXtreme-C 10Gb/25Gb Ethernet" },
- { "Broadcom BCM57304 NetXtreme-C 10Gb/25Gb/40Gb/50Gb Ethernet" },
- { "Broadcom BCM57417 NetXtreme-E Ethernet Partition" },
- { "Broadcom BCM58700 Nitro 1Gb/2.5Gb/10Gb Ethernet" },
- { "Broadcom BCM57311 NetXtreme-C 10Gb Ethernet" },
- { "Broadcom BCM57312 NetXtreme-C 10Gb/25Gb Ethernet" },
- { "Broadcom BCM57402 NetXtreme-E 10Gb Ethernet" },
- { "Broadcom BCM57404 NetXtreme-E 10Gb/25Gb Ethernet" },
- { "Broadcom BCM57406 NetXtreme-E 10GBase-T Ethernet" },
- { "Broadcom BCM57402 NetXtreme-E Ethernet Partition" },
- { "Broadcom BCM57407 NetXtreme-E 10GBase-T Ethernet" },
- { "Broadcom BCM57412 NetXtreme-E 10Gb Ethernet" },
- { "Broadcom BCM57414 NetXtreme-E 10Gb/25Gb Ethernet" },
- { "Broadcom BCM57416 NetXtreme-E 10GBase-T Ethernet" },
- { "Broadcom BCM57417 NetXtreme-E 10GBase-T Ethernet" },
- { "Broadcom BCM57412 NetXtreme-E Ethernet Partition" },
- { "Broadcom BCM57314 NetXtreme-C 10Gb/25Gb/40Gb/50Gb Ethernet" },
- { "Broadcom BCM57417 NetXtreme-E 10Gb/25Gb Ethernet" },
- { "Broadcom BCM57416 NetXtreme-E 10Gb Ethernet" },
- { "Broadcom BCM57404 NetXtreme-E Ethernet Partition" },
- { "Broadcom BCM57406 NetXtreme-E Ethernet Partition" },
- { "Broadcom BCM57407 NetXtreme-E 25Gb Ethernet" },
- { "Broadcom BCM57407 NetXtreme-E Ethernet Partition" },
- { "Broadcom BCM57414 NetXtreme-E Ethernet Partition" },
- { "Broadcom BCM57416 NetXtreme-E Ethernet Partition" },
- { "Broadcom BCM57452 NetXtreme-E 10Gb/25Gb/40Gb/50Gb Ethernet" },
- { "Broadcom BCM57454 NetXtreme-E 10Gb/25Gb/40Gb/50Gb/100Gb Ethernet" },
- { "Broadcom NetXtreme-E Ethernet Virtual Function" },
- { "Broadcom NetXtreme-C Ethernet Virtual Function" },
+ [BCM57301] = { "Broadcom BCM57301 NetXtreme-C 10Gb Ethernet" },
+ [BCM57302] = { "Broadcom BCM57302 NetXtreme-C 10Gb/25Gb Ethernet" },
+ [BCM57304] = { "Broadcom BCM57304 NetXtreme-C 10Gb/25Gb/40Gb/50Gb Ethernet" },
+ [BCM57417_NPAR] = { "Broadcom BCM57417 NetXtreme-E Ethernet Partition" },
+ [BCM58700] = { "Broadcom BCM58700 Nitro 1Gb/2.5Gb/10Gb Ethernet" },
+ [BCM57311] = { "Broadcom BCM57311 NetXtreme-C 10Gb Ethernet" },
+ [BCM57312] = { "Broadcom BCM57312 NetXtreme-C 10Gb/25Gb Ethernet" },
+ [BCM57402] = { "Broadcom BCM57402 NetXtreme-E 10Gb Ethernet" },
+ [BCM57404] = { "Broadcom BCM57404 NetXtreme-E 10Gb/25Gb Ethernet" },
+ [BCM57406] = { "Broadcom BCM57406 NetXtreme-E 10GBase-T Ethernet" },
+ [BCM57402_NPAR] = { "Broadcom BCM57402 NetXtreme-E Ethernet Partition" },
+ [BCM57407] = { "Broadcom BCM57407 NetXtreme-E 10GBase-T Ethernet" },
+ [BCM57412] = { "Broadcom BCM57412 NetXtreme-E 10Gb Ethernet" },
+ [BCM57414] = { "Broadcom BCM57414 NetXtreme-E 10Gb/25Gb Ethernet" },
+ [BCM57416] = { "Broadcom BCM57416 NetXtreme-E 10GBase-T Ethernet" },
+ [BCM57417] = { "Broadcom BCM57417 NetXtreme-E 10GBase-T Ethernet" },
+ [BCM57412_NPAR] = { "Broadcom BCM57412 NetXtreme-E Ethernet Partition" },
+ [BCM57314] = { "Broadcom BCM57314 NetXtreme-C 10Gb/25Gb/40Gb/50Gb Ethernet" },
+ [BCM57417_SFP] = { "Broadcom BCM57417 NetXtreme-E 10Gb/25Gb Ethernet" },
+ [BCM57416_SFP] = { "Broadcom BCM57416 NetXtreme-E 10Gb Ethernet" },
+ [BCM57404_NPAR] = { "Broadcom BCM57404 NetXtreme-E Ethernet Partition" },
+ [BCM57406_NPAR] = { "Broadcom BCM57406 NetXtreme-E Ethernet Partition" },
+ [BCM57407_SFP] = { "Broadcom BCM57407 NetXtreme-E 25Gb Ethernet" },
+ [BCM57407_NPAR] = { "Broadcom BCM57407 NetXtreme-E Ethernet Partition" },
+ [BCM57414_NPAR] = { "Broadcom BCM57414 NetXtreme-E Ethernet Partition" },
+ [BCM57416_NPAR] = { "Broadcom BCM57416 NetXtreme-E Ethernet Partition" },
+ [BCM57452] = { "Broadcom BCM57452 NetXtreme-E 10Gb/25Gb/40Gb/50Gb Ethernet" },
+ [BCM57454] = { "Broadcom BCM57454 NetXtreme-E 10Gb/25Gb/40Gb/50Gb/100Gb Ethernet" },
+ [BCM58802] = { "Broadcom BCM58802 NetXtreme-S 10Gb/25Gb/40Gb/50Gb Ethernet" },
+ [BCM58808] = { "Broadcom BCM58808 NetXtreme-S 10Gb/25Gb/40Gb/50Gb/100Gb Ethernet" },
+ [NETXTREME_E_VF] = { "Broadcom NetXtreme-E Ethernet Virtual Function" },
+ [NETXTREME_C_VF] = { "Broadcom NetXtreme-C Ethernet Virtual Function" },
};
static const struct pci_device_id bnxt_pci_tbl[] = {
+ { PCI_VDEVICE(BROADCOM, 0x1614), .driver_data = BCM57454 },
{ PCI_VDEVICE(BROADCOM, 0x16c0), .driver_data = BCM57417_NPAR },
{ PCI_VDEVICE(BROADCOM, 0x16c8), .driver_data = BCM57301 },
{ PCI_VDEVICE(BROADCOM, 0x16c9), .driver_data = BCM57302 },
{ PCI_VDEVICE(BROADCOM, 0x16ed), .driver_data = BCM57414_NPAR },
{ PCI_VDEVICE(BROADCOM, 0x16ee), .driver_data = BCM57416_NPAR },
{ PCI_VDEVICE(BROADCOM, 0x16ef), .driver_data = BCM57416_NPAR },
+ { PCI_VDEVICE(BROADCOM, 0x16f0), .driver_data = BCM58808 },
{ PCI_VDEVICE(BROADCOM, 0x16f1), .driver_data = BCM57452 },
- { PCI_VDEVICE(BROADCOM, 0x1614), .driver_data = BCM57454 },
+ { PCI_VDEVICE(BROADCOM, 0xd802), .driver_data = BCM58802 },
#ifdef CONFIG_BNXT_SRIOV
{ PCI_VDEVICE(BROADCOM, 0x1606), .driver_data = NETXTREME_E_VF },
{ PCI_VDEVICE(BROADCOM, 0x1609), .driver_data = NETXTREME_E_VF },
TX_BD_FLAGS_LHINT_2048_AND_LARGER,
};
+static u16 bnxt_xmit_get_cfa_action(struct sk_buff *skb)
+{
+ struct metadata_dst *md_dst = skb_metadata_dst(skb);
+
+ if (!md_dst || md_dst->type != METADATA_HW_PORT_MUX)
+ return 0;
+
+ return md_dst->u.port_info.port_id;
+}
+
static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct bnxt *bp = netdev_priv(dev);
tx_buf->nr_frags = last_frag;
vlan_tag_flags = 0;
- cfa_action = 0;
+ cfa_action = bnxt_xmit_get_cfa_action(skb);
if (skb_vlan_tag_present(skb)) {
vlan_tag_flags = TX_BD_CFA_META_KEY_VLAN |
skb_vlan_tag_get(skb);
tx_push1->tx_bd_hsize_lflags = 0;
tx_push1->tx_bd_cfa_meta = cpu_to_le32(vlan_tag_flags);
- tx_push1->tx_bd_cfa_action = cpu_to_le32(cfa_action);
+ tx_push1->tx_bd_cfa_action =
+ cpu_to_le32(cfa_action << TX_BD_CFA_ACTION_SHIFT);
end = pdata + length;
end = PTR_ALIGN(end, 8) - 1;
txbd->tx_bd_len_flags_type = cpu_to_le32(flags);
txbd1->tx_bd_cfa_meta = cpu_to_le32(vlan_tag_flags);
- txbd1->tx_bd_cfa_action = cpu_to_le32(cfa_action);
+ txbd1->tx_bd_cfa_action =
+ cpu_to_le32(cfa_action << TX_BD_CFA_ACTION_SHIFT);
for (i = 0; i < last_frag; i++) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
bnxt_sched_reset(bp, rxr);
return;
}
-
+ /* Store cfa_code in tpa_info to use in tpa_end
+ * completion processing.
+ */
+ tpa_info->cfa_code = TPA_START_CFA_CODE(tpa_start1);
prod_rx_buf->data = tpa_info->data;
prod_rx_buf->data_ptr = tpa_info->data_ptr;
return skb;
}
+/* Given the cfa_code of a received packet determine which
+ * netdev (vf-rep or PF) the packet is destined to.
+ */
+static struct net_device *bnxt_get_pkt_dev(struct bnxt *bp, u16 cfa_code)
+{
+ struct net_device *dev = bnxt_get_vf_rep(bp, cfa_code);
+
+ /* if vf-rep dev is NULL, the must belongs to the PF */
+ return dev ? dev : bp->dev;
+}
+
static inline struct sk_buff *bnxt_tpa_end(struct bnxt *bp,
struct bnxt_napi *bnapi,
u32 *raw_cons,
return NULL;
}
}
- skb->protocol = eth_type_trans(skb, bp->dev);
+
+ skb->protocol =
+ eth_type_trans(skb, bnxt_get_pkt_dev(bp, tpa_info->cfa_code));
if (tpa_info->hash_type != PKT_HASH_TYPE_NONE)
skb_set_hash(skb, tpa_info->rss_hash, tpa_info->hash_type);
return skb;
}
+static void bnxt_deliver_skb(struct bnxt *bp, struct bnxt_napi *bnapi,
+ struct sk_buff *skb)
+{
+ if (skb->dev != bp->dev) {
+ /* this packet belongs to a vf-rep */
+ bnxt_vf_rep_rx(bp, skb);
+ return;
+ }
+ skb_record_rx_queue(skb, bnapi->index);
+ napi_gro_receive(&bnapi->napi, skb);
+}
+
/* returns the following:
* 1 - 1 packet successfully received
* 0 - successful TPA_START, packet not completed yet
struct rx_cmp *rxcmp;
struct rx_cmp_ext *rxcmp1;
u32 tmp_raw_cons = *raw_cons;
- u16 cons, prod, cp_cons = RING_CMP(tmp_raw_cons);
+ u16 cfa_code, cons, prod, cp_cons = RING_CMP(tmp_raw_cons);
struct bnxt_sw_rx_bd *rx_buf;
unsigned int len;
u8 *data_ptr, agg_bufs, cmp_type;
rc = -ENOMEM;
if (likely(skb)) {
- skb_record_rx_queue(skb, bnapi->index);
- napi_gro_receive(&bnapi->napi, skb);
+ bnxt_deliver_skb(bp, bnapi, skb);
rc = 1;
}
*event |= BNXT_RX_EVENT;
skb_set_hash(skb, le32_to_cpu(rxcmp->rx_cmp_rss_hash), type);
}
- skb->protocol = eth_type_trans(skb, dev);
+ cfa_code = RX_CMP_CFA_CODE(rxcmp1);
+ skb->protocol = eth_type_trans(skb, bnxt_get_pkt_dev(bp, cfa_code));
if ((rxcmp1->rx_cmp_flags2 &
cpu_to_le32(RX_CMP_FLAGS2_META_FORMAT_VLAN)) &&
}
}
- skb_record_rx_queue(skb, bnapi->index);
- napi_gro_receive(&bnapi->napi, skb);
+ bnxt_deliver_skb(bp, bnapi, skb);
rc = 1;
next_rx:
&event);
if (likely(rc >= 0))
rx_pkts += rc;
+ /* Increment rx_pkts when rc is -ENOMEM to count towards
+ * the NAPI budget. Otherwise, we may potentially loop
+ * here forever if we consistently cannot allocate
+ * buffers.
+ */
+ else if (rc == -ENOMEM)
+ rx_pkts++;
else if (rc == -EBUSY) /* partial completion */
break;
} else if (unlikely((TX_CMP_TYPE(txcmp) ==
mutex_lock(&bp->hwrm_cmd_lock);
rc = __bnxt_hwrm_get_tx_rings(bp, 0xffff, tx_rings);
mutex_unlock(&bp->hwrm_cmd_lock);
+ if (!rc)
+ bp->tx_reserved_rings = *tx_rings;
return rc;
}
+static int bnxt_hwrm_check_tx_rings(struct bnxt *bp, int tx_rings)
+{
+ struct hwrm_func_cfg_input req = {0};
+ int rc;
+
+ if (bp->hwrm_spec_code < 0x10801)
+ return 0;
+
+ if (BNXT_VF(bp))
+ return 0;
+
+ bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_CFG, -1, -1);
+ req.fid = cpu_to_le16(0xffff);
+ req.flags = cpu_to_le32(FUNC_CFG_REQ_FLAGS_TX_ASSETS_TEST);
+ req.enables = cpu_to_le32(FUNC_CFG_REQ_ENABLES_NUM_TX_RINGS);
+ req.num_tx_rings = cpu_to_le16(tx_rings);
+ rc = hwrm_send_message_silent(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
+ if (rc)
+ return -ENOMEM;
+ return 0;
+}
+
static void bnxt_hwrm_set_coal_params(struct bnxt *bp, u32 max_bufs,
u32 buf_tmrs, u16 flags,
struct hwrm_ring_cmpl_ring_cfg_aggint_params_input *req)
{
struct hwrm_func_qcfg_input req = {0};
struct hwrm_func_qcfg_output *resp = bp->hwrm_cmd_resp_addr;
+ u16 flags;
int rc;
bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_QCFG, -1, -1);
vf->vlan = le16_to_cpu(resp->vlan) & VLAN_VID_MASK;
}
#endif
- if (BNXT_PF(bp)) {
- u16 flags = le16_to_cpu(resp->flags);
-
- if (flags & (FUNC_QCFG_RESP_FLAGS_FW_DCBX_AGENT_ENABLED |
- FUNC_QCFG_RESP_FLAGS_FW_LLDP_AGENT_ENABLED))
- bp->flags |= BNXT_FLAG_FW_LLDP_AGENT;
- if (flags & FUNC_QCFG_RESP_FLAGS_MULTI_HOST)
- bp->flags |= BNXT_FLAG_MULTI_HOST;
+ flags = le16_to_cpu(resp->flags);
+ if (flags & (FUNC_QCFG_RESP_FLAGS_FW_DCBX_AGENT_ENABLED |
+ FUNC_QCFG_RESP_FLAGS_FW_LLDP_AGENT_ENABLED)) {
+ bp->flags |= BNXT_FLAG_FW_LLDP_AGENT;
+ if (flags & FUNC_QCFG_RESP_FLAGS_FW_DCBX_AGENT_ENABLED)
+ bp->flags |= BNXT_FLAG_FW_DCBX_AGENT;
}
+ if (BNXT_PF(bp) && (flags & FUNC_QCFG_RESP_FLAGS_MULTI_HOST))
+ bp->flags |= BNXT_FLAG_MULTI_HOST;
switch (resp->port_partition_type) {
case FUNC_QCFG_RESP_PORT_PARTITION_TYPE_NPAR1_0:
bp->port_partition_type = resp->port_partition_type;
break;
}
+ if (bp->hwrm_spec_code < 0x10707 ||
+ resp->evb_mode == FUNC_QCFG_RESP_EVB_MODE_VEB)
+ bp->br_mode = BRIDGE_MODE_VEB;
+ else if (resp->evb_mode == FUNC_QCFG_RESP_EVB_MODE_VEPA)
+ bp->br_mode = BRIDGE_MODE_VEPA;
+ else
+ bp->br_mode = BRIDGE_MODE_UNDEF;
func_qcfg_exit:
mutex_unlock(&bp->hwrm_cmd_lock);
pf->port_id = le16_to_cpu(resp->port_id);
bp->dev->dev_port = pf->port_id;
memcpy(pf->mac_addr, resp->mac_address, ETH_ALEN);
- memcpy(bp->dev->dev_addr, pf->mac_addr, ETH_ALEN);
pf->max_rsscos_ctxs = le16_to_cpu(resp->max_rsscos_ctx);
pf->max_cp_rings = le16_to_cpu(resp->max_cmpl_rings);
pf->max_tx_rings = le16_to_cpu(resp->max_tx_rings);
vf->max_stat_ctxs = le16_to_cpu(resp->max_stat_ctx);
memcpy(vf->mac_addr, resp->mac_address, ETH_ALEN);
- mutex_unlock(&bp->hwrm_cmd_lock);
-
- if (is_valid_ether_addr(vf->mac_addr)) {
- /* overwrite netdev dev_adr with admin VF MAC */
- memcpy(bp->dev->dev_addr, vf->mac_addr, ETH_ALEN);
- } else {
- eth_hw_addr_random(bp->dev);
- rc = bnxt_approve_mac(bp, bp->dev->dev_addr);
- }
- return rc;
#endif
}
}
}
+static int bnxt_hwrm_set_br_mode(struct bnxt *bp, u16 br_mode)
+{
+ struct hwrm_func_cfg_input req = {0};
+ int rc;
+
+ bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_CFG, -1, -1);
+ req.fid = cpu_to_le16(0xffff);
+ req.enables = cpu_to_le32(FUNC_CFG_REQ_ENABLES_EVB_MODE);
+ if (br_mode == BRIDGE_MODE_VEB)
+ req.evb_mode = FUNC_CFG_REQ_EVB_MODE_VEB;
+ else if (br_mode == BRIDGE_MODE_VEPA)
+ req.evb_mode = FUNC_CFG_REQ_EVB_MODE_VEPA;
+ else
+ return -EINVAL;
+ rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
+ if (rc)
+ rc = -EIO;
+ return rc;
+}
+
static int bnxt_setup_vnic(struct bnxt *bp, u16 vnic_id)
{
struct bnxt_vnic_info *vnic = &bp->vnic_info[vnic_id];
rc);
goto err_out;
}
+ if (bp->tx_reserved_rings != bp->tx_nr_rings) {
+ int tx = bp->tx_nr_rings;
+
+ if (bnxt_hwrm_reserve_tx_rings(bp, &tx) ||
+ tx < bp->tx_nr_rings) {
+ rc = -ENOMEM;
+ goto err_out;
+ }
+ }
}
rc = bnxt_hwrm_ring_alloc(bp);
for (i = 0; i < bp->cp_nr_rings; i++) {
irq = &bp->irq_tbl[i];
- if (irq->requested)
+ if (irq->requested) {
+ if (irq->have_cpumask) {
+ irq_set_affinity_hint(irq->vector, NULL);
+ free_cpumask_var(irq->cpu_mask);
+ irq->have_cpumask = 0;
+ }
free_irq(irq->vector, bp->bnapi[i]);
+ }
+
irq->requested = 0;
}
}
break;
irq->requested = 1;
+
+ if (zalloc_cpumask_var(&irq->cpu_mask, GFP_KERNEL)) {
+ int numa_node = dev_to_node(&bp->pdev->dev);
+
+ irq->have_cpumask = 1;
+ cpumask_set_cpu(cpumask_local_spread(i, numa_node),
+ irq->cpu_mask);
+ rc = irq_set_affinity_hint(irq->vector, irq->cpu_mask);
+ if (rc) {
+ netdev_warn(bp->dev,
+ "Set affinity failed, IRQ = %d\n",
+ irq->vector);
+ break;
+ }
+ }
}
return rc;
}
{
int i;
struct bnxt_tx_ring_info *txr;
- struct netdev_queue *txq;
if (bp->tx_ring) {
for (i = 0; i < bp->tx_nr_rings; i++) {
txr = &bp->tx_ring[i];
- txq = netdev_get_tx_queue(bp->dev, i);
txr->dev_state = BNXT_DEV_STATE_CLOSING;
}
}
{
int i;
struct bnxt_tx_ring_info *txr;
- struct netdev_queue *txq;
for (i = 0; i < bp->tx_nr_rings; i++) {
txr = &bp->tx_ring[i];
- txq = netdev_get_tx_queue(bp->dev, i);
txr->dev_state = 0;
}
netif_tx_wake_all_queues(bp->dev);
if (rc)
goto hwrm_phy_qcaps_exit;
- if (resp->eee_supported & PORT_PHY_QCAPS_RESP_EEE_SUPPORTED) {
+ if (resp->flags & PORT_PHY_QCAPS_RESP_FLAGS_EEE_SUPPORTED) {
struct ethtool_eee *eee = &bp->eee;
u16 fw_speeds = le16_to_cpu(resp->supported_speeds_eee_mode);
link_info->support_auto_speeds =
le16_to_cpu(resp->supported_speeds_auto_mode);
+ bp->port_count = resp->port_cnt;
+
hwrm_phy_qcaps_exit:
mutex_unlock(&bp->hwrm_cmd_lock);
return rc;
memcpy(&link_info->phy_qcfg_resp, resp, sizeof(*resp));
link_info->phy_link_status = resp->link;
- link_info->duplex = resp->duplex;
+ link_info->duplex = resp->duplex_cfg;
+ if (bp->hwrm_spec_code >= 0x10800)
+ link_info->duplex = resp->duplex_state;
link_info->pause = resp->pause;
link_info->auto_mode = resp->auto_mode;
link_info->auto_pause_setting = resp->auto_pause;
link_info->lp_pause = resp->link_partner_adv_pause;
link_info->force_pause_setting = resp->force_pause;
- link_info->duplex_setting = resp->duplex;
+ link_info->duplex_setting = resp->duplex_cfg;
if (link_info->phy_link_status == BNXT_LINK_LINK)
link_info->link_speed = le16_to_cpu(resp->link_speed);
else
/* Poll link status and check for SFP+ module status */
bnxt_get_port_module_status(bp);
+ /* VF-reps may need to be re-opened after the PF is re-opened */
+ if (BNXT_PF(bp))
+ bnxt_vf_reps_open(bp);
return 0;
open_err:
if (rc)
netdev_warn(bp->dev, "timeout waiting for SRIOV config operation to complete!\n");
}
+
+ /* Close the VF-reps before closing PF */
+ if (BNXT_PF(bp))
+ bnxt_vf_reps_close(bp);
#endif
/* Change device state to avoid TX queue wake up's */
bnxt_tx_disable(bp);
if (atomic_read(&bp->intr_sem) != 0)
goto bnxt_restart_timer;
- if (bp->link_info.link_up && (bp->flags & BNXT_FLAG_PORT_STATS)) {
+ if (bp->link_info.link_up && (bp->flags & BNXT_FLAG_PORT_STATS) &&
+ bp->stats_coal_ticks) {
set_bit(BNXT_PERIODIC_STATS_SP_EVENT, &bp->sp_event);
schedule_work(&bp->sp_task);
}
}
/* Under rtnl_lock */
-int bnxt_reserve_rings(struct bnxt *bp, int tx, int rx, bool sh, int tcs,
- int tx_xdp)
+int bnxt_check_rings(struct bnxt *bp, int tx, int rx, bool sh, int tcs,
+ int tx_xdp)
{
int max_rx, max_tx, tx_sets = 1;
int tx_rings_needed;
if (max_tx < tx_rings_needed)
return -ENOMEM;
- if (bnxt_hwrm_reserve_tx_rings(bp, &tx_rings_needed) ||
- tx_rings_needed < (tx * tx_sets + tx_xdp))
- return -ENOMEM;
- return 0;
+ return bnxt_hwrm_check_tx_rings(bp, tx_rings_needed);
}
static void bnxt_unmap_bars(struct bnxt *bp, struct pci_dev *pdev)
if (bp->flags & BNXT_FLAG_SHARED_RINGS)
sh = true;
- rc = bnxt_reserve_rings(bp, bp->tx_nr_rings_per_tc, bp->rx_nr_rings,
- sh, tc, bp->tx_nr_rings_xdp);
+ rc = bnxt_check_rings(bp, bp->tx_nr_rings_per_tc, bp->rx_nr_rings,
+ sh, tc, bp->tx_nr_rings_xdp);
if (rc)
return rc;
bp->tx_nr_rings = bp->tx_nr_rings_per_tc;
netdev_reset_tc(dev);
}
+ bp->tx_nr_rings += bp->tx_nr_rings_xdp;
bp->cp_nr_rings = sh ? max_t(int, bp->tx_nr_rings, bp->rx_nr_rings) :
bp->tx_nr_rings + bp->rx_nr_rings;
bp->num_stat_ctxs = bp->cp_nr_rings;
return 0;
}
-static int bnxt_setup_tc(struct net_device *dev, u32 handle, u32 chain_index,
- __be16 proto, struct tc_to_netdev *ntc)
+static int bnxt_setup_flower(struct net_device *dev,
+ struct tc_cls_flower_offload *cls_flower)
{
- if (ntc->type != TC_SETUP_MQPRIO)
- return -EINVAL;
+ struct bnxt *bp = netdev_priv(dev);
- ntc->mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS;
+ if (BNXT_VF(bp))
+ return -EOPNOTSUPP;
- return bnxt_setup_mq_tc(dev, ntc->mqprio->num_tc);
+ return bnxt_tc_setup_flower(bp, bp->pf.fw_fid, cls_flower);
+}
+
+static int bnxt_setup_tc(struct net_device *dev, enum tc_setup_type type,
+ void *type_data)
+{
+ switch (type) {
+ case TC_SETUP_CLSFLOWER:
+ return bnxt_setup_flower(dev, type_data);
+ case TC_SETUP_MQPRIO: {
+ struct tc_mqprio_qopt *mqprio = type_data;
+
+ mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS;
+
+ return bnxt_setup_mq_tc(dev, mqprio->num_tc);
+ }
+ default:
+ return -EOPNOTSUPP;
+ }
}
#ifdef CONFIG_RFS_ACCEL
schedule_work(&bp->sp_task);
}
+static int bnxt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
+ struct net_device *dev, u32 filter_mask,
+ int nlflags)
+{
+ struct bnxt *bp = netdev_priv(dev);
+
+ return ndo_dflt_bridge_getlink(skb, pid, seq, dev, bp->br_mode, 0, 0,
+ nlflags, filter_mask, NULL);
+}
+
+static int bnxt_bridge_setlink(struct net_device *dev, struct nlmsghdr *nlh,
+ u16 flags)
+{
+ struct bnxt *bp = netdev_priv(dev);
+ struct nlattr *attr, *br_spec;
+ int rem, rc = 0;
+
+ if (bp->hwrm_spec_code < 0x10708 || !BNXT_SINGLE_PF(bp))
+ return -EOPNOTSUPP;
+
+ br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC);
+ if (!br_spec)
+ return -EINVAL;
+
+ nla_for_each_nested(attr, br_spec, rem) {
+ u16 mode;
+
+ if (nla_type(attr) != IFLA_BRIDGE_MODE)
+ continue;
+
+ if (nla_len(attr) < sizeof(mode))
+ return -EINVAL;
+
+ mode = nla_get_u16(attr);
+ if (mode == bp->br_mode)
+ break;
+
+ rc = bnxt_hwrm_set_br_mode(bp, mode);
+ if (!rc)
+ bp->br_mode = mode;
+ break;
+ }
+ return rc;
+}
+
+static int bnxt_get_phys_port_name(struct net_device *dev, char *buf,
+ size_t len)
+{
+ struct bnxt *bp = netdev_priv(dev);
+ int rc;
+
+ /* The PF and it's VF-reps only support the switchdev framework */
+ if (!BNXT_PF(bp))
+ return -EOPNOTSUPP;
+
+ rc = snprintf(buf, len, "p%d", bp->pf.port_id);
+
+ if (rc >= len)
+ return -EOPNOTSUPP;
+ return 0;
+}
+
+int bnxt_port_attr_get(struct bnxt *bp, struct switchdev_attr *attr)
+{
+ if (bp->eswitch_mode != DEVLINK_ESWITCH_MODE_SWITCHDEV)
+ return -EOPNOTSUPP;
+
+ /* The PF and it's VF-reps only support the switchdev framework */
+ if (!BNXT_PF(bp))
+ return -EOPNOTSUPP;
+
+ switch (attr->id) {
+ case SWITCHDEV_ATTR_ID_PORT_PARENT_ID:
+ /* In SRIOV each PF-pool (PF + child VFs) serves as a
+ * switching domain, the PF's perm mac-addr can be used
+ * as the unique parent-id
+ */
+ attr->u.ppid.id_len = ETH_ALEN;
+ ether_addr_copy(attr->u.ppid.id, bp->pf.mac_addr);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+ return 0;
+}
+
+static int bnxt_swdev_port_attr_get(struct net_device *dev,
+ struct switchdev_attr *attr)
+{
+ return bnxt_port_attr_get(netdev_priv(dev), attr);
+}
+
+static const struct switchdev_ops bnxt_switchdev_ops = {
+ .switchdev_port_attr_get = bnxt_swdev_port_attr_get
+};
+
static const struct net_device_ops bnxt_netdev_ops = {
.ndo_open = bnxt_open,
.ndo_start_xmit = bnxt_start_xmit,
.ndo_udp_tunnel_add = bnxt_udp_tunnel_add,
.ndo_udp_tunnel_del = bnxt_udp_tunnel_del,
.ndo_xdp = bnxt_xdp,
+ .ndo_bridge_getlink = bnxt_bridge_getlink,
+ .ndo_bridge_setlink = bnxt_bridge_setlink,
+ .ndo_get_phys_port_name = bnxt_get_phys_port_name
};
static void bnxt_remove_one(struct pci_dev *pdev)
struct net_device *dev = pci_get_drvdata(pdev);
struct bnxt *bp = netdev_priv(dev);
- if (BNXT_PF(bp))
+ if (BNXT_PF(bp)) {
bnxt_sriov_disable(bp);
+ bnxt_dl_unregister(bp);
+ }
pci_disable_pcie_error_reporting(pdev);
unregister_netdev(dev);
+ bnxt_shutdown_tc(bp);
cancel_work_sync(&bp->sp_task);
bp->sp_event = 0;
if (sh)
bp->flags |= BNXT_FLAG_SHARED_RINGS;
dflt_rings = netif_get_num_default_rss_queues();
+ /* Reduce default rings to reduce memory usage on multi-port cards */
+ if (bp->port_count > 1)
+ dflt_rings = min_t(int, dflt_rings, 4);
rc = bnxt_get_dflt_rings(bp, &max_rx_rings, &max_tx_rings, sh);
if (rc)
return rc;
bnxt_subtract_ulp_resources(bp, BNXT_ROCE_ULP);
}
+ static int bnxt_init_mac_addr(struct bnxt *bp)
+ {
+ int rc = 0;
+
+ if (BNXT_PF(bp)) {
+ memcpy(bp->dev->dev_addr, bp->pf.mac_addr, ETH_ALEN);
+ } else {
+ #ifdef CONFIG_BNXT_SRIOV
+ struct bnxt_vf_info *vf = &bp->vf;
+
+ if (is_valid_ether_addr(vf->mac_addr)) {
+ /* overwrite netdev dev_adr with admin VF MAC */
+ memcpy(bp->dev->dev_addr, vf->mac_addr, ETH_ALEN);
+ } else {
+ eth_hw_addr_random(bp->dev);
+ rc = bnxt_approve_mac(bp, bp->dev->dev_addr);
+ }
+ #endif
+ }
+ return rc;
+ }
+
static void bnxt_parse_log_pcie_link(struct bnxt *bp)
{
enum pcie_link_width width = PCIE_LNK_WIDTH_UNKNOWN;
dev->netdev_ops = &bnxt_netdev_ops;
dev->watchdog_timeo = BNXT_TX_TIMEOUT;
dev->ethtool_ops = &bnxt_ethtool_ops;
+ SWITCHDEV_SET_OPS(dev, &bnxt_switchdev_ops);
pci_set_drvdata(pdev, dev);
rc = bnxt_alloc_hwrm_resources(bp);
#ifdef CONFIG_BNXT_SRIOV
init_waitqueue_head(&bp->sriov_cfg_wait);
+ mutex_init(&bp->sriov_lock);
#endif
bp->gro_func = bnxt_gro_func_5730x;
if (BNXT_CHIP_P4_PLUS(bp))
rc = -1;
goto init_err_pci_clean;
}
-
+ rc = bnxt_init_mac_addr(bp);
+ if (rc) {
+ dev_err(&pdev->dev, "Unable to initialize mac address.\n");
+ rc = -EADDRNOTAVAIL;
+ goto init_err_pci_clean;
+ }
rc = bnxt_hwrm_queue_qportcfg(bp);
if (rc) {
netdev_err(bp->dev, "hwrm query qportcfg failure rc: %x\n",
bnxt_ethtool_init(bp);
bnxt_dcb_init(bp);
+ rc = bnxt_probe_phy(bp);
+ if (rc)
+ goto init_err_pci_clean;
+
bnxt_set_rx_skb_mode(bp, false);
bnxt_set_tpa_flags(bp);
bnxt_set_ring_params(bp);
if (dev->hw_features & NETIF_F_HW_VLAN_CTAG_RX)
bp->flags |= BNXT_FLAG_STRIP_VLAN;
- rc = bnxt_probe_phy(bp);
- if (rc)
- goto init_err_pci_clean;
-
rc = bnxt_init_int_mode(bp);
if (rc)
goto init_err_pci_clean;
else
device_set_wakeup_capable(&pdev->dev, false);
+ if (BNXT_PF(bp))
+ bnxt_init_tc(bp);
+
rc = register_netdev(dev);
if (rc)
- goto init_err_clr_int;
+ goto init_err_cleanup_tc;
+
+ if (BNXT_PF(bp))
+ bnxt_dl_register(bp);
netdev_info(dev, "%s found at mem %lx, node addr %pM\n",
board_info[ent->driver_data].name,
return 0;
-init_err_clr_int:
+init_err_cleanup_tc:
+ bnxt_shutdown_tc(bp);
bnxt_clear_int_mode(bp);
init_err_pci_clean:
#define GENET_RDMA_REG_OFF (priv->hw_params->rdma_offset + \
TOTAL_DESC * DMA_DESC_SIZE)
+static inline void bcmgenet_writel(u32 value, void __iomem *offset)
+{
+ /* MIPS chips strapped for BE will automagically configure the
+ * peripheral registers for CPU-native byte order.
+ */
+ if (IS_ENABLED(CONFIG_MIPS) && IS_ENABLED(CONFIG_CPU_BIG_ENDIAN))
+ __raw_writel(value, offset);
+ else
+ writel_relaxed(value, offset);
+}
+
+static inline u32 bcmgenet_readl(void __iomem *offset)
+{
+ if (IS_ENABLED(CONFIG_MIPS) && IS_ENABLED(CONFIG_CPU_BIG_ENDIAN))
+ return __raw_readl(offset);
+ else
+ return readl_relaxed(offset);
+}
+
static inline void dmadesc_set_length_status(struct bcmgenet_priv *priv,
void __iomem *d, u32 value)
{
- __raw_writel(value, d + DMA_DESC_LENGTH_STATUS);
+ bcmgenet_writel(value, d + DMA_DESC_LENGTH_STATUS);
}
static inline u32 dmadesc_get_length_status(struct bcmgenet_priv *priv,
void __iomem *d)
{
- return __raw_readl(d + DMA_DESC_LENGTH_STATUS);
+ return bcmgenet_readl(d + DMA_DESC_LENGTH_STATUS);
}
static inline void dmadesc_set_addr(struct bcmgenet_priv *priv,
void __iomem *d,
dma_addr_t addr)
{
- __raw_writel(lower_32_bits(addr), d + DMA_DESC_ADDRESS_LO);
+ bcmgenet_writel(lower_32_bits(addr), d + DMA_DESC_ADDRESS_LO);
/* Register writes to GISB bus can take couple hundred nanoseconds
* and are done for each packet, save these expensive writes unless
*/
#ifdef CONFIG_PHYS_ADDR_T_64BIT
if (priv->hw_params->flags & GENET_HAS_40BITS)
- __raw_writel(upper_32_bits(addr), d + DMA_DESC_ADDRESS_HI);
+ bcmgenet_writel(upper_32_bits(addr), d + DMA_DESC_ADDRESS_HI);
#endif
}
{
dma_addr_t addr;
- addr = __raw_readl(d + DMA_DESC_ADDRESS_LO);
+ addr = bcmgenet_readl(d + DMA_DESC_ADDRESS_LO);
/* Register writes to GISB bus can take couple hundred nanoseconds
* and are done for each packet, save these expensive writes unless
*/
#ifdef CONFIG_PHYS_ADDR_T_64BIT
if (priv->hw_params->flags & GENET_HAS_40BITS)
- addr |= (u64)__raw_readl(d + DMA_DESC_ADDRESS_HI) << 32;
+ addr |= (u64)bcmgenet_readl(d + DMA_DESC_ADDRESS_HI) << 32;
#endif
return addr;
}
if (GENET_IS_V1(priv))
return bcmgenet_rbuf_readl(priv, TBUF_CTRL_V1);
else
- return __raw_readl(priv->base +
- priv->hw_params->tbuf_offset + TBUF_CTRL);
+ return bcmgenet_readl(priv->base +
+ priv->hw_params->tbuf_offset + TBUF_CTRL);
}
static inline void bcmgenet_tbuf_ctrl_set(struct bcmgenet_priv *priv, u32 val)
if (GENET_IS_V1(priv))
bcmgenet_rbuf_writel(priv, val, TBUF_CTRL_V1);
else
- __raw_writel(val, priv->base +
+ bcmgenet_writel(val, priv->base +
priv->hw_params->tbuf_offset + TBUF_CTRL);
}
if (GENET_IS_V1(priv))
return bcmgenet_rbuf_readl(priv, TBUF_BP_MC_V1);
else
- return __raw_readl(priv->base +
- priv->hw_params->tbuf_offset + TBUF_BP_MC);
+ return bcmgenet_readl(priv->base +
+ priv->hw_params->tbuf_offset + TBUF_BP_MC);
}
static inline void bcmgenet_bp_mc_set(struct bcmgenet_priv *priv, u32 val)
if (GENET_IS_V1(priv))
bcmgenet_rbuf_writel(priv, val, TBUF_BP_MC_V1);
else
- __raw_writel(val, priv->base +
+ bcmgenet_writel(val, priv->base +
priv->hw_params->tbuf_offset + TBUF_BP_MC);
}
static inline u32 bcmgenet_tdma_readl(struct bcmgenet_priv *priv,
enum dma_reg r)
{
- return __raw_readl(priv->base + GENET_TDMA_REG_OFF +
- DMA_RINGS_SIZE + bcmgenet_dma_regs[r]);
+ return bcmgenet_readl(priv->base + GENET_TDMA_REG_OFF +
+ DMA_RINGS_SIZE + bcmgenet_dma_regs[r]);
}
static inline void bcmgenet_tdma_writel(struct bcmgenet_priv *priv,
u32 val, enum dma_reg r)
{
- __raw_writel(val, priv->base + GENET_TDMA_REG_OFF +
+ bcmgenet_writel(val, priv->base + GENET_TDMA_REG_OFF +
DMA_RINGS_SIZE + bcmgenet_dma_regs[r]);
}
static inline u32 bcmgenet_rdma_readl(struct bcmgenet_priv *priv,
enum dma_reg r)
{
- return __raw_readl(priv->base + GENET_RDMA_REG_OFF +
- DMA_RINGS_SIZE + bcmgenet_dma_regs[r]);
+ return bcmgenet_readl(priv->base + GENET_RDMA_REG_OFF +
+ DMA_RINGS_SIZE + bcmgenet_dma_regs[r]);
}
static inline void bcmgenet_rdma_writel(struct bcmgenet_priv *priv,
u32 val, enum dma_reg r)
{
- __raw_writel(val, priv->base + GENET_RDMA_REG_OFF +
+ bcmgenet_writel(val, priv->base + GENET_RDMA_REG_OFF +
DMA_RINGS_SIZE + bcmgenet_dma_regs[r]);
}
unsigned int ring,
enum dma_ring_reg r)
{
- return __raw_readl(priv->base + GENET_TDMA_REG_OFF +
- (DMA_RING_SIZE * ring) +
- genet_dma_ring_regs[r]);
+ return bcmgenet_readl(priv->base + GENET_TDMA_REG_OFF +
+ (DMA_RING_SIZE * ring) +
+ genet_dma_ring_regs[r]);
}
static inline void bcmgenet_tdma_ring_writel(struct bcmgenet_priv *priv,
unsigned int ring, u32 val,
enum dma_ring_reg r)
{
- __raw_writel(val, priv->base + GENET_TDMA_REG_OFF +
+ bcmgenet_writel(val, priv->base + GENET_TDMA_REG_OFF +
(DMA_RING_SIZE * ring) +
genet_dma_ring_regs[r]);
}
unsigned int ring,
enum dma_ring_reg r)
{
- return __raw_readl(priv->base + GENET_RDMA_REG_OFF +
- (DMA_RING_SIZE * ring) +
- genet_dma_ring_regs[r]);
+ return bcmgenet_readl(priv->base + GENET_RDMA_REG_OFF +
+ (DMA_RING_SIZE * ring) +
+ genet_dma_ring_regs[r]);
}
static inline void bcmgenet_rdma_ring_writel(struct bcmgenet_priv *priv,
unsigned int ring, u32 val,
enum dma_ring_reg r)
{
- __raw_writel(val, priv->base + GENET_RDMA_REG_OFF +
+ bcmgenet_writel(val, priv->base + GENET_RDMA_REG_OFF +
(DMA_RING_SIZE * ring) +
genet_dma_ring_regs[r]);
}
bcmgenet_umac_writel(priv, reg, UMAC_EEE_CTRL);
/* Enable EEE and switch to a 27Mhz clock automatically */
- reg = __raw_readl(priv->base + off);
+ reg = bcmgenet_readl(priv->base + off);
if (enable)
reg |= TBUF_EEE_EN | TBUF_PM_EN;
else
reg &= ~(TBUF_EEE_EN | TBUF_PM_EN);
- __raw_writel(reg, priv->base + off);
+ bcmgenet_writel(reg, priv->base + off);
/* Do the same for thing for RBUF */
reg = bcmgenet_rbuf_readl(priv, RBUF_ENERGY_CTRL);
if (skb) {
pkts_compl++;
bytes_compl += GENET_CB(skb)->bytes_sent;
- dev_kfree_skb_any(skb);
+ dev_consume_skb_any(skb);
}
txbds_processed++;
cb = ring->cbs + i;
skb = bcmgenet_rx_refill(priv, cb);
if (skb)
- dev_kfree_skb_any(skb);
+ dev_consume_skb_any(skb);
if (!cb->skb)
return -ENOMEM;
}
skb = bcmgenet_free_rx_cb(&priv->pdev->dev, cb);
if (skb)
- dev_kfree_skb_any(skb);
+ dev_consume_skb_any(skb);
}
}
list_del(&entry.list);
spin_unlock(&adap->mbox_lock);
ret = (v == MBOX_OWNER_FW) ? -EBUSY : -ETIMEDOUT;
- t4_record_mbox(adap, cmd, MBOX_LEN, access, ret);
+ t4_record_mbox(adap, cmd, size, access, ret);
return ret;
}
/* Copy in the new mailbox command and send it on its way ... */
- t4_record_mbox(adap, cmd, MBOX_LEN, access, 0);
+ t4_record_mbox(adap, cmd, size, access, 0);
for (i = 0; i < size; i += 8)
t4_write_reg64(adap, data_reg + i, be64_to_cpu(*p++));
}
ret = (pcie_fw & PCIE_FW_ERR_F) ? -ENXIO : -ETIMEDOUT;
- t4_record_mbox(adap, cmd, MBOX_LEN, access, ret);
+ t4_record_mbox(adap, cmd, size, access, ret);
dev_err(adap->pdev_dev, "command %#x in mailbox %d timed out\n",
*(const u8 *)cmd, mbox);
t4_report_fw_error(adap);
0xd010, 0xd03c,
0xdfc0, 0xdfe0,
0xe000, 0xea7c,
- 0xf000, 0x11190,
+ 0xf000, 0x11110,
+ 0x11118, 0x11190,
0x19040, 0x1906c,
0x19078, 0x19080,
0x1908c, 0x190e4,
0x1ff00, 0x1ff84,
0x1ffc0, 0x1ffc8,
0x30000, 0x30030,
- 0x30038, 0x30038,
- 0x30040, 0x30040,
0x30100, 0x30144,
0x30190, 0x301a0,
0x301a8, 0x301b8,
0x33c3c, 0x33c50,
0x33cf0, 0x33cfc,
0x34000, 0x34030,
- 0x34038, 0x34038,
- 0x34040, 0x34040,
0x34100, 0x34144,
0x34190, 0x341a0,
0x341a8, 0x341b8,
0x37c3c, 0x37c50,
0x37cf0, 0x37cfc,
0x38000, 0x38030,
- 0x38038, 0x38038,
- 0x38040, 0x38040,
0x38100, 0x38144,
0x38190, 0x381a0,
0x381a8, 0x381b8,
0x3bc3c, 0x3bc50,
0x3bcf0, 0x3bcfc,
0x3c000, 0x3c030,
- 0x3c038, 0x3c038,
- 0x3c040, 0x3c040,
0x3c100, 0x3c144,
0x3c190, 0x3c1a0,
0x3c1a8, 0x3c1b8,
0x1190, 0x1194,
0x11a0, 0x11a4,
0x11b0, 0x11b4,
- 0x11fc, 0x1258,
- 0x1280, 0x12d4,
- 0x12d9, 0x12d9,
- 0x12de, 0x12de,
- 0x12e3, 0x12e3,
- 0x12e8, 0x133c,
+ 0x11fc, 0x1274,
+ 0x1280, 0x133c,
0x1800, 0x18fc,
0x3000, 0x302c,
0x3060, 0x30b0,
0x5ea0, 0x5eb0,
0x5ec0, 0x5ec0,
0x5ec8, 0x5ed0,
+ 0x5ee0, 0x5ee0,
+ 0x5ef0, 0x5ef0,
+ 0x5f00, 0x5f00,
0x6000, 0x6020,
0x6028, 0x6040,
0x6058, 0x609c,
0xd300, 0xd31c,
0xdfc0, 0xdfe0,
0xe000, 0xf008,
+ 0xf010, 0xf018,
+ 0xf020, 0xf028,
0x11000, 0x11014,
0x11048, 0x1106c,
0x11074, 0x11088,
0x1ff00, 0x1ff84,
0x1ffc0, 0x1ffc8,
0x30000, 0x30030,
- 0x30038, 0x30038,
- 0x30040, 0x30040,
- 0x30048, 0x30048,
- 0x30050, 0x30050,
- 0x3005c, 0x30060,
- 0x30068, 0x30068,
- 0x30070, 0x30070,
0x30100, 0x30168,
0x30190, 0x301a0,
0x301a8, 0x301b8,
0x326a8, 0x326a8,
0x326ec, 0x326ec,
0x32a00, 0x32abc,
- 0x32b00, 0x32b38,
+ 0x32b00, 0x32b18,
+ 0x32b20, 0x32b38,
0x32b40, 0x32b58,
0x32b60, 0x32b78,
0x32c00, 0x32c00,
0x32c08, 0x32c3c,
- 0x32e00, 0x32e2c,
- 0x32f00, 0x32f2c,
0x33000, 0x3302c,
0x33034, 0x33050,
0x33058, 0x33058,
0x33c38, 0x33c50,
0x33cf0, 0x33cfc,
0x34000, 0x34030,
- 0x34038, 0x34038,
- 0x34040, 0x34040,
- 0x34048, 0x34048,
- 0x34050, 0x34050,
- 0x3405c, 0x34060,
- 0x34068, 0x34068,
- 0x34070, 0x34070,
0x34100, 0x34168,
0x34190, 0x341a0,
0x341a8, 0x341b8,
0x366a8, 0x366a8,
0x366ec, 0x366ec,
0x36a00, 0x36abc,
- 0x36b00, 0x36b38,
+ 0x36b00, 0x36b18,
+ 0x36b20, 0x36b38,
0x36b40, 0x36b58,
0x36b60, 0x36b78,
0x36c00, 0x36c00,
0x36c08, 0x36c3c,
- 0x36e00, 0x36e2c,
- 0x36f00, 0x36f2c,
0x37000, 0x3702c,
0x37034, 0x37050,
0x37058, 0x37058,
0x40280, 0x40280,
0x40304, 0x40304,
0x40330, 0x4033c,
- 0x41304, 0x413b8,
- 0x413c0, 0x413c8,
+ 0x41304, 0x413c8,
0x413d0, 0x413dc,
0x413f0, 0x413f0,
0x41400, 0x4140c,
return 0;
}
+/**
+ * t4_get_vpd_version - return the VPD version
+ * @adapter: the adapter
+ * @vers: where to place the version
+ *
+ * Reads the VPD via the Firmware interface (thus this can only be called
+ * once we're ready to issue Firmware commands). The format of the
+ * VPD version is adapter specific. Returns 0 on success, an error on
+ * failure.
+ *
+ * Note that early versions of the Firmware didn't include the ability
+ * to retrieve the VPD version, so we zero-out the return-value parameter
+ * in that case to avoid leaving it with garbage in it.
+ *
+ * Also note that the Firmware will return its cached copy of the VPD
+ * Revision ID, not the actual Revision ID as written in the Serial
+ * EEPROM. This is only an issue if a new VPD has been written and the
+ * Firmware/Chip haven't yet gone through a RESET sequence. So it's best
+ * to defer calling this routine till after a FW_RESET_CMD has been issued
+ * if the Host Driver will be performing a full adapter initialization.
+ */
+int t4_get_vpd_version(struct adapter *adapter, u32 *vers)
+{
+ u32 vpdrev_param;
+ int ret;
+
+ vpdrev_param = (FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_DEV) |
+ FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_DEV_VPDREV));
+ ret = t4_query_params(adapter, adapter->mbox, adapter->pf, 0,
+ 1, &vpdrev_param, vers);
+ if (ret)
+ *vers = 0;
+ return ret;
+}
+
+/**
+ * t4_get_scfg_version - return the Serial Configuration version
+ * @adapter: the adapter
+ * @vers: where to place the version
+ *
+ * Reads the Serial Configuration Version via the Firmware interface
+ * (thus this can only be called once we're ready to issue Firmware
+ * commands). The format of the Serial Configuration version is
+ * adapter specific. Returns 0 on success, an error on failure.
+ *
+ * Note that early versions of the Firmware didn't include the ability
+ * to retrieve the Serial Configuration version, so we zero-out the
+ * return-value parameter in that case to avoid leaving it with
+ * garbage in it.
+ *
+ * Also note that the Firmware will return its cached copy of the Serial
+ * Initialization Revision ID, not the actual Revision ID as written in
+ * the Serial EEPROM. This is only an issue if a new VPD has been written
+ * and the Firmware/Chip haven't yet gone through a RESET sequence. So
+ * it's best to defer calling this routine till after a FW_RESET_CMD has
+ * been issued if the Host Driver will be performing a full adapter
+ * initialization.
+ */
+int t4_get_scfg_version(struct adapter *adapter, u32 *vers)
+{
+ u32 scfgrev_param;
+ int ret;
+
+ scfgrev_param = (FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_DEV) |
+ FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_DEV_SCFGREV));
+ ret = t4_query_params(adapter, adapter->mbox, adapter->pf, 0,
+ 1, &scfgrev_param, vers);
+ if (ret)
+ *vers = 0;
+ return ret;
+}
+
+/**
+ * t4_get_version_info - extract various chip/firmware version information
+ * @adapter: the adapter
+ *
+ * Reads various chip/firmware version numbers and stores them into the
+ * adapter Adapter Parameters structure. If any of the efforts fails
+ * the first failure will be returned, but all of the version numbers
+ * will be read.
+ */
+int t4_get_version_info(struct adapter *adapter)
+{
+ int ret = 0;
+
+ #define FIRST_RET(__getvinfo) \
+ do { \
+ int __ret = __getvinfo; \
+ if (__ret && !ret) \
+ ret = __ret; \
+ } while (0)
+
+ FIRST_RET(t4_get_fw_version(adapter, &adapter->params.fw_vers));
+ FIRST_RET(t4_get_bs_version(adapter, &adapter->params.bs_vers));
+ FIRST_RET(t4_get_tp_version(adapter, &adapter->params.tp_vers));
+ FIRST_RET(t4_get_exprom_version(adapter, &adapter->params.er_vers));
+ FIRST_RET(t4_get_scfg_version(adapter, &adapter->params.scfg_vers));
+ FIRST_RET(t4_get_vpd_version(adapter, &adapter->params.vpd_vers));
+
+ #undef FIRST_RET
+ return ret;
+}
+
+/**
+ * t4_dump_version_info - dump all of the adapter configuration IDs
+ * @adapter: the adapter
+ *
+ * Dumps all of the various bits of adapter configuration version/revision
+ * IDs information. This is typically called at some point after
+ * t4_get_version_info() has been called.
+ */
+void t4_dump_version_info(struct adapter *adapter)
+{
+ /* Device information */
+ dev_info(adapter->pdev_dev, "Chelsio %s rev %d\n",
+ adapter->params.vpd.id,
+ CHELSIO_CHIP_RELEASE(adapter->params.chip));
+ dev_info(adapter->pdev_dev, "S/N: %s, P/N: %s\n",
+ adapter->params.vpd.sn, adapter->params.vpd.pn);
+
+ /* Firmware Version */
+ if (!adapter->params.fw_vers)
+ dev_warn(adapter->pdev_dev, "No firmware loaded\n");
+ else
+ dev_info(adapter->pdev_dev, "Firmware version: %u.%u.%u.%u\n",
+ FW_HDR_FW_VER_MAJOR_G(adapter->params.fw_vers),
+ FW_HDR_FW_VER_MINOR_G(adapter->params.fw_vers),
+ FW_HDR_FW_VER_MICRO_G(adapter->params.fw_vers),
+ FW_HDR_FW_VER_BUILD_G(adapter->params.fw_vers));
+
+ /* Bootstrap Firmware Version. (Some adapters don't have Bootstrap
+ * Firmware, so dev_info() is more appropriate here.)
+ */
+ if (!adapter->params.bs_vers)
+ dev_info(adapter->pdev_dev, "No bootstrap loaded\n");
+ else
+ dev_info(adapter->pdev_dev, "Bootstrap version: %u.%u.%u.%u\n",
+ FW_HDR_FW_VER_MAJOR_G(adapter->params.bs_vers),
+ FW_HDR_FW_VER_MINOR_G(adapter->params.bs_vers),
+ FW_HDR_FW_VER_MICRO_G(adapter->params.bs_vers),
+ FW_HDR_FW_VER_BUILD_G(adapter->params.bs_vers));
+
+ /* TP Microcode Version */
+ if (!adapter->params.tp_vers)
+ dev_warn(adapter->pdev_dev, "No TP Microcode loaded\n");
+ else
+ dev_info(adapter->pdev_dev,
+ "TP Microcode version: %u.%u.%u.%u\n",
+ FW_HDR_FW_VER_MAJOR_G(adapter->params.tp_vers),
+ FW_HDR_FW_VER_MINOR_G(adapter->params.tp_vers),
+ FW_HDR_FW_VER_MICRO_G(adapter->params.tp_vers),
+ FW_HDR_FW_VER_BUILD_G(adapter->params.tp_vers));
+
+ /* Expansion ROM version */
+ if (!adapter->params.er_vers)
+ dev_info(adapter->pdev_dev, "No Expansion ROM loaded\n");
+ else
+ dev_info(adapter->pdev_dev,
+ "Expansion ROM version: %u.%u.%u.%u\n",
+ FW_HDR_FW_VER_MAJOR_G(adapter->params.er_vers),
+ FW_HDR_FW_VER_MINOR_G(adapter->params.er_vers),
+ FW_HDR_FW_VER_MICRO_G(adapter->params.er_vers),
+ FW_HDR_FW_VER_BUILD_G(adapter->params.er_vers));
+
+ /* Serial Configuration version */
+ dev_info(adapter->pdev_dev, "Serial Configuration version: %#x\n",
+ adapter->params.scfg_vers);
+
+ /* VPD Version */
+ dev_info(adapter->pdev_dev, "VPD version: %#x\n",
+ adapter->params.vpd_vers);
+}
+
/**
* t4_check_fw_version - check if the FW is supported with this driver
* @adap: the adapter
}
}
-#define ADVERT_MASK (FW_PORT_CAP_SPEED_100M | FW_PORT_CAP_SPEED_1G |\
- FW_PORT_CAP_SPEED_10G | FW_PORT_CAP_SPEED_25G | \
- FW_PORT_CAP_SPEED_40G | FW_PORT_CAP_SPEED_100G | \
- FW_PORT_CAP_ANEG)
+#define ADVERT_MASK (FW_PORT_CAP32_SPEED_V(FW_PORT_CAP32_SPEED_M) | \
+ FW_PORT_CAP32_ANEG)
+
+/**
+ * fwcaps16_to_caps32 - convert 16-bit Port Capabilities to 32-bits
+ * @caps16: a 16-bit Port Capabilities value
+ *
+ * Returns the equivalent 32-bit Port Capabilities value.
+ */
+static fw_port_cap32_t fwcaps16_to_caps32(fw_port_cap16_t caps16)
+{
+ fw_port_cap32_t caps32 = 0;
+
+ #define CAP16_TO_CAP32(__cap) \
+ do { \
+ if (caps16 & FW_PORT_CAP_##__cap) \
+ caps32 |= FW_PORT_CAP32_##__cap; \
+ } while (0)
+
+ CAP16_TO_CAP32(SPEED_100M);
+ CAP16_TO_CAP32(SPEED_1G);
+ CAP16_TO_CAP32(SPEED_25G);
+ CAP16_TO_CAP32(SPEED_10G);
+ CAP16_TO_CAP32(SPEED_40G);
+ CAP16_TO_CAP32(SPEED_100G);
+ CAP16_TO_CAP32(FC_RX);
+ CAP16_TO_CAP32(FC_TX);
+ CAP16_TO_CAP32(ANEG);
+ CAP16_TO_CAP32(MDIX);
+ CAP16_TO_CAP32(MDIAUTO);
+ CAP16_TO_CAP32(FEC_RS);
+ CAP16_TO_CAP32(FEC_BASER_RS);
+ CAP16_TO_CAP32(802_3_PAUSE);
+ CAP16_TO_CAP32(802_3_ASM_DIR);
+
+ #undef CAP16_TO_CAP32
+
+ return caps32;
+}
+
+/**
+ * fwcaps32_to_caps16 - convert 32-bit Port Capabilities to 16-bits
+ * @caps32: a 32-bit Port Capabilities value
+ *
+ * Returns the equivalent 16-bit Port Capabilities value. Note that
+ * not all 32-bit Port Capabilities can be represented in the 16-bit
+ * Port Capabilities and some fields/values may not make it.
+ */
+static fw_port_cap16_t fwcaps32_to_caps16(fw_port_cap32_t caps32)
+{
+ fw_port_cap16_t caps16 = 0;
+
+ #define CAP32_TO_CAP16(__cap) \
+ do { \
+ if (caps32 & FW_PORT_CAP32_##__cap) \
+ caps16 |= FW_PORT_CAP_##__cap; \
+ } while (0)
+
+ CAP32_TO_CAP16(SPEED_100M);
+ CAP32_TO_CAP16(SPEED_1G);
+ CAP32_TO_CAP16(SPEED_10G);
+ CAP32_TO_CAP16(SPEED_25G);
+ CAP32_TO_CAP16(SPEED_40G);
+ CAP32_TO_CAP16(SPEED_100G);
+ CAP32_TO_CAP16(FC_RX);
+ CAP32_TO_CAP16(FC_TX);
+ CAP32_TO_CAP16(802_3_PAUSE);
+ CAP32_TO_CAP16(802_3_ASM_DIR);
+ CAP32_TO_CAP16(ANEG);
+ CAP32_TO_CAP16(MDIX);
+ CAP32_TO_CAP16(MDIAUTO);
+ CAP32_TO_CAP16(FEC_RS);
+ CAP32_TO_CAP16(FEC_BASER_RS);
+
+ #undef CAP32_TO_CAP16
+
+ return caps16;
+}
+
+/* Translate Firmware Port Capabilities Pause specification to Common Code */
+static inline enum cc_pause fwcap_to_cc_pause(fw_port_cap32_t fw_pause)
+{
+ enum cc_pause cc_pause = 0;
+
+ if (fw_pause & FW_PORT_CAP32_FC_RX)
+ cc_pause |= PAUSE_RX;
+ if (fw_pause & FW_PORT_CAP32_FC_TX)
+ cc_pause |= PAUSE_TX;
+
+ return cc_pause;
+}
+
+/* Translate Common Code Pause specification into Firmware Port Capabilities */
+static inline fw_port_cap32_t cc_to_fwcap_pause(enum cc_pause cc_pause)
+{
+ fw_port_cap32_t fw_pause = 0;
+
+ if (cc_pause & PAUSE_RX)
+ fw_pause |= FW_PORT_CAP32_FC_RX;
+ if (cc_pause & PAUSE_TX)
+ fw_pause |= FW_PORT_CAP32_FC_TX;
+
+ return fw_pause;
+}
+
+/* Translate Firmware Forward Error Correction specification to Common Code */
+static inline enum cc_fec fwcap_to_cc_fec(fw_port_cap32_t fw_fec)
+{
+ enum cc_fec cc_fec = 0;
+
+ if (fw_fec & FW_PORT_CAP32_FEC_RS)
+ cc_fec |= FEC_RS;
+ if (fw_fec & FW_PORT_CAP32_FEC_BASER_RS)
+ cc_fec |= FEC_BASER_RS;
+
+ return cc_fec;
+}
+
+/* Translate Common Code Forward Error Correction specification to Firmware */
+static inline fw_port_cap32_t cc_to_fwcap_fec(enum cc_fec cc_fec)
+{
+ fw_port_cap32_t fw_fec = 0;
+
+ if (cc_fec & FEC_RS)
+ fw_fec |= FW_PORT_CAP32_FEC_RS;
+ if (cc_fec & FEC_BASER_RS)
+ fw_fec |= FW_PORT_CAP32_FEC_BASER_RS;
+
+ return fw_fec;
+}
/**
* t4_link_l1cfg - apply link configuration to MAC/PHY
- * @phy: the PHY to setup
- * @mac: the MAC to setup
- * @lc: the requested link configuration
+ * @adapter: the adapter
+ * @mbox: the Firmware Mailbox to use
+ * @port: the Port ID
+ * @lc: the Port's Link Configuration
*
* Set up a port's MAC and PHY according to a desired link configuration.
* - If the PHY can auto-negotiate first decide what to advertise, then
* - If auto-negotiation is off set the MAC to the proper speed/duplex/FC,
* otherwise do it later based on the outcome of auto-negotiation.
*/
-int t4_link_l1cfg(struct adapter *adap, unsigned int mbox, unsigned int port,
- struct link_config *lc)
+int t4_link_l1cfg(struct adapter *adapter, unsigned int mbox,
+ unsigned int port, struct link_config *lc)
{
- struct fw_port_cmd c;
- unsigned int mdi = FW_PORT_CAP_MDI_V(FW_PORT_CAP_MDI_AUTO);
- unsigned int fc = 0, fec = 0, fw_fec = 0;
+ unsigned int fw_caps = adapter->params.fw_caps_support;
+ struct fw_port_cmd cmd;
+ unsigned int fw_mdi = FW_PORT_CAP32_MDI_V(FW_PORT_CAP32_MDI_AUTO);
+ fw_port_cap32_t fw_fc, cc_fec, fw_fec, rcap;
lc->link_ok = 0;
- if (lc->requested_fc & PAUSE_RX)
- fc |= FW_PORT_CAP_FC_RX;
- if (lc->requested_fc & PAUSE_TX)
- fc |= FW_PORT_CAP_FC_TX;
-
- fec = lc->requested_fec & FEC_AUTO ? lc->auto_fec : lc->requested_fec;
- if (fec & FEC_RS)
- fw_fec |= FW_PORT_CAP_FEC_RS;
- if (fec & FEC_BASER_RS)
- fw_fec |= FW_PORT_CAP_FEC_BASER_RS;
-
- memset(&c, 0, sizeof(c));
- c.op_to_portid = cpu_to_be32(FW_CMD_OP_V(FW_PORT_CMD) |
- FW_CMD_REQUEST_F | FW_CMD_EXEC_F |
- FW_PORT_CMD_PORTID_V(port));
- c.action_to_len16 =
- cpu_to_be32(FW_PORT_CMD_ACTION_V(FW_PORT_ACTION_L1_CFG) |
- FW_LEN16(c));
+ /* Convert driver coding of Pause Frame Flow Control settings into the
+ * Firmware's API.
+ */
+ fw_fc = cc_to_fwcap_pause(lc->requested_fc);
+
+ /* Convert Common Code Forward Error Control settings into the
+ * Firmware's API. If the current Requested FEC has "Automatic"
+ * (IEEE 802.3) specified, then we use whatever the Firmware
+ * sent us as part of it's IEEE 802.3-based interpratation of
+ * the Transceiver Module EPROM FEC parameters. Otherwise we
+ * use whatever is in the current Requested FEC settings.
+ */
+ if (lc->requested_fec & FEC_AUTO)
+ cc_fec = fwcap_to_cc_fec(lc->def_acaps);
+ else
+ cc_fec = lc->requested_fec;
+ fw_fec = cc_to_fwcap_fec(cc_fec);
- if (!(lc->supported & FW_PORT_CAP_ANEG)) {
- c.u.l1cfg.rcap = cpu_to_be32((lc->supported & ADVERT_MASK) |
- fc | fw_fec);
- lc->fc = lc->requested_fc & (PAUSE_RX | PAUSE_TX);
+ /* Figure out what our Requested Port Capabilities are going to be.
+ */
+ if (!(lc->pcaps & FW_PORT_CAP32_ANEG)) {
+ rcap = (lc->pcaps & ADVERT_MASK) | fw_fc | fw_fec;
+ lc->fc = lc->requested_fc & ~PAUSE_AUTONEG;
+ lc->fec = cc_fec;
} else if (lc->autoneg == AUTONEG_DISABLE) {
- c.u.l1cfg.rcap = cpu_to_be32(lc->requested_speed | fc |
- fw_fec | mdi);
- lc->fc = lc->requested_fc & (PAUSE_RX | PAUSE_TX);
- } else
- c.u.l1cfg.rcap = cpu_to_be32(lc->advertising | fc |
- fw_fec | mdi);
+ rcap = lc->speed_caps | fw_fc | fw_fec | fw_mdi;
+ lc->fc = lc->requested_fc & ~PAUSE_AUTONEG;
+ lc->fec = cc_fec;
+ } else {
+ rcap = lc->acaps | fw_fc | fw_fec | fw_mdi;
+ }
- return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL);
+ /* And send that on to the Firmware ...
+ */
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.op_to_portid = cpu_to_be32(FW_CMD_OP_V(FW_PORT_CMD) |
+ FW_CMD_REQUEST_F | FW_CMD_EXEC_F |
+ FW_PORT_CMD_PORTID_V(port));
+ cmd.action_to_len16 =
+ cpu_to_be32(FW_PORT_CMD_ACTION_V(fw_caps == FW_CAPS16
+ ? FW_PORT_ACTION_L1_CFG
+ : FW_PORT_ACTION_L1_CFG32) |
+ FW_LEN16(cmd));
+ if (fw_caps == FW_CAPS16)
+ cmd.u.l1cfg.rcap = cpu_to_be32(fwcaps32_to_caps16(rcap));
+ else
+ cmd.u.l1cfg32.rcap32 = cpu_to_be32(rcap);
+ return t4_wr_mbox(adapter, mbox, &cmd, sizeof(cmd), NULL);
}
/**
c.action_to_len16 =
cpu_to_be32(FW_PORT_CMD_ACTION_V(FW_PORT_ACTION_L1_CFG) |
FW_LEN16(c));
- c.u.l1cfg.rcap = cpu_to_be32(FW_PORT_CAP_ANEG);
+ c.u.l1cfg.rcap = cpu_to_be32(FW_PORT_CAP32_ANEG);
return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL);
}
if (ret < 0)
goto out;
+ /*
+ * If there was a Firmware Configuration File stored in FLASH,
+ * there's a good chance that it won't be compatible with the new
+ * Firmware. In order to prevent difficult to diagnose adapter
+ * initialization issues, we clear out the Firmware Configuration File
+ * portion of the FLASH . The user will need to re-FLASH a new
+ * Firmware Configuration File which is compatible with the new
+ * Firmware if that's desired.
+ */
+ (void)t4_load_cfg(adap, NULL, 0);
+
/*
* Older versions of the firmware don't understand the new
* PCIE_FW.HALT flag and so won't know to perform a RESET when they
return reason[link_down_rc];
}
+/**
+ * Return the highest speed set in the port capabilities, in Mb/s.
+ */
+static unsigned int fwcap_to_speed(fw_port_cap32_t caps)
+{
+ #define TEST_SPEED_RETURN(__caps_speed, __speed) \
+ do { \
+ if (caps & FW_PORT_CAP32_SPEED_##__caps_speed) \
+ return __speed; \
+ } while (0)
+
+ TEST_SPEED_RETURN(400G, 400000);
+ TEST_SPEED_RETURN(200G, 200000);
+ TEST_SPEED_RETURN(100G, 100000);
+ TEST_SPEED_RETURN(50G, 50000);
+ TEST_SPEED_RETURN(40G, 40000);
+ TEST_SPEED_RETURN(25G, 25000);
+ TEST_SPEED_RETURN(10G, 10000);
+ TEST_SPEED_RETURN(1G, 1000);
+ TEST_SPEED_RETURN(100M, 100);
+
+ #undef TEST_SPEED_RETURN
+
+ return 0;
+}
+
+/**
+ * fwcap_to_fwspeed - return highest speed in Port Capabilities
+ * @acaps: advertised Port Capabilities
+ *
+ * Get the highest speed for the port from the advertised Port
+ * Capabilities. It will be either the highest speed from the list of
+ * speeds or whatever user has set using ethtool.
+ */
+static fw_port_cap32_t fwcap_to_fwspeed(fw_port_cap32_t acaps)
+{
+ #define TEST_SPEED_RETURN(__caps_speed) \
+ do { \
+ if (acaps & FW_PORT_CAP32_SPEED_##__caps_speed) \
+ return FW_PORT_CAP32_SPEED_##__caps_speed; \
+ } while (0)
+
+ TEST_SPEED_RETURN(400G);
+ TEST_SPEED_RETURN(200G);
+ TEST_SPEED_RETURN(100G);
+ TEST_SPEED_RETURN(50G);
+ TEST_SPEED_RETURN(40G);
+ TEST_SPEED_RETURN(25G);
+ TEST_SPEED_RETURN(10G);
+ TEST_SPEED_RETURN(1G);
+ TEST_SPEED_RETURN(100M);
+
+ #undef TEST_SPEED_RETURN
+
+ return 0;
+}
+
+/**
+ * lstatus_to_fwcap - translate old lstatus to 32-bit Port Capabilities
+ * @lstatus: old FW_PORT_ACTION_GET_PORT_INFO lstatus value
+ *
+ * Translates old FW_PORT_ACTION_GET_PORT_INFO lstatus field into new
+ * 32-bit Port Capabilities value.
+ */
+static fw_port_cap32_t lstatus_to_fwcap(u32 lstatus)
+{
+ fw_port_cap32_t linkattr = 0;
+
+ /* Unfortunately the format of the Link Status in the old
+ * 16-bit Port Information message isn't the same as the
+ * 16-bit Port Capabilities bitfield used everywhere else ...
+ */
+ if (lstatus & FW_PORT_CMD_RXPAUSE_F)
+ linkattr |= FW_PORT_CAP32_FC_RX;
+ if (lstatus & FW_PORT_CMD_TXPAUSE_F)
+ linkattr |= FW_PORT_CAP32_FC_TX;
+ if (lstatus & FW_PORT_CMD_LSPEED_V(FW_PORT_CAP_SPEED_100M))
+ linkattr |= FW_PORT_CAP32_SPEED_100M;
+ if (lstatus & FW_PORT_CMD_LSPEED_V(FW_PORT_CAP_SPEED_1G))
+ linkattr |= FW_PORT_CAP32_SPEED_1G;
+ if (lstatus & FW_PORT_CMD_LSPEED_V(FW_PORT_CAP_SPEED_10G))
+ linkattr |= FW_PORT_CAP32_SPEED_10G;
+ if (lstatus & FW_PORT_CMD_LSPEED_V(FW_PORT_CAP_SPEED_25G))
+ linkattr |= FW_PORT_CAP32_SPEED_25G;
+ if (lstatus & FW_PORT_CMD_LSPEED_V(FW_PORT_CAP_SPEED_40G))
+ linkattr |= FW_PORT_CAP32_SPEED_40G;
+ if (lstatus & FW_PORT_CMD_LSPEED_V(FW_PORT_CAP_SPEED_100G))
+ linkattr |= FW_PORT_CAP32_SPEED_100G;
+
+ return linkattr;
+}
+
/**
* t4_handle_get_port_info - process a FW reply message
* @pi: the port info
*/
void t4_handle_get_port_info(struct port_info *pi, const __be64 *rpl)
{
- const struct fw_port_cmd *p = (const void *)rpl;
- struct adapter *adap = pi->adapter;
-
- /* link/module state change message */
- int speed = 0, fc = 0;
- struct link_config *lc;
- u32 stat = be32_to_cpu(p->u.info.lstatus_to_modtype);
- int link_ok = (stat & FW_PORT_CMD_LSTATUS_F) != 0;
- u32 mod = FW_PORT_CMD_MODTYPE_G(stat);
-
- if (stat & FW_PORT_CMD_RXPAUSE_F)
- fc |= PAUSE_RX;
- if (stat & FW_PORT_CMD_TXPAUSE_F)
- fc |= PAUSE_TX;
- if (stat & FW_PORT_CMD_LSPEED_V(FW_PORT_CAP_SPEED_100M))
- speed = 100;
- else if (stat & FW_PORT_CMD_LSPEED_V(FW_PORT_CAP_SPEED_1G))
- speed = 1000;
- else if (stat & FW_PORT_CMD_LSPEED_V(FW_PORT_CAP_SPEED_10G))
- speed = 10000;
- else if (stat & FW_PORT_CMD_LSPEED_V(FW_PORT_CAP_SPEED_25G))
- speed = 25000;
- else if (stat & FW_PORT_CMD_LSPEED_V(FW_PORT_CAP_SPEED_40G))
- speed = 40000;
- else if (stat & FW_PORT_CMD_LSPEED_V(FW_PORT_CAP_SPEED_100G))
- speed = 100000;
-
- lc = &pi->link_cfg;
-
- if (mod != pi->mod_type) {
- pi->mod_type = mod;
- t4_os_portmod_changed(adap, pi->port_id);
+ const struct fw_port_cmd *cmd = (const void *)rpl;
+ int action = FW_PORT_CMD_ACTION_G(be32_to_cpu(cmd->action_to_len16));
+ struct adapter *adapter = pi->adapter;
+ struct link_config *lc = &pi->link_cfg;
+ int link_ok, linkdnrc;
+ enum fw_port_type port_type;
+ enum fw_port_module_type mod_type;
+ unsigned int speed, fc, fec;
+ fw_port_cap32_t pcaps, acaps, lpacaps, linkattr;
+
+ /* Extract the various fields from the Port Information message.
+ */
+ switch (action) {
+ case FW_PORT_ACTION_GET_PORT_INFO: {
+ u32 lstatus = be32_to_cpu(cmd->u.info.lstatus_to_modtype);
+
+ link_ok = (lstatus & FW_PORT_CMD_LSTATUS_F) != 0;
+ linkdnrc = FW_PORT_CMD_LINKDNRC_G(lstatus);
+ port_type = FW_PORT_CMD_PTYPE_G(lstatus);
+ mod_type = FW_PORT_CMD_MODTYPE_G(lstatus);
+ pcaps = fwcaps16_to_caps32(be16_to_cpu(cmd->u.info.pcap));
+ acaps = fwcaps16_to_caps32(be16_to_cpu(cmd->u.info.acap));
+ lpacaps = fwcaps16_to_caps32(be16_to_cpu(cmd->u.info.lpacap));
+ linkattr = lstatus_to_fwcap(lstatus);
+ break;
}
+
+ case FW_PORT_ACTION_GET_PORT_INFO32: {
+ u32 lstatus32;
+
+ lstatus32 = be32_to_cpu(cmd->u.info32.lstatus32_to_cbllen32);
+ link_ok = (lstatus32 & FW_PORT_CMD_LSTATUS32_F) != 0;
+ linkdnrc = FW_PORT_CMD_LINKDNRC32_G(lstatus32);
+ port_type = FW_PORT_CMD_PORTTYPE32_G(lstatus32);
+ mod_type = FW_PORT_CMD_MODTYPE32_G(lstatus32);
+ pcaps = be32_to_cpu(cmd->u.info32.pcaps32);
+ acaps = be32_to_cpu(cmd->u.info32.acaps32);
+ lpacaps = be32_to_cpu(cmd->u.info32.lpacaps32);
+ linkattr = be32_to_cpu(cmd->u.info32.linkattr32);
+ break;
+ }
+
+ default:
+ dev_err(adapter->pdev_dev, "Handle Port Information: Bad Command/Action %#x\n",
+ be32_to_cpu(cmd->action_to_len16));
+ return;
+ }
+
+ fec = fwcap_to_cc_fec(acaps);
+ fc = fwcap_to_cc_pause(linkattr);
+ speed = fwcap_to_speed(linkattr);
+
+ if (mod_type != pi->mod_type) {
+ /* With the newer SFP28 and QSFP28 Transceiver Module Types,
+ * various fundamental Port Capabilities which used to be
+ * immutable can now change radically. We can now have
+ * Speeds, Auto-Negotiation, Forward Error Correction, etc.
+ * all change based on what Transceiver Module is inserted.
+ * So we need to record the Physical "Port" Capabilities on
+ * every Transceiver Module change.
+ */
+ lc->pcaps = pcaps;
+
+ /* When a new Transceiver Module is inserted, the Firmware
+ * will examine its i2c EPROM to determine its type and
+ * general operating parameters including things like Forward
+ * Error Control, etc. Various IEEE 802.3 standards dictate
+ * how to interpret these i2c values to determine default
+ * "sutomatic" settings. We record these for future use when
+ * the user explicitly requests these standards-based values.
+ */
+ lc->def_acaps = acaps;
+
+ /* Some versions of the early T6 Firmware "cheated" when
+ * handling different Transceiver Modules by changing the
+ * underlaying Port Type reported to the Host Drivers. As
+ * such we need to capture whatever Port Type the Firmware
+ * sends us and record it in case it's different from what we
+ * were told earlier. Unfortunately, since Firmware is
+ * forever, we'll need to keep this code here forever, but in
+ * later T6 Firmware it should just be an assignment of the
+ * same value already recorded.
+ */
+ pi->port_type = port_type;
+
+ pi->mod_type = mod_type;
+ t4_os_portmod_changed(adapter, pi->port_id);
+ }
+
if (link_ok != lc->link_ok || speed != lc->speed ||
- fc != lc->fc) { /* something changed */
+ fc != lc->fc || fec != lc->fec) { /* something changed */
if (!link_ok && lc->link_ok) {
- unsigned char rc = FW_PORT_CMD_LINKDNRC_G(stat);
-
- lc->link_down_rc = rc;
- dev_warn(adap->pdev_dev,
- "Port %d link down, reason: %s\n",
- pi->port_id, t4_link_down_rc_str(rc));
+ lc->link_down_rc = linkdnrc;
+ dev_warn(adapter->pdev_dev, "Port %d link down, reason: %s\n",
+ pi->tx_chan, t4_link_down_rc_str(linkdnrc));
}
lc->link_ok = link_ok;
lc->speed = speed;
lc->fc = fc;
- lc->supported = be16_to_cpu(p->u.info.pcap);
- lc->lp_advertising = be16_to_cpu(p->u.info.lpacap);
+ lc->fec = fec;
+
+ lc->lpacaps = lpacaps;
+ lc->acaps = acaps & ADVERT_MASK;
+
+ if (lc->acaps & FW_PORT_CAP32_ANEG) {
+ lc->autoneg = AUTONEG_ENABLE;
+ } else {
+ /* When Autoneg is disabled, user needs to set
+ * single speed.
+ * Similar to cxgb4_ethtool.c: set_link_ksettings
+ */
+ lc->acaps = 0;
+ lc->speed_caps = fwcap_to_fwspeed(acaps);
+ lc->autoneg = AUTONEG_DISABLE;
+ }
- t4_os_link_changed(adap, pi->port_id, link_ok);
+ t4_os_link_changed(adapter, pi->port_id, link_ok);
}
}
*/
int t4_update_port_info(struct port_info *pi)
{
+ unsigned int fw_caps = pi->adapter->params.fw_caps_support;
struct fw_port_cmd port_cmd;
int ret;
memset(&port_cmd, 0, sizeof(port_cmd));
port_cmd.op_to_portid = cpu_to_be32(FW_CMD_OP_V(FW_PORT_CMD) |
FW_CMD_REQUEST_F | FW_CMD_READ_F |
- FW_PORT_CMD_PORTID_V(pi->port_id));
+ FW_PORT_CMD_PORTID_V(pi->tx_chan));
port_cmd.action_to_len16 = cpu_to_be32(
- FW_PORT_CMD_ACTION_V(FW_PORT_ACTION_GET_PORT_INFO) |
+ FW_PORT_CMD_ACTION_V(fw_caps == FW_CAPS16
+ ? FW_PORT_ACTION_GET_PORT_INFO
+ : FW_PORT_ACTION_GET_PORT_INFO32) |
FW_LEN16(port_cmd));
ret = t4_wr_mbox(pi->adapter, pi->adapter->mbox,
&port_cmd, sizeof(port_cmd), &port_cmd);
return 0;
}
+/**
+ * t4_get_link_params - retrieve basic link parameters for given port
+ * @pi: the port
+ * @link_okp: value return pointer for link up/down
+ * @speedp: value return pointer for speed (Mb/s)
+ * @mtup: value return pointer for mtu
+ *
+ * Retrieves basic link parameters for a port: link up/down, speed (Mb/s),
+ * and MTU for a specified port. A negative error is returned on
+ * failure; 0 on success.
+ */
+int t4_get_link_params(struct port_info *pi, unsigned int *link_okp,
+ unsigned int *speedp, unsigned int *mtup)
+{
+ unsigned int fw_caps = pi->adapter->params.fw_caps_support;
+ struct fw_port_cmd port_cmd;
+ unsigned int action, link_ok, speed, mtu;
+ fw_port_cap32_t linkattr;
+ int ret;
+
+ memset(&port_cmd, 0, sizeof(port_cmd));
+ port_cmd.op_to_portid = cpu_to_be32(FW_CMD_OP_V(FW_PORT_CMD) |
+ FW_CMD_REQUEST_F | FW_CMD_READ_F |
+ FW_PORT_CMD_PORTID_V(pi->tx_chan));
+ action = (fw_caps == FW_CAPS16
+ ? FW_PORT_ACTION_GET_PORT_INFO
+ : FW_PORT_ACTION_GET_PORT_INFO32);
+ port_cmd.action_to_len16 = cpu_to_be32(
+ FW_PORT_CMD_ACTION_V(action) |
+ FW_LEN16(port_cmd));
+ ret = t4_wr_mbox(pi->adapter, pi->adapter->mbox,
+ &port_cmd, sizeof(port_cmd), &port_cmd);
+ if (ret)
+ return ret;
+
+ if (action == FW_PORT_ACTION_GET_PORT_INFO) {
+ u32 lstatus = be32_to_cpu(port_cmd.u.info.lstatus_to_modtype);
+
+ link_ok = !!(lstatus & FW_PORT_CMD_LSTATUS_F);
+ linkattr = lstatus_to_fwcap(lstatus);
+ mtu = be16_to_cpu(port_cmd.u.info.mtu);
+ } else {
+ u32 lstatus32 =
+ be32_to_cpu(port_cmd.u.info32.lstatus32_to_cbllen32);
+
+ link_ok = !!(lstatus32 & FW_PORT_CMD_LSTATUS32_F);
+ linkattr = be32_to_cpu(port_cmd.u.info32.linkattr32);
+ mtu = FW_PORT_CMD_MTU32_G(
+ be32_to_cpu(port_cmd.u.info32.auxlinfo32_mtu32));
+ }
+ speed = fwcap_to_speed(linkattr);
+
+ *link_okp = link_ok;
+ *speedp = fwcap_to_speed(linkattr);
+ *mtup = mtu;
+
+ return 0;
+}
+
/**
* t4_handle_fw_rpl - process a FW reply message
* @adap: the adapter
unsigned int action =
FW_PORT_CMD_ACTION_G(be32_to_cpu(p->action_to_len16));
- if (opcode == FW_PORT_CMD && action == FW_PORT_ACTION_GET_PORT_INFO) {
+ if (opcode == FW_PORT_CMD &&
+ (action == FW_PORT_ACTION_GET_PORT_INFO ||
+ action == FW_PORT_ACTION_GET_PORT_INFO32)) {
int i;
int chan = FW_PORT_CMD_PORTID_G(be32_to_cpu(p->op_to_portid));
struct port_info *pi = NULL;
t4_handle_get_port_info(pi, rpl);
} else {
- dev_warn(adap->pdev_dev, "Unknown firmware reply %d\n", opcode);
+ dev_warn(adap->pdev_dev, "Unknown firmware reply %d\n",
+ opcode);
return -EINVAL;
}
return 0;
/**
* init_link_config - initialize a link's SW state
- * @lc: structure holding the link state
- * @caps: link capabilities
+ * @lc: pointer to structure holding the link state
+ * @pcaps: link Port Capabilities
+ * @acaps: link current Advertised Port Capabilities
*
* Initializes the SW state maintained for each link, including the link's
* capabilities and default speed/flow-control/autonegotiation settings.
*/
-static void init_link_config(struct link_config *lc, unsigned int pcaps,
- unsigned int acaps)
+static void init_link_config(struct link_config *lc, fw_port_cap32_t pcaps,
+ fw_port_cap32_t acaps)
{
- lc->supported = pcaps;
- lc->lp_advertising = 0;
- lc->requested_speed = 0;
+ lc->pcaps = pcaps;
+ lc->def_acaps = acaps;
+ lc->lpacaps = 0;
+ lc->speed_caps = 0;
lc->speed = 0;
lc->requested_fc = lc->fc = PAUSE_RX | PAUSE_TX;
- lc->auto_fec = 0;
/* For Forward Error Control, we default to whatever the Firmware
* tells us the Link is currently advertising.
*/
- if (acaps & FW_PORT_CAP_FEC_RS)
- lc->auto_fec |= FEC_RS;
- if (acaps & FW_PORT_CAP_FEC_BASER_RS)
- lc->auto_fec |= FEC_BASER_RS;
lc->requested_fec = FEC_AUTO;
- lc->fec = lc->auto_fec;
+ lc->fec = fwcap_to_cc_fec(lc->def_acaps);
- if (lc->supported & FW_PORT_CAP_ANEG) {
- lc->advertising = lc->supported & ADVERT_MASK;
+ if (lc->pcaps & FW_PORT_CAP32_ANEG) {
+ lc->acaps = lc->pcaps & ADVERT_MASK;
lc->autoneg = AUTONEG_ENABLE;
lc->requested_fc |= PAUSE_AUTONEG;
} else {
- lc->advertising = 0;
+ lc->acaps = 0;
lc->autoneg = AUTONEG_DISABLE;
}
}
}
/**
- * t4_init_portinfo - allocate a virtual interface amd initialize port_info
+ * t4_init_portinfo - allocate a virtual interface and initialize port_info
* @pi: the port_info
* @mbox: mailbox to use for the FW command
* @port: physical port associated with the VI
int t4_init_portinfo(struct port_info *pi, int mbox,
int port, int pf, int vf, u8 mac[])
{
- int ret;
- struct fw_port_cmd c;
+ struct adapter *adapter = pi->adapter;
+ unsigned int fw_caps = adapter->params.fw_caps_support;
+ struct fw_port_cmd cmd;
unsigned int rss_size;
+ enum fw_port_type port_type;
+ int mdio_addr;
+ fw_port_cap32_t pcaps, acaps;
+ int ret;
- memset(&c, 0, sizeof(c));
- c.op_to_portid = cpu_to_be32(FW_CMD_OP_V(FW_PORT_CMD) |
- FW_CMD_REQUEST_F | FW_CMD_READ_F |
- FW_PORT_CMD_PORTID_V(port));
- c.action_to_len16 = cpu_to_be32(
- FW_PORT_CMD_ACTION_V(FW_PORT_ACTION_GET_PORT_INFO) |
- FW_LEN16(c));
- ret = t4_wr_mbox(pi->adapter, mbox, &c, sizeof(c), &c);
+ /* If we haven't yet determined whether we're talking to Firmware
+ * which knows the new 32-bit Port Capabilities, it's time to find
+ * out now. This will also tell new Firmware to send us Port Status
+ * Updates using the new 32-bit Port Capabilities version of the
+ * Port Information message.
+ */
+ if (fw_caps == FW_CAPS_UNKNOWN) {
+ u32 param, val;
+
+ param = (FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_PFVF) |
+ FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_PFVF_PORT_CAPS32));
+ val = 1;
+ ret = t4_set_params(adapter, mbox, pf, vf, 1, ¶m, &val);
+ fw_caps = (ret == 0 ? FW_CAPS32 : FW_CAPS16);
+ adapter->params.fw_caps_support = fw_caps;
+ }
+
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.op_to_portid = cpu_to_be32(FW_CMD_OP_V(FW_PORT_CMD) |
+ FW_CMD_REQUEST_F | FW_CMD_READ_F |
+ FW_PORT_CMD_PORTID_V(port));
+ cmd.action_to_len16 = cpu_to_be32(
+ FW_PORT_CMD_ACTION_V(fw_caps == FW_CAPS16
+ ? FW_PORT_ACTION_GET_PORT_INFO
+ : FW_PORT_ACTION_GET_PORT_INFO32) |
+ FW_LEN16(cmd));
+ ret = t4_wr_mbox(pi->adapter, mbox, &cmd, sizeof(cmd), &cmd);
if (ret)
return ret;
+ /* Extract the various fields from the Port Information message.
+ */
+ if (fw_caps == FW_CAPS16) {
+ u32 lstatus = be32_to_cpu(cmd.u.info.lstatus_to_modtype);
+
+ port_type = FW_PORT_CMD_PTYPE_G(lstatus);
+ mdio_addr = ((lstatus & FW_PORT_CMD_MDIOCAP_F)
+ ? FW_PORT_CMD_MDIOADDR_G(lstatus)
+ : -1);
+ pcaps = fwcaps16_to_caps32(be16_to_cpu(cmd.u.info.pcap));
+ acaps = fwcaps16_to_caps32(be16_to_cpu(cmd.u.info.acap));
+ } else {
+ u32 lstatus32 = be32_to_cpu(cmd.u.info32.lstatus32_to_cbllen32);
+
+ port_type = FW_PORT_CMD_PORTTYPE32_G(lstatus32);
+ mdio_addr = ((lstatus32 & FW_PORT_CMD_MDIOCAP32_F)
+ ? FW_PORT_CMD_MDIOADDR32_G(lstatus32)
+ : -1);
+ pcaps = be32_to_cpu(cmd.u.info32.pcaps32);
+ acaps = be32_to_cpu(cmd.u.info32.acaps32);
+ }
+
ret = t4_alloc_vi(pi->adapter, mbox, port, pf, vf, 1, mac, &rss_size);
if (ret < 0)
return ret;
pi->lport = port;
pi->rss_size = rss_size;
- ret = be32_to_cpu(c.u.info.lstatus_to_modtype);
- pi->mdio_addr = (ret & FW_PORT_CMD_MDIOCAP_F) ?
- FW_PORT_CMD_MDIOADDR_G(ret) : -1;
- pi->port_type = FW_PORT_CMD_PTYPE_G(ret);
+ pi->port_type = port_type;
+ pi->mdio_addr = mdio_addr;
pi->mod_type = FW_PORT_MOD_TYPE_NA;
- init_link_config(&pi->link_cfg, be16_to_cpu(c.u.info.pcap),
- be16_to_cpu(c.u.info.acap));
+ init_link_config(&pi->link_cfg, pcaps, acaps);
return 0;
}
}
}
+/**
+ * t4_load_cfg - download config file
+ * @adap: the adapter
+ * @cfg_data: the cfg text file to write
+ * @size: text file size
+ *
+ * Write the supplied config text file to the card's serial flash.
+ */
+int t4_load_cfg(struct adapter *adap, const u8 *cfg_data, unsigned int size)
+{
+ int ret, i, n, cfg_addr;
+ unsigned int addr;
+ unsigned int flash_cfg_start_sec;
+ unsigned int sf_sec_size = adap->params.sf_size / adap->params.sf_nsec;
+
+ cfg_addr = t4_flash_cfg_addr(adap);
+ if (cfg_addr < 0)
+ return cfg_addr;
+
+ addr = cfg_addr;
+ flash_cfg_start_sec = addr / SF_SEC_SIZE;
+
+ if (size > FLASH_CFG_MAX_SIZE) {
+ dev_err(adap->pdev_dev, "cfg file too large, max is %u bytes\n",
+ FLASH_CFG_MAX_SIZE);
+ return -EFBIG;
+ }
+
+ i = DIV_ROUND_UP(FLASH_CFG_MAX_SIZE, /* # of sectors spanned */
+ sf_sec_size);
+ ret = t4_flash_erase_sectors(adap, flash_cfg_start_sec,
+ flash_cfg_start_sec + i - 1);
+ /* If size == 0 then we're simply erasing the FLASH sectors associated
+ * with the on-adapter Firmware Configuration File.
+ */
+ if (ret || size == 0)
+ goto out;
+
+ /* this will write to the flash up to SF_PAGE_SIZE at a time */
+ for (i = 0; i < size; i += SF_PAGE_SIZE) {
+ if ((size - i) < SF_PAGE_SIZE)
+ n = size - i;
+ else
+ n = SF_PAGE_SIZE;
+ ret = t4_write_flash(adap, addr, n, cfg_data);
+ if (ret)
+ goto out;
+
+ addr += SF_PAGE_SIZE;
+ cfg_data += SF_PAGE_SIZE;
+ }
+
+out:
+ if (ret)
+ dev_err(adap->pdev_dev, "config file %s failed %d\n",
+ (size == 0 ? "clear" : "download"), ret);
+ return ret;
+}
+
/**
* t4_set_vf_mac - Set MAC address for the specified VF
* @adapter: The adapter
#ifdef CONFIG_NET_POLL_CONTROLLER
.ndo_poll_controller = ftgmac100_poll_controller,
#endif
+ .ndo_vlan_rx_add_vid = ncsi_vlan_rx_add_vid,
+ .ndo_vlan_rx_kill_vid = ncsi_vlan_rx_kill_vid,
};
static int ftgmac100_setup_mdio(struct net_device *netdev)
NETIF_F_GRO | NETIF_F_SG | NETIF_F_HW_VLAN_CTAG_RX |
NETIF_F_HW_VLAN_CTAG_TX;
+ if (priv->use_ncsi)
+ netdev->hw_features |= NETIF_F_HW_VLAN_CTAG_FILTER;
+
/* AST2400 doesn't have working HW checksum generation */
if (np && (of_device_is_compatible(np, "aspeed,ast2400-mac")))
netdev->hw_features &= ~NETIF_F_HW_CSUM;
err_ioremap:
release_resource(priv->res);
err_req_mem:
- netif_napi_del(&priv->napi);
free_netdev(netdev);
err_alloc_etherdev:
return err;
goto no_mem;
}
+ pdev->dev.of_node = node;
+ pdev->dev.parent = priv->dev;
set_dma_ops(&pdev->dev, get_dma_ops(priv->dev));
ret = platform_device_add_data(pdev, &data, sizeof(data));
priv->internal_phy_node = of_parse_phandle(mac_node,
"pcsphy-handle", 0);
} else {
- dev_err(dev, "MAC node (%s) contains unsupported MAC\n",
- mac_node->full_name);
+ dev_err(dev, "MAC node (%pOF) contains unsupported MAC\n",
+ mac_node);
err = -EINVAL;
goto _return;
}
/* Get the FM node */
dev_node = of_get_parent(mac_node);
if (!dev_node) {
- dev_err(dev, "of_get_parent(%s) failed\n",
- mac_node->full_name);
+ dev_err(dev, "of_get_parent(%pOF) failed\n",
+ mac_node);
err = -EINVAL;
goto _return_dev_set_drvdata;
}
of_dev = of_find_device_by_node(dev_node);
if (!of_dev) {
- dev_err(dev, "of_find_device_by_node(%s) failed\n",
- dev_node->full_name);
+ dev_err(dev, "of_find_device_by_node(%pOF) failed\n", dev_node);
err = -EINVAL;
goto _return_of_node_put;
}
/* Get the FMan cell-index */
err = of_property_read_u32(dev_node, "cell-index", &val);
if (err) {
- dev_err(dev, "failed to read cell-index for %s\n",
- dev_node->full_name);
+ dev_err(dev, "failed to read cell-index for %pOF\n", dev_node);
err = -EINVAL;
goto _return_of_node_put;
}
priv->fman = fman_bind(&of_dev->dev);
if (!priv->fman) {
- dev_err(dev, "fman_bind(%s) failed\n", dev_node->full_name);
+ dev_err(dev, "fman_bind(%pOF) failed\n", dev_node);
err = -ENODEV;
goto _return_of_node_put;
}
/* Get the address of the memory mapped registers */
err = of_address_to_resource(mac_node, 0, &res);
if (err < 0) {
- dev_err(dev, "of_address_to_resource(%s) = %d\n",
- mac_node->full_name, err);
+ dev_err(dev, "of_address_to_resource(%pOF) = %d\n",
+ mac_node, err);
goto _return_dev_set_drvdata;
}
/* Get the cell-index */
err = of_property_read_u32(mac_node, "cell-index", &val);
if (err) {
- dev_err(dev, "failed to read cell-index for %s\n",
- mac_node->full_name);
+ dev_err(dev, "failed to read cell-index for %pOF\n", mac_node);
err = -EINVAL;
goto _return_dev_set_drvdata;
}
/* Get the MAC address */
mac_addr = of_get_mac_address(mac_node);
if (!mac_addr) {
- dev_err(dev, "of_get_mac_address(%s) failed\n",
- mac_node->full_name);
+ dev_err(dev, "of_get_mac_address(%pOF) failed\n", mac_node);
err = -EINVAL;
goto _return_dev_set_drvdata;
}
/* Get the port handles */
nph = of_count_phandle_with_args(mac_node, "fsl,fman-ports", NULL);
if (unlikely(nph < 0)) {
- dev_err(dev, "of_count_phandle_with_args(%s, fsl,fman-ports) failed\n",
- mac_node->full_name);
+ dev_err(dev, "of_count_phandle_with_args(%pOF, fsl,fman-ports) failed\n",
+ mac_node);
err = nph;
goto _return_dev_set_drvdata;
}
if (nph != ARRAY_SIZE(mac_dev->port)) {
- dev_err(dev, "Not supported number of fman-ports handles of mac node %s from device tree\n",
- mac_node->full_name);
+ dev_err(dev, "Not supported number of fman-ports handles of mac node %pOF from device tree\n",
+ mac_node);
err = -EINVAL;
goto _return_dev_set_drvdata;
}
/* Find the port node */
dev_node = of_parse_phandle(mac_node, "fsl,fman-ports", i);
if (!dev_node) {
- dev_err(dev, "of_parse_phandle(%s, fsl,fman-ports) failed\n",
- mac_node->full_name);
+ dev_err(dev, "of_parse_phandle(%pOF, fsl,fman-ports) failed\n",
+ mac_node);
err = -EINVAL;
goto _return_of_node_put;
}
of_dev = of_find_device_by_node(dev_node);
if (!of_dev) {
- dev_err(dev, "of_find_device_by_node(%s) failed\n",
- dev_node->full_name);
+ dev_err(dev, "of_find_device_by_node(%pOF) failed\n",
+ dev_node);
err = -EINVAL;
goto _return_of_node_put;
}
mac_dev->port[i] = fman_port_bind(&of_dev->dev);
if (!mac_dev->port[i]) {
- dev_err(dev, "dev_get_drvdata(%s) failed\n",
- dev_node->full_name);
+ dev_err(dev, "dev_get_drvdata(%pOF) failed\n",
+ dev_node);
err = -EINVAL;
goto _return_of_node_put;
}
phy_if = of_get_phy_mode(mac_node);
if (phy_if < 0) {
dev_warn(dev,
- "of_get_phy_mode() for %s failed. Defaulting to SGMII\n",
- mac_node->full_name);
+ "of_get_phy_mode() for %pOF failed. Defaulting to SGMII\n",
+ mac_node);
phy_if = PHY_INTERFACE_MODE_SGMII;
}
priv->phy_if = phy_if;
#include <linux/inetdevice.h>
#include <linux/mbus.h>
#include <linux/module.h>
+#include <linux/mfd/syscon.h>
#include <linux/interrupt.h>
#include <linux/cpumask.h>
#include <linux/of.h>
#include <linux/of_address.h>
#include <linux/of_device.h>
#include <linux/phy.h>
+#include <linux/phy/phy.h>
#include <linux/clk.h>
#include <linux/hrtimer.h>
#include <linux/ktime.h>
+#include <linux/regmap.h>
#include <uapi/linux/ppp_defs.h>
#include <net/ip.h>
#include <net/ipv6.h>
+#include <net/tso.h>
/* RX Fifo Registers */
#define MVPP2_RX_DATA_FIFO_SIZE_REG(port) (0x00 + 4 * (port))
#define MVPP2_TXQ_DESC_ADDR_REG 0x2084
#define MVPP2_TXQ_DESC_SIZE_REG 0x2088
#define MVPP2_TXQ_DESC_SIZE_MASK 0x3ff0
+#define MVPP2_TXQ_THRESH_REG 0x2094
+#define MVPP2_TXQ_THRESH_OFFSET 16
+#define MVPP2_TXQ_THRESH_MASK 0x3fff
#define MVPP2_AGGR_TXQ_UPDATE_REG 0x2090
#define MVPP2_TXQ_INDEX_REG 0x2098
#define MVPP2_TXQ_PREF_BUF_REG 0x209c
#define MVPP22_AXI_CODE_DOMAIN_SYSTEM 3
/* Interrupt Cause and Mask registers */
+#define MVPP2_ISR_TX_THRESHOLD_REG(port) (0x5140 + 4 * (port))
+#define MVPP2_MAX_ISR_TX_THRESHOLD 0xfffff0
+
#define MVPP2_ISR_RX_THRESHOLD_REG(rxq) (0x5200 + 4 * (rxq))
#define MVPP2_MAX_ISR_RX_THRESHOLD 0xfffff0
-#define MVPP21_ISR_RXQ_GROUP_REG(rxq) (0x5400 + 4 * (rxq))
+#define MVPP21_ISR_RXQ_GROUP_REG(port) (0x5400 + 4 * (port))
-#define MVPP22_ISR_RXQ_GROUP_INDEX_REG 0x5400
+#define MVPP22_ISR_RXQ_GROUP_INDEX_REG 0x5400
#define MVPP22_ISR_RXQ_GROUP_INDEX_SUBGROUP_MASK 0xf
-#define MVPP22_ISR_RXQ_GROUP_INDEX_GROUP_MASK 0x380
-#define MVPP22_ISR_RXQ_GROUP_INDEX_GROUP_OFFSET 7
+#define MVPP22_ISR_RXQ_GROUP_INDEX_GROUP_MASK 0x380
+#define MVPP22_ISR_RXQ_GROUP_INDEX_GROUP_OFFSET 7
#define MVPP22_ISR_RXQ_GROUP_INDEX_SUBGROUP_MASK 0xf
-#define MVPP22_ISR_RXQ_GROUP_INDEX_GROUP_MASK 0x380
+#define MVPP22_ISR_RXQ_GROUP_INDEX_GROUP_MASK 0x380
-#define MVPP22_ISR_RXQ_SUB_GROUP_CONFIG_REG 0x5404
-#define MVPP22_ISR_RXQ_SUB_GROUP_STARTQ_MASK 0x1f
-#define MVPP22_ISR_RXQ_SUB_GROUP_SIZE_MASK 0xf00
-#define MVPP22_ISR_RXQ_SUB_GROUP_SIZE_OFFSET 8
+#define MVPP22_ISR_RXQ_SUB_GROUP_CONFIG_REG 0x5404
+#define MVPP22_ISR_RXQ_SUB_GROUP_STARTQ_MASK 0x1f
+#define MVPP22_ISR_RXQ_SUB_GROUP_SIZE_MASK 0xf00
+#define MVPP22_ISR_RXQ_SUB_GROUP_SIZE_OFFSET 8
#define MVPP2_ISR_ENABLE_REG(port) (0x5420 + 4 * (port))
#define MVPP2_ISR_ENABLE_INTERRUPT(mask) ((mask) & 0xffff)
#define MVPP2_ISR_RX_TX_CAUSE_REG(port) (0x5480 + 4 * (port))
#define MVPP2_CAUSE_RXQ_OCCUP_DESC_ALL_MASK 0xffff
#define MVPP2_CAUSE_TXQ_OCCUP_DESC_ALL_MASK 0xff0000
+#define MVPP2_CAUSE_TXQ_OCCUP_DESC_ALL_OFFSET 16
#define MVPP2_CAUSE_RX_FIFO_OVERRUN_MASK BIT(24)
#define MVPP2_CAUSE_FCS_ERR_MASK BIT(25)
#define MVPP2_CAUSE_TX_FIFO_UNDERRUN_MASK BIT(26)
#define MVPP2_BM_VIRT_RLS_REG 0x64c0
#define MVPP22_BM_ADDR_HIGH_RLS_REG 0x64c4
#define MVPP22_BM_ADDR_HIGH_PHYS_RLS_MASK 0xff
-#define MVPP22_BM_ADDR_HIGH_VIRT_RLS_MASK 0xff00
+#define MVPP22_BM_ADDR_HIGH_VIRT_RLS_MASK 0xff00
#define MVPP22_BM_ADDR_HIGH_VIRT_RLS_SHIFT 8
/* TX Scheduler registers */
/* Per-port registers */
#define MVPP2_GMAC_CTRL_0_REG 0x0
-#define MVPP2_GMAC_PORT_EN_MASK BIT(0)
-#define MVPP2_GMAC_MAX_RX_SIZE_OFFS 2
-#define MVPP2_GMAC_MAX_RX_SIZE_MASK 0x7ffc
-#define MVPP2_GMAC_MIB_CNTR_EN_MASK BIT(15)
+#define MVPP2_GMAC_PORT_EN_MASK BIT(0)
+#define MVPP2_GMAC_PORT_TYPE_MASK BIT(1)
+#define MVPP2_GMAC_MAX_RX_SIZE_OFFS 2
+#define MVPP2_GMAC_MAX_RX_SIZE_MASK 0x7ffc
+#define MVPP2_GMAC_MIB_CNTR_EN_MASK BIT(15)
#define MVPP2_GMAC_CTRL_1_REG 0x4
-#define MVPP2_GMAC_PERIODIC_XON_EN_MASK BIT(1)
-#define MVPP2_GMAC_GMII_LB_EN_MASK BIT(5)
-#define MVPP2_GMAC_PCS_LB_EN_BIT 6
-#define MVPP2_GMAC_PCS_LB_EN_MASK BIT(6)
-#define MVPP2_GMAC_SA_LOW_OFFS 7
+#define MVPP2_GMAC_PERIODIC_XON_EN_MASK BIT(1)
+#define MVPP2_GMAC_GMII_LB_EN_MASK BIT(5)
+#define MVPP2_GMAC_PCS_LB_EN_BIT 6
+#define MVPP2_GMAC_PCS_LB_EN_MASK BIT(6)
+#define MVPP2_GMAC_SA_LOW_OFFS 7
#define MVPP2_GMAC_CTRL_2_REG 0x8
-#define MVPP2_GMAC_INBAND_AN_MASK BIT(0)
-#define MVPP2_GMAC_PCS_ENABLE_MASK BIT(3)
-#define MVPP2_GMAC_PORT_RGMII_MASK BIT(4)
-#define MVPP2_GMAC_PORT_RESET_MASK BIT(6)
+#define MVPP2_GMAC_INBAND_AN_MASK BIT(0)
+#define MVPP2_GMAC_FLOW_CTRL_MASK GENMASK(2, 1)
+#define MVPP2_GMAC_PCS_ENABLE_MASK BIT(3)
+#define MVPP2_GMAC_PORT_RGMII_MASK BIT(4)
+#define MVPP2_GMAC_DISABLE_PADDING BIT(5)
+#define MVPP2_GMAC_PORT_RESET_MASK BIT(6)
#define MVPP2_GMAC_AUTONEG_CONFIG 0xc
-#define MVPP2_GMAC_FORCE_LINK_DOWN BIT(0)
-#define MVPP2_GMAC_FORCE_LINK_PASS BIT(1)
-#define MVPP2_GMAC_CONFIG_MII_SPEED BIT(5)
-#define MVPP2_GMAC_CONFIG_GMII_SPEED BIT(6)
-#define MVPP2_GMAC_AN_SPEED_EN BIT(7)
-#define MVPP2_GMAC_FC_ADV_EN BIT(9)
-#define MVPP2_GMAC_CONFIG_FULL_DUPLEX BIT(12)
-#define MVPP2_GMAC_AN_DUPLEX_EN BIT(13)
+#define MVPP2_GMAC_FORCE_LINK_DOWN BIT(0)
+#define MVPP2_GMAC_FORCE_LINK_PASS BIT(1)
+#define MVPP2_GMAC_IN_BAND_AUTONEG BIT(2)
+#define MVPP2_GMAC_IN_BAND_AUTONEG_BYPASS BIT(3)
+#define MVPP2_GMAC_CONFIG_MII_SPEED BIT(5)
+#define MVPP2_GMAC_CONFIG_GMII_SPEED BIT(6)
+#define MVPP2_GMAC_AN_SPEED_EN BIT(7)
+#define MVPP2_GMAC_FC_ADV_EN BIT(9)
+#define MVPP2_GMAC_FLOW_CTRL_AUTONEG BIT(11)
+#define MVPP2_GMAC_CONFIG_FULL_DUPLEX BIT(12)
+#define MVPP2_GMAC_AN_DUPLEX_EN BIT(13)
+#define MVPP2_GMAC_STATUS0 0x10
+#define MVPP2_GMAC_STATUS0_LINK_UP BIT(0)
#define MVPP2_GMAC_PORT_FIFO_CFG_1_REG 0x1c
-#define MVPP2_GMAC_TX_FIFO_MIN_TH_OFFS 6
-#define MVPP2_GMAC_TX_FIFO_MIN_TH_ALL_MASK 0x1fc0
-#define MVPP2_GMAC_TX_FIFO_MIN_TH_MASK(v) (((v) << 6) & \
+#define MVPP2_GMAC_TX_FIFO_MIN_TH_OFFS 6
+#define MVPP2_GMAC_TX_FIFO_MIN_TH_ALL_MASK 0x1fc0
+#define MVPP2_GMAC_TX_FIFO_MIN_TH_MASK(v) (((v) << 6) & \
MVPP2_GMAC_TX_FIFO_MIN_TH_ALL_MASK)
+#define MVPP22_GMAC_INT_STAT 0x20
+#define MVPP22_GMAC_INT_STAT_LINK BIT(1)
+#define MVPP22_GMAC_INT_MASK 0x24
+#define MVPP22_GMAC_INT_MASK_LINK_STAT BIT(1)
#define MVPP22_GMAC_CTRL_4_REG 0x90
-#define MVPP22_CTRL4_EXT_PIN_GMII_SEL BIT(0)
-#define MVPP22_CTRL4_DP_CLK_SEL BIT(5)
-#define MVPP22_CTRL4_SYNC_BYPASS BIT(6)
-#define MVPP22_CTRL4_QSGMII_BYPASS_ACTIVE BIT(7)
+#define MVPP22_CTRL4_EXT_PIN_GMII_SEL BIT(0)
+#define MVPP22_CTRL4_DP_CLK_SEL BIT(5)
+#define MVPP22_CTRL4_SYNC_BYPASS_DIS BIT(6)
+#define MVPP22_CTRL4_QSGMII_BYPASS_ACTIVE BIT(7)
+#define MVPP22_GMAC_INT_SUM_MASK 0xa4
+#define MVPP22_GMAC_INT_SUM_MASK_LINK_STAT BIT(1)
/* Per-port XGMAC registers. PPv2.2 only, only for GOP port 0,
* relative to port->base.
*/
#define MVPP22_XLG_CTRL0_REG 0x100
-#define MVPP22_XLG_CTRL0_PORT_EN BIT(0)
-#define MVPP22_XLG_CTRL0_MAC_RESET_DIS BIT(1)
-#define MVPP22_XLG_CTRL0_MIB_CNT_DIS BIT(14)
-
+#define MVPP22_XLG_CTRL0_PORT_EN BIT(0)
+#define MVPP22_XLG_CTRL0_MAC_RESET_DIS BIT(1)
+#define MVPP22_XLG_CTRL0_RX_FLOW_CTRL_EN BIT(7)
+#define MVPP22_XLG_CTRL0_MIB_CNT_DIS BIT(14)
+#define MVPP22_XLG_CTRL1_REG 0x104
+#define MVPP22_XLG_CTRL1_FRAMESIZELIMIT_OFFS 0
+#define MVPP22_XLG_CTRL1_FRAMESIZELIMIT_MASK 0x1fff
+#define MVPP22_XLG_STATUS 0x10c
+#define MVPP22_XLG_STATUS_LINK_UP BIT(0)
+#define MVPP22_XLG_INT_STAT 0x114
+#define MVPP22_XLG_INT_STAT_LINK BIT(1)
+#define MVPP22_XLG_INT_MASK 0x118
+#define MVPP22_XLG_INT_MASK_LINK BIT(1)
#define MVPP22_XLG_CTRL3_REG 0x11c
-#define MVPP22_XLG_CTRL3_MACMODESELECT_MASK (7 << 13)
-#define MVPP22_XLG_CTRL3_MACMODESELECT_GMAC (0 << 13)
-#define MVPP22_XLG_CTRL3_MACMODESELECT_10G (1 << 13)
+#define MVPP22_XLG_CTRL3_MACMODESELECT_MASK (7 << 13)
+#define MVPP22_XLG_CTRL3_MACMODESELECT_GMAC (0 << 13)
+#define MVPP22_XLG_CTRL3_MACMODESELECT_10G (1 << 13)
+#define MVPP22_XLG_EXT_INT_MASK 0x15c
+#define MVPP22_XLG_EXT_INT_MASK_XLG BIT(1)
+#define MVPP22_XLG_EXT_INT_MASK_GIG BIT(2)
+#define MVPP22_XLG_CTRL4_REG 0x184
+#define MVPP22_XLG_CTRL4_FWD_FC BIT(5)
+#define MVPP22_XLG_CTRL4_FWD_PFC BIT(6)
+#define MVPP22_XLG_CTRL4_MACMODSELECT_GMAC BIT(12)
/* SMI registers. PPv2.2 only, relative to priv->iface_base. */
#define MVPP22_SMI_MISC_CFG_REG 0x1204
-#define MVPP22_SMI_POLLING_EN BIT(10)
+#define MVPP22_SMI_POLLING_EN BIT(10)
#define MVPP22_GMAC_BASE(port) (0x7000 + (port) * 0x1000 + 0xe00)
#define MVPP2_QUEUE_NEXT_DESC(q, index) \
(((index) < (q)->last_desc) ? ((index) + 1) : 0)
+/* XPCS registers. PPv2.2 only */
+#define MVPP22_MPCS_BASE(port) (0x7000 + (port) * 0x1000)
+#define MVPP22_MPCS_CTRL 0x14
+#define MVPP22_MPCS_CTRL_FWD_ERR_CONN BIT(10)
+#define MVPP22_MPCS_CLK_RESET 0x14c
+#define MAC_CLK_RESET_SD_TX BIT(0)
+#define MAC_CLK_RESET_SD_RX BIT(1)
+#define MAC_CLK_RESET_MAC BIT(2)
+#define MVPP22_MPCS_CLK_RESET_DIV_RATIO(n) ((n) << 4)
+#define MVPP22_MPCS_CLK_RESET_DIV_SET BIT(11)
+
+/* XPCS registers. PPv2.2 only */
+#define MVPP22_XPCS_BASE(port) (0x7400 + (port) * 0x1000)
+#define MVPP22_XPCS_CFG0 0x0
+#define MVPP22_XPCS_CFG0_PCS_MODE(n) ((n) << 3)
+#define MVPP22_XPCS_CFG0_ACTIVE_LANE(n) ((n) << 5)
+
+/* System controller registers. Accessed through a regmap. */
+#define GENCONF_SOFT_RESET1 0x1108
+#define GENCONF_SOFT_RESET1_GOP BIT(6)
+#define GENCONF_PORT_CTRL0 0x1110
+#define GENCONF_PORT_CTRL0_BUS_WIDTH_SELECT BIT(1)
+#define GENCONF_PORT_CTRL0_RX_DATA_SAMPLE BIT(29)
+#define GENCONF_PORT_CTRL0_CLK_DIV_PHASE_CLR BIT(31)
+#define GENCONF_PORT_CTRL1 0x1114
+#define GENCONF_PORT_CTRL1_EN(p) BIT(p)
+#define GENCONF_PORT_CTRL1_RESET(p) (BIT(p) << 28)
+#define GENCONF_CTRL0 0x1120
+#define GENCONF_CTRL0_PORT0_RGMII BIT(0)
+#define GENCONF_CTRL0_PORT1_RGMII_MII BIT(1)
+#define GENCONF_CTRL0_PORT1_RGMII BIT(2)
+
/* Various constants */
/* Coalescing */
#define MVPP2_TXDONE_COAL_PKTS_THRESH 15
#define MVPP2_TXDONE_HRTIMER_PERIOD_NS 1000000UL
+#define MVPP2_TXDONE_COAL_USEC 1000
#define MVPP2_RX_COAL_PKTS 32
#define MVPP2_RX_COAL_USEC 100
#define MVPP21_ADDR_SPACE_SZ 0
#define MVPP22_ADDR_SPACE_SZ SZ_64K
-#define MVPP2_MAX_CPUS 4
+#define MVPP2_MAX_THREADS 8
+#define MVPP2_MAX_QVECS MVPP2_MAX_THREADS
enum mvpp2_bm_type {
MVPP2_BM_FREE,
void __iomem *lms_base;
void __iomem *iface_base;
- /* On PPv2.2, each CPU can access the base register through a
- * separate address space, each 64 KB apart from each
- * other.
+ /* On PPv2.2, each "software thread" can access the base
+ * register through a separate address space, each 64 KB apart
+ * from each other. Typically, such address spaces will be
+ * used per CPU.
+ */
+ void __iomem *swth_base[MVPP2_MAX_THREADS];
+
+ /* On PPv2.2, some port control registers are located into the system
+ * controller space. These registers are accessible through a regmap.
*/
- void __iomem *cpu_base[MVPP2_MAX_CPUS];
+ struct regmap *sysctrl_base;
/* Common clocks */
struct clk *pp_clk;
struct tasklet_struct tx_done_tasklet;
};
+struct mvpp2_queue_vector {
+ int irq;
+ struct napi_struct napi;
+ enum { MVPP2_QUEUE_VECTOR_SHARED, MVPP2_QUEUE_VECTOR_PRIVATE } type;
+ int sw_thread_id;
+ u16 sw_thread_mask;
+ int first_rxq;
+ int nrxqs;
+ u32 pending_cause_rx;
+ struct mvpp2_port *port;
+};
+
struct mvpp2_port {
u8 id;
*/
int gop_id;
- int irq;
+ int link_irq;
struct mvpp2 *priv;
void __iomem *base;
struct mvpp2_rx_queue **rxqs;
+ unsigned int nrxqs;
struct mvpp2_tx_queue **txqs;
+ unsigned int ntxqs;
struct net_device *dev;
int pkt_size;
- u32 pending_cause_rx;
- struct napi_struct napi;
-
/* Per-CPU port control */
struct mvpp2_port_pcpu __percpu *pcpu;
phy_interface_t phy_interface;
struct device_node *phy_node;
+ struct phy *comphy;
unsigned int link;
unsigned int duplex;
unsigned int speed;
/* Index of first port's physical RXQ */
u8 first_rxq;
+
+ struct mvpp2_queue_vector qvecs[MVPP2_MAX_QVECS];
+ unsigned int nqvecs;
+ bool has_tx_irqs;
+
+ u32 tx_time_coal;
};
/* The mvpp2_tx_desc and mvpp2_rx_desc structures describe the
/* Index of the TX DMA descriptor to be cleaned up */
int txq_get_index;
+
+ /* DMA buffer for TSO headers */
+ char *tso_headers;
+ dma_addr_t tso_headers_dma;
};
struct mvpp2_tx_queue {
u32 port_map;
};
-/* Static declaractions */
+/* Queue modes */
+#define MVPP2_QDIST_SINGLE_MODE 0
+#define MVPP2_QDIST_MULTI_MODE 1
-/* Number of RXQs used by single port */
-static int rxq_number = MVPP2_DEFAULT_RXQ;
-/* Number of TXQs used by single port */
-static int txq_number = MVPP2_MAX_TXQ;
+static int queue_mode = MVPP2_QDIST_SINGLE_MODE;
+
+module_param(queue_mode, int, 0444);
+MODULE_PARM_DESC(queue_mode, "Set queue_mode (single=0, multi=1)");
#define MVPP2_DRIVER_NAME "mvpp2"
#define MVPP2_DRIVER_VERSION "1.0"
static void mvpp2_write(struct mvpp2 *priv, u32 offset, u32 data)
{
- writel(data, priv->cpu_base[0] + offset);
+ writel(data, priv->swth_base[0] + offset);
}
static u32 mvpp2_read(struct mvpp2 *priv, u32 offset)
{
- return readl(priv->cpu_base[0] + offset);
+ return readl(priv->swth_base[0] + offset);
}
/* These accessors should be used to access:
static void mvpp2_percpu_write(struct mvpp2 *priv, int cpu,
u32 offset, u32 data)
{
- writel(data, priv->cpu_base[cpu] + offset);
+ writel(data, priv->swth_base[cpu] + offset);
}
static u32 mvpp2_percpu_read(struct mvpp2 *priv, int cpu,
u32 offset)
{
- return readl(priv->cpu_base[cpu] + offset);
+ return readl(priv->swth_base[cpu] + offset);
}
static dma_addr_t mvpp2_txdesc_dma_addr_get(struct mvpp2_port *port,
port->pool_long->port_map |= (1 << port->id);
- for (rxq = 0; rxq < rxq_number; rxq++)
+ for (rxq = 0; rxq < port->nrxqs; rxq++)
mvpp2_rxq_long_pool_set(port, rxq, port->pool_long->id);
}
port->pool_short->port_map |= (1 << port->id);
- for (rxq = 0; rxq < rxq_number; rxq++)
+ for (rxq = 0; rxq < port->nrxqs; rxq++)
mvpp2_rxq_short_pool_set(port, rxq,
port->pool_short->id);
}
static inline void mvpp2_interrupts_enable(struct mvpp2_port *port)
{
- int cpu, cpu_mask = 0;
+ int i, sw_thread_mask = 0;
+
+ for (i = 0; i < port->nqvecs; i++)
+ sw_thread_mask |= port->qvecs[i].sw_thread_mask;
- for_each_present_cpu(cpu)
- cpu_mask |= 1 << cpu;
mvpp2_write(port->priv, MVPP2_ISR_ENABLE_REG(port->id),
- MVPP2_ISR_ENABLE_INTERRUPT(cpu_mask));
+ MVPP2_ISR_ENABLE_INTERRUPT(sw_thread_mask));
}
static inline void mvpp2_interrupts_disable(struct mvpp2_port *port)
{
- int cpu, cpu_mask = 0;
+ int i, sw_thread_mask = 0;
+
+ for (i = 0; i < port->nqvecs; i++)
+ sw_thread_mask |= port->qvecs[i].sw_thread_mask;
+
+ mvpp2_write(port->priv, MVPP2_ISR_ENABLE_REG(port->id),
+ MVPP2_ISR_DISABLE_INTERRUPT(sw_thread_mask));
+}
+
+static inline void mvpp2_qvec_interrupt_enable(struct mvpp2_queue_vector *qvec)
+{
+ struct mvpp2_port *port = qvec->port;
+
+ mvpp2_write(port->priv, MVPP2_ISR_ENABLE_REG(port->id),
+ MVPP2_ISR_ENABLE_INTERRUPT(qvec->sw_thread_mask));
+}
+
+static inline void mvpp2_qvec_interrupt_disable(struct mvpp2_queue_vector *qvec)
+{
+ struct mvpp2_port *port = qvec->port;
- for_each_present_cpu(cpu)
- cpu_mask |= 1 << cpu;
mvpp2_write(port->priv, MVPP2_ISR_ENABLE_REG(port->id),
- MVPP2_ISR_DISABLE_INTERRUPT(cpu_mask));
+ MVPP2_ISR_DISABLE_INTERRUPT(qvec->sw_thread_mask));
}
/* Mask the current CPU's Rx/Tx interrupts
static void mvpp2_interrupts_unmask(void *arg)
{
struct mvpp2_port *port = arg;
+ u32 val;
+
+ val = MVPP2_CAUSE_MISC_SUM_MASK |
+ MVPP2_CAUSE_RXQ_OCCUP_DESC_ALL_MASK;
+ if (port->has_tx_irqs)
+ val |= MVPP2_CAUSE_TXQ_OCCUP_DESC_ALL_MASK;
mvpp2_percpu_write(port->priv, smp_processor_id(),
- MVPP2_ISR_RX_TX_MASK_REG(port->id),
- (MVPP2_CAUSE_MISC_SUM_MASK |
- MVPP2_CAUSE_RXQ_OCCUP_DESC_ALL_MASK));
+ MVPP2_ISR_RX_TX_MASK_REG(port->id), val);
+}
+
+static void
+mvpp2_shared_interrupt_mask_unmask(struct mvpp2_port *port, bool mask)
+{
+ u32 val;
+ int i;
+
+ if (port->priv->hw_version != MVPP22)
+ return;
+
+ if (mask)
+ val = 0;
+ else
+ val = MVPP2_CAUSE_RXQ_OCCUP_DESC_ALL_MASK;
+
+ for (i = 0; i < port->nqvecs; i++) {
+ struct mvpp2_queue_vector *v = port->qvecs + i;
+
+ if (v->type != MVPP2_QUEUE_VECTOR_SHARED)
+ continue;
+
+ mvpp2_percpu_write(port->priv, v->sw_thread_id,
+ MVPP2_ISR_RX_TX_MASK_REG(port->id), val);
+ }
}
/* Port configuration routines */
+static void mvpp22_gop_init_rgmii(struct mvpp2_port *port)
+{
+ struct mvpp2 *priv = port->priv;
+ u32 val;
+
+ regmap_read(priv->sysctrl_base, GENCONF_PORT_CTRL0, &val);
+ val |= GENCONF_PORT_CTRL0_BUS_WIDTH_SELECT;
+ regmap_write(priv->sysctrl_base, GENCONF_PORT_CTRL0, val);
+
+ regmap_read(priv->sysctrl_base, GENCONF_CTRL0, &val);
+ if (port->gop_id == 2)
+ val |= GENCONF_CTRL0_PORT0_RGMII | GENCONF_CTRL0_PORT1_RGMII;
+ else if (port->gop_id == 3)
+ val |= GENCONF_CTRL0_PORT1_RGMII_MII;
+ regmap_write(priv->sysctrl_base, GENCONF_CTRL0, val);
+}
+
+static void mvpp22_gop_init_sgmii(struct mvpp2_port *port)
+{
+ struct mvpp2 *priv = port->priv;
+ u32 val;
+
+ regmap_read(priv->sysctrl_base, GENCONF_PORT_CTRL0, &val);
+ val |= GENCONF_PORT_CTRL0_BUS_WIDTH_SELECT |
+ GENCONF_PORT_CTRL0_RX_DATA_SAMPLE;
+ regmap_write(priv->sysctrl_base, GENCONF_PORT_CTRL0, val);
+
+ if (port->gop_id > 1) {
+ regmap_read(priv->sysctrl_base, GENCONF_CTRL0, &val);
+ if (port->gop_id == 2)
+ val &= ~GENCONF_CTRL0_PORT0_RGMII;
+ else if (port->gop_id == 3)
+ val &= ~GENCONF_CTRL0_PORT1_RGMII_MII;
+ regmap_write(priv->sysctrl_base, GENCONF_CTRL0, val);
+ }
+}
+
+static void mvpp22_gop_init_10gkr(struct mvpp2_port *port)
+{
+ struct mvpp2 *priv = port->priv;
+ void __iomem *mpcs = priv->iface_base + MVPP22_MPCS_BASE(port->gop_id);
+ void __iomem *xpcs = priv->iface_base + MVPP22_XPCS_BASE(port->gop_id);
+ u32 val;
+
+ /* XPCS */
+ val = readl(xpcs + MVPP22_XPCS_CFG0);
+ val &= ~(MVPP22_XPCS_CFG0_PCS_MODE(0x3) |
+ MVPP22_XPCS_CFG0_ACTIVE_LANE(0x3));
+ val |= MVPP22_XPCS_CFG0_ACTIVE_LANE(2);
+ writel(val, xpcs + MVPP22_XPCS_CFG0);
+
+ /* MPCS */
+ val = readl(mpcs + MVPP22_MPCS_CTRL);
+ val &= ~MVPP22_MPCS_CTRL_FWD_ERR_CONN;
+ writel(val, mpcs + MVPP22_MPCS_CTRL);
+
+ val = readl(mpcs + MVPP22_MPCS_CLK_RESET);
+ val &= ~(MVPP22_MPCS_CLK_RESET_DIV_RATIO(0x7) | MAC_CLK_RESET_MAC |
+ MAC_CLK_RESET_SD_RX | MAC_CLK_RESET_SD_TX);
+ val |= MVPP22_MPCS_CLK_RESET_DIV_RATIO(1);
+ writel(val, mpcs + MVPP22_MPCS_CLK_RESET);
+
+ val &= ~MVPP22_MPCS_CLK_RESET_DIV_SET;
+ val |= MAC_CLK_RESET_MAC | MAC_CLK_RESET_SD_RX | MAC_CLK_RESET_SD_TX;
+ writel(val, mpcs + MVPP22_MPCS_CLK_RESET);
+}
+
+static int mvpp22_gop_init(struct mvpp2_port *port)
+{
+ struct mvpp2 *priv = port->priv;
+ u32 val;
+
+ if (!priv->sysctrl_base)
+ return 0;
+
+ switch (port->phy_interface) {
+ case PHY_INTERFACE_MODE_RGMII:
+ case PHY_INTERFACE_MODE_RGMII_ID:
+ case PHY_INTERFACE_MODE_RGMII_RXID:
+ case PHY_INTERFACE_MODE_RGMII_TXID:
+ if (port->gop_id == 0)
+ goto invalid_conf;
+ mvpp22_gop_init_rgmii(port);
+ break;
+ case PHY_INTERFACE_MODE_SGMII:
+ mvpp22_gop_init_sgmii(port);
+ break;
+ case PHY_INTERFACE_MODE_10GKR:
+ if (port->gop_id != 0)
+ goto invalid_conf;
+ mvpp22_gop_init_10gkr(port);
+ break;
+ default:
+ goto unsupported_conf;
+ }
+
+ regmap_read(priv->sysctrl_base, GENCONF_PORT_CTRL1, &val);
+ val |= GENCONF_PORT_CTRL1_RESET(port->gop_id) |
+ GENCONF_PORT_CTRL1_EN(port->gop_id);
+ regmap_write(priv->sysctrl_base, GENCONF_PORT_CTRL1, val);
+
+ regmap_read(priv->sysctrl_base, GENCONF_PORT_CTRL0, &val);
+ val |= GENCONF_PORT_CTRL0_CLK_DIV_PHASE_CLR;
+ regmap_write(priv->sysctrl_base, GENCONF_PORT_CTRL0, val);
+
+ regmap_read(priv->sysctrl_base, GENCONF_SOFT_RESET1, &val);
+ val |= GENCONF_SOFT_RESET1_GOP;
+ regmap_write(priv->sysctrl_base, GENCONF_SOFT_RESET1, val);
+
+unsupported_conf:
+ return 0;
+
+invalid_conf:
+ netdev_err(port->dev, "Invalid port configuration\n");
+ return -EINVAL;
+}
+
+static void mvpp22_gop_unmask_irq(struct mvpp2_port *port)
+{
+ u32 val;
+
+ if (phy_interface_mode_is_rgmii(port->phy_interface) ||
+ port->phy_interface == PHY_INTERFACE_MODE_SGMII) {
+ /* Enable the GMAC link status irq for this port */
+ val = readl(port->base + MVPP22_GMAC_INT_SUM_MASK);
+ val |= MVPP22_GMAC_INT_SUM_MASK_LINK_STAT;
+ writel(val, port->base + MVPP22_GMAC_INT_SUM_MASK);
+ }
+
+ if (port->gop_id == 0) {
+ /* Enable the XLG/GIG irqs for this port */
+ val = readl(port->base + MVPP22_XLG_EXT_INT_MASK);
+ if (port->phy_interface == PHY_INTERFACE_MODE_10GKR)
+ val |= MVPP22_XLG_EXT_INT_MASK_XLG;
+ else
+ val |= MVPP22_XLG_EXT_INT_MASK_GIG;
+ writel(val, port->base + MVPP22_XLG_EXT_INT_MASK);
+ }
+}
+
+static void mvpp22_gop_mask_irq(struct mvpp2_port *port)
+{
+ u32 val;
+
+ if (port->gop_id == 0) {
+ val = readl(port->base + MVPP22_XLG_EXT_INT_MASK);
+ val &= ~(MVPP22_XLG_EXT_INT_MASK_XLG |
+ MVPP22_XLG_EXT_INT_MASK_GIG);
+ writel(val, port->base + MVPP22_XLG_EXT_INT_MASK);
+ }
+
+ if (phy_interface_mode_is_rgmii(port->phy_interface) ||
+ port->phy_interface == PHY_INTERFACE_MODE_SGMII) {
+ val = readl(port->base + MVPP22_GMAC_INT_SUM_MASK);
+ val &= ~MVPP22_GMAC_INT_SUM_MASK_LINK_STAT;
+ writel(val, port->base + MVPP22_GMAC_INT_SUM_MASK);
+ }
+}
+
+static void mvpp22_gop_setup_irq(struct mvpp2_port *port)
+{
+ u32 val;
+
+ if (phy_interface_mode_is_rgmii(port->phy_interface) ||
+ port->phy_interface == PHY_INTERFACE_MODE_SGMII) {
+ val = readl(port->base + MVPP22_GMAC_INT_MASK);
+ val |= MVPP22_GMAC_INT_MASK_LINK_STAT;
+ writel(val, port->base + MVPP22_GMAC_INT_MASK);
+ }
+
+ if (port->gop_id == 0) {
+ val = readl(port->base + MVPP22_XLG_INT_MASK);
+ val |= MVPP22_XLG_INT_MASK_LINK;
+ writel(val, port->base + MVPP22_XLG_INT_MASK);
+ }
+
+ mvpp22_gop_unmask_irq(port);
+}
+
+static int mvpp22_comphy_init(struct mvpp2_port *port)
+{
+ enum phy_mode mode;
+ int ret;
+
+ if (!port->comphy)
+ return 0;
+
+ switch (port->phy_interface) {
+ case PHY_INTERFACE_MODE_SGMII:
+ mode = PHY_MODE_SGMII;
+ break;
+ case PHY_INTERFACE_MODE_10GKR:
+ mode = PHY_MODE_10GKR;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ ret = phy_set_mode(port->comphy, mode);
+ if (ret)
+ return ret;
+
+ return phy_power_on(port->comphy);
+}
+
+static void mvpp2_port_mii_gmac_configure_mode(struct mvpp2_port *port)
+{
+ u32 val;
+
+ if (port->phy_interface == PHY_INTERFACE_MODE_SGMII) {
+ val = readl(port->base + MVPP22_GMAC_CTRL_4_REG);
+ val |= MVPP22_CTRL4_SYNC_BYPASS_DIS | MVPP22_CTRL4_DP_CLK_SEL |
+ MVPP22_CTRL4_QSGMII_BYPASS_ACTIVE;
+ val &= ~MVPP22_CTRL4_EXT_PIN_GMII_SEL;
+ writel(val, port->base + MVPP22_GMAC_CTRL_4_REG);
+
+ val = readl(port->base + MVPP2_GMAC_CTRL_2_REG);
+ val |= MVPP2_GMAC_DISABLE_PADDING;
+ val &= ~MVPP2_GMAC_FLOW_CTRL_MASK;
+ writel(val, port->base + MVPP2_GMAC_CTRL_2_REG);
+ } else if (phy_interface_mode_is_rgmii(port->phy_interface)) {
+ val = readl(port->base + MVPP22_GMAC_CTRL_4_REG);
+ val |= MVPP22_CTRL4_EXT_PIN_GMII_SEL |
+ MVPP22_CTRL4_SYNC_BYPASS_DIS |
+ MVPP22_CTRL4_QSGMII_BYPASS_ACTIVE;
+ val &= ~MVPP22_CTRL4_DP_CLK_SEL;
+ writel(val, port->base + MVPP22_GMAC_CTRL_4_REG);
+
+ val = readl(port->base + MVPP2_GMAC_CTRL_2_REG);
+ val &= ~MVPP2_GMAC_DISABLE_PADDING;
+ writel(val, port->base + MVPP2_GMAC_CTRL_2_REG);
+ }
+
+ /* The port is connected to a copper PHY */
+ val = readl(port->base + MVPP2_GMAC_CTRL_0_REG);
+ val &= ~MVPP2_GMAC_PORT_TYPE_MASK;
+ writel(val, port->base + MVPP2_GMAC_CTRL_0_REG);
+
+ val = readl(port->base + MVPP2_GMAC_AUTONEG_CONFIG);
+ val |= MVPP2_GMAC_IN_BAND_AUTONEG_BYPASS |
+ MVPP2_GMAC_AN_SPEED_EN | MVPP2_GMAC_FLOW_CTRL_AUTONEG |
+ MVPP2_GMAC_AN_DUPLEX_EN;
+ if (port->phy_interface == PHY_INTERFACE_MODE_SGMII)
+ val |= MVPP2_GMAC_IN_BAND_AUTONEG;
+ writel(val, port->base + MVPP2_GMAC_AUTONEG_CONFIG);
+}
+
+static void mvpp2_port_mii_gmac_configure(struct mvpp2_port *port)
+{
+ u32 val;
+
+ /* Force link down */
+ val = readl(port->base + MVPP2_GMAC_AUTONEG_CONFIG);
+ val &= ~MVPP2_GMAC_FORCE_LINK_PASS;
+ val |= MVPP2_GMAC_FORCE_LINK_DOWN;
+ writel(val, port->base + MVPP2_GMAC_AUTONEG_CONFIG);
+
+ /* Set the GMAC in a reset state */
+ val = readl(port->base + MVPP2_GMAC_CTRL_2_REG);
+ val |= MVPP2_GMAC_PORT_RESET_MASK;
+ writel(val, port->base + MVPP2_GMAC_CTRL_2_REG);
+
+ /* Configure the PCS and in-band AN */
+ val = readl(port->base + MVPP2_GMAC_CTRL_2_REG);
+ if (port->phy_interface == PHY_INTERFACE_MODE_SGMII) {
+ val |= MVPP2_GMAC_INBAND_AN_MASK | MVPP2_GMAC_PCS_ENABLE_MASK;
+ } else if (phy_interface_mode_is_rgmii(port->phy_interface)) {
+ val &= ~MVPP2_GMAC_PCS_ENABLE_MASK;
+ val |= MVPP2_GMAC_PORT_RGMII_MASK;
+ }
+ writel(val, port->base + MVPP2_GMAC_CTRL_2_REG);
+
+ mvpp2_port_mii_gmac_configure_mode(port);
+
+ /* Unset the GMAC reset state */
+ val = readl(port->base + MVPP2_GMAC_CTRL_2_REG);
+ val &= ~MVPP2_GMAC_PORT_RESET_MASK;
+ writel(val, port->base + MVPP2_GMAC_CTRL_2_REG);
+
+ /* Stop forcing link down */
+ val = readl(port->base + MVPP2_GMAC_AUTONEG_CONFIG);
+ val &= ~MVPP2_GMAC_FORCE_LINK_DOWN;
+ writel(val, port->base + MVPP2_GMAC_AUTONEG_CONFIG);
+}
+
+static void mvpp2_port_mii_xlg_configure(struct mvpp2_port *port)
+{
+ u32 val;
+
+ if (port->gop_id != 0)
+ return;
+
+ val = readl(port->base + MVPP22_XLG_CTRL0_REG);
+ val |= MVPP22_XLG_CTRL0_RX_FLOW_CTRL_EN;
+ writel(val, port->base + MVPP22_XLG_CTRL0_REG);
+
+ val = readl(port->base + MVPP22_XLG_CTRL4_REG);
+ val &= ~MVPP22_XLG_CTRL4_MACMODSELECT_GMAC;
+ val |= MVPP22_XLG_CTRL4_FWD_FC | MVPP22_XLG_CTRL4_FWD_PFC;
+ writel(val, port->base + MVPP22_XLG_CTRL4_REG);
+}
+
static void mvpp22_port_mii_set(struct mvpp2_port *port)
{
u32 val;
writel(val, port->base + MVPP22_XLG_CTRL3_REG);
}
-
- val = readl(port->base + MVPP22_GMAC_CTRL_4_REG);
- if (port->phy_interface == PHY_INTERFACE_MODE_RGMII)
- val |= MVPP22_CTRL4_EXT_PIN_GMII_SEL;
- else
- val &= ~MVPP22_CTRL4_EXT_PIN_GMII_SEL;
- val &= ~MVPP22_CTRL4_DP_CLK_SEL;
- val |= MVPP22_CTRL4_SYNC_BYPASS;
- val |= MVPP22_CTRL4_QSGMII_BYPASS_ACTIVE;
- writel(val, port->base + MVPP22_GMAC_CTRL_4_REG);
}
static void mvpp2_port_mii_set(struct mvpp2_port *port)
{
- u32 val;
-
if (port->priv->hw_version == MVPP22)
mvpp22_port_mii_set(port);
- val = readl(port->base + MVPP2_GMAC_CTRL_2_REG);
-
- switch (port->phy_interface) {
- case PHY_INTERFACE_MODE_SGMII:
- val |= MVPP2_GMAC_INBAND_AN_MASK;
- break;
- case PHY_INTERFACE_MODE_RGMII:
- val |= MVPP2_GMAC_PORT_RGMII_MASK;
- default:
- val &= ~MVPP2_GMAC_PCS_ENABLE_MASK;
- }
-
- writel(val, port->base + MVPP2_GMAC_CTRL_2_REG);
+ if (phy_interface_mode_is_rgmii(port->phy_interface) ||
+ port->phy_interface == PHY_INTERFACE_MODE_SGMII)
+ mvpp2_port_mii_gmac_configure(port);
+ else if (port->phy_interface == PHY_INTERFACE_MODE_10GKR)
+ mvpp2_port_mii_xlg_configure(port);
}
static void mvpp2_port_fc_adv_enable(struct mvpp2_port *port)
writel(val, port->base + MVPP2_GMAC_CTRL_0_REG);
}
+/* Change maximum receive size of the port */
+static inline void mvpp2_xlg_max_rx_size_set(struct mvpp2_port *port)
+{
+ u32 val;
+
+ val = readl(port->base + MVPP22_XLG_CTRL1_REG);
+ val &= ~MVPP22_XLG_CTRL1_FRAMESIZELIMIT_MASK;
+ val |= ((port->pkt_size - MVPP2_MH_SIZE) / 2) <<
+ MVPP22_XLG_CTRL1_FRAMESIZELIMIT_OFFS;
+ writel(val, port->base + MVPP22_XLG_CTRL1_REG);
+}
+
/* Set defaults to the MVPP2 port */
static void mvpp2_defaults_set(struct mvpp2_port *port)
{
MVPP2_RX_LOW_LATENCY_PKT_SIZE(256));
/* Enable Rx cache snoop */
- for (lrxq = 0; lrxq < rxq_number; lrxq++) {
+ for (lrxq = 0; lrxq < port->nrxqs; lrxq++) {
queue = port->rxqs[lrxq]->id;
val = mvpp2_read(port->priv, MVPP2_RXQ_CONFIG_REG(queue));
val |= MVPP2_SNOOP_PKT_SIZE_MASK |
u32 val;
int lrxq, queue;
- for (lrxq = 0; lrxq < rxq_number; lrxq++) {
+ for (lrxq = 0; lrxq < port->nrxqs; lrxq++) {
queue = port->rxqs[lrxq]->id;
val = mvpp2_read(port->priv, MVPP2_RXQ_CONFIG_REG(queue));
val &= ~MVPP2_RXQ_DISABLE_MASK;
u32 val;
int lrxq, queue;
- for (lrxq = 0; lrxq < rxq_number; lrxq++) {
+ for (lrxq = 0; lrxq < port->nrxqs; lrxq++) {
queue = port->rxqs[lrxq]->id;
val = mvpp2_read(port->priv, MVPP2_RXQ_CONFIG_REG(queue));
val |= MVPP2_RXQ_DISABLE_MASK;
/* Enable all initialized TXs. */
qmap = 0;
- for (queue = 0; queue < txq_number; queue++) {
+ for (queue = 0; queue < port->ntxqs; queue++) {
struct mvpp2_tx_queue *txq = port->txqs[queue];
if (txq->descs)
struct mvpp2_port *port = arg;
int queue;
- for (queue = 0; queue < txq_number; queue++) {
+ for (queue = 0; queue < port->ntxqs; queue++) {
int id = port->txqs[queue]->id;
mvpp2_percpu_read(port->priv, smp_processor_id(),
mvpp2_write(port->priv, MVPP2_TXP_SCHED_TOKEN_SIZE_REG, val);
}
- for (txq = 0; txq < txq_number; txq++) {
+ for (txq = 0; txq < port->ntxqs; txq++) {
val = mvpp2_read(port->priv,
MVPP2_TXQ_SCHED_TOKEN_SIZE_REG(txq));
size = val & MVPP2_TXQ_TOKEN_SIZE_MAX;
put_cpu();
}
+/* For some reason in the LSP this is done on each CPU. Why ? */
+static void mvpp2_tx_pkts_coal_set(struct mvpp2_port *port,
+ struct mvpp2_tx_queue *txq)
+{
+ int cpu = get_cpu();
+ u32 val;
+
+ if (txq->done_pkts_coal > MVPP2_TXQ_THRESH_MASK)
+ txq->done_pkts_coal = MVPP2_TXQ_THRESH_MASK;
+
+ val = (txq->done_pkts_coal << MVPP2_TXQ_THRESH_OFFSET);
+ mvpp2_percpu_write(port->priv, cpu, MVPP2_TXQ_NUM_REG, txq->id);
+ mvpp2_percpu_write(port->priv, cpu, MVPP2_TXQ_THRESH_REG, val);
+
+ put_cpu();
+}
+
static u32 mvpp2_usec_to_cycles(u32 usec, unsigned long clk_hz)
{
u64 tmp = (u64)clk_hz * usec;
mvpp2_write(port->priv, MVPP2_ISR_RX_THRESHOLD_REG(rxq->id), val);
}
+static void mvpp2_tx_time_coal_set(struct mvpp2_port *port)
+{
+ unsigned long freq = port->priv->tclk;
+ u32 val = mvpp2_usec_to_cycles(port->tx_time_coal, freq);
+
+ if (val > MVPP2_MAX_ISR_TX_THRESHOLD) {
+ port->tx_time_coal =
+ mvpp2_cycles_to_usec(MVPP2_MAX_ISR_TX_THRESHOLD, freq);
+
+ /* re-evaluate to get actual register value */
+ val = mvpp2_usec_to_cycles(port->tx_time_coal, freq);
+ }
+
+ mvpp2_write(port->priv, MVPP2_ISR_TX_THRESHOLD_REG(port->id), val);
+}
+
/* Free Tx queue skbuffs */
static void mvpp2_txq_bufs_free(struct mvpp2_port *port,
struct mvpp2_tx_queue *txq,
netif_tx_wake_queue(nq);
}
-static unsigned int mvpp2_tx_done(struct mvpp2_port *port, u32 cause)
+static unsigned int mvpp2_tx_done(struct mvpp2_port *port, u32 cause,
+ int cpu)
{
struct mvpp2_tx_queue *txq;
struct mvpp2_txq_pcpu *txq_pcpu;
if (!txq)
break;
- txq_pcpu = this_cpu_ptr(txq->pcpu);
+ txq_pcpu = per_cpu_ptr(txq->pcpu, cpu);
if (txq_pcpu->count) {
mvpp2_txq_done(port, txq, txq_pcpu);
/* Allocate and initialize descriptors for aggr TXQ */
static int mvpp2_aggr_txq_init(struct platform_device *pdev,
- struct mvpp2_tx_queue *aggr_txq,
- int desc_num, int cpu,
+ struct mvpp2_tx_queue *aggr_txq, int cpu,
struct mvpp2 *priv)
{
u32 txq_dma;
/* Allocate memory for TX descriptors */
aggr_txq->descs = dma_alloc_coherent(&pdev->dev,
- desc_num * MVPP2_DESC_ALIGNED_SIZE,
+ MVPP2_AGGR_TXQ_SIZE * MVPP2_DESC_ALIGNED_SIZE,
&aggr_txq->descs_dma, GFP_KERNEL);
if (!aggr_txq->descs)
return -ENOMEM;
MVPP22_AGGR_TXQ_DESC_ADDR_OFFS;
mvpp2_write(priv, MVPP2_AGGR_TXQ_DESC_ADDR_REG(cpu), txq_dma);
- mvpp2_write(priv, MVPP2_AGGR_TXQ_DESC_SIZE_REG(cpu), desc_num);
+ mvpp2_write(priv, MVPP2_AGGR_TXQ_DESC_SIZE_REG(cpu),
+ MVPP2_AGGR_TXQ_SIZE);
return 0;
}
txq_pcpu->reserved_num = 0;
txq_pcpu->txq_put_index = 0;
txq_pcpu->txq_get_index = 0;
+
+ txq_pcpu->tso_headers =
+ dma_alloc_coherent(port->dev->dev.parent,
+ MVPP2_AGGR_TXQ_SIZE * TSO_HEADER_SIZE,
+ &txq_pcpu->tso_headers_dma,
+ GFP_KERNEL);
+ if (!txq_pcpu->tso_headers)
+ goto cleanup;
}
return 0;
for_each_present_cpu(cpu) {
txq_pcpu = per_cpu_ptr(txq->pcpu, cpu);
kfree(txq_pcpu->buffs);
+
+ dma_free_coherent(port->dev->dev.parent,
+ MVPP2_AGGR_TXQ_SIZE * MVPP2_DESC_ALIGNED_SIZE,
+ txq_pcpu->tso_headers,
+ txq_pcpu->tso_headers_dma);
}
dma_free_coherent(port->dev->dev.parent,
for_each_present_cpu(cpu) {
txq_pcpu = per_cpu_ptr(txq->pcpu, cpu);
kfree(txq_pcpu->buffs);
+
+ dma_free_coherent(port->dev->dev.parent,
+ MVPP2_AGGR_TXQ_SIZE * MVPP2_DESC_ALIGNED_SIZE,
+ txq_pcpu->tso_headers,
+ txq_pcpu->tso_headers_dma);
}
if (txq->descs)
val |= MVPP2_TX_PORT_FLUSH_MASK(port->id);
mvpp2_write(port->priv, MVPP2_TX_PORT_FLUSH_REG, val);
- for (queue = 0; queue < txq_number; queue++) {
+ for (queue = 0; queue < port->ntxqs; queue++) {
txq = port->txqs[queue];
mvpp2_txq_clean(port, txq);
mvpp2_txq_deinit(port, txq);
{
int queue;
- for (queue = 0; queue < rxq_number; queue++)
+ for (queue = 0; queue < port->nrxqs; queue++)
mvpp2_rxq_deinit(port, port->rxqs[queue]);
}
{
int queue, err;
- for (queue = 0; queue < rxq_number; queue++) {
+ for (queue = 0; queue < port->nrxqs; queue++) {
err = mvpp2_rxq_init(port, port->rxqs[queue]);
if (err)
goto err_cleanup;
struct mvpp2_tx_queue *txq;
int queue, err;
- for (queue = 0; queue < txq_number; queue++) {
+ for (queue = 0; queue < port->ntxqs; queue++) {
txq = port->txqs[queue];
err = mvpp2_txq_init(port, txq);
if (err)
goto err_cleanup;
}
+ if (port->has_tx_irqs) {
+ mvpp2_tx_time_coal_set(port);
+ for (queue = 0; queue < port->ntxqs; queue++) {
+ txq = port->txqs[queue];
+ mvpp2_tx_pkts_coal_set(port, txq);
+ }
+ }
+
on_each_cpu(mvpp2_txq_sent_counter_clear, port, 1);
return 0;
/* The callback for per-port interrupt */
static irqreturn_t mvpp2_isr(int irq, void *dev_id)
{
- struct mvpp2_port *port = (struct mvpp2_port *)dev_id;
+ struct mvpp2_queue_vector *qv = dev_id;
- mvpp2_interrupts_disable(port);
+ mvpp2_qvec_interrupt_disable(qv);
- napi_schedule(&port->napi);
+ napi_schedule(&qv->napi);
return IRQ_HANDLED;
}
-/* Adjust link */
-static void mvpp2_link_event(struct net_device *dev)
+/* Per-port interrupt for link status changes */
+static irqreturn_t mvpp2_link_status_isr(int irq, void *dev_id)
{
- struct mvpp2_port *port = netdev_priv(dev);
- struct phy_device *phydev = dev->phydev;
- int status_change = 0;
+ struct mvpp2_port *port = (struct mvpp2_port *)dev_id;
+ struct net_device *dev = port->dev;
+ bool event = false, link = false;
u32 val;
- if (phydev->link) {
- if ((port->speed != phydev->speed) ||
- (port->duplex != phydev->duplex)) {
- u32 val;
+ mvpp22_gop_mask_irq(port);
- val = readl(port->base + MVPP2_GMAC_AUTONEG_CONFIG);
- val &= ~(MVPP2_GMAC_CONFIG_MII_SPEED |
- MVPP2_GMAC_CONFIG_GMII_SPEED |
- MVPP2_GMAC_CONFIG_FULL_DUPLEX |
- MVPP2_GMAC_AN_SPEED_EN |
- MVPP2_GMAC_AN_DUPLEX_EN);
+ if (port->gop_id == 0 &&
+ port->phy_interface == PHY_INTERFACE_MODE_10GKR) {
+ val = readl(port->base + MVPP22_XLG_INT_STAT);
+ if (val & MVPP22_XLG_INT_STAT_LINK) {
+ event = true;
+ val = readl(port->base + MVPP22_XLG_STATUS);
+ if (val & MVPP22_XLG_STATUS_LINK_UP)
+ link = true;
+ }
+ } else if (phy_interface_mode_is_rgmii(port->phy_interface) ||
+ port->phy_interface == PHY_INTERFACE_MODE_SGMII) {
+ val = readl(port->base + MVPP22_GMAC_INT_STAT);
+ if (val & MVPP22_GMAC_INT_STAT_LINK) {
+ event = true;
+ val = readl(port->base + MVPP2_GMAC_STATUS0);
+ if (val & MVPP2_GMAC_STATUS0_LINK_UP)
+ link = true;
+ }
+ }
- if (phydev->duplex)
- val |= MVPP2_GMAC_CONFIG_FULL_DUPLEX;
+ if (!netif_running(dev) || !event)
+ goto handled;
- if (phydev->speed == SPEED_1000)
- val |= MVPP2_GMAC_CONFIG_GMII_SPEED;
- else if (phydev->speed == SPEED_100)
- val |= MVPP2_GMAC_CONFIG_MII_SPEED;
+ if (link) {
+ mvpp2_interrupts_enable(port);
- writel(val, port->base + MVPP2_GMAC_AUTONEG_CONFIG);
+ mvpp2_egress_enable(port);
+ mvpp2_ingress_enable(port);
+ netif_carrier_on(dev);
+ netif_tx_wake_all_queues(dev);
+ } else {
+ netif_tx_stop_all_queues(dev);
+ netif_carrier_off(dev);
+ mvpp2_ingress_disable(port);
+ mvpp2_egress_disable(port);
- port->duplex = phydev->duplex;
- port->speed = phydev->speed;
- }
+ mvpp2_interrupts_disable(port);
}
- if (phydev->link != port->link) {
- if (!phydev->link) {
- port->duplex = -1;
- port->speed = 0;
+handled:
+ mvpp22_gop_unmask_irq(port);
+ return IRQ_HANDLED;
+}
+
+static void mvpp2_gmac_set_autoneg(struct mvpp2_port *port,
+ struct phy_device *phydev)
+{
+ u32 val;
+
+ if (port->phy_interface != PHY_INTERFACE_MODE_RGMII &&
+ port->phy_interface != PHY_INTERFACE_MODE_RGMII_ID &&
+ port->phy_interface != PHY_INTERFACE_MODE_RGMII_RXID &&
+ port->phy_interface != PHY_INTERFACE_MODE_RGMII_TXID &&
+ port->phy_interface != PHY_INTERFACE_MODE_SGMII)
+ return;
+
+ val = readl(port->base + MVPP2_GMAC_AUTONEG_CONFIG);
+ val &= ~(MVPP2_GMAC_CONFIG_MII_SPEED |
+ MVPP2_GMAC_CONFIG_GMII_SPEED |
+ MVPP2_GMAC_CONFIG_FULL_DUPLEX |
+ MVPP2_GMAC_AN_SPEED_EN |
+ MVPP2_GMAC_AN_DUPLEX_EN);
+
+ if (phydev->duplex)
+ val |= MVPP2_GMAC_CONFIG_FULL_DUPLEX;
+
+ if (phydev->speed == SPEED_1000)
+ val |= MVPP2_GMAC_CONFIG_GMII_SPEED;
+ else if (phydev->speed == SPEED_100)
+ val |= MVPP2_GMAC_CONFIG_MII_SPEED;
+
+ writel(val, port->base + MVPP2_GMAC_AUTONEG_CONFIG);
+}
+
+/* Adjust link */
+static void mvpp2_link_event(struct net_device *dev)
+{
+ struct mvpp2_port *port = netdev_priv(dev);
+ struct phy_device *phydev = dev->phydev;
+ bool link_reconfigured = false;
+ u32 val;
+
+ if (phydev->link) {
+ if (port->phy_interface != phydev->interface && port->comphy) {
+ /* disable current port for reconfiguration */
+ mvpp2_interrupts_disable(port);
+ netif_carrier_off(port->dev);
+ mvpp2_port_disable(port);
+ phy_power_off(port->comphy);
+
+ /* comphy reconfiguration */
+ port->phy_interface = phydev->interface;
+ mvpp22_comphy_init(port);
+
+ /* gop/mac reconfiguration */
+ mvpp22_gop_init(port);
+ mvpp2_port_mii_set(port);
+
+ link_reconfigured = true;
}
- port->link = phydev->link;
- status_change = 1;
+ if ((port->speed != phydev->speed) ||
+ (port->duplex != phydev->duplex)) {
+ mvpp2_gmac_set_autoneg(port, phydev);
+
+ port->duplex = phydev->duplex;
+ port->speed = phydev->speed;
+ }
}
- if (status_change) {
+ if (phydev->link != port->link || link_reconfigured) {
+ port->link = phydev->link;
+
if (phydev->link) {
- val = readl(port->base + MVPP2_GMAC_AUTONEG_CONFIG);
- val |= (MVPP2_GMAC_FORCE_LINK_PASS |
- MVPP2_GMAC_FORCE_LINK_DOWN);
- writel(val, port->base + MVPP2_GMAC_AUTONEG_CONFIG);
+ if (port->phy_interface == PHY_INTERFACE_MODE_RGMII ||
+ port->phy_interface == PHY_INTERFACE_MODE_RGMII_ID ||
+ port->phy_interface == PHY_INTERFACE_MODE_RGMII_RXID ||
+ port->phy_interface == PHY_INTERFACE_MODE_RGMII_TXID ||
+ port->phy_interface == PHY_INTERFACE_MODE_SGMII) {
+ val = readl(port->base + MVPP2_GMAC_AUTONEG_CONFIG);
+ val |= (MVPP2_GMAC_FORCE_LINK_PASS |
+ MVPP2_GMAC_FORCE_LINK_DOWN);
+ writel(val, port->base + MVPP2_GMAC_AUTONEG_CONFIG);
+ }
+
+ mvpp2_interrupts_enable(port);
+ mvpp2_port_enable(port);
+
mvpp2_egress_enable(port);
mvpp2_ingress_enable(port);
+ netif_carrier_on(dev);
+ netif_tx_wake_all_queues(dev);
} else {
+ port->duplex = -1;
+ port->speed = 0;
+
+ netif_tx_stop_all_queues(dev);
+ netif_carrier_off(dev);
mvpp2_ingress_disable(port);
mvpp2_egress_disable(port);
+
+ mvpp2_port_disable(port);
+ mvpp2_interrupts_disable(port);
}
+
phy_print_status(phydev);
}
}
port_pcpu->timer_scheduled = false;
/* Process all the Tx queues */
- cause = (1 << txq_number) - 1;
- tx_todo = mvpp2_tx_done(port, cause);
+ cause = (1 << port->ntxqs) - 1;
+ tx_todo = mvpp2_tx_done(port, cause, smp_processor_id());
/* Set the timer in case not all the packets were processed */
if (tx_todo)
}
/* Main rx processing */
-static int mvpp2_rx(struct mvpp2_port *port, int rx_todo,
- struct mvpp2_rx_queue *rxq)
+static int mvpp2_rx(struct mvpp2_port *port, struct napi_struct *napi,
+ int rx_todo, struct mvpp2_rx_queue *rxq)
{
struct net_device *dev = port->dev;
int rx_received;
skb->protocol = eth_type_trans(skb, dev);
mvpp2_rx_csum(port, rx_status, skb);
- napi_gro_receive(&port->napi, skb);
+ napi_gro_receive(napi, skb);
}
if (rcvd_pkts) {
return -ENOMEM;
}
+static inline void mvpp2_tso_put_hdr(struct sk_buff *skb,
+ struct net_device *dev,
+ struct mvpp2_tx_queue *txq,
+ struct mvpp2_tx_queue *aggr_txq,
+ struct mvpp2_txq_pcpu *txq_pcpu,
+ int hdr_sz)
+{
+ struct mvpp2_port *port = netdev_priv(dev);
+ struct mvpp2_tx_desc *tx_desc = mvpp2_txq_next_desc_get(aggr_txq);
+ dma_addr_t addr;
+
+ mvpp2_txdesc_txq_set(port, tx_desc, txq->id);
+ mvpp2_txdesc_size_set(port, tx_desc, hdr_sz);
+
+ addr = txq_pcpu->tso_headers_dma +
+ txq_pcpu->txq_put_index * TSO_HEADER_SIZE;
+ mvpp2_txdesc_offset_set(port, tx_desc, addr & MVPP2_TX_DESC_ALIGN);
+ mvpp2_txdesc_dma_addr_set(port, tx_desc, addr & ~MVPP2_TX_DESC_ALIGN);
+
+ mvpp2_txdesc_cmd_set(port, tx_desc, mvpp2_skb_tx_csum(port, skb) |
+ MVPP2_TXD_F_DESC |
+ MVPP2_TXD_PADDING_DISABLE);
+ mvpp2_txq_inc_put(port, txq_pcpu, NULL, tx_desc);
+}
+
+static inline int mvpp2_tso_put_data(struct sk_buff *skb,
+ struct net_device *dev, struct tso_t *tso,
+ struct mvpp2_tx_queue *txq,
+ struct mvpp2_tx_queue *aggr_txq,
+ struct mvpp2_txq_pcpu *txq_pcpu,
+ int sz, bool left, bool last)
+{
+ struct mvpp2_port *port = netdev_priv(dev);
+ struct mvpp2_tx_desc *tx_desc = mvpp2_txq_next_desc_get(aggr_txq);
+ dma_addr_t buf_dma_addr;
+
+ mvpp2_txdesc_txq_set(port, tx_desc, txq->id);
+ mvpp2_txdesc_size_set(port, tx_desc, sz);
+
+ buf_dma_addr = dma_map_single(dev->dev.parent, tso->data, sz,
+ DMA_TO_DEVICE);
+ if (unlikely(dma_mapping_error(dev->dev.parent, buf_dma_addr))) {
+ mvpp2_txq_desc_put(txq);
+ return -ENOMEM;
+ }
+
+ mvpp2_txdesc_offset_set(port, tx_desc,
+ buf_dma_addr & MVPP2_TX_DESC_ALIGN);
+ mvpp2_txdesc_dma_addr_set(port, tx_desc,
+ buf_dma_addr & ~MVPP2_TX_DESC_ALIGN);
+
+ if (!left) {
+ mvpp2_txdesc_cmd_set(port, tx_desc, MVPP2_TXD_L_DESC);
+ if (last) {
+ mvpp2_txq_inc_put(port, txq_pcpu, skb, tx_desc);
+ return 0;
+ }
+ } else {
+ mvpp2_txdesc_cmd_set(port, tx_desc, 0);
+ }
+
+ mvpp2_txq_inc_put(port, txq_pcpu, NULL, tx_desc);
+ return 0;
+}
+
+static int mvpp2_tx_tso(struct sk_buff *skb, struct net_device *dev,
+ struct mvpp2_tx_queue *txq,
+ struct mvpp2_tx_queue *aggr_txq,
+ struct mvpp2_txq_pcpu *txq_pcpu)
+{
+ struct mvpp2_port *port = netdev_priv(dev);
+ struct tso_t tso;
+ int hdr_sz = skb_transport_offset(skb) + tcp_hdrlen(skb);
+ int i, len, descs = 0;
+
+ /* Check number of available descriptors */
+ if (mvpp2_aggr_desc_num_check(port->priv, aggr_txq,
+ tso_count_descs(skb)) ||
+ mvpp2_txq_reserved_desc_num_proc(port->priv, txq, txq_pcpu,
+ tso_count_descs(skb)))
+ return 0;
+
+ tso_start(skb, &tso);
+ len = skb->len - hdr_sz;
+ while (len > 0) {
+ int left = min_t(int, skb_shinfo(skb)->gso_size, len);
+ char *hdr = txq_pcpu->tso_headers +
+ txq_pcpu->txq_put_index * TSO_HEADER_SIZE;
+
+ len -= left;
+ descs++;
+
+ tso_build_hdr(skb, hdr, &tso, left, len == 0);
+ mvpp2_tso_put_hdr(skb, dev, txq, aggr_txq, txq_pcpu, hdr_sz);
+
+ while (left > 0) {
+ int sz = min_t(int, tso.size, left);
+ left -= sz;
+ descs++;
+
+ if (mvpp2_tso_put_data(skb, dev, &tso, txq, aggr_txq,
+ txq_pcpu, sz, left, len == 0))
+ goto release;
+ tso_build_data(skb, &tso, sz);
+ }
+ }
+
+ return descs;
+
+release:
+ for (i = descs - 1; i >= 0; i--) {
+ struct mvpp2_tx_desc *tx_desc = txq->descs + i;
+ tx_desc_unmap_put(port, txq, tx_desc);
+ }
+ return 0;
+}
+
/* Main tx processing */
static int mvpp2_tx(struct sk_buff *skb, struct net_device *dev)
{
txq_pcpu = this_cpu_ptr(txq->pcpu);
aggr_txq = &port->priv->aggr_txqs[smp_processor_id()];
+ if (skb_is_gso(skb)) {
+ frags = mvpp2_tx_tso(skb, dev, txq, aggr_txq, txq_pcpu);
+ goto out;
+ }
frags = skb_shinfo(skb)->nr_frags + 1;
/* Check number of available descriptors */
}
}
- txq_pcpu->reserved_num -= frags;
- txq_pcpu->count += frags;
- aggr_txq->count += frags;
-
- /* Enable transmit */
- wmb();
- mvpp2_aggr_txq_pend_desc_add(port, frags);
-
- if (txq_pcpu->size - txq_pcpu->count < MAX_SKB_FRAGS + 1) {
- struct netdev_queue *nq = netdev_get_tx_queue(dev, txq_id);
-
- netif_tx_stop_queue(nq);
- }
out:
if (frags > 0) {
struct mvpp2_pcpu_stats *stats = this_cpu_ptr(port->stats);
+ struct netdev_queue *nq = netdev_get_tx_queue(dev, txq_id);
+
+ txq_pcpu->reserved_num -= frags;
+ txq_pcpu->count += frags;
+ aggr_txq->count += frags;
+
+ /* Enable transmit */
+ wmb();
+ mvpp2_aggr_txq_pend_desc_add(port, frags);
+
+ if (txq_pcpu->size - txq_pcpu->count < MAX_SKB_FRAGS + 1)
+ netif_tx_stop_queue(nq);
u64_stats_update_begin(&stats->syncp);
stats->tx_packets++;
mvpp2_txq_done(port, txq, txq_pcpu);
/* Set the timer in case not all frags were processed */
- if (txq_pcpu->count <= frags && txq_pcpu->count > 0) {
+ if (!port->has_tx_irqs && txq_pcpu->count <= frags &&
+ txq_pcpu->count > 0) {
struct mvpp2_port_pcpu *port_pcpu = this_cpu_ptr(port->pcpu);
mvpp2_timer_set(port_pcpu);
static int mvpp2_poll(struct napi_struct *napi, int budget)
{
- u32 cause_rx_tx, cause_rx, cause_misc;
+ u32 cause_rx_tx, cause_rx, cause_tx, cause_misc;
int rx_done = 0;
struct mvpp2_port *port = netdev_priv(napi->dev);
+ struct mvpp2_queue_vector *qv;
int cpu = smp_processor_id();
+ qv = container_of(napi, struct mvpp2_queue_vector, napi);
+
/* Rx/Tx cause register
*
* Bits 0-15: each bit indicates received packets on the Rx queue
*
* Each CPU has its own Rx/Tx cause register
*/
- cause_rx_tx = mvpp2_percpu_read(port->priv, cpu,
+ cause_rx_tx = mvpp2_percpu_read(port->priv, qv->sw_thread_id,
MVPP2_ISR_RX_TX_CAUSE_REG(port->id));
- cause_rx_tx &= ~MVPP2_CAUSE_TXQ_OCCUP_DESC_ALL_MASK;
- cause_misc = cause_rx_tx & MVPP2_CAUSE_MISC_SUM_MASK;
+ cause_misc = cause_rx_tx & MVPP2_CAUSE_MISC_SUM_MASK;
if (cause_misc) {
mvpp2_cause_error(port->dev, cause_misc);
cause_rx_tx & ~MVPP2_CAUSE_MISC_SUM_MASK);
}
- cause_rx = cause_rx_tx & MVPP2_CAUSE_RXQ_OCCUP_DESC_ALL_MASK;
+ cause_tx = cause_rx_tx & MVPP2_CAUSE_TXQ_OCCUP_DESC_ALL_MASK;
+ if (cause_tx) {
+ cause_tx >>= MVPP2_CAUSE_TXQ_OCCUP_DESC_ALL_OFFSET;
+ mvpp2_tx_done(port, cause_tx, qv->sw_thread_id);
+ }
/* Process RX packets */
- cause_rx |= port->pending_cause_rx;
+ cause_rx = cause_rx_tx & MVPP2_CAUSE_RXQ_OCCUP_DESC_ALL_MASK;
+ cause_rx <<= qv->first_rxq;
+ cause_rx |= qv->pending_cause_rx;
while (cause_rx && budget > 0) {
int count;
struct mvpp2_rx_queue *rxq;
if (!rxq)
break;
- count = mvpp2_rx(port, budget, rxq);
+ count = mvpp2_rx(port, napi, budget, rxq);
rx_done += count;
budget -= count;
if (budget > 0) {
cause_rx = 0;
napi_complete_done(napi, rx_done);
- mvpp2_interrupts_enable(port);
+ mvpp2_qvec_interrupt_enable(qv);
}
- port->pending_cause_rx = cause_rx;
+ qv->pending_cause_rx = cause_rx;
return rx_done;
}
static void mvpp2_start_dev(struct mvpp2_port *port)
{
struct net_device *ndev = port->dev;
+ int i;
+
+ if (port->gop_id == 0 &&
+ (port->phy_interface == PHY_INTERFACE_MODE_XAUI ||
+ port->phy_interface == PHY_INTERFACE_MODE_10GKR))
+ mvpp2_xlg_max_rx_size_set(port);
+ else
+ mvpp2_gmac_max_rx_size_set(port);
- mvpp2_gmac_max_rx_size_set(port);
mvpp2_txp_max_tx_size_set(port);
- napi_enable(&port->napi);
+ for (i = 0; i < port->nqvecs; i++)
+ napi_enable(&port->qvecs[i].napi);
/* Enable interrupts on all CPUs */
mvpp2_interrupts_enable(port);
+ if (port->priv->hw_version == MVPP22) {
+ mvpp22_comphy_init(port);
+ mvpp22_gop_init(port);
+ }
+
+ mvpp2_port_mii_set(port);
mvpp2_port_enable(port);
- phy_start(ndev->phydev);
+ if (ndev->phydev)
+ phy_start(ndev->phydev);
netif_tx_start_all_queues(port->dev);
}
static void mvpp2_stop_dev(struct mvpp2_port *port)
{
struct net_device *ndev = port->dev;
+ int i;
/* Stop new packets from arriving to RXQs */
mvpp2_ingress_disable(port);
/* Disable interrupts on all CPUs */
mvpp2_interrupts_disable(port);
- napi_disable(&port->napi);
+ for (i = 0; i < port->nqvecs; i++)
+ napi_disable(&port->qvecs[i].napi);
netif_carrier_off(port->dev);
netif_tx_stop_all_queues(port->dev);
mvpp2_egress_disable(port);
mvpp2_port_disable(port);
- phy_stop(ndev->phydev);
+ if (ndev->phydev)
+ phy_stop(ndev->phydev);
+ phy_power_off(port->comphy);
}
static int mvpp2_check_ringparam_valid(struct net_device *dev,
{
struct phy_device *phy_dev;
+ /* No PHY is attached */
+ if (!port->phy_node)
+ return 0;
+
phy_dev = of_phy_connect(port->dev, port->phy_node, mvpp2_link_event, 0,
port->phy_interface);
if (!phy_dev) {
{
struct net_device *ndev = port->dev;
+ if (!ndev->phydev)
+ return;
+
phy_disconnect(ndev->phydev);
}
+static int mvpp2_irqs_init(struct mvpp2_port *port)
+{
+ int err, i;
+
+ for (i = 0; i < port->nqvecs; i++) {
+ struct mvpp2_queue_vector *qv = port->qvecs + i;
+
+ err = request_irq(qv->irq, mvpp2_isr, 0, port->dev->name, qv);
+ if (err)
+ goto err;
+
+ if (qv->type == MVPP2_QUEUE_VECTOR_PRIVATE)
+ irq_set_affinity_hint(qv->irq,
+ cpumask_of(qv->sw_thread_id));
+ }
+
+ return 0;
+err:
+ for (i = 0; i < port->nqvecs; i++) {
+ struct mvpp2_queue_vector *qv = port->qvecs + i;
+
+ irq_set_affinity_hint(qv->irq, NULL);
+ free_irq(qv->irq, qv);
+ }
+
+ return err;
+}
+
+static void mvpp2_irqs_deinit(struct mvpp2_port *port)
+{
+ int i;
+
+ for (i = 0; i < port->nqvecs; i++) {
+ struct mvpp2_queue_vector *qv = port->qvecs + i;
+
+ irq_set_affinity_hint(qv->irq, NULL);
+ free_irq(qv->irq, qv);
+ }
+}
+
static int mvpp2_open(struct net_device *dev)
{
struct mvpp2_port *port = netdev_priv(dev);
+ struct mvpp2 *priv = port->priv;
unsigned char mac_bcast[ETH_ALEN] = {
0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
int err;
goto err_cleanup_rxqs;
}
- err = request_irq(port->irq, mvpp2_isr, 0, dev->name, port);
+ err = mvpp2_irqs_init(port);
if (err) {
- netdev_err(port->dev, "cannot request IRQ %d\n", port->irq);
+ netdev_err(port->dev, "cannot init IRQs\n");
goto err_cleanup_txqs;
}
+ if (priv->hw_version == MVPP22 && !port->phy_node && port->link_irq) {
+ err = request_irq(port->link_irq, mvpp2_link_status_isr, 0,
+ dev->name, port);
+ if (err) {
+ netdev_err(port->dev, "cannot request link IRQ %d\n",
+ port->link_irq);
+ goto err_free_irq;
+ }
+
+ mvpp22_gop_setup_irq(port);
+ }
+
/* In default link is down */
netif_carrier_off(port->dev);
err = mvpp2_phy_connect(port);
if (err < 0)
- goto err_free_irq;
+ goto err_free_link_irq;
/* Unmask interrupts on all CPUs */
on_each_cpu(mvpp2_interrupts_unmask, port, 1);
+ mvpp2_shared_interrupt_mask_unmask(port, false);
mvpp2_start_dev(port);
return 0;
+err_free_link_irq:
+ if (priv->hw_version == MVPP22 && !port->phy_node && port->link_irq)
+ free_irq(port->link_irq, port);
err_free_irq:
- free_irq(port->irq, port);
+ mvpp2_irqs_deinit(port);
err_cleanup_txqs:
mvpp2_cleanup_txqs(port);
err_cleanup_rxqs:
{
struct mvpp2_port *port = netdev_priv(dev);
struct mvpp2_port_pcpu *port_pcpu;
+ struct mvpp2 *priv = port->priv;
int cpu;
mvpp2_stop_dev(port);
/* Mask interrupts on all CPUs */
on_each_cpu(mvpp2_interrupts_mask, port, 1);
+ mvpp2_shared_interrupt_mask_unmask(port, true);
- free_irq(port->irq, port);
- for_each_present_cpu(cpu) {
- port_pcpu = per_cpu_ptr(port->pcpu, cpu);
+ if (priv->hw_version == MVPP22 && !port->phy_node && port->link_irq)
+ free_irq(port->link_irq, port);
- hrtimer_cancel(&port_pcpu->tx_done_timer);
- port_pcpu->timer_scheduled = false;
- tasklet_kill(&port_pcpu->tx_done_tasklet);
+ mvpp2_irqs_deinit(port);
+ if (!port->has_tx_irqs) {
+ for_each_present_cpu(cpu) {
+ port_pcpu = per_cpu_ptr(port->pcpu, cpu);
+
+ hrtimer_cancel(&port_pcpu->tx_done_timer);
+ port_pcpu->timer_scheduled = false;
+ tasklet_kill(&port_pcpu->tx_done_tasklet);
+ }
}
mvpp2_cleanup_rxqs(port);
mvpp2_cleanup_txqs(port);
struct mvpp2_port *port = netdev_priv(dev);
int queue;
- for (queue = 0; queue < rxq_number; queue++) {
+ for (queue = 0; queue < port->nrxqs; queue++) {
struct mvpp2_rx_queue *rxq = port->rxqs[queue];
rxq->time_coal = c->rx_coalesce_usecs;
mvpp2_rx_time_coal_set(port, rxq);
}
- for (queue = 0; queue < txq_number; queue++) {
+ if (port->has_tx_irqs) {
+ port->tx_time_coal = c->tx_coalesce_usecs;
+ mvpp2_tx_time_coal_set(port);
+ }
+
+ for (queue = 0; queue < port->ntxqs; queue++) {
struct mvpp2_tx_queue *txq = port->txqs[queue];
txq->done_pkts_coal = c->tx_max_coalesced_frames;
+
+ if (port->has_tx_irqs)
+ mvpp2_tx_pkts_coal_set(port, txq);
}
return 0;
.set_link_ksettings = phy_ethtool_set_link_ksettings,
};
+/* Used for PPv2.1, or PPv2.2 with the old Device Tree binding that
+ * had a single IRQ defined per-port.
+ */
+static int mvpp2_simple_queue_vectors_init(struct mvpp2_port *port,
+ struct device_node *port_node)
+{
+ struct mvpp2_queue_vector *v = &port->qvecs[0];
+
+ v->first_rxq = 0;
+ v->nrxqs = port->nrxqs;
+ v->type = MVPP2_QUEUE_VECTOR_SHARED;
+ v->sw_thread_id = 0;
+ v->sw_thread_mask = *cpumask_bits(cpu_online_mask);
+ v->port = port;
+ v->irq = irq_of_parse_and_map(port_node, 0);
+ if (v->irq <= 0)
+ return -EINVAL;
+ netif_napi_add(port->dev, &v->napi, mvpp2_poll,
+ NAPI_POLL_WEIGHT);
+
+ port->nqvecs = 1;
+
+ return 0;
+}
+
+static int mvpp2_multi_queue_vectors_init(struct mvpp2_port *port,
+ struct device_node *port_node)
+{
+ struct mvpp2_queue_vector *v;
+ int i, ret;
+
+ port->nqvecs = num_possible_cpus();
+ if (queue_mode == MVPP2_QDIST_SINGLE_MODE)
+ port->nqvecs += 1;
+
+ for (i = 0; i < port->nqvecs; i++) {
+ char irqname[16];
+
+ v = port->qvecs + i;
+
+ v->port = port;
+ v->type = MVPP2_QUEUE_VECTOR_PRIVATE;
+ v->sw_thread_id = i;
+ v->sw_thread_mask = BIT(i);
+
+ snprintf(irqname, sizeof(irqname), "tx-cpu%d", i);
+
+ if (queue_mode == MVPP2_QDIST_MULTI_MODE) {
+ v->first_rxq = i * MVPP2_DEFAULT_RXQ;
+ v->nrxqs = MVPP2_DEFAULT_RXQ;
+ } else if (queue_mode == MVPP2_QDIST_SINGLE_MODE &&
+ i == (port->nqvecs - 1)) {
+ v->first_rxq = 0;
+ v->nrxqs = port->nrxqs;
+ v->type = MVPP2_QUEUE_VECTOR_SHARED;
+ strncpy(irqname, "rx-shared", sizeof(irqname));
+ }
+
+ v->irq = of_irq_get_byname(port_node, irqname);
+ if (v->irq <= 0) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ netif_napi_add(port->dev, &v->napi, mvpp2_poll,
+ NAPI_POLL_WEIGHT);
+ }
+
+ return 0;
+
+err:
+ for (i = 0; i < port->nqvecs; i++)
+ irq_dispose_mapping(port->qvecs[i].irq);
+ return ret;
+}
+
+static int mvpp2_queue_vectors_init(struct mvpp2_port *port,
+ struct device_node *port_node)
+{
+ if (port->has_tx_irqs)
+ return mvpp2_multi_queue_vectors_init(port, port_node);
+ else
+ return mvpp2_simple_queue_vectors_init(port, port_node);
+}
+
+static void mvpp2_queue_vectors_deinit(struct mvpp2_port *port)
+{
+ int i;
+
+ for (i = 0; i < port->nqvecs; i++)
+ irq_dispose_mapping(port->qvecs[i].irq);
+}
+
+/* Configure Rx queue group interrupt for this port */
+static void mvpp2_rx_irqs_setup(struct mvpp2_port *port)
+{
+ struct mvpp2 *priv = port->priv;
+ u32 val;
+ int i;
+
+ if (priv->hw_version == MVPP21) {
+ mvpp2_write(priv, MVPP21_ISR_RXQ_GROUP_REG(port->id),
+ port->nrxqs);
+ return;
+ }
+
+ /* Handle the more complicated PPv2.2 case */
+ for (i = 0; i < port->nqvecs; i++) {
+ struct mvpp2_queue_vector *qv = port->qvecs + i;
+
+ if (!qv->nrxqs)
+ continue;
+
+ val = qv->sw_thread_id;
+ val |= port->id << MVPP22_ISR_RXQ_GROUP_INDEX_GROUP_OFFSET;
+ mvpp2_write(priv, MVPP22_ISR_RXQ_GROUP_INDEX_REG, val);
+
+ val = qv->first_rxq;
+ val |= qv->nrxqs << MVPP22_ISR_RXQ_SUB_GROUP_SIZE_OFFSET;
+ mvpp2_write(priv, MVPP22_ISR_RXQ_SUB_GROUP_CONFIG_REG, val);
+ }
+}
+
/* Initialize port HW */
static int mvpp2_port_init(struct mvpp2_port *port)
{
struct mvpp2_txq_pcpu *txq_pcpu;
int queue, cpu, err;
- if (port->first_rxq + rxq_number >
+ /* Checks for hardware constraints */
+ if (port->first_rxq + port->nrxqs >
MVPP2_MAX_PORTS * priv->max_port_rxqs)
return -EINVAL;
+ if (port->nrxqs % 4 || (port->nrxqs > priv->max_port_rxqs) ||
+ (port->ntxqs > MVPP2_MAX_TXQ))
+ return -EINVAL;
+
/* Disable port */
mvpp2_egress_disable(port);
mvpp2_port_disable(port);
- port->txqs = devm_kcalloc(dev, txq_number, sizeof(*port->txqs),
+ port->tx_time_coal = MVPP2_TXDONE_COAL_USEC;
+
+ port->txqs = devm_kcalloc(dev, port->ntxqs, sizeof(*port->txqs),
GFP_KERNEL);
if (!port->txqs)
return -ENOMEM;
/* Associate physical Tx queues to this port and initialize.
* The mapping is predefined.
*/
- for (queue = 0; queue < txq_number; queue++) {
+ for (queue = 0; queue < port->ntxqs; queue++) {
int queue_phy_id = mvpp2_txq_phys(port->id, queue);
struct mvpp2_tx_queue *txq;
port->txqs[queue] = txq;
}
- port->rxqs = devm_kcalloc(dev, rxq_number, sizeof(*port->rxqs),
+ port->rxqs = devm_kcalloc(dev, port->nrxqs, sizeof(*port->rxqs),
GFP_KERNEL);
if (!port->rxqs) {
err = -ENOMEM;
}
/* Allocate and initialize Rx queue for this port */
- for (queue = 0; queue < rxq_number; queue++) {
+ for (queue = 0; queue < port->nrxqs; queue++) {
struct mvpp2_rx_queue *rxq;
/* Map physical Rx queue to port's logical Rx queue */
port->rxqs[queue] = rxq;
}
- /* Configure Rx queue group interrupt for this port */
- if (priv->hw_version == MVPP21) {
- mvpp2_write(priv, MVPP21_ISR_RXQ_GROUP_REG(port->id),
- rxq_number);
- } else {
- u32 val;
-
- val = (port->id << MVPP22_ISR_RXQ_GROUP_INDEX_GROUP_OFFSET);
- mvpp2_write(priv, MVPP22_ISR_RXQ_GROUP_INDEX_REG, val);
-
- val = (rxq_number << MVPP22_ISR_RXQ_SUB_GROUP_SIZE_OFFSET);
- mvpp2_write(priv, MVPP22_ISR_RXQ_SUB_GROUP_CONFIG_REG, val);
- }
+ mvpp2_rx_irqs_setup(port);
/* Create Rx descriptor rings */
- for (queue = 0; queue < rxq_number; queue++) {
+ for (queue = 0; queue < port->nrxqs; queue++) {
struct mvpp2_rx_queue *rxq = port->rxqs[queue];
rxq->size = port->rx_ring_size;
return 0;
err_free_percpu:
- for (queue = 0; queue < txq_number; queue++) {
+ for (queue = 0; queue < port->ntxqs; queue++) {
if (!port->txqs[queue])
continue;
free_percpu(port->txqs[queue]->pcpu);
return err;
}
+/* Checks if the port DT description has the TX interrupts
+ * described. On PPv2.1, there are no such interrupts. On PPv2.2,
+ * there are available, but we need to keep support for old DTs.
+ */
+static bool mvpp2_port_has_tx_irqs(struct mvpp2 *priv,
+ struct device_node *port_node)
+{
+ char *irqs[5] = { "rx-shared", "tx-cpu0", "tx-cpu1",
+ "tx-cpu2", "tx-cpu3" };
+ int ret, i;
+
+ if (priv->hw_version == MVPP21)
+ return false;
+
+ for (i = 0; i < 5; i++) {
+ ret = of_property_match_string(port_node, "interrupt-names",
+ irqs[i]);
+ if (ret < 0)
+ return false;
+ }
+
+ return true;
+}
+
/* Ports initialization */
static int mvpp2_port_probe(struct platform_device *pdev,
struct device_node *port_node,
struct mvpp2 *priv)
{
struct device_node *phy_node;
+ struct phy *comphy;
struct mvpp2_port *port;
struct mvpp2_port_pcpu *port_pcpu;
struct net_device *dev;
struct resource *res;
const char *dt_mac_addr;
const char *mac_from;
- char hw_mac_addr[ETH_ALEN];
+ char hw_mac_addr[ETH_ALEN] = {0};
+ unsigned int ntxqs, nrxqs;
+ bool has_tx_irqs;
u32 id;
int features;
int phy_mode;
int err, i, cpu;
- dev = alloc_etherdev_mqs(sizeof(*port), txq_number, rxq_number);
+ has_tx_irqs = mvpp2_port_has_tx_irqs(priv, port_node);
+
+ if (!has_tx_irqs)
+ queue_mode = MVPP2_QDIST_SINGLE_MODE;
+
+ ntxqs = MVPP2_MAX_TXQ;
+ if (priv->hw_version == MVPP22 && queue_mode == MVPP2_QDIST_MULTI_MODE)
+ nrxqs = MVPP2_DEFAULT_RXQ * num_possible_cpus();
+ else
+ nrxqs = MVPP2_DEFAULT_RXQ;
+
+ dev = alloc_etherdev_mqs(sizeof(*port), ntxqs, nrxqs);
if (!dev)
return -ENOMEM;
phy_node = of_parse_phandle(port_node, "phy", 0);
- if (!phy_node) {
- dev_err(&pdev->dev, "missing phy\n");
- err = -ENODEV;
- goto err_free_netdev;
- }
-
phy_mode = of_get_phy_mode(port_node);
if (phy_mode < 0) {
dev_err(&pdev->dev, "incorrect phy mode\n");
goto err_free_netdev;
}
+ comphy = devm_of_phy_get(&pdev->dev, port_node, NULL);
+ if (IS_ERR(comphy)) {
+ if (PTR_ERR(comphy) == -EPROBE_DEFER) {
+ err = -EPROBE_DEFER;
+ goto err_free_netdev;
+ }
+ comphy = NULL;
+ }
+
if (of_property_read_u32(port_node, "port-id", &id)) {
err = -EINVAL;
dev_err(&pdev->dev, "missing port-id value\n");
dev->ethtool_ops = &mvpp2_eth_tool_ops;
port = netdev_priv(dev);
+ port->dev = dev;
+ port->ntxqs = ntxqs;
+ port->nrxqs = nrxqs;
+ port->priv = priv;
+ port->has_tx_irqs = has_tx_irqs;
- port->irq = irq_of_parse_and_map(port_node, 0);
- if (port->irq <= 0) {
- err = -EINVAL;
+ err = mvpp2_queue_vectors_init(port, port_node);
+ if (err)
goto err_free_netdev;
+
+ port->link_irq = of_irq_get_byname(port_node, "link");
+ if (port->link_irq == -EPROBE_DEFER) {
+ err = -EPROBE_DEFER;
+ goto err_deinit_qvecs;
}
+ if (port->link_irq <= 0)
+ /* the link irq is optional */
+ port->link_irq = 0;
if (of_property_read_bool(port_node, "marvell,loopback"))
port->flags |= MVPP2_F_LOOPBACK;
- port->priv = priv;
port->id = id;
if (priv->hw_version == MVPP21)
- port->first_rxq = port->id * rxq_number;
+ port->first_rxq = port->id * port->nrxqs;
else
port->first_rxq = port->id * priv->max_port_rxqs;
port->phy_node = phy_node;
port->phy_interface = phy_mode;
+ port->comphy = comphy;
if (priv->hw_version == MVPP21) {
res = platform_get_resource(pdev, IORESOURCE_MEM, 2 + id);
&port->gop_id)) {
err = -EINVAL;
dev_err(&pdev->dev, "missing gop-port-id value\n");
- goto err_free_irq;
+ goto err_deinit_qvecs;
}
port->base = priv->iface_base + MVPP22_GMAC_BASE(port->gop_id);
port->tx_ring_size = MVPP2_MAX_TXD;
port->rx_ring_size = MVPP2_MAX_RXD;
- port->dev = dev;
SET_NETDEV_DEV(dev, &pdev->dev);
err = mvpp2_port_init(port);
goto err_free_stats;
}
- mvpp2_port_mii_set(port);
mvpp2_port_periodic_xon_disable(port);
if (priv->hw_version == MVPP21)
goto err_free_txq_pcpu;
}
- for_each_present_cpu(cpu) {
- port_pcpu = per_cpu_ptr(port->pcpu, cpu);
+ if (!port->has_tx_irqs) {
+ for_each_present_cpu(cpu) {
+ port_pcpu = per_cpu_ptr(port->pcpu, cpu);
- hrtimer_init(&port_pcpu->tx_done_timer, CLOCK_MONOTONIC,
- HRTIMER_MODE_REL_PINNED);
- port_pcpu->tx_done_timer.function = mvpp2_hr_timer_cb;
- port_pcpu->timer_scheduled = false;
+ hrtimer_init(&port_pcpu->tx_done_timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL_PINNED);
+ port_pcpu->tx_done_timer.function = mvpp2_hr_timer_cb;
+ port_pcpu->timer_scheduled = false;
- tasklet_init(&port_pcpu->tx_done_tasklet, mvpp2_tx_proc_cb,
- (unsigned long)dev);
+ tasklet_init(&port_pcpu->tx_done_tasklet,
+ mvpp2_tx_proc_cb,
+ (unsigned long)dev);
+ }
}
- netif_napi_add(dev, &port->napi, mvpp2_poll, NAPI_POLL_WEIGHT);
- features = NETIF_F_SG | NETIF_F_IP_CSUM;
+ features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO;
dev->features = features | NETIF_F_RXCSUM;
dev->hw_features |= features | NETIF_F_RXCSUM | NETIF_F_GRO;
dev->vlan_features |= features;
err_free_port_pcpu:
free_percpu(port->pcpu);
err_free_txq_pcpu:
- for (i = 0; i < txq_number; i++)
+ for (i = 0; i < port->ntxqs; i++)
free_percpu(port->txqs[i]->pcpu);
err_free_stats:
free_percpu(port->stats);
err_free_irq:
- irq_dispose_mapping(port->irq);
+ if (port->link_irq)
+ irq_dispose_mapping(port->link_irq);
+err_deinit_qvecs:
+ mvpp2_queue_vectors_deinit(port);
err_free_netdev:
of_node_put(phy_node);
free_netdev(dev);
of_node_put(port->phy_node);
free_percpu(port->pcpu);
free_percpu(port->stats);
- for (i = 0; i < txq_number; i++)
+ for (i = 0; i < port->ntxqs; i++)
free_percpu(port->txqs[i]->pcpu);
- irq_dispose_mapping(port->irq);
+ mvpp2_queue_vectors_deinit(port);
+ if (port->link_irq)
+ irq_dispose_mapping(port->link_irq);
free_netdev(port->dev);
}
int err, i;
u32 val;
- /* Checks for hardware constraints */
- if (rxq_number % 4 || (rxq_number > priv->max_port_rxqs) ||
- (txq_number > MVPP2_MAX_TXQ)) {
- dev_err(&pdev->dev, "invalid queue size parameter\n");
- return -EINVAL;
- }
-
/* MBUS windows configuration */
dram_target_info = mv_mbus_dram_info();
if (dram_target_info)
for_each_present_cpu(i) {
priv->aggr_txqs[i].id = i;
priv->aggr_txqs[i].size = MVPP2_AGGR_TXQ_SIZE;
- err = mvpp2_aggr_txq_init(pdev, &priv->aggr_txqs[i],
- MVPP2_AGGR_TXQ_SIZE, i, priv);
+ err = mvpp2_aggr_txq_init(pdev, &priv->aggr_txqs[i], i, priv);
if (err < 0)
return err;
}
/* Rx Fifo Init */
mvpp2_rx_fifo_init(priv);
- /* Reset Rx queue group interrupt configuration */
- for (i = 0; i < MVPP2_MAX_PORTS; i++) {
- if (priv->hw_version == MVPP21) {
- mvpp2_write(priv, MVPP21_ISR_RXQ_GROUP_REG(i),
- rxq_number);
- continue;
- } else {
- u32 val;
-
- val = (i << MVPP22_ISR_RXQ_GROUP_INDEX_GROUP_OFFSET);
- mvpp2_write(priv, MVPP22_ISR_RXQ_GROUP_INDEX_REG, val);
-
- val = (rxq_number << MVPP22_ISR_RXQ_SUB_GROUP_SIZE_OFFSET);
- mvpp2_write(priv, MVPP22_ISR_RXQ_SUB_GROUP_CONFIG_REG, val);
- }
- }
-
if (priv->hw_version == MVPP21)
writel(MVPP2_EXT_GLOBAL_CTRL_DEFAULT,
priv->lms_base + MVPP2_MNG_EXTENDED_GLOBAL_CTRL_REG);
struct mvpp2 *priv;
struct resource *res;
void __iomem *base;
- int port_count, cpu;
+ int port_count, i;
int err;
priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
priv->iface_base = devm_ioremap_resource(&pdev->dev, res);
if (IS_ERR(priv->iface_base))
return PTR_ERR(priv->iface_base);
+
+ priv->sysctrl_base =
+ syscon_regmap_lookup_by_phandle(pdev->dev.of_node,
+ "marvell,system-controller");
+ if (IS_ERR(priv->sysctrl_base))
+ /* The system controller regmap is optional for dt
+ * compatibility reasons. When not provided, the
+ * configuration of the GoP relies on the
+ * firmware/bootloader.
+ */
+ priv->sysctrl_base = NULL;
}
- for_each_present_cpu(cpu) {
+ for (i = 0; i < MVPP2_MAX_THREADS; i++) {
u32 addr_space_sz;
addr_space_sz = (priv->hw_version == MVPP21 ?
MVPP21_ADDR_SPACE_SZ : MVPP22_ADDR_SPACE_SZ);
- priv->cpu_base[cpu] = base + cpu * addr_space_sz;
+ priv->swth_base[i] = base + i * addr_space_sz;
}
if (priv->hw_version == MVPP21)
/* The only setting that cannot be read from FW */
u8 tc_tsa[IEEE_8021QAZ_MAX_TCS];
+ u8 cap;
};
#endif
MLX5E_NUM_INDIR_TIRS = MLX5E_TT_ANY,
};
+enum mlx5e_tunnel_types {
+ MLX5E_TT_IPV4_GRE,
+ MLX5E_TT_IPV6_GRE,
+ MLX5E_NUM_TUNNEL_TT,
+};
+
enum {
MLX5E_STATE_ASYNC_EVENTS_ENABLED,
MLX5E_STATE_OPENED,
struct mlx5e_ttc_table {
struct mlx5e_flow_table ft;
struct mlx5_flow_handle *rules[MLX5E_NUM_TT];
+ struct mlx5_flow_handle *tunnel_rules[MLX5E_NUM_TUNNEL_TT];
};
#define ARFS_HASH_SHIFT BITS_PER_BYTE
MLX5E_VLAN_FT_LEVEL = 0,
MLX5E_L2_FT_LEVEL,
MLX5E_TTC_FT_LEVEL,
+ MLX5E_INNER_TTC_FT_LEVEL,
MLX5E_ARFS_FT_LEVEL
};
struct mlx5e_vlan_table vlan;
struct mlx5e_l2_table l2;
struct mlx5e_ttc_table ttc;
+ struct mlx5e_ttc_table inner_ttc;
struct mlx5e_arfs_tables arfs;
};
u32 tisn[MLX5E_MAX_NUM_TC];
struct mlx5e_rqt indir_rqt;
struct mlx5e_tir indir_tir[MLX5E_NUM_INDIR_TIRS];
+ struct mlx5e_tir inner_indir_tir[MLX5E_NUM_INDIR_TIRS];
struct mlx5e_tir direct_tir[MLX5E_MAX_NUM_CHANNELS];
u32 tx_rates[MLX5E_MAX_NUM_SQS];
int hard_mtu;
struct mlx5e_redirect_rqt_param rrp);
void mlx5e_build_indir_tir_ctx_hash(struct mlx5e_params *params,
enum mlx5e_traffic_types tt,
- void *tirc);
+ void *tirc, bool inner);
int mlx5e_open_locked(struct net_device *netdev);
int mlx5e_close_locked(struct net_device *netdev);
void mlx5e_set_rq_type_params(struct mlx5_core_dev *mdev,
struct mlx5e_params *params, u8 rq_type);
+static inline bool mlx5e_tunnel_inner_ft_supported(struct mlx5_core_dev *mdev)
+{
+ return (MLX5_CAP_ETH(mdev, tunnel_stateless_gre) &&
+ MLX5_CAP_FLOWTABLE_NIC_RX(mdev, ft_field_support.inner_ip_version));
+}
+
static inline
struct mlx5e_tx_wqe *mlx5e_post_nop(struct mlx5_wq_cyc *wq, u32 sqn, u16 *pc)
{
int mlx5e_ethtool_get_sset_count(struct mlx5e_priv *priv, int sset)
{
-
switch (sset) {
case ETH_SS_STATS:
return NUM_SW_COUNTERS +
return mlx5e_ethtool_get_sset_count(priv, sset);
}
-static void mlx5e_fill_stats_strings(struct mlx5e_priv *priv, uint8_t *data)
+static void mlx5e_fill_stats_strings(struct mlx5e_priv *priv, u8 *data)
{
int i, j, tc, prio, idx = 0;
unsigned long pfc_combined;
strcpy(data + (idx++) * ETH_GSTRING_LEN,
pport_phy_statistical_stats_desc[i].format);
+ for (i = 0; i < NUM_PPORT_ETH_EXT_COUNTERS(priv); i++)
+ strcpy(data + (idx++) * ETH_GSTRING_LEN,
+ pport_eth_ext_stats_desc[i].format);
+
for (i = 0; i < NUM_PCIE_PERF_COUNTERS(priv); i++)
strcpy(data + (idx++) * ETH_GSTRING_LEN,
pcie_perf_stats_desc[i].format);
+ for (i = 0; i < NUM_PCIE_PERF_COUNTERS64(priv); i++)
+ strcpy(data + (idx++) * ETH_GSTRING_LEN,
+ pcie_perf_stats_desc64[i].format);
+
+ for (i = 0; i < NUM_PCIE_PERF_STALL_COUNTERS(priv); i++)
+ strcpy(data + (idx++) * ETH_GSTRING_LEN,
+ pcie_perf_stall_stats_desc[i].format);
+
for (prio = 0; prio < NUM_PPORT_PRIO; prio++) {
for (i = 0; i < NUM_PPORT_PER_PRIO_TRAFFIC_COUNTERS; i++)
sprintf(data + (idx++) * ETH_GSTRING_LEN,
priv->channel_tc2txq[i][tc]);
}
-void mlx5e_ethtool_get_strings(struct mlx5e_priv *priv,
- uint32_t stringset, uint8_t *data)
+void mlx5e_ethtool_get_strings(struct mlx5e_priv *priv, u32 stringset, u8 *data)
{
int i;
}
}
-static void mlx5e_get_strings(struct net_device *dev,
- uint32_t stringset, uint8_t *data)
+static void mlx5e_get_strings(struct net_device *dev, u32 stringset, u8 *data)
{
struct mlx5e_priv *priv = netdev_priv(dev);
data[idx++] = MLX5E_READ_CTR64_BE(&priv->stats.pport.phy_statistical_counters,
pport_phy_statistical_stats_desc, i);
+ for (i = 0; i < NUM_PPORT_ETH_EXT_COUNTERS(priv); i++)
+ data[idx++] = MLX5E_READ_CTR64_BE(&priv->stats.pport.eth_ext_counters,
+ pport_eth_ext_stats_desc, i);
+
for (i = 0; i < NUM_PCIE_PERF_COUNTERS(priv); i++)
data[idx++] = MLX5E_READ_CTR32_BE(&priv->stats.pcie.pcie_perf_counters,
pcie_perf_stats_desc, i);
+ for (i = 0; i < NUM_PCIE_PERF_COUNTERS64(priv); i++)
+ data[idx++] = MLX5E_READ_CTR64_BE(&priv->stats.pcie.pcie_perf_counters,
+ pcie_perf_stats_desc64, i);
+
+ for (i = 0; i < NUM_PCIE_PERF_STALL_COUNTERS(priv); i++)
+ data[idx++] = MLX5E_READ_CTR32_BE(&priv->stats.pcie.pcie_perf_counters,
+ pcie_perf_stall_stats_desc, i);
+
for (prio = 0; prio < NUM_PPORT_PRIO; prio++) {
for (i = 0; i < NUM_PPORT_PER_PRIO_TRAFFIC_COUNTERS; i++)
data[idx++] = MLX5E_READ_CTR64_BE(&priv->stats.pport.per_prio_counters[prio],
new_channels.params = priv->channels.params;
new_channels.params.num_channels = count;
- mlx5e_build_default_indir_rqt(priv->mdev, new_channels.params.indirection_rqt,
- MLX5E_INDIR_RQT_SIZE, count);
+ if (!netif_is_rxfh_configured(priv->netdev))
+ mlx5e_build_default_indir_rqt(priv->mdev,
+ new_channels.params.indirection_rqt,
+ MLX5E_INDIR_RQT_SIZE, count);
if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) {
priv->channels.params = new_channels.params;
if (connector_type && connector_type < MLX5E_CONNECTOR_TYPE_NUMBER)
return ptys2connector_type[connector_type];
- if (eth_proto & (MLX5E_PROT_MASK(MLX5E_10GBASE_SR)
- | MLX5E_PROT_MASK(MLX5E_40GBASE_SR4)
- | MLX5E_PROT_MASK(MLX5E_100GBASE_SR4)
- | MLX5E_PROT_MASK(MLX5E_1000BASE_CX_SGMII))) {
- return PORT_FIBRE;
+ if (eth_proto &
+ (MLX5E_PROT_MASK(MLX5E_10GBASE_SR) |
+ MLX5E_PROT_MASK(MLX5E_40GBASE_SR4) |
+ MLX5E_PROT_MASK(MLX5E_100GBASE_SR4) |
+ MLX5E_PROT_MASK(MLX5E_1000BASE_CX_SGMII))) {
+ return PORT_FIBRE;
}
- if (eth_proto & (MLX5E_PROT_MASK(MLX5E_40GBASE_CR4)
- | MLX5E_PROT_MASK(MLX5E_10GBASE_CR)
- | MLX5E_PROT_MASK(MLX5E_100GBASE_CR4))) {
- return PORT_DA;
+ if (eth_proto &
+ (MLX5E_PROT_MASK(MLX5E_40GBASE_CR4) |
+ MLX5E_PROT_MASK(MLX5E_10GBASE_CR) |
+ MLX5E_PROT_MASK(MLX5E_100GBASE_CR4))) {
+ return PORT_DA;
}
- if (eth_proto & (MLX5E_PROT_MASK(MLX5E_10GBASE_KX4)
- | MLX5E_PROT_MASK(MLX5E_10GBASE_KR)
- | MLX5E_PROT_MASK(MLX5E_40GBASE_KR4)
- | MLX5E_PROT_MASK(MLX5E_100GBASE_KR4))) {
- return PORT_NONE;
+ if (eth_proto &
+ (MLX5E_PROT_MASK(MLX5E_10GBASE_KX4) |
+ MLX5E_PROT_MASK(MLX5E_10GBASE_KR) |
+ MLX5E_PROT_MASK(MLX5E_40GBASE_KR4) |
+ MLX5E_PROT_MASK(MLX5E_100GBASE_KR4))) {
+ return PORT_NONE;
}
return PORT_OTHER;
for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++) {
memset(tirc, 0, ctxlen);
- mlx5e_build_indir_tir_ctx_hash(&priv->channels.params, tt, tirc);
+ mlx5e_build_indir_tir_ctx_hash(&priv->channels.params, tt, tirc, false);
mlx5_core_modify_tir(mdev, priv->indir_tir[tt].tirn, in, inlen);
}
+
+ if (!mlx5e_tunnel_inner_ft_supported(priv->mdev))
+ return;
+
+ for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++) {
+ memset(tirc, 0, ctxlen);
+ mlx5e_build_indir_tir_ctx_hash(&priv->channels.params, tt, tirc, true);
+ mlx5_core_modify_tir(mdev, priv->inner_indir_tir[tt].tirn, in, inlen);
+ }
}
static int mlx5e_set_rxfh(struct net_device *dev, const u32 *indir,
mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
}
+ if (MLX5_CAP_PCAM_FEATURE(mdev, rx_buffer_fullness_counters)) {
+ out = pstats->eth_ext_counters;
+ MLX5_SET(ppcnt_reg, in, grp, MLX5_ETHERNET_EXTENDED_COUNTERS_GROUP);
+ mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
+ }
+
MLX5_SET(ppcnt_reg, in, grp, MLX5_PER_PRIORITY_COUNTERS_GROUP);
for (prio = 0; prio < NUM_PPORT_PRIO; prio++) {
out = pstats->per_prio_counters[prio];
}
mlx5e_build_common_cq_param(priv, param);
+ param->cq_period_mode = params->rx_cq_period_mode;
}
static void mlx5e_build_tx_cq_param(struct mlx5e_priv *priv,
void mlx5e_build_indir_tir_ctx_hash(struct mlx5e_params *params,
enum mlx5e_traffic_types tt,
- void *tirc)
+ void *tirc, bool inner)
{
- void *hfso = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer);
+ void *hfso = inner ? MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_inner) :
+ MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer);
#define MLX5_HASH_IP (MLX5_HASH_FIELD_SEL_SRC_IP |\
MLX5_HASH_FIELD_SEL_DST_IP)
return err;
}
+static void mlx5e_build_inner_indir_tir_ctx(struct mlx5e_priv *priv,
+ enum mlx5e_traffic_types tt,
+ u32 *tirc)
+{
+ MLX5_SET(tirc, tirc, transport_domain, priv->mdev->mlx5e_res.td.tdn);
+
+ mlx5e_build_tir_ctx_lro(&priv->channels.params, tirc);
+
+ MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT);
+ MLX5_SET(tirc, tirc, indirect_table, priv->indir_rqt.rqtn);
+ MLX5_SET(tirc, tirc, tunneled_offload_en, 0x1);
+
+ mlx5e_build_indir_tir_ctx_hash(&priv->channels.params, tt, tirc, true);
+}
+
static int mlx5e_set_mtu(struct mlx5e_priv *priv, u16 mtu)
{
struct mlx5_core_dev *mdev = priv->mdev;
}
}
-static bool mlx5e_is_eswitch_vport_mngr(struct mlx5_core_dev *mdev)
-{
- return (MLX5_CAP_GEN(mdev, vport_group_manager) &&
- MLX5_CAP_GEN(mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH);
-}
-
void mlx5e_activate_priv_channels(struct mlx5e_priv *priv)
{
int num_txqs = priv->channels.num * priv->channels.params.num_tc;
mlx5e_activate_channels(&priv->channels);
netif_tx_start_all_queues(priv->netdev);
- if (mlx5e_is_eswitch_vport_mngr(priv->mdev))
+ if (MLX5_VPORT_MANAGER(priv->mdev))
mlx5e_add_sqs_fwd_rules(priv);
mlx5e_wait_channels_min_rx_wqes(&priv->channels);
{
mlx5e_redirect_rqts_to_drop(priv);
- if (mlx5e_is_eswitch_vport_mngr(priv->mdev))
+ if (MLX5_VPORT_MANAGER(priv->mdev))
mlx5e_remove_sqs_fwd_rules(priv);
/* FIXME: This is a W/A only for tx timeout watch dog false alarm when
mutex_lock(&priv->state_lock);
err = mlx5e_open_locked(netdev);
+ if (!err)
+ mlx5_set_port_admin_status(priv->mdev, MLX5_PORT_UP);
mutex_unlock(&priv->state_lock);
return err;
return -ENODEV;
mutex_lock(&priv->state_lock);
+ mlx5_set_port_admin_status(priv->mdev, MLX5_PORT_DOWN);
err = mlx5e_close_locked(netdev);
mutex_unlock(&priv->state_lock);
MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT);
MLX5_SET(tirc, tirc, indirect_table, priv->indir_rqt.rqtn);
- mlx5e_build_indir_tir_ctx_hash(&priv->channels.params, tt, tirc);
+ mlx5e_build_indir_tir_ctx_hash(&priv->channels.params, tt, tirc, false);
}
static void mlx5e_build_direct_tir_ctx(struct mlx5e_priv *priv, u32 rqtn, u32 *tirc)
struct mlx5e_tir *tir;
void *tirc;
int inlen;
+ int i = 0;
int err;
u32 *in;
int tt;
tirc = MLX5_ADDR_OF(create_tir_in, in, ctx);
mlx5e_build_indir_tir_ctx(priv, tt, tirc);
err = mlx5e_create_tir(priv->mdev, tir, in, inlen);
- if (err)
- goto err_destroy_tirs;
+ if (err) {
+ mlx5_core_warn(priv->mdev, "create indirect tirs failed, %d\n", err);
+ goto err_destroy_inner_tirs;
+ }
+ }
+
+ if (!mlx5e_tunnel_inner_ft_supported(priv->mdev))
+ goto out;
+
+ for (i = 0; i < MLX5E_NUM_INDIR_TIRS; i++) {
+ memset(in, 0, inlen);
+ tir = &priv->inner_indir_tir[i];
+ tirc = MLX5_ADDR_OF(create_tir_in, in, ctx);
+ mlx5e_build_inner_indir_tir_ctx(priv, i, tirc);
+ err = mlx5e_create_tir(priv->mdev, tir, in, inlen);
+ if (err) {
+ mlx5_core_warn(priv->mdev, "create inner indirect tirs failed, %d\n", err);
+ goto err_destroy_inner_tirs;
+ }
}
+out:
kvfree(in);
return 0;
-err_destroy_tirs:
- mlx5_core_warn(priv->mdev, "create indirect tirs failed, %d\n", err);
+err_destroy_inner_tirs:
+ for (i--; i >= 0; i--)
+ mlx5e_destroy_tir(priv->mdev, &priv->inner_indir_tir[i]);
+
for (tt--; tt >= 0; tt--)
mlx5e_destroy_tir(priv->mdev, &priv->indir_tir[tt]);
for (i = 0; i < MLX5E_NUM_INDIR_TIRS; i++)
mlx5e_destroy_tir(priv->mdev, &priv->indir_tir[i]);
+
+ if (!mlx5e_tunnel_inner_ft_supported(priv->mdev))
+ return;
+
+ for (i = 0; i < MLX5E_NUM_INDIR_TIRS; i++)
+ mlx5e_destroy_tir(priv->mdev, &priv->inner_indir_tir[i]);
}
void mlx5e_destroy_direct_tirs(struct mlx5e_priv *priv)
return 0;
}
-static int mlx5e_setup_tc(struct net_device *netdev, u8 tc)
+static int mlx5e_setup_tc_mqprio(struct net_device *netdev,
+ struct tc_mqprio_qopt *mqprio)
{
struct mlx5e_priv *priv = netdev_priv(netdev);
struct mlx5e_channels new_channels = {};
+ u8 tc = mqprio->num_tc;
int err = 0;
+ mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS;
+
if (tc && tc != MLX5E_MAX_NUM_TC)
return -EINVAL;
return err;
}
-static int mlx5e_ndo_setup_tc(struct net_device *dev, u32 handle,
- u32 chain_index, __be16 proto,
- struct tc_to_netdev *tc)
+#ifdef CONFIG_MLX5_ESWITCH
+static int mlx5e_setup_tc_cls_flower(struct net_device *dev,
+ struct tc_cls_flower_offload *cls_flower)
{
struct mlx5e_priv *priv = netdev_priv(dev);
- if (TC_H_MAJ(handle) != TC_H_MAJ(TC_H_INGRESS))
- goto mqprio;
+ if (!is_classid_clsact_ingress(cls_flower->common.classid) ||
+ cls_flower->common.chain_index)
+ return -EOPNOTSUPP;
- if (chain_index)
+ switch (cls_flower->command) {
+ case TC_CLSFLOWER_REPLACE:
+ return mlx5e_configure_flower(priv, cls_flower);
+ case TC_CLSFLOWER_DESTROY:
+ return mlx5e_delete_flower(priv, cls_flower);
+ case TC_CLSFLOWER_STATS:
+ return mlx5e_stats_flower(priv, cls_flower);
+ default:
return -EOPNOTSUPP;
+ }
+}
+#endif
- switch (tc->type) {
+static int mlx5e_setup_tc(struct net_device *dev, enum tc_setup_type type,
+ void *type_data)
+{
+ switch (type) {
+#ifdef CONFIG_MLX5_ESWITCH
case TC_SETUP_CLSFLOWER:
- switch (tc->cls_flower->command) {
- case TC_CLSFLOWER_REPLACE:
- return mlx5e_configure_flower(priv, proto, tc->cls_flower);
- case TC_CLSFLOWER_DESTROY:
- return mlx5e_delete_flower(priv, tc->cls_flower);
- case TC_CLSFLOWER_STATS:
- return mlx5e_stats_flower(priv, tc->cls_flower);
- }
+ return mlx5e_setup_tc_cls_flower(dev, type_data);
+#endif
+ case TC_SETUP_MQPRIO:
+ return mlx5e_setup_tc_mqprio(dev, type_data);
default:
return -EOPNOTSUPP;
}
-
-mqprio:
- if (tc->type != TC_SETUP_MQPRIO)
- return -EINVAL;
-
- tc->mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS;
-
- return mlx5e_setup_tc(dev, tc->mqprio->num_tc);
}
static void
}
}
+#ifdef CONFIG_MLX5_ESWITCH
static int mlx5e_set_vf_mac(struct net_device *dev, int vf, u8 *mac)
{
struct mlx5e_priv *priv = netdev_priv(dev);
return mlx5_eswitch_get_vport_stats(mdev->priv.eswitch, vf + 1,
vf_stats);
}
+#endif
static void mlx5e_add_vxlan_port(struct net_device *netdev,
struct udp_tunnel_info *ti)
mlx5e_vxlan_queue_work(priv, ti->sa_family, be16_to_cpu(ti->port), 0);
}
-static netdev_features_t mlx5e_vxlan_features_check(struct mlx5e_priv *priv,
- struct sk_buff *skb,
- netdev_features_t features)
+static netdev_features_t mlx5e_tunnel_features_check(struct mlx5e_priv *priv,
+ struct sk_buff *skb,
+ netdev_features_t features)
{
struct udphdr *udph;
- u16 proto;
- u16 port = 0;
+ u8 proto;
+ u16 port;
switch (vlan_get_protocol(skb)) {
case htons(ETH_P_IP):
goto out;
}
- if (proto == IPPROTO_UDP) {
+ switch (proto) {
+ case IPPROTO_GRE:
+ return features;
+ case IPPROTO_UDP:
udph = udp_hdr(skb);
port = be16_to_cpu(udph->dest);
- }
- /* Verify if UDP port is being offloaded by HW */
- if (port && mlx5e_vxlan_lookup_port(priv, port))
- return features;
+ /* Verify if UDP port is being offloaded by HW */
+ if (mlx5e_vxlan_lookup_port(priv, port))
+ return features;
+ }
out:
/* Disable CSUM and GSO if the udp dport is not offloaded by HW */
/* Validate if the tunneled packet is being offloaded by HW */
if (skb->encapsulation &&
(features & NETIF_F_CSUM_MASK || features & NETIF_F_GSO_MASK))
- return mlx5e_vxlan_features_check(priv, skb, features);
+ return mlx5e_tunnel_features_check(priv, skb, features);
return features;
}
}
#endif
-static const struct net_device_ops mlx5e_netdev_ops_basic = {
+static const struct net_device_ops mlx5e_netdev_ops = {
.ndo_open = mlx5e_open,
.ndo_stop = mlx5e_close,
.ndo_start_xmit = mlx5e_xmit,
- .ndo_setup_tc = mlx5e_ndo_setup_tc,
+ .ndo_setup_tc = mlx5e_setup_tc,
.ndo_select_queue = mlx5e_select_queue,
.ndo_get_stats64 = mlx5e_get_stats,
.ndo_set_rx_mode = mlx5e_set_rx_mode,
.ndo_change_mtu = mlx5e_change_mtu,
.ndo_do_ioctl = mlx5e_ioctl,
.ndo_set_tx_maxrate = mlx5e_set_tx_maxrate,
+ .ndo_udp_tunnel_add = mlx5e_add_vxlan_port,
+ .ndo_udp_tunnel_del = mlx5e_del_vxlan_port,
+ .ndo_features_check = mlx5e_features_check,
#ifdef CONFIG_RFS_ACCEL
.ndo_rx_flow_steer = mlx5e_rx_flow_steer,
#endif
#ifdef CONFIG_NET_POLL_CONTROLLER
.ndo_poll_controller = mlx5e_netpoll,
#endif
-};
-
-static const struct net_device_ops mlx5e_netdev_ops_sriov = {
- .ndo_open = mlx5e_open,
- .ndo_stop = mlx5e_close,
- .ndo_start_xmit = mlx5e_xmit,
- .ndo_setup_tc = mlx5e_ndo_setup_tc,
- .ndo_select_queue = mlx5e_select_queue,
- .ndo_get_stats64 = mlx5e_get_stats,
- .ndo_set_rx_mode = mlx5e_set_rx_mode,
- .ndo_set_mac_address = mlx5e_set_mac,
- .ndo_vlan_rx_add_vid = mlx5e_vlan_rx_add_vid,
- .ndo_vlan_rx_kill_vid = mlx5e_vlan_rx_kill_vid,
- .ndo_set_features = mlx5e_set_features,
- .ndo_change_mtu = mlx5e_change_mtu,
- .ndo_do_ioctl = mlx5e_ioctl,
- .ndo_udp_tunnel_add = mlx5e_add_vxlan_port,
- .ndo_udp_tunnel_del = mlx5e_del_vxlan_port,
- .ndo_set_tx_maxrate = mlx5e_set_tx_maxrate,
- .ndo_features_check = mlx5e_features_check,
-#ifdef CONFIG_RFS_ACCEL
- .ndo_rx_flow_steer = mlx5e_rx_flow_steer,
-#endif
+#ifdef CONFIG_MLX5_ESWITCH
+ /* SRIOV E-Switch NDOs */
.ndo_set_vf_mac = mlx5e_set_vf_mac,
.ndo_set_vf_vlan = mlx5e_set_vf_vlan,
.ndo_set_vf_spoofchk = mlx5e_set_vf_spoofchk,
.ndo_get_vf_config = mlx5e_get_vf_config,
.ndo_set_vf_link_state = mlx5e_set_vf_link_state,
.ndo_get_vf_stats = mlx5e_get_vf_stats,
- .ndo_tx_timeout = mlx5e_tx_timeout,
- .ndo_xdp = mlx5e_xdp,
-#ifdef CONFIG_NET_POLL_CONTROLLER
- .ndo_poll_controller = mlx5e_netpoll,
-#endif
.ndo_has_offload_stats = mlx5e_has_offload_stats,
.ndo_get_offload_stats = mlx5e_get_offload_stats,
+#endif
};
static int mlx5e_check_required_hca_cap(struct mlx5_core_dev *mdev)
}
}
+#if IS_ENABLED(CONFIG_NET_SWITCHDEV) && IS_ENABLED(CONFIG_MLX5_ESWITCH)
static const struct switchdev_ops mlx5e_switchdev_ops = {
.switchdev_port_attr_get = mlx5e_attr_get,
};
+#endif
static void mlx5e_build_nic_netdev(struct net_device *netdev)
{
SET_NETDEV_DEV(netdev, &mdev->pdev->dev);
- if (MLX5_CAP_GEN(mdev, vport_group_manager)) {
- netdev->netdev_ops = &mlx5e_netdev_ops_sriov;
+ netdev->netdev_ops = &mlx5e_netdev_ops;
+
#ifdef CONFIG_MLX5_CORE_EN_DCB
- if (MLX5_CAP_GEN(mdev, qos))
- netdev->dcbnl_ops = &mlx5e_dcbnl_ops;
+ if (MLX5_CAP_GEN(mdev, vport_group_manager) && MLX5_CAP_GEN(mdev, qos))
+ netdev->dcbnl_ops = &mlx5e_dcbnl_ops;
#endif
- } else {
- netdev->netdev_ops = &mlx5e_netdev_ops_basic;
- }
netdev->watchdog_timeo = 15 * HZ;
netdev->hw_features |= NETIF_F_HW_VLAN_CTAG_RX;
netdev->hw_features |= NETIF_F_HW_VLAN_CTAG_FILTER;
- if (mlx5e_vxlan_allowed(mdev)) {
- netdev->hw_features |= NETIF_F_GSO_UDP_TUNNEL |
- NETIF_F_GSO_UDP_TUNNEL_CSUM |
- NETIF_F_GSO_PARTIAL;
+ if (mlx5e_vxlan_allowed(mdev) || MLX5_CAP_ETH(mdev, tunnel_stateless_gre)) {
+ netdev->hw_features |= NETIF_F_GSO_PARTIAL;
netdev->hw_enc_features |= NETIF_F_IP_CSUM;
netdev->hw_enc_features |= NETIF_F_IPV6_CSUM;
netdev->hw_enc_features |= NETIF_F_TSO;
netdev->hw_enc_features |= NETIF_F_TSO6;
- netdev->hw_enc_features |= NETIF_F_GSO_UDP_TUNNEL;
- netdev->hw_enc_features |= NETIF_F_GSO_UDP_TUNNEL_CSUM |
- NETIF_F_GSO_PARTIAL;
+ netdev->hw_enc_features |= NETIF_F_GSO_PARTIAL;
+ }
+
+ if (mlx5e_vxlan_allowed(mdev)) {
+ netdev->hw_features |= NETIF_F_GSO_UDP_TUNNEL |
+ NETIF_F_GSO_UDP_TUNNEL_CSUM;
+ netdev->hw_enc_features |= NETIF_F_GSO_UDP_TUNNEL |
+ NETIF_F_GSO_UDP_TUNNEL_CSUM;
netdev->gso_partial_features = NETIF_F_GSO_UDP_TUNNEL_CSUM;
}
+ if (MLX5_CAP_ETH(mdev, tunnel_stateless_gre)) {
+ netdev->hw_features |= NETIF_F_GSO_GRE |
+ NETIF_F_GSO_GRE_CSUM;
+ netdev->hw_enc_features |= NETIF_F_GSO_GRE |
+ NETIF_F_GSO_GRE_CSUM;
+ netdev->gso_partial_features |= NETIF_F_GSO_GRE |
+ NETIF_F_GSO_GRE_CSUM;
+ }
+
mlx5_query_port_fcs(mdev, &fcs_supported, &fcs_enabled);
if (fcs_supported)
mlx5e_set_netdev_dev_addr(netdev);
-#ifdef CONFIG_NET_SWITCHDEV
- if (MLX5_CAP_GEN(mdev, vport_group_manager))
+#if IS_ENABLED(CONFIG_NET_SWITCHDEV) && IS_ENABLED(CONFIG_MLX5_ESWITCH)
+ if (MLX5_VPORT_MANAGER(mdev))
netdev->switchdev_ops = &mlx5e_switchdev_ops;
#endif
mlx5e_init_l2_addr(priv);
+ /* Marking the link as currently not needed by the Driver */
+ if (!netif_running(netdev))
+ mlx5_set_port_admin_status(mdev, MLX5_PORT_DOWN);
+
/* MTU range: 68 - hw-specific max */
netdev->min_mtu = ETH_MIN_MTU;
mlx5_query_port_max_mtu(priv->mdev, &max_mtu, 1);
mlx5e_enable_async_events(priv);
- if (MLX5_CAP_GEN(mdev, vport_group_manager))
+ if (MLX5_VPORT_MANAGER(priv->mdev))
mlx5e_register_vport_reps(priv);
if (netdev->reg_state != NETREG_REGISTERED)
queue_work(priv->wq, &priv->set_rx_mode_work);
- if (MLX5_CAP_GEN(mdev, vport_group_manager))
+ if (MLX5_VPORT_MANAGER(priv->mdev))
mlx5e_unregister_vport_reps(priv);
mlx5e_disable_async_events(priv);
static void *mlx5e_add(struct mlx5_core_dev *mdev)
{
- struct mlx5_eswitch *esw = mdev->priv.eswitch;
- int total_vfs = MLX5_TOTAL_VPORTS(mdev);
- struct mlx5e_rep_priv *rpriv = NULL;
+ struct net_device *netdev;
+ void *rpriv = NULL;
void *priv;
- int vport;
int err;
- struct net_device *netdev;
err = mlx5e_check_required_hca_cap(mdev);
if (err)
return NULL;
- if (MLX5_CAP_GEN(mdev, vport_group_manager)) {
- rpriv = kzalloc(sizeof(*rpriv), GFP_KERNEL);
+#ifdef CONFIG_MLX5_ESWITCH
+ if (MLX5_VPORT_MANAGER(mdev)) {
+ rpriv = mlx5e_alloc_nic_rep_priv(mdev);
if (!rpriv) {
- mlx5_core_warn(mdev,
- "Not creating net device, Failed to alloc rep priv data\n");
+ mlx5_core_warn(mdev, "Failed to alloc NIC rep priv data\n");
return NULL;
}
- rpriv->rep = &esw->offloads.vport_reps[0];
}
+#endif
netdev = mlx5e_create_netdev(mdev, &mlx5e_nic_profile, rpriv);
if (!netdev) {
mlx5_core_err(mdev, "mlx5e_create_netdev failed\n");
- goto err_unregister_reps;
+ goto err_free_rpriv;
}
priv = netdev_priv(netdev);
err_detach:
mlx5e_detach(mdev, priv);
-
err_destroy_netdev:
mlx5e_destroy_netdev(priv);
-
-err_unregister_reps:
- for (vport = 1; vport < total_vfs; vport++)
- mlx5_eswitch_unregister_vport_rep(esw, vport);
-
+err_free_rpriv:
kfree(rpriv);
return NULL;
}
if (unlikely(!page))
return -ENOMEM;
- dma_info->page = page;
dma_info->addr = dma_map_page(rq->pdev, page, 0,
RQ_PAGE_SIZE(rq), rq->buff.map_dir);
if (unlikely(dma_mapping_error(rq->pdev, dma_info->addr))) {
put_page(page);
return -ENOMEM;
}
+ dma_info->page = page;
return 0;
}
u16 tot_len;
u8 l4_hdr_type = get_cqe_l4_hdr_type(cqe);
- int tcp_ack = ((CQE_L4_HDR_TYPE_TCP_ACK_NO_DATA == l4_hdr_type) ||
- (CQE_L4_HDR_TYPE_TCP_ACK_AND_DATA == l4_hdr_type));
+ int tcp_ack = ((l4_hdr_type == CQE_L4_HDR_TYPE_TCP_ACK_NO_DATA) ||
+ (l4_hdr_type == CQE_L4_HDR_TYPE_TCP_ACK_AND_DATA));
skb->mac_len = ETH_HLEN;
proto = __vlan_get_protocol(skb, eth->h_proto, &network_depth);
&wqe->next.next_wqe_index);
}
+#ifdef CONFIG_MLX5_ESWITCH
void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
{
struct net_device *netdev = rq->netdev;
mlx5_wq_ll_pop(&rq->wq, wqe_counter_be,
&wqe->next.next_wqe_index);
}
+#endif
static inline void mlx5e_mpwqe_fill_rx_skb(struct mlx5e_rq *rq,
struct mlx5_cqe64 *cqe,
LIST_HEAD(actions);
int err;
- if (tc_no_actions(exts))
+ if (!tcf_exts_has_actions(exts))
return -EINVAL;
attr->flow_tag = MLX5_FS_DEFAULT_FLOW_TAG;
struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
int ret;
- dst = ip6_route_output(dev_net(mirred_dev), NULL, fl6);
- ret = dst->error;
- if (ret) {
- dst_release(dst);
+ ret = ipv6_stub->ipv6_dst_lookup(dev_net(mirred_dev), NULL, &dst,
+ fl6);
+ if (ret < 0)
return ret;
- }
*out_ttl = ip6_dst_hoplimit(dst);
bool encap = false;
int err = 0;
- if (tc_no_actions(exts))
+ if (!tcf_exts_has_actions(exts))
return -EINVAL;
memset(attr, 0, sizeof(*attr));
return err;
}
-int mlx5e_configure_flower(struct mlx5e_priv *priv, __be16 protocol,
+int mlx5e_configure_flower(struct mlx5e_priv *priv,
struct tc_cls_flower_offload *f)
{
struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
struct mlx5_flow_table *fdb = NULL;
int esw_size, err = 0;
u32 flags = 0;
+ u32 max_flow_counter = (MLX5_CAP_GEN(dev, max_flow_counter_31_16) << 16) |
+ MLX5_CAP_GEN(dev, max_flow_counter_15_0);
root_ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_FDB);
if (!root_ns) {
esw_debug(dev, "Create offloads FDB table, min (max esw size(2^%d), max counters(%d)*groups(%d))\n",
MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size),
- MLX5_CAP_GEN(dev, max_flow_counter), ESW_OFFLOADS_NUM_GROUPS);
+ max_flow_counter, ESW_OFFLOADS_NUM_GROUPS);
- esw_size = min_t(int, MLX5_CAP_GEN(dev, max_flow_counter) * ESW_OFFLOADS_NUM_GROUPS,
+ esw_size = min_t(int, max_flow_counter * ESW_OFFLOADS_NUM_GROUPS,
1 << MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size));
if (esw->offloads.encap != DEVLINK_ESWITCH_ENCAP_MODE_NONE)
struct mlx5_eswitch_rep *rep;
int vport;
- for (vport = 0; vport < nvports; vport++) {
+ for (vport = nvports - 1; vport >= 0; vport--) {
rep = &esw->offloads.vport_reps[vport];
if (!rep->valid)
continue;
#include <net/devlink.h>
#include "mlx5_core.h"
#include "fs_core.h"
-#ifdef CONFIG_MLX5_CORE_EN
+#include "lib/mpfs.h"
#include "eswitch.h"
-#endif
#include "lib/mlx5.h"
#include "fpga/core.h"
#include "accel/ipsec.h"
return -EOPNOTSUPP;
}
-
static int mlx5_pci_init(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
{
struct pci_dev *pdev = dev->pdev;
goto err_tables_cleanup;
}
-#ifdef CONFIG_MLX5_CORE_EN
+ err = mlx5_mpfs_init(dev);
+ if (err) {
+ dev_err(&pdev->dev, "Failed to init l2 table %d\n", err);
+ goto err_rl_cleanup;
+ }
+
err = mlx5_eswitch_init(dev);
if (err) {
dev_err(&pdev->dev, "Failed to init eswitch %d\n", err);
- goto err_rl_cleanup;
+ goto err_mpfs_cleanup;
}
-#endif
err = mlx5_sriov_init(dev);
if (err) {
err_sriov_cleanup:
mlx5_sriov_cleanup(dev);
err_eswitch_cleanup:
-#ifdef CONFIG_MLX5_CORE_EN
mlx5_eswitch_cleanup(dev->priv.eswitch);
-
+err_mpfs_cleanup:
+ mlx5_mpfs_cleanup(dev);
err_rl_cleanup:
-#endif
mlx5_cleanup_rl_table(dev);
-
err_tables_cleanup:
mlx5_cleanup_mkey_table(dev);
mlx5_cleanup_srq_table(dev);
{
mlx5_fpga_cleanup(dev);
mlx5_sriov_cleanup(dev);
-#ifdef CONFIG_MLX5_CORE_EN
mlx5_eswitch_cleanup(dev->priv.eswitch);
-#endif
+ mlx5_mpfs_cleanup(dev);
mlx5_cleanup_rl_table(dev);
mlx5_cleanup_reserved_gids(dev);
mlx5_cleanup_mkey_table(dev);
goto err_fs;
}
-#ifdef CONFIG_MLX5_CORE_EN
- mlx5_eswitch_attach(dev->priv.eswitch);
-#endif
-
err = mlx5_sriov_attach(dev);
if (err) {
dev_err(&pdev->dev, "sriov init failed %d\n", err);
}
}
- clear_bit(MLX5_INTERFACE_STATE_DOWN, &dev->intf_state);
set_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state);
out:
mutex_unlock(&dev->intf_state_mutex);
mlx5_sriov_detach(dev);
err_sriov:
-#ifdef CONFIG_MLX5_CORE_EN
- mlx5_eswitch_detach(dev->priv.eswitch);
-#endif
mlx5_cleanup_fs(dev);
err_fs:
mlx5_drain_health_recovery(dev);
mutex_lock(&dev->intf_state_mutex);
- if (test_bit(MLX5_INTERFACE_STATE_DOWN, &dev->intf_state)) {
+ if (!test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state)) {
dev_warn(&dev->pdev->dev, "%s: interface is down, NOP\n",
__func__);
if (cleanup)
}
clear_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state);
- set_bit(MLX5_INTERFACE_STATE_DOWN, &dev->intf_state);
if (mlx5_device_registered(dev))
mlx5_detach_device(dev);
mlx5_fpga_device_stop(dev);
mlx5_sriov_detach(dev);
-#ifdef CONFIG_MLX5_CORE_EN
- mlx5_eswitch_detach(dev->priv.eswitch);
-#endif
mlx5_cleanup_fs(dev);
mlx5_irq_clear_affinity_hints(dev);
free_comp_eqs(dev);
};
static const struct devlink_ops mlx5_devlink_ops = {
-#ifdef CONFIG_MLX5_CORE_EN
+#ifdef CONFIG_MLX5_ESWITCH
.eswitch_mode_set = mlx5_devlink_eswitch_mode_set,
.eswitch_mode_get = mlx5_devlink_eswitch_mode_get,
.eswitch_inline_mode_set = mlx5_devlink_eswitch_inline_mode_set,
mutex_init(&dev->pci_status_mutex);
mutex_init(&dev->intf_state_mutex);
+ INIT_LIST_HEAD(&priv->waiting_events_list);
+ priv->is_accum_events = false;
+
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
err = init_srcu_struct(&priv->pfault_srcu);
if (err) {
cleanup_srcu_struct(&priv->pfault_srcu);
clean_dev:
#endif
- pci_set_drvdata(pdev, NULL);
devlink_free(devlink);
return err;
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
cleanup_srcu_struct(&priv->pfault_srcu);
#endif
- pci_set_drvdata(pdev, NULL);
devlink_free(devlink);
}
int err;
dev_info(&pdev->dev, "Shutdown was called\n");
- /* Notify mlx5 clients that the kernel is being shut down */
- set_bit(MLX5_INTERFACE_STATE_SHUTDOWN, &dev->intf_state);
err = mlx5_try_fast_unload(dev);
if (err)
mlx5_unload_one(dev, priv, false);
#include <net/tc_act/tc_mirred.h>
#include <net/netevent.h>
#include <net/tc_act/tc_sample.h>
+#include <net/addrconf.h>
#include "spectrum.h"
#include "pci.h"
int err;
mlxsw_reg_mgpc_pack(mgpc_pl, counter_index, MLXSW_REG_MGPC_OPCODE_NOP,
- MLXSW_REG_MGPC_COUNTER_SET_TYPE_PACKETS_BYTES);
+ MLXSW_REG_FLOW_COUNTER_SET_TYPE_PACKETS_BYTES);
err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(mgpc), mgpc_pl);
if (err)
return err;
- *packets = mlxsw_reg_mgpc_packet_counter_get(mgpc_pl);
- *bytes = mlxsw_reg_mgpc_byte_counter_get(mgpc_pl);
+ if (packets)
+ *packets = mlxsw_reg_mgpc_packet_counter_get(mgpc_pl);
+ if (bytes)
+ *bytes = mlxsw_reg_mgpc_byte_counter_get(mgpc_pl);
return 0;
}
char mgpc_pl[MLXSW_REG_MGPC_LEN];
mlxsw_reg_mgpc_pack(mgpc_pl, counter_index, MLXSW_REG_MGPC_OPCODE_CLEAR,
- MLXSW_REG_MGPC_COUNTER_SET_TYPE_PACKETS_BYTES);
+ MLXSW_REG_FLOW_COUNTER_SET_TYPE_PACKETS_BYTES);
return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(mgpc), mgpc_pl);
}
}
static int mlxsw_sp_port_add_cls_matchall(struct mlxsw_sp_port *mlxsw_sp_port,
- __be16 protocol,
- struct tc_cls_matchall_offload *cls,
+ struct tc_cls_matchall_offload *f,
bool ingress)
{
struct mlxsw_sp_port_mall_tc_entry *mall_tc_entry;
+ __be16 protocol = f->common.protocol;
const struct tc_action *a;
LIST_HEAD(actions);
int err;
- if (!tc_single_action(cls->exts)) {
+ if (!tcf_exts_has_one_action(f->exts)) {
netdev_err(mlxsw_sp_port->dev, "only singular actions are supported\n");
return -EOPNOTSUPP;
}
mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL);
if (!mall_tc_entry)
return -ENOMEM;
- mall_tc_entry->cookie = cls->cookie;
+ mall_tc_entry->cookie = f->cookie;
- tcf_exts_to_list(cls->exts, &actions);
+ tcf_exts_to_list(f->exts, &actions);
a = list_first_entry(&actions, struct tc_action, list);
if (is_tcf_mirred_egress_mirror(a) && protocol == htons(ETH_P_ALL)) {
mirror, a, ingress);
} else if (is_tcf_sample(a) && protocol == htons(ETH_P_ALL)) {
mall_tc_entry->type = MLXSW_SP_PORT_MALL_SAMPLE;
- err = mlxsw_sp_port_add_cls_matchall_sample(mlxsw_sp_port, cls,
+ err = mlxsw_sp_port_add_cls_matchall_sample(mlxsw_sp_port, f,
a, ingress);
} else {
err = -EOPNOTSUPP;
}
static void mlxsw_sp_port_del_cls_matchall(struct mlxsw_sp_port *mlxsw_sp_port,
- struct tc_cls_matchall_offload *cls)
+ struct tc_cls_matchall_offload *f)
{
struct mlxsw_sp_port_mall_tc_entry *mall_tc_entry;
mall_tc_entry = mlxsw_sp_port_mall_tc_entry_find(mlxsw_sp_port,
- cls->cookie);
+ f->cookie);
if (!mall_tc_entry) {
netdev_dbg(mlxsw_sp_port->dev, "tc entry not found on port\n");
return;
kfree(mall_tc_entry);
}
-static int mlxsw_sp_setup_tc(struct net_device *dev, u32 handle,
- u32 chain_index, __be16 proto,
- struct tc_to_netdev *tc)
+static int mlxsw_sp_setup_tc_cls_matchall(struct mlxsw_sp_port *mlxsw_sp_port,
+ struct tc_cls_matchall_offload *f)
{
- struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
- bool ingress = TC_H_MAJ(handle) == TC_H_MAJ(TC_H_INGRESS);
+ bool ingress;
- if (chain_index)
+ if (is_classid_clsact_ingress(f->common.classid))
+ ingress = true;
+ else if (is_classid_clsact_egress(f->common.classid))
+ ingress = false;
+ else
return -EOPNOTSUPP;
- switch (tc->type) {
- case TC_SETUP_MATCHALL:
- switch (tc->cls_mall->command) {
- case TC_CLSMATCHALL_REPLACE:
- return mlxsw_sp_port_add_cls_matchall(mlxsw_sp_port,
- proto,
- tc->cls_mall,
- ingress);
- case TC_CLSMATCHALL_DESTROY:
- mlxsw_sp_port_del_cls_matchall(mlxsw_sp_port,
- tc->cls_mall);
- return 0;
- default:
- return -EOPNOTSUPP;
- }
- case TC_SETUP_CLSFLOWER:
- switch (tc->cls_flower->command) {
- case TC_CLSFLOWER_REPLACE:
- return mlxsw_sp_flower_replace(mlxsw_sp_port, ingress,
- proto, tc->cls_flower);
- case TC_CLSFLOWER_DESTROY:
- mlxsw_sp_flower_destroy(mlxsw_sp_port, ingress,
- tc->cls_flower);
- return 0;
- case TC_CLSFLOWER_STATS:
- return mlxsw_sp_flower_stats(mlxsw_sp_port, ingress,
- tc->cls_flower);
- default:
- return -EOPNOTSUPP;
- }
+ if (f->common.chain_index)
+ return -EOPNOTSUPP;
+
+ switch (f->command) {
+ case TC_CLSMATCHALL_REPLACE:
+ return mlxsw_sp_port_add_cls_matchall(mlxsw_sp_port, f,
+ ingress);
+ case TC_CLSMATCHALL_DESTROY:
+ mlxsw_sp_port_del_cls_matchall(mlxsw_sp_port, f);
+ return 0;
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+static int
+mlxsw_sp_setup_tc_cls_flower(struct mlxsw_sp_port *mlxsw_sp_port,
+ struct tc_cls_flower_offload *f)
+{
+ bool ingress;
+
+ if (is_classid_clsact_ingress(f->common.classid))
+ ingress = true;
+ else if (is_classid_clsact_egress(f->common.classid))
+ ingress = false;
+ else
+ return -EOPNOTSUPP;
+
+ switch (f->command) {
+ case TC_CLSFLOWER_REPLACE:
+ return mlxsw_sp_flower_replace(mlxsw_sp_port, ingress, f);
+ case TC_CLSFLOWER_DESTROY:
+ mlxsw_sp_flower_destroy(mlxsw_sp_port, ingress, f);
+ return 0;
+ case TC_CLSFLOWER_STATS:
+ return mlxsw_sp_flower_stats(mlxsw_sp_port, ingress, f);
+ default:
+ return -EOPNOTSUPP;
}
+}
- return -EOPNOTSUPP;
+static int mlxsw_sp_setup_tc(struct net_device *dev, enum tc_setup_type type,
+ void *type_data)
+{
+ struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
+
+ switch (type) {
+ case TC_SETUP_CLSMATCHALL:
+ return mlxsw_sp_setup_tc_cls_matchall(mlxsw_sp_port, type_data);
+ case TC_SETUP_CLSFLOWER:
+ return mlxsw_sp_setup_tc_cls_flower(mlxsw_sp_port, type_data);
+ default:
+ return -EOPNOTSUPP;
+ }
}
static const struct net_device_ops mlxsw_sp_port_netdev_ops = {
MLXSW_SP_RXL_MARK(ARPBC, MIRROR_TO_CPU, ARP, false),
MLXSW_SP_RXL_MARK(ARPUC, MIRROR_TO_CPU, ARP, false),
MLXSW_SP_RXL_NO_MARK(FID_MISS, TRAP_TO_CPU, IP2ME, false),
+ MLXSW_SP_RXL_MARK(IPV6_MLDV12_LISTENER_QUERY, MIRROR_TO_CPU, IPV6_MLD,
+ false),
+ MLXSW_SP_RXL_NO_MARK(IPV6_MLDV1_LISTENER_REPORT, TRAP_TO_CPU, IPV6_MLD,
+ false),
+ MLXSW_SP_RXL_NO_MARK(IPV6_MLDV1_LISTENER_DONE, TRAP_TO_CPU, IPV6_MLD,
+ false),
+ MLXSW_SP_RXL_NO_MARK(IPV6_MLDV2_LISTENER_REPORT, TRAP_TO_CPU, IPV6_MLD,
+ false),
/* L3 traps */
- MLXSW_SP_RXL_NO_MARK(MTUERROR, TRAP_TO_CPU, ROUTER_EXP, false),
- MLXSW_SP_RXL_NO_MARK(TTLERROR, TRAP_TO_CPU, ROUTER_EXP, false),
- MLXSW_SP_RXL_NO_MARK(LBERROR, TRAP_TO_CPU, ROUTER_EXP, false),
- MLXSW_SP_RXL_MARK(OSPF, TRAP_TO_CPU, OSPF, false),
- MLXSW_SP_RXL_NO_MARK(IP2ME, TRAP_TO_CPU, IP2ME, false),
- MLXSW_SP_RXL_NO_MARK(RTR_INGRESS0, TRAP_TO_CPU, REMOTE_ROUTE, false),
- MLXSW_SP_RXL_NO_MARK(HOST_MISS_IPV4, TRAP_TO_CPU, ARP_MISS, false),
- MLXSW_SP_RXL_NO_MARK(BGP_IPV4, TRAP_TO_CPU, BGP_IPV4, false),
+ MLXSW_SP_RXL_MARK(MTUERROR, TRAP_TO_CPU, ROUTER_EXP, false),
+ MLXSW_SP_RXL_MARK(TTLERROR, TRAP_TO_CPU, ROUTER_EXP, false),
+ MLXSW_SP_RXL_MARK(LBERROR, TRAP_TO_CPU, ROUTER_EXP, false),
+ MLXSW_SP_RXL_MARK(IP2ME, TRAP_TO_CPU, IP2ME, false),
+ MLXSW_SP_RXL_MARK(IPV6_UNSPECIFIED_ADDRESS, TRAP_TO_CPU, ROUTER_EXP,
+ false),
+ MLXSW_SP_RXL_MARK(IPV6_LINK_LOCAL_DEST, TRAP_TO_CPU, ROUTER_EXP, false),
+ MLXSW_SP_RXL_MARK(IPV6_LINK_LOCAL_SRC, TRAP_TO_CPU, ROUTER_EXP, false),
+ MLXSW_SP_RXL_MARK(IPV6_ALL_NODES_LINK, TRAP_TO_CPU, ROUTER_EXP, false),
+ MLXSW_SP_RXL_MARK(IPV6_ALL_ROUTERS_LINK, TRAP_TO_CPU, ROUTER_EXP,
+ false),
+ MLXSW_SP_RXL_MARK(IPV4_OSPF, TRAP_TO_CPU, OSPF, false),
+ MLXSW_SP_RXL_MARK(IPV6_OSPF, TRAP_TO_CPU, OSPF, false),
+ MLXSW_SP_RXL_MARK(IPV6_DHCP, TRAP_TO_CPU, DHCP, false),
+ MLXSW_SP_RXL_MARK(RTR_INGRESS0, TRAP_TO_CPU, REMOTE_ROUTE, false),
+ MLXSW_SP_RXL_MARK(IPV4_BGP, TRAP_TO_CPU, BGP, false),
+ MLXSW_SP_RXL_MARK(IPV6_BGP, TRAP_TO_CPU, BGP, false),
+ MLXSW_SP_RXL_MARK(L3_IPV6_ROUTER_SOLICITATION, TRAP_TO_CPU, IPV6_ND,
+ false),
+ MLXSW_SP_RXL_MARK(L3_IPV6_ROUTER_ADVERTISMENT, TRAP_TO_CPU, IPV6_ND,
+ false),
+ MLXSW_SP_RXL_MARK(L3_IPV6_NEIGHBOR_SOLICITATION, TRAP_TO_CPU, IPV6_ND,
+ false),
+ MLXSW_SP_RXL_MARK(L3_IPV6_NEIGHBOR_ADVERTISMENT, TRAP_TO_CPU, IPV6_ND,
+ false),
+ MLXSW_SP_RXL_MARK(L3_IPV6_REDIRECTION, TRAP_TO_CPU, IPV6_ND, false),
+ MLXSW_SP_RXL_MARK(IPV6_MC_LINK_LOCAL_DEST, TRAP_TO_CPU, ROUTER_EXP,
+ false),
+ MLXSW_SP_RXL_MARK(HOST_MISS_IPV4, TRAP_TO_CPU, HOST_MISS, false),
+ MLXSW_SP_RXL_MARK(HOST_MISS_IPV6, TRAP_TO_CPU, HOST_MISS, false),
+ MLXSW_SP_RXL_MARK(ROUTER_ALERT_IPV4, TRAP_TO_CPU, ROUTER_EXP, false),
+ MLXSW_SP_RXL_MARK(ROUTER_ALERT_IPV6, TRAP_TO_CPU, ROUTER_EXP, false),
/* PKT Sample trap */
MLXSW_RXL(mlxsw_sp_rx_listener_sample_func, PKT_SAMPLE, MIRROR_TO_CPU,
false, SP_IP2ME, DISCARD),
burst_size = 7;
break;
case MLXSW_REG_HTGT_TRAP_GROUP_SP_IGMP:
+ case MLXSW_REG_HTGT_TRAP_GROUP_SP_IPV6_MLD:
rate = 16 * 1024;
burst_size = 10;
break;
- case MLXSW_REG_HTGT_TRAP_GROUP_SP_BGP_IPV4:
+ case MLXSW_REG_HTGT_TRAP_GROUP_SP_BGP:
case MLXSW_REG_HTGT_TRAP_GROUP_SP_ARP:
case MLXSW_REG_HTGT_TRAP_GROUP_SP_DHCP:
- case MLXSW_REG_HTGT_TRAP_GROUP_SP_ARP_MISS:
+ case MLXSW_REG_HTGT_TRAP_GROUP_SP_HOST_MISS:
case MLXSW_REG_HTGT_TRAP_GROUP_SP_ROUTER_EXP:
case MLXSW_REG_HTGT_TRAP_GROUP_SP_REMOTE_ROUTE:
+ case MLXSW_REG_HTGT_TRAP_GROUP_SP_IPV6_ND:
rate = 1024;
burst_size = 7;
break;
priority = 5;
tc = 5;
break;
- case MLXSW_REG_HTGT_TRAP_GROUP_SP_BGP_IPV4:
+ case MLXSW_REG_HTGT_TRAP_GROUP_SP_BGP:
case MLXSW_REG_HTGT_TRAP_GROUP_SP_DHCP:
priority = 4;
tc = 4;
break;
case MLXSW_REG_HTGT_TRAP_GROUP_SP_IGMP:
case MLXSW_REG_HTGT_TRAP_GROUP_SP_IP2ME:
+ case MLXSW_REG_HTGT_TRAP_GROUP_SP_IPV6_MLD:
priority = 3;
tc = 3;
break;
case MLXSW_REG_HTGT_TRAP_GROUP_SP_ARP:
+ case MLXSW_REG_HTGT_TRAP_GROUP_SP_IPV6_ND:
priority = 2;
tc = 2;
break;
- case MLXSW_REG_HTGT_TRAP_GROUP_SP_ARP_MISS:
+ case MLXSW_REG_HTGT_TRAP_GROUP_SP_HOST_MISS:
case MLXSW_REG_HTGT_TRAP_GROUP_SP_ROUTER_EXP:
case MLXSW_REG_HTGT_TRAP_GROUP_SP_REMOTE_ROUTE:
priority = 1;
mlxsw_sp_fids_fini(mlxsw_sp);
}
-static struct mlxsw_config_profile mlxsw_sp_config_profile = {
+static const struct mlxsw_config_profile mlxsw_sp_config_profile = {
.used_max_vepa_channels = 1,
.max_vepa_channels = 0,
.used_max_mid = 1,
return -EINVAL;
if (!info->linking)
break;
+ if (netdev_has_any_upper_dev(upper_dev))
+ return -EINVAL;
if (netif_is_lag_master(upper_dev) &&
!mlxsw_sp_master_lag_check(mlxsw_sp, upper_dev,
info->upper_info))
upper_dev = info->upper_dev;
if (!netif_is_bridge_master(upper_dev))
return -EINVAL;
+ if (!info->linking)
+ break;
+ if (netdev_has_any_upper_dev(upper_dev))
+ return -EINVAL;
break;
case NETDEV_CHANGEUPPER:
upper_dev = info->upper_dev;
.priority = 10, /* Must be called before FIB notifier block */
};
+static struct notifier_block mlxsw_sp_inet6addr_nb __read_mostly = {
+ .notifier_call = mlxsw_sp_inet6addr_event,
+};
+
static struct notifier_block mlxsw_sp_router_netevent_nb __read_mostly = {
.notifier_call = mlxsw_sp_router_netevent_event,
};
register_netdevice_notifier(&mlxsw_sp_netdevice_nb);
register_inetaddr_notifier(&mlxsw_sp_inetaddr_nb);
+ register_inet6addr_notifier(&mlxsw_sp_inet6addr_nb);
register_netevent_notifier(&mlxsw_sp_router_netevent_nb);
err = mlxsw_core_driver_register(&mlxsw_sp_driver);
mlxsw_core_driver_unregister(&mlxsw_sp_driver);
err_core_driver_register:
unregister_netevent_notifier(&mlxsw_sp_router_netevent_nb);
+ unregister_inet6addr_notifier(&mlxsw_sp_inet6addr_nb);
unregister_inetaddr_notifier(&mlxsw_sp_inetaddr_nb);
unregister_netdevice_notifier(&mlxsw_sp_netdevice_nb);
return err;
mlxsw_pci_driver_unregister(&mlxsw_sp_pci_driver);
mlxsw_core_driver_unregister(&mlxsw_sp_driver);
unregister_netevent_notifier(&mlxsw_sp_router_netevent_nb);
+ unregister_inet6addr_notifier(&mlxsw_sp_inet6addr_nb);
unregister_inetaddr_notifier(&mlxsw_sp_inetaddr_nb);
unregister_netdevice_notifier(&mlxsw_sp_netdevice_nb);
}
nfp_flower_calculate_key_layers(struct nfp_fl_key_ls *ret_key_ls,
struct tc_cls_flower_offload *flow)
{
- struct flow_dissector_key_control *mask_enc_ctl;
- struct flow_dissector_key_basic *mask_basic;
- struct flow_dissector_key_basic *key_basic;
+ struct flow_dissector_key_basic *mask_basic = NULL;
+ struct flow_dissector_key_basic *key_basic = NULL;
+ struct flow_dissector_key_ip *mask_ip = NULL;
u32 key_layer_two;
u8 key_layer;
int key_size;
- mask_enc_ctl = skb_flow_dissector_target(flow->dissector,
- FLOW_DISSECTOR_KEY_ENC_CONTROL,
- flow->mask);
+ if (dissector_uses_key(flow->dissector,
+ FLOW_DISSECTOR_KEY_ENC_CONTROL)) {
+ struct flow_dissector_key_control *mask_enc_ctl =
+ skb_flow_dissector_target(flow->dissector,
+ FLOW_DISSECTOR_KEY_ENC_CONTROL,
+ flow->mask);
+ /* We are expecting a tunnel. For now we ignore offloading. */
+ if (mask_enc_ctl->addr_type)
+ return -EOPNOTSUPP;
+ }
+
+ if (dissector_uses_key(flow->dissector, FLOW_DISSECTOR_KEY_BASIC)) {
+ mask_basic = skb_flow_dissector_target(flow->dissector,
+ FLOW_DISSECTOR_KEY_BASIC,
+ flow->mask);
- mask_basic = skb_flow_dissector_target(flow->dissector,
- FLOW_DISSECTOR_KEY_BASIC,
- flow->mask);
+ key_basic = skb_flow_dissector_target(flow->dissector,
+ FLOW_DISSECTOR_KEY_BASIC,
+ flow->key);
+ }
+
+ if (dissector_uses_key(flow->dissector, FLOW_DISSECTOR_KEY_IP))
+ mask_ip = skb_flow_dissector_target(flow->dissector,
+ FLOW_DISSECTOR_KEY_IP,
+ flow->mask);
- key_basic = skb_flow_dissector_target(flow->dissector,
- FLOW_DISSECTOR_KEY_BASIC,
- flow->key);
key_layer_two = 0;
key_layer = NFP_FLOWER_LAYER_PORT | NFP_FLOWER_LAYER_MAC;
key_size = sizeof(struct nfp_flower_meta_one) +
sizeof(struct nfp_flower_in_port) +
sizeof(struct nfp_flower_mac_mpls);
- /* We are expecting a tunnel. For now we ignore offloading. */
- if (mask_enc_ctl->addr_type)
- return -EOPNOTSUPP;
-
- if (mask_basic->n_proto) {
+ if (mask_basic && mask_basic->n_proto) {
/* Ethernet type is present in the key. */
switch (key_basic->n_proto) {
case cpu_to_be16(ETH_P_IP):
+ if (mask_ip && mask_ip->tos)
+ return -EOPNOTSUPP;
+ if (mask_ip && mask_ip->ttl)
+ return -EOPNOTSUPP;
key_layer |= NFP_FLOWER_LAYER_IPV4;
key_size += sizeof(struct nfp_flower_ipv4);
break;
case cpu_to_be16(ETH_P_IPV6):
+ if (mask_ip && mask_ip->tos)
+ return -EOPNOTSUPP;
+ if (mask_ip && mask_ip->ttl)
+ return -EOPNOTSUPP;
key_layer |= NFP_FLOWER_LAYER_IPV6;
key_size += sizeof(struct nfp_flower_ipv6);
break;
case cpu_to_be16(ETH_P_ARP):
return -EOPNOTSUPP;
+ /* Currently we do not offload MPLS. */
+ case cpu_to_be16(ETH_P_MPLS_UC):
+ case cpu_to_be16(ETH_P_MPLS_MC):
+ return -EOPNOTSUPP;
+
/* Will be included in layer 2. */
case cpu_to_be16(ETH_P_8021Q):
break;
}
}
- if (mask_basic->ip_proto) {
+ if (mask_basic && mask_basic->ip_proto) {
/* Ethernet type is present in the key. */
switch (key_basic->ip_proto) {
case IPPROTO_TCP:
}
int nfp_flower_setup_tc(struct nfp_app *app, struct net_device *netdev,
- u32 handle, __be16 proto, struct tc_to_netdev *tc)
+ enum tc_setup_type type, void *type_data)
{
- if (TC_H_MAJ(handle) != TC_H_MAJ(TC_H_INGRESS))
- return -EOPNOTSUPP;
+ struct tc_cls_flower_offload *cls_flower = type_data;
- if (!eth_proto_is_802_3(proto))
+ if (type != TC_SETUP_CLSFLOWER ||
+ !is_classid_clsact_ingress(cls_flower->common.classid) ||
+ !eth_proto_is_802_3(cls_flower->common.protocol) ||
+ cls_flower->common.chain_index)
return -EOPNOTSUPP;
- if (tc->type != TC_SETUP_CLSFLOWER)
- return -EINVAL;
-
- return nfp_flower_repr_offload(app, netdev, tc->cls_flower);
+ return nfp_flower_repr_offload(app, netdev, cls_flower);
}
struct nfp_pf *pf = pci_get_drvdata(pdev);
int err;
- mutex_lock(&pf->lock);
-
if (num_vfs > pf->limit_vfs) {
nfp_info(pf->cpp, "Firmware limits number of VFs to %u\n",
pf->limit_vfs);
- err = -EINVAL;
- goto err_unlock;
+ return -EINVAL;
}
err = pci_enable_sriov(pdev, num_vfs);
if (err) {
dev_warn(&pdev->dev, "Failed to enable PCI SR-IOV: %d\n", err);
- goto err_unlock;
+ return err;
}
+ mutex_lock(&pf->lock);
+
err = nfp_app_sriov_enable(pf->app, num_vfs);
if (err) {
dev_warn(&pdev->dev,
return num_vfs;
err_sriov_disable:
- pci_disable_sriov(pdev);
- err_unlock:
mutex_unlock(&pf->lock);
+ pci_disable_sriov(pdev);
return err;
#endif
return 0;
pf->num_vfs = 0;
+ mutex_unlock(&pf->lock);
+
pci_disable_sriov(pdev);
dev_dbg(&pdev->dev, "Removed VFs.\n");
-
- mutex_unlock(&pf->lock);
#endif
return 0;
}
return nfp_pcie_sriov_enable(pdev, num_vfs);
}
+static const struct firmware *
+nfp_net_fw_request(struct pci_dev *pdev, struct nfp_pf *pf, const char *name)
+{
+ const struct firmware *fw = NULL;
+ int err;
+
+ err = request_firmware_direct(&fw, name, &pdev->dev);
+ nfp_info(pf->cpp, " %s: %s\n",
+ name, err ? "not found" : "found, loading...");
+ if (err)
+ return NULL;
+
+ return fw;
+}
+
/**
* nfp_net_fw_find() - Find the correct firmware image for netdev mode
* @pdev: PCI Device structure
static const struct firmware *
nfp_net_fw_find(struct pci_dev *pdev, struct nfp_pf *pf)
{
- const struct firmware *fw = NULL;
struct nfp_eth_table_port *port;
+ const struct firmware *fw;
const char *fw_model;
char fw_name[256];
- int spc, err = 0;
- int i, j;
-
+ const u8 *serial;
+ u16 interface;
+ int spc, i, j;
+
+ nfp_info(pf->cpp, "Looking for firmware file in order of priority:\n");
+
+ /* First try to find a firmware image specific for this device */
+ interface = nfp_cpp_interface(pf->cpp);
+ nfp_cpp_serial(pf->cpp, &serial);
+ sprintf(fw_name, "netronome/serial-%pMF-%02hhx-%02hhx.nffw",
+ serial, interface >> 8, interface & 0xff);
+ fw = nfp_net_fw_request(pdev, pf, fw_name);
+ if (fw)
+ return fw;
+
+ /* Then try the PCI name */
+ sprintf(fw_name, "netronome/pci-%s.nffw", pci_name(pdev));
+ fw = nfp_net_fw_request(pdev, pf, fw_name);
+ if (fw)
+ return fw;
+
+ /* Finally try the card type and media */
if (!pf->eth_tbl) {
dev_err(&pdev->dev, "Error: can't identify media config\n");
return NULL;
if (spc <= 0)
return NULL;
- err = request_firmware(&fw, fw_name, &pdev->dev);
- if (err)
- return NULL;
-
- dev_info(&pdev->dev, "Loading FW image: %s\n", fw_name);
-
- return fw;
+ return nfp_net_fw_request(pdev, pf, fw_name);
}
/**
#include "nfp_app.h"
#include "nfp_net_ctrl.h"
#include "nfp_net.h"
+#include "nfp_net_sriov.h"
#include "nfp_port.h"
/**
netdev_tx_sent_queue(nd_q, txbuf->real_len);
+ skb_tx_timestamp(skb);
+
tx_ring->wr_p += nr_frags + 1;
if (nfp_net_tx_ring_should_stop(tx_ring))
nfp_net_tx_ring_stop(nd_q, tx_ring);
if (!skb->xmit_more || netif_xmit_stopped(nd_q))
nfp_net_tx_xmit_more_flush(tx_ring);
- skb_tx_timestamp(skb);
-
return NETDEV_TX_OK;
err_unmap:
continue;
}
+ nfp_net_dma_unmap_rx(dp, rxbuf->dma_addr);
+
+ nfp_net_rx_give_one(dp, rx_ring, new_frag, new_dma_addr);
+
if (likely(!meta.portid)) {
netdev = dp->netdev;
} else {
nn = netdev_priv(dp->netdev);
netdev = nfp_app_repr_get(nn->app, meta.portid);
if (unlikely(!netdev)) {
- nfp_net_rx_drop(dp, r_vec, rx_ring, rxbuf, skb);
+ nfp_net_rx_drop(dp, r_vec, rx_ring, NULL, skb);
continue;
}
nfp_repr_inc_rx_stats(netdev, pkt_len);
}
- nfp_net_dma_unmap_rx(dp, rxbuf->dma_addr);
-
- nfp_net_rx_give_one(dp, rx_ring, new_frag, new_dma_addr);
-
skb_reserve(skb, pkt_off);
skb_put(skb, pkt_len);
/* Step 2: Tell NFP
*/
nfp_net_clear_config_and_disable(nn);
+ nfp_port_configure(netdev, false);
/* Step 3: Free resources
*/
goto err_free_all;
/* Step 2: Configure the NFP
+ * - Ifup the physical interface if it exists
* - Enable rings from 0 to tx_rings/rx_rings - 1.
* - Write MAC address (in case it changed)
* - Set the MTU
* - Set the Freelist buffer size
* - Enable the FW
*/
- err = nfp_net_set_config_and_enable(nn);
+ err = nfp_port_configure(netdev, true);
if (err)
goto err_free_all;
+ err = nfp_net_set_config_and_enable(nn);
+ if (err)
+ goto err_port_disable;
+
/* Step 3: Enable for kernel
* - put some freelist descriptors on each RX ring
* - enable NAPI on each ring
return 0;
+err_port_disable:
+ nfp_port_configure(netdev, false);
err_free_all:
nfp_net_close_free_all(nn);
return err;
.ndo_get_stats64 = nfp_net_stat64,
.ndo_vlan_rx_add_vid = nfp_net_vlan_rx_add_vid,
.ndo_vlan_rx_kill_vid = nfp_net_vlan_rx_kill_vid,
+ .ndo_set_vf_mac = nfp_app_set_vf_mac,
+ .ndo_set_vf_vlan = nfp_app_set_vf_vlan,
+ .ndo_set_vf_spoofchk = nfp_app_set_vf_spoofchk,
+ .ndo_get_vf_config = nfp_app_get_vf_config,
+ .ndo_set_vf_link_state = nfp_app_set_vf_link_state,
.ndo_setup_tc = nfp_port_setup_tc,
.ndo_tx_timeout = nfp_net_tx_timeout,
.ndo_set_rx_mode = nfp_net_set_rx_mode,
#include "nfpcore/nfp6000_pcie.h"
#include "nfp_app.h"
#include "nfp_net_ctrl.h"
+#include "nfp_net_sriov.h"
#include "nfp_net.h"
#include "nfp_main.h"
#include "nfp_port.h"
NFP_PF_CSR_SLICE_SIZE,
&pf->ctrl_vnic_bar);
if (IS_ERR(ctrl_bar)) {
- nfp_err(pf->cpp, "Failed to find data vNIC memory symbol\n");
+ nfp_err(pf->cpp, "Failed to find ctrl vNIC memory symbol\n");
err = PTR_ERR(ctrl_bar);
goto err_app_clean;
}
{
int err;
- err = nfp_net_pf_app_start_ctrl(pf);
- if (err)
- return err;
-
err = nfp_app_start(pf->app, pf->ctrl_vnic);
if (err)
- goto err_ctrl_stop;
+ return err;
if (pf->num_vfs) {
err = nfp_app_sriov_enable(pf->app, pf->num_vfs);
err_app_stop:
nfp_app_stop(pf->app);
- err_ctrl_stop:
- nfp_net_pf_app_stop_ctrl(pf);
return err;
}
if (pf->num_vfs)
nfp_app_sriov_disable(pf->app);
nfp_app_stop(pf->app);
- nfp_net_pf_app_stop_ctrl(pf);
}
static void nfp_net_pci_unmap_mem(struct nfp_pf *pf)
{
+ if (pf->vfcfg_tbl2_area)
+ nfp_cpp_area_release_free(pf->vfcfg_tbl2_area);
if (pf->vf_cfg_bar)
nfp_cpp_area_release_free(pf->vf_cfg_bar);
if (pf->mac_stats_bar)
int err;
min_size = pf->max_data_vnics * NFP_PF_CSR_SLICE_SIZE;
- mem = nfp_net_pf_map_rtsym(pf, "net.ctrl", "_pf%d_net_bar0",
+ mem = nfp_net_pf_map_rtsym(pf, "net.bar0", "_pf%d_net_bar0",
min_size, &pf->data_vnic_bar);
if (IS_ERR(mem)) {
nfp_err(pf->cpp, "Failed to find data vNIC memory symbol\n");
pf->vf_cfg_mem = NULL;
}
+ min_size = NFP_NET_VF_CFG_SZ * pf->limit_vfs + NFP_NET_VF_CFG_MB_SZ;
+ pf->vfcfg_tbl2 = nfp_net_pf_map_rtsym(pf, "net.vfcfg_tbl2",
+ "_pf%d_net_vf_cfg2",
+ min_size, &pf->vfcfg_tbl2_area);
+ if (IS_ERR(pf->vfcfg_tbl2)) {
+ if (PTR_ERR(pf->vfcfg_tbl2) != -ENOENT) {
+ err = PTR_ERR(pf->vfcfg_tbl2);
+ goto err_unmap_vf_cfg;
+ }
+ pf->vfcfg_tbl2 = NULL;
+ }
+
mem = nfp_cpp_map_area(pf->cpp, "net.qc", 0, 0,
NFP_PCIE_QUEUE(0), NFP_QCP_QUEUE_AREA_SZ,
&pf->qc_area);
if (IS_ERR(mem)) {
nfp_err(pf->cpp, "Failed to map Queue Controller area.\n");
err = PTR_ERR(mem);
- goto err_unmap_vf_cfg;
+ goto err_unmap_vfcfg_tbl2;
}
return 0;
+err_unmap_vfcfg_tbl2:
+ if (pf->vfcfg_tbl2_area)
+ nfp_cpp_area_release_free(pf->vfcfg_tbl2_area);
err_unmap_vf_cfg:
if (pf->vf_cfg_bar)
nfp_cpp_area_release_free(pf->vf_cfg_bar);
static void nfp_net_pci_remove_finish(struct nfp_pf *pf)
{
- nfp_net_pf_app_stop(pf);
+ nfp_net_pf_app_stop_ctrl(pf);
/* stop app first, to avoid double free of ctrl vNIC's ddir */
nfp_net_debugfs_dir_clean(&pf->ddir);
{
struct nfp_net_fw_version fw_ver;
u8 __iomem *ctrl_bar, *qc_bar;
+ struct nfp_net *nn;
int stride;
int err;
if (!pf->rtbl) {
nfp_err(pf->cpp, "No %s, giving up.\n",
pf->fw_loaded ? "symbol table" : "firmware found");
- return -EPROBE_DEFER;
+ return -EINVAL;
}
mutex_lock(&pf->lock);
if (err)
goto err_free_vnics;
- err = nfp_net_pf_app_start(pf);
+ err = nfp_net_pf_app_start_ctrl(pf);
if (err)
goto err_free_irqs;
if (err)
goto err_stop_app;
+ err = nfp_net_pf_app_start(pf);
+ if (err)
+ goto err_clean_vnics;
+
mutex_unlock(&pf->lock);
return 0;
+ err_clean_vnics:
+ list_for_each_entry(nn, &pf->vnics, vnic_list)
+ if (nfp_net_is_data_vnic(nn))
+ nfp_net_pf_clean_vnic(pf, nn);
err_stop_app:
- nfp_net_pf_app_stop(pf);
+ nfp_net_pf_app_stop_ctrl(pf);
err_free_irqs:
nfp_net_pf_free_irqs(pf);
err_free_vnics:
if (list_empty(&pf->vnics))
goto out;
+ nfp_net_pf_app_stop(pf);
+
list_for_each_entry(nn, &pf->vnics, vnic_list)
if (nfp_net_is_data_vnic(nn))
nfp_net_pf_clean_vnic(pf, nn);
xaui_direct_valid = xaui_indirect_valid = 1;
/* The XAUI needs to be read out per port */
- if (qdev->func & 1) {
- /* We are NIC 2 */
- status = ql_read_other_func_serdes_reg(qdev,
- XG_SERDES_XAUI_HSS_PCS_START, &temp);
- if (status)
- temp = XG_SERDES_ADDR_XAUI_PWR_DOWN;
- if ((temp & XG_SERDES_ADDR_XAUI_PWR_DOWN) ==
- XG_SERDES_ADDR_XAUI_PWR_DOWN)
- xaui_indirect_valid = 0;
-
- status = ql_read_serdes_reg(qdev,
- XG_SERDES_XAUI_HSS_PCS_START, &temp);
- if (status)
- temp = XG_SERDES_ADDR_XAUI_PWR_DOWN;
-
- if ((temp & XG_SERDES_ADDR_XAUI_PWR_DOWN) ==
- XG_SERDES_ADDR_XAUI_PWR_DOWN)
- xaui_direct_valid = 0;
- } else {
- /* We are NIC 1 */
- status = ql_read_other_func_serdes_reg(qdev,
- XG_SERDES_XAUI_HSS_PCS_START, &temp);
- if (status)
- temp = XG_SERDES_ADDR_XAUI_PWR_DOWN;
- if ((temp & XG_SERDES_ADDR_XAUI_PWR_DOWN) ==
- XG_SERDES_ADDR_XAUI_PWR_DOWN)
- xaui_indirect_valid = 0;
-
- status = ql_read_serdes_reg(qdev,
- XG_SERDES_XAUI_HSS_PCS_START, &temp);
- if (status)
- temp = XG_SERDES_ADDR_XAUI_PWR_DOWN;
- if ((temp & XG_SERDES_ADDR_XAUI_PWR_DOWN) ==
- XG_SERDES_ADDR_XAUI_PWR_DOWN)
- xaui_direct_valid = 0;
- }
+ status = ql_read_other_func_serdes_reg(qdev,
+ XG_SERDES_XAUI_HSS_PCS_START, &temp);
+ if (status)
+ temp = XG_SERDES_ADDR_XAUI_PWR_DOWN;
+
+ if ((temp & XG_SERDES_ADDR_XAUI_PWR_DOWN) ==
+ XG_SERDES_ADDR_XAUI_PWR_DOWN)
+ xaui_indirect_valid = 0;
+
+ status = ql_read_serdes_reg(qdev, XG_SERDES_XAUI_HSS_PCS_START, &temp);
+
+ if (status)
+ temp = XG_SERDES_ADDR_XAUI_PWR_DOWN;
+
+ if ((temp & XG_SERDES_ADDR_XAUI_PWR_DOWN) ==
+ XG_SERDES_ADDR_XAUI_PWR_DOWN)
+ xaui_direct_valid = 0;
/*
* XFI register is shared so only need to read one
seg_hdr->cookie = MPI_COREDUMP_COOKIE;
seg_hdr->segNum = seg_number;
seg_hdr->segSize = seg_size;
- memcpy(seg_hdr->description, desc, (sizeof(seg_hdr->description)) - 1);
+ strncpy(seg_hdr->description, desc, (sizeof(seg_hdr->description)) - 1);
}
/*
#include <linux/if_vlan.h>
#include <linux/in.h>
#include <linux/slab.h>
+#include <linux/rtnetlink.h>
+#include <linux/netpoll.h>
+
#include <net/arp.h>
#include <net/route.h>
#include <net/sock.h>
#include "hyperv_net.h"
-#define RING_SIZE_MIN 64
+#define RING_SIZE_MIN 64
+#define NETVSC_MIN_TX_SECTIONS 10
+#define NETVSC_DEFAULT_TX 192 /* ~1M */
+#define NETVSC_MIN_RX_SECTIONS 10 /* ~64K */
+#define NETVSC_DEFAULT_RX 2048 /* ~4M */
+
#define LINKCHANGE_INT (2 * HZ)
+#define VF_TAKEOVER_INT (HZ / 10)
static int ring_size = 128;
module_param(ring_size, int, S_IRUGO);
static int netvsc_open(struct net_device *net)
{
struct net_device_context *ndev_ctx = netdev_priv(net);
- struct netvsc_device *nvdev = ndev_ctx->nvdev;
+ struct net_device *vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
+ struct netvsc_device *nvdev = rtnl_dereference(ndev_ctx->nvdev);
struct rndis_device *rdev;
int ret = 0;
netif_tx_wake_all_queues(net);
rdev = nvdev->extension;
- if (!rdev->link_state && !ndev_ctx->datapath)
+
+ if (!rdev->link_state)
netif_carrier_on(net);
- return ret;
+ if (vf_netdev) {
+ /* Setting synthetic device up transparently sets
+ * slave as up. If open fails, then slave will be
+ * still be offline (and not used).
+ */
+ ret = dev_open(vf_netdev);
+ if (ret)
+ netdev_warn(net,
+ "unable to open slave: %s: %d\n",
+ vf_netdev->name, ret);
+ }
+ return 0;
}
static int netvsc_close(struct net_device *net)
{
struct net_device_context *net_device_ctx = netdev_priv(net);
+ struct net_device *vf_netdev
+ = rtnl_dereference(net_device_ctx->vf_netdev);
struct netvsc_device *nvdev = rtnl_dereference(net_device_ctx->nvdev);
- int ret;
+ int ret = 0;
u32 aread, i, msec = 10, retry = 0, retry_max = 20;
struct vmbus_channel *chn;
netif_tx_disable(net);
+ /* No need to close rndis filter if it is removed already */
+ if (!nvdev)
+ goto out;
+
ret = rndis_filter_close(nvdev);
if (ret != 0) {
netdev_err(net, "unable to close device (ret %d).\n", ret);
ret = -ETIMEDOUT;
}
+out:
+ if (vf_netdev)
+ dev_close(vf_netdev);
+
return ret;
}
static void *init_ppi_data(struct rndis_message *msg, u32 ppi_size,
- int pkt_type)
+ int pkt_type)
{
struct rndis_packet *rndis_pkt;
struct rndis_per_packet_info *ppi;
return ppi;
}
-/* Azure hosts don't support non-TCP port numbers in hashing yet. We compute
- * hash for non-TCP traffic with only IP numbers.
+/* Azure hosts don't support non-TCP port numbers in hashing for fragmented
+ * packets. We can use ethtool to change UDP hash level when necessary.
*/
-static inline u32 netvsc_get_hash(struct sk_buff *skb, struct sock *sk)
+static inline u32 netvsc_get_hash(
+ struct sk_buff *skb,
+ const struct net_device_context *ndc)
{
struct flow_keys flow;
u32 hash;
if (!skb_flow_dissect_flow_keys(skb, &flow, 0))
return 0;
- if (flow.basic.ip_proto == IPPROTO_TCP) {
+ if (flow.basic.ip_proto == IPPROTO_TCP ||
+ (flow.basic.ip_proto == IPPROTO_UDP &&
+ ((flow.basic.n_proto == htons(ETH_P_IP) && ndc->udp4_l4_hash) ||
+ (flow.basic.n_proto == htons(ETH_P_IPV6) &&
+ ndc->udp6_l4_hash)))) {
return skb_get_hash(skb);
} else {
if (flow.basic.n_proto == htons(ETH_P_IP))
struct sock *sk = skb->sk;
int q_idx;
- q_idx = ndc->tx_send_table[netvsc_get_hash(skb, sk) &
+ q_idx = ndc->tx_send_table[netvsc_get_hash(skb, ndc) &
(VRSS_SEND_TAB_SIZE - 1)];
/* If queue index changed record the new value */
*
* TODO support XPS - but get_xps_queue not exported
*/
-static u16 netvsc_select_queue(struct net_device *ndev, struct sk_buff *skb,
- void *accel_priv, select_queue_fallback_t fallback)
+static u16 netvsc_pick_tx(struct net_device *ndev, struct sk_buff *skb)
{
- unsigned int num_tx_queues = ndev->real_num_tx_queues;
int q_idx = sk_tx_queue_get(skb->sk);
- if (q_idx < 0 || skb->ooo_okay) {
+ if (q_idx < 0 || skb->ooo_okay || q_idx >= ndev->real_num_tx_queues) {
/* If forwarding a packet, we use the recorded queue when
* available for better cache locality.
*/
q_idx = netvsc_get_tx_queue(ndev, skb, q_idx);
}
- while (unlikely(q_idx >= num_tx_queues))
- q_idx -= num_tx_queues;
-
return q_idx;
}
+static u16 netvsc_select_queue(struct net_device *ndev, struct sk_buff *skb,
+ void *accel_priv,
+ select_queue_fallback_t fallback)
+{
+ struct net_device_context *ndc = netdev_priv(ndev);
+ struct net_device *vf_netdev;
+ u16 txq;
+
+ rcu_read_lock();
+ vf_netdev = rcu_dereference(ndc->vf_netdev);
+ if (vf_netdev) {
+ txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) : 0;
+ qdisc_skb_cb(skb)->slave_dev_queue_mapping = skb->queue_mapping;
+ } else {
+ txq = netvsc_pick_tx(ndev, skb);
+ }
+ rcu_read_unlock();
+
+ while (unlikely(txq >= ndev->real_num_tx_queues))
+ txq -= ndev->real_num_tx_queues;
+
+ return txq;
+}
+
static u32 fill_pg_buf(struct page *page, u32 offset, u32 len,
- struct hv_page_buffer *pb)
+ struct hv_page_buffer *pb)
{
int j = 0;
static u32 init_page_array(void *hdr, u32 len, struct sk_buff *skb,
struct hv_netvsc_packet *packet,
- struct hv_page_buffer **page_buf)
+ struct hv_page_buffer *pb)
{
- struct hv_page_buffer *pb = *page_buf;
u32 slots_used = 0;
char *data = skb->data;
int frags = skb_shinfo(skb)->nr_frags;
* 2. skb linear data
* 3. skb fragment data
*/
- if (hdr != NULL)
- slots_used += fill_pg_buf(virt_to_page(hdr),
- offset_in_page(hdr),
- len, &pb[slots_used]);
+ slots_used += fill_pg_buf(virt_to_page(hdr),
+ offset_in_page(hdr),
+ len, &pb[slots_used]);
packet->rmsg_size = len;
packet->rmsg_pgcnt = slots_used;
if (ip6->nexthdr == IPPROTO_TCP)
return TRANSPORT_INFO_IPV6_TCP;
- else if (ipv6_hdr(skb)->nexthdr == IPPROTO_UDP)
+ else if (ip6->nexthdr == IPPROTO_UDP)
return TRANSPORT_INFO_IPV6_UDP;
}
return TRANSPORT_INFO_NOT_IP;
}
+/* Send skb on the slave VF device. */
+static int netvsc_vf_xmit(struct net_device *net, struct net_device *vf_netdev,
+ struct sk_buff *skb)
+{
+ struct net_device_context *ndev_ctx = netdev_priv(net);
+ unsigned int len = skb->len;
+ int rc;
+
+ skb->dev = vf_netdev;
+ skb->queue_mapping = qdisc_skb_cb(skb)->slave_dev_queue_mapping;
+
+ rc = dev_queue_xmit(skb);
+ if (likely(rc == NET_XMIT_SUCCESS || rc == NET_XMIT_CN)) {
+ struct netvsc_vf_pcpu_stats *pcpu_stats
+ = this_cpu_ptr(ndev_ctx->vf_stats);
+
+ u64_stats_update_begin(&pcpu_stats->syncp);
+ pcpu_stats->tx_packets++;
+ pcpu_stats->tx_bytes += len;
+ u64_stats_update_end(&pcpu_stats->syncp);
+ } else {
+ this_cpu_inc(ndev_ctx->vf_stats->tx_dropped);
+ }
+
+ return rc;
+}
+
static int netvsc_start_xmit(struct sk_buff *skb, struct net_device *net)
{
struct net_device_context *net_device_ctx = netdev_priv(net);
unsigned int num_data_pgs;
struct rndis_message *rndis_msg;
struct rndis_packet *rndis_pkt;
+ struct net_device *vf_netdev;
u32 rndis_msg_size;
struct rndis_per_packet_info *ppi;
u32 hash;
- struct hv_page_buffer page_buf[MAX_PAGE_BUFFER_COUNT];
- struct hv_page_buffer *pb = page_buf;
+ struct hv_page_buffer pb[MAX_PAGE_BUFFER_COUNT];
+
+ /* if VF is present and up then redirect packets
+ * already called with rcu_read_lock_bh
+ */
+ vf_netdev = rcu_dereference_bh(net_device_ctx->vf_netdev);
+ if (vf_netdev && netif_running(vf_netdev) &&
+ !netpoll_tx_running(net))
+ return netvsc_vf_xmit(net, vf_netdev, skb);
/* We will atmost need two pages to describe the rndis
* header. We can only transmit MAX_PAGE_BUFFER_COUNT number
rndis_msg_size += NDIS_VLAN_PPI_SIZE;
ppi = init_ppi_data(rndis_msg, NDIS_VLAN_PPI_SIZE,
- IEEE_8021Q_INFO);
- vlan = (struct ndis_pkt_8021q_info *)((void *)ppi +
- ppi->ppi_offset);
+ IEEE_8021Q_INFO);
+
+ vlan = (void *)ppi + ppi->ppi_offset;
vlan->vlanid = skb->vlan_tci & VLAN_VID_MASK;
vlan->pri = (skb->vlan_tci & VLAN_PRIO_MASK) >>
VLAN_PRIO_SHIFT;
ppi = init_ppi_data(rndis_msg, NDIS_LSO_PPI_SIZE,
TCP_LARGESEND_PKTINFO);
- lso_info = (struct ndis_tcp_lso_info *)((void *)ppi +
- ppi->ppi_offset);
+ lso_info = (void *)ppi + ppi->ppi_offset;
lso_info->lso_v2_transmit.type = NDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE;
if (skb->protocol == htons(ETH_P_IP)) {
rndis_msg->msg_len += rndis_msg_size;
packet->total_data_buflen = rndis_msg->msg_len;
packet->page_buf_cnt = init_page_array(rndis_msg, rndis_msg_size,
- skb, packet, &pb);
+ skb, packet, pb);
/* timestamp packet in software */
skb_tx_timestamp(skb);
- ret = netvsc_send(net_device_ctx->device_ctx, packet,
- rndis_msg, &pb, skb);
+
+ ret = netvsc_send(net_device_ctx, packet, rndis_msg, pb, skb);
if (likely(ret == 0))
return NETDEV_TX_OK;
++net_device_ctx->eth_stats.tx_no_memory;
goto drop;
}
+
/*
* netvsc_linkstatus_callback - Link up/down notification
*/
if (indicate->status == RNDIS_STATUS_LINK_SPEED_CHANGE) {
u32 speed;
- speed = *(u32 *)((void *)indicate + indicate->
- status_buf_offset) / 10000;
+ speed = *(u32 *)((void *)indicate
+ + indicate->status_buf_offset) / 10000;
ndev_ctx->speed = speed;
return;
}
struct netvsc_device *net_device;
u16 q_idx = channel->offermsg.offer.sub_channel_index;
struct netvsc_channel *nvchan;
- struct net_device *vf_netdev;
struct sk_buff *skb;
struct netvsc_stats *rx_stats;
if (net->reg_state != NETREG_REGISTERED)
return NVSP_STAT_FAIL;
- /*
- * If necessary, inject this packet into the VF interface.
- * On Hyper-V, multicast and brodcast packets are only delivered
- * to the synthetic interface (after subjecting these to
- * policy filters on the host). Deliver these via the VF
- * interface in the guest.
- */
rcu_read_lock();
net_device = rcu_dereference(net_device_ctx->nvdev);
if (unlikely(!net_device))
goto drop;
nvchan = &net_device->chan_table[q_idx];
- vf_netdev = rcu_dereference(net_device_ctx->vf_netdev);
- if (vf_netdev && (vf_netdev->flags & IFF_UP))
- net = vf_netdev;
/* Allocate a skb - TODO direct I/O to pages? */
skb = netvsc_alloc_recv_skb(net, &nvchan->napi,
return NVSP_STAT_FAIL;
}
- if (net != vf_netdev)
- skb_record_rx_queue(skb, q_idx);
+ skb_record_rx_queue(skb, q_idx);
/*
* Even if injecting the packet, record the statistics
}
}
-static int netvsc_set_queues(struct net_device *net, struct hv_device *dev,
- u32 num_chn)
-{
- struct netvsc_device_info device_info;
- int ret;
-
- memset(&device_info, 0, sizeof(device_info));
- device_info.num_chn = num_chn;
- device_info.ring_size = ring_size;
- device_info.max_num_vrss_chns = num_chn;
-
- ret = rndis_filter_device_add(dev, &device_info);
- if (ret)
- return ret;
-
- ret = netif_set_real_num_tx_queues(net, num_chn);
- if (ret)
- return ret;
-
- ret = netif_set_real_num_rx_queues(net, num_chn);
-
- return ret;
-}
-
static int netvsc_set_channels(struct net_device *net,
struct ethtool_channels *channels)
{
struct net_device_context *net_device_ctx = netdev_priv(net);
struct hv_device *dev = net_device_ctx->device_ctx;
struct netvsc_device *nvdev = rtnl_dereference(net_device_ctx->nvdev);
- unsigned int count = channels->combined_count;
- bool was_running;
- int ret;
+ unsigned int orig, count = channels->combined_count;
+ struct netvsc_device_info device_info;
+ bool was_opened;
+ int ret = 0;
/* We do not support separate count for rx, tx, or other */
if (count == 0 ||
if (count > nvdev->max_chn)
return -EINVAL;
- was_running = netif_running(net);
- if (was_running) {
- ret = netvsc_close(net);
- if (ret)
- return ret;
- }
+ orig = nvdev->num_chn;
+ was_opened = rndis_filter_opened(nvdev);
+ if (was_opened)
+ rndis_filter_close(nvdev);
+
+ memset(&device_info, 0, sizeof(device_info));
+ device_info.num_chn = count;
+ device_info.ring_size = ring_size;
+ device_info.send_sections = nvdev->send_section_cnt;
+ device_info.recv_sections = nvdev->recv_section_cnt;
rndis_filter_device_remove(dev, nvdev);
- ret = netvsc_set_queues(net, dev, count);
- if (ret == 0)
- nvdev->num_chn = count;
- else
- netvsc_set_queues(net, dev, nvdev->num_chn);
+ nvdev = rndis_filter_device_add(dev, &device_info);
+ if (!IS_ERR(nvdev)) {
+ netif_set_real_num_tx_queues(net, nvdev->num_chn);
+ netif_set_real_num_rx_queues(net, nvdev->num_chn);
+ } else {
+ ret = PTR_ERR(nvdev);
+ device_info.num_chn = orig;
+ nvdev = rndis_filter_device_add(dev, &device_info);
- if (was_running)
- ret = netvsc_open(net);
+ if (IS_ERR(nvdev)) {
+ netdev_err(net, "restoring channel setting failed: %ld\n",
+ PTR_ERR(nvdev));
+ return ret;
+ }
+ }
+
+ if (was_opened)
+ rndis_filter_open(nvdev);
/* We may have missed link change notifications */
+ net_device_ctx->last_reconfig = 0;
schedule_delayed_work(&net_device_ctx->dwork, 0);
return ret;
{
struct net_device_context *ndc = netdev_priv(dev);
+ ndc->udp4_l4_hash = true;
+ ndc->udp6_l4_hash = true;
+
ndc->speed = SPEED_UNKNOWN;
ndc->duplex = DUPLEX_FULL;
}
static int netvsc_change_mtu(struct net_device *ndev, int mtu)
{
struct net_device_context *ndevctx = netdev_priv(ndev);
+ struct net_device *vf_netdev = rtnl_dereference(ndevctx->vf_netdev);
struct netvsc_device *nvdev = rtnl_dereference(ndevctx->nvdev);
struct hv_device *hdev = ndevctx->device_ctx;
+ int orig_mtu = ndev->mtu;
struct netvsc_device_info device_info;
- bool was_running;
+ bool was_opened;
int ret = 0;
if (!nvdev || nvdev->destroy)
return -ENODEV;
- was_running = netif_running(ndev);
- if (was_running) {
- ret = netvsc_close(ndev);
+ /* Change MTU of underlying VF netdev first. */
+ if (vf_netdev) {
+ ret = dev_set_mtu(vf_netdev, mtu);
if (ret)
return ret;
}
+ netif_device_detach(ndev);
+ was_opened = rndis_filter_opened(nvdev);
+ if (was_opened)
+ rndis_filter_close(nvdev);
+
memset(&device_info, 0, sizeof(device_info));
device_info.ring_size = ring_size;
device_info.num_chn = nvdev->num_chn;
- device_info.max_num_vrss_chns = nvdev->num_chn;
+ device_info.send_sections = nvdev->send_section_cnt;
+ device_info.recv_sections = nvdev->recv_section_cnt;
rndis_filter_device_remove(hdev, nvdev);
- /* 'nvdev' has been freed in rndis_filter_device_remove() ->
- * netvsc_device_remove () -> free_netvsc_device().
- * We mustn't access it before it's re-created in
- * rndis_filter_device_add() -> netvsc_device_add().
- */
-
ndev->mtu = mtu;
- rndis_filter_device_add(hdev, &device_info);
+ nvdev = rndis_filter_device_add(hdev, &device_info);
+ if (IS_ERR(nvdev)) {
+ ret = PTR_ERR(nvdev);
+
+ /* Attempt rollback to original MTU */
+ ndev->mtu = orig_mtu;
+ nvdev = rndis_filter_device_add(hdev, &device_info);
+
+ if (vf_netdev)
+ dev_set_mtu(vf_netdev, orig_mtu);
+
+ if (IS_ERR(nvdev)) {
+ netdev_err(ndev, "restoring mtu failed: %ld\n",
+ PTR_ERR(nvdev));
+ return ret;
+ }
+ }
+
+ if (was_opened)
+ rndis_filter_open(nvdev);
- if (was_running)
- ret = netvsc_open(ndev);
+ netif_device_attach(ndev);
/* We may have missed link change notifications */
schedule_delayed_work(&ndevctx->dwork, 0);
return ret;
}
+static void netvsc_get_vf_stats(struct net_device *net,
+ struct netvsc_vf_pcpu_stats *tot)
+{
+ struct net_device_context *ndev_ctx = netdev_priv(net);
+ int i;
+
+ memset(tot, 0, sizeof(*tot));
+
+ for_each_possible_cpu(i) {
+ const struct netvsc_vf_pcpu_stats *stats
+ = per_cpu_ptr(ndev_ctx->vf_stats, i);
+ u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
+ unsigned int start;
+
+ do {
+ start = u64_stats_fetch_begin_irq(&stats->syncp);
+ rx_packets = stats->rx_packets;
+ tx_packets = stats->tx_packets;
+ rx_bytes = stats->rx_bytes;
+ tx_bytes = stats->tx_bytes;
+ } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
+
+ tot->rx_packets += rx_packets;
+ tot->tx_packets += tx_packets;
+ tot->rx_bytes += rx_bytes;
+ tot->tx_bytes += tx_bytes;
+ tot->tx_dropped += stats->tx_dropped;
+ }
+}
+
static void netvsc_get_stats64(struct net_device *net,
struct rtnl_link_stats64 *t)
{
struct net_device_context *ndev_ctx = netdev_priv(net);
struct netvsc_device *nvdev = rcu_dereference_rtnl(ndev_ctx->nvdev);
+ struct netvsc_vf_pcpu_stats vf_tot;
int i;
if (!nvdev)
return;
+ netdev_stats_to_stats64(t, &net->stats);
+
+ netvsc_get_vf_stats(net, &vf_tot);
+ t->rx_packets += vf_tot.rx_packets;
+ t->tx_packets += vf_tot.tx_packets;
+ t->rx_bytes += vf_tot.rx_bytes;
+ t->tx_bytes += vf_tot.tx_bytes;
+ t->tx_dropped += vf_tot.tx_dropped;
+
for (i = 0; i < nvdev->num_chn; i++) {
const struct netvsc_channel *nvchan = &nvdev->chan_table[i];
const struct netvsc_stats *stats;
t->rx_packets += packets;
t->multicast += multicast;
}
-
- t->tx_dropped = net->stats.tx_dropped;
- t->tx_errors = net->stats.tx_errors;
-
- t->rx_dropped = net->stats.rx_dropped;
- t->rx_errors = net->stats.rx_errors;
}
static int netvsc_set_mac_addr(struct net_device *ndev, void *p)
{
+ struct net_device_context *ndc = netdev_priv(ndev);
+ struct net_device *vf_netdev = rtnl_dereference(ndc->vf_netdev);
+ struct netvsc_device *nvdev = rtnl_dereference(ndc->nvdev);
struct sockaddr *addr = p;
- char save_adr[ETH_ALEN];
- unsigned char save_aatype;
int err;
- memcpy(save_adr, ndev->dev_addr, ETH_ALEN);
- save_aatype = ndev->addr_assign_type;
-
- err = eth_mac_addr(ndev, p);
- if (err != 0)
+ err = eth_prepare_mac_addr_change(ndev, p);
+ if (err)
return err;
- err = rndis_filter_set_device_mac(ndev, addr->sa_data);
- if (err != 0) {
- /* roll back to saved MAC */
- memcpy(ndev->dev_addr, save_adr, ETH_ALEN);
- ndev->addr_assign_type = save_aatype;
+ if (!nvdev)
+ return -ENODEV;
+
+ if (vf_netdev) {
+ err = dev_set_mac_address(vf_netdev, addr);
+ if (err)
+ return err;
+ }
+
+ err = rndis_filter_set_device_mac(nvdev, addr->sa_data);
+ if (!err) {
+ eth_commit_mac_addr_change(ndev, p);
+ } else if (vf_netdev) {
+ /* rollback change on VF */
+ memcpy(addr->sa_data, ndev->dev_addr, ETH_ALEN);
+ dev_set_mac_address(vf_netdev, addr);
}
return err;
{ "tx_no_space", offsetof(struct netvsc_ethtool_stats, tx_no_space) },
{ "tx_too_big", offsetof(struct netvsc_ethtool_stats, tx_too_big) },
{ "tx_busy", offsetof(struct netvsc_ethtool_stats, tx_busy) },
+ { "tx_send_full", offsetof(struct netvsc_ethtool_stats, tx_send_full) },
+ { "rx_comp_busy", offsetof(struct netvsc_ethtool_stats, rx_comp_busy) },
+}, vf_stats[] = {
+ { "vf_rx_packets", offsetof(struct netvsc_vf_pcpu_stats, rx_packets) },
+ { "vf_rx_bytes", offsetof(struct netvsc_vf_pcpu_stats, rx_bytes) },
+ { "vf_tx_packets", offsetof(struct netvsc_vf_pcpu_stats, tx_packets) },
+ { "vf_tx_bytes", offsetof(struct netvsc_vf_pcpu_stats, tx_bytes) },
+ { "vf_tx_dropped", offsetof(struct netvsc_vf_pcpu_stats, tx_dropped) },
};
#define NETVSC_GLOBAL_STATS_LEN ARRAY_SIZE(netvsc_stats)
+#define NETVSC_VF_STATS_LEN ARRAY_SIZE(vf_stats)
/* 4 statistics per queue (rx/tx packets/bytes) */
#define NETVSC_QUEUE_STATS_LEN(dev) ((dev)->num_chn * 4)
switch (string_set) {
case ETH_SS_STATS:
- return NETVSC_GLOBAL_STATS_LEN + NETVSC_QUEUE_STATS_LEN(nvdev);
+ return NETVSC_GLOBAL_STATS_LEN
+ + NETVSC_VF_STATS_LEN
+ + NETVSC_QUEUE_STATS_LEN(nvdev);
default:
return -EINVAL;
}
struct ethtool_stats *stats, u64 *data)
{
struct net_device_context *ndc = netdev_priv(dev);
- struct netvsc_device *nvdev = rcu_dereference(ndc->nvdev);
+ struct netvsc_device *nvdev = rtnl_dereference(ndc->nvdev);
const void *nds = &ndc->eth_stats;
const struct netvsc_stats *qstats;
+ struct netvsc_vf_pcpu_stats sum;
unsigned int start;
u64 packets, bytes;
int i, j;
for (i = 0; i < NETVSC_GLOBAL_STATS_LEN; i++)
data[i] = *(unsigned long *)(nds + netvsc_stats[i].offset);
+ netvsc_get_vf_stats(dev, &sum);
+ for (j = 0; j < NETVSC_VF_STATS_LEN; j++)
+ data[i++] = *(u64 *)((void *)&sum + vf_stats[j].offset);
+
for (j = 0; j < nvdev->num_chn; j++) {
qstats = &nvdev->chan_table[j].tx_stats;
static void netvsc_get_strings(struct net_device *dev, u32 stringset, u8 *data)
{
struct net_device_context *ndc = netdev_priv(dev);
- struct netvsc_device *nvdev = rcu_dereference(ndc->nvdev);
+ struct netvsc_device *nvdev = rtnl_dereference(ndc->nvdev);
u8 *p = data;
int i;
switch (stringset) {
case ETH_SS_STATS:
- for (i = 0; i < ARRAY_SIZE(netvsc_stats); i++)
- memcpy(p + i * ETH_GSTRING_LEN,
- netvsc_stats[i].name, ETH_GSTRING_LEN);
+ for (i = 0; i < ARRAY_SIZE(netvsc_stats); i++) {
+ memcpy(p, netvsc_stats[i].name, ETH_GSTRING_LEN);
+ p += ETH_GSTRING_LEN;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(vf_stats); i++) {
+ memcpy(p, vf_stats[i].name, ETH_GSTRING_LEN);
+ p += ETH_GSTRING_LEN;
+ }
- p += i * ETH_GSTRING_LEN;
for (i = 0; i < nvdev->num_chn; i++) {
sprintf(p, "tx_queue_%u_packets", i);
p += ETH_GSTRING_LEN;
}
static int
-netvsc_get_rss_hash_opts(struct netvsc_device *nvdev,
+netvsc_get_rss_hash_opts(struct net_device_context *ndc,
struct ethtool_rxnfc *info)
{
info->data = RXH_IP_SRC | RXH_IP_DST;
case TCP_V4_FLOW:
case TCP_V6_FLOW:
info->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3;
- /* fallthrough */
+ break;
+
case UDP_V4_FLOW:
+ if (ndc->udp4_l4_hash)
+ info->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3;
+
+ break;
+
case UDP_V6_FLOW:
+ if (ndc->udp6_l4_hash)
+ info->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3;
+
+ break;
+
case IPV4_FLOW:
case IPV6_FLOW:
break;
u32 *rules)
{
struct net_device_context *ndc = netdev_priv(dev);
- struct netvsc_device *nvdev = rcu_dereference(ndc->nvdev);
+ struct netvsc_device *nvdev = rtnl_dereference(ndc->nvdev);
if (!nvdev)
return -ENODEV;
return 0;
case ETHTOOL_GRXFH:
- return netvsc_get_rss_hash_opts(nvdev, info);
+ return netvsc_get_rss_hash_opts(ndc, info);
+ }
+ return -EOPNOTSUPP;
+}
+
+static int netvsc_set_rss_hash_opts(struct net_device_context *ndc,
+ struct ethtool_rxnfc *info)
+{
+ if (info->data == (RXH_IP_SRC | RXH_IP_DST |
+ RXH_L4_B_0_1 | RXH_L4_B_2_3)) {
+ if (info->flow_type == UDP_V4_FLOW)
+ ndc->udp4_l4_hash = true;
+ else if (info->flow_type == UDP_V6_FLOW)
+ ndc->udp6_l4_hash = true;
+ else
+ return -EOPNOTSUPP;
+
+ return 0;
+ }
+
+ if (info->data == (RXH_IP_SRC | RXH_IP_DST)) {
+ if (info->flow_type == UDP_V4_FLOW)
+ ndc->udp4_l4_hash = false;
+ else if (info->flow_type == UDP_V6_FLOW)
+ ndc->udp6_l4_hash = false;
+ else
+ return -EOPNOTSUPP;
+
+ return 0;
}
+
+ return -EOPNOTSUPP;
+}
+
+static int
+netvsc_set_rxnfc(struct net_device *ndev, struct ethtool_rxnfc *info)
+{
+ struct net_device_context *ndc = netdev_priv(ndev);
+
+ if (info->cmd == ETHTOOL_SRXFH)
+ return netvsc_set_rss_hash_opts(ndc, info);
+
return -EOPNOTSUPP;
}
u8 *hfunc)
{
struct net_device_context *ndc = netdev_priv(dev);
- struct netvsc_device *ndev = rcu_dereference(ndc->nvdev);
+ struct netvsc_device *ndev = rtnl_dereference(ndc->nvdev);
struct rndis_device *rndis_dev;
int i;
return rndis_filter_set_rss_param(rndis_dev, key, ndev->num_chn);
}
+/* Hyper-V RNDIS protocol does not have ring in the HW sense.
+ * It does have pre-allocated receive area which is divided into sections.
+ */
+static void __netvsc_get_ringparam(struct netvsc_device *nvdev,
+ struct ethtool_ringparam *ring)
+{
+ u32 max_buf_size;
+
+ ring->rx_pending = nvdev->recv_section_cnt;
+ ring->tx_pending = nvdev->send_section_cnt;
+
+ if (nvdev->nvsp_version <= NVSP_PROTOCOL_VERSION_2)
+ max_buf_size = NETVSC_RECEIVE_BUFFER_SIZE_LEGACY;
+ else
+ max_buf_size = NETVSC_RECEIVE_BUFFER_SIZE;
+
+ ring->rx_max_pending = max_buf_size / nvdev->recv_section_size;
+ ring->tx_max_pending = NETVSC_SEND_BUFFER_SIZE
+ / nvdev->send_section_size;
+}
+
+static void netvsc_get_ringparam(struct net_device *ndev,
+ struct ethtool_ringparam *ring)
+{
+ struct net_device_context *ndevctx = netdev_priv(ndev);
+ struct netvsc_device *nvdev = rtnl_dereference(ndevctx->nvdev);
+
+ if (!nvdev)
+ return;
+
+ __netvsc_get_ringparam(nvdev, ring);
+}
+
+static int netvsc_set_ringparam(struct net_device *ndev,
+ struct ethtool_ringparam *ring)
+{
+ struct net_device_context *ndevctx = netdev_priv(ndev);
+ struct netvsc_device *nvdev = rtnl_dereference(ndevctx->nvdev);
+ struct hv_device *hdev = ndevctx->device_ctx;
+ struct netvsc_device_info device_info;
+ struct ethtool_ringparam orig;
+ u32 new_tx, new_rx;
+ bool was_opened;
+ int ret = 0;
+
+ if (!nvdev || nvdev->destroy)
+ return -ENODEV;
+
+ memset(&orig, 0, sizeof(orig));
+ __netvsc_get_ringparam(nvdev, &orig);
+
+ new_tx = clamp_t(u32, ring->tx_pending,
+ NETVSC_MIN_TX_SECTIONS, orig.tx_max_pending);
+ new_rx = clamp_t(u32, ring->rx_pending,
+ NETVSC_MIN_RX_SECTIONS, orig.rx_max_pending);
+
+ if (new_tx == orig.tx_pending &&
+ new_rx == orig.rx_pending)
+ return 0; /* no change */
+
+ memset(&device_info, 0, sizeof(device_info));
+ device_info.num_chn = nvdev->num_chn;
+ device_info.ring_size = ring_size;
+ device_info.send_sections = new_tx;
+ device_info.recv_sections = new_rx;
+
+ netif_device_detach(ndev);
+ was_opened = rndis_filter_opened(nvdev);
+ if (was_opened)
+ rndis_filter_close(nvdev);
+
+ rndis_filter_device_remove(hdev, nvdev);
+
+ nvdev = rndis_filter_device_add(hdev, &device_info);
+ if (IS_ERR(nvdev)) {
+ ret = PTR_ERR(nvdev);
+
+ device_info.send_sections = orig.tx_pending;
+ device_info.recv_sections = orig.rx_pending;
+ nvdev = rndis_filter_device_add(hdev, &device_info);
+ if (IS_ERR(nvdev)) {
+ netdev_err(ndev, "restoring ringparam failed: %ld\n",
+ PTR_ERR(nvdev));
+ return ret;
+ }
+ }
+
+ if (was_opened)
+ rndis_filter_open(nvdev);
+ netif_device_attach(ndev);
+
+ /* We may have missed link change notifications */
+ ndevctx->last_reconfig = 0;
+ schedule_delayed_work(&ndevctx->dwork, 0);
+
+ return ret;
+}
+
static const struct ethtool_ops ethtool_ops = {
.get_drvinfo = netvsc_get_drvinfo,
.get_link = ethtool_op_get_link,
.set_channels = netvsc_set_channels,
.get_ts_info = ethtool_op_get_ts_info,
.get_rxnfc = netvsc_get_rxnfc,
+ .set_rxnfc = netvsc_set_rxnfc,
.get_rxfh_key_size = netvsc_get_rxfh_key_size,
.get_rxfh_indir_size = netvsc_rss_indir_size,
.get_rxfh = netvsc_get_rxfh,
.set_rxfh = netvsc_set_rxfh,
.get_link_ksettings = netvsc_get_link_ksettings,
.set_link_ksettings = netvsc_set_link_ksettings,
+ .get_ringparam = netvsc_get_ringparam,
+ .set_ringparam = netvsc_set_ringparam,
};
static const struct net_device_ops device_ops = {
bool notify = false, reschedule = false;
unsigned long flags, next_reconfig, delay;
- rtnl_lock();
+ /* if changes are happening, comeback later */
+ if (!rtnl_trylock()) {
+ schedule_delayed_work(&ndev_ctx->dwork, LINKCHANGE_INT);
+ return;
+ }
+
net_device = rtnl_dereference(ndev_ctx->nvdev);
if (!net_device)
goto out_unlock;
case RNDIS_STATUS_MEDIA_CONNECT:
if (rdev->link_state) {
rdev->link_state = false;
- if (!ndev_ctx->datapath)
- netif_carrier_on(net);
+ netif_carrier_on(net);
netif_tx_wake_all_queues(net);
} else {
notify = true;
continue; /* not a netvsc device */
net_device_ctx = netdev_priv(dev);
- if (net_device_ctx->nvdev == NULL)
+ if (!rtnl_dereference(net_device_ctx->nvdev))
continue; /* device is removed */
if (rtnl_dereference(net_device_ctx->vf_netdev) == vf_netdev)
return NULL;
}
+/* Called when VF is injecting data into network stack.
+ * Change the associated network device from VF to netvsc.
+ * note: already called with rcu_read_lock
+ */
+static rx_handler_result_t netvsc_vf_handle_frame(struct sk_buff **pskb)
+{
+ struct sk_buff *skb = *pskb;
+ struct net_device *ndev = rcu_dereference(skb->dev->rx_handler_data);
+ struct net_device_context *ndev_ctx = netdev_priv(ndev);
+ struct netvsc_vf_pcpu_stats *pcpu_stats
+ = this_cpu_ptr(ndev_ctx->vf_stats);
+
+ skb->dev = ndev;
+
+ u64_stats_update_begin(&pcpu_stats->syncp);
+ pcpu_stats->rx_packets++;
+ pcpu_stats->rx_bytes += skb->len;
+ u64_stats_update_end(&pcpu_stats->syncp);
+
+ return RX_HANDLER_ANOTHER;
+}
+
+static int netvsc_vf_join(struct net_device *vf_netdev,
+ struct net_device *ndev)
+{
+ struct net_device_context *ndev_ctx = netdev_priv(ndev);
+ int ret;
+
+ ret = netdev_rx_handler_register(vf_netdev,
+ netvsc_vf_handle_frame, ndev);
+ if (ret != 0) {
+ netdev_err(vf_netdev,
+ "can not register netvsc VF receive handler (err = %d)\n",
+ ret);
+ goto rx_handler_failed;
+ }
+
+ ret = netdev_upper_dev_link(vf_netdev, ndev);
+ if (ret != 0) {
+ netdev_err(vf_netdev,
+ "can not set master device %s (err = %d)\n",
+ ndev->name, ret);
+ goto upper_link_failed;
+ }
+
+ /* set slave flag before open to prevent IPv6 addrconf */
+ vf_netdev->flags |= IFF_SLAVE;
+
+ schedule_delayed_work(&ndev_ctx->vf_takeover, VF_TAKEOVER_INT);
+
+ call_netdevice_notifiers(NETDEV_JOIN, vf_netdev);
+
+ netdev_info(vf_netdev, "joined to %s\n", ndev->name);
+ return 0;
+
+upper_link_failed:
+ netdev_rx_handler_unregister(vf_netdev);
+rx_handler_failed:
+ return ret;
+}
+
+static void __netvsc_vf_setup(struct net_device *ndev,
+ struct net_device *vf_netdev)
+{
+ int ret;
+
+ /* Align MTU of VF with master */
+ ret = dev_set_mtu(vf_netdev, ndev->mtu);
+ if (ret)
+ netdev_warn(vf_netdev,
+ "unable to change mtu to %u\n", ndev->mtu);
+
+ if (netif_running(ndev)) {
+ ret = dev_open(vf_netdev);
+ if (ret)
+ netdev_warn(vf_netdev,
+ "unable to open: %d\n", ret);
+ }
+}
+
+/* Setup VF as slave of the synthetic device.
+ * Runs in workqueue to avoid recursion in netlink callbacks.
+ */
+static void netvsc_vf_setup(struct work_struct *w)
+{
+ struct net_device_context *ndev_ctx
+ = container_of(w, struct net_device_context, vf_takeover.work);
+ struct net_device *ndev = hv_get_drvdata(ndev_ctx->device_ctx);
+ struct net_device *vf_netdev;
+
+ if (!rtnl_trylock()) {
+ schedule_delayed_work(&ndev_ctx->vf_takeover, 0);
+ return;
+ }
+
+ vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
+ if (vf_netdev)
+ __netvsc_vf_setup(ndev, vf_netdev);
+
+ rtnl_unlock();
+}
+
static int netvsc_register_vf(struct net_device *vf_netdev)
{
struct net_device *ndev;
if (!netvsc_dev || rtnl_dereference(net_device_ctx->vf_netdev))
return NOTIFY_DONE;
+ if (netvsc_vf_join(vf_netdev, ndev) != 0)
+ return NOTIFY_DONE;
+
netdev_info(ndev, "VF registering: %s\n", vf_netdev->name);
- /*
- * Take a reference on the module.
- */
+
+ /* Prevent this module from being unloaded while VF is registered */
try_module_get(THIS_MODULE);
dev_hold(vf_netdev);
static int netvsc_vf_up(struct net_device *vf_netdev)
{
- struct net_device *ndev;
- struct netvsc_device *netvsc_dev;
struct net_device_context *net_device_ctx;
+ struct netvsc_device *netvsc_dev;
+ struct net_device *ndev;
ndev = get_netvsc_byref(vf_netdev);
if (!ndev)
net_device_ctx = netdev_priv(ndev);
netvsc_dev = rtnl_dereference(net_device_ctx->nvdev);
+ if (!netvsc_dev)
+ return NOTIFY_DONE;
- netdev_info(ndev, "VF up: %s\n", vf_netdev->name);
-
- /*
- * Open the device before switching data path.
- */
+ /* Bump refcount when datapath is acvive - Why? */
rndis_filter_open(netvsc_dev);
- /*
- * notify the host to switch the data path.
- */
+ /* notify the host to switch the data path. */
netvsc_switch_datapath(ndev, true);
netdev_info(ndev, "Data path switched to VF: %s\n", vf_netdev->name);
- netif_carrier_off(ndev);
-
- /* Now notify peers through VF device. */
- call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, vf_netdev);
-
return NOTIFY_OK;
}
static int netvsc_vf_down(struct net_device *vf_netdev)
{
- struct net_device *ndev;
- struct netvsc_device *netvsc_dev;
struct net_device_context *net_device_ctx;
+ struct netvsc_device *netvsc_dev;
+ struct net_device *ndev;
ndev = get_netvsc_byref(vf_netdev);
if (!ndev)
net_device_ctx = netdev_priv(ndev);
netvsc_dev = rtnl_dereference(net_device_ctx->nvdev);
+ if (!netvsc_dev)
+ return NOTIFY_DONE;
- netdev_info(ndev, "VF down: %s\n", vf_netdev->name);
netvsc_switch_datapath(ndev, false);
netdev_info(ndev, "Data path switched from VF: %s\n", vf_netdev->name);
rndis_filter_close(netvsc_dev);
- netif_carrier_on(ndev);
-
- /* Now notify peers through netvsc device. */
- call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, ndev);
return NOTIFY_OK;
}
return NOTIFY_DONE;
net_device_ctx = netdev_priv(ndev);
+ cancel_delayed_work_sync(&net_device_ctx->vf_takeover);
netdev_info(ndev, "VF unregistering: %s\n", vf_netdev->name);
+ netdev_upper_dev_unlink(vf_netdev, ndev);
RCU_INIT_POINTER(net_device_ctx->vf_netdev, NULL);
dev_put(vf_netdev);
module_put(THIS_MODULE);
struct net_device_context *net_device_ctx;
struct netvsc_device_info device_info;
struct netvsc_device *nvdev;
- int ret;
+ int ret = -ENOMEM;
net = alloc_etherdev_mq(sizeof(struct net_device_context),
VRSS_CHANNEL_MAX);
if (!net)
- return -ENOMEM;
+ goto no_net;
netif_carrier_off(net);
spin_lock_init(&net_device_ctx->lock);
INIT_LIST_HEAD(&net_device_ctx->reconfig_events);
+ INIT_DELAYED_WORK(&net_device_ctx->vf_takeover, netvsc_vf_setup);
+
+ net_device_ctx->vf_stats
+ = netdev_alloc_pcpu_stats(struct netvsc_vf_pcpu_stats);
+ if (!net_device_ctx->vf_stats)
+ goto no_stats;
net->netdev_ops = &device_ops;
net->ethtool_ops = ðtool_ops;
memset(&device_info, 0, sizeof(device_info));
device_info.ring_size = ring_size;
device_info.num_chn = VRSS_CHANNEL_DEFAULT;
- ret = rndis_filter_device_add(dev, &device_info);
- if (ret != 0) {
+ device_info.send_sections = NETVSC_DEFAULT_TX;
+ device_info.recv_sections = NETVSC_DEFAULT_RX;
+
+ nvdev = rndis_filter_device_add(dev, &device_info);
+ if (IS_ERR(nvdev)) {
+ ret = PTR_ERR(nvdev);
netdev_err(net, "unable to add netvsc device (ret %d)\n", ret);
- free_netdev(net);
- hv_set_drvdata(dev, NULL);
- return ret;
+ goto rndis_failed;
}
+
memcpy(net->dev_addr, device_info.mac_adr, ETH_ALEN);
/* hw_features computed in rndis_filter_device_add */
NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX;
net->vlan_features = net->features;
- /* RCU not necessary here, device not registered */
- nvdev = net_device_ctx->nvdev;
netif_set_real_num_tx_queues(net, nvdev->num_chn);
netif_set_real_num_rx_queues(net, nvdev->num_chn);
+ netdev_lockdep_set_classes(net);
+
/* MTU range: 68 - 1500 or 65521 */
net->min_mtu = NETVSC_MTU_MIN;
if (nvdev->nvsp_version >= NVSP_PROTOCOL_VERSION_2)
ret = register_netdev(net);
if (ret != 0) {
pr_err("Unable to register netdev.\n");
- rndis_filter_device_remove(dev, nvdev);
- free_netdev(net);
+ goto register_failed;
}
return ret;
+
+register_failed:
+ rndis_filter_device_remove(dev, nvdev);
+rndis_failed:
+ free_percpu(net_device_ctx->vf_stats);
+no_stats:
+ hv_set_drvdata(dev, NULL);
+ free_netdev(net);
+no_net:
+ return ret;
}
static int netvsc_remove(struct hv_device *dev)
* removed. Also blocks mtu and channel changes.
*/
rtnl_lock();
- rndis_filter_device_remove(dev, ndev_ctx->nvdev);
+ rndis_filter_device_remove(dev,
+ rtnl_dereference(ndev_ctx->nvdev));
rtnl_unlock();
unregister_netdev(net);
hv_set_drvdata(dev, NULL);
+ free_percpu(ndev_ctx->vf_stats);
free_netdev(net);
return 0;
}
#include <linux/ethtool.h>
#include <linux/phy.h>
#include <linux/phy_led_triggers.h>
-#include <linux/timer.h>
#include <linux/workqueue.h>
#include <linux/mdio.h>
#include <linux/io.h>
#include <asm/irq.h>
-static const char *phy_speed_to_str(int speed)
-{
- switch (speed) {
- case SPEED_10:
- return "10Mbps";
- case SPEED_100:
- return "100Mbps";
- case SPEED_1000:
- return "1Gbps";
- case SPEED_2500:
- return "2.5Gbps";
- case SPEED_5000:
- return "5Gbps";
- case SPEED_10000:
- return "10Gbps";
- case SPEED_14000:
- return "14Gbps";
- case SPEED_20000:
- return "20Gbps";
- case SPEED_25000:
- return "25Gbps";
- case SPEED_40000:
- return "40Gbps";
- case SPEED_50000:
- return "50Gbps";
- case SPEED_56000:
- return "56Gbps";
- case SPEED_100000:
- return "100Gbps";
- case SPEED_UNKNOWN:
- return "Unknown";
- default:
- return "Unsupported (update phy.c)";
- }
-}
-
#define PHY_STATE_STR(_state) \
case PHY_##_state: \
return __stringify(_state); \
netdev_info(phydev->attached_dev,
"Link is Up - %s/%s - flow control %s\n",
phy_speed_to_str(phydev->speed),
- DUPLEX_FULL == phydev->duplex ? "Full" : "Half",
+ phy_duplex_to_str(phydev->duplex),
phydev->pause ? "rx/tx" : "off");
} else {
netdev_info(phydev->attached_dev, "Link is Down\n");
}
EXPORT_SYMBOL(phy_aneg_done);
-/* A structure for mapping a particular speed and duplex
- * combination to a particular SUPPORTED and ADVERTISED value
- */
-struct phy_setting {
- int speed;
- int duplex;
- u32 setting;
-};
-
-/* A mapping of all SUPPORTED settings to speed/duplex. This table
- * must be grouped by speed and sorted in descending match priority
- * - iow, descending speed. */
-static const struct phy_setting settings[] = {
- {
- .speed = SPEED_10000,
- .duplex = DUPLEX_FULL,
- .setting = SUPPORTED_10000baseKR_Full,
- },
- {
- .speed = SPEED_10000,
- .duplex = DUPLEX_FULL,
- .setting = SUPPORTED_10000baseKX4_Full,
- },
- {
- .speed = SPEED_10000,
- .duplex = DUPLEX_FULL,
- .setting = SUPPORTED_10000baseT_Full,
- },
- {
- .speed = SPEED_2500,
- .duplex = DUPLEX_FULL,
- .setting = SUPPORTED_2500baseX_Full,
- },
- {
- .speed = SPEED_1000,
- .duplex = DUPLEX_FULL,
- .setting = SUPPORTED_1000baseKX_Full,
- },
- {
- .speed = SPEED_1000,
- .duplex = DUPLEX_FULL,
- .setting = SUPPORTED_1000baseT_Full,
- },
- {
- .speed = SPEED_1000,
- .duplex = DUPLEX_HALF,
- .setting = SUPPORTED_1000baseT_Half,
- },
- {
- .speed = SPEED_100,
- .duplex = DUPLEX_FULL,
- .setting = SUPPORTED_100baseT_Full,
- },
- {
- .speed = SPEED_100,
- .duplex = DUPLEX_HALF,
- .setting = SUPPORTED_100baseT_Half,
- },
- {
- .speed = SPEED_10,
- .duplex = DUPLEX_FULL,
- .setting = SUPPORTED_10baseT_Full,
- },
- {
- .speed = SPEED_10,
- .duplex = DUPLEX_HALF,
- .setting = SUPPORTED_10baseT_Half,
- },
-};
-
-/**
- * phy_lookup_setting - lookup a PHY setting
- * @speed: speed to match
- * @duplex: duplex to match
- * @features: allowed link modes
- * @exact: an exact match is required
- *
- * Search the settings array for a setting that matches the speed and
- * duplex, and which is supported.
- *
- * If @exact is unset, either an exact match or %NULL for no match will
- * be returned.
- *
- * If @exact is set, an exact match, the fastest supported setting at
- * or below the specified speed, the slowest supported setting, or if
- * they all fail, %NULL will be returned.
- */
-static const struct phy_setting *
-phy_lookup_setting(int speed, int duplex, u32 features, bool exact)
-{
- const struct phy_setting *p, *match = NULL, *last = NULL;
- int i;
-
- for (i = 0, p = settings; i < ARRAY_SIZE(settings); i++, p++) {
- if (p->setting & features) {
- last = p;
- if (p->speed == speed && p->duplex == duplex) {
- /* Exact match for speed and duplex */
- match = p;
- break;
- } else if (!exact) {
- if (!match && p->speed <= speed)
- /* Candidate */
- match = p;
-
- if (p->speed < speed)
- break;
- }
- }
- }
-
- if (!match && !exact)
- match = last;
-
- return match;
-}
-
/**
* phy_find_valid - find a PHY setting that matches the requested parameters
* @speed: desired speed
static const struct phy_setting *
phy_find_valid(int speed, int duplex, u32 supported)
{
- return phy_lookup_setting(speed, duplex, supported, false);
+ unsigned long mask = supported;
+
+ return phy_lookup_setting(speed, duplex, &mask, BITS_PER_LONG, false);
}
/**
unsigned int *speeds,
unsigned int size)
{
- unsigned int count = 0;
- unsigned int idx = 0;
+ unsigned long supported = phy->supported;
- for (idx = 0; idx < ARRAY_SIZE(settings) && count < size; idx++)
- /* Assumes settings are grouped by speed */
- if ((settings[idx].setting & phy->supported) &&
- (count == 0 || speeds[count - 1] != settings[idx].speed))
- speeds[count++] = settings[idx].speed;
-
- return count;
+ return phy_speeds(speeds, size, &supported, BITS_PER_LONG);
}
/**
*/
static inline bool phy_check_valid(int speed, int duplex, u32 features)
{
- return !!phy_lookup_setting(speed, duplex, features, true);
+ unsigned long mask = features;
+
+ return !!phy_lookup_setting(speed, duplex, &mask, BITS_PER_LONG, true);
}
/**
*
* Description: The PHY infrastructure can run a state machine
* which tracks whether the PHY is starting up, negotiating,
- * etc. This function starts the timer which tracks the state
- * of the PHY. If you want to maintain your own state machine,
+ * etc. This function starts the delayed workqueue which tracks
+ * the state of the PHY. If you want to maintain your own state machine,
* do not call this function.
*/
void phy_start_machine(struct phy_device *phydev)
{
queue_delayed_work(system_power_efficient_wq, &phydev->state_queue, HZ);
}
+EXPORT_SYMBOL_GPL(phy_start_machine);
/**
* phy_trigger_machine - trigger the state machine to run
* phy_stop_machine - stop the PHY state machine tracking
* @phydev: target phy_device struct
*
- * Description: Stops the state machine timer, sets the state to UP
- * (unless it wasn't up yet). This function must be called BEFORE
- * phy_detach.
+ * Description: Stops the state machine delayed workqueue, sets the
+ * state to UP (unless it wasn't up yet). This function must be
+ * called BEFORE phy_detach.
*/
void phy_stop_machine(struct phy_device *phydev)
{
if (phydev->state > PHY_UP && phydev->state != PHY_HALTED)
phydev->state = PHY_UP;
mutex_unlock(&phydev->lock);
-
- /* Now we can run the state machine synchronously */
- phy_state_machine(&phydev->state_queue.work);
}
/**
}
EXPORT_SYMBOL(phy_start);
-static void phy_adjust_link(struct phy_device *phydev)
+static void phy_link_up(struct phy_device *phydev)
{
- phydev->adjust_link(phydev->attached_dev);
+ phydev->phy_link_change(phydev, true, true);
+ phy_led_trigger_change_speed(phydev);
+}
+
+static void phy_link_down(struct phy_device *phydev, bool do_carrier)
+{
+ phydev->phy_link_change(phydev, false, do_carrier);
phy_led_trigger_change_speed(phydev);
}
/* If the link is down, give up on negotiation for now */
if (!phydev->link) {
phydev->state = PHY_NOLINK;
- netif_carrier_off(phydev->attached_dev);
- phy_adjust_link(phydev);
+ phy_link_down(phydev, true);
break;
}
/* If AN is done, we're running */
if (err > 0) {
phydev->state = PHY_RUNNING;
- netif_carrier_on(phydev->attached_dev);
- phy_adjust_link(phydev);
-
+ phy_link_up(phydev);
} else if (0 == phydev->link_timeout--)
needs_aneg = true;
break;
}
}
phydev->state = PHY_RUNNING;
- netif_carrier_on(phydev->attached_dev);
- phy_adjust_link(phydev);
+ phy_link_up(phydev);
}
break;
case PHY_FORCING:
if (phydev->link) {
phydev->state = PHY_RUNNING;
- netif_carrier_on(phydev->attached_dev);
+ phy_link_up(phydev);
} else {
if (0 == phydev->link_timeout--)
needs_aneg = true;
+ phy_link_down(phydev, false);
}
-
- phy_adjust_link(phydev);
break;
case PHY_RUNNING:
/* Only register a CHANGE if we are polling and link changed
if (phydev->link) {
phydev->state = PHY_RUNNING;
- netif_carrier_on(phydev->attached_dev);
+ phy_link_up(phydev);
} else {
phydev->state = PHY_NOLINK;
- netif_carrier_off(phydev->attached_dev);
+ phy_link_down(phydev, true);
}
- phy_adjust_link(phydev);
-
if (phy_interrupt_is_valid(phydev))
err = phy_config_interrupt(phydev,
PHY_INTERRUPT_ENABLED);
case PHY_HALTED:
if (phydev->link) {
phydev->link = 0;
- netif_carrier_off(phydev->attached_dev);
- phy_adjust_link(phydev);
+ phy_link_down(phydev, true);
do_suspend = true;
}
break;
if (phydev->link) {
phydev->state = PHY_RUNNING;
- netif_carrier_on(phydev->attached_dev);
+ phy_link_up(phydev);
} else {
phydev->state = PHY_NOLINK;
+ phy_link_down(phydev, false);
}
- phy_adjust_link(phydev);
} else {
phydev->state = PHY_AN;
phydev->link_timeout = PHY_AN_TIMEOUT;
if (phydev->link) {
phydev->state = PHY_RUNNING;
- netif_carrier_on(phydev->attached_dev);
+ phy_link_up(phydev);
} else {
phydev->state = PHY_NOLINK;
+ phy_link_down(phydev, false);
}
- phy_adjust_link(phydev);
}
break;
}
if (err < 0)
phy_error(phydev);
- phydev_dbg(phydev, "PHY state change %s -> %s\n",
- phy_state_to_str(old_state),
- phy_state_to_str(phydev->state));
+ if (old_state != phydev->state)
+ phydev_dbg(phydev, "PHY state change %s -> %s\n",
+ phy_state_to_str(old_state),
+ phy_state_to_str(phydev->state));
/* Only re-schedule a PHY state machine change if we are polling the
* PHY, if PHY_IGNORE_INTERRUPT is set, then we will be moving
}
EXPORT_SYMBOL(phy_find_first);
+static void phy_link_change(struct phy_device *phydev, bool up, bool do_carrier)
+{
+ struct net_device *netdev = phydev->attached_dev;
+
+ if (do_carrier) {
+ if (up)
+ netif_carrier_on(netdev);
+ else
+ netif_carrier_off(netdev);
+ }
+ phydev->adjust_link(netdev);
+}
+
/**
* phy_prepare_link - prepares the PHY layer to monitor link status
* @phydev: target phy_device struct
#define ATTACHED_FMT "attached PHY driver [%s] (mii_bus:phy_addr=%s, irq=%d)"
void phy_attached_print(struct phy_device *phydev, const char *fmt, ...)
{
+ const char *drv_name = phydev->drv ? phydev->drv->name : "unbound";
+
if (!fmt) {
dev_info(&phydev->mdio.dev, ATTACHED_FMT "\n",
- phydev->drv->name, phydev_name(phydev),
+ drv_name, phydev_name(phydev),
phydev->irq);
} else {
va_list ap;
dev_info(&phydev->mdio.dev, ATTACHED_FMT,
- phydev->drv->name, phydev_name(phydev),
+ drv_name, phydev_name(phydev),
phydev->irq);
va_start(ap, fmt);
goto error;
}
+ phydev->phy_link_change = phy_link_change;
phydev->attached_dev = dev;
dev->phydev = phydev;
phydev->attached_dev->phydev = NULL;
phydev->attached_dev = NULL;
phy_suspend(phydev);
+ phydev->phylink = NULL;
phy_led_triggers_unregister(phydev);
NULL,
};
-static struct attribute_group cdc_ncm_sysfs_attr_group = {
+static const struct attribute_group cdc_ncm_sysfs_attr_group = {
.name = "cdc_ncm",
.attrs = cdc_ncm_sysfs_attrs,
};
.driver_info = (unsigned long)&wwan_noarp_info,
},
+ /* u-blox TOBY-L4 */
+ { USB_DEVICE_AND_INTERFACE_INFO(0x1546, 0x1010,
+ USB_CLASS_COMM,
+ USB_CDC_SUBCLASS_NCM, USB_CDC_PROTO_NONE),
+ .driver_info = (unsigned long)&wwan_info,
+ },
+
/* Generic CDC-NCM devices */
{ USB_INTERFACE_INFO(USB_CLASS_COMM,
USB_CDC_SUBCLASS_NCM, USB_CDC_PROTO_NONE),
#define VIRTNET_DRIVER_VERSION "1.0.0"
+static const unsigned long guest_offloads[] = {
+ VIRTIO_NET_F_GUEST_TSO4,
+ VIRTIO_NET_F_GUEST_TSO6,
+ VIRTIO_NET_F_GUEST_ECN,
+ VIRTIO_NET_F_GUEST_UFO
+};
+
struct virtnet_stats {
struct u64_stats_sync tx_syncp;
struct u64_stats_sync rx_syncp;
u8 ctrl_promisc;
u8 ctrl_allmulti;
u16 ctrl_vid;
+ u64 ctrl_offloads;
/* Ethtool settings */
u8 duplex;
u32 speed;
+
+ unsigned long guest_offloads;
};
struct padded_vnet_hdr {
netif_wake_subqueue(vi->dev, vq2txq(vq));
}
+#define MRG_CTX_HEADER_SHIFT 22
+static void *mergeable_len_to_ctx(unsigned int truesize,
+ unsigned int headroom)
+{
+ return (void *)(unsigned long)((headroom << MRG_CTX_HEADER_SHIFT) | truesize);
+}
+
+static unsigned int mergeable_ctx_to_headroom(void *mrg_ctx)
+{
+ return (unsigned long)mrg_ctx >> MRG_CTX_HEADER_SHIFT;
+}
+
+static unsigned int mergeable_ctx_to_truesize(void *mrg_ctx)
+{
+ return (unsigned long)mrg_ctx & ((1 << MRG_CTX_HEADER_SHIFT) - 1);
+}
+
/* Called from bottom half context */
static struct sk_buff *page_to_skb(struct virtnet_info *vi,
struct receive_queue *rq,
hdr_len = vi->hdr_len;
if (vi->mergeable_rx_bufs)
- hdr_padded_len = sizeof *hdr;
+ hdr_padded_len = sizeof(*hdr);
else
hdr_padded_len = sizeof(struct padded_vnet_hdr);
return vi->xdp_queue_pairs ? VIRTIO_XDP_HEADROOM : 0;
}
+/* We copy the packet for XDP in the following cases:
+ *
+ * 1) Packet is scattered across multiple rx buffers.
+ * 2) Headroom space is insufficient.
+ *
+ * This is inefficient but it's a temporary condition that
+ * we hit right after XDP is enabled and until queue is refilled
+ * with large buffers with sufficient headroom - so it should affect
+ * at most queue size packets.
+ * Afterwards, the conditions to enable
+ * XDP should preclude the underlying device from sending packets
+ * across multiple buffers (num_buf > 1), and we make sure buffers
+ * have enough headroom.
+ */
+static struct page *xdp_linearize_page(struct receive_queue *rq,
+ u16 *num_buf,
+ struct page *p,
+ int offset,
+ int page_off,
+ unsigned int *len)
+{
+ struct page *page = alloc_page(GFP_ATOMIC);
+
+ if (!page)
+ return NULL;
+
+ memcpy(page_address(page) + page_off, page_address(p) + offset, *len);
+ page_off += *len;
+
+ while (--*num_buf) {
+ unsigned int buflen;
+ void *buf;
+ int off;
+
+ buf = virtqueue_get_buf(rq->vq, &buflen);
+ if (unlikely(!buf))
+ goto err_buf;
+
+ p = virt_to_head_page(buf);
+ off = buf - page_address(p);
+
+ /* guard against a misconfigured or uncooperative backend that
+ * is sending packet larger than the MTU.
+ */
+ if ((page_off + buflen) > PAGE_SIZE) {
+ put_page(p);
+ goto err_buf;
+ }
+
+ memcpy(page_address(page) + page_off,
+ page_address(p) + off, buflen);
+ page_off += buflen;
+ put_page(p);
+ }
+
+ /* Headroom does not contribute to packet length */
+ *len = page_off - VIRTIO_XDP_HEADROOM;
+ return page;
+err_buf:
+ __free_pages(page, 0);
+ return NULL;
+}
+
static struct sk_buff *receive_small(struct net_device *dev,
struct virtnet_info *vi,
struct receive_queue *rq,
- void *buf, unsigned int len)
+ void *buf, void *ctx,
+ unsigned int len)
{
struct sk_buff *skb;
struct bpf_prog *xdp_prog;
- unsigned int xdp_headroom = virtnet_get_headroom(vi);
+ unsigned int xdp_headroom = (unsigned long)ctx;
unsigned int header_offset = VIRTNET_RX_PAD + xdp_headroom;
unsigned int headroom = vi->hdr_len + header_offset;
unsigned int buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ struct page *page = virt_to_head_page(buf);
unsigned int delta = 0;
+ struct page *xdp_page;
len -= vi->hdr_len;
rcu_read_lock();
if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
goto err_xdp;
+ if (unlikely(xdp_headroom < virtnet_get_headroom(vi))) {
+ int offset = buf - page_address(page) + header_offset;
+ unsigned int tlen = len + vi->hdr_len;
+ u16 num_buf = 1;
+
+ xdp_headroom = virtnet_get_headroom(vi);
+ header_offset = VIRTNET_RX_PAD + xdp_headroom;
+ headroom = vi->hdr_len + header_offset;
+ buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
+ SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ xdp_page = xdp_linearize_page(rq, &num_buf, page,
+ offset, header_offset,
+ &tlen);
+ if (!xdp_page)
+ goto err_xdp;
+
+ buf = page_address(xdp_page);
+ put_page(page);
+ page = xdp_page;
+ }
+
xdp.data_hard_start = buf + VIRTNET_RX_PAD + vi->hdr_len;
xdp.data = xdp.data_hard_start + xdp_headroom;
xdp.data_end = xdp.data + len;
skb = build_skb(buf, buflen);
if (!skb) {
- put_page(virt_to_head_page(buf));
+ put_page(page);
goto err;
}
skb_reserve(skb, headroom - delta);
err_xdp:
rcu_read_unlock();
dev->stats.rx_dropped++;
- put_page(virt_to_head_page(buf));
+ put_page(page);
xdp_xmit:
return NULL;
}
return NULL;
}
-/* The conditions to enable XDP should preclude the underlying device from
- * sending packets across multiple buffers (num_buf > 1). However per spec
- * it does not appear to be illegal to do so but rather just against convention.
- * So in order to avoid making a system unresponsive the packets are pushed
- * into a page and the XDP program is run. This will be extremely slow and we
- * push a warning to the user to fix this as soon as possible. Fixing this may
- * require resolving the underlying hardware to determine why multiple buffers
- * are being received or simply loading the XDP program in the ingress stack
- * after the skb is built because there is no advantage to running it here
- * anymore.
- */
-static struct page *xdp_linearize_page(struct receive_queue *rq,
- u16 *num_buf,
- struct page *p,
- int offset,
- unsigned int *len)
-{
- struct page *page = alloc_page(GFP_ATOMIC);
- unsigned int page_off = VIRTIO_XDP_HEADROOM;
-
- if (!page)
- return NULL;
-
- memcpy(page_address(page) + page_off, page_address(p) + offset, *len);
- page_off += *len;
-
- while (--*num_buf) {
- unsigned int buflen;
- void *buf;
- int off;
-
- buf = virtqueue_get_buf(rq->vq, &buflen);
- if (unlikely(!buf))
- goto err_buf;
-
- p = virt_to_head_page(buf);
- off = buf - page_address(p);
-
- /* guard against a misconfigured or uncooperative backend that
- * is sending packet larger than the MTU.
- */
- if ((page_off + buflen) > PAGE_SIZE) {
- put_page(p);
- goto err_buf;
- }
-
- memcpy(page_address(page) + page_off,
- page_address(p) + off, buflen);
- page_off += buflen;
- put_page(p);
- }
-
- /* Headroom does not contribute to packet length */
- *len = page_off - VIRTIO_XDP_HEADROOM;
- return page;
-err_buf:
- __free_pages(page, 0);
- return NULL;
-}
-
static struct sk_buff *receive_mergeable(struct net_device *dev,
struct virtnet_info *vi,
struct receive_queue *rq,
struct sk_buff *head_skb, *curr_skb;
struct bpf_prog *xdp_prog;
unsigned int truesize;
+ unsigned int headroom = mergeable_ctx_to_headroom(ctx);
head_skb = NULL;
u32 act;
/* This happens when rx buffer size is underestimated */
- if (unlikely(num_buf > 1)) {
+ if (unlikely(num_buf > 1 ||
+ headroom < virtnet_get_headroom(vi))) {
/* linearize data for XDP */
xdp_page = xdp_linearize_page(rq, &num_buf,
- page, offset, &len);
+ page, offset,
+ VIRTIO_XDP_HEADROOM,
+ &len);
if (!xdp_page)
goto err_xdp;
offset = VIRTIO_XDP_HEADROOM;
}
rcu_read_unlock();
- if (unlikely(len > (unsigned long)ctx)) {
+ truesize = mergeable_ctx_to_truesize(ctx);
+ if (unlikely(len > truesize)) {
pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
dev->name, len, (unsigned long)ctx);
dev->stats.rx_length_errors++;
goto err_skb;
}
- truesize = (unsigned long)ctx;
+
head_skb = page_to_skb(vi, rq, page, offset, len, truesize);
curr_skb = head_skb;
}
page = virt_to_head_page(buf);
- if (unlikely(len > (unsigned long)ctx)) {
+
+ truesize = mergeable_ctx_to_truesize(ctx);
+ if (unlikely(len > truesize)) {
pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
dev->name, len, (unsigned long)ctx);
dev->stats.rx_length_errors++;
goto err_skb;
}
- truesize = (unsigned long)ctx;
num_skb_frags = skb_shinfo(curr_skb)->nr_frags;
if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
else if (vi->big_packets)
skb = receive_big(dev, vi, rq, buf, len);
else
- skb = receive_small(dev, vi, rq, buf, len);
+ skb = receive_small(dev, vi, rq, buf, ctx, len);
if (unlikely(!skb))
return 0;
return 0;
}
+/* Unlike mergeable buffers, all buffers are allocated to the
+ * same size, except for the headroom. For this reason we do
+ * not need to use mergeable_len_to_ctx here - it is enough
+ * to store the headroom as the context ignoring the truesize.
+ */
static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
gfp_t gfp)
{
struct page_frag *alloc_frag = &rq->alloc_frag;
char *buf;
unsigned int xdp_headroom = virtnet_get_headroom(vi);
+ void *ctx = (void *)(unsigned long)xdp_headroom;
int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
int err;
alloc_frag->offset += len;
sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
vi->hdr_len + GOOD_PACKET_LEN);
- err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, buf, gfp);
+ err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
if (err < 0)
put_page(virt_to_head_page(buf));
-
return err;
}
}
sg_init_one(rq->sg, buf, len);
- ctx = (void *)(unsigned long)len;
+ ctx = mergeable_len_to_ctx(len, headroom);
err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
if (err < 0)
put_page(virt_to_head_page(buf));
void *buf;
struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
- if (vi->mergeable_rx_bufs) {
+ if (!vi->big_packets || vi->mergeable_rx_bufs) {
void *ctx;
while (received < budget &&
bytes += skb->len;
packets++;
- dev_kfree_skb_any(skb);
+ dev_consume_skb_any(skb);
}
/* Avoid overhead when no packets have been processed
}
static int init_vqs(struct virtnet_info *vi);
-static void _remove_vq_common(struct virtnet_info *vi);
static int virtnet_restore_up(struct virtio_device *vdev)
{
return err;
}
-static int virtnet_reset(struct virtnet_info *vi, int curr_qp, int xdp_qp)
+static int virtnet_set_guest_offloads(struct virtnet_info *vi, u64 offloads)
{
- struct virtio_device *dev = vi->vdev;
- int ret;
+ struct scatterlist sg;
+ vi->ctrl_offloads = cpu_to_virtio64(vi->vdev, offloads);
- virtio_config_disable(dev);
- dev->failed = dev->config->get_status(dev) & VIRTIO_CONFIG_S_FAILED;
- virtnet_freeze_down(dev);
- _remove_vq_common(vi);
+ sg_init_one(&sg, &vi->ctrl_offloads, sizeof(vi->ctrl_offloads));
- virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE);
- virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER);
+ if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_GUEST_OFFLOADS,
+ VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET, &sg)) {
+ dev_warn(&vi->dev->dev, "Fail to set guest offload. \n");
+ return -EINVAL;
+ }
- ret = virtio_finalize_features(dev);
- if (ret)
- goto err;
+ return 0;
+}
- vi->xdp_queue_pairs = xdp_qp;
- ret = virtnet_restore_up(dev);
- if (ret)
- goto err;
- ret = _virtnet_set_queues(vi, curr_qp);
- if (ret)
- goto err;
+static int virtnet_clear_guest_offloads(struct virtnet_info *vi)
+{
+ u64 offloads = 0;
- virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
- virtio_config_enable(dev);
- return 0;
-err:
- virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED);
- return ret;
+ if (!vi->guest_offloads)
+ return 0;
+
+ if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_CSUM))
+ offloads = 1ULL << VIRTIO_NET_F_GUEST_CSUM;
+
+ return virtnet_set_guest_offloads(vi, offloads);
+}
+
+static int virtnet_restore_guest_offloads(struct virtnet_info *vi)
+{
+ u64 offloads = vi->guest_offloads;
+
+ if (!vi->guest_offloads)
+ return 0;
+ if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_CSUM))
+ offloads |= 1ULL << VIRTIO_NET_F_GUEST_CSUM;
+
+ return virtnet_set_guest_offloads(vi, offloads);
}
static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
u16 xdp_qp = 0, curr_qp;
int i, err;
- if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) ||
- virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) ||
- virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) ||
- virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO)) {
+ if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)
+ && (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) ||
+ virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) ||
+ virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) ||
+ virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO))) {
NL_SET_ERR_MSG_MOD(extack, "Can't set XDP while host is implementing LRO, disable LRO first");
return -EOPNOTSUPP;
}
return PTR_ERR(prog);
}
- /* Changing the headroom in buffers is a disruptive operation because
- * existing buffers must be flushed and reallocated. This will happen
- * when a xdp program is initially added or xdp is disabled by removing
- * the xdp program resulting in number of XDP queues changing.
- */
- if (vi->xdp_queue_pairs != xdp_qp) {
- err = virtnet_reset(vi, curr_qp + xdp_qp, xdp_qp);
- if (err) {
- dev_warn(&dev->dev, "XDP reset failure.\n");
- goto virtio_reset_err;
- }
- }
+ /* Make sure NAPI is not using any XDP TX queues for RX. */
+ for (i = 0; i < vi->max_queue_pairs; i++)
+ napi_disable(&vi->rq[i].napi);
netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp);
+ err = _virtnet_set_queues(vi, curr_qp + xdp_qp);
+ if (err)
+ goto err;
+ vi->xdp_queue_pairs = xdp_qp;
for (i = 0; i < vi->max_queue_pairs; i++) {
old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
rcu_assign_pointer(vi->rq[i].xdp_prog, prog);
+ if (i == 0) {
+ if (!old_prog)
+ virtnet_clear_guest_offloads(vi);
+ if (!prog)
+ virtnet_restore_guest_offloads(vi);
+ }
if (old_prog)
bpf_prog_put(old_prog);
+ virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
}
return 0;
-virtio_reset_err:
- /* On reset error do our best to unwind XDP changes inflight and return
- * error up to user space for resolution. The underlying reset hung on
- * us so not much we can do here.
- */
+err:
+ for (i = 0; i < vi->max_queue_pairs; i++)
+ virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
if (prog)
bpf_prog_sub(prog, vi->max_queue_pairs - 1);
return err;
names = kmalloc(total_vqs * sizeof(*names), GFP_KERNEL);
if (!names)
goto err_names;
- if (vi->mergeable_rx_bufs) {
+ if (!vi->big_packets || vi->mergeable_rx_bufs) {
ctx = kzalloc(total_vqs * sizeof(*ctx), GFP_KERNEL);
if (!ctx)
goto err_ctx;
#ifdef CONFIG_SYSFS
static ssize_t mergeable_rx_buffer_size_show(struct netdev_rx_queue *queue,
- struct rx_queue_attribute *attribute, char *buf)
+ char *buf)
{
struct virtnet_info *vi = netdev_priv(queue->dev);
unsigned int queue_index = get_netdev_rx_queue_index(queue);
dev->features |= NETIF_F_HW_CSUM | NETIF_F_SG;
if (virtio_has_feature(vdev, VIRTIO_NET_F_GSO)) {
- dev->hw_features |= NETIF_F_TSO | NETIF_F_UFO
+ dev->hw_features |= NETIF_F_TSO
| NETIF_F_TSO_ECN | NETIF_F_TSO6;
}
/* Individual feature bits: what can host handle? */
dev->hw_features |= NETIF_F_TSO6;
if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_ECN))
dev->hw_features |= NETIF_F_TSO_ECN;
- if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_UFO))
- dev->hw_features |= NETIF_F_UFO;
dev->features |= NETIF_F_GSO_ROBUST;
if (gso)
- dev->features |= dev->hw_features & (NETIF_F_ALL_TSO|NETIF_F_UFO);
+ dev->features |= dev->hw_features & NETIF_F_ALL_TSO;
/* (!csum && gso) case will be fixed by register_netdev() */
}
if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_CSUM))
netif_carrier_on(dev);
}
+ for (i = 0; i < ARRAY_SIZE(guest_offloads); i++)
+ if (virtio_has_feature(vi->vdev, guest_offloads[i]))
+ set_bit(guest_offloads[i], &vi->guest_offloads);
+
pr_debug("virtnet: registered device %s with %d RX and TX vq's\n",
dev->name, max_queue_pairs);
return err;
}
-static void _remove_vq_common(struct virtnet_info *vi)
-{
- vi->vdev->config->reset(vi->vdev);
- free_unused_bufs(vi);
- _free_receive_bufs(vi);
- free_receive_page_frags(vi);
- virtnet_del_vqs(vi);
-}
-
static void remove_vq_common(struct virtnet_info *vi)
{
vi->vdev->config->reset(vi->vdev);
free_netdev(vi->dev);
}
-#ifdef CONFIG_PM_SLEEP
-static int virtnet_freeze(struct virtio_device *vdev)
+static __maybe_unused int virtnet_freeze(struct virtio_device *vdev)
{
struct virtnet_info *vi = vdev->priv;
return 0;
}
-static int virtnet_restore(struct virtio_device *vdev)
+static __maybe_unused int virtnet_restore(struct virtio_device *vdev)
{
struct virtnet_info *vi = vdev->priv;
int err;
return 0;
}
-#endif
static struct virtio_device_id id_table[] = {
{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, \
VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \
VIRTIO_NET_F_CTRL_MAC_ADDR, \
- VIRTIO_NET_F_MTU
+ VIRTIO_NET_F_MTU, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS
static unsigned int features[] = {
VIRTNET_FEATURES,
usleep_range(5000, 6000);
}
+static inline u8 iwl_pcie_get_cmd_index(struct iwl_txq *q, u32 index)
+{
+ return index & (q->n_window - 1);
+}
+
static inline void *iwl_pcie_get_tfd(struct iwl_trans_pcie *trans_pcie,
struct iwl_txq *txq, int idx)
{
- return txq->tfds + trans_pcie->tfd_size * idx;
+ return txq->tfds + trans_pcie->tfd_size * iwl_pcie_get_cmd_index(txq,
+ idx);
}
static inline void iwl_enable_rfkill_int(struct iwl_trans *trans)
!(i < q->read_ptr && i >= q->write_ptr);
}
-static inline u8 get_cmd_index(struct iwl_txq *q, u32 index)
-{
- return index & (q->n_window - 1);
-}
-
static inline bool iwl_is_rfkill_set(struct iwl_trans *trans)
{
struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans);
void iwl_pcie_enable_rx_wake(struct iwl_trans *trans, bool enable);
+ void iwl_pcie_rx_allocator_work(struct work_struct *data);
+
/* common functions that are used by gen2 transport */
void iwl_pcie_apm_config(struct iwl_trans *trans);
int iwl_pcie_prepare_card_hw(struct iwl_trans *trans);
void iwl_pcie_synchronize_irqs(struct iwl_trans *trans);
-bool iwl_trans_check_hw_rf_kill(struct iwl_trans *trans);
+bool iwl_pcie_check_hw_rf_kill(struct iwl_trans *trans);
void iwl_trans_pcie_handle_stop_rfkill(struct iwl_trans *trans,
bool was_in_rfkill);
void iwl_pcie_txq_free_tfd(struct iwl_trans *trans, struct iwl_txq *txq);
struct iwl_dma_ptr *ptr, size_t size);
void iwl_pcie_free_dma_ptr(struct iwl_trans *trans, struct iwl_dma_ptr *ptr);
void iwl_pcie_apply_destination(struct iwl_trans *trans);
+void iwl_pcie_free_tso_page(struct iwl_trans_pcie *trans_pcie,
+ struct sk_buff *skb);
#ifdef CONFIG_INET
struct iwl_tso_hdr_page *get_page_hdr(struct iwl_trans *trans, size_t len);
#endif
rxq->free_count += RX_CLAIM_REQ_ALLOC;
}
- static void iwl_pcie_rx_allocator_work(struct work_struct *data)
+ void iwl_pcie_rx_allocator_work(struct work_struct *data)
{
struct iwl_rb_allocator *rba_p =
container_of(data, struct iwl_rb_allocator, rx_alloc);
return err;
}
def_rxq = trans_pcie->rxq;
- if (!rba->alloc_wq)
- rba->alloc_wq = alloc_workqueue("rb_allocator",
- WQ_HIGHPRI | WQ_UNBOUND, 1);
- INIT_WORK(&rba->rx_alloc, iwl_pcie_rx_allocator_work);
spin_lock(&rba->lock);
atomic_set(&rba->req_pending, 0);
}
cancel_work_sync(&rba->rx_alloc);
- if (rba->alloc_wq) {
- destroy_workqueue(rba->alloc_wq);
- rba->alloc_wq = NULL;
- }
iwl_pcie_free_rbs_pool(trans);
sequence = le16_to_cpu(pkt->hdr.sequence);
index = SEQ_TO_INDEX(sequence);
- cmd_index = get_cmd_index(txq, index);
+ cmd_index = iwl_pcie_get_cmd_index(txq, index);
if (rxq->id == 0)
iwl_op_mode_rx(trans->op_mode, &rxq->napi,
&first_ucode_section);
}
-bool iwl_trans_check_hw_rf_kill(struct iwl_trans *trans)
+bool iwl_pcie_check_hw_rf_kill(struct iwl_trans *trans)
{
struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans);
bool hw_rfkill = iwl_is_rfkill_set(trans);
mutex_lock(&trans_pcie->mutex);
/* If platform's RF_KILL switch is NOT set to KILL */
- hw_rfkill = iwl_trans_check_hw_rf_kill(trans);
+ hw_rfkill = iwl_pcie_check_hw_rf_kill(trans);
if (hw_rfkill && !run_in_rfkill) {
ret = -ERFKILL;
goto out;
ret = iwl_pcie_load_given_ucode(trans, fw);
/* re-check RF-Kill state since we may have missed the interrupt */
- hw_rfkill = iwl_trans_check_hw_rf_kill(trans);
+ hw_rfkill = iwl_pcie_check_hw_rf_kill(trans);
if (hw_rfkill && !run_in_rfkill)
ret = -ERFKILL;
trans_pcie->is_down = false;
/* ...rfkill can call stop_device and set it false if needed */
- iwl_trans_check_hw_rf_kill(trans);
+ iwl_pcie_check_hw_rf_kill(trans);
/* Make sure we sync here, because we'll need full access later */
if (low_power)
iwl_pcie_tx_free(trans);
iwl_pcie_rx_free(trans);
+ if (trans_pcie->rba.alloc_wq) {
+ destroy_workqueue(trans_pcie->rba.alloc_wq);
+ trans_pcie->rba.alloc_wq = NULL;
+ }
+
if (trans_pcie->msix_enabled) {
for (i = 0; i < trans_pcie->alloc_vecs; i++) {
irq_set_affinity_hint(
* These bits say the device is running, and should keep running for
* at least a short while (at least as long as MAC_ACCESS_REQ stays 1),
* but they do not indicate that embedded SRAM is restored yet;
- * 3945 and 4965 have volatile SRAM, and must save/restore contents
- * to/from host DRAM when sleeping/waking for power-saving.
+ * HW with volatile SRAM must save/restore contents to/from
+ * host DRAM when sleeping/waking for power-saving.
* Each direction takes approximately 1/4 millisecond; with this
* overhead, it's a good idea to grab and hold MAC_ACCESS_REQUEST if a
* series of register accesses are expected (e.g. reading Event Log),
*
* CSR_UCODE_DRV_GP1 register bit MAC_SLEEP == 0 indicates that
* SRAM is okay/restored. We don't check that here because this call
- * is just for hardware register access; but GP1 MAC_SLEEP check is a
- * good idea before accessing 3945/4965 SRAM (e.g. reading Event Log).
+ * is just for hardware register access; but GP1 MAC_SLEEP
+ * check is a good idea before accessing the SRAM of HW with
+ * volatile SRAM (e.g. reading Event Log).
*
* 5000 series and later (including 1000 series) have non-volatile SRAM,
* and do not save/restore SRAM when power cycling.
spin_lock_bh(&cmdq->lock);
ptr = cmdq->write_ptr;
for (i = 0; i < cmdq->n_window; i++) {
- u8 idx = get_cmd_index(cmdq, ptr);
+ u8 idx = iwl_pcie_get_cmd_index(cmdq, ptr);
u32 caplen, cmdlen;
cmdlen = iwl_trans_pcie_get_cmdlen(trans, cmdq->tfds +
iwl_set_bit(trans, CSR_HOST_CHICKEN,
CSR_HOST_CHICKEN_PM_IDLE_SRC_DIS_SB_PME);
+#if IS_ENABLED(CONFIG_IWLMVM)
trans->hw_rf_id = iwl_read32(trans, CSR_HW_RF_ID);
+ if (trans->hw_rf_id == CSR_HW_RF_ID_TYPE_HR) {
+ u32 hw_status;
+
+ hw_status = iwl_read_prph(trans, UMAG_GEN_HW_STATUS);
+ if (hw_status & UMAG_GEN_HW_IS_FPGA)
+ trans->cfg = &iwla000_2ax_cfg_qnj_hr_f0;
+ else
+ trans->cfg = &iwla000_2ac_cfg_hr;
+ }
+#endif
iwl_pcie_set_interrupt_capa(pdev, trans);
trans->hw_id = (pdev->device << 16) + pdev->subsystem_device;
trans_pcie->inta_mask = CSR_INI_SET_MASK;
}
+ trans_pcie->rba.alloc_wq = alloc_workqueue("rb_allocator",
+ WQ_HIGHPRI | WQ_UNBOUND, 1);
+ INIT_WORK(&trans_pcie->rba.rx_alloc, iwl_pcie_rx_allocator_work);
+
#ifdef CONFIG_IWLWIFI_PCIE_RTPM
trans->runtime_pm_mode = IWL_PLAT_PM_MODE_D0I3;
#else
unsigned long sampling_interval; /* jiffies */
};
+struct mlx5_mpfs;
struct mlx5_eswitch;
struct mlx5_lag;
struct mlx5_pagefault;
struct list_head ctx_list;
spinlock_t ctx_lock;
+ struct list_head waiting_events_list;
+ bool is_accum_events;
+
struct mlx5_flow_steering *steering;
+ struct mlx5_mpfs *mpfs;
struct mlx5_eswitch *eswitch;
struct mlx5_core_sriov sriov;
struct mlx5_lag *lag;
};
enum mlx5_interface_state {
- MLX5_INTERFACE_STATE_DOWN = BIT(0),
- MLX5_INTERFACE_STATE_UP = BIT(1),
- MLX5_INTERFACE_STATE_SHUTDOWN = BIT(2),
+ MLX5_INTERFACE_STATE_UP = BIT(0),
};
enum mlx5_pci_status {
return buf->direct.buf + offset;
}
-extern struct workqueue_struct *mlx5_core_wq;
-
#define STRUCT_FIELD(header, field) \
.struct_offset_bytes = offsetof(struct ib_unpacked_ ## header, field), \
.struct_size_bytes = sizeof((struct ib_unpacked_ ## header *)0)->field
#include <linux/percpu.h>
#include <linux/rculist.h>
-#include <linux/dmaengine.h>
#include <linux/workqueue.h>
#include <linux/dynamic_queue_limits.h>
/* UDP Tunnel offloads */
struct udp_tunnel_info;
struct bpf_prog;
+struct xdp_buff;
void netdev_set_default_ethtool_ops(struct net_device *dev,
const struct ethtool_ops *ops);
*/
struct rx_queue_attribute {
struct attribute attr;
- ssize_t (*show)(struct netdev_rx_queue *queue,
- struct rx_queue_attribute *attr, char *buf);
+ ssize_t (*show)(struct netdev_rx_queue *queue, char *buf);
ssize_t (*store)(struct netdev_rx_queue *queue,
- struct rx_queue_attribute *attr, const char *buf, size_t len);
+ const char *buf, size_t len);
};
#ifdef CONFIG_XPS
typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
struct sk_buff *skb);
-/* These structures hold the attributes of qdisc and classifiers
- * that are being passed to the netdevice through the setup_tc op.
- */
-enum {
+enum tc_setup_type {
TC_SETUP_MQPRIO,
TC_SETUP_CLSU32,
TC_SETUP_CLSFLOWER,
- TC_SETUP_MATCHALL,
+ TC_SETUP_CLSMATCHALL,
TC_SETUP_CLSBPF,
};
-struct tc_cls_u32_offload;
-
-struct tc_to_netdev {
- unsigned int type;
- union {
- struct tc_cls_u32_offload *cls_u32;
- struct tc_cls_flower_offload *cls_flower;
- struct tc_cls_matchall_offload *cls_mall;
- struct tc_cls_bpf_offload *cls_bpf;
- struct tc_mqprio_qopt *mqprio;
- };
- bool egress_dev;
-};
-
/* These structures hold the attributes of xdp state that are being passed
* to the netdevice through the xdp op.
*/
* with PF and querying it may introduce a theoretical security risk.
* int (*ndo_set_vf_rss_query_en)(struct net_device *dev, int vf, bool setting);
* int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct sk_buff *skb);
- * int (*ndo_setup_tc)(struct net_device *dev, u32 handle, u32 chain_index,
- * __be16 protocol, struct tc_to_netdev *tc);
+ * int (*ndo_setup_tc)(struct net_device *dev, enum tc_setup_type type,
+ * void *type_data);
* Called to setup any 'tc' scheduler, classifier or action on @dev.
* This is always called from the stack with the rtnl lock held and netif
* tx queues stopped. This allows the netdevice to perform queue
* int (*ndo_xdp)(struct net_device *dev, struct netdev_xdp *xdp);
* This function is used to set or query state related to XDP on the
* netdevice. See definition of enum xdp_netdev_command for details.
- *
+ * int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_buff *xdp);
+ * This function is used to submit a XDP packet for transmit on a
+ * netdevice.
+ * void (*ndo_xdp_flush)(struct net_device *dev);
+ * This function is used to inform the driver to flush a particular
+ * xdp tx queue. Must be called on same CPU as xdp_xmit.
*/
struct net_device_ops {
int (*ndo_init)(struct net_device *dev);
struct net_device *dev,
int vf, bool setting);
int (*ndo_setup_tc)(struct net_device *dev,
- u32 handle, u32 chain_index,
- __be16 protocol,
- struct tc_to_netdev *tc);
+ enum tc_setup_type type,
+ void *type_data);
#if IS_ENABLED(CONFIG_FCOE)
int (*ndo_fcoe_enable)(struct net_device *dev);
int (*ndo_fcoe_disable)(struct net_device *dev);
int needed_headroom);
int (*ndo_xdp)(struct net_device *dev,
struct netdev_xdp *xdp);
+ int (*ndo_xdp_xmit)(struct net_device *dev,
+ struct xdp_buff *xdp);
+ void (*ndo_xdp_flush)(struct net_device *dev);
};
/**
#define NETDEV_PRECHANGEUPPER 0x001A
#define NETDEV_CHANGELOWERSTATE 0x001B
#define NETDEV_UDP_TUNNEL_PUSH_INFO 0x001C
+#define NETDEV_UDP_TUNNEL_DROP_INFO 0x001D
#define NETDEV_CHANGE_TX_QUEUE_LEN 0x001E
int register_netdevice_notifier(struct notifier_block *nb);
struct net_device *__dev_get_by_name(struct net *net, const char *name);
int dev_alloc_name(struct net_device *dev, const char *name);
int dev_open(struct net_device *dev);
-int dev_close(struct net_device *dev);
-int dev_close_many(struct list_head *head, bool unlink);
+void dev_close(struct net_device *dev);
+void dev_close_many(struct list_head *head, bool unlink);
void dev_disable_lro(struct net_device *dev);
int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *newskb);
int dev_queue_xmit(struct sk_buff *skb);
__dev_kfree_skb_any(skb, SKB_REASON_CONSUMED);
}
+void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog);
+int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb);
int netif_rx(struct sk_buff *skb);
int netif_rx_ni(struct sk_buff *skb);
int netif_receive_skb(struct sk_buff *skb);
bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
struct net_device *upper_dev);
+ bool netdev_has_any_upper_dev(struct net_device *dev);
+
void *netdev_lower_get_next_private(struct net_device *dev,
struct list_head **iter);
void *netdev_lower_get_next_private_rcu(struct net_device *dev,
return rc;
}
-int netdev_class_create_file_ns(struct class_attribute *class_attr,
+int netdev_class_create_file_ns(const struct class_attribute *class_attr,
const void *ns);
-void netdev_class_remove_file_ns(struct class_attribute *class_attr,
+void netdev_class_remove_file_ns(const struct class_attribute *class_attr,
const void *ns);
-static inline int netdev_class_create_file(struct class_attribute *class_attr)
+static inline int netdev_class_create_file(const struct class_attribute *class_attr)
{
return netdev_class_create_file_ns(class_attr, NULL);
}
-static inline void netdev_class_remove_file(struct class_attribute *class_attr)
+static inline void netdev_class_remove_file(const struct class_attribute *class_attr)
{
netdev_class_remove_file_ns(class_attr, NULL);
}
-extern struct kobj_ns_type_operations net_ns_type_operations;
+extern const struct kobj_ns_type_operations net_ns_type_operations;
const char *netdev_drivername(const struct net_device *dev);
/* check flags correspondence */
BUILD_BUG_ON(SKB_GSO_TCPV4 != (NETIF_F_TSO >> NETIF_F_GSO_SHIFT));
- BUILD_BUG_ON(SKB_GSO_UDP != (NETIF_F_UFO >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_DODGY != (NETIF_F_GSO_ROBUST >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_TCP_ECN != (NETIF_F_TSO_ECN >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_TCP_FIXEDID != (NETIF_F_TSO_MANGLEID >> NETIF_F_GSO_SHIFT));
frag->size -= delta;
}
+static inline bool skb_frag_must_loop(struct page *p)
+{
+#if defined(CONFIG_HIGHMEM)
+ if (PageHighMem(p))
+ return true;
+#endif
+ return false;
+}
+
+/**
+ * skb_frag_foreach_page - loop over pages in a fragment
+ *
+ * @f: skb frag to operate on
+ * @f_off: offset from start of f->page.p
+ * @f_len: length from f_off to loop over
+ * @p: (temp var) current page
+ * @p_off: (temp var) offset from start of current page,
+ * non-zero only on first page.
+ * @p_len: (temp var) length in current page,
+ * < PAGE_SIZE only on first and last page.
+ * @copied: (temp var) length so far, excluding current p_len.
+ *
+ * A fragment can hold a compound page, in which case per-page
+ * operations, notably kmap_atomic, must be called for each
+ * regular page.
+ */
+#define skb_frag_foreach_page(f, f_off, f_len, p, p_off, p_len, copied) \
+ for (p = skb_frag_page(f) + ((f_off) >> PAGE_SHIFT), \
+ p_off = (f_off) & (PAGE_SIZE - 1), \
+ p_len = skb_frag_must_loop(p) ? \
+ min_t(u32, f_len, PAGE_SIZE - p_off) : f_len, \
+ copied = 0; \
+ copied < f_len; \
+ copied += p_len, p++, p_off = 0, \
+ p_len = min_t(u32, f_len - copied, PAGE_SIZE)) \
+
#define HAVE_HW_TIME_STAMP
/**
SKBTX_SCHED_TSTAMP = 1 << 6,
};
+#define SKBTX_ZEROCOPY_FRAG (SKBTX_DEV_ZEROCOPY | SKBTX_SHARED_FRAG)
#define SKBTX_ANY_SW_TSTAMP (SKBTX_SW_TSTAMP | \
SKBTX_SCHED_TSTAMP)
#define SKBTX_ANY_TSTAMP (SKBTX_HW_TSTAMP | SKBTX_ANY_SW_TSTAMP)
*/
struct ubuf_info {
void (*callback)(struct ubuf_info *, bool zerocopy_success);
- void *ctx;
- unsigned long desc;
+ union {
+ struct {
+ unsigned long desc;
+ void *ctx;
+ };
+ struct {
+ u32 id;
+ u16 len;
+ u16 zerocopy:1;
+ u32 bytelen;
+ };
+ };
+ atomic_t refcnt;
+
+ struct mmpin {
+ struct user_struct *user;
+ unsigned int num_pg;
+ } mmp;
};
+#define skb_uarg(SKB) ((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg))
+
+struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size);
+struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
+ struct ubuf_info *uarg);
+
+static inline void sock_zerocopy_get(struct ubuf_info *uarg)
+{
+ atomic_inc(&uarg->refcnt);
+}
+
+void sock_zerocopy_put(struct ubuf_info *uarg);
+void sock_zerocopy_put_abort(struct ubuf_info *uarg);
+
+void sock_zerocopy_callback(struct ubuf_info *uarg, bool success);
+
+int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
+ struct msghdr *msg, int len,
+ struct ubuf_info *uarg);
+
/* This data is invariant across clones and lives at
* the end of the header data, ie. at skb->end.
*/
enum {
SKB_GSO_TCPV4 = 1 << 0,
- SKB_GSO_UDP = 1 << 1,
/* This indicates the skb is from an untrusted source. */
- SKB_GSO_DODGY = 1 << 2,
+ SKB_GSO_DODGY = 1 << 1,
/* This indicates the tcp segment has CWR set. */
- SKB_GSO_TCP_ECN = 1 << 3,
+ SKB_GSO_TCP_ECN = 1 << 2,
- SKB_GSO_TCP_FIXEDID = 1 << 4,
+ SKB_GSO_TCP_FIXEDID = 1 << 3,
- SKB_GSO_TCPV6 = 1 << 5,
+ SKB_GSO_TCPV6 = 1 << 4,
- SKB_GSO_FCOE = 1 << 6,
+ SKB_GSO_FCOE = 1 << 5,
- SKB_GSO_GRE = 1 << 7,
+ SKB_GSO_GRE = 1 << 6,
- SKB_GSO_GRE_CSUM = 1 << 8,
+ SKB_GSO_GRE_CSUM = 1 << 7,
- SKB_GSO_IPXIP4 = 1 << 9,
+ SKB_GSO_IPXIP4 = 1 << 8,
- SKB_GSO_IPXIP6 = 1 << 10,
+ SKB_GSO_IPXIP6 = 1 << 9,
- SKB_GSO_UDP_TUNNEL = 1 << 11,
+ SKB_GSO_UDP_TUNNEL = 1 << 10,
- SKB_GSO_UDP_TUNNEL_CSUM = 1 << 12,
+ SKB_GSO_UDP_TUNNEL_CSUM = 1 << 11,
- SKB_GSO_PARTIAL = 1 << 13,
+ SKB_GSO_PARTIAL = 1 << 12,
- SKB_GSO_TUNNEL_REMCSUM = 1 << 14,
+ SKB_GSO_TUNNEL_REMCSUM = 1 << 13,
- SKB_GSO_SCTP = 1 << 15,
+ SKB_GSO_SCTP = 1 << 14,
- SKB_GSO_ESP = 1 << 16,
+ SKB_GSO_ESP = 1 << 15,
};
#if BITS_PER_LONG > 32
return __alloc_skb(size, priority, SKB_ALLOC_FCLONE, NUMA_NO_NODE);
}
-struct sk_buff *__alloc_skb_head(gfp_t priority, int node);
-static inline struct sk_buff *alloc_skb_head(gfp_t priority)
-{
- return __alloc_skb_head(priority, -1);
-}
-
struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src);
int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask);
struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t priority);
int __must_check skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg,
int offset, int len);
int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer);
- int skb_pad(struct sk_buff *skb, int pad);
+ int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error);
+
+ /**
+ * skb_pad - zero pad the tail of an skb
+ * @skb: buffer to pad
+ * @pad: space to pad
+ *
+ * Ensure that a buffer is followed by a padding area that is zero
+ * filled. Used by network drivers which may DMA or transfer data
+ * beyond the buffer end onto the wire.
+ *
+ * May return error in out of memory cases. The skb is freed on error.
+ */
+ static inline int skb_pad(struct sk_buff *skb, int pad)
+ {
+ return __skb_pad(skb, pad, true);
+ }
#define dev_kfree_skb(a) consume_skb(a)
int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
return skb->hash;
}
-__u32 __skb_get_hash_flowi6(struct sk_buff *skb, const struct flowi6 *fl6);
-
static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, const struct flowi6 *fl6)
{
if (!skb->l4_hash && !skb->sw_hash) {
return skb->hash;
}
-__u32 __skb_get_hash_flowi4(struct sk_buff *skb, const struct flowi4 *fl);
-
-static inline __u32 skb_get_hash_flowi4(struct sk_buff *skb, const struct flowi4 *fl4)
-{
- if (!skb->l4_hash && !skb->sw_hash) {
- struct flow_keys keys;
- __u32 hash = __get_hash_from_flowi4(fl4, &keys);
-
- __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys));
- }
-
- return skb->hash;
-}
-
__u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb);
static inline __u32 skb_get_hash_raw(const struct sk_buff *skb)
return &skb_shinfo(skb)->hwtstamps;
}
+static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb)
+{
+ bool is_zcopy = skb && skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY;
+
+ return is_zcopy ? skb_uarg(skb) : NULL;
+}
+
+static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg)
+{
+ if (skb && uarg && !skb_zcopy(skb)) {
+ sock_zerocopy_get(uarg);
+ skb_shinfo(skb)->destructor_arg = uarg;
+ skb_shinfo(skb)->tx_flags |= SKBTX_ZEROCOPY_FRAG;
+ }
+}
+
+/* Release a reference on a zerocopy structure */
+static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy)
+{
+ struct ubuf_info *uarg = skb_zcopy(skb);
+
+ if (uarg) {
+ if (uarg->callback == sock_zerocopy_callback) {
+ uarg->zerocopy = uarg->zerocopy && zerocopy;
+ sock_zerocopy_put(uarg);
+ } else {
+ uarg->callback(uarg, zerocopy);
+ }
+
+ skb_shinfo(skb)->tx_flags &= ~SKBTX_ZEROCOPY_FRAG;
+ }
+}
+
+/* Abort a zerocopy operation and revert zckey on error in send syscall */
+static inline void skb_zcopy_abort(struct sk_buff *skb)
+{
+ struct ubuf_info *uarg = skb_zcopy(skb);
+
+ if (uarg) {
+ sock_zerocopy_put_abort(uarg);
+ skb_shinfo(skb)->tx_flags &= ~SKBTX_ZEROCOPY_FRAG;
+ }
+}
+
/**
* skb_queue_empty - check if a queue is empty
* @list: queue head
return skb->len - skb->data_len;
}
-static inline unsigned int skb_pagelen(const struct sk_buff *skb)
+static inline unsigned int __skb_pagelen(const struct sk_buff *skb)
{
unsigned int i, len = 0;
for (i = skb_shinfo(skb)->nr_frags - 1; (int)i >= 0; i--)
len += skb_frag_size(&skb_shinfo(skb)->frags[i]);
- return len + skb_headlen(skb);
+ return len;
+}
+
+static inline unsigned int skb_pagelen(const struct sk_buff *skb)
+{
+ return skb_headlen(skb) + __skb_pagelen(skb);
}
/**
*/
static inline int skb_orphan_frags(struct sk_buff *skb, gfp_t gfp_mask)
{
- if (likely(!(skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY)))
+ if (likely(!skb_zcopy(skb)))
+ return 0;
+ if (skb_uarg(skb)->callback == sock_zerocopy_callback)
+ return 0;
+ return skb_copy_ubufs(skb, gfp_mask);
+}
+
+/* Frags must be orphaned, even if refcounted, if skb might loop to rx path */
+static inline int skb_orphan_frags_rx(struct sk_buff *skb, gfp_t gfp_mask)
+{
+ if (likely(!skb_zcopy(skb)))
return 0;
return skb_copy_ubufs(skb, gfp_mask);
}
* skb_put_padto - increase size and pad an skbuff up to a minimal size
* @skb: buffer to pad
* @len: minimal length
+ * @free_on_error: free buffer on error
*
* Pads up a buffer to ensure the trailing bytes exist and are
* blanked. If the buffer already contains sufficient data it
* is untouched. Otherwise it is extended. Returns zero on
- * success. The skb is freed on error.
+ * success. The skb is freed on error if @free_on_error is true.
*/
- static inline int skb_put_padto(struct sk_buff *skb, unsigned int len)
+ static inline int __skb_put_padto(struct sk_buff *skb, unsigned int len,
+ bool free_on_error)
{
unsigned int size = skb->len;
if (unlikely(size < len)) {
len -= size;
- if (skb_pad(skb, len))
+ if (__skb_pad(skb, len, free_on_error))
return -ENOMEM;
__skb_put(skb, len);
}
return 0;
}
+ /**
+ * skb_put_padto - increase size and pad an skbuff up to a minimal size
+ * @skb: buffer to pad
+ * @len: minimal length
+ *
+ * Pads up a buffer to ensure the trailing bytes exist and are
+ * blanked. If the buffer already contains sufficient data it
+ * is untouched. Otherwise it is extended. Returns zero on
+ * success. The skb is freed on error.
+ */
+ static inline int skb_put_padto(struct sk_buff *skb, unsigned int len)
+ {
+ return __skb_put_padto(skb, len, true);
+ }
+
static inline int skb_add_data(struct sk_buff *skb,
struct iov_iter *from, int copy)
{
static inline bool skb_can_coalesce(struct sk_buff *skb, int i,
const struct page *page, int off)
{
+ if (skb_zcopy(skb))
+ return false;
if (i) {
const struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[i - 1];
int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
struct pipe_inode_info *pipe, unsigned int len,
unsigned int flags);
+int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
+ int len);
+int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len);
void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to);
unsigned int skb_zerocopy_headlen(const struct sk_buff *from);
int skb_zerocopy(struct sk_buff *to, struct sk_buff *from,
#include <linux/ipv6_route.h>
#include <linux/rtnetlink.h>
#include <linux/spinlock.h>
+#include <linux/notifier.h>
#include <net/dst.h>
#include <net/flow.h>
#include <net/netlink.h>
#include <net/inetpeer.h>
+#include <net/fib_notifier.h>
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
#define FIB6_TABLE_HASHSZ 256
__u16 fn_flags;
int fn_sernum;
struct rt6_info *rr_ptr;
+ struct rcu_head rcu;
};
#ifndef CONFIG_IPV6_SUBTREES
* the same cache line.
*/
struct fib6_table *rt6i_table;
- struct fib6_node *rt6i_node;
+ struct fib6_node __rcu *rt6i_node;
struct in6_addr rt6i_gateway;
atomic_t rt6i_ref;
+ unsigned int rt6i_nh_flags;
+
/* These are in a separate cache line. */
struct rt6key rt6i_dst ____cacheline_aligned_in_smp;
u32 rt6i_flags;
rt0->rt6i_flags |= RTF_EXPIRES;
}
+ /* Function to safely get fn->sernum for passed in rt
+ * and store result in passed in cookie.
+ * Return true if we can get cookie safely
+ * Return false if not
+ */
+ static inline bool rt6_get_cookie_safe(const struct rt6_info *rt,
+ u32 *cookie)
+ {
+ struct fib6_node *fn;
+ bool status = false;
+
+ rcu_read_lock();
+ fn = rcu_dereference(rt->rt6i_node);
+
+ if (fn) {
+ *cookie = fn->fn_sernum;
+ status = true;
+ }
+
+ rcu_read_unlock();
+ return status;
+ }
+
static inline u32 rt6_get_cookie(const struct rt6_info *rt)
{
+ u32 cookie = 0;
+
if (rt->rt6i_flags & RTF_PCPU ||
(unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from))
rt = (struct rt6_info *)(rt->dst.from);
- return rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
+ rt6_get_cookie_safe(rt, &cookie);
+
+ return cookie;
}
static inline void ip6_rt_put(struct rt6_info *rt)
dst_release(&rt->dst);
}
+void rt6_free_pcpu(struct rt6_info *non_pcpu_rt);
+
+static inline void rt6_hold(struct rt6_info *rt)
+{
+ atomic_inc(&rt->rt6i_ref);
+}
+
+static inline void rt6_release(struct rt6_info *rt)
+{
+ if (atomic_dec_and_test(&rt->rt6i_ref)) {
+ rt6_free_pcpu(rt);
+ dst_dev_put(&rt->dst);
+ dst_release(&rt->dst);
+ }
+}
+
enum fib6_walk_state {
#ifdef CONFIG_IPV6_SUBTREES
FWS_S,
struct fib6_node tb6_root;
struct inet_peer_base tb6_peers;
unsigned int flags;
+ unsigned int fib_seq;
#define RT6_TABLE_HAS_DFLT_ROUTER BIT(0)
};
struct fib6_table *,
struct flowi6 *, int);
+struct fib6_entry_notifier_info {
+ struct fib_notifier_info info; /* must be first */
+ struct rt6_info *rt;
+};
+
/*
* exported functions
*/
int ipv6_route_open(struct inode *inode, struct file *file);
+int call_fib6_notifier(struct notifier_block *nb, struct net *net,
+ enum fib_event_type event_type,
+ struct fib_notifier_info *info);
+int call_fib6_notifiers(struct net *net, enum fib_event_type event_type,
+ struct fib_notifier_info *info);
+
+int __net_init fib6_notifier_init(struct net *net);
+void __net_exit fib6_notifier_exit(struct net *net);
+
+unsigned int fib6_tables_seq_read(struct net *net);
+int fib6_tables_dump(struct net *net, struct notifier_block *nb);
+
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
int fib6_rules_init(void);
void fib6_rules_cleanup(void);
+bool fib6_rule_default(const struct fib_rule *rule);
+int fib6_rules_dump(struct net *net, struct notifier_block *nb);
+unsigned int fib6_rules_seq_read(struct net *net);
#else
static inline int fib6_rules_init(void)
{
{
return ;
}
+static inline bool fib6_rule_default(const struct fib_rule *rule)
+{
+ return true;
+}
+static inline int fib6_rules_dump(struct net *net, struct notifier_block *nb)
+{
+ return 0;
+}
+static inline unsigned int fib6_rules_seq_read(struct net *net)
+{
+ return 0;
+}
#endif
#endif
struct hlist_node hash;
u32 handle;
u32 parent;
- void *u32_node;
struct netdev_queue *dev_queue;
spinlock_t busylock ____cacheline_aligned_in_smp;
};
+ static inline void qdisc_refcount_inc(struct Qdisc *qdisc)
+ {
+ if (qdisc->flags & TCQ_F_BUILTIN)
+ return;
+ refcount_inc(&qdisc->refcnt);
+ }
+
static inline bool qdisc_is_running(const struct Qdisc *qdisc)
{
return (raw_read_seqcount(&qdisc->running) & 1) ? true : false;
void (*qlen_notify)(struct Qdisc *, unsigned long);
/* Class manipulation routines */
- unsigned long (*get)(struct Qdisc *, u32 classid);
- void (*put)(struct Qdisc *, unsigned long);
+ unsigned long (*find)(struct Qdisc *, u32 classid);
int (*change)(struct Qdisc *, u32, u32,
struct nlattr **, unsigned long *);
int (*delete)(struct Qdisc *, unsigned long);
/* Filter manipulation */
struct tcf_block * (*tcf_block)(struct Qdisc *, unsigned long);
- bool (*tcf_cl_offload)(u32 classid);
unsigned long (*bind_tcf)(struct Qdisc *, unsigned long,
u32 classid);
void (*unbind_tcf)(struct Qdisc *, unsigned long);
int (*init)(struct tcf_proto*);
void (*destroy)(struct tcf_proto*);
- unsigned long (*get)(struct tcf_proto*, u32 handle);
+ void* (*get)(struct tcf_proto*, u32 handle);
int (*change)(struct net *net, struct sk_buff *,
struct tcf_proto*, unsigned long,
u32 handle, struct nlattr **,
- unsigned long *, bool);
- int (*delete)(struct tcf_proto*, unsigned long, bool*);
+ void **, bool);
+ int (*delete)(struct tcf_proto*, void *, bool*);
void (*walk)(struct tcf_proto*, struct tcf_walker *arg);
+ void (*bind_class)(void *, u32, unsigned long);
/* rtnetlink specific */
- int (*dump)(struct net*, struct tcf_proto*, unsigned long,
+ int (*dump)(struct net*, struct tcf_proto*, void *,
struct sk_buff *skb, struct tcmsg*);
struct module *owner;
struct Qdisc_class_common *cl;
unsigned int h;
+ if (!id)
+ return NULL;
+
h = qdisc_class_hash(id, hash->hashmask);
hlist_for_each_entry(cl, &hash->hash[h], hnode) {
if (cl->classid == id)
#endif
#define TCP_RTO_MAX ((unsigned)(120*HZ))
#define TCP_RTO_MIN ((unsigned)(HZ/5))
+#define TCP_TIMEOUT_MIN (2U) /* Min timeout for TCP timers in jiffies */
#define TCP_TIMEOUT_INIT ((unsigned)(1*HZ)) /* RFC6298 2.1 initial RTO value */
#define TCP_TIMEOUT_FALLBACK ((unsigned)(3*HZ)) /* RFC 1122 initial RTO value, now
* used as a fallback RTO for the
#define TCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ/2U)) /* Maximal interval between probes
* for local resources.
*/
-#define TCP_REO_TIMEOUT_MIN (2000) /* Min RACK reordering timeout in usec */
-
#define TCP_KEEPALIVE_TIME (120*60*HZ) /* two hours */
#define TCP_KEEPALIVE_PROBES 9 /* Max of 9 keepalive probes */
#define TCP_KEEPALIVE_INTVL (75*HZ)
extern int sysctl_tcp_app_win;
extern int sysctl_tcp_adv_win_scale;
extern int sysctl_tcp_frto;
-extern int sysctl_tcp_low_latency;
extern int sysctl_tcp_nometrics_save;
extern int sysctl_tcp_moderate_rcvbuf;
extern int sysctl_tcp_tso_win_divisor;
int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw);
int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
+int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size);
int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size,
int flags);
+int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
+ size_t size, int flags);
ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
size_t size, int flags);
void tcp_release_cb(struct sock *sk);
int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg);
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb);
void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
- const struct tcphdr *th, unsigned int len);
+ const struct tcphdr *th);
void tcp_rcv_space_adjust(struct sock *sk);
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp);
void tcp_twsk_destructor(struct sock *sk);
u16 tcp_gso_segs;
u16 tcp_gso_size;
};
+
+ /* Used to stash the receive timestamp while this skb is in the
+ * out of order queue, as skb->tstamp is overwritten by the
+ * rbnode.
+ */
+ ktime_t swtstamp;
};
__u8 tcp_flags; /* TCP header flags. (tcp[13]) */
__u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */
__u8 txstamp_ack:1, /* Record TX timestamp for ack? */
eor:1, /* Is skb MSG_EOR marked? */
- unused:6;
+ has_rxtstamp:1, /* SKB has a RX timestamp */
+ unused:5;
__u32 ack_seq; /* Sequence number ACK'd */
union {
struct {
return l3_slave ? skb->skb_iif : TCP_SKB_CB(skb)->header.h6.iif;
}
+
+/* TCP_SKB_CB reference means this can not be used from early demux */
+static inline int tcp_v6_sdif(const struct sk_buff *skb)
+{
+#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
+ if (skb && ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags))
+ return TCP_SKB_CB(skb)->header.h6.iif;
+#endif
+ return 0;
+}
#endif
/* TCP_SKB_CB reference means this can not be used from early demux */
return false;
}
+/* TCP_SKB_CB reference means this can not be used from early demux */
+static inline int tcp_v4_sdif(struct sk_buff *skb)
+{
+#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
+ if (skb && ipv4_l3mdev_skb(TCP_SKB_CB(skb)->header.h4.flags))
+ return TCP_SKB_CB(skb)->header.h4.iif;
+#endif
+ return 0;
+}
+
/* Due to TSO, an SKB can be composed of multiple actual
* packets. To keep these tracked properly, we use this.
*/
void tcp_get_available_congestion_control(char *buf, size_t len);
void tcp_get_allowed_congestion_control(char *buf, size_t len);
int tcp_set_allowed_congestion_control(char *allowed);
- int tcp_set_congestion_control(struct sock *sk, const char *name, bool load);
- void tcp_reinit_congestion_control(struct sock *sk,
- const struct tcp_congestion_ops *ca);
+ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool reinit);
u32 tcp_slow_start(struct tcp_sock *tp, u32 acked);
void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked);
__tcp_checksum_complete(skb);
}
-/* Prequeue for VJ style copy to user, combined with checksumming. */
-
-static inline void tcp_prequeue_init(struct tcp_sock *tp)
-{
- tp->ucopy.task = NULL;
- tp->ucopy.len = 0;
- tp->ucopy.memory = 0;
- skb_queue_head_init(&tp->ucopy.prequeue);
-}
-
-bool tcp_prequeue(struct sock *sk, struct sk_buff *skb);
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb);
int tcp_filter(struct sock *sk, struct sk_buff *skb);
void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb);
struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
- struct tcp_fastopen_cookie *foc,
- struct dst_entry *dst);
+ struct tcp_fastopen_cookie *foc);
void tcp_fastopen_init_key_once(bool publish);
bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
struct tcp_fastopen_cookie *cookie);
/*
* Save and compile IPv4 options, return a pointer to it
*/
-static inline struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
+static inline struct ip_options_rcu *tcp_v4_save_options(struct net *net,
+ struct sk_buff *skb)
{
const struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt;
struct ip_options_rcu *dopt = NULL;
int opt_size = sizeof(*dopt) + opt->optlen;
dopt = kmalloc(opt_size, GFP_ATOMIC);
- if (dopt && __ip_options_echo(&dopt->opt, skb, opt)) {
+ if (dopt && __ip_options_echo(net, &dopt->opt, skb, opt)) {
kfree(dopt);
dopt = NULL;
}
}
void udp_v4_early_demux(struct sk_buff *skb);
- void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst);
+ bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst);
int udp_get_port(struct sock *sk, unsigned short snum,
int (*saddr_cmp)(const struct sock *,
const struct sock *));
struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
__be32 daddr, __be16 dport, int dif);
struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
- __be32 daddr, __be16 dport, int dif,
+ __be32 daddr, __be16 dport, int dif, int sdif,
struct udp_table *tbl, struct sk_buff *skb);
struct sock *udp4_lib_lookup_skb(struct sk_buff *skb,
__be16 sport, __be16 dport);
struct sock *__udp6_lib_lookup(struct net *net,
const struct in6_addr *saddr, __be16 sport,
const struct in6_addr *daddr, __be16 dport,
- int dif, struct udp_table *tbl,
+ int dif, int sdif, struct udp_table *tbl,
struct sk_buff *skb);
struct sock *udp6_lib_lookup_skb(struct sk_buff *skb,
__be16 sport, __be16 dport);
static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
{
- bool is_kprobe, is_tracepoint;
+ bool is_kprobe, is_tracepoint, is_syscall_tp;
struct bpf_prog *prog;
if (event->attr.type != PERF_TYPE_TRACEPOINT)
is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
- if (!is_kprobe && !is_tracepoint)
+ is_syscall_tp = is_syscall_trace_event(event->tp_event);
+ if (!is_kprobe && !is_tracepoint && !is_syscall_tp)
/* bpf programs can only be attached to u/kprobe or tracepoint */
return -EINVAL;
return PTR_ERR(prog);
if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
- (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
+ (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
+ (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
/* valid fd, but invalid bpf program type */
bpf_prog_put(prog);
return -EINVAL;
}
- if (is_tracepoint) {
+ if (is_tracepoint || is_syscall_tp) {
int off = trace_event_get_offsets(event->tp_event);
if (prog->aux->max_ctx_offset > off) {
goto err_context;
/*
- * Do not allow to attach to a group in a different
- * task or CPU context:
+ * Make sure we're both events for the same CPU;
+ * grouping events for different CPUs is broken; since
+ * you can never concurrently schedule them anyhow.
*/
- if (move_group) {
- /*
- * Make sure we're both on the same task, or both
- * per-cpu events.
- */
- if (group_leader->ctx->task != ctx->task)
- goto err_context;
+ if (group_leader->cpu != event->cpu)
+ goto err_context;
- /*
- * Make sure we're both events for the same CPU;
- * grouping events for different CPUs is broken; since
- * you can never concurrently schedule them anyhow.
- */
- if (group_leader->cpu != event->cpu)
- goto err_context;
- } else {
- if (group_leader->ctx != ctx)
- goto err_context;
- }
+ /*
+ * Make sure we're both on the same task, or both
+ * per-CPU events.
+ */
+ if (group_leader->ctx->task != ctx->task)
+ goto err_context;
+
+ /*
+ * Do not allow to attach to a group in a different task
+ * or CPU context. If we're moving SW events, we'll fix
+ * this up later, so allow that.
+ */
+ if (!move_group && group_leader->ctx != ctx)
+ goto err_context;
/*
* Only a group leader can be exclusive or pinned
if (flags & MSG_PEEK) {
err = -ENOENT;
spin_lock_bh(&sk_queue->lock);
- if (skb == skb_peek(sk_queue)) {
+ if (skb->next) {
__skb_unlink(skb, sk_queue);
refcount_dec(&skb->users);
if (destructor)
}
EXPORT_SYMBOL(skb_copy_datagram_from_iter);
-/**
- * zerocopy_sg_from_iter - Build a zerocopy datagram from an iov_iter
- * @skb: buffer to copy
- * @from: the source to copy from
- *
- * The function will first copy up to headlen, and then pin the userspace
- * pages and build frags through them.
- *
- * Returns 0, -EFAULT or -EMSGSIZE.
- */
-int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
+int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
+ struct iov_iter *from, size_t length)
{
- int len = iov_iter_count(from);
- int copy = min_t(int, skb_headlen(skb), len);
- int frag = 0;
+ int frag = skb_shinfo(skb)->nr_frags;
- /* copy up to skb headlen */
- if (skb_copy_datagram_from_iter(skb, 0, from, copy))
- return -EFAULT;
-
- while (iov_iter_count(from)) {
+ while (length && iov_iter_count(from)) {
struct page *pages[MAX_SKB_FRAGS];
size_t start;
ssize_t copied;
if (frag == MAX_SKB_FRAGS)
return -EMSGSIZE;
- copied = iov_iter_get_pages(from, pages, ~0U,
+ copied = iov_iter_get_pages(from, pages, length,
MAX_SKB_FRAGS - frag, &start);
if (copied < 0)
return -EFAULT;
iov_iter_advance(from, copied);
+ length -= copied;
truesize = PAGE_ALIGN(copied + start);
skb->data_len += copied;
skb->len += copied;
skb->truesize += truesize;
- refcount_add(truesize, &skb->sk->sk_wmem_alloc);
+ if (sk && sk->sk_type == SOCK_STREAM) {
+ sk->sk_wmem_queued += truesize;
+ sk_mem_charge(sk, truesize);
+ } else {
+ refcount_add(truesize, &skb->sk->sk_wmem_alloc);
+ }
while (copied) {
int size = min_t(int, copied, PAGE_SIZE - start);
skb_fill_page_desc(skb, frag++, pages[n], start, size);
}
return 0;
}
+EXPORT_SYMBOL(__zerocopy_sg_from_iter);
+
+/**
+ * zerocopy_sg_from_iter - Build a zerocopy datagram from an iov_iter
+ * @skb: buffer to copy
+ * @from: the source to copy from
+ *
+ * The function will first copy up to headlen, and then pin the userspace
+ * pages and build frags through them.
+ *
+ * Returns 0, -EFAULT or -EMSGSIZE.
+ */
+int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
+{
+ int copy = min_t(int, skb_headlen(skb), iov_iter_count(from));
+
+ /* copy up to skb headlen */
+ if (skb_copy_datagram_from_iter(skb, 0, from, copy))
+ return -EFAULT;
+
+ return __zerocopy_sg_from_iter(NULL, skb, from, ~0U);
+}
EXPORT_SYMBOL(zerocopy_sg_from_iter);
static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
#include <linux/netfilter_ingress.h>
#include <linux/crash_dump.h>
#include <linux/sctp.h>
+#include <net/udp_tunnel.h>
#include "net-sysfs.h"
}
EXPORT_SYMBOL(dev_open);
-static int __dev_close_many(struct list_head *head)
+static void __dev_close_many(struct list_head *head)
{
struct net_device *dev;
dev->flags &= ~IFF_UP;
netpoll_poll_enable(dev);
}
-
- return 0;
}
-static int __dev_close(struct net_device *dev)
+static void __dev_close(struct net_device *dev)
{
- int retval;
LIST_HEAD(single);
list_add(&dev->close_list, &single);
- retval = __dev_close_many(&single);
+ __dev_close_many(&single);
list_del(&single);
-
- return retval;
}
-int dev_close_many(struct list_head *head, bool unlink)
+void dev_close_many(struct list_head *head, bool unlink)
{
struct net_device *dev, *tmp;
if (unlink)
list_del_init(&dev->close_list);
}
-
- return 0;
}
EXPORT_SYMBOL(dev_close_many);
* is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
* chain.
*/
-int dev_close(struct net_device *dev)
+void dev_close(struct net_device *dev)
{
if (dev->flags & IFF_UP) {
LIST_HEAD(single);
dev_close_many(&single, true);
list_del(&single);
}
- return 0;
}
EXPORT_SYMBOL(dev_close);
struct packet_type *pt_prev,
struct net_device *orig_dev)
{
- if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
+ if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
return -ENOMEM;
refcount_inc(&skb->users);
return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
{
if (tx_path)
- return skb->ip_summed != CHECKSUM_PARTIAL &&
- skb->ip_summed != CHECKSUM_UNNECESSARY;
+ return skb->ip_summed != CHECKSUM_PARTIAL;
return skb->ip_summed == CHECKSUM_NONE;
}
return NET_RX_DROP;
}
+static u32 netif_receive_generic_xdp(struct sk_buff *skb,
+ struct bpf_prog *xdp_prog)
+{
+ struct xdp_buff xdp;
+ u32 act = XDP_DROP;
+ void *orig_data;
+ int hlen, off;
+ u32 mac_len;
+
+ /* Reinjected packets coming from act_mirred or similar should
+ * not get XDP generic processing.
+ */
+ if (skb_cloned(skb))
+ return XDP_PASS;
+
+ if (skb_linearize(skb))
+ goto do_drop;
+
+ /* The XDP program wants to see the packet starting at the MAC
+ * header.
+ */
+ mac_len = skb->data - skb_mac_header(skb);
+ hlen = skb_headlen(skb) + mac_len;
+ xdp.data = skb->data - mac_len;
+ xdp.data_end = xdp.data + hlen;
+ xdp.data_hard_start = skb->data - skb_headroom(skb);
+ orig_data = xdp.data;
+
+ act = bpf_prog_run_xdp(xdp_prog, &xdp);
+
+ off = xdp.data - orig_data;
+ if (off > 0)
+ __skb_pull(skb, off);
+ else if (off < 0)
+ __skb_push(skb, -off);
+
+ switch (act) {
+ case XDP_REDIRECT:
+ case XDP_TX:
+ __skb_push(skb, mac_len);
+ /* fall through */
+ case XDP_PASS:
+ break;
+
+ default:
+ bpf_warn_invalid_xdp_action(act);
+ /* fall through */
+ case XDP_ABORTED:
+ trace_xdp_exception(skb->dev, xdp_prog, act);
+ /* fall through */
+ case XDP_DROP:
+ do_drop:
+ kfree_skb(skb);
+ break;
+ }
+
+ return act;
+}
+
+/* When doing generic XDP we have to bypass the qdisc layer and the
+ * network taps in order to match in-driver-XDP behavior.
+ */
+void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
+{
+ struct net_device *dev = skb->dev;
+ struct netdev_queue *txq;
+ bool free_skb = true;
+ int cpu, rc;
+
+ txq = netdev_pick_tx(dev, skb, NULL);
+ cpu = smp_processor_id();
+ HARD_TX_LOCK(dev, txq, cpu);
+ if (!netif_xmit_stopped(txq)) {
+ rc = netdev_start_xmit(skb, dev, txq, 0);
+ if (dev_xmit_complete(rc))
+ free_skb = false;
+ }
+ HARD_TX_UNLOCK(dev, txq);
+ if (free_skb) {
+ trace_xdp_exception(dev, xdp_prog, XDP_TX);
+ kfree_skb(skb);
+ }
+}
+EXPORT_SYMBOL_GPL(generic_xdp_tx);
+
+static struct static_key generic_xdp_needed __read_mostly;
+
+int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
+{
+ if (xdp_prog) {
+ u32 act = netif_receive_generic_xdp(skb, xdp_prog);
+ int err;
+
+ if (act != XDP_PASS) {
+ switch (act) {
+ case XDP_REDIRECT:
+ err = xdp_do_generic_redirect(skb->dev, skb,
+ xdp_prog);
+ if (err)
+ goto out_redir;
+ /* fallthru to submit skb */
+ case XDP_TX:
+ generic_xdp_tx(skb, xdp_prog);
+ break;
+ }
+ return XDP_DROP;
+ }
+ }
+ return XDP_PASS;
+out_redir:
+ kfree_skb(skb);
+ return XDP_DROP;
+}
+EXPORT_SYMBOL_GPL(do_xdp_generic);
+
static int netif_rx_internal(struct sk_buff *skb)
{
int ret;
net_timestamp_check(netdev_tstamp_prequeue, skb);
trace_netif_rx(skb);
+
+ if (static_key_false(&generic_xdp_needed)) {
+ int ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog),
+ skb);
+
+ /* Consider XDP consuming the packet a success from
+ * the netdev point of view we do not want to count
+ * this as an error.
+ */
+ if (ret != XDP_PASS)
+ return NET_RX_SUCCESS;
+ }
+
#ifdef CONFIG_RPS
if (static_key_false(&rps_needed)) {
struct rps_dev_flow voidflow, *rflow = &voidflow;
}
if (pt_prev) {
- if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
+ if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
goto drop;
else
ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
return ret;
}
-static struct static_key generic_xdp_needed __read_mostly;
-
static int generic_xdp_install(struct net_device *dev, struct netdev_xdp *xdp)
{
struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
return ret;
}
-static u32 netif_receive_generic_xdp(struct sk_buff *skb,
- struct bpf_prog *xdp_prog)
-{
- struct xdp_buff xdp;
- u32 act = XDP_DROP;
- void *orig_data;
- int hlen, off;
- u32 mac_len;
-
- /* Reinjected packets coming from act_mirred or similar should
- * not get XDP generic processing.
- */
- if (skb_cloned(skb))
- return XDP_PASS;
-
- if (skb_linearize(skb))
- goto do_drop;
-
- /* The XDP program wants to see the packet starting at the MAC
- * header.
- */
- mac_len = skb->data - skb_mac_header(skb);
- hlen = skb_headlen(skb) + mac_len;
- xdp.data = skb->data - mac_len;
- xdp.data_end = xdp.data + hlen;
- xdp.data_hard_start = skb->data - skb_headroom(skb);
- orig_data = xdp.data;
-
- act = bpf_prog_run_xdp(xdp_prog, &xdp);
-
- off = xdp.data - orig_data;
- if (off > 0)
- __skb_pull(skb, off);
- else if (off < 0)
- __skb_push(skb, -off);
-
- switch (act) {
- case XDP_TX:
- __skb_push(skb, mac_len);
- /* fall through */
- case XDP_PASS:
- break;
-
- default:
- bpf_warn_invalid_xdp_action(act);
- /* fall through */
- case XDP_ABORTED:
- trace_xdp_exception(skb->dev, xdp_prog, act);
- /* fall through */
- case XDP_DROP:
- do_drop:
- kfree_skb(skb);
- break;
- }
-
- return act;
-}
-
-/* When doing generic XDP we have to bypass the qdisc layer and the
- * network taps in order to match in-driver-XDP behavior.
- */
-static void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
-{
- struct net_device *dev = skb->dev;
- struct netdev_queue *txq;
- bool free_skb = true;
- int cpu, rc;
-
- txq = netdev_pick_tx(dev, skb, NULL);
- cpu = smp_processor_id();
- HARD_TX_LOCK(dev, txq, cpu);
- if (!netif_xmit_stopped(txq)) {
- rc = netdev_start_xmit(skb, dev, txq, 0);
- if (dev_xmit_complete(rc))
- free_skb = false;
- }
- HARD_TX_UNLOCK(dev, txq);
- if (free_skb) {
- trace_xdp_exception(dev, xdp_prog, XDP_TX);
- kfree_skb(skb);
- }
-}
-
static int netif_receive_skb_internal(struct sk_buff *skb)
{
int ret;
rcu_read_lock();
if (static_key_false(&generic_xdp_needed)) {
- struct bpf_prog *xdp_prog = rcu_dereference(skb->dev->xdp_prog);
+ int ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog),
+ skb);
- if (xdp_prog) {
- u32 act = netif_receive_generic_xdp(skb, xdp_prog);
-
- if (act != XDP_PASS) {
- rcu_read_unlock();
- if (act == XDP_TX)
- generic_xdp_tx(skb, xdp_prog);
- return NET_RX_DROP;
- }
+ if (ret != XDP_PASS) {
+ rcu_read_unlock();
+ return NET_RX_DROP;
}
}
* Ideally, a new ndo_busy_poll_stop() could avoid another round.
*/
rc = napi->poll(napi, BUSY_POLL_BUDGET);
+ trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
netpoll_poll_unlock(have_poll_lock);
if (rc == BUSY_POLL_BUDGET)
__napi_schedule(napi);
* Find out if a device is linked to an upper device and return true in case
* it is. The caller must hold the RTNL lock.
*/
- static bool netdev_has_any_upper_dev(struct net_device *dev)
+ bool netdev_has_any_upper_dev(struct net_device *dev)
{
ASSERT_RTNL();
return !list_empty(&dev->adj_list.upper);
}
+ EXPORT_SYMBOL(netdev_has_any_upper_dev);
/**
* netdev_master_upper_dev_get - Get master upper device
*/
ret = 0;
- if ((old_flags ^ flags) & IFF_UP)
- ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
+ if ((old_flags ^ flags) & IFF_UP) {
+ if (old_flags & IFF_UP)
+ __dev_close(dev);
+ else
+ ret = __dev_open(dev);
+ }
if ((flags ^ dev->gflags) & IFF_PROMISC) {
int inc = (flags & IFF_PROMISC) ? 1 : -1;
features &= ~NETIF_F_GSO;
}
- /* UFO needs SG and checksumming */
- if (features & NETIF_F_UFO) {
- /* maybe split UFO into V4 and V6? */
- if (!(features & NETIF_F_HW_CSUM) &&
- ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) !=
- (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) {
- netdev_dbg(dev,
- "Dropping NETIF_F_UFO since no checksum offload features.\n");
- features &= ~NETIF_F_UFO;
- }
-
- if (!(features & NETIF_F_SG)) {
- netdev_dbg(dev,
- "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
- features &= ~NETIF_F_UFO;
- }
- }
-
/* GSO partial features require GSO partial be set */
if ((features & dev->gso_partial_features) &&
!(features & NETIF_F_GSO_PARTIAL)) {
netdev_for_each_lower_dev(dev, lower, iter)
netdev_sync_lower_features(dev, lower, features);
- if (!err)
+ if (!err) {
+ netdev_features_t diff = features ^ dev->features;
+
+ if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) {
+ /* udp_tunnel_{get,drop}_rx_info both need
+ * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the
+ * device, or they won't do anything.
+ * Thus we need to update dev->features
+ * *before* calling udp_tunnel_get_rx_info,
+ * but *after* calling udp_tunnel_drop_rx_info.
+ */
+ if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
+ dev->features = features;
+ udp_tunnel_get_rx_info(dev);
+ } else {
+ udp_tunnel_drop_rx_info(dev);
+ }
+ }
+
dev->features = features;
+ }
return err < 0 ? 0 : 1;
}
*/
dev->hw_features |= NETIF_F_SOFT_FEATURES;
dev->features |= NETIF_F_SOFT_FEATURES;
+
+ if (dev->netdev_ops->ndo_udp_tunnel_add) {
+ dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
+ dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
+ }
+
dev->wanted_features = dev->features & dev->hw_features;
if (!(dev->flags & IFF_LOOPBACK))
#include <net/sock_reuseport.h>
#include <net/busy_poll.h>
#include <net/tcp.h>
+#include <linux/bpf_trace.h>
/**
* sk_filter_trim_cap - run a packet through a socket filter
break;
}
- /* Convert JEQ into JNE when 'jump_true' is next insn. */
- if (fp->jt == 0 && BPF_OP(fp->code) == BPF_JEQ) {
- insn->code = BPF_JMP | BPF_JNE | bpf_src;
+ /* Convert some jumps when 'jump_true' is next insn. */
+ if (fp->jt == 0) {
+ switch (BPF_OP(fp->code)) {
+ case BPF_JEQ:
+ insn->code = BPF_JMP | BPF_JNE | bpf_src;
+ break;
+ case BPF_JGT:
+ insn->code = BPF_JMP | BPF_JLE | bpf_src;
+ break;
+ case BPF_JGE:
+ insn->code = BPF_JMP | BPF_JLT | bpf_src;
+ break;
+ default:
+ goto jmp_rest;
+ }
+
target = i + fp->jf + 1;
BPF_EMIT_JMP;
break;
}
-
+jmp_rest:
/* Other jumps are mapped into two insns: Jxx and JA. */
target = i + fp->jt + 1;
insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
struct redirect_info {
u32 ifindex;
u32 flags;
+ struct bpf_map *map;
+ struct bpf_map *map_to_flush;
};
static DEFINE_PER_CPU(struct redirect_info, redirect_info);
ri->ifindex = ifindex;
ri->flags = flags;
+ ri->map = NULL;
return TC_ACT_REDIRECT;
}
.arg2_type = ARG_ANYTHING,
};
+BPF_CALL_3(bpf_sk_redirect_map, struct bpf_map *, map, u32, key, u64, flags)
+{
+ struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+
+ if (unlikely(flags))
+ return SK_ABORTED;
+
+ ri->ifindex = key;
+ ri->flags = flags;
+ ri->map = map;
+
+ return SK_REDIRECT;
+}
+
+struct sock *do_sk_redirect_map(void)
+{
+ struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+ struct sock *sk = NULL;
+
+ if (ri->map) {
+ sk = __sock_map_lookup_elem(ri->map, ri->ifindex);
+
+ ri->ifindex = 0;
+ ri->map = NULL;
+ /* we do not clear flags for future lookup */
+ }
+
+ return sk;
+}
+
+static const struct bpf_func_proto bpf_sk_redirect_map_proto = {
+ .func = bpf_sk_redirect_map,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+
BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
{
return task_get_classid(skb);
return ret;
if (skb_is_gso(skb)) {
- /* SKB_GSO_UDP stays as is. SKB_GSO_TCPV4 needs to
- * be changed into SKB_GSO_TCPV6.
+ /* SKB_GSO_TCPV4 needs to be changed into
+ * SKB_GSO_TCPV6.
*/
if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV4;
return ret;
if (skb_is_gso(skb)) {
- /* SKB_GSO_UDP stays as is. SKB_GSO_TCPV6 needs to
- * be changed into SKB_GSO_TCPV4.
+ /* SKB_GSO_TCPV6 needs to be changed into
+ * SKB_GSO_TCPV4.
*/
if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) {
skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV6;
.arg2_type = ARG_ANYTHING,
};
+static int __bpf_tx_xdp(struct net_device *dev,
+ struct bpf_map *map,
+ struct xdp_buff *xdp,
+ u32 index)
+{
+ int err;
+
+ if (!dev->netdev_ops->ndo_xdp_xmit) {
+ return -EOPNOTSUPP;
+ }
+
+ err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp);
+ if (err)
+ return err;
+ if (map)
+ __dev_map_insert_ctx(map, index);
+ else
+ dev->netdev_ops->ndo_xdp_flush(dev);
+ return 0;
+}
+
+void xdp_do_flush_map(void)
+{
+ struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+ struct bpf_map *map = ri->map_to_flush;
+
+ ri->map_to_flush = NULL;
+ if (map)
+ __dev_map_flush(map);
+}
+EXPORT_SYMBOL_GPL(xdp_do_flush_map);
+
+static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
+ struct bpf_prog *xdp_prog)
+{
+ struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+ struct bpf_map *map = ri->map;
+ u32 index = ri->ifindex;
+ struct net_device *fwd;
+ int err;
+
+ ri->ifindex = 0;
+ ri->map = NULL;
+
+ fwd = __dev_map_lookup_elem(map, index);
+ if (!fwd) {
+ err = -EINVAL;
+ goto err;
+ }
+ if (ri->map_to_flush && ri->map_to_flush != map)
+ xdp_do_flush_map();
+
+ err = __bpf_tx_xdp(fwd, map, xdp, index);
+ if (unlikely(err))
+ goto err;
+
+ ri->map_to_flush = map;
+ _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index);
+ return 0;
+err:
+ _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err);
+ return err;
+}
+
+int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
+ struct bpf_prog *xdp_prog)
+{
+ struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+ struct net_device *fwd;
+ u32 index = ri->ifindex;
+ int err;
+
+ if (ri->map)
+ return xdp_do_redirect_map(dev, xdp, xdp_prog);
+
+ fwd = dev_get_by_index_rcu(dev_net(dev), index);
+ ri->ifindex = 0;
+ if (unlikely(!fwd)) {
+ err = -EINVAL;
+ goto err;
+ }
+
+ err = __bpf_tx_xdp(fwd, NULL, xdp, 0);
+ if (unlikely(err))
+ goto err;
+
+ _trace_xdp_redirect(dev, xdp_prog, index);
+ return 0;
+err:
+ _trace_xdp_redirect_err(dev, xdp_prog, index, err);
+ return err;
+}
+EXPORT_SYMBOL_GPL(xdp_do_redirect);
+
+int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
+ struct bpf_prog *xdp_prog)
+{
+ struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+ u32 index = ri->ifindex;
+ struct net_device *fwd;
+ unsigned int len;
+ int err = 0;
+
+ fwd = dev_get_by_index_rcu(dev_net(dev), index);
+ ri->ifindex = 0;
+ if (unlikely(!fwd)) {
+ err = -EINVAL;
+ goto err;
+ }
+
+ if (unlikely(!(fwd->flags & IFF_UP))) {
+ err = -ENETDOWN;
+ goto err;
+ }
+
+ len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN;
+ if (skb->len > len) {
+ err = -EMSGSIZE;
+ goto err;
+ }
+
+ skb->dev = fwd;
+ _trace_xdp_redirect(dev, xdp_prog, index);
+ return 0;
+err:
+ _trace_xdp_redirect_err(dev, xdp_prog, index, err);
+ return err;
+}
+EXPORT_SYMBOL_GPL(xdp_do_generic_redirect);
+
+BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags)
+{
+ struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+
+ if (unlikely(flags))
+ return XDP_ABORTED;
+
+ ri->ifindex = ifindex;
+ ri->flags = flags;
+
+ return XDP_REDIRECT;
+}
+
+static const struct bpf_func_proto bpf_xdp_redirect_proto = {
+ .func = bpf_xdp_redirect,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+ .arg2_type = ARG_ANYTHING,
+};
+
+BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags)
+{
+ struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+
+ if (unlikely(flags))
+ return XDP_ABORTED;
+
+ ri->ifindex = ifindex;
+ ri->flags = flags;
+ ri->map = map;
+
+ return XDP_REDIRECT;
+}
+
+static const struct bpf_func_proto bpf_xdp_redirect_map_proto = {
+ .func = bpf_xdp_redirect_map,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+
bool bpf_helper_changes_pkt_data(void *func)
{
if (func == bpf_skb_vlan_push ||
sk->sk_prot->setsockopt == tcp_setsockopt) {
if (optname == TCP_CONGESTION) {
char name[TCP_CA_NAME_MAX];
+ bool reinit = bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN;
strncpy(name, optval, min_t(long, optlen,
TCP_CA_NAME_MAX-1));
name[TCP_CA_NAME_MAX-1] = 0;
- ret = tcp_set_congestion_control(sk, name, false);
- if (!ret && bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN)
- /* replacing an existing ca */
- tcp_reinit_congestion_control(sk,
- inet_csk(sk)->icsk_ca_ops);
+ ret = tcp_set_congestion_control(sk, name, false, reinit);
} else {
struct tcp_sock *tp = tcp_sk(sk);
ret = -EINVAL;
}
}
- ret = -EINVAL;
#endif
} else {
ret = -EINVAL;
}
}
+static const struct bpf_func_proto *
+sock_filter_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ /* inet and inet6 sockets are created in a process
+ * context so there is always a valid uid/gid
+ */
+ case BPF_FUNC_get_current_uid_gid:
+ return &bpf_get_current_uid_gid_proto;
+ default:
+ return bpf_base_func_proto(func_id);
+ }
+}
+
static const struct bpf_func_proto *
sk_filter_func_proto(enum bpf_func_id func_id)
{
return &bpf_get_smp_processor_id_proto;
case BPF_FUNC_xdp_adjust_head:
return &bpf_xdp_adjust_head_proto;
+ case BPF_FUNC_redirect:
+ return &bpf_xdp_redirect_proto;
+ case BPF_FUNC_redirect_map:
+ return &bpf_xdp_redirect_map_proto;
default:
return bpf_base_func_proto(func_id);
}
switch (func_id) {
case BPF_FUNC_setsockopt:
return &bpf_setsockopt_proto;
+ case BPF_FUNC_sock_map_update:
+ return &bpf_sock_map_update_proto;
+ default:
+ return bpf_base_func_proto(func_id);
+ }
+}
+
+static const struct bpf_func_proto *sk_skb_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_skb_store_bytes:
+ return &bpf_skb_store_bytes_proto;
+ case BPF_FUNC_skb_load_bytes:
+ return &bpf_skb_load_bytes_proto;
+ case BPF_FUNC_skb_pull_data:
+ return &bpf_skb_pull_data_proto;
+ case BPF_FUNC_skb_change_tail:
+ return &bpf_skb_change_tail_proto;
+ case BPF_FUNC_skb_change_head:
+ return &bpf_skb_change_head_proto;
+ case BPF_FUNC_get_socket_cookie:
+ return &bpf_get_socket_cookie_proto;
+ case BPF_FUNC_get_socket_uid:
+ return &bpf_get_socket_uid_proto;
+ case BPF_FUNC_sk_redirect_map:
+ return &bpf_sk_redirect_map_proto;
default:
return bpf_base_func_proto(func_id);
}
if (off + size > offsetofend(struct __sk_buff, cb[4]))
return false;
break;
+ case bpf_ctx_range_till(struct __sk_buff, remote_ip6[0], remote_ip6[3]):
+ case bpf_ctx_range_till(struct __sk_buff, local_ip6[0], local_ip6[3]):
+ case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4):
+ case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4):
case bpf_ctx_range(struct __sk_buff, data):
case bpf_ctx_range(struct __sk_buff, data_end):
if (size != size_default)
case bpf_ctx_range(struct __sk_buff, tc_classid):
case bpf_ctx_range(struct __sk_buff, data):
case bpf_ctx_range(struct __sk_buff, data_end):
+ case bpf_ctx_range_till(struct __sk_buff, family, local_port):
return false;
}
{
switch (off) {
case bpf_ctx_range(struct __sk_buff, tc_classid):
+ case bpf_ctx_range_till(struct __sk_buff, family, local_port):
return false;
}
if (type == BPF_WRITE) {
switch (off) {
case offsetof(struct bpf_sock, bound_dev_if):
+ case offsetof(struct bpf_sock, mark):
+ case offsetof(struct bpf_sock, priority):
break;
default:
return false;
return true;
}
-static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
- const struct bpf_prog *prog)
+static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write,
+ const struct bpf_prog *prog, int drop_verdict)
{
struct bpf_insn *insn = insn_buf;
* return TC_ACT_SHOT;
*/
*insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2);
- *insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, TC_ACT_SHOT);
+ *insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, drop_verdict);
*insn++ = BPF_EXIT_INSN();
/* restore: */
return insn - insn_buf;
}
+static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
+ const struct bpf_prog *prog)
+{
+ return bpf_unclone_prologue(insn_buf, direct_write, prog, TC_ACT_SHOT);
+}
+
static bool tc_cls_act_is_valid_access(int off, int size,
enum bpf_access_type type,
struct bpf_insn_access_aux *info)
case bpf_ctx_range(struct __sk_buff, data_end):
info->reg_type = PTR_TO_PACKET_END;
break;
+ case bpf_ctx_range_till(struct __sk_buff, family, local_port):
+ return false;
}
return bpf_skb_is_valid_access(off, size, type, info);
return __is_valid_sock_ops_access(off, size);
}
+static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write,
+ const struct bpf_prog *prog)
+{
+ return bpf_unclone_prologue(insn_buf, direct_write, prog, SK_DROP);
+}
+
+static bool sk_skb_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ struct bpf_insn_access_aux *info)
+{
+ if (type == BPF_WRITE) {
+ switch (off) {
+ case bpf_ctx_range(struct __sk_buff, mark):
+ case bpf_ctx_range(struct __sk_buff, tc_index):
+ case bpf_ctx_range(struct __sk_buff, priority):
+ break;
+ default:
+ return false;
+ }
+ }
+
+ switch (off) {
+ case bpf_ctx_range(struct __sk_buff, tc_classid):
+ return false;
+ case bpf_ctx_range(struct __sk_buff, data):
+ info->reg_type = PTR_TO_PACKET;
+ break;
+ case bpf_ctx_range(struct __sk_buff, data_end):
+ info->reg_type = PTR_TO_PACKET_END;
+ break;
+ }
+
+ return bpf_skb_is_valid_access(off, size, type, info);
+}
+
static u32 bpf_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
struct bpf_insn *insn_buf,
*insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
#endif
break;
+ case offsetof(struct __sk_buff, family):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+ bpf_target_off(struct sock_common,
+ skc_family,
+ 2, target_size));
+ break;
+ case offsetof(struct __sk_buff, remote_ip4):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ bpf_target_off(struct sock_common,
+ skc_daddr,
+ 4, target_size));
+ break;
+ case offsetof(struct __sk_buff, local_ip4):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ skc_rcv_saddr) != 4);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ bpf_target_off(struct sock_common,
+ skc_rcv_saddr,
+ 4, target_size));
+ break;
+ case offsetof(struct __sk_buff, remote_ip6[0]) ...
+ offsetof(struct __sk_buff, remote_ip6[3]):
+#if IS_ENABLED(CONFIG_IPV6)
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ skc_v6_daddr.s6_addr32[0]) != 4);
+
+ off = si->off;
+ off -= offsetof(struct __sk_buff, remote_ip6[0]);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common,
+ skc_v6_daddr.s6_addr32[0]) +
+ off);
+#else
+ *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+ break;
+ case offsetof(struct __sk_buff, local_ip6[0]) ...
+ offsetof(struct __sk_buff, local_ip6[3]):
+#if IS_ENABLED(CONFIG_IPV6)
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ skc_v6_rcv_saddr.s6_addr32[0]) != 4);
+
+ off = si->off;
+ off -= offsetof(struct __sk_buff, local_ip6[0]);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common,
+ skc_v6_rcv_saddr.s6_addr32[0]) +
+ off);
+#else
+ *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+ break;
+
+ case offsetof(struct __sk_buff, remote_port):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+ bpf_target_off(struct sock_common,
+ skc_dport,
+ 2, target_size));
+#ifndef __BIG_ENDIAN_BITFIELD
+ *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16);
+#endif
+ break;
+
+ case offsetof(struct __sk_buff, local_port):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+ bpf_target_off(struct sock_common,
+ skc_num, 2, target_size));
+ break;
}
return insn - insn_buf;
offsetof(struct sock, sk_bound_dev_if));
break;
+ case offsetof(struct bpf_sock, mark):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_mark) != 4);
+
+ if (type == BPF_WRITE)
+ *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ offsetof(struct sock, sk_mark));
+ else
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ offsetof(struct sock, sk_mark));
+ break;
+
+ case offsetof(struct bpf_sock, priority):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_priority) != 4);
+
+ if (type == BPF_WRITE)
+ *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ offsetof(struct sock, sk_priority));
+ else
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ offsetof(struct sock, sk_priority));
+ break;
+
case offsetof(struct bpf_sock, family):
BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2);
};
const struct bpf_verifier_ops cg_sock_prog_ops = {
- .get_func_proto = bpf_base_func_proto,
+ .get_func_proto = sock_filter_func_proto,
.is_valid_access = sock_filter_is_valid_access,
.convert_ctx_access = sock_filter_convert_ctx_access,
};
.convert_ctx_access = sock_ops_convert_ctx_access,
};
+const struct bpf_verifier_ops sk_skb_prog_ops = {
+ .get_func_proto = sk_skb_func_proto,
+ .is_valid_access = sk_skb_is_valid_access,
+ .convert_ctx_access = bpf_convert_ctx_access,
+ .gen_prologue = sk_skb_prologue,
+};
+
int sk_detach_filter(struct sock *sk)
{
int ret = -ENOENT;
*
*/
-struct sk_buff *__alloc_skb_head(gfp_t gfp_mask, int node)
-{
- struct sk_buff *skb;
-
- /* Get the HEAD */
- skb = kmem_cache_alloc_node(skbuff_head_cache,
- gfp_mask & ~__GFP_DMA, node);
- if (!skb)
- goto out;
-
- /*
- * Only clear those fields we need to clear, not those that we will
- * actually initialise below. Hence, don't put any more fields after
- * the tail pointer in struct sk_buff!
- */
- memset(skb, 0, offsetof(struct sk_buff, tail));
- skb->head = NULL;
- skb->truesize = sizeof(struct sk_buff);
- refcount_set(&skb->users, 1);
-
- skb->mac_header = (typeof(skb->mac_header))~0U;
-out:
- return skb;
-}
-
/**
* __alloc_skb - allocate a network buffer
* @size: size to allocate
for (i = 0; i < shinfo->nr_frags; i++)
__skb_frag_unref(&shinfo->frags[i]);
- /*
- * If skb buf is from userspace, we need to notify the caller
- * the lower device DMA has done;
- */
- if (shinfo->tx_flags & SKBTX_DEV_ZEROCOPY) {
- struct ubuf_info *uarg;
-
- uarg = shinfo->destructor_arg;
- if (uarg->callback)
- uarg->callback(uarg, true);
- }
-
if (shinfo->frag_list)
kfree_skb_list(shinfo->frag_list);
+ skb_zcopy_clear(skb, true);
skb_free_head(skb);
}
*/
void skb_tx_error(struct sk_buff *skb)
{
- if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
- struct ubuf_info *uarg;
-
- uarg = skb_shinfo(skb)->destructor_arg;
- if (uarg->callback)
- uarg->callback(uarg, false);
- skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY;
- }
+ skb_zcopy_clear(skb, true);
}
EXPORT_SYMBOL(skb_tx_error);
return;
trace_consume_skb(skb);
- if (likely(skb->head))
- skb_release_data(skb);
+ skb_release_data(skb);
kfree_skbmem(skb);
}
}
EXPORT_SYMBOL_GPL(skb_morph);
+static int mm_account_pinned_pages(struct mmpin *mmp, size_t size)
+{
+ unsigned long max_pg, num_pg, new_pg, old_pg;
+ struct user_struct *user;
+
+ if (capable(CAP_IPC_LOCK) || !size)
+ return 0;
+
+ num_pg = (size >> PAGE_SHIFT) + 2; /* worst case */
+ max_pg = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ user = mmp->user ? : current_user();
+
+ do {
+ old_pg = atomic_long_read(&user->locked_vm);
+ new_pg = old_pg + num_pg;
+ if (new_pg > max_pg)
+ return -ENOBUFS;
+ } while (atomic_long_cmpxchg(&user->locked_vm, old_pg, new_pg) !=
+ old_pg);
+
+ if (!mmp->user) {
+ mmp->user = get_uid(user);
+ mmp->num_pg = num_pg;
+ } else {
+ mmp->num_pg += num_pg;
+ }
+
+ return 0;
+}
+
+static void mm_unaccount_pinned_pages(struct mmpin *mmp)
+{
+ if (mmp->user) {
+ atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm);
+ free_uid(mmp->user);
+ }
+}
+
+struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
+{
+ struct ubuf_info *uarg;
+ struct sk_buff *skb;
+
+ WARN_ON_ONCE(!in_task());
+
+ if (!sock_flag(sk, SOCK_ZEROCOPY))
+ return NULL;
+
+ skb = sock_omalloc(sk, 0, GFP_KERNEL);
+ if (!skb)
+ return NULL;
+
+ BUILD_BUG_ON(sizeof(*uarg) > sizeof(skb->cb));
+ uarg = (void *)skb->cb;
+ uarg->mmp.user = NULL;
+
+ if (mm_account_pinned_pages(&uarg->mmp, size)) {
+ kfree_skb(skb);
+ return NULL;
+ }
+
+ uarg->callback = sock_zerocopy_callback;
+ uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1;
+ uarg->len = 1;
+ uarg->bytelen = size;
+ uarg->zerocopy = 1;
+ atomic_set(&uarg->refcnt, 0);
+ sock_hold(sk);
+
+ return uarg;
+}
+EXPORT_SYMBOL_GPL(sock_zerocopy_alloc);
+
+static inline struct sk_buff *skb_from_uarg(struct ubuf_info *uarg)
+{
+ return container_of((void *)uarg, struct sk_buff, cb);
+}
+
+struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
+ struct ubuf_info *uarg)
+{
+ if (uarg) {
+ const u32 byte_limit = 1 << 19; /* limit to a few TSO */
+ u32 bytelen, next;
+
+ /* realloc only when socket is locked (TCP, UDP cork),
+ * so uarg->len and sk_zckey access is serialized
+ */
+ if (!sock_owned_by_user(sk)) {
+ WARN_ON_ONCE(1);
+ return NULL;
+ }
+
+ bytelen = uarg->bytelen + size;
+ if (uarg->len == USHRT_MAX - 1 || bytelen > byte_limit) {
+ /* TCP can create new skb to attach new uarg */
+ if (sk->sk_type == SOCK_STREAM)
+ goto new_alloc;
+ return NULL;
+ }
+
+ next = (u32)atomic_read(&sk->sk_zckey);
+ if ((u32)(uarg->id + uarg->len) == next) {
+ if (mm_account_pinned_pages(&uarg->mmp, size))
+ return NULL;
+ uarg->len++;
+ uarg->bytelen = bytelen;
+ atomic_set(&sk->sk_zckey, ++next);
+ return uarg;
+ }
+ }
+
+new_alloc:
+ return sock_zerocopy_alloc(sk, size);
+}
+EXPORT_SYMBOL_GPL(sock_zerocopy_realloc);
+
+static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len)
+{
+ struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
+ u32 old_lo, old_hi;
+ u64 sum_len;
+
+ old_lo = serr->ee.ee_info;
+ old_hi = serr->ee.ee_data;
+ sum_len = old_hi - old_lo + 1ULL + len;
+
+ if (sum_len >= (1ULL << 32))
+ return false;
+
+ if (lo != old_hi + 1)
+ return false;
+
+ serr->ee.ee_data += len;
+ return true;
+}
+
+void sock_zerocopy_callback(struct ubuf_info *uarg, bool success)
+{
+ struct sk_buff *tail, *skb = skb_from_uarg(uarg);
+ struct sock_exterr_skb *serr;
+ struct sock *sk = skb->sk;
+ struct sk_buff_head *q;
+ unsigned long flags;
+ u32 lo, hi;
+ u16 len;
+
+ mm_unaccount_pinned_pages(&uarg->mmp);
+
+ /* if !len, there was only 1 call, and it was aborted
+ * so do not queue a completion notification
+ */
+ if (!uarg->len || sock_flag(sk, SOCK_DEAD))
+ goto release;
+
+ len = uarg->len;
+ lo = uarg->id;
+ hi = uarg->id + len - 1;
+
+ serr = SKB_EXT_ERR(skb);
+ memset(serr, 0, sizeof(*serr));
+ serr->ee.ee_errno = 0;
+ serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY;
+ serr->ee.ee_data = hi;
+ serr->ee.ee_info = lo;
+ if (!success)
+ serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED;
+
+ q = &sk->sk_error_queue;
+ spin_lock_irqsave(&q->lock, flags);
+ tail = skb_peek_tail(q);
+ if (!tail || SKB_EXT_ERR(tail)->ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY ||
+ !skb_zerocopy_notify_extend(tail, lo, len)) {
+ __skb_queue_tail(q, skb);
+ skb = NULL;
+ }
+ spin_unlock_irqrestore(&q->lock, flags);
+
+ sk->sk_error_report(sk);
+
+release:
+ consume_skb(skb);
+ sock_put(sk);
+}
+EXPORT_SYMBOL_GPL(sock_zerocopy_callback);
+
+void sock_zerocopy_put(struct ubuf_info *uarg)
+{
+ if (uarg && atomic_dec_and_test(&uarg->refcnt)) {
+ if (uarg->callback)
+ uarg->callback(uarg, uarg->zerocopy);
+ else
+ consume_skb(skb_from_uarg(uarg));
+ }
+}
+EXPORT_SYMBOL_GPL(sock_zerocopy_put);
+
+void sock_zerocopy_put_abort(struct ubuf_info *uarg)
+{
+ if (uarg) {
+ struct sock *sk = skb_from_uarg(uarg)->sk;
+
+ atomic_dec(&sk->sk_zckey);
+ uarg->len--;
+
+ /* sock_zerocopy_put expects a ref. Most sockets take one per
+ * skb, which is zero on abort. tcp_sendmsg holds one extra, to
+ * avoid an skb send inside the main loop triggering uarg free.
+ */
+ if (sk->sk_type != SOCK_STREAM)
+ atomic_inc(&uarg->refcnt);
+
+ sock_zerocopy_put(uarg);
+ }
+}
+EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort);
+
+extern int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
+ struct iov_iter *from, size_t length);
+
+int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
+ struct msghdr *msg, int len,
+ struct ubuf_info *uarg)
+{
+ struct ubuf_info *orig_uarg = skb_zcopy(skb);
+ struct iov_iter orig_iter = msg->msg_iter;
+ int err, orig_len = skb->len;
+
+ /* An skb can only point to one uarg. This edge case happens when
+ * TCP appends to an skb, but zerocopy_realloc triggered a new alloc.
+ */
+ if (orig_uarg && uarg != orig_uarg)
+ return -EEXIST;
+
+ err = __zerocopy_sg_from_iter(sk, skb, &msg->msg_iter, len);
+ if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
+ /* Streams do not free skb on error. Reset to prev state. */
+ msg->msg_iter = orig_iter;
+ ___pskb_trim(skb, orig_len);
+ return err;
+ }
+
+ skb_zcopy_set(skb, uarg);
+ return skb->len - orig_len;
+}
+EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream);
+
+static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig,
+ gfp_t gfp_mask)
+{
+ if (skb_zcopy(orig)) {
+ if (skb_zcopy(nskb)) {
+ /* !gfp_mask callers are verified to !skb_zcopy(nskb) */
+ if (!gfp_mask) {
+ WARN_ON_ONCE(1);
+ return -ENOMEM;
+ }
+ if (skb_uarg(nskb) == skb_uarg(orig))
+ return 0;
+ if (skb_copy_ubufs(nskb, GFP_ATOMIC))
+ return -EIO;
+ }
+ skb_zcopy_set(nskb, skb_uarg(orig));
+ }
+ return 0;
+}
+
/**
* skb_copy_ubufs - copy userspace skb frags buffers to kernel
* @skb: the skb to modify
*/
int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
{
- int i;
int num_frags = skb_shinfo(skb)->nr_frags;
struct page *page, *head = NULL;
- struct ubuf_info *uarg = skb_shinfo(skb)->destructor_arg;
+ int i, new_frags;
+ u32 d_off;
- for (i = 0; i < num_frags; i++) {
- u8 *vaddr;
- skb_frag_t *f = &skb_shinfo(skb)->frags[i];
+ if (!num_frags)
+ return 0;
+
+ if (skb_shared(skb) || skb_unclone(skb, gfp_mask))
+ return -EINVAL;
+ new_frags = (__skb_pagelen(skb) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ for (i = 0; i < new_frags; i++) {
page = alloc_page(gfp_mask);
if (!page) {
while (head) {
}
return -ENOMEM;
}
- vaddr = kmap_atomic(skb_frag_page(f));
- memcpy(page_address(page),
- vaddr + f->page_offset, skb_frag_size(f));
- kunmap_atomic(vaddr);
set_page_private(page, (unsigned long)head);
head = page;
}
+ page = head;
+ d_off = 0;
+ for (i = 0; i < num_frags; i++) {
+ skb_frag_t *f = &skb_shinfo(skb)->frags[i];
+ u32 p_off, p_len, copied;
+ struct page *p;
+ u8 *vaddr;
+
+ skb_frag_foreach_page(f, f->page_offset, skb_frag_size(f),
+ p, p_off, p_len, copied) {
+ u32 copy, done = 0;
+ vaddr = kmap_atomic(p);
+
+ while (done < p_len) {
+ if (d_off == PAGE_SIZE) {
+ d_off = 0;
+ page = (struct page *)page_private(page);
+ }
+ copy = min_t(u32, PAGE_SIZE - d_off, p_len - done);
+ memcpy(page_address(page) + d_off,
+ vaddr + p_off + done, copy);
+ done += copy;
+ d_off += copy;
+ }
+ kunmap_atomic(vaddr);
+ }
+ }
+
/* skb frags release userspace buffers */
for (i = 0; i < num_frags; i++)
skb_frag_unref(skb, i);
- uarg->callback(uarg, false);
-
/* skb frags point to kernel buffers */
- for (i = num_frags - 1; i >= 0; i--) {
- __skb_fill_page_desc(skb, i, head, 0,
- skb_shinfo(skb)->frags[i].size);
+ for (i = 0; i < new_frags - 1; i++) {
+ __skb_fill_page_desc(skb, i, head, 0, PAGE_SIZE);
head = (struct page *)page_private(head);
}
+ __skb_fill_page_desc(skb, new_frags - 1, head, 0, d_off);
+ skb_shinfo(skb)->nr_frags = new_frags;
- skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY;
+ skb_zcopy_clear(skb, false);
return 0;
}
EXPORT_SYMBOL_GPL(skb_copy_ubufs);
if (skb_shinfo(skb)->nr_frags) {
int i;
- if (skb_orphan_frags(skb, gfp_mask)) {
+ if (skb_orphan_frags(skb, gfp_mask) ||
+ skb_zerocopy_clone(n, skb, gfp_mask)) {
kfree_skb(n);
n = NULL;
goto out;
* be since all we did is relocate the values
*/
if (skb_cloned(skb)) {
- /* copy this zero copy skb frags */
if (skb_orphan_frags(skb, gfp_mask))
goto nofrags;
+ if (skb_zcopy(skb))
+ atomic_inc(&skb_uarg(skb)->refcnt);
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
skb_frag_ref(skb, i);
EXPORT_SYMBOL(skb_copy_expand);
/**
- * skb_pad - zero pad the tail of an skb
+ * __skb_pad - zero pad the tail of an skb
* @skb: buffer to pad
* @pad: space to pad
+ * @free_on_error: free buffer on error
*
* Ensure that a buffer is followed by a padding area that is zero
* filled. Used by network drivers which may DMA or transfer data
* beyond the buffer end onto the wire.
*
- * May return error in out of memory cases. The skb is freed on error.
+ * May return error in out of memory cases. The skb is freed on error
+ * if @free_on_error is true.
*/
- int skb_pad(struct sk_buff *skb, int pad)
+ int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error)
{
int err;
int ntail;
return 0;
free_skb:
- kfree_skb(skb);
+ if (free_on_error)
+ kfree_skb(skb);
return err;
}
- EXPORT_SYMBOL(skb_pad);
+ EXPORT_SYMBOL(__skb_pad);
/**
* pskb_put - add data to the tail of a potentially fragmented buffer
if (eat) {
skb_shinfo(skb)->frags[k].page_offset += eat;
skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat);
+ if (!i)
+ goto end;
eat = 0;
}
k++;
}
skb_shinfo(skb)->nr_frags = k;
+end:
skb->tail += delta;
skb->data_len -= delta;
+ if (!skb->data_len)
+ skb_zcopy_clear(skb, false);
+
return skb_tail_pointer(skb);
}
EXPORT_SYMBOL(__pskb_pull_tail);
end = start + skb_frag_size(f);
if ((copy = end - offset) > 0) {
+ u32 p_off, p_len, copied;
+ struct page *p;
u8 *vaddr;
if (copy > len)
copy = len;
- vaddr = kmap_atomic(skb_frag_page(f));
- memcpy(to,
- vaddr + f->page_offset + offset - start,
- copy);
- kunmap_atomic(vaddr);
+ skb_frag_foreach_page(f,
+ f->page_offset + offset - start,
+ copy, p, p_off, p_len, copied) {
+ vaddr = kmap_atomic(p);
+ memcpy(to + copied, vaddr + p_off, p_len);
+ kunmap_atomic(vaddr);
+ }
if ((len -= copy) == 0)
return 0;
}
EXPORT_SYMBOL_GPL(skb_splice_bits);
+/* Send skb data on a socket. Socket must be locked. */
+int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
+ int len)
+{
+ unsigned int orig_len = len;
+ struct sk_buff *head = skb;
+ unsigned short fragidx;
+ int slen, ret;
+
+do_frag_list:
+
+ /* Deal with head data */
+ while (offset < skb_headlen(skb) && len) {
+ struct kvec kv;
+ struct msghdr msg;
+
+ slen = min_t(int, len, skb_headlen(skb) - offset);
+ kv.iov_base = skb->data + offset;
+ kv.iov_len = slen;
+ memset(&msg, 0, sizeof(msg));
+
+ ret = kernel_sendmsg_locked(sk, &msg, &kv, 1, slen);
+ if (ret <= 0)
+ goto error;
+
+ offset += ret;
+ len -= ret;
+ }
+
+ /* All the data was skb head? */
+ if (!len)
+ goto out;
+
+ /* Make offset relative to start of frags */
+ offset -= skb_headlen(skb);
+
+ /* Find where we are in frag list */
+ for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags; fragidx++) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx];
+
+ if (offset < frag->size)
+ break;
+
+ offset -= frag->size;
+ }
+
+ for (; len && fragidx < skb_shinfo(skb)->nr_frags; fragidx++) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx];
+
+ slen = min_t(size_t, len, frag->size - offset);
+
+ while (slen) {
+ ret = kernel_sendpage_locked(sk, frag->page.p,
+ frag->page_offset + offset,
+ slen, MSG_DONTWAIT);
+ if (ret <= 0)
+ goto error;
+
+ len -= ret;
+ offset += ret;
+ slen -= ret;
+ }
+
+ offset = 0;
+ }
+
+ if (len) {
+ /* Process any frag lists */
+
+ if (skb == head) {
+ if (skb_has_frag_list(skb)) {
+ skb = skb_shinfo(skb)->frag_list;
+ goto do_frag_list;
+ }
+ } else if (skb->next) {
+ skb = skb->next;
+ goto do_frag_list;
+ }
+ }
+
+out:
+ return orig_len - len;
+
+error:
+ return orig_len == len ? ret : orig_len - len;
+}
+EXPORT_SYMBOL_GPL(skb_send_sock_locked);
+
+/* Send skb data on a socket. */
+int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len)
+{
+ int ret = 0;
+
+ lock_sock(sk);
+ ret = skb_send_sock_locked(sk, skb, offset, len);
+ release_sock(sk);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(skb_send_sock);
+
/**
* skb_store_bits - store bits from kernel buffer to skb
* @skb: destination buffer
end = start + skb_frag_size(frag);
if ((copy = end - offset) > 0) {
+ u32 p_off, p_len, copied;
+ struct page *p;
u8 *vaddr;
if (copy > len)
copy = len;
- vaddr = kmap_atomic(skb_frag_page(frag));
- memcpy(vaddr + frag->page_offset + offset - start,
- from, copy);
- kunmap_atomic(vaddr);
+ skb_frag_foreach_page(frag,
+ frag->page_offset + offset - start,
+ copy, p, p_off, p_len, copied) {
+ vaddr = kmap_atomic(p);
+ memcpy(vaddr + p_off, from + copied, p_len);
+ kunmap_atomic(vaddr);
+ }
if ((len -= copy) == 0)
return 0;
end = start + skb_frag_size(frag);
if ((copy = end - offset) > 0) {
+ u32 p_off, p_len, copied;
+ struct page *p;
__wsum csum2;
u8 *vaddr;
if (copy > len)
copy = len;
- vaddr = kmap_atomic(skb_frag_page(frag));
- csum2 = ops->update(vaddr + frag->page_offset +
- offset - start, copy, 0);
- kunmap_atomic(vaddr);
- csum = ops->combine(csum, csum2, pos, copy);
+
+ skb_frag_foreach_page(frag,
+ frag->page_offset + offset - start,
+ copy, p, p_off, p_len, copied) {
+ vaddr = kmap_atomic(p);
+ csum2 = ops->update(vaddr + p_off, p_len, 0);
+ kunmap_atomic(vaddr);
+ csum = ops->combine(csum, csum2, pos, p_len);
+ pos += p_len;
+ }
+
if (!(len -= copy))
return csum;
offset += copy;
- pos += copy;
}
start = end;
}
end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
if ((copy = end - offset) > 0) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ u32 p_off, p_len, copied;
+ struct page *p;
__wsum csum2;
u8 *vaddr;
- skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
if (copy > len)
copy = len;
- vaddr = kmap_atomic(skb_frag_page(frag));
- csum2 = csum_partial_copy_nocheck(vaddr +
- frag->page_offset +
- offset - start, to,
- copy, 0);
- kunmap_atomic(vaddr);
- csum = csum_block_add(csum, csum2, pos);
+
+ skb_frag_foreach_page(frag,
+ frag->page_offset + offset - start,
+ copy, p, p_off, p_len, copied) {
+ vaddr = kmap_atomic(p);
+ csum2 = csum_partial_copy_nocheck(vaddr + p_off,
+ to + copied,
+ p_len, 0);
+ kunmap_atomic(vaddr);
+ csum = csum_block_add(csum, csum2, pos);
+ pos += p_len;
+ }
+
if (!(len -= copy))
return csum;
offset += copy;
to += copy;
- pos += copy;
}
start = end;
}
skb_tx_error(from);
return -ENOMEM;
}
+ skb_zerocopy_clone(to, from, GFP_ATOMIC);
for (i = 0; i < skb_shinfo(from)->nr_frags; i++) {
if (!len)
skb_shinfo(skb1)->tx_flags |= skb_shinfo(skb)->tx_flags &
SKBTX_SHARED_FRAG;
+ skb_zerocopy_clone(skb1, skb, 0);
if (len < pos) /* Split line is inside header. */
skb_split_inside_header(skb, skb1, len, pos);
else /* Second chunk has no header, nothing to copy. */
if (skb_headlen(skb))
return 0;
+ if (skb_zcopy(tgt) || skb_zcopy(skb))
+ return 0;
todo = shiftlen;
from = 0;
skb_shinfo(nskb)->tx_flags |= skb_shinfo(head_skb)->tx_flags &
SKBTX_SHARED_FRAG;
+ if (skb_zerocopy_clone(nskb, head_skb, GFP_ATOMIC))
+ goto err;
while (pos < offset + len) {
if (i >= nfrags) {
if (skb_has_frag_list(to) || skb_has_frag_list(from))
return false;
+ if (skb_zcopy(to) || skb_zcopy(from))
+ return false;
if (skb_headlen(from) != 0) {
struct page *page;
struct dsa_switch *ds = port->ds;
int err;
- err = dsa_cpu_dsa_setup(ds, ds->dev, port, port->index);
+ err = dsa_cpu_dsa_setup(port);
if (err) {
dev_warn(ds->dev, "Failed to setup dsa port %d: %d\n",
port->index, err);
struct dsa_switch *ds = port->ds;
int err;
- err = dsa_cpu_dsa_setup(ds, ds->dev, port, port->index);
+ err = dsa_cpu_dsa_setup(port);
if (err) {
dev_warn(ds->dev, "Failed to setup cpu port %d: %d\n",
port->index, err);
if (!name)
name = "eth%d";
- err = dsa_slave_create(ds, ds->dev, port->index, name);
+ err = dsa_slave_create(port, name);
if (err) {
dev_warn(ds->dev, "Failed to create slave %d: %d\n",
port->index, err);
return err;
}
- if (!dst->cpu_dp->netdev) {
+ if (!dst->cpu_dp) {
pr_warn("Tree has no master device\n");
return -EINVAL;
}
padlen = (skb->len >= ETH_ZLEN) ? 0 : ETH_ZLEN - skb->len;
if (skb_tailroom(skb) >= padlen + KSZ_INGRESS_TAG_LEN) {
- if (skb_put_padto(skb, skb->len + padlen))
+ /* Let dsa_slave_xmit() free skb */
+ if (__skb_put_padto(skb, skb->len + padlen, false))
return NULL;
nskb = skb;
skb_transport_header(skb) - skb->head);
skb_copy_and_csum_dev(skb, skb_put(nskb, skb->len));
- if (skb_put_padto(nskb, nskb->len + padlen)) {
- kfree_skb(nskb);
+ /* Let skb_put_padto() free nskb, and let dsa_slave_xmit() free
+ * skb
+ */
+ if (skb_put_padto(nskb, nskb->len + padlen))
return NULL;
- }
- kfree_skb(skb);
+ consume_skb(skb);
}
tag = skb_put(nskb, KSZ_INGRESS_TAG_LEN);
}
static struct sk_buff *ksz_rcv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type *pt,
- struct net_device *orig_dev)
+ struct packet_type *pt)
{
struct dsa_switch_tree *dst = dev->dsa_ptr;
struct dsa_port *cpu_dp = dsa_get_cpu_port(dst);
skb_set_network_header(nskb, skb_network_header(skb) - skb->head);
skb_set_transport_header(nskb, skb_transport_header(skb) - skb->head);
skb_copy_and_csum_dev(skb, skb_put(nskb, skb->len));
- kfree_skb(skb);
+ consume_skb(skb);
if (padlen) {
skb_put_zero(nskb, padlen);
}
static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type *pt,
- struct net_device *orig_dev)
+ struct packet_type *pt)
{
struct dsa_switch_tree *dst = dev->dsa_ptr;
struct dsa_port *cpu_dp = dsa_get_cpu_port(dst);
esp_output_udp_encap(x, skb, esp);
if (!skb_cloned(skb)) {
- if (tailen <= skb_availroom(skb)) {
+ if (tailen <= skb_tailroom(skb)) {
nfrags = 1;
trailer = skb;
tail = skb_tail_pointer(trailer);
kunmap_atomic(vaddr);
- spin_unlock_bh(&x->lock);
-
nfrags = skb_shinfo(skb)->nr_frags;
__skb_fill_page_desc(skb, nfrags, page, pfrag->offset,
skb_shinfo(skb)->nr_frags = ++nfrags;
pfrag->offset = pfrag->offset + allocsize;
+
+ spin_unlock_bh(&x->lock);
+
nfrags++;
skb->len += tailen;
(unsigned char *)esph - skb->data,
assoclen + ivlen + esp->clen + alen);
if (unlikely(err < 0))
- goto error;
+ goto error_free;
if (!esp->inplace) {
int allocsize;
spin_lock_bh(&x->lock);
if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
spin_unlock_bh(&x->lock);
- goto error;
+ goto error_free;
}
skb_shinfo(skb)->nr_frags = 1;
(unsigned char *)esph - skb->data,
assoclen + ivlen + esp->clen + alen);
if (unlikely(err < 0))
- goto error;
+ goto error_free;
}
if ((x->props.flags & XFRM_STATE_ESN))
if (sg != dsg)
esp_ssg_unref(x, tmp);
- kfree(tmp);
+ error_free:
+ kfree(tmp);
error:
return err;
}
return esp_output_tail(x, skb, &esp);
}
+static inline int esp_remove_trailer(struct sk_buff *skb)
+{
+ struct xfrm_state *x = xfrm_input_state(skb);
+ struct xfrm_offload *xo = xfrm_offload(skb);
+ struct crypto_aead *aead = x->data;
+ int alen, hlen, elen;
+ int padlen, trimlen;
+ __wsum csumdiff;
+ u8 nexthdr[2];
+ int ret;
+
+ alen = crypto_aead_authsize(aead);
+ hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
+ elen = skb->len - hlen;
+
+ if (xo && (xo->flags & XFRM_ESP_NO_TRAILER)) {
+ ret = xo->proto;
+ goto out;
+ }
+
+ if (skb_copy_bits(skb, skb->len - alen - 2, nexthdr, 2))
+ BUG();
+
+ ret = -EINVAL;
+ padlen = nexthdr[0];
+ if (padlen + 2 + alen >= elen) {
+ net_dbg_ratelimited("ipsec esp packet is garbage padlen=%d, elen=%d\n",
+ padlen + 2, elen - alen);
+ goto out;
+ }
+
+ trimlen = alen + padlen + 2;
+ if (skb->ip_summed == CHECKSUM_COMPLETE) {
+ csumdiff = skb_checksum(skb, skb->len - trimlen, trimlen, 0);
+ skb->csum = csum_block_sub(skb->csum, csumdiff,
+ skb->len - trimlen);
+ }
+ pskb_trim(skb, skb->len - trimlen);
+
+ ret = nexthdr[1];
+
+out:
+ return ret;
+}
+
int esp_input_done2(struct sk_buff *skb, int err)
{
const struct iphdr *iph;
struct xfrm_state *x = xfrm_input_state(skb);
struct xfrm_offload *xo = xfrm_offload(skb);
struct crypto_aead *aead = x->data;
- int alen = crypto_aead_authsize(aead);
int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
- int elen = skb->len - hlen;
int ihl;
- u8 nexthdr[2];
- int padlen;
if (!xo || (xo && !(xo->flags & CRYPTO_DONE)))
kfree(ESP_SKB_CB(skb)->tmp);
if (unlikely(err))
goto out;
- if (skb_copy_bits(skb, skb->len-alen-2, nexthdr, 2))
- BUG();
-
- err = -EINVAL;
- padlen = nexthdr[0];
- if (padlen + 2 + alen >= elen)
+ err = esp_remove_trailer(skb);
+ if (unlikely(err < 0))
goto out;
- /* ... check padding bits here. Silly. :-) */
-
iph = ip_hdr(skb);
ihl = iph->ihl * 4;
skb->ip_summed = CHECKSUM_UNNECESSARY;
}
- pskb_trim(skb, skb->len - alen - padlen - 2);
- __skb_pull(skb, hlen);
+ skb_pull_rcsum(skb, hlen);
if (x->props.mode == XFRM_MODE_TUNNEL)
skb_reset_transport_header(skb);
else
skb_set_transport_header(skb, -ihl);
- err = nexthdr[1];
-
/* RFC4303: Drop dummy packets without any error */
if (err == IPPROTO_NONE)
err = -EINVAL;
sg_init_table(sg, nfrags);
err = skb_to_sgvec(skb, sg, 0, skb->len);
- if (unlikely(err < 0))
+ if (unlikely(err < 0)) {
+ kfree(tmp);
goto out;
+ }
skb->ip_summed = CHECKSUM_NONE;
static int esp_input_tail(struct xfrm_state *x, struct sk_buff *skb)
{
struct crypto_aead *aead = x->data;
+ struct xfrm_offload *xo = xfrm_offload(skb);
if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead)))
return -EINVAL;
- skb->ip_summed = CHECKSUM_NONE;
+ if (!(xo->flags & CRYPTO_DONE))
+ skb->ip_summed = CHECKSUM_NONE;
return esp_input_done2(skb, 0);
}
esp.seqno = cpu_to_be64(xo->seq.low + ((u64)xo->seq.hi << 32));
err = esp_output_tail(x, skb, &esp);
- if (err < 0)
+ if (err)
return err;
secpath_reset(skb);
module_exit(esp4_offload_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>");
+MODULE_ALIAS_XFRM_OFFLOAD_TYPE(AF_INET, XFRM_PROTO_ESP);
#include <linux/err.h>
#include <linux/time.h>
#include <linux/slab.h>
+#include <linux/errqueue.h>
#include <net/icmp.h>
#include <net/inet_common.h>
return period;
}
+static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp)
+{
+ u32 rate = READ_ONCE(tp->rate_delivered);
+ u32 intv = READ_ONCE(tp->rate_interval_us);
+ u64 rate64 = 0;
+
+ if (rate && intv) {
+ rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC;
+ do_div(rate64, intv);
+ }
+ return rate64;
+}
+
/* Address-family independent initialization for a tcp_sock.
*
* NOTE: A lot of things set to zero explicitly by call to
tp->out_of_order_queue = RB_ROOT;
tcp_init_xmit_timers(sk);
- tcp_prequeue_init(tp);
INIT_LIST_HEAD(&tp->tsq_node);
icsk->icsk_rto = TCP_TIMEOUT_INIT;
}
EXPORT_SYMBOL_GPL(do_tcp_sendpages);
-int tcp_sendpage(struct sock *sk, struct page *page, int offset,
- size_t size, int flags)
+int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
+ size_t size, int flags)
{
- ssize_t res;
-
if (!(sk->sk_route_caps & NETIF_F_SG) ||
!sk_check_csum_caps(sk))
- return sock_no_sendpage(sk->sk_socket, page, offset, size,
- flags);
-
- lock_sock(sk);
+ return sock_no_sendpage_locked(sk, page, offset, size, flags);
tcp_rate_check_app_limited(sk); /* is sending application-limited? */
- res = do_tcp_sendpages(sk, page, offset, size, flags);
+ return do_tcp_sendpages(sk, page, offset, size, flags);
+}
+EXPORT_SYMBOL_GPL(tcp_sendpage_locked);
+
+int tcp_sendpage(struct sock *sk, struct page *page, int offset,
+ size_t size, int flags)
+{
+ int ret;
+
+ lock_sock(sk);
+ ret = tcp_sendpage_locked(sk, page, offset, size, flags);
release_sock(sk);
- return res;
+
+ return ret;
}
EXPORT_SYMBOL(tcp_sendpage);
return err;
}
-int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
+int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
{
struct tcp_sock *tp = tcp_sk(sk);
+ struct ubuf_info *uarg = NULL;
struct sk_buff *skb;
struct sockcm_cookie sockc;
int flags, err, copied = 0;
bool sg;
long timeo;
- lock_sock(sk);
-
flags = msg->msg_flags;
+
+ if (flags & MSG_ZEROCOPY && size) {
+ if (sk->sk_state != TCP_ESTABLISHED) {
+ err = -EINVAL;
+ goto out_err;
+ }
+
+ skb = tcp_send_head(sk) ? tcp_write_queue_tail(sk) : NULL;
+ uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb));
+ if (!uarg) {
+ err = -ENOBUFS;
+ goto out_err;
+ }
+
+ /* skb may be freed in main loop, keep extra ref on uarg */
+ sock_zerocopy_get(uarg);
+ if (!(sk_check_csum_caps(sk) && sk->sk_route_caps & NETIF_F_SG))
+ uarg->zerocopy = 0;
+ }
+
if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect)) {
err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size);
if (err == -EINPROGRESS && copied_syn > 0)
err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy);
if (err)
goto do_fault;
- } else {
+ } else if (!uarg || !uarg->zerocopy) {
bool merge = true;
int i = skb_shinfo(skb)->nr_frags;
struct page_frag *pfrag = sk_page_frag(sk);
page_ref_inc(pfrag->page);
}
pfrag->offset += copy;
+ } else {
+ err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
+ if (err == -EMSGSIZE || err == -EEXIST)
+ goto new_segment;
+ if (err < 0)
+ goto do_error;
+ copy = err;
}
if (!copied)
tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
}
out_nopush:
- release_sock(sk);
+ sock_zerocopy_put(uarg);
return copied + copied_syn;
do_fault:
if (copied + copied_syn)
goto out;
out_err:
+ sock_zerocopy_put_abort(uarg);
err = sk_stream_error(sk, flags, err);
/* make sure we wake any epoll edge trigger waiter */
if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
sk->sk_write_space(sk);
tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
}
- release_sock(sk);
return err;
}
+EXPORT_SYMBOL_GPL(tcp_sendmsg_locked);
+
+int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
+{
+ int ret;
+
+ lock_sock(sk);
+ ret = tcp_sendmsg_locked(sk, msg, size);
+ release_sock(sk);
+
+ return ret;
+}
EXPORT_SYMBOL(tcp_sendmsg);
/*
tcp_send_ack(sk);
}
-static void tcp_prequeue_process(struct sock *sk)
-{
- struct sk_buff *skb;
- struct tcp_sock *tp = tcp_sk(sk);
-
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUED);
-
- while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
- sk_backlog_rcv(sk, skb);
-
- /* Clear memory counter. */
- tp->ucopy.memory = 0;
-}
-
static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
{
struct sk_buff *skb;
}
EXPORT_SYMBOL(tcp_peek_len);
+static void tcp_update_recv_tstamps(struct sk_buff *skb,
+ struct scm_timestamping *tss)
+{
+ if (skb->tstamp)
+ tss->ts[0] = ktime_to_timespec(skb->tstamp);
+ else
+ tss->ts[0] = (struct timespec) {0};
+
+ if (skb_hwtstamps(skb)->hwtstamp)
+ tss->ts[2] = ktime_to_timespec(skb_hwtstamps(skb)->hwtstamp);
+ else
+ tss->ts[2] = (struct timespec) {0};
+}
+
+/* Similar to __sock_recv_timestamp, but does not require an skb */
+void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
+ struct scm_timestamping *tss)
+{
+ struct timeval tv;
+ bool has_timestamping = false;
+
+ if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) {
+ if (sock_flag(sk, SOCK_RCVTSTAMP)) {
+ if (sock_flag(sk, SOCK_RCVTSTAMPNS)) {
+ put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPNS,
+ sizeof(tss->ts[0]), &tss->ts[0]);
+ } else {
+ tv.tv_sec = tss->ts[0].tv_sec;
+ tv.tv_usec = tss->ts[0].tv_nsec / 1000;
+
+ put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP,
+ sizeof(tv), &tv);
+ }
+ }
+
+ if (sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE)
+ has_timestamping = true;
+ else
+ tss->ts[0] = (struct timespec) {0};
+ }
+
+ if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) {
+ if (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)
+ has_timestamping = true;
+ else
+ tss->ts[2] = (struct timespec) {0};
+ }
+
+ if (has_timestamping) {
+ tss->ts[1] = (struct timespec) {0};
+ put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING,
+ sizeof(*tss), tss);
+ }
+}
+
/*
* This routine copies from a sock struct into the user buffer.
*
int err;
int target; /* Read at least this many bytes */
long timeo;
- struct task_struct *user_recv = NULL;
struct sk_buff *skb, *last;
u32 urg_hole = 0;
+ struct scm_timestamping tss;
+ bool has_tss = false;
if (unlikely(flags & MSG_ERRQUEUE))
return inet_recv_error(sk, msg, len, addr_len);
tcp_cleanup_rbuf(sk, copied);
- if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
- /* Install new reader */
- if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
- user_recv = current;
- tp->ucopy.task = user_recv;
- tp->ucopy.msg = msg;
- }
-
- tp->ucopy.len = len;
-
- WARN_ON(tp->copied_seq != tp->rcv_nxt &&
- !(flags & (MSG_PEEK | MSG_TRUNC)));
-
- /* Ugly... If prequeue is not empty, we have to
- * process it before releasing socket, otherwise
- * order will be broken at second iteration.
- * More elegant solution is required!!!
- *
- * Look: we have the following (pseudo)queues:
- *
- * 1. packets in flight
- * 2. backlog
- * 3. prequeue
- * 4. receive_queue
- *
- * Each queue can be processed only if the next ones
- * are empty. At this point we have empty receive_queue.
- * But prequeue _can_ be not empty after 2nd iteration,
- * when we jumped to start of loop because backlog
- * processing added something to receive_queue.
- * We cannot release_sock(), because backlog contains
- * packets arrived _after_ prequeued ones.
- *
- * Shortly, algorithm is clear --- to process all
- * the queues in order. We could make it more directly,
- * requeueing packets from backlog to prequeue, if
- * is not empty. It is more elegant, but eats cycles,
- * unfortunately.
- */
- if (!skb_queue_empty(&tp->ucopy.prequeue))
- goto do_prequeue;
-
- /* __ Set realtime policy in scheduler __ */
- }
-
if (copied >= target) {
/* Do not sleep, just process backlog. */
release_sock(sk);
sk_wait_data(sk, &timeo, last);
}
- if (user_recv) {
- int chunk;
-
- /* __ Restore normal policy in scheduler __ */
-
- chunk = len - tp->ucopy.len;
- if (chunk != 0) {
- NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
- len -= chunk;
- copied += chunk;
- }
-
- if (tp->rcv_nxt == tp->copied_seq &&
- !skb_queue_empty(&tp->ucopy.prequeue)) {
-do_prequeue:
- tcp_prequeue_process(sk);
-
- chunk = len - tp->ucopy.len;
- if (chunk != 0) {
- NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
- len -= chunk;
- copied += chunk;
- }
- }
- }
if ((flags & MSG_PEEK) &&
(peek_seq - copied - urg_hole != tp->copied_seq)) {
net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
if (used + offset < skb->len)
continue;
+ if (TCP_SKB_CB(skb)->has_rxtstamp) {
+ tcp_update_recv_tstamps(skb, &tss);
+ has_tss = true;
+ }
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
goto found_fin_ok;
if (!(flags & MSG_PEEK))
break;
} while (len > 0);
- if (user_recv) {
- if (!skb_queue_empty(&tp->ucopy.prequeue)) {
- int chunk;
-
- tp->ucopy.len = copied > 0 ? len : 0;
-
- tcp_prequeue_process(sk);
-
- if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
- NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
- len -= chunk;
- copied += chunk;
- }
- }
-
- tp->ucopy.task = NULL;
- tp->ucopy.len = 0;
- }
-
/* According to UNIX98, msg_name/msg_namelen are ignored
* on connected socket. I was just happy when found this 8) --ANK
*/
+ if (has_tss)
+ tcp_recv_timestamp(msg, sk, &tss);
+
/* Clean up data we have read: This will do ACK frames. */
tcp_cleanup_rbuf(sk, copied);
name[val] = 0;
lock_sock(sk);
- err = tcp_set_congestion_control(sk, name, true);
+ err = tcp_set_congestion_control(sk, name, true, true);
release_sock(sk);
return err;
}
{
const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
const struct inet_connection_sock *icsk = inet_csk(sk);
- u32 now, intv;
+ u32 now;
u64 rate64;
bool slow;
u32 rate;
info->tcpi_data_segs_out = tp->data_segs_out;
info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0;
- rate = READ_ONCE(tp->rate_delivered);
- intv = READ_ONCE(tp->rate_interval_us);
- if (rate && intv) {
- rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC;
- do_div(rate64, intv);
+ rate64 = tcp_compute_delivery_rate(tp);
+ if (rate64)
info->tcpi_delivery_rate = rate64;
- }
unlock_sock_fast(sk, slow);
}
EXPORT_SYMBOL_GPL(tcp_get_info);
const struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *stats;
struct tcp_info info;
+ u64 rate64;
+ u32 rate;
- stats = alloc_skb(5 * nla_total_size_64bit(sizeof(u64)), GFP_ATOMIC);
+ stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) +
+ 3 * nla_total_size(sizeof(u32)) +
+ 2 * nla_total_size(sizeof(u8)), GFP_ATOMIC);
if (!stats)
return NULL;
tp->data_segs_out, TCP_NLA_PAD);
nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS,
tp->total_retrans, TCP_NLA_PAD);
+
+ rate = READ_ONCE(sk->sk_pacing_rate);
+ rate64 = rate != ~0U ? rate : ~0ULL;
+ nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD);
+
+ rate64 = tcp_compute_delivery_rate(tp);
+ nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD);
+
+ nla_put_u32(stats, TCP_NLA_SND_CWND, tp->snd_cwnd);
+ nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering);
+ nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp));
+
+ nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits);
+ nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited);
return stats;
}
INET_ECN_dontxmit(sk);
}
- void tcp_reinit_congestion_control(struct sock *sk,
- const struct tcp_congestion_ops *ca)
+ static void tcp_reinit_congestion_control(struct sock *sk,
+ const struct tcp_congestion_ops *ca)
{
struct inet_connection_sock *icsk = inet_csk(sk);
* tcp_reinit_congestion_control (if the current congestion control was
* already initialized.
*/
- int tcp_set_congestion_control(struct sock *sk, const char *name, bool load)
+ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool reinit)
{
struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcp_congestion_ops *ca;
if (!ca) {
err = -ENOENT;
} else if (!load) {
- icsk->icsk_ca_ops = ca;
- if (!try_module_get(ca->owner))
+ const struct tcp_congestion_ops *old_ca = icsk->icsk_ca_ops;
+
+ if (try_module_get(ca->owner)) {
+ if (reinit) {
+ tcp_reinit_congestion_control(sk, ca);
+ } else {
+ icsk->icsk_ca_ops = ca;
+ module_put(old_ca->owner);
+ }
+ } else {
err = -EBUSY;
+ }
} else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) ||
ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))) {
err = -EPERM;
{
const struct tcp_sock *tp = tcp_sk(sk);
- return max(tp->snd_cwnd, tp->snd_ssthresh << 1);
+ return max(tp->snd_cwnd, tp->prior_cwnd);
}
EXPORT_SYMBOL_GPL(tcp_reno_undo_cwnd);
static int compute_score(struct sock *sk, struct net *net,
__be32 saddr, __be16 sport,
- __be32 daddr, unsigned short hnum, int dif,
- bool exact_dif)
+ __be32 daddr, unsigned short hnum,
+ int dif, int sdif, bool exact_dif)
{
int score;
struct inet_sock *inet;
}
if (sk->sk_bound_dev_if || exact_dif) {
- if (sk->sk_bound_dev_if != dif)
+ bool dev_match = (sk->sk_bound_dev_if == dif ||
+ sk->sk_bound_dev_if == sdif);
+
+ if (exact_dif && !dev_match)
return -1;
- score += 4;
+ if (sk->sk_bound_dev_if && dev_match)
+ score += 4;
}
+
if (sk->sk_incoming_cpu == raw_smp_processor_id())
score++;
return score;
/* called with rcu_read_lock() */
static struct sock *udp4_lib_lookup2(struct net *net,
- __be32 saddr, __be16 sport,
- __be32 daddr, unsigned int hnum, int dif, bool exact_dif,
- struct udp_hslot *hslot2,
- struct sk_buff *skb)
+ __be32 saddr, __be16 sport,
+ __be32 daddr, unsigned int hnum,
+ int dif, int sdif, bool exact_dif,
+ struct udp_hslot *hslot2,
+ struct sk_buff *skb)
{
struct sock *sk, *result;
int score, badness, matches = 0, reuseport = 0;
badness = 0;
udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
score = compute_score(sk, net, saddr, sport,
- daddr, hnum, dif, exact_dif);
+ daddr, hnum, dif, sdif, exact_dif);
if (score > badness) {
reuseport = sk->sk_reuseport;
if (reuseport) {
* harder than this. -DaveM
*/
struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
- __be16 sport, __be32 daddr, __be16 dport,
- int dif, struct udp_table *udptable, struct sk_buff *skb)
+ __be16 sport, __be32 daddr, __be16 dport, int dif,
+ int sdif, struct udp_table *udptable, struct sk_buff *skb)
{
struct sock *sk, *result;
unsigned short hnum = ntohs(dport);
goto begin;
result = udp4_lib_lookup2(net, saddr, sport,
- daddr, hnum, dif,
+ daddr, hnum, dif, sdif,
exact_dif, hslot2, skb);
if (!result) {
unsigned int old_slot2 = slot2;
goto begin;
result = udp4_lib_lookup2(net, saddr, sport,
- daddr, hnum, dif,
+ daddr, hnum, dif, sdif,
exact_dif, hslot2, skb);
}
return result;
badness = 0;
sk_for_each_rcu(sk, &hslot->head) {
score = compute_score(sk, net, saddr, sport,
- daddr, hnum, dif, exact_dif);
+ daddr, hnum, dif, sdif, exact_dif);
if (score > badness) {
reuseport = sk->sk_reuseport;
if (reuseport) {
return __udp4_lib_lookup(dev_net(skb->dev), iph->saddr, sport,
iph->daddr, dport, inet_iif(skb),
- udptable, skb);
+ inet_sdif(skb), udptable, skb);
}
struct sock *udp4_lib_lookup_skb(struct sk_buff *skb,
struct sock *sk;
sk = __udp4_lib_lookup(net, saddr, sport, daddr, dport,
- dif, &udp_table, NULL);
+ dif, 0, &udp_table, NULL);
if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
sk = NULL;
return sk;
static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
__be16 loc_port, __be32 loc_addr,
__be16 rmt_port, __be32 rmt_addr,
- int dif, unsigned short hnum)
+ int dif, int sdif, unsigned short hnum)
{
struct inet_sock *inet = inet_sk(sk);
(inet->inet_dport != rmt_port && inet->inet_dport) ||
(inet->inet_rcv_saddr && inet->inet_rcv_saddr != loc_addr) ||
ipv6_only_sock(sk) ||
- (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
+ (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif &&
+ sk->sk_bound_dev_if != sdif))
return false;
- if (!ip_mc_sf_allow(sk, loc_addr, rmt_addr, dif))
+ if (!ip_mc_sf_allow(sk, loc_addr, rmt_addr, dif, sdif))
return false;
return true;
}
struct net *net = dev_net(skb->dev);
sk = __udp4_lib_lookup(net, iph->daddr, uh->dest,
- iph->saddr, uh->source, skb->dev->ifindex, udptable,
- NULL);
+ iph->saddr, uh->source, skb->dev->ifindex, 0,
+ udptable, NULL);
if (!sk) {
__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
return; /* No socket for error */
if (is_udplite) /* UDP-Lite */
csum = udplite_csum(skb);
- else if (sk->sk_no_check_tx && !skb_is_gso(skb)) { /* UDP csum off */
+ else if (sk->sk_no_check_tx) { /* UDP csum off */
skb->ip_summed = CHECKSUM_NONE;
goto send;
scratch->csum_unnecessary = !!skb_csum_unnecessary(skb);
scratch->is_linear = !skb_is_nonlinear(skb);
#endif
- if (likely(!skb->_skb_refdst && !skb_sec_path(skb)))
+ /* all head states execept sp (dst, sk, nf) are always cleared by
+ * udp_rcv() and we need to preserve secpath, if present, to eventually
+ * process IP_CMSG_PASSSEC at recvmsg() time
+ */
+ if (likely(!skb_sec_path(skb)))
scratch->_tsize_state |= UDP_SKB_IS_STATELESS;
}
sk_mark_napi_id_once(sk, skb);
}
- /* At recvmsg() time we may access skb->dst or skb->sp depending on
- * the IP options and the cmsg flags, elsewhere can we clear all
- * pending head states while they are hot in the cache
- */
- if (likely(IPCB(skb)->opt.optlen == 0 && !skb_sec_path(skb)))
- skb_release_head_state(skb);
-
rc = __udp_enqueue_schedule_skb(sk, skb);
if (rc < 0) {
int is_udplite = IS_UDPLITE(sk);
/* For TCP sockets, sk_rx_dst is protected by socket lock
* For UDP, we use xchg() to guard against concurrent changes.
*/
- void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst)
+ bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst)
{
struct dst_entry *old;
if (dst_hold_safe(dst)) {
old = xchg(&sk->sk_rx_dst, dst);
dst_release(old);
+ return old != dst;
}
+ return false;
}
EXPORT_SYMBOL(udp_sk_rx_dst_set);
unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
unsigned int offset = offsetof(typeof(*sk), sk_node);
int dif = skb->dev->ifindex;
+ int sdif = inet_sdif(skb);
struct hlist_node *node;
struct sk_buff *nskb;
sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) {
if (!__udp_is_mcast_sock(net, sk, uh->dest, daddr,
- uh->source, saddr, dif, hnum))
+ uh->source, saddr, dif, sdif, hnum))
continue;
if (!first) {
static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net,
__be16 loc_port, __be32 loc_addr,
__be16 rmt_port, __be32 rmt_addr,
- int dif)
+ int dif, int sdif)
{
struct sock *sk, *result;
unsigned short hnum = ntohs(loc_port);
result = NULL;
sk_for_each_rcu(sk, &hslot->head) {
if (__udp_is_mcast_sock(net, sk, loc_port, loc_addr,
- rmt_port, rmt_addr, dif, hnum)) {
+ rmt_port, rmt_addr, dif, sdif, hnum)) {
if (result)
return NULL;
result = sk;
static struct sock *__udp4_lib_demux_lookup(struct net *net,
__be16 loc_port, __be32 loc_addr,
__be16 rmt_port, __be32 rmt_addr,
- int dif)
+ int dif, int sdif)
{
unsigned short hnum = ntohs(loc_port);
unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum);
udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
if (INET_MATCH(sk, net, acookie, rmt_addr,
- loc_addr, ports, dif))
+ loc_addr, ports, dif, sdif))
return sk;
/* Only check first socket in chain */
break;
struct sock *sk = NULL;
struct dst_entry *dst;
int dif = skb->dev->ifindex;
+ int sdif = inet_sdif(skb);
int ours;
/* validate the packet */
}
sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr,
- uh->source, iph->saddr, dif);
+ uh->source, iph->saddr,
+ dif, sdif);
} else if (skb->pkt_type == PACKET_HOST) {
sk = __udp4_lib_demux_lookup(net, uh->dest, iph->daddr,
- uh->source, iph->saddr, dif);
+ uh->source, iph->saddr, dif, sdif);
}
if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt))
static void init_loopback(struct net_device *dev)
{
struct inet6_dev *idev;
- struct net_device *sp_dev;
- struct inet6_ifaddr *sp_ifa;
- struct rt6_info *sp_rt;
/* ::1 */
}
add_addr(idev, &in6addr_loopback, 128, IFA_HOST);
-
- /* Add routes to other interface's IPv6 addresses */
- for_each_netdev(dev_net(dev), sp_dev) {
- if (!strcmp(sp_dev->name, dev->name))
- continue;
-
- idev = __in6_dev_get(sp_dev);
- if (!idev)
- continue;
-
- read_lock_bh(&idev->lock);
- list_for_each_entry(sp_ifa, &idev->addr_list, if_list) {
-
- if (sp_ifa->flags & (IFA_F_DADFAILED | IFA_F_TENTATIVE))
- continue;
-
- if (sp_ifa->rt) {
- /* This dst has been added to garbage list when
- * lo device down, release this obsolete dst and
- * reallocate a new router for ifa.
- */
- if (!atomic_read(&sp_ifa->rt->rt6i_ref)) {
- ip6_rt_put(sp_ifa->rt);
- sp_ifa->rt = NULL;
- } else {
- continue;
- }
- }
-
- sp_rt = addrconf_dst_alloc(idev, &sp_ifa->addr, false);
-
- /* Failure cases are ignored */
- if (!IS_ERR(sp_rt)) {
- sp_ifa->rt = sp_rt;
- ip6_ins_rt(sp_rt);
- }
- }
- read_unlock_bh(&idev->lock);
- }
}
void addrconf_add_linklocal(struct inet6_dev *idev,
static int fixup_permanent_addr(struct inet6_dev *idev,
struct inet6_ifaddr *ifp)
{
- /* rt6i_ref == 0 means the host route was removed from the
+ /* !rt6i_node means the host route was removed from the
* FIB, for example, if 'lo' device is taken down. In that
* case regenerate the host route.
*/
- if (!ifp->rt || !atomic_read(&ifp->rt->rt6i_ref)) {
+ if (!ifp->rt || !ifp->rt->rt6i_node) {
struct rt6_info *rt, *prev;
rt = addrconf_dst_alloc(idev, &ifp->addr, false);
* our DAD process, so we don't need
* to do it again
*/
- if (!(ifp->rt->rt6i_node))
+ if (!rcu_access_pointer(ifp->rt->rt6i_node))
ip6_ins_rt(ifp->rt);
if (ifp->idev->cnf.forwarding)
addrconf_join_anycast(ifp);
rtnl_af_register(&inet6_ops);
err = __rtnl_register(PF_INET6, RTM_GETLINK, NULL, inet6_dump_ifinfo,
- NULL);
+ 0);
if (err < 0)
goto errout;
/* Only the first call to __rtnl_register can fail */
- __rtnl_register(PF_INET6, RTM_NEWADDR, inet6_rtm_newaddr, NULL, NULL);
- __rtnl_register(PF_INET6, RTM_DELADDR, inet6_rtm_deladdr, NULL, NULL);
+ __rtnl_register(PF_INET6, RTM_NEWADDR, inet6_rtm_newaddr, NULL, 0);
+ __rtnl_register(PF_INET6, RTM_DELADDR, inet6_rtm_deladdr, NULL, 0);
__rtnl_register(PF_INET6, RTM_GETADDR, inet6_rtm_getaddr,
- inet6_dump_ifaddr, NULL);
+ inet6_dump_ifaddr, 0);
__rtnl_register(PF_INET6, RTM_GETMULTICAST, NULL,
- inet6_dump_ifmcaddr, NULL);
+ inet6_dump_ifmcaddr, 0);
__rtnl_register(PF_INET6, RTM_GETANYCAST, NULL,
- inet6_dump_ifacaddr, NULL);
+ inet6_dump_ifacaddr, 0);
__rtnl_register(PF_INET6, RTM_GETNETCONF, inet6_netconf_get_devconf,
- inet6_netconf_dump_devconf, NULL);
+ inet6_netconf_dump_devconf, 0);
ipv6_addr_label_rtnl_register();
int tailen = esp->tailen;
if (!skb_cloned(skb)) {
- if (tailen <= skb_availroom(skb)) {
+ if (tailen <= skb_tailroom(skb)) {
nfrags = 1;
trailer = skb;
tail = skb_tail_pointer(trailer);
kunmap_atomic(vaddr);
- spin_unlock_bh(&x->lock);
-
nfrags = skb_shinfo(skb)->nr_frags;
__skb_fill_page_desc(skb, nfrags, page, pfrag->offset,
skb_shinfo(skb)->nr_frags = ++nfrags;
pfrag->offset = pfrag->offset + allocsize;
+
+ spin_unlock_bh(&x->lock);
+
nfrags++;
skb->len += tailen;
(unsigned char *)esph - skb->data,
assoclen + ivlen + esp->clen + alen);
if (unlikely(err < 0))
- goto error;
+ goto error_free;
if (!esp->inplace) {
int allocsize;
spin_lock_bh(&x->lock);
if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
spin_unlock_bh(&x->lock);
- goto error;
+ goto error_free;
}
skb_shinfo(skb)->nr_frags = 1;
(unsigned char *)esph - skb->data,
assoclen + ivlen + esp->clen + alen);
if (unlikely(err < 0))
- goto error;
+ goto error_free;
}
if ((x->props.flags & XFRM_STATE_ESN))
if (sg != dsg)
esp_ssg_unref(x, tmp);
- kfree(tmp);
+ error_free:
+ kfree(tmp);
error:
return err;
}
return esp6_output_tail(x, skb, &esp);
}
-int esp6_input_done2(struct sk_buff *skb, int err)
+static inline int esp_remove_trailer(struct sk_buff *skb)
{
struct xfrm_state *x = xfrm_input_state(skb);
struct xfrm_offload *xo = xfrm_offload(skb);
struct crypto_aead *aead = x->data;
- int alen = crypto_aead_authsize(aead);
- int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
- int elen = skb->len - hlen;
- int hdr_len = skb_network_header_len(skb);
- int padlen;
+ int alen, hlen, elen;
+ int padlen, trimlen;
+ __wsum csumdiff;
u8 nexthdr[2];
+ int ret;
- if (!xo || (xo && !(xo->flags & CRYPTO_DONE)))
- kfree(ESP_SKB_CB(skb)->tmp);
+ alen = crypto_aead_authsize(aead);
+ hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
+ elen = skb->len - hlen;
- if (unlikely(err))
+ if (xo && (xo->flags & XFRM_ESP_NO_TRAILER)) {
+ ret = xo->proto;
goto out;
+ }
if (skb_copy_bits(skb, skb->len - alen - 2, nexthdr, 2))
BUG();
- err = -EINVAL;
+ ret = -EINVAL;
padlen = nexthdr[0];
if (padlen + 2 + alen >= elen) {
net_dbg_ratelimited("ipsec esp packet is garbage padlen=%d, elen=%d\n",
goto out;
}
- /* ... check padding bits here. Silly. :-) */
+ trimlen = alen + padlen + 2;
+ if (skb->ip_summed == CHECKSUM_COMPLETE) {
+ csumdiff = skb_checksum(skb, skb->len - trimlen, trimlen, 0);
+ skb->csum = csum_block_sub(skb->csum, csumdiff,
+ skb->len - trimlen);
+ }
+ pskb_trim(skb, skb->len - trimlen);
+
+ ret = nexthdr[1];
+
+out:
+ return ret;
+}
- pskb_trim(skb, skb->len - alen - padlen - 2);
- __skb_pull(skb, hlen);
+int esp6_input_done2(struct sk_buff *skb, int err)
+{
+ struct xfrm_state *x = xfrm_input_state(skb);
+ struct xfrm_offload *xo = xfrm_offload(skb);
+ struct crypto_aead *aead = x->data;
+ int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
+ int hdr_len = skb_network_header_len(skb);
+
+ if (!xo || (xo && !(xo->flags & CRYPTO_DONE)))
+ kfree(ESP_SKB_CB(skb)->tmp);
+
+ if (unlikely(err))
+ goto out;
+
+ err = esp_remove_trailer(skb);
+ if (unlikely(err < 0))
+ goto out;
+
+ skb_postpull_rcsum(skb, skb_network_header(skb),
+ skb_network_header_len(skb));
+ skb_pull_rcsum(skb, hlen);
if (x->props.mode == XFRM_MODE_TUNNEL)
skb_reset_transport_header(skb);
else
skb_set_transport_header(skb, -hdr_len);
- err = nexthdr[1];
-
/* RFC4303: Drop dummy packets without any error */
if (err == IPPROTO_NONE)
err = -EINVAL;
static int esp6_input_tail(struct xfrm_state *x, struct sk_buff *skb)
{
struct crypto_aead *aead = x->data;
+ struct xfrm_offload *xo = xfrm_offload(skb);
if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead)))
return -EINVAL;
- skb->ip_summed = CHECKSUM_NONE;
+ if (!(xo->flags & CRYPTO_DONE))
+ skb->ip_summed = CHECKSUM_NONE;
return esp6_input_done2(skb, 0);
}
esp.seqno = cpu_to_be64(xo->seq.low + ((u64)xo->seq.hi << 32));
err = esp6_output_tail(x, skb, &esp);
- if (err < 0)
+ if (err)
return err;
secpath_reset(skb);
module_exit(esp6_offload_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>");
+MODULE_ALIAS_XFRM_OFFLOAD_TYPE(AF_INET6, XFRM_PROTO_ESP);
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/lwtunnel.h>
+#include <net/fib_notifier.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
return fn;
}
- static void node_free(struct fib6_node *fn)
+ static void node_free_immediate(struct fib6_node *fn)
+ {
+ kmem_cache_free(fib6_node_kmem, fn);
+ }
+
+ static void node_free_rcu(struct rcu_head *head)
{
+ struct fib6_node *fn = container_of(head, struct fib6_node, rcu);
+
kmem_cache_free(fib6_node_kmem, fn);
}
-static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt)
+ static void node_free(struct fib6_node *fn)
+ {
+ call_rcu(&fn->rcu, node_free_rcu);
+ }
+
+void rt6_free_pcpu(struct rt6_info *non_pcpu_rt)
{
int cpu;
free_percpu(non_pcpu_rt->rt6i_pcpu);
non_pcpu_rt->rt6i_pcpu = NULL;
}
-
-static void rt6_release(struct rt6_info *rt)
-{
- if (atomic_dec_and_test(&rt->rt6i_ref)) {
- rt6_free_pcpu(rt);
- dst_dev_put(&rt->dst);
- dst_release(&rt->dst);
- }
-}
+EXPORT_SYMBOL_GPL(rt6_free_pcpu);
static void fib6_link_table(struct net *net, struct fib6_table *tb)
{
#endif
+unsigned int fib6_tables_seq_read(struct net *net)
+{
+ unsigned int h, fib_seq = 0;
+
+ rcu_read_lock();
+ for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
+ struct hlist_head *head = &net->ipv6.fib_table_hash[h];
+ struct fib6_table *tb;
+
+ hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
+ read_lock_bh(&tb->tb6_lock);
+ fib_seq += tb->fib_seq;
+ read_unlock_bh(&tb->tb6_lock);
+ }
+ }
+ rcu_read_unlock();
+
+ return fib_seq;
+}
+
+static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net,
+ enum fib_event_type event_type,
+ struct rt6_info *rt)
+{
+ struct fib6_entry_notifier_info info = {
+ .rt = rt,
+ };
+
+ return call_fib6_notifier(nb, net, event_type, &info.info);
+}
+
+static int call_fib6_entry_notifiers(struct net *net,
+ enum fib_event_type event_type,
+ struct rt6_info *rt)
+{
+ struct fib6_entry_notifier_info info = {
+ .rt = rt,
+ };
+
+ rt->rt6i_table->fib_seq++;
+ return call_fib6_notifiers(net, event_type, &info.info);
+}
+
+struct fib6_dump_arg {
+ struct net *net;
+ struct notifier_block *nb;
+};
+
+static void fib6_rt_dump(struct rt6_info *rt, struct fib6_dump_arg *arg)
+{
+ if (rt == arg->net->ipv6.ip6_null_entry)
+ return;
+ call_fib6_entry_notifier(arg->nb, arg->net, FIB_EVENT_ENTRY_ADD, rt);
+}
+
+static int fib6_node_dump(struct fib6_walker *w)
+{
+ struct rt6_info *rt;
+
+ for (rt = w->leaf; rt; rt = rt->dst.rt6_next)
+ fib6_rt_dump(rt, w->args);
+ w->leaf = NULL;
+ return 0;
+}
+
+static void fib6_table_dump(struct net *net, struct fib6_table *tb,
+ struct fib6_walker *w)
+{
+ w->root = &tb->tb6_root;
+ read_lock_bh(&tb->tb6_lock);
+ fib6_walk(net, w);
+ read_unlock_bh(&tb->tb6_lock);
+}
+
+/* Called with rcu_read_lock() */
+int fib6_tables_dump(struct net *net, struct notifier_block *nb)
+{
+ struct fib6_dump_arg arg;
+ struct fib6_walker *w;
+ unsigned int h;
+
+ w = kzalloc(sizeof(*w), GFP_ATOMIC);
+ if (!w)
+ return -ENOMEM;
+
+ w->func = fib6_node_dump;
+ arg.net = net;
+ arg.nb = nb;
+ w->args = &arg;
+
+ for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
+ struct hlist_head *head = &net->ipv6.fib_table_hash[h];
+ struct fib6_table *tb;
+
+ hlist_for_each_entry_rcu(tb, head, tb6_hlist)
+ fib6_table_dump(net, tb, w);
+ }
+
+ kfree(w);
+
+ return 0;
+}
+
static int fib6_dump_node(struct fib6_walker *w)
{
int res;
if (!in || !ln) {
if (in)
- node_free(in);
+ node_free_immediate(in);
if (ln)
- node_free(ln);
+ node_free_immediate(ln);
return ERR_PTR(-ENOMEM);
}
}
fn = fn->parent;
}
- /* No more references are possible at this point. */
- BUG_ON(atomic_read(&rt->rt6i_ref) != 1);
}
}
rt->dst.rt6_next = iter;
*ins = rt;
- rt->rt6i_node = fn;
+ rcu_assign_pointer(rt->rt6i_node, fn);
atomic_inc(&rt->rt6i_ref);
+ call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_ADD,
+ rt);
if (!info->skip_notify)
inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
info->nl_net->ipv6.rt6_stats->fib_rt_entries++;
return err;
*ins = rt;
- rt->rt6i_node = fn;
+ rcu_assign_pointer(rt->rt6i_node, fn);
rt->dst.rt6_next = iter->dst.rt6_next;
atomic_inc(&rt->rt6i_ref);
+ call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE,
+ rt);
if (!info->skip_notify)
inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
if (!(fn->fn_flags & RTN_RTINFO)) {
fn->fn_flags |= RTN_RTINFO;
}
nsiblings = iter->rt6i_nsiblings;
+ iter->rt6i_node = NULL;
fib6_purge_rt(iter, fn, info->nl_net);
if (fn->rr_ptr == iter)
fn->rr_ptr = NULL;
break;
if (rt6_qualify_for_ecmp(iter)) {
*ins = iter->dst.rt6_next;
+ iter->rt6i_node = NULL;
fib6_purge_rt(iter, fn, info->nl_net);
if (fn->rr_ptr == iter)
fn->rr_ptr = NULL;
root, and then (in failure) stale node
in main tree.
*/
- node_free(sfn);
+ node_free_immediate(sfn);
err = PTR_ERR(sn);
goto failure;
}
fib6_purge_rt(rt, fn, net);
+ call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt);
if (!info->skip_notify)
inet6_rt_notify(RTM_DELROUTE, rt, info, 0);
rt6_release(rt);
int fib6_del(struct rt6_info *rt, struct nl_info *info)
{
+ struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node,
+ lockdep_is_held(&rt->rt6i_table->tb6_lock));
struct net *net = info->nl_net;
- struct fib6_node *fn = rt->rt6i_node;
struct rt6_info **rtp;
#if RT6_DEBUG >= 2
if (res) {
#if RT6_DEBUG >= 2
pr_debug("%s: del failed: rt=%p@%p err=%d\n",
- __func__, rt, rt->rt6i_node, res);
+ __func__, rt,
+ rcu_access_pointer(rt->rt6i_node),
+ res);
#endif
continue;
}
}
gc_args->more++;
} else if (rt->rt6i_flags & RTF_CACHE) {
+ if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout))
+ rt->dst.obsolete = DST_OBSOLETE_KILL;
if (atomic_read(&rt->dst.__refcnt) == 1 &&
- time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
+ rt->dst.obsolete == DST_OBSOLETE_KILL) {
RT6_TRACE("aging clone %p\n", rt);
return -1;
} else if (rt->rt6i_flags & RTF_GATEWAY) {
static int __net_init fib6_net_init(struct net *net)
{
size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ;
+ int err;
+
+ err = fib6_notifier_init(net);
+ if (err)
+ return err;
spin_lock_init(&net->ipv6.fib6_gc_lock);
rwlock_init(&net->ipv6.fib6_walker_lock);
out_rt6_stats:
kfree(net->ipv6.rt6_stats);
out_timer:
+ fib6_notifier_exit(net);
return -ENOMEM;
}
kfree(net->ipv6.fib6_main_tbl);
kfree(net->ipv6.fib_table_hash);
kfree(net->ipv6.rt6_stats);
+ fib6_notifier_exit(net);
}
static struct pernet_operations fib6_net_ops = {
goto out_kmem_cache_create;
ret = __rtnl_register(PF_INET6, RTM_GETROUTE, NULL, inet6_dump_fib,
- NULL);
+ 0);
if (ret)
goto out_unregister_subsys;
if (time_after(jiffies, rt->dst.expires))
return true;
} else if (rt->dst.from) {
- return rt6_check_expired((struct rt6_info *) rt->dst.from);
+ return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
+ rt6_check_expired((struct rt6_info *)rt->dst.from);
}
return false;
}
-/* Multipath route selection:
- * Hash based function using packet header and flowlabel.
- * Adapted from fib_info_hashfn()
- */
-static int rt6_info_hash_nhsfn(unsigned int candidate_count,
- const struct flowi6 *fl6)
-{
- return get_hash_from_flowi6(fl6) % candidate_count;
-}
-
static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
struct flowi6 *fl6, int oif,
int strict)
struct rt6_info *sibling, *next_sibling;
int route_choosen;
- route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
+ /* We might have already computed the hash for ICMPv6 errors. In such
+ * case it will always be non-zero. Otherwise now is the time to do it.
+ */
+ if (!fl6->mp_hash)
+ fl6->mp_hash = rt6_multipath_hash(fl6, NULL);
+
+ route_choosen = fl6->mp_hash % (match->rt6i_nsiblings + 1);
/* Don't change the route, if route_choosen == 0
* (siblings does not include ourself)
*/
return __ip6_ins_rt(rt, &info, &mxc, NULL);
}
+/* called with rcu_lock held */
+static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
+{
+ struct net_device *dev = rt->dst.dev;
+
+ if (rt->rt6i_flags & RTF_LOCAL) {
+ /* for copies of local routes, dst->dev needs to be the
+ * device if it is a master device, the master device if
+ * device is enslaved, and the loopback as the default
+ */
+ if (netif_is_l3_slave(dev) &&
+ !rt6_need_strict(&rt->rt6i_dst.addr))
+ dev = l3mdev_master_dev_rcu(dev);
+ else if (!netif_is_l3_master(dev))
+ dev = dev_net(dev)->loopback_dev;
+ /* last case is netif_is_l3_master(dev) is true in which
+ * case we want dev returned to be dev
+ */
+ }
+
+ return dev;
+}
+
static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
const struct in6_addr *daddr,
const struct in6_addr *saddr)
{
+ struct net_device *dev;
struct rt6_info *rt;
/*
if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
ort = (struct rt6_info *)ort->dst.from;
- rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, 0);
-
+ rcu_read_lock();
+ dev = ip6_rt_get_dev_rcu(ort);
+ rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
+ rcu_read_unlock();
if (!rt)
return NULL;
static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
{
+ struct net_device *dev;
struct rt6_info *pcpu_rt;
- pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
- rt->dst.dev, rt->dst.flags);
-
+ rcu_read_lock();
+ dev = ip6_rt_get_dev_rcu(rt);
+ pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
+ rcu_read_unlock();
if (!pcpu_rt)
return NULL;
ip6_rt_copy_init(pcpu_rt, rt);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
+static void ip6_multipath_l3_keys(const struct sk_buff *skb,
+ struct flow_keys *keys)
+{
+ const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
+ const struct ipv6hdr *key_iph = outer_iph;
+ const struct ipv6hdr *inner_iph;
+ const struct icmp6hdr *icmph;
+ struct ipv6hdr _inner_iph;
+
+ if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
+ goto out;
+
+ icmph = icmp6_hdr(skb);
+ if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
+ icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
+ icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
+ icmph->icmp6_type != ICMPV6_PARAMPROB)
+ goto out;
+
+ inner_iph = skb_header_pointer(skb,
+ skb_transport_offset(skb) + sizeof(*icmph),
+ sizeof(_inner_iph), &_inner_iph);
+ if (!inner_iph)
+ goto out;
+
+ key_iph = inner_iph;
+out:
+ memset(keys, 0, sizeof(*keys));
+ keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ keys->addrs.v6addrs.src = key_iph->saddr;
+ keys->addrs.v6addrs.dst = key_iph->daddr;
+ keys->tags.flow_label = ip6_flowinfo(key_iph);
+ keys->basic.ip_proto = key_iph->nexthdr;
+}
+
+/* if skb is set it will be used and fl6 can be NULL */
+u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
+{
+ struct flow_keys hash_keys;
+
+ if (skb) {
+ ip6_multipath_l3_keys(skb, &hash_keys);
+ return flow_hash_from_keys(&hash_keys);
+ }
+
+ return get_hash_from_flowi6(fl6);
+}
+
void ip6_route_input(struct sk_buff *skb)
{
const struct ipv6hdr *iph = ipv6_hdr(skb);
tun_info = skb_tunnel_info(skb);
if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
+ if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
+ fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
skb_dst_drop(skb);
skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
}
static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
{
- if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
+ u32 rt_cookie = 0;
+
+ if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
return NULL;
if (rt6_check_expired(rt))
if (rt->rt6i_flags & RTF_CACHE) {
if (dst_hold_safe(&rt->dst))
ip6_del_rt(rt);
- } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
- rt->rt6i_node->fn_sernum = -1;
+ } else {
+ struct fib6_node *fn;
+
+ rcu_read_lock();
+ fn = rcu_dereference(rt->rt6i_node);
+ if (fn && (rt->rt6i_flags & RTF_DEFAULT))
+ fn->fn_sernum = -1;
+ rcu_read_unlock();
}
}
}
static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
{
return !(rt->rt6i_flags & RTF_CACHE) &&
- (rt->rt6i_flags & RTF_PCPU || rt->rt6i_node);
+ (rt->rt6i_flags & RTF_PCPU ||
+ rcu_access_pointer(rt->rt6i_node));
}
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
{
u32 tb_id;
struct net *net = dev_net(idev->dev);
- struct net_device *dev = net->loopback_dev;
+ struct net_device *dev = idev->dev;
struct rt6_info *rt;
- /* use L3 Master device as loopback for host routes if device
- * is enslaved and address is not link local or multicast
- */
- if (!rt6_need_strict(addr))
- dev = l3mdev_master_dev_rcu(idev->dev) ? : dev;
-
rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
if (!rt)
return ERR_PTR(-ENOMEM);
goto nla_put_failure;
}
+ if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
+ *flags |= RTNH_F_OFFLOAD;
+
/* not needed for multipath encoding b/c it has a rtnexthop struct */
if (!skip_oif && rt->dst.dev &&
nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
struct net_device *dev;
int flags = 0;
- dev = __dev_get_by_index(net, iif);
+ rcu_read_lock();
+
+ dev = dev_get_by_index_rcu(net, iif);
if (!dev) {
+ rcu_read_unlock();
err = -ENODEV;
goto errout;
}
if (!fibmatch)
dst = ip6_route_input_lookup(net, dev, &fl6, flags);
+ else
+ dst = ip6_route_lookup(net, &fl6, 0);
+
+ rcu_read_unlock();
} else {
fl6.flowi6_oif = oif;
if (!fibmatch)
dst = ip6_route_output(net, NULL, &fl6);
+ else
+ dst = ip6_route_lookup(net, &fl6, 0);
}
- if (fibmatch)
- dst = ip6_route_lookup(net, &fl6, 0);
rt = container_of(dst, struct rt6_info, dst);
if (rt->dst.error) {
ip6_template_metrics, true);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
+ net->ipv6.fib6_has_custom_rules = false;
net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
sizeof(*net->ipv6.ip6_prohibit_entry),
GFP_KERNEL);
goto fib6_rules_init;
ret = -ENOBUFS;
- if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
- __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
- __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
+ if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, 0) ||
+ __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, 0) ||
+ __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL,
+ RTNL_FLAG_DOIT_UNLOCKED))
goto out_register_late_subsys;
ret = register_netdevice_notifier(&ip6_route_dev_notifier);
static int compute_score(struct sock *sk, struct net *net,
const struct in6_addr *saddr, __be16 sport,
const struct in6_addr *daddr, unsigned short hnum,
- int dif, bool exact_dif)
+ int dif, int sdif, bool exact_dif)
{
int score;
struct inet_sock *inet;
}
if (sk->sk_bound_dev_if || exact_dif) {
- if (sk->sk_bound_dev_if != dif)
+ bool dev_match = (sk->sk_bound_dev_if == dif ||
+ sk->sk_bound_dev_if == sdif);
+
+ if (exact_dif && !dev_match)
return -1;
- score++;
+ if (sk->sk_bound_dev_if && dev_match)
+ score++;
}
if (sk->sk_incoming_cpu == raw_smp_processor_id())
/* called with rcu_read_lock() */
static struct sock *udp6_lib_lookup2(struct net *net,
const struct in6_addr *saddr, __be16 sport,
- const struct in6_addr *daddr, unsigned int hnum, int dif,
- bool exact_dif, struct udp_hslot *hslot2,
- struct sk_buff *skb)
+ const struct in6_addr *daddr, unsigned int hnum,
+ int dif, int sdif, bool exact_dif,
+ struct udp_hslot *hslot2, struct sk_buff *skb)
{
struct sock *sk, *result;
int score, badness, matches = 0, reuseport = 0;
badness = -1;
udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
score = compute_score(sk, net, saddr, sport,
- daddr, hnum, dif, exact_dif);
+ daddr, hnum, dif, sdif, exact_dif);
if (score > badness) {
reuseport = sk->sk_reuseport;
if (reuseport) {
/* rcu_read_lock() must be held */
struct sock *__udp6_lib_lookup(struct net *net,
- const struct in6_addr *saddr, __be16 sport,
- const struct in6_addr *daddr, __be16 dport,
- int dif, struct udp_table *udptable,
- struct sk_buff *skb)
+ const struct in6_addr *saddr, __be16 sport,
+ const struct in6_addr *daddr, __be16 dport,
+ int dif, int sdif, struct udp_table *udptable,
+ struct sk_buff *skb)
{
struct sock *sk, *result;
unsigned short hnum = ntohs(dport);
goto begin;
result = udp6_lib_lookup2(net, saddr, sport,
- daddr, hnum, dif, exact_dif,
+ daddr, hnum, dif, sdif, exact_dif,
hslot2, skb);
if (!result) {
unsigned int old_slot2 = slot2;
goto begin;
result = udp6_lib_lookup2(net, saddr, sport,
- daddr, hnum, dif,
+ daddr, hnum, dif, sdif,
exact_dif, hslot2,
skb);
}
badness = -1;
sk_for_each_rcu(sk, &hslot->head) {
score = compute_score(sk, net, saddr, sport, daddr, hnum, dif,
- exact_dif);
+ sdif, exact_dif);
if (score > badness) {
reuseport = sk->sk_reuseport;
if (reuseport) {
return __udp6_lib_lookup(dev_net(skb->dev), &iph->saddr, sport,
&iph->daddr, dport, inet6_iif(skb),
- udptable, skb);
+ inet6_sdif(skb), udptable, skb);
}
struct sock *udp6_lib_lookup_skb(struct sk_buff *skb,
return __udp6_lib_lookup(dev_net(skb->dev), &iph->saddr, sport,
&iph->daddr, dport, inet6_iif(skb),
- &udp_table, skb);
+ inet6_sdif(skb), &udp_table, skb);
}
EXPORT_SYMBOL_GPL(udp6_lib_lookup_skb);
struct sock *sk;
sk = __udp6_lib_lookup(net, saddr, sport, daddr, dport,
- dif, &udp_table, NULL);
+ dif, 0, &udp_table, NULL);
if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
sk = NULL;
return sk;
struct net *net = dev_net(skb->dev);
sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source,
- inet6_iif(skb), udptable, skb);
+ inet6_iif(skb), 0, udptable, skb);
if (!sk) {
__ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
ICMP6_MIB_INERRORS);
return 0;
}
+ static void udp6_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst)
+ {
+ if (udp_sk_rx_dst_set(sk, dst)) {
+ const struct rt6_info *rt = (const struct rt6_info *)dst;
+
+ inet6_sk(sk)->rx_dst_cookie = rt6_get_cookie(rt);
+ }
+ }
+
int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
int proto)
{
int ret;
if (unlikely(sk->sk_rx_dst != dst))
- udp_sk_rx_dst_set(sk, dst);
+ udp6_sk_rx_dst_set(sk, dst);
ret = udpv6_queue_rcv_skb(sk, skb);
sock_put(sk);
static struct sock *__udp6_lib_demux_lookup(struct net *net,
__be16 loc_port, const struct in6_addr *loc_addr,
__be16 rmt_port, const struct in6_addr *rmt_addr,
- int dif)
+ int dif, int sdif)
{
unsigned short hnum = ntohs(loc_port);
unsigned int hash2 = udp6_portaddr_hash(net, loc_addr, hnum);
udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
if (sk->sk_state == TCP_ESTABLISHED &&
- INET6_MATCH(sk, net, rmt_addr, loc_addr, ports, dif))
+ INET6_MATCH(sk, net, rmt_addr, loc_addr, ports, dif, sdif))
return sk;
/* Only check first socket in chain */
break;
struct sock *sk;
struct dst_entry *dst;
int dif = skb->dev->ifindex;
+ int sdif = inet6_sdif(skb);
if (!pskb_may_pull(skb, skb_transport_offset(skb) +
sizeof(struct udphdr)))
sk = __udp6_lib_demux_lookup(net, uh->dest,
&ipv6_hdr(skb)->daddr,
uh->source, &ipv6_hdr(skb)->saddr,
- dif);
+ dif, sdif);
else
return;
}
#endif
+/* thinking of making this const? Don't.
+ * early_demux can change based on sysctl.
+ */
static struct inet6_protocol udpv6_protocol = {
.early_demux = udp_v6_early_demux,
.early_demux_handler = udp_v6_early_demux,
struct kcm_psock *psock)
{
STRP_STATS_ADD(mux->stats.rx_bytes,
- psock->strp.stats.rx_bytes -
+ psock->strp.stats.bytes -
psock->saved_rx_bytes);
mux->stats.rx_msgs +=
- psock->strp.stats.rx_msgs - psock->saved_rx_msgs;
- psock->saved_rx_msgs = psock->strp.stats.rx_msgs;
- psock->saved_rx_bytes = psock->strp.stats.rx_bytes;
+ psock->strp.stats.msgs - psock->saved_rx_msgs;
+ psock->saved_rx_msgs = psock->strp.stats.msgs;
+ psock->saved_rx_bytes = psock->strp.stats.bytes;
}
static void kcm_update_tx_mux_stats(struct kcm_mux *mux,
struct kcm_sock *kcm = kcm_sk(sk);
int err = 0;
long timeo;
- struct strp_rx_msg *rxm;
+ struct strp_msg *stm;
int copied = 0;
struct sk_buff *skb;
/* Okay, have a message on the receive queue */
- rxm = strp_rx_msg(skb);
+ stm = strp_msg(skb);
- if (len > rxm->full_len)
- len = rxm->full_len;
+ if (len > stm->full_len)
+ len = stm->full_len;
- err = skb_copy_datagram_msg(skb, rxm->offset, msg, len);
+ err = skb_copy_datagram_msg(skb, stm->offset, msg, len);
if (err < 0)
goto out;
copied = len;
if (likely(!(flags & MSG_PEEK))) {
KCM_STATS_ADD(kcm->stats.rx_bytes, copied);
- if (copied < rxm->full_len) {
+ if (copied < stm->full_len) {
if (sock->type == SOCK_DGRAM) {
/* Truncated message */
msg->msg_flags |= MSG_TRUNC;
goto msg_finished;
}
- rxm->offset += copied;
- rxm->full_len -= copied;
+ stm->offset += copied;
+ stm->full_len -= copied;
} else {
msg_finished:
/* Finished with message */
struct sock *sk = sock->sk;
struct kcm_sock *kcm = kcm_sk(sk);
long timeo;
- struct strp_rx_msg *rxm;
+ struct strp_msg *stm;
int err = 0;
ssize_t copied;
struct sk_buff *skb;
/* Okay, have a message on the receive queue */
- rxm = strp_rx_msg(skb);
+ stm = strp_msg(skb);
- if (len > rxm->full_len)
- len = rxm->full_len;
+ if (len > stm->full_len)
+ len = stm->full_len;
- copied = skb_splice_bits(skb, sk, rxm->offset, pipe, len, flags);
+ copied = skb_splice_bits(skb, sk, stm->offset, pipe, len, flags);
if (copied < 0) {
err = copied;
goto err_out;
KCM_STATS_ADD(kcm->stats.rx_bytes, copied);
- rxm->offset += copied;
- rxm->full_len -= copied;
+ stm->offset += copied;
+ stm->full_len -= copied;
/* We have no way to return MSG_EOR. If all the bytes have been
* read we still leave the message in the receive socket buffer.
struct kcm_psock *psock = NULL, *tpsock;
struct list_head *head;
int index = 0;
- struct strp_callbacks cb;
+ static const struct strp_callbacks cb = {
+ .rcv_msg = kcm_rcv_strparser,
+ .parse_msg = kcm_parse_func_strparser,
+ .read_sock_done = kcm_read_sock_done,
+ };
int err;
csk = csock->sk;
if (!csk)
return -EINVAL;
+ /* We must prevent loops or risk deadlock ! */
+ if (csk->sk_family == PF_KCM)
+ return -EOPNOTSUPP;
+
psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL);
if (!psock)
return -ENOMEM;
psock->sk = csk;
psock->bpf_prog = prog;
- cb.rcv_msg = kcm_rcv_strparser;
- cb.abort_parser = NULL;
- cb.parse_msg = kcm_parse_func_strparser;
- cb.read_sock_done = kcm_read_sock_done;
-
err = strp_init(&psock->strp, csk, &cb);
if (err) {
kmem_cache_free(kcm_psockp, psock);
#define BLK_PLUS_PRIV(sz_of_priv) \
(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
-#define PGV_FROM_VMALLOC 1
-
#define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status)
#define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts)
#define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt)
struct timespec ts;
__u32 ts_status;
bool is_drop_n_account = false;
+ bool do_vnet = false;
/* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
* We may add members to them until current aligned size without forcing
netoff = TPACKET_ALIGN(po->tp_hdrlen +
(maclen < 16 ? 16 : maclen)) +
po->tp_reserve;
- if (po->has_vnet_hdr)
+ if (po->has_vnet_hdr) {
netoff += sizeof(struct virtio_net_hdr);
+ do_vnet = true;
+ }
macoff = netoff - maclen;
}
if (po->tp_version <= TPACKET_V2) {
skb_set_owner_r(copy_skb, sk);
}
snaplen = po->rx_ring.frame_size - macoff;
- if ((int)snaplen < 0)
+ if ((int)snaplen < 0) {
snaplen = 0;
+ do_vnet = false;
+ }
}
} else if (unlikely(macoff + snaplen >
GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
if (unlikely((int)snaplen < 0)) {
snaplen = 0;
macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
+ do_vnet = false;
}
}
spin_lock(&sk->sk_receive_queue.lock);
}
spin_unlock(&sk->sk_receive_queue.lock);
- if (po->has_vnet_hdr) {
+ if (do_vnet) {
if (virtio_net_hdr_from_skb(skb, h.raw + macoff -
sizeof(struct virtio_net_hdr),
vio_le(), true)) {
}
EXPORT_SYMBOL(unregister_tcf_proto_ops);
-static int tfilter_notify(struct net *net, struct sk_buff *oskb,
- struct nlmsghdr *n, struct tcf_proto *tp,
- unsigned long fh, int event, bool unicast);
-
-static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb,
- struct nlmsghdr *n,
- struct tcf_chain *chain, int event)
-{
- struct tcf_proto *tp;
-
- for (tp = rtnl_dereference(chain->filter_chain);
- tp; tp = rtnl_dereference(tp->next))
- tfilter_notify(net, oskb, n, tp, 0, event, false);
-}
-
/* Select new prio value from the range, managed by kernel. */
static inline u32 tcf_auto_prio(struct tcf_proto *tp)
static void tcf_chain_destroy(struct tcf_chain *chain)
{
- list_del(&chain->list);
- tcf_chain_flush(chain);
- kfree(chain);
+ /* May be already removed from the list by the previous call. */
+ if (!list_empty(&chain->list))
+ list_del_init(&chain->list);
+
+ /* There might still be a reference held when we got here from
+ * tcf_block_put. Wait for the user to drop reference before free.
+ */
+ if (!chain->refcnt)
+ kfree(chain);
}
struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index,
if (!block)
return;
- list_for_each_entry_safe(chain, tmp, &block->chain_list, list)
+ list_for_each_entry_safe(chain, tmp, &block->chain_list, list) {
+ tcf_chain_flush(chain);
tcf_chain_destroy(chain);
+ }
kfree(block);
}
EXPORT_SYMBOL(tcf_block_put);
return tp;
}
+static int tcf_fill_node(struct net *net, struct sk_buff *skb,
+ struct tcf_proto *tp, void *fh, u32 portid,
+ u32 seq, u16 flags, int event)
+{
+ struct tcmsg *tcm;
+ struct nlmsghdr *nlh;
+ unsigned char *b = skb_tail_pointer(skb);
+
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
+ if (!nlh)
+ goto out_nlmsg_trim;
+ tcm = nlmsg_data(nlh);
+ tcm->tcm_family = AF_UNSPEC;
+ tcm->tcm__pad1 = 0;
+ tcm->tcm__pad2 = 0;
+ tcm->tcm_ifindex = qdisc_dev(tp->q)->ifindex;
+ tcm->tcm_parent = tp->classid;
+ tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol);
+ if (nla_put_string(skb, TCA_KIND, tp->ops->kind))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, TCA_CHAIN, tp->chain->index))
+ goto nla_put_failure;
+ if (!fh) {
+ tcm->tcm_handle = 0;
+ } else {
+ if (tp->ops->dump && tp->ops->dump(net, tp, fh, skb, tcm) < 0)
+ goto nla_put_failure;
+ }
+ nlh->nlmsg_len = skb_tail_pointer(skb) - b;
+ return skb->len;
+
+out_nlmsg_trim:
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int tfilter_notify(struct net *net, struct sk_buff *oskb,
+ struct nlmsghdr *n, struct tcf_proto *tp,
+ void *fh, int event, bool unicast)
+{
+ struct sk_buff *skb;
+ u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOBUFS;
+
+ if (tcf_fill_node(net, skb, tp, fh, portid, n->nlmsg_seq,
+ n->nlmsg_flags, event) <= 0) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ if (unicast)
+ return netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT);
+
+ return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
+ n->nlmsg_flags & NLM_F_ECHO);
+}
+
+static int tfilter_del_notify(struct net *net, struct sk_buff *oskb,
+ struct nlmsghdr *n, struct tcf_proto *tp,
+ void *fh, bool unicast, bool *last)
+{
+ struct sk_buff *skb;
+ u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
+ int err;
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOBUFS;
+
+ if (tcf_fill_node(net, skb, tp, fh, portid, n->nlmsg_seq,
+ n->nlmsg_flags, RTM_DELTFILTER) <= 0) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ err = tp->ops->delete(tp, fh, last);
+ if (err) {
+ kfree_skb(skb);
+ return err;
+ }
+
+ if (unicast)
+ return netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT);
+
+ return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
+ n->nlmsg_flags & NLM_F_ECHO);
+}
+
+static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb,
+ struct nlmsghdr *n,
+ struct tcf_chain *chain, int event)
+{
+ struct tcf_proto *tp;
+
+ for (tp = rtnl_dereference(chain->filter_chain);
+ tp; tp = rtnl_dereference(tp->next))
+ tfilter_notify(net, oskb, n, tp, 0, event, false);
+}
+
/* Add/change/delete/get a filter node */
static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
struct tcf_proto *tp;
const struct Qdisc_class_ops *cops;
unsigned long cl;
- unsigned long fh;
+ void *fh;
int err;
int tp_created;
/* Do we search for filter, attached to class? */
if (TC_H_MIN(parent)) {
- cl = cops->get(q, parent);
+ cl = cops->find(q, parent);
if (cl == 0)
return -ENOENT;
}
fh = tp->ops->get(tp, t->tcm_handle);
- if (fh == 0) {
+ if (!fh) {
if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) {
tcf_chain_tp_remove(chain, &chain_info, tp);
tfilter_notify(net, skb, n, tp, fh,
}
break;
case RTM_DELTFILTER:
- err = tp->ops->delete(tp, fh, &last);
+ err = tfilter_del_notify(net, skb, n, tp, fh, false,
+ &last);
if (err)
goto errout;
- tfilter_notify(net, skb, n, tp, t->tcm_handle,
- RTM_DELTFILTER, false);
if (last) {
tcf_chain_tp_remove(chain, &chain_info, tp);
tcf_proto_destroy(tp);
errout:
if (chain)
tcf_chain_put(chain);
- if (cl)
- cops->put(q, cl);
if (err == -EAGAIN)
/* Replay the request. */
goto replay;
return err;
}
-static int tcf_fill_node(struct net *net, struct sk_buff *skb,
- struct tcf_proto *tp, unsigned long fh, u32 portid,
- u32 seq, u16 flags, int event)
-{
- struct tcmsg *tcm;
- struct nlmsghdr *nlh;
- unsigned char *b = skb_tail_pointer(skb);
-
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
- if (!nlh)
- goto out_nlmsg_trim;
- tcm = nlmsg_data(nlh);
- tcm->tcm_family = AF_UNSPEC;
- tcm->tcm__pad1 = 0;
- tcm->tcm__pad2 = 0;
- tcm->tcm_ifindex = qdisc_dev(tp->q)->ifindex;
- tcm->tcm_parent = tp->classid;
- tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol);
- if (nla_put_string(skb, TCA_KIND, tp->ops->kind))
- goto nla_put_failure;
- if (nla_put_u32(skb, TCA_CHAIN, tp->chain->index))
- goto nla_put_failure;
- tcm->tcm_handle = fh;
- if (RTM_DELTFILTER != event) {
- tcm->tcm_handle = 0;
- if (tp->ops->dump && tp->ops->dump(net, tp, fh, skb, tcm) < 0)
- goto nla_put_failure;
- }
- nlh->nlmsg_len = skb_tail_pointer(skb) - b;
- return skb->len;
-
-out_nlmsg_trim:
-nla_put_failure:
- nlmsg_trim(skb, b);
- return -1;
-}
-
-static int tfilter_notify(struct net *net, struct sk_buff *oskb,
- struct nlmsghdr *n, struct tcf_proto *tp,
- unsigned long fh, int event, bool unicast)
-{
- struct sk_buff *skb;
- u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
-
- skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
- if (!skb)
- return -ENOBUFS;
-
- if (tcf_fill_node(net, skb, tp, fh, portid, n->nlmsg_seq,
- n->nlmsg_flags, event) <= 0) {
- kfree_skb(skb);
- return -EINVAL;
- }
-
- if (unicast)
- return netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT);
-
- return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
- n->nlmsg_flags & NLM_F_ECHO);
-}
-
struct tcf_dump_args {
struct tcf_walker w;
struct sk_buff *skb;
struct netlink_callback *cb;
};
-static int tcf_node_dump(struct tcf_proto *tp, unsigned long n,
- struct tcf_walker *arg)
+static int tcf_node_dump(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
{
struct tcf_dump_args *a = (void *)arg;
struct net *net = sock_net(a->skb->sk);
goto out;
cops = q->ops->cl_ops;
if (!cops)
- goto errout;
+ goto out;
if (!cops->tcf_block)
- goto errout;
+ goto out;
if (TC_H_MIN(tcm->tcm_parent)) {
- cl = cops->get(q, tcm->tcm_parent);
+ cl = cops->find(q, tcm->tcm_parent);
if (cl == 0)
- goto errout;
+ goto out;
}
block = cops->tcf_block(q, cl);
if (!block)
- goto errout;
+ goto out;
index_start = cb->args[0];
index = 0;
cb->args[0] = index;
-errout:
- if (cl)
- cops->put(q, cl);
out:
return skb->len;
}
}
EXPORT_SYMBOL(tcf_exts_validate);
-void tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst,
- struct tcf_exts *src)
+void tcf_exts_change(struct tcf_exts *dst, struct tcf_exts *src)
{
#ifdef CONFIG_NET_CLS_ACT
struct tcf_exts old = *dst;
- tcf_tree_lock(tp);
- dst->nr_actions = src->nr_actions;
- dst->actions = src->actions;
- dst->type = src->type;
- tcf_tree_unlock(tp);
-
+ *dst = *src;
tcf_exts_destroy(&old);
#endif
}
#ifdef CONFIG_NET_CLS_ACT
struct nlattr *nest;
- if (exts->action && exts->nr_actions) {
+ if (exts->action && tcf_exts_has_actions(exts)) {
/*
* again for backward compatible mode - we want
* to work with both old and new modes of entering
const struct tc_action *a;
LIST_HEAD(actions);
- if (tc_no_actions(exts))
+ if (!tcf_exts_has_actions(exts))
return -EINVAL;
tcf_exts_to_list(exts, &actions);
static int __init tc_filter_init(void)
{
- rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL, NULL);
- rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_ctl_tfilter, NULL, NULL);
+ rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL, 0);
+ rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_ctl_tfilter, NULL, 0);
rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_ctl_tfilter,
- tc_dump_tfilter, NULL);
+ tc_dump_tfilter, 0);
return 0;
}
#include <net/sock.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
-
-static int qdisc_notify(struct net *net, struct sk_buff *oskb,
- struct nlmsghdr *n, u32 clid,
- struct Qdisc *old, struct Qdisc *new);
-static int tclass_notify(struct net *net, struct sk_buff *oskb,
- struct nlmsghdr *n, struct Qdisc *q,
- unsigned long cl, int event);
+#include <net/pkt_cls.h>
/*
if (qops->cl_ops) {
const struct Qdisc_class_ops *cops = qops->cl_ops;
- if (!(cops->get && cops->put && cops->walk && cops->leaf))
+ if (!(cops->find && cops->walk && cops->leaf))
goto out_einval;
if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
if (cops == NULL)
return NULL;
- cl = cops->get(p, classid);
+ cl = cops->find(p, classid);
if (cl == 0)
return NULL;
leaf = cops->leaf(p, cl);
- cops->put(p, cl);
return leaf;
}
static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
{
- unsigned int size = n * sizeof(struct hlist_head), i;
struct hlist_head *h;
+ unsigned int i;
- if (size <= PAGE_SIZE)
- h = kmalloc(size, GFP_KERNEL);
- else
- h = (struct hlist_head *)
- __get_free_pages(GFP_KERNEL, get_order(size));
+ h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
if (h != NULL) {
for (i = 0; i < n; i++)
return h;
}
-static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
-{
- unsigned int size = n * sizeof(struct hlist_head);
-
- if (size <= PAGE_SIZE)
- kfree(h);
- else
- free_pages((unsigned long)h, get_order(size));
-}
-
void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
struct Qdisc_class_common *cl;
clhash->hashmask = nmask;
sch_tree_unlock(sch);
- qdisc_class_hash_free(ohash, osize);
+ kvfree(ohash);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);
void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
- qdisc_class_hash_free(clhash->hash, clhash->hashsize);
+ kvfree(clhash->hash);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);
const struct Qdisc_class_ops *cops;
unsigned long cl;
u32 parentid;
+ bool notify;
int drops;
if (n == 0 && len == 0)
if (sch->flags & TCQ_F_NOPARENT)
break;
+ /* Notify parent qdisc only if child qdisc becomes empty.
+ *
+ * If child was empty even before update then backlog
+ * counter is screwed and we skip notification because
+ * parent class is already passive.
+ */
+ notify = !sch->q.qlen && !WARN_ON_ONCE(!n);
/* TODO: perform the search on a per txq basis */
sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
if (sch == NULL) {
break;
}
cops = sch->ops->cl_ops;
- if (cops->qlen_notify) {
- cl = cops->get(sch, parentid);
+ if (notify && cops->qlen_notify) {
+ cl = cops->find(sch, parentid);
cops->qlen_notify(sch, cl);
- cops->put(sch, cl);
}
sch->q.qlen -= n;
sch->qstats.backlog -= len;
}
EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
+static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
+ u32 portid, u32 seq, u16 flags, int event)
+{
+ struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
+ struct gnet_stats_queue __percpu *cpu_qstats = NULL;
+ struct tcmsg *tcm;
+ struct nlmsghdr *nlh;
+ unsigned char *b = skb_tail_pointer(skb);
+ struct gnet_dump d;
+ struct qdisc_size_table *stab;
+ __u32 qlen;
+
+ cond_resched();
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
+ if (!nlh)
+ goto out_nlmsg_trim;
+ tcm = nlmsg_data(nlh);
+ tcm->tcm_family = AF_UNSPEC;
+ tcm->tcm__pad1 = 0;
+ tcm->tcm__pad2 = 0;
+ tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
+ tcm->tcm_parent = clid;
+ tcm->tcm_handle = q->handle;
+ tcm->tcm_info = refcount_read(&q->refcnt);
+ if (nla_put_string(skb, TCA_KIND, q->ops->id))
+ goto nla_put_failure;
+ if (q->ops->dump && q->ops->dump(q, skb) < 0)
+ goto nla_put_failure;
+ qlen = q->q.qlen;
+
+ stab = rtnl_dereference(q->stab);
+ if (stab && qdisc_dump_stab(skb, stab) < 0)
+ goto nla_put_failure;
+
+ if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
+ NULL, &d, TCA_PAD) < 0)
+ goto nla_put_failure;
+
+ if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
+ goto nla_put_failure;
+
+ if (qdisc_is_percpu_stats(q)) {
+ cpu_bstats = q->cpu_bstats;
+ cpu_qstats = q->cpu_qstats;
+ }
+
+ if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
+ &d, cpu_bstats, &q->bstats) < 0 ||
+ gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
+ gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
+ goto nla_put_failure;
+
+ if (gnet_stats_finish_copy(&d) < 0)
+ goto nla_put_failure;
+
+ nlh->nlmsg_len = skb_tail_pointer(skb) - b;
+ return skb->len;
+
+out_nlmsg_trim:
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
+{
+ if (q->flags & TCQ_F_BUILTIN)
+ return true;
+ if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
+ return true;
+
+ return false;
+}
+
+static int qdisc_notify(struct net *net, struct sk_buff *oskb,
+ struct nlmsghdr *n, u32 clid,
+ struct Qdisc *old, struct Qdisc *new)
+{
+ struct sk_buff *skb;
+ u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOBUFS;
+
+ if (old && !tc_qdisc_dump_ignore(old, false)) {
+ if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
+ 0, RTM_DELQDISC) < 0)
+ goto err_out;
+ }
+ if (new && !tc_qdisc_dump_ignore(new, false)) {
+ if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
+ old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
+ goto err_out;
+ }
+
+ if (skb->len)
+ return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
+ n->nlmsg_flags & NLM_F_ECHO);
+
+err_out:
+ kfree_skb(skb);
+ return -EINVAL;
+}
+
static void notify_and_destroy(struct net *net, struct sk_buff *skb,
struct nlmsghdr *n, u32 clid,
struct Qdisc *old, struct Qdisc *new)
old = dev_graft_qdisc(dev_queue, new);
if (new && i > 0)
- refcount_inc(&new->refcnt);
+ qdisc_refcount_inc(new);
if (!ingress)
qdisc_destroy(old);
notify_and_destroy(net, skb, n, classid,
dev->qdisc, new);
if (new && !new->ops->attach)
- refcount_inc(&new->refcnt);
+ qdisc_refcount_inc(new);
dev->qdisc = new ? : &noop_qdisc;
if (new && new->ops->attach)
err = -EOPNOTSUPP;
if (cops && cops->graft) {
- unsigned long cl = cops->get(parent, classid);
- if (cl) {
+ unsigned long cl = cops->find(parent, classid);
+
+ if (cl)
err = cops->graft(parent, cl, new, &old);
- cops->put(parent, cl);
- } else
+ else
err = -ENOENT;
}
if (!err)
if (q == p ||
(p && check_loop(q, p, 0)))
return -ELOOP;
- refcount_inc(&q->refcnt);
+ qdisc_refcount_inc(q);
goto graft;
} else {
if (!q)
return 0;
}
-static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
- u32 portid, u32 seq, u16 flags, int event)
-{
- struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
- struct gnet_stats_queue __percpu *cpu_qstats = NULL;
- struct tcmsg *tcm;
- struct nlmsghdr *nlh;
- unsigned char *b = skb_tail_pointer(skb);
- struct gnet_dump d;
- struct qdisc_size_table *stab;
- __u32 qlen;
-
- cond_resched();
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
- if (!nlh)
- goto out_nlmsg_trim;
- tcm = nlmsg_data(nlh);
- tcm->tcm_family = AF_UNSPEC;
- tcm->tcm__pad1 = 0;
- tcm->tcm__pad2 = 0;
- tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
- tcm->tcm_parent = clid;
- tcm->tcm_handle = q->handle;
- tcm->tcm_info = refcount_read(&q->refcnt);
- if (nla_put_string(skb, TCA_KIND, q->ops->id))
- goto nla_put_failure;
- if (q->ops->dump && q->ops->dump(q, skb) < 0)
- goto nla_put_failure;
- qlen = q->q.qlen;
-
- stab = rtnl_dereference(q->stab);
- if (stab && qdisc_dump_stab(skb, stab) < 0)
- goto nla_put_failure;
-
- if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
- NULL, &d, TCA_PAD) < 0)
- goto nla_put_failure;
-
- if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
- goto nla_put_failure;
-
- if (qdisc_is_percpu_stats(q)) {
- cpu_bstats = q->cpu_bstats;
- cpu_qstats = q->cpu_qstats;
- }
-
- if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
- &d, cpu_bstats, &q->bstats) < 0 ||
- gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
- gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
- goto nla_put_failure;
-
- if (gnet_stats_finish_copy(&d) < 0)
- goto nla_put_failure;
-
- nlh->nlmsg_len = skb_tail_pointer(skb) - b;
- return skb->len;
-
-out_nlmsg_trim:
-nla_put_failure:
- nlmsg_trim(skb, b);
- return -1;
-}
-
-static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
-{
- if (q->flags & TCQ_F_BUILTIN)
- return true;
- if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
- return true;
-
- return false;
-}
-
-static int qdisc_notify(struct net *net, struct sk_buff *oskb,
- struct nlmsghdr *n, u32 clid,
- struct Qdisc *old, struct Qdisc *new)
-{
- struct sk_buff *skb;
- u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
-
- skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
- if (!skb)
- return -ENOBUFS;
-
- if (old && !tc_qdisc_dump_ignore(old, false)) {
- if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
- 0, RTM_DELQDISC) < 0)
- goto err_out;
- }
- if (new && !tc_qdisc_dump_ignore(new, false)) {
- if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
- old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
- goto err_out;
- }
-
- if (skb->len)
- return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
- n->nlmsg_flags & NLM_F_ECHO);
-
-err_out:
- kfree_skb(skb);
- return -EINVAL;
-}
-
static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
struct netlink_callback *cb,
int *q_idx_p, int s_q_idx, bool recur,
* Traffic classes manipulation. *
************************************************/
+static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
+ unsigned long cl,
+ u32 portid, u32 seq, u16 flags, int event)
+{
+ struct tcmsg *tcm;
+ struct nlmsghdr *nlh;
+ unsigned char *b = skb_tail_pointer(skb);
+ struct gnet_dump d;
+ const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
+
+ cond_resched();
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
+ if (!nlh)
+ goto out_nlmsg_trim;
+ tcm = nlmsg_data(nlh);
+ tcm->tcm_family = AF_UNSPEC;
+ tcm->tcm__pad1 = 0;
+ tcm->tcm__pad2 = 0;
+ tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
+ tcm->tcm_parent = q->handle;
+ tcm->tcm_handle = q->handle;
+ tcm->tcm_info = 0;
+ if (nla_put_string(skb, TCA_KIND, q->ops->id))
+ goto nla_put_failure;
+ if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
+ goto nla_put_failure;
+
+ if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
+ NULL, &d, TCA_PAD) < 0)
+ goto nla_put_failure;
+
+ if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
+ goto nla_put_failure;
+
+ if (gnet_stats_finish_copy(&d) < 0)
+ goto nla_put_failure;
+
+ nlh->nlmsg_len = skb_tail_pointer(skb) - b;
+ return skb->len;
+
+out_nlmsg_trim:
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int tclass_notify(struct net *net, struct sk_buff *oskb,
+ struct nlmsghdr *n, struct Qdisc *q,
+ unsigned long cl, int event)
+{
+ struct sk_buff *skb;
+ u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOBUFS;
+
+ if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
+ n->nlmsg_flags & NLM_F_ECHO);
+}
+
+static int tclass_del_notify(struct net *net,
+ const struct Qdisc_class_ops *cops,
+ struct sk_buff *oskb, struct nlmsghdr *n,
+ struct Qdisc *q, unsigned long cl)
+{
+ u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
+ struct sk_buff *skb;
+ int err = 0;
+
+ if (!cops->delete)
+ return -EOPNOTSUPP;
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOBUFS;
+
+ if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
+ RTM_DELTCLASS) < 0) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ err = cops->delete(q, cl);
+ if (err) {
+ kfree_skb(skb);
+ return err;
+ }
+
+ return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
+ n->nlmsg_flags & NLM_F_ECHO);
+}
+
+#ifdef CONFIG_NET_CLS
+
+struct tcf_bind_args {
+ struct tcf_walker w;
+ u32 classid;
+ unsigned long cl;
+};
+
+static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
+{
+ struct tcf_bind_args *a = (void *)arg;
+
+ if (tp->ops->bind_class) {
+ tcf_tree_lock(tp);
+ tp->ops->bind_class(n, a->classid, a->cl);
+ tcf_tree_unlock(tp);
+ }
+ return 0;
+}
+
+static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
+ unsigned long new_cl)
+{
+ const struct Qdisc_class_ops *cops = q->ops->cl_ops;
+ struct tcf_block *block;
+ struct tcf_chain *chain;
+ unsigned long cl;
+
+ cl = cops->find(q, portid);
+ if (!cl)
+ return;
+ block = cops->tcf_block(q, cl);
+ if (!block)
+ return;
+ list_for_each_entry(chain, &block->chain_list, list) {
+ struct tcf_proto *tp;
+
+ for (tp = rtnl_dereference(chain->filter_chain);
+ tp; tp = rtnl_dereference(tp->next)) {
+ struct tcf_bind_args arg = {};
+
+ arg.w.fn = tcf_node_bind;
+ arg.classid = clid;
+ arg.cl = new_cl;
+ tp->ops->walk(tp, &arg.w);
+ }
+ }
+}
+
+#else
+static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
+ unsigned long new_cl)
+{
+}
+
+#endif
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
struct netlink_ext_ack *extack)
clid = TC_H_MAKE(qid, clid);
if (clid)
- cl = cops->get(q, clid);
+ cl = cops->find(q, clid);
if (cl == 0) {
err = -ENOENT;
goto out;
break;
case RTM_DELTCLASS:
- err = -EOPNOTSUPP;
- if (cops->delete)
- err = cops->delete(q, cl);
- if (err == 0)
- tclass_notify(net, skb, n, q, cl,
- RTM_DELTCLASS);
+ err = tclass_del_notify(net, cops, skb, n, q, cl);
+ /* Unbind the class with flilters with 0 */
+ tc_bind_tclass(q, portid, clid, 0);
goto out;
case RTM_GETTCLASS:
err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
err = -EOPNOTSUPP;
if (cops->change)
err = cops->change(q, clid, portid, tca, &new_cl);
- if (err == 0)
+ if (err == 0) {
tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
-
+ /* We just create a new class, need to do reverse binding. */
+ if (cl != new_cl)
+ tc_bind_tclass(q, portid, clid, new_cl);
+ }
out:
- if (cl)
- cops->put(q, cl);
-
return err;
}
-
-static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
- unsigned long cl,
- u32 portid, u32 seq, u16 flags, int event)
-{
- struct tcmsg *tcm;
- struct nlmsghdr *nlh;
- unsigned char *b = skb_tail_pointer(skb);
- struct gnet_dump d;
- const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
-
- cond_resched();
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
- if (!nlh)
- goto out_nlmsg_trim;
- tcm = nlmsg_data(nlh);
- tcm->tcm_family = AF_UNSPEC;
- tcm->tcm__pad1 = 0;
- tcm->tcm__pad2 = 0;
- tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
- tcm->tcm_parent = q->handle;
- tcm->tcm_handle = q->handle;
- tcm->tcm_info = 0;
- if (nla_put_string(skb, TCA_KIND, q->ops->id))
- goto nla_put_failure;
- if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
- goto nla_put_failure;
-
- if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
- NULL, &d, TCA_PAD) < 0)
- goto nla_put_failure;
-
- if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
- goto nla_put_failure;
-
- if (gnet_stats_finish_copy(&d) < 0)
- goto nla_put_failure;
-
- nlh->nlmsg_len = skb_tail_pointer(skb) - b;
- return skb->len;
-
-out_nlmsg_trim:
-nla_put_failure:
- nlmsg_trim(skb, b);
- return -1;
-}
-
-static int tclass_notify(struct net *net, struct sk_buff *oskb,
- struct nlmsghdr *n, struct Qdisc *q,
- unsigned long cl, int event)
-{
- struct sk_buff *skb;
- u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
-
- skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
- if (!skb)
- return -ENOBUFS;
-
- if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
- kfree_skb(skb);
- return -EINVAL;
- }
-
- return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
- n->nlmsg_flags & NLM_F_ECHO);
-}
-
struct qdisc_dump_args {
struct qdisc_walker w;
struct sk_buff *skb;
register_qdisc(&mq_qdisc_ops);
register_qdisc(&noqueue_qdisc_ops);
- rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
- rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
+ rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
+ rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
- NULL);
- rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL);
- rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL);
+ 0);
+ rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
+ rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
- NULL);
+ 0);
return 0;
}
struct tcf_proto __rcu *filter_list;
struct tcf_block *block;
- int refcnt;
int filters;
struct cbq_class *defaults[TC_PRIO_MAX + 1];
struct tc_ratespec *r;
int err;
+ qdisc_watchdog_init(&q->watchdog, sch);
+ hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+ q->delay_timer.function = cbq_undelay;
+
+ if (!opt)
+ return -EINVAL;
+
err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy, NULL);
if (err < 0)
return err;
if (err < 0)
goto put_rtab;
- q->link.refcnt = 1;
q->link.sibling = &q->link;
q->link.common.classid = sch->handle;
q->link.qdisc = sch;
q->link.avpkt = q->link.allot/2;
q->link.minidle = -0x7FFFFFFF;
- qdisc_watchdog_init(&q->watchdog, sch);
- hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
- q->delay_timer.function = cbq_undelay;
q->toplevel = TC_CBQ_MAXLEVEL;
q->now = psched_get_time();
{
struct cbq_class *cl = (struct cbq_class *)arg;
- if (cl->q->q.qlen == 0)
- cbq_deactivate_class(cl);
+ cbq_deactivate_class(cl);
}
-static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
+static unsigned long cbq_find(struct Qdisc *sch, u32 classid)
{
struct cbq_sched_data *q = qdisc_priv(sch);
- struct cbq_class *cl = cbq_class_lookup(q, classid);
- if (cl) {
- cl->refcnt++;
- return (unsigned long)cl;
- }
- return 0;
+ return (unsigned long)cbq_class_lookup(q, classid);
}
static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
qdisc_class_hash_destroy(&q->clhash);
}
-static void cbq_put(struct Qdisc *sch, unsigned long arg)
-{
- struct cbq_class *cl = (struct cbq_class *)arg;
-
- if (--cl->refcnt == 0) {
-#ifdef CONFIG_NET_CLS_ACT
- spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);
- struct cbq_sched_data *q = qdisc_priv(sch);
-
- spin_lock_bh(root_lock);
- if (q->rx_class == cl)
- q->rx_class = NULL;
- spin_unlock_bh(root_lock);
-#endif
-
- cbq_destroy_class(sch, cl);
- }
-}
-
static int
cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
unsigned long *arg)
cl->R_tab = rtab;
rtab = NULL;
- cl->refcnt = 1;
cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid);
if (!cl->q)
cl->q = &noop_qdisc;
cbq_rmprio(q, cl);
sch_tree_unlock(sch);
- BUG_ON(--cl->refcnt == 0);
- /*
- * This shouldn't happen: we "hold" one cops->get() when called
- * from tc_ctl_tclass; the destroy method is done from cops->put().
- */
-
+ cbq_destroy_class(sch, cl);
return 0;
}
.graft = cbq_graft,
.leaf = cbq_leaf,
.qlen_notify = cbq_qlen_notify,
- .get = cbq_get,
- .put = cbq_put,
+ .find = cbq_find,
.change = cbq_change_class,
.delete = cbq_delete,
.walk = cbq_walk,
if (!q->flows)
return -ENOMEM;
q->backlogs = kvzalloc(q->flows_cnt * sizeof(u32), GFP_KERNEL);
- if (!q->backlogs) {
- kvfree(q->flows);
+ if (!q->backlogs)
return -ENOMEM;
- }
for (i = 0; i < q->flows_cnt; i++) {
struct fq_codel_flow *flow = q->flows + i;
return NULL;
}
-static unsigned long fq_codel_get(struct Qdisc *sch, u32 classid)
+static unsigned long fq_codel_find(struct Qdisc *sch, u32 classid)
{
return 0;
}
return 0;
}
-static void fq_codel_put(struct Qdisc *q, unsigned long cl)
+static void fq_codel_unbind(struct Qdisc *q, unsigned long cl)
{
}
static const struct Qdisc_class_ops fq_codel_class_ops = {
.leaf = fq_codel_leaf,
- .get = fq_codel_get,
- .put = fq_codel_put,
+ .find = fq_codel_find,
.tcf_block = fq_codel_tcf_block,
.bind_tcf = fq_codel_bind,
- .unbind_tcf = fq_codel_put,
+ .unbind_tcf = fq_codel_unbind,
.dump = fq_codel_dump_class,
.dump_stats = fq_codel_dump_class_stats,
.walk = fq_codel_walk,
#include <net/sch_generic.h>
#include <net/pkt_sched.h>
#include <net/dst.h>
+#include <trace/events/qdisc.h>
/* Qdisc to use by default */
const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops;
q->q.qlen--;
} else
skb = NULL;
- return skb;
+ goto trace;
}
*validate = true;
skb = q->skb_bad_txq;
q->q.qlen--;
goto bulk;
}
- return NULL;
+ skb = NULL;
+ goto trace;
}
if (!(q->flags & TCQ_F_ONETXQUEUE) ||
!netif_xmit_frozen_or_stopped(txq))
else
try_bulk_dequeue_skb_slow(q, skb, packets);
}
+trace:
+ trace_qdisc_dequeue(q, txq, *packets, skb);
return skb;
}
dev->priv_flags & IFF_NO_QUEUE) {
netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
dev->qdisc = txq->qdisc_sleeping;
- refcount_inc(&dev->qdisc->refcnt);
+ qdisc_refcount_inc(dev->qdisc);
} else {
qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT);
if (qdisc) {
struct hfsc_class {
struct Qdisc_class_common cl_common;
- unsigned int refcnt; /* usage count */
struct gnet_stats_basic_packed bstats;
struct gnet_stats_queue qstats;
}
}
-static void
-set_active(struct hfsc_class *cl, unsigned int len)
-{
- if (cl->cl_flags & HFSC_RSC)
- init_ed(cl, len);
- if (cl->cl_flags & HFSC_FSC)
- init_vf(cl, len);
-
-}
-
-static void
-set_passive(struct hfsc_class *cl)
-{
- if (cl->cl_flags & HFSC_RSC)
- eltree_remove(cl);
-
- /*
- * vttree is now handled in update_vf() so that update_vf(cl, 0, 0)
- * needs to be called explicitly to remove a class from vttree.
- */
-}
-
static unsigned int
qdisc_peek_len(struct Qdisc *sch)
{
hfsc_change_usc(cl, usc, 0);
cl->cl_common.classid = classid;
- cl->refcnt = 1;
cl->sched = q;
cl->cl_parent = parent;
cl->qdisc = qdisc_create_dflt(sch->dev_queue,
hfsc_purge_queue(sch, cl);
qdisc_class_hash_remove(&q->clhash, &cl->cl_common);
- BUG_ON(--cl->refcnt == 0);
- /*
- * This shouldn't happen: we "hold" one cops->get() when called
- * from tc_ctl_tclass; the destroy method is done from cops->put().
- */
-
sch_tree_unlock(sch);
+
+ hfsc_destroy_class(sch, cl);
return 0;
}
{
struct hfsc_class *cl = (struct hfsc_class *)arg;
- if (cl->qdisc->q.qlen == 0) {
- update_vf(cl, 0, 0);
- set_passive(cl);
- }
+ /* vttree is now handled in update_vf() so that update_vf(cl, 0, 0)
+ * needs to be called explicitly to remove a class from vttree.
+ */
+ update_vf(cl, 0, 0);
+ if (cl->cl_flags & HFSC_RSC)
+ eltree_remove(cl);
}
static unsigned long
-hfsc_get_class(struct Qdisc *sch, u32 classid)
-{
- struct hfsc_class *cl = hfsc_find_class(classid, sch);
-
- if (cl != NULL)
- cl->refcnt++;
-
- return (unsigned long)cl;
-}
-
-static void
-hfsc_put_class(struct Qdisc *sch, unsigned long arg)
+hfsc_search_class(struct Qdisc *sch, u32 classid)
{
- struct hfsc_class *cl = (struct hfsc_class *)arg;
-
- if (--cl->refcnt == 0)
- hfsc_destroy_class(sch, cl);
+ return (unsigned long)hfsc_find_class(classid, sch);
}
static unsigned long
struct tc_hfsc_qopt *qopt;
int err;
+ qdisc_watchdog_init(&q->watchdog, sch);
+
if (opt == NULL || nla_len(opt) < sizeof(*qopt))
return -EINVAL;
qopt = nla_data(opt);
err = tcf_block_get(&q->root.block, &q->root.filter_list);
if (err)
- goto err_tcf;
+ return err;
q->root.cl_common.classid = sch->handle;
- q->root.refcnt = 1;
q->root.sched = q;
q->root.qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
sch->handle);
qdisc_class_hash_insert(&q->clhash, &q->root.cl_common);
qdisc_class_hash_grow(sch, &q->clhash);
- qdisc_watchdog_init(&q->watchdog, sch);
-
return 0;
-
- err_tcf:
- qdisc_class_hash_destroy(&q->clhash);
- return err;
}
static int
}
if (cl->qdisc->q.qlen == 1) {
- set_active(cl, qdisc_pkt_len(skb));
+ unsigned int len = qdisc_pkt_len(skb);
+
+ if (cl->cl_flags & HFSC_RSC)
+ init_ed(cl, len);
+ if (cl->cl_flags & HFSC_FSC)
+ init_vf(cl, len);
/*
* If this is the first packet, isolate the head so an eventual
* head drop before the first dequeue operation has no chance
if (realtime)
cl->cl_cumul += qdisc_pkt_len(skb);
- if (cl->qdisc->q.qlen != 0) {
- if (cl->cl_flags & HFSC_RSC) {
+ if (cl->cl_flags & HFSC_RSC) {
+ if (cl->qdisc->q.qlen != 0) {
/* update ed */
next_len = qdisc_peek_len(cl->qdisc);
if (realtime)
update_ed(cl, next_len);
else
update_d(cl, next_len);
+ } else {
+ /* the class becomes passive */
+ eltree_remove(cl);
}
- } else {
- /* the class becomes passive */
- set_passive(cl);
}
qdisc_bstats_update(sch, skb);
.graft = hfsc_graft_class,
.leaf = hfsc_class_leaf,
.qlen_notify = hfsc_qlen_notify,
- .get = hfsc_get_class,
- .put = hfsc_put_class,
+ .find = hfsc_search_class,
.bind_tcf = hfsc_bind_tcf,
.unbind_tcf = hfsc_unbind_tcf,
.tcf_block = hfsc_tcf_block,
struct tcf_proto __rcu *filter_list; /* class attached filters */
struct tcf_block *block;
int filter_cnt;
- int refcnt; /* usage count of this class */
int level; /* our level (see above) */
unsigned int children;
return container_of(clc, struct htb_class, common);
}
+static unsigned long htb_search(struct Qdisc *sch, u32 handle)
+{
+ return (unsigned long)htb_find(handle, sch);
+}
/**
* htb_classify - classify a packet into class
*
int err;
int i;
+ qdisc_watchdog_init(&q->watchdog, sch);
+ INIT_WORK(&q->work, htb_work_func);
+
if (!opt)
return -EINVAL;
for (i = 0; i < TC_HTB_NUMPRIO; i++)
INIT_LIST_HEAD(q->drops + i);
- qdisc_watchdog_init(&q->watchdog, sch);
- INIT_WORK(&q->work, htb_work_func);
qdisc_skb_head_init(&q->direct_queue);
if (tb[TCA_HTB_DIRECT_QLEN])
{
struct htb_class *cl = (struct htb_class *)arg;
- if (cl->un.leaf.q->q.qlen == 0)
- htb_deactivate(qdisc_priv(sch), cl);
-}
-
-static unsigned long htb_get(struct Qdisc *sch, u32 classid)
-{
- struct htb_class *cl = htb_find(classid, sch);
- if (cl)
- cl->refcnt++;
- return (unsigned long)cl;
+ htb_deactivate(qdisc_priv(sch), cl);
}
static inline int htb_parent_last_child(struct htb_class *cl)
if (last_child)
htb_parent_to_leaf(q, cl, new_q);
- BUG_ON(--cl->refcnt == 0);
- /*
- * This shouldn't happen: we "hold" one cops->get() when called
- * from tc_ctl_tclass; the destroy method is done from cops->put().
- */
-
sch_tree_unlock(sch);
- return 0;
-}
-static void htb_put(struct Qdisc *sch, unsigned long arg)
-{
- struct htb_class *cl = (struct htb_class *)arg;
-
- if (--cl->refcnt == 0)
- htb_destroy_class(sch, cl);
+ htb_destroy_class(sch, cl);
+ return 0;
}
static int htb_change_class(struct Qdisc *sch, u32 classid,
}
}
- cl->refcnt = 1;
cl->children = 0;
INIT_LIST_HEAD(&cl->un.leaf.drop_list);
RB_CLEAR_NODE(&cl->pq_node);
.graft = htb_graft,
.leaf = htb_leaf,
.qlen_notify = htb_qlen_notify,
- .get = htb_get,
- .put = htb_put,
+ .find = htb_search,
.change = htb_change_class,
.delete = htb_delete,
.walk = htb_walk,
for (i = 0; i < q->max_bands; i++)
q->queues[i] = &noop_qdisc;
- err = multiq_tune(sch, opt);
-
- if (err)
- kfree(q->queues);
-
- return err;
+ return multiq_tune(sch, opt);
}
static int multiq_dump(struct Qdisc *sch, struct sk_buff *skb)
return q->queues[band];
}
-static unsigned long multiq_get(struct Qdisc *sch, u32 classid)
+static unsigned long multiq_find(struct Qdisc *sch, u32 classid)
{
struct multiq_sched_data *q = qdisc_priv(sch);
unsigned long band = TC_H_MIN(classid);
static unsigned long multiq_bind(struct Qdisc *sch, unsigned long parent,
u32 classid)
{
- return multiq_get(sch, classid);
+ return multiq_find(sch, classid);
}
-static void multiq_put(struct Qdisc *q, unsigned long cl)
+static void multiq_unbind(struct Qdisc *q, unsigned long cl)
{
}
static const struct Qdisc_class_ops multiq_class_ops = {
.graft = multiq_graft,
.leaf = multiq_leaf,
- .get = multiq_get,
- .put = multiq_put,
+ .find = multiq_find,
.walk = multiq_walk,
.tcf_block = multiq_tcf_block,
.bind_tcf = multiq_bind,
- .unbind_tcf = multiq_put,
+ .unbind_tcf = multiq_unbind,
.dump = multiq_dump_class,
.dump_stats = multiq_dump_class_stats,
};
struct netem_sched_data *q = qdisc_priv(sch);
int ret;
+ qdisc_watchdog_init(&q->watchdog, sch);
+
if (!opt)
return -EINVAL;
- qdisc_watchdog_init(&q->watchdog, sch);
-
q->loss_model = CLG_RANDOM;
ret = netem_change(sch, opt);
if (ret)
return q->qdisc;
}
-static unsigned long netem_get(struct Qdisc *sch, u32 classid)
+static unsigned long netem_find(struct Qdisc *sch, u32 classid)
{
return 1;
}
-static void netem_put(struct Qdisc *sch, unsigned long arg)
-{
-}
-
static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
if (!walker->stop) {
static const struct Qdisc_class_ops netem_class_ops = {
.graft = netem_graft,
.leaf = netem_leaf,
- .get = netem_get,
- .put = netem_put,
+ .find = netem_find,
.walk = netem_walk,
.dump = netem_dump_class,
};
slot->skblist_prev = skb;
}
-static unsigned int sfq_drop(struct Qdisc *sch)
+static unsigned int sfq_drop(struct Qdisc *sch, struct sk_buff **to_free)
{
struct sfq_sched_data *q = qdisc_priv(sch);
sfq_index x, d = q->cur_depth;
slot->backlog -= len;
sfq_dec(q, x);
sch->q.qlen--;
- qdisc_qstats_drop(sch);
qdisc_qstats_backlog_dec(sch, skb);
- kfree_skb(skb);
+ qdisc_drop(skb, sch, to_free);
return len;
}
if (hash == 0) {
if (ret & __NET_XMIT_BYPASS)
qdisc_qstats_drop(sch);
- kfree_skb(skb);
+ __qdisc_drop(skb, to_free);
return ret;
}
hash--;
return NET_XMIT_SUCCESS;
qlen = slot->qlen;
- dropped = sfq_drop(sch);
+ dropped = sfq_drop(sch, to_free);
/* Return Congestion Notification only if we dropped a packet
* from this flow.
*/
struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
unsigned int qlen, dropped = 0;
struct red_parms *p = NULL;
+ struct sk_buff *to_free = NULL;
+ struct sk_buff *tail = NULL;
if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
return -EINVAL;
}
qlen = sch->q.qlen;
- while (sch->q.qlen > q->limit)
- dropped += sfq_drop(sch);
+ while (sch->q.qlen > q->limit) {
+ dropped += sfq_drop(sch, &to_free);
+ if (!tail)
+ tail = to_free;
+ }
+
+ rtnl_kfree_skbs(to_free, tail);
qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);
del_timer(&q->perturb_timer);
int i;
int err;
+ setup_deferrable_timer(&q->perturb_timer, sfq_perturbation,
+ (unsigned long)sch);
+
err = tcf_block_get(&q->block, &q->filter_list);
if (err)
return err;
- setup_deferrable_timer(&q->perturb_timer, sfq_perturbation,
- (unsigned long)sch);
-
for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
q->dep[i].next = i + SFQ_MAX_FLOWS;
q->dep[i].prev = i + SFQ_MAX_FLOWS;
return NULL;
}
-static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
+static unsigned long sfq_find(struct Qdisc *sch, u32 classid)
{
return 0;
}
return 0;
}
-static void sfq_put(struct Qdisc *q, unsigned long cl)
+static void sfq_unbind(struct Qdisc *q, unsigned long cl)
{
}
static const struct Qdisc_class_ops sfq_class_ops = {
.leaf = sfq_leaf,
- .get = sfq_get,
- .put = sfq_put,
+ .find = sfq_find,
.tcf_block = sfq_tcf_block,
.bind_tcf = sfq_bind,
- .unbind_tcf = sfq_put,
+ .unbind_tcf = sfq_unbind,
.dump = sfq_dump_class,
.dump_stats = sfq_dump_class_stats,
.walk = sfq_walk,
{
struct tbf_sched_data *q = qdisc_priv(sch);
+ qdisc_watchdog_init(&q->watchdog, sch);
+ q->qdisc = &noop_qdisc;
+
if (opt == NULL)
return -EINVAL;
q->t_c = ktime_get_ns();
- qdisc_watchdog_init(&q->watchdog, sch);
- q->qdisc = &noop_qdisc;
return tbf_change(sch, opt);
}
return q->qdisc;
}
-static unsigned long tbf_get(struct Qdisc *sch, u32 classid)
+static unsigned long tbf_find(struct Qdisc *sch, u32 classid)
{
return 1;
}
-static void tbf_put(struct Qdisc *sch, unsigned long arg)
-{
-}
-
static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
if (!walker->stop) {
static const struct Qdisc_class_ops tbf_class_ops = {
.graft = tbf_graft,
.leaf = tbf_leaf,
- .get = tbf_get,
- .put = tbf_put,
+ .find = tbf_find,
.walk = tbf_walk,
.dump = tbf_dump_class,
};
struct sctp_chunk *chunk);
static int sctp_do_bind(struct sock *, union sctp_addr *, int);
static int sctp_autobind(struct sock *sk);
-static void sctp_sock_migrate(struct sock *, struct sock *,
- struct sctp_association *, sctp_socket_type_t);
+static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
+ struct sctp_association *assoc,
+ enum sctp_socket_type type);
static unsigned long sctp_memory_pressure;
static atomic_long_t sctp_memory_allocated;
struct sctp_association *asoc2;
struct sctp_transport *transport;
union sctp_addr to;
- sctp_scope_t scope;
+ enum sctp_scope scope;
long timeo;
int err = 0;
int addrcnt = 0;
*/
/* BUG: We do not implement the equivalent of sk_stream_wait_memory(). */
-static int sctp_msghdr_parse(const struct msghdr *, sctp_cmsgs_t *);
+static int sctp_msghdr_parse(const struct msghdr *msg,
+ struct sctp_cmsgs *cmsgs);
static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
{
struct sctp_sndrcvinfo *sinfo;
struct sctp_initmsg *sinit;
sctp_assoc_t associd = 0;
- sctp_cmsgs_t cmsgs = { NULL };
- sctp_scope_t scope;
+ struct sctp_cmsgs cmsgs = { NULL };
+ enum sctp_scope scope;
bool fill_sinfo_ttl = false, wait_connect = false;
struct sctp_datamsg *datamsg;
int msg_flags = msg->msg_flags;
info->sctpi_ictrlchunks = asoc->stats.ictrlchunks;
prim = asoc->peer.primary_path;
- memcpy(&info->sctpi_p_address, &prim->ipaddr,
- sizeof(struct sockaddr_storage));
+ memcpy(&info->sctpi_p_address, &prim->ipaddr, sizeof(prim->ipaddr));
info->sctpi_p_state = prim->state;
info->sctpi_p_cwnd = prim->cwnd;
info->sctpi_p_srtt = prim->srtt;
* msg_control
* points here
*/
-static int sctp_msghdr_parse(const struct msghdr *msg, sctp_cmsgs_t *cmsgs)
+static int sctp_msghdr_parse(const struct msghdr *msg, struct sctp_cmsgs *cmsgs)
{
- struct cmsghdr *cmsg;
struct msghdr *my_msg = (struct msghdr *)msg;
+ struct cmsghdr *cmsg;
for_each_cmsghdr(cmsg, my_msg) {
if (!CMSG_OK(my_msg, cmsg))
*/
static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
struct sctp_association *assoc,
- sctp_socket_type_t type)
+ enum sctp_socket_type type)
{
struct sctp_sock *oldsp = sctp_sk(oldsk);
struct sctp_sock *newsp = sctp_sk(newsk);
}
static void bearer_disable(struct net *net, struct tipc_bearer *b);
+ static int tipc_l2_rcv_msg(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev);
/**
* tipc_media_find - locates specified media object by name
return 0;
}
-/* tipc_bearer_reset_all - reset all links on all bearers
- */
-void tipc_bearer_reset_all(struct net *net)
-{
- struct tipc_bearer *b;
- int i;
-
- for (i = 0; i < MAX_BEARERS; i++) {
- b = bearer_get(net, i);
- if (b)
- clear_bit_unlock(0, &b->up);
- }
- for (i = 0; i < MAX_BEARERS; i++) {
- b = bearer_get(net, i);
- if (b)
- tipc_reset_bearer(net, b);
- }
- for (i = 0; i < MAX_BEARERS; i++) {
- b = bearer_get(net, i);
- if (b)
- test_and_set_bit_lock(0, &b->up);
- }
-}
-
/**
* bearer_disable
*
/* Associate TIPC bearer with L2 bearer */
rcu_assign_pointer(b->media_ptr, dev);
+ b->pt.dev = dev;
+ b->pt.type = htons(ETH_P_TIPC);
+ b->pt.func = tipc_l2_rcv_msg;
+ dev_add_pack(&b->pt);
memset(&b->bcast_addr, 0, sizeof(b->bcast_addr));
memcpy(b->bcast_addr.value, dev->broadcast, b->media->hwaddr_len);
b->bcast_addr.media_id = b->media->type_id;
struct net_device *dev;
dev = (struct net_device *)rtnl_dereference(b->media_ptr);
+ dev_remove_pack(&b->pt);
RCU_INIT_POINTER(dev->tipc_ptr, NULL);
synchronize_net();
dev_put(dev);
struct tipc_bearer *b;
rcu_read_lock();
- b = rcu_dereference_rtnl(dev->tipc_ptr);
+ b = rcu_dereference_rtnl(dev->tipc_ptr) ?:
+ rcu_dereference_rtnl(orig_dev->tipc_ptr);
if (likely(b && test_bit(0, &b->up) &&
(skb->pkt_type <= PACKET_MULTICAST))) {
skb->next = NULL;
- tipc_rcv(dev_net(dev), skb, b);
+ tipc_rcv(dev_net(b->pt.dev), skb, b);
rcu_read_unlock();
return NET_RX_SUCCESS;
}
return NOTIFY_OK;
}
- static struct packet_type tipc_packet_type __read_mostly = {
- .type = htons(ETH_P_TIPC),
- .func = tipc_l2_rcv_msg,
- };
-
static struct notifier_block notifier = {
.notifier_call = tipc_l2_device_event,
.priority = 0,
int tipc_bearer_setup(void)
{
- int err;
-
- err = register_netdevice_notifier(¬ifier);
- if (err)
- return err;
- dev_add_pack(&tipc_packet_type);
- return 0;
+ return register_netdevice_notifier(¬ifier);
}
void tipc_bearer_cleanup(void)
{
unregister_netdevice_notifier(¬ifier);
- dev_remove_pack(&tipc_packet_type);
}
void tipc_bearer_stop(struct net *net)
* @name: bearer name (format = media:interface)
* @media: ptr to media structure associated with bearer
* @bcast_addr: media address used in broadcasting
+ * @pt: packet type for bearer
* @rcu: rcu struct for tipc_bearer
* @priority: default link priority for bearer
* @window: default window size for bearer
char name[TIPC_MAX_BEARER_NAME];
struct tipc_media *media;
struct tipc_media_addr bcast_addr;
+ struct packet_type pt;
struct rcu_head rcu;
u32 priority;
u32 window;
struct tipc_bearer *tipc_bearer_find(struct net *net, const char *name);
int tipc_bearer_get_name(struct net *net, char *name, u32 bearer_id);
struct tipc_media *tipc_media_find(const char *name);
-void tipc_bearer_reset_all(struct net *net);
int tipc_bearer_setup(void);
void tipc_bearer_cleanup(void);
void tipc_bearer_stop(struct net *net);
strncpy(linkname, tipc_link_name(link), len);
err = 0;
}
- exit:
tipc_node_read_unlock(node);
+ exit:
tipc_node_put(node);
return err;
}
rc = tipc_bcast_sync_rcv(n->net, n->bc_entry.link, hdr);
if (rc & TIPC_LINK_DOWN_EVT) {
- tipc_bearer_reset_all(n->net);
+ tipc_node_reset_links(n);
return;
}
if (!skb_queue_empty(&be->inputq1))
tipc_node_mcast_rcv(n);
- if (rc & TIPC_LINK_DOWN_EVT) {
- /* Reception reassembly failure => reset all links to peer */
- if (!tipc_link_is_up(be->link))
- tipc_node_reset_links(n);
-
- /* Retransmission failure => reset all links to all peers */
- if (!tipc_link_is_up(tipc_bc_sndlink(net)))
- tipc_bearer_reset_all(net);
- }
+ /* If reassembly or retransmission failure => reset all links to peer */
+ if (rc & TIPC_LINK_DOWN_EVT)
+ tipc_node_reset_links(n);
tipc_node_put(n);
}
/* Check/update node state before receiving */
if (unlikely(skb)) {
+ if (unlikely(skb_linearize(skb)))
+ goto discard;
tipc_node_write_lock(n);
if (tipc_node_check_state(n, skb, bearer_id, &xmitq)) {
if (le->link) {
#include <linux/netfilter.h>
#include <linux/module.h>
#include <linux/cache.h>
+#include <linux/cpu.h>
#include <linux/audit.h>
#include <net/dst.h>
#include <net/flow.h>
u8 flags;
};
+static DEFINE_PER_CPU(struct xfrm_dst *, xfrm_last_dst);
+static struct work_struct *xfrm_pcpu_work __read_mostly;
static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock);
static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1]
__read_mostly;
struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif,
const xfrm_address_t *saddr,
const xfrm_address_t *daddr,
- int family)
+ int family, u32 mark)
{
const struct xfrm_policy_afinfo *afinfo;
struct dst_entry *dst;
if (unlikely(afinfo == NULL))
return ERR_PTR(-EAFNOSUPPORT);
- dst = afinfo->dst_lookup(net, tos, oif, saddr, daddr);
+ dst = afinfo->dst_lookup(net, tos, oif, saddr, daddr, mark);
rcu_read_unlock();
int tos, int oif,
xfrm_address_t *prev_saddr,
xfrm_address_t *prev_daddr,
- int family)
+ int family, u32 mark)
{
struct net *net = xs_net(x);
xfrm_address_t *saddr = &x->props.saddr;
daddr = x->coaddr;
}
- dst = __xfrm_dst_lookup(net, tos, oif, saddr, daddr, family);
+ dst = __xfrm_dst_lookup(net, tos, oif, saddr, daddr, family, mark);
if (!IS_ERR(dst)) {
if (prev_saddr != saddr)
xfrm_pol_put(xp);
}
-static struct flow_cache_object *xfrm_policy_flo_get(struct flow_cache_object *flo)
-{
- struct xfrm_policy *pol = container_of(flo, struct xfrm_policy, flo);
-
- if (unlikely(pol->walk.dead))
- flo = NULL;
- else
- xfrm_pol_hold(pol);
-
- return flo;
-}
-
-static int xfrm_policy_flo_check(struct flow_cache_object *flo)
-{
- struct xfrm_policy *pol = container_of(flo, struct xfrm_policy, flo);
-
- return !pol->walk.dead;
-}
-
-static void xfrm_policy_flo_delete(struct flow_cache_object *flo)
-{
- xfrm_pol_put(container_of(flo, struct xfrm_policy, flo));
-}
-
-static const struct flow_cache_ops xfrm_policy_fc_ops = {
- .get = xfrm_policy_flo_get,
- .check = xfrm_policy_flo_check,
- .delete = xfrm_policy_flo_delete,
-};
-
/* Allocate xfrm_policy. Not used here, it is supposed to be used by pfkeyv2
* SPD calls.
*/
(unsigned long)policy);
setup_timer(&policy->polq.hold_timer, xfrm_policy_queue_process,
(unsigned long)policy);
- policy->flo.ops = &xfrm_policy_fc_ops;
}
return policy;
}
else
hlist_add_head(&policy->bydst, chain);
__xfrm_policy_link(policy, dir);
- atomic_inc(&net->xfrm.flow_cache_genid);
/* After previous checking, family can either be AF_INET or AF_INET6 */
if (policy->family == AF_INET)
}
if (!cnt)
err = -ESRCH;
+ else
+ xfrm_policy_cache_flush();
out:
spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
return err;
}
static struct xfrm_policy *
-__xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir)
+xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir)
{
#ifdef CONFIG_XFRM_SUB_POLICY
struct xfrm_policy *pol;
return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir);
}
-static int flow_to_policy_dir(int dir)
-{
- if (XFRM_POLICY_IN == FLOW_DIR_IN &&
- XFRM_POLICY_OUT == FLOW_DIR_OUT &&
- XFRM_POLICY_FWD == FLOW_DIR_FWD)
- return dir;
-
- switch (dir) {
- default:
- case FLOW_DIR_IN:
- return XFRM_POLICY_IN;
- case FLOW_DIR_OUT:
- return XFRM_POLICY_OUT;
- case FLOW_DIR_FWD:
- return XFRM_POLICY_FWD;
- }
-}
-
-static struct flow_cache_object *
-xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family,
- u8 dir, struct flow_cache_object *old_obj, void *ctx)
-{
- struct xfrm_policy *pol;
-
- if (old_obj)
- xfrm_pol_put(container_of(old_obj, struct xfrm_policy, flo));
-
- pol = __xfrm_policy_lookup(net, fl, family, flow_to_policy_dir(dir));
- if (IS_ERR_OR_NULL(pol))
- return ERR_CAST(pol);
-
- /* Resolver returns two references:
- * one for cache and one for caller of flow_cache_lookup() */
- xfrm_pol_hold(pol);
-
- return &pol->flo;
-}
-
-static inline int policy_to_flow_dir(int dir)
-{
- if (XFRM_POLICY_IN == FLOW_DIR_IN &&
- XFRM_POLICY_OUT == FLOW_DIR_OUT &&
- XFRM_POLICY_FWD == FLOW_DIR_FWD)
- return dir;
- switch (dir) {
- default:
- case XFRM_POLICY_IN:
- return FLOW_DIR_IN;
- case XFRM_POLICY_OUT:
- return FLOW_DIR_OUT;
- case XFRM_POLICY_FWD:
- return FLOW_DIR_FWD;
- }
-}
-
static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
const struct flowi *fl, u16 family)
{
}
err = security_xfrm_policy_lookup(pol->security,
fl->flowi_secid,
- policy_to_flow_dir(dir));
+ dir);
if (!err) {
if (!xfrm_pol_hold_rcu(pol))
goto again;
static int
xfrm_get_saddr(struct net *net, int oif, xfrm_address_t *local,
- xfrm_address_t *remote, unsigned short family)
+ xfrm_address_t *remote, unsigned short family, u32 mark)
{
int err;
const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
if (unlikely(afinfo == NULL))
return -EINVAL;
- err = afinfo->get_saddr(net, oif, local, remote);
+ err = afinfo->get_saddr(net, oif, local, remote, mark);
rcu_read_unlock();
return err;
}
if (xfrm_addr_any(local, tmpl->encap_family)) {
error = xfrm_get_saddr(net, fl->flowi_oif,
&tmp, remote,
- tmpl->encap_family);
+ tmpl->encap_family, 0);
if (error)
goto fail;
local = &tmp;
return tos;
}
-static struct flow_cache_object *xfrm_bundle_flo_get(struct flow_cache_object *flo)
-{
- struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo);
- struct dst_entry *dst = &xdst->u.dst;
-
- if (xdst->route == NULL) {
- /* Dummy bundle - if it has xfrms we were not
- * able to build bundle as template resolution failed.
- * It means we need to try again resolving. */
- if (xdst->num_xfrms > 0)
- return NULL;
- } else if (dst->flags & DST_XFRM_QUEUE) {
- return NULL;
- } else {
- /* Real bundle */
- if (stale_bundle(dst))
- return NULL;
- }
-
- dst_hold(dst);
- return flo;
-}
-
-static int xfrm_bundle_flo_check(struct flow_cache_object *flo)
-{
- struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo);
- struct dst_entry *dst = &xdst->u.dst;
-
- if (!xdst->route)
- return 0;
- if (stale_bundle(dst))
- return 0;
-
- return 1;
-}
-
-static void xfrm_bundle_flo_delete(struct flow_cache_object *flo)
-{
- struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo);
- struct dst_entry *dst = &xdst->u.dst;
-
- /* Mark DST_OBSOLETE_DEAD to fail the next xfrm_dst_check() */
- dst->obsolete = DST_OBSOLETE_DEAD;
- dst_release_immediate(dst);
-}
-
-static const struct flow_cache_ops xfrm_bundle_fc_ops = {
- .get = xfrm_bundle_flo_get,
- .check = xfrm_bundle_flo_check,
- .delete = xfrm_bundle_flo_delete,
-};
-
static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
{
const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
struct dst_entry *dst = &xdst->u.dst;
memset(dst + 1, 0, sizeof(*xdst) - sizeof(*dst));
- xdst->flo.ops = &xfrm_bundle_fc_ops;
} else
xdst = ERR_PTR(-ENOBUFS);
if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) {
family = xfrm[i]->props.family;
dst = xfrm_dst_lookup(xfrm[i], tos, fl->flowi_oif,
- &saddr, &daddr, family);
+ &saddr, &daddr, family,
+ xfrm[i]->props.output_mark);
err = PTR_ERR(dst);
if (IS_ERR(dst))
goto put_states;
}
+static void xfrm_last_dst_update(struct xfrm_dst *xdst, struct xfrm_dst *old)
+{
+ this_cpu_write(xfrm_last_dst, xdst);
+ if (old)
+ dst_release(&old->u.dst);
+}
+
+static void __xfrm_pcpu_work_fn(void)
+{
+ struct xfrm_dst *old;
+
+ old = this_cpu_read(xfrm_last_dst);
+ if (old && !xfrm_bundle_ok(old))
+ xfrm_last_dst_update(NULL, old);
+}
+
+static void xfrm_pcpu_work_fn(struct work_struct *work)
+{
+ local_bh_disable();
+ rcu_read_lock();
+ __xfrm_pcpu_work_fn();
+ rcu_read_unlock();
+ local_bh_enable();
+}
+
+void xfrm_policy_cache_flush(void)
+{
+ struct xfrm_dst *old;
+ bool found = 0;
+ int cpu;
+
+ local_bh_disable();
+ rcu_read_lock();
+ for_each_possible_cpu(cpu) {
+ old = per_cpu(xfrm_last_dst, cpu);
+ if (old && !xfrm_bundle_ok(old)) {
+ if (smp_processor_id() == cpu) {
+ __xfrm_pcpu_work_fn();
+ continue;
+ }
+ found = true;
+ break;
+ }
+ }
+
+ rcu_read_unlock();
+ local_bh_enable();
+
+ if (!found)
+ return;
+
+ get_online_cpus();
+
+ for_each_possible_cpu(cpu) {
+ bool bundle_release;
+
+ rcu_read_lock();
+ old = per_cpu(xfrm_last_dst, cpu);
+ bundle_release = old && !xfrm_bundle_ok(old);
+ rcu_read_unlock();
+
+ if (!bundle_release)
+ continue;
+
+ if (cpu_online(cpu)) {
+ schedule_work_on(cpu, &xfrm_pcpu_work[cpu]);
+ continue;
+ }
+
+ rcu_read_lock();
+ old = per_cpu(xfrm_last_dst, cpu);
+ if (old && !xfrm_bundle_ok(old)) {
+ per_cpu(xfrm_last_dst, cpu) = NULL;
+ dst_release(&old->u.dst);
+ }
+ rcu_read_unlock();
+ }
+
+ put_online_cpus();
+}
+
+static bool xfrm_pol_dead(struct xfrm_dst *xdst)
+{
+ unsigned int num_pols = xdst->num_pols;
+ unsigned int pol_dead = 0, i;
+
+ for (i = 0; i < num_pols; i++)
+ pol_dead |= xdst->pols[i]->walk.dead;
+
+ /* Mark DST_OBSOLETE_DEAD to fail the next xfrm_dst_check() */
+ if (pol_dead)
+ xdst->u.dst.obsolete = DST_OBSOLETE_DEAD;
+
+ return pol_dead;
+}
+
static struct xfrm_dst *
xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols,
const struct flowi *fl, u16 family,
{
struct net *net = xp_net(pols[0]);
struct xfrm_state *xfrm[XFRM_MAX_DEPTH];
+ struct xfrm_dst *xdst, *old;
struct dst_entry *dst;
- struct xfrm_dst *xdst;
int err;
+ xdst = this_cpu_read(xfrm_last_dst);
+ if (xdst &&
+ xdst->u.dst.dev == dst_orig->dev &&
+ xdst->num_pols == num_pols &&
+ !xfrm_pol_dead(xdst) &&
+ memcmp(xdst->pols, pols,
+ sizeof(struct xfrm_policy *) * num_pols) == 0 &&
+ xfrm_bundle_ok(xdst)) {
+ dst_hold(&xdst->u.dst);
+ return xdst;
+ }
+
+ old = xdst;
/* Try to instantiate a bundle */
err = xfrm_tmpl_resolve(pols, num_pols, fl, xfrm, family);
if (err <= 0) {
memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols);
xdst->policy_genid = atomic_read(&pols[0]->genid);
+ atomic_set(&xdst->u.dst.__refcnt, 2);
+ xfrm_last_dst_update(xdst, old);
+
return xdst;
}
goto out;
}
-static struct flow_cache_object *
-xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir,
- struct flow_cache_object *oldflo, void *ctx)
+static struct xfrm_dst *
+xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, struct xfrm_flo *xflo)
{
- struct xfrm_flo *xflo = (struct xfrm_flo *)ctx;
struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
- struct xfrm_dst *xdst, *new_xdst;
- int num_pols = 0, num_xfrms = 0, i, err, pol_dead;
-
- /* Check if the policies from old bundle are usable */
- xdst = NULL;
- if (oldflo) {
- xdst = container_of(oldflo, struct xfrm_dst, flo);
- num_pols = xdst->num_pols;
- num_xfrms = xdst->num_xfrms;
- pol_dead = 0;
- for (i = 0; i < num_pols; i++) {
- pols[i] = xdst->pols[i];
- pol_dead |= pols[i]->walk.dead;
- }
- if (pol_dead) {
- /* Mark DST_OBSOLETE_DEAD to fail the next
- * xfrm_dst_check()
- */
- xdst->u.dst.obsolete = DST_OBSOLETE_DEAD;
- dst_release_immediate(&xdst->u.dst);
- xdst = NULL;
- num_pols = 0;
- num_xfrms = 0;
- oldflo = NULL;
- }
- }
+ int num_pols = 0, num_xfrms = 0, err;
+ struct xfrm_dst *xdst;
/* Resolve policies to use if we couldn't get them from
* previous cache entry */
- if (xdst == NULL) {
- num_pols = 1;
- pols[0] = __xfrm_policy_lookup(net, fl, family,
- flow_to_policy_dir(dir));
- err = xfrm_expand_policies(fl, family, pols,
+ num_pols = 1;
+ pols[0] = xfrm_policy_lookup(net, fl, family, dir);
+ err = xfrm_expand_policies(fl, family, pols,
&num_pols, &num_xfrms);
- if (err < 0)
- goto inc_error;
- if (num_pols == 0)
- return NULL;
- if (num_xfrms <= 0)
- goto make_dummy_bundle;
- }
+ if (err < 0)
+ goto inc_error;
+ if (num_pols == 0)
+ return NULL;
+ if (num_xfrms <= 0)
+ goto make_dummy_bundle;
- new_xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family,
+ xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family,
xflo->dst_orig);
- if (IS_ERR(new_xdst)) {
- err = PTR_ERR(new_xdst);
+ if (IS_ERR(xdst)) {
+ err = PTR_ERR(xdst);
if (err != -EAGAIN)
goto error;
- if (oldflo == NULL)
- goto make_dummy_bundle;
- dst_hold(&xdst->u.dst);
- return oldflo;
- } else if (new_xdst == NULL) {
+ goto make_dummy_bundle;
+ } else if (xdst == NULL) {
num_xfrms = 0;
- if (oldflo == NULL)
- goto make_dummy_bundle;
- xdst->num_xfrms = 0;
- dst_hold(&xdst->u.dst);
- return oldflo;
- }
-
- /* Kill the previous bundle */
- if (xdst) {
- /* The policies were stolen for newly generated bundle */
- xdst->num_pols = 0;
- /* Mark DST_OBSOLETE_DEAD to fail the next xfrm_dst_check() */
- xdst->u.dst.obsolete = DST_OBSOLETE_DEAD;
- dst_release_immediate(&xdst->u.dst);
+ goto make_dummy_bundle;
}
- /* We do need to return one reference for original caller */
- dst_hold(&new_xdst->u.dst);
- return &new_xdst->flo;
+ return xdst;
make_dummy_bundle:
/* We found policies, but there's no bundles to instantiate:
memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols);
dst_hold(&xdst->u.dst);
- return &xdst->flo;
+ return xdst;
inc_error:
XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
error:
- if (xdst != NULL) {
- /* Mark DST_OBSOLETE_DEAD to fail the next xfrm_dst_check() */
- xdst->u.dst.obsolete = DST_OBSOLETE_DEAD;
- dst_release_immediate(&xdst->u.dst);
- } else
- xfrm_pols_put(pols, num_pols);
+ xfrm_pols_put(pols, num_pols);
return ERR_PTR(err);
}
const struct sock *sk, int flags)
{
struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
- struct flow_cache_object *flo;
struct xfrm_dst *xdst;
struct dst_entry *dst, *route;
u16 family = dst_orig->ops->family;
- u8 dir = policy_to_flow_dir(XFRM_POLICY_OUT);
+ u8 dir = XFRM_POLICY_OUT;
int i, err, num_pols, num_xfrms = 0, drop_pols = 0;
dst = NULL;
goto no_transform;
}
- dst_hold(&xdst->u.dst);
route = xdst->route;
}
}
!net->xfrm.policy_count[XFRM_POLICY_OUT])
goto nopol;
- flo = flow_cache_lookup(net, fl, family, dir,
- xfrm_bundle_lookup, &xflo);
- if (flo == NULL)
+ xdst = xfrm_bundle_lookup(net, fl, family, dir, &xflo);
+ if (xdst == NULL)
goto nopol;
- if (IS_ERR(flo)) {
- err = PTR_ERR(flo);
+ if (IS_ERR(xdst)) {
+ err = PTR_ERR(xdst);
goto dropdst;
}
- xdst = container_of(flo, struct xfrm_dst, flo);
num_pols = xdst->num_pols;
num_xfrms = xdst->num_xfrms;
int pi;
int reverse;
struct flowi fl;
- u8 fl_dir;
int xerr_idx = -1;
reverse = dir & ~XFRM_POLICY_MASK;
dir &= XFRM_POLICY_MASK;
- fl_dir = policy_to_flow_dir(dir);
if (__xfrm_decode_session(skb, &fl, family, reverse) < 0) {
XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR);
}
}
- if (!pol) {
- struct flow_cache_object *flo;
-
- flo = flow_cache_lookup(net, &fl, family, fl_dir,
- xfrm_policy_lookup, NULL);
- if (IS_ERR_OR_NULL(flo))
- pol = ERR_CAST(flo);
- else
- pol = container_of(flo, struct xfrm_policy, flo);
- }
+ if (!pol)
+ pol = xfrm_policy_lookup(net, &fl, family, dir);
if (IS_ERR(pol)) {
XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
* notice. That's what we are validating here via the
* stale_bundle() check.
*
- * When an xdst is removed from flow cache, DST_OBSOLETE_DEAD will
- * be marked on it.
* When a dst is removed from the fib tree, DST_OBSOLETE_DEAD will
* be marked on it.
- * Both will force stable_bundle() to fail on any xdst bundle with
+ * This will force stale_bundle() to fail on any xdst bundle with
* this dst linked in it.
*/
if (dst->obsolete < 0 && !stale_bundle(dst))
return dst;
}
-void xfrm_garbage_collect(struct net *net)
-{
- flow_cache_flush(net);
-}
-EXPORT_SYMBOL(xfrm_garbage_collect);
-
-void xfrm_garbage_collect_deferred(struct net *net)
-{
- flow_cache_flush_deferred(net);
-}
-EXPORT_SYMBOL(xfrm_garbage_collect_deferred);
-
static void xfrm_init_pmtu(struct dst_entry *dst)
{
do {
rv = xfrm_sysctl_init(net);
if (rv < 0)
goto out_sysctl;
- rv = flow_cache_init(net);
- if (rv < 0)
- goto out;
return 0;
-out:
- xfrm_sysctl_fini(net);
out_sysctl:
xfrm_policy_fini(net);
out_policy:
static void __net_exit xfrm_net_exit(struct net *net)
{
- flow_cache_fini(net);
xfrm_sysctl_fini(net);
xfrm_policy_fini(net);
xfrm_state_fini(net);
void __init xfrm_init(void)
{
- flow_cache_hp_init();
+ int i;
+
+ xfrm_pcpu_work = kmalloc_array(NR_CPUS, sizeof(*xfrm_pcpu_work),
+ GFP_KERNEL);
+ BUG_ON(!xfrm_pcpu_work);
+
+ for (i = 0; i < NR_CPUS; i++)
+ INIT_WORK(&xfrm_pcpu_work[i], xfrm_pcpu_work_fn);
+
register_pernet_subsys(&xfrm_net_ops);
seqcount_init(&xfrm_policy_hash_generation);
xfrm_input_init();
struct xfrm_state *x_new[XFRM_MAX_DEPTH];
struct xfrm_migrate *mp;
+ /* Stage 0 - sanity checks */
if ((err = xfrm_migrate_check(m, num_migrate)) < 0)
goto out;
+ if (dir >= XFRM_POLICY_MAX) {
+ err = -EINVAL;
+ goto out;
+ }
+
/* Stage 1 - find policy */
if ((pol = xfrm_migrate_policy_find(sel, dir, type, net)) == NULL) {
err = -ENOENT;
}
EXPORT_SYMBOL(xfrm_unregister_type_offload);
-static const struct xfrm_type_offload *xfrm_get_type_offload(u8 proto, unsigned short family)
+static const struct xfrm_type_offload *
+xfrm_get_type_offload(u8 proto, unsigned short family, bool try_load)
{
struct xfrm_state_afinfo *afinfo;
const struct xfrm_type_offload **typemap;
const struct xfrm_type_offload *type;
+retry:
afinfo = xfrm_state_get_afinfo(family);
if (unlikely(afinfo == NULL))
return NULL;
if ((type && !try_module_get(type->owner)))
type = NULL;
+ if (!type && try_load) {
+ request_module("xfrm-offload-%d-%d", family, proto);
+ try_load = 0;
+ goto retry;
+ }
+
rcu_read_unlock();
return type;
}
}
}
}
- if (cnt)
+ if (cnt) {
err = 0;
-
+ xfrm_policy_cache_flush();
+ }
out:
spin_unlock_bh(&net->xfrm.xfrm_state_lock);
return err;
xfrm_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n,
unsigned short family, struct net *net)
{
+ int i;
int err = 0;
struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
if (!afinfo)
spin_lock_bh(&net->xfrm.xfrm_state_lock); /*FIXME*/
if (afinfo->tmpl_sort)
err = afinfo->tmpl_sort(dst, src, n);
+ else
+ for (i = 0; i < n; i++)
+ dst[i] = src[i];
spin_unlock_bh(&net->xfrm.xfrm_state_lock);
rcu_read_unlock();
return err;
xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n,
unsigned short family)
{
+ int i;
int err = 0;
struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
struct net *net = xs_net(*src);
spin_lock_bh(&net->xfrm.xfrm_state_lock);
if (afinfo->state_sort)
err = afinfo->state_sort(dst, src, n);
+ else
+ for (i = 0; i < n; i++)
+ dst[i] = src[i];
spin_unlock_bh(&net->xfrm.xfrm_state_lock);
rcu_read_unlock();
return err;
return mtu - x->props.header_len;
}
-int __xfrm_init_state(struct xfrm_state *x, bool init_replay)
+int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload)
{
struct xfrm_state_afinfo *afinfo;
struct xfrm_mode *inner_mode;
if (x->type == NULL)
goto error;
- x->type_offload = xfrm_get_type_offload(x->id.proto, family);
+ x->type_offload = xfrm_get_type_offload(x->id.proto, family, offload);
err = x->type->init_state(x);
if (err)
int xfrm_init_state(struct xfrm_state *x)
{
- return __xfrm_init_state(x, true);
+ return __xfrm_init_state(x, true, false);
}
EXPORT_SYMBOL(xfrm_init_state);
xfrm_mark_get(attrs, &x->mark);
- err = __xfrm_init_state(x, false);
+ if (attrs[XFRMA_OUTPUT_MARK])
+ x->props.output_mark = nla_get_u32(attrs[XFRMA_OUTPUT_MARK]);
+
+ err = __xfrm_init_state(x, false, attrs[XFRMA_OFFLOAD_DEV]);
if (err)
goto error;
return -EMSGSIZE;
xuo = nla_data(attr);
-
+ memset(xuo, 0, sizeof(*xuo));
xuo->ifindex = xso->dev->ifindex;
xuo->flags = xso->flags;
ret = copy_user_offload(&x->xso, skb);
if (ret)
goto out;
+ if (x->props.output_mark) {
+ ret = nla_put_u32(skb, XFRMA_OUTPUT_MARK, x->props.output_mark);
+ if (ret)
+ goto out;
+ }
if (x->security)
ret = copy_sec_ctx(x->security, skb);
out:
out:
xfrm_pol_put(xp);
- if (delete && err == 0)
- xfrm_garbage_collect(net);
return err;
}
return -EMSGSIZE;
id = nlmsg_data(nlh);
+ memset(&id->sa_id, 0, sizeof(id->sa_id));
memcpy(&id->sa_id.daddr, &x->id.daddr, sizeof(x->id.daddr));
id->sa_id.spi = x->id.spi;
id->sa_id.family = x->props.family;
return 0;
return err;
}
- xfrm_garbage_collect(net);
c.data.type = type;
c.event = nlh->nlmsg_type;
[XFRMA_PROTO] = { .type = NLA_U8 },
[XFRMA_ADDRESS_FILTER] = { .len = sizeof(struct xfrm_address_filter) },
[XFRMA_OFFLOAD_DEV] = { .len = sizeof(struct xfrm_user_offload) },
+ [XFRMA_OUTPUT_MARK] = { .len = NLA_U32 },
};
static const struct nla_policy xfrma_spd_policy[XFRMA_SPD_MAX+1] = {
ue = nlmsg_data(nlh);
copy_to_user_state(x, &ue->state);
ue->hard = (c->data.hard != 0) ? 1 : 0;
+ /* clear the padding bytes */
+ memset(&ue->hard + 1, 0, sizeof(*ue) - offsetofend(typeof(*ue), hard));
err = xfrm_mark_put(skb, &x->mark);
if (err)
l += nla_total_size(sizeof(x->props.extra_flags));
if (x->xso.dev)
l += nla_total_size(sizeof(x->xso));
+ if (x->props.output_mark)
+ l += nla_total_size(sizeof(x->props.output_mark));
/* Must count x->lastused as it may become non-zero behind our back. */
l += nla_total_size_64bit(sizeof(u64));
struct nlattr *attr;
id = nlmsg_data(nlh);
+ memset(id, 0, sizeof(*id));
memcpy(&id->daddr, &x->id.daddr, sizeof(id->daddr));
id->spi = x->id.spi;
id->family = x->props.family;