From 9f9e9cd50eac6ad09cb053509f2e764bddc05f18 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Tue, 28 Nov 2017 11:03:37 +0200 Subject: [PATCH] net/mlx5e: Remove page_ref bulking in Striding RQ When many packets reside on the same page, the bulking of page_ref modifications reduces the total number of atomic operations executed. Besides the necessary 2 operations on page alloc/free, we have the following extra ops per page: - one on WQE allocation (bump refcnt to maximum possible), - zero ops for SKBs, - one on WQE free, a constant of two operations in total, no matter how many packets/SKBs actually populate the page. Without this bulking, we have: - no ops on WQE allocation or free, - one op per SKB, Comparing the two methods when PAGE_SIZE is 4K: - As mentioned above, bulking method always executes 2 operations, not more, but not less. - In the default MTU configuration (1500, stride size is 2K), the non-bulking method execute 2 ops as well. - For larger MTUs with stride size of 4K, non-bulking method executes only a single op. - For XDP (stride size of 4K, no SKBs), non-bulking method executes no ops at all! Hence, to optimize the flows with linear SKB and XDP over Striding RQ, we here remove the page_ref bulking method. Performance testing: ConnectX-5, Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz. Single core packet rate (64 bytes). Early drop in TC: no degradation. XDP_DROP: before: 14,270,188 pps after: 20,503,603 pps, 43% improvement. Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 1 - .../net/ethernet/mellanox/mlx5/core/en_rx.c | 47 +++++++------------ 2 files changed, 16 insertions(+), 32 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 7997d7c159db..5853de4e4fc7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -456,7 +456,6 @@ struct mlx5e_umr_dma_info { struct mlx5e_mpw_info { struct mlx5e_umr_dma_info umr; u16 consumed_strides; - u16 skbs_frags[MLX5_MPWRQ_PAGES_PER_WQE]; DECLARE_BITMAP(xdp_xmit_bitmap, MLX5_MPWRQ_PAGES_PER_WQE); }; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 1da79cab1838..9bb47a6d40f1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -296,37 +296,28 @@ void mlx5e_dealloc_rx_wqe(struct mlx5e_rq *rq, u16 ix) mlx5e_free_rx_wqe(rq, wi); } -static inline int mlx5e_mpwqe_strides_per_page(struct mlx5e_rq *rq) -{ - return rq->mpwqe.num_strides >> MLX5_MPWRQ_WQE_PAGE_ORDER; -} - static inline void mlx5e_add_skb_frag_mpwqe(struct mlx5e_rq *rq, struct sk_buff *skb, - struct mlx5e_mpw_info *wi, - u32 page_idx, u32 frag_offset, - u32 len) + struct mlx5e_dma_info *di, + u32 frag_offset, u32 len) { unsigned int truesize = ALIGN(len, BIT(rq->mpwqe.log_stride_sz)); dma_sync_single_for_cpu(rq->pdev, - wi->umr.dma_info[page_idx].addr + frag_offset, + di->addr + frag_offset, len, DMA_FROM_DEVICE); - wi->skbs_frags[page_idx]++; + page_ref_inc(di->page); skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, - wi->umr.dma_info[page_idx].page, frag_offset, - len, truesize); + di->page, frag_offset, len, truesize); } static inline void mlx5e_copy_skb_header_mpwqe(struct device *pdev, struct sk_buff *skb, - struct mlx5e_mpw_info *wi, - u32 page_idx, u32 offset, - u32 headlen) + struct mlx5e_dma_info *dma_info, + u32 offset, u32 headlen) { u16 headlen_pg = min_t(u32, headlen, PAGE_SIZE - offset); - struct mlx5e_dma_info *dma_info = &wi->umr.dma_info[page_idx]; unsigned int len; /* Aligning len to sizeof(long) optimizes memcpy performance */ @@ -351,15 +342,12 @@ void mlx5e_free_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi) { const bool no_xdp_xmit = bitmap_empty(wi->xdp_xmit_bitmap, MLX5_MPWRQ_PAGES_PER_WQE); - int pg_strides = mlx5e_mpwqe_strides_per_page(rq); struct mlx5e_dma_info *dma_info = wi->umr.dma_info; int i; - for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) { - page_ref_sub(dma_info[i].page, pg_strides - wi->skbs_frags[i]); + for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) if (no_xdp_xmit || !test_bit(i, wi->xdp_xmit_bitmap)) mlx5e_page_release(rq, &dma_info[i], true); - } } static void mlx5e_post_rx_mpwqe(struct mlx5e_rq *rq) @@ -380,7 +368,6 @@ static void mlx5e_post_rx_mpwqe(struct mlx5e_rq *rq) static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) { struct mlx5e_mpw_info *wi = &rq->mpwqe.info[ix]; - int pg_strides = mlx5e_mpwqe_strides_per_page(rq); struct mlx5e_dma_info *dma_info = &wi->umr.dma_info[0]; struct mlx5e_icosq *sq = &rq->channel->icosq; struct mlx5_wq_cyc *wq = &sq->wq; @@ -403,10 +390,8 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) if (unlikely(err)) goto err_unmap; umr_wqe->inline_mtts[i].ptag = cpu_to_be64(dma_info->addr | MLX5_EN_WR); - page_ref_add(dma_info->page, pg_strides); } - memset(wi->skbs_frags, 0, sizeof(*wi->skbs_frags) * MLX5_MPWRQ_PAGES_PER_WQE); bitmap_zero(wi->xdp_xmit_bitmap, MLX5_MPWRQ_PAGES_PER_WQE); wi->consumed_strides = 0; @@ -425,7 +410,6 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) err_unmap: while (--i >= 0) { dma_info--; - page_ref_sub(dma_info->page, pg_strides); mlx5e_page_release(rq, dma_info, true); } rq->stats.buff_alloc_err++; @@ -987,9 +971,10 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w u16 cqe_bcnt, u32 head_offset, u32 page_idx) { u16 headlen = min_t(u16, MLX5_MPWRQ_SMALL_PACKET_THRESHOLD, cqe_bcnt); + struct mlx5e_dma_info *di = &wi->umr.dma_info[page_idx]; u32 frag_offset = head_offset + headlen; - u16 byte_cnt = cqe_bcnt - headlen; - u32 head_page_idx = page_idx; + u32 byte_cnt = cqe_bcnt - headlen; + struct mlx5e_dma_info *head_di = di; struct sk_buff *skb; skb = napi_alloc_skb(rq->cq.napi, @@ -1002,7 +987,7 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w prefetchw(skb->data); if (unlikely(frag_offset >= PAGE_SIZE)) { - page_idx++; + di++; frag_offset -= PAGE_SIZE; } @@ -1010,14 +995,14 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w u32 pg_consumed_bytes = min_t(u32, PAGE_SIZE - frag_offset, byte_cnt); - mlx5e_add_skb_frag_mpwqe(rq, skb, wi, page_idx, frag_offset, + mlx5e_add_skb_frag_mpwqe(rq, skb, di, frag_offset, pg_consumed_bytes); byte_cnt -= pg_consumed_bytes; frag_offset = 0; - page_idx++; + di++; } /* copy header */ - mlx5e_copy_skb_header_mpwqe(rq->pdev, skb, wi, head_page_idx, + mlx5e_copy_skb_header_mpwqe(rq->pdev, skb, head_di, head_offset, headlen); /* skb linear part was allocated with headlen and aligned to long */ skb->tail += headlen; @@ -1060,7 +1045,7 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, return NULL; /* queue up for recycling/reuse */ - wi->skbs_frags[page_idx]++; + page_ref_inc(di->page); return skb; } -- 2.45.2