Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/dledford/rdma
author    Linus Torvalds <torvalds@linux-foundation.org>
Tue, 22 Mar 2016 22:48:44 +0000 (15:48 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 22 Mar 2016 22:48:44 +0000 (15:48 -0700)
Pull more rdma updates from Doug Ledford:
 "Round two of 4.6 merge window patches.

  This is a monster pull request.  I held off on the hfi1 driver updates
  (the hfi1 driver is intimately tied to the qib driver and the new
  rdmavt software library that was created to help both of them) in my
  first pull request.  The hfi1/qib/rdmavt update is probably 90% of
  this pull request.  The hfi1 driver is being left in staging so that
  it can be fixed up with regard to the API that Al and you didn't
  like.  Intel has agreed to do the work, but in the meantime, this
  clears out 300+ patches in the backlog queue and brings my tree and
  their tree closer to sync.

  This also includes about 10 patches to the core and a few to mlx5 to
  create an infrastructure for configuring SRIOV ports on IB devices.
  That series includes one patch to the net core that we sent to netdev@
  and Dave Miller with each of the three revisions to the series.  We
  didn't get any response to the patch, so we took that as implicit
  approval.

  Finally, this series includes Intel's new iWARP driver for their X722
  cards.  It's not nearly the beast that the hfi1 driver is.  It also had
  a linux-next merge issue, but that has been resolved and it now passes
  just fine.

  Summary:

   - A few minor core fixups needed for the next patch series

   - The IB SRIOV series.  This has bounced around for several versions.
     Of note is the fact that the first patch in this series affects the
     net core.  It was directed to netdev and DaveM for each iteration
     of the series (three versions total).  Dave did not object, but did
     not respond either.  I've taken this as permission to move forward
     with the series.

   - The new Intel X722 iWARP driver

   - A huge set of updates to the Intel hfi1 driver.  Of particular
     interest here is that we have left the driver in staging since it
     still has an API that people object to.  Intel is working on a fix,
     but getting these patches in now helps keep me sane as the upstream
     and Intel's trees were over 300 patches apart"
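
The SRIOV infrastructure described above exposes the new VF GUID attributes
through rtnetlink and lets RDMA providers service them via new ib_device
callbacks (the mlx5 callback wiring is visible in the diff further down).
The sketch below is only an illustration, not code from this pull: one
plausible way a provider's ndo_set_vf_guid handler could forward a request
to the IB core helper added by this series.  The struct layout and my_*
names are hypothetical, and the helper and attribute names are assumed from
the 4.6-era API.

/*
 * Hedged sketch: forward an rtnetlink VF GUID change into the RDMA core.
 * Assumes ib_set_vf_guid() and the IFLA_VF_IB_*_GUID attributes introduced
 * by this series; my_rdma_netdev_priv is a made-up private structure.
 */
#include <linux/netdevice.h>
#include <linux/if_link.h>
#include <rdma/ib_verbs.h>

struct my_rdma_netdev_priv {
        struct ib_device *ca;   /* underlying RDMA device */
        u8 port;                /* device port backing this netdev */
};

static int my_ndo_set_vf_guid(struct net_device *dev, int vf,
                              u64 guid, int guid_type)
{
        struct my_rdma_netdev_priv *priv = netdev_priv(dev);

        /* Only the two GUID attributes added by this series are meaningful. */
        if (guid_type != IFLA_VF_IB_NODE_GUID &&
            guid_type != IFLA_VF_IB_PORT_GUID)
                return -EINVAL;

        /* ib_set_vf_guid() dispatches to the driver's set_vf_guid callback
         * (see the mlx5 callback assignments in the diff below).
         */
        return ib_set_vf_guid(priv->ca, vf, priv->port, guid, guid_type);
}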

* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/dledford/rdma: (362 commits)
  IB/ipoib: Allow mcast packets from other VFs
  IB/mlx5: Implement callbacks for manipulating VFs
  net/mlx5_core: Implement modify HCA vport command
  net/mlx5_core: Add VF param when querying vport counter
  IB/ipoib: Add ndo operations for configuring VFs
  IB/core: Add interfaces to control VF attributes
  IB/core: Support accessing SA in virtualized environment
  IB/core: Add subnet prefix to port info
  IB/mlx5: Fix decision on using MAD_IFC
  net/core: Add support for configuring VF GUIDs
  IB/{core, ulp} Support above 32 possible device capability flags
  IB/core: Replace setting the zero values in ib_uverbs_ex_query_device
  net/mlx5_core: Introduce offload arithmetic hardware capabilities
  net/mlx5_core: Refactor device capability function
  net/mlx5_core: Fix caching ATOMIC endian mode capability
  ib_srpt: fix a WARN_ON() message
  i40iw: Replace the obsolete crypto hash interface with shash
  IB/hfi1: Add SDMA cache eviction algorithm
  IB/hfi1: Switch to using the pin query function
  IB/hfi1: Specify mm when releasing pages
  ...

27 files changed:
MAINTAINERS
drivers/infiniband/core/sa_query.c
drivers/infiniband/hw/mlx5/main.c
drivers/infiniband/ulp/srpt/ib_srpt.c
drivers/net/ethernet/intel/i40e/i40e.h
drivers/net/ethernet/intel/i40e/i40e_main.c
drivers/net/ethernet/intel/i40e/i40e_type.h
drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h
drivers/net/ethernet/mellanox/mlx5/core/cmd.c
drivers/net/ethernet/mellanox/mlx5/core/main.c
drivers/staging/rdma/hfi1/chip.c
drivers/staging/rdma/hfi1/diag.c
drivers/staging/rdma/hfi1/driver.c
drivers/staging/rdma/hfi1/efivar.c
drivers/staging/rdma/hfi1/file_ops.c
drivers/staging/rdma/hfi1/init.c
drivers/staging/rdma/hfi1/mad.c
drivers/staging/rdma/hfi1/pcie.c
drivers/staging/rdma/hfi1/pio_copy.c
drivers/staging/rdma/hfi1/user_sdma.c
include/linux/mlx5/device.h
include/linux/mlx5/driver.h
include/linux/mlx5/mlx5_ifc.h
include/linux/netdevice.h
include/uapi/linux/if_link.h
net/core/rtnetlink.c

diff --combined MAINTAINERS
index 0f3063cce44cff0e957c86ef30634433e8c96b13,c62557e6893edef161242c4417c2a38d71951263..32bafda47c2fedacaf2f5b4a71347cc5889dc773
@@@ -151,7 -151,7 +151,7 @@@ S: Maintaine
  F:    drivers/scsi/53c700*
  
  6LOWPAN GENERIC (BTLE/IEEE 802.15.4)
 -M:    Alexander Aring <alex.aring@gmail.com>
 +M:    Alexander Aring <aar@pengutronix.de>
  M:    Jukka Rissanen <jukka.rissanen@linux.intel.com>
  L:    linux-bluetooth@vger.kernel.org
  L:    linux-wpan@vger.kernel.org
@@@ -238,12 -238,6 +238,12 @@@ L:       lm-sensors@lm-sensors.or
  S:    Maintained
  F:    drivers/hwmon/abituguru3.c
  
 +ACCES 104-DIO-48E GPIO DRIVER
 +M:    William Breathitt Gray <vilhelm.gray@gmail.com>
 +L:    linux-gpio@vger.kernel.org
 +S:    Maintained
 +F:    drivers/gpio/gpio-104-dio-48e.c
 +
  ACCES 104-IDI-48 GPIO DRIVER
  M:    "William Breathitt Gray" <vilhelm.gray@gmail.com>
  L:    linux-gpio@vger.kernel.org
@@@ -679,19 -673,11 +679,19 @@@ F:      drivers/gpu/drm/radeon/radeon_kfd.
  F:    drivers/gpu/drm/radeon/radeon_kfd.h
  F:    include/uapi/linux/kfd_ioctl.h
  
 +AMD SEATTLE DEVICE TREE SUPPORT
 +M:    Brijesh Singh <brijeshkumar.singh@amd.com>
 +M:    Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
 +M:    Tom Lendacky <thomas.lendacky@amd.com>
 +S:    Supported
 +F:    arch/arm64/boot/dts/amd/
 +
  AMD XGBE DRIVER
  M:    Tom Lendacky <thomas.lendacky@amd.com>
  L:    netdev@vger.kernel.org
  S:    Supported
  F:    drivers/net/ethernet/amd/xgbe/
 +F:    arch/arm64/boot/dts/amd/amd-seattle-xgbe*.dtsi
  
  AMS (Apple Motion Sensor) DRIVER
  M:    Michael Hanselmann <linux-kernel@hansmi.ch>
@@@ -783,12 -769,6 +783,12 @@@ L:       alsa-devel@alsa-project.org (moderat
  S:    Maintained
  F:    sound/aoa/
  
 +APEX EMBEDDED SYSTEMS STX104 DAC DRIVER
 +M:    William Breathitt Gray <vilhelm.gray@gmail.com>
 +L:    linux-iio@vger.kernel.org
 +S:    Maintained
 +F:    drivers/iio/dac/stx104.c
 +
  APM DRIVER
  M:    Jiri Kosina <jikos@kernel.org>
  S:    Odd fixes
@@@ -847,12 -827,6 +847,12 @@@ S:       Maintaine
  F:    drivers/net/arcnet/
  F:    include/uapi/linux/if_arcnet.h
  
 +ARM HDLCD DRM DRIVER
 +M:    Liviu Dudau <liviu.dudau@arm.com>
 +S:    Supported
 +F:    drivers/gpu/drm/arm/
 +F:    Documentation/devicetree/bindings/display/arm,hdlcd.txt
 +
  ARM MFM AND FLOPPY DRIVERS
  M:    Ian Molton <spyro@f2s.com>
  S:    Maintained
@@@ -965,16 -939,6 +965,16 @@@ F:       arch/arm/boot/dts/alpine
  F:    arch/arm64/boot/dts/al/
  F:    drivers/*/*alpine*
  
 +ARM/ARTPEC MACHINE SUPPORT
 +M:    Jesper Nilsson <jesper.nilsson@axis.com>
 +M:    Lars Persson <lars.persson@axis.com>
 +M:    Niklas Cassel <niklas.cassel@axis.com>
 +S:    Maintained
 +L:    linux-arm-kernel@axis.com
 +F:    arch/arm/mach-artpec
 +F:    arch/arm/boot/dts/artpec6*
 +F:    drivers/clk/clk-artpec6.c
 +
  ARM/ATMEL AT91RM9200, AT91SAM9 AND SAMA5 SOC SUPPORT
  M:    Nicolas Ferre <nicolas.ferre@atmel.com>
  M:    Alexandre Belloni <alexandre.belloni@free-electrons.com>
@@@ -1321,7 -1285,6 +1321,7 @@@ F:      arch/arm/mach-mvebu
  F:    drivers/rtc/rtc-armada38x.c
  F:    arch/arm/boot/dts/armada*
  F:    arch/arm/boot/dts/kirkwood*
 +F:    arch/arm64/boot/dts/marvell/armada*
  
  
  ARM/Marvell Berlin SoC support
@@@ -1540,7 -1503,6 +1540,7 @@@ F:      arch/arm/mach-s5p*
  F:    arch/arm/mach-exynos*/
  F:    drivers/*/*s3c2410*
  F:    drivers/*/*/*s3c2410*
 +F:    drivers/soc/samsung/*
  F:    drivers/spi/spi-s3c*
  F:    sound/soc/samsung/*
  F:    Documentation/arm/Samsung/
@@@ -1838,13 -1800,11 +1838,13 @@@ F:   drivers/edac/synopsys_edac.
  
  ARM SMMU DRIVERS
  M:    Will Deacon <will.deacon@arm.com>
 +R:    Robin Murphy <robin.murphy@arm.com>
  L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
  S:    Maintained
  F:    drivers/iommu/arm-smmu.c
  F:    drivers/iommu/arm-smmu-v3.c
  F:    drivers/iommu/io-pgtable-arm.c
 +F:    drivers/iommu/io-pgtable-arm-v7s.c
  
  ARM64 PORT (AARCH64 ARCHITECTURE)
  M:    Catalin Marinas <catalin.marinas@arm.com>
@@@ -1996,12 -1956,6 +1996,12 @@@ M:    Nicolas Ferre <nicolas.ferre@atmel.c
  S:    Supported
  F:    drivers/tty/serial/atmel_serial.c
  
 +ATMEL SAMA5D2 ADC DRIVER
 +M:    Ludovic Desroches <ludovic.desroches@atmel.com>
 +L:    linux-iio@vger.kernel.org
 +S:    Supported
 +F:    drivers/iio/adc/at91-sama5d2_adc.c
 +
  ATMEL Audio ALSA driver
  M:    Nicolas Ferre <nicolas.ferre@atmel.com>
  L:    alsa-devel@alsa-project.org (moderated for non-subscribers)
@@@ -2204,8 -2158,7 +2204,8 @@@ M:      Marek Lindner <mareklindner@neomailb
  M:    Simon Wunderlich <sw@simonwunderlich.de>
  M:    Antonio Quartulli <a@unstable.cc>
  L:    b.a.t.m.a.n@lists.open-mesh.org
 -W:    http://www.open-mesh.org/
 +W:    https://www.open-mesh.org/
 +Q:    https://patchwork.open-mesh.org/project/batman/list/
  S:    Maintained
  F:    net/batman-adv/
  
@@@ -2434,9 -2387,8 +2434,9 @@@ F:      arch/arm/boot/dts/bcm470
  
  BROADCOM BCM63XX ARM ARCHITECTURE
  M:    Florian Fainelli <f.fainelli@gmail.com>
 -L:    linux-arm-kernel@lists.infradead.org
 -T:    git git://github.com/broadcom/arm-bcm63xx.git
 +L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 +L:    bcm-kernel-feedback-list@broadcom.com
 +T:    git git://github.com/broadcom/stblinux.git
  S:    Maintained
  F:    arch/arm/mach-bcm/bcm63xx.c
  F:    arch/arm/include/debug/bcm63xx.S
@@@ -2470,14 -2422,12 +2470,14 @@@ F:   arch/mips/bmips/
  F:    arch/mips/include/asm/mach-bmips/*
  F:    arch/mips/kernel/*bmips*
  F:    arch/mips/boot/dts/brcm/bcm*.dts*
 +F:    drivers/irqchip/irq-bcm63*
  F:    drivers/irqchip/irq-bcm7*
  F:    drivers/irqchip/irq-brcmstb*
  F:    include/linux/bcm963xx_nvram.h
  F:    include/linux/bcm963xx_tag.h
  
  BROADCOM TG3 GIGABIT ETHERNET DRIVER
 +M:    Siva Reddy Kallam <siva.kallam@broadcom.com>
  M:    Prashant Sreedharan <prashant@broadcom.com>
  M:    Michael Chan <mchan@broadcom.com>
  L:    netdev@vger.kernel.org
@@@ -2569,13 -2519,6 +2569,13 @@@ L:    netdev@vger.kernel.or
  S:    Supported
  F:    drivers/net/ethernet/broadcom/bcmsysport.*
  
 +BROADCOM VULCAN ARM64 SOC
 +M:    Jayachandran C. <jchandra@broadcom.com>
 +L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 +L:    bcm-kernel-feedback-list@broadcom.com
 +S:    Maintained
 +F:    arch/arm64/boot/dts/broadcom/vulcan*
 +
  BROCADE BFA FC SCSI DRIVER
  M:    Anil Gurumurthy <anil.gurumurthy@qlogic.com>
  M:    Sudarsana Kalluru <sudarsana.kalluru@qlogic.com>
@@@ -3561,14 -3504,6 +3561,14 @@@ F:    include/linux/device-mapper.
  F:    include/linux/dm-*.h
  F:    include/uapi/linux/dm-*.h
  
 +DEVLINK
 +M:    Jiri Pirko <jiri@mellanox.com>
 +L:    netdev@vger.kernel.org
 +S:    Supported
 +F:    net/core/devlink.c
 +F:    include/net/devlink.h
 +F:    include/uapi/linux/devlink.h
 +
  DIALOG SEMICONDUCTOR DRIVERS
  M:    Support Opensource <support.opensource@diasemi.com>
  W:    http://www.dialog-semiconductor.com/products
@@@ -3606,6 -3541,13 +3606,6 @@@ L:     driverdev-devel@linuxdriverproject.o
  S:    Maintained
  F:    drivers/staging/dgnc/
  
 -DIGI EPCA PCI PRODUCTS
 -M:    Lidza Louina <lidza.louina@gmail.com>
 -M:    Daeseok Youn <daeseok.youn@gmail.com>
 -L:    driverdev-devel@linuxdriverproject.org
 -S:    Maintained
 -F:    drivers/staging/dgap/
 -
  DIOLAN U2C-12 I2C DRIVER
  M:    Guenter Roeck <linux@roeck-us.net>
  L:    linux-i2c@vger.kernel.org
@@@ -3762,7 -3704,7 +3762,7 @@@ F:      drivers/gpu/vga
  F:    include/drm/
  F:    include/uapi/drm/
  
 -RADEON DRM DRIVERS
 +RADEON and AMDGPU DRM DRIVERS
  M:    Alex Deucher <alexander.deucher@amd.com>
  M:    Christian König <christian.koenig@amd.com>
  L:    dri-devel@lists.freedesktop.org
@@@ -3770,8 -3712,6 +3770,8 @@@ T:      git git://people.freedesktop.org/~ag
  S:    Supported
  F:    drivers/gpu/drm/radeon/
  F:    include/uapi/drm/radeon*
 +F:    drivers/gpu/drm/amd/
 +F:    include/uapi/drm/amdgpu*
  
  DRM PANEL DRIVERS
  M:    Thierry Reding <thierry.reding@gmail.com>
@@@ -3816,7 -3756,7 +3816,7 @@@ F:      include/drm/exynos
  F:    include/uapi/drm/exynos*
  
  DRM DRIVERS FOR FREESCALE DCU
 -M:    Jianwei Wang <jianwei.wang.chn@gmail.com>
 +M:    Stefan Agner <stefan@agner.ch>
  M:    Alison Wang <alison.wang@freescale.com>
  L:    dri-devel@lists.freedesktop.org
  S:    Supported
@@@ -4288,6 -4228,13 +4288,6 @@@ M:     Maxim Levitsky <maximlevitsky@gmail.
  S:    Maintained
  F:    drivers/media/rc/ene_ir.*
  
 -ENHANCED ERROR HANDLING (EEH)
 -M:    Gavin Shan <shangw@linux.vnet.ibm.com>
 -L:    linuxppc-dev@lists.ozlabs.org
 -S:    Supported
 -F:    Documentation/powerpc/eeh-pci-error-recovery.txt
 -F:    arch/powerpc/kernel/eeh*.c
 -
  EPSON S1D13XXX FRAMEBUFFER DRIVER
  M:    Kristoffer Ericson <kristoffer.ericson@gmail.com>
  S:    Maintained
@@@ -4364,12 -4311,6 +4364,12 @@@ L:    dri-devel@lists.freedesktop.or
  S:    Maintained
  F:    drivers/gpu/drm/exynos/exynos_dp*
  
 +EXYNOS SYSMMU (IOMMU) driver
 +M:    Marek Szyprowski <m.szyprowski@samsung.com>
 +L:    iommu@lists.linux-foundation.org
 +S:    Maintained
 +F:    drivers/iommu/exynos-iommu.c
 +
  EXYNOS MIPI DISPLAY DRIVERS
  M:    Inki Dae <inki.dae@samsung.com>
  M:    Donghwa Lee <dh09.lee@samsung.com>
@@@ -4577,12 -4518,6 +4577,12 @@@ L:    linuxppc-dev@lists.ozlabs.or
  S:    Maintained
  F:    drivers/dma/fsldma.*
  
 +FREESCALE GPMI NAND DRIVER
 +M:    Han Xu <han.xu@nxp.com>
 +L:    linux-mtd@lists.infradead.org
 +S:    Maintained
 +F:    drivers/mtd/nand/gpmi-nand/*
 +
  FREESCALE I2C CPM DRIVER
  M:    Jochen Friedrich <jochen@scram.de>
  L:    linuxppc-dev@lists.ozlabs.org
@@@ -4599,7 -4534,7 +4599,7 @@@ F:      include/linux/platform_data/video-im
  F:    drivers/video/fbdev/imxfb.c
  
  FREESCALE QUAD SPI DRIVER
 -M:    Han Xu <han.xu@freescale.com>
 +M:    Han Xu <han.xu@nxp.com>
  L:    linux-mtd@lists.infradead.org
  S:    Maintained
  F:    drivers/mtd/spi-nor/fsl-quadspi.c
@@@ -4613,15 -4548,6 +4613,15 @@@ S:    Maintaine
  F:    drivers/net/ethernet/freescale/fs_enet/
  F:    include/linux/fs_enet_pd.h
  
 +FREESCALE IMX / MXC FEC DRIVER
 +M:    Fugang Duan <fugang.duan@nxp.com>
 +L:    netdev@vger.kernel.org
 +S:    Maintained
 +F:    drivers/net/ethernet/freescale/fec_main.c
 +F:    drivers/net/ethernet/freescale/fec_ptp.c
 +F:    drivers/net/ethernet/freescale/fec.h
 +F:    Documentation/devicetree/bindings/net/fsl-fec.txt
 +
  FREESCALE QUICC ENGINE LIBRARY
  L:    linuxppc-dev@lists.ozlabs.org
  S:    Orphan
@@@ -4885,14 -4811,10 +4885,14 @@@ L:   linux-gpio@vger.kernel.or
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-gpio.git
  S:    Maintained
  F:    Documentation/gpio/
 +F:    Documentation/ABI/testing/gpio-cdev
 +F:    Documentation/ABI/obsolete/sysfs-gpio
  F:    drivers/gpio/
  F:    include/linux/gpio/
  F:    include/linux/gpio.h
  F:    include/asm-generic/gpio.h
 +F:    include/uapi/linux/gpio.h
 +F:    tools/gpio/
  
  GRE DEMULTIPLEXER DRIVER
  M:    Dmitry Kozlov <xeb@mail.ru>
@@@ -5041,7 -4963,6 +5041,7 @@@ F:      include/linux/hw_random.
  
  HARDWARE SPINLOCK CORE
  M:    Ohad Ben-Cohen <ohad@wizery.com>
 +M:    Bjorn Andersson <bjorn.andersson@linaro.org>
  S:    Maintained
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/ohad/hwspinlock.git
  F:    Documentation/hwspinlock.txt
@@@ -5063,10 -4984,16 +5063,10 @@@ T:   git git://linuxtv.org/anttip/media_t
  S:    Maintained
  F:    drivers/media/dvb-frontends/hd29l2*
  
 -HEWLETT-PACKARD SMART2 RAID DRIVER
 -L:    iss_storagedev@hp.com
 -S:    Orphan
 -F:    Documentation/blockdev/cpqarray.txt
 -F:    drivers/block/cpqarray.*
 -
  HEWLETT-PACKARD SMART ARRAY RAID DRIVER (hpsa)
 -M:    Don Brace <don.brace@pmcs.com>
 +M:    Don Brace <don.brace@microsemi.com>
  L:    iss_storagedev@hp.com
 -L:    storagedev@pmcs.com
 +L:    esc.storagedev@microsemi.com
  L:    linux-scsi@vger.kernel.org
  S:    Supported
  F:    Documentation/scsi/hpsa.txt
@@@ -5075,9 -5002,9 +5075,9 @@@ F:      include/linux/cciss*.
  F:    include/uapi/linux/cciss*.h
  
  HEWLETT-PACKARD SMART CISS RAID DRIVER (cciss)
 -M:    Don Brace <don.brace@pmcs.com>
 +M:    Don Brace <don.brace@microsemi.com>
  L:    iss_storagedev@hp.com
 -L:    storagedev@pmcs.com
 +L:    esc.storagedev@microsemi.com
  L:    linux-scsi@vger.kernel.org
  S:    Supported
  F:    Documentation/blockdev/cciss.txt
@@@ -5262,7 -5189,6 +5262,7 @@@ F:      arch/x86/kernel/cpu/mshyperv.
  F:    drivers/hid/hid-hyperv.c
  F:    drivers/hv/
  F:    drivers/input/serio/hyperv-keyboard.c
 +F:    drivers/pci/host/pci-hyperv.c
  F:    drivers/net/hyperv/
  F:    drivers/scsi/storvsc_drv.c
  F:    drivers/video/fbdev/hyperv_fb.c
@@@ -5270,16 -5196,6 +5270,16 @@@ F:    include/linux/hyperv.
  F:    tools/hv/
  F:    Documentation/ABI/stable/sysfs-bus-vmbus
  
 +I2C MUXES
 +M:    Peter Rosin <peda@axentia.se>
 +L:    linux-i2c@vger.kernel.org
 +S:    Maintained
 +F:    Documentation/i2c/muxes/
 +F:    Documentation/devicetree/bindings/i2c/i2c-mux*
 +F:    drivers/i2c/i2c-mux.c
 +F:    drivers/i2c/muxes/
 +F:    include/linux/i2c-mux.h
 +
  I2C OVER PARALLEL PORT
  M:    Jean Delvare <jdelvare@suse.com>
  L:    linux-i2c@vger.kernel.org
@@@ -5504,11 -5420,10 +5504,11 @@@ S:   Supporte
  F:    drivers/idle/i7300_idle.c
  
  IEEE 802.15.4 SUBSYSTEM
 -M:    Alexander Aring <alex.aring@gmail.com>
 +M:    Alexander Aring <aar@pengutronix.de>
  L:    linux-wpan@vger.kernel.org
 -W:    https://github.com/linux-wpan
 -T:    git git://github.com/linux-wpan/linux-wpan-next.git
 +W:    http://wpan.cakelab.org/
 +T:    git git://git.kernel.org/pub/scm/linux/kernel/git/bluetooth/bluetooth.git
 +T:    git git://git.kernel.org/pub/scm/linux/kernel/git/bluetooth/bluetooth-next.git
  S:    Maintained
  F:    net/ieee802154/
  F:    net/mac802154/
@@@ -5638,7 -5553,6 +5638,7 @@@ F:      drivers/input
  F:    include/linux/input.h
  F:    include/uapi/linux/input.h
  F:    include/linux/input/
 +F:    Documentation/devicetree/bindings/input/
  
  INPUT MULTITOUCH (MT) PROTOCOL
  M:    Henrik Rydberg <rydberg@bitmath.org>
@@@ -5770,6 -5684,16 +5770,16 @@@ F:    Documentation/networking/i40evf.tx
  F:    drivers/net/ethernet/intel/
  F:    drivers/net/ethernet/intel/*/
  
+ INTEL RDMA RNIC DRIVER
+ M:     Faisal Latif <faisal.latif@intel.com>
+ R:     Chien Tin Tung <chien.tin.tung@intel.com>
+ R:     Mustafa Ismail <mustafa.ismail@intel.com>
+ R:     Shiraz Saleem <shiraz.saleem@intel.com>
+ R:     Tatyana Nikolova <tatyana.e.nikolova@intel.com>
+ L:     linux-rdma@vger.kernel.org
+ S:     Supported
+ F:     drivers/infiniband/hw/i40iw/
  INTEL-MID GPIO DRIVER
  M:    David Cohen <david.a.cohen@linux.intel.com>
  L:    linux-gpio@vger.kernel.org
@@@ -5833,7 -5757,6 +5843,7 @@@ S:      Supporte
  F:    include/uapi/linux/mei.h
  F:    include/linux/mei_cl_bus.h
  F:    drivers/misc/mei/*
 +F:    drivers/watchdog/mei_wdt.c
  F:    Documentation/misc-devices/mei/*
  
  INTEL MIC DRIVERS (mic)
@@@ -6136,7 -6059,7 +6146,7 @@@ S:      Maintaine
  F:    drivers/media/platform/rcar_jpu.c
  
  JSM Neo PCI based serial card
 -M:    Thadeu Lima de Souza Cascardo <cascardo@linux.vnet.ibm.com>
 +M:    Gabriel Krisman Bertazi <krisman@linux.vnet.ibm.com>
  L:    linux-serial@vger.kernel.org
  S:    Maintained
  F:    drivers/tty/serial/jsm/
@@@ -6654,10 -6577,9 +6664,10 @@@ F:    drivers/platform/x86/hp_accel.
  
  LIVE PATCHING
  M:    Josh Poimboeuf <jpoimboe@redhat.com>
 -M:    Seth Jennings <sjenning@redhat.com>
 +M:    Jessica Yu <jeyu@redhat.com>
  M:    Jiri Kosina <jikos@kernel.org>
 -M:    Vojtech Pavlik <vojtech@suse.com>
 +M:    Miroslav Benes <mbenes@suse.cz>
 +R:    Petr Mladek <pmladek@suse.com>
  S:    Maintained
  F:    kernel/livepatch/
  F:    include/linux/livepatch.h
@@@ -6668,11 -6590,6 +6678,11 @@@ F:    samples/livepatch
  L:    live-patching@vger.kernel.org
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/jikos/livepatching.git
  
 +LINUX KERNEL DUMP TEST MODULE (LKDTM)
 +M:    Kees Cook <keescook@chromium.org>
 +S:    Maintained
 +F:    drivers/misc/lkdtm.c
 +
  LLC (802.2)
  M:    Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
  S:    Maintained
@@@ -6758,12 -6675,13 +6768,12 @@@ S:   Maintaine
  F:    arch/arm/mach-lpc32xx/
  
  LSILOGIC MPT FUSION DRIVERS (FC/SAS/SPI)
 -M:    Nagalakshmi Nandigama <nagalakshmi.nandigama@avagotech.com>
 -M:    Praveen Krishnamoorthy <praveen.krishnamoorthy@avagotech.com>
 -M:    Sreekanth Reddy <sreekanth.reddy@avagotech.com>
 -M:    Abhijit Mahajan <abhijit.mahajan@avagotech.com>
 -L:    MPT-FusionLinux.pdl@avagotech.com
 +M:    Sathya Prakash <sathya.prakash@broadcom.com>
 +M:    Chaitra P B <chaitra.basappa@broadcom.com>
 +M:    Suganath Prabu Subramani <suganath-prabu.subramani@broadcom.com>
 +L:    MPT-FusionLinux.pdl@broadcom.com
  L:    linux-scsi@vger.kernel.org
 -W:    http://www.lsilogic.com/support
 +W:    http://www.avagotech.com/support/
  S:    Supported
  F:    drivers/message/fusion/
  F:    drivers/scsi/mpt2sas/
@@@ -6856,7 -6774,6 +6866,7 @@@ S:      Maintaine
  F:    Documentation/networking/mac80211-injection.txt
  F:    include/net/mac80211.h
  F:    net/mac80211/
 +F:    drivers/net/wireless/mac80211_hwsim.[ch]
  
  MACVLAN DRIVER
  M:    Patrick McHardy <kaber@trash.net>
@@@ -6986,7 -6903,7 +6996,7 @@@ MAXIM MAX77802 MULTIFUNCTION PMIC DEVIC
  M:    Javier Martinez Canillas <javier@osg.samsung.com>
  L:    linux-kernel@vger.kernel.org
  S:    Supported
 -F:    drivers/*/*max77802.c
 +F:    drivers/*/*max77802*.c
  F:    Documentation/devicetree/bindings/*/*max77802.txt
  F:    include/dt-bindings/*/*max77802.h
  
@@@ -6996,7 -6913,7 +7006,7 @@@ M:      Krzysztof Kozlowski <k.kozlowski@sam
  L:    linux-kernel@vger.kernel.org
  S:    Supported
  F:    drivers/*/max14577.c
 -F:    drivers/*/max77686.c
 +F:    drivers/*/max77686*.c
  F:    drivers/*/max77693.c
  F:    drivers/extcon/extcon-max14577.c
  F:    drivers/extcon/extcon-max77693.c
@@@ -7101,13 -7018,6 +7111,13 @@@ F:    include/uapi/linux/meye.
  F:    include/uapi/linux/ivtv*
  F:    include/uapi/linux/uvcvideo.h
  
 +MEDIATEK ETHERNET DRIVER
 +M:    Felix Fietkau <nbd@openwrt.org>
 +M:    John Crispin <blogic@openwrt.org>
 +L:    netdev@vger.kernel.org
 +S:    Maintained
 +F:    drivers/net/ethernet/mediatek/
 +
  MEDIATEK MT7601U WIRELESS LAN DRIVER
  M:    Jakub Kicinski <kubakici@wp.pl>
  L:    linux-wireless@vger.kernel.org
@@@ -7319,8 -7229,10 +7329,8 @@@ L:     linux-media@vger.kernel.or
  W:    https://linuxtv.org
  W:    http://palosaari.fi/linux/
  Q:    http://patchwork.linuxtv.org/project/linux-media/list/
 -T:    git git://linuxtv.org/anttip/media_tree.git
  S:    Maintained
 -F:    drivers/staging/media/mn88473/
 -F:    drivers/media/dvb-frontends/mn88473.h
 +F:    drivers/media/dvb-frontends/mn88473*
  
  MODULE SUPPORT
  M:    Rusty Russell <rusty@rustcorp.com.au>
@@@ -7481,17 -7393,6 +7491,17 @@@ W:    https://www.myricom.com/support/down
  S:    Supported
  F:    drivers/net/ethernet/myricom/myri10ge/
  
 +NAND FLASH SUBSYSTEM
 +M:    Boris Brezillon <boris.brezillon@free-electrons.com>
 +R:    Richard Weinberger <richard@nod.at>
 +L:    linux-mtd@lists.infradead.org
 +W:    http://www.linux-mtd.infradead.org/
 +Q:    http://patchwork.ozlabs.org/project/linux-mtd/list/
 +T:    git git://github.com/linux-nand/linux.git
 +S:    Maintained
 +F:    drivers/mtd/nand/
 +F:    include/linux/mtd/nand*.h
 +
  NATSEMI ETHERNET DRIVER (DP8381x)
  S:    Orphan
  F:    drivers/net/ethernet/natsemi/natsemi.c
@@@ -7605,6 -7506,7 +7615,6 @@@ F:      net/netrom
  
  NETRONOME ETHERNET DRIVERS
  M:    Jakub Kicinski <jakub.kicinski@netronome.com>
 -M:    Rolf Neugebauer <rolf.neugebauer@netronome.com>
  L:    oss-drivers@netronome.com
  S:    Maintained
  F:    drivers/net/ethernet/netronome/
@@@ -7741,6 -7643,7 +7751,6 @@@ F:      net/nfc
  F:    include/net/nfc/
  F:    include/uapi/linux/nfc.h
  F:    drivers/nfc/
 -F:    include/linux/platform_data/microread.h
  F:    include/linux/platform_data/nfcmrvl.h
  F:    include/linux/platform_data/nxp-nci.h
  F:    include/linux/platform_data/pn544.h
@@@ -7891,11 -7794,6 +7901,11 @@@ L:    alsa-devel@alsa-project.org (moderat
  S:    Maintained
  F:    sound/soc/codecs/tfa9879*
  
 +OBJTOOL
 +M:    Josh Poimboeuf <jpoimboe@redhat.com>
 +S:    Supported
 +F:    tools/objtool/
 +
  OMAP SUPPORT
  M:    Tony Lindgren <tony@atomide.com>
  L:    linux-omap@vger.kernel.org
@@@ -7946,7 -7844,7 +7956,7 @@@ S:      Maintaine
  F:    arch/arm/*omap*/*clock*
  
  OMAP POWER MANAGEMENT SUPPORT
 -M:    Kevin Hilman <khilman@deeprootsystems.com>
 +M:    Kevin Hilman <khilman@kernel.org>
  L:    linux-omap@vger.kernel.org
  S:    Maintained
  F:    arch/arm/*omap*/*pm*
@@@ -8050,7 -7948,7 +8060,7 @@@ F:      arch/arm/*omap*/usb
  OMAP GPIO DRIVER
  M:    Grygorii Strashko <grygorii.strashko@ti.com>
  M:    Santosh Shilimkar <ssantosh@kernel.org>
 -M:    Kevin Hilman <khilman@deeprootsystems.com>
 +M:    Kevin Hilman <khilman@kernel.org>
  L:    linux-omap@vger.kernel.org
  S:    Maintained
  F:    Documentation/devicetree/bindings/gpio/gpio-omap.txt
@@@ -8269,13 -8167,6 +8279,13 @@@ S:    Maintaine
  F:    Documentation/mn10300/
  F:    arch/mn10300/
  
 +PARALLEL LCD/KEYPAD PANEL DRIVER
 +M:      Willy Tarreau <willy@haproxy.com>
 +M:      Ksenija Stanojevic <ksenija.stanojevic@gmail.com>
 +S:      Odd Fixes
 +F:      Documentation/misc-devices/lcd-panel-cgram.txt
 +F:      drivers/misc/panel.c
 +
  PARALLEL PORT SUBSYSTEM
  M:    Sudip Mukherjee <sudipm.mukherjee@gmail.com>
  M:    Sudip Mukherjee <sudip@vectorindia.org>
@@@ -8367,15 -8258,6 +8377,15 @@@ L:    linux-pci@vger.kernel.or
  S:    Supported
  F:    Documentation/PCI/pci-error-recovery.txt
  
 +PCI ENHANCED ERROR HANDLING (EEH) FOR POWERPC
 +M:    Russell Currey <ruscur@russell.cc>
 +L:    linuxppc-dev@lists.ozlabs.org
 +S:    Supported
 +F:    Documentation/powerpc/eeh-pci-error-recovery.txt
 +F:    arch/powerpc/kernel/eeh*.c
 +F:    arch/powerpc/platforms/*/eeh*.c
 +F:    arch/powerpc/include/*/eeh*.h
 +
  PCI SUBSYSTEM
  M:    Bjorn Helgaas <bhelgaas@google.com>
  L:    linux-pci@vger.kernel.org
@@@ -8483,20 -8365,12 +8493,20 @@@ L:   linux-pci@vger.kernel.or
  S:    Maintained
  F:    drivers/pci/host/*designware*
  
 +PCI DRIVER FOR SYNOPSYS PROTOTYPING DEVICE
 +M:    Joao Pinto <jpinto@synopsys.com>
 +L:    linux-pci@vger.kernel.org
 +S:    Maintained
 +F:    Documentation/devicetree/bindings/pci/designware-pcie.txt
 +F:    drivers/pci/host/pcie-designware-plat.c
 +
  PCI DRIVER FOR GENERIC OF HOSTS
  M:    Will Deacon <will.deacon@arm.com>
  L:    linux-pci@vger.kernel.org
  L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
  S:    Maintained
  F:    Documentation/devicetree/bindings/pci/host-generic-pci.txt
 +F:    drivers/pci/host/pci-host-common.c
  F:    drivers/pci/host/pci-host-generic.c
  
  PCI DRIVER FOR INTEL VOLUME MANAGEMENT DEVICE (VMD)
@@@ -8542,14 -8416,6 +8552,14 @@@ L:     linux-arm-msm@vger.kernel.or
  S:     Maintained
  F:     drivers/pci/host/*qcom*
  
 +PCIE DRIVER FOR CAVIUM THUNDERX
 +M:    David Daney <david.daney@cavium.com>
 +L:    linux-pci@vger.kernel.org
 +L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 +S:    Supported
 +F:    Documentation/devicetree/bindings/pci/pci-thunder-*
 +F:    drivers/pci/host/pci-thunder-*
 +
  PCMCIA SUBSYSTEM
  P:    Linux PCMCIA Team
  L:    linux-pcmcia@lists.infradead.org
@@@ -8575,7 -8441,7 +8585,7 @@@ F:      include/crypto/pcrypt.
  
  PER-CPU MEMORY ALLOCATOR
  M:    Tejun Heo <tj@kernel.org>
 -M:    Christoph Lameter <cl@linux-foundation.org>
 +M:    Christoph Lameter <cl@linux.com>
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu.git
  S:    Maintained
  F:    include/linux/percpu*.h
@@@ -8592,7 -8458,6 +8602,7 @@@ PERFORMANCE EVENTS SUBSYSTE
  M:    Peter Zijlstra <peterz@infradead.org>
  M:    Ingo Molnar <mingo@redhat.com>
  M:    Arnaldo Carvalho de Melo <acme@kernel.org>
 +R:    Alexander Shishkin <alexander.shishkin@linux.intel.com>
  L:    linux-kernel@vger.kernel.org
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git perf/core
  S:    Supported
@@@ -9215,15 -9080,17 +9225,21 @@@ S:   Maintaine
  F:    drivers/net/ethernet/rdc/r6040.c
  
  RDS - RELIABLE DATAGRAM SOCKETS
 -M:    Chien Yen <chien.yen@oracle.com>
 +M:    Santosh Shilimkar <santosh.shilimkar@oracle.com>
 +L:    netdev@vger.kernel.org
 +L:    linux-rdma@vger.kernel.org
  L:    rds-devel@oss.oracle.com (moderated for non-subscribers)
 +W:    https://oss.oracle.com/projects/rds/
  S:    Supported
  F:    net/rds/
 +F:    Documentation/networking/rds.txt
  
+ RDMAVT - RDMA verbs software
+ M:    Dennis Dalessandro <dennis.dalessandro@intel.com>
+ L:    linux-rdma@vger.kernel.org
+ S:    Supported
+ F:    drivers/infiniband/sw/rdmavt
  READ-COPY UPDATE (RCU)
  M:    "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
  M:    Josh Triplett <josh@joshtriplett.org>
@@@ -9275,7 -9142,6 +9291,7 @@@ F:      include/linux/regmap.
  
  REMOTE PROCESSOR (REMOTEPROC) SUBSYSTEM
  M:    Ohad Ben-Cohen <ohad@wizery.com>
 +M:    Bjorn Andersson <bjorn.andersson@linaro.org>
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/ohad/remoteproc.git
  S:    Maintained
  F:    drivers/remoteproc/
@@@ -9284,7 -9150,6 +9300,7 @@@ F:      include/linux/remoteproc.
  
  REMOTE PROCESSOR MESSAGING (RPMSG) SUBSYSTEM
  M:    Ohad Ben-Cohen <ohad@wizery.com>
 +M:    Bjorn Andersson <bjorn.andersson@linaro.org>
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/ohad/rpmsg.git
  S:    Maintained
  F:    drivers/rpmsg/
@@@ -9623,7 -9488,6 +9639,7 @@@ F:      drivers/media/i2c/s5k5baf.
  
  SAMSUNG S3FWRN5 NFC DRIVER
  M:    Robert Baldyga <r.baldyga@samsung.com>
 +M:    Krzysztof Opasiak <k.opasiak@samsung.com>
  L:    linux-nfc@lists.01.org (moderated for non-subscribers)
  S:    Supported
  F:    drivers/nfc/s3fwrn5
@@@ -9686,9 -9550,9 +9702,9 @@@ F: Documentation/devicetree/bindings/ne
  F: drivers/net/ethernet/synopsys/dwc_eth_qos.c
  
  SYNOPSYS DESIGNWARE I2C DRIVER
 -M:    Andy Shevchenko <andriy.shevchenko@linux.intel.com>
  M:    Jarkko Nikula <jarkko.nikula@linux.intel.com>
 -M:    Mika Westerberg <mika.westerberg@linux.intel.com>
 +R:    Andy Shevchenko <andriy.shevchenko@linux.intel.com>
 +R:    Mika Westerberg <mika.westerberg@linux.intel.com>
  L:    linux-i2c@vger.kernel.org
  S:    Maintained
  F:    drivers/i2c/busses/i2c-designware-*
@@@ -9797,7 -9661,7 +9813,7 @@@ F:      drivers/scsi/sg.
  F:    include/scsi/sg.h
  
  SCSI SUBSYSTEM
 -M:    "James E.J. Bottomley" <JBottomley@odin.com>
 +M:    "James E.J. Bottomley" <jejb@linux.vnet.ibm.com>
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/jejb/scsi.git
  M:    "Martin K. Petersen" <martin.petersen@oracle.com>
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/mkp/scsi.git
@@@ -9854,12 -9718,10 +9870,12 @@@ S:   Maintaine
  F:    drivers/mmc/host/sdricoh_cs.c
  
  SECURE DIGITAL HOST CONTROLLER INTERFACE (SDHCI) DRIVER
 +M:    Adrian Hunter <adrian.hunter@intel.com>
  L:    linux-mmc@vger.kernel.org
 -S:    Orphan
 -F:    drivers/mmc/host/sdhci.*
 -F:    drivers/mmc/host/sdhci-pltfm.[ch]
 +T:    git git://git.infradead.org/users/ahunter/linux-sdhci.git
 +S:    Maintained
 +F:    drivers/mmc/host/sdhci*
 +F:    include/linux/mmc/sdhci*
  
  SECURE COMPUTING
  M:    Kees Cook <keescook@chromium.org>
@@@ -10109,7 -9971,7 +10125,7 @@@ F:    arch/arm/mach-s3c24xx/bast-irq.
  
  TI DAVINCI MACHINE SUPPORT
  M:    Sekhar Nori <nsekhar@ti.com>
 -M:    Kevin Hilman <khilman@deeprootsystems.com>
 +M:    Kevin Hilman <khilman@kernel.org>
  T:    git git://gitorious.org/linux-davinci/linux-davinci.git
  Q:    http://patchwork.kernel.org/project/linux-davinci/list/
  S:    Supported
@@@ -10311,7 -10173,7 +10327,7 @@@ F:   drivers/media/pci/solo6x10
  SOFTWARE RAID (Multiple Disks) SUPPORT
  M:    Shaohua Li <shli@kernel.org>
  L:    linux-raid@vger.kernel.org
 -T:    git git://neil.brown.name/md
 +T:    git git://git.kernel.org/pub/scm/linux/kernel/git/shli/md.git
  S:    Supported
  F:    drivers/md/
  F:    include/linux/raid/
@@@ -10545,6 -10407,19 +10561,6 @@@ L:  linux-tegra@vger.kernel.or
  S:    Maintained
  F:    drivers/staging/nvec/
  
 -STAGING - OLPC SECONDARY DISPLAY CONTROLLER (DCON)
 -M:    Jens Frederich <jfrederich@gmail.com>
 -M:    Daniel Drake <dsd@laptop.org>
 -M:    Jon Nettleton <jon.nettleton@gmail.com>
 -W:    http://wiki.laptop.org/go/DCON
 -S:    Maintained
 -F:    drivers/staging/olpc_dcon/
 -
 -STAGING - PARALLEL LCD/KEYPAD PANEL DRIVER
 -M:    Willy Tarreau <willy@meta-x.org>
 -S:    Odd Fixes
 -F:    drivers/staging/panel/
 -
  STAGING - REALTEK RTL8712U DRIVERS
  M:    Larry Finger <Larry.Finger@lwfinger.net>
  M:    Florian Schilhabel <florian.c.schilhabel@googlemail.com>.
@@@ -10993,14 -10868,6 +11009,14 @@@ L: linux-omap@vger.kernel.or
  S:    Maintained
  F:    drivers/thermal/ti-soc-thermal/
  
 +TI VPE/CAL DRIVERS
 +M:    Benoit Parrot <bparrot@ti.com>
 +L:    linux-media@vger.kernel.org
 +W:    http://linuxtv.org/
 +Q:    http://patchwork.linuxtv.org/project/linux-media/list/
 +S:    Maintained
 +F:    drivers/media/platform/ti-vpe/
 +
  TI CDCE706 CLOCK DRIVER
  M:    Max Filippov <jcmvbkbc@gmail.com>
  S:    Maintained
@@@ -11224,8 -11091,8 +11240,8 @@@ M:   Jarkko Sakkinen <jarkko.sakkinen@lin
  R:    Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
  W:    http://tpmdd.sourceforge.net
  L:    tpmdd-devel@lists.sourceforge.net (moderated for non-subscribers)
 -Q:    git git://github.com/PeterHuewe/linux-tpmdd.git
 -T:    git https://github.com/PeterHuewe/linux-tpmdd
 +Q:    https://patchwork.kernel.org/project/tpmdd-devel/list/
 +T:    git git://git.infradead.org/users/jjs/linux-tpmdd.git
  S:    Maintained
  F:    drivers/char/tpm/
  
@@@ -11380,6 -11247,7 +11396,6 @@@ F:   include/linux/cdrom.
  F:    include/uapi/linux/cdrom.h
  
  UNISYS S-PAR DRIVERS
 -M:    Benjamin Romer <benjamin.romer@unisys.com>
  M:    David Kershner <david.kershner@unisys.com>
  L:    sparmaintainer@unisys.com (Unisys internal)
  S:    Supported
@@@ -11404,7 -11272,7 +11420,7 @@@ F:   include/linux/mtd/ubi.
  F:    include/uapi/mtd/ubi-user.h
  
  USB ACM DRIVER
 -M:    Oliver Neukum <oliver@neukum.org>
 +M:    Oliver Neukum <oneukum@suse.com>
  L:    linux-usb@vger.kernel.org
  S:    Maintained
  F:    Documentation/usb/acm.txt
@@@ -11489,13 -11357,6 +11505,13 @@@ S: Maintaine
  F:    drivers/usb/host/isp116x*
  F:    include/linux/usb/isp116x.h
  
 +USB LAN78XX ETHERNET DRIVER
 +M:    Woojung Huh <woojung.huh@microchip.com>
 +M:    Microchip Linux Driver Support <UNGLinuxDriver@microchip.com>
 +L:    netdev@vger.kernel.org
 +S:    Maintained
 +F:    drivers/net/usb/lan78xx.*
 +
  USB MASS STORAGE DRIVER
  M:    Matthew Dharm <mdharm-usb@one-eyed-alien.net>
  L:    linux-usb@vger.kernel.org
@@@ -11535,7 -11396,6 +11551,7 @@@ M:   Valentina Manea <valentina.manea.m@g
  M:    Shuah Khan <shuah.kh@samsung.com>
  L:    linux-usb@vger.kernel.org
  S:    Maintained
 +F:    Documentation/usb/usbip_protocol.txt
  F:    drivers/usb/usbip/
  F:    tools/usb/usbip/
  
@@@ -12026,18 -11886,6 +12042,18 @@@ M: David Härdeman <david@hardeman.nu
  S:    Maintained
  F:    drivers/media/rc/winbond-cir.c
  
 +WINSYSTEMS EBC-C384 WATCHDOG DRIVER
 +M:    William Breathitt Gray <vilhelm.gray@gmail.com>
 +L:    linux-watchdog@vger.kernel.org
 +S:    Maintained
 +F:    drivers/watchdog/ebc-c384_wdt.c
 +
 +WINSYSTEMS WS16C48 GPIO DRIVER
 +M:    William Breathitt Gray <vilhelm.gray@gmail.com>
 +L:    linux-gpio@vger.kernel.org
 +S:    Maintained
 +F:    drivers/gpio/gpio-ws16c48.c
 +
  WIMAX STACK
  M:    Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
  M:    linux-wimax@intel.com
diff --combined drivers/infiniband/core/sa_query.c
index b5656a2298ee0b1aa88f0cf50672f533b0a0fb6e,d2214a55ac4ac4bab9aa4f9c7540745f96ab02ec..8a09c0fb268d8d89529f0f22249422ee3fe05320
@@@ -885,6 -885,11 +885,11 @@@ static void update_sm_ah(struct work_st
        ah_attr.dlid     = port_attr.sm_lid;
        ah_attr.sl       = port_attr.sm_sl;
        ah_attr.port_num = port->port_num;
+       if (port_attr.grh_required) {
+               ah_attr.ah_flags = IB_AH_GRH;
+               ah_attr.grh.dgid.global.subnet_prefix = cpu_to_be64(port_attr.subnet_prefix);
+               ah_attr.grh.dgid.global.interface_id = cpu_to_be64(IB_SA_WELL_KNOWN_GUID);
+       }
  
        new_ah->ah = ib_create_ah(port->agent->qp->pd, &ah_attr);
        if (IS_ERR(new_ah->ah)) {
@@@ -1070,7 -1075,7 +1075,7 @@@ int ib_init_ah_from_path(struct ib_devi
                }
        }
  
 -      if (rec->hop_limit > 1 || use_roce) {
 +      if (rec->hop_limit > 0 || use_roce) {
                ah_attr->ah_flags = IB_AH_GRH;
                ah_attr->grh.dgid = rec->dgid;
  
diff --combined drivers/infiniband/hw/mlx5/main.c
index edd8b87418466a7b3717856915087b41122ff8cb,e305990b73f6b78bc49f5213d7678e2b859cf18a..5acf346e048e3bb45fc33d97eeecd1321bf2ef9d
@@@ -42,7 -42,6 +42,7 @@@
  #include <rdma/ib_user_verbs.h>
  #include <rdma/ib_addr.h>
  #include <rdma/ib_cache.h>
 +#include <linux/mlx5/port.h>
  #include <linux/mlx5/vport.h>
  #include <rdma/ib_smi.h>
  #include <rdma/ib_umem.h>
@@@ -284,7 -283,7 +284,7 @@@ __be16 mlx5_get_roce_udp_sport(struct m
  
  static int mlx5_use_mad_ifc(struct mlx5_ib_dev *dev)
  {
-       return !dev->mdev->issi;
+       return !MLX5_CAP_GEN(dev->mdev, ib_virt);
  }
  
  enum {
@@@ -563,6 -562,9 +563,9 @@@ static int mlx5_ib_query_device(struct 
        if (MLX5_CAP_GEN(mdev, cd))
                props->device_cap_flags |= IB_DEVICE_CROSS_CHANNEL;
  
+       if (!mlx5_core_is_pf(mdev))
+               props->device_cap_flags |= IB_DEVICE_VIRTUAL_FUNCTION;
        return 0;
  }
  
@@@ -700,6 -702,7 +703,7 @@@ static int mlx5_query_hca_port(struct i
        props->qkey_viol_cntr   = rep->qkey_violation_counter;
        props->subnet_timeout   = rep->subnet_timeout;
        props->init_type_reply  = rep->init_type_reply;
+       props->grh_required     = rep->grh_required;
  
        err = mlx5_query_port_link_width_oper(mdev, &ib_link_width_oper, port);
        if (err)
@@@ -2350,6 -2353,12 +2354,12 @@@ static void *mlx5_ib_add(struct mlx5_co
        dev->ib_dev.map_mr_sg           = mlx5_ib_map_mr_sg;
        dev->ib_dev.check_mr_status     = mlx5_ib_check_mr_status;
        dev->ib_dev.get_port_immutable  = mlx5_port_immutable;
+       if (mlx5_core_is_pf(mdev)) {
+               dev->ib_dev.get_vf_config       = mlx5_ib_get_vf_config;
+               dev->ib_dev.set_vf_link_state   = mlx5_ib_set_vf_link_state;
+               dev->ib_dev.get_vf_stats        = mlx5_ib_get_vf_stats;
+               dev->ib_dev.set_vf_guid         = mlx5_ib_set_vf_guid;
+       }
  
        mlx5_ib_internal_fill_odp_caps(dev);
  
diff --combined drivers/infiniband/ulp/srpt/ib_srpt.c
index 1d1309091abace1362ad9ca4608a5de7c5c3b576,578c3703421ddb18a5bad7c64bde26c83d546964..0bd3cb2f3c671a21fefd57dbc4851a24daa61e82
@@@ -839,7 -839,7 +839,7 @@@ static void srpt_zerolength_write_done(
                if (srpt_set_ch_state(ch, CH_DISCONNECTED))
                        schedule_work(&ch->release_work);
                else
-                       WARN_ONCE("%s-%d\n", ch->sess_name, ch->qp->qp_num);
+                       WARN_ONCE(1, "%s-%d\n", ch->sess_name, ch->qp->qp_num);
        }
  }
  
@@@ -1264,26 -1264,40 +1264,26 @@@ free_mem
   */
  static struct srpt_send_ioctx *srpt_get_send_ioctx(struct srpt_rdma_ch *ch)
  {
 +      struct se_session *se_sess;
        struct srpt_send_ioctx *ioctx;
 -      unsigned long flags;
 +      int tag;
  
        BUG_ON(!ch);
 +      se_sess = ch->sess;
  
 -      ioctx = NULL;
 -      spin_lock_irqsave(&ch->spinlock, flags);
 -      if (!list_empty(&ch->free_list)) {
 -              ioctx = list_first_entry(&ch->free_list,
 -                                       struct srpt_send_ioctx, free_list);
 -              list_del(&ioctx->free_list);
 +      tag = percpu_ida_alloc(&se_sess->sess_tag_pool, TASK_RUNNING);
 +      if (tag < 0) {
 +              pr_err("Unable to obtain tag for srpt_send_ioctx\n");
 +              return NULL;
        }
 -      spin_unlock_irqrestore(&ch->spinlock, flags);
 -
 -      if (!ioctx)
 -              return ioctx;
 -
 -      BUG_ON(ioctx->ch != ch);
 +      ioctx = &((struct srpt_send_ioctx *)se_sess->sess_cmd_map)[tag];
 +      memset(ioctx, 0, sizeof(struct srpt_send_ioctx));
 +      ioctx->ch = ch;
        spin_lock_init(&ioctx->spinlock);
        ioctx->state = SRPT_STATE_NEW;
 -      ioctx->n_rbuf = 0;
 -      ioctx->rbufs = NULL;
 -      ioctx->n_rdma = 0;
 -      ioctx->n_rdma_wrs = 0;
 -      ioctx->rdma_wrs = NULL;
 -      ioctx->mapped_sg_count = 0;
        init_completion(&ioctx->tx_done);
 -      ioctx->queue_status_only = false;
 -      /*
 -       * transport_init_se_cmd() does not initialize all fields, so do it
 -       * here.
 -       */
 -      memset(&ioctx->cmd, 0, sizeof(ioctx->cmd));
 -      memset(&ioctx->sense_data, 0, sizeof(ioctx->sense_data));
 +
 +      ioctx->cmd.map_tag = tag;
  
        return ioctx;
  }
@@@ -2020,8 -2034,9 +2020,8 @@@ static int srpt_cm_req_recv(struct ib_c
        struct srp_login_rej *rej;
        struct ib_cm_rep_param *rep_param;
        struct srpt_rdma_ch *ch, *tmp_ch;
 -      struct se_node_acl *se_acl;
        u32 it_iu_len;
 -      int i, ret = 0;
 +      int ret = 0;
        unsigned char *p;
  
        WARN_ON_ONCE(irqs_disabled());
        if (!ch->ioctx_ring)
                goto free_ch;
  
 -      INIT_LIST_HEAD(&ch->free_list);
 -      for (i = 0; i < ch->rq_size; i++) {
 -              ch->ioctx_ring[i]->ch = ch;
 -              list_add_tail(&ch->ioctx_ring[i]->free_list, &ch->free_list);
 -      }
 -
        ret = srpt_create_ch_ib(ch);
        if (ret) {
                rej->reason = cpu_to_be32(
        pr_debug("registering session %s\n", ch->sess_name);
        p = &ch->sess_name[0];
  
 -      ch->sess = transport_init_session(TARGET_PROT_NORMAL);
 -      if (IS_ERR(ch->sess)) {
 -              rej->reason = cpu_to_be32(
 -                              SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES);
 -              pr_debug("Failed to create session\n");
 -              goto destroy_ib;
 -      }
 -
  try_again:
 -      se_acl = core_tpg_get_initiator_node_acl(&sport->port_tpg_1, p);
 -      if (!se_acl) {
 +      ch->sess = target_alloc_session(&sport->port_tpg_1, ch->rq_size,
 +                                      sizeof(struct srpt_send_ioctx),
 +                                      TARGET_PROT_NORMAL, p, ch, NULL);
 +      if (IS_ERR(ch->sess)) {
                pr_info("Rejected login because no ACL has been"
 -                      " configured yet for initiator %s.\n", ch->sess_name);
 +                      " configured yet for initiator %s.\n", p);
                /*
                 * XXX: Hack to retry of ch->i_port_id without leading '0x'
                 */
                        p += 2;
                        goto try_again;
                }
 -              rej->reason = cpu_to_be32(
 +              rej->reason = cpu_to_be32((PTR_ERR(ch->sess) == -ENOMEM) ?
 +                              SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES :
                                SRP_LOGIN_REJ_CHANNEL_LIMIT_REACHED);
 -              transport_free_session(ch->sess);
                goto destroy_ib;
        }
 -      ch->sess->se_node_acl = se_acl;
 -
 -      transport_register_session(&sport->port_tpg_1, se_acl, ch->sess, ch);
  
        pr_debug("Establish connection sess=%p name=%s cm_id=%p\n", ch->sess,
                 ch->sess_name, ch->cm_id);
@@@ -2881,7 -2911,7 +2881,7 @@@ static void srpt_release_cmd(struct se_
        struct srpt_send_ioctx *ioctx = container_of(se_cmd,
                                struct srpt_send_ioctx, cmd);
        struct srpt_rdma_ch *ch = ioctx->ch;
 -      unsigned long flags;
 +      struct se_session *se_sess = ch->sess;
  
        WARN_ON(ioctx->state != SRPT_STATE_DONE);
        WARN_ON(ioctx->mapped_sg_count != 0);
                ioctx->n_rbuf = 0;
        }
  
 -      spin_lock_irqsave(&ch->spinlock, flags);
 -      list_add(&ioctx->free_list, &ch->free_list);
 -      spin_unlock_irqrestore(&ch->spinlock, flags);
 +      percpu_ida_free(&se_sess->sess_tag_pool, se_cmd->map_tag);
  }
  
  /**
diff --combined drivers/net/ethernet/intel/i40e/i40e.h
index 2f6210ae8ba0f3cf2621035b314d33ced74e0068,e734c649227d9d255dddff79bb898c70c2ed40d8..1ce6e9c0427d54505bb339b42d2d226d69726750
@@@ -1,7 -1,7 +1,7 @@@
  /*******************************************************************************
   *
   * Intel Ethernet Controller XL710 Family Linux Driver
 - * Copyright(c) 2013 - 2015 Intel Corporation.
 + * Copyright(c) 2013 - 2016 Intel Corporation.
   *
   * This program is free software; you can redistribute it and/or modify it
   * under the terms and conditions of the GNU General Public License,
  #ifdef I40E_FCOE
  #include "i40e_fcoe.h"
  #endif
+ #include "i40e_client.h"
  #include "i40e_virtchnl.h"
  #include "i40e_virtchnl_pf.h"
  #include "i40e_txrx.h"
  #include "i40e_dcb.h"
  
  /* Useful i40e defaults */
 -#define I40E_BASE_PF_SEID     16
 -#define I40E_BASE_VSI_SEID    512
 -#define I40E_BASE_VEB_SEID    288
  #define I40E_MAX_VEB          16
  
  #define I40E_MAX_NUM_DESCRIPTORS      4096
  #define I40E_PRIV_FLAGS_FD_ATR                BIT(2)
  #define I40E_PRIV_FLAGS_VEB_STATS     BIT(3)
  #define I40E_PRIV_FLAGS_PS            BIT(4)
 +#define I40E_PRIV_FLAGS_HW_ATR_EVICT  BIT(5)
  
  #define I40E_NVM_VERSION_LO_SHIFT  0
  #define I40E_NVM_VERSION_LO_MASK   (0xff << I40E_NVM_VERSION_LO_SHIFT)
  #define I40E_OEM_VER_PATCH_MASK    0xff
  #define I40E_OEM_VER_BUILD_SHIFT   8
  #define I40E_OEM_VER_SHIFT         24
 +#define I40E_PHY_DEBUG_PORT        BIT(4)
  
  /* The values in here are decimal coded as hex as is the case in the NVM map*/
  #define I40E_CURRENT_NVM_VERSION_HI 0x2
  /* default to trying for four seconds */
  #define I40E_TRY_LINK_TIMEOUT (4 * HZ)
  
 +/**
 + * i40e_is_mac_710 - Return true if MAC is X710/XL710
 + * @hw: ptr to the hardware info
 + **/
 +static inline bool i40e_is_mac_710(struct i40e_hw *hw)
 +{
 +      if ((hw->mac.type == I40E_MAC_X710) ||
 +          (hw->mac.type == I40E_MAC_XL710))
 +              return true;
 +
 +      return false;
 +}
 +
  /* driver state flags */
  enum i40e_state_t {
        __I40E_TESTING,
@@@ -190,6 -179,7 +191,7 @@@ struct i40e_lump_tracking 
        u16 search_hint;
        u16 list[0];
  #define I40E_PILE_VALID_BIT  0x8000
+ #define I40E_IWARP_IRQ_PILE_ID  (I40E_PILE_VALID_BIT - 2)
  };
  
  #define I40E_DEFAULT_ATR_SAMPLE_RATE  20
@@@ -282,6 -272,8 +284,8 @@@ struct i40e_pf 
  #endif /* I40E_FCOE */
        u16 num_lan_qps;           /* num lan queues this PF has set up */
        u16 num_lan_msix;          /* num queue vectors for the base PF vsi */
+       u16 num_iwarp_msix;        /* num of iwarp vectors for this PF */
+       int iwarp_base_vector;
        int queues_left;           /* queues left unclaimed */
        u16 alloc_rss_size;        /* allocated RSS queues */
        u16 rss_size_max;          /* HW defined max RSS queues */
  #define I40E_FLAG_16BYTE_RX_DESC_ENABLED      BIT_ULL(13)
  #define I40E_FLAG_CLEAN_ADMINQ                        BIT_ULL(14)
  #define I40E_FLAG_FILTER_SYNC                 BIT_ULL(15)
+ #define I40E_FLAG_SERVICE_CLIENT_REQUESTED    BIT_ULL(16)
  #define I40E_FLAG_PROCESS_MDD_EVENT           BIT_ULL(17)
  #define I40E_FLAG_PROCESS_VFLR_EVENT          BIT_ULL(18)
  #define I40E_FLAG_SRIOV_ENABLED                       BIT_ULL(19)
  #define I40E_FLAG_VEB_MODE_ENABLED            BIT_ULL(40)
  #define I40E_FLAG_GENEVE_OFFLOAD_CAPABLE      BIT_ULL(41)
  #define I40E_FLAG_NO_PCI_LINK_CHECK           BIT_ULL(42)
 +#define I40E_FLAG_100M_SGMII_CAPABLE          BIT_ULL(43)
 +#define I40E_FLAG_RESTART_AUTONEG             BIT_ULL(44)
 +#define I40E_FLAG_NO_DCB_SUPPORT              BIT_ULL(45)
 +#define I40E_FLAG_USE_SET_LLDP_MIB            BIT_ULL(46)
 +#define I40E_FLAG_STOP_FW_LLDP                        BIT_ULL(47)
 +#define I40E_FLAG_HAVE_10GBASET_PHY           BIT_ULL(48)
  #define I40E_FLAG_PF_MAC                      BIT_ULL(50)
  
        /* tracks features that get auto disabled by errors */
        struct i40e_vf *vf;
        int num_alloc_vfs;      /* actual number of VFs allocated */
        u32 vf_aq_requests;
 +      u32 arq_overflows;      /* Not fatal, possibly indicative of problems */
  
        /* DCBx/DCBNL capability for PF that indicates
         * whether DCBx is managed by firmware or host
  
        u32 ioremap_len;
        u32 fd_inv;
 +      u16 phy_led_val;
  };
  
  struct i40e_mac_filter {
@@@ -512,7 -497,6 +517,7 @@@ struct i40e_vsi 
        u32 tx_busy;
        u64 tx_linearize;
        u64 tx_force_wb;
 +      u64 tx_lost_interrupt;
        u32 rx_buf_failed;
        u32 rx_page_failed;
  
        struct i40e_ring **tx_rings;
  
        u16 work_limit;
 -      /* high bit set means dynamic, use accessor routines to read/write.
 -       * hardware only supports 2us resolution for the ITR registers.
 -       * these values always store the USER setting, and must be converted
 -       * before programming to a register.
 -       */
 -      u16 rx_itr_setting;
 -      u16 tx_itr_setting;
        u16 int_rate_limit;  /* value in usecs */
  
        u16 rss_table_size; /* HW RSS table size */
        struct kobject *kobj;  /* sysfs object */
        bool current_isup;     /* Sync 'link up' logging */
  
+       void *priv;     /* client driver data reference. */
        /* VSI specific handlers */
        irqreturn_t (*irq_handler)(int irq, void *data);
  
@@@ -728,6 -721,10 +735,10 @@@ void i40e_vsi_setup_queue_map(struct i4
                              struct i40e_vsi_context *ctxt,
                              u8 enabled_tc, bool is_add);
  #endif
+ void i40e_service_event_schedule(struct i40e_pf *pf);
+ void i40e_notify_client_of_vf_msg(struct i40e_vsi *vsi, u32 vf_id,
+                                 u8 *msg, u16 len);
  int i40e_vsi_control_rings(struct i40e_vsi *vsi, bool enable);
  int i40e_reconfig_rss_queues(struct i40e_pf *pf, int queue_count);
  struct i40e_veb *i40e_veb_setup(struct i40e_pf *pf, u16 flags, u16 uplink_seid,
@@@ -750,6 -747,17 +761,17 @@@ static inline void i40e_dbg_pf_exit(str
  static inline void i40e_dbg_init(void) {}
  static inline void i40e_dbg_exit(void) {}
  #endif /* CONFIG_DEBUG_FS*/
+ /* needed by client drivers */
+ int i40e_lan_add_device(struct i40e_pf *pf);
+ int i40e_lan_del_device(struct i40e_pf *pf);
+ void i40e_client_subtask(struct i40e_pf *pf);
+ void i40e_notify_client_of_l2_param_changes(struct i40e_vsi *vsi);
+ void i40e_notify_client_of_netdev_open(struct i40e_vsi *vsi);
+ void i40e_notify_client_of_netdev_close(struct i40e_vsi *vsi, bool reset);
+ void i40e_notify_client_of_vf_enable(struct i40e_pf *pf, u32 num_vfs);
+ void i40e_notify_client_of_vf_reset(struct i40e_pf *pf, u32 vf_id);
+ int i40e_vf_client_capable(struct i40e_pf *pf, u32 vf_id,
+                          enum i40e_client_type type);
  /**
   * i40e_irq_dynamic_enable - Enable default interrupt generation settings
   * @vsi: pointer to a vsi
@@@ -761,9 -769,6 +783,9 @@@ static inline void i40e_irq_dynamic_ena
        struct i40e_hw *hw = &pf->hw;
        u32 val;
  
 +      /* definitely clear the PBA here, as this function is meant to
 +       * clean out all previous interrupts AND enable the interrupt
 +       */
        val = I40E_PFINT_DYN_CTLN_INTENA_MASK |
              I40E_PFINT_DYN_CTLN_CLEARPBA_MASK |
              (I40E_ITR_NONE << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT);
        /* skip the flush */
  }
  
 -void i40e_irq_dynamic_disable(struct i40e_vsi *vsi, int vector);
  void i40e_irq_dynamic_disable_icr0(struct i40e_pf *pf);
 -void i40e_irq_dynamic_enable_icr0(struct i40e_pf *pf);
 +void i40e_irq_dynamic_enable_icr0(struct i40e_pf *pf, bool clearpba);
  #ifdef I40E_FCOE
  struct rtnl_link_stats64 *i40e_get_netdev_stats_struct(
                                             struct net_device *netdev,
@@@ -802,8 -808,7 +824,8 @@@ struct i40e_mac_filter *i40e_find_mac(s
                                      bool is_vf, bool is_netdev);
  #ifdef I40E_FCOE
  int i40e_close(struct net_device *netdev);
 -int i40e_setup_tc(struct net_device *netdev, u8 tc);
 +int __i40e_setup_tc(struct net_device *netdev, u32 handle, __be16 proto,
 +                  struct tc_to_netdev *tc);
  void i40e_netpoll(struct net_device *netdev);
  int i40e_fcoe_enable(struct net_device *netdev);
  int i40e_fcoe_disable(struct net_device *netdev);
diff --combined drivers/net/ethernet/intel/i40e/i40e_main.c
index 70d9605a0d9e712f1bb89a38da053c8b07ee9a83,1df2629d37059b88502fca4cae95f0c06bf3764d..67006431726aa03c41f38c5e54bf9009e3bf9aa2
@@@ -1,7 -1,7 +1,7 @@@
  /*******************************************************************************
   *
   * Intel Ethernet Controller XL710 Family Linux Driver
 - * Copyright(c) 2013 - 2015 Intel Corporation.
 + * Copyright(c) 2013 - 2016 Intel Corporation.
   *
   * This program is free software; you can redistribute it and/or modify it
   * under the terms and conditions of the GNU General Public License,
  #include <linux/of_net.h>
  #include <linux/pci.h>
  
 -#ifdef CONFIG_SPARC
 -#include <asm/idprom.h>
 -#include <asm/prom.h>
 -#endif
 -
  /* Local includes */
  #include "i40e.h"
  #include "i40e_diag.h"
@@@ -46,7 -51,7 +46,7 @@@ static const char i40e_driver_string[] 
  
  #define DRV_VERSION_MAJOR 1
  #define DRV_VERSION_MINOR 4
 -#define DRV_VERSION_BUILD 8
 +#define DRV_VERSION_BUILD 25
  #define DRV_VERSION __stringify(DRV_VERSION_MAJOR) "." \
             __stringify(DRV_VERSION_MINOR) "." \
             __stringify(DRV_VERSION_BUILD)    DRV_KERN
@@@ -85,8 -90,6 +85,8 @@@ static const struct pci_device_id i40e_
        {PCI_VDEVICE(INTEL, I40E_DEV_ID_10G_BASE_T), 0},
        {PCI_VDEVICE(INTEL, I40E_DEV_ID_10G_BASE_T4), 0},
        {PCI_VDEVICE(INTEL, I40E_DEV_ID_20G_KR2), 0},
 +      {PCI_VDEVICE(INTEL, I40E_DEV_ID_KX_X722), 0},
 +      {PCI_VDEVICE(INTEL, I40E_DEV_ID_QSFP_X722), 0},
        {PCI_VDEVICE(INTEL, I40E_DEV_ID_SFP_X722), 0},
        {PCI_VDEVICE(INTEL, I40E_DEV_ID_1G_BASE_T_X722), 0},
        {PCI_VDEVICE(INTEL, I40E_DEV_ID_10G_BASE_T_X722), 0},
@@@ -107,8 -110,6 +107,8 @@@ MODULE_DESCRIPTION("Intel(R) Ethernet C
  MODULE_LICENSE("GPL");
  MODULE_VERSION(DRV_VERSION);
  
 +static struct workqueue_struct *i40e_wq;
 +
  /**
   * i40e_allocate_dma_mem_d - OS specific memory alloc for shared code
   * @hw:   pointer to the HW structure
@@@ -289,12 -290,12 +289,12 @@@ struct i40e_vsi *i40e_find_vsi_from_id(
   *
   * If not already scheduled, this puts the task into the work queue
   **/
static void i40e_service_event_schedule(struct i40e_pf *pf)
+ void i40e_service_event_schedule(struct i40e_pf *pf)
  {
        if (!test_bit(__I40E_DOWN, &pf->state) &&
            !test_bit(__I40E_RESET_RECOVERY_PENDING, &pf->state) &&
            !test_and_set_bit(__I40E_SERVICE_SCHED, &pf->state))
 -              schedule_work(&pf->service_task);
 +              queue_work(i40e_wq, &pf->service_task);
  }
  
  /**
@@@ -768,7 -769,7 +768,7 @@@ static void i40e_update_fcoe_stats(stru
        if (vsi->type != I40E_VSI_FCOE)
                return;
  
 -      idx = (pf->pf_seid - I40E_BASE_PF_SEID) + I40E_FCOE_PF_STAT_OFFSET;
 +      idx = hw->pf_id + I40E_FCOE_PF_STAT_OFFSET;
        fs = &vsi->fcoe_stats;
        ofs = &vsi->fcoe_stats_offsets;
  
@@@ -819,7 -820,6 +819,7 @@@ static void i40e_update_vsi_stats(struc
        struct i40e_eth_stats *oes;
        struct i40e_eth_stats *es;     /* device's eth stats */
        u32 tx_restart, tx_busy;
 +      u64 tx_lost_interrupt;
        struct i40e_ring *p;
        u32 rx_page, rx_buf;
        u64 bytes, packets;
        rx_b = rx_p = 0;
        tx_b = tx_p = 0;
        tx_restart = tx_busy = tx_linearize = tx_force_wb = 0;
 +      tx_lost_interrupt = 0;
        rx_page = 0;
        rx_buf = 0;
        rcu_read_lock();
                tx_busy += p->tx_stats.tx_busy;
                tx_linearize += p->tx_stats.tx_linearize;
                tx_force_wb += p->tx_stats.tx_force_wb;
 +              tx_lost_interrupt += p->tx_stats.tx_lost_interrupt;
  
                /* Rx queue is part of the same block as Tx queue */
                p = &p[1];
        vsi->tx_busy = tx_busy;
        vsi->tx_linearize = tx_linearize;
        vsi->tx_force_wb = tx_force_wb;
 +      vsi->tx_lost_interrupt = tx_lost_interrupt;
        vsi->rx_page_failed = rx_page;
        vsi->rx_buf_failed = rx_buf;
  
@@@ -1371,7 -1368,7 +1371,7 @@@ struct i40e_mac_filter *i40e_add_filter
                f->changed = true;
  
                INIT_LIST_HEAD(&f->list);
 -              list_add(&f->list, &vsi->mac_filter_list);
 +              list_add_tail(&f->list, &vsi->mac_filter_list);
        }
  
        /* increment counter and add a new flag if needed */
@@@ -1541,11 -1538,7 +1541,11 @@@ static int i40e_set_mac(struct net_devi
  
        ether_addr_copy(netdev->dev_addr, addr->sa_data);
  
 -      return i40e_sync_vsi_filters(vsi);
 +      /* schedule our worker thread which will take care of
 +       * applying the new filter changes
 +       */
 +      i40e_service_event_schedule(vsi->back);
 +      return 0;
  }
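
   The comment above describes a defer-to-worker pattern: the address change itself stays cheap, and the expensive filter programming is left to the service task. A minimal standalone sketch of that pattern follows; the ex_* names are invented stand-ins for the driver's own flags and helpers, not real i40e symbols.

	struct ex_vsi {
		int filter_changed;		/* stands in for the driver's filter-changed flag */
	};

	static void ex_apply_filters(struct ex_vsi *vsi)
	{
		/* stands in for i40e_sync_vsi_filters(): slow, may sleep */
		vsi->filter_changed = 0;
	}

	static void ex_set_mac(struct ex_vsi *vsi)
	{
		vsi->filter_changed = 1;	/* cheap: just record the change */
		/* ...then schedule the service task, as i40e_set_mac() now does */
	}

	static void ex_service_task(struct ex_vsi *vsi)
	{
		if (vsi->filter_changed)
			ex_apply_filters(vsi);	/* heavy lifting happens out of line */
	}
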
  
  /**
@@@ -1769,11 -1762,6 +1769,11 @@@ bottom_of_search_loop
                vsi->flags |= I40E_VSI_FLAG_FILTER_CHANGED;
                vsi->back->flags |= I40E_FLAG_FILTER_SYNC;
        }
 +
 +      /* schedule our worker thread which will take care of
 +       * applying the new filter changes
 +       */
 +      i40e_service_event_schedule(vsi->back);
  }
  
  /**
@@@ -1945,7 -1933,7 +1945,7 @@@ int i40e_sync_vsi_filters(struct i40e_v
                            sizeof(struct i40e_aqc_remove_macvlan_element_data);
                del_list_size = filter_list_len *
                            sizeof(struct i40e_aqc_remove_macvlan_element_data);
 -              del_list = kzalloc(del_list_size, GFP_KERNEL);
 +              del_list = kzalloc(del_list_size, GFP_ATOMIC);
                if (!del_list) {
                        i40e_cleanup_add_list(&tmp_add_list);
  
                               sizeof(struct i40e_aqc_add_macvlan_element_data),
                add_list_size = filter_list_len *
                               sizeof(struct i40e_aqc_add_macvlan_element_data);
 -              add_list = kzalloc(add_list_size, GFP_KERNEL);
 +              add_list = kzalloc(add_list_size, GFP_ATOMIC);
                if (!add_list) {
                        /* Purge element from temporary lists */
                        i40e_cleanup_add_list(&tmp_add_list);
                cur_promisc = (!!(vsi->current_netdev_flags & IFF_PROMISC) ||
                               test_bit(__I40E_FILTER_OVERFLOW_PROMISC,
                                        &vsi->state));
 -              if (vsi->type == I40E_VSI_MAIN && pf->lan_veb != I40E_NO_VEB) {
 +              if ((vsi->type == I40E_VSI_MAIN) &&
 +                  (pf->lan_veb != I40E_NO_VEB) &&
 +                  !(pf->flags & I40E_FLAG_MFP_ENABLED)) {
                        /* set defport ON for Main VSI instead of true promisc
                         * this way we will get all unicast/multicast and VLAN
                         * promisc behavior but will not get VF or VMDq traffic
                }
        }
  out:
 +      /* if something went wrong then set the changed flag so we try again */
 +      if (retval)
 +              vsi->flags |= I40E_VSI_FLAG_FILTER_CHANGED;
 +
        clear_bit(__I40E_CONFIG_BUSY, &vsi->state);
        return retval;
  }
@@@ -2230,7 -2212,7 +2230,7 @@@ static int i40e_change_mtu(struct net_d
        netdev->mtu = new_mtu;
        if (netif_running(netdev))
                i40e_vsi_reinit_locked(vsi);
+       i40e_notify_client_of_l2_param_changes(vsi);
        return 0;
  }
  
@@@ -3124,11 -3106,11 +3124,11 @@@ static void i40e_vsi_configure_msix(str
                struct i40e_q_vector *q_vector = vsi->q_vectors[i];
  
                q_vector->itr_countdown = ITR_COUNTDOWN_START;
 -              q_vector->rx.itr = ITR_TO_REG(vsi->rx_itr_setting);
 +              q_vector->rx.itr = ITR_TO_REG(vsi->rx_rings[i]->rx_itr_setting);
                q_vector->rx.latency_range = I40E_LOW_LATENCY;
                wr32(hw, I40E_PFINT_ITRN(I40E_RX_ITR, vector - 1),
                     q_vector->rx.itr);
 -              q_vector->tx.itr = ITR_TO_REG(vsi->tx_itr_setting);
 +              q_vector->tx.itr = ITR_TO_REG(vsi->tx_rings[i]->tx_itr_setting);
                q_vector->tx.latency_range = I40E_LOW_LATENCY;
                wr32(hw, I40E_PFINT_ITRN(I40E_TX_ITR, vector - 1),
                     q_vector->tx.itr);
@@@ -3220,10 -3202,10 +3220,10 @@@ static void i40e_configure_msi_and_lega
  
        /* set the ITR configuration */
        q_vector->itr_countdown = ITR_COUNTDOWN_START;
 -      q_vector->rx.itr = ITR_TO_REG(vsi->rx_itr_setting);
 +      q_vector->rx.itr = ITR_TO_REG(vsi->rx_rings[0]->rx_itr_setting);
        q_vector->rx.latency_range = I40E_LOW_LATENCY;
        wr32(hw, I40E_PFINT_ITR0(I40E_RX_ITR), q_vector->rx.itr);
 -      q_vector->tx.itr = ITR_TO_REG(vsi->tx_itr_setting);
 +      q_vector->tx.itr = ITR_TO_REG(vsi->tx_rings[0]->tx_itr_setting);
        q_vector->tx.latency_range = I40E_LOW_LATENCY;
        wr32(hw, I40E_PFINT_ITR0(I40E_TX_ITR), q_vector->tx.itr);
  
@@@ -3263,21 -3245,36 +3263,21 @@@ void i40e_irq_dynamic_disable_icr0(stru
  /**
   * i40e_irq_dynamic_enable_icr0 - Enable default interrupt generation for icr0
   * @pf: board private structure
 + * @clearpba: true when all pending interrupt events should be cleared
   **/
 -void i40e_irq_dynamic_enable_icr0(struct i40e_pf *pf)
 +void i40e_irq_dynamic_enable_icr0(struct i40e_pf *pf, bool clearpba)
  {
        struct i40e_hw *hw = &pf->hw;
        u32 val;
  
        val = I40E_PFINT_DYN_CTL0_INTENA_MASK   |
 -            I40E_PFINT_DYN_CTL0_CLEARPBA_MASK |
 +            (clearpba ? I40E_PFINT_DYN_CTL0_CLEARPBA_MASK : 0) |
              (I40E_ITR_NONE << I40E_PFINT_DYN_CTL0_ITR_INDX_SHIFT);
  
        wr32(hw, I40E_PFINT_DYN_CTL0, val);
        i40e_flush(hw);
  }
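
   The new clearpba argument only controls whether CLEARPBA is set alongside INTENA. Both call sites appear later in this diff; the illustration below restates them, with the (assumed) intent that a normal enable clears any stale pending-event bits, while the re-enable at the end of the ISR leaves them set so a still-pending cause fires again.

	/* Normal enable path (see i40e_vsi_enable_irq / i40e_setup_misc_vector): */
	i40e_irq_dynamic_enable_icr0(pf, true);		/* INTENA + CLEARPBA */

	/* Re-enable from the tail of i40e_intr(): */
	i40e_irq_dynamic_enable_icr0(pf, false);	/* INTENA only, keep pending events */
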
  
 -/**
 - * i40e_irq_dynamic_disable - Disable default interrupt generation settings
 - * @vsi: pointer to a vsi
 - * @vector: disable a particular Hw Interrupt vector
 - **/
 -void i40e_irq_dynamic_disable(struct i40e_vsi *vsi, int vector)
 -{
 -      struct i40e_pf *pf = vsi->back;
 -      struct i40e_hw *hw = &pf->hw;
 -      u32 val;
 -
 -      val = I40E_ITR_NONE << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT;
 -      wr32(hw, I40E_PFINT_DYN_CTLN(vector - 1), val);
 -      i40e_flush(hw);
 -}
 -
  /**
   * i40e_msix_clean_rings - MSIX mode Interrupt Handler
   * @irq: interrupt number
@@@ -3403,7 -3400,7 +3403,7 @@@ static int i40e_vsi_enable_irq(struct i
                for (i = 0; i < vsi->num_q_vectors; i++)
                        i40e_irq_dynamic_enable(vsi, i);
        } else {
 -              i40e_irq_dynamic_enable_icr0(pf);
 +              i40e_irq_dynamic_enable_icr0(pf, true);
        }
  
        i40e_flush(&pf->hw);
@@@ -3462,12 -3459,16 +3462,12 @@@ static irqreturn_t i40e_intr(int irq, v
                struct i40e_vsi *vsi = pf->vsi[pf->lan_vsi];
                struct i40e_q_vector *q_vector = vsi->q_vectors[0];
  
 -              /* temporarily disable queue cause for NAPI processing */
 -              u32 qval = rd32(hw, I40E_QINT_RQCTL(0));
 -
 -              qval &= ~I40E_QINT_RQCTL_CAUSE_ENA_MASK;
 -              wr32(hw, I40E_QINT_RQCTL(0), qval);
 -
 -              qval = rd32(hw, I40E_QINT_TQCTL(0));
 -              qval &= ~I40E_QINT_TQCTL_CAUSE_ENA_MASK;
 -              wr32(hw, I40E_QINT_TQCTL(0), qval);
 -
 +              /* We do not have a way to disarm Queue causes while leaving
 +               * interrupt enabled for all other causes, ideally
 +               * interrupt should be disabled while we are in NAPI but
 +               * this is not a performance path and napi_schedule()
 +               * can deal with rescheduling.
 +               */
                if (!test_bit(__I40E_DOWN, &pf->state))
                        napi_schedule_irqoff(&q_vector->napi);
        }
        if (icr0 & I40E_PFINT_ICR0_ADMINQ_MASK) {
                ena_mask &= ~I40E_PFINT_ICR0_ENA_ADMINQ_MASK;
                set_bit(__I40E_ADMINQ_EVENT_PENDING, &pf->state);
 +              i40e_debug(&pf->hw, I40E_DEBUG_NVM, "AdminQ event\n");
        }
  
        if (icr0 & I40E_PFINT_ICR0_MAL_DETECT_MASK) {
@@@ -3546,7 -3546,7 +3546,7 @@@ enable_intr
        wr32(hw, I40E_PFINT_ICR0_ENA, ena_mask);
        if (!test_bit(__I40E_DOWN, &pf->state)) {
                i40e_service_event_schedule(pf);
 -              i40e_irq_dynamic_enable_icr0(pf);
 +              i40e_irq_dynamic_enable_icr0(pf, false);
        }
  
        return ret;
@@@ -3750,7 -3750,7 +3750,7 @@@ static int i40e_vsi_request_irq(struct 
  
  #ifdef CONFIG_NET_POLL_CONTROLLER
  /**
 - * i40e_netpoll - A Polling 'interrupt'handler
 + * i40e_netpoll - A Polling 'interrupt' handler
   * @netdev: network interface device structure
   *
   * This is used by netconsole to send skbs without having to re-enable
@@@ -3929,9 -3929,6 +3929,9 @@@ static int i40e_vsi_control_rx(struct i
                else
                        rx_reg &= ~I40E_QRX_ENA_QENA_REQ_MASK;
                wr32(hw, I40E_QRX_ENA(pf_q), rx_reg);
 +              /* No waiting for the Rx queue to disable */
 +              if (!enable && test_bit(__I40E_PORT_TX_SUSPENDED, &pf->state))
 +                      continue;
  
                /* wait for the change to finish */
                ret = i40e_pf_rxq_wait(pf, pf_q, enable);
@@@ -4169,6 -4166,9 +4169,9 @@@ static void i40e_clear_interrupt_scheme
                free_irq(pf->msix_entries[0].vector, pf);
        }
  
+       i40e_put_lump(pf->irq_pile, pf->iwarp_base_vector,
+                     I40E_IWARP_IRQ_PILE_ID);
        i40e_put_lump(pf->irq_pile, 0, I40E_PILE_VALID_BIT-1);
        for (i = 0; i < pf->num_alloc_vsi; i++)
                if (pf->vsi[i])
@@@ -4212,12 -4212,17 +4215,17 @@@ static void i40e_napi_disable_all(struc
   **/
  static void i40e_vsi_close(struct i40e_vsi *vsi)
  {
+       bool reset = false;
        if (!test_and_set_bit(__I40E_DOWN, &vsi->state))
                i40e_down(vsi);
        i40e_vsi_free_irq(vsi);
        i40e_vsi_free_tx_resources(vsi);
        i40e_vsi_free_rx_resources(vsi);
        vsi->current_netdev_flags = 0;
+       if (test_bit(__I40E_RESET_RECOVERY_PENDING, &vsi->back->state))
+               reset = true;
+       i40e_notify_client_of_netdev_close(vsi, reset);
  }
  
  /**
@@@ -4290,12 -4295,12 +4298,12 @@@ static void i40e_pf_unquiesce_all_vsi(s
  
  #ifdef CONFIG_I40E_DCB
  /**
 - * i40e_vsi_wait_txq_disabled - Wait for VSI's queues to be disabled
 + * i40e_vsi_wait_queues_disabled - Wait for VSI's queues to be disabled
   * @vsi: the VSI being configured
   *
 - * This function waits for the given VSI's Tx queues to be disabled.
 + * This function waits for the given VSI's queues to be disabled.
   **/
 -static int i40e_vsi_wait_txq_disabled(struct i40e_vsi *vsi)
 +static int i40e_vsi_wait_queues_disabled(struct i40e_vsi *vsi)
  {
        struct i40e_pf *pf = vsi->back;
        int i, pf_q, ret;
                }
        }
  
 +      pf_q = vsi->base_queue;
 +      for (i = 0; i < vsi->num_queue_pairs; i++, pf_q++) {
 +              /* Check and wait for the disable status of the queue */
 +              ret = i40e_pf_rxq_wait(pf, pf_q, false);
 +              if (ret) {
 +                      dev_info(&pf->pdev->dev,
 +                               "VSI seid %d Rx ring %d disable timeout\n",
 +                               vsi->seid, pf_q);
 +                      return ret;
 +              }
 +      }
 +
        return 0;
  }
  
  /**
 - * i40e_pf_wait_txq_disabled - Wait for all queues of PF VSIs to be disabled
 + * i40e_pf_wait_queues_disabled - Wait for all queues of PF VSIs to be disabled
   * @pf: the PF
   *
 - * This function waits for the Tx queues to be in disabled state for all the
 + * This function waits for the queues to be in disabled state for all the
   * VSIs that are managed by this PF.
   **/
 -static int i40e_pf_wait_txq_disabled(struct i40e_pf *pf)
 +static int i40e_pf_wait_queues_disabled(struct i40e_pf *pf)
  {
        int v, ret = 0;
  
        for (v = 0; v < pf->hw.func_caps.num_vsis; v++) {
                /* No need to wait for FCoE VSI queues */
                if (pf->vsi[v] && pf->vsi[v]->type != I40E_VSI_FCOE) {
 -                      ret = i40e_vsi_wait_txq_disabled(pf->vsi[v]);
 +                      ret = i40e_vsi_wait_queues_disabled(pf->vsi[v]);
                        if (ret)
                                break;
                }
@@@ -4367,7 -4360,7 +4375,7 @@@ static void i40e_detect_recover_hung_qu
  {
        struct i40e_ring *tx_ring = NULL;
        struct i40e_pf  *pf;
 -      u32 head, val, tx_pending;
 +      u32 head, val, tx_pending_hw;
        int i;
  
        pf = vsi->back;
        else
                val = rd32(&pf->hw, I40E_PFINT_DYN_CTL0);
  
 -      /* Bail out if interrupts are disabled because napi_poll
 -       * execution in-progress or will get scheduled soon.
 -       * napi_poll cleans TX and RX queues and updates 'next_to_clean'.
 -       */
 -      if (!(val & I40E_PFINT_DYN_CTLN_INTENA_MASK))
 -              return;
 -
        head = i40e_get_head(tx_ring);
  
 -      tx_pending = i40e_get_tx_pending(tx_ring);
 +      tx_pending_hw = i40e_get_tx_pending(tx_ring, false);
  
        /* HW is done executing descriptors, updated HEAD write back,
         * but SW hasn't processed those descriptors. If interrupt is
         * not generated from this point ON, it could result into
         * dev_watchdog detecting timeout on those netdev_queue,
         * hence proactively trigger SW interrupt.
         */
 -      if (tx_pending) {
 +      if (tx_pending_hw && (!(val & I40E_PFINT_DYN_CTLN_INTENA_MASK))) {
                /* NAPI Poll didn't run and clear since it was set */
                if (test_and_clear_bit(I40E_Q_VECTOR_HUNG_DETECT,
                                       &tx_ring->q_vector->hung_detected)) {
 -                      netdev_info(vsi->netdev, "VSI_seid %d, Hung TX queue %d, tx_pending: %d, NTC:0x%x, HWB: 0x%x, NTU: 0x%x, TAIL: 0x%x\n",
 -                                  vsi->seid, q_idx, tx_pending,
 +                      netdev_info(vsi->netdev, "VSI_seid %d, Hung TX queue %d, tx_pending_hw: %d, NTC:0x%x, HWB: 0x%x, NTU: 0x%x, TAIL: 0x%x\n",
 +                                  vsi->seid, q_idx, tx_pending_hw,
                                    tx_ring->next_to_clean, head,
                                    tx_ring->next_to_use,
                                    readl(tx_ring->tail));
                                &tx_ring->q_vector->hung_detected);
                }
        }
 +
 +      /* This is the case where we have interrupts missing,
 +       * so the tx_pending in HW will most likely be 0, but we
 +       * will have tx_pending in SW since the WB happened but the
 +       * interrupt got lost.
 +       */
 +      if ((!tx_pending_hw) && i40e_get_tx_pending(tx_ring, true) &&
 +          (!(val & I40E_PFINT_DYN_CTLN_INTENA_MASK))) {
 +              if (napi_reschedule(&tx_ring->q_vector->napi))
 +                      tx_ring->tx_stats.tx_lost_interrupt++;
 +      }
  }
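
   Condensed, the rewritten check distinguishes two cases, both only when the vector's interrupt is masked: work still pending from the hardware's point of view (a hung queue, so force a software interrupt on the second detection) and no hardware-pending work but software-pending work (write-back completed, interrupt lost, so reschedule NAPI and bump tx_lost_interrupt). A small sketch of just that decision; the ex_* names are invented and the register read and ring accessors are abstracted into plain parameters.

	enum ex_action { EX_NONE, EX_FORCE_SW_INTERRUPT, EX_RESCHEDULE_NAPI };

	/* hw_pending ~ i40e_get_tx_pending(ring, false), sw_pending ~ (..., true),
	 * intr_enabled ~ the INTENA bit read from DYN_CTLN/DYN_CTL0 above.
	 */
	static enum ex_action ex_classify_queue(unsigned int hw_pending,
						unsigned int sw_pending,
						int intr_enabled)
	{
		if (intr_enabled)
			return EX_NONE;			/* NAPI will clean up shortly */
		if (hw_pending)
			return EX_FORCE_SW_INTERRUPT;	/* hung: HW has work, no IRQ coming */
		if (sw_pending)
			return EX_RESCHEDULE_NAPI;	/* lost interrupt: WB done, SW behind */
		return EX_NONE;
	}
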
  
  /**
@@@ -4850,6 -4839,12 +4858,12 @@@ static int i40e_vsi_config_tc(struct i4
        ctxt.info = vsi->info;
        i40e_vsi_setup_queue_map(vsi, &ctxt, enabled_tc, false);
  
+       if (vsi->back->flags & I40E_FLAG_IWARP_ENABLED) {
+               ctxt.info.valid_sections |=
+                               cpu_to_le16(I40E_AQ_VSI_PROP_QUEUE_OPT_VALID);
+               ctxt.info.queueing_opt_flags |= I40E_AQ_VSI_QUE_OPT_TCP_ENA;
+       }
        /* Update the VSI after updating the VSI queue-mapping information */
        ret = i40e_aq_update_vsi_params(&vsi->back->hw, &ctxt, NULL);
        if (ret) {
@@@ -4993,6 -4988,7 +5007,7 @@@ static void i40e_dcb_reconfigure(struc
                        if (pf->vsi[v]->netdev)
                                i40e_dcbnl_set_all(pf->vsi[v]);
                }
+               i40e_notify_client_of_l2_param_changes(pf->vsi[v]);
        }
  }
  
@@@ -5035,7 -5031,8 +5050,7 @@@ static int i40e_init_pf_dcb(struct i40e
        int err = 0;
  
        /* Do not enable DCB for SW1 and SW2 images even if the FW is capable */
 -      if (((pf->hw.aq.fw_maj_ver == 4) && (pf->hw.aq.fw_min_ver < 33)) ||
 -          (pf->hw.aq.fw_maj_ver < 4))
 +      if (pf->flags & I40E_FLAG_NO_DCB_SUPPORT)
                goto out;
  
        /* Get the initial DCB configuration */
@@@ -5191,6 -5188,11 +5206,11 @@@ static int i40e_up_complete(struct i40e
                }
                i40e_fdir_filter_restore(vsi);
        }
+       /* On the next run of the service_task, notify any clients of the new
+        * opened netdev
+        */
+       pf->flags |= I40E_FLAG_SERVICE_CLIENT_REQUESTED;
        i40e_service_event_schedule(pf);
  
        return 0;
@@@ -5267,7 -5269,11 +5287,7 @@@ void i40e_down(struct i40e_vsi *vsi
   * @netdev: net device to configure
   * @tc: number of traffic classes to enable
   **/
 -#ifdef I40E_FCOE
 -int i40e_setup_tc(struct net_device *netdev, u8 tc)
 -#else
  static int i40e_setup_tc(struct net_device *netdev, u8 tc)
 -#endif
  {
        struct i40e_netdev_priv *np = netdev_priv(netdev);
        struct i40e_vsi *vsi = np->vsi;
@@@ -5320,19 -5326,6 +5340,19 @@@ exit
        return ret;
  }
  
 +#ifdef I40E_FCOE
 +int __i40e_setup_tc(struct net_device *netdev, u32 handle, __be16 proto,
 +                  struct tc_to_netdev *tc)
 +#else
 +static int __i40e_setup_tc(struct net_device *netdev, u32 handle, __be16 proto,
 +                         struct tc_to_netdev *tc)
 +#endif
 +{
 +      if (handle != TC_H_ROOT || tc->type != TC_SETUP_MQPRIO)
 +              return -EINVAL;
 +      return i40e_setup_tc(netdev, tc->tc);
 +}
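
   The wrapper exists because .ndo_setup_tc now takes a (handle, protocol, struct tc_to_netdev) tuple instead of a bare TC count; only the root MQPRIO case is honoured and it funnels into the existing i40e_setup_tc(). A sketch of how a caller exercises the new hook, assuming only the fields already used above (type and tc); this is illustrative, not actual stack code.

	static int ex_configure_mqprio(struct net_device *dev, u8 num_tc)
	{
		struct tc_to_netdev tc = {
			.type = TC_SETUP_MQPRIO,
			.tc = num_tc,
		};

		return dev->netdev_ops->ndo_setup_tc(dev, TC_H_ROOT, 0, &tc);
	}
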
 +
  /**
   * i40e_open - Called when a network interface is made active
   * @netdev: network interface device structure
@@@ -5375,10 -5368,11 +5395,12 @@@ int i40e_open(struct net_device *netdev
        vxlan_get_rx_port(netdev);
  #endif
  #ifdef CONFIG_I40E_GENEVE
 -      geneve_get_rx_port(netdev);
 +      if (pf->flags & I40E_FLAG_GENEVE_OFFLOAD_CAPABLE)
 +              geneve_get_rx_port(netdev);
  #endif
  
+       i40e_notify_client_of_netdev_open(vsi);
        return 0;
  }
  
@@@ -5741,8 -5735,8 +5763,8 @@@ static int i40e_handle_lldp_event(struc
        if (ret)
                goto exit;
  
 -      /* Wait for the PF's Tx queues to be disabled */
 -      ret = i40e_pf_wait_txq_disabled(pf);
 +      /* Wait for the PF's queues to be disabled */
 +      ret = i40e_pf_wait_queues_disabled(pf);
        if (ret) {
                /* Schedule PF reset to recover */
                set_bit(__I40E_PF_RESET_REQUESTED, &pf->state);
@@@ -6043,6 -6037,7 +6065,7 @@@ static void i40e_vsi_link_event(struct 
        case I40E_VSI_SRIOV:
        case I40E_VSI_VMDQ2:
        case I40E_VSI_CTRL:
+       case I40E_VSI_IWARP:
        case I40E_VSI_MIRROR:
        default:
                /* there is no notification for other VSIs */
@@@ -6272,7 -6267,6 +6295,7 @@@ static void i40e_clean_adminq_subtask(s
                if (hw->debug_mask & I40E_DEBUG_AQ)
                        dev_info(&pf->pdev->dev, "ARQ Overflow Error detected\n");
                val &= ~I40E_PF_ARQLEN_ARQOVFL_MASK;
 +              pf->arq_overflows++;
        }
        if (val & I40E_PF_ARQLEN_ARQCRIT_MASK) {
                if (hw->debug_mask & I40E_DEBUG_AQ)
                case i40e_aqc_opc_nvm_erase:
                case i40e_aqc_opc_nvm_update:
                case i40e_aqc_opc_oem_post_update:
 -                      i40e_debug(&pf->hw, I40E_DEBUG_NVM, "ARQ NVM operation completed\n");
 +                      i40e_debug(&pf->hw, I40E_DEBUG_NVM,
 +                                 "ARQ NVM operation 0x%04x completed\n",
 +                                 opcode);
                        break;
                default:
                        dev_info(&pf->pdev->dev,
@@@ -6834,12 -6826,12 +6857,12 @@@ static void i40e_reset_and_rebuild(stru
        if (ret)
                goto end_core_reset;
  
 -      /* driver is only interested in link up/down and module qualification
 -       * reports from firmware
 +      /* The driver only wants link up/down and module qualification
 +       * reports from firmware.  Note the negative logic.
         */
        ret = i40e_aq_set_phy_int_mask(&pf->hw,
 -                                     I40E_AQ_EVENT_LINK_UPDOWN |
 -                                     I40E_AQ_EVENT_MODULE_QUAL_FAIL, NULL);
 +                                     ~(I40E_AQ_EVENT_LINK_UPDOWN |
 +                                       I40E_AQ_EVENT_MODULE_QUAL_FAIL), NULL);
        if (ret)
                dev_info(&pf->pdev->dev, "set phy mask fail, err %s aq_err %s\n",
                         i40e_stat_str(&pf->hw, ret),
                wr32(hw, I40E_REG_MSS, val);
        }
  
 -      if (((pf->hw.aq.fw_maj_ver == 4) && (pf->hw.aq.fw_min_ver < 33)) ||
 -          (pf->hw.aq.fw_maj_ver < 4)) {
 +      if (pf->flags & I40E_FLAG_RESTART_AUTONEG) {
                msleep(75);
                ret = i40e_aq_set_link_restart_an(&pf->hw, true, NULL);
                if (ret)
@@@ -7109,13 -7102,12 +7132,13 @@@ static void i40e_sync_udp_filters_subta
                                ret = i40e_aq_del_udp_tunnel(hw, i, NULL);
  
                        if (ret) {
 -                              dev_info(&pf->pdev->dev,
 -                                       "%s vxlan port %d, index %d failed, err %s aq_err %s\n",
 -                                       port ? "add" : "delete",
 -                                       ntohs(port), i,
 -                                       i40e_stat_str(&pf->hw, ret),
 -                                       i40e_aq_str(&pf->hw,
 +                              dev_dbg(&pf->pdev->dev,
 +                                      "%s %s port %d, index %d failed, err %s aq_err %s\n",
 +                                      pf->udp_ports[i].type ? "vxlan" : "geneve",
 +                                      port ? "add" : "delete",
 +                                      ntohs(port), i,
 +                                      i40e_stat_str(&pf->hw, ret),
 +                                      i40e_aq_str(&pf->hw,
                                                    pf->hw.aq.asq_last_status));
                                pf->udp_ports[i].index = 0;
                        }
@@@ -7142,12 -7134,12 +7165,13 @@@ static void i40e_service_task(struct wo
        }
  
        i40e_detect_recover_hung(pf);
 +      i40e_sync_filters_subtask(pf);
        i40e_reset_subtask(pf);
        i40e_handle_mdd_event(pf);
        i40e_vc_process_vflr_event(pf);
        i40e_watchdog_subtask(pf);
        i40e_fdir_reinit_subtask(pf);
+       i40e_client_subtask(pf);
        i40e_sync_filters_subtask(pf);
        i40e_sync_udp_filters_subtask(pf);
        i40e_clean_adminq_subtask(pf);
@@@ -7322,6 -7314,8 +7346,6 @@@ static int i40e_vsi_mem_alloc(struct i4
        set_bit(__I40E_DOWN, &vsi->state);
        vsi->flags = 0;
        vsi->idx = vsi_idx;
 -      vsi->rx_itr_setting = pf->rx_itr_default;
 -      vsi->tx_itr_setting = pf->tx_itr_default;
        vsi->int_rate_limit = 0;
        vsi->rss_table_size = (vsi->type == I40E_VSI_MAIN) ?
                                pf->rss_table_size : 64;
@@@ -7488,7 -7482,8 +7512,7 @@@ static int i40e_alloc_rings(struct i40e
                tx_ring->dcb_tc = 0;
                if (vsi->back->flags & I40E_FLAG_WB_ON_ITR_CAPABLE)
                        tx_ring->flags = I40E_TXR_FLAGS_WB_ON_ITR;
 -              if (vsi->back->flags & I40E_FLAG_OUTER_UDP_CSUM_CAPABLE)
 -                      tx_ring->flags |= I40E_TXR_FLAGS_OUTER_UDP_CSUM;
 +              tx_ring->tx_itr_setting = pf->tx_itr_default;
                vsi->tx_rings[i] = tx_ring;
  
                rx_ring = &tx_ring[1];
                        set_ring_16byte_desc_enabled(rx_ring);
                else
                        clear_ring_16byte_desc_enabled(rx_ring);
 +              rx_ring->rx_itr_setting = pf->rx_itr_default;
                vsi->rx_rings[i] = rx_ring;
        }
  
@@@ -7550,6 -7544,7 +7574,7 @@@ static int i40e_init_msix(struct i40e_p
        int vectors_left;
        int v_budget, i;
        int v_actual;
+       int iwarp_requested = 0;
  
        if (!(pf->flags & I40E_FLAG_MSIX_ENABLED))
                return -ENODEV;
         *              is governed by number of cpus in the system.
         *      - assumes symmetric Tx/Rx pairing
         *   - The number of VMDq pairs
+        *   - The CPU count within the NUMA node if iWARP is enabled
  #ifdef I40E_FCOE
         *   - The number of FCOE qps.
  #endif
        }
  
  #endif
+       /* can we reserve enough for iWARP? */
+       if (pf->flags & I40E_FLAG_IWARP_ENABLED) {
+               if (!vectors_left)
+                       pf->num_iwarp_msix = 0;
+               else if (vectors_left < pf->num_iwarp_msix)
+                       pf->num_iwarp_msix = 1;
+               v_budget += pf->num_iwarp_msix;
+               vectors_left -= pf->num_iwarp_msix;
+       }
        /* any vectors left over go for VMDq support */
        if (pf->flags & I40E_FLAG_VMDQ_ENABLED) {
                int vmdq_vecs_wanted = pf->num_vmdq_vsis * pf->num_vmdq_qps;
                 * of these features based on the policy and at the end disable
                 * the features that did not get any vectors.
                 */
+               iwarp_requested = pf->num_iwarp_msix;
+               pf->num_iwarp_msix = 0;
  #ifdef I40E_FCOE
                pf->num_fcoe_qps = 0;
                pf->num_fcoe_msix = 0;
                        pf->num_lan_msix = 1;
                        break;
                case 3:
+                       if (pf->flags & I40E_FLAG_IWARP_ENABLED) {
+                               pf->num_lan_msix = 1;
+                               pf->num_iwarp_msix = 1;
+                       } else {
+                               pf->num_lan_msix = 2;
+                       }
  #ifdef I40E_FCOE
                        /* give one vector to FCoE */
                        if (pf->flags & I40E_FLAG_FCOE_ENABLED) {
                                pf->num_lan_msix = 1;
                                pf->num_fcoe_msix = 1;
                        }
- #else
-                       pf->num_lan_msix = 2;
  #endif
                        break;
                default:
+                       if (pf->flags & I40E_FLAG_IWARP_ENABLED) {
+                               pf->num_iwarp_msix = min_t(int, (vec / 3),
+                                                iwarp_requested);
+                               pf->num_vmdq_vsis = min_t(int, (vec / 3),
+                                                 I40E_DEFAULT_NUM_VMDQ_VSI);
+                       } else {
+                               pf->num_vmdq_vsis = min_t(int, (vec / 2),
+                                                 I40E_DEFAULT_NUM_VMDQ_VSI);
+                       }
+                       pf->num_lan_msix = min_t(int,
+                              (vec - (pf->num_iwarp_msix + pf->num_vmdq_vsis)),
+                                                             pf->num_lan_msix);
  #ifdef I40E_FCOE
                        /* give one vector to FCoE */
                        if (pf->flags & I40E_FLAG_FCOE_ENABLED) {
                                vec--;
                        }
  #endif
-                       /* give the rest to the PF */
-                       pf->num_lan_msix = min_t(int, vec, pf->num_lan_qps);
                        break;
                }
        }
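
   When the MSI-X allocation comes back short, the leftover budget vec is re-split: roughly a third each to iWARP and VMDq (capped at the original requests) when iWARP is enabled, half to VMDq otherwise, and whatever remains goes to LAN. A standalone user-space sketch of that arithmetic, with plain ints standing in for the pf fields; the ex_* names are invented.

	#include <stdio.h>

	static int ex_min(int a, int b) { return a < b ? a : b; }

	static void ex_resplit(int vec, int iwarp_on, int iwarp_req,
			       int vmdq_max, int lan_req)
	{
		int iwarp = 0, vmdq, lan;

		if (iwarp_on) {
			iwarp = ex_min(vec / 3, iwarp_req);
			vmdq = ex_min(vec / 3, vmdq_max);
		} else {
			vmdq = ex_min(vec / 2, vmdq_max);
		}
		lan = ex_min(vec - (iwarp + vmdq), lan_req);

		printf("iwarp=%d vmdq=%d lan=%d\n", iwarp, vmdq, lan);
	}

	int main(void)
	{
		ex_resplit(9, 1, 4, 8, 16);	/* 9 leftover vectors -> 3/3/3 */
		ex_resplit(9, 0, 0, 8, 16);	/* without iWARP -> vmdq=4, lan=5 */
		return 0;
	}
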
                dev_info(&pf->pdev->dev, "VMDq disabled, not enough MSI-X vectors\n");
                pf->flags &= ~I40E_FLAG_VMDQ_ENABLED;
        }
+       if ((pf->flags & I40E_FLAG_IWARP_ENABLED) &&
+           (pf->num_iwarp_msix == 0)) {
+               dev_info(&pf->pdev->dev, "IWARP disabled, not enough MSI-X vectors\n");
+               pf->flags &= ~I40E_FLAG_IWARP_ENABLED;
+       }
  #ifdef I40E_FCOE
  
        if ((pf->flags & I40E_FLAG_FCOE_ENABLED) && (pf->num_fcoe_msix == 0)) {
@@@ -7801,6 -7829,7 +7859,7 @@@ static int i40e_init_interrupt_scheme(s
                vectors = i40e_init_msix(pf);
                if (vectors < 0) {
                        pf->flags &= ~(I40E_FLAG_MSIX_ENABLED   |
+                                      I40E_FLAG_IWARP_ENABLED  |
  #ifdef I40E_FCOE
                                       I40E_FLAG_FCOE_ENABLED   |
  #endif
@@@ -7882,7 -7911,7 +7941,7 @@@ static int i40e_setup_misc_vector(struc
  
        i40e_flush(hw);
  
 -      i40e_irq_dynamic_enable_icr0(pf);
 +      i40e_irq_dynamic_enable_icr0(pf, true);
  
        return err;
  }
@@@ -7965,52 -7994,6 +8024,52 @@@ static int i40e_vsi_config_rss(struct i
        return ret;
  }
  
 +/**
 + * i40e_get_rss_aq - Get RSS keys and lut by using AQ commands
 + * @vsi: Pointer to vsi structure
 + * @seed: Buffer to store the hash keys
 + * @lut: Buffer to store the lookup table entries
 + * @lut_size: Size of buffer to store the lookup table entries
 + *
 + * Return 0 on success, negative on failure
 + */
 +static int i40e_get_rss_aq(struct i40e_vsi *vsi, const u8 *seed,
 +                         u8 *lut, u16 lut_size)
 +{
 +      struct i40e_pf *pf = vsi->back;
 +      struct i40e_hw *hw = &pf->hw;
 +      int ret = 0;
 +
 +      if (seed) {
 +              ret = i40e_aq_get_rss_key(hw, vsi->id,
 +                      (struct i40e_aqc_get_set_rss_key_data *)seed);
 +              if (ret) {
 +                      dev_info(&pf->pdev->dev,
 +                               "Cannot get RSS key, err %s aq_err %s\n",
 +                               i40e_stat_str(&pf->hw, ret),
 +                               i40e_aq_str(&pf->hw,
 +                                           pf->hw.aq.asq_last_status));
 +                      return ret;
 +              }
 +      }
 +
 +      if (lut) {
 +              bool pf_lut = vsi->type == I40E_VSI_MAIN ? true : false;
 +
 +              ret = i40e_aq_get_rss_lut(hw, vsi->id, pf_lut, lut, lut_size);
 +              if (ret) {
 +                      dev_info(&pf->pdev->dev,
 +                               "Cannot get RSS lut, err %s aq_err %s\n",
 +                               i40e_stat_str(&pf->hw, ret),
 +                               i40e_aq_str(&pf->hw,
 +                                           pf->hw.aq.asq_last_status));
 +                      return ret;
 +              }
 +      }
 +
 +      return ret;
 +}
 +
  /**
   * i40e_config_rss_reg - Configure RSS keys and lut by writing registers
   * @vsi: Pointer to vsi structure
@@@ -8032,7 -8015,7 +8091,7 @@@ static int i40e_config_rss_reg(struct i
                u32 *seed_dw = (u32 *)seed;
  
                for (i = 0; i <= I40E_PFQF_HKEY_MAX_INDEX; i++)
 -                      wr32(hw, I40E_PFQF_HKEY(i), seed_dw[i]);
 +                      i40e_write_rx_ctl(hw, I40E_PFQF_HKEY(i), seed_dw[i]);
        }
  
        if (lut) {
@@@ -8069,7 -8052,7 +8128,7 @@@ static int i40e_get_rss_reg(struct i40e
                u32 *seed_dw = (u32 *)seed;
  
                for (i = 0; i <= I40E_PFQF_HKEY_MAX_INDEX; i++)
 -                      seed_dw[i] = rd32(hw, I40E_PFQF_HKEY(i));
 +                      seed_dw[i] = i40e_read_rx_ctl(hw, I40E_PFQF_HKEY(i));
        }
        if (lut) {
                u32 *lut_dw = (u32 *)lut;
@@@ -8113,12 -8096,7 +8172,12 @@@ int i40e_config_rss(struct i40e_vsi *vs
   */
  int i40e_get_rss(struct i40e_vsi *vsi, u8 *seed, u8 *lut, u16 lut_size)
  {
 -      return i40e_get_rss_reg(vsi, seed, lut, lut_size);
 +      struct i40e_pf *pf = vsi->back;
 +
 +      if (pf->flags & I40E_FLAG_RSS_AQ_CAPABLE)
 +              return i40e_get_rss_aq(vsi, seed, lut, lut_size);
 +      else
 +              return i40e_get_rss_reg(vsi, seed, lut, lut_size);
  }
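
   With the AQ path added, callers keep using the single i40e_get_rss() entry point and the AQ-versus-register choice stays internal. A hedged usage sketch follows; the buffer sizes are inferred from the register range used above ((I40E_PFQF_HKEY_MAX_INDEX + 1) * 4 key bytes) and from vsi->rss_table_size, not from a documented constant, and ex_dump_rss() is an invented caller.

	static int ex_dump_rss(struct i40e_vsi *vsi)
	{
		u8 seed[(I40E_PFQF_HKEY_MAX_INDEX + 1) * 4];
		u8 *lut;
		int ret;

		lut = kzalloc(vsi->rss_table_size, GFP_KERNEL);
		if (!lut)
			return -ENOMEM;

		ret = i40e_get_rss(vsi, seed, lut, vsi->rss_table_size);

		kfree(lut);
		return ret;
	}
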
  
  /**
@@@ -8152,19 -8130,19 +8211,19 @@@ static int i40e_pf_config_rss(struct i4
        int ret;
  
        /* By default we enable TCP/UDP with IPv4/IPv6 ptypes */
 -      hena = (u64)rd32(hw, I40E_PFQF_HENA(0)) |
 -              ((u64)rd32(hw, I40E_PFQF_HENA(1)) << 32);
 +      hena = (u64)i40e_read_rx_ctl(hw, I40E_PFQF_HENA(0)) |
 +              ((u64)i40e_read_rx_ctl(hw, I40E_PFQF_HENA(1)) << 32);
        hena |= i40e_pf_get_default_rss_hena(pf);
  
 -      wr32(hw, I40E_PFQF_HENA(0), (u32)hena);
 -      wr32(hw, I40E_PFQF_HENA(1), (u32)(hena >> 32));
 +      i40e_write_rx_ctl(hw, I40E_PFQF_HENA(0), (u32)hena);
 +      i40e_write_rx_ctl(hw, I40E_PFQF_HENA(1), (u32)(hena >> 32));
  
        /* Determine the RSS table size based on the hardware capabilities */
 -      reg_val = rd32(hw, I40E_PFQF_CTL_0);
 +      reg_val = i40e_read_rx_ctl(hw, I40E_PFQF_CTL_0);
        reg_val = (pf->rss_table_size == 512) ?
                        (reg_val | I40E_PFQF_CTL_0_HASHLUTSIZE_512) :
                        (reg_val & ~I40E_PFQF_CTL_0_HASHLUTSIZE_512);
 -      wr32(hw, I40E_PFQF_CTL_0, reg_val);
 +      i40e_write_rx_ctl(hw, I40E_PFQF_CTL_0, reg_val);
  
        /* Determine the RSS size of the VSI */
        if (!vsi->rss_size)
@@@ -8448,32 -8426,18 +8507,38 @@@ static int i40e_sw_init(struct i40e_pf 
                                 pf->hw.func_caps.fd_filters_best_effort;
        }
  
 +      if (i40e_is_mac_710(&pf->hw) &&
 +          (((pf->hw.aq.fw_maj_ver == 4) && (pf->hw.aq.fw_min_ver < 33)) ||
 +          (pf->hw.aq.fw_maj_ver < 4))) {
 +              pf->flags |= I40E_FLAG_RESTART_AUTONEG;
 +              /* No DCB support  for FW < v4.33 */
 +              pf->flags |= I40E_FLAG_NO_DCB_SUPPORT;
 +      }
 +
 +      /* Disable FW LLDP if FW < v4.3 */
 +      if (i40e_is_mac_710(&pf->hw) &&
 +          (((pf->hw.aq.fw_maj_ver == 4) && (pf->hw.aq.fw_min_ver < 3)) ||
 +          (pf->hw.aq.fw_maj_ver < 4)))
 +              pf->flags |= I40E_FLAG_STOP_FW_LLDP;
 +
 +      /* Use the FW Set LLDP MIB API if FW > v4.40 */
 +      if (i40e_is_mac_710(&pf->hw) &&
 +          (((pf->hw.aq.fw_maj_ver == 4) && (pf->hw.aq.fw_min_ver >= 40)) ||
 +          (pf->hw.aq.fw_maj_ver >= 5)))
 +              pf->flags |= I40E_FLAG_USE_SET_LLDP_MIB;
 +
        if (pf->hw.func_caps.vmdq) {
                pf->num_vmdq_vsis = I40E_DEFAULT_NUM_VMDQ_VSI;
                pf->flags |= I40E_FLAG_VMDQ_ENABLED;
                pf->num_vmdq_qps = i40e_default_queues_per_vmdq(pf);
        }
  
+       if (pf->hw.func_caps.iwarp) {
+               pf->flags |= I40E_FLAG_IWARP_ENABLED;
+               /* IWARP needs one extra vector for CQP just like MISC.*/
+               pf->num_iwarp_msix = (int)num_online_cpus() + 1;
+       }
  #ifdef I40E_FCOE
        i40e_init_pf_fcoe(pf);
  
                             I40E_FLAG_OUTER_UDP_CSUM_CAPABLE |
                             I40E_FLAG_WB_ON_ITR_CAPABLE |
                             I40E_FLAG_MULTIPLE_TCP_UDP_RSS_PCTYPE |
 +                           I40E_FLAG_100M_SGMII_CAPABLE |
 +                           I40E_FLAG_USE_SET_LLDP_MIB |
                             I40E_FLAG_GENEVE_OFFLOAD_CAPABLE;
 +      } else if ((pf->hw.aq.api_maj_ver > 1) ||
 +                 ((pf->hw.aq.api_maj_ver == 1) &&
 +                  (pf->hw.aq.api_min_ver > 4))) {
 +              /* Supported in FW API version higher than 1.4 */
 +              pf->flags |= I40E_FLAG_GENEVE_OFFLOAD_CAPABLE;
 +              pf->auto_disable_flags = I40E_FLAG_HW_ATR_EVICT_CAPABLE;
 +      } else {
 +              pf->auto_disable_flags = I40E_FLAG_HW_ATR_EVICT_CAPABLE;
        }
 +
        pf->eeprom_version = 0xDEAD;
        pf->lan_veb = I40E_NO_VEB;
        pf->lan_vsi = I40E_NO_VSI;
@@@ -8642,6 -8595,9 +8707,6 @@@ static void i40e_add_vxlan_port(struct 
        u8 next_idx;
        u8 idx;
  
 -      if (sa_family == AF_INET6)
 -              return;
 -
        idx = i40e_get_udp_port_idx(pf, port);
  
        /* Check if port already exists */
@@@ -8681,6 -8637,9 +8746,6 @@@ static void i40e_del_vxlan_port(struct 
        struct i40e_pf *pf = vsi->back;
        u8 idx;
  
 -      if (sa_family == AF_INET6)
 -              return;
 -
        idx = i40e_get_udp_port_idx(pf, port);
  
        /* Check if port already exists */
@@@ -8714,7 -8673,7 +8779,7 @@@ static void i40e_add_geneve_port(struc
        u8 next_idx;
        u8 idx;
  
 -      if (sa_family == AF_INET6)
 +      if (!(pf->flags & I40E_FLAG_GENEVE_OFFLOAD_CAPABLE))
                return;
  
        idx = i40e_get_udp_port_idx(pf, port);
@@@ -8758,7 -8717,7 +8823,7 @@@ static void i40e_del_geneve_port(struc
        struct i40e_pf *pf = vsi->back;
        u8 idx;
  
 -      if (sa_family == AF_INET6)
 +      if (!(pf->flags & I40E_FLAG_GENEVE_OFFLOAD_CAPABLE))
                return;
  
        idx = i40e_get_udp_port_idx(pf, port);
@@@ -8996,7 -8955,7 +9061,7 @@@ static const struct net_device_ops i40e
  #ifdef CONFIG_NET_POLL_CONTROLLER
        .ndo_poll_controller    = i40e_netpoll,
  #endif
 -      .ndo_setup_tc           = i40e_setup_tc,
 +      .ndo_setup_tc           = __i40e_setup_tc,
  #ifdef I40E_FCOE
        .ndo_fcoe_enable        = i40e_fcoe_enable,
        .ndo_fcoe_disable       = i40e_fcoe_disable,
@@@ -9048,15 -9007,11 +9113,15 @@@ static int i40e_config_netdev(struct i4
        np = netdev_priv(netdev);
        np->vsi = vsi;
  
 -      netdev->hw_enc_features |= NETIF_F_IP_CSUM       |
 -                                NETIF_F_RXCSUM         |
 -                                NETIF_F_GSO_UDP_TUNNEL |
 -                                NETIF_F_GSO_GRE        |
 -                                NETIF_F_TSO;
 +      netdev->hw_enc_features |= NETIF_F_IP_CSUM             |
 +                                 NETIF_F_IPV6_CSUM           |
 +                                 NETIF_F_TSO                 |
 +                                 NETIF_F_TSO6                |
 +                                 NETIF_F_TSO_ECN             |
 +                                 NETIF_F_GSO_GRE             |
 +                                 NETIF_F_GSO_UDP_TUNNEL      |
 +                                 NETIF_F_GSO_UDP_TUNNEL_CSUM |
 +                                 0;
  
        netdev->features = NETIF_F_SG                  |
                           NETIF_F_IP_CSUM             |
  
        if (!(pf->flags & I40E_FLAG_MFP_ENABLED))
                netdev->features |= NETIF_F_NTUPLE;
 +      if (pf->flags & I40E_FLAG_OUTER_UDP_CSUM_CAPABLE)
 +              netdev->features |= NETIF_F_GSO_UDP_TUNNEL_CSUM;
  
        /* copy netdev features into list of user selectable features */
        netdev->hw_features |= netdev->features;
@@@ -9328,6 -9281,13 +9393,13 @@@ static int i40e_add_vsi(struct i40e_vs
                                cpu_to_le16(I40E_AQ_VSI_SW_ID_FLAG_ALLOW_LB);
                }
  
+               if (vsi->back->flags & I40E_FLAG_IWARP_ENABLED) {
+                       ctxt.info.valid_sections |=
+                               cpu_to_le16(I40E_AQ_VSI_PROP_QUEUE_OPT_VALID);
+                       ctxt.info.queueing_opt_flags |=
+                                               I40E_AQ_VSI_QUE_OPT_TCP_ENA;
+               }
                ctxt.info.valid_sections |= cpu_to_le16(I40E_AQ_VSI_PROP_VLAN_VALID);
                ctxt.info.port_vlan_flags |= I40E_AQ_VSI_PVLAN_MODE_ALL;
                if (pf->vf[vsi->vf_id].spoofchk) {
                break;
  
  #endif /* I40E_FCOE */
+       case I40E_VSI_IWARP:
+               /* send down message to iWARP */
+               break;
        default:
                return -ENODEV;
        }
@@@ -9583,15 -9547,10 +9659,15 @@@ vector_setup_out
   **/
  static struct i40e_vsi *i40e_vsi_reinit_setup(struct i40e_vsi *vsi)
  {
 -      struct i40e_pf *pf = vsi->back;
 +      struct i40e_pf *pf;
        u8 enabled_tc;
        int ret;
  
 +      if (!vsi)
 +              return NULL;
 +
 +      pf = vsi->back;
 +
        i40e_put_lump(pf->qp_pile, vsi->base_queue, vsi->idx);
        i40e_vsi_clear_rings(vsi);
  
@@@ -10092,13 -10051,13 +10168,13 @@@ static int i40e_add_veb(struct i40e_ve
  {
        struct i40e_pf *pf = veb->pf;
        bool is_default = veb->pf->cur_promisc;
 -      bool is_cloud = false;
 +      bool enable_stats = !!(pf->flags & I40E_FLAG_VEB_STATS_ENABLED);
        int ret;
  
        /* get a VEB from the hardware */
        ret = i40e_aq_add_veb(&pf->hw, veb->uplink_seid, vsi->seid,
                              veb->enabled_tc, is_default,
 -                            is_cloud, &veb->seid, NULL);
 +                            &veb->seid, enable_stats, NULL);
        if (ret) {
                dev_info(&pf->pdev->dev,
                         "couldn't add VEB, err %s aq_err %s\n",
@@@ -10467,6 -10426,7 +10543,7 @@@ static void i40e_determine_queue_usage(
  
                /* make sure all the fancies are disabled */
                pf->flags &= ~(I40E_FLAG_RSS_ENABLED    |
+                              I40E_FLAG_IWARP_ENABLED  |
  #ifdef I40E_FCOE
                               I40E_FLAG_FCOE_ENABLED   |
  #endif
                queues_left -= pf->num_lan_qps;
  
                pf->flags &= ~(I40E_FLAG_RSS_ENABLED    |
+                              I40E_FLAG_IWARP_ENABLED  |
  #ifdef I40E_FCOE
                               I40E_FLAG_FCOE_ENABLED   |
  #endif
@@@ -10655,9 -10616,21 +10733,9 @@@ static void i40e_print_features(struct 
   **/
  static void i40e_get_platform_mac_addr(struct pci_dev *pdev, struct i40e_pf *pf)
  {
 -      struct device_node *dp = pci_device_to_OF_node(pdev);
 -      const unsigned char *addr;
 -      u8 *mac_addr = pf->hw.mac.addr;
 -
        pf->flags &= ~I40E_FLAG_PF_MAC;
 -      addr = of_get_mac_address(dp);
 -      if (addr) {
 -              ether_addr_copy(mac_addr, addr);
 +      if (!eth_platform_get_mac_address(&pdev->dev, pf->hw.mac.addr))
                pf->flags |= I40E_FLAG_PF_MAC;
 -#ifdef CONFIG_SPARC
 -      } else {
 -              ether_addr_copy(mac_addr, idprom->id_ethaddr);
 -              pf->flags |= I40E_FLAG_PF_MAC;
 -#endif /* CONFIG_SPARC */
 -      }
  }
  
  /**
@@@ -10680,6 -10653,7 +10758,6 @@@ static int i40e_probe(struct pci_dev *p
        u16 wol_nvm_bits;
        u16 link_status;
        int err;
 -      u32 len;
        u32 val;
        u32 i;
        u8 set_fc_aq_fail;
         * Ignore error return codes because if it was already disabled via
         * hardware settings this will fail
         */
 -      if (((pf->hw.aq.fw_maj_ver == 4) && (pf->hw.aq.fw_min_ver < 3)) ||
 -          (pf->hw.aq.fw_maj_ver < 4)) {
 +      if (pf->flags & I40E_FLAG_STOP_FW_LLDP) {
                dev_info(&pdev->dev, "Stopping firmware LLDP agent.\n");
                i40e_aq_stop_lldp(hw, true, NULL);
        }
                pf->num_alloc_vsi = pf->hw.func_caps.num_vsis;
  
        /* Set up the *vsi struct and our local tracking of the MAIN PF vsi. */
 -      len = sizeof(struct i40e_vsi *) * pf->num_alloc_vsi;
 -      pf->vsi = kzalloc(len, GFP_KERNEL);
 +      pf->vsi = kcalloc(pf->num_alloc_vsi, sizeof(struct i40e_vsi *),
 +                        GFP_KERNEL);
        if (!pf->vsi) {
                err = -ENOMEM;
                goto err_switch_setup;
                }
        }
  
 -      /* driver is only interested in link up/down and module qualification
 -       * reports from firmware
 +      /* The driver only wants link up/down and module qualification
 +       * reports from firmware.  Note the negative logic.
         */
        err = i40e_aq_set_phy_int_mask(&pf->hw,
 -                                     I40E_AQ_EVENT_LINK_UPDOWN |
 -                                     I40E_AQ_EVENT_MODULE_QUAL_FAIL, NULL);
 +                                     ~(I40E_AQ_EVENT_LINK_UPDOWN |
 +                                       I40E_AQ_EVENT_MODULE_QUAL_FAIL), NULL);
        if (err)
                dev_info(&pf->pdev->dev, "set phy mask fail, err %s aq_err %s\n",
                         i40e_stat_str(&pf->hw, err),
                wr32(hw, I40E_REG_MSS, val);
        }
  
 -      if (((pf->hw.aq.fw_maj_ver == 4) && (pf->hw.aq.fw_min_ver < 33)) ||
 -          (pf->hw.aq.fw_maj_ver < 4)) {
 +      if (pf->flags & I40E_FLAG_RESTART_AUTONEG) {
                msleep(75);
                err = i40e_aq_set_link_restart_an(&pf->hw, true, NULL);
                if (err)
        if ((pf->flags & I40E_FLAG_SRIOV_ENABLED) &&
            (pf->flags & I40E_FLAG_MSIX_ENABLED) &&
            !test_bit(__I40E_BAD_EEPROM, &pf->state)) {
 -              u32 val;
 -
                /* disable link interrupts for VFs */
                val = rd32(hw, I40E_PFGEN_PORTMDIO_NUM);
                val &= ~I40E_PFGEN_PORTMDIO_NUM_VFLINK_STAT_ENA_MASK;
        }
  #endif /* CONFIG_PCI_IOV */
  
-       pfs_found++;
+       if (pf->flags & I40E_FLAG_IWARP_ENABLED) {
+               pf->iwarp_base_vector = i40e_get_lump(pf, pf->irq_pile,
+                                                     pf->num_iwarp_msix,
+                                                     I40E_IWARP_IRQ_PILE_ID);
+               if (pf->iwarp_base_vector < 0) {
+                       dev_info(&pdev->dev,
+                                "failed to get tracking for %d vectors for IWARP err=%d\n",
+                                pf->num_iwarp_msix, pf->iwarp_base_vector);
+                       pf->flags &= ~I40E_FLAG_IWARP_ENABLED;
+               }
+       }
  
        i40e_dbg_pf_init(pf);
  
        mod_timer(&pf->service_timer,
                  round_jiffies(jiffies + pf->service_timer_period));
  
+       /* add this PF to client device list and launch a client service task */
+       err = i40e_lan_add_device(pf);
+       if (err)
+               dev_info(&pdev->dev, "Failed to add PF to client API service list: %d\n",
+                        err);
  #ifdef I40E_FCOE
        /* create FCoE interface */
        i40e_fcoe_vsi_setup(pf);
        i40e_add_filter_to_drop_tx_flow_control_frames(&pf->hw,
                                                       pf->main_vsi_seid);
  
 +      if ((pf->hw.device_id == I40E_DEV_ID_10G_BASE_T) ||
 +          (pf->hw.device_id == I40E_DEV_ID_10G_BASE_T4))
 +              pf->flags |= I40E_FLAG_HAVE_10GBASET_PHY;
 +
        /* print a string summarizing features */
        i40e_print_features(pf);
  
@@@ -11211,11 -11201,10 +11305,11 @@@ static void i40e_remove(struct pci_dev 
        i40e_ptp_stop(pf);
  
        /* Disable RSS in hw */
 -      wr32(hw, I40E_PFQF_HENA(0), 0);
 -      wr32(hw, I40E_PFQF_HENA(1), 0);
 +      i40e_write_rx_ctl(hw, I40E_PFQF_HENA(0), 0);
 +      i40e_write_rx_ctl(hw, I40E_PFQF_HENA(1), 0);
  
        /* no more scheduling of any task */
 +      set_bit(__I40E_SUSPENDED, &pf->state);
        set_bit(__I40E_DOWN, &pf->state);
        del_timer_sync(&pf->service_timer);
        cancel_work_sync(&pf->service_task);
        if (pf->vsi[pf->lan_vsi])
                i40e_vsi_release(pf->vsi[pf->lan_vsi]);
  
+       /* remove attached clients */
+       ret_code = i40e_lan_del_device(pf);
+       if (ret_code) {
+               dev_warn(&pdev->dev, "Failed to delete client device: %d\n",
+                        ret_code);
+       }
        /* shutdown and destroy the HMC */
 -      if (pf->hw.hmc.hmc_obj) {
 -              ret_code = i40e_shutdown_lan_hmc(&pf->hw);
 +      if (hw->hmc.hmc_obj) {
 +              ret_code = i40e_shutdown_lan_hmc(hw);
                if (ret_code)
                        dev_warn(&pdev->dev,
                                 "Failed to destroy the HMC resources: %d\n",
        }
  
        /* shutdown the adminq */
 -      ret_code = i40e_shutdown_adminq(&pf->hw);
 +      ret_code = i40e_shutdown_adminq(hw);
        if (ret_code)
                dev_warn(&pdev->dev,
                         "Failed to destroy the Admin Queue resources: %d\n",
        kfree(pf->qp_pile);
        kfree(pf->vsi);
  
 -      iounmap(pf->hw.hw_addr);
 +      iounmap(hw->hw_addr);
        kfree(pf);
        pci_release_selected_regions(pdev,
                                     pci_select_bars(pdev, IORESOURCE_MEM));
@@@ -11518,16 -11514,6 +11619,16 @@@ static int __init i40e_init_module(void
                i40e_driver_string, i40e_driver_version_str);
        pr_info("%s: %s\n", i40e_driver_name, i40e_copyright);
  
 +      /* we will see if single thread per module is enough for now,
 +       * it can't be any worse than using the system workqueue which
 +       * was already single threaded
 +       */
 +      i40e_wq = create_singlethread_workqueue(i40e_driver_name);
 +      if (!i40e_wq) {
 +              pr_err("%s: Failed to create workqueue\n", i40e_driver_name);
 +              return -ENOMEM;
 +      }
 +
        i40e_dbg_init();
        return pci_register_driver(&i40e_driver);
  }
@@@ -11542,7 -11528,6 +11643,7 @@@ module_init(i40e_init_module)
  static void __exit i40e_exit_module(void)
  {
        pci_unregister_driver(&i40e_driver);
 +      destroy_workqueue(i40e_wq);
        i40e_dbg_exit();
  }
  module_exit(i40e_exit_module);
index 0a0baf71041b050d3cda5ee902f94a8e33b81548,79e975d29a1e0b70eb4989c8f773da9590b8003c..3335f9d13374d154b6c64a75fc5646ebe3dad2dc
@@@ -78,7 -78,7 +78,7 @@@ enum i40e_debug_mask 
        I40E_DEBUG_DCB                  = 0x00000400,
        I40E_DEBUG_DIAG                 = 0x00000800,
        I40E_DEBUG_FD                   = 0x00001000,
+       I40E_DEBUG_IWARP                = 0x00F00000,
        I40E_DEBUG_AQ_MESSAGE           = 0x01000000,
        I40E_DEBUG_AQ_DESCRIPTOR        = 0x02000000,
        I40E_DEBUG_AQ_DESC_BUFFER       = 0x04000000,
        I40E_DEBUG_ALL                  = 0xFFFFFFFF
  };
  
 +#define I40E_MDIO_STCODE                0
 +#define I40E_MDIO_OPCODE_ADDRESS        0
 +#define I40E_MDIO_OPCODE_WRITE          I40E_MASK(1, \
 +                                                I40E_GLGEN_MSCA_OPCODE_SHIFT)
 +#define I40E_MDIO_OPCODE_READ_INC_ADDR  I40E_MASK(2, \
 +                                                I40E_GLGEN_MSCA_OPCODE_SHIFT)
 +#define I40E_MDIO_OPCODE_READ           I40E_MASK(3, \
 +                                                I40E_GLGEN_MSCA_OPCODE_SHIFT)
 +
 +#define I40E_PHY_COM_REG_PAGE                   0x1E
 +#define I40E_PHY_LED_LINK_MODE_MASK             0xF0
 +#define I40E_PHY_LED_MANUAL_ON                  0x100
 +#define I40E_PHY_LED_PROV_REG_1                 0xC430
 +#define I40E_PHY_LED_MODE_MASK                  0xFFFF
 +#define I40E_PHY_LED_MODE_ORIG                  0x80000000
 +
  /* These are structs for managing the hardware information and the operations.
   * The structures of function pointers are filled out at init time when we
   * know for sure exactly which hardware we're working with.  This gives us the
@@@ -160,6 -144,7 +160,7 @@@ enum i40e_vsi_type 
        I40E_VSI_MIRROR = 5,
        I40E_VSI_SRIOV  = 6,
        I40E_VSI_FDIR   = 7,
+       I40E_VSI_IWARP  = 8,
        I40E_VSI_TYPE_UNKNOWN
  };
  
@@@ -1114,10 -1099,6 +1115,10 @@@ enum i40e_filter_program_desc_pcmd 
                                         I40E_TXD_FLTR_QW1_CMD_SHIFT)
  #define I40E_TXD_FLTR_QW1_ATR_MASK    BIT_ULL(I40E_TXD_FLTR_QW1_ATR_SHIFT)
  
 +#define I40E_TXD_FLTR_QW1_ATR_SHIFT   (0xEULL + \
 +                                       I40E_TXD_FLTR_QW1_CMD_SHIFT)
 +#define I40E_TXD_FLTR_QW1_ATR_MASK    BIT_ULL(I40E_TXD_FLTR_QW1_ATR_SHIFT)
 +
  #define I40E_TXD_FLTR_QW1_CNTINDEX_SHIFT 20
  #define I40E_TXD_FLTR_QW1_CNTINDEX_MASK       (0x1FFUL << \
                                         I40E_TXD_FLTR_QW1_CNTINDEX_SHIFT)
index acd2693a4e97d9747ea446d4c499dd44f00918cf,bf35b64f6a4a0094b5c7edec399909898acd86f4..816c6bbf70931a98142085302ba2cbb41355072d
@@@ -1,7 -1,7 +1,7 @@@
  /*******************************************************************************
   *
   * Intel Ethernet Controller XL710 Family Linux Driver
 - * Copyright(c) 2013 - 2015 Intel Corporation.
 + * Copyright(c) 2013 - 2016 Intel Corporation.
   *
   * This program is free software; you can redistribute it and/or modify it
   * under the terms and conditions of the GNU General Public License,
@@@ -351,6 -351,136 +351,136 @@@ irq_list_done
        i40e_flush(hw);
  }
  
+ /**
+  * i40e_release_iwarp_qvlist
+  * @vf: pointer to the VF.
+  *
+  **/
+ static void i40e_release_iwarp_qvlist(struct i40e_vf *vf)
+ {
+       struct i40e_pf *pf = vf->pf;
+       struct i40e_virtchnl_iwarp_qvlist_info *qvlist_info = vf->qvlist_info;
+       u32 msix_vf;
+       u32 i;
+       if (!vf->qvlist_info)
+               return;
+       msix_vf = pf->hw.func_caps.num_msix_vectors_vf;
+       for (i = 0; i < qvlist_info->num_vectors; i++) {
+               struct i40e_virtchnl_iwarp_qv_info *qv_info;
+               u32 next_q_index, next_q_type;
+               struct i40e_hw *hw = &pf->hw;
+               u32 v_idx, reg_idx, reg;
+               qv_info = &qvlist_info->qv_info[i];
+               if (!qv_info)
+                       continue;
+               v_idx = qv_info->v_idx;
+               if (qv_info->ceq_idx != I40E_QUEUE_INVALID_IDX) {
+                       /* Figure out the queue after CEQ and make that the
+                        * first queue.
+                        */
+                       reg_idx = (msix_vf - 1) * vf->vf_id + qv_info->ceq_idx;
+                       reg = rd32(hw, I40E_VPINT_CEQCTL(reg_idx));
+                       next_q_index = (reg & I40E_VPINT_CEQCTL_NEXTQ_INDX_MASK)
+                                       >> I40E_VPINT_CEQCTL_NEXTQ_INDX_SHIFT;
+                       next_q_type = (reg & I40E_VPINT_CEQCTL_NEXTQ_TYPE_MASK)
+                                       >> I40E_VPINT_CEQCTL_NEXTQ_TYPE_SHIFT;
+                       reg_idx = ((msix_vf - 1) * vf->vf_id) + (v_idx - 1);
+                       reg = (next_q_index &
+                              I40E_VPINT_LNKLSTN_FIRSTQ_INDX_MASK) |
+                              (next_q_type <<
+                              I40E_VPINT_LNKLSTN_FIRSTQ_TYPE_SHIFT);
+                       wr32(hw, I40E_VPINT_LNKLSTN(reg_idx), reg);
+               }
+       }
+       kfree(vf->qvlist_info);
+       vf->qvlist_info = NULL;
+ }
+ /**
+  * i40e_config_iwarp_qvlist
+  * @vf: pointer to the VF info
+  * @qvlist_info: queue and vector list
+  *
+  * Return 0 on success or < 0 on error
+  **/
+ static int i40e_config_iwarp_qvlist(struct i40e_vf *vf,
+                                   struct i40e_virtchnl_iwarp_qvlist_info *qvlist_info)
+ {
+       struct i40e_pf *pf = vf->pf;
+       struct i40e_hw *hw = &pf->hw;
+       struct i40e_virtchnl_iwarp_qv_info *qv_info;
+       u32 v_idx, i, reg_idx, reg;
+       u32 next_q_idx, next_q_type;
+       u32 msix_vf, size;
+       size = sizeof(struct i40e_virtchnl_iwarp_qvlist_info) +
+              (sizeof(struct i40e_virtchnl_iwarp_qv_info) *
+                                               (qvlist_info->num_vectors - 1));
+       vf->qvlist_info = kzalloc(size, GFP_KERNEL);
+       vf->qvlist_info->num_vectors = qvlist_info->num_vectors;
+       msix_vf = pf->hw.func_caps.num_msix_vectors_vf;
+       for (i = 0; i < qvlist_info->num_vectors; i++) {
+               qv_info = &qvlist_info->qv_info[i];
+               if (!qv_info)
+                       continue;
+               v_idx = qv_info->v_idx;
+               /* Validate vector id belongs to this vf */
+               if (!i40e_vc_isvalid_vector_id(vf, v_idx))
+                       goto err;
+               vf->qvlist_info->qv_info[i] = *qv_info;
+               reg_idx = ((msix_vf - 1) * vf->vf_id) + (v_idx - 1);
+               /* We might be sharing the interrupt, so get the first queue
+                * index and type, push it down the list by adding the new
+                * queue on top. Also link it with the new queue in CEQCTL.
+                */
+               reg = rd32(hw, I40E_VPINT_LNKLSTN(reg_idx));
+               next_q_idx = ((reg & I40E_VPINT_LNKLSTN_FIRSTQ_INDX_MASK) >>
+                               I40E_VPINT_LNKLSTN_FIRSTQ_INDX_SHIFT);
+               next_q_type = ((reg & I40E_VPINT_LNKLSTN_FIRSTQ_TYPE_MASK) >>
+                               I40E_VPINT_LNKLSTN_FIRSTQ_TYPE_SHIFT);
+               if (qv_info->ceq_idx != I40E_QUEUE_INVALID_IDX) {
+                       reg_idx = (msix_vf - 1) * vf->vf_id + qv_info->ceq_idx;
+                       reg = (I40E_VPINT_CEQCTL_CAUSE_ENA_MASK |
+                       (v_idx << I40E_VPINT_CEQCTL_MSIX_INDX_SHIFT) |
+                       (qv_info->itr_idx << I40E_VPINT_CEQCTL_ITR_INDX_SHIFT) |
+                       (next_q_type << I40E_VPINT_CEQCTL_NEXTQ_TYPE_SHIFT) |
+                       (next_q_idx << I40E_VPINT_CEQCTL_NEXTQ_INDX_SHIFT));
+                       wr32(hw, I40E_VPINT_CEQCTL(reg_idx), reg);
+                       reg_idx = ((msix_vf - 1) * vf->vf_id) + (v_idx - 1);
+                       reg = (qv_info->ceq_idx &
+                              I40E_VPINT_LNKLSTN_FIRSTQ_INDX_MASK) |
+                              (I40E_QUEUE_TYPE_PE_CEQ <<
+                              I40E_VPINT_LNKLSTN_FIRSTQ_TYPE_SHIFT);
+                       wr32(hw, I40E_VPINT_LNKLSTN(reg_idx), reg);
+               }
+               if (qv_info->aeq_idx != I40E_QUEUE_INVALID_IDX) {
+                       reg = (I40E_VPINT_AEQCTL_CAUSE_ENA_MASK |
+                       (v_idx << I40E_VPINT_AEQCTL_MSIX_INDX_SHIFT) |
+                       (qv_info->itr_idx << I40E_VPINT_AEQCTL_ITR_INDX_SHIFT));
+                       wr32(hw, I40E_VPINT_AEQCTL(vf->vf_id), reg);
+               }
+       }
+       return 0;
+ err:
+       kfree(vf->qvlist_info);
+       vf->qvlist_info = NULL;
+       return -EINVAL;
+ }
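
/* Editor's illustrative sketch -- not part of the patch above. The comment in
 * i40e_config_iwarp_qvlist() describes pushing a new CEQ onto the head of the
 * per-vector queue list that the hardware walks: VPINT_LNKLSTN(vector) names
 * the first queue, and each CEQ's CEQCTL register names the next one. The
 * standalone C below models only that chaining; the struct, arrays and
 * LIST_END value are stand-ins, not the driver's register definitions.
 */
#include <stdio.h>

#define LIST_END 0x7FF	/* stand-in for the "no next queue" index */

struct link { unsigned int next_idx, next_type; };

static struct link lnklstn;	/* models VPINT_LNKLSTN for one vector */
static struct link ceqctl[16];	/* models the per-CEQ CEQCTL "next" fields */

static void push_ceq(unsigned int ceq_idx, unsigned int ceq_type)
{
	ceqctl[ceq_idx] = lnklstn;	/* new queue inherits the old head */
	lnklstn.next_idx = ceq_idx;	/* and becomes the new first queue */
	lnklstn.next_type = ceq_type;
}

int main(void)
{
	lnklstn.next_idx = LIST_END;
	push_ceq(3, 2);	/* first CEQ mapped to this vector */
	push_ceq(5, 2);	/* second CEQ is pushed on top of it */
	printf("head -> %u, then -> %u\n", lnklstn.next_idx, ceqctl[5].next_idx);
	return 0;
}
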
  /**
   * i40e_config_vsi_tx_queue
   * @vf: pointer to the VF info
@@@ -461,7 -591,7 +591,7 @@@ static int i40e_config_vsi_rx_queue(str
                rx_ctx.hbuff = info->hdr_size >> I40E_RXQ_CTX_HBUFF_SHIFT;
  
                /* set splitalways mode 10b */
 -              rx_ctx.dtype = 0x2;
 +              rx_ctx.dtype = I40E_RX_DTYPE_HEADER_SPLIT;
        }
  
        /* databuffer length validation */
@@@ -602,8 -732,8 +732,8 @@@ static void i40e_enable_vf_mappings(str
         * that VF queues be mapped using this method, even when they are
         * contiguous in real life
         */
 -      wr32(hw, I40E_VSILAN_QBASE(vf->lan_vsi_id),
 -           I40E_VSILAN_QBASE_VSIQTABLE_ENA_MASK);
 +      i40e_write_rx_ctl(hw, I40E_VSILAN_QBASE(vf->lan_vsi_id),
 +                        I40E_VSILAN_QBASE_VSIQTABLE_ENA_MASK);
  
        /* enable VF vplan_qtable mappings */
        reg = I40E_VPLAN_MAPENA_TXRX_ENA_MASK;
                                                      (j * 2) + 1);
                        reg |= qid << 16;
                }
 -              wr32(hw, I40E_VSILAN_QTABLE(j, vf->lan_vsi_id), reg);
 +              i40e_write_rx_ctl(hw, I40E_VSILAN_QTABLE(j, vf->lan_vsi_id),
 +                                reg);
        }
  
        i40e_flush(hw);
@@@ -850,9 -979,11 +980,11 @@@ complete_reset
        /* reallocate VF resources to reset the VSI state */
        i40e_free_vf_res(vf);
        if (!i40e_alloc_vf_res(vf)) {
+               int abs_vf_id = vf->vf_id + hw->func_caps.vf_base_id;
                i40e_enable_vf_mappings(vf);
                set_bit(I40E_VF_STAT_ACTIVE, &vf->vf_states);
                clear_bit(I40E_VF_STAT_DISABLED, &vf->vf_states);
+               i40e_notify_client_of_vf_reset(pf, abs_vf_id);
        }
        /* tell the VF the reset is done */
        wr32(hw, I40E_VFGEN_RSTAT1(vf->vf_id), I40E_VFR_VFACTIVE);
@@@ -877,11 -1008,7 +1009,7 @@@ void i40e_free_vfs(struct i40e_pf *pf
        while (test_and_set_bit(__I40E_VF_DISABLE, &pf->state))
                usleep_range(1000, 2000);
  
-       for (i = 0; i < pf->num_alloc_vfs; i++)
-               if (test_bit(I40E_VF_STAT_INIT, &pf->vf[i].vf_states))
-                       i40e_vsi_control_rings(pf->vsi[pf->vf[i].lan_vsi_idx],
-                                              false);
+       i40e_notify_client_of_vf_enable(pf, 0);
        for (i = 0; i < pf->num_alloc_vfs; i++)
                if (test_bit(I40E_VF_STAT_INIT, &pf->vf[i].vf_states))
                        i40e_vsi_control_rings(pf->vsi[pf->vf[i].lan_vsi_idx],
@@@ -953,6 -1080,7 +1081,7 @@@ int i40e_alloc_vfs(struct i40e_pf *pf, 
                        goto err_iov;
                }
        }
+       i40e_notify_client_of_vf_enable(pf, num_alloc_vfs);
        /* allocate memory */
        vfs = kcalloc(num_alloc_vfs, sizeof(struct i40e_vf), GFP_KERNEL);
        if (!vfs) {
@@@ -981,7 -1109,7 +1110,7 @@@ err_alloc
                i40e_free_vfs(pf);
  err_iov:
        /* Re-enable interrupt 0. */
 -      i40e_irq_dynamic_enable_icr0(pf);
 +      i40e_irq_dynamic_enable_icr0(pf, false);
        return ret;
  }
  
@@@ -1206,6 -1334,13 +1335,13 @@@ static int i40e_vc_get_vf_resources_msg
        vsi = pf->vsi[vf->lan_vsi_idx];
        if (!vsi->info.pvid)
                vfres->vf_offload_flags |= I40E_VIRTCHNL_VF_OFFLOAD_VLAN;
+       if (i40e_vf_client_capable(pf, vf->vf_id, I40E_CLIENT_IWARP) &&
+           (vf->driver_caps & I40E_VIRTCHNL_VF_OFFLOAD_IWARP)) {
+               vfres->vf_offload_flags |= I40E_VIRTCHNL_VF_OFFLOAD_IWARP;
+               set_bit(I40E_VF_STAT_IWARPENA, &vf->vf_states);
+       }
        if (pf->flags & I40E_FLAG_RSS_AQ_CAPABLE) {
                if (vf->driver_caps & I40E_VIRTCHNL_VF_OFFLOAD_RSS_AQ)
                        vfres->vf_offload_flags |=
                vfres->vf_offload_flags |= I40E_VIRTCHNL_VF_OFFLOAD_RSS_REG;
        }
  
 +      if (pf->flags & I40E_FLAG_MULTIPLE_TCP_UDP_RSS_PCTYPE) {
 +              if (vf->driver_caps & I40E_VIRTCHNL_VF_OFFLOAD_RSS_PCTYPE_V2)
 +                      vfres->vf_offload_flags |=
 +                              I40E_VIRTCHNL_VF_OFFLOAD_RSS_PCTYPE_V2;
 +      }
 +
        if (vf->driver_caps & I40E_VIRTCHNL_VF_OFFLOAD_RX_POLLING)
                vfres->vf_offload_flags |= I40E_VIRTCHNL_VF_OFFLOAD_RX_POLLING;
  
 +      if (pf->flags & I40E_FLAG_WB_ON_ITR_CAPABLE) {
 +              if (vf->driver_caps & I40E_VIRTCHNL_VF_OFFLOAD_WB_ON_ITR)
 +                      vfres->vf_offload_flags |=
 +                                      I40E_VIRTCHNL_VF_OFFLOAD_WB_ON_ITR;
 +      }
 +
        vfres->num_vsis = num_vsis;
        vfres->num_queue_pairs = vf->num_queue_pairs;
        vfres->max_vectors = pf->hw.func_caps.num_msix_vectors_vf;
@@@ -1826,6 -1949,72 +1962,72 @@@ error_param
        return i40e_vc_send_resp_to_vf(vf, I40E_VIRTCHNL_OP_DEL_VLAN, aq_ret);
  }
  
+ /**
+  * i40e_vc_iwarp_msg
+  * @vf: pointer to the VF info
+  * @msg: pointer to the msg buffer
+  * @msglen: msg length
+  *
+  * called from the VF for the iwarp msgs
+  **/
+ static int i40e_vc_iwarp_msg(struct i40e_vf *vf, u8 *msg, u16 msglen)
+ {
+       struct i40e_pf *pf = vf->pf;
+       int abs_vf_id = vf->vf_id + pf->hw.func_caps.vf_base_id;
+       i40e_status aq_ret = 0;
+       if (!test_bit(I40E_VF_STAT_ACTIVE, &vf->vf_states) ||
+           !test_bit(I40E_VF_STAT_IWARPENA, &vf->vf_states)) {
+               aq_ret = I40E_ERR_PARAM;
+               goto error_param;
+       }
+       i40e_notify_client_of_vf_msg(pf->vsi[pf->lan_vsi], abs_vf_id,
+                                    msg, msglen);
+ error_param:
+       /* send the response to the VF */
+       return i40e_vc_send_resp_to_vf(vf, I40E_VIRTCHNL_OP_IWARP,
+                                      aq_ret);
+ }
+
+ /**
+  * i40e_vc_iwarp_qvmap_msg
+  * @vf: pointer to the VF info
+  * @msg: pointer to the msg buffer
+  * @msglen: msg length
+  * @config: config qvmap or release it
+  *
+  * called from the VF for the iwarp msgs
+  **/
+ static int i40e_vc_iwarp_qvmap_msg(struct i40e_vf *vf, u8 *msg, u16 msglen,
+                                  bool config)
+ {
+       struct i40e_virtchnl_iwarp_qvlist_info *qvlist_info =
+                               (struct i40e_virtchnl_iwarp_qvlist_info *)msg;
+       i40e_status aq_ret = 0;
+       if (!test_bit(I40E_VF_STAT_ACTIVE, &vf->vf_states) ||
+           !test_bit(I40E_VF_STAT_IWARPENA, &vf->vf_states)) {
+               aq_ret = I40E_ERR_PARAM;
+               goto error_param;
+       }
+       if (config) {
+               if (i40e_config_iwarp_qvlist(vf, qvlist_info))
+                       aq_ret = I40E_ERR_PARAM;
+       } else {
+               i40e_release_iwarp_qvlist(vf);
+       }
+ error_param:
+       /* send the response to the VF */
+       return i40e_vc_send_resp_to_vf(vf,
+                              config ? I40E_VIRTCHNL_OP_CONFIG_IWARP_IRQ_MAP :
+                              I40E_VIRTCHNL_OP_RELEASE_IWARP_IRQ_MAP,
+                              aq_ret);
+ }
+
  /**
   * i40e_vc_validate_vf_msg
   * @vf: pointer to the VF info
@@@ -1921,6 -2110,32 +2123,32 @@@ static int i40e_vc_validate_vf_msg(stru
        case I40E_VIRTCHNL_OP_GET_STATS:
                valid_len = sizeof(struct i40e_virtchnl_queue_select);
                break;
+       case I40E_VIRTCHNL_OP_IWARP:
+               /* These messages are opaque to us and will be validated in
+                * the RDMA client code. We just need to check for nonzero
+                * length. The firmware will enforce max length restrictions.
+                */
+               if (msglen)
+                       valid_len = msglen;
+               else
+                       err_msg_format = true;
+               break;
+       case I40E_VIRTCHNL_OP_RELEASE_IWARP_IRQ_MAP:
+               valid_len = 0;
+               break;
+       case I40E_VIRTCHNL_OP_CONFIG_IWARP_IRQ_MAP:
+               valid_len = sizeof(struct i40e_virtchnl_iwarp_qvlist_info);
+               if (msglen >= valid_len) {
+                       struct i40e_virtchnl_iwarp_qvlist_info *qv =
+                               (struct i40e_virtchnl_iwarp_qvlist_info *)msg;
+                       if (qv->num_vectors == 0) {
+                               err_msg_format = true;
+                               break;
+                       }
+                       valid_len += ((qv->num_vectors - 1) *
+                               sizeof(struct i40e_virtchnl_iwarp_qv_info));
+               }
+               break;
        /* These are always errors coming from the VF. */
        case I40E_VIRTCHNL_OP_EVENT:
        case I40E_VIRTCHNL_OP_UNKNOWN:
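
/* Editor's illustrative sketch -- not part of the patch above. The
 * I40E_VIRTCHNL_OP_CONFIG_IWARP_IRQ_MAP length check earlier in this hunk
 * expects a header plus (num_vectors - 1) extra qv_info elements, because one
 * element is already counted inside the header structure. The sizes below are
 * stand-ins chosen only to show the arithmetic, not the real i40e_virtchnl
 * struct sizes.
 */
#include <stdio.h>

int main(void)
{
	const unsigned int hdr = 20;	/* stand-in for sizeof(qvlist_info) */
	const unsigned int elem = 16;	/* stand-in for sizeof(qv_info) */
	unsigned int num_vectors = 3;

	if (num_vectors == 0) {
		puts("malformed message: zero vectors");
		return 1;
	}
	/* the header already holds one element, so add num_vectors - 1 more */
	printf("expected length: %u bytes\n", hdr + (num_vectors - 1) * elem);
	return 0;
}
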
@@@ -2010,6 -2225,15 +2238,15 @@@ int i40e_vc_process_vf_msg(struct i40e_
        case I40E_VIRTCHNL_OP_GET_STATS:
                ret = i40e_vc_get_stats_msg(vf, msg, msglen);
                break;
+       case I40E_VIRTCHNL_OP_IWARP:
+               ret = i40e_vc_iwarp_msg(vf, msg, msglen);
+               break;
+       case I40E_VIRTCHNL_OP_CONFIG_IWARP_IRQ_MAP:
+               ret = i40e_vc_iwarp_qvmap_msg(vf, msg, msglen, true);
+               break;
+       case I40E_VIRTCHNL_OP_RELEASE_IWARP_IRQ_MAP:
+               ret = i40e_vc_iwarp_qvmap_msg(vf, msg, msglen, false);
+               break;
        case I40E_VIRTCHNL_OP_UNKNOWN:
        default:
                dev_err(&pf->pdev->dev, "Unsupported opcode %d from VF %d\n",
@@@ -2038,11 -2262,7 +2275,11 @@@ int i40e_vc_process_vflr_event(struct i
        if (!test_bit(__I40E_VFLR_EVENT_PENDING, &pf->state))
                return 0;
  
 -      /* re-enable vflr interrupt cause */
 +      /* Re-enable the VFLR interrupt cause here, before looking for which
 +       * VF got reset. Otherwise, if another VF gets a reset while the
 +       * first one is being processed, that interrupt will be lost, and
 +       * that VF will be stuck in reset forever.
 +       */
        reg = rd32(hw, I40E_PFINT_ICR0_ENA);
        reg |= I40E_PFINT_ICR0_ENA_VFLR_MASK;
        wr32(hw, I40E_PFINT_ICR0_ENA, reg);
@@@ -2203,8 -2423,6 +2440,8 @@@ int i40e_ndo_set_vf_port_vlan(struct ne
                 * and then reloading the VF driver.
                 */
                i40e_vc_disable_vf(pf, vf);
 +              /* During reset the VF got a new VSI, so refresh the pointer. */
 +              vsi = pf->vsi[vf->lan_vsi_idx];
        }
  
        /* Check for condition where there was already a port VLAN ID
@@@ -2313,9 -2531,6 +2550,9 @@@ int i40e_ndo_set_vf_bw(struct net_devic
        case I40E_LINK_SPEED_40GB:
                speed = 40000;
                break;
 +      case I40E_LINK_SPEED_20GB:
 +              speed = 20000;
 +              break;
        case I40E_LINK_SPEED_10GB:
                speed = 10000;
                break;
index e74642a0c42ef0dd0889f269ab358ccd4c082914,1da4d9ac4c7ab2b7ec1082d78dcb849882b4c47f..e7b2fba0309ee4af95ef247420987ad684eb1d7c
@@@ -58,6 -58,7 +58,7 @@@ enum i40e_queue_ctrl 
  enum i40e_vf_states {
        I40E_VF_STAT_INIT = 0,
        I40E_VF_STAT_ACTIVE,
+       I40E_VF_STAT_IWARPENA,
        I40E_VF_STAT_FCOEENA,
        I40E_VF_STAT_DISABLED,
  };
@@@ -66,6 -67,7 +67,7 @@@
  enum i40e_vf_capabilities {
        I40E_VIRTCHNL_VF_CAP_PRIVILEGE = 0,
        I40E_VIRTCHNL_VF_CAP_L2,
+       I40E_VIRTCHNL_VF_CAP_IWARP,
  };
  
  /* VF information structure */
@@@ -91,8 -93,8 +93,8 @@@ struct i40e_vf 
         * When assigned, these will be non-zero, because VSI 0 is always
         * the main LAN VSI for the PF.
         */
 -      u8 lan_vsi_idx;         /* index into PF struct */
 -      u8 lan_vsi_id;          /* ID as used by firmware */
 +      u16 lan_vsi_idx;        /* index into PF struct */
 +      u16 lan_vsi_id;         /* ID as used by firmware */
  
        u8 num_queue_pairs;     /* num of qps assigned to VF vsis */
        u64 num_mdd_events;     /* num of mdd events detected */
        bool link_forced;
        bool link_up;           /* only valid if VF link is forced */
        bool spoofchk;
+       /* RDMA Client */
+       struct i40e_virtchnl_iwarp_qvlist_info *qvlist_info;
  };
  
  void i40e_free_vfs(struct i40e_pf *pf);
index 97f5114fc11394b1583ed5e6009064bfbfb0daa5,ebb4036b98e5773343e25f4bd91a1b508b702613..eb926e1ee71c259291850ca906dd1bdb3c79a753
@@@ -1,5 -1,5 +1,5 @@@
  /*
 - * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
 + * Copyright (c) 2013-2016, Mellanox Technologies. All rights reserved.
   *
   * This software is available to you under a choice of one of two
   * licenses.  You may choose to be licensed under the terms of the GNU
@@@ -407,6 -407,12 +407,12 @@@ static int mlx5_internal_err_ret_value(
  const char *mlx5_command_str(int command)
  {
        switch (command) {
+       case MLX5_CMD_OP_QUERY_HCA_VPORT_CONTEXT:
+               return "QUERY_HCA_VPORT_CONTEXT";
+       case MLX5_CMD_OP_MODIFY_HCA_VPORT_CONTEXT:
+               return "MODIFY_HCA_VPORT_CONTEXT";
        case MLX5_CMD_OP_QUERY_HCA_CAP:
                return "QUERY_HCA_CAP";
  
        case MLX5_CMD_OP_ACCESS_REG:
                return "MLX5_CMD_OP_ACCESS_REG";
  
 +      case MLX5_CMD_OP_SET_WOL_ROL:
 +              return "SET_WOL_ROL";
 +
 +      case MLX5_CMD_OP_QUERY_WOL_ROL:
 +              return "QUERY_WOL_ROL";
 +
 +      case MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT:
 +              return "ADD_VXLAN_UDP_DPORT";
 +
 +      case MLX5_CMD_OP_DELETE_VXLAN_UDP_DPORT:
 +              return "DELETE_VXLAN_UDP_DPORT";
 +
        default: return "unknown command opcode";
        }
  }
index 72a94e72ee250ac2a24cf2223e21f999fd8e5185,f2354bc0ec19cbef3ce737d4ec810e30c6a4b3c7..3f3b2fae4991025a1018f4e4e4d87c88b6a30f1a
@@@ -341,8 -341,9 +341,9 @@@ static u16 to_fw_pkey_sz(u32 size
        }
  }
  
- int mlx5_core_get_caps(struct mlx5_core_dev *dev, enum mlx5_cap_type cap_type,
-                      enum mlx5_cap_mode cap_mode)
+ static int mlx5_core_get_caps_mode(struct mlx5_core_dev *dev,
+                                  enum mlx5_cap_type cap_type,
+                                  enum mlx5_cap_mode cap_mode)
  {
        u8 in[MLX5_ST_SZ_BYTES(query_hca_cap_in)];
        int out_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out);
@@@ -392,6 -393,16 +393,16 @@@ query_ex
        return err;
  }
  
+ int mlx5_core_get_caps(struct mlx5_core_dev *dev, enum mlx5_cap_type cap_type)
+ {
+       int ret;
+       ret = mlx5_core_get_caps_mode(dev, cap_type, HCA_CAP_OPMOD_GET_CUR);
+       if (ret)
+               return ret;
+       return mlx5_core_get_caps_mode(dev, cap_type, HCA_CAP_OPMOD_GET_MAX);
+ }
+
  static int set_caps(struct mlx5_core_dev *dev, void *in, int in_sz, int opmod)
  {
        u32 out[MLX5_ST_SZ_DW(set_hca_cap_out)];
@@@ -419,8 -430,7 +430,7 @@@ static int handle_hca_cap_atomic(struc
        int err;
  
        if (MLX5_CAP_GEN(dev, atomic)) {
-               err = mlx5_core_get_caps(dev, MLX5_CAP_ATOMIC,
-                                        HCA_CAP_OPMOD_GET_CUR);
+               err = mlx5_core_get_caps(dev, MLX5_CAP_ATOMIC);
                if (err)
                        return err;
        } else {
@@@ -462,11 -472,7 +472,7 @@@ static int handle_hca_cap(struct mlx5_c
        if (!set_ctx)
                goto query_ex;
  
-       err = mlx5_core_get_caps(dev, MLX5_CAP_GENERAL, HCA_CAP_OPMOD_GET_MAX);
-       if (err)
-               goto query_ex;
-       err = mlx5_core_get_caps(dev, MLX5_CAP_GENERAL, HCA_CAP_OPMOD_GET_CUR);
+       err = mlx5_core_get_caps(dev, MLX5_CAP_GENERAL);
        if (err)
                goto query_ex;
  
@@@ -767,6 -773,22 +773,6 @@@ static int mlx5_core_set_issi(struct ml
        return -ENOTSUPP;
  }
  
 -static int map_bf_area(struct mlx5_core_dev *dev)
 -{
 -      resource_size_t bf_start = pci_resource_start(dev->pdev, 0);
 -      resource_size_t bf_len = pci_resource_len(dev->pdev, 0);
 -
 -      dev->priv.bf_mapping = io_mapping_create_wc(bf_start, bf_len);
 -
 -      return dev->priv.bf_mapping ? 0 : -ENOMEM;
 -}
 -
 -static void unmap_bf_area(struct mlx5_core_dev *dev)
 -{
 -      if (dev->priv.bf_mapping)
 -              io_mapping_free(dev->priv.bf_mapping);
 -}
 -
  static void mlx5_add_device(struct mlx5_interface *intf, struct mlx5_priv *priv)
  {
        struct mlx5_device_context *dev_ctx;
@@@ -1087,9 -1109,14 +1093,9 @@@ static int mlx5_load_one(struct mlx5_co
                goto err_stop_eqs;
        }
  
 -      if (map_bf_area(dev))
 -              dev_err(&pdev->dev, "Failed to map blue flame area\n");
 -
        err = mlx5_irq_set_affinity_hints(dev);
 -      if (err) {
 +      if (err)
                dev_err(&pdev->dev, "Failed to alloc affinity hint cpumask\n");
 -              goto err_unmap_bf_area;
 -      }
  
        MLX5_INIT_DOORBELL_LOCK(&priv->cq_uar_lock);
  
@@@ -1148,6 -1175,10 +1154,6 @@@ err_fs
        mlx5_cleanup_qp_table(dev);
        mlx5_cleanup_cq_table(dev);
        mlx5_irq_clear_affinity_hints(dev);
 -
 -err_unmap_bf_area:
 -      unmap_bf_area(dev);
 -
        free_comp_eqs(dev);
  
  err_stop_eqs:
@@@ -1217,6 -1248,7 +1223,6 @@@ static int mlx5_unload_one(struct mlx5_
        mlx5_cleanup_qp_table(dev);
        mlx5_cleanup_cq_table(dev);
        mlx5_irq_clear_affinity_hints(dev);
 -      unmap_bf_area(dev);
        free_comp_eqs(dev);
        mlx5_stop_eqs(dev);
        mlx5_free_uuars(dev, &priv->uuari);
index 46a1830b509b66a6578cdd99fbbddad68daad371,c29860c05ed441c21a6c085668cceadf19023859..16eb653903e0b873909f3cf3175cf597fae88ea8
@@@ -1,12 -1,11 +1,11 @@@
  /*
+  * Copyright(c) 2015, 2016 Intel Corporation.
   *
   * This file is provided under a dual BSD/GPLv2 license.  When using or
   * redistributing this file, you may do so under either license.
   *
   * GPL LICENSE SUMMARY
   *
-  * Copyright(c) 2015 Intel Corporation.
-  *
   * This program is free software; you can redistribute it and/or modify
   * it under the terms of version 2 of the GNU General Public License as
   * published by the Free Software Foundation.
@@@ -18,8 -17,6 +17,6 @@@
   *
   * BSD LICENSE
   *
-  * Copyright(c) 2015 Intel Corporation.
-  *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
@@@ -64,6 -61,8 +61,8 @@@
  #include "sdma.h"
  #include "eprom.h"
  #include "efivar.h"
+ #include "platform.h"
+ #include "aspm.h"
  
  #define NUM_IB_PORTS 1
  
@@@ -420,10 -419,10 +419,10 @@@ static struct flag_table pio_err_status
        SEC_SPC_FREEZE,
        SEND_PIO_ERR_STATUS_PIO_STATE_MACHINE_ERR_SMASK),
  /*23*/        FLAG_ENTRY("PioWriteQwValidParity",
-       SEC_WRITE_DROPPED|SEC_SPC_FREEZE,
+       SEC_WRITE_DROPPED | SEC_SPC_FREEZE,
        SEND_PIO_ERR_STATUS_PIO_WRITE_QW_VALID_PARITY_ERR_SMASK),
  /*24*/        FLAG_ENTRY("PioBlockQwCountParity",
-       SEC_WRITE_DROPPED|SEC_SPC_FREEZE,
+       SEC_WRITE_DROPPED | SEC_SPC_FREEZE,
        SEND_PIO_ERR_STATUS_PIO_BLOCK_QW_COUNT_PARITY_ERR_SMASK),
  /*25*/        FLAG_ENTRY("PioVlfVlLenParity",
        SEC_SPC_FREEZE,
@@@ -509,6 -508,12 +508,12 @@@ static struct flag_table sdma_err_statu
                | SEND_DMA_ERR_STATUS_SDMA_CSR_PARITY_ERR_SMASK \
                | SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_UNC_ERR_SMASK)
  
+ /* SendEgressErrInfo bits that correspond to a PortXmitDiscard counter */
+ #define PORT_DISCARD_EGRESS_ERRS \
+       (SEND_EGRESS_ERR_INFO_TOO_LONG_IB_PACKET_ERR_SMASK \
+       | SEND_EGRESS_ERR_INFO_VL_MAPPING_ERR_SMASK \
+       | SEND_EGRESS_ERR_INFO_VL_ERR_SMASK)
+
  /*
   * TXE Egress Error flags
   */
@@@ -936,7 -941,7 +941,7 @@@ static struct flag_table dc8051_err_fla
        FLAG_ENTRY0("IRAM_MBE", D8E(IRAM_MBE)),
        FLAG_ENTRY0("IRAM_SBE", D8E(IRAM_SBE)),
        FLAG_ENTRY0("UNMATCHED_SECURE_MSG_ACROSS_BCC_LANES",
-               D8E(UNMATCHED_SECURE_MSG_ACROSS_BCC_LANES)),
+                   D8E(UNMATCHED_SECURE_MSG_ACROSS_BCC_LANES)),
        FLAG_ENTRY0("INVALID_CSR_ADDR", D8E(INVALID_CSR_ADDR)),
  };
  
@@@ -950,7 -955,7 +955,7 @@@ static struct flag_table dc8051_info_er
        FLAG_ENTRY0("Unknown frame received",  UNKNOWN_FRAME),
        FLAG_ENTRY0("Target BER not met",      TARGET_BER_NOT_MET),
        FLAG_ENTRY0("Serdes internal loopback failure",
-                                       FAILED_SERDES_INTERNAL_LOOPBACK),
+                   FAILED_SERDES_INTERNAL_LOOPBACK),
        FLAG_ENTRY0("Failed SerDes init",      FAILED_SERDES_INIT),
        FLAG_ENTRY0("Failed LNI(Polling)",     FAILED_LNI_POLLING),
        FLAG_ENTRY0("Failed LNI(Debounce)",    FAILED_LNI_DEBOUNCE),
        FLAG_ENTRY0("Failed LNI(OptEq)",       FAILED_LNI_OPTEQ),
        FLAG_ENTRY0("Failed LNI(VerifyCap_1)", FAILED_LNI_VERIFY_CAP1),
        FLAG_ENTRY0("Failed LNI(VerifyCap_2)", FAILED_LNI_VERIFY_CAP2),
-       FLAG_ENTRY0("Failed LNI(ConfigLT)",    FAILED_LNI_CONFIGLT)
+       FLAG_ENTRY0("Failed LNI(ConfigLT)",    FAILED_LNI_CONFIGLT),
+       FLAG_ENTRY0("Host Handshake Timeout",  HOST_HANDSHAKE_TIMEOUT)
  };
  
  /*
@@@ -978,7 -984,6 +984,6 @@@ static struct flag_table dc8051_info_ho
        FLAG_ENTRY0("Link going down", 0x0100),
  };
  
  static u32 encoded_size(u32 size);
  static u32 chip_to_opa_lstate(struct hfi1_devdata *dd, u32 chip_lstate);
  static int set_physical_link_state(struct hfi1_devdata *dd, u64 state);
@@@ -1140,11 -1145,8 +1145,8 @@@ struct cntr_entry 
        /*
         * accessor for stat element, context either dd or ppd
         */
-       u64 (*rw_cntr)(const struct cntr_entry *,
-                              void *context,
-                              int vl,
-                              int mode,
-                              u64 data);
+       u64 (*rw_cntr)(const struct cntr_entry *, void *context, int vl,
+                      int mode, u64 data);
  };
  
  #define C_RCV_HDR_OVF_FIRST C_RCV_HDR_OVF_0
@@@ -1188,7 -1190,7 +1190,7 @@@ CNTR_ELEM(#name, 
  #define OVR_LBL(ctx) C_RCV_HDR_OVF_ ## ctx
  #define OVR_ELM(ctx) \
  CNTR_ELEM("RcvHdrOvr" #ctx, \
-         (RCV_HDR_OVFL_CNT + ctx*0x100), \
+         (RCV_HDR_OVFL_CNT + ctx * 0x100), \
          0, CNTR_NORMAL, port_access_u64_csr)
  
  /* 32bit TXE */
@@@ -1250,8 -1252,11 +1252,8 @@@ CNTR_ELEM(#name, 
  
  u64 read_csr(const struct hfi1_devdata *dd, u32 offset)
  {
 -      u64 val;
 -
        if (dd->flags & HFI1_PRESENT) {
 -              val = readq((void __iomem *)dd->kregbase + offset);
 -              return val;
 +              return readq((void __iomem *)dd->kregbase + offset);
        }
        return -1;
  }
@@@ -1274,7 -1279,6 +1276,6 @@@ static inline u64 read_write_csr(const 
  {
        u64 ret;
  
        if (mode == CNTR_MODE_R) {
                ret = read_csr(dd, csr);
        } else if (mode == CNTR_MODE_W) {
  
  /* Dev Access */
  static u64 dev_access_u32_csr(const struct cntr_entry *entry,
-                           void *context, int vl, int mode, u64 data)
+                             void *context, int vl, int mode, u64 data)
  {
        struct hfi1_devdata *dd = context;
+       u64 csr = entry->csr;
  
-       if (vl != CNTR_INVALID_VL)
-               return 0;
-       return read_write_csr(dd, entry->csr, mode, data);
+       if (entry->flags & CNTR_SDMA) {
+               if (vl == CNTR_INVALID_VL)
+                       return 0;
+               csr += 0x100 * vl;
+       } else {
+               if (vl != CNTR_INVALID_VL)
+                       return 0;
+       }
+       return read_write_csr(dd, csr, mode, data);
+ }
+
+ static u64 access_sde_err_cnt(const struct cntr_entry *entry,
+                             void *context, int idx, int mode, u64 data)
+ {
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+       if (dd->per_sdma && idx < dd->num_sdma)
+               return dd->per_sdma[idx].err_cnt;
+       return 0;
+ }
+
+ static u64 access_sde_int_cnt(const struct cntr_entry *entry,
+                             void *context, int idx, int mode, u64 data)
+ {
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+       if (dd->per_sdma && idx < dd->num_sdma)
+               return dd->per_sdma[idx].sdma_int_cnt;
+       return 0;
+ }
+
+ static u64 access_sde_idle_int_cnt(const struct cntr_entry *entry,
+                                  void *context, int idx, int mode, u64 data)
+ {
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+       if (dd->per_sdma && idx < dd->num_sdma)
+               return dd->per_sdma[idx].idle_int_cnt;
+       return 0;
+ }
+
+ static u64 access_sde_progress_int_cnt(const struct cntr_entry *entry,
+                                      void *context, int idx, int mode,
+                                      u64 data)
+ {
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+       if (dd->per_sdma && idx < dd->num_sdma)
+               return dd->per_sdma[idx].progress_int_cnt;
+       return 0;
  }
  
  static u64 dev_access_u64_csr(const struct cntr_entry *entry, void *context,
-                           int vl, int mode, u64 data)
+                             int vl, int mode, u64 data)
  {
        struct hfi1_devdata *dd = context;
  
  }
  
  static u64 dc_access_lcb_cntr(const struct cntr_entry *entry, void *context,
-                           int vl, int mode, u64 data)
+                             int vl, int mode, u64 data)
  {
        struct hfi1_devdata *dd = context;
        u32 csr = entry->csr;
  
  /* Port Access */
  static u64 port_access_u32_csr(const struct cntr_entry *entry, void *context,
-                            int vl, int mode, u64 data)
+                              int vl, int mode, u64 data)
  {
        struct hfi1_pportdata *ppd = context;
  
  }
  
  static u64 port_access_u64_csr(const struct cntr_entry *entry,
-                            void *context, int vl, int mode, u64 data)
+                              void *context, int vl, int mode, u64 data)
  {
        struct hfi1_pportdata *ppd = context;
        u64 val;
@@@ -1396,7 -1448,7 +1445,7 @@@ static inline u64 read_write_sw(struct 
  }
  
  static u64 access_sw_link_dn_cnt(const struct cntr_entry *entry, void *context,
-                              int vl, int mode, u64 data)
+                                int vl, int mode, u64 data)
  {
        struct hfi1_pportdata *ppd = context;
  
  }
  
  static u64 access_sw_link_up_cnt(const struct cntr_entry *entry, void *context,
-                              int vl, int mode, u64 data)
+                                int vl, int mode, u64 data)
  {
        struct hfi1_pportdata *ppd = context;
  
@@@ -1427,18 -1479,25 +1476,25 @@@ static u64 access_sw_unknown_frame_cnt(
  }
  
  static u64 access_sw_xmit_discards(const struct cntr_entry *entry,
-                                   void *context, int vl, int mode, u64 data)
+                                  void *context, int vl, int mode, u64 data)
  {
-       struct hfi1_pportdata *ppd = context;
+       struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;
+       u64 zero = 0;
+       u64 *counter;
  
-       if (vl != CNTR_INVALID_VL)
-               return 0;
+       if (vl == CNTR_INVALID_VL)
+               counter = &ppd->port_xmit_discards;
+       else if (vl >= 0 && vl < C_VL_COUNT)
+               counter = &ppd->port_xmit_discards_vl[vl];
+       else
+               counter = &zero;
  
-       return read_write_sw(ppd->dd, &ppd->port_xmit_discards, mode, data);
+       return read_write_sw(ppd->dd, counter, mode, data);
  }
  
  static u64 access_xmit_constraint_errs(const struct cntr_entry *entry,
-                                    void *context, int vl, int mode, u64 data)
+                                      void *context, int vl, int mode,
+                                      u64 data)
  {
        struct hfi1_pportdata *ppd = context;
  
  }
  
  static u64 access_rcv_constraint_errs(const struct cntr_entry *entry,
-                                    void *context, int vl, int mode, u64 data)
+                                     void *context, int vl, int mode, u64 data)
  {
        struct hfi1_pportdata *ppd = context;
  
@@@ -1475,7 -1534,6 +1531,6 @@@ static u64 read_write_cpu(struct hfi1_d
                          u64 __percpu *cntr,
                          int vl, int mode, u64 data)
  {
        u64 ret = 0;
  
        if (vl != CNTR_INVALID_VL)
@@@ -1507,7 -1565,7 +1562,7 @@@ static u64 access_sw_cpu_intr(const str
  }
  
  static u64 access_sw_cpu_rcv_limit(const struct cntr_entry *entry,
-                             void *context, int vl, int mode, u64 data)
+                                  void *context, int vl, int mode, u64 data)
  {
        struct hfi1_devdata *dd = context;
  
@@@ -1523,6 -1581,14 +1578,14 @@@ static u64 access_sw_pio_wait(const str
        return dd->verbs_dev.n_piowait;
  }
  
+ static u64 access_sw_pio_drain(const struct cntr_entry *entry,
+                              void *context, int vl, int mode, u64 data)
+ {
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+       return dd->verbs_dev.n_piodrain;
+ }
+
  static u64 access_sw_vtx_wait(const struct cntr_entry *entry,
                              void *context, int vl, int mode, u64 data)
  {
@@@ -1540,11 -1606,12 +1603,12 @@@ static u64 access_sw_kmem_wait(const st
  }
  
  static u64 access_sw_send_schedule(const struct cntr_entry *entry,
-                              void *context, int vl, int mode, u64 data)
+                                  void *context, int vl, int mode, u64 data)
  {
        struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
  
-       return dd->verbs_dev.n_send_schedule;
+       return read_write_cpu(dd, &dd->z_send_schedule, dd->send_schedule, vl,
+                             mode, data);
  }
  
  /* Software counters for the error status bits within MISC_ERR_STATUS */
@@@ -3882,8 -3949,8 +3946,8 @@@ static u64 access_sw_cpu_##cntr(const s
                              void *context, int vl, int mode, u64 data)      \
  {                                                                           \
        struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;        \
-       return read_write_cpu(ppd->dd, &ppd->ibport_data.z_ ##cntr,           \
-                             ppd->ibport_data.cntr, vl,                      \
+       return read_write_cpu(ppd->dd, &ppd->ibport_data.rvp.z_ ##cntr,       \
+                             ppd->ibport_data.rvp.cntr, vl,                  \
                              mode, data);                                    \
  }
  
@@@ -3900,7 -3967,7 +3964,7 @@@ static u64 access_ibp_##cntr(const stru
        if (vl != CNTR_INVALID_VL)                                            \
                return 0;                                                     \
                                                                              \
-       return read_write_sw(ppd->dd, &ppd->ibport_data.n_ ##cntr,            \
+       return read_write_sw(ppd->dd, &ppd->ibport_data.rvp.n_ ##cntr,        \
                             mode, data);                                     \
  }
  
@@@ -4063,10 -4130,28 +4127,28 @@@ static struct cntr_entry dev_cntrs[DEV_
                            access_sw_vtx_wait),
  [C_SW_PIO_WAIT] = CNTR_ELEM("PioWait", 0, 0, CNTR_NORMAL,
                            access_sw_pio_wait),
+ [C_SW_PIO_DRAIN] = CNTR_ELEM("PioDrain", 0, 0, CNTR_NORMAL,
+                           access_sw_pio_drain),
  [C_SW_KMEM_WAIT] = CNTR_ELEM("KmemWait", 0, 0, CNTR_NORMAL,
                            access_sw_kmem_wait),
  [C_SW_SEND_SCHED] = CNTR_ELEM("SendSched", 0, 0, CNTR_NORMAL,
                            access_sw_send_schedule),
+ [C_SDMA_DESC_FETCHED_CNT] = CNTR_ELEM("SDEDscFdCn",
+                                     SEND_DMA_DESC_FETCHED_CNT, 0,
+                                     CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA,
+                                     dev_access_u32_csr),
+ [C_SDMA_INT_CNT] = CNTR_ELEM("SDMAInt", 0, 0,
+                            CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA,
+                            access_sde_int_cnt),
+ [C_SDMA_ERR_CNT] = CNTR_ELEM("SDMAErrCt", 0, 0,
+                            CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA,
+                            access_sde_err_cnt),
+ [C_SDMA_IDLE_INT_CNT] = CNTR_ELEM("SDMAIdInt", 0, 0,
+                                 CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA,
+                                 access_sde_idle_int_cnt),
+ [C_SDMA_PROGRESS_INT_CNT] = CNTR_ELEM("SDMAPrIntCn", 0, 0,
+                                     CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA,
+                                     access_sde_progress_int_cnt),
  /* MISC_ERR_STATUS */
  [C_MISC_PLL_LOCK_FAIL_ERR] = CNTR_ELEM("MISC_PLL_LOCK_FAIL_ERR", 0, 0,
                                CNTR_NORMAL,
@@@ -4876,28 -4961,28 +4958,28 @@@ static struct cntr_entry port_cntrs[POR
  [C_TX_WORDS] = TXE64_PORT_CNTR_ELEM(TxWords, SEND_DWORD_CNT, CNTR_NORMAL),
  [C_TX_WAIT] = TXE64_PORT_CNTR_ELEM(TxWait, SEND_WAIT_CNT, CNTR_SYNTH),
  [C_TX_FLIT_VL] = TXE64_PORT_CNTR_ELEM(TxFlitVL, SEND_DATA_VL0_CNT,
-                       CNTR_SYNTH | CNTR_VL),
+                                     CNTR_SYNTH | CNTR_VL),
  [C_TX_PKT_VL] = TXE64_PORT_CNTR_ELEM(TxPktVL, SEND_DATA_PKT_VL0_CNT,
-                       CNTR_SYNTH | CNTR_VL),
+                                    CNTR_SYNTH | CNTR_VL),
  [C_TX_WAIT_VL] = TXE64_PORT_CNTR_ELEM(TxWaitVL, SEND_WAIT_VL0_CNT,
-                       CNTR_SYNTH | CNTR_VL),
+                                     CNTR_SYNTH | CNTR_VL),
  [C_RX_PKT] = RXE64_PORT_CNTR_ELEM(RxPkt, RCV_DATA_PKT_CNT, CNTR_NORMAL),
  [C_RX_WORDS] = RXE64_PORT_CNTR_ELEM(RxWords, RCV_DWORD_CNT, CNTR_NORMAL),
  [C_SW_LINK_DOWN] = CNTR_ELEM("SwLinkDown", 0, 0, CNTR_SYNTH | CNTR_32BIT,
-                       access_sw_link_dn_cnt),
+                            access_sw_link_dn_cnt),
  [C_SW_LINK_UP] = CNTR_ELEM("SwLinkUp", 0, 0, CNTR_SYNTH | CNTR_32BIT,
-                       access_sw_link_up_cnt),
+                          access_sw_link_up_cnt),
  [C_SW_UNKNOWN_FRAME] = CNTR_ELEM("UnknownFrame", 0, 0, CNTR_NORMAL,
                                 access_sw_unknown_frame_cnt),
  [C_SW_XMIT_DSCD] = CNTR_ELEM("XmitDscd", 0, 0, CNTR_SYNTH | CNTR_32BIT,
-                       access_sw_xmit_discards),
+                            access_sw_xmit_discards),
  [C_SW_XMIT_DSCD_VL] = CNTR_ELEM("XmitDscdVl", 0, 0,
-                       CNTR_SYNTH | CNTR_32BIT | CNTR_VL,
-                       access_sw_xmit_discards),
+                               CNTR_SYNTH | CNTR_32BIT | CNTR_VL,
+                               access_sw_xmit_discards),
  [C_SW_XMIT_CSTR_ERR] = CNTR_ELEM("XmitCstrErr", 0, 0, CNTR_SYNTH,
-                       access_xmit_constraint_errs),
+                                access_xmit_constraint_errs),
  [C_SW_RCV_CSTR_ERR] = CNTR_ELEM("RcvCstrErr", 0, 0, CNTR_SYNTH,
-                       access_rcv_constraint_errs),
+                               access_rcv_constraint_errs),
  [C_SW_IBP_LOOP_PKTS] = SW_IBP_CNTR(LoopPkts, loop_pkts),
  [C_SW_IBP_RC_RESENDS] = SW_IBP_CNTR(RcResend, rc_resends),
  [C_SW_IBP_RNR_NAKS] = SW_IBP_CNTR(RnrNak, rnr_naks),
  [C_SW_CPU_RC_ACKS] = CNTR_ELEM("RcAcks", 0, 0, CNTR_NORMAL,
                               access_sw_cpu_rc_acks),
  [C_SW_CPU_RC_QACKS] = CNTR_ELEM("RcQacks", 0, 0, CNTR_NORMAL,
-                              access_sw_cpu_rc_qacks),
+                               access_sw_cpu_rc_qacks),
  [C_SW_CPU_RC_DELAYED_COMP] = CNTR_ELEM("RcDelayComp", 0, 0, CNTR_NORMAL,
-                              access_sw_cpu_rc_delayed_comp),
+                                      access_sw_cpu_rc_delayed_comp),
  [OVR_LBL(0)] = OVR_ELM(0), [OVR_LBL(1)] = OVR_ELM(1),
  [OVR_LBL(2)] = OVR_ELM(2), [OVR_LBL(3)] = OVR_ELM(3),
  [OVR_LBL(4)] = OVR_ELM(4), [OVR_LBL(5)] = OVR_ELM(5),
@@@ -5064,7 -5149,7 +5146,7 @@@ done
   * the buffer.  End in '*' if the buffer is too short.
   */
  static char *flag_string(char *buf, int buf_len, u64 flags,
-                               struct flag_table *table, int table_size)
+                        struct flag_table *table, int table_size)
  {
        char extra[32];
        char *p = buf;
@@@ -5125,10 -5210,8 +5207,8 @@@ static char *is_misc_err_name(char *buf
        if (source < ARRAY_SIZE(cce_misc_names))
                strncpy(buf, cce_misc_names[source], bsize);
        else
-               snprintf(buf,
-                       bsize,
-                       "Reserved%u",
-                       source + IS_GENERAL_ERR_START);
+               snprintf(buf, bsize, "Reserved%u",
+                        source + IS_GENERAL_ERR_START);
  
        return buf;
  }
@@@ -5167,7 -5250,7 +5247,7 @@@ static char *is_various_name(char *buf
        if (source < ARRAY_SIZE(various_names))
                strncpy(buf, various_names[source], bsize);
        else
-               snprintf(buf, bsize, "Reserved%u", source+IS_VARIOUS_START);
+               snprintf(buf, bsize, "Reserved%u", source + IS_VARIOUS_START);
        return buf;
  }
  
@@@ -5252,51 -5335,56 +5332,56 @@@ static char *is_reserved_name(char *buf
  static char *cce_err_status_string(char *buf, int buf_len, u64 flags)
  {
        return flag_string(buf, buf_len, flags,
-                       cce_err_status_flags, ARRAY_SIZE(cce_err_status_flags));
+                          cce_err_status_flags,
+                          ARRAY_SIZE(cce_err_status_flags));
  }
  
  static char *rxe_err_status_string(char *buf, int buf_len, u64 flags)
  {
        return flag_string(buf, buf_len, flags,
-                       rxe_err_status_flags, ARRAY_SIZE(rxe_err_status_flags));
+                          rxe_err_status_flags,
+                          ARRAY_SIZE(rxe_err_status_flags));
  }
  
  static char *misc_err_status_string(char *buf, int buf_len, u64 flags)
  {
        return flag_string(buf, buf_len, flags, misc_err_status_flags,
-                       ARRAY_SIZE(misc_err_status_flags));
+                          ARRAY_SIZE(misc_err_status_flags));
  }
  
  static char *pio_err_status_string(char *buf, int buf_len, u64 flags)
  {
        return flag_string(buf, buf_len, flags,
-                       pio_err_status_flags, ARRAY_SIZE(pio_err_status_flags));
+                          pio_err_status_flags,
+                          ARRAY_SIZE(pio_err_status_flags));
  }
  
  static char *sdma_err_status_string(char *buf, int buf_len, u64 flags)
  {
        return flag_string(buf, buf_len, flags,
-                       sdma_err_status_flags,
-                       ARRAY_SIZE(sdma_err_status_flags));
+                          sdma_err_status_flags,
+                          ARRAY_SIZE(sdma_err_status_flags));
  }
  
  static char *egress_err_status_string(char *buf, int buf_len, u64 flags)
  {
        return flag_string(buf, buf_len, flags,
-               egress_err_status_flags, ARRAY_SIZE(egress_err_status_flags));
+                          egress_err_status_flags,
+                          ARRAY_SIZE(egress_err_status_flags));
  }
  
  static char *egress_err_info_string(char *buf, int buf_len, u64 flags)
  {
        return flag_string(buf, buf_len, flags,
-               egress_err_info_flags, ARRAY_SIZE(egress_err_info_flags));
+                          egress_err_info_flags,
+                          ARRAY_SIZE(egress_err_info_flags));
  }
  
  static char *send_err_status_string(char *buf, int buf_len, u64 flags)
  {
        return flag_string(buf, buf_len, flags,
-                       send_err_status_flags,
-                       ARRAY_SIZE(send_err_status_flags));
+                          send_err_status_flags,
+                          ARRAY_SIZE(send_err_status_flags));
  }
  
  static void handle_cce_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
         * report or record it.
         */
        dd_dev_info(dd, "CCE Error: %s\n",
-               cce_err_status_string(buf, sizeof(buf), reg));
+                   cce_err_status_string(buf, sizeof(buf), reg));
  
        if ((reg & CCE_ERR_STATUS_CCE_CLI2_ASYNC_FIFO_PARITY_ERR_SMASK) &&
            is_ax(dd) && (dd->icode != ICODE_FUNCTIONAL_SIMULATOR)) {
@@@ -5339,14 -5427,14 +5424,14 @@@ static void update_rcverr_timer(unsigne
        u32 cur_ovfl_cnt = read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL);
  
        if (dd->rcv_ovfl_cnt < cur_ovfl_cnt &&
-               ppd->port_error_action & OPA_PI_MASK_EX_BUFFER_OVERRUN) {
+           ppd->port_error_action & OPA_PI_MASK_EX_BUFFER_OVERRUN) {
                dd_dev_info(dd, "%s: PortErrorAction bounce\n", __func__);
-               set_link_down_reason(ppd,
-                 OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN, 0,
-                       OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN);
+               set_link_down_reason(
+               ppd, OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN, 0,
+               OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN);
                queue_work(ppd->hfi1_wq, &ppd->link_bounce_work);
        }
-       dd->rcv_ovfl_cnt = (u32) cur_ovfl_cnt;
+       dd->rcv_ovfl_cnt = (u32)cur_ovfl_cnt;
  
        mod_timer(&dd->rcverr_timer, jiffies + HZ * RCVERR_CHECK_TIME);
  }
@@@ -5372,7 -5460,7 +5457,7 @@@ static void handle_rxe_err(struct hfi1_
        int i = 0;
  
        dd_dev_info(dd, "Receive Error: %s\n",
-               rxe_err_status_string(buf, sizeof(buf), reg));
+                   rxe_err_status_string(buf, sizeof(buf), reg));
  
        if (reg & ALL_RXE_FREEZE_ERR) {
                int flags = 0;
@@@ -5399,7 -5487,7 +5484,7 @@@ static void handle_misc_err(struct hfi1
        int i = 0;
  
        dd_dev_info(dd, "Misc Error: %s",
-               misc_err_status_string(buf, sizeof(buf), reg));
+                   misc_err_status_string(buf, sizeof(buf), reg));
        for (i = 0; i < NUM_MISC_ERR_STATUS_COUNTERS; i++) {
                if (reg & (1ull << i))
                        incr_cntr64(&dd->misc_err_status_cnt[i]);
@@@ -5412,7 -5500,7 +5497,7 @@@ static void handle_pio_err(struct hfi1_
        int i = 0;
  
        dd_dev_info(dd, "PIO Error: %s\n",
-               pio_err_status_string(buf, sizeof(buf), reg));
+                   pio_err_status_string(buf, sizeof(buf), reg));
  
        if (reg & ALL_PIO_FREEZE_ERR)
                start_freeze_handling(dd->pport, 0);
@@@ -5429,7 -5517,7 +5514,7 @@@ static void handle_sdma_err(struct hfi1
        int i = 0;
  
        dd_dev_info(dd, "SDMA Error: %s\n",
-               sdma_err_status_string(buf, sizeof(buf), reg));
+                   sdma_err_status_string(buf, sizeof(buf), reg));
  
        if (reg & ALL_SDMA_FREEZE_ERR)
                start_freeze_handling(dd->pport, 0);
        }
  }
  
- static void count_port_inactive(struct hfi1_devdata *dd)
+ static inline void __count_port_discards(struct hfi1_pportdata *ppd)
  {
-       struct hfi1_pportdata *ppd = dd->pport;
+       incr_cntr64(&ppd->port_xmit_discards);
+ }
  
-       if (ppd->port_xmit_discards < ~(u64)0)
-               ppd->port_xmit_discards++;
+ static void count_port_inactive(struct hfi1_devdata *dd)
+ {
+       __count_port_discards(dd->pport);
  }
  
  /*
   * egress error if more than one packet fails the same integrity check
   * since we cleared the corresponding bit in SEND_EGRESS_ERR_INFO.
   */
- static void handle_send_egress_err_info(struct hfi1_devdata *dd)
+ static void handle_send_egress_err_info(struct hfi1_devdata *dd,
+                                       int vl)
  {
        struct hfi1_pportdata *ppd = dd->pport;
        u64 src = read_csr(dd, SEND_EGRESS_ERR_SOURCE); /* read first */
        write_csr(dd, SEND_EGRESS_ERR_INFO, info);
  
        dd_dev_info(dd,
-               "Egress Error Info: 0x%llx, %s Egress Error Src 0x%llx\n",
-               info, egress_err_info_string(buf, sizeof(buf), info), src);
+                   "Egress Error Info: 0x%llx, %s Egress Error Src 0x%llx\n",
+                   info, egress_err_info_string(buf, sizeof(buf), info), src);
  
        /* Eventually add other counters for each bit */
+       if (info & PORT_DISCARD_EGRESS_ERRS) {
+               int weight, i;
  
-       if (info & SEND_EGRESS_ERR_INFO_TOO_LONG_IB_PACKET_ERR_SMASK) {
-               if (ppd->port_xmit_discards < ~(u64)0)
-                       ppd->port_xmit_discards++;
+               /*
+                * Count all applicable bits as individual errors and
+                * attribute them to the packet that triggered this handler.
+                * This may not be completely accurate due to limitations
+                * on the available hardware error information.  There is
+                * a single information register and any number of error
+                * packets may have occurred and contributed to it before
+                * this routine is called.  This means that:
+                * a) If multiple packets with the same error occur before
+                *    this routine is called, earlier packets are missed.
+                *    There is only a single bit for each error type.
+                * b) Errors may not be attributed to the correct VL.
+                *    The driver is attributing all bits in the info register
+                *    to the packet that triggered this call, but bits
+                *    could be an accumulation of different packets with
+                *    different VLs.
+                * c) A single error packet may have multiple counts attached
+                *    to it.  There is no way for the driver to know if
+                *    multiple bits set in the info register are due to a
+                *    single packet or multiple packets.  The driver assumes
+                *    multiple packets.
+                */
+               weight = hweight64(info & PORT_DISCARD_EGRESS_ERRS);
+               for (i = 0; i < weight; i++) {
+                       __count_port_discards(ppd);
+                       if (vl >= 0 && vl < TXE_NUM_DATA_VL)
+                               incr_cntr64(&ppd->port_xmit_discards_vl[vl]);
+                       else if (vl == 15)
+                               incr_cntr64(&ppd->port_xmit_discards_vl
+                                           [C_VL_15]);
+               }
        }
  }
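
/* Editor's illustrative sketch -- not part of the patch above. The loop in
 * handle_send_egress_err_info() counts every discard-related bit in the shared
 * error info register as one discard and charges it to the VL passed in.
 * hweight64() is the kernel's 64-bit population count; the compiler builtin
 * below stands in for it, and the mask and counters are made up for the demo.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t discard_mask = 0x7;	/* stand-in for PORT_DISCARD_EGRESS_ERRS */
	uint64_t info = 0x5;		/* two of the three discard bits are set */
	int vl = 2;			/* VL attributed to this interrupt */
	uint64_t port_discards = 0, vl_discards[8] = { 0 };
	int i, weight;

	weight = __builtin_popcountll(info & discard_mask);
	for (i = 0; i < weight; i++) {
		port_discards++;	/* overall PortXmitDiscard counter */
		vl_discards[vl]++;	/* per-VL PortXmitDiscard counter */
	}
	printf("port=%llu vl%d=%llu\n", (unsigned long long)port_discards,
	       vl, (unsigned long long)vl_discards[vl]);
	return 0;
}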
  
@@@ -5493,12 -5614,71 +5611,71 @@@ static inline int port_inactive_err(u6
   * Input value is a bit position within the SEND_EGRESS_ERR_STATUS
   * register. Does it represent a 'disallowed packet' error?
   */
- static inline int disallowed_pkt_err(u64 posn)
+ static inline int disallowed_pkt_err(int posn)
  {
        return (posn >= SEES(TX_SDMA0_DISALLOWED_PACKET) &&
                posn <= SEES(TX_SDMA15_DISALLOWED_PACKET));
  }
  
+ /*
+  * Input value is a bit position of one of the SDMA engine disallowed
+  * packet errors.  Return which engine.  Use of this must be guarded by
+  * disallowed_pkt_err().
+  */
+ static inline int disallowed_pkt_engine(int posn)
+ {
+       return posn - SEES(TX_SDMA0_DISALLOWED_PACKET);
+ }
+
+ /*
+  * Translate an SDMA engine to a VL.  Return -1 if the translation cannot
+  * be done.
+  */
+ static int engine_to_vl(struct hfi1_devdata *dd, int engine)
+ {
+       struct sdma_vl_map *m;
+       int vl;
+       /* range check */
+       if (engine < 0 || engine >= TXE_NUM_SDMA_ENGINES)
+               return -1;
+       rcu_read_lock();
+       m = rcu_dereference(dd->sdma_map);
+       vl = m->engine_to_vl[engine];
+       rcu_read_unlock();
+       return vl;
+ }
+
+ /*
+  * Translate the send context (software index) into a VL.  Return -1 if the
+  * translation cannot be done.
+  */
+ static int sc_to_vl(struct hfi1_devdata *dd, int sw_index)
+ {
+       struct send_context_info *sci;
+       struct send_context *sc;
+       int i;
+       sci = &dd->send_contexts[sw_index];
+       /* there is no information for user (PSM) and ack contexts */
+       if (sci->type != SC_KERNEL)
+               return -1;
+       sc = sci->sc;
+       if (!sc)
+               return -1;
+       if (dd->vld[15].sc == sc)
+               return 15;
+       for (i = 0; i < num_vls; i++)
+               if (dd->vld[i].sc == sc)
+                       return i;
+       return -1;
+ }
+
  static void handle_egress_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
  {
        u64 reg_copy = reg, handled = 0;
  
        if (reg & ALL_TXE_EGRESS_FREEZE_ERR)
                start_freeze_handling(dd->pport, 0);
-       if (is_ax(dd) && (reg &
-                   SEND_EGRESS_ERR_STATUS_TX_CREDIT_RETURN_VL_ERR_SMASK)
-                   && (dd->icode != ICODE_FUNCTIONAL_SIMULATOR))
+       else if (is_ax(dd) &&
+                (reg & SEND_EGRESS_ERR_STATUS_TX_CREDIT_RETURN_VL_ERR_SMASK) &&
+                (dd->icode != ICODE_FUNCTIONAL_SIMULATOR))
                start_freeze_handling(dd->pport, 0);
  
        while (reg_copy) {
                int posn = fls64(reg_copy);
-               /*
-                * fls64() returns a 1-based offset, but we generally
-                * want 0-based offsets.
-                */
+               /* fls64() returns a 1-based offset, we want it zero based */
                int shift = posn - 1;
+               u64 mask = 1ULL << shift;
  
                if (port_inactive_err(shift)) {
                        count_port_inactive(dd);
-                       handled |= (1ULL << shift);
+                       handled |= mask;
                } else if (disallowed_pkt_err(shift)) {
-                       handle_send_egress_err_info(dd);
-                       handled |= (1ULL << shift);
+                       int vl = engine_to_vl(dd, disallowed_pkt_engine(shift));
+                       handle_send_egress_err_info(dd, vl);
+                       handled |= mask;
                }
-               clear_bit(shift, (unsigned long *)&reg_copy);
+               reg_copy &= ~mask;
        }
  
        reg &= ~handled;
  
        if (reg)
                dd_dev_info(dd, "Egress Error: %s\n",
-                       egress_err_status_string(buf, sizeof(buf), reg));
+                           egress_err_status_string(buf, sizeof(buf), reg));
  
        for (i = 0; i < NUM_SEND_EGRESS_ERR_STATUS_COUNTERS; i++) {
                if (reg & (1ull << i))
@@@ -5548,7 -5728,7 +5725,7 @@@ static void handle_txe_err(struct hfi1_
        int i = 0;
  
        dd_dev_info(dd, "Send Error: %s\n",
-               send_err_status_string(buf, sizeof(buf), reg));
+                   send_err_status_string(buf, sizeof(buf), reg));
  
        for (i = 0; i < NUM_SEND_ERR_STATUS_COUNTERS; i++) {
                if (reg & (1ull << i))
@@@ -5594,7 -5774,7 +5771,7 @@@ static void interrupt_clear_down(struc
                        u64 mask;
  
                        dd_dev_err(dd, "Repeating %s bits 0x%llx - masking\n",
-                               eri->desc, reg);
+                                  eri->desc, reg);
                        /*
                         * Read-modify-write so any other masked bits
                         * remain masked.
@@@ -5618,14 -5798,15 +5795,15 @@@ static void is_misc_err_int(struct hfi1
                interrupt_clear_down(dd, 0, eri);
        } else {
                dd_dev_err(dd, "Unexpected misc interrupt (%u) - reserved\n",
-                       source);
+                          source);
        }
  }
  
  static char *send_context_err_status_string(char *buf, int buf_len, u64 flags)
  {
        return flag_string(buf, buf_len, flags,
-                       sc_err_status_flags, ARRAY_SIZE(sc_err_status_flags));
+                          sc_err_status_flags,
+                          ARRAY_SIZE(sc_err_status_flags));
  }
  
  /*
@@@ -5650,15 -5831,15 +5828,15 @@@ static void is_sendctxt_err_int(struct 
        sw_index = dd->hw_to_sw[hw_context];
        if (sw_index >= dd->num_send_contexts) {
                dd_dev_err(dd,
-                       "out of range sw index %u for send context %u\n",
-                       sw_index, hw_context);
+                          "out of range sw index %u for send context %u\n",
+                          sw_index, hw_context);
                return;
        }
        sci = &dd->send_contexts[sw_index];
        sc = sci->sc;
        if (!sc) {
                dd_dev_err(dd, "%s: context %u(%u): no sc?\n", __func__,
-                       sw_index, hw_context);
+                          sw_index, hw_context);
                return;
        }
  
        status = read_kctxt_csr(dd, hw_context, SEND_CTXT_ERR_STATUS);
  
        dd_dev_info(dd, "Send Context %u(%u) Error: %s\n", sw_index, hw_context,
-               send_context_err_status_string(flags, sizeof(flags), status));
+                   send_context_err_status_string(flags, sizeof(flags),
+                                                  status));
  
        if (status & SEND_CTXT_ERR_STATUS_PIO_DISALLOWED_PACKET_ERR_SMASK)
-               handle_send_egress_err_info(dd);
+               handle_send_egress_err_info(dd, sc_to_vl(dd, sw_index));
  
        /*
         * Automatically restart halted kernel contexts out of interrupt
@@@ -5704,6 -5886,7 +5883,7 @@@ static void handle_sdma_eng_err(struct 
        dd_dev_err(sde->dd, "CONFIG SDMA(%u) source: %u status 0x%llx\n",
                   sde->this_idx, source, (unsigned long long)status);
  #endif
+       sde->err_cnt++;
        sdma_engine_error(sde, status);
  
        /*
@@@ -5752,23 -5935,22 +5932,22 @@@ static void is_various_int(struct hfi1_
                interrupt_clear_down(dd, 0, eri);
        else
                dd_dev_info(dd,
-                       "%s: Unimplemented/reserved interrupt %d\n",
-                       __func__, source);
+                           "%s: Unimplemented/reserved interrupt %d\n",
+                           __func__, source);
  }
  
  static void handle_qsfp_int(struct hfi1_devdata *dd, u32 src_ctx, u64 reg)
  {
-       /* source is always zero */
+       /* src_ctx is always zero */
        struct hfi1_pportdata *ppd = dd->pport;
        unsigned long flags;
        u64 qsfp_int_mgmt = (u64)(QSFP_HFI0_INT_N | QSFP_HFI0_MODPRST_N);
  
        if (reg & QSFP_HFI0_MODPRST_N) {
-               dd_dev_info(dd, "%s: ModPresent triggered QSFP interrupt\n",
-                               __func__);
                if (!qsfp_mod_present(ppd)) {
+                       dd_dev_info(dd, "%s: QSFP module removed\n",
+                                   __func__);
                        ppd->driver_link_ready = 0;
                        /*
                         * Cable removed, reset all our information about the
                         * an interrupt when a cable is inserted
                         */
                        ppd->qsfp_info.cache_valid = 0;
-                       ppd->qsfp_info.qsfp_interrupt_functional = 0;
+                       ppd->qsfp_info.reset_needed = 0;
+                       ppd->qsfp_info.limiting_active = 0;
                        spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock,
-                                               flags);
-                       write_csr(dd,
-                                       dd->hfi1_id ?
-                                               ASIC_QSFP2_INVERT :
-                                               ASIC_QSFP1_INVERT,
-                               qsfp_int_mgmt);
+                                              flags);
+                       /* Invert the ModPresent pin now to detect plug-in */
+                       write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_INVERT :
+                                 ASIC_QSFP1_INVERT, qsfp_int_mgmt);
+                       if ((ppd->offline_disabled_reason >
+                         HFI1_ODR_MASK(
+                         OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED)) ||
+                         (ppd->offline_disabled_reason ==
+                         HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE)))
+                               ppd->offline_disabled_reason =
+                               HFI1_ODR_MASK(
+                               OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED);
                        if (ppd->host_link_state == HLS_DN_POLL) {
                                /*
                                 * The link is still in POLL. This means
                                queue_work(ppd->hfi1_wq, &ppd->link_down_work);
                        }
                } else {
+                       dd_dev_info(dd, "%s: QSFP module inserted\n",
+                                   __func__);
                        spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
                        ppd->qsfp_info.cache_valid = 0;
                        ppd->qsfp_info.cache_refresh_required = 1;
                        spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock,
-                                               flags);
+                                              flags);
  
+                       /*
+                        * Stop inversion of ModPresent pin to detect
+                        * removal of the cable
+                        */
                        qsfp_int_mgmt &= ~(u64)QSFP_HFI0_MODPRST_N;
-                       write_csr(dd,
-                                       dd->hfi1_id ?
-                                               ASIC_QSFP2_INVERT :
-                                               ASIC_QSFP1_INVERT,
-                               qsfp_int_mgmt);
+                       write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_INVERT :
+                                 ASIC_QSFP1_INVERT, qsfp_int_mgmt);
+                       ppd->offline_disabled_reason =
+                               HFI1_ODR_MASK(OPA_LINKDOWN_REASON_TRANSIENT);
                }
        }
  
        if (reg & QSFP_HFI0_INT_N) {
-               dd_dev_info(dd, "%s: IntN triggered QSFP interrupt\n",
-                               __func__);
+               dd_dev_info(dd, "%s: Interrupt received from QSFP module\n",
+                           __func__);
                spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
                ppd->qsfp_info.check_interrupt_flags = 1;
-               ppd->qsfp_info.qsfp_interrupt_functional = 1;
                spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, flags);
        }
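The ModPresent handling above flips the polarity of QSFP_HFI0_MODPRST_N in the ASIC_QSFP*_INVERT CSR so a single interrupt line can report both events: the pin is inverted while no module is seated (so the next edge flags a plug-in) and left non-inverted while one is present (so the next edge flags removal). A minimal sketch of that toggle, assuming the CSR accessors used above and a hypothetical helper name:

    /*
     * Illustrative sketch only; qsfp_set_modprst_polarity() is a made-up
     * name, and unlike the code above it read-modify-writes the INVERT CSR
     * instead of rewriting the whole interrupt-management mask.
     */
    static void qsfp_set_modprst_polarity(struct hfi1_devdata *dd,
                                          bool module_present)
    {
            u64 inv = read_csr(dd, dd->hfi1_id ? ASIC_QSFP2_INVERT :
                               ASIC_QSFP1_INVERT);

            if (module_present)
                    inv &= ~(u64)QSFP_HFI0_MODPRST_N;   /* next event: removal */
            else
                    inv |= (u64)QSFP_HFI0_MODPRST_N;    /* next event: plug-in */

            write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_INVERT :
                      ASIC_QSFP1_INVERT, inv);
    }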
  
@@@ -5834,11 -6030,11 +6027,11 @@@ static int request_host_lcb_access(stru
        int ret;
  
        ret = do_8051_command(dd, HCMD_MISC,
-               (u64)HCMD_MISC_REQUEST_LCB_ACCESS << LOAD_DATA_FIELD_ID_SHIFT,
-               NULL);
+                             (u64)HCMD_MISC_REQUEST_LCB_ACCESS <<
+                             LOAD_DATA_FIELD_ID_SHIFT, NULL);
        if (ret != HCMD_SUCCESS) {
                dd_dev_err(dd, "%s: command failed with error %d\n",
-                       __func__, ret);
+                          __func__, ret);
        }
        return ret == HCMD_SUCCESS ? 0 : -EBUSY;
  }
@@@ -5848,11 -6044,11 +6041,11 @@@ static int request_8051_lcb_access(stru
        int ret;
  
        ret = do_8051_command(dd, HCMD_MISC,
-               (u64)HCMD_MISC_GRANT_LCB_ACCESS << LOAD_DATA_FIELD_ID_SHIFT,
-               NULL);
+                             (u64)HCMD_MISC_GRANT_LCB_ACCESS <<
+                             LOAD_DATA_FIELD_ID_SHIFT, NULL);
        if (ret != HCMD_SUCCESS) {
                dd_dev_err(dd, "%s: command failed with error %d\n",
-                       __func__, ret);
+                          __func__, ret);
        }
        return ret == HCMD_SUCCESS ? 0 : -EBUSY;
  }
  static inline void set_host_lcb_access(struct hfi1_devdata *dd)
  {
        write_csr(dd, DC_DC8051_CFG_CSR_ACCESS_SEL,
-                               DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK
-                               | DC_DC8051_CFG_CSR_ACCESS_SEL_LCB_SMASK);
+                 DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK |
+                 DC_DC8051_CFG_CSR_ACCESS_SEL_LCB_SMASK);
  }
  
  /*
  static inline void set_8051_lcb_access(struct hfi1_devdata *dd)
  {
        write_csr(dd, DC_DC8051_CFG_CSR_ACCESS_SEL,
-                               DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK);
+                 DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK);
  }
  
  /*
@@@ -5909,7 -6105,7 +6102,7 @@@ int acquire_lcb_access(struct hfi1_devd
        /* this access is valid only when the link is up */
        if ((ppd->host_link_state & HLS_UP) == 0) {
                dd_dev_info(dd, "%s: link state %s not up\n",
-                       __func__, link_state_name(ppd->host_link_state));
+                           __func__, link_state_name(ppd->host_link_state));
                ret = -EBUSY;
                goto done;
        }
                ret = request_host_lcb_access(dd);
                if (ret) {
                        dd_dev_err(dd,
-                               "%s: unable to acquire LCB access, err %d\n",
-                               __func__, ret);
+                                  "%s: unable to acquire LCB access, err %d\n",
+                                  __func__, ret);
                        goto done;
                }
                set_host_lcb_access(dd);
@@@ -5956,7 -6152,7 +6149,7 @@@ int release_lcb_access(struct hfi1_devd
  
        if (dd->lcb_access_count == 0) {
                dd_dev_err(dd, "%s: LCB access count is zero.  Skipping.\n",
-                       __func__);
+                          __func__);
                goto done;
        }
  
                ret = request_8051_lcb_access(dd);
                if (ret) {
                        dd_dev_err(dd,
-                               "%s: unable to release LCB access, err %d\n",
-                               __func__, ret);
+                                  "%s: unable to release LCB access, err %d\n",
+                                  __func__, ret);
                        /* restore host access if the grant didn't work */
                        set_host_lcb_access(dd);
                        goto done;
@@@ -5998,19 -6194,26 +6191,26 @@@ static void init_lcb_access(struct hfi1
  static void hreq_response(struct hfi1_devdata *dd, u8 return_code, u16 rsp_data)
  {
        write_csr(dd, DC_DC8051_CFG_EXT_DEV_0,
-               DC_DC8051_CFG_EXT_DEV_0_COMPLETED_SMASK
-               | (u64)return_code << DC_DC8051_CFG_EXT_DEV_0_RETURN_CODE_SHIFT
-               | (u64)rsp_data << DC_DC8051_CFG_EXT_DEV_0_RSP_DATA_SHIFT);
+                 DC_DC8051_CFG_EXT_DEV_0_COMPLETED_SMASK |
+                 (u64)return_code <<
+                 DC_DC8051_CFG_EXT_DEV_0_RETURN_CODE_SHIFT |
+                 (u64)rsp_data << DC_DC8051_CFG_EXT_DEV_0_RSP_DATA_SHIFT);
  }
  
  /*
-  * Handle requests from the 8051.
+  * Handle host requests from the 8051.
+  *
+  * This is a work-queue function outside of the interrupt.
   */
static void handle_8051_request(struct hfi1_devdata *dd)
void handle_8051_request(struct work_struct *work)
  {
+       struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+                                                       dc_host_req_work);
+       struct hfi1_devdata *dd = ppd->dd;
        u64 reg;
-       u16 data;
-       u8 type;
+       u16 data = 0;
+       u8 type, i, lanes, *cache = ppd->qsfp_info.cache;
+       u8 cdr_ctrl_byte = cache[QSFP_CDR_CTRL_BYTE_OFFS];
  
        reg = read_csr(dd, DC_DC8051_CFG_EXT_DEV_1);
        if ((reg & DC_DC8051_CFG_EXT_DEV_1_REQ_NEW_SMASK) == 0)
        case HREQ_READ_CONFIG:
        case HREQ_SET_TX_EQ_ABS:
        case HREQ_SET_TX_EQ_REL:
-       case HREQ_ENABLE:
                dd_dev_info(dd, "8051 request: request 0x%x not supported\n",
-                       type);
+                           type);
                hreq_response(dd, HREQ_NOT_SUPPORTED, 0);
                break;
  
+       case HREQ_ENABLE:
+               lanes = data & 0xF;
+               for (i = 0; lanes; lanes >>= 1, i++) {
+                       if (!(lanes & 1))
+                               continue;
+                       if (data & 0x200) {
+                               /* enable TX CDR */
+                               if (cache[QSFP_MOD_PWR_OFFS] & 0x8 &&
+                                   cache[QSFP_CDR_INFO_OFFS] & 0x80)
+                                       cdr_ctrl_byte |= (1 << (i + 4));
+                       } else {
+                               /* disable TX CDR */
+                               if (cache[QSFP_MOD_PWR_OFFS] & 0x8 &&
+                                   cache[QSFP_CDR_INFO_OFFS] & 0x80)
+                                       cdr_ctrl_byte &= ~(1 << (i + 4));
+                       }
+                       if (data & 0x800) {
+                               /* enable RX CDR */
+                               if (cache[QSFP_MOD_PWR_OFFS] & 0x4 &&
+                                   cache[QSFP_CDR_INFO_OFFS] & 0x40)
+                                       cdr_ctrl_byte |= (1 << i);
+                       } else {
+                               /* disable RX CDR */
+                               if (cache[QSFP_MOD_PWR_OFFS] & 0x4 &&
+                                   cache[QSFP_CDR_INFO_OFFS] & 0x40)
+                                       cdr_ctrl_byte &= ~(1 << i);
+                       }
+               }
+               one_qsfp_write(ppd, dd->hfi1_id, QSFP_CDR_CTRL_BYTE_OFFS,
+                              &cdr_ctrl_byte, 1);
+               hreq_response(dd, HREQ_SUCCESS, data);
+               refresh_qsfp_cache(ppd, &ppd->qsfp_info);
+               break;
        case HREQ_CONFIG_DONE:
                hreq_response(dd, HREQ_SUCCESS, 0);
                break;
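In the new HREQ_ENABLE case above, the 8051's request word carries the lane mask in data[3:0], the requested TX CDR state in bit 9 (0x200) and the RX CDR state in bit 11 (0x800); TX CDR enables sit in the upper nibble of the QSFP CDR-control byte (byte 98 per SFF-8636) and RX enables in the lower nibble, and a lane is only changed when the cached module data says the corresponding CDR is present and controllable. A stand-alone illustration of the same bit manipulation (plain C, illustrative values, no driver dependencies):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint16_t data = 0x020F;   /* all four lanes, TX CDR on, RX CDR off */
            uint8_t cdr_ctrl = 0x00;  /* CDR control: TX in bits 7:4, RX in 3:0 */
            uint8_t lanes = data & 0xF;
            int i;

            for (i = 0; lanes; lanes >>= 1, i++) {
                    if (!(lanes & 1))
                            continue;
                    if (data & 0x200)
                            cdr_ctrl |= 1 << (i + 4);   /* enable TX CDR, lane i */
                    else
                            cdr_ctrl &= ~(1 << (i + 4));
                    if (data & 0x800)
                            cdr_ctrl |= 1 << i;         /* enable RX CDR, lane i */
                    else
                            cdr_ctrl &= ~(1 << i);
            }
            printf("CDR control byte: 0x%02x\n", cdr_ctrl); /* prints 0xf0 */
            return 0;
    }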
@@@ -6056,11 -6293,11 +6290,11 @@@ static void write_global_credit(struct 
                                u8 vau, u16 total, u16 shared)
  {
        write_csr(dd, SEND_CM_GLOBAL_CREDIT,
-               ((u64)total
-                       << SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT)
-               | ((u64)shared
-                       << SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT)
-               | ((u64)vau << SEND_CM_GLOBAL_CREDIT_AU_SHIFT));
+                 ((u64)total <<
+                  SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT) |
+                 ((u64)shared <<
+                  SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT) |
+                 ((u64)vau << SEND_CM_GLOBAL_CREDIT_AU_SHIFT));
  }
  
  /*
@@@ -6097,7 -6334,7 +6331,7 @@@ void reset_link_credits(struct hfi1_dev
  
        /* remove all previous VL credit limits */
        for (i = 0; i < TXE_NUM_DATA_VL; i++)
-               write_csr(dd, SEND_CM_CREDIT_VL + (8*i), 0);
+               write_csr(dd, SEND_CM_CREDIT_VL + (8 * i), 0);
        write_csr(dd, SEND_CM_CREDIT_VL15, 0);
        write_global_credit(dd, 0, 0, 0);
        /* reset the CM block */
@@@ -6139,15 -6376,14 +6373,14 @@@ static void lcb_shutdown(struct hfi1_de
        write_csr(dd, DC_LCB_CFG_RUN, 0);
        /* set tx fifo reset: LCB_CFG_TX_FIFOS_RESET.VAL = 1 */
        write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET,
-               1ull << DC_LCB_CFG_TX_FIFOS_RESET_VAL_SHIFT);
+                 1ull << DC_LCB_CFG_TX_FIFOS_RESET_VAL_SHIFT);
        /* set dcc reset csr: DCC_CFG_RESET.{reset_lcb,reset_rx_fpe} = 1 */
        dd->lcb_err_en = read_csr(dd, DC_LCB_ERR_EN);
        reg = read_csr(dd, DCC_CFG_RESET);
-       write_csr(dd, DCC_CFG_RESET,
-               reg
-               | (1ull << DCC_CFG_RESET_RESET_LCB_SHIFT)
-               | (1ull << DCC_CFG_RESET_RESET_RX_FPE_SHIFT));
-       (void) read_csr(dd, DCC_CFG_RESET); /* make sure the write completed */
+       write_csr(dd, DCC_CFG_RESET, reg |
+                 (1ull << DCC_CFG_RESET_RESET_LCB_SHIFT) |
+                 (1ull << DCC_CFG_RESET_RESET_RX_FPE_SHIFT));
+       (void)read_csr(dd, DCC_CFG_RESET); /* make sure the write completed */
        if (!abort) {
                udelay(1);    /* must hold for the longer of 16cclks or 20ns */
                write_csr(dd, DCC_CFG_RESET, reg);
@@@ -6176,14 -6412,18 +6409,18 @@@ static void dc_shutdown(struct hfi1_dev
        spin_unlock_irqrestore(&dd->dc8051_lock, flags);
        /* Shutdown the LCB */
        lcb_shutdown(dd, 1);
-       /* Going to OFFLINE would have causes the 8051 to put the
+       /*
+        * Going to OFFLINE would have caused the 8051 to put the
         * SerDes into reset already. Just need to shut down the 8051,
-        * itself. */
+        * itself.
+        */
        write_csr(dd, DC_DC8051_CFG_RST, 0x1);
  }
  
- /* Calling this after the DC has been brought out of reset should not
-  * do any damage. */
+ /*
+  * Calling this after the DC has been brought out of reset should not
+  * do any damage.
+  */
  static void dc_start(struct hfi1_devdata *dd)
  {
        unsigned long flags;
        ret = wait_fm_ready(dd, TIMEOUT_8051_START);
        if (ret) {
                dd_dev_err(dd, "%s: timeout starting 8051 firmware\n",
-                       __func__);
+                          __func__);
        }
        /* Take away reset for LCB and RX FPE (set in lcb_shutdown). */
        write_csr(dd, DCC_CFG_RESET, 0x10);
@@@ -6292,7 -6532,7 +6529,7 @@@ static void adjust_lcb_for_fpga_serdes(
        write_csr(dd, DC_LCB_CFG_RX_FIFOS_RADR, rx_radr);
        /* LCB_CFG_IGNORE_LOST_RCLK.EN = 1 */
        write_csr(dd, DC_LCB_CFG_IGNORE_LOST_RCLK,
-               DC_LCB_CFG_IGNORE_LOST_RCLK_EN_SMASK);
+                 DC_LCB_CFG_IGNORE_LOST_RCLK_EN_SMASK);
        write_csr(dd, DC_LCB_CFG_TX_FIFOS_RADR, tx_radr);
  }
  
@@@ -6309,8 -6549,10 +6546,10 @@@ void handle_sma_message(struct work_str
        u64 msg;
        int ret;
  
-       /* msg is bytes 1-4 of the 40-bit idle message - the command code
-          is stripped off */
+       /*
+        * msg is bytes 1-4 of the 40-bit idle message - the command code
+        * is stripped off
+        */
        ret = read_idle_sma(dd, &msg);
        if (ret)
                return;
                 *
                 * Can activate the node.  Discard otherwise.
                 */
-               if (ppd->host_link_state == HLS_UP_ARMED
-                                       && ppd->is_active_optimize_enabled) {
+               if (ppd->host_link_state == HLS_UP_ARMED &&
+                   ppd->is_active_optimize_enabled) {
                        ppd->neighbor_normal = 1;
                        ret = set_link_state(ppd, HLS_UP_ACTIVE);
                        if (ret)
                break;
        default:
                dd_dev_err(dd,
-                       "%s: received unexpected SMA idle message 0x%llx\n",
-                       __func__, msg);
+                          "%s: received unexpected SMA idle message 0x%llx\n",
+                          __func__, msg);
                break;
        }
  }
@@@ -6442,10 -6684,9 +6681,9 @@@ static void wait_for_freeze_status(stru
  
                if (time_after(jiffies, timeout)) {
                        dd_dev_err(dd,
-                               "Time out waiting for SPC %sfreeze, bits 0x%llx, expecting 0x%llx, continuing",
-                               freeze ? "" : "un",
-                               reg & ALL_FROZE,
-                               freeze ? ALL_FROZE : 0ull);
+                                  "Time out waiting for SPC %sfreeze, bits 0x%llx, expecting 0x%llx, continuing",
+                                  freeze ? "" : "un", reg & ALL_FROZE,
+                                  freeze ? ALL_FROZE : 0ull);
                        return;
                }
                usleep_range(80, 120);
@@@ -6475,11 -6716,17 +6713,17 @@@ static void rxe_freeze(struct hfi1_devd
   */
  static void rxe_kernel_unfreeze(struct hfi1_devdata *dd)
  {
+       u32 rcvmask;
        int i;
  
        /* enable all kernel contexts */
-       for (i = 0; i < dd->n_krcv_queues; i++)
-               hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_ENB, i);
+       for (i = 0; i < dd->n_krcv_queues; i++) {
+               rcvmask = HFI1_RCVCTRL_CTXT_ENB;
+               /* HFI1_RCVCTRL_TAILUPD_[ENB|DIS] needs to be set explicitly */
+               rcvmask |= HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, DMA_RTAIL) ?
+                       HFI1_RCVCTRL_TAILUPD_ENB : HFI1_RCVCTRL_TAILUPD_DIS;
+               hfi1_rcvctrl(dd, rcvmask, i);
+       }
  
        /* enable port */
        add_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
@@@ -6564,7 -6811,7 +6808,7 @@@ void handle_freeze(struct work_struct *
  void handle_link_up(struct work_struct *work)
  {
        struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
-                                                               link_up_work);
+                                                 link_up_work);
        set_link_state(ppd, HLS_UP_INIT);
  
        /* cache the read of DC_LCB_STS_ROUND_TRIP_LTP_CNT */
        if ((ppd->link_speed_active & ppd->link_speed_enabled) == 0) {
                /* oops - current speed is not enabled, bounce */
                dd_dev_err(ppd->dd,
-                       "Link speed active 0x%x is outside enabled 0x%x, downing link\n",
-                       ppd->link_speed_active, ppd->link_speed_enabled);
+                          "Link speed active 0x%x is outside enabled 0x%x, downing link\n",
+                          ppd->link_speed_active, ppd->link_speed_enabled);
                set_link_down_reason(ppd, OPA_LINKDOWN_REASON_SPEED_POLICY, 0,
-                       OPA_LINKDOWN_REASON_SPEED_POLICY);
+                                    OPA_LINKDOWN_REASON_SPEED_POLICY);
                set_link_state(ppd, HLS_DN_OFFLINE);
+               tune_serdes(ppd);
                start_link(ppd);
        }
  }
  
- /* Several pieces of LNI information were cached for SMA in ppd.
-  * Reset these on link down */
+ /*
+  * Several pieces of LNI information were cached for SMA in ppd.
+  * Reset these on link down
+  */
  static void reset_neighbor_info(struct hfi1_pportdata *ppd)
  {
        ppd->neighbor_guid = 0;
@@@ -6613,7 -6863,13 +6860,13 @@@ void handle_link_down(struct work_struc
        struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
                                                                link_down_work);
  
-       /* go offline first, then deal with reasons */
+       if ((ppd->host_link_state &
+            (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) &&
+            ppd->port_type == PORT_TYPE_FIXED)
+               ppd->offline_disabled_reason =
+                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NOT_INSTALLED);
+       /* Go offline first, then deal with reading/writing through 8051 */
        set_link_state(ppd, HLS_DN_OFFLINE);
  
        lcl_reason = 0;
        /* disable the port */
        clear_rcvctrl(ppd->dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
  
-       /* If there is no cable attached, turn the DC off. Otherwise,
-        * start the link bring up. */
-       if (!qsfp_mod_present(ppd))
+       /*
+        * If there is no cable attached, turn the DC off. Otherwise,
+        * start the link bring up.
+        */
+       if (!qsfp_mod_present(ppd)) {
                dc_shutdown(ppd->dd);
-       else
+       } else {
+               tune_serdes(ppd);
                start_link(ppd);
+       }
  }
  
  void handle_link_bounce(struct work_struct *work)
         */
        if (ppd->host_link_state & HLS_UP) {
                set_link_state(ppd, HLS_DN_OFFLINE);
+               tune_serdes(ppd);
                start_link(ppd);
        } else {
                dd_dev_info(ppd->dd, "%s: link not up (%s), nothing to do\n",
-                       __func__, link_state_name(ppd->host_link_state));
+                           __func__, link_state_name(ppd->host_link_state));
        }
  }
  
@@@ -6751,7 -7012,7 +7009,7 @@@ static u16 link_width_to_bits(struct hf
        case 3: return OPA_LINK_WIDTH_3X;
        default:
                dd_dev_info(dd, "%s: invalid width %d, using 4\n",
-                       __func__, width);
+                           __func__, width);
                /* fall through */
        case 4: return OPA_LINK_WIDTH_4X;
        }
  static const u8 bit_counts[16] = {
        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4
  };
  static inline u8 nibble_to_count(u8 nibble)
  {
        return bit_counts[nibble & 0xf];
@@@ -6788,7 -7050,7 +7047,7 @@@ static void get_link_widths(struct hfi1
  
        /* read the active lanes */
        read_tx_settings(dd, &enable_lane_tx, &tx_polarity_inversion,
-                               &rx_polarity_inversion, &max_rate);
+                        &rx_polarity_inversion, &max_rate);
        read_local_lni(dd, &enable_lane_rx);
  
        /* convert to counts */
         * handle_verify_cap().  The ASIC 8051 firmware does not correctly
         * set the max_rate field in handle_verify_cap until v0.19.
         */
-       if ((dd->icode == ICODE_RTL_SILICON)
-                               && (dd->dc8051_ver < dc8051_ver(0, 19))) {
+       if ((dd->icode == ICODE_RTL_SILICON) &&
+           (dd->dc8051_ver < dc8051_ver(0, 19))) {
                /* max_rate: 0 = 12.5G, 1 = 25G */
                switch (max_rate) {
                case 0:
                        break;
                default:
                        dd_dev_err(dd,
-                               "%s: unexpected max rate %d, using 25Gb\n",
-                               __func__, (int)max_rate);
+                                  "%s: unexpected max rate %d, using 25Gb\n",
+                                  __func__, (int)max_rate);
                        /* fall through */
                case 1:
                        dd->pport[0].link_speed_active = OPA_LINK_SPEED_25G;
        }
  
        dd_dev_info(dd,
-               "Fabric active lanes (width): tx 0x%x (%d), rx 0x%x (%d)\n",
-               enable_lane_tx, tx, enable_lane_rx, rx);
+                   "Fabric active lanes (width): tx 0x%x (%d), rx 0x%x (%d)\n",
+                   enable_lane_tx, tx, enable_lane_rx, rx);
        *tx_width = link_width_to_bits(dd, tx);
        *rx_width = link_width_to_bits(dd, rx);
  }
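get_link_widths() above reduces each 4-bit lane-enable mask to a lane count with nibble_to_count() and then to an OPA width flag with link_width_to_bits(), which reports anything other than 1-3 lanes as 4X. A stand-alone illustration of that conversion (plain C, not driver code):

    #include <stdio.h>

    static const unsigned char bit_counts[16] = {
            0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4
    };

    int main(void)
    {
            unsigned int masks[] = { 0xF, 0x3, 0x1 };   /* example enable masks */
            unsigned int i;

            for (i = 0; i < sizeof(masks) / sizeof(masks[0]); i++) {
                    unsigned int lanes = bit_counts[masks[i] & 0xF];
                    /* 1 -> 1X, 2 -> 2X, 3 -> 3X, everything else -> 4X */
                    unsigned int width = (lanes >= 1 && lanes <= 3) ? lanes : 4;

                    printf("mask 0x%x -> %u lane(s) -> OPA_LINK_WIDTH_%uX\n",
                           masks[i], lanes, width);
            }
            return 0;
    }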
@@@ -6923,13 -7185,8 +7182,8 @@@ void handle_verify_cap(struct work_stru
         */
  
        read_vc_remote_phy(dd, &power_management, &continious);
-       read_vc_remote_fabric(
-               dd,
-               &vau,
-               &z,
-               &vcu,
-               &vl15buf,
-               &partner_supported_crc);
+       read_vc_remote_fabric(dd, &vau, &z, &vcu, &vl15buf,
+                             &partner_supported_crc);
        read_vc_remote_link_width(dd, &remote_tx_rate, &link_widths);
        read_remote_device_id(dd, &device_id, &device_rev);
        /*
        /* print the active widths */
        get_link_widths(dd, &active_tx, &active_rx);
        dd_dev_info(dd,
-               "Peer PHY: power management 0x%x, continuous updates 0x%x\n",
-               (int)power_management, (int)continious);
+                   "Peer PHY: power management 0x%x, continuous updates 0x%x\n",
+                   (int)power_management, (int)continious);
        dd_dev_info(dd,
-               "Peer Fabric: vAU %d, Z %d, vCU %d, vl15 credits 0x%x, CRC sizes 0x%x\n",
-               (int)vau,
-               (int)z,
-               (int)vcu,
-               (int)vl15buf,
-               (int)partner_supported_crc);
+                   "Peer Fabric: vAU %d, Z %d, vCU %d, vl15 credits 0x%x, CRC sizes 0x%x\n",
+                   (int)vau, (int)z, (int)vcu, (int)vl15buf,
+                   (int)partner_supported_crc);
        dd_dev_info(dd, "Peer Link Width: tx rate 0x%x, widths 0x%x\n",
-               (u32)remote_tx_rate, (u32)link_widths);
+                   (u32)remote_tx_rate, (u32)link_widths);
        dd_dev_info(dd, "Peer Device ID: 0x%04x, Revision 0x%02x\n",
-               (u32)device_id, (u32)device_rev);
+                   (u32)device_id, (u32)device_rev);
        /*
         * The peer vAU value just read is the peer receiver value.  HFI does
         * not support a transmit vAU of 0 (AU == 8).  We advertised that
        reg = read_csr(dd, SEND_CM_CTRL);
        if (crc_val == LCB_CRC_14B && crc_14b_sideband) {
                write_csr(dd, SEND_CM_CTRL,
-                       reg | SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK);
+                         reg | SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK);
        } else {
                write_csr(dd, SEND_CM_CTRL,
-                       reg & ~SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK);
+                         reg & ~SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK);
        }
  
        ppd->link_speed_active = 0;     /* invalid value */
        }
        if (ppd->link_speed_active == 0) {
                dd_dev_err(dd, "%s: unexpected remote tx rate %d, using 25Gb\n",
-                       __func__, (int)remote_tx_rate);
+                          __func__, (int)remote_tx_rate);
                ppd->link_speed_active = OPA_LINK_SPEED_25G;
        }
  
                read_csr(dd, DC_DC8051_STS_REMOTE_FM_SECURITY) &
                DC_DC8051_STS_LOCAL_FM_SECURITY_DISABLED_MASK;
        dd_dev_info(dd,
-               "Neighbor Guid: %llx Neighbor type %d MgmtAllowed %d FM security bypass %d\n",
-               ppd->neighbor_guid, ppd->neighbor_type,
-               ppd->mgmt_allowed, ppd->neighbor_fm_security);
+                   "Neighbor Guid: %llx Neighbor type %d MgmtAllowed %d FM security bypass %d\n",
+                   ppd->neighbor_guid, ppd->neighbor_type,
+                   ppd->mgmt_allowed, ppd->neighbor_fm_security);
        if (ppd->mgmt_allowed)
                add_full_mgmt_pkey(ppd);
  
@@@ -7127,28 -7381,27 +7378,27 @@@ retry
  
                /* bounce if not at starting active width */
                if ((ppd->link_width_active !=
-                                       ppd->link_width_downgrade_tx_active)
-                               || (ppd->link_width_active !=
-                                       ppd->link_width_downgrade_rx_active)) {
+                    ppd->link_width_downgrade_tx_active) ||
+                   (ppd->link_width_active !=
+                    ppd->link_width_downgrade_rx_active)) {
                        dd_dev_err(ppd->dd,
-                               "Link downgrade is disabled and link has downgraded, downing link\n");
+                                  "Link downgrade is disabled and link has downgraded, downing link\n");
                        dd_dev_err(ppd->dd,
-                               "  original 0x%x, tx active 0x%x, rx active 0x%x\n",
-                               ppd->link_width_active,
-                               ppd->link_width_downgrade_tx_active,
-                               ppd->link_width_downgrade_rx_active);
+                                  "  original 0x%x, tx active 0x%x, rx active 0x%x\n",
+                                  ppd->link_width_active,
+                                  ppd->link_width_downgrade_tx_active,
+                                  ppd->link_width_downgrade_rx_active);
                        do_bounce = 1;
                }
-       } else if ((lwde & ppd->link_width_downgrade_tx_active) == 0
-               || (lwde & ppd->link_width_downgrade_rx_active) == 0) {
+       } else if ((lwde & ppd->link_width_downgrade_tx_active) == 0 ||
+                  (lwde & ppd->link_width_downgrade_rx_active) == 0) {
                /* Tx or Rx is outside the enabled policy */
                dd_dev_err(ppd->dd,
-                       "Link is outside of downgrade allowed, downing link\n");
+                          "Link is outside of downgrade allowed, downing link\n");
                dd_dev_err(ppd->dd,
-                       "  enabled 0x%x, tx active 0x%x, rx active 0x%x\n",
-                       lwde,
-                       ppd->link_width_downgrade_tx_active,
-                       ppd->link_width_downgrade_rx_active);
+                          "  enabled 0x%x, tx active 0x%x, rx active 0x%x\n",
+                          lwde, ppd->link_width_downgrade_tx_active,
+                          ppd->link_width_downgrade_rx_active);
                do_bounce = 1;
        }
  
@@@ -7157,8 -7410,9 +7407,9 @@@ done
  
        if (do_bounce) {
                set_link_down_reason(ppd, OPA_LINKDOWN_REASON_WIDTH_POLICY, 0,
-                 OPA_LINKDOWN_REASON_WIDTH_POLICY);
+                                    OPA_LINKDOWN_REASON_WIDTH_POLICY);
                set_link_state(ppd, HLS_DN_OFFLINE);
+               tune_serdes(ppd);
                start_link(ppd);
        }
  }
@@@ -7239,9 -7493,10 +7490,10 @@@ static void handle_8051_interrupt(struc
                            & (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) {
                                queue_link_down = 1;
                                dd_dev_info(dd, "Link error: %s\n",
-                                       dc8051_info_err_string(buf,
-                                               sizeof(buf),
-                                               err & FAILED_LNI));
+                                           dc8051_info_err_string(buf,
+                                                                  sizeof(buf),
+                                                                  err &
+                                                                  FAILED_LNI));
                        }
                        err &= ~(u64)FAILED_LNI;
                }
                if (err) {
                        /* report remaining errors, but do not do anything */
                        dd_dev_err(dd, "8051 info error: %s\n",
-                               dc8051_info_err_string(buf, sizeof(buf), err));
+                                  dc8051_info_err_string(buf, sizeof(buf),
+                                                         err));
                }
  
                /*
                        host_msg &= ~(u64)LINKUP_ACHIEVED;
                }
                if (host_msg & EXT_DEVICE_CFG_REQ) {
-                       handle_8051_request(dd);
+                       queue_work(ppd->hfi1_wq, &ppd->dc_host_req_work);
                        host_msg &= ~(u64)EXT_DEVICE_CFG_REQ;
                }
                if (host_msg & VERIFY_CAP_FRAME) {
                if (host_msg) {
                        /* report remaining messages, but do not do anything */
                        dd_dev_info(dd, "8051 info host message: %s\n",
-                               dc8051_info_host_msg_string(buf, sizeof(buf),
-                                       host_msg));
+                                   dc8051_info_host_msg_string(buf,
+                                                               sizeof(buf),
+                                                               host_msg));
                }
  
                reg &= ~DC_DC8051_ERR_FLG_SET_BY_8051_SMASK;
                 */
                dd_dev_err(dd, "Lost 8051 heartbeat\n");
                write_csr(dd, DC_DC8051_ERR_EN,
-                       read_csr(dd, DC_DC8051_ERR_EN)
-                         ~DC_DC8051_ERR_EN_LOST_8051_HEART_BEAT_SMASK);
+                         read_csr(dd, DC_DC8051_ERR_EN) &
+                         ~DC_DC8051_ERR_EN_LOST_8051_HEART_BEAT_SMASK);
  
                reg &= ~DC_DC8051_ERR_FLG_LOST_8051_HEART_BEAT_SMASK;
        }
        if (reg) {
                /* report the error, but do not do anything */
                dd_dev_err(dd, "8051 error: %s\n",
-                       dc8051_err_string(buf, sizeof(buf), reg));
+                          dc8051_err_string(buf, sizeof(buf), reg));
        }
  
        if (queue_link_down) {
-               /* if the link is already going down or disabled, do not
-                * queue another */
-               if ((ppd->host_link_state
-                                   & (HLS_GOING_OFFLINE|HLS_LINK_COOLDOWN))
-                               || ppd->link_enabled == 0) {
+               /*
+                * if the link is already going down or disabled, do not
+                * queue another
+                */
+               if ((ppd->host_link_state &
+                   (HLS_GOING_OFFLINE | HLS_LINK_COOLDOWN)) ||
+                   ppd->link_enabled == 0) {
                        dd_dev_info(dd, "%s: not queuing link down\n",
-                               __func__);
+                                   __func__);
                } else {
                        queue_work(ppd->hfi1_wq, &ppd->link_down_work);
                }
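Earlier in this hunk, EXT_DEVICE_CFG_REQ no longer calls handle_8051_request() from the interrupt path; it queues ppd->dc_host_req_work on ppd->hfi1_wq instead, and the reworked handler earlier in the diff recovers its port with container_of(). The deferral is presumably needed because the handler now touches the QSFP module (one_qsfp_write(), refresh_qsfp_cache()), which is not safe in interrupt context. A minimal sketch of the pattern; the INIT_WORK() placement and the example_* function names are assumptions, the field and handler names follow the diff:

    /* Illustrative only - not part of this commit. */
    static void example_port_init(struct hfi1_pportdata *ppd)
    {
            /* bind the work item to its handler once, at port init time */
            INIT_WORK(&ppd->dc_host_req_work, handle_8051_request);
    }

    static void example_8051_interrupt(struct hfi1_pportdata *ppd)
    {
            /* interrupt context: defer the sleepable QSFP work */
            queue_work(ppd->hfi1_wq, &ppd->dc_host_req_work);
    }

    void handle_8051_request(struct work_struct *work)
    {
            struct hfi1_pportdata *ppd =
                    container_of(work, struct hfi1_pportdata,
                                 dc_host_req_work);
            /* ... runs in process context and may sleep ... */
    }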
@@@ -7480,8 -7739,10 +7736,10 @@@ static void handle_dcc_err(struct hfi1_
                        /* set status bit */
                        dd->err_info_rcvport.status_and_code |=
                                OPA_EI_STATUS_SMASK;
-                       /* save first 2 flits in the packet that caused
-                        * the error */
+                       /*
+                        * save first 2 flits in the packet that caused
+                        * the error
+                        */
                         dd->err_info_rcvport.packet_flit1 = hdr0;
                         dd->err_info_rcvport.packet_flit2 = hdr1;
                }
                /* just report this */
                dd_dev_info(dd, "DCC Error: PortRcv error: %s\n", extra);
                dd_dev_info(dd, "           hdr0 0x%llx, hdr1 0x%llx\n",
-                       hdr0, hdr1);
+                           hdr0, hdr1);
  
                reg &= ~DCC_ERR_FLG_RCVPORT_ERR_SMASK;
        }
        /* report any remaining errors */
        if (reg)
                dd_dev_info(dd, "DCC Error: %s\n",
-                       dcc_err_string(buf, sizeof(buf), reg));
+                           dcc_err_string(buf, sizeof(buf), reg));
  
        if (lcl_reason == 0)
                lcl_reason = OPA_LINKDOWN_REASON_UNKNOWN;
@@@ -7550,7 -7811,7 +7808,7 @@@ static void handle_lcb_err(struct hfi1_
        char buf[96];
  
        dd_dev_info(dd, "LCB Error: %s\n",
-               lcb_err_string(buf, sizeof(buf), reg));
+                   lcb_err_string(buf, sizeof(buf), reg));
  }
  
  /*
@@@ -7640,7 -7901,7 +7898,7 @@@ static void is_rcv_avail_int(struct hfi
                err_detail = "out of range";
        }
        dd_dev_err(dd, "unexpected %s receive available context interrupt %u\n",
-               err_detail, source);
+                  err_detail, source);
  }
  
  /*
@@@ -7666,7 -7927,7 +7924,7 @@@ static void is_rcv_urgent_int(struct hf
                err_detail = "out of range";
        }
        dd_dev_err(dd, "unexpected %s receive urgent context interrupt %u\n",
-               err_detail, source);
+                  err_detail, source);
  }
  
  /*
@@@ -7677,12 -7938,14 +7935,14 @@@ static void is_reserved_int(struct hfi1
        char name[64];
  
        dd_dev_err(dd, "unexpected %s interrupt\n",
-                               is_reserved_name(name, sizeof(name), source));
+                  is_reserved_name(name, sizeof(name), source));
  }
  
  static const struct is_table is_table[] = {
- /* start                   end
-                               name func               interrupt func */
+ /*
+  * start               end
+  *                            name func               interrupt func
+  */
  { IS_GENERAL_ERR_START,  IS_GENERAL_ERR_END,
                                is_misc_err_name,       is_misc_err_int },
  { IS_SDMAENG_ERR_START,  IS_SDMAENG_ERR_END,
@@@ -7753,7 -8016,7 +8013,7 @@@ static irqreturn_t general_interrupt(in
  
        /* phase 2: call the appropriate handler */
        for_each_set_bit(bit, (unsigned long *)&regs[0],
-                                               CCE_NUM_INT_CSRS*64) {
+                        CCE_NUM_INT_CSRS * 64) {
                is_interrupt(dd, bit);
        }
  
@@@ -7776,27 -8039,27 +8036,27 @@@ static irqreturn_t sdma_interrupt(int i
  
        /* This read_csr is really bad in the hot path */
        status = read_csr(dd,
-                       CCE_INT_STATUS + (8*(IS_SDMA_START/64)))
-                       & sde->imask;
+                         CCE_INT_STATUS + (8 * (IS_SDMA_START / 64)))
+                         & sde->imask;
        if (likely(status)) {
                /* clear the interrupt(s) */
                write_csr(dd,
-                       CCE_INT_CLEAR + (8*(IS_SDMA_START/64)),
-                       status);
+                         CCE_INT_CLEAR + (8 * (IS_SDMA_START / 64)),
+                         status);
  
                /* handle the interrupt(s) */
                sdma_engine_interrupt(sde, status);
        } else
                dd_dev_err(dd, "SDMA engine %u interrupt, but no status bits set\n",
-                       sde->this_idx);
+                          sde->this_idx);
  
        return IRQ_HANDLED;
  }
  
  /*
-  * Clear the receive interrupt, forcing the write and making sure
-  * we have data from the chip, pushing everything in front of it
-  * back to the host.
+  * Clear the receive interrupt.  Use a read of the interrupt clear CSR
+  * to insure that the write completed.  This does NOT guarantee that
+  * queued DMA writes to memory from the chip are pushed.
   */
  static inline void clear_recv_intr(struct hfi1_ctxtdata *rcd)
  {
  }
  
  /* force the receive interrupt */
static inline void force_recv_intr(struct hfi1_ctxtdata *rcd)
+ void force_recv_intr(struct hfi1_ctxtdata *rcd)
  {
        write_csr(rcd->dd, CCE_INT_FORCE + (8 * rcd->ireg), rcd->imask);
  }
  
- /* return non-zero if a packet is present */
+ /*
+  * Return non-zero if a packet is present.
+  *
+  * This routine is called when rechecking for packets after the RcvAvail
+  * interrupt has been cleared down.  First, do a quick check of memory for
+  * a packet present.  If not found, use an expensive CSR read of the context
+  * tail to determine the actual tail.  The CSR read is necessary because there
+  * is no method to push pending DMAs to memory other than an interrupt and we
+  * are trying to determine if we need to force an interrupt.
+  */
  static inline int check_packet_present(struct hfi1_ctxtdata *rcd)
  {
+       u32 tail;
+       int present;
        if (!HFI1_CAP_IS_KSET(DMA_RTAIL))
-               return (rcd->seq_cnt ==
+               present = (rcd->seq_cnt ==
                                rhf_rcv_seq(rhf_to_cpu(get_rhf_addr(rcd))));
+       else /* is RDMA rtail */
+               present = (rcd->head != get_rcvhdrtail(rcd));
+       if (present)
+               return 1;
  
-       /* else is RDMA rtail */
-       return (rcd->head != get_rcvhdrtail(rcd));
+       /* fall back to a CSR read, correct independent of DMA_RTAIL */
+       tail = (u32)read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_TAIL);
+       return rcd->head != tail;
  }
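The rewritten comments above spell out the ordering concern: clearing the RcvAvail interrupt does not flush DMA writes already queued by the chip, so after the clear the handler rechecks for a packet (cheap in-memory check first, then an RCV_HDR_TAIL CSR read) and forces the interrupt again if one is found, rather than leaving it stranded. A sketch of how the three helpers in this hunk compose (illustrative only; the real handler's packet budget and thread hand-off are omitted):

    static void example_rcv_done(struct hfi1_ctxtdata *rcd)
    {
            /* processing finished: clear the interrupt first ... */
            clear_recv_intr(rcd);

            /* ... then recheck; a packet may have landed after the clear */
            if (check_packet_present(rcd))
                    force_recv_intr(rcd);   /* re-arm so it is not lost */
    }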
  
  /*
   * Receive packet IRQ handler.  This routine expects to be on its own IRQ.
   * This routine will try to handle packets immediately (latency), but if
  * it finds too many, it will invoke the thread handler (bandwidth).  The
-  * chip receive interupt is *not* cleared down until this or the thread (if
+  * chip receive interrupt is *not* cleared down until this or the thread (if
   * invoked) is finished.  The intent is to avoid extra interrupts while we
   * are processing packets anyway.
   */
@@@ -7843,6 -8124,7 +8121,7 @@@ static irqreturn_t receive_context_inte
  
        trace_hfi1_receive_interrupt(dd, rcd->ctxt);
        this_cpu_inc(*dd->int_counter);
+       aspm_ctx_disable(rcd);
  
        /* receive interrupt remains blocked while processing packets */
        disposition = rcd->do_interrupt(rcd, 0);
@@@ -7909,7 -8191,7 +8188,7 @@@ u32 read_physical_state(struct hfi1_dev
                                & DC_DC8051_STS_CUR_STATE_PORT_MASK;
  }
  
static u32 read_logical_state(struct hfi1_devdata *dd)
+ u32 read_logical_state(struct hfi1_devdata *dd)
  {
        u64 reg;
  
@@@ -8157,8 -8439,8 +8436,8 @@@ static int set_physical_link_state(stru
        return do_8051_command(dd, HCMD_CHANGE_PHY_STATE, state, NULL);
  }
  
static int load_8051_config(struct hfi1_devdata *dd, u8 field_id,
-                           u8 lane_id, u32 config_data)
+ int load_8051_config(struct hfi1_devdata *dd, u8 field_id,
+                    u8 lane_id, u32 config_data)
  {
        u64 data;
        int ret;
        ret = do_8051_command(dd, HCMD_LOAD_CONFIG_DATA, data, NULL);
        if (ret != HCMD_SUCCESS) {
                dd_dev_err(dd,
-                       "load 8051 config: field id %d, lane %d, err %d\n",
-                       (int)field_id, (int)lane_id, ret);
+                          "load 8051 config: field id %d, lane %d, err %d\n",
+                          (int)field_id, (int)lane_id, ret);
        }
        return ret;
  }
   * set the result, even on error.
   * Return 0 on success, -errno on failure
   */
static int read_8051_config(struct hfi1_devdata *dd, u8 field_id, u8 lane_id,
-                           u32 *result)
+ int read_8051_config(struct hfi1_devdata *dd, u8 field_id, u8 lane_id,
+                    u32 *result)
  {
        u64 big_data;
        u32 addr;
        } else {
                *result = 0;
                dd_dev_err(dd, "%s: direct read failed, lane %d, field %d!\n",
-                       __func__, lane_id, field_id);
+                          __func__, lane_id, field_id);
        }
  
        return ret;
@@@ -8244,7 -8526,7 +8523,7 @@@ static void read_vc_local_link_width(st
        u32 frame;
  
        read_8051_config(dd, VERIFY_CAP_LOCAL_LINK_WIDTH, GENERAL_CONFIG,
-                               &frame);
+                        &frame);
        *misc_bits = (frame >> MISC_CONFIG_BITS_SHIFT) & MISC_CONFIG_BITS_MASK;
        *flag_bits = (frame >> LOCAL_FLAG_BITS_SHIFT) & LOCAL_FLAG_BITS_MASK;
        *link_widths = (frame >> LINK_WIDTH_SHIFT) & LINK_WIDTH_MASK;
@@@ -8326,7 -8608,7 +8605,7 @@@ static void read_vc_remote_link_width(s
        u32 frame;
  
        read_8051_config(dd, VERIFY_CAP_REMOTE_LINK_WIDTH, GENERAL_CONFIG,
-                               &frame);
+                        &frame);
        *remote_tx_rate = (frame >> REMOTE_TX_RATE_SHIFT)
                                & REMOTE_TX_RATE_MASK;
        *link_widths = (frame >> LINK_WIDTH_SHIFT) & LINK_WIDTH_MASK;
@@@ -8366,7 -8648,7 +8645,7 @@@ void hfi1_read_link_quality(struct hfi1
        *link_quality = 0;
        if (dd->pport->host_link_state & HLS_UP) {
                ret = read_8051_config(dd, LINK_QUALITY_INFO, GENERAL_CONFIG,
-                                       &frame);
+                                      &frame);
                if (ret == 0)
                        *link_quality = (frame >> LINK_QUALITY_SHIFT)
                                                & LINK_QUALITY_MASK;
@@@ -8426,10 -8708,9 +8705,9 @@@ static void check_fabric_firmware_versi
        for (lane = 0; lane < 4; lane++) {
                ret = read_8051_config(dd, SPICO_FW_VERSION, lane, &frame);
                if (ret) {
-                       dd_dev_err(
-                               dd,
-                               "Unable to read lane %d firmware details\n",
-                               lane);
+                       dd_dev_err(dd,
+                                  "Unable to read lane %d firmware details\n",
+                                  lane);
                        continue;
                }
                version = (frame >> SPICO_ROM_VERSION_SHIFT)
                prod_id = (frame >> SPICO_ROM_PROD_ID_SHIFT)
                                        & SPICO_ROM_PROD_ID_MASK;
                dd_dev_info(dd,
-                       "Lane %d firmware: version 0x%04x, prod_id 0x%04x\n",
-                       lane, version, prod_id);
+                           "Lane %d firmware: version 0x%04x, prod_id 0x%04x\n",
+                           lane, version, prod_id);
        }
  }
  
@@@ -8451,11 -8732,10 +8729,10 @@@ static int read_idle_message(struct hfi
  {
        int ret;
  
-       ret = do_8051_command(dd, HCMD_READ_LCB_IDLE_MSG,
-               type, data_out);
+       ret = do_8051_command(dd, HCMD_READ_LCB_IDLE_MSG, type, data_out);
        if (ret != HCMD_SUCCESS) {
                dd_dev_err(dd, "read idle message: type %d, err %d\n",
-                       (u32)type, ret);
+                          (u32)type, ret);
                return -EINVAL;
        }
        dd_dev_info(dd, "%s: read idle message 0x%llx\n", __func__, *data_out);
   */
  static int read_idle_sma(struct hfi1_devdata *dd, u64 *data)
  {
-       return read_idle_message(dd,
-                       (u64)IDLE_SMA << IDLE_MSG_TYPE_SHIFT, data);
+       return read_idle_message(dd, (u64)IDLE_SMA << IDLE_MSG_TYPE_SHIFT,
+                                data);
  }
  
  /*
@@@ -8489,7 -8769,7 +8766,7 @@@ static int send_idle_message(struct hfi
        ret = do_8051_command(dd, HCMD_SEND_LCB_IDLE_MSG, data, NULL);
        if (ret != HCMD_SUCCESS) {
                dd_dev_err(dd, "send idle message: data 0x%llx, err %d\n",
-                       data, ret);
+                          data, ret);
                return -EINVAL;
        }
        return 0;
@@@ -8504,8 -8784,8 +8781,8 @@@ int send_idle_sma(struct hfi1_devdata *
  {
        u64 data;
  
-       data = ((message & IDLE_PAYLOAD_MASK) << IDLE_PAYLOAD_SHIFT)
-               ((u64)IDLE_SMA << IDLE_MSG_TYPE_SHIFT);
+       data = ((message & IDLE_PAYLOAD_MASK) << IDLE_PAYLOAD_SHIFT) |
+               ((u64)IDLE_SMA << IDLE_MSG_TYPE_SHIFT);
        return send_idle_message(dd, data);
  }
  
@@@ -8527,7 -8807,7 +8804,7 @@@ static int do_quick_linkup(struct hfi1_
                /* LCB_CFG_LOOPBACK.VAL = 2 */
                /* LCB_CFG_LANE_WIDTH.VAL = 0 */
                write_csr(dd, DC_LCB_CFG_LOOPBACK,
-                       IB_PACKET_TYPE << DC_LCB_CFG_LOOPBACK_VAL_SHIFT);
+                         IB_PACKET_TYPE << DC_LCB_CFG_LOOPBACK_VAL_SHIFT);
                write_csr(dd, DC_LCB_CFG_LANE_WIDTH, 0);
        }
  
        if (loopback && dd->icode == ICODE_FUNCTIONAL_SIMULATOR) {
                /* LCB_CFG_RUN.EN = 1 */
                write_csr(dd, DC_LCB_CFG_RUN,
-                       1ull << DC_LCB_CFG_RUN_EN_SHIFT);
+                         1ull << DC_LCB_CFG_RUN_EN_SHIFT);
  
                /* watch LCB_STS_LINK_TRANSFER_ACTIVE */
                timeout = jiffies + msecs_to_jiffies(10);
                while (1) {
-                       reg = read_csr(dd,
-                               DC_LCB_STS_LINK_TRANSFER_ACTIVE);
+                       reg = read_csr(dd, DC_LCB_STS_LINK_TRANSFER_ACTIVE);
                        if (reg)
                                break;
                        if (time_after(jiffies, timeout)) {
                                dd_dev_err(dd,
-                                       "timeout waiting for LINK_TRANSFER_ACTIVE\n");
+                                          "timeout waiting for LINK_TRANSFER_ACTIVE\n");
                                return -ETIMEDOUT;
                        }
                        udelay(2);
                }
  
                write_csr(dd, DC_LCB_CFG_ALLOW_LINK_UP,
-                       1ull << DC_LCB_CFG_ALLOW_LINK_UP_VAL_SHIFT);
+                         1ull << DC_LCB_CFG_ALLOW_LINK_UP_VAL_SHIFT);
        }
  
        if (!loopback) {
                 * done with LCB set up before resuming.
                 */
                dd_dev_err(dd,
-                       "Pausing for peer to be finished with LCB set up\n");
+                          "Pausing for peer to be finished with LCB set up\n");
                msleep(5000);
-               dd_dev_err(dd,
-                       "Continuing with quick linkup\n");
+               dd_dev_err(dd, "Continuing with quick linkup\n");
        }
  
        write_csr(dd, DC_LCB_ERR_EN, 0); /* mask LCB errors */
        ret = set_physical_link_state(dd, PLS_QUICK_LINKUP);
        if (ret != HCMD_SUCCESS) {
                dd_dev_err(dd,
-                       "%s: set physical link state to quick LinkUp failed with return %d\n",
-                       __func__, ret);
+                          "%s: set physical link state to quick LinkUp failed with return %d\n",
+                          __func__, ret);
  
                set_host_lcb_access(dd);
                write_csr(dd, DC_LCB_ERR_EN, ~0ull); /* watch LCB errors */
@@@ -8612,8 -8890,8 +8887,8 @@@ static int set_serdes_loopback_mode(str
        if (ret == HCMD_SUCCESS)
                return 0;
        dd_dev_err(dd,
-               "Set physical link state to SerDes Loopback failed with return %d\n",
-               ret);
+                  "Set physical link state to SerDes Loopback failed with return %d\n",
+                  ret);
        if (ret >= 0)
                ret = -EINVAL;
        return ret;
@@@ -8628,7 -8906,7 +8903,7 @@@ static int init_loopback(struct hfi1_de
  
        /* all loopbacks should disable self GUID check */
        write_csr(dd, DC_DC8051_CFG_MODE,
-               (read_csr(dd, DC_DC8051_CFG_MODE) | DISABLE_SELF_GUID_CHECK));
+                 (read_csr(dd, DC_DC8051_CFG_MODE) | DISABLE_SELF_GUID_CHECK));
  
        /*
         * The simulator has only one loopback option - LCB.  Switch
         *
         * Accept all valid loopback values.
         */
-       if ((dd->icode == ICODE_FUNCTIONAL_SIMULATOR)
-               && (loopback == LOOPBACK_SERDES
-                       || loopback == LOOPBACK_LCB
-                       || loopback == LOOPBACK_CABLE)) {
+       if ((dd->icode == ICODE_FUNCTIONAL_SIMULATOR) &&
+           (loopback == LOOPBACK_SERDES || loopback == LOOPBACK_LCB ||
+            loopback == LOOPBACK_CABLE)) {
                loopback = LOOPBACK_LCB;
                quick_linkup = 1;
                return 0;
                /* not supported in emulation due to emulation RTL changes */
                if (dd->icode == ICODE_FPGA_EMULATION) {
                        dd_dev_err(dd,
-                               "LCB loopback not supported in emulation\n");
+                                  "LCB loopback not supported in emulation\n");
                        return -EINVAL;
                }
                return 0;
@@@ -8687,10 -8964,10 +8961,10 @@@ static u16 opa_to_vc_link_widths(u16 op
                u16 from;
                u16 to;
        } opa_link_xlate[] = {
-               { OPA_LINK_WIDTH_1X, 1 << (1-1)  },
-               { OPA_LINK_WIDTH_2X, 1 << (2-1)  },
-               { OPA_LINK_WIDTH_3X, 1 << (3-1)  },
-               { OPA_LINK_WIDTH_4X, 1 << (4-1)  },
+               { OPA_LINK_WIDTH_1X, 1 << (1 - 1)  },
+               { OPA_LINK_WIDTH_2X, 1 << (2 - 1)  },
+               { OPA_LINK_WIDTH_3X, 1 << (3 - 1)  },
+               { OPA_LINK_WIDTH_4X, 1 << (4 - 1)  },
        };
  
        for (i = 0; i < ARRAY_SIZE(opa_link_xlate); i++) {
@@@ -8716,7 -8993,7 +8990,7 @@@ static int set_local_link_attributes(st
  
        /* set the local tx rate - need to read-modify-write */
        ret = read_tx_settings(dd, &enable_lane_tx, &tx_polarity_inversion,
-               &rx_polarity_inversion, &ppd->local_tx_rate);
+                              &rx_polarity_inversion, &ppd->local_tx_rate);
        if (ret)
                goto set_local_link_attributes_fail;
  
  
        enable_lane_tx = 0xF; /* enable all four lanes */
        ret = write_tx_settings(dd, enable_lane_tx, tx_polarity_inversion,
-                    rx_polarity_inversion, ppd->local_tx_rate);
+                               rx_polarity_inversion, ppd->local_tx_rate);
        if (ret != HCMD_SUCCESS)
                goto set_local_link_attributes_fail;
  
        /*
         * DC supports continuous updates.
         */
-       ret = write_vc_local_phy(dd, 0 /* no power management */,
-                                    1 /* continuous updates */);
+       ret = write_vc_local_phy(dd,
+                                0 /* no power management */,
+                                1 /* continuous updates */);
        if (ret != HCMD_SUCCESS)
                goto set_local_link_attributes_fail;
  
                goto set_local_link_attributes_fail;
  
        ret = write_vc_local_link_width(dd, 0, 0,
-                    opa_to_vc_link_widths(ppd->link_width_enabled));
+                                       opa_to_vc_link_widths(
+                                               ppd->link_width_enabled));
        if (ret != HCMD_SUCCESS)
                goto set_local_link_attributes_fail;
  
  
  set_local_link_attributes_fail:
        dd_dev_err(dd,
-               "Failed to set local link attributes, return 0x%x\n",
-               ret);
+                  "Failed to set local link attributes, return 0x%x\n",
+                  ret);
        return ret;
  }
  
@@@ -8781,54 -9060,101 +9057,101 @@@ int start_link(struct hfi1_pportdata *p
  {
        if (!ppd->link_enabled) {
                dd_dev_info(ppd->dd,
-                       "%s: stopping link start because link is disabled\n",
-                       __func__);
+                           "%s: stopping link start because link is disabled\n",
+                           __func__);
                return 0;
        }
        if (!ppd->driver_link_ready) {
                dd_dev_info(ppd->dd,
-                       "%s: stopping link start because driver is not ready\n",
-                       __func__);
+                           "%s: stopping link start because driver is not ready\n",
+                           __func__);
                return 0;
        }
  
        if (qsfp_mod_present(ppd) || loopback == LOOPBACK_SERDES ||
-                       loopback == LOOPBACK_LCB ||
-                       ppd->dd->icode == ICODE_FUNCTIONAL_SIMULATOR)
+           loopback == LOOPBACK_LCB ||
+           ppd->dd->icode == ICODE_FUNCTIONAL_SIMULATOR)
                return set_link_state(ppd, HLS_DN_POLL);
  
        dd_dev_info(ppd->dd,
-               "%s: stopping link start because no cable is present\n",
-               __func__);
+                   "%s: stopping link start because no cable is present\n",
+                   __func__);
        return -EAGAIN;
  }
  
- static void reset_qsfp(struct hfi1_pportdata *ppd)
+ static void wait_for_qsfp_init(struct hfi1_pportdata *ppd)
+ {
+       struct hfi1_devdata *dd = ppd->dd;
+       u64 mask;
+       unsigned long timeout;
+       /*
+        * Check for QSFP interrupt for t_init (SFF 8679)
+        */
+       timeout = jiffies + msecs_to_jiffies(2000);
+       while (1) {
+               mask = read_csr(dd, dd->hfi1_id ?
+                               ASIC_QSFP2_IN : ASIC_QSFP1_IN);
+               if (!(mask & QSFP_HFI0_INT_N)) {
+                       write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_CLEAR :
+                                 ASIC_QSFP1_CLEAR, QSFP_HFI0_INT_N);
+                       break;
+               }
+               if (time_after(jiffies, timeout)) {
+                       dd_dev_info(dd, "%s: No IntN detected, reset complete\n",
+                                   __func__);
+                       break;
+               }
+               udelay(2);
+       }
+ }
+ static void set_qsfp_int_n(struct hfi1_pportdata *ppd, u8 enable)
+ {
+       struct hfi1_devdata *dd = ppd->dd;
+       u64 mask;
+       mask = read_csr(dd, dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK);
+       if (enable)
+               mask |= (u64)QSFP_HFI0_INT_N;
+       else
+               mask &= ~(u64)QSFP_HFI0_INT_N;
+       write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK, mask);
+ }
+ void reset_qsfp(struct hfi1_pportdata *ppd)
  {
        struct hfi1_devdata *dd = ppd->dd;
        u64 mask, qsfp_mask;
  
+       /* Disable INT_N from triggering QSFP interrupts */
+       set_qsfp_int_n(ppd, 0);
+       /* Reset the QSFP */
        mask = (u64)QSFP_HFI0_RESET_N;
-       qsfp_mask = read_csr(dd,
-               dd->hfi1_id ? ASIC_QSFP2_OE : ASIC_QSFP1_OE);
+       qsfp_mask = read_csr(dd, dd->hfi1_id ? ASIC_QSFP2_OE : ASIC_QSFP1_OE);
        qsfp_mask |= mask;
-       write_csr(dd,
-               dd->hfi1_id ? ASIC_QSFP2_OE : ASIC_QSFP1_OE,
-               qsfp_mask);
+       write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_OE : ASIC_QSFP1_OE, qsfp_mask);
  
        qsfp_mask = read_csr(dd,
-               dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT);
+                            dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT);
        qsfp_mask &= ~mask;
        write_csr(dd,
-               dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT,
-               qsfp_mask);
+                 dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT, qsfp_mask);
  
        udelay(10);
  
        qsfp_mask |= mask;
        write_csr(dd,
-               dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT,
-               qsfp_mask);
+                 dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT, qsfp_mask);
+       wait_for_qsfp_init(ppd);
+       /*
+        * Allow INT_N to trigger the QSFP interrupt to watch
+        * for alarms and warnings
+        */
+       set_qsfp_int_n(ppd, 1);
  }
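reset_qsfp() above now brackets the hardware reset with interrupt masking: IntN is masked so the reset pulse does not fire the QSFP handler, RESET_N is driven low for about 10 us through the OE/OUT CSRs and released, wait_for_qsfp_init() then polls the input CSR for up to 2 s (the t_init window from SFF-8679) for the module to signal via IntN that initialization has finished, and only then is IntN unmasked again. A condensed sketch of that ordering (names follow the functions above; not a drop-in replacement):

    static void example_qsfp_reset_sequence(struct hfi1_pportdata *ppd)
    {
            set_qsfp_int_n(ppd, 0);    /* 1. mask IntN around the reset        */
            /* 2. drive RESET_N low via OE/OUT, hold ~10us, release (see above) */
            wait_for_qsfp_init(ppd);   /* 3. poll up to 2s for t_init to finish */
            set_qsfp_int_n(ppd, 1);    /* 4. unmask IntN for alarms/warnings    */
    }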
  
  static int handle_qsfp_error_conditions(struct hfi1_pportdata *ppd,
        struct hfi1_devdata *dd = ppd->dd;
  
        if ((qsfp_interrupt_status[0] & QSFP_HIGH_TEMP_ALARM) ||
-               (qsfp_interrupt_status[0] & QSFP_HIGH_TEMP_WARNING))
-               dd_dev_info(dd,
-                       "%s: QSFP cable on fire\n",
-                       __func__);
+           (qsfp_interrupt_status[0] & QSFP_HIGH_TEMP_WARNING))
+               dd_dev_info(dd, "%s: QSFP cable on fire\n",
+                           __func__);
  
        if ((qsfp_interrupt_status[0] & QSFP_LOW_TEMP_ALARM) ||
-               (qsfp_interrupt_status[0] & QSFP_LOW_TEMP_WARNING))
-               dd_dev_info(dd,
-                       "%s: QSFP cable temperature too low\n",
-                       __func__);
+           (qsfp_interrupt_status[0] & QSFP_LOW_TEMP_WARNING))
+               dd_dev_info(dd, "%s: QSFP cable temperature too low\n",
+                           __func__);
  
        if ((qsfp_interrupt_status[1] & QSFP_HIGH_VCC_ALARM) ||
-               (qsfp_interrupt_status[1] & QSFP_HIGH_VCC_WARNING))
-               dd_dev_info(dd,
-                       "%s: QSFP supply voltage too high\n",
-                       __func__);
+           (qsfp_interrupt_status[1] & QSFP_HIGH_VCC_WARNING))
+               dd_dev_info(dd, "%s: QSFP supply voltage too high\n",
+                           __func__);
  
        if ((qsfp_interrupt_status[1] & QSFP_LOW_VCC_ALARM) ||
-               (qsfp_interrupt_status[1] & QSFP_LOW_VCC_WARNING))
-               dd_dev_info(dd,
-                       "%s: QSFP supply voltage too low\n",
-                       __func__);
+           (qsfp_interrupt_status[1] & QSFP_LOW_VCC_WARNING))
+               dd_dev_info(dd, "%s: QSFP supply voltage too low\n",
+                           __func__);
  
        /* Byte 2 is vendor specific */
  
        if ((qsfp_interrupt_status[3] & QSFP_HIGH_POWER_ALARM) ||
-               (qsfp_interrupt_status[3] & QSFP_HIGH_POWER_WARNING))
-               dd_dev_info(dd,
-                       "%s: Cable RX channel 1/2 power too high\n",
-                       __func__);
+           (qsfp_interrupt_status[3] & QSFP_HIGH_POWER_WARNING))
+               dd_dev_info(dd, "%s: Cable RX channel 1/2 power too high\n",
+                           __func__);
  
        if ((qsfp_interrupt_status[3] & QSFP_LOW_POWER_ALARM) ||
-               (qsfp_interrupt_status[3] & QSFP_LOW_POWER_WARNING))
-               dd_dev_info(dd,
-                       "%s: Cable RX channel 1/2 power too low\n",
-                       __func__);
+           (qsfp_interrupt_status[3] & QSFP_LOW_POWER_WARNING))
+               dd_dev_info(dd, "%s: Cable RX channel 1/2 power too low\n",
+                           __func__);
  
        if ((qsfp_interrupt_status[4] & QSFP_HIGH_POWER_ALARM) ||
-               (qsfp_interrupt_status[4] & QSFP_HIGH_POWER_WARNING))
-               dd_dev_info(dd,
-                       "%s: Cable RX channel 3/4 power too high\n",
-                       __func__);
+           (qsfp_interrupt_status[4] & QSFP_HIGH_POWER_WARNING))
+               dd_dev_info(dd, "%s: Cable RX channel 3/4 power too high\n",
+                           __func__);
  
        if ((qsfp_interrupt_status[4] & QSFP_LOW_POWER_ALARM) ||
-               (qsfp_interrupt_status[4] & QSFP_LOW_POWER_WARNING))
-               dd_dev_info(dd,
-                       "%s: Cable RX channel 3/4 power too low\n",
-                       __func__);
+           (qsfp_interrupt_status[4] & QSFP_LOW_POWER_WARNING))
+               dd_dev_info(dd, "%s: Cable RX channel 3/4 power too low\n",
+                           __func__);
  
        if ((qsfp_interrupt_status[5] & QSFP_HIGH_BIAS_ALARM) ||
-               (qsfp_interrupt_status[5] & QSFP_HIGH_BIAS_WARNING))
-               dd_dev_info(dd,
-                       "%s: Cable TX channel 1/2 bias too high\n",
-                       __func__);
+           (qsfp_interrupt_status[5] & QSFP_HIGH_BIAS_WARNING))
+               dd_dev_info(dd, "%s: Cable TX channel 1/2 bias too high\n",
+                           __func__);
  
        if ((qsfp_interrupt_status[5] & QSFP_LOW_BIAS_ALARM) ||
-               (qsfp_interrupt_status[5] & QSFP_LOW_BIAS_WARNING))
-               dd_dev_info(dd,
-                       "%s: Cable TX channel 1/2 bias too low\n",
-                       __func__);
+           (qsfp_interrupt_status[5] & QSFP_LOW_BIAS_WARNING))
+               dd_dev_info(dd, "%s: Cable TX channel 1/2 bias too low\n",
+                           __func__);
  
        if ((qsfp_interrupt_status[6] & QSFP_HIGH_BIAS_ALARM) ||
-               (qsfp_interrupt_status[6] & QSFP_HIGH_BIAS_WARNING))
-               dd_dev_info(dd,
-                       "%s: Cable TX channel 3/4 bias too high\n",
-                       __func__);
+           (qsfp_interrupt_status[6] & QSFP_HIGH_BIAS_WARNING))
+               dd_dev_info(dd, "%s: Cable TX channel 3/4 bias too high\n",
+                           __func__);
  
        if ((qsfp_interrupt_status[6] & QSFP_LOW_BIAS_ALARM) ||
-               (qsfp_interrupt_status[6] & QSFP_LOW_BIAS_WARNING))
-               dd_dev_info(dd,
-                       "%s: Cable TX channel 3/4 bias too low\n",
-                       __func__);
+           (qsfp_interrupt_status[6] & QSFP_LOW_BIAS_WARNING))
+               dd_dev_info(dd, "%s: Cable TX channel 3/4 bias too low\n",
+                           __func__);
  
        if ((qsfp_interrupt_status[7] & QSFP_HIGH_POWER_ALARM) ||
-               (qsfp_interrupt_status[7] & QSFP_HIGH_POWER_WARNING))
-               dd_dev_info(dd,
-                       "%s: Cable TX channel 1/2 power too high\n",
-                       __func__);
+           (qsfp_interrupt_status[7] & QSFP_HIGH_POWER_WARNING))
+               dd_dev_info(dd, "%s: Cable TX channel 1/2 power too high\n",
+                           __func__);
  
        if ((qsfp_interrupt_status[7] & QSFP_LOW_POWER_ALARM) ||
-               (qsfp_interrupt_status[7] & QSFP_LOW_POWER_WARNING))
-               dd_dev_info(dd,
-                       "%s: Cable TX channel 1/2 power too low\n",
-                       __func__);
+           (qsfp_interrupt_status[7] & QSFP_LOW_POWER_WARNING))
+               dd_dev_info(dd, "%s: Cable TX channel 1/2 power too low\n",
+                           __func__);
  
        if ((qsfp_interrupt_status[8] & QSFP_HIGH_POWER_ALARM) ||
-               (qsfp_interrupt_status[8] & QSFP_HIGH_POWER_WARNING))
-               dd_dev_info(dd,
-                       "%s: Cable TX channel 3/4 power too high\n",
-                       __func__);
+           (qsfp_interrupt_status[8] & QSFP_HIGH_POWER_WARNING))
+               dd_dev_info(dd, "%s: Cable TX channel 3/4 power too high\n",
+                           __func__);
  
        if ((qsfp_interrupt_status[8] & QSFP_LOW_POWER_ALARM) ||
-               (qsfp_interrupt_status[8] & QSFP_LOW_POWER_WARNING))
-               dd_dev_info(dd,
-                       "%s: Cable TX channel 3/4 power too low\n",
-                       __func__);
+           (qsfp_interrupt_status[8] & QSFP_LOW_POWER_WARNING))
+               dd_dev_info(dd, "%s: Cable TX channel 3/4 power too low\n",
+                           __func__);
  
        /* Bytes 9-10 and 11-12 are reserved */
        /* Bytes 13-15 are vendor specific */
        return 0;
  }
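
handle_qsfp_error_conditions() above walks the module's alarm/warning status bytes and logs one message per asserted condition. A self-contained sketch of the same decode style follows; the EX_* bit positions and the two sample readings are made up for the example, and the real masks are the driver's QSFP_* definitions.

/*
 * Illustrative decode of one alarm/warning status byte, in the spirit of
 * handle_qsfp_error_conditions() above.  Bit layout here is invented.
 */
#include <stdint.h>
#include <stdio.h>

#define EX_HIGH_ALARM   0x80
#define EX_LOW_ALARM    0x40
#define EX_HIGH_WARNING 0x20
#define EX_LOW_WARNING  0x10

static void decode_byte(const char *what, uint8_t status)
{
	if (status & (EX_HIGH_ALARM | EX_HIGH_WARNING))
		printf("%s too high\n", what);
	if (status & (EX_LOW_ALARM | EX_LOW_WARNING))
		printf("%s too low\n", what);
}

int main(void)
{
	uint8_t temp_status = EX_HIGH_WARNING;   /* sample reading */
	uint8_t vcc_status  = EX_LOW_ALARM;      /* sample reading */

	decode_byte("temperature", temp_status);
	decode_byte("supply voltage", vcc_status);
	return 0;
}
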
  
- static int do_pre_lni_host_behaviors(struct hfi1_pportdata *ppd)
- {
-       refresh_qsfp_cache(ppd, &ppd->qsfp_info);
-       return 0;
- }
- static int do_qsfp_intr_fallback(struct hfi1_pportdata *ppd)
- {
-       struct hfi1_devdata *dd = ppd->dd;
-       u8 qsfp_interrupt_status = 0;
-       if (qsfp_read(ppd, dd->hfi1_id, 2, &qsfp_interrupt_status, 1)
-               != 1) {
-               dd_dev_info(dd,
-                       "%s: Failed to read status of QSFP module\n",
-                       __func__);
-               return -EIO;
-       }
-       /* We don't care about alarms & warnings with a non-functional INT_N */
-       if (!(qsfp_interrupt_status & QSFP_DATA_NOT_READY))
-               do_pre_lni_host_behaviors(ppd);
-       return 0;
- }
  /* This routine will only be scheduled if the QSFP module is present */
- static void qsfp_event(struct work_struct *work)
+ void qsfp_event(struct work_struct *work)
  {
        struct qsfp_data *qd;
        struct hfi1_pportdata *ppd;
        dc_start(dd);
  
        if (qd->cache_refresh_required) {
-               msleep(3000);
-               reset_qsfp(ppd);
+               set_qsfp_int_n(ppd, 0);
  
-               /* Check for QSFP interrupt after t_init (SFF 8679)
-                * + extra
+               wait_for_qsfp_init(ppd);
+               /*
+                * Allow INT_N to trigger the QSFP interrupt to watch
+                * for alarms and warnings
                 */
-               msleep(3000);
-               if (!qd->qsfp_interrupt_functional) {
-                       if (do_qsfp_intr_fallback(ppd) < 0)
-                               dd_dev_info(dd, "%s: QSFP fallback failed\n",
-                                       __func__);
-                       ppd->driver_link_ready = 1;
-                       start_link(ppd);
-               }
+               set_qsfp_int_n(ppd, 1);
+               tune_serdes(ppd);
+               start_link(ppd);
        }
  
        if (qd->check_interrupt_flags) {
                u8 qsfp_interrupt_status[16] = {0,};
  
-               if (qsfp_read(ppd, dd->hfi1_id, 6,
-                             &qsfp_interrupt_status[0], 16) != 16) {
+               if (one_qsfp_read(ppd, dd->hfi1_id, 6,
+                                 &qsfp_interrupt_status[0], 16) != 16) {
                        dd_dev_info(dd,
-                               "%s: Failed to read status of QSFP module\n",
-                               __func__);
+                                   "%s: Failed to read status of QSFP module\n",
+                                   __func__);
                } else {
                        unsigned long flags;
-                       u8 data_status;
  
+                       handle_qsfp_error_conditions(
+                                       ppd, qsfp_interrupt_status);
                        spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
                        ppd->qsfp_info.check_interrupt_flags = 0;
                        spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock,
-                                                               flags);
-                       if (qsfp_read(ppd, dd->hfi1_id, 2, &data_status, 1)
-                                != 1) {
-                               dd_dev_info(dd,
-                               "%s: Failed to read status of QSFP module\n",
-                                       __func__);
-                       }
-                       if (!(data_status & QSFP_DATA_NOT_READY)) {
-                               do_pre_lni_host_behaviors(ppd);
-                               start_link(ppd);
-                       } else
-                               handle_qsfp_error_conditions(ppd,
-                                               qsfp_interrupt_status);
+                                              flags);
                }
        }
  }
  
- void init_qsfp(struct hfi1_pportdata *ppd)
+ static void init_qsfp_int(struct hfi1_devdata *dd)
  {
-       struct hfi1_devdata *dd = ppd->dd;
-       u64 qsfp_mask;
+       struct hfi1_pportdata *ppd = dd->pport;
+       u64 qsfp_mask, cce_int_mask;
+       const int qsfp1_int_smask = QSFP1_INT % 64;
+       const int qsfp2_int_smask = QSFP2_INT % 64;
  
-       if (loopback == LOOPBACK_SERDES || loopback == LOOPBACK_LCB ||
-                       ppd->dd->icode == ICODE_FUNCTIONAL_SIMULATOR) {
-               ppd->driver_link_ready = 1;
-               return;
+       /*
+        * disable QSFP1 interrupts for HFI1, QSFP2 interrupts for HFI0
+        * Qsfp1Int and Qsfp2Int are adjacent bits in the same CSR,
+        * therefore just one of QSFP1_INT/QSFP2_INT can be used to find
+        * the index of the appropriate CSR in the CCEIntMask CSR array
+        */
+       cce_int_mask = read_csr(dd, CCE_INT_MASK +
+                               (8 * (QSFP1_INT / 64)));
+       if (dd->hfi1_id) {
+               cce_int_mask &= ~((u64)1 << qsfp1_int_smask);
+               write_csr(dd, CCE_INT_MASK + (8 * (QSFP1_INT / 64)),
+                         cce_int_mask);
+       } else {
+               cce_int_mask &= ~((u64)1 << qsfp2_int_smask);
+               write_csr(dd, CCE_INT_MASK + (8 * (QSFP2_INT / 64)),
+                         cce_int_mask);
        }
  
-       ppd->qsfp_info.ppd = ppd;
-       INIT_WORK(&ppd->qsfp_info.qsfp_work, qsfp_event);
        qsfp_mask = (u64)(QSFP_HFI0_INT_N | QSFP_HFI0_MODPRST_N);
        /* Clear current status to avoid spurious interrupts */
-       write_csr(dd,
-                       dd->hfi1_id ?
-                               ASIC_QSFP2_CLEAR :
-                               ASIC_QSFP1_CLEAR,
-               qsfp_mask);
+       write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_CLEAR : ASIC_QSFP1_CLEAR,
+                 qsfp_mask);
+       write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK,
+                 qsfp_mask);
+       set_qsfp_int_n(ppd, 0);
  
        /* Handle active low nature of INT_N and MODPRST_N pins */
        if (qsfp_mod_present(ppd))
                qsfp_mask &= ~(u64)QSFP_HFI0_MODPRST_N;
        write_csr(dd,
                  dd->hfi1_id ? ASIC_QSFP2_INVERT : ASIC_QSFP1_INVERT,
                  qsfp_mask);
-       /* Allow only INT_N and MODPRST_N to trigger QSFP interrupts */
-       qsfp_mask |= (u64)QSFP_HFI0_MODPRST_N;
-       write_csr(dd,
-               dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK,
-               qsfp_mask);
-       if (qsfp_mod_present(ppd)) {
-               msleep(3000);
-               reset_qsfp(ppd);
-               /* Check for QSFP interrupt after t_init (SFF 8679)
-                * + extra
-                */
-               msleep(3000);
-               if (!ppd->qsfp_info.qsfp_interrupt_functional) {
-                       if (do_qsfp_intr_fallback(ppd) < 0)
-                               dd_dev_info(dd,
-                                       "%s: QSFP fallback failed\n",
-                                       __func__);
-                       ppd->driver_link_ready = 1;
-               }
-       }
  }
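
The comment in init_qsfp_int() above relies on a small piece of arithmetic: a flat interrupt-source number N lives in 64-bit mask CSR number N / 64 (byte offset 8 * (N / 64) from CCE_INT_MASK) at bit N % 64. A standalone worked example with a hypothetical source number:

/*
 * Worked example of the mask-CSR indexing used in init_qsfp_int() above.
 * The source number is hypothetical, not the chip's QSFP1_INT value.
 */
#include <stdio.h>

int main(void)
{
	unsigned int src = 243;                  /* hypothetical source no. */
	unsigned int csr = src / 64;             /* which 64-bit mask CSR   */
	unsigned int bit = src % 64;             /* bit within that CSR     */

	printf("source %u -> CSR index %u (byte offset %u), bit %u\n",
	       src, csr, 8 * csr, bit);
	return 0;
}
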
  
  /*
   */
  static void init_lcb(struct hfi1_devdata *dd)
  {
+       /* simulator does not correctly handle LCB cclk loopback, skip */
+       if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR)
+               return;
        /* the DC has been reset earlier in the driver load */
  
        /* set LCB for cclk loopback on the port */
@@@ -9125,8 -9388,6 +9385,6 @@@ int bringup_serdes(struct hfi1_pportdat
                ppd->guid = guid;
        }
  
-       /* the link defaults to enabled */
-       ppd->link_enabled = 1;
        /* Set linkinit_reason on power up per OPA spec */
        ppd->linkinit_reason = OPA_LINKINIT_REASON_LINKUP;
  
                        return ret;
        }
  
+       /* tune the SERDES to a ballpark setting for
+        * optimal signal and bit error rate
+        * Needs to be done before starting the link
+        */
+       tune_serdes(ppd);
        return start_link(ppd);
  }
  
@@@ -9156,8 -9423,10 +9420,10 @@@ void hfi1_quiet_serdes(struct hfi1_ppor
        ppd->driver_link_ready = 0;
        ppd->link_enabled = 0;
  
+       ppd->offline_disabled_reason =
+                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_SMA_DISABLED);
        set_link_down_reason(ppd, OPA_LINKDOWN_REASON_SMA_DISABLED, 0,
-         OPA_LINKDOWN_REASON_SMA_DISABLED);
+                            OPA_LINKDOWN_REASON_SMA_DISABLED);
        set_link_state(ppd, HLS_DN_OFFLINE);
  
        /* disable the port */
@@@ -9171,14 -9440,14 +9437,14 @@@ static inline int init_cpu_counters(str
  
        ppd = (struct hfi1_pportdata *)(dd + 1);
        for (i = 0; i < dd->num_pports; i++, ppd++) {
-               ppd->ibport_data.rc_acks = NULL;
-               ppd->ibport_data.rc_qacks = NULL;
-               ppd->ibport_data.rc_acks = alloc_percpu(u64);
-               ppd->ibport_data.rc_qacks = alloc_percpu(u64);
-               ppd->ibport_data.rc_delayed_comp = alloc_percpu(u64);
-               if ((ppd->ibport_data.rc_acks == NULL) ||
-                   (ppd->ibport_data.rc_delayed_comp == NULL) ||
-                   (ppd->ibport_data.rc_qacks == NULL))
+               ppd->ibport_data.rvp.rc_acks = NULL;
+               ppd->ibport_data.rvp.rc_qacks = NULL;
+               ppd->ibport_data.rvp.rc_acks = alloc_percpu(u64);
+               ppd->ibport_data.rvp.rc_qacks = alloc_percpu(u64);
+               ppd->ibport_data.rvp.rc_delayed_comp = alloc_percpu(u64);
+               if (!ppd->ibport_data.rvp.rc_acks ||
+                   !ppd->ibport_data.rvp.rc_delayed_comp ||
+                   !ppd->ibport_data.rvp.rc_qacks)
                        return -ENOMEM;
        }
  
@@@ -9213,8 -9482,8 +9479,8 @@@ void hfi1_put_tid(struct hfi1_devdata *
                pa = 0;
        } else if (type > PT_INVALID) {
                dd_dev_err(dd,
-                       "unexpected receive array type %u for index %u, not handled\n",
-                       type, index);
+                          "unexpected receive array type %u for index %u, not handled\n",
+                          type, index);
                goto done;
        }
  
@@@ -9429,12 -9698,15 +9695,15 @@@ static void set_send_length(struct hfi1
        /* all kernel receive contexts have the same hdrqentsize */
        for (i = 0; i < ppd->vls_supported; i++) {
                sc_set_cr_threshold(dd->vld[i].sc,
-                       sc_mtu_to_threshold(dd->vld[i].sc, dd->vld[i].mtu,
-                               dd->rcd[0]->rcvhdrqentsize));
+                                   sc_mtu_to_threshold(dd->vld[i].sc,
+                                                       dd->vld[i].mtu,
+                                                       dd->rcd[0]->
+                                                       rcvhdrqentsize));
        }
        sc_set_cr_threshold(dd->vld[15].sc,
-               sc_mtu_to_threshold(dd->vld[15].sc, dd->vld[15].mtu,
-                       dd->rcd[0]->rcvhdrqentsize));
+                           sc_mtu_to_threshold(dd->vld[15].sc,
+                                               dd->vld[15].mtu,
+                                               dd->rcd[0]->rcvhdrqentsize));
  
        /* Adjust maximum MTU for the port in DC */
        dcmtu = maxvlmtu == 10240 ? DCC_CFG_PORT_MTU_CAP_10240 :
@@@ -9460,7 -9732,7 +9729,7 @@@ static void set_lidlmc(struct hfi1_ppor
        c1 &= ~(DCC_CFG_PORT_CONFIG1_TARGET_DLID_SMASK
                | DCC_CFG_PORT_CONFIG1_DLID_MASK_SMASK);
        c1 |= ((ppd->lid & DCC_CFG_PORT_CONFIG1_TARGET_DLID_MASK)
-                       << DCC_CFG_PORT_CONFIG1_TARGET_DLID_SHIFT)|
+                       << DCC_CFG_PORT_CONFIG1_TARGET_DLID_SHIFT) |
              ((mask & DCC_CFG_PORT_CONFIG1_DLID_MASK_MASK)
                        << DCC_CFG_PORT_CONFIG1_DLID_MASK_SHIFT);
        write_csr(ppd->dd, DCC_CFG_PORT_CONFIG1, c1);
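
The DCC_CFG_PORT_CONFIG1 update above is the usual mask-then-shift register packing: clear the target fields, then OR in each value masked to its width and shifted into place. A minimal sketch of that pattern with invented field widths and positions:

/*
 * Illustrative field packing; EX_* widths and shifts are made up and do
 * not correspond to the real DCC_CFG_PORT_CONFIG1 layout.
 */
#include <stdint.h>
#include <stdio.h>

#define EX_DLID_MASK   0xffffffull   /* 24-bit field */
#define EX_DLID_SHIFT  16
#define EX_LMC_MASK    0x7ull        /* 3-bit field  */
#define EX_LMC_SHIFT   0

int main(void)
{
	uint64_t reg = ~0ull;            /* pretend old register contents */
	uint32_t lid = 0x1234, lmc = 2;

	reg &= ~((EX_DLID_MASK << EX_DLID_SHIFT) | (EX_LMC_MASK << EX_LMC_SHIFT));
	reg |= ((uint64_t)lid & EX_DLID_MASK) << EX_DLID_SHIFT;
	reg |= ((uint64_t)lmc & EX_LMC_MASK) << EX_LMC_SHIFT;

	printf("packed register = 0x%016llx\n", (unsigned long long)reg);
	return 0;
}
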
@@@ -9495,8 -9767,8 +9764,8 @@@ static int wait_phy_linkstate(struct hf
                        break;
                if (time_after(jiffies, timeout)) {
                        dd_dev_err(dd,
-                               "timeout waiting for phy link state 0x%x, current state is 0x%x\n",
-                               state, curr_state);
+                                  "timeout waiting for phy link state 0x%x, current state is 0x%x\n",
+                                  state, curr_state);
                        return -ETIMEDOUT;
                }
                usleep_range(1950, 2050); /* sleep 2ms-ish */
@@@ -9539,17 -9811,18 +9808,18 @@@ static int goto_offline(struct hfi1_ppo
  
        if (do_transition) {
                ret = set_physical_link_state(dd,
-                       PLS_OFFLINE | (rem_reason << 8));
+                                             (rem_reason << 8) | PLS_OFFLINE);
  
                if (ret != HCMD_SUCCESS) {
                        dd_dev_err(dd,
-                               "Failed to transition to Offline link state, return %d\n",
-                               ret);
+                                  "Failed to transition to Offline link state, return %d\n",
+                                  ret);
                        return -EINVAL;
                }
-               if (ppd->offline_disabled_reason == OPA_LINKDOWN_REASON_NONE)
+               if (ppd->offline_disabled_reason ==
+                               HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE))
                        ppd->offline_disabled_reason =
-                       OPA_LINKDOWN_REASON_TRANSIENT;
+                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_TRANSIENT);
        }
  
        if (do_wait) {
        write_csr(dd, DC_LCB_ERR_EN, ~0ull); /* watch LCB errors */
        ppd->host_link_state = HLS_LINK_COOLDOWN; /* LCB access allowed */
  
+       if (ppd->port_type == PORT_TYPE_QSFP &&
+           ppd->qsfp_info.limiting_active &&
+           qsfp_mod_present(ppd)) {
+               int ret;
+               ret = acquire_chip_resource(dd, qsfp_resource(dd), QSFP_WAIT);
+               if (ret == 0) {
+                       set_qsfp_tx(ppd, 0);
+                       release_chip_resource(dd, qsfp_resource(dd));
+               } else {
+                       /* not fatal, but should warn */
+                       dd_dev_err(dd,
+                                  "Unable to acquire lock to turn off QSFP TX\n");
+               }
+       }
        /*
         * The LNI has a mandatory wait time after the physical state
         * moves to Offline.Quiet.  The wait time may be different
        ret = wait_fm_ready(dd, 7000);
        if (ret) {
                dd_dev_err(dd,
-                       "After going offline, timed out waiting for the 8051 to become ready to accept host requests\n");
+                          "After going offline, timed out waiting for the 8051 to become ready to accept host requests\n");
                /* state is really offline, so make it so */
                ppd->host_link_state = HLS_DN_OFFLINE;
                return ret;
                read_last_local_state(dd, &last_local_state);
                read_last_remote_state(dd, &last_remote_state);
                dd_dev_err(dd,
-                       "LNI failure last states: local 0x%08x, remote 0x%08x\n",
-                       last_local_state, last_remote_state);
+                          "LNI failure last states: local 0x%08x, remote 0x%08x\n",
+                          last_local_state, last_remote_state);
        }
  
        /* the active link width (downgrade) is 0 on link down */
@@@ -9754,14 -10043,14 +10040,14 @@@ int set_link_state(struct hfi1_pportdat
                state = dd->link_default;
  
        /* interpret poll -> poll as a link bounce */
-       poll_bounce = ppd->host_link_state == HLS_DN_POLL
-                               && state == HLS_DN_POLL;
+       poll_bounce = ppd->host_link_state == HLS_DN_POLL &&
+                     state == HLS_DN_POLL;
  
        dd_dev_info(dd, "%s: current %s, new %s %s%s\n", __func__,
-               link_state_name(ppd->host_link_state),
-               link_state_name(orig_new_state),
-               poll_bounce ? "(bounce) " : "",
-               link_state_reason_name(ppd, state));
+                   link_state_name(ppd->host_link_state),
+                   link_state_name(orig_new_state),
+                   poll_bounce ? "(bounce) " : "",
+                   link_state_reason_name(ppd, state));
  
        was_up = !!(ppd->host_link_state & HLS_UP);
  
  
        switch (state) {
        case HLS_UP_INIT:
-               if (ppd->host_link_state == HLS_DN_POLL && (quick_linkup
-                           || dd->icode == ICODE_FUNCTIONAL_SIMULATOR)) {
+               if (ppd->host_link_state == HLS_DN_POLL &&
+                   (quick_linkup || dd->icode == ICODE_FUNCTIONAL_SIMULATOR)) {
                        /*
                         * Quick link up jumps from polling to here.
                         *
                         * simulator jumps from polling to link up.
                         * Accept that here.
                         */
-                       /* OK */;
+                       /* OK */
                } else if (ppd->host_link_state != HLS_GOING_UP) {
                        goto unexpected;
                }
                        /* logical state didn't change, stay at going_up */
                        ppd->host_link_state = HLS_GOING_UP;
                        dd_dev_err(dd,
-                               "%s: logical state did not change to INIT\n",
-                               __func__);
+                                  "%s: logical state did not change to INIT\n",
+                                  __func__);
                } else {
                        /* clear old transient LINKINIT_REASON code */
                        if (ppd->linkinit_reason >= OPA_LINKINIT_REASON_CLEAR)
                        /* logical state didn't change, stay at init */
                        ppd->host_link_state = HLS_UP_INIT;
                        dd_dev_err(dd,
-                               "%s: logical state did not change to ARMED\n",
-                               __func__);
+                                  "%s: logical state did not change to ARMED\n",
+                                  __func__);
                }
                /*
                 * The simulator does not currently implement SMA messages,
                        /* logical state didn't change, stay at armed */
                        ppd->host_link_state = HLS_UP_ARMED;
                        dd_dev_err(dd,
-                               "%s: logical state did not change to ACTIVE\n",
-                               __func__);
+                                  "%s: logical state did not change to ACTIVE\n",
+                                  __func__);
                } else {
                        /* tell all engines to go running */
                        sdma_all_running(dd);
  
                        /* Signal the IB layer that the port has went active */
-                       event.device = &dd->verbs_dev.ibdev;
+                       event.device = &dd->verbs_dev.rdi.ibdev;
                        event.element.port_num = ppd->port;
                        event.event = IB_EVENT_PORT_ACTIVE;
                }
                                ppd->link_enabled = 1;
                }
  
+               set_all_slowpath(ppd->dd);
                ret = set_local_link_attributes(ppd);
                if (ret)
                        break;
                        ret1 = set_physical_link_state(dd, PLS_POLLING);
                        if (ret1 != HCMD_SUCCESS) {
                                dd_dev_err(dd,
-                                       "Failed to transition to Polling link state, return 0x%x\n",
-                                       ret1);
+                                          "Failed to transition to Polling link state, return 0x%x\n",
+                                          ret1);
                                ret = -EINVAL;
                        }
                }
-               ppd->offline_disabled_reason = OPA_LINKDOWN_REASON_NONE;
+               ppd->offline_disabled_reason =
+                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE);
                /*
                 * If an error occurred above, go back to offline.  The
                 * caller may reschedule another attempt.
                ret1 = set_physical_link_state(dd, PLS_DISABLED);
                if (ret1 != HCMD_SUCCESS) {
                        dd_dev_err(dd,
-                               "Failed to transition to Disabled link state, return 0x%x\n",
-                               ret1);
+                                  "Failed to transition to Disabled link state, return 0x%x\n",
+                                  ret1);
                        ret = -EINVAL;
                        break;
                }
                ret1 = set_physical_link_state(dd, PLS_LINKUP);
                if (ret1 != HCMD_SUCCESS) {
                        dd_dev_err(dd,
-                               "Failed to transition to link up state, return 0x%x\n",
-                               ret1);
+                                  "Failed to transition to link up state, return 0x%x\n",
+                                  ret1);
                        ret = -EINVAL;
                        break;
                }
        case HLS_LINK_COOLDOWN:         /* transient within goto_offline() */
        default:
                dd_dev_info(dd, "%s: state 0x%x: not supported\n",
-                       __func__, state);
+                           __func__, state);
                ret = -EINVAL;
                break;
        }
  
  unexpected:
        dd_dev_err(dd, "%s: unexpected state transition from %s to %s\n",
-               __func__, link_state_name(ppd->host_link_state),
-               link_state_name(state));
+                  __func__, link_state_name(ppd->host_link_state),
+                  link_state_name(state));
        ret = -EINVAL;
  
  done:
@@@ -10016,7 -10306,7 +10303,7 @@@ int hfi1_set_ib_cfg(struct hfi1_pportda
                 * The VL Arbitrator high limit is sent in units of 4k
                 * bytes, while HFI stores it in units of 64 bytes.
                 */
-               val *= 4096/64;
+               val *= 4096 / 64;
                reg = ((u64)val & SEND_HIGH_PRIORITY_LIMIT_LIMIT_MASK)
                        << SEND_HIGH_PRIORITY_LIMIT_LIMIT_SHIFT;
                write_csr(ppd->dd, SEND_HIGH_PRIORITY_LIMIT, reg);
                        ppd->vls_operational = val;
                        if (!ppd->port)
                                ret = -EINVAL;
-                       else
-                               ret = sdma_map_init(
-                                       ppd->dd,
-                                       ppd->port - 1,
-                                       val,
-                                       NULL);
                }
                break;
        /*
        default:
                if (HFI1_CAP_IS_KSET(PRINT_UNIMPL))
                        dd_dev_info(ppd->dd,
-                         "%s: which %s, val 0x%x: not implemented\n",
-                         __func__, ib_cfg_name(which), val);
+                                   "%s: which %s, val 0x%x: not implemented\n",
+                                   __func__, ib_cfg_name(which), val);
                break;
        }
        return ret;
@@@ -10152,6 -10436,7 +10433,7 @@@ static int vl_arb_match_cache(struct vl
  {
        return !memcmp(cache->table, vl, VL_ARB_TABLE_SIZE * sizeof(*vl));
  }
  /* end functions related to vl arbitration table caching */
  
  static int set_vl_weights(struct hfi1_pportdata *ppd, u32 target,
@@@ -10239,7 -10524,7 +10521,7 @@@ static int get_buffer_control(struct hf
  
        /* OPA and HFI have a 1-1 mapping */
        for (i = 0; i < TXE_NUM_DATA_VL; i++)
-               read_one_cm_vl(dd, SEND_CM_CREDIT_VL + (8*i), &bc->vl[i]);
+               read_one_cm_vl(dd, SEND_CM_CREDIT_VL + (8 * i), &bc->vl[i]);
  
        /* NOTE: assumes that VL* and VL15 CSRs are bit-wise identical */
        read_one_cm_vl(dd, SEND_CM_CREDIT_VL15, &bc->vl[15]);
@@@ -10293,41 -10578,41 +10575,41 @@@ static void get_vlarb_preempt(struct hf
  static void set_sc2vlnt(struct hfi1_devdata *dd, struct sc2vlnt *dp)
  {
        write_csr(dd, DCC_CFG_SC_VL_TABLE_15_0,
-               DC_SC_VL_VAL(15_0,
-               0, dp->vlnt[0] & 0xf,
-               1, dp->vlnt[1] & 0xf,
-               2, dp->vlnt[2] & 0xf,
-               3, dp->vlnt[3] & 0xf,
-               4, dp->vlnt[4] & 0xf,
-               5, dp->vlnt[5] & 0xf,
-               6, dp->vlnt[6] & 0xf,
-               7, dp->vlnt[7] & 0xf,
-               8, dp->vlnt[8] & 0xf,
-               9, dp->vlnt[9] & 0xf,
-               10, dp->vlnt[10] & 0xf,
-               11, dp->vlnt[11] & 0xf,
-               12, dp->vlnt[12] & 0xf,
-               13, dp->vlnt[13] & 0xf,
-               14, dp->vlnt[14] & 0xf,
-               15, dp->vlnt[15] & 0xf));
+                 DC_SC_VL_VAL(15_0,
+                              0, dp->vlnt[0] & 0xf,
+                              1, dp->vlnt[1] & 0xf,
+                              2, dp->vlnt[2] & 0xf,
+                              3, dp->vlnt[3] & 0xf,
+                              4, dp->vlnt[4] & 0xf,
+                              5, dp->vlnt[5] & 0xf,
+                              6, dp->vlnt[6] & 0xf,
+                              7, dp->vlnt[7] & 0xf,
+                              8, dp->vlnt[8] & 0xf,
+                              9, dp->vlnt[9] & 0xf,
+                              10, dp->vlnt[10] & 0xf,
+                              11, dp->vlnt[11] & 0xf,
+                              12, dp->vlnt[12] & 0xf,
+                              13, dp->vlnt[13] & 0xf,
+                              14, dp->vlnt[14] & 0xf,
+                              15, dp->vlnt[15] & 0xf));
        write_csr(dd, DCC_CFG_SC_VL_TABLE_31_16,
-               DC_SC_VL_VAL(31_16,
-               16, dp->vlnt[16] & 0xf,
-               17, dp->vlnt[17] & 0xf,
-               18, dp->vlnt[18] & 0xf,
-               19, dp->vlnt[19] & 0xf,
-               20, dp->vlnt[20] & 0xf,
-               21, dp->vlnt[21] & 0xf,
-               22, dp->vlnt[22] & 0xf,
-               23, dp->vlnt[23] & 0xf,
-               24, dp->vlnt[24] & 0xf,
-               25, dp->vlnt[25] & 0xf,
-               26, dp->vlnt[26] & 0xf,
-               27, dp->vlnt[27] & 0xf,
-               28, dp->vlnt[28] & 0xf,
-               29, dp->vlnt[29] & 0xf,
-               30, dp->vlnt[30] & 0xf,
-               31, dp->vlnt[31] & 0xf));
+                 DC_SC_VL_VAL(31_16,
+                              16, dp->vlnt[16] & 0xf,
+                              17, dp->vlnt[17] & 0xf,
+                              18, dp->vlnt[18] & 0xf,
+                              19, dp->vlnt[19] & 0xf,
+                              20, dp->vlnt[20] & 0xf,
+                              21, dp->vlnt[21] & 0xf,
+                              22, dp->vlnt[22] & 0xf,
+                              23, dp->vlnt[23] & 0xf,
+                              24, dp->vlnt[24] & 0xf,
+                              25, dp->vlnt[25] & 0xf,
+                              26, dp->vlnt[26] & 0xf,
+                              27, dp->vlnt[27] & 0xf,
+                              28, dp->vlnt[28] & 0xf,
+                              29, dp->vlnt[29] & 0xf,
+                              30, dp->vlnt[30] & 0xf,
+                              31, dp->vlnt[31] & 0xf));
  }
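
set_sc2vlnt() above packs sixteen 4-bit SC-to-VLnt entries into each 64-bit table CSR via DC_SC_VL_VAL(). A sketch of the nibble packing, assuming for illustration that entry i occupies bits 4*i+3..4*i; the real placement comes from the per-entry shift/mask constants in the chip register definitions.

/*
 * Sketch only: pack 16 four-bit entries into one 64-bit word, entry i at
 * bit 4*i (an assumption for this example, not the hardware layout).
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t pack_sc2vl(const uint8_t *vlnt /* 16 entries */)
{
	uint64_t reg = 0;
	int i;

	for (i = 0; i < 16; i++)
		reg |= (uint64_t)(vlnt[i] & 0xf) << (4 * i);
	return reg;
}

int main(void)
{
	uint8_t vlnt[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 };

	printf("SC 15..0 table = 0x%016llx\n",
	       (unsigned long long)pack_sc2vl(vlnt));
	return 0;
}
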
  
  static void nonzero_msg(struct hfi1_devdata *dd, int idx, const char *what,
  {
        if (limit != 0)
                dd_dev_info(dd, "Invalid %s limit %d on VL %d, ignoring\n",
-                       what, (int)limit, idx);
+                           what, (int)limit, idx);
  }
  
  /* change only the shared limit portion of SendCmGLobalCredit */
@@@ -10413,14 -10698,14 +10695,14 @@@ static void wait_for_vl_status_clear(st
        }
  
        dd_dev_err(dd,
-               "%s credit change status not clearing after %dms, mask 0x%llx, not clear 0x%llx\n",
-               which, VL_STATUS_CLEAR_TIMEOUT, mask, reg);
+                  "%s credit change status not clearing after %dms, mask 0x%llx, not clear 0x%llx\n",
+                  which, VL_STATUS_CLEAR_TIMEOUT, mask, reg);
        /*
         * If this occurs, it is likely there was a credit loss on the link.
         * The only recovery from that is a link bounce.
         */
        dd_dev_err(dd,
-               "Continuing anyway.  A credit loss may occur.  Suggest a link bounce\n");
+                  "Continuing anyway.  A credit loss may occur.  Suggest a link bounce\n");
  }
  
  /*
   * raise = if the new limit is higher than the current value (may be changed
   *    earlier in the algorithm), set the new limit to the new value
   */
- static int set_buffer_control(struct hfi1_devdata *dd,
-                             struct buffer_control *new_bc)
+ int set_buffer_control(struct hfi1_pportdata *ppd,
+                      struct buffer_control *new_bc)
  {
+       struct hfi1_devdata *dd = ppd->dd;
        u64 changing_mask, ld_mask, stat_mask;
        int change_count;
        int i, use_all_mask;
        int this_shared_changing;
+       int vl_count = 0, ret;
        /*
         * A0: add the variable any_shared_limit_changing below and in the
         * algorithm above.  If removing A0 support, it can be removed.
  #define valid_vl(idx) ((idx) < TXE_NUM_DATA_VL || (idx) == 15)
  #define NUM_USABLE_VLS 16     /* look at VL15 and less */
  
        /* find the new total credits, do sanity check on unused VLs */
        for (i = 0; i < OPA_MAX_VLS; i++) {
                if (valid_vl(i)) {
                        new_total += be16_to_cpu(new_bc->vl[i].dedicated);
                        continue;
                }
                nonzero_msg(dd, i, "dedicated",
-                       be16_to_cpu(new_bc->vl[i].dedicated));
+                           be16_to_cpu(new_bc->vl[i].dedicated));
                nonzero_msg(dd, i, "shared",
-                       be16_to_cpu(new_bc->vl[i].shared));
+                           be16_to_cpu(new_bc->vl[i].shared));
                new_bc->vl[i].dedicated = 0;
                new_bc->vl[i].shared = 0;
        }
         */
        memset(changing, 0, sizeof(changing));
        memset(lowering_dedicated, 0, sizeof(lowering_dedicated));
-       /* NOTE: Assumes that the individual VL bits are adjacent and in
-          increasing order */
+       /*
+        * NOTE: Assumes that the individual VL bits are adjacent and in
+        * increasing order
+        */
        stat_mask =
                SEND_CM_CREDIT_USED_STATUS_VL0_RETURN_CREDIT_STATUS_SMASK;
        changing_mask = 0;
                                                != cur_bc.vl[i].shared;
                if (this_shared_changing)
                        any_shared_limit_changing = 1;
-               if (new_bc->vl[i].dedicated != cur_bc.vl[i].dedicated
-                               || this_shared_changing) {
+               if (new_bc->vl[i].dedicated != cur_bc.vl[i].dedicated ||
+                   this_shared_changing) {
                        changing[i] = 1;
                        changing_mask |= stat_mask;
                        change_count++;
        }
  
        wait_for_vl_status_clear(dd, use_all_mask ? all_mask : changing_mask,
-               "shared");
+                                "shared");
  
        if (change_count > 0) {
                for (i = 0; i < NUM_USABLE_VLS; i++) {
  
                        if (lowering_dedicated[i]) {
                                set_vl_dedicated(dd, i,
-                                       be16_to_cpu(new_bc->vl[i].dedicated));
+                                                be16_to_cpu(new_bc->
+                                                            vl[i].dedicated));
                                cur_bc.vl[i].dedicated =
                                                new_bc->vl[i].dedicated;
                        }
                        if (be16_to_cpu(new_bc->vl[i].dedicated) >
                                        be16_to_cpu(cur_bc.vl[i].dedicated))
                                set_vl_dedicated(dd, i,
-                                       be16_to_cpu(new_bc->vl[i].dedicated));
+                                                be16_to_cpu(new_bc->
+                                                            vl[i].dedicated));
                }
        }
  
  
        /* finally raise the global shared */
        if (be16_to_cpu(new_bc->overall_shared_limit) >
-                       be16_to_cpu(cur_bc.overall_shared_limit))
+           be16_to_cpu(cur_bc.overall_shared_limit))
                set_global_shared(dd,
-                       be16_to_cpu(new_bc->overall_shared_limit));
+                                 be16_to_cpu(new_bc->overall_shared_limit));
  
        /* bracket the credit change with a total adjustment */
        if (new_total < cur_total)
                set_global_limit(dd, new_total);
+       /*
+        * Determine the actual number of operational VLS using the number of
+        * dedicated and shared credits for each VL.
+        */
+       if (change_count > 0) {
+               for (i = 0; i < TXE_NUM_DATA_VL; i++)
+                       if (be16_to_cpu(new_bc->vl[i].dedicated) > 0 ||
+                           be16_to_cpu(new_bc->vl[i].shared) > 0)
+                               vl_count++;
+               ppd->actual_vls_operational = vl_count;
+               ret = sdma_map_init(dd, ppd->port - 1, vl_count ?
+                                   ppd->actual_vls_operational :
+                                   ppd->vls_operational,
+                                   NULL);
+               if (ret == 0)
+                       ret = pio_map_init(dd, ppd->port - 1, vl_count ?
+                                          ppd->actual_vls_operational :
+                                          ppd->vls_operational, NULL);
+               if (ret)
+                       return ret;
+       }
        return 0;
  }
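
The block added at the end of set_buffer_control() above counts a VL as operational when it has any dedicated or shared credits, then re-runs the SDMA and PIO map initialization with that count. A small sketch of the counting step with example credit values:

/*
 * Illustrative VL counting; the credit table is example data, not values
 * read from hardware.
 */
#include <stdio.h>

#define EX_NUM_DATA_VL 8

struct ex_vl_limits {
	unsigned short dedicated;
	unsigned short shared;
};

int main(void)
{
	struct ex_vl_limits vl[EX_NUM_DATA_VL] = {
		{ 100, 0 }, { 50, 20 }, { 0, 0 }, { 0, 10 },
		{ 0, 0 },   { 0, 0 },   { 0, 0 }, { 0, 0 },
	};
	int i, vl_count = 0;

	for (i = 0; i < EX_NUM_DATA_VL; i++)
		if (vl[i].dedicated > 0 || vl[i].shared > 0)
			vl_count++;

	printf("operational VLs = %d\n", vl_count);  /* prints 3 */
	return 0;
}
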
  
@@@ -10696,7 -11008,7 +11005,7 @@@ int fm_set_table(struct hfi1_pportdata 
                                     VL_ARB_LOW_PRIO_TABLE_SIZE, t);
                break;
        case FM_TBL_BUFFER_CONTROL:
-               ret = set_buffer_control(ppd->dd, t);
+               ret = set_buffer_control(ppd, t);
                break;
        case FM_TBL_SC2VLNT:
                set_sc2vlnt(ppd->dd, t);
@@@ -10846,10 -11158,13 +11155,13 @@@ static void adjust_rcv_timeout(struct h
        }
  
        rcd->rcvavail_timeout = timeout;
-       /* timeout cannot be larger than rcv_intr_timeout_csr which has already
-          been verified to be in range */
+       /*
+        * timeout cannot be larger than rcv_intr_timeout_csr which has already
+        * been verified to be in range
+        */
        write_kctxt_csr(dd, rcd->ctxt, RCV_AVAIL_TIME_OUT,
-               (u64)timeout << RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT);
+                       (u64)timeout <<
+                       RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT);
  }
  
  void update_usrhead(struct hfi1_ctxtdata *rcd, u32 hd, u32 updegr, u32 egrhd,
@@@ -10915,16 -11230,16 +11227,16 @@@ u32 hdrqempty(struct hfi1_ctxtdata *rcd
  static u32 encoded_size(u32 size)
  {
        switch (size) {
-       case   4*1024: return 0x1;
-       case   8*1024: return 0x2;
-       case  16*1024: return 0x3;
-       case  32*1024: return 0x4;
-       case  64*1024: return 0x5;
-       case 128*1024: return 0x6;
-       case 256*1024: return 0x7;
-       case 512*1024: return 0x8;
-       case   1*1024*1024: return 0x9;
-       case   2*1024*1024: return 0xa;
+       case   4 * 1024: return 0x1;
+       case   8 * 1024: return 0x2;
+       case  16 * 1024: return 0x3;
+       case  32 * 1024: return 0x4;
+       case  64 * 1024: return 0x5;
+       case 128 * 1024: return 0x6;
+       case 256 * 1024: return 0x7;
+       case 512 * 1024: return 0x8;
+       case   1 * 1024 * 1024: return 0x9;
+       case   2 * 1024 * 1024: return 0xa;
        }
        return 0x1;     /* if invalid, go with the minimum size */
  }
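
encoded_size() above is a straight lookup, but the table is equivalent to log2(size) - 11 for the supported power-of-two sizes (4 KiB -> 0x1 through 2 MiB -> 0xa), with 0x1 as the fallback for anything else. A standalone check of that observation:

/*
 * Verify that the encoded_size() table matches log2(size) - 11 for the
 * supported sizes; this is a cross-check, not driver code.
 */
#include <stdio.h>

static unsigned int encode(unsigned long size)
{
	unsigned int log2 = 0;

	/* only power-of-two sizes between 4 KiB and 2 MiB are encodable */
	if (size < 4 * 1024UL || size > 2 * 1024 * 1024UL ||
	    (size & (size - 1)))
		return 0x1;            /* fallback, as in encoded_size() */

	while (size > 1) {
		size >>= 1;
		log2++;
	}
	return log2 - 11;
}

int main(void)
{
	unsigned long sizes[] = { 4096, 65536, 2 * 1024 * 1024, 5000 };
	unsigned int i;

	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("%lu -> 0x%x\n", sizes[i], encode(sizes[i]));
	return 0;
}
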
@@@ -10943,8 -11258,8 +11255,8 @@@ void hfi1_rcvctrl(struct hfi1_devdata *
  
        rcvctrl = read_kctxt_csr(dd, ctxt, RCV_CTXT_CTRL);
        /* if the context already enabled, don't do the extra steps */
-       if ((op & HFI1_RCVCTRL_CTXT_ENB)
-                       && !(rcvctrl & RCV_CTXT_CTRL_ENABLE_SMASK)) {
+       if ((op & HFI1_RCVCTRL_CTXT_ENB) &&
+           !(rcvctrl & RCV_CTXT_CTRL_ENABLE_SMASK)) {
                /* reset the tail and hdr addresses, and sequence count */
                write_kctxt_csr(dd, ctxt, RCV_HDR_ADDR,
                                rcd->rcvhdrq_phys);
                if (dd->rcvhdrtail_dummy_physaddr) {
                        write_kctxt_csr(dd, ctxt, RCV_HDR_TAIL_ADDR,
                                        dd->rcvhdrtail_dummy_physaddr);
+                       /* Enabling RcvCtxtCtrl.TailUpd is intentional. */
                        rcvctrl |= RCV_CTXT_CTRL_TAIL_UPD_SMASK;
                }
  
                rcvctrl &= ~RCV_CTXT_CTRL_INTR_AVAIL_SMASK;
        if (op & HFI1_RCVCTRL_TAILUPD_ENB && rcd->rcvhdrqtailaddr_phys)
                rcvctrl |= RCV_CTXT_CTRL_TAIL_UPD_SMASK;
-       if (op & HFI1_RCVCTRL_TAILUPD_DIS)
-               rcvctrl &= ~RCV_CTXT_CTRL_TAIL_UPD_SMASK;
+       if (op & HFI1_RCVCTRL_TAILUPD_DIS) {
+               /* See comment on RcvCtxtCtrl.TailUpd above */
+               if (!(op & HFI1_RCVCTRL_CTXT_DIS))
+                       rcvctrl &= ~RCV_CTXT_CTRL_TAIL_UPD_SMASK;
+       }
        if (op & HFI1_RCVCTRL_TIDFLOW_ENB)
                rcvctrl |= RCV_CTXT_CTRL_TID_FLOW_ENABLE_SMASK;
        if (op & HFI1_RCVCTRL_TIDFLOW_DIS)
                rcvctrl &= ~RCV_CTXT_CTRL_TID_FLOW_ENABLE_SMASK;
        if (op & HFI1_RCVCTRL_ONE_PKT_EGR_ENB) {
-               /* In one-packet-per-eager mode, the size comes from
-                  the RcvArray entry. */
+               /*
+                * In one-packet-per-eager mode, the size comes from
+                * the RcvArray entry.
+                */
                rcvctrl &= ~RCV_CTXT_CTRL_EGR_BUF_SIZE_SMASK;
                rcvctrl |= RCV_CTXT_CTRL_ONE_PACKET_PER_EGR_BUFFER_SMASK;
        }
        write_kctxt_csr(dd, ctxt, RCV_CTXT_CTRL, rcd->rcvctrl);
  
        /* work around sticky RcvCtxtStatus.BlockedRHQFull */
-       if (did_enable
-           && (rcvctrl & RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK)) {
+       if (did_enable &&
+           (rcvctrl & RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK)) {
                reg = read_kctxt_csr(dd, ctxt, RCV_CTXT_STATUS);
                if (reg != 0) {
                        dd_dev_info(dd, "ctxt %d status %lld (blocked)\n",
-                               ctxt, reg);
+                                   ctxt, reg);
                        read_uctxt_csr(dd, ctxt, RCV_HDR_HEAD);
                        write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, 0x10);
                        write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, 0x00);
                        read_uctxt_csr(dd, ctxt, RCV_HDR_HEAD);
                        reg = read_kctxt_csr(dd, ctxt, RCV_CTXT_STATUS);
                        dd_dev_info(dd, "ctxt %d status %lld (%s blocked)\n",
-                               ctxt, reg, reg == 0 ? "not" : "still");
+                                   ctxt, reg, reg == 0 ? "not" : "still");
                }
        }
  
                 */
                /* set interrupt timeout */
                write_kctxt_csr(dd, ctxt, RCV_AVAIL_TIME_OUT,
-                       (u64)rcd->rcvavail_timeout <<
+                               (u64)rcd->rcvavail_timeout <<
                                RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT);
  
                /* set RcvHdrHead.Counter, zero RcvHdrHead.Head (again) */
                                dd->rcvhdrtail_dummy_physaddr);
  }
  
- u32 hfi1_read_cntrs(struct hfi1_devdata *dd, loff_t pos, char **namep,
-                   u64 **cntrp)
+ u32 hfi1_read_cntrs(struct hfi1_devdata *dd, char **namep, u64 **cntrp)
  {
        int ret;
        u64 val = 0;
  
        if (namep) {
                ret = dd->cntrnameslen;
-               if (pos != 0) {
-                       dd_dev_err(dd, "read_cntrs does not support indexing");
-                       return 0;
-               }
                *namep = dd->cntrnames;
        } else {
                const struct cntr_entry *entry;
                int i, j;
  
                ret = (dd->ndevcntrs) * sizeof(u64);
-               if (pos != 0) {
-                       dd_dev_err(dd, "read_cntrs does not support indexing");
-                       return 0;
-               }
  
                /* Get the start of the block of counters */
                *cntrp = dd->cntrs;
                                                dd->cntrs[entry->offset + j] =
                                                                            val;
                                        }
+                               } else if (entry->flags & CNTR_SDMA) {
+                                       hfi1_cdbg(CNTR,
+                                                 "\t Per SDMA Engine\n");
+                                       for (j = 0; j < dd->chip_sdma_engines;
+                                            j++) {
+                                               val =
+                                               entry->rw_cntr(entry, dd, j,
+                                                              CNTR_MODE_R, 0);
+                                               hfi1_cdbg(CNTR,
+                                                         "\t\tRead 0x%llx for %d\n",
+                                                         val, j);
+                                               dd->cntrs[entry->offset + j] =
+                                                                       val;
+                                       }
                                } else {
                                        val = entry->rw_cntr(entry, dd,
                                                        CNTR_INVALID_VL,
  /*
   * Used by sysfs to create files for hfi stats to read
   */
- u32 hfi1_read_portcntrs(struct hfi1_devdata *dd, loff_t pos, u32 port,
-                       char **namep, u64 **cntrp)
+ u32 hfi1_read_portcntrs(struct hfi1_pportdata *ppd, char **namep, u64 **cntrp)
  {
        int ret;
        u64 val = 0;
  
        if (namep) {
-               ret = dd->portcntrnameslen;
-               if (pos != 0) {
-                       dd_dev_err(dd, "index not supported");
-                       return 0;
-               }
-               *namep = dd->portcntrnames;
+               ret = ppd->dd->portcntrnameslen;
+               *namep = ppd->dd->portcntrnames;
        } else {
                const struct cntr_entry *entry;
-               struct hfi1_pportdata *ppd;
                int i, j;
  
-               ret = (dd->nportcntrs) * sizeof(u64);
-               if (pos != 0) {
-                       dd_dev_err(dd, "indexing not supported");
-                       return 0;
-               }
-               ppd = (struct hfi1_pportdata *)(dd + 1 + port);
+               ret = ppd->dd->nportcntrs * sizeof(u64);
                *cntrp = ppd->cntrs;
  
                for (i = 0; i < PORT_CNTR_LAST; i++) {
@@@ -11235,14 -11550,14 +11547,14 @@@ static void free_cntrs(struct hfi1_devd
        for (i = 0; i < dd->num_pports; i++, ppd++) {
                kfree(ppd->cntrs);
                kfree(ppd->scntrs);
-               free_percpu(ppd->ibport_data.rc_acks);
-               free_percpu(ppd->ibport_data.rc_qacks);
-               free_percpu(ppd->ibport_data.rc_delayed_comp);
+               free_percpu(ppd->ibport_data.rvp.rc_acks);
+               free_percpu(ppd->ibport_data.rvp.rc_qacks);
+               free_percpu(ppd->ibport_data.rvp.rc_delayed_comp);
                ppd->cntrs = NULL;
                ppd->scntrs = NULL;
-               ppd->ibport_data.rc_acks = NULL;
-               ppd->ibport_data.rc_qacks = NULL;
-               ppd->ibport_data.rc_delayed_comp = NULL;
+               ppd->ibport_data.rvp.rc_acks = NULL;
+               ppd->ibport_data.rvp.rc_qacks = NULL;
+               ppd->ibport_data.rvp.rc_delayed_comp = NULL;
        }
        kfree(dd->portcntrnames);
        dd->portcntrnames = NULL;
@@@ -11510,11 -11825,13 +11822,13 @@@ mod_timer(&dd->synth_stats_timer, jiffi
  #define C_MAX_NAME 13 /* 12 chars + one for /0 */
  static int init_cntrs(struct hfi1_devdata *dd)
  {
-       int i, rcv_ctxts, index, j;
+       int i, rcv_ctxts, j;
        size_t sz;
        char *p;
        char name[C_MAX_NAME];
        struct hfi1_pportdata *ppd;
+       const char *bit_type_32 = ",32";
+       const int bit_type_32_sz = strlen(bit_type_32);
  
        /* set up the stats timer; the add_timer is done at the end */
        setup_timer(&dd->synth_stats_timer, update_synth_timer,
        /* size names and determine how many we have*/
        dd->ndevcntrs = 0;
        sz = 0;
-       index = 0;
  
        for (i = 0; i < DEV_CNTR_LAST; i++) {
-               hfi1_dbg_early("Init cntr %s\n", dev_cntrs[i].name);
                if (dev_cntrs[i].flags & CNTR_DISABLED) {
                        hfi1_dbg_early("\tSkipping %s\n", dev_cntrs[i].name);
                        continue;
                }
  
                if (dev_cntrs[i].flags & CNTR_VL) {
-                       hfi1_dbg_early("\tProcessing VL cntr\n");
-                       dev_cntrs[i].offset = index;
+                       dev_cntrs[i].offset = dd->ndevcntrs;
                        for (j = 0; j < C_VL_COUNT; j++) {
-                               memset(name, '\0', C_MAX_NAME);
                                snprintf(name, C_MAX_NAME, "%s%d",
-                                       dev_cntrs[i].name,
-                                       vl_from_idx(j));
+                                        dev_cntrs[i].name, vl_from_idx(j));
+                               sz += strlen(name);
+                               /* Add ",32" for 32-bit counters */
+                               if (dev_cntrs[i].flags & CNTR_32BIT)
+                                       sz += bit_type_32_sz;
+                               sz++;
+                               dd->ndevcntrs++;
+                       }
+               } else if (dev_cntrs[i].flags & CNTR_SDMA) {
+                       dev_cntrs[i].offset = dd->ndevcntrs;
+                       for (j = 0; j < dd->chip_sdma_engines; j++) {
+                               snprintf(name, C_MAX_NAME, "%s%d",
+                                        dev_cntrs[i].name, j);
                                sz += strlen(name);
+                               /* Add ",32" for 32-bit counters */
+                               if (dev_cntrs[i].flags & CNTR_32BIT)
+                                       sz += bit_type_32_sz;
                                sz++;
-                               hfi1_dbg_early("\t\t%s\n", name);
                                dd->ndevcntrs++;
-                               index++;
                        }
                } else {
-                       /* +1 for newline  */
+                       /* +1 for newline. */
                        sz += strlen(dev_cntrs[i].name) + 1;
+                       /* Add ",32" for 32-bit counters */
+                       if (dev_cntrs[i].flags & CNTR_32BIT)
+                               sz += bit_type_32_sz;
+                       dev_cntrs[i].offset = dd->ndevcntrs;
                        dd->ndevcntrs++;
-                       dev_cntrs[i].offset = index;
-                       index++;
-                       hfi1_dbg_early("\tAdding %s\n", dev_cntrs[i].name);
                }
        }
  
        /* allocate space for the counter values */
-       dd->cntrs = kcalloc(index, sizeof(u64), GFP_KERNEL);
+       dd->cntrs = kcalloc(dd->ndevcntrs, sizeof(u64), GFP_KERNEL);
        if (!dd->cntrs)
                goto bail;
  
-       dd->scntrs = kcalloc(index, sizeof(u64), GFP_KERNEL);
+       dd->scntrs = kcalloc(dd->ndevcntrs, sizeof(u64), GFP_KERNEL);
        if (!dd->scntrs)
                goto bail;
  
        /* allocate space for the counter names */
        dd->cntrnameslen = sz;
        dd->cntrnames = kmalloc(sz, GFP_KERNEL);
                goto bail;
  
        /* fill in the names */
-       for (p = dd->cntrnames, i = 0, index = 0; i < DEV_CNTR_LAST; i++) {
+       for (p = dd->cntrnames, i = 0; i < DEV_CNTR_LAST; i++) {
                if (dev_cntrs[i].flags & CNTR_DISABLED) {
                        /* Nothing */
-               } else {
-                       if (dev_cntrs[i].flags & CNTR_VL) {
-                               for (j = 0; j < C_VL_COUNT; j++) {
-                                       memset(name, '\0', C_MAX_NAME);
-                                       snprintf(name, C_MAX_NAME, "%s%d",
-                                               dev_cntrs[i].name,
-                                               vl_from_idx(j));
-                                       memcpy(p, name, strlen(name));
-                                       p += strlen(name);
-                                       *p++ = '\n';
+               } else if (dev_cntrs[i].flags & CNTR_VL) {
+                       for (j = 0; j < C_VL_COUNT; j++) {
+                               snprintf(name, C_MAX_NAME, "%s%d",
+                                        dev_cntrs[i].name,
+                                        vl_from_idx(j));
+                               memcpy(p, name, strlen(name));
+                               p += strlen(name);
+                               /* Counter is 32 bits */
+                               if (dev_cntrs[i].flags & CNTR_32BIT) {
+                                       memcpy(p, bit_type_32, bit_type_32_sz);
+                                       p += bit_type_32_sz;
                                }
-                       } else {
-                               memcpy(p, dev_cntrs[i].name,
-                                      strlen(dev_cntrs[i].name));
-                               p += strlen(dev_cntrs[i].name);
+                               *p++ = '\n';
+                       }
+               } else if (dev_cntrs[i].flags & CNTR_SDMA) {
+                       for (j = 0; j < dd->chip_sdma_engines; j++) {
+                               snprintf(name, C_MAX_NAME, "%s%d",
+                                        dev_cntrs[i].name, j);
+                               memcpy(p, name, strlen(name));
+                               p += strlen(name);
+                               /* Counter is 32 bits */
+                               if (dev_cntrs[i].flags & CNTR_32BIT) {
+                                       memcpy(p, bit_type_32, bit_type_32_sz);
+                                       p += bit_type_32_sz;
+                               }
                                *p++ = '\n';
                        }
-                       index++;
+               } else {
+                       memcpy(p, dev_cntrs[i].name, strlen(dev_cntrs[i].name));
+                       p += strlen(dev_cntrs[i].name);
+                       /* Counter is 32 bits */
+                       if (dev_cntrs[i].flags & CNTR_32BIT) {
+                               memcpy(p, bit_type_32, bit_type_32_sz);
+                               p += bit_type_32_sz;
+                       }
+                       *p++ = '\n';
                }
        }
  
        sz = 0;
        dd->nportcntrs = 0;
        for (i = 0; i < PORT_CNTR_LAST; i++) {
-               hfi1_dbg_early("Init pcntr %s\n", port_cntrs[i].name);
                if (port_cntrs[i].flags & CNTR_DISABLED) {
                        hfi1_dbg_early("\tSkipping %s\n", port_cntrs[i].name);
                        continue;
                }
  
                if (port_cntrs[i].flags & CNTR_VL) {
-                       hfi1_dbg_early("\tProcessing VL cntr\n");
                        port_cntrs[i].offset = dd->nportcntrs;
                        for (j = 0; j < C_VL_COUNT; j++) {
-                               memset(name, '\0', C_MAX_NAME);
                                snprintf(name, C_MAX_NAME, "%s%d",
-                                       port_cntrs[i].name,
-                                       vl_from_idx(j));
+                                        port_cntrs[i].name, vl_from_idx(j));
                                sz += strlen(name);
+                               /* Add ",32" for 32-bit counters */
+                               if (port_cntrs[i].flags & CNTR_32BIT)
+                                       sz += bit_type_32_sz;
                                sz++;
-                               hfi1_dbg_early("\t\t%s\n", name);
                                dd->nportcntrs++;
                        }
                } else {
-                       /* +1 for newline  */
+                       /* +1 for newline */
                        sz += strlen(port_cntrs[i].name) + 1;
+                       /* Add ",32" for 32-bit counters */
+                       if (port_cntrs[i].flags & CNTR_32BIT)
+                               sz += bit_type_32_sz;
                        port_cntrs[i].offset = dd->nportcntrs;
                        dd->nportcntrs++;
-                       hfi1_dbg_early("\tAdding %s\n", port_cntrs[i].name);
                }
        }
  
  
                if (port_cntrs[i].flags & CNTR_VL) {
                        for (j = 0; j < C_VL_COUNT; j++) {
-                               memset(name, '\0', C_MAX_NAME);
                                snprintf(name, C_MAX_NAME, "%s%d",
-                                       port_cntrs[i].name,
-                                       vl_from_idx(j));
+                                        port_cntrs[i].name, vl_from_idx(j));
                                memcpy(p, name, strlen(name));
                                p += strlen(name);
+                               /* Counter is 32 bits */
+                               if (port_cntrs[i].flags & CNTR_32BIT) {
+                                       memcpy(p, bit_type_32, bit_type_32_sz);
+                                       p += bit_type_32_sz;
+                               }
                                *p++ = '\n';
                        }
                } else {
                        memcpy(p, port_cntrs[i].name,
                               strlen(port_cntrs[i].name));
                        p += strlen(port_cntrs[i].name);
+                       /* Counter is 32 bits */
+                       if (port_cntrs[i].flags & CNTR_32BIT) {
+                               memcpy(p, bit_type_32, bit_type_32_sz);
+                               p += bit_type_32_sz;
+                       }
                        *p++ = '\n';
                }
        }
        return -ENOMEM;
  }
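
A side note on the hunk above: the sizing pass and the fill pass build a flat, newline-separated name buffer in which every 32-bit counter name gets a ",32" suffix, paired with a flat u64 value array indexed by the recorded offsets. The sketch below is illustrative only and is not part of the driver; the helper name and the standalone parsing are assumptions, with only the "name[,32]\n" layout taken from the loops above.

#include <stdio.h>
#include <string.h>

/* Minimal sketch: walk a "name[,32]\n" buffer like the one built above. */
static void list_counters(const char *buf, size_t len)
{
	const char *p = buf;
	const char *end = buf + len;

	while (p < end) {
		const char *nl = memchr(p, '\n', (size_t)(end - p));
		size_t n = nl ? (size_t)(nl - p) : (size_t)(end - p);

		/* a ",32" suffix marks a 32-bit counter */
		if (n > 3 && !strncmp(p + n - 3, ",32", 3))
			printf("%.*s  (32-bit counter)\n", (int)(n - 3), p);
		else
			printf("%.*s  (64-bit counter)\n", (int)n, p);
		p += n + 1;
	}
}

int main(void)
{
	const char names[] = "TxPkt\nRxPkt,32\nTxFlitVL0\nTxFlitVL1\n";

	list_counters(names, sizeof(names) - 1);
	return 0;
}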
  
  static u32 chip_to_opa_lstate(struct hfi1_devdata *dd, u32 chip_lstate)
  {
        switch (chip_lstate) {
        default:
                dd_dev_err(dd,
-                        "Unknown logical state 0x%x, reporting IB_PORT_DOWN\n",
-                        chip_lstate);
+                          "Unknown logical state 0x%x, reporting IB_PORT_DOWN\n",
+                          chip_lstate);
                /* fall through */
        case LSTATE_DOWN:
                return IB_PORT_DOWN;
@@@ -11726,7 -12086,7 +12083,7 @@@ u32 chip_to_opa_pstate(struct hfi1_devd
        switch (chip_pstate & 0xf0) {
        default:
                dd_dev_err(dd, "Unexpected chip physical state of 0x%x\n",
-                       chip_pstate);
+                          chip_pstate);
                /* fall through */
        case PLS_DISABLED:
                return IB_PORTPHYSSTATE_DISABLED;
@@@ -11792,7 -12152,7 +12149,7 @@@ u32 get_logical_state(struct hfi1_pport
        new_state = chip_to_opa_lstate(ppd->dd, read_logical_state(ppd->dd));
        if (new_state != ppd->lstate) {
                dd_dev_info(ppd->dd, "logical state changed to %s (0x%x)\n",
-                       opa_lstate_name(new_state), new_state);
+                           opa_lstate_name(new_state), new_state);
                ppd->lstate = new_state;
        }
        /*
@@@ -11851,18 -12211,17 +12208,17 @@@ static int wait_logical_linkstate(struc
  
  u8 hfi1_ibphys_portstate(struct hfi1_pportdata *ppd)
  {
-       static u32 remembered_state = 0xff;
        u32 pstate;
        u32 ib_pstate;
  
        pstate = read_physical_state(ppd->dd);
        ib_pstate = chip_to_opa_pstate(ppd->dd, pstate);
-       if (remembered_state != ib_pstate) {
+       if (ppd->last_pstate != ib_pstate) {
                dd_dev_info(ppd->dd,
-                       "%s: physical state changed to %s (0x%x), phy 0x%x\n",
-                       __func__, opa_pstate_name(ib_pstate), ib_pstate,
-                       pstate);
-               remembered_state = ib_pstate;
+                           "%s: physical state changed to %s (0x%x), phy 0x%x\n",
+                           __func__, opa_pstate_name(ib_pstate), ib_pstate,
+                           pstate);
+               ppd->last_pstate = ib_pstate;
        }
        return ib_pstate;
  }
@@@ -11906,7 -12265,7 +12262,7 @@@ u64 hfi1_gpio_mod(struct hfi1_devdata *
  
  int hfi1_init_ctxt(struct send_context *sc)
  {
-       if (sc != NULL) {
+       if (sc) {
                struct hfi1_devdata *dd = sc->dd;
                u64 reg;
                u8 set = (sc->type == SC_USER ?
@@@ -11963,34 -12322,14 +12319,14 @@@ void set_intr_state(struct hfi1_devdat
         * In HFI, the mask needs to be 1 to allow interrupts.
         */
        if (enable) {
-               u64 cce_int_mask;
-               const int qsfp1_int_smask = QSFP1_INT % 64;
-               const int qsfp2_int_smask = QSFP2_INT % 64;
                /* enable all interrupts */
                for (i = 0; i < CCE_NUM_INT_CSRS; i++)
-                       write_csr(dd, CCE_INT_MASK + (8*i), ~(u64)0);
+                       write_csr(dd, CCE_INT_MASK + (8 * i), ~(u64)0);
  
-               /*
-                * disable QSFP1 interrupts for HFI1, QSFP2 interrupts for HFI0
-                * Qsfp1Int and Qsfp2Int are adjacent bits in the same CSR,
-                * therefore just one of QSFP1_INT/QSFP2_INT can be used to find
-                * the index of the appropriate CSR in the CCEIntMask CSR array
-                */
-               cce_int_mask = read_csr(dd, CCE_INT_MASK +
-                                               (8*(QSFP1_INT/64)));
-               if (dd->hfi1_id) {
-                       cce_int_mask &= ~((u64)1 << qsfp1_int_smask);
-                       write_csr(dd, CCE_INT_MASK + (8*(QSFP1_INT/64)),
-                                       cce_int_mask);
-               } else {
-                       cce_int_mask &= ~((u64)1 << qsfp2_int_smask);
-                       write_csr(dd, CCE_INT_MASK + (8*(QSFP2_INT/64)),
-                                       cce_int_mask);
-               }
+               init_qsfp_int(dd);
        } else {
                for (i = 0; i < CCE_NUM_INT_CSRS; i++)
-                       write_csr(dd, CCE_INT_MASK + (8*i), 0ull);
+                       write_csr(dd, CCE_INT_MASK + (8 * i), 0ull);
        }
  }
  
@@@ -12002,7 -12341,7 +12338,7 @@@ static void clear_all_interrupts(struc
        int i;
  
        for (i = 0; i < CCE_NUM_INT_CSRS; i++)
-               write_csr(dd, CCE_INT_CLEAR + (8*i), ~(u64)0);
+               write_csr(dd, CCE_INT_CLEAR + (8 * i), ~(u64)0);
  
        write_csr(dd, CCE_ERR_CLEAR, ~(u64)0);
        write_csr(dd, MISC_ERR_CLEAR, ~(u64)0);
@@@ -12037,10 -12376,9 +12373,9 @@@ static void clean_up_interrupts(struct 
                struct hfi1_msix_entry *me = dd->msix_entries;
  
                for (i = 0; i < dd->num_msix_entries; i++, me++) {
-                       if (me->arg == NULL) /* => no irq, no affinity */
-                               break;
-                       irq_set_affinity_hint(dd->msix_entries[i].msix.vector,
-                                       NULL);
+                       if (!me->arg) /* => no irq, no affinity */
+                               continue;
+                       hfi1_put_irq_affinity(dd, &dd->msix_entries[i]);
                        free_irq(me->msix.vector, me->arg);
                }
        } else {
        }
  
        /* clean structures */
-       for (i = 0; i < dd->num_msix_entries; i++)
-               free_cpumask_var(dd->msix_entries[i].mask);
        kfree(dd->msix_entries);
        dd->msix_entries = NULL;
        dd->num_msix_entries = 0;
@@@ -12085,10 -12421,10 +12418,10 @@@ static void remap_intr(struct hfi1_devd
        /* direct the chip source to the given MSI-X interrupt */
        m = isrc / 8;
        n = isrc % 8;
-       reg = read_csr(dd, CCE_INT_MAP + (8*m));
-       reg &= ~((u64)0xff << (8*n));
-       reg |= ((u64)msix_intr & 0xff) << (8*n);
-       write_csr(dd, CCE_INT_MAP + (8*m), reg);
+       reg = read_csr(dd, CCE_INT_MAP + (8 * m));
+       reg &= ~((u64)0xff << (8 * n));
+       reg |= ((u64)msix_intr & 0xff) << (8 * n);
+       write_csr(dd, CCE_INT_MAP + (8 * m), reg);
  }
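
For readers following remap_intr() above: each CCE_INT_MAP CSR is 64 bits wide and holds eight one-byte MSI-X vector entries, so chip source isrc lands in CSR isrc/8, byte isrc%8. The standalone sketch below repeats that arithmetic with example numbers only; nothing here touches real hardware or the driver's CSR accessors.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	unsigned int isrc = 19;       /* example chip interrupt source */
	unsigned int msix_intr = 5;   /* example MSI-X vector to map it to */
	unsigned int m = isrc / 8;    /* index of the 64-bit CCE_INT_MAP CSR */
	unsigned int n = isrc % 8;    /* byte lane within that CSR */
	uint64_t reg = 0;             /* stand-in for the read_csr() value */

	reg &= ~((uint64_t)0xff << (8 * n));
	reg |= ((uint64_t)msix_intr & 0xff) << (8 * n);
	printf("source %u -> CCE_INT_MAP[%u], byte %u, reg = 0x%016llx\n",
	       isrc, m, n, (unsigned long long)reg);
	return 0;
}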
  
  static void remap_sdma_interrupts(struct hfi1_devdata *dd,
         *      SDMAProgress
         *      SDMAIdle
         */
-       remap_intr(dd, IS_SDMA_START + 0*TXE_NUM_SDMA_ENGINES + engine,
-               msix_intr);
-       remap_intr(dd, IS_SDMA_START + 1*TXE_NUM_SDMA_ENGINES + engine,
-               msix_intr);
-       remap_intr(dd, IS_SDMA_START + 2*TXE_NUM_SDMA_ENGINES + engine,
-               msix_intr);
+       remap_intr(dd, IS_SDMA_START + 0 * TXE_NUM_SDMA_ENGINES + engine,
+                  msix_intr);
+       remap_intr(dd, IS_SDMA_START + 1 * TXE_NUM_SDMA_ENGINES + engine,
+                  msix_intr);
+       remap_intr(dd, IS_SDMA_START + 2 * TXE_NUM_SDMA_ENGINES + engine,
+                  msix_intr);
  }
  
  static int request_intx_irq(struct hfi1_devdata *dd)
        snprintf(dd->intx_name, sizeof(dd->intx_name), DRIVER_NAME "_%d",
                 dd->unit);
        ret = request_irq(dd->pcidev->irq, general_interrupt,
-                                 IRQF_SHARED, dd->intx_name, dd);
+                         IRQF_SHARED, dd->intx_name, dd);
        if (ret)
                dd_dev_err(dd, "unable to request INTx interrupt, err %d\n",
-                               ret);
+                          ret);
        else
                dd->requested_intx_irq = 1;
        return ret;
  
  static int request_msix_irqs(struct hfi1_devdata *dd)
  {
-       const struct cpumask *local_mask;
-       cpumask_var_t def, rcv;
-       bool def_ret, rcv_ret;
        int first_general, last_general;
        int first_sdma, last_sdma;
        int first_rx, last_rx;
-       int first_cpu, curr_cpu;
-       int rcv_cpu, sdma_cpu;
-       int i, ret = 0, possible;
-       int ht;
+       int i, ret = 0;
  
        /* calculate the ranges we are going to use */
        first_general = 0;
-       first_sdma = last_general = first_general + 1;
-       first_rx = last_sdma = first_sdma + dd->num_sdma;
+       last_general = first_general + 1;
+       first_sdma = last_general;
+       last_sdma = first_sdma + dd->num_sdma;
+       first_rx = last_sdma;
        last_rx = first_rx + dd->n_krcv_queues;
  
-       /*
-        * Interrupt affinity.
-        *
-        * non-rcv avail gets a default mask that
-        * starts as possible cpus with threads reset
-        * and each rcv avail reset.
-        *
-        * rcv avail gets node relative 1 wrapping back
-        * to the node relative 1 as necessary.
-        *
-        */
-       local_mask = cpumask_of_pcibus(dd->pcidev->bus);
-       /* if first cpu is invalid, use NUMA 0 */
-       if (cpumask_first(local_mask) >= nr_cpu_ids)
-               local_mask = topology_core_cpumask(0);
-       def_ret = zalloc_cpumask_var(&def, GFP_KERNEL);
-       rcv_ret = zalloc_cpumask_var(&rcv, GFP_KERNEL);
-       if (!def_ret || !rcv_ret)
-               goto bail;
-       /* use local mask as default */
-       cpumask_copy(def, local_mask);
-       possible = cpumask_weight(def);
-       /* disarm threads from default */
-       ht = cpumask_weight(
-                       topology_sibling_cpumask(cpumask_first(local_mask)));
-       for (i = possible/ht; i < possible; i++)
-               cpumask_clear_cpu(i, def);
-       /* def now has full cores on chosen node*/
-       first_cpu = cpumask_first(def);
-       if (nr_cpu_ids >= first_cpu)
-               first_cpu++;
-       curr_cpu = first_cpu;
-       /*  One context is reserved as control context */
-       for (i = first_cpu; i < dd->n_krcv_queues + first_cpu - 1; i++) {
-               cpumask_clear_cpu(curr_cpu, def);
-               cpumask_set_cpu(curr_cpu, rcv);
-               curr_cpu = cpumask_next(curr_cpu, def);
-               if (curr_cpu >= nr_cpu_ids)
-                       break;
-       }
-       /* def mask has non-rcv, rcv has recv mask */
-       rcv_cpu = cpumask_first(rcv);
-       sdma_cpu = cpumask_first(def);
        /*
         * Sanity check - the code expects all SDMA chip source
         * interrupts to be in the same CSR, starting at bit 0.  Verify
                        snprintf(me->name, sizeof(me->name),
                                 DRIVER_NAME "_%d", dd->unit);
                        err_info = "general";
+                       me->type = IRQ_GENERAL;
                } else if (first_sdma <= i && i < last_sdma) {
                        idx = i - first_sdma;
                        sde = &dd->per_sdma[idx];
                                 DRIVER_NAME "_%d sdma%d", dd->unit, idx);
                        err_info = "sdma";
                        remap_sdma_interrupts(dd, idx, i);
+                       me->type = IRQ_SDMA;
                } else if (first_rx <= i && i < last_rx) {
                        idx = i - first_rx;
                        rcd = dd->rcd[idx];
                         * Set the interrupt register and mask for this
                         * context's interrupt.
                         */
-                       rcd->ireg = (IS_RCVAVAIL_START+idx) / 64;
+                       rcd->ireg = (IS_RCVAVAIL_START + idx) / 64;
                        rcd->imask = ((u64)1) <<
-                                       ((IS_RCVAVAIL_START+idx) % 64);
+                                       ((IS_RCVAVAIL_START + idx) % 64);
                        handler = receive_context_interrupt;
                        thread = receive_context_thread;
                        arg = rcd;
                                 DRIVER_NAME "_%d kctxt%d", dd->unit, idx);
                        err_info = "receive context";
                        remap_intr(dd, IS_RCVAVAIL_START + idx, i);
+                       me->type = IRQ_RCVCTXT;
                } else {
                        /* not in our expected range - complain, then
-                          ignore it */
+                        * ignore it
+                        */
                        dd_dev_err(dd,
-                               "Unexpected extra MSI-X interrupt %d\n", i);
+                                  "Unexpected extra MSI-X interrupt %d\n", i);
                        continue;
                }
                /* no argument, no interrupt */
-               if (arg == NULL)
+               if (!arg)
                        continue;
                /* make sure the name is terminated */
-               me->name[sizeof(me->name)-1] = 0;
+               me->name[sizeof(me->name) - 1] = 0;
  
                ret = request_threaded_irq(me->msix.vector, handler, thread, 0,
-                                               me->name, arg);
+                                          me->name, arg);
                if (ret) {
                        dd_dev_err(dd,
-                               "unable to allocate %s interrupt, vector %d, index %d, err %d\n",
-                                err_info, me->msix.vector, idx, ret);
+                                  "unable to allocate %s interrupt, vector %d, index %d, err %d\n",
+                                  err_info, me->msix.vector, idx, ret);
                        return ret;
                }
                /*
                 */
                me->arg = arg;
  
-               if (!zalloc_cpumask_var(
-                       &dd->msix_entries[i].mask,
-                       GFP_KERNEL))
-                       goto bail;
-               if (handler == sdma_interrupt) {
-                       dd_dev_info(dd, "sdma engine %d cpu %d\n",
-                               sde->this_idx, sdma_cpu);
-                       sde->cpu = sdma_cpu;
-                       cpumask_set_cpu(sdma_cpu, dd->msix_entries[i].mask);
-                       sdma_cpu = cpumask_next(sdma_cpu, def);
-                       if (sdma_cpu >= nr_cpu_ids)
-                               sdma_cpu = cpumask_first(def);
-               } else if (handler == receive_context_interrupt) {
-                       dd_dev_info(dd, "rcv ctxt %d cpu %d\n", rcd->ctxt,
-                                   (rcd->ctxt == HFI1_CTRL_CTXT) ?
-                                           cpumask_first(def) : rcv_cpu);
-                       if (rcd->ctxt == HFI1_CTRL_CTXT) {
-                               /* map to first default */
-                               cpumask_set_cpu(cpumask_first(def),
-                                               dd->msix_entries[i].mask);
-                       } else {
-                               cpumask_set_cpu(rcv_cpu,
-                                               dd->msix_entries[i].mask);
-                               rcv_cpu = cpumask_next(rcv_cpu, rcv);
-                               if (rcv_cpu >= nr_cpu_ids)
-                                       rcv_cpu = cpumask_first(rcv);
-                       }
-               } else {
-                       /* otherwise first def */
-                       dd_dev_info(dd, "%s cpu %d\n",
-                               err_info, cpumask_first(def));
-                       cpumask_set_cpu(
-                               cpumask_first(def), dd->msix_entries[i].mask);
-               }
-               irq_set_affinity_hint(
-                       dd->msix_entries[i].msix.vector,
-                       dd->msix_entries[i].mask);
+               ret = hfi1_get_irq_affinity(dd, me);
+               if (ret)
+                       dd_dev_err(dd,
+                                  "unable to pin IRQ %d\n", ret);
        }
  
- out:
-       free_cpumask_var(def);
-       free_cpumask_var(rcv);
        return ret;
- bail:
-       ret = -ENOMEM;
-       goto  out;
  }
  
  /*
@@@ -12333,7 -12584,7 +12581,7 @@@ static void reset_interrupts(struct hfi
  
        /* all chip interrupts map to MSI-X 0 */
        for (i = 0; i < CCE_NUM_INT_MAP_CSRS; i++)
-               write_csr(dd, CCE_INT_MAP + (8*i), 0);
+               write_csr(dd, CCE_INT_MAP + (8 * i), 0);
  }
  
  static int set_up_interrupts(struct hfi1_devdata *dd)
@@@ -12442,7 -12693,7 +12690,7 @@@ static int set_up_context_variables(str
                 */
                num_kernel_contexts = n_krcvqs + MIN_KERNEL_KCTXTS - 1;
        else
-               num_kernel_contexts = num_online_nodes();
+               num_kernel_contexts = num_online_nodes() + 1;
        num_kernel_contexts =
                max_t(int, MIN_KERNEL_KCTXTS, num_kernel_contexts);
        /*
        dd->num_rcv_contexts = total_contexts;
        dd->n_krcv_queues = num_kernel_contexts;
        dd->first_user_ctxt = num_kernel_contexts;
+       dd->num_user_contexts = num_user_contexts;
        dd->freectxts = num_user_contexts;
        dd_dev_info(dd,
-               "rcv contexts: chip %d, used %d (kernel %d, user %d)\n",
-               (int)dd->chip_rcv_contexts,
-               (int)dd->num_rcv_contexts,
-               (int)dd->n_krcv_queues,
-               (int)dd->num_rcv_contexts - dd->n_krcv_queues);
+                   "rcv contexts: chip %d, used %d (kernel %d, user %d)\n",
+                   (int)dd->chip_rcv_contexts,
+                   (int)dd->num_rcv_contexts,
+                   (int)dd->n_krcv_queues,
+                   (int)dd->num_rcv_contexts - dd->n_krcv_queues);
  
        /*
         * Receive array allocation:
                dd->rcv_entries.ngroups = (MAX_EAGER_ENTRIES * 2) /
                        dd->rcv_entries.group_size;
                dd_dev_info(dd,
-                  "RcvArray group count too high, change to %u\n",
-                  dd->rcv_entries.ngroups);
+                           "RcvArray group count too high, change to %u\n",
+                           dd->rcv_entries.ngroups);
                dd->rcv_entries.nctxt_extra = 0;
        }
        /*
@@@ -12582,7 -12834,7 +12831,7 @@@ static void write_uninitialized_csrs_an
  
        /* CceIntMap */
        for (i = 0; i < CCE_NUM_INT_MAP_CSRS; i++)
-               write_csr(dd, CCE_INT_MAP+(8*i), 0);
+               write_csr(dd, CCE_INT_MAP + (8 * i), 0);
  
        /* SendCtxtCreditReturnAddr */
        for (i = 0; i < dd->chip_send_contexts; i++)
  
        /* PIO Send buffers */
        /* SDMA Send buffers */
-       /* These are not normally read, and (presently) have no method
-          to be read, so are not pre-initialized */
+       /*
+        * These are not normally read, and (presently) have no method
+        * to be read, so are not pre-initialized
+        */
  
        /* RcvHdrAddr */
        /* RcvHdrTailAddr */
                write_kctxt_csr(dd, i, RCV_HDR_ADDR, 0);
                write_kctxt_csr(dd, i, RCV_HDR_TAIL_ADDR, 0);
                for (j = 0; j < RXE_NUM_TID_FLOWS; j++)
-                       write_uctxt_csr(dd, i, RCV_TID_FLOW_TABLE+(8*j), 0);
+                       write_uctxt_csr(dd, i, RCV_TID_FLOW_TABLE + (8 * j), 0);
        }
  
        /* RcvArray */
        for (i = 0; i < dd->chip_rcv_array_count; i++)
-               write_csr(dd, RCV_ARRAY + (8*i),
-                                       RCV_ARRAY_RT_WRITE_ENABLE_SMASK);
+               write_csr(dd, RCV_ARRAY + (8 * i),
+                         RCV_ARRAY_RT_WRITE_ENABLE_SMASK);
  
        /* RcvQPMapTable */
        for (i = 0; i < 32; i++)
@@@ -12638,8 -12892,8 +12889,8 @@@ static void clear_cce_status(struct hfi
                        return;
                if (time_after(jiffies, timeout)) {
                        dd_dev_err(dd,
-                               "Timeout waiting for CceStatus to clear bits 0x%llx, remaining 0x%llx\n",
-                               status_bits, reg & status_bits);
+                                  "Timeout waiting for CceStatus to clear bits 0x%llx, remaining 0x%llx\n",
+                                  status_bits, reg & status_bits);
                        return;
                }
                udelay(1);
@@@ -12671,7 -12925,7 +12922,7 @@@ static void reset_cce_csrs(struct hfi1_
        for (i = 0; i < CCE_NUM_MSIX_VECTORS; i++) {
                write_csr(dd, CCE_MSIX_TABLE_LOWER + (8 * i), 0);
                write_csr(dd, CCE_MSIX_TABLE_UPPER + (8 * i),
-                                       CCE_MSIX_TABLE_UPPER_RESETCSR);
+                         CCE_MSIX_TABLE_UPPER_RESETCSR);
        }
        for (i = 0; i < CCE_NUM_MSIX_PBAS; i++) {
                /* CCE_MSIX_PBA read-only */
                write_csr(dd, CCE_INT_COUNTER_ARRAY32 + (8 * i), 0);
  }
  
- /* set ASIC CSRs to chip reset defaults */
- static void reset_asic_csrs(struct hfi1_devdata *dd)
- {
-       int i;
-       /*
-        * If the HFIs are shared between separate nodes or VMs,
-        * then more will need to be done here.  One idea is a module
-        * parameter that returns early, letting the first power-on or
-        * a known first load do the reset and blocking all others.
-        */
-       if (!(dd->flags & HFI1_DO_INIT_ASIC))
-               return;
-       if (dd->icode != ICODE_FPGA_EMULATION) {
-               /* emulation does not have an SBus - leave these alone */
-               /*
-                * All writes to ASIC_CFG_SBUS_REQUEST do something.
-                * Notes:
-                * o The reset is not zero if aimed at the core.  See the
-                *   SBus documentation for details.
-                * o If the SBus firmware has been updated (e.g. by the BIOS),
-                *   will the reset revert that?
-                */
-               /* ASIC_CFG_SBUS_REQUEST leave alone */
-               write_csr(dd, ASIC_CFG_SBUS_EXECUTE, 0);
-       }
-       /* ASIC_SBUS_RESULT read-only */
-       write_csr(dd, ASIC_STS_SBUS_COUNTERS, 0);
-       for (i = 0; i < ASIC_NUM_SCRATCH; i++)
-               write_csr(dd, ASIC_CFG_SCRATCH + (8 * i), 0);
-       write_csr(dd, ASIC_CFG_MUTEX, 0);       /* this will clear it */
-       /* We might want to retain this state across FLR if we ever use it */
-       write_csr(dd, ASIC_CFG_DRV_STR, 0);
-       /* ASIC_CFG_THERM_POLL_EN leave alone */
-       /* ASIC_STS_THERM read-only */
-       /* ASIC_CFG_RESET leave alone */
-       write_csr(dd, ASIC_PCIE_SD_HOST_CMD, 0);
-       /* ASIC_PCIE_SD_HOST_STATUS read-only */
-       write_csr(dd, ASIC_PCIE_SD_INTRPT_DATA_CODE, 0);
-       write_csr(dd, ASIC_PCIE_SD_INTRPT_ENABLE, 0);
-       /* ASIC_PCIE_SD_INTRPT_PROGRESS read-only */
-       write_csr(dd, ASIC_PCIE_SD_INTRPT_STATUS, ~0ull); /* clear */
-       /* ASIC_HFI0_PCIE_SD_INTRPT_RSPD_DATA read-only */
-       /* ASIC_HFI1_PCIE_SD_INTRPT_RSPD_DATA read-only */
-       for (i = 0; i < 16; i++)
-               write_csr(dd, ASIC_PCIE_SD_INTRPT_LIST + (8 * i), 0);
-       /* ASIC_GPIO_IN read-only */
-       write_csr(dd, ASIC_GPIO_OE, 0);
-       write_csr(dd, ASIC_GPIO_INVERT, 0);
-       write_csr(dd, ASIC_GPIO_OUT, 0);
-       write_csr(dd, ASIC_GPIO_MASK, 0);
-       /* ASIC_GPIO_STATUS read-only */
-       write_csr(dd, ASIC_GPIO_CLEAR, ~0ull);
-       /* ASIC_GPIO_FORCE leave alone */
-       /* ASIC_QSFP1_IN read-only */
-       write_csr(dd, ASIC_QSFP1_OE, 0);
-       write_csr(dd, ASIC_QSFP1_INVERT, 0);
-       write_csr(dd, ASIC_QSFP1_OUT, 0);
-       write_csr(dd, ASIC_QSFP1_MASK, 0);
-       /* ASIC_QSFP1_STATUS read-only */
-       write_csr(dd, ASIC_QSFP1_CLEAR, ~0ull);
-       /* ASIC_QSFP1_FORCE leave alone */
-       /* ASIC_QSFP2_IN read-only */
-       write_csr(dd, ASIC_QSFP2_OE, 0);
-       write_csr(dd, ASIC_QSFP2_INVERT, 0);
-       write_csr(dd, ASIC_QSFP2_OUT, 0);
-       write_csr(dd, ASIC_QSFP2_MASK, 0);
-       /* ASIC_QSFP2_STATUS read-only */
-       write_csr(dd, ASIC_QSFP2_CLEAR, ~0ull);
-       /* ASIC_QSFP2_FORCE leave alone */
-       write_csr(dd, ASIC_EEP_CTL_STAT, ASIC_EEP_CTL_STAT_RESETCSR);
-       /* this also writes a NOP command, clearing paging mode */
-       write_csr(dd, ASIC_EEP_ADDR_CMD, 0);
-       write_csr(dd, ASIC_EEP_DATA, 0);
- }
  /* set MISC CSRs to chip reset defaults */
  static void reset_misc_csrs(struct hfi1_devdata *dd)
  {
                write_csr(dd, MISC_CFG_RSA_SIGNATURE + (8 * i), 0);
                write_csr(dd, MISC_CFG_RSA_MODULUS + (8 * i), 0);
        }
-       /* MISC_CFG_SHA_PRELOAD leave alone - always reads 0 and can
-          only be written 128-byte chunks */
+       /*
+        * MISC_CFG_SHA_PRELOAD leave alone - always reads 0 and can
+        * only be written 128-byte chunks
+        */
        /* init RSA engine to clear lingering errors */
        write_csr(dd, MISC_CFG_RSA_CMD, 1);
        write_csr(dd, MISC_CFG_RSA_MU, 0);
@@@ -12843,18 -13014,17 +13011,17 @@@ static void reset_txe_csrs(struct hfi1_
        write_csr(dd, SEND_ERR_CLEAR, ~0ull);
        /* SEND_ERR_FORCE read-only */
        for (i = 0; i < VL_ARB_LOW_PRIO_TABLE_SIZE; i++)
-               write_csr(dd, SEND_LOW_PRIORITY_LIST + (8*i), 0);
+               write_csr(dd, SEND_LOW_PRIORITY_LIST + (8 * i), 0);
        for (i = 0; i < VL_ARB_HIGH_PRIO_TABLE_SIZE; i++)
-               write_csr(dd, SEND_HIGH_PRIORITY_LIST + (8*i), 0);
-       for (i = 0; i < dd->chip_send_contexts/NUM_CONTEXTS_PER_SET; i++)
-               write_csr(dd, SEND_CONTEXT_SET_CTRL + (8*i), 0);
+               write_csr(dd, SEND_HIGH_PRIORITY_LIST + (8 * i), 0);
+       for (i = 0; i < dd->chip_send_contexts / NUM_CONTEXTS_PER_SET; i++)
+               write_csr(dd, SEND_CONTEXT_SET_CTRL + (8 * i), 0);
        for (i = 0; i < TXE_NUM_32_BIT_COUNTER; i++)
-               write_csr(dd, SEND_COUNTER_ARRAY32 + (8*i), 0);
+               write_csr(dd, SEND_COUNTER_ARRAY32 + (8 * i), 0);
        for (i = 0; i < TXE_NUM_64_BIT_COUNTER; i++)
-               write_csr(dd, SEND_COUNTER_ARRAY64 + (8*i), 0);
+               write_csr(dd, SEND_COUNTER_ARRAY64 + (8 * i), 0);
        write_csr(dd, SEND_CM_CTRL, SEND_CM_CTRL_RESETCSR);
-       write_csr(dd, SEND_CM_GLOBAL_CREDIT,
-                                       SEND_CM_GLOBAL_CREDIT_RESETCSR);
+       write_csr(dd, SEND_CM_GLOBAL_CREDIT, SEND_CM_GLOBAL_CREDIT_RESETCSR);
        /* SEND_CM_CREDIT_USED_STATUS read-only */
        write_csr(dd, SEND_CM_TIMER_CTRL, 0);
        write_csr(dd, SEND_CM_LOCAL_AU_TABLE0_TO3, 0);
        write_csr(dd, SEND_CM_REMOTE_AU_TABLE0_TO3, 0);
        write_csr(dd, SEND_CM_REMOTE_AU_TABLE4_TO7, 0);
        for (i = 0; i < TXE_NUM_DATA_VL; i++)
-               write_csr(dd, SEND_CM_CREDIT_VL + (8*i), 0);
+               write_csr(dd, SEND_CM_CREDIT_VL + (8 * i), 0);
        write_csr(dd, SEND_CM_CREDIT_VL15, 0);
        /* SEND_CM_CREDIT_USED_VL read-only */
        /* SEND_CM_CREDIT_USED_VL15 read-only */
@@@ -12948,8 -13118,8 +13115,8 @@@ static void init_rbufs(struct hfi1_devd
                 */
                if (count++ > 500) {
                        dd_dev_err(dd,
-                               "%s: in-progress DMA not clearing: RcvStatus 0x%llx, continuing\n",
-                               __func__, reg);
+                                  "%s: in-progress DMA not clearing: RcvStatus 0x%llx, continuing\n",
+                                  __func__, reg);
                        break;
                }
                udelay(2); /* do not busy-wait the CSR */
                /* give up after 100us - slowest possible at 33MHz is 73us */
                if (count++ > 50) {
                        dd_dev_err(dd,
-                               "%s: RcvStatus.RxRbufInit not set, continuing\n",
-                               __func__);
+                                  "%s: RcvStatus.RxRbufInit not set, continuing\n",
+                                  __func__);
                        break;
                }
        }
@@@ -13005,7 -13175,7 +13172,7 @@@ static void reset_rxe_csrs(struct hfi1_
        write_csr(dd, RCV_VL15, 0);
        /* this is a clear-down */
        write_csr(dd, RCV_ERR_INFO,
-                       RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK);
+                 RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK);
        /* RCV_ERR_STATUS read-only */
        write_csr(dd, RCV_ERR_MASK, 0);
        write_csr(dd, RCV_ERR_CLEAR, ~0ull);
                write_uctxt_csr(dd, i, RCV_EGR_INDEX_HEAD, 0);
                /* RCV_EGR_OFFSET_TAIL read-only */
                for (j = 0; j < RXE_NUM_TID_FLOWS; j++) {
-                       write_uctxt_csr(dd, i, RCV_TID_FLOW_TABLE + (8 * j),
-                               0);
+                       write_uctxt_csr(dd, i,
+                                       RCV_TID_FLOW_TABLE + (8 * j), 0);
                }
        }
  }
@@@ -13154,7 -13324,7 +13321,7 @@@ static void init_chip(struct hfi1_devda
                write_csr(dd, RCV_CTXT_CTRL, 0);
        /* mask all interrupt sources */
        for (i = 0; i < CCE_NUM_INT_CSRS; i++)
-               write_csr(dd, CCE_INT_MASK + (8*i), 0ull);
+               write_csr(dd, CCE_INT_MASK + (8 * i), 0ull);
  
        /*
         * DC Reset: do a full DC reset before the register clear.
         * across the clear.
         */
        write_csr(dd, CCE_DC_CTRL, CCE_DC_CTRL_DC_RESET_SMASK);
-       (void) read_csr(dd, CCE_DC_CTRL);
+       (void)read_csr(dd, CCE_DC_CTRL);
  
        if (use_flr) {
                /*
                        hfi1_pcie_flr(dd);
                        restore_pci_variables(dd);
                }
-               reset_asic_csrs(dd);
        } else {
                dd_dev_info(dd, "Resetting CSRs with writes\n");
                reset_cce_csrs(dd);
                reset_txe_csrs(dd);
                reset_rxe_csrs(dd);
-               reset_asic_csrs(dd);
                reset_misc_csrs(dd);
        }
        /* clear the DC reset */
        write_csr(dd, CCE_DC_CTRL, 0);
  
        /* Set the LED off */
-       if (is_ax(dd))
-               setextled(dd, 0);
+       setextled(dd, 0);
        /*
         * Clear the QSFP reset.
         * An FLR enforces a 0 on all out pins. The driver does not touch
         */
        write_csr(dd, ASIC_QSFP1_OUT, 0x1f);
        write_csr(dd, ASIC_QSFP2_OUT, 0x1f);
+       init_chip_resources(dd);
  }
  
  static void init_early_variables(struct hfi1_devdata *dd)
@@@ -13252,12 -13420,12 +13417,12 @@@ static void init_kdeth_qp(struct hfi1_d
                kdeth_qp = DEFAULT_KDETH_QP;
  
        write_csr(dd, SEND_BTH_QP,
-                       (kdeth_qp & SEND_BTH_QP_KDETH_QP_MASK)
-                               << SEND_BTH_QP_KDETH_QP_SHIFT);
+                 (kdeth_qp & SEND_BTH_QP_KDETH_QP_MASK) <<
+                 SEND_BTH_QP_KDETH_QP_SHIFT);
  
        write_csr(dd, RCV_BTH_QP,
-                       (kdeth_qp & RCV_BTH_QP_KDETH_QP_MASK)
-                               << RCV_BTH_QP_KDETH_QP_SHIFT);
+                 (kdeth_qp & RCV_BTH_QP_KDETH_QP_MASK) <<
+                 RCV_BTH_QP_KDETH_QP_SHIFT);
  }
  
  /**
@@@ -13382,22 -13550,21 +13547,21 @@@ static void init_qos(struct hfi1_devdat
                write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), rsmmap[i]);
        /* add rule0 */
        write_csr(dd, RCV_RSM_CFG /* + (8 * 0) */,
-               RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_MASK
-                       << RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_SHIFT |
-               2ull << RCV_RSM_CFG_PACKET_TYPE_SHIFT);
+                 RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_MASK <<
+                 RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_SHIFT |
+                 2ull << RCV_RSM_CFG_PACKET_TYPE_SHIFT);
        write_csr(dd, RCV_RSM_SELECT /* + (8 * 0) */,
-               LRH_BTH_MATCH_OFFSET
-                       << RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT |
-               LRH_SC_MATCH_OFFSET << RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT |
-               LRH_SC_SELECT_OFFSET << RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT |
-               ((u64)n) << RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT |
-               QPN_SELECT_OFFSET << RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT |
-               ((u64)m + (u64)n) << RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT);
+                 LRH_BTH_MATCH_OFFSET << RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT |
+                 LRH_SC_MATCH_OFFSET << RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT |
+                 LRH_SC_SELECT_OFFSET << RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT |
+                 ((u64)n) << RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT |
+                 QPN_SELECT_OFFSET << RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT |
+                 ((u64)m + (u64)n) << RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT);
        write_csr(dd, RCV_RSM_MATCH /* + (8 * 0) */,
-               LRH_BTH_MASK << RCV_RSM_MATCH_MASK1_SHIFT |
-               LRH_BTH_VALUE << RCV_RSM_MATCH_VALUE1_SHIFT |
-               LRH_SC_MASK << RCV_RSM_MATCH_MASK2_SHIFT |
-               LRH_SC_VALUE << RCV_RSM_MATCH_VALUE2_SHIFT);
+                 LRH_BTH_MASK << RCV_RSM_MATCH_MASK1_SHIFT |
+                 LRH_BTH_VALUE << RCV_RSM_MATCH_VALUE1_SHIFT |
+                 LRH_SC_MASK << RCV_RSM_MATCH_MASK2_SHIFT |
+                 LRH_SC_VALUE << RCV_RSM_MATCH_VALUE2_SHIFT);
        /* Enable RSM */
        add_rcvctrl(dd, RCV_CTRL_RCV_RSM_ENABLE_SMASK);
        kfree(rsmmap);
@@@ -13415,9 -13582,8 +13579,8 @@@ static void init_rxe(struct hfi1_devdat
        /* enable all receive errors */
        write_csr(dd, RCV_ERR_MASK, ~0ull);
        /* setup QPN map table - start where VL15 context leaves off */
-       init_qos(
-               dd,
-               dd->n_krcv_queues > MIN_KERNEL_KCTXTS ? MIN_KERNEL_KCTXTS : 0);
+       init_qos(dd, dd->n_krcv_queues > MIN_KERNEL_KCTXTS ?
+                MIN_KERNEL_KCTXTS : 0);
        /*
         * make sure RcvCtrl.RcvWcb <= PCIe Device Control
         * Register Max_Payload_Size (PCI_EXP_DEVCTL in Linux PCIe config
@@@ -13454,36 -13620,33 +13617,33 @@@ static void assign_cm_au_table(struct h
                               u32 csr0to3, u32 csr4to7)
  {
        write_csr(dd, csr0to3,
-                  0ull <<
-                       SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE0_SHIFT
-               |  1ull <<
-                       SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE1_SHIFT
-               |  2ull * cu <<
-                       SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE2_SHIFT
-               |  4ull * cu <<
-                       SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE3_SHIFT);
+                 0ull << SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE0_SHIFT |
+                 1ull << SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE1_SHIFT |
+                 2ull * cu <<
+                 SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE2_SHIFT |
+                 4ull * cu <<
+                 SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE3_SHIFT);
        write_csr(dd, csr4to7,
-                  8ull * cu <<
-                       SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE4_SHIFT
-               | 16ull * cu <<
-                       SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE5_SHIFT
-               | 32ull * cu <<
-                       SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE6_SHIFT
-               | 64ull * cu <<
-                       SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE7_SHIFT);
+                 8ull * cu <<
+                 SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE4_SHIFT |
+                 16ull * cu <<
+                 SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE5_SHIFT |
+                 32ull * cu <<
+                 SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE6_SHIFT |
+                 64ull * cu <<
+                 SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE7_SHIFT);
  }
  
  static void assign_local_cm_au_table(struct hfi1_devdata *dd, u8 vcu)
  {
        assign_cm_au_table(dd, vcu_to_cu(vcu), SEND_CM_LOCAL_AU_TABLE0_TO3,
-                                       SEND_CM_LOCAL_AU_TABLE4_TO7);
+                          SEND_CM_LOCAL_AU_TABLE4_TO7);
  }
  
  void assign_remote_cm_au_table(struct hfi1_devdata *dd, u8 vcu)
  {
        assign_cm_au_table(dd, vcu_to_cu(vcu), SEND_CM_REMOTE_AU_TABLE0_TO3,
-                                       SEND_CM_REMOTE_AU_TABLE4_TO7);
+                          SEND_CM_REMOTE_AU_TABLE4_TO7);
  }
  
  static void init_txe(struct hfi1_devdata *dd)
@@@ -13586,9 -13749,9 +13746,9 @@@ int hfi1_set_ctxt_pkey(struct hfi1_devd
        int ret = 0;
        u64 reg;
  
-       if (ctxt < dd->num_rcv_contexts)
+       if (ctxt < dd->num_rcv_contexts) {
                rcd = dd->rcd[ctxt];
-       else {
+       } else {
                ret = -EINVAL;
                goto done;
        }
@@@ -13614,9 -13777,9 +13774,9 @@@ int hfi1_clear_ctxt_pkey(struct hfi1_de
        int ret = 0;
        u64 reg;
  
-       if (ctxt < dd->num_rcv_contexts)
+       if (ctxt < dd->num_rcv_contexts) {
                rcd = dd->rcd[ctxt];
-       else {
+       } else {
                ret = -EINVAL;
                goto done;
        }
   */
  void hfi1_start_cleanup(struct hfi1_devdata *dd)
  {
+       aspm_exit(dd);
        free_cntrs(dd);
        free_rcverr(dd);
        clean_up_interrupts(dd);
+       finish_chip_resources(dd);
  }
  
  #define HFI_BASE_GUID(dev) \
        ((dev)->base_guid & ~(1ULL << GUID_HFI_INDEX_SHIFT))
  
  /*
-  * Certain chip functions need to be initialized only once per asic
-  * instead of per-device. This function finds the peer device and
-  * checks whether that chip initialization needs to be done by this
-  * device.
+  * Information can be shared between the two HFIs on the same ASIC
+  * in the same OS.  This function finds the peer device and sets
+  * up a shared structure.
   */
- static void asic_should_init(struct hfi1_devdata *dd)
+ static int init_asic_data(struct hfi1_devdata *dd)
  {
        unsigned long flags;
        struct hfi1_devdata *tmp, *peer = NULL;
+       int ret = 0;
  
        spin_lock_irqsave(&hfi1_devs_lock, flags);
        /* Find our peer device */
                }
        }
  
-       /*
-        * "Claim" the ASIC for initialization if it hasn't been
-        " "claimed" yet.
-        */
-       if (!peer || !(peer->flags & HFI1_DO_INIT_ASIC))
-               dd->flags |= HFI1_DO_INIT_ASIC;
+       if (peer) {
+               dd->asic_data = peer->asic_data;
+       } else {
+               dd->asic_data = kzalloc(sizeof(*dd->asic_data), GFP_KERNEL);
+               if (!dd->asic_data) {
+                       ret = -ENOMEM;
+                       goto done;
+               }
+               mutex_init(&dd->asic_data->asic_resource_mutex);
+       }
+       dd->asic_data->dds[dd->hfi1_id] = dd; /* self back-pointer */
+ done:
        spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+       return ret;
  }
  
  /*
@@@ -13694,7 -13867,7 +13864,7 @@@ static int obtain_boardname(struct hfi1
        ret = read_hfi1_efi_var(dd, "description", &size,
                                (void **)&dd->boardname);
        if (ret) {
-               dd_dev_err(dd, "Board description not found\n");
+               dd_dev_info(dd, "Board description not found\n");
                /* use generic description */
                dd->boardname = kstrdup(generic, GFP_KERNEL);
                if (!dd->boardname)
        return 0;
  }
  
+ /*
+  * Check the interrupt registers to make sure that they are mapped correctly.
+  * It is intended to help user identify any mismapping by VMM when the driver
+  * is running in a VM. This function should only be called before interrupt
+  * is set up properly.
+  *
+  * Return 0 on success, -EINVAL on failure.
+  */
+ static int check_int_registers(struct hfi1_devdata *dd)
+ {
+       u64 reg;
+       u64 all_bits = ~(u64)0;
+       u64 mask;
+       /* Clear CceIntMask[0] to avoid raising any interrupts */
+       mask = read_csr(dd, CCE_INT_MASK);
+       write_csr(dd, CCE_INT_MASK, 0ull);
+       reg = read_csr(dd, CCE_INT_MASK);
+       if (reg)
+               goto err_exit;
+       /* Clear all interrupt status bits */
+       write_csr(dd, CCE_INT_CLEAR, all_bits);
+       reg = read_csr(dd, CCE_INT_STATUS);
+       if (reg)
+               goto err_exit;
+       /* Set all interrupt status bits */
+       write_csr(dd, CCE_INT_FORCE, all_bits);
+       reg = read_csr(dd, CCE_INT_STATUS);
+       if (reg != all_bits)
+               goto err_exit;
+       /* Restore the interrupt mask */
+       write_csr(dd, CCE_INT_CLEAR, all_bits);
+       write_csr(dd, CCE_INT_MASK, mask);
+       return 0;
+ err_exit:
+       write_csr(dd, CCE_INT_MASK, mask);
+       dd_dev_err(dd, "Interrupt registers not properly mapped by VMM\n");
+       return -EINVAL;
+ }
  /**
   * Allocate and initialize the device structure for the hfi.
   * @dev: the pci_dev for hfi1_ib device
@@@ -13727,9 -13944,10 +13941,10 @@@ struct hfi1_devdata *hfi1_init_dd(struc
                "RTL FPGA emulation",
                "Functional simulator"
        };
+       struct pci_dev *parent = pdev->bus->self;
  
-       dd = hfi1_alloc_devdata(pdev,
-               NUM_IB_PORTS * sizeof(struct hfi1_pportdata));
+       dd = hfi1_alloc_devdata(pdev, NUM_IB_PORTS *
+                               sizeof(struct hfi1_pportdata));
        if (IS_ERR(dd))
                goto bail;
        ppd = dd->pport;
                /* link width active is 0 when link is down */
                /* link width downgrade active is 0 when link is down */
  
-               if (num_vls < HFI1_MIN_VLS_SUPPORTED
-                       || num_vls > HFI1_MAX_VLS_SUPPORTED) {
+               if (num_vls < HFI1_MIN_VLS_SUPPORTED ||
+                   num_vls > HFI1_MAX_VLS_SUPPORTED) {
                        hfi1_early_err(&pdev->dev,
                                       "Invalid num_vls %u, using %u VLs\n",
                                    num_vls, HFI1_MAX_VLS_SUPPORTED);
                }
                ppd->vls_supported = num_vls;
                ppd->vls_operational = ppd->vls_supported;
+               ppd->actual_vls_operational = ppd->vls_supported;
                /* Set the default MTU. */
                for (vl = 0; vl < num_vls; vl++)
                        dd->vld[vl].mtu = hfi1_max_mtu;
                /* start in offline */
                ppd->host_link_state = HLS_DN_OFFLINE;
                init_vl_arb_caches(ppd);
+               ppd->last_pstate = 0xff; /* invalid value */
        }
  
        dd->link_default = HLS_DN_POLL;
        dd->minrev = (dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT)
                        & CCE_REVISION_CHIP_REV_MINOR_MASK;
  
-       /* obtain the hardware ID - NOT related to unit, which is a
-          software enumeration */
+       /*
+        * Check interrupt registers mapping if the driver has no access to
+        * the upstream component. In this case, it is likely that the driver
+        * is running in a VM.
+        */
+       if (!parent) {
+               ret = check_int_registers(dd);
+               if (ret)
+                       goto bail_cleanup;
+       }
+       /*
+        * obtain the hardware ID - NOT related to unit, which is a
+        * software enumeration
+        */
        reg = read_csr(dd, CCE_REVISION2);
        dd->hfi1_id = (reg >> CCE_REVISION2_HFI_ID_SHIFT)
                                        & CCE_REVISION2_HFI_ID_MASK;
        dd->icode = reg >> CCE_REVISION2_IMPL_CODE_SHIFT;
        dd->irev = reg >> CCE_REVISION2_IMPL_REVISION_SHIFT;
        dd_dev_info(dd, "Implementation: %s, revision 0x%x\n",
-               dd->icode < ARRAY_SIZE(inames) ? inames[dd->icode] : "unknown",
-               (int)dd->irev);
+                   dd->icode < ARRAY_SIZE(inames) ?
+                   inames[dd->icode] : "unknown", (int)dd->irev);
  
        /* speeds the hardware can support */
        dd->pport->link_speed_supported = OPA_LINK_SPEED_25G;
                           num_vls, dd->chip_sdma_engines);
                num_vls = dd->chip_sdma_engines;
                ppd->vls_supported = dd->chip_sdma_engines;
+               ppd->vls_operational = ppd->vls_supported;
        }
  
        /*
        /* needs to be done before we look for the peer device */
        read_guid(dd);
  
-       /* should this device init the ASIC block? */
-       asic_should_init(dd);
+       /* set up shared ASIC data with peer device */
+       ret = init_asic_data(dd);
+       if (ret)
+               goto bail_cleanup;
  
        /* obtain chip sizes, reset chip CSRs */
        init_chip(dd);
        if (ret)
                goto bail_cleanup;
  
+       /* Needs to be called before hfi1_firmware_init */
+       get_platform_config(dd);
        /* read in firmware */
        ret = hfi1_firmware_init(dd);
        if (ret)
        /* set up KDETH QP prefix in both RX and TX CSRs */
        init_kdeth_qp(dd);
  
+       ret = hfi1_dev_affinity_init(dd);
+       if (ret)
+               goto bail_cleanup;
        /* send contexts must be set up before receive contexts */
        ret = init_send_contexts(dd);
        if (ret)
@@@ -14022,7 -14265,6 +14262,6 @@@ static u16 delay_cycles(struct hfi1_ppo
        return (u16)delta_cycles;
  }
  
  /**
   * create_pbc - build a pbc for transmission
   * @flags: special case flags or-ed in built pbc
@@@ -14078,10 -14320,15 +14317,15 @@@ static int thermal_init(struct hfi1_dev
        int ret = 0;
  
        if (dd->icode != ICODE_RTL_SILICON ||
-           !(dd->flags & HFI1_DO_INIT_ASIC))
+           check_chip_resource(dd, CR_THERM_INIT, NULL))
                return ret;
  
-       acquire_hw_mutex(dd);
+       ret = acquire_chip_resource(dd, CR_SBUS, SBUS_TIMEOUT);
+       if (ret) {
+               THERM_FAILURE(dd, ret, "Acquire SBus");
+               return ret;
+       }
        dd_dev_info(dd, "Initializing thermal sensor\n");
        /* Disable polling of thermal readings */
        write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x0);
  
        /* Enable polling of thermal readings */
        write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x1);
+       /* Set initialized flag */
+       ret = acquire_chip_resource(dd, CR_THERM_INIT, 0);
+       if (ret)
+               THERM_FAILURE(dd, ret, "Unable to set thermal init flag");
  done:
-       release_hw_mutex(dd);
+       release_chip_resource(dd, CR_SBUS);
        return ret;
  }
  
@@@ -14144,7 -14397,7 +14394,7 @@@ static void handle_temp_err(struct hfi1
        dd_dev_emerg(dd,
                     "Critical temperature reached! Forcing device into freeze mode!\n");
        dd->flags |= HFI1_FORCED_FREEZE;
-       start_freeze_handling(ppd, FREEZE_SELF|FREEZE_ABORT);
+       start_freeze_handling(ppd, FREEZE_SELF | FREEZE_ABORT);
        /*
         * Shut DC down as much and as quickly as possible.
         *
         */
        ppd->driver_link_ready = 0;
        ppd->link_enabled = 0;
-       set_physical_link_state(dd, PLS_OFFLINE |
-                               (OPA_LINKDOWN_REASON_SMA_DISABLED << 8));
+       set_physical_link_state(dd, (OPA_LINKDOWN_REASON_SMA_DISABLED << 8) |
+                               PLS_OFFLINE);
        /*
         * Step 2: Shutdown LCB and 8051
         *         After shutdown, do not restore DC_CFG_RESET value.
index e41159fe6889737e3899a0413c9c25a74530b5df,6546e91f85b753f77d5231418dc2d08191e680cb..c5b520bf610e29149baf6aff3b2f3f5a66ecafa2
@@@ -1,12 -1,11 +1,11 @@@
  /*
+  * Copyright(c) 2015, 2016 Intel Corporation.
   *
   * This file is provided under a dual BSD/GPLv2 license.  When using or
   * redistributing this file, you may do so under either license.
   *
   * GPL LICENSE SUMMARY
   *
-  * Copyright(c) 2015 Intel Corporation.
-  *
   * This program is free software; you can redistribute it and/or modify
   * it under the terms of version 2 of the GNU General Public License as
   * published by the Free Software Foundation.
@@@ -18,8 -17,6 +17,6 @@@
   *
   * BSD LICENSE
   *
-  * Copyright(c) 2015 Intel Corporation.
-  *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
@@@ -70,6 -67,7 +67,7 @@@
  #include "hfi.h"
  #include "device.h"
  #include "common.h"
+ #include "verbs_txreq.h"
  #include "trace.h"
  
  #undef pr_fmt
  /* Snoop option mask */
  #define SNOOP_DROP_SEND               BIT(0)
  #define SNOOP_USE_METADATA    BIT(1)
+ #define SNOOP_SET_VL0TOVL15     BIT(2)
  
  static u8 snoop_flags;
  
  /*
   * Extract packet length from LRH header.
-  * Why & 0x7FF? Because len is only 11 bits in case it wasn't 0'd we throw the
-  * bogus bits away. This is in Dwords so multiply by 4 to get size in bytes
+  * This is in Dwords so multiply by 4 to get size in bytes
   */
- #define HFI1_GET_PKT_LEN(x)      (((be16_to_cpu((x)->lrh[2]) & 0x7FF)) << 2)
+ #define HFI1_GET_PKT_LEN(x)      (((be16_to_cpu((x)->lrh[2]) & 0xFFF)) << 2)
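
The updated macro above masks the 12-bit PktLen field out of the third LRH word and converts dwords to bytes. A quick standalone check of that arithmetic follows; the htons()/ntohs() pair merely stands in for be16_to_cpu(), and the sample value is made up.

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

int main(void)
{
	uint16_t lrh2 = htons(0x0020);                 /* PktLen = 32 dwords */
	unsigned int bytes = (ntohs(lrh2) & 0xFFF) << 2;

	printf("packet length = %u bytes\n", bytes);   /* 32 * 4 = 128 */
	return 0;
}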
  
  enum hfi1_filter_status {
        HFI1_FILTER_HIT,
@@@ -257,7 -255,7 +255,7 @@@ static int hfi1_filter_ib_service_level
  static int hfi1_filter_ib_pkey(void *ibhdr, void *packet_data, void *value);
  static int hfi1_filter_direction(void *ibhdr, void *packet_data, void *value);
  
 -static struct hfi1_filter_array hfi1_filters[] = {
 +static const struct hfi1_filter_array hfi1_filters[] = {
        { hfi1_filter_lid },
        { hfi1_filter_dlid },
        { hfi1_filter_mad_mgmt_class },
@@@ -860,7 -858,7 +858,7 @@@ static ssize_t hfi1_snoop_write(struct 
                        vl = sc4;
                } else {
                        sl = (byte_two >> 4) & 0xf;
-                       ibp = to_iport(&dd->verbs_dev.ibdev, 1);
+                       ibp = to_iport(&dd->verbs_dev.rdi.ibdev, 1);
                        sc5 = ibp->sl_to_sc[sl];
                        vl = sc_to_vlt(dd, sc5);
                        if (vl != sc4) {
@@@ -966,6 -964,65 +964,65 @@@ static ssize_t hfi1_snoop_read(struct f
        return ret;
  }
  
+ /**
+  * hfi1_assign_snoop_link_credits -- Set up credits for VL15 and others
+  * @ppd : ptr to hfi1 port data
+  * @value : options from user space
+  *
+  * Assumes the rest of the CM credit registers are zero from a
+  * previous global or credit reset.
+  * Leave shared count at zero for both global and all vls.
+  * In snoop mode ideally we don't use shared credits
+  * Reserve 8.5k for VL15
+  * If total credits less than 8.5kbytes return error.
+  * Divide the rest of the credits across VL0 to VL7 and if
+  * each of these levels has less than 34 credits (at least 2048 + 128 bytes)
+  * return with an error.
+  * The credit registers will be reset to zero on link negotiation or link up
+  * so this function should be activated from user space only if the port has
+  * gone past link negotiation and link up.
+  *
+  * Return -- 0 if successful else error condition
+  *
+  */
+ static long hfi1_assign_snoop_link_credits(struct hfi1_pportdata *ppd,
+                                          int value)
+ {
+ #define  OPA_MIN_PER_VL_CREDITS  34  /* 2048 + 128 bytes */
+       struct buffer_control t;
+       int i;
+       struct hfi1_devdata *dd = ppd->dd;
+       u16  total_credits = (value >> 16) & 0xffff;
+       u16  vl15_credits = dd->vl15_init / 2;
+       u16  per_vl_credits;
+       __be16 be_per_vl_credits;
+       if (!(ppd->host_link_state & HLS_UP))
+               goto err_exit;
+       if (total_credits  <  vl15_credits)
+               goto err_exit;
+       per_vl_credits = (total_credits - vl15_credits) / TXE_NUM_DATA_VL;
+       if (per_vl_credits < OPA_MIN_PER_VL_CREDITS)
+               goto err_exit;
+       memset(&t, 0, sizeof(t));
+       be_per_vl_credits = cpu_to_be16(per_vl_credits);
+       for (i = 0; i < TXE_NUM_DATA_VL; i++)
+               t.vl[i].dedicated = be_per_vl_credits;
+       t.vl[15].dedicated  = cpu_to_be16(vl15_credits);
+       return set_buffer_control(ppd, &t);
+ err_exit:
+       snoop_dbg("port_state = 0x%x, total_credits = %d, vl15_credits = %d",
+                 ppd->host_link_state, total_credits, vl15_credits);
+       return -EINVAL;
+ }
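
For orientation, here is a standalone sketch of the arithmetic in hfi1_assign_snoop_link_credits(). The 64-byte-per-credit assumption and the example numbers are mine, not from this diff; the packing of the total credit count into the upper 16 bits of the ioctl value is what the function's use of value >> 16 implies.

#include <stdio.h>

#define DATA_VLS            8    /* VL0..VL7 */
#define MIN_PER_VL_CREDITS  34   /* 2048 + 128 bytes at 64 bytes per credit */

int main(void)
{
	/* flag in the low bits, total credits in bits 16..31 */
	unsigned int value = (1000u << 16) | (1u << 2);
	unsigned int total_credits = (value >> 16) & 0xffff;
	unsigned int vl15_credits = 136;  /* ~8.5KB reserved for VL15 */
	unsigned int per_vl;

	if (total_credits < vl15_credits)
		return 1;
	per_vl = (total_credits - vl15_credits) / DATA_VLS;
	if (per_vl < MIN_PER_VL_CREDITS)
		return 1;
	printf("VL15: %u credits, VL0-VL7: %u credits each\n",
	       vl15_credits, per_vl);   /* VL15: 136, VL0-VL7: 108 each */
	return 0;
}
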
  static long hfi1_ioctl(struct file *fp, unsigned int cmd, unsigned long arg)
  {
        struct hfi1_devdata *dd;
                        snoop_flags |= SNOOP_DROP_SEND;
                if (value & SNOOP_USE_METADATA)
                        snoop_flags |= SNOOP_USE_METADATA;
+               if (value & (SNOOP_SET_VL0TOVL15)) {
+                       ppd = &dd->pport[0];  /* first port will do */
+                       ret = hfi1_assign_snoop_link_credits(ppd, value);
+               }
                break;
        default:
                return -ENOTTY;
@@@ -1603,7 -1664,7 +1664,7 @@@ int snoop_recv_handler(struct hfi1_pack
  /*
   * Handle snooping and capturing packets when sdma is being used.
   */
- int snoop_send_dma_handler(struct hfi1_qp *qp, struct hfi1_pkt_state *ps,
+ int snoop_send_dma_handler(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
                           u64 pbc)
  {
        pr_alert("Snooping/Capture of Send DMA Packets Is Not Supported!\n");
   * bypass packets. The only way to send a bypass packet currently is to use the
   * diagpkt interface. When that interface is enabled, snoop/capture is not.
   */
- int snoop_send_pio_handler(struct hfi1_qp *qp, struct hfi1_pkt_state *ps,
+ int snoop_send_pio_handler(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
                           u64 pbc)
  {
-       struct ahg_ib_header *ahdr = qp->s_hdr;
        u32 hdrwords = qp->s_hdrwords;
-       struct hfi1_sge_state *ss = qp->s_cur_sge;
+       struct rvt_sge_state *ss = qp->s_cur_sge;
        u32 len = qp->s_cur_size;
        u32 dwords = (len + 3) >> 2;
        u32 plen = hdrwords + dwords + 2; /* includes pbc */
        struct hfi1_pportdata *ppd = ps->ppd;
        struct snoop_packet *s_packet = NULL;
-       u32 *hdr = (u32 *)&ahdr->ibh;
+       u32 *hdr = (u32 *)&ps->s_txreq->phdr.hdr;
        u32 length = 0;
-       struct hfi1_sge_state temp_ss;
+       struct rvt_sge_state temp_ss;
        void *data = NULL;
        void *data_start = NULL;
        int ret;
        struct capture_md md;
        u32 vl;
        u32 hdr_len = hdrwords << 2;
-       u32 tlen = HFI1_GET_PKT_LEN(&ahdr->ibh);
+       u32 tlen = HFI1_GET_PKT_LEN(&ps->s_txreq->phdr.hdr);
  
        md.u.pbc = 0;
  
                md.port = 1;
                md.dir = PKT_DIR_EGRESS;
                if (likely(pbc == 0)) {
-                       vl = be16_to_cpu(ahdr->ibh.lrh[0]) >> 12;
+                       vl = be16_to_cpu(ps->s_txreq->phdr.hdr.lrh[0]) >> 12;
                        md.u.pbc = create_pbc(ppd, 0, qp->s_srate, vl, plen);
                } else {
                        md.u.pbc = 0;
                ret = HFI1_FILTER_HIT;
        } else {
                ret = ppd->dd->hfi1_snoop.filter_callback(
-                                       &ahdr->ibh,
+                                       &ps->s_txreq->phdr.hdr,
                                        NULL,
                                        ppd->dd->hfi1_snoop.filter_value);
        }
                                spin_unlock_irqrestore(&qp->s_lock, flags);
                        } else if (qp->ibqp.qp_type == IB_QPT_RC) {
                                spin_lock_irqsave(&qp->s_lock, flags);
-                               hfi1_rc_send_complete(qp, &ahdr->ibh);
+                               hfi1_rc_send_complete(qp,
+                                                     &ps->s_txreq->phdr.hdr);
                                spin_unlock_irqrestore(&qp->s_lock, flags);
                        }
+                       /*
+                        * If snoop is dropping the packet we need to put the
+                        * txreq back because no one else will.
+                        */
+                       hfi1_put_txreq(ps->s_txreq);
                        return 0;
                }
                break;
index ee50bbf64d39603939bfbedff8ae7965e7678769,914beedb556b42eabd4bbdfc5680efd286655858..34511e5df1d56e7765c5d11d4e07d68514cf3894
@@@ -1,12 -1,11 +1,11 @@@
  /*
+  * Copyright(c) 2015, 2016 Intel Corporation.
   *
   * This file is provided under a dual BSD/GPLv2 license.  When using or
   * redistributing this file, you may do so under either license.
   *
   * GPL LICENSE SUMMARY
   *
-  * Copyright(c) 2015 Intel Corporation.
-  *
   * This program is free software; you can redistribute it and/or modify
   * it under the terms of version 2 of the GNU General Public License as
   * published by the Free Software Foundation.
@@@ -18,8 -17,6 +17,6 @@@
   *
   * BSD LICENSE
   *
-  * Copyright(c) 2015 Intel Corporation.
-  *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
@@@ -56,6 -53,7 +53,7 @@@
  #include <linux/vmalloc.h>
  #include <linux/module.h>
  #include <linux/prefetch.h>
+ #include <rdma/ib_verbs.h>
  
  #include "hfi.h"
  #include "trace.h"
@@@ -162,6 -160,22 +160,22 @@@ const char *get_unit_name(int unit
        return iname;
  }
  
+ const char *get_card_name(struct rvt_dev_info *rdi)
+ {
+       struct hfi1_ibdev *ibdev = container_of(rdi, struct hfi1_ibdev, rdi);
+       struct hfi1_devdata *dd = container_of(ibdev,
+                                              struct hfi1_devdata, verbs_dev);
+       return get_unit_name(dd->unit);
+ }
+ struct pci_dev *get_pci_dev(struct rvt_dev_info *rdi)
+ {
+       struct hfi1_ibdev *ibdev = container_of(rdi, struct hfi1_ibdev, rdi);
+       struct hfi1_devdata *dd = container_of(ibdev,
+                                              struct hfi1_devdata, verbs_dev);
+       return dd->pcidev;
+ }
  /*
   * Return count of units with at least one port ACTIVE.
   */
@@@ -246,7 -260,7 +260,7 @@@ static inline void *get_egrbuf(const st
   */
  inline int hfi1_rcvbuf_validate(u32 size, u8 type, u16 *encoded)
  {
 -      if (unlikely(!IS_ALIGNED(size, PAGE_SIZE)))
 +      if (unlikely(!PAGE_ALIGNED(size)))
                return 0;
        if (unlikely(size < MIN_EAGER_BUFFER))
                return 0;
@@@ -265,6 -279,8 +279,8 @@@ static void rcv_hdrerr(struct hfi1_ctxt
        u32 rte = rhf_rcv_type_err(packet->rhf);
        int lnh = be16_to_cpu(rhdr->lrh[0]) & 3;
        struct hfi1_ibport *ibp = &ppd->ibport_data;
+       struct hfi1_devdata *dd = ppd->dd;
+       struct rvt_dev_info *rdi = &dd->verbs_dev.rdi;
  
        if (packet->rhf & (RHF_VCRC_ERR | RHF_ICRC_ERR))
                return;
                        goto drop;
  
                /* Check for GRH */
-               if (lnh == HFI1_LRH_BTH)
+               if (lnh == HFI1_LRH_BTH) {
                        ohdr = &hdr->u.oth;
-               else if (lnh == HFI1_LRH_GRH) {
+               } else if (lnh == HFI1_LRH_GRH) {
                        u32 vtf;
  
                        ohdr = &hdr->u.l.oth;
                        if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION)
                                goto drop;
                        rcv_flags |= HFI1_HAS_GRH;
-               } else
+               } else {
                        goto drop;
+               }
                /* Get the destination QP number. */
-               qp_num = be32_to_cpu(ohdr->bth[1]) & HFI1_QPN_MASK;
-               if (lid < HFI1_MULTICAST_LID_BASE) {
-                       struct hfi1_qp *qp;
+               qp_num = be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK;
+               if (lid < be16_to_cpu(IB_MULTICAST_LID_BASE)) {
+                       struct rvt_qp *qp;
                        unsigned long flags;
  
                        rcu_read_lock();
-                       qp = hfi1_lookup_qpn(ibp, qp_num);
+                       qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
                        if (!qp) {
                                rcu_read_unlock();
                                goto drop;
                        spin_lock_irqsave(&qp->r_lock, flags);
  
                        /* Check for valid receive state. */
-                       if (!(ib_hfi1_state_ops[qp->state] &
-                             HFI1_PROCESS_RECV_OK)) {
-                               ibp->n_pkt_drops++;
+                       if (!(ib_rvt_state_ops[qp->state] &
+                             RVT_PROCESS_RECV_OK)) {
+                               ibp->rvp.n_pkt_drops++;
                        }
  
                        switch (qp->ibqp.qp_type) {
                if (rhf_use_egr_bfr(packet->rhf))
                        ebuf = packet->ebuf;
  
-               if (ebuf == NULL)
+               if (!ebuf)
                        goto drop; /* this should never happen */
  
                if (lnh == HFI1_LRH_BTH)
                         * Only in pre-B0 h/w is the CNP_OPCODE handled
                         * via this code path.
                         */
-                       struct hfi1_qp *qp = NULL;
+                       struct rvt_qp *qp = NULL;
                        u32 lqpn, rqpn;
                        u16 rlid;
                        u8 svc_type, sl, sc5;
                                sc5 |= 0x10;
                        sl = ibp->sc_to_sl[sc5];
  
-                       lqpn = be32_to_cpu(bth[1]) & HFI1_QPN_MASK;
+                       lqpn = be32_to_cpu(bth[1]) & RVT_QPN_MASK;
                        rcu_read_lock();
-                       qp = hfi1_lookup_qpn(ibp, lqpn);
-                       if (qp == NULL) {
+                       qp = rvt_lookup_qpn(rdi, &ibp->rvp, lqpn);
+                       if (!qp) {
                                rcu_read_unlock();
                                goto drop;
                        }
@@@ -419,9 -435,8 +435,8 @@@ drop
  }
  
  static inline void init_packet(struct hfi1_ctxtdata *rcd,
-                             struct hfi1_packet *packet)
+                              struct hfi1_packet *packet)
  {
        packet->rsize = rcd->rcvhdrqentsize; /* words */
        packet->maxcnt = rcd->rcvhdrq_cnt * packet->rsize; /* words */
        packet->rcd = rcd;
        packet->rcv_flags = 0;
  }
  
- #ifndef CONFIG_PRESCAN_RXQ
- static void prescan_rxq(struct hfi1_packet *packet) {}
- #else /* !CONFIG_PRESCAN_RXQ */
- static int prescan_receive_queue;
- static void process_ecn(struct hfi1_qp *qp, struct hfi1_ib_header *hdr,
+ static void process_ecn(struct rvt_qp *qp, struct hfi1_ib_header *hdr,
                        struct hfi1_other_headers *ohdr,
                        u64 rhf, u32 bth1, struct ib_grh *grh)
  {
        case IB_QPT_GSI:
        case IB_QPT_UD:
                rlid = be16_to_cpu(hdr->lrh[3]);
-               rqpn = be32_to_cpu(ohdr->u.ud.deth[1]) & HFI1_QPN_MASK;
+               rqpn = be32_to_cpu(ohdr->u.ud.deth[1]) & RVT_QPN_MASK;
                svc_type = IB_CC_SVCTYPE_UD;
                break;
        case IB_QPT_UC:
  
        if (bth1 & HFI1_BECN_SMASK) {
                struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-               u32 lqpn = bth1 & HFI1_QPN_MASK;
+               u32 lqpn = bth1 & RVT_QPN_MASK;
                u8 sl = ibp->sc_to_sl[sc5];
  
                process_becn(ppd, sl, rlid, lqpn, rqpn, svc_type);
@@@ -562,26 -572,31 +572,31 @@@ static inline void update_ps_mdata(stru
   * containing Explicit Congestion Notifications (FECNs, or BECNs).
   * When an ECN is found, process the Congestion Notification, and toggle
   * it off.
+  * This is implemented as a macro so the port's prescan flag can be checked
+  * cheaply, avoiding the overhead of a function call when it is not enabled.
   */
- static void prescan_rxq(struct hfi1_packet *packet)
+ #define prescan_rxq(rcd, packet) \
+       do { \
+               if (rcd->ppd->cc_prescan) \
+                       __prescan_rxq(packet); \
+       } while (0)
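
As a generic, self-contained illustration of that guard-macro pattern (the names below are hypothetical and not part of the driver):

#include <stdio.h>
#include <stdbool.h>

struct dev { bool prescan_enabled; };

static void __do_prescan(int pkt)
{
	printf("prescanning packet %d\n", pkt);
}

/* The guard expands inline at every call site, so the function call is
 * only paid when the feature is actually enabled. */
#define maybe_prescan(dev, pkt)                \
	do {                                   \
		if ((dev)->prescan_enabled)    \
			__do_prescan(pkt);     \
	} while (0)

int main(void)
{
	struct dev d = { .prescan_enabled = false };

	maybe_prescan(&d, 1);          /* disabled: no call made */
	d.prescan_enabled = true;
	maybe_prescan(&d, 2);          /* prints "prescanning packet 2" */
	return 0;
}
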
+ static void __prescan_rxq(struct hfi1_packet *packet)
  {
        struct hfi1_ctxtdata *rcd = packet->rcd;
        struct ps_mdata mdata;
  
-       if (!prescan_receive_queue)
-               return;
        init_ps_mdata(&mdata, packet);
  
        while (1) {
                struct hfi1_devdata *dd = rcd->dd;
                struct hfi1_ibport *ibp = &rcd->ppd->ibport_data;
-               __le32 *rhf_addr = (__le32 *) rcd->rcvhdrq + mdata.ps_head +
+               __le32 *rhf_addr = (__le32 *)rcd->rcvhdrq + mdata.ps_head +
                                         dd->rhf_offset;
-               struct hfi1_qp *qp;
+               struct rvt_qp *qp;
                struct hfi1_ib_header *hdr;
                struct hfi1_other_headers *ohdr;
                struct ib_grh *grh = NULL;
+               struct rvt_dev_info *rdi = &dd->verbs_dev.rdi;
                u64 rhf = rhf_to_cpu(rhf_addr);
                u32 etype = rhf_rcv_type(rhf), qpn, bth1;
                int is_ecn = 0;
                        hfi1_get_msgheader(dd, rhf_addr);
                lnh = be16_to_cpu(hdr->lrh[0]) & 3;
  
-               if (lnh == HFI1_LRH_BTH)
+               if (lnh == HFI1_LRH_BTH) {
                        ohdr = &hdr->u.oth;
-               else if (lnh == HFI1_LRH_GRH) {
+               } else if (lnh == HFI1_LRH_GRH) {
                        ohdr = &hdr->u.l.oth;
                        grh = &hdr->u.l.grh;
-               } else
+               } else {
                        goto next; /* just in case */
+               }
                bth1 = be32_to_cpu(ohdr->bth[1]);
                is_ecn = !!(bth1 & (HFI1_FECN_SMASK | HFI1_BECN_SMASK));
  
                if (!is_ecn)
                        goto next;
  
-               qpn = bth1 & HFI1_QPN_MASK;
+               qpn = bth1 & RVT_QPN_MASK;
                rcu_read_lock();
-               qp = hfi1_lookup_qpn(ibp, qpn);
+               qp = rvt_lookup_qpn(rdi, &ibp->rvp, qpn);
  
-               if (qp == NULL) {
+               if (!qp) {
                        rcu_read_unlock();
                        goto next;
                }
@@@ -633,7 -648,6 +648,6 @@@ next
                update_ps_mdata(&mdata, rcd);
        }
  }
- #endif /* CONFIG_PRESCAN_RXQ */
  
  static inline int skip_rcv_packet(struct hfi1_packet *packet, int thread)
  {
@@@ -683,8 -697,9 +697,9 @@@ static inline int process_rcv_packet(st
                 * The +2 is the size of the RHF.
                 */
                prefetch_range(packet->ebuf,
-                       packet->tlen - ((packet->rcd->rcvhdrqentsize -
-                                 (rhf_hdrq_offset(packet->rhf)+2)) * 4));
+                              packet->tlen - ((packet->rcd->rcvhdrqentsize -
+                                              (rhf_hdrq_offset(packet->rhf)
+                                               + 2)) * 4));
        }
  
        /*
                }
        }
  
-       packet->rhf_addr = (__le32 *) packet->rcd->rcvhdrq + packet->rhqoff +
+       packet->rhf_addr = (__le32 *)packet->rcd->rcvhdrq + packet->rhqoff +
                                      packet->rcd->dd->rhf_offset;
        packet->rhf = rhf_to_cpu(packet->rhf_addr);
  
@@@ -737,7 -752,6 +752,6 @@@ static inline void process_rcv_update(i
  
  static inline void finish_packet(struct hfi1_packet *packet)
  {
        /*
         * Nothing we need to free for the packet.
         *
         */
        update_usrhead(packet->rcd, packet->rcd->head, packet->updegr,
                       packet->etail, rcv_intr_dynamic, packet->numpkt);
  }
  
  static inline void process_rcv_qp_work(struct hfi1_packet *packet)
  {
        struct hfi1_ctxtdata *rcd;
-       struct hfi1_qp *qp, *nqp;
+       struct rvt_qp *qp, *nqp;
  
        rcd = packet->rcd;
        rcd->head = packet->rhqoff;
         */
        list_for_each_entry_safe(qp, nqp, &rcd->qp_wait_list, rspwait) {
                list_del_init(&qp->rspwait);
-               if (qp->r_flags & HFI1_R_RSP_DEFERED_ACK) {
-                       qp->r_flags &= ~HFI1_R_RSP_DEFERED_ACK;
+               if (qp->r_flags & RVT_R_RSP_NAK) {
+                       qp->r_flags &= ~RVT_R_RSP_NAK;
                        hfi1_send_rc_ack(rcd, qp, 0);
                }
-               if (qp->r_flags & HFI1_R_RSP_SEND) {
+               if (qp->r_flags & RVT_R_RSP_SEND) {
                        unsigned long flags;
  
-                       qp->r_flags &= ~HFI1_R_RSP_SEND;
+                       qp->r_flags &= ~RVT_R_RSP_SEND;
                        spin_lock_irqsave(&qp->s_lock, flags);
-                       if (ib_hfi1_state_ops[qp->state] &
-                                       HFI1_PROCESS_OR_FLUSH_SEND)
+                       if (ib_rvt_state_ops[qp->state] &
+                                       RVT_PROCESS_OR_FLUSH_SEND)
                                hfi1_schedule_send(qp);
                        spin_unlock_irqrestore(&qp->s_lock, flags);
                }
@@@ -799,7 -811,7 +811,7 @@@ int handle_receive_interrupt_nodma_rtai
                goto bail;
        }
  
-       prescan_rxq(&packet);
+       prescan_rxq(rcd, &packet);
  
        while (last == RCV_PKT_OK) {
                last = process_rcv_packet(&packet, thread);
@@@ -830,7 -842,7 +842,7 @@@ int handle_receive_interrupt_dma_rtail(
        }
        smp_rmb();  /* prevent speculative reads of dma'ed hdrq */
  
-       prescan_rxq(&packet);
+       prescan_rxq(rcd, &packet);
  
        while (last == RCV_PKT_OK) {
                last = process_rcv_packet(&packet, thread);
@@@ -862,6 -874,37 +874,37 @@@ static inline void set_all_dma_rtail(st
                        &handle_receive_interrupt_dma_rtail;
  }
  
+ void set_all_slowpath(struct hfi1_devdata *dd)
+ {
+       int i;
+       /* HFI1_CTRL_CTXT must always use the slow path interrupt handler */
+       for (i = HFI1_CTRL_CTXT + 1; i < dd->first_user_ctxt; i++)
+               dd->rcd[i]->do_interrupt = &handle_receive_interrupt;
+ }
+ static inline int set_armed_to_active(struct hfi1_ctxtdata *rcd,
+                                     struct hfi1_packet packet,
+                                     struct hfi1_devdata *dd)
+ {
+       struct work_struct *lsaw = &rcd->ppd->linkstate_active_work;
+       struct hfi1_message_header *hdr = hfi1_get_msgheader(packet.rcd->dd,
+                                                            packet.rhf_addr);
+       if (hdr2sc(hdr, packet.rhf) != 0xf) {
+               int hwstate = read_logical_state(dd);
+               if (hwstate != LSTATE_ACTIVE) {
+                       dd_dev_info(dd, "Unexpected link state %d\n", hwstate);
+                       return 0;
+               }
+               queue_work(rcd->ppd->hfi1_wq, lsaw);
+               return 1;
+       }
+       return 0;
+ }
  /*
   * handle_receive_interrupt - receive a packet
   * @rcd: the context
@@@ -910,17 -953,17 +953,17 @@@ int handle_receive_interrupt(struct hfi
                }
        }
  
-       prescan_rxq(&packet);
+       prescan_rxq(rcd, &packet);
  
        while (last == RCV_PKT_OK) {
-               if (unlikely(dd->do_drop && atomic_xchg(&dd->drop_packet,
-                       DROP_PACKET_OFF) == DROP_PACKET_ON)) {
+               if (unlikely(dd->do_drop &&
+                            atomic_xchg(&dd->drop_packet, DROP_PACKET_OFF) ==
+                            DROP_PACKET_ON)) {
                        dd->do_drop = 0;
  
                        /* On to the next packet */
                        packet.rhqoff += packet.rsize;
-                       packet.rhf_addr = (__le32 *) rcd->rcvhdrq +
+                       packet.rhf_addr = (__le32 *)rcd->rcvhdrq +
                                          packet.rhqoff +
                                          dd->rhf_offset;
                        packet.rhf = rhf_to_cpu(packet.rhf_addr);
                        last = skip_rcv_packet(&packet, thread);
                        skip_pkt = 0;
                } else {
+                       /* Auto activate link on non-SC15 packet receive */
+                       if (unlikely(rcd->ppd->host_link_state ==
+                                    HLS_UP_ARMED) &&
+                           set_armed_to_active(rcd, packet, dd))
+                               goto bail;
                        last = process_rcv_packet(&packet, thread);
                }
  
                        if (seq != rcd->seq_cnt)
                                last = RCV_PKT_DONE;
                        if (needset) {
-                               dd_dev_info(dd,
-                                       "Switching to NO_DMA_RTAIL\n");
+                               dd_dev_info(dd, "Switching to NO_DMA_RTAIL\n");
                                set_all_nodma_rtail(dd);
                                needset = 0;
                        }
@@@ -983,6 -1030,42 +1030,42 @@@ bail
        return last;
  }
  
+ /*
+  * We may discover in the interrupt that the hardware link state has
+  * changed from ARMED to ACTIVE (due to the arrival of a non-SC15 packet),
+  * and we need to update the driver's notion of the link state.  We cannot
+  * run set_link_state from interrupt context, so we queue this function on
+  * a workqueue.
+  *
+  * We delay the regular interrupt processing until after the state changes
+  * so that the link will be in the correct state by the time any application
+  * we wake up attempts to send a reply to any message it received.
+  * (Subsequent receive interrupts may force the wakeup before we
+  * update the link state.)
+  *
+  * The rcd is freed in hfi1_free_ctxtdata after hfi1_postinit_cleanup invokes
+  * dd->f_cleanup(dd) to disable the interrupt handler and flush workqueues,
+  * so we're safe from use-after-free of the rcd.
+  */
+ void receive_interrupt_work(struct work_struct *work)
+ {
+       struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+                                                 linkstate_active_work);
+       struct hfi1_devdata *dd = ppd->dd;
+       int i;
+       /* Received non-SC15 packet implies neighbor_normal */
+       ppd->neighbor_normal = 1;
+       set_link_state(ppd, HLS_UP_ACTIVE);
+       /*
+        * Interrupt all kernel contexts that could have had an
+        * interrupt during auto activation.
+        */
+       for (i = HFI1_CTRL_CTXT; i < dd->first_user_ctxt; i++)
+               force_recv_intr(dd->rcd[i]);
+ }
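
A minimal sketch of the defer-to-workqueue pattern described above, with hypothetical names (it shows the generic mechanism, not the driver's actual structures):

#include <linux/workqueue.h>
#include <linux/interrupt.h>

struct demo_port {
	struct work_struct linkup_work;
};

/* Runs in process context, so it may sleep while changing link state. */
static void demo_linkup_work(struct work_struct *work)
{
	struct demo_port *port = container_of(work, struct demo_port,
					      linkup_work);
	/* ... perform the ARMED -> ACTIVE transition here ... */
	(void)port;
}

/* Hard-IRQ context: cannot sleep, so only queue the transition. */
static irqreturn_t demo_irq(int irq, void *data)
{
	struct demo_port *port = data;

	schedule_work(&port->linkup_work);
	return IRQ_HANDLED;
}

/* At init time: INIT_WORK(&port->linkup_work, demo_linkup_work); */
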
  /*
   * Convert a given MTU size to the on-wire MAD packet enumeration.
   * Return -1 if the size is invalid.
@@@ -1037,9 -1120,9 +1120,9 @@@ int set_mtu(struct hfi1_pportdata *ppd
        ppd->ibmaxlen = ppd->ibmtu + lrh_max_header_bytes(ppd->dd);
  
        mutex_lock(&ppd->hls_lock);
-       if (ppd->host_link_state == HLS_UP_INIT
-                       || ppd->host_link_state == HLS_UP_ARMED
-                       || ppd->host_link_state == HLS_UP_ACTIVE)
+       if (ppd->host_link_state == HLS_UP_INIT ||
+           ppd->host_link_state == HLS_UP_ARMED ||
+           ppd->host_link_state == HLS_UP_ACTIVE)
                is_up = 1;
  
        drain = !is_ax(dd) && is_up;
@@@ -1082,79 -1165,80 +1165,80 @@@ int hfi1_set_lid(struct hfi1_pportdata 
        return 0;
  }
  
- /*
-  * Following deal with the "obviously simple" task of overriding the state
-  * of the LEDs, which normally indicate link physical and logical status.
-  * The complications arise in dealing with different hardware mappings
-  * and the board-dependent routine being called from interrupts.
-  * and then there's the requirement to _flash_ them.
-  */
- #define LED_OVER_FREQ_SHIFT 8
- #define LED_OVER_FREQ_MASK (0xFF<<LED_OVER_FREQ_SHIFT)
- /* Below is "non-zero" to force override, but both actual LEDs are off */
- #define LED_OVER_BOTH_OFF (8)
+ void shutdown_led_override(struct hfi1_pportdata *ppd)
+ {
+       struct hfi1_devdata *dd = ppd->dd;
+       /*
+        * This pairs with the memory barrier in hfi1_start_led_override to
+        * ensure that we read the correct state of LED beaconing represented
+        * by led_override_timer_active
+        */
+       smp_rmb();
+       if (atomic_read(&ppd->led_override_timer_active)) {
+               del_timer_sync(&ppd->led_override_timer);
+               atomic_set(&ppd->led_override_timer_active, 0);
+               /* Ensure the atomic_set is visible to all CPUs */
+               smp_wmb();
+       }
+       /* Hand control of the LED to the DC for normal operation */
+       write_csr(dd, DCC_CFG_LED_CNTRL, 0);
+ }
  
  static void run_led_override(unsigned long opaque)
  {
        struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)opaque;
        struct hfi1_devdata *dd = ppd->dd;
-       int timeoff;
-       int ph_idx;
+       unsigned long timeout;
+       int phase_idx;
  
        if (!(dd->flags & HFI1_INITTED))
                return;
  
-       ph_idx = ppd->led_override_phase++ & 1;
-       ppd->led_override = ppd->led_override_vals[ph_idx];
-       timeoff = ppd->led_override_timeoff;
+       phase_idx = ppd->led_override_phase & 1;
  
-       /*
-        * don't re-fire the timer if user asked for it to be off; we let
-        * it fire one more time after they turn it off to simplify
-        */
-       if (ppd->led_override_vals[0] || ppd->led_override_vals[1])
-               mod_timer(&ppd->led_override_timer, jiffies + timeoff);
+       setextled(dd, phase_idx);
+       timeout = ppd->led_override_vals[phase_idx];
+       /* Set up for next phase */
+       ppd->led_override_phase = !ppd->led_override_phase;
+       mod_timer(&ppd->led_override_timer, jiffies + timeout);
  }
  
- void hfi1_set_led_override(struct hfi1_pportdata *ppd, unsigned int val)
+ /*
+  * To have the LED blink in a particular pattern, provide timeon and timeoff
+  * in milliseconds.
+  * To turn off custom blinking and return to normal operation, use
+  * shutdown_led_override()
+  */
+ void hfi1_start_led_override(struct hfi1_pportdata *ppd, unsigned int timeon,
+                            unsigned int timeoff)
  {
-       struct hfi1_devdata *dd = ppd->dd;
-       int timeoff, freq;
-       if (!(dd->flags & HFI1_INITTED))
+       if (!(ppd->dd->flags & HFI1_INITTED))
                return;
  
-       /* First check if we are blinking. If not, use 1HZ polling */
-       timeoff = HZ;
-       freq = (val & LED_OVER_FREQ_MASK) >> LED_OVER_FREQ_SHIFT;
+       /* Convert to jiffies for direct use in timer */
+       ppd->led_override_vals[0] = msecs_to_jiffies(timeoff);
+       ppd->led_override_vals[1] = msecs_to_jiffies(timeon);
  
-       if (freq) {
-               /* For blink, set each phase from one nybble of val */
-               ppd->led_override_vals[0] = val & 0xF;
-               ppd->led_override_vals[1] = (val >> 4) & 0xF;
-               timeoff = (HZ << 4)/freq;
-       } else {
-               /* Non-blink set both phases the same. */
-               ppd->led_override_vals[0] = val & 0xF;
-               ppd->led_override_vals[1] = val & 0xF;
-       }
-       ppd->led_override_timeoff = timeoff;
+       /* Arbitrarily start from LED on phase */
+       ppd->led_override_phase = 1;
  
        /*
         * If the timer has not already been started, do so. Use a "quick"
-        * timeout so the function will be called soon, to look at our request.
+        * timeout so the handler will be called soon to look at our request.
         */
-       if (atomic_inc_return(&ppd->led_override_timer_active) == 1) {
-               /* Need to start timer */
+       if (!timer_pending(&ppd->led_override_timer)) {
                setup_timer(&ppd->led_override_timer, run_led_override,
-                               (unsigned long)ppd);
+                           (unsigned long)ppd);
                ppd->led_override_timer.expires = jiffies + 1;
                add_timer(&ppd->led_override_timer);
-       } else {
-               if (ppd->led_override_vals[0] || ppd->led_override_vals[1])
-                       mod_timer(&ppd->led_override_timer, jiffies + 1);
-               atomic_dec(&ppd->led_override_timer_active);
+               atomic_set(&ppd->led_override_timer_active, 1);
+               /* Ensure the atomic_set is visible to all CPUs */
+               smp_wmb();
        }
  }
  
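
As a usage note (the call site below is hypothetical, not part of this diff), a caller that wants the port LED to blink at roughly 1 Hz and later return it to normal operation would do something like:

/* Hypothetical caller of the new LED override API. */
hfi1_start_led_override(ppd, 500, 500);   /* 500 ms on, 500 ms off */
/* ... beaconing period ... */
shutdown_led_override(ppd);               /* hand the LED back to the DC */
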
@@@ -1184,8 -1268,8 +1268,8 @@@ int hfi1_reset_device(int unit
  
        if (!dd->kregbase || !(dd->flags & HFI1_PRESENT)) {
                dd_dev_info(dd,
-                       "Invalid unit number %u or not initialized or not present\n",
-                       unit);
+                           "Invalid unit number %u or not initialized or not present\n",
+                           unit);
                ret = -ENXIO;
                goto bail;
        }
  
        for (pidx = 0; pidx < dd->num_pports; ++pidx) {
                ppd = dd->pport + pidx;
-               if (atomic_read(&ppd->led_override_timer_active)) {
-                       /* Need to stop LED timer, _then_ shut off LEDs */
-                       del_timer_sync(&ppd->led_override_timer);
-                       atomic_set(&ppd->led_override_timer_active, 0);
-               }
  
-               /* Shut off LEDs after we are sure timer is not running */
-               ppd->led_override = LED_OVER_BOTH_OFF;
+               shutdown_led_override(ppd);
        }
        if (dd->flags & HFI1_HAS_SEND_DMA)
                sdma_exit(dd);
  
        if (ret)
                dd_dev_err(dd,
-                       "Reinitialize unit %u after reset failed with %d\n",
-                       unit, ret);
+                          "Reinitialize unit %u after reset failed with %d\n",
+                          unit, ret);
        else
                dd_dev_info(dd, "Reinitialized unit %u after resetting\n",
-                       unit);
+                           unit);
  
  bail:
        return ret;
@@@ -1282,7 -1360,7 +1360,7 @@@ int process_receive_bypass(struct hfi1_
                handle_eflags(packet);
  
        dd_dev_err(packet->rcd->dd,
-          "Bypass packets are not supported in normal operation. Dropping\n");
+                  "Bypass packets are not supported in normal operation. Dropping\n");
        return RHF_RCV_CONTINUE;
  }
  
@@@ -1320,6 -1398,6 +1398,6 @@@ int kdeth_process_eager(struct hfi1_pac
  int process_receive_invalid(struct hfi1_packet *packet)
  {
        dd_dev_err(packet->rcd->dd, "Invalid packet type %d. Dropping\n",
-               rhf_rcv_type(packet->rhf));
+                  rhf_rcv_type(packet->rhf));
        return RHF_RCV_CONTINUE;
  }
index 47dfe25847605ac2903d07f98f52bd227079534c,3f014f96f9e0a2b2a5f3d89b82e7f89ee586a371..106349fc1fb9bf5777284e15cfc83e9393ae380c
@@@ -1,12 -1,11 +1,11 @@@
  /*
+  * Copyright(c) 2015, 2016 Intel Corporation.
   *
   * This file is provided under a dual BSD/GPLv2 license.  When using or
   * redistributing this file, you may do so under either license.
   *
   * GPL LICENSE SUMMARY
   *
-  * Copyright(c) 2015 Intel Corporation.
-  *
   * This program is free software; you can redistribute it and/or modify
   * it under the terms of version 2 of the GNU General Public License as
   * published by the Free Software Foundation.
@@@ -18,8 -17,6 +17,6 @@@
   *
   * BSD LICENSE
   *
-  * Copyright(c) 2015 Intel Corporation.
-  *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
@@@ -83,7 -80,8 +80,7 @@@ static int read_efi_var(const char *nam
        if (!efi_enabled(EFI_RUNTIME_SERVICES))
                return -EOPNOTSUPP;
  
 -      uni_name = kzalloc(sizeof(efi_char16_t) * (strlen(name) + 1),
 -                         GFP_KERNEL);
 +      uni_name = kcalloc(strlen(name) + 1, sizeof(efi_char16_t), GFP_KERNEL);
        temp_buffer = kzalloc(EFI_DATA_SIZE, GFP_KERNEL);
  
        if (!uni_name || !temp_buffer) {
index 8b911e8bf0df5edfe312d9343647d655ccc73466,e460261f94b7e39b04323609ead6ecbfb75d90ee..8396dc5fb6c1899bc32b775e19877b00bfd0151f
@@@ -1,12 -1,11 +1,11 @@@
  /*
+  * Copyright(c) 2015, 2016 Intel Corporation.
   *
   * This file is provided under a dual BSD/GPLv2 license.  When using or
   * redistributing this file, you may do so under either license.
   *
   * GPL LICENSE SUMMARY
   *
-  * Copyright(c) 2015 Intel Corporation.
-  *
   * This program is free software; you can redistribute it and/or modify
   * it under the terms of version 2 of the GNU General Public License as
   * published by the Free Software Foundation.
@@@ -18,8 -17,6 +17,6 @@@
   *
   * BSD LICENSE
   *
-  * Copyright(c) 2015 Intel Corporation.
-  *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
@@@ -60,6 -57,8 +57,8 @@@
  #include "user_sdma.h"
  #include "user_exp_rcv.h"
  #include "eprom.h"
+ #include "aspm.h"
+ #include "mmu_rb.h"
  
  #undef pr_fmt
  #define pr_fmt(fmt) DRIVER_NAME ": " fmt
@@@ -96,9 -95,6 +95,6 @@@ static int user_event_ack(struct hfi1_c
  static int set_ctxt_pkey(struct hfi1_ctxtdata *, unsigned, u16);
  static int manage_rcvq(struct hfi1_ctxtdata *, unsigned, int);
  static int vma_fault(struct vm_area_struct *, struct vm_fault *);
- static int exp_tid_setup(struct file *, struct hfi1_tid_info *);
- static int exp_tid_free(struct file *, struct hfi1_tid_info *);
- static void unlock_exp_tids(struct hfi1_ctxtdata *);
  
  static const struct file_operations hfi1_file_ops = {
        .owner = THIS_MODULE,
@@@ -164,7 -160,6 +160,6 @@@ enum mmap_types 
  #define dbg(fmt, ...)                         \
        pr_info(fmt, ##__VA_ARGS__)
  
  static inline int is_valid_mmap(u64 token)
  {
        return (HFI1_MMAP_TOKEN_GET(MAGIC, token) == HFI1_MMAP_MAGIC);
@@@ -188,6 -183,7 +183,7 @@@ static ssize_t hfi1_file_write(struct f
        struct hfi1_cmd cmd;
        struct hfi1_user_info uinfo;
        struct hfi1_tid_info tinfo;
+       unsigned long addr;
        ssize_t consumed = 0, copy = 0, ret = 0;
        void *dest = NULL;
        __u64 user_val = 0;
                break;
        case HFI1_CMD_TID_UPDATE:
        case HFI1_CMD_TID_FREE:
+       case HFI1_CMD_TID_INVAL_READ:
                copy = sizeof(tinfo);
                dest = &tinfo;
                break;
                        sc_return_credits(uctxt->sc);
                break;
        case HFI1_CMD_TID_UPDATE:
-               ret = exp_tid_setup(fp, &tinfo);
+               ret = hfi1_user_exp_rcv_setup(fp, &tinfo);
                if (!ret) {
-                       unsigned long addr;
                        /*
                         * Copy the number of tidlist entries we used
                         * and the length of the buffer we registered.
                                ret = -EFAULT;
                }
                break;
+       case HFI1_CMD_TID_INVAL_READ:
+               ret = hfi1_user_exp_rcv_invalid(fp, &tinfo);
+               if (ret)
+                       break;
+               addr = (unsigned long)cmd.addr +
+                       offsetof(struct hfi1_tid_info, tidcnt);
+               if (copy_to_user((void __user *)addr, &tinfo.tidcnt,
+                                sizeof(tinfo.tidcnt)))
+                       ret = -EFAULT;
+               break;
        case HFI1_CMD_TID_FREE:
-               ret = exp_tid_free(fp, &tinfo);
+               ret = hfi1_user_exp_rcv_clear(fp, &tinfo);
+               if (ret)
+                       break;
+               addr = (unsigned long)cmd.addr +
+                       offsetof(struct hfi1_tid_info, tidcnt);
+               if (copy_to_user((void __user *)addr, &tinfo.tidcnt,
+                                sizeof(tinfo.tidcnt)))
+                       ret = -EFAULT;
                break;
        case HFI1_CMD_RECV_CTRL:
                ret = manage_rcvq(uctxt, fd->subctxt, (int)user_val);
                                break;
                        }
                        if (dd->flags & HFI1_FORCED_FREEZE) {
-                               /* Don't allow context reset if we are into
-                                * forced freeze */
+                               /*
+                                * Don't allow context reset if we are in a
+                                * forced freeze
+                                */
                                ret = -ENODEV;
                                break;
                        }
                        ret = sc_enable(sc);
                        hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_ENB,
                                     uctxt->ctxt);
-               } else
+               } else {
                        ret = sc_restart(sc);
+               }
                if (!ret)
                        sc_return_credits(sc);
                break;
        case HFI1_CMD_EP_ERASE_RANGE:
        case HFI1_CMD_EP_READ_RANGE:
        case HFI1_CMD_EP_WRITE_RANGE:
-               ret = handle_eprom_command(&cmd);
+               ret = handle_eprom_command(fp, &cmd);
                break;
        }
  
@@@ -487,7 -503,8 +503,7 @@@ static int hfi1_file_mmap(struct file *
                 * Map only the amount allocated to the context, not the
                 * entire available context's PIO space.
                 */
 -              memlen = ALIGN(uctxt->sc->credits * PIO_BLOCK_SIZE,
 -                             PAGE_SIZE);
 +              memlen = PAGE_ALIGN(uctxt->sc->credits * PIO_BLOCK_SIZE);
                flags &= ~VM_MAYREAD;
                flags |= VM_DONTCOPY | VM_DONTEXPAND;
                vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
                        goto done;
                }
                memaddr = (u64)cq->comps;
 -              memlen = ALIGN(sizeof(*cq->comps) * cq->nentries, PAGE_SIZE);
 +              memlen = PAGE_ALIGN(sizeof(*cq->comps) * cq->nentries);
                flags |= VM_IO | VM_DONTEXPAND;
                vmf = 1;
                break;
@@@ -732,6 -749,9 +748,9 @@@ static int hfi1_file_close(struct inod
        /* drain user sdma queue */
        hfi1_user_sdma_free_queues(fdata);
  
+       /* release the cpu */
+       hfi1_put_proc_affinity(dd, fdata->rec_cpu_num);
        /*
         * Clear any left over, unhandled events so the next process that
         * gets this context doesn't get confused.
        hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS |
                     HFI1_RCVCTRL_TIDFLOW_DIS |
                     HFI1_RCVCTRL_INTRAVAIL_DIS |
+                    HFI1_RCVCTRL_TAILUPD_DIS |
                     HFI1_RCVCTRL_ONE_PKT_EGR_DIS |
                     HFI1_RCVCTRL_NO_RHQ_DROP_DIS |
                     HFI1_RCVCTRL_NO_EGR_DROP_DIS, uctxt->ctxt);
        uctxt->pionowait = 0;
        uctxt->event_flags = 0;
  
-       hfi1_clear_tids(uctxt);
+       hfi1_user_exp_rcv_free(fdata);
        hfi1_clear_ctxt_pkey(dd, uctxt->ctxt);
  
-       if (uctxt->tid_pg_list)
-               unlock_exp_tids(uctxt);
        hfi1_stats.sps_ctxts--;
-       dd->freectxts++;
+       if (++dd->freectxts == dd->num_user_contexts)
+               aspm_enable_all(dd);
        mutex_unlock(&hfi1_mutex);
        hfi1_free_ctxtdata(dd, uctxt);
  done:
@@@ -826,8 -845,16 +844,16 @@@ static int assign_ctxt(struct file *fp
  
        mutex_lock(&hfi1_mutex);
        /* First, lets check if we need to setup a shared context? */
-       if (uinfo->subctxt_cnt)
+       if (uinfo->subctxt_cnt) {
+               struct hfi1_filedata *fd = fp->private_data;
                ret = find_shared_ctxt(fp, uinfo);
+               if (ret < 0)
+                       goto done_unlock;
+               if (ret)
+                       fd->rec_cpu_num = hfi1_get_proc_affinity(
+                               fd->uctxt->dd, fd->uctxt->numa_id);
+       }
  
        /*
         * We execute the following block if we couldn't find a
                i_minor = iminor(file_inode(fp)) - HFI1_USER_MINOR_BASE;
                ret = get_user_context(fp, uinfo, i_minor - 1, alg);
        }
+ done_unlock:
        mutex_unlock(&hfi1_mutex);
  done:
        return ret;
@@@ -962,7 -990,7 +989,7 @@@ static int allocate_ctxt(struct file *f
        struct hfi1_filedata *fd = fp->private_data;
        struct hfi1_ctxtdata *uctxt;
        unsigned ctxt;
-       int ret;
+       int ret, numa;
  
        if (dd->flags & HFI1_FROZEN) {
                /*
        if (ctxt == dd->num_rcv_contexts)
                return -EBUSY;
  
-       uctxt = hfi1_create_ctxtdata(dd->pport, ctxt);
+       fd->rec_cpu_num = hfi1_get_proc_affinity(dd, -1);
+       if (fd->rec_cpu_num != -1)
+               numa = cpu_to_node(fd->rec_cpu_num);
+       else
+               numa = numa_node_id();
+       uctxt = hfi1_create_ctxtdata(dd->pport, ctxt, numa);
        if (!uctxt) {
                dd_dev_err(dd,
                           "Unable to allocate ctxtdata memory, failing open\n");
                return -ENOMEM;
        }
+       hfi1_cdbg(PROC, "[%u:%u] pid %u assigned to CPU %d (NUMA %u)",
+                 uctxt->ctxt, fd->subctxt, current->pid, fd->rec_cpu_num,
+                 uctxt->numa_id);
        /*
         * Allocate and enable a PIO send context.
         */
        uctxt->sc = sc_alloc(dd, SC_USER, uctxt->rcvhdrqentsize,
-                            uctxt->numa_id);
+                            uctxt->dd->node);
        if (!uctxt->sc)
                return -ENOMEM;
  
        INIT_LIST_HEAD(&uctxt->sdma_queues);
        spin_lock_init(&uctxt->sdma_qlock);
        hfi1_stats.sps_ctxts++;
-       dd->freectxts--;
+       /*
+        * Disable ASPM when there are open user/PSM contexts to avoid
+        * issues with ASPM L1 exit latency
+        */
+       if (dd->freectxts-- == dd->num_user_contexts)
+               aspm_disable_all(dd);
        fd->uctxt = uctxt;
  
        return 0;
  static int init_subctxts(struct hfi1_ctxtdata *uctxt,
                         const struct hfi1_user_info *uinfo)
  {
-       int ret = 0;
        unsigned num_subctxts;
  
        num_subctxts = uinfo->subctxt_cnt;
-       if (num_subctxts > HFI1_MAX_SHARED_CTXTS) {
-               ret = -EINVAL;
-               goto bail;
-       }
+       if (num_subctxts > HFI1_MAX_SHARED_CTXTS)
+               return -EINVAL;
  
        uctxt->subctxt_cnt = uinfo->subctxt_cnt;
        uctxt->subctxt_id = uinfo->subctxt_id;
        uctxt->active_slaves = 1;
        uctxt->redirect_seq_cnt = 1;
        set_bit(HFI1_CTXT_MASTER_UNINIT, &uctxt->event_flags);
- bail:
-       return ret;
+       return 0;
  }
  
  static int setup_subctxt(struct hfi1_ctxtdata *uctxt)
@@@ -1105,10 -1144,10 +1143,10 @@@ static int user_init(struct file *fp
         * has done it.
         */
        if (fd->subctxt) {
-               ret = wait_event_interruptible(uctxt->wait,
-                       !test_bit(HFI1_CTXT_MASTER_UNINIT,
-                       &uctxt->event_flags));
-               goto done;
+               ret = wait_event_interruptible(uctxt->wait, !test_bit(
+                                              HFI1_CTXT_MASTER_UNINIT,
+                                              &uctxt->event_flags));
+               goto expected;
        }
  
        /* initialize poll variables... */
                rcvctrl_ops |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
        if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_RHQ_FULL))
                rcvctrl_ops |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
+       /*
+        * The RcvCtxtCtrl.TailUpd bit has to be explicitly written.
+        * We can't rely on the correct value to be set from prior
+        * uses of the chip or ctxt. Therefore, add the rcvctrl op
+        * for both cases.
+        */
        if (HFI1_CAP_KGET_MASK(uctxt->flags, DMA_RTAIL))
                rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_ENB;
+       else
+               rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_DIS;
        hfi1_rcvctrl(uctxt->dd, rcvctrl_ops, uctxt->ctxt);
  
        /* Notify any waiting slaves */
                clear_bit(HFI1_CTXT_MASTER_UNINIT, &uctxt->event_flags);
                wake_up(&uctxt->wait);
        }
-       ret = 0;
  
+ expected:
+       /*
+        * Expected receive has to be setup for all processes (including
+        * shared contexts). However, it has to be done after the master
+        * context has been fully configured as it depends on the
+        * eager/expected split of the RcvArray entries.
+        * Setting it up here ensures that the subcontexts will be waiting
+        * (due to the above wait_event_interruptible()) until the master
+        * is set up.
+        */
+       ret = hfi1_user_exp_rcv_init(fp);
  done:
        return ret;
  }
@@@ -1226,46 -1283,6 +1282,6 @@@ static int setup_ctxt(struct file *fp
                        if (ret)
                                goto done;
                }
-               /* Setup Expected Rcv memories */
-               uctxt->tid_pg_list = vzalloc(uctxt->expected_count *
-                                            sizeof(struct page **));
-               if (!uctxt->tid_pg_list) {
-                       ret = -ENOMEM;
-                       goto done;
-               }
-               uctxt->physshadow = vzalloc(uctxt->expected_count *
-                                           sizeof(*uctxt->physshadow));
-               if (!uctxt->physshadow) {
-                       ret = -ENOMEM;
-                       goto done;
-               }
-               /* allocate expected TID map and initialize the cursor */
-               atomic_set(&uctxt->tidcursor, 0);
-               uctxt->numtidgroups = uctxt->expected_count /
-                       dd->rcv_entries.group_size;
-               uctxt->tidmapcnt = uctxt->numtidgroups / BITS_PER_LONG +
-                       !!(uctxt->numtidgroups % BITS_PER_LONG);
-               uctxt->tidusemap = kzalloc_node(uctxt->tidmapcnt *
-                                               sizeof(*uctxt->tidusemap),
-                                               GFP_KERNEL, uctxt->numa_id);
-               if (!uctxt->tidusemap) {
-                       ret = -ENOMEM;
-                       goto done;
-               }
-               /*
-                * In case that the number of groups is not a multiple of
-                * 64 (the number of groups in a tidusemap element), mark
-                * the extra ones as used. This will effectively make them
-                * permanently used and should never be assigned. Otherwise,
-                * the code which checks how many free groups we have will
-                * get completely confused about the state of the bits.
-                */
-               if (uctxt->numtidgroups % BITS_PER_LONG)
-                       uctxt->tidusemap[uctxt->tidmapcnt - 1] =
-                               ~((1ULL << (uctxt->numtidgroups %
-                                           BITS_PER_LONG)) - 1);
-               trace_hfi1_exp_tid_map(uctxt->ctxt, fd->subctxt, 0,
-                                      uctxt->tidusemap, uctxt->tidmapcnt);
        }
        ret = hfi1_user_sdma_alloc_queues(uctxt, fp);
        if (ret)
@@@ -1391,8 -1408,9 +1407,9 @@@ static unsigned int poll_next(struct fi
                set_bit(HFI1_CTXT_WAITING_RCV, &uctxt->event_flags);
                hfi1_rcvctrl(dd, HFI1_RCVCTRL_INTRAVAIL_ENB, uctxt->ctxt);
                pollflag = 0;
-       } else
+       } else {
                pollflag = POLLIN | POLLRDNORM;
+       }
        spin_unlock_irq(&dd->uctxt_lock);
  
        return pollflag;
@@@ -1470,8 -1488,9 +1487,9 @@@ static int manage_rcvq(struct hfi1_ctxt
                if (uctxt->rcvhdrtail_kvaddr)
                        clear_rcvhdrtail(uctxt);
                rcvctrl_op = HFI1_RCVCTRL_CTXT_ENB;
-       } else
+       } else {
                rcvctrl_op = HFI1_RCVCTRL_CTXT_DIS;
+       }
        hfi1_rcvctrl(dd, rcvctrl_op, uctxt->ctxt);
        /* always; new head should be equal to new tail; see above */
  bail:
@@@ -1504,367 -1523,6 +1522,6 @@@ static int user_event_ack(struct hfi1_c
        return 0;
  }
  
- #define num_user_pages(vaddr, len)                                    \
-       (1 + (((((unsigned long)(vaddr) +                               \
-                (unsigned long)(len) - 1) & PAGE_MASK) -               \
-              ((unsigned long)vaddr & PAGE_MASK)) >> PAGE_SHIFT))
- /**
-  * tzcnt - count the number of trailing zeros in a 64bit value
-  * @value: the value to be examined
-  *
-  * Returns the number of trailing least significant zeros in the
-  * the input value. If the value is zero, return the number of
-  * bits of the value.
-  */
- static inline u8 tzcnt(u64 value)
- {
-       return value ? __builtin_ctzl(value) : sizeof(value) * 8;
- }
- static inline unsigned num_free_groups(unsigned long map, u16 *start)
- {
-       unsigned free;
-       u16 bitidx = *start;
-       if (bitidx >= BITS_PER_LONG)
-               return 0;
-       /* "Turn off" any bits set before our bit index */
-       map &= ~((1ULL << bitidx) - 1);
-       free = tzcnt(map) - bitidx;
-       while (!free && bitidx < BITS_PER_LONG) {
-               /* Zero out the last set bit so we look at the rest */
-               map &= ~(1ULL << bitidx);
-               /*
-                * Account for the previously checked bits and advance
-                * the bit index. We don't have to check for bitidx
-                * getting bigger than BITS_PER_LONG here as it would
-                * mean extra instructions that we don't need. If it
-                * did happen, it would push free to a negative value
-                * which will break the loop.
-                */
-               free = tzcnt(map) - ++bitidx;
-       }
-       *start = bitidx;
-       return free;
- }
- static int exp_tid_setup(struct file *fp, struct hfi1_tid_info *tinfo)
- {
-       int ret = 0;
-       struct hfi1_filedata *fd = fp->private_data;
-       struct hfi1_ctxtdata *uctxt = fd->uctxt;
-       struct hfi1_devdata *dd = uctxt->dd;
-       unsigned tid, mapped = 0, npages, ngroups, exp_groups,
-               tidpairs = uctxt->expected_count / 2;
-       struct page **pages;
-       unsigned long vaddr, tidmap[uctxt->tidmapcnt];
-       dma_addr_t *phys;
-       u32 tidlist[tidpairs], pairidx = 0, tidcursor;
-       u16 useidx, idx, bitidx, tidcnt = 0;
-       vaddr = tinfo->vaddr;
-       if (offset_in_page(vaddr)) {
-               ret = -EINVAL;
-               goto bail;
-       }
-       npages = num_user_pages(vaddr, tinfo->length);
-       if (!npages) {
-               ret = -EINVAL;
-               goto bail;
-       }
-       if (!access_ok(VERIFY_WRITE, (void __user *)vaddr,
-                      npages * PAGE_SIZE)) {
-               dd_dev_err(dd, "Fail vaddr %p, %u pages, !access_ok\n",
-                          (void *)vaddr, npages);
-               ret = -EFAULT;
-               goto bail;
-       }
-       memset(tidmap, 0, sizeof(tidmap[0]) * uctxt->tidmapcnt);
-       memset(tidlist, 0, sizeof(tidlist[0]) * tidpairs);
-       exp_groups = uctxt->expected_count / dd->rcv_entries.group_size;
-       /* which group set do we look at first? */
-       tidcursor = atomic_read(&uctxt->tidcursor);
-       useidx = (tidcursor >> 16) & 0xffff;
-       bitidx = tidcursor & 0xffff;
-       /*
-        * Keep going until we've mapped all pages or we've exhausted all
-        * RcvArray entries.
-        * This iterates over the number of tidmaps + 1
-        * (idx <= uctxt->tidmapcnt) so we check the bitmap which we
-        * started from one more time for any free bits before the
-        * starting point bit.
-        */
-       for (mapped = 0, idx = 0;
-            mapped < npages && idx <= uctxt->tidmapcnt;) {
-               u64 i, offset = 0;
-               unsigned free, pinned, pmapped = 0, bits_used;
-               u16 grp;
-               /*
-                * "Reserve" the needed group bits under lock so other
-                * processes can't step in the middle of it. Once
-                * reserved, we don't need the lock anymore since we
-                * are guaranteed the groups.
-                */
-               spin_lock(&uctxt->exp_lock);
-               if (uctxt->tidusemap[useidx] == -1ULL ||
-                   bitidx >= BITS_PER_LONG) {
-                       /* no free groups in the set, use the next */
-                       useidx = (useidx + 1) % uctxt->tidmapcnt;
-                       idx++;
-                       bitidx = 0;
-                       spin_unlock(&uctxt->exp_lock);
-                       continue;
-               }
-               ngroups = ((npages - mapped) / dd->rcv_entries.group_size) +
-                       !!((npages - mapped) % dd->rcv_entries.group_size);
-               /*
-                * If we've gotten here, the current set of groups does have
-                * one or more free groups.
-                */
-               free = num_free_groups(uctxt->tidusemap[useidx], &bitidx);
-               if (!free) {
-                       /*
-                        * Despite the check above, free could still come back
-                        * as 0 because we don't check the entire bitmap but
-                        * we start from bitidx.
-                        */
-                       spin_unlock(&uctxt->exp_lock);
-                       continue;
-               }
-               bits_used = min(free, ngroups);
-               tidmap[useidx] |= ((1ULL << bits_used) - 1) << bitidx;
-               uctxt->tidusemap[useidx] |= tidmap[useidx];
-               spin_unlock(&uctxt->exp_lock);
-               /*
-                * At this point, we know where in the map we have free bits.
-                * properly offset into the various "shadow" arrays and compute
-                * the RcvArray entry index.
-                */
-               offset = ((useidx * BITS_PER_LONG) + bitidx) *
-                       dd->rcv_entries.group_size;
-               pages = uctxt->tid_pg_list + offset;
-               phys = uctxt->physshadow + offset;
-               tid = uctxt->expected_base + offset;
-               /* Calculate how many pages we can pin based on free bits */
-               pinned = min((bits_used * dd->rcv_entries.group_size),
-                            (npages - mapped));
-               /*
-                * Now that we know how many free RcvArray entries we have,
-                * we can pin that many user pages.
-                */
-               ret = hfi1_acquire_user_pages(vaddr + (mapped * PAGE_SIZE),
-                                             pinned, true, pages);
-               if (ret) {
-                       /*
-                        * We can't continue because the pages array won't be
-                        * initialized. This should never happen,
-                        * unless perhaps the user has mpin'ed the pages
-                        * themselves.
-                        */
-                       dd_dev_info(dd,
-                                   "Failed to lock addr %p, %u pages: errno %d\n",
-                                   (void *) vaddr, pinned, -ret);
-                       /*
-                        * Let go of the bits that we reserved since we are not
-                        * going to use them.
-                        */
-                       spin_lock(&uctxt->exp_lock);
-                       uctxt->tidusemap[useidx] &=
-                               ~(((1ULL << bits_used) - 1) << bitidx);
-                       spin_unlock(&uctxt->exp_lock);
-                       goto done;
-               }
-               /*
-                * How many groups do we need based on how many pages we have
-                * pinned?
-                */
-               ngroups = (pinned / dd->rcv_entries.group_size) +
-                       !!(pinned % dd->rcv_entries.group_size);
-               /*
-                * Keep programming RcvArray entries for all the <ngroups> free
-                * groups.
-                */
-               for (i = 0, grp = 0; grp < ngroups; i++, grp++) {
-                       unsigned j;
-                       u32 pair_size = 0, tidsize;
-                       /*
-                        * This inner loop will program an entire group or the
-                        * array of pinned pages (which ever limit is hit
-                        * first).
-                        */
-                       for (j = 0; j < dd->rcv_entries.group_size &&
-                                    pmapped < pinned; j++, pmapped++, tid++) {
-                               tidsize = PAGE_SIZE;
-                               phys[pmapped] = hfi1_map_page(dd->pcidev,
-                                                  pages[pmapped], 0,
-                                                  tidsize, PCI_DMA_FROMDEVICE);
-                               trace_hfi1_exp_rcv_set(uctxt->ctxt,
-                                                      fd->subctxt,
-                                                      tid, vaddr,
-                                                      phys[pmapped],
-                                                      pages[pmapped]);
-                               /*
-                                * Each RcvArray entry is programmed with one
-                                * page worth of memory. This will handle
-                                * the 8K MTU as well as anything smaller
-                                * due to the fact that both entries in the
-                                * RcvTidPair are programmed with a page.
-                                * PSM currently does not handle anything
-                                * bigger than 8K MTU, so should we even worry
-                                * about 10K here?
-                                */
-                               hfi1_put_tid(dd, tid, PT_EXPECTED,
-                                            phys[pmapped],
-                                            ilog2(tidsize >> PAGE_SHIFT) + 1);
-                               pair_size += tidsize >> PAGE_SHIFT;
-                               EXP_TID_RESET(tidlist[pairidx], LEN, pair_size);
-                               if (!(tid % 2)) {
-                                       tidlist[pairidx] |=
-                                          EXP_TID_SET(IDX,
-                                               (tid - uctxt->expected_base)
-                                                      / 2);
-                                       tidlist[pairidx] |=
-                                               EXP_TID_SET(CTRL, 1);
-                                       tidcnt++;
-                               } else {
-                                       tidlist[pairidx] |=
-                                               EXP_TID_SET(CTRL, 2);
-                                       pair_size = 0;
-                                       pairidx++;
-                               }
-                       }
-                       /*
-                        * We've programmed the entire group (or as much of the
-                        * group as we'll use). Now, it's time to push it out...
-                        */
-                       flush_wc();
-               }
-               mapped += pinned;
-               atomic_set(&uctxt->tidcursor,
-                          (((useidx & 0xffffff) << 16) |
-                           ((bitidx + bits_used) & 0xffffff)));
-       }
-       trace_hfi1_exp_tid_map(uctxt->ctxt, fd->subctxt, 0, uctxt->tidusemap,
-                              uctxt->tidmapcnt);
- done:
-       /* If we've mapped anything, copy relevant info to user */
-       if (mapped) {
-               if (copy_to_user((void __user *)(unsigned long)tinfo->tidlist,
-                                tidlist, sizeof(tidlist[0]) * tidcnt)) {
-                       ret = -EFAULT;
-                       goto done;
-               }
-               /* copy TID info to user */
-               if (copy_to_user((void __user *)(unsigned long)tinfo->tidmap,
-                                tidmap, sizeof(tidmap[0]) * uctxt->tidmapcnt))
-                       ret = -EFAULT;
-       }
- bail:
-       /*
-        * Calculate mapped length. New Exp TID protocol does not "unwind" and
-        * report an error if it can't map the entire buffer. It just reports
-        * the length that was mapped.
-        */
-       tinfo->length = mapped * PAGE_SIZE;
-       tinfo->tidcnt = tidcnt;
-       return ret;
- }
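The removed exp_tid_setup() above reserves RcvArray groups by marking a run of bits in tidusemap under exp_lock, then undoes the reservation if the subsequent page pinning fails. A minimal sketch of that reserve/undo pattern using generic kernel bitmap helpers follows; the function, its parameters, and the single-word map are illustrative assumptions rather than the driver's real layout, and "want" is assumed to be smaller than BITS_PER_LONG.

    #include <linux/bitops.h>
    #include <linux/kernel.h>
    #include <linux/spinlock.h>

    /* Sketch: reserve a run of free bits and return the mask so a failure
     * path can undo the reservation with *usemap &= ~mask under the lock. */
    static unsigned long reserve_groups(unsigned long *usemap, spinlock_t *lock,
                                        unsigned int want, unsigned int *bitidx)
    {
            unsigned long mask;
            unsigned int start, free;

            spin_lock(lock);
            start = find_first_zero_bit(usemap, BITS_PER_LONG);
            if (start >= BITS_PER_LONG) {
                    spin_unlock(lock);
                    return 0;                       /* nothing free */
            }
            /* length of the free run: up to the next used bit or end of word */
            free = find_next_bit(usemap, BITS_PER_LONG, start) - start;
            free = min(free, want);
            mask = ((1UL << free) - 1) << start;
            *usemap |= mask;                        /* mark the run as used */
            spin_unlock(lock);

            *bitidx = start;
            return mask;
    }

This mirrors the shape of the removed code: reserve under the lock, drop the lock to pin pages, and clear exactly the reserved mask if the pinning fails.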
- static int exp_tid_free(struct file *fp, struct hfi1_tid_info *tinfo)
- {
-       struct hfi1_filedata *fd = fp->private_data;
-       struct hfi1_ctxtdata *uctxt = fd->uctxt;
-       struct hfi1_devdata *dd = uctxt->dd;
-       unsigned long tidmap[uctxt->tidmapcnt];
-       struct page **pages;
-       dma_addr_t *phys;
-       u16 idx, bitidx, tid;
-       int ret = 0;
-       if (copy_from_user(&tidmap, (void __user *)(unsigned long)
-                          tinfo->tidmap,
-                          sizeof(tidmap[0]) * uctxt->tidmapcnt)) {
-               ret = -EFAULT;
-               goto done;
-       }
-       for (idx = 0; idx < uctxt->tidmapcnt; idx++) {
-               unsigned long map;
-               bitidx = 0;
-               if (!tidmap[idx])
-                       continue;
-               map = tidmap[idx];
-               while ((bitidx = tzcnt(map)) < BITS_PER_LONG) {
-                       int i, pcount = 0;
-                       struct page *pshadow[dd->rcv_entries.group_size];
-                       unsigned offset = ((idx * BITS_PER_LONG) + bitidx) *
-                               dd->rcv_entries.group_size;
-                       pages = uctxt->tid_pg_list + offset;
-                       phys = uctxt->physshadow + offset;
-                       tid = uctxt->expected_base + offset;
-                       for (i = 0; i < dd->rcv_entries.group_size;
-                            i++, tid++) {
-                               if (pages[i]) {
-                                       hfi1_put_tid(dd, tid, PT_INVALID,
-                                                     0, 0);
-                                       trace_hfi1_exp_rcv_free(uctxt->ctxt,
-                                                               fd->subctxt,
-                                                               tid, phys[i],
-                                                               pages[i]);
-                                       pci_unmap_page(dd->pcidev, phys[i],
-                                             PAGE_SIZE, PCI_DMA_FROMDEVICE);
-                                       pshadow[pcount] = pages[i];
-                                       pages[i] = NULL;
-                                       pcount++;
-                                       phys[i] = 0;
-                               }
-                       }
-                       flush_wc();
-                       hfi1_release_user_pages(pshadow, pcount, true);
-                       clear_bit(bitidx, &uctxt->tidusemap[idx]);
-                       map &= ~(1ULL<<bitidx);
-               }
-       }
-       trace_hfi1_exp_tid_map(uctxt->ctxt, fd->subctxt, 1, uctxt->tidusemap,
-                              uctxt->tidmapcnt);
- done:
-       return ret;
- }
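exp_tid_free() above tears each group down in a deliberate order: invalidate the RcvArray entry first, flush the write-combining buffers, then drop the DMA mapping, and only then unpin the user page. A hedged sketch of that ordering for a single entry, reusing the helper names that appear in the removed code but wrapped in an invented function:

    /* Sketch only: release one expected-receive TID entry safely. */
    static void free_one_tid(struct hfi1_devdata *dd, u32 tid,
                             struct page *page, dma_addr_t phys)
    {
            /* 1. stop the hardware from using (and DMA-ing into) the entry */
            hfi1_put_tid(dd, tid, PT_INVALID, 0, 0);
            /* 2. make sure the invalidate is actually posted to the chip */
            flush_wc();
            /* 3. the DMA mapping can now be torn down... */
            pci_unmap_page(dd->pcidev, phys, PAGE_SIZE, PCI_DMA_FROMDEVICE);
            /* 4. ...and finally the pinned user page can be released */
            hfi1_release_user_pages(&page, 1, true);
    }

Reversing steps 1 and 3 would leave a window where the chip could DMA into an unmapped or recycled page, which is why the removed code keeps this order.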
- static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt)
- {
-       struct hfi1_devdata *dd = uctxt->dd;
-       unsigned tid;
-       dd_dev_info(dd, "ctxt %u unlocking any locked expTID pages\n",
-                   uctxt->ctxt);
-       for (tid = 0; tid < uctxt->expected_count; tid++) {
-               struct page *p = uctxt->tid_pg_list[tid];
-               dma_addr_t phys;
-               if (!p)
-                       continue;
-               phys = uctxt->physshadow[tid];
-               uctxt->physshadow[tid] = 0;
-               uctxt->tid_pg_list[tid] = NULL;
-               pci_unmap_page(dd->pcidev, phys, PAGE_SIZE, PCI_DMA_FROMDEVICE);
-               hfi1_release_user_pages(&p, 1, true);
-       }
- }
  static int set_ctxt_pkey(struct hfi1_ctxtdata *uctxt, unsigned subctxt,
                         u16 pkey)
  {
@@@ -1933,10 -1591,9 +1590,9 @@@ static loff_t ui_lseek(struct file *fil
        return filp->f_pos;
  }
  
  /* NOTE: assumes unsigned long is 8 bytes */
  static ssize_t ui_read(struct file *filp, char __user *buf, size_t count,
-                       loff_t *f_pos)
+                      loff_t *f_pos)
  {
        struct hfi1_devdata *dd = filp->private_data;
        void __iomem *base = dd->kregbase;
                 * them.  These registers are defined as having a read value
                 * of 0.
                 */
-               else if (csr_off == ASIC_GPIO_CLEAR
-                               || csr_off == ASIC_GPIO_FORCE
-                               || csr_off == ASIC_QSFP1_CLEAR
-                               || csr_off == ASIC_QSFP1_FORCE
-                               || csr_off == ASIC_QSFP2_CLEAR
-                               || csr_off == ASIC_QSFP2_FORCE)
+               else if (csr_off == ASIC_GPIO_CLEAR ||
+                        csr_off == ASIC_GPIO_FORCE ||
+                        csr_off == ASIC_QSFP1_CLEAR ||
+                        csr_off == ASIC_QSFP1_FORCE ||
+                        csr_off == ASIC_QSFP2_CLEAR ||
+                        csr_off == ASIC_QSFP2_FORCE)
                        data = 0;
                else if (csr_off >= barlen) {
                        /*
index 02df291eb172c23b44b9a44adc16421107326ff8,deabb0812023e8c899fc12368228c6fdc39f27bc..cfcdc16b41c371a18a4e1cec24b51c4c421b7e80
@@@ -1,12 -1,11 +1,11 @@@
  /*
+  * Copyright(c) 2015, 2016 Intel Corporation.
   *
   * This file is provided under a dual BSD/GPLv2 license.  When using or
   * redistributing this file, you may do so under either license.
   *
   * GPL LICENSE SUMMARY
   *
-  * Copyright(c) 2015 Intel Corporation.
-  *
   * This program is free software; you can redistribute it and/or modify
   * it under the terms of version 2 of the GNU General Public License as
   * published by the Free Software Foundation.
@@@ -18,8 -17,6 +17,6 @@@
   *
   * BSD LICENSE
   *
-  * Copyright(c) 2015 Intel Corporation.
-  *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
@@@ -56,6 -53,7 +53,7 @@@
  #include <linux/module.h>
  #include <linux/printk.h>
  #include <linux/hrtimer.h>
+ #include <rdma/rdma_vt.h>
  
  #include "hfi.h"
  #include "device.h"
@@@ -65,6 -63,7 +63,7 @@@
  #include "sdma.h"
  #include "debugfs.h"
  #include "verbs.h"
+ #include "aspm.h"
  
  #undef pr_fmt
  #define pr_fmt(fmt) DRIVER_NAME ": " fmt
@@@ -75,6 -74,7 +74,7 @@@
  #define HFI1_MIN_USER_CTXT_BUFCNT 7
  
  #define HFI1_MIN_HDRQ_EGRBUF_CNT 2
+ #define HFI1_MAX_HDRQ_EGRBUF_CNT 16352
  #define HFI1_MIN_EAGER_BUFFER_SIZE (4 * 1024) /* 4KB */
  #define HFI1_MAX_EAGER_BUFFER_SIZE (256 * 1024) /* 256KB */
  
@@@ -87,9 -87,9 +87,9 @@@ module_param_named(num_user_contexts, n
  MODULE_PARM_DESC(
        num_user_contexts, "Set max number of user contexts to use");
  
- u8 krcvqs[RXE_NUM_DATA_VL];
+ uint krcvqs[RXE_NUM_DATA_VL];
  int krcvqsset;
- module_param_array(krcvqs, byte, &krcvqsset, S_IRUGO);
+ module_param_array(krcvqs, uint, &krcvqsset, S_IRUGO);
  MODULE_PARM_DESC(krcvqs, "Array of the number of non-control kernel receive queues by VL");
  
  /* computed based on above array */
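The krcvqs change above only widens the array element type from byte to uint; the module_param_array() plumbing is unchanged. For readers who have not used it, here is a small self-contained sketch of an array module parameter; the module and parameter names are made up, and the third argument receives the number of entries the user actually supplied, which is what krcvqsset records above.

    #include <linux/init.h>
    #include <linux/module.h>
    #include <linux/moduleparam.h>

    #define NUM_VLS 8

    static uint my_krcvqs[NUM_VLS];
    static int my_krcvqs_set;       /* how many entries the user supplied */
    module_param_array(my_krcvqs, uint, &my_krcvqs_set, S_IRUGO);
    MODULE_PARM_DESC(my_krcvqs, "Example: receive queues per VL");

    static int __init example_init(void)
    {
            int i;

            for (i = 0; i < my_krcvqs_set; i++)
                    pr_info("VL%d: %u receive queues\n", i, my_krcvqs[i]);
            return 0;
    }
    module_init(example_init);
    MODULE_LICENSE("Dual BSD/GPL");

Loading such a module with my_krcvqs=4,4,2 would fill the first three slots and leave my_krcvqs_set at 3.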
@@@ -128,16 -128,12 +128,12 @@@ int hfi1_create_ctxts(struct hfi1_devda
  {
        unsigned i;
        int ret;
-       int local_node_id = pcibus_to_node(dd->pcidev->bus);
  
        /* Control context has to be always 0 */
        BUILD_BUG_ON(HFI1_CTRL_CTXT != 0);
  
-       if (local_node_id < 0)
-               local_node_id = numa_node_id();
-       dd->assigned_node_id = local_node_id;
-       dd->rcd = kcalloc(dd->num_rcv_contexts, sizeof(*dd->rcd), GFP_KERNEL);
+       dd->rcd = kzalloc_node(dd->num_rcv_contexts * sizeof(*dd->rcd),
+                              GFP_KERNEL, dd->node);
        if (!dd->rcd)
                goto nomem;
  
                struct hfi1_ctxtdata *rcd;
  
                ppd = dd->pport + (i % dd->num_pports);
-               rcd = hfi1_create_ctxtdata(ppd, i);
+               rcd = hfi1_create_ctxtdata(ppd, i, dd->node);
                if (!rcd) {
                        dd_dev_err(dd,
-                               "Unable to allocate kernel receive context, failing\n");
+                                  "Unable to allocate kernel receive context, failing\n");
                        goto nomem;
                }
                /*
                rcd->sc = sc_alloc(dd, SC_ACK, rcd->rcvhdrqentsize, dd->node);
                if (!rcd->sc) {
                        dd_dev_err(dd,
-                               "Unable to allocate kernel send context, failing\n");
+                                  "Unable to allocate kernel send context, failing\n");
                        dd->rcd[rcd->ctxt] = NULL;
                        hfi1_free_ctxtdata(dd, rcd);
                        goto nomem;
                }
        }
  
+       /*
+        * Initialize aspm, to be done after gen3 transition and setting up
+        * contexts and before enabling interrupts
+        */
+       aspm_init(dd);
        return 0;
  nomem:
        ret = -ENOMEM;
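The hunk above replaces a plain kcalloc() of the receive-context array with kzalloc_node(), so the table lands on the NUMA node recorded in dd->node rather than on whichever node the probing CPU happens to run on; the pcibus_to_node()/numa_node_id() fallback that used to live here is dropped from this function. A small sketch of the general pattern, with hypothetical names:

    #include <linux/pci.h>
    #include <linux/slab.h>
    #include <linux/topology.h>

    /* Sketch: allocate a per-device table close to the device itself. */
    static void **alloc_ctxt_table(struct pci_dev *pdev, unsigned int nctxts)
    {
            int node = pcibus_to_node(pdev->bus);

            if (node < 0)           /* no affinity information: fall back */
                    node = numa_node_id();

            return kzalloc_node(nctxts * sizeof(void *), GFP_KERNEL, node);
    }

Allocating the table on the device's node keeps the receive fast path from crossing the interconnect on every context lookup.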
@@@ -201,7 -203,8 +203,8 @@@ bail
  /*
   * Common code for user and kernel context setup.
   */
- struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, u32 ctxt)
+ struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, u32 ctxt,
+                                          int numa)
  {
        struct hfi1_devdata *dd = ppd->dd;
        struct hfi1_ctxtdata *rcd;
                rcd->cnt = 1;
                rcd->ctxt = ctxt;
                dd->rcd[ctxt] = rcd;
-               rcd->numa_id = numa_node_id();
+               rcd->numa_id = numa;
                rcd->rcv_array_groups = dd->rcv_entries.ngroups;
  
-               spin_lock_init(&rcd->exp_lock);
+               mutex_init(&rcd->exp_lock);
  
                /*
                 * Calculate the context's RcvArray entry starting point.
                /* Validate and initialize Rcv Hdr Q variables */
                if (rcvhdrcnt % HDRQ_INCREMENT) {
                        dd_dev_err(dd,
-                                  "ctxt%u: header queue count %d must be divisible by %d\n",
+                                  "ctxt%u: header queue count %d must be divisible by %lu\n",
                                   rcd->ctxt, rcvhdrcnt, HDRQ_INCREMENT);
                        goto bail;
                }
        }
        return rcd;
  bail:
 -      kfree(rcd->opstats);
        kfree(rcd->egrbufs.rcvtids);
        kfree(rcd->egrbufs.buffers);
        kfree(rcd);
@@@ -379,7 -383,7 +382,7 @@@ void set_link_ipg(struct hfi1_pportdat
  
        cc_state = get_cc_state(ppd);
  
-       if (cc_state == NULL)
+       if (!cc_state)
                /*
                 * This should _never_ happen - rcu_read_lock() is held,
                 * and set_link_ipg() should not be called if cc_state
@@@ -431,7 -435,7 +434,7 @@@ static enum hrtimer_restart cca_timer_f
  
        cc_state = get_cc_state(ppd);
  
-       if (cc_state == NULL) {
+       if (!cc_state) {
                rcu_read_unlock();
                return HRTIMER_NORESTART;
        }
@@@ -493,14 -497,19 +496,19 @@@ void hfi1_init_pportdata(struct pci_de
        INIT_WORK(&ppd->link_vc_work, handle_verify_cap);
        INIT_WORK(&ppd->link_up_work, handle_link_up);
        INIT_WORK(&ppd->link_down_work, handle_link_down);
+       INIT_WORK(&ppd->dc_host_req_work, handle_8051_request);
        INIT_WORK(&ppd->freeze_work, handle_freeze);
        INIT_WORK(&ppd->link_downgrade_work, handle_link_downgrade);
        INIT_WORK(&ppd->sma_message_work, handle_sma_message);
        INIT_WORK(&ppd->link_bounce_work, handle_link_bounce);
+       INIT_WORK(&ppd->linkstate_active_work, receive_interrupt_work);
+       INIT_WORK(&ppd->qsfp_info.qsfp_work, qsfp_event);
        mutex_init(&ppd->hls_lock);
        spin_lock_init(&ppd->sdma_alllock);
        spin_lock_init(&ppd->qsfp_info.qsfp_lock);
  
+       ppd->qsfp_info.ppd = ppd;
        ppd->sm_trap_qp = 0x0;
        ppd->sa_qp = 0x1;
  
@@@ -582,8 -591,8 +590,8 @@@ static void enable_chip(struct hfi1_dev
         * Enable kernel ctxts' receive and receive interrupt.
         * Other ctxts done as user opens and initializes them.
         */
-       rcvmask = HFI1_RCVCTRL_CTXT_ENB | HFI1_RCVCTRL_INTRAVAIL_ENB;
        for (i = 0; i < dd->first_user_ctxt; ++i) {
+               rcvmask = HFI1_RCVCTRL_CTXT_ENB | HFI1_RCVCTRL_INTRAVAIL_ENB;
                rcvmask |= HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, DMA_RTAIL) ?
                        HFI1_RCVCTRL_TAILUPD_ENB : HFI1_RCVCTRL_TAILUPD_DIS;
                if (!HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, MULTI_PKT_EGR))
@@@ -729,14 -738,14 +737,14 @@@ int hfi1_init(struct hfi1_devdata *dd, 
                        lastfail = hfi1_setup_eagerbufs(rcd);
                if (lastfail)
                        dd_dev_err(dd,
-                               "failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n");
+                                  "failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n");
        }
        if (lastfail)
                ret = lastfail;
  
        /* Allocate enough memory for user event notification. */
 -      len = ALIGN(dd->chip_rcv_contexts * HFI1_MAX_SHARED_CTXTS *
 -                  sizeof(*dd->events), PAGE_SIZE);
 +      len = PAGE_ALIGN(dd->chip_rcv_contexts * HFI1_MAX_SHARED_CTXTS *
 +                       sizeof(*dd->events));
        dd->events = vmalloc_user(len);
        if (!dd->events)
                dd_dev_err(dd, "Failed to allocate user events page\n");
        /* enable chip even if we have an error, so we can debug cause */
        enable_chip(dd);
  
-       ret = hfi1_cq_init(dd);
  done:
        /*
         * Set status even if port serdes is not initialized
                for (pidx = 0; pidx < dd->num_pports; ++pidx) {
                        ppd = dd->pport + pidx;
  
-                       /* initialize the qsfp if it exists
-                        * Requires interrupts to be enabled so we are notified
-                        * when the QSFP completes reset, and has
-                        * to be done before bringing up the SERDES
+                       /*
+                        * start the serdes - must be after interrupts are
+                        * enabled so we are notified when the link goes up
                         */
-                       init_qsfp(ppd);
-                       /* start the serdes - must be after interrupts are
-                          enabled so we are notified when the link goes up */
                        lastfail = bringup_serdes(ppd);
                        if (lastfail)
                                dd_dev_info(dd,
-                                       "Failed to bring up port %u\n",
-                                       ppd->port);
+                                           "Failed to bring up port %u\n",
+                                           ppd->port);
  
                        /*
                         * Set status even if port serdes is not initialized
@@@ -904,6 -907,8 +906,8 @@@ static void shutdown_device(struct hfi1
                /* disable the send device */
                pio_send_control(dd, PSC_GLOBAL_DISABLE);
  
+               shutdown_led_override(ppd);
                /*
                 * Clear SerdesEnable.
                 * We can't count on interrupts since we are stopping.
@@@ -961,17 -966,33 +965,33 @@@ void hfi1_free_ctxtdata(struct hfi1_dev
        kfree(rcd->egrbufs.buffers);
  
        sc_free(rcd->sc);
-       vfree(rcd->physshadow);
-       vfree(rcd->tid_pg_list);
        vfree(rcd->user_event_mask);
        vfree(rcd->subctxt_uregbase);
        vfree(rcd->subctxt_rcvegrbuf);
        vfree(rcd->subctxt_rcvhdr_base);
-       kfree(rcd->tidusemap);
        kfree(rcd->opstats);
        kfree(rcd);
  }
  
+ /*
+  * Release our hold on the shared asic data.  If we are the last one,
+  * free the structure.  Must be holding hfi1_devs_lock.
+  */
+ static void release_asic_data(struct hfi1_devdata *dd)
+ {
+       int other;
+       if (!dd->asic_data)
+               return;
+       dd->asic_data->dds[dd->hfi1_id] = NULL;
+       other = dd->hfi1_id ? 0 : 1;
+       if (!dd->asic_data->dds[other]) {
+               /* we are the last holder, free it */
+               kfree(dd->asic_data);
+       }
+       dd->asic_data = NULL;
+ }
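release_asic_data() above is a small hand-rolled "last holder frees" scheme for the state shared by the two HFIs on one ASIC, serialized by hfi1_devs_lock. The more common kernel idiom for the same problem is a kref; a sketch of that alternative is shown purely for comparison, with invented types and names, and is not what this patch does:

    #include <linux/kernel.h>
    #include <linux/kref.h>
    #include <linux/slab.h>

    struct shared_asic_data {
            struct kref refcount;
            /* ... state shared by both HFIs on the ASIC ... */
    };

    static struct shared_asic_data *alloc_asic_data(void)
    {
            struct shared_asic_data *sad = kzalloc(sizeof(*sad), GFP_KERNEL);

            if (sad)
                    kref_init(&sad->refcount);      /* refcount starts at 1 */
            return sad;
    }

    static void shared_asic_release(struct kref *kref)
    {
            kfree(container_of(kref, struct shared_asic_data, refcount));
    }

    /* the second device attaching to the same ASIC takes a reference */
    static void get_asic_data(struct shared_asic_data *sad)
    {
            kref_get(&sad->refcount);
    }

    /* each device drops its reference at teardown; the last one frees */
    static void put_asic_data(struct shared_asic_data *sad)
    {
            kref_put(&sad->refcount, shared_asic_release);
    }

The two-slot dds[] array in the patch achieves the same result with less machinery because there can only ever be two holders.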
  void hfi1_free_devdata(struct hfi1_devdata *dd)
  {
        unsigned long flags;
        spin_lock_irqsave(&hfi1_devs_lock, flags);
        idr_remove(&hfi1_unit_table, dd->unit);
        list_del(&dd->list);
+       release_asic_data(dd);
        spin_unlock_irqrestore(&hfi1_devs_lock, flags);
-       hfi1_dbg_ibdev_exit(&dd->verbs_dev);
+       free_platform_config(dd);
        rcu_barrier(); /* wait for rcu callbacks to complete */
        free_percpu(dd->int_counter);
        free_percpu(dd->rcv_limit);
-       ib_dealloc_device(&dd->verbs_dev.ibdev);
+       hfi1_dev_affinity_free(dd);
+       free_percpu(dd->send_schedule);
+       ib_dealloc_device(&dd->verbs_dev.rdi.ibdev);
  }
  
  /*
@@@ -999,19 -1023,19 +1022,19 @@@ struct hfi1_devdata *hfi1_alloc_devdata
  {
        unsigned long flags;
        struct hfi1_devdata *dd;
-       int ret;
+       int ret, nports;
  
-       dd = (struct hfi1_devdata *)ib_alloc_device(sizeof(*dd) + extra);
+       /* extra is sizeof(struct hfi1_pportdata) * number of ports */
+       nports = extra / sizeof(struct hfi1_pportdata);
+       dd = (struct hfi1_devdata *)rvt_alloc_device(sizeof(*dd) + extra,
+                                                    nports);
        if (!dd)
                return ERR_PTR(-ENOMEM);
-       /* extra is * number of ports */
-       dd->num_pports = extra / sizeof(struct hfi1_pportdata);
+       dd->num_pports = nports;
        dd->pport = (struct hfi1_pportdata *)(dd + 1);
  
        INIT_LIST_HEAD(&dd->list);
-       dd->node = dev_to_node(&pdev->dev);
-       if (dd->node < 0)
-               dd->node = 0;
        idr_preload(GFP_KERNEL);
        spin_lock_irqsave(&hfi1_devs_lock, flags);
  
        spin_lock_init(&dd->sc_init_lock);
        spin_lock_init(&dd->dc8051_lock);
        spin_lock_init(&dd->dc8051_memlock);
-       mutex_init(&dd->qsfp_i2c_mutex);
        seqlock_init(&dd->sc2vl_lock);
        spin_lock_init(&dd->sde_map_lock);
+       spin_lock_init(&dd->pio_map_lock);
        init_waitqueue_head(&dd->event_queue);
  
        dd->int_counter = alloc_percpu(u64);
                goto bail;
        }
  
+       dd->send_schedule = alloc_percpu(u64);
+       if (!dd->send_schedule) {
+               ret = -ENOMEM;
+               hfi1_early_err(&pdev->dev,
+                              "Could not allocate per-cpu send_schedule\n");
+               goto bail;
+       }
        if (!hfi1_cpulist_count) {
                u32 count = num_online_cpus();
  
                        &pdev->dev,
                        "Could not alloc cpulist info, cpu affinity might be wrong\n");
        }
-       hfi1_dbg_ibdev_init(&dd->verbs_dev);
        return dd;
  
  bail:
        if (!list_empty(&dd->list))
                list_del_init(&dd->list);
-       ib_dealloc_device(&dd->verbs_dev.ibdev);
+       ib_dealloc_device(&dd->verbs_dev.rdi.ibdev);
        return ERR_PTR(ret);
  }
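hfi1_alloc_devdata() above keeps the long-standing trick of allocating the device structure and its per-port data as one block: "extra" bytes are appended to the struct and dd->pport is pointed just past it. A minimal sketch of that layout, independent of the rvt_alloc_device() wrapper and with invented names:

    #include <linux/slab.h>

    struct my_port {
            int port_num;
            /* ... per-port state ... */
    };

    struct my_dev {
            int num_ports;
            struct my_port *ports;  /* points into the same allocation */
            /* ... device-wide state ... */
    };

    static struct my_dev *my_alloc_dev(int nports)
    {
            struct my_dev *dev;

            dev = kzalloc(sizeof(*dev) + nports * sizeof(struct my_port),
                          GFP_KERNEL);
            if (!dev)
                    return NULL;
            dev->num_ports = nports;
            dev->ports = (struct my_port *)(dev + 1);       /* trailing array */
            return dev;
    }

One kzalloc(), one kfree(), and the per-port array always sits next to the device data it belongs to.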
  
@@@ -1173,8 -1204,10 +1203,10 @@@ static int __init hfi1_mod_init(void
                user_credit_return_threshold = 100;
  
        compute_krcvqs();
-       /* sanitize receive interrupt count, time must wait until after
-          the hardware type is known */
+       /*
+        * sanitize receive interrupt count, time must wait until after
+        * the hardware type is known
+        */
        if (rcv_intr_count > RCV_HDR_HEAD_COUNTER_MASK)
                rcv_intr_count = RCV_HDR_HEAD_COUNTER_MASK;
        /* reject invalid combinations */
        idr_init(&hfi1_unit_table);
  
        hfi1_dbg_init();
+       ret = hfi1_wss_init();
+       if (ret < 0)
+               goto bail_wss;
        ret = pci_register_driver(&hfi1_pci_driver);
        if (ret < 0) {
                pr_err("Unable to register driver: error %d\n", -ret);
        goto bail; /* all OK */
  
  bail_dev:
+       hfi1_wss_exit();
+ bail_wss:
        hfi1_dbg_exit();
        idr_destroy(&hfi1_unit_table);
        dev_cleanup();
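The new hfi1_wss_init() call above slots into hfi1_mod_init()'s existing goto-based unwinding: each bail_* label undoes only the steps that had already succeeded, in reverse order, so a failure at any point leaves nothing allocated. A compact sketch of the idiom with hypothetical step functions:

    /* stand-ins for real init/exit steps */
    static int step_a_init(void) { return 0; }
    static void step_a_exit(void) { }
    static int step_b_init(void) { return 0; }      /* e.g. the new wss init */
    static void step_b_exit(void) { }
    static int step_c_init(void) { return 0; }      /* e.g. pci_register_driver() */

    static int my_mod_init(void)
    {
            int ret;

            ret = step_a_init();
            if (ret)
                    goto bail;
            ret = step_b_init();
            if (ret)
                    goto bail_a;
            ret = step_c_init();
            if (ret)
                    goto bail_b;
            return 0;                       /* all OK */

    bail_b:
            step_b_exit();
    bail_a:
            step_a_exit();
    bail:
            return ret;
    }

The module exit path then calls the same teardown functions unconditionally, in the same reverse order, which is what the added hfi1_wss_exit() in hfi1_mod_cleanup() does.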
@@@ -1232,6 -1270,7 +1269,7 @@@ module_init(hfi1_mod_init)
  static void __exit hfi1_mod_cleanup(void)
  {
        pci_unregister_driver(&hfi1_pci_driver);
+       hfi1_wss_exit();
        hfi1_dbg_exit();
        hfi1_cpulist_count = 0;
        kfree(hfi1_cpulist);
@@@ -1303,16 -1342,18 +1341,18 @@@ static void cleanup_device_data(struct 
                }
        }
        kfree(tmp);
+       free_pio_map(dd);
        /* must follow rcv context free - need to remove rcv's hooks */
        for (ctxt = 0; ctxt < dd->num_send_contexts; ctxt++)
                sc_free(dd->send_contexts[ctxt].sc);
        dd->num_send_contexts = 0;
        kfree(dd->send_contexts);
        dd->send_contexts = NULL;
+       kfree(dd->hw_to_sw);
+       dd->hw_to_sw = NULL;
        kfree(dd->boardname);
        vfree(dd->events);
        vfree(dd->status);
-       hfi1_cq_exit(dd);
  }
  
  /*
@@@ -1346,6 -1387,13 +1386,13 @@@ static int init_one(struct pci_dev *pde
                ret = -EINVAL;
                goto bail;
        }
+       if (rcvhdrcnt > HFI1_MAX_HDRQ_EGRBUF_CNT) {
+               hfi1_early_err(&pdev->dev,
+                              "Receive header queue count cannot be greater than %u\n",
+                              HFI1_MAX_HDRQ_EGRBUF_CNT);
+               ret = -EINVAL;
+               goto bail;
+       }
        /* use the encoding function as a sanitization check */
        if (!encode_rcv_header_entry_size(hfi1_hdrq_entsize)) {
                hfi1_early_err(&pdev->dev, "Invalid HdrQ Entry size %u\n",
         * we still create devices, so diags, etc. can be used
         * to determine cause of problem.
         */
-       if (!initfail && !ret)
+       if (!initfail && !ret) {
                dd->flags |= HFI1_INITTED;
+               /* create debugfs files after init and ib register */
+               hfi1_dbg_ibdev_init(&dd->verbs_dev);
+       }
  
        j = hfi1_device_create(dd);
        if (j)
@@@ -1464,6 -1515,8 +1514,8 @@@ static void remove_one(struct pci_dev *
  {
        struct hfi1_devdata *dd = pci_get_drvdata(pdev);
  
+       /* close debugfs files before ib unregister */
+       hfi1_dbg_ibdev_exit(&dd->verbs_dev);
        /* unregister from IB core */
        hfi1_unregister_ib_device(dd);
  
@@@ -1505,8 -1558,8 +1557,8 @@@ int hfi1_create_rcvhdrq(struct hfi1_dev
                 * rcvhdrqentsize is in DWs, so we have to convert to bytes
                 * (* sizeof(u32)).
                 */
 -              amt = ALIGN(rcd->rcvhdrq_cnt * rcd->rcvhdrqentsize *
 -                          sizeof(u32), PAGE_SIZE);
 +              amt = PAGE_ALIGN(rcd->rcvhdrq_cnt * rcd->rcvhdrqentsize *
 +                               sizeof(u32));
  
                gfp_flags = (rcd->ctxt >= dd->first_user_ctxt) ?
                        GFP_USER : GFP_KERNEL;
  
                if (!rcd->rcvhdrq) {
                        dd_dev_err(dd,
-                               "attempt to allocate %d bytes for ctxt %u rcvhdrq failed\n",
-                               amt, rcd->ctxt);
+                                  "attempt to allocate %d bytes for ctxt %u rcvhdrq failed\n",
+                                  amt, rcd->ctxt);
                        goto bail;
                }
  
-               /* Event mask is per device now and is in hfi1_devdata */
-               /*if (rcd->ctxt >= dd->first_user_ctxt) {
-                       rcd->user_event_mask = vmalloc_user(PAGE_SIZE);
-                       if (!rcd->user_event_mask)
-                               goto bail_free_hdrq;
-                               }*/
                if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) {
                        rcd->rcvhdrtail_kvaddr = dma_zalloc_coherent(
                                &dd->pcidev->dev, PAGE_SIZE, &phys_hdrqtail,
  
  bail_free:
        dd_dev_err(dd,
-               "attempt to allocate 1 page for ctxt %u rcvhdrqtailaddr failed\n",
-               rcd->ctxt);
+                  "attempt to allocate 1 page for ctxt %u rcvhdrqtailaddr failed\n",
+                  rcd->ctxt);
        vfree(rcd->user_event_mask);
        rcd->user_event_mask = NULL;
        dma_free_coherent(&dd->pcidev->dev, amt, rcd->rcvhdrq,
@@@ -1659,7 -1705,7 +1704,7 @@@ int hfi1_setup_eagerbufs(struct hfi1_ct
                        if (rcd->egrbufs.rcvtid_size == round_mtu ||
                            !HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR)) {
                                dd_dev_err(dd, "ctxt%u: Failed to allocate eager buffers\n",
-                                       rcd->ctxt);
+                                          rcd->ctxt);
                                goto bail_rcvegrbuf_phys;
                        }
  
                                     rcd->egrbufs.buffers[j].len)) {
                                        j++;
                                        offset = 0;
-                               } else
+                               } else {
                                        offset += new_size;
+                               }
                        }
                        rcd->egrbufs.rcvtid_size = new_size;
                }
                  rcd->ctxt, rcd->egrbufs.alloced, rcd->egrbufs.rcvtid_size,
                  rcd->egrbufs.size);
  
        /*
         * Set the contexts rcv array head update threshold to the closest
         * power of 2 (so we can use a mask instead of modulo) below half
  
        for (idx = 0; idx < rcd->egrbufs.alloced; idx++) {
                hfi1_put_tid(dd, rcd->eager_base + idx, PT_EAGER,
-                             rcd->egrbufs.rcvtids[idx].phys, order);
+                            rcd->egrbufs.rcvtids[idx].phys, order);
                cond_resched();
        }
        goto bail;
  
  bail_rcvegrbuf_phys:
        for (idx = 0; idx < rcd->egrbufs.alloced &&
-                    rcd->egrbufs.buffers[idx].addr;
+            rcd->egrbufs.buffers[idx].addr;
             idx++) {
                dma_free_coherent(&dd->pcidev->dev,
                                  rcd->egrbufs.buffers[idx].len,
index 77700b818e3d89fecf428756bba1886b7ce8ce28,0ec748e7e7b649419732351fc2e61b7ba7813355..d1e7f4d7cf6fdf3fb32d699af45348201511b38d
@@@ -1,12 -1,11 +1,11 @@@
  /*
+  * Copyright(c) 2015, 2016 Intel Corporation.
   *
   * This file is provided under a dual BSD/GPLv2 license.  When using or
   * redistributing this file, you may do so under either license.
   *
   * GPL LICENSE SUMMARY
   *
-  * Copyright(c) 2015 Intel Corporation.
-  *
   * This program is free software; you can redistribute it and/or modify
   * it under the terms of version 2 of the GNU General Public License as
   * published by the Free Software Foundation.
@@@ -18,8 -17,6 +17,6 @@@
   *
   * BSD LICENSE
   *
-  * Copyright(c) 2015 Intel Corporation.
-  *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
@@@ -55,6 -52,7 +52,7 @@@
  #include "hfi.h"
  #include "mad.h"
  #include "trace.h"
+ #include "qp.h"
  
  /* the reset value from the FM is supposed to be 0xffff, handle both */
  #define OPA_LINK_WIDTH_RESET_OLD 0x0fff
@@@ -91,7 -89,7 +89,7 @@@ static void send_trap(struct hfi1_ibpor
        int pkey_idx;
        u32 qpn = ppd_from_ibp(ibp)->sm_trap_qp;
  
-       agent = ibp->send_agent;
+       agent = ibp->rvp.send_agent;
        if (!agent)
                return;
  
                return;
  
        /* o14-2 */
-       if (ibp->trap_timeout && time_before(jiffies, ibp->trap_timeout))
+       if (ibp->rvp.trap_timeout && time_before(jiffies,
+                                                ibp->rvp.trap_timeout))
                return;
  
        pkey_idx = hfi1_lookup_pkey_idx(ibp, LIM_MGMT_P_KEY);
        smp->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED;
        smp->class_version = OPA_SMI_CLASS_VERSION;
        smp->method = IB_MGMT_METHOD_TRAP;
-       ibp->tid++;
-       smp->tid = cpu_to_be64(ibp->tid);
+       ibp->rvp.tid++;
+       smp->tid = cpu_to_be64(ibp->rvp.tid);
        smp->attr_id = IB_SMP_ATTR_NOTICE;
        /* o14-1: smp->mkey = 0; */
        memcpy(smp->route.lid.data, data, len);
  
-       spin_lock_irqsave(&ibp->lock, flags);
-       if (!ibp->sm_ah) {
-               if (ibp->sm_lid != be16_to_cpu(IB_LID_PERMISSIVE)) {
+       spin_lock_irqsave(&ibp->rvp.lock, flags);
+       if (!ibp->rvp.sm_ah) {
+               if (ibp->rvp.sm_lid != be16_to_cpu(IB_LID_PERMISSIVE)) {
                        struct ib_ah *ah;
  
-                       ah = hfi1_create_qp0_ah(ibp, ibp->sm_lid);
-                       if (IS_ERR(ah))
+                       ah = hfi1_create_qp0_ah(ibp, ibp->rvp.sm_lid);
+                       if (IS_ERR(ah)) {
                                ret = PTR_ERR(ah);
-                       else {
+                       } else {
                                send_buf->ah = ah;
-                               ibp->sm_ah = to_iah(ah);
+                               ibp->rvp.sm_ah = ibah_to_rvtah(ah);
                                ret = 0;
                        }
-               } else
+               } else {
                        ret = -EINVAL;
+               }
        } else {
-               send_buf->ah = &ibp->sm_ah->ibah;
+               send_buf->ah = &ibp->rvp.sm_ah->ibah;
                ret = 0;
        }
-       spin_unlock_irqrestore(&ibp->lock, flags);
+       spin_unlock_irqrestore(&ibp->rvp.lock, flags);
  
        if (!ret)
                ret = ib_post_send_mad(send_buf, NULL);
        if (!ret) {
                /* 4.096 usec. */
-               timeout = (4096 * (1UL << ibp->subnet_timeout)) / 1000;
-               ibp->trap_timeout = jiffies + usecs_to_jiffies(timeout);
+               timeout = (4096 * (1UL << ibp->rvp.subnet_timeout)) / 1000;
+               ibp->rvp.trap_timeout = jiffies + usecs_to_jiffies(timeout);
        } else {
                ib_free_send_mad(send_buf);
-               ibp->trap_timeout = 0;
+               ibp->rvp.trap_timeout = 0;
        }
  }
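The tail of send_trap() above turns the port's SubnetTimeout into a resend holdoff: the base unit is 4.096 us (4096 ns), scaled by 2^subnet_timeout and converted to jiffies. A worked example of the arithmetic; the value 18 is picked purely for illustration:

    #include <linux/jiffies.h>
    #include <linux/types.h>

    /* Sketch: with subnet_timeout = 18,
     *   4096 << 18 = 1,073,741,824 ns  ->  ~1,073,741 us  ->  ~1.07 s
     */
    static unsigned long trap_holdoff_jiffies(u8 subnet_timeout)
    {
            unsigned long ns = 4096UL << subnet_timeout;
            unsigned long us = ns / 1000;

            return usecs_to_jiffies(us);
    }

At that setting the port would not send another trap for roughly a second, which is what the time_before(jiffies, trap_timeout) check earlier in the function enforces.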
  
@@@ -174,10 -174,10 +174,10 @@@ void hfi1_bad_pqkey(struct hfi1_ibport 
        memset(&data, 0, sizeof(data));
  
        if (trap_num == OPA_TRAP_BAD_P_KEY)
-               ibp->pkey_violations++;
+               ibp->rvp.pkey_violations++;
        else
-               ibp->qkey_violations++;
-       ibp->n_pkt_drops++;
+               ibp->rvp.qkey_violations++;
+       ibp->rvp.n_pkt_drops++;
  
        /* Send violation trap */
        data.generic_type = IB_NOTICE_TYPE_SECURITY;
@@@ -233,9 -233,12 +233,12 @@@ static void bad_mkey(struct hfi1_ibpor
  /*
   * Send a Port Capability Mask Changed trap (ch. 14.3.11).
   */
- void hfi1_cap_mask_chg(struct hfi1_ibport *ibp)
+ void hfi1_cap_mask_chg(struct rvt_dev_info *rdi, u8 port_num)
  {
        struct opa_mad_notice_attr data;
+       struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi);
+       struct hfi1_devdata *dd = dd_from_dev(verbs_dev);
+       struct hfi1_ibport *ibp = &dd->pport[port_num - 1].ibport_data;
        u32 lid = ppd_from_ibp(ibp)->lid;
  
        memset(&data, 0, sizeof(data));
        data.trap_num = OPA_TRAP_CHANGE_CAPABILITY;
        data.issuer_lid = cpu_to_be32(lid);
        data.ntc_144.lid = data.issuer_lid;
-       data.ntc_144.new_cap_mask = cpu_to_be32(ibp->port_cap_flags);
+       data.ntc_144.new_cap_mask = cpu_to_be32(ibp->rvp.port_cap_flags);
  
        send_trap(ibp, &data, sizeof(data));
  }
@@@ -407,37 -410,38 +410,38 @@@ static int check_mkey(struct hfi1_ibpor
        int ret = 0;
  
        /* Is the mkey in the process of expiring? */
-       if (ibp->mkey_lease_timeout &&
-           time_after_eq(jiffies, ibp->mkey_lease_timeout)) {
+       if (ibp->rvp.mkey_lease_timeout &&
+           time_after_eq(jiffies, ibp->rvp.mkey_lease_timeout)) {
                /* Clear timeout and mkey protection field. */
-               ibp->mkey_lease_timeout = 0;
-               ibp->mkeyprot = 0;
+               ibp->rvp.mkey_lease_timeout = 0;
+               ibp->rvp.mkeyprot = 0;
        }
  
-       if ((mad_flags & IB_MAD_IGNORE_MKEY) ||  ibp->mkey == 0 ||
-           ibp->mkey == mkey)
+       if ((mad_flags & IB_MAD_IGNORE_MKEY) ||  ibp->rvp.mkey == 0 ||
+           ibp->rvp.mkey == mkey)
                valid_mkey = 1;
  
        /* Unset lease timeout on any valid Get/Set/TrapRepress */
-       if (valid_mkey && ibp->mkey_lease_timeout &&
+       if (valid_mkey && ibp->rvp.mkey_lease_timeout &&
            (mad->method == IB_MGMT_METHOD_GET ||
             mad->method == IB_MGMT_METHOD_SET ||
             mad->method == IB_MGMT_METHOD_TRAP_REPRESS))
-               ibp->mkey_lease_timeout = 0;
+               ibp->rvp.mkey_lease_timeout = 0;
  
        if (!valid_mkey) {
                switch (mad->method) {
                case IB_MGMT_METHOD_GET:
                        /* Bad mkey not a violation below level 2 */
-                       if (ibp->mkeyprot < 2)
+                       if (ibp->rvp.mkeyprot < 2)
                                break;
                case IB_MGMT_METHOD_SET:
                case IB_MGMT_METHOD_TRAP_REPRESS:
-                       if (ibp->mkey_violations != 0xFFFF)
-                               ++ibp->mkey_violations;
-                       if (!ibp->mkey_lease_timeout && ibp->mkey_lease_period)
-                               ibp->mkey_lease_timeout = jiffies +
-                                       ibp->mkey_lease_period * HZ;
+                       if (ibp->rvp.mkey_violations != 0xFFFF)
+                               ++ibp->rvp.mkey_violations;
+                       if (!ibp->rvp.mkey_lease_timeout &&
+                           ibp->rvp.mkey_lease_period)
+                               ibp->rvp.mkey_lease_timeout = jiffies +
+                                       ibp->rvp.mkey_lease_period * HZ;
                        /* Generate a trap notice. */
                        bad_mkey(ibp, mad, mkey, dr_slid, return_path,
                                 hop_cnt);
@@@ -501,16 -505,6 +505,6 @@@ void read_ltp_rtt(struct hfi1_devdata *
                write_lcb_cache(DC_LCB_STS_ROUND_TRIP_LTP_CNT, reg);
  }
  
- static u8 __opa_porttype(struct hfi1_pportdata *ppd)
- {
-       if (qsfp_mod_present(ppd)) {
-               if (ppd->qsfp_info.cache_valid)
-                       return OPA_PORT_TYPE_STANDARD;
-               return OPA_PORT_TYPE_DISCONNECTED;
-       }
-       return OPA_PORT_TYPE_UNKNOWN;
- }
  static int __subn_get_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data,
                                   struct ib_device *ibdev, u8 port,
                                   u32 *resp_len)
        struct opa_port_info *pi = (struct opa_port_info *)data;
        u8 mtu;
        u8 credit_rate;
+       u8 is_beaconing_active;
        u32 state;
        u32 num_ports = OPA_AM_NPORT(am);
        u32 start_of_sm_config = OPA_AM_START_SM_CFG(am);
        ppd = dd->pport + (port - 1);
        ibp = &ppd->ibport_data;
  
-       if (ppd->vls_supported/2 > ARRAY_SIZE(pi->neigh_mtu.pvlx_to_mtu) ||
-               ppd->vls_supported > ARRAY_SIZE(dd->vld)) {
+       if (ppd->vls_supported / 2 > ARRAY_SIZE(pi->neigh_mtu.pvlx_to_mtu) ||
+           ppd->vls_supported > ARRAY_SIZE(dd->vld)) {
                smp->status |= IB_SMP_INVALID_FIELD;
                return reply((struct ib_mad_hdr *)smp);
        }
  
        /* Only return the mkey if the protection field allows it. */
        if (!(smp->method == IB_MGMT_METHOD_GET &&
-             ibp->mkey != smp->mkey &&
-             ibp->mkeyprot == 1))
-               pi->mkey = ibp->mkey;
-       pi->subnet_prefix = ibp->gid_prefix;
-       pi->sm_lid = cpu_to_be32(ibp->sm_lid);
-       pi->ib_cap_mask = cpu_to_be32(ibp->port_cap_flags);
-       pi->mkey_lease_period = cpu_to_be16(ibp->mkey_lease_period);
+             ibp->rvp.mkey != smp->mkey &&
+             ibp->rvp.mkeyprot == 1))
+               pi->mkey = ibp->rvp.mkey;
+       pi->subnet_prefix = ibp->rvp.gid_prefix;
+       pi->sm_lid = cpu_to_be32(ibp->rvp.sm_lid);
+       pi->ib_cap_mask = cpu_to_be32(ibp->rvp.port_cap_flags);
+       pi->mkey_lease_period = cpu_to_be16(ibp->rvp.mkey_lease_period);
        pi->sm_trap_qp = cpu_to_be32(ppd->sm_trap_qp);
        pi->sa_qp = cpu_to_be32(ppd->sa_qp);
  
        if (start_of_sm_config && (state == IB_PORT_INIT))
                ppd->is_sm_config_started = 1;
  
-       pi->port_phys_conf = __opa_porttype(ppd) & 0xf;
+       pi->port_phys_conf = (ppd->port_type & 0xf);
  
  #if PI_LED_ENABLE_SUP
        pi->port_states.ledenable_offlinereason = ppd->neighbor_normal << 4;
        pi->port_states.ledenable_offlinereason |=
                ppd->is_sm_config_started << 5;
+       /*
+        * This pairs with the memory barrier in hfi1_start_led_override to
+        * ensure that we read the correct state of LED beaconing represented
+        * by led_override_timer_active
+        */
+       smp_rmb();
+       is_beaconing_active = !!atomic_read(&ppd->led_override_timer_active);
+       pi->port_states.ledenable_offlinereason |= is_beaconing_active << 6;
        pi->port_states.ledenable_offlinereason |=
-               ppd->offline_disabled_reason & OPA_PI_MASK_OFFLINE_REASON;
+               ppd->offline_disabled_reason;
  #else
        pi->port_states.offline_reason = ppd->neighbor_normal << 4;
        pi->port_states.offline_reason |= ppd->is_sm_config_started << 5;
-       pi->port_states.offline_reason |= ppd->offline_disabled_reason &
-                                               OPA_PI_MASK_OFFLINE_REASON;
+       pi->port_states.offline_reason |= ppd->offline_disabled_reason;
  #endif /* PI_LED_ENABLE_SUP */
  
        pi->port_states.portphysstate_portstate =
                (hfi1_ibphys_portstate(ppd) << 4) | state;
  
-       pi->mkeyprotect_lmc = (ibp->mkeyprot << 6) | ppd->lmc;
+       pi->mkeyprotect_lmc = (ibp->rvp.mkeyprot << 6) | ppd->lmc;
  
        memset(pi->neigh_mtu.pvlx_to_mtu, 0, sizeof(pi->neigh_mtu.pvlx_to_mtu));
        for (i = 0; i < ppd->vls_supported; i++) {
                mtu = mtu_to_enum(dd->vld[i].mtu, HFI1_DEFAULT_ACTIVE_MTU);
                if ((i % 2) == 0)
-                       pi->neigh_mtu.pvlx_to_mtu[i/2] |= (mtu << 4);
+                       pi->neigh_mtu.pvlx_to_mtu[i / 2] |= (mtu << 4);
                else
-                       pi->neigh_mtu.pvlx_to_mtu[i/2] |= mtu;
+                       pi->neigh_mtu.pvlx_to_mtu[i / 2] |= mtu;
        }
        /* don't forget VL 15 */
        mtu = mtu_to_enum(dd->vld[15].mtu, 2048);
-       pi->neigh_mtu.pvlx_to_mtu[15/2] |= mtu;
-       pi->smsl = ibp->sm_sl & OPA_PI_MASK_SMSL;
+       pi->neigh_mtu.pvlx_to_mtu[15 / 2] |= mtu;
+       pi->smsl = ibp->rvp.sm_sl & OPA_PI_MASK_SMSL;
        pi->operational_vls = hfi1_get_ib_cfg(ppd, HFI1_IB_CFG_OP_VLS);
        pi->partenforce_filterraw |=
                (ppd->linkinit_reason & OPA_PI_MASK_LINKINIT_REASON);
                pi->partenforce_filterraw |= OPA_PI_MASK_PARTITION_ENFORCE_IN;
        if (ppd->part_enforce & HFI1_PART_ENFORCE_OUT)
                pi->partenforce_filterraw |= OPA_PI_MASK_PARTITION_ENFORCE_OUT;
-       pi->mkey_violations = cpu_to_be16(ibp->mkey_violations);
+       pi->mkey_violations = cpu_to_be16(ibp->rvp.mkey_violations);
        /* P_KeyViolations are counted by hardware. */
-       pi->pkey_violations = cpu_to_be16(ibp->pkey_violations);
-       pi->qkey_violations = cpu_to_be16(ibp->qkey_violations);
+       pi->pkey_violations = cpu_to_be16(ibp->rvp.pkey_violations);
+       pi->qkey_violations = cpu_to_be16(ibp->rvp.qkey_violations);
  
        pi->vl.cap = ppd->vls_supported;
-       pi->vl.high_limit = cpu_to_be16(ibp->vl_high_limit);
+       pi->vl.high_limit = cpu_to_be16(ibp->rvp.vl_high_limit);
        pi->vl.arb_high_cap = (u8)hfi1_get_ib_cfg(ppd, HFI1_IB_CFG_VL_HIGH_CAP);
        pi->vl.arb_low_cap = (u8)hfi1_get_ib_cfg(ppd, HFI1_IB_CFG_VL_LOW_CAP);
  
-       pi->clientrereg_subnettimeout = ibp->subnet_timeout;
+       pi->clientrereg_subnettimeout = ibp->rvp.subnet_timeout;
  
        pi->port_link_mode  = cpu_to_be16(OPA_PORT_LINK_MODE_OPA << 10 |
                                          OPA_PORT_LINK_MODE_OPA << 5 |
        /* read the cached value of DC_LCB_STS_ROUND_TRIP_LTP_CNT */
        read_lcb_cache(DC_LCB_STS_ROUND_TRIP_LTP_CNT, &tmp);
  
-       /* this counter is 16 bits wide, but the replay_depth.wire
-        * variable is only 8 bits */
+       /*
+        * this counter is 16 bits wide, but the replay_depth.wire
+        * variable is only 8 bits
+        */
        if (tmp > 0xff)
                tmp = 0xff;
        pi->replay_depth.wire = tmp;
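The smp_rmb() added earlier in this hunk is one half of a publish/consume pairing: the code that starts the LED override (the comment points at hfi1_start_led_override) must order its stores with a matching write barrier before setting led_override_timer_active, so the MAD reader here sees a consistent beaconing state. A generic, hedged sketch of that pairing with invented names:

    #include <linux/atomic.h>

    static int led_state;                           /* data being published */
    static atomic_t led_active = ATOMIC_INIT(0);    /* the "published" flag  */

    /* writer side (think: hfi1_start_led_override) */
    static void publish_led(int state)
    {
            led_state = state;
            smp_wmb();                      /* order the data before the flag */
            atomic_set(&led_active, 1);
    }

    /* reader side (think: __subn_get_opa_portinfo) */
    static int read_led(int *state)
    {
            if (!atomic_read(&led_active))
                    return 0;
            smp_rmb();                      /* pairs with smp_wmb() above */
            *state = led_state;
            return 1;
    }

Without the pairing the reader could observe the flag as set while still seeing stale data behind it.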
@@@ -749,7 -753,7 +753,7 @@@ static int __subn_get_opa_pkeytable(str
                return reply((struct ib_mad_hdr *)smp);
        }
  
-       n_blocks_avail = (u16) (npkeys/OPA_PARTITION_TABLE_BLK_SIZE) + 1;
+       n_blocks_avail = (u16)(npkeys / OPA_PARTITION_TABLE_BLK_SIZE) + 1;
  
        size = (n_blocks_req * OPA_PARTITION_TABLE_BLK_SIZE) * sizeof(u16);
  
                return reply((struct ib_mad_hdr *)smp);
        }
  
-       p = (__be16 *) data;
+       p = (__be16 *)data;
        q = (u16 *)data;
        /* get the real pkeys if we are requesting the first block */
        if (start_block == 0) {
                        p[i] = cpu_to_be16(q[i]);
                if (resp_len)
                        *resp_len += size;
-       } else
+       } else {
                smp->status |= IB_SMP_INVALID_FIELD;
+       }
        return reply((struct ib_mad_hdr *)smp);
  }
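__subn_get_opa_pkeytable() above serves the P_Key table in fixed-size blocks, so it first works out how many blocks the table occupies (npkeys divided by the block size, plus one for the partial block) and how many bytes the requested blocks cover. A worked example, assuming a 32-entry block; the real constant is OPA_PARTITION_TABLE_BLK_SIZE and the numbers here are only illustrative:

    #include <linux/types.h>

    #define EX_BLK_SIZE 32                  /* assumed pkeys per block */

    static void pkey_block_math(u16 npkeys, u32 n_blocks_req)
    {
            u16 n_blocks_avail = npkeys / EX_BLK_SIZE + 1;
            size_t size = n_blocks_req * EX_BLK_SIZE * sizeof(u16);

            /* e.g. npkeys = 16: n_blocks_avail = 1, one block = 64 bytes */
            (void)n_blocks_avail;
            (void)size;
    }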
  
@@@ -901,8 -905,8 +905,8 @@@ static int port_states_transition_allow
        u32 logical_old = driver_logical_state(ppd);
        int ret, logical_allowed, physical_allowed;
  
-       logical_allowed = ret =
-               logical_transition_allowed(logical_old, logical_new);
+       ret = logical_transition_allowed(logical_old, logical_new);
+       logical_allowed = ret;
  
        if (ret == HFI_TRANSITION_DISALLOWED ||
            ret == HFI_TRANSITION_UNDEFINED) {
                return ret;
        }
  
-       physical_allowed = ret =
-               physical_transition_allowed(physical_old, physical_new);
+       ret = physical_transition_allowed(physical_old, physical_new);
+       physical_allowed = ret;
  
        if (ret == HFI_TRANSITION_DISALLOWED ||
            ret == HFI_TRANSITION_UNDEFINED) {
            physical_allowed == HFI_TRANSITION_IGNORED)
                return HFI_TRANSITION_IGNORED;
  
+       /*
+        * A change request of Physical Port State from
+        * 'Offline' to 'Polling' should be ignored.
+        */
+       if ((physical_old == OPA_PORTPHYSSTATE_OFFLINE) &&
+           (physical_new == IB_PORTPHYSSTATE_POLLING))
+               return HFI_TRANSITION_IGNORED;
        /*
         * Either physical_allowed or logical_allowed is
         * HFI_TRANSITION_ALLOWED.
@@@ -972,16 -984,15 +984,15 @@@ static int set_port_states(struct hfi1_
                        break;
                /* FALLTHROUGH */
        case IB_PORT_DOWN:
-               if (phys_state == IB_PORTPHYSSTATE_NOP)
+               if (phys_state == IB_PORTPHYSSTATE_NOP) {
                        link_state = HLS_DN_DOWNDEF;
-               else if (phys_state == IB_PORTPHYSSTATE_POLLING) {
+               else if (phys_state == IB_PORTPHYSSTATE_POLLING) {
                        link_state = HLS_DN_POLL;
-                       set_link_down_reason(ppd,
-                            OPA_LINKDOWN_REASON_FM_BOUNCE, 0,
-                            OPA_LINKDOWN_REASON_FM_BOUNCE);
-               } else if (phys_state == IB_PORTPHYSSTATE_DISABLED)
+                       set_link_down_reason(ppd, OPA_LINKDOWN_REASON_FM_BOUNCE,
+                                            0, OPA_LINKDOWN_REASON_FM_BOUNCE);
+               } else if (phys_state == IB_PORTPHYSSTATE_DISABLED) {
                        link_state = HLS_DN_DISABLE;
-               else {
+               } else {
                        pr_warn("SubnSet(OPA_PortInfo) invalid physical state 0x%x\n",
                                phys_state);
                        smp->status |= IB_SMP_INVALID_FIELD;
                set_link_state(ppd, link_state);
                if (link_state == HLS_DN_DISABLE &&
                    (ppd->offline_disabled_reason >
-                    OPA_LINKDOWN_REASON_SMA_DISABLED ||
+                    HFI1_ODR_MASK(OPA_LINKDOWN_REASON_SMA_DISABLED) ||
                     ppd->offline_disabled_reason ==
-                    OPA_LINKDOWN_REASON_NONE))
+                    HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE)))
                        ppd->offline_disabled_reason =
-                       OPA_LINKDOWN_REASON_SMA_DISABLED;
+                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_SMA_DISABLED);
                /*
                 * Don't send a reply if the response would be sent
                 * through the disabled port.
@@@ -1091,13 -1102,13 +1102,13 @@@ static int __subn_set_opa_portinfo(stru
  
        ls_old = driver_lstate(ppd);
  
-       ibp->mkey = pi->mkey;
-       ibp->gid_prefix = pi->subnet_prefix;
-       ibp->mkey_lease_period = be16_to_cpu(pi->mkey_lease_period);
+       ibp->rvp.mkey = pi->mkey;
+       ibp->rvp.gid_prefix = pi->subnet_prefix;
+       ibp->rvp.mkey_lease_period = be16_to_cpu(pi->mkey_lease_period);
  
        /* Must be a valid unicast LID address. */
        if ((lid == 0 && ls_old > IB_PORT_INIT) ||
-            lid >= HFI1_MULTICAST_LID_BASE) {
+           lid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) {
                smp->status |= IB_SMP_INVALID_FIELD;
                pr_warn("SubnSet(OPA_PortInfo) lid invalid 0x%x\n",
                        lid);
  
        /* Must be a valid unicast LID address. */
        if ((smlid == 0 && ls_old > IB_PORT_INIT) ||
-            smlid >= HFI1_MULTICAST_LID_BASE) {
+           smlid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) {
                smp->status |= IB_SMP_INVALID_FIELD;
                pr_warn("SubnSet(OPA_PortInfo) smlid invalid 0x%x\n", smlid);
-       } else if (smlid != ibp->sm_lid || msl != ibp->sm_sl) {
+       } else if (smlid != ibp->rvp.sm_lid || msl != ibp->rvp.sm_sl) {
                pr_warn("SubnSet(OPA_PortInfo) smlid 0x%x\n", smlid);
-               spin_lock_irqsave(&ibp->lock, flags);
-               if (ibp->sm_ah) {
-                       if (smlid != ibp->sm_lid)
-                               ibp->sm_ah->attr.dlid = smlid;
-                       if (msl != ibp->sm_sl)
-                               ibp->sm_ah->attr.sl = msl;
+               spin_lock_irqsave(&ibp->rvp.lock, flags);
+               if (ibp->rvp.sm_ah) {
+                       if (smlid != ibp->rvp.sm_lid)
+                               ibp->rvp.sm_ah->attr.dlid = smlid;
+                       if (msl != ibp->rvp.sm_sl)
+                               ibp->rvp.sm_ah->attr.sl = msl;
                }
-               spin_unlock_irqrestore(&ibp->lock, flags);
-               if (smlid != ibp->sm_lid)
-                       ibp->sm_lid = smlid;
-               if (msl != ibp->sm_sl)
-                       ibp->sm_sl = msl;
+               spin_unlock_irqrestore(&ibp->rvp.lock, flags);
+               if (smlid != ibp->rvp.sm_lid)
+                       ibp->rvp.sm_lid = smlid;
+               if (msl != ibp->rvp.sm_sl)
+                       ibp->rvp.sm_sl = msl;
                event.event = IB_EVENT_SM_CHANGE;
                ib_dispatch_event(&event);
        }
        ppd->port_error_action = be32_to_cpu(pi->port_error_action);
        lwe = be16_to_cpu(pi->link_width.enabled);
        if (lwe) {
-               if (lwe == OPA_LINK_WIDTH_RESET
-                               || lwe == OPA_LINK_WIDTH_RESET_OLD)
+               if (lwe == OPA_LINK_WIDTH_RESET ||
+                   lwe == OPA_LINK_WIDTH_RESET_OLD)
                        set_link_width_enabled(ppd, ppd->link_width_supported);
                else if ((lwe & ~ppd->link_width_supported) == 0)
                        set_link_width_enabled(ppd, lwe);
        }
        lwe = be16_to_cpu(pi->link_width_downgrade.enabled);
        /* LWD.E is always applied - 0 means "disabled" */
-       if (lwe == OPA_LINK_WIDTH_RESET
-                       || lwe == OPA_LINK_WIDTH_RESET_OLD) {
+       if (lwe == OPA_LINK_WIDTH_RESET ||
+           lwe == OPA_LINK_WIDTH_RESET_OLD) {
                set_link_width_downgrade_enabled(ppd,
-                               ppd->link_width_downgrade_supported);
+                                                ppd->link_width_downgrade_supported);
        } else if ((lwe & ~ppd->link_width_downgrade_supported) == 0) {
                /* only set and apply if something changed */
                if (lwe != ppd->link_width_downgrade_enabled) {
                        set_link_width_downgrade_enabled(ppd, lwe);
                        call_link_downgrade_policy = 1;
                }
-       } else
+       } else {
                smp->status |= IB_SMP_INVALID_FIELD;
+       }
        lse = be16_to_cpu(pi->link_speed.enabled);
        if (lse) {
                if (lse & be16_to_cpu(pi->link_speed.supported))
                        smp->status |= IB_SMP_INVALID_FIELD;
        }
  
-       ibp->mkeyprot = (pi->mkeyprotect_lmc & OPA_PI_MASK_MKEY_PROT_BIT) >> 6;
-       ibp->vl_high_limit = be16_to_cpu(pi->vl.high_limit) & 0xFF;
+       ibp->rvp.mkeyprot =
+               (pi->mkeyprotect_lmc & OPA_PI_MASK_MKEY_PROT_BIT) >> 6;
+       ibp->rvp.vl_high_limit = be16_to_cpu(pi->vl.high_limit) & 0xFF;
        (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_VL_HIGH_LIMIT,
-                                   ibp->vl_high_limit);
+                                   ibp->rvp.vl_high_limit);
  
-       if (ppd->vls_supported/2 > ARRAY_SIZE(pi->neigh_mtu.pvlx_to_mtu) ||
-               ppd->vls_supported > ARRAY_SIZE(dd->vld)) {
+       if (ppd->vls_supported / 2 > ARRAY_SIZE(pi->neigh_mtu.pvlx_to_mtu) ||
+           ppd->vls_supported > ARRAY_SIZE(dd->vld)) {
                smp->status |= IB_SMP_INVALID_FIELD;
                return reply((struct ib_mad_hdr *)smp);
        }
        for (i = 0; i < ppd->vls_supported; i++) {
                if ((i % 2) == 0)
-                       mtu = enum_to_mtu((pi->neigh_mtu.pvlx_to_mtu[i/2] >> 4)
-                                         & 0xF);
+                       mtu = enum_to_mtu((pi->neigh_mtu.pvlx_to_mtu[i / 2] >>
+                                          4) & 0xF);
                else
-                       mtu = enum_to_mtu(pi->neigh_mtu.pvlx_to_mtu[i/2] & 0xF);
+                       mtu = enum_to_mtu(pi->neigh_mtu.pvlx_to_mtu[i / 2] &
+                                         0xF);
                if (mtu == 0xffff) {
                        pr_warn("SubnSet(OPA_PortInfo) mtu invalid %d (0x%x)\n",
                                mtu,
                }
                if (dd->vld[i].mtu != mtu) {
                        dd_dev_info(dd,
-                               "MTU change on vl %d from %d to %d\n",
-                               i, dd->vld[i].mtu, mtu);
+                                   "MTU change on vl %d from %d to %d\n",
+                                   i, dd->vld[i].mtu, mtu);
                        dd->vld[i].mtu = mtu;
                        call_set_mtu++;
                }
        /* As per OPAV1 spec: VL15 must support and be configured
         * for operation with a 2048 or larger MTU.
         */
-       mtu = enum_to_mtu(pi->neigh_mtu.pvlx_to_mtu[15/2] & 0xF);
+       mtu = enum_to_mtu(pi->neigh_mtu.pvlx_to_mtu[15 / 2] & 0xF);
        if (mtu < 2048 || mtu == 0xffff)
                mtu = 2048;
        if (dd->vld[15].mtu != mtu) {
                dd_dev_info(dd,
-                       "MTU change on vl 15 from %d to %d\n",
-                       dd->vld[15].mtu, mtu);
+                           "MTU change on vl 15 from %d to %d\n",
+                           dd->vld[15].mtu, mtu);
                dd->vld[15].mtu = mtu;
                call_set_mtu++;
        }
                        smp->status |= IB_SMP_INVALID_FIELD;
                } else {
                        if (hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_OP_VLS,
-                                               vls) == -EINVAL)
+                                           vls) == -EINVAL)
                                smp->status |= IB_SMP_INVALID_FIELD;
                }
        }
  
        if (pi->mkey_violations == 0)
-               ibp->mkey_violations = 0;
+               ibp->rvp.mkey_violations = 0;
  
        if (pi->pkey_violations == 0)
-               ibp->pkey_violations = 0;
+               ibp->rvp.pkey_violations = 0;
  
        if (pi->qkey_violations == 0)
-               ibp->qkey_violations = 0;
+               ibp->rvp.qkey_violations = 0;
  
-       ibp->subnet_timeout =
+       ibp->rvp.subnet_timeout =
                pi->clientrereg_subnettimeout & OPA_PI_MASK_SUBNET_TIMEOUT;
  
        crc_enabled = be16_to_cpu(pi->port_ltp_crc_mode);
@@@ -1388,7 -1403,7 +1403,7 @@@ static int set_pkeys(struct hfi1_devdat
                (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_PKEYS, 0);
  
                event.event = IB_EVENT_PKEY_CHANGE;
-               event.device = &dd->verbs_dev.ibdev;
+               event.device = &dd->verbs_dev.rdi.ibdev;
                event.element.port_num = port;
                ib_dispatch_event(&event);
        }
@@@ -1402,7 -1417,7 +1417,7 @@@ static int __subn_set_opa_pkeytable(str
        struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
        u32 n_blocks_sent = OPA_AM_NBLK(am);
        u32 start_block = am & 0x7ff;
-       u16 *p = (u16 *) data;
+       u16 *p = (u16 *)data;
        __be16 *q = (__be16 *)data;
        int i;
        u16 n_blocks_avail;
                return reply((struct ib_mad_hdr *)smp);
        }
  
-       n_blocks_avail = (u16)(npkeys/OPA_PARTITION_TABLE_BLK_SIZE) + 1;
+       n_blocks_avail = (u16)(npkeys / OPA_PARTITION_TABLE_BLK_SIZE) + 1;
  
        if (start_block + n_blocks_sent > n_blocks_avail ||
            n_blocks_sent > OPA_NUM_PKEY_BLOCKS_PER_SMP) {
@@@ -1514,14 -1529,22 +1529,22 @@@ static int __subn_set_opa_sl_to_sc(stru
        struct hfi1_ibport *ibp = to_iport(ibdev, port);
        u8 *p = data;
        int i;
+       u8 sc;
  
        if (am) {
                smp->status |= IB_SMP_INVALID_FIELD;
                return reply((struct ib_mad_hdr *)smp);
        }
  
-       for (i = 0; i <  ARRAY_SIZE(ibp->sl_to_sc); i++)
-               ibp->sl_to_sc[i] = *p++;
+       for (i = 0; i <  ARRAY_SIZE(ibp->sl_to_sc); i++) {
+               sc = *p++;
+               if (ibp->sl_to_sc[i] != sc) {
+                       ibp->sl_to_sc[i] = sc;
+                       /* Put all stale qps into error state */
+                       hfi1_error_port_qps(ibp, i);
+               }
+       }
  
        return __subn_get_opa_sl_to_sc(smp, am, data, ibdev, port, resp_len);
  }
@@@ -1574,7 -1597,7 +1597,7 @@@ static int __subn_get_opa_sc_to_vlt(str
  {
        u32 n_blocks = OPA_AM_NBLK(am);
        struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       void *vp = (void *) data;
+       void *vp = (void *)data;
        size_t size = 4 * sizeof(u64);
  
        if (n_blocks != 1) {
@@@ -1597,7 -1620,7 +1620,7 @@@ static int __subn_set_opa_sc_to_vlt(str
        u32 n_blocks = OPA_AM_NBLK(am);
        int async_update = OPA_AM_ASYNC(am);
        struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       void *vp = (void *) data;
+       void *vp = (void *)data;
        struct hfi1_pportdata *ppd;
        int lstate;
  
        /* IB numbers ports from 1, hw from 0 */
        ppd = dd->pport + (port - 1);
        lstate = driver_lstate(ppd);
-       /* it's known that async_update is 0 by this point, but include
-        * the explicit check for clarity */
+       /*
+        * it's known that async_update is 0 by this point, but include
+        * the explicit check for clarity
+        */
        if (!async_update &&
            (lstate == IB_PORT_ARMED || lstate == IB_PORT_ACTIVE)) {
                smp->status |= IB_SMP_INVALID_FIELD;
@@@ -1629,7 -1654,7 +1654,7 @@@ static int __subn_get_opa_sc_to_vlnt(st
        u32 n_blocks = OPA_AM_NPORT(am);
        struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
        struct hfi1_pportdata *ppd;
-       void *vp = (void *) data;
+       void *vp = (void *)data;
        int size;
  
        if (n_blocks != 1) {
@@@ -1654,7 -1679,7 +1679,7 @@@ static int __subn_set_opa_sc_to_vlnt(st
        u32 n_blocks = OPA_AM_NPORT(am);
        struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
        struct hfi1_pportdata *ppd;
-       void *vp = (void *) data;
+       void *vp = (void *)data;
        int lstate;
  
        if (n_blocks != 1) {
@@@ -1687,7 -1712,7 +1712,7 @@@ static int __subn_get_opa_psi(struct op
        u32 lstate;
        struct hfi1_ibport *ibp;
        struct hfi1_pportdata *ppd;
-       struct opa_port_state_info *psi = (struct opa_port_state_info *) data;
+       struct opa_port_state_info *psi = (struct opa_port_state_info *)data;
  
        if (nports != 1) {
                smp->status |= IB_SMP_INVALID_FIELD;
        psi->port_states.ledenable_offlinereason |=
                ppd->is_sm_config_started << 5;
        psi->port_states.ledenable_offlinereason |=
-               ppd->offline_disabled_reason & OPA_PI_MASK_OFFLINE_REASON;
+               ppd->offline_disabled_reason;
  #else
        psi->port_states.offline_reason = ppd->neighbor_normal << 4;
        psi->port_states.offline_reason |= ppd->is_sm_config_started << 5;
-       psi->port_states.offline_reason |= ppd->offline_disabled_reason &
-                               OPA_PI_MASK_OFFLINE_REASON;
+       psi->port_states.offline_reason |= ppd->offline_disabled_reason;
  #endif /* PI_LED_ENABLE_SUP */
  
        psi->port_states.portphysstate_portstate =
@@@ -1737,7 -1761,7 +1761,7 @@@ static int __subn_set_opa_psi(struct op
        u8 ls_new, ps_new;
        struct hfi1_ibport *ibp;
        struct hfi1_pportdata *ppd;
-       struct opa_port_state_info *psi = (struct opa_port_state_info *) data;
+       struct opa_port_state_info *psi = (struct opa_port_state_info *)data;
        int ret, invalid = 0;
  
        if (nports != 1) {
@@@ -1782,14 -1806,16 +1806,16 @@@ static int __subn_get_opa_cable_info(st
        u32 len = OPA_AM_CI_LEN(am) + 1;
        int ret;
  
- #define __CI_PAGE_SIZE (1 << 7) /* 128 bytes */
+ #define __CI_PAGE_SIZE BIT(7) /* 128 bytes */
  #define __CI_PAGE_MASK ~(__CI_PAGE_SIZE - 1)
  #define __CI_PAGE_NUM(a) ((a) & __CI_PAGE_MASK)
  
-       /* check that addr is within spec, and
-        * addr and (addr + len - 1) are on the same "page" */
+       /*
+        * check that addr is within spec, and
+        * addr and (addr + len - 1) are on the same "page"
+        */
        if (addr >= 4096 ||
-               (__CI_PAGE_NUM(addr) != __CI_PAGE_NUM(addr + len - 1))) {
+           (__CI_PAGE_NUM(addr) != __CI_PAGE_NUM(addr + len - 1))) {
                smp->status |= IB_SMP_INVALID_FIELD;
                return reply((struct ib_mad_hdr *)smp);
        }
@@@ -1823,7 -1849,7 +1849,7 @@@ static int __subn_get_opa_bct(struct op
        u32 num_ports = OPA_AM_NPORT(am);
        struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
        struct hfi1_pportdata *ppd;
-       struct buffer_control *p = (struct buffer_control *) data;
+       struct buffer_control *p = (struct buffer_control *)data;
        int size;
  
        if (num_ports != 1) {
@@@ -1846,7 -1872,7 +1872,7 @@@ static int __subn_set_opa_bct(struct op
        u32 num_ports = OPA_AM_NPORT(am);
        struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
        struct hfi1_pportdata *ppd;
-       struct buffer_control *p = (struct buffer_control *) data;
+       struct buffer_control *p = (struct buffer_control *)data;
  
        if (num_ports != 1) {
                smp->status |= IB_SMP_INVALID_FIELD;
@@@ -1919,13 -1945,15 +1945,15 @@@ static int __subn_set_opa_vl_arb(struc
  
        switch (section) {
        case OPA_VLARB_LOW_ELEMENTS:
-               (void) fm_set_table(ppd, FM_TBL_VL_LOW_ARB, p);
+               (void)fm_set_table(ppd, FM_TBL_VL_LOW_ARB, p);
                break;
        case OPA_VLARB_HIGH_ELEMENTS:
-               (void) fm_set_table(ppd, FM_TBL_VL_HIGH_ARB, p);
+               (void)fm_set_table(ppd, FM_TBL_VL_HIGH_ARB, p);
                break;
-       /* neither OPA_VLARB_PREEMPT_ELEMENTS, or OPA_VLARB_PREEMPT_MATRIX
-        * can be changed from the default values */
+       /*
+        * neither OPA_VLARB_PREEMPT_ELEMENTS, or OPA_VLARB_PREEMPT_MATRIX
+        * can be changed from the default values
+        */
        case OPA_VLARB_PREEMPT_ELEMENTS:
                /* FALLTHROUGH */
        case OPA_VLARB_PREEMPT_MATRIX:
@@@ -2137,8 -2165,10 +2165,10 @@@ struct opa_port_data_counters_msg 
  };
  
  struct opa_port_error_counters64_msg {
-       /* Request contains first two fields, response contains the
-        * whole magilla */
+       /*
+        * Request contains first two fields, response contains the
+        * whole magilla
+        */
        __be64 port_select_mask[4];
        __be32 vl_select_mask;
  
@@@ -2172,7 -2202,6 +2202,6 @@@ struct opa_port_error_info_msg 
        __be32 error_info_select_mask;
        __be32 reserved1;
        struct _port_ei {
                u8 port_number;
                u8 reserved2[7];
  
@@@ -2251,7 -2280,7 +2280,7 @@@ enum error_info_selects 
  };
  
  static int pma_get_opa_classportinfo(struct opa_pma_mad *pmp,
-                               struct ib_device *ibdev, u32 *resp_len)
+                                    struct ib_device *ibdev, u32 *resp_len)
  {
        struct opa_class_port_info *p =
                (struct opa_class_port_info *)pmp->data;
@@@ -2299,9 -2328,9 +2328,9 @@@ static void a0_portstatus(struct hfi1_p
        }
  }
  
  static int pma_get_opa_portstatus(struct opa_pma_mad *pmp,
-                       struct ib_device *ibdev, u8 port, u32 *resp_len)
+                                 struct ib_device *ibdev,
+                                 u8 port, u32 *resp_len)
  {
        struct opa_port_status_req *req =
                (struct opa_port_status_req *)pmp->data;
                return reply((struct ib_mad_hdr *)pmp);
        }
  
-       if (nports != 1 || (port_num && port_num != port)
-           || num_vls > OPA_MAX_VLS || (vl_select_mask & ~VL_MASK_ALL)) {
+       if (nports != 1 || (port_num && port_num != port) ||
+           num_vls > OPA_MAX_VLS || (vl_select_mask & ~VL_MASK_ALL)) {
                pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
                return reply((struct ib_mad_hdr *)pmp);
        }
                                         CNTR_INVALID_VL));
        rsp->port_multicast_xmit_pkts =
                cpu_to_be64(read_dev_cntr(dd, C_DC_MC_XMIT_PKTS,
-                                       CNTR_INVALID_VL));
+                                         CNTR_INVALID_VL));
        rsp->port_multicast_rcv_pkts =
                cpu_to_be64(read_dev_cntr(dd, C_DC_MC_RCV_PKTS,
                                          CNTR_INVALID_VL));
        }
        tmp = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL);
        tmp2 = tmp + read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT,
-                                       CNTR_INVALID_VL);
+                                  CNTR_INVALID_VL);
        if (tmp2 > (u32)UINT_MAX || tmp2 < tmp) {
                /* overflow/wrapped */
                rsp->link_error_recovery = cpu_to_be32(~0);
                cpu_to_be64(read_dev_cntr(dd, C_DC_FM_CFG_ERR,
                                          CNTR_INVALID_VL));
        rsp->link_downed = cpu_to_be32(read_port_cntr(ppd, C_SW_LINK_DOWN,
-                                         CNTR_INVALID_VL));
+                                                     CNTR_INVALID_VL));
  
        /* rsp->uncorrectable_errors is 8 bits wide, and it pegs at 0xff */
        tmp = read_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL);
        rsp->uncorrectable_errors = tmp < 0x100 ? (tmp & 0xff) : 0xff;
  
-       vlinfo = &(rsp->vls[0]);
+       vlinfo = &rsp->vls[0];
        vfi = 0;
        /* The vl_select_mask has been checked above, and we know
         * that it contains only entries which represent valid VLs.
  
                rsp->vls[vfi].port_vl_rcv_pkts =
                        cpu_to_be64(read_dev_cntr(dd, C_DC_RX_PKT_VL,
-                                       idx_from_vl(vl)));
+                                                 idx_from_vl(vl)));
  
                rsp->vls[vfi].port_vl_xmit_data =
                        cpu_to_be64(read_port_cntr(ppd, C_TX_FLIT_VL,
-                                       idx_from_vl(vl)));
+                                                  idx_from_vl(vl)));
  
                rsp->vls[vfi].port_vl_xmit_pkts =
                        cpu_to_be64(read_port_cntr(ppd, C_TX_PKT_VL,
-                                       idx_from_vl(vl)));
+                                                  idx_from_vl(vl)));
  
                rsp->vls[vfi].port_vl_xmit_wait =
                        cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT_VL,
-                                       idx_from_vl(vl)));
+                                                  idx_from_vl(vl)));
  
                rsp->vls[vfi].port_vl_rcv_fecn =
                        cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN_VL,
-                                       idx_from_vl(vl)));
+                                                 idx_from_vl(vl)));
  
                rsp->vls[vfi].port_vl_rcv_becn =
                        cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN_VL,
-                                       idx_from_vl(vl)));
+                                                 idx_from_vl(vl)));
  
                vlinfo++;
                vfi++;
@@@ -2473,7 -2502,7 +2502,7 @@@ static u64 get_error_counter_summary(st
        error_counter_summary += read_port_cntr(ppd, C_SW_XMIT_CSTR_ERR,
                                                CNTR_INVALID_VL);
        error_counter_summary += read_dev_cntr(dd, C_DC_RMT_PHY_ERR,
-                                               CNTR_INVALID_VL);
+                                              CNTR_INVALID_VL);
        /* local link integrity must be right-shifted by the lli resolution */
        tmp = read_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL);
        tmp += read_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL);
        tmp += read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, CNTR_INVALID_VL);
        error_counter_summary += (tmp >> res_ler);
        error_counter_summary += read_dev_cntr(dd, C_DC_RCV_ERR,
-                                               CNTR_INVALID_VL);
+                                              CNTR_INVALID_VL);
        error_counter_summary += read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL);
        error_counter_summary += read_dev_cntr(dd, C_DC_FM_CFG_ERR,
-                                               CNTR_INVALID_VL);
+                                              CNTR_INVALID_VL);
        /* ppd->link_downed is a 32-bit value */
        error_counter_summary += read_port_cntr(ppd, C_SW_LINK_DOWN,
                                                CNTR_INVALID_VL);
@@@ -2512,7 -2541,7 +2541,7 @@@ static void a0_datacounters(struct hfi1
                                                 idx_from_vl(vl));
                        if (tmp < sum_vl_xmit_wait) {
                                /* we wrapped */
-                               sum_vl_xmit_wait = (u64) ~0;
+                               sum_vl_xmit_wait = (u64)~0;
                                break;
                        }
                        sum_vl_xmit_wait = tmp;
        }
  }
  
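+ /*
+  * Fill in the per-port data counters (transmit/receive data, packets,
+  * and multicast packets) that are shared by the OPA DataPortCounters
+  * response and the IB PortCountersExt response.
+  */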
+ static void pma_get_opa_port_dctrs(struct ib_device *ibdev,
+                                  struct _port_dctrs *rsp)
+ {
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ 
+       rsp->port_xmit_data = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_FLITS,
+                                               CNTR_INVALID_VL));
+       rsp->port_rcv_data = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FLITS,
+                                               CNTR_INVALID_VL));
+       rsp->port_xmit_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_PKTS,
+                                               CNTR_INVALID_VL));
+       rsp->port_rcv_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_PKTS,
+                                               CNTR_INVALID_VL));
+       rsp->port_multicast_xmit_pkts =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_MC_XMIT_PKTS,
+                                         CNTR_INVALID_VL));
+       rsp->port_multicast_rcv_pkts =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_MC_RCV_PKTS,
+                                         CNTR_INVALID_VL));
+ }
  static int pma_get_opa_datacounters(struct opa_pma_mad *pmp,
-                       struct ib_device *ibdev, u8 port, u32 *resp_len)
+                                   struct ib_device *ibdev,
+                                   u8 port, u32 *resp_len)
  {
        struct opa_port_data_counters_msg *req =
                (struct opa_port_data_counters_msg *)pmp->data;
                return reply((struct ib_mad_hdr *)pmp);
        }
  
 -      rsp = (struct _port_dctrs *)&req->port[0];
 +      rsp = &req->port[0];
        memset(rsp, 0, sizeof(*rsp));
  
        rsp->port_number = port;
         */
        hfi1_read_link_quality(dd, &lq);
        rsp->link_quality_indicator = cpu_to_be32((u32)lq);
+       pma_get_opa_port_dctrs(ibdev, rsp);
  
-       /* rsp->sw_port_congestion is 0 for HFIs */
-       /* rsp->port_xmit_time_cong is 0 for HFIs */
-       /* rsp->port_xmit_wasted_bw ??? */
-       /* rsp->port_xmit_wait_data ??? */
-       /* rsp->port_mark_fecn is 0 for HFIs */
-       rsp->port_xmit_data = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_FLITS,
-                                               CNTR_INVALID_VL));
-       rsp->port_rcv_data = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FLITS,
-                                               CNTR_INVALID_VL));
-       rsp->port_xmit_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_PKTS,
-                                               CNTR_INVALID_VL));
-       rsp->port_rcv_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_PKTS,
-                                               CNTR_INVALID_VL));
-       rsp->port_multicast_xmit_pkts =
-               cpu_to_be64(read_dev_cntr(dd, C_DC_MC_XMIT_PKTS,
-                                               CNTR_INVALID_VL));
-       rsp->port_multicast_rcv_pkts =
-               cpu_to_be64(read_dev_cntr(dd, C_DC_MC_RCV_PKTS,
-                                               CNTR_INVALID_VL));
        rsp->port_xmit_wait =
                cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT, CNTR_INVALID_VL));
        rsp->port_rcv_fecn =
                cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN, CNTR_INVALID_VL));
        rsp->port_rcv_becn =
                cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN, CNTR_INVALID_VL));
        rsp->port_error_counter_summary =
                cpu_to_be64(get_error_counter_summary(ibdev, port,
                                                      res_lli, res_ler));
  
-       vlinfo = &(rsp->vls[0]);
+       vlinfo = &rsp->vls[0];
        vfi = 0;
        /* The vl_select_mask has been checked above, and we know
         * that it contains only entries which represent valid VLs.
         * any additional checks for vl.
         */
        for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
-                8 * sizeof(req->vl_select_mask)) {
+                        8 * sizeof(req->vl_select_mask)) {
                memset(vlinfo, 0, sizeof(*vlinfo));
  
                rsp->vls[vfi].port_vl_xmit_data =
                        cpu_to_be64(read_port_cntr(ppd, C_TX_FLIT_VL,
-                                                       idx_from_vl(vl)));
+                                                  idx_from_vl(vl)));
  
                rsp->vls[vfi].port_vl_rcv_data =
                        cpu_to_be64(read_dev_cntr(dd, C_DC_RX_FLIT_VL,
-                                                       idx_from_vl(vl)));
+                                                 idx_from_vl(vl)));
  
                rsp->vls[vfi].port_vl_xmit_pkts =
                        cpu_to_be64(read_port_cntr(ppd, C_TX_PKT_VL,
-                                                       idx_from_vl(vl)));
+                                                  idx_from_vl(vl)));
  
                rsp->vls[vfi].port_vl_rcv_pkts =
                        cpu_to_be64(read_dev_cntr(dd, C_DC_RX_PKT_VL,
-                                                       idx_from_vl(vl)));
+                                                 idx_from_vl(vl)));
  
                rsp->vls[vfi].port_vl_xmit_wait =
                        cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT_VL,
-                                                       idx_from_vl(vl)));
+                                                  idx_from_vl(vl)));
  
                rsp->vls[vfi].port_vl_rcv_fecn =
                        cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN_VL,
-                                                       idx_from_vl(vl)));
+                                                 idx_from_vl(vl)));
                rsp->vls[vfi].port_vl_rcv_becn =
                        cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN_VL,
-                                                       idx_from_vl(vl)));
+                                                 idx_from_vl(vl)));
  
                /* rsp->port_vl_xmit_time_cong is 0 for HFIs */
                /* rsp->port_vl_xmit_wasted_bw ??? */
                /* port_vl_xmit_wait_data - TXE (table 13-9 HFI spec) ???
-                * does this differ from rsp->vls[vfi].port_vl_xmit_wait */
+                * does this differ from rsp->vls[vfi].port_vl_xmit_wait
+                */
                /*rsp->vls[vfi].port_vl_mark_fecn =
-                       cpu_to_be64(read_csr(dd, DCC_PRF_PORT_VL_MARK_FECN_CNT
-                               + offset));
-               */
+                *      cpu_to_be64(read_csr(dd, DCC_PRF_PORT_VL_MARK_FECN_CNT
+                *              + offset));
+                */
                vlinfo++;
                vfi++;
        }
        return reply((struct ib_mad_hdr *)pmp);
  }
  
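+ /*
+  * Service an IB PortCountersExt query by reusing the common OPA data
+  * counter helper and copying the results into the IB-format response.
+  */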
+ static int pma_get_ib_portcounters_ext(struct ib_pma_mad *pmp,
+                                      struct ib_device *ibdev, u8 port)
+ {
+       struct ib_pma_portcounters_ext *p = (struct ib_pma_portcounters_ext *)
+                                               pmp->data;
+       struct _port_dctrs rsp;
+ 
+       if (pmp->mad_hdr.attr_mod != 0 || p->port_select != port) {
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+               goto bail;
+       }
+       memset(&rsp, 0, sizeof(rsp));
+       pma_get_opa_port_dctrs(ibdev, &rsp);
+       p->port_xmit_data = rsp.port_xmit_data;
+       p->port_rcv_data = rsp.port_rcv_data;
+       p->port_xmit_packets = rsp.port_xmit_pkts;
+       p->port_rcv_packets = rsp.port_rcv_pkts;
+       p->port_unicast_xmit_packets = 0;
+       p->port_unicast_rcv_packets =  0;
+       p->port_multicast_xmit_packets = rsp.port_multicast_xmit_pkts;
+       p->port_multicast_rcv_packets = rsp.port_multicast_rcv_pkts;
+ bail:
+       return reply((struct ib_mad_hdr *)pmp);
+ }
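+ 
+ /*
+  * Fill in the per-port error counters (link error recovery, link downed,
+  * receive errors, discards, constraint errors, link integrity errors and
+  * buffer overruns) that are shared by the OPA ErrorPortCounters response
+  * and the IB PortCounters response.
+  */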
+ static void pma_get_opa_port_ectrs(struct ib_device *ibdev,
+                                  struct _port_ectrs *rsp, u8 port)
+ {
+       u64 tmp, tmp2;
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ 
+       tmp = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL);
+       tmp2 = tmp + read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT,
+                                       CNTR_INVALID_VL);
+       if (tmp2 > (u32)UINT_MAX || tmp2 < tmp) {
+               /* overflow/wrapped */
+               rsp->link_error_recovery = cpu_to_be32(~0);
+       } else {
+               rsp->link_error_recovery = cpu_to_be32(tmp2);
+       }
+       rsp->link_downed = cpu_to_be32(read_port_cntr(ppd, C_SW_LINK_DOWN,
+                                               CNTR_INVALID_VL));
+       rsp->port_rcv_errors =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL));
+       rsp->port_rcv_remote_physical_errors =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_RMT_PHY_ERR,
+                                         CNTR_INVALID_VL));
+       rsp->port_rcv_switch_relay_errors = 0;
+       rsp->port_xmit_discards =
+               cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_DSCD,
+                                          CNTR_INVALID_VL));
+       rsp->port_xmit_constraint_errors =
+               cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_CSTR_ERR,
+                                          CNTR_INVALID_VL));
+       rsp->port_rcv_constraint_errors =
+               cpu_to_be64(read_port_cntr(ppd, C_SW_RCV_CSTR_ERR,
+                                          CNTR_INVALID_VL));
+       tmp = read_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL);
+       tmp2 = tmp + read_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL);
+       if (tmp2 < tmp) {
+               /* overflow/wrapped */
+               rsp->local_link_integrity_errors = cpu_to_be64(~0);
+       } else {
+               rsp->local_link_integrity_errors = cpu_to_be64(tmp2);
+       }
+       rsp->excessive_buffer_overruns =
+               cpu_to_be64(read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL));
+ }
  static int pma_get_opa_porterrors(struct opa_pma_mad *pmp,
-                       struct ib_device *ibdev, u8 port, u32 *resp_len)
+                                 struct ib_device *ibdev,
+                                 u8 port, u32 *resp_len)
  {
        size_t response_data_size;
        struct _port_ectrs *rsp;
-       unsigned long port_num;
+       u8 port_num;
        struct opa_port_error_counters64_msg *req;
        struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
        u32 num_ports;
        struct hfi1_pportdata *ppd;
        struct _vls_ectrs *vlinfo;
        unsigned long vl;
-       u64 port_mask, tmp, tmp2;
+       u64 port_mask, tmp;
        u32 vl_select_mask;
        int vfi;
  
         */
        port_mask = be64_to_cpu(req->port_select_mask[3]);
        port_num = find_first_bit((unsigned long *)&port_mask,
-                                       sizeof(port_mask));
+                                 sizeof(port_mask));
  
-       if ((u8)port_num != port) {
+       if (port_num != port) {
                pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
                return reply((struct ib_mad_hdr *)pmp);
        }
  
 -      rsp = (struct _port_ectrs *)&req->port[0];
 +      rsp = &req->port[0];
  
        ibp = to_iport(ibdev, port_num);
        ppd = ppd_from_ibp(ibp);
  
        memset(rsp, 0, sizeof(*rsp));
-       rsp->port_number = (u8)port_num;
+       rsp->port_number = port_num;
+       pma_get_opa_port_ectrs(ibdev, rsp, port_num);
  
-       rsp->port_rcv_constraint_errors =
-               cpu_to_be64(read_port_cntr(ppd, C_SW_RCV_CSTR_ERR,
-                                          CNTR_INVALID_VL));
-       /* port_rcv_switch_relay_errors is 0 for HFIs */
-       rsp->port_xmit_discards =
-               cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_DSCD,
-                                               CNTR_INVALID_VL));
        rsp->port_rcv_remote_physical_errors =
                cpu_to_be64(read_dev_cntr(dd, C_DC_RMT_PHY_ERR,
-                                               CNTR_INVALID_VL));
-       tmp = read_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL);
-       tmp2 = tmp + read_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL);
-       if (tmp2 < tmp) {
-               /* overflow/wrapped */
-               rsp->local_link_integrity_errors = cpu_to_be64(~0);
-       } else {
-               rsp->local_link_integrity_errors = cpu_to_be64(tmp2);
-       }
-       tmp = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL);
-       tmp2 = tmp + read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT,
-                                       CNTR_INVALID_VL);
-       if (tmp2 > (u32)UINT_MAX || tmp2 < tmp) {
-               /* overflow/wrapped */
-               rsp->link_error_recovery = cpu_to_be32(~0);
-       } else {
-               rsp->link_error_recovery = cpu_to_be32(tmp2);
-       }
-       rsp->port_xmit_constraint_errors =
-               cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_CSTR_ERR,
-                                          CNTR_INVALID_VL));
-       rsp->excessive_buffer_overruns =
-               cpu_to_be64(read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL));
+                                         CNTR_INVALID_VL));
        rsp->fm_config_errors =
                cpu_to_be64(read_dev_cntr(dd, C_DC_FM_CFG_ERR,
-                                               CNTR_INVALID_VL));
-       rsp->link_downed = cpu_to_be32(read_port_cntr(ppd, C_SW_LINK_DOWN,
-                                               CNTR_INVALID_VL));
+                                         CNTR_INVALID_VL));
        tmp = read_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL);
        rsp->uncorrectable_errors = tmp < 0x100 ? (tmp & 0xff) : 0xff;
  
 -      vlinfo = (struct _vls_ectrs *)&rsp->vls[0];
 +      vlinfo = &rsp->vls[0];
        vfi = 0;
        vl_select_mask = be32_to_cpu(req->vl_select_mask);
        for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
        return reply((struct ib_mad_hdr *)pmp);
  }
  
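+ /*
+  * Service an IB PortCounters query: gather the 64-bit OPA error counters
+  * and saturate them into the narrower IB counter fields.
+  */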
+ static int pma_get_ib_portcounters(struct ib_pma_mad *pmp,
+                                  struct ib_device *ibdev, u8 port)
+ {
+       struct ib_pma_portcounters *p = (struct ib_pma_portcounters *)
+               pmp->data;
+       struct _port_ectrs rsp;
+       u64 temp_link_overrun_errors;
+       u64 temp_64;
+       u32 temp_32;
+ 
+       memset(&rsp, 0, sizeof(rsp));
+       pma_get_opa_port_ectrs(ibdev, &rsp, port);
+       if (pmp->mad_hdr.attr_mod != 0 || p->port_select != port) {
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+               goto bail;
+       }
+       p->symbol_error_counter = 0; /* N/A for OPA */
+       temp_32 = be32_to_cpu(rsp.link_error_recovery);
+       if (temp_32 > 0xFFUL)
+               p->link_error_recovery_counter = 0xFF;
+       else
+               p->link_error_recovery_counter = (u8)temp_32;
+       temp_32 = be32_to_cpu(rsp.link_downed);
+       if (temp_32 > 0xFFUL)
+               p->link_downed_counter = 0xFF;
+       else
+               p->link_downed_counter = (u8)temp_32;
+       temp_64 = be64_to_cpu(rsp.port_rcv_errors);
+       if (temp_64 > 0xFFFFUL)
+               p->port_rcv_errors = cpu_to_be16(0xFFFF);
+       else
+               p->port_rcv_errors = cpu_to_be16((u16)temp_64);
+       temp_64 = be64_to_cpu(rsp.port_rcv_remote_physical_errors);
+       if (temp_64 > 0xFFFFUL)
+               p->port_rcv_remphys_errors = cpu_to_be16(0xFFFF);
+       else
+               p->port_rcv_remphys_errors = cpu_to_be16((u16)temp_64);
+       temp_64 = be64_to_cpu(rsp.port_rcv_switch_relay_errors);
+       p->port_rcv_switch_relay_errors = cpu_to_be16((u16)temp_64);
+       temp_64 = be64_to_cpu(rsp.port_xmit_discards);
+       if (temp_64 > 0xFFFFUL)
+               p->port_xmit_discards = cpu_to_be16(0xFFFF);
+       else
+               p->port_xmit_discards = cpu_to_be16((u16)temp_64);
+       temp_64 = be64_to_cpu(rsp.port_xmit_constraint_errors);
+       if (temp_64 > 0xFFUL)
+               p->port_xmit_constraint_errors = 0xFF;
+       else
+               p->port_xmit_constraint_errors = (u8)temp_64;
+       temp_64 = be64_to_cpu(rsp.port_rcv_constraint_errors);
+       if (temp_64 > 0xFFUL)
+               p->port_rcv_constraint_errors = 0xFFUL;
+       else
+               p->port_rcv_constraint_errors = (u8)temp_64;
+       /* LocalLink: 7:4, BufferOverrun: 3:0 */
+       temp_64 = be64_to_cpu(rsp.local_link_integrity_errors);
+       if (temp_64 > 0xFUL)
+               temp_64 = 0xFUL;
+       temp_link_overrun_errors = temp_64 << 4;
+       temp_64 = be64_to_cpu(rsp.excessive_buffer_overruns);
+       if (temp_64 > 0xFUL)
+               temp_64 = 0xFUL;
+       temp_link_overrun_errors |= temp_64;
+       p->link_overrun_errors = (u8)temp_link_overrun_errors;
+       p->vl15_dropped = 0; /* N/A for OPA */
+ bail:
+       return reply((struct ib_mad_hdr *)pmp);
+ }
  static int pma_get_opa_errorinfo(struct opa_pma_mad *pmp,
-                       struct ib_device *ibdev, u8 port, u32 *resp_len)
+                                struct ib_device *ibdev,
+                                u8 port, u32 *resp_len)
  {
        size_t response_data_size;
        struct _port_ei *rsp;
        struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
        u64 port_mask;
        u32 num_ports;
-       unsigned long port_num;
+       u8 port_num;
        u8 num_pslm;
        u64 reg;
  
        req = (struct opa_port_error_info_msg *)pmp->data;
 -      rsp = (struct _port_ei *)&req->port[0];
 +      rsp = &req->port[0];
  
        num_ports = OPA_AM_NPORT(be32_to_cpu(pmp->mad_hdr.attr_mod));
        num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3]));
        port_num = find_first_bit((unsigned long *)&port_mask,
                                  sizeof(port_mask));
  
-       if ((u8)port_num != port) {
+       if (port_num != port) {
                pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
                return reply((struct ib_mad_hdr *)pmp);
        }
        rsp->port_rcv_ei.status_and_code =
                dd->err_info_rcvport.status_and_code;
        memcpy(&rsp->port_rcv_ei.ei.ei1to12.packet_flit1,
-               &dd->err_info_rcvport.packet_flit1, sizeof(u64));
+              &dd->err_info_rcvport.packet_flit1, sizeof(u64));
        memcpy(&rsp->port_rcv_ei.ei.ei1to12.packet_flit2,
-               &dd->err_info_rcvport.packet_flit2, sizeof(u64));
+              &dd->err_info_rcvport.packet_flit2, sizeof(u64));
  
        /* ExcessiveBufferOverrunInfo */
        reg = read_csr(dd, RCV_ERR_INFO);
        if (reg & RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK) {
-               /* if the RcvExcessBufferOverrun bit is set, save SC of
-                * first pkt that encountered an excess buffer overrun */
+               /*
+                * if the RcvExcessBufferOverrun bit is set, save SC of
+                * first pkt that encountered an excess buffer overrun
+                */
                u8 tmp = (u8)reg;
  
                tmp &=  RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SC_SMASK;
  }
  
  static int pma_set_opa_portstatus(struct opa_pma_mad *pmp,
-                       struct ib_device *ibdev, u8 port, u32 *resp_len)
+                                 struct ib_device *ibdev,
+                                 u8 port, u32 *resp_len)
  {
        struct opa_clear_port_status *req =
                (struct opa_clear_port_status *)pmp->data;
                write_dev_cntr(dd, C_DC_RCV_BBL, CNTR_INVALID_VL, 0);
  
        /* Only applicable for switch */
-       /*if (counter_select & CS_PORT_MARK_FECN)
-               write_csr(dd, DCC_PRF_PORT_MARK_FECN_CNT, 0);*/
+       /* if (counter_select & CS_PORT_MARK_FECN)
+        *      write_csr(dd, DCC_PRF_PORT_MARK_FECN_CNT, 0);
+        */
  
        if (counter_select & CS_PORT_RCV_CONSTRAINT_ERRORS)
                write_port_cntr(ppd, C_SW_RCV_CSTR_ERR, CNTR_INVALID_VL, 0);
        if (counter_select & CS_LINK_ERROR_RECOVERY) {
                write_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL, 0);
                write_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT,
-                                               CNTR_INVALID_VL, 0);
+                              CNTR_INVALID_VL, 0);
        }
  
        if (counter_select & CS_PORT_RCV_ERRORS)
  
        for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
                         8 * sizeof(vl_select_mask)) {
                if (counter_select & CS_PORT_XMIT_DATA)
                        write_port_cntr(ppd, C_TX_FLIT_VL, idx_from_vl(vl), 0);
  
                if (counter_select & CS_PORT_RCV_BUBBLE)
                        write_dev_cntr(dd, C_DC_RCV_BBL_VL, idx_from_vl(vl), 0);
  
-               /*if (counter_select & CS_PORT_MARK_FECN)
-                    write_csr(dd, DCC_PRF_PORT_VL_MARK_FECN_CNT + offset, 0);
-               */
+               /* if (counter_select & CS_PORT_MARK_FECN)
+                    write_csr(dd, DCC_PRF_PORT_VL_MARK_FECN_CNT + offset, 0);
+                */
                /* port_vl_xmit_discards ??? */
        }
  
  }
  
  static int pma_set_opa_errorinfo(struct opa_pma_mad *pmp,
-                       struct ib_device *ibdev, u8 port, u32 *resp_len)
+                                struct ib_device *ibdev,
+                                u8 port, u32 *resp_len)
  {
        struct _port_ei *rsp;
        struct opa_port_error_info_msg *req;
        struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
        u64 port_mask;
        u32 num_ports;
-       unsigned long port_num;
+       u8 port_num;
        u8 num_pslm;
        u32 error_info_select;
  
        req = (struct opa_port_error_info_msg *)pmp->data;
 -      rsp = (struct _port_ei *)&req->port[0];
 +      rsp = &req->port[0];
  
        num_ports = OPA_AM_NPORT(be32_to_cpu(pmp->mad_hdr.attr_mod));
        num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3]));
        port_num = find_first_bit((unsigned long *)&port_mask,
                                  sizeof(port_mask));
  
-       if ((u8)port_num != port) {
+       if (port_num != port) {
                pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
                return reply((struct ib_mad_hdr *)pmp);
        }
  
        /* ExcessiveBufferOverrunInfo */
        if (error_info_select & ES_EXCESSIVE_BUFFER_OVERRUN_INFO)
-               /* status bit is essentially kept in the h/w - bit 5 of
-                * RCV_ERR_INFO */
+               /*
+                * status bit is essentially kept in the h/w - bit 5 of
+                * RCV_ERR_INFO
+                */
                write_csr(dd, RCV_ERR_INFO,
                          RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK);
  
@@@ -3138,13 -3310,12 +3310,12 @@@ static int __subn_get_opa_cong_info(str
  }
  
  static int __subn_get_opa_cong_setting(struct opa_smp *smp, u32 am,
-                                            u8 *data,
-                                            struct ib_device *ibdev,
-                                            u8 port, u32 *resp_len)
+                                      u8 *data, struct ib_device *ibdev,
+                                      u8 port, u32 *resp_len)
  {
        int i;
        struct opa_congestion_setting_attr *p =
-               (struct opa_congestion_setting_attr *) data;
+               (struct opa_congestion_setting_attr *)data;
        struct hfi1_ibport *ibp = to_iport(ibdev, port);
        struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
        struct opa_congestion_setting_entry_shadow *entries;
  
        cc_state = get_cc_state(ppd);
  
-       if (cc_state == NULL) {
+       if (!cc_state) {
                rcu_read_unlock();
                return reply((struct ib_mad_hdr *)smp);
        }
@@@ -3183,7 -3354,7 +3354,7 @@@ static int __subn_set_opa_cong_setting(
                                       u32 *resp_len)
  {
        struct opa_congestion_setting_attr *p =
-               (struct opa_congestion_setting_attr *) data;
+               (struct opa_congestion_setting_attr *)data;
        struct hfi1_ibport *ibp = to_iport(ibdev, port);
        struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
        struct opa_congestion_setting_entry_shadow *entries;
@@@ -3245,7 -3416,7 +3416,7 @@@ static int __subn_get_opa_hfi1_cong_log
                        continue;
                memcpy(cong_log->events[i].local_qp_cn_entry, &cce->lqpn, 3);
                memcpy(cong_log->events[i].remote_qp_number_cn_entry,
-                       &cce->rqpn, 3);
+                      &cce->rqpn, 3);
                cong_log->events[i].sl_svc_type_cn_entry =
                        ((cce->sl & 0x1f) << 3) | (cce->svc_type & 0x7);
                cong_log->events[i].remote_lid_cn_entry =
@@@ -3275,7 -3446,7 +3446,7 @@@ static int __subn_get_opa_cc_table(stru
                                   u32 *resp_len)
  {
        struct ib_cc_table_attr *cc_table_attr =
-               (struct ib_cc_table_attr *) data;
+               (struct ib_cc_table_attr *)data;
        struct hfi1_ibport *ibp = to_iport(ibdev, port);
        struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
        u32 start_block = OPA_AM_START_BLK(am);
  
        cc_state = get_cc_state(ppd);
  
-       if (cc_state == NULL) {
+       if (!cc_state) {
                rcu_read_unlock();
                return reply((struct ib_mad_hdr *)smp);
        }
        rcu_read_unlock();
  
        if (resp_len)
-               *resp_len += sizeof(u16)*(IB_CCT_ENTRIES * n_blocks + 1);
+               *resp_len += sizeof(u16) * (IB_CCT_ENTRIES * n_blocks + 1);
  
        return reply((struct ib_mad_hdr *)smp);
  }
@@@ -3332,7 -3503,7 +3503,7 @@@ static int __subn_set_opa_cc_table(stru
                                   struct ib_device *ibdev, u8 port,
                                   u32 *resp_len)
  {
-       struct ib_cc_table_attr *p = (struct ib_cc_table_attr *) data;
+       struct ib_cc_table_attr *p = (struct ib_cc_table_attr *)data;
        struct hfi1_ibport *ibp = to_iport(ibdev, port);
        struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
        u32 start_block = OPA_AM_START_BLK(am);
        }
  
        new_cc_state = kzalloc(sizeof(*new_cc_state), GFP_KERNEL);
-       if (new_cc_state == NULL)
+       if (!new_cc_state)
                goto getit;
  
        spin_lock(&ppd->cc_state_lock);
  
        old_cc_state = get_cc_state(ppd);
  
-       if (old_cc_state == NULL) {
+       if (!old_cc_state) {
                spin_unlock(&ppd->cc_state_lock);
                kfree(new_cc_state);
                return reply((struct ib_mad_hdr *)smp);
@@@ -3409,26 -3580,31 +3580,31 @@@ struct opa_led_info 
  };
  
  #define OPA_LED_SHIFT 31
- #define OPA_LED_MASK  (1 << OPA_LED_SHIFT)
+ #define OPA_LED_MASK  BIT(OPA_LED_SHIFT)
  
  static int __subn_get_opa_led_info(struct opa_smp *smp, u32 am, u8 *data,
                                   struct ib_device *ibdev, u8 port,
                                   u32 *resp_len)
  {
        struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       struct opa_led_info *p = (struct opa_led_info *) data;
+       struct hfi1_pportdata *ppd = dd->pport;
+       struct opa_led_info *p = (struct opa_led_info *)data;
        u32 nport = OPA_AM_NPORT(am);
-       u64 reg;
+       u32 is_beaconing_active;
  
        if (nport != 1) {
                smp->status |= IB_SMP_INVALID_FIELD;
                return reply((struct ib_mad_hdr *)smp);
        }
  
-       reg = read_csr(dd, DCC_CFG_LED_CNTRL);
-       if ((reg & DCC_CFG_LED_CNTRL_LED_CNTRL_SMASK) &&
-               ((reg & DCC_CFG_LED_CNTRL_LED_SW_BLINK_RATE_SMASK) == 0xf))
-                       p->rsvd_led_mask = cpu_to_be32(OPA_LED_MASK);
+       /*
+        * This pairs with the memory barrier in hfi1_start_led_override to
+        * ensure that we read the correct state of LED beaconing represented
+        * by led_override_timer_active
+        */
+       smp_rmb();
+       is_beaconing_active = !!atomic_read(&ppd->led_override_timer_active);
+       p->rsvd_led_mask = cpu_to_be32(is_beaconing_active << OPA_LED_SHIFT);
  
        if (resp_len)
                *resp_len += sizeof(struct opa_led_info);
@@@ -3441,7 -3617,7 +3617,7 @@@ static int __subn_set_opa_led_info(stru
                                   u32 *resp_len)
  {
        struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       struct opa_led_info *p = (struct opa_led_info *) data;
+       struct opa_led_info *p = (struct opa_led_info *)data;
        u32 nport = OPA_AM_NPORT(am);
        int on = !!(be32_to_cpu(p->rsvd_led_mask) & OPA_LED_MASK);
  
                return reply((struct ib_mad_hdr *)smp);
        }
  
-       setextled(dd, on);
+       if (on)
+               hfi1_start_led_override(dd->pport, 2000, 1500);
+       else
+               shutdown_led_override(dd->pport);
  
        return __subn_get_opa_led_info(smp, am, data, ibdev, port, resp_len);
  }
@@@ -3493,7 -3672,7 +3672,7 @@@ static int subn_get_opa_sma(__be16 attr
                break;
        case OPA_ATTRIB_ID_SC_TO_VLNT_MAP:
                ret = __subn_get_opa_sc_to_vlnt(smp, am, data, ibdev, port,
-                                              resp_len);
+                                               resp_len);
                break;
        case OPA_ATTRIB_ID_PORT_STATE_INFO:
                ret = __subn_get_opa_psi(smp, am, data, ibdev, port,
                                              resp_len);
                break;
        case IB_SMP_ATTR_SM_INFO:
-               if (ibp->port_cap_flags & IB_PORT_SM_DISABLED)
+               if (ibp->rvp.port_cap_flags & IB_PORT_SM_DISABLED)
                        return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
-               if (ibp->port_cap_flags & IB_PORT_SM)
+               if (ibp->rvp.port_cap_flags & IB_PORT_SM)
                        return IB_MAD_RESULT_SUCCESS;
                /* FALLTHROUGH */
        default:
@@@ -3575,7 -3754,7 +3754,7 @@@ static int subn_set_opa_sma(__be16 attr
                break;
        case OPA_ATTRIB_ID_SC_TO_VLNT_MAP:
                ret = __subn_set_opa_sc_to_vlnt(smp, am, data, ibdev, port,
-                                              resp_len);
+                                               resp_len);
                break;
        case OPA_ATTRIB_ID_PORT_STATE_INFO:
                ret = __subn_set_opa_psi(smp, am, data, ibdev, port,
                                              resp_len);
                break;
        case IB_SMP_ATTR_SM_INFO:
-               if (ibp->port_cap_flags & IB_PORT_SM_DISABLED)
+               if (ibp->rvp.port_cap_flags & IB_PORT_SM_DISABLED)
                        return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
-               if (ibp->port_cap_flags & IB_PORT_SM)
+               if (ibp->rvp.port_cap_flags & IB_PORT_SM)
                        return IB_MAD_RESULT_SUCCESS;
                /* FALLTHROUGH */
        default:
@@@ -3654,14 -3833,13 +3833,13 @@@ static int subn_get_opa_aggregate(struc
                /* zero the payload for this segment */
                memset(next_smp + sizeof(*agg), 0, agg_data_len);
  
-               (void) subn_get_opa_sma(agg->attr_id, smp, am, agg->data,
+               (void)subn_get_opa_sma(agg->attr_id, smp, am, agg->data,
                                        ibdev, port, NULL);
                if (smp->status & ~IB_SMP_DIRECTION) {
                        set_aggr_error(agg);
                        return reply((struct ib_mad_hdr *)smp);
                }
                next_smp += agg_size;
        }
  
        return reply((struct ib_mad_hdr *)smp);
@@@ -3698,14 -3876,13 +3876,13 @@@ static int subn_set_opa_aggregate(struc
                        return reply((struct ib_mad_hdr *)smp);
                }
  
-               (void) subn_set_opa_sma(agg->attr_id, smp, am, agg->data,
+               (void)subn_set_opa_sma(agg->attr_id, smp, am, agg->data,
                                        ibdev, port, NULL);
                if (smp->status & ~IB_SMP_DIRECTION) {
                        set_aggr_error(agg);
                        return reply((struct ib_mad_hdr *)smp);
                }
                next_smp += agg_size;
        }
  
        return reply((struct ib_mad_hdr *)smp);
@@@ -3823,7 -4000,7 +4000,7 @@@ static int process_subn_opa(struct ib_d
        if (smp->class_version != OPA_SMI_CLASS_VERSION) {
                smp->status |= IB_SMP_UNSUP_VERSION;
                ret = reply((struct ib_mad_hdr *)smp);
-               goto bail;
+               return ret;
        }
        ret = check_mkey(ibp, (struct ib_mad_hdr *)smp, mad_flags, smp->mkey,
                         smp->route.dr.dr_slid, smp->route.dr.return_path,
                     smp->method == IB_MGMT_METHOD_SET) &&
                    port_num && port_num <= ibdev->phys_port_cnt &&
                    port != port_num)
-                       (void) check_mkey(to_iport(ibdev, port_num),
+                       (void)check_mkey(to_iport(ibdev, port_num),
                                          (struct ib_mad_hdr *)smp, 0,
                                          smp->mkey, smp->route.dr.dr_slid,
                                          smp->route.dr.return_path,
                                          smp->hop_cnt);
                ret = IB_MAD_RESULT_FAILURE;
-               goto bail;
+               return ret;
        }
  
        *resp_len = opa_get_smp_header_size(smp);
                        clear_opa_smp_data(smp);
                        ret = subn_get_opa_sma(attr_id, smp, am, data,
                                               ibdev, port, resp_len);
-                       goto bail;
+                       break;
                case OPA_ATTRIB_ID_AGGREGATE:
                        ret = subn_get_opa_aggregate(smp, ibdev, port,
                                                     resp_len);
-                       goto bail;
+                       break;
                }
+               break;
        case IB_MGMT_METHOD_SET:
                switch (attr_id) {
                default:
                        ret = subn_set_opa_sma(attr_id, smp, am, data,
                                               ibdev, port, resp_len);
-                       goto bail;
+                       break;
                case OPA_ATTRIB_ID_AGGREGATE:
                        ret = subn_set_opa_aggregate(smp, ibdev, port,
                                                     resp_len);
-                       goto bail;
+                       break;
                }
+               break;
        case IB_MGMT_METHOD_TRAP:
        case IB_MGMT_METHOD_REPORT:
        case IB_MGMT_METHOD_REPORT_RESP:
                 * Just tell the caller to process it normally.
                 */
                ret = IB_MAD_RESULT_SUCCESS;
-               goto bail;
+               break;
        default:
                smp->status |= IB_SMP_UNSUP_METHOD;
                ret = reply((struct ib_mad_hdr *)smp);
+               break;
        }
  
- bail:
        return ret;
  }
  
@@@ -3910,7 -4089,7 +4089,7 @@@ static int process_subn(struct ib_devic
        if (smp->class_version != 1) {
                smp->status |= IB_SMP_UNSUP_VERSION;
                ret = reply((struct ib_mad_hdr *)smp);
-               goto bail;
+               return ret;
        }
  
        ret = check_mkey(ibp, (struct ib_mad_hdr *)smp, mad_flags,
                     smp->method == IB_MGMT_METHOD_SET) &&
                    port_num && port_num <= ibdev->phys_port_cnt &&
                    port != port_num)
-                       (void) check_mkey(to_iport(ibdev, port_num),
-                                         (struct ib_mad_hdr *)smp, 0,
-                                         smp->mkey,
-                                         (__force __be32)smp->dr_slid,
-                                         smp->return_path, smp->hop_cnt);
+                       (void)check_mkey(to_iport(ibdev, port_num),
+                                        (struct ib_mad_hdr *)smp, 0,
+                                        smp->mkey,
+                                        (__force __be32)smp->dr_slid,
+                                        smp->return_path, smp->hop_cnt);
                ret = IB_MAD_RESULT_FAILURE;
-               goto bail;
+               return ret;
        }
  
        switch (smp->method) {
                switch (smp->attr_id) {
                case IB_SMP_ATTR_NODE_INFO:
                        ret = subn_get_nodeinfo(smp, ibdev, port);
-                       goto bail;
+                       break;
                default:
                        smp->status |= IB_SMP_UNSUP_METH_ATTR;
                        ret = reply((struct ib_mad_hdr *)smp);
-                       goto bail;
+                       break;
                }
+               break;
+       }
+       return ret;
+ }
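+ 
+ /*
+  * Handle MADs of the performance management class (IB PMA): Get of
+  * PortCounters, PortCountersExt and ClassPortInfo, plus the trap and
+  * response pass-through cases.
+  */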
+ static int process_perf(struct ib_device *ibdev, u8 port,
+                       const struct ib_mad *in_mad,
+                       struct ib_mad *out_mad)
+ {
+       struct ib_pma_mad *pmp = (struct ib_pma_mad *)out_mad;
+       struct ib_class_port_info *cpi = (struct ib_class_port_info *)
+                                               &pmp->data;
+       int ret = IB_MAD_RESULT_FAILURE;
+ 
+       *out_mad = *in_mad;
+       if (pmp->mad_hdr.class_version != 1) {
+               pmp->mad_hdr.status |= IB_SMP_UNSUP_VERSION;
+               ret = reply((struct ib_mad_hdr *)pmp);
+               return ret;
+       }
+       switch (pmp->mad_hdr.method) {
+       case IB_MGMT_METHOD_GET:
+               switch (pmp->mad_hdr.attr_id) {
+               case IB_PMA_PORT_COUNTERS:
+                       ret = pma_get_ib_portcounters(pmp, ibdev, port);
+                       break;
+               case IB_PMA_PORT_COUNTERS_EXT:
+                       ret = pma_get_ib_portcounters_ext(pmp, ibdev, port);
+                       break;
+               case IB_PMA_CLASS_PORT_INFO:
+                       cpi->capability_mask = IB_PMA_CLASS_CAP_EXT_WIDTH;
+                       ret = reply((struct ib_mad_hdr *)pmp);
+                       break;
+               default:
+                       pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
+                       ret = reply((struct ib_mad_hdr *)pmp);
+                       break;
+               }
+               break;
+       case IB_MGMT_METHOD_SET:
+               if (pmp->mad_hdr.attr_id) {
+                       pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
+                       ret = reply((struct ib_mad_hdr *)pmp);
+               }
+               break;
+       case IB_MGMT_METHOD_TRAP:
+       case IB_MGMT_METHOD_GET_RESP:
+               /*
+                * The ib_mad module will call us to process responses
+                * before checking for other consumers.
+                * Just tell the caller to process it normally.
+                */
+               ret = IB_MAD_RESULT_SUCCESS;
+               break;
+       default:
+               pmp->mad_hdr.status |= IB_SMP_UNSUP_METHOD;
+               ret = reply((struct ib_mad_hdr *)pmp);
+               break;
        }
  
- bail:
        return ret;
  }
  
@@@ -3978,44 -4219,46 +4219,46 @@@ static int process_perf_opa(struct ib_d
                switch (pmp->mad_hdr.attr_id) {
                case IB_PMA_CLASS_PORT_INFO:
                        ret = pma_get_opa_classportinfo(pmp, ibdev, resp_len);
-                       goto bail;
+                       break;
                case OPA_PM_ATTRIB_ID_PORT_STATUS:
                        ret = pma_get_opa_portstatus(pmp, ibdev, port,
-                                                               resp_len);
-                       goto bail;
+                                                    resp_len);
+                       break;
                case OPA_PM_ATTRIB_ID_DATA_PORT_COUNTERS:
                        ret = pma_get_opa_datacounters(pmp, ibdev, port,
-                                                               resp_len);
-                       goto bail;
+                                                      resp_len);
+                       break;
                case OPA_PM_ATTRIB_ID_ERROR_PORT_COUNTERS:
                        ret = pma_get_opa_porterrors(pmp, ibdev, port,
-                                                               resp_len);
-                       goto bail;
+                                                    resp_len);
+                       break;
                case OPA_PM_ATTRIB_ID_ERROR_INFO:
                        ret = pma_get_opa_errorinfo(pmp, ibdev, port,
-                                                               resp_len);
-                       goto bail;
+                                                   resp_len);
+                       break;
                default:
                        pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
                        ret = reply((struct ib_mad_hdr *)pmp);
-                       goto bail;
+                       break;
                }
+               break;
  
        case IB_MGMT_METHOD_SET:
                switch (pmp->mad_hdr.attr_id) {
                case OPA_PM_ATTRIB_ID_CLEAR_PORT_STATUS:
                        ret = pma_set_opa_portstatus(pmp, ibdev, port,
-                                                               resp_len);
-                       goto bail;
+                                                    resp_len);
+                       break;
                case OPA_PM_ATTRIB_ID_ERROR_INFO:
                        ret = pma_set_opa_errorinfo(pmp, ibdev, port,
-                                                               resp_len);
-                       goto bail;
+                                                   resp_len);
+                       break;
                default:
                        pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
                        ret = reply((struct ib_mad_hdr *)pmp);
-                       goto bail;
+                       break;
                }
+               break;
  
        case IB_MGMT_METHOD_TRAP:
        case IB_MGMT_METHOD_GET_RESP:
                 * Just tell the caller to process it normally.
                 */
                ret = IB_MAD_RESULT_SUCCESS;
-               goto bail;
+               break;
  
        default:
                pmp->mad_hdr.status |= IB_SMP_UNSUP_METHOD;
                ret = reply((struct ib_mad_hdr *)pmp);
+               break;
        }
  
- bail:
        return ret;
  }
  
@@@ -4097,12 -4340,15 +4340,15 @@@ static int hfi1_process_ib_mad(struct i
        case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
        case IB_MGMT_CLASS_SUBN_LID_ROUTED:
                ret = process_subn(ibdev, mad_flags, port, in_mad, out_mad);
-               goto bail;
+               break;
+       case IB_MGMT_CLASS_PERF_MGMT:
+               ret = process_perf(ibdev, port, in_mad, out_mad);
+               break;
        default:
                ret = IB_MAD_RESULT_SUCCESS;
+               break;
        }
  
- bail:
        return ret;
  }
  
@@@ -4154,66 -4400,3 +4400,3 @@@ int hfi1_process_mad(struct ib_device *
  
        return IB_MAD_RESULT_FAILURE;
  }
- static void send_handler(struct ib_mad_agent *agent,
-                        struct ib_mad_send_wc *mad_send_wc)
- {
-       ib_free_send_mad(mad_send_wc->send_buf);
- }
- int hfi1_create_agents(struct hfi1_ibdev *dev)
- {
-       struct hfi1_devdata *dd = dd_from_dev(dev);
-       struct ib_mad_agent *agent;
-       struct hfi1_ibport *ibp;
-       int p;
-       int ret;
-       for (p = 0; p < dd->num_pports; p++) {
-               ibp = &dd->pport[p].ibport_data;
-               agent = ib_register_mad_agent(&dev->ibdev, p + 1, IB_QPT_SMI,
-                                             NULL, 0, send_handler,
-                                             NULL, NULL, 0);
-               if (IS_ERR(agent)) {
-                       ret = PTR_ERR(agent);
-                       goto err;
-               }
-               ibp->send_agent = agent;
-       }
-       return 0;
- err:
-       for (p = 0; p < dd->num_pports; p++) {
-               ibp = &dd->pport[p].ibport_data;
-               if (ibp->send_agent) {
-                       agent = ibp->send_agent;
-                       ibp->send_agent = NULL;
-                       ib_unregister_mad_agent(agent);
-               }
-       }
-       return ret;
- }
- void hfi1_free_agents(struct hfi1_ibdev *dev)
- {
-       struct hfi1_devdata *dd = dd_from_dev(dev);
-       struct ib_mad_agent *agent;
-       struct hfi1_ibport *ibp;
-       int p;
-       for (p = 0; p < dd->num_pports; p++) {
-               ibp = &dd->pport[p].ibport_data;
-               if (ibp->send_agent) {
-                       agent = ibp->send_agent;
-                       ibp->send_agent = NULL;
-                       ib_unregister_mad_agent(agent);
-               }
-               if (ibp->sm_ah) {
-                       ib_destroy_ah(&ibp->sm_ah->ibah);
-                       ibp->sm_ah = NULL;
-               }
-       }
- }
index 47ca6314e3284dd6d67ef5d61013894fd22756ff,42a409f1644955a449dd58f08630aafcb02583e6..0bac21e6a658ca242b856910e06be7a91cbf7284
@@@ -1,12 -1,11 +1,11 @@@
  /*
+  * Copyright(c) 2015, 2016 Intel Corporation.
   *
   * This file is provided under a dual BSD/GPLv2 license.  When using or
   * redistributing this file, you may do so under either license.
   *
   * GPL LICENSE SUMMARY
   *
-  * Copyright(c) 2015 Intel Corporation.
-  *
   * This program is free software; you can redistribute it and/or modify
   * it under the terms of version 2 of the GNU General Public License as
   * published by the Free Software Foundation.
@@@ -18,8 -17,6 +17,6 @@@
   *
   * BSD LICENSE
   *
-  * Copyright(c) 2015 Intel Corporation.
-  *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
@@@ -57,6 -54,7 +54,7 @@@
  
  #include "hfi.h"
  #include "chip_registers.h"
+ #include "aspm.h"
  
  /* link speed vector for Gen3 speed - not in Linux headers */
  #define GEN1_SPEED_VECTOR 0x1
@@@ -122,8 -120,9 +120,9 @@@ int hfi1_pcie_init(struct pci_dev *pdev
                        goto bail;
                }
                ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
-       } else
+       } else {
                ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+       }
        if (ret) {
                hfi1_early_err(&pdev->dev,
                               "Unable to set DMA consistent mask: %d\n", ret);
        }
  
        pci_set_master(pdev);
-       ret = pci_enable_pcie_error_reporting(pdev);
-       if (ret) {
-               hfi1_early_err(&pdev->dev,
-                              "Unable to enable pcie error reporting: %d\n",
-                             ret);
-               ret = 0;
-       }
+       (void)pci_enable_pcie_error_reporting(pdev);
        goto done;
  
  bail:
@@@ -222,10 -215,9 +215,9 @@@ int hfi1_pcie_ddinit(struct hfi1_devdat
        pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL, &dd->pcie_devctl);
        pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL, &dd->pcie_lnkctl);
        pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL2,
-                                                       &dd->pcie_devctl2);
+                                 &dd->pcie_devctl2);
        pci_read_config_dword(dd->pcidev, PCI_CFG_MSIX0, &dd->pci_msix0);
-       pci_read_config_dword(dd->pcidev, PCIE_CFG_SPCIE1,
-                                                       &dd->pci_lnkctl3);
+       pci_read_config_dword(dd->pcidev, PCIE_CFG_SPCIE1, &dd->pci_lnkctl3);
        pci_read_config_dword(dd->pcidev, PCIE_CFG_TPH2, &dd->pci_tph2);
  
        return 0;
   */
  void hfi1_pcie_ddcleanup(struct hfi1_devdata *dd)
  {
-       u64 __iomem *base = (void __iomem *) dd->kregbase;
+       u64 __iomem *base = (void __iomem *)dd->kregbase;
  
        dd->flags &= ~HFI1_PRESENT;
        dd->kregbase = NULL;
                iounmap(dd->rcvarray_wc);
        if (dd->piobase)
                iounmap(dd->piobase);
 -
 -      pci_set_drvdata(dd->pcidev, NULL);
  }
  
  /*
@@@ -274,7 -268,7 +266,7 @@@ void hfi1_pcie_flr(struct hfi1_devdata 
  
  clear:
        pcie_capability_set_word(dd->pcidev, PCI_EXP_DEVCTL,
-                                               PCI_EXP_DEVCTL_BCR_FLR);
+                                PCI_EXP_DEVCTL_BCR_FLR);
        /* PCIe spec requires the function to be back within 100ms */
        msleep(100);
  }
@@@ -287,9 -281,11 +279,11 @@@ static void msix_setup(struct hfi1_devd
        struct msix_entry *msix_entry;
        int i;
  
-       /* We can't pass hfi1_msix_entry array to msix_setup
+       /*
+        * We can't pass hfi1_msix_entry array to msix_setup
         * so use a dummy msix_entry array and copy the allocated
-        * irq back to the hfi1_msix_entry array. */
+        * irq back to the hfi1_msix_entry array.
+        */
        msix_entry = kmalloc_array(nvec, sizeof(*msix_entry), GFP_KERNEL);
        if (!msix_entry) {
                ret = -ENOMEM;
@@@ -319,7 -315,6 +313,6 @@@ do_intx
                   nvec, ret);
        *msixcnt = 0;
        hfi1_enable_intx(dd->pcidev);
  }
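The reflowed comment in msix_setup() explains the temporary-array trick: the MSI-X enable API only understands struct msix_entry, so the driver fills a scratch array of that type and copies the allocated vectors back into its own hfi1_msix_entry array. A minimal sketch of that copy-through-a-temporary pattern, with all types and functions invented for illustration:

/* Illustration of the "dummy array" idea described in msix_setup():
 * an API fills a temporary array of its own type, and the results are
 * copied back into the caller's larger per-entry structures. */
#include <stdlib.h>

struct api_entry { int vector; };                 /* what the API understands */
struct my_entry  { int vector; void *cookie; };   /* what the driver keeps */

static int fake_enable_vectors(struct api_entry *e, int n)
{
        for (int i = 0; i < n; i++)
                e[i].vector = 100 + i;            /* pretend allocation */
        return 0;
}

static int setup_vectors(struct my_entry *mine, int n)
{
        struct api_entry *tmp = calloc(n, sizeof(*tmp));
        int ret;

        if (!tmp)
                return -1;
        ret = fake_enable_vectors(tmp, n);
        if (!ret)
                for (int i = 0; i < n; i++)
                        mine[i].vector = tmp[i].vector;  /* copy back */
        free(tmp);
        return ret;
}

int main(void)
{
        struct my_entry mine[4] = {0};
        return setup_vectors(mine, 4);
}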
  
  /* return the PCIe link speed from the given link status */
@@@ -367,6 -362,7 +360,7 @@@ static void update_lbus_info(struct hfi
  int pcie_speeds(struct hfi1_devdata *dd)
  {
        u32 linkcap;
+       struct pci_dev *parent = dd->pcidev->bus->self;
  
        if (!pci_is_pcie(dd->pcidev)) {
                dd_dev_err(dd, "Can't find PCI Express capability!\n");
        pcie_capability_read_dword(dd->pcidev, PCI_EXP_LNKCAP, &linkcap);
        if ((linkcap & PCI_EXP_LNKCAP_SLS) != GEN3_SPEED_VECTOR) {
                dd_dev_info(dd,
-                       "This HFI is not Gen3 capable, max speed 0x%x, need 0x3\n",
-                       linkcap & PCI_EXP_LNKCAP_SLS);
+                           "This HFI is not Gen3 capable, max speed 0x%x, need 0x3\n",
+                           linkcap & PCI_EXP_LNKCAP_SLS);
                dd->link_gen3_capable = 0;
        }
  
        /*
         * bus->max_bus_speed is set from the bridge's linkcap Max Link Speed
         */
-       if (dd->pcidev->bus->max_bus_speed != PCIE_SPEED_8_0GT) {
+       if (parent && dd->pcidev->bus->max_bus_speed != PCIE_SPEED_8_0GT) {
                dd_dev_info(dd, "Parent PCIe bridge does not support Gen3\n");
                dd->link_gen3_capable = 0;
        }
        /* obtain the link width and current speed */
        update_lbus_info(dd);
  
-       /* check against expected pcie width and complain if "wrong" */
-       if (dd->lbus_width < 16)
-               dd_dev_err(dd, "PCIe width %u (x16 HFI)\n", dd->lbus_width);
+       dd_dev_info(dd, "%s\n", dd->lbus_info);
  
        return 0;
  }
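pcie_speeds() reads Link Capabilities, compares the Supported Link Speeds field against the Gen3 vector (0x3), and now also requires a visible upstream bridge before trusting bus->max_bus_speed. A small self-contained sketch of the capability test on a raw LNKCAP value; the mask and Gen3 vector follow the encoding used above, while the sample register value is made up:

/* Sketch: test the Supported Link Speeds field of a PCIe LNKCAP value.
 * The Gen3 vector 0x3 matches the check in pcie_speeds(); the sample
 * linkcap value is fabricated for the demonstration. */
#include <stdio.h>
#include <stdint.h>

#define LNKCAP_SLS_MASK   0x0000000fu   /* Supported Link Speeds field */
#define GEN3_SPEED_VECTOR 0x3u          /* 8.0 GT/s */

static int gen3_capable(uint32_t linkcap)
{
        return (linkcap & LNKCAP_SLS_MASK) == GEN3_SPEED_VECTOR;
}

int main(void)
{
        uint32_t sample = 0x00477103u;  /* fabricated register value */

        printf("Gen3 capable: %s\n", gen3_capable(sample) ? "yes" : "no");
        return 0;
}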
@@@ -436,23 -430,18 +428,18 @@@ void hfi1_enable_intx(struct pci_dev *p
  void restore_pci_variables(struct hfi1_devdata *dd)
  {
        pci_write_config_word(dd->pcidev, PCI_COMMAND, dd->pci_command);
-       pci_write_config_dword(dd->pcidev,
-                               PCI_BASE_ADDRESS_0, dd->pcibar0);
-       pci_write_config_dword(dd->pcidev,
-                               PCI_BASE_ADDRESS_1, dd->pcibar1);
-       pci_write_config_dword(dd->pcidev,
-                               PCI_ROM_ADDRESS, dd->pci_rom);
+       pci_write_config_dword(dd->pcidev, PCI_BASE_ADDRESS_0, dd->pcibar0);
+       pci_write_config_dword(dd->pcidev, PCI_BASE_ADDRESS_1, dd->pcibar1);
+       pci_write_config_dword(dd->pcidev, PCI_ROM_ADDRESS, dd->pci_rom);
        pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL, dd->pcie_devctl);
        pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL, dd->pcie_lnkctl);
        pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL2,
-                                                       dd->pcie_devctl2);
+                                  dd->pcie_devctl2);
        pci_write_config_dword(dd->pcidev, PCI_CFG_MSIX0, dd->pci_msix0);
-       pci_write_config_dword(dd->pcidev, PCIE_CFG_SPCIE1,
-                                                       dd->pci_lnkctl3);
+       pci_write_config_dword(dd->pcidev, PCIE_CFG_SPCIE1, dd->pci_lnkctl3);
        pci_write_config_dword(dd->pcidev, PCIE_CFG_TPH2, dd->pci_tph2);
  }
  
  /*
   * BIOS may not set PCIe bus-utilization parameters for best performance.
   * Check and optionally adjust them to maximize our throughput.
@@@ -461,6 -450,10 +448,10 @@@ static int hfi1_pcie_caps
  module_param_named(pcie_caps, hfi1_pcie_caps, int, S_IRUGO);
  MODULE_PARM_DESC(pcie_caps, "Max PCIe tuning: Payload (0..3), ReadReq (4..7)");
  
+ uint aspm_mode = ASPM_MODE_DISABLED;
+ module_param_named(aspm, aspm_mode, uint, S_IRUGO);
+ MODULE_PARM_DESC(aspm, "PCIe ASPM: 0: disable, 1: enable, 2: dynamic");
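The new aspm module parameter selects between disabled, enabled and dynamic link power management, with ASPM_MODE_DISABLED (from the newly included aspm.h) as the default. A tiny sketch of mapping such a 0/1/2 parameter onto a policy; the enum names here are assumptions for illustration, not the definitions in aspm.h:

/* Sketch: map a 0/1/2 module-parameter value to an ASPM policy, mirroring
 * the parameter description above. Enum names are assumed, not copied. */
#include <stdio.h>

enum aspm_policy { ASPM_DISABLED = 0, ASPM_ENABLED = 1, ASPM_DYNAMIC = 2 };

static const char *aspm_policy_name(unsigned int param)
{
        switch (param) {
        case ASPM_ENABLED:  return "enabled";
        case ASPM_DYNAMIC:  return "dynamic";
        case ASPM_DISABLED:
        default:            return "disabled";
        }
}

int main(void)
{
        for (unsigned int m = 0; m < 3; m++)
                printf("aspm=%u -> %s\n", m, aspm_policy_name(m));
        return 0;
}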
  static void tune_pcie_caps(struct hfi1_devdata *dd)
  {
        struct pci_dev *parent;
        }
        /* Find out supported and configured values for parent (root) */
        parent = dd->pcidev->bus->self;
+       /*
+        * The driver cannot perform the tuning if it does not have
+        * access to the upstream component.
+        */
+       if (!parent)
+               return;
        if (!pci_is_root_bus(parent->bus)) {
                dd_dev_info(dd, "Parent not root\n");
                return;
                pcie_set_readrq(dd->pcidev, ep_mrrs);
        }
  }
  /* End of PCIe capability tuning */
  
  /*
@@@ -746,21 -746,22 +744,22 @@@ static int load_eq_table(struct hfi1_de
                c0 = fs - (eq[i][PREC] / div) - (eq[i][POST] / div);
                c_plus1 = eq[i][POST] / div;
                pci_write_config_dword(pdev, PCIE_CFG_REG_PL102,
-                       eq_value(c_minus1, c0, c_plus1));
+                                      eq_value(c_minus1, c0, c_plus1));
                /* check if these coefficients violate EQ rules */
                pci_read_config_dword(dd->pcidev, PCIE_CFG_REG_PL105,
-                                                               &violation);
+                                     &violation);
                if (violation
                    & PCIE_CFG_REG_PL105_GEN3_EQ_VIOLATE_COEF_RULES_SMASK){
                        if (hit_error == 0) {
                                dd_dev_err(dd,
-                                       "Gen3 EQ Table Coefficient rule violations\n");
+                                          "Gen3 EQ Table Coefficient rule violations\n");
                                dd_dev_err(dd, "         prec   attn   post\n");
                        }
                        dd_dev_err(dd, "   p%02d:   %02x     %02x     %02x\n",
-                               i, (u32)eq[i][0], (u32)eq[i][1], (u32)eq[i][2]);
+                                  i, (u32)eq[i][0], (u32)eq[i][1],
+                                  (u32)eq[i][2]);
                        dd_dev_err(dd, "            %02x     %02x     %02x\n",
-                               (u32)c_minus1, (u32)c0, (u32)c_plus1);
+                                  (u32)c_minus1, (u32)c0, (u32)c_plus1);
                        hit_error = 1;
                }
        }
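load_eq_table() turns each equalization table row into the three transmit coefficients written to PCIE_CFG_REG_PL102: c-1 = prec/div (computed just above the visible hunk), c0 = fs - prec/div - post/div, c+1 = post/div, and then reads back a rule-violation flag. A worked example of that arithmetic with made-up fs/div values and table row:

/* Worked example of the Gen3 EQ coefficient arithmetic in load_eq_table().
 * fs, div and the sample row are invented inputs, not table values from pcie.c. */
#include <stdio.h>

int main(void)
{
        unsigned int fs = 24, div = 1;          /* assumed full swing / divisor */
        unsigned int prec = 4, post = 8;        /* one illustrative table row */

        unsigned int c_minus1 = prec / div;
        unsigned int c0       = fs - (prec / div) - (post / div);
        unsigned int c_plus1  = post / div;

        /* PCIe Gen3 EQ convention: the three coefficients sum to full swing */
        printf("c-1=%u c0=%u c+1=%u sum=%u fs=%u\n",
               c_minus1, c0, c_plus1, c_minus1 + c0 + c_plus1, fs);
        return 0;
}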
  /*
   * Steps to be done after the PCIe firmware is downloaded and
   * before the SBR for the Pcie Gen3.
-  * The hardware mutex is already being held.
+  * The SBus resource is already being held.
   */
  static void pcie_post_steps(struct hfi1_devdata *dd)
  {
@@@ -815,8 -816,8 +814,8 @@@ static int trigger_sbr(struct hfi1_devd
        list_for_each_entry(pdev, &dev->bus->devices, bus_list)
                if (pdev != dev) {
                        dd_dev_err(dd,
-                               "%s: another device is on the same bus\n",
-                               __func__);
+                                  "%s: another device is on the same bus\n",
+                                  __func__);
                        return -ENOTTY;
                }
  
@@@ -840,8 -841,8 +839,8 @@@ static void write_gasket_interrupt(stru
                                   u16 code, u16 data)
  {
        write_csr(dd, ASIC_PCIE_SD_INTRPT_LIST + (index * 8),
-           (((u64)code << ASIC_PCIE_SD_INTRPT_LIST_INTRPT_CODE_SHIFT)
-           |((u64)data << ASIC_PCIE_SD_INTRPT_LIST_INTRPT_DATA_SHIFT)));
+                 (((u64)code << ASIC_PCIE_SD_INTRPT_LIST_INTRPT_CODE_SHIFT) |
+                  ((u64)data << ASIC_PCIE_SD_INTRPT_LIST_INTRPT_DATA_SHIFT)));
  }
  
  /*
@@@ -851,14 -852,13 +850,13 @@@ static void arm_gasket_logic(struct hfi
  {
        u64 reg;
  
-       reg = (((u64)1 << dd->hfi1_id)
-                       << ASIC_PCIE_SD_HOST_CMD_INTRPT_CMD_SHIFT)
-               | ((u64)pcie_serdes_broadcast[dd->hfi1_id]
-                       << ASIC_PCIE_SD_HOST_CMD_SBUS_RCVR_ADDR_SHIFT
-               | ASIC_PCIE_SD_HOST_CMD_SBR_MODE_SMASK
-               | ((u64)SBR_DELAY_US & ASIC_PCIE_SD_HOST_CMD_TIMER_MASK)
-                       << ASIC_PCIE_SD_HOST_CMD_TIMER_SHIFT
-               );
+       reg = (((u64)1 << dd->hfi1_id) <<
+              ASIC_PCIE_SD_HOST_CMD_INTRPT_CMD_SHIFT) |
+             ((u64)pcie_serdes_broadcast[dd->hfi1_id] <<
+              ASIC_PCIE_SD_HOST_CMD_SBUS_RCVR_ADDR_SHIFT |
+              ASIC_PCIE_SD_HOST_CMD_SBR_MODE_SMASK |
+              ((u64)SBR_DELAY_US & ASIC_PCIE_SD_HOST_CMD_TIMER_MASK) <<
+              ASIC_PCIE_SD_HOST_CMD_TIMER_SHIFT);
        write_csr(dd, ASIC_PCIE_SD_HOST_CMD, reg);
        /* read back to push the write */
        read_csr(dd, ASIC_PCIE_SD_HOST_CMD);
@@@ -946,7 -946,7 +944,7 @@@ static void write_xmt_margin(struct hfi
   */
  int do_pcie_gen3_transition(struct hfi1_devdata *dd)
  {
-       struct pci_dev *parent;
+       struct pci_dev *parent = dd->pcidev->bus->self;
        u64 fw_ctrl;
        u64 reg, therm;
        u32 reg32, fs, lf;
        int do_retry, retry_count = 0;
        uint default_pset;
        u16 target_vector, target_speed;
-       u16 lnkctl, lnkctl2, vendor;
-       u8 nsbr = 1;
+       u16 lnkctl2, vendor;
        u8 div;
        const u8 (*eq)[3];
        int return_error = 0;
        /* if already at target speed, done (unless forced) */
        if (dd->lbus_speed == target_speed) {
                dd_dev_info(dd, "%s: PCIe already at gen%d, %s\n", __func__,
-                       pcie_target,
-                       pcie_force ? "re-doing anyway" : "skipping");
+                           pcie_target,
+                           pcie_force ? "re-doing anyway" : "skipping");
                if (!pcie_force)
                        return 0;
        }
  
        /*
-        * A0 needs an additional SBR
+        * The driver cannot do the transition if it has no access to the
+        * upstream component.

         */
-       if (is_ax(dd))
-               nsbr++;
+       if (!parent) {
+               dd_dev_info(dd, "%s: No upstream, Can't do gen3 transition\n",
+                           __func__);
+               return 0;
+       }
  
        /*
         * Do the Gen3 transition.  Steps are those of the PCIe Gen3
                goto done_no_mutex;
        }
  
-       /* hold the HW mutex across the firmware download and SBR */
-       ret = acquire_hw_mutex(dd);
-       if (ret)
+       /* hold the SBus resource across the firmware download and SBR */
+       ret = acquire_chip_resource(dd, CR_SBUS, SBUS_TIMEOUT);
+       if (ret) {
+               dd_dev_err(dd, "%s: unable to acquire SBus resource\n",
+                          __func__);
                return ret;
+       }
  
        /* make sure thermal polling is not causing interrupts */
        therm = read_csr(dd, ASIC_CFG_THERM_POLL_EN);
@@@ -1030,8 -1036,11 +1034,11 @@@ retry
        /* step 4: download PCIe Gen3 SerDes firmware */
        dd_dev_info(dd, "%s: downloading firmware\n", __func__);
        ret = load_pcie_firmware(dd);
-       if (ret)
+       if (ret) {
+               /* do not proceed if the firmware cannot be downloaded */
+               return_error = 1;
                goto done;
+       }
  
        /* step 5: set up device parameter settings */
        dd_dev_info(dd, "%s: setting PCIe registers\n", __func__);
                default_pset = DEFAULT_MCP_PSET;
        }
        pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL101,
-               (fs << PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_FS_SHIFT)
-               | (lf << PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_LF_SHIFT));
+                              (fs <<
+                               PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_FS_SHIFT) |
+                              (lf <<
+                               PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_LF_SHIFT));
        ret = load_eq_table(dd, eq, fs, div);
        if (ret)
                goto done;
                pcie_pset = default_pset;
        if (pcie_pset > 10) {   /* valid range is 0-10, inclusive */
                dd_dev_err(dd, "%s: Invalid Eq Pset %u, setting to %d\n",
-                       __func__, pcie_pset, default_pset);
+                          __func__, pcie_pset, default_pset);
                pcie_pset = default_pset;
        }
        dd_dev_info(dd, "%s: using EQ Pset %u\n", __func__, pcie_pset);
        pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL106,
-               ((1 << pcie_pset)
-                       << PCIE_CFG_REG_PL106_GEN3_EQ_PSET_REQ_VEC_SHIFT)
-               | PCIE_CFG_REG_PL106_GEN3_EQ_EVAL2MS_DISABLE_SMASK
-               | PCIE_CFG_REG_PL106_GEN3_EQ_PHASE23_EXIT_MODE_SMASK);
+                              ((1 << pcie_pset) <<
+                       PCIE_CFG_REG_PL106_GEN3_EQ_PSET_REQ_VEC_SHIFT) |
+                       PCIE_CFG_REG_PL106_GEN3_EQ_EVAL2MS_DISABLE_SMASK |
+                       PCIE_CFG_REG_PL106_GEN3_EQ_PHASE23_EXIT_MODE_SMASK);
  
        /*
         * step 5b: Do post firmware download steps via SBus
         */
        write_xmt_margin(dd, __func__);
  
-       /* step 5e: disable active state power management (ASPM) */
+       /*
+        * step 5e: disable active state power management (ASPM). It
+        * will be enabled if required later.
+        */
        dd_dev_info(dd, "%s: clearing ASPM\n", __func__);
-       pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL, &lnkctl);
-       lnkctl &= ~PCI_EXP_LNKCTL_ASPMC;
-       pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL, lnkctl);
+       aspm_hw_disable_l1(dd);
  
        /*
         * step 5f: clear DirectSpeedChange
         * that it is Gen3 capable earlier.
         */
        dd_dev_info(dd, "%s: setting parent target link speed\n", __func__);
-       parent = dd->pcidev->bus->self;
        pcie_capability_read_word(parent, PCI_EXP_LNKCTL2, &lnkctl2);
        dd_dev_info(dd, "%s: ..old link control2: 0x%x\n", __func__,
-               (u32)lnkctl2);
+                   (u32)lnkctl2);
        /* only write to parent if target is not as high as ours */
        if ((lnkctl2 & LNKCTL2_TARGET_LINK_SPEED_MASK) < target_vector) {
                lnkctl2 &= ~LNKCTL2_TARGET_LINK_SPEED_MASK;
                lnkctl2 |= target_vector;
                dd_dev_info(dd, "%s: ..new link control2: 0x%x\n", __func__,
-                       (u32)lnkctl2);
+                           (u32)lnkctl2);
                pcie_capability_write_word(parent, PCI_EXP_LNKCTL2, lnkctl2);
        } else {
                dd_dev_info(dd, "%s: ..target speed is OK\n", __func__);
        dd_dev_info(dd, "%s: setting target link speed\n", __func__);
        pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL2, &lnkctl2);
        dd_dev_info(dd, "%s: ..old link control2: 0x%x\n", __func__,
-               (u32)lnkctl2);
+                   (u32)lnkctl2);
        lnkctl2 &= ~LNKCTL2_TARGET_LINK_SPEED_MASK;
        lnkctl2 |= target_vector;
        dd_dev_info(dd, "%s: ..new link control2: 0x%x\n", __func__,
-               (u32)lnkctl2);
+                   (u32)lnkctl2);
        pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL2, lnkctl2);
  
        /* step 5h: arm gasket logic */
        /* hold DC in reset across the SBR */
        write_csr(dd, CCE_DC_CTRL, CCE_DC_CTRL_DC_RESET_SMASK);
-       (void) read_csr(dd, CCE_DC_CTRL); /* DC reset hold */
+       (void)read_csr(dd, CCE_DC_CTRL); /* DC reset hold */
        /* save firmware control across the SBR */
        fw_ctrl = read_csr(dd, MISC_CFG_FW_CTRL);
  
        ret = pci_read_config_word(dd->pcidev, PCI_VENDOR_ID, &vendor);
        if (ret) {
                dd_dev_info(dd,
-                       "%s: read of VendorID failed after SBR, err %d\n",
-                       __func__, ret);
+                           "%s: read of VendorID failed after SBR, err %d\n",
+                           __func__, ret);
                return_error = 1;
                goto done;
        }
        write_csr(dd, CCE_DC_CTRL, 0);
  
        /* Set the LED off */
-       if (is_ax(dd))
-               setextled(dd, 0);
+       setextled(dd, 0);
  
        /* check for any per-lane errors */
        pci_read_config_dword(dd->pcidev, PCIE_CFG_SPCIE2, &reg32);
                        & ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_MASK;
        if ((status & (1 << dd->hfi1_id)) == 0) {
                dd_dev_err(dd,
-                       "%s: gasket status 0x%x, expecting 0x%x\n",
-                       __func__, status, 1 << dd->hfi1_id);
+                          "%s: gasket status 0x%x, expecting 0x%x\n",
+                          __func__, status, 1 << dd->hfi1_id);
                ret = -EIO;
                goto done;
        }
        /* update our link information cache */
        update_lbus_info(dd);
        dd_dev_info(dd, "%s: new speed and width: %s\n", __func__,
-               dd->lbus_info);
+                   dd->lbus_info);
  
        if (dd->lbus_speed != target_speed) { /* not target */
                /* maybe retry */
                do_retry = retry_count < pcie_retry;
                dd_dev_err(dd, "PCIe link speed did not switch to Gen%d%s\n",
-                       pcie_target, do_retry ? ", retrying" : "");
+                          pcie_target, do_retry ? ", retrying" : "");
                retry_count++;
                if (do_retry) {
                        msleep(100); /* allow time to settle */
@@@ -1317,7 -1327,7 +1325,7 @@@ done
                dd_dev_info(dd, "%s: Re-enable therm polling\n",
                            __func__);
        }
-       release_hw_mutex(dd);
+       release_chip_resource(dd, CR_SBUS);
  done_no_mutex:
        /* return no error if it is OK to be at current speed */
        if (ret && !return_error) {
index 64bef6c266539f75349300812f84ace27cff492e,228e9fb76e08045d85ee15ab23ea2089e2fc6b69..8c25e1b58849a17a22a96d6ca8c4f54de4992098
@@@ -1,12 -1,11 +1,11 @@@
  /*
+  * Copyright(c) 2015, 2016 Intel Corporation.
   *
   * This file is provided under a dual BSD/GPLv2 license.  When using or
   * redistributing this file, you may do so under either license.
   *
   * GPL LICENSE SUMMARY
   *
-  * Copyright(c) 2015 Intel Corporation.
-  *
   * This program is free software; you can redistribute it and/or modify
   * it under the terms of version 2 of the GNU General Public License as
   * published by the Free Software Foundation.
@@@ -18,8 -17,6 +17,6 @@@
   *
   * BSD LICENSE
   *
-  * Copyright(c) 2015 Intel Corporation.
-  *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
@@@ -52,9 -49,9 +49,9 @@@
  
  /* additive distance between non-SOP and SOP space */
  #define SOP_DISTANCE (TXE_PIO_SIZE / 2)
- #define PIO_BLOCK_MASK (PIO_BLOCK_SIZE-1)
+ #define PIO_BLOCK_MASK (PIO_BLOCK_SIZE - 1)
  /* number of QUADWORDs in a block */
- #define PIO_BLOCK_QWS (PIO_BLOCK_SIZE/sizeof(u64))
+ #define PIO_BLOCK_QWS (PIO_BLOCK_SIZE / sizeof(u64))
  
  /**
   * pio_copy - copy data block to MMIO space
@@@ -83,11 -80,13 +80,13 @@@ void pio_copy(struct hfi1_devdata *dd, 
        dest += sizeof(u64);
  
        /* calculate where the QWORD data ends - in SOP=1 space */
-       dend = dest + ((count>>1) * sizeof(u64));
+       dend = dest + ((count >> 1) * sizeof(u64));
  
        if (dend < send) {
-               /* all QWORD data is within the SOP block, does *not*
-                  reach the end of the SOP block */
+               /*
+                * all QWORD data is within the SOP block, does *not*
+                * reach the end of the SOP block
+                */
  
                while (dest < dend) {
                        writeq(*(u64 *)from, dest);
                writeq(val.val64, dest);
                dest += sizeof(u64);
        }
-       /* fill in rest of block, no need to check pbuf->end
-          as we only wrap on a block boundary */
+       /*
+        * fill in rest of block, no need to check pbuf->end
+        * as we only wrap on a block boundary
+        */
        while (((unsigned long)dest & PIO_BLOCK_MASK) != 0) {
                writeq(0, dest);
                dest += sizeof(u64);
   * "zero" shift - bit shift used to zero out upper bytes.  Input is
   * the count of LSB bytes to preserve.
   */
- #define zshift(x) (8 * (8-(x)))
+ #define zshift(x) (8 * (8 - (x)))
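zshift(x) gives the shift amount, in bits, that clears everything above the low x bytes of a 64-bit word; read_extra_bytes() pairs it with a merge shift when folding partial bytes into the carry register. A short worked example of the masking; the mshift(x) = 8 * (x) definition is assumed here since it sits outside the visible hunk:

/* Demonstrates zshift()-style byte masking on a 64-bit value:
 * shift left then right by zshift(n) to keep only the low n bytes.
 * mshift(x) = 8 * (x) is an assumption; only zshift appears in the hunk. */
#include <stdio.h>
#include <stdint.h>

#define zshift(x) (8 * (8 - (x)))
#define mshift(x) (8 * (x))             /* assumed complementary shift */

int main(void)
{
        uint64_t v = 0x1122334455667788ULL;
        unsigned int keep = 3;          /* preserve 3 LSB bytes */

        uint64_t low  = (v << zshift(keep)) >> zshift(keep);
        uint64_t rest = v >> mshift(keep);      /* bytes above the kept ones */

        printf("low  = 0x%016llx\n", (unsigned long long)low);   /* ...667788 */
        printf("rest = 0x%016llx\n", (unsigned long long)rest);  /* ...1122334455 */
        return 0;
}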
  
  /*
   * "merge" shift - bit shift used to merge with carry bytes.  Input is
   * o nbytes must not span a QW boundary
   */
  static inline void read_low_bytes(struct pio_buf *pbuf, const void *from,
-                                                       unsigned int nbytes)
+                                 unsigned int nbytes)
  {
        unsigned long off;
  
   * o nbytes may span a QW boundary
   */
  static inline void read_extra_bytes(struct pio_buf *pbuf,
-                                       const void *from, unsigned int nbytes)
+                                   const void *from, unsigned int nbytes)
  {
        unsigned long off = (unsigned long)from & 0x7;
        unsigned int room, xbytes;
        while (nbytes) {
                /* find the number of bytes in this u64 */
                room = 8 - off; /* this u64 has room for this many bytes */
 -              xbytes = nbytes > room ? room : nbytes;
 +              xbytes = min(room, nbytes);
  
                /*
                 * shift down to zero lower bytes, shift up to zero upper
                pbuf->carry.val64 |= (((*(u64 *)from)
                                        >> mshift(off))
                                        << zshift(xbytes))
-                                       >> zshift(xbytes+pbuf->carry_bytes);
+                                       >> zshift(xbytes + pbuf->carry_bytes);
                off = 0;
                pbuf->carry_bytes += xbytes;
                nbytes -= xbytes;
@@@ -362,7 -363,7 +363,7 @@@ static inline void jcopy(u8 *dest, cons
   * o from may _not_ be u64 aligned.
   */
  static inline void read_low_bytes(struct pio_buf *pbuf, const void *from,
-                                                       unsigned int nbytes)
+                                 unsigned int nbytes)
  {
        jcopy(&pbuf->carry.val8[0], from, nbytes);
        pbuf->carry_bytes = nbytes;
   * o nbytes may span a QW boundary
   */
  static inline void read_extra_bytes(struct pio_buf *pbuf,
-                                       const void *from, unsigned int nbytes)
+                                   const void *from, unsigned int nbytes)
  {
        jcopy(&pbuf->carry.val8[pbuf->carry_bytes], from, nbytes);
        pbuf->carry_bytes += nbytes;
@@@ -411,7 -412,7 +412,7 @@@ static inline void merge_write8
  
        jcopy(&pbuf->carry.val8[pbuf->carry_bytes], src, remainder);
        writeq(pbuf->carry.val64, dest);
-       jcopy(&pbuf->carry.val8[0], src+remainder, pbuf->carry_bytes);
+       jcopy(&pbuf->carry.val8[0], src + remainder, pbuf->carry_bytes);
  }
  
  /*
@@@ -433,7 -434,7 +434,7 @@@ static inline int carry_write8(struct p
                u64 zero = 0;
  
                jcopy(&pbuf->carry.val8[pbuf->carry_bytes], (u8 *)&zero,
-                                               8 - pbuf->carry_bytes);
+                     8 - pbuf->carry_bytes);
                writeq(pbuf->carry.val64, dest);
                return 1;
        }
   * @nbytes: bytes to copy
   */
  void seg_pio_copy_start(struct pio_buf *pbuf, u64 pbc,
-                               const void *from, size_t nbytes)
+                       const void *from, size_t nbytes)
  {
        void __iomem *dest = pbuf->start + SOP_DISTANCE;
        void __iomem *send = dest + PIO_BLOCK_SIZE;
        dest += sizeof(u64);
  
        /* calculate where the QWORD data ends - in SOP=1 space */
-       dend = dest + ((nbytes>>3) * sizeof(u64));
+       dend = dest + ((nbytes >> 3) * sizeof(u64));
  
        if (dend < send) {
-               /* all QWORD data is within the SOP block, does *not*
-                  reach the end of the SOP block */
+               /*
+                * all QWORD data is within the SOP block, does *not*
+                * reach the end of the SOP block
+                */
  
                while (dest < dend) {
                        writeq(*(u64 *)from, dest);
@@@ -562,10 -565,12 +565,12 @@@ static void mid_copy_mix(struct pio_bu
                void __iomem *send;             /* SOP end */
                void __iomem *xend;
  
-               /* calculate the end of data or end of block, whichever
-                  comes first */
+               /*
+                * calculate the end of data or end of block, whichever
+                * comes first
+                */
                send = pbuf->start + PIO_BLOCK_SIZE;
 -              xend = send < dend ? send : dend;
 +              xend = min(send, dend);
  
                /* shift up to SOP=1 space */
                dest += SOP_DISTANCE;
   * Must handle nbytes < 8.
   */
  static void mid_copy_straight(struct pio_buf *pbuf,
-                                               const void *from, size_t nbytes)
+                             const void *from, size_t nbytes)
  {
        void __iomem *dest = pbuf->start + (pbuf->qw_written * sizeof(u64));
        void __iomem *dend;                     /* 8-byte data end */
  
        /* calculate 8-byte data end */
-       dend = dest + ((nbytes>>3) * sizeof(u64));
+       dend = dest + ((nbytes >> 3) * sizeof(u64));
  
        if (pbuf->qw_written < PIO_BLOCK_QWS) {
                /*
                void __iomem *send;             /* SOP end */
                void __iomem *xend;
  
-               /* calculate the end of data or end of block, whichever
-                  comes first */
+               /*
+                * calculate the end of data or end of block, whichever
+                * comes first
+                */
                send = pbuf->start + PIO_BLOCK_SIZE;
 -              xend = send < dend ? send : dend;
 +              xend = min(send, dend);
  
                /* shift up to SOP=1 space */
                dest += SOP_DISTANCE;
        /* we know carry_bytes was zero on entry to this routine */
        read_low_bytes(pbuf, from, nbytes & 0x7);
  
-       pbuf->qw_written += nbytes>>3;
+       pbuf->qw_written += nbytes >> 3;
  }
  
  /*
index 9d4f5d6aaf33ebf1c76544311f7d1095336d1dc0,46e254d52dadd514e9fa7abcc433f4f6cbfdc313..ab6b6a42000f709020a001a2aa9594d0f2f5b851
@@@ -1,12 -1,11 +1,11 @@@
  /*
+  * Copyright(c) 2015, 2016 Intel Corporation.
   *
   * This file is provided under a dual BSD/GPLv2 license.  When using or
   * redistributing this file, you may do so under either license.
   *
   * GPL LICENSE SUMMARY
   *
-  * Copyright(c) 2015 Intel Corporation.
-  *
   * This program is free software; you can redistribute it and/or modify
   * it under the terms of version 2 of the GNU General Public License as
   * published by the Free Software Foundation.
@@@ -18,8 -17,6 +17,6 @@@
   *
   * BSD LICENSE
   *
-  * Copyright(c) 2015 Intel Corporation.
-  *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
  #include "hfi.h"
  #include "sdma.h"
  #include "user_sdma.h"
 -#include "sdma.h"
  #include "verbs.h"  /* for the headers */
  #include "common.h" /* for struct hfi1_tid_info */
  #include "trace.h"
+ #include "mmu_rb.h"
  
  static uint hfi1_sdma_comp_ring_size = 128;
  module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO);
@@@ -146,7 -145,6 +144,6 @@@ MODULE_PARM_DESC(sdma_comp_size, "Size 
  
  /* Last packet in the request */
  #define TXREQ_FLAGS_REQ_LAST_PKT BIT(0)
- #define TXREQ_FLAGS_IOVEC_LAST_PKT BIT(0)
  
  #define SDMA_REQ_IN_USE     0
  #define SDMA_REQ_FOR_THREAD 1
@@@ -170,16 -168,28 +167,28 @@@ static unsigned initial_pkt_count = 8
  #define SDMA_IOWAIT_TIMEOUT 1000 /* in milliseconds */
  
  struct user_sdma_iovec {
+       struct list_head list;
        struct iovec iov;
        /* number of pages in this vector */
        unsigned npages;
        /* array of pinned pages for this vector */
        struct page **pages;
-       /* offset into the virtual address space of the vector at
-        * which we last left off. */
+       /*
+        * offset into the virtual address space of the vector at
+        * which we last left off.
+        */
        u64 offset;
  };
  
+ struct sdma_mmu_node {
+       struct mmu_rb_node rb;
+       struct list_head list;
+       struct hfi1_user_sdma_pkt_q *pq;
+       atomic_t refcount;
+       struct page **pages;
+       unsigned npages;
+ };
  struct user_sdma_request {
        struct sdma_req_info info;
        struct hfi1_user_sdma_pkt_q *pq;
         * to 0.
         */
        u8 omfactor;
-       /*
-        * pointer to the user's mm_struct. We are going to
-        * get a reference to it so it doesn't get freed
-        * since we might not be in process context when we
-        * are processing the iov's.
-        * Using this mm_struct, we can get vma based on the
-        * iov's address (find_vma()).
-        */
-       struct mm_struct *user_mm;
        /*
         * We copy the iovs for this request (based on
         * info.iovcnt). These are only the data vectors
        u16 tididx;
        u32 sent;
        u64 seqnum;
+       u64 seqcomp;
+       u64 seqsubmitted;
        struct list_head txps;
-       spinlock_t txcmp_lock;  /* protect txcmp list */
-       struct list_head txcmp;
        unsigned long flags;
        /* status of the last txreq completed */
        int status;
-       struct work_struct worker;
  };
  
  /*
@@@ -259,11 -259,6 +258,6 @@@ struct user_sdma_txreq 
        struct sdma_txreq txreq;
        struct list_head list;
        struct user_sdma_request *req;
-       struct {
-               struct user_sdma_iovec *vec;
-               u8 flags;
-       } iovecs[3];
-       int idx;
        u16 flags;
        unsigned busycount;
        u64 seqnum;
  
  static int user_sdma_send_pkts(struct user_sdma_request *, unsigned);
  static int num_user_pages(const struct iovec *);
- static void user_sdma_txreq_cb(struct sdma_txreq *, int, int);
- static void user_sdma_delayed_completion(struct work_struct *);
- static void user_sdma_free_request(struct user_sdma_request *);
+ static void user_sdma_txreq_cb(struct sdma_txreq *, int);
+ static inline void pq_update(struct hfi1_user_sdma_pkt_q *);
+ static void user_sdma_free_request(struct user_sdma_request *, bool);
  static int pin_vector_pages(struct user_sdma_request *,
                            struct user_sdma_iovec *);
- static void unpin_vector_pages(struct user_sdma_request *,
-                              struct user_sdma_iovec *);
+ static void unpin_vector_pages(struct mm_struct *, struct page **, unsigned);
  static int check_header_template(struct user_sdma_request *,
                                 struct hfi1_pkt_header *, u32, u32);
  static int set_txreq_header(struct user_sdma_request *,
                            struct user_sdma_txreq *, u32);
  static int set_txreq_header_ahg(struct user_sdma_request *,
                                struct user_sdma_txreq *, u32);
- static inline void set_comp_state(struct user_sdma_request *,
-                                       enum hfi1_sdma_comp_state, int);
+ static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *,
+                                 struct hfi1_user_sdma_comp_q *,
+                                 u16, enum hfi1_sdma_comp_state, int);
  static inline u32 set_pkt_bth_psn(__be32, u8, u32);
  static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len);
  
@@@ -303,6 -298,17 +297,17 @@@ static int defer_packet_queue
        struct sdma_txreq *,
        unsigned seq);
  static void activate_packet_queue(struct iowait *, int);
+ static bool sdma_rb_filter(struct mmu_rb_node *, unsigned long, unsigned long);
+ static int sdma_rb_insert(struct rb_root *, struct mmu_rb_node *);
+ static void sdma_rb_remove(struct rb_root *, struct mmu_rb_node *, bool);
+ static int sdma_rb_invalidate(struct rb_root *, struct mmu_rb_node *);
+ static struct mmu_rb_ops sdma_rb_ops = {
+       .filter = sdma_rb_filter,
+       .insert = sdma_rb_insert,
+       .remove = sdma_rb_remove,
+       .invalidate = sdma_rb_invalidate
+ };
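The pinned-page cache now hands a table of callbacks (filter, insert, remove, invalidate) to hfi1_mmu_rb_register(); sdma_rb_ops is simply a struct of function pointers the rb-tree helper calls back into. A generic sketch of that ops-table pattern with invented node and callback types:

/* Generic sketch of an ops-table registration like sdma_rb_ops above.
 * The node and callback types are invented for illustration. */
#include <stdio.h>

struct node { unsigned long addr, len; };

struct node_ops {
        int  (*insert)(struct node *);
        void (*remove)(struct node *);
        int  (*filter)(struct node *, unsigned long addr, unsigned long len);
};

static int  my_insert(struct node *n) { printf("insert %lu\n", n->addr); return 0; }
static void my_remove(struct node *n) { printf("remove %lu\n", n->addr); }
static int  my_filter(struct node *n, unsigned long a, unsigned long l)
{
        return n->addr <= a && a + l <= n->addr + n->len;   /* containment test */
}

static const struct node_ops my_ops = {
        .insert = my_insert,
        .remove = my_remove,
        .filter = my_filter,
};

int main(void)
{
        struct node n = { 4096, 8192 };

        my_ops.insert(&n);
        printf("hit: %d\n", my_ops.filter(&n, 4096, 100));
        my_ops.remove(&n);
        return 0;
}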
  
  static int defer_packet_queue(
        struct sdma_engine *sde,
@@@ -345,7 -351,7 +350,7 @@@ static void activate_packet_queue(struc
  
  static void sdma_kmem_cache_ctor(void *obj)
  {
 -      struct user_sdma_txreq *tx = (struct user_sdma_txreq *)obj;
 +      struct user_sdma_txreq *tx = obj;
  
        memset(tx, 0, sizeof(*tx));
  }
@@@ -380,7 -386,7 +385,7 @@@ int hfi1_user_sdma_alloc_queues(struct 
                goto pq_nomem;
  
        memsize = sizeof(*pq->reqs) * hfi1_sdma_comp_ring_size;
-       pq->reqs = kmalloc(memsize, GFP_KERNEL);
+       pq->reqs = kzalloc(memsize, GFP_KERNEL);
        if (!pq->reqs)
                goto pq_reqs_nomem;
  
        pq->state = SDMA_PKT_Q_INACTIVE;
        atomic_set(&pq->n_reqs, 0);
        init_waitqueue_head(&pq->wait);
+       pq->sdma_rb_root = RB_ROOT;
+       INIT_LIST_HEAD(&pq->evict);
+       spin_lock_init(&pq->evict_lock);
  
        iowait_init(&pq->busy, 0, NULL, defer_packet_queue,
-                   activate_packet_queue);
+                   activate_packet_queue, NULL);
        pq->reqidx = 0;
        snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt,
                 fd->subctxt);
        if (!cq)
                goto cq_nomem;
  
 -      memsize = ALIGN(sizeof(*cq->comps) * hfi1_sdma_comp_ring_size,
 -                      PAGE_SIZE);
 +      memsize = PAGE_ALIGN(sizeof(*cq->comps) * hfi1_sdma_comp_ring_size);
        cq->comps = vmalloc_user(memsize);
        if (!cq->comps)
                goto cq_comps_nomem;
        cq->nentries = hfi1_sdma_comp_ring_size;
        fd->cq = cq;
  
+       ret = hfi1_mmu_rb_register(&pq->sdma_rb_root, &sdma_rb_ops);
+       if (ret) {
+               dd_dev_err(dd, "Failed to register with MMU %d", ret);
+               goto done;
+       }
        spin_lock_irqsave(&uctxt->sdma_qlock, flags);
        list_add(&pq->list, &uctxt->sdma_queues);
        spin_unlock_irqrestore(&uctxt->sdma_qlock, flags);
@@@ -450,6 -466,7 +464,7 @@@ int hfi1_user_sdma_free_queues(struct h
        hfi1_cdbg(SDMA, "[%u:%u:%u] Freeing user SDMA queues", uctxt->dd->unit,
                  uctxt->ctxt, fd->subctxt);
        pq = fd->pq;
+       hfi1_mmu_rb_unregister(&pq->sdma_rb_root);
        if (pq) {
                spin_lock_irqsave(&uctxt->sdma_qlock, flags);
                if (!list_empty(&pq->list))
                fd->pq = NULL;
        }
        if (fd->cq) {
 -              if (fd->cq->comps)
 -                      vfree(fd->cq->comps);
 +              vfree(fd->cq->comps);
                kfree(fd->cq);
                fd->cq = NULL;
        }
  int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
                                   unsigned long dim, unsigned long *count)
  {
-       int ret = 0, i = 0, sent;
+       int ret = 0, i = 0;
        struct hfi1_filedata *fd = fp->private_data;
        struct hfi1_ctxtdata *uctxt = fd->uctxt;
        struct hfi1_user_sdma_pkt_q *pq = fd->pq;
                          dd->unit, uctxt->ctxt, fd->subctxt, ret);
                return -EFAULT;
        }
        trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt,
                                     (u16 *)&info);
-       if (cq->comps[info.comp_idx].status == QUEUED) {
+       if (cq->comps[info.comp_idx].status == QUEUED ||
+           test_bit(SDMA_REQ_IN_USE, &pq->reqs[info.comp_idx].flags)) {
                hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in QUEUED state",
                          dd->unit, uctxt->ctxt, fd->subctxt,
                          info.comp_idx);
        req->cq = cq;
        req->status = -1;
        INIT_LIST_HEAD(&req->txps);
-       INIT_LIST_HEAD(&req->txcmp);
-       INIT_WORK(&req->worker, user_sdma_delayed_completion);
  
-       spin_lock_init(&req->txcmp_lock);
        memcpy(&req->info, &info, sizeof(info));
  
        if (req_opcode(info.ctrl) == EXPECTED)
        }
  
        req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]);
-       /* Calculate the initial TID offset based on the values of
-          KDETH.OFFSET and KDETH.OM that are passed in. */
+       /*
+        * Calculate the initial TID offset based on the values of
+        * KDETH.OFFSET and KDETH.OM that are passed in.
+        */
        req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) *
                (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
                 KDETH_OM_LARGE : KDETH_OM_SMALL);
  
        /* Save all the IO vector structures */
        while (i < req->data_iovs) {
+               INIT_LIST_HEAD(&req->iovs[i].list);
                memcpy(&req->iovs[i].iov, iovec + idx++, sizeof(struct iovec));
-               req->iovs[i].offset = 0;
+               ret = pin_vector_pages(req, &req->iovs[i]);
+               if (ret) {
+                       req->status = ret;
+                       goto free_req;
+               }
                req->data_len += req->iovs[i++].iov.iov_len;
        }
        SDMA_DBG(req, "total data length %u", req->data_len);
                }
        }
  
-       set_comp_state(req, QUEUED, 0);
+       set_comp_state(pq, cq, info.comp_idx, QUEUED, 0);
+       atomic_inc(&pq->n_reqs);
        /* Send the first N packets in the request to buy us some time */
-       sent = user_sdma_send_pkts(req, pcount);
-       if (unlikely(sent < 0)) {
-               if (sent != -EBUSY) {
-                       req->status = sent;
-                       set_comp_state(req, ERROR, req->status);
-                       return sent;
-               } else
-                       sent = 0;
+       ret = user_sdma_send_pkts(req, pcount);
+       if (unlikely(ret < 0 && ret != -EBUSY)) {
+               req->status = ret;
+               goto free_req;
        }
-       atomic_inc(&pq->n_reqs);
-       xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
  
-       if (sent < req->info.npkts) {
-               /*
-                * This is a somewhat blocking send implementation.
-                * The driver will block the caller until all packets of the
-                * request have been submitted to the SDMA engine. However, it
-                * will not wait for send completions.
-                */
-               while (!test_bit(SDMA_REQ_SEND_DONE, &req->flags)) {
-                       ret = user_sdma_send_pkts(req, pcount);
-                       if (ret < 0) {
-                               if (ret != -EBUSY) {
-                                       req->status = ret;
-                                       return ret;
-                               }
-                               wait_event_interruptible_timeout(
-                                       pq->busy.wait_dma,
-                                       (pq->state == SDMA_PKT_Q_ACTIVE),
-                                       msecs_to_jiffies(
-                                               SDMA_IOWAIT_TIMEOUT));
+       /*
+        * It is possible that the SDMA engine would have processed all the
+        * submitted packets by the time we get here. Therefore, only set
+        * packet queue state to ACTIVE if there are still uncompleted
+        * requests.
+        */
+       if (atomic_read(&pq->n_reqs))
+               xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
+       /*
+        * This is a somewhat blocking send implementation.
+        * The driver will block the caller until all packets of the
+        * request have been submitted to the SDMA engine. However, it
+        * will not wait for send completions.
+        */
+       while (!test_bit(SDMA_REQ_SEND_DONE, &req->flags)) {
+               ret = user_sdma_send_pkts(req, pcount);
+               if (ret < 0) {
+                       if (ret != -EBUSY) {
+                               req->status = ret;
+                               set_bit(SDMA_REQ_DONE_ERROR, &req->flags);
+                               if (ACCESS_ONCE(req->seqcomp) ==
+                                   req->seqsubmitted - 1)
+                                       goto free_req;
+                               return ret;
                        }
+                       wait_event_interruptible_timeout(
+                               pq->busy.wait_dma,
+                               (pq->state == SDMA_PKT_Q_ACTIVE),
+                               msecs_to_jiffies(
+                                       SDMA_IOWAIT_TIMEOUT));
                }
        }
        *count += idx;
        return 0;
  free_req:
-       user_sdma_free_request(req);
+       user_sdma_free_request(req, true);
+       pq_update(pq);
+       set_comp_state(pq, cq, info.comp_idx, ERROR, req->status);
        return ret;
  }
  
  static inline u32 compute_data_length(struct user_sdma_request *req,
-                                           struct user_sdma_txreq *tx)
+                                     struct user_sdma_txreq *tx)
  {
        /*
         * Determine the proper size of the packet data.
        } else if (req_opcode(req->info.ctrl) == EXPECTED) {
                u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) *
                        PAGE_SIZE;
-               /* Get the data length based on the remaining space in the
-                * TID pair. */
+               /*
+                * Get the data length based on the remaining space in the
+                * TID pair.
+                */
                len = min(tidlen - req->tidoffset, (u32)req->info.fragsize);
                /* If we've filled up the TID pair, move to the next one. */
                if (unlikely(!len) && ++req->tididx < req->n_tids &&
                        req->tidoffset = 0;
                        len = min_t(u32, tidlen, req->info.fragsize);
                }
-               /* Since the TID pairs map entire pages, make sure that we
+               /*
+                * Since the TID pairs map entire pages, make sure that we
                 * are not going to try to send more data that we have
-                * remaining. */
+                * remaining.
+                */
                len = min(len, req->data_len - req->sent);
-       } else
+       } else {
                len = min(req->data_len - req->sent, (u32)req->info.fragsize);
+       }
        SDMA_DBG(req, "Data Length = %u", len);
        return len;
  }
@@@ -810,9 -846,7 +843,7 @@@ static int user_sdma_send_pkts(struct u
                tx->flags = 0;
                tx->req = req;
                tx->busycount = 0;
-               tx->idx = -1;
                INIT_LIST_HEAD(&tx->list);
-               memset(tx->iovecs, 0, sizeof(tx->iovecs));
  
                if (req->seqnum == req->info.npkts - 1)
                        tx->flags |= TXREQ_FLAGS_REQ_LAST_PKT;
                                WARN_ON(iovec->offset);
                        }
  
-                       /*
-                        * This request might include only a header and no user
-                        * data, so pin pages only if there is data and it the
-                        * pages have not been pinned already.
-                        */
-                       if (unlikely(!iovec->pages && iovec->iov.iov_len)) {
-                               ret = pin_vector_pages(req, iovec);
-                               if (ret)
-                                       goto free_tx;
-                       }
-                       tx->iovecs[++tx->idx].vec = iovec;
                        datalen = compute_data_length(req, tx);
                        if (!datalen) {
                                SDMA_DBG(req,
                        unsigned pageidx, len;
  
                        base = (unsigned long)iovec->iov.iov_base;
 -                      offset = ((base + iovec->offset + iov_offset) &
 -                                ~PAGE_MASK);
 +                      offset = offset_in_page(base + iovec->offset +
 +                                              iov_offset);
                        pageidx = (((iovec->offset + iov_offset +
                                     base) - (base & PAGE_MASK)) >> PAGE_SHIFT);
                        len = offset + req->info.fragsize > PAGE_SIZE ?
                                              iovec->pages[pageidx],
                                              offset, len);
                        if (ret) {
-                               int i;
                                SDMA_DBG(req, "SDMA txreq add page failed %d\n",
                                         ret);
-                               /* Mark all assigned vectors as complete so they
-                                * are unpinned in the callback. */
-                               for (i = tx->idx; i >= 0; i--) {
-                                       tx->iovecs[i].flags |=
-                                               TXREQ_FLAGS_IOVEC_LAST_PKT;
-                               }
                                goto free_txreq;
                        }
                        iov_offset += len;
                        data_sent += len;
                        if (unlikely(queued < datalen &&
                                     pageidx == iovec->npages &&
-                                    req->iov_idx < req->data_iovs - 1 &&
-                                    tx->idx < ARRAY_SIZE(tx->iovecs))) {
+                                    req->iov_idx < req->data_iovs - 1)) {
                                iovec->offset += iov_offset;
-                               tx->iovecs[tx->idx].flags |=
-                                       TXREQ_FLAGS_IOVEC_LAST_PKT;
                                iovec = &req->iovs[++req->iov_idx];
-                               if (!iovec->pages) {
-                                       ret = pin_vector_pages(req, iovec);
-                                       if (ret)
-                                               goto free_txreq;
-                               }
                                iov_offset = 0;
-                               tx->iovecs[++tx->idx].vec = iovec;
                        }
                }
                /*
                if (req_opcode(req->info.ctrl) == EXPECTED)
                        req->tidoffset += datalen;
                req->sent += data_sent;
-               if (req->data_len) {
-                       tx->iovecs[tx->idx].vec->offset += iov_offset;
-                       /* If we've reached the end of the io vector, mark it
-                        * so the callback can unpin the pages and free it. */
-                       if (tx->iovecs[tx->idx].vec->offset ==
-                           tx->iovecs[tx->idx].vec->iov.iov_len)
-                               tx->iovecs[tx->idx].flags |=
-                                       TXREQ_FLAGS_IOVEC_LAST_PKT;
-               }
+               if (req->data_len)
+                       iovec->offset += iov_offset;
+               list_add_tail(&tx->txreq.list, &req->txps);
                /*
                 * It is important to increment this here as it is used to
                 * generate the BTH.PSN and, therefore, can't be bulk-updated
                 * outside of the loop.
                 */
                tx->seqnum = req->seqnum++;
-               list_add_tail(&tx->txreq.list, &req->txps);
                npkts++;
        }
  dosend:
        ret = sdma_send_txlist(req->sde, &pq->busy, &req->txps);
-       if (list_empty(&req->txps))
+       if (list_empty(&req->txps)) {
+               req->seqsubmitted = req->seqnum;
                if (req->seqnum == req->info.npkts) {
                        set_bit(SDMA_REQ_SEND_DONE, &req->flags);
                        /*
                        if (test_bit(SDMA_REQ_HAVE_AHG, &req->flags))
                                sdma_ahg_free(req->sde, req->ahg_idx);
                }
+       } else if (ret > 0) {
+               req->seqsubmitted += ret;
+               ret = 0;
+       }
        return ret;
  
  free_txreq:
@@@ -1021,7 -1023,7 +1020,7 @@@ free_tx
   */
  static inline int num_user_pages(const struct iovec *iov)
  {
-       const unsigned long addr  = (unsigned long) iov->iov_base;
+       const unsigned long addr  = (unsigned long)iov->iov_base;
        const unsigned long len   = iov->iov_len;
        const unsigned long spage = addr & PAGE_MASK;
        const unsigned long epage = (addr + len - 1) & PAGE_MASK;
        return 1 + ((epage - spage) >> PAGE_SHIFT);
  }
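num_user_pages() counts the pages an iovec touches: round the start address and the address of the last byte down to page boundaries, and the span is one plus the difference in page numbers. A quick worked example with 4 KiB pages and arbitrary sample buffers:

/* Worked example of the page-span arithmetic in num_user_pages(),
 * with a 4 KiB page size and arbitrary sample buffers. */
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

static unsigned long num_pages(unsigned long addr, unsigned long len)
{
        unsigned long spage = addr & PAGE_MASK;
        unsigned long epage = (addr + len - 1) & PAGE_MASK;

        return 1 + ((epage - spage) >> PAGE_SHIFT);
}

int main(void)
{
        /* starts 100 bytes before a page boundary, 200 bytes long: 2 pages */
        printf("%lu\n", num_pages(0x10000 - 100, 200));
        /* fully inside one page */
        printf("%lu\n", num_pages(0x20010, 64));
        return 0;
}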
  
- static int pin_vector_pages(struct user_sdma_request *req,
-                           struct user_sdma_iovec *iovec) {
-       int pinned, npages;
+ /* Caller must hold pq->evict_lock */
+ static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages)
+ {
+       u32 cleared = 0;
+       struct sdma_mmu_node *node, *ptr;
  
-       npages = num_user_pages(&iovec->iov);
-       iovec->pages = kcalloc(npages, sizeof(*iovec->pages), GFP_KERNEL);
-       if (!iovec->pages) {
-               SDMA_DBG(req, "Failed page array alloc");
-               return -ENOMEM;
+       list_for_each_entry_safe_reverse(node, ptr, &pq->evict, list) {
+               /* Make sure that no one is still using the node. */
+               if (!atomic_read(&node->refcount)) {
+                       /*
+                        * Need to use the page count now as the remove callback
+                        * will free the node.
+                        */
+                       cleared += node->npages;
+                       spin_unlock(&pq->evict_lock);
+                       hfi1_mmu_rb_remove(&pq->sdma_rb_root, &node->rb);
+                       spin_lock(&pq->evict_lock);
+                       if (cleared >= npages)
+                               break;
+               }
        }
+       return cleared;
+ }
  
-       /*
-        * Get a reference to the process's mm so we can use it when
-        * unpinning the io vectors.
-        */
-       req->pq->user_mm = get_task_mm(current);
+ static int pin_vector_pages(struct user_sdma_request *req,
+                           struct user_sdma_iovec *iovec) {
+       int ret = 0, pinned, npages, cleared;
+       struct page **pages;
+       struct hfi1_user_sdma_pkt_q *pq = req->pq;
+       struct sdma_mmu_node *node = NULL;
+       struct mmu_rb_node *rb_node;
+       rb_node = hfi1_mmu_rb_search(&pq->sdma_rb_root,
+                                    (unsigned long)iovec->iov.iov_base,
+                                    iovec->iov.iov_len);
+       if (rb_node)
+               node = container_of(rb_node, struct sdma_mmu_node, rb);
+       if (!node) {
+               node = kzalloc(sizeof(*node), GFP_KERNEL);
+               if (!node)
+                       return -ENOMEM;
  
-       pinned = hfi1_acquire_user_pages((unsigned long)iovec->iov.iov_base,
-                                        npages, 0, iovec->pages);
+               node->rb.addr = (unsigned long)iovec->iov.iov_base;
+               node->rb.len = iovec->iov.iov_len;
+               node->pq = pq;
+               atomic_set(&node->refcount, 0);
+               INIT_LIST_HEAD(&node->list);
+       }
  
-       if (pinned < 0)
-               return pinned;
+       npages = num_user_pages(&iovec->iov);
+       if (node->npages < npages) {
+               pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
+               if (!pages) {
+                       SDMA_DBG(req, "Failed page array alloc");
+                       ret = -ENOMEM;
+                       goto bail;
+               }
+               memcpy(pages, node->pages, node->npages * sizeof(*pages));
+               npages -= node->npages;
+ retry:
+               if (!hfi1_can_pin_pages(pq->dd, pq->n_locked, npages)) {
+                       spin_lock(&pq->evict_lock);
+                       cleared = sdma_cache_evict(pq, npages);
+                       spin_unlock(&pq->evict_lock);
+                       if (cleared >= npages)
+                               goto retry;
+               }
+               pinned = hfi1_acquire_user_pages(
+                       ((unsigned long)iovec->iov.iov_base +
+                        (node->npages * PAGE_SIZE)), npages, 0,
+                       pages + node->npages);
+               if (pinned < 0) {
+                       kfree(pages);
+                       ret = pinned;
+                       goto bail;
+               }
+               if (pinned != npages) {
+                       unpin_vector_pages(current->mm, pages, pinned);
+                       ret = -EFAULT;
+                       goto bail;
+               }
+               kfree(node->pages);
+               node->pages = pages;
+               node->npages += pinned;
+               npages = node->npages;
+               spin_lock(&pq->evict_lock);
+               if (!rb_node)
+                       list_add(&node->list, &pq->evict);
+               else
+                       list_move(&node->list, &pq->evict);
+               pq->n_locked += pinned;
+               spin_unlock(&pq->evict_lock);
+       }
+       iovec->pages = node->pages;
+       iovec->npages = npages;
  
-       iovec->npages = pinned;
-       if (pinned != npages) {
-               SDMA_DBG(req, "Failed to pin pages (%d/%u)", pinned, npages);
-               unpin_vector_pages(req, iovec);
-               return -EFAULT;
+       if (!rb_node) {
+               ret = hfi1_mmu_rb_insert(&req->pq->sdma_rb_root, &node->rb);
+               if (ret) {
+                       spin_lock(&pq->evict_lock);
+                       list_del(&node->list);
+                       pq->n_locked -= node->npages;
+                       spin_unlock(&pq->evict_lock);
+                       ret = 0;
+                       goto bail;
+               }
+       } else {
+               atomic_inc(&node->refcount);
        }
        return 0;
+ bail:
+       if (!rb_node)
+               kfree(node);
+       return ret;
  }
  
- static void unpin_vector_pages(struct user_sdma_request *req,
-                              struct user_sdma_iovec *iovec)
+ static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
+                              unsigned npages)
  {
-       /*
-        * Unpinning is done through the workqueue so use the
-        * process's mm if we have a reference to it.
-        */
-       if ((current->flags & PF_KTHREAD) && req->pq->user_mm)
-               use_mm(req->pq->user_mm);
-       hfi1_release_user_pages(iovec->pages, iovec->npages, 0);
-       /*
-        * Unuse the user's mm (see above) and release the
-        * reference to it.
-        */
-       if (req->pq->user_mm) {
-               if (current->flags & PF_KTHREAD)
-                       unuse_mm(req->pq->user_mm);
-               mmput(req->pq->user_mm);
-       }
-       kfree(iovec->pages);
-       iovec->pages = NULL;
-       iovec->npages = 0;
-       iovec->offset = 0;
+       hfi1_release_user_pages(mm, pages, npages, 0);
+       kfree(pages);
  }
  
  static int check_header_template(struct user_sdma_request *req,
@@@ -1209,7 -1276,6 +1273,6 @@@ static int set_txreq_header(struct user
                if (ret)
                        return ret;
                goto done;
        }
  
        hdr->bth[2] = cpu_to_be32(
  
        /* Set ACK request on last packet */
        if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT))
-               hdr->bth[2] |= cpu_to_be32(1UL<<31);
+               hdr->bth[2] |= cpu_to_be32(1UL << 31);
  
        /* Set the new offset */
        hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset);
                if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
                                         PAGE_SIZE)) {
                        req->tidoffset = 0;
-                       /* Since we don't copy all the TIDs, all at once,
-                        * we have to check again. */
+                       /*
+                        * Since we don't copy all the TIDs, all at once,
+                        * we have to check again.
+                        */
                        if (++req->tididx > req->n_tids - 1 ||
                            !req->tids[req->tididx]) {
                                return -EINVAL;
@@@ -1315,8 -1383,10 +1380,10 @@@ static int set_txreq_header_ahg(struct 
                if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
                                         PAGE_SIZE)) {
                        req->tidoffset = 0;
-                       /* Since we don't copy all the TIDs, all at once,
-                        * we have to check again. */
+                       /*
+                        * Since we don't copy all the TIDs, all at once,
+                        * we have to check again.
+                        */
                        if (++req->tididx > req->n_tids - 1 ||
                            !req->tids[req->tididx]) {
                                return -EINVAL;
                                                                INTR) >> 16);
                        val &= cpu_to_le16(~(1U << 13));
                        AHG_HEADER_SET(req->ahg, diff, 7, 16, 14, val);
-               } else
+               } else {
                        AHG_HEADER_SET(req->ahg, diff, 7, 16, 12, val);
+               }
        }
  
        trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt,
   * tx request have been processed by the DMA engine. Called in
   * interrupt context.
   */
- static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status,
-                              int drain)
+ static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
  {
        struct user_sdma_txreq *tx =
                container_of(txreq, struct user_sdma_txreq, txreq);
        struct user_sdma_request *req;
-       bool defer;
-       int i;
+       struct hfi1_user_sdma_pkt_q *pq;
+       struct hfi1_user_sdma_comp_q *cq;
+       u16 idx;
  
        if (!tx->req)
                return;
  
        req = tx->req;
-       /*
-        * If this is the callback for the last packet of the request,
-        * queue up the request for clean up.
-        */
-       defer = (tx->seqnum == req->info.npkts - 1);
-       /*
-        * If we have any io vectors associated with this txreq,
-        * check whether they need to be 'freed'. We can't free them
-        * here because the unpin function needs to be able to sleep.
-        */
-       for (i = tx->idx; i >= 0; i--) {
-               if (tx->iovecs[i].flags & TXREQ_FLAGS_IOVEC_LAST_PKT) {
-                       defer = true;
-                       break;
-               }
-       }
+       pq = req->pq;
+       cq = req->cq;
  
-       req->status = status;
        if (status != SDMA_TXREQ_S_OK) {
                SDMA_DBG(req, "SDMA completion with error %d",
                         status);
                set_bit(SDMA_REQ_HAS_ERROR, &req->flags);
-               defer = true;
        }
  
-       /*
-        * Defer the clean up of the iovectors and the request until later
-        * so it can be done outside of interrupt context.
-        */
-       if (defer) {
-               spin_lock(&req->txcmp_lock);
-               list_add_tail(&tx->list, &req->txcmp);
-               spin_unlock(&req->txcmp_lock);
-               schedule_work(&req->worker);
+       req->seqcomp = tx->seqnum;
+       kmem_cache_free(pq->txreq_cache, tx);
+       tx = NULL;
+       idx = req->info.comp_idx;
+       if (req->status == -1 && status == SDMA_TXREQ_S_OK) {
+               if (req->seqcomp == req->info.npkts - 1) {
+                       req->status = 0;
+                       user_sdma_free_request(req, false);
+                       pq_update(pq);
+                       set_comp_state(pq, cq, idx, COMPLETE, 0);
+               }
        } else {
-               kmem_cache_free(req->pq->txreq_cache, tx);
+               if (status != SDMA_TXREQ_S_OK)
+                       req->status = status;
+               if (req->seqcomp == (ACCESS_ONCE(req->seqsubmitted) - 1) &&
+                   (test_bit(SDMA_REQ_SEND_DONE, &req->flags) ||
+                    test_bit(SDMA_REQ_DONE_ERROR, &req->flags))) {
+                       user_sdma_free_request(req, false);
+                       pq_update(pq);
+                       set_comp_state(pq, cq, idx, ERROR, req->status);
+               }
        }
  }
  
- static void user_sdma_delayed_completion(struct work_struct *work)
+ static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq)
  {
-       struct user_sdma_request *req =
-               container_of(work, struct user_sdma_request, worker);
-       struct hfi1_user_sdma_pkt_q *pq = req->pq;
-       struct user_sdma_txreq *tx = NULL;
-       unsigned long flags;
-       u64 seqnum;
-       int i;
-       while (1) {
-               spin_lock_irqsave(&req->txcmp_lock, flags);
-               if (!list_empty(&req->txcmp)) {
-                       tx = list_first_entry(&req->txcmp,
-                                             struct user_sdma_txreq, list);
-                       list_del(&tx->list);
-               }
-               spin_unlock_irqrestore(&req->txcmp_lock, flags);
-               if (!tx)
-                       break;
-               for (i = tx->idx; i >= 0; i--)
-                       if (tx->iovecs[i].flags & TXREQ_FLAGS_IOVEC_LAST_PKT)
-                               unpin_vector_pages(req, tx->iovecs[i].vec);
-               seqnum = tx->seqnum;
-               kmem_cache_free(pq->txreq_cache, tx);
-               tx = NULL;
-               if (req->status != SDMA_TXREQ_S_OK) {
-                       if (seqnum == ACCESS_ONCE(req->seqnum) &&
-                           test_bit(SDMA_REQ_DONE_ERROR, &req->flags)) {
-                               atomic_dec(&pq->n_reqs);
-                               set_comp_state(req, ERROR, req->status);
-                               user_sdma_free_request(req);
-                               break;
-                       }
-               } else {
-                       if (seqnum == req->info.npkts - 1) {
-                               atomic_dec(&pq->n_reqs);
-                               set_comp_state(req, COMPLETE, 0);
-                               user_sdma_free_request(req);
-                               break;
-                       }
-               }
-       }
-       if (!atomic_read(&pq->n_reqs)) {
+       if (atomic_dec_and_test(&pq->n_reqs)) {
                xchg(&pq->state, SDMA_PKT_Q_INACTIVE);
                wake_up(&pq->wait);
        }
  }
  
- static void user_sdma_free_request(struct user_sdma_request *req)
+ static void user_sdma_free_request(struct user_sdma_request *req, bool unpin)
  {
        if (!list_empty(&req->txps)) {
                struct sdma_txreq *t, *p;
                }
        }
        if (req->data_iovs) {
+               struct sdma_mmu_node *node;
+               struct mmu_rb_node *mnode;
                int i;
  
-               for (i = 0; i < req->data_iovs; i++)
-                       if (req->iovs[i].npages && req->iovs[i].pages)
-                               unpin_vector_pages(req, &req->iovs[i]);
+               for (i = 0; i < req->data_iovs; i++) {
+                       mnode = hfi1_mmu_rb_search(
+                               &req->pq->sdma_rb_root,
+                               (unsigned long)req->iovs[i].iov.iov_base,
+                               req->iovs[i].iov.iov_len);
+                       if (!mnode)
+                               continue;
+                       node = container_of(mnode, struct sdma_mmu_node, rb);
+                       if (unpin)
+                               hfi1_mmu_rb_remove(&req->pq->sdma_rb_root,
+                                                  &node->rb);
+                       else
+                               atomic_dec(&node->refcount);
+               }
        }
        kfree(req->tids);
        clear_bit(SDMA_REQ_IN_USE, &req->flags);
  }
  
- static inline void set_comp_state(struct user_sdma_request *req,
-                                       enum hfi1_sdma_comp_state state,
-                                       int ret)
+ static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
+                                 struct hfi1_user_sdma_comp_q *cq,
+                                 u16 idx, enum hfi1_sdma_comp_state state,
+                                 int ret)
  {
-       SDMA_DBG(req, "Setting completion status %u %d", state, ret);
-       req->cq->comps[req->info.comp_idx].status = state;
+       hfi1_cdbg(SDMA, "[%u:%u:%u:%u] Setting completion status %u %d",
+                 pq->dd->unit, pq->ctxt, pq->subctxt, idx, state, ret);
+       cq->comps[idx].status = state;
        if (state == ERROR)
-               req->cq->comps[req->info.comp_idx].errcode = -ret;
-       trace_hfi1_sdma_user_completion(req->pq->dd, req->pq->ctxt,
-                                       req->pq->subctxt, req->info.comp_idx,
-                                       state, ret);
+               cq->comps[idx].errcode = -ret;
+       trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt,
+                                       idx, state, ret);
+ }
+ 
+ static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
+                          unsigned long len)
+ {
+       return (bool)(node->addr == addr);
+ }
+ 
+ static int sdma_rb_insert(struct rb_root *root, struct mmu_rb_node *mnode)
+ {
+       struct sdma_mmu_node *node =
+               container_of(mnode, struct sdma_mmu_node, rb);
+       atomic_inc(&node->refcount);
+       return 0;
+ }
+ 
+ static void sdma_rb_remove(struct rb_root *root, struct mmu_rb_node *mnode,
+                          bool notifier)
+ {
+       struct sdma_mmu_node *node =
+               container_of(mnode, struct sdma_mmu_node, rb);
+       spin_lock(&node->pq->evict_lock);
+       list_del(&node->list);
+       node->pq->n_locked -= node->npages;
+       spin_unlock(&node->pq->evict_lock);
+       unpin_vector_pages(notifier ? NULL : current->mm, node->pages,
+                          node->npages);
+       /*
+        * If called by the MMU notifier, we have to adjust the pinned
+        * page count ourselves.
+        */
+       if (notifier)
+               current->mm->pinned_vm -= node->npages;
+       kfree(node);
+ }
+ 
+ static int sdma_rb_invalidate(struct rb_root *root, struct mmu_rb_node *mnode)
+ {
+       struct sdma_mmu_node *node =
+               container_of(mnode, struct sdma_mmu_node, rb);
+       if (!atomic_read(&node->refcount))
+               return 1;
+       return 0;
  }
diff --combined include/linux/mlx5/device.h
index 02ac3000ee3c14e83550e2a9481f5bb82df91422,4b531c44b3c7767660375515f093242394a37e84..8156e3c9239ce6a5c6883040b1b6d1b619ad1169
@@@ -373,12 -373,6 +373,12 @@@ enum 
        MLX5_SET_PORT_PKEY_TABLE        = 20,
  };
  
 +enum {
 +      MLX5_BW_NO_LIMIT   = 0,
 +      MLX5_100_MBPS_UNIT = 3,
 +      MLX5_GBPS_UNIT     = 4,
 +};
 +
  enum {
        MLX5_MAX_PAGE_SHIFT             = 31
  };
@@@ -1206,17 -1200,6 +1206,17 @@@ enum 
        MLX5_RQC_RQ_TYPE_MEMORY_RQ_RPM    = 0x1,
  };
  
 +enum mlx5_wol_mode {
 +      MLX5_WOL_DISABLE        = 0,
 +      MLX5_WOL_SECURED_MAGIC  = 1 << 1,
 +      MLX5_WOL_MAGIC          = 1 << 2,
 +      MLX5_WOL_ARP            = 1 << 3,
 +      MLX5_WOL_BROADCAST      = 1 << 4,
 +      MLX5_WOL_MULTICAST      = 1 << 5,
 +      MLX5_WOL_UNICAST        = 1 << 6,
 +      MLX5_WOL_PHY_ACTIVITY   = 1 << 7,
 +};
 +
  /* MLX5 DEV CAPs */
  
  /* TODO: EAT.ME */
@@@ -1236,6 -1219,8 +1236,8 @@@ enum mlx5_cap_type 
        MLX5_CAP_FLOW_TABLE,
        MLX5_CAP_ESWITCH_FLOW_TABLE,
        MLX5_CAP_ESWITCH,
+       MLX5_CAP_RESERVED,
+       MLX5_CAP_VECTOR_CALC,
        /* NUM OF CAP Types */
        MLX5_CAP_NUM
  };
  #define MLX5_CAP_ODP(mdev, cap)\
        MLX5_GET(odp_cap, mdev->hca_caps_cur[MLX5_CAP_ODP], cap)
  
+ #define MLX5_CAP_VECTOR_CALC(mdev, cap) \
+       MLX5_GET(vector_calc_cap, \
+                mdev->hca_caps_cur[MLX5_CAP_VECTOR_CALC], cap)
  enum {
        MLX5_CMD_STAT_OK                        = 0x0,
        MLX5_CMD_STAT_INT_ERR                   = 0x1,
diff --combined include/linux/mlx5/driver.h
index 3a954465b2bfc00d81f7fff8d87a4ce038995cb6,e1d987fb49b2789e618a0dd94c413c296d4e45d0..dcd5ac8d3b1403875bce11aeddaf106acc0cd218
@@@ -54,7 -54,7 +54,7 @@@ enum 
        /* one minute for the sake of bringup. Generally, commands must always
         * complete and we may need to increase this timeout value
         */
 -      MLX5_CMD_TIMEOUT_MSEC   = 7200 * 1000,
 +      MLX5_CMD_TIMEOUT_MSEC   = 60 * 1000,
        MLX5_CMD_WQ_MAX_NAME    = 32,
  };
  
@@@ -99,8 -99,6 +99,8 @@@ enum 
  };
  
  enum {
 +      MLX5_REG_QETCR           = 0x4005,
 +      MLX5_REG_QTCT            = 0x400a,
        MLX5_REG_PCAP            = 0x5001,
        MLX5_REG_PMTU            = 0x5003,
        MLX5_REG_PTYS            = 0x5004,
@@@ -460,6 -458,8 +460,6 @@@ struct mlx5_priv 
        struct mlx5_uuar_info   uuari;
        MLX5_DECLARE_DOORBELL_LOCK(cq_uar_lock);
  
 -      struct io_mapping       *bf_mapping;
 -
        /* pages stuff */
        struct workqueue_struct *pg_wq;
        struct rb_root          page_root;
@@@ -613,7 -613,10 +613,10 @@@ struct mlx5_pas 
  };
  
  enum port_state_policy {
-       MLX5_AAA_000
+       MLX5_POLICY_DOWN        = 0,
+       MLX5_POLICY_UP          = 1,
+       MLX5_POLICY_FOLLOW      = 2,
+       MLX5_POLICY_INVALID     = 0xffffffff
  };
  
  enum phy_port_state {
@@@ -706,8 -709,7 +709,7 @@@ void mlx5_cmd_use_events(struct mlx5_co
  void mlx5_cmd_use_polling(struct mlx5_core_dev *dev);
  int mlx5_cmd_status_to_err(struct mlx5_outbox_hdr *hdr);
  int mlx5_cmd_status_to_err_v2(void *ptr);
- int mlx5_core_get_caps(struct mlx5_core_dev *dev, enum mlx5_cap_type cap_type,
-                      enum mlx5_cap_mode cap_mode);
+ int mlx5_core_get_caps(struct mlx5_core_dev *dev, enum mlx5_cap_type cap_type);
  int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out,
                  int out_size);
  int mlx5_cmd_exec_cb(struct mlx5_core_dev *dev, void *in, int in_size,
@@@ -717,8 -719,7 +719,8 @@@ int mlx5_cmd_alloc_uar(struct mlx5_core
  int mlx5_cmd_free_uar(struct mlx5_core_dev *dev, u32 uarn);
  int mlx5_alloc_uuars(struct mlx5_core_dev *dev, struct mlx5_uuar_info *uuari);
  int mlx5_free_uuars(struct mlx5_core_dev *dev, struct mlx5_uuar_info *uuari);
 -int mlx5_alloc_map_uar(struct mlx5_core_dev *mdev, struct mlx5_uar *uar);
 +int mlx5_alloc_map_uar(struct mlx5_core_dev *mdev, struct mlx5_uar *uar,
 +                     bool map_wc);
  void mlx5_unmap_free_uar(struct mlx5_core_dev *mdev, struct mlx5_uar *uar);
  void mlx5_health_cleanup(struct mlx5_core_dev *dev);
  int mlx5_health_init(struct mlx5_core_dev *dev);
@@@ -797,6 -798,37 +799,6 @@@ int mlx5_core_access_reg(struct mlx5_co
                         int size_in, void *data_out, int size_out,
                         u16 reg_num, int arg, int write);
  
 -int mlx5_set_port_caps(struct mlx5_core_dev *dev, u8 port_num, u32 caps);
 -int mlx5_query_port_ptys(struct mlx5_core_dev *dev, u32 *ptys,
 -                       int ptys_size, int proto_mask, u8 local_port);
 -int mlx5_query_port_proto_cap(struct mlx5_core_dev *dev,
 -                            u32 *proto_cap, int proto_mask);
 -int mlx5_query_port_proto_admin(struct mlx5_core_dev *dev,
 -                              u32 *proto_admin, int proto_mask);
 -int mlx5_query_port_link_width_oper(struct mlx5_core_dev *dev,
 -                                  u8 *link_width_oper, u8 local_port);
 -int mlx5_query_port_proto_oper(struct mlx5_core_dev *dev,
 -                             u8 *proto_oper, int proto_mask,
 -                             u8 local_port);
 -int mlx5_set_port_proto(struct mlx5_core_dev *dev, u32 proto_admin,
 -                      int proto_mask);
 -int mlx5_set_port_admin_status(struct mlx5_core_dev *dev,
 -                             enum mlx5_port_status status);
 -int mlx5_query_port_admin_status(struct mlx5_core_dev *dev,
 -                               enum mlx5_port_status *status);
 -
 -int mlx5_set_port_mtu(struct mlx5_core_dev *dev, int mtu, u8 port);
 -void mlx5_query_port_max_mtu(struct mlx5_core_dev *dev, int *max_mtu, u8 port);
 -void mlx5_query_port_oper_mtu(struct mlx5_core_dev *dev, int *oper_mtu,
 -                            u8 port);
 -
 -int mlx5_query_port_vl_hw_cap(struct mlx5_core_dev *dev,
 -                            u8 *vl_hw_cap, u8 local_port);
 -
 -int mlx5_set_port_pause(struct mlx5_core_dev *dev, u32 rx_pause, u32 tx_pause);
 -int mlx5_query_port_pause(struct mlx5_core_dev *dev,
 -                        u32 *rx_pause, u32 *tx_pause);
 -
  int mlx5_debug_eq_add(struct mlx5_core_dev *dev, struct mlx5_eq *eq);
  void mlx5_debug_eq_remove(struct mlx5_core_dev *dev, struct mlx5_eq *eq);
  int mlx5_core_eq_query(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
diff --combined include/linux/mlx5/mlx5_ifc.h
index e52730e01ed68bca8863cab4cca05e339b50cf87,bb9e07ca65345ac0b739b5a2b87c5f6c711c21a1..c15b8a8649377ea401a556c0f709b3bfe112f30f
@@@ -166,8 -166,6 +166,8 @@@ enum 
        MLX5_CMD_OP_SET_L2_TABLE_ENTRY            = 0x829,
        MLX5_CMD_OP_QUERY_L2_TABLE_ENTRY          = 0x82a,
        MLX5_CMD_OP_DELETE_L2_TABLE_ENTRY         = 0x82b,
 +      MLX5_CMD_OP_SET_WOL_ROL                   = 0x830,
 +      MLX5_CMD_OP_QUERY_WOL_ROL                 = 0x831,
        MLX5_CMD_OP_CREATE_TIR                    = 0x900,
        MLX5_CMD_OP_MODIFY_TIR                    = 0x901,
        MLX5_CMD_OP_DESTROY_TIR                   = 0x902,
@@@ -618,6 -616,33 +618,33 @@@ struct mlx5_ifc_odp_cap_bits 
        u8         reserved_at_e0[0x720];
  };
  
+ struct mlx5_ifc_calc_op {
+       u8        reserved_at_0[0x10];
+       u8        reserved_at_10[0x9];
+       u8        op_swap_endianness[0x1];
+       u8        op_min[0x1];
+       u8        op_xor[0x1];
+       u8        op_or[0x1];
+       u8        op_and[0x1];
+       u8        op_max[0x1];
+       u8        op_add[0x1];
+ };
+ 
+ struct mlx5_ifc_vector_calc_cap_bits {
+       u8         calc_matrix[0x1];
+       u8         reserved_at_1[0x1f];
+       u8         reserved_at_20[0x8];
+       u8         max_vec_count[0x8];
+       u8         reserved_at_30[0xd];
+       u8         max_chunk_size[0x3];
+       struct mlx5_ifc_calc_op calc0;
+       struct mlx5_ifc_calc_op calc1;
+       struct mlx5_ifc_calc_op calc2;
+       struct mlx5_ifc_calc_op calc3;
+       u8         reserved_at_e0[0x720];
+ };
+ 
  enum {
        MLX5_WQ_TYPE_LINKED_LIST  = 0x0,
        MLX5_WQ_TYPE_CYCLIC       = 0x1,
@@@ -732,19 -757,7 +759,19 @@@ struct mlx5_ifc_cmd_hca_cap_bits 
  
        u8         reserved_at_1bf[0x3];
        u8         log_max_msg[0x5];
 -      u8         reserved_at_1c7[0x18];
 +      u8         reserved_at_1c7[0x4];
 +      u8         max_tc[0x4];
 +      u8         reserved_at_1cf[0x6];
 +      u8         rol_s[0x1];
 +      u8         rol_g[0x1];
 +      u8         reserved_at_1d7[0x1];
 +      u8         wol_s[0x1];
 +      u8         wol_g[0x1];
 +      u8         wol_a[0x1];
 +      u8         wol_b[0x1];
 +      u8         wol_m[0x1];
 +      u8         wol_u[0x1];
 +      u8         wol_p[0x1];
  
        u8         stat_rate_support[0x10];
        u8         reserved_at_1ef[0xc];
        u8         cd[0x1];
        u8         reserved_at_22c[0x1];
        u8         apm[0x1];
-       u8         reserved_at_22e[0x2];
+       u8         vector_calc[0x1];
+       u8         reserved_at_22f[0x1];
        u8         imaicl[0x1];
        u8         reserved_at_231[0x4];
        u8         qkv[0x1];
@@@ -1954,6 -1968,7 +1982,7 @@@ union mlx5_ifc_hca_cap_union_bits 
        struct mlx5_ifc_flow_table_nic_cap_bits flow_table_nic_cap;
        struct mlx5_ifc_flow_table_eswitch_cap_bits flow_table_eswitch_cap;
        struct mlx5_ifc_e_switch_cap_bits e_switch_cap;
+       struct mlx5_ifc_vector_calc_cap_bits vector_calc_cap;
        u8         reserved_at_0[0x8000];
  };
  
@@@ -3681,6 -3696,12 +3710,12 @@@ struct mlx5_ifc_query_hca_vport_pkey_in
        u8         pkey_index[0x10];
  };
  
+ enum {
+       MLX5_HCA_VPORT_SEL_PORT_GUID    = 1 << 0,
+       MLX5_HCA_VPORT_SEL_NODE_GUID    = 1 << 1,
+       MLX5_HCA_VPORT_SEL_STATE_POLICY = 1 << 2,
+ };
+ 
  struct mlx5_ifc_query_hca_vport_gid_out_bits {
        u8         status[0x8];
        u8         reserved_at_8[0x18];
@@@ -4297,9 -4318,7 +4332,9 @@@ struct mlx5_ifc_modify_tir_bitmask_bit
  
        u8         reserved_at_20[0x1b];
        u8         self_lb_en[0x1];
 -      u8         reserved_at_3c[0x3];
 +      u8         reserved_at_3c[0x1];
 +      u8         hash[0x1];
 +      u8         reserved_at_3e[0x1];
        u8         lro[0x1];
  };
  
@@@ -6925,54 -6944,6 +6960,54 @@@ struct mlx5_ifc_mtt_bits 
        u8         rd_en[0x1];
  };
  
 +struct mlx5_ifc_query_wol_rol_out_bits {
 +      u8         status[0x8];
 +      u8         reserved_at_8[0x18];
 +
 +      u8         syndrome[0x20];
 +
 +      u8         reserved_at_40[0x10];
 +      u8         rol_mode[0x8];
 +      u8         wol_mode[0x8];
 +
 +      u8         reserved_at_60[0x20];
 +};
 +
 +struct mlx5_ifc_query_wol_rol_in_bits {
 +      u8         opcode[0x10];
 +      u8         reserved_at_10[0x10];
 +
 +      u8         reserved_at_20[0x10];
 +      u8         op_mod[0x10];
 +
 +      u8         reserved_at_40[0x40];
 +};
 +
 +struct mlx5_ifc_set_wol_rol_out_bits {
 +      u8         status[0x8];
 +      u8         reserved_at_8[0x18];
 +
 +      u8         syndrome[0x20];
 +
 +      u8         reserved_at_40[0x40];
 +};
 +
 +struct mlx5_ifc_set_wol_rol_in_bits {
 +      u8         opcode[0x10];
 +      u8         reserved_at_10[0x10];
 +
 +      u8         reserved_at_20[0x10];
 +      u8         op_mod[0x10];
 +
 +      u8         rol_mode_valid[0x1];
 +      u8         wol_mode_valid[0x1];
 +      u8         reserved_at_42[0xe];
 +      u8         rol_mode[0x8];
 +      u8         wol_mode[0x8];
 +
 +      u8         reserved_at_60[0x20];
 +};
 +
  enum {
        MLX5_INITIAL_SEG_NIC_INTERFACE_FULL_DRIVER  = 0x0,
        MLX5_INITIAL_SEG_NIC_INTERFACE_DISABLED     = 0x1,
@@@ -7164,49 -7135,4 +7199,49 @@@ struct mlx5_ifc_modify_flow_table_in_bi
        u8         reserved_at_100[0x100];
  };
  
 +struct mlx5_ifc_ets_tcn_config_reg_bits {
 +      u8         g[0x1];
 +      u8         b[0x1];
 +      u8         r[0x1];
 +      u8         reserved_at_3[0x9];
 +      u8         group[0x4];
 +      u8         reserved_at_10[0x9];
 +      u8         bw_allocation[0x7];
 +
 +      u8         reserved_at_20[0xc];
 +      u8         max_bw_units[0x4];
 +      u8         reserved_at_30[0x8];
 +      u8         max_bw_value[0x8];
 +};
 +
 +struct mlx5_ifc_ets_global_config_reg_bits {
 +      u8         reserved_at_0[0x2];
 +      u8         r[0x1];
 +      u8         reserved_at_3[0x1d];
 +
 +      u8         reserved_at_20[0xc];
 +      u8         max_bw_units[0x4];
 +      u8         reserved_at_30[0x8];
 +      u8         max_bw_value[0x8];
 +};
 +
 +struct mlx5_ifc_qetc_reg_bits {
 +      u8                                         reserved_at_0[0x8];
 +      u8                                         port_number[0x8];
 +      u8                                         reserved_at_10[0x30];
 +
 +      struct mlx5_ifc_ets_tcn_config_reg_bits    tc_configuration[0x8];
 +      struct mlx5_ifc_ets_global_config_reg_bits global_configuration;
 +};
 +
 +struct mlx5_ifc_qtct_reg_bits {
 +      u8         reserved_at_0[0x8];
 +      u8         port_number[0x8];
 +      u8         reserved_at_10[0xd];
 +      u8         prio[0x3];
 +
 +      u8         reserved_at_20[0x1d];
 +      u8         tclass[0x3];
 +};
 +
  #endif /* MLX5_IFC_H */
diff --combined include/linux/netdevice.h
index be693b34662f9c95ec4de34ed4cf4b558a1e9738,7b4ae218b90bcfe6eeef660fe34972a3bddc3800..009c85adae4c5036be4351c2bd020753af5befc5
@@@ -51,7 -51,6 +51,7 @@@
  #include <linux/neighbour.h>
  #include <uapi/linux/netdevice.h>
  #include <uapi/linux/if_bonding.h>
 +#include <uapi/linux/pkt_cls.h>
  
  struct netpoll_info;
  struct device;
@@@ -268,7 -267,6 +268,7 @@@ struct header_ops 
        void    (*cache_update)(struct hh_cache *hh,
                                const struct net_device *dev,
                                const unsigned char *haddr);
 +      bool    (*validate)(const char *ll_header, unsigned int len);
  };
  
  /* These flag bits are private to the generic network queueing
@@@ -780,27 -778,6 +780,27 @@@ static inline bool netdev_phys_item_id_
  typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
                                       struct sk_buff *skb);
  
 +/* These structures hold the attributes of qdisc and classifiers
 + * that are being passed to the netdevice through the setup_tc op.
 + */
 +enum {
 +      TC_SETUP_MQPRIO,
 +      TC_SETUP_CLSU32,
 +      TC_SETUP_CLSFLOWER,
 +};
 +
 +struct tc_cls_u32_offload;
 +
 +struct tc_to_netdev {
 +      unsigned int type;
 +      union {
 +              u8 tc;
 +              struct tc_cls_u32_offload *cls_u32;
 +              struct tc_cls_flower_offload *cls_flower;
 +      };
 +};
 +
 +
  /*
   * This structure defines the management hooks for network devices.
   * The following hooks can be defined; unless noted otherwise, they are
   *    This function is used to get egress tunnel information for given skb.
   *    This is useful for retrieving outer tunnel header parameters while
   *    sampling packet.
 + * void (*ndo_set_rx_headroom)(struct net_device *dev, int needed_headroom);
 + *    This function is used to specify the headroom that the skb must
 + *    consider when allocation skb during packet reception. Setting
 + *    appropriate rx headroom value allows avoiding skb head copy on
 + *    forward. Setting a negative value reset the rx headroom to the
 + *    default value.
   *
   */
  struct net_device_ops {
                                                   struct nlattr *port[]);
        int                     (*ndo_get_vf_port)(struct net_device *dev,
                                                   int vf, struct sk_buff *skb);
+       int                     (*ndo_set_vf_guid)(struct net_device *dev,
+                                                  int vf, u64 guid,
+                                                  int guid_type);
        int                     (*ndo_set_vf_rss_query_en)(
                                                   struct net_device *dev,
                                                   int vf, bool setting);
 -      int                     (*ndo_setup_tc)(struct net_device *dev, u8 tc);
 +      int                     (*ndo_setup_tc)(struct net_device *dev,
 +                                              u32 handle,
 +                                              __be16 protocol,
 +                                              struct tc_to_netdev *tc);
  #if IS_ENABLED(CONFIG_FCOE)
        int                     (*ndo_fcoe_enable)(struct net_device *dev);
        int                     (*ndo_fcoe_disable)(struct net_device *dev);
                                                         bool proto_down);
        int                     (*ndo_fill_metadata_dst)(struct net_device *dev,
                                                       struct sk_buff *skb);
 +      void                    (*ndo_set_rx_headroom)(struct net_device *dev,
 +                                                     int needed_headroom);
  };
  
  /**
   * @IFF_OPENVSWITCH: device is a Open vSwitch master
   * @IFF_L3MDEV_SLAVE: device is enslaved to an L3 master device
   * @IFF_TEAM: device is a team device
 + * @IFF_RXFH_CONFIGURED: device has had Rx Flow indirection table configured
 + * @IFF_PHONY_HEADROOM: the headroom value is controlled by an external
 + *    entity (i.e. the master device for bridged veth)
 + * @IFF_MACSEC: device is a MACsec device
   */
  enum netdev_priv_flags {
        IFF_802_1Q_VLAN                 = 1<<0,
        IFF_OPENVSWITCH                 = 1<<22,
        IFF_L3MDEV_SLAVE                = 1<<23,
        IFF_TEAM                        = 1<<24,
 +      IFF_RXFH_CONFIGURED             = 1<<25,
 +      IFF_PHONY_HEADROOM              = 1<<26,
 +      IFF_MACSEC                      = 1<<27,
  };
  
  #define IFF_802_1Q_VLAN                       IFF_802_1Q_VLAN
  #define IFF_OPENVSWITCH                       IFF_OPENVSWITCH
  #define IFF_L3MDEV_SLAVE              IFF_L3MDEV_SLAVE
  #define IFF_TEAM                      IFF_TEAM
 +#define IFF_RXFH_CONFIGURED           IFF_RXFH_CONFIGURED
 +#define IFF_MACSEC                    IFF_MACSEC
  
  /**
   *    struct net_device - The DEVICE structure.
   *                    do not use this in drivers
   *    @tx_dropped:    Dropped packets by core network,
   *                    do not use this in drivers
 + *    @rx_nohandler:  nohandler dropped packets by core network on
 + *                    inactive devices, do not use this in drivers
   *
   *    @wireless_handlers:     List of functions to handle Wireless Extensions,
   *                            instead of ioctl,
   *    @dma:           DMA channel
   *    @mtu:           Interface MTU value
   *    @type:          Interface hardware type
 - *    @hard_header_len: Hardware header length, which means that this is the
 - *                      minimum size of a packet.
 + *    @hard_header_len: Maximum hardware header length.
   *
   *    @needed_headroom: Extra headroom the hardware may need, but not in all
   *                      cases can this be guaranteed
@@@ -1655,7 -1614,6 +1658,7 @@@ struct net_device 
  
        atomic_long_t           rx_dropped;
        atomic_long_t           tx_dropped;
 +      atomic_long_t           rx_nohandler;
  
  #ifdef CONFIG_WIRELESS_EXT
        const struct iw_handler_def *   wireless_handlers;
@@@ -1953,26 -1911,6 +1956,26 @@@ struct netdev_queue *netdev_pick_tx(str
                                    struct sk_buff *skb,
                                    void *accel_priv);
  
 +/* returns the headroom that the master device needs to take in account
 + * when forwarding to this dev
 + */
 +static inline unsigned netdev_get_fwd_headroom(struct net_device *dev)
 +{
 +      return dev->priv_flags & IFF_PHONY_HEADROOM ? 0 : dev->needed_headroom;
 +}
 +
 +static inline void netdev_set_rx_headroom(struct net_device *dev, int new_hr)
 +{
 +      if (dev->netdev_ops->ndo_set_rx_headroom)
 +              dev->netdev_ops->ndo_set_rx_headroom(dev, new_hr);
 +}
 +
 +/* set the device rx headroom to the dev's default */
 +static inline void netdev_reset_rx_headroom(struct net_device *dev)
 +{
 +      netdev_set_rx_headroom(dev, -1);
 +}
 +
  /*
   * Net namespace inlines
   */
@@@ -2692,24 -2630,6 +2695,24 @@@ static inline int dev_parse_header(cons
        return dev->header_ops->parse(skb, haddr);
  }
  
 +/* ll_header must have at least hard_header_len allocated */
 +static inline bool dev_validate_header(const struct net_device *dev,
 +                                     char *ll_header, int len)
 +{
 +      if (likely(len >= dev->hard_header_len))
 +              return true;
 +
 +      if (capable(CAP_SYS_RAWIO)) {
 +              memset(ll_header + len, 0, dev->hard_header_len - len);
 +              return true;
 +      }
 +
 +      if (dev->header_ops && dev->header_ops->validate)
 +              return dev->header_ops->validate(ll_header, len);
 +
 +      return false;
 +}
 +
  typedef int gifconf_func_t(struct net_device * dev, char __user * bufptr, int len);
  int register_gifconf(unsigned int family, gifconf_func_t *gifconf);
  static inline int unregister_gifconf(unsigned int family)
@@@ -3824,7 -3744,7 +3827,7 @@@ void netdev_lower_state_changed(struct 
  
  /* RSS keys are 40 or 52 bytes long */
  #define NETDEV_RSS_KEY_LEN 52
 -extern u8 netdev_rss_key[NETDEV_RSS_KEY_LEN];
 +extern u8 netdev_rss_key[NETDEV_RSS_KEY_LEN] __read_mostly;
  void netdev_rss_key_fill(void *buffer, size_t len);
  
  int dev_get_nest_level(struct net_device *dev,
@@@ -4048,11 -3968,6 +4051,11 @@@ static inline void skb_gso_error_unwind
        skb->mac_len = mac_len;
  }
  
 +static inline bool netif_is_macsec(const struct net_device *dev)
 +{
 +      return dev->priv_flags & IFF_MACSEC;
 +}
 +
  static inline bool netif_is_macvlan(const struct net_device *dev)
  {
        return dev->priv_flags & IFF_MACVLAN;
@@@ -4133,11 -4048,6 +4136,11 @@@ static inline bool netif_is_lag_port(co
        return netif_is_bond_slave(dev) || netif_is_team_port(dev);
  }
  
 +static inline bool netif_is_rxfh_configured(const struct net_device *dev)
 +{
 +      return dev->priv_flags & IFF_RXFH_CONFIGURED;
 +}
 +
  /* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */
  static inline void netif_keep_dst(struct net_device *dev)
  {
diff --combined include/uapi/linux/if_link.h
index 8e3f88fa5b59056cb29e7736b5d4d89e08964321,1d01e8a4e5dd3ab4d052f234a541c7d7bc778c02..a62a0129d614049293e585a871ecc956afc17ad0
@@@ -35,8 -35,6 +35,8 @@@ struct rtnl_link_stats 
        /* for cslip etc */
        __u32   rx_compressed;
        __u32   tx_compressed;
 +
 +      __u32   rx_nohandler;           /* dropped, no handler found    */
  };
  
  /* The main device statistics structure */
@@@ -70,8 -68,6 +70,8 @@@ struct rtnl_link_stats64 
        /* for cslip etc */
        __u64   rx_compressed;
        __u64   tx_compressed;
 +
 +      __u64   rx_nohandler;           /* dropped, no handler found    */
  };
  
  /* The struct should be in sync with struct ifmap */
@@@ -405,43 -401,6 +405,43 @@@ enum 
  
  #define IFLA_VRF_MAX (__IFLA_VRF_MAX - 1)
  
 +enum {
 +      IFLA_VRF_PORT_UNSPEC,
 +      IFLA_VRF_PORT_TABLE,
 +      __IFLA_VRF_PORT_MAX
 +};
 +
 +#define IFLA_VRF_PORT_MAX (__IFLA_VRF_PORT_MAX - 1)
 +
 +/* MACSEC section */
 +enum {
 +      IFLA_MACSEC_UNSPEC,
 +      IFLA_MACSEC_SCI,
 +      IFLA_MACSEC_PORT,
 +      IFLA_MACSEC_ICV_LEN,
 +      IFLA_MACSEC_CIPHER_SUITE,
 +      IFLA_MACSEC_WINDOW,
 +      IFLA_MACSEC_ENCODING_SA,
 +      IFLA_MACSEC_ENCRYPT,
 +      IFLA_MACSEC_PROTECT,
 +      IFLA_MACSEC_INC_SCI,
 +      IFLA_MACSEC_ES,
 +      IFLA_MACSEC_SCB,
 +      IFLA_MACSEC_REPLAY_PROTECT,
 +      IFLA_MACSEC_VALIDATION,
 +      __IFLA_MACSEC_MAX,
 +};
 +
 +#define IFLA_MACSEC_MAX (__IFLA_MACSEC_MAX - 1)
 +
 +enum macsec_validation_type {
 +      MACSEC_VALIDATE_DISABLED = 0,
 +      MACSEC_VALIDATE_CHECK = 1,
 +      MACSEC_VALIDATE_STRICT = 2,
 +      __MACSEC_VALIDATE_END,
 +      MACSEC_VALIDATE_MAX = __MACSEC_VALIDATE_END - 1,
 +};
 +
  /* IPVLAN section */
  enum {
        IFLA_IPVLAN_UNSPEC,
@@@ -485,7 -444,6 +485,7 @@@ enum 
        IFLA_VXLAN_GBP,
        IFLA_VXLAN_REMCSUM_NOPARTIAL,
        IFLA_VXLAN_COLLECT_METADATA,
 +      IFLA_VXLAN_LABEL,
        __IFLA_VXLAN_MAX
  };
  #define IFLA_VXLAN_MAX        (__IFLA_VXLAN_MAX - 1)
@@@ -508,7 -466,6 +508,7 @@@ enum 
        IFLA_GENEVE_UDP_CSUM,
        IFLA_GENEVE_UDP_ZERO_CSUM6_TX,
        IFLA_GENEVE_UDP_ZERO_CSUM6_RX,
 +      IFLA_GENEVE_LABEL,
        __IFLA_GENEVE_MAX
  };
  #define IFLA_GENEVE_MAX       (__IFLA_GENEVE_MAX - 1)
@@@ -599,6 -556,8 +599,8 @@@ enum 
                                 */
        IFLA_VF_STATS,          /* network device statistics */
        IFLA_VF_TRUST,          /* Trust VF */
+       IFLA_VF_IB_NODE_GUID,   /* VF Infiniband node GUID */
+       IFLA_VF_IB_PORT_GUID,   /* VF Infiniband port GUID */
        __IFLA_VF_MAX,
  };
  
@@@ -631,6 -590,11 +633,11 @@@ struct ifla_vf_spoofchk 
        __u32 setting;
  };
  
+ struct ifla_vf_guid {
+       __u32 vf;
+       __u64 guid;
+ };
+ 
  enum {
        IFLA_VF_LINK_STATE_AUTO,        /* link state of the uplink */
        IFLA_VF_LINK_STATE_ENABLE,      /* link always up */
diff --combined net/core/rtnetlink.c
index d2d9e5ebf58ea827f8e0b5aaa85cea23cd3b77dd,4b6f3db9f8afb8589be7ec4363911d6770abae2c..167883e0931735a58d823a4efc46776b9c50a071
@@@ -804,8 -804,6 +804,8 @@@ static void copy_rtnl_link_stats(struc
  
        a->rx_compressed = b->rx_compressed;
        a->tx_compressed = b->tx_compressed;
 +
 +      a->rx_nohandler = b->rx_nohandler;
  }
  
  static void copy_rtnl_link_stats64(void *v, const struct rtnl_link_stats64 *b)
@@@ -1389,8 -1387,19 +1389,10 @@@ static const struct nla_policy ifla_vf_
        [IFLA_VF_RSS_QUERY_EN]  = { .len = sizeof(struct ifla_vf_rss_query_en) },
        [IFLA_VF_STATS]         = { .type = NLA_NESTED },
        [IFLA_VF_TRUST]         = { .len = sizeof(struct ifla_vf_trust) },
+       [IFLA_VF_IB_NODE_GUID]  = { .len = sizeof(struct ifla_vf_guid) },
+       [IFLA_VF_IB_PORT_GUID]  = { .len = sizeof(struct ifla_vf_guid) },
  };
  
 -static const struct nla_policy ifla_vf_stats_policy[IFLA_VF_STATS_MAX + 1] = {
 -      [IFLA_VF_STATS_RX_PACKETS]      = { .type = NLA_U64 },
 -      [IFLA_VF_STATS_TX_PACKETS]      = { .type = NLA_U64 },
 -      [IFLA_VF_STATS_RX_BYTES]        = { .type = NLA_U64 },
 -      [IFLA_VF_STATS_TX_BYTES]        = { .type = NLA_U64 },
 -      [IFLA_VF_STATS_BROADCAST]       = { .type = NLA_U64 },
 -      [IFLA_VF_STATS_MULTICAST]       = { .type = NLA_U64 },
 -};
 -
  static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
        [IFLA_PORT_VF]          = { .type = NLA_U32 },
        [IFLA_PORT_PROFILE]     = { .type = NLA_STRING,
        [IFLA_PORT_RESPONSE]    = { .type = NLA_U16, },
  };
  
 +static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla)
 +{
 +      const struct rtnl_link_ops *ops = NULL;
 +      struct nlattr *linfo[IFLA_INFO_MAX + 1];
 +
 +      if (nla_parse_nested(linfo, IFLA_INFO_MAX, nla, ifla_info_policy) < 0)
 +              return NULL;
 +
 +      if (linfo[IFLA_INFO_KIND]) {
 +              char kind[MODULE_NAME_LEN];
 +
 +              nla_strlcpy(kind, linfo[IFLA_INFO_KIND], sizeof(kind));
 +              ops = rtnl_link_ops_get(kind);
 +      }
 +
 +      return ops;
 +}
 +
 +static bool link_master_filtered(struct net_device *dev, int master_idx)
 +{
 +      struct net_device *master;
 +
 +      if (!master_idx)
 +              return false;
 +
 +      master = netdev_master_upper_dev_get(dev);
 +      if (!master || master->ifindex != master_idx)
 +              return true;
 +
 +      return false;
 +}
 +
 +static bool link_kind_filtered(const struct net_device *dev,
 +                             const struct rtnl_link_ops *kind_ops)
 +{
 +      if (kind_ops && dev->rtnl_link_ops != kind_ops)
 +              return true;
 +
 +      return false;
 +}
 +
 +static bool link_dump_filtered(struct net_device *dev,
 +                             int master_idx,
 +                             const struct rtnl_link_ops *kind_ops)
 +{
 +      if (link_master_filtered(dev, master_idx) ||
 +          link_kind_filtered(dev, kind_ops))
 +              return true;
 +
 +      return false;
 +}
 +
  static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
  {
        struct net *net = sock_net(skb->sk);
        struct hlist_head *head;
        struct nlattr *tb[IFLA_MAX+1];
        u32 ext_filter_mask = 0;
 +      const struct rtnl_link_ops *kind_ops = NULL;
 +      unsigned int flags = NLM_F_MULTI;
 +      int master_idx = 0;
        int err;
        int hdrlen;
  
  
                if (tb[IFLA_EXT_MASK])
                        ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]);
 +
 +              if (tb[IFLA_MASTER])
 +                      master_idx = nla_get_u32(tb[IFLA_MASTER]);
 +
 +              if (tb[IFLA_LINKINFO])
 +                      kind_ops = linkinfo_to_kind_ops(tb[IFLA_LINKINFO]);
 +
 +              if (master_idx || kind_ops)
 +                      flags |= NLM_F_DUMP_FILTERED;
        }
  
        for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
                idx = 0;
                head = &net->dev_index_head[h];
                hlist_for_each_entry(dev, head, index_hlist) {
 +                      if (link_dump_filtered(dev, master_idx, kind_ops))
 +                              continue;
                        if (idx < s_idx)
                                goto cont;
                        err = rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK,
                                               NETLINK_CB(cb->skb).portid,
                                               cb->nlh->nlmsg_seq, 0,
 -                                             NLM_F_MULTI,
 +                                             flags,
                                               ext_filter_mask);
                        /* If we ran out of room on the first message,
                         * we're in trouble
@@@ -1593,6 -1536,22 +1595,22 @@@ static int validate_linkmsg(struct net_
        return 0;
  }
  
+ static int handle_infiniband_guid(struct net_device *dev, struct ifla_vf_guid *ivt,
+                                 int guid_type)
+ {
+       const struct net_device_ops *ops = dev->netdev_ops;
+       return ops->ndo_set_vf_guid(dev, ivt->vf, ivt->guid, guid_type);
+ }
+ 
+ static int handle_vf_guid(struct net_device *dev, struct ifla_vf_guid *ivt, int guid_type)
+ {
+       if (dev->type != ARPHRD_INFINIBAND)
+               return -EOPNOTSUPP;
+       return handle_infiniband_guid(dev, ivt, guid_type);
+ }
+ 
  static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
  {
        const struct net_device_ops *ops = dev->netdev_ops;
                        return err;
        }
  
+       if (tb[IFLA_VF_IB_NODE_GUID]) {
+               struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_NODE_GUID]);
+               if (!ops->ndo_set_vf_guid)
+                       return -EOPNOTSUPP;
+               return handle_vf_guid(dev, ivt, IFLA_VF_IB_NODE_GUID);
+       }
+       if (tb[IFLA_VF_IB_PORT_GUID]) {
+               struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_PORT_GUID]);
+               if (!ops->ndo_set_vf_guid)
+                       return -EOPNOTSUPP;
+               return handle_vf_guid(dev, ivt, IFLA_VF_IB_PORT_GUID);
+       }
        return err;
  }
  
@@@ -2970,7 -2947,6 +3006,7 @@@ int ndo_dflt_fdb_dump(struct sk_buff *s
        nlmsg_populate_fdb(skb, cb, dev, &idx, &dev->mc);
  out:
        netif_addr_unlock_bh(dev);
 +      cb->args[1] = err;
        return idx;
  }
  EXPORT_SYMBOL(ndo_dflt_fdb_dump);
@@@ -3004,7 -2980,6 +3040,7 @@@ static int rtnl_fdb_dump(struct sk_buf
                ops = br_dev->netdev_ops;
        }
  
 +      cb->args[1] = 0;
        for_each_netdev(net, dev) {
                if (brport_idx && (dev->ifindex != brport_idx))
                        continue;
                                idx = cops->ndo_fdb_dump(skb, cb, br_dev, dev,
                                                         idx);
                }
 +              if (cb->args[1] == -EMSGSIZE)
 +                      break;
  
                if (dev->netdev_ops->ndo_fdb_dump)
                        idx = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, NULL,
                                                            idx);
                else
                        idx = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx);
 +              if (cb->args[1] == -EMSGSIZE)
 +                      break;
  
                cops = NULL;
        }